class StreamsBounceTest(KafkaTest):
    """
    Bounce test for a single Kafka Streams smoke-test client.

    The cluster is one ZooKeeper node plus three brokers; every smoke-test
    topic is created with 5 partitions and a replication factor of 2.
    """

    def __init__(self, test_context):
        # All smoke-test topics share the same layout, so build the table
        # from the list of names instead of spelling each entry out.
        topic_names = ('echo', 'data', 'min', 'max', 'sum',
                       'dif', 'cnt', 'avg', 'wcnt', 'tagg')
        topic_table = {
            name: {'partitions': 5, 'replication-factor': 2}
            for name in topic_names
        }
        super(StreamsBounceTest, self).__init__(
            test_context, num_zk=1, num_brokers=3, topics=topic_table)

        self.driver = StreamsSmokeTestDriverService(test_context, self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(test_context, self.kafka)

    @cluster(num_nodes=6)
    def test_bounce(self):
        """
        Start the driver and one processor, call abortThenRestart() on the
        processor twice (15 seconds apart), then require that the driver's
        stdout contains ALL-RECORDS-DELIVERED.
        """
        driver = self.driver
        processor = self.processor1

        driver.start()
        processor.start()

        # First bounce, after letting the client run for a while.
        time.sleep(15)
        processor.abortThenRestart()
        time.sleep(15)

        # enable this after we add change log partition replicas
        #self.kafka.signal_leader("data")
        #time.sleep(15);

        # Second bounce.
        processor.abortThenRestart()

        driver.wait()
        driver.stop()
        processor.stop()

        # grep exits non-zero when the marker is absent, which fails the test.
        grep_cmd = "grep ALL-RECORDS-DELIVERED %s" % driver.STDOUT_FILE
        driver.node.account.ssh(grep_cmd, allow_fail=False)
class StreamsSmokeTest(KafkaTest):
    """
    Smoke test of Kafka Streams with clients joining and leaving cleanly.

    The cluster is one ZooKeeper node plus three brokers; every smoke-test
    topic is created with 5 partitions and a replication factor of 1.
    """

    def __init__(self, test_context):
        # Every topic uses the same layout; derive the table from the names.
        topic_names = ('echo', 'data', 'min', 'max', 'sum',
                       'dif', 'cnt', 'avg', 'wcnt', 'tagg')
        topic_table = {
            name: {'partitions': 5, 'replication-factor': 1}
            for name in topic_names
        }
        super(StreamsSmokeTest, self).__init__(
            test_context, num_zk=1, num_brokers=3, topics=topic_table)

        self.driver = StreamsSmokeTestDriverService(test_context, self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(test_context, self.kafka)
        self.processor2 = StreamsSmokeTestJobRunnerService(test_context, self.kafka)
        self.processor3 = StreamsSmokeTestJobRunnerService(test_context, self.kafka)
        self.processor4 = StreamsSmokeTestJobRunnerService(test_context, self.kafka)

    @cluster(num_nodes=9)
    def test_streams(self):
        """
        Start two processors, then add a third while cleanly stopping the
        first, then add a fourth. Once the driver finishes, require that its
        stdout contains SUCCESS.
        """
        driver = self.driver

        driver.start()
        self.processor1.start()
        self.processor2.start()

        # Membership change 1: processor3 joins, processor1 leaves cleanly.
        time.sleep(15)
        self.processor3.start()
        self.processor1.stop()

        # Membership change 2: processor4 joins.
        time.sleep(15)
        self.processor4.start()

        driver.wait()
        driver.stop()

        # processor1 was already stopped above; stop the remaining ones.
        for survivor in (self.processor2, self.processor3, self.processor4):
            survivor.stop()

        # grep exits non-zero when the marker is absent, which fails the test.
        grep_cmd = "grep SUCCESS %s" % driver.STDOUT_FILE
        driver.node.account.ssh(grep_cmd, allow_fail=False)
class StreamsSmokeTest(KafkaTest):
    """
    Smoke test of Kafka Streams with clients joining and leaving cleanly.

    The cluster is one ZooKeeper node plus two brokers; every smoke-test
    topic is created with 5 partitions and a replication factor of 1.
    """

    def __init__(self, test_context):
        # Every topic uses the same layout; derive the table from the names.
        topic_names = ('echo', 'data', 'min', 'max', 'sum',
                       'dif', 'cnt', 'avg', 'wcnt', 'tagg')
        topic_table = {
            name: {'partitions': 5, 'replication-factor': 1}
            for name in topic_names
        }
        super(StreamsSmokeTest, self).__init__(
            test_context, num_zk=1, num_brokers=2, topics=topic_table)

        self.driver = StreamsSmokeTestDriverService(test_context, self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(test_context, self.kafka)
        self.processor2 = StreamsSmokeTestJobRunnerService(test_context, self.kafka)
        self.processor3 = StreamsSmokeTestJobRunnerService(test_context, self.kafka)
        self.processor4 = StreamsSmokeTestJobRunnerService(test_context, self.kafka)

    @cluster(num_nodes=8)
    def test_streams(self):
        """
        Start two processors, then add a third while cleanly stopping the
        first, then add a fourth. Once the driver finishes, require that its
        stdout contains SUCCESS.
        """
        driver = self.driver

        driver.start()
        self.processor1.start()
        self.processor2.start()

        # Membership change 1: processor3 joins, processor1 leaves cleanly.
        time.sleep(15)
        self.processor3.start()
        self.processor1.stop()

        # Membership change 2: processor4 joins.
        time.sleep(15)
        self.processor4.start()

        driver.wait()
        driver.stop()

        # processor1 was already stopped above; stop the remaining ones.
        for survivor in (self.processor2, self.processor3, self.processor4):
            survivor.stop()

        # grep exits non-zero when the marker is absent, which fails the test.
        grep_cmd = "grep SUCCESS %s" % driver.STDOUT_FILE
        driver.node.account.ssh(grep_cmd, allow_fail=False)
class StreamsBrokerBounceTest(Test):
    """
    Simple test of Kafka Streams with brokers failing
    """

    def __init__(self, test_context):
        super(StreamsBrokerBounceTest, self).__init__(test_context)
        self.replication = 3
        self.partitions = 3
        smoke_topics = ('echo', 'data', 'min', 'max', 'sum',
                        'dif', 'cnt', 'avg', 'wcnt', 'tagg')
        self.topics = {
            name: self._topic_spec(self.partitions) for name in smoke_topics
        }
        # The offsets topic is listed explicitly so it gets the same
        # replication settings, with 50 partitions.
        self.topics['__consumer_offsets'] = self._topic_spec(50)

    def _topic_spec(self, partitions):
        """Return a topic definition with the shared replication settings."""
        return {
            'partitions': partitions,
            'replication-factor': self.replication,
            'configs': {"min.insync.replicas": 2},
        }

    def fail_broker_type(self, failure_mode, broker_type):
        """
        Apply a failure to one broker selected via a randomly chosen topic.

        :param failure_mode: key into the module-level ``failures`` table
        :param broker_type: passed through to the failure function
                            (the tests below use "leader" or "controller")
        """
        # Pick a random topic and bounce its leader.
        # dict.keys() is a non-indexable view on Python 3, so materialise it
        # into a list before indexing (works on Python 2 as well).
        topic_names = list(self.topics.keys())
        topic_index = randint(0, len(topic_names) - 1)
        topic = topic_names[topic_index]
        failures[failure_mode](self, topic, broker_type)

    def fail_many_brokers(self, failure_mode, num_failures):
        """
        Signal a prefix of ``self.kafka.nodes``.

        "clean_shutdown" is delivered as SIGTERM; every other failure mode is
        delivered as SIGKILL.
        """
        sig = signal.SIGTERM if failure_mode == "clean_shutdown" else signal.SIGKILL

        # NOTE(review): this signals num_failures - 1 brokers, not
        # num_failures. Left unchanged because the callers' expectations
        # (e.g. num_failures == number of brokers) may rely on it - confirm.
        for num in range(0, num_failures - 1):
            signal_node(self, self.kafka.nodes[num], sig)

    def setup_system(self, start_processor=True):
        """
        Start ZooKeeper, Kafka and the smoke-test driver.

        :param start_processor: when True (default) also start the streams
                                processor; otherwise it is only created
        """
        # Setup phase
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        self.kafka = KafkaService(self.test_context, num_nodes=self.replication,
                                  zk=self.zk, topics=self.topics)
        self.kafka.start()

        # Start test harness
        self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(self.test_context, self.kafka)

        self.driver.start()

        if start_processor:
            self.processor1.start()

    def collect_results(self, sleep_time_secs):
        """
        Wait for the driver, stop the services and collect marker lines.

        :param sleep_time_secs: the delay the test used before failing
                                brokers; 0 selects the "exception expected"
                                check for the processor
        :return: dict with the keys "Client closed", "Records Delivered" and
                 "Logic Success/Failure", each holding the last matching
                 line from the corresponding grep
        """
        data = {}
        # End test
        self.driver.wait()
        self.driver.stop()
        self.processor1.stop()
        node = self.driver.node

        # Success is declared if streams does not crash when sleep time > 0
        # It should give an exception when sleep time is 0 since we kill the brokers immediately
        # and the topic manager cannot create internal topics with the desired replication factor
        if sleep_time_secs == 0:
            output_streams = self.processor1.node.account.ssh_capture(
                "grep SMOKE-TEST-CLIENT-EXCEPTION %s" % self.processor1.STDOUT_FILE,
                allow_fail=False)
        else:
            output_streams = self.processor1.node.account.ssh_capture(
                "grep SMOKE-TEST-CLIENT-CLOSED %s" % self.processor1.STDOUT_FILE,
                allow_fail=False)

        for line in output_streams:
            data["Client closed"] = line

        # Currently it is hard to guarantee anything about Kafka since we don't have exactly once.
        # With exactly once in place, success will be defined as ALL-RECORDS-DELIVERED and SUCCESS
        output = node.account.ssh_capture(
            "grep -E 'ALL-RECORDS-DELIVERED|PROCESSED-MORE-THAN-GENERATED|PROCESSED-LESS-THAN-GENERATED' %s"
            % self.driver.STDOUT_FILE, allow_fail=False)
        for line in output:
            data["Records Delivered"] = line

        output = node.account.ssh_capture(
            "grep -E 'SUCCESS|FAILURE' %s" % self.driver.STDOUT_FILE, allow_fail=False)
        for line in output:
            data["Logic Success/Failure"] = line

        return data

    @cluster(num_nodes=7)
    @matrix(failure_mode=["clean_shutdown", "hard_shutdown", "clean_bounce", "hard_bounce"],
            broker_type=["leader", "controller"],
            sleep_time_secs=[120])
    def test_broker_type_bounce(self, failure_mode, broker_type, sleep_time_secs):
        """
        Start a smoke test client, then kill one particular broker and ensure data is still received
        Record if records are delivered.
        """
        self.setup_system()

        # Sleep to allow test to run for a bit
        time.sleep(sleep_time_secs)

        # Fail brokers
        self.fail_broker_type(failure_mode, broker_type)

        return self.collect_results(sleep_time_secs)

    @ignore
    @cluster(num_nodes=7)
    @matrix(failure_mode=["clean_shutdown"],
            broker_type=["controller"],
            sleep_time_secs=[0])
    def test_broker_type_bounce_at_start(self, failure_mode, broker_type, sleep_time_secs):
        """
        Start a smoke test client, then kill one particular broker immediately before streams starts
        Streams should throw an exception since it cannot create topics with the desired
        replication factor of 3
        """
        self.setup_system(start_processor=False)

        # Sleep to allow test to run for a bit
        time.sleep(sleep_time_secs)

        # Fail brokers
        self.fail_broker_type(failure_mode, broker_type)

        self.processor1.start()

        return self.collect_results(sleep_time_secs)

    @cluster(num_nodes=7)
    @matrix(failure_mode=["clean_shutdown", "hard_shutdown", "clean_bounce", "hard_bounce"],
            num_failures=[2])
    def test_many_brokers_bounce(self, failure_mode, num_failures):
        """
        Start a smoke test client, then kill a few brokers and ensure data is still received
        Record if records are delivered
        """
        self.setup_system()

        # Sleep to allow test to run for a bit
        time.sleep(120)

        # Fail brokers
        self.fail_many_brokers(failure_mode, num_failures)

        return self.collect_results(120)

    @cluster(num_nodes=7)
    @matrix(failure_mode=["clean_bounce", "hard_bounce"],
            num_failures=[3])
    def test_all_brokers_bounce(self, failure_mode, num_failures):
        """
        Start a smoke test client, then kill a few brokers and ensure data is still received
        Record if records are delivered
        """
        self.setup_system()

        # Sleep to allow test to run for a bit
        time.sleep(120)

        # Fail brokers
        self.fail_many_brokers(failure_mode, num_failures)

        return self.collect_results(120)
def test_upgrade_downgrade_brokers(self, from_version, to_version):
    """
    Start a smoke test client then perform rolling upgrades on the broker.

    :param from_version: broker version the cluster is started with
    :param to_version: broker version passed to perform_broker_upgrade();
                       the test is a no-op when it equals ``from_version``
    """

    if from_version == to_version:
        return

    self.replication = 3
    self.num_kafka_nodes = 3
    self.partitions = 1
    self.isr = 2

    # Every smoke-test topic uses the same layout; each entry gets its own
    # dict objects, exactly as the spelled-out literal did.
    smoke_topics = ('echo', 'data', 'min', 'max', 'sum',
                    'dif', 'cnt', 'avg', 'wcnt', 'tagg')
    self.topics = {
        name: {'partitions': self.partitions,
               'replication-factor': self.replication,
               'configs': {"min.insync.replicas": self.isr}}
        for name in smoke_topics
    }

    # Setup phase
    self.zk = ZookeeperService(self.test_context, num_nodes=1)
    self.zk.start()

    # number of nodes needs to be >= 3 for the smoke test
    self.kafka = KafkaService(self.test_context, num_nodes=self.num_kafka_nodes,
                              zk=self.zk, version=KafkaVersion(from_version), topics=self.topics)
    self.kafka.start()

    # allow some time for topics to be created
    wait_until(lambda: self.confirm_topics_on_all_brokers(set(self.topics.keys())),
               timeout_sec=60,
               err_msg="Broker did not create all topics in 60 seconds ")

    self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka)

    processor = StreamsSmokeTestJobRunnerService(self.test_context, self.kafka)

    with self.driver.node.account.monitor_log(self.driver.STDOUT_FILE) as driver_monitor:
        self.driver.start()

        # Wait until the processor reports progress before touching brokers.
        # self.processed_msg is presumably defined on the enclosing class.
        with processor.node.account.monitor_log(processor.STDOUT_FILE) as monitor:
            processor.start()
            monitor.wait_until(self.processed_msg,
                               timeout_sec=60,
                               err_msg="Never saw output '%s' on " % self.processed_msg + str(processor.node))

        connected_message = "Discovered group coordinator"
        with processor.node.account.monitor_log(processor.LOG_FILE) as log_monitor:
            with processor.node.account.monitor_log(processor.STDOUT_FILE) as stdout_monitor:
                self.perform_broker_upgrade(to_version)

                log_monitor.wait_until(connected_message,
                                       timeout_sec=120,
                                       err_msg=("Never saw output '%s' on " % connected_message) + str(processor.node.account))

                stdout_monitor.wait_until(self.processed_msg,
                                          timeout_sec=60,
                                          err_msg="Never saw output '%s' on " % self.processed_msg + str(processor.node.account))

        # SmokeTestDriver allows up to 6 minutes to consume all
        # records for the verification step so this timeout is set to
        # 6 minutes (360 seconds) for consuming of verification records
        # and a very conservative additional 2 minutes (120 seconds) to process
        # the records in the verification step
        # Raw string: "\|" is not a valid Python escape sequence, so spell
        # the pattern as a raw literal (the value is unchanged).
        driver_monitor.wait_until(r'ALL-RECORDS-DELIVERED\|PROCESSED-MORE-THAN-GENERATED',
                                  timeout_sec=480,
                                  err_msg="Never saw output '%s' on " % 'ALL-RECORDS-DELIVERED|PROCESSED-MORE-THAN-GENERATED' + str(self.driver.node.account))

    self.driver.stop()
    processor.stop()

    # Use the blocking ssh() call: ducktape's ssh_capture() only checks the
    # exit status while its output is being iterated, so an unconsumed
    # ssh_capture() would never fail on a missing marker.
    processor.node.account.ssh("grep SMOKE-TEST-CLIENT-CLOSED %s" % processor.STDOUT_FILE, allow_fail=False)
class StreamsUpgradeTest(Test):
    """
    Tests rolling upgrades and downgrades of the Kafka Streams library.
    """

    def __init__(self, test_context):
        super(StreamsUpgradeTest, self).__init__(test_context)
        self.replication = 3
        self.partitions = 1
        self.isr = 2
        # Every smoke-test topic uses the same layout; each entry gets its
        # own dict objects, exactly as the spelled-out literal did.
        smoke_topics = ('echo', 'data', 'min', 'max', 'sum',
                        'dif', 'cnt', 'avg', 'wcnt', 'tagg')
        self.topics = {
            name: {'partitions': self.partitions,
                   'replication-factor': self.replication,
                   'configs': {"min.insync.replicas": self.isr}}
            for name in smoke_topics
        }

    def perform_streams_upgrade(self, to_version):
        """Stop the streams processor, switch its node to ``to_version`` and restart it."""
        self.logger.info("First pass bounce - rolling streams upgrade")

        # get the node running the streams app
        node = self.processor1.node
        self.processor1.stop()

        # change its version. This will automatically make it pick up a different
        # JAR when it starts again
        node.version = KafkaVersion(to_version)
        self.processor1.start()

    def perform_broker_upgrade(self, to_version):
        """Restart every broker, one at a time, on ``to_version``."""
        self.logger.info("First pass bounce - rolling broker upgrade")
        for node in self.kafka.nodes:
            self.kafka.stop_node(node)
            node.version = KafkaVersion(to_version)
            self.kafka.start_node(node)

    def _start_system(self, from_version):
        """Start ZooKeeper, brokers on ``from_version``, the driver and the processor."""
        # Setup phase
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        # number of nodes needs to be >= 3 for the smoke test
        self.kafka = KafkaService(self.test_context, num_nodes=3,
                                  zk=self.zk, version=KafkaVersion(from_version), topics=self.topics)
        self.kafka.start()

        # allow some time for topics to be created
        time.sleep(10)

        self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(self.test_context, self.kafka)

        self.driver.start()
        self.processor1.start()

    def _stop_and_verify(self):
        """Wait for the driver, stop the services and check both output markers."""
        self.driver.wait()
        self.driver.stop()

        self.processor1.stop()

        node = self.driver.node
        node.account.ssh("grep ALL-RECORDS-DELIVERED %s" % self.driver.STDOUT_FILE, allow_fail=False)
        # Use the blocking ssh() call here as well: ducktape's ssh_capture()
        # only checks the exit status while its output is being iterated, so
        # an unconsumed ssh_capture() would never fail on a missing marker.
        self.processor1.node.account.ssh("grep SMOKE-TEST-CLIENT-CLOSED %s" % self.processor1.STDOUT_FILE, allow_fail=False)

    @cluster(num_nodes=6)
    @parametrize(from_version=str(LATEST_0_10_1), to_version=str(DEV_BRANCH))
    @parametrize(from_version=str(LATEST_0_10_2), to_version=str(DEV_BRANCH))
    @parametrize(from_version=str(LATEST_0_10_1), to_version=str(LATEST_0_11_0))
    @parametrize(from_version=str(LATEST_0_10_2), to_version=str(LATEST_0_11_0))
    @parametrize(from_version=str(LATEST_0_11_0), to_version=str(LATEST_0_10_2))
    @parametrize(from_version=str(DEV_BRANCH), to_version=str(LATEST_0_10_2))
    def test_upgrade_downgrade_streams(self, from_version, to_version):
        """
        Start a smoke test client, then stop it, switch its node to
        ``to_version`` and start it again. Ensure that all records are
        delivered.

        Note, that just like tests/core/upgrade_test.py, a prerequisite for this test to succeed
        is the inclusion of all parametrized versions of kafka in kafka/vagrant/base.sh
        (search for get_kafka()). For streams in particular, that means that someone has manually
        copied the kafka-stream-$version-test.jar into the right S3 bucket as shown in base.sh.
        """
        self._start_system(from_version)
        time.sleep(15)

        self.perform_streams_upgrade(to_version)

        time.sleep(15)
        self._stop_and_verify()

    @cluster(num_nodes=6)
    @parametrize(from_version=str(LATEST_0_10_2), to_version=str(DEV_BRANCH))
    def test_upgrade_brokers(self, from_version, to_version):
        """
        Start a smoke test client then perform rolling upgrades on the broker.
        """
        self._start_system(from_version)
        time.sleep(15)

        self.perform_broker_upgrade(to_version)

        time.sleep(15)
        self._stop_and_verify()
def test_streams(self, processing_guarantee, crash, metadata_quorum=quorum.zk):
    """
    Run three smoke-test processors one after another against one driver.

    Each of the first two processors is started, observed reaching RUNNING
    and processing data, and then stopped via ``stop_nodes(not crash)``. The
    third one runs until the driver finishes, after which the driver's
    stdout is checked for the expected result marker.
    """
    # All three services are created up front, in order, before any starts.
    first, second, third = [
        StreamsSmokeTestJobRunnerService(self.test_context, self.kafka, processing_guarantee)
        for _ in range(3)
    ]

    running_marker = 'REBALANCING -> RUNNING'
    processed_marker = 'processed'
    never_running = "Never saw 'REBALANCING -> RUNNING' message "
    never_processed = "Didn't see any processing messages "
    not_verifying_yet = "! grep 'Result Verification' %s"

    with first.node.account.monitor_log(first.STDOUT_FILE) as first_log:
        first.start()
        first_log.wait_until(running_marker,
                             timeout_sec=60,
                             err_msg=never_running + str(first.node.account))

        # The driver is only started once the first processor is running.
        self.driver.start()

        first_log.wait_until(processed_marker,
                             timeout_sec=30,
                             err_msg=never_processed + str(first.node.account))

    # make sure we're not already done processing (which would invalidate the test)
    self.driver.node.account.ssh(not_verifying_yet % self.driver.STDOUT_FILE, allow_fail=False)

    first.stop_nodes(not crash)

    with second.node.account.monitor_log(second.STDOUT_FILE) as second_log:
        second.start()
        second_log.wait_until(running_marker,
                              timeout_sec=120,
                              err_msg=never_running + str(second.node.account))
        second_log.wait_until(processed_marker,
                              timeout_sec=30,
                              err_msg=never_processed + str(second.node.account))

    # make sure we're not already done processing (which would invalidate the test)
    self.driver.node.account.ssh(not_verifying_yet % self.driver.STDOUT_FILE, allow_fail=False)

    second.stop_nodes(not crash)

    with third.node.account.monitor_log(third.STDOUT_FILE) as third_log:
        third.start()
        third_log.wait_until(running_marker,
                             timeout_sec=120,
                             err_msg=never_running + str(third.node.account))
        # there should still be some data left for this processor to work on.
        third_log.wait_until(processed_marker,
                             timeout_sec=30,
                             err_msg=never_processed + str(third.node.account))

    self.driver.wait()
    self.driver.stop()

    third.stop()

    # A crashed at_least_once run may also report PROCESSED-MORE-THAN-GENERATED;
    # every other combination must report SUCCESS.
    duplicates_tolerated = crash and processing_guarantee == 'at_least_once'
    if duplicates_tolerated:
        verdict_grep = "grep -E 'SUCCESS|PROCESSED-MORE-THAN-GENERATED' %s"
    else:
        verdict_grep = "grep SUCCESS %s"
    self.driver.node.account.ssh(verdict_grep % self.driver.STDOUT_FILE, allow_fail=False)
class StreamsBrokerBounceTest(Test):
    """
    Simple test of Kafka Streams with brokers failing
    """

    def __init__(self, test_context):
        super(StreamsBrokerBounceTest, self).__init__(test_context)
        self.replication = 3
        self.partitions = 3
        smoke_topics = ('echo', 'data', 'min', 'max', 'sum',
                        'dif', 'cnt', 'avg', 'wcnt', 'tagg')
        self.topics = {
            name: self._topic_spec(self.partitions) for name in smoke_topics
        }
        # The offsets topic is listed explicitly so it gets the same
        # replication settings, with 50 partitions.
        self.topics['__consumer_offsets'] = self._topic_spec(50)

    def _topic_spec(self, partitions):
        """Return a topic definition with the shared replication settings."""
        return {
            'partitions': partitions,
            'replication-factor': self.replication,
            'configs': {"min.insync.replicas": 2},
        }

    def fail_broker_type(self, failure_mode, broker_type):
        """
        Apply a failure to one broker selected via a randomly chosen topic.

        :param failure_mode: key into the module-level ``failures`` table
        :param broker_type: passed through to the failure function
                            (the tests below use "leader" or "controller")
        """
        # Pick a random topic and bounce its leader.
        # dict.keys() is a non-indexable view on Python 3, so materialise it
        # into a list before indexing (works on Python 2 as well).
        topic_names = list(self.topics.keys())
        topic_index = randint(0, len(topic_names) - 1)
        topic = topic_names[topic_index]
        failures[failure_mode](self, topic, broker_type)

    def fail_many_brokers(self, failure_mode, num_failures):
        """
        Signal a prefix of ``self.kafka.nodes``.

        "clean_shutdown" is delivered as SIGTERM; every other failure mode is
        delivered as SIGKILL.
        """
        sig = signal.SIGTERM if failure_mode == "clean_shutdown" else signal.SIGKILL

        # NOTE(review): this signals num_failures - 1 brokers, not
        # num_failures. Left unchanged because the callers' expectations
        # (e.g. num_failures == number of brokers) may rely on it - confirm.
        for num in range(0, num_failures - 1):
            signal_node(self, self.kafka.nodes[num], sig)

    def confirm_topics_on_all_brokers(self, expected_topic_set):
        """
        Return True when every broker lists every expected topic.

        :param expected_topic_set: collection of topic names that must be present
        """
        expected = set(expected_topic_set)
        for node in self.kafka.nodes:
            # kafka.list_topics() returns a python generator, so values are
            # fetched lazily; drain it into a set before comparing. A subset
            # check also stays correct if a topic name is listed twice.
            listed = set(self.kafka.list_topics(node=node))
            if not expected.issubset(listed):
                return False

        return True

    def setup_system(self, start_processor=True):
        """
        Start ZooKeeper, Kafka and the smoke-test driver.

        :param start_processor: when True (default) also start the streams
                                processor; otherwise it is only created
        """
        # Setup phase
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        self.kafka = KafkaService(self.test_context, num_nodes=self.replication,
                                  zk=self.zk, topics=self.topics)
        self.kafka.start()

        # allow some time for topics to be created
        wait_until(lambda: self.confirm_topics_on_all_brokers(set(self.topics.keys())),
                   timeout_sec=60,
                   err_msg="Broker did not create all topics in 60 seconds ")

        # Start test harness
        self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(self.test_context, self.kafka)

        self.driver.start()

        if start_processor:
            self.processor1.start()

    def collect_results(self, sleep_time_secs):
        """
        Wait for the driver, stop the services and collect marker lines.

        :param sleep_time_secs: the delay the test used before failing
                                brokers; 0 selects the "exception expected"
                                check for the processor
        :return: dict with the keys "Client closed", "Records Delivered" and
                 "Logic Success/Failure", each holding the last matching
                 line from the corresponding grep
        """
        data = {}
        # End test
        self.driver.wait()
        self.driver.stop()
        self.processor1.stop()
        node = self.driver.node

        # Success is declared if streams does not crash when sleep time > 0
        # It should give an exception when sleep time is 0 since we kill the brokers immediately
        # and the topic manager cannot create internal topics with the desired replication factor
        if sleep_time_secs == 0:
            output_streams = self.processor1.node.account.ssh_capture(
                "grep SMOKE-TEST-CLIENT-EXCEPTION %s" % self.processor1.STDOUT_FILE,
                allow_fail=False)
        else:
            output_streams = self.processor1.node.account.ssh_capture(
                "grep SMOKE-TEST-CLIENT-CLOSED %s" % self.processor1.STDOUT_FILE,
                allow_fail=False)

        for line in output_streams:
            data["Client closed"] = line

        # Currently it is hard to guarantee anything about Kafka since we don't have exactly once.
        # With exactly once in place, success will be defined as ALL-RECORDS-DELIVERED and SUCCESS
        output = node.account.ssh_capture(
            "grep -E 'ALL-RECORDS-DELIVERED|PROCESSED-MORE-THAN-GENERATED|PROCESSED-LESS-THAN-GENERATED' %s"
            % self.driver.STDOUT_FILE, allow_fail=False)
        for line in output:
            data["Records Delivered"] = line

        output = node.account.ssh_capture(
            "grep -E 'SUCCESS|FAILURE' %s" % self.driver.STDOUT_FILE, allow_fail=False)
        for line in output:
            data["Logic Success/Failure"] = line

        return data

    @cluster(num_nodes=7)
    @matrix(failure_mode=["clean_shutdown", "hard_shutdown", "clean_bounce", "hard_bounce"],
            broker_type=["leader", "controller"],
            sleep_time_secs=[120])
    def test_broker_type_bounce(self, failure_mode, broker_type, sleep_time_secs):
        """
        Start a smoke test client, then kill one particular broker and ensure data is still received
        Record if records are delivered.
        """
        self.setup_system()

        # Sleep to allow test to run for a bit
        time.sleep(sleep_time_secs)

        # Fail brokers
        self.fail_broker_type(failure_mode, broker_type)

        return self.collect_results(sleep_time_secs)

    @ignore
    @cluster(num_nodes=7)
    @matrix(failure_mode=["clean_shutdown"],
            broker_type=["controller"],
            sleep_time_secs=[0])
    def test_broker_type_bounce_at_start(self, failure_mode, broker_type, sleep_time_secs):
        """
        Start a smoke test client, then kill one particular broker immediately before streams starts
        Streams should throw an exception since it cannot create topics with the desired
        replication factor of 3
        """
        self.setup_system(start_processor=False)

        # Sleep to allow test to run for a bit
        time.sleep(sleep_time_secs)

        # Fail brokers
        self.fail_broker_type(failure_mode, broker_type)

        self.processor1.start()

        return self.collect_results(sleep_time_secs)

    @cluster(num_nodes=7)
    @matrix(failure_mode=["clean_shutdown", "hard_shutdown", "clean_bounce", "hard_bounce"],
            num_failures=[2])
    def test_many_brokers_bounce(self, failure_mode, num_failures):
        """
        Start a smoke test client, then kill a few brokers and ensure data is still received
        Record if records are delivered
        """
        self.setup_system()

        # Sleep to allow test to run for a bit
        time.sleep(120)

        # Fail brokers
        self.fail_many_brokers(failure_mode, num_failures)

        return self.collect_results(120)

    @cluster(num_nodes=7)
    @matrix(failure_mode=["clean_bounce", "hard_bounce"],
            num_failures=[3])
    def test_all_brokers_bounce(self, failure_mode, num_failures):
        """
        Start a smoke test client, then kill a few brokers and ensure data is still received
        Record if records are delivered
        """
        self.setup_system()

        # Sleep to allow test to run for a bit
        time.sleep(120)

        # Fail brokers
        self.fail_many_brokers(failure_mode, num_failures)

        return self.collect_results(120)
class StreamsBrokerBounceTest(Test):
    """
    Simple test of Kafka Streams with brokers failing.

    Every test brings up one ZooKeeper node and ``self.replication`` Kafka
    brokers, starts the smoke test driver and one smoke test client, signals
    one or more brokers, and returns a dict holding the marker lines grepped
    from the driver and client stdout files.
    """

    def __init__(self, test_context):
        super(StreamsBrokerBounceTest, self).__init__(test_context)
        self.replication = 3
        self.partitions = 3

        # All smoke test topics share the same settings, so build them from the
        # list of names instead of repeating the same literal for each topic.
        smoke_test_topics = ['echo', 'data', 'min', 'max', 'sum',
                             'dif', 'cnt', 'avg', 'wcnt', 'tagg']
        self.topics = {
            name: {'partitions': self.partitions,
                   'replication-factor': self.replication,
                   'configs': {"min.insync.replicas": 2}}
            for name in smoke_test_topics
        }
        # The offsets topic has its own partition count. test_all_brokers_bounce
        # overwrites this entry before the cluster is started.
        self.topics['__consumer_offsets'] = {
            'partitions': 50,
            'replication-factor': self.replication,
            'configs': {"min.insync.replicas": 2}
        }

    def fail_broker_type(self, failure_mode, broker_type):
        """
        Apply ``failure_mode`` to the broker of ``broker_type`` for a randomly chosen topic.

        :param failure_mode: key into the module-level ``failures`` mapping
                             (defined outside this class)
        :param broker_type: passed through to the selected failure function
        :raises KeyError: if ``failure_mode`` is not a key of ``failures``
        """
        # Pick a random topic and bounce its leader.
        # dict.keys() is a non-indexable view on Python 3, so materialize it
        # into a list before indexing.
        topic_names = list(self.topics.keys())
        topic = topic_names[randint(0, len(topic_names) - 1)]
        failures[failure_mode](self, topic, broker_type)

    def fail_many_brokers(self, failure_mode, num_failures):
        """
        Signal the first ``num_failures - 1`` brokers in ``self.kafka.nodes``.

        SIGTERM is sent when ``failure_mode`` is "clean_shutdown"; every other
        mode results in SIGKILL. The brokers are only signalled here, they are
        not restarted by this method.

        :param failure_mode: name of the failure mode selected by the test matrix
        :param num_failures: one more than the number of brokers that get signalled
        """
        # NOTE(review): "clean_bounce" also maps to SIGKILL here, and the loop
        # stops one short of num_failures. Both look deliberate (the comment in
        # test_all_brokers_bounce relies on one broker surviving) -- confirm
        # before changing either.
        sig = signal.SIGTERM if failure_mode == "clean_shutdown" else signal.SIGKILL

        for index in range(num_failures - 1):
            signal_node(self, self.kafka.nodes[index], sig)

    def confirm_topics_on_all_brokers(self, expected_topic_set):
        """
        Check whether every broker lists all of the expected topics.

        :param expected_topic_set: collection of topic names that must be present
        :return: True if each node in ``self.kafka.nodes`` reports every expected
                 topic, False as soon as one node is missing any of them
        """
        expected = set(expected_topic_set)
        for node in self.kafka.nodes:
            # kafka.list_topics() returns a python generator, so the values are
            # fetched lazily; drain it into a set before comparing.
            listed = set(self.kafka.list_topics(node=node))
            if not expected.issubset(listed):
                return False

        return True

    def setup_system(self, start_processor=True, num_threads=3):
        """
        Start ZooKeeper, Kafka, the smoke test driver and optionally the client.

        :param start_processor: when False, ``self.processor1`` is created but not
                                started, so the caller can start it later
        :param num_threads: forwarded to StreamsSmokeTestJobRunnerService
                            (presumably the number of stream threads -- see that service)
        """
        # Setup phase
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.zk.start()

        self.kafka = KafkaService(self.test_context, num_nodes=self.replication,
                                  zk=self.zk, topics=self.topics)
        self.kafka.start()

        # allow some time for topics to be created
        wait_until(lambda: self.confirm_topics_on_all_brokers(set(self.topics.keys())),
                   timeout_sec=60,
                   err_msg="Broker did not create all topics in 60 seconds ")

        # Start test harness
        self.driver = StreamsSmokeTestDriverService(self.test_context, self.kafka)
        self.processor1 = StreamsSmokeTestJobRunnerService(
            self.test_context, self.kafka, "at_least_once", num_threads)

        self.driver.start()

        if start_processor:
            self.processor1.start()

    def collect_results(self, sleep_time_secs):
        """
        Wait for the driver, stop the services and gather the result marker lines.

        :param sleep_time_secs: how long the test ran before brokers were failed;
                                0 selects the "client threw an exception" marker
        :return: dict with the last matching line for "Client closed",
                 "Records Delivered" and "Logic Success/Failure"
        """
        data = {}
        # End test
        self.driver.wait()
        self.driver.stop()

        self.processor1.stop()

        node = self.driver.node

        # Success is declared if streams does not crash when sleep time > 0
        # It should give an exception when sleep time is 0 since we kill the brokers immediately
        # and the topic manager cannot create internal topics with the desired replication factor
        if sleep_time_secs == 0:
            client_marker = "SMOKE-TEST-CLIENT-EXCEPTION"
        else:
            client_marker = "SMOKE-TEST-CLIENT-CLOSED"
        output_streams = self.processor1.node.account.ssh_capture(
            "grep %s %s" % (client_marker, self.processor1.STDOUT_FILE), allow_fail=False)

        # Only the last matching line of each grep is kept.
        for line in output_streams:
            data["Client closed"] = line

        # Currently it is hard to guarantee anything about Kafka since we don't have exactly once.
        # With exactly once in place, success will be defined as ALL-RECORDS-DELIVERED and SUCCESS
        output = node.account.ssh_capture(
            "grep -E 'ALL-RECORDS-DELIVERED|PROCESSED-MORE-THAN-GENERATED|PROCESSED-LESS-THAN-GENERATED' %s"
            % self.driver.STDOUT_FILE, allow_fail=False)
        for line in output:
            data["Records Delivered"] = line
        output = node.account.ssh_capture(
            "grep -E 'SUCCESS|FAILURE' %s" % self.driver.STDOUT_FILE, allow_fail=False)
        for line in output:
            data["Logic Success/Failure"] = line

        return data

    @cluster(num_nodes=7)
    @matrix(failure_mode=["clean_shutdown", "hard_shutdown", "clean_bounce", "hard_bounce"],
            broker_type=["leader", "controller"],
            num_threads=[1, 3],
            sleep_time_secs=[120])
    def test_broker_type_bounce(self, failure_mode, broker_type, sleep_time_secs, num_threads):
        """
        Start a smoke test client, then kill one particular broker and ensure data is still received
        Record if records are delivered.

        We also add a single thread stream client to make sure we could get all partitions reassigned in
        next generation so to verify the partition lost is correctly triggered.
        """
        self.setup_system(num_threads=num_threads)

        # Sleep to allow test to run for a bit
        time.sleep(sleep_time_secs)

        # Fail brokers
        self.fail_broker_type(failure_mode, broker_type)

        return self.collect_results(sleep_time_secs)

    @ignore
    @cluster(num_nodes=7)
    @matrix(failure_mode=["clean_shutdown"],
            broker_type=["controller"],
            sleep_time_secs=[0])
    def test_broker_type_bounce_at_start(self, failure_mode, broker_type, sleep_time_secs):
        """
        Start a smoke test client, then kill one particular broker immediately before streams starts
        Streams should throw an exception since it cannot create topics with the desired
        replication factor of 3
        """
        # The client is created but not started until after the broker has been failed.
        self.setup_system(start_processor=False)

        # Sleep to allow test to run for a bit
        time.sleep(sleep_time_secs)

        # Fail brokers
        self.fail_broker_type(failure_mode, broker_type)

        self.processor1.start()

        return self.collect_results(sleep_time_secs)

    @cluster(num_nodes=7)
    @matrix(failure_mode=["clean_shutdown", "hard_shutdown", "clean_bounce", "hard_bounce"],
            num_failures=[2])
    def test_many_brokers_bounce(self, failure_mode, num_failures):
        """
        Start a smoke test client, then kill a few brokers and ensure data is still received
        Record if records are delivered
        """
        self.setup_system()

        # Sleep to allow test to run for a bit
        time.sleep(120)

        # Fail brokers
        self.fail_many_brokers(failure_mode, num_failures)

        return self.collect_results(120)

    @cluster(num_nodes=7)
    @matrix(failure_mode=["clean_bounce", "hard_bounce"],
            num_failures=[3])
    def test_all_brokers_bounce(self, failure_mode, num_failures):
        """
        Start a smoke test client, then kill a few brokers and ensure data is still received
        Record if records are delivered
        """

        # Set min.insync.replicas to 1 because in the last stage of the test there is only one broker left.
        # Otherwise the last offset commit will never succeed and time out and potentially take longer as
        # duration passed to the close method of the Kafka Streams client.
        self.topics['__consumer_offsets'] = {
            'partitions': 50,
            'replication-factor': self.replication,
            'configs': {"min.insync.replicas": 1}
        }

        self.setup_system()

        # Sleep to allow test to run for a bit
        time.sleep(120)

        # Fail brokers
        self.fail_many_brokers(failure_mode, num_failures)

        return self.collect_results(120)