class BenchmarkTest(ProduceConsumeValidateTest):
    """
    A benchmark of Waltz producer/consumer performance.

    Each test builds a performance CLI command, runs it through
    ``run_produce_consume_validate`` and prints the summary section that is
    extracted from the command output.
    """
    MIN_CLUSTER_SPEC = ClusterSpec.from_list([
        {'cpu': 1, 'mem': '1GB', 'disk': '25GB', 'additional_disks': {'/dev/sdb': '100GB'}, 'num_nodes': 3},
        {'cpu': 1, 'mem': '3GB', 'disk': '15GB', 'num_nodes': 2},
        {'cpu': 1, 'mem': '1GB', 'disk': '25GB', 'num_nodes': 1}])

    def __init__(self, test_context):
        super(BenchmarkTest, self).__init__(test_context=test_context)

    @cluster(cluster_spec=MIN_CLUSTER_SPEC)
    @parametrize(txn_size=512, txn_per_thread=1000, num_thread=100, interval=10, lock_pool_size=0, num_active_partitions=1, timeout=360)
    @parametrize(txn_size=512, txn_per_thread=1000, num_thread=100, interval=20, lock_pool_size=0, num_active_partitions=1, timeout=360)
    @parametrize(txn_size=512, txn_per_thread=2000, num_thread=50, interval=10, lock_pool_size=0, num_active_partitions=1, timeout=360)
    @parametrize(txn_size=1024, txn_per_thread=1000, num_thread=100, interval=10, lock_pool_size=0, num_active_partitions=1, timeout=360)
    @parametrize(txn_size=512, txn_per_thread=100, num_thread=100, interval=10, lock_pool_size=64, num_active_partitions=1, timeout=360)
    @parametrize(txn_size=512, txn_per_thread=100, num_thread=100, interval=10, lock_pool_size=128, num_active_partitions=1, timeout=360)
    @parametrize(txn_size=512, txn_per_thread=100, num_thread=100, interval=10, lock_pool_size=128, num_active_partitions=2, timeout=360)
    def test_producer_performance(self, txn_size, txn_per_thread, num_thread, interval, lock_pool_size,
                                  num_active_partitions, timeout):
        """
        Run the producer performance command and print its report.

        All parameters except ``timeout`` are forwarded unchanged to
        ``performance_cli.producer_test_cmd``; ``timeout`` is forwarded to
        ``simple_validation_func``.
        """
        test_cmd = self.performance_cli.producer_test_cmd(self.log_file_path, txn_size, txn_per_thread, num_thread,
                                                          interval, lock_pool_size, num_active_partitions)
        test_output = self.run_produce_consume_validate(lambda: self.simple_validation_func(test_cmd, timeout))
        self.print_producer_performance(test_output)

    @cluster(cluster_spec=MIN_CLUSTER_SPEC)
    @parametrize(txn_size=512, num_txn=100000, num_active_partitions=1, timeout=360)
    @parametrize(txn_size=512, num_txn=100000, num_active_partitions=4, timeout=360)
    @parametrize(txn_size=1024, num_txn=100000, num_active_partitions=1, timeout=360)
    def test_consumer_performance(self, txn_size, num_txn, num_active_partitions, timeout):
        """
        Run the consumer performance command and print its report.

        All parameters except ``timeout`` are forwarded unchanged to
        ``performance_cli.consumer_test_cmd``; ``timeout`` is forwarded to
        ``simple_validation_func``.
        """
        test_cmd = self.performance_cli.consumer_test_cmd(self.log_file_path, txn_size, num_txn, num_active_partitions)
        test_output = self.run_produce_consume_validate(lambda: self.simple_validation_func(test_cmd, timeout))
        self.print_consumer_performance(test_output)

    def print_producer_performance(self, test_output):
        """
        Print the producer summary found in ``test_output``.

        :param test_output: Text returned by the producer performance run
        :raises AssertionError: If the output contains no producer summary
        """
        self._print_performance_report("PRODUCER", r".*transactions(.|\n)*MilliSec\/Transaction.*", test_output)

    def print_consumer_performance(self, test_output):
        """
        Print the consumer summary found in ``test_output``.

        :param test_output: Text returned by the consumer performance run
        :raises AssertionError: If the output contains no consumer summary
        """
        self._print_performance_report("CONSUMER", r".*transactions(.|\n)*MB/sec.*", test_output)

    def _print_performance_report(self, title, pattern, test_output):
        """
        Extract the span matching ``pattern`` from ``test_output`` and print it in a banner.

        :param title: Word placed in the banner header (e.g. "PRODUCER")
        :param pattern: Regular expression selecting the summary section
        :param test_output: Text to search
        :raises AssertionError: If ``pattern`` does not match ``test_output``
        """
        match = search(pattern, test_output)
        if match is None:
            # Fail with the raw output instead of an opaque AttributeError on None.group().
            raise AssertionError(
                "{} performance summary not found in test output:\n{}".format(title, test_output))
        print("\n####################### {title} PERFORMANCE REPORT #######################\n".format(title=title) +
              "\n{performance}\n".format(performance=match.group(0)) +
              "\n###########################################################################\n")
def check_from_list(self):
    """Check the string form of an empty ClusterSpec and of one built by ``ClusterSpec.from_list``."""
    # An empty spec renders as an empty list.
    assert "[]" == str(ClusterSpec.empty())

    two_small_nodes = {'cpu': 2, 'mem': '2GB', 'disk': '20GB', 'num_nodes': 2}
    four_large_nodes = {'cpu': 4, 'mem': '4GB', 'disk': '40GB', 'num_nodes': 4}
    spec = ClusterSpec.from_list([two_small_nodes, four_large_nodes])

    expected = (
        '[{"additional_disks(GB)": {}, "cpu": 2, "disk(GB)": 20.0, "mem(GB)": 2.0, "num_nodes": 2, "os": "linux"},'
        ' {"additional_disks(GB)": {}, "cpu": 4, "disk(GB)": 40.0, "mem(GB)": 4.0, "num_nodes": 4, "os": "linux"}]'
    )
    assert expected == str(spec)
class RecoveryTest(ProduceConsumeValidateTest):
    """
    Test Waltz recovery by running offline recovery with CLI tools,
    including recovering dirty replicas, bringing an offline replica back up, and so on.
    """
    MIN_CLUSTER_SPEC = ClusterSpec.from_list([
        {'cpu': 1, 'mem': '1GB', 'disk': '25GB', 'additional_disks': {'/dev/sdb': '100GB'}, 'num_nodes': 3},
        {'cpu': 1, 'mem': '3GB', 'disk': '15GB', 'num_nodes': 2},
        {'cpu': 1, 'mem': '1GB', 'disk': '25GB', 'num_nodes': 1},
    ])

    def __init__(self, test_context):
        super(RecoveryTest, self).__init__(test_context=test_context)

    @cluster(cluster_spec=MIN_CLUSTER_SPEC)
    @parametrize(num_active_partitions=1, txn_per_client=250, num_clients=1, interval=100, timeout=240)
    def test_recover_dirty_replica(self, num_active_partitions, txn_per_client, num_clients, interval, timeout):
        """Recover storage node 2 from storage node 0 while a client produces transactions."""
        src_replica_idx, dst_replica_idx = 0, 2

        def validate():
            return self.recover_dirty_replica(src_replica_idx, dst_replica_idx, num_active_partitions,
                                              txn_per_client, num_clients, interval, timeout)

        self.run_produce_consume_validate(validate)

    @cluster(cluster_spec=MIN_CLUSTER_SPEC)
    @parametrize(num_active_partitions=1, txn_per_client=250, num_clients=1, interval=100, timeout=240)
    def test_bring_replica_back_online(self, num_active_partitions, txn_per_client, num_clients, interval, timeout):
        """Take storage node 0 offline and online again while a client produces transactions."""
        offline_replica_idx = 0

        def validate():
            return self.bring_replica_back_online(offline_replica_idx, num_active_partitions,
                                                  txn_per_client, num_clients, interval, timeout)

        self.run_produce_consume_validate(validate)

    def recover_dirty_replica(self, src_replica_idx, dst_replica_idx, num_active_partitions,
                              txn_per_client, num_clients, interval, timeout):
        """
        A validate function to test offline recovery of a dirty replica.

        :param src_replica_idx: The index of source replica, where new replica recovers from
        :param dst_replica_idx: The index of destination replica
        :param num_active_partitions: Number of active partitions
        :param txn_per_client: Number of transactions per client
        :param num_clients: Number of total clients
        :param interval: Average interval(millisecond) between transactions
        :param timeout: Test timeout
        """
        storage_port = self.waltz_storage.port
        storage_admin_port = self.waltz_storage.admin_port

        source_hostname = self.waltz_storage.nodes[src_replica_idx].account.ssh_hostname
        source = self.get_host(source_hostname, storage_admin_port)

        target_hostname = self.waltz_storage.nodes[dst_replica_idx].account.ssh_hostname
        target = self.get_host(target_hostname, storage_admin_port)

        partition = randrange(num_active_partitions)

        # Step 1: Submit transactions to all replicas.
        validation_cmd = self.client_cli.validate_txn_cmd(num_active_partitions, txn_per_client,
                                                          num_clients, interval)
        self.verifiable_client.start(validation_cmd)

        def first_transaction_stored():
            return self.is_max_transaction_id_updated(source, storage_port, partition, -1)

        wait_until(first_transaction_stored, timeout_sec=timeout)

        # Step 2: Mark destination replica offline for reads and writes
        self.storage_set_availability(storage=target, partition=partition, online=False)

        # Step 3: Trigger recovery to update source replicas' low watermark.
        self.trigger_recovery(bounce_node_idx=src_replica_idx)
        wait_until(lambda: self.is_triggered_recovery_completed(), timeout_sec=timeout)
        expected_max_transaction_id = self.get_storage_local_low_watermark(
            self.get_host(source_hostname, storage_admin_port), partition)

        # Step 4: Run recovery operation on offline replica.
        # Source replica's partition low watermark will be used as target for recovery.
        self.storage_recover_partition(source_storage=source,
                                       destination_storage=target,
                                       destination_storage_port=storage_port,
                                       partition=partition,
                                       batch_size=20)

        # Step 5: Check if destination replica catch up with source replica.
        actual_max_transaction_id = self.get_storage_max_transaction_id(
            self.get_host(target_hostname, storage_admin_port), storage_port, partition, True)
        assert expected_max_transaction_id == actual_max_transaction_id, \
            "partition recovery failed on storage {}, expected max transaction ID = {}, actual max transaction ID = {}" \
            .format(target_hostname, expected_max_transaction_id, actual_max_transaction_id)

        # Step 6: Wait until validation complete.
        wait_until(lambda: self.verifiable_client.task_complete() == True,
                   timeout_sec=timeout,
                   err_msg="verifiable_client failed to complete task in %d seconds." % timeout)

    def bring_replica_back_online(self, offline_replica_idx, num_active_partitions,
                                  txn_per_client, num_clients, interval, timeout):
        """
        A validate function to test if a replica can successfully recover when brought back online.

        :param offline_replica_idx: The index of offline replica
        :param num_active_partitions: Number of active partitions
        :param txn_per_client: Number of transactions per client
        :param num_clients: Number of total clients
        :param interval: Average interval(millisecond) between transactions
        :param timeout: Test timeout
        """
        storage_admin_port = self.waltz_storage.admin_port
        replica_hostname = self.waltz_storage.nodes[offline_replica_idx].account.ssh_hostname
        partition = randrange(num_active_partitions)

        # Step 1: Produce a number of transactions.
        validation_cmd = self.client_cli.validate_txn_cmd(num_active_partitions, txn_per_client,
                                                          num_clients, interval)
        self.verifiable_client.start(validation_cmd)

        # Step 2: Mark the storage node offline for reads and writes.
        replica = self.get_host(replica_hostname, storage_admin_port)
        self.storage_set_availability(storage=replica, partition=partition, online=False)
        session_id_while_offline = self.get_storage_session_id(
            self.get_host(replica_hostname, storage_admin_port), partition)

        # Step 3: Mark storage node online. Wait until recovery is completed.
        self.storage_set_availability(storage=replica, partition=partition, online=True)
        wait_until(lambda: self.is_triggered_recovery_completed(), timeout_sec=timeout)

        # Step 4: Check if storage node's session ID bumps up by 1.
        session_id_after_online = self.get_storage_session_id(replica, partition)
        assert session_id_after_online == session_id_while_offline + 1, \
            "recovery failed to complete on storage {}, expected session ID = {}, actual session ID = {}" \
            .format(replica_hostname, session_id_while_offline + 1, session_id_after_online)

        # Step 5: Wait until all transactions appended.
        wait_until(lambda: self.verifiable_client.task_complete() == True,
                   timeout_sec=timeout,
                   err_msg="verifiable_client failed to complete task in %d seconds." % timeout)
class SmokeTest(ProduceConsumeValidateTest):
    """
    A class of torture tests that turns on a bunch of ZK, storage, server, and client nodes.
    Transactions are fired while things are turned off and on, to ensure Waltz can recover
    from expected failure.
    """
    MIN_CLUSTER_SPEC = ClusterSpec.from_list([
        {'cpu': 1, 'mem': '1GB', 'disk': '25GB', 'additional_disks': {'/dev/sdb': '100GB'}, 'num_nodes': 3},
        {'cpu': 1, 'mem': '3GB', 'disk': '15GB', 'num_nodes': 2},
        {'cpu': 1, 'mem': '1GB', 'disk': '25GB', 'num_nodes': 1},
    ])

    def __init__(self, test_context):
        super(SmokeTest, self).__init__(test_context=test_context)

    @cluster(cluster_spec=MIN_CLUSTER_SPEC)
    @parametrize(num_active_partitions=1, txn_per_client=500, num_clients=10, interval=120, timeout=240)
    @parametrize(num_active_partitions=4, txn_per_client=500, num_clients=10, interval=120, timeout=240)
    def test_produce_consume_no_torture(self, num_active_partitions, txn_per_client, num_clients, interval, timeout):
        """Run the transaction validation command with no failure injected."""
        cmd = self.client_cli.validate_txn_cmd(num_active_partitions, txn_per_client, num_clients, interval)

        def validate():
            return self.simple_validation_func(cmd, timeout)

        self.run_produce_consume_validate(validate)

    @cluster(cluster_spec=MIN_CLUSTER_SPEC)
    @parametrize(num_active_partitions=1, txn_per_client=500, num_clients=10, interval=120, timeout=480)
    @parametrize(num_active_partitions=4, txn_per_client=500, num_clients=10, interval=120, timeout=480)
    def test_produce_consume_while_bouncing_storage_nodes(self, num_active_partitions, txn_per_client,
                                                          num_clients, interval, timeout):
        """Run the validation command while storage nodes are bounced; fail if the result mentions an exception."""
        cmd = self.client_cli.validate_txn_cmd(num_active_partitions, txn_per_client, num_clients, interval)

        def validate():
            return self.simple_validation_func(cmd, timeout)

        def torture():
            return self._bounce_storage_nodes(3)

        result = self.run_produce_consume_validate(validate, torture)
        assert "exception" not in result.lower(), "Test failed with exception:\n{}".format(result)

    @cluster(cluster_spec=MIN_CLUSTER_SPEC)
    @parametrize(num_active_partitions=1, txn_per_client=500, num_clients=2, interval=120, timeout=240)
    @parametrize(num_active_partitions=4, txn_per_client=500, num_clients=2, interval=120, timeout=240)
    def test_produce_consume_while_killing_a_server_node(self, num_active_partitions, txn_per_client,
                                                         num_clients, interval, timeout):
        """Run the validation command while one server node is stopped."""
        cmd = self.client_cli.validate_txn_cmd(num_active_partitions, txn_per_client, num_clients, interval)

        def validate():
            return self.simple_validation_func(cmd, timeout)

        def torture():
            return self._kill_a_server_node(num_active_partitions)

        self.run_produce_consume_validate(validate, torture)

    def _bounce_storage_nodes(self, interval):
        """
        Start a NodeBounceScheduler on the storage service.

        :param interval: Value handed to the scheduler as its ``interval``
        """
        scheduler = NodeBounceScheduler(service=self.waltz_storage,
                                        interval=interval,
                                        stop_condition=lambda: self.verifiable_client.task_complete())
        scheduler.start()

    def _kill_a_server_node(self, num_active_partitions):
        """
        Start a NodeBounceScheduler that idles once and then stops the server node
        returned by ``get_server_node_idx`` for a randomly chosen active partition.

        :param num_active_partitions: Number of active partitions
        """
        target_idx = self.get_server_node_idx(randrange(num_active_partitions))
        idle_step = {"action": NodeBounceScheduler.IDLE}
        stop_step = {"action": NodeBounceScheduler.STOP_A_NODE, "node": target_idx}
        scheduler = NodeBounceScheduler(service=self.waltz_server,
                                        interval=3,
                                        stop_condition=lambda: self.verifiable_client.task_complete(),
                                        iterable_cmd_list=iter([idle_step, stop_step]))
        scheduler.start()
class LoadRebalancingTest(ProduceConsumeValidateTest):
    """
    Class for tests simulating load rebalancing on server nodes
    """
    MIN_CLUSTER_SPEC = ClusterSpec.from_list([
        {'cpu': 1, 'mem': '1GB', 'disk': '25GB', 'additional_disks': {'/dev/sdb': '100GB'}, 'num_nodes': 3},
        {'cpu': 1, 'mem': '3GB', 'disk': '15GB', 'num_nodes': 2},
        {'cpu': 1, 'mem': '1GB', 'disk': '25GB', 'num_nodes': 1},
    ])

    def __init__(self, test_context):
        super(LoadRebalancingTest, self).__init__(test_context=test_context)
        self.server_cli = ServerCli(self.verifiable_client.nodes[0], self.client_config_path)

    @cluster(cluster_spec=MIN_CLUSTER_SPEC)
    @parametrize(num_active_partitions=1, txn_per_client=150, num_clients=2, interval=600, timeout=360)
    @parametrize(num_active_partitions=4, txn_per_client=100, num_clients=2, interval=600, timeout=360)
    def test_rebalancing_on_server_nodes(self, num_active_partitions, txn_per_client, num_clients, interval, timeout):
        """
        Move a partition to another server node while clients produce transactions and
        check that ``txn_per_client * num_clients`` transactions end up in storage.
        """
        validation_cmd = self.client_cli.validate_txn_cmd(self.log_file_path, num_active_partitions,
                                                          txn_per_client, num_clients, interval)
        self.run_produce_consume_validate(
            lambda: self.rebalancing_on_server_nodes(validation_cmd, timeout, num_active_partitions,
                                                     txn_per_client * num_clients))

    def rebalancing_on_server_nodes(self, validation_cmd, timeout, num_active_partitions,
                                    expected_number_of_transactions):
        """
        A validate function to simulate load rebalancing on server nodes using preferred-partitions.

        :param validation_cmd: Command that is passed to client node
        :param timeout: Test timeout
        :param num_active_partitions: number of active partitions
        :param expected_number_of_transactions: number of transactions expected to be stored on a waltz storage node
        :raises AssertionError: If there is no other server node to move the partition to, if the
                                partition assignment did not change, or if the stored transaction
                                count differs from the expected one
        """
        storage_admin_port = self.waltz_storage.admin_port
        server_port = self.waltz_server.port
        storage_port = self.waltz_storage.port
        storage_node = self.waltz_storage.nodes[randrange(len(self.waltz_storage.nodes))]
        random_active_partition = randrange(num_active_partitions)
        storage = self.get_host(storage_node.account.ssh_hostname, storage_admin_port)

        # Step 1: Start waltz cluster and execute validation command
        self.verifiable_client.start(validation_cmd)

        # Step 2: Wait till transactions get processed
        wait_until(lambda: self.is_max_transaction_id_updated(storage, storage_port, random_active_partition, -1),
                   timeout_sec=timeout)

        # Step 3: Change assigned server node for random_active_partition
        partition_current_server_index = self.get_server_node_idx(random_active_partition)
        # Pick uniformly among the *other* server nodes. Sampling until a different index
        # shows up would never terminate when the cluster has a single server node.
        candidate_server_indexes = [idx for idx in range(len(self.waltz_server.nodes))
                                    if idx != partition_current_server_index]
        if not candidate_server_indexes:
            raise AssertionError("rebalancing requires at least two server nodes, found {}"
                                 .format(len(self.waltz_server.nodes)))
        partition_future_server_index = candidate_server_indexes[randrange(len(candidate_server_indexes))]

        server_node = self.waltz_server.nodes[partition_future_server_index]
        server_node_hostname = server_node.account.ssh_hostname
        server = self.get_host(server_node_hostname, server_port)
        self.server_cli.add_preferred_partition(server, random_active_partition)

        assert self.get_server_node_idx(random_active_partition) == partition_future_server_index, \
            "partition assignment for server nodes remained untouched"

        # Step 4: Wait until verifiable client ends its task
        wait_until(lambda: self.verifiable_client.task_complete() == True,
                   timeout_sec=timeout,
                   err_msg="verifiable_client failed to complete task in {} seconds.".format(timeout))

        # Step 5: Assert all transactions are persistently stored.
        # Query storage once so the failure message reports exactly the value that was compared.
        stored_number_of_transactions = self.get_storage_num_of_all_transactions(
            storage, storage_port, num_active_partitions)
        assert expected_number_of_transactions == stored_number_of_transactions, \
            "number of transactions stored in storage partition does not match with all the transactions sent by producers. " \
            "Client {}, Storage = {}" \
            .format(expected_number_of_transactions, stored_number_of_transactions)
class ClientValidationTest(ProduceConsumeValidateTest):
    """
    A class of waltz tests where each client (producer/consumer) in waltz cluster is run as a single process.
    """
    MIN_CLUSTER_SPEC = ClusterSpec.from_list([
        {'cpu': 1, 'mem': '1GB', 'disk': '25GB', 'additional_disks': {'/dev/sdb': '100GB'}, 'num_nodes': 3},
        {'cpu': 1, 'mem': '3GB', 'disk': '15GB', 'num_nodes': 2},
        {'cpu': 1, 'mem': '1GB', 'disk': '25GB', 'num_nodes': 1},
    ])

    def __init__(self, test_context):
        super(ClientValidationTest, self).__init__(test_context=test_context)

    @cluster(cluster_spec=MIN_CLUSTER_SPEC)
    @parametrize(num_active_partitions=1, txn_per_client=75, num_producers=3, num_consumers=2, interval=500, timeout=360)
    @parametrize(num_active_partitions=4, txn_per_client=100, num_producers=2, num_consumers=2, interval=250, timeout=360)
    def test_produce_consume_no_torture(self, num_active_partitions, txn_per_client, num_producers,
                                        num_consumers, interval, timeout):
        """Run producers and consumers in parallel and expect every process to exit successfully."""
        validation_cmd = self.get_produce_consume_parallel(num_active_partitions, txn_per_client,
                                                           num_producers, num_consumers, interval)
        self.run_produce_consume_validate(lambda: self.produce_consume_no_torture(validation_cmd, timeout))

    @cluster(cluster_spec=MIN_CLUSTER_SPEC)
    @parametrize(num_active_partitions=1, txn_per_client=200, num_producers=3, num_consumers=2, interval=500,
                 num_consumers_to_stop=1, delay_before_torture=40, timeout=360)
    @parametrize(num_active_partitions=4, txn_per_client=100, num_producers=2, num_consumers=4, interval=500,
                 num_consumers_to_stop=3, delay_before_torture=35, timeout=360)
    def test_produce_consume_abrupt_stop_of_consumers(self, num_active_partitions, txn_per_client, num_producers,
                                                      num_consumers, interval, num_consumers_to_stop,
                                                      delay_before_torture, timeout):
        """Run producers and consumers in parallel and kill some of the consumers mid-run."""
        validation_cmd = self.get_produce_consume_parallel(num_active_partitions, txn_per_client,
                                                           num_producers, num_consumers, interval)
        self.run_produce_consume_validate(
            lambda: self.produce_consume_consumer_torture(validation_cmd, timeout,
                                                          txn_per_client * num_producers, num_active_partitions,
                                                          num_consumers_to_stop, delay_before_torture,
                                                          num_consumers))

    def get_num_failed_processes_cmd(self):
        """Return a shell snippet that waits for all background jobs and echoes how many of them failed."""
        return "fail=0; for job in `jobs -p`; do wait $job || let \"fail+=1\"; done ; echo \"number of failed processes: $fail\""

    def get_produce_consume_parallel(self, num_active_partitions, txn_per_client, num_producers,
                                     num_consumers, interval):
        """
        Build one shell command that starts all producers and consumers as background jobs
        and finishes with the failed-process counter.

        :param num_active_partitions: Number of active partitions
        :param txn_per_client: Number of transactions per producer
        :param num_producers: Number of producer processes to start
        :param num_consumers: Number of consumer processes to start
        :param interval: Value forwarded to ``client_cli.create_producer_cmd`` as the interval
        :returns: The combined command string
        """
        commands = [self.client_cli.create_producer_cmd(self.log_file_path, txn_per_client, interval,
                                                        num_active_partitions)
                    for _ in range(num_producers)]
        commands.extend(self.client_cli.create_consumer_cmd(self.log_file_path, txn_per_client * num_producers,
                                                            num_active_partitions)
                        for _ in range(num_consumers))
        commands.append(self.get_num_failed_processes_cmd())
        # " & " after each client command sends it to the background of the same shell.
        return " & ".join(commands)

    def _get_num_failed_processes(self):
        """
        Parse the failed-process count out of the verifiable client's validation result.

        :returns: The count as an int
        :raises AssertionError: If the validation result does not contain the count
        """
        validation_result = self.verifiable_client.get_validation_result()
        match = re.search(r'failed processes: (\d+)', validation_result)
        if match is None:
            raise AssertionError(
                "number of failed processes not found in validation result:\n{}".format(validation_result))
        return int(match.group(1))

    def produce_consume_no_torture(self, validation_cmd, timeout):
        """
        A validate function to test producers and consumers running in parallel.

        :param validation_cmd: Command that is passed to client node
        :param timeout: Test timeout
        """
        # Step 1: Start waltz cluster and execute validation command
        self.verifiable_client.start(validation_cmd)

        # Step 2: Wait until verifiable client ends its task
        wait_until(lambda: self.verifiable_client.task_complete() == True,
                   timeout_sec=timeout,
                   err_msg=lambda: "verifiable_client failed to complete task in {} seconds. Number of producers still running: {}, "
                                   "number of consumers still running: {}".format(
                                       timeout, self.number_of_running_producers(),
                                       self.number_of_running_consumers()))

        # Step 3: Verify child processes of a main process finished successfully (exit code 0 received)
        num_failed_processes = self._get_num_failed_processes()
        assert num_failed_processes == 0, "number of failed processes: {}".format(num_failed_processes)

    def number_of_running_producers(self):
        """Return the number of process command lines on the first client node matching the create-producer pattern."""
        return int(self.verifiable_client.nodes[0].account.ssh_output(
            "ps -eo command | grep -c \"^java .* create-producer\" | cat").strip())

    def number_of_running_consumers(self):
        """Return the number of process command lines on the first client node matching the create-consumer pattern."""
        return int(self.verifiable_client.nodes[0].account.ssh_output(
            "ps -eo command | grep -c \"^java .* create-consumer\" | cat").strip())

    def produce_consume_consumer_torture(self, validation_cmd, timeout, expected_number_of_transactions,
                                         num_active_partitions, num_consumers_to_stop, delay_before_torture,
                                         num_consumers):
        """
        A validate function to test producers and consumers running in parallel with a torture test
        that kills consumer client nodes.

        :param validation_cmd: Command that is passed to client node
        :param timeout: Test timeout
        :param expected_number_of_transactions: The expected number of created transaction during this test
        :param num_active_partitions: Number of active partitions
        :param num_consumers_to_stop: Number of consumers to stop with kill SIGTERM command
        :param delay_before_torture: The delay in seconds between first transaction being processed and beginning of the torture test
        :param num_consumers: Total number of consumer clients
        """
        admin_port = self.waltz_storage.admin_port
        port = self.waltz_storage.port
        node = self.waltz_storage.nodes[randrange(len(self.waltz_storage.nodes))]
        storage = self.get_host(node.account.ssh_hostname, admin_port)
        random_active_partition = randrange(num_active_partitions)

        # Step 1: Start waltz cluster and execute validation command
        self.verifiable_client.start(validation_cmd)

        # Step 2: Wait till transactions get processed
        wait_until(lambda: self.is_max_transaction_id_updated(storage, port, random_active_partition, -1),
                   timeout_sec=timeout)

        # Step 3: Wait before starting torture test
        sleep(delay_before_torture)

        # Step 4: Start torture
        self.abrupt_stop_consumer_nodes(num_consumers, num_consumers_to_stop)

        # Step 5: Wait until verifiable client ends its task
        wait_until(lambda: self.verifiable_client.task_complete() == True,
                   timeout_sec=timeout,
                   err_msg="verifiable_client failed to complete task in {} seconds.".format(timeout))

        # Step 6: Verify child processes of a main process finished as expected
        # (num_consumers_to_stop exited with other code than 0)
        num_failed_processes = self._get_num_failed_processes()
        assert num_failed_processes == num_consumers_to_stop, "number of failed processes: {}, expected: {}".format(
            num_failed_processes, num_consumers_to_stop)

        # Step 7: Assert all transactions are persistently stored.
        # Query storage once so the failure message reports exactly the value that was compared.
        stored_number_of_transactions = self.get_storage_num_of_all_transactions(
            storage, port, num_active_partitions)
        assert expected_number_of_transactions == stored_number_of_transactions, \
            "number of transactions stored in storage partition does not match with all the transactions sent by producers. " \
            "Client {}, Storage = {}" \
            .format(expected_number_of_transactions, stored_number_of_transactions)

    def abrupt_stop_consumer_nodes(self, num_consumers, num_consumers_to_stop):
        """
        Send SIGTERM to randomly selected consumer processes on the first client node.

        :param num_consumers: Total number of consumer clients
        :param num_consumers_to_stop: Number of consumers to stop
        :raises AssertionError: If fewer consumer processes are found than should be stopped
        """
        get_consumer_pids = "ps -ef | grep \"create-consumer\" | awk \'{{print $2}}\' | head -n -1 | sort -n"
        consumer_pids = self.verifiable_client.nodes[0].account.ssh_output(get_consumer_pids).strip().splitlines()
        # remove parent process and grep process pid's i.e. 1st and last
        consumer_process_pids = consumer_pids[1:num_consumers + 1]
        if len(consumer_process_pids) < num_consumers_to_stop:
            raise AssertionError("found {} consumer processes, cannot stop {}"
                                 .format(len(consumer_process_pids), num_consumers_to_stop))
        shuffle(consumer_process_pids)

        # Select the consumer pids from the beginning of the shuffled list to kill the respective consumer process.
        for pid in consumer_process_pids[:num_consumers_to_stop]:
            self.verifiable_client.nodes[0].account.ssh("kill -SIGTERM {}".format(pid))
class ConnectionInterruptionTest(ProduceConsumeValidateTest):
    """
    Class for tests simulating network issues corresponding to Waltz Server and Waltz Storage nodes
    """
    MIN_CLUSTER_SPEC = ClusterSpec.from_list([
        {'cpu': 1, 'mem': '1GB', 'disk': '25GB', 'additional_disks': {'/dev/sdb': '100GB'}, 'num_nodes': 3},
        {'cpu': 1, 'mem': '3GB', 'disk': '15GB', 'num_nodes': 2},
        {'cpu': 1, 'mem': '1GB', 'disk': '25GB', 'num_nodes': 1},
    ])

    def __init__(self, test_context):
        super(ConnectionInterruptionTest, self).__init__(test_context=test_context)

    @cluster(cluster_spec=MIN_CLUSTER_SPEC)
    @parametrize(num_active_partitions=1, txn_per_client=150, num_clients=2, interval=600, timeout=360,
                 interrupt_duration=10, num_interruptions=3, delay_between_interruptions=25)
    @parametrize(num_active_partitions=4, txn_per_client=100, num_clients=2, interval=1000, timeout=300,
                 interrupt_duration=20, num_interruptions=1, delay_between_interruptions=20)
    def test_client_server_network_interruption(self, num_active_partitions, txn_per_client, num_clients, interval,
                                                timeout, interrupt_duration, num_interruptions,
                                                delay_between_interruptions):
        """Repeatedly block traffic to a server node's port while clients produce transactions."""
        validation_cmd = self.client_cli.validate_txn_cmd(self.log_file_path, num_active_partitions,
                                                          txn_per_client, num_clients, interval)
        # interval is in milliseconds, processing_duration in seconds. Divide by a float so the
        # result is not truncated to 0 by integer division on Python 2.
        processing_duration = interval / 1000.0
        self.run_produce_consume_validate(
            lambda: self.client_server_network_interruption(validation_cmd, timeout, interrupt_duration,
                                                            num_interruptions, delay_between_interruptions,
                                                            num_active_partitions, processing_duration))

    @cluster(cluster_spec=MIN_CLUSTER_SPEC)
    @parametrize(num_active_partitions=1, txn_per_client=200, num_clients=1, interval=100, timeout=240,
                 interrupt_duration=5, num_of_nodes_to_bounce=2)
    @parametrize(num_active_partitions=4, txn_per_client=200, num_clients=2, interval=100, timeout=240,
                 interrupt_duration=5, num_of_nodes_to_bounce=2)
    @parametrize(num_active_partitions=1, txn_per_client=200, num_clients=1, interval=100, timeout=240,
                 interrupt_duration=5, num_of_nodes_to_bounce=1)
    @parametrize(num_active_partitions=4, txn_per_client=200, num_clients=2, interval=100, timeout=300,
                 interrupt_duration=5, num_of_nodes_to_bounce=1)
    def test_storage_node_network_interruption(self, num_active_partitions, txn_per_client, num_clients, interval,
                                               timeout, interrupt_duration, num_of_nodes_to_bounce):
        """Block traffic to the storage port of some storage nodes while clients produce transactions."""
        validation_cmd = self.client_cli.validate_txn_cmd(self.log_file_path, num_active_partitions,
                                                          txn_per_client, num_clients, interval)
        self.run_produce_consume_validate(
            lambda: self.storage_node_network_interruption(validation_cmd, num_active_partitions, txn_per_client,
                                                           num_clients, timeout, interrupt_duration,
                                                           num_of_nodes_to_bounce))

    def drop_traffic_to_port(self, node, port):
        """Insert an iptables rule on ``node`` that drops incoming TCP traffic to ``port``."""
        node.account.ssh_capture("sudo iptables -I INPUT -p tcp --destination-port {} -j DROP".format(port))

    def enable_traffic_to_port(self, node, port):
        """Delete the iptables rule on ``node`` that drops incoming TCP traffic to ``port``."""
        node.account.ssh_capture("sudo iptables -D INPUT -p tcp --destination-port {} -j DROP".format(port))

    def client_server_network_interruption(self, validation_cmd, timeout, interrupt_duration, num_interruptions,
                                           delay_between_interruptions, num_active_partitions, processing_duration):
        """
        Set up waltz and interrupt network between a waltz client node and a server node.

        :param validation_cmd: The command that is send to ClientCli
        :param timeout: Test timeout
        :param interrupt_duration: Interval in seconds during which client won't be able to connect to server
        :param num_interruptions: Number of connection interruption cycles
        :param delay_between_interruptions: Interval in seconds that represents duration between network interruptions
        :param num_active_partitions: Number of active partitions
        :param processing_duration: Time in seconds within which it is safe to assume that processed transactions gets stored
        """
        partition = randrange(num_active_partitions)
        node_idx = self.get_server_node_idx(partition)

        # Start waltz cluster and wait until a storage node registers first transaction
        self.verifiable_client.start(validation_cmd)
        admin_port = self.waltz_storage.admin_port
        port = self.waltz_storage.port
        storage = self.get_host(self.waltz_storage.nodes[0].account.ssh_hostname, admin_port)
        wait_until(lambda: self.is_max_transaction_id_updated(storage, port, partition, -1), timeout_sec=timeout)

        node = self.waltz_server.nodes[node_idx]
        for _ in range(num_interruptions):
            sleep(delay_between_interruptions)
            try:
                # disable connection on port
                self.drop_traffic_to_port(node, self.waltz_server.port)

                # verify that during network interruption number of stored transactions didn't increase
                # sleep time to assure that transactions from server are propagated to storage nodes
                sleep(processing_duration)
                cur_high_watermark = self.get_storage_max_transaction_id(storage, port, partition)
                sleep(max(interrupt_duration - processing_duration, 0))
                assert not self.is_max_transaction_id_updated(storage, port, partition, cur_high_watermark), \
                    'Network interruption failed, newly stored transactions detected'
            finally:
                # delete the added iptable rule as it is not removed with the end of waltz process
                # and could persist on waltz-server VM, if process is signaled to end (^C) or
                # an exception is thrown during execution of the try block.
                self.enable_traffic_to_port(node, self.waltz_server.port)

        wait_until(lambda: self.verifiable_client.task_complete() == True,
                   timeout_sec=timeout,
                   err_msg="verifiable_client failed to complete task in %d seconds." % timeout)

    class StorageNodeInfo:
        """
        Representation of a storage node and number of transactions stored on a storage node
        """
        def __init__(self, node):
            self.node = node
            self.total_num_of_transactions = 0

    def _wait_for_stored_transactions(self, storage, port, num_active_partitions,
                                      expected_number_of_transactions, timeout):
        """
        Wait until ``storage`` reports exactly ``expected_number_of_transactions`` transactions.

        The failure message is built lazily, so it reports the count observed when the wait
        gives up rather than the count from before the wait started.
        """
        def stored_number_of_transactions():
            return self.get_storage_num_of_all_transactions(storage, port, num_active_partitions)

        wait_until(lambda: expected_number_of_transactions == stored_number_of_transactions(),
                   timeout_sec=timeout,
                   err_msg=lambda: "number of transactions stored in storage partition does not match with all the transactions sent by client. "
                                   "Client {}, Storage = {} after {} seconds"
                                   .format(expected_number_of_transactions, stored_number_of_transactions(), timeout))

    def storage_node_network_interruption(self, validation_cmd, num_active_partitions, txn_per_client, num_clients,
                                          timeout, interrupt_duration, num_of_nodes_to_bounce):
        """
        A validate function to test bouncing network connection between server and storage. Verification of
        correctness is done by comparing expected number of stored transactions with current transactions
        in the Waltz Cluster.

        :param validation_cmd: The command that is send to ClientCli
        :param num_active_partitions: Number of active partitions
        :param txn_per_client: Number of transactions per client
        :param num_clients: Number of waltz clients
        :param timeout: Test timeout
        :param interrupt_duration: Duration (seconds, passed to ``sleep``) of communication interruption
                                   between server and storage
        :param num_of_nodes_to_bounce: Number of storage nodes to bounce. This may affect the quorum
        """
        bounced_nodes = [self.StorageNodeInfo(self.waltz_storage.nodes[node_number])
                         for node_number in sample(range(len(self.waltz_storage.nodes)), num_of_nodes_to_bounce)]
        admin_port = self.waltz_storage.admin_port
        port = self.waltz_storage.port

        # Step 1: Get current sum of transactions across partitions
        for bounced_node_info in bounced_nodes:
            for partition in range(num_active_partitions):
                # max transaction ID + 1 is the number of transactions; clamp so an empty
                # partition (negative ID) contributes 0.
                bounced_node_info.total_num_of_transactions += max(
                    0,
                    self.get_storage_max_transaction_id(
                        self.get_host(bounced_node_info.node.account.ssh_hostname, admin_port),
                        port, partition) + 1)

        # Step 2: Submit transactions to all replicas.
        self.verifiable_client.start(validation_cmd)
        wait_until(lambda: self.is_max_transaction_id_updated(
                       self.get_host(bounced_nodes[0].node.account.ssh_hostname, admin_port),
                       port, randrange(num_active_partitions), -1),
                   timeout_sec=timeout)

        try:
            # Step 3: Interrupt connection
            for bounced_node_info in bounced_nodes:
                self.drop_traffic_to_port(bounced_node_info.node, port)

            # Step 4: Verify that storage port is closed
            for bounced_node_info in bounced_nodes:
                partition = randrange(num_active_partitions)
                # RemoteCommandError raised when get_storage_max_transaction_id request fails
                # because connection to the storage node port is blocked
                try:
                    self.get_storage_max_transaction_id(
                        self.get_host(bounced_node_info.node.account.ssh_hostname, admin_port),
                        port, partition)
                    raise AssertionError(
                        "Network interruption failed. get_storage_max_transaction_id didn't return RemoteCommandError")
                except RemoteCommandError:
                    pass

            sleep(interrupt_duration)
        finally:
            # Step 5: Enable connection, Do this step even when the Step 4 fails, as the added iptable
            # rules aren't removed from VM with the end of this process
            for bounced_node_info in bounced_nodes:
                self.enable_traffic_to_port(bounced_node_info.node, port)

        # Step 6: Verify that total number of expected transactions matches number of transactions
        # stored in waltz storage nodes
        for bounced_node_info in bounced_nodes:
            expected_number_of_transactions = (txn_per_client * num_clients) \
                + bounced_node_info.total_num_of_transactions
            storage = self.get_host(bounced_node_info.node.account.ssh_hostname, admin_port)
            self._wait_for_stored_transactions(storage, port, num_active_partitions,
                                               expected_number_of_transactions, timeout)
class ScalabilityTest(ProduceConsumeValidateTest):
    """
    Exercise Waltz scalability by changing the number of storage replicas
    that serve a partition.
    """
    # Three node groups. The first one (4 nodes with an extra 100GB disk)
    # presumably hosts the storage nodes, since __init__ asks for
    # num_storage_nodes=4 -- TODO confirm the role of the other two groups.
    MIN_CLUSTER_SPEC = ClusterSpec.from_list([
        {
            'cpu': 1,
            'mem': '1GB',
            'disk': '25GB',
            'additional_disks': {'/dev/sdb': '100GB'},
            'num_nodes': 4,
        },
        {
            'cpu': 1,
            'mem': '3GB',
            'disk': '15GB',
            'num_nodes': 2,
        },
        {
            'cpu': 1,
            'mem': '1GB',
            'disk': '25GB',
            'num_nodes': 1,
        },
    ])

    def __init__(self, test_context):
        """
        Set up the test with four storage nodes.

        :param test_context: Test context handed over to the parent class
        """
        super(ScalabilityTest, self).__init__(test_context=test_context,
                                              num_storage_nodes=4)

    @cluster(cluster_spec=MIN_CLUSTER_SPEC)
    @parametrize(num_active_partitions=1, txn_per_client=500, num_clients=1, interval=100, timeout=240)
    def test_scale_up_replica(self, num_active_partitions, txn_per_client, num_clients, interval, timeout):
        """
        Add the storage node at index 3 as a new replica, recovering it from
        the storage node at index 0, while a client produces transactions.

        :param num_active_partitions: Number of active partitions
        :param txn_per_client: Number of transactions per client
        :param num_clients: Number of total clients
        :param interval: Average interval(millisecond) between transactions
        :param timeout: Test timeout
        """
        source_idx = 0
        new_replica_idx = 3

        # The new replica must not receive appends until its offline
        # recovery has completed, so it is ignored at start-up.
        self.set_storage_nodes_to_ignore([new_replica_idx])

        def validate():
            return self.scale_up_replica(
                src_node_idx=source_idx,
                added_node_idx=new_replica_idx,
                num_active_partitions=num_active_partitions,
                txn_per_client=txn_per_client,
                num_clients=num_clients,
                interval=interval,
                timeout=timeout)

        self.run_produce_consume_validate(validate)

    def scale_up_replica(self, src_node_idx, added_node_idx, num_active_partitions,
                         txn_per_client, num_clients, interval, timeout):
        """
        A validate function to test scaling up replica for given partition.

        One partition is picked at random. The new replica is recovered
        offline from the source replica, brought online, registered in
        ZooKeeper, and finally checked to have received every transaction
        the client produced. Failed checks raise AssertionError.

        :param src_node_idx: The index of source node, where new replica recovers from
        :param added_node_idx: The index of replica to add
        :param num_active_partitions: Number of active partitions
        :param txn_per_client: Number of transactions per client
        :param num_clients: Number of total clients
        :param interval: Average interval(millisecond) between transactions
        :param timeout: Test timeout
        """
        storage_port = self.waltz_storage.port
        storage_admin_port = self.waltz_storage.admin_port

        source_hostname = self.waltz_storage.nodes[src_node_idx].account.ssh_hostname
        source_admin_host = self.get_host(source_hostname, storage_admin_port)

        new_hostname = self.waltz_storage.nodes[added_node_idx].account.ssh_hostname
        new_admin_host = self.get_host(new_hostname, storage_admin_port)

        target_partition = randrange(num_active_partitions)

        # 1. Start producing against the current cluster and wait until the
        #    source replica has stored at least one transaction.
        produce_cmd = self.client_cli.validate_txn_cmd(
            num_active_partitions, txn_per_client, num_clients, interval)
        self.verifiable_client.start(produce_cmd)

        def source_has_transactions():
            return self.is_max_transaction_id_updated(
                source_admin_host, storage_port, target_partition, -1)

        wait_until(source_has_transactions, timeout_sec=timeout)

        # 2. Trigger a recovery so that the source replica's low watermark
        #    is updated, then read that watermark.
        self.trigger_recovery(bounce_node_idx=src_node_idx)
        wait_until(self.is_triggered_recovery_completed, timeout_sec=timeout)
        source_low_watermark = self.get_storage_local_low_watermark(
            self.get_host(source_hostname, storage_admin_port),
            target_partition)

        # 3. Give the (still empty) new replica the partition.
        self.storage_add_partition(storage=new_admin_host,
                                   partition=target_partition)

        # 4. Keep the new replica offline for reads and writes.
        self.storage_set_availability(storage=new_admin_host,
                                      partition=target_partition,
                                      online=False)

        # 5. Recover the partition onto the new replica. The source
        #    replica's low watermark is the recovery target.
        self.storage_recover_partition(source_storage=source_admin_host,
                                       destination_storage=new_admin_host,
                                       destination_storage_port=storage_port,
                                       partition=target_partition,
                                       batch_size=20)

        # 6. The new replica must now be caught up with the source replica.
        recovered_max_txn_id = self.get_storage_max_transaction_id(
            self.get_host(new_hostname, storage_admin_port),
            storage_port, target_partition, True)
        assert source_low_watermark == recovered_max_txn_id, (
            "Partition recovery failed on storage {}, expected max transaction ID = {}, actual max transaction ID = {}"
            .format(new_hostname, source_low_watermark, recovered_max_txn_id))

        # 7. Bring the new replica online for reads and writes.
        self.storage_set_availability(storage=new_admin_host,
                                      partition=target_partition,
                                      online=True)

        # 8. Register the new replica in ZooKeeper. Note that this host
        #    string is built with the storage port, not the admin port.
        new_storage_host = self.get_host(new_hostname, storage_port)
        self.zk_add_storage_node(storage=new_storage_host,
                                 storage_admin_port=storage_admin_port,
                                 group=added_node_idx)
        self.zk_assign_partition(storage=new_storage_host,
                                 partition=target_partition)

        # 9. Let the client finish producing with the new replica in place.
        def client_finished():
            return self.verifiable_client.task_complete() == True

        wait_until(
            client_finished,
            timeout_sec=timeout,
            err_msg="verifiable_client failed to complete task in %d seconds." % timeout)

        # 10. Every produced transaction must have reached the new replica.
        expected_max_transaction_id = num_clients * txn_per_client - 1
        final_max_txn_id = self.get_storage_max_transaction_id(
            self.get_host(new_hostname, storage_admin_port),
            storage_port, target_partition)
        assert final_max_txn_id == expected_max_transaction_id, (
            "New transactions failed to reach new replica, expected max transaction ID = {}, actual max transaction ID = {}"
            .format(expected_max_transaction_id, final_max_txn_id))