Example #1
class BenchmarkTest(ProduceConsumeValidateTest):
    """
    A benchmark of Waltz producer/consumer performance.
    """
    MIN_CLUSTER_SPEC = ClusterSpec.from_list([
        {'cpu':1, 'mem':'1GB', 'disk':'25GB', 'additional_disks':{'/dev/sdb':'100GB'}, 'num_nodes':3},
        {'cpu':1, 'mem':'3GB', 'disk':'15GB', 'num_nodes':2},
        {'cpu':1, 'mem':'1GB', 'disk':'25GB', 'num_nodes':1}])

    def __init__(self, test_context):
        super(BenchmarkTest, self).__init__(test_context=test_context)

    @cluster(cluster_spec=MIN_CLUSTER_SPEC)
    @parametrize(txn_size=512, txn_per_thread=1000, num_thread=100, interval=10, lock_pool_size=0, num_active_partitions=1, timeout=360)
    @parametrize(txn_size=512, txn_per_thread=1000, num_thread=100, interval=20, lock_pool_size=0, num_active_partitions=1, timeout=360)
    @parametrize(txn_size=512, txn_per_thread=2000, num_thread=50, interval=10, lock_pool_size=0, num_active_partitions=1, timeout=360)
    @parametrize(txn_size=1024, txn_per_thread=1000, num_thread=100, interval=10, lock_pool_size=0, num_active_partitions=1, timeout=360)
    @parametrize(txn_size=512, txn_per_thread=100, num_thread=100, interval=10, lock_pool_size=64, num_active_partitions=1, timeout=360)
    @parametrize(txn_size=512, txn_per_thread=100, num_thread=100, interval=10, lock_pool_size=128, num_active_partitions=1, timeout=360)
    @parametrize(txn_size=512, txn_per_thread=100, num_thread=100, interval=10, lock_pool_size=128, num_active_partitions=2, timeout=360)
    def test_producer_performance(self, txn_size, txn_per_thread, num_thread, interval, lock_pool_size, num_active_partitions, timeout):
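        # Build the producer benchmark command, run it under the produce/consume/validate
        # harness, and parse the performance report out of the captured output.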
        test_cmd = self.performance_cli.producer_test_cmd(self.log_file_path, txn_size, txn_per_thread, num_thread,
                                                          interval, lock_pool_size, num_active_partitions)
        test_output = self.run_produce_consume_validate(lambda: self.simple_validation_func(test_cmd, timeout))
        self.print_producer_performance(test_output)

    @cluster(cluster_spec=MIN_CLUSTER_SPEC)
    @parametrize(txn_size=512, num_txn=100000, num_active_partitions=1, timeout=360)
    @parametrize(txn_size=512, num_txn=100000, num_active_partitions=4, timeout=360)
    @parametrize(txn_size=1024, num_txn=100000, num_active_partitions=1, timeout=360)
    def test_consumer_performance(self, txn_size, num_txn, num_active_partitions, timeout):
        test_cmd = self.performance_cli.consumer_test_cmd(self.log_file_path, txn_size, num_txn, num_active_partitions)
        test_output = self.run_produce_consume_validate(lambda: self.simple_validation_func(test_cmd, timeout))
        self.print_consumer_performance(test_output)

    def print_producer_performance(self, test_output):
        performance = search(r".*transactions(.|\n)*MilliSec/Transaction.*", test_output).group(0)
        print("\n####################### PRODUCER PERFORMANCE REPORT #######################\n" + \
              "\n{performance}\n".format(performance=performance) + \
              "\n###########################################################################\n")

    def print_consumer_performance(self, test_output):
        performance = search(r".*transactions(.|\n)*MB/sec.*", test_output).group(0)
        print("\n####################### CONSUMER PERFORMANCE REPORT #######################\n" + \
              "\n{performance}\n".format(performance=performance) + \
              "\n###########################################################################\n")
Example #2
def check_from_list(self):
    empty = ClusterSpec.empty()
    assert "[]" == str(empty)
    node_specs_dict_list = [{
        'cpu': 2,
        'mem': '2GB',
        'disk': '20GB',
        'num_nodes': 2
    }, {
        'cpu': 4,
        'mem': '4GB',
        'disk': '40GB',
        'num_nodes': 4
    }]
    custom_linux_2 = ClusterSpec.from_list(node_specs_dict_list)
    assert '[{"additional_disks(GB)": {}, "cpu": 2, "disk(GB)": 20.0, "mem(GB)": 2.0, "num_nodes": 2, "os": "linux"},' \
           ' {"additional_disks(GB)": {}, "cpu": 4, "disk(GB)": 40.0, "mem(GB)": 4.0, "num_nodes": 4, "os": "linux"}]' \
           == str(custom_linux_2)
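The MIN_CLUSTER_SPEC used throughout the other examples is built with the same API. A small sketch follows, assuming ducktape's ClusterSpec (ducktape.cluster.cluster_spec) and the same ducktape build used by the Waltz tests (the additional_disks key is accepted there); the role comments are an assumption inferred from how each node group is used in those tests, and the defaults (os, additional_disks) are filled in exactly as the assertion above shows.

from ducktape.cluster.cluster_spec import ClusterSpec

# Same shape as MIN_CLUSTER_SPEC in the other examples; role comments are assumptions.
spec = ClusterSpec.from_list([
    {'cpu': 1, 'mem': '1GB', 'disk': '25GB',
     'additional_disks': {'/dev/sdb': '100GB'}, 'num_nodes': 3},  # storage nodes
    {'cpu': 1, 'mem': '3GB', 'disk': '15GB', 'num_nodes': 2},     # server nodes
    {'cpu': 1, 'mem': '1GB', 'disk': '25GB', 'num_nodes': 1},     # client node
])
print(spec)  # renders each node group as a JSON-like dict, as check_from_list asserts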
Example #3
class RecoveryTest(ProduceConsumeValidateTest):
    """
    Test Waltz recovery by running offline recovery with CLI tools,
    including recovering dirty replicas, bringing an offline replica
    back online, and so on.
    """
    MIN_CLUSTER_SPEC = ClusterSpec.from_list([{
        'cpu': 1,
        'mem': '1GB',
        'disk': '25GB',
        'additional_disks': {
            '/dev/sdb': '100GB'
        },
        'num_nodes': 3
    }, {
        'cpu': 1,
        'mem': '3GB',
        'disk': '15GB',
        'num_nodes': 2
    }, {
        'cpu': 1,
        'mem': '1GB',
        'disk': '25GB',
        'num_nodes': 1
    }])

    def __init__(self, test_context):
        super(RecoveryTest, self).__init__(test_context=test_context)

    @cluster(cluster_spec=MIN_CLUSTER_SPEC)
    @parametrize(num_active_partitions=1,
                 txn_per_client=250,
                 num_clients=1,
                 interval=100,
                 timeout=240)
    def test_recover_dirty_replica(self, num_active_partitions, txn_per_client,
                                   num_clients, interval, timeout):
        src_replica_idx = 0
        dst_replica_idx = 2
        self.run_produce_consume_validate(lambda: self.recover_dirty_replica(
            src_replica_idx, dst_replica_idx, num_active_partitions,
            txn_per_client, num_clients, interval, timeout))

    @cluster(cluster_spec=MIN_CLUSTER_SPEC)
    @parametrize(num_active_partitions=1,
                 txn_per_client=250,
                 num_clients=1,
                 interval=100,
                 timeout=240)
    def test_bring_replica_back_online(self, num_active_partitions,
                                       txn_per_client, num_clients, interval,
                                       timeout):
        offline_replica_idx = 0

        self.run_produce_consume_validate(
            lambda: self.bring_replica_back_online(
                offline_replica_idx, num_active_partitions, txn_per_client,
                num_clients, interval, timeout))

    def recover_dirty_replica(self, src_replica_idx, dst_replica_idx,
                              num_active_partitions, txn_per_client,
                              num_clients, interval, timeout):
        """
        A validate function to test offline recovery of a dirty replica.

        :param src_replica_idx: The index of the source replica that the dirty replica recovers from
        :param dst_replica_idx: The index of the destination (dirty) replica
        :param num_active_partitions: Number of active partitions
        :param txn_per_client: Number of transactions per client
        :param num_clients: Number of total clients
        :param interval: Average interval (milliseconds) between transactions
        :param timeout: Test timeout
        """
        port = self.waltz_storage.port
        admin_port = self.waltz_storage.admin_port
        src_node = self.waltz_storage.nodes[src_replica_idx]
        src_node_hostname = src_node.account.ssh_hostname
        src_storage = self.get_host(src_node_hostname, admin_port)
        dst_node = self.waltz_storage.nodes[dst_replica_idx]
        dst_node_hostname = dst_node.account.ssh_hostname
        dst_storage = self.get_host(dst_node_hostname, admin_port)
        partition = randrange(num_active_partitions)

        # Step 1: Submit transactions to all replicas.
        cmd = self.client_cli.validate_txn_cmd(num_active_partitions,
                                               txn_per_client, num_clients,
                                               interval)
        self.verifiable_client.start(cmd)
        wait_until(lambda: self.is_max_transaction_id_updated(
            src_storage, port, partition, -1),
                   timeout_sec=timeout)

        # Step 2: Mark destination replica offline for reads and writes
        self.storage_set_availability(storage=dst_storage,
                                      partition=partition,
                                      online=False)

        # Step 3: Trigger recovery to update source replicas' low watermark.
        self.trigger_recovery(bounce_node_idx=src_replica_idx)
        wait_until(lambda: self.is_triggered_recovery_completed(),
                   timeout_sec=timeout)
        src_node_local_low_watermark = self.get_storage_local_low_watermark(
            self.get_host(src_node_hostname, admin_port), partition)

        # Step 4: Run recovery operation on offline replica.
        # Source replica's partition low watermark will be used as target for recovery.
        self.storage_recover_partition(source_storage=src_storage,
                                       destination_storage=dst_storage,
                                       destination_storage_port=port,
                                       partition=partition,
                                       batch_size=20)

        # Step 5: Check if the destination replica catches up with the source replica.
        dst_node_max_transaction_id = self.get_storage_max_transaction_id(
            self.get_host(dst_node_hostname, admin_port), port, partition,
            True)
        assert src_node_local_low_watermark == dst_node_max_transaction_id, \
            "partition recovery failed on storage {}, expected max transaction ID = {}, actual max transaction ID = {}" \
            .format(dst_node_hostname, src_node_local_low_watermark, dst_node_max_transaction_id)

        # Step 6: Wait until validation completes.
        wait_until(
            lambda: self.verifiable_client.task_complete() == True,
            timeout_sec=timeout,
            err_msg="verifiable_client failed to complete task in %d seconds."
            % timeout)

    def bring_replica_back_online(self, offline_replica_idx,
                                  num_active_partitions, txn_per_client,
                                  num_clients, interval, timeout):
        """
        A validate function to test if a replica can successfully recover when brought back online.

        :param offline_replica_idx: The index of offline replica
        :param num_active_partitions: Number of active partitions
        :param txn_per_client: Number of transactions per client
        :param num_clients: Number of total clients
        :param interval: Average interval (milliseconds) between transactions
        :param timeout: Test timeout
        """
        admin_port = self.waltz_storage.admin_port
        node = self.waltz_storage.nodes[offline_replica_idx]
        hostname = node.account.ssh_hostname
        partition = randrange(num_active_partitions)

        # Step 1: Produce a number of transactions.
        cmd = self.client_cli.validate_txn_cmd(num_active_partitions,
                                               txn_per_client, num_clients,
                                               interval)
        self.verifiable_client.start(cmd)

        # Step 2: Mark storage node 0 offline for reads and writes.
        storage = self.get_host(hostname, admin_port)
        self.storage_set_availability(storage=storage,
                                      partition=partition,
                                      online=False)
        storage_session_id_offline = self.get_storage_session_id(
            self.get_host(hostname, admin_port), partition)

        # Step 3: Mark storage node online. Wait until recovery is completed.
        self.storage_set_availability(storage=storage,
                                      partition=partition,
                                      online=True)
        wait_until(lambda: self.is_triggered_recovery_completed(),
                   timeout_sec=timeout)

        # Step 4: Check if storage node's session ID bumps up by 1.
        storage_session_id_online = self.get_storage_session_id(
            storage, partition)
        assert storage_session_id_online == storage_session_id_offline + 1, \
               "recovery failed to complete on storage {}, expected session ID = {}, actual session ID = {}" \
               .format(hostname, storage_session_id_offline + 1, storage_session_id_online)

        # Step 5: Wait until all transactions appended.
        wait_until(
            lambda: self.verifiable_client.task_complete() == True,
            timeout_sec=timeout,
            err_msg="verifiable_client failed to complete task in %d seconds."
            % timeout)
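Every step above synchronizes on ducktape's wait_until helper, which polls a condition until it returns true or the timeout expires. A tiny standalone sketch of that polling pattern follows; the condition is a stand-in, not anything Waltz-specific.

import time
from ducktape.utils.util import wait_until

start = time.time()
# Poll the (trivial) condition repeatedly; fail with err_msg if it is still false after timeout_sec.
wait_until(lambda: time.time() - start > 2,
           timeout_sec=10,
           err_msg="condition did not become true within 10 seconds")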
Example #4
class SmokeTest(ProduceConsumeValidateTest):
    """
    A class of torture tests that brings up a set of ZooKeeper, storage, server,
    and client nodes, then fires transactions while turning components off and on
    to ensure Waltz can recover from expected failures.
    """
    MIN_CLUSTER_SPEC = ClusterSpec.from_list([{
        'cpu': 1,
        'mem': '1GB',
        'disk': '25GB',
        'additional_disks': {
            '/dev/sdb': '100GB'
        },
        'num_nodes': 3
    }, {
        'cpu': 1,
        'mem': '3GB',
        'disk': '15GB',
        'num_nodes': 2
    }, {
        'cpu': 1,
        'mem': '1GB',
        'disk': '25GB',
        'num_nodes': 1
    }])

    def __init__(self, test_context):
        super(SmokeTest, self).__init__(test_context=test_context)

    @cluster(cluster_spec=MIN_CLUSTER_SPEC)
    @parametrize(num_active_partitions=1,
                 txn_per_client=500,
                 num_clients=10,
                 interval=120,
                 timeout=240)
    @parametrize(num_active_partitions=4,
                 txn_per_client=500,
                 num_clients=10,
                 interval=120,
                 timeout=240)
    def test_produce_consume_no_torture(self, num_active_partitions,
                                        txn_per_client, num_clients, interval,
                                        timeout):
        validation_cmd = self.client_cli.validate_txn_cmd(
            num_active_partitions, txn_per_client, num_clients, interval)
        self.run_produce_consume_validate(
            lambda: self.simple_validation_func(validation_cmd, timeout))

    @cluster(cluster_spec=MIN_CLUSTER_SPEC)
    @parametrize(num_active_partitions=1,
                 txn_per_client=500,
                 num_clients=10,
                 interval=120,
                 timeout=480)
    @parametrize(num_active_partitions=4,
                 txn_per_client=500,
                 num_clients=10,
                 interval=120,
                 timeout=480)
    def test_produce_consume_while_bouncing_storage_nodes(
            self, num_active_partitions, txn_per_client, num_clients, interval,
            timeout):
        validation_cmd = self.client_cli.validate_txn_cmd(
            num_active_partitions, txn_per_client, num_clients, interval)
        validation_result = self.run_produce_consume_validate(
            lambda: self.simple_validation_func(validation_cmd, timeout),
            lambda: self._bounce_storage_nodes(3))
        assert "exception" not in validation_result.lower(
        ), "Test failed with exception:\n{}".format(validation_result)

    @cluster(cluster_spec=MIN_CLUSTER_SPEC)
    @parametrize(num_active_partitions=1,
                 txn_per_client=500,
                 num_clients=2,
                 interval=120,
                 timeout=240)
    @parametrize(num_active_partitions=4,
                 txn_per_client=500,
                 num_clients=2,
                 interval=120,
                 timeout=240)
    def test_produce_consume_while_killing_a_server_node(
            self, num_active_partitions, txn_per_client, num_clients, interval,
            timeout):
        validation_cmd = self.client_cli.validate_txn_cmd(
            num_active_partitions, txn_per_client, num_clients, interval)
        self.run_produce_consume_validate(
            lambda: self.simple_validation_func(validation_cmd, timeout),
            lambda: self._kill_a_server_node(num_active_partitions))

    def _bounce_storage_nodes(self, interval):
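        # Repeatedly bounce storage nodes every `interval` seconds until the
        # verifiable client reports its task as complete.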
        storage_node_bounce_scheduler = NodeBounceScheduler(
            service=self.waltz_storage,
            interval=interval,
            stop_condition=lambda: self.verifiable_client.task_complete())
        storage_node_bounce_scheduler.start()

    def _kill_a_server_node(self, num_active_partitions):
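        # Stop the server node that owns a randomly chosen active partition
        # (one IDLE cycle, then STOP_A_NODE) while the client keeps running.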
        node_idx = self.get_server_node_idx(randrange(num_active_partitions))
        cmd_list = [{
            "action": NodeBounceScheduler.IDLE
        }, {
            "action": NodeBounceScheduler.STOP_A_NODE,
            "node": node_idx
        }]
        server_node_bounce_scheduler = NodeBounceScheduler(
            service=self.waltz_server,
            interval=3,
            stop_condition=lambda: self.verifiable_client.task_complete(),
            iterable_cmd_list=iter(cmd_list))
        server_node_bounce_scheduler.start()
Example #5
class LoadRebalancingTest(ProduceConsumeValidateTest):
    """
        Class for tests simulating load rebalancing on server nodes
    """

    MIN_CLUSTER_SPEC = ClusterSpec.from_list([{
        'cpu': 1,
        'mem': '1GB',
        'disk': '25GB',
        'additional_disks': {
            '/dev/sdb': '100GB'
        },
        'num_nodes': 3
    }, {
        'cpu': 1,
        'mem': '3GB',
        'disk': '15GB',
        'num_nodes': 2
    }, {
        'cpu': 1,
        'mem': '1GB',
        'disk': '25GB',
        'num_nodes': 1
    }])

    def __init__(self, test_context):
        super(LoadRebalancingTest, self).__init__(test_context=test_context)
        self.server_cli = ServerCli(self.verifiable_client.nodes[0],
                                    self.client_config_path)

    @cluster(cluster_spec=MIN_CLUSTER_SPEC)
    @parametrize(num_active_partitions=1,
                 txn_per_client=150,
                 num_clients=2,
                 interval=600,
                 timeout=360)
    @parametrize(num_active_partitions=4,
                 txn_per_client=100,
                 num_clients=2,
                 interval=600,
                 timeout=360)
    def test_rebalancing_on_server_nodes(self, num_active_partitions,
                                         txn_per_client, num_clients, interval,
                                         timeout):
        validation_cmd = self.client_cli.validate_txn_cmd(
            self.log_file_path, num_active_partitions, txn_per_client,
            num_clients, interval)
        self.run_produce_consume_validate(
            lambda: self.rebalancing_on_server_nodes(
                validation_cmd, timeout, num_active_partitions, txn_per_client
                * num_clients))

    def rebalancing_on_server_nodes(self, validation_cmd, timeout,
                                    num_active_partitions,
                                    expected_number_of_transactions):
        """
        A validate function to simulate load rebalancing on server nodes using preferred-partitions.

        :param validation_cmd: Command that is passed to client node
        :param timeout: Test timeout
        :param num_active_partitions: number of active partitions
        :param expected_number_of_transactions: number of transactions expected to be stored on a waltz storage node
        """

        storage_admin_port = self.waltz_storage.admin_port
        server_port = self.waltz_server.port
        storage_port = self.waltz_storage.port
        storage_node = self.waltz_storage.nodes[randrange(
            len(self.waltz_storage.nodes))]
        random_active_partition = randrange(num_active_partitions)
        storage = self.get_host(storage_node.account.ssh_hostname,
                                storage_admin_port)

        # Step 1: Start waltz cluster and execute validation command
        self.verifiable_client.start(validation_cmd)

        # Step 2: Wait till transactions get processed
        wait_until(lambda: self.is_max_transaction_id_updated(
            storage, storage_port, random_active_partition, -1),
                   timeout_sec=timeout)

        # Step 3: Change assigned server node for random_active_partition
        partition_current_server_index = self.get_server_node_idx(
            random_active_partition)

        # Pick a different server node at random to take over the partition.
        while True:
            partition_future_server_index = randrange(
                len(self.waltz_server.nodes))
            if partition_current_server_index != partition_future_server_index:
                break

        server_node = self.waltz_server.nodes[partition_future_server_index]
        server_node_hostname = server_node.account.ssh_hostname
        server = self.get_host(server_node_hostname, server_port)
        self.server_cli.add_preferred_partition(server,
                                                random_active_partition)

        assert self.get_server_node_idx(random_active_partition) == partition_future_server_index, \
            "partition assignment for server nodes remained untouched"

        # Step 4: Wait until verifiable client ends its task
        wait_until(
            lambda: self.verifiable_client.task_complete() == True,
            timeout_sec=timeout,
            err_msg="verifiable_client failed to complete task in {} seconds.".
            format(timeout))

        # Step 5: Assert all transactions are persistently stored.
        assert expected_number_of_transactions == self.get_storage_num_of_all_transactions(storage, storage_port, num_active_partitions), \
            "number of transactions stored in storage partition does not match with all the transactions sent by producers. " \
            "Client {}, Storage = {}" \
                .format(expected_number_of_transactions, self.get_storage_num_of_all_transactions(storage, storage_port, num_active_partitions))
Example #6
class ClientValidationTest(ProduceConsumeValidateTest):
    """
    A class of waltz tests where each client (producer/consumer) in the waltz cluster runs as a separate process.
    """
    MIN_CLUSTER_SPEC = ClusterSpec.from_list([{
        'cpu': 1,
        'mem': '1GB',
        'disk': '25GB',
        'additional_disks': {
            '/dev/sdb': '100GB'
        },
        'num_nodes': 3
    }, {
        'cpu': 1,
        'mem': '3GB',
        'disk': '15GB',
        'num_nodes': 2
    }, {
        'cpu': 1,
        'mem': '1GB',
        'disk': '25GB',
        'num_nodes': 1
    }])

    def __init__(self, test_context):
        super(ClientValidationTest, self).__init__(test_context=test_context)

    @cluster(cluster_spec=MIN_CLUSTER_SPEC)
    @parametrize(num_active_partitions=1,
                 txn_per_client=75,
                 num_producers=3,
                 num_consumers=2,
                 interval=500,
                 timeout=360)
    @parametrize(num_active_partitions=4,
                 txn_per_client=100,
                 num_producers=2,
                 num_consumers=2,
                 interval=250,
                 timeout=360)
    def test_produce_consume_no_torture(self, num_active_partitions,
                                        txn_per_client, num_producers,
                                        num_consumers, interval, timeout):
        validation_cmd = self.get_produce_consume_parallel(
            num_active_partitions, txn_per_client, num_producers,
            num_consumers, interval)
        self.run_produce_consume_validate(
            lambda: self.produce_consume_no_torture(validation_cmd, timeout))

    @cluster(cluster_spec=MIN_CLUSTER_SPEC)
    @parametrize(num_active_partitions=1,
                 txn_per_client=200,
                 num_producers=3,
                 num_consumers=2,
                 interval=500,
                 num_consumers_to_stop=1,
                 delay_before_torture=40,
                 timeout=360)
    @parametrize(num_active_partitions=4,
                 txn_per_client=100,
                 num_producers=2,
                 num_consumers=4,
                 interval=500,
                 num_consumers_to_stop=3,
                 delay_before_torture=35,
                 timeout=360)
    def test_produce_consume_abrupt_stop_of_consumers(
            self, num_active_partitions, txn_per_client, num_producers,
            num_consumers, interval, num_consumers_to_stop,
            delay_before_torture, timeout):
        validation_cmd = self.get_produce_consume_parallel(
            num_active_partitions, txn_per_client, num_producers,
            num_consumers, interval)
        self.run_produce_consume_validate(
            lambda: self.produce_consume_consumer_torture(
                validation_cmd, timeout, txn_per_client * num_producers,
                num_active_partitions, num_consumers_to_stop,
                delay_before_torture, num_consumers))

    def get_num_failed_processes_cmd(self):
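        # Shell snippet appended after the backgrounded producer/consumer commands:
        # it waits for each background job and counts those that exit with a non-zero status.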
        return "fail=0; for job in `jobs -p`; do wait $job || let \"fail+=1\"; done ; echo \"number of failed processes: $fail\""

    def get_produce_consume_parallel(self, num_active_partitions,
                                     txn_per_client, num_producers,
                                     num_consumers, interval):
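        # Launch all producers and consumers as background jobs on the client node,
        # then append the failure-counting snippet so the exit statuses can be verified.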
        cmd_parallel = ""

        for i in range(num_producers):
            cmd_parallel += self.client_cli.create_producer_cmd(
                self.log_file_path, txn_per_client, interval,
                num_active_partitions)
            cmd_parallel += " & "
        for i in range(num_consumers):
            cmd_parallel += self.client_cli.create_consumer_cmd(
                self.log_file_path, txn_per_client * num_producers,
                num_active_partitions)
            cmd_parallel += " & "

        cmd_parallel += self.get_num_failed_processes_cmd()
        return cmd_parallel

    def produce_consume_no_torture(self, validation_cmd, timeout):
        """
        A validate function to test producers and consumers running in parallel.

        :param validation_cmd: Command that is passed to client node
        :param timeout: Test timeout
        """

        # Step 1: Start waltz cluster and execute validation command
        self.verifiable_client.start(validation_cmd)

        # Step 2: Wait until verifiable client ends its task
        wait_until(
            lambda: self.verifiable_client.task_complete() == True,
            timeout_sec=timeout,
            err_msg=lambda:
            "verifiable_client failed to complete task in {} seconds. Number of producers still running: {}, "
            "number of consumers still running: {}".format(
                timeout, self.number_of_running_producers(),
                self.number_of_running_consumers()))

        # Step 3: Verify that the child processes of the main process finished successfully (exit code 0)
        num_failed_processes = int(
            re.search(r'failed processes: (\d+)',
                      self.verifiable_client.get_validation_result()).group(1))
        assert num_failed_processes == 0, "number of failed processes: {}".format(
            num_failed_processes)

    def number_of_running_producers(self):
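        # Count producer client JVMs still running on the client node by grepping
        # the process table for the create-producer subcommand.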
        return int(self.verifiable_client.nodes[0].account.ssh_output(
            "ps -eo command | grep -c \"^java .* create-producer\" | cat").
                   strip())

    def number_of_running_consumers(self):
        return int(self.verifiable_client.nodes[0].account.ssh_output(
            "ps -eo command | grep -c \"^java .* create-consumer\" | cat").
                   strip())

    def produce_consume_consumer_torture(self, validation_cmd, timeout,
                                         expected_number_of_transactions,
                                         num_active_partitions,
                                         num_consumers_to_stop,
                                         delay_before_torture, num_consumers):
        """
        A validate function to test producers and consumers running in parallel with a torture test that kills consumer client nodes.

        :param validation_cmd: Command that is passed to client node
        :param timeout: Test timeout
        :param expected_number_of_transactions: The expected number of created transaction during this test
        :param num_active_partitions: Number of active partitions
        :param num_consumers_to_stop: Number of consumers to stop with kill SIGTERM command
        :param delay_before_torture: The delay in seconds between first transaction being processed and beginning of the torture test
        :param num_consumers: Total number of consumer clients
        """

        admin_port = self.waltz_storage.admin_port
        port = self.waltz_storage.port
        node = self.waltz_storage.nodes[randrange(len(
            self.waltz_storage.nodes))]
        storage = self.get_host(node.account.ssh_hostname, admin_port)
        random_active_partition = randrange(num_active_partitions)

        # Step 1: Start waltz cluster and execute validation command
        self.verifiable_client.start(validation_cmd)

        # Step 2: Wait till transactions get processed
        wait_until(lambda: self.is_max_transaction_id_updated(
            storage, port, random_active_partition, -1),
                   timeout_sec=timeout)

        # Step 3: Wait before starting the torture test
        sleep(delay_before_torture)

        # Step 4: Start torture
        self.abrupt_stop_consumer_nodes(num_consumers, num_consumers_to_stop)

        # Step 5: Wait until verifiable client ends its task
        wait_until(
            lambda: self.verifiable_client.task_complete() == True,
            timeout_sec=timeout,
            err_msg="verifiable_client failed to complete task in {} seconds.".
            format(timeout))

        # Step 6: Verify child processes of the main process finished as expected (num_consumers_to_stop of them exited with a non-zero code)
        num_failed_processes = int(
            re.search(r'failed processes: (\d+)',
                      self.verifiable_client.get_validation_result()).group(1))
        assert num_failed_processes == num_consumers_to_stop, "number of failed processes: {}, expected: {}".format(
            num_failed_processes, num_consumers_to_stop)

        # Step 7: Assert all transactions are persistently stored.
        assert expected_number_of_transactions == self.get_storage_num_of_all_transactions(storage, port, num_active_partitions), \
            "number of transactions stored in storage partition does not match with all the transactions sent by producers. " \
            "Client {}, Storage = {}" \
                .format(expected_number_of_transactions,
                        self.get_storage_num_of_all_transactions(storage, port, num_active_partitions))

    def abrupt_stop_consumer_nodes(self, num_consumers, num_consumers_to_stop):
        get_consumer_pids = "ps -ef | grep \"create-consumer\" | awk \'{{print $2}}\' | head -n -1 | sort -n"
        consumer_pids = self.verifiable_client.nodes[0].account.ssh_output(
            get_consumer_pids).strip().splitlines()

        # remove the parent-process and grep-process PIDs, i.e. the first and last entries
        consumer_process_pids = consumer_pids[1:num_consumers + 1]

        shuffle(consumer_process_pids)

        # Select the consumer pids from the beginning of the shuffled list to kill the respective consumer process.
        for i in range(num_consumers_to_stop):
            cmd = "kill -SIGTERM {}".format(consumer_process_pids[i])
            self.verifiable_client.nodes[0].account.ssh(cmd)
Example #7
class ConnectionInterruptionTest(ProduceConsumeValidateTest):
    """
        Class for tests simulating network issues affecting Waltz Server and Waltz Storage nodes
    """

    MIN_CLUSTER_SPEC = ClusterSpec.from_list([{
        'cpu': 1,
        'mem': '1GB',
        'disk': '25GB',
        'additional_disks': {
            '/dev/sdb': '100GB'
        },
        'num_nodes': 3
    }, {
        'cpu': 1,
        'mem': '3GB',
        'disk': '15GB',
        'num_nodes': 2
    }, {
        'cpu': 1,
        'mem': '1GB',
        'disk': '25GB',
        'num_nodes': 1
    }])

    def __init__(self, test_context):
        super(ConnectionInterruptionTest,
              self).__init__(test_context=test_context)

    @cluster(cluster_spec=MIN_CLUSTER_SPEC)
    @parametrize(num_active_partitions=1,
                 txn_per_client=150,
                 num_clients=2,
                 interval=600,
                 timeout=360,
                 interrupt_duration=10,
                 num_interruptions=3,
                 delay_between_interruptions=25)
    @parametrize(num_active_partitions=4,
                 txn_per_client=100,
                 num_clients=2,
                 interval=1000,
                 timeout=300,
                 interrupt_duration=20,
                 num_interruptions=1,
                 delay_between_interruptions=20)
    def test_client_server_network_interruption(self, num_active_partitions,
                                                txn_per_client, num_clients,
                                                interval, timeout,
                                                interrupt_duration,
                                                num_interruptions,
                                                delay_between_interruptions):
        validation_cmd = self.client_cli.validate_txn_cmd(
            self.log_file_path, num_active_partitions, txn_per_client,
            num_clients, interval)
        self.run_produce_consume_validate(
            lambda: self.client_server_network_interruption(
                validation_cmd, timeout, interrupt_duration, num_interruptions,
                delay_between_interruptions, num_active_partitions, interval /
                1000))

    @cluster(cluster_spec=MIN_CLUSTER_SPEC)
    @parametrize(num_active_partitions=1,
                 txn_per_client=200,
                 num_clients=1,
                 interval=100,
                 timeout=240,
                 interrupt_duration=5,
                 num_of_nodes_to_bounce=2)
    @parametrize(num_active_partitions=4,
                 txn_per_client=200,
                 num_clients=2,
                 interval=100,
                 timeout=240,
                 interrupt_duration=5,
                 num_of_nodes_to_bounce=2)
    @parametrize(num_active_partitions=1,
                 txn_per_client=200,
                 num_clients=1,
                 interval=100,
                 timeout=240,
                 interrupt_duration=5,
                 num_of_nodes_to_bounce=1)
    @parametrize(num_active_partitions=4,
                 txn_per_client=200,
                 num_clients=2,
                 interval=100,
                 timeout=300,
                 interrupt_duration=5,
                 num_of_nodes_to_bounce=1)
    def test_storage_node_network_interruption(self, num_active_partitions,
                                               txn_per_client, num_clients,
                                               interval, timeout,
                                               interrupt_duration,
                                               num_of_nodes_to_bounce):
        validation_cmd = self.client_cli.validate_txn_cmd(
            self.log_file_path, num_active_partitions, txn_per_client,
            num_clients, interval)
        self.run_produce_consume_validate(
            lambda: self.storage_node_network_interruption(
                validation_cmd, num_active_partitions, txn_per_client,
                num_clients, timeout, interrupt_duration,
                num_of_nodes_to_bounce))

    def drop_traffic_to_port(self, node, port):
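        # Insert an iptables rule that drops inbound TCP traffic to the given port;
        # enable_traffic_to_port removes the same rule afterwards.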
        node.account.ssh_capture(
            "sudo iptables -I INPUT -p tcp --destination-port {} -j DROP".
            format(port))

    def enable_traffic_to_port(self, node, port):
        node.account.ssh_capture(
            "sudo iptables -D INPUT -p tcp --destination-port {} -j DROP".
            format(port))

    def client_server_network_interruption(self, validation_cmd, timeout,
                                           interrupt_duration,
                                           num_interruptions,
                                           delay_between_interruptions,
                                           num_active_partitions,
                                           processing_duration):
        """
        Set up waltz and interrupt network between a waltz client node and a server node.

        :param validation_cmd: The command that is sent to ClientCli
        :param timeout: Test timeout
        :param interrupt_duration: Interval in seconds during which the client won't be able to connect to the server
        :param num_interruptions: Number of connection interruption cycles
        :param delay_between_interruptions: Interval in seconds between network interruptions
        :param num_active_partitions: Number of active partitions
        :param processing_duration: Time in seconds within which it is safe to assume that processed transactions get stored
        """

        partition = randrange(num_active_partitions)
        node_idx = self.get_server_node_idx(partition)

        # Start waltz cluster and wait until a storage node registers first transaction
        self.verifiable_client.start(validation_cmd)
        admin_port = self.waltz_storage.admin_port
        port = self.waltz_storage.port
        storage = self.get_host(
            self.waltz_storage.nodes[0].account.ssh_hostname, admin_port)
        wait_until(lambda: self.is_max_transaction_id_updated(
            storage, port, partition, -1),
                   timeout_sec=timeout)

        node = self.waltz_server.nodes[node_idx]
        for interruption in range(num_interruptions):
            sleep(delay_between_interruptions)
            try:
                # disable connection on port
                self.drop_traffic_to_port(node, self.waltz_server.port)

                # verify that during the network interruption the number of stored transactions didn't increase;
                # sleep to ensure that transactions from the server are propagated to storage nodes
                sleep(processing_duration)
                cur_high_watermark = self.get_storage_max_transaction_id(
                    storage, port, partition)
                sleep(max(interrupt_duration - processing_duration, 0))
                assert not self.is_max_transaction_id_updated(storage, port, partition, cur_high_watermark), \
                    'Network interruption failed, newly stored transactions detected'

            finally:
                """
                delete the added iptables rule, as it is not removed when the Waltz process ends and could
                persist on the waltz-server VM if the process is interrupted (^C) or an exception is thrown in the try block.
                """
                self.enable_traffic_to_port(node, self.waltz_server.port)

        wait_until(
            lambda: self.verifiable_client.task_complete() == True,
            timeout_sec=timeout,
            err_msg="verifiable_client failed to complete task in %d seconds."
            % timeout)

    class StorageNodeInfo:
        """
        Representation of a storage node and number of transactions stored on a storage node
        """
        def __init__(self, node):
            self.node = node
            self.total_num_of_transactions = 0

    def storage_node_network_interruption(self, validation_cmd,
                                          num_active_partitions,
                                          txn_per_client, num_clients, timeout,
                                          interrupt_duration,
                                          num_of_nodes_to_bounce):
        """
        A validate function to test bouncing network connection between server and storage. Verification of correctness
        is done by comparing expected number of stored transactions with current transactions in the Waltz Cluster.

        :param validation_cmd: The command that is send to ClientCli
        :param num_active_partitions: Number of active partitions
        :param txn_per_client: Number of transactions per client
        :param num_clients: Number of waltz clients
        :param timeout: Test timeout
        :param interrupt_duration: Duration (milliseconds) of communication interruption between server and storage
        :param num_of_nodes_to_bounce: Number of storage nodes to bounce. This may affect the quorum
        """

        bounced_nodes = []
        for node_number in sample(range(len(self.waltz_storage.nodes)),
                                  num_of_nodes_to_bounce):
            bounced_nodes.append(
                self.StorageNodeInfo(self.waltz_storage.nodes[node_number]))

        admin_port = self.waltz_storage.admin_port
        port = self.waltz_storage.port

        # Step 1: Get current sum of transactions across partitions
        for bounced_node_info in bounced_nodes:
            for partition in range(num_active_partitions):
                bounced_node_info.total_num_of_transactions += max(
                    0,
                    self.get_storage_max_transaction_id(
                        self.get_host(
                            bounced_node_info.node.account.ssh_hostname,
                            admin_port), port, partition) + 1)

        # Step 2: Submit transactions to all replicas.
        self.verifiable_client.start(validation_cmd)
        wait_until(lambda: self.is_max_transaction_id_updated(
            self.get_host(bounced_nodes[0].node.account.
                          ssh_hostname, admin_port), port,
            randrange(num_active_partitions), -1),
                   timeout_sec=timeout)
        try:
            # Step 3: Interrupt connection
            for bounced_node_info in bounced_nodes:
                self.drop_traffic_to_port(bounced_node_info.node, port)

            # Step 4: Verify that storage port is closed
            for bounced_node_info in bounced_nodes:
                partition = randrange(num_active_partitions)

                # RemoteCommandError raised when get_storage_max_transaction_id request fails
                # because connection to the storage node port is blocked
                try:
                    self.get_storage_max_transaction_id(
                        self.get_host(
                            bounced_node_info.node.account.ssh_hostname,
                            admin_port), port, partition)
                    raise AssertionError(
                        "Network interruption failed. get_storage_max_transaction_id didn't return RemoteCommandError"
                    )
                except RemoteCommandError:
                    pass

            sleep(interrupt_duration)
        finally:
            # Step 5: Re-enable the connection. Do this even when Step 4 fails, as the added iptables
            # rules aren't removed from the VM when this process ends
            for bounced_node_info in bounced_nodes:
                self.enable_traffic_to_port(bounced_node_info.node, port)

        # Step 6: Verify that total number of expected transactions matches number of transactions stored in waltz storage nodes
        for bounced_node_info in bounced_nodes:
            expected_number_of_transactions = (
                txn_per_client *
                num_clients) + bounced_node_info.total_num_of_transactions
            storage = self.get_host(
                bounced_node_info.node.account.ssh_hostname, admin_port)
            wait_until(lambda: expected_number_of_transactions == self.get_storage_num_of_all_transactions(storage, port, num_active_partitions),
                       timeout_sec=timeout, err_msg="number of transactions stored in storage partition does not match with all the transactions sent by client. "
                                             "Client {}, Strage = {} after {} seconds" \
                       .format(expected_number_of_transactions, self.get_storage_num_of_all_transactions(storage, port, num_active_partitions), timeout))
Example #8
class ScalabilityTest(ProduceConsumeValidateTest):
    """
    Test Waltz scalability by scaling up and scaling down replicas.
    """
    MIN_CLUSTER_SPEC = ClusterSpec.from_list([{
        'cpu': 1,
        'mem': '1GB',
        'disk': '25GB',
        'additional_disks': {
            '/dev/sdb': '100GB'
        },
        'num_nodes': 4
    }, {
        'cpu': 1,
        'mem': '3GB',
        'disk': '15GB',
        'num_nodes': 2
    }, {
        'cpu': 1,
        'mem': '1GB',
        'disk': '25GB',
        'num_nodes': 1
    }])

    def __init__(self, test_context):
        super(ScalabilityTest, self).__init__(test_context=test_context,
                                              num_storage_nodes=4)

    @cluster(cluster_spec=MIN_CLUSTER_SPEC)
    @parametrize(num_active_partitions=1,
                 txn_per_client=500,
                 num_clients=1,
                 interval=100,
                 timeout=240)
    def test_scale_up_replica(self, num_active_partitions, txn_per_client,
                              num_clients, interval, timeout):
        src_node_idx = 0
        added_node_idx = 3
        # new replica will not receive appends until offline recovery completes
        self.set_storage_nodes_to_ignore([added_node_idx])
        self.run_produce_consume_validate(lambda: self.scale_up_replica(
            src_node_idx, added_node_idx, num_active_partitions,
            txn_per_client, num_clients, interval, timeout))

    def scale_up_replica(self, src_node_idx, added_node_idx,
                         num_active_partitions, txn_per_client, num_clients,
                         interval, timeout):
        """
        A validate function to test scaling up a replica for a given partition.

        :param src_node_idx: The index of the source node that the new replica recovers from
        :param added_node_idx: The index of the replica to add
        :param num_active_partitions: Number of active partitions
        :param txn_per_client: Number of transactions per client
        :param num_clients: Number of total clients
        :param interval: Average interval (milliseconds) between transactions
        :param timeout: Test timeout
        :returns: Validation result
        """
        port = self.waltz_storage.port
        admin_port = self.waltz_storage.admin_port
        src_node = self.waltz_storage.nodes[src_node_idx]
        src_node_hostname = src_node.account.ssh_hostname
        src_storage = self.get_host(src_node_hostname, admin_port)
        added_node = self.waltz_storage.nodes[added_node_idx]
        added_node_hostname = added_node.account.ssh_hostname
        added_storage = self.get_host(added_node_hostname, admin_port)
        partition = randrange(num_active_partitions)

        # Step 1: Produce transactions with current cluster.
        cmd = self.client_cli.validate_txn_cmd(num_active_partitions,
                                               txn_per_client, num_clients,
                                               interval)
        self.verifiable_client.start(cmd)
        wait_until(lambda: self.is_max_transaction_id_updated(
            src_storage, port, partition, -1),
                   timeout_sec=timeout)

        # Step 2: Trigger recovery to update source replicas' low watermark.
        self.trigger_recovery(bounce_node_idx=src_node_idx)
        wait_until(lambda: self.is_triggered_recovery_completed(),
                   timeout_sec=timeout)
        src_node_local_low_watermark = self.get_storage_local_low_watermark(
            self.get_host(src_node_hostname, admin_port), partition)

        # Step 3: Add an empty replica and add partition to it.
        self.storage_add_partition(storage=added_storage, partition=partition)

        # Step 4: Mark added replica offline for reads and writes
        self.storage_set_availability(storage=added_storage,
                                      partition=partition,
                                      online=False)

        # Step 5: Run recovery operation on new replica.
        # Source replica's partition low watermark will be used as target for recovery.
        self.storage_recover_partition(source_storage=src_storage,
                                       destination_storage=added_storage,
                                       destination_storage_port=port,
                                       partition=partition,
                                       batch_size=20)

        # Step 6: Check if the new replica catches up with the source replica.
        added_node_max_transaction_id = self.get_storage_max_transaction_id(
            self.get_host(added_node_hostname, admin_port), port, partition,
            True)
        assert src_node_local_low_watermark == added_node_max_transaction_id, \
            "Partition recovery failed on storage {}, expected max transaction ID = {}, actual max transaction ID = {}" \
            .format(added_node_hostname, src_node_local_low_watermark, added_node_max_transaction_id)

        # Step 7: Mark new replica online for reads and writes
        self.storage_set_availability(storage=added_storage,
                                      partition=partition,
                                      online=True)

        # Step 8: Add new replica to the replica set in ZooKeeper
        storage = self.get_host(added_node_hostname, port)
        self.zk_add_storage_node(storage=storage,
                                 storage_admin_port=admin_port,
                                 group=added_node_idx)
        self.zk_assign_partition(storage=storage, partition=partition)

        # Step 9: Produce transactions after adding new replica.
        wait_until(
            lambda: self.verifiable_client.task_complete() == True,
            timeout_sec=timeout,
            err_msg="verifiable_client failed to complete task in %d seconds."
            % timeout)

        # Step 10: Check if new transactions can reach new replica.
        expected_max_transaction_id = -1 + num_clients * txn_per_client
        added_node_max_transaction_id = self.get_storage_max_transaction_id(
            self.get_host(added_node_hostname, admin_port), port, partition)
        assert added_node_max_transaction_id == expected_max_transaction_id, \
            "New transactions failed to reach new replica, expected max transaction ID = {}, actual max transaction ID = {}" \
            .format(expected_max_transaction_id, added_node_max_transaction_id)