Example #1
    def test_baseline_two_nodes_removed_one_added_with_loading(self):
        with PiClient(self.ignite, self.get_client_config()):
            with TransactionalLoading(self):
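                # under live transactional load: drop two baseline nodes, add one new node,
                # then reset the baseline so rebalance kicks in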
                new_server_num = len(self.ignite.get_alive_default_nodes()) - 2
                self.ignite.kill_node(2)
                self.ignite.kill_node(3)

                self.ignite.wait_for_topology_snapshot(
                    server_num=new_server_num)

                self.ignite.add_additional_nodes(self.get_server_config(), 1)
                self.ignite.start_additional_nodes(
                    self.ignite.get_all_additional_nodes())

                self._set_baseline_few_times()

        self.ignite.wait_for_topology_snapshot(client_num=0)

        util_sleep_for_a_while(self.rebalance_timeout)

        self.util_verify(save_lfs_on_exception=True)

        tiden_assert_equal(
            0, self.ignite.find_exception_in_logs('java.lang.AssertionError'),
            "No AssertionError in logs")

    def test_baseline_adding_one_node_with_loading(self):
        with PiClient(self.ignite, self.get_client_config()):
            with TransactionalLoading(self,
                                      loading_profile=LoadingProfile(delay=1,
                                                                     transaction_timeout=50),
                                      skip_consistency_check=True,
                                      tx_metrics=['txCreated', 'txCommit', 'txRollback', 'txFailed']) as tx_loading:
                self.ignite.add_additional_nodes(self.get_server_config(), 1)
                self._sleep_and_custom_event(tx_loading, 'start nodes')
                self.ignite.start_additional_nodes(self.ignite.get_all_additional_nodes())

                self._sleep_and_custom_event(tx_loading, 'set blt')
                self._set_baseline_few_times()

                self._sleep_and_custom_event(tx_loading, 'sleep')
                self._sleep_and_custom_event(tx_loading, 'end loading')

                metrics = tx_loading.metrics

        self.ignite.wait_for_topology_snapshot(client_num=0)

        self.create_loading_metrics_graph('test_baseline_adding_one_node_with_loading',
                                          metrics)

        util_sleep_for_a_while(self.rebalance_timeout)

        self.cu.control_utility('--cache', 'idle_verify')
        
        tiden_assert_equal(0, self.ignite.find_exception_in_logs('java.lang.AssertionError'),
                           "No AssertionError in logs"
                           )

    def zookeeper_fail_test(self, scenario, expecting_broken_cluster=False):
        node_connection = self.get_server_connections()
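        # break ZooKeeper per the given scenario while a snapshot runs and a node restarts;
        # if the cluster is expected to break, every node should end up segmented (dead)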
        try:
            with PiClient(self.ignite, self.get_client_config()):
                with TransactionalLoading(self, skip_consistency_check=True):
                    util_sleep_for_a_while(10, msg='Wait until load started')
                    self.zookeeper_fail_scenario(scenario)
                    self.su.snapshot_utility('SNAPSHOT', '-type=full')
                    self.ignite.kill_node(2)
                    util_sleep_for_a_while(60,
                                           msg='Wait after zookeeper issue')

            self.ignite.start_node(2)

            for node_id in node_connection.keys():
                tiden_assert(self.ignite.check_node_is_alive(node_id),
                             "Node {} is expected to be alive".format(node_id))
            if expecting_broken_cluster:
                tiden_assert(
                    False,
                    'splitting all zookeeper hosts is expected to break the cluster')

        except Exception as e:
            if expecting_broken_cluster:
                util_sleep_for_a_while(60, msg='Wait all node segmented')
                for node_id in node_connection.keys():
                    tiden_assert(
                        not self.ignite.check_node_is_alive(node_id),
                        "Node {} is expected to be dead".format(node_id))
            else:
                raise e

    def test_baseline_removing_two_nodes_with_loading(self):
        with PiClient(self.ignite, self.get_client_config()):
            with TransactionalLoading(self,
                                      loading_profile=LoadingProfile(delay=1,
                                                                     transaction_timeout=50),
                                      tx_metrics=['txCreated', 'txCommit', 'txRollback', 'txFailed']) as tx_loading:
                new_server_num = len(self.ignite.get_alive_default_nodes()) - 2
                self._sleep_and_custom_event(tx_loading, 'kill nodes')
                self.ignite.kill_node(2)
                self.ignite.kill_node(3)

                self.ignite.wait_for_topology_snapshot(server_num=new_server_num)

                self._sleep_and_custom_event(tx_loading, 'set blt')
                self._set_baseline_few_times()

                self._sleep_and_custom_event(tx_loading, 'sleep')
                self._sleep_and_custom_event(tx_loading, 'end loading')

                metrics = tx_loading.metrics

        self.ignite.wait_for_topology_snapshot(client_num=0)

        self.create_loading_metrics_graph('test_baseline_removing_two_nodes_with_loading',
                                          metrics)

        util_sleep_for_a_while(self.rebalance_timeout)

        self.cu.control_utility('--cache', 'idle_verify')
        
        tiden_assert_equal(0, self.ignite.find_exception_in_logs('java.lang.AssertionError'),
                           "No AssertionError in logs"
                           )
Example #5
    def test_baseline_restart_node_add_one_additional_node(self):
        with PiClient(self.ignite, self.get_client_config()):
            with TransactionalLoading(self):
                self.ignite.kill_node(2)

                self.delete_lfs(node_ids=[
                    2,
                ])

                self.ignite.start_node(2)

                util_sleep_for_a_while(5)

                new_nodes = self.ignite.add_additional_nodes(
                    self.get_server_config(), 1)
                self.ignite.start_additional_nodes(new_nodes)

                # self.cu.kill_transactions()
                self._set_baseline_few_times(5)

        self.util_verify(save_lfs_on_exception=True)

        tiden_assert_equal(
            0, self.ignite.find_exception_in_logs('java.lang.AssertionError'),
            "No AssertionError in logs")

    def test_cluster_stress_tolerance(self, node_under_test, other_node, fault_combination):
        timeout = 15
        thread_timeout = 10
        take_a_rest_timeout = 10
        host_under_test = node_under_test.get('host')
        other_host = other_node.get('host')

        stress = StressT(self.ssh)

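        # for every enabled fault (disk/network/cpu/ram load, SIGSTOP, packet issues),
        # apply it to the node under test while transactional load keeps running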
        with PiClient(self.ignite, self.get_client_config()):
            with TransactionalLoading(self,
                                      loading_profile=LoadingProfile(delay=1,
                                                                     commit_possibility=1.0,
                                                                     transaction_timeout=5000),
                                      tx_metrics=['txCreated', 'txCommit', 'txRollback', 'txFailed']) as tx_loading:
                util_sleep_for_a_while(take_a_rest_timeout, msg='Loading warm up for')
                self._custom_event(tx_loading, 'start')
                util_sleep_for_a_while(take_a_rest_timeout)
                for key, value in fault_combination.items():
                    if value:
                        self._sleep_and_custom_event(tx_loading, '%s start' % key)
                        print_red('%s start' % key)
                        if key == 'disc load' and value:
                            stress.load_disk(node_under_test['ignite_home'], host_under_test, timeout=timeout)
                        if key == 'network load' and value:
                            stress.load_network(host_under_test, other_host, timeout=timeout)
                        if key == 'cpu load' and value:
                            stress.load_cpu(host_under_test, timeout=timeout)
                        if key == 'ram load' and value:
                            stress.load_ram(host_under_test, timeout=timeout)
                        if key in ['sigstop_server', 'sigstop_client'] and value:
                            if key == 'sigstop_server':
                                pid = stress.get_random_server_pid(host_under_test)
                            else:
                                pid = stress.get_random_client_pid(host_under_test)

                            stress.sigstop_process(host_under_test, pid, timeout=thread_timeout)
                        if key in ['packets loss', 'packets duplicate', 'packets corrupt'] and value:
                            stress.network_emulate_packet(host_under_test, other_host, lost_rate='5.0%',
                                                          timeout=timeout, type=key.split()[-1])
                        self._custom_event(tx_loading, ' ')
                        print_red('%s stop' % key)
                        # util_sleep_for_a_while(take_a_rest_timeout, msg='Rest between tests for')
                util_sleep_for_a_while(take_a_rest_timeout)
                self._custom_event(tx_loading, 'end')
                metrics = tx_loading.metrics

        self.ignite.wait_for_topology_snapshot(client_num=0)
        self.create_loading_metrics_graph('test_cluster_stress_tolerance', metrics)
        self.cu.list_transactions()
        self.cu.control_utility('--cache', 'idle_verify')
        tiden_assert_equal(0, self.ignite.find_exception_in_logs('java.lang.AssertionError'),
                           "No AssertionError in logs"
                           )
Example #7
    def test_mixed_cluster_load_caches_old_server(self):
        """
        1. start mixed cluster (new version servers + old version servers)
        2. activate from new version control.sh
        3. start old version server
        4. add it to baseline
        5. smoke check:
        5.1. create dynamic caches from old server node
        5.2. do some load from old server node
        """

        self.ignite_new_version.cu.activate()
        created_caches = []
        self.server_config = Ignite.config_builder.get_config(
            'server', config_set_name='base')
        ignite = self.ignite_old_version
        with PiClient(ignite, self.server_config, nodes_num=1) as piclient:
            ignite.cu.add_node_to_baseline(
                ignite.get_node_consistent_id(piclient.node_ids[0]))

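            # create every dynamic cache configuration from the old server node
            # and stream data into each cache asynchronously (smoke check steps 5.1/5.2)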
            dynamic_caches_factory = DynamicCachesFactory()
            async_ops = []
            for method in dynamic_caches_factory.dynamic_cache_configs:
                cache_name = "cache_group_%s" % method
                log_print('Loading {}...'.format(cache_name), color='green')

                ignite = piclient.get_ignite()

                ignite.getOrCreateCache(
                    getattr(dynamic_caches_factory, method)(cache_name))

                async_operation = create_async_operation(
                    create_put_all_operation,
                    cache_name,
                    1,
                    1001,
                    10,
                    value_type=self.data_model)
                async_ops.append(async_operation)
                async_operation.evaluate()
                created_caches.append(cache_name)

            log_print('Waiting async results...', color='debug')
            # wait for streamer to complete
            for async_op in async_ops:
                async_op.getResult()

            with TransactionalLoading(MixedTestLoadingAdapter(self),
                                      config_file=self.server_config,
                                      loading_profile=LoadingProfile(
                                          delay=1,
                                          transaction_timeout=100000)):
                sleep(60)
Example #8
    def test_baseline_remove_and_back_one_node_with_additional_in_baseline(
            self):
        with PiClient(self.ignite, self.get_client_config()):
            with TransactionalLoading(self,
                                      loading_profile=LoadingProfile(
                                          run_for_seconds=30, delay=10)):
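                # grow the cluster by two additional nodes, then drop one baseline node
                # and one additional node, and later bring the baseline node back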
                current_server_nodes = self.ignite.get_nodes_num('server')

                self.start_additional_nodes(
                    self.ignite.add_additional_nodes(self.get_server_config(),
                                                     2))

                self.ignite.wait_for_topology_snapshot(
                    server_num=current_server_nodes + 2)

                log_print('Kill two nodes: one baseline and one additional')
                self.ignite.kill_node(2)
                self.ignite.kill_node(
                    self.ignite.get_alive_additional_nodes()[0])

                log_print('Remove node from baseline')
                self.ignite.wait_for_topology_snapshot(
                    server_num=current_server_nodes)

                # self._set_baseline_few_times()
                self.cu.control_utility('--baseline')
                self.cu.remove_node_from_baseline(
                    self.ignite.get_node_consistent_id(2))

                self.load_data_with_streamer(1001, 1501)

                log_print('Start node again')
                self.ignite.start_node(2)

                self.ignite.wait_for_topology_snapshot(
                    server_num=current_server_nodes + 1)

                # self._set_baseline_few_times()
                self.cu.add_node_to_baseline(
                    self.ignite.get_node_consistent_id(2))

                self.ignite.wait_for_topology_snapshot(
                    server_num=current_server_nodes + 1)

        print_red("AssertExceptions: %s" % str(
            self.ignite.find_exception_in_logs("java.lang.AssertionError")))

        self.util_verify(save_lfs_on_exception=True)

        tiden_assert_equal(
            0, self.ignite.find_exception_in_logs('java.lang.AssertionError'),
            "No AssertionError in logs")
Example #9
    def test_master_master_master_blinking_blt(self):
        self.prepare_clusters()

        client_config = self.preconfigure_cluster_0()

        iterations = 10
        last_loaded_key = START_DATA_SIZE
        nodes_before = 6

        with PiClient(self.clusters[0].grid,
                      client_config,
                      jvm_options=['-ea']) as piclient:
            PiClientIgniteUtils.load_data_with_streamer(
                self.clusters[0].grid,
                client_config,
                end_key=last_loaded_key,
                jvm_options=['-ea'],
                check_clients=False)

            sleep(60)

            with TransactionalLoading(self,
                                      ignite=self.clusters[0].grid,
                                      config_file=client_config,
                                      skip_consistency_check=True):
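                # blink node 2: kill it, reset the baseline, restart it and reset the baseline
                # again on every iteration, verifying the cluster each time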
                for i in range(0, iterations):
                    log_print(f'Current iteration {i + 1} from {iterations}',
                              color='debug')

                    self.clusters[0].grid.kill_node(2)

                    utility_baseline_log = 'control-utility-baseline.log'

                    self.clusters[0].grid.cu.set_current_topology_as_baseline(
                        background=True, log=utility_baseline_log)

                    self.clusters[0].grid.start_node(2,
                                                     skip_topology_check=True)

                    self.clusters[0].grid.wait_for_topology_snapshot(
                        server_num=6)

                    self.clusters[0].grid.update_started_node_status(2)

                    self.clusters[0].grid.cu.set_current_topology_as_baseline(
                        background=True, log=utility_baseline_log)

                    self.verify_cluster(0, nodes_before, last_loaded_key)

    def test_baseline_two_nodes_removed_one_added_with_cpu_loading(self):
        with PiClient(self.ignite, self.get_client_config()):
            with TransactionalLoading(self,
                                      loading_profile=LoadingProfile(delay=1,
                                                                     commit_possibility=0.2,
                                                                     transaction_timeout=1000),
                                      tx_metrics=['txCreated', 'txCommit', 'txRollback', 'txFailed']) as tx_loading:
                new_server_num = len(self.ignite.get_alive_default_nodes()) - 2
                self._sleep_and_custom_event(tx_loading, 'kill nodes')
                self.ignite.kill_node(2)
                self.ignite.kill_node(3)

                self.ignite.wait_for_topology_snapshot(server_num=new_server_num)

                self.ignite.add_additional_nodes(self.get_server_config(), 1)
                self._sleep_and_custom_event(tx_loading, 'add new nodes')
                self.ignite.start_additional_nodes(self.ignite.get_all_additional_nodes())

                self._sleep_and_custom_event(tx_loading, 'set blt')
                self._set_baseline_few_times()

                self._sleep_and_custom_event(tx_loading, 'sleep')

                self._sleep_and_custom_event(tx_loading, 'cpu_load')

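                # add CPU pressure on top of the topology change, presumably to check that
                # the baseline change survives a starved CPU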
                cpu_load_operation = create_cpu_load_operation(1.0, 1.0, 2)
                cpu_load_operation.evaluate()
                self._sleep_and_custom_event(tx_loading, 'cpu_load_sleep_end')
                cpu_load_operation.interrupt()

                self._sleep_and_custom_event(tx_loading, 'end loading')

                metrics = tx_loading.metrics

        self.ignite.wait_for_topology_snapshot(client_num=0)

        log_print(inspect.stack()[0].function)
        self.create_loading_metrics_graph('test_baseline_two_nodes_removed_one_added_with_cpu_loading',
                                          metrics)

        util_sleep_for_a_while(self.rebalance_timeout)

        self.cu.control_utility('--cache', 'idle_verify')
        
        tiden_assert_equal(0, self.ignite.find_exception_in_logs('java.lang.AssertionError'),
                           "No AssertionError in logs"
                           )
Example #11
    def make_data_loading(self, duration, role='master', func_on_load=None):
        app = self.ignite_master_app if role == 'master' else self.ignite_replica_app
        with PiClient(app, self.get_client_config(role), jvm_options=self.get_dr_jvm_options(role),
                      cu=app.cu) as piclient:
            with TransactionalLoading(piclient, ignite=app, skip_consistency_check=True,
                                      cross_cache_batch=100, skip_atomic=True,
                                      config_file=self.get_client_config(role),
                                      wait_before_kill=False,
                                      loading_profile=LoadingProfile(delay=1,
                                                                     start_key=0,
                                                                     end_key=100,
                                                                     transaction_timeout=500,
                                                                     run_for_seconds=duration + 10),
                                      tx_metrics=['txCreated', 'txCommit', 'txRollback', 'txFailed']) as tx_loading:
                sleep(20)
                # trigger the requested replication action while the load keeps running
                if func_on_load == 'switch':
                    self.ignite_master_app.ru.replication_utility('switch')
                elif func_on_load == 'bootstrap':
                    self.ignite_master_app.ru.replication_utility(
                        'bootstrap',
                        '-role=master -archive=ZIP -single_copy -parallelism=4 -snapshot_folder=%s/snapshot' % self.dr_storage,
                        timeout=1200)
                    self.ignite_replica_app.ru.replication_utility(
                        'bootstrap',
                        '-role=replica -snapshot_folder=%s/snapshot -snapshot_id=%s' % (
                            self.dr_storage,
                            self.ignite_master_app.ru.get_session_id_from_bootstrap_command()),
                        timeout=1200)
                elif func_on_load == 'restart_on_load':
                    self.ignite_replica_app.ru.replication_utility('pause')
                    sleep(10)
                    self.restart_ignite_grid('replica')
                    sleep(10)
                    self.ignite_replica_app.ru.replication_utility('resume')
                elif func_on_load == 'pitr':
                    cache = piclient.get_ignite().getOrCreateCache(
                        'com.sbt.bm.ucp.published.api.retail.PublishedIndividual')
                    cache.put(10000, 1)
                    sleep(45)
                    self.ignite_replica_app.ru.replication_utility('stop', '-recovery')
                sleep(duration)
            log_print(tx_loading.metrics['txCommit'])
        app.wait_for_topology_snapshot(
            None,
            0,
            ''
        )
        log_print('Loading done')

    def test_loading_rest(self):
        with PiClient(self.ignite, self.get_client_config()):
            with TransactionalLoading(self,
                                      loading_profile=LoadingProfile(delay=1,
                                                                     transaction_timeout=5000),
                                      tx_metrics=True) as tx_loading:
                self._sleep_and_custom_event(tx_loading, 'start')
                util_sleep_for_a_while(100)
                self._sleep_and_custom_event(tx_loading, 'end')

                metrics = tx_loading.metrics

        self.ignite.wait_for_topology_snapshot(client_num=0)
        self.create_loading_metrics_graph('test_loading_rest', metrics)
        self.cu.list_transactions()
        self.cu.control_utility('--cache', 'idle_verify')

        tiden_assert_equal(0, self.ignite.find_exception_in_logs('java.lang.AssertionError'),
                           "No AssertionError in logs"
                           )
Example #13
    def test_old_cluster_load_caches_new_client(self):
        """
        1. start old version grid
        2. activate from old version control.sh
        3. start new version client
        4. smoke check:
        4.1. create dynamic caches
        4.2. do some load
        """
        created_caches = []

        self.ignite_old_version.cu.activate()
        with PiClient(self.ignite_new_version, self.client_config,
                      nodes_num=1) as piclient:
            dynamic_caches_factory = DynamicCachesFactory()
            async_ops = []
            for method in dynamic_caches_factory.dynamic_cache_configs:
                cache_name = "cache_group_%s" % method
                log_print('Loading {}...'.format(cache_name), color='green')
                piclient.get_ignite().getOrCreateCache(
                    getattr(dynamic_caches_factory, method)(cache_name))

                async_operation = create_async_operation(
                    create_put_all_operation,
                    cache_name,
                    1,
                    1001,
                    10,
                    value_type=self.data_model)
                async_ops.append(async_operation)
                async_operation.evaluate()
                created_caches.append(cache_name)

            log_print('Waiting async results...', color='debug')
            # wait for streamer to complete
            for async_op in async_ops:
                async_op.getResult()

            with TransactionalLoading(MixedTestLoadingAdapter(self),
                                      config_file=self.client_config,
                                      loading_profile=LoadingProfile(
                                          delay=1,
                                          transaction_timeout=100000)):
                sleep(60)
Example #14
    def do_snapshot_bench(self):
        ex = None
        metrics = None
        try:
            with PiClient(self.ignite.ignite_cli_load,
                          self.get_client_config()):
                with TransactionalLoading(
                        self,
                        kill_transactions_on_exit=True,
                        cross_cache_batch=self.cross_cache_batch,
                        skip_atomic=True,
                        skip_consistency_check=not self.consistency_check,
                        collect_timeout=self.collect_timeout,
                        collect_timeout_metrics_thread=self.
                        collect_timeout_metrics_thread,
                        loading_profile=LoadingProfile(
                            delay=self.tx_delay,
                            commit_possibility=0.97,
                            start_key=1,
                            end_key=self.LOAD_FACTOR - 1,
                            transaction_timeout=10000),
                        tx_metrics=[
                            'txCreated', 'txCommit', 'txRollback', 'txFailed'
                        ]) as tx_loading:
                    metrics = tx_loading.metrics

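                    # take a full snapshot under transactional load and record the
                    # exchange measurements taken around it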
                    self._prepare_before_test(tx_loading, 'snapshot')

                    self.ignite.ignite_srvs.su.snapshot_utility(
                        'SNAPSHOT', '-type=FULL')

                    self._measurements_after_test('snapshot',
                                                  skip_minor_exch=0)

            self.ignite.ignite_srvs.wait_for_topology_snapshot(client_num=0)
        except Exception as e:
            ex = e
        if metrics:
            self.create_loading_metrics_graph('pme_snapshot',
                                              metrics,
                                              dpi_factor=0.75)
        if ex:
            raise ex

    def test_baseline_sbt_model_loading(self):
        with PiClient(self.ignite, self.get_client_config()):
            import json

            with open("%s/json_model.json" % self.config['rt']['test_resource_dir'], 'r') as f:
                model_descriptor_file = f.read()

            model_descriptor = json.loads(json.loads(model_descriptor_file))
            caches_to_run = [item for item in model_descriptor.values()]

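            # run the load only over the caches described by the JSON model descriptor
            # while one additional node joins and the baseline is reset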
            with TransactionalLoading(self,
                                      caches_to_run=caches_to_run,
                                      kill_transactions_on_exit=True,
                                      cross_cache_batch=50,
                                      skip_atomic=True,
                                      loading_profile=LoadingProfile(delay=1,
                                                                     start_key=1,
                                                                     end_key=99,
                                                                     transaction_timeout=1000),
                                      tx_metrics=['txCreated', 'txCommit', 'txRollback', 'txFailed']) as tx_loading:
                self.ignite.add_additional_nodes(self.get_server_config(), 1)
                self._sleep_and_custom_event(tx_loading, 'start nodes')
                self.ignite.start_additional_nodes(self.ignite.get_all_additional_nodes())

                self._sleep_and_custom_event(tx_loading, 'set blt')
                self._set_baseline_few_times()

                self._sleep_and_custom_event(tx_loading, 'sleep')
                self._sleep_and_custom_event(tx_loading, 'end loading')

                metrics = tx_loading.metrics

        self.ignite.wait_for_topology_snapshot(client_num=0)

        self.create_loading_metrics_graph('test_baseline_sbt_model_loading',
                                          metrics)

        util_sleep_for_a_while(self.rebalance_timeout)
        self.cu.control_utility('--cache', 'idle_verify')
        
        tiden_assert_equal(0, self.ignite.find_exception_in_logs('java.lang.AssertionError'),
                           "No AssertionError in logs"
                           )
Example #16
    def test_loading_blinking_node_baseline(self):
        with PiClient(self.ignite, self.get_client_config()) as piclient:
            self.wait_for_running_clients_num(piclient.nodes_num, 90)

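            # ExitStack lets the loading context and the optional ZooKeeper restart context
            # be torn down together at the end of the test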
            with ExitStack() as stack:
                stack.enter_context(
                    TransactionalLoading(
                        self,
                        cross_cache_batch=2,
                        skip_atomic=True,
                        post_checksum_action=self.idle_verify_action))

                if is_enabled(self.config.get('zookeeper_enabled')) and \
                        is_enabled(self.config.get('zookeeper_nodes_restart')):
                    stack.enter_context(ZkNodesRestart(self.zoo, 2))

                for iteration in range(0, self.iterations):
                    log_print("Iteration {}/{}".format(str(iteration + 1),
                                                       str(self.iterations)),
                              color='blue')

                    self.assert_nodes_alive()

                    self.ignite.kill_node(2)
                    self.ignite.wait_for_topology_snapshot(
                        server_num=len(self.ignite.get_alive_default_nodes()))

                    self.cu.set_current_topology_as_baseline()

                    util_sleep(5)

                    self.start_node(2)
                    self.ignite.wait_for_topology_snapshot(
                        server_num=len(self.ignite.get_alive_default_nodes()))
                    self.cu.set_current_topology_as_baseline()

                    self.ignite.jmx.wait_for_finish_rebalance(
                        self.rebalance_timeout, self.group_names)

        tiden_assert_equal(
            0, self.ignite.find_exception_in_logs('java.lang.AssertionError'),
            "# of AssertionError")

    def server_segmentation_emulate(self,
                                    first_hosts_list,
                                    second_hosts_list,
                                    reverse=False):
        try:
            self.iptables_clear()

            self.assert_no_errors_in_utility_output()

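            # emulate segmentation: split the hosts into two groups while load is running,
            # then check the control utilities report no errors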
            with PiClient(self.ignite, self.get_client_config()):
                with TransactionalLoading(self, skip_consistency_check=True):
                    util_sleep_for_a_while(10, msg='Wait until load started')
                    self.split_nodes(first_hosts_list, second_hosts_list)
                    util_sleep_for_a_while(90, msg='Wait after network issue')
            util_sleep_for_a_while(5, msg='Wait after load')

            self.assert_no_errors_in_utility_output(tx_check=True,
                                                    reverse=reverse)

        finally:
            self.iptables_clear()
Example #18
    def test_loading_blinking_two_nodes_blt_and_extra_node(self):
        with PiClient(self.ignite, self.get_client_config()):
            additional_node = self.ignite.add_additional_nodes(
                self.get_server_config())[0]
            self.ignite.start_additional_nodes(additional_node)

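            # same blinking pattern as the previous example, but both a baseline node and
            # the extra (non-baseline) node are killed and restarted on every iteration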
            with ExitStack() as stack:
                stack.enter_context(
                    TransactionalLoading(
                        self,
                        cross_cache_batch=2,
                        skip_atomic=True,
                        post_checksum_action=self.idle_verify_action))

                if is_enabled(self.config.get('zookeeper_enabled')) and \
                        is_enabled(self.config.get('zookeeper_nodes_restart')):
                    stack.enter_context(ZkNodesRestart(self.zoo, 2))

                for iteration in range(0, self.iterations):
                    log_print("Iteration {}/{}".format(str(iteration + 1),
                                                       str(self.iterations)),
                              color='blue')

                    self.assert_nodes_alive()

                    self.ignite.kill_node(2)
                    self.ignite.kill_node(additional_node)

                    # self.ignite.start_node(2)
                    # self.ignite.start_additional_nodes(additional_node)
                    self.start_node(2)
                    self.start_node(additional_node)

                    self.ignite.jmx.wait_for_finish_rebalance(
                        self.rebalance_timeout, self.group_names)

        tiden_assert_equal(
            0, self.ignite.find_exception_in_logs('java.lang.AssertionError'),
            "# of AssertionError")

    def server_check_cluster_behaviour(self, node_segmented_group):
        """
        Given the expected group of segmented nodes:
            1. check that all nodes in this group are dead
            2. start all these nodes
            3. wait rebalance timeout
            4. check there was no data corruption:
                - call idle_verify
                - try to do some loading
                - call idle_verify again and check transactions
        :param node_segmented_group: group of nodes expected to be dead
        """
        # check all nodes are dead
        for node_id in node_segmented_group.keys():
            tiden_assert(
                not self.ignite.check_node_is_alive(node_id),
                "Node {} is expected to be dead".format(
                    node_segmented_group.get(node_id)))

        second_hosts_node_ids = [
            int(node) for node in node_segmented_group.keys()
        ]

        # start all nodes and wait for rebalance completed
        self.ignite.start_nodes(*second_hosts_node_ids, force=True)
        util_sleep_for_a_while(90, msg='Wait rebalance timeout')

        # check idle verify does not return any errors
        self.assert_no_errors_in_utility_output()

        # check with some loading
        with PiClient(self.ignite, self.get_client_config()):
            with TransactionalLoading(self, skip_consistency_check=True):
                util_sleep_for_a_while(15, msg='Little load')

        util_sleep_for_a_while(5, msg='Wait after load')

        self.assert_no_errors_in_utility_output(tx_check=True)
Example #20
    def _run_iteration(self, ignite, iteration):
        """
        One iteration of clients PME benchmark is as follows:

            1. start transactional loading at `loading_clients_hosts`, sleep `warmup_clients_delay` so that the load stabilizes
            2.    start `num_clients_to_kill` clients at `clients_hosts` (different from `loading_clients_hosts`)
                  measure JOIN exchange time, sleep `stabilization_delay`
            3.    stop started additional clients, measure LEAVE exchange time, sleep `cooldown_delay`
        :param ignite:
        :param iteration:
        :return:
        """
        log_print("===> PME {} Client(s) Left-Join Benchmark iteration {}/{} started ".format(
            self.config['num_clients_to_kill'],
            iteration,
            self.config['iterations']
        ), color='green')

        loading_client_hosts = self._get_loading_client_hosts()
        client_hosts = self._get_client_hosts(loading_client_hosts)
        num_clients = self.config['num_clients_to_kill']

        metrics = None
        ex = None
        x1_join_time = None
        x1_leave_time = None

        try:
            # start loading clients
            with PiClient(
                    ignite,
                    self.test_class.client_config,
                    client_hosts=loading_client_hosts,
                    clients_per_host=self.config.get('loading_clients_per_host', 1)
            ):
                # initiate transactional loading
                with TransactionalLoading(
                        self.test_class,
                        ignite=ignite,
                        kill_transactions_on_exit=self.config['kill_transactions_on_exit'],
                        cross_cache_batch=self.config['cross_cache_batch'],
                        skip_atomic=self.config['skip_atomic'],
                        skip_consistency_check=not self.config['consistency_check_enabled'],
                        loading_profile=LoadingProfile(
                            delay=self.config['tx_delay'],
                            commit_possibility=self.config['commit_possibility'],
                            start_key=1,
                            end_key=self.config['load_factor'] - 1,
                            transaction_timeout=self.config['transaction_timeout']
                        ),
                        tx_metrics=['txCreated', 'txCommit', 'txFailed', 'txRollback']
                ) as tx_loading:
                    metrics = tx_loading.metrics

                    util_sleep_for_a_while(self.config['warmup_clients_delay'], "Before JOIN")

                    current_clients_num = ignite.get_nodes_num('client')
                    expected_total_clients_num = current_clients_num + num_clients

                    self.test_class._prepare_before_test(ignite, tx_loading, 'JOIN %d client(s)' % num_clients)

                    # start num_clients client nodes on 'flaky' hosts
                    with PiClient(
                            ignite,
                            self.test_class.client_config,
                            client_hosts=client_hosts,
                            clients_per_host=self.config.get('clients_per_host', 1),
                            nodes_num=num_clients,
                            new_instance=True,
                    ):

                        ignite.wait_for_topology_snapshot(client_num=expected_total_clients_num, timeout=600, check_only_servers=True, exclude_nodes_from_check=[])
                        tx_loading.metrics_thread.add_custom_event('%d client(s) joined' % num_clients)
                        new_topVer = self.test_class._get_new_top_after_test(ignite)
                        self.test_class._wait_exchange_finished(ignite, new_topVer)

                        x1_join_time, x2_time = self.test_class._measurements_after_test('JOIN %d client(s)' % num_clients, skip_exch=1)

                        util_sleep_for_a_while(self.config['stabilization_delay'])

                        # upon exit from with block, num_clients client nodes will be killed
                        self.test_class._prepare_before_test(ignite, tx_loading, 'LEAVE %d client(s)' % num_clients)

                    ignite.wait_for_topology_snapshot(client_num=current_clients_num, timeout=600, check_only_servers=True, exclude_nodes_from_check=[])
                    tx_loading.metrics_thread.add_custom_event('%d client(s) left' % num_clients)
                    new_topVer = self.test_class._get_new_top_after_test(ignite)
                    self.test_class._wait_exchange_finished(ignite, new_topVer)

                    x1_leave_time, x2_time = self.test_class._measurements_after_test('LEAVE %d client(s)' % num_clients, skip_exch=1)
                    util_sleep_for_a_while(self.config['cooldown_delay'])

            ignite.wait_for_topology_snapshot(client_num=0)
        except Exception as e:
            ex = e
        if metrics:
            self.test_class.create_loading_metrics_graph(
                'pme_%d_clients_left_join_%s_%d' % (num_clients, self.run_id, iteration),
                metrics,
                dpi_factor=0.75
            )
        if ex:
            raise ex

        return {
            'Exchange Client Join': x1_join_time,
            'Exchange Client Leave': x1_leave_time,
        }
Example #21
    def test_indexes_rebuilded(self):
        """
        https://ggsystems.atlassian.net/browse/GG-17428

        1. Start cluster.
        2. Start transactional loading.
        3. Stop one node and remove index.bin files for the caches.
        4. Start node and let it finish rebalance.
        5. Check indexes are not broken after rebalance.
        :return:
        """
        self.need_delete_lfs_on_teardown = True
        debug = False

        with PiClient(self.ignite, self.get_client_config()) as piclient:
            self.wait_for_running_clients_num(piclient.nodes_num, 90)

            with ExitStack() as stack:
                # todo unreachable code
                if False:
                    stack.enter_context(
                        TransactionalLoading(
                            self,
                            cross_cache_batch=2,
                            skip_atomic=True,
                            post_checksum_action=self.idle_verify_action))

                if is_enabled(self.config.get('zookeeper_enabled')) and \
                        is_enabled(self.config.get('zookeeper_nodes_restart')):
                    stack.enter_context(ZkNodesRestart(self.zoo, 2))

                for iteration in range(0, self.iterations):
                    log_print("Iteration {}/{}".format(str(iteration + 1),
                                                       str(self.iterations)),
                              color='blue')

                    self.assert_nodes_alive()

                    with TransactionalLoading(self,
                                              cross_cache_batch=2,
                                              skip_atomic=True):

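                        # step 3 of the scenario: stop node 2 while a short load burst runs,
                        # then remove its index.bin files to force an index rebuild on restart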
                        util_sleep(20)
                        self.ignite.kill_node(2)

                    if debug:
                        self.cu.control_utility(
                            '--cache idle_verify --dump --skip-zeros')

                    self.remove_index_bin_files(2)
                    util_sleep(10)

                    if debug:
                        self.cu.control_utility(
                            '--cache idle_verify --dump --skip-zeros')

                    self.start_node(2)

                    self.ignite.jmx.wait_for_finish_rebalance(
                        self.rebalance_timeout, self.group_names)
                    util_sleep(30)
                    log_print("Check indexes")
                    try:
                        if debug:
                            self.cu.control_utility(
                                '--cache idle_verify --dump --skip-zeros')
                        self.idle_verify_action(None)
                    except TidenException:
                        if debug:
                            self.cu.control_utility(
                                '--cache idle_verify --dump --skip-zeros')
                        raise TidenException('validate_index failed')

        tiden_assert_equal(
            0, self.ignite.find_exception_in_logs('java.lang.AssertionError'),
            "# of AssertionError")
Example #22
    def run(self, artifact_name):
        """
        Run rebalance scenario for defined artifact

        The scenario is simple:
        1. start cluster
        2. load data into one cache with backups until its size reaches 'data_size_kb' from config (5GB is optimal)
        3. start or skip loading
        4. kill the node with the cache, clean its lfs, start the node again
        5. using the JMX utility, wait until LocalNodeMovingPartitionsCount for the cache reaches 0
        6. save this value and divide it by the time spent

        Netstat metrics are also collected while running this scenario
        (in this case we don't need a separate probe to collect more precise metrics)

        :param artifact_name: name from artifact configuration file
        """
        super().run(artifact_name)

        log_print("Running rebalance benchmark with config: %s" % self.config, color='green')

        version = self.test_class.tiden.config['artifacts'][artifact_name]['ignite_version']
        ignite = None
        try:
            # collect properties from config
            self.initialize_config()

            in_memory = self.config.get('in_memory', False)
            xml_config_set_name = 'caches_%s.xml' % REBALANCE_CONFIG_SET \
                if 'single_cache' not in self.config else 'single_cache_%s.xml' % REBALANCE_CONFIG_SET
            self.test_class.create_app_config_set(
                Ignite, REBALANCE_CONFIG_SET,
                deploy=True,
                caches_list_file=xml_config_set_name,
                snapshots_enabled=True,
                logger=True,
                wal_segment_size=self.test_class.consumption_config.get('wal_segment_size',
                                                                        64 * 1024 * 1024),
                logger_path='%s/ignite-log4j2.xml' %
                            self.test_class.tiden.config['rt']['remote']['test_module_dir'],
                disabled_cache_configs=False,
                zookeeper_enabled=False,
                rebalance_pool_size=self.config.get('rebalance_pool_size', 8),
                system_pool_size=self.config.get('rebalance_pool_size', 8) + 8,
                checkpoint_read_lock_timeout=self.read_lock_property_value(version),
                wal_compaction_enabled=self.artifact_config_variables.get('wal_compaction_enabled', False),
                # caches related variables
                additional_configs=['caches.tmpl.xml', ] if 'single_cache' not in self.config else [
                    'single_cache.tmpl.xml', ],
                partitions=5 if self.parts_distribution else 1024,
                part_32=self.test_class.consumption_config.get('part_32',
                                                               32),  # see cache.tmpl.xml for more details
                part_64=self.test_class.consumption_config.get('part_64',
                                                               64),
                part_128=self.test_class.consumption_config.get('part_64',
                                                                128),
                in_memory=in_memory,
                backups=self.config.get('backups', 0),
                load_type=self.load_type,
            )

            # run ignite app
            keys_to_load = int(self.config.get('keys_to_load'))
            ignite, last_end_key, version = self.start_cluster_with_data(keys_to_load, False)

            ignite.set_snapshot_timeout(600)

            # wait for checkpoint
            sleep(CHECKPOINT_SLEEP)

            # dump idle_verify if need and no loading
            dump_before = None
            if self.idle_verify and not self.with_loading:
                dump_before = ignite.cu.idle_verify_dump()

            self.start_probes(artifact_name)

            warmup_runs, prod_runs = self._get_number_of_runs()

            # run rebalance calculation
            if self.with_loading:
                client_config = Ignite.config_builder.get_config('client', config_set_name=REBALANCE_CONFIG_SET)
                with PiClient(ignite, client_config, jvm_options=['-DPICLIENT_OPERATIONS_POOL_SIZE=64']) as piclient:
                    if self.parts_distribution:
                        # for partition distribution we need to pass config_loading_dict
                        cache_load_map = {
                            CACHE_NAME: {
                                # following keys are key generator builder arguments
                                # we build the java object later, when we know the exact gateway
                                'key_generator_builder': AffinityPartitionKeyGeneratorBuilder(
                                    CACHE_NAME,
                                    self.parts_distribution,
                                    1,
                                    keys_to_load,
                                ).set_collision_possibility(0.5),

                                # this is metrics postfix (need to separate different caches in plot)
                                'metric_postfix': 'rebalance',  # metrics postfix for plot
                            },
                        }
                    else:
                        cache_load_map = {
                            CACHE_NAME: {
                                # following keys are key generator builder arguments
                                # we build the java object later, when we know the exact gateway
                                'key_generator_builder':
                                    AffinityCountKeyGeneratorBuilder(
                                        CACHE_NAME,
                                        ignite.get_node_consistent_id(
                                            NODE_TO_REBALANCE),
                                        1,
                                        keys_to_load,
                                        True
                                    ).set_collision_possibility(0.5),

                                # this is metrics postfix (need to separate different caches in plot)
                                'metric_postfix': 'rebalance',  # metrics postfix for plot
                            },
                        }

                    caches_to_load = [CACHE_NAME, ]

                    # define tx_metrics for TransactionalLoading
                    tx_metrics = [
                        'txCreated_rebalance',
                        'txFailed_rebalance',
                    ]

                    if self.with_no_rebalance_cache:
                        # this cache will not be on NODE_TO_REBALANCE but will be under transactionalLoading
                        cache_load_map[CACHE_NAME_NOT_IN_REBALANCE] = {
                            # following keys are key generator builder arguments
                            # we build the java object later, when we know the exact gateway
                            'key_generator_builder': AffinityCountKeyGeneratorBuilder(
                                CACHE_NAME_NOT_IN_REBALANCE,
                                ignite.get_node_consistent_id(
                                    NODE_TO_REBALANCE),
                                1,
                                keys_to_load,
                                False
                            ).set_collision_possibility(0.5),

                            # this is metrics postfix (need to separate different caches in plot)
                            'metric_postfix': 'no_rebalance',  # metrics postfix for plot
                        }

                        caches_to_load.append(CACHE_NAME_NOT_IN_REBALANCE)

                        # mutate tx_metrics for TransactionalLoading
                        tx_metrics.append('txCreated_no_rebalance')
                        tx_metrics.append('txFailed_no_rebalance')

                    with TransactionalLoading(self.test_class,
                                              ignite=ignite,
                                              cu=ignite.cu,
                                              config_file=client_config,
                                              caches_to_run=caches_to_load,
                                              skip_consistency_check=True,
                                              cross_cache_batch=1,
                                              cache_load_map=cache_load_map,
                                              keys_count=keys_to_load,
                                              # multiply execution operations, because we load only in 1 or 2 caches
                                              load_threads=16 * piclient.nodes_num if self.single_cache else None,
                                              collect_timeout=5000,
                                              collision_possibility=0.5,
                                              with_exception=False,
                                              loading_profile=LoadingProfile(commit_possibility=0.8,
                                                                             end_key=last_end_key),
                                              tx_metrics=tx_metrics
                                              ) as tx_loading:
                        LoadingUtils.sleep_and_custom_event(tx_loading, 'Sleep before test', self.metrics_idle)

                        # define snapshot timeout for rebalance on loading
                        ignite.snapshot_timeout = 600

                        rebalance_speed = \
                            self.calculate_rebalance_speed(
                                ignite, prod_runs, warmup_runs, last_end_key, keys_to_load,
                                tx_loading=tx_loading,
                            )

                        metrics = tx_loading.metrics

                LoadingUtils.create_loading_metrics_graph(self.test_class.config['suite_var_dir'],
                                                          'rebalance_%s_%s' %
                                                          (
                                                              version,
                                                              'loading' if self.with_loading else ''
                                                          ),
                                                          metrics)
            else:
                rebalance_speed = self.calculate_rebalance_speed(
                    ignite, prod_runs, warmup_runs, last_end_key, keys_to_load,
                    version=version,
                )

            # dump idle_verify if need and no loading
            if self.idle_verify and not self.with_loading:
                dump_after = ignite.cu.idle_verify_dump()

                if dump_after != dump_before:
                    log_print('Failed idle_verify additional check', color='red')

            ignite.cu.deactivate()

            self.stop_probes(speed=rebalance_speed)

            self.results['evaluated'] = True
        finally:
            if ignite:
                self.kill_cluster(ignite)

            # remove config set
            self.test_class.remove_app_config_set(Ignite, REBALANCE_CONFIG_SET)

    def test_during_loading(self):
        """
        Should be fully fixed in 8.5.8-p1

        Scenario:

            1. Start 3 server nodes
            2. Load 1000 keys into 120 TX caches
            3. Start 3 client nodes and start TX loading (PESSIMISTIC/REPEATABLE_READ, OPTIMISTIC/SERIALIZABLE)
                    (12 transfer operations, 10 caches in each operation,
                        1000ms between each transaction, i.e. ~ 4 tx per second from each client)
            4. In clients try to destroy caches
            5. Interesting things happen

        Fixed in 8.5.8-p1
        https://ggsystems.atlassian.net/browse/GG-19179

        Issues that were found during this test:
        https://ggsystems.atlassian.net/browse/GG-19411
        https://ggsystems.atlassian.net/browse/GG-19383

        :return:
        """
        PiClient.read_timeout = 600

        ignite = self.start_ignite_grid(self.ignite_name)

        ignite.cu.activate(activate_on_particular_node=1)

        PiClientIgniteUtils.load_data_with_putall(ignite, self.client_config, )

        def get_dumps():
            for node_id in ignite.nodes.keys():
                self.util_get_threads_from_jstack(ignite, node_id, 'END')

        try:
            with PiClient(ignite, self.client_config) as piclient:
                with TransactionalLoading(self,
                                          ignite=ignite,
                                          config_file=self.client_config,
                                          on_exit_action=get_dumps,
                                          kill_transactions_on_exit=True,
                                          with_exception=False,  # do interrupt loading operation if smth happens?
                                          skip_consistency_check=True,  # we are destroying caches here if you notice
                                          loading_profile=LoadingProfile(
                                              delay=1000,
                                              allowed_transactions=(
                                                      TxDescriptor(concurrency='OPTIMISTIC',
                                                                   isolation='SERIALIZABLE', ),)
                                          )):
                    # allowed_transactions=(TxDescriptor(), ))):
                    # )):
                    node_id = piclient.get_node_id()
                    client_ignite = piclient.get_ignite(node_id)

                    cache_names = client_ignite.cacheNames().toArray()

                    caches_to_kill_num = 50
                    frags = 0

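                    # step 4 of the scenario: destroy the first 50 caches from the client
                    # while the transactional load is still using them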
                    for cache in cache_names:
                        node_id = piclient.get_node_id()

                        log_print('Destroying cache %s on node %s' % (cache, node_id), color='red')

                        piclient.get_ignite(node_id).cache(cache).destroy()

                        frags += 1

                        if frags >= caches_to_kill_num:
                            break
        finally:
            npe_errors = ignite.find_exception_in_logs(".*java.lang.NullPointerException.*")

            assertion_errors = ignite.find_exception_in_logs(".*java.lang.AssertionError.*")

            if npe_errors != 0 or assertion_errors != 0:
                assert False, "There are errors in logs: NPE - %s, AE - %s" % (npe_errors, assertion_errors)
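    # Minimal helper sketch (not part of the original suite): factors out the log-scan
    # check from the finally block above. It assumes, as the code above does, that
    # find_exception_in_logs returns the number of matching occurrences.
    def _assert_no_critical_errors_in_logs(self, ignite):
        patterns = ('.*java.lang.NullPointerException.*', '.*java.lang.AssertionError.*')
        found = {pattern: ignite.find_exception_in_logs(pattern) for pattern in patterns}
        problems = {pattern: count for pattern, count in found.items() if count}
        assert not problems, 'There are errors in logs: %s' % problems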
Example #24
    def _run_iteration(self, ignite, iteration):
        """
        One iteration of server PME benchmark is as follows:

            1. start transactional loading, sleep `warmup_servers_delay` so that the load stabilizes
            2.   kill N random nodes, measure LEAVE exchange time, sleep `stabilization_delay`
            3.   restart killed nodes, measure JOIN exchange time, sleep `cooldown_delay`
            4. stop load

        :param ignite:
        :param iteration:
        :return:
        """
        log_print(
            "===> PME {} Server(s) Left-Join Benchmark iteration {}/{} started "
            .format(self.config['num_servers_to_kill'], iteration,
                    self.config['iterations']),
            color='green')

        # if debug:
        #     from pt.util import read_yaml_file
        #     from os.path import join
        #     base_path = 'pt/tests/res/exchanges'
        #     exch_test = iteration
        #     start_exch = read_yaml_file(join(base_path, 'start_exch.%d.yaml' % exch_test))
        #     finish_exch = read_yaml_file(join(base_path, 'finish_exch.%d.yaml' % exch_test))
        #     merge_exch = read_yaml_file(join(base_path, 'merge_exch.%d.yaml' % exch_test))
        #     self.test_class.exchanges = ExchangesCollection.create_from_log_data(start_exch, finish_exch, merge_exch)
        #     self.test_class.new_topVer = 5
        #     x1_leave_time, x2_time = self.test_class._measurements_after_test('test_leave', skip_exch=1)
        #     self.test_class.new_topVer = 6
        #     x1_join_time, x2_time = self.test_class._measurements_after_test('test_join', skip_exch=1)
        #
        #     return x1_leave_time, x1_join_time

        loading_client_hosts = self._get_loading_client_hosts()
        num_servers = self._get_num_server_nodes()
        num_servers_to_kill = self.config['num_servers_to_kill']
        kill_coordinator = self.config['kill_coordinator']

        metrics = None
        ex = None
        x1_join_time = None
        x1_leave_time = None

        try:
            # start loading clients ...
            with PiClient(ignite,
                          self.test_class.client_config,
                          client_hosts=loading_client_hosts,
                          clients_per_host=self.config.get(
                              'loading_clients_per_host', 1)):
                # ... and initiate transactional load
                with TransactionalLoading(
                        self.test_class,
                        ignite=ignite,
                        kill_transactions_on_exit=self.
                        config['kill_transactions_on_exit'],
                        cross_cache_batch=self.config['cross_cache_batch'],
                        skip_atomic=self.config['skip_atomic'],
                        skip_consistency_check=not self.
                        config['consistency_check_enabled'],
                        loading_profile=LoadingProfile(
                            delay=self.config['tx_delay'],
                            commit_possibility=self.
                            config['commit_possibility'],
                            start_key=1,
                            end_key=self.config['load_factor'] - 1,
                            transaction_timeout=self.
                            config['transaction_timeout']),
                        tx_metrics=[
                            'txCreated', 'txCommit', 'txFailed', 'txRollback'
                        ]) as tx_loading:
                    metrics = tx_loading.metrics

                    # pick random server nodes
                    node_ids = ignite.get_random_server_nodes(
                        num_servers_to_kill,
                        use_coordinator=kill_coordinator,
                        node_ids=self.test_class.server_node_ids,
                    )

                    expected_total_server_num = num_servers - len(node_ids)

                    # ... wait for the load to stabilize
                    util_sleep_for_a_while(self.config['warmup_servers_delay'],
                                           "Before LEAVE")

                    if is_enabled(self.config.get('jfr_enabled', False)):
                        ignite.make_cluster_jfr(60)

                    util_sleep_for_a_while(2)
                    self.test_class._prepare_before_test(
                        ignite, tx_loading,
                        'LEAVE %d server(s)' % len(node_ids))

                    # ... kill selected random nodes
                    ignite.kill_nodes(*node_ids)
                    ignite.wait_for_topology_snapshot(
                        server_num=expected_total_server_num)
                    tx_loading.metrics_thread.add_custom_event(
                        '%d server(s) left' % len(node_ids))

                    new_topVer = self.test_class._get_new_top_after_test(
                        ignite)
                    self.test_class._wait_exchange_finished(ignite, new_topVer)

                    x1_leave_time, x2_time = self.test_class._measurements_after_test(
                        'LEAVE %d server(s)' % len(node_ids), skip_exch=1)

                    if is_enabled(self.config.get('heapdump_enabled', False)):
                        ignite.make_cluster_heapdump(
                            [1], 'after_%d_server_leave' % len(node_ids))

                    # ... wait for the exchange to stabilize
                    util_sleep_for_a_while(self.config['stabilization_delay'],
                                           "After LEAVE, before JOIN")

                    if self.config['measure_restart_nodes']:
                        self.test_class._prepare_before_test(
                            ignite, tx_loading,
                            'JOIN %d server(s)' % len(node_ids))

                    # ... restart killed nodes
                    ignite.start_nodes(*node_ids)
                    ignite.wait_for_topology_snapshot(
                        server_num=expected_total_server_num + len(node_ids))

                    if self.config['measure_restart_nodes']:
                        tx_loading.metrics_thread.add_custom_event(
                            '%d server(s) joined' % len(node_ids))

                        new_topVer = self.test_class._get_new_top_after_test(
                            ignite)
                        self.test_class._wait_exchange_finished(
                            ignite, new_topVer)
                        x1_join_time, x2_time = self.test_class._measurements_after_test(
                            'JOIN %d server(s)' % len(node_ids), skip_exch=1)
                        # if is_enabled(self.config.get('heapdump_enabled', False)):
                        #     ignite.make_cluster_heapdump([1], 'after_%d_server_join' % len(node_ids))

                    # ... wait exchange cooldown
                    util_sleep_for_a_while(self.config['cooldown_delay'],
                                           "After JOIN")

            ignite.wait_for_topology_snapshot(client_num=0)
        except Exception as e:
            ex = e
        if metrics:
            self.test_class.create_loading_metrics_graph(
                'pme_%d_servers_left_join_%s_%d' %
                (num_servers_to_kill, self.run_id, iteration),
                metrics,
                dpi_factor=0.75)
        if ex:
            raise ex

        return {
            'Exchange Server Join': x1_join_time,
            'Exchange Server Leave': x1_leave_time,
        }
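    # Hypothetical aggregation sketch (not from the original code): averages the
    # per-iteration timings returned by _run_iteration above. The result keys come
    # from its return statement; the helper itself is illustrative only.
    def _aggregate_iteration_results(self, iteration_results):
        averages = {}
        for key in ('Exchange Server Leave', 'Exchange Server Join'):
            values = [result[key] for result in iteration_results if result.get(key) is not None]
            averages[key] = sum(values) / len(values) if values else None
        return averages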
Example #25
    def do_pme_server_bench(self, num_servers, kill_coordinator=False):
        metrics = None
        ex = None
        self.ignite.ignite_srvs.make_cluster_heapdump([1], 'before_load')
        try:
            with PiClient(self.ignite.ignite_cli_load,
                          self.get_client_config()):
                with TransactionalLoading(
                        self,
                        kill_transactions_on_exit=True,
                        cross_cache_batch=self.cross_cache_batch,
                        skip_atomic=True,
                        skip_consistency_check=not self.consistency_check,
                        loading_profile=LoadingProfile(
                            delay=self.tx_delay,
                            commit_possibility=0.97,
                            start_key=1,
                            end_key=self.LOAD_FACTOR - 1,
                            transaction_timeout=1000),
                        tx_metrics=[
                            'txCreated', 'txCommit', 'txRollback', 'txFailed'
                        ]) as tx_loading:

                    metrics = tx_loading.metrics
                    node_ids = self.ignite.ignite_srvs.get_random_server_nodes(
                        num_servers, use_coordinator=kill_coordinator)
                    expected_total_server_num = len(
                        self.ignite.get_all_default_nodes()) - len(node_ids)

                    self._prepare_before_test(
                        tx_loading, 'LEAVE %d server(s)' % len(node_ids))

                    self.ignite.ignite_srvs.make_cluster_jfr(60)
                    util_sleep_for_a_while(2)
                    self.ignite.ignite_srvs.kill_nodes(*node_ids)

                    self.ignite.ignite_srvs.wait_for_topology_snapshot(
                        server_num=expected_total_server_num)
                    tx_loading.metrics_thread.add_custom_event(
                        '%d server(s) left' % len(node_ids))

                    self._measurements_after_test('LEAVE %d server(s)' %
                                                  len(node_ids),
                                                  skip_exch=1)
                    # self.ssh.exec_on_host(self.ignite.ignite_srvs.nodes[1]['host'], [
                    #     'jmap -dump:format=b,file={testdir}/heapdump.{pid}.hprof {pid}'.format(
                    #         testdir=self.config['rt']['remote']['test_dir'],
                    #         pid=self.ignite.ignite_srvs.nodes[1]['PID'],
                    #     )
                    # ])
                    self.ignite.ignite_srvs.make_cluster_heapdump(
                        [1], 'after_server_leave')

                    util_sleep_for_a_while(self.stabilization_time)

                    self._prepare_before_test(
                        tx_loading, 'JOIN %d server(s)' % len(node_ids))
                    self.ignite.ignite_srvs.start_nodes(*node_ids)
                    self.ignite.ignite_srvs.wait_for_topology_snapshot(
                        server_num=expected_total_server_num + len(node_ids))
                    tx_loading.metrics_thread.add_custom_event(
                        '%d server(s) joined' % len(node_ids))
                    util_sleep_for_a_while(int(3 * self.LOAD_FACTOR / 1000))

                    self._measurements_after_test('JOIN %d server(s)' %
                                                  len(node_ids),
                                                  skip_exch=1)
                    self.ignite.ignite_srvs.make_cluster_heapdump(
                        [1], 'after_server_join')

                    util_sleep_for_a_while(self.stabilization_time)

            self.ignite.ignite_srvs.wait_for_topology_snapshot(client_num=0)
        except Exception as e:
            ex = e
        if metrics:
            self.create_loading_metrics_graph('pme_%d_servers_left_join' %
                                              num_servers,
                                              metrics,
                                              dpi_factor=0.75)
        if ex:
            raise ex
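    # Illustrative usage sketch (assumed, not taken from the original suite): a test
    # method driving the benchmark above for two leaving/joining servers. The test
    # name and arguments are hypothetical; only do_pme_server_bench comes from the code above.
    def test_pme_2_servers_left_join(self):
        self.do_pme_server_bench(2, kill_coordinator=False)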
Example #26
    def do_pme_client_bench(self, num_clients):
        metrics = None
        ex = None
        try:
            with PiClient(self.ignite.ignite_cli_load,
                          self.get_client_config()):
                with TransactionalLoading(
                        self,
                        kill_transactions_on_exit=True,
                        cross_cache_batch=self.cross_cache_batch,
                        skip_atomic=True,
                        skip_consistency_check=not self.consistency_check,
                        loading_profile=LoadingProfile(
                            delay=self.tx_delay,
                            commit_possibility=0.97,
                            start_key=1,
                            end_key=self.LOAD_FACTOR - 1,
                            transaction_timeout=1000),
                        tx_metrics=[
                            'txCreated', 'txCommit', 'txRollback', 'txFailed'
                        ]) as tx_loading:
                    metrics = tx_loading.metrics

                    expected_total_num_clients = len(
                        self.ignite.get_all_client_nodes() +
                        self.ignite.get_all_common_nodes())

                    self._prepare_before_test(
                        tx_loading, 'JOIN %d client(s)' % num_clients)

                    # start num_clients client nodes on 'flaky' hosts
                    with PiClient(self.ignite.ignite_cli_flaky,
                                  self.get_client_config(),
                                  nodes_num=num_clients,
                                  new_instance=True):
                        self.ignite.ignite_srvs.wait_for_topology_snapshot(
                            client_num=expected_total_num_clients +
                            num_clients)
                        tx_loading.metrics_thread.add_custom_event(
                            '%d client(s) joined' % num_clients)

                        self._measurements_after_test('JOIN %d client(s)' %
                                                      num_clients,
                                                      skip_exch=1)

                        util_sleep_for_a_while(self.stabilization_time)

                        # upon exit from with block, num_clients client nodes will be killed
                        self._prepare_before_test(
                            tx_loading, 'LEAVE %d client(s)' % num_clients)

                    self.ignite.ignite_srvs.wait_for_topology_snapshot(
                        client_num=expected_total_num_clients)
                    tx_loading.metrics_thread.add_custom_event(
                        '%d client(s) left' % num_clients)

                    self._measurements_after_test('LEAVE %d client(s)' %
                                                  num_clients,
                                                  skip_exch=1)
                    util_sleep_for_a_while(self.stabilization_time)

            self.ignite.ignite_srvs.wait_for_topology_snapshot(client_num=0)
        except Exception as e:
            ex = e
        if metrics:
            self.create_loading_metrics_graph('pme_%d_clients_join_left' %
                                              num_clients,
                                              metrics,
                                              dpi_factor=0.75)
        if ex:
            raise ex
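    # Illustrative usage sketch (assumed): drives the client join/leave benchmark above
    # for a fixed number of clients; the test name and the value 4 are hypothetical,
    # only do_pme_client_bench comes from the code above.
    def test_pme_4_clients_join_left(self):
        self.do_pme_client_bench(4)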
    def test_full_cluster_blinking(self):
        """

        Enable indexes

        Start servers with PDS, start clients, start some light tx loading.
        In a loop, blink (kill and restart) the whole cluster at the same time. Logically there should be no data loss:
            since the full cluster blinks at once, nothing should be lost

        :return:
        """

        PiClient.read_timeout = 240

        self.set_current_context('indexed_types')

        self.util_copy_piclient_model_to_libs()
        self.ignite.set_activation_timeout(240)
        self.ignite.set_snapshot_timeout(240)
        self.ignite.set_node_option('*', 'jvm_options', ['-ea'])
        self.su.clear_snapshots_list()
        self.start_grid(skip_activation=True)

        self.ignite.cu.activate(activate_on_particular_node=1)

        PiClientIgniteUtils.load_data_with_streamer(self.ignite,
                                                    self.get_client_config(),
                                                    end_key=100000)

        nodes_before = self.ignite.get_alive_default_nodes()
        iterations = 50

        with PiClient(self.ignite, self.get_client_config()):
            with TransactionalLoading(
                    self, loading_profile=LoadingProfile(delay=1000)):
                for i in range(0, iterations):
                    log_print('Current iteration %s from %s' % (i, iterations),
                              color='debug')

                    for node_id in nodes_before:
                        self.ignite.kill_node(node_id)
                        sleep(
                            float(self.the_glue_timeout) if self.
                            the_glue_timeout else round(
                                random.uniform(0.1, 0.5), 1))

                    for node_id in nodes_before:
                        self.ignite.start_node(node_id,
                                               skip_topology_check=True)
                        sleep(
                            float(self.the_glue_timeout) if self.
                            the_glue_timeout else round(
                                random.uniform(0.1, 0.5), 1))

                    self.ignite.wait_for_topology_snapshot(
                        server_num=len(nodes_before))

                    for node_id in self.ignite.get_all_default_nodes():
                        self.ignite.update_started_node_status(node_id)

                    sleep(10)

                    self.cu.control_utility('--cache validate_indexes',
                                            all_required='no issues found.')

                    self.verify_cluster(nodes_before, 0)
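    # Small refactoring sketch (not in the original code): the kill/start loops above
    # repeat the same delay expression twice; a helper like this could factor it out.
    # the_glue_timeout and the 0.1-0.5s random range come from the code above.
    def _blink_delay(self):
        import random  # imported at module level in the original test; repeated here for self-containment
        if self.the_glue_timeout:
            return float(self.the_glue_timeout)
        return round(random.uniform(0.1, 0.5), 1)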
Example #28
    def test_24_fitness_rolling_upgrade(self):
        """
        This test checks the main rolling upgrade scenario under load:
            1. Old cluster is up and running (consistent_id values are not set).
            2. First cycle: upgrade to the new version and set the property
                GG_DISABLE_SNAPSHOT_ON_BASELINE_CHANGE_WITH_ENABLED_PITR.
            3. Second cycle: set the correct consistent_id and add the nodes to the baseline topology.

        """
        created_caches = []

        self.ignite_old_version.cu.activate()

        with PiClient(self.ignite_new_version, self.client_config,
                      nodes_num=1) as piclient:

            dynamic_caches_factory = DynamicCachesFactory()
            async_ops = []
            for method in dynamic_caches_factory.dynamic_cache_configs:
                cache_name = "cache_group_%s" % method
                log_print('Loading {}...'.format(cache_name), color='green')
                piclient.get_ignite().getOrCreateCache(
                    getattr(dynamic_caches_factory, method)(cache_name))

                async_operation = create_async_operation(
                    create_put_all_operation,
                    cache_name,
                    1,
                    1001,
                    10,
                    value_type=self.data_model)
                async_ops.append(async_operation)
                async_operation.evaluate()
                created_caches.append(cache_name)

            log_print('Waiting async results...', color='debug')
            # wait for streamer to complete
            for async_op in async_ops:
                async_op.getResult()

        util_sleep_for_a_while(60)

        with PiClient(self.ignite_old_version, self.client_config,
                      nodes_num=4) as piclient:
            cache_names = piclient.get_ignite().cacheNames()

            # Start transaction loading for TTL caches
            with TransactionalLoading(MixedTestLoadingAdapter(self),
                                      config_file=self.client_config,
                                      loading_profile=LoadingProfile(
                                          delay=0,
                                          transaction_timeout=100000,
                                          run_for_seconds=600)):
                util_sleep_for_a_while(20)
                log_print('Rolling upgrade', color='green')
                async_ops = []
                for cache_name in [
                        cache_name for cache_name in cache_names.toArray()
                        if cache_name.startswith("M2_PRODUCT")
                ]:
                    async_operation = create_async_operation(
                        create_put_all_operation,
                        cache_name,
                        1001,
                        400001,
                        10,
                        value_type=ModelTypes.VALUE_ALL_TYPES.value)
                    async_ops.append(async_operation)
                    async_operation.evaluate()

                # First cycle: upgrade version and set property.
                for i in range(1, 5):
                    self.ignite_old_version.cu.control_utility('--baseline')
                    log_print('Stopping node {}'.format(i), color='green')
                    self.ignite_old_version.kill_nodes(i)

                    self.ignite_new_version.cleanup_work_dir(i)
                    folder = self.ignite_old_version.get_work_dir(i)
                    log_print(folder, color='debug')
                    self.ignite_new_version.copy_work_dir_from(i, folder)

                    jvm_options = self.ignite_new_version.get_jvm_options(i)
                    jvm_options.append(
                        '-DGG_DISABLE_SNAPSHOT_ON_BASELINE_CHANGE_WITH_ENABLED_PITR=true'
                    )

                    util_sleep_for_a_while(10)
                    self.ignite_new_version.start_nodes(i,
                                                        already_nodes=(4 - i),
                                                        other_nodes=(4 - i),
                                                        timeout=240)
                    self.ignite_new_version.cu.control_utility('--baseline')

                for async_op in async_ops:
                    async_op.getResult()

                util_sleep_for_a_while(30)
                log_print('Change consistent ID', color='green')

                self.ignite_new_version.set_node_option(
                    '*', 'config',
                    Ignite.config_builder.get_config(
                        'server', config_set_name='24_fit_with_consist_id'))

                # Second cycle - change consistent_id and add to baseline topology.
                for i in range(1, 5):
                    self.ignite_new_version.cu.control_utility('--baseline')
                    log_print('Stopping node {}'.format(i), color='green')
                    self.ignite_new_version.kill_nodes(i)
                    log_print(
                        "Starting node {} with new consistent id".format(i),
                        color='debug')
                    self.ignite_new_version.start_nodes(i, timeout=240)
                    log_print("Changing baseline", color='debug')
                    self.ignite_new_version.cu.set_current_topology_as_baseline(
                    )
                    util_sleep_for_a_while(
                        60, msg='Wait for rebalance to complete')

                log_print('Transactional loading done', color='green')

            # Just to check that the client node can still interact with the cluster - calculate checksums from the client node.
            sorted_cache_names = []
            for cache_name in piclient.get_ignite().cacheNames().toArray():
                sorted_cache_names.append(cache_name)

            sorted_cache_names.sort()

            async_operations = []
            cache_operation = {}
            for cache_name in sorted_cache_names:
                async_operation = create_async_operation(
                    create_checksum_operation, cache_name, 1, 10000)
                async_operations.append(async_operation)
                cache_operation[async_operation] = cache_name
                async_operation.evaluate()

            checksums = ''
            cache_checksum = {}
            for async_operation in async_operations:
                result = str(async_operation.getResult())
                cache_checksum[cache_operation.get(async_operation)] = result
                checksums += result

            log_print('Calculating checksums done')
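            # Illustrative follow-up sketch (assumed, not part of the original test):
            # the per-cache checksums collected above could be compared against a map
            # captured before the upgrade; this helper and its `expected` argument are
            # hypothetical, only cache_checksum comes from the code above.
            def _diff_cache_checksums(expected, actual):
                return {name: (expected.get(name), actual.get(name))
                        for name in set(expected) | set(actual)
                        if expected.get(name) != actual.get(name)}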