Example #1
    def setup_testcase(self):
        if self.lfs_stored:
            self.restore_lfs('snapshot_util', timeout=1200)
        self.setup_testcase_without_start_gid()

        activation_timeout = 60
        if self.get_context_variable('sbt_model_enabled'):
            activation_timeout = 200
        self.start_grid(timeout=activation_timeout,
                        activate_on_particular_node=1)

        if not self.lfs_stored:
            if self.get_context_variable('dynamic_cache_enabled'):
                self.start_caches_dynamic(
                    caches_file_name=self.get_context_variable('caches'),
                    batch_size=10000)

            if self.get_context_variable('sbt_model_enabled'):
                PiClientIgniteUtils.load_data_with_txput_sbt_model(
                    self.config,
                    self.ignite,
                    self.get_client_config(),
                    only_caches_batch=None,
                    end_key=int(self.max_key * self.load_multiplier))

            else:
                PiClientIgniteUtils.load_data_with_streamer(
                    self.ignite,
                    self.get_client_config(),
                    end_key=int(self.max_key * self.load_multiplier),
                    allow_overwrite=True)
        log_print(repr(self.ignite), color='debug')
Example #2
    def test_sim(self):
        version, ignite = self.start_ignite_grid(True)

        ignite.jmx.start_utility()

        client_config = Ignite.config_builder.get_config(
            'client', config_set_name='base')
        group_names = PiClientIgniteUtils.collect_cache_group_names(
            ignite, client_config)

        PiClientIgniteUtils.load_data_with_streamer(ignite,
                                                    client_config,
                                                    end_key=50)

        server_nodes_num = ignite.get_nodes_num('server')
        sim_engine = PigeonSimulation(server_nodes_num)

        for running_iteration in range(1, DEFAULT_ITERATIONS + 1):
            log_print("Running iteration %s" % running_iteration)

            ev, node = sim_engine.next_event()
            log_print("Evaluating event %s on node %s" % (ev, node))

            pigeon = self.get_pigeon(ignite, node)

            pigeon[ev]()

            ignite.jmx.wait_for_finish_rebalance(120, group_names)

            self.verify_cluster(ignite)

        ignite.jmx.kill_utility()
    def setup_testcase(self):
        self.logger.info('TestSetup is called')

        if self.get_context_variable('zookeeper_enabled'):
            self.start_zookeeper()

        self.ignite.set_activation_timeout(240)
        self.ignite.set_snapshot_timeout(240)
        self.su.clear_snapshots_list()
        self.start_grid(activate_on_particular_node=1)

        PiClientIgniteUtils.load_data_with_streamer(self.ignite,
                                                    self.get_client_config(),
                                                    end_key=1000)

        if self.get_context_variable('pitr_enabled') and \
                self.get_context_variable('snapshots_enabled'):
            self.su.wait_no_snapshots_activity_in_cluster()

        self.group_names = PiClientIgniteUtils.collect_cache_group_names(
            self.ignite, self.get_client_config())

        # if not self.ignite.jmx.is_started():
        #     self.ignite.jmx.start_utility()

        # wait for no client on cluster
        self.ignite.wait_for_topology_snapshot(client_num=0)

        print_debug(repr(self.ignite))
Example #4
    def setup_shared_storage_test(self):
        self.success_run = False
        log_print('create transfer_folder')
        if self.use_local_shared_directory:
            self.dr_storage = self.local_shared_dir(
                'transfer_folder_{}'.format(self.config['rt']['test_method']),
                create=True)
        log_print('set transfer_folder path - {}'.format(self.dr_storage))
        log_print('start clusters in parallel')
        futures = []
        with ThreadPoolExecutor(max_workers=2) as executor:
            futures.append(executor.submit(self.start_ignite_grid, 'master'))
            futures.append(executor.submit(self.start_ignite_grid, 'replica'))
        self.ignite_master_app = futures[0].result()
        self.ignite_replica_app = futures[1].result()
        additional_node_id = self.ignite_master_app.add_additional_nodes(
            config=Ignite.config_builder.get_config('server',
                                                    config_set_name='master'),
            name='base-ignite-2.8.1.b8')
        self.ignite_master_app.set_node_option(
            additional_node_id, 'jvm_options',
            self.get_dr_jvm_options(role='master'))
        self.ignite_master_app.start_additional_nodes(additional_node_id)
        PiClientIgniteUtils.load_data_with_txput_sbt_model(
            only_caches_batch=None,
            start_key=0,
            end_key=100,
            jvm_options=self.get_dr_jvm_options('master'),
            config=self.get_client_config('master'),
            tiden_config=self.ignite_master_app.config,
            ignite=self.ignite_master_app)
        print_green('Preload done')
    def base_test(self, **kwargs):
        cache_to_test = 'cache_group_1_001'
        check_commands = [
            '!tables', '!index',
            '\'select count(*) from \"%s\".ALLTYPESINDEXED;\'' % cache_to_test
        ]

        expected = [r'COUNT\(\*\)', '1000']

        if 'ssl_connection' in kwargs:
            self.set_current_context('ssl_enabled')

        self.start_grid(activate_on_particular_node=1)

        PiClientIgniteUtils.load_data_with_streamer(
            self.ignite,
            self.get_client_config('ssl_enabled'),
            value_type=ModelTypes.VALUE_ALL_TYPES_INDEXED.value,
            end_key=1000,
            allow_overwrite=True)

        sql_tool = Sqlline(self.ignite, **kwargs)
        output = sql_tool.run_sqlline(check_commands)
        self.su.check_content_all_required(output, expected)

        # based on GG-17465 (validate index with secure cluster)
        self.cu.control_utility('--cache', 'validate_indexes')
    def start_piclients():
        for _ in range(0, 3):
            try:
                PiClientIgniteUtils.load_data_with_putall(
                    self.ignite,
                    self.get_client_config(),
                    value_type=ModelTypes.VALUE_ACCOUNT.value,
                    nodes_num=24,
                    end_key=1000)
            except Exception as err:
                print(err)
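
This fragment only defines the load loop; in the surrounding examples the same pattern is driven from a background executor while server nodes are killed and restarted. A minimal sketch of that call pattern (assumed context, mirroring the later examples in this section):

    from concurrent.futures import ThreadPoolExecutor

    executor = ThreadPoolExecutor()
    executor.submit(start_piclients)   # run the put-all load in the background

    # ... kill/restart nodes while the load loop keeps retrying ...

    executor.shutdown(wait=True)       # wait for the three load attempts to finish
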
Example #7
    def calculate_checksums(self, idle_verify=True):
        if idle_verify:
            self.diff_idle_verify_dump()

        checksum_for_master = PiClientIgniteUtils.calc_checksums_distributed(self.ignite_master_app,
                                                                             config=self.get_client_config('master'),
                                                                             jvm_options=self.get_dr_jvm_options(
                                                                                 'master'))
        checksum_for_replica = PiClientIgniteUtils.calc_checksums_distributed(self.ignite_master_app,
                                                                              config=self.get_client_config('replica'),
                                                                              jvm_options=self.get_dr_jvm_options(
                                                                                  'replica'))
        if idle_verify:
            self.diff_idle_verify_dump(raise_flag=True)
        return checksum_for_master, checksum_for_replica
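
A possible way to consume the returned pair, sketched with the tiden_assert helper that appears in the later examples; the method name assert_checksums_match is hypothetical and not part of the snippet above:

    def assert_checksums_match(self):
        # Hedged sketch: compare master and replica checksums and fail the test
        # if the two clusters diverge.
        checksum_for_master, checksum_for_replica = self.calculate_checksums(idle_verify=True)
        tiden_assert(checksum_for_master == checksum_for_replica,
                     'Hash sums for master and replica should be equal')
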
Example #8
    def test_master_master_master_blinking_blt(self):
        self.prepare_clusters()

        client_config = self.preconfigure_cluster_0()

        iterations = 10
        last_loaded_key = START_DATA_SIZE
        nodes_before = 6

        with PiClient(self.clusters[0].grid,
                      client_config,
                      jvm_options=['-ea']) as piclient:
            PiClientIgniteUtils.load_data_with_streamer(
                self.clusters[0].grid,
                client_config,
                end_key=last_loaded_key,
                jvm_options=['-ea'],
                check_clients=False)

            sleep(60)

            with TransactionalLoading(self,
                                      ignite=self.clusters[0].grid,
                                      config_file=client_config,
                                      skip_consistency_check=True):
                for i in range(0, iterations):
                    log_print(f'Current iteration {i + 1} from {iterations}',
                              color='debug')

                    self.clusters[0].grid.kill_node(2)

                    utility_baseline_log = 'control-utility-baseline.log'

                    self.clusters[0].grid.cu.set_current_topology_as_baseline(
                        background=True, log=utility_baseline_log)

                    self.clusters[0].grid.start_node(2,
                                                     skip_topology_check=True)

                    self.clusters[0].grid.wait_for_topology_snapshot(
                        server_num=6)

                    self.clusters[0].grid.update_started_node_status(2)

                    self.clusters[0].grid.cu.set_current_topology_as_baseline(
                        background=True, log=utility_baseline_log)

                    self.verify_cluster(0, nodes_before, last_loaded_key)
    def _setup_with_context(self, context_name, **kwargs):
        self._start_grid_no_preload(context_name, **kwargs)

        if self.preloading_size > 0:
            if self.preloading_with_streamer:
                PiClientIgniteUtils.load_data_with_streamer(
                    self.ignite,
                    self.get_client_config(),
                    end_key=self.preloading_size)
            else:
                PiClientIgniteUtils.load_data_with_putall(
                    self.ignite,
                    self.get_client_config(),
                    end_key=self.preloading_size)

        self._wait_cluster_ready()
Example #10
    def test_rebalancing_with_ttl_caches(self):
        """
        IGN-13549 (IGNITE-11400)

        Rebalancing caches with TTL enabled can cause data corruption.
        :return:
        """

        with PiClient(self.ignite, self.get_client_config()):
            checksums = create_distributed_checksum_operation().evaluate()

        self.wait_for_running_clients_num(client_num=0, timeout=120)
        self.ignite.cu.control_utility(
            '--cache idle_verify --dump --skip-zeros')
        log_print('Calculating checksums done: %s' % checksums)

        self.assert_nodes_alive()

        self.ignite.kill_node(2)

        PiClientIgniteUtils.load_data_with_putall(
            self.ignite,
            self.get_client_config(),
            start_key=self.preloading_size,
            end_key=self.preloading_size + 100000)

        util_sleep(5 * 60)

        self.ignite.start_node(2)

        self.ignite.jmx.wait_for_finish_rebalance(self.rebalance_timeout * 2,
                                                  self.group_names)

        with PiClient(self.ignite,
                      self.get_client_config()):  # , jvm_options=jvm_options):
            checksums = create_distributed_checksum_operation().evaluate()

        self.wait_for_running_clients_num(client_num=0, timeout=120)
        log_print('Calculating checksums done: %s' % checksums)
        tiden_assert_equal(
            0, self.ignite.find_exception_in_logs('java.lang.AssertionError'),
            "# of AssertionError")
Example #11
    def common_load(self, start_key, end_key):
        if self.get_context_variable('sbt_model_enabled'):
            # add transactional put into 1 cache 2k times total
            PiClientIgniteUtils.load_data_with_txput_sbt_model(
                self.config,
                self.ignite,
                self.get_client_config(),
                only_caches_batch=1,
                start_key=start_key,
                end_key=start_key + 1)

            # load normal data into 200 caches 2k times total
            PiClientIgniteUtils.load_data_with_txput_sbt_model(
                self.config,
                self.ignite,
                self.get_client_config(),
                only_caches_batch=200,
                start_key=start_key + 2,
                end_key=int(end_key * self.load_multiplier))
        else:
            PiClientIgniteUtils.load_data_with_putall(
                self.ignite,
                self.get_client_config(),
                start_key=start_key,
                end_key=int(end_key * self.load_multiplier))
Example #12
    def _wait_cluster_ready(self):
        if self.get_context_variable('pitr_enabled') and \
                self.get_context_variable('snapshots_enabled'):
            self.su.wait_no_snapshots_activity_in_cluster()

        self.group_names = PiClientIgniteUtils.collect_cache_group_names(
            self.ignite, self.get_client_config())

        if not self.ignite.jmx.is_started():
            self.ignite.jmx.start_utility()

        # wait for no client on cluster
        self.ignite.wait_for_topology_snapshot(client_num=0)

        if is_enabled(self.config.get('disable_baseline_autoadjustment')):
            log_print("Going to disable baseline autoadjustment",
                      color='green')
            if self.cu.is_baseline_autoajustment_supported():
                self.cu.disable_baseline_autoajustment()
                log_print("Baseline autoadjustment disabled", color='green')

        log_print(repr(self.ignite), color='debug')
Example #13
    def load_data_with_streamer(self, *args, **kwargs):
        PiClientIgniteUtils.load_data_with_streamer(*args, **kwargs)
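
The wrapper above simply forwards to PiClientIgniteUtils.load_data_with_streamer; a typical call, using the argument shape seen throughout these examples (the concrete values are illustrative only):

    # Illustrative call only; end_key and jvm_options mirror the other snippets.
    self.load_data_with_streamer(self.ignite,
                                 self.get_client_config(),
                                 end_key=1000,
                                 jvm_options=['-ea'])
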
    def test_cycling_restart_grid_dynamic_caches_no_client(self):
        """
        Scenario The Glue
        (Assertions should be enabled)

        1. Start grid, load some data
        2. In the loop:
            2.1 Define a node restart timeout (0.5 - 2.0 seconds)
            2.2 Load more data
            2.3 Restart each node with the defined timeout (does NOT wait for the topology snapshot)
            2.4 Try to activate, check for AssertionErrors
            2.5 Try to set the baseline (if both operations failed -> PME, kill all nodes, start a new test iteration)
            2.6 Try to load data
            2.7 Try to calculate checksum

        :return:
        """
        import random

        PiClient.read_timeout = 240

        # sleep_for_time = float(random.randrange(1, 15, 1)) / 5

        self.set_current_context('in_memory')

        self.util_copy_piclient_model_to_libs()
        self.ignite.set_activation_timeout(240)
        self.ignite.set_snapshot_timeout(240)
        self.ignite.set_node_option('*', 'jvm_options', ['-ea'])
        self.su.clear_snapshots_list()
        self.start_grid(skip_activation=True)

        self.start_dynamic_caches_with_node_filter()

        last_loaded_key = 1000
        PiClientIgniteUtils.load_data_with_streamer(self.ignite,
                                                    self.get_client_config(),
                                                    end_key=last_loaded_key,
                                                    jvm_options=['-ea'])

        nodes_before = self.ignite.get_alive_default_nodes()

        iterations = 50
        last_loaded_key += 1
        for i in range(0, iterations):
            with ExitStack() as stack:
                # load data before start zk restart thread
                self.start_dynamic_caches_with_node_filter()
                # PiClientIgniteUtils.wait_for_running_clients_num(self.ignite, 0, 120)
                PiClientIgniteUtils.load_data_with_streamer(
                    self.ignite,
                    self.get_client_config(),
                    start_key=last_loaded_key,
                    end_key=last_loaded_key + 500,
                    jvm_options=['-ea'],
                    check_clients=True)
                last_loaded_key += 500

                if self.get_context_variable('zookeeper_enabled') and \
                        is_enabled(self.config.get('zookeeper_nodes_restart')):
                    stack.enter_context(ZkNodesRestart(self.zoo, 3))

                log_print('Current iteration %s from %s' % (i, iterations),
                          color='debug')

                sleep_for_time = float(
                    self.the_glue_timeout) if self.the_glue_timeout else round(
                        random.uniform(0.5, 2.5), 1)
                log_print(
                    "In this run we are going to sleep for {} seconds after each node restart"
                    .format(sleep_for_time),
                    color='green')

                log_print('Trying to load data into created/existing caches',
                          color='yellow')

                log_print("Round restart")
                for node_id in self.ignite.get_alive_default_nodes():
                    self.ignite.kill_node(node_id)
                    self.ignite.start_node(node_id, skip_topology_check=True)
                    sleep(sleep_for_time)

                log_print("Wait for topology messages")
                for node_id in self.ignite.get_all_default_nodes():
                    self.ignite.update_started_node_status(node_id)

                sleep(15)

            last_loaded_key = self.verify_cluster(nodes_before,
                                                  last_loaded_key)
    def test_nodes_connecting_to_dead_cluster(self):
        """
        https://ggsystems.atlassian.net/browse/IGN-13800

        Two nodes try to connect to the cluster while the cluster is being killed.
        They should send a join request but never receive their own NodeAdded message.

        """
        PiClient.read_timeout = 240

        # sleep_for_time = float(random.randrange(1, 15, 1)) / 5

        self.set_current_context('in_memory')

        self.util_copy_piclient_model_to_libs()
        self.ignite.set_activation_timeout(240)
        self.ignite.set_snapshot_timeout(240)
        self.ignite.set_node_option('*', 'jvm_options', ['-ea'])
        self.su.clear_snapshots_list()
        self.start_grid(skip_activation=True)

        last_loaded_key = 1000
        PiClientIgniteUtils.load_data_with_streamer(self.ignite,
                                                    self.get_client_config(),
                                                    end_key=last_loaded_key)

        nodes_before = self.ignite.get_alive_default_nodes()

        additional_nodes = self.ignite.add_additional_nodes(
            config=self.get_client_config(), num_nodes=2)

        def start_piclients():
            try:
                self.ignite.start_additional_nodes(additional_nodes,
                                                   client_nodes=True,
                                                   skip_topology_check=True)
            except Exception as err:
                print(err)
            finally:
                sleep(10)

                self.ignite.update_started_node_status(additional_nodes)

                for add_node in additional_nodes:
                    self.ignite.kill_node(add_node)

        log_print("Starting clients under load", color="green")

        executor = ThreadPoolExecutor()

        executor.submit(start_piclients)

        try:
            time_to_sleep = round(random.uniform(3.5, 4.9), 1)
            sleep(time_to_sleep)

            log_print("Time to sleep: %s" % time_to_sleep, color='green')

            self.ignite.kill_nodes()

            sleep(30)

            self.verify_cluster(nodes_before, 0)
        except Exception as e:
            raise e
        finally:
            executor.shutdown(wait=True)

        self.ssh.killall('java')
    def test_clients_killed_few_coordinators(self):
        """
        1. Start grid, load some data
        2. Repeat:
            2.1. Start a client thread with loading (putAll operation)
            2.2. Kill the first node (coordinator) + the second node (possible coordinator)
            2.3. Kill the next 4 coordinators with some small timeout
            2.4. Sleep for 2 minutes to let the cluster process the failures
            2.5. Launch the verify procedure
            2.6. Stop the clients' put thread

        :return:
        """

        self.set_current_context('default')

        self.util_copy_piclient_model_to_libs()
        self.util_deploy_sbt_model()
        self.ignite.set_activation_timeout(240)
        self.ignite.set_snapshot_timeout(240)
        self.ignite.set_node_option('*', 'jvm_options', ['-ea'])
        self.su.clear_snapshots_list()
        self.start_grid()

        sleep_for_time = float(
            self.the_glue_timeout) if self.the_glue_timeout else round(
                random.uniform(0.1, 2.9), 1)

        PiClientIgniteUtils.load_data_with_streamer(
            self.ignite,
            self.get_client_config(),
            value_type=ModelTypes.VALUE_ACCOUNT.value,
            end_key=1000)

        nodes_before = self.ignite.get_alive_default_nodes()

        def start_piclients():
            for _ in range(0, 3):
                try:
                    PiClientIgniteUtils.load_data_with_putall(
                        self.ignite,
                        self.get_client_config(),
                        value_type=ModelTypes.VALUE_ACCOUNT.value,
                        nodes_num=24,
                        end_key=1000)
                except Exception as err:
                    print(err)

        with PiClient(self.ignite, self.get_client_config()) as load:
            log_print("Starting clients under load", color="green")

            executor = ThreadPoolExecutor()

            executor.submit(start_piclients)

            sleep(5)

            try:
                self.ignite.kill_node(1)

                for i in range(0, 4):
                    sleep(sleep_for_time)

                    i = int(load.get_ignite().cluster().forOldest().node().
                            consistentId().replace('node_1_', ''))

                    if self.ignite.nodes[i]['status'] in [
                            NodeStatus.KILLED, NodeStatus.KILLING
                    ]:
                        sleep(sleep_for_time)

                        i = int(load.get_ignite().cluster().forOldest().node().
                                consistentId().replace('node_1_', ''))

                    self.ignite.kill_node(i)
                    log_print("Killing node %s" % i)

            except Exception as e:
                print(e)

            sleep(120)

            self.verify_cluster(nodes_before, 0)

            executor.shutdown(wait=True)
    def test_massive_index_rebuild(self):
        """
        1) 2 nodes, backupCnt = 1, persistenceEnabled
        2) Load (A, B) types into a cache that has (A, B) types defined in its index config
        3) Load a new type of data (C, D) into the cache
        4) Kill one node
        5) Create a new index on the alive cluster
        6) Start the node again

        :return:
        """

        PiClient.read_timeout = 1200

        self.set_current_context('indexed_types')

        self.util_copy_piclient_model_to_libs()
        self.ignite.set_activation_timeout(240)
        self.ignite.set_snapshot_timeout(240)
        self.ignite.set_node_option('*', 'jvm_options', ['-ea'])
        self.su.clear_snapshots_list()
        self.start_grid(skip_activation=True)

        self.ignite.cu.activate(activate_on_particular_node=1)

        PiClientIgniteUtils.load_data_with_streamer(
            self.ignite,
            self.get_client_config(),
            value_type=ModelTypes.VALUE_ALL_TYPES_30_INDEX.value,
            end_key=5000)

        PiClientIgniteUtils.load_data_with_streamer(
            self.ignite,
            self.get_client_config(),
            value_type=ModelTypes.VALUE_ACCOUNT.value,
            start_key=5000,
            end_key=10000)

        PiClientIgniteUtils.load_data_with_streamer(
            self.ignite,
            self.get_client_config(),
            value_type=ModelTypes.VALUE_EXT_ALL_TYPES_30_INDEX.value,
            start_key=10000,
            end_key=15000)

        # PiClientIgniteUtils.load_data_with_streamer(self.ignite,
        #                                             self.get_client_config(),
        #                                             cache_names_patterns=
        #                                             ['cache_group_3'],
        #                                             value_type=ModelTypes.VALUE_EXT_ALL_TYPES_30_INDEX.value,
        #                                             end_key=10000)

        iterations = 50

        sqlline = Sqlline(self.ignite)

        columns = [
            'longCol',
            'doubleCol',
            'stringCol',
            'booleanCol',
            'longCol1',
            # 'doubleCol1', 'stringCol1', 'intCol', 'intCol1',  # 'booleanCol1',
            # 'index', 'longCol2', 'doubleCol2', 'stringCol2', 'booleanCol2',
            # 'longCol12', 'doubleCol12', 'stringCol12', 'intCol12', 'intCol2',
            # 'shortCol2', 'longCol3', 'doubleCol3', 'stringCol3', 'booleanCol3',
            # 'longCol13', 'doubleCol13', 'stringCol13', 'intCol13', 'intCol3', 'shortCol3'
        ]

        with PiClient(self.ignite, self.get_client_config()) as piclient:
            cache_names = piclient.get_ignite().cacheNames().toArray()

            for i in range(0, iterations):
                log_print('Current iteration %s from %s' % (i, iterations),
                          color='debug')

                update_table = []

                self.ignite.kill_node(2)

                indexed_columns = ','.join(columns)

                for cache_name in cache_names:
                    # self.ssh.exec_on_host('REMOVE')
                    vtype = 'ALLTYPES30INDEX'  # if 'cache_group_3' not in cache_name else 'EXTALLTYPES30INDEX'

                    update_table.append(
                        f'\'CREATE INDEX IF NOT EXISTS {cache_name}_{vtype} on '
                        f'\"{cache_name}\".{vtype}({indexed_columns}) INLINE_SIZE 32 PARALLEL 28;\''
                    )

                update_table.append('!index')

                sqlline.run_sqlline(update_table)

                self.ignite.start_node(2)

                util_sleep_for_a_while(30)

                self.verify_no_assertion_errors()

                self.cu.control_utility('--cache validate_indexes',
                                        all_required='no issues found.')
    def test_full_cluster_blinking(self):
        """

        Enable indexes.

        Start servers with PDS, start clients, start some light tx loading.
        In a loop, "blink" (restart) the whole cluster at the same time. Since the full
        cluster is restarted together, logically there should be no data loss.

        :return:
        """

        PiClient.read_timeout = 240

        self.set_current_context('indexed_types')

        self.util_copy_piclient_model_to_libs()
        self.ignite.set_activation_timeout(240)
        self.ignite.set_snapshot_timeout(240)
        self.ignite.set_node_option('*', 'jvm_options', ['-ea'])
        self.su.clear_snapshots_list()
        self.start_grid(skip_activation=True)

        self.ignite.cu.activate(activate_on_particular_node=1)

        PiClientIgniteUtils.load_data_with_streamer(self.ignite,
                                                    self.get_client_config(),
                                                    end_key=100000)

        nodes_before = self.ignite.get_alive_default_nodes()
        iterations = 50

        with PiClient(self.ignite, self.get_client_config()):
            with TransactionalLoading(
                    self, loading_profile=LoadingProfile(delay=1000)):
                for i in range(0, iterations):
                    log_print('Current iteration %s from %s' % (i, iterations),
                              color='debug')

                    for node_id in nodes_before:
                        self.ignite.kill_node(node_id)
                        sleep(float(self.the_glue_timeout) if self.the_glue_timeout
                              else round(random.uniform(0.1, 0.5), 1))

                    for node_id in nodes_before:
                        self.ignite.start_node(node_id,
                                               skip_topology_check=True)
                        sleep(float(self.the_glue_timeout) if self.the_glue_timeout
                              else round(random.uniform(0.1, 0.5), 1))

                    self.ignite.wait_for_topology_snapshot(
                        server_num=len(nodes_before))

                    for node_id in self.ignite.get_all_default_nodes():
                        self.ignite.update_started_node_status(node_id)

                    sleep(10)

                    self.cu.control_utility('--cache validate_indexes',
                                            all_required='no issues found.')

                    self.verify_cluster(nodes_before, 0)
Example #19
    def do_blink_backups_under_load(self, initial_remove_probability):

        iteration_size = self.config.get('iteration_size', 80000)
        start = 0
        keep_coordinator_busy = True
        start_value = 0

        # temporarily save the LFS even when the test passes
        self.need_delete_lfs_on_teardown = False

        first_node = self.ignite.get_node_consistent_id(1)
        second_node = self.ignite.get_node_consistent_id(2)

        if keep_coordinator_busy:
            other_nodes = list(
                set(self.ignite.get_all_default_nodes()) - set([1]))
        else:
            other_nodes = list(
                set(self.ignite.get_all_default_nodes()) - set([1, 2]))

        current_server_num = self.ignite.get_nodes_num('server')

        tx_caches = []
        atomic_caches = []

        self.ignite.set_snapshot_timeout(600)

        with PiClient(self.ignite, self.get_client_config(),
                      nodes_num=1) as piclient:
            gateway = piclient.get_gateway()
            ignite = piclient.get_ignite()

            cache_names = ignite.cacheNames().toArray()
            for cache_name in cache_names:
                # run the cross-cache transfer task only for transactional caches
                cache_cfg_class = gateway.jvm.org.apache.ignite.configuration.CacheConfiguration().getClass()
                atomicity_mode = ignite.getOrCreateCache(cache_name).getConfiguration(
                    cache_cfg_class).getAtomicityMode().toString()
                if atomicity_mode == 'TRANSACTIONAL':
                    tx_caches.append(cache_name)
                else:
                    atomic_caches.append(cache_name)

        PiClientIgniteUtils.wait_for_running_clients_num(self.ignite, 0, 120)

        for iteration in range(0, self.iterations):
            log_print("Iteration {}/{}".format(str(iteration + 1),
                                               str(self.iterations)),
                      color='blue')

            start_key = start + iteration * iteration_size
            end_key = start_key + iteration_size
            if initial_remove_probability > 0.0:
                remove_probability = initial_remove_probability + iteration / self.iterations / 2.0
            else:
                remove_probability = 0.0

            current_client_num = self.ignite.get_nodes_num('client')

            for i in range(0, 3):
                with PiClient(self.ignite,
                              self.get_client_config()) as piclient:
                    log_print(
                        "Loading (remove {probability}%) {load} values per cache into {n_caches} caches"
                        .format(
                            probability=remove_probability,
                            load=iteration_size,
                            n_caches=len(tx_caches),
                        ))

                    async_operations = []
                    for cache_name in tx_caches:
                        node_id = piclient.get_node_id()
                        gateway = piclient.get_gateway(node_id)
                        tx_size = randint(1, 10)
                        log_print(
                            "Client {node_id} -> {cache_name}, tx size {tx_size}"
                            .format(
                                node_id=node_id,
                                cache_name=cache_name,
                                tx_size=tx_size,
                                removeProbability=remove_probability,
                            ))
                        async_operation = create_async_operation(
                            create_put_with_optional_remove_operation,
                            cache_name,
                            start_key,
                            end_key,
                            remove_probability,
                            gateway=gateway,
                            node_consistent_id=first_node
                            if keep_coordinator_busy else second_node,
                            tx_description=TxDescriptor(
                                concurrency='PESSIMISTIC',
                                isolation='REPEATABLE_READ',
                                size=tx_size),
                            use_monotonic_value=True,
                            monotonic_value_seed=start_value,
                        )
                        start_value = start_value + iteration_size

                        async_operations.append(async_operation)
                        async_operation.evaluate()

                    # little warm up
                    util_sleep(5)

                    node_id = self.ignite.get_random_server_nodes(
                        1, node_ids=other_nodes)[0]
                    self.ignite.kill_node(node_id)
                    self.ignite.wait_for_topology_snapshot(
                        server_num=current_server_num - 1)

                    # continue load data during node offline
                    util_sleep(15)
                    self.ignite.start_node(node_id)
                    self.ignite.wait_for_topology_snapshot(
                        server_num=current_server_num)

                PiClientIgniteUtils.wait_for_running_clients_num(
                    self.ignite, current_client_num, 120)

                self.wait_transactions_finish()

                self.ignite.jmx.wait_for_finish_rebalance(
                    self.rebalance_timeout, self.group_names)

                self.idle_verify_check_conflicts_action()

            self.idle_verify_dump_action()
    def base_test_with_all_users(self, ssl_connection):
        cache_to_test = 'cache_group_1_001'
        check_commands_read = [
            '!tables', '!index',
            '\'select count(*) from \"%s\".ALLTYPESINDEXED;\'' % cache_to_test
        ]

        check_commands_update = [
            '!tables', '!index',
            '\'update \"%s\".ALLTYPESINDEXED set LONGCOL=1;\'' % cache_to_test
        ]

        expected_read = [r'COUNT\(\*\)', '1000']
        expected_update = ['1,000 rows affected']
        expected_for_no_access_user = ['Authorization failed']

        self.set_current_context('ssl_enabled')

        self.start_grid(activate_on_particular_node=1)

        PiClientIgniteUtils.load_data_with_streamer(
            self.ignite,
            self.get_client_config('ssl_enabled'),
            value_type=ModelTypes.VALUE_ALL_TYPES_INDEXED.value,
            end_key=1000,
            allow_overwrite=True)

        users = [
            {
                'login': '******',
                'password': '******',
                'read_check': {
                    'run': check_commands_read,
                    'expected': expected_read
                },
                'update_check': {
                    'run': check_commands_update,
                    'expected': expected_update
                }
            },
            {
                'login': '******',
                'password': '******',
                'read_check': {
                    'run': check_commands_read,
                    'expected': expected_read
                },
                'update_check': {
                    'run': check_commands_update,
                    'expected': expected_update
                }
            },
            {
                'login': '******',
                'password': '******',
                'read_check': {
                    'run': check_commands_read,
                    'expected': expected_read
                },
                'update_check': {
                    'run': check_commands_update,
                    'expected': expected_for_no_access_user
                }
            },
            {
                'login': '******',
                'password': '******',
                'read_check': {
                    'run': check_commands_read,
                    'expected': expected_for_no_access_user
                },
                'update_check': {
                    'run': check_commands_update,
                    'expected': expected_for_no_access_user
                }
            },
        ]

        def check_output(user_info):
            auth_info = namedtuple('auth_info', 'user password')
            auth = auth_info(user=user_info['login'],
                             password=user_info['password'])
            sql_tool = Sqlline(self.ignite,
                               auth=auth,
                               ssl_connection=ssl_connection)

            for operation in ['read_check', 'update_check']:
                output = sql_tool.run_sqlline(user_info[operation].get('run'))
                self.su.check_content_all_required(
                    output, user_info[operation].get('expected'))

        for user in reversed(users):
            check_output(user)

        for user in users:
            check_output(user)
    def test_during_rebalance(self):
        ignite = self.start_ignite_grid(self.ignite_name)

        ignite.cu.activate(activate_on_particular_node=1)

        PiClientIgniteUtils.load_data_with_putall(ignite, self.client_config)

        util_sleep_for_a_while(30)
        with PiClient(ignite, self.client_config) as piclient:
            cache_to_test = 'test_cache_with_index'
            # self.create_cache_with_indexed_data(cache_to_test)
            client_ignite = piclient.get_ignite()
            gateway = piclient.get_gateway()

            cache_config = IgniteCacheConfig(gateway)
            cache_config.set_name('cache_1')
            cache_config.set_cache_mode('replicated')
            cache_config.set_atomicity_mode('transactional')
            cache_config.set_write_synchronization_mode('full_sync')
            cache_config.set_affinity(False, 32)
            cache_config.set_group_name('some_new_group')

            cache_config1 = IgniteCacheConfig(gateway)
            cache_config1.set_name(cache_to_test)
            cache_config1.set_cache_mode('replicated')
            cache_config1.set_atomicity_mode('transactional')
            cache_config1.set_write_synchronization_mode('full_sync')
            cache_config1.set_affinity(False, 32)
            cache_config1.set_group_name('some_new_group')

            # set query entities

            caches = gateway.jvm.java.util.ArrayList()
            caches.add(cache_config.get_config_object())
            caches.add(cache_config1.get_config_object())

            log_print("Creating caches", color='green')
            client_ignite.getOrCreateCaches(caches)

            cache_names = piclient.get_ignite().cacheNames().toArray()
            if cache_to_test not in cache_names:
                log_print("Could not find cache in %s" % cache_names, color='red')

            util_sleep_for_a_while(10)

            ignite.kill_node(2)

            log_print("Overwrite values in cache %s" % cache_to_test, color='green')

            operation = create_put_all_operation(cache_to_test, 1, 1001, 100,
                                                 value_type=ModelTypes.VALUE_ALL_TYPES_INDEXED.value)
            operation.evaluate()

            util_sleep_for_a_while(15)
            ignite.start_node(2, skip_topology_check=True)

            util_sleep_for_a_while(5)
            client_ignite.cache(cache_to_test).destroy()

            ignite.update_starting_node_attrs()
            ignite.nodes[3]['status'] = NodeStatus.STARTED
            client_ignite.cache('cache_1').destroy()
    def test_during_loading(self):
        """
        Should be fully fixed in 8.5.8-p1

        Scenario:

            1. Start 3 server nodes
            2. Load 1000 keys into 120 TX caches
            3. Start 3 client nodes and start TX loading (PESSIMISTIC/REPEATABLE_READ, OPTIMISTIC/SERIALIZABLE)
                    (12 transfer operations, 10 caches in each operation,
                        1000 ms between transactions, i.e. ~4 tx per second from each client)
            4. From the clients, try to destroy caches
            5. Interesting things happen

        Fixed in 8.5.8-p1
        https://ggsystems.atlassian.net/browse/GG-19179

        Issues that were found during this test:
        https://ggsystems.atlassian.net/browse/GG-19411
        https://ggsystems.atlassian.net/browse/GG-19383

        :return:
        """
        PiClient.read_timeout = 600

        ignite = self.start_ignite_grid(self.ignite_name)

        ignite.cu.activate(activate_on_particular_node=1)

        PiClientIgniteUtils.load_data_with_putall(ignite, self.client_config)

        def get_dumps():
            for node_id in ignite.nodes.keys():
                self.util_get_threads_from_jstack(ignite, node_id, 'END')

        try:
            with PiClient(ignite, self.client_config) as piclient:
                with TransactionalLoading(self,
                                          ignite=ignite,
                                          config_file=self.client_config,
                                          on_exit_action=get_dumps,
                                          kill_transactions_on_exit=True,
                                          with_exception=False,  # should loading be interrupted if something goes wrong?
                                          skip_consistency_check=True,  # we are destroying caches here, so skip the check
                                          loading_profile=LoadingProfile(
                                              delay=1000,
                                              allowed_transactions=(
                                                      TxDescriptor(concurrency='OPTIMISTIC',
                                                                   isolation='SERIALIZABLE', ),)
                                          )):
                    # allowed_transactions=(TxDescriptor(), ))):
                    # )):
                    node_id = piclient.get_node_id()
                    client_ignite = piclient.get_ignite(node_id)

                    cache_names = client_ignite.cacheNames().toArray()

                    caches_to_kill_num = 50
                    frags = 0

                    for cache in cache_names:
                        node_id = piclient.get_node_id()

                        log_print('Destroying cache %s on node %s' % (cache, node_id), color='red')

                        piclient.get_ignite(node_id).cache(cache).destroy()

                        frags += 1

                        if frags >= caches_to_kill_num:
                            break
        finally:
            npe_errors = ignite.find_exception_in_logs(".*java.lang.NullPointerException.*")

            assertion_errors = ignite.find_exception_in_logs(".*java.lang.AssertionError.*")

            if npe_errors != 0 or assertion_errors != 0:
                assert False, "There are errors in logs: NPE - %s, AE - %s" % (npe_errors, assertion_errors)
Example #23
    def verify_cluster(self,
                       cluster_to_verify_id,
                       nodes_before,
                       last_loaded_key=None):
        client_config = Ignite.config_builder.get_config(
            'client', config_set_name='cluster_1_node_without_dr')

        servers = 0
        ignite = self.clusters[cluster_to_verify_id].grid

        for i in range(3):
            for res in ignite.last_topology_snapshot():
                if res['servers'] > servers:
                    servers = res['servers']
                else:
                    break
            util_sleep_for_a_while(5)

        if nodes_before != servers:
            log_print(
                f"There are missing nodes in the cluster: {servers} nodes in the topology, expecting {nodes_before}",
                color='yellow')

            self.verify_no_meaning_errors()

            log_print("Wait for topology messages again.", color='yellow')
            for node_id in ignite.get_all_default_nodes():
                ignite.update_started_node_status(node_id)

            log_print("Missing nodes case confirmed. Trying to restart node.",
                      color='red')
            current_cluster_nodes = ignite.get_nodes_num('server')
            if nodes_before != current_cluster_nodes:
                log_print(f"Current nodes in cluster {current_cluster_nodes}")
                nodes_to_start = []

                for node_id in ignite.get_alive_default_nodes():
                    # if the node is no longer alive, schedule it for restart
                    if not ignite.check_node_status(node_id):
                        log_print("Restarting node %s" % node_id,
                                  color='yellow')
                        nodes_to_start.append(node_id)

                log_print(f"Going to restart nodes: {nodes_to_start}",
                          color='debug')
                for node_id in nodes_to_start:
                    ignite.start_node(node_id,
                                      skip_nodes_check=True,
                                      check_only_servers=True)

                current_cluster_nodes = ignite.get_nodes_num('server')
                if nodes_before != current_cluster_nodes:
                    log_print(
                        f"Current amount of nodes in cluster: {current_cluster_nodes}, expecting {nodes_before}",
                        color='debug')

                    for node_id in ignite.get_alive_default_nodes():
                        self.util_get_threads_from_jstack(
                            ignite, node_id, "FAILED")

                    assert False, "Failed to restart node"

        ignite.cu.control_utility('--activate')

        activate_failed = False
        log_print('Check that there is no Error in activate logs',
                  color='yellow')
        if 'Error' in ignite.cu.latest_utility_output:
            activate_failed = True
            log_print('Failed!', color='red')
        sleep(5)

        ignite.cu.control_utility('--baseline')
        self.verify_no_meaning_errors()
        log_print('Check that there is no Error in control.sh --baseline logs',
                  color='yellow')

        if 'Error' in ignite.cu.latest_utility_output:
            log_print('Failed! Second try after sleep 60 seconds', color='red')
            sleep(60)

            ignite.cu.control_utility('--baseline')

            if 'Error' in ignite.cu.latest_utility_output or activate_failed:
                log_print('Cluster looks hung.')

        log_print('Check that there is no AssertionError in logs',
                  color='yellow')
        self.verify_no_meaning_errors()

        if last_loaded_key:
            try:
                new_last_key = last_loaded_key - int(
                    random.uniform(0, 1) * LOAD_DATA_SIZE)
                log_print(
                    f'Trying to remove data from survivor caches ({new_last_key}, {last_loaded_key})',
                    color='yellow')
                PiClientIgniteUtils.remove_data(
                    ignite,
                    client_config,
                    start_key=new_last_key,
                    end_key=last_loaded_key,
                    check_clients=False,
                )

                last_loaded_key = new_last_key
            except Exception:
                for node_id in ignite.get_alive_default_nodes():
                    self.util_get_threads_from_jstack(ignite, node_id,
                                                      "FAILED")

                assert False, "Unable to connect client"
            finally:
                self.verify_no_meaning_errors()

        util_sleep_for_a_while(MINI_DR_STABILIZATION_TIMEOUT)

        checksum_master, checksum_slave = self.calculate_checksum_and_validate(
            last_loaded_key)

        tiden_assert(checksum_master == checksum_slave,
                     'Hash sums for master and slave should be equal')

        return last_loaded_key
Example #24
    def run_stress_restarts(self, cluster_id_to_restart, iterations,
                            nodes_to_restart, time_to_sleep_range):
        client_config = self.preconfigure_cluster_0()

        with PiClient(self.clusters[0].grid,
                      client_config,
                      jvm_options=['-ea']) as piclient:
            ignite = piclient.get_ignite()

            self.start_dynamic_caches_with_node_filter(client_config)

            last_loaded_key = START_DATA_SIZE
            PiClientIgniteUtils.load_data_with_putall(self.clusters[0].grid,
                                                      client_config,
                                                      end_key=last_loaded_key,
                                                      jvm_options=['-ea'],
                                                      check_clients=False)

            util_sleep_for_a_while(MINI_DR_STABILIZATION_TIMEOUT)

            nodes_before = 6

            last_loaded_key += 1
            for i in range(0, iterations):
                log_print(f'Current iteration {i + 1} from {iterations}',
                          color='debug')

                sleep_for_time = random.uniform(time_to_sleep_range[0],
                                                time_to_sleep_range[1])
                log_print(
                    f'In this run we are going to sleep for {sleep_for_time} seconds after each node restart',
                    color='green')

                log_print('Trying to load data into created/existing caches',
                          color='yellow')

                self.start_dynamic_caches_with_node_filter(client_config)

                PiClientIgniteUtils.load_data_with_putall(
                    self.clusters[0].grid,
                    client_config,
                    start_key=last_loaded_key,
                    end_key=last_loaded_key + LOAD_DATA_SIZE,
                    jvm_options=['-ea'],
                    check_clients=False)
                last_loaded_key += LOAD_DATA_SIZE

                self.increment_atomic(ignite)

                log_print("Round restart")
                for node_id in nodes_to_restart:
                    self.clusters[cluster_id_to_restart].grid.kill_node(node_id)
                    self.clusters[cluster_id_to_restart].grid.start_node(
                        node_id, skip_topology_check=True)
                    sleep(sleep_for_time)

                log_print("Wait for topology messages")
                for node_id in nodes_to_restart:
                    self.clusters[cluster_id_to_restart].grid \
                        .update_started_node_status(node_id)

                util_sleep_for_a_while(MINI_DR_STABILIZATION_TIMEOUT)

                last_loaded_key = self.verify_cluster(0, nodes_before,
                                                      last_loaded_key)

        util_sleep_for_a_while(DR_STABILIZATION_TIMEOUT)

        checksum_master1, checksum_slave1 = self.calculate_checksum_and_validate(
            last_loaded_key)
        tiden_assert(checksum_master1 == checksum_slave1,
                     'Hash sums for master and slave do not match')

        self.put_data(self.clusters[1], 1, 'cluster_2_node_without_dr')

        util_sleep_for_a_while(MINI_DR_STABILIZATION_TIMEOUT)

        checksum_master2, checksum_slave2 = self.calculate_checksum_and_validate(
            last_loaded_key)
        tiden_assert(checksum_master2 == checksum_slave2,
                     'Hash sums for master and slave do not match')
    def run(self, artifact_name):
        """
        Run scenario for defined artifact

        :param artifact_name: name from artifact configuration file
        """
        super().run(artifact_name)

        log_print("Running snapshot benchmark with config: %s" % self.config,
                  color='green')

        version = self.test_class.tiden.config['artifacts'][artifact_name][
            'ignite_version']
        time_to_jfr = int(self.config.get('time_to_jfr', 0))
        now_plus_300 = datetime.datetime.now() + datetime.timedelta(seconds=1)
        try:
            self.test_class.create_app_config_set(
                Ignite,
                START_TIME_CONFIG_SET,
                caches_list_file='caches_%s.xml' % START_TIME_CONFIG_SET,
                deploy=True,
                snapshots_enabled=True,
                logger=False,
                wal_segment_size=self.test_class.consumption_config.get(
                    'wal_segment_size', 1024 * 1024 * 1024),
                logger_path='%s/ignite-log4j2.xml' %
                self.test_class.tiden.config['rt']['remote']
                ['test_module_dir'],
                disabled_cache_configs=False,
                zookeeper_enabled=False,
                checkpoint_read_lock_timeout=self.read_lock_property_value(
                    version),
                # caches related variables
                additional_configs=[
                    'caches.tmpl.xml',
                ],
                part_32=self.config.get('part_32', 10000),
                # 100),
                part_64=self.config.get('part_64', 10000),
                # 100),
                part_128=self.config.get('part_128', 10000),
                # 100),
                # artifact config variables
                **self.artifact_config_variables,
            )
            version, ignite = self.test_class.start_ignite_grid(
                artifact_name,
                activate=True,
                config_set=START_TIME_CONFIG_SET,
                jvm_options=self.artifact_jvm_properties)

            ignite.set_snapshot_timeout(1200)

            if time_to_jfr:
                now_plus_300 = datetime.datetime.now() + datetime.timedelta(
                    seconds=350)
                ignite.make_cluster_jfr(300)

            PiClientIgniteUtils.load_data_with_putall(
                ignite,
                Ignite.config_builder.get_config(
                    'client', config_set_name=START_TIME_CONFIG_SET),
                # end_key=int(self.config.get('data_size'))
                end_key=10000)

            # kill nodes
            ignite.kill_nodes()

            sleep(120)

            self.start_probes(artifact_name)

            ignite.start_nodes()

            self.stop_probes()

            ignite.kill_nodes()
            ignite.delete_lfs()

            # do some calculations
        finally:
            # if time_to_jfr:
            #     pause.until(now_plus_300)
            # remove config set
            self.test_class.remove_app_config_set(Ignite,
                                                  START_TIME_CONFIG_SET)
    def test_cycling_restart_grid_dynamic_caches_with_atomic_on_restart(self):
        """
        Scenario The Glue
        (Assertions should be enabled)

        1. Start grid, load some data
        2. In the loop:
            2.1 Define a node restart timeout (0.5 - 2.0 seconds)
            2.2 Load more data
            2.3 Restart each node with the defined timeout (does NOT wait for the topology snapshot)
            2.4 Try to activate, check for AssertionErrors
            2.5 Try to set the baseline (if both operations failed -> PME, kill all nodes, start a new test iteration)
            2.6 Try to load data
            2.7 Try to calculate checksum

        :return:
        """
        import random

        PiClient.read_timeout = 240

        # sleep_for_time = float(random.randrange(1, 15, 1)) / 5

        self.set_current_context('in_memory')

        self.util_copy_piclient_model_to_libs()
        self.ignite.set_activation_timeout(240)
        self.ignite.set_snapshot_timeout(240)
        self.ignite.set_node_option('*', 'jvm_options', ['-ea'])
        self.su.clear_snapshots_list()
        self.start_grid(skip_activation=True)

        with PiClient(self.ignite,
                      self.get_client_config(),
                      jvm_options=['-ea']) as piclient:
            # ignite = piclient.get_ignite()

            self.start_dynamic_caches_with_node_filter()

            last_loaded_key = 1000
            PiClientIgniteUtils.load_data_with_streamer(
                self.ignite,
                self.get_client_config(),
                end_key=last_loaded_key,
                jvm_options=['-ea'])

            nodes_before = self.ignite.get_alive_default_nodes()

            iterations = 50
            last_loaded_key += 1
            for i in range(0, iterations):
                log_print('Current iteration %s from %s' % (i, iterations),
                          color='debug')
                # sleep_for_time = float(self.the_glue_timeout) if self.the_glue_timeout else random.choice([0.7, 0.9, 2.0])
                sleep_for_time = float(
                    self.the_glue_timeout) if self.the_glue_timeout else round(
                        random.uniform(0.5, 2.5), 1)
                log_print(
                    "In this run we are going to sleep for {} seconds after each node restart"
                    .format(sleep_for_time),
                    color='green')

                log_print('Trying to load data into created/existing caches',
                          color='yellow')
                self.start_dynamic_caches_with_node_filter()
                PiClientIgniteUtils.load_data_with_streamer(
                    self.ignite,
                    self.get_client_config(),
                    start_key=last_loaded_key,
                    end_key=last_loaded_key + 500,
                    jvm_options=['-ea'])
                last_loaded_key += 500

                log_print("Round restart")
                for node_id in self.ignite.get_alive_default_nodes():
                    self.ignite.kill_node(node_id)
                    self.ignite.start_node(node_id, skip_topology_check=True)
                    sleep(sleep_for_time)

                    try:
                        log_print(
                            "Incrementing atomics using distributed compute")
                        create_async_operation(
                            create_distributed_atomic_long).evaluate()
                    except Exception as e:
                        log_print("Failed to increment atomics")

                        # just print exception (https://issues.apache.org/jira/browse/IGNITE-11535)
                        traceback.print_exc()

                log_print("Wait for topology messages")
                for node_id in self.ignite.get_all_default_nodes():
                    self.ignite.update_started_node_status(node_id)

                sleep(15)

                log_print("Validating cluster")
                last_loaded_key = self.verify_cluster(nodes_before,
                                                      last_loaded_key)
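The loop above implements the rolling "glue" restart: every server node is killed and started again with a short pause in between, either the fixed the_glue_timeout override or a random 0.5-2.5 s value. A minimal sketch of that pattern, with hypothetical kill_node/start_node callables standing in for the tiden Ignite API:

import random
import time


def pick_glue_timeout(override=None):
    """Use the configured timeout if given, otherwise a random 0.5-2.5 s value."""
    return float(override) if override else round(random.uniform(0.5, 2.5), 1)


def round_restart(node_ids, kill_node, start_node, glue_timeout):
    for node_id in node_ids:
        kill_node(node_id)
        start_node(node_id)        # started without a topology check, as above
        time.sleep(glue_timeout)   # let the cluster "glue" itself between kills


if __name__ == '__main__':
    timeout = pick_glue_timeout()
    print('would sleep %.1f s after each restart' % timeout)
    round_restart([1, 2, 3],
                  kill_node=lambda n: print('kill node %s' % n),
                  start_node=lambda n: print('start node %s' % n),
                  glue_timeout=0)    # 0 here just keeps the demo fast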
 Example #27
0
    def run_snapshot(self, artifact_name, snapshot_type):
        """
        Run scenario for defined artifact

        :param artifact_name: name from artifact configuration file
        :param snapshot_type: inc/full
        """
        super().run(artifact_name)

        log_print("Running snapshot benchmark with config: %s" % self.config,
                  color='green')

        version = self.test_class.tiden.config['artifacts'][artifact_name][
            'ignite_version']
        incremental_snapshot = True if snapshot_type == 'inc' else False
        try:
            self.test_class.create_app_config_set(
                Ignite,
                SNAPSHOT_CONFIG_SET,
                caches_list_file='caches_%s.xml' % SNAPSHOT_CONFIG_SET,
                deploy=True,
                snapshots_enabled=True,
                logger=False,
                wal_segment_size=self.test_class.consumption_config.get(
                    'wal_segment_size', 64 * 1024 * 1024),
                logger_path='%s/ignite-log4j2.xml' %
                self.test_class.tiden.config['rt']['remote']
                ['test_module_dir'],
                disabled_cache_configs=False,
                zookeeper_enabled=False,
                checkpoint_read_lock_timeout=self.read_lock_property_value(
                    version),
                # caches related variables
                additional_configs=[
                    'caches.tmpl.xml',
                ],
                part_32=self.config.get('part_32', 32),
                part_64=self.config.get('part_64', 64),
                part_128=self.config.get('part_128', 128),
                # artifact config variables
                **self.artifact_config_variables,
            )

            version, ignite = self.test_class.start_ignite_grid(
                artifact_name,
                activate=True,
                config_set=SNAPSHOT_CONFIG_SET,
                jvm_options=self.artifact_jvm_properties)

            time_results = list()
            directory_size = list()

            self.start_probes(artifact_name)

            client_config = Ignite.config_builder.get_config(
                'client', config_set_name=SNAPSHOT_CONFIG_SET)
            PiClientIgniteUtils.load_data_with_putall(
                ignite,
                client_config,
                end_key=int(self.config.get('data_size')))

            if incremental_snapshot:
                ignite.su.snapshot_utility('snapshot', '-type=full')

            # default times to run
            # plus warmup times
            # plus rerun times
            warmup_runs, prod_runs = self._get_number_of_runs()

            log_print("Running {} iterations".format(warmup_runs + prod_runs))
            for i in range(0, warmup_runs + prod_runs):
                self.write_time_event('iteration_%s start' % i)

                warmup_iteration = False if warmup_runs == 0 else i < warmup_runs

                log_print("Running iteration %s, (%s)" %
                          (i, 'warmup' if warmup_iteration else 'prod'))

                ignite.su.snapshot_utility('snapshot',
                                           f'-type={snapshot_type}')

                latest_snapshot_id = ignite.su.snapshots[-1]['id']
                dir_size = get_nodes_directory_size(
                    ignite, self.test_class.ssh, 'work/snapshot/%s' % list(
                        SnapshotScenario.util_find_snapshot_folders_on_fs(
                            ignite, latest_snapshot_id).values())[0])

                m = re.search(
                    r'Command \[SNAPSHOT\] successfully finished in (\d+) seconds',
                    ignite.su.latest_utility_output)

                if incremental_snapshot:
                    # TODO: use remove operation after dr-master merge
                    with PiClient(ignite, client_config) as piclient:
                        ignite_instance = piclient.get_ignite()
                        for cache_name in ignite_instance.cacheNames().toArray(
                        ):
                            ignite_instance.cache(cache_name).removeAll()

                    PiClientIgniteUtils.load_data_with_putall(
                        ignite,
                        client_config,
                        end_key=int(self.config.get('data_size')))

                # skip some operation as warmup
                if not warmup_iteration:
                    assert m, 'Unable to get snapshot time execution'

                    time_results.append(int(m.group(1)))
                    directory_size.append(int(dir_size))

                self.write_time_event('iteration_%s stop' % i)

            ignite.cu.deactivate()

            self.stop_probes(time_results=time_results,
                             avg_snapshot_dir_size=directory_size,
                             seconds=True)

            self.results['evaluated'] = True

            ignite.kill_nodes()
            ignite.delete_lfs()

            log_put("Cleanup Ignite LFS ... ")
            commands = {}
            for node_idx in ignite.nodes.keys():
                host = ignite.nodes[node_idx]['host']
                if commands.get(host) is None:
                    commands[host] = [
                        'rm -rf %s/work/*' %
                        ignite.nodes[node_idx]['ignite_home']
                    ]
                else:
                    commands[host].append(
                        'rm -rf %s/work/*' %
                        ignite.nodes[node_idx]['ignite_home'])
            results = self.test_class.tiden.ssh.exec(commands)
            print(results)
            log_put("Ignite LFS deleted.")
            log_print()
        finally:
            # remove config set
            self.test_class.remove_app_config_set(Ignite, SNAPSHOT_CONFIG_SET)
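run_snapshot() derives its measurements from the utility output: the snapshot duration is parsed with a regex and warmup iterations are discarded before the results go to stop_probes(). A small self-contained sketch of that bookkeeping (the sample output strings below are illustrative, not captured from a real run):

import re

SNAPSHOT_TIME_RE = re.compile(
    r'Command \[SNAPSHOT\] successfully finished in (\d+) seconds')


def extract_snapshot_seconds(utility_output):
    """Pull the snapshot duration, in seconds, out of the utility output."""
    m = SNAPSHOT_TIME_RE.search(utility_output)
    assert m, 'Unable to get snapshot time execution'
    return int(m.group(1))


def collect_results(outputs, warmup_runs):
    """Drop the first warmup_runs measurements, keep the production ones."""
    return [extract_snapshot_seconds(out)
            for i, out in enumerate(outputs) if i >= warmup_runs]


if __name__ == '__main__':
    fake = ['Command [SNAPSHOT] successfully finished in %d seconds' % s
            for s in (42, 15, 14, 16)]
    print(collect_results(fake, warmup_runs=1))   # -> [15, 14, 16]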
    def verify_cluster(self, nodes_before, last_loaded_key=None):
        if len(nodes_before) != self.ignite.get_nodes_num('server'):
            log_print("There are missing nodes on cluster.", color='yellow')

            self.verify_no_assertion_errors()

            log_print("Wait for topology messages again.", color='yellow')
            for node_id in self.ignite.get_all_default_nodes():
                self.ignite.update_started_node_status(node_id)

            log_print("Missing nodes case confirmed. Trying to restart node.",
                      color='red')
            if len(nodes_before) != self.ignite.get_nodes_num('server'):
                nodes_to_start = []

                for node_id in self.ignite.get_alive_default_nodes():
                    # if the node is not actually alive, schedule it for restart
                    if not self.ignite.check_node_status(node_id):
                        log_print("Restarting node %s" % node_id,
                                  color='yellow')
                        nodes_to_start.append(node_id)

                for node_id in nodes_to_start:
                    self.ignite.start_node(node_id,
                                           skip_nodes_check=True,
                                           check_only_servers=True)

                if len(nodes_before) != self.ignite.get_nodes_num('server'):
                    for node_id in self.ignite.get_alive_default_nodes():
                        self.util_get_threads_from_jstack(node_id, "FAILED")

                    assert False, "Failed to restart node"

        self.cu.control_utility('--activate')
        self.verify_no_assertion_errors()

        activate_failed = False
        log_print('Check that there is no Error in activate logs',
                  color='yellow')
        if 'Error' in self.cu.latest_utility_output:
            activate_failed = True
            log_print('Failed!', color='red')
        sleep(5)

        self.cu.control_utility('--baseline')
        self.verify_no_assertion_errors()
        log_print('Check that there is no Error in control.sh --baseline logs',
                  color='yellow')

        if 'Error' in self.cu.latest_utility_output:
            log_print('Failed! Second try after sleep 60 seconds', color='red')
            sleep(60)

            self.cu.control_utility('--baseline')

            if 'Error' in self.cu.latest_utility_output or activate_failed:
                log_print('Cluster looks hung.')

        log_print('Check that there is no AssertionError in logs',
                  color='yellow')
        self.verify_no_assertion_errors()

        if last_loaded_key:
            try:
                log_print('Trying to load data into survivor caches',
                          color='yellow')
                PiClientIgniteUtils.load_data_with_streamer(
                    self.ignite,
                    self.get_client_config(),
                    start_key=last_loaded_key,
                    end_key=last_loaded_key + 500,
                    allow_overwrite=True,
                    check_clients=False,
                )

                last_loaded_key += 500

                log_print('Printing checksums of existing caches',
                          color='yellow')

                print(
                    PiClientIgniteUtils.calc_checksums_distributed(
                        self.ignite,
                        self.get_client_config(),
                        check_clients=False))

                log_print('Check that there is no AssertionError in logs',
                          color='yellow')
            except Exception as e:
                for node_id in self.ignite.get_alive_default_nodes():
                    self.util_get_threads_from_jstack(node_id, "FAILED")

                assert False, "Unable to connect client"
            finally:
                self.verify_no_assertion_errors()

        return last_loaded_key
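verify_cluster() treats any 'Error' in the control utility output as a possibly transient failure: --baseline is retried once after a 60-second pause, and only if both --activate and the retried --baseline fail is the cluster reported as hung. A stripped-down sketch of that retry-once pattern, where run_command is a hypothetical callable returning the command output as text:

import time


def run_with_one_retry(run_command, command, retry_delay=60):
    """Run the command; on 'Error' in the output, wait and try exactly once more."""
    output = run_command(command)
    if 'Error' not in output:
        return output, True
    time.sleep(retry_delay)
    output = run_command(command)
    return output, 'Error' not in output


if __name__ == '__main__':
    answers = iter(['Error: baseline not changed', 'OK'])
    out, ok = run_with_one_retry(lambda cmd: next(answers),
                                 '--baseline', retry_delay=0)
    print(ok, out)   # -> True OK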
    def test_start_on_lfs(self):
        PiClient.read_timeout = 3600
        SshPool.default_timeout = 1200

        self.ignite_old_version = self.start_ignite_grid('old', activate=True)

        key_map = {
            'cache_group_1_001': ModelTypes.KEY_ALL_TYPES_MAPPED.value,
            'cache_group_1_002': 'java.lang.Long',
            'cache_group_1_003': ModelTypes.KEY_DEFAULT_TABLE.value,
            'cache_group_1_004': ModelTypes.KEY_ACCOUNT.value,
        }

        batch_size = 10000
        PiClientIgniteUtils.load_data_with_putall(
            self.ignite_old_version,
            Ignite.config_builder.get_config('client',
                                             config_set_name=self.config_name),
            start_key=0,
            end_key=250000,
            batch_size=batch_size,
            key_map=key_map,
            value_type=ModelTypes.VALUE_ALL_TYPES_4_INDEX.value)

        sleep(120)

        self.ignite_old_version.kill_nodes()

        cmd = dict()
        for node_id, dscr in self.ignite_old_version.nodes.items():
            ln_to_work = 'ln -s %s/%s %s/%s' % (
                dscr['ignite_home'],
                'work',
                self.get_app(self.ignite_app_names['new']).nodes[node_id]
                ['ignite_home'],
                'work',
            )
            if dscr['host'] in cmd:
                cmd[dscr['host']].append(ln_to_work)
            else:
                cmd[dscr['host']] = [
                    ln_to_work,
                ]

        self.tiden.ssh.exec(cmd)

        sleep(10)

        self.ignite_old_version = self.start_ignite_grid('new',
                                                         activate=True,
                                                         replaced_name='old')

        sleep(120)

        self.ignite_old_version.cu.control_utility(
            '--cache validate_indexes', all_required=['no issues found'])

        PiClientIgniteUtils.load_data_with_putall(
            self.ignite_old_version,
            Ignite.config_builder.get_config('client',
                                             config_set_name=self.config_name),
            start_key=250001,
            end_key=500001,
            batch_size=batch_size,
            key_map=key_map,
            value_type=ModelTypes.VALUE_ALL_TYPES_4_INDEX.value)

        sleep(120)

        self.ignite_old_version.cu.control_utility('--cache', 'idle_verify')
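test_start_on_lfs() reuses the old cluster's persistence by symlinking each old node's work directory into the corresponding new node's Ignite home, grouping the ln -s commands per host before running them over SSH. A minimal sketch of how such a command map can be assembled; the node descriptions in the usage example are made up, not real cluster data:

from collections import defaultdict


def build_symlink_commands(old_nodes, new_homes):
    """old_nodes: {node_id: {'host': ..., 'ignite_home': ...}},
       new_homes: {node_id: new_ignite_home} -> {host: [shell commands]}."""
    commands = defaultdict(list)
    for node_id, dscr in old_nodes.items():
        commands[dscr['host']].append(
            'ln -s %s/work %s/work' % (dscr['ignite_home'], new_homes[node_id]))
    return dict(commands)


if __name__ == '__main__':
    old = {1: {'host': 'host1', 'ignite_home': '/opt/ignite-old-1'},
           2: {'host': 'host2', 'ignite_home': '/opt/ignite-old-2'}}
    new = {1: '/opt/ignite-new-1', 2: '/opt/ignite-new-2'}
    print(build_symlink_commands(old, new))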