def util_verify(self, save_lfs_on_exception=False):
    util_sleep_for_a_while(10, 'Sleep before IDLE_VERIFY')
    from pt.util import version_num

    if version_num(self.cu.get_ignite_version()) < version_num('2.5.0'):
        idle_verify_pass = [
            'Command \[IDLE_VERIFY.*\] started',
            'Partition verification finished, no conflicts have been found.',
            'Command \[IDLE_VERIFY\] successfully finished in [0-9\.]+ seconds.'
        ]
        self.cu.control_utility('--cache', 'idle_verify', all_required=idle_verify_pass)
    else:
        idle_verify_pass = [
            'idle_verify check has finished, no conflicts have been found.'
        ]
        try:
            self.cu.control_utility('--cache', 'idle_verify', all_required=idle_verify_pass)
        except TidenException as e:
            self.cu.control_utility('idle_verify', '--analyse')
            self.stop_grid(fail=False)
            if save_lfs_on_exception:
                self.save_lfs('bug')
            raise e
def restart_empty_grid_with_nodes_count(self, nodes_count):
    self.cu.deactivate()
    util_sleep_for_a_while(5)
    current_nodes = self.ignite.get_alive_default_nodes()
    self.ignite.stop_nodes()
    util_sleep_for_a_while(5)
    self.delete_lfs()

    additional_nodes_count = nodes_count - len(current_nodes)

    if additional_nodes_count < 0:
        print_blue('Going to remove nodes %s' % current_nodes[additional_nodes_count:])
        for node_id in current_nodes[additional_nodes_count:]:
            current_nodes.remove(node_id)
            # if self.ignite.nodes.get(node_id):
            #     del self.ignite.nodes[node_id]

    log_print('Going to start nodes {}'.format(current_nodes))
    self.ignite.start_nodes(*current_nodes)

    if additional_nodes_count > 0:
        additional_nodes_count = nodes_count - len(current_nodes)
        print_blue('Starting %s additional nodes' % additional_nodes_count)
        node_id = list(
            self.ignite.add_additional_nodes(self.get_server_config(), additional_nodes_count))
        self.ignite.start_additional_nodes(node_id)

    self.cu.activate()
def util_start_additional_nodes(self, node_type='server', add_to_baseline=False, nodes_ct=None):
    nodes_count = nodes_ct if nodes_ct else randint(1, 3)
    if node_type == 'server':
        config = self.get_server_config()
    else:
        config = self.get_client_config()

    additional_nodes = self.ignite.add_additional_nodes(config, nodes_count)
    self.start_additional_nodes(additional_nodes)

    baseline_msg = ''
    if add_to_baseline:
        self.cu.set_current_topology_as_baseline()
        baseline_msg = 'Server nodes added to baseline.'
        util_sleep_for_a_while(10)

    util_sleep_for_a_while(5)
    self.cu.control_utility('--baseline')
    print_green('Started %s %s nodes. %s' % (nodes_count, node_type, baseline_msg))
    return additional_nodes
def restart_empty_inactive_grid(self):
    self.cu.deactivate()
    util_sleep_for_a_while(5)
    self.ignite.stop_nodes()
    util_sleep_for_a_while(5)
    self.cleanup_lfs()
    self.ignite.start_nodes()
def test_ignite_9398_deactivate(self):
    """
    https://ggsystems.atlassian.net/browse/IGN-11435
    https://issues.apache.org/jira/browse/IGNITE-9398

    Fixed in: 8.5.1-p14

    This ticket optimizes dispatching of custom discovery messages by offloading their processing
    to a separate thread. If the fix works, all nodes should process custom discovery messages faster.
    :return:
    """
    self.start_grid()
    util_sleep_for_a_while(3)

    jmx = JmxUtility(self.ignite)
    jmx.deactivate(1)

    max_time = self._get_last_exchange_time()
    print_blue("Max time diff between 'Started exchange init' and 'Finish exchange future' at all nodes: %s msec"
               % max_time)
    self._dump_exchange_time(max_time, "cluster deactivate")

    jmx.kill_utility()
def test_ignite_8897(self):
    """
    1. create LRT
    2. start PME
    3. try to add new node to baseline
    :return:
    """
    # start grid, remove one random node from baseline
    self.start_grid()
    self.ignite.cu.set_current_topology_as_baseline()

    grid_size = len(self.ignite.get_all_alive_nodes())
    stopping_node_id = self.ignite.get_random_server_nodes()[0]
    stopping_node_consistent_id = self.ignite.get_node_consistent_id(stopping_node_id)
    self.kill_node(stopping_node_id)
    self.ignite.wait_for_topology_snapshot(server_num=grid_size - 1, client_num=None)
    self.ignite.cu.remove_node_from_baseline(stopping_node_consistent_id)
    util_sleep_for_a_while(5)

    # create a long running transaction
    with PiClient(self.ignite, self.get_client_config(), nodes_num=1, jvm_options=self.jvm_options):
        self.ignite.wait_for_topology_snapshot(None, 1)
        TestLrt.create_transactional_cache()
        lrt_operations = TestLrt.launch_transaction_operations()
        self.release_transactions(lrt_operations)
def test_util_1_6_utility_control_cache_contention(self):
    """
    Test control.sh --cache contention shows correct information about keys with contention.
    :return:
    """
    try:
        LRT_TIMEOUT = 10
        with PiClient(self.ignite, self.get_server_config(), nodes_num=1) as piclient:
            cache_name = 'test_cache_001'
            self.cu.control_utility('--cache contention 1')

            lrt_operations = self.launch_transaction_operations(cache_name)

            self.cu.control_utility('--cache contention 1')
            self.cu.control_utility('--cache contention 11')
            self.cu.control_utility('--cache contention 10')

            for operation in lrt_operations:
                transaction_xid = operation.getTransaction().xid().toString()
                if transaction_xid not in self.cu.latest_utility_output:
                    log_print("Transaction with XID %s is not in list. Key=%s, Value=%s"
                              % (transaction_xid, operation.getKey(), operation.getValue()), color='debug')

            self.release_transactions(lrt_operations)

            log_print("Sleep for %s seconds" % LRT_TIMEOUT)
            util_sleep_for_a_while(LRT_TIMEOUT)

            self.cu.control_utility('--cache contention 1')
    except TidenException as e:
        log_print(e, color='red')
def test_host_ignite(self):
    self.start_grid('host')
    nodes_num = len(self.ignite.get_alive_default_nodes())
    self.ignite.cu.activate()
    self.ignite.cu.control_utility('--baseline')

    self.load_data_with_streamer(start_key=1, end_key=1000,
                                 value_type='org.apache.ignite.piclient.model.values.AllTypesIndexed',
                                 ignite=self.ignite)

    self.ignite.start_node_inside(1)
    self.ignite.wait_for_topology_snapshot(server_num=nodes_num - 1)

    util_sleep_for_a_while(10)

    self.ignite.start_node_inside(1)
    self.ignite.wait_for_topology_snapshot(server_num=nodes_num)

    self.load_data_with_streamer(start_key=1, end_key=1000,
                                 allow_overwrite=True,
                                 value_type='org.apache.ignite.piclient.model.values.AllTypesIndexed',
                                 ignite=self.ignite)

    log_print('Done')
def test_util_1_8_counters_detection_during_PME_node_from_baseline(self):
    """
    Tests that PME synchronises partition counters if conflicts are detected.
    :return:
    """
    self.load_data_with_streamer(end_key=1000, value_type=ModelTypes.VALUE_ALL_TYPES_INDEXED.value)

    with PiClient(self.ignite, self.get_client_config(), nodes_num=1) as piclient:
        caches_before_lrt = []
        for cache_name in piclient.get_ignite().cacheNames().toArray():
            caches_before_lrt.append(cache_name)

        cache_under_test = caches_before_lrt[0]
        log_print('Cache under test: %s' % cache_under_test, color='blue')

        operation = create_broke_data_entry_operation(cache_under_test, 1, True, 'counter')
        operation.evaluate()

        expected = ['Conflict partition']
        self.cu.control_utility('--cache idle_verify', all_required=expected)

        output = self.cu.latest_utility_output

        grp_id, part_id = None, None
        for line in output.split('\n'):
            m = search('Conflict partition: (PartitionKey|PartitionKeyV2) \[grpId=(\d+),.*partId=(\d+)\]', line)
            if m:
                grp_id = m.group(2)
                part_id = m.group(3)

        tiden_assert(grp_id and part_id,
                     'Expecting to find conflicts in output\n{}'.format(self.cu.latest_utility_output))

        # Start one more server node and change baseline to run PME
        log_print("Going to start additional node", color='green')
        self.ignite.add_additional_nodes(self.get_server_config(), 1)
        self.ignite.start_additional_nodes(self.ignite.get_all_additional_nodes())

        self.cu.control_utility('--baseline')
        self.cu.set_current_topology_as_baseline()
        self.cu.control_utility('--baseline')

        msg_in_log = self.find_in_node_log('Partition states validation has failed for group: %s' % cache_under_test,
                                           node_id=1)
        assert msg_in_log != []

        # Check there are no conflicts after PME
        util_sleep_for_a_while(30)
        self.cu.control_utility('--cache', 'idle_verify')

        # Stop one more server node and change baseline to run PME
        self.ignite.kill_node(self.ignite.get_alive_additional_nodes()[0])
        util_sleep_for_a_while(30)

        self.cu.control_utility('--baseline')
        self.cu.set_current_topology_as_baseline()
        self.cu.control_utility('--baseline')

        # Check there are no conflicts after PME
        self.cu.control_utility('--cache', 'idle_verify')
def restart_grid_with_deleted_wal(self):
    self.cu.deactivate()
    util_sleep_for_a_while(5)
    self.ignite.stop_nodes()
    util_sleep_for_a_while(5)
    self.delete_lfs(delete_db=False, delete_binary_meta=False, delete_marshaller=False, delete_snapshots=False)
    self.ignite.start_nodes()
    self.cu.activate(activate_on_particular_node=1)
@classmethod
def util_get_restore_point(cls, seconds_ago=1):
    from datetime import datetime, timedelta
    from time import timezone

    util_sleep_for_a_while(2)
    time_format = "%Y-%m-%d-%H:%M:%S.%f"
    restore_point = (datetime.now() - timedelta(seconds=seconds_ago))

    # Hack to handle UTC timezone: if the local timezone is UTC, shift the timestamp forward
    if int(timezone / -(60 * 60)) == 0:
        restore_point = restore_point + timedelta(hours=3)

    # Trim microseconds down to milliseconds
    return restore_point.strftime(time_format)[:-3]
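# A minimal sketch (not part of the original suite) of a timezone-aware alternative to the
# "+3 hours" hack above. It assumes the consumer of the restore point expects timestamps in a
# fixed offset (illustrated here as UTC+3, mirroring the shift applied when the host runs in UTC);
# the helper name util_get_restore_point_tz_aware is hypothetical.
def util_get_restore_point_tz_aware(seconds_ago=1, target_offset_hours=3):
    from datetime import datetime, timedelta, timezone as dt_timezone

    target_tz = dt_timezone(timedelta(hours=target_offset_hours))
    restore_point = datetime.now(target_tz) - timedelta(seconds=seconds_ago)
    # Same "%Y-%m-%d-%H:%M:%S.%f" format as util_get_restore_point, trimmed to milliseconds
    return restore_point.strftime("%Y-%m-%d-%H:%M:%S.%f")[:-3]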
def test_util_1_7_broke_index_from_utility(self):
    """
    This test just checks that the pmi-tool can break an index for some cache.
    """
    cache_under_test = 'cache_group_1_028'
    self.util_deploy_pmi_tool()

    custom_context = self.create_test_context('pmi_tool')
    custom_context.add_context_variables(
        fix_consistent_id=True
    )
    custom_context.set_client_result_config("client_pmi.xml")
    custom_context.set_server_result_config("server_pmi.xml")
    custom_context.build_and_deploy(self.ssh)

    self.load_data_with_streamer(end_key=5000, value_type=ModelTypes.VALUE_ALL_TYPES_INDEXED.value)

    update_table = ['!tables', '!index',
                    '\'create index example_idx on \"%s\".ALLTYPESINDEXED(STRINGCOL);\'' % cache_under_test,
                    '!index']
    sqlline = Sqlline(self.ignite)
    sqlline.run_sqlline(update_table)

    util_sleep_for_a_while(10)

    self.run_pmi_tool(cache_under_test)

    self.cu.control_utility('--cache', 'validate_indexes')
    output = self.cu.latest_utility_output

    expected = ['idle_verify failed', 'Idle verify failed']
    self.cu.control_utility('--cache idle_verify',
                            '-output=%s/idle_verify_output.txt' % self.config['rt']['remote']['test_dir'],
                            all_required=expected)
    # self.su.snapshot_utility('idle_verify',
    #                          '-output=%s/idle_verify_output.txt' % self.config['rt']['remote']['test_dir'],
    #                          all_required=expected)

    log_print(output, color='debug')
    found_broken_index = False
    for line in output.split('\n'):
        m = search('IndexValidationIssue.*cacheName=%s, idxName=EXAMPLE_IDX' % cache_under_test, line)
        if m:
            found_broken_index = True
            log_print('Index EXAMPLE_IDX is broken', color='green')

    if not found_broken_index:
        raise TidenException('Expecting index broken, but it\'s not!!!')
def _set_baseline_few_times(self, times=2):
    topology_changed = False
    lst_output = ''
    utility_baseline_log = 'control-utility-baseline.log'
    util_sleep_for_a_while(20)

    for _ in range(0, times):
        self.cu.set_current_topology_as_baseline(background=True, log=utility_baseline_log)

        check_command = {
            self.cu.latest_utility_host: [
                'cat %s/%s' % (self.ignite.client_ignite_home, utility_baseline_log)
            ]
        }

        timeout_counter = 0
        baseline_timeout = 120
        completed = False
        while timeout_counter < baseline_timeout and not completed:
            lst_output = self.ignite.ssh.exec(check_command)[self.cu.latest_utility_host][0]
            log_put('Waiting for topology changed %s/%s' % (timeout_counter, baseline_timeout))

            if 'Connection to cluster failed.' in lst_output:
                print_red('Utility unable to connect to cluster')
                break

            if 'Number of baseline nodes: ' in lst_output:
                completed = True
                break

            util_sleep_for_a_while(5)
            timeout_counter += 5

        if completed:
            topology_changed = True
            break

        log_print()

    if not topology_changed:
        print_red(lst_output)
        raise TidenException('Unable to change grid topology')

    return topology_changed
def test_ignite_10128(self):
    """
    https://ggsystems.atlassian.net/browse/IGN-12187
    https://issues.apache.org/jira/browse/IGNITE-10128

    IO race during read/write of cache configurations.
    :return:
    """
    self.start_grid_no_activate()

    max_iterations = 100
    for i in range(0, max_iterations):
        self.cu.activate()
        util_sleep_for_a_while(3)
        self.cu.deactivate()
        util_sleep_for_a_while(3)
def sbt_case_upload_data_and_create_restore_point(self):
    # start_key = int(self.max_key * self.load_multiplier) + 1
    # self.sbt_like_load(start_key, 5000)
    self.upload_data_using_sbt_model()

    util_sleep_for_a_while(10)
    restore_point = self.util_get_restore_point()
    checksum = self.calc_checksums_distributed(config_file=self.get_client_config())

    # start_key = int(5000 * self.load_multiplier) + 1
    # self.sbt_like_load(start_key, 7000)
    self.upload_data_using_sbt_model(start_key=5000, end_key=7000)

    return restore_point, checksum
def test_util_1_7_broke_index(self):
    """
    Tests that control.sh --cache validate_indexes detects a broken index for some cache.
    :return:
    """
    self.load_data_with_streamer(end_key=5000, value_type=ModelTypes.VALUE_ALL_TYPES_INDEXED.value)

    update_table = ['!tables', '!index',
                    '\'create index example_idx on \"cache_group_1_028\".ALLTYPESINDEXED(STRINGCOL);\'',
                    '!index']
    sqlline = Sqlline(self.ignite)
    sqlline.run_sqlline(update_table)

    with PiClient(self.ignite, self.get_client_config(), nodes_num=1):
        cache_under_test = 'cache_group_1_028'
        log_print('Cache under test: %s' % cache_under_test, color='blue')

        operation = create_broke_data_entry_operation(cache_under_test, 0, True, 'index')
        operation.evaluate()

        util_sleep_for_a_while(10)

        self.cu.control_utility('--cache', 'validate_indexes')
        output = self.cu.latest_utility_output

        expected = ['idle_verify failed', 'Idle verify failed']
        self.cu.control_utility('--cache idle_verify',
                                '-output=%s/idle_verify_output.txt' % self.config['rt']['remote']['test_dir'],
                                all_required=expected)
        # self.su.snapshot_utility('idle_verify',
        #                          '-output=%s/idle_verify_output.txt' % self.config['rt']['remote']['test_dir'],
        #                          all_required=expected)

        log_print(output, color='debug')
        found_broken_index = False
        for line in output.split('\n'):
            m = search('IndexValidationIssue.*cacheName=%s, idxName=EXAMPLE_IDX' % cache_under_test, line)
            if m:
                found_broken_index = True
                log_print('Index EXAMPLE_IDX is broken', color='green')

        if not found_broken_index:
            raise TidenException('Expecting index broken, but it\'s not!!!')
def test_util_1_1_idle_verify(self):
    """
    Test idle_verify command detects a problem if some key is corrupted (there is some difference
    between the key on the primary partition and on the backup).
    """
    partitions_to_break = [1, 2]
    with PiClient(self.ignite, self.get_client_config(), nodes_num=1):
        cache_under_test = 'cache_group_1_028'
        log_print('Cache under test: %s' % cache_under_test, color='blue')

        operation = create_broke_data_entry_operation(cache_under_test, partitions_to_break[0], True,
                                                      'value', 'counter')
        log_print(operation.evaluate())
        operation = create_broke_data_entry_operation(cache_under_test, partitions_to_break[1], True, 'counter')
        log_print(operation.evaluate())

        util_sleep_for_a_while(10)

        expected = ['Conflict partition']
        self.cu.control_utility('--cache idle_verify', all_required=expected)
        log_print(self.cu.latest_utility_output)

        output = self.cu.latest_utility_output
        # m = search('See log for additional information. (.*)', self.cu.latest_utility_output)
        # if m:
        #     conflict_file = m.group(1)
        #     host = self.cu.latest_utility_host
        #     output = self.ssh.exec_on_host(host, ['cat {}'.format(conflict_file)])
        #     log_print(output, color='blue')
        # else:
        #     tiden_assert(False, 'Conflict file is not found in output:\n{}'.format(self.cu.latest_utility_output))

        grpId, partId = [], []
        for line in output.split('\n'):
            m = search('Conflict partition: (PartitionKey|PartitionKeyV2) \[grpId=(\d+),.*partId=(\d+)\]', line)
            if m:
                grpId.append(m.group(2))
                partId.append(int(m.group(3)))

        tiden_assert(grpId and partId, 'Could not find partition id in buffer %s' % output)
        tiden_assert(len(set(grpId)) == 1, 'Should be one group in output %s' % output)
        tiden_assert(set(partId) == set(partitions_to_break), 'Partition ids should match %s' % output)
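# A minimal sketch (assumed helper, not part of the original suite) factoring out the
# "Conflict partition" parsing used by the idle_verify tests above; the regex mirrors the one in
# test_util_1_1_idle_verify and covers both PartitionKey and PartitionKeyV2 output formats.
def util_parse_conflict_partitions(output):
    from re import search

    group_ids, partition_ids = [], []
    for line in output.split('\n'):
        m = search('Conflict partition: (PartitionKey|PartitionKeyV2) \[grpId=(\d+),.*partId=(\d+)\]', line)
        if m:
            group_ids.append(m.group(2))
            partition_ids.append(int(m.group(3)))
    # Return the raw lists; callers decide how to assert on them (e.g. set comparisons as above)
    return group_ids, partition_ids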
def teardown_testcase(self):
    log_print('TestTeardown is called', color='green')

    if self.reusable_lfs and not self.lfs_stored:
        log_print('Going to stop nodes....')
        self.ignite.stop_nodes()
        util_sleep_for_a_while(5)
        self.save_lfs('snapshot_util', timeout=1200)
        self.lfs_stored = True
    else:
        self.stop_grid_hard()

    self.su.copy_utility_log()

    if self.get_context_variable('zookeeper_enabled'):
        self.zoo.stop()

    self.cleanup_lfs()
    self.remove_additional_nodes()
    self.ignite.set_node_option('*', 'config', self.get_server_config())
    self.set_current_context()
    self._reset_cluster()
    log_print(repr(self.ignite), color='debug')
def _change_grid_topology_and_set_baseline(self, disable_auto_baseline=True):
    """
    Restart empty grid, start one additional node and add it to baseline topology.
    :param disable_auto_baseline:
    :return:
    """
    self.restart_empty_grid()
    self.ignite.add_additional_nodes(self.get_server_config(), 1)
    self.ignite.start_additional_nodes(self.ignite.get_all_additional_nodes())

    util_sleep_for_a_while(5)

    if disable_auto_baseline and self.cu.is_baseline_autoajustment_supported():
        self.cu.disable_baseline_autoajustment()
        log_print("Baseline auto adjustment disabled", color='green')

    self._set_baseline_few_times()
def start_and_config_jass_docker(self):
    jaas_remote_config_path = self.config['rt']['remote']['test_module_dir']
    jaas_host = self.config['environment']['server_hosts'][0]

    # replace host in jaas configs
    self.ssh.exec([
        f'sed -i \'s/_HOST_/{jaas_host}/g\' {jaas_remote_config_path}/jaas.config'
    ])
    self.ssh.exec([
        f'sed -i \'s/_HOST_/{jaas_host}/g\' {jaas_remote_config_path}/jaas_ssl.config'
    ])

    # start the LDAP docker container
    self.ssh.exec_on_host(jaas_host, [
        'docker run --name ldap -d -p 389:10389 -p 636:10636 openmicroscopy/apacheds'
    ])
    util_sleep_for_a_while(5, 'wait ldap docker start')

    # set up JAAS users for authentication
    for auth_cred_name in self.auth_creds.values():
        user = auth_cred_name['user']
        password = auth_cred_name['pwd']
        description = auth_cred_name['description']
        log_print(f'Try to add user {user}:{password} with description {description}')

        for file in ['new_user', 'add_description', 'set_password']:
            self.ssh.exec([
                f'sed \'s/_USER_/{user}/g\' {jaas_remote_config_path}/{file}.ldif | '
                f'sed \'s/_DESCRIPTION_/{description}/g\' | '
                f'sed \'s/_PASSWORD_/{password}/g\' > {jaas_remote_config_path}/{file}_{user}.ldif'
            ])
            ldap_command = f"ldapmodify -a -D uid=admin,ou=system -w secret -h {jaas_host}:389 " \
                           f"-f {jaas_remote_config_path}/{file}_{user}.ldif"
            log_print(f'execute command - "{ldap_command}"')
            result = self.ssh.exec_on_host(jaas_host, [ldap_command])
            log_print(result)

        log_print(f'Successfully added user {user} to ldap', color='green')
def test_24_fitness_set_baseline_with_properties(self):
    """
    This test checks the cluster behaviour with the option
    GG_DISABLE_SNAPSHOT_ON_BASELINE_CHANGE_WITH_ENABLED_PITR, which could be set in different ways:
    1. Set at one of the server nodes.
    2. Set on some client node/nodes.
    """
    created_caches = []
    self.ignite_old_version.cu.activate()

    # Preloading
    with PiClient(self.ignite_new_version, self.client_config, nodes_num=1) as piclient:
        dynamic_caches_factory = DynamicCachesFactory()
        async_ops = []
        for method in dynamic_caches_factory.dynamic_cache_configs:
            cache_name = "cache_group_%s" % method
            log_print('Loading {}...'.format(cache_name), color='green')
            piclient.get_ignite().getOrCreateCache(getattr(dynamic_caches_factory, method)(cache_name))

            async_operation = create_async_operation(create_put_all_operation,
                                                     cache_name, 1, 1001, 10,
                                                     value_type=self.data_model)
            async_ops.append(async_operation)
            async_operation.evaluate()
            created_caches.append(cache_name)

        log_print('Waiting async results...', color='debug')
        # wait for streamer to complete
        for async_op in async_ops:
            async_op.getResult()

    util_sleep_for_a_while(20)

    new_client_config = Ignite.config_builder.get_config('client', config_set_name='24_fit_with_consist_id')
    jvm_options = self.ignite_new_version.get_jvm_options(1)
    jvm_options.append('-DGG_DISABLE_SNAPSHOT_ON_BASELINE_CHANGE_WITH_ENABLED_PITR=true')

    # with PiClient(self.ignite_new_version, self.client_config, jvm_options=jvm_options, nodes_num=1) as piclient:
    with PiClient(self.ignite_new_version, self.client_config, nodes_num=1) as piclient:
        for i in range(1, 5):
            self.ignite_old_version.cu.control_utility('--baseline')
            log_print('Stopping node {}'.format(i), color='green')

            jvm_options = self.ignite_new_version.get_jvm_options(i)
            jvm_options.append('-DGG_DISABLE_SNAPSHOT_ON_BASELINE_CHANGE_WITH_ENABLED_PITR=false')

            self.ignite_new_version.set_node_option('*', 'config',
                                                    Ignite.config_builder.get_config(
                                                        'server', config_set_name='24_fit_with_consist_id'))

            log_print("Starting node {} with new consistent id".format(i), color='debug')
            self.ignite_new_version.start_nodes(i, already_nodes=4, other_nodes=4, timeout=240)

            log_print("Changing baseline", color='debug')
            self.ignite_old_version.cu.set_current_topology_as_baseline()
            util_sleep_for_a_while(60, msg='Wait for rebalance to complete')

    log_print('Test is done')
def test_24_fitness_rolling_upgrade(self):
    """
    This test checks the main rolling upgrade scenario under load:
    1. Old cluster up and running (consistent_id's are not set).
    2. First cycle: upgrade to the new version and set property
       GG_DISABLE_SNAPSHOT_ON_BASELINE_CHANGE_WITH_ENABLED_PITR.
    3. Second cycle: set correct consistent_id and add the nodes to baseline topology.
    """
    created_caches = []
    self.ignite_old_version.cu.activate()

    with PiClient(self.ignite_new_version, self.client_config, nodes_num=1) as piclient:
        dynamic_caches_factory = DynamicCachesFactory()
        async_ops = []
        for method in dynamic_caches_factory.dynamic_cache_configs:
            cache_name = "cache_group_%s" % method
            log_print('Loading {}...'.format(cache_name), color='green')
            piclient.get_ignite().getOrCreateCache(getattr(dynamic_caches_factory, method)(cache_name))

            async_operation = create_async_operation(create_put_all_operation,
                                                     cache_name, 1, 1001, 10,
                                                     value_type=self.data_model)
            async_ops.append(async_operation)
            async_operation.evaluate()
            created_caches.append(cache_name)

        log_print('Waiting async results...', color='debug')
        # wait for streamer to complete
        for async_op in async_ops:
            async_op.getResult()

    util_sleep_for_a_while(60)

    with PiClient(self.ignite_old_version, self.client_config, nodes_num=4) as piclient:
        cache_names = piclient.get_ignite().cacheNames()

        # Start transaction loading for TTL caches
        with TransactionalLoading(MixedTestLoadingAdapter(self),
                                  config_file=self.client_config,
                                  loading_profile=LoadingProfile(delay=0,
                                                                 transaction_timeout=100000,
                                                                 run_for_seconds=600)):
            util_sleep_for_a_while(20)
            log_print('Rolling upgrade', color='green')

            async_ops = []
            for cache_name in [cache_name for cache_name in cache_names.toArray()
                               if cache_name.startswith("M2_PRODUCT")]:
                async_operation = create_async_operation(create_put_all_operation,
                                                         cache_name, 1001, 400001, 10,
                                                         value_type=ModelTypes.VALUE_ALL_TYPES.value)
                async_ops.append(async_operation)
                async_operation.evaluate()

            # First cycle: upgrade version and set property.
            for i in range(1, 5):
                self.ignite_old_version.cu.control_utility('--baseline')
                log_print('Stopping node {}'.format(i), color='green')
                self.ignite_old_version.kill_nodes(i)
                self.ignite_new_version.cleanup_work_dir(i)

                folder = self.ignite_old_version.get_work_dir(i)
                log_print(folder, color='debug')
                self.ignite_new_version.copy_work_dir_from(i, folder)

                jvm_options = self.ignite_new_version.get_jvm_options(i)
                jvm_options.append('-DGG_DISABLE_SNAPSHOT_ON_BASELINE_CHANGE_WITH_ENABLED_PITR=true')

                util_sleep_for_a_while(10)
                self.ignite_new_version.start_nodes(i, already_nodes=(4 - i), other_nodes=(4 - i), timeout=240)
                self.ignite_new_version.cu.control_utility('--baseline')

            for async_op in async_ops:
                async_op.getResult()

            util_sleep_for_a_while(30)

            log_print('Change consistent ID', color='green')
            self.ignite_new_version.set_node_option('*', 'config',
                                                    Ignite.config_builder.get_config(
                                                        'server', config_set_name='24_fit_with_consist_id'))

            # Second cycle - change consistent_id and add to baseline topology.
            for i in range(1, 5):
                self.ignite_new_version.cu.control_utility('--baseline')
                log_print('Stopping node {}'.format(i), color='green')
                self.ignite_new_version.kill_nodes(i)

                log_print("Starting node {} with new consistent id".format(i), color='debug')
                self.ignite_new_version.start_nodes(i, timeout=240)

                log_print("Changing baseline", color='debug')
                self.ignite_new_version.cu.set_current_topology_as_baseline()
                util_sleep_for_a_while(60, msg='Wait for rebalance to complete')

        log_print('Transactional loading done', color='green')

        # Just to check client node still can interact with cluster - calculate checksum from client node.
        sorted_cache_names = []
        for cache_name in piclient.get_ignite().cacheNames().toArray():
            sorted_cache_names.append(cache_name)
        sorted_cache_names.sort()

        async_operations = []
        cache_operation = {}
        for cache_name in sorted_cache_names:
            async_operation = create_async_operation(create_checksum_operation, cache_name, 1, 10000)
            async_operations.append(async_operation)
            cache_operation[async_operation] = cache_name
            async_operation.evaluate()

        checksums = ''
        cache_checksum = {}
        for async_operation in async_operations:
            result = str(async_operation.getResult())
            cache_checksum[cache_operation.get(async_operation)] = result
            checksums += result

        log_print('Calculating checksums done')
def test_ignite_8657(self):
    """
    This test is based on IGNITE-8657:

    1. start grid with EXCHANGE_HISTORY_SIZE smaller than N
    2. activate
    3. start simultaneously M > N clients
    4. all client nodes should start and be able to perform cache put/get operations and transactions

    NB: this test hangs with 2.5.1-p6, due to piclient waiting in Ignition.start() forever
    """
    self.start_grid()
    self.load_random_data_with_streamer(0, 1000, nodes_num=2)
    self.cu.set_current_topology_as_baseline()

    nodes_before = set(self.ignite.get_all_common_nodes())

    with PiClient(self.ignite, self.get_client_config(),
                  nodes_num=10, jvm_options=self.jvm_options, read_timeout=300) as piclient:
        nodes_after = set(self.ignite.get_all_common_nodes())
        nodes_started = list(nodes_after - nodes_before)
        node_ids = deque(nodes_started)

        node_id = node_ids[0]
        node_ids.rotate()

        for i in range(1, 5):
            gateway = piclient.get_gateway(node_id)
            ignite = piclient.get_ignite(node_id)
            tx = ignite.transactions().txStart()
            util_sleep_for_a_while(3)
            tx.commit()

        for concurrency in ['OPTIMISTIC', 'PESSIMISTIC']:
            for isolation in ['READ_COMMITTED', 'REPEATABLE_READ', 'SERIALIZABLE']:
                print_blue('Run transaction %s %s' % (concurrency, isolation))

                node_id = node_ids[0]
                node_ids.rotate()

                gateway = piclient.get_gateway(node_id)
                ignite = piclient.get_ignite(node_id)
                concurrency_isolation_map = self._get_tx_type_map(gateway)
                cache_names = ignite.cacheNames().toArray()

                tx = ignite.transactions().txStart(concurrency_isolation_map.get(concurrency),
                                                   concurrency_isolation_map.get(isolation))
                for cache_name in cache_names:
                    cache = ignite.getCache(cache_name)
                    val = cache.get(int(random() * 1000))
                    # log_print('got %s' % repr(val))
                    if val:
                        cache.put(int(random() * 1000), val)
                tx.commit()

        node_id = node_ids[0]
        node_ids.rotate()
        ignite = piclient.get_ignite(node_id)

        async_ops = []
        for cache_name in ignite.cacheNames().toArray():
            _async = create_async_operation(create_streamer_operation, cache_name, 1002, 2000)
            _async.evaluate()
            async_ops.append(_async)

        for async_op in async_ops:
            async_op.getResult()
def restart_grid_without_activation(self):
    self.cu.deactivate()
    util_sleep_for_a_while(5)
    self.ignite.stop_nodes()
    util_sleep_for_a_while(5)
    self.ignite.start_nodes()
def test_ignite_8855(self):
    """
    This test is based on IGNITE-8855:

    1. start grid with EXCHANGE_HISTORY_SIZE smaller than N
    2. activate
    3. start simultaneously M > N clients
    4. there should be throttling on client reconnects, e.g. clients dropped out of the exchange history
       should not all try to reconnect at once

    NB: this test fails on 8.5.1-p13
    """
    nodes_num = TestPmeRegress.MAX_CLIENT_NODES
    self.start_grid()
    self.load_random_data_with_streamer(0, 1000, nodes_num=2)
    self.cu.set_current_topology_as_baseline()

    pp = PrettyPrinter()

    try:
        with PiClient(self.ignite, self.get_client_config(),
                      nodes_num=nodes_num, jvm_options=self.jvm_options,
                      read_timeout=300):  # that's enough to start if bug was fixed
            self.ignite.wait_for_topology_snapshot(None, nodes_num)

            n_tries = 0
            res = None
            while n_tries < 3:
                util_sleep_for_a_while(5)
                self.ignite.get_data_from_log(
                    'server',
                    'Client node tries to connect but its exchange info is cleaned up from exchange history',
                    '(.*)',
                    'log_kickoff'
                )
                res = self._collect_msg('log_kickoff', 'server')
                log_print('Client kick off messages: \n' + pp.pformat(res))
                if not res:
                    n_tries = n_tries + 1
                    continue
                assert res, "There should be client kick off messages"

                util_sleep_for_a_while(5)
                self.ignite.get_data_from_log(
                    'client',
                    'Client node reconnected',
                    '(.*)',
                    'reconnect'  # and now Jinn would appear!
                )
                res = self._collect_msg('reconnect')
                if not res:
                    n_tries = n_tries + 1
                    continue
                log_print('Reconnect attempts: \n' + pp.pformat(res))

                self.ignite.wait_for_topology_snapshot(None, nodes_num, "Ensure topology is stable")
                break
    except PiClientException as e:
        log_print("Got client exception: %s" % str(e))
        assert False, "IGNITE-8855 reproduced"
def util_break_partition_using_pmi_tool(self, break_value=True):
    cache_under_test = 'cache_group_1_028'
    self.util_deploy_pmi_tool()

    custom_context = self.create_test_context('pmi_tool')
    custom_context.add_context_variables(
        fix_consistent_id=True
    )
    custom_context.set_client_result_config("client_pmi.xml")
    custom_context.set_server_result_config("server_pmi.xml")
    custom_context.build_and_deploy(self.ssh)

    self.load_data_with_streamer(end_key=5000, value_type=ModelTypes.VALUE_ALL_TYPES_INDEXED.value)

    util_sleep_for_a_while(10)

    broke_type = 'counter'
    if break_value:
        broke_type = 'value'

    self.run_pmi_tool(cache_under_test, broke_type=broke_type)

    util_sleep_for_a_while(10)

    expected = ['idle_verify failed', 'Idle verify failed']
    self.cu.control_utility('--cache idle_verify',
                            '-output=%s/idle_verify_output.txt' % self.config['rt']['remote']['test_dir'],
                            all_required=expected)
    # self.su.snapshot_utility('idle_verify',
    #                          '-output=%s/idle_verify_output.txt' % self.config['rt']['remote']['test_dir'],
    #                          all_required=expected)
    log_print(self.su.latest_utility_output)

    output = self.util_get_info_from_conflicts_file()
    log_print(output, color='green')

    records_info = [line for line in output if 'Partition instances' in line]
    if len(records_info) > 0:
        records_info = records_info[0].split('],')
    else:
        records_info = []

    partition_hash_record = set()
    prim_part_hash, prim_update_cntr = None, None
    for line in records_info:
        m = search('PartitionHashRecord \[isPrimary=(.+), partHash=([\d,-]+), updateCntr=(\d+)', line)
        if m:
            if m.group(1) == 'true':
                prim_part_hash = m.group(2)
                prim_update_cntr = m.group(3)
            else:
                partition_hash_record.add((m.group(1), m.group(2), m.group(3)))

    for isPrimary, partHash, updateCntr in partition_hash_record:
        if break_value:
            assert partHash != prim_part_hash
            assert updateCntr == prim_update_cntr
        else:
            assert partHash == prim_part_hash
            assert updateCntr != prim_update_cntr

    grpId, partId = None, None
    for line in output:
        m = search('Conflict partition: PartitionKey \[grpId=(\d+),.*partId=(\d+)\]', line)
        if m:
            grpId = m.group(1)
            partId = m.group(2)

    if grpId and partId:
        self.cu.control_utility('--cache idle_verify',
                                '-analyze -grpId=%s -partId=%s' % (grpId, partId),
                                all_required=expected)
        # self.su.snapshot_utility('idle_verify', '-analyze -grpId=%s -partId=%s' % (grpId, partId),
        #                          all_required=expected)
        output = self.util_get_info_from_conflicts_file(file_name='idle_verify_conflicts_2.txt')
        log_print(output, color='debug')
    else:
        log_print('Could not find partition id', color='red')
        log_print(self.su.latest_utility_output)
def _run_iteration(self, ignite, iteration):
    """
    One iteration of clients PME benchmark is as follows:

    1. start transactional loading at `loading_clients_hosts`, sleep `warmup_clients_delay` so that the load stabilizes
    2. start `num_clients_to_kill` clients at `clients_hosts` (different from `loading_clients_hosts`),
       measure JOIN exchange time, sleep `stabilization_delay`
    3. stop started additional clients, measure LEAVE exchange time, sleep `cooldown_delay`

    :param ignite:
    :param iteration:
    :return:
    """
    log_print("===> PME {} Client(s) Left-Join Benchmark iteration {}/{} started ".format(
        self.config['num_clients_to_kill'],
        iteration,
        self.config['iterations']
    ), color='green')

    loading_client_hosts = self._get_loading_client_hosts()
    client_hosts = self._get_client_hosts(loading_client_hosts)
    num_clients = self.config['num_clients_to_kill']

    metrics = None
    ex = None
    x1_join_time = None
    x1_leave_time = None

    try:
        # start loading clients
        with PiClient(
                ignite,
                self.test_class.client_config,
                client_hosts=loading_client_hosts,
                clients_per_host=self.config.get('loading_clients_per_host', 1)
        ):
            # initiate transactional loading
            with TransactionalLoading(
                    self.test_class,
                    ignite=ignite,
                    kill_transactions_on_exit=self.config['kill_transactions_on_exit'],
                    cross_cache_batch=self.config['cross_cache_batch'],
                    skip_atomic=self.config['skip_atomic'],
                    skip_consistency_check=not self.config['consistency_check_enabled'],
                    loading_profile=LoadingProfile(
                        delay=self.config['tx_delay'],
                        commit_possibility=self.config['commit_possibility'],
                        start_key=1,
                        end_key=self.config['load_factor'] - 1,
                        transaction_timeout=self.config['transaction_timeout']
                    ),
                    tx_metrics=['txCreated', 'txCommit', 'txFailed', 'txRollback']
            ) as tx_loading:
                metrics = tx_loading.metrics

                util_sleep_for_a_while(self.config['warmup_clients_delay'], "Before JOIN")

                current_clients_num = ignite.get_nodes_num('client')
                expected_total_clients_num = current_clients_num + num_clients

                self.test_class._prepare_before_test(ignite, tx_loading, 'JOIN %d client(s)' % num_clients)

                # start num_clients client nodes on 'flaky' hosts
                with PiClient(
                        ignite,
                        self.test_class.client_config,
                        client_hosts=client_hosts,
                        clients_per_host=self.config.get('clients_per_host', 1),
                        nodes_num=num_clients,
                        new_instance=True,
                ):
                    ignite.wait_for_topology_snapshot(client_num=expected_total_clients_num,
                                                      timeout=600,
                                                      check_only_servers=True,
                                                      exclude_nodes_from_check=[])
                    tx_loading.metrics_thread.add_custom_event('%d client(s) joined' % num_clients)
                    new_topVer = self.test_class._get_new_top_after_test(ignite)
                    self.test_class._wait_exchange_finished(ignite, new_topVer)

                    x1_join_time, x2_time = self.test_class._measurements_after_test(
                        'JOIN %d client(s)' % num_clients, skip_exch=1)

                    util_sleep_for_a_while(self.config['stabilization_delay'])

                    # upon exit from with block, num_clients client nodes will be killed
                    self.test_class._prepare_before_test(ignite, tx_loading, 'LEAVE %d client(s)' % num_clients)

                ignite.wait_for_topology_snapshot(client_num=current_clients_num,
                                                  timeout=600,
                                                  check_only_servers=True,
                                                  exclude_nodes_from_check=[])
                tx_loading.metrics_thread.add_custom_event('%d client(s) left' % num_clients)
                new_topVer = self.test_class._get_new_top_after_test(ignite)
                self.test_class._wait_exchange_finished(ignite, new_topVer)

                x1_leave_time, x2_time = self.test_class._measurements_after_test(
                    'LEAVE %d client(s)' % num_clients, skip_exch=1)

                util_sleep_for_a_while(self.config['cooldown_delay'])

        ignite.wait_for_topology_snapshot(client_num=0)
    except Exception as e:
        ex = e

    if metrics:
        self.test_class.create_loading_metrics_graph(
            'pme_%d_clients_left_join_%s_%d' % (num_clients, self.run_id, iteration),
            metrics,
            dpi_factor=0.75
        )

    if ex:
        raise ex

    return {
        'Exchange Client Join': x1_join_time,
        'Exchange Client Leave': x1_leave_time,
    }
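# A minimal sketch (hypothetical helper, not part of the original benchmark code) showing how the
# per-iteration dicts returned by _run_iteration above (e.g. {'Exchange Client Join': ..., 'Exchange
# Client Leave': ...}) could be aggregated into average exchange times across iterations; results
# with a None measurement (e.g. a failed or skipped phase) are ignored.
def aggregate_iteration_results(iteration_results):
    totals, counts = {}, {}
    for result in iteration_results:
        for metric, value in result.items():
            if value is None:
                continue
            totals[metric] = totals.get(metric, 0.0) + value
            counts[metric] = counts.get(metric, 0) + 1
    # Average per metric; metrics that never produced a value are simply absent from the result
    return {metric: totals[metric] / counts[metric] for metric in totals}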
def _run_iteration(self, ignite, iteration):
    """
    One iteration of server PME benchmark is as follows:

    1. start transactional loading, sleep `warmup_servers_delay` so that the load stabilizes
    2. kill random N nodes, measure LEAVE exchange time, sleep `stabilization_delay`
    3. restart killed nodes, measure JOIN exchange time, sleep `cooldown_delay`
    4. stop load

    :param ignite:
    :param iteration:
    :return:
    """
    log_print("===> PME {} Server(s) Left-Join Benchmark iteration {}/{} started ".format(
        self.config['num_servers_to_kill'],
        iteration,
        self.config['iterations']
    ), color='green')

    # if debug:
    #     from pt.util import read_yaml_file
    #     from os.path import join
    #     base_path = 'pt/tests/res/exchanges'
    #     exch_test = iteration
    #     start_exch = read_yaml_file(join(base_path, 'start_exch.%d.yaml' % exch_test))
    #     finish_exch = read_yaml_file(join(base_path, 'finish_exch.%d.yaml' % exch_test))
    #     merge_exch = read_yaml_file(join(base_path, 'merge_exch.%d.yaml' % exch_test))
    #     self.test_class.exchanges = ExchangesCollection.create_from_log_data(start_exch, finish_exch, merge_exch)
    #     self.test_class.new_topVer = 5
    #     x1_leave_time, x2_time = self.test_class._measurements_after_test('test_leave', skip_exch=1)
    #     self.test_class.new_topVer = 6
    #     x1_join_time, x2_time = self.test_class._measurements_after_test('test_join', skip_exch=1)
    #
    #     return x1_leave_time, x1_join_time

    loading_client_hosts = self._get_loading_client_hosts()
    num_servers = self._get_num_server_nodes()
    num_servers_to_kill = self.config['num_servers_to_kill']
    kill_coordinator = self.config['kill_coordinator']

    metrics = None
    ex = None
    x1_join_time = None
    x1_leave_time = None

    try:
        # start loading clients ...
        with PiClient(ignite,
                      self.test_class.client_config,
                      client_hosts=loading_client_hosts,
                      clients_per_host=self.config.get('loading_clients_per_host', 1)):
            # ... and initiate transactional load
            with TransactionalLoading(
                    self.test_class,
                    ignite=ignite,
                    kill_transactions_on_exit=self.config['kill_transactions_on_exit'],
                    cross_cache_batch=self.config['cross_cache_batch'],
                    skip_atomic=self.config['skip_atomic'],
                    skip_consistency_check=not self.config['consistency_check_enabled'],
                    loading_profile=LoadingProfile(
                        delay=self.config['tx_delay'],
                        commit_possibility=self.config['commit_possibility'],
                        start_key=1,
                        end_key=self.config['load_factor'] - 1,
                        transaction_timeout=self.config['transaction_timeout']),
                    tx_metrics=['txCreated', 'txCommit', 'txFailed', 'txRollback']) as tx_loading:
                metrics = tx_loading.metrics

                # pick random server nodes
                node_ids = ignite.get_random_server_nodes(
                    num_servers_to_kill,
                    use_coordinator=kill_coordinator,
                    node_ids=self.test_class.server_node_ids,
                )
                expected_total_server_num = num_servers - len(node_ids)

                # ... wait for load to stabilize
                util_sleep_for_a_while(self.config['warmup_servers_delay'], "Before LEAVE")

                if is_enabled(self.config.get('jfr_enabled', False)):
                    ignite.make_cluster_jfr(60)

                util_sleep_for_a_while(2)
                self.test_class._prepare_before_test(ignite, tx_loading, 'LEAVE %d server(s)' % len(node_ids))

                # ... kill selected random nodes
                ignite.kill_nodes(*node_ids)
                ignite.wait_for_topology_snapshot(server_num=expected_total_server_num)
                tx_loading.metrics_thread.add_custom_event('%d server(s) left' % len(node_ids))

                new_topVer = self.test_class._get_new_top_after_test(ignite)
                self.test_class._wait_exchange_finished(ignite, new_topVer)

                x1_leave_time, x2_time = self.test_class._measurements_after_test(
                    'LEAVE %d server(s)' % len(node_ids), skip_exch=1)

                if is_enabled(self.config.get('heapdump_enabled', False)):
                    ignite.make_cluster_heapdump([1], 'after_%d_server_leave' % len(node_ids))

                # ... wait for exchange to stabilize
                util_sleep_for_a_while(self.config['stabilization_delay'], "After LEAVE, before JOIN")

                if self.config['measure_restart_nodes']:
                    self.test_class._prepare_before_test(ignite, tx_loading, 'JOIN %d server(s)' % len(node_ids))

                # ... restart killed nodes
                ignite.start_nodes(*node_ids)
                ignite.wait_for_topology_snapshot(server_num=expected_total_server_num + len(node_ids))

                if self.config['measure_restart_nodes']:
                    tx_loading.metrics_thread.add_custom_event('%d server(s) joined' % len(node_ids))
                    new_topVer = self.test_class._get_new_top_after_test(ignite)
                    self.test_class._wait_exchange_finished(ignite, new_topVer)
                    x1_join_time, x2_time = self.test_class._measurements_after_test(
                        'JOIN %d server(s)' % len(node_ids), skip_exch=1)
                    # if is_enabled(self.config.get('heapdump_enabled', False)):
                    #     ignite.make_cluster_heapdump([1], 'after_%d_server_join' % len(node_ids))

                # ... wait for exchange cooldown
                util_sleep_for_a_while(self.config['cooldown_delay'], "After JOIN")

        ignite.wait_for_topology_snapshot(client_num=0)
    except Exception as e:
        ex = e

    if metrics:
        self.test_class.create_loading_metrics_graph(
            'pme_%d_servers_left_join_%s_%d' % (num_servers_to_kill, self.run_id, iteration),
            metrics,
            dpi_factor=0.75)

    if ex:
        raise ex

    return {
        'Exchange Server Join': x1_join_time,
        'Exchange Server Leave': x1_leave_time,
    }
def test_massive_index_rebuild(self):
    """
    1) 2 nodes, backupCnt = 1, persistenceEnabled
    2) Load (A, B) type into a cache with defined (A, B) types in index config
    3) Load new type of data into a cache (C, D)
    4) Kill one node
    5) Create new index in alive cluster
    6) Start node again

    :return:
    """
    PiClient.read_timeout = 1200

    self.set_current_context('indexed_types')

    self.util_copy_piclient_model_to_libs()
    self.ignite.set_activation_timeout(240)
    self.ignite.set_snapshot_timeout(240)
    self.ignite.set_node_option('*', 'jvm_options', ['-ea'])
    self.su.clear_snapshots_list()
    self.start_grid(skip_activation=True)

    self.ignite.cu.activate(activate_on_particular_node=1)

    PiClientIgniteUtils.load_data_with_streamer(self.ignite,
                                                self.get_client_config(),
                                                value_type=ModelTypes.VALUE_ALL_TYPES_30_INDEX.value,
                                                end_key=5000)

    PiClientIgniteUtils.load_data_with_streamer(self.ignite,
                                                self.get_client_config(),
                                                value_type=ModelTypes.VALUE_ACCOUNT.value,
                                                start_key=5000,
                                                end_key=10000)

    PiClientIgniteUtils.load_data_with_streamer(self.ignite,
                                                self.get_client_config(),
                                                value_type=ModelTypes.VALUE_EXT_ALL_TYPES_30_INDEX.value,
                                                start_key=10000,
                                                end_key=15000)

    # PiClientIgniteUtils.load_data_with_streamer(self.ignite,
    #                                             self.get_client_config(),
    #                                             cache_names_patterns=['cache_group_3'],
    #                                             value_type=ModelTypes.VALUE_EXT_ALL_TYPES_30_INDEX.value,
    #                                             end_key=10000)

    iterations = 50
    sqlline = Sqlline(self.ignite)

    columns = ['longCol', 'doubleCol', 'stringCol', 'booleanCol', 'longCol1',
               # 'doubleCol1', 'stringCol1', 'intCol', 'intCol1',
               # 'booleanCol1',
               # 'index',
               'longCol2', 'doubleCol2', 'stringCol2', 'booleanCol2',
               # 'longCol12', 'doubleCol12', 'stringCol12', 'intCol12', 'intCol2',
               # 'shortCol2',
               'longCol3', 'doubleCol3', 'stringCol3', 'booleanCol3',
               # 'longCol13', 'doubleCol13', 'stringCol13', 'intCol13', 'intCol3',
               'shortCol3']

    with PiClient(self.ignite, self.get_client_config()) as piclient:
        cache_names = piclient.get_ignite().cacheNames().toArray()

        for i in range(0, iterations):
            log_print('Current iteration %s from %s' % (i, iterations), color='debug')

            update_table = []

            self.ignite.kill_node(2)

            indexed_columns = ','.join(columns)

            for cache_name in cache_names:
                # self.ssh.exec_on_host('REMOVE')
                vtype = 'ALLTYPES30INDEX'  # if 'cache_group_3' not in cache_name else 'EXTALLTYPES30INDEX'
                update_table.append(
                    f'\'CREATE INDEX IF NOT EXISTS {cache_name}_{vtype} on '
                    f'\"{cache_name}\".{vtype}({indexed_columns}) INLINE_SIZE 32 PARALLEL 28;\''
                )

            update_table.append('!index')

            sqlline.run_sqlline(update_table)

            self.ignite.start_node(2)

            util_sleep_for_a_while(30)

            self.verify_no_assertion_errors()

            self.cu.control_utility('--cache validate_indexes', all_required='no issues found.')
def test_pme_bench_activate(self):
    """
    Perform activate and deactivate benchmarks, save results to .csv in var_dir
    """
    self.start_grid_no_activate()
    self.last_top = self.ignite.ignite_srvs.last_topology_snapshot()
    self.last_topVer = max([_['ver'] for _ in self.last_top])
    last_minorTopVer = 0
    print_blue("Topology version before activate: %d" % self.last_topVer)

    util_sleep_for_a_while(3)
    self.ignite.ignite_srvs.jmx.activate(1)

    n_tries = 0
    max_tries = 5
    while n_tries < max_tries:
        n_tries += 1
        max_time = self._get_last_exchange_time()
        if self.exchange_finished:
            break
        util_sleep_for_a_while(5)

    self.new_top = self.ignite.ignite_srvs.last_topology_snapshot()
    self.new_topVer = max([_['ver'] for _ in self.new_top])
    assert self.new_topVer == self.last_topVer, "ERROR: major topology changed, possibly crash during activation"

    for exch_topVer, exch_data in self.agg_exch_x1.items():
        exch_major_topVer = int(exch_topVer / 10000)
        exch_minor_topVer = exch_topVer - exch_major_topVer * 10000
        if exch_major_topVer == self.last_topVer:
            x1_time = self.agg_exch_x1[exch_topVer]['max_duration']
            x2_time = self.agg_exch_x2[exch_topVer]['max_duration']
            self._dump_exchange_time(x1_time, x2_time,
                                     "activate [%d, %d]" % (exch_major_topVer, exch_minor_topVer),
                                     num_partitions=self.num_partitions)
            last_minorTopVer = exch_minor_topVer

    self.ignite.ignite_srvs.jmx.deactivate(1)

    n_tries = 0
    max_tries = 5
    while n_tries < max_tries:
        n_tries += 1
        max_time = self._get_last_exchange_time()
        if self.exchange_finished:
            break
        util_sleep_for_a_while(5)

    for exch_topVer, exch_data in self.agg_exch_x1.items():
        exch_major_topVer = int(exch_topVer / 10000)
        exch_minor_topVer = exch_topVer - exch_major_topVer * 10000
        if exch_major_topVer == self.last_topVer and exch_minor_topVer > last_minorTopVer:
            x1_time = self.agg_exch_x1[exch_topVer]['max_duration']
            x2_time = self.agg_exch_x2[exch_topVer]['max_duration']
            self._dump_exchange_time(x1_time, x2_time,
                                     "deactivate [%d, %d]" % (exch_major_topVer, exch_minor_topVer),
                                     num_partitions=self.num_partitions)
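# A minimal sketch (assumed helpers, not part of the original suite) making explicit the packing
# convention relied on in test_pme_bench_activate above: an exchange topology version key is stored
# as major_topVer * 10000 + minor_topVer, so major and minor parts are recovered with integer
# division and remainder.
def pack_exchange_top_ver(major_top_ver, minor_top_ver):
    # Assumes minor_top_ver < 10000, which holds for the activation/deactivation exchanges measured here
    return major_top_ver * 10000 + minor_top_ver


def unpack_exchange_top_ver(exch_top_ver):
    major_top_ver = exch_top_ver // 10000
    minor_top_ver = exch_top_ver - major_top_ver * 10000
    return major_top_ver, minor_top_ver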