def start_ignite_grid(self, name, activate=False, already_nodes=0, replaced_name=None):
    app = self.get_app(self.ignite_app_names[name])
    app.set_node_option('*', 'config',
                        Ignite.config_builder.get_config('server', config_set_name=self.config_name))

    # app.activate_default_modules()

    artifact_cfg = self.tiden.config['artifacts'][app.name]

    app.reset()
    log_print("Ignite ver. %s, revision %s" % (
        artifact_cfg['ignite_version'],
        artifact_cfg['ignite_revision'],
    ))

    if replaced_name:
        app.grid_name = replaced_name

    cmd = [
        'cp %s %s/libs/' % (app.config['artifacts']['piclient']['remote_path'],
                            app.config['artifacts'][self.ignite_app_names[name]]['remote_path'])
    ]
    self.util_exec_on_all_hosts(app, cmd)

    app.start_nodes(already_nodes=already_nodes, other_nodes=already_nodes)

    if activate:
        app.cu.activate()

    return app

def teardown_testcase(self):
    self.logger.info('TestTeardown is called')
    log_print(repr(self.ignite), color='debug')
    log_print(self.ignite.get_all_additional_nodes(), color='blue')
    log_print(self.ignite.get_alive_additional_nodes(), color='blue')

    for additional_node in self.ignite.get_alive_additional_nodes():
        self.ignite.kill_node(additional_node)

    # kill jmx utility if it is still running
    if self.ignite.jmx.is_started():
        self.ignite.jmx.kill_utility()

    self.stop_grid_hard()
    self.su.copy_utility_log()

    if self.get_context_variable('zookeeper_enabled'):
        self.zoo.stop_zookeeper()
        # self.zoo.collect_logs_to_folder(self.ignite.config['rt']['remote']['test_dir'])

    log_print(f'Value for self.need_delete_lfs_on_teardown is {self.need_delete_lfs_on_teardown}',
              color='debug')

    if self.need_delete_lfs_on_teardown:
        self.cleanup_lfs()

    self.set_current_context()
    self.reset_cluster()
    log_print(repr(self.ignite), color='debug')

def download_run_logs(self):
    log_print('download logs')
    logs_dir = self.tiden.config['rt']['remote']['test_dir']
    files = []
    commands = {host: [f"ls {logs_dir} | grep .log"] for host in self.tiden.ssh.hosts}
    items_list = self.cluster.ssh.exec(commands)
    for host in self.tiden.ssh.hosts:
        files.append([host, [f'{logs_dir}/{file}' for file in items_list[host][0].split('\n') if file]])
    log_print('downloaded')

    files_receiver_url = self.tiden.config['environment'].get('report_files_url')
    if files_receiver_url:
        for host, paths in files:
            for file_path in paths:
                base_filename = basename(file_path)
                file_name = f'{uuid4()}_{base_filename}'
                self.tiden.ssh.exec_on_host(
                    host,
                    [f'cd {dirname(file_path)}; '
                     f'curl -H "filename: {file_name}" '
                     f'-F "file=@{base_filename};filename={base_filename}" '
                     f'{files_receiver_url}/files/add']
                )
                self.context['attachments'].append({
                    'name': base_filename,
                    'source': file_name,
                    'type': 'file'
                })

def execute_action(self, state):
    for s in state:
        log_print(f'[state] {s} starting', color='yellow')
        split_state = s.split('#')
        self.context['temp_action_options'] = split_state[1:] if len(split_state) > 1 else []
        self.action_methods[split_state[0]]()
        log_print(f'[state] {s} ended', color='yellow')

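# Illustrative sketch (not part of the suite): how execute_action() splits a
# '#'-delimited state string into an action name plus options. The action name
# 'restart' and its options below are hypothetical examples, not real
# action_methods keys.
def _split_state_example(s='restart#skip_checks#fast'):
    split_state = s.split('#')
    action_name = split_state[0]                               # -> 'restart'
    options = split_state[1:] if len(split_state) > 1 else []  # -> ['skip_checks', 'fast']
    return action_name, options
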
def execute_cache_check(self, verify_check_dir):
    """
    Run the control.sh idle_verify command on the existing cluster
    """
    name = 'verify.log'
    host = self.tiden.ssh.hosts[0]
    remote_log_path = join(self.tiden.config['rt']['remote']['test_dir'], name)
    local_log_path = join(verify_check_dir, name)
    skip_download = False
    try:
        self.cluster.cu.idle_verify_dump(key_dump=False, log=remote_log_path, output_limit=300)
    except Exception:
        stacktrace = format_exc()
        log_print(stacktrace, color='red')
        self.context['step_failed'] = f'Caches verification failed\n{stacktrace}'
        if 'Not found running server nodes' in stacktrace:
            skip_download = True
    finally:
        # attach the verification log in any case
        if skip_download:
            return
        try:
            self.cluster.ssh.download_from_host(host, remote_log_path, local_log_path)
            with open(local_log_path) as file:
                fail_message = ''.join(file.readlines())
            if 'no conflicts have been found' not in fail_message:
                self.context['step_failed'] = self.context.get('step_failed', '') + f'\nFound conflicts!\n{fail_message}'
            self.context['attachments'].append({
                'name': 'log',
                'source': local_log_path,
                'type': 'file'
            })
        except Exception:
            self.context['step_failed'] = self.context.get('step_failed', '') + f'\nDownload failed!\n{format_exc()}'

def test_sim(self):
    version, ignite = self.start_ignite_grid(True)

    ignite.jmx.start_utility()

    client_config = Ignite.config_builder.get_config('client', config_set_name='base')
    group_names = PiClientIgniteUtils.collect_cache_group_names(ignite, client_config)

    PiClientIgniteUtils.load_data_with_streamer(ignite, client_config, end_key=50)

    server_nodes_num = ignite.get_nodes_num('server')
    sim_engine = PigeonSimulation(server_nodes_num)

    for running_iteration in range(1, DEFAULT_ITERATIONS + 1):
        log_print("Running iteration %s" % running_iteration)

        ev, node = sim_engine.next_event()
        log_print("Evaluating event %s on node %s" % (ev, node))

        pigeon = self.get_pigeon(ignite, node)
        pigeon[ev]()

        ignite.jmx.wait_for_finish_rebalance(120, group_names)
        self.verify_cluster(ignite)

    ignite.jmx.kill_utility()

def util_find_snapshot_folders_on_fs(ignite, snapshot_id, remote_dir=None):
    snapshot_dirs = {}
    remote_snapshot_dir = None

    search_in_dir = remote_dir if remote_dir else './work/snapshot/'
    output = SnapshotScenario.run_on_all_nodes(ignite, 'ls -1 %s | grep %s' % (search_in_dir, snapshot_id))

    if len(output) > 0:
        for node_idx in output.keys():
            snapshot_dir = output[node_idx].rstrip()

            if snapshot_dir:
                # add only if the directory exists
                snapshot_dirs[node_idx] = snapshot_dir
                remote_snapshot_dir = snapshot_dirs[node_idx]

            log_print('Snapshot directory %s for snapshot ID=%s on node %s'
                      % (snapshot_dir if snapshot_dir else 'Not found', snapshot_id, node_idx),
                      color='yellow')

    # if remote_dir is set, the path is the same on all servers, so there is no need to iterate over them
    if remote_dir:
        return '%s/%s' % (remote_dir, remote_snapshot_dir)

    return snapshot_dirs

def parallel_restart(self, ignite):
    # cleanup dead nodes LFS to avoid BLAT errors
    alive_node_id = ignite.get_alive_default_nodes()[0]

    ignite.cu.get_current_topology_version()
    all_nodes = set(findall(r'ConsistentID=([^,;\n]+)', ignite.cu.latest_utility_output))
    blt_nodes = set(findall(r'ConsistentID=(.*?), STATE=', ignite.cu.latest_utility_output))
    nblt_nodes = all_nodes - blt_nodes
    log_print(nblt_nodes)

    for node_id in ignite.get_all_default_nodes():
        if node_id not in ignite.get_alive_default_nodes() \
                or ignite.get_node_consistent_id(node_id) in nblt_nodes:
            ignite.cleanup_work_dir(node_id)

    ignite.stop_nodes()

    # start first previously alive node to set BLAT correctly
    ignite.start_node(alive_node_id)

    for node_id in ignite.get_all_default_nodes():
        if node_id == alive_node_id:
            continue
        ignite.start_node(node_id)

    ignite.cu.set_current_topology_as_baseline()

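# Standalone sketch of how the two findall() patterns above separate baseline from
# non-baseline nodes. The sample text is a hypothetical, trimmed imitation of the
# control utility topology output, not a verbatim capture.
def _baseline_regex_example():
    from re import findall
    sample_output = (
        "ConsistentID=node_1_1, STATE=ONLINE\n"
        "ConsistentID=node_1_2, STATE=ONLINE\n"
        "ConsistentID=node_1_3\n"
    )
    all_nodes = set(findall(r'ConsistentID=([^,;\n]+)', sample_output))
    blt_nodes = set(findall(r'ConsistentID=(.*?), STATE=', sample_output))
    return all_nodes - blt_nodes  # -> {'node_1_3'}: the node outside the baseline
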
def start_ignite_grid(self, activate=False, already_nodes=0, config_set='base', jvm_options=None):
    app = Ignite(self.get_app_by_type('ignite')[0])
    app.set_node_option('*', 'config',
                        Ignite.config_builder.get_config('server', config_set_name=config_set))

    if jvm_options:
        app.set_node_option('*', 'jvm_options', jvm_options)

    artifact_cfg = self.tiden.config['artifacts'][app.name]

    app.reset()
    version = artifact_cfg['ignite_version']
    log_print("Ignite ver. %s, revision %s" % (
        version,
        artifact_cfg['ignite_revision'],
    ))

    app.start_nodes(already_nodes=already_nodes)

    if activate:
        app.cu.activate(activate_on_particular_node=1)

    return version, app

def start_ignite_grid(self, name, activate=False, already_nodes=0, config_set='base', jvm_options=None):
    app = self.get_app(name)
    app.set_node_option('*', 'config',
                        Ignite.config_builder.get_config('server', config_set_name=config_set))

    if jvm_options:
        app.set_node_option('*', 'jvm_options', jvm_options)

    artifact_cfg = self.tiden.config['artifacts'][app.name]

    app.reset()
    log_print("Ignite ver. %s, revision %s" % (
        artifact_cfg['ignite_version'],
        artifact_cfg['ignite_revision'],
    ))

    app.set_activation_timeout(240)
    app.set_snapshot_timeout(240)
    app.start_nodes(already_nodes=already_nodes)

    if activate:
        app.cu.activate(activate_on_particular_node=1)

    return app

def register_config_set(self, config_set_name):
    """
    Register a new config set.

    Config set contents:
    * common variables - variables shared by all configs in this set
    * exclusive variables - variables for a particular node
    * common configs - result of config + common variables
    * exclusive configs - result of exclusive config + (common + exclusive variables)

    :param config_set_name: configuration set name
    :return: the registered config set
    """
    if config_set_name in self.config_sets:
        log_print("Config set with name '%s' is already registered. Creating new config set." % config_set_name,
                  color='yellow')

    self.config_sets[config_set_name] = {}

    if self.current_config_set is None:
        self.current_config_set = config_set_name

    self.config_sets[config_set_name]['common_variables'] = {}
    self.config_sets[config_set_name]['exclusive_variables'] = {}
    self.config_sets[config_set_name]['common_configs'] = {}
    self.config_sets[config_set_name]['exclusive_configs'] = {}
    self.config_sets[config_set_name]['additional_config_types'] = {}

    return self.config_sets[config_set_name]

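# Illustrative reference (assumption, not part of the builder API): the layout that
# register_config_set() creates and returns for a newly registered name, written out
# as a literal so callers can see which sub-dicts are available.
def _empty_config_set_layout():
    return {
        'common_variables': {},
        'exclusive_variables': {},
        'common_configs': {},
        'exclusive_configs': {},
        'additional_config_types': {},
    }
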
def operation_deactivation(self):
    """
    Deactivate the cluster once.

    The operation starts only after the other operations have started.
    """
    property_name = 'deactivation'
    self.operations[property_name]['started'] = False
    self.operations[property_name]['kill'] = False
    test_dir = self.tiden.config['rt']['remote']['test_dir']
    set_log_path = f"{test_dir}/deactivation_log.log"

    while True:
        keys = list(self.operations.keys())
        for k in keys:
            if not self.operations.get(property_name):
                log_print(f'[operation] {property_name.upper()} kill already killed', color='yellow')
                return
            if self.operations[property_name]['kill']:
                return
            if len(keys) == 1 or (k != property_name and self.operations[k].get('started')):
                self.operations[property_name]['started'] = True
                self.cluster.cu.deactivate(log=set_log_path)
                self.context['activate'] = False
                self.operations[property_name]['killed'] = True
                log_print(f'[operation] {property_name.upper()} kill', color='yellow')
                return
        sleep(1)

def setup_webconsole(self):
    console = self.get_app_by_type('webconsole')[0]
    console.start(users=True)
    ignite_urls = ['{}:8080'.format(host) for host in console.ssh.hosts]
    console.start_web_agent(ignite_urls=ignite_urls)

    log_print('WebConsole Started', color='debug')

def _print_wait_for(self, message, node_idxs, time, timeout, done):
    nodes_str = ', '.join([str(node_id) for node_id in node_idxs])
    log_put(f"Waiting for '{message}' at nodes [{nodes_str}], {time}/{timeout} sec")
    if done:
        stdout.flush()
        log_print('')

def test_client_start_restart_grid_stop(self):
    with PiClient(self.ignite, self.get_client_config()) as piclient:
        log_print(piclient)
        self.stop_grid()
        self.start_grid()

        cache_names = piclient.get_ignite().cacheNames().toArray()
        for cache_name in cache_names:
            log_print(cache_name)

def cluster_state_empty(self):
    """
    Start a new clean cluster
    """
    log_print('[cluster state] starting EMPTY', color='yellow')
    self.clean_cluster()
    self.start_grid()
    self.context['clean_cluster_was_here'] = False
    log_print('[cluster state] started EMPTY', color='yellow')

def operation_baseline(self):
    """
    Update the baseline in a loop
    """
    property_name = 'baseline'
    self.operations[property_name]['started'] = False
    self.operations[property_name]['kill'] = False
    test_dir = self.tiden.config['rt']['remote']['test_dir']
    set_log_path = f"{test_dir}/set_baseline_join.log"
    get_log_path = f"{test_dir}/get_baseline_version.log"
    fail_counter = 0
    end_time = time() + self.max_operation_timeout

    while True:
        try:
            curr_top_version = self.cluster.cu.get_current_topology_version(
                log=get_log_path, show_output=False, ssh_options={'timeout': 60 * 3})
            if not curr_top_version:
                raise TidenException('Failed to get current topology version')

            if not self.operations[property_name]['started']:
                self.operations[property_name]['started'] = True
                log_print(f'[operation] {property_name.upper()} started', color='yellow')

            self.cluster.cu.set_baseline(curr_top_version, log=set_log_path, show_output=False,
                                         output_limit=100, ssh_options={'timeout': 60 * 3})
        except Exception:
            fail_counter += 1
            log_print(format_exc(), color='red')
            if fail_counter > 10:
                self.operations[property_name]['kill'] = True
        finally:
            if time() > end_time:
                log_print(f'[operation] {property_name.upper()} timeout kill', color='yellow')
                if self.operations.get(property_name):
                    self.operations[property_name]['killed'] = True
                return
            if not self.operations.get(property_name):
                log_print(f'[operation] {property_name.upper()} kill already killed', color='yellow')
                return
            if self.operations[property_name]['kill']:
                log_print(f'[operation] {property_name.upper()} kill', color='yellow')
                self.operations[property_name]['killed'] = True
                return
        sleep(1)

def setup_operations(self, operations: list):
    """
    Start all operations and wait until they are running
    """
    steps = []
    try:
        for operation in operations:
            steps.append({
                'name': operation,
                'time': {'start': self.current_time, 'start_pretty': self.current_time_pretty}
            })
            log_print(f'[operation] {operation.upper()} starting', color='yellow')

            # start the operation in a separate thread
            t = Thread(target=self.operations_methods[operation])
            self.operations[operation] = {'thread': t}
            t.start()

        end_time = time() + 60
        while True:
            started_operations = [(name, op) for name, op in self.operations.items() if op.get('started', False)]
            for name, started_operation in started_operations:
                idx = [idx for idx, _step in enumerate(steps) if _step['name'] == name][0]

                # the operation is already marked as running
                if steps[idx].get('status'):
                    continue

                # step stuff
                steps[idx]['status'] = 'passed'
                steps[idx]['stacktrace'] = ''
                steps[idx]['time']['end'] = self.current_time
                steps[idx]['time']['end_pretty'] = self.current_time_pretty
                diff = steps[idx]["time"]["end"] - steps[idx]["time"]["start"]
                steps[idx]['time']['diff'] = f'{diff // 1000}s'

            # all operations started
            if len(started_operations) == len(operations):
                return

            if [op for op in self.operations.values() if op.get('killed', False)] or end_time < time():
                # operations were killed before start or did not start for too long
                for idx, step in enumerate(deepcopy(steps)):
                    if not step.get('status'):
                        steps[idx]['status'] = 'failed'
                        steps[idx]['time']['end'] = self.current_time
                        steps[idx]['time']['end_pretty'] = self.current_time_pretty
                        diff = steps[idx]["time"]["end"] - steps[idx]["time"]["start"]
                        steps[idx]['time']['diff'] = f'{diff // 1000}s'
                        steps[idx]['stacktrace'] = f'Failed to wait for operations start: \n{format_exc()}\n' \
                                                   f'Operations: {operations}\n' \
                                                   f'Started: {started_operations}'
                raise TidenException(f'Failed to wait for operations start\n'
                                     f'Operations: {operations}\n'
                                     f'Started: {started_operations}')
            sleep(0.5)
    finally:
        self.context['report']['children'] = steps

def wait_transactions_finish(self):
    log_print("Waiting up to {} seconds for current transactions to finish".format(self.transactions_timeout))

    for tries in range(0, int(self.transactions_timeout / self.transaction_check_sleep)):
        self.ignite.cu.control_utility('--tx')

        if 'Nothing found' in self.ignite.cu.latest_utility_output:
            break

        util_sleep(self.transaction_check_sleep)

def teardown_testcase(self):
    for cluster in self.clusters:
        log_print('Teardown for cluster {}'.format(cluster))

        if cluster.grid:
            cluster.grid.kill_nodes()
            cluster.grid.remove_additional_nodes()

            if not SAVE_LFS:
                cluster.grid.delete_lfs()

            cluster.grid = None

def dr_is_stopped(piclient, cache_name):
    try:
        piclient.get_ignite().cache(cache_name)
        stopped = piclient.get_ignite().plugin(
            piclient.get_gateway().jvm.org.gridgain.grid.GridGain.PLUGIN_NAME
        ).dr().senderCacheStatus(cache_name).stopped()
    except Exception as e:
        log_print(e)
        stopped = False
    finally:
        return stopped

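# Hypothetical usage sketch: poll dr_is_stopped() until the sender cache stops or a
# timeout elapses. 'piclient' is assumed to be an open PiClient session from this
# suite; wait_dr_stopped itself is illustrative and not part of the suite.
def wait_dr_stopped(piclient, cache_name, timeout=60, poll_interval=2):
    from time import time, sleep
    end = time() + timeout
    while time() < end:
        if dr_is_stopped(piclient, cache_name):
            return True
        sleep(poll_interval)
    return False
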
def operation_none(self):
    property_name = 'none'
    self.operations[property_name]['started'] = True
    self.operations[property_name]['kill'] = False
    log_print(f'[operation] {property_name.upper()} started', color='yellow')

    while True:
        if self.operations[property_name]['kill']:
            self.operations[property_name]['killed'] = True
            log_print(f'[operation] {property_name.upper()} kill', color='yellow')
            return
        sleep(0.5)

def get_nodes_directory_size(ignite, ssh, directory, nodes=None, commands=None):
    """
    Total directory size from all nodes

    :return: collected result
    """
    node_ids_to_ignite_home = {}

    if not nodes:
        nodes = ignite.get_all_default_nodes() + ignite.get_all_additional_nodes()

    for node_ids in nodes:
        node_ids_to_ignite_home[node_ids] = ignite.nodes[node_ids]['ignite_home']

    # commands may be overridden if specific commands should be sent to a node
    if not commands:
        commands = {}
        for node_idx in nodes:
            host = ignite.nodes[node_idx]['host']
            if commands.get(host) is None:
                commands[host] = ['du -s {}/{}'.format(ignite.nodes[node_idx]['ignite_home'], directory)]
            else:
                commands[host].append('du -s {}/{}'.format(ignite.nodes[node_idx]['ignite_home'], directory))

    results = ssh.exec(commands)
    results_parsed = 0
    for host in results.keys():
        # print(results[host][0])
        search = re.search(r'(\d+)\t', results[host][0])
        if not search:
            log_print('Unable to get directory size for host %s. Set directory size as 0.' % host)
            return 0
        results_parsed += int(search.group(1))

    return results_parsed

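# Standalone sketch of the 'du -s' parsing above: each host replies with lines like
# '<size>\t<path>', the leading number is extracted and the per-host sizes are summed.
# The sample results dict below is illustrative only.
def _sum_du_output_example():
    import re
    sample_results = {
        'host1': ['1024\t/opt/ignite/work/db\n'],
        'host2': ['2048\t/opt/ignite/work/db\n'],
    }
    total = 0
    for host in sample_results:
        search = re.search(r'(\d+)\t', sample_results[host][0])
        if search:
            total += int(search.group(1))
    return total  # -> 3072
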
def remove_index_bin_files(self, node_id):
    """
    Remove all index.bin files for a particular node, except those of the cache-ignite-sys-caches cache.

    :param node_id:
    :return:
    """
    if node_id in self.ignite.nodes.keys():
        host = self.ignite.nodes[node_id]['host']
        ignite_home = self.ignite.nodes[node_id]['ignite_home']

        commands = dict()
        dir_to_search = '{}/work/db/'.format(ignite_home)
        commands[host] = ['find {} -name \'index.bin\''.format(dir_to_search)]
        output = self.ignite.ssh.exec(commands)
        files = [file for file in output[host][0].split('\n')
                 if file and 'cache-ignite-sys-caches' not in file]
        log_print(files, color='debug')
        log_print(commands, color='debug')

        commands[host] = [';'.join(['rm {}'.format(file) for file in files])]
        output = self.ignite.ssh.exec(commands)
        log_print(output, color='debug')
    else:
        log_print("Node id {} was not found in server nodes {}".format(node_id, self.ignite.nodes.keys()),
                  color='red')

def test_master_master_master_blinking_blt(self):
    self.prepare_clusters()

    client_config = self.preconfigure_cluster_0()

    iterations = 10
    last_loaded_key = START_DATA_SIZE
    nodes_before = 6

    with PiClient(self.clusters[0].grid, client_config, jvm_options=['-ea']) as piclient:
        PiClientIgniteUtils.load_data_with_streamer(self.clusters[0].grid,
                                                    client_config,
                                                    end_key=last_loaded_key,
                                                    jvm_options=['-ea'],
                                                    check_clients=False)
        sleep(60)

        with TransactionalLoading(self,
                                  ignite=self.clusters[0].grid,
                                  config_file=client_config,
                                  skip_consistency_check=True):
            for i in range(0, iterations):
                log_print(f'Current iteration {i + 1} from {iterations}', color='debug')

                self.clusters[0].grid.kill_node(2)

                utility_baseline_log = 'control-utility-baseline.log'
                self.clusters[0].grid.cu.set_current_topology_as_baseline(background=True,
                                                                          log=utility_baseline_log)

                self.clusters[0].grid.start_node(2, skip_topology_check=True)
                self.clusters[0].grid.wait_for_topology_snapshot(server_num=6)
                self.clusters[0].grid.update_started_node_status(2)

                self.clusters[0].grid.cu.set_current_topology_as_baseline(background=True,
                                                                          log=utility_baseline_log)

            self.verify_cluster(0, nodes_before, last_loaded_key)

def initialize_config(self):
    self.historical_rebalance = self.config.get('historical_rebalance', False)
    self.metrics_idle = self.config.get('metrics_idle', 30)
    self.with_loading = self.config.get('with_loading', False)
    self.idle_verify = is_enabled(self.config.get('idle_verify'))
    self.load_type = self.config.get('load_type', DEFAULT_LOAD_TYPE)
    self.single_cache = 'single_cache' in self.config
    self.parts_distribution = self.config.get('partition_distribution', None)
    self.with_no_rebalance_cache = self.config.get('with_no_rebalance_cache', False)
    self.jfr_settings = self.config.get('jfr_settings', None)

    if self.with_loading and self.historical_rebalance:
        log_print('Historical rebalance with loading is not supported. Skipping loading.')
        self.with_loading = False
    if self.idle_verify and self.with_loading:
        log_print('Skipping idle_verify parameter because with_loading is used', color='yellow')

def calc_checksums_on_client(piclient, start_key=0, end_key=1000, dict_mode=False):
    """
    Calculate checksums based on piclient

    :param piclient: piclient session used to reach the caches
    :param start_key: start key
    :param end_key: end key
    :param dict_mode: return a per-cache dict instead of a single concatenated string
    :return:
    """
    log_print("Calculating checksums using cache.get() from client")
    cache_operation = {}
    cache_checksum = {}

    sorted_cache_names = []
    for cache_name in piclient.get_ignite().cacheNames().toArray():
        sorted_cache_names.append(cache_name)
    sorted_cache_names.sort()

    async_operations = []
    for cache_name in sorted_cache_names:
        async_operation = create_async_operation(create_checksum_operation,
                                                 cache_name,
                                                 start_key,
                                                 end_key,
                                                 gateway=piclient.get_gateway())
        async_operations.append(async_operation)
        cache_operation[async_operation] = cache_name
        async_operation.evaluate()

    checksums = ''
    for async_operation in async_operations:
        result = str(async_operation.getResult())
        cache_checksum[cache_operation.get(async_operation)] = result
        checksums += result

    log_print('Calculating checksums done')

    if dict_mode:
        return cache_checksum
    else:
        return checksums

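# Minimal, framework-free sketch of the aggregation pattern used above: cache names
# are sorted so the concatenated checksum string is deterministic, and dict_mode
# chooses between a per-cache dict and a single combined string. The checksum
# function here is a stand-in, not the real create_checksum_operation.
def _aggregate_checksums_example(dict_mode=False):
    import hashlib

    def fake_checksum(cache_data):
        return hashlib.md5(repr(sorted(cache_data.items())).encode()).hexdigest()

    caches = {'cache_b': {1: 'x'}, 'cache_a': {1: 'y'}}
    cache_checksum = {}
    checksums = ''
    for cache_name in sorted(caches):
        result = fake_checksum(caches[cache_name])
        cache_checksum[cache_name] = result
        checksums += result
    return cache_checksum if dict_mode else checksums
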
def increment_atomic(ignite):
    atomic_name = None
    atomic = None
    try:
        log_print("Incrementing atomics")
        for c in range(0, 10):
            atomic_name = "nodeId_%s" % c
            atomic = ignite.atomicLong(atomic_name, 0, True)
            for j in range(0, 50):
                atomic.incrementAndGet()
    except Exception:
        log_print("Failed to increment atomics: assuming the atomic was removed, recreating it", color='red')
        if atomic:
            ignite.atomicLong(atomic_name, 100, True)

def start_nodes(self):
    start_command = {}
    pids = {}
    nodes_to_start = []

    for node_idx, node in self.nodes.items():
        self.rotate_node_log(node_idx)
        nodes_to_start.append(node_idx)
        host = node['host']
        if host not in start_command:
            start_command[host] = []
        start_command[host].extend(self.get_node_start_commands(node_idx))
        pids[node_idx] = len(start_command[host]) - 1
        node['status'] = NodeStatus.STARTING
    log_print(f"Start {self.name.title()} node(s): {nodes_to_start}")

    result = self.ssh.exec(start_command)

    for node_idx, node in self.nodes.items():
        host = node['host']
        try:
            node['PID'] = int(result[host][pids[node_idx]].strip())
            if not node['PID']:
                raise ValueError(f'no PID for node {node_idx}')
        except (ValueError, IndexError, KeyError):
            raise TidenException(f"Can't start {self.name.title()} node {node_idx} at host {host}")

    check_command = {}
    status = {}

    for node_idx, node in self.nodes.items():
        host = node['host']
        if host not in check_command:
            check_command[host] = []
        check_command[host].extend(self.get_node_check_commands(node_idx))
        status[node_idx] = len(check_command[host]) - 1

    result = self.ssh.exec(check_command)

    for node_idx, node in self.nodes.items():
        host = node['host']
        try:
            if result[host][status[node_idx]]:
                node['status'] = NodeStatus.STARTED
        except (IndexError, ValueError, KeyError):
            raise TidenException(f"Can't check that {self.name.title()} node {node_idx} started at host {host}")
        log_print(f"{self.name.title()} node {node_idx} started on {host} with PID {node['PID']}")

def verify_no_assertion_errors(self, ignite):
    assertion_errors = ignite.find_exception_in_logs(".*java.lang.AssertionError.*")

    # remove assertions from ignite.nodes to prevent massive output
    for node_id in ignite.nodes.keys():
        if 'exception' in ignite.nodes[node_id] and ignite.nodes[node_id]['exception'] != '':
            log_print("AssertionError found on node %s, text: %s"
                      % (node_id, ignite.nodes[node_id]['exception'][:100]),
                      color='red')
            ignite.nodes[node_id]['exception'] = ''

    if assertion_errors != 0:
        for node_id in ignite.get_alive_default_nodes():
            self.util_get_threads_from_jstack(ignite, node_id)

        assert False, "AssertionErrors found in server logs! Count %d" % assertion_errors