def stop(self, **kwargs):
    """
    Stop the background probe: signal all probe threads through the shared lock
    file, collect their results, tear down the JMX utility and the executor.

    NB! Do not move fut.results vs jmx node killing. This may lead to py4j
    errors in log.
    """
    super().stop()
    # write 1 into lock file to stop all probes (probe threads poll this file)
    with open('lock', 'w') as f:
        f.write('1')
    # get result from process threads (this code guarantee that jmx is not usable anymore)
    # NOTE(review): fixed 10s grace period for threads to notice the lock flag — confirm sufficient
    sleep(10)
    for name, fut in self.futures.items():
        if fut.exception():
            log_print("Failed to evaluate probe %s, %s" % (name, fut.exception()), color='red')
        else:
            # store the probe payload under the currently tested version
            self.results[self.version][name] = fut.result()
    # kill utility using ignite api (jmx gateway already down in probe thread)
    if self.jmx:
        self.jmx.kill_utility()
        # self.ignite.kill_node(self.jmx.node_id)
    # shutdown executor
    self.executor.shutdown(wait=True)
    # remove lock file
    os.remove('lock')
def check_single_artifact(self):
    """
    Detect the degenerate case where fewer than two versions produced results,
    so there is nothing to compare.

    :return: True when the run auto-passes (0 or 1 result), False otherwise
    """
    if len(self.results) >= 2:
        return False
    log_print('There is 0 or 1 result for time probe')
    self.result_passed = True
    return True
def stop(self, **kwargs):
    """
    Store the speed sample reported by the test into the current version's
    results; skip the sample entirely when the test did not report one.

    :param kwargs: expected to contain 'speed' — the measured sample to store
    """
    if 'speed' not in kwargs:
        log_print(
            'Unable to get time specific results from test: no such property "speed"',
            color='red')
        return
    super().stop()
    # NOTE(review): '+=' implies self.results[self.version] and kwargs['speed']
    # share a concatenable type (e.g. list) — confirm against the caller
    self.results[self.version] += kwargs.get('speed')
def is_passed(self, **kwargs):
    """
    Formula: (base_version.total_time - observable_version.total_time) / base_version.total_time
    I.e. difference between base and observable version in percentage

    :return: is verification passed
    """
    if self.check_single_artifact():
        return True
    verdict = True
    for version in self.results:
        if version == self.base_version:
            continue
        base_avg = avg(self.results[self.base_version])
        other_avg = avg(self.results[version])
        time_diff = percentage_diff(base_avg, other_avg)
        low_bound = float(self.probe_config.get('diff')[0])
        high_bound = float(self.probe_config.get('diff')[1])
        verdict = low_bound < time_diff < high_bound
        # report raw values when configured in seconds, otherwise convert ms -> s
        base_shown = base_avg if self.seconds else int(base_avg / 1000)
        other_shown = other_avg if self.seconds else int(other_avg / 1000)
        self.result_message = 'Time run difference: {:.2f}, {} (base) - {} sec, {} - {} sec'.format(
            time_diff, self.base_version, base_shown, version, other_shown)
        log_print(self.result_message, color=self.get_out_color(verdict, **kwargs))
    self.result_passed = verdict
    return verdict
def is_passed(self, **kwargs):
    """
    Formula: (base_version.end_size - observable_version.end_size) / base_version.end_size
    I.e. difference between base and observable version in percentage

    :return: is verification passed
    """
    if self.check_single_artifact():
        return True
    is_passed = False
    for version in self.results.keys():
        if version == self.base_version:
            continue
        base_result = avg(self.results[self.base_version])
        # BUG FIX: the comparable side previously read self.results[self.base_version],
        # comparing base against itself so the diff was always 0 — use the
        # observable version's results instead
        comparable_result = avg(self.results[version])
        avg_snapshot_size_diff = percentage_diff(base_result, comparable_result)
        # acceptance window comes from probe config: diff = [min, max]
        min_value = float(self.probe_config.get('diff')[0])
        max_value = float(self.probe_config.get('diff')[1])
        is_passed = min_value < avg_snapshot_size_diff < max_value
        self.result_message = \
            'Average snapshot directory size difference: {:.2f}. {} (base) - {}, {} - {}'.format(
                avg_snapshot_size_diff, self.base_version, convert_size(base_result),
                version, convert_size(comparable_result))
        log_print(self.result_message, color=self.get_out_color(is_passed, **kwargs))
    self.result_passed = is_passed
    return is_passed
def is_passed(self, **kwargs):
    """
    Formula: (base_version.end_size - observable_version.end_size) / base_version.end_size
    I.e. difference between base and observable version in percentage

    :return: is verification passed
    """
    if self.check_single_artifact():
        return True
    verdict = True
    for version in self.results.keys():
        if version == self.base_version:
            continue
        base_end = self.results[self.base_version]['end']
        other_end = self.results[version]['end']
        db_total_size = percentage_diff(base_end, other_end)
        low_bound = float(self.probe_config.get('diff')[0])
        high_bound = float(self.probe_config.get('diff')[1])
        verdict = low_bound < db_total_size < high_bound
        self.result_message = \
            'Sub of /db folder size difference: {:.2f}. {} (base) - {}, {} - {}' \
            .format(db_total_size, self.base_version, convert_size(base_end),
                    version, convert_size(other_end))
        log_print(self.result_message, color=self.get_out_color(verdict, **kwargs))
    self.result_passed = verdict
    return verdict
def is_passed(self, **kwargs):
    """
    Formula: (observable_version.total_time - base_version.total_time) / base_version.total_time
    I.e. difference between base and observable version in percentage

    :return: is verification passed
    """
    if self.check_single_artifact():
        return True
    is_passed = True
    for version in self.results:
        if version == self.base_version:
            continue
        base_version_result = avg(self.results[self.base_version])
        comparable_version_result = avg(self.results[version])
        # rotate difference - in this case bigger is better
        # NOTE(review): arguments are passed in the same (base, comparable) order
        # as the time probe despite the "rotate" comment above — confirm that
        # percentage_diff's sign convention matches the docstring formula
        speed_diff = percentage_diff(base_version_result, comparable_version_result)
        # acceptance window from probe config: diff = [min, max]
        min_value = float(self.probe_config.get('diff')[0])
        max_value = float(self.probe_config.get('diff')[1])
        is_passed = min_value < speed_diff < max_value
        self.result_message = 'Rebalance speed difference: {:.2f}, {} (base) - {}/sec, {} - {}/sec'.format(
            speed_diff, self.base_version, convert_size(base_version_result, start_byte=True),
            version, convert_size(comparable_version_result, start_byte=True))
        log_print(self.result_message, color=self.get_out_color(is_passed, **kwargs))
    self.result_passed = is_passed
    return is_passed
def jmx_thread_method(nodes_to_monitor, ignite_nodes, jmx_node_id, metrics_to_collect, timeout=5): """ probe thread that collects JMX metrics from specified nodes Uses mocked JMXUtility (does not start new instance, just use existing methods) We need to pass nothing to create this instance just override nodes, gateway and service :param nodes_to_monitor: nodes that we want to monitor (server nodes in this example) :param ignite_nodes: nodes from current Ignite app (need to get PID) :param jmx_node_id: jmx node id from tiden.ignite.nodes :param metrics_to_collect: {'attr': {'grp': 'Group', 'bean': 'Bean', 'attribute': 'Attr'}, ...} :param timeout: timeout to collect metrics :return: collected results ('default' python type) """ # Close connections and shutdown gateway properly jmx_metric = {} jmx = None try: jmx = JmxUtility() jmx.initialize_manually(jmx_node_id, ignite_nodes) with open('lock', 'r') as f: while True: if f.read(1) == '1': log_print("Background probe JMX has been interrupted") break current_time = get_current_time() for node_idx in nodes_to_monitor: if current_time not in jmx_metric: jmx_metric[current_time] = {} if node_idx not in jmx_metric[current_time]: jmx_metric[current_time][node_idx] = {} for name, metric in metrics_to_collect.items(): try: string_value = \ jmx.get_attributes(node_idx, metric['grp'], metric['bean'], metric['attribute'], )[metric['attribute']] if metric['type'] == 'int': jmx_metric[current_time][node_idx][ name] = int(string_value) else: jmx_metric[current_time][node_idx][ name] = string_value except Exception: jmx_metric[current_time][node_idx][name] = None sleep(timeout) except Exception: log_print(traceback.format_exc()) finally: # Close connections and shutdown gateway properly if jmx: jmx.kill_manually() return jmx_metric
def heap_thread_method(nodes_to_monitor, ignite_nodes, ssh_config, timeout=5):
    """
    probe thread that collects JVM Heap usage from nodes_to_monitor

    Command to collect: "jcmd PID GC.class_histogram"
    This command prints following text:
    "PID:
     1. JAVA_OBJECT_NUM JAVA_OBJECT_SIZE JAVA_OBJECT_NAME
     ...
     N. Total TOTAL_OBJECTS TOTAL_OBJECTS_SIZE"
    So we need to collect PID (to match it to node) and TOTAL_OBJECTS_SIZE from that output.

    :param nodes_to_monitor: nodes that we want to monitor (server nodes in this example)
    :param ignite_nodes: nodes from current Ignite app (need to get PID)
    :param ssh_config: config['ssh_config'] from tiden config (Need to initialize SshPool)
    :param timeout: delay between collection rounds, seconds
    :return: collected results ('default' python type), {timestamp: {node_id: heap_bytes}}
    """
    ssh = SshPool(ssh_config)
    ssh.connect()
    # NOTE(review): the SshPool is never explicitly closed/disconnected here — confirm
    # whether SshPool cleans up on GC or needs a teardown call
    heap_result = {}
    try:
        # polling loop: stop() writes '1' into the shared 'lock' file to end the probe
        with open('lock', 'r') as f:
            while True:
                if f.read(1) == '1':
                    log_print("Background probe HEAP has been interrupted")
                    break
                commands = {}
                node_ids_to_pid = {}
                for node_ids in nodes_to_monitor:
                    node_ids_to_pid[node_ids] = ignite_nodes[node_ids][
                        'PID']
                # group one jcmd invocation per monitored PID, keyed by host
                for node_idx in nodes_to_monitor:
                    host = ignite_nodes[node_idx]['host']
                    if commands.get(host) is None:
                        commands[host] = [
                            'jcmd %s GC.class_histogram' %
                            ignite_nodes[node_idx]['PID']
                        ]
                    else:
                        commands[host].append(
                            'jcmd %s GC.class_histogram' %
                            ignite_nodes[node_idx]['PID'])
                results = ssh.exec(commands)
                results_parsed = {}
                for host in results.keys():
                    # NOTE(review): only the first command's output per host is parsed
                    # ([0]) even though several PIDs may map to one host — confirm intended
                    result = results[host][0]
                    findall = re.compile(
                        '(\d+):\n|Total\s+\d+\s+(\d+)').findall(result)
                    # findall will return 2d array: [['PID', ''], [''] ['TOTAL_HEAP_USAGE']]
                    # todo maybe there is a better way to get this
                    if findall:
                        node_id = 0
                        # NOTE(review): the loop variable shadows the outer node_id and
                        # 'node_id = node_id' is a no-op; when no PID matches, node_id ends
                        # up as the last key iterated, not 0 — confirm this is intended
                        for node_id, pid in node_ids_to_pid.items():
                            if pid == int(findall[0][0]):
                                node_id = node_id
                                break
                        try:
                            results_parsed[node_id] = (int(findall[1][1]))
                        except Exception:
                            # unparsable total -> record 0 rather than aborting the probe
                            results_parsed[node_id] = 0
                    else:
                        continue
                heap_result[get_current_time()] = results_parsed
                sleep(timeout)
    except Exception:
        log_print(traceback.format_exc())
    return heap_result
def cpu_mem_thread_method(nodes_to_monitor, ignite_nodes, ssh_config, timeout=5):
    """
    probe thread that collects cpu,mem from nodes_to_monitor

    Command to collect: "ps -p PID -o pid,%%cpu,%%mem"

    :param nodes_to_monitor: nodes that we want to monitor (server nodes in this example)
    :param ignite_nodes: nodes from current Ignite app (need to get PID)
    :param ssh_config: config['ssh_config'] from tiden config (Need to initialize SshPool)
    :param timeout: timeout between data collect
    :return: collected results ('default' python type),
             {timestamp: {node_id: (cpu_percent, mem_percent)}}
    """
    ssh = SshPool(ssh_config)
    ssh.connect()
    # NOTE(review): unlike the heap probe, this loop has no try/except guard —
    # any ssh/parse error will propagate to the future and abort the probe
    cpu_mem_result = {}
    # polling loop: stop() writes '1' into the shared 'lock' file to end the probe
    with open('lock', 'r') as f:
        while True:
            if f.read(1) == '1':
                log_print("Background probe CPU has been interrupted")
                break
            commands = {}
            node_ids_to_pid = {}
            for node_ids in nodes_to_monitor:
                node_ids_to_pid[node_ids] = ignite_nodes[node_ids]['PID']
            # group one ps invocation per monitored PID, keyed by host
            for node_idx in nodes_to_monitor:
                host = ignite_nodes[node_idx]['host']
                if commands.get(host) is None:
                    commands[host] = [
                        'ps -p %s -o pid,%%cpu,%%mem' %
                        ignite_nodes[node_idx]['PID']
                    ]
                else:
                    commands[host].append('ps -p %s -o pid,%%cpu,%%mem' %
                                          ignite_nodes[node_idx]['PID'])
            results = ssh.exec(commands)
            results_parsed = {}
            for host in results.keys():
                # NOTE(review): only the first command's output per host is parsed ([0])
                # even though several PIDs may map to one host — confirm intended
                result = results[host][0]
                # groups: (pid, %cpu, %mem); note '.' in the pattern is unescaped,
                # so it matches any character, not only a decimal point
                search = re.search('(\d+)\s+?(\d+.?\d?)\s+?(\d+.?\d?)',
                                   result)
                if search:
                    node_id = 0
                    # NOTE(review): loop variable shadows node_id and
                    # 'node_id = node_id' is a no-op; when no PID matches, node_id is
                    # the last key iterated, not 0 — confirm this is intended
                    for node_id, pid in node_ids_to_pid.items():
                        if pid == int(search.group(1)):
                            node_id = node_id
                            break
                    results_parsed[node_id] = (float(search.group(2)),
                                               float(search.group(3)))
                else:
                    continue
            cpu_mem_result[get_current_time()] = results_parsed
            sleep(timeout)
    return cpu_mem_result
def print_results(self):
    """Log all accumulated results of the db size probe."""
    message = 'Results for db size probe: {}'.format(self.results)
    log_print(message)
def print_results(self):
    """Log all accumulated results of the checkpointLockHoldTime probe."""
    message = 'Results for checkpointLockHoldTime probe: {}'.format(self.results)
    log_print(message)
def print_pre_259_start_times(self):
    """
    Parse node logs for pre-2.5.9 restart/recovery phases and print a per-node
    report: whether the node stopped mid-checkpoint, binary recovery time,
    partition-state restore, WAL replay and memory-changes replay timings.
    """
    # per-node flag: was the node killed in the middle of a checkpoint?
    stopped_in_cp = {}
    for node, logs_content in self.collect_specific_text_from_logs(
            '.*Ignite node stopped in the middle of checkpoint.').items():
        stopped_in_cp[
            node] = 'Ignite node stopped in the middle of checkpoint.' in logs_content
    # each collect_events call: (regex with capture groups, pre-filtered log lines)
    binary_recovery_time = self.collect_events(
        '.*performed in (\d+) ms',
        self.collect_specific_text_from_logs(
            'Binary recovery performed in'))
    part_states = self.collect_events(
        '.*partitionsProcessed=(\d+), time=(\d+)ms]',
        self.collect_specific_text_from_logs(
            'Finished restoring partition state for local groups'))
    wal_changes = self.collect_events(
        '.*updatesApplied=(\d+), time=(\d+) ms',
        self.collect_specific_text_from_logs(
            'Finished applying WAL changes'))
    memory_changes = self.collect_events(
        '.*changesApplied=(\d+), time=(\d+) ms',
        self.collect_specific_text_from_logs(
            'Finished applying memory changes'))
    # dump raw collected data first for debugging
    log_print(stopped_in_cp, color='yellow')
    log_print(binary_recovery_time, color='yellow')
    log_print(part_states, color='yellow')
    log_print(wal_changes, color='yellow')
    log_print(memory_changes, color='yellow')
    for node in stopped_in_cp.keys():
        # NOTE(review): node == 1 is assumed to be the coordinator — confirm
        log_print("Report for node %s %s" %
                  (node, '(coordinator)' if node == 1 else ''),
                  color='green')
        if stopped_in_cp[node]:
            log_print("Stopped in the middle of checkpoint", color='green')
        else:
            log_print("Stopped gracefully", color='green')
        log_print("Binary was restored in %s ms" %
                  binary_recovery_time[node][0],
                  color='green')
        # event tuples look like (count, time_ms); guard against divide-by-zero
        part_stated_time = part_states[node][1]
        part_stated_speed = (
            part_states[node][1] /
            part_states[node][0]) if part_states[node][0] != 0 else 0
        log_print(
            "Partitions recovery time = %s ms, (time / partitions) = %.2f" %
            (part_stated_time, part_stated_speed),
            color='green')
        wal_changes_time = wal_changes[node][1]
        wal_changes_speed = wal_changes[node][0]
        log_print("Applying WAL changes time = %s ms, updates = %s" %
                  (wal_changes_time, wal_changes_speed),
                  color='green')
        # memory-changes message is optional in the logs
        if node in memory_changes:
            memory_changes_time = memory_changes[node][1]
            memory_changes_speed = (memory_changes[node][1] / memory_changes[node][0]) \
                if memory_changes[node][0] != 0 else 0
            log_print(
                "Applying memory changes time = %s ms, (time / changes) = %.2f" %
                (memory_changes_time, memory_changes_speed),
                color='green')
        else:
            log_print("There is no memory changes log message found",
                      color='green')