def _check_failed_number(self, failed_nodes, survived_node):
    """
    Verify that a surviving node logged the expected number of node-failure events.

    The survivor's log file is searched with ``grep`` for the generic node-failed
    event pattern, and the number of matching lines is compared with
    ``len(failed_nodes)``.

    :param failed_nodes: Nodes that were failed; only the size of this collection is used.
    :param survived_node: Node whose log file is inspected and on which the commands are executed.
    :raises AssertionError: If the number of matching log lines differs from the
        number of failed nodes. The matching lines are logged as a warning first.
    """
    count_cmd = "grep '%s' %s | wc -l" % (node_failed_event_pattern(), survived_node.log_file)
    failed_cnt = int(IgniteAwareService.exec_command(survived_node, count_cmd))

    # The survivor id is looked up once, before the comparison, and is only
    # used in the diagnostic message below.
    surv_id = IgniteApplicationService.node_id(survived_node)

    expected_cnt = len(failed_nodes)
    if failed_cnt == expected_cnt:
        return

    # Mismatch: fetch the matching log lines themselves so the warning shows
    # what the survivor actually detected.
    list_cmd = "grep '%s' %s" % (node_failed_event_pattern(), survived_node.log_file)
    failed = IgniteAwareService.exec_command(survived_node, list_cmd)

    warning_args = (survived_node.name, surv_id, os.linesep, failed)
    self.logger.warn("Node '%s' (%s) has detected the following failures:%s%s" % warning_args)

    raise AssertionError(
        "Wrong number of failed nodes: %d. Expected: %d. Check the logs." % (failed_cnt, expected_cnt))
def _simulate_and_detect_failure(self, servers, failed_nodes, event_timeout_sec,
                                 net_part: IgniteAwareService.NetPart):
    """
    Run the node failure scenario and collect failure-detection timings.

    Steps, in order:

    1. Resolve and log the id of every node in ``failed_nodes``.
    2. Call ``servers.drop_network`` for those nodes.
    3. On every node of ``servers.nodes`` that is not in ``failed_nodes``, get the
       time of the node-failed event for each failed id, then check the total
       number of failures that node logged.
    4. Run the segmentation check via ``self._check_not_segmented``.

    :param servers: Service object providing ``nodes``, ``node_id``, ``drop_network``
        and ``get_event_time_on_node``.
    :param failed_nodes: Nodes to fail.
    :param event_timeout_sec: Passed as ``timeout`` to ``servers.get_event_time_on_node``.
    :param net_part: Passed through to ``servers.drop_network``.
    :return: Dict with the keys ``'Detection of node(s) failure (ms)'`` (delay of the
        latest logged event), ``'All detection delays (ms):'`` (stringified list of all
        delays, latest first) and ``'Nodes failed'`` (number of failed nodes).
    """
    ids_to_wait = []
    for failed in failed_nodes:
        failed_id = servers.node_id(failed)
        ids_to_wait.append(failed_id)

        self.logger.info("Simulating failure of node '%s' (ID: %s)." % (failed.name, failed_id))

    # Only the second element of the result is used as the reference point for
    # the delays; presumably it is the moment the first node was cut off
    # (TODO confirm against drop_network).
    _, first_terminated = servers.drop_network(failed_nodes, net_part=net_part)

    # The survivor list is built before anything is awaited on it.
    survivors = [candidate for candidate in servers.nodes if candidate not in failed_nodes]

    # Times of the node-failed events logged across all survivors.
    event_times = []
    for survivor in survivors:
        event_times.extend(
            servers.get_event_time_on_node(
                survivor, node_failed_event_pattern(failed_id), timeout=event_timeout_sec)
            for failed_id in ids_to_wait)

        self._check_failed_number(failed_nodes, survivor)

    self._check_not_segmented(failed_nodes)

    # Latest event first, so index 0 holds the slowest detection.
    latest_first = sorted(event_times, reverse=True)

    return {
        'Detection of node(s) failure (ms)':
            epoch_mills(latest_first[0]) - epoch_mills(first_terminated),
        'All detection delays (ms):':
            str([epoch_mills(moment) - epoch_mills(first_terminated) for moment in latest_first]),
        'Nodes failed': len(failed_nodes),
    }