Exemplo n.º 1
0
    def _check_failed_number(self, failed_nodes, survived_node):
        """Ensure the survivor logged the expected number of node failures.

        Counts the lines of ``survived_node.log_file`` that match
        ``node_failed_event_pattern()`` and compares that count with
        ``len(failed_nodes)``.

        :param failed_nodes: Nodes expected to be reported as failed; only
            the length of this collection is used here.
        :param survived_node: Node whose log file is inspected.
        :raises AssertionError: If the number of matching log lines differs
            from ``len(failed_nodes)``. The matching lines are logged as a
            warning before raising.
        """
        expected_cnt = len(failed_nodes)
        pattern = node_failed_event_pattern()
        log_file = survived_node.log_file

        # `wc -l` turns the grep output into the number of matching lines.
        count_cmd = "grep '%s' %s | wc -l" % (pattern, log_file)
        failed_cnt = int(
            IgniteAwareService.exec_command(survived_node, count_cmd))

        if failed_cnt == expected_cnt:
            return

        # The survivor id and the matching lines are needed only for the
        # failure report, so they are fetched on the mismatch path alone.
        surv_id = IgniteApplicationService.node_id(survived_node)
        failed = IgniteAwareService.exec_command(
            survived_node, "grep '%s' %s" % (pattern, log_file))

        self.logger.warning(
            "Node '%s' (%s) has detected the following failures:%s%s" %
            (survived_node.name, surv_id, os.linesep, failed))

        raise AssertionError(
            "Wrong number of failed nodes: %d. Expected: %d. Check the logs."
            % (failed_cnt, expected_cnt))
Exemplo n.º 2
0
    def _simulate_and_detect_failure(self, servers, failed_nodes,
                                     event_timeout_sec,
                                     net_part: IgniteAwareService.NetPart):
        """
        Drop the network of ``failed_nodes`` and collect failure detection
        delays from the remaining nodes.

        Steps, in order:

        1. Read the id of every node in ``failed_nodes`` via
           ``servers.node_id`` and log the simulated failure.
        2. Call ``servers.drop_network`` for ``failed_nodes``.
        3. For every node of ``servers.nodes`` that is not in
           ``failed_nodes``, fetch the time of the failure event of each
           failed id and then run ``_check_failed_number`` on that node.
        4. Run ``_check_not_segmented`` for ``failed_nodes``.

        :param servers: Service providing ``nodes``, ``node_id``,
            ``drop_network`` and ``get_event_time_on_node``.
        :param failed_nodes: Nodes whose failure is simulated.
        :param event_timeout_sec: Passed as ``timeout`` to
            ``servers.get_event_time_on_node``.
        :param net_part: Passed as ``net_part`` to ``servers.drop_network``.
        :return: Dict with the keys ``'Detection of node(s) failure (ms)'``
            (greatest collected timestamp minus ``first_terminated``, both
            converted with ``epoch_mills``), ``'All detection delays (ms):'``
            (string form of the list of all such differences, greatest
            timestamp first) and ``'Nodes failed'`` (``len(failed_nodes)``).
        :raises IndexError: If no timestamps were collected, i.e. there are
            no surviving nodes or ``failed_nodes`` is empty (provided the
            preceding calls did not raise first).
        """
        failed_ids = []

        for failed_node in failed_nodes:
            failed_id = servers.node_id(failed_node)
            failed_ids.append(failed_id)

            self.logger.info("Simulating failure of node '%s' (ID: %s)." %
                             (failed_node.name, failed_id))

        # Only the second element of the result is used here.
        _, first_terminated = servers.drop_network(failed_nodes,
                                                   net_part=net_part)

        survivors = [
            candidate for candidate in servers.nodes
            if candidate not in failed_nodes
        ]

        # Times at which survivors logged the failures.
        detection_times = []

        for survivor in survivors:
            detection_times.extend(
                servers.get_event_time_on_node(
                    survivor,
                    node_failed_event_pattern(failed_id),
                    timeout=event_timeout_sec)
                for failed_id in failed_ids)

            self._check_failed_number(failed_nodes, survivor)

        self._check_not_segmented(failed_nodes)

        # Greatest timestamp first, so index 0 is the last detection.
        newest_first = sorted(detection_times, reverse=True)

        return {
            'Detection of node(s) failure (ms)':
                epoch_mills(newest_first[0]) - epoch_mills(first_terminated),
            'All detection delays (ms):': str([
                epoch_mills(logged_at) - epoch_mills(first_terminated)
                for logged_at in newest_first
            ]),
            'Nodes failed': len(failed_nodes),
        }