예제 #1
0
파일: network.py 프로젝트: eddyashton/CCF
    def join_node(self,
                  node,
                  lib_name,
                  args,
                  target_node=None,
                  timeout=JOIN_TIMEOUT,
                  **kwargs):
        """Add ``node`` to the network and wait for it to appear in the store.

        The node is first added through ``self._add_node``. The consortium is
        then asked to wait, against the current primary, until the node is
        recorded with status ``PENDING`` (when the service status is ``OPEN``)
        or ``TRUSTED`` (otherwise).

        Raises:
            CodeIdNotFound: the wait timed out and the errors returned by
                ``node.stop()`` mention an unknown enclave measurement.
            StartupSnapshotIsOld: the wait timed out and the errors returned
                by ``node.stop()`` mention ``StartupSnapshotIsOld``.
            TimeoutError: the wait timed out and no known error was found.
        """
        self._add_node(node, lib_name, args, target_node, **kwargs)

        primary, _ = self.find_primary()
        try:
            if self.status == ServiceStatus.OPEN:
                expected_status = NodeStatus.PENDING
            else:
                expected_status = NodeStatus.TRUSTED
            self.consortium.wait_for_node_to_exist_in_store(
                primary,
                node.node_id,
                timeout=timeout,
                node_status=expected_status,
            )
        except TimeoutError as e:
            LOG.error(
                f"New pending node {node.node_id} failed to join the network")
            errors, _ = node.stop()
            self.nodes.remove(node)

            # Throw accurate exceptions if known errors are found in the
            # errors reported by the stopped node; markers are checked in
            # this order for each reported error.
            known_failures = (
                ("Quote does not contain known enclave measurement",
                 CodeIdNotFound),
                ("StartupSnapshotIsOld", StartupSnapshotIsOld),
            )
            for error in errors or ():
                for marker, failure in known_failures:
                    if marker in error:
                        raise failure from e

            # No known cause: propagate the original timeout
            raise
예제 #2
0
파일: recovery.py 프로젝트: lynshi/CCF
def test_recover_service(network, args, from_snapshot=False):
    """Stop ``network`` node by node, then start and recover a new network.

    The service identity is saved first. When ``from_snapshot`` is true, the
    committed snapshots of the old primary are passed to the recovered
    network. The ledger directories always come from the old primary.

    Returns:
        The recovered ``infra.network.Network`` instance.
    """
    network.save_service_identity(args)
    old_primary, _ = network.find_primary()

    snapshots_dir = (network.get_committed_snapshots(old_primary)
                     if from_snapshot else None)

    # Start health watcher and stop nodes one by one until a recovery has to
    # be staged
    health_watcher = infra.health_watcher.NetworkHealthWatcher(
        network, args, verbose=True)
    health_watcher.start()

    for joined_node in network.get_joined_nodes():
        # Pause for one election timeout (converted from ms) before each stop
        time.sleep(args.election_timeout_ms / 1000)
        joined_node.stop()

    health_watcher.wait_for_recovery()

    # Stop remaining nodes
    network.stop_all_nodes()

    ledger_dir, committed_dirs = old_primary.get_ledger()

    recovered = infra.network.Network(
        args.nodes,
        args.binary_dir,
        args.debug_nodes,
        args.perf_nodes,
        existing_network=network,
    )
    recovered.start_in_recovery(
        args,
        ledger_dir=ledger_dir,
        committed_ledger_dirs=committed_dirs,
        snapshots_dir=snapshots_dir,
    )
    recovered.recover(args)

    return recovered
예제 #3
0
    def stop_all_nodes(self):
        """Stop every node in ``self.nodes`` and report fatal shutdown errors.

        Raises:
            NodeShutdownError: at least one node reported fatal errors on
                stop and ``self.ignoring_shutdown_errors`` is falsy.
        """
        saw_fatal_error = False
        for current in self.nodes:
            # Every node is stopped, even after a fatal error has been seen
            _, fatal_errors = current.stop()
            saw_fatal_error |= bool(fatal_errors)

        LOG.info("All nodes stopped...")

        if not saw_fatal_error:
            return
        if not self.ignoring_shutdown_errors:
            raise NodeShutdownError("Fatal error found during node shutdown")
        LOG.warning("Ignoring shutdown errors")
예제 #4
0
파일: network.py 프로젝트: eddyashton/CCF
    def trust_node(self,
                   node,
                   args,
                   valid_from=None,
                   validity_period_days=None,
                   no_wait=False):
        """Trust ``node`` via the consortium and mark it as joined.

        When the service status is ``OPEN``, the consortium is asked to trust
        the node; ``valid_from`` defaults to the current time converted with
        ``infra.crypto.datetime_to_X509time``. Unless ``no_wait`` is set, the
        call then waits for the node to join and, at the end, for all nodes
        to commit.

        Raises:
            ValueError, TimeoutError: re-raised after logging and stopping
                the node if trusting or joining fails.
        """

        def join_timeout():
            # Twice the join timer, rounded up; the division by 1000
            # presumably converts milliseconds to seconds - TODO confirm
            return ceil(args.join_timer * 2 / 1000)

        wait_for_join = not no_wait
        primary, _ = self.find_primary()
        try:
            if self.status is ServiceStatus.OPEN:
                if not valid_from:
                    valid_from = str(
                        infra.crypto.datetime_to_X509time(datetime.now()))
                self.consortium.trust_node(
                    primary,
                    node.node_id,
                    valid_from=valid_from,
                    validity_period_days=validity_period_days,
                    timeout=join_timeout(),
                )
            if wait_for_join:
                # Quote verification already ran when the node was added as
                # pending, so only the join timer is waited on here while the
                # joining node retrieves network secrets.
                node.wait_for_node_to_join(timeout=join_timeout())
        except (ValueError, TimeoutError):
            LOG.error(
                f"New trusted node {node.node_id} failed to join the network")
            node.stop()
            raise

        node.network_state = infra.node.NodeNetworkState.joined
        # Fall back to the maximum allowed validity when none was requested
        validity_days = (validity_period_days
                         or args.max_allowed_node_cert_validity_days)
        node.set_certificate_validity_period(valid_from, validity_days)
        if wait_for_join:
            self.wait_for_all_nodes_to_commit(primary=primary)
예제 #5
0
    def stop_all_nodes(self):
        """Stop every node and check committed ledger files for consistency.

        Steps, in order:

        1. If ``self.txs`` is set, verify the recorded transactions.
        2. Stop each node in ``self.nodes`` and record its committed ledger
           directory together with the highest committed end seqno found in
           that directory.
        3. Treat the node with the highest end seqno as the reference. Every
           committed ledger file on any other node must also exist on the
           reference node with an identical checksum.
        4. Report fatal shutdown errors.

        Raises:
            Exception: a ledger file is missing on the reference node, or
                its checksum differs from the reference copy.
            NodeShutdownError: at least one node reported fatal errors on
                stop and ``self.ignoring_shutdown_errors`` is falsy.
        """
        # Verify that all txs committed on the service can be read
        if self.txs is not None:
            self.txs.verify(self)

        fatal_error_found = False
        longest_ledger_seqno = 0
        most_up_to_date_node = None
        committed_ledger_dirs = {}

        for node in self.nodes:
            _, fatal_errors = node.stop()
            if fatal_errors:
                fatal_error_found = True

            # Find stopped node with longest ledger
            _, committed_ledger_dir = node.get_ledger(
                include_read_only_dirs=True)
            # The leading 0 keeps the result at 0 for an empty directory
            ledger_end_seqno = max([0] + [
                infra.node.get_committed_ledger_end_seqno(ledger_file)
                for ledger_file in os.listdir(committed_ledger_dir)
            ])

            if ledger_end_seqno > longest_ledger_seqno:
                longest_ledger_seqno = ledger_end_seqno
                most_up_to_date_node = node
            committed_ledger_dirs[node.node_id] = (
                committed_ledger_dir,
                ledger_end_seqno,
            )

        LOG.info("All nodes stopped")

        # Verify that all ledger files on stopped nodes exist on most
        # up-to-date node and are identical
        if most_up_to_date_node:
            reference_node_id = most_up_to_date_node.node_id
            longest_ledger_dir, _ = committed_ledger_dirs[reference_node_id]

            # List the reference directory once instead of once per file of
            # every other node, and hash each reference file at most once.
            reference_files = set(os.listdir(longest_ledger_dir))
            reference_checksums = {}

            for node_id, (committed_ledger_dir,
                          _) in committed_ledger_dirs.items():
                if node_id == reference_node_id:
                    continue
                for ledger_file in os.listdir(committed_ledger_dir):
                    if ledger_file not in reference_files:
                        raise Exception(
                            f"Ledger file on node {node_id} does not exist on most up-to-date node {most_up_to_date_node.node_id}: {ledger_file}"
                        )
                    if ledger_file not in reference_checksums:
                        reference_checksums[ledger_file] = (
                            infra.path.compute_file_checksum(
                                os.path.join(longest_ledger_dir,
                                             ledger_file)))
                    node_checksum = infra.path.compute_file_checksum(
                        os.path.join(committed_ledger_dir, ledger_file))
                    if reference_checksums[ledger_file] != node_checksum:
                        raise Exception(
                            f"Ledger file checksums between node {node_id} and most up-to-date node {most_up_to_date_node.node_id} did not match: {ledger_file}"
                        )

            LOG.success(
                f"Verified ledger files consistency on all {len(self.nodes)} stopped nodes"
            )

        if fatal_error_found:
            if self.ignoring_shutdown_errors:
                LOG.warning("Ignoring shutdown errors")
            else:
                raise NodeShutdownError(
                    "Fatal error found during node shutdown")
예제 #6
0
 def stop_all_nodes(self):
     """Stop each node in ``self.nodes``, then log that all remotes stopped.

     Whatever ``stop()`` returns is discarded.
     """
     for remote_node in self.nodes:
         remote_node.stop()
     LOG.info("All remotes stopped...")