# Example 1
def test_all_members(network, args):
    """
    Check that the /gov/members endpoint reports every member of the service,
    with matching certificate, status, member data, and (for recovery members)
    public encryption key — both on the current network and on a network
    started in recovery from the current one's ledger.

    Returns the recovered network.
    """

    def run_test_all_members(network):
        primary, _ = network.find_primary()

        with primary.client() as c:
            r = c.get("/gov/members")
            assert r.status_code == http.HTTPStatus.OK.value
            response_members = r.body.json()

        # Endpoint must report exactly the set of known members
        network_members = network.get_members()
        assert len(network_members) == len(response_members)

        for member in network_members:
            assert member.service_id in response_members
            response_details = response_members[member.service_id]
            assert response_details["cert"] == member.cert
            assert (
                infra.member.MemberStatus(response_details["status"])
                == member.status_code
            )
            assert response_details["member_data"] == member.member_data
            if member.is_recovery_member:
                # Use a context manager so the key file handle is closed
                # promptly (the original leaked the open file object)
                with open(
                    member.member_info["encryption_public_key_file"],
                    encoding="utf-8",
                ) as f:
                    recovery_enc_key = f.read()
                assert response_details["public_encryption_key"] == recovery_enc_key
            else:
                # Non-recovery members have no public encryption key
                assert response_details["public_encryption_key"] is None

    # Test on current network
    run_test_all_members(network)

    # Test on mid-recovery network
    network.save_service_identity(args)
    primary, _ = network.find_primary()
    network.stop_all_nodes()
    current_ledger_dir, committed_ledger_dirs = primary.get_ledger()
    # NB: Don't try to get snapshots, since there may not be any committed,
    # and we cannot wait for commit now that the node is stopped
    recovered_network = infra.network.Network(
        args.nodes,
        args.binary_dir,
        args.debug_nodes,
        args.perf_nodes,
        existing_network=network,
    )
    recovered_network.start_in_recovery(
        args,
        ledger_dir=current_ledger_dir,
        committed_ledger_dirs=committed_ledger_dirs,
    )
    run_test_all_members(recovered_network)
    recovered_network.recover(args)

    return recovered_network
# Example 2
def test_recover_service(network, args, from_snapshot=False):
    """
    Stop all nodes one by one (while a health watcher observes the service),
    then stage a disaster recovery from the old primary's ledger, optionally
    seeded with a committed snapshot.

    Returns the recovered network.
    """
    network.save_service_identity(args)
    old_primary, _ = network.find_primary()

    # Only fetch committed snapshots when asked to recover from one
    snapshots_dir = (
        network.get_committed_snapshots(old_primary) if from_snapshot else None
    )

    # Start health watcher and stop nodes one by one until a recovery has to be staged
    health_watcher = infra.health_watcher.NetworkHealthWatcher(
        network, args, verbose=True
    )
    health_watcher.start()

    for node in network.get_joined_nodes():
        # Give the remaining nodes a chance to elect before the next stop
        time.sleep(args.election_timeout_ms / 1000)
        node.stop()

    health_watcher.wait_for_recovery()

    # Stop remaining nodes
    network.stop_all_nodes()

    current_ledger_dir, committed_ledger_dirs = old_primary.get_ledger()

    recovered_network = infra.network.Network(
        args.nodes,
        args.binary_dir,
        args.debug_nodes,
        args.perf_nodes,
        existing_network=network,
    )
    recovered_network.start_in_recovery(
        args,
        ledger_dir=current_ledger_dir,
        committed_ledger_dirs=committed_ledger_dirs,
        snapshots_dir=snapshots_dir,
    )

    recovered_network.recover(args)

    return recovered_network
# Example 3
def run_ledger_compatibility_since_first(args, local_branch, use_snapshot):
    """
    Tests that a service from the very first LTS can be recovered
    to the next LTS, and so forth, until the version of the local checkout.

    The recovery process uses a snapshot if `use_snapshot` is True. Otherwise, the
    entire historical ledger is used.
    """

    LOG.info("Use snapshot: {}", use_snapshot)
    repo = infra.github.Repository()
    lts_releases = repo.get_lts_releases(local_branch)
    has_pre_2_rc7_ledger = False

    LOG.info(f"LTS releases: {[r[1] for r in lts_releases.items()]}")

    lts_versions = []

    # Add an empty entry to release to indicate local checkout
    # Note: dicts are ordered from Python3.7
    lts_releases[None] = None

    jwt_issuer = infra.jwt_issuer.JwtIssuer(
        "https://localhost", refresh_interval=args.jwt_key_refresh_interval_s)
    with jwt_issuer.start_openid_server():
        txs = app.LoggingTxs(jwt_issuer=jwt_issuer)
        for idx, (_, lts_release) in enumerate(lts_releases.items()):
            # The None sentinel appended above selects the local checkout branch
            if lts_release:
                version, install_path = repo.install_release(lts_release)
                lts_versions.append(version)
                set_js_args(args, install_path)
            else:
                version = args.ccf_version
                install_path = LOCAL_CHECKOUT_DIRECTORY
                get_new_constitution_for_install(args, install_path)

            binary_dir, library_dir = get_bin_and_lib_dirs_for_install_path(
                install_path)

            if not args.dry_run:
                network_args = {
                    "hosts": args.nodes,
                    "binary_dir": binary_dir,
                    "library_dir": library_dir,
                    "txs": txs,
                    "jwt_issuer": jwt_issuer,
                    "version": version,
                }
                if idx == 0:
                    LOG.info(f"Starting new service (version: {version})")
                    network = infra.network.Network(**network_args)
                    network.start_and_open(args)
                else:
                    # ledger_dir / committed_ledger_dirs / snapshots_dir were
                    # captured at the end of the previous loop iteration
                    LOG.info(f"Recovering service (new version: {version})")
                    network = infra.network.Network(**network_args,
                                                    existing_network=network)
                    network.start_in_recovery(
                        args,
                        ledger_dir,
                        committed_ledger_dirs,
                        snapshots_dir=snapshots_dir,
                    )
                    network.recover(args)

                nodes = network.get_joined_nodes()
                primary, _ = network.find_primary()

                # Verify that all nodes run the expected CCF version
                for node in nodes:
                    # Note: /node/version endpoint and custom certificate validity
                    # were added in 2.x
                    if not node.major_version or node.major_version > 1:
                        with node.client() as c:
                            r = c.get("/node/version")
                            expected_version = node.version or args.ccf_version
                            version = r.body.json()["ccf_version"]
                            assert (
                                r.body.json()["ccf_version"] ==
                                expected_version
                            ), f"Node version is not {expected_version}"
                        node.verify_certificate_validity_period()

                # Rollover JWKS so that new primary must read historical CA bundle table
                # and retrieve new keys via auto refresh
                jwt_issuer.refresh_keys()
                # Note: /gov/jwt_keys/all endpoint was added in 2.x
                primary, _ = network.find_nodes()
                if not primary.major_version or primary.major_version > 1:
                    jwt_issuer.wait_for_refresh(network)
                else:
                    # Pre-2.x nodes cannot be polled for refresh; wait a fixed delay
                    time.sleep(3)

                # Exercise the freshly recovered service (not applicable to the
                # initial service started at idx 0)
                if idx > 0:
                    test_new_service(
                        network,
                        args,
                        install_path,
                        binary_dir,
                        library_dir,
                        version,
                    )

                # We accept ledger chunk file differences during upgrades
                # from 1.x to 2.x post rc7 ledger. This is necessary because
                # the ledger files may not be chunked at the same interval
                # between those versions (see https://github.com/microsoft/ccf/issues/3613;
                # 1.x ledgers do not contain the header flags to synchronize ledger chunks).
                # This can go once 2.0 is released.
                current_version_past_2_rc7 = primary.version_after(
                    "ccf-2.0.0-rc7")
                has_pre_2_rc7_ledger = (not current_version_past_2_rc7
                                        or has_pre_2_rc7_ledger)
                is_ledger_chunk_breaking = (has_pre_2_rc7_ledger
                                            and current_version_past_2_rc7)

                snapshots_dir = (network.get_committed_snapshots(primary)
                                 if use_snapshot else None)

                network.stop_all_nodes(
                    skip_verification=True,
                    accept_ledger_diff=is_ledger_chunk_breaking,
                )
                # Capture state for the next iteration's recovery
                ledger_dir, committed_ledger_dirs = primary.get_ledger()
                network.save_service_identity(args)

                # Check that ledger and snapshots can be parsed
                ccf.ledger.Ledger(
                    committed_ledger_dirs).get_latest_public_state()
                if snapshots_dir:
                    for s in os.listdir(snapshots_dir):
                        with ccf.ledger.Snapshot(os.path.join(
                                snapshots_dir, s)) as snapshot:
                            snapshot.get_public_domain()

    return lts_versions
# Example 4
def test_recover_service_with_wrong_identity(network, args):
    """
    Attempt a disaster recovery with the wrong previous service identity
    (a user certificate) and check that it fails with the expected node
    error, then recover successfully with the correct identity.

    Returns the recovered network.
    """
    old_primary, _ = network.find_primary()

    snapshots_dir = network.get_committed_snapshots(old_primary)

    network.stop_all_nodes()

    # Remember the genuine identity so it can be restored for the second attempt
    network.save_service_identity(args)
    first_service_identity_file = args.previous_service_identity_file

    current_ledger_dir, committed_ledger_dirs = old_primary.get_ledger()

    # Attempt a recovery with the wrong previous service certificate

    args.previous_service_identity_file = network.consortium.user_cert_path("user0")

    broken_network = infra.network.Network(
        args.nodes,
        args.binary_dir,
        args.debug_nodes,
        args.perf_nodes,
        existing_network=network,
    )

    exception = None
    try:
        broken_network.start_in_recovery(
            args,
            ledger_dir=current_ledger_dir,
            committed_ledger_dirs=committed_ledger_dirs,
            snapshots_dir=snapshots_dir,
        )
    except Exception as ex:
        exception = ex

    # The failed start is expected; don't let shutdown errors fail the test
    broken_network.ignoring_shutdown_errors = True
    broken_network.stop_all_nodes(skip_verification=True)

    if exception is None:
        raise ValueError("Recovery should have failed")
    if not broken_network.nodes[0].check_log_for_error_message(
        "Error starting node: Previous service identity does not endorse the node identity that signed the snapshot"
    ):
        # Fixed typo in error message: "expect" -> "expected"
        raise ValueError("Node log does not contain the expected error message")

    # Recover, now with the right service identity

    args.previous_service_identity_file = first_service_identity_file

    recovered_network = infra.network.Network(
        args.nodes,
        args.binary_dir,
        args.debug_nodes,
        args.perf_nodes,
        existing_network=network,
    )

    recovered_network.start_in_recovery(
        args,
        ledger_dir=current_ledger_dir,
        committed_ledger_dirs=committed_ledger_dirs,
        snapshots_dir=snapshots_dir,
    )

    recovered_network.recover(args)

    return recovered_network
# Example 5
def test_recover_service_truncated_ledger(
    network,
    args,
    corrupt_first_tx=False,
    corrupt_last_tx=False,
    corrupt_first_sig=False,
):
    """
    Truncate an uncommitted ledger file mid-transaction before recovery, to
    check that the recovery procedure tolerates a corrupted ledger suffix.

    Exactly one of the corrupt_* flags selects the truncation point: the
    middle of the first tx, the last tx, or the first signature tx of the
    last uncommitted chunk.

    Returns the recovered network.
    """
    # Fail fast with a clear message; previously an unset flag surfaced as an
    # UnboundLocalError on truncate_offset further down
    if not (corrupt_first_tx or corrupt_last_tx or corrupt_first_sig):
        raise ValueError(
            "One of corrupt_first_tx, corrupt_last_tx or corrupt_first_sig must be set"
        )

    network.save_service_identity(args)
    old_primary, _ = network.find_primary()

    LOG.info("Force new ledger chunk for app txs to be in committed chunks")
    network.consortium.force_ledger_chunk(old_primary)

    LOG.info(
        "Fill ledger with dummy entries until at least one ledger chunk is not committed, and contains a signature"
    )
    current_ledger_path = old_primary.remote.ledger_paths()[0]
    while True:
        network.consortium.create_and_withdraw_large_proposal(
            old_primary, wait_for_commit=True
        )
        # A signature will have been emitted by now (wait_for_commit)
        network.consortium.create_and_withdraw_large_proposal(old_primary)
        if not all(
            f.endswith(ccf.ledger.COMMITTED_FILE_SUFFIX)
            for f in os.listdir(current_ledger_path)
        ):
            break

    network.stop_all_nodes()

    current_ledger_dir, committed_ledger_dirs = old_primary.get_ledger()

    # Corrupt _uncommitted_ ledger before starting new service
    ledger = ccf.ledger.Ledger([current_ledger_dir], committed_only=False)

    def get_middle_tx_offset(tx):
        # Byte offset halfway through the serialised transaction
        offset, next_offset = tx.get_offsets()
        return offset + (next_offset - offset) // 2

    for chunk in ledger:
        chunk_filename = chunk.filename()
        first_tx_offset = None
        last_tx_offset = None
        first_sig_offset = None
        for tx in chunk:
            tables = tx.get_public_domain().get_tables()
            if (
                first_sig_offset is None
                and ccf.ledger.SIGNATURE_TX_TABLE_NAME in tables
            ):
                first_sig_offset = get_middle_tx_offset(tx)
            last_tx_offset = get_middle_tx_offset(tx)
            if first_tx_offset is None:
                first_tx_offset = get_middle_tx_offset(tx)

    # Note: chunk_filename and the offsets refer to the *last* chunk iterated
    # above, i.e. the most recent (uncommitted) one
    truncated_ledger_file_path = os.path.join(current_ledger_dir, chunk_filename)
    if corrupt_first_tx:
        truncate_offset = first_tx_offset
    elif corrupt_last_tx:
        truncate_offset = last_tx_offset
    elif corrupt_first_sig:
        truncate_offset = first_sig_offset

    # Ledger files are binary: open in binary mode rather than as utf-8 text
    with open(truncated_ledger_file_path, "r+b") as f:
        f.truncate(truncate_offset)
    LOG.warning(
        f"Truncated ledger file {truncated_ledger_file_path} at {truncate_offset}"
    )

    recovered_network = infra.network.Network(
        args.nodes, args.binary_dir, args.debug_nodes, args.perf_nodes, network
    )
    recovered_network.start_in_recovery(
        args,
        ledger_dir=current_ledger_dir,
        committed_ledger_dirs=committed_ledger_dirs,
    )
    recovered_network.recover(args)

    return recovered_network
# Example 6
def test_share_resilience(network, args, from_snapshot=False):
    """
    Check that recovery completes even if the primary is stopped after all
    but one recovery shares have been submitted: the final share is
    submitted to the newly elected primary.

    Returns the recovered network.
    """
    network.save_service_identity(args)
    old_primary, _ = network.find_primary()

    snapshots_dir = None
    if from_snapshot:
        snapshots_dir = network.get_committed_snapshots(old_primary)

    network.stop_all_nodes()

    current_ledger_dir, committed_ledger_dirs = old_primary.get_ledger()

    recovered_network = infra.network.Network(
        args.nodes, args.binary_dir, args.debug_nodes, args.perf_nodes, network
    )
    recovered_network.start_in_recovery(
        args,
        ledger_dir=current_ledger_dir,
        committed_ledger_dirs=committed_ledger_dirs,
        snapshots_dir=snapshots_dir,
    )
    primary, _ = recovered_network.find_primary()
    recovered_network.consortium.transition_service_to_open(
        primary,
        previous_service_identity=slurp_file(args.previous_service_identity_file),
    )

    # Submit all required recovery shares minus one. Last recovery share is
    # submitted after a new primary is found.
    encrypted_submitted_shares_count = 0
    for m in recovered_network.consortium.get_active_members():
        with primary.client() as nc:
            if (
                encrypted_submitted_shares_count
                >= recovered_network.consortium.recovery_threshold - 1
            ):
                # NOTE(review): assumes there are at least recovery_threshold
                # active members; otherwise last_member_to_submit is never
                # bound and the later call raises NameError — confirm setup
                last_member_to_submit = m
                break

            check_commit = infra.checker.Checker(nc)
            check_commit(m.get_and_submit_recovery_share(primary))
            encrypted_submitted_shares_count += 1

    LOG.info(
        f"Shutting down node {primary.node_id} before submitting last recovery share"
    )
    primary.stop()
    new_primary, _ = recovered_network.wait_for_new_primary(primary)

    # Final share goes to the new primary, completing the recovery
    last_member_to_submit.get_and_submit_recovery_share(new_primary)

    for node in recovered_network.get_joined_nodes():
        recovered_network.wait_for_state(
            node,
            infra.node.State.PART_OF_NETWORK.value,
            timeout=args.ledger_recovery_timeout,
        )

    recovered_network.consortium.check_for_service(
        new_primary,
        infra.network.ServiceStatus.OPEN,
    )
    if recovered_network.service_load:
        recovered_network.service_load.set_network(recovered_network)
    return recovered_network
# Example 7
def test_recover_service_aborted(network, args, from_snapshot=False):
    """
    Start a recovery, deliberately abandon it after some recovery ledger
    chunks have been committed, then run a fresh recovery from the aborted
    network's ledger.

    Returns the recovered network.
    """
    network.save_service_identity(args)
    old_primary, _ = network.find_primary()

    snapshots_dir = (
        network.get_committed_snapshots(old_primary) if from_snapshot else None
    )

    network.stop_all_nodes()
    current_ledger_dir, committed_ledger_dirs = old_primary.get_ledger()

    aborted_network = infra.network.Network(
        args.nodes, args.binary_dir, args.debug_nodes, args.perf_nodes, network
    )
    aborted_network.start_in_recovery(
        args,
        ledger_dir=current_ledger_dir,
        committed_ledger_dirs=committed_ledger_dirs,
        snapshots_dir=snapshots_dir,
    )

    LOG.info("Fill in ledger to trigger new chunks, which should be marked as recovery")
    primary, _ = aborted_network.find_primary()

    def committed_recovery_chunk_count():
        # Ledger files committed during recovery carry both suffixes
        recovery_suffix = (
            f"{ccf.ledger.COMMITTED_FILE_SUFFIX}{ccf.ledger.RECOVERY_FILE_SUFFIX}"
        )
        ledger_path = primary.remote.ledger_paths()[0]
        return sum(
            1 for f in os.listdir(ledger_path) if f.endswith(recovery_suffix)
        )

    # Submit large proposal until at least two recovery ledger chunks are committed
    while committed_recovery_chunk_count() < 2:
        aborted_network.consortium.create_and_withdraw_large_proposal(primary)

    LOG.info(
        "Do not complete service recovery on purpose and initiate new recovery from scratch"
    )

    snapshots_dir = None
    if from_snapshot:
        snapshots_dir = network.get_committed_snapshots(primary)

    # Check that all nodes have the same (recovery) ledger files
    aborted_network.stop_all_nodes(
        skip_verification=True, read_recovery_ledger_files=True
    )

    current_ledger_dir, committed_ledger_dirs = primary.get_ledger()
    recovered_network = infra.network.Network(
        args.nodes,
        args.binary_dir,
        args.debug_nodes,
        args.perf_nodes,
        existing_network=aborted_network,
    )
    recovered_network.start_in_recovery(
        args,
        ledger_dir=current_ledger_dir,
        committed_ledger_dirs=committed_ledger_dirs,
        snapshots_dir=snapshots_dir,
    )
    recovered_network.recover(args)
    return recovered_network