def test_share_resilience(network, args, from_snapshot=False):
    old_primary, _ = network.find_primary()

    snapshot_dir = None
    if from_snapshot:
        snapshot_dir = network.get_committed_snapshots(old_primary)

    current_ledger_dir, committed_ledger_dir = old_primary.get_ledger(
        include_read_only_dirs=True
    )

    recovered_network = infra.network.Network(
        args.nodes, args.binary_dir, args.debug_nodes, args.perf_nodes, network
    )
    recovered_network.start_in_recovery(
        args,
        ledger_dir=current_ledger_dir,
        committed_ledger_dir=committed_ledger_dir,
        snapshot_dir=snapshot_dir,
    )
    primary, _ = recovered_network.find_primary()
    recovered_network.consortium.accept_recovery(primary)

    # Submit all required recovery shares minus one. Last recovery share is
    # submitted after a new primary is found.
    submitted_shares_count = 0
    for m in recovered_network.consortium.get_active_members():
        with primary.client() as nc:
            if (
                submitted_shares_count
                >= recovered_network.consortium.recovery_threshold - 1
            ):
                last_member_to_submit = m
                break

            check_commit = infra.checker.Checker(nc)
            check_commit(m.get_and_submit_recovery_share(primary))
            submitted_shares_count += 1

    LOG.info(
        f"Shutting down node {primary.node_id} before submitting last recovery share"
    )
    primary.stop()
    new_primary, _ = recovered_network.wait_for_new_primary(primary.node_id)
    assert (
        new_primary is not primary
    ), f"Primary {primary.node_id} should have changed after election"

    last_member_to_submit.get_and_submit_recovery_share(new_primary)

    for node in recovered_network.get_joined_nodes():
        recovered_network.wait_for_state(
            node, "partOfNetwork", timeout=args.ledger_recovery_timeout
        )

    recovered_network.consortium.check_for_service(
        new_primary,
        infra.network.ServiceStatus.OPEN,
    )
    return recovered_network

def run_join_old_snapshot(args):
    txs = app.LoggingTxs("user0")
    nodes = ["local://localhost"]

    with tempfile.TemporaryDirectory() as tmp_dir:
        with infra.network.network(
            nodes,
            args.binary_dir,
            args.debug_nodes,
            args.perf_nodes,
            pdb=args.pdb,
            txs=txs,
        ) as network:
            network.start_and_open(args)
            primary, _ = network.find_primary()

            # First, retrieve and save one committed snapshot
            txs.issue(network, number_txs=args.snapshot_tx_interval)
            old_committed_snapshots = network.get_committed_snapshots(primary)
            copy(
                os.path.join(
                    old_committed_snapshots, os.listdir(old_committed_snapshots)[0]
                ),
                tmp_dir,
            )

            # Then generate another newer snapshot, and add two more nodes from it
            txs.issue(network, number_txs=args.snapshot_tx_interval)

            for _ in range(0, 2):
                new_node = network.create_node("local://localhost")
                network.join_node(
                    new_node,
                    args.package,
                    args,
                    from_snapshot=True,
                )
                network.trust_node(new_node, args)

            # Kill primary and wait for a new one: new primary is
            # guaranteed to have started from the new snapshot
            primary.stop()
            network.wait_for_new_primary(primary)

            # Start new node from the old snapshot
            try:
                new_node = network.create_node("local://localhost")
                network.join_node(
                    new_node,
                    args.package,
                    args,
                    from_snapshot=True,
                    snapshots_dir=tmp_dir,
                    timeout=3,
                )
            except infra.network.StartupSnapshotIsOld:
                pass

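# Hedged sketch (illustrative, not part of the original test): the try/except
# above also passes silently if the join from the stale snapshot unexpectedly
# succeeds. Assuming the same infra helpers, the check can be made strict by
# recording whether the expected exception was actually raised:
def _join_from_old_snapshot_must_fail(network, args, tmp_dir):
    old_snapshot_rejected = False
    try:
        new_node = network.create_node("local://localhost")
        network.join_node(
            new_node,
            args.package,
            args,
            from_snapshot=True,
            snapshots_dir=tmp_dir,
            timeout=3,
        )
    except infra.network.StartupSnapshotIsOld:
        old_snapshot_rejected = True
    assert old_snapshot_rejected, "Join from old snapshot should have been rejected"
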
def test_parse_snapshot_file(network, args):
    primary, _ = network.find_primary()
    network.txs.issue(network, number_txs=args.snapshot_tx_interval * 2)
    committed_snapshots_dir = network.get_committed_snapshots(primary)
    for snapshot in os.listdir(committed_snapshots_dir):
        with ccf.ledger.Snapshot(
            os.path.join(committed_snapshots_dir, snapshot)
        ) as s:
            assert len(
                s.get_public_domain().get_tables()
            ), "No public table in snapshot"
    return network

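# Hedged sketch (illustrative helper, not in the original file): some tests only
# need the most recent committed snapshot. Reusing the same ccf.ledger API as
# above (get_seqno() on the public domain also appears in test_forced_snapshot
# below), the latest snapshot can be selected by seqno:
def _latest_committed_snapshot(committed_snapshots_dir):
    latest_path, latest_seqno = None, -1
    for f in os.listdir(committed_snapshots_dir):
        path = os.path.join(committed_snapshots_dir, f)
        with ccf.ledger.Snapshot(path) as snapshot:
            seqno = snapshot.get_public_domain().get_seqno()
        if seqno > latest_seqno:
            latest_path, latest_seqno = path, seqno
    return latest_path, latest_seqno
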
def test_add_node_from_snapshot(
    network, args, copy_ledger_read_only=True, from_backup=False
):
    # Before adding the node from a snapshot, override at least one app entry
    # and wait for a new committed snapshot covering that entry, so that there
    # is at least one historical entry to verify.
    network.txs.issue(network, number_txs=1)
    for _ in range(1, args.snapshot_tx_interval):
        network.txs.issue(network, number_txs=1, repeat=True)
        last_tx = network.txs.get_last_tx(priv=True)
        if network.wait_for_snapshot_committed_for(seqno=last_tx[1]["seqno"]):
            break

    target_node = None
    snapshots_dir = None
    if from_backup:
        primary, target_node = network.find_primary_and_any_backup()
        # Retrieve snapshot from primary as only primary node
        # generates snapshots
        snapshots_dir = network.get_committed_snapshots(primary)

    new_node = network.create_node("local://localhost")
    network.join_node(
        new_node,
        args.package,
        args,
        copy_ledger_read_only=copy_ledger_read_only,
        target_node=target_node,
        snapshots_dir=snapshots_dir,
        from_snapshot=True,
    )
    network.trust_node(new_node, args)
    with new_node.client() as c:
        r = c.get("/node/state")
        assert (
            r.body.json()["startup_seqno"] != 0
        ), "Node started from snapshot but reports startup seqno of 0"

    # Finally, verify all app entries on the new node, including historical ones
    # from the historical ledger
    network.txs.verify(node=new_node, include_historical=copy_ledger_read_only)
    return network

def test_recover_service(network, args, from_snapshot=False):
    network.save_service_identity(args)
    old_primary, _ = network.find_primary()

    snapshots_dir = None
    if from_snapshot:
        snapshots_dir = network.get_committed_snapshots(old_primary)

    # Start health watcher and stop nodes one by one until a recovery has to be staged
    watcher = infra.health_watcher.NetworkHealthWatcher(network, args, verbose=True)
    watcher.start()

    for node in network.get_joined_nodes():
        time.sleep(args.election_timeout_ms / 1000)
        node.stop()

    watcher.wait_for_recovery()

    # Stop remaining nodes
    network.stop_all_nodes()

    current_ledger_dir, committed_ledger_dirs = old_primary.get_ledger()
    recovered_network = infra.network.Network(
        args.nodes,
        args.binary_dir,
        args.debug_nodes,
        args.perf_nodes,
        existing_network=network,
    )
    recovered_network.start_in_recovery(
        args,
        ledger_dir=current_ledger_dir,
        committed_ledger_dirs=committed_ledger_dirs,
        snapshots_dir=snapshots_dir,
    )
    recovered_network.recover(args)
    return recovered_network

def test_add_node_from_snapshot(
    network, args, copy_ledger_read_only=True, from_backup=False
):
    target_node = None
    snapshot_dir = None
    if from_backup:
        primary, target_node = network.find_primary_and_any_backup()
        # Retrieve snapshot from primary as only primary node
        # generates snapshots
        snapshot_dir = network.get_committed_snapshots(primary)

    new_node = network.create_and_trust_node(
        args.package,
        "local://localhost",
        args,
        copy_ledger_read_only=copy_ledger_read_only,
        target_node=target_node,
        snapshot_dir=snapshot_dir,
    )
    assert new_node
    return network

def test(network, args, from_snapshot=False):
    old_primary, _ = network.find_primary()

    snapshot_dir = None
    if from_snapshot:
        snapshot_dir = network.get_committed_snapshots(old_primary)

    current_ledger_dir, committed_ledger_dir = old_primary.get_ledger(
        include_read_only_dirs=True
    )

    recovered_network = infra.network.Network(
        args.nodes, args.binary_dir, args.debug_nodes, args.perf_nodes, network
    )
    recovered_network.start_in_recovery(
        args,
        ledger_dir=current_ledger_dir,
        committed_ledger_dir=committed_ledger_dir,
        snapshot_dir=snapshot_dir,
    )
    recovered_network.recover(args)
    return recovered_network

def test_forced_snapshot(network, args):
    primary, _ = network.find_primary()

    # Submit some dummy transactions
    network.txs.issue(network, number_txs=3)

    # Submit a proposal to force a snapshot at the following signature
    proposal_body, careful_vote = network.consortium.make_proposal(
        "trigger_snapshot", node_id=primary.node_id
    )
    proposal = network.consortium.get_any_active_member().propose(
        primary, proposal_body
    )
    proposal = network.consortium.vote_using_majority(
        primary,
        proposal,
        careful_vote,
    )

    # Issue some more transactions
    network.txs.issue(network, number_txs=5)

    ledger_dirs = primary.remote.ledger_paths()

    # Find first signature after proposal.completed_seqno
    ledger = ccf.ledger.Ledger(ledger_dirs)
    chunk, _, _, next_signature = find_ledger_chunk_for_seqno(
        ledger, proposal.completed_seqno
    )
    assert chunk.is_complete and chunk.is_committed()
    LOG.info(f"Expecting snapshot at {next_signature}")

    snapshots_dir = network.get_committed_snapshots(primary)
    for s in os.listdir(snapshots_dir):
        with ccf.ledger.Snapshot(os.path.join(snapshots_dir, s)) as snapshot:
            snapshot_seqno = snapshot.get_public_domain().get_seqno()
            if snapshot_seqno == next_signature:
                LOG.info(f"Found expected forced snapshot at {next_signature}")
                return network

    raise RuntimeError("Could not find matching snapshot file")

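# Hedged sketch: find_ledger_chunk_for_seqno is defined elsewhere in the test
# tree and returns a 4-tuple whose two middle values are unused above. Below is
# an illustrative reconstruction of only the parts used, assuming that
# ccf.ledger Ledger/LedgerChunk objects are iterable and that signature
# transactions write to the "public:ccf.internal.signatures" table (an assumed
# table name):
def _chunk_and_next_signature(ledger, seqno):
    containing_chunk = None
    for chunk in ledger:
        for tx in chunk:
            domain = tx.get_public_domain()
            tx_seqno = domain.get_seqno()
            if tx_seqno == seqno:
                containing_chunk = chunk
            # First signature at or after the target seqno
            if (
                tx_seqno >= seqno
                and "public:ccf.internal.signatures" in domain.get_tables()
            ):
                return containing_chunk, tx_seqno
    return containing_chunk, None
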
def test_read_ledger_utility(network, args):
    def fmt_str(data: bytes) -> str:
        return data.decode()

    format_rule = [(".*records.*", {"key": fmt_str, "value": fmt_str})]

    network.txs.issue(network, number_txs=args.snapshot_tx_interval)
    network.get_latest_ledger_public_state()

    primary, backups = network.find_nodes()
    for node in (primary, *backups):
        ledger_dirs = node.remote.ledger_paths()
        assert ccf.read_ledger.run(paths=ledger_dirs, tables_format_rules=format_rule)

    snapshot_dir = network.get_committed_snapshots(primary)
    assert ccf.read_ledger.run(
        paths=[os.path.join(snapshot_dir, os.listdir(snapshot_dir)[-1])],
        is_snapshot=True,
        tables_format_rules=format_rule,
    )
    return network

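# Hedged sketch: each entry in tables_format_rules pairs a regex over table
# names with formatters for the raw key and value bytes. For instance, assuming
# a hypothetical application table whose values are JSON-encoded, a rule could
# decode them for display (illustrative, not part of the test):
def _json_format_rule():
    import json

    def fmt_json(data: bytes):
        return json.loads(data)

    # Keys rendered as UTF-8 strings, values parsed as JSON
    return [(".*json_table.*", {"key": lambda k: k.decode(), "value": fmt_json})]
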
def run_ledger_compatibility_since_first(args, local_branch, use_snapshot):
    """
    Tests that a service from the very first LTS can be recovered
    to the next LTS, and so forth, until the version of the local checkout.
    The recovery process uses a snapshot if `use_snapshot` is True. Otherwise,
    the entire historical ledger is used.
    """

    LOG.info("Use snapshot: {}", use_snapshot)
    repo = infra.github.Repository()
    lts_releases = repo.get_lts_releases(local_branch)
    has_pre_2_rc7_ledger = False

    LOG.info(f"LTS releases: {[r[1] for r in lts_releases.items()]}")

    lts_versions = []

    # Add an empty entry to the releases to indicate local checkout
    # Note: dicts are ordered from Python 3.7
    lts_releases[None] = None

    jwt_issuer = infra.jwt_issuer.JwtIssuer(
        "https://localhost", refresh_interval=args.jwt_key_refresh_interval_s
    )
    with jwt_issuer.start_openid_server():
        txs = app.LoggingTxs(jwt_issuer=jwt_issuer)
        for idx, (_, lts_release) in enumerate(lts_releases.items()):
            if lts_release:
                version, install_path = repo.install_release(lts_release)
                lts_versions.append(version)
                set_js_args(args, install_path)
            else:
                version = args.ccf_version
                install_path = LOCAL_CHECKOUT_DIRECTORY
                get_new_constitution_for_install(args, install_path)

            binary_dir, library_dir = get_bin_and_lib_dirs_for_install_path(
                install_path
            )

            if not args.dry_run:
                network_args = {
                    "hosts": args.nodes,
                    "binary_dir": binary_dir,
                    "library_dir": library_dir,
                    "txs": txs,
                    "jwt_issuer": jwt_issuer,
                    "version": version,
                }
                if idx == 0:
                    LOG.info(f"Starting new service (version: {version})")
                    network = infra.network.Network(**network_args)
                    network.start_and_open(args)
                else:
                    LOG.info(f"Recovering service (new version: {version})")
                    network = infra.network.Network(
                        **network_args, existing_network=network
                    )
                    network.start_in_recovery(
                        args,
                        ledger_dir,
                        committed_ledger_dirs,
                        snapshots_dir=snapshots_dir,
                    )
                    network.recover(args)

                nodes = network.get_joined_nodes()
                primary, _ = network.find_primary()

                # Verify that all nodes run the expected CCF version
                for node in nodes:
                    # Note: /node/version endpoint and custom certificate validity
                    # were added in 2.x
                    if not node.major_version or node.major_version > 1:
                        with node.client() as c:
                            r = c.get("/node/version")
                            expected_version = node.version or args.ccf_version
                            version = r.body.json()["ccf_version"]
                            assert (
                                r.body.json()["ccf_version"] == expected_version
                            ), f"Node version is not {expected_version}"
                        node.verify_certificate_validity_period()

                # Rollover JWKS so that new primary must read historical CA bundle table
                # and retrieve new keys via auto refresh
                jwt_issuer.refresh_keys()
                # Note: /gov/jwt_keys/all endpoint was added in 2.x
                primary, _ = network.find_nodes()
                if not primary.major_version or primary.major_version > 1:
                    jwt_issuer.wait_for_refresh(network)
                else:
                    time.sleep(3)

                if idx > 0:
                    test_new_service(
                        network,
                        args,
                        install_path,
                        binary_dir,
                        library_dir,
                        version,
                    )

                # We accept ledger chunk file differences during upgrades
                # from 1.x to 2.x post rc7 ledger. This is necessary because
                # the ledger files may not be chunked at the same interval
                # between those versions (see https://github.com/microsoft/ccf/issues/3613;
                # 1.x ledgers do not contain the header flags to synchronize
                # ledger chunks). This can go once 2.0 is released.
                current_version_past_2_rc7 = primary.version_after("ccf-2.0.0-rc7")
                has_pre_2_rc7_ledger = (
                    not current_version_past_2_rc7 or has_pre_2_rc7_ledger
                )
                is_ledger_chunk_breaking = (
                    has_pre_2_rc7_ledger and current_version_past_2_rc7
                )

                snapshots_dir = (
                    network.get_committed_snapshots(primary) if use_snapshot else None
                )

                network.stop_all_nodes(
                    skip_verification=True,
                    accept_ledger_diff=is_ledger_chunk_breaking,
                )
                ledger_dir, committed_ledger_dirs = primary.get_ledger()
                network.save_service_identity(args)

                # Check that ledger and snapshots can be parsed
                ccf.ledger.Ledger(committed_ledger_dirs).get_latest_public_state()
                if snapshots_dir:
                    for s in os.listdir(snapshots_dir):
                        with ccf.ledger.Snapshot(
                            os.path.join(snapshots_dir, s)
                        ) as snapshot:
                            snapshot.get_public_domain()

    return lts_versions

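# Hedged usage sketch (argument values are illustrative): run the LTS
# compatibility sweep against the local branch, recovering via snapshots:
#
#     versions = run_ledger_compatibility_since_first(
#         args, local_branch="main", use_snapshot=True
#     )
#     LOG.info(f"Tested LTS versions: {versions}")
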
def test_recover_service_with_wrong_identity(network, args):
    old_primary, _ = network.find_primary()
    snapshots_dir = network.get_committed_snapshots(old_primary)
    network.stop_all_nodes()

    network.save_service_identity(args)
    first_service_identity_file = args.previous_service_identity_file

    current_ledger_dir, committed_ledger_dirs = old_primary.get_ledger()

    # Attempt a recovery with the wrong previous service certificate
    args.previous_service_identity_file = network.consortium.user_cert_path("user0")

    broken_network = infra.network.Network(
        args.nodes,
        args.binary_dir,
        args.debug_nodes,
        args.perf_nodes,
        existing_network=network,
    )

    exception = None
    try:
        broken_network.start_in_recovery(
            args,
            ledger_dir=current_ledger_dir,
            committed_ledger_dirs=committed_ledger_dirs,
            snapshots_dir=snapshots_dir,
        )
    except Exception as ex:
        exception = ex

    broken_network.ignoring_shutdown_errors = True
    broken_network.stop_all_nodes(skip_verification=True)

    if exception is None:
        raise ValueError("Recovery should have failed")
    if not broken_network.nodes[0].check_log_for_error_message(
        "Error starting node: Previous service identity does not endorse the node identity that signed the snapshot"
    ):
        raise ValueError("Node log does not contain the expected error message")

    # Recover, now with the right service identity
    args.previous_service_identity_file = first_service_identity_file

    recovered_network = infra.network.Network(
        args.nodes,
        args.binary_dir,
        args.debug_nodes,
        args.perf_nodes,
        existing_network=network,
    )
    recovered_network.start_in_recovery(
        args,
        ledger_dir=current_ledger_dir,
        committed_ledger_dirs=committed_ledger_dirs,
        snapshots_dir=snapshots_dir,
    )
    recovered_network.recover(args)
    return recovered_network

def test_share_resilience(network, args, from_snapshot=False):
    network.save_service_identity(args)
    old_primary, _ = network.find_primary()

    snapshots_dir = None
    if from_snapshot:
        snapshots_dir = network.get_committed_snapshots(old_primary)

    network.stop_all_nodes()
    current_ledger_dir, committed_ledger_dirs = old_primary.get_ledger()

    recovered_network = infra.network.Network(
        args.nodes, args.binary_dir, args.debug_nodes, args.perf_nodes, network
    )
    recovered_network.start_in_recovery(
        args,
        ledger_dir=current_ledger_dir,
        committed_ledger_dirs=committed_ledger_dirs,
        snapshots_dir=snapshots_dir,
    )
    primary, _ = recovered_network.find_primary()
    recovered_network.consortium.transition_service_to_open(
        primary,
        previous_service_identity=slurp_file(args.previous_service_identity_file),
    )

    # Submit all required recovery shares minus one. Last recovery share is
    # submitted after a new primary is found.
    encrypted_submitted_shares_count = 0
    for m in recovered_network.consortium.get_active_members():
        with primary.client() as nc:
            if (
                encrypted_submitted_shares_count
                >= recovered_network.consortium.recovery_threshold - 1
            ):
                last_member_to_submit = m
                break

            check_commit = infra.checker.Checker(nc)
            check_commit(m.get_and_submit_recovery_share(primary))
            encrypted_submitted_shares_count += 1

    LOG.info(
        f"Shutting down node {primary.node_id} before submitting last recovery share"
    )
    primary.stop()
    new_primary, _ = recovered_network.wait_for_new_primary(primary)

    last_member_to_submit.get_and_submit_recovery_share(new_primary)

    for node in recovered_network.get_joined_nodes():
        recovered_network.wait_for_state(
            node,
            infra.node.State.PART_OF_NETWORK.value,
            timeout=args.ledger_recovery_timeout,
        )

    recovered_network.consortium.check_for_service(
        new_primary,
        infra.network.ServiceStatus.OPEN,
    )

    if recovered_network.service_load:
        recovered_network.service_load.set_network(recovered_network)
    return recovered_network

def test_recover_service_aborted(network, args, from_snapshot=False):
    network.save_service_identity(args)
    old_primary, _ = network.find_primary()

    snapshots_dir = None
    if from_snapshot:
        snapshots_dir = network.get_committed_snapshots(old_primary)

    network.stop_all_nodes()
    current_ledger_dir, committed_ledger_dirs = old_primary.get_ledger()

    aborted_network = infra.network.Network(
        args.nodes, args.binary_dir, args.debug_nodes, args.perf_nodes, network
    )
    aborted_network.start_in_recovery(
        args,
        ledger_dir=current_ledger_dir,
        committed_ledger_dirs=committed_ledger_dirs,
        snapshots_dir=snapshots_dir,
    )

    LOG.info("Fill in ledger to trigger new chunks, which should be marked as recovery")
    primary, _ = aborted_network.find_primary()
    while (
        len(
            [
                f
                for f in os.listdir(primary.remote.ledger_paths()[0])
                if f.endswith(
                    f"{ccf.ledger.COMMITTED_FILE_SUFFIX}{ccf.ledger.RECOVERY_FILE_SUFFIX}"
                )
            ]
        )
        < 2
    ):
        # Submit large proposal until at least two recovery ledger chunks are committed
        aborted_network.consortium.create_and_withdraw_large_proposal(primary)

    LOG.info(
        "Do not complete service recovery on purpose and initiate new recovery from scratch"
    )

    snapshots_dir = None
    if from_snapshot:
        snapshots_dir = network.get_committed_snapshots(primary)

    # Check that all nodes have the same (recovery) ledger files
    aborted_network.stop_all_nodes(
        skip_verification=True, read_recovery_ledger_files=True
    )

    current_ledger_dir, committed_ledger_dirs = primary.get_ledger()
    recovered_network = infra.network.Network(
        args.nodes,
        args.binary_dir,
        args.debug_nodes,
        args.perf_nodes,
        existing_network=aborted_network,
    )
    recovered_network.start_in_recovery(
        args,
        ledger_dir=current_ledger_dir,
        committed_ledger_dirs=committed_ledger_dirs,
        snapshots_dir=snapshots_dir,
    )
    recovered_network.recover(args)
    return recovered_network

def run_ledger_compatibility_since_first(args, local_branch, use_snapshot):
    """
    Tests that a service from the very first LTS can be recovered
    to the next LTS, and so forth, until the version of the local checkout.
    The recovery process uses a snapshot if `use_snapshot` is True. Otherwise,
    the entire historical ledger is used.
    """

    LOG.info("Use snapshot: {}", use_snapshot)
    repo = infra.github.Repository()
    lts_releases = repo.get_lts_releases()

    LOG.info(f"LTS releases: {[r[1] for r in lts_releases.items()]}")

    lts_versions = []

    # Add an empty entry to the releases to indicate local checkout
    # Note: dicts are ordered from Python 3.7
    lts_releases[None] = None

    jwt_issuer = infra.jwt_issuer.JwtIssuer(
        "https://localhost", refresh_interval=args.jwt_key_refresh_interval_s
    )
    with jwt_issuer.start_openid_server():
        txs = app.LoggingTxs(jwt_issuer=jwt_issuer)
        for idx, (_, lts_release) in enumerate(lts_releases.items()):
            if lts_release:
                version, install_path = repo.install_release(lts_release)
                lts_versions.append(version)
                set_js_args(args, install_path)
            else:
                version = args.ccf_version
                install_path = LOCAL_CHECKOUT_DIRECTORY

            binary_dir, library_dir = get_bin_and_lib_dirs_for_install_path(
                install_path
            )

            if not args.dry_run:
                network_args = {
                    "hosts": args.nodes,
                    "binary_dir": binary_dir,
                    "library_dir": library_dir,
                    "txs": txs,
                    "jwt_issuer": jwt_issuer,
                    "version": version,
                }
                if idx == 0:
                    LOG.info(f"Starting new service (version: {version})")
                    network = infra.network.Network(**network_args)
                    network.start_and_join(args)
                else:
                    LOG.info(f"Recovering service (new version: {version})")
                    network = infra.network.Network(
                        **network_args, existing_network=network
                    )
                    network.start_in_recovery(
                        args,
                        ledger_dir,
                        committed_ledger_dir,
                        snapshot_dir=snapshot_dir,
                    )
                    network.recover(args)

                nodes = network.get_joined_nodes()
                primary, _ = network.find_primary()

                # Verify that all nodes run the expected CCF version
                for node in nodes:
                    # Note: /node/version endpoint and custom certificate validity
                    # were added in 2.x
                    if not node.major_version or node.major_version > 1:
                        with node.client() as c:
                            r = c.get("/node/version")
                            expected_version = node.version or args.ccf_version
                            version = r.body.json()["ccf_version"]
                            assert (
                                r.body.json()["ccf_version"] == expected_version
                            ), f"Node version is not {expected_version}"
                        node.verify_certificate_validity_period()

                # Rollover JWKS so that new primary must read historical CA bundle table
                # and retrieve new keys via auto refresh
                jwt_issuer.refresh_keys()
                # Note: /gov/jwt_keys/all endpoint was added in 2.x
                primary, _ = network.find_nodes()
                if not primary.major_version or primary.major_version > 1:
                    jwt_issuer.wait_for_refresh(network)
                else:
                    time.sleep(3)

                if idx > 0:
                    test_new_service(
                        network,
                        args,
                        install_path,
                        binary_dir,
                        library_dir,
                        version,
                    )

                snapshot_dir = (
                    network.get_committed_snapshots(primary) if use_snapshot else None
                )

                ledger_dir, committed_ledger_dir = primary.get_ledger(
                    include_read_only_dirs=True
                )
                network.stop_all_nodes(skip_verification=True)

                # Check that ledger and snapshots can be parsed
                ccf.ledger.Ledger([committed_ledger_dir]).get_latest_public_state()
                if snapshot_dir:
                    for s in os.listdir(snapshot_dir):
                        with ccf.ledger.Snapshot(
                            os.path.join(snapshot_dir, s)
                        ) as snapshot:
                            snapshot.get_public_domain()

    return lts_versions