def test_partition_majority(network, args):
    primary, backups = network.find_nodes()

    # Create a partition with the primary + half of the remaining nodes (i.e. a majority)
    partition = [primary]
    partition.extend(backups[len(backups) // 2 :])

    # Wait for all nodes to have reached the same level of commit, so that
    # nodes outside of the partition can become primary after this one is dropped
    network.wait_for_all_nodes_to_commit(primary=primary)

    # The primary should remain stable while the partition is active
    # Note: Context manager
    initial_view = None
    with network.partitioner.partition(partition):
        try:
            network.wait_for_new_primary(primary)
            assert False, "No new primary should be elected when partitioning majority"
        except TimeoutError:
            LOG.info("No new primary, as expected")
            with primary.client() as c:
                res = c.get("/node/network")  # Well-known read-only endpoint
                body = res.body.json()
                initial_view = body["current_view"]

    # The partitioned nodes will have called elections, increasing their view.
    # When the partition is lifted, the nodes must elect a new leader, in at least this
    # increased term. The winning node could come from either partition, and could even
    # be the original primary.
    network.wait_for_primary_unanimity(min_view=initial_view)

    return network
def run(args):
    args.jwt_key_refresh_interval_s = 1

    with infra.network.network(
        args.nodes, args.binary_dir, args.debug_nodes, args.perf_nodes, pdb=args.pdb
    ) as network:
        network.start_and_join(args)
        network = test_jwt_without_key_policy(network, args)
        network = test_jwt_with_sgx_key_policy(network, args)
        network = test_jwt_with_sgx_key_filter(network, args)
        network = test_jwt_key_auto_refresh(network, args)

        # Check that auto refresh also works on backups
        primary, _ = network.find_primary()
        primary.stop()
        network.wait_for_new_primary(primary.node_id)
        network = test_jwt_key_auto_refresh(network, args)

    args.jwt_key_refresh_interval_s = 100000

    with infra.network.network(
        args.nodes, args.binary_dir, args.debug_nodes, args.perf_nodes, pdb=args.pdb
    ) as network:
        network.start_and_join(args)
        network = test_jwt_key_initial_refresh(network, args)

        # Check that initial refresh also works on backups
        primary, _ = network.find_primary()
        primary.stop()
        network.wait_for_new_primary(primary.node_id)
        network = test_jwt_key_initial_refresh(network, args)
def test_isolate_primary_from_one_backup(network, args):
    primary, backups = network.find_nodes()

    # Issue one transaction, waiting for all nodes to have reached
    # the same level of commit, so that nodes outside of the partition can
    # become primary after this one is dropped
    # Note: Because of https://github.com/microsoft/CCF/issues/2224, we need to
    # issue a write transaction instead of just reading the TxID of the latest entry
    network.txs.issue(network)

    # Isolate first backup from primary so that first backup becomes candidate
    # in a new term and wins the election
    # Note: Managed manually
    rules = network.partitioner.isolate_node(primary, backups[0])

    new_primary, new_view = network.wait_for_new_primary(
        primary, nodes=backups, timeout_multiplier=6
    )

    # Explicitly drop rules before continuing
    rules.drop()

    # Old primary should now report the new primary
    new_primary_, new_view_ = network.wait_for_new_primary(primary, nodes=[primary])
    assert (
        new_primary == new_primary_
    ), f"New primary {new_primary_.local_node_id} after partition is dropped is different from before {new_primary.local_node_id}"
    assert (
        new_view == new_view_
    ), f"Consensus view {new_view} should not have changed after partition is dropped: now {new_view_}"

    return network
def run_join_old_snapshot(args):
    txs = app.LoggingTxs("user0")
    nodes = ["local://localhost"]

    with tempfile.TemporaryDirectory() as tmp_dir:
        with infra.network.network(
            nodes,
            args.binary_dir,
            args.debug_nodes,
            args.perf_nodes,
            pdb=args.pdb,
            txs=txs,
        ) as network:
            network.start_and_open(args)
            primary, _ = network.find_primary()

            # First, retrieve and save one committed snapshot
            txs.issue(network, number_txs=args.snapshot_tx_interval)
            old_committed_snapshots = network.get_committed_snapshots(primary)
            copy(
                os.path.join(
                    old_committed_snapshots, os.listdir(old_committed_snapshots)[0]
                ),
                tmp_dir,
            )

            # Then generate another, newer snapshot, and add two more nodes from it
            txs.issue(network, number_txs=args.snapshot_tx_interval)

            for _ in range(0, 2):
                new_node = network.create_node("local://localhost")
                network.join_node(
                    new_node,
                    args.package,
                    args,
                    from_snapshot=True,
                )
                network.trust_node(new_node, args)

            # Kill primary and wait for a new one: the new primary is
            # guaranteed to have started from the new snapshot
            primary.stop()
            network.wait_for_new_primary(primary)

            # Start new node from the old snapshot
            try:
                new_node = network.create_node("local://localhost")
                network.join_node(
                    new_node,
                    args.package,
                    args,
                    from_snapshot=True,
                    snapshots_dir=tmp_dir,
                    timeout=3,
                )
            except infra.network.StartupSnapshotIsOld:
                pass
def run_manual(args):
    with infra.network.network(
        args.nodes, args.binary_dir, args.debug_nodes, args.perf_nodes, pdb=args.pdb
    ) as network:
        network.start_and_join(args)
        test_jwt_key_initial_refresh(network, args)

        # Check that initial refresh also works on backups
        primary, _ = network.find_primary()
        primary.stop()
        network.wait_for_new_primary(primary)
        test_jwt_key_initial_refresh(network, args)
def test_retire_primary(network, args):
    pre_count = count_nodes(node_configs(network), network)
    primary, backup = network.find_primary_and_any_backup()
    network.consortium.retire_node(primary, primary)
    network.wait_for_new_primary(primary)
    check_can_progress(backup)
    network.nodes.remove(primary)
    post_count = count_nodes(node_configs(network), network)
    assert pre_count == post_count + 1
    primary.stop()
    return network
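# `check_can_progress` is used throughout these tests but defined elsewhere.
# The sketch below shows one plausible implementation, assuming the
# /node/commit endpoint, a logging app exposing /app/log/private, and a TxID
# helper; the `_sketch` suffix marks it as illustrative, not the canonical helper.
import time

from ccf.tx_id import TxID  # assumed import path


def check_can_progress_sketch(node, timeout=3):
    with node.client() as c:
        original_tx = TxID.from_str(
            c.get("/node/commit").body.json()["transaction_id"]
        )
    # Submit a write through this node (it may be forwarded to the current primary)
    with node.client("user0") as uc:
        uc.post("/app/log/private", {"id": 42, "msg": "Check liveness"})
    # Poll until the globally-committed TxID advances past its initial value
    end_time = time.time() + timeout
    while time.time() < end_time:
        with node.client() as c:
            current_tx = TxID.from_str(
                c.get("/node/commit").body.json()["transaction_id"]
            )
        if current_tx.seqno > original_tx.seqno:
            return current_tx
        time.sleep(0.1)
    assert False, f"Commit did not advance beyond {original_tx}"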
def test_new_joiner_helps_liveness(network, args):
    primary, backups = network.find_nodes()

    # Issue some transactions, so there is a ledger history that a new node must receive
    network.txs.issue(network, number_txs=10)

    # Remove a node, leaving the network frail
    network.retire_node(primary, backups[-1])
    backups[-1].stop()

    primary, backups = network.find_nodes()

    with contextlib.ExitStack() as stack:
        # Add a new node, but partition it before trusting it
        new_node = network.create_node("local://localhost")
        network.join_node(new_node, args.package, args, from_snapshot=False)
        new_joiner_partition = [new_node]
        new_joiner_rules = stack.enter_context(
            network.partitioner.partition([primary, *backups], new_joiner_partition)
        )

        # Trust the new node, and wait for commit of this (but don't ask the new
        # node itself, which doesn't know this yet)
        network.trust_node(new_node, args, no_wait=True)
        check_can_progress(primary)

        # Partition the primary, temporarily creating a minority service that cannot make progress
        minority_partition = backups[len(backups) // 2 :] + new_joiner_partition
        minority_rules = stack.enter_context(
            network.partitioner.partition(minority_partition)
        )

        # This is an unusual situation, where we have actually produced a dead partitioned node.
        # Initially any write requests will time out (failed attempt at forwarding), and then
        # the node transitions to a candidate with nobody to talk to. Rather than trying to
        # catch the errors of these states quickly, we just sleep until the latter state is
        # reached, and then confirm it was reached.
        time.sleep(network.observed_election_duration)
        with backups[0].client("user0") as c:
            r = c.post("/app/log/private", {"id": 42, "msg": "Hello world"})
            assert r.status_code == http.HTTPStatus.SERVICE_UNAVAILABLE

        # Restore the new node to the service
        new_joiner_rules.drop()

        # Confirm that the new node catches up, and that progress can be made in this majority partition
        network.wait_for_new_primary(primary, minority_partition)
        check_can_progress(new_node)

        # Explicitly drop rules before continuing
        minority_rules.drop()

        network.wait_for_primary_unanimity()
        primary, _ = network.find_nodes()
        network.wait_for_all_nodes_to_commit(primary=primary)
def test_retire_primary(network, args):
    pre_count = count_nodes(node_configs(network), network)

    primary, backup = network.find_primary_and_any_backup()
    network.retire_node(primary, primary, timeout=15)
    # Query this backup to find the new primary. If we ask any other
    # node, then this backup may not know the new primary by the
    # time we call check_can_progress.
    network.wait_for_new_primary(primary, nodes=[backup])
    check_can_progress(backup)

    post_count = count_nodes(node_configs(network), network)
    assert pre_count == post_count + 1

    primary.stop()
    wait_for_reconfiguration_to_complete(network)
    return network
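# `node_configs` and `count_nodes` are likewise defined elsewhere. A rough
# sketch of their assumed behaviour, using a hypothetical /node/config endpoint
# that returns each node's view of the network membership (details may differ
# from the real helpers):
def node_configs_sketch(network):
    configs = {}
    for node in network.nodes:
        try:
            with node.client() as nc:
                configs[node.node_id] = nc.get("/node/config").body.json()
        except Exception:
            # Stopped or unreachable nodes simply don't contribute a config
            pass
    return configs


def count_nodes_sketch(configs, network):
    # Check that all responsive nodes agree on the (non-stopped) membership,
    # and return its size
    nodes = set(str(node_id) for node_id in configs.keys())
    stopped = {str(n.node_id) for n in network.nodes if n.is_stopped()}
    for node_id, node_config in configs.items():
        nodes_in_config = set(node_config.keys()) - stopped
        assert nodes == nodes_in_config, f"{nodes} != {nodes_in_config} (from {node_id})"
    return len(nodes)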
def test_kill_primary(network, args):
    primary, _ = network.find_primary_and_any_backup()
    primary.stop()
    network.wait_for_new_primary(primary)

    # Verify that the TxID reported just after an election is valid
    # Note that the first TxID read after an election may be of a signature
    # Tx (time-based signature generation) in the new term rather than the
    # last entry in the previous term
    for node in network.get_joined_nodes():
        with node.client() as c:
            r = c.get("/node/network")
            c.wait_for_commit(r)

    return network
def run_auto(args):
    with infra.network.network(
        args.nodes, args.binary_dir, args.debug_nodes, args.perf_nodes, pdb=args.pdb
    ) as network:
        network.start_and_join(args)
        test_jwt_without_key_policy(network, args)
        if args.enclave_type != "virtual":
            test_jwt_with_sgx_key_policy(network, args)
            test_jwt_with_sgx_key_filter(network, args)
        test_jwt_key_auto_refresh(network, args)

        # Check that auto refresh also works on backups
        primary, _ = network.find_primary()
        primary.stop()
        network.wait_for_new_primary(primary)
        test_jwt_key_auto_refresh(network, args)
def test_kill_primary_no_reqs(network, args):
    old_primary, _ = network.find_primary_and_any_backup()
    old_primary.stop()
    new_primary, _ = network.wait_for_new_primary(old_primary)

    # Verify that the TxID reported just after an election is valid
    # Note that the first TxID read after an election may be of a signature
    # Tx (time-based signature generation) in the new term rather than the
    # last entry in the previous term
    for node in network.get_joined_nodes():
        with node.client() as c:
            r = c.get("/node/network")
            c.wait_for_commit(r)

            # Also verify that the reported last ack times are as expected
            r = c.get("/node/consensus")
            acks = r.body.json()["details"]["acks"]
            for ack in acks.values():
                if node is new_primary:
                    assert (
                        ack["last_received_ms"] < network.args.election_timeout_ms
                    ), acks
                else:
                    assert (
                        ack["last_received_ms"] == 0
                    ), f"Backup {node.local_node_id} should report a last ack time of 0: {acks}"

    return network
def test_update_all_nodes(network, args):
    primary, _ = network.find_nodes()

    first_code_id, new_code_id = [
        get_code_id(args.oe_binary, infra.path.build_lib_path(pkg, args.enclave_type))
        for pkg in [args.package, args.replacement_package]
    ]

    LOG.info("Add new code id")
    network.consortium.add_new_code(primary, new_code_id)

    with primary.client() as uc:
        r = uc.get("/node/code")
        versions = sorted(r.body.json()["versions"], key=lambda x: x["digest"])
        expected = sorted(
            [
                {"digest": first_code_id, "status": "ALLOWED_TO_JOIN"},
                {"digest": new_code_id, "status": "ALLOWED_TO_JOIN"},
            ],
            key=lambda x: x["digest"],
        )
        assert versions == expected, versions

    LOG.info("Remove old code id")
    network.consortium.retire_code(primary, first_code_id)

    with primary.client() as uc:
        r = uc.get("/node/code")
        versions = sorted(r.body.json()["versions"], key=lambda x: x["digest"])
        expected = sorted(
            [
                {"digest": new_code_id, "status": "ALLOWED_TO_JOIN"},
            ],
            key=lambda x: x["digest"],
        )
        assert versions == expected, versions

    old_nodes = network.nodes.copy()

    LOG.info("Start fresh nodes running new code")
    for _ in range(0, len(network.nodes)):
        new_node = network.create_and_trust_node(
            args.replacement_package, "local://localhost", args
        )
        assert new_node

    LOG.info("Retire original nodes running old code")
    for node in old_nodes:
        primary, _ = network.find_nodes()
        network.consortium.retire_node(primary, node)
        # Elections take (much) longer than a backup removal, which is just
        # a commit, so we need to adjust our timeout accordingly, hence this branch
        if node.node_id == primary.node_id:
            new_primary, new_term = network.wait_for_new_primary(primary.node_id)
            LOG.debug(f"New primary is {new_primary.node_id} in term {new_term}")
            primary = new_primary
        network.nodes.remove(node)
        node.stop()

    LOG.info("Check the network is still functional")
    reconfiguration.check_can_progress(new_node)
    return network
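# `get_code_id` computes the enclave measurement that serves as a code id.
# A sketch for SGX builds, assuming the Open Enclave `oesign dump` tool, whose
# output contains an `mrenclave=<hex digest>` line; virtual builds are handled
# differently by the real helper.
import os
import subprocess


def get_code_id_sketch(oe_binary_dir, lib_path):
    res = subprocess.run(
        [os.path.join(oe_binary_dir, "oesign"), "dump", "-e", lib_path],
        capture_output=True,
        check=True,
    )
    # Find the measurement line in the dumped enclave properties
    for line in res.stdout.decode().splitlines():
        if line.startswith("mrenclave="):
            return line.strip().split("=")[1]
    raise ValueError(f"Could not find mrenclave for {lib_path}")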
def test_kill_primary(network, args):
    primary, _ = network.find_primary()
    primary.stop()
    new_primary, new_term = network.wait_for_new_primary(primary.node_id)
    LOG.debug(f"New primary is {new_primary.node_id} in term {new_term}")

    return network
def test_kill_primary(network, args):
    primary, backup = network.find_primary_and_any_backup()
    primary.stop()

    # When the consensus is BFT, there is no status-message timer that triggers a
    # new election; an election is instead triggered when a message times out
    # without executing. We send a message that will not execute because of the
    # stopped primary, which will then trigger a view change.
    if args.consensus == "bft":
        try:
            with backup.client("user0") as c:
                _ = c.post(
                    "/app/log/private",
                    {
                        "id": -1,
                        "msg": "This is submitted to force a view change",
                    },
                )
        except CCFConnectionException:
            LOG.warning(f"Could not successfully connect to node {backup.node_id}.")

    new_primary, new_term = network.wait_for_new_primary(primary.node_id)
    LOG.debug(f"New primary is {new_primary.node_id} in term {new_term}")

    return network
def test_isolate_and_reconnect_primary(network, args, **kwargs):
    primary, backups = network.find_nodes()
    with network.partitioner.partition(backups):
        lost_tx_resp = check_does_not_progress(primary)

        new_primary, _ = network.wait_for_new_primary(
            primary, nodes=backups, timeout_multiplier=6
        )
        new_tx_resp = check_can_progress(new_primary)

    # Check reconnected former primary has caught up
    with primary.client() as c:
        try:
            # There will be at least one full election cycle for nothing, where the
            # re-joining node fails to get elected but causes others to rev up their
            # term. After that, a successful election needs to take place, and we
            # arbitrarily allow several election periods to avoid being too brittle
            # when Raft timeouts line up badly.
            c.wait_for_commit(new_tx_resp, timeout=(network.election_duration * 4))
        except TimeoutError:
            details = c.get("/node/consensus").body.json()
            assert (
                False
            ), f"Stuck before {new_tx_resp.view}.{new_tx_resp.seqno}: {pprint.pformat(details)}"

        # Check that the node has dropped anything submitted while partitioned
        r = c.get(f"/node/tx?transaction_id={lost_tx_resp.view}.{lost_tx_resp.seqno}")
        status = TxStatus(r.body.json()["status"])
        assert status == TxStatus.Invalid, r
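# Counterpart sketch for `check_does_not_progress`, used above: submit a write
# and assert that global commit does *not* advance within the timeout, returning
# the response so that callers can later check the transaction's fate. As with
# the other sketches, names and details are assumptions rather than the
# canonical implementation.
def check_does_not_progress_sketch(node, timeout=3):
    import time

    from ccf.tx_id import TxID  # assumed import path

    with node.client() as c:
        original_tx = TxID.from_str(
            c.get("/node/commit").body.json()["transaction_id"]
        )
    with node.client("user0") as uc:
        r = uc.post("/app/log/private", {"id": 42, "msg": "Will not commit"})
    end_time = time.time() + timeout
    while time.time() < end_time:
        with node.client() as c:
            current_tx = TxID.from_str(
                c.get("/node/commit").body.json()["transaction_id"]
            )
        assert (
            current_tx.seqno <= original_tx.seqno
        ), f"Commit unexpectedly advanced to {current_tx}"
        time.sleep(0.1)
    return r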
def test_suspend_primary(network, args):
    primary, _ = network.find_primary()
    primary.suspend()
    new_primary, _ = network.wait_for_new_primary(primary)
    check_can_progress(new_primary)
    primary.resume()
    check_can_progress(new_primary)
    return network
def test_join_straddling_primary_replacement(network, args):
    # We need a fourth node before we attempt the replacement, otherwise
    # we would reach a situation where two out of four nodes in the voting
    # quorum are unable to participate (one retired and one not yet joined).
    test_add_node(network, args)
    primary, _ = network.find_primary()
    new_node = network.create_node("local://localhost")
    network.join_node(new_node, args.package, args)
    proposal_body = {
        "actions": [
            {
                "name": "transition_node_to_trusted",
                "args": {
                    "node_id": new_node.node_id,
                    "valid_from": str(datetime.now()),
                },
            },
            {
                "name": "remove_node",
                "args": {"node_id": primary.node_id},
            },
        ]
    }

    proposal = network.consortium.get_any_active_member().propose(
        primary, proposal_body
    )
    network.consortium.vote_using_majority(
        primary,
        proposal,
        {"ballot": "export function vote (proposal, proposer_id) { return true }"},
        timeout=10,
    )

    network.wait_for_new_primary(primary)
    new_node.wait_for_node_to_join(timeout=10)

    primary.stop()
    network.nodes.remove(primary)
    wait_for_reconfiguration_to_complete(network)
    return network
def test_suspend_primary(network, args):
    primary, _ = network.find_primary()
    primary.suspend()
    new_primary, new_term = network.wait_for_new_primary(primary.node_id)
    LOG.debug(f"New primary is {new_primary.node_id} in term {new_term}")
    reconfiguration.check_can_progress(new_primary)
    primary.resume()
    reconfiguration.check_can_progress(new_primary)
    return network
def test_partition_majority(network, args):
    primary, backups = network.find_nodes()

    # Create a partition with the primary + half of the remaining nodes (i.e. a majority)
    partition = [primary]
    partition.extend(backups[len(backups) // 2 :])

    # Wait for all nodes to have reached the same level of commit, so that
    # nodes outside of the partition can become primary after this one is dropped
    network.wait_for_all_nodes_to_commit(primary=primary)

    # The primary should remain stable while the partition is active
    # Note: Context manager
    with network.partitioner.partition(partition):
        try:
            network.wait_for_new_primary(primary)
            assert False, "No new primary should be elected when partitioning majority"
        except TimeoutError:
            pass

    # A new leader should be elected once the partition is dropped
    network.wait_for_new_primary(primary)

    return network
def run(args):
    # This is deliberately 5, because the rest of the test depends on this
    # to grow a prefix and allow just enough nodes to resume to reach the
    # desired election result. Conversion to a general f isn't trivial.
    hosts = ["local://localhost"] * 5

    with infra.network.network(
        hosts, args.binary_dir, args.debug_nodes, args.perf_nodes, pdb=args.pdb
    ) as network:
        network.start_and_join(args)
        primary, backups = network.find_nodes()

        # Suspend three of the backups to prevent commit
        backups[1].suspend()
        backups[2].suspend()
        backups[3].stop()

        txs = []

        # Run some transactions that can't be committed
        with primary.client("user0") as uc:
            for i in range(3):
                txs.append(
                    uc.post("/app/log/private", {"id": 100 + i, "msg": "Hello world"})
                )

        sig_view, sig_seqno = txs[-1].view, txs[-1].seqno + 1
        with backups[0].client() as bc:
            wait_for_pending(bc, sig_view, sig_seqno)

        # Kill the primary, restore other backups
        primary.stop()
        backups[1].resume()
        backups[2].resume()
        new_primary, new_term = network.wait_for_new_primary(
            primary.node_id, timeout_multiplier=6
        )
        LOG.debug(f"New primary is {new_primary.node_id} in term {new_term}")

        # Check that the uncommitted but committable suffix is preserved
        with new_primary.client("user0") as uc:
            check_commit = infra.checker.Checker(uc)
            for tx in txs:
                check_commit(tx)
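# `wait_for_pending` is assumed to poll the transaction status endpoint until
# the given TxID is reported as Pending, i.e. replicated (committable) but not
# yet globally committed. A sketch using the /node/tx endpoint and the TxStatus
# values seen elsewhere in these tests (illustrative, not the canonical helper):
import http
import time

from ccf.tx_status import TxStatus  # assumed import path


def wait_for_pending_sketch(client, view, seqno, timeout=3):
    end_time = time.time() + timeout
    while time.time() < end_time:
        r = client.get(f"/node/tx?transaction_id={view}.{seqno}")
        assert r.status_code == http.HTTPStatus.OK, r
        status = TxStatus(r.body.json()["status"])
        if status == TxStatus.Pending:
            return
        elif status == TxStatus.Invalid:
            raise RuntimeError(f"Transaction {view}.{seqno} is marked invalid")
        time.sleep(0.1)
    raise TimeoutError(f"Transaction {view}.{seqno} did not become pending")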
def test_isolate_primary_from_one_backup(network, args):
    p, backups = network.find_nodes()
    b_0, b_1 = backups

    # Issue one transaction, waiting for all nodes to have reached
    # the same level of commit, so that nodes outside of the partition can
    # become primary after this one is dropped
    # Note: Because of https://github.com/microsoft/CCF/issues/2224, we need to
    # issue a write transaction instead of just reading the TxID of the latest entry
    initial_txid = network.txs.issue(network)

    # Isolate first backup from primary so that first backup becomes candidate
    # in a new term and wins the election
    # Note: Managed manually
    rules = network.partitioner.isolate_node(p, b_0)

    # Now wait for several elections to occur. We expect:
    # - b_0 to call and win an election with b_1's help
    # - b_0 to produce a new signature, and commit it with b_1's help
    # - p to call its own election, and lose because it doesn't have this signature
    # - In the resulting election race:
    #   - If p calls first, it loses and we're in the same situation
    #   - If b_0 calls first, it wins, but then p calls its election and we've returned to the same situation
    #   - If b_1 calls first, it can win and then bring _both_ nodes up-to-date, becoming a _stable_ primary
    # So we repeat elections until b_1 is primary
    new_primary = network.wait_for_primary_unanimity(
        min_view=initial_txid.view, timeout_multiplier=30
    )
    assert new_primary == b_1

    new_view = network.txs.issue(network).view

    # The partition is now between 2 backups, but both can talk to the new primary
    # Explicitly drop rules before continuing
    rules.drop()

    # Original primary should now, or very soon, report the new primary
    new_primary_, new_view_ = network.wait_for_new_primary(p, nodes=[p])
    assert (
        new_primary == new_primary_
    ), f"New primary {new_primary_.local_node_id} after partition is dropped is different from before {new_primary.local_node_id}"
    assert (
        new_view == new_view_
    ), f"Consensus view {new_view} should not have changed after partition is dropped: now {new_view_}"

    return network
def test_new_service(
    network,
    args,
    install_path,
    binary_dir,
    library_dir,
    version,
    cycle_existing_nodes=False,
):
    LOG.info("Update constitution")
    primary, _ = network.find_primary()
    new_constitution = get_new_constitution_for_install(args, install_path)
    network.consortium.set_constitution(primary, new_constitution)

    # Note: Changes to constitution between versions should be tested here

    LOG.info(f"Add node to new service [cycle nodes: {cycle_existing_nodes}]")
    nodes_to_cycle = network.get_joined_nodes() if cycle_existing_nodes else []
    nodes_to_add_count = len(nodes_to_cycle) if cycle_existing_nodes else 1

    for _ in range(0, nodes_to_add_count):
        new_node = network.create_node(
            "local://localhost",
            binary_dir=binary_dir,
            library_dir=library_dir,
            version=version,
        )
        network.join_node(new_node, args.package, args)
        network.trust_node(new_node, args)
        new_node.verify_certificate_validity_period(
            expected_validity_period_days=DEFAULT_NODE_CERTIFICATE_VALIDITY_DAYS
        )

    for node in nodes_to_cycle:
        network.retire_node(primary, node)
        if primary == node:
            primary, _ = network.wait_for_new_primary(primary)
        node.stop()

    test_all_nodes_cert_renewal(network, args)

    LOG.info("Apply transactions to new nodes only")
    issue_activity_on_live_service(network, args)
    test_random_receipts(network, args, lts=True)
def test_isolate_and_reconnect_primary(network, args):
    primary, backups = network.find_nodes()
    with network.partitioner.partition(backups):
        new_primary, _ = network.wait_for_new_primary(
            primary, nodes=backups, timeout_multiplier=6
        )
        new_tx = check_can_progress(new_primary)

    # Check reconnected former primary has caught up
    with primary.client() as c:
        r = c.get("/node/commit")
        timeout = 5
        end_time = time.time() + timeout
        while time.time() < end_time:
            current_tx = TxID.from_str(
                c.get("/node/commit").body.json()["transaction_id"]
            )
            if current_tx.seqno >= new_tx.seqno:
                return network
            time.sleep(0.1)
        assert False, f"Stuck at {r}"
def test_retire_primary(network, args):
    pre_count = count_nodes(node_configs(network), network)

    primary, backup = network.find_primary_and_any_backup()
    network.retire_node(primary, primary, timeout=15)
    # Query this backup to find the new primary. If we ask any other
    # node, then this backup may not know the new primary by the
    # time we call check_can_progress.
    new_primary, _ = network.wait_for_new_primary(primary, nodes=[backup])
    # The old primary should automatically be removed from the store
    # once a new primary is elected
    network.wait_for_node_in_store(
        new_primary,
        primary.node_id,
        node_status=None,
        timeout=3,
    )
    check_can_progress(backup)

    post_count = count_nodes(node_configs(network), network)
    assert pre_count == post_count + 1

    primary.stop()
    wait_for_reconfiguration_to_complete(network)
    return network
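# `wait_for_reconfiguration_to_complete` is also defined elsewhere. A heavily
# hedged sketch of its assumed behaviour: poll each joined node until everyone
# reports a single active consensus configuration (this assumes the
# /node/consensus details include a `configs` list; the real helper may work
# differently).
def wait_for_reconfiguration_to_complete_sketch(network, timeout=10):
    import time

    end_time = time.time() + timeout
    while time.time() < end_time:
        pending = False
        for node in network.get_joined_nodes():
            with node.client() as c:
                details = c.get("/node/consensus").body.json()["details"]
                if len(details.get("configs", [])) > 1:
                    # More than one active configuration: reconfiguration in flight
                    pending = True
        if not pending:
            return
        time.sleep(0.1)
    raise TimeoutError("Reconfiguration did not complete in time")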
def run(args):
    # This is deliberately 5, because the rest of the test depends on this
    # to grow a prefix and allow just enough nodes to resume to reach the
    # desired election result. Conversion to a general f isn't trivial.
    hosts = ["local://localhost"] * 5

    with infra.network.network(
        hosts, args.binary_dir, args.debug_nodes, args.perf_nodes, pdb=args.pdb
    ) as network:
        network.start_and_open(args)
        primary, backups = network.find_nodes()

        # Suspend three of the backups to prevent commit
        backups[1].suspend()
        backups[2].suspend()
        backups[3].stop()

        committable_txs = []

        # Run some transactions that can't be committed now
        with primary.client("user0") as uc:
            for i in range(3):
                committable_txs.append(
                    uc.post("/app/log/private", {"id": 100 + i, "msg": "Hello world"})
                )

        last_tx = committable_txs[-1]
        sig_view, sig_seqno = last_tx.view, last_tx.seqno + 1
        with backups[0].client() as bc:
            wait_for_pending(bc, sig_view, sig_seqno)

        # Suspend the final backup and run some transactions which only the partitioned
        # primary hears, and which should be discarded by the new primary
        # NB: We can't guarantee that these will be discarded. Since we can't control
        # what order the queued actions occur in after resuming, they may be appended
        # before an election is called. The key assertion is that this primary is able
        # to rejoin the network whatever happens, even when (in the usual case) it
        # holds a suffix which has been discarded.
        backups[0].suspend()
        post_partition_txs = []
        with primary.client("user0") as uc:
            for i in range(3):
                post_partition_txs.append(
                    uc.post("/app/log/private", {"id": 100 + i, "msg": "Hello world"})
                )

        # Sleep long enough that this primary should be instantly replaced when nodes wake
        sleep_time = 2 * args.election_timeout_ms / 1000
        LOG.info(f"Sleeping {sleep_time}s")
        time.sleep(sleep_time)

        # Suspend the primary, resume the other backups
        primary.suspend()
        backups[0].resume()
        backups[1].resume()
        backups[2].resume()
        new_primary, _ = network.wait_for_new_primary(primary, timeout_multiplier=10)

        with new_primary.client("user0") as uc:
            # Check that the uncommitted but committable suffix is preserved
            check_commit = infra.checker.Checker(uc)
            for tx in committable_txs:
                check_commit(tx)

        # Check that new transactions can be committed
        with new_primary.client("user0") as uc:
            for i in range(3):
                r = uc.post("/app/log/private", {"id": 100 + i, "msg": "Hello world"})
                assert r.status_code == 200
                uc.wait_for_commit(r)

        # Resume the original primary, and check that it rejoins correctly,
        # including the new transactions
        primary.resume()
        network.wait_for_node_commit_sync(timeout=16)
def test_isolate_primary_from_one_backup(network, args):
    p, backups = network.find_nodes()
    b_0, b_1 = backups

    # Issue one transaction, waiting for all nodes to have reached
    # the same level of commit, so that nodes outside of the partition can
    # become primary after this one is dropped
    # Note: Because of https://github.com/microsoft/CCF/issues/2224, we need to
    # issue a write transaction instead of just reading the TxID of the latest entry
    initial_txid = network.txs.issue(network)

    # Isolate first backup from primary so that first backup becomes candidate
    # in a new term and wins the election
    # Note: Managed manually
    rules = network.partitioner.isolate_node(p, b_0)

    LOG.info(
        f"Check that primary {p.local_node_id} reports an increasing last ack time for partitioned backup {b_0.local_node_id}"
    )
    last_ack = 0
    while True:
        with p.client() as c:
            r = c.get("/node/consensus", log_capture=[]).body.json()["details"]
            ack = r["acks"][b_0.node_id]["last_received_ms"]
        if r["primary_id"] is not None:
            assert (
                ack >= last_ack
            ), f"Nodes {p.local_node_id} and {b_0.local_node_id} are no longer partitioned"
            last_ack = ack
        else:
            LOG.debug(f"Node {p.local_node_id} is no longer primary")
            break
        time.sleep(0.1)

    # Now wait for several elections to occur. We expect:
    # - b_0 to call and win an election with b_1's help
    # - b_0 to produce a new signature, and commit it with b_1's help
    # - p to call its own election, and lose because it doesn't have this signature
    # - In the resulting election race:
    #   - If p calls first, it loses and we're in the same situation
    #   - If b_0 calls first, it wins, but then p calls its election and we've returned to the same situation
    #   - If b_1 calls first, it can win and then bring _both_ nodes up-to-date, becoming a _stable_ primary
    # So we repeat elections until b_1 is primary
    new_primary = network.wait_for_primary_unanimity(
        min_view=initial_txid.view, timeout_multiplier=30
    )
    assert new_primary == b_1

    new_view = network.txs.issue(network).view

    # The partition is now between 2 backups, but both can talk to the new primary
    # Explicitly drop rules before continuing
    rules.drop()

    LOG.info(f"Check that new primary {new_primary.local_node_id} reports stable acks")
    last_ack = 0
    end_time = time.time() + 2 * network.args.election_timeout_ms // 1000
    while time.time() < end_time:
        with new_primary.client() as c:
            acks = c.get("/node/consensus", log_capture=[]).body.json()["details"]["acks"]
        delayed_acks = [
            ack
            for ack in acks.values()
            if ack["last_received_ms"] > args.election_timeout_ms
        ]
        if delayed_acks:
            raise RuntimeError(f"New primary reported some delayed acks: {acks}")
        time.sleep(0.1)

    # Original primary should now, or very soon, report the new primary
    new_primary_, new_view_ = network.wait_for_new_primary(p, nodes=[p])
    assert (
        new_primary == new_primary_
    ), f"New primary {new_primary_.local_node_id} after partition is dropped is different from before {new_primary.local_node_id}"
    assert (
        new_view == new_view_
    ), f"Consensus view {new_view} should not have changed after partition is dropped: now {new_view_}"

    return network
def run(args): hosts = ["localhost", "localhost"] with infra.network.network(hosts, args.binary_dir, args.debug_nodes, args.perf_nodes, pdb=args.pdb) as network: network.start_and_join(args) primary, _ = network.find_nodes() first_code_id = get_code_id( infra.path.build_lib_path(args.package, args.enclave_type)) with primary.client() as uc: r = uc.get("/node/code") assert r.body.json() == { "versions": [{ "digest": first_code_id, "status": "ACCEPTED" }], }, r.body LOG.info("Adding a new node") new_node = network.create_and_trust_node(args.package, "localhost", args) assert new_node new_code_id = get_code_id( infra.path.build_lib_path(args.patched_file_name, args.enclave_type)) LOG.info(f"Adding a node with unsupported code id {new_code_id}") code_not_found_exception = None try: network.create_and_add_pending_node(args.patched_file_name, "localhost", args, timeout=3) except infra.network.CodeIdNotFound as err: code_not_found_exception = err assert ( code_not_found_exception is not None ), f"Adding a node with unsupported code id {new_code_id} should fail" # Slow quote verification means that any attempt to add a node may cause an election, so confirm primary after adding node primary, _ = network.find_primary() network.consortium.add_new_code(primary, new_code_id) with primary.client() as uc: r = uc.get("/node/code") versions = sorted(r.body.json()["versions"], key=lambda x: x["digest"]) expected = sorted( [ { "digest": first_code_id, "status": "ACCEPTED" }, { "digest": new_code_id, "status": "ACCEPTED" }, ], key=lambda x: x["digest"], ) assert versions == expected, versions new_nodes = set() old_nodes_count = len(network.nodes) new_nodes_count = old_nodes_count + 1 LOG.info( f"Adding more new nodes ({new_nodes_count}) than originally existed ({old_nodes_count})" ) for _ in range(0, new_nodes_count): new_node = network.create_and_trust_node(args.patched_file_name, "localhost", args) assert new_node new_nodes.add(new_node) LOG.info("Stopping all original nodes") old_nodes = set(network.nodes).difference(new_nodes) for node in old_nodes: LOG.debug(f"Stopping old node {node.node_id}") node.stop() new_primary, _ = network.wait_for_new_primary(primary.node_id) LOG.info(f"New_primary is {new_primary.node_id}") LOG.info("Adding another node to the network") new_node = network.create_and_trust_node(args.patched_file_name, "localhost", args) assert new_node network.wait_for_node_commit_sync(args.consensus) LOG.info("Remove first code id") network.consortium.retire_code(new_node, first_code_id) with new_node.client() as uc: r = uc.get("/node/code") versions = sorted(r.body.json()["versions"], key=lambda x: x["digest"]) expected = sorted( [ { "digest": first_code_id, "status": "RETIRED" }, { "digest": new_code_id, "status": "ACCEPTED" }, ], key=lambda x: x["digest"], ) assert versions == expected, versions LOG.info(f"Adding a node with retired code id {first_code_id}") code_not_found_exception = None try: network.create_and_add_pending_node(args.package, "localhost", args, timeout=3) except infra.network.CodeIdRetired as err: code_not_found_exception = err assert ( code_not_found_exception is not None ), f"Adding a node with unsupported code id {new_code_id} should fail" LOG.info("Adding another node with the new code to the network") new_node = network.create_and_trust_node(args.patched_file_name, "localhost", args) assert new_node network.wait_for_node_commit_sync(args.consensus)
def test_new_service(
    network,
    args,
    install_path,
    binary_dir,
    library_dir,
    version,
    cycle_existing_nodes=False,
):
    LOG.info("Update constitution")
    primary, _ = network.find_primary()
    new_constitution = get_new_constitution_for_install(args, install_path)
    network.consortium.set_constitution(primary, new_constitution)

    all_nodes = network.get_joined_nodes()

    # Note: Changes to constitution between versions should be tested here

    LOG.info(f"Add node to new service [cycle nodes: {cycle_existing_nodes}]")
    nodes_to_cycle = network.get_joined_nodes() if cycle_existing_nodes else []
    nodes_to_add_count = len(nodes_to_cycle) if cycle_existing_nodes else 1

    # Pre-2.0 nodes require X509 time format
    valid_from = str(infra.crypto.datetime_to_X509time(datetime.datetime.now()))

    for _ in range(0, nodes_to_add_count):
        new_node = network.create_node(
            "local://localhost",
            binary_dir=binary_dir,
            library_dir=library_dir,
            version=version,
        )
        network.join_node(new_node, args.package, args)
        network.trust_node(
            new_node,
            args,
            valid_from=valid_from,
        )
        new_node.verify_certificate_validity_period(
            expected_validity_period_days=DEFAULT_NODE_CERTIFICATE_VALIDITY_DAYS
        )
        all_nodes.append(new_node)

    for node in nodes_to_cycle:
        network.retire_node(primary, node)
        if primary == node:
            primary, _ = network.wait_for_new_primary(primary)
        node.stop()

    test_all_nodes_cert_renewal(network, args, valid_from=valid_from)
    test_service_cert_renewal(network, args, valid_from=valid_from)

    LOG.info("Waiting for retired nodes to be automatically removed")
    for node in all_nodes:
        network.wait_for_node_in_store(
            primary,
            node.node_id,
            node_status=ccf.ledger.NodeStatus.TRUSTED if node.is_joined() else None,
        )

    if args.check_2tx_reconfig_migration:
        test_migration_2tx_reconfiguration(
            network,
            args,
            initial_is_1tx=False,  # Reconfiguration type added in 2.x
            binary_dir=binary_dir,
            library_dir=library_dir,
            version=version,
            valid_from=valid_from,
        )

    LOG.info("Apply transactions to new nodes only")
    issue_activity_on_live_service(network, args)
    test_random_receipts(network, args, lts=True)
def run_code_upgrade_from(
    args,
    from_install_path,
    to_install_path,
    from_version=None,
    to_version=None,
    from_container_image=None,
):
    from_binary_dir, from_library_dir = get_bin_and_lib_dirs_for_install_path(
        from_install_path
    )
    to_binary_dir, to_library_dir = get_bin_and_lib_dirs_for_install_path(
        to_install_path
    )

    set_js_args(args, from_install_path, to_install_path)

    jwt_issuer = infra.jwt_issuer.JwtIssuer(
        "https://localhost", refresh_interval=args.jwt_key_refresh_interval_s
    )
    with jwt_issuer.start_openid_server():
        txs = app.LoggingTxs(jwt_issuer=jwt_issuer)
        with infra.network.network(
            args.nodes,
            binary_directory=from_binary_dir,
            library_directory=from_library_dir,
            pdb=args.pdb,
            txs=txs,
            jwt_issuer=jwt_issuer,
            version=from_version,
        ) as network:
            network.start_and_open(args, node_container_image=from_container_image)

            old_nodes = network.get_joined_nodes()
            primary, _ = network.find_primary()

            LOG.info("Apply transactions to old service")
            issue_activity_on_live_service(network, args)

            new_code_id = infra.utils.get_code_id(
                args.enclave_type,
                args.oe_binary,
                args.package,
                library_dir=to_library_dir,
            )
            network.consortium.add_new_code(primary, new_code_id)

            # Note: alternate between joining from snapshot and replaying entire ledger
            new_nodes = []
            from_snapshot = True
            for _ in range(0, len(old_nodes)):
                new_node = network.create_node(
                    "local://localhost",
                    binary_dir=to_binary_dir,
                    library_dir=to_library_dir,
                    version=to_version,
                )
                network.join_node(
                    new_node, args.package, args, from_snapshot=from_snapshot
                )
                network.trust_node(
                    new_node,
                    args,
                    valid_from=str(
                        # Pre-2.0 nodes require X509 time format
                        infra.crypto.datetime_to_X509time(datetime.datetime.now())
                    ),
                )
                # For 2.x nodes joining a 1.x service before the constitution is updated,
                # the node certificate validity period is set by the joining node itself
                # as [node startup time, node startup time + 365 days]
                new_node.verify_certificate_validity_period(
                    expected_validity_period_days=DEFAULT_NODE_CERTIFICATE_VALIDITY_DAYS,
                    ignore_proposal_valid_from=True,
                )
                from_snapshot = not from_snapshot
                new_nodes.append(new_node)

            # Verify that all nodes run the expected CCF version
            for node in network.get_joined_nodes():
                # Note: /node/version endpoint was added in 2.x
                if not node.major_version or node.major_version > 1:
                    with node.client() as c:
                        r = c.get("/node/version")
                        expected_version = node.version or args.ccf_version
                        version = r.body.json()["ccf_version"]
                        assert (
                            version == expected_version
                        ), f"For node {node.local_node_id}, expected version {expected_version}, got {version}"

            LOG.info("Apply transactions to hybrid network, with primary as old node")
            issue_activity_on_live_service(network, args)

            old_code_id = infra.utils.get_code_id(
                args.enclave_type,
                args.oe_binary,
                args.package,
                library_dir=from_library_dir,
            )
            primary, _ = network.find_primary()
            network.consortium.retire_code(primary, old_code_id)

            for index, node in enumerate(old_nodes):
                network.retire_node(primary, node)
                if primary == node:
                    primary, _ = network.wait_for_new_primary(primary)

                # This block is here to test the transition period from a network that
                # does not support custom claims to one that does. It can be removed
                # after the transition is complete.
                #
                # The new build, being unreleased, doesn't have a version at all
                if not primary.major_version:
                    LOG.info("Upgrade to new JS app")
                    # Upgrade to a version of the app containing an endpoint that
                    # registers custom claims
                    network.consortium.set_js_app_from_dir(
                        primary, args.new_js_app_bundle
                    )
                    LOG.info("Run transaction with additional claim")
                    # With wait_for_sync, the client checks that all nodes, including
                    # the minority of old ones, have acked the transaction
                    msg_idx = network.txs.idx + 1
                    txid = network.txs.issue(
                        network, number_txs=1, record_claim=True, wait_for_sync=True
                    )
                    assert len(network.txs.pub[msg_idx]) == 1
                    claims = network.txs.pub[msg_idx][-1]["msg"]

                    LOG.info("Check receipts are fine, including transaction with claims")
                    test_random_receipts(
                        network,
                        args,
                        lts=True,
                        additional_seqnos={txid.seqno: claims.encode()},
                    )

                    # Also check receipts on an old node
                    if index + 1 < len(old_nodes):
                        next_node = old_nodes[index + 1]
                        test_random_receipts(
                            network,
                            args,
                            lts=True,
                            additional_seqnos={txid.seqno: None},
                            node=next_node,
                        )
                node.stop()

            LOG.info("Service is now made of new nodes only")

            # Rollover JWKS so that the new primary must read the historical CA
            # bundle table and retrieve new keys via auto refresh
            if not os.getenv("CONTAINER_NODES"):
                jwt_issuer.refresh_keys()
                # Note: /gov/jwt_keys/all endpoint was added in 2.x
                primary, _ = network.find_nodes()
                if not primary.major_version or primary.major_version > 1:
                    jwt_issuer.wait_for_refresh(network)
                else:
                    time.sleep(3)
            else:
                # https://github.com/microsoft/CCF/issues/2608#issuecomment-924785744
                LOG.warning("Skipping JWT refresh as running nodes in container")

            # Code update from 1.x to 2.x requires cycling the freshly-added 2.x nodes
            # once. This is because 2.x nodes will not have an endorsed certificate
            # recorded in the store and thus will not be able to have their certificate
            # refreshed, etc.
            test_new_service(
                network,
                args,
                to_install_path,
                to_binary_dir,
                to_library_dir,
                to_version,
                cycle_existing_nodes=True,
            )

            # Check that the ledger can be parsed
            network.get_latest_ledger_public_state()
def test_update_all_nodes(network, args):
    replacement_package = get_replacement_package(args)

    primary, _ = network.find_nodes()

    first_code_id = infra.utils.get_code_id(
        args.enclave_type, args.oe_binary, args.package
    )
    new_code_id = infra.utils.get_code_id(
        args.enclave_type, args.oe_binary, replacement_package
    )

    if args.enclave_type == "virtual":
        # Pretend this was already present
        network.consortium.add_new_code(primary, first_code_id)

    LOG.info("Add new code id")
    network.consortium.add_new_code(primary, new_code_id)

    with primary.client() as uc:
        r = uc.get("/node/code")
        versions = sorted(r.body.json()["versions"], key=lambda x: x["digest"])
        expected = sorted(
            [
                {"digest": first_code_id, "status": "AllowedToJoin"},
                {"digest": new_code_id, "status": "AllowedToJoin"},
            ],
            key=lambda x: x["digest"],
        )
        assert versions == expected, versions

    LOG.info("Remove old code id")
    network.consortium.retire_code(primary, first_code_id)

    with primary.client() as uc:
        r = uc.get("/node/code")
        versions = sorted(r.body.json()["versions"], key=lambda x: x["digest"])
        expected = sorted(
            [
                {"digest": new_code_id, "status": "AllowedToJoin"},
            ],
            key=lambda x: x["digest"],
        )
        assert versions == expected, versions

    old_nodes = network.nodes.copy()

    LOG.info("Start fresh nodes running new code")
    for _ in range(0, len(old_nodes)):
        new_node = network.create_node("local://localhost")
        network.join_node(new_node, replacement_package, args)
        network.trust_node(new_node, args)

    LOG.info("Retire original nodes running old code")
    for node in old_nodes:
        primary, _ = network.find_nodes()
        network.retire_node(primary, node)
        # Elections take (much) longer than a backup removal, which is just
        # a commit, so we need to adjust our timeout accordingly, hence this branch
        if node.node_id == primary.node_id:
            new_primary, _ = network.wait_for_new_primary(primary)
            primary = new_primary
        node.stop()

    LOG.info("Check the network is still functional")
    check_can_progress(new_node)
    return network