async def test_inactive_window(self, bft_network):
    """
    The goal of this test is to verify full catch up of a Replica only from
    the Inactive Window.
    1) Start all Replicas without Replica 1, which will later catch up from
       the Primary's Inactive Window.
    2) Advance all Replicas to 1 sequence number beyond the first stable and
       verify they have all collected Stable Checkpoints.
    3) Start and isolate the late Replica 1 from all others except the
       Primary. This way it will not be able to start State Transfer and
       will only be able to catch up from the Primary's Inactive Window.
    4) Verify that Replica 1 has managed to catch up.
    """
    late_replica = 1
    bft_network.start_replicas(
        bft_network.all_replicas(without={late_replica}))
    skvbc = kvbc.SimpleKVBCProtocol(bft_network)

    stable_checkpoint_to_reach = 1
    # 1 Checkpoint (150 seq nums) + 1 extra request the late replica must execute.
    num_reqs_to_catch_up = 151

    async def write_req(num_req=1):
        # Issue num_req known key-value writes to advance the sequence numbers.
        for _ in range(num_req):
            await skvbc.write_known_kv()

    # Create a checkpoint and wait for checkpoint propagation.
    await skvbc.fill_and_wait_for_checkpoint(
        initial_nodes=bft_network.get_live_replicas(),
        num_of_checkpoints_to_add=stable_checkpoint_to_reach,
        verify_checkpoint_persistency=False)

    await bft_network.wait_for_replicas_to_collect_stable_checkpoint(
        bft_network.get_live_replicas(), stable_checkpoint_to_reach)

    with trio.fail_after(seconds=30):
        # Let the late replica communicate only with the Primary, so it
        # cannot trigger State Transfer and must use the Inactive Window.
        with net.ReplicaOneWayTwoSubsetsIsolatingAdversary(
                bft_network, {late_replica}, {6, 5, 4, 3, 2}) as adversary:
            adversary.interfere()
            bft_network.start_replica(late_replica)

            late_replica_catch_up = False
            while not late_replica_catch_up:
                for replica_id in bft_network.all_replicas():
                    last_stable = await bft_network.get_metric(
                        replica_id, bft_network, 'Gauges', "lastStableSeqNum")
                    last_exec = await bft_network.get_metric(
                        replica_id, bft_network, 'Gauges', "lastExecutedSeqNum")
                    log.log_message(
                        message_type=f"replica = {replica_id}; last_stable = {last_stable}; last_exec = {last_exec}")
                    if replica_id == late_replica and last_exec >= num_reqs_to_catch_up:
                        late_replica_catch_up = True

                # Keep generating traffic while polling so the system advances.
                await write_req()
                await trio.sleep(seconds=3)
async def test_view_change_with_isolated_replicas(self, bft_network, tracker):
    """
    Test View Changes with multiple View increments, where the isolated F-1
    expected next primaries will not be able to step in as primaries, but
    will activate the corresponding view for which it is their turn to
    become Primary.

    Step by step scenario:
    1. Use a one way isolating adversary to isolate the F-1 replicas after
       the current primary in such a way that they cannot send messages to
       the peers, but can receive messages from them.
    2. Stop the current primary.
    3. Send Client requests to trigger a View Change.
    4. Wait for the system to finish View Change. Note that multiple View
       increments will happen.
    5. Drop the network adversary and verify Fast Commit Path is recovered
       in the system by introducing client requests.

    We can perform this test in a loop multiple times.
    """
    # Start all replicas. A plain loop is used because start_replica is
    # called for its side effect only.
    for replica_id in bft_network.all_replicas():
        bft_network.start_replica(replica_id)
    skvbc = kvbc.SimpleKVBCProtocol(bft_network, tracker)

    loop_count = 0
    while loop_count < loops:
        loop_count += 1
        primary = await bft_network.get_current_primary()

        # The F-1 replicas immediately following the primary (wrapping
        # around n) are the expected next primaries - isolate them.
        replicas_to_isolate = [
            i % bft_network.config.n
            for i in range(primary + 1, primary + bft_network.config.f)]
        other_replicas = bft_network.all_replicas(without=set(replicas_to_isolate))
        view = await bft_network.get_current_view()

        with net.ReplicaOneWayTwoSubsetsIsolatingAdversary(
                bft_network, other_replicas, replicas_to_isolate) as adversary:
            adversary.interfere()
            bft_network.stop_replica(primary)
            # Client traffic makes the remaining replicas notice the dead
            # primary and start the View Change.
            await skvbc.run_concurrent_ops(10)
            # Multiple view increments happen because each isolated next
            # primary activates its view but cannot lead it.
            await bft_network.wait_for_replicas_to_reach_at_least_view(
                other_replicas,
                expected_view=view + bft_network.config.f,
                timeout=15 + timeouts)
            bft_network.start_replica(primary)

        # Adversary dropped - verify the Fast Commit Path recovers.
        await bft_network.wait_for_fast_path_to_be_prevalent(
            run_ops=lambda: skvbc.run_concurrent_ops(num_ops=20, write_weight=1),
            threshold=20)
async def test_inactive_window_catchup_up_to_gap(self, bft_network):
    """
    In this test we check the catchup from Inactive Window when we have a
    gap related to the Peers. The situation can happen if the catching up
    Replica's last Stable SeqNo is 3 Checkpoints behind its Peers, but its
    Last Executed is only 2 Checkpoints behind.
    Steps to recreate:
    1) Start all replicas.
    2) Isolate 1 Replica from all but the Primary. We will call it Late
       Replica.
    3) Advance all replicas beyond the first Stable Checkpoint. The Late
       Replica won't be able to collect a Stable Checkpoint.
    4) Stop the Late Replica and advance all others 2 more Checkpoints.
    5) Start the late Replica and verify it catches up to the end of its
       Working Window from the Inactive Windows of its Peers.
    """
    late_replica = 1
    primary = 0
    bft_network.start_all_replicas()
    skvbc = kvbc.SimpleKVBCProtocol(bft_network)

    first_stable_checkpoint_to_reach = 1
    checkpoints_to_advance_after_first = 2
    seq_nums_per_checkpoint = 150
    num_reqs_after_first_checkpoint = 4

    async def write_req(num_req=1):
        # Issue num_req key-value writes to advance the sequence numbers.
        for _ in range(num_req):
            await skvbc.send_write_kv_set()

    # The Late Replica can only talk to the Primary, so it cannot collect
    # a Stable Checkpoint while the others advance.
    with net.ReplicaOneWayTwoSubsetsIsolatingAdversary(
            bft_network,
            {late_replica},
            bft_network.all_replicas(without={primary, late_replica})) as adversary:
        adversary.interfere()

        # Create a checkpoint and wait for checkpoint propagation.
        await skvbc.fill_and_wait_for_checkpoint(
            initial_nodes=bft_network.all_replicas(without={late_replica}),
            num_of_checkpoints_to_add=first_stable_checkpoint_to_reach,
            verify_checkpoint_persistency=False)

        await bft_network.wait_for_replicas_to_collect_stable_checkpoint(
            bft_network.all_replicas(without={late_replica}),
            first_stable_checkpoint_to_reach)

        await write_req(num_reqs_after_first_checkpoint)

        # Wait for late_replica to reach num_reqs_after_first_checkpoint
        # past the 1-st Checkpoint.
        with trio.fail_after(seconds=30):
            while True:
                last_exec = await bft_network.get_metric(
                    late_replica, bft_network, 'Gauges', "lastExecutedSeqNum")
                log.log_message(message_type=f"replica = {late_replica}; last_exec = {last_exec}")
                if last_exec == seq_nums_per_checkpoint + num_reqs_after_first_checkpoint:
                    break
                await trio.sleep(seconds=0.3)

        bft_network.stop_replica(late_replica)

        # Create 2 checkpoints and wait for checkpoint propagation.
        await skvbc.fill_and_wait_for_checkpoint(
            initial_nodes=bft_network.all_replicas(without={late_replica}),
            num_of_checkpoints_to_add=checkpoints_to_advance_after_first,
            verify_checkpoint_persistency=False)

        await bft_network.wait_for_replicas_to_collect_stable_checkpoint(
            bft_network.all_replicas(without={late_replica}),
            first_stable_checkpoint_to_reach + checkpoints_to_advance_after_first)

        bft_network.start_replica(late_replica)

        with trio.fail_after(seconds=30):
            late_replica_catch_up = False
            while not late_replica_catch_up:
                for replica_id in bft_network.get_live_replicas():
                    last_stable = await bft_network.get_metric(
                        replica_id, bft_network, 'Gauges', "lastStableSeqNum")
                    last_exec = await bft_network.get_metric(
                        replica_id, bft_network, 'Gauges', "lastExecutedSeqNum")
                    log.log_message(
                        message_type=f"replica = {replica_id}; last_stable = {last_stable}; last_exec = {last_exec}")
                    # Caught up when the late replica reaches the end of its
                    # Working Window (2 checkpoints worth of seq nums).
                    if replica_id == late_replica and last_exec == 2 * seq_nums_per_checkpoint:
                        late_replica_catch_up = True

                await write_req()
                await trio.sleep(seconds=3)
async def test_view_change_with_f_replicas_collected_stable_checkpoint(self, bft_network):
    """
    The goal of this test is to leave the system with F Replicas that have
    collected a Stable Checkpoint and to cause a View Change. In this way
    we get a misalignment in the Restrictions of the previous View and we
    get in an indefinite View Change scenario.
    1) Start all Replicas.
    2) Move all Replicas to 1 SeqNo prior to the stable Checkpoint.
    3) Stop Replicas 1 and 2.
    4) Isolate Replica 3 from 6, 5 and 4 only in one direction - 3 will be
       able to send messages to all, but won't receive from 6, 5 and 4.
       This way 3 won't be able to collect a Stable Checkpoint.
       Do the same for 6, isolating in the same manner from 3, 4 and 5.
       Do the same for 4, isolating in the same manner from 3, 5 and 6.
       This way only 0 and 5 will collect a Stable Checkpoint for SeqNo 150.
    5) With the isolation scenario, send Client Requests until F replicas
       collect a Stable Checkpoint. Only Replicas 0 and 5 will collect.
    6) We stop Replicas 0, 5 and 6 and start 1 and 2. This way we will
       cause View Change and we will have only 2 Replicas with a Stable
       Checkpoint (5 and 0).
    7) Start Replicas 5 and 0. Within this state the system must be able to
       finalize a View Change, because we have (N - 1) live Replicas, but
       we have only F that have collected a Stable Checkpoint that are live.
    """
    # step 1
    bft_network.start_all_replicas()
    skvbc = kvbc.SimpleKVBCProtocol(bft_network)

    # One request short of the first Stable Checkpoint at SeqNo 150.
    num_reqs_before_first_stable = 149

    async def write_req(num_req=1):
        # Issue num_req key-value writes to advance the sequence numbers.
        for _ in range(num_req):
            await skvbc.send_write_kv_set()

    await write_req(num_reqs_before_first_stable)

    # step 2: poll until every replica has executed all the requests sent.
    while True:
        last_exec_seqs = []
        for replica_id in bft_network.all_replicas():
            last_stable = await bft_network.get_metric(
                replica_id, bft_network, 'Gauges', "lastStableSeqNum")
            last_exec = await bft_network.get_metric(
                replica_id, bft_network, 'Gauges', "lastExecutedSeqNum")
            log.log_message(
                message_type=f"replica = {replica_id}; last_stable = {last_stable}; last_exec = {last_exec}")
            last_exec_seqs.append(last_exec)

        if sum(x == num_reqs_before_first_stable for x in last_exec_seqs) == bft_network.config.n:
            break
        else:
            last_exec_seqs.clear()

    # step 3
    bft_network.stop_replica(1)
    bft_network.stop_replica(2)

    last_stable_seqs = []

    # step 4: one-way isolation so 3, 4 and 6 can send but cannot receive
    # enough Checkpoint messages to collect a Stable Checkpoint.
    with net.ReplicaOneWayTwoSubsetsIsolatingAdversary(bft_network, {3}, {6, 5, 4}) as adversary:
        adversary.add_rule({6}, {3, 4, 5})
        adversary.add_rule({4}, {3, 5, 6})
        adversary.interfere()

        while True:
            for replica_id in bft_network.get_live_replicas():
                last_stable = await bft_network.get_metric(
                    replica_id, bft_network, 'Gauges', "lastStableSeqNum")
                last_exec = await bft_network.get_metric(
                    replica_id, bft_network, 'Gauges', "lastExecutedSeqNum")
                log.log_message(
                    message_type=f"replica = {replica_id}; last_stable = {last_stable}; last_exec = {last_exec}")
                last_stable_seqs.append(last_stable)

            if sum(x == num_reqs_before_first_stable + 1 for x in last_stable_seqs) == bft_network.config.f:
                # step 5 completed
                break
            else:
                last_stable_seqs.clear()

            await write_req()
            await trio.sleep(seconds=3)

        # step 6
        bft_network.stop_replica(0)
        bft_network.stop_replica(6)
        bft_network.stop_replica(5)
        bft_network.start_replica(1)
        bft_network.start_replica(2)

        # Send a Client Request to trigger View Change.
        with trio.move_on_after(seconds=3):
            await write_req()

        bft_network.start_replica(5)
        bft_network.start_replica(0)

        # Send a Client Request to trigger View Change.
        with trio.move_on_after(seconds=3):
            await write_req()

        # step 7
        await bft_network.wait_for_view(
            replica_id=3,
            expected=lambda v: v == 1,
            err_msg="Make sure a view change happens from 0 to 1"
        )

        await skvbc.wait_for_liveness()