async def test_pruning_command(self, bft_network):
    """
    Write a batch of blocks, prune up to the latest pruneable blocks reported
    by the replicas, and check the pruned block id carried in each prune reply.
    """
    with log.start_action(action_type="test_pruning_command"):
        bft_network.start_all_replicas()
        skvbc = kvbc.SimpleKVBCProtocol(bft_network)
        client = bft_network.random_client()

        # Create 100 blocks in total, including the genesis block we have 101 blocks
        k, v = await skvbc.write_known_kv()
        for _ in range(99):
            v = skvbc.random_value()
            await client.write(skvbc.write_req([], [(k, v)], 0))

        # Get the minimal latest pruneable block among all replicas
        op = operator.Operator(bft_network.config, client,
                               bft_network.builddir)
        await op.latest_pruneable_block()
        latest_pruneable_blocks = [
            cmf_msgs.ReconfigurationResponse.deserialize(reply)[0].response
            for reply in client.get_rsi_replies().values()
        ]

        await op.prune(latest_pruneable_blocks)

        # we expect to have at least 2f + 1 replies
        for reply in client.get_rsi_replies().values():
            data = cmf_msgs.ReconfigurationResponse.deserialize(reply)[0]
            pruned_block = int(data.additional_data.decode('utf-8'))
            assert pruned_block <= 90
async def test_key_exchange_command(self, bft_network):
    """
    No initial key rotation.
    Sends a key exchange command to replica 0, runs 450 writes and then checks
    the key-exchange counters exposed by replicas 0 and 1.
    New keys for replica 0 should get effective at checkpoint 2, i.e. seqnum 300.

    NOTE(review): another test named `test_key_exchange_command` appears in
    this file; if both live in the same class the later definition shadows
    the earlier one - confirm and rename one of them if so.
    """
    bft_network.start_all_replicas()
    client = bft_network.random_client()
    skvbc = kvbc.SimpleKVBCProtocol(bft_network)
    op = operator.Operator(bft_network.config, client, bft_network.builddir)
    await op.key_exchange([0])

    for _ in range(450):
        await skvbc.write_known_kv()

    # All three counters live under the same metrics component/type.
    counters_path = ("KeyExchangeManager", "Counters")

    sent_key_exchange_counter = await bft_network.metrics.get(
        0, *counters_path, "sent_key_exchange")
    assert sent_key_exchange_counter == 1

    self_key_exchange_counter = await bft_network.metrics.get(
        0, *counters_path, "self_key_exchange")
    assert self_key_exchange_counter == 1

    public_key_exchange_for_peer_counter = await bft_network.metrics.get(
        1, *counters_path, "public_key_exchange_for_peer")
    assert public_key_exchange_for_peer_counter == 1
async def test_wedge_command_and_specific_replica_info(self, bft_network):
    """
    Sends a wedge command and checks that the system stops processing new requests.
    Note that in this test we assume no failures and synchronized network.
    The test does the following:
    1. A client sends a wedge command
    2. The client then sends a "Have you stopped" read only command such that each replica answers "I have stopped"
    3. The client validates with the metrics that all replicas have stopped
    """
    bft_network.start_all_replicas()
    skvbc = kvbc.SimpleKVBCProtocol(bft_network)
    client = bft_network.random_client()
    # We increase the default request timeout because we need to have around 300 consensuses which occasionally may take more than 5 seconds.
    # `_replace` returns a new object instead of mutating in place, so the result
    # must be stored back for the larger timeout to take effect
    # (assumes `client.config` is a plain assignable namedtuple attribute - TODO confirm).
    client.config = client.config._replace(req_timeout_milli=10000)

    op = operator.Operator(bft_network.config, client, bft_network.builddir)
    await op.wedge()

    # Poll the wedge status until no reply reports `stopped is False`.
    with trio.fail_after(seconds=90):
        done = False
        while not done:
            await op.wedge_status()
            rsi_rep = client.get_rsi_replies()
            done = all(
                cmf_msgs.ReconfigurationResponse.deserialize(r)[0].response.stopped is not False
                for r in rsi_rep.values())

    await self.validate_stop_on_super_stable_checkpoint(bft_network, skvbc)
async def test_key_exchange_command(self, bft_network):
    """
    Start all replicas and send a key exchange command through the operator.

    The test only checks that the command completes; it makes no assertions
    on the outcome.

    NOTE(review): another test named `test_key_exchange_command` appears in
    this file; if both live in the same class the later definition shadows
    the earlier one - confirm and rename one of them if so.
    """
    bft_network.start_all_replicas()
    client = bft_network.random_client()
    # NOTE(review): `skvbc` is not used below; kept in case constructing the
    # protocol object has side effects - confirm and drop if not.
    skvbc = kvbc.SimpleKVBCProtocol(bft_network)
    # We increase the default request timeout because we need to have around 300 consensuses which occasionally may take more than 5 seconds.
    # `_replace` returns a new object instead of mutating in place, so the result
    # must be stored back for the larger timeout to take effect
    # (assumes `client.config` is a plain assignable namedtuple attribute - TODO confirm).
    client.config = client.config._replace(req_timeout_milli=10000)
    op = operator.Operator(bft_network.config, client, bft_network.builddir)
    await op.key_exchange()
async def test_state_transfer_rvt_validity_after_pruning(self, bft_network):
    """
    Validate that all replicas keep their Range validation trees (RVTs)
    synchronized after running the consensus and then pruning.

    1) Given a BFT network start N - 1 replicas (leaving one stale)
    2) Send enough requests to trigger 10 checkpoints
    3) Start the stale replica
    4) Enter state transfer to bring back the stale node up-to-date
    5) Wait for state transfer to be finished
    6) Wait for the RVT root values to be in sync
    7) Prune
    8) Wait for two more checkpoints so that the RVT is updated to reflect the changes after the pruning
    9) Wait for the RVT root values to be in sync
    """
    skvbc = kvbc.SimpleKVBCProtocol(bft_network)

    # Replica 0 is excluded from the candidates for the stale node.
    candidates = bft_network.all_replicas(without={0})
    stale_node = random.choice(candidates)

    await skvbc.prime_for_state_transfer(
        stale_nodes={stale_node},
        checkpoints_num=10,  # key-exchange changes the last executed seqnum
        persistency_enabled=False)

    bft_network.start_replica(stale_node)
    await bft_network.wait_for_state_transfer_to_start()
    await bft_network.wait_for_state_transfer_to_stop(0, stale_node)

    # Wait for the RVT root values to be in sync before the pruning
    await bft_network.wait_for_replicas_rvt_root_values_to_be_in_sync(
        bft_network.all_replicas())

    # Get the minimal latest pruneable block among all replicas
    client = bft_network.random_client()
    op = operator.Operator(bft_network.config, client, bft_network.builddir)
    await op.latest_pruneable_block()
    latest_pruneable_blocks = [
        cmf_msgs.ReconfigurationResponse.deserialize(reply)[0].response
        for reply in client.get_rsi_replies().values()
    ]

    await op.prune(latest_pruneable_blocks)

    # Wait for two checkpoints so that the RVT is updated to reflect the changes after the pruning
    await skvbc.fill_and_wait_for_checkpoint(
        bft_network.all_replicas(),
        num_of_checkpoints_to_add=2,
        verify_checkpoint_persistency=False,
        assert_state_transfer_not_started=False)

    # Validate that the RVT root values are in sync after the pruning has finished
    await bft_network.wait_for_replicas_rvt_root_values_to_be_in_sync(
        bft_network.all_replicas())
async def test_wedge_command_where_noops_should_be_sent_in_two_parts(self, bft_network):
    """
    Sends a wedge command on sequence number 300 and checks that the system stops processing new requests.
    This way, when the primary tries to send noop commands, the working window reaches only 450.
    Thus, it has to wait for a new stable checkpoint before sending the last 150 noops.
    Note: In this test we assume that the batch duration is no
    (NOTE(review): the sentence above is truncated in the original source - complete it.)
    """
    bft_network.start_all_replicas()
    skvbc = kvbc.SimpleKVBCProtocol(bft_network)
    client = bft_network.random_client()
    # We increase the default request timeout because we need to have around 300 consensuses which occasionally may take more than 5 seconds.
    # `_replace` returns a new object instead of mutating in place, so the result
    # must be stored back for the larger timeout to take effect
    # (assumes `client.config` is a plain assignable namedtuple attribute - TODO confirm).
    client.config = client.config._replace(req_timeout_milli=10000)

    # bring the system to sequence number 299
    for _ in range(299):
        await skvbc.write_known_kv()

    # verify that all nodes are in sequence number 299
    not_reached = True
    with trio.fail_after(seconds=30):
        while not_reached:
            not_reached = False
            for r in bft_network.all_replicas():
                lastExecSeqNum = await bft_network.get_metric(
                    r, bft_network, "Gauges", "lastExecutedSeqNum")
                if lastExecSeqNum != 299:
                    # At least one replica is not there yet: poll all of them again.
                    not_reached = True
                    break

    # now, send a wedge command. The wedge command sequence number is 300. Hence, at this point the working window
    # is between 150 - 450. But, the wedge command will make the primary send noops until 600.
    # we want to verify that the primary manages to send the noops as required.
    op = operator.Operator(bft_network.config, client, bft_network.builddir)
    await op.wedge()

    # now, verify that the system has managed to stop:
    # poll the wedge status until no reply reports `stopped is False`.
    with trio.fail_after(seconds=90):
        done = False
        while not done:
            await op.wedge_status()
            rsi_rep = client.get_rsi_replies()
            done = all(
                cmf_msgs.ReconfigurationResponse.deserialize(r)[0].response.stopped is not False
                for r in rsi_rep.values())

    await self.verify_replicas_are_in_wedged_checkpoint(
        bft_network, 2, range(bft_network.config.n))
    await self.verify_last_executed_seq_num(bft_network, 2)
    await self.validate_stop_on_super_stable_checkpoint(bft_network, skvbc)
async def test_wedge_command_with_state_transfer(self, bft_network):
    """
    This test checks that even a replica that received the super stable checkpoint via the state transfer mechanism
    is able to stop at the super stable checkpoint.
    The test does the following:
    1. Start all replicas but 1
    2. A client sends a wedge command
    3. Validate that all started replicas reached to the next next checkpoint
    4. Start the late replica
    5. Validate that the late replica completed the state transfer
    6. Validate that all replicas stopped at the super stable checkpoint and that new commands are not being processed
    """
    initial_prim = 0
    # The initial primary is never picked as the late replica.
    late_replicas = bft_network.random_set_of_replicas(1, {initial_prim})
    on_time_replicas = bft_network.all_replicas(without=late_replicas)
    bft_network.start_replicas(on_time_replicas)

    skvbc = kvbc.SimpleKVBCProtocol(bft_network)
    await skvbc.wait_for_liveness()

    checkpoint_before = await bft_network.wait_for_checkpoint(replica_id=0)

    client = bft_network.random_client()
    # We increase the default request timeout because we need to have around 300 consensuses which occasionally may take more than 5 seconds.
    # `_replace` returns a new object instead of mutating in place, so the result
    # must be stored back for the larger timeout to take effect
    # (assumes `client.config` is a plain assignable namedtuple attribute - TODO confirm).
    client.config = client.config._replace(req_timeout_milli=10000)

    with log.start_action(action_type="send_wedge_cmd",
                          checkpoint_before=checkpoint_before,
                          late_replicas=list(late_replicas)):
        op = operator.Operator(bft_network.config, client,
                               bft_network.builddir)
        await op.wedge()

    await self.verify_replicas_are_in_wedged_checkpoint(
        bft_network, checkpoint_before, on_time_replicas)

    bft_network.start_replicas(late_replicas)
    await bft_network.wait_for_state_transfer_to_start()
    for r in late_replicas:
        await bft_network.wait_for_state_transfer_to_stop(
            initial_prim, r, stop_on_stable_seq_num=False)

    await self.verify_replicas_are_in_wedged_checkpoint(
        bft_network, checkpoint_before, range(bft_network.config.n))
    await self.validate_stop_on_super_stable_checkpoint(bft_network, skvbc)
async def test_pruning_status_command(self, bft_network):
    """
    Check the prune status replies before any pruning (not in progress, last
    pruned block 0) and again after writing blocks and pruning (not in
    progress, last pruned block at most 90).
    """
    bft_network.start_all_replicas()
    skvbc = kvbc.SimpleKVBCProtocol(bft_network)
    client = bft_network.random_client()
    op = operator.Operator(bft_network.config, client, bft_network.builddir)

    # Status before any pruning took place.
    await op.prune_status()
    for reply in client.get_rsi_replies().values():
        status = cmf_msgs.ReconfigurationResponse.deserialize(reply)[0]
        assert status.response.in_progress is False
        assert status.response.last_pruned_block == 0

    # Create 100 blocks in total, including the genesis block we have 101 blocks
    k, v = await skvbc.write_known_kv()
    for _ in range(99):
        v = skvbc.random_value()
        await client.write(skvbc.write_req([], [(k, v)], 0))

    # Get the minimal latest pruneable block among all replicas
    await op.latest_pruneable_block()
    latest_pruneable_blocks = [
        cmf_msgs.ReconfigurationResponse.deserialize(reply)[0].response
        for reply in client.get_rsi_replies().values()
    ]

    await op.prune(latest_pruneable_blocks)

    # Verify the system is able to get new write requests (which means that pruning has done)
    with trio.fail_after(30):
        await skvbc.write_known_kv()

    # Status after the pruning.
    await op.prune_status()
    for reply in client.get_rsi_replies().values():
        status = cmf_msgs.ReconfigurationResponse.deserialize(reply)[0]
        assert status.response.in_progress is False
        assert status.response.last_pruned_block <= 90
async def test_wedge_command(self, bft_network):
    """
    Sends a wedge command and checks that the system stops processing new requests.
    Note that in this test we assume no failures and synchronized network.
    The test does the following:
    1. A client sends a wedge command
    2. The client verifies that the system reached a super stable checkpoint.
    3. The client tries to initiate a new write bft command and fails
    """
    bft_network.start_all_replicas()
    skvbc = kvbc.SimpleKVBCProtocol(bft_network)
    client = bft_network.random_client()
    # We increase the default request timeout because we need to have around 300 consensuses which occasionally may take more than 5 seconds.
    # `_replace` returns a new object instead of mutating in place, so the result
    # must be stored back for the larger timeout to take effect
    # (assumes `client.config` is a plain assignable namedtuple attribute - TODO confirm).
    client.config = client.config._replace(req_timeout_milli=10000)

    # Checkpoint of replica 0 before the wedge; the verifications below are relative to it.
    checkpoint_before = await bft_network.wait_for_checkpoint(replica_id=0)

    op = operator.Operator(bft_network.config, client, bft_network.builddir)
    await op.wedge()

    await self.verify_replicas_are_in_wedged_checkpoint(
        bft_network, checkpoint_before, range(bft_network.config.n))
    await self.verify_last_executed_seq_num(bft_network, checkpoint_before)
    await self.validate_stop_on_super_stable_checkpoint(bft_network, skvbc)
async def test_pruning_with_ro_replica_failure(self, bft_network):
    """
    Check that a prune request fails when the read-only replica's latest
    pruneable block is missing from the request.

    The test does the following:
    1. Start all replicas plus the read-only replica (id == config.n)
    2. Write 201 key-values
    3. Wait for the read-only replica via `_wait_for_st`
    4. Collect the latest pruneable blocks and drop every entry whose
       `replica` id is >= config.n
    5. Send the prune request and expect an unsuccessful response
    """
    bft_network.start_all_replicas()
    ro_replica_id = bft_network.config.n
    bft_network.start_replica(ro_replica_id)
    skvbc = kvbc.SimpleKVBCProtocol(bft_network)
    client = bft_network.random_client()
    op = operator.Operator(bft_network.config, client, bft_network.builddir)

    # Create more than 150 blocks (1 + 200 writes)
    k, v = await skvbc.write_known_kv()
    for _ in range(200):
        v = skvbc.random_value()
        await client.write(skvbc.write_req([], [(k, v)], 0))

    # Wait for the read only replica to catch with the state
    await self._wait_for_st(bft_network, ro_replica_id, 150)

    # Get the minimal latest pruneable block among all replicas
    await op.latest_pruneable_block()
    latest_pruneable_blocks = [
        cmf_msgs.ReconfigurationResponse.deserialize(r)[0].response
        for r in client.get_rsi_replies().values()
    ]

    # Remove the read only latest pruneable block from the list.
    # Build a filtered list instead of calling `.remove()` while iterating:
    # removing during iteration skips the element that follows each removal.
    latest_pruneable_blocks = [
        m for m in latest_pruneable_blocks
        if m.replica < bft_network.config.n
    ]

    assert len(latest_pruneable_blocks) == bft_network.config.n

    # Now, issue a prune request. We expect to receive an error as the read only latest pruneable block is missing
    rep = await op.prune(latest_pruneable_blocks)
    rep = cmf_msgs.ReconfigurationResponse.deserialize(rep)[0]
    assert rep.success is False
async def test_get_latest_pruneable_block(self, bft_network):
    """
    Check that the minimal latest pruneable block reported by the replicas
    grows after another batch of blocks is written.
    """
    bft_network.start_all_replicas()
    skvbc = kvbc.SimpleKVBCProtocol(bft_network)
    client = bft_network.random_client()

    # Create 100 blocks in total, including the genesis block we have 101 blocks
    k, v = await skvbc.write_known_kv()
    for _ in range(99):
        v = skvbc.random_value()
        await client.write(skvbc.write_req([], [(k, v)], 0))

    # Get the minimal latest pruneable block among all replicas
    # (the minimum is capped at 1000, which is also the result when there are no replies)
    op = operator.Operator(bft_network.config, client, bft_network.builddir)
    await op.latest_pruneable_block()
    first_round_ids = [
        cmf_msgs.ReconfigurationResponse.deserialize(reply)[0].response.block_id
        for reply in client.get_rsi_replies().values()
    ]
    min_prunebale_block = min([1000] + first_round_ids)

    # Create another 100 blocks
    k, v = await skvbc.write_known_kv()
    for _ in range(99):
        v = skvbc.random_value()
        await client.write(skvbc.write_req([], [(k, v)], 0))

    # Get the new minimal latest pruneable block
    await op.latest_pruneable_block()
    second_round_ids = [
        cmf_msgs.ReconfigurationResponse.deserialize(reply)[0].response.block_id
        for reply in client.get_rsi_replies().values()
    ]
    min_prunebale_block_b = min([1000] + second_round_ids)

    assert min_prunebale_block < min_prunebale_block_b
async def test_state_transfer_rvt_root_validation_after_adding_blocks(
        self, bft_network):
    """
    The goal of this test is to validate that all replicas have their Range validation trees (RVTs) synchronized
    after running the consensus multiple times while there are random restarts and prunings.

    1) Start all replicas in a given BFT network
    2) Loop 6 times:
    3) Send enough requests to trigger 2 checkpoints
    4) After the 1st iteration, do pruning at each even iteration (i.e. i % 2 == 0)
    5) Randomly choose if a replica will be restarted. If so, select a random replica and restart it
    6) Wait for the RVT root values to be in sync
    """
    # NOTE(review): the nesting below was reconstructed from a source whose
    # indentation was lost; in particular the placement of the restart step
    # (per iteration, per the docstring) and of the trailing sleep should be
    # confirmed against the original file.
    for i in bft_network.all_replicas():
        bft_network.start_replica(i)

    skvbc = kvbc.SimpleKVBCProtocol(bft_network)
    client = bft_network.random_client()
    op = operator.Operator(bft_network.config, client, bft_network.builddir)

    for i in range(6):
        print(f'Iteration {i}')
        await skvbc.fill_and_wait_for_checkpoint(
            bft_network.all_replicas(),
            num_of_checkpoints_to_add=2,
            verify_checkpoint_persistency=False,
            assert_state_transfer_not_started=False)

        # Prune on iterations 2 and 4 only.
        if i > 0 and i % 2 == 0:
            # Collect the latest pruneable block reported by each reply.
            await op.latest_pruneable_block()
            latest_pruneable_blocks = []
            rsi_rep = client.get_rsi_replies()
            for r in rsi_rep.values():
                lpab = cmf_msgs.ReconfigurationResponse.deserialize(r)[0]
                latest_pruneable_blocks += [lpab.response]

            print('Pruning...')
            await op.prune(latest_pruneable_blocks)

            # Poll the prune status until config.n replies report that pruning
            # is not in progress and that some block has been pruned.
            with trio.fail_after(seconds=30):
                while True:
                    num_replies = 0
                    await op.prune_status()
                    rsi_rep = client.get_rsi_replies()
                    for r in rsi_rep.values():
                        status = cmf_msgs.ReconfigurationResponse.deserialize(
                            r)[0]
                        last_prune_blockid = status.response.last_pruned_block
                        log.log_message(
                            message_type=
                            f"last_prune_blockid {last_prune_blockid}, status.response.sender {status.response.sender}"
                        )
                        if status.response.in_progress is False and last_prune_blockid > 0:
                            num_replies += 1
                    if num_replies == bft_network.config.n:
                        break
            print('Done pruning.')

        # Coin flip: restart one non-zero replica with probability 1/2.
        restart = random.choice([0, 1])
        if restart == 1:
            print(
                'Selecting a random replica to be restarted (the primary is excluded)...'
            )
            replica_to_restart = random.choice(
                bft_network.all_replicas(without={0}))
            print(f'Replica {replica_to_restart} will be restarted.')
            bft_network.stop_replica(replica_to_restart, True)
            bft_network.start_replica(replica_to_restart)
            await trio.sleep(seconds=1)

    # Validate that the RVT root values are in sync after all of the prunings and restarts have finished
    await bft_network.wait_for_replicas_rvt_root_values_to_be_in_sync(
        bft_network.all_replicas())
async def test_wedge_command_with_f_failures(self, bft_network):
    """
    This test checks that the system can be wedged while two replicas are down, and that it keeps making
    progress after the live replicas are restarted and after the late replicas catch up via state transfer.
    The test does the following:
    1. Start all replicas but 2
    2. A client sends a wedge command
    3. Validate that all started replicas have reached the wedge point
    4. Restart the live replicas and validate the system is able to make progress
    5. Start the late replicas
    6. Validate that the late replicas completed the state transfer
    7. Join the late replicas to the quorum and make sure the system is able to make progress
    """
    initial_prim = 0
    # The initial primary is never picked as a late replica.
    late_replicas = bft_network.random_set_of_replicas(2, {initial_prim})
    on_time_replicas = bft_network.all_replicas(without=late_replicas)
    bft_network.start_replicas(on_time_replicas)

    skvbc = kvbc.SimpleKVBCProtocol(bft_network)
    await skvbc.wait_for_liveness()

    checkpoint_before = await bft_network.wait_for_checkpoint(replica_id=0)

    client = bft_network.random_client()
    # We increase the default request timeout because we need to have around 300 consensuses which occasionally may take more than 5 seconds.
    # `_replace` returns a new object instead of mutating in place, so the result
    # must be stored back for the larger timeout to take effect
    # (assumes `client.config` is a plain assignable namedtuple attribute - TODO confirm).
    client.config = client.config._replace(req_timeout_milli=10000)

    with log.start_action(action_type="send_wedge_cmd",
                          checkpoint_before=checkpoint_before,
                          late_replicas=list(late_replicas)):
        op = operator.Operator(bft_network.config, client,
                               bft_network.builddir)
        await op.wedge()

    # Poll only the on-time replicas (all of them must answer) until no reply
    # reports `stopped is False`.
    with trio.fail_after(seconds=60):
        done = False
        while not done:
            await op.wedge_status(
                quorum=bft_client.MofNQuorum(on_time_replicas,
                                             len(on_time_replicas)),
                fullWedge=False)
            rsi_rep = client.get_rsi_replies()
            done = all(
                cmf_msgs.ReconfigurationResponse.deserialize(r)[0].response.stopped is not False
                for r in rsi_rep.values())

    # Make sure the system is able to make progress
    bft_network.stop_replicas(on_time_replicas)
    bft_network.start_replicas(on_time_replicas)
    for _ in range(100):
        await skvbc.write_known_kv()

    # Start late replicas and wait for state transfer to stop
    bft_network.start_replicas(late_replicas)
    await bft_network.wait_for_state_transfer_to_start()
    for r in late_replicas:
        await bft_network.wait_for_state_transfer_to_stop(
            initial_prim, r, stop_on_stable_seq_num=True)

    # NOTE(review): `replicas_to_stop` is computed but never used, so step 7 of
    # the docstring (forcing the late replicas into the quorum) is not actually
    # exercised. Presumably these replicas were meant to be stopped here -
    # confirm the intent before adding the stop.
    replicas_to_stop = bft_network.random_set_of_replicas(
        2, late_replicas | {initial_prim})

    # Make sure the system is able to make progress
    for _ in range(100):
        await skvbc.write_known_kv()