async def _wait_for_view_under_constant_load(
        self, replica_id, bft_network, expected=None,
        err_msg="Expected view not reached"):
    """
    Wait for a view to be agreed upon and then activated.

    Similar to the wait_for_view method, except it allows for consecutive
    unexpected view changes while waiting for the view to become active.

    :param replica_id: replica used when waiting for the agreed view.
    :param bft_network: network object the view queries are issued against.
    :param expected: predicate over a view number; when omitted, every view
        is accepted.
    :param err_msg: prefix of the assertion message used on timeout.
    :return: the view number returned by the initial agreed-view wait.
    :raises AssertionError: when trio.TooSlowError is raised while waiting.
    """
    accept_view = expected if expected is not None else (lambda _: True)

    # Both values are reported in the failure message, so they are
    # initialised before anything can time out.
    matching_view = None
    nb_replicas_in_matching_view = 0
    try:
        matching_view = await bft_network._wait_for_matching_agreed_view(
            replica_id, accept_view)
        log.log_message(
            message_type=f'Matching view #{matching_view} has been agreed among replicas.')

        nb_replicas_in_matching_view = \
            await self._wait_for_active_view_under_constant_load(
                matching_view, bft_network, replica_id, accept_view)
        activation_note = (
            f'View #{matching_view} has been activated by '
            f'{nb_replicas_in_matching_view} >= n-f = {bft_network.config.n - bft_network.config.f}')
        log.log_message(message_type=activation_note)

        return matching_view
    except trio.TooSlowError:
        timeout_details = (f'(matchingView={matching_view} '
                           f'replicasInMatchingView={nb_replicas_in_matching_view})')
        assert False, err_msg + timeout_details
async def _restart_stale_until_non_primary_chosen_as_source(
        self, bft_network, primary, stale, non_primary_replicas):
    """
    Restart the stale replica until it reports fetching state from one of
    `non_primary_replicas`, giving up after 10 seconds.

    Each attempt starts the stale replica, waits for the replica it is
    fetching from, and stops the stale replica again.

    :return: the last source replica id that was observed. When the time
        budget runs out this may be a replica outside `non_primary_replicas`.
    """
    source_replica_id = inf  # sentinel: no fetching source observed yet

    log.log_message(message_type=f'Restarting stale replica until '
                                 f'it fetches from {non_primary_replicas}...')

    with trio.move_on_after(10):  # seconds
        while True:
            bft_network.start_replica(stale)
            source_replica_id = await bft_network.wait_for_fetching_state(
                replica_id=stale)
            bft_network.stop_replica(stale)

            if source_replica_id not in non_primary_replicas:
                continue

            self.assertTrue(expr=source_replica_id != primary,
                            msg="The source must NOT be the primary "
                                "(to avoid triggering a view change)")
            log.log_message(
                message_type=f'Stale replica fetching from {source_replica_id}')
            break

    self.assertTrue(source_replica_id != inf,
                    msg="Stale replica is not fetching right now.")
    return source_replica_id
async def _wait_for_active_view_under_constant_load(
        view, bft_network, replica_id, expected, fail_after_time=30):
    """
    Wait for the latest matching view to become active on enough (n-f)
    replicas.

    If a newer matching view is agreed while waiting, the wait switches to
    that view and the timeout is extended by 30 seconds.

    :param view: view number initially expected to become active.
    :param bft_network: network object used to count replicas per view.
    :param replica_id: replica used when re-checking the agreed view.
    :param expected: predicate passed to the agreed-view wait.
    :param fail_after_time: initial timeout in seconds.
    :return: the number of replicas counted in the awaited view.
    :raises trio.TooSlowError: when the (possibly extended) timeout expires.
    """
    with trio.fail_after(seconds=fail_after_time) as timeout_scope:
        while True:
            nb_replicas_in_view = await bft_network._count_replicas_in_view(view)

            # wait for n-f = 2f+2c+1 replicas to be in the expected view
            if nb_replicas_in_view >= 2 * bft_network.config.f + 2 * bft_network.config.c + 1:
                break

            # if matching_view updates due to unexpected view change, wait
            # for the latest matching_view to become active
            matching_view = await bft_network._wait_for_matching_agreed_view(
                replica_id, expected)
            if matching_view > view:
                log.log_message(
                    message_type=f'Updated matching view #{matching_view} has been agreed among replicas.')
                view = matching_view
                # The deadline of an already-entered scope is fixed when the
                # scope is created, so re-assigning `fail_after_time` would
                # have no effect; the scope's deadline is moved instead.
                timeout_scope.deadline += 30

    return nb_replicas_in_view
async def _run_state_transfer_while_crashing_non_primary(
        self, bft_network, primary, stale, non_primary_replicas):
    """
    Run state transfer on the stale replica while its chosen non-primary
    source replica is stopped.

    If no non-primary source was chosen, only wait for state transfer to
    stop. Otherwise stop the source, restart the stale replica, wait for
    state transfer to stop and finally start the source again.
    """
    source_replica_id = \
        await self._restart_stale_until_non_primary_chosen_as_source(
            bft_network, primary, stale, non_primary_replicas)

    if source_replica_id not in non_primary_replicas:
        log.log_message(
            message_type="No source replica set in stale node, checking "
                         "if state transfer has already completed...")
        await bft_network.wait_for_state_transfer_to_stop(
            up_to_date_node=primary, stale_node=stale)
        log.log_message(
            message_type="State transfer completed before we had a chance "
                         "to stop the source replica.")
        return

    log.log_message(
        message_type=f'Stopping source replica {source_replica_id}')
    bft_network.stop_replica(source_replica_id)

    log.log_message(
        message_type=f'Re-starting stale replica {stale} to start state transfer')
    bft_network.start_replica(stale)

    await bft_network.wait_for_state_transfer_to_stop(
        up_to_date_node=primary, stale_node=stale)

    log.log_message(
        message_type=f'State transfer completed, despite initial source '
                     f'replica {source_replica_id} being down')

    bft_network.start_replica(source_replica_id)
async def _wait_for_st(self, bft_network, ro_replica_id, seqnum_threshold=150):
    """
    Poll the 'lastExecutedSeqNum' gauge of `ro_replica_id` until it reaches
    `seqnum_threshold`.

    Each poll is limited to half a second and the whole wait to 70 seconds.

    :raises trio.TooSlowError: when the threshold is not reached in time.
    """
    # TODO replace the below function with the library function:
    # await tracker.skvbc.tracked_fill_and_wait_for_checkpoint(
    # initial_nodes=bft_network.all_replicas(),
    # num_of_checkpoints_to_add=1)
    gauge_path = ['replica', 'Gauges', 'lastExecutedSeqNum']
    with trio.fail_after(seconds=70):
        # the ro replica should be able to survive these failures
        while True:
            with trio.move_on_after(seconds=.5):
                try:
                    lastExecutedSeqNum = await bft_network.metrics.get(
                        ro_replica_id, *gauge_path)
                except KeyError:
                    # gauge not published yet: poll again
                    continue
                if lastExecutedSeqNum < seqnum_threshold:
                    continue
                # success!
                log.log_message(message_type="Replica" + str(ro_replica_id) +
                                " : lastExecutedSeqNum:" + str(lastExecutedSeqNum))
                return
def stop_blinking(self):
    """
    Terminate the blinker process, if there is one, and wait for it to exit.

    :raises Exception: when the terminated process exits with a non-zero
        status.
    """
    if not self.blinker_process:
        return

    self.blinker_process.terminate()
    exit_status = self.blinker_process.wait()
    if exit_status != 0:
        raise Exception(
            "Error occured while while stopping the blinker process")
    log.log_message(message_type="Stopped blinking")
async def _fill_and_wait_for_checkpoint_under_constant_load(
        skvbc, bft_network, initial_nodes,
        num_of_checkpoints_to_add=2,
        verify_checkpoint_persistency=True,
        assert_state_transfer_not_started=True):
    """
    Write data and wait until `initial_nodes` reach a newer checkpoint.

    Similar to fill_and_wait_for_checkpoint, except that under constant load
    additional checkpoints may be created, so the resulting checkpoint is
    not necessarily checkpoint_before + num_of_checkpoints_to_add. Any
    checkpoint number greater than checkpoint_before is therefore accepted.

    Unlike fill_and_wait_for_checkpoint, checkpoint_before is read from the
    current primary instead of a random replica, because under constant load
    a randomly chosen replica may be behind.
    """
    client = kvbc.SkvbcClient(bft_network.random_client())
    current_primary = await bft_network.get_current_primary()
    checkpoint_before = await bft_network.wait_for_checkpoint(current_primary)
    log.log_message(message_type=f"expected_checkpoint_num should be > {checkpoint_before}")

    def is_newer_checkpoint(ecn):
        return ecn > checkpoint_before

    # Write enough data to checkpoint and create a need for state transfer
    writes_to_send = 1 + num_of_checkpoints_to_add * 150
    for _ in range(writes_to_send):
        key = skvbc.random_key()
        val = skvbc.random_value()
        reply = await client.write([], [(key, val)])
        assert reply.success

    await skvbc.network_wait_for_checkpoint(
        initial_nodes,
        expected_checkpoint_num=is_newer_checkpoint,
        verify_checkpoint_persistency=verify_checkpoint_persistency,
        assert_state_transfer_not_started=assert_state_transfer_not_started)
async def _trigger_view_change(self, skvbc):
    """
    Run skvbc.send_indefinite_write_requests in a nursery for one second,
    with the aim of triggering a view change.
    """
    log.log_message(
        message_type="Sending random transactions to trigger view change...")

    write_window = trio.move_on_after(1)  # seconds
    with write_window:
        async with trio.open_nursery() as nursery:
            nursery.start_soon(skvbc.send_indefinite_write_requests)
async def test_ro_replica_start_simultaneously(self, bft_network, tracker):
    """
    Start up N of N regular replicas.
    Start read-only replica.
    Send client commands.
    Wait for State Transfer in ReadOnlyReplica to complete.
    """
    bft_network.start_all_replicas()

    # start the read-only replica
    ro_replica_id = bft_network.config.n
    bft_network.start_replica(ro_replica_id)

    # TODO replace the below function with the library function:
    # await tracker.skvbc.tracked_fill_and_wait_for_checkpoint(
    # initial_nodes=bft_network.all_replicas(),
    # num_of_checkpoints_to_add=1)
    gauge_path = ['replica', 'Gauges', 'lastExecutedSeqNum']
    with trio.fail_after(seconds=60):
        async with trio.open_nursery() as nursery:
            skvbc = kvbc.SimpleKVBCProtocol(bft_network, tracker)
            nursery.start_soon(skvbc.send_indefinite_ops, .7, .1)
            while True:
                with trio.move_on_after(seconds=.5):
                    try:
                        lastExecutedSeqNum = await bft_network.metrics.get(
                            ro_replica_id, *gauge_path)
                    except KeyError:
                        continue
                    if lastExecutedSeqNum < 150:
                        continue
                    # success!
                    log.log_message(message_type="Replica" + str(ro_replica_id) +
                                    " : lastExecutedSeqNum:" + str(lastExecutedSeqNum))
                    # The loop is not left explicitly: cancelling the
                    # nursery scope is what ends the polling.
                    nursery.cancel_scope.cancel()
async def test_isolate_non_primaries_subset_with_view_change(
        self, bft_network, tracker):
    """
    Isolate f-1 replicas from the rest of the BFT network, then crash the
    primary and trigger a view change while they are still isolated, so that
    f replicas are unavailable in total.

    Once the adversary is deactivated, make sure the previously isolated
    replicas activate the new view and correctly process incoming client
    requests.
    """
    bft_network.start_all_replicas()

    f = bft_network.config.f
    initial_primary = await bft_network.get_current_primary()
    expected_next_primary = 1 + initial_primary

    isolated_replicas = bft_network.random_set_of_replicas(
        f - 1, without={initial_primary, expected_next_primary})
    # replicas that cannot be used as a healthy reference point
    unavailable_replicas = {initial_primary}.union(isolated_replicas)

    def in_next_view(v):
        return v == expected_next_primary

    log.log_message(
        message_type=f'Isolating network traffic to/from replicas {isolated_replicas}.')

    with net.ReplicaSubsetIsolatingAdversary(
            bft_network, isolated_replicas) as adversary:
        adversary.interfere()

        bft_network.stop_replica(initial_primary)
        await self._send_random_writes(tracker)

        view_witness = random.choice(
            bft_network.all_replicas(without=unavailable_replicas))
        await bft_network.wait_for_view(
            replica_id=view_witness,
            expected=in_next_view,
            err_msg="Make sure view change has been triggered.")

        # waiting for the active window to be rebuilt after the view change
        await trio.sleep(seconds=5)

    # the adversary is not active anymore:
    # make sure the isolated replicas activate the new view
    for ir in isolated_replicas:
        await bft_network.wait_for_view(
            replica_id=ir,
            expected=in_next_view,
            err_msg=f"Make sure isolated replica #{ir} works in new view {expected_next_primary}.")

    # then make sure the isolated replicas participate in consensus & request execution
    await tracker.run_concurrent_ops(num_ops=50)

    seq_num_witness = random.choice(
        bft_network.all_replicas(without=unavailable_replicas))
    expected_last_executed_seq_num = \
        await bft_network.wait_for_last_executed_seq_num(
            replica_id=seq_num_witness)

    for ir in isolated_replicas:
        await bft_network.wait_for_last_executed_seq_num(
            replica_id=ir, expected=expected_last_executed_seq_num)
async def run_concurrent_conflict_ops(self, num_ops, write_weight=.70):
    """
    Send `num_ops` concurrent operations that are allowed to conflict.

    Concurrency is capped at half the number of clients and the operation
    size at half the number of keys. When the tracker has no_conflicts set
    to True, the call is forwarded to run_concurrent_ops instead.

    :return: whatever the delegated call returns.
    """
    if self.tracker.no_conflicts is not True:
        client_count = len(self.bft_network.clients)
        key_count = len(self.keys)
        return await self.send_concurrent_ops(
            num_ops, client_count // 2, key_count // 2, write_weight,
            create_conflicts=True)

    log.log_message(message_type="call to run_concurrent_conflict_ops with no_conflicts=True,"
                                 " calling run_concurrent_ops instead")
    return await self.run_concurrent_ops(num_ops, write_weight)
async def _stop_random_replicas_with_delay(bft_network, delay=10, exclude_replicas=None):
    """
    Stop the replicas of the network one by one, in random order, sleeping
    `delay` seconds after each stop.

    :param exclude_replicas: passed as `without` when listing the replicas.
    :return: a new list with the replicas in the order they were stopped.
    """
    stop_order = bft_network.all_replicas(without=exclude_replicas)
    random.shuffle(stop_order)

    for replica in stop_order:
        log.log_message(message_type=f"stopping replica: {replica}")
        bft_network.stop_replica(replica)
        await trio.sleep(delay)

    return list(stop_order)
def restore_form_older_snapshot(self, bft_network, replica, snapshot_id):
    """
    Replace the DB directory of `replica` with a copy of its snapshot
    `snapshot_id`.

    Both directories live under bft_network.testdir; an existing destination
    directory is removed before the copy.
    """
    with log.start_action(action_type="restore with older snapshot"):
        test_root = bft_network.testdir
        snapshot_db_dir = os.path.join(
            test_root,
            DB_SNAPSHOT_PREFIX + str(replica) + "/" + str(snapshot_id))
        dest_db_dir = os.path.join(test_root, DB_FILE_PREFIX + str(replica))

        # copytree needs the destination to be absent
        if os.path.exists(dest_db_dir):
            shutil.rmtree(dest_db_dir)

        ret = shutil.copytree(snapshot_db_dir, dest_db_dir)
        log.log_message(message_type=f"copy db files from {snapshot_db_dir} to {dest_db_dir}, result is {ret}")
async def test_restarting_replica_with_client_load(self, bft_network):
    """
    The goal of this test is to restart a replica multiple times while the system is processing
    Client Operations to verify the restarted replica recovery and the system's return to
    fast path of processing client requests.
    Scenario:
    1) Send client operations in the background for the duration of the restart loop.
    2) While sending client operations we restart multiple times 1 randomly selected Replica (not the Primary).
    3) After every restart we verify that the system will eventually return to the Fast Path.
    """
    bft_network.start_all_replicas()
    skvbc = kvbc.SimpleKVBCProtocol(bft_network)

    # NOTE(review): replica 0 is assumed to be the primary - confirm.
    primary_replica = 0

    # Pick one replica to restart multiple times while the system is processing client requests
    replica_to_restart = random.choice(
        bft_network.all_replicas(without={primary_replica}))

    # uncomment for live tracking of log messages from the test
    # log = foo()

    async def client_load(task_status=trio.TASK_STATUS_IGNORED):
        # Hand the cancel scope back to the starter so the load can be
        # stopped from outside once the restart loop is done.
        with trio.CancelScope() as scope:
            task_status.started(scope)
            await skvbc.send_indefinite_ops()

    async with trio.open_nursery() as nursery:
        # Start the sending of client operations in the background.
        scoped_client_load = await nursery.start(client_load)
        # NOTE(review): `loops` is not defined in this function; presumably
        # a module-level constant - confirm.
        for v in range(loops * 100):
            if (0 == v % loops):
                log.log_message(f"iteration {v}")
            log.log_message(f"Stop replica {replica_to_restart} and wait for system to move to slow path")
            bft_network.stop_replica(replica_to_restart, True)
            latest_slow_paths = total_slow_paths = await bft_network.num_of_slow_path_requests(primary_replica)
            # Poll the primary until its slow path counter changes.
            with trio.fail_after(seconds=15):
                while latest_slow_paths - total_slow_paths == 0:
                    await trio.sleep(seconds=0.1)
                    latest_slow_paths = await bft_network.num_of_slow_path_requests(primary_replica)
            log.log_message(f"Start replica {replica_to_restart} and wait for system to move to fast path")
            bft_network.start_replica(replica_to_restart)
            latest_fast_paths = total_fast_paths = await bft_network.num_of_fast_path_requests(primary_replica)
            # Poll the primary until its fast path counter changes.
            with trio.fail_after(seconds=15):
                while latest_fast_paths == total_fast_paths:
                    await trio.sleep(seconds=0.1)
                    latest_fast_paths = await bft_network.num_of_fast_path_requests(primary_replica)
        # Stop the background client load so the nursery can exit.
        scoped_client_load.cancel()

    # Before the test ends we verify the Fast Path is prevalent,
    # no matter the restarts we performed on the selected replica.
    log.log_message("wait for fast path to be prevalent")
    await bft_network.wait_for_fast_path_to_be_prevalent(
        run_ops=lambda: skvbc.run_concurrent_ops(num_ops=20, write_weight=1), threshold=20)
    log.log_message("fast path prevailed")
async def test_inactive_window(self, bft_network):
    """
    The goal of this test is to verify full catch up of a Replica only from the Inactive Window.
    1) Start all Replicas without Replica 1, which will later catch up from the Primary's Inactive Window.
    2) Advance all Replicas to 1 sequence number beyond the first stable and verify they have all collected Stable Checkpoints.
    3) Start and isolate the late Replica 1 from all others except the Primary. This way it will not be able to start State Transfer
       and will only be able to catch up from the Primary's Inactive Window.
    4) Verify that Replica 1 has managed to catch up.
    """
    late_replica = 1

    bft_network.start_replicas(
        bft_network.all_replicas(without={late_replica}))
    skvbc = kvbc.SimpleKVBCProtocol(bft_network)

    stable_checkpoint_to_reach = 1
    # last executed sequence number the late replica has to reach
    num_reqs_to_catch_up = 151

    async def write_req(num_req=1):
        for _ in range(num_req):
            await skvbc.write_known_kv()

    # create checkpoint and wait for checkpoint propagation
    await skvbc.fill_and_wait_for_checkpoint(
        initial_nodes=bft_network.get_live_replicas(),
        num_of_checkpoints_to_add=stable_checkpoint_to_reach,
        verify_checkpoint_persistency=False)

    await bft_network.wait_for_replicas_to_collect_stable_checkpoint(
        bft_network.get_live_replicas(), stable_checkpoint_to_reach)

    with trio.fail_after(seconds=30):
        # NOTE(review): the two replica sets are hard-coded ({1} repeats
        # late_replica); this presumably assumes a 7-replica network with
        # replica 0 as primary - confirm.
        with net.ReplicaOneWayTwoSubsetsIsolatingAdversary(
                bft_network, {1}, {6, 5, 4, 3, 2}) as adversary:
            adversary.interfere()

            bft_network.start_replica(late_replica)

            late_replica_catch_up = False
            while not late_replica_catch_up:
                # Log the progress of every replica; only the late
                # replica's value decides whether the loop ends.
                for replica_id in bft_network.all_replicas():
                    last_stable = await bft_network.get_metric(
                        replica_id, bft_network, 'Gauges', "lastStableSeqNum")
                    last_exec = await bft_network.get_metric(
                        replica_id, bft_network, 'Gauges', "lastExecutedSeqNum")
                    log.log_message(
                        message_type=
                        f"replica = {replica_id}; last_stable = {last_stable}; lase_exec = {last_exec}"
                    )
                    if replica_id == late_replica and last_exec >= num_reqs_to_catch_up:
                        late_replica_catch_up = True

                # Send one more write and pause before the next polling round.
                await write_req()
                await trio.sleep(seconds=3)
def transfer_dbcheckpoint_files(self, bft_network, source_replica, snapshot_id, dest_replicas):
    """
    Copy snapshot `snapshot_id` of `source_replica` over the DB directory of
    every replica in `dest_replicas`.

    All directories live under bft_network.testdir; an existing destination
    directory is removed before each copy.
    """
    with log.start_action(action_type="transfer snapshot db files"):
        test_root = bft_network.testdir
        snapshot_db_dir = os.path.join(
            test_root,
            DB_SNAPSHOT_PREFIX + str(source_replica) + "/" + str(snapshot_id))

        for r in dest_replicas:
            dest_db_dir = os.path.join(test_root, DB_FILE_PREFIX + str(r))
            # copytree needs the destination to be absent
            if os.path.exists(dest_db_dir):
                shutil.rmtree(dest_db_dir)
            ret = shutil.copytree(snapshot_db_dir, dest_db_dir)
            log.log_message(message_type=f"copy db files from {snapshot_db_dir} to {dest_db_dir}, result is {ret}")
async def wrapper(*args, **kwargs):
    """
    Build a SkvbcTracker for the network passed as the `bft_network` keyword
    argument, call the wrapped coroutine with it as `tracker`, then fill
    missing blocks and verify.

    The deprecated `disable_linearizability_checks` keyword is dropped (with
    a log message) before the wrapped coroutine is called.
    """
    not_given = object()
    if kwargs.pop('disable_linearizability_checks', not_given) is not not_given:
        log.log_message(message_type='Disabling linearizability is deprecated')

    bft_network = kwargs['bft_network']
    skvbc = kvbc.SimpleKVBCProtocol(bft_network)
    tracker = SkvbcTracker(skvbc.initial_state(), skvbc, bft_network,
                           pre_exec_enabled, no_conflicts, block_Accumulation)

    await async_fn(*args, **kwargs, tracker=tracker)
    await tracker.fill_missing_blocks_and_verify()
async def test_slow_path_view_change(self, bft_network, tracker):
    """
    Validate the BFT engine's transition to the slow path when the primary
    goes down, which effectively triggers a view change in the slow path.

    First a batch of K/V entries is written and checked to have gone through
    the fast commit path. Then replica 0 is stopped and further operations
    are sent, triggering the slow path and a view change. Tracked operations
    are sent for 5 seconds, replica 0 is brought back up, and the requests
    sent after the view change are checked to have been processed via the
    slow path.
    """
    bft_network.start_all_replicas()

    num_ops = 5
    stopped_replica = 0
    skvbc = kvbc.SimpleKVBCProtocol(bft_network, tracker)

    def run_write_batch():
        return skvbc.run_concurrent_ops(num_ops=num_ops, write_weight=1)

    await bft_network.wait_for_fast_path_to_be_prevalent(
        run_ops=run_write_batch, threshold=num_ops)

    bft_network.stop_replica(stopped_replica)

    # trigger the view change
    await skvbc.run_concurrent_ops(num_ops)

    randRep = random.choice(
        bft_network.all_replicas(without={stopped_replica}))
    log.log_message(f'wait_for_view - Random replica {randRep}')

    await bft_network.wait_for_view(
        replica_id=randRep,
        expected=lambda v: v > 0,
        err_msg="Make sure view change has occurred.")

    # Counter values accumulated so far are excluded from the final check.
    nb_fast_paths_to_ignore = await bft_network.num_of_fast_path_requests(randRep)
    nb_slow_paths_to_ignore = await bft_network.num_of_slow_path_requests(randRep)

    with trio.move_on_after(seconds=5):
        async with trio.open_nursery() as nursery:
            nursery.start_soon(skvbc.send_indefinite_tracked_ops, 1)

    bft_network.start_replica(stopped_replica)

    await bft_network.assert_slow_path_prevalent(
        nb_fast_paths_to_ignore=nb_fast_paths_to_ignore,
        nb_slow_paths_to_ignore=nb_slow_paths_to_ignore,
        replica_id=randRep)
async def test_restart_replica_after_view_change(self, bft_network, tracker):
    """
    Make sure that a replica can be safely restarted after a view change:

    1) Start all replicas
    2) Send a batch of concurrent reads/writes, to make sure the initial view is stable
    3) Crash the current primary & trigger view change
    4) Make sure the new view is agreed & activated among all live replicas
    5) Choose a random non-primary and restart it
    6) Send a batch of concurrent reads/writes
    7) Make sure the restarted replica is alive and that it works in the new view
    """
    bft_network.start_all_replicas()

    initial_primary = 0
    current_primary = initial_primary + 1  # primary expected after the view change

    def in_new_view(v):
        return v == current_primary

    skvbc = kvbc.SimpleKVBCProtocol(bft_network, tracker)
    await skvbc.run_concurrent_ops(num_ops=10)

    bft_network.stop_replica(initial_primary)
    await self._send_random_writes(skvbc)

    view_witness = random.choice(
        bft_network.all_replicas(without={initial_primary}))
    await bft_network.wait_for_view(
        replica_id=view_witness,
        expected=in_new_view,
        err_msg="Make sure a view change is triggered.")

    bft_network.start_replica(initial_primary)

    # waiting for the active window to be rebuilt after the view change
    await trio.sleep(seconds=5)

    unstable_replica = random.choice(
        bft_network.all_replicas(without={current_primary, initial_primary}))
    log.log_message(
        message_type=f"Restart replica #{unstable_replica} after the view change.")

    bft_network.stop_replica(unstable_replica)
    bft_network.start_replica(unstable_replica)
    await trio.sleep(seconds=5)

    await skvbc.run_concurrent_ops(num_ops=10)

    await bft_network.wait_for_view(
        replica_id=unstable_replica,
        expected=in_new_view,
        err_msg="Make sure the unstable replica works in the new view.")

    await bft_network.wait_for_view(
        replica_id=initial_primary,
        expected=in_new_view,
        err_msg="Make sure the initial primary activates the new view.")
async def _start_random_replicas_with_delay(bft_network, stopped_replicas, initial_primary,
                                            f_replicas_stopped_early=None, delay=10):
    """
    Start the given replicas one by one, sleeping `delay` seconds after each.

    `stopped_replicas` is shuffled in place; `f_replicas_stopped_early` (if
    any) and then `initial_primary` (if not already present) are appended to
    it, so those are started last.

    :return: the same (mutated) `stopped_replicas` list, in start order.
    """
    random.shuffle(stopped_replicas)
    stopped_replicas.extend(f_replicas_stopped_early or [])
    if initial_primary not in stopped_replicas:
        stopped_replicas.append(initial_primary)

    start_order = stopped_replicas
    for replica in start_order:
        log.log_message(message_type=f"starting replica: {replica}")
        bft_network.start_replica(replica)
        await trio.sleep(delay)

    return start_order
async def _get_gauge(cls, replica_id, bft_network, gauge):
    """
    Return the value of the replica gauge `gauge` for `replica_id`.

    The metric is polled (each attempt limited to one second) until it is
    available.

    :raises trio.TooSlowError: when no value is obtained within 30 seconds.
    """
    gauge_path = ['replica', 'Gauges', gauge]
    with trio.fail_after(seconds=30):
        while True:
            with trio.move_on_after(seconds=1):
                try:
                    return await bft_network.metrics.get(replica_id, *gauge_path)
                except KeyError:
                    # metrics not yet available, continue looping
                    log.log_message(message_type=f"KeyError! '{gauge}' not yet available.")
def _start_integrity_check(self, bft_network, keys_file, s3_config_file, key_to_validate=None):
    """
    Run the s3_integrity_check tool and assert that it exits with code 0.

    :param bft_network: provides `builddir` (location of the tool and of the
        S3 config) and `current_test` (used for the log directory name).
    :param keys_file: passed to the tool with "-k".
    :param s3_config_file: path relative to `builddir`, passed with "-3".
    :param key_to_validate: when given, passed with "-v"; otherwise the tool
        is run with "-a".
    :raises AssertionError: when the tool exits with a non-zero code or
        waiting for it fails.

    When the KEEP_APOLLO_LOGS environment variable is "true" or "on", the
    tool's stdout and stderr are written to stdout_integrity_check.log in
    the test's log directory; otherwise they are inherited from this process.
    """
    with log.start_action(action_type="start_integrity_check"):
        log_file = None
        if os.environ.get('KEEP_APOLLO_LOGS', "").lower() in ["true", "on"]:
            test_name = os.environ.get('TEST_NAME')
            if not test_name:
                now = datetime.now().strftime("%y-%m-%d_%H:%M:%S")
                test_name = f"{now}_{bft_network.current_test}"
            test_dir = f"{bft_network.builddir}/tests/apollo/logs/{test_name}/{bft_network.current_test}/"
            test_log = f"{test_dir}stdout_integrity_check.log"
            log.log_message(message_type=f"test log is: {test_log}")
            os.makedirs(test_dir, exist_ok=True)
            # A single handle is shared by stdout and stderr. Opening the
            # same path twice gives each stream its own file offset, so the
            # two streams overwrite each other's output.
            log_file = open(test_log, 'w+')

        try:
            if log_file is not None:
                log_file.write("############################################\n")
                log_file.flush()

            s3_config_path = os.path.join(bft_network.builddir, s3_config_file)
            integrity_check_exe = os.path.join(bft_network.builddir, "kvbc", "tools",
                                               "db_integrity_check", "s3_integrity_check")
            integrity_check_cmd = [integrity_check_exe, "-k", keys_file, "-3", s3_config_path]
            if key_to_validate is not None:
                integrity_check_cmd.append("-v")
                integrity_check_cmd.append(key_to_validate)
            else:
                integrity_check_cmd.append("-a")

            log.log_message(message_type="starting the subprocess")
            integrity_check_pid = subprocess.Popen(
                integrity_check_cmd,
                stdout=log_file,
                # Redirect stderr only when logging to a file; otherwise it
                # stays on this process's stderr.
                stderr=subprocess.STDOUT if log_file is not None else None,
                close_fds=True)

            try:
                exit_code = integrity_check_pid.wait()
            except Exception as e:
                # Reported as an assertion failure, with the cause attached.
                raise AssertionError(
                    "waiting for s3_integrity_check failed") from e
            assert exit_code == 0, \
                f"s3_integrity_check exited with code {exit_code}"
        finally:
            # Without KEEP_APOLLO_LOGS there is no file to close.
            if log_file is not None:
                log_file.close()
async def test_fast_path_after_view_change(self, bft_network, tracker):
    """
    Validate the BFT engine's ability to restore the fast path after a view
    change due to a crashed primary.

    First a batch of K/V entries is written and checked to have been
    processed via the fast commit path. Then the primary is stopped and a
    single write request is sent to trigger a view change. The primary is
    brought back up and the fast path is expected to be eventually
    maintained.

    Finally the decorator verifies the KV execution.
    """
    bft_network.start_all_replicas()
    skvbc = kvbc.SimpleKVBCProtocol(bft_network, tracker)
    num_ops = 5
    crashed_primary = 0

    def send_initial_batch():
        return self.send_kvs_sequentially(skvbc, num_ops)

    def send_evaluation_batch():
        return self.send_kvs_sequentially(
            skvbc, int(1.1 * self.EVALUATION_PERIOD_SEQUENCES))

    await bft_network.wait_for_consensus_path(
        path_type=ConsensusPathType.OPTIMISTIC_FAST,
        run_ops=send_initial_batch,
        threshold=num_ops)

    # Stop the primary
    bft_network.stop_replica(crashed_primary)

    # Send a write request to trigger a view change
    with trio.move_on_after(seconds=3):
        await skvbc.send_write_kv_set()

    randRep = random.choice(
        bft_network.all_replicas(without={crashed_primary}))
    log.log_message(f'wait_for_view - Random replica {randRep}')

    await bft_network.wait_for_view(
        replica_id=randRep,
        expected=lambda v: v > 0,
        err_msg="Make sure view change has occurred.")

    # Restore the crashed primary
    bft_network.start_replica(crashed_primary)

    await self.wait_for_stable_state(skvbc, timeout_secs=10)

    # View change recovers
    await bft_network.wait_for_consensus_path(
        path_type=ConsensusPathType.OPTIMISTIC_FAST,
        run_ops=send_evaluation_batch,
        threshold=num_ops)
async def get_blocks(self, client, block_ids):
    """
    Read every block in `block_ids` through `client`.

    A read that raises trio.TooSlowError is retried, up to 12 attempts per
    block; the error of the last attempt is re-raised.

    :return: dict mapping each block id to its parsed reply.
    """
    max_attempts = 12  # 60 seconds
    protocol = kvbc.SimpleKVBCProtocol
    blocks = {}
    for block_id in block_ids:
        for attempt in range(1, max_attempts + 1):
            try:
                request = protocol.get_block_data_req(block_id)
                blocks[block_id] = protocol.parse_reply(await client.read(request))
            except trio.TooSlowError:
                if attempt == max_attempts:
                    raise
            else:
                break
        log.log_message(message_type=f'Retrieved block {block_id}')
    return blocks
def setUpClass(cls):
    """
    Prepare the S3 (minio) environment when CONCORD_BFT_MINIO_BINARY_PATH is
    set; otherwise do nothing beyond logging that RocksDB mode is used.

    In S3 mode this sets `cls.work_dir` and `cls.minio_server_data_dir`,
    creates the "blockchain" directory inside the data dir and starts the
    S3 server through `cls._start_s3_server()`.
    """
    if not os.environ.get("CONCORD_BFT_MINIO_BINARY_PATH"):
        log.log_message(
            message_type="CONCORD_BFT_MINIO_BINARY_PATH is not set. Running in RocksDB mode.")
        return

    log.log_message(
        message_type="CONCORD_BFT_MINIO_BINARY_PATH is set. Running in S3 mode.")

    # We need a temp dir for data and binaries - this is cls.work_dir.
    # It will contain the data dir for minio buckets and the minio binary;
    # any directories inside the data dir become buckets.
    # mkdtemp creates a uniquely named directory, replacing the private
    # tempfile._get_candidate_names() helper, which only produced a name.
    cls.work_dir = tempfile.mkdtemp(prefix="concord_bft_minio_datadir_",
                                    dir="/tmp")
    cls.minio_server_data_dir = os.path.join(cls.work_dir, "data")
    # create the data dir and the "blockchain" bucket dir in one call
    os.makedirs(os.path.join(cls.work_dir, "data", "blockchain"))
    log.log_message(message_type=f"Working in {cls.work_dir}")

    # Start server
    cls._start_s3_server()

    log.log_message(message_type="Initialisation complete")
def start_s3_server(self):
    """
    Launch the minio server on `self.minio_server_data_dir` and keep the
    process handle in `self.minio_server_proc`.

    The binary is taken from the CONCORD_BFT_MINIO_BINARY_PATH environment
    variable.

    :raises RuntimeError: when the variable is not set; `self.work_dir` is
        removed first.
    """
    log.log_message(message_type="Starting server")

    minio_server_fname = os.environ.get("CONCORD_BFT_MINIO_BINARY_PATH")
    if minio_server_fname is None:
        shutil.rmtree(self.work_dir)
        raise RuntimeError("Please set path to minio binary to CONCORD_BFT_MINIO_BINARY_PATH env variable")

    # The server inherits the current environment plus its credentials.
    server_env = dict(os.environ,
                      MINIO_ACCESS_KEY="concordbft",
                      MINIO_SECRET_KEY="concordbft")
    server_cmd = [minio_server_fname, "server", self.minio_server_data_dir]
    self.minio_server_proc = subprocess.Popen(server_cmd,
                                              env=server_env,
                                              close_fds=True)
async def test_fast_path_after_view_change(self, bft_network, tracker):
    """
    Validate the BFT engine's ability to restore the fast path after a view
    change due to a crashed primary.

    First a batch of K/V entries is written and checked to have been
    processed via the fast commit path. Then the primary is stopped and a
    single write request is sent to trigger a view change. The primary is
    brought back up and the fast path is expected to be eventually
    maintained.

    Finally the decorator verifies the KV execution.
    """
    bft_network.start_all_replicas()
    skvbc = kvbc.SimpleKVBCProtocol(bft_network, tracker)
    crashed_primary = 0

    def run_write_batch():
        return skvbc.run_concurrent_ops(num_ops=NUM_OPS, write_weight=1)

    # Initially all replicas are running on the fast path
    await bft_network.wait_for_fast_path_to_be_prevalent(
        run_ops=run_write_batch, threshold=NUM_OPS)

    # Stop the primary
    bft_network.stop_replica(crashed_primary)

    # Send a write request to trigger a view change
    with trio.move_on_after(seconds=3):
        await skvbc.send_write_kv_set()

    randRep = random.choice(
        bft_network.all_replicas(without={crashed_primary}))
    log.log_message(f'wait_for_view - Random replica {randRep}')

    await bft_network.wait_for_view(
        replica_id=randRep,
        expected=lambda v: v > 0,
        err_msg="Make sure view change has occurred.")

    # Restore the crashed primary
    bft_network.start_replica(crashed_primary)

    # Make sure that the fast path is maintained eventually
    await bft_network.wait_for_fast_path_to_be_prevalent(
        run_ops=run_write_batch, threshold=NUM_OPS)
async def _check_st_not_started(self, bft_network, ro_replica_id):
    """
    Poll the 'lastExecutedSeqNum' gauge of `ro_replica_id` until it reads 0.

    Each poll is limited to half a second and the whole wait to 70 seconds.

    :raises trio.TooSlowError: when the gauge never reads 0 in time.
    """
    gauge_path = ['replica', 'Gauges', 'lastExecutedSeqNum']
    with trio.fail_after(seconds=70):
        # the ro replica should be able to survive these failures
        while True:
            with trio.move_on_after(seconds=.5):
                try:
                    lastExecutedSeqNum = await bft_network.metrics.get(
                        ro_replica_id, *gauge_path)
                except KeyError:
                    # gauge not published yet: poll again
                    continue
                if lastExecutedSeqNum != 0:
                    continue
                # success!
                log.log_message(message_type="Replica" + str(ro_replica_id) +
                                " : lastExecutedSeqNum:" + str(lastExecutedSeqNum))
                return
def fill_missing_blocks(self, missing_blocks):
    """
    Add all missing blocks to self.blocks.

    These blocks have no matching req_index, since no reply was ever
    received for the request that created them. In some histories an
    unambiguous request cannot be identified, because several requests could
    have correctly generated the same block. Rather than trying to match
    requests to the missing blocks, the blocks are assumed to be correct for
    now, and the full block history is used to verify successful conditional
    writes and reads.

    Also advances self.last_known_block to the highest filled block id and
    stores `missing_blocks` in self.filled_blocks.
    """
    for block_id, kvpairs in missing_blocks.items():
        self.blocks[block_id] = Block(kvpairs)
        if self.last_known_block < block_id:
            self.last_known_block = block_id

    self.filled_blocks = missing_blocks
    filled_count = len(missing_blocks)
    log.log_message(message_type=f'{filled_count} missing blocks filled.')
async def _test_st_while_crashing_primary(
        self, bft_network, trigger_view_change, crash_repeatedly, tracker):
    """
    Run state transfer on the last replica while the primary (replica 0) is
    crashed once or repeatedly, then verify a known key can still be read.

    :param trigger_view_change: forwarded to the crash-once scenario.
    :param crash_repeatedly: selects the repeated-crash scenario when true,
        the crash-once scenario otherwise.
    """
    # we need a BFT network with f >= 2, allowing us to have 2
    # crashed replicas at the same time (the primary and the stale node)
    n = bft_network.config.n
    stale_replica = n - 1
    skvbc = kvbc.SimpleKVBCProtocol(bft_network, tracker)

    client, known_key, known_val = await skvbc.prime_for_state_transfer(
        stale_nodes={stale_replica},
        checkpoints_num=2)

    view = await bft_network.wait_for_view(
        replica_id=0,
        expected=lambda v: v == 0,
        err_msg="Make sure we are in the initial view.")
    log.log_message(message_type=f'Initial view number is {view}, as expected.')

    # arguments shared by both crash scenarios
    scenario_args = dict(skvbc=skvbc, bft_network=bft_network, n=n,
                         primary=0, stale=stale_replica)
    if crash_repeatedly:
        await self._run_state_transfer_while_crashing_primary_repeatedly(
            **scenario_args)
    else:
        await self._run_state_transfer_while_crashing_primary_once(
            **scenario_args, trigger_view_change=trigger_view_change)

    await bft_network.force_quorum_including_replica(stale_replica)

    kvpairs = await skvbc.send_read_kv_set(client, known_key)
    self.assertDictEqual({known_key: known_val}, kvpairs)