Example No. 1
    async def _wait_for_view_under_constant_load(
            self,
            replica_id,
            bft_network,
            expected=None,
            err_msg="Expected view not reached"):
        """
        Similar to wait_for_view method, except it allows for consecutive unexpected
        view changes when waiting for active view.
        """
        if expected is None:
            expected = lambda _: True

        matching_view = None
        nb_replicas_in_matching_view = 0
        try:
            matching_view = await bft_network._wait_for_matching_agreed_view(
                replica_id, expected)
            log.log_message(
                message_type=
                f'Matching view #{matching_view} has been agreed among replicas.'
            )

            nb_replicas_in_matching_view = await self._wait_for_active_view_under_constant_load(
                matching_view, bft_network, replica_id, expected)
            log.log_message(
                message_type=f'View #{matching_view} has been activated by '
                f'{nb_replicas_in_matching_view} >= n-f = {bft_network.config.n - bft_network.config.f}'
            )

            return matching_view
        except trio.TooSlowError:
            assert False, err_msg + \
                          f'(matchingView={matching_view} ' \
                          f'replicasInMatchingView={nb_replicas_in_matching_view})'
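A hedged usage sketch (not from the source): a test coroutine might pair this helper with background client traffic, crashing the primary while load keeps running. The surrounding test class, the send_indefinite_ops call (used the same way in Example No. 9), and the error message are assumptions here.

    async def _example_view_change_under_load(self, bft_network, skvbc):
        # crash the primary while clients keep sending requests, then wait for the
        # next view even if additional unexpected view changes happen in between
        initial_primary = await bft_network.get_current_primary()
        bft_network.stop_replica(initial_primary)
        async with trio.open_nursery() as nursery:
            nursery.start_soon(skvbc.send_indefinite_ops)  # constant load (assumed API)
            await self._wait_for_view_under_constant_load(
                replica_id=random.choice(bft_network.all_replicas(without={initial_primary})),
                bft_network=bft_network,
                expected=lambda v: v > initial_primary,
                err_msg="Expected view change under constant load")
            nursery.cancel_scope.cancel()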
Example No. 2
    async def _restart_stale_until_non_primary_chosen_as_source(
            self, bft_network, primary, stale, non_primary_replicas):

        source_replica_id = inf

        log.log_message(message_type=f'Restarting stale replica until '
                        f'it fetches from {non_primary_replicas}...')
        with trio.move_on_after(10):  # seconds
            while True:
                bft_network.start_replica(stale)
                source_replica_id = await bft_network.wait_for_fetching_state(
                    replica_id=stale)
                bft_network.stop_replica(stale)
                if source_replica_id in non_primary_replicas:
                    self.assertTrue(expr=source_replica_id != primary,
                                    msg="The source must NOT be the primary "
                                    "(to avoid triggering a view change)")
                    log.log_message(
                        message_type=
                        f'Stale replica fetching from {source_replica_id}')
                    break

        self.assertTrue(source_replica_id != inf,
                        msg="Stale replica is not fetching right now.")
        return source_replica_id
Example No. 3
    async def _wait_for_active_view_under_constant_load(
            view, bft_network, replica_id, expected, fail_after_time=30):
        """
        Wait for the latest matching_view to become active on enough (n-f) replicas
        """
        with trio.fail_after(seconds=fail_after_time) as cancel_scope:
            while True:
                nb_replicas_in_view = await bft_network._count_replicas_in_view(
                    view)

                # wait for n-f = 2f+2c+1 replicas to be in the expected view
                if nb_replicas_in_view >= 2 * bft_network.config.f + 2 * bft_network.config.c + 1:
                    break

                # if matching_view updates due to unexpected view change, wait for the latest
                # matching_view to become active
                matching_view = await bft_network._wait_for_matching_agreed_view(
                    replica_id, expected)
                if matching_view > view:
                    log.log_message(
                        message_type=
                        f'Updated matching view #{matching_view} has been agreed among replicas.'
                    )
                    view = matching_view
                    # extend the deadline so the newly agreed view has time to become active
                    cancel_scope.deadline += 30
        return nb_replicas_in_view
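For reference, the threshold in the loop above follows from the replica-count formula n = 3f + 2c + 1 that the comment's n-f = 2f+2c+1 implies; a quick sanity check with illustrative values for f and c:

    # n = 3f + 2c + 1  =>  n - f = 2f + 2c + 1 replicas must report the active view
    f, c = 2, 0                            # illustrative values only
    n = 3 * f + 2 * c + 1                  # 7 replicas in total
    assert n - f == 2 * f + 2 * c + 1      # 5 replicas form the required quorum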
Example No. 4
    async def _run_state_transfer_while_crashing_non_primary(
            self, bft_network, primary, stale, non_primary_replicas):

        source_replica_id = \
            await self._restart_stale_until_non_primary_chosen_as_source(
                bft_network, primary, stale, non_primary_replicas
            )

        if source_replica_id in non_primary_replicas:
            log.log_message(
                message_type=f'Stopping source replica {source_replica_id}')
            bft_network.stop_replica(source_replica_id)

            log.log_message(
                message_type=
                f'Re-starting stale replica {stale} to start state transfer')
            bft_network.start_replica(stale)

            await bft_network.wait_for_state_transfer_to_stop(
                up_to_date_node=primary, stale_node=stale)

            log.log_message(
                message_type=f'State transfer completed, despite initial source '
                f'replica {source_replica_id} being down')

            bft_network.start_replica(source_replica_id)
        else:
            log.log_message(
                message_type="No source replica set in stale node, checking "
                "if state transfer has already completed...")
            await bft_network.wait_for_state_transfer_to_stop(
                up_to_date_node=primary, stale_node=stale)
            log.log_message(
                message_type="State transfer completed before we had a chance "
                "to stop the source replica.")
Example No. 5
 async def _wait_for_st(self,
                        bft_network,
                        ro_replica_id,
                        seqnum_threshold=150):
     # TODO replace the below function with the library function:
     # await tracker.skvbc.tracked_fill_and_wait_for_checkpoint(
     # initial_nodes=bft_network.all_replicas(),
     # num_of_checkpoints_to_add=1)
     with trio.fail_after(seconds=70):
         # the ro replica should be able to survive these failures
         while True:
             with trio.move_on_after(seconds=.5):
                 try:
                     key = ['replica', 'Gauges', 'lastExecutedSeqNum']
                     lastExecutedSeqNum = await bft_network.metrics.get(
                         ro_replica_id, *key)
                 except KeyError:
                     continue
                 else:
                     # success!
                     if lastExecutedSeqNum >= seqnum_threshold:
                         log.log_message(message_type="Replica" +
                                         str(ro_replica_id) +
                                         " : lastExecutedSeqNum:" +
                                         str(lastExecutedSeqNum))
                         break
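The polling loop above (an outer fail_after, a short move_on_after per probe, and a KeyError retry while the metric is unavailable) recurs in several of these examples; Example No. 21 factors the same pattern into a _get_gauge helper. A hedged sketch, assuming that helper is available on the same class, of how _wait_for_st could be expressed on top of it:

    async def _wait_for_seqnum(self, bft_network, replica_id, seqnum_threshold=150):
        # poll lastExecutedSeqNum via the _get_gauge-style helper until the threshold is crossed
        with trio.fail_after(seconds=70):
            while True:
                last_executed = await self._get_gauge(replica_id, bft_network, 'lastExecutedSeqNum')
                if last_executed >= seqnum_threshold:
                    return last_executed
                await trio.sleep(0.5)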
Example No. 6
 def stop_blinking(self):
     if self.blinker_process:
         self.blinker_process.terminate()
         if self.blinker_process.wait() != 0:
             raise Exception(
                 "Error occurred while stopping the blinker process")
     log.log_message(message_type="Stopped blinking")
    async def _fill_and_wait_for_checkpoint_under_constant_load(skvbc, bft_network, initial_nodes,
                                                                num_of_checkpoints_to_add=2,
                                                                verify_checkpoint_persistency=True,
                                                                assert_state_transfer_not_started=True):
        """
        Similar to fill_and_wait_for_checkpoint, except under constant load additional
        checkpoints may be created. The expected_checkpoint_num in that case may not
        necessarily be checkpoint_before + num_of_checkpoints_to_add. This function
        account for the unexpected checkpoints created due to constant load.
        Unlike fill_and_wait_for_checkpoint, checkpoint_before is obtained from the current_primary
        instead of a random replica, as under a constant load, it can be possible the chosen replica
        may be behind.
        """
        client = kvbc.SkvbcClient(bft_network.random_client())
        current_primary = await bft_network.get_current_primary()
        checkpoint_before = await bft_network.wait_for_checkpoint(current_primary)

        log.log_message(message_type=f"expected_checkpoint_num should be > {checkpoint_before}")
        # Write enough data to checkpoint and create a need for state transfer
        for i in range(1 + num_of_checkpoints_to_add * 150):
            key = skvbc.random_key()
            val = skvbc.random_value()
            reply = await client.write([], [(key, val)])
            assert reply.success

        await skvbc.network_wait_for_checkpoint(
            initial_nodes,
            expected_checkpoint_num=lambda ecn: ecn > checkpoint_before,
            verify_checkpoint_persistency=verify_checkpoint_persistency,
            assert_state_transfer_not_started=assert_state_transfer_not_started)
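A hedged usage sketch, assuming the helper above is a @staticmethod on the test class and reusing the background-load pattern from Example No. 9; everything other than the helper call itself is an assumption:

    async def _example_checkpoint_under_constant_load(self, bft_network, skvbc):
        # keep client load running while an extra checkpoint is created and verified
        async with trio.open_nursery() as nursery:
            nursery.start_soon(skvbc.send_indefinite_ops)  # constant load (assumed API)
            await self._fill_and_wait_for_checkpoint_under_constant_load(
                skvbc, bft_network,
                initial_nodes=bft_network.all_replicas(),
                num_of_checkpoints_to_add=1)
            nursery.cancel_scope.cancel()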
Example No. 8
 async def _trigger_view_change(self, skvbc):
     log.log_message(
         message_type="Sending random transactions to trigger view change..."
     )
     with trio.move_on_after(1):  # seconds
         async with trio.open_nursery() as nursery:
             nursery.start_soon(skvbc.send_indefinite_write_requests)
Example No. 9
 async def test_ro_replica_start_simultaneously(self, bft_network, tracker):
     """
     Start up N of N regular replicas.
     Start read-only replica.
     Send client commands.
     Wait for State Transfer in ReadOnlyReplica to complete.
     """
     bft_network.start_all_replicas()
     # start the read-only replica
     ro_replica_id = bft_network.config.n
     bft_network.start_replica(ro_replica_id)
     # TODO replace the below function with the library function:
     # await tracker.skvbc.tracked_fill_and_wait_for_checkpoint(
     # initial_nodes=bft_network.all_replicas(),
     # num_of_checkpoints_to_add=1)
     with trio.fail_after(seconds=60):
         async with trio.open_nursery() as nursery:
             skvbc = kvbc.SimpleKVBCProtocol(bft_network, tracker)
             nursery.start_soon(skvbc.send_indefinite_ops, .7, .1)
             while True:
                 with trio.move_on_after(seconds=.5):
                     try:
                         key = ['replica', 'Gauges', 'lastExecutedSeqNum']
                         lastExecutedSeqNum = await bft_network.metrics.get(
                             ro_replica_id, *key)
                     except KeyError:
                         continue
                     else:
                         # success!
                         if lastExecutedSeqNum >= 150:
                             log.log_message(message_type="Replica" +
                                             str(ro_replica_id) +
                                             " : lastExecutedSeqNum:" +
                                             str(lastExecutedSeqNum))
                             nursery.cancel_scope.cancel()
Example No. 10
    async def test_isolate_non_primaries_subset_with_view_change(
            self, bft_network, tracker):
        """
        In this test we isolate f-1 replicas from the rest of the BFT network.
        We crash the primary and trigger view change while the f-1 replicas are still isolated.
        At this point we have a total of f unavailable replicas.

        The adversary is then deactivated and we make sure the previously isolated replicas
        activate the new view and correctly process incoming client requests.
        """
        bft_network.start_all_replicas()

        f = bft_network.config.f
        initial_primary = await bft_network.get_current_primary()
        expected_next_primary = 1 + initial_primary
        isolated_replicas = bft_network.random_set_of_replicas(
            f - 1, without={initial_primary, expected_next_primary})

        log.log_message(
            message_type=
            f'Isolating network traffic to/from replicas {isolated_replicas}.')
        with net.ReplicaSubsetIsolatingAdversary(
                bft_network, isolated_replicas) as adversary:
            adversary.interfere()

            bft_network.stop_replica(initial_primary)
            await self._send_random_writes(tracker)

            await bft_network.wait_for_view(
                replica_id=random.choice(
                    bft_network.all_replicas(
                        without={initial_primary}.union(isolated_replicas))),
                expected=lambda v: v == expected_next_primary,
                err_msg="Make sure view change has been triggered.")

            # waiting for the active window to be rebuilt after the view change
            await trio.sleep(seconds=5)

        # the adversary is not active anymore:
        # make sure the isolated replicas activate the new view
        for ir in isolated_replicas:
            await bft_network.wait_for_view(
                replica_id=ir,
                expected=lambda v: v == expected_next_primary,
                err_msg=
                f"Make sure isolated replica #{ir} works in new view {expected_next_primary}."
            )

        # then make sure the isolated replicas participate in consensus & request execution
        await tracker.run_concurrent_ops(num_ops=50)

        expected_last_executed_seq_num = await bft_network.wait_for_last_executed_seq_num(
            replica_id=random.choice(
                bft_network.all_replicas(
                    without={initial_primary}.union(isolated_replicas))))

        for ir in isolated_replicas:
            await bft_network.wait_for_last_executed_seq_num(
                replica_id=ir, expected=expected_last_executed_seq_num)
Example No. 11
 async def run_concurrent_conflict_ops(self, num_ops, write_weight=.70):
     if self.tracker.no_conflicts is True:
         log.log_message(message_type="call to run_concurrent_conflict_ops with no_conflicts=True,"
                                      " calling run_concurrent_ops instead")
         return await self.run_concurrent_ops(num_ops, write_weight)
     max_concurrency = len(self.bft_network.clients) // 2
     max_size = len(self.keys) // 2
     return await self.send_concurrent_ops(num_ops, max_concurrency, max_size, write_weight, create_conflicts=True)
Example No. 12
 async def _stop_random_replicas_with_delay(bft_network, delay=10, exclude_replicas=None):
     all_replicas = bft_network.all_replicas(without=exclude_replicas)
     random.shuffle(all_replicas)
     for replica in all_replicas:
         log.log_message(message_type=f"stopping replica: {replica}")
         bft_network.stop_replica(replica)
         await trio.sleep(delay)
     return list(all_replicas)
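A hedged sketch (not from the source) combining this helper with _start_random_replicas_with_delay from Example No. 20 into a rolling restart; the wrapper function and its parameter choices are assumptions:

 async def _example_rolling_restart(bft_network, initial_primary):
     # stop the non-primary replicas one by one, then the primary, and bring
     # everything back in random order with the same delay between starts
     stopped = await _stop_random_replicas_with_delay(
         bft_network, delay=10, exclude_replicas={initial_primary})
     bft_network.stop_replica(initial_primary)
     return await _start_random_replicas_with_delay(
         bft_network, stopped, initial_primary, delay=10)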
Example No. 13
 def restore_form_older_snapshot(self, bft_network, replica, snapshot_id):
     with log.start_action(action_type="restore with older snapshot"):
         snapshot_db_dir = os.path.join(bft_network.testdir, DB_SNAPSHOT_PREFIX + str(replica) + "/" + str(snapshot_id))
         dest_db_dir = os.path.join(bft_network.testdir, DB_FILE_PREFIX + str(replica))
         if os.path.exists(dest_db_dir):
             shutil.rmtree(dest_db_dir)
         ret = shutil.copytree(snapshot_db_dir, dest_db_dir)
         log.log_message(message_type=f"copy db files from {snapshot_db_dir} to {dest_db_dir}, result is {ret}")
    async def test_restarting_replica_with_client_load(self, bft_network):
        """
        The goal of this test is to restart a replica multiple times while the system is processing
        Client Operations to verify the restarted replica recovery and the system's return to
        fast path of processing client requests.
        Scenario:
        1) For 1 minute send client operations.
        2) While sending client operations we restart multiple times 1 randomly selected Replica
           (not the Primary).
        3) After every restart we verify that the system will eventually return to the Fast Path.
        """

        bft_network.start_all_replicas()
        skvbc = kvbc.SimpleKVBCProtocol(bft_network)

        primary_replica = 0

        # Pick one replica to restart multiple times while the system is processing client requests
        replica_to_restart = random.choice(
            bft_network.all_replicas(without={primary_replica}))

        # uncomment for live tracking of log messages from the test
        # log = foo()

        async def client_load(task_status=trio.TASK_STATUS_IGNORED):
            with trio.CancelScope() as scope:
                task_status.started(scope)
                await skvbc.send_indefinite_ops()

        async with trio.open_nursery() as nursery:
            # Start the sending of client operations in the background.
            scoped_client_load = await nursery.start(client_load)
            for v in range(loops * 100):
                if (0 == v % loops):
                    log.log_message(f"iteration {v}")

                log.log_message(f"Stop replica {replica_to_restart} and wait for system to move to slow path")
                bft_network.stop_replica(replica_to_restart, True)
                latest_slow_paths = total_slow_paths = await bft_network.num_of_slow_path_requests(primary_replica)
                with trio.fail_after(seconds=15):
                    while latest_slow_paths - total_slow_paths == 0:
                        await trio.sleep(seconds=0.1)
                        latest_slow_paths = await bft_network.num_of_slow_path_requests(primary_replica)
                log.log_message(f"Start replica {replica_to_restart} and wait for system to move to fast path")
                bft_network.start_replica(replica_to_restart)
                latest_fast_paths = total_fast_paths = await bft_network.num_of_fast_path_requests(primary_replica)
                with trio.fail_after(seconds=15):
                    while latest_fast_paths == total_fast_paths:
                        await trio.sleep(seconds=0.1)
                        latest_fast_paths = await bft_network.num_of_fast_path_requests(primary_replica)
            scoped_client_load.cancel()

        # Before the test ends we verify the Fast Path is prevalent,
        # no matter the restarts we performed on the selected replica.
        log.log_message("wait for fast path to be prevalent")
        await bft_network.wait_for_fast_path_to_be_prevalent(
            run_ops=lambda: skvbc.run_concurrent_ops(num_ops=20, write_weight=1), threshold=20)
        log.log_message("fast path prevailed")
Example No. 15
    async def test_inactive_window(self, bft_network):
        """
        The goal of this test is to verify full catch up of a Replica only from the Inactive Window.
        1) Start all Replicas without Replica 1, which will later catch up from the Primary's Inactive Window.
        2) Advance all Replicas to 1 sequence number beyond the first stable and verify they have all collected
           Stable Checkpoints.
        3) Start and isolate the late Replica 1 from all others except the Primary. This way it will not be able
           to start State Transfer and will only be able to catch up from the Primary's Inactive Window.
        4) Verify that Replica 1 has managed to catch up.
        """

        late_replica = 1

        bft_network.start_replicas(
            bft_network.all_replicas(without={late_replica}))
        skvbc = kvbc.SimpleKVBCProtocol(bft_network)

        stable_checkpoint_to_reach = 1
        num_reqs_to_catch_up = 151

        async def write_req(num_req=1):
            for _ in range(num_req):
                await skvbc.write_known_kv()

        # create checkpoint and wait for checkpoint propagation
        await skvbc.fill_and_wait_for_checkpoint(
            initial_nodes=bft_network.get_live_replicas(),
            num_of_checkpoints_to_add=stable_checkpoint_to_reach,
            verify_checkpoint_persistency=False)

        await bft_network.wait_for_replicas_to_collect_stable_checkpoint(
            bft_network.get_live_replicas(), stable_checkpoint_to_reach)

        with trio.fail_after(seconds=30):
            with net.ReplicaOneWayTwoSubsetsIsolatingAdversary(
                    bft_network, {1}, {6, 5, 4, 3, 2}) as adversary:
                adversary.interfere()

                bft_network.start_replica(late_replica)

                late_replica_catch_up = False
                while not late_replica_catch_up:
                    for replica_id in bft_network.all_replicas():
                        last_stable = await bft_network.get_metric(
                            replica_id, bft_network, 'Gauges',
                            "lastStableSeqNum")
                        last_exec = await bft_network.get_metric(
                            replica_id, bft_network, 'Gauges',
                            "lastExecutedSeqNum")
                        log.log_message(
                            message_type=
                            f"replica = {replica_id}; last_stable = {last_stable}; lase_exec = {last_exec}"
                        )
                        if replica_id == late_replica and last_exec >= num_reqs_to_catch_up:
                            late_replica_catch_up = True

                    await write_req()
                    await trio.sleep(seconds=3)
Example No. 16
 def transfer_dbcheckpoint_files(self, bft_network, source_replica, snapshot_id, dest_replicas):
     with log.start_action(action_type="transfer snapshot db files"):
         snapshot_db_dir = os.path.join(bft_network.testdir, DB_SNAPSHOT_PREFIX + str(source_replica) + "/" + str(snapshot_id))
         for r in dest_replicas:
             dest_db_dir = os.path.join(bft_network.testdir, DB_FILE_PREFIX + str(r))
             if os.path.exists(dest_db_dir):
                 shutil.rmtree(dest_db_dir)
             ret = shutil.copytree(snapshot_db_dir, dest_db_dir)
             log.log_message(message_type=f"copy db files from {snapshot_db_dir} to {dest_db_dir}, result is {ret}")
Example No. 17
 async def wrapper(*args, **kwargs):
     if 'disable_linearizability_checks' in kwargs:
         kwargs.pop('disable_linearizability_checks')
         log.log_message(message_type='Disabling linearizability checks is deprecated')

     bft_network = kwargs['bft_network']
     skvbc = kvbc.SimpleKVBCProtocol(bft_network)
     init_state = skvbc.initial_state()
     tracker = SkvbcTracker(init_state, skvbc, bft_network, pre_exec_enabled, no_conflicts, block_Accumulation)
     await async_fn(*args, **kwargs, tracker=tracker)
     await tracker.fill_missing_blocks_and_verify()
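For context, this wrapper reads like the inner function of a decorator factory: async_fn, pre_exec_enabled, no_conflicts and block_Accumulation all come from an enclosing scope that is not shown. A hedged sketch of what such a factory might look like; the factory name, the functools import and the default values are assumptions:

 def verify_with_tracker(pre_exec_enabled=False, no_conflicts=False, block_Accumulation=False):
     # hypothetical decorator factory: wraps a test coroutine, injects a SkvbcTracker as
     # the `tracker` kwarg, and verifies the tracked history once the test returns
     def decorator(async_fn):
         @functools.wraps(async_fn)
         async def wrapper(*args, **kwargs):
             if 'disable_linearizability_checks' in kwargs:
                 kwargs.pop('disable_linearizability_checks')
                 log.log_message(message_type='Disabling linearizability checks is deprecated')
             bft_network = kwargs['bft_network']
             skvbc = kvbc.SimpleKVBCProtocol(bft_network)
             init_state = skvbc.initial_state()
             tracker = SkvbcTracker(init_state, skvbc, bft_network,
                                    pre_exec_enabled, no_conflicts, block_Accumulation)
             await async_fn(*args, **kwargs, tracker=tracker)
             await tracker.fill_missing_blocks_and_verify()
         return wrapper
     return decorator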
Example No. 18
    async def test_slow_path_view_change(self, bft_network, tracker):
        """
        This test validates the BFT engine's transition to the slow path
        when the primary goes down. This effectively triggers a view change in the slow path.

        First we write a batch of K/V entries and track them using the tracker from the decorator.

        We check those entries have been processed via the fast commit path.

        We stop the primary and send an indefinite batch of tracked read & write requests,
        triggering slow path & view change.

        We bring the primary back up.

        We make sure the second batch of requests has been processed via the slow path.
        """
        bft_network.start_all_replicas()

        num_ops = 5
        skvbc = kvbc.SimpleKVBCProtocol(bft_network, tracker)
        await bft_network.wait_for_fast_path_to_be_prevalent(
            run_ops=lambda: skvbc.run_concurrent_ops(num_ops=num_ops,
                                                     write_weight=1),
            threshold=num_ops)

        bft_network.stop_replica(0)

        # trigger the view change
        await skvbc.run_concurrent_ops(num_ops)

        randRep = random.choice(bft_network.all_replicas(without={0}))

        log.log_message(f'wait_for_view - Random replica {randRep}')

        await bft_network.wait_for_view(
            replica_id=randRep,
            expected=lambda v: v > 0,
            err_msg="Make sure view change has occurred.")

        nb_fast_paths_to_ignore = await bft_network.num_of_fast_path_requests(
            randRep)
        nb_slow_paths_to_ignore = await bft_network.num_of_slow_path_requests(
            randRep)

        with trio.move_on_after(seconds=5):
            async with trio.open_nursery() as nursery:
                nursery.start_soon(skvbc.send_indefinite_tracked_ops, 1)

        bft_network.start_replica(0)

        await bft_network.assert_slow_path_prevalent(
            nb_fast_paths_to_ignore=nb_fast_paths_to_ignore,
            nb_slow_paths_to_ignore=nb_slow_paths_to_ignore,
            replica_id=randRep)
Example No. 19
    async def test_restart_replica_after_view_change(self, bft_network,
                                                     tracker):
        """
        This test makes sure that a replica can be safely restarted after a view change:
        1) Start all replicas
        2) Send a batch of concurrent reads/writes, to make sure the initial view is stable
        3) Crash the current primary & trigger view change
        4) Make sure the new view is agreed & activated among all live replicas
        5) Choose a random non-primary and restart it
        6) Send a batch of concurrent reads/writes
        7) Make sure the restarted replica is alive and that it works in the new view
        """
        bft_network.start_all_replicas()
        initial_primary = 0
        skvbc = kvbc.SimpleKVBCProtocol(bft_network, tracker)
        await skvbc.run_concurrent_ops(num_ops=10)

        bft_network.stop_replica(initial_primary)
        await self._send_random_writes(skvbc)

        await bft_network.wait_for_view(
            replica_id=random.choice(
                bft_network.all_replicas(without={initial_primary})),
            expected=lambda v: v == initial_primary + 1,
            err_msg="Make sure a view change is triggered.")
        current_primary = initial_primary + 1

        bft_network.start_replica(initial_primary)

        # waiting for the active window to be rebuilt after the view change
        await trio.sleep(seconds=5)

        unstable_replica = random.choice(
            bft_network.all_replicas(
                without={current_primary, initial_primary}))
        log.log_message(
            message_type=
            f"Restart replica #{unstable_replica} after the view change.")

        bft_network.stop_replica(unstable_replica)
        bft_network.start_replica(unstable_replica)
        await trio.sleep(seconds=5)

        await skvbc.run_concurrent_ops(num_ops=10)

        await bft_network.wait_for_view(
            replica_id=unstable_replica,
            expected=lambda v: v == current_primary,
            err_msg="Make sure the unstable replica works in the new view.")

        await bft_network.wait_for_view(
            replica_id=initial_primary,
            expected=lambda v: v == current_primary,
            err_msg="Make sure the initial primary activates the new view.")
 async def _start_random_replicas_with_delay(bft_network, stopped_replicas,  initial_primary,
                                             f_replicas_stopped_early=None, delay=10):
     random.shuffle(stopped_replicas)
     if f_replicas_stopped_early:
         stopped_replicas.extend(f_replicas_stopped_early)
     if initial_primary not in stopped_replicas:
         stopped_replicas.append(initial_primary)
     for replica in stopped_replicas:
         log.log_message(message_type=f"starting replica: {replica}")
         bft_network.start_replica(replica)
         await trio.sleep(delay)
     return stopped_replicas
Example No. 21
 async def _get_gauge(cls, replica_id, bft_network, gauge):
     with trio.fail_after(seconds=30):
         while True:
             with trio.move_on_after(seconds=1):
                 try:
                     key = ['replica', 'Gauges', gauge]
                     value = await bft_network.metrics.get(replica_id, *key)
                 except KeyError:
                     # metrics not yet available, continue looping
                     log.log_message(message_type=f"KeyError! '{gauge}' not yet available.")
                 else:
                     return value
Example No. 22
    def _start_integrity_check(self, bft_network, keys_file, s3_config_file, key_to_validate=None):
        """
        Start integrity check
        """
        with log.start_action(action_type="start_integrity_check"):
            stdout_file = None
            stderr_file = None

            if os.environ.get('KEEP_APOLLO_LOGS', "").lower() in ["true", "on"]:
                test_name = os.environ.get('TEST_NAME')

                if not test_name:
                    now = datetime.now().strftime("%y-%m-%d_%H:%M:%S")
                    test_name = f"{now}_{bft_network.current_test}"

                test_dir = f"{bft_network.builddir}/tests/apollo/logs/{test_name}/{bft_network.current_test}/"
                test_log = f"{test_dir}stdout_integrity_check.log"
                log.log_message(message_type=f"test log is: {test_log}")
                os.makedirs(test_dir, exist_ok=True)

                stdout_file = open(test_log, 'w+')
                stderr_file = open(test_log, 'w+')

                stdout_file.write("############################################\n")
                stdout_file.flush()
                stderr_file.write("############################################\n")
                stderr_file.flush()

            s3_config_path = os.path.join(bft_network.builddir, s3_config_file)
            integrity_check_fds = (stdout_file, stderr_file)
            integrity_check_exe = os.path.join(bft_network.builddir, "kvbc", "tools", "db_integrity_check", "s3_integrity_check")
            integrity_check_cmd = [integrity_check_exe,
                                   "-k", keys_file,
                                   "-3", s3_config_path]
            if key_to_validate is not None:
                integrity_check_cmd.append("-v")
                integrity_check_cmd.append(key_to_validate)
            else:
                integrity_check_cmd.append("-a")
            log.log_message(message_type="starting the subprocess")
            integrity_check_pid = subprocess.Popen(
                integrity_check_cmd,
                stdout=stdout_file,
                stderr=stderr_file,
                close_fds=True)
            try:
                exit_code = integrity_check_pid.wait()
                assert exit_code == 0
            except Exception as e:
                assert False, f"integrity check failed: {e}"
            finally:
                for fd in integrity_check_fds:
                    if fd:
                        fd.close()
Example No. 23
    async def test_fast_path_after_view_change(self, bft_network, tracker):
        """
        This test validates the BFT engine's ability to restore the fast path
        after a view change due to a crashed primary.

        First we write a batch of K/V entries and check those entries have been processed via the fast commit path.

        We stop the primary and send a single write request to trigger a view change.

        We bring the primary back up.

        We make sure the fast path is eventually maintained.

        Finally the decorator verifies the KV execution.
        """
        bft_network.start_all_replicas()
        skvbc = kvbc.SimpleKVBCProtocol(bft_network, tracker)
        num_ops = 5

        await bft_network.wait_for_consensus_path(
            path_type=ConsensusPathType.OPTIMISTIC_FAST,
            run_ops=lambda: self.send_kvs_sequentially(skvbc, num_ops),
            threshold=num_ops)

        # Stop the primary
        bft_network.stop_replica(0)

        # Send a write request to trigger a view change
        with trio.move_on_after(seconds=3):
            await skvbc.send_write_kv_set()

        randRep = random.choice(bft_network.all_replicas(without={0}))

        log.log_message(f'wait_for_view - Random replica {randRep}')

        await bft_network.wait_for_view(
            replica_id=randRep,
            expected=lambda v: v > 0,
            err_msg="Make sure view change has occurred.")

        # Restore the crashed primary
        bft_network.start_replica(0)

        await self.wait_for_stable_state(skvbc, timeout_secs=10)

        # View change recovers
        await bft_network.wait_for_consensus_path(
            path_type=ConsensusPathType.OPTIMISTIC_FAST,
            run_ops=lambda: self.send_kvs_sequentially(
                skvbc, int(1.1 * self.EVALUATION_PERIOD_SEQUENCES)),
            threshold=num_ops)
Example No. 24
 async def get_blocks(self, client, block_ids):
     blocks = {}
     for block_id in block_ids:
         retries = 12 # 60 seconds
         for i in range(0, retries):
             try:
                 msg = kvbc.SimpleKVBCProtocol.get_block_data_req(block_id)
                 blocks[block_id] = kvbc.SimpleKVBCProtocol.parse_reply(await client.read(msg))
                 break
             except trio.TooSlowError:
                 if i == retries - 1:
                     raise
         log.log_message(message_type=f'Retrieved block {block_id}')
     return blocks
Example No. 25
    def setUpClass(cls):
        if not os.environ.get("CONCORD_BFT_MINIO_BINARY_PATH"):
            log.log_message(
                message_type=
                "CONCORD_BFT_MINIO_BINARY_PATH is not set. Running in RocksDB mode."
            )
            return

        log.log_message(
            message_type=
            "CONCORD_BFT_MINIO_BINARY_PATH is set. Running in S3 mode.")

        # We need a temp dir for data and binaries - this is cls.work_dir
        # cls.work_dir will contain the data dir for minio buckets and the minio binary
        # if there are any directories inside the data dir - they become buckets
        cls.work_dir = "/tmp/concord_bft_minio_datadir_" + next(
            tempfile._get_candidate_names())
        cls.minio_server_data_dir = os.path.join(cls.work_dir, "data")
        os.makedirs(os.path.join(cls.work_dir, "data",
                                 "blockchain"))  # create all dirs in one call

        log.log_message(message_type=f"Working in {cls.work_dir}")

        # Start server
        cls._start_s3_server()

        log.log_message(message_type="Initialisation complete")
Example No. 26
    def start_s3_server(self):
        log.log_message(message_type="Starting server")
        server_env = os.environ.copy()
        server_env["MINIO_ACCESS_KEY"] = "concordbft"
        server_env["MINIO_SECRET_KEY"] = "concordbft"

        minio_server_fname = os.environ.get("CONCORD_BFT_MINIO_BINARY_PATH")
        if minio_server_fname is None:
            shutil.rmtree(self.work_dir)
            raise RuntimeError("Please set path to minio binary to CONCORD_BFT_MINIO_BINARY_PATH env variable")

        self.minio_server_proc = subprocess.Popen([minio_server_fname, "server", self.minio_server_data_dir],
                                                 env = server_env,
                                                 close_fds=True)
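A hedged companion sketch (not in the source): a class-level teardown that stops the minio process started above and removes the temporary work dir created in Example No. 25; the defensive getattr checks are assumptions.

    def tearDownClass(cls):
        # stop the minio server, if one was started, and clean up the temp data dir
        proc = getattr(cls, "minio_server_proc", None)
        if proc is not None:
            proc.terminate()
            proc.wait()
        work_dir = getattr(cls, "work_dir", None)
        if work_dir and os.path.exists(work_dir):
            shutil.rmtree(work_dir)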
Example No. 27
    async def test_fast_path_after_view_change(self, bft_network, tracker):
        """
        This test validates the BFT engine's ability to restore the fast path
        after a view change due to a crashed primary.

        First we write a batch of K/V entries and check those entries have been processed via the fast commit path.

        We stop the primary and send a single write request to trigger a view change.

        We bring the primary back up.

        We make sure the fast path is eventually maintained.

        Finally the decorator verifies the KV execution.
        """

        bft_network.start_all_replicas()
        skvbc = kvbc.SimpleKVBCProtocol(bft_network, tracker)

        # Initially all replicas are running on the fast path
        await bft_network.wait_for_fast_path_to_be_prevalent(
            run_ops=lambda: skvbc.run_concurrent_ops(num_ops=NUM_OPS,
                                                     write_weight=1),
            threshold=NUM_OPS)

        # Stop the primary
        bft_network.stop_replica(0)

        # Send a write request to trigger a view change
        with trio.move_on_after(seconds=3):
            await skvbc.send_write_kv_set()

        randRep = random.choice(bft_network.all_replicas(without={0}))

        log.log_message(f'wait_for_view - Random replica {randRep}')

        await bft_network.wait_for_view(
            replica_id=randRep,
            expected=lambda v: v > 0,
            err_msg="Make sure view change has occurred.")

        # Restore the crashed primary
        bft_network.start_replica(0)

        # Make sure that the fast path is maintained eventually
        await bft_network.wait_for_fast_path_to_be_prevalent(
            run_ops=lambda: skvbc.run_concurrent_ops(num_ops=NUM_OPS,
                                                     write_weight=1),
            threshold=NUM_OPS)
Example No. 28
 async def _check_st_not_started(self, bft_network, ro_replica_id):
     with trio.fail_after(seconds=70):
         # the ro replica should be able to survive these failures
         while True:
             with trio.move_on_after(seconds=.5):
                 try:
                     key = ['replica', 'Gauges', 'lastExecutedSeqNum']
                     lastExecutedSeqNum = await bft_network.metrics.get(ro_replica_id, *key)
                 except KeyError:
                     continue
                 else:
                     # success!
                     if lastExecutedSeqNum == 0:
                         log.log_message(message_type="Replica" + str(ro_replica_id) + " : lastExecutedSeqNum:" + str(lastExecutedSeqNum))
                         break
Example No. 29
    def fill_missing_blocks(self, missing_blocks):
        """
        Add all missing blocks to self.blocks

        Note that these blocks will not have a matching req_index since we never
        received a reply for the request that created it. In some histories it's
        not possible to identify an unambiguous request, since there may be
        multiple possible requests that could have correctly generated the
        block. Rather than trying to match the requests to the missing blocks,
        we just assume the missing blocks are correct for now, and use the full
        block history to verify successful conditional writes and reads.
        """
        for block_id, kvpairs in missing_blocks.items():
            self.blocks[block_id] = Block(kvpairs)
            if block_id > self.last_known_block:
                self.last_known_block = block_id
        self.filled_blocks = missing_blocks
        log.log_message(message_type=f'{len(missing_blocks)} missing blocks filled.')
Example No. 30
    async def _test_st_while_crashing_primary(
            self, bft_network, trigger_view_change, crash_repeatedly, tracker):
        # we need a BFT network with f >= 2, allowing us to have 2
        # crashed replicas at the same time (the primary and the stale node)

        n = bft_network.config.n

        stale_replica = n - 1
        skvbc = kvbc.SimpleKVBCProtocol(bft_network, tracker)

        client, known_key, known_val = \
            await skvbc.prime_for_state_transfer(stale_nodes={stale_replica},
                                                           checkpoints_num=2)
        view = await bft_network.wait_for_view(
            replica_id=0,
            expected=lambda v: v == 0,
            err_msg="Make sure we are in the initial view."
        )

        log.log_message(message_type=f'Initial view number is {view}, as expected.')

        if crash_repeatedly:
            await self._run_state_transfer_while_crashing_primary_repeatedly(
                skvbc=skvbc,
                bft_network=bft_network,
                n=n,
                primary=0,
                stale=stale_replica
            )
        else:
            await self._run_state_transfer_while_crashing_primary_once(
                skvbc=skvbc,
                bft_network=bft_network,
                n=n,
                primary=0,
                stale=stale_replica,
                trigger_view_change=trigger_view_change
            )

        await bft_network.force_quorum_including_replica(stale_replica)

        kvpairs = await skvbc.send_read_kv_set(client, known_key)
        self.assertDictEqual(dict([(known_key, known_val)]), kvpairs)