async def test_pruning_command(self, bft_network):
        """
        Write 100 blocks, ask every replica for its latest pruneable block,
        issue a prune request built from those answers and check the pruned
        block id reported in each replica specific reply.
        """
        with log.start_action(action_type="test_pruning_command"):
            bft_network.start_all_replicas()
            skvbc = kvbc.SimpleKVBCProtocol(bft_network)
            client = bft_network.random_client()

            # One known write followed by 99 overwrites of the same key:
            # 100 blocks in total, 101 when counting the genesis block.
            key, _ = await skvbc.write_known_kv()
            for _ in range(99):
                await client.write(
                    skvbc.write_req([], [(key, skvbc.random_value())], 0))

            # Collect the latest pruneable block reported by each replica
            op = operator.Operator(bft_network.config, client,
                                   bft_network.builddir)
            await op.latest_pruneable_block()
            latest_pruneable_blocks = [
                cmf_msgs.ReconfigurationResponse.deserialize(reply)[0].response
                for reply in client.get_rsi_replies().values()
            ]

            await op.prune(latest_pruneable_blocks)

            # At least 2f + 1 replies are expected; the number of replies
            # itself is not checked here, only the content of each reply.
            for reply in client.get_rsi_replies().values():
                data = cmf_msgs.ReconfigurationResponse.deserialize(reply)[0]
                assert int(data.additional_data.decode('utf-8')) <= 90
    async def test_key_exchange_command(self, bft_network):
        """
            No initial key rotation.
            Sends a key exchange command to replica 0 and then runs 450 known writes.
            New keys for replica 0 should get effective at checkpoint 2, i.e. seqnum 300.
            Finally, the KeyExchangeManager counters of replicas 0 and 1 are each expected to be 1.
        """
        bft_network.start_all_replicas()
        client = bft_network.random_client()
        skvbc = kvbc.SimpleKVBCProtocol(bft_network)

        op = operator.Operator(bft_network.config, client,
                               bft_network.builddir)
        await op.key_exchange([0])
        for _ in range(450):
            await skvbc.write_known_kv()

        # (replica id, counter name) pairs, checked in this exact order
        expected_counters = (
            (0, "sent_key_exchange"),
            (0, "self_key_exchange"),
            (1, "public_key_exchange_for_peer"),
        )
        for replica_id, counter_name in expected_counters:
            counter_value = await bft_network.metrics.get(
                replica_id, "KeyExchangeManager", "Counters", counter_name)
            assert counter_value == 1
    async def test_wedge_command_and_specific_replica_info(self, bft_network):
        """
             Sends a wedge command and checks that the system stops processing new requests.
             Note that in this test we assume no failures and a synchronized network.
             The test does the following:
             1. A client sends a wedge command
             2. The client then keeps sending a "Have you stopped" read only command until every
                replica specific reply answers "I have stopped" (bounded by 90 seconds)
             3. The client validates that all replicas have stopped
                (via validate_stop_on_super_stable_checkpoint)
         """
        bft_network.start_all_replicas()
        skvbc = kvbc.SimpleKVBCProtocol(bft_network)
        client = bft_network.random_client()
        # We increase the default request timeout because we need to have around 300 consensuses
        # which occasionally may take more than 5 seconds.
        # _replace() returns a modified copy (namedtuple-style API, presumably), so the result
        # has to be stored back on the client; calling it without assignment changes nothing.
        client.config = client.config._replace(req_timeout_milli=10000)

        op = operator.Operator(bft_network.config, client,
                               bft_network.builddir)
        await op.wedge()

        with trio.fail_after(seconds=90):
            done = False
            while not done:
                await op.wedge_status()
                # Done only when no reply reports `stopped` as False
                done = all(
                    cmf_msgs.ReconfigurationResponse.deserialize(r)[0].response.stopped
                    is not False
                    for r in client.get_rsi_replies().values())

        await self.validate_stop_on_super_stable_checkpoint(bft_network, skvbc)
 async def test_key_exchange_command(self, bft_network):
     """
     Start all replicas and send a key exchange command through the operator
     (no target replicas are passed to key_exchange).
     """
     bft_network.start_all_replicas()
     client = bft_network.random_client()
     skvbc = kvbc.SimpleKVBCProtocol(bft_network)
     # We increase the default request timeout because we need to have around 300 consensuses
     # which occasionally may take more than 5 seconds.
     # _replace() returns a modified copy (namedtuple-style API, presumably), so the result
     # has to be stored back on the client; calling it without assignment changes nothing.
     client.config = client.config._replace(req_timeout_milli=10000)
     op = operator.Operator(bft_network.config, client, bft_network.builddir)
     await op.key_exchange()
示例#5
0
    async def test_state_transfer_rvt_validity_after_pruning(
            self, bft_network):
        """
        Validate that the Range validation trees (RVTs) of all replicas stay
        synchronized when consensus is followed by state transfer and pruning.

        1) Start all replicas of the BFT network except one randomly chosen stale replica
        2) Send enough requests to trigger 10 checkpoints
        3) Start the stale replica
        4) State transfer brings the stale replica up-to-date
        5) Wait for state transfer to finish
        6) Wait for the RVT root values to be in sync
        7) Prune
        8) Wait for two more checkpoints so that the RVT reflects the changes made by the pruning
        9) Wait for the RVT root values to be in sync again
        """

        skvbc = kvbc.SimpleKVBCProtocol(bft_network)

        # Replica 0 is never picked as the stale one
        candidates = bft_network.all_replicas(without={0})
        stale_node = random.choice(candidates)

        await skvbc.prime_for_state_transfer(
            stale_nodes={stale_node},
            checkpoints_num=10,  # key-exchange changes the last executed seqnum
            persistency_enabled=False)

        bft_network.start_replica(stale_node)
        await bft_network.wait_for_state_transfer_to_start()
        await bft_network.wait_for_state_transfer_to_stop(0, stale_node)

        # RVT roots must agree before we prune
        await bft_network.wait_for_replicas_rvt_root_values_to_be_in_sync(
            bft_network.all_replicas())

        # Ask every replica for its latest pruneable block
        client = bft_network.random_client()
        op = operator.Operator(bft_network.config, client,
                               bft_network.builddir)
        await op.latest_pruneable_block()
        latest_pruneable_blocks = [
            cmf_msgs.ReconfigurationResponse.deserialize(reply)[0].response
            for reply in client.get_rsi_replies().values()
        ]

        await op.prune(latest_pruneable_blocks)

        # Two more checkpoints, so that the RVT is updated to reflect the pruning
        await skvbc.fill_and_wait_for_checkpoint(
            bft_network.all_replicas(),
            num_of_checkpoints_to_add=2,
            verify_checkpoint_persistency=False,
            assert_state_transfer_not_started=False)

        # RVT roots must agree again once the pruning has finished
        await bft_network.wait_for_replicas_rvt_root_values_to_be_in_sync(
            bft_network.all_replicas())
    async def test_wedge_command_where_noops_should_be_sent_in_two_parts(
            self, bft_network):
        """
            Sends a wedge command on sequence number 300 and checks that the system stops processing new requests.
            This way, when the primary tries to send noop commands, the working window reaches only up to 450.
            Thus, it has to wait for a new stable checkpoint before sending the last 150 noops.
            Note: In this test we assume that the batch duration is no... (the original note is truncated here)
         """
        bft_network.start_all_replicas()
        skvbc = kvbc.SimpleKVBCProtocol(bft_network)
        client = bft_network.random_client()
        # We increase the default request timeout because we need to have around 300 consensuses
        # which occasionally may take more than 5 seconds.
        # _replace() returns a modified copy (namedtuple-style API, presumably), so the result
        # has to be stored back on the client; calling it without assignment changes nothing.
        client.config = client.config._replace(req_timeout_milli=10000)

        # bring the system to sequence number 299
        for _ in range(299):
            await skvbc.write_known_kv()

        # verify that all nodes are in sequence number 299 (poll for at most 30 seconds)
        not_reached = True
        with trio.fail_after(seconds=30):
            while not_reached:
                not_reached = False
                for r in bft_network.all_replicas():
                    last_exec_seq_num = await bft_network.get_metric(
                        r, bft_network, "Gauges", "lastExecutedSeqNum")
                    if last_exec_seq_num != 299:
                        # At least one replica is not there yet: poll all replicas again
                        not_reached = True
                        break

        # now, send a wedge command. The wedge command sequence number is 300. Hence, at this point the working window
        # is between 150 - 450. But, the wedge command will make the primary send noops until 600.
        # we want to verify that the primary manages to send the noops as required.
        op = operator.Operator(bft_network.config, client,
                               bft_network.builddir)
        await op.wedge()

        # now, verify that the system has managed to stop (poll for at most 90 seconds)
        with trio.fail_after(seconds=90):
            done = False
            while not done:
                await op.wedge_status()
                # Done only when no reply reports `stopped` as False
                done = all(
                    cmf_msgs.ReconfigurationResponse.deserialize(r)[0].response.stopped
                    is not False
                    for r in client.get_rsi_replies().values())

        await self.verify_replicas_are_in_wedged_checkpoint(
            bft_network, 2, range(bft_network.config.n))
        await self.verify_last_executed_seq_num(bft_network, 2)
        await self.validate_stop_on_super_stable_checkpoint(bft_network, skvbc)
    async def test_wedge_command_with_state_transfer(self, bft_network):
        """
            This test checks that even a replica that received the super stable checkpoint via the state transfer mechanism
            is able to stop at the super stable checkpoint.
            The test does the following:
            1. Start all replicas but 1
            2. A client sends a wedge command
            3. Validate that all started replicas reached the wedged checkpoint
            4. Start the late replica
            5. Validate that the late replica completed the state transfer
            6. Validate that all replicas stopped at the super stable checkpoint and that new commands are not being processed
        """
        initial_prim = 0
        # The late replica is chosen at random, never the initial primary
        late_replicas = bft_network.random_set_of_replicas(1, {initial_prim})
        on_time_replicas = bft_network.all_replicas(without=late_replicas)
        bft_network.start_replicas(on_time_replicas)

        skvbc = kvbc.SimpleKVBCProtocol(bft_network)
        await skvbc.wait_for_liveness()

        checkpoint_before = await bft_network.wait_for_checkpoint(replica_id=0)

        client = bft_network.random_client()
        # We increase the default request timeout because we need to have around 300 consensuses
        # which occasionally may take more than 5 seconds.
        # _replace() returns a modified copy (namedtuple-style API, presumably), so the result
        # has to be stored back on the client; calling it without assignment changes nothing.
        client.config = client.config._replace(req_timeout_milli=10000)
        with log.start_action(action_type="send_wedge_cmd",
                              checkpoint_before=checkpoint_before,
                              late_replicas=list(late_replicas)):
            op = operator.Operator(bft_network.config, client,
                                   bft_network.builddir)
            await op.wedge()

        await self.verify_replicas_are_in_wedged_checkpoint(
            bft_network, checkpoint_before, on_time_replicas)

        bft_network.start_replicas(late_replicas)

        await bft_network.wait_for_state_transfer_to_start()
        for r in late_replicas:
            await bft_network.wait_for_state_transfer_to_stop(
                initial_prim, r, stop_on_stable_seq_num=False)
        # Now every replica, including the late one, is expected to be at the wedged checkpoint
        await self.verify_replicas_are_in_wedged_checkpoint(
            bft_network, checkpoint_before, range(bft_network.config.n))

        await self.validate_stop_on_super_stable_checkpoint(bft_network, skvbc)
    async def test_pruning_status_command(self, bft_network):
        """
        Check the prune status reported by the replicas before and after a prune request.

        Before pruning every reply must report no pruning in progress and
        last_pruned_block == 0; after pruning (and one more successful write)
        every reply must report no pruning in progress and last_pruned_block <= 90.
        """
        bft_network.start_all_replicas()
        skvbc = kvbc.SimpleKVBCProtocol(bft_network)
        client = bft_network.random_client()

        op = operator.Operator(bft_network.config, client,
                               bft_network.builddir)

        # Status before anything has been pruned
        await op.prune_status()
        for reply in client.get_rsi_replies().values():
            initial = cmf_msgs.ReconfigurationResponse.deserialize(reply)[0]
            assert initial.response.in_progress is False
            assert initial.response.last_pruned_block == 0

        # One known write followed by 99 overwrites of the same key:
        # 100 blocks in total, 101 when counting the genesis block.
        key, _ = await skvbc.write_known_kv()
        for _ in range(99):
            await client.write(
                skvbc.write_req([], [(key, skvbc.random_value())], 0))

        # Collect the latest pruneable block reported by each replica
        await op.latest_pruneable_block()
        latest_pruneable_blocks = [
            cmf_msgs.ReconfigurationResponse.deserialize(reply)[0].response
            for reply in client.get_rsi_replies().values()
        ]

        await op.prune(latest_pruneable_blocks)

        # A new write going through within 30 seconds means the pruning is done
        with trio.fail_after(30):
            await skvbc.write_known_kv()

        # Status after the pruning
        await op.prune_status()
        for reply in client.get_rsi_replies().values():
            final = cmf_msgs.ReconfigurationResponse.deserialize(reply)[0]
            assert final.response.in_progress is False
            assert final.response.last_pruned_block <= 90
 async def test_wedge_command(self, bft_network):
     """
          Sends a wedge command and checks that the system stops processing new requests.
          Note that in this test we assume no failures and synchronized network.
          The test does the following:
          1. A client sends a wedge command
          2. The client verifies that the system reached a super stable checkpoint.
          3. The client tries to initiate a new write bft command and fails
      """
     bft_network.start_all_replicas()
     skvbc = kvbc.SimpleKVBCProtocol(bft_network)
     client = bft_network.random_client()
     # We increase the default request timeout because we need to have around 300 consensuses
     # which occasionally may take more than 5 seconds.
     # _replace() returns a modified copy (namedtuple-style API, presumably), so the result
     # has to be stored back on the client; calling it without assignment changes nothing.
     client.config = client.config._replace(req_timeout_milli=10000)
     checkpoint_before = await bft_network.wait_for_checkpoint(replica_id=0)
     op = operator.Operator(bft_network.config, client, bft_network.builddir)
     await op.wedge()
     await self.verify_replicas_are_in_wedged_checkpoint(
         bft_network, checkpoint_before, range(bft_network.config.n))
     await self.verify_last_executed_seq_num(bft_network, checkpoint_before)
     await self.validate_stop_on_super_stable_checkpoint(bft_network, skvbc)
    async def test_pruning_with_ro_replica_failure(self, bft_network):
        """
        Check that a prune request is rejected when the latest pruneable block
        of the read only replica is missing from the request.

        The read only replica uses id `bft_network.config.n`. After it has caught
        up, the latest pruneable blocks of all replicas are collected, the entries
        whose replica id is >= n are dropped, and the prune request built from the
        remaining n entries is expected to come back with success == False.
        """
        bft_network.start_all_replicas()
        ro_replica_id = bft_network.config.n
        bft_network.start_replica(ro_replica_id)

        skvbc = kvbc.SimpleKVBCProtocol(bft_network)
        client = bft_network.random_client()

        op = operator.Operator(bft_network.config, client,
                               bft_network.builddir)

        # Create more than 150 blocks: one known write followed by 200 overwrites of the same key
        k, v = await skvbc.write_known_kv()
        for _ in range(200):
            v = skvbc.random_value()
            await client.write(skvbc.write_req([], [(k, v)], 0))

        # Wait for the read only replica to catch up with the state
        await self._wait_for_st(bft_network, ro_replica_id, 150)

        # Get the latest pruneable block of every replica
        await op.latest_pruneable_block()

        rsi_rep = client.get_rsi_replies()
        all_pruneable_blocks = [
            cmf_msgs.ReconfigurationResponse.deserialize(r)[0].response
            for r in rsi_rep.values()
        ]

        # Drop the read only replica's latest pruneable block. A new list is built
        # instead of calling remove() while iterating, which could skip elements.
        latest_pruneable_blocks = [
            block for block in all_pruneable_blocks
            if block.replica < bft_network.config.n
        ]

        assert len(latest_pruneable_blocks) == bft_network.config.n

        # Now, issue a prune request. We expect to receive an error as the read only
        # replica's latest pruneable block is missing
        rep = await op.prune(latest_pruneable_blocks)
        rep = cmf_msgs.ReconfigurationResponse.deserialize(rep)[0]
        assert rep.success is False
    async def test_get_latest_pruneable_block(self, bft_network):
        """
        Check that the minimal latest pruneable block among the replicas grows
        after another batch of 100 blocks has been written.
        """
        bft_network.start_all_replicas()
        skvbc = kvbc.SimpleKVBCProtocol(bft_network)
        client = bft_network.random_client()

        def lowest_reported_block_id():
            # Smallest block id among the current replica specific replies,
            # capped at 1000 (the starting value of the scan).
            reported = [
                cmf_msgs.ReconfigurationResponse.deserialize(reply)[0].response.block_id
                for reply in client.get_rsi_replies().values()
            ]
            return min([1000] + reported)

        # One known write followed by 99 overwrites of the same key:
        # 100 blocks in total, 101 when counting the genesis block.
        key, _ = await skvbc.write_known_kv()
        for _ in range(99):
            await client.write(
                skvbc.write_req([], [(key, skvbc.random_value())], 0))

        # Minimal latest pruneable block after the first batch
        op = operator.Operator(bft_network.config, client,
                               bft_network.builddir)
        await op.latest_pruneable_block()
        min_before = lowest_reported_block_id()

        # Another 100 blocks
        key, _ = await skvbc.write_known_kv()
        for _ in range(99):
            await client.write(
                skvbc.write_req([], [(key, skvbc.random_value())], 0))

        # Minimal latest pruneable block after the second batch
        await op.latest_pruneable_block()
        min_after = lowest_reported_block_id()

        assert min_before < min_after
示例#12
0
    async def test_state_transfer_rvt_root_validation_after_adding_blocks(
            self, bft_network):
        """
        Validate that the Range validation trees (RVTs) of all replicas end up
        synchronized after several consensus rounds mixed with random restarts
        and prunings.

        1) Start all replicas of the BFT network
        2) Repeat 6 times:
            3) Send enough requests to trigger 2 checkpoints
            4) After the 1st iteration, prune on each even iteration (i.e. i % 2 == 0)
               and wait until all n replicas report the pruning as finished
            5) Randomly decide whether to restart a replica; if so, restart a random
               replica other than replica 0
        6) Wait for the RVT root values to be in sync
        """

        for replica_id in bft_network.all_replicas():
            bft_network.start_replica(replica_id)

        skvbc = kvbc.SimpleKVBCProtocol(bft_network)

        client = bft_network.random_client()
        op = operator.Operator(bft_network.config, client,
                               bft_network.builddir)

        for iteration in range(6):
            print(f'Iteration {iteration}')
            await skvbc.fill_and_wait_for_checkpoint(
                bft_network.all_replicas(),
                num_of_checkpoints_to_add=2,
                verify_checkpoint_persistency=False,
                assert_state_transfer_not_started=False)

            if iteration > 0 and iteration % 2 == 0:
                # Collect the latest pruneable block reported by each replica
                await op.latest_pruneable_block()
                latest_pruneable_blocks = [
                    cmf_msgs.ReconfigurationResponse.deserialize(reply)[0].response
                    for reply in client.get_rsi_replies().values()
                ]
                print('Pruning...')
                await op.prune(latest_pruneable_blocks)

                # Poll the prune status (for at most 30 seconds) until all n
                # replicas report a finished pruning with a pruned block id > 0
                with trio.fail_after(seconds=30):
                    pruning_finished = False
                    while not pruning_finished:
                        await op.prune_status()
                        finished_replies = 0
                        for reply in client.get_rsi_replies().values():
                            status = cmf_msgs.ReconfigurationResponse.deserialize(
                                reply)[0]
                            last_prune_blockid = status.response.last_pruned_block
                            log.log_message(
                                message_type=
                                f"last_prune_blockid {last_prune_blockid}, status.response.sender {status.response.sender}"
                            )
                            if (status.response.in_progress is False
                                    and last_prune_blockid > 0):
                                finished_replies += 1
                        pruning_finished = (
                            finished_replies == bft_network.config.n)
                print('Done pruning.')

            # Coin flip: restart a replica in this iteration or not
            if random.choice([0, 1]) == 1:
                print(
                    'Selecting a random replica to be restarted (the primary is excluded)...'
                )
                restart_candidates = bft_network.all_replicas(without={0})
                replica_to_restart = random.choice(restart_candidates)
                print(f'Replica {replica_to_restart} will be restarted.')
                bft_network.stop_replica(replica_to_restart, True)
                bft_network.start_replica(replica_to_restart)
                await trio.sleep(seconds=1)

        # RVT roots must agree once all of the prunings and restarts are over
        await bft_network.wait_for_replicas_rvt_root_values_to_be_in_sync(
            bft_network.all_replicas())
    async def test_wedge_command_with_f_failures(self, bft_network):
        """
            This test checks that the system can be wedged while 2 replicas are down, and that it is able
            to make progress afterwards, both before and after the late replicas join.
            The test does the following:
            1. Start all replicas but 2
            2. A client sends a wedge command
            3. Validate that all started replicas have reached the wedge point
            4. Restart the live replicas and validate the system is able to make progress
            5. Start the late replicas
            6. Validate that the late replicas completed the state transfer
            7. Make sure the system is still able to make progress
        """
        initial_prim = 0
        # The late replicas are chosen at random, never the initial primary
        late_replicas = bft_network.random_set_of_replicas(2, {initial_prim})
        on_time_replicas = bft_network.all_replicas(without=late_replicas)
        bft_network.start_replicas(on_time_replicas)

        skvbc = kvbc.SimpleKVBCProtocol(bft_network)
        await skvbc.wait_for_liveness()

        checkpoint_before = await bft_network.wait_for_checkpoint(replica_id=0)

        client = bft_network.random_client()
        # We increase the default request timeout because we need to have around 300 consensuses
        # which occasionally may take more than 5 seconds.
        # _replace() returns a modified copy (namedtuple-style API, presumably), so the result
        # has to be stored back on the client; calling it without assignment changes nothing.
        client.config = client.config._replace(req_timeout_milli=10000)
        with log.start_action(action_type="send_wedge_cmd",
                              checkpoint_before=checkpoint_before,
                              late_replicas=list(late_replicas)):
            op = operator.Operator(bft_network.config, client,
                                   bft_network.builddir)
            await op.wedge()

        # Poll (for at most 60 seconds) until no on-time replica reports `stopped` as False.
        # The status request uses a quorum made of all the on-time replicas only.
        with trio.fail_after(seconds=60):
            done = False
            while not done:
                await op.wedge_status(quorum=bft_client.MofNQuorum(
                    on_time_replicas, len(on_time_replicas)),
                                      fullWedge=False)
                done = all(
                    cmf_msgs.ReconfigurationResponse.deserialize(r)[0].response.stopped
                    is not False
                    for r in client.get_rsi_replies().values())

        # Make sure the system is able to make progress
        bft_network.stop_replicas(on_time_replicas)
        bft_network.start_replicas(on_time_replicas)
        for _ in range(100):
            await skvbc.write_known_kv()

        # Start late replicas and wait for state transfer to stop
        bft_network.start_replicas(late_replicas)

        await bft_network.wait_for_state_transfer_to_start()
        for r in late_replicas:
            await bft_network.wait_for_state_transfer_to_stop(
                initial_prim, r, stop_on_stable_seq_num=True)

        # NOTE(review): replicas_to_stop is not used anywhere in the visible part of this test
        replicas_to_stop = bft_network.random_set_of_replicas(
            2, late_replicas | {initial_prim})

        # Make sure the system is able to make progress
        for _ in range(100):
            await skvbc.write_known_kv()