def test_catchup_from_unequal_nodes_without_waiting(looper,
                                                    txnPoolNodeSet,
                                                    sdk_pool_handle,
                                                    sdk_wallet_client):
    normal_node = txnPoolNodeSet[0]
    lagging_node_1 = txnPoolNodeSet[1]
    lagging_node_2 = txnPoolNodeSet[2]
    stopped_node = txnPoolNodeSet[3]

    # Make sure every node has ordered one batch
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 1)

    # Wait until all nodes have the same data and store the last 3PC key
    # of the node that is going to be "stopped"
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet, custom_timeout=30)
    last_3pc = stopped_node.master_last_ordered_3PC

    with delay_rules_without_processing(stopped_node.nodeIbStasher, delay_3pc()):
        # Create one more batch on all nodes except the "stopped" node
        sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 1)

        with delay_rules(lagging_node_1.nodeIbStasher, delay_3pc(msgs=Commit)):
            # Create one more batch on all nodes except the "stopped" node and the first lagging node
            sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 1)

            with delay_rules(lagging_node_2.nodeIbStasher, delay_3pc(msgs=Commit)):
                # Create one more batch on all nodes except the "stopped" node and both lagging nodes.
                # This time we cannot wait for replies because there will be only one
                reqs = sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client, 1)

                # Wait until the normal node orders the txn
                looper.run(eventually(lambda: assert_eq(normal_node.master_last_ordered_3PC[1],
                                                        last_3pc[1] + 3)))

                # Now all nodes have a different number of txns, so if we start a catchup
                # it is guaranteed that we'll need to ask for equal consistency proofs,
                # and the disabled timeout ensures that the node can do so without
                # relying on the timeout
                stopped_node.start_catchup()

                # Wait until catchup ends
                looper.run(eventually(lambda: assert_eq(stopped_node.ledgerManager._node_leecher._state,
                                                        NodeLeecherService.State.Idle)))

                # Ensure that the stopped node caught up at least one batch
                assert stopped_node.master_last_ordered_3PC[1] > last_3pc[1]

                # And that there was no view change
                assert stopped_node.master_last_ordered_3PC[0] == last_3pc[0]

    # Make sure the replies for the last request are eventually received
    sdk_get_and_check_replies(looper, reqs)
def test_catchup_with_all_nodes_sending_cons_proofs_dead(looper,
                                                         txnPoolNodeSet,
                                                         sdk_pool_handle,
                                                         sdk_wallet_client,
                                                         logsearch):
    lagging_node = txnPoolNodeSet[-1]
    other_nodes = txnPoolNodeSet[:-1]

    start_delaying(lagging_node.nodeIbStasher, delay_3pc())
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 10)

    log_re_ask, _ = logsearch(
        msgs=['requesting .* missing transactions after timeout'])
    old_re_ask_count = len(log_re_ask)

    catchup_reqs = {node.name: start_delaying(node.nodeIbStasher, cqDelay())
                    for node in other_nodes}
    audit_catchup_service = lagging_node.ledgerManager._node_leecher._leechers[
        AUDIT_LEDGER_ID]._catchup_rep_service
    lagging_node.start_catchup()
    looper.run(
        eventually(lambda: assert_eq(audit_catchup_service._is_working, True)))

    # Make sure cons proofs were gathered from all other nodes
    assert len(audit_catchup_service._nodes_ledger_sizes) == 3

    # Allow catchup requests to be processed only by the nodes that didn't provide cons proofs
    for node_id, node_reqs in catchup_reqs.items():
        if node_id not in audit_catchup_service._nodes_ledger_sizes:
            stop_delaying_and_process(node_reqs)

    # Check that catchup finishes successfully and that there were re-asks
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    assert len(log_re_ask) - old_re_ask_count > 0
def test_catchup_uses_only_nodes_with_cons_proofs(looper,
                                                  txnPoolNodeSet,
                                                  sdk_pool_handle,
                                                  sdk_wallet_client):
    lagging_node = txnPoolNodeSet[-1]
    other_nodes = txnPoolNodeSet[:-1]

    start_delaying(lagging_node.nodeIbStasher, delay_3pc())
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 10)

    catchup_reqs = {node.name: start_delaying(node.nodeIbStasher, cqDelay())
                    for node in other_nodes}
    audit_catchup_service = lagging_node.ledgerManager._node_leecher._leechers[
        AUDIT_LEDGER_ID]._catchup_rep_service
    lagging_node.start_catchup()
    looper.run(eventually(lambda: assert_eq(audit_catchup_service._is_working, True)))

    # Make sure cons proofs were gathered from all other nodes
    assert len(audit_catchup_service._nodes_ledger_sizes) == 3

    # Allow catchup requests only for the nodes that provided cons proofs
    for node_id in audit_catchup_service._nodes_ledger_sizes.keys():
        stop_delaying_and_process(catchup_reqs[node_id])

    # Check that catchup finishes successfully
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet, custom_timeout=30)
def test_re_order_pre_prepares(looper, txnPoolNodeSet,
                               sdk_wallet_client, sdk_pool_handle):
    # 0. use the new 3PC validator
    for n in txnPoolNodeSet:
        ordering_service = n.master_replica._ordering_service
        ordering_service._validator = OrderingServiceMsgValidator(
            ordering_service._data)

    # 1. drop Prepares and Commits on the 4th node
    # and order a request on Nodes 1-3
    lagging_node = txnPoolNodeSet[-1]
    other_nodes = txnPoolNodeSet[:-1]
    with delay_rules_without_processing(lagging_node.nodeIbStasher, cDelay(), pDelay()):
        sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 1)
        assert all(n.master_last_ordered_3PC == (0, 1) for n in other_nodes)

    # 2. simulate view change start so that
    # all PrePrepares/Prepares/Commits are cleared
    # and uncommitted txns are reverted
    for n in txnPoolNodeSet:
        n.internal_bus.send(ViewChangeStarted(view_no=1))

        master_ordering_service = n.master_replica._ordering_service
        assert not master_ordering_service.prePrepares
        assert not master_ordering_service.prepares
        assert not master_ordering_service.commits
        assert master_ordering_service.old_view_preprepares

        ledger = n.db_manager.ledgers[DOMAIN_LEDGER_ID]
        state = n.db_manager.states[DOMAIN_LEDGER_ID]
        assert len(ledger.uncommittedTxns) == 0
        assert ledger.uncommitted_root_hash == ledger.tree.root_hash
        assert state.committedHead == state.head

    # 3. simulate view change finish to re-order the same PrePrepare
    assert lagging_node.master_last_ordered_3PC == (0, 0)
    new_master = txnPoolNodeSet[1]
    batches = [preprepare_to_batch_id(pp)
               for _, pp in new_master.master_replica._ordering_service.old_view_preprepares.items()]
    new_view_msg = NewViewCheckpointsApplied(view_no=0,
                                             view_changes=[],
                                             checkpoint=None,
                                             batches=batches)
    for n in txnPoolNodeSet:
        n.master_replica._ordering_service._bus.send(new_view_msg)

    # 4. make sure that Nodes 1-3 (which already ordered the request) send Prepares
    # and Commits so that the request is eventually ordered on Node4 as well
    looper.run(
        eventually(lambda: assert_eq(lagging_node.master_last_ordered_3PC, (0, 1))))
def test_catchup_during_3pc(tconf, looper, txnPoolNodeSet,
                            sdk_wallet_client, sdk_pool_handle):
    '''
    1) Send 1 3PC batch + 2 reqs
    2) Delay Commits on one node
    3) Make sure the batch is ordered on all nodes except the lagging one
    4) Start catchup on the lagging node
    5) Make sure that all nodes are equal
    6) Send more requests so that there are 3 batches in total
    7) Make sure that all nodes are equal
    '''
    lagging_node = txnPoolNodeSet[-1]
    rest_nodes = txnPoolNodeSet[:-1]

    with delay_rules(lagging_node.nodeIbStasher, cDelay()):
        sdk_reqs = sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client,
                                            tconf.Max3PCBatchSize + 2)

        looper.run(
            eventually(check_last_ordered_3pc_on_master, rest_nodes, (0, 1)))

        lagging_node.start_catchup()

        looper.run(
            eventually(
                lambda: assert_eq(lagging_node.mode, Mode.participating),
                retryWait=1,
                timeout=waits.expectedPoolCatchupTime(len(txnPoolNodeSet))))

        waitNodeDataEquality(looper, *txnPoolNodeSet, customTimeout=5)

    sdk_get_replies(looper, sdk_reqs)

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client,
                              2 * tconf.Max3PCBatchSize - 2)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
def test_backup_replica_resumes_ordering_on_lag_in_checkpoints(
        looper, chkFreqPatched, reqs_for_checkpoint,
        one_replica_and_others_in_backup_instance,
        sdk_pool_handle, sdk_wallet_client, view_change_done, txnPoolNodeSet):
    """
    Verifies resumption of ordering 3PC-batches on a backup replica
    on detection of a lag in checkpoints
    """
    slow_replica, other_replicas = one_replica_and_others_in_backup_instance
    view_no = slow_replica.viewNo

    # Send a request and ensure that the replica orders the batch for it
    sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client, 1)

    looper.run(
        eventually(lambda: assert_eq(slow_replica.last_ordered_3pc, (view_no, 2)),
                   retryWait=1,
                   timeout=waits.expectedTransactionExecutionTime(nodeCount)))

    # Don't receive Commits from two replicas
    slow_replica.node.nodeIbStasher.delay(
        cDelay(instId=1, sender_filter=other_replicas[0].node.name))
    slow_replica.node.nodeIbStasher.delay(
        cDelay(instId=1, sender_filter=other_replicas[1].node.name))

    # Send a request for which the replica will not be able to order the batch
    # due to an insufficient count of Commits
    sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client, 1)
    looper.runFor(waits.expectedTransactionExecutionTime(nodeCount))

    # Recover reception of Commits
    slow_replica.node.nodeIbStasher.drop_delayeds()
    slow_replica.node.nodeIbStasher.resetDelays()

    # Send requests, but in a quantity insufficient to reach
    # the catch-up number of checkpoints
    sdk_send_random_requests(
        looper, sdk_pool_handle, sdk_wallet_client,
        Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP * reqs_for_checkpoint - 3)
    looper.runFor(waits.expectedTransactionExecutionTime(nodeCount))

    # Ensure that the replica has not ordered any batches
    # after the very first one
    assert slow_replica.last_ordered_3pc == (view_no, 2)

    # Ensure that the watermarks have not been shifted since the view start
    assert slow_replica.h == 0
    assert slow_replica.H == LOG_SIZE

    # Ensure that the collections related to requests, batches and
    # own checkpoints are not empty.
    # (Note that a primary replica removes requests from requestQueues
    # when creating a batch with them.)
    if slow_replica.isPrimary:
        assert slow_replica._ordering_service.sentPrePrepares
    else:
        assert slow_replica._ordering_service.requestQueues[DOMAIN_LEDGER_ID]
        assert slow_replica._ordering_service.prePrepares
    assert slow_replica._ordering_service.prepares
    assert slow_replica._ordering_service.commits
    assert slow_replica._ordering_service.batches
    assert slow_replica._checkpointer._checkpoint_state

    # Ensure that there are some quorumed stashed checkpoints
    assert slow_replica._checkpointer._stashed_checkpoints_with_quorum()

    # Send more requests to reach the catch-up number of checkpoints
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client,
                              reqs_for_checkpoint)

    # Ensure that the replica has adjusted last_ordered_3pc to the end
    # of the last checkpoint
    looper.run(
        eventually(lambda *args: assertExp(
                       slow_replica.last_ordered_3pc ==
                       (view_no, (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) * CHK_FREQ)),
                   slow_replica,
                   retryWait=1,
                   timeout=waits.expectedTransactionExecutionTime(nodeCount)))

    # Ensure that the watermarks have been shifted so that the lower watermark
    # has the same value as last_ordered_3pc
    assert slow_replica.h == (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) * CHK_FREQ
    assert slow_replica.H == (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) * CHK_FREQ + LOG_SIZE

    # Ensure that the collections related to requests, batches and
    # own checkpoints have been cleared
    assert not slow_replica._ordering_service.requestQueues[DOMAIN_LEDGER_ID]
    assert not slow_replica._ordering_service.sentPrePrepares
    assert not slow_replica._ordering_service.prePrepares
    assert not slow_replica._ordering_service.prepares
    assert not slow_replica._ordering_service.commits
    assert not slow_replica._ordering_service.batches
    assert not slow_replica._checkpointer._checkpoint_state

    # Ensure that now there are no quorumed stashed checkpoints
    assert not slow_replica._checkpointer._stashed_checkpoints_with_quorum()

    # Send a request and ensure that the replica orders the batch for it
    sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client, 1)

    looper.run(
        eventually(lambda *args: assertExp(
                       slow_replica.last_ordered_3pc ==
                       (view_no, (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) * CHK_FREQ + 1)),
                   slow_replica,
                   retryWait=1,
                   timeout=waits.expectedTransactionExecutionTime(nodeCount)))