def test_handle_delayed_preprepares(looper, txnPoolNodeSet, wallet1,
                                    client1Connected, teardown):
    """
    Make a node send PREPREPARE again after the slow node has ordered
    """
    slow_node, other_nodes, primary_node, other_non_primary_nodes = \
        split_nodes(txnPoolNodeSet)

    # Pick a non-primary node that will be made to re-send the PRE-PREPARE
    confused_node = other_non_primary_nodes[0]

    # Wrap the PREPREPARE handler so the last served PRE-PREPARE is captured
    orig_method = confused_node.handlers[PREPREPARE].serve
    last_pp = None

    def patched_method(self, msg):
        nonlocal last_pp
        last_pp = orig_method(msg)
        return last_pp

    confused_node.handlers[PREPREPARE].serve = types.MethodType(
        patched_method, confused_node.handlers[PREPREPARE])

    # Delay PRE-PREPAREs by a large amount, simulating their loss
    slow_node.nodeIbStasher.delay(ppDelay(300, 0))

    send_reqs_batches_and_get_suff_replies(looper, wallet1, client1Connected,
                                           10, 5)
    waitNodeDataEquality(looper, slow_node, *other_nodes)

    slow_master_replica = slow_node.master_replica
    count_pr_req = get_count(
        slow_master_replica,
        slow_master_replica.process_requested_pre_prepare)
    count_pr_tpc = get_count(
        slow_master_replica,
        slow_master_replica.processThreePhaseMsg)

    # Re-send the captured PRE-PREPARE to the slow node only
    confused_node.sendToNodes(
        MessageRep(**{
            f.MSG_TYPE.nm: PREPREPARE,
            f.PARAMS.nm: {
                f.INST_ID.nm: last_pp.instId,
                f.VIEW_NO.nm: last_pp.viewNo,
                f.PP_SEQ_NO.nm: last_pp.ppSeqNo
            },
            f.MSG.nm: last_pp
        }),
        names=[slow_node.name, ])

    def chk():
        # `process_requested_pre_prepare` is called but
        # `processThreePhaseMsg` is not called
        assert get_count(
            slow_master_replica,
            slow_master_replica.process_requested_pre_prepare) > count_pr_req
        assert get_count(
            slow_master_replica,
            slow_master_replica.processThreePhaseMsg) == count_pr_tpc

    looper.run(eventually(chk, retryWait=1))
def test_node_catchup_after_checkpoints(looper, txnPoolNodeSet,
                                        chk_freq_patched, wallet1, client1,
                                        client1Connected,
                                        slow_node_and_others):
    """
    For some reason a node misses 3pc messages but eventually the node stashes
    some amount checkpoints and decides to catchup.
    """
    slow_node, other_nodes = slow_node_and_others
    completed_catchups_before = get_number_of_completed_catchups(slow_node)

    logger.info("Step 1: Send less than required for start the catchup procedure"
                "and check the slow node falls behind")
    # One batch short of what would trigger catchup
    batches_num = 2 * chk_freq_patched - 1
    send_reqs_batches_and_get_suff_replies(looper, wallet1, client1,
                                           num_reqs=batches_num,
                                           num_batches=batches_num,
                                           )
    waitNodeDataInequality(looper, slow_node, *other_nodes)

    logger.info("Step 2: Send remaining requests in order to trigger the catchup"
                "procedure for the slow node, then check data equality")
    send_reqs_batches_and_get_suff_replies(looper, wallet1, client1,
                                           num_reqs=1
                                           )
    waitNodeDataEquality(looper, slow_node, *other_nodes)

    # check if there was a catchup
    completed_catchups_after = get_number_of_completed_catchups(slow_node)
    assert completed_catchups_after == completed_catchups_before + 1
def test_state_regenerated_from_ledger(looper, txnPoolNodeSet, client1,
                                       wallet1, client1Connected, tconf,
                                       tdirWithPoolTxns, allPluginsPath):
    """
    Node loses its state database but recreates it from ledger after start
    """
    sent_batches = 10
    send_reqs_batches_and_get_suff_replies(looper, wallet1, client1,
                                           5 * sent_batches, sent_batches)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)

    node_to_stop = txnPoolNodeSet[-1]
    node_state = node_to_stop.states[DOMAIN_LEDGER_ID]
    # The node must have accumulated some state before we destroy it
    assert not node_state.isEmpty
    state_db_path = node_state._kv.db_path
    nodeHa, nodeCHa = HA(*node_to_stop.nodestack.ha), \
        HA(*node_to_stop.clientstack.ha)

    node_to_stop.stop()
    looper.removeProdable(node_to_stop)

    # Wipe the state database on disk; the restarted node must rebuild it
    # from the ledger
    shutil.rmtree(state_db_path)

    restarted_node = TestNode(node_to_stop.name,
                              basedirpath=tdirWithPoolTxns,
                              config=tconf,
                              ha=nodeHa,
                              cliha=nodeCHa,
                              pluginPaths=allPluginsPath)
    looper.add(restarted_node)
    txnPoolNodeSet[-1] = restarted_node

    looper.run(checkNodesConnected(txnPoolNodeSet))
    waitNodeDataEquality(looper, restarted_node, *txnPoolNodeSet[:-1])
def test_no_preprepare_requested(looper, txnPoolNodeSet, client1,
                                 wallet1, client1Connected, teardown):
    """
    Node missing Propagates hence request not finalised, hence stashes
    PRE-PREPARE but does not request PRE-PREPARE on receiving PREPARE
    """
    slow_node, other_nodes, _, _ = split_nodes(txnPoolNodeSet)
    # Delay both PROPAGATEs and the replies to PROPAGATE requests
    slow_node.nodeIbStasher.delay(ppgDelay(20))
    slow_node.nodeIbStasher.delay(msg_rep_delay(20, [PROPAGATE, ]))

    old_count_resp = count_requested_preprepare_resp(slow_node)
    send_reqs_batches_and_get_suff_replies(looper, wallet1, client1, 4, 2)

    # The slow node is behind
    checkNodeDataForInequality(slow_node, *other_nodes)

    # PRE-PREPARE were not requested
    assert count_requested_preprepare_resp(slow_node) == old_count_resp

    slow_node.nodeIbStasher.reset_delays_and_process_delayeds()

    # The slow node has processed all requests
    waitNodeDataEquality(looper, slow_node, *other_nodes)

    # PRE-PREPARE were not requested
    assert count_requested_preprepare_resp(slow_node) == old_count_resp
def test_view_change_after_max_catchup_rounds(txnPoolNodeSet, looper, wallet1,
                                              client1, client1Connected):
    """
    The node should do only a fixed rounds of catchup. For this delay Prepares
    and Commits for 2 non-primary nodes by a large amount which is equivalent
    to loss of Prepares and Commits. Make sure 2 nodes have a different last
    prepared certificate from other two. Then do a view change, make sure view
    change completes and the pool does not process the request that were
    prepared by only a subset of the nodes
    """
    send_reqs_batches_and_get_suff_replies(looper, wallet1, client1, 2 * 3, 3)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    ledger_summary = txnPoolNodeSet[0].elector.ledger_summary

    slow_nodes = [r.node
                  for r in getNonPrimaryReplicas(txnPoolNodeSet, 0)[-2:]]
    fast_nodes = [n for n in txnPoolNodeSet if n not in slow_nodes]

    # Make node slow to process Prepares and Commits
    for node in slow_nodes:
        node.nodeIbStasher.delay(pDelay(120, 0))
        node.nodeIbStasher.delay(cDelay(120, 0))

    sendRandomRequests(wallet1, client1, 5)
    looper.runFor(3)

    ensure_view_change(looper, nodes=txnPoolNodeSet)

    def last_prepared(nodes):
        lst = [n.master_replica.last_prepared_certificate_in_view()
               for n in nodes]
        # All nodes have same last prepared
        assert check_if_all_equal_in_list(lst)
        return lst[0]

    last_prepared_slow = last_prepared(slow_nodes)
    last_prepared_fast = last_prepared(fast_nodes)

    # Check `slow_nodes` and `fast_nodes` set different last_prepared
    assert last_prepared_fast != last_prepared_slow

    # View change complete
    ensureElectionsDone(looper, txnPoolNodeSet)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)

    # The requests which were prepared by only a subset of the nodes were
    # not ordered
    assert txnPoolNodeSet[0].elector.ledger_summary == ledger_summary

    for node in slow_nodes:
        node.nodeIbStasher.reset_delays_and_process_delayeds()

    # Make sure pool is functional
    ensure_pool_functional(looper, txnPoolNodeSet, wallet1, client1)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    last_prepared(txnPoolNodeSet)
def test_no_catchup_if_got_from_3pc(looper, txnPoolNodeSet, sdk_pool_handle,
                                    sdk_wallet_client):
    """
    A node is slow to receive COMMIT messages so after a view change it
    starts catchup. But before it can start requesting txns, the COMMITs
    messages are received and are ordered. The node should not request any
    transactions.
    :return:
    """
    send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                           sdk_pool_handle,
                                           sdk_wallet_client,
                                           2 * 3, 3)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    slow_node = getNonPrimaryReplicas(txnPoolNodeSet, 0)[-1].node
    other_nodes = [n for n in txnPoolNodeSet if n != slow_node]

    delay_cm = 30
    # Fixed typo: was misspelled `delat_cp`
    delay_cp = 100
    slow_node.nodeIbStasher.delay(cDelay(delay_cm))
    # The slow node receives consistency proofs after some delay, this delay
    # gives the opportunity to deliver all 3PC messages
    slow_node.nodeIbStasher.delay(cpDelay(delay_cp))

    # Count of `getCatchupReqs` which is called to construct the `CatchupReq`
    # to be sent
    def domain_cr_count():
        return sum(1 for entry in
                   slow_node.ledgerManager.spylog.getAll(
                       slow_node.ledgerManager.getCatchupReqs)
                   if entry.params['consProof'].ledgerId == DOMAIN_LEDGER_ID)

    old_count = domain_cr_count()
    sent_batches = 10
    send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                           sdk_pool_handle,
                                           sdk_wallet_client,
                                           2 * sent_batches, sent_batches)
    ensure_view_change(looper, nodes=txnPoolNodeSet)

    # After view change, the `slow_node` is behind
    waitNodeDataInequality(looper, slow_node, *other_nodes)

    # Unstash only COMMIT messages
    slow_node.nodeIbStasher.reset_delays_and_process_delayeds(Commit.typename)

    looper.runFor(2)

    slow_node.nodeIbStasher.reset_delays_and_process_delayeds(
        ConsistencyProof.typename)

    waitNodeDataEquality(looper, slow_node, *other_nodes)

    # No `CatchupReq`s constructed, hence no `CatchupReq`s could have
    # been sent
    assert domain_cr_count() == old_count
    # Some stashed ordered requests have been processed
    rv = getAllReturnVals(slow_node, slow_node.processStashedOrderedReqs)
    assert sent_batches in rv

    sdk_ensure_pool_functional(looper, txnPoolNodeSet, sdk_wallet_client,
                               sdk_pool_handle)
def ensure_pool_functional(looper, nodes, wallet, client,
                           num_reqs=10, num_batches=2):
    """Send a few request batches and verify the pool stays in sync."""
    send_reqs_batches_and_get_suff_replies(looper, wallet, client,
                                           num_reqs, num_batches)
    ensure_all_nodes_have_same_data(looper, nodes)
def test_node_request_preprepare(looper, txnPoolNodeSet, client1,
                                 wallet1, client1Connected, teardown):
    """
    Node requests PRE-PREPARE only once after getting PREPAREs.
    """
    slow_node, other_nodes, primary_node, \
        other_primary_nodes = split_nodes(txnPoolNodeSet)

    # Drop PrePrepares and Prepares
    slow_node.nodeIbStasher.delay(ppDelay(300, 0))
    slow_node.nodeIbStasher.delay(pDelay(300, 0))

    send_reqs_batches_and_get_suff_replies(looper, wallet1, client1, 10, 5)
    slow_node.nodeIbStasher.drop_delayeds()
    slow_node.nodeIbStasher.resetDelays()

    old_count_req = count_requested_preprepare_req(slow_node)
    old_count_resp = count_requested_preprepare_resp(slow_node)

    def chk(increase=True):
        # Method is called
        assert count_requested_preprepare_req(slow_node) > old_count_req
        # Requesting Preprepare
        assert count_requested_preprepare_resp(
            slow_node) - old_count_resp == (1 if increase else 0)

    # Replay PREPAREs for every sent PRE-PREPARE from the non-primaries
    for pp in primary_node.master_replica.sentPrePrepares.values():
        for rep in [n.master_replica for n in other_primary_nodes]:
            prepare = Prepare(rep.instId,
                              pp.viewNo,
                              pp.ppSeqNo,
                              pp.ppTime,
                              pp.digest,
                              pp.stateRootHash,
                              pp.txnRootHash
                              )
            rep.send(prepare)

    looper.run(eventually(chk, True, retryWait=1))

    old_count_resp = count_requested_preprepare_resp(slow_node)

    # Re-send the last PREPARE once more (`rep` and `pp` deliberately reuse
    # the final loop values); the PRE-PREPARE must NOT be requested again
    prepare = Prepare(rep.instId,
                      pp.viewNo,
                      pp.ppSeqNo,
                      pp.ppTime,
                      pp.digest,
                      pp.stateRootHash,
                      pp.txnRootHash
                      )
    rep.send(prepare)

    looper.run(eventually(chk, False, retryWait=1))

    old_count_req = count_requested_preprepare_req(slow_node)
    old_count_resp = count_requested_preprepare_resp(slow_node)
def test_discard_3PC_messages_for_already_ordered(looper, txnPoolNodeSet,
                                                  client1, wallet1,
                                                  client1Connected):
    """
    Nodes discard any 3PC messages for already ordered 3PC keys
    (view_no, pp_seq_no). Delay all 3PC messages to a node so it cannot
    respond to them unless the other nodes order them, now when the slow
    node will get them it will respond but other nodes will not process
    them and discard them
    """
    slow_node = [r.node for r in getNonPrimaryReplicas(txnPoolNodeSet, 0)][-1]
    other_nodes = [n for n in txnPoolNodeSet if n != slow_node]
    delay = 20
    delay_3pc_messages([slow_node], 0, delay)
    delay_3pc_messages([slow_node], 1, delay)

    sent_batches = 3
    send_reqs_batches_and_get_suff_replies(looper, wallet1, client1,
                                           2 * sent_batches, sent_batches)

    def chk(node, inst_id, p_count, c_count):
        # A node will still record PREPRAREs even if more than n-f-1, till the
        # request is not ordered
        assert len(node.replicas[inst_id].prepares) >= p_count
        assert len(node.replicas[inst_id].commits) == c_count

    def count_discarded(inst_id, count):
        for node in other_nodes:
            assert countDiscarded(node.replicas[inst_id],
                                  'already ordered 3 phase message') == count

    # `slow_node` did not receive any PREPAREs or COMMITs
    chk(slow_node, 0, 0, 0)

    # `other_nodes` have not discarded any 3PC message
    count_discarded(0, 0)

    # `other_nodes` have not recorded any PREPAREs or COMMITs from `slow_node`
    chk_commits_prepares_recvd(0, other_nodes, slow_node)

    slow_node.reset_delays_and_process_delayeds()
    waitNodeDataEquality(looper, slow_node, *other_nodes)

    # `slow_node` did receive correct number of PREPAREs and COMMITs
    looper.run(eventually(chk, slow_node, 0,
                          sent_batches - 1, sent_batches,
                          retryWait=1))

    # `other_nodes` have not recorded any PREPAREs or COMMITs from `slow_node`
    chk_commits_prepares_recvd(0, other_nodes, slow_node)

    # `other_nodes` have discarded PREPAREs and COMMITs all batches
    count_discarded(0, 2 * sent_batches)
def test_lag_size_for_catchup(looper, chkFreqPatched, reqs_for_checkpoint,
                              txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client):
    """
    Verifies that if the stored own checkpoints have aligned bounds then
    the master replica lag which makes the node perform catch-up is
    Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1 quorumed stashed received
    checkpoints.
    """
    slow_node = getNonPrimaryReplicas(txnPoolNodeSet, 0)[-1].node
    other_nodes = [n for n in txnPoolNodeSet if n != slow_node]

    # The master replica of the slow node stops to receive 3PC-messages
    slow_node.master_replica.threePhaseRouter.extend((
        (PrePrepare, lambda *x, **y: None),
        (Prepare, lambda *x, **y: None),
        (Commit, lambda *x, **y: None),
    ))

    completed_catchups_before_reqs = get_number_of_completed_catchups(
        slow_node)

    # Send requests for the slow node's master replica to get
    # Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP quorumed stashed checkpoints
    # from others
    send_reqs_batches_and_get_suff_replies(
        looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client,
        Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP * reqs_for_checkpoint)

    # Give time for the slow node to catch up if it is going to do it
    looper.runFor(
        waits.expectedPoolConsistencyProof(len(txnPoolNodeSet)) +
        waits.expectedPoolCatchupTime(len(txnPoolNodeSet)))

    checkNodeDataForInequality(slow_node, *other_nodes)

    # Verify that the slow node has not caught up
    assert get_number_of_completed_catchups(
        slow_node) == completed_catchups_before_reqs

    # Send more requests for the slow node's master replica to reach
    # Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1 quorumed stashed
    # checkpoints from others
    send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                           sdk_pool_handle,
                                           sdk_wallet_client,
                                           reqs_for_checkpoint)

    waitNodeDataEquality(looper, slow_node, *other_nodes,
                         exclude_from_check=['check_last_ordered_3pc_backup'])

    # Verify that the slow node has caught up
    assert get_number_of_completed_catchups(
        slow_node) > completed_catchups_before_reqs
def test_view_change_done_delayed(txnPoolNodeSet, looper, wallet1, client1,
                                  client1Connected):
    """
    A node is slow so is behind other nodes, after view change, it catches up
    but it also gets view change message as delayed, a node should start
    participating only when caught up and ViewChangeCone quorum received.
    """
    nprs = [r.node for r in getNonPrimaryReplicas(txnPoolNodeSet, 0)]
    slow_node = nprs[-1]
    other_nodes = [n for n in txnPoolNodeSet if n != slow_node]
    delay_3pc = 10
    delay_vcd = 25
    delay_3pc_messages([slow_node], 0, delay_3pc)
    slow_node.nodeIbStasher.delay(vcd_delay(delay_vcd))

    def chk(node):
        assert node.elector.has_acceptable_view_change_quorum
        assert node.elector._primary_verified
        assert node.isParticipating
        assert None not in {r.isPrimary for r in node.replicas}

    send_reqs_batches_and_get_suff_replies(looper, wallet1, client1, 5 * 4, 4)

    ensure_view_change(looper, nodes=txnPoolNodeSet)

    # After view change, the slow node successfully completes catchup
    waitNodeDataEquality(looper, slow_node, *other_nodes)

    # Other nodes complete view change, select primary and participate
    for node in other_nodes:
        looper.run(eventually(chk, node, retryWait=1))

    # Since `ViewChangeCone` is delayed, slow_node is not able to select
    # primary and participate
    assert not slow_node.elector.has_acceptable_view_change_quorum
    assert not slow_node.elector._primary_verified
    assert not slow_node.isParticipating
    assert {r.isPrimary for r in slow_node.replicas} == {None}

    # Send requests to make sure pool is functional
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 5)

    # Repair network
    slow_node.reset_delays_and_process_delayeds()

    # `slow_node` selects primary and participate
    looper.run(eventually(chk, slow_node, retryWait=1))

    # Processes requests received during lack of primary
    waitNodeDataEquality(looper, slow_node, *other_nodes)

    # Send more requests and compare data of all nodes
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 5)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
def test_node_requests_missing_preprepare_malicious(looper, txnPoolNodeSet,
                                                    client1, wallet1,
                                                    client1Connected,
                                                    malicious_setup,
                                                    teardown):
    """
    A node has bad network with primary and thus loses PRE-PREPARE,
    it requests PRE-PREPARE from non-primaries once it has sufficient
    PREPAREs but one of the non-primary does not send the PRE-PREPARE
    """
    primary_node, bad_node, good_non_primary_node, slow_node, other_nodes, \
        bad_method, orig_method = malicious_setup

    slow_node.nodeIbStasher.delay(ppDelay(300, 0))

    def get_reply_count_frm(node):
        # How many PRE-PREPARE `MessageRep`s the slow node got from `node`
        return sum([1 for entry in
                    slow_node.spylog.getAll(slow_node.process_message_rep)
                    if entry.params['msg'].msg_type == PREPREPARE and
                    entry.params['frm'] == node.name])

    old_reply_count_from_bad_node = get_reply_count_frm(bad_node)
    old_reply_count_from_good_node = get_reply_count_frm(
        good_non_primary_node)
    old_discarded = countDiscarded(slow_node.master_replica,
                                   'does not have expected state')

    send_reqs_batches_and_get_suff_replies(looper, wallet1, client1, 10, 2)

    waitNodeDataEquality(looper, slow_node, *other_nodes)

    assert check_if_all_equal_in_list(
        [n.master_replica.ordered for n in txnPoolNodeSet])

    assert not slow_node.master_replica.requested_pre_prepares

    if bad_method.__name__ == 'do_not_send':
        assert get_reply_count_frm(bad_node) == old_reply_count_from_bad_node
    else:
        assert countDiscarded(slow_node.master_replica,
                              'does not have expected state') > old_discarded

    assert get_reply_count_frm(good_non_primary_node) > \
        old_reply_count_from_good_node

    slow_node.reset_delays_and_process_delayeds()
    bad_node.nodeMsgRouter.routes[MessageReq] = orig_method
def test_lag_size_for_catchup(
        looper, chkFreqPatched, reqs_for_checkpoint, txnPoolNodeSet,
        sdk_pool_handle, sdk_wallet_client):
    """
    Verifies that if the stored own checkpoints have aligned bounds then
    the master replica lag which makes the node perform catch-up is
    Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1 quorumed stashed received
    checkpoints.
    """
    slow_node = getNonPrimaryReplicas(txnPoolNodeSet, 0)[-1].node
    other_nodes = [n for n in txnPoolNodeSet if n != slow_node]

    # The master replica of the slow node stops to receive 3PC-messages
    slow_node.master_replica.threePhaseRouter.extend((
        (PrePrepare, lambda *x, **y: None),
        (Prepare, lambda *x, **y: None),
        (Commit, lambda *x, **y: None),
    ))

    completed_catchups_before_reqs = get_number_of_completed_catchups(
        slow_node)

    # Send requests for the slow node's master replica to get
    # Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP quorumed stashed checkpoints
    # from others
    send_reqs_batches_and_get_suff_replies(
        looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client,
        Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP * reqs_for_checkpoint)

    # Give time for the slow node to catch up if it is going to do it
    looper.runFor(
        waits.expectedPoolConsistencyProof(len(txnPoolNodeSet)) +
        waits.expectedPoolCatchupTime(len(txnPoolNodeSet)))

    checkNodeDataForInequality(slow_node, *other_nodes)

    # Verify that the slow node has not caught up
    assert get_number_of_completed_catchups(
        slow_node) == completed_catchups_before_reqs

    # Send more requests for the slow node's master replica to reach
    # Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1 quorumed stashed
    # checkpoints from others
    send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                           sdk_pool_handle,
                                           sdk_wallet_client,
                                           reqs_for_checkpoint)
    waitNodeDataEquality(looper, slow_node, *other_nodes)

    # Verify that the slow node has caught up
    assert get_number_of_completed_catchups(
        slow_node) > completed_catchups_before_reqs
def test_nodes_removes_request_keys_for_ordered(setup, looper,
                                                txnPoolNodeSet, client1,
                                                wallet1):
    """
    A node does not order requests since it is missing some 3PC messages,
    gets them from catchup. It then clears them from its request queues
    """
    slow_node, fast_nodes = setup
    reqs = send_reqs_batches_and_get_suff_replies(looper, wallet1, client1,
                                                  10, 5)
    ensure_all_nodes_have_same_data(looper, fast_nodes)
    assert slow_node.master_replica.last_ordered_3pc != \
        fast_nodes[0].master_replica.last_ordered_3pc

    def chk(key, nodes, present):
        # Verify whether `key` is (or is not) in each node's domain queue
        for node in nodes:
            assert (key in
                    node.master_replica.requestQueues[DOMAIN_LEDGER_ID]
                    ) == present

    for req in reqs:
        chk(req.key, fast_nodes, False)
        chk(req.key, [slow_node], True)

    ensure_view_change(looper, txnPoolNodeSet)
    ensureElectionsDone(looper, txnPoolNodeSet)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)

    for req in reqs:
        chk(req.key, txnPoolNodeSet, False)
def test_nodes_removes_request_keys_for_ordered(setup, looper,
                                                txnPoolNodeSet, client1,
                                                wallet1):
    """
    A node does not order requests since it is missing some 3PC messages,
    gets them from catchup. It then clears them from its request queues
    """
    slow_node, fast_nodes = setup
    reqs = send_reqs_batches_and_get_suff_replies(
        looper, wallet1, client1, 10, 5)
    ensure_all_nodes_have_same_data(looper, fast_nodes)
    assert slow_node.master_replica.last_ordered_3pc != \
        fast_nodes[0].master_replica.last_ordered_3pc

    def chk(key, nodes, present):
        # Verify whether `key` is (or is not) in each node's domain queue
        for node in nodes:
            assert (
                key in
                node.master_replica.requestQueues[DOMAIN_LEDGER_ID]
            ) == present

    for req in reqs:
        chk(req.key, fast_nodes, False)
        chk(req.key, [slow_node], True)

    # Reset catchup reply delay so that catchup can complete
    slow_node.nodeIbStasher.reset_delays_and_process_delayeds(
        CatchupRep.typename)

    ensure_view_change(looper, txnPoolNodeSet)
    ensureElectionsDone(looper, txnPoolNodeSet)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)

    for req in reqs:
        chk(req.key, txnPoolNodeSet, False)

    # Needed for the next run due to the parametrised fixture
    slow_node.reset_delays_and_process_delayeds()
def test_node_requests_missing_preprepare(looper, txnPoolNodeSet, client1,
                                          wallet1, client1Connected,
                                          teardown):
    """
    A node has bad network with primary and thus loses PRE-PREPARE,
    it requests PRE-PREPARE from non-primaries once it has sufficient PREPAREs
    """
    slow_node, other_nodes, _, _ = split_nodes(txnPoolNodeSet)

    # Delay PRE-PREPAREs by large amount simulating loss
    slow_node.nodeIbStasher.delay(ppDelay(300, 0))

    old_count_pp = get_count(slow_node.master_replica,
                             slow_node.master_replica.processPrePrepare)
    old_count_mrq = {n.name: get_count(n, n.process_message_req)
                     for n in other_nodes}
    old_count_mrp = get_count(slow_node, slow_node.process_message_rep)

    send_reqs_batches_and_get_suff_replies(looper, wallet1, client1, 15, 5)
    waitNodeDataEquality(looper, slow_node, *other_nodes)

    assert not slow_node.master_replica.requested_pre_prepares

    # `slow_node` processed PRE-PREPARE
    assert get_count(
        slow_node.master_replica,
        slow_node.master_replica.processPrePrepare) > old_count_pp

    # `slow_node` did receive `MessageRep`
    assert get_count(slow_node, slow_node.process_message_rep) > old_count_mrp

    # More than `f` nodes received `MessageReq`
    recv_reqs = set()
    for n in other_nodes:
        if get_count(n, n.process_message_req) > old_count_mrq[n.name]:
            recv_reqs.add(n.name)
    assert len(recv_reqs) > slow_node.f

    # All nodes including the `slow_node` ordered the same requests
    assert check_if_all_equal_in_list(
        [n.master_replica.ordered for n in txnPoolNodeSet])
def test_state_regenerated_from_ledger(
        looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client,
        tdir, tconf, allPluginsPath):
    """
    Node loses its state database but recreates it from ledger after start
    """
    sent_batches = 10
    send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                           sdk_pool_handle,
                                           sdk_wallet_client,
                                           5 * sent_batches, sent_batches)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)

    node_to_stop = txnPoolNodeSet[-1]
    node_state = node_to_stop.states[DOMAIN_LEDGER_ID]
    # The node must have accumulated some state before we destroy it
    assert not node_state.isEmpty
    state_db_path = node_state._kv.db_path
    nodeHa, nodeCHa = HA(*node_to_stop.nodestack.ha), \
        HA(*node_to_stop.clientstack.ha)

    node_to_stop.stop()
    looper.removeProdable(node_to_stop)

    # Wipe the state database on disk; the restarted node must rebuild it
    # from the ledger
    shutil.rmtree(state_db_path)

    config_helper = PNodeConfigHelper(node_to_stop.name, tconf, chroot=tdir)
    restarted_node = TestNode(node_to_stop.name,
                              config_helper=config_helper,
                              config=tconf,
                              ha=nodeHa,
                              cliha=nodeCHa,
                              pluginPaths=allPluginsPath)
    looper.add(restarted_node)
    txnPoolNodeSet[-1] = restarted_node

    looper.run(checkNodesConnected(txnPoolNodeSet))
    waitNodeDataEquality(looper, restarted_node, *txnPoolNodeSet[:-1])
def test_checkpoint_across_views(sent_batches, chkFreqPatched, looper,
                                 txnPoolNodeSet, client1, wallet1,
                                 client1Connected):
    """
    Test checkpointing across views.
    This test checks that checkpointing and garbage collection works correctly
    no matter if view change happened before a checkpoint or after a
    checkpoint
    """
    batch_size = 2
    send_reqs_batches_and_get_suff_replies(looper, wallet1, client1,
                                           batch_size * sent_batches,
                                           sent_batches)

    # Check that correct garbage collection happens
    non_gced_batch_count = (sent_batches - CHK_FREQ) \
        if sent_batches >= CHK_FREQ else sent_batches
    looper.run(eventually(checkRequestCounts, txnPoolNodeSet,
                          batch_size * non_gced_batch_count,
                          non_gced_batch_count, non_gced_batch_count,
                          retryWait=1))

    ensure_view_change(looper, txnPoolNodeSet)
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)

    # Check that after view change, proper clean up is done
    for node in txnPoolNodeSet:
        for r in node.replicas:
            assert not r.checkpoints
            # No stashed checkpoint for previous view
            assert not [view_no for view_no in r.stashedRecvdCheckpoints
                        if view_no < r.viewNo]
            assert r._h == 0
            assert r._lastPrePrepareSeqNo == 0
            assert r.h == 0
            assert r.H == r._h + chkFreqPatched.LOG_SIZE

    checkRequestCounts(txnPoolNodeSet, 0, 0, 0)

    # Even after view change, checkpointing works
    send_reqs_batches_and_get_suff_replies(looper, wallet1, client1,
                                           batch_size * sent_batches,
                                           sent_batches)
    looper.run(eventually(checkRequestCounts, txnPoolNodeSet,
                          batch_size * non_gced_batch_count,
                          non_gced_batch_count, non_gced_batch_count,
                          retryWait=1))

    # Send more batches so one more checkpoint happens. This is done so that
    # when this test finishes, all requests are garbage collected and the
    # next run of this test (with next param) has the calculations correct
    more = CHK_FREQ - non_gced_batch_count
    send_reqs_batches_and_get_suff_replies(looper, wallet1, client1,
                                           batch_size * more, more)
    looper.run(eventually(checkRequestCounts, txnPoolNodeSet, 0, 0, 0,
                          retryWait=1))
def test_node_catchup_after_checkpoints(
        looper, chkFreqPatched, reqs_for_checkpoint, txnPoolNodeSet,
        sdk_pool_handle, sdk_wallet_client, broken_node_and_others):
    """
    A node misses 3pc messages and checkpoints during some period but later
    it stashes some amount of checkpoints and decides to catchup.
    """
    max_batch_size = chkFreqPatched.Max3PCBatchSize
    broken_node, other_nodes = broken_node_and_others

    logger.info("Step 1: The node misses quite a lot of requests")
    send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                           sdk_pool_handle,
                                           sdk_wallet_client,
                                           reqs_for_checkpoint +
                                           max_batch_size)
    waitNodeDataInequality(looper, broken_node, *other_nodes)

    logger.info(
        "Step 2: The node gets requests but cannot process them because of "
        "missed ones. But the nodes eventually stashes some amount checkpoints "
        "after that the node starts catch up")
    repaired_node = repair_broken_node(broken_node)
    completed_catchups_before = get_number_of_completed_catchups(broken_node)
    send_reqs_batches_and_get_suff_replies(
        looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client,
        (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) *
        reqs_for_checkpoint - max_batch_size)
    waitNodeDataEquality(looper, repaired_node, *other_nodes)
    # Note that the repaired node might not fill the gap of missed
    # 3PC-messages by requesting them from other nodes because these
    # 3PC-batches start from an already stabilized checkpoint, so a part of
    # these 3PC-messages are already unavailable

    # Verify that a catch-up was done
    completed_catchups_after = get_number_of_completed_catchups(repaired_node)
    assert completed_catchups_after > completed_catchups_before

    logger.info("Step 3: Check if the node is able to process requests")
    send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                           sdk_pool_handle,
                                           sdk_wallet_client,
                                           reqs_for_checkpoint +
                                           max_batch_size)
    waitNodeDataEquality(looper, repaired_node, *other_nodes)
def test_commits_recvd_first(looper, txnPoolNodeSet, client1, wallet1,
                             client1Connected):
    """
    A node delayed on PRE-PREPAREs and PREPAREs stashes COMMITs and applies
    them all once the delays are removed.
    """
    slow_node = [r.node for r in getNonPrimaryReplicas(txnPoolNodeSet, 0)][-1]
    other_nodes = [n for n in txnPoolNodeSet if n != slow_node]
    delay = 50
    slow_node.nodeIbStasher.delay(ppDelay(delay, 0))
    slow_node.nodeIbStasher.delay(pDelay(delay, 0))

    send_reqs_batches_and_get_suff_replies(looper, wallet1, client1, 20, 4)

    # Only COMMITs arrived; they wait for the (delayed) PREPAREs
    assert not slow_node.master_replica.prePrepares
    assert not slow_node.master_replica.prepares
    assert not slow_node.master_replica.commits
    assert len(slow_node.master_replica.commitsWaitingForPrepare) > 0

    slow_node.reset_delays_and_process_delayeds()
    waitNodeDataEquality(looper, slow_node, *other_nodes)
    assert check_if_all_equal_in_list(
        [n.master_replica.ordered for n in txnPoolNodeSet])

    # After repairing the network, all 3PC collections are populated and
    # nothing is left waiting
    assert slow_node.master_replica.prePrepares
    assert slow_node.master_replica.prepares
    assert slow_node.master_replica.commits
    assert not slow_node.master_replica.commitsWaitingForPrepare
def test_nodes_removes_request_keys_for_ordered(setup, looper,
                                                txnPoolNodeSet,
                                                sdk_pool_handle,
                                                sdk_wallet_client):
    """
    A node does not order requests since it is missing some 3PC messages,
    gets them from catchup. It then clears them from its request queues
    """
    slow_node, fast_nodes = setup

    reqs = sdk_json_couples_to_request_list(
        send_reqs_batches_and_get_suff_replies(
            looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client,
            10, 5))
    ensure_all_nodes_have_same_data(looper, fast_nodes)
    assert slow_node.master_replica.last_ordered_3pc != \
        fast_nodes[0].master_replica.last_ordered_3pc

    def chk(key, nodes, present):
        # Verify whether `key` is (or is not) in each node's domain queue
        for node in nodes:
            assert (
                key in
                node.master_replica.requestQueues[DOMAIN_LEDGER_ID]
            ) == present

    for req in reqs:
        chk(req.digest, fast_nodes, False)
        chk(req.digest, [slow_node], True)

    # Reset catchup reply delay so that catchup can complete
    slow_node.nodeIbStasher.reset_delays_and_process_delayeds(
        CatchupRep.typename)

    ensure_view_change(looper, txnPoolNodeSet)
    ensureElectionsDone(looper, txnPoolNodeSet)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)

    for req in reqs:
        chk(req.digest, txnPoolNodeSet, False)

    # Needed for the next run due to the parametrised fixture
    slow_node.reset_delays_and_process_delayeds()
def test_nodes_removes_request_keys_for_ordered(setup, looper,
                                                txnPoolNodeSet,
                                                sdk_pool_handle,
                                                sdk_wallet_client):
    """
    A node does not order requests since it is missing some 3PC messages,
    gets them from catchup. It then clears them from its request queues
    """
    slow_node, fast_nodes = setup

    reqs = sdk_json_couples_to_request_list(
        send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                               sdk_pool_handle,
                                               sdk_wallet_client,
                                               10, 5))
    ensure_all_nodes_have_same_data(looper, fast_nodes)
    assert slow_node.master_replica.last_ordered_3pc != \
        fast_nodes[0].master_replica.last_ordered_3pc

    def chk(key, nodes, present):
        # Verify whether `key` is (or is not) in each node's domain queue
        for node in nodes:
            assert (
                key in
                node.master_replica._ordering_service.
                requestQueues[DOMAIN_LEDGER_ID]
            ) == present

    for req in reqs:
        chk(req.digest, fast_nodes, False)
        chk(req.digest, [slow_node], True)

    # Reset catchup reply delay so that catchup can complete
    slow_node.nodeIbStasher.reset_delays_and_process_delayeds(
        CatchupRep.typename)

    slow_node.start_catchup()
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)

    for req in reqs:
        chk(req.digest, txnPoolNodeSet, False)

    # Needed for the next run due to the parametrised fixture
    slow_node.reset_delays_and_process_delayeds()
def test_node_catchup_after_checkpoints(looper, chk_freq_patched,
                                        txnPoolNodeSet, wallet1, client1,
                                        client1Connected,
                                        broken_node_and_others):
    """
    A node misses 3pc messages for a while; once enough checkpoints are
    stashed it decides to catch up, after which it can process requests
    again.

    NOTE(review): a test with this name also appears earlier in this source —
    confirm the two definitions belong to separate modules.
    """
    broken_node, other_nodes = broken_node_and_others
    catchups_before = get_number_of_completed_catchups(broken_node)

    logger.info("Step 1: The node misses quite a lot requests")
    send_reqs_batches_and_get_suff_replies(
        looper, wallet1, client1,
        num_reqs=chk_freq_patched + 1,
        num_batches=chk_freq_patched + 1,
    )
    waitNodeDataInequality(looper, broken_node, *other_nodes)

    logger.info(
        "Step 2: The node gets requests but cannot process them because of "
        "missed ones. But the nodes eventually stashes some amount checkpoints "
        "after that the node starts catch up")
    repaired_node = repair_broken_node(broken_node)
    send_reqs_batches_and_get_suff_replies(
        looper, wallet1, client1,
        num_reqs=2 * chk_freq_patched,
        num_batches=2 * chk_freq_patched)
    waitNodeDataEquality(looper, repaired_node, *other_nodes)

    # At least one catchup round must have completed by now.
    catchups_after = get_number_of_completed_catchups(repaired_node)
    assert catchups_after >= catchups_before + 1

    logger.info("Step 3: Check if the node is able to process requests")
    send_reqs_batches_and_get_suff_replies(
        looper, wallet1, client1,
        num_reqs=chk_freq_patched + 2,
        num_batches=chk_freq_patched + 2)
    waitNodeDataEquality(looper, repaired_node, *other_nodes)
def test_no_catchup_if_got_from_3pc(looper, txnPoolNodeSet, sdk_pool_handle,
                                    sdk_wallet_client):
    """
    A node is slow to receive COMMIT messages so after a view change it
    starts catchup. But before it can start requesting txns, the COMMITs
    messages are received and are ordered. The node should not request any
    transactions.
    :return:
    """
    send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                           sdk_pool_handle, sdk_wallet_client,
                                           2 * 3, 3)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    slow_node = getNonPrimaryReplicas(txnPoolNodeSet, 0)[-1].node
    other_nodes = [n for n in txnPoolNodeSet if n != slow_node]

    delay_cm = 30
    delay_cp = 100  # fixed typo: was `delat_cp`
    slow_node.nodeIbStasher.delay(cDelay(delay_cm))
    # The slow node receives consistency proofs after some delay, this delay
    # gives the opportunity to deliver all 3PC messages
    slow_node.nodeIbStasher.delay(cpDelay(delay_cp))

    # Count of `getCatchupReqs` which is called to construct the `CatchupReq`
    # to be sent
    def domain_cr_count():
        return sum(1 for entry in slow_node.ledgerManager.spylog.getAll(
            slow_node.ledgerManager.getCatchupReqs) if
            entry.params['consProof'].ledgerId == DOMAIN_LEDGER_ID)

    old_count = domain_cr_count()
    sent_batches = 10
    send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                           sdk_pool_handle, sdk_wallet_client,
                                           2 * sent_batches, sent_batches)
    ensure_view_change(looper, nodes=txnPoolNodeSet)

    # After view change, the `slow_node` is behind
    waitNodeDataInequality(looper, slow_node, *other_nodes)

    # Unstash only COMMIT messages
    slow_node.nodeIbStasher.reset_delays_and_process_delayeds(Commit.typename)

    looper.runFor(2)

    slow_node.nodeIbStasher.reset_delays_and_process_delayeds(
        ConsistencyProof.typename)

    waitNodeDataEquality(looper, slow_node, *other_nodes)

    # No `CatchupReq`s constructed, hence no `CatchupReq`s could have
    # been sent
    assert domain_cr_count() == old_count

    # Some stashed ordered requests have been processed
    rv = getAllReturnVals(slow_node, slow_node.processStashedOrderedReqs)
    assert sent_batches in rv

    sdk_ensure_pool_functional(looper, txnPoolNodeSet, sdk_wallet_client,
                               sdk_pool_handle)
def test_complete_short_checkpoint_not_included_in_lag_for_catchup(
        looper, chkFreqPatched, reqs_for_checkpoint, txnPoolNodeSet,
        sdk_pool_handle, sdk_wallet_steward, sdk_wallet_client,
        tdir, tconf, allPluginsPath):
    """
    If the first stored own checkpoint has a not aligned lower bound (it was
    started after a catch-up), is complete, and there is a quorumed stashed
    checkpoint from other replicas with the same end, then that stashed
    checkpoint must NOT count towards the catch-up lag.  The node therefore
    only catches up after Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 2
    quorumed stashed received checkpoints.
    """
    max_batch_size = chkFreqPatched.Max3PCBatchSize

    # Bring a brand-new node (Epsilon) into the pool; it obtains the two
    # bootstrap transactions (steward NYM + node txn) via catch-up.
    _, new_node = sdk_add_new_steward_and_node(
        looper, sdk_pool_handle, sdk_wallet_steward,
        'EpsilonSteward', 'Epsilon', tdir, tconf,
        allPluginsPath=allPluginsPath)
    txnPoolNodeSet.append(new_node)
    old_nodes = txnPoolNodeSet[:-1]
    looper.run(checkNodesConnected(txnPoolNodeSet))
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
    waitNodeDataEquality(looper, new_node, *old_nodes)

    # Complete the first checkpoint: it already contains the 2 bootstrap
    # 3PC-batches, so send one checkpoint's worth of requests minus those
    # 2 batches.  Its lower bound is not aligned on the new node's replicas,
    # so it will not be stabilized there.
    send_reqs_batches_and_get_suff_replies(
        looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client,
        reqs_for_checkpoint - 2 * max_batch_size)
    waitNodeDataEquality(looper, new_node, *old_nodes)

    # The master replica of the new node stops to receive 3PC-messages
    new_node.master_replica.threePhaseRouter.extend(
        (
            (PrePrepare, lambda *x, **y: None),
            (Prepare, lambda *x, **y: None),
            (Commit, lambda *x, **y: None),
        )
    )

    baseline_catchups = get_number_of_completed_catchups(new_node)

    # Reach STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1 quorumed stashed
    # checkpoints from the other replicas.
    send_reqs_batches_and_get_suff_replies(
        looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client,
        Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP * reqs_for_checkpoint)

    # Give time for the new node to catch up if it is going to do it
    looper.runFor(waits.expectedPoolConsistencyProof(len(txnPoolNodeSet)) +
                  waits.expectedPoolCatchupTime(len(txnPoolNodeSet)))

    waitNodeDataInequality(looper, new_node, *old_nodes)

    # The new node must NOT have caught up yet.
    assert get_number_of_completed_catchups(new_node) == baseline_catchups

    # One more checkpoint pushes the count to
    # STASHED_CHECKPOINTS_BEFORE_CATCHUP + 2 and triggers the catch-up.
    send_reqs_batches_and_get_suff_replies(
        looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client,
        reqs_for_checkpoint)
    waitNodeDataEquality(looper, new_node, *old_nodes)

    # Now the new node must have caught up.
    assert get_number_of_completed_catchups(new_node) > baseline_catchups
def test_complete_short_checkpoint_not_included_in_lag_for_catchup(
        looper, chkFreqPatched, reqs_for_checkpoint, txnPoolNodeSet,
        sdk_pool_handle, sdk_wallet_steward, sdk_wallet_client,
        tdir, tconf, allPluginsPath):
    """
    If the first stored own checkpoint has a not aligned lower bound (it was
    started after a catch-up), is complete, and there is a quorumed stashed
    checkpoint from other replicas with the same end, then that stashed
    checkpoint must NOT count towards the catch-up lag.  The node therefore
    only catches up after Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 2
    quorumed stashed received checkpoints.

    NOTE(review): a variant of this test without the
    `check_last_ordered_3pc_backup` exclusion also appears in this source —
    confirm they belong to different modules.
    """
    max_batch_size = chkFreqPatched.Max3PCBatchSize
    # Backup-replica last-ordered comparison is skipped in the data checks
    # throughout this test.
    skip_checks = ['check_last_ordered_3pc_backup']

    # Bring a brand-new node (Epsilon) into the pool; it obtains the two
    # bootstrap transactions (steward NYM + node txn) via catch-up.
    _, new_node = sdk_add_new_steward_and_node(
        looper, sdk_pool_handle, sdk_wallet_steward,
        'EpsilonSteward', 'Epsilon', tdir, tconf,
        allPluginsPath=allPluginsPath)
    txnPoolNodeSet.append(new_node)
    old_nodes = txnPoolNodeSet[:-1]
    looper.run(checkNodesConnected(txnPoolNodeSet))
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
    waitNodeDataEquality(looper, new_node, *old_nodes,
                         exclude_from_check=skip_checks)

    # Complete the first checkpoint: it already contains the 2 bootstrap
    # 3PC-batches, so send one checkpoint's worth of requests minus those
    # 2 batches.  Its lower bound is not aligned on the new node's replicas,
    # so it will not be stabilized there.
    send_reqs_batches_and_get_suff_replies(
        looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client,
        reqs_for_checkpoint - 2 * max_batch_size)
    waitNodeDataEquality(looper, new_node, *old_nodes,
                         exclude_from_check=skip_checks)

    # The master replica of the new node stops to receive 3PC-messages
    new_node.master_replica.threePhaseRouter.extend(
        (
            (PrePrepare, lambda *x, **y: None),
            (Prepare, lambda *x, **y: None),
            (Commit, lambda *x, **y: None),
        )
    )

    baseline_catchups = get_number_of_completed_catchups(new_node)

    # Reach STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1 quorumed stashed
    # checkpoints from the other replicas.
    send_reqs_batches_and_get_suff_replies(
        looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client,
        Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP * reqs_for_checkpoint)

    # Give time for the new node to catch up if it is going to do it
    looper.runFor(waits.expectedPoolConsistencyProof(len(txnPoolNodeSet)) +
                  waits.expectedPoolCatchupTime(len(txnPoolNodeSet)))

    waitNodeDataInequality(looper, new_node, *old_nodes,
                           exclude_from_check=skip_checks)

    # The new node must NOT have caught up yet.
    assert get_number_of_completed_catchups(new_node) == baseline_catchups

    # One more checkpoint pushes the count to
    # STASHED_CHECKPOINTS_BEFORE_CATCHUP + 2 and triggers the catch-up.
    send_reqs_batches_and_get_suff_replies(
        looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client,
        reqs_for_checkpoint)
    waitNodeDataEquality(looper, new_node, *old_nodes,
                         exclude_from_check=skip_checks)

    # Now the new node must have caught up.
    assert get_number_of_completed_catchups(new_node) > baseline_catchups
def test_reverted_unordered(txnPoolNodeSet, looper, wallet1, client1,
                            client1Connected):
    """
    Before starting catchup, revert any uncommitted changes to state and
    ledger so that requests which were ordered-but-stashed are not applied
    twice.

    Example scenario
    prepared (1, 4)
    startViewChange
    start_catchup
    ...
    committed and send Ordered (1, 2)
    ...
    preLedgerCatchUp
    force_process_ordered, take out (1,2) and stash (1, 2)
    now process stashed Ordered(1,2), its requests will be applied again

    Simulation: delay COMMITs to one node so it prepares requests but cannot
    order them; trigger a view change, then deliver the COMMITs while the
    view change is in progress.  The slow node must revert its unordered
    batches yet eventually process the ordered requests.  LEDGER_STATUS is
    delayed too so that catchup starts late.
    """
    slow_node = getNonPrimaryReplicas(txnPoolNodeSet, 0)[-1].node
    fast_nodes = [n for n in txnPoolNodeSet if n != slow_node]
    slow_node.nodeIbStasher.delay(cDelay(120, 0))
    sent_batches = 5
    send_reqs_batches_and_get_suff_replies(looper, wallet1, client1,
                                           2 * sent_batches, sent_batches)

    # Fast nodes agree on the last ordered 3PC key and on their data.
    last_ordered = [n.master_last_ordered_3PC for n in fast_nodes]
    assert check_if_all_equal_in_list(last_ordered)
    ensure_all_nodes_have_same_data(looper, fast_nodes)

    # The slow node lags behind the fast ones.
    assert last_ordered[0] != slow_node.master_last_ordered_3PC

    # Delay LEDGER_STATUS and message responses so catchup starts late.
    slow_node.nodeIbStasher.delay(lsDelay(100))
    slow_node.nodeIbStasher.delay(msg_rep_delay(100))

    # No batches have been reverted on the slow node yet.
    assert sent_batches not in getAllReturnVals(
        slow_node.master_replica,
        slow_node.master_replica.revert_unordered_batches)

    ensure_view_change(looper, txnPoolNodeSet)

    def batches_reverted():
        # The slow node reverted all of its unordered batches.
        revert_results = getAllReturnVals(
            slow_node.master_replica,
            slow_node.master_replica.revert_unordered_batches)
        assert sent_batches in revert_results

    looper.run(eventually(batches_reverted, retryWait=1))

    # After the view change the slow node has prepared the same requests as
    # the fast nodes have ordered.
    assert last_ordered[0] == \
        slow_node.master_replica.last_prepared_before_view_change

    # Deliver COMMITs
    slow_node.nodeIbStasher.reset_delays_and_process_delayeds(COMMIT)

    def slow_node_ordered_all():
        # The slow node has ordered the same requests as the others.
        assert last_ordered[0] == slow_node.master_last_ordered_3PC

    looper.run(eventually(slow_node_ordered_all, retryWait=1))

    # Deliver LEDGER_STATUS so catchup can complete
    slow_node.nodeIbStasher.reset_delays_and_process_delayeds(LEDGER_STATUS)
    slow_node.nodeIbStasher.reset_delays_and_process_delayeds(MESSAGE_RESPONSE)

    # Ensure all nodes have same data
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    ensureElectionsDone(looper, txnPoolNodeSet)

    def stashed_ordered_processed():
        # The slow node processed stashed Ordered requests successfully.
        rv = getAllReturnVals(slow_node, slow_node.processStashedOrderedReqs)
        assert sent_batches in rv

    looper.run(eventually(stashed_ordered_processed, retryWait=1))

    # Ensure pool is functional
    ensure_pool_functional(looper, txnPoolNodeSet, wallet1, client1)
def test_node_detecting_lag_from_view_change_done_messages(
        txnPoolNodeSet, looper, wallet1, client1, client1Connected, tconf):
    """
    A node is slow and after view change starts, it marks it's `last_prepared`
    to less than others, after catchup it does not get any txns from others
    and finds it has already ordered it's `last_prepared`, but when it gets
    ViewChangeDone messages, it starts catchup again and this time gets the
    txns. To achieve this delay all 3PC messages to a node so before view
    change it has different last_prepared from others. Also delay processing
    of COMMITs and INSTANCE_CHANGEs by other nodes
    """
    send_reqs_batches_and_get_suff_replies(looper, wallet1, client1, 2 * 3, 3)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)

    slow_node = getNonPrimaryReplicas(txnPoolNodeSet, 0)[-1].node
    fast_nodes = [n for n in txnPoolNodeSet if n != slow_node]
    slow_master_replica = slow_node.master_replica
    fast_master_replicas = [n.master_replica for n in fast_nodes]

    delay_3pc = 50
    delay_ic = tconf.PerfCheckFreq + 5
    delay_commit = delay_ic + 10
    delay_3pc_messages([slow_node], 0, delay_3pc)
    for n in fast_nodes:
        n.nodeIbStasher.delay(icDelay(delay_ic))
        n.nodeIbStasher.delay(cDelay(delay_commit))

    # Spread request sending over time so the 3PC traffic overlaps the
    # delays configured above.
    # (fix: extend the list in place instead of rebuilding it with `+`;
    # loop variable was unused)
    reqs = []
    for _ in range(10):
        reqs += sendRandomRequests(wallet1, client1, 2)
        looper.runFor(.2)

    def chk1():
        # slow_node has prepared beyond every fast node but ordered the same
        for rep in fast_master_replicas:
            assert compare_3PC_keys(
                slow_master_replica.last_prepared_certificate_in_view(),
                rep.last_prepared_certificate_in_view()) > 0
            assert slow_master_replica.last_ordered_3pc == rep.last_ordered_3pc

    looper.run(eventually(chk1))

    no_more_catchup_count = get_count(slow_node,
                                      slow_node.no_more_catchups_needed)

    # Track last prepared for master replica of each node
    prepareds = {}
    orig_methods = {}
    for node in txnPoolNodeSet:
        orig_methods[node.name] = node.master_replica.on_view_change_start

        def patched_on_view_change_start(self):
            orig_methods[self.node.name]()
            prepareds[self.node.name] = self.last_prepared_before_view_change

        node.master_replica.on_view_change_start = types.MethodType(
            patched_on_view_change_start, node.master_replica)

    ensure_view_change(looper, txnPoolNodeSet, exclude_from_check=fast_nodes)

    def chk2():
        # last_prepared of slow_node is less than fast_nodes
        for rep in fast_master_replicas:
            assert compare_3PC_keys(prepareds[slow_master_replica.node.name],
                                    prepareds[rep.node.name]) > 0

    looper.run(eventually(chk2, timeout=delay_ic + 5))

    last_start_catchup_call_at = None
    no_more_catchup_call_at = None

    def chk3():
        # no_more_catchups_needed was called since node found no need of catchup
        nonlocal last_start_catchup_call_at, no_more_catchup_call_at
        assert (get_count(slow_node, slow_node.no_more_catchups_needed)
                - no_more_catchup_count) > 0

        no_more_catchup_call_at = slow_node.spylog.getLast(
            slow_node.no_more_catchups_needed).starttime

        last_start_catchup_call_at = slow_node.spylog.getLast(
            slow_node.start_catchup).starttime

    looper.run(eventually(chk3, timeout=delay_commit))

    # Release all delayed messages on the fast nodes.
    # (fix: the original called reset_delays_and_process_delayeds() twice per
    # node; the second call had nothing left to process)
    for n in fast_nodes:
        n.nodeIbStasher.reset_delays_and_process_delayeds()

    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)

    # A new catchup must have been started after the "no more catchups
    # needed" decision, i.e. triggered by the ViewChangeDone messages.
    assert slow_node.spylog.getLast(
        slow_node.start_catchup).starttime > no_more_catchup_call_at
    assert slow_node.spylog.getLast(
        slow_node.start_catchup).starttime > last_start_catchup_call_at
def test_catchup_not_triggered_if_another_in_progress(
        looper,
        chkFreqPatched,
        reqs_for_checkpoint,
        txnPoolNodeSet,
        sdk_pool_handle,
        sdk_wallet_client,
        broken_node_and_others):
    """
    A node misses 3pc messages and checkpoints during some period but later it
    stashes some amount of checkpoints and starts catchup. When the node is
    performing the catchup, it receives more checkpoints enough to start a new
    catchup but it does not start it because the first catchup is in progress.
    """
    max_batch_size = chkFreqPatched.Max3PCBatchSize
    broken_node, other_nodes = broken_node_and_others

    logger.info("Step 1: The node misses quite a lot of 3PC-messages and checkpoints")

    send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                           sdk_pool_handle, sdk_wallet_client,
                                           reqs_for_checkpoint + max_batch_size)

    waitNodeDataInequality(looper, broken_node, *other_nodes)

    logger.info(
        "Step 2: The node receives 3PC-messages but cannot process them because of "
        "missed ones. But the node eventually stashes some amount of checkpoints "
        "and after that starts catchup")

    repaired_node = repair_broken_node(broken_node)

    # Spy-counter baselines; the assertions below check deltas against these.
    initial_do_start_catchup_times = repaired_node.spylog.count(Node._do_start_catchup)
    initial_all_ledgers_caught_up = repaired_node.spylog.count(Node.allLedgersCaughtUp)

    # Delay catchup replies (cr_delay) so the first catchup stays in progress
    # for the whole duration of this `with` block.
    with delay_rules(repaired_node.nodeIbStasher, cr_delay()):
        send_reqs_batches_and_get_suff_replies(
            looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client,
            (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) * reqs_for_checkpoint - max_batch_size)

        ensure_all_nodes_have_same_data(looper, other_nodes)

        # The node must have entered catchup (Mode.discovering) exactly once.
        looper.run(
            eventually(lambda: assertExp(repaired_node.mode == Mode.discovering),
                       timeout=waits.expectedPoolInterconnectionTime(len(txnPoolNodeSet)) +
                               waits.expectedPoolConsistencyProof(len(txnPoolNodeSet))))

        assert repaired_node.spylog.count(Node._do_start_catchup) - initial_do_start_catchup_times == 1

        logger.info(
            "Step 3: While doing the catchup, the node receives new checkpoints "
            "enough to start a new catchup but the node does not start it because "
            "the former is in progress")

        process_checkpoint_times_before = \
            repaired_node.master_replica._checkpointer.spylog.count(CheckpointService.process_checkpoint)

        send_reqs_batches_and_get_suff_replies(
            looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client,
            (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) * reqs_for_checkpoint)

        # Wait until the node receives the new checkpoints from all the other nodes
        looper.run(
            eventually(lambda: assertExp(
                repaired_node.master_replica._checkpointer.spylog.count(CheckpointService.process_checkpoint) -
                process_checkpoint_times_before ==
                (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) * (len(txnPoolNodeSet) - 1)),
                timeout=waits.expectedPoolInterconnectionTime(len(txnPoolNodeSet))))

        # New catchup is not started when another one is in progress
        assert repaired_node.spylog.count(Node._do_start_catchup) - initial_do_start_catchup_times == 1
        assert repaired_node.mode == Mode.discovering

    # Leaving the `with` block releases the delayed catchup replies, letting
    # the catchup finish.
    logger.info("Step 4: The node completes the catchup. The ledger has been "
                "updated to the level determined on its start")

    looper.run(
        eventually(lambda: assertExp(repaired_node.mode == Mode.participating),
                   timeout=waits.expectedPoolCatchupTime(len(txnPoolNodeSet))))

    # Still only the single catchup happened, and it completed.
    assert repaired_node.spylog.count(Node._do_start_catchup) - initial_do_start_catchup_times == 1
    assert repaired_node.spylog.count(Node.allLedgersCaughtUp) - initial_all_ledgers_caught_up == 1
    assert repaired_node.domainLedger.size == other_nodes[0].domainLedger.size
def test_catchup_not_triggered_if_another_in_progress(
        looper,
        chkFreqPatched,
        reqs_for_checkpoint,
        txnPoolNodeSet,
        sdk_pool_handle,
        sdk_wallet_client,
        broken_node_and_others):
    """
    A node misses 3pc messages and checkpoints during some period but later it
    stashes some amount of checkpoints and starts catchup. When the node is
    performing the catchup, it receives more checkpoints enough to start a new
    catchup but it does not start it because the first catchup is in progress.

    NOTE(review): a variant of this test (discovering-mode / CheckpointService
    spylog) also appears in this source — confirm the two definitions belong
    to different modules.
    """
    max_batch_size = chkFreqPatched.Max3PCBatchSize
    broken_node, other_nodes = broken_node_and_others

    logger.info("Step 1: The node misses quite a lot of 3PC-messages and checkpoints")

    send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                           sdk_pool_handle, sdk_wallet_client,
                                           reqs_for_checkpoint + max_batch_size)

    waitNodeDataInequality(looper, broken_node, *other_nodes)

    logger.info(
        "Step 2: The node receives 3PC-messages but cannot process them because of "
        "missed ones. But the node eventually stashes some amount of checkpoints "
        "and after that starts catchup")

    repaired_node = repair_broken_node(broken_node)

    # Spy-counter baselines; the assertions below check deltas against these.
    initial_do_start_catchup_times = repaired_node.spylog.count(Node._do_start_catchup)
    initial_all_ledgers_caught_up = repaired_node.spylog.count(Node.allLedgersCaughtUp)

    # Delay catchup replies (cr_delay) so the first catchup stays in progress
    # for the whole duration of this `with` block.
    with delay_rules(repaired_node.nodeIbStasher, cr_delay()):
        send_reqs_batches_and_get_suff_replies(
            looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client,
            (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) * reqs_for_checkpoint - max_batch_size)

        ensure_all_nodes_have_same_data(looper, other_nodes)
        # Remember the ledger size the catchup was started against; the final
        # assertion checks the node caught up exactly to this level.
        target_ledger_size = other_nodes[0].domainLedger.size

        # The node must have entered catchup (Mode.syncing) exactly once.
        looper.run(
            eventually(lambda: assertExp(repaired_node.mode == Mode.syncing),
                       timeout=waits.expectedPoolInterconnectionTime(len(txnPoolNodeSet)) +
                               waits.expectedPoolConsistencyProof(len(txnPoolNodeSet))))

        assert repaired_node.spylog.count(Node._do_start_catchup) - initial_do_start_catchup_times == 1

        logger.info(
            "Step 3: While doing the catchup, the node receives new checkpoints "
            "enough to start a new catchup but the node does not start it because "
            "the former is in progress")

        process_checkpoint_times_before = repaired_node.master_replica.spylog.count(Replica.processCheckpoint)

        send_reqs_batches_and_get_suff_replies(
            looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client,
            (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) * reqs_for_checkpoint)

        # Wait until the node receives the new checkpoints from all the other nodes
        looper.run(
            eventually(lambda: assertExp(
                repaired_node.master_replica.spylog.count(Replica.processCheckpoint) -
                process_checkpoint_times_before ==
                (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) * (len(txnPoolNodeSet) - 1)),
                timeout=waits.expectedPoolInterconnectionTime(len(txnPoolNodeSet))))

        # New catchup is not started when another one is in progress
        assert repaired_node.spylog.count(Node._do_start_catchup) - initial_do_start_catchup_times == 1
        assert repaired_node.mode == Mode.syncing

    # Leaving the `with` block releases the delayed catchup replies, letting
    # the catchup finish.
    logger.info("Step 4: The node completes the catchup. The ledger has been "
                "updated to the level determined on its start")

    looper.run(
        eventually(lambda: assertExp(repaired_node.mode == Mode.participating),
                   timeout=waits.expectedPoolCatchupTime(len(txnPoolNodeSet))))

    # Still only the single catchup happened, and it completed at the level
    # captured when it started.
    assert repaired_node.spylog.count(Node._do_start_catchup) - initial_do_start_catchup_times == 1
    assert repaired_node.spylog.count(Node.allLedgersCaughtUp) - initial_all_ledgers_caught_up == 1
    assert repaired_node.domainLedger.size == target_ledger_size