def test_catchup_with_one_slow_node(tdir, tconf, looper, txnPoolNodeSet,
                                    sdk_pool_handle, sdk_wallet_client,
                                    allPluginsPath, logsearch):
    '''
    1. Stop the node Delta.
    2. Order 9 txns. In the first round of sending CatchupReqs every node
       [Alpha, Beta, Gamma] will receive a request for 3 txns.
    3. Delay CatchupReq messages on Alpha.
    4. Start Delta.
    5. Check that all nodes have equal data.
    6. Check that Delta re-asks for CatchupReps only once. In the second round
       of CatchupReqs (the first re-ask) Delta shouldn't request a CatchupRep
       from Alpha because it didn't answer earlier. If the behavior were wrong
       and Delta re-asked for txns from all nodes, every node would receive a
       request for 1 txn, Alpha would not answer, and Delta would need another
       re-ask round.
    The split/re-ask arithmetic is illustrated in the sketch after this test.
    '''
    # Prepare nodes
    lagging_node = txnPoolNodeSet[-1]
    rest_nodes = txnPoolNodeSet[:-1]

    # Stop one node
    waitNodeDataEquality(looper, lagging_node, *rest_nodes)
    disconnect_node_and_ensure_disconnected(looper,
                                            txnPoolNodeSet,
                                            lagging_node,
                                            stopNode=True)
    looper.removeProdable(lagging_node)

    # Send more requests to active nodes
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, len(rest_nodes) * 3)
    waitNodeDataEquality(looper, *rest_nodes)

    # Restart stopped node and wait for successful catch up
    lagging_node = start_stopped_node(
        lagging_node,
        looper,
        tconf,
        tdir,
        allPluginsPath,
        start=False,
    )

    log_re_ask, _ = logsearch(
        msgs=['requesting .* missing transactions after timeout'])
    old_re_ask_count = len(log_re_ask)

    # Delay CatchupReq messages on Alpha
    with delay_rules(rest_nodes[0].nodeIbStasher, cqDelay()):
        with delay_rules(lagging_node.nodeIbStasher, cs_delay()):
            looper.add(lagging_node)
            txnPoolNodeSet[-1] = lagging_node
            looper.run(checkNodesConnected(txnPoolNodeSet))
            waitNodeDataEquality(looper, *txnPoolNodeSet, customTimeout=120)
            assert len(log_re_ask) - old_re_ask_count == 2  # for audit and domain ledgers
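
# The arithmetic behind steps 2 and 6 of the docstring above can be shown with
# a small standalone sketch. `split_catchup_range` is a hypothetical helper
# written only for this illustration; it is not plenum's LedgerManager API.

def split_catchup_range(start_seq_no, end_seq_no, node_names):
    """Evenly split the [start_seq_no, end_seq_no] range of missing txns
    across the given nodes, giving any remainder to the last node."""
    total = end_seq_no - start_seq_no + 1
    per_node = total // len(node_names)
    requests = {}
    frm = start_seq_no
    for i, name in enumerate(node_names):
        to = end_seq_no if i == len(node_names) - 1 else frm + per_node - 1
        requests[name] = (frm, to)
        frm = to + 1
    return requests


# First round: 9 missing txns split across Alpha, Beta and Gamma -> 3 txns each.
assert split_catchup_range(1, 9, ['Alpha', 'Beta', 'Gamma']) == {
    'Alpha': (1, 3), 'Beta': (4, 6), 'Gamma': (7, 9)}

# Re-ask round: only the txns Alpha failed to provide are requested again,
# and only from the nodes that answered earlier.
assert split_catchup_range(1, 3, ['Beta', 'Gamma']) == {
    'Beta': (1, 1), 'Gamma': (2, 3)}
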
def test_3pc_while_catchup_with_chkpoints_only(
        tdir, tconf, looper, chkFreqPatched, reqs_for_checkpoint,
        testNodeClass, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client,
        allPluginsPath):
    '''
    Check that catch-up is not started again even if a quorum of stashed
    checkpoints is received during catch-up.
    Assume that only checkpoints and no 3PC messages are received.
    '''
    # Prepare nodes
    lagging_node = txnPoolNodeSet[-1]
    rest_nodes = txnPoolNodeSet[:-1]

    # Check that requests are executed
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)

    # Stop one node
    waitNodeDataEquality(looper, lagging_node, *rest_nodes)
    disconnect_node_and_ensure_disconnected(looper,
                                            txnPoolNodeSet,
                                            lagging_node,
                                            stopNode=True)
    looper.removeProdable(lagging_node)

    # Send more requests to active nodes
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)
    waitNodeDataEquality(looper, *rest_nodes)

    # Restart stopped node and wait for successful catch up
    lagging_node = start_stopped_node(
        lagging_node,
        looper,
        tconf,
        tdir,
        allPluginsPath,
        start=False,
    )

    initial_all_ledgers_caught_up = lagging_node.spylog.count(
        Node.allLedgersCaughtUp)

    # delay all 3PC messages on the lagged node so that it
    # receives only Checkpoints and catch-up messages
    lagging_node.nodeIbStasher.delay(ppDelay())
    lagging_node.nodeIbStasher.delay(pDelay())
    lagging_node.nodeIbStasher.delay(cDelay())

    # delay CurrentState to avoid primary propagation (since it would lead to
    # more catch-ups not needed in this test)
    with delay_rules(lagging_node.nodeIbStasher, cs_delay()):
        with delay_rules(lagging_node.nodeIbStasher, lsDelay(), msg_rep_delay()):
            looper.add(lagging_node)
            txnPoolNodeSet[-1] = lagging_node
            looper.run(checkNodesConnected(txnPoolNodeSet))

            # wait until we get ledger statuses for messages missed while the
            # node was offline, so that now we can order more messages, and
            # they will not be caught up, but stashed
            looper.run(
                eventually(lambda: assertExp(
                    lagging_node.nodeIbStasher.num_of_stashed(LedgerStatus) >= 3),
                           retryWait=1,
                           timeout=60))

            assert lagging_node.mode != Mode.participating

            # make sure that more requests are being ordered while catch-up is
            # in progress; stash enough stable checkpoints for starting a catch-up
            num_checkpoints = Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1
            num_reqs = reqs_for_checkpoint * num_checkpoints + 1
            sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                                      sdk_wallet_client, num_reqs)
            looper.run(
                eventually(check_last_ordered_3pc_on_all_replicas, rest_nodes,
                           (0, num_reqs + 2)))

            # all good nodes stabilized checkpoint
            looper.run(eventually(chkChkpoints, rest_nodes, 2, 0))

            assert lagging_node.mode != Mode.participating
            # lagging node is catching up and stashing all checkpoints
            looper.run(
                eventually(lambda: assertExp(
                    get_stashed_checkpoints(lagging_node) ==
                    num_checkpoints * len(rest_nodes)),
                           timeout=waits.expectedPoolCatchupTime(
                               len(txnPoolNodeSet))))

    # check that last_ordered is set
    looper.run(
        eventually(check_last_ordered_3pc_on_master, [lagging_node],
                   (0, num_reqs + 2)))

    # check that the catch-up is finished
    looper.run(
        eventually(
            lambda: assertExp(lagging_node.mode == Mode.participating),
            retryWait=1,
            timeout=waits.expectedPoolCatchupTime(len(txnPoolNodeSet))))

    # check that catch-up was started only once
    assert lagging_node.spylog.count(
        Node.allLedgersCaughtUp) == initial_all_ledgers_caught_up + 1
    looper.run(
        eventually(lambda: assertExp(
            lagging_node.spylog.count(Node.allLedgersCaughtUp) ==
            initial_all_ledgers_caught_up + 1)))
    looper.run(
        eventually(lambda: assertExp(
            lagging_node.spylog.count(Node.start_catchup) == 1)))

    # do not check the audit ledger, since the audit ledger is not caught up
    # when txns are ordered during catch-up
    waitNodeDataEquality(looper, *txnPoolNodeSet,
                         exclude_from_check='check_audit',
                         customTimeout=5)
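
# The expected last_ordered seq_no (0, num_reqs + 2) asserted above can be
# reconstructed with simple counting. This is a sketch of the arithmetic only;
# the concrete fixture values below are assumed for illustration (the asserted
# relation does not depend on them).

assumed_reqs_for_checkpoint = 5            # assumed example value of the fixture
assumed_stashed_chkpts_before_catchup = 1  # assumed value of Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP

sketch_num_checkpoints = assumed_stashed_chkpts_before_catchup + 1
sketch_num_reqs = assumed_reqs_for_checkpoint * sketch_num_checkpoints + 1

ordered_before_stop = 1     # the first sdk_send_random_and_check in the test
ordered_while_stopped = 1   # the second sdk_send_random_and_check in the test
expected_last_ordered = (0, ordered_before_stop + ordered_while_stopped + sketch_num_reqs)

assert expected_last_ordered == (0, sketch_num_reqs + 2)
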
def test_3pc_while_catchup_with_chkpoints(tdir, tconf, looper, chkFreqPatched,
                                          reqs_for_checkpoint, testNodeClass,
                                          txnPoolNodeSet, sdk_pool_handle,
                                          sdk_wallet_client, allPluginsPath):
    # Prepare nodes
    lagging_node = txnPoolNodeSet[-1]
    rest_nodes = txnPoolNodeSet[:-1]

    # Check that requests are executed
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 10)

    # Stop one node
    waitNodeDataEquality(looper, lagging_node, *rest_nodes)
    disconnect_node_and_ensure_disconnected(looper,
                                            txnPoolNodeSet,
                                            lagging_node,
                                            stopNode=True)
    looper.removeProdable(lagging_node)

    # Send more requests to active nodes
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)
    waitNodeDataEquality(looper, *rest_nodes)

    # Restart stopped node and wait for successful catch up
    lagging_node = start_stopped_node(
        lagging_node,
        looper,
        tconf,
        tdir,
        allPluginsPath,
        start=False,
    )

    initial_all_ledgers_caught_up = lagging_node.spylog.count(
        Node.allLedgersCaughtUp)

    # delay CurrentState to avoid primary propagation (since it would lead to
    # more catch-ups not needed in this test)
    with delay_rules(lagging_node.nodeIbStasher, cs_delay()):
        with delay_rules(lagging_node.nodeIbStasher, cr_delay()):
            looper.add(lagging_node)
            txnPoolNodeSet[-1] = lagging_node
            looper.run(checkNodesConnected(txnPoolNodeSet))

            # wait until we get catchup replies for messages missed while the
            # node was offline, so that now we can order more messages, and
            # they will not be caught up, but stashed
            looper.run(
                eventually(lambda: assertExp(
                    len(lagging_node.nodeIbStasher.delayeds) >= 3),
                           retryWait=1,
                           timeout=60))

            assert lagging_node.mode == Mode.syncing

            # make sure that more requests are being ordered while catch-up is
            # in progress; stash enough stable checkpoints for starting a catch-up
            sdk_send_random_and_check(
                looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client,
                reqs_for_checkpoint * (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1))

            assert lagging_node.mode == Mode.syncing

    # check that the catch-up is finished
    looper.run(
        eventually(
            lambda: assertExp(lagging_node.mode == Mode.participating),
            retryWait=1,
            timeout=waits.expectedPoolCatchupTime(len(txnPoolNodeSet))))
    looper.run(
        eventually(
            lambda: assertExp(
                lagging_node.spylog.count(Node.allLedgersCaughtUp) ==
                initial_all_ledgers_caught_up + 1)))

    # check that catch-up was started only once
    looper.run(
        eventually(
            lambda: assertExp(
                lagging_node.spylog.count(Node.start_catchup) == 1)))

    # check that the node was able to order requests stashed during catch-up
    waitNodeDataEquality(looper, *txnPoolNodeSet, customTimeout=5)
def test_limited_stash_3pc_while_catchup(tdir, tconf, looper, testNodeClass,
                                         txnPoolNodeSet, sdk_pool_handle,
                                         sdk_wallet_client, allPluginsPath,
                                         chkFreqPatched):
    '''
    Test that the lagging_node can process messages from the catchup stash
    after catchup and can request lost messages from other nodes.
    '''
    # Prepare nodes
    lagging_node = txnPoolNodeSet[-1]
    rest_nodes = txnPoolNodeSet[:-1]

    # Check that requests are executed
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)

    # Stop one node
    waitNodeDataEquality(looper, lagging_node, *rest_nodes)
    disconnect_node_and_ensure_disconnected(looper,
                                            txnPoolNodeSet,
                                            lagging_node,
                                            stopNode=True)
    looper.removeProdable(lagging_node)

    # Order 2 checkpoints on rest_nodes (2 txns in 2 batches)
    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_client,
                                         2 * CHK_FREQ, 2)
    waitNodeDataEquality(looper, *rest_nodes)

    # Restart stopped node and wait for successful catch up
    lagging_node = start_stopped_node(
        lagging_node,
        looper,
        tconf,
        tdir,
        allPluginsPath,
        start=False,
    )

    initial_all_ledgers_caught_up = lagging_node.spylog.count(
        Node.allLedgersCaughtUp)

    with delay_rules(lagging_node.nodeIbStasher, cs_delay(),
                     msg_rep_delay(types_to_delay=[PREPARE, PREPREPARE])):
        with delay_rules(lagging_node.nodeIbStasher,
                         cr_delay(ledger_filter=DOMAIN_LEDGER_ID)):
            looper.add(lagging_node)
            txnPoolNodeSet[-1] = lagging_node
            looper.run(checkNodesConnected(txnPoolNodeSet))

            # Order 2 checkpoints during the first catchup of the lagging node
            # (2 txns in 2 batches)
            sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                                 sdk_pool_handle,
                                                 sdk_wallet_client,
                                                 2 * CHK_FREQ, 2)

        # Check that the first txn was ordered from the stash after the first catchup
        looper.run(
            eventually(lambda: assertExp(
                lagging_node.master_last_ordered_3PC[1] ==
                txnPoolNodeSet[0].master_last_ordered_3PC[1] - 1),
                       retryWait=1,
                       timeout=waits.expectedPoolCatchupTime(
                           len(txnPoolNodeSet))))

        # Order 2 checkpoints during the second catchup of the lagging node
        # (2 txns in 2 batches)
        sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                             sdk_pool_handle,
                                             sdk_wallet_client,
                                             2 * CHK_FREQ, 2)

    waitNodeDataEquality(looper, *txnPoolNodeSet, customTimeout=5)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)
    looper.run(
        eventually(lambda: assertExp(
            all(lagging_node.master_last_ordered_3PC == n.master_last_ordered_3PC
                for n in txnPoolNodeSet))))

    # check that catch-up was started only twice
    assert lagging_node.spylog.count(
        Node.allLedgersCaughtUp) == initial_all_ledgers_caught_up + 2
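
# The "2 checkpoints (2 txns in 2 batches)" comments above rely on the
# chkFreqPatched fixture. A sketch of the arithmetic, assuming the fixture sets
# CHK_FREQ = 1 (one 3PC batch per checkpoint), which is what makes 2 batches
# produce exactly 2 stable checkpoints; the assumed value is for illustration only.

assumed_chk_freq = 1                          # assumed patched CHK_FREQ
num_batches_per_call = 2
txns_sent_per_call = 2 * assumed_chk_freq     # the amount the test passes to sdk_send_batches_of_random_and_check

checkpoints_per_call = num_batches_per_call // assumed_chk_freq
assert txns_sent_per_call == 2
assert checkpoints_per_call == 2
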
def test_restart_node_with_view_changes(tdir, tconf, looper, txnPoolNodeSet,
                                        sdk_pool_handle, sdk_wallet_client,
                                        allPluginsPath):
    '''
    1. Stop the node Delta.
    2. Patch methods for processing VCStartMsgStrategy messages.
    3. Delay CurrentState messages on Delta.
    4. Start Delta.
    5. Start a view change for a master degradation reason (from view 0 to 1).
    6. Check that Delta starts VCStartMsgStrategy after a quorum of InstanceChanges.
    7. Reset the delay for CurrentStates.
    8. Check that propagate primary happened.
    9. Unpatch the VCStartMsgStrategy methods and process the delayed messages.
    10. Start a view change for a master degradation reason (from view 1 to 2).
    11. Check that all nodes have viewNo = 2 and can order transactions.
    '''
    # Prepare nodes
    lagging_node = txnPoolNodeSet[-1]
    rest_nodes = txnPoolNodeSet[:-1]
    start_view_no = lagging_node.viewNo

    # Stop Delta
    waitNodeDataEquality(looper, lagging_node, *rest_nodes)
    disconnect_node_and_ensure_disconnected(looper,
                                            txnPoolNodeSet,
                                            lagging_node,
                                            stopNode=True)
    looper.removeProdable(lagging_node)

    # Send more requests to active nodes
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, len(rest_nodes) * 3)
    waitNodeDataEquality(looper, *rest_nodes)

    # Restart stopped node
    lagging_node = start_stopped_node(
        lagging_node,
        looper,
        tconf,
        tdir,
        allPluginsPath,
        start=False,
    )

    # Add to the lagging_node's message router a patched method for processing
    # ViewChangeStartMessage to delay processing.
    global view_change_started_messages
    view_change_started_messages = []

    def patch_on_view_change_started(node, msg, frm):
        view_change_started_messages.append((node, msg, frm))

    processor = partial(patch_on_view_change_started, lagging_node)
    lagging_node.nodeMsgRouter.add((ViewChangeStartMessage, processor))

    # Delay CurrentState messages on lagging_node to delay propagate primary
    with delay_rules(lagging_node.nodeIbStasher, cs_delay()):
        # Add lagging_node to the pool
        looper.add(lagging_node)
        txnPoolNodeSet[-1] = lagging_node
        looper.run(checkNodesConnected(txnPoolNodeSet))
        looper.run(
            eventually(lambda: assertExp(
                len(lagging_node.nodeIbStasher.delayeds) >= 3)))

        # Change viewNo in the received CurrentStates.
        for cs in lagging_node.nodeIbStasher.delayeds:
            cs[0][0].viewNo = 1
            for vcd in cs[0][0].primary:
                vcd[f.VIEW_NO.nm] = 1

        # Start ViewChange (0 -> 1)
        for n in rest_nodes:
            n.view_changer.on_master_degradation()

        # Wait for the view change to start via InstanceChange messages on lagging_node
        looper.run(
            eventually(
                lambda: assertExp(len(view_change_started_messages) == 1)))

    looper.run(
        eventually(
            lambda: assertExp(lagging_node.viewNo == start_view_no + 1)))

    # Unpatch ViewChangeStartMessage processing and process the delayed messages
    processor = partial(VCStartMsgStrategy.on_view_change_started, lagging_node)
    lagging_node.nodeMsgRouter.add((ViewChangeStartMessage, processor))
    for msg in view_change_started_messages:
        lagging_node.view_changer.node.nodeInBox.append(
            (msg[1], lagging_node.view_changer.node.name))

    waitForViewChange(looper, txnPoolNodeSet,
                      expectedViewNo=start_view_no + 1,
                      customTimeout=waits.expectedPoolViewChangeStartedTimeout(
                          len(txnPoolNodeSet)))

    # Start ViewChange (1 -> 2)
    for n in rest_nodes:
        n.view_changer.on_master_degradation()
    waitForViewChange(looper, txnPoolNodeSet,
                      expectedViewNo=start_view_no + 2,
                      customTimeout=waits.expectedPoolViewChangeStartedTimeout(
                          len(txnPoolNodeSet)))
    ensureElectionsDone(looper=looper,
                        nodes=txnPoolNodeSet,
                        instances_list=range(
                            txnPoolNodeSet[0].requiredNumberOfInstances))
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)
    waitNodeDataEquality(looper, *txnPoolNodeSet)
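
# The patch/unpatch trick used above (steps 2 and 9 of the docstring) boils
# down to re-registering a handler for one message type in a router and later
# replaying the captured messages through the real handler. A minimal
# self-contained sketch of that pattern; ToyRouter and ToyMsg are illustrative
# stand-ins, not plenum's Router or ViewChangeStartMessage.

from functools import partial


class ToyMsg:
    pass


class ToyRouter:
    def __init__(self):
        self.routes = {}

    def add(self, route):
        msg_type, handler = route
        self.routes[msg_type] = handler  # a later add() overrides an earlier one

    def handle(self, msg, frm):
        self.routes[type(msg)](msg, frm)


captured = []


def capture(node_name, msg, frm):
    # Stand-in for patch_on_view_change_started: just record the message.
    captured.append((node_name, msg, frm))


toy_router = ToyRouter()
toy_router.add((ToyMsg, partial(capture, 'Delta')))
toy_router.handle(ToyMsg(), 'Alpha')
assert len(captured) == 1

# "Unpatching" is simply registering the real handler again and replaying
# whatever was captured through it.
replayed = []
toy_router.add((ToyMsg, lambda msg, frm: replayed.append((msg, frm))))
for _, captured_msg, captured_frm in captured:
    toy_router.handle(captured_msg, captured_frm)
assert len(replayed) == 1
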
def test_3pc_while_catchup(tdir, tconf, looper, testNodeClass, txnPoolNodeSet,
                           sdk_pool_handle, sdk_wallet_client, allPluginsPath):
    '''
    Tests that requests being ordered during catch-up are stashed and
    re-applied when catch-up is finished.
    '''
    # Prepare nodes
    lagging_node = txnPoolNodeSet[-1]
    rest_nodes = txnPoolNodeSet[:-1]

    # Check that requests are executed
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 10)

    # Stop one node
    waitNodeDataEquality(looper, lagging_node, *rest_nodes)
    disconnect_node_and_ensure_disconnected(looper,
                                            txnPoolNodeSet,
                                            lagging_node,
                                            stopNode=True)
    looper.removeProdable(lagging_node)

    # Send more requests to active nodes
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 10)
    waitNodeDataEquality(looper, *rest_nodes)

    # Restart stopped node and wait for successful catch up
    lagging_node = start_stopped_node(
        lagging_node,
        looper,
        tconf,
        tdir,
        allPluginsPath,
        start=False,
    )

    initial_all_ledgers_caught_up = lagging_node.spylog.count(
        Node.allLedgersCaughtUp)

    assert all(replica.stasher.num_stashed_catchup == 0
               for inst_id, replica in lagging_node.replicas.items())

    # delay CurrentState to avoid primary propagation (since it would lead to
    # more catch-ups not needed in this test)
    with delay_rules(lagging_node.nodeIbStasher, cs_delay()):
        with delay_rules(lagging_node.nodeIbStasher,
                         cr_delay(ledger_filter=DOMAIN_LEDGER_ID)):
            looper.add(lagging_node)
            txnPoolNodeSet[-1] = lagging_node
            looper.run(checkNodesConnected(txnPoolNodeSet))

            # wait until we get catchup replies for messages missed while the
            # node was offline, so that now we can order more messages, and
            # they will not be caught up, but stashed
            looper.run(
                eventually(lambda: assertExp(
                    lagging_node.nodeIbStasher.num_of_stashed(CatchupRep) >= 3),
                           retryWait=1,
                           timeout=60))

            # make sure that more requests are being ordered while catch-up is in progress
            sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                                      sdk_wallet_client, 10)

            assert lagging_node.mode == Mode.syncing
            assert all(replica.stasher.num_stashed_catchup > 0
                       for inst_id, replica in lagging_node.replicas.items())

    # check that the catch-up is finished
    looper.run(
        eventually(
            lambda: assertExp(lagging_node.mode == Mode.participating),
            retryWait=1,
            timeout=waits.expectedPoolCatchupTime(len(txnPoolNodeSet))))
    looper.run(
        eventually(lambda: assertExp(
            lagging_node.spylog.count(Node.allLedgersCaughtUp) ==
            initial_all_ledgers_caught_up + 1)))

    # check that the node was able to order requests stashed during catch-up;
    # do not check the audit ledger, since the audit ledger is not caught up
    # when txns are ordered during catch-up
    waitNodeDataEquality(looper, *txnPoolNodeSet,
                         exclude_from_check='check_audit',
                         customTimeout=5)

    assert all(replica.stasher.num_stashed_catchup == 0
               for inst_id, replica in lagging_node.replicas.items())
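
# The three num_stashed_catchup assertions above describe a simple lifecycle:
# nothing stashed before catch-up, 3PC messages stashed while catching up, and
# an empty stash once they are re-applied. A minimal sketch of such a stasher;
# MiniStasher is illustrative only, not plenum's replica stasher.

class MiniStasher:
    def __init__(self):
        self.catching_up = False
        self._stash = []

    @property
    def num_stashed_catchup(self):
        return len(self._stash)

    def process(self, msg, apply):
        # While catching up, 3PC messages are stashed instead of applied.
        if self.catching_up:
            self._stash.append(msg)
        else:
            apply(msg)

    def unstash_all(self, apply):
        # On catch-up completion the stash is drained and re-applied in order.
        while self._stash:
            apply(self._stash.pop(0))


applied_msgs = []
mini_stasher = MiniStasher()
assert mini_stasher.num_stashed_catchup == 0

mini_stasher.catching_up = True
for seq_no in range(1, 4):
    mini_stasher.process(seq_no, applied_msgs.append)
assert mini_stasher.num_stashed_catchup > 0

mini_stasher.catching_up = False
mini_stasher.unstash_all(applied_msgs.append)
assert mini_stasher.num_stashed_catchup == 0
assert applied_msgs == [1, 2, 3]
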