def testNodeCatchupAfterDisconnect( sdk_new_node_caught_up, txnPoolNodeSet, sdk_node_set_with_node_added_after_some_txns): """ A node that disconnects after some transactions should eventually get the transactions which happened while it was disconnected :return: """ looper, new_node, sdk_pool_handle, new_steward_wallet_handle = \ sdk_node_set_with_node_added_after_some_txns logger.debug("Disconnecting node {} with pool ledger size {}".format( new_node, new_node.poolManager.txnSeqNo)) disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, new_node, stopNode=False) # TODO: Check if the node has really stopped processing requests? logger.debug("Sending requests") sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, new_steward_wallet_handle, 5) # Make sure new node got out of sync waitNodeDataInequality(looper, new_node, *txnPoolNodeSet[:-1]) logger.debug("Connecting the stopped node, {}".format(new_node)) reconnect_node_and_ensure_connected(looper, txnPoolNodeSet, new_node) logger.debug("Waiting for the node to catch up, {}".format(new_node)) waitNodeDataEquality(looper, new_node, *txnPoolNodeSet[:-1]) logger.debug("Sending more requests") sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, new_steward_wallet_handle, 10) checkNodeDataForEquality(new_node, *txnPoolNodeSet[:-1])
def test_node_catchup_after_checkpoints(looper, txnPoolNodeSet, chk_freq_patched, wallet1, client1, client1Connected, slow_node_and_others): """ For some reason a node misses 3pc messages but eventually the node stashes some amount checkpoints and decides to catchup. """ slow_node, other_nodes = slow_node_and_others completed_catchups_before = get_number_of_completed_catchups(slow_node) logger.info("Step 1: Send less than required for start the catchup procedure" "and check the slow node falls behind") batches_num = 2 * chk_freq_patched - 1 send_reqs_batches_and_get_suff_replies(looper, wallet1, client1, num_reqs=batches_num, num_batches=batches_num, ) waitNodeDataInequality(looper, slow_node, *other_nodes) logger.info("Step 2: Send remaining requests in order to trigger the catchup" "procedure for the slow node, then check data equality") send_reqs_batches_and_get_suff_replies(looper, wallet1, client1, num_reqs=1 ) waitNodeDataEquality(looper, slow_node, *other_nodes) # check if there was a catchup completed_catchups_after = get_number_of_completed_catchups(slow_node) assert completed_catchups_after == completed_catchups_before + 1
def testNodeCatchupAfterDisconnect(sdk_new_node_caught_up, txnPoolNodeSet, sdk_node_set_with_node_added_after_some_txns): """ A node that disconnects after some transactions should eventually get the transactions which happened while it was disconnected :return: """ looper, new_node, sdk_pool_handle, new_steward_wallet_handle = \ sdk_node_set_with_node_added_after_some_txns logger.debug("Disconnecting node {} with pool ledger size {}". format(new_node, new_node.poolManager.txnSeqNo)) disconnect_node_and_ensure_disconnected( looper, txnPoolNodeSet, new_node, stopNode=False) # TODO: Check if the node has really stopped processing requests? logger.debug("Sending requests") sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, new_steward_wallet_handle, 5) # Make sure new node got out of sync waitNodeDataInequality(looper, new_node, *txnPoolNodeSet[:-1]) logger.debug("Connecting the stopped node, {}".format(new_node)) reconnect_node_and_ensure_connected(looper, txnPoolNodeSet, new_node) logger.debug("Waiting for the node to catch up, {}".format(new_node)) waitNodeDataEquality(looper, new_node, *txnPoolNodeSet[:-1]) logger.debug("Sending more requests") sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, new_steward_wallet_handle, 10) checkNodeDataForEquality(new_node, *txnPoolNodeSet[:-1])
def testNodeCatchupAfterDisconnect(newNodeCaughtUp, txnPoolNodeSet, nodeSetWithNodeAddedAfterSomeTxns): """ A node that disconnects after some transactions should eventually get the transactions which happened while it was disconnected :return: """ looper, newNode, client, wallet, _, _ = nodeSetWithNodeAddedAfterSomeTxns logger.debug("Stopping node {} with pool ledger size {}".format( newNode, newNode.poolManager.txnSeqNo)) disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, newNode, stopNode=False) looper.removeProdable(newNode) # TODO: Check if the node has really stopped processing requests? logger.debug("Sending requests") sendReqsToNodesAndVerifySuffReplies(looper, wallet, client, 5) # Make sure new node got out of sync waitNodeDataInequality(looper, newNode, *txnPoolNodeSet[:-1]) logger.debug("Starting the stopped node, {}".format(newNode)) looper.add(newNode) reconnect_node_and_ensure_connected(looper, txnPoolNodeSet, newNode) logger.debug("Waiting for the node to catch up, {}".format(newNode)) waitNodeDataEquality(looper, newNode, *txnPoolNodeSet[:-1]) logger.debug("Sending more requests") sendReqsToNodesAndVerifySuffReplies(looper, wallet, client, 10) checkNodeDataForEquality(newNode, *txnPoolNodeSet[:-1])
def testNodeCatchupAfterLostConnection(newNodeCaughtUp, txnPoolNodeSet, nodeSetWithNodeAddedAfterSomeTxns): """ A node that has poor internet connection and got unsynced after some transactions should eventually get the transactions which happened while it was not accessible :return: """ looper, newNode, client, wallet, _, _ = nodeSetWithNodeAddedAfterSomeTxns logger.debug("Disconnecting node {}, ledger size {}". format(newNode, newNode.domainLedger.size)) disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, newNode, stopNode=False) looper.removeProdable(newNode) # TODO: Check if the node has really stopped processing requests? logger.debug("Sending requests") sendReqsToNodesAndVerifySuffReplies(looper, wallet, client, 5) # Make sure new node got out of sync waitNodeDataInequality(looper, newNode, *txnPoolNodeSet[:-1]) # logger.debug("Ensure node {} gets disconnected".format(newNode)) ensure_node_disconnected(looper, newNode, txnPoolNodeSet[:-1]) logger.debug("Connecting the node {} back, ledger size {}". format(newNode, newNode.domainLedger.size)) looper.add(newNode) logger.debug("Waiting for the node to catch up, {}".format(newNode)) waitNodeDataEquality(looper, newNode, *txnPoolNodeSet[:-1]) logger.debug("Sending more requests") sendReqsToNodesAndVerifySuffReplies(looper, wallet, client, 10) checkNodeDataForEquality(newNode, *txnPoolNodeSet[:-1])
def test_caught_up_for_current_view_check(looper, txnPoolNodeSet, client1, wallet1, client1Connected): """ One of the node experiences poor network and loses 3PC messages. It has to do multiple rounds of catchup to be caught up """ sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 3 * Max3PCBatchSize) ensure_all_nodes_have_same_data(looper, txnPoolNodeSet) nprs = getNonPrimaryReplicas(txnPoolNodeSet, 0) bad_node = nprs[-1].node other_nodes = [n for n in txnPoolNodeSet if n != bad_node] orig_method = bad_node.master_replica.dispatchThreePhaseMsg # Bad node does not process any 3 phase messages, equivalent to messages # being lost def bad_method(self, m, s): pass bad_node.master_replica.dispatchThreePhaseMsg = types.MethodType( bad_method, bad_node.master_replica) sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 6 * Max3PCBatchSize) waitNodeDataInequality(looper, bad_node, *other_nodes) # Patch all nodes to return ConsistencyProof of a smaller ledger to the # bad node but only once, so that the bad_node needs to do catchup again. make_a_node_catchup_twice(bad_node, other_nodes, DOMAIN_LEDGER_ID, Max3PCBatchSize) def is_catchup_needed_count(): return len( getAllReturnVals(bad_node, bad_node.is_catchup_needed, compare_val_to=True)) def caught_up_for_current_view_count(): return len( getAllReturnVals(bad_node, bad_node.caught_up_for_current_view, compare_val_to=True)) old_count_1 = is_catchup_needed_count() old_count_2 = caught_up_for_current_view_count() ensure_view_change(looper, txnPoolNodeSet) checkProtocolInstanceSetup(looper, txnPoolNodeSet, retryWait=1) ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet) assert is_catchup_needed_count() > old_count_1 # The bad_node caught up due to receiving sufficient ViewChangeDone # messages assert caught_up_for_current_view_count() > old_count_2 bad_node.master_replica.dispatchThreePhaseMsg = types.MethodType( orig_method, bad_node.master_replica)
def testNodeCatchupFPlusOne(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_steward, tconf, tdir, tdirWithPoolTxns, allPluginsPath, testNodeClass): """ Check that f+1 nodes is enough for catchup """ assert len(txnPoolNodeSet) == 4 node1 = txnPoolNodeSet[-1] node0 = txnPoolNodeSet[-2] logger.debug("Stopping node0 with pool ledger size {}".format( node0.poolManager.txnSeqNo)) disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, node0, stopNode=True) looper.removeProdable(node0) logger.debug("Sending requests") sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_steward, 5) logger.debug("Stopping node1 with pool ledger size {}".format( node1.poolManager.txnSeqNo)) disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, node1, stopNode=True) looper.removeProdable(node1) # Make sure new node got out of sync # Excluding state check since the node is stopped hence the state db is closed waitNodeDataInequality(looper, node0, *txnPoolNodeSet[:-2], exclude_from_check=['check_state']) # TODO: Check if the node has really stopped processing requests? logger.debug("Starting the stopped node0") nodeHa, nodeCHa = HA(*node0.nodestack.ha), HA(*node0.clientstack.ha) config_helper = PNodeConfigHelper(node0.name, tconf, chroot=tdir) node0 = testNodeClass(node0.name, config_helper=config_helper, ha=nodeHa, cliha=nodeCHa, config=tconf, pluginPaths=allPluginsPath) looper.add(node0) logger.debug("Waiting for the node0 to catch up") waitNodeDataEquality(looper, node0, *txnPoolNodeSet[:-2]) logger.debug("Sending more requests") sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_steward, 2) checkNodeDataForEquality(node0, *txnPoolNodeSet[:-2])
def test_no_catchup_if_got_from_3pc(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client): """ A node is slow to receive COMMIT messages so after a view change it starts catchup. But before it can start requesting txns, the COMMITs messages are received and are ordered. The node should not request any transactions. :return: """ send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 2 * 3, 3) ensure_all_nodes_have_same_data(looper, txnPoolNodeSet) slow_node = getNonPrimaryReplicas(txnPoolNodeSet, 0)[-1].node other_nodes = [n for n in txnPoolNodeSet if n != slow_node] delay_cm = 30 delat_cp = 100 slow_node.nodeIbStasher.delay(cDelay(delay_cm)) # The slow node receives consistency proofs after some delay, this delay # gives the opportunity to deliver all 3PC messages slow_node.nodeIbStasher.delay(cpDelay(delat_cp)) # Count of `getCatchupReqs` which is called to construct the `CatchupReq` # to be sent def domain_cr_count(): return sum(1 for entry in slow_node.ledgerManager.spylog.getAll( slow_node.ledgerManager.getCatchupReqs) if entry.params['consProof'].ledgerId == DOMAIN_LEDGER_ID) old_count = domain_cr_count() sent_batches = 10 send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 2 * sent_batches, sent_batches) ensure_view_change(looper, nodes=txnPoolNodeSet) # After view change, the `slow_node` is behind waitNodeDataInequality(looper, slow_node, *other_nodes) # Unstash only COMMIT messages slow_node.nodeIbStasher.reset_delays_and_process_delayeds(Commit.typename) looper.runFor(2) slow_node.nodeIbStasher.reset_delays_and_process_delayeds( ConsistencyProof.typename) waitNodeDataEquality(looper, slow_node, *other_nodes) # No `CatchupReq`s constructed, hence no `CatchupReq`s could have # been sent assert domain_cr_count() == old_count # Some stashed ordered requests have been processed rv = getAllReturnVals(slow_node, slow_node.processStashedOrderedReqs) assert sent_batches in rv sdk_ensure_pool_functional(looper, txnPoolNodeSet, sdk_wallet_client, sdk_pool_handle)
def test_node_catchup_after_checkpoints( looper, chkFreqPatched, reqs_for_checkpoint, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, broken_node_and_others): """ A node misses 3pc messages and checkpoints during some period but later it stashes some amount of checkpoints and decides to catchup. """ max_batch_size = chkFreqPatched.Max3PCBatchSize broken_node, other_nodes = broken_node_and_others logger.info("Step 1: The node misses quite a lot of requests") send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, reqs_for_checkpoint + max_batch_size) waitNodeDataInequality(looper, broken_node, *other_nodes) logger.info( "Step 2: The node gets requests but cannot process them because of " "missed ones. But the nodes eventually stashes some amount checkpoints " "after that the node starts catch up") repaired_node = repair_broken_node(broken_node) completed_catchups_before = get_number_of_completed_catchups(broken_node) send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) * reqs_for_checkpoint - max_batch_size) waitNodeDataEquality(looper, repaired_node, *other_nodes) # Note that the repaired node might not fill the gap of missed 3PC-messages # by requesting them from other nodes because these 3PC-batches start from # an already stabilized checkpoint, so a part of these 3PC-messages are # already unavailable # Verify that a catch-up was done completed_catchups_after = get_number_of_completed_catchups(repaired_node) assert completed_catchups_after > completed_catchups_before logger.info("Step 3: Check if the node is able to process requests") send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, reqs_for_checkpoint + max_batch_size) waitNodeDataEquality(looper, repaired_node, *other_nodes)
def testNodeCatchupFPlusOne(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_steward, tconf, tdir, tdirWithPoolTxns, allPluginsPath, testNodeClass): """ Check that f+1 nodes is enough for catchup """ assert len(txnPoolNodeSet) == 4 node1 = txnPoolNodeSet[-1] node0 = txnPoolNodeSet[-2] logger.debug("Stopping node0 with pool ledger size {}". format(node0.poolManager.txnSeqNo)) disconnect_node_and_ensure_disconnected( looper, txnPoolNodeSet, node0, stopNode=True) looper.removeProdable(node0) logger.debug("Sending requests") sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_steward, 5) logger.debug("Stopping node1 with pool ledger size {}". format(node1.poolManager.txnSeqNo)) disconnect_node_and_ensure_disconnected( looper, txnPoolNodeSet, node1, stopNode=True) looper.removeProdable(node1) # Make sure new node got out of sync # Excluding state check since the node is stopped hence the state db is closed waitNodeDataInequality(looper, node0, *txnPoolNodeSet[:-2], exclude_from_check=['check_state']) # TODO: Check if the node has really stopped processing requests? logger.debug("Starting the stopped node0") nodeHa, nodeCHa = HA(*node0.nodestack.ha), HA(*node0.clientstack.ha) config_helper = PNodeConfigHelper(node0.name, tconf, chroot=tdir) node0 = testNodeClass(node0.name, config_helper=config_helper, ha=nodeHa, cliha=nodeCHa, config=tconf, pluginPaths=allPluginsPath) looper.add(node0) logger.debug("Waiting for the node0 to catch up") waitNodeDataEquality(looper, node0, *txnPoolNodeSet[:-2]) logger.debug("Sending more requests") sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_steward, 2) checkNodeDataForEquality(node0, *txnPoolNodeSet[:-2])
def testNodeCatchupFPlusOne(txnPoolNodeSet, poolAfterSomeTxns, tconf, tdir, tdirWithPoolTxns, allPluginsPath, testNodeClass): """ Check that f+1 nodes is enough for catchup """ looper, client, wallet = poolAfterSomeTxns assert len(txnPoolNodeSet) == 4 node1 = txnPoolNodeSet[-1] node0 = txnPoolNodeSet[-2] logger.debug("Stopping node0 with pool ledger size {}".format( node0.poolManager.txnSeqNo)) disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, node0, stopNode=True) looper.removeProdable(node0) logger.debug("Sending requests") sendReqsToNodesAndVerifySuffReplies(looper, wallet, client, 5) logger.debug("Stopping node1 with pool ledger size {}".format( node1.poolManager.txnSeqNo)) disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, node1, stopNode=True) looper.removeProdable(node1) # Make sure new node got out of sync waitNodeDataInequality(looper, node0, *txnPoolNodeSet[:-2]) # TODO: Check if the node has really stopped processing requests? logger.debug("Starting the stopped node0") nodeHa, nodeCHa = HA(*node0.nodestack.ha), HA(*node0.clientstack.ha) config_helper = PNodeConfigHelper(node0.name, tconf, chroot=tdir) node0 = testNodeClass(node0.name, config_helper=config_helper, ha=nodeHa, cliha=nodeCHa, config=tconf, pluginPaths=allPluginsPath) looper.add(node0) logger.debug("Waiting for the node0 to catch up") waitNodeDataEquality(looper, node0, *txnPoolNodeSet[:-2]) logger.debug("Sending more requests") sendReqsToNodesAndVerifySuffReplies(looper, wallet, client, 2) checkNodeDataForEquality(node0, *txnPoolNodeSet[:-2])
def test_node_catchup_causes_no_desync(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, monkeypatch, chkFreqPatched, reqs_for_checkpoint): """ Checks that transactions received by catchup do not break performance monitoring """ max_batch_size = chkFreqPatched.Max3PCBatchSize lagging_node = get_any_non_primary_node(txnPoolNodeSet) rest_nodes = set(txnPoolNodeSet).difference({lagging_node}) # Make master replica lagging by hiding all messages sent to it make_master_replica_lag(lagging_node) monkeypatch.setattr(lagging_node.master_replica, '_request_missing_three_phase_messages', lambda *x, **y: None) # Send some requests and check that all replicas except master executed it sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, reqs_for_checkpoint - max_batch_size) waitNodeDataInequality(looper, lagging_node, *rest_nodes) looper.run(eventually(backup_replicas_run_forward, lagging_node)) assert not lagging_node.monitor.isMasterDegraded() sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, reqs_for_checkpoint + max_batch_size) # Check that catchup done waitNodeDataEquality(looper, lagging_node, *rest_nodes) lagging_node.reset_delays_and_process_delayeds() # Send some more requests to ensure that backup and master replicas # are in the same state sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, reqs_for_checkpoint - max_batch_size) looper.run(eventually(replicas_synced, lagging_node)) # Check that master is not considered to be degraded assert not lagging_node.monitor.isMasterDegraded()
def test_node_catchup_causes_no_desync(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, monkeypatch, chkFreqPatched, reqs_for_checkpoint): """ Checks that transactions received by catchup do not break performance monitoring """ max_batch_size = chkFreqPatched.Max3PCBatchSize lagging_node = get_any_non_primary_node(txnPoolNodeSet) rest_nodes = set(txnPoolNodeSet).difference({lagging_node}) # Make master replica lagging by hiding all messages sent to it make_master_replica_lag(lagging_node) monkeypatch.setattr(lagging_node.master_replica, '_request_missing_three_phase_messages', lambda *x, **y: None) # Send some requests and check that all replicas except master executed it sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, reqs_for_checkpoint - max_batch_size) waitNodeDataInequality(looper, lagging_node, *rest_nodes) looper.run(eventually(backup_replicas_run_forward, lagging_node)) assert not lagging_node.monitor.isMasterDegraded() sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, reqs_for_checkpoint + max_batch_size) # Check that catchup done waitNodeDataEquality(looper, lagging_node, *rest_nodes) lagging_node.reset_delays_and_process_delayeds() # Send some more requests to ensure that backup and master replicas # are in the same state sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, reqs_for_checkpoint - max_batch_size) looper.run(eventually(replicas_synced, lagging_node)) # Check that master is not considered to be degraded assert not lagging_node.monitor.isMasterDegraded()
def test_node_catchup_after_checkpoints(looper, chk_freq_patched, txnPoolNodeSet, wallet1, client1, client1Connected, broken_node_and_others): """ For some reason a node misses 3pc messages but eventually the node stashes some amount checkpoints and decides to catchup. """ broken_node, other_nodes = broken_node_and_others completed_catchups_before = get_number_of_completed_catchups(broken_node) logger.info("Step 1: The node misses quite a lot requests") send_reqs_batches_and_get_suff_replies( looper, wallet1, client1, num_reqs=chk_freq_patched + 1, num_batches=chk_freq_patched + 1, ) waitNodeDataInequality(looper, broken_node, *other_nodes) logger.info( "Step 2: The node gets requests but cannot process them because of " "missed ones. But the nodes eventually stashes some amount checkpoints " "after that the node starts catch up") repaired_node = repair_broken_node(broken_node) send_reqs_batches_and_get_suff_replies(looper, wallet1, client1, num_reqs=2 * chk_freq_patched, num_batches=2 * chk_freq_patched) waitNodeDataEquality(looper, repaired_node, *other_nodes) # check if there was at least 1 catchup completed_catchups_after = get_number_of_completed_catchups(repaired_node) assert completed_catchups_after >= completed_catchups_before + 1 logger.info("Step 3: Check if the node is able to process requests") send_reqs_batches_and_get_suff_replies(looper, wallet1, client1, num_reqs=chk_freq_patched + 2, num_batches=chk_freq_patched + 2) waitNodeDataEquality(looper, repaired_node, *other_nodes)
def test_node_catchup_causes_no_desync(looper, txnPoolNodeSet, client1, wallet1, client1Connected, monkeypatch): """ Checks that transactions received by catchup do not break performance monitoring """ client, wallet = client1, wallet1 lagging_node = get_any_non_primary_node(txnPoolNodeSet) rest_nodes = set(txnPoolNodeSet).difference({lagging_node}) # Make master replica lagging by hiding all messages sent to it make_master_replica_lag(lagging_node) monkeypatch.setattr(lagging_node.master_replica, '_request_missing_three_phase_messages', lambda *x, **y: None) # Send some requests and check that all replicas except master executed it sendReqsToNodesAndVerifySuffReplies(looper, wallet, client, 5) waitNodeDataInequality(looper, lagging_node, *rest_nodes) looper.run(eventually(backup_replicas_run_forward, lagging_node)) # Disconnect lagging node, send some more requests and start it back # After start it should fall in a such state that it needs to make catchup disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, lagging_node, stopNode=False) looper.removeProdable(lagging_node) sendReqsToNodesAndVerifySuffReplies(looper, wallet, client, 5) looper.add(lagging_node) reconnect_node_and_ensure_connected(looper, txnPoolNodeSet, lagging_node) # Check that catchup done waitNodeDataEquality(looper, lagging_node, *rest_nodes) # Send some more requests to ensure that backup and master replicas # are in the same state sendReqsToNodesAndVerifySuffReplies(looper, wallet, client, 5) looper.run(eventually(replicas_synced, lagging_node)) # Check that master is not considered to be degraded assert not lagging_node.monitor.isMasterDegraded()
def testNodeCatchupFPlusOne(txnPoolNodeSet, poolAfterSomeTxns): """ Check that f+1 nodes is enough for catchup """ looper, client, wallet = poolAfterSomeTxns assert len(txnPoolNodeSet) == 4 node1 = txnPoolNodeSet[-1] node0 = txnPoolNodeSet[-2] logger.debug("Stopping node0 with pool ledger size {}". format(node0.poolManager.txnSeqNo)) disconnect_node_and_ensure_disconnected( looper, txnPoolNodeSet, node0, stopNode=False) looper.removeProdable(node0) logger.debug("Sending requests") sendReqsToNodesAndVerifySuffReplies(looper, wallet, client, 5) logger.debug("Stopping node1 with pool ledger size {}". format(node1.poolManager.txnSeqNo)) disconnect_node_and_ensure_disconnected( looper, txnPoolNodeSet, node1, stopNode=False) looper.removeProdable(node1) # Make sure new node got out of sync waitNodeDataInequality(looper, node0, *txnPoolNodeSet[:-2]) # TODO: Check if the node has really stopped processing requests? logger.debug("Starting the stopped node0") looper.add(node0) reconnect_node_and_ensure_connected(looper, txnPoolNodeSet[:-1], node0) logger.debug("Waiting for the node0 to catch up") waitNodeDataEquality(looper, node0, *txnPoolNodeSet[:-2]) logger.debug("Sending more requests") sendReqsToNodesAndVerifySuffReplies(looper, wallet, client, 2) checkNodeDataForEquality(node0, *txnPoolNodeSet[:-2])
def test_disconnected_node_catchup_plugin_ledger_txns(looper, txnPoolNodeSet, sdk_wallet_client, sdk_pool_handle, sdk_new_node_caught_up): """ A node gets disconnected, a few config ledger txns happen, the disconnected node comes back up and catches up the config ledger """ new_node = sdk_new_node_caught_up disconnect_node_and_ensure_disconnected( looper, txnPoolNodeSet, new_node, stopNode=False) # Do some demo txns; some_demo_txns(looper, sdk_wallet_client, sdk_pool_handle) # Make sure new node got out of sync waitNodeDataInequality(looper, new_node, *txnPoolNodeSet[:-1]) reconnect_node_and_ensure_connected(looper, txnPoolNodeSet, new_node) waitNodeDataEquality(looper, new_node, *txnPoolNodeSet[:-1])
def test_disconnected_node_catchup_config_ledger_txns( looper, some_config_txns_done, txnPoolNodeSet, sdk_wallet_client, sdk_pool_handle, newNodeCaughtUp, keys): """ A node gets disconnected, a few config ledger txns happen, the disconnected node comes back up and catches up the config ledger """ new_node = newNodeCaughtUp disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, new_node, stopNode=False) # Do some config txns; using a fixture as a method, passing some arguments # as None as they only make sense for the fixture (pre-requisites) send_some_config_txns(looper, sdk_pool_handle, sdk_wallet_client, keys) # Make sure new node got out of sync waitNodeDataInequality(looper, new_node, *txnPoolNodeSet[:-1]) reconnect_node_and_ensure_connected(looper, txnPoolNodeSet, new_node) waitNodeDataEquality(looper, new_node, *txnPoolNodeSet[:-1])
def test_slow_nodes_catchup_before_selecting_primary_in_new_view( looper, txnPoolNodeSet, steward1, stewardWallet, tconf, slow_node): """ Delay 3PC to 1 node and then cause view change so by the time the view change happens(each node gets >n-f `INSTANCE_CHANGE`s), the slow node is behind other nodes. The should initiate catchup to come to the same state as other nodes. """ fast_nodes = [n for n in txnPoolNodeSet if n != slow_node] delay = tconf.PerfCheckFreq # Bad network introduced slow_node.nodeIbStasher.delay(ppDelay(delay, 0)) slow_node.nodeIbStasher.delay(pDelay(2 * delay, 0)) slow_node.nodeIbStasher.delay(cDelay(3 * delay, 0)) for i in range(2): sendReqsToNodesAndVerifySuffReplies(looper, stewardWallet, steward1, 20) waitNodeDataInequality(looper, slow_node, *fast_nodes) catchup_reply_counts = { n.name: n.ledgerManager.spylog.count(n.ledgerManager.processCatchupRep) for n in txnPoolNodeSet } catchup_done_counts = { n.name: n.spylog.count(n.allLedgersCaughtUp) for n in txnPoolNodeSet } def slow_node_processed_some(): assert slow_node.master_replica.batches # The slow node has received some PRE-PREPAREs looper.run(eventually(slow_node_processed_some, retryWait=1, timeout=delay)) # No reverts have been called by the slow node rv = getAllReturnVals(slow_node.replicas[0], TestReplica.revert_unordered_batches) assert not rv or max(rv) == 0 # Delay reception of catchup replies so ViewChangeDone can be received # before catchup completes delay_catchup_reply = 2 slow_node.nodeIbStasher.delay(cr_delay(delay_catchup_reply)) ensure_view_change(looper, txnPoolNodeSet) # `slow_node` will not have elections done but others will. checkProtocolInstanceSetup(looper, fast_nodes, numInstances=len(slow_node.replicas), retryWait=1) ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet) # `slow_node` does catchup, `fast_nodes` don't for n in txnPoolNodeSet: assert n.spylog.count( n.allLedgersCaughtUp) > catchup_done_counts[n.name] if n == slow_node: assert n.ledgerManager.spylog.count( n.ledgerManager.processCatchupRep) > catchup_reply_counts[ n.name] else: assert n.ledgerManager.spylog.count( n.ledgerManager.processCatchupRep) == catchup_reply_counts[ n.name] # Greater than 0 batches were reverted by the slow node assert max( getAllReturnVals( slow_node.master_replica, slow_node.master_replica.revert_unordered_batches)) > 0 # Bad network repaired slow_node.reset_delays_and_process_delayeds() # Make sure pool is functional sendReqsToNodesAndVerifySuffReplies(looper, stewardWallet, steward1, 5) ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
def test_caught_up_for_current_view_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client): """ One of the node experiences poor network and loses 3PC messages. It has to do multiple rounds of catchup to be caught up """ sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 3 * Max3PCBatchSize) ensure_all_nodes_have_same_data(looper, txnPoolNodeSet) nprs = getNonPrimaryReplicas(txnPoolNodeSet, 0) bad_node = nprs[-1].node other_nodes = [n for n in txnPoolNodeSet if n != bad_node] orig_method = bad_node.master_replica.dispatchThreePhaseMsg # Bad node does not process any 3 phase messages, equivalent to messages # being lost def bad_method(self, m, s): pass bad_node.master_replica.dispatchThreePhaseMsg = types.MethodType( bad_method, bad_node.master_replica) # Delay LEDGER_STAUS on slow node, so that only MESSAGE_REQUEST(LEDGER_STATUS) is sent, and the # node catch-ups 2 times. # Otherwise other nodes may receive multiple LEDGER_STATUSes from slow node, and return Consistency proof for all # missing txns, so no stashed ones are applied bad_node.nodeIbStasher.delay(lsDelay(1000)) sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 6 * Max3PCBatchSize) waitNodeDataInequality(looper, bad_node, *other_nodes) # Patch all nodes to return ConsistencyProof of a smaller ledger to the # bad node but only once, so that the bad_node needs to do catchup again. make_a_node_catchup_twice(bad_node, other_nodes, DOMAIN_LEDGER_ID, Max3PCBatchSize) def is_catchup_needed_count(): return len(getAllReturnVals(bad_node, bad_node.is_catchup_needed, compare_val_to=True)) def is_catchup_not_needed_count(): return len(getAllReturnVals(bad_node, bad_node.is_catchup_needed, compare_val_to=False)) def has_ordered_till_last_prepared_certificate_count(): return len(getAllReturnVals(bad_node, bad_node.has_ordered_till_last_prepared_certificate, compare_val_to=True)) old_count_1 = is_catchup_needed_count() old_count_2 = has_ordered_till_last_prepared_certificate_count() old_count_3 = is_catchup_not_needed_count() ensure_view_change(looper, txnPoolNodeSet) checkProtocolInstanceSetup(looper, txnPoolNodeSet, retryWait=1) ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet) assert is_catchup_needed_count() > old_count_1 assert is_catchup_not_needed_count() > old_count_3 # The bad_node caught up due to ordering till last prepared certificate assert has_ordered_till_last_prepared_certificate_count() > old_count_2 bad_node.master_replica.dispatchThreePhaseMsg = types.MethodType( orig_method, bad_node.master_replica)
def test_catchup_not_triggered_if_another_in_progress( looper, chkFreqPatched, reqs_for_checkpoint, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, broken_node_and_others): """ A node misses 3pc messages and checkpoints during some period but later it stashes some amount of checkpoints and starts catchup. When the node is performing the catchup, it receives more checkpoints enough to start a new catchup but it does not start it because the first catchup is in progress. """ max_batch_size = chkFreqPatched.Max3PCBatchSize broken_node, other_nodes = broken_node_and_others logger.info("Step 1: The node misses quite a lot of 3PC-messages and checkpoints") send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, reqs_for_checkpoint + max_batch_size) waitNodeDataInequality(looper, broken_node, *other_nodes) logger.info( "Step 2: The node receives 3PC-messages but cannot process them because of " "missed ones. But the node eventually stashes some amount of checkpoints " "and after that starts catchup") repaired_node = repair_broken_node(broken_node) initial_do_start_catchup_times = repaired_node.spylog.count(Node._do_start_catchup) initial_all_ledgers_caught_up = repaired_node.spylog.count(Node.allLedgersCaughtUp) with delay_rules(repaired_node.nodeIbStasher, cr_delay()): send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) * reqs_for_checkpoint - max_batch_size) ensure_all_nodes_have_same_data(looper, other_nodes) target_ledger_size = other_nodes[0].domainLedger.size looper.run(eventually(lambda: assertExp(repaired_node.mode == Mode.syncing), timeout=waits.expectedPoolInterconnectionTime(len(txnPoolNodeSet)) + waits.expectedPoolConsistencyProof(len(txnPoolNodeSet)))) assert repaired_node.spylog.count(Node._do_start_catchup) - initial_do_start_catchup_times == 1 logger.info( "Step 3: While doing the catchup, the node receives new checkpoints " "enough to start a new catchup but the node does not start it because " "the former is in progress") process_checkpoint_times_before = repaired_node.master_replica.spylog.count(Replica.processCheckpoint) send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) * reqs_for_checkpoint) # Wait until the node receives the new checkpoints from all the other nodes looper.run( eventually(lambda: assertExp(repaired_node.master_replica.spylog.count(Replica.processCheckpoint) - process_checkpoint_times_before == (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) * (len(txnPoolNodeSet) - 1)), timeout=waits.expectedPoolInterconnectionTime(len(txnPoolNodeSet)))) # New catchup is not started when another one is in progress assert repaired_node.spylog.count(Node._do_start_catchup) - initial_do_start_catchup_times == 1 assert repaired_node.mode == Mode.syncing logger.info("Step 4: The node completes the catchup. The ledger has been " "updated to the level determined on its start") looper.run(eventually(lambda: assertExp(repaired_node.mode == Mode.participating), timeout=waits.expectedPoolCatchupTime(len(txnPoolNodeSet)))) assert repaired_node.spylog.count(Node._do_start_catchup) - initial_do_start_catchup_times == 1 assert repaired_node.spylog.count(Node.allLedgersCaughtUp) - initial_all_ledgers_caught_up == 1 assert repaired_node.domainLedger.size == target_ledger_size
def test_complete_short_checkpoint_not_included_in_lag_for_catchup( looper, chkFreqPatched, reqs_for_checkpoint, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_steward, sdk_wallet_client, tdir, tconf, allPluginsPath): """ Verifies that if the first stored own checkpoint has a not aligned lower bound (this means that it was started after a catch-up), is complete and there is a quorumed stashed checkpoint from other replicas with the same end then this stashed checkpoint is not included into the lag for a catch-up, i.e. in such a case the lag which makes the node perform catch-up is Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 2 quorumed stashed received checkpoints. """ max_batch_size = chkFreqPatched.Max3PCBatchSize _, new_node = sdk_add_new_steward_and_node( looper, sdk_pool_handle, sdk_wallet_steward, 'EpsilonSteward', 'Epsilon', tdir, tconf, allPluginsPath=allPluginsPath) txnPoolNodeSet.append(new_node) looper.run(checkNodesConnected(txnPoolNodeSet)) ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet) waitNodeDataEquality(looper, new_node, *txnPoolNodeSet[:-1]) # Epsilon did not participate in ordering of the batch with EpsilonSteward # NYM transaction and the batch with Epsilon NODE transaction. # Epsilon got these transactions via catch-up. # To complete the first checkpoint send requests for 1 checkpoint minus # 2 3PC-batches (since there are already 2 3PC-batches in the first # checkpoint : with EpsilonSteward NYM transaction and with Epsilon NODE # transaction). This checkpoint has a not aligned lower bound # on the new node replicas so it will not be stabilized on them. send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, reqs_for_checkpoint - 2 * max_batch_size) waitNodeDataEquality(looper, new_node, *txnPoolNodeSet[:-1]) # The master replica of the new node stops to receive 3PC-messages new_node.master_replica.threePhaseRouter.extend( ( (PrePrepare, lambda *x, **y: None), (Prepare, lambda *x, **y: None), (Commit, lambda *x, **y: None), ) ) completed_catchups_before_reqs = get_number_of_completed_catchups(new_node) # Send requests for the new node's master replica to reach # Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1 quorumed stashed # checkpoints from others send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP * reqs_for_checkpoint) # Give time for the new node to catch up if it is going to do it looper.runFor(waits.expectedPoolConsistencyProof(len(txnPoolNodeSet)) + waits.expectedPoolCatchupTime(len(txnPoolNodeSet))) waitNodeDataInequality(looper, new_node, *txnPoolNodeSet[:-1]) # Verify that the new node has not caught up assert get_number_of_completed_catchups(new_node) == completed_catchups_before_reqs # Send more requests for the new node's master replica to reach # Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 2 quorumed stashed # checkpoints from others send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, reqs_for_checkpoint) waitNodeDataEquality(looper, new_node, *txnPoolNodeSet[:-1]) # Verify that the new node has caught up assert get_number_of_completed_catchups(new_node) > completed_catchups_before_reqs
def test_catchup_not_triggered_if_another_in_progress( looper, chkFreqPatched, reqs_for_checkpoint, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, broken_node_and_others): """ A node misses 3pc messages and checkpoints during some period but later it stashes some amount of checkpoints and starts catchup. When the node is performing the catchup, it receives more checkpoints enough to start a new catchup but it does not start it because the first catchup is in progress. """ max_batch_size = chkFreqPatched.Max3PCBatchSize broken_node, other_nodes = broken_node_and_others logger.info("Step 1: The node misses quite a lot of 3PC-messages and checkpoints") send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, reqs_for_checkpoint + max_batch_size) waitNodeDataInequality(looper, broken_node, *other_nodes) logger.info( "Step 2: The node receives 3PC-messages but cannot process them because of " "missed ones. But the node eventually stashes some amount of checkpoints " "and after that starts catchup") repaired_node = repair_broken_node(broken_node) initial_do_start_catchup_times = repaired_node.spylog.count(Node._do_start_catchup) initial_all_ledgers_caught_up = repaired_node.spylog.count(Node.allLedgersCaughtUp) with delay_rules(repaired_node.nodeIbStasher, cr_delay()): send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) * reqs_for_checkpoint - max_batch_size) ensure_all_nodes_have_same_data(looper, other_nodes) looper.run(eventually(lambda: assertExp(repaired_node.mode == Mode.discovering), timeout=waits.expectedPoolInterconnectionTime(len(txnPoolNodeSet)) + waits.expectedPoolConsistencyProof(len(txnPoolNodeSet)))) assert repaired_node.spylog.count(Node._do_start_catchup) - initial_do_start_catchup_times == 1 logger.info( "Step 3: While doing the catchup, the node receives new checkpoints " "enough to start a new catchup but the node does not start it because " "the former is in progress") process_checkpoint_times_before = \ repaired_node.master_replica._checkpointer.spylog.count(CheckpointService.process_checkpoint) send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) * reqs_for_checkpoint) # Wait until the node receives the new checkpoints from all the other nodes looper.run( eventually(lambda: assertExp( repaired_node.master_replica._checkpointer.spylog.count(CheckpointService.process_checkpoint) - process_checkpoint_times_before == (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) * (len(txnPoolNodeSet) - 1)), timeout=waits.expectedPoolInterconnectionTime(len(txnPoolNodeSet)))) # New catchup is not started when another one is in progress assert repaired_node.spylog.count(Node._do_start_catchup) - initial_do_start_catchup_times == 1 assert repaired_node.mode == Mode.discovering logger.info("Step 4: The node completes the catchup. The ledger has been " "updated to the level determined on its start") looper.run(eventually(lambda: assertExp(repaired_node.mode == Mode.participating), timeout=waits.expectedPoolCatchupTime(len(txnPoolNodeSet)))) assert repaired_node.spylog.count(Node._do_start_catchup) - initial_do_start_catchup_times == 1 assert repaired_node.spylog.count(Node.allLedgersCaughtUp) - initial_all_ledgers_caught_up == 1 assert repaired_node.domainLedger.size == other_nodes[0].domainLedger.size
def test_caught_up_for_current_view_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client): """ One of the node experiences poor network and loses 3PC messages. It has to do multiple rounds of catchup to be caught up """ sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 3 * Max3PCBatchSize) ensure_all_nodes_have_same_data(looper, txnPoolNodeSet) nprs = getNonPrimaryReplicas(txnPoolNodeSet, 0) bad_node = nprs[-1].node other_nodes = [n for n in txnPoolNodeSet if n != bad_node] # Bad node does not process any 3 Commit messages, equivalent to messages with delay_rules(bad_node.nodeIbStasher, cDelay()): # Delay LEDGER_STAUS on slow node, so that only MESSAGE_REQUEST(LEDGER_STATUS) is sent, and the # node catch-ups 2 times. # Otherwise other nodes may receive multiple LEDGER_STATUSes from slow node, and return Consistency proof for all # missing txns, so no stashed ones are applied bad_node.nodeIbStasher.delay(lsDelay(1000)) sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 6 * Max3PCBatchSize) waitNodeDataInequality(looper, bad_node, *other_nodes) # Patch all nodes to return ConsistencyProof of a smaller ledger to the # bad node but only once, so that the bad_node needs to do catchup again. make_a_node_catchup_twice(bad_node, other_nodes, DOMAIN_LEDGER_ID, Max3PCBatchSize) def is_catchup_needed_count(): return len(getAllReturnVals(bad_node, bad_node.is_catchup_needed, compare_val_to=True)) def is_catchup_not_needed_count(): return len(getAllReturnVals(bad_node, bad_node.is_catchup_needed, compare_val_to=False)) def has_ordered_till_last_prepared_certificate_count(): return len(getAllReturnVals(bad_node, bad_node.has_ordered_till_last_prepared_certificate, compare_val_to=True)) old_is_catchup_needed_count = is_catchup_needed_count() old_is_catchup_not_needed_count = is_catchup_not_needed_count() old_has_ordered_till_last_prepared_certificate_count = has_ordered_till_last_prepared_certificate_count() ensure_view_change(looper, txnPoolNodeSet) checkProtocolInstanceSetup(looper, txnPoolNodeSet, retryWait=1) ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet, exclude_from_check=['check_last_ordered_3pc', 'check_audit', 'check_last_ordered_3pc_backup']) assert is_catchup_needed_count() == old_is_catchup_needed_count assert is_catchup_not_needed_count() > old_is_catchup_not_needed_count # The bad_node caught up due to ordering till last prepared certificate assert has_ordered_till_last_prepared_certificate_count() > old_has_ordered_till_last_prepared_certificate_count ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet, exclude_from_check=['check_last_ordered_3pc_backup'])
def test_no_catchup_if_got_from_3pc(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client): """ A node is slow to receive COMMIT messages so after a view change it starts catchup. But before it can start requesting txns, the COMMITs messages are received and are ordered. The node should not request any transactions. :return: """ send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 2 * 3, 3) ensure_all_nodes_have_same_data(looper, txnPoolNodeSet) slow_node = getNonPrimaryReplicas(txnPoolNodeSet, 0)[-1].node other_nodes = [n for n in txnPoolNodeSet if n != slow_node] delay_cm = 30 delat_cp = 100 slow_node.nodeIbStasher.delay(cDelay(delay_cm)) # The slow node receives consistency proofs after some delay, this delay # gives the opportunity to deliver all 3PC messages slow_node.nodeIbStasher.delay(cpDelay(delat_cp)) # Count of `getCatchupReqs` which is called to construct the `CatchupReq` # to be sent def domain_cr_count(): return sum(1 for entry in slow_node.ledgerManager.spylog.getAll( slow_node.ledgerManager.getCatchupReqs) if entry.params['consProof'].ledgerId == DOMAIN_LEDGER_ID) old_count = domain_cr_count() sent_batches = 10 send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 2 * sent_batches, sent_batches) ensure_view_change(looper, nodes=txnPoolNodeSet) # After view change, the `slow_node` is behind waitNodeDataInequality(looper, slow_node, *other_nodes) # Unstash only COMMIT messages slow_node.nodeIbStasher.reset_delays_and_process_delayeds(Commit.typename) looper.runFor(2) slow_node.nodeIbStasher.reset_delays_and_process_delayeds( ConsistencyProof.typename) waitNodeDataEquality(looper, slow_node, *other_nodes) # No `CatchupReq`s constructed, hence no `CatchupReq`s could have # been sent assert domain_cr_count() == old_count # Some stashed ordered requests have been processed rv = getAllReturnVals(slow_node, slow_node.processStashedOrderedReqs) assert sent_batches in rv sdk_ensure_pool_functional(looper, txnPoolNodeSet, sdk_wallet_client, sdk_pool_handle)
def test_slow_node_reverts_unordered_state_during_catchup(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client): """ Delay COMMITs to a node such that when it needs to catchup, it needs to revert some unordered state. Also till this time the node should have receive all COMMITs such that it will apply some of the COMMITs ( for which it has not received txns from catchup). For this delay COMMITs by long, do catchup for a little older than the state received in LedgerStatus, once catchup completes, reset delays and try to process delayed COMMITs, some COMMITs will be rejected but some will be processed since catchup was done for older ledger. """ sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 3 * Max3PCBatchSize) nprs = getNonPrimaryReplicas(txnPoolNodeSet, 0) slow_node = nprs[-1].node other_nodes = [n for n in txnPoolNodeSet if n != slow_node] slow_master_replica = slow_node.master_replica commit_delay = 150 catchup_rep_delay = 25 # Delay COMMITs to one node slow_node.nodeIbStasher.delay(cDelay(commit_delay, 0)) # Delay LEDGER_STAUS on slow node, so that only MESSAGE_REQUEST(LEDGER_STATUS) is sent, and the # node catch-ups 2 times. # Otherwise other nodes may receive multiple LEDGER_STATUSes from slow node, and return Consistency proof for all # missing txns, so no stashed ones are applied slow_node.nodeIbStasher.delay(lsDelay(1000)) # Make the slow node receive txns for a smaller ledger so it still finds # the need to catchup delay_batches = 2 make_a_node_catchup_less(slow_node, other_nodes, DOMAIN_LEDGER_ID, delay_batches * Max3PCBatchSize) sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 6 * Max3PCBatchSize) ensure_all_nodes_have_same_data(looper, other_nodes) waitNodeDataInequality(looper, slow_node, *other_nodes) old_lcu_count = slow_node.spylog.count(slow_node.allLedgersCaughtUp) # `slow_node` is slow to receive CatchupRep, so that it # gets a chance to order COMMITs slow_node.nodeIbStasher.delay(cr_delay(catchup_rep_delay)) # start view change (and hence catchup) ensure_view_change(looper, txnPoolNodeSet) # Check last ordered of `other_nodes` is same for n1, n2 in combinations(other_nodes, 2): lst_3pc = check_last_ordered_3pc(n1, n2) def chk1(): # `slow_node` has prepared all 3PC messages which # `other_nodes` have ordered assertEquality(slow_master_replica.last_prepared_before_view_change, lst_3pc) looper.run(eventually(chk1, retryWait=1)) old_pc_count = slow_master_replica.spylog.count( slow_master_replica.can_process_since_view_change_in_progress) assert len(slow_node.stashedOrderedReqs) == 0 # Repair the network so COMMITs are received, processed and stashed slow_node.reset_delays_and_process_delayeds(COMMIT) def chk2(): # COMMITs are processed for prepared messages assert slow_master_replica.spylog.count( slow_master_replica.can_process_since_view_change_in_progress) > old_pc_count looper.run(eventually(chk2, retryWait=1, timeout=5)) def chk3(): # COMMITs are stashed assert len(slow_node.stashedOrderedReqs) == delay_batches * Max3PCBatchSize looper.run(eventually(chk3, retryWait=1, timeout=15)) # fix catchup, so the node gets a chance to be caught-up repair_node_catchup_less(other_nodes) def chk4(): # Some COMMITs were ordered but stashed and they were processed rv = getAllReturnVals(slow_node, slow_node.processStashedOrderedReqs) assert delay_batches in rv looper.run(eventually(chk4, retryWait=1, timeout=catchup_rep_delay + 5)) def chk5(): # Catchup was done once assert slow_node.spylog.count( slow_node.allLedgersCaughtUp) > old_lcu_count looper.run( eventually( chk5, retryWait=1, timeout=waits.expectedPoolCatchupTime( len(txnPoolNodeSet)))) # make sure that the pool is functional checkProtocolInstanceSetup(looper, txnPoolNodeSet, retryWait=1) ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet) sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 2 * Max3PCBatchSize) ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
def test_complete_short_checkpoint_not_included_in_lag_for_catchup( looper, chkFreqPatched, reqs_for_checkpoint, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_steward, sdk_wallet_client, tdir, tconf, allPluginsPath): """ Verifies that if the first stored own checkpoint has a not aligned lower bound (this means that it was started after a catch-up), is complete and there is a quorumed stashed checkpoint from other replicas with the same end then this stashed checkpoint is not included into the lag for a catch-up, i.e. in such a case the lag which makes the node perform catch-up is Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 2 quorumed stashed received checkpoints. """ max_batch_size = chkFreqPatched.Max3PCBatchSize _, new_node = sdk_add_new_steward_and_node( looper, sdk_pool_handle, sdk_wallet_steward, 'EpsilonSteward', 'Epsilon', tdir, tconf, allPluginsPath=allPluginsPath) txnPoolNodeSet.append(new_node) looper.run(checkNodesConnected(txnPoolNodeSet)) ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet) waitNodeDataEquality(looper, new_node, *txnPoolNodeSet[:-1], exclude_from_check=['check_last_ordered_3pc_backup']) # Epsilon did not participate in ordering of the batch with EpsilonSteward # NYM transaction and the batch with Epsilon NODE transaction. # Epsilon got these transactions via catch-up. # To complete the first checkpoint send requests for 1 checkpoint minus # 2 3PC-batches (since there are already 2 3PC-batches in the first # checkpoint : with EpsilonSteward NYM transaction and with Epsilon NODE # transaction). This checkpoint has a not aligned lower bound # on the new node replicas so it will not be stabilized on them. send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, reqs_for_checkpoint - 2 * max_batch_size) waitNodeDataEquality(looper, new_node, *txnPoolNodeSet[:-1], exclude_from_check=['check_last_ordered_3pc_backup']) # The master replica of the new node stops to receive 3PC-messages new_node.master_replica.threePhaseRouter.extend( ( (PrePrepare, lambda *x, **y: None), (Prepare, lambda *x, **y: None), (Commit, lambda *x, **y: None), ) ) completed_catchups_before_reqs = get_number_of_completed_catchups(new_node) # Send requests for the new node's master replica to reach # Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1 quorumed stashed # checkpoints from others send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP * reqs_for_checkpoint) # Give time for the new node to catch up if it is going to do it looper.runFor(waits.expectedPoolConsistencyProof(len(txnPoolNodeSet)) + waits.expectedPoolCatchupTime(len(txnPoolNodeSet))) waitNodeDataInequality(looper, new_node, *txnPoolNodeSet[:-1], exclude_from_check=['check_last_ordered_3pc_backup']) # Verify that the new node has not caught up assert get_number_of_completed_catchups(new_node) == completed_catchups_before_reqs # Send more requests for the new node's master replica to reach # Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 2 quorumed stashed # checkpoints from others send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, reqs_for_checkpoint) waitNodeDataEquality(looper, new_node, *txnPoolNodeSet[:-1], exclude_from_check=['check_last_ordered_3pc_backup']) # Verify that the new node has caught up assert get_number_of_completed_catchups(new_node) > completed_catchups_before_reqs
def test_slow_node_reverts_unordered_state_during_catchup( looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client): """ Delay COMMITs to a node such that when it needs to catchup, it needs to revert some unordered state. Also till this time the node should have receive all COMMITs such that it will apply some of the COMMITs ( for which it has not received txns from catchup). For this delay COMMITs by long, do catchup for a little older than the state received in LedgerStatus, once catchup completes, reset delays and try to process delayed COMMITs, some COMMITs will be rejected but some will be processed since catchup was done for older ledger. """ sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 3 * Max3PCBatchSize) nprs = getNonPrimaryReplicas(txnPoolNodeSet, 0) slow_node = nprs[-1].node other_nodes = [n for n in txnPoolNodeSet if n != slow_node] slow_master_replica = slow_node.master_replica commit_delay = 150 catchup_rep_delay = 25 # Delay COMMITs to one node slow_node.nodeIbStasher.delay(cDelay(commit_delay, 0)) # Delay LEDGER_STAUS on slow node, so that only MESSAGE_REQUEST(LEDGER_STATUS) is sent, and the # node catch-ups 2 times. # Otherwise other nodes may receive multiple LEDGER_STATUSes from slow node, and return Consistency proof for all # missing txns, so no stashed ones are applied slow_node.nodeIbStasher.delay(lsDelay(1000)) # Make the slow node receive txns for a smaller ledger so it still finds # the need to catchup delay_batches = 2 make_a_node_catchup_less(slow_node, other_nodes, DOMAIN_LEDGER_ID, delay_batches * Max3PCBatchSize) sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 6 * Max3PCBatchSize) ensure_all_nodes_have_same_data(looper, other_nodes) waitNodeDataInequality(looper, slow_node, *other_nodes) old_lcu_count = slow_node.spylog.count(slow_node.allLedgersCaughtUp) # `slow_node` is slow to receive CatchupRep, so that it # gets a chance to order COMMITs slow_node.nodeIbStasher.delay(cr_delay(catchup_rep_delay)) old_last_ordered = txnPoolNodeSet[0].master_replica.last_ordered_3pc # start view change (and hence catchup) ensure_view_change(looper, txnPoolNodeSet) # Check last ordered of `other_nodes` is same for n1, n2 in combinations(other_nodes, 2): check_last_ordered_3pc(n1, n2) assert slow_master_replica.last_prepared_before_view_change == old_last_ordered old_pc_count = slow_master_replica._ordering_service.spylog.count( slow_master_replica._ordering_service._validate) assert slow_node.master_replica.stasher.stash_size(STASH_CATCH_UP) == 0 # Repair the network so COMMITs are received, processed and stashed slow_node.reset_delays_and_process_delayeds(COMMIT) def chk2(): # COMMITs are processed for prepared messages assert slow_master_replica._ordering_service.spylog.count( slow_master_replica._ordering_service._validate) > old_pc_count looper.run(eventually(chk2, retryWait=1, timeout=5)) def chk3(): # (delay_batches * Max3PCBatchSize * commits_count_in_phase) COMMITs are stashed assert slow_node.master_replica.stasher.stash_size(STASH_CATCH_UP) == \ delay_batches * Max3PCBatchSize * (len(txnPoolNodeSet) - 1) looper.run(eventually(chk3, retryWait=1, timeout=15)) # fix catchup, so the node gets a chance to be caught-up repair_node_catchup_less(other_nodes) def chk4(): # Some COMMITs were received but stashed and # they will processed after catchup assert slow_node.master_replica.stasher.stash_size(STASH_CATCH_UP) == 0 looper.run(eventually(chk4, retryWait=1, timeout=catchup_rep_delay + 50)) def chk5(): # Catchup was done once assert slow_node.spylog.count( slow_node.allLedgersCaughtUp) > old_lcu_count looper.run( eventually(chk5, retryWait=1, timeout=waits.expectedPoolCatchupTime(len(txnPoolNodeSet)))) # make sure that the pool is functional checkProtocolInstanceSetup(looper, txnPoolNodeSet, retryWait=1) ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet) sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 2 * Max3PCBatchSize) ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
def test_slow_node_reverts_unordered_state_during_catchup( looper, txnPoolNodeSet, client1, wallet1, client1Connected): """ Delay COMMITs to a node such that when it needs to catchup, it needs to revert some unordered state. Also till this time the node should have receive all COMMITs such that it will apply some of the COMMITs ( for which it has not received txns from catchup). For this delay COMMITs by long, do catchup for a little older than the state received in LedgerStatus, once catchup completes, reset delays and try to process delayed COMMITs, some COMMITs will be rejected but some will be processed since catchup was done for older ledger. """ sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 3 * Max3PCBatchSize) nprs = getNonPrimaryReplicas(txnPoolNodeSet, 0) slow_node = nprs[-1].node other_nodes = [n for n in txnPoolNodeSet if n != slow_node] slow_master_replica = slow_node.master_replica commit_delay = 150 catchup_rep_delay = 15 # Delay COMMITs to one node slow_node.nodeIbStasher.delay(cDelay(commit_delay, 0)) sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 6 * Max3PCBatchSize) ensure_all_nodes_have_same_data(looper, other_nodes) waitNodeDataInequality(looper, slow_node, *other_nodes) # Make the slow node receive txns for a smaller ledger so it still finds # the need to catchup delay_batches = 2 make_a_node_catchup_twice(slow_node, other_nodes, DOMAIN_LEDGER_ID, delay_batches * Max3PCBatchSize) def is_catchup_needed_count(): return len( getAllReturnVals(slow_node, slow_node.is_catchup_needed, compare_val_to=True)) old_lcu_count = slow_node.spylog.count(slow_node.allLedgersCaughtUp) old_cn_count = is_catchup_needed_count() # `slow_node` is slow to receive CatchupRep, so that it # gets a chance to order COMMITs slow_node.nodeIbStasher.delay(cr_delay(catchup_rep_delay)) ensure_view_change(looper, txnPoolNodeSet) # Check last ordered of `other_nodes` is same for n1, n2 in combinations(other_nodes, 2): lst_3pc = check_last_ordered_3pc(n1, n2) def chk1(): # `slow_node` has prepared all 3PC messages which # `other_nodes` have ordered assertEquality(slow_master_replica.last_prepared_before_view_change, lst_3pc) looper.run(eventually(chk1, retryWait=1)) old_pc_count = slow_master_replica.spylog.count( slow_master_replica.can_process_since_view_change_in_progress) # Repair the network so COMMITs are received and processed slow_node.reset_delays_and_process_delayeds(COMMIT) def chk2(): # COMMITs are processed for prepared messages assert slow_master_replica.spylog.count( slow_master_replica.can_process_since_view_change_in_progress ) > old_pc_count looper.run(eventually(chk2, retryWait=1, timeout=5)) def chk3(): # Some COMMITs were ordered but stashed and they were processed rv = getAllReturnVals(slow_node, slow_node.processStashedOrderedReqs) assert delay_batches in rv looper.run(eventually(chk3, retryWait=1, timeout=catchup_rep_delay + 5)) def chk4(): # Catchup was done once assert slow_node.spylog.count( slow_node.allLedgersCaughtUp) > old_lcu_count looper.run( eventually(chk4, retryWait=1, timeout=waits.expectedPoolCatchupTime(len(txnPoolNodeSet)))) def chk5(): # Once catchup was done, need of other catchup was not found assertEquality(is_catchup_needed_count(), old_cn_count) looper.run(eventually(chk5, retryWait=1, timeout=5)) checkProtocolInstanceSetup(looper, txnPoolNodeSet, retryWait=1) ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet) sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 2 * Max3PCBatchSize) ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)