def nodes_slow_to_process_catchup_reqs(txnPoolNodeSet):
    """
    Delay handling of incoming CatchupReq messages on every pool node.

    A node performing catchup therefore completes it slowly and sends no
    3PC messages until its catchup has finished.
    """
    # NOTE(review): `catchup_delay` is not defined in this block — presumably a
    # module-level constant in the original file; confirm it is in scope.
    for pool_node in txnPoolNodeSet:
        pool_node.nodeIbStasher.delay(cqDelay(catchup_delay))
def test_catchup_with_all_nodes_sending_cons_proofs_dead(
        looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, logsearch):
    """
    A lagging node starts catchup while CatchupReqs to every other node are
    delayed; only the nodes that did NOT supply ledger sizes first are later
    released, so the lagging node must re-ask for transactions and must still
    finish catchup successfully.
    """
    lagging_node = txnPoolNodeSet[-1]
    other_nodes = txnPoolNodeSet[:-1]

    # Keep the lagging node from ordering via 3PC so it falls behind.
    start_delaying(lagging_node.nodeIbStasher, delay_3pc())
    sdk_send_random_and_check(looper, txnPoolNodeSet,
                              sdk_pool_handle, sdk_wallet_client, 10)

    # Baseline count of re-ask log lines before catchup starts.
    log_re_ask, _ = logsearch(
        msgs=['requesting .* missing transactions after timeout'])
    old_re_ask_count = len(log_re_ask)

    # Stall CatchupReqs to all other nodes; remember the stashers per node.
    catchup_reqs = {
        other.name: start_delaying(other.nodeIbStasher, cqDelay())
        for other in other_nodes
    }

    audit_catchup_service = lagging_node.ledgerManager._node_leecher._leechers[
        AUDIT_LEDGER_ID]._catchup_rep_service
    lagging_node.start_catchup()
    looper.run(
        eventually(lambda: assert_eq(audit_catchup_service._is_working, True)))

    # Make sure number of cons proofs gathered when all nodes are
    assert len(audit_catchup_service._nodes_ledger_sizes) == 3

    # Allow catchup requests only from nodes that didn't respond first
    for responder_name, stashed_reqs in catchup_reqs.items():
        if responder_name not in audit_catchup_service._nodes_ledger_sizes:
            stop_delaying_and_process(stashed_reqs)

    # Check catchup finishes successfully, and there were reasks
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    assert len(log_re_ask) - old_re_ask_count > 0
def test_catchup_uses_only_nodes_with_cons_proofs(looper,
                                                  txnPoolNodeSet,
                                                  sdk_pool_handle,
                                                  sdk_wallet_client):
    """
    During catchup the lagging node should request transactions only from the
    nodes whose consistency proofs (ledger sizes) it already gathered; letting
    exactly those nodes answer must be enough for catchup to complete.
    """
    lagging_node = txnPoolNodeSet[-1]
    other_nodes = txnPoolNodeSet[:-1]

    # Make the last node lag by delaying its 3PC traffic.
    start_delaying(lagging_node.nodeIbStasher, delay_3pc())
    sdk_send_random_and_check(looper, txnPoolNodeSet,
                              sdk_pool_handle, sdk_wallet_client, 10)

    # Hold back CatchupReqs to every other node, keyed by node name.
    catchup_reqs = {
        other.name: start_delaying(other.nodeIbStasher, cqDelay())
        for other in other_nodes
    }

    audit_catchup_service = lagging_node.ledgerManager._node_leecher._leechers[
        AUDIT_LEDGER_ID]._catchup_rep_service
    lagging_node.start_catchup()
    looper.run(
        eventually(lambda: assert_eq(audit_catchup_service._is_working, True)))

    # Make sure number of cons proofs gathered when all nodes are
    assert len(audit_catchup_service._nodes_ledger_sizes) == 3

    # Allow catchup requests only for interesting nodes
    for responder_name in audit_catchup_service._nodes_ledger_sizes.keys():
        stop_delaying_and_process(catchup_reqs[responder_name])

    # Check catchup finishes successfully
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet, custom_timeout=30)
def test_catchup_with_one_slow_node(tdir, tconf, looper, txnPoolNodeSet,
                                    sdk_pool_handle, sdk_wallet_client,
                                    allPluginsPath, logsearch):
    """
    1. Stop the node Delta.
    2. Order 9 txns; in the first CatchupReq round each remaining node
       [Alpha, Beta, Gamma] receives a request for 3 txns.
    3. Delay CatchupReq messages on Alpha.
    4. Start Delta.
    5. Check that all nodes hold equal data.
    6. Check that Delta re-asks CatchupRep only once: in the re-ask round
       Delta must not ask Alpha (it never answered). Were the behavior wrong
       and Delta re-asked all nodes for 1 txn each, Alpha's silence would
       force yet another re-ask round.
    """
    # Prepare nodes
    lagging_node = txnPoolNodeSet[-1]
    rest_nodes = txnPoolNodeSet[:-1]

    # Stop one node
    waitNodeDataEquality(looper, lagging_node, *rest_nodes)
    disconnect_node_and_ensure_disconnected(looper,
                                            txnPoolNodeSet,
                                            lagging_node,
                                            stopNode=True)
    looper.removeProdable(lagging_node)

    # Send more requests to active nodes
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, len(rest_nodes) * 3)
    waitNodeDataEquality(looper, *rest_nodes)

    # Restart stopped node and wait for successful catch up
    lagging_node = start_stopped_node(
        lagging_node,
        looper,
        tconf,
        tdir,
        allPluginsPath,
        start=False,
    )

    log_re_ask, _ = logsearch(
        msgs=['requesting .* missing transactions after timeout'])
    old_re_ask_count = len(log_re_ask)

    # Delay CatchupRep messages on Alpha
    with delay_rules(rest_nodes[0].nodeIbStasher, cqDelay()):
        with delay_rules(lagging_node.nodeIbStasher, cs_delay()):
            looper.add(lagging_node)
            txnPoolNodeSet[-1] = lagging_node
            looper.run(checkNodesConnected(txnPoolNodeSet))
            waitNodeDataEquality(looper, *txnPoolNodeSet, customTimeout=120)
            # One re-ask each for the audit and domain ledgers.
            assert len(log_re_ask) - old_re_ask_count == 2
def testNewNodeCatchupWhileIncomingRequests(looper, txnPoolNodeSet,
                                            testNodeClass, tdir, tconf,
                                            sdk_pool_handle,
                                            sdk_wallet_steward,
                                            allPluginsPath):
    """
    A new node joins the pool while transactions keep arriving. Its
    CatchupReqs carry a `catchupTill` that must not exceed the responding
    node's ledger size, and until catchup ends the new node stashes the
    incoming ordered requests.
    """
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, 5)

    def chkAfterCall(self, req, frm):
        # Wrap processCatchupReq to verify the catchupTill bound on
        # the domain ledger.
        result = self.processCatchupReq(req, frm)
        ledger_id = getattr(req, f.LEDGER_ID.nm)
        if ledger_id == DOMAIN_LEDGER_ID:
            ledger = self.getLedgerForMsg(req)
            assert req.catchupTill <= ledger.size
        return result

    # Install the checking wrapper and slow down CatchupReq processing.
    for node in txnPoolNodeSet:
        node.nodeMsgRouter.routes[CatchupReq] = \
            types.MethodType(chkAfterCall, node.ledgerManager)
        node.nodeIbStasher.delay(cqDelay(3))

    print('Sending 5 requests')
    sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_steward, 5)
    looper.runFor(1)

    new_steward_name = randomString()
    new_node_name = "Epsilon"
    new_steward_wallet_handle, new_node = sdk_add_new_steward_and_node(
        looper, sdk_pool_handle, sdk_wallet_steward,
        new_steward_name, new_node_name, tdir, tconf,
        nodeClass=testNodeClass, allPluginsPath=allPluginsPath,
        autoStart=True)
    sdk_pool_refresh(looper, sdk_pool_handle)
    txnPoolNodeSet.append(new_node)
    looper.runFor(2)

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, 5)
    # TODO select or create a timeout for this case in 'waits'
    looper.run(
        eventually(checkNodeDataForEquality, new_node, *txnPoolNodeSet[:-1],
                   retryWait=1, timeout=80))
    assert new_node.spylog.count(TestNode.processStashedOrderedReqs) > 0
def nodeStashingOrderedRequests(txnPoolNodeSet, nodeCreatedAfterSomeTxns):
    """
    Fixture: slow down CatchupReq handling on all nodes so that a newly
    created node stashes the requests ordered while it is still catching up,
    then wait until stashing is observed.
    """
    looper, newNode, client, wallet, _, _ = nodeCreatedAfterSomeTxns

    # Slow catchup on the whole pool so the new node stays out of
    # `participating` mode for a while.
    for pool_node in txnPoolNodeSet:
        pool_node.nodeIbStasher.delay(cqDelay(5))
    txnPoolNodeSet.append(newNode)

    ensureClientConnectedToNodesAndPoolLedgerSame(looper, client,
                                                  *txnPoolNodeSet[:-1])
    sendRandomRequests(wallet, client, 10)
    looper.run(checkNodesConnected(txnPoolNodeSet))

    def stashing():
        assert newNode.mode != Mode.participating
        assert len(newNode.stashedOrderedReqs) > 0
        # assert len(newNode.reqsFromCatchupReplies) > 0

    timeout = waits.expectedTransactionExecutionTime(len(txnPoolNodeSet))
    looper.run(eventually(stashing, retryWait=1, timeout=timeout))
def nodeStashingOrderedRequests(txnPoolNodeSet,
                                sdk_node_created_after_some_txns):
    """
    Fixture (sdk variant): delay CatchupReq handling on all nodes so the
    freshly added node stashes ordered requests while catching up, then wait
    until that stashing is observed.
    """
    looper, new_node, sdk_pool_handle, new_steward_wallet_handle = \
        sdk_node_created_after_some_txns

    # Keep the new node out of `participating` mode by slowing catchup.
    for pool_node in txnPoolNodeSet:
        pool_node.nodeIbStasher.delay(cqDelay(5))
    txnPoolNodeSet.append(new_node)

    sdk_ensure_pool_functional(looper, txnPoolNodeSet,
                               new_steward_wallet_handle, sdk_pool_handle)
    sdk_send_random_requests(looper, sdk_pool_handle,
                             new_steward_wallet_handle, 10)
    looper.run(checkNodesConnected(txnPoolNodeSet))

    def stashing():
        assert new_node.mode != Mode.participating
        assert len(new_node.stashedOrderedReqs) > 0
        # assert len(new_node.reqsFromCatchupReplies) > 0

    timeout = waits.expectedTransactionExecutionTime(len(txnPoolNodeSet))
    looper.run(eventually(stashing, retryWait=1, timeout=timeout))
def testNewNodeCatchupWhileIncomingRequests(looper, txnPoolNodeSet,
                                            testNodeClass, tdir, tconf,
                                            sdk_pool_handle,
                                            sdk_wallet_steward,
                                            allPluginsPath):
    """
    A new node joins while transactions are flowing. Each of its CatchupReqs
    must ask to catch up no further than the responder's ledger size, and the
    new node must stash incoming ordered requests until catchup completes.
    """
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, 5)

    def chkAfterCall(self, req, frm):
        # Delegate to the real handler, then check the catchupTill
        # bound for the domain ledger.
        result = self.processCatchupReq(req, frm)
        ledger_id = getattr(req, f.LEDGER_ID.nm)
        if ledger_id == DOMAIN_LEDGER_ID:
            ledger = self.getLedgerForMsg(req)
            assert req.catchupTill <= ledger.size
        return result

    # Route CatchupReq through the checking wrapper and slow its processing.
    for node in txnPoolNodeSet:
        node.nodeMsgRouter.routes[CatchupReq] = \
            types.MethodType(chkAfterCall, node.ledgerManager)
        node.nodeIbStasher.delay(cqDelay(3))

    print('Sending 5 requests')
    sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_steward, 5)
    looper.runFor(1)

    new_steward_name = randomString()
    new_node_name = "Epsilon"
    new_steward_wallet_handle, new_node = sdk_add_new_steward_and_node(
        looper, sdk_pool_handle, sdk_wallet_steward,
        new_steward_name, new_node_name, tdir, tconf,
        nodeClass=testNodeClass, allPluginsPath=allPluginsPath,
        autoStart=True)
    sdk_pool_refresh(looper, sdk_pool_handle)
    txnPoolNodeSet.append(new_node)
    looper.runFor(2)

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, 5)
    # TODO select or create a timeout for this case in 'waits'
    looper.run(eventually(checkNodeDataForEquality, new_node,
                          *txnPoolNodeSet[:-1],
                          retryWait=1, timeout=150))
    assert new_node.spylog.count(TestNode.processStashedOrderedReqs) > 0
def test_slow_node_reverts_unordered_state_during_catchup(
        looper, txnPoolNodeSet, client1, wallet1, client1Connected):
    """
    Delay COMMITs to a node such that when it needs to catchup, it needs to
    revert some unordered state. Also till this time the node should have
    received all COMMITs such that it will apply some of the COMMITs (for
    which it has not received txns from catchup).
    For this, delay COMMITs by long, do catchup for a little older than the
    state received in LedgerStatus; once catchup completes, reset delays and
    try to process delayed COMMITs — some COMMITs will be rejected but some
    will be processed since catchup was done for an older ledger.
    """
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1,
                                        3 * Max3PCBatchSize)
    nprs = getNonPrimaryReplicas(txnPoolNodeSet, 0)
    slow_node = nprs[-1].node
    other_nodes = [n for n in txnPoolNodeSet if n != slow_node]
    slow_master_replica = slow_node.master_replica

    commit_delay = 150
    catchup_req_delay = 15

    # Delay COMMITs to one node
    slow_node.nodeIbStasher.delay(cDelay(commit_delay, 0))

    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1,
                                        6 * Max3PCBatchSize)
    waitNodeDataInequality(looper, slow_node, *other_nodes)

    # Make the slow node receive txns for a smaller ledger so it still finds
    # the need to catchup
    delay_batches = 2
    make_a_node_catchup_twice(slow_node, other_nodes, DOMAIN_LEDGER_ID,
                              delay_batches * Max3PCBatchSize)

    def is_catchup_needed_count():
        return len(getAllReturnVals(slow_node, slow_node.is_catchup_needed,
                                    compare_val_to=True))

    old_lcu_count = slow_node.spylog.count(slow_node.allLedgersCaughtUp)
    old_cn_count = is_catchup_needed_count()

    # Other nodes are slow to respond to CatchupReq, so that `slow_node`
    # gets a chance to order COMMITs
    for n in other_nodes:
        n.nodeIbStasher.delay(cqDelay(catchup_req_delay))

    ensure_view_change(looper, txnPoolNodeSet)

    # Check last ordered of `other_nodes` is same
    for n1, n2 in combinations(other_nodes, 2):
        lst_3pc = check_last_ordered_3pc(n1, n2)

    def chk1():
        # `slow_node` has prepared all 3PC messages which
        # `other_nodes` have ordered
        assert slow_master_replica.last_prepared_before_view_change == lst_3pc

    looper.run(eventually(chk1, retryWait=1))

    old_pc_count = slow_master_replica.spylog.count(
        slow_master_replica.can_process_since_view_change_in_progress)

    # Repair the network so COMMITs are delayed and processed
    slow_node.resetDelays()
    slow_node.force_process_delayeds()

    def chk2():
        # COMMITs are processed for prepared messages
        assert slow_master_replica.spylog.count(
            slow_master_replica.can_process_since_view_change_in_progress
        ) > old_pc_count

    looper.run(eventually(chk2, retryWait=1, timeout=5))

    def chk3():
        # Some COMMITs were ordered but stashed and they were processed
        rv = getAllReturnVals(slow_node, slow_node.processStashedOrderedReqs)
        assert rv[0] == delay_batches

    looper.run(eventually(chk3, retryWait=1, timeout=catchup_req_delay + 5))

    def chk4():
        # Catchup was done once
        assert slow_node.spylog.count(
            slow_node.allLedgersCaughtUp) > old_lcu_count

    looper.run(
        eventually(chk4, retryWait=1,
                   timeout=waits.expectedPoolCatchupTime(len(txnPoolNodeSet))))

    def chk5():
        # Once catchup was done, need of other catchup was not found
        assert is_catchup_needed_count() == old_cn_count

    looper.run(eventually(chk5, retryWait=1, timeout=5))

    checkProtocolInstanceSetup(looper, txnPoolNodeSet, retryWait=1)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)

    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1,
                                        2 * Max3PCBatchSize)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)