def txnPoolNodeSet(txnPoolNodeSet, looper, client1, wallet1, client1Connected,
                   tconf, tdirWithPoolTxns, allPluginsPath):
    logger.debug("Do several view changes to round the list of primaries")

    assert txnPoolNodeSet[0].viewNo == len(txnPoolNodeSet) - 1

    logger.debug(
        "Do view change to reach viewNo {}".format(txnPoolNodeSet[0].viewNo + 1))
    ensure_view_change_complete(looper, txnPoolNodeSet)
    logger.debug("Send requests to ensure that pool is working properly, "
                 "viewNo: {}".format(txnPoolNodeSet[0].viewNo))
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, numReqs=3)

    logger.debug("Pool is ready, current viewNo: {}".format(
        txnPoolNodeSet[0].viewNo))

    # TODO find out and fix why an additional view change could happen
    # because of a degraded master. It's critical for the current test that
    # the view change is completed by the time the new node is joining.
    # Thus, disable the master degradation check as it won't impact the case
    # and guarantees the necessary state.
    for node in txnPoolNodeSet:
        node.monitor.isMasterDegraded = lambda: False

    return txnPoolNodeSet

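# An alternative to overriding `isMasterDegraded` with a bare lambda is to
# rebind it as a bound method via `types.MethodType`, the way
# testViewChangeCase1 further down patches a single node's monitor. A hedged
# sketch (assumes the monitor calls isMasterDegraded with no extra arguments):
#
# import types
# for node in txnPoolNodeSet:
#     node.monitor.isMasterDegraded = types.MethodType(
#         lambda monitor: False, node.monitor)
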
def testZStackNodeReconnection(tconf, looper, txnPoolNodeSet, client1, wallet1,
                               tdirWithPoolTxns, client1Connected):
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 1)

    npr = [n for n in txnPoolNodeSet if not n.hasPrimary]
    nodeToCrash = npr[0]
    idxToCrash = txnPoolNodeSet.index(nodeToCrash)
    otherNodes = [_ for _ in txnPoolNodeSet if _ != nodeToCrash]

    def checkFlakyConnected(conn=True):
        for node in otherNodes:
            if conn:
                assert nodeToCrash.nodestack.name in node.nodestack.connecteds
            else:
                assert nodeToCrash.nodestack.name not in node.nodestack.connecteds

    checkFlakyConnected(True)
    nodeToCrash.stop()
    looper.removeProdable(nodeToCrash)
    looper.runFor(1)
    looper.run(eventually(checkFlakyConnected, False, retryWait=1, timeout=35))
    looper.runFor(1)

    node = TestNode(nodeToCrash.name,
                    basedirpath=tdirWithPoolTxns,
                    config=tconf,
                    ha=nodeToCrash.nodestack.ha,
                    cliha=nodeToCrash.clientstack.ha)
    looper.add(node)
    txnPoolNodeSet[idxToCrash] = node
    looper.run(eventually(checkFlakyConnected, True, retryWait=2, timeout=50))
    ensureElectionsDone(looper, txnPoolNodeSet, retryWait=2, timeout=50)
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 1)
    checkNodesSendingCommits(txnPoolNodeSet)

def test_node_load_after_add(newNodeCaughtUp, txnPoolNodeSet, tconf,
                             tdirWithPoolTxns, allPluginsPath,
                             poolTxnStewardData, looper, client1, wallet1,
                             client1Connected, capsys):
    """
    A newly added node should keep up with the pool while a sustained load of
    client transactions is processed
    :return:
    """
    new_node = newNodeCaughtUp
    logger.debug("Sending requests")

    # Here's where we apply some load
    client_batches = 300
    txns_per_batch = 25
    for i in range(client_batches):
        s = perf_counter()
        sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1,
                                            txns_per_batch,
                                            override_timeout_limit=True)
        with capsys.disabled():
            print('{} executed {} client txns in {:.2f} seconds'.format(
                i + 1, txns_per_batch, perf_counter() - s))

    logger.debug("Starting the stopped node, {}".format(new_node))
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 5)
    waitNodeDataEquality(looper, new_node, *txnPoolNodeSet[:4])

def testNodeCatchupAfterRestart(newNodeCaughtUp, txnPoolNodeSet,
                                nodeSetWithNodeAddedAfterSomeTxns):
    """
    A node that restarts after some transactions should eventually get the
    transactions which happened while it was down
    :return:
    """
    looper, newNode, client, wallet, _, _ = nodeSetWithNodeAddedAfterSomeTxns
    logger.debug("Stopping node {} with pool ledger size {}".format(
        newNode, newNode.poolManager.txnSeqNo))
    ensureNodeDisconnectedFromPool(looper, txnPoolNodeSet, newNode)
    # for n in txnPoolNodeSet[:4]:
    #     for r in n.nodestack.remotes.values():
    #         if r.name == newNode.name:
    #             r.removeStaleCorrespondents()
    # looper.run(eventually(checkNodeDisconnectedFrom, newNode.name,
    #                       txnPoolNodeSet[:4], retryWait=1, timeout=5))
    # TODO: Check if the node has really stopped processing requests?
    logger.debug("Sending requests")
    sendReqsToNodesAndVerifySuffReplies(looper, wallet, client, 5)
    logger.debug("Starting the stopped node, {}".format(newNode))
    newNode.start(looper.loop)
    looper.run(checkNodesConnected(txnPoolNodeSet, overrideTimeout=30))
    looper.run(eventually(checkNodeLedgersForEquality, newNode,
                          *txnPoolNodeSet[:4], retryWait=1, timeout=75))

def test_not_check_if_no_new_requests(looper: Looper, nodeSet: TestNodeSet,
                                      wallet1, client1):
    """
    Checks that a node does not do a performance check if there were no new
    requests since the previous check
    """
    # Ensure that nodes are participating, because otherwise they do not do
    # the check
    for node in list(nodeSet):
        assert node.isParticipating

    # Check that the first performance check passes, but further ones do not
    for node in list(nodeSet):
        assert node.checkPerformance()
        assert not node.checkPerformance()
        assert not node.checkPerformance()
        assert not node.checkPerformance()

    # Send a new request and check that after it nodes can do the
    # performance check again
    num_requests = 1
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1,
                                        num_requests, nodeSet.f)
    for node in list(nodeSet):
        assert node.checkPerformance()

def test_new_node_accepts_chosen_primary(
        txnPoolNodeSet, nodeSetWithNodeAddedAfterSomeTxns):
    looper, new_node, client, wallet, _, _ = nodeSetWithNodeAddedAfterSomeTxns

    logger.debug("Ensure nodes data equality, viewNo: {}".format(
        txnPoolNodeSet[0].viewNo))
    waitNodeDataEquality(looper, new_node, *txnPoolNodeSet[:-1])

    # here we must have view_no = 4
    #  - current primary is Alpha (based on the node registry before the new
    #    node joined)
    #  - but the new node expects itself as primary, basing on the updated
    #    node registry
    # -> the new node doesn't verify the current primary
    assert not new_node.view_changer._primary_verified
    # -> the new node hasn't received ViewChangeDone from the expected primary
    #    (a self VCHD message is registered when the node sends it, which is
    #    not the case for the primary propagate logic)
    assert not new_node.view_changer.has_view_change_from_primary
    # -> BUT the new node understands that no view change actually happens
    assert new_node.view_changer._is_propagated_view_change_completed

    logger.debug("Send requests to ensure that pool is working properly, "
                 "viewNo: {}".format(txnPoolNodeSet[0].viewNo))
    sendReqsToNodesAndVerifySuffReplies(looper, wallet, client, numReqs=3)

    logger.debug("Ensure nodes data equality, viewNo: {}".format(
        txnPoolNodeSet[0].viewNo))
    waitNodeDataEquality(looper, new_node, *txnPoolNodeSet[:-1])

def testNodeCatchupAfterLostConnection(newNodeCaughtUp, txnPoolNodeSet,
                                       nodeSetWithNodeAddedAfterSomeTxns):
    """
    A node with a poor internet connection that got out of sync after some
    transactions should eventually get the transactions which happened while
    it was not accessible
    :return:
    """
    looper, newNode, client, wallet, _, _ = nodeSetWithNodeAddedAfterSomeTxns
    logger.debug("Disconnecting node {}, ledger size {}".
                 format(newNode, newNode.domainLedger.size))
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, newNode,
                                            stopNode=False)
    looper.removeProdable(newNode)

    # TODO: Check if the node has really stopped processing requests?
    logger.debug("Sending requests")
    sendReqsToNodesAndVerifySuffReplies(looper, wallet, client, 5)
    # Make sure new node got out of sync
    waitNodeDataInequality(looper, newNode, *txnPoolNodeSet[:-1])

    # logger.debug("Ensure node {} gets disconnected".format(newNode))
    ensure_node_disconnected(looper, newNode, txnPoolNodeSet[:-1])

    logger.debug("Connecting the node {} back, ledger size {}".
                 format(newNode, newNode.domainLedger.size))
    looper.add(newNode)

    logger.debug("Waiting for the node to catch up, {}".format(newNode))
    waitNodeDataEquality(looper, newNode, *txnPoolNodeSet[:-1])

    logger.debug("Sending more requests")
    sendReqsToNodesAndVerifySuffReplies(looper, wallet, client, 10)
    checkNodeDataForEquality(newNode, *txnPoolNodeSet[:-1])

def view_change_in_between_3pc(looper, nodes, slow_nodes, wallet, client,
                               slow_delay=1, wait=None):
    send_reqs_to_nodes_and_verify_all_replies(looper, wallet, client, 4)
    delay_3pc_messages(slow_nodes, 0, delay=slow_delay)

    sendRandomRequests(wallet, client, 10)
    if wait:
        looper.runFor(wait)

    ensure_view_change_complete(looper, nodes, customTimeout=60)

    reset_delays_and_process_delayeds(slow_nodes)

    sendReqsToNodesAndVerifySuffReplies(looper, wallet, client, 5,
                                        total_timeout=30)
    send_reqs_to_nodes_and_verify_all_replies(looper, wallet, client, 5,
                                              total_timeout=30)

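# A minimal usage sketch for the helper above (hedged: the fixture names
# looper, txnPoolNodeSet, wallet1, client1 and client1Connected are assumed to
# be the same fixtures used by the other tests in this collection; the choice
# of slow nodes and delays is only illustrative):
#
# def test_view_change_while_3pc_delayed(looper, txnPoolNodeSet, wallet1,
#                                        client1, client1Connected):
#     slow_nodes = txnPoolNodeSet[-2:]
#     view_change_in_between_3pc(looper, txnPoolNodeSet, slow_nodes,
#                                wallet1, client1, slow_delay=3, wait=2)
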
def nodeCreatedAfterSomeTxns(looper, testNodeClass, do_post_node_creation,
                             txnPoolNodeSet, tdir, tdirWithClientPoolTxns,
                             poolTxnStewardData, tconf, allPluginsPath,
                             request):
    client, wallet = buildPoolClientAndWallet(poolTxnStewardData,
                                              tdirWithClientPoolTxns,
                                              clientClass=TestClient)
    looper.add(client)
    looper.run(client.ensureConnectedToNodes())
    txnCount = getValueFromModule(request, "txnCount", 5)
    sendReqsToNodesAndVerifySuffReplies(looper, wallet, client, txnCount)
    newStewardName = randomString()
    newNodeName = "Epsilon"
    newStewardClient, newStewardWallet, newNode = addNewStewardAndNode(
        looper, client, wallet, newStewardName, newNodeName, tdir,
        tdirWithClientPoolTxns, tconf, nodeClass=testNodeClass,
        allPluginsPath=allPluginsPath, autoStart=True,
        do_post_node_creation=do_post_node_creation)
    yield looper, newNode, client, wallet, newStewardClient, \
        newStewardWallet

def testRequestsSize(txnPoolNodesLooper, txnPoolNodeSet, poolTxnClientNames,
                     tdirWithPoolTxns, poolTxnData, noRetryReq):
    """
    Client should not be using node registry but pool transaction file
    :return:
    """
    clients = []
    for name in poolTxnClientNames:
        seed = poolTxnData["seeds"][name].encode()
        client, wallet = buildPoolClientAndWallet((name, seed),
                                                  tdirWithPoolTxns)
        txnPoolNodesLooper.add(client)
        ensureClientConnectedToNodesAndPoolLedgerSame(txnPoolNodesLooper,
                                                      client, *txnPoolNodeSet)
        clients.append((client, wallet))

    n = 250
    timeOutPerReq = 3
    for (client, wallet) in clients:
        logger.debug("{} sending {} requests".format(client, n))
        sendReqsToNodesAndVerifySuffReplies(txnPoolNodesLooper, wallet, client,
                                            n, 1, timeOutPerReq)
        logger.debug("{} sent {} requests".format(client, n))
    for node in txnPoolNodeSet:
        logger.debug("{} has requests {} with size {}".
                     format(node, len(node.requests), get_size(node.requests)))
        for replica in node.replicas:
            logger.debug("{} has prepares {} with size {}".
                         format(replica, len(replica.prepares),
                                get_size(replica.prepares)))
            logger.debug("{} has commits {} with size {}".
                         format(replica, len(replica.commits),
                                get_size(replica.commits)))

def test_different_ledger_request_interleave(tconf, looper, txnPoolNodeSet,
                                             client1, wallet1, one_node_added,
                                             client1Connected, tdir,
                                             client_tdir, tdirWithPoolTxns,
                                             steward1, stewardWallet,
                                             allPluginsPath):
    """
    Send pool and domain ledger requests such that they interleave, and do
    view change in between and verify the pool is functional
    """
    new_node = one_node_added
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 2)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)

    # Send domain ledger requests but don't wait for replies
    requests = sendRandomRequests(wallet1, client1, 2)

    # Add another node by sending pool ledger request
    _, _, new_theta = nodeThetaAdded(looper, txnPoolNodeSet, tdir, client_tdir,
                                     tconf, steward1, stewardWallet,
                                     allPluginsPath, name='new_theta')

    # Send more domain ledger requests but don't wait for replies
    requests.extend(sendRandomRequests(wallet1, client1, 3))

    # Do view change without waiting for replies
    ensure_view_change(looper, nodes=txnPoolNodeSet)
    checkProtocolInstanceSetup(looper, txnPoolNodeSet, retryWait=1)

    # Make sure all requests are completed
    waitForSufficientRepliesForRequests(looper, client1, requests=requests)

    ensure_pool_functional(looper, txnPoolNodeSet, wallet1, client1)

    new_steward, new_steward_wallet = addNewSteward(looper, client_tdir,
                                                    steward1, stewardWallet,
                                                    'another_ste')
    # Send another pool ledger request (NODE) but don't wait for completion of
    # request
    next_node_name = 'next_node'
    r = sendAddNewNode(tdir, tconf, next_node_name, new_steward,
                       new_steward_wallet)
    node_req = r[0]

    # Send more domain ledger requests but don't wait for replies
    requests = [node_req, *sendRandomRequests(new_steward_wallet,
                                              new_steward, 5)]

    # Make sure all requests are completed
    waitForSufficientRepliesForRequests(looper, new_steward,
                                        requests=requests)

    # Make sure pool is functional
    ensure_pool_functional(looper, txnPoolNodeSet, wallet1, client1)

def testReplyWhenRequestSentToMoreThanFPlusOneNodes(looper, nodeSet, fClient,
                                                    replied1, wallet1):
    """
    Alpha will not be sent the request but the other nodes will be, so Alpha
    will just rely on propagates from the other nodes
    """
    alpha = nodeSet.Alpha
    other_nodes = [n for n in nodeSet if n != alpha]

    def chk(req_count=1):
        for node in nodeSet:
            prc_req = node.processRequest.__name__
            prc_ppg = node.processPropagate.__name__
            if node != alpha:
                # All nodes except alpha will receive requests from client
                assert node.spylog.count(prc_req) == req_count
            else:
                # Alpha will not receive requests from client
                assert node.spylog.count(prc_req) == 0

            # All nodes will get propagates from others
            assert node.spylog.count(prc_ppg) == req_count * (nodeCount - 1)

    # Ledger is same for all nodes
    waitNodeDataEquality(looper, alpha, *other_nodes)
    chk(1)

    more_reqs_count = 5
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, fClient,
                                        more_reqs_count, 1)
    # Ledger is same for all nodes
    waitNodeDataEquality(looper, alpha, *other_nodes)
    chk(6)  # Since one request is already sent as part of `replied1`

def testRequestsSize(txnPoolNodesLooper, txnPoolNodeSet, poolTxnClientNames,
                     tdirWithPoolTxns, poolTxnData, noRetryReq):
    """
    Client should not be using node registry but pool transaction file
    :return:
    """
    clients = []
    for name in poolTxnClientNames:
        seed = poolTxnData["seeds"][name].encode()
        client, wallet = buildPoolClientAndWallet((name, seed),
                                                  tdirWithPoolTxns)
        txnPoolNodesLooper.add(client)
        ensureClientConnectedToNodesAndPoolLedgerSame(txnPoolNodesLooper,
                                                      client, *txnPoolNodeSet)
        clients.append((client, wallet))

    n = 250
    timeOutPerReq = 3
    for (client, wallet) in clients:
        logger.debug("{} sending {} requests".format(client, n))
        sendReqsToNodesAndVerifySuffReplies(txnPoolNodesLooper, wallet, client,
                                            n, 1, timeOutPerReq)
        logger.debug("{} sent {} requests".format(client, n))
    for node in txnPoolNodeSet:
        logger.debug("{} has requests {} with size {}".format(
            node, len(node.requests), get_size(node.requests)))
        for replica in node.replicas:
            logger.debug("{} has prepares {} with size {}".format(
                replica, len(replica.prepares), get_size(replica.prepares)))
            logger.debug("{} has commits {} with size {}".format(
                replica, len(replica.commits), get_size(replica.commits)))

def test_new_node_accepts_timestamp(tconf, looper, txnPoolNodeSet,
                                    nodeSetWithNodeAddedAfterSomeTxns,
                                    client1, wallet1, client1Connected):
    """
    A new node joins the pool and is able to function properly without
    raising any timestamp-related suspicions
    """
    _, new_node, _, _, _, _ = nodeSetWithNodeAddedAfterSomeTxns
    old_susp_count = get_timestamp_suspicion_count(new_node)
    # Don't wait for node to catchup, start sending requests
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 10)
    waitNodeDataEquality(looper, new_node, *txnPoolNodeSet[:-1])

    # No suspicions were raised by new_node
    assert get_timestamp_suspicion_count(new_node) == old_susp_count

    # All nodes should reply
    send_reqs_to_nodes_and_verify_all_replies(looper, wallet1, client1,
                                              Max3PCBatchSize * 3)
    # No suspicions were raised by new_node
    assert get_timestamp_suspicion_count(new_node) == old_susp_count

    suspicions = {node.name: get_timestamp_suspicion_count(node)
                  for node in txnPoolNodeSet}
    ensure_view_change(looper, txnPoolNodeSet)
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
    send_reqs_to_nodes_and_verify_all_replies(looper, wallet1, client1,
                                              Max3PCBatchSize * 3)
    for node in txnPoolNodeSet:
        assert suspicions[node.name] == get_timestamp_suspicion_count(node)

def test_node_load_after_one_node_drops_all_msgs(looper, txnPoolNodeSet,
                                                 tconf, tdirWithPoolTxns,
                                                 allPluginsPath,
                                                 poolTxnStewardData, capsys):
    client, wallet = buildPoolClientAndWallet(poolTxnStewardData,
                                              tdirWithPoolTxns,
                                              clientClass=TestClient)
    looper.add(client)
    looper.run(client.ensureConnectedToNodes())

    nodes = txnPoolNodeSet
    x = nodes[-1]

    with capsys.disabled():
        print("Patching node {}".format(x))

    def handleOneNodeMsg(self, wrappedMsg):
        # do nothing with an incoming node message
        pass

    x.handleOneNodeMsg = MethodType(handleOneNodeMsg, x)

    client_batches = 120
    txns_per_batch = 25
    for i in range(client_batches):
        s = perf_counter()
        sendReqsToNodesAndVerifySuffReplies(looper, wallet, client,
                                            txns_per_batch,
                                            override_timeout_limit=True)
        with capsys.disabled():
            print('{} executed {} client txns in {:.2f} seconds'.format(
                i + 1, txns_per_batch, perf_counter() - s))

def testNodeCatchupAfterDisconnect(newNodeCaughtUp, txnPoolNodeSet,
                                   nodeSetWithNodeAddedAfterSomeTxns):
    """
    A node that disconnects after some transactions should eventually get the
    transactions which happened while it was disconnected
    :return:
    """
    looper, newNode, client, wallet, _, _ = nodeSetWithNodeAddedAfterSomeTxns
    logger.debug("Stopping node {} with pool ledger size {}".format(
        newNode, newNode.poolManager.txnSeqNo))
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, newNode,
                                            stopNode=False)
    looper.removeProdable(newNode)

    # TODO: Check if the node has really stopped processing requests?
    logger.debug("Sending requests")
    sendReqsToNodesAndVerifySuffReplies(looper, wallet, client, 5)
    # Make sure new node got out of sync
    waitNodeDataInequality(looper, newNode, *txnPoolNodeSet[:-1])

    logger.debug("Starting the stopped node, {}".format(newNode))
    looper.add(newNode)
    reconnect_node_and_ensure_connected(looper, txnPoolNodeSet, newNode)

    logger.debug("Waiting for the node to catch up, {}".format(newNode))
    waitNodeDataEquality(looper, newNode, *txnPoolNodeSet[:-1])

    logger.debug("Sending more requests")
    sendReqsToNodesAndVerifySuffReplies(looper, wallet, client, 10)
    checkNodeDataForEquality(newNode, *txnPoolNodeSet[:-1])

def nodeCreatedAfterSomeTxns(txnPoolNodeSet, tdirWithPoolTxns,
                             poolTxnStewardData, tconf, allPluginsPath,
                             request):
    with Looper(debug=True) as looper:
        client, wallet = buildPoolClientAndWallet(poolTxnStewardData,
                                                  tdirWithPoolTxns,
                                                  clientClass=TestClient)
        looper.add(client)
        looper.run(client.ensureConnectedToNodes())
        txnCount = getValueFromModule(request, "txnCount", 5)
        sendReqsToNodesAndVerifySuffReplies(looper, wallet, client, txnCount,
                                            timeoutPerReq=25)
        newStewardName = randomString()
        newNodeName = "Epsilon"
        newStewardClient, newStewardWallet, newNode = addNewStewardAndNode(
            looper, client, wallet, newStewardName, newNodeName,
            tdirWithPoolTxns, tconf, allPluginsPath=allPluginsPath,
            autoStart=True)
        yield looper, newNode, client, wallet, newStewardClient, \
            newStewardWallet

def test_observer_execution(looper, txnPoolNodeSet, client1, wallet1):
    resp1 = []
    resp2 = []

    def callable1(name, reqId, frm, result, numReplies):
        resp1.append(reqId)
        return reqId

    def callable2(name, reqId, frm, result, numReplies):
        resp2.append(reqId)
        return reqId

    client1.registerObserver(callable1, name='first')
    client1.registerObserver(callable2)

    # Send 1 request
    req, = sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 1)

    # Each observer is called only once
    assert len(resp1) == 1
    assert len(resp2) == 1
    assert resp1[0] == req.reqId
    assert resp2[0] == req.reqId

    client1.deregisterObserver('first')

    # Send another request
    req1, = sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 1)

    # Only 1 observer is called
    assert len(resp1) == 1
    assert len(resp2) == 2
    assert resp1[-1] == req.reqId
    assert resp2[-1] == req1.reqId

def testNodeCatchupAfterRestart(newNodeCaughtUp, txnPoolNodeSet,
                                nodeSetWithNodeAddedAfterSomeTxns):
    """
    A node that restarts after some transactions should eventually get the
    transactions which happened while it was down
    :return:
    """
    looper, newNode, client, wallet, _, _ = nodeSetWithNodeAddedAfterSomeTxns
    logger.debug("Stopping node {} with pool ledger size {}".
                 format(newNode, newNode.poolManager.txnSeqNo))
    ensureNodeDisconnectedFromPool(looper, txnPoolNodeSet, newNode)
    # for n in txnPoolNodeSet[:4]:
    #     for r in n.nodestack.remotes.values():
    #         if r.name == newNode.name:
    #             r.removeStaleCorrespondents()
    # looper.run(eventually(checkNodeDisconnectedFrom, newNode.name,
    #                       txnPoolNodeSet[:4], retryWait=1, timeout=5))
    # TODO: Check if the node has really stopped processing requests?
    logger.debug("Sending requests")
    sendReqsToNodesAndVerifySuffReplies(looper, wallet, client, 5)
    logger.debug("Starting the stopped node, {}".format(newNode))
    newNode.start(looper.loop)
    looper.run(checkNodesConnected(txnPoolNodeSet))
    looper.run(eventually(checkNodeLedgersForEquality, newNode,
                          *txnPoolNodeSet[:4], retryWait=1, timeout=15))

def testQueueingReqFromFutureView(delayedPerf, looper, nodeSet, up, client1):
    """
    Test if every node queues 3-phase requests (PRE-PREPARE, PREPARE and
    COMMIT) that come from a view which is greater than the current view
    """
    f = getMaxFailures(nodeCount)

    # Delay processing of instance change on a node
    nodeA = nodeSet.Alpha
    nodeA.nodeIbStasher.delay(icDelay(60))

    nonPrimReps = getNonPrimaryReplicas(nodeSet, 0)
    # Delay processing of PRE-PREPARE from all non primary replicas of master
    # so master's throughput falls and view changes
    ppDelayer = ppDelay(5, 0)
    for r in nonPrimReps:
        r.node.nodeIbStasher.delay(ppDelayer)

    sendReqsToNodesAndVerifySuffReplies(looper, client1, 4,
                                        timeout=5 * nodeCount)

    # Every node except Node A should have a view change
    for node in nodeSet:
        if node.name != nodeA.name:
            looper.run(eventually(
                partial(checkViewChangeInitiatedForNode, node, 0),
                retryWait=1, timeout=20))

    # Node A's view should not have changed yet
    with pytest.raises(AssertionError):
        looper.run(eventually(partial(
            checkViewChangeInitiatedForNode, nodeA, 0),
            retryWait=1, timeout=20))

    # NodeA should not have any pending 3 phase request for a later view
    for r in nodeA.replicas:  # type: TestReplica
        assert len(r.threePhaseMsgsForLaterView) == 0

    # Reset delays on incoming messages from all nodes
    for node in nodeSet:
        node.nodeIbStasher.nodelay(ppDelayer)

    # Send one more request
    sendRandomRequest(client1)

    def checkPending3PhaseReqs():
        # Get all replicas that have their primary status decided
        reps = [rep for rep in nodeA.replicas if rep.isPrimary is not None]
        # At least one replica should have its primary status decided
        assert len(reps) > 0
        for r in reps:  # type: TestReplica
            logging.debug("primary status for replica {} is {}"
                          .format(r, r.primaryNames))
            assert len(r.threePhaseMsgsForLaterView) > 0

    # NodeA should now have pending 3 phase requests for a later view
    looper.run(eventually(checkPending3PhaseReqs, retryWait=1, timeout=30))

def test_master_primary_different_from_previous(txnPoolNodeSet, looper,
                                                client1, wallet1,
                                                client1Connected):
    """
    After a view change, the primary of the master instance must be different
    from the previous primary; for other instances it does not matter.
    The primary is benign and does not vote for itself.
    """
    pr = slow_primary(txnPoolNodeSet, 0, delay=10)
    old_pr_node_name = pr.node.name

    # View change happens
    ensure_view_change(looper, txnPoolNodeSet)
    logger.debug("VIEW HAS BEEN CHANGED!")

    # Elections done
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)

    # New primary is not same as old primary
    assert getPrimaryReplica(txnPoolNodeSet, 0).node.name != old_pr_node_name

    pr.outBoxTestStasher.resetDelays()

    # The new primary can still process requests
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 5)

def testNodeDoesNotParticipateUntilCaughtUp(txnPoolNodeSet,
                                            nodes_slow_to_process_catchup_reqs,
                                            nodeCreatedAfterSomeTxns):
    """
    A new node that joins after some transactions should stash new
    transactions until it has caught up
    :return:
    """
    looper, new_node, client, wallet, newStewardClient, newStewardWallet = \
        nodeCreatedAfterSomeTxns
    txnPoolNodeSet.append(new_node)
    old_nodes = txnPoolNodeSet[:-1]
    sendReqsToNodesAndVerifySuffReplies(looper, wallet, client, 5)
    chk_commits_prepares_recvd(0, old_nodes, new_node)

    for node in old_nodes:
        node.reset_delays_and_process_delayeds()

    timeout = waits.expectedPoolCatchupTime(len(txnPoolNodeSet)) + \
        catchup_delay + \
        waits.expectedPoolElectionTimeout(len(txnPoolNodeSet))
    ensureElectionsDone(looper, txnPoolNodeSet, customTimeout=timeout)
    waitNodeDataEquality(looper, new_node, *old_nodes)

    sendReqsToNodesAndVerifySuffReplies(looper, wallet, client, 2)

    # Commits and Prepares are received by all old nodes
    with pytest.raises(AssertionError):
        # Since nodes discard 3PC messages for already ordered requests.
        chk_commits_prepares_recvd(0, old_nodes, new_node)
    waitNodeDataEquality(looper, new_node, *old_nodes)

def testPrimaryRecvs3PhaseMessageOutsideWatermarks(chkFreqPatched, looper,
                                                   txnPoolNodeSet, client1,
                                                   wallet1, client1Connected):
    """
    One of the primaries starts getting a lot of requests, more than its log
    size, and queues them up since they go beyond its watermarks. This happens
    because other nodes are slow in processing its PRE-PREPAREs. Eventually
    this primary will send PRE-PREPAREs for all requests and those requests
    will complete
    """
    instId = 1
    reqsToSend = 2 * chkFreqPatched.LOG_SIZE + 1
    npr = getNonPrimaryReplicas(txnPoolNodeSet, instId)
    pr = getPrimaryReplica(txnPoolNodeSet, instId)
    from plenum.server.replica import TPCStat
    orderedCount = pr.stats.get(TPCStat.OrderSent)

    for r in npr:
        r.node.nodeIbStasher.delay(ppDelay(10, instId))

    def chk():
        assert orderedCount + reqsToSend == pr.stats.get(TPCStat.OrderSent)

    print('Sending {} requests'.format(reqsToSend))
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, reqsToSend,
                                        1)
    looper.run(eventually(chk, retryWait=1, timeout=80))

def test_catch_up_after_demoted(txnPoolNodeSet,
                                nodeSetWithNodeAddedAfterSomeTxns):
    logger.info(
        "1. add a new node after sending some txns and check that catch-up "
        "is done (the new node is up to date)")
    looper, newNode, client, wallet, newStewardClient, \
        newStewardWallet = nodeSetWithNodeAddedAfterSomeTxns
    waitNodeDataEquality(looper, newNode, *txnPoolNodeSet[:4])

    logger.info("2. turn the new node off (demote)")
    node_data = {ALIAS: newNode.name, SERVICES: []}
    updateNodeData(looper, newStewardClient, newStewardWallet, newNode,
                   node_data)

    logger.info("3. send more requests, "
                "so that the new node's state is outdated")
    sendReqsToNodesAndVerifySuffReplies(looper, wallet, client, 5)
    checkNodeDataForInequality(newNode, *txnPoolNodeSet[:-1])

    logger.info("4. turn the new node on")
    node_data = {ALIAS: newNode.name, SERVICES: [VALIDATOR]}
    updateNodeData(looper, newStewardClient, newStewardWallet, newNode,
                   node_data)

    logger.info("5. make sure catch-up is done "
                "(the new node is up to date again)")
    waitNodeDataEquality(looper, newNode, *txnPoolNodeSet[:-1])

    logger.info("6. send more requests and make sure "
                "that the new node participates in processing them")
    sendReqsToNodesAndVerifySuffReplies(looper, wallet, client, 10)
    waitNodeDataEquality(looper, newNode, *txnPoolNodeSet[:-1])

def testCheckpointCreated(chkFreqPatched, looper, txnPoolNodeSet, client1,
                          wallet1, client1Connected, reqs_for_checkpoint):
    """
    After requests less than `CHK_FREQ`, there should be one checkpoint
    on each replica. After `CHK_FREQ`, one checkpoint should become stable
    """
    # Send one batch less so checkpoint is not created
    sendReqsToNodesAndVerifySuffReplies(
        looper, wallet1, client1,
        reqs_for_checkpoint - chkFreqPatched.Max3PCBatchSize, 1)

    # Deliberately waiting so as to verify that not more than 1 checkpoint is
    # created
    looper.runFor(2)
    chkChkpoints(txnPoolNodeSet, 1)

    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1,
                                        chkFreqPatched.Max3PCBatchSize, 1)

    timeout = waits.expectedTransactionExecutionTime(len(txnPoolNodeSet))
    looper.run(eventually(chkChkpoints, txnPoolNodeSet, 1, 0, retryWait=1,
                          timeout=timeout))

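# Note on the arithmetic above (an assumption, not stated in this file): the
# `reqs_for_checkpoint` fixture is expected to equal
# CHK_FREQ * Max3PCBatchSize, i.e. the number of client requests that fill
# exactly CHK_FREQ 3PC batches. Sending `reqs_for_checkpoint -
# Max3PCBatchSize` requests therefore orders one batch short of a full
# checkpoint interval, so no checkpoint can become stable yet, and the
# follow-up batch of Max3PCBatchSize requests completes the interval.
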
def testElectionsAfterViewChange(delayedPerf, looper: Looper,
                                 nodeSet: TestNodeSet, up, wallet1, client1):
    """
    Test that a primary election does happen after a view change
    """

    # Delay processing of PRE-PREPARE from all non primary replicas of master
    # so master's throughput falls and view changes
    nonPrimReps = getNonPrimaryReplicas(nodeSet, 0)
    for r in nonPrimReps:
        r.node.nodeIbStasher.delay(ppDelay(10, 0))

    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 4)

    # Ensure view change happened for both node and its primary elector
    for node in nodeSet:
        looper.run(eventually(partial(checkViewChangeInitiatedForNode, node,
                                      1), retryWait=1, timeout=20))

    # Ensure elections are done again and pool is setup again with appropriate
    # protocol instances and each protocol instance is setup properly too
    checkProtocolInstanceSetup(looper, nodeSet, retryWait=1, timeout=30)

def setup(looper, tconf, startedNodes, up, wallet1, client1):
    # Get the master replica of the master protocol instance
    P = getPrimaryReplica(startedNodes)

    # Make `Delta` small enough so throughput check passes.
    for node in startedNodes:
        node.monitor.Delta = .001

    # set LAMBDA not as huge as it is set in the production config
    testLambda = 30
    for node in startedNodes:
        node.monitor.Lambda = testLambda

    slowed_request = False

    # make P (primary replica on master) faulty, i.e., slow to send
    # PRE-PREPARE for a specific client request only
    def specificPrePrepare(msg):
        nonlocal slowed_request
        if isinstance(msg, PrePrepare) and slowed_request is False:
            slowed_request = True
            return testLambda + 5  # just more than LAMBDA

    P.outBoxTestStasher.delay(specificPrePrepare)

    # TODO select or create a timeout for this case in 'waits'
    sendReqsToNodesAndVerifySuffReplies(
        looper, wallet1, client1, numReqs=5,
        customTimeoutPerReq=tconf.TestRunningTimeLimitSec)

    return adict(nodes=startedNodes)

def setup(looper, startedNodes, up, wallet1, client1):
    # Get the master replica of the master protocol instance
    P = getPrimaryReplica(startedNodes)

    # Make `Delta` small enough so throughput check passes.
    for node in startedNodes:
        node.monitor.Delta = .001

    slowRequest = None

    # make P (primary replica on master) faulty, i.e., slow to send
    # PRE-PREPARE for a specific client request only
    def by65SpecificPrePrepare(msg):
        nonlocal slowRequest
        if isinstance(msg, PrePrepare) and slowRequest is None:
            slowRequest = getattr(msg, f.REQ_ID.nm)
            return 65

    P.outBoxTestStasher.delay(by65SpecificPrePrepare)

    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, numReqs=5,
                                        timeoutPerReq=80)

    return adict(nodes=startedNodes)

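# The stasher-based delays above rely on a simple convention (inferred from
# their usage in this collection, so treat it as an assumption rather than a
# spec): the callable passed to `delay()` receives an outgoing message and
# returns the number of seconds to hold it back, or None to let it through
# immediately. A minimal illustrative sketch of such a predicate:
def delay_first_preprepare_sketch(seconds=30):
    delayed = False

    def predicate(msg):
        nonlocal delayed
        # hold back only the first PrePrepare; everything else passes through
        if isinstance(msg, PrePrepare) and not delayed:
            delayed = True
            return seconds

    return predicate
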
def testViewChangesIfMasterPrimaryDisconnected(txnPoolNodeSet, looper,
                                               wallet1, client1,
                                               client1Connected, tconf):
    """
    View change occurs when master's primary is disconnected
    """

    # Setup
    nodes = txnPoolNodeSet

    viewNoBefore = checkViewNoForNodes(nodes)
    old_pr_node = get_master_primary_node(nodes)

    # Stop primary
    stopNodes([old_pr_node], looper)
    looper.removeProdable(old_pr_node)
    remainingNodes = list(set(nodes) - {old_pr_node})
    # Sometimes it takes time for nodes to detect disconnection
    ensure_node_disconnected(looper, old_pr_node, remainingNodes, timeout=20)

    looper.runFor(tconf.ToleratePrimaryDisconnection + 2)

    # Give some time to detect disconnection and then verify that view has
    # changed and new primary has been elected
    waitForViewChange(looper, remainingNodes, viewNoBefore + 1)
    ensure_all_nodes_have_same_data(looper, nodes=remainingNodes)
    new_pr_node = get_master_primary_node(remainingNodes)
    assert old_pr_node != new_pr_node

    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 5)

def testOldCheckpointDeleted(chkFreqPatched, looper, txnPoolNodeSet, client1,
                             wallet1, client1Connected, reqs_for_checkpoint):
    """
    Send more than twice `CHK_FREQ` requests; there should be one new stable
    checkpoint on each replica. The old stable checkpoint should be removed
    """
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1,
                                        numReqs=2 * reqs_for_checkpoint,
                                        fVal=1)

    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, numReqs=1,
                                        fVal=1)

    timeout = waits.expectedTransactionExecutionTime(len(txnPoolNodeSet))
    looper.run(eventually(chkChkpoints, txnPoolNodeSet, 2, 0, retryWait=1,
                          timeout=timeout))

def test_caught_up_for_current_view_check(looper, txnPoolNodeSet, client1,
                                          wallet1, client1Connected):
    """
    One of the nodes experiences poor network and loses 3PC messages. It has
    to do multiple rounds of catchup to be caught up
    """
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1,
                                        3 * Max3PCBatchSize)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)

    nprs = getNonPrimaryReplicas(txnPoolNodeSet, 0)
    bad_node = nprs[-1].node
    other_nodes = [n for n in txnPoolNodeSet if n != bad_node]
    orig_method = bad_node.master_replica.dispatchThreePhaseMsg

    # Bad node does not process any 3 phase messages, equivalent to messages
    # being lost
    def bad_method(self, m, s):
        pass

    bad_node.master_replica.dispatchThreePhaseMsg = types.MethodType(
        bad_method, bad_node.master_replica)

    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1,
                                        6 * Max3PCBatchSize)
    waitNodeDataInequality(looper, bad_node, *other_nodes)

    # Patch all nodes to return ConsistencyProof of a smaller ledger to the
    # bad node but only once, so that the bad_node needs to do catchup again.
    make_a_node_catchup_twice(bad_node, other_nodes, DOMAIN_LEDGER_ID,
                              Max3PCBatchSize)

    def is_catchup_needed_count():
        return len(getAllReturnVals(bad_node, bad_node.is_catchup_needed,
                                    compare_val_to=True))

    def caught_up_for_current_view_count():
        return len(getAllReturnVals(bad_node,
                                    bad_node.caught_up_for_current_view,
                                    compare_val_to=True))

    old_count_1 = is_catchup_needed_count()
    old_count_2 = caught_up_for_current_view_count()
    ensure_view_change(looper, txnPoolNodeSet)
    checkProtocolInstanceSetup(looper, txnPoolNodeSet, retryWait=1)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)

    assert is_catchup_needed_count() > old_count_1
    # The bad_node caught up due to receiving sufficient ViewChangeDone
    # messages
    assert caught_up_for_current_view_count() > old_count_2

    bad_node.master_replica.dispatchThreePhaseMsg = types.MethodType(
        orig_method, bad_node.master_replica)

def changeNodeHa(looper, txnPoolNodeSet, tdirWithClientPoolTxns, poolTxnData,
                 poolTxnStewardNames, tconf, shouldBePrimary, tdir):
    # prepare new ha for node and client stack
    subjectedNode = None
    stewardName = None
    stewardsSeed = None

    for nodeIndex, n in enumerate(txnPoolNodeSet):
        if shouldBePrimary == n.has_master_primary:
            subjectedNode = n
            stewardName = poolTxnStewardNames[nodeIndex]
            stewardsSeed = poolTxnData["seeds"][stewardName].encode()
            break

    nodeStackNewHA, clientStackNewHA = genHa(2)
    logger.debug("change HA for node: {} to {}".format(
        subjectedNode.name, (nodeStackNewHA, clientStackNewHA)))

    nodeSeed = poolTxnData["seeds"][subjectedNode.name].encode()

    # change HA
    stewardClient, req = changeHA(looper, tconf, subjectedNode.name, nodeSeed,
                                  nodeStackNewHA, stewardName, stewardsSeed,
                                  basedir=tdirWithClientPoolTxns)

    waitForSufficientRepliesForRequests(looper, stewardClient, requests=[req])

    # stop node for which HA will be changed
    subjectedNode.stop()
    looper.removeProdable(subjectedNode)

    # start node with new HA
    config_helper = PNodeConfigHelper(subjectedNode.name, tconf, chroot=tdir)
    restartedNode = TestNode(subjectedNode.name,
                             config_helper=config_helper,
                             config=tconf, ha=nodeStackNewHA,
                             cliha=clientStackNewHA)
    looper.add(restartedNode)
    txnPoolNodeSet[nodeIndex] = restartedNode
    looper.run(checkNodesConnected(txnPoolNodeSet, customTimeout=70))

    electionTimeout = waits.expectedPoolElectionTimeout(
        nodeCount=len(txnPoolNodeSet), numOfReelections=3)
    ensureElectionsDone(looper, txnPoolNodeSet, retryWait=1,
                        customTimeout=electionTimeout)

    # start client and check the node HA
    anotherClient, _ = genTestClient(tmpdir=tdirWithClientPoolTxns,
                                     usePoolLedger=True)
    looper.add(anotherClient)
    looper.run(eventually(anotherClient.ensureConnectedToNodes))
    stewardWallet = Wallet(stewardName)
    stewardWallet.addIdentifier(signer=DidSigner(seed=stewardsSeed))
    sendReqsToNodesAndVerifySuffReplies(looper, stewardWallet, stewardClient,
                                        8)

def test_recover_stop_primaries(looper, checkpoint_size, txnPoolNodeSet,
                                allPluginsPath, tdir, tconf, client1, wallet1,
                                client1Connected):
    """
    Test that we can recover after having more than f nodes disconnected:
    - stop current master primary (Alpha)
    - send txns
    - restart current master primary (Beta)
    - send txns
    """

    active_nodes = list(txnPoolNodeSet)
    assert 4 == len(active_nodes)
    initial_view_no = active_nodes[0].viewNo

    logger.info("Stop first node (current Primary)")
    _, active_nodes = stop_primary(looper, active_nodes)

    logger.info("Make sure view changed")
    expected_view_no = initial_view_no + 1
    waitForViewChange(looper, active_nodes, expectedViewNo=expected_view_no)
    ensureElectionsDone(looper=looper, nodes=active_nodes, numInstances=2)
    ensure_all_nodes_have_same_data(looper, nodes=active_nodes)

    logger.info("send at least one checkpoint")
    assert nodes_do_not_have_checkpoints(*active_nodes)
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1,
                                        numReqs=2 * checkpoint_size)
    assert nodes_have_checkpoints(*active_nodes)
    ensure_all_nodes_have_same_data(looper, nodes=active_nodes)

    logger.info(
        "Stop second node (current Primary) so the primary loses its state")
    stopped_node, active_nodes = stop_primary(looper, active_nodes)

    logger.info("Restart the primary node")
    restarted_node = start_stopped_node(stopped_node, looper, tconf, tdir,
                                        allPluginsPath)
    assert nodes_do_not_have_checkpoints(restarted_node)
    assert nodes_have_checkpoints(*active_nodes)
    active_nodes = active_nodes + [restarted_node]

    logger.info("Check that primary selected")
    ensureElectionsDone(looper=looper, nodes=active_nodes, numInstances=2,
                        customTimeout=30)
    waitForViewChange(looper, active_nodes, expectedViewNo=expected_view_no)
    ensure_all_nodes_have_same_data(looper, nodes=active_nodes)

    logger.info("Check if the pool is able to process requests")
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1,
                                        numReqs=10 * checkpoint_size)
    ensure_all_nodes_have_same_data(looper, nodes=active_nodes)
    assert nodes_have_checkpoints(*active_nodes)

def testInstChangeWithLowerRatioThanDelta(looper, step3, wallet1, client1):
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 10)

    # wait for every node to run another checkPerformance
    waitForNextPerfCheck(looper, step3.nodes, step3.perfChecks)

    # verify all nodes have undergone an instance change
    looper.run(eventually(checkViewNoForNodes, step3.nodes, 1, timeout=10))

def viewChangeDone(nodeSet, looper, up, wallet1, client1, viewNo):
    # Delay processing of PRE-PREPARE from all non primary replicas of master
    # so master's performance falls and view changes
    delayNonPrimaries(nodeSet, 0, 10)

    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 4)

    looper.run(eventually(partial(checkViewNoForNodes, nodeSet, viewNo + 1),
                          retryWait=1, timeout=20))

def testReplyReceivedOnlyByClientWhoSentRequest(looper, nodeSet, tdir,
                                                client1, wallet1):
    newClient, _ = genTestClient(nodeSet, tmpdir=tdir)
    looper.add(newClient)
    looper.run(newClient.ensureConnectedToNodes())
    client1InboxSize = len(client1.inBox)
    newClientInboxSize = len(newClient.inBox)
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, newClient, 1)
    assert len(client1.inBox) == client1InboxSize
    assert len(newClient.inBox) > newClientInboxSize

def changeNodeHa(looper, txnPoolNodeSet, tdirWithPoolTxns, poolTxnData,
                 poolTxnStewardNames, tconf, shouldBePrimary):
    # prepare new ha for node and client stack
    subjectedNode = None
    stewardName = None
    stewardsSeed = None

    for nodeIndex, n in enumerate(txnPoolNodeSet):
        if (shouldBePrimary and n.primaryReplicaNo == 0) or \
                (not shouldBePrimary and n.primaryReplicaNo != 0):
            subjectedNode = n
            stewardName = poolTxnStewardNames[nodeIndex]
            stewardsSeed = poolTxnData["seeds"][stewardName].encode()
            break

    nodeStackNewHA, clientStackNewHA = genHa(2)
    logger.debug("change HA for node: {} to {}".
                 format(subjectedNode.name,
                        (nodeStackNewHA, clientStackNewHA)))

    nodeSeed = poolTxnData["seeds"][subjectedNode.name].encode()

    # change HA
    stewardClient, req = changeHA(looper, tconf, subjectedNode.name, nodeSeed,
                                  nodeStackNewHA, stewardName, stewardsSeed)
    f = getMaxFailures(len(stewardClient.nodeReg))
    looper.run(eventually(checkSufficientRepliesRecvd, stewardClient.inBox,
                          req.reqId, f, retryWait=1, timeout=20))

    # stop node for which HA will be changed
    subjectedNode.stop()
    looper.removeProdable(subjectedNode)

    # start node with new HA
    restartedNode = TestNode(subjectedNode.name,
                             basedirpath=tdirWithPoolTxns, config=tconf,
                             ha=nodeStackNewHA, cliha=clientStackNewHA)
    looper.add(restartedNode)
    txnPoolNodeSet[nodeIndex] = restartedNode
    looper.run(checkNodesConnected(txnPoolNodeSet, overrideTimeout=70))
    ensureElectionsDone(looper, txnPoolNodeSet, retryWait=1, timeout=10)

    # start client and check the node HA
    anotherClient, _ = genTestClient(tmpdir=tdirWithPoolTxns,
                                     usePoolLedger=True)
    looper.add(anotherClient)
    looper.run(eventually(anotherClient.ensureConnectedToNodes))
    stewardWallet = Wallet(stewardName)
    stewardWallet.addIdentifier(signer=SimpleSigner(seed=stewardsSeed))
    sendReqsToNodesAndVerifySuffReplies(looper, stewardWallet, stewardClient,
                                        8)
    looper.run(eventually(checkIfGenesisPoolTxnFileUpdated, *txnPoolNodeSet,
                          stewardClient, anotherClient, retryWait=1,
                          timeout=10))
    looper.removeProdable(stewardClient)

def testNodesReceiveClientMsgs(txnPoolNodeSet, tdirWithPoolTxns,
                               poolTxnClientData, txnPoolCliNodeReg):
    with Looper(debug=True) as looper:
        name, pkseed, sigseed = poolTxnClientData
        signer = SimpleSigner(seed=sigseed)
        client = TestClient(name=name, nodeReg=txnPoolCliNodeReg, ha=genHa(),
                            signer=signer, basedirpath=tdirWithPoolTxns)
        looper.add(client)
        looper.run(client.ensureConnectedToNodes())
        sendReqsToNodesAndVerifySuffReplies(looper, client, 1)

def testOldCheckpointDeleted(chkFreqPatched, looper, txnPoolNodeSet, client1,
                             wallet1, client1Connected):
    """
    Send more than twice `CHK_FREQ` requests; there should be one new stable
    checkpoint on each replica. The old stable checkpoint should be removed
    """
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1,
                                        2 * CHK_FREQ, 1)
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 1, 1)
    looper.run(eventually(chkChkpoints, txnPoolNodeSet, 2, 0, retryWait=1))

def testNodeDiscardMessageFromUnknownView(txnPoolNodeSet,
                                          nodeSetWithNodeAddedAfterSomeTxns,
                                          newNodeCaughtUp, tdirWithPoolTxns,
                                          tconf, allPluginsPath):
    """
    Node discards 3-phase and election messages from view nos that it does not
    know of (view nos before it joined the pool)
    :return:
    """
    looper, nodeX, client, wallet, _, _ = nodeSetWithNodeAddedAfterSomeTxns
    viewNo = nodeX.viewNo

    # Delay processing of PRE-PREPARE from all non primary replicas of master
    # so master's performance falls and view changes
    delayNonPrimaries(txnPoolNodeSet, 0, 10)
    sendReqsToNodesAndVerifySuffReplies(looper, wallet, client, 4)
    looper.run(eventually(partial(checkViewNoForNodes, txnPoolNodeSet,
                                  viewNo + 1), retryWait=1, timeout=20))

    newStewardName = "testClientSteward" + randomString(3)
    nodeName = "Theta"
    _, _, nodeTheta = addNewStewardAndNode(looper, client, wallet,
                                           newStewardName, nodeName,
                                           tdirWithPoolTxns, tconf,
                                           allPluginsPath)
    txnPoolNodeSet.append(nodeTheta)
    looper.run(checkNodesConnected(txnPoolNodeSet))
    looper.run(client.ensureConnectedToNodes())
    looper.run(eventually(checkNodeLedgersForEquality, nodeTheta,
                          *txnPoolNodeSet[:-1], retryWait=1, timeout=5))
    checkProtocolInstanceSetup(looper, txnPoolNodeSet, retryWait=1,
                               timeout=10)
    electMsg = Nomination(nodeX.name, 0, viewNo)
    threePMsg = PrePrepare(
        0,
        viewNo,
        10,
        wallet.defaultId,
        wallet._getIdData().lastReqId + 1,
        "random digest",
        time.time()
    )
    ridTheta = nodeX.nodestack.getRemote(nodeTheta.name).uid
    nodeX.send(electMsg, ridTheta)
    nodeX.send(threePMsg, ridTheta)
    nodeX.send(electMsg, ridTheta)
    looper.run(eventually(checkDiscardMsg, [nodeTheta, ], electMsg,
                          'un-acceptable viewNo', retryWait=1, timeout=5))
    nodeX.send(threePMsg, ridTheta)
    looper.run(eventually(checkDiscardMsg, [nodeTheta, ], threePMsg,
                          'un-acceptable viewNo', retryWait=1, timeout=5))

def testRequestOlderThanStableCheckpointRemoved(chkFreqPatched, looper,
                                                txnPoolNodeSet, client1,
                                                wallet1, client1Connected):
    reqs = sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1,
                                               CHK_FREQ - 1, 1)
    looper.run(eventually(chkChkpoints, txnPoolNodeSet, 1, retryWait=1))
    checkRequestCounts(txnPoolNodeSet, len(reqs))
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 1, 1)
    looper.run(eventually(chkChkpoints, txnPoolNodeSet, 1, 0, retryWait=1))
    checkRequestCounts(txnPoolNodeSet, 0)

    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1,
                                        3 * CHK_FREQ + 1, 1)
    looper.run(eventually(chkChkpoints, txnPoolNodeSet, 2, 0, retryWait=1))
    checkRequestCounts(txnPoolNodeSet, 1)

def testInstChangeWithLowerRatioThanDelta(looper, step3, client1):
    sendReqsToNodesAndVerifySuffReplies(looper, client1, 5)

    # wait for every node to run another checkPerformance
    newPerfChecks = waitForNextPerfCheck(looper, step3.nodes,
                                         step3.perfChecks)

    # verify all nodes recognize P as degraded
    # for n in step3.nodes:
    #     assert newPerfChecks[n.name].result is False

    # verify all nodes have undergone an instance change
    checkViewNoForNodes(step3.nodes, 1)

def viewChangeDone(nodeSet, looper, up, client1):
    """
    Test that a view change is done when the performance of master goes down
    """
    # Delay processing of PRE-PREPARE from all non primary replicas of master
    # so master's performance falls and view changes
    nonPrimReps = getNonPrimaryReplicas(nodeSet, 0)
    for r in nonPrimReps:
        r.node.nodeIbStasher.delay(ppDelay(10, 0))

    sendReqsToNodesAndVerifySuffReplies(looper, client1, 4)

    looper.run(eventually(partial(checkViewNoForNodes, nodeSet, 1),
                          retryWait=1, timeout=20))

def testCheckpointCreated(chkFreqPatched, looper, txnPoolNodeSet, client1,
                          wallet1, client1Connected):
    """
    After requests less than `CHK_FREQ`, there should be one checkpoint
    on each replica. After `CHK_FREQ`, one checkpoint should become stable
    """
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1,
                                        CHK_FREQ - 1, 1)

    # Deliberately waiting so as to verify that not more than 1 checkpoint is
    # created
    looper.runFor(2)
    chkChkpoints(txnPoolNodeSet, 1)

    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 1, 1)
    looper.run(eventually(chkChkpoints, txnPoolNodeSet, 1, 0, retryWait=1))

def testPostingThroughput(postingStatsEnabled, looper: Looper,
                          nodeSet: TestNodeSet, wallet1, client1):
    """
    The throughput after `DashboardUpdateFreq` seconds and before sending any
    requests should be zero.
    Send `n` requests in less than `ThroughputWindowSize` seconds and the
    throughput till `ThroughputWindowSize` should consider those `n` requests.
    After `ThroughputWindowSize` seconds the throughput should be zero.
    Test `totalRequests` too.
    """

    # We are sleeping for this window size, because we need to clear previous
    # values that were being stored for this much time in tests
    looper.runFor(config.ThroughputWindowSize)

    reqCount = 10
    for node in nodeSet:
        assert node.monitor.highResThroughput == 0
        assert node.monitor.totalRequests == 0

    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, reqCount,
                                        nodeSet.f, timeoutPerReq=20)

    for node in nodeSet:
        assert len(node.monitor.orderedRequestsInLast) == reqCount
        assert node.monitor.highResThroughput > 0
        assert node.monitor.totalRequests == reqCount

    # TODO: Add implementation to actually call firebase plugin
    # and test if firebase plugin is sending total request count
    # if node is primary

    looper.runFor(config.DashboardUpdateFreq)

    for node in nodeSet:
        assert node.monitor.spylog.count(Monitor.sendThroughput.__name__) > 0

    # Run for latency window duration so that `orderedRequestsInLast`
    # becomes empty
    looper.runFor(config.ThroughputWindowSize)

    def chk():
        for node in nodeSet:
            assert len(node.monitor.orderedRequestsInLast) == 0
            assert node.monitor.highResThroughput == 0
            assert node.monitor.totalRequests == reqCount

    looper.run(eventually(chk, retryWait=1, timeout=10))

def testNodeDoesNotParticipateUntilCaughtUp(txnPoolNodeSet,
                                            nodeSetWithNodeAddedAfterSomeTxns):
    """
    A new node that joins after some transactions should stash new
    transactions until it has caught up
    :return:
    """
    looper, newNode, client, wallet, _, _ = nodeSetWithNodeAddedAfterSomeTxns
    sendReqsToNodesAndVerifySuffReplies(looper, wallet, client, 5)

    for node in txnPoolNodeSet[:4]:
        for replica in node.replicas:
            for commit in replica.commits.values():
                assert newNode.name not in commit.voters
            for prepare in replica.prepares.values():
                assert newNode.name not in prepare.voters

def setup(looper, startedNodes, up, client1):
    # Get the master replica of the master protocol instance
    P = getPrimaryReplica(startedNodes)

    # Make `Delta` small enough so throughput check passes.
    for node in startedNodes:
        node.monitor.Delta = .001

    # make P (primary replica on master) faulty, i.e., slow to send
    # PRE-PREPARE for a specific client request only
    def by65SpecificPrePrepare(msg):
        if isinstance(msg, PrePrepare) and getattr(msg, f.REQ_ID.nm) == 2:
            return 65

    P.outBoxTestStasher.delay(by65SpecificPrePrepare)

    sendReqsToNodesAndVerifySuffReplies(looper, client1, numReqs=5,
                                        timeout=80)

    return adict(nodes=startedNodes)

def testClientConnectToRestartedNodes(looper, txnPoolNodeSet,
                                      tdirWithPoolTxns, poolTxnClientNames,
                                      poolTxnData, tconf, poolTxnNodeNames,
                                      allPluginsPath):
    name = poolTxnClientNames[-1]
    seed = poolTxnData["seeds"][name]
    newClient, w = genTestClient(tmpdir=tdirWithPoolTxns,
                                 nodes=txnPoolNodeSet, name=name,
                                 usePoolLedger=True)
    looper.add(newClient)
    ensureClientConnectedToNodesAndPoolLedgerSame(looper, newClient,
                                                  *txnPoolNodeSet)
    sendReqsToNodesAndVerifySuffReplies(looper, w, newClient, 1, 1)
    for node in txnPoolNodeSet:
        node.stop()
        looper.removeProdable(node)

    # looper.run(newClient.ensureDisconnectedToNodes(timeout=60))
    txnPoolNodeSet = []
    for nm in poolTxnNodeNames:
        node = TestNode(nm, basedirpath=tdirWithPoolTxns, config=tconf,
                        pluginPaths=allPluginsPath)
        looper.add(node)
        txnPoolNodeSet.append(node)
    looper.run(checkNodesConnected(txnPoolNodeSet))
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet, retryWait=1,
                        timeout=10)

    def chk():
        for node in txnPoolNodeSet:
            assert node.isParticipating

    looper.run(eventually(chk, retryWait=1, timeout=10))

    bootstrapClientKeys(w.defaultId, w.getVerkey(), txnPoolNodeSet)

    req = sendRandomRequest(w, newClient)
    checkSufficientRepliesForRequests(looper, newClient, [req, ],
                                      timeoutPerReq=10)

    ensureClientConnectedToNodesAndPoolLedgerSame(looper, newClient,
                                                  *txnPoolNodeSet)

    sendReqsToNodesAndVerifySuffReplies(looper, w, newClient, 1, 1)

def testPostingLatency(postingStatsEnabled, looper: Looper,
                       nodeSet: TestNodeSet, wallet1, client1):
    """
    The latencies (master as well as average of backups) after
    `DashboardUpdateFreq` seconds and before sending any requests should be
    zero.
    Send `n` requests in less than `LatencyWindowSize` seconds and the latency
    till `LatencyWindowSize` should consider those `n` requests.
    After `LatencyWindowSize` seconds the latencies should be zero
    """
    # Run for latency window duration so that `latenciesByMasterInLast` and
    # `latenciesByBackupsInLast` become empty
    looper.runFor(config.LatencyWindowSize)

    reqCount = 10
    for node in nodeSet:
        assert node.monitor.masterLatency == 0
        assert node.monitor.avgBackupLatency == 0

    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, reqCount,
                                        nodeSet.f, timeoutPerReq=20)

    for node in nodeSet:
        assert node.monitor.masterLatency > 0
        assert node.monitor.avgBackupLatency > 0

    looper.runFor(config.DashboardUpdateFreq)

    for node in nodeSet:
        assert node.monitor.spylog.count(Monitor.sendLatencies.__name__) > 0

    # Run for latency window duration so that `latenciesByMasterInLast` and
    # `latenciesByBackupsInLast` become empty
    looper.runFor(config.LatencyWindowSize)

    def chk():
        for node in nodeSet:
            assert node.monitor.masterLatency == 0
            assert node.monitor.avgBackupLatency == 0

    looper.run(eventually(chk, retryWait=1, timeout=10))

def testElectionsAfterViewChange(delayedPerf, looper: Looper,
                                 nodeSet: TestNodeSet, up, client1):
    """
    Test that a primary election does happen after a view change
    """

    # Delay processing of PRE-PREPARE from all non primary replicas of master
    # so master's throughput falls and view changes
    nonPrimReps = getNonPrimaryReplicas(nodeSet, 0)
    for r in nonPrimReps:
        r.node.nodeIbStasher.delay(ppDelay(10, 0))

    sendReqsToNodesAndVerifySuffReplies(looper, client1, 4)

    # Ensure view change happened for both node and its primary elector
    for node in nodeSet:
        looper.run(eventually(partial(checkViewChangeInitiatedForNode, node,
                                      0), retryWait=1, timeout=20))

    # Ensure elections are done again and pool is setup again with appropriate
    # protocol instances and each protocol instance is setup properly too
    checkProtocolInstanceSetup(looper, nodeSet, retryWait=1, timeout=30)

def step1(looper, startedNodes, up, client1):
    """
    stand up a pool of nodes and send 5 requests to client
    """
    # the master instance has a primary replica, call it P
    P = getPrimaryReplica(startedNodes)

    requests = sendReqsToNodesAndVerifySuffReplies(looper, client1, 5)
    # profile_this(sendReqsToNodesAndVerifySuffReplies, looper, client1, 5)

    return adict(P=P, nodes=startedNodes, requests=requests)

def testViewNotChanged(looper: Looper, nodeSet: TestNodeSet, up, client1):
    """
    Test that a view change is not done when the performance of master does
    not go down.
    Send multiple requests to the client and delay some requests by all backup
    instances to ensure the master instance is always faster than the backup
    instances and there is no view change
    """
    # Delay PRE-PREPARE for all backup protocol instances so master performs
    # better
    for i in range(1, F + 1):
        nonPrimReps = getNonPrimaryReplicas(nodeSet, i)  # type: Iterable[TestReplica]
        for r in nonPrimReps:
            r.node.nodeIbStasher.delay(ppDelay(10, i))

    sendReqsToNodesAndVerifySuffReplies(looper, client1, 5)
    checkViewNoForNodes(nodeSet, 0)

def testViewChangeCase1(nodeSet, looper, up, wallet1, client1, viewNo):
    """
    A node will change view even though it does not find the master to be
    degraded when a quorum of nodes agree that master performance degraded
    """

    # Delay processing of PRE-PREPARE from all non primary replicas of master
    # so master's performance falls and view changes
    delayNonPrimaries(nodeSet, 0, 10)

    pr = getPrimaryReplica(nodeSet, 0)
    reluctantNode = pr.node

    # Count sent instance changes of all nodes
    sentInstChanges = {}
    instChngMethodName = Node.sendInstanceChange.__name__
    for n in nodeSet:
        sentInstChanges[n.name] = n.spylog.count(instChngMethodName)

    # Node reluctant to change view, never says master is degraded
    reluctantNode.monitor.isMasterDegraded = types.MethodType(
        lambda x: False, reluctantNode.monitor)

    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 4)

    # Check that view change happened for all nodes
    looper.run(eventually(partial(checkViewNoForNodes, nodeSet, viewNo + 1),
                          retryWait=1, timeout=20))

    # All nodes except the reluctant node should have sent a view change and
    # thus must have called `sendInstanceChange`
    for n in nodeSet:
        if n.name != reluctantNode.name:
            assert n.spylog.count(instChngMethodName) > \
                sentInstChanges.get(n.name, 0)
        else:
            assert n.spylog.count(instChngMethodName) == \
                sentInstChanges.get(n.name, 0)

def nodeCreatedAfterSomeTxns(txnPoolNodesLooper, txnPoolNodeSet,
                             tdirWithPoolTxns, poolTxnStewardData, tconf,
                             allPluginsPath, request):
    # with Looper(debug=True) as looper:
    client, wallet = buildPoolClientAndWallet(poolTxnStewardData,
                                              tdirWithPoolTxns,
                                              clientClass=TestClient)
    txnPoolNodesLooper.add(client)
    txnPoolNodesLooper.run(client.ensureConnectedToNodes())
    txnCount = getValueFromModule(request, "txnCount", 5)
    sendReqsToNodesAndVerifySuffReplies(txnPoolNodesLooper, wallet, client,
                                        txnCount, timeoutPerReq=25)
    newStewardName = randomString()
    newNodeName = "Epsilon"
    newStewardClient, newStewardWallet, newNode = addNewStewardAndNode(
        txnPoolNodesLooper, client, wallet, newStewardName, newNodeName,
        tdirWithPoolTxns, tconf, allPluginsPath=allPluginsPath,
        autoStart=True)
    yield txnPoolNodesLooper, newNode, client, wallet, newStewardClient, \
        newStewardWallet

def testNodesReceiveClientMsgs(looper, txnPoolNodeSet, wallet1, client1,
                               client1Connected):
    ensureClientConnectedToNodesAndPoolLedgerSame(looper, client1,
                                                  *txnPoolNodeSet)
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 1)