def testZStackNodeReconnection(tconf, looper, txnPoolNodeSet, client1,
                               wallet1, tdirWithPoolTxns, client1Connected):
    """
    Stop a node that does not host a primary, wait until the rest of the
    pool no longer lists it as connected, start a fresh `TestNode` with the
    same name and addresses, and check that the pool reconnects, finishes
    elections and keeps serving requests.
    """
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 1)

    # The first node without a primary is the one to crash
    without_primary = [n for n in txnPoolNodeSet if not n.hasPrimary]
    crashed = without_primary[0]
    crashed_idx = txnPoolNodeSet.index(crashed)
    survivors = [n for n in txnPoolNodeSet if n != crashed]

    def checkFlakyConnected(conn=True):
        # Every surviving node must (or must not) see the crashed node's
        # stack among its connected remotes
        for peer in survivors:
            listed = crashed.nodestack.name in peer.nodestack.connecteds
            assert listed if conn else not listed

    checkFlakyConnected(True)

    crashed.stop()
    looper.removeProdable(crashed)
    looper.runFor(1)
    looper.run(eventually(checkFlakyConnected, False,
                          retryWait=1, timeout=35))
    looper.runFor(1)

    # Bring up a replacement under the same name and host addresses
    replacement = TestNode(crashed.name,
                           basedirpath=tdirWithPoolTxns,
                           config=tconf,
                           ha=crashed.nodestack.ha,
                           cliha=crashed.clientstack.ha)
    looper.add(replacement)
    txnPoolNodeSet[crashed_idx] = replacement

    looper.run(eventually(checkFlakyConnected, True,
                          retryWait=2, timeout=50))
    ensureElectionsDone(looper, txnPoolNodeSet, retryWait=2, timeout=50)
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 1)
    checkNodesSendingCommits(txnPoolNodeSet)
def testViewChangesIfMasterPrimaryDisconnected(txnPoolNodeSet, looper,
                                               wallet1, client1,
                                               client1Connected, tconf):
    """
    Stopping the master's primary must lead the remaining nodes to a view
    change and to a different primary.
    """
    pool = txnPoolNodeSet
    view_no_at_start = checkViewNoForNodes(pool)
    former_primary = get_master_primary_node(pool)

    # Take the primary down and stop scheduling it
    stopNodes([former_primary], looper)
    looper.removeProdable(former_primary)
    survivors = list(set(pool) - {former_primary})

    # Disconnection is not always noticed immediately
    ensure_node_disconnected(looper, former_primary, survivors, timeout=20)

    looper.runFor(tconf.ToleratePrimaryDisconnection + 2)

    # By now the view number should have advanced and a new primary
    # should have been chosen
    waitForViewChange(looper, survivors, view_no_at_start + 1)
    ensure_all_nodes_have_same_data(looper, nodes=survivors)
    current_primary = get_master_primary_node(survivors)
    assert former_primary != current_primary

    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 5)
def test_no_instance_change_on_primary_disconnection_for_not_ready_node(
        looper, txnPoolNodeSet, tdir, tconf,
        allPluginsPath, steward1, stewardWallet,
        client_tdir):
    """
    Scenario:
    1. Start a new node without adding it to the pool (no NODE txn is sent),
       so the node is not ready.
    2. Let more than VIEW_CHANGE_TIMEOUT elapse (the timeout of the initial
       check for a disconnected primary).
    3. The new node must not have sent any InstanceChange.
    4. Add the node to the pool (send the NODE txn) so that it becomes ready.
    5. Let more than VIEW_CHANGE_TIMEOUT elapse again.
    6. The new node still must not have sent any InstanceChange.
    """
    # Step 1: node is started but stays outside of the pool
    sigseed, bls_key, new_node = start_not_added_node(
        looper, tdir, tconf, allPluginsPath, "TestTheta")

    def instance_changes_sent():
        # Number of recorded `sendInstanceChange` calls on the new node
        return new_node.view_changer.spylog.count(
            ViewChanger.sendInstanceChange.__name__)

    # Steps 2-3: outlive the initial disconnected-primary check
    looper.runFor(tconf.VIEW_CHANGE_TIMEOUT + 2)
    assert instance_changes_sent() == 0

    # Step 4: make the node part of the pool
    add_started_node(looper, new_node, txnPoolNodeSet, client_tdir,
                     steward1, stewardWallet, sigseed, bls_key)

    # Steps 5-6: same expectation once the node is ready
    looper.runFor(tconf.VIEW_CHANGE_TIMEOUT + 2)
    assert instance_changes_sent() == 0
def test_view_change_after_max_catchup_rounds(txnPoolNodeSet, looper,
                                              wallet1, client1,
                                              client1Connected):
    """
    A node should run only a fixed number of catchup rounds. Prepares and
    Commits are delayed for 2 non-primary nodes by a large amount, which is
    equivalent to losing them, so those 2 nodes end up with a different last
    prepared certificate than the other nodes. A view change is then done;
    it must complete and the pool must not process the requests that were
    prepared by only a subset of the nodes.
    """
    send_reqs_batches_and_get_suff_replies(looper, wallet1, client1, 2 * 3, 3)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    summary_before = txnPoolNodeSet[0].elector.ledger_summary

    non_primary_replicas = getNonPrimaryReplicas(txnPoolNodeSet, 0)
    slow_nodes = [replica.node for replica in non_primary_replicas[-2:]]
    fast_nodes = [n for n in txnPoolNodeSet if n not in slow_nodes]

    # Slow nodes get their Prepares and Commits late
    for slow in slow_nodes:
        for make_delay in (pDelay, cDelay):
            slow.nodeIbStasher.delay(make_delay(120, 0))

    sendRandomRequests(wallet1, client1, 5)
    looper.runFor(3)

    ensure_view_change(looper, nodes=txnPoolNodeSet)

    def last_prepared(nodes):
        certificates = [
            n.master_replica.last_prepared_certificate_in_view()
            for n in nodes
        ]
        # Every node in the group must report the same certificate
        assert check_if_all_equal_in_list(certificates)
        return certificates[0]

    slow_certificate = last_prepared(slow_nodes)
    fast_certificate = last_prepared(fast_nodes)

    # The two groups must disagree on the last prepared certificate
    assert fast_certificate != slow_certificate

    # The view change finishes
    ensureElectionsDone(looper, txnPoolNodeSet)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)

    # Requests prepared by only part of the pool were not ordered
    assert txnPoolNodeSet[0].elector.ledger_summary == summary_before

    for slow in slow_nodes:
        slow.nodeIbStasher.reset_delays_and_process_delayeds()

    # The pool still works
    ensure_pool_functional(looper, txnPoolNodeSet, wallet1, client1)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    last_prepared(txnPoolNodeSet)
def test_view_changes_if_master_primary_disconnected(txnPoolNodeSet, looper,
                                                     sdk_pool_handle,
                                                     sdk_wallet_client,
                                                     tdir, tconf,
                                                     allPluginsPath):
    """
    Disconnecting the master's primary triggers a view change; afterwards
    the old primary is started again and has to join the pool in the new
    view.
    """
    view_no_at_start = checkViewNoForNodes(txnPoolNodeSet)
    former_primary = get_master_primary_node(txnPoolNodeSet)

    # Take the primary down and stop scheduling it
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet,
                                            former_primary, stopNode=True)
    looper.removeProdable(former_primary)
    survivors = list(set(txnPoolNodeSet) - {former_primary})

    # Disconnection is not always noticed immediately
    ensure_node_disconnected(looper, former_primary, survivors, timeout=20)

    looper.runFor(tconf.ToleratePrimaryDisconnection + 2)

    # By now the view number should have advanced and a new primary
    # should have been chosen
    waitForViewChange(looper, survivors, view_no_at_start + 1)
    ensure_all_nodes_have_same_data(looper, nodes=survivors)
    current_primary = get_master_primary_node(survivors)
    assert former_primary != current_primary

    sdk_send_random_and_check(looper, txnPoolNodeSet,
                              sdk_pool_handle, sdk_wallet_client, 5)

    # The old primary must be able to come back and keep working
    former_primary = start_stopped_node(former_primary, looper, tconf,
                                        tdir, allPluginsPath)
    rejoined_pool = survivors + [former_primary]
    looper.run(eventually(checkViewNoForNodes, rejoined_pool,
                          view_no_at_start + 1, timeout=10))

    # At least one `_start_view_change_if_possible` call returned True
    successful_starts = getAllReturnVals(
        former_primary.view_changer,
        former_primary.view_changer._start_view_change_if_possible,
        compare_val_to=True)
    assert len(successful_starts) > 0

    ensure_all_nodes_have_same_data(looper, nodes=rejoined_pool)
    assert not former_primary.view_changer._next_view_indications
def test_no_catchup_if_got_from_3pc(looper, txnPoolNodeSet, wallet1, client1,
                                    client1Connected):
    """
    A node receives COMMIT messages late, so after a view change it starts
    catchup. Before it gets to request transactions, the COMMITs arrive and
    are ordered, so the node should not request any transactions.
    :return:
    """
    send_reqs_batches_and_get_suff_replies(looper, wallet1, client1, 2 * 3, 3)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)

    lagging = getNonPrimaryReplicas(txnPoolNodeSet, 0)[-1].node
    peers = [n for n in txnPoolNodeSet if n != lagging]

    commit_delay = 30
    cons_proof_delay = 40
    lagging.nodeIbStasher.delay(cDelay(commit_delay))
    # Consistency proofs are delayed as well; that leaves room for all
    # 3PC messages to be delivered first
    lagging.nodeIbStasher.delay(cpDelay(cons_proof_delay))

    def domain_cr_count():
        # How many times `getCatchupReqs` (which builds the `CatchupReq`s
        # to be sent) was invoked for the domain ledger
        calls = lagging.ledgerManager.spylog.getAll(
            lagging.ledgerManager.getCatchupReqs)
        return len([
            call for call in calls
            if call.params['consProof'].ledgerId == DOMAIN_LEDGER_ID
        ])

    count_before = domain_cr_count()
    sent_batches = 10
    send_reqs_batches_and_get_suff_replies(looper, wallet1, client1,
                                           2 * sent_batches, sent_batches)
    ensure_view_change(looper, nodes=txnPoolNodeSet)

    # Once the view has changed the lagging node is behind the others
    waitNodeDataInequality(looper, lagging, *peers)

    # Release the COMMITs only
    lagging.nodeIbStasher.reset_delays_and_process_delayeds(Commit.__name__)
    looper.runFor(2)

    lagging.nodeIbStasher.reset_delays_and_process_delayeds(
        ConsistencyProof.__name__)
    waitNodeDataEquality(looper, lagging, *peers)

    # No `CatchupReq` was built, so none can have been sent
    assert domain_cr_count() == count_before

    # Stashed ordered requests were processed
    processed_counts = getAllReturnVals(lagging,
                                        lagging.processStashedOrderedReqs)
    assert sent_batches in processed_counts

    ensure_pool_functional(looper, txnPoolNodeSet, wallet1, client1)
def test_repeated_request_not_processed_if_already_ordered(
        looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client):
    """
    Send one signed request, wait for it to be checked, then send the very
    same request again; every domain ledger must have grown by exactly one
    entry relative to the fourth node's initial size.
    """
    baseline_size = txnPoolNodeSet[3].domainLedger.size

    signed_reqs = sdk_signed_random_requests(looper, sdk_wallet_client, 1)
    sdk_send_and_check(signed_reqs, looper, txnPoolNodeSet, sdk_pool_handle)

    # Re-send the identical request and give the pool time to act on it
    sdk_send_signed_requests(sdk_pool_handle, signed_reqs)
    looper.runFor(waits.expectedTransactionExecutionTime(len(txnPoolNodeSet)))

    for pool_node in txnPoolNodeSet:
        assert pool_node.domainLedger.size == baseline_size + 1
def test_belated_propagate_not_processed_if_already_ordered(
        looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client):
    """
    Install `ppgDelay(300, 'Gamma')` on the fourth node's inbox stasher,
    send one request and wait for it to be checked, then release the
    delayed messages; every domain ledger must have grown by exactly one
    entry.
    """
    fourth_node = txnPoolNodeSet[3]
    baseline_size = fourth_node.domainLedger.size
    fourth_node.nodeIbStasher.delay(ppgDelay(300, 'Gamma'))

    signed_reqs = sdk_signed_random_requests(looper, sdk_wallet_client, 1)
    sdk_send_and_check(signed_reqs, looper, txnPoolNodeSet, sdk_pool_handle)

    # Deliver what was held back and give the pool time to act on it
    fourth_node.nodeIbStasher.reset_delays_and_process_delayeds()
    looper.runFor(waits.expectedTransactionExecutionTime(len(txnPoolNodeSet)))

    for pool_node in txnPoolNodeSet:
        assert pool_node.domainLedger.size == baseline_size + 1
def tear():
    """
    Teardown helper: clear delays on every node, then check the pool is
    consistent and still answers requests.

    NOTE(review): `txnPoolNodeSet`, `looper`, `wallet1` and `client1` are
    taken from an enclosing scope that is not visible in this chunk.
    """
    # Undo any simulated network problems
    for pool_node in txnPoolNodeSet:
        pool_node.reset_delays_and_process_delayeds()

    # Leave a short window for the released messages to be handled
    looper.runFor(3)

    # All nodes must hold the same data
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)

    # All nodes must have ordered the same requests (no catchup)
    ordered_per_node = [n.master_replica.ordered for n in txnPoolNodeSet]
    assert check_if_all_equal_in_list(ordered_per_node)

    # The network works: every node replies
    send_reqs_to_nodes_and_verify_all_replies(looper, wallet1, client1, 5)
def test_belated_request_not_processed_after_view_change(
        looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client):
    """
    Install `req_delay(300)` on the fourth node's client inbox stasher,
    send one request and wait for it to be checked, do a view change, then
    release the delayed messages; every domain ledger must have grown by
    exactly one entry.
    """
    fourth_node = txnPoolNodeSet[3]
    baseline_size = fourth_node.domainLedger.size
    fourth_node.clientIbStasher.delay(req_delay(300))

    signed_reqs = sdk_signed_random_requests(looper, sdk_wallet_client, 1)
    sdk_send_and_check(signed_reqs, looper, txnPoolNodeSet, sdk_pool_handle)

    ensure_view_change(looper, txnPoolNodeSet)
    ensureElectionsDone(looper, txnPoolNodeSet)

    # Deliver what was held back and give the pool time to act on it
    fourth_node.clientIbStasher.reset_delays_and_process_delayeds()
    looper.runFor(waits.expectedTransactionExecutionTime(len(txnPoolNodeSet)))

    for pool_node in txnPoolNodeSet:
        assert pool_node.domainLedger.size == baseline_size + 1
def testZStackNodeReconnection(tconf, looper, txnPoolNodeSet, client1,
                               wallet1, tdir, client1Connected):
    """
    Stop a node that does not host a primary, wait until the rest of the
    pool no longer lists it as connected, start a fresh `TestNode` with the
    same name and addresses (directories come from `PNodeConfigHelper`), and
    check that the pool reconnects, finishes elections, agrees on data and
    keeps serving requests.
    """
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 1)

    # The first node without a primary is the one to crash
    without_primary = [n for n in txnPoolNodeSet if not n.hasPrimary]
    crashed = without_primary[0]
    crashed_idx = txnPoolNodeSet.index(crashed)
    survivors = [n for n in txnPoolNodeSet if n != crashed]

    def checkFlakyConnected(conn=True):
        # Every surviving node must (or must not) see the crashed node's
        # stack among its connected remotes
        for peer in survivors:
            listed = crashed.nodestack.name in peer.nodestack.connecteds
            assert listed if conn else not listed

    checkFlakyConnected(True)

    crashed.stop()
    logger.debug('Stopped node {}'.format(crashed))
    looper.removeProdable(crashed)
    looper.runFor(1)
    stopNodes([crashed], looper)

    # TODO Select or create the timeout from 'waits'. Don't use constant.
    looper.run(eventually(checkFlakyConnected, False,
                          retryWait=1, timeout=60))
    looper.runFor(1)

    # Bring up a replacement under the same name and host addresses
    dirs = PNodeConfigHelper(crashed.name, tconf, chroot=tdir)
    replacement = TestNode(crashed.name,
                           ledger_dir=dirs.ledger_dir,
                           keys_dir=dirs.keys_dir,
                           genesis_dir=dirs.genesis_dir,
                           plugins_dir=dirs.plugins_dir,
                           config=tconf,
                           ha=crashed.nodestack.ha,
                           cliha=crashed.clientstack.ha)
    looper.add(replacement)
    txnPoolNodeSet[crashed_idx] = replacement

    # TODO Select or create the timeout from 'waits'. Don't use constant.
    looper.run(eventually(checkFlakyConnected, True,
                          retryWait=2, timeout=50))
    ensureElectionsDone(looper, txnPoolNodeSet, retryWait=2)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
    send_reqs_to_nodes_and_verify_all_replies(looper, wallet1, client1, 10)
def restart_nodes(looper, nodeSet, restart_set, tconf, tdir, allPluginsPath,
                  after_restart_timeout=None, per_add_timeout=None):
    """
    Stop every node in `restart_set` and replace each one in `nodeSet` with
    a newly created `TestNode` of the same name and addresses.

    :param looper: looper the nodes are removed from and added to
    :param nodeSet: list of pool nodes; updated in place with the new nodes
    :param restart_set: nodes to stop and re-create
    :param tconf: config passed to `PNodeConfigHelper` and `TestNode`
    :param tdir: chroot passed to `PNodeConfigHelper`
    :param allPluginsPath: plugin paths passed to `TestNode`
    :param after_restart_timeout: if truthy, time to run the looper between
        stopping and re-creating; also used as the timeout of the final
        connectivity check
    :param per_add_timeout: if truthy, connectivity is checked after each
        re-created node with this timeout and the final check over the
        whole `nodeSet` is skipped
    """
    # Phase 1: bring all selected nodes down
    for stopping in restart_set:
        stopping.cleanupOnStopping = True
        stopping.stop()
        looper.removeProdable(stopping)

    connected_so_far = [n for n in nodeSet if n not in restart_set]
    for stopped in restart_set:
        ensure_node_disconnected(looper, stopped, nodeSet, timeout=2)

    if after_restart_timeout:
        looper.runFor(after_restart_timeout)

    # Phase 2: re-create the nodes one by one
    for old_node in restart_set:
        helper = PNodeConfigHelper(old_node.name, tconf, chroot=tdir)
        fresh_node = TestNode(old_node.name,
                              config_helper=helper,
                              config=tconf,
                              pluginPaths=allPluginsPath,
                              ha=old_node.nodestack.ha,
                              cliha=old_node.clientstack.ha)
        looper.add(fresh_node)
        nodeSet[nodeSet.index(old_node)] = fresh_node
        if per_add_timeout:
            looper.run(checkNodesConnected(connected_so_far + [fresh_node],
                                           customTimeout=per_add_timeout))
        connected_so_far.append(fresh_node)

    # Without per-node checks, verify the whole pool once at the end
    if not per_add_timeout:
        looper.run(checkNodesConnected(nodeSet,
                                       customTimeout=after_restart_timeout))
def test_belated_request_not_processed_if_already_in_3pc_process(
        looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client):
    """
    Install `req_delay(300)` on the fourth node's client inbox stasher and
    `cDelay(300)` on every node's inbox stasher, send one request, release
    the fourth node's client-side delay while the node-side delays are still
    installed, then release those as well; every domain ledger must have
    grown by exactly one entry.
    """
    fourth_node = txnPoolNodeSet[3]
    baseline_size = fourth_node.domainLedger.size

    def run_for_3pc_round():
        # Run for the sum of the expected propagate, pre-prepare, prepare
        # and committed times for the current pool size
        pool_size = len(txnPoolNodeSet)
        looper.runFor(waits.expectedPropagateTime(pool_size) +
                      waits.expectedPrePrepareTime(pool_size) +
                      waits.expectedPrepareTime(pool_size) +
                      waits.expectedCommittedTime(pool_size))

    fourth_node.clientIbStasher.delay(req_delay(300))
    for pool_node in txnPoolNodeSet:
        pool_node.nodeIbStasher.delay(cDelay(300))

    signed_reqs = sdk_signed_random_requests(looper, sdk_wallet_client, 1)
    sdk_send_signed_requests(sdk_pool_handle, signed_reqs)
    run_for_3pc_round()

    # Deliver the held-back client request to the fourth node
    fourth_node.clientIbStasher.reset_delays_and_process_delayeds()
    run_for_3pc_round()

    # Now release everything held back on the node-to-node side
    for pool_node in txnPoolNodeSet:
        pool_node.nodeIbStasher.reset_delays_and_process_delayeds()
    looper.runFor(waits.expectedOrderingTime(
        fourth_node.replicas.num_replicas))

    for pool_node in txnPoolNodeSet:
        assert pool_node.domainLedger.size == baseline_size + 1
def test_pool_reaches_quorum_after_f_plus_2_nodes_turned_off_and_later_on(
        looper, allPluginsPath, tdir, tconf,
        txnPoolNodeSet, wallet1, client1, client1Connected):
    """
    Stop the first three nodes one at a time, then start them again in
    reverse order. After each step a request is sent and, depending on the
    step, is expected either to get sufficient replies or to be neither
    replied nor ordered. The view number is expected to be one above the
    initial one from the first stop onwards.
    """
    nodes = txnPoolNodeSet
    expected_view_no = nodes[0].viewNo + 1

    def expect_request_served():
        req = sendRandomRequest(wallet1, client1)
        waitForSufficientRepliesForRequests(looper, client1, requests=[req])

    def expect_request_stalled():
        req = sendRandomRequest(wallet1, client1)
        verify_request_not_replied_and_not_ordered(req, looper, client1, nodes)

    def restart(idx):
        nodes[idx] = start_stopped_node(nodes[idx], looper, tconf,
                                        tdir, allPluginsPath)

    expect_request_served()

    # First node down: view changes and requests are still served
    stop_node(nodes[0], looper, nodes)
    waitForViewChange(looper, nodes[1:], expectedViewNo=expected_view_no)
    ensureElectionsDone(looper, nodes[1:],
                        numInstances=getRequiredInstances(nodeCount))
    expect_request_served()

    # Second node down: same view, requests are no longer served
    stop_node(nodes[1], looper, nodes)
    looper.runFor(tconf.ToleratePrimaryDisconnection + 2)
    checkViewNoForNodes(nodes[2:], expected_view_no)
    expect_request_stalled()

    # Third node down: still the same view, still no service
    stop_node(nodes[2], looper, nodes)
    looper.runFor(tconf.ToleratePrimaryDisconnection + 2)
    checkViewNoForNodes(nodes[3:], expected_view_no)
    expect_request_stalled()

    # Third node back: requests are still not served
    restart(2)
    looper.runFor(waits.expectedPoolElectionTimeout(len(nodes)))
    expect_request_stalled()

    # Second node back: requests are served again
    restart(1)
    ensureElectionsDone(looper, nodes[1:],
                        numInstances=getRequiredInstances(nodeCount))
    waitForViewChange(looper, nodes[1:], expectedViewNo=expected_view_no)
    expect_request_served()

    # First node back: the whole pool is in the expected view
    restart(0)
    ensureElectionsDone(looper, nodes,
                        numInstances=getRequiredInstances(nodeCount))
    waitForViewChange(looper, nodes, expectedViewNo=expected_view_no)
    expect_request_served()
def testPostingThroughput(postingStatsEnabled,
                          decreasedMonitoringTimeouts,
                          looper,
                          txnPoolNodeSet,
                          sdk_wallet_client, sdk_pool_handle):
    """
    The throughput after `DashboardUpdateFreq` seconds and before sending any
    requests should be zero.
    Send `n` requests in less than `ThroughputWindowSize` seconds and the
    throughput till `ThroughputWindowSize` should consider those `n` requests.
    After `ThroughputWindowSize` seconds the throughput should be zero
    Test `totalRequests` too.
    """
    config = decreasedMonitoringTimeouts

    # We are sleeping for this window size, because we need to clear previous
    # values that were being stored for this much time in tests
    looper.runFor(config.ThroughputWindowSize)

    reqCount = 10
    for node in txnPoolNodeSet:
        assert node.monitor.highResThroughput == 0
        assert node.monitor.totalRequests == 0

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, reqCount)

    for node in txnPoolNodeSet:
        assert len(node.monitor.orderedRequestsInLast) == reqCount
        assert node.monitor.highResThroughput > 0
        assert node.monitor.totalRequests == reqCount
        # TODO: Add implementation to actually call firebase plugin
        # and test if firebase plugin is sending total request count
        # if node is primary

    looper.runFor(config.DashboardUpdateFreq)

    for node in txnPoolNodeSet:
        # This comparison used to be a bare expression whose result was
        # thrown away, so nothing was verified. It is now asserted.
        # NOTE(review): relies on the monitor spy recording
        # `sendThroughput` - confirm against the spy configuration.
        assert node.monitor.spylog.count(
            Monitor.sendThroughput.__name__) > 0

    # Run for latency window duration so that `orderedRequestsInLast`
    # becomes empty
    looper.runFor(config.ThroughputWindowSize)

    def chk():
        for node in txnPoolNodeSet:
            assert len(node.monitor.orderedRequestsInLast) == 0
            assert node.monitor.highResThroughput == 0
            assert node.monitor.totalRequests == reqCount

    timeout = config.ThroughputWindowSize
    looper.run(eventually(chk, retryWait=1, timeout=timeout))
def testPostingLatency(postingStatsEnabled,
                       decreasedMonitoringTimeouts,
                       looper,
                       txnPoolNodeSet,
                       sdk_wallet_client, sdk_pool_handle):
    """
    The latencies (master as well as average of backups) after
    `DashboardUpdateFreq` seconds and before sending any requests should be
    zero.
    Send `n` requests in less than `LatencyWindowSize` seconds and the
    latency till `LatencyWindowSize` should consider those `n` requests.
    After `LatencyWindowSize` seconds the latencies should be zero
    """
    config = decreasedMonitoringTimeouts

    # Run for latency window duration so that `latenciesByMasterInLast` and
    # `latenciesByBackupsInLast` become empty
    looper.runFor(config.LatencyWindowSize)

    reqCount = 10
    for node in txnPoolNodeSet:
        assert node.monitor.masterLatency == 0
        assert node.monitor.avgBackupLatency == 0

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, reqCount)

    for node in txnPoolNodeSet:
        assert node.monitor.masterLatency > 0
        assert node.monitor.avgBackupLatency > 0

    looper.runFor(config.DashboardUpdateFreq)

    for node in txnPoolNodeSet:
        # This comparison used to be a bare expression whose result was
        # thrown away, so nothing was verified. It is now asserted.
        # NOTE(review): relies on the monitor spy recording
        # `sendLatencies` - confirm against the spy configuration.
        assert node.monitor.spylog.count(
            Monitor.sendLatencies.__name__) > 0

    # Run for latency window duration so that `latenciesByMasterInLast` and
    # `latenciesByBackupsInLast` become empty
    looper.runFor(config.LatencyWindowSize)

    def chk():
        for node in txnPoolNodeSet:
            assert node.monitor.masterLatency == 0
            assert node.monitor.avgBackupLatency == 0

    timeout = config.LatencyWindowSize
    looper.run(eventually(chk, retryWait=1, timeout=timeout))
def test_quorum_after_f_plus_2_nodes_but_not_primary_turned_off_and_later_on(
        looper, allPluginsPath, tdir, tconf,
        txnPoolNodeSet, wallet1, client1, client1Connected):
    """
    Stop the nodes at indices 4, 3 and 2 one at a time, then start them
    again in the same order (4, 3, 2). The view number is expected to stay
    0 throughout. Requests sent after each step are expected either to get
    sufficient replies or to be neither replied nor ordered; the ones that
    were not served are expected to be answered once the node at index 3 is
    back.
    """
    nodes = txnPoolNodeSet

    def wait_out_primary_disconnection():
        looper.runFor(tconf.ToleratePrimaryDisconnection +
                      waits.expectedPoolElectionTimeout(len(nodes)))

    def restart(idx):
        nodes[idx] = start_stopped_node(nodes[idx], looper, tconf,
                                        tdir, allPluginsPath)

    request1 = sendRandomRequest(wallet1, client1)
    waitForSufficientRepliesForRequests(looper, client1, requests=[request1])

    # Node 4 down: no view change, requests are still served
    stop_node(nodes[4], looper, nodes)
    wait_out_primary_disconnection()
    checkViewNoForNodes(nodes[:4], expectedViewNo=0)

    request2 = sendRandomRequest(wallet1, client1)
    waitForSufficientRepliesForRequests(looper, client1, requests=[request2])

    # Node 3 down: no view change, requests are no longer served
    stop_node(nodes[3], looper, nodes)
    wait_out_primary_disconnection()
    checkViewNoForNodes(nodes[:3], expectedViewNo=0)

    request3 = sendRandomRequest(wallet1, client1)
    verify_request_not_replied_and_not_ordered(request3, looper, client1, nodes)

    # Node 2 down: no view change, still no service
    stop_node(nodes[2], looper, nodes)
    wait_out_primary_disconnection()
    checkViewNoForNodes(nodes[:2], expectedViewNo=0)

    request4 = sendRandomRequest(wallet1, client1)
    verify_request_not_replied_and_not_ordered(request4, looper, client1, nodes)

    # Node 4 back: requests are still not served
    restart(4)
    looper.runFor(waits.expectedPoolElectionTimeout(len(nodes)))
    checkViewNoForNodes(nodes[:2] + nodes[4:], expectedViewNo=0)

    request5 = sendRandomRequest(wallet1, client1)
    verify_request_not_replied_and_not_ordered(request5, looper, client1, nodes)

    # Node 3 back: the pending requests and a new one get answered
    restart(3)
    ensureElectionsDone(looper, nodes[:2] + nodes[3:],
                        numInstances=getRequiredInstances(nodeCount))
    checkViewNoForNodes(nodes[:2] + nodes[3:], expectedViewNo=0)

    request6 = sendRandomRequest(wallet1, client1)
    waitForSufficientRepliesForRequests(
        looper, client1,
        requests=[request3, request4, request5, request6])

    # Node 2 back: whole pool in view 0 and serving requests
    restart(2)
    ensureElectionsDone(looper, nodes,
                        numInstances=getRequiredInstances(nodeCount))
    checkViewNoForNodes(nodes, expectedViewNo=0)

    request7 = sendRandomRequest(wallet1, client1)
    waitForSufficientRepliesForRequests(looper, client1, requests=[request7])
def test_view_change_after_back_to_quorum_with_disconnected_primary(
        txnPoolNodeSet, looper, sdk_pool_handle, sdk_wallet_client,
        tdir, tconf, allPluginsPath):
    """
    In a 4-node pool: restart the primary (Alpha) so that Beta becomes
    primary, disconnect Delta and then Beta, check that no InstanceChange
    is sent while those two are disconnected, then start Delta again and
    expect a view change and a working pool.
    """
    assert len(txnPoolNodeSet) == 4

    primary = get_master_primary_node(txnPoolNodeSet)
    assert primary.name == "Alpha"

    # 1. Trigger a view change by restarting the primary (Alpha)
    nodes = ensure_view_change_by_primary_restart(
        looper, txnPoolNodeSet, tconf, tdir, allPluginsPath,
        customTimeout=2 * tconf.VIEW_CHANGE_TIMEOUT)

    # The primary is expected to be Beta now
    primary = get_master_primary_node(nodes)
    assert primary.name == "Beta"

    # 2. Stop the non-primary node Delta; no view change is expected
    delta = [n for n in nodes if n.name == "Delta"][0]
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, delta)
    looper.removeProdable(delta)
    remaining_nodes = list(set(nodes) - {delta})

    # The primary is about to be stopped. Remember how many InstanceChange
    # messages each node has sent so far, to verify later that none were
    # added while fewer nodes than a quorum are connected.
    ic_before = {
        n.name: n.view_changer.spylog.count(
            ViewChanger.sendInstanceChange.__name__)
        for n in remaining_nodes
    }

    # 3. Disconnect the primary
    disconnect_node_and_ensure_disconnected(looper, remaining_nodes, primary)
    looper.removeProdable(primary)

    # Outlive the ToleratePrimaryDisconnection timeout, then check that no
    # InstanceChange messages were sent
    looper.runFor(tconf.ToleratePrimaryDisconnection + 5)
    remaining_nodes = list(set(remaining_nodes) - {primary})
    for n in remaining_nodes:
        ic_now = n.view_changer.spylog.count(
            ViewChanger.sendInstanceChange.__name__)
        assert ic_before[n.name] == ic_now

    view_no = checkViewNoForNodes(remaining_nodes)

    # 4. Start Delta (non-primary). The primary (Beta) is still
    # disconnected, but there is now a quorum to choose a new one.
    restarted_delta = start_stopped_node(delta, looper, tconf,
                                         tdir, allPluginsPath,
                                         delay_instance_change_msgs=False)
    remaining_nodes = remaining_nodes + [restarted_delta]

    # 5. The view change must have happened
    waitForViewChange(looper, remaining_nodes,
                      expectedViewNo=(view_no + 1),
                      customTimeout=2 * tconf.VIEW_CHANGE_TIMEOUT)

    # The pool must work properly
    sdk_send_random_and_check(looper, remaining_nodes,
                              sdk_pool_handle, sdk_wallet_client, 3)
    ensure_all_nodes_have_same_data(looper, nodes=remaining_nodes)
def test_node_detecting_lag_from_view_change_done_messages(
        txnPoolNodeSet, looper, wallet1, client1, client1Connected, tconf):
    """
    A slow node enters the view change with a `last_prepared` lower than
    the other nodes. Its catchup brings no txns and it finds that it has
    already ordered up to its `last_prepared`; once it receives
    ViewChangeDone messages it starts catchup again and this time gets the
    txns.
    To get there, all 3PC messages to one node are delayed so that it has a
    different last_prepared than the others before the view change, and the
    other nodes process COMMITs and INSTANCE_CHANGEs with a delay.
    """
    send_reqs_batches_and_get_suff_replies(looper, wallet1, client1, 2 * 3, 3)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)

    slow_node = getNonPrimaryReplicas(txnPoolNodeSet, 0)[-1].node
    fast_nodes = [n for n in txnPoolNodeSet if n != slow_node]
    slow_replica = slow_node.master_replica
    fast_replicas = [n.master_replica for n in fast_nodes]

    delay_3pc = 50
    delay_ic = tconf.PerfCheckFreq + 5
    delay_commit = delay_ic + 10

    delay_3pc_messages([slow_node], 0, delay_3pc)
    for fast in fast_nodes:
        fast.nodeIbStasher.delay(icDelay(delay_ic))
        fast.nodeIbStasher.delay(cDelay(delay_commit))

    reqs = []
    for _ in range(10):
        reqs = reqs + sendRandomRequests(wallet1, client1, 2)
        looper.runFor(.2)

    def chk1():
        slow_prepared = None
        for replica in fast_replicas:
            slow_prepared = slow_replica.last_prepared_certificate_in_view()
            assert compare_3PC_keys(
                slow_prepared,
                replica.last_prepared_certificate_in_view()) > 0
            assert slow_replica.last_ordered_3pc == replica.last_ordered_3pc

    looper.run(eventually(chk1))

    no_more_catchup_count = get_count(slow_node,
                                      slow_node.no_more_catchups_needed)

    # Record, per node, the master replica's last prepared value as of the
    # start of the view change
    prepareds = {}
    orig_methods = {}

    def patched_on_view_change_start(self):
        # Run the original handler first, then capture what it left behind
        orig_methods[self.node.name]()
        prepareds[self.node.name] = self.last_prepared_before_view_change

    for pool_node in txnPoolNodeSet:
        replica = pool_node.master_replica
        orig_methods[pool_node.name] = replica.on_view_change_start
        replica.on_view_change_start = types.MethodType(
            patched_on_view_change_start, replica)

    ensure_view_change(looper, txnPoolNodeSet, exclude_from_check=fast_nodes)

    def chk2():
        # The slow node's last_prepared is behind that of the fast nodes
        slow_name = slow_replica.node.name
        for replica in fast_replicas:
            assert compare_3PC_keys(prepareds[slow_name],
                                    prepareds[replica.node.name]) > 0

    looper.run(eventually(chk2, timeout=delay_ic + 5))

    last_start_catchup_call_at = None
    no_more_catchup_call_at = None

    def chk3():
        # `no_more_catchups_needed` ran because the node saw no need for
        # catchup
        nonlocal last_start_catchup_call_at, no_more_catchup_call_at
        new_calls = get_count(slow_node, slow_node.no_more_catchups_needed) \
            - no_more_catchup_count
        assert new_calls > 0
        no_more_catchup_call_at = slow_node.spylog.getLast(
            slow_node.no_more_catchups_needed).starttime
        last_start_catchup_call_at = slow_node.spylog.getLast(
            slow_node.start_catchup).starttime

    looper.run(eventually(chk3, timeout=delay_commit))

    for fast in fast_nodes:
        fast.nodeIbStasher.reset_delays_and_process_delayeds()
        fast.nodeIbStasher.reset_delays_and_process_delayeds()

    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)

    # A later catchup must have started after both recorded timestamps
    assert slow_node.spylog.getLast(
        slow_node.start_catchup).starttime > no_more_catchup_call_at
    assert slow_node.spylog.getLast(
        slow_node.start_catchup).starttime > last_start_catchup_call_at
def testProtocolInstanceCannotBecomeActiveWithLessThanFourServers(
        tconf_for_func, tdir_for_func):
    """
    A protocol instance needs a minimum number of nodes to come up. With
    the values used here (13 nodes, f = 4) all nodes are removed and added
    back one at a time: every node is expected to be `starting` while fewer
    than `nodeCount - f` nodes are up, `started_hungry` until the last node
    is added, and `started` once all of them are back.
    """
    nodeCount = 13
    f = 4
    minimumNodesToBeUp = nodeCount - f

    nodeNames = genNodeNames(nodeCount)
    with TestNodeSet(tconf_for_func, names=nodeNames,
                     tmpdir=tdir_for_func) as nodeSet:
        with Looper(nodeSet) as looper:

            # helpers

            def genExpectedStates(connecteds: Iterable[str]):
                states = {}
                for name in nodeNames:
                    if name in connecteds:
                        states[name] = CONNECTED
                    else:
                        states[name] = JOINED_NOT_ALLOWED
                return states

            def checkNodeStatusRemotesAndF(expectedStatus: Status,
                                           nodeIdx: int):
                for node in nodeSet.nodes.values():
                    expected = genExpectedStates(nodeNames[:nodeIdx + 1])
                    checkNodeRemotes(node, expected)
                    assert node.status == expectedStatus

            def addNodeBackAndCheck(nodeIdx: int, expectedStatus: Status):
                logger.info("Add back the {} node and see status of {}".
                            format(ordinal(nodeIdx + 1), expectedStatus))
                addNodeBack(nodeSet, looper, nodeNames[nodeIdx])

                # TODO: Probably it's better to modify waits.* functions
                timeout = (waits.expectedNodeStartUpTimeout() +
                           waits.expectedPoolInterconnectionTime(
                               len(nodeSet))) * 1.5
                looper.run(eventually(checkNodeStatusRemotesAndF,
                                      expectedStatus,
                                      nodeIdx,
                                      retryWait=1, timeout=timeout))

            def expected_status_for(up_count):
                # Status every node should report with `up_count` nodes up
                if up_count < minimumNodesToBeUp:
                    return Status.starting
                if up_count < nodeCount:
                    return Status.started_hungry
                return Status.started

            logger.debug("Sharing keys")
            looper.run(checkNodesConnected(nodeSet))

            logger.debug("Remove all the nodes")
            for name in nodeNames:
                looper.removeProdable(nodeSet.nodes[name])
                nodeSet.removeNode(name)

            looper.runFor(10)

            logger.debug("Add nodes back one at a time")
            for idx in range(nodeCount):
                addNodeBackAndCheck(idx, expected_status_for(idx + 1))