def test_node_load_after_disconnect(looper, txnPoolNodeSet, tconf, tdirWithPoolTxns, allPluginsPath, poolTxnStewardData, capsys):
    """
    Stop the last node of the pool, keep the remaining pool under a sustained
    client load, then restart the stopped node and check it reconnects.
    """
    client, wallet = buildPoolClientAndWallet(poolTxnStewardData,
                                              tdirWithPoolTxns,
                                              clientClass=TestClient)
    looper.add(client)
    looper.run(client.ensureConnectedToNodes())

    # Take the last node down and stop scheduling it on the looper.
    stopped_node = txnPoolNodeSet[-1]
    with capsys.disabled():
        print("Stopping node {} with pool ledger size {}".
              format(stopped_node, stopped_node.poolManager.txnSeqNo))
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet,
                                            stopped_node)
    looper.removeProdable(stopped_node)

    # Keep the reduced pool busy with many batches of client transactions.
    client_batches = 80
    txns_per_batch = 10
    for batch_no in range(client_batches):
        batch_start = perf_counter()
        sendReqsToNodesAndVerifySuffReplies(looper, wallet, client,
                                            txns_per_batch,
                                            override_timeout_limit=True)
        with capsys.disabled():
            print('{} executed {} client txns in {:.2f} seconds'.
                  format(batch_no + 1, txns_per_batch,
                         perf_counter() - batch_start))

    # Bring the node back with the same network addresses and verify that
    # the whole pool reconnects.
    node_ha = HA(*stopped_node.nodestack.ha)
    client_ha = HA(*stopped_node.clientstack.ha)
    restarted_node = TestNode(stopped_node.name,
                              basedirpath=tdirWithPoolTxns,
                              config=tconf,
                              ha=node_ha, cliha=client_ha,
                              pluginPaths=allPluginsPath)
    looper.add(restarted_node)
    txnPoolNodeSet[-1] = restarted_node
    looper.run(checkNodesConnected(txnPoolNodeSet))
def test_node_load_after_one_node_drops_all_msgs(
        looper, txnPoolNodeSet, tconf,
        tdirWithPoolTxns, allPluginsPath,
        poolTxnStewardData, capsys):
    """
    Patch one node so it silently drops every incoming node message, then
    verify the rest of the pool still processes a heavy client load.
    """
    client, wallet = buildPoolClientAndWallet(poolTxnStewardData,
                                              tdirWithPoolTxns,
                                              clientClass=TestClient)
    looper.add(client)
    looper.run(client.ensureConnectedToNodes())

    deaf_node = txnPoolNodeSet[-1]
    with capsys.disabled():
        print("Patching node {}".format(deaf_node))

    def drop_node_msg(self, wrappedMsg):
        # do nothing with an incoming node message
        pass

    # Replace the node's inbound handler with the no-op above.
    deaf_node.handleOneNodeMsg = MethodType(drop_node_msg, deaf_node)

    client_batches = 120
    txns_per_batch = 25
    for batch_no in range(client_batches):
        started_at = perf_counter()
        sendReqsToNodesAndVerifySuffReplies(looper, wallet, client,
                                            txns_per_batch,
                                            override_timeout_limit=True)
        with capsys.disabled():
            print('{} executed {} client txns in {:.2f} seconds'.
                  format(batch_no + 1, txns_per_batch,
                         perf_counter() - started_at))
def test_state_regenerated_from_ledger(looper, txnPoolNodeSet, client1, wallet1, client1Connected, tdir, tconf, allPluginsPath):
    """
    Node loses its state database but recreates it from ledger after start
    """
    # Put some transactions through the pool so every node has state.
    sent_batches = 10
    send_reqs_batches_and_get_suff_replies(looper, wallet1, client1,
                                           5 * sent_batches, sent_batches)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)

    target_node = txnPoolNodeSet[-1]
    domain_state = target_node.states[DOMAIN_LEDGER_ID]
    assert not domain_state.isEmpty
    db_path = domain_state._kv.db_path
    node_ha = HA(*target_node.nodestack.ha)
    client_ha = HA(*target_node.clientstack.ha)

    # Stop the node and wipe its state database from disk.
    target_node.stop()
    looper.removeProdable(target_node)
    shutil.rmtree(db_path)

    # Restart the node with the same identity; the state database must be
    # rebuilt from the ledger.
    config_helper = PNodeConfigHelper(target_node.name, tconf, chroot=tdir)
    restarted_node = TestNode(target_node.name,
                              ledger_dir=config_helper.ledger_dir,
                              keys_dir=config_helper.keys_dir,
                              genesis_dir=config_helper.genesis_dir,
                              plugins_dir=config_helper.plugins_dir,
                              config=tconf,
                              ha=node_ha, cliha=client_ha,
                              pluginPaths=allPluginsPath)
    looper.add(restarted_node)
    txnPoolNodeSet[-1] = restarted_node
    looper.run(checkNodesConnected(txnPoolNodeSet))
    waitNodeDataEquality(looper, restarted_node, *txnPoolNodeSet[:-1])
def test_view_change_on_start(tconf, txnPoolNodeSet, looper, wallet1, client1,
                              client1Connected):
    """
    Delay 3PC messages so requests stay uncommitted, then verify a view
    change happens on its own (triggered by the performance check) and the
    pool recovers and processes requests afterwards.
    """
    old_view_no = txnPoolNodeSet[0].viewNo
    master_primary = get_master_primary_node(txnPoolNodeSet)
    other_nodes = [n for n in txnPoolNodeSet if n != master_primary]

    # Delay all 3PC messages of the master instance so the sent batches
    # cannot be ordered.
    delay_3pc = 10
    delay_3pc_messages(txnPoolNodeSet, 0, delay_3pc)
    sent_batches = 2
    sendRandomRequests(wallet1, client1, sent_batches * tconf.Max3PCBatchSize)

    def chk1():
        # Non-primary nodes share the same uncommitted roots, while the
        # primary (which applied the batches) has diverged from them.
        t_root, s_root = check_uncommitteds_equal(other_nodes)
        assert master_primary.domainLedger.uncommittedRootHash != t_root
        assert master_primary.states[DOMAIN_LEDGER_ID].headHash != s_root

    looper.run(eventually(chk1, retryWait=1))

    # Wait long enough for the perf check to fire and for the election to
    # finish in the new view.
    timeout = tconf.PerfCheckFreq + \
        waits.expectedPoolElectionTimeout(len(txnPoolNodeSet))
    waitForViewChange(looper, txnPoolNodeSet, old_view_no + 1,
                      customTimeout=timeout)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
    check_uncommitteds_equal(txnPoolNodeSet)

    # Release the delayed 3PC messages and make sure the pool is functional.
    reset_delays_and_process_delayeds(txnPoolNodeSet)
    send_reqs_to_nodes_and_verify_all_replies(looper, wallet1, client1,
                                              2 * Max3PCBatchSize,
                                              add_delay_to_timeout=delay_3pc)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
def testPrepareDigest(setup, looper, sent1):
    """
    A non primary replica sends PREPARE message with incorrect digest to all
    other replicas. Other replicas should raise suspicion the PREPARE seen
    """
    faulty_name = setup.faultyRep.name
    all_replicas = (setup.primaryRep, *setup.nonPrimaryReps)

    def suspicions_raised():
        # Every node except the one from which PREPARE with incorrect
        # digest was sent should raise suspicion for the PREPARE message
        for replica in all_replicas:
            if replica.name == faulty_name:
                continue
            suspicions = getNodeSuspicions(replica.node,
                                           Suspicions.PR_DIGEST_WRONG.code)
            assert len(suspicions) == 1

    node_count = len(setup.primaryRep.node.nodeReg)
    timeout = waits.expectedTransactionExecutionTime(node_count)
    looper.run(eventually(suspicions_raised, retryWait=1, timeout=timeout))
def test_old_non_primary_restart_after_view_change(new_node_in_correct_view, looper, txnPoolNodeSet, tdirWithPoolTxns, allPluginsPath, tconf, wallet1, client1):
    """
    An existing non-primary node crashes and then view change happens,
    the crashed node comes back up after view change
    """
    crashed_node = getNonPrimaryReplicas(txnPoolNodeSet, 0)[-1].node
    view_no_before = crashed_node.viewNo

    # Stop non-primary
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet,
                                            crashed_node, stopNode=True)
    looper.removeProdable(crashed_node)
    remaining_nodes = list(set(txnPoolNodeSet) - {crashed_node})

    # Send some requests before view change
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 5)
    ensure_view_change(looper, remaining_nodes)
    ensureElectionsDone(looper, remaining_nodes)
    # Send some requests after view change
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 5)

    # Bring the crashed node back; it must adopt the new view.
    restarted_node = start_stopped_node(crashed_node, looper, tconf,
                                        tdirWithPoolTxns, allPluginsPath)
    txnPoolNodeSet = remaining_nodes + [restarted_node]
    looper.run(eventually(checkViewNoForNodes, txnPoolNodeSet,
                          view_no_before + 1, timeout=10))

    # The restarted node must have successfully started its own view change.
    successful_starts = getAllReturnVals(
        restarted_node,
        restarted_node._start_view_change_if_possible,
        compare_val_to=True)
    assert len(successful_starts) > 0
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
    assert not restarted_node._next_view_indications
def testPrePrepareDigest(setup, looper, sent1):
    """
    A primary replica sends PRE-PREPARE message with incorrect digest to the
    non primary replicas but non primary replicas should raise suspicion on
    encountering the PRE-PREPARE. Also it should send no PREPARE
    """
    primaryRep, nonPrimaryReps = setup.primaryRep, setup.nonPrimaryReps

    def verify_suspicions():
        susp_code = Suspicions.PPR_DIGEST_WRONG.code
        for replica in nonPrimaryReps:
            # Every node with non primary replicas of instance 0 should
            # raise suspicion.  Since the node sending bad requests might
            # become primary of some backup instance after view changes, it
            # will again send a PRE-PREPARE with incorrect digest, so other
            # nodes might raise suspicion more than once
            assert len(getNodeSuspicions(replica.node, susp_code)) >= 1
            # No non primary replica should send any PREPARE
            assert len(sentPrepare(replica, viewNo=0, ppSeqNo=1)) == 0

    timeout = waits.expectedTransactionExecutionTime(
        len(primaryRep.node.nodeReg))
    looper.run(eventually(verify_suspicions, retryWait=1, timeout=timeout))
def test_node_requests_missing_three_phase_messages_after_long_disconnection(
        looper, txnPoolNodeSet, sdk_wallet_client, sdk_pool_handle, tconf,
        tdirWithPoolTxns, allPluginsPath):
    """
    2 of 4 nodes go down, so pool can not process any more incoming requests.
    A new request comes in.
    Test than waits for some time to ensure that PrePrepare was created
    long enough seconds to be dropped by time checker.
    Two stopped nodes come back alive.
    Another request comes in.
    Check that previously disconnected two nodes request missing PREPARES and
    PREPREPARES and the pool successfully handles both transactions.
    """
    INIT_REQS_CNT = 10
    MISSING_REQS_CNT = 1
    REQS_AFTER_RECONNECT_CNT = 1
    alive_nodes = []
    disconnected_nodes = []

    # Split the pool into the nodes that stay up and the ones that will be
    # disconnected.
    # NOTE(review): `hasPrimary is not None` looks suspicious — if
    # `hasPrimary` is a plain bool this branch is always true; presumably it
    # can be None here. TODO confirm against the Node implementation.
    for node in txnPoolNodeSet:
        if node.hasPrimary is not None:
            alive_nodes.append(node)
        else:
            disconnected_nodes.append(node)

    # Seed the pool with some transactions while everyone is connected.
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, INIT_REQS_CNT)
    waitNodeDataEquality(looper, disconnected_nodes[0], *txnPoolNodeSet)
    init_ledger_size = txnPoolNodeSet[0].domainLedger.size

    # Disconnect (but do not stop) the chosen nodes.
    current_node_set = set(txnPoolNodeSet)
    for node in disconnected_nodes:
        disconnect_node_and_ensure_disconnected(looper, current_node_set,
                                                node, stopNode=False)
        current_node_set.remove(node)

    # This request can only be pre-prepared by the alive nodes.
    sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client,
                             MISSING_REQS_CNT)

    def check_pp_out_of_sync(alive_nodes, disconnected_nodes):
        # Alive nodes share one last PRE-PREPARE, disconnected nodes share
        # a different (older) one.
        def get_last_pp(node):
            return node.replicas._master_replica.lastPrePrepare
        last_3pc_key_alive = get_last_pp(alive_nodes[0])
        for node in alive_nodes[1:]:
            assert get_last_pp(node) == last_3pc_key_alive
        last_3pc_key_diconnected = get_last_pp(disconnected_nodes[0])
        assert last_3pc_key_diconnected != last_3pc_key_alive
        for node in disconnected_nodes[1:]:
            assert get_last_pp(node) == last_3pc_key_diconnected

    looper.run(
        eventually(check_pp_out_of_sync, alive_nodes, disconnected_nodes,
                   retryWait=1,
                   timeout=expectedPoolGetReadyTimeout(len(txnPoolNodeSet))))

    # Wait past the acceptable PRE-PREPARE age so the pending PRE-PREPARE
    # would be considered stale by the time checker.
    preprepare_deviation = 4
    tconf.ACCEPTABLE_DEVIATION_PREPREPARE_SECS = preprepare_deviation
    time.sleep(preprepare_deviation * 2)

    # Reconnect the nodes; they should request the missing 3PC messages.
    for node in disconnected_nodes:
        current_node_set.add(node)
        reconnect_node_and_ensure_connected(looper, current_node_set, node)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, REQS_AFTER_RECONNECT_CNT)
    waitNodeDataEquality(looper, disconnected_nodes[0], *txnPoolNodeSet)

    # Every node must have ordered both the missing and the new request.
    for node in txnPoolNodeSet:
        assert node.domainLedger.size == (init_ledger_size +
                                          MISSING_REQS_CNT +
                                          REQS_AFTER_RECONNECT_CNT)
def test_view_not_changed_when_short_disconnection(txnPoolNodeSet, looper,
                                                   wallet1, client1,
                                                   client1Connected, tconf):
    """
    When primary is disconnected but not long enough to trigger the timeout,
    view change should not happen
    """
    pr_node = get_master_primary_node(txnPoolNodeSet)
    view_no = checkViewNoForNodes(txnPoolNodeSet)

    # Record baseline spylog counts so the checks below can detect only
    # *new* calls made after the disconnection.
    lost_pr_calls = {
        node.name: node.spylog.count(node.lost_master_primary.__name__)
        for node in txnPoolNodeSet if node != pr_node
    }
    prp_inst_chg_calls = {
        node.name: node.spylog.count(node.propose_view_change.__name__)
        for node in txnPoolNodeSet if node != pr_node
    }
    recv_inst_chg_calls = {
        node.name: node.spylog.count(node.processInstanceChange.__name__)
        for node in txnPoolNodeSet if node != pr_node
    }

    def chk1():
        # Check that non-primary nodes detects losing connection with
        # primary
        for node in txnPoolNodeSet:
            if node != pr_node:
                assert node.spylog.count(node.lost_master_primary.__name__) \
                    > lost_pr_calls[node.name]

    def chk2():
        # Schedule an instance change but do not send it
        # since primary joins again
        for node in txnPoolNodeSet:
            if node != pr_node:
                assert node.spylog.count(node.propose_view_change.__name__) \
                    > prp_inst_chg_calls[node.name]
                assert node.spylog.count(node.processInstanceChange.__name__) \
                    == recv_inst_chg_calls[node.name]

    # Disconnect master's primary
    for node in txnPoolNodeSet:
        if node != pr_node:
            node.nodestack.getRemote(pr_node.nodestack.name).disconnect()

    # Keep the outage shorter than ToleratePrimaryDisconnection so no
    # INSTANCE_CHANGE is actually sent.
    timeout = min(tconf.ToleratePrimaryDisconnection - 1, 1)
    looper.run(eventually(chk1, retryWait=.2, timeout=timeout))

    # Reconnect master's primary
    for node in txnPoolNodeSet:
        if node != pr_node:
            node.nodestack.retryDisconnected()

    looper.run(eventually(chk2, retryWait=.2, timeout=timeout + 1))

    def chk3():
        # Check the view does not change
        with pytest.raises(AssertionError):
            assert checkViewNoForNodes(txnPoolNodeSet) == view_no + 1

    looper.run(eventually(chk3, retryWait=1, timeout=10))

    # Send some requests and make sure the request execute
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 5)
def test_node_requests_missing_three_phase_messages(looper, txnPoolNodeSet,
                                                     sdk_wallet_client,
                                                     sdk_pool_handle):
    """
    2 of 4 nodes go down, so pool can not process any more incoming requests.
    A new request comes in. After a while those 2 nodes come back alive.
    Another request comes in. Check that previously disconnected two nodes
    request missing PREPARES and PREPREPARES and the pool successfully handles
    both transactions after that.
    """
    INIT_REQS_CNT = 10
    MISSING_REQS_CNT = 1
    REQS_AFTER_RECONNECT_CNT = 1
    disconnected_nodes = txnPoolNodeSet[2:]
    alive_nodes = txnPoolNodeSet[:2]

    # Seed the pool with transactions while every node is connected.
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, INIT_REQS_CNT)
    init_ledger_size = txnPoolNodeSet[0].domainLedger.size

    # Disconnect (but do not stop) half of the pool.
    current_node_set = set(txnPoolNodeSet)
    for node in disconnected_nodes:
        disconnect_node_and_ensure_disconnected(looper, current_node_set,
                                                node, stopNode=False)
        current_node_set.remove(node)

    # This request can only be pre-prepared by the alive nodes.
    sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client,
                             MISSING_REQS_CNT)

    def check_pp_out_of_sync(alive_nodes, disconnected_nodes):
        # Alive nodes share one last PRE-PREPARE, disconnected nodes share
        # a different (older) one.
        def get_last_pp(node):
            return node.replicas._master_replica.lastPrePrepare
        last_3pc_key_alive = get_last_pp(alive_nodes[0])
        for node in alive_nodes[1:]:
            assert get_last_pp(node) == last_3pc_key_alive
        last_3pc_key_diconnected = get_last_pp(disconnected_nodes[0])
        assert last_3pc_key_diconnected != last_3pc_key_alive
        for node in disconnected_nodes[1:]:
            assert get_last_pp(node) == last_3pc_key_diconnected

    looper.run(
        eventually(check_pp_out_of_sync, alive_nodes, disconnected_nodes,
                   retryWait=1,
                   timeout=expectedPoolGetReadyTimeout(len(txnPoolNodeSet))))

    # Reconnect the nodes; they should fetch the missing 3PC messages.
    for node in disconnected_nodes:
        current_node_set.add(node)
        reconnect_node_and_ensure_connected(looper, current_node_set, node)

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, REQS_AFTER_RECONNECT_CNT)
    waitNodeDataEquality(looper, disconnected_nodes[0], *txnPoolNodeSet[:-1])

    # Every node must have ordered both the missing and the new request.
    for node in txnPoolNodeSet:
        assert node.domainLedger.size == (init_ledger_size +
                                          MISSING_REQS_CNT +
                                          REQS_AFTER_RECONNECT_CNT)
def test_slow_nodes_catchup_before_selecting_primary_in_new_view(
        looper, txnPoolNodeSet, steward1, stewardWallet, tconf, slow_node):
    """
    Delay 3PC to 1 node and then cause view change so by the time the view
    change happens(each node gets >n-f `INSTANCE_CHANGE`s), the slow node is
    behind other nodes. The should initiate catchup to come to the same state
    as other nodes.
    """
    fast_nodes = [n for n in txnPoolNodeSet if n != slow_node]
    delay = tconf.PerfCheckFreq

    # Bad network introduced: delay the slow node's PRE-PREPAREs, PREPAREs
    # and COMMITs by increasing amounts on the master instance.
    slow_node.nodeIbStasher.delay(ppDelay(delay, 0))
    slow_node.nodeIbStasher.delay(pDelay(2*delay, 0))
    slow_node.nodeIbStasher.delay(cDelay(3*delay, 0))
    for i in range(2):
        sendReqsToNodesAndVerifySuffReplies(looper, stewardWallet, steward1,
                                            20)
    waitNodeDataInequality(looper, slow_node, *fast_nodes)

    # Baseline counts so later assertions detect only new catchup activity.
    catchup_reply_counts = {n.name: n.ledgerManager.spylog.count(
        n.ledgerManager.processCatchupRep) for n in txnPoolNodeSet}
    catchup_done_counts = {n.name: n.spylog.count(n.allLedgersCaughtUp)
                           for n in txnPoolNodeSet}

    def slow_node_processed_some():
        # The slow node has received some PRE-PREPAREs
        assert slow_node.master_replica.batches

    looper.run(eventually(slow_node_processed_some, retryWait=1,
                          timeout=delay))

    # No reverts have been called by the slow node
    rv = getAllReturnVals(slow_node.replicas[0],
                          TestReplica.revert_unordered_batches)
    assert not rv or max(rv) == 0

    # Delay reception of catchup replies so ViewChangeDone can be received
    # before catchup completes
    delay_catchup_reply = 2
    slow_node.nodeIbStasher.delay(cr_delay(delay_catchup_reply))

    ensure_view_change(looper, txnPoolNodeSet)
    # `slow_node` will not have elections done but others will.
    checkProtocolInstanceSetup(looper, fast_nodes,
                               numInstances=len(slow_node.replicas),
                               retryWait=1)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)

    # `slow_node` does catchup, `fast_nodes` don't
    for n in txnPoolNodeSet:
        assert n.spylog.count(n.allLedgersCaughtUp) > catchup_done_counts[
            n.name]
        if n == slow_node:
            # Only the slow node should have processed catchup replies.
            assert n.ledgerManager.spylog.count(
                n.ledgerManager.processCatchupRep) > catchup_reply_counts[n.name]
        else:
            assert n.ledgerManager.spylog.count(
                n.ledgerManager.processCatchupRep) == catchup_reply_counts[n.name]

    # Greater than 0 batches were reverted by the slow node
    assert max(getAllReturnVals(
        slow_node.master_replica,
        slow_node.master_replica.revert_unordered_batches)) > 0

    # Bad network repaired
    slow_node.reset_delays_and_process_delayeds()

    # Make sure pool is functional
    sendReqsToNodesAndVerifySuffReplies(looper, stewardWallet, steward1, 5)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
def test_client_can_send_request_no_catchup(looper, poolTxnClientData, tdirWithPoolTxns, txnPoolNodeSet):
    """A client that has not completed catchup can still send a request."""
    client = new_client(poolTxnClientData, tdirWithPoolTxns)[0]
    looper.add(client)
    request = random_requests(1)[0]
    looper.run(eventually(can_send_request, client, request))
def test_client_can_send_read_requests_no_catchup(looper, poolTxnClientData, tdirWithPoolTxns, txnPoolNodeSet):
    """A client that has not completed catchup can still send read requests."""
    client = new_client(poolTxnClientData, tdirWithPoolTxns)[0]
    looper.add(client)
    looper.run(eventually(can_send_read_requests, client))
def test_view_change_gc_in_between_3pc_all_nodes_delays(
        looper, txnPoolNodeSet, wallet1, client):
    """
    Test that garbage collector compares the whole 3PC key (viewNo, ppSeqNo)
    and does not remove messages from node's queues that have higher viewNo
    than last ordered one even if their ppSeqNo are less or equal
    """
    numNodes = len(client.nodeReg)
    viewNo = checkViewNoForNodes(txnPoolNodeSet)

    # 1 send two messages one by one separately to make
    # node pool working with two batches
    # -> last_ordered_3pc = (+0, 2) [+0 means from the initial state]
    # (last_ordered_3pc here and futher is tracked
    # for master instances only cause non-master ones have
    # specific logic of its management which we don't care in
    # the test, see Replica::_setup_for_non_master)
    send_reqs_to_nodes_and_verify_all_replies(looper, wallet1, client, 1)
    send_reqs_to_nodes_and_verify_all_replies(looper, wallet1, client, 1)
    last_ordered_3pc = (viewNo, 2)
    check_nodes_last_ordered_3pc(txnPoolNodeSet, last_ordered_3pc)
    check_nodes_requests_size(txnPoolNodeSet, 2)

    # 2 do view change
    # -> GC should remove it from nodes' queues
    # -> viewNo = +1
    ensure_view_change_complete(looper, txnPoolNodeSet)
    viewNo = checkViewNoForNodes(txnPoolNodeSet, viewNo + 1)
    check_nodes_last_ordered_3pc(txnPoolNodeSet, last_ordered_3pc)
    check_nodes_requests_size(txnPoolNodeSet, 0)

    # 3 slow processing 3PC messages for all nodes (all replica instances)
    # randomly and send one more message
    # -> not ordered (last_ordered_3pc still equal (+0, 2)) but primaries
    # should at least send PRE-PREPAREs
    # TODO could it be not enough for wainting that at least primary
    # has sent PRE-PREPARE
    propagationTimeout = waits.expectedClientRequestPropagationTime(numNodes)
    delay_3pc_messages(txnPoolNodeSet, 0, delay=propagationTimeout * 2)
    delay_3pc_messages(txnPoolNodeSet, 1, delay=propagationTimeout * 2)
    requests = sendRandomRequests(wallet1, client, 1)

    def checkPrePrepareSentAtLeastByPrimary():
        # Every primary replica must have sent at least one PRE-PREPARE.
        for node in txnPoolNodeSet:
            for replica in node.replicas:
                if replica.isPrimary:
                    assert len(replica.sentPrePrepares)

    looper.run(
        eventually(checkPrePrepareSentAtLeastByPrimary,
                   retryWait=0.1, timeout=propagationTimeout))

    # 4 do view change
    # -> GC shouldn't remove anything because
    # last_ordered_3pc (+0, 1) < last message's 3pc key (+1, 1)
    # -> viewNo = 2
    ensure_view_change_complete(looper, txnPoolNodeSet)
    viewNoNew = checkViewNoForNodes(txnPoolNodeSet)
    # another view change could happen because of slow nodes
    assert viewNoNew - viewNo in (1, 2)
    viewNo = viewNoNew
    check_nodes_last_ordered_3pc(txnPoolNodeSet, last_ordered_3pc)
    check_nodes_requests_size(txnPoolNodeSet, 1)

    # 5 reset delays and wait for replies
    # -> new primaries should send new 3pc for last message
    # with 3pc key (+2, 1)
    # -> they should be ordered
    # -> last_ordered_3pc = (+2, 1)
    reset_delays_and_process_delayeds(txnPoolNodeSet)
    waitForSufficientRepliesForRequests(looper, client, requests=requests,
                                        fVal=numNodes - 1)
    checkViewNoForNodes(txnPoolNodeSet, viewNo)
    last_ordered_3pc = (viewNo, 1)
    check_nodes_last_ordered_3pc(txnPoolNodeSet, last_ordered_3pc)
    check_nodes_requests_size(txnPoolNodeSet, 1)

    # 6 do view change
    # -> GC should remove them
    ensure_view_change_complete(looper, txnPoolNodeSet)
    viewNo = checkViewNoForNodes(txnPoolNodeSet, viewNo + 1)
    check_nodes_last_ordered_3pc(txnPoolNodeSet, last_ordered_3pc)
    check_nodes_requests_size(txnPoolNodeSet, 0)
def test_reverted_unordered(txnPoolNodeSet, looper, wallet1, client1,
                            client1Connected):
    """
    Before starting catchup, revert any uncommitted changes to state and
    ledger. This is to avoid any re-application of requests that were
    ordered but stashed
    Example scenario
    prepared (1, 4)
    startViewChange
    start_catchup
    ...
    ....
    ...
    committed and send Ordered (1, 2)
    ...
    ....
    preLedgerCatchUp
    force_process_ordered, take out (1,2) and stash (1, 2)
    now process stashed Ordered(1,2), its requests will be applied again

    Simulation: Delay COMMITs to a node so that it can not order requests
    but has prepared them. Then trigger a view change and make sure the slow
    node has not ordered same number of requests as others but has prepared
    so it can order when it receives COMMITs while view change is in progress.
    The slow node should revert unordered batches and but it should eventually
    process the ordered requests, so delay LEDGER_STATUS too so catchup
    is delayed
    """
    slow_node = getNonPrimaryReplicas(txnPoolNodeSet, 0)[-1].node
    fast_nodes = [n for n in txnPoolNodeSet if n != slow_node]
    # Delay COMMITs on the master instance so slow_node prepares but never
    # orders the sent batches.
    slow_node.nodeIbStasher.delay(cDelay(120, 0))
    sent_batches = 5
    send_reqs_batches_and_get_suff_replies(looper, wallet1, client1,
                                           2 * sent_batches, sent_batches)

    # Fast nodes have same last ordered and same data
    last_ordered = [n.master_last_ordered_3PC for n in fast_nodes]
    assert check_if_all_equal_in_list(last_ordered)
    ensure_all_nodes_have_same_data(looper, fast_nodes)

    # Slow nodes have different last ordered than fast nodes
    assert last_ordered[0] != slow_node.master_last_ordered_3PC

    # Delay LEDGER_STATUS so catchup starts late
    slow_node.nodeIbStasher.delay(lsDelay(100))
    slow_node.nodeIbStasher.delay(msg_rep_delay(100))

    # slow_node has not reverted batches
    assert sent_batches not in getAllReturnVals(
        slow_node.master_replica,
        slow_node.master_replica.revert_unordered_batches)

    ensure_view_change(looper, txnPoolNodeSet)

    def chk1():
        # slow_node reverted all batches
        rv = getAllReturnVals(
            slow_node.master_replica,
            slow_node.master_replica.revert_unordered_batches)
        assert sent_batches in rv

    looper.run(eventually(chk1, retryWait=1))

    # After the view change slow_node has prepared same requests as the fast
    # nodes have ordered
    assert last_ordered[
        0] == slow_node.master_replica.last_prepared_before_view_change

    # Deliver COMMITs
    slow_node.nodeIbStasher.reset_delays_and_process_delayeds(COMMIT)

    def chk2():
        # slow_node orders all requests as others have
        assert last_ordered[0] == slow_node.master_last_ordered_3PC

    looper.run(eventually(chk2, retryWait=1))

    # Deliver LEDGER_STATUS so catchup can complete
    slow_node.nodeIbStasher.reset_delays_and_process_delayeds(LEDGER_STATUS)
    slow_node.nodeIbStasher.reset_delays_and_process_delayeds(MESSAGE_RESPONSE)

    # Ensure all nodes have same data
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    ensureElectionsDone(looper, txnPoolNodeSet)

    def chk3():
        # slow_node processed stashed Ordered requests successfully
        rv = getAllReturnVals(slow_node,
                              slow_node.processStashedOrderedReqs)
        assert sent_batches in rv

    looper.run(eventually(chk3, retryWait=1))

    # Ensure pool is functional
    ensure_pool_functional(looper, txnPoolNodeSet, wallet1, client1)
def test_6th_node_join_after_view_change_by_master_restart(
        looper, txnPoolNodeSet, tdir, tconf, allPluginsPath, steward1,
        stewardWallet, client_tdir, limitTestRunningTime):
    """
    Test steps:
    1. start pool of 4 nodes
    2. force 4 view change by restarting primary node
    3. now primary node must be Alpha, then add new node, named Epsilon
    4. ensure, that Epsilon was added and catch-up done
    5. send some txns
    6. force 4 view change. Now primary node is new added Epsilon
    7. add 6th node and ensure, that new node is catchuped
    """
    pool_of_nodes = txnPoolNodeSet
    # Force four view changes by restarting the current primary each time,
    # waiting for every node to catch up after each restart.
    for __ in range(4):
        pool_of_nodes = ensure_view_change_by_primary_restart(
            looper, pool_of_nodes, tconf, tdir, allPluginsPath,
            customTimeout=2 * tconf.VIEW_CHANGE_TIMEOUT)
        timeout = waits.expectedPoolCatchupTime(nodeCount=len(pool_of_nodes))
        for node in pool_of_nodes:
            looper.run(eventually(catchuped, node, timeout=2 * timeout))
    ensure_all_nodes_have_same_data(looper, pool_of_nodes,
                                    custom_timeout=timeout)
    sendReqsToNodesAndVerifySuffReplies(looper, stewardWallet, steward1, 5)

    # Add the 5th node (Epsilon) and keep the pool under load.
    new_epsilon_node = add_new_node(looper, pool_of_nodes, steward1,
                                    stewardWallet, tdir, client_tdir, tconf,
                                    allPluginsPath, name='Epsilon')
    sendReqsToNodesAndVerifySuffReplies(looper, stewardWallet, steward1, 5)
    """
    check that pool and domain ledgers for new node are in synced state
    """
    timeout = waits.expectedPoolCatchupTime(nodeCount=len(pool_of_nodes))
    for node in pool_of_nodes:
        looper.run(
            eventually(check_ledger_state, node, DOMAIN_LEDGER_ID,
                       LedgerState.synced, retryWait=.5, timeout=timeout))
        looper.run(
            eventually(check_ledger_state, node, POOL_LEDGER_ID,
                       LedgerState.synced, retryWait=.5, timeout=timeout))

    # Force four more view changes so the newly added node becomes primary.
    for __ in range(4):
        pool_of_nodes = ensure_view_change_by_primary_restart(
            looper, pool_of_nodes, tconf, tdir, allPluginsPath,
            customTimeout=2 * tconf.VIEW_CHANGE_TIMEOUT)
        timeout = waits.expectedPoolCatchupTime(nodeCount=len(pool_of_nodes))
        for node in pool_of_nodes:
            looper.run(eventually(catchuped, node, timeout=2 * timeout))
    sendReqsToNodesAndVerifySuffReplies(looper, stewardWallet, steward1, 2)

    # Add the 6th node (Psi) and verify its ledgers reach the synced state.
    new_psi_node = add_new_node(looper, pool_of_nodes, steward1,
                                stewardWallet, tdir, client_tdir, tconf,
                                allPluginsPath, name='Psi')
    looper.run(
        eventually(check_ledger_state, new_psi_node, DOMAIN_LEDGER_ID,
                   LedgerState.synced, retryWait=.5, timeout=5))
    looper.run(
        eventually(check_ledger_state, new_psi_node, POOL_LEDGER_ID,
                   LedgerState.synced, retryWait=.5, timeout=5))
def test_handle_delayed_preprepares(looper, txnPoolNodeSet,
                                    sdk_wallet_client, sdk_pool_handle,
                                    teardown):
    """
    Make a node send PREPREPARE again after the slow node has ordered
    """
    slow_node, other_nodes, primary_node, other_non_primary_nodes = \
        split_nodes(txnPoolNodeSet)
    # This node will send PRE-PREPARE again
    confused_node = other_non_primary_nodes[0]
    orig_method = confused_node.handlers[PREPREPARE].serve
    last_pp = None

    def patched_method(self, msg):
        # Capture the last served PRE-PREPARE so it can be re-sent below.
        nonlocal last_pp
        last_pp = orig_method(msg)
        return last_pp

    confused_node.handlers[PREPREPARE].serve = types.MethodType(
        patched_method, confused_node.handlers[PREPREPARE])
    # Delay PRE-PREPAREs by large amount simulating loss
    slow_node.nodeIbStasher.delay(ppDelay(300, 0))
    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_client,
                                         num_reqs=10, num_batches=5)
    waitNodeDataEquality(looper, slow_node, *other_nodes)

    # Baseline call counts before the duplicate PRE-PREPARE is delivered.
    slow_master_replica = slow_node.master_replica
    count_pr_req = get_count(
        slow_master_replica,
        slow_master_replica.process_requested_pre_prepare)
    count_pr_tpc = get_count(
        slow_master_replica,
        slow_master_replica.processThreePhaseMsg)

    # Re-send the already-ordered PRE-PREPARE to the slow node wrapped in a
    # MessageRep, as if it were a response to a message request.
    confused_node.sendToNodes(MessageRep(**{
        f.MSG_TYPE.nm: PREPREPARE,
        f.PARAMS.nm: {
            f.INST_ID.nm: last_pp.instId,
            f.VIEW_NO.nm: last_pp.viewNo,
            f.PP_SEQ_NO.nm: last_pp.ppSeqNo
        },
        f.MSG.nm: last_pp
    }), names=[slow_node.name, ])

    def chk():
        # `process_requested_pre_prepare` is called but
        # `processThreePhaseMsg` is not called
        assert get_count(
            slow_master_replica,
            slow_master_replica.process_requested_pre_prepare) > count_pr_req
        assert get_count(
            slow_master_replica,
            slow_master_replica.processThreePhaseMsg) == count_pr_tpc

    looper.run(eventually(chk, retryWait=1))
def test_view_not_changed_when_primary_disconnected_from_less_than_quorum(
        txnPoolNodeSet, looper, wallet1, client1, client1Connected):
    """
    Less than quorum nodes lose connection with primary, this should not
    trigger view change as the protocol can move ahead
    """
    pr_node = get_master_primary_node(txnPoolNodeSet)
    npr = getNonPrimaryReplicas(txnPoolNodeSet, 0)
    partitioned_rep = npr[0]
    partitioned_node = partitioned_rep.node

    # Baseline spylog counts so the checks below detect only new calls.
    lost_pr_calls = partitioned_node.spylog.count(
        partitioned_node.lost_master_primary.__name__)
    recv_inst_chg_calls = {
        node.name: node.spylog.count(node.processInstanceChange.__name__)
        for node in txnPoolNodeSet
        if node != partitioned_node and node != pr_node
    }

    view_no = checkViewNoForNodes(txnPoolNodeSet)
    orig_retry_meth = partitioned_node.nodestack.retryDisconnected

    def wont_retry(self, exclude=None):
        # Do not attempt to retry connection
        pass

    # simulating a partition here
    # Disconnect a node from only the primary of the master and dont retry to
    # connect to it
    partitioned_node.nodestack.retryDisconnected = types.MethodType(
        wont_retry, partitioned_node.nodestack)
    r = partitioned_node.nodestack.getRemote(pr_node.nodestack.name)
    r.disconnect()

    def chk1():
        # Check that the partitioned node detects losing connection with
        # primary and sends an instance change which is received by other
        # nodes except the primary (since its disconnected from primary)
        assert partitioned_node.spylog.count(
            partitioned_node.lost_master_primary.__name__) > lost_pr_calls
        for node in txnPoolNodeSet:
            if node != partitioned_node and node != pr_node:
                assert node.spylog.count(node.processInstanceChange.__name__
                                         ) > recv_inst_chg_calls[node.name]

    looper.run(eventually(chk1, retryWait=1, timeout=10))

    def chk2():
        # Check the view does not change
        with pytest.raises(AssertionError):
            assert checkViewNoForNodes(txnPoolNodeSet) == view_no + 1

    looper.run(eventually(chk2, retryWait=1, timeout=10))
    # Send some requests and make sure the request execute
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 5)

    # Repair the connection so the node is no longer partitioned
    partitioned_node.nodestack.retryDisconnected = types.MethodType(
        orig_retry_meth, partitioned_node.nodestack)

    # Send some requests and make sure the request execute
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 5)

    # Partitioned node should have the same ledger and state as others
    # eventually
    waitNodeDataEquality(looper, partitioned_node,
                         *[n for n in txnPoolNodeSet
                           if n != partitioned_node])
def testLoggingTxnStateWhenCommitFails(looper, txnPoolNodeSet, sdk_pool_handle,
                                       sdk_wallet_steward, logsearch):
    """
    Force commit to fail once on every node for a NYM request and check the
    request's journey is traceable in the logs:
      * propagator.py logs the propagated request at INFO,
      * replica.py logs the ordered batch at INFO,
      * node.py logs the commit failure (carrying ERORR_MSG) at WARNING.
    """
    # Install the log searches before sending the request so they observe
    # everything that happens below.
    logsPropagate, _ = logsearch(levels=['INFO'], files=['propagator.py'],
                                 funcs=['propagate'],
                                 msgs=['propagating.*request.*from client'])
    logsOrdered, _ = logsearch(levels=['INFO'], files=['replica.py'],
                               funcs=['order_3pc_key'],
                               msgs=['ordered batch request'])
    logsCommitFail, _ = logsearch(levels=['WARNING'], files=['node.py'],
                                  funcs=['executeBatch'],
                                  msgs=['commit failed for batch request'])

    seed = randomString(32)
    nym_request, _ = looper.loop.run_until_complete(
        prepare_nym_request(sdk_wallet_steward, seed, "name", None))
    sdk_sign_and_send_prepared_request(looper, sdk_wallet_steward,
                                      sdk_pool_handle, nym_request)

    class SomeError(Exception):
        pass

    def commitPatched(node, commitOrig, *args, **kwargs):
        # Restore the real commit first so only this one attempt fails,
        # then raise to trigger the WARNING path in executeBatch.
        req_handler = node.get_req_handler(ledger_id=DOMAIN_LEDGER_ID)
        req_handler.commit = commitOrig
        raise SomeError(ERORR_MSG)

    excCounter = 0

    def executeBatchPatched(node, executeBatchOrig, *args, **kwargs):
        # Count the induced failure and un-patch executeBatch on this node.
        nonlocal excCounter
        try:
            executeBatchOrig(*args, **kwargs)
        except SomeError:
            excCounter += 1
            node.executeBatch = executeBatchOrig

    def checkSufficientExceptionsHappened():
        # Every node in the pool must have hit the patched commit once.
        assert excCounter == len(txnPoolNodeSet)

    # Patch commit and executeBatch on all nodes so the batch carrying the
    # NYM fails to commit everywhere.
    for node in txnPoolNodeSet:
        req_handler = node.get_req_handler(ledger_id=DOMAIN_LEDGER_ID)
        req_handler.commit = functools.partial(commitPatched, node,
                                              req_handler.commit)
        node.executeBatch = functools.partial(executeBatchPatched, node,
                                             node.executeBatch)

    timeout = waits.expectedTransactionExecutionTime(len(txnPoolNodeSet))
    looper.run(eventually(checkSufficientExceptionsHappened,
                          retryWait=1, timeout=timeout))

    # The request id ties the three log stages to this specific request.
    reqId = str(json.loads(nym_request)['reqId'])
    assert any(reqId in record.getMessage() for record in logsPropagate)
    assert any(reqId in record.getMessage() for record in logsOrdered)
    assert any(reqId in record.getMessage() for record in logsCommitFail)
    assert any(ERORR_MSG in record.getMessage() for record in logsCommitFail)
def test_node_detecting_lag_from_view_change_done_messages(
        txnPoolNodeSet, looper, wallet1, client1, client1Connected, tconf):
    """
    A node is slow and after view change starts, it marks it's `last_prepared`
    to less than others, after catchup it does not get any txns from others
    and finds it has already ordered it's `last_prepared`, but when
    it gets ViewChangeDone messages, it starts catchup again and this
    time gets the txns. To achieve this delay all 3PC messages to a node so
    before view change it has different last_prepared from others.
    Also delay processing of COMMITs and INSTANCE_CHANGEs by other nodes
    """
    send_reqs_batches_and_get_suff_replies(looper, wallet1, client1, 2 * 3, 3)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    # One non-primary node is made "slow"; the rest are "fast".
    slow_node = getNonPrimaryReplicas(txnPoolNodeSet, 0)[-1].node
    fast_nodes = [n for n in txnPoolNodeSet if n != slow_node]
    slow_master_replica = slow_node.master_replica
    fast_master_replicas = [n.master_replica for n in fast_nodes]

    # Delays (seconds): 3PC to the slow node, INSTANCE_CHANGE and COMMIT on
    # the fast nodes. delay_commit > delay_ic so instance changes are
    # processed before the delayed commits arrive.
    delay_3pc = 50
    delay_ic = tconf.PerfCheckFreq + 5
    delay_commit = delay_ic + 10
    delay_3pc_messages([slow_node], 0, delay_3pc)
    for n in fast_nodes:
        n.nodeIbStasher.delay(icDelay(delay_ic))
        n.nodeIbStasher.delay(cDelay(delay_commit))

    # Trickle in requests so the slow node falls behind gradually.
    reqs = []
    for i in range(10):
        reqs = reqs + sendRandomRequests(wallet1, client1, 2)
        looper.runFor(.2)

    def chk1():
        # Slow node's last prepared certificate lags every fast node while
        # its last ordered 3PC key still matches (nothing new ordered yet).
        for rep in fast_master_replicas:
            assert compare_3PC_keys(
                slow_master_replica.last_prepared_certificate_in_view(),
                rep.last_prepared_certificate_in_view()) > 0
            assert slow_master_replica.last_ordered_3pc == rep.last_ordered_3pc

    looper.run(eventually(chk1))

    no_more_catchup_count = get_count(slow_node,
                                      slow_node.no_more_catchups_needed)

    # Track last prepared for master replica of each node
    prepareds = {}
    orig_methods = {}
    for node in txnPoolNodeSet:
        orig_methods[node.name] = node.master_replica.on_view_change_start

        # Wrapper is bound per replica; it reads self.node.name at call time,
        # so all iterations can safely share this one function object.
        def patched_on_view_change_start(self):
            orig_methods[self.node.name]()
            prepareds[self.node.name] = self.last_prepared_before_view_change

        node.master_replica.on_view_change_start = types.MethodType(
            patched_on_view_change_start, node.master_replica)

    ensure_view_change(looper, txnPoolNodeSet, exclude_from_check=fast_nodes)

    def chk2():
        # last_prepared of slow_node is less than fast_nodes
        for rep in fast_master_replicas:
            assert compare_3PC_keys(prepareds[slow_master_replica.node.name],
                                    prepareds[rep.node.name]) > 0

    looper.run(eventually(chk2, timeout=delay_ic + 5))

    last_start_catchup_call_at = None
    no_more_catchup_call_at = None

    def chk3():
        # no_more_catchups_needed was called since node found no need of catchup
        nonlocal last_start_catchup_call_at, no_more_catchup_call_at
        assert (get_count(slow_node, slow_node.no_more_catchups_needed) -
                no_more_catchup_count) > 0
        # Record spy timestamps for the ordering assertions at the end.
        no_more_catchup_call_at = slow_node.spylog.getLast(
            slow_node.no_more_catchups_needed).starttime
        last_start_catchup_call_at = slow_node.spylog.getLast(
            slow_node.start_catchup).starttime

    looper.run(eventually(chk3, timeout=delay_commit))

    # NOTE(review): reset is invoked twice per fast node here, and the slow
    # node's own 3PC delays are never reset — looks intentional for the
    # scenario but worth confirming.
    for n in fast_nodes:
        n.nodeIbStasher.reset_delays_and_process_delayeds()
        n.nodeIbStasher.reset_delays_and_process_delayeds()

    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)

    # A fresh catchup must have started after the node first decided no
    # catchup was needed — i.e. ViewChangeDone messages re-triggered it.
    assert slow_node.spylog.getLast(
        slow_node.start_catchup).starttime > no_more_catchup_call_at
    assert slow_node.spylog.getLast(
        slow_node.start_catchup).starttime > last_start_catchup_call_at
def testProtocolInstanceCannotBecomeActiveWithLessThanFourServers(
        tconf_for_func, tdir_for_func):
    """
    A protocol instance must have at least 4 nodes to come up.
    The status of the nodes will change from starting to started only after
    the addition of the fourth node to the system.
    """
    nodeCount = 13
    f = 4
    # With f faulty nodes tolerated, nodeCount - f nodes must be up before
    # the pool leaves the `starting` status.
    minimumNodesToBeUp = nodeCount - f

    nodeNames = genNodeNames(nodeCount)
    with TestNodeSet(tconf_for_func, names=nodeNames,
                     tmpdir=tdir_for_func) as nodeSet:
        with Looper(nodeSet) as looper:

            # helpers

            def genExpectedStates(connecteds: Iterable[str]):
                # Expected remote state per node name: CONNECTED for nodes
                # added back so far, JOINED_NOT_ALLOWED for the rest.
                return {
                    nn: CONNECTED if nn in connecteds else JOINED_NOT_ALLOWED
                    for nn in nodeNames}

            def checkNodeStatusRemotesAndF(expectedStatus: Status,
                                           nodeIdx: int):
                # Every live node must see the expected remote states and
                # report the expected overall status.
                for node in nodeSet.nodes.values():
                    checkNodeRemotes(
                        node, genExpectedStates(nodeNames[:nodeIdx + 1]))
                    assert node.status == expectedStatus

            def addNodeBackAndCheck(nodeIdx: int, expectedStatus: Status):
                logger.info("Add back the {} node and see status of {}".format(
                    ordinal(nodeIdx + 1), expectedStatus))
                addNodeBack(nodeSet, looper, nodeNames[nodeIdx])

                timeout = waits.expectedNodeStartUpTimeout() + \
                    waits.expectedPoolInterconnectionTime(len(nodeSet))
                # TODO: Probably it's better to modify waits.* functions
                timeout *= 1.5
                looper.run(
                    eventually(checkNodeStatusRemotesAndF, expectedStatus,
                               nodeIdx, retryWait=1, timeout=timeout))

            logger.debug("Sharing keys")
            looper.run(checkNodesConnected(nodeSet))

            logger.debug("Remove all the nodes")
            for n in nodeNames:
                looper.removeProdable(nodeSet.nodes[n])
                nodeSet.removeNode(n)

            looper.runFor(10)

            # Re-add nodes one by one; status must progress
            # starting -> started_hungry -> started as thresholds are hit.
            logger.debug("Add nodes back one at a time")
            for i in range(nodeCount):
                nodes = i + 1
                if nodes < minimumNodesToBeUp:
                    expectedStatus = Status.starting
                elif nodes < nodeCount:
                    expectedStatus = Status.started_hungry
                else:
                    expectedStatus = Status.started
                addNodeBackAndCheck(i, expectedStatus)
def test_node_load_after_add_then_disconnect(newNodeCaughtUp, txnPoolNodeSet,
                                             tconf, looper, client1, wallet1,
                                             client1Connected, tdirWithPoolTxns,
                                             allPluginsPath, poolTxnStewardData,
                                             capsys):
    """
    A node that restarts after some transactions should eventually get the
    transactions which happened while it was down
    :return:
    """
    node_under_test = newNodeCaughtUp
    with capsys.disabled():
        print("Stopping node {} with pool ledger size {}".format(
            node_under_test, node_under_test.poolManager.txnSeqNo))
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet,
                                            node_under_test)
    looper.removeProdable(node_under_test)

    # Push a sizeable amount of load through the pool while the node is down.
    total_batches = 80
    batch_size = 10
    for batch_no in range(total_batches):
        started_at = perf_counter()
        sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1,
                                            batch_size,
                                            override_timeout_limit=True)
        with capsys.disabled():
            print('{} executed {} client txns in {:.2f} seconds'.format(
                batch_no + 1, batch_size, perf_counter() - started_at))

    with capsys.disabled():
        print("Starting the stopped node, {}".format(node_under_test))
    # Recreate the node on its previous addresses and swap it into the set.
    node_ha = HA(*node_under_test.nodestack.ha)
    client_ha = HA(*node_under_test.clientstack.ha)
    node_under_test = TestNode(node_under_test.name,
                               basedirpath=tdirWithPoolTxns, config=tconf,
                               ha=node_ha, cliha=client_ha,
                               pluginPaths=allPluginsPath)
    looper.add(node_under_test)
    txnPoolNodeSet[-1] = node_under_test

    # Delay catchup reply processing so LedgerState does not change
    catchup_reply_delay = 5
    node_under_test.nodeIbStasher.delay(cr_delay(catchup_reply_delay))
    looper.run(checkNodesConnected(txnPoolNodeSet))

    # Make sure ledger starts syncing (sufficient consistency proofs received)
    looper.run(eventually(check_ledger_state, node_under_test,
                          DOMAIN_LEDGER_ID, LedgerState.syncing,
                          retryWait=.5, timeout=5))

    # Not accurate timeout but a conservative one
    conservative_timeout = waits.expectedPoolGetReadyTimeout(
        len(txnPoolNodeSet)) + 2 * catchup_reply_delay
    waitNodeDataEquality(looper, node_under_test, *txnPoolNodeSet[:4],
                         customTimeout=conservative_timeout)

    # The restarted node must also keep up with fresh traffic.
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 5)
    waitNodeDataEquality(looper, node_under_test, *txnPoolNodeSet[:4])
def test_node_load_consistent_time(tconf, change_checkpoint_freq,
                                   disable_node_monitor_config, looper,
                                   txnPoolNodeSet, tdirWithPoolTxns,
                                   allPluginsPath, poolTxnStewardData, capsys):
    """
    Drive many request batches through the pool and assert that per-batch
    execution time stays consistent: after a warm-up period, each batch's
    time must not exceed the running mean by more than
    tolerance_factor * population std deviation.
    """
    # One of the reason memory grows is because spylog grows
    client, wallet = buildPoolClientAndWallet(poolTxnStewardData,
                                              tdirWithPoolTxns,
                                              clientClass=TestClient)
    looper.add(client)
    looper.run(client.ensureConnectedToNodes())

    client_batches = 300
    txns_per_batch = 25
    time_log = []
    warm_up_batches = 10
    tolerance_factor = 2
    print_detailed_memory_usage = False

    # NOTE(review): `tr` is only used by the commented-out print_diff below;
    # the tracker import is kept for easy re-enabling.
    from pympler import tracker
    tr = tracker.SummaryTracker()
    node_methods_to_capture = [TestNode.executeBatch,
                               TestNode.recordAndPropagate,
                               TestNode.domainDynamicValidation,
                               TestNode.domainRequestApplication]
    # times[node_name][method_name] -> list of captured call durations
    times = {n.name: {meth.__name__: []
                      for meth in node_methods_to_capture}
             for n in txnPoolNodeSet}

    # Wrap the captured methods on every node so their call durations are
    # appended to `times`.
    for node in txnPoolNodeSet:
        for meth in node_methods_to_capture:
            meth_name = meth.__name__
            patched = timeit(getattr(node, meth_name),
                             times[node.name][meth_name])
            setattr(node, meth_name, patched)

    for i in range(client_batches):
        s = perf_counter()
        sendReqsToNodesAndVerifySuffReplies(looper, wallet, client,
                                            txns_per_batch,
                                            override_timeout_limit=True)
        t = perf_counter() - s
        with capsys.disabled():
            print('{} executed {} client txns in {:.2f} seconds'.
                  format(i + 1, txns_per_batch, t))
            print('--------Memory Usage details start')
            for node in txnPoolNodeSet:
                # print(sys.getsizeof(node))
                print('---Node {}-----'.format(node))
                # print('Requests {}'.format(asizeof.asizeof(node.requests, detail=1)))
                print(
                    get_memory_usage(
                        node,
                        print_detailed_memory_usage,
                        get_only_non_empty=True))
                for r in node.replicas:
                    print('---Replica {}-----'.format(r))
                    print(
                        get_memory_usage(
                            r,
                            print_detailed_memory_usage,
                            get_only_non_empty=True))

            # if i % 3 == 0:
            #     tr.print_diff()
            print('--------Memory Usage details end')

            # Dump per-method timing stats for every node.
            for node in txnPoolNodeSet:
                for meth in node_methods_to_capture:
                    ts = times[node.name][meth.__name__]
                    print('{} {} {} {}'.format(
                        node, meth.__name__, mean(ts), ts))

        # Only enforce the consistency bound after the warm-up batches.
        if len(time_log) >= warm_up_batches:
            m = mean(time_log)
            sd = tolerance_factor * pstdev(time_log)
            assert m > t or abs(t - m) <= sd, '{} {}'.format(abs(t - m), sd)

        time_log.append(t)

        # Since client checks inbox for sufficient replies, clear inbox so that
        # it takes constant time to check replies for each batch
        client.inBox.clear()
        client.txnLog.reset()