def test_watermarks_after_view_change(tdir, tconf, looper, txnPoolNodeSet,
                                      sdk_pool_handle, sdk_wallet_client):
    """
    Delay Commit, Checkpoint, InstanceChange and ViewChangeDone messages for lagging_node.
    Start a ViewChange.
    Check that the ViewChange finished.
    Reset delays.
    Check that lagging_node can order transactions and has the same data as the other nodes.
    """
    lagging_node = txnPoolNodeSet[-1]
    lagging_node.master_replica.config.LOG_SIZE = LOG_SIZE
    start_view_no = lagging_node.viewNo
    with delay_rules(lagging_node.nodeIbStasher,
                     cDelay(), chk_delay(), icDelay(), nv_delay()):
        trigger_view_change(txnPoolNodeSet)
        waitForViewChange(looper, txnPoolNodeSet[:-1],
                          expectedViewNo=start_view_no + 1,
                          customTimeout=waits.expectedPoolViewChangeStartedTimeout(len(txnPoolNodeSet)))
        ensure_all_nodes_have_same_data(looper, txnPoolNodeSet[:-1])
        sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 6)

    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 1)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
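Several tests in this module call a `trigger_view_change` helper that is not defined here. A minimal sketch of what it is assumed to do, based on the `view_changer.on_master_degradation()` calls used by the older tests further down; the real helper may differ:

```python
def trigger_view_change(nodes):
    # Make every node vote for a view change, as if it had detected
    # degradation of the master instance.
    for node in nodes:
        node.view_changer.on_master_degradation()
```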
def test_delayed_instance_changes_after_vcd_for_next_view(looper, txnPoolNodeSet):
    '''
    A node is doing a view change to view=1, while the other nodes have already
    finished the view change to view=2.
    The node receives a quorum of VCD messages for view=2 before a quorum of
    InstanceChange messages for view=2.
    Nevertheless, the node should not start a view change to view=2 without a quorum
    of InstanceChanges, that is, it should not go into propagate primary mode since
    it's already in view change state.
    The node should eventually finish the view change to view=2 once it receives all
    VCD and IC msgs for view=2.
    '''
    nodes = txnPoolNodeSet
    slow_node = nodes[-1]
    fast_nodes = [n for n in nodes if n != slow_node]
    slow_stasher = slow_node.nodeIbStasher

    # 1. DO FIRST VIEW CHANGE

    # delay VCD for the first ViewChange
    with delay_rules(slow_stasher, nv_delay()):
        # Trigger view change
        trigger_view_change(nodes)
        waitForViewChange(looper, nodes, expectedViewNo=1)

        # make sure view change is finished on all nodes except the slow one
        ensureElectionsDone(looper, fast_nodes, instances_list=range(3), customTimeout=30)

        # drop all VCD to view=1
        slow_stasher.drop_delayeds()

    # 2. DO SECOND VIEW CHANGE

    # delay Instance Changes so that the slow node receives VCD for view=2 before
    # a quorum of InstanceChanges for that view while still doing the view change to view=1
    with delay_rules(slow_stasher, icDelay()):
        # Trigger view change
        trigger_view_change(nodes)
        waitForViewChange(looper, fast_nodes, expectedViewNo=2)

        # make sure view change is finished on all nodes except the slow one
        ensureElectionsDone(looper, fast_nodes, instances_list=range(3))

        # slow node is still on view=1
        assert slow_node.viewNo == 1
        assert slow_node.view_change_in_progress

        # make sure that the slow node received VCD msgs for view=2
        # and didn't receive IC msgs for view=2
        # check_vcd_msgs(slow_node, expected_view_no=2, expected_count=len(fast_nodes), )
        check_no_ic_msgs(slow_node, 2, fast_nodes)

    # 3. RESET DELAYS AND CHECK

    waitForViewChange(looper, nodes, expectedViewNo=2)
    ensureElectionsDone(looper, nodes)

    assert not slow_node.view_change_in_progress
    ensure_all_nodes_have_same_data(looper, nodes=nodes)
def test_resend_instance_change_messages(looper, txnPoolNodeSet, tconf,
                                         sdk_wallet_steward, sdk_pool_handle):
    primary_node = txnPoolNodeSet[0]
    old_view_no = checkViewNoForNodes(txnPoolNodeSet, 0)
    assert primary_node.master_replica.isPrimary
    for n in txnPoolNodeSet:
        n.nodeIbStasher.delay(icDelay(3 * tconf.INSTANCE_CHANGE_TIMEOUT))
    assert set([n.view_changer.instance_change_rounds for n in txnPoolNodeSet]) == {0}
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, primary_node, stopNode=False)
    txnPoolNodeSet.remove(primary_node)
    looper.run(eventually(partial(check_count_connected_node, txnPoolNodeSet, 4),
                          timeout=5,
                          acceptableExceptions=[AssertionError]))
    looper.runFor(2 * tconf.INSTANCE_CHANGE_TIMEOUT)
    assert set([n.view_changer.instance_change_rounds for n in txnPoolNodeSet]) == {1}

    looper.runFor(tconf.INSTANCE_CHANGE_TIMEOUT)
    looper.run(eventually(partial(checkViewNoForNodes, txnPoolNodeSet,
                                  expectedViewNo=old_view_no + 1),
                          timeout=tconf.VIEW_CHANGE_TIMEOUT))
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_steward, 5)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
def test_no_propagated_future_view_change_while_view_change(txnPoolNodeSet, looper):
    # the last node is a lagging one, which will receive ViewChangeDone messages for a future view
    viewNo = checkViewNoForNodes(txnPoolNodeSet)
    lagged_node = txnPoolNodeSet[-1]
    other_nodes = list(set(txnPoolNodeSet) - {lagged_node})

    # emulate a view change in progress
    lagged_node.view_changer.view_change_in_progress = True
    old_view_no = checkViewNoForNodes([lagged_node])

    initial_vhdc = \
        lagged_node.view_changer.spylog.count(lagged_node.view_changer.process_future_view_vchd_msg.__name__)

    # delay INSTANCE CHANGE on the lagged node, so all nodes except the lagging one finish the View Change
    with delay_rules(lagged_node.nodeIbStasher, icDelay()):
        # make sure that the View Change happened on all nodes but the lagging one
        ensure_view_change(looper, other_nodes)
        checkProtocolInstanceSetup(looper=looper, nodes=other_nodes, numInstances=2)
        ensure_all_nodes_have_same_data(looper, nodes=other_nodes)

        # check that the lagged node received 3 future VCD messages but didn't start a new view change
        assert len(other_nodes) + initial_vhdc == \
            lagged_node.view_changer.spylog.count(lagged_node.view_changer.process_future_view_vchd_msg.__name__)
        assert old_view_no == checkViewNoForNodes([lagged_node])
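The spylog expression above is repeated twice and is hard to read; a hypothetical wrapper (not part of the original suite) that captures the same count:

```python
def count_future_vcd(node):
    # Number of ViewChangeDone messages for a future view processed so far,
    # counted via the spy wrapper around process_future_view_vchd_msg.
    view_changer = node.view_changer
    return view_changer.spylog.count(view_changer.process_future_view_vchd_msg.__name__)
```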
def test_no_propagated_future_view_change_until_synced(txnPoolNodeSet, looper, mode):
    # the last node is a lagging one, which will receive ViewChangeDone messages for a future view
    viewNo = checkViewNoForNodes(txnPoolNodeSet)
    lagged_node_index = (viewNo + 3) % len(txnPoolNodeSet)
    lagged_node = txnPoolNodeSet[lagged_node_index]
    other_nodes = list(set(txnPoolNodeSet) - {lagged_node})

    # emulate catchup by setting a non-synced status
    lagged_node.mode = mode
    old_view_no = checkViewNoForNodes([lagged_node])

    check_future_vcd_count(lagged_node, 0)

    # delay INSTANCE CHANGE on the lagged node, so all nodes except the lagging one finish the View Change
    with delay_rules(lagged_node.nodeIbStasher, icDelay()):
        # make sure that the View Change happened on all nodes but the lagging one
        ensure_view_change(looper, other_nodes)
        checkProtocolInstanceSetup(looper=looper, nodes=other_nodes, numInstances=2)
        ensure_all_nodes_have_same_data(looper, nodes=other_nodes)

        check_no_view_change(looper, lagged_node)
        assert old_view_no == checkViewNoForNodes([lagged_node])

        # emulate finishing of catchup by setting the Participating status
        lagged_node.mode = Mode.participating

        # make sure that the View Change happened on the lagging node
        waitForViewChange(looper, [lagged_node], expectedViewNo=old_view_no + 1,
                          customTimeout=10)
        ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
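`check_no_view_change` is not shown in this file. A sketch of the behaviour the test relies on, assuming the `viewNo` and `view_change_in_progress` attributes used elsewhere in this module (the actual helper may poll differently):

```python
def check_no_view_change(looper, node, wait=5):
    # Give the node some time to (incorrectly) react, then verify that it
    # neither started nor completed a view change.
    old_view_no = node.viewNo
    looper.runFor(wait)
    assert not node.view_changer.view_change_in_progress
    assert node.viewNo == old_view_no
```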
def test_resend_inst_ch_in_progress_v_ch(txnPoolNodeSet, looper,
                                         sdk_pool_handle, sdk_wallet_client,
                                         tdir, tconf, allPluginsPath):
    old_view = viewNoForNodes(txnPoolNodeSet)

    # disconnect two nodes. One of them should be the next master primary in case of a view change.
    for node in [txnPoolNodeSet[1], txnPoolNodeSet[-1]]:
        disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, node, stopNode=True)
        looper.removeProdable(node)
        txnPoolNodeSet.remove(node)

    # delay I_CH on every node except the last one and initiate a view change
    stashers = [n.nodeIbStasher for n in txnPoolNodeSet[:-1]]
    with delay_rules_without_processing(stashers, icDelay(viewNo=2)):
        ensure_view_change(looper, txnPoolNodeSet)
        looper.runFor(tconf.NEW_VIEW_TIMEOUT + 1)

    # checks
    def checks():
        assert all(not node.view_change_in_progress for node in txnPoolNodeSet)
        assert all(node.viewNo == old_view + 2 for node in txnPoolNodeSet)

    looper.run(eventually(checks, timeout=tconf.NEW_VIEW_TIMEOUT * 2.5, retryWait=1))

    sdk_ensure_pool_functional(looper, txnPoolNodeSet, sdk_wallet_client, sdk_pool_handle)
def test_propagate_primary_after_primary_restart_view_0(
        looper, txnPoolNodeSet, tconf, sdk_pool_handle, sdk_wallet_steward, tdir, allPluginsPath):
    """
    Delay InstanceChange msgs to prevent a view change during primary restart
    in order to test propagate primary for the primary node.
    ppSeqNo should be > 0 to be able to check that propagate primary restores
    all indices correctly for the case viewNo == 0.
    """
    sdk_ensure_pool_functional(looper, txnPoolNodeSet, sdk_wallet_steward, sdk_pool_handle)

    old_ppseqno = _get_ppseqno(txnPoolNodeSet)
    assert (old_ppseqno > 0)

    old_viewNo = checkViewNoForNodes(txnPoolNodeSet)
    old_primary = get_master_primary_node(txnPoolNodeSet)

    delay_instance_change(txnPoolNodeSet, IC_DELAY_SEC)

    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, old_primary, stopNode=True)

    looper.removeProdable(old_primary)

    logger.info("Restart node {}".format(old_primary))

    restartedNode = start_stopped_node(old_primary, looper, tconf, tdir, allPluginsPath,
                                       delay_instance_change_msgs=False)
    idx = [i for i, n in enumerate(txnPoolNodeSet) if n.name == restartedNode.name][0]
    txnPoolNodeSet[idx] = restartedNode

    restartedNode.nodeIbStasher.delay(icDelay(IC_DELAY_SEC))

    looper.run(checkNodesConnected(txnPoolNodeSet))
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)

    new_viewNo = checkViewNoForNodes(txnPoolNodeSet)
    assert (new_viewNo == old_viewNo)

    new_primary = get_master_primary_node(txnPoolNodeSet)
    assert (new_primary.name == old_primary.name)

    # check that ppSeqNo is the same
    _get_ppseqno(txnPoolNodeSet)

    sdk_ensure_pool_functional(looper, txnPoolNodeSet, sdk_wallet_steward, sdk_pool_handle)

    new_ppseqno = _get_ppseqno(txnPoolNodeSet)
    assert (new_ppseqno > old_ppseqno)
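`_get_ppseqno` is referenced but not defined in this section. A sketch under the assumption that it reads the master replica's `last_ordered_3pc` (the attribute used in test_view_change_during_unstash below) and asserts that all nodes agree:

```python
def _get_ppseqno(nodes):
    # Collect the ppSeqNo of the last ordered 3PC batch on the master replica
    # of every node and make sure the pool agrees on a single value.
    seq_nos = {node.master_replica.last_ordered_3pc[1] for node in nodes}
    assert len(seq_nos) == 1
    return seq_nos.pop()
```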
def test_no_propagated_future_view_change_until_synced(txnPoolNodeSet, looper, mode):
    # the last node is a lagging one, which will receive ViewChangeDone messages for a future view
    viewNo = checkViewNoForNodes(txnPoolNodeSet)
    lagged_node_index = (viewNo + 3) % len(txnPoolNodeSet)
    lagged_node = txnPoolNodeSet[lagged_node_index]
    other_nodes = list(set(txnPoolNodeSet) - {lagged_node})

    # emulate catchup by setting a non-synced status
    lagged_node.mode = mode
    old_view_no = checkViewNoForNodes([lagged_node])

    check_future_vcd_count(lagged_node, 0)

    # delay INSTANCE CHANGE on the lagged node, so all nodes except the lagging one finish the View Change
    with delay_rules(lagged_node.nodeIbStasher, icDelay()):
        # make sure that the View Change happened on all nodes but the lagging one
        ensure_view_change(looper, other_nodes)
        checkProtocolInstanceSetup(looper=looper, nodes=other_nodes, instances=range(2))
        ensure_all_nodes_have_same_data(looper, nodes=other_nodes)

        check_no_view_change(looper, lagged_node)
        assert old_view_no == checkViewNoForNodes([lagged_node])

        # emulate finishing of catchup by setting the Participating status
        lagged_node.mode = Mode.participating

        # make sure that the View Change happened on the lagging node
        waitForViewChange(looper, [lagged_node], expectedViewNo=old_view_no + 1,
                          customTimeout=10)
        ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
def test_resend_instance_change_messages(looper, txnPoolNodeSet, tconf,
                                         sdk_wallet_steward, sdk_pool_handle):
    primary_node = txnPoolNodeSet[0]
    old_view_no = checkViewNoForNodes(txnPoolNodeSet, 0)
    assert primary_node.master_replica.isPrimary
    for n in txnPoolNodeSet:
        n.nodeIbStasher.delay(icDelay(3 * tconf.NEW_VIEW_TIMEOUT))
    check_sent_instance_changes_count(txnPoolNodeSet, 0)
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, primary_node, stopNode=False)
    txnPoolNodeSet.remove(primary_node)
    looper.run(eventually(check_count_connected_node, txnPoolNodeSet, 4,
                          timeout=5, acceptableExceptions=[AssertionError]))

    looper.run(eventually(check_sent_instance_changes_count, txnPoolNodeSet, 1,
                          timeout=2 * tconf.NEW_VIEW_TIMEOUT))

    looper.run(eventually(checkViewNoForNodes, txnPoolNodeSet, old_view_no + 1,
                          timeout=3 * tconf.NEW_VIEW_TIMEOUT))
    ensureElectionsDone(looper, txnPoolNodeSet)

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_steward, 5)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
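This newer variant replaces the raw `instance_change_rounds` assertions of the older test above with a `check_sent_instance_changes_count` helper. A sketch of what it is assumed to check, reusing that same counter (the real helper may count sent messages via a spylog instead):

```python
def check_sent_instance_changes_count(nodes, expected):
    # Every node should have gone through exactly `expected` InstanceChange
    # rounds so far.
    assert {n.view_changer.instance_change_rounds for n in nodes} == {expected}
```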
def testQueueingReqFromFutureView(delayed_perf_chk, looper, txnPoolNodeSet,
                                  sdk_pool_handle, sdk_wallet_client):
    """
    Test that every node queues 3-phase requests (PRE-PREPARE, PREPARE and COMMIT)
    that come from a view which is greater than the current view.
    - Delay reception and processing of view change messages by a non-primary for the
      master instance => it starts receiving 3-phase commit messages for the next view
    """
    lagging_node = get_last_master_non_primary_node(txnPoolNodeSet)
    old_view_no = lagging_node.viewNo

    # Delay processing of InstanceChange and ViewChangeDone so the node stashes
    # 3PC messages
    delay_ic = 60
    lagging_node.nodeIbStasher.delay(icDelay(delay_ic))
    lagging_node.nodeIbStasher.delay(vcd_delay(delay_ic))
    logger.debug('{} will delay its view change'.format(lagging_node))

    def chk_fut_view(view_no, is_empty):
        length = len(lagging_node.msgsForFutureViews.get(view_no, ()))
        if is_empty:
            assert length == 0
        else:
            assert length > 0
        return length

    # No messages queued for the future view
    chk_fut_view(old_view_no + 1, is_empty=True)
    logger.debug('{} does not have any messages for future views'.format(lagging_node))

    # Every node except the lagging one should do a view change
    ensure_view_change(looper,
                       [n for n in txnPoolNodeSet if n != lagging_node],
                       [lagging_node])

    # send more requests that will be queued for the lagged node
    reqs = sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client, 5)
    l = looper.run(eventually(chk_fut_view, old_view_no + 1, False, retryWait=1))
    logger.debug('{} has {} messages for future views'.format(lagging_node, l))
    sdk_get_replies(looper, reqs)

    # reset delays for the lagging node so that it finally makes the view change
    lagging_node.reset_delays_and_process_delayeds()

    # Eventually no messages queued for the future view
    looper.run(eventually(chk_fut_view, old_view_no + 1, True,
                          retryWait=1, timeout=delay_ic + 10))
    logger.debug('{} exhausted pending messages for future views'.format(lagging_node))

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 2)
def test_audit_multiple_uncommitted_node_regs(looper, tdir, tconf, allPluginsPath,
                                              txnPoolNodeSet, sdk_pool_handle,
                                              sdk_wallet_client, sdk_wallet_steward):
    '''
    - Delay COMMITs on 1 node
    - Add 2 more nodes (so that the slow node has multiple uncommitted node txns)
    - Make sure that all nodes have equal state eventually
    '''
    slow_node = txnPoolNodeSet[-1]
    fast_nodes = [node for node in txnPoolNodeSet if node != slow_node]
    slow_stashers = [slow_node.nodeIbStasher]

    # let's ignore view changes for simplicity here
    start_delaying([n.nodeIbStasher for n in txnPoolNodeSet], icDelay())

    with delay_rules(slow_stashers, cDelay()):
        # Add Node5
        new_node = add_new_node(looper, fast_nodes, sdk_pool_handle, sdk_wallet_steward,
                                tdir, tconf, allPluginsPath, name='Psi',
                                wait_till_added=False)
        txnPoolNodeSet.append(new_node)
        start_delaying(new_node.nodeIbStasher, icDelay())

        # Add Node6
        new_node = add_new_node(looper, fast_nodes, sdk_pool_handle, sdk_wallet_steward,
                                tdir, tconf, allPluginsPath, name='Eta',
                                wait_till_added=False)
        txnPoolNodeSet.append(new_node)
        start_delaying(new_node.nodeIbStasher, icDelay())

    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet, custom_timeout=20)
    sdk_ensure_pool_functional(looper, txnPoolNodeSet, sdk_wallet_client, sdk_pool_handle)
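`start_delaying` and its counterpart `stop_delaying_and_process` (used in test_view_change_during_unstash below) are stasher utilities not shown in this section. A rough sketch assuming the `delay`/`nodelay` stasher API visible elsewhere in this module; the `process_delayeds()` call is an assumption about the stasher interface:

```python
def start_delaying(stashers, delayer):
    # Install the delay rule on one or many stashers and return handles that
    # can later be passed to stop_delaying_and_process.
    if not isinstance(stashers, (list, tuple)):
        stashers = [stashers]
    for stasher in stashers:
        stasher.delay(delayer)
    return [(stasher, delayer) for stasher in stashers]


def stop_delaying_and_process(handles):
    # Remove the delay rule and let the stashed messages through.
    for stasher, delayer in handles:
        stasher.nodelay(delayer)
        stasher.process_delayeds()  # assumed stasher method; the real name may differ
```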
def test_delayed_instance_changes_after_vcd_for_next_view(looper, txnPoolNodeSet):
    '''
    A node is doing a view change to view=1, while the other nodes have already
    finished the view change to view=2.
    The node receives a quorum of VCD messages for view=2 before a quorum of
    InstanceChange messages for view=2.
    Nevertheless, the node should not start a view change to view=2 without a quorum
    of InstanceChanges, that is, it should not go into propagate primary mode since
    it's already in view change state.
    The node should eventually finish the view change to view=2 once it receives all
    VCD and IC msgs for view=2.
    '''
    nodes = txnPoolNodeSet
    slow_node = nodes[-1]
    fast_nodes = [n for n in nodes if n != slow_node]
    slow_stasher = slow_node.nodeIbStasher

    # 1. DO FIRST VIEW CHANGE

    # delay VCD for the first ViewChange
    with delay_rules(slow_stasher, vcd_delay()):
        # Trigger view change
        for n in nodes:
            n.view_changer.on_master_degradation()
        waitForViewChange(looper, nodes, expectedViewNo=1)

        # make sure view change is finished on all nodes except the slow one
        ensureElectionsDone(looper, fast_nodes, instances_list=range(3))

        # drop all VCD to view=1
        slow_stasher.drop_delayeds()

    # 2. DO SECOND VIEW CHANGE

    # delay Instance Changes so that the slow node receives VCD for view=2 before
    # a quorum of InstanceChanges for that view while still doing the view change to view=1
    with delay_rules(slow_stasher, icDelay()):
        # Trigger view change
        for n in nodes:
            n.view_changer.on_master_degradation()
        waitForViewChange(looper, fast_nodes, expectedViewNo=2)

        # make sure view change is finished on all nodes except the slow one
        ensureElectionsDone(looper, fast_nodes, instances_list=range(3))

        # slow node is still on view=1
        assert slow_node.viewNo == 1
        assert slow_node.view_change_in_progress

        # make sure that the slow node received VCD msgs for view=2
        # and didn't receive IC msgs for view=2
        check_vcd_msgs(slow_node, expected_view_no=2, expected_count=len(fast_nodes), )
        check_no_ic_msgs(slow_node, expected_view_no=2)

    # 3. RESET DELAYS AND CHECK

    waitForViewChange(looper, nodes, expectedViewNo=2)
    ensureElectionsDone(looper, nodes)

    assert not slow_node.view_change_in_progress
    ensure_all_nodes_have_same_data(looper, nodes=nodes)
def test_view_change_during_unstash(looper, txnPoolNodeSet, sdk_pool_handle,
                                    sdk_wallet_client, tconf):
    slow_node = txnPoolNodeSet[-1]
    other_nodes = txnPoolNodeSet[:-1]

    slow_stasher = slow_node.nodeIbStasher
    other_stashers = [n.nodeIbStasher for n in other_nodes]
    all_stashers = [n.nodeIbStasher for n in txnPoolNodeSet]

    # Preload nodes with some transactions
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 1)
    for node in txnPoolNodeSet:
        assert node.master_replica.last_ordered_3pc == (0, 1)

    # Prevent ordering of some requests
    start_delaying(all_stashers, delay_3pc(after=7, msgs=(Prepare, Commit)))

    # Stop ordering on slow node and send requests
    slow_node_after_5 = start_delaying(slow_stasher, delay_3pc(after=5, msgs=Commit))
    slow_node_until_5 = start_delaying(slow_stasher, delay_3pc(after=0))
    reqs_view_0 = sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client, 8)

    # Make pool order first 2 batches and pause
    pool_after_3 = start_delaying(other_stashers, delay_3pc(after=3))
    looper.run(eventually(check_nodes_ordered_till, other_nodes, 0, 3))

    # Start catchup, continue ordering everywhere (except two last batches on slow node)
    with delay_rules(slow_stasher, cr_delay()):
        slow_node._do_start_catchup(just_started=False)
        looper.run(eventually(check_catchup_is_started, slow_node))
        stop_delaying_and_process(pool_after_3)
        looper.run(eventually(check_nodes_ordered_till, other_nodes, 0, 7))

    # Finish catchup and continue processing on slow node
    looper.run(eventually(check_catchup_is_finished, slow_node))
    stop_delaying_and_process(slow_node_until_5)
    looper.run(eventually(check_nodes_ordered_till, [slow_node], 0, 5))

    # Start view change and allow slow node to get remaining commits
    with delay_rules(all_stashers, icDelay()):
        for node in txnPoolNodeSet:
            node.view_changer.on_master_degradation()
        looper.runFor(0.1)
    stop_delaying_and_process(slow_node_after_5)

    # Ensure that expected number of requests was ordered
    replies = sdk_get_replies(looper, reqs_view_0)
    for rep in replies[:6]:
        sdk_check_reply(rep)

    # Ensure that everything is ok
    ensureElectionsDone(looper, txnPoolNodeSet)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    sdk_ensure_pool_functional(looper, txnPoolNodeSet, sdk_wallet_client, sdk_pool_handle)
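`check_nodes_ordered_till` is assumed to compare the master replica's `last_ordered_3pc` (already used at the top of this test) against an expected (view_no, pp_seq_no) pair; a minimal sketch:

```python
def check_nodes_ordered_till(nodes, view_no, pp_seq_no):
    # Every node's master replica must have ordered at least up to the given
    # 3PC key; tuples compare lexicographically, i.e. by view first.
    for node in nodes:
        assert node.master_replica.last_ordered_3pc >= (view_no, pp_seq_no)
```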
def test_slow_nodes_catchup_before_selecting_primary_in_new_view(
        tconf, looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, one_node_added):
    """
    Delay 3PC messages to one node and view change messages to some others
    (including the primary) so that the node which does not receive enough 3PC
    messages falls behind but learns of the view change quickly and starts catchup.
    Other nodes learn of the view change late and thus keep on processing requests.
    """
    new_node = one_node_added
    nprs = [r.node for r in getNonPrimaryReplicas(txnPoolNodeSet, 0)]
    primary_node = getPrimaryReplica(txnPoolNodeSet, 0).node
    slow_node = nprs[-1]
    # nodes_slow_to_inst_chg = [primary_node] + nprs[:2]
    nodes_slow_to_inst_chg = [n for n in txnPoolNodeSet if n != slow_node]
    delay_3pc = 100
    delay_ic = 5

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 2 * Max3PCBatchSize)

    delay_3pc_messages([slow_node], 0, delay_3pc)

    for n in nodes_slow_to_inst_chg:
        n.nodeIbStasher.delay(icDelay(delay_ic))

    def start_count():
        return sum([1 for e in slow_node.ledgerManager.spylog.getAll(
            slow_node.ledgerManager.startCatchUpProcess.__name__)
                    if e.params['ledgerId'] == DOMAIN_LEDGER_ID])

    s = start_count()
    requests = sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client,
                                        10 * Max3PCBatchSize)

    ensure_view_change(looper, nodes=txnPoolNodeSet,
                       exclude_from_check=nodes_slow_to_inst_chg)

    sdk_get_and_check_replies(looper, requests)

    waitNodeDataEquality(looper, slow_node, *txnPoolNodeSet[:-1])

    e = start_count()
    assert e - s >= 2

    looper.run(eventually(checkViewNoForNodes, slow_node.viewNo))
    checkProtocolInstanceSetup(looper, txnPoolNodeSet, retryWait=1)

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 2 * Max3PCBatchSize)

    waitNodeDataEquality(looper, new_node, *nodes_slow_to_inst_chg)
def do_view_change_with_propagate_primary_on_one_delayed_node(
        slow_node, nodes, looper, sdk_pool_handle, sdk_wallet_client):

    slow_stasher = slow_node.nodeIbStasher

    fast_nodes = [n for n in nodes if n != slow_node]

    stashers = [n.nodeIbStasher for n in nodes]

    # Get the last prepared certificate in the pool
    lpc = last_prepared_certificate(nodes)
    # Get the pool's current view no
    view_no = lpc[0]

    with delay_rules(slow_stasher, icDelay()):
        with delay_rules(slow_stasher, vcd_delay()):
            with delay_rules(stashers, cDelay()):
                # Send a request
                request = sdk_send_random_request(looper, sdk_pool_handle, sdk_wallet_client)

                # Wait until this request is prepared on N-f nodes
                looper.run(eventually(check_last_prepared_certificate_on_quorum,
                                      nodes, (lpc[0], lpc[1] + 1)))

                # Trigger view change
                for n in nodes:
                    n.view_changer.on_master_degradation()

                # Wait until the view change is completed on all nodes except the slow one
                waitForViewChange(looper, fast_nodes,
                                  expectedViewNo=view_no + 1,
                                  customTimeout=waits.expectedPoolViewChangeStartedTimeout(len(nodes)))
                wait_for_elections_done_on_given_nodes(
                    looper, fast_nodes, getRequiredInstances(len(nodes)),
                    timeout=waits.expectedPoolElectionTimeout(len(nodes)))

            # Now all the nodes receive Commits
            # The slow node will accept Commits and order the 3PC-batch in the old view
            looper.runFor(waits.expectedOrderingTime(getNoInstances(len(nodes))))

        # Now the slow node receives ViewChangeDones
        waitForViewChange(looper, [slow_node],
                          expectedViewNo=view_no + 1,
                          customTimeout=waits.expectedPoolViewChangeStartedTimeout(len(nodes)))
        wait_for_elections_done_on_given_nodes(
            looper, [slow_node], getRequiredInstances(len(nodes)),
            timeout=waits.expectedPoolElectionTimeout(len(nodes)))

    # Now the slow node receives InstanceChanges but discards them because it has
    # already started propagate primary to the same view.

    # Finish the request gracefully
    sdk_get_reply(looper, request)
def test_propagate_primary_after_primary_restart_view_1(
        looper, txnPoolNodeSet, tconf, sdk_pool_handle, sdk_wallet_steward, tdir, allPluginsPath):
    """
    Delay InstanceChange msgs to prevent a view change during primary restart
    in order to test propagate primary for the primary node.
    ppSeqNo should be > 0 to be able to check that propagate primary restores
    all indices correctly for the case viewNo > 0.
    """
    ensure_view_change(looper, txnPoolNodeSet)
    checkViewNoForNodes(txnPoolNodeSet, expectedViewNo=1)

    sdk_ensure_pool_functional(looper, txnPoolNodeSet, sdk_wallet_steward, sdk_pool_handle)

    old_ppseqno = _get_ppseqno(txnPoolNodeSet)
    assert (old_ppseqno > 0)

    old_viewNo = checkViewNoForNodes(txnPoolNodeSet)
    old_primary = get_master_primary_node(txnPoolNodeSet)

    delay_instance_change(txnPoolNodeSet, IC_DELAY_SEC)

    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, old_primary, stopNode=True)

    looper.removeProdable(old_primary)

    logger.info("Restart node {}".format(old_primary))

    restartedNode = start_stopped_node(old_primary, looper, tconf, tdir, allPluginsPath,
                                       delay_instance_change_msgs=False)
    idx = [i for i, n in enumerate(txnPoolNodeSet) if n.name == restartedNode.name][0]
    txnPoolNodeSet[idx] = restartedNode

    restartedNode.nodeIbStasher.delay(icDelay(IC_DELAY_SEC))

    looper.run(checkNodesConnected(txnPoolNodeSet))
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)

    new_viewNo = checkViewNoForNodes(txnPoolNodeSet)
    assert (new_viewNo == old_viewNo)

    new_primary = get_master_primary_node(txnPoolNodeSet)
    assert (new_primary.name == old_primary.name)

    # check that ppSeqNo is the same
    _get_ppseqno(txnPoolNodeSet)

    sdk_ensure_pool_functional(looper, txnPoolNodeSet, sdk_wallet_steward, sdk_pool_handle)

    new_ppseqno = _get_ppseqno(txnPoolNodeSet)
    assert (new_ppseqno > old_ppseqno)
def test_start_view_change_by_vc_msgs(looper, txnPoolNodeSet, sdk_wallet_client, sdk_pool_handle):
    delayed_node = txnPoolNodeSet[-1]
    rest_nodes = txnPoolNodeSet[:-1]
    with delay_rules_without_processing(delayed_node.nodeIbStasher, icDelay()):
        current_view_no = checkViewNoForNodes(txnPoolNodeSet)
        trigger_view_change(txnPoolNodeSet)
        looper.run(eventually(checkViewNoForNodes, rest_nodes, current_view_no + 1))
        ensureElectionsDone(looper, txnPoolNodeSet)
        sdk_ensure_pool_functional(looper, txnPoolNodeSet, sdk_wallet_client, sdk_pool_handle)
def test_delay_IC_for_next_primary(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client):
    current_view_no = checkViewNoForNodes(txnPoolNodeSet)
    next_primary_name = get_next_primary_name(txnPoolNodeSet, current_view_no + 1)
    next_primary = [n for n in txnPoolNodeSet if n.name == next_primary_name][0]
    rest_nodes = list(set(txnPoolNodeSet) - {next_primary})
    with delay_rules_without_processing(next_primary.nodeIbStasher, icDelay()):
        trigger_view_change(txnPoolNodeSet)
        looper.run(eventually(checkViewNoForNodes, rest_nodes, current_view_no + 1))
        ensureElectionsDone(looper, txnPoolNodeSet)
        sdk_ensure_pool_functional(looper, txnPoolNodeSet, sdk_wallet_client, sdk_pool_handle)
    assert next_primary.master_replica.isPrimary
def start_stopped_node(stopped_node, looper, tconf, tdirWithPoolTxns, allPluginsPath):
    nodeHa, nodeCHa = HA(*stopped_node.nodestack.ha), HA(*stopped_node.clientstack.ha)
    restarted_node = TestNode(stopped_node.name,
                              basedirpath=tdirWithPoolTxns,
                              config=tconf,
                              ha=nodeHa,
                              cliha=nodeCHa,
                              pluginPaths=allPluginsPath)
    looper.add(restarted_node)

    # Even after reconnection INSTANCE_CHANGE messages are received,
    # so delay them enough to simulate a real disconnection. This needs to be fixed
    # soon, when simulating a disconnection drains the transport queues
    restarted_node.nodeIbStasher.delay(icDelay(200))
    return restarted_node
def testPrimaryRecvs3PhaseMessageOutsideWatermarks(perf_chk_patched, chkFreqPatched, looper,
                                                   txnPoolNodeSet, client1, wallet1,
                                                   client1Connected, reqs_for_logsize):
    """
    One of the primaries starts getting a lot of requests, more than its log size,
    and queues up requests since they would go beyond its watermarks. This happens
    because the other nodes are slow in processing its PRE-PREPAREs.
    Eventually this primary will send PRE-PREPAREs for all requests and those
    requests will complete.
    """
    tconf = perf_chk_patched
    delay = 5
    instId = 1
    reqs_to_send = 2 * reqs_for_logsize + 1
    logger.debug('Will send {} requests'.format(reqs_to_send))

    npr = getNonPrimaryReplicas(txnPoolNodeSet, instId)
    pr = getPrimaryReplica(txnPoolNodeSet, instId)
    from plenum.server.replica import TPCStat
    orderedCount = pr.stats.get(TPCStat.OrderSent)

    for r in npr:
        r.node.nodeIbStasher.delay(ppDelay(delay, instId))
        r.node.nodeIbStasher.delay(pDelay(delay, instId))

    # do not do any view changes since we're dealing with a non-master instance and
    # may not have ordered all requests if the view is changed.
    # delay for all nodes (both primary and non-primary), since this is a delay
    # for receiving, not sending.
    for node in txnPoolNodeSet:
        node.nodeIbStasher.delay(icDelay(300))

    tm_exec_1_batch = waits.expectedTransactionExecutionTime(len(txnPoolNodeSet))
    batch_count = math.ceil(reqs_to_send / tconf.Max3PCBatchSize)
    total_timeout = (tm_exec_1_batch + delay) * batch_count

    def chk():
        assert orderedCount + batch_count == pr.stats.get(TPCStat.OrderSent)

    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, reqs_to_send)
    looper.run(eventually(chk, retryWait=1, timeout=total_timeout))
def start_stopped_node(stopped_node, looper, tconf, tdir, allPluginsPath,
                       delay_instance_change_msgs=True):
    nodeHa, nodeCHa = HA(*stopped_node.nodestack.ha), HA(*stopped_node.clientstack.ha)
    config_helper = PNodeConfigHelper(stopped_node.name, tconf, chroot=tdir)
    restarted_node = TestNode(stopped_node.name,
                              config_helper=config_helper,
                              config=tconf,
                              ha=nodeHa,
                              cliha=nodeCHa,
                              pluginPaths=allPluginsPath)
    looper.add(restarted_node)

    # Even after reconnection INSTANCE_CHANGE messages are received,
    # so delay them enough to simulate a real disconnection. This needs to be fixed
    # soon, when simulating a disconnection drains the transport queues
    # TODO is it still actual?
    if delay_instance_change_msgs:
        restarted_node.nodeIbStasher.delay(icDelay(200))
    return restarted_node
def test_add_vc_start_msg_during_start_view_change(txnPoolNodeSet, looper):
    delayed_node = txnPoolNodeSet[-1]
    current_view_no = delayed_node.viewNo
    proposed_view_no = current_view_no + 1
    delayed_node.nodeIbStasher.delay(vcd_delay(1000))
    delayed_node.nodeIbStasher.delay(icDelay(1000))
    assert delayed_node.view_changer
    ensure_view_change(looper, txnPoolNodeSet[:-1])
    """
    If the view number was incremented, that means that an InstanceChange quorum
    was achieved
    """
    assert txnPoolNodeSet[0].viewNo == proposed_view_no
    looper.removeProdable(delayed_node)
    delayed_node.nodeInBox.append((InstanceChange(1, 25), 'Alpha'))
    delayed_node.nodeInBox.append((InstanceChange(1, 25), 'Beta'))
    delayed_node.nodeInBox.append((InstanceChange(1, 25), 'Gamma'))
    delayed_node.processNodeInBox = functools.partial(Node.processNodeInBox, delayed_node)
    looper.run(delayed_node.processNodeInBox())
    looper.run(delayed_node.serviceViewChanger(None))
    assert len(delayed_node.nodeInBox) == 1
    m = delayed_node.nodeInBox.popleft()
    assert isinstance(m[0], ViewChangeStartMessage)
def test_freshness_instance_changes_are_sent_continuosly(looper, tconf, txnPoolNodeSet,
                                                         sdk_wallet_client, sdk_pool_handle):
    current_view_no = txnPoolNodeSet[0].viewNo
    for node in txnPoolNodeSet:
        assert node.viewNo == current_view_no

    def check_instance_change_messages(count=1):
        for node in txnPoolNodeSet:
            assert has_freshness_instance_change(node, count)

    stashers = [n.nodeIbStasher for n in txnPoolNodeSet]
    with delay_rules(stashers, ppDelay(), icDelay()):
        looper.run(eventually(check_instance_change_messages, 3,
                              timeout=FRESHNESS_TIMEOUT * 5))

        for node in txnPoolNodeSet:
            all_instance_changes = node.view_changer.spylog.getAll('sendInstanceChange')
            freshness_instance_changes = [ic for ic in all_instance_changes
                                          if ic.params['suspicion'].code == 43]

            # Ensure that all instance change messages were due to freshness
            assert len(all_instance_changes) == len(freshness_instance_changes)

            # Ensure that all instance change messages are for the same view
            for ic in freshness_instance_changes:
                assert ic.params['view_no'] == current_view_no + 1

            # Ensure instance change messages had a sensible interval
            for ic1, ic2 in zip(freshness_instance_changes[1:], freshness_instance_changes):
                interval = ic2.starttime - ic1.starttime
                assert 0.9 * FRESHNESS_TIMEOUT < interval < 2.1 * FRESHNESS_TIMEOUT
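`has_freshness_instance_change` is not defined in this section; based on the spylog filter used right below it, it presumably checks that a node has sent at least `count` freshness-driven InstanceChange messages. A sketch:

```python
def has_freshness_instance_change(node, count=1):
    # Freshness-driven InstanceChanges are identified here by suspicion code 43,
    # the same code the test body asserts on; treating 43 as the freshness
    # suspicion code is an assumption.
    ics = node.view_changer.spylog.getAll('sendInstanceChange')
    freshness_ics = [ic for ic in ics if ic.params['suspicion'].code == 43]
    return len(freshness_ics) >= count
```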
def test_no_propagated_future_view_change_while_view_change(txnPoolNodeSet, looper):
    # the last node is a lagging one, which will receive ViewChangeDone messages for a future view
    viewNo = checkViewNoForNodes(txnPoolNodeSet)
    lagged_node = txnPoolNodeSet[-1]
    other_nodes = list(set(txnPoolNodeSet) - {lagged_node})

    # emulate a view change in progress
    lagged_node.view_changer.view_change_in_progress = True
    old_view_no = checkViewNoForNodes([lagged_node])

    initial_vhdc = \
        lagged_node.view_changer.spylog.count(lagged_node.view_changer.process_future_view_vchd_msg.__name__)

    # delay INSTANCE CHANGE on the lagged node, so all nodes except the lagging one finish the View Change
    with delay_rules(lagged_node.nodeIbStasher, icDelay()):
        # make sure that the View Change happened on all nodes but the lagging one
        ensure_view_change(looper, other_nodes)
        checkProtocolInstanceSetup(looper=looper, nodes=other_nodes, instances=range(2))
        ensure_all_nodes_have_same_data(looper, nodes=other_nodes)

        # check that the lagged node received 3 future VCD messages but didn't start a new view change
        assert len(other_nodes) + initial_vhdc == \
            lagged_node.view_changer.spylog.count(lagged_node.view_changer.process_future_view_vchd_msg.__name__)
        assert old_view_no == checkViewNoForNodes([lagged_node])
def test_new_primary_has_wrong_clock(tconf, looper, txnPoolNodeSet,
                                     sdk_wallet_client, sdk_pool_handle):
    """
    One of the non-primary nodes has a bad clock: it raises suspicions but orders
    requests after getting PREPAREs. Then a view change happens and this
    non-primary with the bad clock becomes the new primary, but it is not able to
    get any of its PRE-PREPAREs ordered. Eventually another view change happens,
    a new primary is elected and the pool is functional again.
    :return:
    """
    # The node having the bad clock; this node will be primary after the view change
    faulty_node = getNonPrimaryReplicas(txnPoolNodeSet, 0)[0].node
    make_clock_faulty(faulty_node)
    assert not faulty_node.master_replica.isPrimary
    # faulty_node replies too
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, count=Max3PCBatchSize * 3)

    ledger_sizes = {node.name: node.domainLedger.size for node in txnPoolNodeSet}
    susp_counts = {node.name: get_timestamp_suspicion_count(node) for node in txnPoolNodeSet}
    ensure_view_change(looper, txnPoolNodeSet)
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)

    # After the view change, faulty_node is primary
    assert faulty_node.master_replica.isPrimary

    old_view_no = txnPoolNodeSet[0].viewNo

    # Delay instance changes so a view change doesn't happen in the middle of this test
    stashers = (n.nodeIbStasher for n in txnPoolNodeSet)
    with delay_rules(stashers, icDelay()):
        # Requests are sent
        for _ in range(5):
            sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client, count=2)
            looper.runFor(2)

        def chk():
            for node in txnPoolNodeSet:
                assert node.viewNo == old_view_no
            for node in [n for n in txnPoolNodeSet if n != faulty_node]:
                # Each non-faulty node raises a suspicion
                assert get_timestamp_suspicion_count(node) > susp_counts[node.name]
                # Ledger does not change
                assert node.domainLedger.size == ledger_sizes[node.name]
            assert faulty_node.domainLedger.size == ledger_sizes[faulty_node.name]

        looper.run(eventually(chk, retryWait=1))

    # Eventually another view change happens
    looper.run(eventually(checkViewNoForNodes, txnPoolNodeSet, old_view_no + 1,
                          retryWait=1, timeout=2 * tconf.PerfCheckFreq))
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)

    # After the view change, faulty_node is no longer the primary
    assert not faulty_node.master_replica.isPrimary

    # All nodes reply
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, count=Max3PCBatchSize * 2)
def test_audit_ledger_view_change(looper, txnPoolNodeSet,
                                  sdk_pool_handle, sdk_wallet_client, sdk_wallet_steward,
                                  initial_domain_size, initial_pool_size, initial_config_size,
                                  tdir, tconf, allPluginsPath,
                                  view_no, pp_seq_no, initial_seq_no,
                                  monkeypatch):
    '''
    1. Send a NODE transaction and add a 7th node (adding a new instance),
       but delay Ordered messages.
    2. Send a NYM txn.
    3. Reset delays while executing force_process_ordered.
    4. Check that the audit txn for the NYM txn uses the primary list from the
       uncommitted audit with a new list of primaries.
    '''
    other_nodes = txnPoolNodeSet[:-1]
    slow_node = txnPoolNodeSet[-1]

    # Add a new steward for creating a new node
    new_steward_wallet_handle = sdk_add_new_nym(looper, sdk_pool_handle,
                                                sdk_wallet_steward,
                                                alias="newSteward",
                                                role=STEWARD_STRING)

    audit_size_initial = [node.auditLedger.size for node in txnPoolNodeSet]

    ordereds = []
    monkeypatch.setattr(slow_node, 'try_processing_ordered',
                        lambda msg: ordereds.append(msg))
    with delay_rules([n.nodeIbStasher for n in txnPoolNodeSet], icDelay()):
        # Send a NODE txn for the 7th node
        new_node = sdk_add_new_node(looper, sdk_pool_handle, new_steward_wallet_handle,
                                    "Theta", tdir, tconf, allPluginsPath)
        txnPoolNodeSet.append(new_node)
        looper.run(checkNodesConnected(other_nodes + [new_node]))

        sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 1)

        check_audit_ledger_updated(audit_size_initial, [slow_node], audit_txns_added=0)
        looper.run(eventually(check_audit_ledger_uncommitted_updated,
                              audit_size_initial, [slow_node], 2))

        def patch_force_process_ordered():
            for msg in list(ordereds):
                slow_node.replicas[msg.instId].outBox.append(msg)
                ordereds.remove(msg)
            monkeypatch.undo()
            slow_node.force_process_ordered()

        assert ordereds
        monkeypatch.setattr(slow_node, 'force_process_ordered', patch_force_process_ordered)

    looper.run(eventually(lambda: assertExp(all(n.viewNo == 1 for n in txnPoolNodeSet))))
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
    looper.run(eventually(lambda: assertExp(not ordereds)))

    for node in txnPoolNodeSet:
        last_txn = node.auditLedger.get_last_txn()
        last_txn['txn']['data']['primaries'] = node._get_last_audited_primaries()
        check_audit_txn(txn=last_txn,
                        view_no=view_no + 1, pp_seq_no=1,
                        seq_no=initial_seq_no + 4,
                        txn_time=node.master_replica._ordering_service.last_accepted_pre_prepare_time,
                        txn_roots={DOMAIN_LEDGER_ID: node.getLedger(DOMAIN_LEDGER_ID).tree.root_hash},
                        state_roots={DOMAIN_LEDGER_ID: node.getState(DOMAIN_LEDGER_ID).committedHeadHash},
                        pool_size=initial_pool_size + 1,
                        domain_size=initial_domain_size + 2,
                        config_size=initial_config_size,
                        last_pool_seqno=2,
                        last_domain_seqno=1,
                        last_config_seqno=None,
                        primaries=node.write_manager.future_primary_handler.get_last_primaries() or node.primaries)
def delay_instance_change(txnPoolNodeSet, val):
    for n in txnPoolNodeSet:
        n.nodeIbStasher.delay(icDelay(val))
def test_finish_view_change_with_incorrect_primaries_list(looper, txnPoolNodeSet, sdk_pool_handle,
                                                          sdk_wallet_steward, tdir, tconf,
                                                          allPluginsPath):
    """
    This test imitates the situation when one of the nodes is lagging.
    It missed the txn for adding a new node and the view_change after it.
    After that the lagged node starts the next view_change together with the other
    nodes, but it has a different committed node_reg and therefore selects other
    primaries. In this case we expect that the lagged node will complete the
    view_change with the other primaries and will start catchup by Checkpoints
    because it will not be able to order.
    """
    def complete_vc(node):
        assert not node.view_change_in_progress

    view_no = checkViewNoForNodes(txnPoolNodeSet)

    # Delta is lagged
    lagging_node = txnPoolNodeSet[3]
    fast_nodes = txnPoolNodeSet[:3] + txnPoolNodeSet[4:]

    # Force 5 view changes so that we have viewNo == 5 and Zeta is the primary.
    for _ in range(5):
        trigger_view_change(txnPoolNodeSet)
        waitForViewChange(looper, txnPoolNodeSet, view_no + 1)
        ensureElectionsDone(looper, txnPoolNodeSet)
        view_no = checkViewNoForNodes(txnPoolNodeSet)

    with delay_rules_without_processing(lagging_node.nodeIbStasher,
                                        msg_rep_delay(), icDelay(), vc_delay(),
                                        nv_delay(), cDelay(), ppDelay(), pDelay()):
        # Add a new node; this action should start a view_change because a NODE txn was ordered
        _, theta = sdk_add_new_steward_and_node(looper, sdk_pool_handle, sdk_wallet_steward,
                                                'Theta_Steward', 'Theta',
                                                tdir, tconf, allPluginsPath=allPluginsPath)
        txnPoolNodeSet.append(theta)
        fast_nodes.append(theta)

        looper.run(checkNodesConnected(fast_nodes))
        ensure_all_nodes_have_same_data(looper, fast_nodes)
        waitForViewChange(looper, fast_nodes, view_no + 1)
        ensureElectionsDone(looper, fast_nodes)

    assert lagging_node.viewNo != fast_nodes[0].viewNo
    assert fast_nodes[0].viewNo == view_no + 1

    current_view_no = checkViewNoForNodes(fast_nodes)
    expected_view_no = current_view_no + 1
    trigger_view_change(txnPoolNodeSet)
    waitForViewChange(looper, txnPoolNodeSet, expected_view_no)
    ensureElectionsDone(looper, fast_nodes)

    looper.run(eventually(complete_vc, lagging_node, timeout=60))
    assert lagging_node.viewNo == expected_view_no

    # We assume that after receiving 2 Checkpoints the lagged node will start catchup
    # and elect the right primaries
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, 2 * CHK_SIZE)
    ensureElectionsDone(looper, txnPoolNodeSet)
    sdk_ensure_pool_functional(looper, txnPoolNodeSet, sdk_wallet_steward, sdk_pool_handle)
def test_new_primary_has_wrong_clock(tconf, looper, txnPoolNodeSet,
                                     sdk_wallet_client, sdk_pool_handle):
    """
    One of the non-primary nodes has a bad clock: it raises suspicions but orders
    requests after getting PREPAREs. Then a view change happens and this
    non-primary with the bad clock becomes the new primary, but it is not able to
    get any of its PRE-PREPAREs ordered. Eventually another view change happens,
    a new primary is elected and the pool is functional again.
    :return:
    """
    # The node having the bad clock; this node will be primary after the view change
    faulty_node = getNonPrimaryReplicas(txnPoolNodeSet, 0)[0].node
    make_clock_faulty(faulty_node)
    assert not faulty_node.master_replica.isPrimary
    # faulty_node replies too
    ledger_sizes = {node.name: node.domainLedger.size for node in txnPoolNodeSet}
    susp_counts = {node.name: get_timestamp_suspicion_count(node) for node in txnPoolNodeSet}
    ensure_view_change(looper, txnPoolNodeSet)
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)

    # After the view change, faulty_node is primary
    assert faulty_node.master_replica.isPrimary

    old_view_no = txnPoolNodeSet[0].viewNo

    # Delay instance changes so a view change doesn't happen in the middle of this test
    stashers = (n.nodeIbStasher for n in txnPoolNodeSet)
    with delay_rules(stashers, icDelay()):
        # Requests are sent
        for _ in range(5):
            sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client, count=2)
            looper.runFor(2)

        def chk():
            for node in txnPoolNodeSet:
                assert node.viewNo == old_view_no
            for node in [n for n in txnPoolNodeSet if n != faulty_node]:
                # Each non-faulty node raises a suspicion
                assert get_timestamp_suspicion_count(node) > susp_counts[node.name]
                # Ledger does not change
                assert node.domainLedger.size == ledger_sizes[node.name]
            assert faulty_node.domainLedger.size == ledger_sizes[faulty_node.name]

        looper.run(eventually(chk, retryWait=1))

    # Eventually another view change happens
    ensure_view_change(looper, txnPoolNodeSet)
    looper.run(eventually(checkViewNoForNodes, txnPoolNodeSet, old_view_no + 1,
                          retryWait=1, timeout=2 * tconf.PerfCheckFreq))
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)

    # After the view change, faulty_node is no longer the primary
    assert not faulty_node.master_replica.isPrimary

    # All nodes reply
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, count=Max3PCBatchSize * 2)
def testQueueingReqFromFutureView(delayedPerf, looper, nodeSet, up, wallet1, client1):
    """
    Test that every node queues 3-phase requests (PRE-PREPARE, PREPARE and COMMIT)
    that come from a view which is greater than the current view
    """
    f = getMaxFailures(nodeCount)

    # Delay processing of instance change on a node
    nodeA = nodeSet.Alpha
    nodeA.nodeIbStasher.delay(icDelay(60))

    nonPrimReps = getNonPrimaryReplicas(nodeSet, 0)
    # Delay processing of PRE-PREPARE from all non-primary replicas of the master
    # so the master's throughput falls and the view changes
    ppDelayer = ppDelay(5, 0)
    for r in nonPrimReps:
        r.node.nodeIbStasher.delay(ppDelayer)

    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 4,
                                        timeoutPerReq=5 * nodeCount)

    # Every node except Node A should have a view change
    for node in nodeSet:
        if node.name != nodeA.name:
            looper.run(eventually(
                partial(checkViewChangeInitiatedForNode, node, 1),
                retryWait=1, timeout=20))

    # Node A's view should not have changed yet
    with pytest.raises(AssertionError):
        looper.run(eventually(partial(
            checkViewChangeInitiatedForNode, nodeA, 1),
            retryWait=1, timeout=20))

    # NodeA should not have any pending 3-phase requests for a later view
    for r in nodeA.replicas:  # type: TestReplica
        assert len(r.threePhaseMsgsForLaterView) == 0

    # Reset delays on incoming messages from all nodes
    for node in nodeSet:
        node.nodeIbStasher.nodelay(ppDelayer)

    # Send one more request
    sendRandomRequest(wallet1, client1)

    def checkPending3PhaseReqs():
        # Get all replicas that have their primary status decided
        reps = [rep for rep in nodeA.replicas if rep.isPrimary is not None]
        # At least one replica should have its primary status decided
        assert len(reps) > 0
        for r in reps:  # type: TestReplica
            logger.debug("primary status for replica {} is {}".format(r, r.primaryNames))
            assert len(r.threePhaseMsgsForLaterView) > 0

    # NodeA should now have pending 3-phase requests for a later view
    looper.run(eventually(checkPending3PhaseReqs, retryWait=1, timeout=30))
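`checkViewChangeInitiatedForNode` belongs to the older test helpers and is not shown here; a simplified sketch of the condition the test waits for (the real helper may also inspect election state):

```python
def checkViewChangeInitiatedForNode(node, expected_view_no):
    # The node should have moved to (at least) the proposed view.
    assert node.viewNo >= expected_view_no
```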
def test_successive_batch_do_no_change_state(looper, tconf, nodeSet,
                                             sdk_pool_handle, sdk_wallet_trustee,
                                             monkeypatch):
    """
    Send 2 NYM txns in different batches such that the second batch does not
    change state, so that the state root remains the same, but keep the identifier
    and reqId different. Make sure the first request is not ordered by the
    primary before the PRE-PREPARE for the second is sent.
    Also check reject and commit.
    :return:
    """
    # Disable view change during this test
    for n in nodeSet:
        n.nodeIbStasher.delay(icDelay())

    # Delay only the first PRE-PREPARE
    delay_cm_duration = 10

    def delay_commits(wrappedMsg):
        msg, sender = wrappedMsg
        if isinstance(msg, Commit) and msg.instId == 0:
            return delay_cm_duration

    def check_verkey(i, vk):
        for node in nodeSet:
            data = node.idrCache.getNym(i, isCommitted=True)
            assert data[VERKEY] == vk

    def check_uncommitted(count):
        for node in nodeSet:
            assert len(node.idrCache.un_committed) == count

    for node in nodeSet:
        for rpl in node.replicas:
            monkeypatch.setattr(rpl, '_request_missing_three_phase_messages',
                                lambda *x, **y: None)

    wh, did = sdk_wallet_trustee
    seed = randomString(32)
    (new_did, verkey) = looper.loop.run_until_complete(
        create_and_store_my_did(wh, json.dumps({'seed': seed})))
    sdk_add_new_nym(looper, sdk_pool_handle, sdk_wallet_trustee,
                    dest=new_did, verkey=verkey)

    for node in nodeSet:
        node.nodeIbStasher.delay(delay_commits)

    # Setting the same verkey thrice but in different batches with different
    # request ids
    for _ in range(3):
        verkey = sdk_rotate_verkey(looper, sdk_pool_handle, wh, new_did, new_did, seed)
        logger.debug('{} rotates his key to {}'.format(new_did, verkey))

    # Number of uncommitted entries is 0
    looper.run(eventually(check_uncommitted, 0))

    check_verkey(new_did, verkey)

    # Setting the verkey to `x`, then `y` and then back to `x` but in different
    # batches with different request ids. The idea is to change
    # state root to `t` then `t'` and then back to `t` and observe that no
    # errors are encountered
    seed = randomString(32)
    (new_client_did, verkey) = looper.loop.run_until_complete(
        create_and_store_my_did(wh, json.dumps({'seed': seed})))
    sdk_add_new_nym(looper, sdk_pool_handle, sdk_wallet_trustee,
                    dest=new_client_did, verkey=verkey)

    x_seed = randomString(32)
    verkey = sdk_rotate_verkey(looper, sdk_pool_handle, wh, new_client_did, new_client_did, x_seed)
    logger.debug('{} rotates his key to {}'.format(new_client_did, verkey))

    y_seed = randomString(32)
    sdk_rotate_verkey(looper, sdk_pool_handle, wh, new_client_did, new_client_did, y_seed)
    logger.debug('{} rotates his key to {}'.format(new_client_did, verkey))

    sdk_rotate_verkey(looper, sdk_pool_handle, wh, new_client_did, new_client_did, x_seed)
    logger.debug('{} rotates his key to {}'.format(new_client_did, verkey))

    # Number of uncommitted entries is 0
    looper.run(eventually(check_uncommitted, 0))

    check_verkey(new_client_did, verkey)

    monkeypatch.undo()

    # Delay COMMITs so that IdrCache can be checked for the correct
    # number of entries
    uncommitteds = {}
    methods = {}
    for node in nodeSet:
        cache = node.idrCache  # type: IdrCache
        uncommitteds[cache._name] = []

        # Since the post batch creation handler is registered (added to a list),
        # find it and patch it
        dh = node.get_req_handler(DOMAIN_LEDGER_ID)
        for i, handler in enumerate(dh.post_batch_creation_handlers):
            # Find the cache's post create handler, not hardcoding names of
            # class or functions as they can change with refactoring.
            if handler.__func__.__qualname__ == '{}.{}'.format(
                    cache.__class__.__name__, cache.currentBatchCreated.__name__):
                cre = dh.post_batch_creation_handlers[i]
                break
        com = cache.onBatchCommitted
        methods[cache._name] = (cre, com)

        # Patch methods to record and check roots after commit
        def patched_cre(self, stateRoot):
            uncommitteds[self._name].append(stateRoot)
            return methods[self._name][0](stateRoot)

        def patched_com(self, stateRoot):
            assert uncommitteds[self._name][0] == stateRoot
            rv = methods[self._name][1](stateRoot)
            uncommitteds[self._name] = uncommitteds[self._name][1:]
            return rv

        dh.post_batch_creation_handlers[i] = types.MethodType(patched_cre, cache)
        cache.onBatchCommitted = types.MethodType(patched_com, cache)

    # Set verkey of multiple identities
    more = 5
    keys = {}
    reqs = []
    for _ in range(more):
        seed = randomString(32)
        nym_request, new_did = looper.loop.run_until_complete(
            prepare_nym_request(sdk_wallet_trustee, seed, None, None))
        keys[new_did] = json.loads(nym_request)['operation']['verkey']
        reqs.append(sdk_sign_and_send_prepared_request(looper, sdk_wallet_trustee,
                                                       sdk_pool_handle, nym_request))
        looper.runFor(.01)

    # Correct number of uncommitted entries
    looper.run(eventually(check_uncommitted, more, retryWait=1))

    sdk_get_and_check_replies(looper, reqs)

    # Number of uncommitted entries is 0
    looper.run(eventually(check_uncommitted, 0))

    # The verkeys are correct
    for i, v in keys.items():
        check_verkey(i, v)

    waitNodeDataEquality(looper, nodeSet[0], *nodeSet[1:])

    for _ in range(3):
        seed = randomString(32)
        nym_request, new_did = looper.loop.run_until_complete(
            prepare_nym_request(sdk_wallet_trustee, seed, None, None))
        reqs.append(sdk_sign_and_send_prepared_request(looper, sdk_wallet_trustee,
                                                       sdk_pool_handle, nym_request))
        looper.runFor(.01)

    # Correct number of uncommitted entries
    looper.run(eventually(check_uncommitted, 3, retryWait=1))

    # Check batch reject
    for node in nodeSet:
        cache = node.idrCache
        initial = cache.un_committed
        cache.batchRejected()
        # After reject, the last entry is removed
        assert cache.un_committed == initial[:-1]
        root = cache.un_committed[0][0]
        cache.onBatchCommitted(root)
        # Calling commit with the same root results in an AssertionError
        with pytest.raises(AssertionError):
            cache.onBatchCommitted(root)
def test_successive_batch_do_no_change_state(looper,
                                             tdirWithDomainTxnsUpdated,
                                             tdirWithClientPoolTxns,
                                             tconf, nodeSet, trustee,
                                             trusteeWallet, monkeypatch):
    """
    Send 2 NYM txns in different batches such that the second batch does not
    change state, so the state root remains the same, but keep the identifier
    and reqId different. Make sure the first request is not ordered by the
    primary before the PRE-PREPARE for the second is sent.
    Also check reject and commit.
    """
    all_reqs = []

    # Disable view change during this test
    for n in nodeSet:
        n.nodeIbStasher.delay(icDelay())

    # Delay COMMITs on the master instance by this duration
    delay_cm_duration = 10

    def delay_commits(wrappedMsg):
        msg, sender = wrappedMsg
        if isinstance(msg, Commit) and msg.instId == 0:
            return delay_cm_duration

    def new_identity():
        wallet = Wallet(randomString(5))
        signer = DidSigner()
        new_idr, _ = wallet.addIdentifier(signer=signer)
        verkey = wallet.getVerkey(new_idr)
        idy = Identity(identifier=new_idr,
                       verkey=verkey,
                       role=None)
        return idy, wallet

    def submit_id_req(idy, wallet=None, client=None):
        nonlocal all_reqs
        wallet = wallet if wallet is not None else trusteeWallet
        client = client if client is not None else trustee
        wallet.updateTrustAnchoredIdentity(idy)
        reqs = wallet.preparePending()
        all_reqs.extend(reqs)
        client.submitReqs(*reqs)
        return reqs

    def submit_id_req_and_wait(idy, wallet=None, client=None):
        reqs = submit_id_req(idy, wallet=wallet, client=client)
        looper.runFor(.2)
        return reqs

    def check_verkey(i, vk):
        for node in nodeSet:
            data = node.idrCache.getNym(i, isCommitted=True)
            assert data[VERKEY] == vk

    def check_uncommitted(count):
        for node in nodeSet:
            assert len(node.idrCache.un_committed) == count

    for node in nodeSet:
        for rpl in node.replicas:
            monkeypatch.setattr(rpl, '_request_missing_three_phase_messages',
                                lambda *x, **y: None)

    idy, new_wallet = new_identity()
    new_idr = idy.identifier
    verkey = idy.verkey

    submit_id_req(idy)
    waitForSufficientRepliesForRequests(looper, trustee,
                                        requests=all_reqs[-1:],
                                        add_delay_to_timeout=delay_cm_duration)

    for node in nodeSet:
        node.nodeIbStasher.delay(delay_commits)

    new_client, _ = genTestClient(nodeSet, tmpdir=tdirWithClientPoolTxns,
                                  usePoolLedger=True)
    looper.add(new_client)
    looper.run(new_client.ensureConnectedToNodes(count=len(nodeSet)))
    new_client.registerObserver(new_wallet.handleIncomingReply, name='temp')
    idy.seqNo = None

    # Setting the same verkey thrice but in different batches with different
    # request ids
    for _ in range(3):
        req, = submit_id_req_and_wait(idy, wallet=new_wallet,
                                      client=new_client)
        logger.debug('{} sent request {} to change verkey'.format(
            new_client, req))

    waitForSufficientRepliesForRequests(looper, new_client,
                                        requests=all_reqs[-3:],
                                        add_delay_to_timeout=delay_cm_duration)

    # Number of uncommitted entries is 0
    looper.run(eventually(check_uncommitted, 0))

    check_verkey(new_idr, verkey)

    new_client.deregisterObserver(name='temp')

    # Setting the verkey to `x`, then `y` and then back to `x` but in
    # different batches with different request ids. The idea is to change the
    # state root to `t`, then `t'` and then back to `t` and observe that no
    # errors are encountered
    idy, new_wallet = new_identity()
    submit_id_req(idy)
    waitForSufficientRepliesForRequests(looper, trustee,
                                        requests=all_reqs[-1:],
                                        add_delay_to_timeout=delay_cm_duration)

    new_client.registerObserver(new_wallet.handleIncomingReply)
    idy.seqNo = None

    x_signer = SimpleSigner(identifier=idy.identifier)
    idy.verkey = x_signer.verkey
    req, = submit_id_req_and_wait(idy, wallet=new_wallet, client=new_client)
    new_wallet.updateSigner(idy.identifier, x_signer)
    logger.debug('{} sent request {} to change verkey'.format(new_client, req))

    y_signer = SimpleSigner(identifier=idy.identifier)
    idy.verkey = y_signer.verkey
    req, = submit_id_req_and_wait(idy, wallet=new_wallet, client=new_client)
    new_wallet.updateSigner(idy.identifier, y_signer)
    logger.debug('{} sent request {} to change verkey'.format(new_client, req))

    idy.verkey = x_signer.verkey
    req, = submit_id_req_and_wait(idy, wallet=new_wallet, client=new_client)
    new_wallet.updateSigner(idy.identifier, x_signer)
    logger.debug('{} sent request {} to change verkey'.format(new_client, req))

    waitForSufficientRepliesForRequests(looper, new_client,
                                        requests=all_reqs[-3:],
                                        add_delay_to_timeout=delay_cm_duration)

    # Number of uncommitted entries is 0
    looper.run(eventually(check_uncommitted, 0))

    check_verkey(new_idr, verkey)

    monkeypatch.undo()

    # Delay COMMITs so that IdrCache can be checked for correct
    # number of entries
    uncommitteds = {}
    methods = {}
    for node in nodeSet:
        cache = node.idrCache  # type: IdrCache
        uncommitteds[cache._name] = []

        # Since the post batch creation handler is registered (added to a
        # list), find it and patch it
        dh = node.get_req_handler(DOMAIN_LEDGER_ID)
        for i, handler in enumerate(dh.post_batch_creation_handlers):
            # Find the cache's post create handler, not hardcoding names of
            # class or functions as they can change with refactoring.
            if handler.__func__.__qualname__ == '{}.{}'.format(
                    cache.__class__.__name__,
                    cache.currentBatchCreated.__name__):
                cre = dh.post_batch_creation_handlers[i]
                break
        com = cache.onBatchCommitted
        methods[cache._name] = (cre, com)

        # Patch methods to record and check roots after commit
        def patched_cre(self, stateRoot):
            uncommitteds[self._name].append(stateRoot)
            return methods[self._name][0](stateRoot)

        def patched_com(self, stateRoot):
            assert uncommitteds[self._name][0] == stateRoot
            rv = methods[self._name][1](stateRoot)
            uncommitteds[self._name] = uncommitteds[self._name][1:]
            return rv

        dh.post_batch_creation_handlers[i] = types.MethodType(
            patched_cre, cache)
        cache.onBatchCommitted = types.MethodType(patched_com, cache)

    # Set verkey of multiple identities
    more = 5
    keys = {}
    for _ in range(more):
        idy, _ = new_identity()
        keys[idy.identifier] = idy.verkey
        submit_id_req(idy)
        looper.runFor(.01)

    # Correct number of uncommitted entries
    looper.run(eventually(check_uncommitted, more, retryWait=1))

    waitForSufficientRepliesForRequests(looper, trustee,
                                        requests=all_reqs[-more:],
                                        add_delay_to_timeout=delay_cm_duration)

    # Number of uncommitted entries is 0
    looper.run(eventually(check_uncommitted, 0))

    # The verkeys are correct
    for i, v in keys.items():
        check_verkey(i, v)

    waitNodeDataEquality(looper, nodeSet[0], *nodeSet[1:])

    keys = {}
    for _ in range(3):
        idy, _ = new_identity()
        keys[idy.identifier] = idy.verkey
        submit_id_req(idy)
        looper.runFor(.01)

    # Correct number of uncommitted entries
    looper.run(eventually(check_uncommitted, 3, retryWait=1))

    # Check batch reject
    for node in nodeSet:
        cache = node.idrCache
        initial = cache.un_committed
        cache.batchRejected()
        # After reject, the last entry is removed
        assert cache.un_committed == initial[:-1]
        root = cache.un_committed[0][0]
        cache.onBatchCommitted(root)
        # Calling commit with the same root results in an AssertionError
        with pytest.raises(AssertionError):
            cache.onBatchCommitted(root)
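
# The batch bookkeeping checked above boils down to a simple discipline: every
# created batch appends an uncommitted state root, commits consume roots
# oldest-first, and a reject drops only the most recently created batch. The
# standalone toy model below (illustrative only, not the real IdrCache API)
# shows the invariants the assertions exercise, including the AssertionError
# raised when the same root is committed twice.


class UncommittedBatchTracker:
    """Toy FIFO-commit / LIFO-reject tracker for uncommitted state roots."""

    def __init__(self):
        self.un_committed = []

    def batch_created(self, root):
        # A new batch always goes to the end of the uncommitted queue
        self.un_committed.append(root)

    def batch_committed(self, root):
        # Commits arrive in creation order, so the oldest root must match
        assert self.un_committed and self.un_committed[0] == root
        self.un_committed.pop(0)

    def batch_rejected(self):
        # Only the newest, not-yet-committed batch can be rejected
        if self.un_committed:
            self.un_committed.pop()


def _uncommitted_tracker_example():
    t = UncommittedBatchTracker()
    t.batch_created('r1')
    t.batch_created('r2')
    t.batch_rejected()           # drops 'r2', like cache.batchRejected()
    t.batch_committed('r1')      # commits the oldest remaining root
    with pytest.raises(AssertionError):
        t.batch_committed('r1')  # committing the same root again must fail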
def test_catchup_to_next_view_during_view_change_0_to_1_then_1_to_2(
        txnPoolNodeSet, looper, sdk_pool_handle, sdk_wallet_steward):
    '''
    1) The lagging node is not a primary for the new views
    2) All nodes except the lagging one go to view=1
    3) All nodes except the lagging one order txns on view=1
    4) All nodes except the lagging one go to view=2
    5) All nodes except the lagging one order txns on view=2
    6) The lagging node gets InstanceChanges for view=1 => it changes to
       view=1 and catches up the txns ordered up to view=2
    7) The lagging node gets InstanceChanges for view=2 => it changes to view=2
    8) Make sure that the lagging node is up to date and can participate in
       consensus
    '''
    lagging_node = txnPoolNodeSet[0]
    other_nodes = txnPoolNodeSet[1:]
    initial_view_no = checkViewNoForNodes(txnPoolNodeSet)
    initial_last_ordered = lagging_node.master_last_ordered_3PC

    with delay_rules(lagging_node.nodeIbStasher,
                     icDelay(viewNo=2), vc_delay(view_no=2)):
        with delay_rules(lagging_node.nodeIbStasher,
                         delay_for_view(viewNo=0), delay_for_view(viewNo=1)):
            # view change to viewNo=1
            for n in txnPoolNodeSet:
                n.view_changer.on_master_degradation()
            waitForViewChange(looper, other_nodes,
                              expectedViewNo=initial_view_no + 1)
            checkProtocolInstanceSetup(looper=looper, nodes=other_nodes,
                                       instances=range(3))
            ensure_all_nodes_have_same_data(looper, nodes=other_nodes)

            # order some txns
            sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                                      sdk_wallet_steward, 5)

            # view change to viewNo=2
            for n in txnPoolNodeSet:
                n.view_changer.on_master_degradation()
            waitForViewChange(looper, other_nodes,
                              expectedViewNo=initial_view_no + 2)
            checkProtocolInstanceSetup(looper=looper, nodes=other_nodes,
                                       instances=range(3))
            ensure_all_nodes_have_same_data(looper, nodes=other_nodes)

            # order some txns
            sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                                      sdk_wallet_steward, 5)

            assert initial_view_no == lagging_node.viewNo
            assert initial_last_ordered == lagging_node.master_last_ordered_3PC

        # make sure that the first View Change happened on the lagging node
        waitForViewChange(looper, [lagging_node],
                          expectedViewNo=initial_view_no + 1,
                          customTimeout=20)
        assert initial_view_no + 1 == lagging_node.viewNo

    # make sure that the second View Change happened on the lagging node
    waitForViewChange(looper, [lagging_node],
                      expectedViewNo=initial_view_no + 2,
                      customTimeout=20)
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
    ensure_all_nodes_have_same_data(looper, nodes=other_nodes)

    # make sure that the pool is functional
    sdk_ensure_pool_functional(looper, txnPoolNodeSet,
                               sdk_wallet_steward, sdk_pool_handle)
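
# The nested delay_rules blocks above work because stasher delayers are just
# predicates: they are called with a (msg, sender) pair and return a delay in
# seconds, or None to let the message through (see delay_commits earlier in
# this file). The sketch below is a hypothetical stand-in showing how a
# view-filtered delayer in the spirit of icDelay(viewNo=...) or
# delay_for_view(viewNo=...) can be built; it is not the library
# implementation, and the InstanceChange import path is assumed.
from plenum.common.messages.node_messages import InstanceChange


def delay_instance_change_for_view(target_view_no, seconds=300):
    """Return a delayer that holds only InstanceChange msgs for one view."""

    def delayer(wrapped_msg):
        msg, sender = wrapped_msg
        # Delay only InstanceChange messages proposing the target view
        if isinstance(msg, InstanceChange) and msg.viewNo == target_view_no:
            return seconds
        return None

    return delayer

# Usage would mirror the test above, e.g.:
#   with delay_rules(lagging_node.nodeIbStasher,
#                    delay_instance_change_for_view(2)):
#       ...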
def test_node_detecting_lag_from_view_change_done_messages(
        txnPoolNodeSet, looper, wallet1, client1, client1Connected, tconf):
    """
    A node is slow, so after the view change starts it marks its
    `last_prepared` as less than the others'. After catchup it does not get
    any txns from the others and finds it has already ordered its
    `last_prepared`, but when it gets ViewChangeDone messages it starts
    catchup again and this time gets the txns.
    To achieve this, delay all 3PC messages to the node so that before the
    view change it has a different last_prepared from the others. Also delay
    processing of COMMITs and INSTANCE_CHANGEs by the other nodes.
    """
    send_reqs_batches_and_get_suff_replies(looper, wallet1, client1, 2 * 3, 3)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    slow_node = getNonPrimaryReplicas(txnPoolNodeSet, 0)[-1].node
    fast_nodes = [n for n in txnPoolNodeSet if n != slow_node]
    slow_master_replica = slow_node.master_replica
    fast_master_replicas = [n.master_replica for n in fast_nodes]

    delay_3pc = 50
    delay_ic = tconf.PerfCheckFreq + 5
    delay_commit = delay_ic + 10
    delay_3pc_messages([slow_node], 0, delay_3pc)
    for n in fast_nodes:
        n.nodeIbStasher.delay(icDelay(delay_ic))
        n.nodeIbStasher.delay(cDelay(delay_commit))

    reqs = []
    for i in range(10):
        reqs = reqs + sendRandomRequests(wallet1, client1, 2)
        looper.runFor(.2)

    def chk1():
        for rep in fast_master_replicas:
            assert compare_3PC_keys(
                slow_master_replica.last_prepared_certificate_in_view(),
                rep.last_prepared_certificate_in_view()) > 0
            assert slow_master_replica.last_ordered_3pc == rep.last_ordered_3pc

    looper.run(eventually(chk1))

    no_more_catchup_count = get_count(slow_node,
                                      slow_node.no_more_catchups_needed)

    # Track last prepared for master replica of each node
    prepareds = {}
    orig_methods = {}
    for node in txnPoolNodeSet:
        orig_methods[node.name] = node.master_replica.on_view_change_start

        def patched_on_view_change_start(self):
            orig_methods[self.node.name]()
            prepareds[self.node.name] = self.last_prepared_before_view_change

        node.master_replica.on_view_change_start = types.MethodType(
            patched_on_view_change_start, node.master_replica)

    ensure_view_change(looper, txnPoolNodeSet, exclude_from_check=fast_nodes)

    def chk2():
        # last_prepared of slow_node is less than fast_nodes'
        for rep in fast_master_replicas:
            assert compare_3PC_keys(prepareds[slow_master_replica.node.name],
                                    prepareds[rep.node.name]) > 0

    looper.run(eventually(chk2, timeout=delay_ic + 5))

    last_start_catchup_call_at = None
    no_more_catchup_call_at = None

    def chk3():
        # no_more_catchups_needed was called since the node found no need
        # of catchup
        nonlocal last_start_catchup_call_at, no_more_catchup_call_at
        assert (get_count(slow_node, slow_node.no_more_catchups_needed)
                - no_more_catchup_count) > 0

        no_more_catchup_call_at = slow_node.spylog.getLast(
            slow_node.no_more_catchups_needed).starttime
        last_start_catchup_call_at = slow_node.spylog.getLast(
            slow_node.start_catchup).starttime

    looper.run(eventually(chk3, timeout=delay_commit))

    for n in fast_nodes:
        n.nodeIbStasher.reset_delays_and_process_delayeds()
        n.nodeIbStasher.reset_delays_and_process_delayeds()

    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)

    assert slow_node.spylog.getLast(
        slow_node.start_catchup).starttime > no_more_catchup_call_at
    assert slow_node.spylog.getLast(
        slow_node.start_catchup).starttime > last_start_catchup_call_at
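
# chk1 and chk2 above lean on the compare_3PC_keys convention: a 3PC key is a
# (view_no, pp_seq_no) pair, and a positive result means the first key is
# *behind* the second, which is why the slow node's key compared against a
# fast node's must be > 0. Below is a minimal sketch of that ordering, under
# the assumption that keys are compared first by view and then by sequence
# number (names here are illustrative, not the library function itself).


def compare_3pc_keys_sketch(key1, key2):
    """Return >0 if key1 is behind key2, 0 if equal, <0 if key1 is ahead."""
    view1, seq1 = key1
    view2, seq2 = key2
    if view1 != view2:
        return view2 - view1
    return seq2 - seq1


assert compare_3pc_keys_sketch((0, 5), (0, 8)) > 0   # behind within a view
assert compare_3pc_keys_sketch((1, 1), (0, 99)) < 0  # a later view is ahead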
def testQueueingReqFromFutureView(delayedPerf, looper, nodeSet, up,
                                  wallet1, client1):
    """
    Test that every node queues 3-phase requests (PRE-PREPARE, PREPARE and
    COMMIT) that come from a view greater than the current view
    """
    f = getMaxFailures(nodeCount)

    # Delay processing of instance change on a node
    nodeA = nodeSet.Alpha
    nodeA.nodeIbStasher.delay(icDelay(60))

    nonPrimReps = getNonPrimaryReplicas(nodeSet, 0)
    # Delay processing of PRE-PREPARE from all non primary replicas of master
    # so master's throughput falls and the view changes
    ppDelayer = ppDelay(5, 0)
    for r in nonPrimReps:
        r.node.nodeIbStasher.delay(ppDelayer)

    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 4,
                                        timeoutPerReq=5 * nodeCount)

    # Every node except Node A should have a view change
    for node in nodeSet:
        if node.name != nodeA.name:
            looper.run(
                eventually(partial(checkViewChangeInitiatedForNode, node, 1),
                           retryWait=1, timeout=20))

    # Node A's view should not have changed yet
    with pytest.raises(AssertionError):
        looper.run(
            eventually(partial(checkViewChangeInitiatedForNode, nodeA, 1),
                       retryWait=1, timeout=20))

    # NodeA should not have any pending 3 phase request for a later view
    for r in nodeA.replicas:  # type: TestReplica
        assert len(r.threePhaseMsgsForLaterView) == 0

    # Reset delays on incoming messages from all nodes
    for node in nodeSet:
        node.nodeIbStasher.nodelay(ppDelayer)

    # Send one more request
    sendRandomRequest(wallet1, client1)

    def checkPending3PhaseReqs():
        # Get all replicas that have their primary status decided
        reps = [rep for rep in nodeA.replicas if rep.isPrimary is not None]
        # At least one replica should have its primary status decided
        assert len(reps) > 0
        for r in reps:  # type: TestReplica
            logger.debug("primary status for replica {} is {}".format(
                r, r.primaryNames))
            assert len(r.threePhaseMsgsForLaterView) > 0

    # NodeA should now have pending 3 phase requests for a later view
    looper.run(eventually(checkPending3PhaseReqs, retryWait=1, timeout=30))
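
# The check on threePhaseMsgsForLaterView captures the queueing rule itself: a
# replica must not process PRE-PREPARE/PREPARE/COMMIT messages stamped with a
# higher view than its own, but it must not drop them either; it stashes them
# until it reaches that view. Below is a minimal, self-contained sketch of
# that rule (hypothetical names, not the TestReplica implementation; msg is
# assumed to expose a viewNo attribute, as plenum 3PC messages do).


class FutureViewQueueSketch:
    """Stash 3-phase messages whose viewNo is greater than the current view."""

    def __init__(self, view_no=0):
        self.view_no = view_no
        self.three_phase_msgs_for_later_view = []
        self.processed = []

    def receive(self, msg):
        # Messages for the current or an earlier view are processed at once
        if msg.viewNo > self.view_no:
            self.three_phase_msgs_for_later_view.append(msg)
        else:
            self.processed.append(msg)

    def enter_view(self, new_view_no):
        # On a view change, replay every stashed message that is now current
        self.view_no = new_view_no
        pending, self.three_phase_msgs_for_later_view = \
            self.three_phase_msgs_for_later_view, []
        for msg in pending:
            self.receive(msg)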