def test_view_change_during_unstash(looper, txnPoolNodeSet, sdk_pool_handle,
                                    sdk_wallet_client, tconf):
    slow_node = txnPoolNodeSet[-1]
    other_nodes = txnPoolNodeSet[:-1]
    slow_stasher = slow_node.nodeIbStasher
    other_stashers = [n.nodeIbStasher for n in other_nodes]
    all_stashers = [n.nodeIbStasher for n in txnPoolNodeSet]

    # Preload nodes with some transactions
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 1)
    for node in txnPoolNodeSet:
        assert node.master_replica.last_ordered_3pc == (0, 1)

    # Prevent ordering of some requests
    start_delaying(all_stashers, delay_3pc(after=7, msgs=(Prepare, Commit)))

    # Stop ordering on slow node and send requests
    slow_node_after_5 = start_delaying(slow_stasher, delay_3pc(after=5, msgs=Commit))
    slow_node_until_5 = start_delaying(slow_stasher, delay_3pc(after=0))
    reqs_view_0 = sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client, 8)

    # Make pool order first 2 batches and pause
    pool_after_3 = start_delaying(other_stashers, delay_3pc(after=3))
    looper.run(eventually(check_nodes_ordered_till, other_nodes, 0, 3))

    # Start catchup, continue ordering everywhere (except two last batches on slow node)
    with delay_rules(slow_stasher, cr_delay()):
        slow_node._do_start_catchup(just_started=False)
        looper.run(eventually(check_catchup_is_started, slow_node))
        stop_delaying_and_process(pool_after_3)
        looper.run(eventually(check_nodes_ordered_till, other_nodes, 0, 7))

    # Finish catchup and continue processing on slow node
    looper.run(eventually(check_catchup_is_finished, slow_node))
    stop_delaying_and_process(slow_node_until_5)
    looper.run(eventually(check_nodes_ordered_till, [slow_node], 0, 5))

    # Start view change and allow slow node to get remaining commits
    with delay_rules(all_stashers, icDelay()):
        for node in txnPoolNodeSet:
            node.view_changer.on_master_degradation()
        looper.runFor(0.1)
    stop_delaying_and_process(slow_node_after_5)

    # Ensure that expected number of requests was ordered
    replies = sdk_get_replies(looper, reqs_view_0)
    for rep in replies[:6]:
        sdk_check_reply(rep)

    # Ensure that everything is ok
    ensureElectionsDone(looper, txnPoolNodeSet)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    sdk_ensure_pool_functional(looper, txnPoolNodeSet, sdk_wallet_client, sdk_pool_handle)
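# The helpers below are assumed to be imported by test_view_change_during_unstash.
# Minimal sketches (assumptions about their semantics, not the actual imported
# helpers) could look like this, reusing compare_3PC_keys and the node leecher
# state the way the tests further down do:

def check_nodes_ordered_till(nodes, view_no: int, pp_seq_no: int):
    # Every node must have ordered at least up to (view_no, pp_seq_no)
    for node in nodes:
        assert compare_3PC_keys((view_no, pp_seq_no),
                                node.master_replica.last_ordered_3pc) >= 0


def check_catchup_is_started(node):
    # While catchup is in progress the node leecher leaves the Idle state
    assert node.ledgerManager._node_leecher._state != NodeLeecherService.State.Idle


def check_catchup_is_finished(node):
    # Once catchup completes the node leecher returns to Idle
    assert node.ledgerManager._node_leecher._state == NodeLeecherService.State.Idle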
def test_catchup_from_unequal_nodes_without_waiting(looper,
                                                    txnPoolNodeSet,
                                                    sdk_pool_handle,
                                                    sdk_wallet_client):
    normal_node = txnPoolNodeSet[0]
    lagging_node_1 = txnPoolNodeSet[1]
    lagging_node_2 = txnPoolNodeSet[2]
    stopped_node = txnPoolNodeSet[3]

    # Make sure everyone has one batch
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 1)

    # Wait until all nodes have same data and store the last 3PC key of the node
    # that is going to be "stopped"
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet, custom_timeout=30)
    last_3pc = stopped_node.master_last_ordered_3PC

    with delay_rules_without_processing(stopped_node.nodeIbStasher, delay_3pc()):
        # Create one more batch on all nodes except the "stopped" node
        sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 1)

        with delay_rules(lagging_node_1.nodeIbStasher, delay_3pc(msgs=Commit)):
            # Create one more batch on all nodes except the "stopped" node and the first lagging node
            sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 1)

            with delay_rules(lagging_node_2.nodeIbStasher, delay_3pc(msgs=Commit)):
                # Create one more batch on all nodes except the "stopped" node and both lagging nodes.
                # This time we can't wait for replies because there will be only one.
                reqs = sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client, 1)

                # Wait until the normal node orders the txn
                looper.run(eventually(lambda: assert_eq(normal_node.master_last_ordered_3PC[1],
                                                        last_3pc[1] + 3)))

                # Now all nodes have a different number of txns, so if we start a catchup
                # it is guaranteed that we'll need to ask for equal consistency proofs,
                # and the disabled timeout ensures that the node can do so without
                # relying on a timeout
                stopped_node.start_catchup()

                # Wait until catchup ends
                looper.run(eventually(lambda: assert_eq(stopped_node.ledgerManager._node_leecher._state,
                                                        NodeLeecherService.State.Idle)))

                # Ensure the stopped node caught up at least one batch
                assert stopped_node.master_last_ordered_3PC[1] > last_3pc[1]

                # And there was no view change
                assert stopped_node.master_last_ordered_3PC[0] == last_3pc[0]

    # Make sure replies from the last request are eventually received
    sdk_get_and_check_replies(looper, reqs)
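# Several tests above and below use small assertion helpers inside eventually().
# If they are not available from the shared test utilities, minimal equivalents
# (a sketch, not necessarily the imported versions) are:

def assert_eq(actual, expected):
    assert actual == expected, "{} != {}".format(actual, expected)


def assert_in(member, container):
    assert member in container, "{} not found in {}".format(member, container)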
def test_node_reject_invalid_txn_during_catchup(looper, sdk_pool_handle,
                                                sdk_wallet_client, tconf, tdir,
                                                txnPoolNodeSet, bad_node,
                                                lagging_node):
    """
    Make sure that a catching-up node blacklists nodes that send incorrect
    catchup replies.
    """
    normal_nodes = [node for node in txnPoolNodeSet
                    if node not in [bad_node, lagging_node]]
    normal_stashers = [node.nodeIbStasher for node in normal_nodes]

    with delay_rules_without_processing(lagging_node.nodeIbStasher, delay_3pc()):
        sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 5)

        # Perform catchup, while making sure that the cons proof from the bad node
        # is received before cons proofs from normal nodes, so the bad node can
        # participate in catchup
        with delay_rules(normal_stashers, lsDelay()):
            lagging_node.start_catchup()

            node_leecher = lagging_node.ledgerManager._node_leecher
            audit_cons_proof_service = node_leecher._leechers[
                AUDIT_LEDGER_ID]._cons_proof_service
            looper.run(eventually(lambda: assert_in(
                bad_node.name, audit_cons_proof_service._cons_proofs)))

    waitNodeDataEquality(looper, lagging_node, *normal_nodes)
    assert lagging_node.isNodeBlacklisted(bad_node.name)
def test_catchup_with_reask_cp(txnPoolNodeSet,
                               looper,
                               sdk_pool_handle,
                               sdk_wallet_steward,
                               tconf,
                               tdir,
                               allPluginsPath):
    '''
    Start a catchup.
    Delay ConsistencyProofs twice.
    Check that the catchup finished.
    '''
    lagged_node = txnPoolNodeSet[-1]
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_steward, 5)

    with delay_rules_without_processing(lagged_node.nodeIbStasher,
                                        delay_3pc(),
                                        msg_rep_delay(types_to_delay=[COMMIT])):
        sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_steward, 2)
        lagged_node.nodeIbStasher.drop_delayeds()

    with delay_rules_without_processing(lagged_node.nodeIbStasher, cpDelay()):
        lagged_node.start_catchup()

        def chk():
            cp_count = 0
            for msg in lagged_node.nodeIbStasher.delayeds:
                if isinstance(msg.item[0], ConsistencyProof):
                    cp_count += 1
            assert cp_count >= (len(txnPoolNodeSet) - 1) * 2
            lagged_node.nodeIbStasher.drop_delayeds()

        looper.run(eventually(chk))

    waitNodeDataEquality(looper, lagged_node, *txnPoolNodeSet,
                         exclude_from_check=['check_last_ordered_3pc_backup'])
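# Stasher delayers such as cpDelay() above are plain callables: the stasher calls
# the delayer with each incoming (message, sender) item, and a positive return
# value is the number of seconds to hold the message back. A minimal sketch of a
# ConsistencyProof delayer under that assumption (cp_delay_sketch is hypothetical,
# illustrating the protocol rather than reproducing the imported cpDelay):

def cp_delay_sketch(seconds: float = 300):
    def delayer(item):
        msg, frm = item
        if isinstance(msg, ConsistencyProof):
            return seconds  # hold ConsistencyProofs back for this long

    delayer.__name__ = "cp_delay_sketch({})".format(seconds)
    return delayer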
def test_catchup_with_all_nodes_sending_cons_proofs_dead(looper,
                                                         txnPoolNodeSet,
                                                         sdk_pool_handle,
                                                         sdk_wallet_client,
                                                         logsearch):
    lagging_node = txnPoolNodeSet[-1]
    other_nodes = txnPoolNodeSet[:-1]
    start_delaying(lagging_node.nodeIbStasher, delay_3pc())
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 10)

    log_re_ask, _ = logsearch(
        msgs=['requesting .* missing transactions after timeout'])
    old_re_ask_count = len(log_re_ask)

    catchup_reqs = {node.name: start_delaying(node.nodeIbStasher, cqDelay())
                    for node in other_nodes}
    audit_catchup_service = lagging_node.ledgerManager._node_leecher._leechers[
        AUDIT_LEDGER_ID]._catchup_rep_service
    lagging_node.start_catchup()
    looper.run(eventually(lambda: assert_eq(audit_catchup_service._is_working, True)))

    # Make sure a quorum of cons proofs was gathered while all nodes are alive
    assert len(audit_catchup_service._nodes_ledger_sizes) == 3

    # Allow catchup requests only for nodes that didn't provide cons proofs
    for node_id, node_reqs in catchup_reqs.items():
        if node_id not in audit_catchup_service._nodes_ledger_sizes:
            stop_delaying_and_process(node_reqs)

    # Check that catchup finishes successfully and that there were re-asks
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    assert len(log_re_ask) - old_re_ask_count > 0
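# Note on the asserts above: the catchup leecher only needs a quorum of matching
# consistency proofs before it starts requesting transactions. Assuming the usual
# f = (N - 1) // 3 fault bound with an f + 1 cons-proof quorum, and a pool fixture
# sized so that exactly 3 proofs form that quorum (e.g. a nodeCount = 7 override,
# which is an assumption here), the expected count checks out:

def cons_proof_quorum(n_nodes: int) -> int:
    f = (n_nodes - 1) // 3  # maximum number of tolerated faulty nodes
    return f + 1

assert cons_proof_quorum(7) == 3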
def test_missing_pp_before_starting_vc(tconf, txnPoolNodeSet, looper,
                                       sdk_pool_handle, sdk_wallet_steward):
    '''
    - all nodes delay PrePrepares for viewNo=1 with ppSeqNo<4
    - all nodes go to view=1
    - 10 batches are sent (they will not be ordered because of the missing
      (delayed) PrePrepares)
    - all nodes start a view change to view=2
    - all nodes finish the view change successfully and can continue ordering
    '''
    all_stashers = [n.nodeIbStasher for n in txnPoolNodeSet]

    # 1. delay PrePrepares with ppSeqNo<4
    with delay_rules(all_stashers, delay_3pc(view_no=1, before=4, msgs=PrePrepare)):
        # 2. do view change for view=1
        ensure_view_change(looper, txnPoolNodeSet)
        looper.run(eventually(check_not_in_view_change, txnPoolNodeSet))

        # 3. send requests
        sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_steward, 10)

        # 4. do view change for view=2
        ensure_view_change(looper, txnPoolNodeSet)

    # 5. ensure everything is fine
    ensureElectionsDone(looper, txnPoolNodeSet)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
    sdk_ensure_pool_functional(looper, txnPoolNodeSet, sdk_wallet_steward, sdk_pool_handle)
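# check_not_in_view_change above is assumed to come from the shared view-change
# helpers; a minimal sketch, under the assumption that a node counts as "in view
# change" while its master replica is still waiting for a NewView message:

def check_not_in_view_change(nodes):
    for node in nodes:
        assert not node.master_replica._consensus_data.waiting_for_new_view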
def test_catchup_uses_only_nodes_with_cons_proofs(looper,
                                                  txnPoolNodeSet,
                                                  sdk_pool_handle,
                                                  sdk_wallet_client):
    lagging_node = txnPoolNodeSet[-1]
    other_nodes = txnPoolNodeSet[:-1]
    start_delaying(lagging_node.nodeIbStasher, delay_3pc())
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 10)

    catchup_reqs = {node.name: start_delaying(node.nodeIbStasher, cqDelay())
                    for node in other_nodes}
    audit_catchup_service = lagging_node.ledgerManager._node_leecher._leechers[
        AUDIT_LEDGER_ID]._catchup_rep_service
    lagging_node.start_catchup()
    looper.run(eventually(lambda: assert_eq(audit_catchup_service._is_working, True)))

    # Make sure a quorum of cons proofs was gathered while all nodes are alive
    assert len(audit_catchup_service._nodes_ledger_sizes) == 3

    # Allow catchup requests only for nodes that provided cons proofs
    for node_id in audit_catchup_service._nodes_ledger_sizes.keys():
        stop_delaying_and_process(catchup_reqs[node_id])

    # Check catchup finishes successfully
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet, custom_timeout=30)
def test_re_order_pre_prepares_no_pre_prepares(looper, txnPoolNodeSet,
                                               sdk_wallet_client, sdk_pool_handle):
    # 1. drop PrePrepares, Prepares and Commits on the 4th node,
    # and order a couple of requests on nodes 1-3
    lagging_node = txnPoolNodeSet[-1]
    other_nodes = txnPoolNodeSet[:-1]
    with delay_rules_without_processing(lagging_node.nodeIbStasher, delay_3pc()):
        sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 3)
        assert all(n.master_last_ordered_3PC == (0, 3) for n in other_nodes)

    with delay_rules_without_processing(
            lagging_node.nodeIbStasher,
            msg_rep_delay(types_to_delay=[PREPREPARE, PREPARE, COMMIT])):
        # 2. simulate view change start so that
        # all PrePrepares/Prepares/Commits are cleared
        # and uncommitted txns are reverted
        for n in txnPoolNodeSet:
            n.replicas.send_to_internal_bus(ViewChangeStarted(view_no=1))

            master_ordering_service = n.master_replica._ordering_service
            assert not master_ordering_service.prePrepares
            assert not master_ordering_service.prepares
            assert not master_ordering_service.commits

            ledger = n.db_manager.ledgers[DOMAIN_LEDGER_ID]
            state = n.db_manager.states[DOMAIN_LEDGER_ID]
            assert len(ledger.uncommittedTxns) == 0
            assert ledger.uncommitted_root_hash == ledger.tree.root_hash
            assert state.committedHead == state.head

        # check that all nodes but the lagging one have old_view_pps stored
        for n in other_nodes:
            assert n.master_replica._ordering_service.old_view_preprepares
        assert not lagging_node.master_replica._ordering_service.old_view_preprepares

        # 3. simulate view change finish to re-order the same PrePrepares
        assert lagging_node.master_last_ordered_3PC == (0, 0)
        new_master = txnPoolNodeSet[1]
        batches = sorted([preprepare_to_batch_id(pp)
                          for _, pp in new_master.master_replica.
                          _ordering_service.old_view_preprepares.items()])
        new_view_msg = NewView(viewNo=0,
                               viewChanges=[],
                               checkpoint=None,
                               batches=batches)
        new_view_chk_applied_msg = NewViewCheckpointsApplied(view_no=0,
                                                             view_changes=[],
                                                             checkpoint=None,
                                                             batches=batches)
        for n in txnPoolNodeSet:
            n.master_replica._consensus_data.new_view = new_view_msg
            n.master_replica._consensus_data.prev_view_prepare_cert = batches[-1].pp_seq_no
            n.master_replica._ordering_service._bus.send(new_view_chk_applied_msg)

        # 4. make sure that nodes 1-3 (that already ordered the requests) send
        # Prepares and Commits, so that the requests are eventually ordered
        # on the 4th node as well
        waitNodeDataEquality(looper, lagging_node, *other_nodes, customTimeout=60)
        assert lagging_node.master_last_ordered_3PC == (0, 4)

    sdk_ensure_pool_functional(looper, txnPoolNodeSet, sdk_wallet_client, sdk_pool_handle)
def test_catchup_from_unequal_nodes_without_reasking(looper,
                                                     txnPoolNodeSet,
                                                     sdk_pool_handle,
                                                     sdk_wallet_client):
    lagged_node_1 = txnPoolNodeSet[-1]
    lagged_node_2 = txnPoolNodeSet[-2]
    normal_nodes = [node for node in txnPoolNodeSet
                    if node not in [lagged_node_1, lagged_node_2]]
    normal_stashers = [node.nodeIbStasher for node in normal_nodes]

    with delay_rules_without_processing(lagged_node_1.nodeIbStasher, delay_3pc()):
        sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 2)

        with delay_rules_without_processing(lagged_node_2.nodeIbStasher, delay_3pc()):
            sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 7)
            ensure_all_nodes_have_same_data(looper, normal_nodes, custom_timeout=30)

            # Perform catchup, while making sure that the cons proof from the
            # lagging node is received before cons proofs from normal nodes,
            # so the lagging node can participate in catchup
            with delay_rules(normal_stashers, lsDelay()):
                lagged_node_1.start_catchup()

                node_leecher = lagged_node_1.ledgerManager._node_leecher
                audit_cons_proof_service = node_leecher._leechers[
                    AUDIT_LEDGER_ID]._cons_proof_service
                looper.run(eventually(lambda: assertExp(
                    lagged_node_2.name in audit_cons_proof_service._cons_proofs)))

            # Make sure catchup finishes successfully
            ensure_all_nodes_have_same_data(looper,
                                            set(txnPoolNodeSet) - {lagged_node_2},
                                            custom_timeout=30)
            assert lagged_node_1.ledgerManager._node_leecher._state == \
                NodeLeecherService.State.Idle
def test_catchup_with_old_txn_metadata_digest_format(tdir, tconf, looper,
                                                     txnPoolNodeSet,
                                                     sdk_pool_handle,
                                                     sdk_wallet_client,
                                                     monkeypatch):
    lagging_node = txnPoolNodeSet[-1]
    lagging_stasher = lagging_node.nodeIbStasher
    other_nodes = txnPoolNodeSet[:-1]

    # Utility
    def check_nodes_domain_ledger(nodes: Iterable, txn_count: int):
        for node in nodes:
            assert node.domainLedger.size >= txn_count

    # Patch payload metadata; note that this prevents the pool from sending
    # adequate replies to clients
    def append_old_payload_metadata(txn,
                                    frm=None, req_id=None, digest=None,
                                    payload_digest=None, taa_acceptance=None,
                                    endorser=None):
        txn = append_payload_metadata(txn, frm, req_id, digest,
                                      payload_digest, taa_acceptance, endorser)
        metadata = txn[TXN_PAYLOAD][TXN_PAYLOAD_METADATA]
        del metadata[TXN_PAYLOAD_METADATA_PAYLOAD_DIGEST]
        metadata[TXN_PAYLOAD_METADATA_DIGEST] = payload_digest
        return txn

    monkeypatch.setattr(txn_util, 'append_payload_metadata',
                        append_old_payload_metadata)

    # Check pool initial state
    initial_size = txnPoolNodeSet[0].domainLedger.size
    for node in txnPoolNodeSet:
        assert node.domainLedger.size == initial_size

    # Order some transactions, with one node discarding messages
    with delay_rules_without_processing(lagging_stasher, delay_3pc()):
        reps = sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client, 10)
        looper.run(eventually(check_nodes_domain_ledger, other_nodes, initial_size + 10))
        assert lagging_node.domainLedger.size == initial_size

    # Catchup transactions and ensure that all nodes eventually have the same data
    lagging_node.start_catchup()
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)

    # Catch replies
    sdk_get_replies(looper, reps)
def test_max_3pc_batches_in_flight(tdir, tconf, looper, txnPoolNodeSet,
                                   sdk_pool_handle, sdk_wallet_client):
    # Check pool initial state
    initial_3pc = txnPoolNodeSet[0].master_replica.last_ordered_3pc
    for node in txnPoolNodeSet[1:]:
        assert node.master_replica.last_ordered_3pc == initial_3pc

    # Utility
    def check_ordered_till(pp_seq_no: int):
        for node in txnPoolNodeSet:
            last_ordered = node.master_replica.last_ordered_3pc
            assert last_ordered[0] == initial_3pc[0]
            assert last_ordered[1] == pp_seq_no

    # Delay some commits
    all_stashers = [node.nodeIbStasher for node in txnPoolNodeSet]
    delayers = []
    for num in range(BATCHES_TO_ORDER):
        pp_seq_no = initial_3pc[1] + num + 1
        delayer = start_delaying(all_stashers,
                                 delay_3pc(after=pp_seq_no - 1, before=pp_seq_no + 1))
        delayers.append((pp_seq_no, delayer))

    # Send a number of requests
    reqs = sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client,
                                    BATCHES_TO_ORDER)

    # Continuously check the number of batches in flight
    for pp_seq_no, delayer in delayers:
        stop_delaying_and_process(delayer)
        looper.run(eventually(check_ordered_till, pp_seq_no))

        for node in txnPoolNodeSet:
            for replica in node.replicas.values():
                batches_in_flight = replica.lastPrePrepareSeqNo - replica.last_ordered_3pc[1]
                assert batches_in_flight <= MAX_BATCHES_IN_FLIGHT

    # Check all requests are ordered
    sdk_get_and_check_replies(looper, reqs)

    # Ensure that all nodes will eventually have same data
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
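# BATCHES_TO_ORDER and MAX_BATCHES_IN_FLIGHT above are module-level constants, and
# the in-flight limit only holds if the pool config enforces it. A plausible setup
# sketch, assuming a Max3PCBatchesInFlight config knob (that attribute name, the
# concrete values, and the pytest import are assumptions for illustration):

BATCHES_TO_ORDER = 10
MAX_BATCHES_IN_FLIGHT = 4


@pytest.fixture(scope="module")
def tconf(tconf):
    old_max_batches = tconf.Max3PCBatchesInFlight  # assumed config attribute
    tconf.Max3PCBatchesInFlight = MAX_BATCHES_IN_FLIGHT
    yield tconf
    tconf.Max3PCBatchesInFlight = old_max_batches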
def test_set_H_as_maxsize_for_backup_if_is_primary(looper,
                                                   txnPoolNodeSet,
                                                   sdk_pool_handle,
                                                   sdk_wallet_steward,
                                                   tconf,
                                                   tdir,
                                                   allPluginsPath):
    ensure_view_change(looper, txnPoolNodeSet)
    ensureElectionsDone(looper, txnPoolNodeSet)
    primary_on_backup = txnPoolNodeSet[2]
    assert primary_on_backup.replicas._replicas[1].isPrimary

    # Stop the node
    disconnect_node_and_ensure_disconnected(looper,
                                            txnPoolNodeSet,
                                            primary_on_backup,
                                            stopNode=True)
    txnPoolNodeSet.remove(primary_on_backup)
    looper.removeProdable(primary_on_backup)

    # Start the stopped node
    primary_on_backup = start_stopped_node(primary_on_backup,
                                           looper,
                                           tconf,
                                           tdir,
                                           allPluginsPath)

    # Delay 3PC messages so that the restarted node does not have them ordered
    with delay_rules(primary_on_backup.nodeIbStasher, delay_3pc()):
        txnPoolNodeSet.append(primary_on_backup)
        ensureElectionsDone(looper, txnPoolNodeSet,
                            customTimeout=tconf.NEW_VIEW_TIMEOUT)
        sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                                  sdk_wallet_steward, LOG_SIZE)

        # Check restored state
        assert primary_on_backup.replicas._replicas[1].isPrimary
        assert primary_on_backup.replicas._replicas[1].h == 1
        assert primary_on_backup.replicas._replicas[1].H == 1 + LOG_SIZE

    def chk():
        assert primary_on_backup.replicas._replicas[1].h == LOG_SIZE
        assert primary_on_backup.replicas._replicas[1].H == LOG_SIZE + LOG_SIZE

    # Check caught-up state
    looper.run(eventually(chk, retryWait=.2, timeout=tconf.NEW_VIEW_TIMEOUT))
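# The watermark asserts above follow the invariant H = h + LOG_SIZE: the restarted
# node's backup replica starts with h = 1, and after ordering LOG_SIZE batches the
# window slides to [LOG_SIZE, 2 * LOG_SIZE]. A quick check of that arithmetic
# (pure illustration, no pool needed; 3 is a stand-in for the imported LOG_SIZE):

LOG_SIZE_EXAMPLE = 3
for h_val, H_val in [(1, 1 + LOG_SIZE_EXAMPLE),
                     (LOG_SIZE_EXAMPLE, 2 * LOG_SIZE_EXAMPLE)]:
    assert H_val == h_val + LOG_SIZE_EXAMPLE  # window size stays at LOG_SIZE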
def test_catchup_with_reask_ls(txnPoolNodeSet,
                               looper,
                               sdk_pool_handle,
                               sdk_wallet_steward,
                               tconf,
                               tdir,
                               allPluginsPath):
    '''
    Start a catchup.
    Delay MessageReps with LedgerStatuses twice.
    Check that the catchup finished.
    '''
    lagged_node = txnPoolNodeSet[-1]
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_steward, 5)
    lagged_node.nodeIbStasher.delay(msg_rep_delay(types_to_delay=[COMMIT]))

    with delay_rules_without_processing(
            lagged_node.nodeIbStasher,
            delay_3pc(),
            msg_rep_delay(types_to_delay=[COMMIT])):
        sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_steward, 2)
        lagged_node.nodeIbStasher.drop_delayeds()

    with delay_rules_without_processing(
            lagged_node.nodeIbStasher,
            lsDelay(),
            msg_rep_delay(types_to_delay=[LEDGER_STATUS])):
        lagged_node.start_catchup()

        def chk():
            resp_ls_count = 0
            for msg in lagged_node.nodeIbStasher.delayeds:
                if isinstance(msg.item[0], MessageRep) and \
                        msg.item[0].msg_type == LEDGER_STATUS:
                    resp_ls_count += 1
            assert resp_ls_count >= (len(txnPoolNodeSet) - 1) * 2
            lagged_node.nodeIbStasher.drop_delayeds()

        looper.run(eventually(chk))

    waitNodeDataEquality(looper, lagged_node, *txnPoolNodeSet,
                         exclude_from_check=['check_last_ordered_3pc_backup'])
def test_node_requests_missing_preprepares_and_prepares(
        looper, txnPoolNodeSet, sdk_wallet_client, sdk_pool_handle, tconf,
        tdir, allPluginsPath):
    """
    2 of 4 nodes go down (simulated by dropping requests), so the pool cannot
    process any more incoming requests. A new request comes in. After a while
    those 2 nodes come back alive. Another request comes in. Check that the
    previously disconnected nodes request the missing PREPREPAREs and PREPAREs
    and the pool successfully handles both transactions after that.
    """
    INIT_REQS_CNT = 5
    MISSING_REQS_CNT = 4
    REQS_AFTER_RECONNECT_CNT = 1
    disconnected_nodes = txnPoolNodeSet[2:]
    alive_nodes = txnPoolNodeSet[:2]
    disconnected_nodes_stashers = [n.nodeIbStasher for n in disconnected_nodes]

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, INIT_REQS_CNT)
    init_ledger_size = txnPoolNodeSet[0].domainLedger.size

    with delay_rules_without_processing(disconnected_nodes_stashers, delay_3pc()):
        sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client,
                                 MISSING_REQS_CNT)
        last_ordered_key = txnPoolNodeSet[0].master_replica.last_ordered_3pc
        looper.run(eventually(check_pp_out_of_sync,
                              alive_nodes,
                              disconnected_nodes,
                              last_ordered_key,
                              retryWait=1,
                              timeout=expectedPoolGetReadyTimeout(len(txnPoolNodeSet))))

    for node in txnPoolNodeSet:
        assert node.domainLedger.size == init_ledger_size
    for node in disconnected_nodes:
        assert node.master_replica._ordering_service.spylog.count(
            OrderingService._request_pre_prepare) == 0
        assert node.master_replica._ordering_service.spylog.count(
            OrderingService._request_prepare) == 0
        assert node.master_replica._message_req_service.spylog.count(
            MessageReqService.process_message_rep) == 0

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, REQS_AFTER_RECONNECT_CNT)
    waitNodeDataEquality(looper, disconnected_nodes[0], *txnPoolNodeSet[:-1])

    for node in disconnected_nodes:
        assert node.master_replica._ordering_service.spylog.count(
            OrderingService._request_pre_prepare) > 0
        assert node.master_replica._ordering_service.spylog.count(
            OrderingService._request_prepare) > 0
        assert node.master_replica._message_req_service.spylog.count(
            MessageReqService.process_message_rep) > 0

    def check_all_ordered():
        for node in txnPoolNodeSet:
            assert node.domainLedger.size == (init_ledger_size +
                                              MISSING_REQS_CNT +
                                              REQS_AFTER_RECONNECT_CNT)

    looper.run(eventually(check_all_ordered, timeout=20))
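# check_pp_out_of_sync is shared by the reconnection tests above and below. A
# sketch of its expected semantics (an assumption about the imported helper, not
# its actual body): alive nodes have created 3PC batches past the last ordered
# key, while the disconnected nodes still sit on it.

def check_pp_out_of_sync(alive_nodes, disconnected_nodes, last_ordered_key):
    for node in alive_nodes:
        # Alive nodes issued PrePrepares beyond the last ordered batch
        assert node.master_replica.lastPrePrepareSeqNo > last_ordered_key[1]
    for node in disconnected_nodes:
        # Disconnected nodes have not moved past the stored 3PC key
        assert node.master_replica.last_ordered_3pc == last_ordered_key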
def test_catchup_with_skipped_commits(tdir, tconf, looper, txnPoolNodeSet,
                                      sdk_pool_handle, sdk_wallet_client):
    lagging_node = txnPoolNodeSet[-1]
    lagging_stasher = lagging_node.nodeIbStasher
    other_nodes = txnPoolNodeSet[:-1]
    other_stashers = [node.nodeIbStasher for node in other_nodes]

    def lagging_node_state() -> NodeLeecherService.State:
        return lagging_node.ledgerManager._node_leecher._state

    def check_lagging_node_is_not_syncing_audit():
        assert lagging_node_state() != NodeLeecherService.State.SyncingAudit

    def check_lagging_node_done_catchup():
        assert lagging_node_state() == NodeLeecherService.State.Idle

    def check_nodes_ordered_till(nodes: Iterable, view_no: int, pp_seq_no: int):
        for node in nodes:
            assert compare_3PC_keys((view_no, pp_seq_no),
                                    node.master_replica.last_ordered_3pc) >= 0

    # Preload nodes with some transactions
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 1)
    for node in txnPoolNodeSet:
        assert node.master_replica.last_ordered_3pc == (0, 1)

    # Setup delayers
    lagging_mid_commits = start_delaying(lagging_stasher,
                                         delay_3pc(after=3, before=6, msgs=Commit))
    others_mid_commits = start_delaying(other_stashers,
                                        delay_3pc(after=3, before=6, msgs=Commit))
    start_delaying(lagging_stasher, delay_3pc(before=4, msgs=Commit))

    # Send more requests
    reqs = sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client, 6)

    # Wait until pool ordered till (0, 3)
    looper.run(eventually(check_nodes_ordered_till, other_nodes, 0, 3))
    assert lagging_node.master_replica.last_ordered_3pc == (0, 1)

    with delay_rules(lagging_stasher, delay_catchup(DOMAIN_LEDGER_ID)):
        with delay_rules(lagging_stasher, delay_catchup(AUDIT_LEDGER_ID)):
            # Start catchup
            lagging_node.start_catchup()
            looper.runFor(0.5)
            assert lagging_node_state() == NodeLeecherService.State.SyncingAudit

            # Process missing commits on lagging node
            stop_delaying_and_process(lagging_mid_commits)
            looper.runFor(0.5)

        # Allow the audit ledger to be caught up
        looper.run(eventually(check_lagging_node_is_not_syncing_audit))
        stop_delaying_and_process(others_mid_commits)

    # Ensure that the lagging node finishes catchup
    looper.run(eventually(check_lagging_node_done_catchup))

    # Ensure that all requests were ordered
    sdk_get_and_check_replies(looper, reqs)

    # Ensure that all nodes will eventually have same data
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
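# delay_catchup(ledger_id) used above is assumed to be a module-level variant of
# the local helper defined inside the checker below: a CatchupRep delayer scoped
# to a single ledger, built on the same cr_delay primitive.

def delay_catchup(ledger_id: int):
    _delayer = cr_delay(ledger_filter=ledger_id)
    _delayer.__name__ = "delay_catchup({})".format(ledger_id)
    return _delayer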
def test_node_requests_missing_preprepares_prepares_and_commits(
        looper, txnPoolNodeSet, sdk_wallet_client, sdk_pool_handle,
        tdir, allPluginsPath):
    """
    1 of 4 nodes goes down (simulated by dropping requests). A new request
    comes in and is ordered by the 3 remaining nodes. After a while the
    previously disconnected node comes back alive. Another request comes in.
    Check that the previously disconnected node requests the missing
    PREPREPAREs, PREPAREs and COMMITs, orders the previous request, and that
    all nodes successfully handle the last request.
    """
    INIT_REQS_CNT = 5
    MISSING_REQS_CNT = 4
    REQS_AFTER_RECONNECT_CNT = 1
    disconnected_node = txnPoolNodeSet[3]
    alive_nodes = txnPoolNodeSet[:3]
    disconnected_node_stashers = disconnected_node.nodeIbStasher

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, INIT_REQS_CNT)
    init_ledger_size = txnPoolNodeSet[0].domainLedger.size

    with delay_rules_without_processing(disconnected_node_stashers, delay_3pc()):
        last_ordered_key = txnPoolNodeSet[0].master_replica.last_ordered_3pc
        sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                                  sdk_wallet_client, MISSING_REQS_CNT)
        looper.run(eventually(check_pp_out_of_sync,
                              alive_nodes,
                              [disconnected_node],
                              last_ordered_key,
                              retryWait=1,
                              timeout=expectedPoolGetReadyTimeout(len(txnPoolNodeSet))))

    for node in alive_nodes:
        assert node.domainLedger.size == init_ledger_size + MISSING_REQS_CNT
    # Ensure that the reconnected node has not caught up yet
    assert disconnected_node.domainLedger.size == init_ledger_size

    ordering_service = disconnected_node.master_replica._ordering_service
    assert ordering_service.spylog.count(OrderingService._request_pre_prepare) == 0
    assert ordering_service.spylog.count(OrderingService._request_prepare) == 0
    assert ordering_service.spylog.count(OrderingService._request_commit) == 0
    assert disconnected_node.master_replica.spylog.count(
        Replica.process_requested_pre_prepare) == 0
    assert disconnected_node.master_replica.spylog.count(
        Replica.process_requested_prepare) == 0
    assert disconnected_node.master_replica.spylog.count(
        Replica.process_requested_commit) == 0

    doOrderTimesBefore = ordering_service.spylog.count(OrderingService._do_order)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, REQS_AFTER_RECONNECT_CNT)
    waitNodeDataEquality(looper, disconnected_node, *alive_nodes)

    assert ordering_service.spylog.count(OrderingService._request_pre_prepare) > 0
    assert ordering_service.spylog.count(OrderingService._request_prepare) > 0
    assert ordering_service.spylog.count(OrderingService._request_commit) > 0
    assert disconnected_node.master_replica.spylog.count(
        Replica.process_requested_pre_prepare) > 0
    assert disconnected_node.master_replica.spylog.count(
        Replica.process_requested_prepare) > 0
    assert disconnected_node.master_replica.spylog.count(
        Replica.process_requested_commit) > 0

    doOrderTimesAfter = ordering_service.spylog.count(OrderingService._do_order)
    # Ensure that the reconnected node has ordered both the missed 3PC-batch
    # and the new 3PC-batch
    assert doOrderTimesAfter - doOrderTimesBefore == 2

    def check_all_ordered():
        for node in txnPoolNodeSet:
            assert node.domainLedger.size == (init_ledger_size +
                                              MISSING_REQS_CNT +
                                              REQS_AFTER_RECONNECT_CNT)

    looper.run(eventually(check_all_ordered, timeout=20))
def check_catchup_with_skipped_commits_received_before_catchup(
        catchup_state, looper, txnPoolNodeSet, sdk_pool_handle,
        sdk_wallet_new_steward):
    lagging_node = txnPoolNodeSet[-1]
    lagging_stasher = lagging_node.nodeIbStasher
    other_nodes = txnPoolNodeSet[:-1]

    def delay_catchup(catchup_state: NodeLeecherService.State):
        ledger_id = NodeLeecherService.state_to_ledger[catchup_state]
        _delayer = cr_delay(ledger_filter=ledger_id)
        _delayer.__name__ = "delay_catchup({})".format(ledger_id)
        return _delayer

    def check_lagging_node_catchup_state(state: NodeLeecherService.State):
        assert lagging_node.ledgerManager._node_leecher._state == state

    def check_nodes_ordered_till(nodes: Iterable, view_no: int, pp_seq_no: int):
        for node in nodes:
            assert compare_3PC_keys((view_no, pp_seq_no),
                                    node.master_replica.last_ordered_3pc) >= 0

    # Make sure pool is in expected state
    init_pp_seq_no = txnPoolNodeSet[0].master_replica.last_ordered_3pc[1]
    for node in txnPoolNodeSet:
        assert node.master_replica.last_ordered_3pc == (0, init_pp_seq_no)

    # Order pool requests while delaying first two commits on lagging node
    with delay_rules(lagging_stasher,
                     delay_3pc(before=init_pp_seq_no + 3, msgs=Commit)):
        # Send some pool requests
        reqs = sdk_send_random_pool_requests(looper, sdk_pool_handle,
                                             sdk_wallet_new_steward, 4)
        sdk_get_and_check_replies(looper, reqs)

        # Make sure pool is in expected state
        for node in other_nodes:
            assert node.master_replica.last_ordered_3pc == (0, init_pp_seq_no + 4)
        assert lagging_node.master_replica.last_ordered_3pc == (0, init_pp_seq_no)

    # Wait until two batches with delayed commits are ordered, but not more
    looper.run(eventually(check_nodes_ordered_till,
                          [lagging_node], 0, init_pp_seq_no + 2))
    assert lagging_node.master_replica.last_ordered_3pc == (0, init_pp_seq_no + 2)

    with delay_rules(lagging_stasher, delay_catchup(catchup_state)):
        # Start catchup
        lagging_node.start_catchup()

        # Wait until catchup reaches desired state
        looper.run(eventually(check_lagging_node_catchup_state, catchup_state))

        # Emulate scheduled action
        lagging_node.master_replica._ordering_service.\
            _process_stashed_out_of_order_commits()

    # Ensure that the lagging node finishes catchup
    looper.run(eventually(check_lagging_node_catchup_state,
                          NodeLeecherService.State.Idle))

    # Ensure that all nodes will eventually have same data
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
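# check_catchup_with_skipped_commits_received_before_catchup is written as a
# reusable checker rather than a test; a plausible (hypothetical) driver would
# parametrize it over the leecher states reached during catchup, along these
# lines (the State members besides SyncingAudit and Idle are assumptions):

@pytest.mark.parametrize('catchup_state', [
    NodeLeecherService.State.SyncingAudit,
    NodeLeecherService.State.SyncingPool,
    NodeLeecherService.State.SyncingConfig,
    NodeLeecherService.State.SyncingOthers,
])
def test_catchup_with_skipped_commits_received_before_catchup(
        catchup_state, looper, txnPoolNodeSet, sdk_pool_handle,
        sdk_wallet_new_steward):
    check_catchup_with_skipped_commits_received_before_catchup(
        catchup_state, looper, txnPoolNodeSet, sdk_pool_handle,
        sdk_wallet_new_steward)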
def test_node_requests_missing_preprepares_and_prepares_after_long_disconnection(
        looper, txnPoolNodeSet, sdk_wallet_client, sdk_pool_handle, tconf,
        tdirWithPoolTxns, tdir, allPluginsPath):
    """
    2 of 4 nodes go down (simulated by dropping requests), so the pool cannot
    process any more incoming requests. A new request comes in. The test then
    waits long enough for the PrePrepare to become old enough to be dropped by
    the time checker. The two stopped nodes come back alive. Another request
    comes in. Check that the previously disconnected nodes request the missing
    PREPREPAREs and PREPAREs and the pool successfully handles both
    transactions.
    """
    INIT_REQS_CNT = 5
    MISSING_REQS_CNT = 4
    REQS_AFTER_RECONNECT_CNT = 1
    alive_nodes = []
    disconnected_nodes = []

    for node in txnPoolNodeSet:
        if node.hasPrimary:
            alive_nodes.append(node)
        else:
            disconnected_nodes.append(node)
    disconnected_nodes_stashers = [n.nodeIbStasher for n in disconnected_nodes]

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, INIT_REQS_CNT)
    waitNodeDataEquality(looper, disconnected_nodes[0], *txnPoolNodeSet)
    init_ledger_size = txnPoolNodeSet[0].domainLedger.size

    with delay_rules_without_processing(disconnected_nodes_stashers, delay_3pc()):
        sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client,
                                 MISSING_REQS_CNT)
        last_ordered_key = txnPoolNodeSet[0].master_replica.last_ordered_3pc
        looper.run(eventually(check_pp_out_of_sync,
                              alive_nodes,
                              disconnected_nodes,
                              last_ordered_key,
                              retryWait=1,
                              timeout=expectedPoolGetReadyTimeout(len(txnPoolNodeSet))))

        preprepare_deviation = 4
        tconf.ACCEPTABLE_DEVIATION_PREPREPARE_SECS = preprepare_deviation
        time.sleep(preprepare_deviation * 2)

    for node in disconnected_nodes:
        assert node.domainLedger.size == init_ledger_size
    for node in disconnected_nodes:
        assert node.master_replica._ordering_service.spylog.count(
            OrderingService._request_pre_prepare) == 0
        assert node.master_replica._ordering_service.spylog.count(
            OrderingService._request_prepare) == 0
        assert node.master_replica._message_req_service.spylog.count(
            MessageReqService.process_message_rep) == 0

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, REQS_AFTER_RECONNECT_CNT)
    waitNodeDataEquality(looper, disconnected_nodes[0], *txnPoolNodeSet)

    for node in disconnected_nodes:
        assert node.master_replica._ordering_service.spylog.count(
            OrderingService._request_pre_prepare) > 0
        assert node.master_replica._ordering_service.spylog.count(
            OrderingService._request_prepare) > 0
        assert node.master_replica._message_req_service.spylog.count(
            MessageReqService.process_message_rep) > 0

    def check_all_ordered():
        for node in txnPoolNodeSet:
            assert node.domainLedger.size == (init_ledger_size +
                                              MISSING_REQS_CNT +
                                              REQS_AFTER_RECONNECT_CNT)

    looper.run(eventually(check_all_ordered, timeout=20))