def test_catchup_with_one_slow_node(tdir, tconf, looper, txnPoolNodeSet,
                                    sdk_pool_handle, sdk_wallet_client,
                                    allPluginsPath, logsearch):
    '''
    1. Stop the node Delta
    2. Order 9 txns. When sending CatchupReqs in the first round, every
    node [Alpha, Beta, Gamma] will receive a request for 3 txns.
    3. Delay CatchupReq messages on Alpha
    4. Start Delta
    5. Check that all nodes have equal data.
    6. Check that Delta re-asks for missing txns only once.
    In the second round of CatchupReqs (the first re-ask) Delta shouldn't
    request txns from Alpha because it didn't answer earlier.
    If the behavior were wrong and Delta re-asked txns from all nodes,
    every node would receive a request for 1 txn, Alpha would not answer,
    and Delta would need yet another re-ask round.
    '''
    # Prepare nodes
    lagging_node = txnPoolNodeSet[-1]
    rest_nodes = txnPoolNodeSet[:-1]

    # Stop one node
    waitNodeDataEquality(looper, lagging_node, *rest_nodes)
    disconnect_node_and_ensure_disconnected(looper,
                                            txnPoolNodeSet,
                                            lagging_node,
                                            stopNode=True)
    looper.removeProdable(lagging_node)

    # Send more requests to active nodes
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client,
                              len(rest_nodes) * 3)
    waitNodeDataEquality(looper, *rest_nodes)

    # Restart stopped node and wait for successful catch up
    lagging_node = start_stopped_node(
        lagging_node,
        looper,
        tconf,
        tdir,
        allPluginsPath,
        start=False,
    )
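    # Snapshot the current number of "re-ask" log records so that the
    # assertion below only counts re-asks triggered by this catch-up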
    log_re_ask, _ = logsearch(
        msgs=['requesting .* missing transactions after timeout'])
    old_re_ask_count = len(log_re_ask)

    # Delay CatchupReq messages on Alpha so that it never answers Delta
    with delay_rules(rest_nodes[0].nodeIbStasher, cqDelay()):
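        # cqDelay delays incoming CatchupReqs on Alpha so it never answers
        # Delta; cs_delay on Delta postpones CurrentState processing to
        # avoid propagate primary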
        with delay_rules(lagging_node.nodeIbStasher, cs_delay()):
            looper.add(lagging_node)
            txnPoolNodeSet[-1] = lagging_node
            looper.run(checkNodesConnected(txnPoolNodeSet))

            waitNodeDataEquality(looper, *txnPoolNodeSet, customTimeout=120)
            assert len(
                log_re_ask
            ) - old_re_ask_count == 2  # for audit and domain ledgers
def test_3pc_while_catchup_with_chkpoints_only(
        tdir, tconf, looper, chkFreqPatched, reqs_for_checkpoint,
        testNodeClass, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client,
        allPluginsPath):
    '''
    Check that catch-up is not started again even if a quorum of stashed checkpoints
    is received during catch-up.
    Assume that only checkpoints and no 3PC messages are received.
    '''

    # Prepare nodes
    lagging_node = txnPoolNodeSet[-1]
    rest_nodes = txnPoolNodeSet[:-1]

    # Check that requests are executed well
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)

    # Stop one node
    waitNodeDataEquality(looper, lagging_node, *rest_nodes)
    disconnect_node_and_ensure_disconnected(looper,
                                            txnPoolNodeSet,
                                            lagging_node,
                                            stopNode=True)
    looper.removeProdable(lagging_node)

    # Send more requests to active nodes
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)
    waitNodeDataEquality(looper, *rest_nodes)

    # Restart stopped node and wait for successful catch up
    lagging_node = start_stopped_node(
        lagging_node,
        looper,
        tconf,
        tdir,
        allPluginsPath,
        start=False,
    )

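    # Remember how many catch-ups have completed so far, so we can check
    # later that exactly one more happens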
    initial_all_ledgers_caught_up = lagging_node.spylog.count(
        Node.allLedgersCaughtUp)

    # delay all 3PC messages on the lagged node so that it
    # receives only Checkpoints and catch-up messages

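    # ppDelay, pDelay and cDelay hold back PrePrepare, Prepare and Commit
    # messages respectively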
    lagging_node.nodeIbStasher.delay(ppDelay())
    lagging_node.nodeIbStasher.delay(pDelay())
    lagging_node.nodeIbStasher.delay(cDelay())

    # delay CurrentState to avoid propagate primary (it would lead to extra catch-ups that are not needed in this test)
    with delay_rules(lagging_node.nodeIbStasher, cs_delay()):
        with delay_rules(lagging_node.nodeIbStasher, lsDelay(),
                         msg_rep_delay()):
            looper.add(lagging_node)
            txnPoolNodeSet[-1] = lagging_node
            looper.run(checkNodesConnected(txnPoolNodeSet))

            # wait till we have got ledger statuses for the txns missed while
            # the node was offline, so that messages ordered from now on will
            # be stashed rather than caught up
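            # (at least one stashed LedgerStatus per ledger is expected,
            # hence the threshold of 3)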
            looper.run(
                eventually(lambda: assertExp(
                    lagging_node.nodeIbStasher.num_of_stashed(LedgerStatus) >=
                    3),
                           retryWait=1,
                           timeout=60))

            assert lagging_node.mode != Mode.participating

            # make sure that more requests are being ordered while catch-up is in progress
            # stash enough stable checkpoints for starting a catch-up
            num_checkpoints = Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1
            num_reqs = reqs_for_checkpoint * num_checkpoints + 1
            sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                                      sdk_wallet_client, num_reqs)
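            # num_reqs + 2 accounts for the two single requests ordered
            # earlier in the test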
            looper.run(
                eventually(check_last_ordered_3pc_on_all_replicas, rest_nodes,
                           (0, num_reqs + 2)))

            # all good nodes have stabilized the checkpoint
            looper.run(eventually(chkChkpoints, rest_nodes, 2, 0))

            assert lagging_node.mode != Mode.participating
            # lagging node is catching up and stashing all checkpoints
            looper.run(
                eventually(lambda: assertExp(
                    get_stashed_checkpoints(lagging_node) == num_checkpoints *
                    len(rest_nodes)),
                           timeout=waits.expectedPoolCatchupTime(
                               len(txnPoolNodeSet))))

        # check that last_ordered is set
        looper.run(
            eventually(check_last_ordered_3pc_on_master, [lagging_node],
                       (0, num_reqs + 2)))

        # check that the catch-up is finished
        looper.run(
            eventually(
                lambda: assertExp(lagging_node.mode == Mode.participating),
                retryWait=1,
                timeout=waits.expectedPoolCatchupTime(len(txnPoolNodeSet))))

        # check that catch-up was started only once
        looper.run(
            eventually(lambda: assertExp(
                lagging_node.spylog.count(Node.allLedgersCaughtUp) ==
                initial_all_ledgers_caught_up + 1)))
        looper.run(
            eventually(lambda: assertExp(
                lagging_node.spylog.count(Node.start_catchup) == 1)))

        # do not check the audit ledger, since it was not caught up while txns were being ordered during catch-up
        waitNodeDataEquality(looper,
                             *txnPoolNodeSet,
                             exclude_from_check='check_audit',
                             customTimeout=5)
def test_3pc_while_catchup_with_chkpoints(tdir, tconf,
                                          looper,
                                          chkFreqPatched,
                                          reqs_for_checkpoint,
                                          testNodeClass,
                                          txnPoolNodeSet,
                                          sdk_pool_handle,
                                          sdk_wallet_client,
                                          allPluginsPath):
    # Prepare nodes
    lagging_node = txnPoolNodeSet[-1]
    rest_nodes = txnPoolNodeSet[:-1]

    # Check that requests are executed well
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 10)

    # Stop one node
    waitNodeDataEquality(looper, lagging_node, *rest_nodes)
    disconnect_node_and_ensure_disconnected(looper,
                                            txnPoolNodeSet,
                                            lagging_node,
                                            stopNode=True)
    looper.removeProdable(lagging_node)

    # Send more requests to active nodes
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)
    waitNodeDataEquality(looper, *rest_nodes)

    # Restart stopped node and wait for successful catch up
    lagging_node = start_stopped_node(lagging_node,
                                      looper,
                                      tconf,
                                      tdir,
                                      allPluginsPath,
                                      start=False,
                                      )

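    # Remember how many catch-ups have completed so far; exactly one more
    # is expected by the end of the test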
    initial_all_ledgers_caught_up = lagging_node.spylog.count(Node.allLedgersCaughtUp)
    # delay CurrentState to avoid propagate primary (it would lead to extra catch-ups that are not needed in this test)
    with delay_rules(lagging_node.nodeIbStasher, cs_delay()):
        with delay_rules(lagging_node.nodeIbStasher, cr_delay()):
            looper.add(lagging_node)
            txnPoolNodeSet[-1] = lagging_node
            looper.run(checkNodesConnected(txnPoolNodeSet))

            # wait till we have got catchup replies for the txns missed while
            # the node was offline, so that messages ordered from now on will
            # be stashed rather than caught up
            looper.run(
                eventually(lambda: assertExp(len(lagging_node.nodeIbStasher.delayeds) >= 3), retryWait=1,
                           timeout=60))

            assert lagging_node.mode == Mode.syncing

            # make sure that more requests are being ordered while catch-up is in progress
            # stash enough stable checkpoints for starting a catch-up
            sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                                      sdk_wallet_client,
                                      reqs_for_checkpoint * (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1))

            assert lagging_node.mode == Mode.syncing

        # check that the catch-up is finished
        looper.run(
            eventually(
                lambda: assertExp(lagging_node.mode == Mode.participating), retryWait=1,
                timeout=waits.expectedPoolCatchupTime(len(txnPoolNodeSet))
            )
        )
        looper.run(
            eventually(
                lambda: assertExp(
                    lagging_node.spylog.count(Node.allLedgersCaughtUp) == initial_all_ledgers_caught_up + 1)
            )
        )
        # check that catch-up was started only once
        looper.run(
            eventually(
                lambda: assertExp(
                    lagging_node.spylog.count(Node.start_catchup) == 1)
            )
        )
        # check that the node was able to order requests stashed during catch-up
        waitNodeDataEquality(looper, *txnPoolNodeSet, customTimeout=5)
def test_limited_stash_3pc_while_catchup(tdir, tconf, looper, testNodeClass,
                                         txnPoolNodeSet, sdk_pool_handle,
                                         sdk_wallet_client, allPluginsPath,
                                         chkFreqPatched):
    '''
    Test that the lagging_node can process messages from the catchup stash
    after catchup and request lost messages from other nodes.
    '''

    # Prepare nodes
    lagging_node = txnPoolNodeSet[-1]
    rest_nodes = txnPoolNodeSet[:-1]

    # Check that requests are executed well
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)

    # Stop one node
    waitNodeDataEquality(looper, lagging_node, *rest_nodes)
    disconnect_node_and_ensure_disconnected(looper,
                                            txnPoolNodeSet,
                                            lagging_node,
                                            stopNode=True)
    looper.removeProdable(lagging_node)

    # Order 2 checkpoints on rest_nodes (2 batches of CHK_FREQ txns each)
    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_client,
                                         2 * CHK_FREQ, 2)
    waitNodeDataEquality(looper, *rest_nodes)

    # Restart stopped node and wait for successful catch up
    lagging_node = start_stopped_node(
        lagging_node,
        looper,
        tconf,
        tdir,
        allPluginsPath,
        start=False,
    )

    initial_all_ledgers_caught_up = lagging_node.spylog.count(
        Node.allLedgersCaughtUp)

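    # Hold back CurrentState and the replies to requests for PREPREPARE /
    # PREPARE messages, and (in the inner block) domain-ledger CatchupReps,
    # so the node has to fill the gap from its catchup stash and by
    # re-requesting lost messages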
    with delay_rules(lagging_node.nodeIbStasher, cs_delay(),
                     msg_rep_delay(types_to_delay=[PREPARE, PREPREPARE])):
        with delay_rules(lagging_node.nodeIbStasher,
                         cr_delay(ledger_filter=DOMAIN_LEDGER_ID)):
            looper.add(lagging_node)
            txnPoolNodeSet[-1] = lagging_node
            looper.run(checkNodesConnected(txnPoolNodeSet))

            # Order 2 checkpoints in the first lagging node catchup (2 batches of CHK_FREQ txns each)
            sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                                 sdk_pool_handle,
                                                 sdk_wallet_client,
                                                 2 * CHK_FREQ, 2)

        # Check that the first txn was ordered from the stash after the first catchup
        looper.run(
            eventually(lambda: assertExp(lagging_node.master_last_ordered_3PC[
                1] == txnPoolNodeSet[0].master_last_ordered_3PC[1] - 1),
                       retryWait=1,
                       timeout=waits.expectedPoolCatchupTime(
                           len(txnPoolNodeSet))))

        # Order 2 checkpoints in the second lagging node catchup (2 batches of CHK_FREQ txns each)
        sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                             sdk_pool_handle,
                                             sdk_wallet_client, 2 * CHK_FREQ,
                                             2)

    waitNodeDataEquality(looper, *txnPoolNodeSet, customTimeout=5)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)
    looper.run(
        eventually(lambda: assertExp(all(
            lagging_node.master_last_ordered_3PC == n.master_last_ordered_3PC
            for n in txnPoolNodeSet))))

    # check that catch-up was started only twice
    assert lagging_node.spylog.count(
        Node.allLedgersCaughtUp) == initial_all_ledgers_caught_up + 2
def test_restart_node_with_view_changes(tdir, tconf,
                                        looper,
                                        txnPoolNodeSet,
                                        sdk_pool_handle,
                                        sdk_wallet_client,
                                        allPluginsPath):
    '''
    1. Stop the node Delta
    2. Patch methods for processing VCStartMsgStrategy messages
    3. Delay CurrentState messages on Delta
    4. Start Delta
    5. Start a view change with a master degradation reason (from view 0 to 1)
    6. Check that Delta starts VCStartMsgStrategy after a quorum of InstanceChanges
    7. Reset the delay for CurrentStates
    8. Check that propagate primary happened.
    9. Unpatch the VCStartMsgStrategy methods and process the caught messages.
    10. Start a view change with a master degradation reason (from view 1 to 2)
    11. Check that all nodes have viewNo = 2 and can order transactions.
    '''
    # Prepare nodes
    lagging_node = txnPoolNodeSet[-1]
    rest_nodes = txnPoolNodeSet[:-1]
    start_view_no = lagging_node.viewNo

    # Stop Delta
    waitNodeDataEquality(looper, lagging_node, *rest_nodes)
    disconnect_node_and_ensure_disconnected(looper,
                                            txnPoolNodeSet,
                                            lagging_node,
                                            stopNode=True)
    looper.removeProdable(lagging_node)

    # Send more requests to active nodes
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, len(rest_nodes) * 3)
    waitNodeDataEquality(looper, *rest_nodes)

    # Restart stopped node
    lagging_node = start_stopped_node(lagging_node,
                                      looper,
                                      tconf,
                                      tdir,
                                      allPluginsPath,
                                      start=False,
                                      )

    # Register a patched handler for ViewChangeStartMessage in the
    # lagging_node's message router to delay its processing.
    global view_change_started_messages
    view_change_started_messages = []

    def patch_on_view_change_started(node, msg, frm):
        view_change_started_messages.append((node, msg, frm))

    processor = partial(patch_on_view_change_started,
                        lagging_node)
    lagging_node.nodeMsgRouter.add((ViewChangeStartMessage, processor))

    # Delay CurrentState messages on lagging_node to delay propagate primary
    with delay_rules(lagging_node.nodeIbStasher, cs_delay()):
        # Add lagging_node to pool
        looper.add(lagging_node)
        txnPoolNodeSet[-1] = lagging_node
        looper.run(checkNodesConnected(txnPoolNodeSet))
        looper.run(
            eventually(lambda: assertExp(len(lagging_node.nodeIbStasher.delayeds) >= 3)))

        # Change viewNo in received CurrentStates.
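        # so that, once the delay is lifted, the lagging node will propagate
        # primary straight into view 1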
        for cs in lagging_node.nodeIbStasher.delayeds:
            cs[0][0].viewNo = 1
            for vcd in cs[0][0].primary:
                vcd[f.VIEW_NO.nm] = 1

        # Start ViewChange (0 -> 1)
        for n in rest_nodes:
            n.view_changer.on_master_degradation()

        # Wait for the view change to start upon a quorum of InstanceChange
        # messages on lagging_node
        looper.run(
            eventually(
                lambda: assertExp(len(view_change_started_messages) == 1)))
    looper.run(
        eventually(
            lambda: assertExp(lagging_node.viewNo == start_view_no + 1)))

    # Unpatch ViewChangeStartMessages processing and process delayed messages
    processor = partial(VCStartMsgStrategy.on_view_change_started,
                        lagging_node)
    lagging_node.nodeMsgRouter.add((ViewChangeStartMessage, processor))
    for msg in view_change_started_messages:
        lagging_node.view_changer.node.nodeInBox.append((msg[1],
                                                         lagging_node.view_changer.node.name))

    waitForViewChange(looper,
                      txnPoolNodeSet,
                      expectedViewNo=start_view_no + 1,
                      customTimeout=waits.expectedPoolViewChangeStartedTimeout(len(txnPoolNodeSet)))

    # Start ViewChange (1 -> 2)
    for n in rest_nodes:
        n.view_changer.on_master_degradation()
    waitForViewChange(looper,
                      txnPoolNodeSet,
                      expectedViewNo=start_view_no + 2,
                      customTimeout=waits.expectedPoolViewChangeStartedTimeout(len(txnPoolNodeSet)))
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet,
                        instances_list=range(txnPoolNodeSet[0].requiredNumberOfInstances))

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)
    waitNodeDataEquality(looper, *txnPoolNodeSet)
def test_3pc_while_catchup(tdir, tconf, looper, testNodeClass, txnPoolNodeSet,
                           sdk_pool_handle, sdk_wallet_client, allPluginsPath):
    '''
    Tests that requests being ordered during catch-up are stashed and re-applied
    when catch-up is finished
    '''

    # Prepare nodes
    lagging_node = txnPoolNodeSet[-1]
    rest_nodes = txnPoolNodeSet[:-1]

    # Check that requests are executed well
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 10)

    # Stop one node
    waitNodeDataEquality(looper, lagging_node, *rest_nodes)
    disconnect_node_and_ensure_disconnected(looper,
                                            txnPoolNodeSet,
                                            lagging_node,
                                            stopNode=True)
    looper.removeProdable(lagging_node)

    # Send more requests to active nodes
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 10)
    waitNodeDataEquality(looper, *rest_nodes)

    # Restart stopped node and wait for successful catch up
    lagging_node = start_stopped_node(
        lagging_node,
        looper,
        tconf,
        tdir,
        allPluginsPath,
        start=False,
    )

    initial_all_ledgers_caught_up = lagging_node.spylog.count(
        Node.allLedgersCaughtUp)
    assert all(replica.stasher.num_stashed_catchup == 0
               for inst_id, replica in lagging_node.replicas.items())
    # delay CurrentState to avoid propagate primary (it would lead to extra catch-ups that are not needed in this test)
    with delay_rules(lagging_node.nodeIbStasher, cs_delay()):
        with delay_rules(lagging_node.nodeIbStasher,
                         cr_delay(ledger_filter=DOMAIN_LEDGER_ID)):
            looper.add(lagging_node)
            txnPoolNodeSet[-1] = lagging_node
            looper.run(checkNodesConnected(txnPoolNodeSet))

            # wait till we have got catchup replies for the txns missed while
            # the node was offline, so that messages ordered from now on will
            # be stashed rather than caught up
            looper.run(
                eventually(lambda: assertExp(
                    lagging_node.nodeIbStasher.num_of_stashed(CatchupRep) >= 3
                ),
                           retryWait=1,
                           timeout=60))

            # make sure that more requests are being ordered while catch-up is in progress
            sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                                      sdk_wallet_client, 10)

            assert lagging_node.mode == Mode.syncing
            assert all(replica.stasher.num_stashed_catchup > 0
                       for inst_id, replica in lagging_node.replicas.items())

        # check that the catch-up is finished
        looper.run(
            eventually(
                lambda: assertExp(lagging_node.mode == Mode.participating),
                retryWait=1,
                timeout=waits.expectedPoolCatchupTime(len(txnPoolNodeSet))))
        looper.run(
            eventually(lambda: assertExp(
                lagging_node.spylog.count(Node.allLedgersCaughtUp) ==
                initial_all_ledgers_caught_up + 1)))
        # check that the node was able to order requests stashed during catch-up
        # do not check the audit ledger, since it was not caught up while txns were being ordered during catch-up
        waitNodeDataEquality(looper,
                             *txnPoolNodeSet,
                             exclude_from_check='check_audit',
                             customTimeout=5)
        assert all(replica.stasher.num_stashed_catchup == 0
                   for inst_id, replica in lagging_node.replicas.items())