def testNodeRemoveUnknownRemote(allPluginsPath, tdirAndLooper, nodeReg, conf):
    """
    The nodes Alpha and Beta know about each other, so they should connect,
    but they should remove the remote for C when it tries to connect to them.
    """

    tdir, looper = tdirAndLooper
    names = ["Alpha", "Beta"]
    nrg = {n: nodeReg[n] for n in names}
    initLocalKeys(tdir, nrg)
    logger.debug(names)

    nodes = []
    for name in names:
        node = TestNode(name,
                        nrg,
                        basedirpath=tdir,
                        base_data_dir=tdir,
                        pluginPaths=allPluginsPath)
        nodes.append(node)

    for node in nodes:
        tellKeysToOthers(node, nodes)

    A, B = nodes
    for node in nodes:
        looper.add(node)
    looper.run(checkNodesConnected(nodes))

    initLocalKeys(tdir, {"Gamma": nodeReg["Gamma"]})
    C = TestNode("Gamma", {
        **nrg,
        **{
            "Gamma": nodeReg["Gamma"]
        }
    },
                 basedirpath=tdir,
                 base_data_dir=tdir,
                 pluginPaths=allPluginsPath)
    for node in nodes:
        tellKeysToOthers(node, [C])

    looper.add(C)
    looper.runFor(5)

    stopNodes([C], looper)

    def chk():
        assert C.name not in B.nodestack.nameRemotes
        assert C.name not in A.nodestack.nameRemotes

    timeout = waits.expectedPoolInterconnectionTime(len(nodeReg))
    looper.run(eventually(chk, retryWait=2, timeout=timeout))
    stopNodes([A, B], looper)
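
The chk/eventually pair above is the polling idiom used throughout these examples: an assertion is retried until it passes or the computed timeout expires. A minimal sketch of such a helper, assuming an asyncio loop drives it (illustrative only, not plenum's actual eventually):

import asyncio
import time

async def eventually(check, retryWait: float = 1.0, timeout: float = 10.0):
    # Retry `check` until it stops raising AssertionError, or re-raise
    # the last failure once the next retry would exceed the deadline.
    deadline = time.monotonic() + timeout
    while True:
        try:
            return check()
        except AssertionError:
            if time.monotonic() + retryWait > deadline:
                raise
            await asyncio.sleep(retryWait)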
def addNodeBackAndCheck(nodeIdx: int, expectedStatus: Status):
    logger.info("Add back the {} node and see status of {}".
                format(ordinal(nodeIdx + 1), expectedStatus))
    addNodeBack(nodeSet, looper, nodeNames[nodeIdx])

    timeout = waits.expectedNodeStartUpTimeout() + \
        waits.expectedPoolInterconnectionTime(len(nodeSet))
    looper.run(eventually(checkNodeStatusRemotesAndF,
                          expectedStatus,
                          nodeIdx,
                          retryWait=1, timeout=timeout))
Example #3
async def checkNodesConnected(nodes: Iterable[TestNode], customTimeout=None):
    # run for how long we expect all of the connections to take
    timeout = customTimeout or \
              waits.expectedPoolInterconnectionTime(len(nodes))
    logger.debug(
        "waiting for {} seconds to check connections...".format(timeout))
    # verify every node can see every other as a remote
    funcs = [partial(check_node_connected, n, set(nodes) - {n}) for n in nodes]
    await eventuallyAll(*funcs,
                        retryWait=.5,
                        totalTimeout=timeout,
                        acceptableExceptions=[AssertionError, RemoteNotFound])
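
Every snippet on this page sizes its timeouts with waits.expectedPoolInterconnectionTime(nodeCount). A plausible shape for such an estimator, scaling with the number of links in a full mesh (a hypothetical sketch with illustrative constants, not plenum's actual implementation):

def expectedPoolInterconnectionTime(nodeCount: int) -> float:
    # A pool of n nodes forms n * (n - 1) / 2 undirected connections.
    connections = nodeCount * (nodeCount - 1) // 2
    PER_CONNECTION = 0.5  # assumed seconds per connection (illustrative)
    CEILING = 60.0        # cap to keep test timeouts bounded
    return min(CEILING, 2.0 + PER_CONNECTION * connections)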
Example #5
async def checkNodesConnected(stacks: Iterable[Union[TestNode, TestClient]],
                              expectedRemoteState=None,
                              customTimeout=None):
    expectedRemoteState = expectedRemoteState if expectedRemoteState else CONNECTED
    # run for how long we expect all of the connections to take
    timeout = customTimeout or waits.expectedPoolInterconnectionTime(len(stacks))
    logger.debug("waiting for {} seconds to check connections...".format(timeout))
    # verify every node can see every other as a remote
    funcs = [
        partial(checkRemoteExists, frm.nodestack, to.name, expectedRemoteState)
        for frm, to in permutations(stacks, 2)]
    await eventuallyAll(*funcs,
                        retryWait=.5,
                        totalTimeout=timeout,
                        acceptableExceptions=[AssertionError, RemoteNotFound])
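
Example #5 checks every directed pair of stacks with checkRemoteExists instead of one aggregate check per node. A minimal sketch of what such an assertion could look like, assuming the stack resolves remotes by name and raises RemoteNotFound otherwise (an assumption for illustration; not the actual plenum helper):

def checkRemoteExists(stack, name: str, expectedState=None):
    # The stack is assumed to expose getRemote(name) and to raise
    # RemoteNotFound when no remote is registered under that name.
    remote = stack.getRemote(name)
    if expectedState is not None:
        assert remote.state == expectedState, \
            "remote for {} is {}, expected {}".format(
                name, remote.state, expectedState)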
def addNodeBackAndCheck(nodeIdx: int, expectedStatus: Status):
    logger.info("Add back the {} node and see status of {}".
                format(ordinal(nodeIdx + 1), expectedStatus))
    addNodeBack(
        current_node_set, looper,
        get_node_by_name(txnPoolNodeSet, nodeNames[nodeIdx]),
        tconf, tdir)
    looper.run(checkNodesConnected(current_node_set))
    timeout = waits.expectedNodeStartUpTimeout() + \
              waits.expectedPoolInterconnectionTime(len(current_node_set))
    # TODO: Probably it's better to modify waits.* functions
    timeout *= 1.5
    looper.run(eventually(checkNodeStatusRemotesAndF,
                          expectedStatus,
                          nodeIdx,
                          retryWait=1, timeout=timeout))
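
A hypothetical call site for the helper above (the index and expected status are illustrative): with a four-node pool and f = 1, restarting a single node should leave the pool in a started state.

# Bring the second node back; the pool should report a started status.
addNodeBackAndCheck(1, Status.started)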
Example #8
def test_catchup_not_triggered_if_another_in_progress(
        looper,
        chkFreqPatched,
        reqs_for_checkpoint,
        txnPoolNodeSet,
        sdk_pool_handle,
        sdk_wallet_client,
        broken_node_and_others):
    """
    A node misses 3PC messages and checkpoints for some period, but later
    stashes a number of checkpoints and starts catchup. While the catchup is
    being performed, the node receives enough further checkpoints to trigger a
    new catchup, but does not start one because the first is still in progress.
    """
    max_batch_size = chkFreqPatched.Max3PCBatchSize
    broken_node, other_nodes = broken_node_and_others

    logger.info("Step 1: The node misses quite a lot of 3PC-messages and checkpoints")

    send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                           sdk_pool_handle,
                                           sdk_wallet_client,
                                           reqs_for_checkpoint + max_batch_size)

    waitNodeDataInequality(looper, broken_node, *other_nodes)

    logger.info(
        "Step 2: The node receives 3PC-messages but cannot process them because of "
        "missed ones. But the node eventually stashes some amount of checkpoints "
        "and after that starts catchup")

    repaired_node = repair_broken_node(broken_node)

    initial_do_start_catchup_times = repaired_node.spylog.count(Node._do_start_catchup)
    initial_all_ledgers_caught_up = repaired_node.spylog.count(Node.allLedgersCaughtUp)

    with delay_rules(repaired_node.nodeIbStasher, cr_delay()):
        send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                               sdk_pool_handle,
                                               sdk_wallet_client,
                                               (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) *
                                               reqs_for_checkpoint - max_batch_size)

        ensure_all_nodes_have_same_data(looper, other_nodes)

        looper.run(eventually(lambda: assertExp(repaired_node.mode == Mode.discovering),
                              timeout=waits.expectedPoolInterconnectionTime(len(txnPoolNodeSet)) +
                                      waits.expectedPoolConsistencyProof(len(txnPoolNodeSet))))

        assert repaired_node.spylog.count(Node._do_start_catchup) - initial_do_start_catchup_times == 1

        logger.info(
            "Step 3: While doing the catchup, the node receives new checkpoints "
            "enough to start a new catchup but the node does not start it because "
            "the former is in progress")

        process_checkpoint_times_before = \
            repaired_node.master_replica._checkpointer.spylog.count(CheckpointService.process_checkpoint)

        send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                               sdk_pool_handle,
                                               sdk_wallet_client,
                                               (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) *
                                               reqs_for_checkpoint)

        # Wait until the node receives the new checkpoints from all the other nodes
        looper.run(
            eventually(lambda: assertExp(
                repaired_node.master_replica._checkpointer.spylog.count(CheckpointService.process_checkpoint) -
                process_checkpoint_times_before ==
                (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) *
                (len(txnPoolNodeSet) - 1)),
                       timeout=waits.expectedPoolInterconnectionTime(len(txnPoolNodeSet))))

        # New catchup is not started when another one is in progress
        assert repaired_node.spylog.count(Node._do_start_catchup) - initial_do_start_catchup_times == 1
        assert repaired_node.mode == Mode.discovering

    logger.info("Step 4: The node completes the catchup. The ledger has been "
                "updated to the level determined on its start")

    looper.run(eventually(lambda: assertExp(repaired_node.mode == Mode.participating),
                          timeout=waits.expectedPoolCatchupTime(len(txnPoolNodeSet))))
    assert repaired_node.spylog.count(Node._do_start_catchup) - initial_do_start_catchup_times == 1
    assert repaired_node.spylog.count(Node.allLedgersCaughtUp) - initial_all_ledgers_caught_up == 1
    assert repaired_node.domainLedger.size == other_nodes[0].domainLedger.size
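
The lambda-plus-assertExp pattern in this test exists because eventually() retries callables that raise AssertionError; assertExp simply turns a boolean expression into such a raise. A one-line sketch of the helper as commonly defined (a sketch; see plenum/test/helper.py for the real one):

def assertExp(condition):
    # Turn a falsy condition into an AssertionError so that
    # eventually() keeps polling until the expression holds.
    assert condition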
Example #9
def testNodeRemoveUnknownRemote(allPluginsPath, tdir_for_func, tconf_for_func,
                                looper_without_nodeset_for_func, nodeReg):
    """
    The nodes Alpha and Beta know about each other, so they should connect,
    but they should remove the remote for C when it tries to connect to them.
    """

    looper = looper_without_nodeset_for_func
    names = ["Alpha", "Beta"]
    nrg = {n: nodeReg[n] for n in names}
    initLocalKeys(tdir_for_func, tconf_for_func, nrg)
    logger.debug(names)

    nodes = []
    for name in names:
        config_helper = PNodeConfigHelper(name,
                                          tconf_for_func,
                                          chroot=tdir_for_func)
        node = TestNode(name,
                        nrg,
                        config_helper=config_helper,
                        config=tconf_for_func,
                        pluginPaths=allPluginsPath)
        nodes.append(node)

    for node in nodes:
        tellKeysToOthers(node, nodes)

    A, B = nodes
    for node in nodes:
        looper.add(node)
    looper.run(checkNodesConnected(nodes))

    name = "Gamma"
    initLocalKeys(tdir_for_func, tconf_for_func, {name: nodeReg[name]})
    config_helper = PNodeConfigHelper(name,
                                      tconf_for_func,
                                      chroot=tdir_for_func)
    C = TestNode(name,
                 {**nrg, name: nodeReg[name]},
                 config_helper=config_helper,
                 config=tconf_for_func,
                 pluginPaths=allPluginsPath)
    for node in nodes:
        tellKeysToOthers(node, [C])

    looper.add(C)
    looper.runFor(5)

    stopNodes([C], looper)

    def chk():
        assert C.name not in B.nodestack.nameRemotes
        assert C.name not in A.nodestack.nameRemotes

    timeout = waits.expectedPoolInterconnectionTime(len(nodeReg))
    looper.run(eventually(chk, retryWait=2, timeout=timeout))
    stopNodes([A, B], looper)
def test_catchup_not_triggered_if_another_in_progress(
        looper,
        chkFreqPatched,
        reqs_for_checkpoint,
        txnPoolNodeSet,
        sdk_pool_handle,
        sdk_wallet_client,
        broken_node_and_others):
    """
    A node misses 3PC messages and checkpoints for some period, but later
    stashes a number of checkpoints and starts catchup. While the catchup is
    being performed, the node receives enough further checkpoints to trigger a
    new catchup, but does not start one because the first is still in progress.
    """
    max_batch_size = chkFreqPatched.Max3PCBatchSize
    broken_node, other_nodes = broken_node_and_others

    logger.info("Step 1: The node misses quite a lot of 3PC-messages and checkpoints")

    send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                           sdk_pool_handle,
                                           sdk_wallet_client,
                                           reqs_for_checkpoint + max_batch_size)

    waitNodeDataInequality(looper, broken_node, *other_nodes)

    logger.info(
        "Step 2: The node receives 3PC-messages but cannot process them because of "
        "missed ones. But the node eventually stashes some amount of checkpoints "
        "and after that starts catchup")

    repaired_node = repair_broken_node(broken_node)

    initial_do_start_catchup_times = repaired_node.spylog.count(Node._do_start_catchup)
    initial_all_ledgers_caught_up = repaired_node.spylog.count(Node.allLedgersCaughtUp)

    with delay_rules(repaired_node.nodeIbStasher, cr_delay()):
        send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                               sdk_pool_handle,
                                               sdk_wallet_client,
                                               (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) *
                                               reqs_for_checkpoint - max_batch_size)

        ensure_all_nodes_have_same_data(looper, other_nodes)
        target_ledger_size = other_nodes[0].domainLedger.size

        looper.run(eventually(lambda: assertExp(repaired_node.mode == Mode.syncing),
                              timeout=waits.expectedPoolInterconnectionTime(len(txnPoolNodeSet)) +
                                      waits.expectedPoolConsistencyProof(len(txnPoolNodeSet))))
    
        assert repaired_node.spylog.count(Node._do_start_catchup) - initial_do_start_catchup_times == 1

        logger.info(
            "Step 3: While doing the catchup, the node receives new checkpoints "
            "enough to start a new catchup but the node does not start it because "
            "the former is in progress")

        process_checkpoint_times_before = repaired_node.master_replica.spylog.count(Replica.processCheckpoint)

        send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                               sdk_pool_handle,
                                               sdk_wallet_client,
                                               (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) *
                                               reqs_for_checkpoint)

        # Wait until the node receives the new checkpoints from all the other nodes
        looper.run(
            eventually(lambda: assertExp(repaired_node.master_replica.spylog.count(Replica.processCheckpoint) -
                                         process_checkpoint_times_before ==
                                         (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) *
                                         (len(txnPoolNodeSet) - 1)),
                       timeout=waits.expectedPoolInterconnectionTime(len(txnPoolNodeSet))))
    
        # New catchup is not started when another one is in progress
        assert repaired_node.spylog.count(Node._do_start_catchup) - initial_do_start_catchup_times == 1
        assert repaired_node.mode == Mode.syncing

    logger.info("Step 4: The node completes the catchup. The ledger has been "
                "updated to the level determined on its start")

    looper.run(eventually(lambda: assertExp(repaired_node.mode == Mode.participating),
                          timeout=waits.expectedPoolCatchupTime(len(txnPoolNodeSet))))
    assert repaired_node.spylog.count(Node._do_start_catchup) - initial_do_start_catchup_times == 1
    assert repaired_node.spylog.count(Node.allLedgersCaughtUp) - initial_all_ledgers_caught_up == 1
    assert repaired_node.domainLedger.size == target_ledger_size
def test_node_erases_last_sent_pp_key_on_propagate_primary_after_pool_restart(
        looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client,
        tconf, tdir, allPluginsPath, chkFreqPatched):
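    # Note: backup_inst_id, nodeCount, LOG_SIZE and LAST_SENT_PRE_PREPARE
    # are module-level names from this snippet's source file; they are
    # not defined within the snippet itself.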

    # Get a node with a backup primary replica and the rest of the nodes
    replica = getPrimaryReplica(txnPoolNodeSet, instId=backup_inst_id)
    node = replica.node
    other_nodes = set(txnPoolNodeSet) - {node}

    # Send some 3PC-batches and wait until the replica orders the 3PC-batches
    sdk_send_batches_of_random(looper, txnPoolNodeSet,
                               sdk_pool_handle, sdk_wallet_client,
                               num_reqs=7, num_batches=7,
                               timeout=tconf.Max3PCBatchWait)

    looper.run(
        eventually(lambda: assertExp(replica.last_ordered_3pc == (0, 7)),
                   retryWait=1,
                   timeout=waits.expectedTransactionExecutionTime(nodeCount)))

    # Check view no of the node and lastPrePrepareSeqNo of the replica
    assert node.viewNo == 0
    assert replica.lastPrePrepareSeqNo == 7
    assert replica.h == 6
    assert replica.H == 6 + LOG_SIZE

    # Ensure that there is a stored last sent PrePrepare key on the node
    assert LAST_SENT_PRE_PREPARE in node.nodeStatusDB

    # Stop the node containing the replica
    disconnect_node_and_ensure_disconnected(looper,
                                            txnPoolNodeSet,
                                            node.name,
                                            stopNode=True)
    looper.removeProdable(node)
    txnPoolNodeSet.remove(node)

    # Restart the rest of the nodes and wait for primary elections done
    for n in other_nodes:
        disconnect_node_and_ensure_disconnected(looper,
                                                txnPoolNodeSet,
                                                n.name,
                                                timeout=nodeCount,
                                                stopNode=True)
        looper.removeProdable(n)
        txnPoolNodeSet.remove(n)
    for n in other_nodes:
        txnPoolNodeSet.append(start_stopped_node(n, looper, tconf, tdir, allPluginsPath))
    looper.run(checkNodesConnected(txnPoolNodeSet,
                                   customTimeout=waits.expectedPoolInterconnectionTime(nodeCount)))
    def chk():
        for n in txnPoolNodeSet:
            for r in n.replicas.values():
                assert r.hasPrimary
    looper.run(eventually(chk, retryWait=1, timeout=waits.expectedPoolElectionTimeout(nodeCount)))

    # Start the stopped node
    node = start_stopped_node(node, looper, tconf, tdir, allPluginsPath)
    txnPoolNodeSet.append(node)

    looper.run(checkNodesConnected(txnPoolNodeSet))
    ensureElectionsDone(looper, txnPoolNodeSet)

    replica = node.replicas[backup_inst_id]

    # Verify that the node has erased the stored last sent PrePrepare key
    assert LAST_SENT_PRE_PREPARE not in node.nodeStatusDB

    # Verify correspondingly that after the propagate primary procedure
    # the replica (which must again be the primary in its instance) has
    # not restored lastPrePrepareSeqNo, not adjusted last_ordered_3pc and
    # not shifted the watermarks
    assert node.viewNo == 0
    assert replica.isPrimary
    assert replica.lastPrePrepareSeqNo == 0
    assert replica.last_ordered_3pc == (0, 0)
    assert replica.h == 0
    assert replica.H == 0 + LOG_SIZE

    # Send a 3PC-batch and ensure that the replica orders it
    sdk_send_batches_of_random(looper, txnPoolNodeSet,
                               sdk_pool_handle, sdk_wallet_client,
                               num_reqs=1, num_batches=1,
                               timeout=tconf.Max3PCBatchWait)

    looper.run(
        eventually(lambda: assertExp(replica.last_ordered_3pc == (0, 1)),
                   retryWait=1,
                   timeout=waits.expectedTransactionExecutionTime(nodeCount)))
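
The h/H assertions in this test reflect the replica's watermark window: a replica only accepts 3PC messages whose sequence numbers lie in (h, h + LOG_SIZE]. A minimal illustration of that windowing rule (a sketch of the invariant the test relies on, not the Replica implementation):

def in_watermark_window(pp_seq_no: int, h: int, log_size: int) -> bool:
    # The test above asserts H == h + LOG_SIZE; a 3PC message is
    # processed only when h < ppSeqNo <= H.
    H = h + log_size
    return h < pp_seq_no <= H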