def testZStackNodeReconnection(tconf, looper, txnPoolNodeSet, client1, wallet1,
                               tdirWithPoolTxns, client1Connected):
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 1)

    npr = [n for n in txnPoolNodeSet if not n.hasPrimary]
    nodeToCrash = npr[0]
    idxToCrash = txnPoolNodeSet.index(nodeToCrash)
    otherNodes = [_ for _ in txnPoolNodeSet if _ != nodeToCrash]

    def checkFlakyConnected(conn=True):
        for node in otherNodes:
            if conn:
                assert nodeToCrash.nodestack.name in node.nodestack.connecteds
            else:
                assert nodeToCrash.nodestack.name not in node.nodestack.connecteds

    checkFlakyConnected(True)
    nodeToCrash.stop()
    looper.removeProdable(nodeToCrash)
    looper.runFor(1)
    looper.run(eventually(checkFlakyConnected, False, retryWait=1, timeout=35))
    looper.runFor(1)
    node = TestNode(nodeToCrash.name,
                    basedirpath=tdirWithPoolTxns,
                    config=tconf,
                    ha=nodeToCrash.nodestack.ha,
                    cliha=nodeToCrash.clientstack.ha)
    looper.add(node)
    txnPoolNodeSet[idxToCrash] = node
    looper.run(eventually(checkFlakyConnected, True, retryWait=2, timeout=50))
    ensureElectionsDone(looper, txnPoolNodeSet, retryWait=2, timeout=50)
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 1)
    checkNodesSendingCommits(txnPoolNodeSet)
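

# The checks above are polled with plenum's `eventually` helper, which re-runs
# a check until it stops raising or the timeout expires. A simplified,
# synchronous stand-in for that polling idea (a sketch only; the real helper
# is an asyncio coroutine with more options):
import time


def eventually_sketch(check, *args, retryWait=1, timeout=30):
    # Re-run `check` until it passes or the timeout is exhausted.
    deadline = time.monotonic() + timeout
    while True:
        try:
            return check(*args)
        except Exception:
            if time.monotonic() >= deadline:
                raise
            time.sleep(retryWait)

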
def testViewChangesIfMasterPrimaryDisconnected(txnPoolNodeSet, looper, wallet1,
                                               client1, client1Connected,
                                               tconf):
    """
    View change occurs when master's primary is disconnected
    """

    # Setup
    nodes = txnPoolNodeSet

    viewNoBefore = checkViewNoForNodes(nodes)
    old_pr_node = get_master_primary_node(nodes)

    # Stop primary
    stopNodes([old_pr_node], looper)
    looper.removeProdable(old_pr_node)
    remainingNodes = list(set(nodes) - {old_pr_node})
    # Sometimes it takes time for nodes to detect disconnection
    ensure_node_disconnected(looper, old_pr_node, remainingNodes, timeout=20)

    looper.runFor(tconf.ToleratePrimaryDisconnection + 2)

    # Give some time to detect the disconnection, then verify that the view
    # has changed and a new primary has been elected
    waitForViewChange(looper, remainingNodes, viewNoBefore + 1)
    ensure_all_nodes_have_same_data(looper, nodes=remainingNodes)
    new_pr_node = get_master_primary_node(remainingNodes)
    assert old_pr_node != new_pr_node

    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 5)
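

# `checkViewNoForNodes`, used above, is assumed to assert that all the given
# nodes report the same viewNo and to return it; roughly this idea:
def check_view_no_sketch(nodes, expected=None):
    # Collect each node's current view number and require agreement.
    view_nos = {node.viewNo for node in nodes}
    assert len(view_nos) == 1
    view_no = view_nos.pop()
    if expected is not None:
        assert view_no == expected
    return view_no

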
def test_no_instance_change_on_primary_disconnection_for_not_ready_node(
        looper, txnPoolNodeSet, tdir, tconf, allPluginsPath, steward1,
        stewardWallet, client_tdir):
    """
    Test steps:
    1. Create a new node, but don't add it to the pool (i.e. don't send a NODE txn), so that the node is not ready.
    2. Wait for more than VIEW_CHANGE_TIMEOUT (the timeout for the initial check for a disconnected primary).
    3. Make sure no InstanceChange is sent by the new node.
    4. Add the node to the pool (send a NODE txn) and make sure that the node is ready now.
    5. Wait for more than VIEW_CHANGE_TIMEOUT (the timeout for the initial check for a disconnected primary).
    6. Make sure no InstanceChange is sent by the new node.
    """

    # 1. Create a new node, but don't add it to the pool (i.e. don't send a NODE txn), so that the node is not ready.
    sigseed, bls_key, new_node = start_not_added_node(looper, tdir, tconf,
                                                      allPluginsPath,
                                                      "TestTheta")

    # 2. Wait for more than VIEW_CHANGE_TIMEOUT (the timeout for the initial check for a disconnected primary)
    looper.runFor(tconf.VIEW_CHANGE_TIMEOUT + 2)

    # 3. Make sure no InstanceChange is sent by the new node
    assert 0 == new_node.view_changer.spylog.count(
        ViewChanger.sendInstanceChange.__name__)

    # 4. Add the node to the pool (send a NODE txn) and make sure that the node is ready now.
    add_started_node(looper, new_node, txnPoolNodeSet, client_tdir, steward1,
                     stewardWallet, sigseed, bls_key)

    # 5. Wait for more than VIEW_CHANGE_TIMEOUT (the timeout for the initial check for a disconnected primary)
    looper.runFor(tconf.VIEW_CHANGE_TIMEOUT + 2)

    # 6. Make sure no InstanceChange is sent by the new node
    assert 0 == new_node.view_changer.spylog.count(
        ViewChanger.sendInstanceChange.__name__)
def test_view_change_after_max_catchup_rounds(txnPoolNodeSet, looper, wallet1,
                                              client1, client1Connected):
    """
    The node should do only a fixed number of catchup rounds. To test this,
    delay Prepares and Commits for 2 non-primary nodes by a large amount,
    which is equivalent to a loss of Prepares and Commits. Make sure those 2
    nodes have a different last prepared certificate from the other two. Then
    do a view change; make sure the view change completes and the pool does
    not process the requests that were prepared by only a subset of the nodes.
    """
    send_reqs_batches_and_get_suff_replies(looper, wallet1, client1, 2 * 3, 3)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    ledger_summary = txnPoolNodeSet[0].elector.ledger_summary

    slow_nodes = [
        r.node for r in getNonPrimaryReplicas(txnPoolNodeSet, 0)[-2:]
    ]
    fast_nodes = [n for n in txnPoolNodeSet if n not in slow_nodes]

    # Make the nodes slow to process Prepares and Commits
    for node in slow_nodes:
        node.nodeIbStasher.delay(pDelay(120, 0))
        node.nodeIbStasher.delay(cDelay(120, 0))
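    # (pDelay / cDelay are taken here to install stasher rules that hold
    # incoming PREPAREs and COMMITs for instance 0 for 120 seconds.)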

    sendRandomRequests(wallet1, client1, 5)
    looper.runFor(3)

    ensure_view_change(looper, nodes=txnPoolNodeSet)

    def last_prepared(nodes):
        lst = [
            n.master_replica.last_prepared_certificate_in_view() for n in nodes
        ]
        # All nodes have same last prepared
        assert check_if_all_equal_in_list(lst)
        return lst[0]
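    # (last_prepared_certificate_in_view is assumed to return a
    # (view_no, pp_seq_no) pair identifying the last prepared batch.)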

    last_prepared_slow = last_prepared(slow_nodes)
    last_prepared_fast = last_prepared(fast_nodes)

    # Check `slow_nodes` and `fast_nodes` set different last_prepared
    assert last_prepared_fast != last_prepared_slow

    # View change complete
    ensureElectionsDone(looper, txnPoolNodeSet)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)

    # The requests which were prepared by only a subset of the nodes were
    # not ordered
    assert txnPoolNodeSet[0].elector.ledger_summary == ledger_summary

    for node in slow_nodes:
        node.nodeIbStasher.reset_delays_and_process_delayeds()

    # Make sure pool is functional
    ensure_pool_functional(looper, txnPoolNodeSet, wallet1, client1)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    last_prepared(txnPoolNodeSet)
def test_view_changes_if_master_primary_disconnected(txnPoolNodeSet, looper,
                                                     sdk_pool_handle,
                                                     sdk_wallet_client, tdir,
                                                     tconf, allPluginsPath):
    """
    View change occurs when master's primary is disconnected
    """

    # Setup
    nodes = txnPoolNodeSet

    old_view_no = checkViewNoForNodes(nodes)
    old_pr_node = get_master_primary_node(nodes)

    # Stop primary
    disconnect_node_and_ensure_disconnected(looper,
                                            txnPoolNodeSet,
                                            old_pr_node,
                                            stopNode=True)
    looper.removeProdable(old_pr_node)

    remaining_nodes = list(set(nodes) - {old_pr_node})
    # Sometimes it takes time for nodes to detect disconnection
    ensure_node_disconnected(looper, old_pr_node, remaining_nodes, timeout=20)

    looper.runFor(tconf.ToleratePrimaryDisconnection + 2)

    # Give some time to detect the disconnection, then verify that the view
    # has changed and a new primary has been elected
    waitForViewChange(looper, remaining_nodes, old_view_no + 1)
    ensure_all_nodes_have_same_data(looper, nodes=remaining_nodes)
    new_pr_node = get_master_primary_node(remaining_nodes)
    assert old_pr_node != new_pr_node

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 5)

    # Check if old primary can join the pool and still functions
    old_pr_node = start_stopped_node(old_pr_node, looper, tconf, tdir,
                                     allPluginsPath)

    txnPoolNodeSet = remaining_nodes + [old_pr_node]
    looper.run(
        eventually(checkViewNoForNodes,
                   txnPoolNodeSet,
                   old_view_no + 1,
                   timeout=10))
    assert len(
        getAllReturnVals(
            old_pr_node.view_changer,
            old_pr_node.view_changer._start_view_change_if_possible,
            compare_val_to=True)) > 0
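    # (i.e. at least one call to _start_view_change_if_possible on the
    # restarted node returned True, meaning it saw enough indications of the
    # newer view to move to it.)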

    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)

    assert not old_pr_node.view_changer._next_view_indications
def test_no_catchup_if_got_from_3pc(looper, txnPoolNodeSet, wallet1, client1,
                                    client1Connected):
    """
    A node is slow to receive COMMIT messages, so after a view change it
    starts catchup. But before it can start requesting txns, the COMMIT
    messages are received and ordered. The node should not request any
    transactions.
    """
    send_reqs_batches_and_get_suff_replies(looper, wallet1, client1, 2 * 3, 3)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    slow_node = getNonPrimaryReplicas(txnPoolNodeSet, 0)[-1].node
    other_nodes = [n for n in txnPoolNodeSet if n != slow_node]

    delay_cm = 30
    delay_cp = 40
    slow_node.nodeIbStasher.delay(cDelay(delay_cm))
    # The slow node receives consistency proofs after some delay; this delay
    # gives time for all 3PC messages to be delivered
    slow_node.nodeIbStasher.delay(cpDelay(delay_cp))

    # Count of `getCatchupReqs` which is called to construct the `CatchupReq`
    # to be sent
    def domain_cr_count():
        return sum(1 for entry in
                   slow_node.ledgerManager.spylog.getAll(
                       slow_node.ledgerManager.getCatchupReqs)
                   if entry.params['consProof'].ledgerId == DOMAIN_LEDGER_ID)

    old_count = domain_cr_count()
    sent_batches = 10
    send_reqs_batches_and_get_suff_replies(looper, wallet1, client1,
                                           2 * sent_batches, sent_batches)
    ensure_view_change(looper, nodes=txnPoolNodeSet)

    # After view change, the `slow_node` is behind
    waitNodeDataInequality(looper, slow_node, *other_nodes)

    # Unstash only COMMIT messages
    slow_node.nodeIbStasher.reset_delays_and_process_delayeds(Commit.__name__)

    looper.runFor(2)

    slow_node.nodeIbStasher.reset_delays_and_process_delayeds(
        ConsistencyProof.__name__)

    waitNodeDataEquality(looper, slow_node, *other_nodes)

    # No `CatchupReq`s constructed, hence no `CatchupReq`s could have
    # been sent
    assert domain_cr_count() == old_count
    # Some stashed ordered requests have been processed
    rv = getAllReturnVals(slow_node, slow_node.processStashedOrderedReqs)
    assert sent_batches in rv

    ensure_pool_functional(looper, txnPoolNodeSet, wallet1, client1)
def test_repeated_request_not_processed_if_already_ordered(
        looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client):

    delta = txnPoolNodeSet[3]
    initial_ledger_size = delta.domainLedger.size

    one_req = sdk_signed_random_requests(looper, sdk_wallet_client, 1)
    sdk_send_and_check(one_req, looper, txnPoolNodeSet, sdk_pool_handle)

    sdk_send_signed_requests(sdk_pool_handle, one_req)
    looper.runFor(waits.expectedTransactionExecutionTime(len(txnPoolNodeSet)))

    for node in txnPoolNodeSet:
        assert node.domainLedger.size - initial_ledger_size == 1
def test_belated_propagate_not_processed_if_already_ordered(
        looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client):

    delta = txnPoolNodeSet[3]
    initial_ledger_size = delta.domainLedger.size
    delta.nodeIbStasher.delay(ppgDelay(300, 'Gamma'))
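    # (ppgDelay(300, 'Gamma') is taken here to hold back the PROPAGATE coming
    # from Gamma to Delta, so that it arrives only after the request has
    # already been ordered.)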

    one_req = sdk_signed_random_requests(looper, sdk_wallet_client, 1)
    sdk_send_and_check(one_req, looper, txnPoolNodeSet, sdk_pool_handle)

    delta.nodeIbStasher.reset_delays_and_process_delayeds()
    looper.runFor(waits.expectedTransactionExecutionTime(len(txnPoolNodeSet)))

    for node in txnPoolNodeSet:
        assert node.domainLedger.size - initial_ledger_size == 1
    def tear():
        # Repair any broken network
        for node in txnPoolNodeSet:
            node.reset_delays_and_process_delayeds()
        # Give a little time to process any delayed messages
        looper.runFor(3)

        # Check each node has same data
        ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)

        # Check each node has ordered all requests (no catchup)
        assert check_if_all_equal_in_list([n.master_replica.ordered
                                           for n in txnPoolNodeSet])

        # Check the network is functional since all nodes reply
        send_reqs_to_nodes_and_verify_all_replies(looper, wallet1, client1, 5)
def test_belated_request_not_processed_after_view_change(
        looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client):

    delta = txnPoolNodeSet[3]
    initial_ledger_size = delta.domainLedger.size
    delta.clientIbStasher.delay(req_delay(300))

    one_req = sdk_signed_random_requests(looper, sdk_wallet_client, 1)
    sdk_send_and_check(one_req, looper, txnPoolNodeSet, sdk_pool_handle)

    ensure_view_change(looper, txnPoolNodeSet)
    ensureElectionsDone(looper, txnPoolNodeSet)

    delta.clientIbStasher.reset_delays_and_process_delayeds()
    looper.runFor(waits.expectedTransactionExecutionTime(len(txnPoolNodeSet)))

    for node in txnPoolNodeSet:
        assert node.domainLedger.size - initial_ledger_size == 1
def testZStackNodeReconnection(tconf, looper, txnPoolNodeSet, client1, wallet1,
                               tdir, client1Connected):
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 1)

    npr = [n for n in txnPoolNodeSet if not n.hasPrimary]
    nodeToCrash = npr[0]
    idxToCrash = txnPoolNodeSet.index(nodeToCrash)
    otherNodes = [_ for _ in txnPoolNodeSet if _ != nodeToCrash]

    def checkFlakyConnected(conn=True):
        for node in otherNodes:
            if conn:
                assert nodeToCrash.nodestack.name in node.nodestack.connecteds
            else:
                assert nodeToCrash.nodestack.name not in node.nodestack.connecteds

    checkFlakyConnected(True)
    nodeToCrash.stop()
    logger.debug('Stopped node {}'.format(nodeToCrash))
    looper.removeProdable(nodeToCrash)
    looper.runFor(1)
    stopNodes([nodeToCrash], looper)
    # TODO Select or create the timeout from 'waits'. Don't use constant.
    looper.run(eventually(checkFlakyConnected, False, retryWait=1, timeout=60))

    looper.runFor(1)
    config_helper = PNodeConfigHelper(nodeToCrash.name, tconf, chroot=tdir)
    node = TestNode(nodeToCrash.name,
                    ledger_dir=config_helper.ledger_dir,
                    keys_dir=config_helper.keys_dir,
                    genesis_dir=config_helper.genesis_dir,
                    plugins_dir=config_helper.plugins_dir,
                    config=tconf,
                    ha=nodeToCrash.nodestack.ha,
                    cliha=nodeToCrash.clientstack.ha)
    looper.add(node)
    txnPoolNodeSet[idxToCrash] = node

    # TODO Select or create the timeout from 'waits'. Don't use constant.
    looper.run(eventually(checkFlakyConnected, True, retryWait=2, timeout=50))
    ensureElectionsDone(looper, txnPoolNodeSet, retryWait=2)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)

    send_reqs_to_nodes_and_verify_all_replies(looper, wallet1, client1, 10)
def restart_nodes(looper,
                  nodeSet,
                  restart_set,
                  tconf,
                  tdir,
                  allPluginsPath,
                  after_restart_timeout=None,
                  per_add_timeout=None):
    for node_to_stop in restart_set:
        node_to_stop.cleanupOnStopping = True
        node_to_stop.stop()
        looper.removeProdable(node_to_stop)

    rest_nodes = [n for n in nodeSet if n not in restart_set]
    for node_to_stop in restart_set:
        ensure_node_disconnected(looper, node_to_stop, nodeSet, timeout=2)

    if after_restart_timeout:
        looper.runFor(after_restart_timeout)

    for node_to_restart in restart_set:
        config_helper = PNodeConfigHelper(node_to_restart.name,
                                          tconf,
                                          chroot=tdir)
        restarted_node = TestNode(node_to_restart.name,
                                  config_helper=config_helper,
                                  config=tconf,
                                  pluginPaths=allPluginsPath,
                                  ha=node_to_restart.nodestack.ha,
                                  cliha=node_to_restart.clientstack.ha)
        looper.add(restarted_node)
        idx = nodeSet.index(node_to_restart)
        nodeSet[idx] = restarted_node
        if per_add_timeout:
            looper.run(
                checkNodesConnected(rest_nodes + [restarted_node],
                                    customTimeout=per_add_timeout))
        rest_nodes += [restarted_node]

    if not per_add_timeout:
        looper.run(
            checkNodesConnected(nodeSet, customTimeout=after_restart_timeout))
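

# A possible call site for the helper above (the fixtures and the timeout are
# the usual plenum test ones, shown purely as an illustration):
#
#   restart_nodes(looper, txnPoolNodeSet, txnPoolNodeSet[-2:], tconf, tdir,
#                 allPluginsPath,
#                 per_add_timeout=waits.expectedPoolInterconnectionTime(
#                     len(txnPoolNodeSet)))

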
def test_belated_request_not_processed_if_already_in_3pc_process(
        looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client):

    delta = txnPoolNodeSet[3]
    initial_ledger_size = delta.domainLedger.size
    delta.clientIbStasher.delay(req_delay(300))
    for node in txnPoolNodeSet:
        node.nodeIbStasher.delay(cDelay(300))

    one_req = sdk_signed_random_requests(looper, sdk_wallet_client, 1)
    sdk_send_signed_requests(sdk_pool_handle, one_req)
    looper.runFor(
        waits.expectedPropagateTime(len(txnPoolNodeSet)) +
        waits.expectedPrePrepareTime(len(txnPoolNodeSet)) +
        waits.expectedPrepareTime(len(txnPoolNodeSet)) +
        waits.expectedCommittedTime(len(txnPoolNodeSet)))

    delta.clientIbStasher.reset_delays_and_process_delayeds()
    looper.runFor(
        waits.expectedPropagateTime(len(txnPoolNodeSet)) +
        waits.expectedPrePrepareTime(len(txnPoolNodeSet)) +
        waits.expectedPrepareTime(len(txnPoolNodeSet)) +
        waits.expectedCommittedTime(len(txnPoolNodeSet)))

    for node in txnPoolNodeSet:
        node.nodeIbStasher.reset_delays_and_process_delayeds()
    looper.runFor(waits.expectedOrderingTime(delta.replicas.num_replicas))

    for node in txnPoolNodeSet:
        assert node.domainLedger.size - initial_ledger_size == 1
def test_pool_reaches_quorum_after_f_plus_2_nodes_turned_off_and_later_on(
        looper, allPluginsPath, tdir, tconf, txnPoolNodeSet, wallet1, client1,
        client1Connected):

    nodes = txnPoolNodeSet
    initial_view_no = nodes[0].viewNo

    request = sendRandomRequest(wallet1, client1)
    waitForSufficientRepliesForRequests(looper, client1, requests=[request])

    stop_node(nodes[0], looper, nodes)
    waitForViewChange(looper, nodes[1:], expectedViewNo=initial_view_no + 1)
    ensureElectionsDone(looper,
                        nodes[1:],
                        numInstances=getRequiredInstances(nodeCount))

    request = sendRandomRequest(wallet1, client1)
    waitForSufficientRepliesForRequests(looper, client1, requests=[request])

    stop_node(nodes[1], looper, nodes)
    looper.runFor(tconf.ToleratePrimaryDisconnection + 2)
    checkViewNoForNodes(nodes[2:], initial_view_no + 1)

    request = sendRandomRequest(wallet1, client1)
    verify_request_not_replied_and_not_ordered(request, looper, client1, nodes)

    stop_node(nodes[2], looper, nodes)
    looper.runFor(tconf.ToleratePrimaryDisconnection + 2)
    checkViewNoForNodes(nodes[3:], initial_view_no + 1)

    request = sendRandomRequest(wallet1, client1)
    verify_request_not_replied_and_not_ordered(request, looper, client1, nodes)

    nodes[2] = start_stopped_node(nodes[2], looper, tconf, tdir,
                                  allPluginsPath)
    looper.runFor(waits.expectedPoolElectionTimeout(len(nodes)))

    request = sendRandomRequest(wallet1, client1)
    verify_request_not_replied_and_not_ordered(request, looper, client1, nodes)

    nodes[1] = start_stopped_node(nodes[1], looper, tconf, tdir,
                                  allPluginsPath)
    ensureElectionsDone(looper,
                        nodes[1:],
                        numInstances=getRequiredInstances(nodeCount))
    waitForViewChange(looper, nodes[1:], expectedViewNo=initial_view_no + 1)

    request = sendRandomRequest(wallet1, client1)
    waitForSufficientRepliesForRequests(looper, client1, requests=[request])

    nodes[0] = start_stopped_node(nodes[0], looper, tconf, tdir,
                                  allPluginsPath)
    ensureElectionsDone(looper,
                        nodes,
                        numInstances=getRequiredInstances(nodeCount))
    waitForViewChange(looper, nodes, expectedViewNo=initial_view_no + 1)

    request = sendRandomRequest(wallet1, client1)
    waitForSufficientRepliesForRequests(looper, client1, requests=[request])
def testPostingThroughput(postingStatsEnabled, decreasedMonitoringTimeouts,
                          looper, txnPoolNodeSet, sdk_wallet_client,
                          sdk_pool_handle):
    """
    The throughput after `DashboardUpdateFreq` seconds and before sending any
    requests should be zero.
    Send `n` requests in less than `ThroughputWindowSize` seconds and the
    throughput till `ThroughputWindowSize` should consider those `n` requests.
    After `ThroughputWindowSize` seconds the throughput should be zero.
    Test `totalRequests` too.
    """

    config = decreasedMonitoringTimeouts

    # Sleep for the full window size to clear any previously recorded values,
    # which are kept for this long in the tests
    looper.runFor(config.ThroughputWindowSize)

    reqCount = 10
    for node in txnPoolNodeSet:
        assert node.monitor.highResThroughput == 0
        assert node.monitor.totalRequests == 0

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, reqCount)

    for node in txnPoolNodeSet:
        assert len(node.monitor.orderedRequestsInLast) == reqCount
        assert node.monitor.highResThroughput > 0
        assert node.monitor.totalRequests == reqCount
        # TODO: Add implementation to actually call firebase plugin
        # and test if firebase plugin is sending total request count
        # if node is primary

    looper.runFor(config.DashboardUpdateFreq)

    for node in txnPoolNodeSet:
        assert node.monitor.spylog.count(Monitor.sendThroughput.__name__) > 0

    # Run for latency window duration so that `orderedRequestsInLast`
    # becomes empty
    looper.runFor(config.ThroughputWindowSize)

    def chk():
        for node in txnPoolNodeSet:
            assert len(node.monitor.orderedRequestsInLast) == 0
            assert node.monitor.highResThroughput == 0
            assert node.monitor.totalRequests == reqCount

    timeout = config.ThroughputWindowSize
    looper.run(eventually(chk, retryWait=1, timeout=timeout))
def testPostingLatency(postingStatsEnabled, decreasedMonitoringTimeouts,
                       looper, txnPoolNodeSet, sdk_wallet_client,
                       sdk_pool_handle):
    """
    The latencies (master as well as average of backups) after
    `DashboardUpdateFreq` seconds and before sending any requests should be zero.
    Send `n` requests in less than `LatencyWindowSize` seconds and the
    latency till `LatencyWindowSize` should consider those `n` requests.
    After `LatencyWindowSize` seconds the latencies should be zero.
    """

    config = decreasedMonitoringTimeouts

    # Run for latency window duration so that `latenciesByMasterInLast` and
    # `latenciesByBackupsInLast` become empty
    looper.runFor(config.LatencyWindowSize)
    reqCount = 10
    for node in txnPoolNodeSet:
        assert node.monitor.masterLatency == 0
        assert node.monitor.avgBackupLatency == 0

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, reqCount)

    for node in txnPoolNodeSet:
        assert node.monitor.masterLatency > 0
        assert node.monitor.avgBackupLatency > 0

    looper.runFor(config.DashboardUpdateFreq)

    for node in txnPoolNodeSet:
        assert node.monitor.spylog.count(Monitor.sendLatencies.__name__) > 0

    # Run for latency window duration so that `latenciesByMasterInLast` and
    # `latenciesByBackupsInLast` become empty
    looper.runFor(config.LatencyWindowSize)

    def chk():
        for node in txnPoolNodeSet:
            assert node.monitor.masterLatency == 0
            assert node.monitor.avgBackupLatency == 0

    timeout = config.LatencyWindowSize
    looper.run(eventually(chk, retryWait=1, timeout=timeout))
def test_quorum_after_f_plus_2_nodes_but_not_primary_turned_off_and_later_on(
        looper, allPluginsPath, tdir, tconf, txnPoolNodeSet, wallet1, client1,
        client1Connected):

    nodes = txnPoolNodeSet
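    # With the 5-node pool this test is written for, f = 1, so ordering needs
    # a quorum of n - f = 4 nodes; once f + 2 = 3 nodes are stopped only 2
    # remain and requests cannot be ordered until enough nodes come back.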

    request1 = sendRandomRequest(wallet1, client1)
    waitForSufficientRepliesForRequests(looper, client1, requests=[request1])

    stop_node(nodes[4], looper, nodes)
    looper.runFor(tconf.ToleratePrimaryDisconnection +
                  waits.expectedPoolElectionTimeout(len(nodes)))
    checkViewNoForNodes(nodes[:4], expectedViewNo=0)

    request2 = sendRandomRequest(wallet1, client1)
    waitForSufficientRepliesForRequests(looper, client1, requests=[request2])

    stop_node(nodes[3], looper, nodes)
    looper.runFor(tconf.ToleratePrimaryDisconnection +
                  waits.expectedPoolElectionTimeout(len(nodes)))
    checkViewNoForNodes(nodes[:3], expectedViewNo=0)

    request3 = sendRandomRequest(wallet1, client1)
    verify_request_not_replied_and_not_ordered(request3, looper, client1,
                                               nodes)

    stop_node(nodes[2], looper, nodes)
    looper.runFor(tconf.ToleratePrimaryDisconnection +
                  waits.expectedPoolElectionTimeout(len(nodes)))
    checkViewNoForNodes(nodes[:2], expectedViewNo=0)

    request4 = sendRandomRequest(wallet1, client1)
    verify_request_not_replied_and_not_ordered(request4, looper, client1,
                                               nodes)

    nodes[4] = start_stopped_node(nodes[4], looper, tconf, tdir,
                                  allPluginsPath)
    looper.runFor(waits.expectedPoolElectionTimeout(len(nodes)))
    checkViewNoForNodes(nodes[:2] + nodes[4:], expectedViewNo=0)

    request5 = sendRandomRequest(wallet1, client1)
    verify_request_not_replied_and_not_ordered(request5, looper, client1,
                                               nodes)

    nodes[3] = start_stopped_node(nodes[3], looper, tconf, tdir,
                                  allPluginsPath)
    ensureElectionsDone(looper,
                        nodes[:2] + nodes[3:],
                        numInstances=getRequiredInstances(nodeCount))
    checkViewNoForNodes(nodes[:2] + nodes[3:], expectedViewNo=0)

    request6 = sendRandomRequest(wallet1, client1)
    waitForSufficientRepliesForRequests(
        looper, client1, requests=[request3, request4, request5, request6])

    nodes[2] = start_stopped_node(nodes[2], looper, tconf, tdir,
                                  allPluginsPath)
    ensureElectionsDone(looper,
                        nodes,
                        numInstances=getRequiredInstances(nodeCount))
    checkViewNoForNodes(nodes, expectedViewNo=0)

    request7 = sendRandomRequest(wallet1, client1)
    waitForSufficientRepliesForRequests(looper, client1, requests=[request7])
def test_view_change_after_back_to_quorum_with_disconnected_primary(
        txnPoolNodeSet, looper, sdk_pool_handle, sdk_wallet_client, tdir,
        tconf, allPluginsPath):
    assert len(txnPoolNodeSet) == 4

    pr_node = get_master_primary_node(txnPoolNodeSet)
    assert pr_node.name == "Alpha"

    # 1. Initiate a view change by restarting the primary (Alpha)
    nodes = ensure_view_change_by_primary_restart(looper,
                                                  txnPoolNodeSet,
                                                  tconf,
                                                  tdir,
                                                  allPluginsPath,
                                                  customTimeout=2 *
                                                  tconf.VIEW_CHANGE_TIMEOUT)

    # Now primary should be Beta
    pr_node = get_master_primary_node(nodes)
    assert pr_node.name == "Beta"

    # 2. Stop the non-primary node Delta; no view changes are expected
    non_primary_to_stop = [n for n in nodes if n.name == "Delta"][0]
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet,
                                            non_primary_to_stop)
    looper.removeProdable(non_primary_to_stop)

    remaining_nodes = list(set(nodes) - {non_primary_to_stop})
    # The primary is going to be stopped; remember the InstanceChange message
    # count to ensure that no view change happens while the number of
    # connected nodes is below the quorum.
    ic_cnt = {}
    for n in remaining_nodes:
        ic_cnt[n.name] = n.view_changer.spylog.count(
            ViewChanger.sendInstanceChange.__name__)

    # 3. Disconnect primary
    disconnect_node_and_ensure_disconnected(looper, remaining_nodes, pr_node)
    looper.removeProdable(pr_node)

    # Wait for more than the ToleratePrimaryDisconnection timeout and check that no IC messages were sent.
    looper.runFor(tconf.ToleratePrimaryDisconnection + 5)
    remaining_nodes = list(set(remaining_nodes) - {pr_node})
    for n in remaining_nodes:
        assert ic_cnt[n.name] == n.view_changer.spylog.count(
            ViewChanger.sendInstanceChange.__name__)

    view_no = checkViewNoForNodes(remaining_nodes)

    # 4. Start Delta (non-primary); now the primary (Beta) is disconnected,
    # but there is a quorum to choose a new one.
    restartedNode = start_stopped_node(non_primary_to_stop,
                                       looper,
                                       tconf,
                                       tdir,
                                       allPluginsPath,
                                       delay_instance_change_msgs=False)
    remaining_nodes = remaining_nodes + [restartedNode]

    # 5. Check that view change happened.
    waitForViewChange(looper,
                      remaining_nodes,
                      expectedViewNo=(view_no + 1),
                      customTimeout=2 * tconf.VIEW_CHANGE_TIMEOUT)

    # ensure pool is working properly
    sdk_send_random_and_check(looper, remaining_nodes, sdk_pool_handle,
                              sdk_wallet_client, 3)
    ensure_all_nodes_have_same_data(looper, nodes=remaining_nodes)
def test_node_detecting_lag_from_view_change_done_messages(
        txnPoolNodeSet, looper, wallet1, client1, client1Connected, tconf):
    """
    A node is slow and after view change starts, it marks it's `last_prepared`
    to less than others, after catchup it does not get any txns from others
    and finds it has already ordered it's `last_prepared`, but when
    it gets ViewChangeDone messages, it starts catchup again and this
    time gets the txns. To achieve this delay all 3PC messages to a node so
    before view change it has different last_prepared from others.
    Also delay processing of COMMITs and INSTANCE_CHANGEs by other nodes
    """
    send_reqs_batches_and_get_suff_replies(looper, wallet1, client1, 2 * 3, 3)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)

    slow_node = getNonPrimaryReplicas(txnPoolNodeSet, 0)[-1].node
    fast_nodes = [n for n in txnPoolNodeSet if n != slow_node]
    slow_master_replica = slow_node.master_replica
    fast_master_replicas = [n.master_replica for n in fast_nodes]

    delay_3pc = 50
    delay_ic = tconf.PerfCheckFreq + 5
    delay_commit = delay_ic + 10
    delay_3pc_messages([slow_node], 0, delay_3pc)
    for n in fast_nodes:
        n.nodeIbStasher.delay(icDelay(delay_ic))
        n.nodeIbStasher.delay(cDelay(delay_commit))
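    # The delays are staggered so that the other nodes see INSTANCE_CHANGEs
    # before their delayed COMMITs, while all 3PC traffic to the slow node is
    # held back; chk1 and chk2 below verify the resulting last_prepared gap.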

    reqs = []
    for i in range(10):
        reqs = reqs + sendRandomRequests(wallet1, client1, 2)
        looper.runFor(.2)

    def chk1():
        for rep in fast_master_replicas:
            assert compare_3PC_keys(
                slow_master_replica.last_prepared_certificate_in_view(),
                rep.last_prepared_certificate_in_view()) > 0
            assert slow_master_replica.last_ordered_3pc == rep.last_ordered_3pc

    looper.run(eventually(chk1))

    no_more_catchup_count = get_count(slow_node,
                                      slow_node.no_more_catchups_needed)

    # Track last prepared for master replica of each node
    prepareds = {}
    orig_methods = {}
    for node in txnPoolNodeSet:
        orig_methods[node.name] = node.master_replica.on_view_change_start

        def patched_on_view_change_start(self):
            orig_methods[self.node.name]()
            prepareds[self.node.name] = self.last_prepared_before_view_change

        node.master_replica.on_view_change_start = types.MethodType(
            patched_on_view_change_start, node.master_replica)

    ensure_view_change(looper, txnPoolNodeSet, exclude_from_check=fast_nodes)

    def chk2():
        # last_prepared of slow_node is less than fast_nodes
        for rep in fast_master_replicas:
            assert compare_3PC_keys(prepareds[slow_master_replica.node.name],
                                    prepareds[rep.node.name]) > 0

    looper.run(eventually(chk2, timeout=delay_ic + 5))

    last_start_catchup_call_at = None
    no_more_catchup_call_at = None

    def chk3():
        # no_more_catchups_needed was called since the node found no need for catchup
        nonlocal last_start_catchup_call_at, no_more_catchup_call_at
        assert (get_count(slow_node, slow_node.no_more_catchups_needed) -
                no_more_catchup_count) > 0

        no_more_catchup_call_at = slow_node.spylog.getLast(
            slow_node.no_more_catchups_needed).starttime
        last_start_catchup_call_at = slow_node.spylog.getLast(
            slow_node.start_catchup).starttime

    looper.run(eventually(chk3, timeout=delay_commit))

    for n in fast_nodes:
        n.nodeIbStasher.reset_delays_and_process_delayeds()
        n.nodeIbStasher.reset_delays_and_process_delayeds()

    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)

    assert slow_node.spylog.getLast(
        slow_node.start_catchup).starttime > no_more_catchup_call_at
    assert slow_node.spylog.getLast(
        slow_node.start_catchup).starttime > last_start_catchup_call_at
def testProtocolInstanceCannotBecomeActiveWithLessThanFourServers(
        tconf_for_func, tdir_for_func):
    """
    A protocol instance cannot become active until enough nodes are up.
    The status of the nodes will change from `starting` to `started` only
    after the minimum required number of nodes has been added back to the
    system.
    """
    nodeCount = 13
    f = 4
    minimumNodesToBeUp = nodeCount - f
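    # With nodeCount = 13 and f = 4, the instance needs nodeCount - f = 9
    # nodes up before the pool can leave the `starting` status.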

    nodeNames = genNodeNames(nodeCount)
    with TestNodeSet(tconf_for_func, names=nodeNames,
                     tmpdir=tdir_for_func) as nodeSet:
        with Looper(nodeSet) as looper:

            # helpers

            def genExpectedStates(connecteds: Iterable[str]):
                return {
                    nn: CONNECTED if nn in connecteds else JOINED_NOT_ALLOWED
                    for nn in nodeNames
                }

            def checkNodeStatusRemotesAndF(expectedStatus: Status,
                                           nodeIdx: int):
                for node in nodeSet.nodes.values():
                    checkNodeRemotes(
                        node, genExpectedStates(nodeNames[:nodeIdx + 1]))
                    assert node.status == expectedStatus

            def addNodeBackAndCheck(nodeIdx: int, expectedStatus: Status):
                logger.info("Add back the {} node and see status of {}".format(
                    ordinal(nodeIdx + 1), expectedStatus))
                addNodeBack(nodeSet, looper, nodeNames[nodeIdx])

                timeout = waits.expectedNodeStartUpTimeout() + \
                          waits.expectedPoolInterconnectionTime(len(nodeSet))
                # TODO: Probably it's better to modify waits.* functions
                timeout *= 1.5
                looper.run(
                    eventually(checkNodeStatusRemotesAndF,
                               expectedStatus,
                               nodeIdx,
                               retryWait=1,
                               timeout=timeout))

            logger.debug("Sharing keys")
            looper.run(checkNodesConnected(nodeSet))

            logger.debug("Remove all the nodes")
            for n in nodeNames:
                looper.removeProdable(nodeSet.nodes[n])
                nodeSet.removeNode(n)

            looper.runFor(10)

            logger.debug("Add nodes back one at a time")
            for i in range(nodeCount):
                nodes = i + 1
                if nodes < minimumNodesToBeUp:
                    expectedStatus = Status.starting
                elif nodes < nodeCount:
                    expectedStatus = Status.started_hungry
                else:
                    expectedStatus = Status.started
                addNodeBackAndCheck(i, expectedStatus)