Example #1
def test_add_as_many_pending_nodes(network, args):
    # Killing pending nodes should not change the raft consensus rules
    primary, _ = network.find_primary()
    number_new_nodes = len(network.nodes)
    LOG.info(
        f"Adding {number_new_nodes} pending nodes - consensus rules should not change"
    )

    new_nodes = []
    for _ in range(number_new_nodes):
        new_node = network.create_node("local://localhost")
        network.join_node(new_node, args.package, args, from_snapshot=False)
        new_nodes.append(new_node)

    for new_node in new_nodes:
        new_node.stop()

    # Even though pending nodes (half the number of nodes) are stopped,
    # service can still make progress
    check_can_progress(primary)

    # Cleanup killed pending nodes
    for new_node in new_nodes:
        network.retire_node(primary, new_node)

    wait_for_reconfiguration_to_complete(network)

    return network
Example #2
def test_retire_backup(network, args):
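    # Retire a backup via governance, stop it, and check that the service
    # still makes progress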
    primary, _ = network.find_primary()
    backup_to_retire = network.find_any_backup()
    network.consortium.retire_node(primary, backup_to_retire)
    backup_to_retire.stop()
    check_can_progress(primary)
    return network
Example #3
def test_add_as_many_pending_nodes(network, args):
    # Should not change the raft consensus rules (i.e. majority)
    primary, _ = network.find_primary()
    number_new_nodes = len(network.nodes)
    LOG.info(
        f"Adding {number_new_nodes} pending nodes - consensus rules should not change"
    )

    new_nodes = []
    for _ in range(number_new_nodes):
        new_node = network.create_node("local://localhost")
        network.join_node(new_node, args.package, args, from_snapshot=False)
        new_nodes.append(new_node)

    check_can_progress(primary)

    for new_node in new_nodes:
        network.retire_node(primary, new_node)

    wait_for_reconfiguration_to_complete(network)

    # Stop the retired nodes so they don't linger in the background and interfere
    # with subsequent tests
    for new_node in new_nodes:
        new_node.stop()

    return network
Example #4
def test_node_replacement(network, args):
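    # Replace a retired backup with a new node on the same address, then check
    # that the service can make progress when it depends on the replacement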
    primary, backups = network.find_nodes()

    node_to_replace = backups[-1]
    LOG.info(f"Retiring node {node_to_replace.local_node_id}")
    network.retire_node(primary, node_to_replace)
    node_to_replace.stop()
    check_can_progress(primary)

    LOG.info("Adding one node on same address as retired node")
    replacement_node = network.create_node(
        f"local://{node_to_replace.rpc_host}:{node_to_replace.rpc_port}",
        node_port=node_to_replace.node_port,
    )
    network.join_node(replacement_node, args.package, args, from_snapshot=False)
    network.trust_node(replacement_node, args)

    assert replacement_node.node_id != node_to_replace.node_id
    assert replacement_node.rpc_host == node_to_replace.rpc_host
    assert replacement_node.node_port == node_to_replace.node_port
    assert replacement_node.rpc_port == node_to_replace.rpc_port

    allowed_to_suspend_count = network.get_f() - len(network.get_stopped_nodes())
    backups_to_suspend = backups[:allowed_to_suspend_count]
    LOG.info(
        f"Suspending {len(backups_to_suspend)} other nodes to make progress depend on the replacement"
    )
    for other_backup in backups_to_suspend:
        other_backup.suspend()
    # Confirm the network can make progress
    check_can_progress(primary)
    for other_backup in backups_to_suspend:
        other_backup.resume()

    return network
Example #5
def test_retire_backup(network, args):
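    # Retiring a single backup (and stopping it) should not prevent the
    # service from making progress or completing the reconfiguration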
    primary, _ = network.find_primary()
    backup_to_retire = network.find_any_backup()
    network.retire_node(primary, backup_to_retire)
    backup_to_retire.stop()
    check_can_progress(primary)
    wait_for_reconfiguration_to_complete(network)
    return network
Example #6
def test_suspend_primary(network, args):
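    # Suspending the primary should trigger an election; the service should
    # make progress under the new primary, both while the old primary is
    # suspended and after it resumes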
    primary, _ = network.find_primary()
    primary.suspend()
    new_primary, _ = network.wait_for_new_primary(primary)
    check_can_progress(new_primary)
    primary.resume()
    check_can_progress(new_primary)
    return network
Example #7
def test_retire_primary(network, args):
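    # Retiring the primary should trigger an election and remove exactly one
    # node from the network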
    pre_count = count_nodes(node_configs(network), network)

    primary, backup = network.find_primary_and_any_backup()
    network.consortium.retire_node(primary, primary)
    network.wait_for_new_primary(primary)
    check_can_progress(backup)
    network.nodes.remove(primary)
    post_count = count_nodes(node_configs(network), network)
    assert pre_count == post_count + 1
    primary.stop()
    return network
Example #8
def test_new_joiner_helps_liveness(network, args):
    primary, backups = network.find_nodes()

    # Issue some transactions, so there is a ledger history that a new node must receive
    network.txs.issue(network, number_txs=10)

    # Remove a node, leaving the network frail
    network.retire_node(primary, backups[-1])
    backups[-1].stop()

    primary, backups = network.find_nodes()

    with contextlib.ExitStack() as stack:
        # Add a new node, but partition them before trusting them
        new_node = network.create_node("local://localhost")
        network.join_node(new_node, args.package, args, from_snapshot=False)
        new_joiner_partition = [new_node]
        new_joiner_rules = stack.enter_context(
            network.partitioner.partition([primary, *backups],
                                          new_joiner_partition))

        # Trust the new node, and wait for commit of this (but don't ask the new node itself, which doesn't know this yet)
        network.trust_node(new_node, args, no_wait=True)
        check_can_progress(primary)

        # Partition the primary, temporarily creating a minority service that cannot make progress
        minority_partition = backups[len(backups) // 2:] + new_joiner_partition
        minority_rules = stack.enter_context(
            network.partitioner.partition(minority_partition))
        # This is an unusual situation, where we've actually produced a dead partitioned node.
        # Initially any write requests will timeout (failed attempt at forwarding), and then
        # the node transitions to a candidate with nobody to talk to. Rather than trying to
        # catch the errors of these states quickly, we just sleep until the latter state is
        # reached, and then confirm it was reached.
        time.sleep(network.observed_election_duration)
        with backups[0].client("user0") as c:
            r = c.post("/app/log/private", {"id": 42, "msg": "Hello world"})
            assert r.status_code == http.HTTPStatus.SERVICE_UNAVAILABLE

        # Restore the new node to the service
        new_joiner_rules.drop()

        # Confirm that the new node catches up, and progress can be made in this majority partition
        network.wait_for_new_primary(primary, minority_partition)
        check_can_progress(new_node)

        # Explicitly drop rules before continuing
        minority_rules.drop()

        network.wait_for_primary_unanimity()
        primary, _ = network.find_nodes()
        network.wait_for_all_nodes_to_commit(primary=primary)
Example #9
def test_retire_backup(network, args):
    primary, _ = network.find_primary()
    backup_to_retire = network.find_any_backup()
    network.retire_node(primary, backup_to_retire)
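    # Wait for the retired node to be removed from the store (node_status=None)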
    network.wait_for_node_in_store(
        primary,
        backup_to_retire.node_id,
        node_status=None,
        timeout=3,
    )
    backup_to_retire.stop()
    check_can_progress(primary)
    wait_for_reconfiguration_to_complete(network)
    return network
Example #10
def test_retire_primary(network, args):
    pre_count = count_nodes(node_configs(network), network)

    primary, backup = network.find_primary_and_any_backup()
    network.retire_node(primary, primary, timeout=15)
    # Query this backup to find the new primary. If we ask any other
    # node, then this backup may not know the new primary by the
    # time we call check_can_progress.
    network.wait_for_new_primary(primary, nodes=[backup])
    check_can_progress(backup)
    post_count = count_nodes(node_configs(network), network)
    assert pre_count == post_count + 1
    primary.stop()
    wait_for_reconfiguration_to_complete(network)
    return network
Example #11
def test_learner_does_not_take_part(network, args):
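    # While a majority of backups is partitioned away, a new joiner stays in
    # the Learner state and cannot be promoted to Trusted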
    primary, backups = network.find_nodes()
    f_backups = backups[:network.get_f() + 1]

    new_node = network.create_node("local://localhost")
    network.join_node(new_node, args.package, args, from_snapshot=False)

    with network.partitioner.partition(f_backups):

        check_does_not_progress(primary, timeout=5)

        try:
            network.consortium.trust_node(
                primary,
                new_node.node_id,
                timeout=ceil(args.join_timer * 2 / 1000),
                valid_from=str(
                    infra.crypto.datetime_to_X509time(datetime.now())),
            )
            new_node.wait_for_node_to_join(timeout=ceil(args.join_timer * 2 /
                                                        1000))
            join_failed = False
        except Exception:
            join_failed = True

        if not join_failed:
            raise Exception("join succeeded unexpectedly")

        with new_node.client(self_signed_ok=True) as c:
            r = c.get("/node/network/nodes/self")
            assert r.body.json()["status"] == "Learner"
            r = c.get("/node/consensus")
            assert new_node.node_id in r.body.json()["details"]["learners"]

        # New node joins, but cannot be promoted to TRUSTED without f other backups

        check_does_not_progress(primary, timeout=5)

        with new_node.client(self_signed_ok=True) as c:
            r = c.get("/node/network/nodes/self")
            assert r.body.json()["status"] == "Learner"
            r = c.get("/node/consensus")
            assert new_node.node_id in r.body.json()["details"]["learners"]

    network.wait_for_primary_unanimity()
    primary, _ = network.find_nodes()
    network.wait_for_all_nodes_to_commit(primary=primary)
    check_can_progress(primary)
Example #12
def test_isolate_and_reconnect_primary(network, args, **kwargs):
    primary, backups = network.find_nodes()
    with network.partitioner.partition(backups):
        lost_tx_resp = check_does_not_progress(primary)

        new_primary, _ = network.wait_for_new_primary(primary,
                                                      nodes=backups,
                                                      timeout_multiplier=6)
        new_tx_resp = check_can_progress(new_primary)

    # Check reconnected former primary has caught up
    with primary.client() as c:
        try:
            # There will be at least one full election cycle for nothing, where the
            # re-joining node fails to get elected but causes others to rev up their
            # term. After that, a successful election needs to take place, and we
            # arbitrarily allow 3 time periods to avoid being too brittle when
            # raft timeouts line up badly.
            c.wait_for_commit(new_tx_resp,
                              timeout=(network.election_duration * 4))
        except TimeoutError:
            details = c.get("/node/consensus").body.json()
            assert (
                False
            ), f"Stuck before {new_tx_resp.view}.{new_tx_resp.seqno}: {pprint.pformat(details)}"

        # Check it has dropped anything submitted while partitioned
        r = c.get(
            f"/node/tx?transaction_id={lost_tx_resp.view}.{lost_tx_resp.seqno}"
        )
        status = TxStatus(r.body.json()["status"])
        assert status == TxStatus.Invalid, r
Example #13
def test_add_as_many_pending_nodes(network, args):
    # Should not change the raft consensus rules (i.e. majority)
    primary, _ = network.find_primary()
    number_new_nodes = len(network.nodes)
    LOG.info(
        f"Adding {number_new_nodes} pending nodes - consensus rules should not change"
    )

    new_nodes = [
        network.create_and_add_pending_node(
            args.package,
            "local://localhost",
            args,
        ) for _ in range(number_new_nodes)
    ]
    check_can_progress(primary)

    for new_node in new_nodes:
        network.consortium.retire_node(primary, new_node)
        network.nodes.remove(new_node)
    return network
Example #14
def test_verify_quotes(network, args):
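    # Verify each joined node's quote against the service certificate using
    # the verify_quote.sh script (skipped for virtual enclaves)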
    if args.enclave_type == "virtual":
        LOG.warning("Skipping quote test with virtual enclave")
        return network

    LOG.info("Check the network is stable")
    primary, _ = network.find_primary()
    check_can_progress(primary)

    for node in network.get_joined_nodes():
        LOG.info(f"Verifying quote for node {node.node_id}")
        cafile = os.path.join(network.common_dir, "service_cert.pem")
        assert (infra.proc.ccall(
            "verify_quote.sh",
            f"https://{node.get_public_rpc_host()}:{node.get_public_rpc_port()}",
            "--cacert",
            f"{cafile}",
            log_output=True,
        ).returncode == 0
                ), f"Quote verification for node {node.node_id} failed"

    return network
Example #15
def test_node_replacement(network, args):
    primary, backups = network.find_nodes()

    nodes = network.get_joined_nodes()
    node_to_replace = backups[-1]
    f = infra.e2e_args.max_f(args, len(nodes))
    f_backups = backups[:f]

    # Retire one node
    network.consortium.retire_node(primary, node_to_replace)
    node_to_replace.stop()
    network.nodes.remove(node_to_replace)
    check_can_progress(primary)

    # Add in a node using the same address
    replacement_node = network.create_and_trust_node(
        args.package,
        f"local://{node_to_replace.host}:{node_to_replace.rpc_port}",
        args,
        node_port=node_to_replace.node_port,
        from_snapshot=False,
    )

    assert replacement_node.node_id != node_to_replace.node_id
    assert replacement_node.host == node_to_replace.host
    assert replacement_node.node_port == node_to_replace.node_port
    assert replacement_node.rpc_port == node_to_replace.rpc_port
    LOG.info(
        f"Stopping {len(f_backups)} other nodes to make progress depend on the replacement"
    )
    for other_backup in f_backups:
        other_backup.suspend()
    # Confirm the network can make progress
    check_can_progress(primary)
    for other_backup in f_backups:
        other_backup.resume()

    return network
Example #16
def test_retire_primary(network, args):
    pre_count = count_nodes(node_configs(network), network)

    primary, backup = network.find_primary_and_any_backup()
    network.retire_node(primary, primary, timeout=15)
    # Query this backup to find the new primary. If we ask any other
    # node, then this backup may not know the new primary by the
    # time we call check_can_progress.
    new_primary, _ = network.wait_for_new_primary(primary, nodes=[backup])
    # The old primary should automatically be removed from the store
    # once a new primary is elected
    network.wait_for_node_in_store(
        new_primary,
        primary.node_id,
        node_status=None,
        timeout=3,
    )
    check_can_progress(backup)
    post_count = count_nodes(node_configs(network), network)
    assert pre_count == post_count + 1
    primary.stop()
    wait_for_reconfiguration_to_complete(network)
    return network
Example #17
def test_isolate_and_reconnect_primary(network, args):
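    # Isolate the primary from all backups, wait for the backups to elect a
    # new primary, then check that the reconnected former primary catches up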
    primary, backups = network.find_nodes()
    with network.partitioner.partition(backups):
        new_primary, _ = network.wait_for_new_primary(
            primary, nodes=backups, timeout_multiplier=6
        )
        new_tx = check_can_progress(new_primary)

    # Check reconnected former primary has caught up
    with primary.client() as c:
        r = c.get("/node/commit")
        timeout = 5
        end_time = time.time() + timeout
        while time.time() < end_time:
            current_tx = TxID.from_str(
                c.get("/node/commit").body.json()["transaction_id"]
            )
            if current_tx.seqno >= new_tx.seqno:
                return network
            time.sleep(0.1)
        assert False, f"Stuck at {r}"
Example #18
def test_learner_does_not_take_part(network, args):
    primary, backups = network.find_nodes()
    f_backups = backups[:network.get_f() + 1]

    # Note: host is supplied explicitly to avoid having differently
    # assigned IPs for the interfaces, something which the test infra doesn't
    # support widely yet.
    operator_rpc_interface = "operator_rpc_interface"
    host = infra.net.expand_localhost()
    new_node = network.create_node(
        infra.interfaces.HostSpec(
            rpc_interfaces={
                infra.interfaces.PRIMARY_RPC_INTERFACE:
                infra.interfaces.RPCInterface(host=host),
                operator_rpc_interface:
                infra.interfaces.RPCInterface(
                    host=host,
                    endorsement=infra.interfaces.Endorsement(
                        authority=infra.interfaces.EndorsementAuthority.Node),
                ),
            }))
    network.join_node(new_node, args.package, args, from_snapshot=False)

    LOG.info("Wait for all nodes to have committed join of new pending node")
    network.wait_for_all_nodes_to_commit(primary=primary)

    # Here, we partition a majority of backups. This is very intentional so that
    # the new learner node is not promoted to trusted while the partition is up.
    # However, this means that the isolated majority of backups can (and will)
    # elect one of them as the new primary while the partition is up. When the partition
    # is lifted, all the transactions executed on the old primary node (including
    # trusting the new node) will be rolled back. Because of this, we issue a new
    # trust_node proposal to make sure the new node ends up being trusted and joins
    # successfully.
    with network.partitioner.partition(f_backups):

        check_does_not_progress(primary, timeout=5)

        try:
            network.consortium.trust_node(
                primary,
                new_node.node_id,
                timeout=ceil(args.join_timer_s * 2),
                valid_from=datetime.now(),
            )
        except TimeoutError:
            LOG.info("Trust node proposal did not commit as expected")
        else:
            raise Exception("Trust node proposal committed unexpectedly")

        check_does_not_progress(primary, timeout=5)

        LOG.info("Majority partition can make progress")
        partition_primary, _ = network.wait_for_new_primary(primary,
                                                            nodes=f_backups)
        check_can_progress(partition_primary)

        LOG.info(
            "New joiner is not promoted to Trusted without f other backups")
        with new_node.client(interface_name=operator_rpc_interface,
                             verify_ca=False) as c:
            r = c.get("/node/network/nodes/self")
            assert r.body.json()["status"] == "Learner"
            r = c.get("/node/consensus")
            assert new_node.node_id in r.body.json()["details"]["learners"]

    LOG.info(
        "Partition is lifted, wait for primary unanimity on original nodes")
    # Note: Because trusting the new node failed, the new node is not considered
    # in the primary unanimity. Indeed, its transition to Trusted may have been rolled back.
    primary = network.wait_for_primary_unanimity()
    network.wait_for_all_nodes_to_commit(primary=primary)

    LOG.info("Trust new joiner again")
    network.trust_node(new_node, args)

    check_can_progress(primary)
    check_can_progress(new_node)
Example #19
def test_update_all_nodes(network, args):
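    # Roll the service over to a new code version: allow the new code id,
    # retire the old one, start fresh nodes running the new package, then
    # retire the original nodes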
    replacement_package = get_replacement_package(args)

    primary, _ = network.find_nodes()

    first_code_id = infra.utils.get_code_id(args.enclave_type, args.oe_binary,
                                            args.package)
    new_code_id = infra.utils.get_code_id(args.enclave_type, args.oe_binary,
                                          replacement_package)

    if args.enclave_type == "virtual":
        # Pretend this was already present
        network.consortium.add_new_code(primary, first_code_id)

    LOG.info("Add new code id")
    network.consortium.add_new_code(primary, new_code_id)
    with primary.client() as uc:
        r = uc.get("/node/code")
        versions = sorted(r.body.json()["versions"], key=lambda x: x["digest"])
        expected = sorted(
            [
                {
                    "digest": first_code_id,
                    "status": "AllowedToJoin"
                },
                {
                    "digest": new_code_id,
                    "status": "AllowedToJoin"
                },
            ],
            key=lambda x: x["digest"],
        )
        assert versions == expected, versions

    LOG.info("Remove old code id")
    network.consortium.retire_code(primary, first_code_id)
    with primary.client() as uc:
        r = uc.get("/node/code")
        versions = sorted(r.body.json()["versions"], key=lambda x: x["digest"])
        expected = sorted(
            [
                {
                    "digest": new_code_id,
                    "status": "AllowedToJoin"
                },
            ],
            key=lambda x: x["digest"],
        )
        assert versions == expected, versions

    old_nodes = network.nodes.copy()

    LOG.info("Start fresh nodes running new code")
    for _ in range(0, len(old_nodes)):
        new_node = network.create_node("local://localhost")
        network.join_node(new_node, replacement_package, args)
        network.trust_node(new_node, args)

    LOG.info("Retire original nodes running old code")
    for node in old_nodes:
        primary, _ = network.find_nodes()
        network.retire_node(primary, node)
        # Retiring the primary triggers an election, which takes (much) longer
        # than removing a backup (a single commit), so explicitly wait for the
        # new primary in that case
        if node.node_id == primary.node_id:
            new_primary, _ = network.wait_for_new_primary(primary)
            primary = new_primary
        node.stop()

    LOG.info("Check the network is still functional")
    check_can_progress(new_node)
    return network