示例#1
0
文件: election.py 项目: slingerbv/CCF
def wait_for_seqno_to_commit(seqno, view, nodes):
    """
    Wait for a specific seqno at a specific view to be committed on all nodes.
    """
    for _ in range(infra.network.Network.replication_delay * 10):
        up_to_date_f = []
        logs = {}
        for f in nodes:
            with f.client() as c:
                logs[f.node_id] = []
                r = c.get(
                    f"/node/tx?view={view}&seqno={seqno}", log_capture=logs[f.node_id]
                )
                assert (
                    r.status_code == http.HTTPStatus.OK
                ), f"tx request returned HTTP status {r.status_code}"
                status = TxStatus(r.body.json()["status"])
                if status == TxStatus.Committed:
                    up_to_date_f.append(f.node_id)
                elif status == TxStatus.Invalid:
                    flush_info(logs[f.node_id], None, 0)
                    raise RuntimeError(
                        f"Node {f.node_id} reports transaction ID {view}.{seqno} is invalid and will never be committed"
                    )
                else:
                    pass
        if len(up_to_date_f) == len(nodes):
            break
        time.sleep(0.1)
    for lines in logs.values():
        flush_info(lines, None, 0)
    assert len(up_to_date_f) == len(
        nodes
    ), "Only {} out of {} nodes are up to date".format(len(up_to_date_f), len(nodes))
示例#2
0
def wait_for_commit(client, seqno: int, view: int, timeout: int = 3) -> None:
    """
    Waits for a specific seqno/view pair to be committed by the network,
    as per the node to which client is connected to.

    :param client: Instance of :py:class:`ccf.clients.CCFClient`
    :param int seqno: Transaction sequence number.
    :param int view: Consensus view.
    :param str timeout: Maximum time to wait for this seqno/view pair to be committed before giving up.

    A TimeoutError exception is raised if the commit index is not committed within the given timeout.
    """
    end_time = time.time() + timeout
    while time.time() < end_time:
        r = client.get(f"/node/tx?view={view}&seqno={seqno}")
        assert (r.status_code == http.HTTPStatus.OK
                ), f"tx request returned HTTP status {r.status_code}"
        assert isinstance(r.body, dict), "/node/tx should return a JSON object"
        status = TxStatus(r.body["status"])
        if status == TxStatus.Committed:
            return
        elif status == TxStatus.Invalid:
            raise RuntimeError(
                f"Transaction ID {view}.{seqno} is marked invalid and will never be committed"
            )
        else:
            time.sleep(0.1)
    raise TimeoutError("Timed out waiting for commit")
示例#3
0
def test_isolate_and_reconnect_primary(network, args, **kwargs):
    primary, backups = network.find_nodes()
    with network.partitioner.partition(backups):
        lost_tx_resp = check_does_not_progress(primary)

        new_primary, _ = network.wait_for_new_primary(primary,
                                                      nodes=backups,
                                                      timeout_multiplier=6)
        new_tx_resp = check_can_progress(new_primary)

    # Check reconnected former primary has caught up
    with primary.client() as c:
        try:
            # There will be at least one full election cycle for nothing, where the
            # re-joining node fails to get elected but causes others to rev up their
            # term. After that, a successful election needs to take place, and we
            # arbitrarily allow 3 time periods to avoid being too brittle when
            # raft timeouts line up badly.
            c.wait_for_commit(new_tx_resp,
                              timeout=(network.election_duration * 4))
        except TimeoutError:
            details = c.get("/node/consensus").body.json()
            assert (
                False
            ), f"Stuck before {new_tx_resp.view}.{new_tx_resp.seqno}: {pprint.pformat(details)}"

        # Check it has dropped anything submitted while partitioned
        r = c.get(
            f"/node/tx?transaction_id={lost_tx_resp.view}.{lost_tx_resp.seqno}"
        )
        status = TxStatus(r.body.json()["status"])
        assert status == TxStatus.Invalid, r
示例#4
0
def test_view_history(network, args):
    if args.consensus == "pbft":
        # This appears to work in PBFT, but it is unacceptably slow:
        # - Each /tx request is a write, with a non-trivial roundtrip response time
        # - Since each read (eg - /tx and /commit) has produced writes and a unique tx ID,
        #    there are too many IDs to test exhaustively
        # We could rectify this by making this test non-exhaustive (bisecting for view changes,
        # sampling within a view), but for now it is exhaustive and Raft-only
        LOG.warning("Skipping view reconstruction in PBFT")
        return network

    check = infra.checker.Checker()

    for node in network.get_joined_nodes():
        with node.client("user0") as c:
            r = c.get("/node/commit")
            check(c)

            commit_view = r.body["view"]
            commit_seqno = r.body["seqno"]

            # Retrieve status for all possible Tx IDs
            seqno_to_views = {}
            for seqno in range(1, commit_seqno + 1):
                views = []
                for view in range(1, commit_view + 1):
                    r = c.get(f"/node/tx?view={view}&seqno={seqno}")
                    check(r)
                    status = TxStatus(r.body["status"])
                    if status == TxStatus.Committed:
                        views.append(view)
                seqno_to_views[seqno] = views

            # Check we have exactly one Tx ID for each seqno
            txs_ok = True
            for seqno, views in seqno_to_views.items():
                if len(views) != 1:
                    txs_ok = False
                    LOG.error(
                        f"Node {node.node_id}: Found {len(views)} committed Tx IDs for seqno {seqno}"
                    )

            tx_ids_condensed = ", ".join(
                " OR ".join(f"{view}.{seqno}" for view in views or ["UNKNOWN"])
                for seqno, views in seqno_to_views.items())

            if txs_ok:
                LOG.success(
                    f"Node {node.node_id}: Found a valid sequence of Tx IDs:\n{tx_ids_condensed}"
                )
            else:
                LOG.error(
                    f"Node {node.node_id}: Invalid sequence of Tx IDs:\n{tx_ids_condensed}"
                )
                raise RuntimeError(
                    f"Node {node.node_id}: Incomplete or inconsistent view history"
                )

    return network
示例#5
0
    def wait_for_all_nodes_to_commit(self, primary=None, tx_id=None, timeout=10):
        """
        Wait for all nodes to have joined the network and committed all transactions
        executed on the primary.
        """
        if not (primary or tx_id):
            raise ValueError("Either a valid TxID or primary node should be specified")

        end_time = time.time() + timeout

        # If no TxID is specified, retrieve latest readable one
        if tx_id == None:
            while time.time() < end_time:
                with primary.client() as c:
                    resp = c.get(
                        "/node/network/nodes/self"
                    )  # Well-known read-only endpoint
                    tx_id = TxID(resp.view, resp.seqno)
                    if tx_id.valid():
                        break
                time.sleep(0.1)
            assert (
                tx_id.valid()
            ), f"Primary {primary.node_id} has not made any progress yet ({tx_id})"

        caught_up_nodes = []
        logs = {}
        while time.time() < end_time:
            caught_up_nodes = []
            for node in self.get_joined_nodes():
                with node.client() as c:
                    logs[node.node_id] = []
                    resp = c.get(
                        f"/node/local_tx?transaction_id={tx_id}",
                        log_capture=logs[node.node_id],
                    )
                    if resp.status_code != 200:
                        # Node may not have joined the network yet, try again
                        break
                    status = TxStatus(resp.body.json()["status"])
                    if status == TxStatus.Committed:
                        caught_up_nodes.append(node)
                    elif status == TxStatus.Invalid:
                        flush_info(logs[node.node_id], None, 0)
                        raise RuntimeError(
                            f"Node {node.node_id} reports transaction ID {tx_id} is invalid and will never be committed"
                        )
                    else:
                        pass

            if len(caught_up_nodes) == len(self.get_joined_nodes()):
                break
            time.sleep(0.1)

        for lines in logs.values():
            flush_info(lines, None, 0)
        assert len(caught_up_nodes) == len(
            self.get_joined_nodes()
        ), f"Only {len(caught_up_nodes)} (out of {len(self.get_joined_nodes())}) nodes have joined the network"
示例#6
0
    def wait_for_all_nodes_to_catch_up(self, primary, timeout=10):
        """
        Wait for all nodes to have joined the network and globally replicated
        all transactions globally executed on the primary (including transactions
        which added the nodes).
        """
        end_time = time.time() + timeout
        while time.time() < end_time:
            with primary.client() as c:
                resp = c.get("/node/commit")
                body = resp.body.json()
                seqno = body["seqno"]
                view = body["view"]
                if seqno != 0:
                    break
            time.sleep(0.1)
        assert (
            seqno != 0
        ), f"Primary {primary.node_id} has not made any progress yet (view: {view}, seqno: {seqno})"

        caught_up_nodes = []
        while time.time() < end_time:
            caught_up_nodes = []
            for node in self.get_joined_nodes():
                with node.client() as c:
                    c.get("/node/commit")
                    resp = c.get(f"/node/local_tx?view={view}&seqno={seqno}")
                    if resp.status_code != 200:
                        # Node may not have joined the network yet, try again
                        break
                    status = TxStatus(resp.body.json()["status"])
                    if status == TxStatus.Committed:
                        caught_up_nodes.append(node)
                    elif status == TxStatus.Invalid:
                        raise RuntimeError(
                            f"Node {node.node_id} reports transaction ID {view}.{seqno} is invalid and will never be committed"
                        )
                    else:
                        pass

            if len(caught_up_nodes) == len(self.get_joined_nodes()):
                break
            time.sleep(0.1)
        assert len(caught_up_nodes) == len(
            self.get_joined_nodes()
        ), f"Only {len(caught_up_nodes)} (out of {len(self.get_joined_nodes())}) nodes have joined the network"
示例#7
0
def wait_for_commit(client,
                    seqno: int,
                    view: int,
                    timeout: int = 3,
                    log_capture: Optional[list] = None) -> None:
    """
    Waits for a specific seqno/view pair to be committed by the network,
    as per the node to which client is connected to.

    :param client: Instance of :py:class:`ccf.clients.CCFClient`
    :param int seqno: Transaction sequence number.
    :param int view: Consensus view.
    :param str timeout: Maximum time to wait for this seqno/view pair to be committed before giving up.
    :param list log_capture: Rather than emit to default handler, capture log lines to list (optional).

    A TimeoutError exception is raised if the commit index is not committed within the given timeout.
    """
    if view is None or seqno is None:
        raise ValueError(f"{view}.{seqno} is not a valid transaction ID")

    logs: List[str] = []
    end_time = time.time() + timeout
    while time.time() < end_time:
        logs = []
        r = client.get(f"/node/tx?transaction_id={view}.{seqno}",
                       log_capture=logs)
        assert (r.status_code == http.HTTPStatus.OK
                ), f"tx request returned HTTP status {r.status_code}"
        status = TxStatus(r.body.json()["status"])
        if status == TxStatus.Committed:
            flush_info(logs, log_capture, 1)
            return
        elif status == TxStatus.Invalid:
            raise RuntimeError(
                f"Transaction ID {view}.{seqno} is marked invalid and will never be committed"
            )
        else:
            time.sleep(0.1)
    flush_info(logs, log_capture, 1)
    raise TimeoutError(
        f'Timed out waiting for commit: {pprint.pformat(client.get("/node/consensus").body.json())}'
    )
示例#8
0
def wait_for_pending(client, view, seqno, timeout=3):
    end_time = time.time() + timeout
    while time.time() < end_time:
        r = client.get(f"/node/tx?transaction_id={view}.{seqno}")
        assert (r.status_code == http.HTTPStatus.OK
                ), f"tx request returned HTTP status {r.status_code}"
        status = TxStatus(r.body.json()["status"])
        if status == TxStatus.Pending:
            return
        elif status == TxStatus.Invalid:
            raise RuntimeError(
                f"Transaction ID {view}.{seqno} is marked invalid and will never be committed"
            )
        elif status == TxStatus.Committed:
            raise RuntimeError(
                f"Transaction ID {view}.{seqno} is unexpectedly marked committed"
            )
        else:
            time.sleep(0.1)
    raise TimeoutError("Timed out waiting for commit")
示例#9
0
def test_tx_statuses(network, args):
    primary, _ = network.find_primary()

    with primary.client("user0") as c:
        check = infra.checker.Checker()
        r = c.post("/app/log/private", {"id": 0, "msg": "Ignored"})
        check(r)
        # Until this tx is globally committed, poll for the status of this and some other
        # related transactions around it (and also any historical transactions we're tracking)
        target_view = r.view
        target_seqno = r.seqno
        SentTxs.update_status(target_view, target_seqno)
        SentTxs.update_status(target_view, target_seqno + 1)
        SentTxs.update_status(target_view - 1, target_seqno)

        end_time = time.time() + 10
        while True:
            if time.time() > end_time:
                raise TimeoutError(
                    f"Took too long waiting for global commit of {target_view}.{target_seqno}"
                )

            done = False
            for view, seqno in SentTxs.get_all_tx_ids():
                r = c.get(f"/node/tx?view={view}&seqno={seqno}")
                check(r)
                status = TxStatus(r.body.json()["status"])
                SentTxs.update_status(view, seqno, status)
                if (
                    status == TxStatus.Committed
                    and target_view == view
                    and target_seqno == seqno
                ):
                    done = True

            if done:
                break
            time.sleep(0.1)

    return network
示例#10
0
def test_view_history(network, args):
    if args.consensus == "bft":
        # This appears to work in BFT, but it is unacceptably slow:
        # - Each /tx request is a write, with a non-trivial roundtrip response time
        # - Since each read (eg - /tx and /commit) has produced writes and a unique tx ID,
        #    there are too many IDs to test exhaustively
        # We could rectify this by making this test non-exhaustive (bisecting for view changes,
        # sampling within a view), but for now it is exhaustive and Raft-only
        LOG.warning("Skipping view reconstruction in BFT")
        return network

    check = infra.checker.Checker()

    previous_node = None
    previous_tx_ids = ""
    for node in network.get_joined_nodes():
        with node.client("user0") as c:
            r = c.get("/node/commit")
            check(c)

            commit_tx_id = TxID.from_str(r.body.json()["transaction_id"])

            # Retrieve status for all possible Tx IDs
            seqno_to_views = {}
            for seqno in range(1, commit_tx_id.seqno + 1):
                views = []
                for view in range(1, commit_tx_id.view + 1):
                    r = c.get(f"/node/tx?transaction_id={view}.{seqno}", log_capture=[])
                    check(r)
                    status = TxStatus(r.body.json()["status"])
                    if status == TxStatus.Committed:
                        views.append(view)
                seqno_to_views[seqno] = views

            # Check we have exactly one Tx ID for each seqno
            txs_ok = True
            for seqno, views in seqno_to_views.items():
                if len(views) != 1:
                    txs_ok = False
                    LOG.error(
                        f"Node {node.node_id}: Found {len(views)} committed Tx IDs for seqno {seqno}"
                    )

            tx_ids_condensed = ", ".join(
                " OR ".join(f"{view}.{seqno}" for view in views or ["UNKNOWN"])
                for seqno, views in seqno_to_views.items()
            )

            if txs_ok:
                LOG.success(
                    f"Node {node.node_id}: Found a valid sequence of Tx IDs:\n{tx_ids_condensed}"
                )
            else:
                LOG.error(
                    f"Node {node.node_id}: Invalid sequence of Tx IDs:\n{tx_ids_condensed}"
                )
                raise RuntimeError(
                    f"Node {node.node_id}: Incomplete or inconsistent view history"
                )

            # Compare view history between nodes
            if previous_tx_ids:
                # Some nodes may have a slightly longer view history so only compare the common prefix
                min_tx_ids_len = min(len(previous_tx_ids), len(tx_ids_condensed))
                assert (
                    tx_ids_condensed[:min_tx_ids_len]
                    == previous_tx_ids[:min_tx_ids_len]
                ), f"Tx IDs don't match between node {node.node_id} and node {previous_node.node_id}: {tx_ids_condensed[:min_tx_ids_len]} and {previous_tx_ids[:min_tx_ids_len]}"

            previous_tx_ids = tx_ids_condensed
            previous_node = node

    return network