Example #1
    def test_update_valid_opcert(
        self,
        cluster_lock_pool2: clusterlib.ClusterLib,
        cluster_manager: cluster_management.ClusterManager,
    ):
        """Update a valid operational certificate with another valid operational certificate.

        * generate new operational certificate with valid `--kes-period`
        * restart the node with the new operational certificate
        * check that the pool is still producing blocks
        """
        pool_name = "node-pool2"
        node_name = "pool2"
        cluster = cluster_lock_pool2

        temp_template = helpers.get_func_name()
        pool_rec = cluster_manager.cache.addrs_data[pool_name]

        node_cold = pool_rec["cold_key_pair"]
        stake_pool_id = cluster.get_stake_pool_id(node_cold.vkey_file)
        stake_pool_id_dec = helpers.decode_bech32(stake_pool_id)

        opcert_file = pool_rec["pool_operational_cert"]

        with cluster_manager.restart_on_failure():
            # generate new operational certificate with valid `--kes-period`
            new_opcert_file = cluster.gen_node_operational_cert(
                node_name=node_name,
                kes_vkey_file=pool_rec["kes_key_pair"].vkey_file,
                cold_skey_file=pool_rec["cold_key_pair"].skey_file,
                cold_counter_file=pool_rec["cold_key_pair"].counter_file,
                kes_period=cluster.get_kes_period(),
            )

            # restart the node with the new operational certificate
            logfiles.add_ignore_rule("*.stdout", "MuxBearerClosed")
            shutil.copy(new_opcert_file, opcert_file)
            cluster_nodes.restart_node(node_name)

            LOGGER.info("Checking blocks production for 5 epochs.")
            blocks_made_db = []
            this_epoch = -1
            updated_epoch = cluster.get_epoch()
            for __ in range(5):
                # wait for next epoch
                if cluster.get_epoch() == this_epoch:
                    cluster.wait_for_new_epoch()

                # wait for the end of the epoch
                clusterlib_utils.wait_for_epoch_interval(
                    cluster_obj=cluster, start=-19, stop=-9
                )
                this_epoch = cluster.get_epoch()

                ledger_state = clusterlib_utils.get_ledger_state(cluster_obj=cluster)

                # save ledger state
                clusterlib_utils.save_ledger_state(
                    cluster_obj=cluster,
                    state_name=f"{temp_template}_{this_epoch}",
                    ledger_state=ledger_state,
                )

                # check that the pool is still producing blocks
                blocks_made = ledger_state["blocksCurrent"]
                blocks_made_db.append(stake_pool_id_dec in blocks_made)

            assert any(blocks_made_db), (
                f"The pool '{pool_name}' has not produced any blocks "
                f"since epoch {updated_epoch}")
Example #2
    def test_opcert_past_kes_period(
        self,
        cluster_lock_pool2: clusterlib.ClusterLib,
        cluster_manager: cluster_management.ClusterManager,
    ):
        """Start a stake pool with an operational certificate created with expired `--kes-period`.

        * generate new operational certificate with `--kes-period` in the past
        * restart the node with the new operational certificate
        * check that the pool is not producing any blocks
        * generate new operational certificate with valid `--kes-period` and restart the node
        * check that the pool is producing blocks again
        """
        pool_name = "node-pool2"
        node_name = "pool2"
        cluster = cluster_lock_pool2

        temp_template = helpers.get_func_name()
        pool_rec = cluster_manager.cache.addrs_data[pool_name]

        node_cold = pool_rec["cold_key_pair"]
        stake_pool_id = cluster.get_stake_pool_id(node_cold.vkey_file)
        stake_pool_id_dec = helpers.decode_bech32(stake_pool_id)

        opcert_file: Path = pool_rec["pool_operational_cert"]

        def _wait_epoch_chores(this_epoch: int):
            # wait for next epoch
            if cluster.get_epoch() == this_epoch:
                cluster.wait_for_new_epoch()

            # wait for the end of the epoch
            clusterlib_utils.wait_for_epoch_interval(
                cluster_obj=cluster, start=-19, stop=-9
            )

            # save ledger state
            clusterlib_utils.save_ledger_state(
                cluster_obj=cluster,
                state_name=f"{temp_template}_{cluster.get_epoch()}",
            )

        with cluster_manager.restart_on_failure():
            # generate new operational certificate with `--kes-period` in the past
            invalid_opcert_file = cluster.gen_node_operational_cert(
                node_name=node_name,
                kes_vkey_file=pool_rec["kes_key_pair"].vkey_file,
                cold_skey_file=pool_rec["cold_key_pair"].skey_file,
                cold_counter_file=pool_rec["cold_key_pair"].counter_file,
                kes_period=cluster.get_kes_period() - 5,
            )

            expected_errors = [
                (f"{node_name}.stdout", "TPraosCannotForgeKeyNotUsableYet"),
            ]
            with logfiles.expect_errors(expected_errors):
                # restart the node with the new operational certificate
                logfiles.add_ignore_rule("*.stdout", "MuxBearerClosed")
                shutil.copy(invalid_opcert_file, opcert_file)
                cluster_nodes.restart_node(node_name)
                cluster.wait_for_new_epoch()

                LOGGER.info("Checking block production for 5 epochs.")
                this_epoch = -1
                for __ in range(5):
                    _wait_epoch_chores(this_epoch)
                    this_epoch = cluster.get_epoch()

                    # check that the pool is not producing any blocks
                    blocks_made = clusterlib_utils.get_ledger_state(cluster_obj=cluster)[
                        "blocksCurrent"
                    ]
                    if blocks_made:
                        assert (
                            stake_pool_id_dec not in blocks_made
                        ), f"The pool '{pool_name}' has produced blocks in epoch {this_epoch}"

            # generate new operational certificate with valid `--kes-period`
            os.remove(opcert_file)
            valid_opcert_file = cluster.gen_node_operational_cert(
                node_name=node_name,
                kes_vkey_file=pool_rec["kes_key_pair"].vkey_file,
                cold_skey_file=pool_rec["cold_key_pair"].skey_file,
                cold_counter_file=pool_rec["cold_key_pair"].counter_file,
                kes_period=cluster.get_kes_period(),
            )
            # copy the new certificate and restart the node
            shutil.move(str(valid_opcert_file), str(opcert_file))
            cluster_nodes.restart_node(node_name)
            cluster.wait_for_new_epoch()

            LOGGER.info("Checking block production for another 5 epochs.")
            blocks_made_db = []
            this_epoch = cluster.get_epoch()
            active_again_epoch = this_epoch
            for __ in range(5):
                _wait_epoch_chores(this_epoch)
                this_epoch = cluster.get_epoch()

                # check that the pool is producing blocks
                blocks_made = clusterlib_utils.get_ledger_state(cluster_obj=cluster)[
                    "blocksCurrent"
                ]
                blocks_made_db.append(stake_pool_id_dec in blocks_made)

            assert any(blocks_made_db), (
                f"The pool '{pool_name}' has not produced any blocks "
                f"since epoch {active_again_epoch}")
Example #3
    def test_opcert_future_kes_period(  # noqa: C901
        self,
        cluster_lock_pool2: clusterlib.ClusterLib,
        cluster_manager: cluster_management.ClusterManager,
    ):
        """Start a stake pool with an operational certificate created with invalid `--kes-period`.

        * generate new operational certificate with `--kes-period` in the future
        * restart the node with the new operational certificate
        * check that the pool is not producing any blocks
        * if network era > Alonzo

            - generate new operational certificate with valid `--kes-period`, but with a counter
              value +2 from the last used operational certificate
            - restart the node
            - check that the pool is not producing any blocks

        * generate new operational certificate with valid `--kes-period` and restart the node
        * check that the pool is producing blocks again
        """
        # pylint: disable=too-many-statements,too-many-branches
        pool_name = cluster_management.Resources.POOL2
        node_name = "pool2"
        cluster = cluster_lock_pool2

        temp_template = common.get_test_id(cluster)
        pool_rec = cluster_manager.cache.addrs_data[pool_name]

        node_cold = pool_rec["cold_key_pair"]
        stake_pool_id = cluster.get_stake_pool_id(node_cold.vkey_file)
        stake_pool_id_dec = helpers.decode_bech32(stake_pool_id)

        opcert_file: Path = pool_rec["pool_operational_cert"]
        cold_counter_file: Path = pool_rec["cold_key_pair"].counter_file

        expected_errors = [
            (f"{node_name}.stdout", "PraosCannotForgeKeyNotUsableYet"),
        ]

        if VERSIONS.cluster_era > VERSIONS.ALONZO:
            expected_errors.append((f"{node_name}.stdout", "CounterOverIncrementedOCERT"))
            # In Babbage we get a `CounterOverIncrementedOCERT` error if the counter for a new
            # opcert is not exactly +1 from the last used opcert. We'll back up the original
            # counter file so we can use it for issuing the next valid opcert.
            cold_counter_file_orig = Path(
                f"{cold_counter_file.stem}_orig{cold_counter_file.suffix}"
            ).resolve()
            shutil.copy(cold_counter_file, cold_counter_file_orig)

        logfiles.add_ignore_rule(
            files_glob="*.stdout",
            regex="MuxBearerClosed|CounterOverIncrementedOCERT",
            ignore_file_id=cluster_manager.worker_id,
        )

        # generate new operational certificate with `--kes-period` in the future
        invalid_opcert_file = cluster.gen_node_operational_cert(
            node_name=f"{node_name}_invalid_opcert_file",
            kes_vkey_file=pool_rec["kes_key_pair"].vkey_file,
            cold_skey_file=pool_rec["cold_key_pair"].skey_file,
            cold_counter_file=cold_counter_file,
            kes_period=cluster.get_kes_period() + 100,
        )

        kes_query_currently_broken = False

        with cluster_manager.restart_on_failure():
            with logfiles.expect_errors(expected_errors, ignore_file_id=cluster_manager.worker_id):
                # restart the node with the new operational certificate
                shutil.copy(invalid_opcert_file, opcert_file)
                cluster_nodes.restart_nodes([node_name])
                cluster.wait_for_new_epoch()

                LOGGER.info("Checking block production for 4 epochs.")
                this_epoch = -1
                for invalid_opcert_epoch in range(4):
                    _wait_epoch_chores(
                        cluster_obj=cluster, temp_template=temp_template, this_epoch=this_epoch
                    )
                    this_epoch = cluster.get_epoch()

                    # check that the pool is not producing any blocks
                    blocks_made = clusterlib_utils.get_ledger_state(cluster_obj=cluster)[
                        "blocksCurrent"
                    ]
                    if blocks_made:
                        assert (
                            stake_pool_id_dec not in blocks_made
                        ), f"The pool '{pool_name}' has produced blocks in epoch {this_epoch}"

                    if invalid_opcert_epoch == 1:
                        # check kes-period-info with operational certificate with
                        # invalid `--kes-period`
                        # TODO: the query is currently broken
                        try:
                            kes_period_info = cluster.get_kes_period_info(invalid_opcert_file)
                        except clusterlib.CLIError as err:
                            if "currentlyBroken" not in str(err):
                                raise
                            kes_query_currently_broken = True

                        if not kes_query_currently_broken:
                            kes.check_kes_period_info_result(
                                kes_output=kes_period_info,
                                expected_scenario=kes.KesScenarios.INVALID_KES_PERIOD,
                            )

                    # test the `CounterOverIncrementedOCERT` error - the counter will now be +2
                    # from the last used opcert counter value
                    if invalid_opcert_epoch == 2 and VERSIONS.cluster_era > VERSIONS.ALONZO:
                        overincrement_opcert_file = cluster.gen_node_operational_cert(
                            node_name=f"{node_name}_overincrement_opcert_file",
                            kes_vkey_file=pool_rec["kes_key_pair"].vkey_file,
                            cold_skey_file=pool_rec["cold_key_pair"].skey_file,
                            cold_counter_file=cold_counter_file,
                            kes_period=cluster.get_kes_period(),
                        )
                        # copy the new certificate and restart the node
                        shutil.copy(overincrement_opcert_file, opcert_file)
                        cluster_nodes.restart_nodes([node_name])

                    if invalid_opcert_epoch == 3:
                        # check kes-period-info with operational certificate with
                        # invalid counter
                        # TODO: the query is currently broken, implement once it is fixed
                        pass

            # in Babbage we'll use the original counter for issuing a new valid opcert, so that its
            # counter value equals the counter value of the original opcert +1
            if VERSIONS.cluster_era > VERSIONS.ALONZO:
                shutil.copy(cold_counter_file_orig, cold_counter_file)

            # generate new operational certificate with valid `--kes-period`
            valid_opcert_file = cluster.gen_node_operational_cert(
                node_name=f"{node_name}_valid_opcert_file",
                kes_vkey_file=pool_rec["kes_key_pair"].vkey_file,
                cold_skey_file=pool_rec["cold_key_pair"].skey_file,
                cold_counter_file=cold_counter_file,
                kes_period=cluster.get_kes_period(),
            )
            # copy the new certificate and restart the node
            shutil.copy(valid_opcert_file, opcert_file)
            cluster_nodes.restart_nodes([node_name])
            this_epoch = cluster.wait_for_new_epoch()

            LOGGER.info("Checking block production for another 2 epochs.")
            blocks_made_db = []
            active_again_epoch = this_epoch
            for __ in range(2):
                _wait_epoch_chores(
                    cluster_obj=cluster, temp_template=temp_template, this_epoch=this_epoch
                )
                this_epoch = cluster.get_epoch()

                # check that the pool is producing blocks
                blocks_made = clusterlib_utils.get_ledger_state(cluster_obj=cluster)[
                    "blocksCurrent"
                ]
                blocks_made_db.append(stake_pool_id_dec in blocks_made)

            assert any(blocks_made_db), (
                f"The pool '{pool_name}' has not produced any blocks "
                f"since epoch {active_again_epoch}"
            )

        if kes_query_currently_broken:
            pytest.xfail("`query kes-period-info` is currently broken")
        else:
            # check kes-period-info with valid operational certificate
            kes_period_info = cluster.get_kes_period_info(valid_opcert_file)
            kes.check_kes_period_info_result(
                kes_output=kes_period_info, expected_scenario=kes.KesScenarios.ALL_VALID
            )

            # check kes-period-info with invalid operational certificate, wrong counter and period
            kes_period_info = cluster.get_kes_period_info(invalid_opcert_file)
            kes.check_kes_period_info_result(
                kes_output=kes_period_info,
                expected_scenario=kes.KesScenarios.INVALID_KES_PERIOD
                if VERSIONS.cluster_era > VERSIONS.ALONZO
                else kes.KesScenarios.ALL_INVALID,
            )
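
The counter juggling in this test encodes the Babbage-era OCERT rule: a new operational certificate is accepted only if its issue counter is exactly one above the last counter seen on chain for the cold key. That is why the +2 certificate above triggers `CounterOverIncrementedOCERT`, and why the original counter file is restored before issuing the final valid certificate. A simplified sketch of the rule (only `CounterOverIncrementedOCERT` is confirmed by the test; the other predicate name is an assumption):

def validate_opcert_counter(last_used: int, new: int) -> None:
    """Simplified sketch of the Babbage-era opcert counter check."""
    if new > last_used + 1:
        # the failure provoked above by issuing an opcert with counter +2
        raise ValueError("CounterOverIncrementedOCERT")
    if new < last_used:
        # stale certificate; exact predicate name assumed, not from the test
        raise ValueError("CounterTooSmallOCERT")
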
Example #4
    def test_update_valid_opcert(
        self,
        cluster_lock_pool2: clusterlib.ClusterLib,
        cluster_manager: cluster_management.ClusterManager,
    ):
        """Update a valid operational certificate with another valid operational certificate.

        * generate new operational certificate with valid `--kes-period`
        * copy new operational certificate to the node
        * stop the node so the corresponding pool is not minting new blocks
        * check `kes-period-info` while the pool is not minting blocks
        * start the node with the new operational certificate
        * check that the pool is minting blocks again
        * check that metrics reported by `kes-period-info` got updated once the pool started
          minting blocks again
        * check `kes-period-info` with the old (replaced) operational certificate
        """
        # pylint: disable=too-many-statements
        pool_name = cluster_management.Resources.POOL2
        node_name = "pool2"
        cluster = cluster_lock_pool2

        temp_template = common.get_test_id(cluster)
        pool_rec = cluster_manager.cache.addrs_data[pool_name]

        node_cold = pool_rec["cold_key_pair"]
        stake_pool_id = cluster.get_stake_pool_id(node_cold.vkey_file)
        stake_pool_id_dec = helpers.decode_bech32(stake_pool_id)

        opcert_file = pool_rec["pool_operational_cert"]
        opcert_file_old = shutil.copy(opcert_file, f"{opcert_file}_old")

        with cluster_manager.restart_on_failure():
            # generate new operational certificate with valid `--kes-period`
            new_opcert_file = cluster.gen_node_operational_cert(
                node_name=f"{node_name}_new_opcert_file",
                kes_vkey_file=pool_rec["kes_key_pair"].vkey_file,
                cold_skey_file=pool_rec["cold_key_pair"].skey_file,
                cold_counter_file=pool_rec["cold_key_pair"].counter_file,
                kes_period=cluster.get_kes_period(),
            )

            # copy new operational certificate to the node
            logfiles.add_ignore_rule(
                files_glob="*.stdout",
                regex="MuxBearerClosed",
                ignore_file_id=cluster_manager.worker_id,
            )
            shutil.copy(new_opcert_file, opcert_file)

            # stop the node so the corresponding pool is not minting new blocks
            cluster_nodes.stop_nodes([node_name])

            time.sleep(10)

            # check kes-period-info while the pool is not minting blocks
            # TODO: the query is currently broken
            kes_query_currently_broken = False
            try:
                kes_period_info_new = cluster.get_kes_period_info(opcert_file)
            except clusterlib.CLIError as err:
                if "currentlyBroken" not in str(err):
                    raise
                kes_query_currently_broken = True

            if not kes_query_currently_broken:
                kes.check_kes_period_info_result(
                    kes_output=kes_period_info_new, expected_scenario=kes.KesScenarios.ALL_VALID
                )
                kes_period_info_old = cluster.get_kes_period_info(opcert_file_old)
                kes.check_kes_period_info_result(
                    kes_output=kes_period_info_old, expected_scenario=kes.KesScenarios.ALL_VALID
                )
                assert (
                    kes_period_info_new["metrics"]["qKesNodeStateOperationalCertificateNumber"]
                    == kes_period_info_old["metrics"]["qKesNodeStateOperationalCertificateNumber"]
                )

            # start the node with the new operational certificate
            cluster_nodes.start_nodes([node_name])

            # make sure we are not at the very end of an epoch so we still have time for
            # the first block production check
            clusterlib_utils.wait_for_epoch_interval(cluster_obj=cluster, start=5, stop=-18)

            LOGGER.info("Checking block production for 5 epochs.")
            blocks_made_db = []
            this_epoch = -1
            updated_epoch = cluster.get_epoch()
            for __ in range(5):
                # wait for next epoch
                if cluster.get_epoch() == this_epoch:
                    cluster.wait_for_new_epoch()

                # wait for the end of the epoch
                clusterlib_utils.wait_for_epoch_interval(
                    cluster_obj=cluster, start=-19, stop=-15, force_epoch=True
                )
                this_epoch = cluster.get_epoch()

                ledger_state = clusterlib_utils.get_ledger_state(cluster_obj=cluster)

                # save ledger state
                clusterlib_utils.save_ledger_state(
                    cluster_obj=cluster,
                    state_name=f"{temp_template}_{this_epoch}",
                    ledger_state=ledger_state,
                )

                # check that the pool is minting blocks
                blocks_made = ledger_state["blocksCurrent"]
                blocks_made_db.append(stake_pool_id_dec in blocks_made)

            assert any(
                blocks_made_db
            ), f"The pool '{pool_name}' has not minted any blocks since epoch {updated_epoch}"

        if kes_query_currently_broken:
            pytest.xfail("`query kes-period-info` is currently broken")
        else:
            # check that metrics reported by kes-period-info got updated once the pool started
            # minting blocks again
            kes_period_info_updated = cluster.get_kes_period_info(opcert_file)
            kes.check_kes_period_info_result(
                kes_output=kes_period_info_updated, expected_scenario=kes.KesScenarios.ALL_VALID
            )
            assert (
                kes_period_info_updated["metrics"]["qKesNodeStateOperationalCertificateNumber"]
                != kes_period_info_old["metrics"]["qKesNodeStateOperationalCertificateNumber"]
            )

            # check kes-period-info with operational certificate with a wrong counter
            kes_period_info_invalid = cluster.get_kes_period_info(opcert_file_old)
            kes.check_kes_period_info_result(
                kes_output=kes_period_info_invalid,
                expected_scenario=kes.KesScenarios.INVALID_COUNTERS,
            )
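
The closing assertions hinge on the `qKesNodeStateOperationalCertificateNumber` metric reported by `kes-period-info`: while the node is stopped, the old and new certificates report the same node-state counter, and the values diverge only once the pool has minted a block with the new certificate. Expressed as a tiny illustrative helper:

def opcert_adopted(info_before: dict, info_after: dict) -> bool:
    """True once the node-reported opcert counter has moved, i.e. the pool
    has minted a block with the newly installed certificate."""
    key = "qKesNodeStateOperationalCertificateNumber"
    return info_before["metrics"][key] != info_after["metrics"][key]
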
Example #5
    def test_expired_kes(
        self,
        cluster_kes: clusterlib.ClusterLib,
        cluster_manager: cluster_management.ClusterManager,
        worker_id: str,
    ):
        """Test expired KES.

        * start local cluster instance configured with short KES period and low number of key
          evolutions, so KES expires soon on all pools
        * refresh opcert on 2 of the 3 pools, so KES doesn't expire on those 2 pools and
          the pools keep minting blocks
        * wait for KES expiration on the selected pool
        * check that the pool with expired KES didn't mint blocks in the epoch that followed the
          KES expiration
        * check KES period info command with an operational certificate with an expired KES
        * check KES period info command with operational certificates with a valid KES
        """
        cluster = cluster_kes
        temp_template = common.get_test_id(cluster)

        expire_timeout = 200
        expire_node_name = "pool1"
        expire_pool_name = f"node-{expire_node_name}"
        expire_pool_rec = cluster_manager.cache.addrs_data[expire_pool_name]
        expire_pool_id = cluster.get_stake_pool_id(expire_pool_rec["cold_key_pair"].vkey_file)
        expire_pool_id_dec = helpers.decode_bech32(expire_pool_id)

        # refresh opcert on 2 of the 3 pools, so KES doesn't expire on those 2 pools and
        # the pools keep minting blocks
        refreshed_nodes = ["pool2", "pool3"]

        def _refresh_opcerts():
            for n in refreshed_nodes:
                refreshed_pool_rec = cluster_manager.cache.addrs_data[f"node-{n}"]
                refreshed_opcert_file = cluster.gen_node_operational_cert(
                    node_name=f"{n}_refreshed_opcert",
                    kes_vkey_file=refreshed_pool_rec["kes_key_pair"].vkey_file,
                    cold_skey_file=refreshed_pool_rec["cold_key_pair"].skey_file,
                    cold_counter_file=refreshed_pool_rec["cold_key_pair"].counter_file,
                    kes_period=cluster.get_kes_period(),
                )
                shutil.copy(refreshed_opcert_file, refreshed_pool_rec["pool_operational_cert"])
            cluster_nodes.restart_nodes(refreshed_nodes)

        _refresh_opcerts()

        expected_err_regexes = ["KESKeyAlreadyPoisoned", "KESCouldNotEvolve"]
        # ignore expected errors in bft1 node log file, as bft1 opcert will not get refreshed
        logfiles.add_ignore_rule(
            files_glob="bft1.stdout",
            regex="|".join(expected_err_regexes),
            ignore_file_id=worker_id,
        )
        # search for expected errors only in log file corresponding to pool with expired KES
        expected_errors = [(f"{expire_node_name}.stdout", err) for err in expected_err_regexes]

        this_epoch = -1
        with logfiles.expect_errors(expected_errors, ignore_file_id=worker_id):
            LOGGER.info(
                f"{datetime.datetime.now()}: Waiting for {expire_timeout} sec for KES expiration."
            )
            time.sleep(expire_timeout)

            _wait_epoch_chores(
                cluster_obj=cluster, temp_template=temp_template, this_epoch=this_epoch
            )
            this_epoch = cluster.get_epoch()

            # check that the pool is not producing any blocks
            blocks_made = clusterlib_utils.get_ledger_state(cluster_obj=cluster)["blocksCurrent"]
            if blocks_made:
                assert (
                    expire_pool_id_dec not in blocks_made
                ), f"The pool '{expire_pool_name}' has minted blocks in epoch {this_epoch}"

            # refresh opcerts one more time
            _refresh_opcerts()

            LOGGER.info(
                f"{datetime.datetime.now()}: Waiting 120 secs to make sure the expected errors "
                "make it to log files."
            )
            time.sleep(120)

        # check kes-period-info with an operational certificate with an expired KES
        # TODO: the query is currently broken
        kes_query_currently_broken = False
        try:
            kes_info_expired = cluster.get_kes_period_info(
                opcert_file=expire_pool_rec["pool_operational_cert"]
            )
        except clusterlib.CLIError as err:
            if "currentlyBroken" not in str(err):
                raise
            kes_query_currently_broken = True

        if kes_query_currently_broken:
            pytest.xfail("`query kes-period-info` is currently broken")
        else:
            kes.check_kes_period_info_result(
                kes_output=kes_info_expired, expected_scenario=kes.KesScenarios.INVALID_KES_PERIOD
            )

            # check kes-period-info with valid operational certificates
            for n in refreshed_nodes:
                refreshed_pool_rec = cluster_manager.cache.addrs_data[f"node-{n}"]
                kes_info_valid = cluster.get_kes_period_info(
                    opcert_file=refreshed_pool_rec["pool_operational_cert"]
                )
                kes.check_kes_period_info_result(
                    kes_output=kes_info_valid, expected_scenario=kes.KesScenarios.ALL_VALID
                )
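
The 200-second `expire_timeout` works only because the cluster behind the `cluster_kes` fixture is started with a short `slotsPerKESPeriod` and a low `maxKESEvolutions`. A KES key issued at some start period can evolve only `maxKESEvolutions` times, so it becomes unusable once the chain reaches that boundary; a sketch of the arithmetic (the example values are illustrative, the cluster's actual genesis parameters are not shown above):

def kes_expiry_period(start_kes_period: int, max_kes_evolutions: int) -> int:
    """First KES period in which a key issued at `start_kes_period` can no
    longer sign blocks."""
    return start_kes_period + max_kes_evolutions

# e.g. an opcert issued in KES period 10 with maxKESEvolutions=6 stops working
# in period 16; on the test cluster this boundary arrives within minutes
assert kes_expiry_period(10, 6) == 16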