Example #1
    def teardown_fixture(self, logger, abort=False):
        """
        Run a test that tears down the job's fixture.

        Return True if the teardown was successful, False otherwise.
        """

        # Refresh the fixture table before teardown to capture changes due to
        # CleanEveryN and stepdown hooks.
        self.report.logging_prefix = create_fixture_table(self.fixture)

        if abort:
            test_case = _fixture.FixtureAbortTestCase(
                self.test_queue_logger, self.fixture,
                "job{}".format(self.job_num), self.times_set_up)
            self.times_set_up += 1
        else:
            test_case = _fixture.FixtureTeardownTestCase(
                self.test_queue_logger, self.fixture,
                "job{}".format(self.job_num))

        test_case(self.report)
        if self.report.find_test_info(test_case).status != "pass":
            logger.error("The teardown of %s failed.", self.fixture)
            return False

        return True
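
A note on the pattern: the method above wraps fixture teardown in a synthetic test case, runs it against the report, and then reads the recorded status back to decide success. A minimal standalone sketch of that pattern is below; the Report and FixtureTeardownCase classes are hypothetical stand-ins, not resmoke's real classes.

class Report:
    """Hypothetical stand-in for resmoke's test report."""

    def __init__(self):
        self._statuses = {}

    def record(self, case, status):
        self._statuses[case] = status

    def find_status(self, case):
        return self._statuses.get(case, "unknown")


class FixtureTeardownCase:
    """Wrap a fixture teardown so it can be recorded like any other test."""

    def __init__(self, fixture):
        self.fixture = fixture

    def __call__(self, report):
        try:
            self.fixture.teardown()
            report.record(self, "pass")
        except Exception:
            report.record(self, "fail")


def teardown_fixture(fixture, logger):
    report = Report()
    case = FixtureTeardownCase(fixture)
    case(report)
    if report.find_status(case) != "pass":
        logger.error("The teardown of %s failed.", fixture)
        return False
    return True
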
Example #2
    def __restart_init_sync(self, sync_node):
        # Tear down and restart the initial sync node to start initial sync again.
        sync_node.teardown()

        self.logger.info("Starting the initial sync node back up again...")
        sync_node.setup()
        self.logger.info(fixture_interface.create_fixture_table(self.fixture))
        sync_node.await_ready()
Example #3
    def _execute_test(self, test, hook_failure_flag):
        """Call the before/after test hooks and execute 'test'."""

        test.configure(self.fixture, config.NUM_CLIENTS_PER_FIXTURE)
        self._run_hooks_before_tests(test, hook_failure_flag)
        self.report.logging_prefix = create_fixture_table(self.fixture)

        test(self.report)
        try:
            if test.propagate_error is not None:
                raise test.propagate_error

            # We are intentionally only checking the individual 'test' status and not calling
            # report.wasSuccessful() here. It is possible that a thread running in the background as
            # part of a hook has added a failed test case to 'self.report'. Checking the individual
            # 'test' status ensures self._run_hooks_after_tests() is called if it is a hook's test
            # case that has failed and not 'test' that has failed.
            if self.suite_options.fail_fast and self.report.find_test_info(test).status != "pass":
                self.logger.info("%s failed, so stopping..." % (test.short_description()))
                raise errors.StopExecution("%s failed" % (test.short_description()))

            if self._check_if_fixture_running and not self.fixture.is_running():
                self.logger.error(
                    "%s marked as a failure because the fixture crashed during the test.",
                    test.short_description())
                self.report.setFailure(test, return_code=2)
                # Always fail fast if the fixture fails.
                raise errors.StopExecution(
                    "%s not running after %s" %
                    (self.fixture, test.short_description()))
        finally:
            success = self.report.find_test_info(test).status == "pass"

            # Stop background hooks first since they can interfere with fixture startup and teardown
            # done as part of archival.
            self._run_hooks_after_tests(test,
                                        hook_failure_flag,
                                        background=True)

            if self.archival:
                result = TestResult(test=test, hook=None, success=success)
                self.archival.archive(self.logger, result, self.manager)

            self._run_hooks_after_tests(test,
                                        hook_failure_flag,
                                        background=False)
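
Condensed, the control flow above is: run the test, let the fail-fast and fixture-health checks raise StopExecution, and rely on the finally block to run the background after-hooks, archival, and foreground after-hooks in that order. A simplified sketch with hypothetical callables standing in for the resmoke pieces:

class StopExecution(Exception):
    """Stand-in for errors.StopExecution."""


def execute_test(test, report, stop_background_hooks, archive, run_after_hooks):
    test(report)
    try:
        if report.status(test) != "pass":
            # Fail fast; the finally block below still performs cleanup.
            raise StopExecution("{} failed".format(test))
    finally:
        success = report.status(test) == "pass"
        # Background hooks are stopped first so they cannot interfere with archival.
        stop_background_hooks(test)
        # Data files are archived before the remaining (foreground) hooks run.
        archive(test, success)
        run_after_hooks(test)
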
Example #4
    def _check_secondaries_and_restart_fixture(self):
        preserve_dbpaths = []
        for node in self.fixture.nodes:
            preserve_dbpaths.append(node.preserve_dbpath)
            node.preserve_dbpath = True

        for secondary in self.fixture.get_secondaries():
            self._check_invariants_as_standalone(secondary)

            self.logger.info(
                "Restarting the secondary on port %d as a replica set node with"
                " its data files intact...", secondary.port)
            # Start the 'secondary' mongod back up as part of the replica set and wait for it to
            # reach state SECONDARY.
            secondary.setup()
            self.logger.info(fixture.create_fixture_table(self.fixture))
            secondary.await_ready()
            self._await_secondary_state(secondary)

            try:
                secondary.teardown()
            except errors.ServerFailure:
                raise errors.ServerFailure(
                    "{} did not exit cleanly after reconciling the end of its oplog"
                    .format(secondary))

        self.logger.info(
            "Starting the fixture back up again with its data files intact for final"
            " validation...")

        try:
            self.fixture.setup()
            self.logger.info(fixture.create_fixture_table(self.fixture))
            self.fixture.await_ready()
        finally:
            for (i, node) in enumerate(self.fixture.nodes):
                node.preserve_dbpath = preserve_dbpaths[i]
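
The save/flip/restore handling of preserve_dbpath above (and in the later examples that restart individual nodes) can be written once as a context manager. A minimal sketch, assuming only that each node exposes a preserve_dbpath attribute:

import contextlib


@contextlib.contextmanager
def preserved_dbpaths(nodes):
    """Temporarily set preserve_dbpath=True on every node, then restore the originals."""
    originals = [node.preserve_dbpath for node in nodes]
    for node in nodes:
        node.preserve_dbpath = True
    try:
        yield
    finally:
        for node, original in zip(nodes, originals):
            node.preserve_dbpath = original

With such a helper, the body of _check_secondaries_and_restart_fixture could sit inside a single "with preserved_dbpaths(self.fixture.nodes):" block instead of managing the list by hand.
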
Example #5
    def _restart_and_clear_fixture(self):
        # We restart the fixture after setting 'preserve_dbpath' back to its original value in order
        # to clear the contents of the data directory if desired. The CleanEveryN hook cannot be
        # used in combination with the PeriodicKillSecondaries hook because we may attempt to call
        # Fixture.teardown() while the "rsSyncApplyStop" failpoint is still enabled on the
        # secondaries, causing them to exit with a non-zero return code.
        self.logger.info("Finished verifying data consistency, stopping the fixture...")

        try:
            self.fixture.teardown()
        except errors.ServerFailure:
            raise errors.ServerFailure(
                "{} did not exit cleanly after verifying data consistency".format(self.fixture))

        self.logger.info("Starting the fixture back up again with no data...")
        self.fixture.setup()
        self.logger.info(fixture.create_fixture_table(self.fixture))
        self.fixture.await_ready()
Example #6
    def restart_node(self, chosen):
        """Restart the new step up node."""
        self.logger.info("Waiting for the old primary on port %d of replica set '%s' to exit.",
                         chosen.port, self.replset_name)

        chosen.mongod.wait()

        self.logger.info("Attempting to restart the old primary on port %d of replica set '%s.",
                         chosen.port, self.replset_name)

        # Restart the mongod on the old primary and wait until we can contact it again. Keep the
        # original preserve_dbpath to restore after restarting the mongod.
        original_preserve_dbpath = chosen.preserve_dbpath
        chosen.preserve_dbpath = True
        try:
            chosen.setup()
            self.logger.info(interface.create_fixture_table(self))
            chosen.await_ready()
        finally:
            chosen.preserve_dbpath = original_preserve_dbpath
Example #7
    def _step_down(self, rs_fixture):
        try:
            primary = rs_fixture.get_primary(
                timeout_secs=self._stepdown_interval_secs)
        except errors.ServerFailure:
            # We ignore the ServerFailure exception because it means a primary wasn't available.
            # We'll try again after self._stepdown_interval_secs seconds.
            return

        secondaries = rs_fixture.get_secondaries()

        # Check that the fixture is still running before stepping down or killing the primary.
        # This ensures we still detect some cases in which the fixture has already crashed.
        if not rs_fixture.is_running():
            raise errors.ServerFailure(
                "ReplicaSetFixture {} expected to be running in"
                " ContinuousStepdown, but wasn't.".format(
                    rs_fixture.replset_name))

        if self._terminate:
            # If we're running with background reconfigs, it's possible to be in a scenario
            # where we kill a necessary voting node (i.e. in a 5 node repl set), only 2 are
            # voting. In this scenario, we want to avoid killing the primary because no
            # secondary can step up.
            if self._background_reconfig:
                # stagger the kill thread so that it runs a little after the reconfig thread
                time.sleep(1)
                voting_members = rs_fixture.get_voting_members()

                self.logger.info("Current voting members: %s", voting_members)

                if len(voting_members) <= 3:
                    # Do not kill or terminate the primary if we don't have enough voting nodes to
                    # elect a new primary.
                    return

            should_kill = self._kill and random.choice([True, False])
            action = "Killing" if should_kill else "Terminating"
            self.logger.info("%s the primary on port %d of replica set '%s'.",
                             action, primary.port, rs_fixture.replset_name)

            # We send the mongod process the signal to exit but don't immediately wait for it to
            # exit because clean shutdown may take a while and we want to restore write availability
            # as quickly as possible.
            teardown_mode = (fixture_interface.TeardownMode.KILL
                             if should_kill else fixture_interface.TeardownMode.TERMINATE)
            primary.mongod.stop(mode=teardown_mode)
        elif not self._stepdown_via_heartbeats:
            self.logger.info(
                "Stepping down the primary on port %d of replica set '%s'.",
                primary.port, rs_fixture.replset_name)
            try:
                client = self._create_client(primary)
                client.admin.command(
                    bson.SON([
                        ("replSetStepDown", self._stepdown_duration_secs),
                        ("force", True),
                    ]))
            except pymongo.errors.AutoReconnect:
                # AutoReconnect exceptions are expected as connections are closed during stepdown.
                pass
            except pymongo.errors.PyMongoError:
                self.logger.exception(
                    "Error while stepping down the primary on port %d of replica set '%s'.",
                    primary.port, rs_fixture.replset_name)
                raise

        # We have skipped stepping down the primary if we want to step up secondaries instead. Here,
        # we simply need to pick an arbitrary secondary to run for election which will lead to
        # unconditional stepdown on the primary.
        #
        # If we have terminated/killed/stepped down the primary above, write availability has
        # been lost.
        # We pick an arbitrary secondary to run for election immediately in order to avoid a long
        # period where the replica set doesn't have write availability. If none of the secondaries
        # are eligible, or their election attempt fails, then we'll run the replSetStepUp command on
        # 'primary' to ensure we have write availability sooner than the
        # self._stepdown_duration_secs duration expires.
        while secondaries:
            chosen = random.choice(secondaries)

            self.logger.info(
                "Attempting to step up the secondary on port %d of replica set '%s'.",
                chosen.port, rs_fixture.replset_name)

            try:
                client = self._create_client(chosen)
                client.admin.command("replSetStepUp")
                break
            except pymongo.errors.OperationFailure:
                # OperationFailure exceptions are expected when the election attempt fails due to
                # not receiving enough votes. This can happen when the 'chosen' secondary's opTime
                # is behind that of other secondaries. We handle this by attempting to elect a
                # different secondary.
                self.logger.info(
                    "Failed to step up the secondary on port %d of replica set '%s'.",
                    chosen.port, rs_fixture.replset_name)
                secondaries.remove(chosen)
            except pymongo.errors.AutoReconnect:
                # It is possible for a replSetStepUp to fail with AutoReconnect if that node goes
                # into Rollback (which causes it to close any open connections).
                pass

        if self._terminate:
            self.logger.info(
                "Waiting for the old primary on port %d of replica set '%s' to exit.",
                primary.port, rs_fixture.replset_name)

            primary.mongod.wait()

            self.logger.info(
                "Attempting to restart the old primary on port %d of replica set '%s.",
                primary.port, rs_fixture.replset_name)

            # Restart the mongod on the old primary and wait until we can contact it again. Keep the
            # original preserve_dbpath to restore after restarting the mongod.
            original_preserve_dbpath = primary.preserve_dbpath
            primary.preserve_dbpath = True
            try:
                primary.setup()
                self.logger.info(
                    fixture_interface.create_fixture_table(self._fixture))
                primary.await_ready()
            finally:
                primary.preserve_dbpath = original_preserve_dbpath
        elif not self._stepdown_via_heartbeats:
            # If we chose to step up a secondary instead, the primary was never stepped down via the
            # replSetStepDown command and thus will not have a stepdown period. So we can skip
            # running {replSetFreeze: 0}. Otherwise, the replSetStepDown command run earlier
            # introduced a stepdown period on the former primary and so we have to run the
            # {replSetFreeze: 0} command to ensure the former primary is electable in the next
            # round of _step_down().
            client = self._create_client(primary)
            client.admin.command({"replSetFreeze": 0})
        elif secondaries:
            # We successfully stepped up a secondary, wait for the former primary to step down via
            # heartbeats. We need to wait for the former primary to step down to complete this step
            # down round and to avoid races between the ContinuousStepdown hook and other test hooks
            # that may depend on the health of the replica set.
            self.logger.info(
                "Successfully stepped up the secondary on port %d of replica set '%s'.",
                chosen.port, rs_fixture.replset_name)
            while True:
                try:
                    client = self._create_client(primary)
                    is_secondary = client.admin.command(
                        "isMaster")["secondary"]
                    if is_secondary:
                        break
                except pymongo.errors.AutoReconnect:
                    pass
                self.logger.info(
                    "Waiting for primary on port %d of replica set '%s' to step down.",
                    primary.port, rs_fixture.replset_name)
                time.sleep(0.2)  # Wait a little bit before trying again.
            self.logger.info(
                "Primary on port %d of replica set '%s' stepped down.",
                primary.port, rs_fixture.replset_name)

        if not secondaries:
            # If we failed to step up one of the secondaries, then we run the replSetStepUp to try
            # and elect the former primary again. This way we don't need to wait
            # self._stepdown_duration_secs seconds to restore write availability to the cluster.
            # Since the former primary may have been killed, we need to wait until it has been
            # restarted by retrying replSetStepUp.
            retry_time_secs = rs_fixture.AWAIT_REPL_TIMEOUT_MINS * 60
            retry_start_time = time.time()
            while True:
                try:
                    client = self._create_client(primary)
                    client.admin.command("replSetStepUp")
                    break
                except pymongo.errors.OperationFailure:
                    self._wait(0.2)
                if time.time() - retry_start_time > retry_time_secs:
                    raise errors.ServerFailure(
                        "The old primary on port {} of replica set {} did not step up in"
                        " {} seconds.".format(client.port,
                                              rs_fixture.replset_name,
                                              retry_time_secs))

        # Bump the counter for the chosen secondary to indicate that the replSetStepUp command
        # executed successfully.
        key = "{}/{}".format(
            rs_fixture.replset_name,
            chosen.get_internal_connection_string() if secondaries else "none")
        self._step_up_stats[key] += 1
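
The loop near the end of _step_down is a bounded retry: keep issuing replSetStepUp, treat OperationFailure as transient, and give up once a deadline passes. The same pattern in isolation, with generic names rather than resmoke's:

import time


def retry_until_deadline(operation, is_transient, timeout_secs, interval_secs=0.2):
    """Retry 'operation' on transient errors until it succeeds or the deadline passes."""
    deadline = time.time() + timeout_secs
    while True:
        try:
            return operation()
        except Exception as err:
            if not is_transient(err):
                raise
            if time.time() > deadline:
                raise RuntimeError(
                    "Operation did not succeed within {} seconds.".format(timeout_secs))
            time.sleep(interval_secs)
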
Example #8
    def change_version_and_restart_node(self, primary, auth_options):
        """
        Select a secondary for stepUp.

        Ensure its version is different from that of the old primary;
        change the version of the secondary if needed.
        """
        def get_chosen_node_from_replsetstatus(status_member_infos):
            max_optime = None
            chosen_index = None
            # We always select the secondary with the highest optime to step up.
            for member_info in status_member_infos:
                if member_info.get("self", False):
                    # Ignore self, which is the old primary and is not eligible
                    # to be re-elected in a downgraded multiversion cluster.
                    continue
                optime_dict = member_info["optime"]
                if max_optime is None:
                    chosen_index = member_info["_id"]
                    max_optime = optime_dict
                else:
                    if compare_optime(optime_dict, max_optime) > 0:
                        chosen_index = member_info["_id"]
                        max_optime = optime_dict

            if chosen_index is None or max_optime is None:
                raise self.fixturelib.ServerFailure(
                    "Failed to find a secondary eligible for "
                    f"election; index: {chosen_index}, optime: {max_optime}"
                )

            return self.nodes[chosen_index]

        primary_client = interface.authenticate(primary.mongo_client(),
                                                auth_options)
        retry_time_secs = self.AWAIT_REPL_TIMEOUT_MINS * 60
        retry_start_time = time.time()

        while True:
            member_infos = primary_client.admin.command(
                {"replSetGetStatus": 1})["members"]
            chosen_node = get_chosen_node_from_replsetstatus(member_infos)

            if chosen_node.change_version_if_needed(primary):
                self.logger.info(
                    "Waiting for the chosen secondary on port %d of replica set '%s' to exit.",
                    chosen_node.port, self.replset_name)

                teardown_mode = interface.TeardownMode.TERMINATE
                chosen_node.mongod.stop(mode=teardown_mode)
                chosen_node.mongod.wait()

                self.logger.info(
                    "Attempting to restart the chosen secondary on port %d of replica set '%s.",
                    chosen_node.port, self.replset_name)

                chosen_node.setup()
                self.logger.info(interface.create_fixture_table(self))
                chosen_node.await_ready()

            if self.stepup_node(chosen_node, auth_options):
                break

            if time.time() - retry_start_time > retry_time_secs:
                raise self.fixturelib.ServerFailure(
                    "The chosen secondary on port {} of replica set {} did not step up in"
                    " {} seconds.".format(chosen_node.port, self.replset_name,
                                          retry_time_secs))

        return chosen_node
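
The helper above assumes a compare_optime function and replSetGetStatus member documents whose "optime" field contains a timestamp ("ts") and a term ("t"). A plausible sketch of such a comparison, term first and then timestamp, is shown below; it is an illustration, not resmoke's implementation.

import bson


def compare_optime(optime1, optime2):
    """Return -1, 0, or 1, comparing optimes by term first and then by timestamp."""
    if optime1["t"] != optime2["t"]:
        return -1 if optime1["t"] < optime2["t"] else 1
    if optime1["ts"] == optime2["ts"]:
        return 0
    return -1 if optime1["ts"] < optime2["ts"] else 1


# The second optime has a higher term, so it compares as greater.
older = {"t": 1, "ts": bson.Timestamp(100, 5)}
newer = {"t": 2, "ts": bson.Timestamp(90, 1)}
assert compare_optime(older, newer) == -1
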
Example #9
    def _check_invariants_as_standalone(self, secondary):  # pylint: disable=too-many-locals
        # pylint: disable=too-many-branches,too-many-statements
        # We remove the --replSet option in order to start the node as a standalone.
        replset_name = secondary.mongod_options.pop("replSet")
        self.logger.info(
            "Restarting the secondary on port %d as a standalone node with"
            " its data files intact...", secondary.port)

        try:
            secondary.setup()
            self.logger.info(fixture.create_fixture_table(self.fixture))
            secondary.await_ready()

            client = secondary.mongo_client()
            minvalid_doc = client.local["replset.minvalid"].find_one()
            oplog_truncate_after_doc = client.local["replset.oplogTruncateAfterPoint"].find_one()
            recovery_timestamp_res = client.admin.command("replSetTest",
                                                          getLastStableRecoveryTimestamp=True)
            latest_oplog_doc = client.local["oplog.rs"].find_one(sort=[("$natural",
                                                                        pymongo.DESCENDING)])

            self.logger.info("Checking invariants: minValid: {}, oplogTruncateAfterPoint: {},"
                             " stable recovery timestamp: {}, latest oplog doc: {}".format(
                                 minvalid_doc, oplog_truncate_after_doc, recovery_timestamp_res,
                                 latest_oplog_doc))

            null_ts = bson.Timestamp(0, 0)

            # We wait for a stable recovery timestamp at setup, so we must have an oplog.
            latest_oplog_entry_ts = null_ts
            if latest_oplog_doc is None:
                raise errors.ServerFailure("No latest oplog entry")
            latest_oplog_entry_ts = latest_oplog_doc.get("ts")
            if latest_oplog_entry_ts is None:
                raise errors.ServerFailure(
                    "Latest oplog entry had no 'ts' field: {}".format(latest_oplog_doc))

            # The "oplogTruncateAfterPoint" document may not exist at startup. If so, we default
            # it to null.
            oplog_truncate_after_ts = null_ts
            if oplog_truncate_after_doc is not None:
                oplog_truncate_after_ts = oplog_truncate_after_doc.get(
                    "oplogTruncateAfterPoint", null_ts)

            # The "lastStableRecoveryTimestamp" field is present if the storage engine supports
            # "recover to a timestamp". If it's a null timestamp on a durable storage engine, that
            # means we do not yet have a stable checkpoint timestamp and must be restarting at the
            # top of the oplog. Since we wait for a stable recovery timestamp at test fixture setup,
            # we should never encounter a null timestamp here.
            recovery_timestamp = recovery_timestamp_res.get("lastStableRecoveryTimestamp")
            if recovery_timestamp == null_ts:
                raise errors.ServerFailure(
                    "Received null stable recovery timestamp {}".format(recovery_timestamp_res))
            # On a storage engine that doesn't support "recover to a timestamp", we default to null.
            if recovery_timestamp is None:
                recovery_timestamp = null_ts

            # last stable recovery timestamp <= top of oplog
            if not recovery_timestamp <= latest_oplog_entry_ts:
                raise errors.ServerFailure("The condition last stable recovery timestamp <= top"
                                           " of oplog ({} <= {}) doesn't hold:"
                                           " getLastStableRecoveryTimestamp result={},"
                                           " latest oplog entry={}".format(
                                               recovery_timestamp, latest_oplog_entry_ts,
                                               recovery_timestamp_res, latest_oplog_doc))

            if minvalid_doc is not None:
                applied_through_ts = minvalid_doc.get("begin", {}).get("ts", null_ts)
                minvalid_ts = minvalid_doc.get("ts", null_ts)

                # The "appliedThrough" value should always equal the "last stable recovery
                # timestamp", AKA the stable checkpoint for durable engines, on server restart.
                #
                # The written "appliedThrough" time is updated with the latest timestamp at the end
                # of each batch application, and batch boundaries are the only valid stable
                # timestamps on secondaries. Therefore, a non-null appliedThrough timestamp must
                # equal the checkpoint timestamp, because any stable timestamp that the checkpoint
                # could use includes an equal persisted appliedThrough timestamp.
                if (recovery_timestamp != null_ts and applied_through_ts != null_ts
                        and (not recovery_timestamp == applied_through_ts)):
                    raise errors.ServerFailure(
                        "The condition last stable recovery timestamp ({}) == appliedThrough ({})"
                        " doesn't hold: minValid document={},"
                        " getLastStableRecoveryTimestamp result={}, last oplog entry={}".format(
                            recovery_timestamp, applied_through_ts, minvalid_doc,
                            recovery_timestamp_res, latest_oplog_doc))

                if applied_through_ts == null_ts:
                    # We clear "appliedThrough" to represent having applied through the top of the
                    # oplog in PRIMARY state or immediately after "rollback via refetch".
                    # If we are using a storage engine that supports "recover to a timestamp,"
                    # then we will have a "last stable recovery timestamp" and we should use that
                    # as our "appliedThrough" (similarly to why we assert their equality above).
                    # If both are null, then we are in PRIMARY state on a storage engine that does
                    # not support "recover to a timestamp" or in RECOVERING immediately after
                    # "rollback via refetch". Since we do not update "minValid" in PRIMARY state,
                    # we leave "appliedThrough" as null so that the invariants below hold, rather
                    # than substituting the latest oplog entry for the "appliedThrough" value.
                    applied_through_ts = recovery_timestamp

                if minvalid_ts == null_ts:
                    # The server treats the "ts" field in the minValid document as missing when its
                    # value is the null timestamp.
                    minvalid_ts = applied_through_ts

                if latest_oplog_entry_ts == null_ts:
                    # If the oplog is empty, we treat the "minValid" as the latest oplog entry.
                    latest_oplog_entry_ts = minvalid_ts

                if oplog_truncate_after_ts == null_ts:
                    # The server treats the "oplogTruncateAfterPoint" field as missing when its
                    # value is the null timestamp. When it is null, the oplog is complete and
                    # should not be truncated, so it is effectively the top of the oplog.
                    oplog_truncate_after_ts = latest_oplog_entry_ts

                # Check the ordering invariants before the secondary has reconciled the end of
                # its oplog.
                # The "oplogTruncateAfterPoint" is set to the first timestamp of each batch of
                # oplog entries before they are written to the oplog. Thus, it can be ahead
                # of the top of the oplog before any oplog entries are written, and behind it
                # after some are written. Thus, we cannot compare it to the top of the oplog.

                # appliedThrough <= minValid
                # appliedThrough represents the end of the previous batch, so it is always the
                # earliest.
                if applied_through_ts > minvalid_ts:
                    raise errors.ServerFailure(
                        "The condition appliedThrough <= minValid ({} <= {}) doesn't hold: minValid"
                        " document={}, latest oplog entry={}".format(
                            applied_through_ts, minvalid_ts, minvalid_doc, latest_oplog_doc))

                # minValid <= oplogTruncateAfterPoint
                # This is true because this hook is never run after a rollback. Thus, we only
                # move "minValid" to the end of each batch after the batch is written to the oplog.
                # We reset the "oplogTruncateAfterPoint" to null before we move "minValid" from
                # the end of the previous batch to the end of the current batch. Thus "minValid"
                # must be less than or equal to the "oplogTruncateAfterPoint".
                if minvalid_ts > oplog_truncate_after_ts:
                    raise errors.ServerFailure(
                        "The condition minValid <= oplogTruncateAfterPoint ({} <= {}) doesn't"
                        " hold: minValid document={}, oplogTruncateAfterPoint document={},"
                        " latest oplog entry={}".format(minvalid_ts, oplog_truncate_after_ts,
                                                        minvalid_doc, oplog_truncate_after_doc,
                                                        latest_oplog_doc))

                # minvalid <= latest oplog entry
                # "minValid" is set to the end of a batch after the batch is written to the oplog.
                # Thus it is always less than or equal to the top of the oplog.
                if minvalid_ts > latest_oplog_entry_ts:
                    raise errors.ServerFailure(
                        "The condition minValid <= top of oplog ({} <= {}) doesn't"
                        " hold: minValid document={}, latest oplog entry={}".format(
                            minvalid_ts, latest_oplog_entry_ts, minvalid_doc, latest_oplog_doc))

            try:
                secondary.teardown()
            except errors.ServerFailure:
                raise errors.ServerFailure(
                    "{} did not exit cleanly after being started up as a standalone".format(
                        secondary))
        except pymongo.errors.OperationFailure as err:
            self.logger.exception(
                "Failed to read the minValid document, the oplogTruncateAfterPoint document,"
                " the last stable recovery timestamp, or the latest oplog entry from the"
                " mongod on port %d", secondary.port)
            raise errors.ServerFailure(
                "Failed to read the minValid document, the oplogTruncateAfterPoint document,"
                " the last stable recovery timestamp, or the latest oplog entry from the"
                " mongod on port {}: {}".format(secondary.port, err.args[0]))
        finally:
            # Set the secondary's options back to their original values.
            secondary.mongod_options["replSet"] = replset_name