def teardown_fixture(self, logger, abort=False):
    """
    Run a test that tears down the job's fixture.

    Return True if the teardown was successful, False otherwise.
    """
    # Refresh the fixture table before teardown to capture changes due to
    # CleanEveryN and stepdown hooks.
    self.report.logging_prefix = create_fixture_table(self.fixture)

    if abort:
        test_case = _fixture.FixtureAbortTestCase(self.test_queue_logger, self.fixture,
                                                  "job{}".format(self.job_num),
                                                  self.times_set_up)
        self.times_set_up += 1
    else:
        test_case = _fixture.FixtureTeardownTestCase(self.test_queue_logger, self.fixture,
                                                     "job{}".format(self.job_num))

    test_case(self.report)
    if self.report.find_test_info(test_case).status != "pass":
        logger.error("The teardown of %s failed.", self.fixture)
        return False

    return True

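# A minimal sketch (not framework code) of how a caller might consume
# teardown_fixture()'s boolean result; 'job' is a hypothetical object exposing
# the attributes used above.
def ensure_fixture_torn_down(job, abort=False):
    # Surface a failed teardown as a hard error so the suite does not keep
    # running against a half-torn-down fixture.
    if not job.teardown_fixture(job.logger, abort=abort):
        raise RuntimeError("Fixture teardown failed for job{}".format(job.job_num))
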
def __restart_init_sync(self, sync_node):
    # Tear down and restart the initial sync node so that initial sync starts again.
    sync_node.teardown()

    self.logger.info("Starting the initial sync node back up again...")
    sync_node.setup()
    self.logger.info(fixture_interface.create_fixture_table(self.fixture))
    sync_node.await_ready()

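# The restart idiom used by __restart_init_sync() (and by several hooks below),
# extracted as a hedged standalone sketch: tear the node down, bring it back up,
# log the refreshed fixture table, and block until the node is ready. 'logger',
# 'fixture', and 'node' are stand-ins for the attributes used above.
def restart_node_and_wait(logger, fixture, node):
    node.teardown()
    logger.info("Starting the node back up again...")
    node.setup()
    logger.info(fixture_interface.create_fixture_table(fixture))
    node.await_ready()
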
def _execute_test(self, test, hook_failure_flag):
    """Call the before/after test hooks and execute 'test'."""
    test.configure(self.fixture, config.NUM_CLIENTS_PER_FIXTURE)
    self._run_hooks_before_tests(test, hook_failure_flag)

    self.report.logging_prefix = create_fixture_table(self.fixture)
    test(self.report)
    try:
        if test.propagate_error is not None:
            raise test.propagate_error

        # We are intentionally only checking the individual 'test' status and not calling
        # report.wasSuccessful() here. It is possible that a thread running in the background
        # as part of a hook has added a failed test case to 'self.report'. Checking the
        # individual 'test' status ensures self._run_hooks_after_tests() is called if it is a
        # hook's test case that has failed and not 'test' that has failed.
        if self.suite_options.fail_fast and self.report.find_test_info(test).status != "pass":
            self.logger.info("%s failed, so stopping..." % (test.short_description()))
            raise errors.StopExecution("%s failed" % (test.short_description()))

        if self._check_if_fixture_running and not self.fixture.is_running():
            self.logger.error(
                "%s marked as a failure because the fixture crashed during the test.",
                test.short_description())
            self.report.setFailure(test, return_code=2)
            # Always fail fast if the fixture fails.
            raise errors.StopExecution("%s not running after %s" %
                                       (self.fixture, test.short_description()))
    finally:
        success = self.report.find_test_info(test).status == "pass"

        # Stop background hooks first since they can interfere with fixture startup and
        # teardown done as part of archival.
        self._run_hooks_after_tests(test, hook_failure_flag, background=True)

        if self.archival:
            result = TestResult(test=test, hook=None, success=success)
            self.archival.archive(self.logger, result, self.manager)

        self._run_hooks_after_tests(test, hook_failure_flag, background=False)

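# A hedged illustration of the fail-fast check in _execute_test() above: the
# individual test's status is consulted instead of report.wasSuccessful()
# because a background hook may have recorded an unrelated failure on the
# shared report. 'report' and 'test' mirror the shapes used above and are
# assumptions, not the framework's real objects.
def should_stop_early(report, test, fail_fast):
    this_test_failed = report.find_test_info(test).status != "pass"
    return fail_fast and this_test_failed
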
def _check_secondaries_and_restart_fixture(self):
    preserve_dbpaths = []
    for node in self.fixture.nodes:
        preserve_dbpaths.append(node.preserve_dbpath)
        node.preserve_dbpath = True

    for secondary in self.fixture.get_secondaries():
        self._check_invariants_as_standalone(secondary)

        self.logger.info("Restarting the secondary on port %d as a replica set node with"
                         " its data files intact...", secondary.port)
        # Start the 'secondary' mongod back up as part of the replica set and wait for it to
        # reach state SECONDARY.
        secondary.setup()
        self.logger.info(fixture.create_fixture_table(self.fixture))
        secondary.await_ready()
        self._await_secondary_state(secondary)

        try:
            secondary.teardown()
        except errors.ServerFailure:
            raise errors.ServerFailure(
                "{} did not exit cleanly after reconciling the end of its oplog".format(
                    secondary))

    self.logger.info("Starting the fixture back up again with its data files intact for final"
                     " validation...")

    try:
        self.fixture.setup()
        self.logger.info(fixture.create_fixture_table(self.fixture))
        self.fixture.await_ready()
    finally:
        for (i, node) in enumerate(self.fixture.nodes):
            node.preserve_dbpath = preserve_dbpaths[i]

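# A minimal sketch of the save/restore idiom that
# _check_secondaries_and_restart_fixture() applies to 'preserve_dbpath': flip
# the flag on every node, do the restart work, then restore the original values
# even if that work raises. contextlib is used here purely for illustration;
# the hook above does the same thing inline with try/finally.
import contextlib


@contextlib.contextmanager
def preserved_dbpaths(nodes):
    originals = [node.preserve_dbpath for node in nodes]
    for node in nodes:
        node.preserve_dbpath = True
    try:
        yield
    finally:
        for node, original in zip(nodes, originals):
            node.preserve_dbpath = original
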
def _restart_and_clear_fixture(self):
    # We restart the fixture after setting 'preserve_dbpath' back to its original value in
    # order to clear the contents of the data directory if desired. The CleanEveryN hook
    # cannot be used in combination with the PeriodicKillSecondaries hook because we may
    # attempt to call Fixture.teardown() while the "rsSyncApplyStop" failpoint is still
    # enabled on the secondaries, causing them to exit with a non-zero return code.
    self.logger.info("Finished verifying data consistency, stopping the fixture...")

    try:
        self.fixture.teardown()
    except errors.ServerFailure:
        raise errors.ServerFailure(
            "{} did not exit cleanly after verifying data consistency".format(self.fixture))

    self.logger.info("Starting the fixture back up again with no data...")

    self.fixture.setup()
    self.logger.info(fixture.create_fixture_table(self.fixture))
    self.fixture.await_ready()

def restart_node(self, chosen):
    """Restart the old primary node after the step-up and wait until it can be contacted."""
    self.logger.info("Waiting for the old primary on port %d of replica set '%s' to exit.",
                     chosen.port, self.replset_name)
    chosen.mongod.wait()

    self.logger.info("Attempting to restart the old primary on port %d of replica set '%s'.",
                     chosen.port, self.replset_name)

    # Restart the mongod on the old primary and wait until we can contact it again. Keep the
    # original preserve_dbpath to restore after restarting the mongod.
    original_preserve_dbpath = chosen.preserve_dbpath
    chosen.preserve_dbpath = True
    try:
        chosen.setup()
        self.logger.info(interface.create_fixture_table(self))
        chosen.await_ready()
    finally:
        chosen.preserve_dbpath = original_preserve_dbpath

def _step_down(self, rs_fixture):
    try:
        primary = rs_fixture.get_primary(timeout_secs=self._stepdown_interval_secs)
    except errors.ServerFailure:
        # We ignore the ServerFailure exception because it means a primary wasn't available.
        # We'll try again after self._stepdown_interval_secs seconds.
        return

    secondaries = rs_fixture.get_secondaries()

    # Check that the fixture is still running before stepping down or killing the primary.
    # This ensures we still detect some cases in which the fixture has already crashed.
    if not rs_fixture.is_running():
        raise errors.ServerFailure("ReplicaSetFixture {} expected to be running in"
                                   " ContinuousStepdown, but wasn't.".format(
                                       rs_fixture.replset_name))

    if self._terminate:
        # If we're running with background reconfigs, it's possible to be in a scenario
        # where we kill a necessary voting node (e.g. in a 5-node repl set where only 2
        # members are voting). In this scenario, we want to avoid killing the primary
        # because no secondary can step up.
        if self._background_reconfig:
            # Stagger the kill thread so that it runs a little after the reconfig thread.
            time.sleep(1)
            voting_members = rs_fixture.get_voting_members()

            self.logger.info("Current voting members: %s", voting_members)

            if len(voting_members) <= 3:
                # Do not kill or terminate the primary if we don't have enough voting nodes
                # to elect a new primary.
                return

        should_kill = self._kill and random.choice([True, False])
        action = "Killing" if should_kill else "Terminating"
        self.logger.info("%s the primary on port %d of replica set '%s'.", action,
                         primary.port, rs_fixture.replset_name)

        # We send the mongod process the signal to exit but don't immediately wait for it to
        # exit because clean shutdown may take a while and we want to restore write
        # availability as quickly as possible.
        teardown_mode = (fixture_interface.TeardownMode.KILL
                         if should_kill else fixture_interface.TeardownMode.TERMINATE)
        primary.mongod.stop(mode=teardown_mode)
    elif not self._stepdown_via_heartbeats:
        self.logger.info("Stepping down the primary on port %d of replica set '%s'.",
                         primary.port, rs_fixture.replset_name)
        try:
            client = self._create_client(primary)
            client.admin.command(
                bson.SON([
                    ("replSetStepDown", self._stepdown_duration_secs),
                    ("force", True),
                ]))
        except pymongo.errors.AutoReconnect:
            # AutoReconnect exceptions are expected as connections are closed during stepdown.
            pass
        except pymongo.errors.PyMongoError:
            self.logger.exception(
                "Error while stepping down the primary on port %d of replica set '%s'.",
                primary.port, rs_fixture.replset_name)
            raise

    # If we want to step up secondaries instead, we have skipped stepping down the primary
    # above. Here, we simply need to pick an arbitrary secondary to run for election, which
    # will lead to an unconditional stepdown on the primary.
    #
    # If we have terminated/killed/stepped down the primary above, write availability has
    # been lost. We pick an arbitrary secondary to run for election immediately in order to
    # avoid a long period where the replica set doesn't have write availability. If none of
    # the secondaries are eligible, or their election attempt fails, then we'll run the
    # replSetStepUp command on 'primary' to ensure we have write availability sooner than the
    # self._stepdown_duration_secs duration expires.
    while secondaries:
        chosen = random.choice(secondaries)

        self.logger.info("Attempting to step up the secondary on port %d of replica set"
                         " '%s'.", chosen.port, rs_fixture.replset_name)

        try:
            client = self._create_client(chosen)
            client.admin.command("replSetStepUp")
            break
        except pymongo.errors.OperationFailure:
            # OperationFailure exceptions are expected when the election attempt fails due
            # to not receiving enough votes. This can happen when the 'chosen' secondary's
            # opTime is behind that of other secondaries. We handle this by attempting to
            # elect a different secondary.
            self.logger.info("Failed to step up the secondary on port %d of replica set"
                             " '%s'.", chosen.port, rs_fixture.replset_name)
            secondaries.remove(chosen)
        except pymongo.errors.AutoReconnect:
            # It is possible for a replSetStepUp to fail with AutoReconnect if that node goes
            # into Rollback (which causes it to close any open connections).
            pass

    if self._terminate:
        self.logger.info("Waiting for the old primary on port %d of replica set '%s' to"
                         " exit.", primary.port, rs_fixture.replset_name)

        primary.mongod.wait()

        self.logger.info("Attempting to restart the old primary on port %d of replica set"
                         " '%s'.", primary.port, rs_fixture.replset_name)

        # Restart the mongod on the old primary and wait until we can contact it again. Keep
        # the original preserve_dbpath to restore after restarting the mongod.
        original_preserve_dbpath = primary.preserve_dbpath
        primary.preserve_dbpath = True
        try:
            primary.setup()
            self.logger.info(fixture_interface.create_fixture_table(self._fixture))
            primary.await_ready()
        finally:
            primary.preserve_dbpath = original_preserve_dbpath
    elif not self._stepdown_via_heartbeats:
        # If we chose to step up a secondary instead, the primary was never stepped down via
        # the replSetStepDown command and thus will not have a stepdown period. So we can
        # skip running {replSetFreeze: 0}. Otherwise, the replSetStepDown command run earlier
        # introduced a stepdown period on the former primary and so we have to run the
        # {replSetFreeze: 0} command to ensure the former primary is electable in the next
        # round of _step_down().
        client = self._create_client(primary)
        client.admin.command({"replSetFreeze": 0})
    elif secondaries:
        # We successfully stepped up a secondary, so wait for the former primary to step down
        # via heartbeats. We need to wait for the former primary to step down to complete
        # this stepdown round and to avoid races between the ContinuousStepdown hook and
        # other test hooks that may depend on the health of the replica set.
        self.logger.info("Successfully stepped up the secondary on port %d of replica set"
                         " '%s'.", chosen.port, rs_fixture.replset_name)
        while True:
            try:
                client = self._create_client(primary)
                is_secondary = client.admin.command("isMaster")["secondary"]
                if is_secondary:
                    break
            except pymongo.errors.AutoReconnect:
                pass
            self.logger.info("Waiting for primary on port %d of replica set '%s' to step"
                             " down.", primary.port, rs_fixture.replset_name)
            time.sleep(0.2)  # Wait a little bit before trying again.
        self.logger.info("Primary on port %d of replica set '%s' stepped down.", primary.port,
                         rs_fixture.replset_name)

    if not secondaries:
        # If we failed to step up one of the secondaries, then we run replSetStepUp to try to
        # elect the former primary again. This way we don't need to wait
        # self._stepdown_duration_secs seconds to restore write availability to the cluster.
        # Since the former primary may have been killed, we need to wait until it has been
        # restarted by retrying replSetStepUp.
        retry_time_secs = rs_fixture.AWAIT_REPL_TIMEOUT_MINS * 60
        retry_start_time = time.time()
        while True:
            try:
                client = self._create_client(primary)
                client.admin.command("replSetStepUp")
                break
            except pymongo.errors.OperationFailure:
                self._wait(0.2)
                if time.time() - retry_start_time > retry_time_secs:
                    raise errors.ServerFailure(
                        "The old primary on port {} of replica set {} did not step up in"
                        " {} seconds.".format(primary.port, rs_fixture.replset_name,
                                              retry_time_secs))

    # Bump the counter for the chosen secondary to indicate that the replSetStepUp command
    # executed successfully.
    key = "{}/{}".format(rs_fixture.replset_name,
                         chosen.get_internal_connection_string() if secondaries else "none")
    self._step_up_stats[key] += 1

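# A self-contained sketch (assuming a reachable mongod on 'host:port') of the
# stepdown pattern _step_down() uses: replSetStepDown closes connections on
# success, so pymongo raising AutoReconnect is the expected success signal
# rather than an error.
import bson
import pymongo


def step_down_primary(host, port, stepdown_duration_secs):
    client = pymongo.MongoClient(host, port)
    try:
        client.admin.command(
            bson.SON([
                ("replSetStepDown", stepdown_duration_secs),
                ("force", True),
            ]))
    except pymongo.errors.AutoReconnect:
        # Expected: the primary drops connections as it steps down.
        pass
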
def change_version_and_restart_node(self, primary, auth_options):
    """
    Select a secondary to step up.

    Ensure the chosen secondary's version differs from that of the old primary, and change
    the secondary's version if needed.
    """

    def get_chosen_node_from_replsetstatus(status_member_infos):
        max_optime = None
        chosen_index = None
        # We always select the secondary with the highest optime to step up.
        for member_info in status_member_infos:
            if member_info.get("self", False):
                # Ignore self, which is the old primary and not eligible to be re-elected in
                # a downgrade multiversion cluster.
                continue
            optime_dict = member_info["optime"]
            if max_optime is None:
                chosen_index = member_info["_id"]
                max_optime = optime_dict
            else:
                if compare_optime(optime_dict, max_optime) > 0:
                    chosen_index = member_info["_id"]
                    max_optime = optime_dict

        if chosen_index is None or max_optime is None:
            raise self.fixturelib.ServerFailure(
                "Failed to find a secondary eligible for "
                f"election; index: {chosen_index}, optime: {max_optime}")

        return self.nodes[chosen_index]

    primary_client = interface.authenticate(primary.mongo_client(), auth_options)

    retry_time_secs = self.AWAIT_REPL_TIMEOUT_MINS * 60
    retry_start_time = time.time()

    while True:
        member_infos = primary_client.admin.command({"replSetGetStatus": 1})["members"]
        chosen_node = get_chosen_node_from_replsetstatus(member_infos)

        if chosen_node.change_version_if_needed(primary):
            self.logger.info(
                "Waiting for the chosen secondary on port %d of replica set '%s' to exit.",
                chosen_node.port, self.replset_name)

            teardown_mode = interface.TeardownMode.TERMINATE
            chosen_node.mongod.stop(mode=teardown_mode)
            chosen_node.mongod.wait()

            self.logger.info(
                "Attempting to restart the chosen secondary on port %d of replica set '%s'.",
                chosen_node.port, self.replset_name)

            chosen_node.setup()
            self.logger.info(interface.create_fixture_table(self))
            chosen_node.await_ready()

        if self.stepup_node(chosen_node, auth_options):
            break

        if time.time() - retry_start_time > retry_time_secs:
            raise self.fixturelib.ServerFailure(
                "The chosen secondary on port {} of replica set {} did not step up in"
                " {} seconds.".format(chosen_node.port, self.replset_name, retry_time_secs))

    return chosen_node

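# A hedged sketch of the ordering that compare_optime() is assumed to implement
# above: replSetGetStatus optimes are compared by term first, then by
# timestamp. The real helper lives elsewhere in the framework; this is an
# illustration only, relying on bson.Timestamp's rich comparison operators.
def compare_optime_sketch(optime1, optime2):
    if optime1["t"] != optime2["t"]:
        return -1 if optime1["t"] < optime2["t"] else 1
    if optime1["ts"] == optime2["ts"]:
        return 0
    return -1 if optime1["ts"] < optime2["ts"] else 1
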
def _check_invariants_as_standalone(self, secondary):
    # pylint: disable=too-many-locals,too-many-branches,too-many-statements
    # We remove the --replSet option in order to start the node as a standalone.
    replset_name = secondary.mongod_options.pop("replSet")

    self.logger.info("Restarting the secondary on port %d as a standalone node with"
                     " its data files intact...", secondary.port)

    try:
        secondary.setup()
        self.logger.info(fixture.create_fixture_table(self.fixture))
        secondary.await_ready()

        client = secondary.mongo_client()
        minvalid_doc = client.local["replset.minvalid"].find_one()
        oplog_truncate_after_doc = client.local["replset.oplogTruncateAfterPoint"].find_one()
        recovery_timestamp_res = client.admin.command("replSetTest",
                                                      getLastStableRecoveryTimestamp=True)
        latest_oplog_doc = client.local["oplog.rs"].find_one(
            sort=[("$natural", pymongo.DESCENDING)])

        self.logger.info("Checking invariants: minValid: {}, oplogTruncateAfterPoint: {},"
                         " stable recovery timestamp: {}, latest oplog doc: {}".format(
                             minvalid_doc, oplog_truncate_after_doc, recovery_timestamp_res,
                             latest_oplog_doc))

        null_ts = bson.Timestamp(0, 0)

        # We wait for a stable recovery timestamp at setup, so we must have an oplog.
        latest_oplog_entry_ts = null_ts
        if latest_oplog_doc is None:
            raise errors.ServerFailure("No latest oplog entry")
        latest_oplog_entry_ts = latest_oplog_doc.get("ts")
        if latest_oplog_entry_ts is None:
            raise errors.ServerFailure(
                "Latest oplog entry had no 'ts' field: {}".format(latest_oplog_doc))

        # The "oplogTruncateAfterPoint" document may not exist at startup. If so, we default
        # it to null.
        oplog_truncate_after_ts = null_ts
        if oplog_truncate_after_doc is not None:
            oplog_truncate_after_ts = oplog_truncate_after_doc.get("oplogTruncateAfterPoint",
                                                                   null_ts)

        # The "lastStableRecoveryTimestamp" field is present if the storage engine supports
        # "recover to a timestamp". If it's a null timestamp on a durable storage engine,
        # that means we do not yet have a stable checkpoint timestamp and must be restarting
        # at the top of the oplog. Since we wait for a stable recovery timestamp at test
        # fixture setup, we should never encounter a null timestamp here.
        recovery_timestamp = recovery_timestamp_res.get("lastStableRecoveryTimestamp")
        if recovery_timestamp == null_ts:
            raise errors.ServerFailure(
                "Received null stable recovery timestamp {}".format(recovery_timestamp_res))
        # On a storage engine that doesn't support "recover to a timestamp", we default to
        # null.
        if recovery_timestamp is None:
            recovery_timestamp = null_ts

        # last stable recovery timestamp <= top of oplog
        if not recovery_timestamp <= latest_oplog_entry_ts:
            raise errors.ServerFailure("The condition last stable recovery timestamp <= top"
                                       " of oplog ({} <= {}) doesn't hold:"
                                       " getLastStableRecoveryTimestamp result={},"
                                       " latest oplog entry={}".format(
                                           recovery_timestamp, latest_oplog_entry_ts,
                                           recovery_timestamp_res, latest_oplog_doc))

        if minvalid_doc is not None:
            applied_through_ts = minvalid_doc.get("begin", {}).get("ts", null_ts)
            minvalid_ts = minvalid_doc.get("ts", null_ts)

            # The "appliedThrough" value should always equal the "last stable recovery
            # timestamp", AKA the stable checkpoint for durable engines, on server restart.
            #
            # The written "appliedThrough" time is updated with the latest timestamp at the
            # end of each batch application, and batch boundaries are the only valid stable
            # timestamps on secondaries.
            # Therefore, a non-null appliedThrough timestamp must equal the checkpoint
            # timestamp, because any stable timestamp that the checkpoint could use includes
            # an equal persisted appliedThrough timestamp.
            if (recovery_timestamp != null_ts and applied_through_ts != null_ts
                    and recovery_timestamp != applied_through_ts):
                raise errors.ServerFailure(
                    "The condition last stable recovery timestamp ({}) == appliedThrough"
                    " ({}) doesn't hold: minValid document={},"
                    " getLastStableRecoveryTimestamp result={}, last oplog entry={}".format(
                        recovery_timestamp, applied_through_ts, minvalid_doc,
                        recovery_timestamp_res, latest_oplog_doc))

            if applied_through_ts == null_ts:
                # We clear "appliedThrough" to represent having applied through the top of
                # the oplog in PRIMARY state or immediately after "rollback via refetch".
                # If we are using a storage engine that supports "recover to a timestamp,"
                # then we will have a "last stable recovery timestamp" and we should use that
                # as our "appliedThrough" (similarly to why we assert their equality above).
                # If both are null, then we are in PRIMARY state on a storage engine that
                # does not support "recover to a timestamp" or in RECOVERING immediately
                # after "rollback via refetch". Since we do not update "minValid" in PRIMARY
                # state, we leave "appliedThrough" as null so that the invariants below hold,
                # rather than substituting the latest oplog entry for the "appliedThrough"
                # value.
                applied_through_ts = recovery_timestamp

            if minvalid_ts == null_ts:
                # The server treats the "ts" field in the minValid document as missing when
                # its value is the null timestamp.
                minvalid_ts = applied_through_ts

            if latest_oplog_entry_ts == null_ts:
                # If the oplog is empty, we treat the "minValid" as the latest oplog entry.
                latest_oplog_entry_ts = minvalid_ts

            if oplog_truncate_after_ts == null_ts:
                # The server treats the "oplogTruncateAfterPoint" field as missing when its
                # value is the null timestamp. When it is null, the oplog is complete and
                # should not be truncated, so it is effectively the top of the oplog.
                oplog_truncate_after_ts = latest_oplog_entry_ts

            # Check the ordering invariants before the secondary has reconciled the end of
            # its oplog. The "oplogTruncateAfterPoint" is set to the first timestamp of each
            # batch of oplog entries before they are written to the oplog. Thus, it can be
            # ahead of the top of the oplog before any oplog entries are written, and behind
            # it after some are written. Thus, we cannot compare it to the top of the oplog.

            # appliedThrough <= minValid
            # appliedThrough represents the end of the previous batch, so it is always the
            # earliest.
            if applied_through_ts > minvalid_ts:
                raise errors.ServerFailure(
                    "The condition appliedThrough <= minValid ({} <= {}) doesn't hold:"
                    " minValid document={}, latest oplog entry={}".format(
                        applied_through_ts, minvalid_ts, minvalid_doc, latest_oplog_doc))

            # minValid <= oplogTruncateAfterPoint
            # This is true because this hook is never run after a rollback. Thus, we only
            # move "minValid" to the end of each batch after the batch is written to the
            # oplog. We reset the "oplogTruncateAfterPoint" to null before we move "minValid"
            # from the end of the previous batch to the end of the current batch. Thus
            # "minValid" must be less than or equal to the "oplogTruncateAfterPoint".
            if minvalid_ts > oplog_truncate_after_ts:
                raise errors.ServerFailure(
                    "The condition minValid <= oplogTruncateAfterPoint ({} <= {}) doesn't"
                    " hold: minValid document={}, oplogTruncateAfterPoint document={},"
                    " latest oplog entry={}".format(minvalid_ts, oplog_truncate_after_ts,
                                                    minvalid_doc, oplog_truncate_after_doc,
                                                    latest_oplog_doc))

            # minValid <= latest oplog entry
            # "minValid" is set to the end of a batch after the batch is written to the
            # oplog. Thus it is always less than or equal to the top of the oplog.
            if minvalid_ts > latest_oplog_entry_ts:
                raise errors.ServerFailure(
                    "The condition minValid <= top of oplog ({} <= {}) doesn't"
                    " hold: minValid document={}, latest oplog entry={}".format(
                        minvalid_ts, latest_oplog_entry_ts, minvalid_doc, latest_oplog_doc))

        try:
            secondary.teardown()
        except errors.ServerFailure:
            raise errors.ServerFailure(
                "{} did not exit cleanly after being started up as a standalone".format(
                    secondary))
    except pymongo.errors.OperationFailure as err:
        self.logger.exception(
            "Failed to read the minValid document, the oplogTruncateAfterPoint document,"
            " the last stable recovery timestamp, or the latest oplog entry from the"
            " mongod on port %d", secondary.port)
        raise errors.ServerFailure(
            "Failed to read the minValid document, the oplogTruncateAfterPoint document,"
            " the last stable recovery timestamp, or the latest oplog entry from the"
            " mongod on port {}: {}".format(secondary.port, err.args[0]))
    finally:
        # Set the secondary's options back to their original values.
        secondary.mongod_options["replSet"] = replset_name

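# A tiny worked example of the ordering invariants checked above, using
# bson.Timestamp's rich comparisons; the values are made up and merely show a
# state satisfying appliedThrough <= minValid <= oplogTruncateAfterPoint and
# minValid <= top of oplog.
import bson

applied_through_ts = bson.Timestamp(100, 1)  # End of the previously applied batch.
minvalid_ts = bson.Timestamp(100, 5)  # End of the current batch.
oplog_truncate_after_ts = bson.Timestamp(100, 5)  # Null replaced by the top of the oplog.
latest_oplog_entry_ts = bson.Timestamp(100, 5)
assert applied_through_ts <= minvalid_ts <= oplog_truncate_after_ts
assert minvalid_ts <= latest_oplog_entry_ts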