def __init__(self, ctx, manager, config, name, logger):
    super(MonitorThrasher, self).__init__()

    self.ctx = ctx
    self.manager = manager
    self.manager.wait_for_clean()

    self.stopping = False
    self.logger = logger
    self.config = config
    self.name = name

    if self.config is None:
        self.config = dict()

    """ Test reproducibility """
    self.random_seed = self.config.get('seed', None)
    if self.random_seed is None:
        self.random_seed = int(time.time())

    self.rng = random.Random()
    self.rng.seed(int(self.random_seed))

    """ Monitor thrashing """
    self.revive_delay = float(self.config.get('revive_delay', 10.0))
    self.thrash_delay = float(self.config.get('thrash_delay', 0.0))

    self.thrash_many = self.config.get('thrash_many', False)
    self.maintain_quorum = self.config.get('maintain_quorum', True)

    self.scrub = self.config.get('scrub', True)

    self.freeze_mon_probability = float(self.config.get('freeze_mon_probability', 10))
    self.freeze_mon_duration = float(self.config.get('freeze_mon_duration', 15.0))

    assert self.max_killable() > 0, \
        'Unable to kill at least one monitor with the current config.'

    """ Store thrashing """
    self.store_thrash = self.config.get('store_thrash', False)
    self.store_thrash_probability = int(
        self.config.get('store_thrash_probability', 50))
    if self.store_thrash:
        assert self.store_thrash_probability > 0, \
            'store_thrash is set, probability must be > 0'
        assert self.maintain_quorum, \
            'store_thrash = true must imply maintain_quorum = true'

    # MDS failover
    self.mds_failover = self.config.get('check_mds_failover', False)

    if self.mds_failover:
        self.mds_cluster = MDSCluster(ctx)

    self.thread = gevent.spawn(self.do_thrash)
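
# A minimal sketch of a config dict covering the keys __init__ reads above.
# The name _example_mon_thrash_config is only illustrative; the values mirror
# the defaults used by __init__, and the comments paraphrase the option names
# rather than quoting authoritative documentation.
_example_mon_thrash_config = {
    'seed': None,                    # None -> seed the RNG from time.time()
    'revive_delay': 10.0,            # seconds to wait before reviving a killed mon
    'thrash_delay': 0.0,             # seconds to wait between thrash iterations
    'thrash_many': False,            # whether to kill more than one mon at a time
    'maintain_quorum': True,         # never kill enough mons to break quorum
    'scrub': True,                   # issue mon scrubs while thrashing
    'freeze_mon_probability': 10,    # chance of freezing a mon instead of killing it
    'freeze_mon_duration': 15.0,     # how long a frozen mon stays frozen
    'store_thrash': False,           # also thrash the mon store
    'store_thrash_probability': 50,  # must be > 0 when store_thrash is set
    'check_mds_failover': False,     # verify MDS failover while mons are thrashed
}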
def ready(ctx, config):
    """
    That the file system is ready for clients.
    """

    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'task only accepts a dict for configuration'

    timeout = config.get('timeout', 300)

    mdsc = MDSCluster(ctx)
    status = mdsc.status()

    for filesystem in status.get_filesystems():
        fs = Filesystem(ctx, fscid=filesystem['id'])
        fs.wait_for_daemons(timeout=timeout, status=status)
def pre_upgrade_save(ctx, config):
    """
    That the upgrade procedure doesn't clobber state: save state.
    """

    mdsc = MDSCluster(ctx)
    status = mdsc.status()

    state = {}
    ctx['mds-upgrade-state'] = state

    for fs in list(status.get_filesystems()):
        fscid = fs['id']
        mdsmap = fs['mdsmap']
        fs_state = {}
        fs_state['epoch'] = mdsmap['epoch']
        fs_state['max_mds'] = mdsmap['max_mds']
        fs_state['flags'] = mdsmap['flags'] & UPGRADE_FLAGS_MASK
        state[fscid] = fs_state
        log.debug(f"fs fscid={fscid},name={mdsmap['fs_name']} state = {fs_state}")
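
# For reference, the per-filesystem snapshot written into ctx['mds-upgrade-state']
# above is keyed by fscid and has exactly the fields built in the loop; the
# concrete values shown here are hypothetical:
#
#     {
#         1: {'epoch': 17, 'max_mds': 2,
#             'flags': CEPH_MDSMAP_ALLOW_STANDBY_REPLAY},
#     }
#
# post_upgrade_checks() below reads this same structure back to verify that
# max_mds and the masked flags were restored after the upgrade.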
def post_upgrade_checks(ctx, config):
    """
    That the upgrade procedure doesn't clobber state.
    """

    state = ctx['mds-upgrade-state']

    mdsc = MDSCluster(ctx)
    status = mdsc.status()

    for fs in list(status.get_filesystems()):
        fscid = fs['id']
        mdsmap = fs['mdsmap']
        fs_state = state[fscid]
        log.debug(f"checking fs fscid={fscid},name={mdsmap['fs_name']} state = {fs_state}")

        # check state was restored to previous values
        assert fs_state['max_mds'] == mdsmap['max_mds']
        assert fs_state['flags'] == (mdsmap['flags'] & UPGRADE_FLAGS_MASK)

        # now confirm that the upgrade procedure was followed
        epoch = mdsmap['epoch']
        pre_upgrade_epoch = fs_state['epoch']
        assert pre_upgrade_epoch < epoch
        should_decrease_max_mds = fs_state['max_mds'] > 1
        did_decrease_max_mds = False
        should_disable_allow_standby_replay = fs_state['flags'] & CEPH_MDSMAP_ALLOW_STANDBY_REPLAY
        did_disable_allow_standby_replay = False
        for i in range(pre_upgrade_epoch+1, mdsmap['epoch']):
            old_status = mdsc.status(epoch=i)
            old_fs = old_status.get_fsmap(fscid)
            old_mdsmap = old_fs['mdsmap']
            if should_decrease_max_mds and old_mdsmap['max_mds'] == 1:
                log.debug(f"max_mds reduced in epoch {i}")
                did_decrease_max_mds = True
            if should_disable_allow_standby_replay and not (old_mdsmap['flags'] & CEPH_MDSMAP_ALLOW_STANDBY_REPLAY):
                log.debug(f"allow_standby_replay disabled in epoch {i}")
                did_disable_allow_standby_replay = True
        assert not should_decrease_max_mds or did_decrease_max_mds
        assert not should_disable_allow_standby_replay or did_disable_allow_standby_replay
def task(ctx, config):
    """
    Run the CephFS test cases.

    Run everything in tasks/cephfs/test_*.py:

    ::

        tasks:
          - install:
          - ceph:
          - ceph-fuse:
          - cephfs_test_runner:

    `modules` argument allows running only some specific modules:

    ::

        tasks:
            ...
          - cephfs_test_runner:
              modules:
                 - tasks.cephfs.test_sessionmap
                 - tasks.cephfs.test_auto_repair

    By default, any cases that can't be run on the current cluster
    configuration will generate a failure.  When the optional `fail_on_skip`
    argument is set to false, any tests that can't be run on the current
    configuration will simply be skipped:

    ::

        tasks:
            ...
          - cephfs_test_runner:
              fail_on_skip: false
    """
    ceph_cluster = CephCluster(ctx)

    if len(list(misc.all_roles_of_type(ctx.cluster, 'mds'))):
        mds_cluster = MDSCluster(ctx)
        fs = Filesystem(ctx)
    else:
        mds_cluster = None
        fs = None

    if len(list(misc.all_roles_of_type(ctx.cluster, 'mgr'))):
        mgr_cluster = MgrCluster(ctx)
    else:
        mgr_cluster = None

    # Mount objects, sorted by ID
    if hasattr(ctx, 'mounts'):
        mounts = [v for k, v in sorted(ctx.mounts.items(), key=lambda mount: mount[0])]
    else:
        # The test configuration has a filesystem but no fuse/kclient mounts
        mounts = []

    decorating_loader = DecoratingLoader({
        "ctx": ctx,
        "mounts": mounts,
        "fs": fs,
        "ceph_cluster": ceph_cluster,
        "mds_cluster": mds_cluster,
        "mgr_cluster": mgr_cluster,
    })

    fail_on_skip = config.get('fail_on_skip', True)

    # Put useful things onto ctx for interactive debugging
    ctx.fs = fs
    ctx.mds_cluster = mds_cluster
    ctx.mgr_cluster = mgr_cluster

    # Depending on config, either load specific modules, or scan for modules
    if config and 'modules' in config and config['modules']:
        module_suites = []
        for mod_name in config['modules']:
            # Test names like cephfs.test_auto_repair
            module_suites.append(decorating_loader.loadTestsFromName(mod_name))
        overall_suite = suite.TestSuite(module_suites)
    else:
        # Default, run all tests
        overall_suite = decorating_loader.discover(
            os.path.join(
                os.path.dirname(os.path.abspath(__file__)),
                "cephfs/"
            )
        )

    if ctx.config.get("interactive-on-error", False):
        InteractiveFailureResult.ctx = ctx
        result_class = InteractiveFailureResult
    else:
        result_class = unittest.TextTestResult

    class LoggingResult(result_class):
        def startTest(self, test):
            log.info("Starting test: {0}".format(self.getDescription(test)))
            return super(LoggingResult, self).startTest(test)

        def addSkip(self, test, reason):
            if fail_on_skip:
                # Don't just call addFailure because that requires a traceback
                self.failures.append((test, reason))
            else:
                super(LoggingResult, self).addSkip(test, reason)

    # Execute!
    result = unittest.TextTestRunner(
        stream=LogStream(),
        resultclass=LoggingResult,
        verbosity=2,
        failfast=True).run(overall_suite)

    if not result.wasSuccessful():
        result.printErrors()  # duplicate output at end for convenience

        bad_tests = []
        for test, error in result.errors:
            bad_tests.append(str(test))
        for test, failure in result.failures:
            bad_tests.append(str(test))

        raise RuntimeError("Test failure: {0}".format(", ".join(bad_tests)))

    yield
def task(ctx, config):
    """
    Stress test the mds by thrashing while another task/workunit
    is running.

    Please refer to MDSThrasher class for further information on the
    available options.
    """

    mds_cluster = MDSCluster(ctx)

    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'mds_thrash task only accepts a dict for configuration'
    mdslist = list(teuthology.all_roles_of_type(ctx.cluster, 'mds'))
    assert len(mdslist) > 1, \
        'mds_thrash task requires at least 2 metadata servers'

    # choose random seed
    if 'seed' in config:
        seed = int(config['seed'])
    else:
        seed = int(time.time())
    log.info('mds thrasher using random seed: {seed}'.format(seed=seed))
    random.seed(seed)

    (first, ) = ctx.cluster.only(
        'mds.{_id}'.format(_id=mdslist[0])).remotes.keys()
    manager = ceph_manager.CephManager(
        first, ctx=ctx, logger=log.getChild('ceph_manager'),
    )

    # make sure everyone is in active, standby, or standby-replay
    log.info('Wait for all MDSs to reach steady state...')
    status = mds_cluster.status()
    while True:
        steady = True
        for info in status.get_all():
            state = info['state']
            if state not in ('up:active', 'up:standby', 'up:standby-replay'):
                steady = False
                break
        if steady:
            break
        sleep(2)
        status = mds_cluster.status()
    log.info('Ready to start thrashing')

    manager.wait_for_clean()
    assert manager.is_clean()

    if 'cluster' not in config:
        config['cluster'] = 'ceph'

    for fs in status.get_filesystems():
        thrasher = MDSThrasher(ctx, manager, config,
                               Filesystem(ctx, fs['id']),
                               fs['mdsmap']['max_mds'])
        thrasher.start()
        ctx.ceph[config['cluster']].thrashers.append(thrasher)

    try:
        log.debug('Yielding')
        yield
    finally:
        log.info('joining mds_thrasher')
        thrasher.stop()
        if thrasher.exception is not None:
            raise RuntimeError('error during thrashing')
        thrasher.join()
        log.info('done joining')
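
# The task above reads only 'seed' and 'cluster' itself; the rest of the dict
# is handed to MDSThrasher unchanged. A hypothetical config expressed as the
# equivalent Python dict (the variable name is illustrative only):
_example_mds_thrash_config = {
    'seed': 31337,      # optional, defaults to int(time.time())
    'cluster': 'ceph',  # optional, defaults to 'ceph'
    # any MDSThrasher options ride along in this same dict
}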
def task(ctx, config):
    """
    Stress test the mds by thrashing while another task/workunit
    is running.

    Please refer to MDSThrasher class for further information on the
    available options.
    """
    mds_cluster = MDSCluster(ctx)

    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'mds_thrash task only accepts a dict for configuration'
    mdslist = list(teuthology.all_roles_of_type(ctx.cluster, 'mds'))
    assert len(mdslist) > 1, \
        'mds_thrash task requires at least 2 metadata servers'

    # choose random seed
    if 'seed' in config:
        seed = int(config['seed'])
    else:
        seed = int(time.time())
    log.info('mds thrasher using random seed: {seed}'.format(seed=seed))
    random.seed(seed)

    max_thrashers = config.get('max_thrash', 1)
    thrashers = {}

    (first, ) = ctx.cluster.only(
        'mds.{_id}'.format(_id=mdslist[0])).remotes.keys()
    manager = ceph_manager.CephManager(
        first, ctx=ctx, logger=log.getChild('ceph_manager'),
    )

    # make sure everyone is in active, standby, or standby-replay
    log.info('Wait for all MDSs to reach steady state...')
    statuses = None
    statuses_by_rank = None
    while True:
        statuses = {m: mds_cluster.get_mds_info(m) for m in mdslist}
        statuses_by_rank = {}
        for _, s in statuses.items():
            if isinstance(s, dict):
                statuses_by_rank[s['rank']] = s

        ready = [(name, s) for (name, s) in statuses.items()
                 if s is not None and s['state'] in
                 ('up:active', 'up:standby', 'up:standby-replay')]
        if len(ready) == len(statuses):
            break
        time.sleep(2)
    log.info('Ready to start thrashing')

    # setup failure groups
    failure_groups = {}
    actives = {s['name']: s for (_, s) in statuses.items() if s['state'] == 'up:active'}
    log.info('Actives is: {d}'.format(d=actives))
    log.info('Statuses is: {d}'.format(d=statuses_by_rank))
    for active in actives:
        for (r, s) in statuses.items():
            if s['standby_for_name'] == active:
                if active not in failure_groups:
                    failure_groups[active] = []
                log.info('Assigning mds rank {r} to failure group {g}'.format(
                    r=r, g=active))
                failure_groups[active].append(r)

    manager.wait_for_clean()
    for (active, standbys) in failure_groups.items():
        weight = 1.0
        if 'thrash_weights' in config:
            weight = float(config['thrash_weights'].get(
                'mds.{_id}'.format(_id=active), '0.0'))
        failure_group = [active]
        failure_group.extend(standbys)

        thrasher = MDSThrasher(
            ctx, manager, mds_cluster, config,
            logger=log.getChild(
                'mds_thrasher.failure_group.[{a}, {sbs}]'.format(
                    a=active, sbs=', '.join(standbys))),
            failure_group=failure_group,
            weight=weight)
        thrasher.start()
        thrashers[active] = thrasher

        # if thrash_weights isn't specified and we've reached max_thrash,
        # we're done
        if 'thrash_weights' not in config and len(thrashers) == max_thrashers:
            break

    try:
        log.debug('Yielding')
        yield
    finally:
        log.info('joining mds_thrashers')
        for t in thrashers:
            log.info('join thrasher for failure group [{fg}]'.format(
                fg=', '.join(failure_group)))
            thrashers[t].stop()
            thrashers[t].get()  # Raise any exception from _run()
            thrashers[t].join()
        log.info('done joining')
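
# This older variant additionally reads 'max_thrash' and 'thrash_weights'.
# A hypothetical config showing their shape, based on how the values are
# parsed above (weights are looked up per active daemon by 'mds.<name>' and
# passed straight through to MDSThrasher); the daemon names are examples:
_example_legacy_mds_thrash_config = {
    'seed': 31337,
    'max_thrash': 1,        # stop spawning thrashers after this many,
                            # unless thrash_weights is given
    'thrash_weights': {
        'mds.a': 0.5,       # weight handed to the thrasher for this failure group
        'mds.b': 0.0,       # unlisted daemons default to '0.0'
    },
}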
def task(ctx, config):
    """
    Stress test the mds by running scrub iterations while another task/workunit
    is running.

    Example config:

    - fwd_scrub:
      scrub_timeout: 300
      sleep_between_iterations: 1
    """

    mds_cluster = MDSCluster(ctx)

    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'fwd_scrub task only accepts a dict for configuration'
    mdslist = list(teuthology.all_roles_of_type(ctx.cluster, 'mds'))
    assert len(mdslist) > 0, \
        'fwd_scrub task requires at least 1 metadata server'

    (first, ) = ctx.cluster.only(f'mds.{mdslist[0]}').remotes.keys()
    manager = ceph_manager.CephManager(
        first, ctx=ctx, logger=log.getChild('ceph_manager'),
    )

    # make sure everyone is in active, standby, or standby-replay
    log.info('Wait for all MDSs to reach steady state...')
    status = mds_cluster.status()
    while True:
        steady = True
        for info in status.get_all():
            state = info['state']
            if state not in ('up:active', 'up:standby', 'up:standby-replay'):
                steady = False
                break
        if steady:
            break
        sleep(2)
        status = mds_cluster.status()

    log.info('Ready to start scrub thrashing')

    manager.wait_for_clean()
    assert manager.is_clean()

    if 'cluster' not in config:
        config['cluster'] = 'ceph'

    for fs in status.get_filesystems():
        fwd_scrubber = ForwardScrubber(Filesystem(ctx, fscid=fs['id']),
                                       config['scrub_timeout'],
                                       config['sleep_between_iterations'])
        fwd_scrubber.start()
        ctx.ceph[config['cluster']].thrashers.append(fwd_scrubber)

    try:
        log.debug('Yielding')
        yield
    finally:
        log.info('joining ForwardScrubbers')
        stop_all_fwd_scrubbers(ctx.ceph[config['cluster']].thrashers)
        log.info('done joining')
def task(ctx, config):
    """
    Mount/unmount a ``ceph-fuse`` client.

    The config is optional and defaults to mounting on all clients. If
    a config is given, it is expected to be a list of clients to do
    this operation on. This lets you e.g. set up one client with
    ``ceph-fuse`` and another with ``kclient``.

    Example that mounts all clients::

        tasks:
        - ceph:
        - ceph-fuse:
        - interactive:

    Example that uses both ``kclient`` and ``ceph-fuse``::

        tasks:
        - ceph:
        - ceph-fuse: [client.0]
        - kclient: [client.1]
        - interactive:

    Example that enables valgrind::

        tasks:
        - ceph:
        - ceph-fuse:
            client.0:
              valgrind: [--tool=memcheck, --leak-check=full, --show-reachable=yes]
        - interactive:

    Example that stops an already-mounted client:

    ::

        tasks:
        - ceph:
        - ceph-fuse: [client.0]
        - ... do something that requires the FS mounted ...
        - ceph-fuse:
            client.0:
              mounted: false
        - ... do something that requires the FS unmounted ...

    Example that adds more generous wait time for mount (for virtual machines)::

        tasks:
        - ceph:
        - ceph-fuse:
            client.0:
              mount_wait: 60 # default is 0, do not wait before checking /sys/
              mount_timeout: 120 # default is 30, give up if /sys/ is not populated
        - interactive:

    :param ctx: Context
    :param config: Configuration
    """
    log.info('Mounting ceph-fuse clients...')

    testdir = teuthology.get_testdir(ctx)
    config = get_client_configs(ctx, config)

    # List clients we will configure mounts for, default is all clients
    clients = list(teuthology.get_clients(
        ctx=ctx, roles=filter(lambda x: 'client.' in x, config.keys())))

    all_mounts = getattr(ctx, 'mounts', {})
    mounted_by_me = {}

    log.info('Wait for MDS to reach steady state...')
    mds_cluster = MDSCluster(ctx)
    status = mds_cluster.status()
    for filesystem in status.get_filesystems():
        fs = Filesystem(ctx, fscid=filesystem['id'])
        fs.wait_for_daemons()
    log.info('Ready to start ceph-fuse...')

    # Construct any new FuseMount instances
    for id_, remote in clients:
        client_config = config.get("client.%s" % id_)
        if client_config is None:
            client_config = {}

        if id_ not in all_mounts:
            fuse_mount = FuseMount(client_config, testdir, id_, remote)
            all_mounts[id_] = fuse_mount
        else:
            # Catch bad configs where someone has e.g. tried to use ceph-fuse and kcephfs for the same client
            assert isinstance(all_mounts[id_], FuseMount)

        if not config.get("disabled", False) and client_config.get('mounted', True):
            mounted_by_me[id_] = all_mounts[id_]

    ctx.mounts = all_mounts

    # Mount any clients we have been asked to (default to mount all)
    for mount in mounted_by_me.values():
        mount.mount()

    for mount in mounted_by_me.values():
        mount.wait_until_mounted()

    # Umount any pre-existing clients that we have not been asked to mount
    for client_id in set(all_mounts.keys()) - set(mounted_by_me.keys()):
        mount = all_mounts[client_id]
        if mount.is_mounted():
            mount.umount_wait()

    try:
        yield all_mounts
    finally:
        log.info('Unmounting ceph-fuse clients...')
        for mount in mounted_by_me.values():
            # Conditional because an inner context might have umounted it
            if mount.is_mounted():
                mount.umount_wait()