def task(ctx, config):
    """
    Upgrade CephFS file system snap format.
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'snap-upgrade task only accepts a dict for configuration'

    fs = Filesystem(ctx)

    mds_map = fs.get_mds_map()
    assert(mds_map['max_mds'] == 1)

    json = fs.rank_tell(["scrub", "start", "/", "force", "recursive", "repair"])
    if not json or json['return_code'] == 0:
        log.info("scrub / completed")
    else:
        log.info("scrub / failed: {}".format(json))

    json = fs.rank_tell(["scrub", "start", "~mdsdir", "force", "recursive", "repair"])
    if not json or json['return_code'] == 0:
        log.info("scrub ~mdsdir completed")
    else:
        log.info("scrub ~mdsdir failed: {}".format(json))

    for i in range(0, 10):
        mds_map = fs.get_mds_map()
        if (mds_map['flags'] & (1 << 1)) != 0 and (mds_map['flags'] & (1 << 4)) != 0:
            break
        time.sleep(10)

    assert((mds_map['flags'] & (1 << 1)) != 0)  # Test CEPH_MDSMAP_ALLOW_SNAPS
    assert((mds_map['flags'] & (1 << 4)) != 0)  # Test CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS
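# The flag polling above hard-codes the mdsmap bit positions. As a minimal
# sketch (the constant names come from the comments in the task above; the
# numeric values simply mirror its (1 << 1) and (1 << 4) literals), the same
# check can be written with named masks:
CEPH_MDSMAP_ALLOW_SNAPS = 1 << 1
CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS = 1 << 4

def snap_flags_set(mds_map):
    """Return True once both snapshot-related flags are set on the mdsmap."""
    flags = mds_map['flags']
    return (flags & CEPH_MDSMAP_ALLOW_SNAPS) != 0 and \
           (flags & CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS) != 0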
def task(ctx, config):
    fs = Filesystem(ctx)

    # Pick out the clients we will use from the configuration
    # =======================================================
    if len(ctx.mounts) < 2:
        raise RuntimeError("Need at least two clients")
    mount_a = ctx.mounts.values()[0]
    mount_b = ctx.mounts.values()[1]

    # Stash references on ctx so that we can easily debug in interactive mode
    # =======================================================================
    ctx.filesystem = fs
    ctx.mount_a = mount_a
    ctx.mount_b = mount_b

    run_tests(ctx, config, TestClusterFull, {
        'fs': fs,
        'mount_a': mount_a,
        'mount_b': mount_b
    })

    # Continue to any downstream tasks
    # ================================
    yield
def task(ctx, config):
    """
    Prepare MDS cluster for upgrade.

    This task reduces ranks to 1 and stops all standbys.
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'snap-upgrade task only accepts a dict for configuration'

    fs = Filesystem(ctx)
    status = fs.getinfo()

    fs.set_max_mds(1)
    fs.reach_max_mds()

    # Stop standbys now to minimize time rank 0 is down in subsequent
    # tasks:
    #   - ceph.stop: [mds.*]
    rank0 = fs.get_rank(rank=0, status=status)
    for daemon in ctx.daemons.iter_daemons_of_role('mds', fs.mon_manager.cluster):
        if rank0['name'] != daemon.id_:
            daemon.stop()

    for i in range(1, 10):
        time.sleep(5)  # time for FSMap to update
        status = fs.getinfo()
        if len(list(status.get_standbys())) == 0:
            break
    assert(len(list(status.get_standbys())) == 0)
def cephfs_setup(ctx, config):
    testdir = teuthology.get_testdir(ctx)
    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)

    first_mon = teuthology.get_first_mon(ctx, config)
    (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()
    mdss = ctx.cluster.only(teuthology.is_type('mds'))
    # If there are any MDSs, then create a filesystem for them to use
    # Do this last because requires mon cluster to be up and running
    if mdss.remotes:
        log.info('Setting up CephFS filesystem...')

        ceph_fs = Filesystem(ctx)
        if not ceph_fs.legacy_configured():
            ceph_fs.create()

        is_active_mds = lambda role: role.startswith('mds.') and not role.endswith('-s') and role.find('-s-') == -1
        all_roles = [item for remote_roles in mdss.remotes.values() for item in remote_roles]
        num_active = len([r for r in all_roles if is_active_mds(r)])
        mon_remote.run(args=[
            'sudo',
            'adjust-ulimits',
            'ceph-coverage',
            coverage_dir,
            'ceph',
            'mds', 'set_max_mds', str(num_active)])

    yield
def __init__(self, ctx, manager, mds_cluster, config, logger, failure_group, weight):
    super(MDSThrasher, self).__init__()

    self.ctx = ctx
    self.manager = manager
    assert self.manager.is_clean()
    self.mds_cluster = mds_cluster

    self.stopping = Event()
    self.logger = logger
    self.config = config

    self.randomize = bool(self.config.get('randomize', True))
    self.max_thrash_delay = float(self.config.get('thrash_delay', 30.0))
    # note: the default of False coerces to 0.0, i.e. never thrash during replay
    self.thrash_in_replay = float(self.config.get('thrash_in_replay', False))
    assert self.thrash_in_replay >= 0.0 and self.thrash_in_replay <= 1.0, \
        'thrash_in_replay ({v}) must be between [0.0, 1.0]'.format(v=self.thrash_in_replay)
    self.max_replay_thrash_delay = float(self.config.get('max_replay_thrash_delay', 4.0))
    self.max_revive_delay = float(self.config.get('max_revive_delay', 10.0))

    self.failure_group = failure_group
    self.weight = weight

    # TODO support multiple filesystems: will require behavioural change to select
    # which filesystem to act on when doing rank-ish things
    self.fs = Filesystem(self.ctx)
def task(ctx, config):
    fs = Filesystem(ctx)

    # Pick out the clients we will use from the configuration
    # =======================================================
    if len(ctx.mounts) < 2:
        raise RuntimeError("Need at least two clients")
    mount_a = ctx.mounts.values()[0]
    mount_b = ctx.mounts.values()[1]

    if not isinstance(mount_a, FuseMount) or not isinstance(mount_b, FuseMount):
        # kclient kill() power cycles nodes, so requires clients to each be on
        # their own node
        if mount_a.client_remote.hostname == mount_b.client_remote.hostname:
            raise RuntimeError("kclient clients must be on separate nodes")

    # Stash references on ctx so that we can easily debug in interactive mode
    # =======================================================================
    ctx.filesystem = fs
    ctx.mount_a = mount_a
    ctx.mount_b = mount_b

    run_tests(ctx, config, TestClientLimits, {
        'fs': fs,
        'mount_a': mount_a,
        'mount_b': mount_b
    })

    # Continue to any downstream tasks
    # ================================
    yield
def healthy(ctx, config):
    """
    Wait for all osd's to be up, and for the ceph health monitor to return HEALTH_OK.

    :param ctx: Context
    :param config: Configuration
    """
    config = config if isinstance(config, dict) else dict()
    cluster_name = config.get('cluster', 'ceph')
    log.info('Waiting until ceph cluster %s is healthy...', cluster_name)
    firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
    (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
    teuthology.wait_until_osds_up(
        ctx,
        cluster=ctx.cluster,
        remote=mon0_remote,
        ceph_cluster=cluster_name,
    )
    teuthology.wait_until_healthy(
        ctx,
        remote=mon0_remote,
        ceph_cluster=cluster_name,
    )

    if ctx.cluster.only(teuthology.is_type('mds', cluster_name)).remotes:
        # Some MDSs exist, wait for them to be healthy
        ceph_fs = Filesystem(ctx)  # TODO: make Filesystem cluster-aware
        ceph_fs.wait_for_daemons(timeout=300)
def setupfs(self, name=None):
    if name is None and self.fs is not None:
        # Previous mount existed, reuse the old name
        name = self.fs.name
    self.fs = Filesystem(self.ctx, name=name)
    log.info('Wait for MDS to reach steady state...')
    self.fs.wait_for_daemons()
    log.info('Ready to start {}...'.format(type(self).__name__))
def clients_evicted(ctx, config):
    """
    Check clients are evicted, unmount (cleanup) if so.
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'task only accepts a dict for configuration'

    clients = config.get('clients')

    if clients is None:
        clients = {("client." + client_id): True for client_id in ctx.mounts}

    log.info("clients is {}".format(str(clients)))

    fs = Filesystem(ctx)
    status = fs.status()

    has_session = set()
    mounts = {}
    for client in clients:
        client_id = re.match("^client.([0-9]+)$", client).groups(1)[0]
        mounts[client] = ctx.mounts.get(client_id)

    for rank in fs.get_ranks(status=status):
        ls = fs.rank_asok(['session', 'ls'], rank=rank['rank'], status=status)
        for session in ls:
            for client, evicted in six.viewitems(clients):
                mount = mounts.get(client)
                if mount is not None:
                    global_id = mount.get_global_id()
                    if session['id'] == global_id:
                        if evicted:
                            raise RuntimeError("client still has session: {}".format(str(session)))
                        else:
                            log.info("client {} has a session with MDS {}.{}".format(client, fs.id, rank['rank']))
                            has_session.add(client)

    no_session = set(clients) - has_session
    should_assert = False
    for client, evicted in six.viewitems(clients):
        mount = mounts.get(client)
        if mount is not None:
            if evicted:
                log.info("confirming client {} is blacklisted".format(client))
                assert mount.is_blacklisted()
            elif client in no_session:
                log.info("client {} should not be evicted but has no session with an MDS".format(client))
                mount.is_blacklisted()  # for debugging
                should_assert = True
    if should_assert:
        raise RuntimeError("some clients which should not be evicted have no session with an MDS?")
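# A hedged illustration (hypothetical client IDs) of the `clients` mapping the
# task above consumes. With no config, it builds the equivalent of:
clients = {"client.0": True, "client.1": True}  # everyone must be evicted
# An explicit config can instead require that a client kept its session:
clients = {"client.0": True,    # must be evicted (and blacklisted)
           "client.1": False}   # must still hold a session with some MDS rank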
def cephfs_setup(ctx, config):
    testdir = teuthology.get_testdir(ctx)
    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)

    first_mon = teuthology.get_first_mon(ctx, config)
    (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()
    mdss = ctx.cluster.only(teuthology.is_type('mds'))
    # If there are any MDSs, then create a filesystem for them to use
    # Do this last because requires mon cluster to be up and running
    if mdss.remotes:
        log.info('Setting up CephFS filesystem...')

        try:
            proc = mon_remote.run(args=['sudo', 'ceph', '--format=json-pretty', 'osd', 'lspools'],
                                  stdout=StringIO())
            pools = json.loads(proc.stdout.getvalue())
            metadata_pool_exists = 'metadata' in [p['poolname'] for p in pools]
        except CommandFailedError as e:
            # For use in upgrade tests, Ceph cuttlefish and earlier don't support
            # structured output (--format) from the CLI.
            if e.exitstatus == 22:
                metadata_pool_exists = True
            else:
                raise

        # In case we are using an older Ceph which creates FS by default
        if metadata_pool_exists:
            log.info("Metadata pool already exists, skipping")
        else:
            ceph_fs = Filesystem(ctx)
            ceph_fs.create()

        is_active_mds = lambda role: role.startswith('mds.') and not role.endswith('-s') and role.find('-s-') == -1
        all_roles = [item for remote_roles in mdss.remotes.values() for item in remote_roles]
        num_active = len([r for r in all_roles if is_active_mds(r)])
        mon_remote.run(args=[
            'adjust-ulimits',
            'ceph-coverage',
            coverage_dir,
            'ceph',
            'mds', 'set_max_mds', str(num_active)])

    yield
def task(ctx, config):
    """
    Prepare MDS cluster for upgrade.

    This task reduces ranks to 1 and stops all standbys.
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'snap-upgrade task only accepts a dict for configuration'

    fs = Filesystem(ctx)
    fs.getinfo()  # load name
    fs.set_allow_standby_replay(False)
    fs.set_max_mds(1)
    fs.reach_max_mds()
def task(ctx, config):
    fs = Filesystem(ctx)
    mount_a = ctx.mounts.values()[0]

    # Stash references on ctx so that we can easily debug in interactive mode
    # =======================================================================
    ctx.filesystem = fs
    ctx.mount_a = mount_a

    run_tests(ctx, config, TestMDSAutoRepair, {
        'fs': fs,
        'mount_a': mount_a,
    })

    # Continue to any downstream tasks
    # ================================
    yield
def ready(ctx, config):
    """
    Wait until the file system is ready for clients.
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'task only accepts a dict for configuration'

    timeout = config.get('timeout', 300)

    mdsc = MDSCluster(ctx)
    status = mdsc.status()

    for filesystem in status.get_filesystems():
        fs = Filesystem(ctx, fscid=filesystem['id'])
        fs.wait_for_daemons(timeout=timeout, status=status)
def healthy(ctx, config):
    """
    Wait for all osd's to be up, and for the ceph health monitor to return HEALTH_OK.

    :param ctx: Context
    :param config: Configuration
    """
    log.info('Waiting until ceph is healthy...')
    firstmon = teuthology.get_first_mon(ctx, config)
    (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
    teuthology.wait_until_osds_up(
        ctx,
        cluster=ctx.cluster,
        remote=mon0_remote
    )
    teuthology.wait_until_healthy(
        ctx,
        remote=mon0_remote,
    )

    if ctx.cluster.only(teuthology.is_type('mds')).remotes:
        # Some MDSs exist, wait for them to be healthy
        ceph_fs = Filesystem(ctx)
        ceph_fs.wait_for_daemons(timeout=300)
def cephfs_setup(ctx, config):
    cluster_name = config['cluster']
    testdir = teuthology.get_testdir(ctx)
    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)

    first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
    (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()
    mdss = ctx.cluster.only(teuthology.is_type('mds', cluster_name))
    # If there are any MDSs, then create a filesystem for them to use
    # Do this last because requires mon cluster to be up and running
    if mdss.remotes:
        log.info('Setting up CephFS filesystem...')

        ceph_fs = Filesystem(ctx)  # TODO: make Filesystem cluster-aware
        if not ceph_fs.legacy_configured():
            ceph_fs.create()

        is_active_mds = lambda role: 'mds.' in role and not role.endswith('-s') and '-s-' not in role
        all_roles = [item for remote_roles in mdss.remotes.values() for item in remote_roles]
        num_active = len([r for r in all_roles if is_active_mds(r)])

        mon_remote.run(
            args=[
                'sudo',
                'adjust-ulimits',
                'ceph-coverage',
                coverage_dir,
                'ceph', 'mds', 'set', 'allow_multimds', 'true',
                '--yes-i-really-mean-it'],
            check_status=False,  # probably old version, upgrade test
        )
        mon_remote.run(args=[
            'sudo',
            'adjust-ulimits',
            'ceph-coverage',
            coverage_dir,
            'ceph',
            '--cluster', cluster_name,
            'mds', 'set_max_mds', str(num_active)])

    yield
def task(ctx, config):
    """
    Stress test the mds by thrashing while another task/workunit is running.

    Please refer to MDSThrasher class for further information on the
    available options.
    """
    mds_cluster = MDSCluster(ctx)

    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'mds_thrash task only accepts a dict for configuration'
    mdslist = list(teuthology.all_roles_of_type(ctx.cluster, 'mds'))
    assert len(mdslist) > 1, \
        'mds_thrash task requires at least 2 metadata servers'

    # choose random seed
    if 'seed' in config:
        seed = int(config['seed'])
    else:
        seed = int(time.time())
    log.info('mds thrasher using random seed: {seed}'.format(seed=seed))
    random.seed(seed)

    (first,) = ctx.cluster.only('mds.{_id}'.format(_id=mdslist[0])).remotes.keys()
    manager = ceph_manager.CephManager(
        first, ctx=ctx, logger=log.getChild('ceph_manager'),
    )

    # make sure everyone is in active, standby, or standby-replay
    log.info('Wait for all MDSs to reach steady state...')
    status = mds_cluster.status()
    while True:
        steady = True
        for info in status.get_all():
            state = info['state']
            if state not in ('up:active', 'up:standby', 'up:standby-replay'):
                steady = False
                break
        if steady:
            break
        sleep(2)
        status = mds_cluster.status()
    log.info('Ready to start thrashing')

    manager.wait_for_clean()
    assert manager.is_clean()

    if 'cluster' not in config:
        config['cluster'] = 'ceph'

    for fs in status.get_filesystems():
        thrasher = MDSThrasher(ctx, manager, config, Filesystem(ctx, fs['id']),
                               fs['mdsmap']['max_mds'])
        thrasher.start()
        ctx.ceph[config['cluster']].thrashers.append(thrasher)

    try:
        log.debug('Yielding')
        yield
    finally:
        log.info('joining mds_thrasher')
        thrasher.stop()
        if thrasher.exception is not None:
            raise RuntimeError('error during thrashing')
        thrasher.join()
        log.info('done joining')
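# The task above follows the usual teuthology background-thread lifecycle:
# start() before yielding to downstream tasks, then stop()/join() on the way
# out. A minimal sketch of that pattern, assuming only a hypothetical
# `thrasher` object with the same start/stop/exception/join interface:
from contextlib import contextmanager

@contextmanager
def run_in_background(thrasher):
    thrasher.start()                 # begin thrashing in a background thread
    try:
        yield thrasher               # downstream tasks run while thrashing
    finally:
        thrasher.stop()              # set the Event the thrash loop polls
        if thrasher.exception is not None:
            raise RuntimeError('error during thrashing')
        thrasher.join()              # wait for the background thread to exit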
def setupfs(self, name=None):
    self.fs = Filesystem(self.ctx, name=name)
    log.info('Wait for MDS to reach steady state...')
    self.fs.wait_for_daemons()
    log.info('Ready to start {}...'.format(type(self).__name__))
def task(ctx, config):
    """
    Execute CephFS client recovery test suite.

    Requires:
    - An outer ceph_fuse task with at least two clients
    - That the clients are on a separate host to the MDS
    """
    fs = Filesystem(ctx)

    # Pick out the clients we will use from the configuration
    # =======================================================
    if len(ctx.mounts) < 2:
        raise RuntimeError("Need at least two clients")
    mount_a = ctx.mounts.values()[0]
    mount_b = ctx.mounts.values()[1]

    if not isinstance(mount_a, FuseMount) or not isinstance(mount_b, FuseMount):
        # kclient kill() power cycles nodes, so requires clients to each be on
        # their own node
        if mount_a.client_remote.hostname == mount_b.client_remote.hostname:
            raise RuntimeError("kclient clients must be on separate nodes")

    # Check we have at least one remote client for use with network-dependent tests
    # =============================================================================
    if mount_a.client_remote.hostname in fs.get_mds_hostnames():
        raise RuntimeError("Require first client to be on a separate server from MDSs")

    # Stash references on ctx so that we can easily debug in interactive mode
    # =======================================================================
    ctx.filesystem = fs
    ctx.mount_a = mount_a
    ctx.mount_b = mount_b

    run_tests(ctx, config, TestClientRecovery, {
        "mds_reconnect_timeout": int(fs.mds_asok(
            ['config', 'get', 'mds_reconnect_timeout']
        )['mds_reconnect_timeout']),
        "mds_session_timeout": int(fs.mds_asok(
            ['config', 'get', 'mds_session_timeout']
        )['mds_session_timeout']),
        "ms_max_backoff": int(fs.mds_asok(
            ['config', 'get', 'ms_max_backoff']
        )['ms_max_backoff']),
        "fs": fs,
        "mount_a": mount_a,
        "mount_b": mount_b
    })

    # Continue to any downstream tasks
    # ================================
    yield
def build_ceph_cluster(ctx, config):
    """Build a ceph cluster"""

    # Expect to find ceph_admin on the first mon by ID, same place that the download task
    # puts it.  Remember this here, because subsequently IDs will change from those in
    # the test config to those that ceph-deploy invents.
    (ceph_admin,) = ctx.cluster.only(
        teuthology.get_first_mon(ctx, config)).remotes.iterkeys()

    def execute_ceph_deploy(cmd):
        """Remotely execute a ceph_deploy command"""
        return ceph_admin.run(
            args=[
                'cd',
                '{tdir}/ceph-deploy'.format(tdir=testdir),
                run.Raw('&&'),
                run.Raw(cmd),
            ],
            check_status=False,
        ).exitstatus

    try:
        log.info('Building ceph cluster using ceph-deploy...')
        testdir = teuthology.get_testdir(ctx)
        ceph_branch = None
        if config.get('branch') is not None:
            cbranch = config.get('branch')
            for var, val in cbranch.iteritems():
                ceph_branch = '--{var}={val}'.format(var=var, val=val)
        all_nodes = get_all_nodes(ctx, config)
        mds_nodes = get_nodes_using_role(ctx, 'mds')
        mds_nodes = " ".join(mds_nodes)
        mon_node = get_nodes_using_role(ctx, 'mon')
        mon_nodes = " ".join(mon_node)
        new_mon = './ceph-deploy new' + " " + mon_nodes
        mon_hostname = mon_nodes.split(' ')[0]
        mon_hostname = str(mon_hostname)
        gather_keys = './ceph-deploy gatherkeys' + " " + mon_hostname
        deploy_mds = './ceph-deploy mds create' + " " + mds_nodes
        no_of_osds = 0

        if mon_nodes is None:
            raise RuntimeError("no monitor nodes in the config file")

        estatus_new = execute_ceph_deploy(new_mon)
        if estatus_new != 0:
            raise RuntimeError("ceph-deploy: new command failed")

        log.info('adding config inputs...')
        testdir = teuthology.get_testdir(ctx)
        conf_path = '{tdir}/ceph-deploy/ceph.conf'.format(tdir=testdir)

        if config.get('conf') is not None:
            confp = config.get('conf')
            for section, keys in confp.iteritems():
                lines = '[{section}]\n'.format(section=section)
                teuthology.append_lines_to_file(ceph_admin, conf_path, lines,
                                                sudo=True)
                for key, value in keys.iteritems():
                    log.info("[%s] %s = %s" % (section, key, value))
                    lines = '{key} = {value}\n'.format(key=key, value=value)
                    teuthology.append_lines_to_file(
                        ceph_admin, conf_path, lines, sudo=True)

        # install ceph
        dev_branch = ctx.config['branch']
        branch = '--dev={branch}'.format(branch=dev_branch)
        if ceph_branch:
            option = ceph_branch
        else:
            option = branch
        install_nodes = './ceph-deploy install ' + option + " " + all_nodes
        estatus_install = execute_ceph_deploy(install_nodes)
        if estatus_install != 0:
            raise RuntimeError("ceph-deploy: Failed to install ceph")
        # install ceph-test package too
        install_nodes2 = './ceph-deploy install --tests ' + option + \
                         " " + all_nodes
        estatus_install = execute_ceph_deploy(install_nodes2)
        if estatus_install != 0:
            raise RuntimeError("ceph-deploy: Failed to install ceph-test")

        mon_create_nodes = './ceph-deploy mon create-initial'
        # If the following fails, it is OK, it might just be that the monitors
        # are taking way more than a minute/monitor to form quorum, so lets
        # try the next block which will wait up to 15 minutes to gatherkeys.
        execute_ceph_deploy(mon_create_nodes)

        # create-keys is explicit now
        # http://tracker.ceph.com/issues/16036
        mons = ctx.cluster.only(teuthology.is_type('mon'))
        for remote in mons.remotes.iterkeys():
            remote.run(args=['sudo', 'ceph-create-keys', '--cluster', 'ceph',
                             '--id', remote.shortname])

        estatus_gather = execute_ceph_deploy(gather_keys)

        if mds_nodes:
            estatus_mds = execute_ceph_deploy(deploy_mds)
            if estatus_mds != 0:
                raise RuntimeError("ceph-deploy: Failed to deploy mds")

        if config.get('test_mon_destroy') is not None:
            for d in range(1, len(mon_node)):
                mon_destroy_nodes = './ceph-deploy mon destroy' + \
                                    " " + mon_node[d]
                estatus_mon_d = execute_ceph_deploy(mon_destroy_nodes)
                if estatus_mon_d != 0:
                    raise RuntimeError("ceph-deploy: Failed to delete monitor")

        node_dev_list = get_dev_for_osd(ctx, config)
        for d in node_dev_list:
            node = d[0]
            for disk in d[1:]:
                zap = './ceph-deploy disk zap ' + node + ':' + disk
                estatus = execute_ceph_deploy(zap)
                if estatus != 0:
                    raise RuntimeError("ceph-deploy: Failed to zap osds")
            osd_create_cmd = './ceph-deploy osd create '
            if config.get('dmcrypt') is not None:
                osd_create_cmd += '--dmcrypt '
            osd_create_cmd += ":".join(d)
            estatus_osd = execute_ceph_deploy(osd_create_cmd)
            if estatus_osd == 0:
                log.info('successfully created osd')
                no_of_osds += 1
            else:
                raise RuntimeError("ceph-deploy: Failed to create osds")

        if config.get('wait-for-healthy', True) and no_of_osds >= 2:
            is_healthy(ctx=ctx, config=None)

            log.info('Setting up client nodes...')
            conf_path = '/etc/ceph/ceph.conf'
            admin_keyring_path = '/etc/ceph/ceph.client.admin.keyring'
            first_mon = teuthology.get_first_mon(ctx, config)
            (mon0_remote,) = ctx.cluster.only(first_mon).remotes.keys()
            conf_data = teuthology.get_file(
                remote=mon0_remote,
                path=conf_path,
                sudo=True,
            )
            admin_keyring = teuthology.get_file(
                remote=mon0_remote,
                path=admin_keyring_path,
                sudo=True,
            )

            clients = ctx.cluster.only(teuthology.is_type('client'))
            for remot, roles_for_host in clients.remotes.iteritems():
                for id_ in teuthology.roles_of_type(roles_for_host, 'client'):
                    client_keyring = \
                        '/etc/ceph/ceph.client.{id}.keyring'.format(id=id_)
                    mon0_remote.run(
                        args=[
                            'cd',
                            '{tdir}'.format(tdir=testdir),
                            run.Raw('&&'),
                            'sudo', 'bash', '-c',
                            run.Raw('"'), 'ceph',
                            'auth',
                            'get-or-create',
                            'client.{id}'.format(id=id_),
                            'mds', 'allow',
                            'mon', 'allow *',
                            'osd', 'allow *',
                            run.Raw('>'),
                            client_keyring,
                            run.Raw('"'),
                        ],
                    )
                    key_data = teuthology.get_file(
                        remote=mon0_remote,
                        path=client_keyring,
                        sudo=True,
                    )
                    teuthology.sudo_write_file(
                        remote=remot,
                        path=client_keyring,
                        data=key_data,
                        perms='0644'
                    )
                    teuthology.sudo_write_file(
                        remote=remot,
                        path=admin_keyring_path,
                        data=admin_keyring,
                        perms='0644'
                    )
                    teuthology.sudo_write_file(
                        remote=remot,
                        path=conf_path,
                        data=conf_data,
                        perms='0644'
                    )

            if mds_nodes:
                log.info('Configuring CephFS...')
                ceph_fs = Filesystem(ctx)
                if not ceph_fs.legacy_configured():
                    ceph_fs.create()
        elif not config.get('only_mon'):
            raise RuntimeError(
                "The cluster is NOT operational due to insufficient OSDs")
        yield

    except Exception:
        log.info(
            "Error encountered, logging exception before tearing down ceph-deploy")
        log.info(traceback.format_exc())
        raise
    finally:
        if config.get('keep_running'):
            return
        log.info('Stopping ceph...')
        ctx.cluster.run(args=['sudo', 'stop', 'ceph-all', run.Raw('||'),
                              'sudo', 'service', 'ceph', 'stop', run.Raw('||'),
                              'sudo', 'systemctl', 'stop', 'ceph.target'])

        # Are you really not running anymore?
        # try first with the init tooling
        # ignoring the status so this becomes informational only
        ctx.cluster.run(
            args=[
                'sudo', 'status', 'ceph-all', run.Raw('||'),
                'sudo', 'service', 'ceph', 'status', run.Raw('||'),
                'sudo', 'systemctl', 'status', 'ceph.target'],
            check_status=False)

        # and now just check for the processes themselves, as if upstart/sysvinit
        # is lying to us. Ignore errors if the grep fails
        ctx.cluster.run(args=['sudo', 'ps', 'aux', run.Raw('|'),
                              'grep', '-v', 'grep', run.Raw('|'),
                              'grep', 'ceph'], check_status=False)

        if ctx.archive is not None:
            # archive mon data, too
            log.info('Archiving mon data...')
            path = os.path.join(ctx.archive, 'data')
            os.makedirs(path)
            mons = ctx.cluster.only(teuthology.is_type('mon'))
            for remote, roles in mons.remotes.iteritems():
                for role in roles:
                    if role.startswith('mon.'):
                        teuthology.pull_directory_tarball(
                            remote,
                            '/var/lib/ceph/mon',
                            path + '/' + role + '.tgz')

            log.info('Compressing logs...')
            run.wait(
                ctx.cluster.run(
                    args=[
                        'sudo',
                        'find',
                        '/var/log/ceph',
                        '-name',
                        '*.log',
                        '-print0',
                        run.Raw('|'),
                        'sudo',
                        'xargs',
                        '-0',
                        '--no-run-if-empty',
                        '--',
                        'gzip',
                        '--',
                    ],
                    wait=False,
                ),
            )

            log.info('Archiving logs...')
            path = os.path.join(ctx.archive, 'remote')
            os.makedirs(path)
            for remote in ctx.cluster.remotes.iterkeys():
                sub = os.path.join(path, remote.shortname)
                os.makedirs(sub)
                teuthology.pull_directory(remote, '/var/log/ceph',
                                          os.path.join(sub, 'log'))

        # Prevent these from being undefined if the try block fails
        all_nodes = get_all_nodes(ctx, config)
        purge_nodes = './ceph-deploy purge' + " " + all_nodes
        purgedata_nodes = './ceph-deploy purgedata' + " " + all_nodes

        log.info('Purging package...')
        execute_ceph_deploy(purge_nodes)
        log.info('Purging data...')
        execute_ceph_deploy(purgedata_nodes)
def task(ctx, config):
    """
    Mount/unmount a ``ceph-fuse`` client.

    The config is optional and defaults to mounting on all clients. If
    a config is given, it is expected to be a list of clients to do
    this operation on. This lets you e.g. set up one client with
    ``ceph-fuse`` and another with ``kclient``.

    Example that mounts all clients::

        tasks:
        - ceph:
        - ceph-fuse:
        - interactive:

    Example that uses both ``kclient`` and ``ceph-fuse``::

        tasks:
        - ceph:
        - ceph-fuse: [client.0]
        - kclient: [client.1]
        - interactive:

    Example that enables valgrind::

        tasks:
        - ceph:
        - ceph-fuse:
            client.0:
              valgrind: [--tool=memcheck, --leak-check=full, --show-reachable=yes]
        - interactive:

    Example that stops an already-mounted client::

        tasks:
        - ceph:
        - ceph-fuse: [client.0]
        - ... do something that requires the FS mounted ...
        - ceph-fuse:
            client.0:
              mounted: false
        - ... do something that requires the FS unmounted ...

    Example that adds more generous wait time for mount (for virtual machines)::

        tasks:
        - ceph:
        - ceph-fuse:
            client.0:
              mount_wait: 60 # default is 0, do not wait before checking /sys/
              mount_timeout: 120 # default is 30, give up if /sys/ is not populated
        - interactive:

    :param ctx: Context
    :param config: Configuration
    """
    log.info('Mounting ceph-fuse clients...')

    testdir = teuthology.get_testdir(ctx)
    config = get_client_configs(ctx, config)

    # List clients we will configure mounts for, default is all clients
    clients = list(teuthology.get_clients(
        ctx=ctx, roles=filter(lambda x: 'client.' in x, config.keys())))

    all_mounts = getattr(ctx, 'mounts', {})
    mounted_by_me = {}

    log.info('Wait for MDS to reach steady state...')
    mds_cluster = MDSCluster(ctx)
    status = mds_cluster.status()
    for filesystem in status.get_filesystems():
        fs = Filesystem(ctx, fscid=filesystem['id'])
        fs.wait_for_daemons()
    log.info('Ready to start ceph-fuse...')

    # Construct any new FuseMount instances
    for id_, remote in clients:
        client_config = config.get("client.%s" % id_)
        if client_config is None:
            client_config = {}

        if id_ not in all_mounts:
            fuse_mount = FuseMount(client_config, testdir, id_, remote)
            all_mounts[id_] = fuse_mount
        else:
            # Catch bad configs where someone has e.g. tried to use ceph-fuse and kcephfs for the same client
            assert isinstance(all_mounts[id_], FuseMount)

        if not config.get("disabled", False) and client_config.get('mounted', True):
            mounted_by_me[id_] = all_mounts[id_]

    ctx.mounts = all_mounts

    # Mount any clients we have been asked to (default to mount all)
    for mount in mounted_by_me.values():
        mount.mount()

    for mount in mounted_by_me.values():
        mount.wait_until_mounted()

    # Umount any pre-existing clients that we have not been asked to mount
    for client_id in set(all_mounts.keys()) - set(mounted_by_me.keys()):
        mount = all_mounts[client_id]
        if mount.is_mounted():
            mount.umount_wait()

    try:
        yield all_mounts
    finally:
        log.info('Unmounting ceph-fuse clients...')
        for mount in mounted_by_me.values():
            # Conditional because an inner context might have umounted it
            if mount.is_mounted():
                mount.umount_wait()
def build_ceph_cluster(ctx, config):
    """Build a ceph cluster"""

    # Expect to find ceph_admin on the first mon by ID, same place that the download task
    # puts it.  Remember this here, because subsequently IDs will change from those in
    # the test config to those that ceph-deploy invents.
    (ceph_admin,) = ctx.cluster.only('mon.a').remotes.keys()

    def execute_ceph_deploy(cmd):
        """Remotely execute a ceph_deploy command"""
        return ceph_admin.run(
            args=[
                'cd',
                '{tdir}/ceph-deploy'.format(tdir=testdir),
                run.Raw('&&'),
                run.Raw(cmd),
            ],
            check_status=False,
        ).exitstatus

    def ceph_disk_osd_create(ctx, config):
        node_dev_list = get_dev_for_osd(ctx, config)
        no_of_osds = 0
        for d in node_dev_list:
            node = d[0]
            for disk in d[1:]:
                zap = './ceph-deploy disk zap ' + node + ' ' + disk
                estatus = execute_ceph_deploy(zap)
                if estatus != 0:
                    raise RuntimeError("ceph-deploy: Failed to zap osds")
            osd_create_cmd = './ceph-deploy osd create '
            # first check for filestore, default is bluestore with ceph-deploy
            if config.get('filestore') is not None:
                osd_create_cmd += '--filestore '
            elif config.get('bluestore') is not None:
                osd_create_cmd += '--bluestore '
            if config.get('dmcrypt') is not None:
                osd_create_cmd += '--dmcrypt '
            osd_create_cmd += ":".join(d)
            estatus_osd = execute_ceph_deploy(osd_create_cmd)
            if estatus_osd == 0:
                log.info('successfully created osd')
                no_of_osds += 1
            else:
                raise RuntimeError("ceph-deploy: Failed to create osds")
        return no_of_osds

    def ceph_volume_osd_create(ctx, config):
        osds = ctx.cluster.only(teuthology.is_type('osd'))
        no_of_osds = 0
        for remote in osds.remotes.keys():
            # all devs should be lvm
            osd_create_cmd = './ceph-deploy osd create --debug ' + remote.shortname + ' '
            # default is bluestore so we just need config item for filestore
            roles = ctx.cluster.remotes[remote]
            dev_needed = len([role for role in roles
                              if role.startswith('osd')])
            all_devs = teuthology.get_scratch_devices(remote)
            log.info("node={n}, need_devs={d}, available={a}".format(
                n=remote.shortname,
                d=dev_needed,
                a=all_devs,
            ))
            devs = all_devs[0:dev_needed]
            # rest of the devices can be used for journal if required
            jdevs = dev_needed
            for device in devs:
                device_split = device.split('/')
                lv_device = device_split[-2] + '/' + device_split[-1]
                if config.get('filestore') is not None:
                    osd_create_cmd += '--filestore --data ' + lv_device + ' '
                    # filestore with ceph-volume also needs journal disk
                    try:
                        jdevice = all_devs.pop(jdevs)
                    except IndexError:
                        raise RuntimeError("No device available for journal configuration")
                    jdevice_split = jdevice.split('/')
                    j_lv = jdevice_split[-2] + '/' + jdevice_split[-1]
                    osd_create_cmd += '--journal ' + j_lv
                else:
                    osd_create_cmd += ' --data ' + lv_device
                estatus_osd = execute_ceph_deploy(osd_create_cmd)
                if estatus_osd == 0:
                    log.info('successfully created osd')
                    no_of_osds += 1
                else:
                    raise RuntimeError("ceph-deploy: Failed to create osds")
        return no_of_osds

    try:
        log.info('Building ceph cluster using ceph-deploy...')
        testdir = teuthology.get_testdir(ctx)
        ceph_branch = None
        if config.get('branch') is not None:
            cbranch = config.get('branch')
            for var, val in cbranch.items():
                ceph_branch = '--{var}={val}'.format(var=var, val=val)
        all_nodes = get_all_nodes(ctx, config)
        mds_nodes = get_nodes_using_role(ctx, 'mds')
        mds_nodes = " ".join(mds_nodes)
        mon_node = get_nodes_using_role(ctx, 'mon')
        mon_nodes = " ".join(mon_node)
        # skip mgr based on config item
        # this is needed when test uses latest code to install old ceph
        # versions
        skip_mgr = config.get('skip-mgr', False)
        if not skip_mgr:
            mgr_nodes = get_nodes_using_role(ctx, 'mgr')
            mgr_nodes = " ".join(mgr_nodes)
        new_mon = './ceph-deploy new' + " " + mon_nodes
        if not skip_mgr:
            mgr_create = './ceph-deploy mgr create' + " " + mgr_nodes
        mon_hostname = mon_nodes.split(' ')[0]
        mon_hostname = str(mon_hostname)
        gather_keys = './ceph-deploy gatherkeys' + " " + mon_hostname
        deploy_mds = './ceph-deploy mds create' + " " + mds_nodes

        if mon_nodes is None:
            raise RuntimeError("no monitor nodes in the config file")

        estatus_new = execute_ceph_deploy(new_mon)
        if estatus_new != 0:
            raise RuntimeError("ceph-deploy: new command failed")

        log.info('adding config inputs...')
        testdir = teuthology.get_testdir(ctx)
        conf_path = '{tdir}/ceph-deploy/ceph.conf'.format(tdir=testdir)

        if config.get('conf') is not None:
            confp = config.get('conf')
            for section, keys in confp.items():
                lines = '[{section}]\n'.format(section=section)
                teuthology.append_lines_to_file(ceph_admin, conf_path, lines,
                                                sudo=True)
                for key, value in keys.items():
                    log.info("[%s] %s = %s" % (section, key, value))
                    lines = '{key} = {value}\n'.format(key=key, value=value)
                    teuthology.append_lines_to_file(
                        ceph_admin, conf_path, lines, sudo=True)

        # install ceph
        dev_branch = ctx.config['branch']
        branch = '--dev={branch}'.format(branch=dev_branch)
        if ceph_branch:
            option = ceph_branch
        else:
            option = branch
        install_nodes = './ceph-deploy install ' + option + " " + all_nodes
        estatus_install = execute_ceph_deploy(install_nodes)
        if estatus_install != 0:
            raise RuntimeError("ceph-deploy: Failed to install ceph")
        # install ceph-test package too
        install_nodes2 = './ceph-deploy install --tests ' + option + \
                         " " + all_nodes
        estatus_install = execute_ceph_deploy(install_nodes2)
        if estatus_install != 0:
            raise RuntimeError("ceph-deploy: Failed to install ceph-test")

        mon_create_nodes = './ceph-deploy mon create-initial'
        # If the following fails, it is OK, it might just be that the monitors
        # are taking way more than a minute/monitor to form quorum, so lets
        # try the next block which will wait up to 15 minutes to gatherkeys.
        execute_ceph_deploy(mon_create_nodes)

        estatus_gather = execute_ceph_deploy(gather_keys)
        if estatus_gather != 0:
            raise RuntimeError("ceph-deploy: Failed during gather keys")

        # install admin key on mons (ceph-create-keys doesn't do this any more)
        mons = ctx.cluster.only(teuthology.is_type('mon'))
        for remote in mons.remotes.keys():
            execute_ceph_deploy('./ceph-deploy admin ' + remote.shortname)

        # create osd's
        if config.get('use-ceph-volume', False):
            no_of_osds = ceph_volume_osd_create(ctx, config)
        else:
            # this method will only work with ceph-deploy v1.5.39 or older
            no_of_osds = ceph_disk_osd_create(ctx, config)

        if not skip_mgr:
            execute_ceph_deploy(mgr_create)

        if mds_nodes:
            estatus_mds = execute_ceph_deploy(deploy_mds)
            if estatus_mds != 0:
                raise RuntimeError("ceph-deploy: Failed to deploy mds")

        if config.get('test_mon_destroy') is not None:
            for d in range(1, len(mon_node)):
                mon_destroy_nodes = './ceph-deploy mon destroy' + \
                                    " " + mon_node[d]
                estatus_mon_d = execute_ceph_deploy(mon_destroy_nodes)
                if estatus_mon_d != 0:
                    raise RuntimeError("ceph-deploy: Failed to delete monitor")

        if config.get('wait-for-healthy', True) and no_of_osds >= 2:
            is_healthy(ctx=ctx, config=None)

            log.info('Setting up client nodes...')
            conf_path = '/etc/ceph/ceph.conf'
            admin_keyring_path = '/etc/ceph/ceph.client.admin.keyring'
            first_mon = teuthology.get_first_mon(ctx, config)
            (mon0_remote,) = ctx.cluster.only(first_mon).remotes.keys()
            conf_data = teuthology.get_file(
                remote=mon0_remote,
                path=conf_path,
                sudo=True,
            )
            admin_keyring = teuthology.get_file(
                remote=mon0_remote,
                path=admin_keyring_path,
                sudo=True,
            )

            clients = ctx.cluster.only(teuthology.is_type('client'))
            for remot, roles_for_host in clients.remotes.items():
                for id_ in teuthology.roles_of_type(roles_for_host, 'client'):
                    client_keyring = \
                        '/etc/ceph/ceph.client.{id}.keyring'.format(id=id_)
                    mon0_remote.run(
                        args=[
                            'cd',
                            '{tdir}'.format(tdir=testdir),
                            run.Raw('&&'),
                            'sudo', 'bash', '-c',
                            run.Raw('"'), 'ceph',
                            'auth',
                            'get-or-create',
                            'client.{id}'.format(id=id_),
                            'mds', 'allow',
                            'mon', 'allow *',
                            'osd', 'allow *',
                            run.Raw('>'),
                            client_keyring,
                            run.Raw('"'),
                        ],
                    )
                    key_data = teuthology.get_file(
                        remote=mon0_remote,
                        path=client_keyring,
                        sudo=True,
                    )
                    teuthology.sudo_write_file(
                        remote=remot,
                        path=client_keyring,
                        data=key_data,
                        perms='0644'
                    )
                    teuthology.sudo_write_file(
                        remote=remot,
                        path=admin_keyring_path,
                        data=admin_keyring,
                        perms='0644'
                    )
                    teuthology.sudo_write_file(
                        remote=remot,
                        path=conf_path,
                        data=conf_data,
                        perms='0644'
                    )

            if mds_nodes:
                log.info('Configuring CephFS...')
                Filesystem(ctx, create=True)
        elif not config.get('only_mon'):
            raise RuntimeError(
                "The cluster is NOT operational due to insufficient OSDs")

        # create rbd pool
        ceph_admin.run(
            args=[
                'sudo', 'ceph', '--cluster', 'ceph',
                'osd', 'pool', 'create', 'rbd', '128', '128'],
            check_status=False)
        ceph_admin.run(
            args=[
                'sudo', 'ceph', '--cluster', 'ceph',
                'osd', 'pool', 'application', 'enable',
                'rbd', 'rbd', '--yes-i-really-mean-it'
            ],
            check_status=False)
        yield

    except Exception:
        log.info(
            "Error encountered, logging exception before tearing down ceph-deploy")
        log.info(traceback.format_exc())
        raise
    finally:
        if config.get('keep_running'):
            return
        log.info('Stopping ceph...')
        ctx.cluster.run(args=['sudo', 'systemctl', 'stop', 'ceph.target'],
                        check_status=False)
        time.sleep(4)

        # and now just check for the processes themselves, as if upstart/sysvinit
        # is lying to us. Ignore errors if the grep fails
        ctx.cluster.run(args=['sudo', 'ps', 'aux', run.Raw('|'),
                              'grep', '-v', 'grep', run.Raw('|'),
                              'grep', 'ceph'], check_status=False)
        ctx.cluster.run(args=['sudo', 'systemctl', run.Raw('|'),
                              'grep', 'ceph'], check_status=False)

        if ctx.archive is not None:
            # archive mon data, too
            log.info('Archiving mon data...')
            path = os.path.join(ctx.archive, 'data')
            os.makedirs(path)
            mons = ctx.cluster.only(teuthology.is_type('mon'))
            for remote, roles in mons.remotes.items():
                for role in roles:
                    if role.startswith('mon.'):
                        teuthology.pull_directory_tarball(
                            remote,
                            '/var/lib/ceph/mon',
                            path + '/' + role + '.tgz')

            log.info('Compressing logs...')
            run.wait(
                ctx.cluster.run(
                    args=[
                        'sudo',
                        'find',
                        '/var/log/ceph',
                        '-name',
                        '*.log',
                        '-print0',
                        run.Raw('|'),
                        'sudo',
                        'xargs',
                        '-0',
                        '--no-run-if-empty',
                        '--',
                        'gzip',
                        '--',
                    ],
                    wait=False,
                ),
            )

            log.info('Archiving logs...')
            path = os.path.join(ctx.archive, 'remote')
            os.makedirs(path)
            for remote in ctx.cluster.remotes.keys():
                sub = os.path.join(path, remote.shortname)
                os.makedirs(sub)
                teuthology.pull_directory(remote, '/var/log/ceph',
                                          os.path.join(sub, 'log'))

        # Prevent these from being undefined if the try block fails
        all_nodes = get_all_nodes(ctx, config)
        purge_nodes = './ceph-deploy purge' + " " + all_nodes
        purgedata_nodes = './ceph-deploy purgedata' + " " + all_nodes

        log.info('Purging package...')
        execute_ceph_deploy(purge_nodes)
        log.info('Purging data...')
        execute_ceph_deploy(purgedata_nodes)
def task(ctx, config):
    """
    Stress test the mds by running scrub iterations while another task/workunit
    is running.

    Example config:

    - fwd_scrub:
      scrub_timeout: 300
      sleep_between_iterations: 1
    """
    mds_cluster = MDSCluster(ctx)

    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'fwd_scrub task only accepts a dict for configuration'
    mdslist = list(teuthology.all_roles_of_type(ctx.cluster, 'mds'))
    assert len(mdslist) > 0, \
        'fwd_scrub task requires at least 1 metadata server'

    (first,) = ctx.cluster.only(f'mds.{mdslist[0]}').remotes.keys()
    manager = ceph_manager.CephManager(
        first, ctx=ctx, logger=log.getChild('ceph_manager'),
    )

    # make sure everyone is in active, standby, or standby-replay
    log.info('Wait for all MDSs to reach steady state...')
    status = mds_cluster.status()
    while True:
        steady = True
        for info in status.get_all():
            state = info['state']
            if state not in ('up:active', 'up:standby', 'up:standby-replay'):
                steady = False
                break
        if steady:
            break
        sleep(2)
        status = mds_cluster.status()

    log.info('Ready to start scrub thrashing')

    manager.wait_for_clean()
    assert manager.is_clean()

    if 'cluster' not in config:
        config['cluster'] = 'ceph'

    for fs in status.get_filesystems():
        fwd_scrubber = ForwardScrubber(Filesystem(ctx, fscid=fs['id']),
                                       config['scrub_timeout'],
                                       config['sleep_between_iterations'])
        fwd_scrubber.start()
        ctx.ceph[config['cluster']].thrashers.append(fwd_scrubber)

    try:
        log.debug('Yielding')
        yield
    finally:
        log.info('joining ForwardScrubbers')
        stop_all_fwd_scrubbers(ctx.ceph[config['cluster']].thrashers)
        log.info('done joining')
def task(ctx, config):
    """
    Run the CephFS test cases.

    Run everything in tasks/cephfs/test_*.py:

    ::

        tasks:
          - install:
          - ceph:
          - ceph-fuse:
          - cephfs_test_runner:

    `modules` argument allows running only some specific modules:

    ::

        tasks:
            ...
          - cephfs_test_runner:
              modules:
                - tasks.cephfs.test_sessionmap
                - tasks.cephfs.test_auto_repair

    By default, any cases that can't be run on the current cluster configuration
    will generate a failure.  When the optional `fail_on_skip` argument is set
    to false, any tests that can't be run on the current configuration will
    simply be skipped:

    ::

        tasks:
            ...
          - cephfs_test_runner:
              fail_on_skip: false
    """
    ceph_cluster = CephCluster(ctx)

    if len(list(misc.all_roles_of_type(ctx.cluster, 'mds'))):
        mds_cluster = MDSCluster(ctx)
        fs = Filesystem(ctx)
    else:
        mds_cluster = None
        fs = None

    if len(list(misc.all_roles_of_type(ctx.cluster, 'mgr'))):
        mgr_cluster = MgrCluster(ctx)
    else:
        mgr_cluster = None

    # Mount objects, sorted by ID
    if hasattr(ctx, 'mounts'):
        mounts = [v for k, v in sorted(ctx.mounts.items(),
                                       lambda a, b: cmp(a[0], b[0]))]
    else:
        # The test configuration has a filesystem but no fuse/kclient mounts
        mounts = []

    decorating_loader = DecoratingLoader({
        "ctx": ctx,
        "mounts": mounts,
        "fs": fs,
        "ceph_cluster": ceph_cluster,
        "mds_cluster": mds_cluster,
        "mgr_cluster": mgr_cluster,
    })

    fail_on_skip = config.get('fail_on_skip', True)

    # Put useful things onto ctx for interactive debugging
    ctx.fs = fs
    ctx.mds_cluster = mds_cluster
    ctx.mgr_cluster = mgr_cluster

    # Depending on config, either load specific modules, or scan for modules
    if config and 'modules' in config and config['modules']:
        module_suites = []
        for mod_name in config['modules']:
            # Test names like cephfs.test_auto_repair
            module_suites.append(decorating_loader.loadTestsFromName(mod_name))
        overall_suite = suite.TestSuite(module_suites)
    else:
        # Default, run all tests
        overall_suite = decorating_loader.discover(
            os.path.join(os.path.dirname(os.path.abspath(__file__)), "cephfs/")
        )

    if ctx.config.get("interactive-on-error", False):
        InteractiveFailureResult.ctx = ctx
        result_class = InteractiveFailureResult
    else:
        result_class = unittest.TextTestResult

    class LoggingResult(result_class):
        def startTest(self, test):
            log.info("Starting test: {0}".format(self.getDescription(test)))
            return super(LoggingResult, self).startTest(test)

        def addSkip(self, test, reason):
            if fail_on_skip:
                # Don't just call addFailure because that requires a traceback
                self.failures.append((test, reason))
            else:
                super(LoggingResult, self).addSkip(test, reason)

    # Execute!
    result = unittest.TextTestRunner(
        stream=LogStream(),
        resultclass=LoggingResult,
        verbosity=2,
        failfast=True).run(overall_suite)

    if not result.wasSuccessful():
        result.printErrors()  # duplicate output at end for convenience

        bad_tests = []
        for test, error in result.errors:
            bad_tests.append(str(test))
        for test, failure in result.failures:
            bad_tests.append(str(test))

        raise RuntimeError("Test failure: {0}".format(", ".join(bad_tests)))

    yield
def task(ctx, config):
    fs = Filesystem(ctx)

    run_test(ctx, config, fs)