def task(ctx, config):
    """
    Test [deep] repair in several situations:
      Repair [Truncate, Data EIO, MData EIO] on [Primary|Replica]

    The config should be as follows:

      Must include the log-whitelist below
      Must enable filestore_debug_inject_read_err

    config example:

    tasks:
    - chef:
    - install:
    - ceph:
        log-whitelist: ['candidate had a read error',
                        'deep-scrub 0 missing, 1 inconsistent objects',
                        'deep-scrub 0 missing, 4 inconsistent objects',
                        'deep-scrub 1 errors',
                        'deep-scrub 4 errors',
                        '!= known omap_digest',
                        'repair 0 missing, 1 inconsistent objects',
                        'repair 0 missing, 4 inconsistent objects',
                        'repair 1 errors, 1 fixed',
                        'repair 4 errors, 4 fixed',
                        'scrub 0 missing, 1 inconsistent',
                        'scrub 1 errors',
                        'size 1 != known size']
        conf:
          osd:
            filestore debug inject read err: true
    - repair_test:

    :param ctx: Context
    :param config: Configuration (a dict, or None for defaults)
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'repair_test task only accepts a dict for config'

    # Reuse a CephManager left behind by an earlier task if one exists;
    # otherwise build one talking through the first monitor's remote.
    if not hasattr(ctx, 'manager'):
        first_mon = teuthology.get_first_mon(ctx, config)
        (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
        ctx.manager = ceph_manager.CephManager(
            mon,
            ctx=ctx,
            logger=log.getChild('ceph_manager')
        )

    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
    log.info('num_osds is %s' % num_osds)

    # Block until every configured OSD reports up.
    while len(ctx.manager.get_osd_status()['up']) < num_osds:
        time.sleep(10)

    # Each entry corrupts an object (metadata EIO, data EIO, or truncation)
    # on either the primary or a replica, then [deep-]scrubs and repairs.
    tests = [
        gen_repair_test_1(mdataerr(ctx), choose_primary(ctx), "scrub"),
        gen_repair_test_1(mdataerr(ctx), choose_replica(ctx), "scrub"),
        gen_repair_test_1(dataerr(ctx), choose_primary(ctx), "deep-scrub"),
        gen_repair_test_1(dataerr(ctx), choose_replica(ctx), "deep-scrub"),
        gen_repair_test_1(trunc(ctx), choose_primary(ctx), "scrub"),
        gen_repair_test_1(trunc(ctx), choose_replica(ctx), "scrub"),
        gen_repair_test_2(choose_primary(ctx)),
        gen_repair_test_2(choose_replica(ctx))
    ]

    for test in tests:
        run_test(ctx, config, test)
def task(ctx, config):
    """
    Die if {testdir}/err exists or if an OSD dumps core
    """
    if config is None:
        config = {}

    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()

    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
    log.info('num_osds is %s' % num_osds)

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

    # Hold off until every OSD has come up at least once.
    while len(manager.get_osd_status()['up']) < num_osds:
        time.sleep(10)

    testdir = teuthology.get_testdir(ctx)

    def command_succeeds(remote, cmd_args):
        # Run a remote command and report whether it exited 0.
        proc = remote.run(args=cmd_args, wait=True, check_status=False)
        return proc.exitstatus == 0

    # Poll forever; raising an exception is the only way out.
    while True:
        for osd_id in range(num_osds):
            (osd_remote,) = \
                ctx.cluster.only('osd.%d' % osd_id).remotes.iterkeys()

            # An error file left behind in the test dir means failure.
            if command_succeeds(
                    osd_remote,
                    ['test', '-e', '{tdir}/err'.format(tdir=testdir)]):
                log.info("osd %d has an error" % osd_id)
                raise Exception("osd %d error" % osd_id)

            # A trailing "end dump" line in the OSD log means a core dump.
            log_path = '/var/log/ceph/osd.%d.log' % (osd_id)
            if command_succeeds(
                    osd_remote,
                    ['tail', '-1', log_path,
                     run.Raw('|'), 'grep', '-q', 'end dump']):
                log.info("osd %d dumped core" % osd_id)
                raise Exception("osd %d dumped core" % osd_id)

        time.sleep(5)
def task(ctx, config):
    """
    Test [deep] scrub

    tasks:
    - chef:
    - install:
    - ceph:
        log-whitelist:
        - '!= known digest'
        - '!= known omap_digest'
        - deep-scrub 0 missing, 1 inconsistent objects
        - deep-scrub 1 errors
        - repair 0 missing, 1 inconsistent objects
        - repair 1 errors, 1 fixed
    - scrub_test:

    :param ctx: Context
    :param config: Configuration (a dict, or None for defaults)
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), "scrub_test task only accepts a dict for configuration"

    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()

    num_osds = teuthology.num_instances_of_type(ctx.cluster, "osd")
    log.info("num_osds is %s" % num_osds)

    manager = ceph_manager.CephManager(mon, ctx=ctx, logger=log.getChild("ceph_manager"))

    # Wait until every OSD reports up before touching data.
    while len(manager.get_osd_status()["up"]) < num_osds:
        time.sleep(10)

    # Flush PG stats so wait_for_clean sees fresh state.
    for i in range(num_osds):
        manager.raw_cluster_cmd("tell", "osd.%d" % i, "flush_pg_stats")
    manager.wait_for_clean()

    # write some data
    p = manager.do_rados(mon, ["-p", "rbd", "bench", "--no-cleanup", "1",
                               "write", "-b", "4096"])
    log.info("err is %d" % p.exitstatus)

    # wait for some PG to have data that we can mess with
    pg, acting = wait_for_victim_pg(manager)
    osd = acting[0]  # primary OSD of the victim PG

    osd_remote, obj_path, obj_name = find_victim_object(ctx, pg, osd)
    # Attach omap data so the omap scrub/repair paths are exercised too.
    manager.do_rados(mon, ["-p", "rbd", "setomapval", obj_name, "key", "val"])
    # NOTE(review): p still refers to the earlier bench run, so these two
    # log lines repeat its exit status, not the setomap results -- confirm.
    log.info("err is %d" % p.exitstatus)
    manager.do_rados(mon, ["-p", "rbd", "setomapheader", obj_name, "hdr"])
    log.info("err is %d" % p.exitstatus)

    log.info("messing with PG %s on osd %d" % (pg, osd))
    test_repair_corrupted_obj(ctx, manager, pg, osd_remote, obj_path, "rbd")
    test_repair_bad_omap(ctx, manager, pg, osd, obj_name)
    test_list_inconsistent_obj(ctx, manager, osd_remote, pg, acting, osd,
                               obj_name, obj_path)
    log.info("test successful!")
def task(ctx, config):
    """
    Benchmark the recovery system.

    Generates objects with smalliobench, runs it normally to get a
    baseline performance measurement, then marks an OSD out and reruns
    to measure performance during recovery.

    The config should be as follows:

    recovery_bench:
        duration: <seconds for each measurement run>
        num_objects: <number of objects>
        io_size: <io size in bytes>

    example:

    tasks:
    - ceph:
    - recovery_bench:
        duration: 60
        num_objects: 500
        io_size: 4096

    :param ctx: Context
    :param config: Configuration (a dict, or None for defaults)
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'recovery_bench task only accepts a dict for configuration'

    log.info('Beginning recovery bench...')

    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
    while len(manager.get_osd_status()['up']) < num_osds:
        # BUG FIX: was manager.sleep(10) -- CephManager exposes no sleep()
        # method, so this loop raised AttributeError the first time an OSD
        # was slow to come up.  Every sibling task in this file waits with
        # time.sleep(10) here.
        time.sleep(10)

    # Run the bencher in the background for the duration of nested tasks.
    bench_proc = RecoveryBencher(
        manager,
        config,
        )
    try:
        yield
    finally:
        log.info('joining recovery bencher')
        bench_proc.do_join()
def task(ctx, config):
    """
    Run scrub periodically. Randomly chooses an OSD to scrub.

    The config should be as follows:

    scrub:
        frequency: <seconds between scrubs>
        deep: <bool for deepness>

    example:

    tasks:
    - ceph:
    - scrub:
        frequency: 30
        deep: 0
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'scrub task only accepts a dict for configuration'

    log.info('Beginning scrub...')

    # Drive the cluster through the first monitor's remote.
    mon_role = teuthology.get_first_mon(ctx, config)
    (mon_remote,) = ctx.cluster.only(mon_role).remotes.iterkeys()

    cluster_manager = ceph_manager.CephManager(
        mon_remote,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

    # Don't start scrubbing until the full OSD count is up.
    expected_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
    while len(cluster_manager.get_osd_status()['up']) < expected_osds:
        time.sleep(10)

    scrub_proc = Scrubber(cluster_manager, config)
    try:
        yield
    finally:
        # Stop the background scrubber once the nested tasks finish.
        log.info('joining scrub')
        scrub_proc.do_join()
def task(ctx, config):
    """
    Run scrub periodically. Randomly chooses an OSD to scrub.

    The config should be as follows:

    scrub:
        frequency: <seconds between scrubs>
        deep: <bool for deepness>

    example:

    tasks:
    - ceph:
    - scrub:
        frequency: 30
        deep: 0
    """
    # Validate first, then fall back to an empty config.
    assert config is None or isinstance(config, dict), \
        'scrub task only accepts a dict for configuration'
    if config is None:
        config = {}

    log.info('Beginning scrub...')

    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
    manager = ceph_manager.CephManager(mon, ctx=ctx,
                                       logger=log.getChild('ceph_manager'))

    # Wait for the whole set of OSDs to report up.
    osd_count = teuthology.num_instances_of_type(ctx.cluster, 'osd')
    while len(manager.get_osd_status()['up']) < osd_count:
        time.sleep(10)

    scrubber = Scrubber(manager, config)
    try:
        yield
    finally:
        log.info('joining scrub')
        scrubber.do_join()
def cluster(ctx, config):
    """
    Handle the creation and removal of a ceph cluster.

    On startup:
        Create directories needed for the cluster.
        Create remote journals for all osds.
        Create and set keyring.
        Copy the monmap to the test systems.
        Setup mon nodes.
        Setup mds nodes.
        Mkfs osd nodes.
        Add keyring information to monmaps
        Mkfs mon nodes.

    On exit:
        If errors occured, extract a failure message and store in ctx.summary.
        Unmount all test files and temporary journaling files.
        Save the monitor information and archive all ceph logs.
        Cleanup the keyring setup, and remove all monitor map and data files
        left over.

    :param ctx: Context
    :param config: Configuration
    """
    if ctx.config.get('use_existing_cluster', False) is True:
        log.info("'use_existing_cluster' is true; skipping cluster creation")
        yield
        # NOTE(review): there is no `return` after this yield, so when the
        # nested tasks finish, execution falls through to the creation and
        # teardown code below -- confirm this is intended.

    testdir = teuthology.get_testdir(ctx)
    log.info('Creating ceph cluster...')
    # Per-remote scratch data directory for this run.
    run.wait(
        ctx.cluster.run(
            args=[
                'install', '-d', '-m0755', '--',
                '{tdir}/data'.format(tdir=testdir),
                ],
            wait=False,
            )
        )

    # World-writable run dir for daemon admin sockets / pid files.
    run.wait(
        ctx.cluster.run(
            args=[
                'sudo',
                'install', '-d', '-m0777', '--', '/var/run/ceph',
                ],
            wait=False,
            )
        )

    # Work out scratch devices and journal placement for each OSD host.
    devs_to_clean = {}
    remote_to_roles_to_devs = {}
    remote_to_roles_to_journals = {}
    osds = ctx.cluster.only(teuthology.is_type('osd'))
    for remote, roles_for_host in osds.remotes.iteritems():
        devs = teuthology.get_scratch_devices(remote)
        roles_to_devs = {}
        roles_to_journals = {}
        if config.get('fs'):
            # A filesystem was requested: give each OSD its own scratch
            # device, identified by stable wwn id where possible.
            log.info('fs option selected, checking for scratch devs')
            log.info('found devs: %s' % (str(devs),))
            devs_id_map = teuthology.get_wwn_id_map(remote, devs)
            iddevs = devs_id_map.values()
            roles_to_devs = assign_devs(
                teuthology.roles_of_type(roles_for_host, 'osd'), iddevs
            )
            if len(roles_to_devs) < len(iddevs):
                # Leftover devices may be reused as block journals below.
                iddevs = iddevs[len(roles_to_devs):]
            devs_to_clean[remote] = []

        if config.get('block_journal'):
            log.info('block journal enabled')
            roles_to_journals = assign_devs(
                teuthology.roles_of_type(roles_for_host, 'osd'), iddevs
            )
            log.info('journal map: %s', roles_to_journals)

        if config.get('tmpfs_journal'):
            # Journals live on a tmpfs mounted at /mnt, one file per OSD.
            log.info('tmpfs journal enabled')
            roles_to_journals = {}
            remote.run(args=['sudo', 'mount', '-t', 'tmpfs', 'tmpfs', '/mnt'])
            for osd in teuthology.roles_of_type(roles_for_host, 'osd'):
                tmpfs = '/mnt/osd.%s' % osd
                roles_to_journals[osd] = tmpfs
                remote.run(args=['truncate', '-s', '1500M', tmpfs])
            log.info('journal map: %s', roles_to_journals)

        log.info('dev map: %s' % (str(roles_to_devs),))
        remote_to_roles_to_devs[remote] = roles_to_devs
        remote_to_roles_to_journals[remote] = roles_to_journals

    # Build the ceph.conf skeleton from the cluster topology.
    log.info('Generating config...')
    remotes_and_roles = ctx.cluster.remotes.items()
    roles = [role_list for (remote, role_list) in remotes_and_roles]
    ips = [host for (host, port) in
           (remote.ssh.get_transport().getpeername()
            for (remote, role_list) in remotes_and_roles)]
    conf = teuthology.skeleton_config(ctx, roles=roles, ips=ips)
    # Point each osd.N section at its journal, if one was assigned.
    for remote, roles_to_journals in remote_to_roles_to_journals.iteritems():
        for role, journal in roles_to_journals.iteritems():
            key = "osd." + str(role)
            if key not in conf:
                conf[key] = {}
            conf[key]['osd journal'] = journal
    # Overlay user-supplied conf values on top of the skeleton.
    for section, keys in config['conf'].iteritems():
        for key, value in keys.iteritems():
            log.info("[%s] %s = %s" % (section, key, value))
            if section not in conf:
                conf[section] = {}
            conf[section][key] = value

    if config.get('tmpfs_journal'):
        # Direct I/O is not supported on tmpfs-backed journals.
        # NOTE(review): this sets a top-level conf key rather than one
        # under an [osd] section -- verify it is picked up as intended.
        conf['journal dio'] = False

    ctx.ceph = argparse.Namespace()
    ctx.ceph.conf = conf

    keyring_path = config.get('keyring_path', '/etc/ceph/ceph.keyring')

    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)

    firstmon = teuthology.get_first_mon(ctx, config)

    log.info('Setting up %s...' % firstmon)
    # Create the cluster keyring and the mon. key on the first monitor.
    ctx.cluster.only(firstmon).run(
        args=[
            'sudo',
            'adjust-ulimits',
            'ceph-coverage',
            coverage_dir,
            'ceph-authtool',
            '--create-keyring',
            keyring_path,
            ],
        )
    ctx.cluster.only(firstmon).run(
        args=[
            'sudo',
            'adjust-ulimits',
            'ceph-coverage',
            coverage_dir,
            'ceph-authtool',
            '--gen-key',
            '--name=mon.',
            keyring_path,
            ],
        )
    # Keyring must be world-readable so test clients can use it.
    ctx.cluster.only(firstmon).run(
        args=[
            'sudo',
            'chmod',
            '0644',
            keyring_path,
            ],
        )
    (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
    fsid = teuthology.create_simple_monmap(
        ctx,
        remote=mon0_remote,
        conf=conf,
        )
    if not 'global' in conf:
        conf['global'] = {}
    conf['global']['fsid'] = fsid

    log.info('Writing ceph.conf for FSID %s...' % fsid)
    conf_path = config.get('conf_path', DEFAULT_CONF_PATH)
    write_conf(ctx, conf_path)

    # client.admin gets full caps on mon/osd/mds.
    log.info('Creating admin key on %s...' % firstmon)
    ctx.cluster.only(firstmon).run(
        args=[
            'sudo',
            'adjust-ulimits',
            'ceph-coverage',
            coverage_dir,
            'ceph-authtool',
            '--gen-key',
            '--name=client.admin',
            '--set-uid=0',
            '--cap', 'mon', 'allow *',
            '--cap', 'osd', 'allow *',
            '--cap', 'mds', 'allow *',
            keyring_path,
            ],
        )

    log.info('Copying monmap to all nodes...')
    keyring = teuthology.get_file(
        remote=mon0_remote,
        path=keyring_path,
        )
    monmap = teuthology.get_file(
        remote=mon0_remote,
        path='{tdir}/monmap'.format(tdir=testdir),
        )

    for rem in ctx.cluster.remotes.iterkeys():
        # copy mon key and initial monmap
        log.info('Sending monmap to node {remote}'.format(remote=rem))
        teuthology.sudo_write_file(
            remote=rem,
            path=keyring_path,
            data=keyring,
            perms='0644'
            )
        teuthology.write_file(
            remote=rem,
            path='{tdir}/monmap'.format(tdir=testdir),
            data=monmap,
            )

    log.info('Setting up mon nodes...')
    mons = ctx.cluster.only(teuthology.is_type('mon'))
    # Build an initial osdmap sized to the number of OSDs.
    run.wait(
        mons.run(
            args=[
                'adjust-ulimits',
                'ceph-coverage',
                coverage_dir,
                'osdmaptool',
                '-c', conf_path,
                '--clobber',
                '--createsimple', '{num:d}'.format(
                    num=teuthology.num_instances_of_type(ctx.cluster, 'osd'),
                    ),
                '{tdir}/osdmap'.format(tdir=testdir),
                '--pg_bits', '2',
                '--pgp_bits', '4',
                ],
            wait=False,
            ),
        )

    log.info('Setting up mds nodes...')
    mdss = ctx.cluster.only(teuthology.is_type('mds'))
    for remote, roles_for_host in mdss.remotes.iteritems():
        for id_ in teuthology.roles_of_type(roles_for_host, 'mds'):
            # Create the mds data dir and a fresh keyring for each mds.
            remote.run(
                args=[
                    'sudo',
                    'mkdir',
                    '-p',
                    '/var/lib/ceph/mds/ceph-{id}'.format(id=id_),
                    run.Raw('&&'),
                    'sudo',
                    'adjust-ulimits',
                    'ceph-coverage',
                    coverage_dir,
                    'ceph-authtool',
                    '--create-keyring',
                    '--gen-key',
                    '--name=mds.{id}'.format(id=id_),
                    '/var/lib/ceph/mds/ceph-{id}/keyring'.format(id=id_),
                    ],
                )

    cclient.create_keyring(ctx)
    log.info('Running mkfs on osd nodes...')

    # Publish device/journal layout for later tasks (e.g. thrashers).
    ctx.disk_config = argparse.Namespace()
    ctx.disk_config.remote_to_roles_to_dev = remote_to_roles_to_devs
    ctx.disk_config.remote_to_roles_to_journals = remote_to_roles_to_journals
    ctx.disk_config.remote_to_roles_to_dev_mount_options = {}
    ctx.disk_config.remote_to_roles_to_dev_fstype = {}

    log.info("ctx.disk_config.remote_to_roles_to_dev: {r}".format(
        r=str(ctx.disk_config.remote_to_roles_to_dev)))
    for remote, roles_for_host in osds.remotes.iteritems():
        roles_to_devs = remote_to_roles_to_devs[remote]
        roles_to_journals = remote_to_roles_to_journals[remote]

        for id_ in teuthology.roles_of_type(roles_for_host, 'osd'):
            remote.run(
                args=[
                    'sudo',
                    'mkdir',
                    '-p',
                    '/var/lib/ceph/osd/ceph-{id}'.format(id=id_),
                    ])
            log.info(str(roles_to_journals))
            log.info(id_)
            if roles_to_devs.get(id_):
                # This OSD got a dedicated scratch device: mkfs and mount it.
                dev = roles_to_devs[id_]
                fs = config.get('fs')
                package = None
                mkfs_options = config.get('mkfs_options')
                mount_options = config.get('mount_options')
                if fs == 'btrfs':
                    # package = 'btrfs-tools'
                    if mount_options is None:
                        mount_options = ['noatime', 'user_subvol_rm_allowed']
                    if mkfs_options is None:
                        mkfs_options = ['-m', 'single',
                                        '-l', '32768',
                                        '-n', '32768']
                if fs == 'xfs':
                    # package = 'xfsprogs'
                    if mount_options is None:
                        mount_options = ['noatime']
                    if mkfs_options is None:
                        mkfs_options = ['-f', '-i', 'size=2048']
                if fs == 'ext4' or fs == 'ext3':
                    if mount_options is None:
                        mount_options = ['noatime', 'user_xattr']

                if mount_options is None:
                    mount_options = []
                if mkfs_options is None:
                    mkfs_options = []
                mkfs = ['mkfs.%s' % fs] + mkfs_options
                log.info('%s on %s on %s' % (mkfs, dev, remote))
                if package is not None:
                    remote.run(
                        args=[
                            'sudo',
                            'apt-get', 'install', '-y', package
                            ],
                        stdout=StringIO(),
                        )

                try:
                    remote.run(args=['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev])
                except run.CommandFailedError:
                    # Newer btfs-tools doesn't prompt for overwrite, use -f
                    # NOTE(review): the membership test checks mount_options
                    # but the append goes to mkfs_options -- confirm intended.
                    if '-f' not in mount_options:
                        mkfs_options.append('-f')
                        mkfs = ['mkfs.%s' % fs] + mkfs_options
                        log.info('%s on %s on %s' % (mkfs, dev, remote))
                    remote.run(args=['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev])

                log.info('mount %s on %s -o %s' % (dev, remote,
                                                   ','.join(mount_options)))
                remote.run(
                    args=[
                        'sudo',
                        'mount',
                        '-t', fs,
                        '-o', ','.join(mount_options),
                        dev,
                        os.path.join('/var/lib/ceph/osd', 'ceph-{id}'.format(id=id_)),
                        ]
                    )
                # Remember mount options and fstype so the device can be
                # remounted identically later (e.g. after a powercycle).
                if not remote in ctx.disk_config.remote_to_roles_to_dev_mount_options:
                    ctx.disk_config.remote_to_roles_to_dev_mount_options[remote] = {}
                ctx.disk_config.remote_to_roles_to_dev_mount_options[remote][id_] = mount_options
                if not remote in ctx.disk_config.remote_to_roles_to_dev_fstype:
                    ctx.disk_config.remote_to_roles_to_dev_fstype[remote] = {}
                ctx.disk_config.remote_to_roles_to_dev_fstype[remote][id_] = fs
                devs_to_clean[remote].append(
                    os.path.join(
                        os.path.join('/var/lib/ceph/osd', 'ceph-{id}'.format(id=id_)),
                        )
                    )

        for id_ in teuthology.roles_of_type(roles_for_host, 'osd'):
            # Initialize the OSD data dir and generate its auth key.
            remote.run(
                args=[
                    'sudo',
                    'MALLOC_CHECK_=3',
                    'adjust-ulimits',
                    'ceph-coverage',
                    coverage_dir,
                    'ceph-osd',
                    '--mkfs',
                    '--mkkey',
                    '-i', id_,
                    '--monmap', '{tdir}/monmap'.format(tdir=testdir),
                    ],
                )

    log.info('Reading keys from all nodes...')
    keys_fp = StringIO()
    keys = []
    for remote, roles_for_host in ctx.cluster.remotes.iteritems():
        for type_ in ['mds', 'osd']:
            for id_ in teuthology.roles_of_type(roles_for_host, type_):
                data = teuthology.get_file(
                    remote=remote,
                    path='/var/lib/ceph/{type}/ceph-{id}/keyring'.format(
                        type=type_,
                        id=id_,
                        ),
                    sudo=True,
                    )
                keys.append((type_, id_, data))
                keys_fp.write(data)
    for remote, roles_for_host in ctx.cluster.remotes.iteritems():
        for type_ in ['client']:
            for id_ in teuthology.roles_of_type(roles_for_host, type_):
                data = teuthology.get_file(
                    remote=remote,
                    path='/etc/ceph/ceph.client.{id}.keyring'.format(id=id_)
                    )
                keys.append((type_, id_, data))
                keys_fp.write(data)

    log.info('Adding keys to all mons...')
    # Append every collected keyring to each monitor's keyring file...
    writes = mons.run(
        args=[
            'sudo', 'tee', '-a', keyring_path,
            ],
        stdin=run.PIPE,
        wait=False,
        stdout=StringIO(),
        )
    keys_fp.seek(0)
    teuthology.feed_many_stdins_and_close(keys_fp, writes)
    run.wait(writes)
    # ...then register the caps for each entity on every monitor.
    for type_, id_, data in keys:
        run.wait(
            mons.run(
                args=[
                    'sudo',
                    'adjust-ulimits',
                    'ceph-coverage',
                    coverage_dir,
                    'ceph-authtool',
                    keyring_path,
                    '--name={type}.{id}'.format(
                        type=type_,
                        id=id_,
                        ),
                    ] + list(teuthology.generate_caps(type_)),
                wait=False,
                ),
            )

    log.info('Running mkfs on mon nodes...')
    for remote, roles_for_host in mons.remotes.iteritems():
        for id_ in teuthology.roles_of_type(roles_for_host, 'mon'):
            remote.run(
                args=[
                    'sudo',
                    'mkdir',
                    '-p',
                    '/var/lib/ceph/mon/ceph-{id}'.format(id=id_),
                    ],
                )
            remote.run(
                args=[
                    'sudo',
                    'adjust-ulimits',
                    'ceph-coverage',
                    coverage_dir,
                    'ceph-mon',
                    '--mkfs',
                    '-i', id_,
                    '--monmap={tdir}/monmap'.format(tdir=testdir),
                    '--osdmap={tdir}/osdmap'.format(tdir=testdir),
                    '--keyring={kpath}'.format(kpath=keyring_path),
                    ],
                )

    # The maps have been consumed by the mkfs steps; remove the copies.
    run.wait(
        mons.run(
            args=[
                'rm',
                '--',
                '{tdir}/monmap'.format(tdir=testdir),
                '{tdir}/osdmap'.format(tdir=testdir),
                ],
            wait=False,
            ),
        )

    try:
        yield
    except Exception:
        # we need to know this below
        ctx.summary['success'] = False
        raise
    finally:
        (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()

        log.info('Checking cluster log for badness...')

        def first_in_ceph_log(pattern, excludes):
            """
            Find the first occurence of the pattern specified in the Ceph log,
            Returns None if none found.

            :param pattern: Pattern scanned for.
            :param excludes: Patterns to ignore.
            :return: First line of text (or None if not found)
            """
            args = [
                'sudo',
                'egrep', pattern,
                '/var/log/ceph/ceph.log',
                ]
            for exclude in excludes:
                args.extend([run.Raw('|'), 'egrep', '-v', exclude])
            args.extend([
                run.Raw('|'), 'head', '-n', '1',
                ])
            r = mon0_remote.run(
                stdout=StringIO(),
                args=args,
                )
            stdout = r.stdout.getvalue()
            if stdout != '':
                return stdout
            return None

        if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]',
                             config['log_whitelist']) is not None:
            log.warning('Found errors (ERR|WRN|SEC) in cluster log')
            ctx.summary['success'] = False
            # use the most severe problem as the failure reason
            if 'failure_reason' not in ctx.summary:
                for pattern in ['\[SEC\]', '\[ERR\]', '\[WRN\]']:
                    match = first_in_ceph_log(pattern, config['log_whitelist'])
                    if match is not None:
                        ctx.summary['failure_reason'] = \
                            '"{match}" in cluster log'.format(
                                match=match.rstrip('\n'),
                                )
                        break

        # Unmount the scratch devices mounted during setup.
        for remote, dirs in devs_to_clean.iteritems():
            for dir_ in dirs:
                log.info('Unmounting %s on %s' % (dir_, remote))
                try:
                    remote.run(
                        args=[
                            'sync',
                            run.Raw('&&'),
                            'sudo',
                            'umount',
                            '-f',
                            dir_
                            ]
                        )
                except Exception as e:
                    # Dump open files and process tree to aid debugging
                    # the stuck mount, then re-raise.
                    remote.run(args=[
                        'sudo',
                        run.Raw('PATH=/usr/sbin:$PATH'),
                        'lsof',
                        run.Raw(';'),
                        'ps', 'auxf',
                        ])
                    raise e

        if config.get('tmpfs_journal'):
            log.info('tmpfs journal enabled - unmounting tmpfs at /mnt')
            for remote, roles_for_host in osds.remotes.iteritems():
                remote.run(
                    args=['sudo', 'umount', '-f', '/mnt'],
                    check_status=False,
                    )

        if ctx.archive is not None and \
                not (ctx.config.get('archive-on-error') and ctx.summary['success']):
            # archive mon data, too
            log.info('Archiving mon data...')
            path = os.path.join(ctx.archive, 'data')
            os.makedirs(path)
            for remote, roles in mons.remotes.iteritems():
                for role in roles:
                    if role.startswith('mon.'):
                        teuthology.pull_directory_tarball(
                            remote,
                            '/var/lib/ceph/mon',
                            path + '/' + role + '.tgz')

        log.info('Cleaning ceph cluster...')
        run.wait(
            ctx.cluster.run(
                args=[
                    'sudo',
                    'rm',
                    '-rf',
                    '--',
                    conf_path,
                    keyring_path,
                    '{tdir}/data'.format(tdir=testdir),
                    '{tdir}/monmap'.format(tdir=testdir),
                    ],
                wait=False,
                ),
            )
def task(ctx, config):
    """
    Test [deep] scrub

    tasks:
    - chef:
    - install:
    - ceph:
        log-whitelist:
        - '!= known digest'
        - '!= known omap_digest'
        - deep-scrub 0 missing, 1 inconsistent objects
        - deep-scrub 1 errors
        - repair 0 missing, 1 inconsistent objects
        - repair 1 errors, 1 fixed
    - scrub_test:

    :param ctx: Context
    :param config: Configuration (a dict, or None for defaults)
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'scrub_test task only accepts a dict for configuration'
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()

    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
    log.info('num_osds is %s' % num_osds)

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

    # Wait until every OSD reports up before writing any data.
    while len(manager.get_osd_status()['up']) < num_osds:
        time.sleep(10)

    # Flush PG stats so wait_for_clean sees fresh state.
    for i in range(num_osds):
        manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'flush_pg_stats')
    manager.wait_for_clean()

    # write some data
    p = manager.do_rados(mon, ['-p', 'rbd', 'bench', '--no-cleanup', '1',
                               'write', '-b', '4096'])
    log.info('err is %d' % p.exitstatus)

    # wait for some PG to have data that we can mess with
    pg, acting = wait_for_victim_pg(manager)
    osd = acting[0]  # primary OSD of the victim PG

    osd_remote, obj_path, obj_name = find_victim_object(ctx, pg, osd)
    # Attach omap data so the omap scrub/repair paths are exercised too.
    manager.do_rados(mon, ['-p', 'rbd', 'setomapval', obj_name, 'key', 'val'])
    # NOTE(review): p still refers to the earlier bench run, so these two
    # log lines repeat its exit status, not the setomap results -- confirm.
    log.info('err is %d' % p.exitstatus)
    manager.do_rados(mon, ['-p', 'rbd', 'setomapheader', obj_name, 'hdr'])
    log.info('err is %d' % p.exitstatus)

    log.info('messing with PG %s on osd %d' % (pg, osd))
    test_repair_corrupted_obj(ctx, manager, pg, osd_remote, obj_path)
    test_repair_bad_omap(ctx, manager, pg, osd, obj_name)
    test_list_inconsistent_obj(ctx, manager, osd_remote, pg, acting, osd,
                               obj_name, obj_path)
    log.info('test successful!')
def task(ctx, config):
    """
    Test the dump_stuck command.

    Walks a 2-OSD cluster through clean, remapped, unclean and stale
    states, checking the reported stuck-PG counts at each step.

    :param ctx: Context
    :param config: Configuration (must be None)
    """
    assert config is None, \
        'dump_stuck requires no configuration'
    assert teuthology.num_instances_of_type(ctx.cluster, 'osd') == 2, \
        'dump_stuck requires exactly 2 osds'

    timeout = 60
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon, ) = ctx.cluster.only(first_mon).remotes.iterkeys()

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

    manager.flush_pg_stats([0, 1])
    manager.wait_for_clean(timeout)

    # Lower the stuck threshold so PGs report as stuck quickly.
    manager.raw_cluster_cmd('tell', 'mon.0', 'injectargs', '--',
#                            '--mon-osd-report-timeout 90',
                            '--mon-pg-stuck-threshold 10')

    # all active+clean
    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=0,
        num_stale=0,
        )
    num_pgs = manager.get_num_pgs()

    manager.mark_out_osd(0)
    time.sleep(timeout)
    manager.flush_pg_stats([1])
    manager.wait_for_recovery(timeout)

    # all active+clean+remapped
    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=0,
        num_stale=0,
        )

    manager.mark_in_osd(0)
    manager.flush_pg_stats([0, 1])
    manager.wait_for_clean(timeout)

    # all active+clean
    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=0,
        num_stale=0,
        )

    log.info('stopping first osd')
    manager.kill_osd(0)
    manager.mark_down_osd(0)

    log.info('waiting for all to be unclean')
    starttime = time.time()
    done = False
    while not done:
        try:
            # With one OSD down, every PG should be stuck unclean.
            check_stuck(
                manager,
                num_inactive=0,
                num_unclean=num_pgs,
                num_stale=0,
                )
            done = True
        except AssertionError:
            # wait up to 15 minutes to become stale
            if time.time() - starttime > 900:
                raise

    log.info('stopping second osd')
    manager.kill_osd(1)
    manager.mark_down_osd(1)

    log.info('waiting for all to be stale')
    starttime = time.time()
    done = False
    while not done:
        try:
            # With both OSDs down, every PG should also be stuck stale.
            check_stuck(
                manager,
                num_inactive=0,
                num_unclean=num_pgs,
                num_stale=num_pgs,
                )
            done = True
        except AssertionError:
            # wait up to 15 minutes to become stale
            if time.time() - starttime > 900:
                raise

    log.info('reviving')
    for id_ in teuthology.all_roles_of_type(ctx.cluster, 'osd'):
        manager.revive_osd(id_)
        manager.mark_in_osd(id_)
    # Revived OSDs may not answer immediately; retry until they do.
    while True:
        try:
            manager.flush_pg_stats([0, 1])
            break
        except Exception:
            log.exception('osds must not be started yet, waiting...')
            time.sleep(1)
    manager.wait_for_clean(timeout)

    # Back to all active+clean.
    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=0,
        num_stale=0,
        )
def task(ctx, config):
    """
    Execute a radosbench parameter sweep

    Puts radosbench in a loop, taking values from the given config at
    each iteration. If given, the min and max values below create a
    range, e.g. min_replicas=1 and max_replicas=3 implies executing
    with 1-3 replicas.

    Parameters:

        clients: [client list]
        time: seconds to run (default=120)
        sizes: [list of object sizes] (default=[4M])
        mode: <write|read|seq> (default=write)
        repetitions: execute the same configuration multiple times (default=1)
        min_num_replicas: minimum number of replicas to use (default = 3)
        max_num_replicas: maximum number of replicas to use (default = 3)
        min_num_osds: the minimum number of OSDs in a pool (default=all)
        max_num_osds: the maximum number of OSDs in a pool (default=all)
        file: name of CSV-formatted output file (default='radosbench.csv')
        columns: columns to include (default=all)
          - rep: execution number (takes values from 'repetitions')
          - num_osd: number of osds for pool
          - num_replica: number of replicas
          - avg_throughput: throughput
          - avg_latency: latency
          - stdev_throughput:
          - stdev_latency:

    Example:
    - radosbenchsweep:
        columns: [rep, num_osd, num_replica, avg_throughput, stdev_throughput]
    """
    log.info('Beginning radosbenchsweep...')
    assert isinstance(config, dict), 'expecting dictionary for configuration'

    # get and validate config values
    # {

    # only one client supported for now
    if len(config.get('clients', [])) != 1:
        raise Exception("Only one client can be specified")

    # only write mode
    if config.get('mode', 'write') != 'write':
        raise Exception("Only 'write' mode supported for now.")

    # OSDs
    total_osds_in_cluster = teuthology.num_instances_of_type(ctx.cluster, 'osd')
    min_num_osds = config.get('min_num_osds', total_osds_in_cluster)
    max_num_osds = config.get('max_num_osds', total_osds_in_cluster)

    if max_num_osds > total_osds_in_cluster:
        raise Exception('max_num_osds cannot be greater than total in cluster')
    if min_num_osds < 1:
        raise Exception('min_num_osds cannot be less than 1')
    if min_num_osds > max_num_osds:
        raise Exception('min_num_osds cannot be greater than max_num_osd')
    # Number of OSDs taken out of the cluster at each sweep step.
    osds = range(0, (total_osds_in_cluster + 1))

    # replicas
    min_num_replicas = config.get('min_num_replicas', 3)
    max_num_replicas = config.get('max_num_replicas', 3)

    if min_num_replicas < 1:
        raise Exception('min_num_replicas cannot be less than 1')
    if min_num_replicas > max_num_replicas:
        raise Exception('min_num_replicas cannot be greater than max_replicas')
    if max_num_replicas > max_num_osds:
        raise Exception('max_num_replicas cannot be greater than max_num_osds')
    replicas = range(min_num_replicas, (max_num_replicas + 1))

    # object size
    # FIX: the documented key is 'sizes'; previously only the undocumented
    # 'size' key was read.  Honor 'sizes' first and keep 'size' as a
    # backward-compatible fallback.
    sizes = config.get('sizes', config.get('size', [4 << 20]))

    # repetitions
    reps = range(config.get('repetitions', 1))

    # file
    fname = config.get('file', 'radosbench.csv')
    f = open('{}/{}'.format(ctx.archive, fname), 'w')
    f.write(get_csv_header(config) + '\n')
    # }

    # set default pools size=1 to avoid 'unhealthy' issues
    ctx.manager.set_pool_property('data', 'size', 1)
    ctx.manager.set_pool_property('metadata', 'size', 1)
    ctx.manager.set_pool_property('rbd', 'size', 1)

    current_osds_out = 0

    # sweep through all parameters
    for osds_out, size, replica, rep in product(osds, sizes, replicas, reps):

        osds_in = total_osds_in_cluster - osds_out

        if osds_in == 0:
            # we're done
            break

        if current_osds_out != osds_out:
            # take an osd out
            ctx.manager.raw_cluster_cmd(
                'osd', 'reweight', str(osds_out - 1), '0.0')
            wait_until_healthy(ctx, config)
            current_osds_out = osds_out

        if osds_in not in range(min_num_osds, (max_num_osds + 1)):
            # no need to execute with a number of osds that wasn't requested
            continue

        if osds_in < replica:
            # cannot execute with more replicas than available osds
            continue

        run_radosbench(ctx, config, f, osds_in, size, replica, rep)

    f.close()

    yield
def task(ctx, config):
    """
    Test backfill

    :param ctx: Context
    :param config: Configuration (a dict, or None for defaults)
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'thrashosds task only accepts a dict for configuration'
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()

    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
    log.info('num_osds is %s' % num_osds)
    # The scenario below is written for exactly three OSDs.
    assert num_osds == 3

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

    # Wait for all three OSDs to come up, then settle.
    while len(manager.get_osd_status()['up']) < 3:
        time.sleep(10)
    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
    manager.wait_for_clean()

    # write some data
    p = rados_start(ctx, mon, ['-p', 'rbd', 'bench', '15', 'write', '-b', '4096',
                               '--no-cleanup'])
    err = p.wait()
    log.info('err is %d' % err)

    # mark osd.0 out to trigger a rebalance/backfill
    manager.mark_out_osd(0)

    # also mark it down so it won't be included in pg_temps
    manager.kill_osd(0)
    manager.mark_down_osd(0)

    # wait for everything to peer and be happy...
    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
    manager.wait_for_recovery()

    # write some new data
    p = rados_start(ctx, mon, ['-p', 'rbd', 'bench', '30', 'write', '-b', '4096',
                               '--no-cleanup'])

    time.sleep(15)

    # blackhole + restart osd.1
    # this triggers a divergent backfill target
    manager.blackhole_kill_osd(1)
    time.sleep(2)
    manager.revive_osd(1)

    # wait for our writes to complete + succeed
    err = p.wait()
    log.info('err is %d' % err)

    # cluster must recover
    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
    manager.wait_for_recovery()

    # re-add osd.0
    manager.revive_osd(0)

    # cluster must be clean once osd.0 has been backfilled
    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
    manager.wait_for_clean()
def task(ctx, config):
    """
    Test (non-backfill) recovery

    :param ctx: Context
    :param config: Configuration (a dict, or None for defaults)
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'task only accepts a dict for configuration'
    testdir = teuthology.get_testdir(ctx)
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()

    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
    log.info('num_osds is %s' % num_osds)
    # The scenario below is written for exactly three OSDs.
    assert num_osds == 3

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

    # Wait for all three OSDs to come up, then settle.
    while len(manager.get_osd_status()['up']) < 3:
        time.sleep(10)
    manager.flush_pg_stats([0, 1, 2])
    manager.wait_for_clean()

    # test some osdmap flags
    manager.raw_cluster_cmd('osd', 'set', 'noin')
    manager.raw_cluster_cmd('osd', 'set', 'noout')
    manager.raw_cluster_cmd('osd', 'set', 'noup')
    manager.raw_cluster_cmd('osd', 'set', 'nodown')
    manager.raw_cluster_cmd('osd', 'unset', 'noin')
    manager.raw_cluster_cmd('osd', 'unset', 'noout')
    manager.raw_cluster_cmd('osd', 'unset', 'noup')
    manager.raw_cluster_cmd('osd', 'unset', 'nodown')

    # write some new data
    p = rados_start(testdir, mon, ['-p', 'rbd', 'bench', '20', 'write', '-b', '4096',
                                   '--no-cleanup'])

    time.sleep(15)

    # trigger a divergent target:
    #   blackhole + restart osd.1 (shorter log)
    manager.blackhole_kill_osd(1)
    #   kill osd.2 (longer log... we'll make it divergent below)
    manager.kill_osd(2)
    time.sleep(2)
    manager.revive_osd(1)

    # wait for our writes to complete + succeed
    err = p.wait()
    log.info('err is %d' % err)

    # cluster must repeer
    manager.flush_pg_stats([0, 1])
    manager.wait_for_active_or_down()

    # write some more (make sure osd.2 really is divergent)
    p = rados_start(testdir, mon, ['-p', 'rbd', 'bench', '15', 'write', '-b', '4096'])
    p.wait()

    # revive divergent osd
    manager.revive_osd(2)

    while len(manager.get_osd_status()['up']) < 3:
        log.info('waiting a bit...')
        time.sleep(2)
    log.info('3 are up!')

    # cluster must recover
    manager.flush_pg_stats([0, 1, 2])
    manager.wait_for_clean()
def task(ctx, config):
    """
    Test [deep] scrub

    tasks:
    - chef:
    - install:
    - ceph:
        log-whitelist:
        - '!= data_digest'
        - '!= omap_digest'
        - '!= size'
        - deep-scrub 0 missing, 1 inconsistent objects
        - deep-scrub [0-9]+ errors
        - repair 0 missing, 1 inconsistent objects
        - repair [0-9]+ errors, [0-9]+ fixed
        - shard [0-9]+ .* : missing
        - deep-scrub 1 missing, 1 inconsistent objects
        - does not match object info size
        - attr name mistmatch
        - deep-scrub 1 missing, 0 inconsistent objects
        - failed to pick suitable auth object
        - candidate size [0-9]+ info size [0-9]+ mismatch
        conf:
          osd:
            osd deep scrub update digest min age: 0
    - scrub_test:
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'scrub_test task only accepts a dict for configuration'
    first_mon = teuthology.get_first_mon(ctx, config)
    # exactly one remote hosts the first mon
    (mon,) = ctx.cluster.only(first_mon).remotes.keys()

    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
    log.info('num_osds is %s' % num_osds)

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

    # wait for every osd to come up before starting
    while len(manager.get_osd_status()['up']) < num_osds:
        time.sleep(10)

    # mount the objectstore via FUSE on each osd so the sub-tests can
    # corrupt on-disk objects directly
    for i in range(num_osds):
        manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'injectargs', '--',
                                '--osd-objectstore-fuse')
    manager.flush_pg_stats(range(num_osds))
    manager.wait_for_clean()

    # write some data
    p = manager.do_rados(mon, ['-p', 'rbd', 'bench', '--no-cleanup', '1',
                               'write', '-b', '4096'])
    log.info('err is %d' % p.exitstatus)

    # wait for some PG to have data that we can mess with
    pg, acting = wait_for_victim_pg(manager)
    # corrupt the primary's copy
    osd = acting[0]

    osd_remote, obj_path, obj_name = find_victim_object(ctx, pg, osd)
    # give the victim object some omap data so omap tests have something
    # to corrupt
    manager.do_rados(mon, ['-p', 'rbd', 'setomapval', obj_name, 'key', 'val'])
    # NOTE(review): this logs the exitstatus of the earlier bench run
    # (the do_rados return value above is discarded) — verify intent
    log.info('err is %d' % p.exitstatus)
    manager.do_rados(mon, ['-p', 'rbd', 'setomapheader', obj_name, 'hdr'])
    log.info('err is %d' % p.exitstatus)

    # Update missing digests, requires "osd deep scrub update digest min age: 0"
    pgnum = get_pgnum(pg)
    manager.do_pg_scrub('rbd', pgnum, 'deep-scrub')

    log.info('messing with PG %s on osd %d' % (pg, osd))
    # run the individual corruption/repair sub-tests
    test_repair_corrupted_obj(ctx, manager, pg, osd_remote, obj_path, 'rbd')
    test_repair_bad_omap(ctx, manager, pg, osd, obj_name)
    test_list_inconsistent_obj(ctx, manager, osd_remote, pg, acting, osd,
                               obj_name, obj_path)
    log.info('test successful!')

    # shut down fuse mount
    for i in range(num_osds):
        manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'injectargs', '--',
                                '--no-osd-objectstore-fuse')
    time.sleep(5)

    log.info('done')
def _wait_for_scrub_result(manager, victim, want_inconsistent):
    """
    Poll pg stats until scrubbing of *victim* finishes, then assert that
    the presence of the '+inconsistent' state flag matches
    *want_inconsistent*.
    """
    while True:
        stats = manager.get_single_pg_stats(victim)
        state = stats['state']
        # wait for the scrub to finish
        if 'scrubbing' in state:
            time.sleep(3)
            continue
        inconsistent = state.find('+inconsistent') != -1
        assert inconsistent == want_inconsistent
        break


def task(ctx, config):
    """
    Test [deep] scrub
    tasks:
    - chef:
    - install:
    - ceph:
        log-whitelist:
        - '!= known digest'
        - '!= known omap_digest'
        - deep-scrub 0 missing, 1 inconsistent objects
        - deep-scrub 1 errors
        - repair 0 missing, 1 inconsistent objects
        - repair 1 errors, 1 fixed
    - scrub_test:

    :param ctx: Context
    :param config: Configuration (must be a dict; may be empty)
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'scrub_test task only accepts a dict for configuration'
    first_mon = teuthology.get_first_mon(ctx, config)
    # exactly one remote hosts the first mon
    (mon,) = ctx.cluster.only(first_mon).remotes.keys()

    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
    log.info('num_osds is %s' % num_osds)

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

    # wait for every osd to come up before starting
    while len(manager.get_osd_status()['up']) < num_osds:
        time.sleep(10)

    for i in range(num_osds):
        manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'flush_pg_stats')
    manager.wait_for_clean()

    # write some data
    p = manager.do_rados(mon, ['-p', 'rbd', 'bench', '--no-cleanup', '1',
                               'write', '-b', '4096'])
    err = p.exitstatus
    log.info('err is %d' % err)

    # wait for some PG to have data that we can mess with
    victim = None
    osd = None
    while victim is None:
        stats = manager.get_pg_stats()
        for pg in stats:
            size = pg['stat_sum']['num_bytes']
            if size > 0:
                victim = pg['pgid']
                osd = pg['acting'][0]
                break
        if victim is None:
            time.sleep(3)

    log.info('messing with PG %s on osd %d' % (victim, osd))

    (osd_remote,) = ctx.cluster.only('osd.%d' % osd).remotes.keys()
    data_path = os.path.join(
        '/var/lib/ceph/osd',
        'ceph-{id}'.format(id=osd),
        'current',
        '{pg}_head'.format(pg=victim)
        )

    # fuzz time
    ls_fp = StringIO()
    osd_remote.run(
        args=['sudo', 'ls', data_path],
        stdout=ls_fp,
    )
    ls_out = ls_fp.getvalue()
    ls_fp.close()

    # find an object file we can mess with
    osdfilename = None
    for line in ls_out.split('\n'):
        if 'object' in line:
            osdfilename = line
            break
    assert osdfilename is not None

    # Get actual object name from osd stored filename
    tmp = osdfilename.split('__')
    objname = tmp[0]
    # the filestore escapes '_' in object names; undo that.  Use a raw
    # string: the old '\u' spelling is a SyntaxError on python 3 (same
    # value as r'\u' on python 2).
    objname = objname.replace(r'\u', '_')
    log.info('fuzzing %s' % objname)

    # put a single \0 at the beginning of the file
    osd_remote.run(
        args=['sudo', 'dd',
              'if=/dev/zero',
              'of=%s' % os.path.join(data_path, osdfilename),
              'bs=1', 'count=1', 'conv=notrunc'
              ]
    )

    # scrub, verify inconsistent
    manager.raw_cluster_cmd('pg', 'deep-scrub', victim)
    # Give deep-scrub a chance to start
    time.sleep(60)
    _wait_for_scrub_result(manager, victim, True)

    # repair, verify no longer inconsistent
    manager.raw_cluster_cmd('pg', 'repair', victim)
    # Give repair a chance to start
    time.sleep(60)
    _wait_for_scrub_result(manager, victim, False)

    # Test deep-scrub with various omap modifications
    manager.do_rados(mon, ['-p', 'rbd', 'setomapval', objname, 'key', 'val'])
    manager.do_rados(mon, ['-p', 'rbd', 'setomapheader', objname, 'hdr'])

    # Modify omap on specific osd
    log.info('fuzzing omap of %s' % objname)
    manager.osd_admin_socket(osd, ['rmomapkey', 'rbd', objname, 'key'])
    manager.osd_admin_socket(osd,
                             ['setomapval', 'rbd', objname, 'badkey', 'badval'])
    manager.osd_admin_socket(osd, ['setomapheader', 'rbd', objname, 'badhdr'])

    # scrub, verify inconsistent
    manager.raw_cluster_cmd('pg', 'deep-scrub', victim)
    # Give deep-scrub a chance to start
    time.sleep(60)
    _wait_for_scrub_result(manager, victim, True)

    # repair, verify no longer inconsistent
    manager.raw_cluster_cmd('pg', 'repair', victim)
    # Give repair a chance to start
    time.sleep(60)
    _wait_for_scrub_result(manager, victim, False)

    log.info('test successful!')
def task(ctx, config):
    """
    Test the dump_stuck command.

    The ceph configuration should include::

        mon_osd_report_timeout = 90
        mon_pg_stuck_threshold = 10

    Walks the cluster through clean, unclean (one osd out) and stale
    (all osds down) states, checking dump_stuck counts at each step.

    :param ctx: Context
    :param config: Must be None (task takes no configuration)
    :raises AssertionError: if config is given or there are not
                            exactly 2 osds
    """
    assert config is None, \
        'dump_stuck requires no configuration'
    assert teuthology.num_instances_of_type(ctx.cluster, 'osd') == 2, \
        'dump_stuck requires exactly 2 osds'

    timeout = 60
    first_mon = teuthology.get_first_mon(ctx, config)
    # exactly one remote hosts the first mon
    (mon,) = ctx.cluster.only(first_mon).remotes.keys()

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.wait_for_clean(timeout)

    # baseline: nothing is stuck
    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=0,
        num_stale=0,
        )
    num_pgs = manager.get_num_pgs()

    # with osd.0 out past the stuck threshold, every pg is unclean
    manager.mark_out_osd(0)
    time.sleep(timeout)
    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.wait_for_recovery(timeout)

    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=num_pgs,
        num_stale=0,
        )

    # bring it back in; everything is clean again
    manager.mark_in_osd(0)
    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.wait_for_clean(timeout)

    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=0,
        num_stale=0,
        )

    # kill every osd; all pgs should eventually report stale
    for id_ in teuthology.all_roles_of_type(ctx.cluster, 'osd'):
        manager.kill_osd(id_)
        manager.mark_down_osd(id_)

    starttime = time.time()
    done = False
    while not done:
        try:
            check_stuck(
                manager,
                num_inactive=0,
                num_unclean=0,
                num_stale=num_pgs,
                )
            done = True
        except AssertionError:
            # wait up to 15 minutes to become stale
            if time.time() - starttime > 900:
                raise

    # revive everything and verify the cluster returns to clean
    for id_ in teuthology.all_roles_of_type(ctx.cluster, 'osd'):
        manager.revive_osd(id_)
        manager.mark_in_osd(id_)
    time.sleep(timeout)
    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.wait_for_clean(timeout)

    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=0,
        num_stale=0,
        )
def task(ctx, config):
    """
    Test [deep] scrub
    tasks:
    - chef:
    - install:
    - ceph:
        log-whitelist:
        - '!= known digest'
        - '!= known omap_digest'
        - deep-scrub 0 missing, 1 inconsistent objects
        - deep-scrub 1 errors
        - repair 0 missing, 1 inconsistent objects
        - repair 1 errors, 1 fixed
    - scrub_test:

    :param ctx: Context
    :param config: Configuration (must be a dict; may be empty)
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'scrub_test task only accepts a dict for configuration'
    first_mon = teuthology.get_first_mon(ctx, config)
    # exactly one remote hosts the first mon
    (mon, ) = ctx.cluster.only(first_mon).remotes.keys()

    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
    log.info('num_osds is %s' % num_osds)

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

    # wait for every osd to come up before starting
    while len(manager.get_osd_status()['up']) < num_osds:
        time.sleep(10)

    for i in range(num_osds):
        manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'flush_pg_stats')
    manager.wait_for_clean()

    # write some data
    p = manager.do_rados(
        mon,
        ['-p', 'rbd', 'bench', '--no-cleanup', '1', 'write', '-b', '4096'])
    err = p.exitstatus
    log.info('err is %d' % err)

    # wait for some PG to have data that we can mess with
    victim = None
    osd = None
    while victim is None:
        stats = manager.get_pg_stats()
        for pg in stats:
            size = pg['stat_sum']['num_bytes']
            if size > 0:
                victim = pg['pgid']
                osd = pg['acting'][0]
                break

        if victim is None:
            time.sleep(3)

    log.info('messing with PG %s on osd %d' % (victim, osd))

    (osd_remote, ) = ctx.cluster.only('osd.%d' % osd).remotes.keys()
    data_path = os.path.join('/var/lib/ceph/osd',
                             'ceph-{id}'.format(id=osd),
                             'current',
                             '{pg}_head'.format(pg=victim))

    # fuzz time
    ls_fp = StringIO()
    osd_remote.run(
        args=['sudo', 'ls', data_path],
        stdout=ls_fp,
    )
    ls_out = ls_fp.getvalue()
    ls_fp.close()

    # find an object file we can mess with
    osdfilename = None
    for line in ls_out.split('\n'):
        if 'object' in line:
            osdfilename = line
            break
    assert osdfilename is not None

    # Get actual object name from osd stored filename
    tmp = osdfilename.split('__')
    objname = tmp[0]
    # the filestore escapes '_' in object names; undo that.  Use a raw
    # string: the old '\u' spelling is a SyntaxError on python 3 (same
    # value as r'\u' on python 2).
    objname = objname.replace(r'\u', '_')
    log.info('fuzzing %s' % objname)

    # put a single \0 at the beginning of the file
    osd_remote.run(args=[
        'sudo', 'dd',
        'if=/dev/zero',
        'of=%s' % os.path.join(data_path, osdfilename),
        'bs=1', 'count=1', 'conv=notrunc'
        ])

    # scrub, verify inconsistent
    manager.raw_cluster_cmd('pg', 'deep-scrub', victim)
    # Give deep-scrub a chance to start
    time.sleep(60)

    while True:
        stats = manager.get_single_pg_stats(victim)
        state = stats['state']
        # wait for the scrub to finish
        if 'scrubbing' in state:
            time.sleep(3)
            continue

        inconsistent = stats['state'].find('+inconsistent') != -1
        assert inconsistent
        break

    # repair, verify no longer inconsistent
    manager.raw_cluster_cmd('pg', 'repair', victim)
    # Give repair a chance to start
    time.sleep(60)

    while True:
        stats = manager.get_single_pg_stats(victim)
        state = stats['state']
        # wait for the scrub to finish
        if 'scrubbing' in state:
            time.sleep(3)
            continue

        inconsistent = stats['state'].find('+inconsistent') != -1
        assert not inconsistent
        break

    # Test deep-scrub with various omap modifications
    manager.do_rados(mon, ['-p', 'rbd', 'setomapval', objname, 'key', 'val'])
    manager.do_rados(mon, ['-p', 'rbd', 'setomapheader', objname, 'hdr'])

    # Modify omap on specific osd
    log.info('fuzzing omap of %s' % objname)
    manager.osd_admin_socket(osd, ['rmomapkey', 'rbd', objname, 'key'])
    manager.osd_admin_socket(
        osd,
        ['setomapval', 'rbd', objname, 'badkey', 'badval'])
    manager.osd_admin_socket(osd, ['setomapheader', 'rbd', objname, 'badhdr'])

    # scrub, verify inconsistent
    manager.raw_cluster_cmd('pg', 'deep-scrub', victim)
    # Give deep-scrub a chance to start
    time.sleep(60)

    while True:
        stats = manager.get_single_pg_stats(victim)
        state = stats['state']
        # wait for the scrub to finish
        if 'scrubbing' in state:
            time.sleep(3)
            continue

        inconsistent = stats['state'].find('+inconsistent') != -1
        assert inconsistent
        break

    # repair, verify no longer inconsistent
    manager.raw_cluster_cmd('pg', 'repair', victim)
    # Give repair a chance to start
    time.sleep(60)

    while True:
        stats = manager.get_single_pg_stats(victim)
        state = stats['state']
        # wait for the scrub to finish
        if 'scrubbing' in state:
            time.sleep(3)
            continue

        inconsistent = stats['state'].find('+inconsistent') != -1
        assert not inconsistent
        break

    log.info('test successful!')
def test_incomplete_pgs(ctx, config):
    """
    Test handling of incomplete pgs.  Requires 4 osds.

    Moves data between osd pairs with recovery delayed, kills the pair
    holding the only complete copies, and verifies the cluster reports
    down pgs and then recovers once the pair is revived.

    :param ctx: Context
    :param config: Configuration (must be a dict; may be empty)
    :raises AssertionError: if config is not a dict or the cluster does
                            not have exactly 4 osds
    """
    testdir = teuthology.get_testdir(ctx)
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'task only accepts a dict for configuration'
    first_mon = teuthology.get_first_mon(ctx, config)
    # exactly one remote hosts the first mon
    (mon,) = ctx.cluster.only(first_mon).remotes.keys()

    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
    log.info('num_osds is %s' % num_osds)
    assert num_osds == 4

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

    while len(manager.get_osd_status()['up']) < 4:
        time.sleep(10)

    for i in range(4):
        manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'flush_pg_stats')
    manager.wait_for_clean()

    log.info('Testing incomplete pgs...')

    # effectively disable recovery so backfills/recoveries stay pending
    for i in range(4):
        manager.set_config(
            i,
            osd_recovery_delay_start=1000)

    # move data off of osd.0, osd.1
    manager.raw_cluster_cmd('osd', 'out', '0', '1')
    for i in range(4):
        manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'flush_pg_stats')
    manager.wait_for_clean()

    # lots of objects in rbd (no pg log, will backfill)
    p = rados_start(testdir, mon,
                    ['-p', 'rbd', 'bench', '20', 'write', '-b', '1',
                     '--no-cleanup'])
    p.wait()

    # few objects in rbd pool (with pg log, normal recovery)
    for f in range(1, 20):
        p = rados_start(testdir, mon, ['-p', 'rbd', 'put',
                                       'foo.%d' % f, '/etc/passwd'])
        p.wait()

    # move it back
    manager.raw_cluster_cmd('osd', 'in', '0', '1')
    manager.raw_cluster_cmd('osd', 'out', '2', '3')
    for i in range(4):
        manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'flush_pg_stats')
    manager.wait_for_active()

    # recovery is delayed, so the cluster cannot be clean yet
    assert not manager.is_clean()
    assert not manager.is_recovered()

    # kill 2 + 3
    log.info('stopping 2,3')
    manager.kill_osd(2)
    manager.kill_osd(3)
    log.info('...')
    manager.raw_cluster_cmd('osd', 'down', '2', '3')
    for i in range(2):
        manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'flush_pg_stats')
    manager.wait_for_active_or_down()

    # some pgs had their only complete copies on 2/3, so must be down
    assert manager.get_num_down() > 0

    # revive 2 + 3
    manager.revive_osd(2)
    manager.revive_osd(3)
    while len(manager.get_osd_status()['up']) < 4:
        log.info('waiting a bit...')
        time.sleep(2)
    log.info('all are up!')

    # kick the recovery work queues so the delayed recovery proceeds
    for i in range(4):
        manager.kick_recovery_wq(i)

    # cluster must recover
    manager.wait_for_clean()
def task(ctx, config):
    """
    Test the dump_stuck command.

    Walks the cluster through clean, unclean (osds stopped) and stale
    states, checking the stuck-pg counts reported at each step.

    :param ctx: Context
    :param config: Must be None (task takes no configuration)
    :raises AssertionError: if config is given or there are not
                            exactly 2 osds
    """
    assert config is None, \
        'dump_stuck requires no configuration'
    assert teuthology.num_instances_of_type(ctx.cluster, 'osd') == 2, \
        'dump_stuck requires exactly 2 osds'

    timeout = 60
    first_mon = teuthology.get_first_mon(ctx, config)
    # exactly one remote hosts the first mon
    (mon,) = ctx.cluster.only(first_mon).remotes.keys()

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

    manager.flush_pg_stats([0, 1])
    manager.wait_for_clean(timeout)

    # lower the stuck threshold so pgs report stuck quickly
    manager.raw_cluster_cmd('tell', 'mon.0', 'injectargs', '--',
#                            '--mon-osd-report-timeout 90',
                            '--mon-pg-stuck-threshold 10')

    # all active+clean
    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=0,
        num_stale=0,
        )
    num_pgs = manager.get_num_pgs()

    manager.mark_out_osd(0)
    time.sleep(timeout)
    manager.flush_pg_stats([1])
    manager.wait_for_recovery(timeout)

    # all active+clean+remapped
    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=0,
        num_stale=0,
        )

    manager.mark_in_osd(0)
    manager.flush_pg_stats([0, 1])
    manager.wait_for_clean(timeout)

    # all active+clean
    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=0,
        num_stale=0,
        )

    log.info('stopping first osd')
    manager.kill_osd(0)
    manager.mark_down_osd(0)
    manager.wait_for_active(timeout)

    log.info('waiting for all to be unclean')
    starttime = time.time()
    done = False
    while not done:
        try:
            check_stuck(
                manager,
                num_inactive=0,
                num_unclean=num_pgs,
                num_stale=0,
                )
            done = True
        except AssertionError:
            # wait up to 15 minutes to become stale
            if time.time() - starttime > 900:
                raise

    log.info('stopping second osd')
    manager.kill_osd(1)
    manager.mark_down_osd(1)

    log.info('waiting for all to be stale')
    starttime = time.time()
    done = False
    while not done:
        try:
            check_stuck(
                manager,
                num_inactive=0,
                num_unclean=num_pgs,
                num_stale=num_pgs,
                )
            done = True
        except AssertionError:
            # wait up to 15 minutes to become stale
            if time.time() - starttime > 900:
                raise

    log.info('reviving')
    for id_ in teuthology.all_roles_of_type(ctx.cluster, 'osd'):
        manager.revive_osd(id_)
        manager.mark_in_osd(id_)
    # flush fails until the revived osds are actually up; retry
    while True:
        try:
            manager.flush_pg_stats([0, 1])
            break
        except Exception:
            log.exception('osds must not be started yet, waiting...')
            time.sleep(1)
    manager.wait_for_clean(timeout)

    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=0,
        num_stale=0,
        )
def cluster(ctx, config):
    """
    Set up a ceph cluster under /tmp/cephtest and tear it down again.

    Creates the data directory, generates and distributes ceph.conf,
    creates keyrings and the monmap, runs mkfs for mon and osd daemons
    (optionally on btrfs scratch devices), then yields to the nested
    tasks.  On exit it archives the cluster log, scans it for
    non-whitelisted ERR/WRN/SEC entries, unmounts any btrfs data dirs
    and removes everything it created.

    NOTE(review): contains a bare ``yield``, so this is presumably used
    as a contextmanager-style task (decorator not visible in this chunk)
    — confirm at the definition site.

    :param ctx: Context
    :param config: Configuration; reads 'conf', and optionally 'btrfs'
                   and 'log_whitelist'
    """
    log.info('Creating ceph cluster...')
    run.wait(
        ctx.cluster.run(
            args=[
                'install', '-d', '-m0755', '--',
                '/tmp/cephtest/data',
                ],
            wait=False,
            )
        )

    log.info('Generating config...')
    remotes_and_roles = ctx.cluster.remotes.items()
    roles = [roles for (remote, roles) in remotes_and_roles]
    ips = [host for (host, port) in (remote.ssh.get_transport().getpeername() for (remote, roles) in remotes_and_roles)]
    conf = teuthology.skeleton_config(roles=roles, ips=ips)
    # overlay per-section settings from the task config onto the skeleton
    for section, keys in config['conf'].iteritems():
        for key, value in keys.iteritems():
            log.info("[%s] %s = %s" % (section, key, value))
            if section not in conf:
                conf[section] = {}
            conf[section][key] = value

    ctx.ceph = argparse.Namespace()
    ctx.ceph.conf = conf

    log.info('Writing configs...')
    conf_fp = StringIO()
    conf.write(conf_fp)
    conf_fp.seek(0)
    # stream the same ceph.conf to every remote in parallel
    writes = ctx.cluster.run(
        args=[
            'python',
            '-c',
            'import shutil, sys; shutil.copyfileobj(sys.stdin, file(sys.argv[1], "wb"))',
            '/tmp/cephtest/ceph.conf',
            ],
        stdin=run.PIPE,
        wait=False,
        )
    teuthology.feed_many_stdins_and_close(conf_fp, writes)
    run.wait(writes)

    coverage_dir = '/tmp/cephtest/archive/coverage'

    firstmon = teuthology.get_first_mon(ctx, config)

    # create the cluster keyring and the mon. key on the first mon
    log.info('Setting up %s...' % firstmon)
    ctx.cluster.only(firstmon).run(
        args=[
            '/tmp/cephtest/enable-coredump',
            '/tmp/cephtest/binary/usr/local/bin/ceph-coverage',
            coverage_dir,
            '/tmp/cephtest/binary/usr/local/bin/ceph-authtool',
            '--create-keyring',
            '/tmp/cephtest/ceph.keyring',
            ],
        )
    ctx.cluster.only(firstmon).run(
        args=[
            '/tmp/cephtest/enable-coredump',
            '/tmp/cephtest/binary/usr/local/bin/ceph-coverage',
            coverage_dir,
            '/tmp/cephtest/binary/usr/local/bin/ceph-authtool',
            '--gen-key',
            '--name=mon.',
            '/tmp/cephtest/ceph.keyring',
            ],
        )
    (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
    teuthology.create_simple_monmap(
        remote=mon0_remote,
        conf=conf,
        )

    log.info('Creating admin key on %s...' % firstmon)
    ctx.cluster.only(firstmon).run(
        args=[
            '/tmp/cephtest/enable-coredump',
            '/tmp/cephtest/binary/usr/local/bin/ceph-coverage',
            coverage_dir,
            '/tmp/cephtest/binary/usr/local/bin/ceph-authtool',
            '--gen-key',
            '--name=client.admin',
            '--set-uid=0',
            '--cap', 'mon', 'allow *',
            '--cap', 'osd', 'allow *',
            '--cap', 'mds', 'allow',
            '/tmp/cephtest/ceph.keyring',
            ],
        )

    log.info('Copying monmap to all nodes...')
    keyring = teuthology.get_file(
        remote=mon0_remote,
        path='/tmp/cephtest/ceph.keyring',
        )
    monmap = teuthology.get_file(
        remote=mon0_remote,
        path='/tmp/cephtest/monmap',
        )

    for rem in ctx.cluster.remotes.iterkeys():
        # copy mon key and initial monmap
        log.info('Sending monmap to node {remote}'.format(remote=rem))
        teuthology.write_file(
            remote=rem,
            path='/tmp/cephtest/ceph.keyring',
            data=keyring,
            )
        teuthology.write_file(
            remote=rem,
            path='/tmp/cephtest/monmap',
            data=monmap,
            )

    log.info('Setting up mon nodes...')
    mons = ctx.cluster.only(teuthology.is_type('mon'))
    # build an initial osdmap on every mon host
    run.wait(
        mons.run(
            args=[
                '/tmp/cephtest/enable-coredump',
                '/tmp/cephtest/binary/usr/local/bin/ceph-coverage',
                coverage_dir,
                '/tmp/cephtest/binary/usr/local/bin/osdmaptool',
                '--clobber',
                '--createsimple', '{num:d}'.format(
                    num=teuthology.num_instances_of_type(ctx.cluster, 'osd'),
                    ),
                '/tmp/cephtest/osdmap',
                '--pg_bits', '2',
                '--pgp_bits', '4',
                ],
            wait=False,
            ),
        )

    log.info('Setting up osd nodes...')
    osds = ctx.cluster.only(teuthology.is_type('osd'))
    # one keyring per osd instance
    for remote, roles_for_host in osds.remotes.iteritems():
        for id_ in teuthology.roles_of_type(roles_for_host, 'osd'):
            remote.run(
                args=[
                    '/tmp/cephtest/enable-coredump',
                    '/tmp/cephtest/binary/usr/local/bin/ceph-coverage',
                    coverage_dir,
                    '/tmp/cephtest/binary/usr/local/bin/ceph-authtool',
                    '--create-keyring',
                    '--gen-key',
                    '--name=osd.{id}'.format(id=id_),
                    '/tmp/cephtest/data/osd.{id}.keyring'.format(id=id_),
                    ],
                )

    log.info('Setting up mds nodes...')
    mdss = ctx.cluster.only(teuthology.is_type('mds'))
    for remote, roles_for_host in mdss.remotes.iteritems():
        for id_ in teuthology.roles_of_type(roles_for_host, 'mds'):
            remote.run(
                args=[
                    '/tmp/cephtest/enable-coredump',
                    '/tmp/cephtest/binary/usr/local/bin/ceph-coverage',
                    coverage_dir,
                    '/tmp/cephtest/binary/usr/local/bin/ceph-authtool',
                    '--create-keyring',
                    '--gen-key',
                    '--name=mds.{id}'.format(id=id_),
                    '/tmp/cephtest/data/mds.{id}.keyring'.format(id=id_),
                    ],
                )

    log.info('Setting up client nodes...')
    clients = ctx.cluster.only(teuthology.is_type('client'))
    for remote, roles_for_host in clients.remotes.iteritems():
        for id_ in teuthology.roles_of_type(roles_for_host, 'client'):
            remote.run(
                args=[
                    '/tmp/cephtest/enable-coredump',
                    '/tmp/cephtest/binary/usr/local/bin/ceph-coverage',
                    coverage_dir,
                    '/tmp/cephtest/binary/usr/local/bin/ceph-authtool',
                    '--create-keyring',
                    '--gen-key',
                    # TODO this --name= is not really obeyed, all unknown "types" are munged to "client"
                    '--name=client.{id}'.format(id=id_),
                    '/tmp/cephtest/data/client.{id}.keyring'.format(id=id_),
                    ],
                )

    log.info('Reading keys from all nodes...')
    keys_fp = StringIO()
    keys = []
    # collect every daemon/client key so they can be merged on the mons
    for remote, roles_for_host in ctx.cluster.remotes.iteritems():
        for type_ in ['osd', 'mds', 'client']:
            for id_ in teuthology.roles_of_type(roles_for_host, type_):
                data = teuthology.get_file(
                    remote=remote,
                    path='/tmp/cephtest/data/{type}.{id}.keyring'.format(
                        type=type_,
                        id=id_,
                        ),
                    )
                keys.append((type_, id_, data))
                keys_fp.write(data)

    log.info('Adding keys to all mons...')
    writes = mons.run(
        args=[
            'cat',
            run.Raw('>>'),
            '/tmp/cephtest/ceph.keyring',
            ],
        stdin=run.PIPE,
        wait=False,
        )
    keys_fp.seek(0)
    teuthology.feed_many_stdins_and_close(keys_fp, writes)
    run.wait(writes)
    # set the capabilities for each key on every mon
    for type_, id_, data in keys:
        run.wait(
            mons.run(
                args=[
                    '/tmp/cephtest/enable-coredump',
                    '/tmp/cephtest/binary/usr/local/bin/ceph-coverage',
                    coverage_dir,
                    '/tmp/cephtest/binary/usr/local/bin/ceph-authtool',
                    '/tmp/cephtest/ceph.keyring',
                    '--name={type}.{id}'.format(
                        type=type_,
                        id=id_,
                        ),
                    ] + list(teuthology.generate_caps(type_)),
                wait=False,
                ),
            )

    log.info('Running mkfs on mon nodes...')
    for remote, roles_for_host in mons.remotes.iteritems():
        for id_ in teuthology.roles_of_type(roles_for_host, 'mon'):
            remote.run(
                args=[
                    '/tmp/cephtest/enable-coredump',
                    '/tmp/cephtest/binary/usr/local/bin/ceph-coverage',
                    coverage_dir,
                    '/tmp/cephtest/binary/usr/local/bin/ceph-mon',
                    '--mkfs',
                    '-i', id_,
                    '-c', '/tmp/cephtest/ceph.conf',
                    '--monmap=/tmp/cephtest/monmap',
                    '--osdmap=/tmp/cephtest/osdmap',
                    '--keyring=/tmp/cephtest/ceph.keyring',
                    ],
                )

    log.info('Running mkfs on osd nodes...')
    devs_to_clean = {}
    for remote, roles_for_host in osds.remotes.iteritems():
        roles_to_devs = {}
        if config.get('btrfs'):
            log.info('btrfs option selected, checkin for scrach devs')
            devs = teuthology.get_scratch_devices(remote)
            log.info('found devs: %s' % (str(devs),))
            roles_to_devs = assign_devs(
                teuthology.roles_of_type(roles_for_host, 'osd'), devs
                )
            log.info('dev map: %s' % (str(roles_to_devs),))
            devs_to_clean[remote] = []

        for id_ in teuthology.roles_of_type(roles_for_host, 'osd'):
            remote.run(
                args=[
                    'mkdir',
                    os.path.join('/tmp/cephtest/data', 'osd.{id}.data'.format(id=id_)),
                    ],
                )
            # if a scratch dev was assigned, format it btrfs and mount
            # it as this osd's data dir (remember it for umount later)
            if roles_to_devs.get(id_):
                dev = roles_to_devs[id_]
                log.info('mkfs.btrfs on %s on %s' % (dev, remote))
                remote.run(
                    args=[
                        'sudo',
                        'apt-get', 'install', '-y', 'btrfs-tools'
                        ]
                    )
                remote.run(
                    args=[
                        'sudo',
                        'mkfs.btrfs',
                        dev
                        ]
                    )
                log.info('mount %s on %s' % (dev, remote))
                remote.run(
                    args=[
                        'sudo',
                        'mount',
                        '-o',
                        'user_subvol_rm_allowed',
                        dev,
                        os.path.join('/tmp/cephtest/data', 'osd.{id}.data'.format(id=id_)),
                        ]
                    )
                remote.run(
                    args=[
                        'sudo', 'chown', '-R', 'ubuntu.ubuntu',
                        os.path.join('/tmp/cephtest/data', 'osd.{id}.data'.format(id=id_))
                        ]
                    )
                remote.run(
                    args=[
                        'sudo', 'chmod', '-R', '755',
                        os.path.join('/tmp/cephtest/data', 'osd.{id}.data'.format(id=id_))
                        ]
                    )
                devs_to_clean[remote].append(
                    os.path.join(
                        '/tmp/cephtest/data', 'osd.{id}.data'.format(id=id_)
                        )
                    )

        for id_ in teuthology.roles_of_type(roles_for_host, 'osd'):
            remote.run(
                args=[
                    '/tmp/cephtest/enable-coredump',
                    '/tmp/cephtest/binary/usr/local/bin/ceph-coverage',
                    coverage_dir,
                    '/tmp/cephtest/binary/usr/local/bin/ceph-osd',
                    '--mkfs',
                    '-i', id_,
                    '-c', '/tmp/cephtest/ceph.conf',
                    '--monmap', '/tmp/cephtest/monmap',
                    ],
                )
    # the monmap/osdmap inputs are no longer needed once mkfs is done
    run.wait(
        mons.run(
            args=[
                'rm',
                '--',
                '/tmp/cephtest/monmap',
                '/tmp/cephtest/osdmap',
                ],
            wait=False,
            ),
        )

    try:
        yield
    finally:
        # teardown: archive the cluster log, check it for badness,
        # unmount btrfs data dirs and remove everything we created
        (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
        if ctx.archive is not None:
            log.info('Grabbing cluster log from %s %s...' % (mon0_remote, firstmon))
            dest = os.path.join(ctx.archive, 'ceph.log')
            mon0_remote.run(
                args = [
                    'cat',
                    '--',
                    '/tmp/cephtest/data/%s/log' % firstmon
                    ],
                stdout=file(dest, 'wb'),
                )

        log.info('Checking cluster ceph.log for badness...')
        def first_in_ceph_log(pattern, excludes):
            # return the first log line matching pattern but none of
            # excludes, or None if there is no such line
            args = [
                'egrep', pattern,
                '/tmp/cephtest/data/%s/log' % firstmon,
                ]
            for exclude in excludes:
                args.extend([run.Raw('|'), 'egrep', '-v', exclude])
            args.extend([
                run.Raw('|'), 'head', '-n', '1',
                ])
            r = mon0_remote.run(
                stdout=StringIO(),
                args=args,
                )
            stdout = r.stdout.getvalue()
            if stdout != '':
                return stdout
            return None

        if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]',
                             config['log_whitelist']) is not None:
            log.warning('Found errors (ERR|WRN|SEC) in cluster log')
            ctx.summary['success'] = False
            # use the most severe problem as the failure reason
            if 'failure_reason' not in ctx.summary:
                for pattern in ['\[SEC\]', '\[ERR\]', '\[WRN\]']:
                    match = first_in_ceph_log(pattern, config['log_whitelist'])
                    if match is not None:
                        ctx.summary['failure_reason'] = \
                            '"{match}" in cluster log'.format(
                            match=match.rstrip('\n'),
                            )
                        break

        for remote, dirs in devs_to_clean.iteritems():
            for dir_ in dirs:
                log.info('Unmounting %s on %s' % (dir_, remote))
                remote.run(
                    args=[
                        "sudo",
                        "umount",
                        "-f",
                        dir_
                        ]
                    )

        log.info('Cleaning ceph cluster...')
        run.wait(
            ctx.cluster.run(
                args=[
                    'rm',
                    '-rf',
                    '--',
                    '/tmp/cephtest/ceph.conf',
                    '/tmp/cephtest/ceph.keyring',
                    '/tmp/cephtest/data',
                    '/tmp/cephtest/monmap',
                    run.Raw('/tmp/cephtest/asok.*')
                    ],
                wait=False,
                ),
            )
def cluster(ctx, config):
    """
    Handle the creation and removal of a ceph cluster.

    On startup:
        Create directories needed for the cluster.
        Create remote journals for all osds.
        Create and set keyring.
        Copy the monmap to the test systems.
        Setup mon nodes.
        Setup mds nodes.
        Mkfs osd nodes.
        Add keyring information to monmaps
        Mkfs mon nodes.

    On exit:
        If errors occurred, extract a failure message and store in ctx.summary.
        Unmount all test files and temporary journaling files.
        Save the monitor information and archive all ceph logs.
        Cleanup the keyring setup, and remove all monitor map and data files
        left over.

    :param ctx: Context
    :param config: Configuration
    """
    if ctx.config.get("use_existing_cluster", False) is True:
        log.info("'use_existing_cluster' is true; skipping cluster creation")
        yield
        # BUGFIX: without this return, teardown resumed the generator past
        # the yield, ran the whole cluster-creation path below, and hit the
        # second `yield` (a context-manager generator must stop after one).
        return

    testdir = teuthology.get_testdir(ctx)
    cluster_name = config["cluster"]
    data_dir = "{tdir}/{cluster}.data".format(tdir=testdir, cluster=cluster_name)
    log.info("Creating ceph cluster %s...", cluster_name)
    run.wait(ctx.cluster.run(args=["install", "-d", "-m0755", "--", data_dir], wait=False))
    run.wait(ctx.cluster.run(args=["sudo", "install", "-d", "-m0777", "--", "/var/run/ceph"], wait=False))

    # Discover scratch devices and decide which roles get real devices
    # and/or block/tmpfs journals on each osd host.
    devs_to_clean = {}
    remote_to_roles_to_devs = {}
    remote_to_roles_to_journals = {}
    osds = ctx.cluster.only(teuthology.is_type("osd", cluster_name))
    for remote, roles_for_host in osds.remotes.iteritems():
        devs = teuthology.get_scratch_devices(remote)
        roles_to_devs = {}
        roles_to_journals = {}
        if config.get("fs"):
            log.info("fs option selected, checking for scratch devs")
            log.info("found devs: %s" % (str(devs),))
            devs_id_map = teuthology.get_wwn_id_map(remote, devs)
            iddevs = devs_id_map.values()
            roles_to_devs = assign_devs(
                teuthology.cluster_roles_of_type(roles_for_host, "osd", cluster_name), iddevs
            )
            if len(roles_to_devs) < len(iddevs):
                iddevs = iddevs[len(roles_to_devs):]
        devs_to_clean[remote] = []

        if config.get("block_journal"):
            log.info("block journal enabled")
            roles_to_journals = assign_devs(
                teuthology.cluster_roles_of_type(roles_for_host, "osd", cluster_name), iddevs
            )
            log.info("journal map: %s", roles_to_journals)

        if config.get("tmpfs_journal"):
            log.info("tmpfs journal enabled")
            roles_to_journals = {}
            remote.run(args=["sudo", "mount", "-t", "tmpfs", "tmpfs", "/mnt"])
            for role in teuthology.cluster_roles_of_type(roles_for_host, "osd", cluster_name):
                tmpfs = "/mnt/" + role
                roles_to_journals[role] = tmpfs
                remote.run(args=["truncate", "-s", "1500M", tmpfs])
            log.info("journal map: %s", roles_to_journals)

        log.info("dev map: %s" % (str(roles_to_devs),))
        remote_to_roles_to_devs[remote] = roles_to_devs
        remote_to_roles_to_journals[remote] = roles_to_journals

    # Build the ceph.conf skeleton from the role/ip layout, then layer on
    # per-role journal paths and any overrides from the task config.
    log.info("Generating config...")
    remotes_and_roles = ctx.cluster.remotes.items()
    roles = [role_list for (remote, role_list) in remotes_and_roles]
    ips = [
        host
        for (host, port) in (remote.ssh.get_transport().getpeername() for (remote, role_list) in remotes_and_roles)
    ]
    conf = teuthology.skeleton_config(ctx, roles=roles, ips=ips, cluster=cluster_name)
    for remote, roles_to_journals in remote_to_roles_to_journals.iteritems():
        for role, journal in roles_to_journals.iteritems():
            name = teuthology.ceph_role(role)
            if name not in conf:
                conf[name] = {}
            conf[name]["osd journal"] = journal
    for section, keys in config["conf"].iteritems():
        for key, value in keys.iteritems():
            log.info("[%s] %s = %s" % (section, key, value))
            if section not in conf:
                conf[section] = {}
            conf[section][key] = value

    if config.get("tmpfs_journal"):
        # direct I/O does not work on tmpfs-backed journals
        conf["journal dio"] = False

    if not hasattr(ctx, "ceph"):
        ctx.ceph = {}
    ctx.ceph[cluster_name] = argparse.Namespace()
    ctx.ceph[cluster_name].conf = conf

    default_keyring = "/etc/ceph/{cluster}.keyring".format(cluster=cluster_name)
    keyring_path = config.get("keyring_path", default_keyring)

    coverage_dir = "{tdir}/archive/coverage".format(tdir=testdir)

    firstmon = teuthology.get_first_mon(ctx, config, cluster_name)

    # Create the cluster keyring and the mon. key on the first monitor.
    log.info("Setting up %s..." % firstmon)
    ctx.cluster.only(firstmon).run(
        args=[
            "sudo", "adjust-ulimits", "ceph-coverage", coverage_dir,
            "ceph-authtool", "--create-keyring", keyring_path,
        ]
    )
    ctx.cluster.only(firstmon).run(
        args=[
            "sudo", "adjust-ulimits", "ceph-coverage", coverage_dir,
            "ceph-authtool", "--gen-key", "--name=mon.", keyring_path,
        ]
    )
    ctx.cluster.only(firstmon).run(args=["sudo", "chmod", "0644", keyring_path])
    (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
    monmap_path = "{tdir}/{cluster}.monmap".format(tdir=testdir, cluster=cluster_name)
    fsid = teuthology.create_simple_monmap(ctx, remote=mon0_remote, conf=conf, path=monmap_path)
    if not "global" in conf:
        conf["global"] = {}
    conf["global"]["fsid"] = fsid

    default_conf_path = "/etc/ceph/{cluster}.conf".format(cluster=cluster_name)
    conf_path = config.get("conf_path", default_conf_path)
    log.info("Writing %s for FSID %s..." % (conf_path, fsid))
    write_conf(ctx, conf_path, cluster_name)

    log.info("Creating admin key on %s..." % firstmon)
    ctx.cluster.only(firstmon).run(
        args=[
            "sudo", "adjust-ulimits", "ceph-coverage", coverage_dir,
            "ceph-authtool", "--gen-key", "--name=client.admin",
            "--set-uid=0",
            "--cap", "mon", "allow *",
            "--cap", "osd", "allow *",
            "--cap", "mds", "allow *",
            keyring_path,
        ]
    )

    # Distribute the keyring and the initial monmap to every remote.
    log.info("Copying monmap to all nodes...")
    keyring = teuthology.get_file(remote=mon0_remote, path=keyring_path)
    monmap = teuthology.get_file(remote=mon0_remote, path=monmap_path)

    for rem in ctx.cluster.remotes.iterkeys():
        # copy mon key and initial monmap
        log.info("Sending monmap to node {remote}".format(remote=rem))
        teuthology.sudo_write_file(remote=rem, path=keyring_path, data=keyring, perms="0644")
        teuthology.write_file(remote=rem, path=monmap_path, data=monmap)

    log.info("Setting up mon nodes...")
    mons = ctx.cluster.only(teuthology.is_type("mon", cluster_name))
    osdmap_path = "{tdir}/{cluster}.osdmap".format(tdir=testdir, cluster=cluster_name)
    run.wait(
        mons.run(
            args=[
                "adjust-ulimits", "ceph-coverage", coverage_dir,
                "osdmaptool", "-c", conf_path, "--clobber", "--createsimple",
                "{num:d}".format(num=teuthology.num_instances_of_type(ctx.cluster, "osd", cluster_name)),
                osdmap_path, "--pg_bits", "2", "--pgp_bits", "4",
            ],
            wait=False,
        )
    )

    # Create data dir + keyring for every mgr daemon.
    log.info("Setting up mgr nodes...")
    mgrs = ctx.cluster.only(teuthology.is_type("mgr", cluster_name))
    for remote, roles_for_host in mgrs.remotes.iteritems():
        for role in teuthology.cluster_roles_of_type(roles_for_host, "mgr", cluster_name):
            _, _, id_ = teuthology.split_role(role)
            mgr_dir = "/var/lib/ceph/mgr/{cluster}-{id}".format(cluster=cluster_name, id=id_)
            remote.run(
                args=[
                    "sudo", "mkdir", "-p", mgr_dir,
                    run.Raw("&&"),
                    "sudo", "adjust-ulimits", "ceph-coverage", coverage_dir,
                    "ceph-authtool", "--create-keyring", "--gen-key",
                    "--name=mgr.{id}".format(id=id_),
                    mgr_dir + "/keyring",
                ]
            )

    # Create data dir + keyring for every mds daemon.
    log.info("Setting up mds nodes...")
    mdss = ctx.cluster.only(teuthology.is_type("mds", cluster_name))
    for remote, roles_for_host in mdss.remotes.iteritems():
        for role in teuthology.cluster_roles_of_type(roles_for_host, "mds", cluster_name):
            _, _, id_ = teuthology.split_role(role)
            mds_dir = "/var/lib/ceph/mds/{cluster}-{id}".format(cluster=cluster_name, id=id_)
            remote.run(
                args=[
                    "sudo", "mkdir", "-p", mds_dir,
                    run.Raw("&&"),
                    "sudo", "adjust-ulimits", "ceph-coverage", coverage_dir,
                    "ceph-authtool", "--create-keyring", "--gen-key",
                    "--name=mds.{id}".format(id=id_),
                    mds_dir + "/keyring",
                ]
            )

    cclient.create_keyring(ctx, cluster_name)
    log.info("Running mkfs on osd nodes...")

    # Record device/journal assignments on ctx.disk_config so later tasks
    # (and teardown) can find them; merge rather than clobber in case a
    # previous cluster already populated them.
    if not hasattr(ctx, "disk_config"):
        ctx.disk_config = argparse.Namespace()
    if not hasattr(ctx.disk_config, "remote_to_roles_to_dev"):
        ctx.disk_config.remote_to_roles_to_dev = {}
    if not hasattr(ctx.disk_config, "remote_to_roles_to_journals"):
        ctx.disk_config.remote_to_roles_to_journals = {}
    if not hasattr(ctx.disk_config, "remote_to_roles_to_dev_mount_options"):
        ctx.disk_config.remote_to_roles_to_dev_mount_options = {}
    if not hasattr(ctx.disk_config, "remote_to_roles_to_dev_fstype"):
        ctx.disk_config.remote_to_roles_to_dev_fstype = {}

    teuthology.deep_merge(ctx.disk_config.remote_to_roles_to_dev, remote_to_roles_to_devs)
    teuthology.deep_merge(ctx.disk_config.remote_to_roles_to_journals, remote_to_roles_to_journals)

    log.info("ctx.disk_config.remote_to_roles_to_dev: {r}".format(r=str(ctx.disk_config.remote_to_roles_to_dev)))
    for remote, roles_for_host in osds.remotes.iteritems():
        roles_to_devs = remote_to_roles_to_devs[remote]
        roles_to_journals = remote_to_roles_to_journals[remote]

        for role in teuthology.cluster_roles_of_type(roles_for_host, "osd", cluster_name):
            _, _, id_ = teuthology.split_role(role)
            mnt_point = "/var/lib/ceph/osd/{cluster}-{id}".format(cluster=cluster_name, id=id_)
            remote.run(args=["sudo", "mkdir", "-p", mnt_point])
            log.info(str(roles_to_journals))
            log.info(role)
            if roles_to_devs.get(role):
                # This osd got a scratch device: mkfs it with fs-specific
                # defaults and mount it at the osd data dir.
                dev = roles_to_devs[role]
                fs = config.get("fs")
                package = None
                mkfs_options = config.get("mkfs_options")
                mount_options = config.get("mount_options")
                if fs == "btrfs":
                    # package = 'btrfs-tools'
                    if mount_options is None:
                        mount_options = ["noatime", "user_subvol_rm_allowed"]
                    if mkfs_options is None:
                        mkfs_options = ["-m", "single", "-l", "32768", "-n", "32768"]
                if fs == "xfs":
                    # package = 'xfsprogs'
                    if mount_options is None:
                        mount_options = ["noatime"]
                    if mkfs_options is None:
                        mkfs_options = ["-f", "-i", "size=2048"]
                if fs == "ext4" or fs == "ext3":
                    if mount_options is None:
                        mount_options = ["noatime", "user_xattr"]

                if mount_options is None:
                    mount_options = []
                if mkfs_options is None:
                    mkfs_options = []
                mkfs = ["mkfs.%s" % fs] + mkfs_options
                log.info("%s on %s on %s" % (mkfs, dev, remote))
                if package is not None:
                    remote.run(args=["sudo", "apt-get", "install", "-y", package], stdout=StringIO())

                try:
                    remote.run(args=["yes", run.Raw("|")] + ["sudo"] + mkfs + [dev])
                except run.CommandFailedError:
                    # Newer btfs-tools doesn't prompt for overwrite, use -f
                    if "-f" not in mount_options:
                        mkfs_options.append("-f")
                        mkfs = ["mkfs.%s" % fs] + mkfs_options
                        log.info("%s on %s on %s" % (mkfs, dev, remote))
                    remote.run(args=["yes", run.Raw("|")] + ["sudo"] + mkfs + [dev])

                log.info("mount %s on %s -o %s" % (dev, remote, ",".join(mount_options)))
                remote.run(args=["sudo", "mount", "-t", fs, "-o", ",".join(mount_options), dev, mnt_point])
                if not remote in ctx.disk_config.remote_to_roles_to_dev_mount_options:
                    ctx.disk_config.remote_to_roles_to_dev_mount_options[remote] = {}
                ctx.disk_config.remote_to_roles_to_dev_mount_options[remote][role] = mount_options
                if not remote in ctx.disk_config.remote_to_roles_to_dev_fstype:
                    ctx.disk_config.remote_to_roles_to_dev_fstype[remote] = {}
                ctx.disk_config.remote_to_roles_to_dev_fstype[remote][role] = fs
                devs_to_clean[remote].append(mnt_point)

        for role in teuthology.cluster_roles_of_type(roles_for_host, "osd", cluster_name):
            _, _, id_ = teuthology.split_role(role)
            remote.run(
                args=[
                    "sudo", "MALLOC_CHECK_=3", "adjust-ulimits", "ceph-coverage", coverage_dir,
                    "ceph-osd", "--cluster", cluster_name, "--mkfs", "--mkkey",
                    "-i", id_, "--monmap", monmap_path,
                ]
            )

    # Collect every daemon/client key so they can be registered on the mons.
    log.info("Reading keys from all nodes...")
    keys_fp = StringIO()
    keys = []
    for remote, roles_for_host in ctx.cluster.remotes.iteritems():
        for type_ in ["mgr", "mds", "osd"]:
            for role in teuthology.cluster_roles_of_type(roles_for_host, type_, cluster_name):
                _, _, id_ = teuthology.split_role(role)
                data = teuthology.get_file(
                    remote=remote,
                    path="/var/lib/ceph/{type}/{cluster}-{id}/keyring".format(type=type_, id=id_, cluster=cluster_name),
                    sudo=True,
                )
                keys.append((type_, id_, data))
                keys_fp.write(data)
    for remote, roles_for_host in ctx.cluster.remotes.iteritems():
        for role in teuthology.cluster_roles_of_type(roles_for_host, "client", cluster_name):
            _, _, id_ = teuthology.split_role(role)
            data = teuthology.get_file(
                remote=remote,
                path="/etc/ceph/{cluster}.client.{id}.keyring".format(id=id_, cluster=cluster_name)
            )
            keys.append(("client", id_, data))
            keys_fp.write(data)

    log.info("Adding keys to all mons...")
    writes = mons.run(args=["sudo", "tee", "-a", keyring_path], stdin=run.PIPE, wait=False, stdout=StringIO())
    keys_fp.seek(0)
    teuthology.feed_many_stdins_and_close(keys_fp, writes)
    run.wait(writes)
    for type_, id_, data in keys:
        run.wait(
            mons.run(
                args=[
                    "sudo", "adjust-ulimits", "ceph-coverage", coverage_dir,
                    "ceph-authtool", keyring_path,
                    "--name={type}.{id}".format(type=type_, id=id_),
                ] + list(generate_caps(type_)),
                wait=False,
            )
        )

    log.info("Running mkfs on mon nodes...")
    for remote, roles_for_host in mons.remotes.iteritems():
        for role in teuthology.cluster_roles_of_type(roles_for_host, "mon", cluster_name):
            _, _, id_ = teuthology.split_role(role)
            remote.run(
                args=["sudo", "mkdir", "-p", "/var/lib/ceph/mon/{cluster}-{id}".format(id=id_, cluster=cluster_name)]
            )
            remote.run(
                args=[
                    "sudo", "adjust-ulimits", "ceph-coverage", coverage_dir,
                    "ceph-mon", "--cluster", cluster_name, "--mkfs", "-i", id_,
                    "--monmap", monmap_path, "--osdmap", osdmap_path, "--keyring", keyring_path,
                ]
            )

    run.wait(mons.run(args=["rm", "--", monmap_path, osdmap_path], wait=False))

    try:
        yield
    except Exception:
        # we need to know this below
        ctx.summary["success"] = False
        raise
    finally:
        (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()

        log.info("Checking cluster log for badness...")

        def first_in_ceph_log(pattern, excludes):
            """
            Find the first occurrence of the pattern specified in the Ceph log.
            Returns None if none found.

            :param pattern: Pattern scanned for.
            :param excludes: Patterns to ignore.
            :return: First line of text (or None if not found)
            """
            args = ["sudo", "egrep", pattern, "/var/log/ceph/{cluster}.log".format(cluster=cluster_name)]
            for exclude in excludes:
                args.extend([run.Raw("|"), "egrep", "-v", exclude])
            args.extend([run.Raw("|"), "head", "-n", "1"])
            r = mon0_remote.run(stdout=StringIO(), args=args)
            stdout = r.stdout.getvalue()
            if stdout != "":
                return stdout
            return None

        if first_in_ceph_log("\[ERR\]|\[WRN\]|\[SEC\]", config["log_whitelist"]) is not None:
            log.warning("Found errors (ERR|WRN|SEC) in cluster log")
            ctx.summary["success"] = False
            # use the most severe problem as the failure reason
            if "failure_reason" not in ctx.summary:
                for pattern in ["\[SEC\]", "\[ERR\]", "\[WRN\]"]:
                    match = first_in_ceph_log(pattern, config["log_whitelist"])
                    if match is not None:
                        ctx.summary["failure_reason"] = '"{match}" in cluster log'.format(match=match.rstrip("\n"))
                        break

        for remote, dirs in devs_to_clean.iteritems():
            for dir_ in dirs:
                log.info("Unmounting %s on %s" % (dir_, remote))
                try:
                    remote.run(args=["sync", run.Raw("&&"), "sudo", "umount", "-f", dir_])
                except Exception as e:
                    # dump what is holding the mount before propagating
                    remote.run(args=["sudo", run.Raw("PATH=/usr/sbin:$PATH"), "lsof", run.Raw(";"), "ps", "auxf"])
                    raise e

        if config.get("tmpfs_journal"):
            log.info("tmpfs journal enabled - unmounting tmpfs at /mnt")
            for remote, roles_for_host in osds.remotes.iteritems():
                remote.run(args=["sudo", "umount", "-f", "/mnt"], check_status=False)

        if ctx.archive is not None and not (ctx.config.get("archive-on-error") and ctx.summary["success"]):
            # archive mon data, too
            log.info("Archiving mon data...")
            path = os.path.join(ctx.archive, "data")
            try:
                os.makedirs(path)
            except OSError as e:
                if e.errno == errno.EEXIST:
                    pass
                else:
                    raise
            for remote, roles in mons.remotes.iteritems():
                for role in roles:
                    is_mon = teuthology.is_type("mon", cluster_name)
                    if is_mon(role):
                        _, _, id_ = teuthology.split_role(role)
                        mon_dir = "/var/lib/ceph/mon/" + "{0}-{1}".format(cluster_name, id_)
                        teuthology.pull_directory_tarball(remote, mon_dir, path + "/" + role + ".tgz")

        log.info("Cleaning ceph cluster...")
        run.wait(
            ctx.cluster.run(
                args=[
                    "sudo", "rm", "-rf", "--",
                    conf_path, keyring_path, data_dir, monmap_path, osdmap_path,
                    run.Raw("{tdir}/../*.pid".format(tdir=testdir)),
                ],
                wait=False,
            )
        )
def task(ctx, config):
    """
    Exercise OSD backfill.

    Writes data with osd.0 out/down, injects a divergent backfill target
    by blackholing and reviving osd.1 mid-write, then re-adds osd.0 and
    waits for the cluster to go clean.  Requires exactly 3 OSDs.

    :param ctx: teuthology run context
    :param config: task configuration (dict or None)
    """
    config = {} if config is None else config
    assert isinstance(config, dict), \
        'thrashosds task only accepts a dict for configuration'
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon, ) = ctx.cluster.only(first_mon).remotes.iterkeys()

    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
    log.info('num_osds is %s' % num_osds)
    assert num_osds == 3

    mgr = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
    )

    def flush_stats(osd_ids):
        # ask each listed OSD to push its PG stats to the monitors
        for osd_id in osd_ids:
            mgr.raw_cluster_cmd('tell', 'osd.%d' % osd_id, 'flush_pg_stats')

    while len(mgr.get_osd_status()['up']) < 3:
        mgr.sleep(10)
    flush_stats([0, 1, 2])
    mgr.wait_for_clean()

    # write some data
    bench = rados_start(
        ctx, mon,
        ['-p', 'rbd', 'bench', '15', 'write', '-b', '4096', '--no-cleanup'])
    err = bench.wait()
    log.info('err is %d' % err)

    # take osd.0 out of placement to trigger a rebalance/backfill
    mgr.mark_out_osd(0)

    # bring it down as well so it cannot show up in pg_temp mappings
    mgr.kill_osd(0)
    mgr.mark_down_osd(0)

    # wait for peering to settle
    flush_stats([1, 2])
    mgr.wait_for_recovery()

    # write some new data while the next failure gets injected
    bench = rados_start(
        ctx, mon,
        ['-p', 'rbd', 'bench', '30', 'write', '-b', '4096', '--no-cleanup'])

    time.sleep(15)

    # blackhole + restart osd.1 — this creates a divergent backfill target
    mgr.blackhole_kill_osd(1)
    time.sleep(2)
    mgr.revive_osd(1)

    # the in-flight writes must still complete and succeed
    err = bench.wait()
    log.info('err is %d' % err)

    # the cluster has to recover from the osd.1 disruption
    flush_stats([1, 2])
    mgr.wait_for_recovery()

    # re-add osd.0; everything must end up clean
    mgr.revive_osd(0)
    flush_stats([1, 2])
    mgr.wait_for_clean()
def task(ctx, config):
    """
    Test [deep] scrub and repair.

    Writes a little data, corrupts one on-disk object copy, verifies that a
    deep scrub marks the PG inconsistent, then verifies that a repair clears
    the inconsistency.

    :param ctx: teuthology run context
    :param config: task configuration (dict or None)
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'scrub_test task only accepts a dict for configuration'
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()

    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
    log.info('num_osds is %s' % num_osds)

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
    )

    while len(manager.get_osd_status()['up']) < num_osds:
        time.sleep(10)

    for i in range(num_osds):
        manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'flush_pg_stats')
    manager.wait_for_clean()

    # write some data
    p = rados_start(mon, ['-p', 'rbd', 'bench', '1', 'write', '-b', '4096'])
    # NOTE(review): assumes rados_start returns a finished proc with a
    # populated exitstatus — confirm against rados_start's definition
    err = p.exitstatus
    log.info('err is %d' % err)

    # wait for some PG to have data that we can mess with
    victim = None
    osd = None
    while victim is None:
        stats = manager.get_pg_stats()
        for pg in stats:
            size = pg['stat_sum']['num_bytes']
            if size > 0:
                victim = pg['pgid']
                osd = pg['acting'][0]
                break

        if victim is None:
            time.sleep(3)

    log.info('messing with PG %s on osd %d' % (victim, osd))

    (osd_remote,) = ctx.cluster.only('osd.%d' % osd).remotes.iterkeys()
    data_path = os.path.join(
        '/tmp/cephtest/data',
        'osd.{id}.data'.format(id=osd),
        'current',
        '{pg}_head'.format(pg=victim)
    )

    # fuzz time
    ls_fp = StringIO()
    osd_remote.run(
        args=['ls', data_path],
        stdout=ls_fp,
    )
    ls_out = ls_fp.getvalue()
    ls_fp.close()

    # find an object file we can mess with
    # BUGFIX: str.find() returns -1 (truthy) when absent and 0 (falsy) when
    # the match is at position 0, so `if line.find('object'):` selected the
    # wrong lines; use substring containment instead.  Also renamed the
    # local away from the `file` builtin.
    obj_file = None
    for line in ls_out.split('\n'):
        if 'object' in line:
            obj_file = line
            break
    assert obj_file is not None

    log.info('fuzzing %s' % obj_file)

    # put a single \0 at the beginning of the file
    osd_remote.run(
        args=['dd',
              'if=/dev/zero',
              'of=%s' % os.path.join(data_path, obj_file),
              'bs=1', 'count=1', 'conv=notrunc'
              ]
    )

    # scrub, verify inconsistent
    manager.raw_cluster_cmd('pg', 'deep-scrub', victim)
    while True:
        stats = manager.get_single_pg_stats(victim)
        state = stats['state']

        # wait for the scrub to finish
        # BUGFIX: same str.find() truthiness bug — the old test was true
        # whenever 'scrubbing' was NOT at the start of the state string.
        if 'scrubbing' in state:
            time.sleep(3)
            continue

        inconsistent = stats['state'].find('+inconsistent') != -1
        assert inconsistent
        break

    # repair, verify no longer inconsistent
    manager.raw_cluster_cmd('pg', 'repair', victim)
    while True:
        stats = manager.get_single_pg_stats(victim)
        state = stats['state']

        # wait for the repair scrub to finish (same fix as above)
        if 'scrubbing' in state:
            time.sleep(3)
            continue

        inconsistent = stats['state'].find('+inconsistent') != -1
        assert not inconsistent
        break

    log.info('test successful!')
def test_incomplete_pgs(ctx, config):
    """
    Exercise recovery of partially-backfilled PGs.

    Moves data between two pairs of OSDs, interrupts the resulting
    backfill by stopping the target OSDs, then revives everything and
    checks that the cluster ends up clean.  Requires exactly 4 osds.
    """
    testdir = teuthology.get_testdir(ctx)
    config = {} if config is None else config
    assert isinstance(config, dict), \
        'task only accepts a dict for configuration'
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()

    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
    log.info('num_osds is %s' % num_osds)
    assert num_osds == 4

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
    )

    while len(manager.get_osd_status()['up']) < 4:
        time.sleep(10)

    manager.flush_pg_stats([0, 1, 2, 3])
    manager.wait_for_clean()

    log.info('Testing incomplete pgs...')

    # slow recovery way down so PGs can be caught mid-backfill
    for osd_id in range(4):
        manager.set_config(osd_id, osd_recovery_delay_start=1000)

    # move data off of osd.0, osd.1
    manager.raw_cluster_cmd('osd', 'out', '0', '1')
    manager.flush_pg_stats([0, 1, 2, 3], [0, 1])
    manager.wait_for_clean()

    # lots of objects in rbd (no pg log, will backfill)
    proc = rados_start(
        testdir, mon,
        ['-p', 'rbd', 'bench', '20', 'write', '-b', '1', '--no-cleanup'])
    proc.wait()

    # few objects in rbd pool (with pg log, normal recovery)
    for n in range(1, 20):
        proc = rados_start(
            testdir, mon,
            ['-p', 'rbd', 'put', 'foo.%d' % n, '/etc/passwd'])
        proc.wait()

    # move it back
    manager.raw_cluster_cmd('osd', 'in', '0', '1')
    manager.raw_cluster_cmd('osd', 'out', '2', '3')
    time.sleep(10)
    manager.flush_pg_stats([0, 1, 2, 3], [2, 3])
    time.sleep(10)
    manager.wait_for_active()

    # recovery must still be in flight at this point
    assert not manager.is_clean()
    assert not manager.is_recovered()

    # kill 2 + 3
    log.info('stopping 2,3')
    manager.kill_osd(2)
    manager.kill_osd(3)
    log.info('...')
    manager.raw_cluster_cmd('osd', 'down', '2', '3')
    manager.flush_pg_stats([0, 1])
    manager.wait_for_active_or_down()

    assert manager.get_num_down() > 0

    # revive 2 + 3
    manager.revive_osd(2)
    manager.revive_osd(3)
    while len(manager.get_osd_status()['up']) < 4:
        log.info('waiting a bit...')
        time.sleep(2)
    log.info('all are up!')

    for osd_id in range(4):
        manager.kick_recovery_wq(osd_id)

    # cluster must recover
    manager.wait_for_clean()
def cluster(ctx, config):
    """
    Create a ceph cluster under /tmp/cephtest, yield while tests run, then
    check the cluster log for errors and tear everything down.

    Startup: create data dirs, map scratch devices/journals to osds,
    generate and push ceph.conf, create keyrings and the monmap/osdmap,
    mkfs the mon and osd stores.  Teardown (in the finally block): scan the
    cluster log for ERR/WRN/SEC, unmount osd data dirs, archive mon data,
    and remove all generated files.

    :param ctx: Context
    :param config: Configuration
    """
    log.info('Creating ceph cluster...')
    run.wait(
        ctx.cluster.run(
            args=[
                'install', '-d', '-m0755', '--',
                '/tmp/cephtest/data',
            ],
            wait=False,
        )
    )

    # Map scratch devices and journal backing (block device or tmpfs file)
    # to osd roles, per remote host.
    devs_to_clean = {}
    remote_to_roles_to_devs = {}
    remote_to_roles_to_journals = {}
    osds = ctx.cluster.only(teuthology.is_type('osd'))
    for remote, roles_for_host in osds.remotes.iteritems():
        devs = teuthology.get_scratch_devices(remote)
        roles_to_devs = {}
        roles_to_journals = {}
        if config.get('fs'):
            log.info('fs option selected, checkin for scratch devs')
            log.info('found devs: %s' % (str(devs),))
            roles_to_devs = assign_devs(
                teuthology.roles_of_type(roles_for_host, 'osd'), devs
            )
            # leftover devices (beyond the osd count) stay available for
            # journal assignment below
            if len(roles_to_devs) < len(devs):
                devs = devs[len(roles_to_devs):]
            log.info('dev map: %s' % (str(roles_to_devs),))
        devs_to_clean[remote] = []
        if config.get('block_journal'):
            log.info('block journal enabled')
            roles_to_journals = assign_devs(
                teuthology.roles_of_type(roles_for_host, 'osd'), devs
            )
            log.info('journal map: %s', roles_to_journals)
        if config.get('tmpfs_journal'):
            log.info('tmpfs journal enabled')
            roles_to_journals = {}
            remote.run(args=['sudo', 'mount', '-t', 'tmpfs', 'tmpfs', '/mnt'])
            for osd in teuthology.roles_of_type(roles_for_host, 'osd'):
                tmpfs = '/mnt/osd.%s' % osd
                roles_to_journals[osd] = tmpfs
                remote.run(args=['truncate', '-s', '1500M', tmpfs])
            log.info('journal map: %s', roles_to_journals)
        remote_to_roles_to_devs[remote] = roles_to_devs
        remote_to_roles_to_journals[remote] = roles_to_journals

    # Build ceph.conf from the role/ip layout, add per-osd journal paths,
    # then apply overrides from the task config.
    log.info('Generating config...')
    remotes_and_roles = ctx.cluster.remotes.items()
    roles = [roles for (remote, roles) in remotes_and_roles]
    ips = [host for (host, port) in (remote.ssh.get_transport().getpeername() for (remote, roles) in remotes_and_roles)]
    conf = teuthology.skeleton_config(roles=roles, ips=ips)
    for remote, roles_to_journals in remote_to_roles_to_journals.iteritems():
        for role, journal in roles_to_journals.iteritems():
            key = "osd." + str(role)
            if key not in conf:
                conf[key] = {}
            conf[key]['osd journal'] = journal
    for section, keys in config['conf'].iteritems():
        for key, value in keys.iteritems():
            log.info("[%s] %s = %s" % (section, key, value))
            if section not in conf:
                conf[section] = {}
            conf[section][key] = value

    if config.get('tmpfs_journal'):
        # direct I/O does not work on tmpfs-backed journals
        conf['journal dio'] = False

    ctx.ceph = argparse.Namespace()
    ctx.ceph.conf = conf

    # Stream the generated conf to every remote over stdin.
    log.info('Writing configs...')
    conf_fp = StringIO()
    conf.write(conf_fp)
    conf_fp.seek(0)
    writes = ctx.cluster.run(
        args=[
            'python',
            '-c',
            'import shutil, sys; shutil.copyfileobj(sys.stdin, file(sys.argv[1], "wb"))',
            '/tmp/cephtest/ceph.conf',
        ],
        stdin=run.PIPE,
        wait=False,
    )
    teuthology.feed_many_stdins_and_close(conf_fp, writes)
    run.wait(writes)

    coverage_dir = '/tmp/cephtest/archive/coverage'

    firstmon = teuthology.get_first_mon(ctx, config)

    # Create the cluster keyring and mon. key on the first monitor.
    log.info('Setting up %s...' % firstmon)
    ctx.cluster.only(firstmon).run(
        args=[
            '/tmp/cephtest/enable-coredump',
            '/tmp/cephtest/binary/usr/local/bin/ceph-coverage',
            coverage_dir,
            '/tmp/cephtest/binary/usr/local/bin/ceph-authtool',
            '--create-keyring',
            '/tmp/cephtest/ceph.keyring',
        ],
    )
    ctx.cluster.only(firstmon).run(
        args=[
            '/tmp/cephtest/enable-coredump',
            '/tmp/cephtest/binary/usr/local/bin/ceph-coverage',
            coverage_dir,
            '/tmp/cephtest/binary/usr/local/bin/ceph-authtool',
            '--gen-key',
            '--name=mon.',
            '/tmp/cephtest/ceph.keyring',
        ],
    )
    (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
    teuthology.create_simple_monmap(
        remote=mon0_remote,
        conf=conf,
    )
    # NOTE(review): mds gets plain 'allow' here while mon/osd get 'allow *'
    # — confirm whether that asymmetry is intentional.
    log.info('Creating admin key on %s...' % firstmon)
    ctx.cluster.only(firstmon).run(
        args=[
            '/tmp/cephtest/enable-coredump',
            '/tmp/cephtest/binary/usr/local/bin/ceph-coverage',
            coverage_dir,
            '/tmp/cephtest/binary/usr/local/bin/ceph-authtool',
            '--gen-key',
            '--name=client.admin',
            '--set-uid=0',
            '--cap', 'mon', 'allow *',
            '--cap', 'osd', 'allow *',
            '--cap', 'mds', 'allow',
            '/tmp/cephtest/ceph.keyring',
        ],
    )
    # Distribute the keyring and initial monmap to every remote.
    log.info('Copying monmap to all nodes...')
    keyring = teuthology.get_file(
        remote=mon0_remote,
        path='/tmp/cephtest/ceph.keyring',
    )
    monmap = teuthology.get_file(
        remote=mon0_remote,
        path='/tmp/cephtest/monmap',
    )

    for rem in ctx.cluster.remotes.iterkeys():
        # copy mon key and initial monmap
        log.info('Sending monmap to node {remote}'.format(remote=rem))
        teuthology.write_file(
            remote=rem,
            path='/tmp/cephtest/ceph.keyring',
            data=keyring,
        )
        teuthology.write_file(
            remote=rem,
            path='/tmp/cephtest/monmap',
            data=monmap,
        )

    log.info('Setting up mon nodes...')
    mons = ctx.cluster.only(teuthology.is_type('mon'))
    run.wait(
        mons.run(
            args=[
                '/tmp/cephtest/enable-coredump',
                '/tmp/cephtest/binary/usr/local/bin/ceph-coverage',
                coverage_dir,
                '/tmp/cephtest/binary/usr/local/bin/osdmaptool',
                '--clobber',
                '--createsimple', '{num:d}'.format(
                    num=teuthology.num_instances_of_type(ctx.cluster, 'osd'),
                ),
                '/tmp/cephtest/osdmap',
                '--pg_bits', '2',
                '--pgp_bits', '4',
            ],
            wait=False,
        ),
    )

    # Generate a keyring for every osd daemon.
    log.info('Setting up osd nodes...')
    for remote, roles_for_host in osds.remotes.iteritems():
        for id_ in teuthology.roles_of_type(roles_for_host, 'osd'):
            remote.run(
                args=[
                    '/tmp/cephtest/enable-coredump',
                    '/tmp/cephtest/binary/usr/local/bin/ceph-coverage',
                    coverage_dir,
                    '/tmp/cephtest/binary/usr/local/bin/ceph-authtool',
                    '--create-keyring',
                    '--gen-key',
                    '--name=osd.{id}'.format(id=id_),
                    '/tmp/cephtest/data/osd.{id}.keyring'.format(id=id_),
                ],
            )

    # Generate a keyring for every mds daemon.
    log.info('Setting up mds nodes...')
    mdss = ctx.cluster.only(teuthology.is_type('mds'))
    for remote, roles_for_host in mdss.remotes.iteritems():
        for id_ in teuthology.roles_of_type(roles_for_host, 'mds'):
            remote.run(
                args=[
                    '/tmp/cephtest/enable-coredump',
                    '/tmp/cephtest/binary/usr/local/bin/ceph-coverage',
                    coverage_dir,
                    '/tmp/cephtest/binary/usr/local/bin/ceph-authtool',
                    '--create-keyring',
                    '--gen-key',
                    '--name=mds.{id}'.format(id=id_),
                    '/tmp/cephtest/data/mds.{id}.keyring'.format(id=id_),
                ],
            )

    # Generate a keyring for every client.
    log.info('Setting up client nodes...')
    clients = ctx.cluster.only(teuthology.is_type('client'))
    for remote, roles_for_host in clients.remotes.iteritems():
        for id_ in teuthology.roles_of_type(roles_for_host, 'client'):
            remote.run(
                args=[
                    '/tmp/cephtest/enable-coredump',
                    '/tmp/cephtest/binary/usr/local/bin/ceph-coverage',
                    coverage_dir,
                    '/tmp/cephtest/binary/usr/local/bin/ceph-authtool',
                    '--create-keyring',
                    '--gen-key',
                    # TODO this --name= is not really obeyed, all unknown "types" are munged to "client"
                    '--name=client.{id}'.format(id=id_),
                    '/tmp/cephtest/data/client.{id}.keyring'.format(id=id_),
                ],
            )

    # Gather all generated keys so they can be registered on the mons.
    log.info('Reading keys from all nodes...')
    keys_fp = StringIO()
    keys = []
    for remote, roles_for_host in ctx.cluster.remotes.iteritems():
        for type_ in ['osd', 'mds', 'client']:
            for id_ in teuthology.roles_of_type(roles_for_host, type_):
                data = teuthology.get_file(
                    remote=remote,
                    path='/tmp/cephtest/data/{type}.{id}.keyring'.format(
                        type=type_,
                        id=id_,
                    ),
                )
                keys.append((type_, id_, data))
                keys_fp.write(data)

    # Append all keys to each mon's keyring, then set caps per entity.
    log.info('Adding keys to all mons...')
    writes = mons.run(
        args=[
            'cat',
            run.Raw('>>'),
            '/tmp/cephtest/ceph.keyring',
        ],
        stdin=run.PIPE,
        wait=False,
    )
    keys_fp.seek(0)
    teuthology.feed_many_stdins_and_close(keys_fp, writes)
    run.wait(writes)
    for type_, id_, data in keys:
        run.wait(
            mons.run(
                args=[
                    '/tmp/cephtest/enable-coredump',
                    '/tmp/cephtest/binary/usr/local/bin/ceph-coverage',
                    coverage_dir,
                    '/tmp/cephtest/binary/usr/local/bin/ceph-authtool',
                    '/tmp/cephtest/ceph.keyring',
                    '--name={type}.{id}'.format(
                        type=type_,
                        id=id_,
                    ),
                ] + list(teuthology.generate_caps(type_)),
                wait=False,
            ),
        )

    log.info('Running mkfs on mon nodes...')
    for remote, roles_for_host in mons.remotes.iteritems():
        for id_ in teuthology.roles_of_type(roles_for_host, 'mon'):
            remote.run(
                args=[
                    '/tmp/cephtest/enable-coredump',
                    '/tmp/cephtest/binary/usr/local/bin/ceph-coverage',
                    coverage_dir,
                    '/tmp/cephtest/binary/usr/local/bin/ceph-mon',
                    '--mkfs',
                    '-i', id_,
                    '-c', '/tmp/cephtest/ceph.conf',
                    '--monmap=/tmp/cephtest/monmap',
                    '--osdmap=/tmp/cephtest/osdmap',
                    '--keyring=/tmp/cephtest/ceph.keyring',
                ],
            )

    log.info('Running mkfs on osd nodes...')
    for remote, roles_for_host in osds.remotes.iteritems():
        roles_to_devs = remote_to_roles_to_devs[remote]
        roles_to_journals = remote_to_roles_to_journals[remote]
        # NOTE(review): re-assigned on every loop iteration; harmless since
        # the same dicts go in each time, but looks unintentional.
        ctx.disk_config = argparse.Namespace()
        ctx.disk_config.remote_to_roles_to_dev = remote_to_roles_to_devs
        ctx.disk_config.remote_to_roles_to_journals = remote_to_roles_to_journals
        for id_ in teuthology.roles_of_type(roles_for_host, 'osd'):
            log.info(str(roles_to_journals))
            log.info(id_)
            remote.run(
                args=[
                    'mkdir',
                    os.path.join('/tmp/cephtest/data', 'osd.{id}.data'.format(id=id_)),
                ],
            )
            if roles_to_devs.get(id_):
                # This osd got a scratch device: mkfs it with fs-specific
                # defaults and mount it at the osd data dir.
                dev = roles_to_devs[id_]
                fs = config.get('fs')
                package = None
                mkfs_options = config.get('mkfs_options')
                mount_options = config.get('mount_options')
                if fs == 'btrfs':
                    package = 'btrfs-tools'
                    if mount_options is None:
                        mount_options = ['noatime', 'user_subvol_rm_allowed']
                    if mkfs_options is None:
                        mkfs_options = ['-m', 'single', '-l', '32768', '-n', '32768']
                if fs == 'xfs':
                    package = 'xfsprogs'
                    if mount_options is None:
                        mount_options = ['noatime']
                    if mkfs_options is None:
                        mkfs_options = ['-f', '-i', 'size=2048']
                if fs == 'ext4' or fs == 'ext3':
                    if mount_options is None:
                        mount_options = ['noatime', 'user_xattr']

                if mount_options is None:
                    mount_options = []
                if mkfs_options is None:
                    mkfs_options = []
                mkfs = ['mkfs.%s' % fs] + mkfs_options
                log.info('%s on %s on %s' % (mkfs, dev, remote))
                if package is not None:
                    remote.run(
                        args=[
                            'sudo',
                            'apt-get', 'install', '-y', package
                        ]
                    )
                # `yes |` answers any overwrite prompt from mkfs
                remote.run(args=['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev])
                log.info('mount %s on %s -o %s' % (dev, remote, ','.join(mount_options)))
                remote.run(
                    args=[
                        'sudo',
                        'mount',
                        '-t', fs,
                        '-o', ','.join(mount_options),
                        dev,
                        os.path.join('/tmp/cephtest/data', 'osd.{id}.data'.format(id=id_)),
                    ]
                )
                remote.run(
                    args=[
                        'sudo', 'chown', '-R', 'ubuntu.ubuntu',
                        os.path.join('/tmp/cephtest/data', 'osd.{id}.data'.format(id=id_))
                    ]
                )
                remote.run(
                    args=[
                        'sudo', 'chmod', '-R', '755',
                        os.path.join('/tmp/cephtest/data', 'osd.{id}.data'.format(id=id_))
                    ]
                )
                # remember the mount for teardown
                devs_to_clean[remote].append(
                    os.path.join(
                        '/tmp/cephtest/data',
                        'osd.{id}.data'.format(id=id_)
                    )
                )

        for id_ in teuthology.roles_of_type(roles_for_host, 'osd'):
            remote.run(
                args=[
                    '/tmp/cephtest/enable-coredump',
                    '/tmp/cephtest/binary/usr/local/bin/ceph-coverage',
                    coverage_dir,
                    '/tmp/cephtest/binary/usr/local/bin/ceph-osd',
                    '--mkfs',
                    '-i', id_,
                    '-c', '/tmp/cephtest/ceph.conf',
                    '--monmap', '/tmp/cephtest/monmap',
                ],
            )

    # The monmap/osdmap seed files are no longer needed once mkfs is done.
    run.wait(
        mons.run(
            args=[
                'rm',
                '--',
                '/tmp/cephtest/monmap',
                '/tmp/cephtest/osdmap',
            ],
            wait=False,
        ),
    )

    try:
        yield
    finally:
        # Teardown: log scan, unmounts, archiving, and file cleanup.
        (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()

        log.info('Checking cluster log for badness...')

        def first_in_ceph_log(pattern, excludes):
            # Return the first cluster-log line matching `pattern` and not
            # matching any of `excludes`, or None when nothing matches.
            args = [
                'egrep', pattern,
                '/tmp/cephtest/archive/log/cluster.%s.log' % firstmon,
            ]
            for exclude in excludes:
                args.extend([run.Raw('|'), 'egrep', '-v', exclude])
            args.extend([
                run.Raw('|'),
                'head', '-n', '1',
            ])
            r = mon0_remote.run(
                stdout=StringIO(),
                args=args,
            )
            stdout = r.stdout.getvalue()
            if stdout != '':
                return stdout
            return None

        if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]', config['log_whitelist']) is not None:
            log.warning('Found errors (ERR|WRN|SEC) in cluster log')
            ctx.summary['success'] = False
            # use the most severe problem as the failure reason
            if 'failure_reason' not in ctx.summary:
                for pattern in ['\[SEC\]', '\[ERR\]', '\[WRN\]']:
                    match = first_in_ceph_log(pattern, config['log_whitelist'])
                    if match is not None:
                        ctx.summary['failure_reason'] = \
                            '"{match}" in cluster log'.format(
                                match=match.rstrip('\n'),
                            )
                        break

        for remote, dirs in devs_to_clean.iteritems():
            for dir_ in dirs:
                log.info('Unmounting %s on %s' % (dir_, remote))
                remote.run(
                    args=[
                        'sync',
                        run.Raw('&&'),
                        'sudo', 'umount', '-f', dir_
                    ]
                )

        if config.get('tmpfs_journal'):
            log.info('tmpfs journal enabled - unmounting tmpfs at /mnt')
            for remote, roles_for_host in osds.remotes.iteritems():
                remote.run(
                    args=['sudo', 'umount', '-f', '/mnt'],
                    check_status=False,
                )

        if ctx.archive is not None:
            # archive mon data, too
            # NOTE(review): os.makedirs raises if the dir already exists —
            # newer variants of this task guard against errno.EEXIST.
            log.info('Archiving mon data...')
            path = os.path.join(ctx.archive, 'data')
            os.makedirs(path)
            for remote, roles in mons.remotes.iteritems():
                for role in roles:
                    if role.startswith('mon.'):
                        teuthology.pull_directory_tarball(
                            remote,
                            '/tmp/cephtest/data/%s' % role,
                            path + '/' + role + '.tgz')

        log.info('Cleaning ceph cluster...')
        run.wait(
            ctx.cluster.run(
                args=[
                    'rm',
                    '-rf',
                    '--',
                    '/tmp/cephtest/ceph.conf',
                    '/tmp/cephtest/ceph.keyring',
                    '/tmp/cephtest/data',
                    '/tmp/cephtest/monmap',
                    run.Raw('/tmp/cephtest/asok.*')
                ],
                wait=False,
            ),
        )
def cluster(ctx, config):
    """
    Handle the creation and removal of a ceph cluster.

    On startup:
        Create directories needed for the cluster.
        Create remote journals for all osds.
        Create and set keyring.
        Copy the monmap to tht test systems.
        Setup mon nodes.
        Setup mds nodes.
        Mkfs osd nodes.
        Add keyring information to monmaps
        Mkfs mon nodes.

    On exit:
        If errors occured, extract a failure message and store in ctx.summary.
        Unmount all test files and temporary journaling files.
        Save the monitor information and archive all ceph logs.
        Cleanup the keyring setup, and remove all monitor map and data files
        left over.

    :param ctx: Context
    :param config: Configuration
    """
    if ctx.config.get('use_existing_cluster', False) is True:
        log.info("'use_existing_cluster' is true; skipping cluster creation")
        yield
        # BUGFIX: without this return, the generator resumes past the yield at
        # teardown and runs the whole cluster-creation sequence below.
        return

    testdir = teuthology.get_testdir(ctx)
    cluster_name = config['cluster']
    data_dir = '{tdir}/{cluster}.data'.format(tdir=testdir, cluster=cluster_name)
    log.info('Creating ceph cluster %s...', cluster_name)
    run.wait(
        ctx.cluster.run(
            args=['install', '-d', '-m0755', '--', data_dir],
            wait=False,
        ))

    run.wait(
        ctx.cluster.run(
            args=['sudo', 'install', '-d', '-m0777', '--', '/var/run/ceph'],
            wait=False,
        ))

    # Map scratch devices / journals to osd roles, per remote.
    devs_to_clean = {}
    remote_to_roles_to_devs = {}
    remote_to_roles_to_journals = {}
    osds = ctx.cluster.only(teuthology.is_type('osd', cluster_name))
    for remote, roles_for_host in osds.remotes.iteritems():
        devs = teuthology.get_scratch_devices(remote)
        roles_to_devs = {}
        roles_to_journals = {}
        if config.get('fs'):
            log.info('fs option selected, checking for scratch devs')
            log.info('found devs: %s' % (str(devs),))
            devs_id_map = teuthology.get_wwn_id_map(remote, devs)
            iddevs = devs_id_map.values()
            roles_to_devs = assign_devs(
                teuthology.cluster_roles_of_type(roles_for_host, 'osd',
                                                 cluster_name), iddevs)
            if len(roles_to_devs) < len(iddevs):
                iddevs = iddevs[len(roles_to_devs):]
            devs_to_clean[remote] = []

        if config.get('block_journal'):
            log.info('block journal enabled')
            roles_to_journals = assign_devs(
                teuthology.cluster_roles_of_type(roles_for_host, 'osd',
                                                 cluster_name), iddevs)
            log.info('journal map: %s', roles_to_journals)

        if config.get('tmpfs_journal'):
            log.info('tmpfs journal enabled')
            roles_to_journals = {}
            remote.run(args=['sudo', 'mount', '-t', 'tmpfs', 'tmpfs', '/mnt'])
            for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd',
                                                         cluster_name):
                tmpfs = '/mnt/' + role
                roles_to_journals[role] = tmpfs
                remote.run(args=['truncate', '-s', '1500M', tmpfs])
            log.info('journal map: %s', roles_to_journals)

        log.info('dev map: %s' % (str(roles_to_devs),))
        remote_to_roles_to_devs[remote] = roles_to_devs
        remote_to_roles_to_journals[remote] = roles_to_journals

    log.info('Generating config...')
    remotes_and_roles = ctx.cluster.remotes.items()
    roles = [role_list for (remote, role_list) in remotes_and_roles]
    ips = [host for (host, port) in
           (remote.ssh.get_transport().getpeername()
            for (remote, role_list) in remotes_and_roles)]
    conf = teuthology.skeleton_config(ctx, roles=roles, ips=ips,
                                      cluster=cluster_name)
    for remote, roles_to_journals in remote_to_roles_to_journals.iteritems():
        for role, journal in roles_to_journals.iteritems():
            name = teuthology.ceph_role(role)
            if name not in conf:
                conf[name] = {}
            conf[name]['osd journal'] = journal
    for section, keys in config['conf'].iteritems():
        for key, value in keys.iteritems():
            log.info("[%s] %s = %s" % (section, key, value))
            if section not in conf:
                conf[section] = {}
            conf[section][key] = value

    if config.get('tmpfs_journal'):
        # NOTE(review): this writes a top-level key, not an [osd] section
        # entry — confirm this is how skeleton_config's conf expects it.
        conf['journal dio'] = False

    if not hasattr(ctx, 'ceph'):
        ctx.ceph = {}
    ctx.ceph[cluster_name] = argparse.Namespace()
    ctx.ceph[cluster_name].conf = conf

    default_keyring = '/etc/ceph/{cluster}.keyring'.format(
        cluster=cluster_name)
    keyring_path = config.get('keyring_path', default_keyring)

    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)

    firstmon = teuthology.get_first_mon(ctx, config, cluster_name)

    log.info('Setting up %s...' % firstmon)
    ctx.cluster.only(firstmon).run(
        args=[
            'sudo', 'adjust-ulimits', 'ceph-coverage', coverage_dir,
            'ceph-authtool', '--create-keyring', keyring_path,
        ],
    )
    ctx.cluster.only(firstmon).run(
        args=[
            'sudo', 'adjust-ulimits', 'ceph-coverage', coverage_dir,
            'ceph-authtool', '--gen-key', '--name=mon.', keyring_path,
        ],
    )
    ctx.cluster.only(firstmon).run(
        args=['sudo', 'chmod', '0644', keyring_path],
    )
    (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
    monmap_path = '{tdir}/{cluster}.monmap'.format(tdir=testdir,
                                                   cluster=cluster_name)
    fsid = teuthology.create_simple_monmap(
        ctx,
        remote=mon0_remote,
        conf=conf,
        path=monmap_path,
    )
    if 'global' not in conf:
        conf['global'] = {}
    conf['global']['fsid'] = fsid

    default_conf_path = '/etc/ceph/{cluster}.conf'.format(cluster=cluster_name)
    conf_path = config.get('conf_path', default_conf_path)
    log.info('Writing %s for FSID %s...' % (conf_path, fsid))
    write_conf(ctx, conf_path, cluster_name)

    log.info('Creating admin key on %s...' % firstmon)
    ctx.cluster.only(firstmon).run(
        args=[
            'sudo', 'adjust-ulimits', 'ceph-coverage', coverage_dir,
            'ceph-authtool', '--gen-key', '--name=client.admin',
            '--set-uid=0',
            '--cap', 'mon', 'allow *',
            '--cap', 'osd', 'allow *',
            '--cap', 'mds', 'allow *',
            keyring_path,
        ],
    )

    log.info('Copying monmap to all nodes...')
    keyring = teuthology.get_file(
        remote=mon0_remote,
        path=keyring_path,
    )
    monmap = teuthology.get_file(
        remote=mon0_remote,
        path=monmap_path,
    )

    for rem in ctx.cluster.remotes.iterkeys():
        # copy mon key and initial monmap
        log.info('Sending monmap to node {remote}'.format(remote=rem))
        teuthology.sudo_write_file(remote=rem, path=keyring_path,
                                   data=keyring, perms='0644')
        teuthology.write_file(
            remote=rem,
            path=monmap_path,
            data=monmap,
        )

    log.info('Setting up mon nodes...')
    mons = ctx.cluster.only(teuthology.is_type('mon', cluster_name))
    osdmap_path = '{tdir}/{cluster}.osdmap'.format(tdir=testdir,
                                                   cluster=cluster_name)
    run.wait(
        mons.run(
            args=[
                'adjust-ulimits', 'ceph-coverage', coverage_dir,
                'osdmaptool', '-c', conf_path, '--clobber', '--createsimple',
                '{num:d}'.format(num=teuthology.num_instances_of_type(
                    ctx.cluster, 'osd', cluster_name)),
                osdmap_path,
                '--pg_bits', '2',
                '--pgp_bits', '4',
            ],
            wait=False,
        ),
    )

    log.info('Setting up mds nodes...')
    mdss = ctx.cluster.only(teuthology.is_type('mds', cluster_name))
    for remote, roles_for_host in mdss.remotes.iteritems():
        for role in teuthology.cluster_roles_of_type(roles_for_host, 'mds',
                                                     cluster_name):
            _, _, id_ = teuthology.split_role(role)
            mds_dir = '/var/lib/ceph/mds/{cluster}-{id}'.format(
                cluster=cluster_name,
                id=id_,
            )
            remote.run(
                args=[
                    'sudo', 'mkdir', '-p', mds_dir,
                    run.Raw('&&'),
                    'sudo', 'adjust-ulimits', 'ceph-coverage', coverage_dir,
                    'ceph-authtool', '--create-keyring', '--gen-key',
                    '--name=mds.{id}'.format(id=id_),
                    mds_dir + '/keyring',
                ],
            )

    cclient.create_keyring(ctx, cluster_name)
    log.info('Running mkfs on osd nodes...')

    # Lazily create ctx.disk_config attributes so multiple clusters can share.
    if not hasattr(ctx, 'disk_config'):
        ctx.disk_config = argparse.Namespace()
    if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev'):
        ctx.disk_config.remote_to_roles_to_dev = {}
    if not hasattr(ctx.disk_config, 'remote_to_roles_to_journals'):
        ctx.disk_config.remote_to_roles_to_journals = {}
    if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev_mount_options'):
        ctx.disk_config.remote_to_roles_to_dev_mount_options = {}
    if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev_fstype'):
        ctx.disk_config.remote_to_roles_to_dev_fstype = {}

    teuthology.deep_merge(ctx.disk_config.remote_to_roles_to_dev,
                          remote_to_roles_to_devs)
    teuthology.deep_merge(ctx.disk_config.remote_to_roles_to_journals,
                          remote_to_roles_to_journals)

    log.info("ctx.disk_config.remote_to_roles_to_dev: {r}".format(
        r=str(ctx.disk_config.remote_to_roles_to_dev)))
    for remote, roles_for_host in osds.remotes.iteritems():
        roles_to_devs = remote_to_roles_to_devs[remote]
        roles_to_journals = remote_to_roles_to_journals[remote]

        for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd',
                                                     cluster_name):
            _, _, id_ = teuthology.split_role(role)
            mnt_point = '/var/lib/ceph/osd/{cluster}-{id}'.format(
                cluster=cluster_name, id=id_)
            remote.run(args=['sudo', 'mkdir', '-p', mnt_point])
            log.info(str(roles_to_journals))
            log.info(role)
            if roles_to_devs.get(role):
                dev = roles_to_devs[role]
                fs = config.get('fs')
                package = None
                mkfs_options = config.get('mkfs_options')
                mount_options = config.get('mount_options')
                if fs == 'btrfs':
                    # package = 'btrfs-tools'
                    if mount_options is None:
                        mount_options = ['noatime', 'user_subvol_rm_allowed']
                    if mkfs_options is None:
                        mkfs_options = ['-m', 'single', '-l', '32768',
                                        '-n', '32768']
                if fs == 'xfs':
                    # package = 'xfsprogs'
                    if mount_options is None:
                        mount_options = ['noatime']
                    if mkfs_options is None:
                        mkfs_options = ['-f', '-i', 'size=2048']
                if fs == 'ext4' or fs == 'ext3':
                    if mount_options is None:
                        mount_options = ['noatime', 'user_xattr']

                if mount_options is None:
                    mount_options = []
                if mkfs_options is None:
                    mkfs_options = []
                mkfs = ['mkfs.%s' % fs] + mkfs_options
                log.info('%s on %s on %s' % (mkfs, dev, remote))
                if package is not None:
                    remote.run(
                        args=['sudo', 'apt-get', 'install', '-y', package],
                        stdout=StringIO(),
                    )

                try:
                    remote.run(args=['yes', run.Raw('|')] + ['sudo'] + mkfs +
                               [dev])
                except run.CommandFailedError:
                    # Newer btfs-tools doesn't prompt for overwrite, use -f
                    # BUGFIX: the membership test used to check mount_options
                    # while appending to mkfs_options, so a retry could add a
                    # duplicate -f; check the list we actually modify.
                    if '-f' not in mkfs_options:
                        mkfs_options.append('-f')
                        mkfs = ['mkfs.%s' % fs] + mkfs_options
                        log.info('%s on %s on %s' % (mkfs, dev, remote))
                    remote.run(args=['yes', run.Raw('|')] + ['sudo'] + mkfs +
                               [dev])

                log.info('mount %s on %s -o %s' % (dev, remote,
                                                   ','.join(mount_options)))
                remote.run(
                    args=[
                        'sudo', 'mount', '-t', fs, '-o',
                        ','.join(mount_options),
                        dev, mnt_point,
                    ])
                if remote not in ctx.disk_config.remote_to_roles_to_dev_mount_options:
                    ctx.disk_config.remote_to_roles_to_dev_mount_options[
                        remote] = {}
                ctx.disk_config.remote_to_roles_to_dev_mount_options[remote][
                    role] = mount_options
                if remote not in ctx.disk_config.remote_to_roles_to_dev_fstype:
                    ctx.disk_config.remote_to_roles_to_dev_fstype[remote] = {}
                ctx.disk_config.remote_to_roles_to_dev_fstype[remote][
                    role] = fs
                devs_to_clean[remote].append(mnt_point)

        for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd',
                                                     cluster_name):
            _, _, id_ = teuthology.split_role(role)
            remote.run(
                args=[
                    'sudo', 'MALLOC_CHECK_=3', 'adjust-ulimits',
                    'ceph-coverage', coverage_dir,
                    'ceph-osd', '--cluster', cluster_name,
                    '--mkfs', '--mkkey', '-i', id_,
                    '--monmap', monmap_path,
                ],
            )

    log.info('Reading keys from all nodes...')
    keys_fp = StringIO()
    keys = []
    for remote, roles_for_host in ctx.cluster.remotes.iteritems():
        for type_ in ['mds', 'osd']:
            for role in teuthology.cluster_roles_of_type(roles_for_host,
                                                         type_, cluster_name):
                _, _, id_ = teuthology.split_role(role)
                data = teuthology.get_file(
                    remote=remote,
                    path='/var/lib/ceph/{type}/{cluster}-{id}/keyring'.format(
                        type=type_,
                        id=id_,
                        cluster=cluster_name,
                    ),
                    sudo=True,
                )
                keys.append((type_, id_, data))
                keys_fp.write(data)
    for remote, roles_for_host in ctx.cluster.remotes.iteritems():
        for role in teuthology.cluster_roles_of_type(roles_for_host, 'client',
                                                     cluster_name):
            _, _, id_ = teuthology.split_role(role)
            data = teuthology.get_file(
                remote=remote,
                path='/etc/ceph/{cluster}.client.{id}.keyring'.format(
                    id=id_, cluster=cluster_name))
            keys.append(('client', id_, data))
            keys_fp.write(data)

    log.info('Adding keys to all mons...')
    writes = mons.run(
        args=['sudo', 'tee', '-a', keyring_path],
        stdin=run.PIPE,
        wait=False,
        stdout=StringIO(),
    )
    keys_fp.seek(0)
    teuthology.feed_many_stdins_and_close(keys_fp, writes)
    run.wait(writes)
    for type_, id_, data in keys:
        run.wait(
            mons.run(
                args=[
                    'sudo', 'adjust-ulimits', 'ceph-coverage', coverage_dir,
                    'ceph-authtool', keyring_path,
                    '--name={type}.{id}'.format(
                        type=type_,
                        id=id_,
                    ),
                ] + list(teuthology.generate_caps(type_)),
                wait=False,
            ),
        )

    log.info('Running mkfs on mon nodes...')
    for remote, roles_for_host in mons.remotes.iteritems():
        for role in teuthology.cluster_roles_of_type(roles_for_host, 'mon',
                                                     cluster_name):
            _, _, id_ = teuthology.split_role(role)
            remote.run(
                args=[
                    'sudo', 'mkdir', '-p',
                    '/var/lib/ceph/mon/{cluster}-{id}'.format(
                        id=id_, cluster=cluster_name),
                ],
            )
            remote.run(
                args=[
                    'sudo', 'adjust-ulimits', 'ceph-coverage', coverage_dir,
                    'ceph-mon', '--cluster', cluster_name,
                    '--mkfs', '-i', id_,
                    '--monmap', monmap_path,
                    '--osdmap', osdmap_path,
                    '--keyring', keyring_path,
                ],
            )

    run.wait(
        mons.run(
            args=['rm', '--', monmap_path, osdmap_path],
            wait=False,
        ),
    )

    try:
        yield
    except Exception:
        # we need to know this below
        ctx.summary['success'] = False
        raise
    finally:
        (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()

        log.info('Checking cluster log for badness...')

        def first_in_ceph_log(pattern, excludes):
            """
            Find the first occurence of the pattern specified in the Ceph log,
            Returns None if none found.

            :param pattern: Pattern scanned for.
            :param excludes: Patterns to ignore.
            :return: First line of text (or None if not found)
            """
            args = [
                'sudo', 'egrep', pattern,
                '/var/log/ceph/{cluster}.log'.format(cluster=cluster_name),
            ]
            for exclude in excludes:
                args.extend([run.Raw('|'), 'egrep', '-v', exclude])
            args.extend([
                run.Raw('|'), 'head', '-n', '1',
            ])
            r = mon0_remote.run(
                stdout=StringIO(),
                args=args,
            )
            stdout = r.stdout.getvalue()
            if stdout != '':
                return stdout
            return None

        if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]',
                             config['log_whitelist']) is not None:
            log.warning('Found errors (ERR|WRN|SEC) in cluster log')
            ctx.summary['success'] = False
            # use the most severe problem as the failure reason
            if 'failure_reason' not in ctx.summary:
                for pattern in ['\[SEC\]', '\[ERR\]', '\[WRN\]']:
                    match = first_in_ceph_log(pattern,
                                              config['log_whitelist'])
                    if match is not None:
                        ctx.summary['failure_reason'] = \
                            '"{match}" in cluster log'.format(
                                match=match.rstrip('\n'),
                            )
                        break

        for remote, dirs in devs_to_clean.iteritems():
            for dir_ in dirs:
                log.info('Unmounting %s on %s' % (dir_, remote))
                try:
                    remote.run(
                        args=['sync', run.Raw('&&'),
                              'sudo', 'umount', '-f', dir_])
                except Exception as e:
                    # dump what is holding the mount before propagating
                    remote.run(args=[
                        'sudo', run.Raw('PATH=/usr/sbin:$PATH'), 'lsof',
                        run.Raw(';'), 'ps', 'auxf',
                    ])
                    raise e

        if config.get('tmpfs_journal'):
            log.info('tmpfs journal enabled - unmounting tmpfs at /mnt')
            for remote, roles_for_host in osds.remotes.iteritems():
                remote.run(
                    args=['sudo', 'umount', '-f', '/mnt'],
                    check_status=False,
                )

        if ctx.archive is not None and \
                not (ctx.config.get('archive-on-error') and
                     ctx.summary['success']):
            # archive mon data, too
            log.info('Archiving mon data...')
            path = os.path.join(ctx.archive, 'data')
            try:
                os.makedirs(path)
            except OSError as e:
                if e.errno == errno.EEXIST:
                    pass
                else:
                    raise
            for remote, roles in mons.remotes.iteritems():
                for role in roles:
                    is_mon = teuthology.is_type('mon', cluster_name)
                    if is_mon(role):
                        _, _, id_ = teuthology.split_role(role)
                        mon_dir = '/var/lib/ceph/mon/' + \
                            '{0}-{1}'.format(cluster_name, id_)
                        teuthology.pull_directory_tarball(
                            remote, mon_dir,
                            path + '/' + role + '.tgz')

        log.info('Cleaning ceph cluster...')
        run.wait(
            ctx.cluster.run(
                args=[
                    'sudo', 'rm', '-rf', '--',
                    conf_path, keyring_path, data_dir,
                    monmap_path, osdmap_path,
                    run.Raw('{tdir}/../*.pid'.format(tdir=testdir)),
                ],
                wait=False,
            ),
        )
def task(ctx, config):
    """
    Test (non-backfill) recovery
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'task only accepts a dict for configuration'

    testdir = teuthology.get_testdir(ctx)
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()

    # this test is hard-wired for a 3-osd cluster
    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
    log.info('num_osds is %s' % num_osds)
    assert num_osds == 3

    mgr = ceph_manager.CephManager(
        mon_remote,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
    )

    # wait until every osd reports up, then settle the cluster
    while len(mgr.get_osd_status()['up']) < 3:
        time.sleep(10)
    for daemon in ('osd.0', 'osd.1', 'osd.2'):
        mgr.raw_cluster_cmd('tell', daemon, 'flush_pg_stats')
    mgr.wait_for_clean()

    # test some osdmap flags
    for flag in ('noin', 'noout', 'noup', 'nodown'):
        mgr.raw_cluster_cmd('osd', 'set', flag)
    for flag in ('noin', 'noout', 'noup', 'nodown'):
        mgr.raw_cluster_cmd('osd', 'unset', flag)

    # write some new data
    bench = rados_start(
        testdir, mon_remote,
        ['-p', 'rbd', 'bench', '20', 'write', '-b', '4096', '--no-cleanup'])

    time.sleep(15)

    # trigger a divergent target:
    #  blackhole + restart osd.1 (shorter log)
    mgr.blackhole_kill_osd(1)
    #  kill osd.2 (longer log... we'll make it divergent below)
    mgr.kill_osd(2)
    time.sleep(2)
    mgr.revive_osd(1)

    # wait for our writes to complete + succeed
    err = bench.wait()
    log.info('err is %d' % err)

    # cluster must repeer
    for daemon in ('osd.0', 'osd.1'):
        mgr.raw_cluster_cmd('tell', daemon, 'flush_pg_stats')
    mgr.wait_for_active_or_down()

    # write some more (make sure osd.2 really is divergent)
    bench = rados_start(testdir, mon_remote,
                        ['-p', 'rbd', 'bench', '15', 'write', '-b', '4096'])
    bench.wait()

    # revive divergent osd
    mgr.revive_osd(2)

    while len(mgr.get_osd_status()['up']) < 3:
        log.info('waiting a bit...')
        time.sleep(2)
    log.info('3 are up!')

    # cluster must recover
    for daemon in ('osd.0', 'osd.1', 'osd.2'):
        mgr.raw_cluster_cmd('tell', daemon, 'flush_pg_stats')
    mgr.wait_for_clean()
def task(ctx, config):
    """
    Exercise [deep] scrub and repair.

    tasks:
    - chef:
    - install:
    - ceph:
        log-whitelist:
        - '!= data_digest'
        - '!= omap_digest'
        - '!= size'
        - deep-scrub 0 missing, 1 inconsistent objects
        - deep-scrub [0-9]+ errors
        - repair 0 missing, 1 inconsistent objects
        - repair [0-9]+ errors, [0-9]+ fixed
        - shard [0-9]+ missing
        - deep-scrub 1 missing, 1 inconsistent objects
        - does not match object info size
        - attr name mistmatch
        - deep-scrub 1 missing, 0 inconsistent objects
        - failed to pick suitable auth object
      conf:
        osd:
          osd deep scrub update digest min age: 0
    - scrub_test:
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'scrub_test task only accepts a dict for configuration'

    first_mon = teuthology.get_first_mon(ctx, config)
    (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()

    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
    log.info('num_osds is %s' % num_osds)

    mgr = ceph_manager.CephManager(
        mon_remote,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
    )

    # wait for all of the osds to come up before touching anything
    while len(mgr.get_osd_status()['up']) < num_osds:
        time.sleep(10)

    # expose object files through the objectstore fuse mount
    for osd_id in range(num_osds):
        mgr.raw_cluster_cmd('tell', 'osd.%d' % osd_id, 'injectargs', '--',
                            '--osd-objectstore-fuse')
    for osd_id in range(num_osds):
        mgr.raw_cluster_cmd('tell', 'osd.%d' % osd_id, 'flush_pg_stats')
    mgr.wait_for_clean()

    # write some data
    proc = mgr.do_rados(mon_remote, ['-p', 'rbd', 'bench', '--no-cleanup',
                                     '1', 'write', '-b', '4096'])
    log.info('err is %d' % proc.exitstatus)

    # wait for some PG to have data that we can mess with
    pg, acting = wait_for_victim_pg(mgr)
    osd = acting[0]

    osd_remote, obj_path, obj_name = find_victim_object(ctx, pg, osd)
    mgr.do_rados(mon_remote,
                 ['-p', 'rbd', 'setomapval', obj_name, 'key', 'val'])
    log.info('err is %d' % proc.exitstatus)
    mgr.do_rados(mon_remote, ['-p', 'rbd', 'setomapheader', obj_name, 'hdr'])
    log.info('err is %d' % proc.exitstatus)

    # Update missing digests, requires "osd deep scrub update digest min age: 0"
    pgnum = get_pgnum(pg)
    mgr.do_pg_scrub('rbd', pgnum, 'deep-scrub')

    log.info('messing with PG %s on osd %d' % (pg, osd))
    test_repair_corrupted_obj(ctx, mgr, pg, osd_remote, obj_path, 'rbd')
    test_repair_bad_omap(ctx, mgr, pg, osd, obj_name)
    test_list_inconsistent_obj(ctx, mgr, osd_remote, pg, acting, osd,
                               obj_name, obj_path)
    log.info('test successful!')

    # shut down fuse mount
    for osd_id in range(num_osds):
        mgr.raw_cluster_cmd('tell', 'osd.%d' % osd_id, 'injectargs', '--',
                            '--no-osd-objectstore-fuse')
    time.sleep(5)
    log.info('done')
def cluster(ctx, config):
    """
    Create a ceph cluster, yield to the nested tasks, then tear it down.

    Setup: create data dirs, map scratch devices/journals to osds, generate
    and distribute ceph.conf and keyrings, build the monmap/osdmap, and mkfs
    the mds, osd and mon daemons.  Teardown (in the finally block): scan the
    cluster log for ERR/WRN/SEC lines not in config['log_whitelist'], unmount
    osd data dirs, optionally archive mon data and logs, and remove all
    generated files.

    NOTE(review): this is a generator (it yields mid-way) — presumably wrapped
    by contextlib.contextmanager or teuthology's contextutil at the definition
    site above this chunk; confirm against the full file.

    :param ctx: Context
    :param config: Configuration dict (keys used here: fs, block_journal,
                   tmpfs_journal, mkfs_options, mount_options, conf,
                   conf_path, keyring_path, log_whitelist)
    """
    testdir = teuthology.get_testdir(ctx)
    log.info('Creating ceph cluster...')
    # per-host data directory for daemon state
    run.wait(
        ctx.cluster.run(
            args=[
                'install', '-d', '-m0755', '--',
                '{tdir}/data'.format(tdir=testdir),
            ],
            wait=False,
        )
    )

    run.wait(
        ctx.cluster.run(
            args=[
                'sudo',
                'install', '-d', '-m0777', '--', '/var/run/ceph',
            ],
            wait=False,
        )
    )

    # Map scratch devices and journal devices to osd ids, per remote host.
    devs_to_clean = {}
    remote_to_roles_to_devs = {}
    remote_to_roles_to_journals = {}
    osds = ctx.cluster.only(teuthology.is_type('osd'))
    for remote, roles_for_host in osds.remotes.iteritems():
        devs = teuthology.get_scratch_devices(remote)
        roles_to_devs = {}
        roles_to_journals = {}
        if config.get('fs'):
            log.info('fs option selected, checking for scratch devs')
            log.info('found devs: %s' % (str(devs),))
            devs_id_map = teuthology.get_wwn_id_map(remote, devs)
            iddevs = devs_id_map.values()
            roles_to_devs = assign_devs(
                teuthology.roles_of_type(roles_for_host, 'osd'), iddevs
            )
            if len(roles_to_devs) < len(iddevs):
                # leftover devices may be used as block journals below
                iddevs = iddevs[len(roles_to_devs):]
            devs_to_clean[remote] = []

        if config.get('block_journal'):
            log.info('block journal enabled')
            roles_to_journals = assign_devs(
                teuthology.roles_of_type(roles_for_host, 'osd'), iddevs
            )
            log.info('journal map: %s', roles_to_journals)

        if config.get('tmpfs_journal'):
            log.info('tmpfs journal enabled')
            roles_to_journals = {}
            remote.run(
                args=['sudo', 'mount', '-t', 'tmpfs', 'tmpfs', '/mnt']
            )
            for osd in teuthology.roles_of_type(roles_for_host, 'osd'):
                # one preallocated journal file per osd on the tmpfs
                tmpfs = '/mnt/osd.%s' % osd
                roles_to_journals[osd] = tmpfs
                remote.run(
                    args=['truncate', '-s', '1500M', tmpfs]
                )
            log.info('journal map: %s', roles_to_journals)

        log.info('dev map: %s' % (str(roles_to_devs),))
        remote_to_roles_to_devs[remote] = roles_to_devs
        remote_to_roles_to_journals[remote] = roles_to_journals

    log.info('Generating config...')
    remotes_and_roles = ctx.cluster.remotes.items()
    roles = [role_list for (remote, role_list) in remotes_and_roles]
    ips = [host for (host, port) in
           (remote.ssh.get_transport().getpeername()
            for (remote, role_list) in remotes_and_roles)]
    conf = teuthology.skeleton_config(ctx, roles=roles, ips=ips)
    # point each osd section at its assigned journal
    for remote, roles_to_journals in remote_to_roles_to_journals.iteritems():
        for role, journal in roles_to_journals.iteritems():
            key = "osd." + str(role)
            if key not in conf:
                conf[key] = {}
            conf[key]['osd journal'] = journal
    # overlay user-supplied conf values on the skeleton
    for section, keys in config['conf'].iteritems():
        for key, value in keys.iteritems():
            log.info("[%s] %s = %s" % (section, key, value))
            if section not in conf:
                conf[section] = {}
            conf[section][key] = value

    if config.get('tmpfs_journal'):
        # NOTE(review): writes a top-level key, not an [osd] section entry —
        # confirm this is intended for this conf object.
        conf['journal dio'] = False

    ctx.ceph = argparse.Namespace()
    ctx.ceph.conf = conf

    conf_path = config.get('conf_path', '/etc/ceph/ceph.conf')
    keyring_path = config.get('keyring_path', '/etc/ceph/ceph.keyring')

    log.info('Writing configs...')
    # stream the rendered conf to every node via sudo python on the remote
    conf_fp = StringIO()
    conf.write(conf_fp)
    conf_fp.seek(0)
    writes = ctx.cluster.run(
        args=[
            'sudo', 'mkdir', '-p', '/etc/ceph', run.Raw('&&'),
            'sudo', 'chmod', '0755', '/etc/ceph', run.Raw('&&'),
            'sudo', 'python', '-c',
            'import shutil, sys; shutil.copyfileobj(sys.stdin, file(sys.argv[1], "wb"))',
            conf_path,
            run.Raw('&&'),
            'sudo', 'chmod', '0644', conf_path,
        ],
        stdin=run.PIPE,
        wait=False,
    )
    teuthology.feed_many_stdins_and_close(conf_fp, writes)
    run.wait(writes)

    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)

    firstmon = teuthology.get_first_mon(ctx, config)

    # create the cluster keyring + mon. key on the first monitor
    log.info('Setting up %s...' % firstmon)
    ctx.cluster.only(firstmon).run(
        args=[
            'sudo',
            'adjust-ulimits',
            'ceph-coverage',
            coverage_dir,
            'ceph-authtool',
            '--create-keyring',
            keyring_path,
        ],
    )
    ctx.cluster.only(firstmon).run(
        args=[
            'sudo',
            'adjust-ulimits',
            'ceph-coverage',
            coverage_dir,
            'ceph-authtool',
            '--gen-key',
            '--name=mon.',
            keyring_path,
        ],
    )
    ctx.cluster.only(firstmon).run(
        args=[
            'sudo',
            'chmod',
            '0644',
            keyring_path,
        ],
    )
    (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
    teuthology.create_simple_monmap(
        ctx,
        remote=mon0_remote,
        conf=conf,
    )

    log.info('Creating admin key on %s...' % firstmon)
    ctx.cluster.only(firstmon).run(
        args=[
            'sudo',
            'adjust-ulimits',
            'ceph-coverage',
            coverage_dir,
            'ceph-authtool',
            '--gen-key',
            '--name=client.admin',
            '--set-uid=0',
            '--cap', 'mon', 'allow *',
            '--cap', 'osd', 'allow *',
            '--cap', 'mds', 'allow',
            keyring_path,
        ],
    )

    log.info('Copying monmap to all nodes...')
    keyring = teuthology.get_file(
        remote=mon0_remote,
        path=keyring_path,
    )
    monmap = teuthology.get_file(
        remote=mon0_remote,
        path='{tdir}/monmap'.format(tdir=testdir),
    )

    for rem in ctx.cluster.remotes.iterkeys():
        # copy mon key and initial monmap
        log.info('Sending monmap to node {remote}'.format(remote=rem))
        teuthology.sudo_write_file(
            remote=rem,
            path=keyring_path,
            data=keyring,
            perms='0644'
        )
        teuthology.write_file(
            remote=rem,
            path='{tdir}/monmap'.format(tdir=testdir),
            data=monmap,
        )

    log.info('Setting up mon nodes...')
    mons = ctx.cluster.only(teuthology.is_type('mon'))
    # build an initial osdmap sized to the osd count
    run.wait(
        mons.run(
            args=[
                'adjust-ulimits',
                'ceph-coverage',
                coverage_dir,
                'osdmaptool',
                '-c', conf_path,
                '--clobber',
                '--createsimple', '{num:d}'.format(
                    num=teuthology.num_instances_of_type(ctx.cluster, 'osd'),
                ),
                '{tdir}/osdmap'.format(tdir=testdir),
                '--pg_bits', '2',
                '--pgp_bits', '4',
            ],
            wait=False,
        ),
    )

    log.info('Setting up mds nodes...')
    mdss = ctx.cluster.only(teuthology.is_type('mds'))
    for remote, roles_for_host in mdss.remotes.iteritems():
        for id_ in teuthology.roles_of_type(roles_for_host, 'mds'):
            remote.run(
                args=[
                    'sudo',
                    'mkdir',
                    '-p',
                    '/var/lib/ceph/mds/ceph-{id}'.format(id=id_),
                    run.Raw('&&'),
                    'sudo',
                    'adjust-ulimits',
                    'ceph-coverage',
                    coverage_dir,
                    'ceph-authtool',
                    '--create-keyring',
                    '--gen-key',
                    '--name=mds.{id}'.format(id=id_),
                    '/var/lib/ceph/mds/ceph-{id}/keyring'.format(id=id_),
                ],
            )

    cclient.create_keyring(ctx)
    log.info('Running mkfs on osd nodes...')

    ctx.disk_config = argparse.Namespace()
    ctx.disk_config.remote_to_roles_to_dev = remote_to_roles_to_devs
    ctx.disk_config.remote_to_roles_to_journals = remote_to_roles_to_journals
    ctx.disk_config.remote_to_roles_to_dev_mount_options = {}
    ctx.disk_config.remote_to_roles_to_dev_fstype = {}

    log.info("ctx.disk_config.remote_to_roles_to_dev: {r}".format(r=str(ctx.disk_config.remote_to_roles_to_dev)))
    for remote, roles_for_host in osds.remotes.iteritems():
        roles_to_devs = remote_to_roles_to_devs[remote]
        roles_to_journals = remote_to_roles_to_journals[remote]

        for id_ in teuthology.roles_of_type(roles_for_host, 'osd'):
            remote.run(
                args=[
                    'sudo',
                    'mkdir',
                    '-p',
                    '/var/lib/ceph/osd/ceph-{id}'.format(id=id_),
                ])
            log.info(str(roles_to_journals))
            log.info(id_)
            if roles_to_devs.get(id_):
                # this osd got a dedicated scratch device: mkfs + mount it
                dev = roles_to_devs[id_]
                fs = config.get('fs')
                package = None
                mkfs_options = config.get('mkfs_options')
                mount_options = config.get('mount_options')
                # per-fs default mkfs/mount options (only if not user-set)
                if fs == 'btrfs':
                    #package = 'btrfs-tools'
                    if mount_options is None:
                        mount_options = ['noatime','user_subvol_rm_allowed']
                    if mkfs_options is None:
                        mkfs_options = ['-m', 'single', '-l', '32768', '-n', '32768']
                if fs == 'xfs':
                    #package = 'xfsprogs'
                    if mount_options is None:
                        mount_options = ['noatime']
                    if mkfs_options is None:
                        mkfs_options = ['-f', '-i', 'size=2048']
                if fs == 'ext4' or fs == 'ext3':
                    if mount_options is None:
                        mount_options = ['noatime','user_xattr']

                if mount_options is None:
                    mount_options = []
                if mkfs_options is None:
                    mkfs_options = []
                mkfs = ['mkfs.%s' % fs] + mkfs_options
                log.info('%s on %s on %s' % (mkfs, dev, remote))
                if package is not None:
                    remote.run(
                        args=[
                            'sudo',
                            'apt-get', 'install', '-y', package
                        ],
                        stdout=StringIO(),
                    )
                # 'yes |' auto-confirms any overwrite prompt from mkfs
                remote.run(args= ['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev])
                log.info('mount %s on %s -o %s' % (dev, remote,
                                                   ','.join(mount_options)))
                remote.run(
                    args=[
                        'sudo',
                        'mount',
                        '-t', fs,
                        '-o', ','.join(mount_options),
                        dev,
                        os.path.join('/var/lib/ceph/osd', 'ceph-{id}'.format(id=id_)),
                    ]
                )
                # remember mount options/fstype so later tasks can remount
                if not remote in ctx.disk_config.remote_to_roles_to_dev_mount_options:
                    ctx.disk_config.remote_to_roles_to_dev_mount_options[remote] = {}
                ctx.disk_config.remote_to_roles_to_dev_mount_options[remote][id_] = mount_options
                if not remote in ctx.disk_config.remote_to_roles_to_dev_fstype:
                    ctx.disk_config.remote_to_roles_to_dev_fstype[remote] = {}
                ctx.disk_config.remote_to_roles_to_dev_fstype[remote][id_] = fs
                devs_to_clean[remote].append(
                    os.path.join(
                        os.path.join('/var/lib/ceph/osd', 'ceph-{id}'.format(id=id_)),
                    )
                )

        for id_ in teuthology.roles_of_type(roles_for_host, 'osd'):
            remote.run(
                args=[
                    'sudo',
                    'MALLOC_CHECK_=3',
                    'adjust-ulimits',
                    'ceph-coverage',
                    coverage_dir,
                    'ceph-osd',
                    '--mkfs',
                    '--mkkey',
                    '-i', id_,
                    '--monmap', '{tdir}/monmap'.format(tdir=testdir),
                ],
            )

    log.info('Reading keys from all nodes...')
    # collect every daemon/client keyring so they can be fed to the mons
    keys_fp = StringIO()
    keys = []
    for remote, roles_for_host in ctx.cluster.remotes.iteritems():
        for type_ in ['mds','osd']:
            for id_ in teuthology.roles_of_type(roles_for_host, type_):
                data = teuthology.get_file(
                    remote=remote,
                    path='/var/lib/ceph/{type}/ceph-{id}/keyring'.format(
                        type=type_,
                        id=id_,
                    ),
                    sudo=True,
                )
                keys.append((type_, id_, data))
                keys_fp.write(data)
    for remote, roles_for_host in ctx.cluster.remotes.iteritems():
        for type_ in ['client']:
            for id_ in teuthology.roles_of_type(roles_for_host, type_):
                data = teuthology.get_file(
                    remote=remote,
                    path='/etc/ceph/ceph.client.{id}.keyring'.format(id=id_)
                )
                keys.append((type_, id_, data))
                keys_fp.write(data)

    log.info('Adding keys to all mons...')
    writes = mons.run(
        args=[
            'sudo', 'tee', '-a',
            keyring_path,
        ],
        stdin=run.PIPE,
        wait=False,
        stdout=StringIO(),
    )
    keys_fp.seek(0)
    teuthology.feed_many_stdins_and_close(keys_fp, writes)
    run.wait(writes)
    # grant each collected key its default capabilities
    for type_, id_, data in keys:
        run.wait(
            mons.run(
                args=[
                    'sudo',
                    'adjust-ulimits',
                    'ceph-coverage',
                    coverage_dir,
                    'ceph-authtool',
                    keyring_path,
                    '--name={type}.{id}'.format(
                        type=type_,
                        id=id_,
                    ),
                ] + list(teuthology.generate_caps(type_)),
                wait=False,
            ),
        )

    log.info('Running mkfs on mon nodes...')
    for remote, roles_for_host in mons.remotes.iteritems():
        for id_ in teuthology.roles_of_type(roles_for_host, 'mon'):
            remote.run(
                args=[
                    'sudo',
                    'mkdir',
                    '-p',
                    '/var/lib/ceph/mon/ceph-{id}'.format(id=id_),
                ],
            )
            remote.run(
                args=[
                    'sudo',
                    'adjust-ulimits',
                    'ceph-coverage',
                    coverage_dir,
                    'ceph-mon',
                    '--mkfs',
                    '-i', id_,
                    '--monmap={tdir}/monmap'.format(tdir=testdir),
                    '--osdmap={tdir}/osdmap'.format(tdir=testdir),
                    '--keyring={kpath}'.format(kpath=keyring_path),
                ],
            )

    # the maps are baked into the daemons now; the temp copies can go
    run.wait(
        mons.run(
            args=[
                'rm',
                '--',
                '{tdir}/monmap'.format(tdir=testdir),
                '{tdir}/osdmap'.format(tdir=testdir),
            ],
            wait=False,
        ),
    )

    try:
        yield
    except Exception:
        # we need to know this below
        ctx.summary['success'] = False
        raise
    finally:
        (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()

        log.info('Checking cluster log for badness...')

        def first_in_ceph_log(pattern, excludes):
            # Return the first cluster-log line matching `pattern` after
            # filtering out each regex in `excludes`, or None if no match.
            args = [
                'sudo',
                'egrep', pattern,
                '/var/log/ceph/ceph.log',
            ]
            for exclude in excludes:
                args.extend([run.Raw('|'), 'egrep', '-v', exclude])
            args.extend([
                run.Raw('|'), 'head', '-n', '1',
            ])
            r = mon0_remote.run(
                stdout=StringIO(),
                args=args,
            )
            stdout = r.stdout.getvalue()
            if stdout != '':
                return stdout
            return None

        if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]',
                             config['log_whitelist']) is not None:
            log.warning('Found errors (ERR|WRN|SEC) in cluster log')
            ctx.summary['success'] = False
            # use the most severe problem as the failure reason
            if 'failure_reason' not in ctx.summary:
                for pattern in ['\[SEC\]', '\[ERR\]', '\[WRN\]']:
                    match = first_in_ceph_log(pattern, config['log_whitelist'])
                    if match is not None:
                        ctx.summary['failure_reason'] = \
                            '"{match}" in cluster log'.format(
                                match=match.rstrip('\n'),
                            )
                        break

        for remote, dirs in devs_to_clean.iteritems():
            for dir_ in dirs:
                log.info('Unmounting %s on %s' % (dir_, remote))
                remote.run(
                    args=[
                        'sync',
                        run.Raw('&&'),
                        'sudo',
                        'umount',
                        '-f',
                        dir_
                    ]
                )

        if config.get('tmpfs_journal'):
            log.info('tmpfs journal enabled - unmounting tmpfs at /mnt')
            for remote, roles_for_host in osds.remotes.iteritems():
                remote.run(
                    args=['sudo', 'umount', '-f', '/mnt'],
                    check_status=False,
                )

        if ctx.archive is not None and \
           not (ctx.config.get('archive-on-error') and ctx.summary['success']):
            # archive mon data, too
            log.info('Archiving mon data...')
            path = os.path.join(ctx.archive, 'data')
            # NOTE(review): unlike the newer cluster(), this makedirs has no
            # EEXIST guard — a pre-existing dir would raise OSError here.
            os.makedirs(path)
            for remote, roles in mons.remotes.iteritems():
                for role in roles:
                    if role.startswith('mon.'):
                        teuthology.pull_directory_tarball(
                            remote,
                            '/var/lib/ceph/mon',
                            path + '/' + role + '.tgz')

            # and logs
            log.info('Compressing logs...')
            run.wait(
                ctx.cluster.run(
                    args=[
                        'sudo',
                        'find',
                        '/var/log/ceph',
                        '-name',
                        '*.log',
                        '-print0',
                        run.Raw('|'),
                        'sudo',
                        'xargs',
                        '-0',
                        '--no-run-if-empty',
                        '--',
                        'gzip',
                        '--',
                    ],
                    wait=False,
                ),
            )

            log.info('Archiving logs...')
            path = os.path.join(ctx.archive, 'remote')
            os.makedirs(path)
            for remote in ctx.cluster.remotes.iterkeys():
                sub = os.path.join(path, remote.shortname)
                os.makedirs(sub)
                teuthology.pull_directory(remote, '/var/log/ceph',
                                          os.path.join(sub, 'log'))

        log.info('Cleaning ceph cluster...')
        run.wait(
            ctx.cluster.run(
                args=[
                    'sudo',
                    'rm',
                    '-rf',
                    '--',
                    conf_path,
                    keyring_path,
                    '{tdir}/data'.format(tdir=testdir),
                    '{tdir}/monmap'.format(tdir=testdir),
                ],
                wait=False,
            ),
        )