示例#1
0
def task(ctx, config):
    """
    Test [deep] repair in several situations:
      Repair [Truncate, Data EIO, MData EIO] on [Primary|Replica]

    The config should be as follows:

      Must include the log-whitelist below
      Must enable filestore_debug_inject_read_err config

    example:

    tasks:
    - chef:
    - install:
    - ceph:
        log-whitelist: ['candidate had a read error', 'deep-scrub 0 missing, 1 inconsistent objects', 'deep-scrub 0 missing, 4 inconsistent objects', 'deep-scrub 1 errors', 'deep-scrub 4 errors', '!= known omap_digest', 'repair 0 missing, 1 inconsistent objects', 'repair 0 missing, 4 inconsistent objects', 'repair 1 errors, 1 fixed', 'repair 4 errors, 4 fixed', 'scrub 0 missing, 1 inconsistent', 'scrub 1 errors', 'size 1 != known size']
        conf:
          osd:
            filestore debug inject read err: true
    - repair_test:

    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'repair_test task only accepts a dict for config'

    if not hasattr(ctx, 'manager'):
        first_mon = teuthology.get_first_mon(ctx, config)
        (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
        ctx.manager = ceph_manager.CephManager(
            mon,
            ctx=ctx,
            logger=log.getChild('ceph_manager')
            )

    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
    log.info('num_osds is %s' % num_osds)

    while len(ctx.manager.get_osd_status()['up']) < num_osds:
        time.sleep(10)

    tests = [
        gen_repair_test_1(mdataerr(ctx), choose_primary(ctx), "scrub"),
        gen_repair_test_1(mdataerr(ctx), choose_replica(ctx), "scrub"),
        gen_repair_test_1(dataerr(ctx), choose_primary(ctx), "deep-scrub"),
        gen_repair_test_1(dataerr(ctx), choose_replica(ctx), "deep-scrub"),
        gen_repair_test_1(trunc(ctx), choose_primary(ctx), "scrub"),
        gen_repair_test_1(trunc(ctx), choose_replica(ctx), "scrub"),
        gen_repair_test_2(choose_primary(ctx)),
        gen_repair_test_2(choose_replica(ctx))
        ]

    for test in tests:
        run_test(ctx, config, test)
示例#2
0
def task(ctx, config):
    """
    Die if {testdir}/err exists or if an OSD dumps core
    """
    if config is None:
        config = {}

    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()

    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
    log.info('num_osds is %s' % num_osds)

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

    while len(manager.get_osd_status()['up']) < num_osds:
        time.sleep(10)

    testdir = teuthology.get_testdir(ctx)

    while True:
        for i in range(num_osds):
            (osd_remote,) = ctx.cluster.only('osd.%d' % i).remotes.iterkeys()
            p = osd_remote.run(
                args = [ 'test', '-e', '{tdir}/err'.format(tdir=testdir) ],
                wait=True,
                check_status=False,
            )
            exit_status = p.exitstatus

            if exit_status == 0:
                log.info("osd %d has an error" % i)
                raise Exception("osd %d error" % i)

            log_path = '/var/log/ceph/osd.%d.log' % (i)

            p = osd_remote.run(
                args = [
                         'tail', '-1', log_path,
                         run.Raw('|'),
                         'grep', '-q', 'end dump'
                       ],
                wait=True,
                check_status=False,
            )
            exit_status = p.exitstatus

            if exit_status == 0:
                log.info("osd %d dumped core" % i)
                raise Exception("osd %d dumped core" % i)

        time.sleep(5)
示例#3
0
def task(ctx, config):
    """
    Test [deep] scrub

    tasks:
    - chef:
    - install:
    - ceph:
        log-whitelist:
        - '!= known digest'
        - '!= known omap_digest'
        - deep-scrub 0 missing, 1 inconsistent objects
        - deep-scrub 1 errors
        - repair 0 missing, 1 inconsistent objects
        - repair 1 errors, 1 fixed
    - scrub_test:
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), "scrub_test task only accepts a dict for configuration"
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()

    num_osds = teuthology.num_instances_of_type(ctx.cluster, "osd")
    log.info("num_osds is %s" % num_osds)

    manager = ceph_manager.CephManager(mon, ctx=ctx, logger=log.getChild("ceph_manager"))

    while len(manager.get_osd_status()["up"]) < num_osds:
        time.sleep(10)

    for i in range(num_osds):
        manager.raw_cluster_cmd("tell", "osd.%d" % i, "flush_pg_stats")
    manager.wait_for_clean()

    # write some data
    p = manager.do_rados(mon, ["-p", "rbd", "bench", "--no-cleanup", "1", "write", "-b", "4096"])
    log.info("err is %d" % p.exitstatus)

    # wait for some PG to have data that we can mess with
    pg, acting = wait_for_victim_pg(manager)
    osd = acting[0]

    osd_remote, obj_path, obj_name = find_victim_object(ctx, pg, osd)
    manager.do_rados(mon, ["-p", "rbd", "setomapval", obj_name, "key", "val"])
    log.info("err is %d" % p.exitstatus)
    manager.do_rados(mon, ["-p", "rbd", "setomapheader", obj_name, "hdr"])
    log.info("err is %d" % p.exitstatus)

    log.info("messing with PG %s on osd %d" % (pg, osd))
    test_repair_corrupted_obj(ctx, manager, pg, osd_remote, obj_path, "rbd")
    test_repair_bad_omap(ctx, manager, pg, osd, obj_name)
    test_list_inconsistent_obj(ctx, manager, osd_remote, pg, acting, osd, obj_name, obj_path)
    log.info("test successful!")
示例#4
0
def task(ctx, config):
    """
    Benchmark the recovery system.

    Generates objects with smalliobench, runs it normally to get a
    baseline performance measurement, then marks an OSD out and reruns
    to measure performance during recovery.

    The config should be as follows:

    recovery_bench:
        duration: <seconds for each measurement run>
        num_objects: <number of objects>
        io_size: <io size in bytes>

    example:

    tasks:
    - ceph:
    - recovery_bench:
        duration: 60
        num_objects: 500
        io_size: 4096
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'recovery_bench task only accepts a dict for configuration'

    log.info('Beginning recovery bench...')

    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
    while len(manager.get_osd_status()['up']) < num_osds:
        manager.sleep(10)

    bench_proc = RecoveryBencher(
        manager,
        config,
        )
    try:
        yield
    finally:
        log.info('joining recovery bencher')
        bench_proc.do_join()
def task(ctx, config):
    """
    Benchmark the recovery system.

    Generates objects with smalliobench, runs it normally to get a
    baseline performance measurement, then marks an OSD out and reruns
    to measure performance during recovery.

    The config should be as follows:

    recovery_bench:
        duration: <seconds for each measurement run>
        num_objects: <number of objects>
        io_size: <io size in bytes>

    example:

    tasks:
    - ceph:
    - recovery_bench:
        duration: 60
        num_objects: 500
        io_size: 4096
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'recovery_bench task only accepts a dict for configuration'

    log.info('Beginning recovery bench...')

    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
    while len(manager.get_osd_status()['up']) < num_osds:
        manager.sleep(10)

    bench_proc = RecoveryBencher(
        manager,
        config,
        )
    try:
        yield
    finally:
        log.info('joining recovery bencher')
        bench_proc.do_join()
示例#6
0
def task(ctx, config):
    """
    Run scrub periodically. Randomly chooses an OSD to scrub.

    The config should be as follows:

    scrub:
        frequency: <seconds between scrubs>
        deep: <bool for deepness>

    example:

    tasks:
    - ceph:
    - scrub:
        frequency: 30
        deep: 0
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'scrub task only accepts a dict for configuration'

    log.info('Beginning scrub...')

    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
    while len(manager.get_osd_status()['up']) < num_osds:
        time.sleep(10)

    scrub_proc = Scrubber(
        manager,
        config,
        )
    try:
        yield
    finally:
        log.info('joining scrub')
        scrub_proc.do_join()
示例#7
0
def task(ctx, config):
    """
    Run scrub periodically. Randomly chooses an OSD to scrub.

    The config should be as follows:

    scrub:
        frequency: <seconds between scrubs>
        deep: <bool for deepness>

    example:

    tasks:
    - ceph:
    - scrub:
        frequency: 30
        deep: 0
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'scrub task only accepts a dict for configuration'

    log.info('Beginning scrub...')

    first_mon = teuthology.get_first_mon(ctx, config)
    (mon, ) = ctx.cluster.only(first_mon).remotes.iterkeys()

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
    )

    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
    while len(manager.get_osd_status()['up']) < num_osds:
        time.sleep(10)

    scrub_proc = Scrubber(
        manager,
        config,
    )
    try:
        yield
    finally:
        log.info('joining scrub')
        scrub_proc.do_join()
示例#8
0
def cluster(ctx, config):
    """
    Handle the creation and removal of a ceph cluster.

    On startup:
        Create directories needed for the cluster.
        Create remote journals for all osds.
        Create and set keyring.
        Copy the monmap to tht test systems.
        Setup mon nodes.
        Setup mds nodes.
        Mkfs osd nodes.
        Add keyring information to monmaps
        Mkfs mon nodes.

    On exit:
        If errors occured, extract a failure message and store in ctx.summary.
        Unmount all test files and temporary journaling files.
        Save the monitor information and archive all ceph logs.
        Cleanup the keyring setup, and remove all monitor map and data files left over.

    :param ctx: Context
    :param config: Configuration
    """
    if ctx.config.get('use_existing_cluster', False) is True:
        log.info("'use_existing_cluster' is true; skipping cluster creation")
        yield

    testdir = teuthology.get_testdir(ctx)
    log.info('Creating ceph cluster...')
    run.wait(
        ctx.cluster.run(
            args=[
                'install', '-d', '-m0755', '--',
                '{tdir}/data'.format(tdir=testdir),
                ],
            wait=False,
            )
        )

    run.wait(
        ctx.cluster.run(
            args=[
                'sudo',
                'install', '-d', '-m0777', '--', '/var/run/ceph',
                ],
            wait=False,
            )
        )


    devs_to_clean = {}
    remote_to_roles_to_devs = {}
    remote_to_roles_to_journals = {}
    osds = ctx.cluster.only(teuthology.is_type('osd'))
    for remote, roles_for_host in osds.remotes.iteritems():
        devs = teuthology.get_scratch_devices(remote)
        roles_to_devs = {}
        roles_to_journals = {}
        if config.get('fs'):
            log.info('fs option selected, checking for scratch devs')
            log.info('found devs: %s' % (str(devs),))
            devs_id_map = teuthology.get_wwn_id_map(remote, devs)
            iddevs = devs_id_map.values()
            roles_to_devs = assign_devs(
                teuthology.roles_of_type(roles_for_host, 'osd'), iddevs
                )
            if len(roles_to_devs) < len(iddevs):
                iddevs = iddevs[len(roles_to_devs):]
            devs_to_clean[remote] = []

        if config.get('block_journal'):
            log.info('block journal enabled')
            roles_to_journals = assign_devs(
                teuthology.roles_of_type(roles_for_host, 'osd'), iddevs
                )
            log.info('journal map: %s', roles_to_journals)

        if config.get('tmpfs_journal'):
            log.info('tmpfs journal enabled')
            roles_to_journals = {}
            remote.run( args=[ 'sudo', 'mount', '-t', 'tmpfs', 'tmpfs', '/mnt' ] )
            for osd in teuthology.roles_of_type(roles_for_host, 'osd'):
                tmpfs = '/mnt/osd.%s' % osd
                roles_to_journals[osd] = tmpfs
                remote.run( args=[ 'truncate', '-s', '1500M', tmpfs ] )
            log.info('journal map: %s', roles_to_journals)

        log.info('dev map: %s' % (str(roles_to_devs),))
        remote_to_roles_to_devs[remote] = roles_to_devs
        remote_to_roles_to_journals[remote] = roles_to_journals


    log.info('Generating config...')
    remotes_and_roles = ctx.cluster.remotes.items()
    roles = [role_list for (remote, role_list) in remotes_and_roles]
    ips = [host for (host, port) in (remote.ssh.get_transport().getpeername() for (remote, role_list) in remotes_and_roles)]
    conf = teuthology.skeleton_config(ctx, roles=roles, ips=ips)
    for remote, roles_to_journals in remote_to_roles_to_journals.iteritems():
        for role, journal in roles_to_journals.iteritems():
            key = "osd." + str(role)
            if key not in conf:
                conf[key] = {}
            conf[key]['osd journal'] = journal
    for section, keys in config['conf'].iteritems():
        for key, value in keys.iteritems():
            log.info("[%s] %s = %s" % (section, key, value))
            if section not in conf:
                conf[section] = {}
            conf[section][key] = value

    if config.get('tmpfs_journal'):
        conf['journal dio'] = False

    ctx.ceph = argparse.Namespace()
    ctx.ceph.conf = conf

    keyring_path = config.get('keyring_path', '/etc/ceph/ceph.keyring')

    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)

    firstmon = teuthology.get_first_mon(ctx, config)

    log.info('Setting up %s...' % firstmon)
    ctx.cluster.only(firstmon).run(
        args=[
            'sudo',
            'adjust-ulimits',
            'ceph-coverage',
            coverage_dir,
            'ceph-authtool',
            '--create-keyring',
            keyring_path,
            ],
        )
    ctx.cluster.only(firstmon).run(
        args=[
            'sudo',
            'adjust-ulimits',
            'ceph-coverage',
            coverage_dir,
            'ceph-authtool',
            '--gen-key',
            '--name=mon.',
            keyring_path,
            ],
        )
    ctx.cluster.only(firstmon).run(
        args=[
            'sudo',
            'chmod',
            '0644',
            keyring_path,
            ],
        )
    (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
    fsid = teuthology.create_simple_monmap(
        ctx,
        remote=mon0_remote,
        conf=conf,
        )
    if not 'global' in conf:
        conf['global'] = {}
    conf['global']['fsid'] = fsid

    log.info('Writing ceph.conf for FSID %s...' % fsid)
    conf_path = config.get('conf_path', DEFAULT_CONF_PATH)
    write_conf(ctx, conf_path)

    log.info('Creating admin key on %s...' % firstmon)
    ctx.cluster.only(firstmon).run(
        args=[
            'sudo',
            'adjust-ulimits',
            'ceph-coverage',
            coverage_dir,
            'ceph-authtool',
            '--gen-key',
            '--name=client.admin',
            '--set-uid=0',
            '--cap', 'mon', 'allow *',
            '--cap', 'osd', 'allow *',
            '--cap', 'mds', 'allow *',
            keyring_path,
            ],
        )

    log.info('Copying monmap to all nodes...')
    keyring = teuthology.get_file(
        remote=mon0_remote,
        path=keyring_path,
        )
    monmap = teuthology.get_file(
        remote=mon0_remote,
        path='{tdir}/monmap'.format(tdir=testdir),
        )

    for rem in ctx.cluster.remotes.iterkeys():
        # copy mon key and initial monmap
        log.info('Sending monmap to node {remote}'.format(remote=rem))
        teuthology.sudo_write_file(
            remote=rem,
            path=keyring_path,
            data=keyring,
            perms='0644'
            )
        teuthology.write_file(
            remote=rem,
            path='{tdir}/monmap'.format(tdir=testdir),
            data=monmap,
            )

    log.info('Setting up mon nodes...')
    mons = ctx.cluster.only(teuthology.is_type('mon'))
    run.wait(
        mons.run(
            args=[
                'adjust-ulimits',
                'ceph-coverage',
                coverage_dir,
                'osdmaptool',
                '-c', conf_path,
                '--clobber',
                '--createsimple', '{num:d}'.format(
                    num=teuthology.num_instances_of_type(ctx.cluster, 'osd'),
                    ),
                '{tdir}/osdmap'.format(tdir=testdir),
                '--pg_bits', '2',
                '--pgp_bits', '4',
                ],
            wait=False,
            ),
        )

    log.info('Setting up mds nodes...')
    mdss = ctx.cluster.only(teuthology.is_type('mds'))
    for remote, roles_for_host in mdss.remotes.iteritems():
        for id_ in teuthology.roles_of_type(roles_for_host, 'mds'):
            remote.run(
                args=[
                    'sudo',
                    'mkdir',
                    '-p',
                    '/var/lib/ceph/mds/ceph-{id}'.format(id=id_),
                    run.Raw('&&'),
                    'sudo',
                    'adjust-ulimits',
                    'ceph-coverage',
                    coverage_dir,
                    'ceph-authtool',
                    '--create-keyring',
                    '--gen-key',
                    '--name=mds.{id}'.format(id=id_),
                    '/var/lib/ceph/mds/ceph-{id}/keyring'.format(id=id_),
                    ],
                )

    cclient.create_keyring(ctx)
    log.info('Running mkfs on osd nodes...')

    ctx.disk_config = argparse.Namespace()
    ctx.disk_config.remote_to_roles_to_dev = remote_to_roles_to_devs
    ctx.disk_config.remote_to_roles_to_journals = remote_to_roles_to_journals
    ctx.disk_config.remote_to_roles_to_dev_mount_options = {}
    ctx.disk_config.remote_to_roles_to_dev_fstype = {}

    log.info("ctx.disk_config.remote_to_roles_to_dev: {r}".format(r=str(ctx.disk_config.remote_to_roles_to_dev)))
    for remote, roles_for_host in osds.remotes.iteritems():
        roles_to_devs = remote_to_roles_to_devs[remote]
        roles_to_journals = remote_to_roles_to_journals[remote]


        for id_ in teuthology.roles_of_type(roles_for_host, 'osd'):
            remote.run(
                args=[
                    'sudo',
                    'mkdir',
                    '-p',
                    '/var/lib/ceph/osd/ceph-{id}'.format(id=id_),
                    ])
            log.info(str(roles_to_journals))
            log.info(id_)
            if roles_to_devs.get(id_):
                dev = roles_to_devs[id_]
                fs = config.get('fs')
                package = None
                mkfs_options = config.get('mkfs_options')
                mount_options = config.get('mount_options')
                if fs == 'btrfs':
                    #package = 'btrfs-tools'
                    if mount_options is None:
                        mount_options = ['noatime','user_subvol_rm_allowed']
                    if mkfs_options is None:
                        mkfs_options = ['-m', 'single',
                                        '-l', '32768',
                                        '-n', '32768']
                if fs == 'xfs':
                    #package = 'xfsprogs'
                    if mount_options is None:
                        mount_options = ['noatime']
                    if mkfs_options is None:
                        mkfs_options = ['-f', '-i', 'size=2048']
                if fs == 'ext4' or fs == 'ext3':
                    if mount_options is None:
                        mount_options = ['noatime','user_xattr']

                if mount_options is None:
                    mount_options = []
                if mkfs_options is None:
                    mkfs_options = []
                mkfs = ['mkfs.%s' % fs] + mkfs_options
                log.info('%s on %s on %s' % (mkfs, dev, remote))
                if package is not None:
                    remote.run(
                        args=[
                            'sudo',
                            'apt-get', 'install', '-y', package
                            ],
                        stdout=StringIO(),
                        )

                try:
                    remote.run(args= ['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev])
                except run.CommandFailedError:
                    # Newer btfs-tools doesn't prompt for overwrite, use -f
                    if '-f' not in mount_options:
                        mkfs_options.append('-f')
                        mkfs = ['mkfs.%s' % fs] + mkfs_options
                        log.info('%s on %s on %s' % (mkfs, dev, remote))
                    remote.run(args= ['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev])

                log.info('mount %s on %s -o %s' % (dev, remote,
                                                   ','.join(mount_options)))
                remote.run(
                    args=[
                        'sudo',
                        'mount',
                        '-t', fs,
                        '-o', ','.join(mount_options),
                        dev,
                        os.path.join('/var/lib/ceph/osd', 'ceph-{id}'.format(id=id_)),
                        ]
                    )
                if not remote in ctx.disk_config.remote_to_roles_to_dev_mount_options:
                    ctx.disk_config.remote_to_roles_to_dev_mount_options[remote] = {}
                ctx.disk_config.remote_to_roles_to_dev_mount_options[remote][id_] = mount_options
                if not remote in ctx.disk_config.remote_to_roles_to_dev_fstype:
                    ctx.disk_config.remote_to_roles_to_dev_fstype[remote] = {}
                ctx.disk_config.remote_to_roles_to_dev_fstype[remote][id_] = fs
                devs_to_clean[remote].append(
                    os.path.join(
                        os.path.join('/var/lib/ceph/osd', 'ceph-{id}'.format(id=id_)),
                        )
                    )

        for id_ in teuthology.roles_of_type(roles_for_host, 'osd'):
            remote.run(
                args=[
                    'sudo',
                    'MALLOC_CHECK_=3',
                    'adjust-ulimits',
                    'ceph-coverage',
                    coverage_dir,
                    'ceph-osd',
                    '--mkfs',
                    '--mkkey',
                    '-i', id_,
                    '--monmap', '{tdir}/monmap'.format(tdir=testdir),
                    ],
                )


    log.info('Reading keys from all nodes...')
    keys_fp = StringIO()
    keys = []
    for remote, roles_for_host in ctx.cluster.remotes.iteritems():
        for type_ in ['mds','osd']:
            for id_ in teuthology.roles_of_type(roles_for_host, type_):
                data = teuthology.get_file(
                    remote=remote,
                    path='/var/lib/ceph/{type}/ceph-{id}/keyring'.format(
                        type=type_,
                        id=id_,
                        ),
                    sudo=True,
                    )
                keys.append((type_, id_, data))
                keys_fp.write(data)
    for remote, roles_for_host in ctx.cluster.remotes.iteritems():
        for type_ in ['client']:
            for id_ in teuthology.roles_of_type(roles_for_host, type_):
                data = teuthology.get_file(
                    remote=remote,
                    path='/etc/ceph/ceph.client.{id}.keyring'.format(id=id_)
                    )
                keys.append((type_, id_, data))
                keys_fp.write(data)

    log.info('Adding keys to all mons...')
    writes = mons.run(
        args=[
            'sudo', 'tee', '-a',
            keyring_path,
            ],
        stdin=run.PIPE,
        wait=False,
        stdout=StringIO(),
        )
    keys_fp.seek(0)
    teuthology.feed_many_stdins_and_close(keys_fp, writes)
    run.wait(writes)
    for type_, id_, data in keys:
        run.wait(
            mons.run(
                args=[
                    'sudo',
                    'adjust-ulimits',
                    'ceph-coverage',
                    coverage_dir,
                    'ceph-authtool',
                    keyring_path,
                    '--name={type}.{id}'.format(
                        type=type_,
                        id=id_,
                        ),
                    ] + list(teuthology.generate_caps(type_)),
                wait=False,
                ),
            )

    log.info('Running mkfs on mon nodes...')
    for remote, roles_for_host in mons.remotes.iteritems():
        for id_ in teuthology.roles_of_type(roles_for_host, 'mon'):
            remote.run(
                args=[
                  'sudo',
                  'mkdir',
                  '-p',
                  '/var/lib/ceph/mon/ceph-{id}'.format(id=id_),
                  ],
                )
            remote.run(
                args=[
                    'sudo',
                    'adjust-ulimits',
                    'ceph-coverage',
                    coverage_dir,
                    'ceph-mon',
                    '--mkfs',
                    '-i', id_,
                    '--monmap={tdir}/monmap'.format(tdir=testdir),
                    '--osdmap={tdir}/osdmap'.format(tdir=testdir),
                    '--keyring={kpath}'.format(kpath=keyring_path),
                    ],
                )


    run.wait(
        mons.run(
            args=[
                'rm',
                '--',
                '{tdir}/monmap'.format(tdir=testdir),
                '{tdir}/osdmap'.format(tdir=testdir),
                ],
            wait=False,
            ),
        )

    try:
        yield
    except Exception:
        # we need to know this below
        ctx.summary['success'] = False
        raise
    finally:
        (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()

        log.info('Checking cluster log for badness...')
        def first_in_ceph_log(pattern, excludes):
            """
            Find the first occurence of the pattern specified in the Ceph log,
            Returns None if none found.

            :param pattern: Pattern scanned for.
            :param excludes: Patterns to ignore.
            :return: First line of text (or None if not found)
            """
            args = [
                'sudo',
                'egrep', pattern,
                '/var/log/ceph/ceph.log',
                ]
            for exclude in excludes:
                args.extend([run.Raw('|'), 'egrep', '-v', exclude])
            args.extend([
                    run.Raw('|'), 'head', '-n', '1',
                    ])
            r = mon0_remote.run(
                stdout=StringIO(),
                args=args,
                )
            stdout = r.stdout.getvalue()
            if stdout != '':
                return stdout
            return None

        if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]',
                             config['log_whitelist']) is not None:
            log.warning('Found errors (ERR|WRN|SEC) in cluster log')
            ctx.summary['success'] = False
            # use the most severe problem as the failure reason
            if 'failure_reason' not in ctx.summary:
                for pattern in ['\[SEC\]', '\[ERR\]', '\[WRN\]']:
                    match = first_in_ceph_log(pattern, config['log_whitelist'])
                    if match is not None:
                        ctx.summary['failure_reason'] = \
                            '"{match}" in cluster log'.format(
                            match=match.rstrip('\n'),
                            )
                        break

        for remote, dirs in devs_to_clean.iteritems():
            for dir_ in dirs:
                log.info('Unmounting %s on %s' % (dir_, remote))
                try:
                    remote.run(
                        args=[
                            'sync',
                            run.Raw('&&'),
                            'sudo',
                            'umount',
                            '-f',
                            dir_
                        ]
                    )
                except Exception as e:
                    remote.run(args=[
                            'sudo',
                            run.Raw('PATH=/usr/sbin:$PATH'),
                            'lsof',
                            run.Raw(';'),
                            'ps', 'auxf',
                            ])
                    raise e

        if config.get('tmpfs_journal'):
            log.info('tmpfs journal enabled - unmounting tmpfs at /mnt')
            for remote, roles_for_host in osds.remotes.iteritems():
                remote.run(
                    args=[ 'sudo', 'umount', '-f', '/mnt' ],
                    check_status=False,
                )

        if ctx.archive is not None and \
           not (ctx.config.get('archive-on-error') and ctx.summary['success']):

            # archive mon data, too
            log.info('Archiving mon data...')
            path = os.path.join(ctx.archive, 'data')
            os.makedirs(path)
            for remote, roles in mons.remotes.iteritems():
                for role in roles:
                    if role.startswith('mon.'):
                        teuthology.pull_directory_tarball(
                            remote,
                            '/var/lib/ceph/mon',
                            path + '/' + role + '.tgz')

        log.info('Cleaning ceph cluster...')
        run.wait(
            ctx.cluster.run(
                args=[
                    'sudo',
                    'rm',
                    '-rf',
                    '--',
                    conf_path,
                    keyring_path,
                    '{tdir}/data'.format(tdir=testdir),
                    '{tdir}/monmap'.format(tdir=testdir),
                    ],
                wait=False,
                ),
            )
示例#9
0
def task(ctx, config):
    """
    Test [deep] scrub

    tasks:
    - chef:
    - install:
    - ceph:
        log-whitelist:
        - '!= known digest'
        - '!= known omap_digest'
        - deep-scrub 0 missing, 1 inconsistent objects
        - deep-scrub 1 errors
        - repair 0 missing, 1 inconsistent objects
        - repair 1 errors, 1 fixed
    - scrub_test:
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'scrub_test task only accepts a dict for configuration'
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()

    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
    log.info('num_osds is %s' % num_osds)

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

    while len(manager.get_osd_status()['up']) < num_osds:
        time.sleep(10)

    for i in range(num_osds):
        manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'flush_pg_stats')
    manager.wait_for_clean()

    # write some data
    p = manager.do_rados(mon, ['-p', 'rbd', 'bench', '--no-cleanup', '1',
                               'write', '-b', '4096'])
    log.info('err is %d' % p.exitstatus)

    # wait for some PG to have data that we can mess with
    pg, acting = wait_for_victim_pg(manager)
    osd = acting[0]

    osd_remote, obj_path, obj_name = find_victim_object(ctx, pg, osd)
    manager.do_rados(mon, ['-p', 'rbd', 'setomapval', obj_name, 'key', 'val'])
    log.info('err is %d' % p.exitstatus)
    manager.do_rados(mon, ['-p', 'rbd', 'setomapheader', obj_name, 'hdr'])
    log.info('err is %d' % p.exitstatus)

    log.info('messing with PG %s on osd %d' % (pg, osd))
    test_repair_corrupted_obj(ctx, manager, pg, osd_remote, obj_path)
    test_repair_bad_omap(ctx, manager, pg, osd, obj_name)
    test_list_inconsistent_obj(ctx, manager, osd_remote, pg, acting, osd,
                               obj_name, obj_path)
    log.info('test successful!')
示例#10
0
def task(ctx, config):
    """
    Test the dump_stuck command.

    :param ctx: Context
    :param config: Configuration
    """
    assert config is None, \
        'dump_stuck requires no configuration'
    assert teuthology.num_instances_of_type(ctx.cluster, 'osd') == 2, \
        'dump_stuck requires exactly 2 osds'

    timeout = 60
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon, ) = ctx.cluster.only(first_mon).remotes.iterkeys()

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
    )

    manager.flush_pg_stats([0, 1])
    manager.wait_for_clean(timeout)

    manager.raw_cluster_cmd(
        'tell',
        'mon.0',
        'injectargs',
        '--',
        #                            '--mon-osd-report-timeout 90',
        '--mon-pg-stuck-threshold 10')

    # all active+clean
    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=0,
        num_stale=0,
    )
    num_pgs = manager.get_num_pgs()

    manager.mark_out_osd(0)
    time.sleep(timeout)
    manager.flush_pg_stats([1])
    manager.wait_for_recovery(timeout)

    # all active+clean+remapped
    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=0,
        num_stale=0,
    )

    manager.mark_in_osd(0)
    manager.flush_pg_stats([0, 1])
    manager.wait_for_clean(timeout)

    # all active+clean
    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=0,
        num_stale=0,
    )

    log.info('stopping first osd')
    manager.kill_osd(0)
    manager.mark_down_osd(0)

    log.info('waiting for all to be unclean')
    starttime = time.time()
    done = False
    while not done:
        try:
            check_stuck(
                manager,
                num_inactive=0,
                num_unclean=num_pgs,
                num_stale=0,
            )
            done = True
        except AssertionError:
            # wait up to 15 minutes to become stale
            if time.time() - starttime > 900:
                raise

    log.info('stopping second osd')
    manager.kill_osd(1)
    manager.mark_down_osd(1)

    log.info('waiting for all to be stale')
    starttime = time.time()
    done = False
    while not done:
        try:
            check_stuck(
                manager,
                num_inactive=0,
                num_unclean=num_pgs,
                num_stale=num_pgs,
            )
            done = True
        except AssertionError:
            # wait up to 15 minutes to become stale
            if time.time() - starttime > 900:
                raise

    log.info('reviving')
    for id_ in teuthology.all_roles_of_type(ctx.cluster, 'osd'):
        manager.revive_osd(id_)
        manager.mark_in_osd(id_)
    while True:
        try:
            manager.flush_pg_stats([0, 1])
            break
        except Exception:
            log.exception('osds must not be started yet, waiting...')
            time.sleep(1)
    manager.wait_for_clean(timeout)

    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=0,
        num_stale=0,
    )
示例#11
0
def task(ctx, config):
    """
    Execute a radosbench parameter sweep

    Puts radosbench in a loop, taking values from the given config at each
    iteration. If given, the min and max values below create a range, e.g.
    min_replicas=1 and max_replicas=3 implies executing with 1-3 replicas.

    Parameters:

        clients: [client list]
        time: seconds to run (default=120)
        sizes: [list of object sizes] (default=[4M])
        mode: <write|read|seq> (default=write)
        repetitions: execute the same configuration multiple times (default=1)
        min_num_replicas: minimum number of replicas to use (default = 3)
        max_num_replicas: maximum number of replicas to use (default = 3)
        min_num_osds: the minimum number of OSDs in a pool (default=all)
        max_num_osds: the maximum number of OSDs in a pool (default=all)
        file: name of CSV-formatted output file (default='radosbench.csv')
        columns: columns to include (default=all)
          - rep: execution number (takes values from 'repetitions')
          - num_osd: number of osds for pool
          - num_replica: number of replicas
          - avg_throughput: throughput
          - avg_latency: latency
          - stdev_throughput:
          - stdev_latency:

    Example:
    - radsobenchsweep:
        columns: [rep, num_osd, num_replica, avg_throughput, stdev_throughput]
    """
    log.info('Beginning radosbenchsweep...')
    assert isinstance(config, dict), 'expecting dictionary for configuration'

    # get and validate config values
    # {

    # only one client supported for now
    if len(config.get('clients', [])) != 1:
        raise Exception("Only one client can be specified")

    # only write mode
    if config.get('mode', 'write') != 'write':
        raise Exception("Only 'write' mode supported for now.")

    # OSDs
    total_osds_in_cluster = teuthology.num_instances_of_type(ctx.cluster, 'osd')
    min_num_osds = config.get('min_num_osds', total_osds_in_cluster)
    max_num_osds = config.get('max_num_osds', total_osds_in_cluster)

    if max_num_osds > total_osds_in_cluster:
        raise Exception('max_num_osds cannot be greater than total in cluster')
    if min_num_osds < 1:
        raise Exception('min_num_osds cannot be less than 1')
    if min_num_osds > max_num_osds:
        raise Exception('min_num_osds cannot be greater than max_num_osd')
    osds = range(0, (total_osds_in_cluster + 1))

    # replicas
    min_num_replicas = config.get('min_num_replicas', 3)
    max_num_replicas = config.get('max_num_replicas', 3)

    if min_num_replicas < 1:
        raise Exception('min_num_replicas cannot be less than 1')
    if min_num_replicas > max_num_replicas:
        raise Exception('min_num_replicas cannot be greater than max_replicas')
    if max_num_replicas > max_num_osds:
        raise Exception('max_num_replicas cannot be greater than max_num_osds')
    replicas = range(min_num_replicas, (max_num_replicas + 1))

    # object size
    sizes = config.get('size', [4 << 20])

    # repetitions
    reps = range(config.get('repetitions', 1))

    # file
    fname = config.get('file', 'radosbench.csv')
    f = open('{}/{}'.format(ctx.archive, fname), 'w')
    f.write(get_csv_header(config) + '\n')
    # }

    # set default pools size=1 to avoid 'unhealthy' issues
    ctx.manager.set_pool_property('data', 'size', 1)
    ctx.manager.set_pool_property('metadata', 'size', 1)
    ctx.manager.set_pool_property('rbd', 'size', 1)

    current_osds_out = 0

    # sweep through all parameters
    for osds_out, size, replica, rep in product(osds, sizes, replicas, reps):

        osds_in = total_osds_in_cluster - osds_out

        if osds_in == 0:
            # we're done
            break

        if current_osds_out != osds_out:
            # take an osd out
            ctx.manager.raw_cluster_cmd(
                'osd', 'reweight', str(osds_out-1), '0.0')
            wait_until_healthy(ctx, config)
            current_osds_out = osds_out

        if osds_in not in range(min_num_osds, (max_num_osds + 1)):
            # no need to execute with a number of osds that wasn't requested
            continue

        if osds_in < replica:
            # cannot execute with more replicas than available osds
            continue

        run_radosbench(ctx, config, f, osds_in, size, replica, rep)

    f.close()

    yield
def task(ctx, config):
    """
    Test backfill
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'thrashosds task only accepts a dict for configuration'
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()

    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
    log.info('num_osds is %s' % num_osds)
    assert num_osds == 3

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

    while len(manager.get_osd_status()['up']) < 3:
        time.sleep(10)
    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
    manager.wait_for_clean()

    # write some data
    p = rados_start(ctx, mon, ['-p', 'rbd', 'bench', '15', 'write', '-b', '4096',
                          '--no-cleanup'])
    err = p.wait()
    log.info('err is %d' % err)

    # mark osd.0 out to trigger a rebalance/backfill
    manager.mark_out_osd(0)

    # also mark it down to it won't be included in pg_temps
    manager.kill_osd(0)
    manager.mark_down_osd(0)

    # wait for everything to peer and be happy...
    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
    manager.wait_for_recovery()

    # write some new data
    p = rados_start(ctx, mon, ['-p', 'rbd', 'bench', '30', 'write', '-b', '4096',
                          '--no-cleanup'])

    time.sleep(15)

    # blackhole + restart osd.1
    # this triggers a divergent backfill target
    manager.blackhole_kill_osd(1)
    time.sleep(2)
    manager.revive_osd(1)

    # wait for our writes to complete + succeed
    err = p.wait()
    log.info('err is %d' % err)

    # cluster must recover
    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
    manager.wait_for_recovery()

    # re-add osd.0
    manager.revive_osd(0)
    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
    manager.wait_for_clean()
示例#13
0
def task(ctx, config):
    """
    Test (non-backfill) recovery
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'task only accepts a dict for configuration'
    testdir = teuthology.get_testdir(ctx)
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()

    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
    log.info('num_osds is %s' % num_osds)
    assert num_osds == 3

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

    while len(manager.get_osd_status()['up']) < 3:
        time.sleep(10)
    manager.flush_pg_stats([0, 1, 2])
    manager.wait_for_clean()

    # test some osdmap flags
    manager.raw_cluster_cmd('osd', 'set', 'noin')
    manager.raw_cluster_cmd('osd', 'set', 'noout')
    manager.raw_cluster_cmd('osd', 'set', 'noup')
    manager.raw_cluster_cmd('osd', 'set', 'nodown')
    manager.raw_cluster_cmd('osd', 'unset', 'noin')
    manager.raw_cluster_cmd('osd', 'unset', 'noout')
    manager.raw_cluster_cmd('osd', 'unset', 'noup')
    manager.raw_cluster_cmd('osd', 'unset', 'nodown')

    # write some new data
    p = rados_start(testdir, mon, ['-p', 'rbd', 'bench', '20', 'write', '-b', '4096',
                          '--no-cleanup'])

    time.sleep(15)

    # trigger a divergent target:
    #  blackhole + restart osd.1 (shorter log)
    manager.blackhole_kill_osd(1)
    #  kill osd.2 (longer log... we'll make it divergent below)
    manager.kill_osd(2)
    time.sleep(2)
    manager.revive_osd(1)

    # wait for our writes to complete + succeed
    err = p.wait()
    log.info('err is %d' % err)

    # cluster must repeer
    manager.flush_pg_stats([0, 1])
    manager.wait_for_active_or_down()

    # write some more (make sure osd.2 really is divergent)
    p = rados_start(testdir, mon, ['-p', 'rbd', 'bench', '15', 'write', '-b', '4096'])
    p.wait()

    # revive divergent osd
    manager.revive_osd(2)

    while len(manager.get_osd_status()['up']) < 3:
        log.info('waiting a bit...')
        time.sleep(2)
    log.info('3 are up!')

    # cluster must recover
    manager.flush_pg_stats([0, 1, 2])
    manager.wait_for_clean()
示例#14
0
def task(ctx, config):
    """
    Test [deep] scrub

    tasks:
    - chef:
    - install:
    - ceph:
        log-whitelist:
        - '!= data_digest'
        - '!= omap_digest'
        - '!= size'
        - deep-scrub 0 missing, 1 inconsistent objects
        - deep-scrub [0-9]+ errors
        - repair 0 missing, 1 inconsistent objects
        - repair [0-9]+ errors, [0-9]+ fixed
        - shard [0-9]+ .* : missing
        - deep-scrub 1 missing, 1 inconsistent objects
        - does not match object info size
        - attr name mistmatch
        - deep-scrub 1 missing, 0 inconsistent objects
        - failed to pick suitable auth object
        - candidate size [0-9]+ info size [0-9]+ mismatch
      conf:
        osd:
          osd deep scrub update digest min age: 0
    - scrub_test:
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'scrub_test task only accepts a dict for configuration'
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.keys()

    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
    log.info('num_osds is %s' % num_osds)

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

    while len(manager.get_osd_status()['up']) < num_osds:
        time.sleep(10)

    for i in range(num_osds):
        manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'injectargs',
                                '--', '--osd-objectstore-fuse')
    manager.flush_pg_stats(range(num_osds))
    manager.wait_for_clean()

    # write some data
    p = manager.do_rados(mon, ['-p', 'rbd', 'bench', '--no-cleanup', '1',
                               'write', '-b', '4096'])
    log.info('err is %d' % p.exitstatus)

    # wait for some PG to have data that we can mess with
    pg, acting = wait_for_victim_pg(manager)
    osd = acting[0]

    osd_remote, obj_path, obj_name = find_victim_object(ctx, pg, osd)
    manager.do_rados(mon, ['-p', 'rbd', 'setomapval', obj_name, 'key', 'val'])
    log.info('err is %d' % p.exitstatus)
    manager.do_rados(mon, ['-p', 'rbd', 'setomapheader', obj_name, 'hdr'])
    log.info('err is %d' % p.exitstatus)

    # Update missing digests, requires "osd deep scrub update digest min age: 0"
    pgnum = get_pgnum(pg)
    manager.do_pg_scrub('rbd', pgnum, 'deep-scrub')

    log.info('messing with PG %s on osd %d' % (pg, osd))
    test_repair_corrupted_obj(ctx, manager, pg, osd_remote, obj_path, 'rbd')
    test_repair_bad_omap(ctx, manager, pg, osd, obj_name)
    test_list_inconsistent_obj(ctx, manager, osd_remote, pg, acting, osd,
                               obj_name, obj_path)
    log.info('test successful!')

    # shut down fuse mount
    for i in range(num_osds):
        manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'injectargs',
                                '--', '--no-osd-objectstore-fuse')
    time.sleep(5)
    log.info('done')
示例#15
0
def task(ctx, config):
    """
    Test [deep] scrub

    tasks:
    - chef:
    - install:
    - ceph:
        log-whitelist:
        - '!= known digest'
        - '!= known omap_digest'
        - deep-scrub 0 missing, 1 inconsistent objects
        - deep-scrub 1 errors
        - repair 0 missing, 1 inconsistent objects
        - repair 1 errors, 1 fixed
    - scrub_test: 
    
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'scrub_test task only accepts a dict for configuration'
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
    
    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
    log.info('num_osds is %s' % num_osds)

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

    while len(manager.get_osd_status()['up']) < num_osds:
        time.sleep(10)

    for i in range(num_osds):
        manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'flush_pg_stats')
    manager.wait_for_clean()

    # write some data
    p = manager.do_rados(mon, ['-p', 'rbd', 'bench', '--no-cleanup', '1', 'write', '-b', '4096'])
    err = p.exitstatus
    log.info('err is %d' % err)

    # wait for some PG to have data that we can mess with
    victim = None
    osd = None
    while victim is None:
        stats = manager.get_pg_stats()
        for pg in stats:
            size = pg['stat_sum']['num_bytes']
            if size > 0:
                victim = pg['pgid']
                osd = pg['acting'][0]
                break

        if victim is None:
            time.sleep(3)

    log.info('messing with PG %s on osd %d' % (victim, osd))

    (osd_remote,) = ctx.cluster.only('osd.%d' % osd).remotes.iterkeys()
    data_path = os.path.join(
        '/var/lib/ceph/osd',
        'ceph-{id}'.format(id=osd),
        'current',
        '{pg}_head'.format(pg=victim)
        )

    # fuzz time
    ls_fp = StringIO()
    osd_remote.run(
        args=[ 'sudo', 'ls', data_path ],
        stdout=ls_fp,
    )
    ls_out = ls_fp.getvalue()
    ls_fp.close()

    # find an object file we can mess with
    osdfilename = None
    for line in ls_out.split('\n'):
        if 'object' in line:
            osdfilename = line
            break
    assert osdfilename is not None

    # Get actual object name from osd stored filename
    tmp=osdfilename.split('__')
    objname=tmp[0]
    objname=objname.replace('\u', '_')
    log.info('fuzzing %s' % objname)

    # put a single \0 at the beginning of the file
    osd_remote.run(
        args=[ 'sudo', 'dd',
               'if=/dev/zero',
               'of=%s' % os.path.join(data_path, osdfilename),
               'bs=1', 'count=1', 'conv=notrunc'
             ]
    )

    # scrub, verify inconsistent
    manager.raw_cluster_cmd('pg', 'deep-scrub', victim)
    # Give deep-scrub a chance to start
    time.sleep(60)

    while True:
        stats = manager.get_single_pg_stats(victim)
        state = stats['state']

        # wait for the scrub to finish
        if 'scrubbing' in state:
            time.sleep(3)
            continue

        inconsistent = stats['state'].find('+inconsistent') != -1
        assert inconsistent
        break


    # repair, verify no longer inconsistent
    manager.raw_cluster_cmd('pg', 'repair', victim)
    # Give repair a chance to start
    time.sleep(60)

    while True:
        stats = manager.get_single_pg_stats(victim)
        state = stats['state']

        # wait for the scrub to finish
        if 'scrubbing' in state:
            time.sleep(3)
            continue

        inconsistent = stats['state'].find('+inconsistent') != -1
        assert not inconsistent
        break

    # Test deep-scrub with various omap modifications
    manager.do_rados(mon, ['-p', 'rbd', 'setomapval', objname, 'key', 'val'])
    manager.do_rados(mon, ['-p', 'rbd', 'setomapheader', objname, 'hdr'])

    # Modify omap on specific osd
    log.info('fuzzing omap of %s' % objname)
    manager.osd_admin_socket(osd, ['rmomapkey', 'rbd', objname, 'key']);
    manager.osd_admin_socket(osd, ['setomapval', 'rbd', objname, 'badkey', 'badval']);
    manager.osd_admin_socket(osd, ['setomapheader', 'rbd', objname, 'badhdr']);

    # scrub, verify inconsistent
    manager.raw_cluster_cmd('pg', 'deep-scrub', victim)
    # Give deep-scrub a chance to start
    time.sleep(60)

    while True:
        stats = manager.get_single_pg_stats(victim)
        state = stats['state']

        # wait for the scrub to finish
        if 'scrubbing' in state:
            time.sleep(3)
            continue

        inconsistent = stats['state'].find('+inconsistent') != -1
        assert inconsistent
        break

    # repair, verify no longer inconsistent
    manager.raw_cluster_cmd('pg', 'repair', victim)
    # Give repair a chance to start
    time.sleep(60)

    while True:
        stats = manager.get_single_pg_stats(victim)
        state = stats['state']

        # wait for the scrub to finish
        if 'scrubbing' in state:
            time.sleep(3)
            continue

        inconsistent = stats['state'].find('+inconsistent') != -1
        assert not inconsistent
        break

    log.info('test successful!')
示例#16
0
def task(ctx, config):
    """
    Test the dump_stuck command.

    The ceph configuration should include::

        mon_osd_report_timeout = 90
        mon_pg_stuck_threshold = 10
    """
    assert config is None, \
        'dump_stuck requires no configuration'
    assert teuthology.num_instances_of_type(ctx.cluster, 'osd') == 2, \
        'dump_stuck requires exactly 2 osds'

    timeout = 60
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.wait_for_clean(timeout)

    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=0,
        num_stale=0,
        )
    num_pgs = manager.get_num_pgs()

    manager.mark_out_osd(0)
    time.sleep(timeout)
    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.wait_for_recovery(timeout)

    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=num_pgs,
        num_stale=0,
        )

    manager.mark_in_osd(0)
    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.wait_for_clean(timeout)

    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=0,
        num_stale=0,
        )

    for id_ in teuthology.all_roles_of_type(ctx.cluster, 'osd'):
        manager.kill_osd(id_)
        manager.mark_down_osd(id_)

    starttime = time.time()
    done = False
    while not done:
        try:
            check_stuck(
                manager,
                num_inactive=0,
                num_unclean=0,
                num_stale=num_pgs,
                )
            done = True
        except AssertionError:
            # wait up to 15 minutes to become stale
            if time.time() - starttime > 900:
                raise

    for id_ in teuthology.all_roles_of_type(ctx.cluster, 'osd'):
        manager.revive_osd(id_)
        manager.mark_in_osd(id_)
    time.sleep(timeout)
    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.wait_for_clean(timeout)

    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=0,
        num_stale=0,
        )
示例#17
0
def task(ctx, config):
    """
    Test [deep] scrub

    tasks:
    - chef:
    - install:
    - ceph:
        log-whitelist:
        - '!= known digest'
        - '!= known omap_digest'
        - deep-scrub 0 missing, 1 inconsistent objects
        - deep-scrub 1 errors
        - repair 0 missing, 1 inconsistent objects
        - repair 1 errors, 1 fixed
    - scrub_test: 
    
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'scrub_test task only accepts a dict for configuration'
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon, ) = ctx.cluster.only(first_mon).remotes.iterkeys()

    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
    log.info('num_osds is %s' % num_osds)

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
    )

    while len(manager.get_osd_status()['up']) < num_osds:
        time.sleep(10)

    for i in range(num_osds):
        manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'flush_pg_stats')
    manager.wait_for_clean()

    # write some data
    p = manager.do_rados(
        mon,
        ['-p', 'rbd', 'bench', '--no-cleanup', '1', 'write', '-b', '4096'])
    err = p.exitstatus
    log.info('err is %d' % err)

    # wait for some PG to have data that we can mess with
    victim = None
    osd = None
    while victim is None:
        stats = manager.get_pg_stats()
        for pg in stats:
            size = pg['stat_sum']['num_bytes']
            if size > 0:
                victim = pg['pgid']
                osd = pg['acting'][0]
                break

        if victim is None:
            time.sleep(3)

    log.info('messing with PG %s on osd %d' % (victim, osd))

    (osd_remote, ) = ctx.cluster.only('osd.%d' % osd).remotes.iterkeys()
    data_path = os.path.join('/var/lib/ceph/osd', 'ceph-{id}'.format(id=osd),
                             'current', '{pg}_head'.format(pg=victim))

    # fuzz time
    ls_fp = StringIO()
    osd_remote.run(
        args=['sudo', 'ls', data_path],
        stdout=ls_fp,
    )
    ls_out = ls_fp.getvalue()
    ls_fp.close()

    # find an object file we can mess with
    osdfilename = None
    for line in ls_out.split('\n'):
        if 'object' in line:
            osdfilename = line
            break
    assert osdfilename is not None

    # Get actual object name from osd stored filename
    tmp = osdfilename.split('__')
    objname = tmp[0]
    objname = objname.replace('\u', '_')
    log.info('fuzzing %s' % objname)

    # put a single \0 at the beginning of the file
    osd_remote.run(args=[
        'sudo', 'dd', 'if=/dev/zero',
        'of=%s' %
        os.path.join(data_path, osdfilename), 'bs=1', 'count=1', 'conv=notrunc'
    ])

    # scrub, verify inconsistent
    manager.raw_cluster_cmd('pg', 'deep-scrub', victim)
    # Give deep-scrub a chance to start
    time.sleep(60)

    while True:
        stats = manager.get_single_pg_stats(victim)
        state = stats['state']

        # wait for the scrub to finish
        if 'scrubbing' in state:
            time.sleep(3)
            continue

        inconsistent = stats['state'].find('+inconsistent') != -1
        assert inconsistent
        break

    # repair, verify no longer inconsistent
    manager.raw_cluster_cmd('pg', 'repair', victim)
    # Give repair a chance to start
    time.sleep(60)

    while True:
        stats = manager.get_single_pg_stats(victim)
        state = stats['state']

        # wait for the scrub to finish
        if 'scrubbing' in state:
            time.sleep(3)
            continue

        inconsistent = stats['state'].find('+inconsistent') != -1
        assert not inconsistent
        break

    # Test deep-scrub with various omap modifications
    manager.do_rados(mon, ['-p', 'rbd', 'setomapval', objname, 'key', 'val'])
    manager.do_rados(mon, ['-p', 'rbd', 'setomapheader', objname, 'hdr'])

    # Modify omap on specific osd
    log.info('fuzzing omap of %s' % objname)
    manager.osd_admin_socket(osd, ['rmomapkey', 'rbd', objname, 'key'])
    manager.osd_admin_socket(
        osd, ['setomapval', 'rbd', objname, 'badkey', 'badval'])
    manager.osd_admin_socket(osd, ['setomapheader', 'rbd', objname, 'badhdr'])

    # scrub, verify inconsistent
    manager.raw_cluster_cmd('pg', 'deep-scrub', victim)
    # Give deep-scrub a chance to start
    time.sleep(60)

    while True:
        stats = manager.get_single_pg_stats(victim)
        state = stats['state']

        # wait for the scrub to finish
        if 'scrubbing' in state:
            time.sleep(3)
            continue

        inconsistent = stats['state'].find('+inconsistent') != -1
        assert inconsistent
        break

    # repair, verify no longer inconsistent
    manager.raw_cluster_cmd('pg', 'repair', victim)
    # Give repair a chance to start
    time.sleep(60)

    while True:
        stats = manager.get_single_pg_stats(victim)
        state = stats['state']

        # wait for the scrub to finish
        if 'scrubbing' in state:
            time.sleep(3)
            continue

        inconsistent = stats['state'].find('+inconsistent') != -1
        assert not inconsistent
        break

    log.info('test successful!')
def test_incomplete_pgs(ctx, config):
    """
    Test handling of incomplete pgs.  Requires 4 osds.
    """
    testdir = teuthology.get_testdir(ctx)
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'task only accepts a dict for configuration'
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon, ) = ctx.cluster.only(first_mon).remotes.iterkeys()

    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
    log.info('num_osds is %s' % num_osds)
    assert num_osds == 4

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
    )

    while len(manager.get_osd_status()['up']) < 4:
        time.sleep(10)

    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.3', 'flush_pg_stats')
    manager.wait_for_clean()

    log.info('Testing incomplete pgs...')

    for i in range(4):
        manager.set_config(i, osd_recovery_delay_start=1000)

    # move data off of osd.0, osd.1
    manager.raw_cluster_cmd('osd', 'out', '0', '1')
    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.3', 'flush_pg_stats')
    manager.wait_for_clean()

    # lots of objects in rbd (no pg log, will backfill)
    p = rados_start(
        testdir, mon,
        ['-p', 'rbd', 'bench', '20', 'write', '-b', '1', '--no-cleanup'])
    p.wait()

    # few objects in rbd pool (with pg log, normal recovery)
    for f in range(1, 20):
        p = rados_start(testdir, mon,
                        ['-p', 'rbd', 'put',
                         'foo.%d' % f, '/etc/passwd'])
        p.wait()

    # move it back
    manager.raw_cluster_cmd('osd', 'in', '0', '1')
    manager.raw_cluster_cmd('osd', 'out', '2', '3')
    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.3', 'flush_pg_stats')
    manager.wait_for_active()

    assert not manager.is_clean()
    assert not manager.is_recovered()

    # kill 2 + 3
    log.info('stopping 2,3')
    manager.kill_osd(2)
    manager.kill_osd(3)
    log.info('...')
    manager.raw_cluster_cmd('osd', 'down', '2', '3')
    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.wait_for_active_or_down()

    assert manager.get_num_down() > 0

    # revive 2 + 3
    manager.revive_osd(2)
    manager.revive_osd(3)
    while len(manager.get_osd_status()['up']) < 4:
        log.info('waiting a bit...')
        time.sleep(2)
    log.info('all are up!')

    for i in range(4):
        manager.kick_recovery_wq(i)

    # cluster must recover
    manager.wait_for_clean()
示例#19
0
def task(ctx, config):
    """
    Test the dump_stuck command.

    :param ctx: Context
    :param config: Configuration
    """
    assert config is None, \
        'dump_stuck requires no configuration'
    assert teuthology.num_instances_of_type(ctx.cluster, 'osd') == 2, \
        'dump_stuck requires exactly 2 osds'

    timeout = 60
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

    manager.flush_pg_stats([0, 1])
    manager.wait_for_clean(timeout)

    manager.raw_cluster_cmd('tell', 'mon.0', 'injectargs', '--',
#                            '--mon-osd-report-timeout 90',
                            '--mon-pg-stuck-threshold 10')

    # all active+clean
    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=0,
        num_stale=0,
        )
    num_pgs = manager.get_num_pgs()

    manager.mark_out_osd(0)
    time.sleep(timeout)
    manager.flush_pg_stats([1])
    manager.wait_for_recovery(timeout)

    # all active+clean+remapped
    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=0,
        num_stale=0,
        )

    manager.mark_in_osd(0)
    manager.flush_pg_stats([0, 1])
    manager.wait_for_clean(timeout)

    # all active+clean
    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=0,
        num_stale=0,
        )

    log.info('stopping first osd')
    manager.kill_osd(0)
    manager.mark_down_osd(0)
    manager.wait_for_active(timeout)

    log.info('waiting for all to be unclean')
    starttime = time.time()
    done = False
    while not done:
        try:
            check_stuck(
                manager,
                num_inactive=0,
                num_unclean=num_pgs,
                num_stale=0,
                )
            done = True
        except AssertionError:
            # wait up to 15 minutes to become stale
            if time.time() - starttime > 900:
                raise


    log.info('stopping second osd')
    manager.kill_osd(1)
    manager.mark_down_osd(1)

    log.info('waiting for all to be stale')
    starttime = time.time()
    done = False
    while not done:
        try:
            check_stuck(
                manager,
                num_inactive=0,
                num_unclean=num_pgs,
                num_stale=num_pgs,
                )
            done = True
        except AssertionError:
            # wait up to 15 minutes to become stale
            if time.time() - starttime > 900:
                raise

    log.info('reviving')
    for id_ in teuthology.all_roles_of_type(ctx.cluster, 'osd'):
        manager.revive_osd(id_)
        manager.mark_in_osd(id_)
    while True:
        try:
            manager.flush_pg_stats([0, 1])
            break
        except Exception:
            log.exception('osds must not be started yet, waiting...')
            time.sleep(1)
    manager.wait_for_clean(timeout)

    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=0,
        num_stale=0,
        )
示例#20
0
文件: ceph.py 项目: tv42/teuthology
def cluster(ctx, config):
    log.info('Creating ceph cluster...')
    run.wait(
        ctx.cluster.run(
            args=[
                'install', '-d', '-m0755', '--',
                '/tmp/cephtest/data',
                ],
            wait=False,
            )
        )

    log.info('Generating config...')
    remotes_and_roles = ctx.cluster.remotes.items()
    roles = [roles for (remote, roles) in remotes_and_roles]
    ips = [host for (host, port) in (remote.ssh.get_transport().getpeername() for (remote, roles) in remotes_and_roles)]
    conf = teuthology.skeleton_config(roles=roles, ips=ips)
    for section, keys in config['conf'].iteritems():
        for key, value in keys.iteritems():
            log.info("[%s] %s = %s" % (section, key, value))
            if section not in conf:
                conf[section] = {}
            conf[section][key] = value

    ctx.ceph = argparse.Namespace()
    ctx.ceph.conf = conf

    log.info('Writing configs...')
    conf_fp = StringIO()
    conf.write(conf_fp)
    conf_fp.seek(0)
    writes = ctx.cluster.run(
        args=[
            'python',
            '-c',
            'import shutil, sys; shutil.copyfileobj(sys.stdin, file(sys.argv[1], "wb"))',
            '/tmp/cephtest/ceph.conf',
            ],
        stdin=run.PIPE,
        wait=False,
        )
    teuthology.feed_many_stdins_and_close(conf_fp, writes)
    run.wait(writes)

    coverage_dir = '/tmp/cephtest/archive/coverage'

    firstmon = teuthology.get_first_mon(ctx, config)

    log.info('Setting up %s...' % firstmon)
    ctx.cluster.only(firstmon).run(
        args=[
            '/tmp/cephtest/enable-coredump',
            '/tmp/cephtest/binary/usr/local/bin/ceph-coverage',
            coverage_dir,
            '/tmp/cephtest/binary/usr/local/bin/ceph-authtool',
            '--create-keyring',
            '/tmp/cephtest/ceph.keyring',
            ],
        )
    ctx.cluster.only(firstmon).run(
        args=[
            '/tmp/cephtest/enable-coredump',
            '/tmp/cephtest/binary/usr/local/bin/ceph-coverage',
            coverage_dir,
            '/tmp/cephtest/binary/usr/local/bin/ceph-authtool',
            '--gen-key',
            '--name=mon.',
            '/tmp/cephtest/ceph.keyring',
            ],
        )
    (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
    teuthology.create_simple_monmap(
        remote=mon0_remote,
        conf=conf,
        )

    log.info('Creating admin key on %s...' % firstmon)
    ctx.cluster.only(firstmon).run(
        args=[
            '/tmp/cephtest/enable-coredump',
            '/tmp/cephtest/binary/usr/local/bin/ceph-coverage',
            coverage_dir,
            '/tmp/cephtest/binary/usr/local/bin/ceph-authtool',
            '--gen-key',
            '--name=client.admin',
            '--set-uid=0',
            '--cap', 'mon', 'allow *',
            '--cap', 'osd', 'allow *',
            '--cap', 'mds', 'allow',
            '/tmp/cephtest/ceph.keyring',
            ],
        )

    log.info('Copying monmap to all nodes...')
    keyring = teuthology.get_file(
        remote=mon0_remote,
        path='/tmp/cephtest/ceph.keyring',
        )
    monmap = teuthology.get_file(
        remote=mon0_remote,
        path='/tmp/cephtest/monmap',
        )

    for rem in ctx.cluster.remotes.iterkeys():
        # copy mon key and initial monmap
        log.info('Sending monmap to node {remote}'.format(remote=rem))
        teuthology.write_file(
            remote=rem,
            path='/tmp/cephtest/ceph.keyring',
            data=keyring,
            )
        teuthology.write_file(
            remote=rem,
            path='/tmp/cephtest/monmap',
            data=monmap,
            )

    log.info('Setting up mon nodes...')
    mons = ctx.cluster.only(teuthology.is_type('mon'))
    run.wait(
        mons.run(
            args=[
                '/tmp/cephtest/enable-coredump',
                '/tmp/cephtest/binary/usr/local/bin/ceph-coverage',
                coverage_dir,
                '/tmp/cephtest/binary/usr/local/bin/osdmaptool',
                '--clobber',
                '--createsimple', '{num:d}'.format(
                    num=teuthology.num_instances_of_type(ctx.cluster, 'osd'),
                    ),
                '/tmp/cephtest/osdmap',
                '--pg_bits', '2',
                '--pgp_bits', '4',
                ],
            wait=False,
            ),
        )

    log.info('Setting up osd nodes...')
    osds = ctx.cluster.only(teuthology.is_type('osd'))
    for remote, roles_for_host in osds.remotes.iteritems():
        for id_ in teuthology.roles_of_type(roles_for_host, 'osd'):
            remote.run(
                args=[
                    '/tmp/cephtest/enable-coredump',
                    '/tmp/cephtest/binary/usr/local/bin/ceph-coverage',
                    coverage_dir,
                    '/tmp/cephtest/binary/usr/local/bin/ceph-authtool',
                    '--create-keyring',
                    '--gen-key',
                    '--name=osd.{id}'.format(id=id_),
                    '/tmp/cephtest/data/osd.{id}.keyring'.format(id=id_),
                    ],
                )

    log.info('Setting up mds nodes...')
    mdss = ctx.cluster.only(teuthology.is_type('mds'))
    for remote, roles_for_host in mdss.remotes.iteritems():
        for id_ in teuthology.roles_of_type(roles_for_host, 'mds'):
            remote.run(
                args=[
                    '/tmp/cephtest/enable-coredump',
                    '/tmp/cephtest/binary/usr/local/bin/ceph-coverage',
                    coverage_dir,
                    '/tmp/cephtest/binary/usr/local/bin/ceph-authtool',
                    '--create-keyring',
                    '--gen-key',
                    '--name=mds.{id}'.format(id=id_),
                    '/tmp/cephtest/data/mds.{id}.keyring'.format(id=id_),
                    ],
                )

    log.info('Setting up client nodes...')
    clients = ctx.cluster.only(teuthology.is_type('client'))
    for remote, roles_for_host in clients.remotes.iteritems():
        for id_ in teuthology.roles_of_type(roles_for_host, 'client'):
            remote.run(
                args=[
                    '/tmp/cephtest/enable-coredump',
                    '/tmp/cephtest/binary/usr/local/bin/ceph-coverage',
                    coverage_dir,
                    '/tmp/cephtest/binary/usr/local/bin/ceph-authtool',
                    '--create-keyring',
                    '--gen-key',
                    # TODO this --name= is not really obeyed, all unknown "types" are munged to "client"
                    '--name=client.{id}'.format(id=id_),
                    '/tmp/cephtest/data/client.{id}.keyring'.format(id=id_),
                    ],
                )

    log.info('Reading keys from all nodes...')
    keys_fp = StringIO()
    keys = []
    for remote, roles_for_host in ctx.cluster.remotes.iteritems():
        for type_ in ['osd', 'mds', 'client']:
            for id_ in teuthology.roles_of_type(roles_for_host, type_):
                data = teuthology.get_file(
                    remote=remote,
                    path='/tmp/cephtest/data/{type}.{id}.keyring'.format(
                        type=type_,
                        id=id_,
                        ),
                    )
                keys.append((type_, id_, data))
                keys_fp.write(data)

    log.info('Adding keys to all mons...')
    writes = mons.run(
        args=[
            'cat',
            run.Raw('>>'),
            '/tmp/cephtest/ceph.keyring',
            ],
        stdin=run.PIPE,
        wait=False,
        )
    keys_fp.seek(0)
    teuthology.feed_many_stdins_and_close(keys_fp, writes)
    run.wait(writes)
    for type_, id_, data in keys:
        run.wait(
            mons.run(
                args=[
                    '/tmp/cephtest/enable-coredump',
                    '/tmp/cephtest/binary/usr/local/bin/ceph-coverage',
                    coverage_dir,
                    '/tmp/cephtest/binary/usr/local/bin/ceph-authtool',
                    '/tmp/cephtest/ceph.keyring',
                    '--name={type}.{id}'.format(
                        type=type_,
                        id=id_,
                        ),
                    ] + list(teuthology.generate_caps(type_)),
                wait=False,
                ),
            )

    log.info('Running mkfs on mon nodes...')
    for remote, roles_for_host in mons.remotes.iteritems():
        for id_ in teuthology.roles_of_type(roles_for_host, 'mon'):
            remote.run(
                args=[
                    '/tmp/cephtest/enable-coredump',
                    '/tmp/cephtest/binary/usr/local/bin/ceph-coverage',
                    coverage_dir,
                    '/tmp/cephtest/binary/usr/local/bin/ceph-mon',
                    '--mkfs',
                    '-i', id_,
                    '-c', '/tmp/cephtest/ceph.conf',
                    '--monmap=/tmp/cephtest/monmap',
                    '--osdmap=/tmp/cephtest/osdmap',
                    '--keyring=/tmp/cephtest/ceph.keyring',
                    ],
                )

    log.info('Running mkfs on osd nodes...')
    devs_to_clean = {}
    for remote, roles_for_host in osds.remotes.iteritems():
        roles_to_devs = {}
        if config.get('btrfs'):
            log.info('btrfs option selected, checkin for scrach devs')
            devs = teuthology.get_scratch_devices(remote)
            log.info('found devs: %s' % (str(devs),))
            roles_to_devs = assign_devs(
                teuthology.roles_of_type(roles_for_host, 'osd'), devs
                )
            log.info('dev map: %s' % (str(roles_to_devs),))
            devs_to_clean[remote] = []

        for id_ in teuthology.roles_of_type(roles_for_host, 'osd'):
            remote.run(
                args=[
                    'mkdir',
                    os.path.join('/tmp/cephtest/data', 'osd.{id}.data'.format(id=id_)),
                    ],
                )
            if roles_to_devs.get(id_):
                dev = roles_to_devs[id_]
                log.info('mkfs.btrfs on %s on %s' % (dev, remote))
                remote.run(
                    args=[
                        'sudo',
                        'apt-get', 'install', '-y', 'btrfs-tools'
                        ]
                    )
                remote.run(
                    args=[
                        'sudo',
                        'mkfs.btrfs',
                        dev
                        ]
                    )
                log.info('mount %s on %s' % (dev, remote))
                remote.run(
                    args=[
                        'sudo',
                        'mount',
                        '-o',
                        'user_subvol_rm_allowed',
                        dev,
                        os.path.join('/tmp/cephtest/data', 'osd.{id}.data'.format(id=id_)),
                        ]
                    )
                remote.run(
                    args=[
                        'sudo', 'chown', '-R', 'ubuntu.ubuntu',
                        os.path.join('/tmp/cephtest/data', 'osd.{id}.data'.format(id=id_))
                        ]
                    )
                remote.run(
                    args=[
                        'sudo', 'chmod', '-R', '755',
                        os.path.join('/tmp/cephtest/data', 'osd.{id}.data'.format(id=id_))
                        ]
                    )
                devs_to_clean[remote].append(
                    os.path.join(
                        '/tmp/cephtest/data', 'osd.{id}.data'.format(id=id_)
                        )
                    )

        for id_ in teuthology.roles_of_type(roles_for_host, 'osd'):
            remote.run(
                args=[
                    '/tmp/cephtest/enable-coredump',
                    '/tmp/cephtest/binary/usr/local/bin/ceph-coverage',
                    coverage_dir,
                    '/tmp/cephtest/binary/usr/local/bin/ceph-osd',
                    '--mkfs',
                    '-i', id_,
                    '-c', '/tmp/cephtest/ceph.conf',
                    '--monmap', '/tmp/cephtest/monmap',
                    ],
                )
    run.wait(
        mons.run(
            args=[
                'rm',
                '--',
                '/tmp/cephtest/monmap',
                '/tmp/cephtest/osdmap',
                ],
            wait=False,
            ),
        )

    try:
        yield
    finally:
        (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
        if ctx.archive is not None:
            log.info('Grabbing cluster log from %s %s...' % (mon0_remote,
                                                             firstmon))
            dest = os.path.join(ctx.archive, 'ceph.log')
            mon0_remote.run(
                args = [
                    'cat',
                    '--',
                    '/tmp/cephtest/data/%s/log' % firstmon
                    ],
                stdout=file(dest, 'wb'),
                )

        log.info('Checking cluster ceph.log for badness...')
        def first_in_ceph_log(pattern, excludes):
            args = [
                'egrep', pattern,
                '/tmp/cephtest/data/%s/log' % firstmon,
                ]
            for exclude in excludes:
                args.extend([run.Raw('|'), 'egrep', '-v', exclude])
            args.extend([
                    run.Raw('|'), 'head', '-n', '1',
                    ])
            r = mon0_remote.run(
                stdout=StringIO(),
                args=args,
                )
            stdout = r.stdout.getvalue()
            if stdout != '':
                return stdout
            return None

        if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]',
                             config['log_whitelist']) is not None:
            log.warning('Found errors (ERR|WRN|SEC) in cluster log')
            ctx.summary['success'] = False
            # use the most severe problem as the failure reason
            if 'failure_reason' not in ctx.summary:
                for pattern in ['\[SEC\]', '\[ERR\]', '\[WRN\]']:
                    match = first_in_ceph_log(pattern, config['log_whitelist'])
                    if match is not None:
                        ctx.summary['failure_reason'] = \
                            '"{match}" in cluster log'.format(
                            match=match.rstrip('\n'),
                            )
                        break

        for remote, dirs in devs_to_clean.iteritems():
            for dir_ in dirs:
                log.info('Unmounting %s on %s' % (dir_, remote))
                remote.run(
                    args=[
                        "sudo",
                        "umount",
                        "-f",
                        dir_
                        ]
                    )

        log.info('Cleaning ceph cluster...')
        run.wait(
            ctx.cluster.run(
                args=[
                    'rm',
                    '-rf',
                    '--',
                    '/tmp/cephtest/ceph.conf',
                    '/tmp/cephtest/ceph.keyring',
                    '/tmp/cephtest/data',
                    '/tmp/cephtest/monmap',
                    run.Raw('/tmp/cephtest/asok.*')
                    ],
                wait=False,
                ),
            )
示例#21
0
def cluster(ctx, config):
    """
    Handle the creation and removal of a ceph cluster.

    On startup:
        Create directories needed for the cluster.
        Create remote journals for all osds.
        Create and set keyring.
        Copy the monmap to tht test systems.
        Setup mon nodes.
        Setup mds nodes.
        Mkfs osd nodes.
        Add keyring information to monmaps
        Mkfs mon nodes.

    On exit:
        If errors occured, extract a failure message and store in ctx.summary.
        Unmount all test files and temporary journaling files.
        Save the monitor information and archive all ceph logs.
        Cleanup the keyring setup, and remove all monitor map and data files left over.

    :param ctx: Context
    :param config: Configuration
    """
    if ctx.config.get("use_existing_cluster", False) is True:
        log.info("'use_existing_cluster' is true; skipping cluster creation")
        yield

    testdir = teuthology.get_testdir(ctx)
    cluster_name = config["cluster"]
    data_dir = "{tdir}/{cluster}.data".format(tdir=testdir, cluster=cluster_name)
    log.info("Creating ceph cluster %s...", cluster_name)
    run.wait(ctx.cluster.run(args=["install", "-d", "-m0755", "--", data_dir], wait=False))

    run.wait(ctx.cluster.run(args=["sudo", "install", "-d", "-m0777", "--", "/var/run/ceph"], wait=False))

    devs_to_clean = {}
    remote_to_roles_to_devs = {}
    remote_to_roles_to_journals = {}
    osds = ctx.cluster.only(teuthology.is_type("osd", cluster_name))
    for remote, roles_for_host in osds.remotes.iteritems():
        devs = teuthology.get_scratch_devices(remote)
        roles_to_devs = {}
        roles_to_journals = {}
        if config.get("fs"):
            log.info("fs option selected, checking for scratch devs")
            log.info("found devs: %s" % (str(devs),))
            devs_id_map = teuthology.get_wwn_id_map(remote, devs)
            iddevs = devs_id_map.values()
            roles_to_devs = assign_devs(teuthology.cluster_roles_of_type(roles_for_host, "osd", cluster_name), iddevs)
            if len(roles_to_devs) < len(iddevs):
                iddevs = iddevs[len(roles_to_devs) :]
            devs_to_clean[remote] = []

        if config.get("block_journal"):
            log.info("block journal enabled")
            roles_to_journals = assign_devs(
                teuthology.cluster_roles_of_type(roles_for_host, "osd", cluster_name), iddevs
            )
            log.info("journal map: %s", roles_to_journals)

        if config.get("tmpfs_journal"):
            log.info("tmpfs journal enabled")
            roles_to_journals = {}
            remote.run(args=["sudo", "mount", "-t", "tmpfs", "tmpfs", "/mnt"])
            for role in teuthology.cluster_roles_of_type(roles_for_host, "osd", cluster_name):
                tmpfs = "/mnt/" + role
                roles_to_journals[role] = tmpfs
                remote.run(args=["truncate", "-s", "1500M", tmpfs])
            log.info("journal map: %s", roles_to_journals)

        log.info("dev map: %s" % (str(roles_to_devs),))
        remote_to_roles_to_devs[remote] = roles_to_devs
        remote_to_roles_to_journals[remote] = roles_to_journals

    log.info("Generating config...")
    remotes_and_roles = ctx.cluster.remotes.items()
    roles = [role_list for (remote, role_list) in remotes_and_roles]
    ips = [
        host for (host, port) in (remote.ssh.get_transport().getpeername() for (remote, role_list) in remotes_and_roles)
    ]
    conf = teuthology.skeleton_config(ctx, roles=roles, ips=ips, cluster=cluster_name)
    for remote, roles_to_journals in remote_to_roles_to_journals.iteritems():
        for role, journal in roles_to_journals.iteritems():
            name = teuthology.ceph_role(role)
            if name not in conf:
                conf[name] = {}
            conf[name]["osd journal"] = journal
    for section, keys in config["conf"].iteritems():
        for key, value in keys.iteritems():
            log.info("[%s] %s = %s" % (section, key, value))
            if section not in conf:
                conf[section] = {}
            conf[section][key] = value

    if config.get("tmpfs_journal"):
        conf["journal dio"] = False

    if not hasattr(ctx, "ceph"):
        ctx.ceph = {}
    ctx.ceph[cluster_name] = argparse.Namespace()
    ctx.ceph[cluster_name].conf = conf

    default_keyring = "/etc/ceph/{cluster}.keyring".format(cluster=cluster_name)
    keyring_path = config.get("keyring_path", default_keyring)

    coverage_dir = "{tdir}/archive/coverage".format(tdir=testdir)

    firstmon = teuthology.get_first_mon(ctx, config, cluster_name)

    log.info("Setting up %s..." % firstmon)
    ctx.cluster.only(firstmon).run(
        args=[
            "sudo",
            "adjust-ulimits",
            "ceph-coverage",
            coverage_dir,
            "ceph-authtool",
            "--create-keyring",
            keyring_path,
        ]
    )
    ctx.cluster.only(firstmon).run(
        args=[
            "sudo",
            "adjust-ulimits",
            "ceph-coverage",
            coverage_dir,
            "ceph-authtool",
            "--gen-key",
            "--name=mon.",
            keyring_path,
        ]
    )
    ctx.cluster.only(firstmon).run(args=["sudo", "chmod", "0644", keyring_path])
    (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
    monmap_path = "{tdir}/{cluster}.monmap".format(tdir=testdir, cluster=cluster_name)
    fsid = teuthology.create_simple_monmap(ctx, remote=mon0_remote, conf=conf, path=monmap_path)
    if not "global" in conf:
        conf["global"] = {}
    conf["global"]["fsid"] = fsid

    default_conf_path = "/etc/ceph/{cluster}.conf".format(cluster=cluster_name)
    conf_path = config.get("conf_path", default_conf_path)
    log.info("Writing %s for FSID %s..." % (conf_path, fsid))
    write_conf(ctx, conf_path, cluster_name)

    log.info("Creating admin key on %s..." % firstmon)
    ctx.cluster.only(firstmon).run(
        args=[
            "sudo",
            "adjust-ulimits",
            "ceph-coverage",
            coverage_dir,
            "ceph-authtool",
            "--gen-key",
            "--name=client.admin",
            "--set-uid=0",
            "--cap",
            "mon",
            "allow *",
            "--cap",
            "osd",
            "allow *",
            "--cap",
            "mds",
            "allow *",
            keyring_path,
        ]
    )

    log.info("Copying monmap to all nodes...")
    keyring = teuthology.get_file(remote=mon0_remote, path=keyring_path)
    monmap = teuthology.get_file(remote=mon0_remote, path=monmap_path)

    for rem in ctx.cluster.remotes.iterkeys():
        # copy mon key and initial monmap
        log.info("Sending monmap to node {remote}".format(remote=rem))
        teuthology.sudo_write_file(remote=rem, path=keyring_path, data=keyring, perms="0644")
        teuthology.write_file(remote=rem, path=monmap_path, data=monmap)

    log.info("Setting up mon nodes...")
    mons = ctx.cluster.only(teuthology.is_type("mon", cluster_name))
    osdmap_path = "{tdir}/{cluster}.osdmap".format(tdir=testdir, cluster=cluster_name)
    run.wait(
        mons.run(
            args=[
                "adjust-ulimits",
                "ceph-coverage",
                coverage_dir,
                "osdmaptool",
                "-c",
                conf_path,
                "--clobber",
                "--createsimple",
                "{num:d}".format(num=teuthology.num_instances_of_type(ctx.cluster, "osd", cluster_name)),
                osdmap_path,
                "--pg_bits",
                "2",
                "--pgp_bits",
                "4",
            ],
            wait=False,
        )
    )

    log.info("Setting up mgr nodes...")
    mgrs = ctx.cluster.only(teuthology.is_type("mgr", cluster_name))
    for remote, roles_for_host in mgrs.remotes.iteritems():
        for role in teuthology.cluster_roles_of_type(roles_for_host, "mgr", cluster_name):
            _, _, id_ = teuthology.split_role(role)
            mgr_dir = "/var/lib/ceph/mgr/{cluster}-{id}".format(cluster=cluster_name, id=id_)
            remote.run(
                args=[
                    "sudo",
                    "mkdir",
                    "-p",
                    mgr_dir,
                    run.Raw("&&"),
                    "sudo",
                    "adjust-ulimits",
                    "ceph-coverage",
                    coverage_dir,
                    "ceph-authtool",
                    "--create-keyring",
                    "--gen-key",
                    "--name=mgr.{id}".format(id=id_),
                    mgr_dir + "/keyring",
                ]
            )

    log.info("Setting up mds nodes...")
    mdss = ctx.cluster.only(teuthology.is_type("mds", cluster_name))
    for remote, roles_for_host in mdss.remotes.iteritems():
        for role in teuthology.cluster_roles_of_type(roles_for_host, "mds", cluster_name):
            _, _, id_ = teuthology.split_role(role)
            mds_dir = "/var/lib/ceph/mds/{cluster}-{id}".format(cluster=cluster_name, id=id_)
            remote.run(
                args=[
                    "sudo",
                    "mkdir",
                    "-p",
                    mds_dir,
                    run.Raw("&&"),
                    "sudo",
                    "adjust-ulimits",
                    "ceph-coverage",
                    coverage_dir,
                    "ceph-authtool",
                    "--create-keyring",
                    "--gen-key",
                    "--name=mds.{id}".format(id=id_),
                    mds_dir + "/keyring",
                ]
            )

    cclient.create_keyring(ctx, cluster_name)
    log.info("Running mkfs on osd nodes...")

    if not hasattr(ctx, "disk_config"):
        ctx.disk_config = argparse.Namespace()
    if not hasattr(ctx.disk_config, "remote_to_roles_to_dev"):
        ctx.disk_config.remote_to_roles_to_dev = {}
    if not hasattr(ctx.disk_config, "remote_to_roles_to_journals"):
        ctx.disk_config.remote_to_roles_to_journals = {}
    if not hasattr(ctx.disk_config, "remote_to_roles_to_dev_mount_options"):
        ctx.disk_config.remote_to_roles_to_dev_mount_options = {}
    if not hasattr(ctx.disk_config, "remote_to_roles_to_dev_fstype"):
        ctx.disk_config.remote_to_roles_to_dev_fstype = {}

    teuthology.deep_merge(ctx.disk_config.remote_to_roles_to_dev, remote_to_roles_to_devs)
    teuthology.deep_merge(ctx.disk_config.remote_to_roles_to_journals, remote_to_roles_to_journals)

    log.info("ctx.disk_config.remote_to_roles_to_dev: {r}".format(r=str(ctx.disk_config.remote_to_roles_to_dev)))
    for remote, roles_for_host in osds.remotes.iteritems():
        roles_to_devs = remote_to_roles_to_devs[remote]
        roles_to_journals = remote_to_roles_to_journals[remote]

        for role in teuthology.cluster_roles_of_type(roles_for_host, "osd", cluster_name):
            _, _, id_ = teuthology.split_role(role)
            mnt_point = "/var/lib/ceph/osd/{cluster}-{id}".format(cluster=cluster_name, id=id_)
            remote.run(args=["sudo", "mkdir", "-p", mnt_point])
            log.info(str(roles_to_journals))
            log.info(role)
            if roles_to_devs.get(role):
                dev = roles_to_devs[role]
                fs = config.get("fs")
                package = None
                mkfs_options = config.get("mkfs_options")
                mount_options = config.get("mount_options")
                if fs == "btrfs":
                    # package = 'btrfs-tools'
                    if mount_options is None:
                        mount_options = ["noatime", "user_subvol_rm_allowed"]
                    if mkfs_options is None:
                        mkfs_options = ["-m", "single", "-l", "32768", "-n", "32768"]
                if fs == "xfs":
                    # package = 'xfsprogs'
                    if mount_options is None:
                        mount_options = ["noatime"]
                    if mkfs_options is None:
                        mkfs_options = ["-f", "-i", "size=2048"]
                if fs == "ext4" or fs == "ext3":
                    if mount_options is None:
                        mount_options = ["noatime", "user_xattr"]

                if mount_options is None:
                    mount_options = []
                if mkfs_options is None:
                    mkfs_options = []
                mkfs = ["mkfs.%s" % fs] + mkfs_options
                log.info("%s on %s on %s" % (mkfs, dev, remote))
                if package is not None:
                    remote.run(args=["sudo", "apt-get", "install", "-y", package], stdout=StringIO())

                try:
                    remote.run(args=["yes", run.Raw("|")] + ["sudo"] + mkfs + [dev])
                except run.CommandFailedError:
                    # Newer btfs-tools doesn't prompt for overwrite, use -f
                    if "-f" not in mount_options:
                        mkfs_options.append("-f")
                        mkfs = ["mkfs.%s" % fs] + mkfs_options
                        log.info("%s on %s on %s" % (mkfs, dev, remote))
                    remote.run(args=["yes", run.Raw("|")] + ["sudo"] + mkfs + [dev])

                log.info("mount %s on %s -o %s" % (dev, remote, ",".join(mount_options)))
                remote.run(args=["sudo", "mount", "-t", fs, "-o", ",".join(mount_options), dev, mnt_point])
                if not remote in ctx.disk_config.remote_to_roles_to_dev_mount_options:
                    ctx.disk_config.remote_to_roles_to_dev_mount_options[remote] = {}
                ctx.disk_config.remote_to_roles_to_dev_mount_options[remote][role] = mount_options
                if not remote in ctx.disk_config.remote_to_roles_to_dev_fstype:
                    ctx.disk_config.remote_to_roles_to_dev_fstype[remote] = {}
                ctx.disk_config.remote_to_roles_to_dev_fstype[remote][role] = fs
                devs_to_clean[remote].append(mnt_point)

        for role in teuthology.cluster_roles_of_type(roles_for_host, "osd", cluster_name):
            _, _, id_ = teuthology.split_role(role)
            remote.run(
                args=[
                    "sudo",
                    "MALLOC_CHECK_=3",
                    "adjust-ulimits",
                    "ceph-coverage",
                    coverage_dir,
                    "ceph-osd",
                    "--cluster",
                    cluster_name,
                    "--mkfs",
                    "--mkkey",
                    "-i",
                    id_,
                    "--monmap",
                    monmap_path,
                ]
            )

    log.info("Reading keys from all nodes...")
    keys_fp = StringIO()
    keys = []
    for remote, roles_for_host in ctx.cluster.remotes.iteritems():
        for type_ in ["mgr", "mds", "osd"]:
            for role in teuthology.cluster_roles_of_type(roles_for_host, type_, cluster_name):
                _, _, id_ = teuthology.split_role(role)
                data = teuthology.get_file(
                    remote=remote,
                    path="/var/lib/ceph/{type}/{cluster}-{id}/keyring".format(type=type_, id=id_, cluster=cluster_name),
                    sudo=True,
                )
                keys.append((type_, id_, data))
                keys_fp.write(data)
    for remote, roles_for_host in ctx.cluster.remotes.iteritems():
        for role in teuthology.cluster_roles_of_type(roles_for_host, "client", cluster_name):
            _, _, id_ = teuthology.split_role(role)
            data = teuthology.get_file(
                remote=remote, path="/etc/ceph/{cluster}.client.{id}.keyring".format(id=id_, cluster=cluster_name)
            )
            keys.append(("client", id_, data))
            keys_fp.write(data)

    log.info("Adding keys to all mons...")
    writes = mons.run(args=["sudo", "tee", "-a", keyring_path], stdin=run.PIPE, wait=False, stdout=StringIO())
    keys_fp.seek(0)
    teuthology.feed_many_stdins_and_close(keys_fp, writes)
    run.wait(writes)
    for type_, id_, data in keys:
        run.wait(
            mons.run(
                args=[
                    "sudo",
                    "adjust-ulimits",
                    "ceph-coverage",
                    coverage_dir,
                    "ceph-authtool",
                    keyring_path,
                    "--name={type}.{id}".format(type=type_, id=id_),
                ]
                + list(generate_caps(type_)),
                wait=False,
            )
        )

    log.info("Running mkfs on mon nodes...")
    for remote, roles_for_host in mons.remotes.iteritems():
        for role in teuthology.cluster_roles_of_type(roles_for_host, "mon", cluster_name):
            _, _, id_ = teuthology.split_role(role)
            remote.run(
                args=["sudo", "mkdir", "-p", "/var/lib/ceph/mon/{cluster}-{id}".format(id=id_, cluster=cluster_name)]
            )
            remote.run(
                args=[
                    "sudo",
                    "adjust-ulimits",
                    "ceph-coverage",
                    coverage_dir,
                    "ceph-mon",
                    "--cluster",
                    cluster_name,
                    "--mkfs",
                    "-i",
                    id_,
                    "--monmap",
                    monmap_path,
                    "--osdmap",
                    osdmap_path,
                    "--keyring",
                    keyring_path,
                ]
            )

    run.wait(mons.run(args=["rm", "--", monmap_path, osdmap_path], wait=False))

    try:
        yield
    except Exception:
        # we need to know this below
        ctx.summary["success"] = False
        raise
    finally:
        (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()

        log.info("Checking cluster log for badness...")

        def first_in_ceph_log(pattern, excludes):
            """
            Find the first occurence of the pattern specified in the Ceph log,
            Returns None if none found.

            :param pattern: Pattern scanned for.
            :param excludes: Patterns to ignore.
            :return: First line of text (or None if not found)
            """
            args = ["sudo", "egrep", pattern, "/var/log/ceph/{cluster}.log".format(cluster=cluster_name)]
            for exclude in excludes:
                args.extend([run.Raw("|"), "egrep", "-v", exclude])
            args.extend([run.Raw("|"), "head", "-n", "1"])
            r = mon0_remote.run(stdout=StringIO(), args=args)
            stdout = r.stdout.getvalue()
            if stdout != "":
                return stdout
            return None

        if first_in_ceph_log("\[ERR\]|\[WRN\]|\[SEC\]", config["log_whitelist"]) is not None:
            log.warning("Found errors (ERR|WRN|SEC) in cluster log")
            ctx.summary["success"] = False
            # use the most severe problem as the failure reason
            if "failure_reason" not in ctx.summary:
                for pattern in ["\[SEC\]", "\[ERR\]", "\[WRN\]"]:
                    match = first_in_ceph_log(pattern, config["log_whitelist"])
                    if match is not None:
                        ctx.summary["failure_reason"] = '"{match}" in cluster log'.format(match=match.rstrip("\n"))
                        break

        for remote, dirs in devs_to_clean.iteritems():
            for dir_ in dirs:
                log.info("Unmounting %s on %s" % (dir_, remote))
                try:
                    remote.run(args=["sync", run.Raw("&&"), "sudo", "umount", "-f", dir_])
                except Exception as e:
                    remote.run(args=["sudo", run.Raw("PATH=/usr/sbin:$PATH"), "lsof", run.Raw(";"), "ps", "auxf"])
                    raise e

        if config.get("tmpfs_journal"):
            log.info("tmpfs journal enabled - unmounting tmpfs at /mnt")
            for remote, roles_for_host in osds.remotes.iteritems():
                remote.run(args=["sudo", "umount", "-f", "/mnt"], check_status=False)

        if ctx.archive is not None and not (ctx.config.get("archive-on-error") and ctx.summary["success"]):

            # archive mon data, too
            log.info("Archiving mon data...")
            path = os.path.join(ctx.archive, "data")
            try:
                os.makedirs(path)
            except OSError as e:
                if e.errno == errno.EEXIST:
                    pass
                else:
                    raise
            for remote, roles in mons.remotes.iteritems():
                for role in roles:
                    is_mon = teuthology.is_type("mon", cluster_name)
                    if is_mon(role):
                        _, _, id_ = teuthology.split_role(role)
                        mon_dir = "/var/lib/ceph/mon/" + "{0}-{1}".format(cluster_name, id_)
                        teuthology.pull_directory_tarball(remote, mon_dir, path + "/" + role + ".tgz")

        log.info("Cleaning ceph cluster...")
        run.wait(
            ctx.cluster.run(
                args=[
                    "sudo",
                    "rm",
                    "-rf",
                    "--",
                    conf_path,
                    keyring_path,
                    data_dir,
                    monmap_path,
                    osdmap_path,
                    run.Raw("{tdir}/../*.pid".format(tdir=testdir)),
                ],
                wait=False,
            )
        )
示例#22
0
def task(ctx, config):
    """
    Test backfill
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'thrashosds task only accepts a dict for configuration'
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon, ) = ctx.cluster.only(first_mon).remotes.iterkeys()

    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
    log.info('num_osds is %s' % num_osds)
    assert num_osds == 3

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
    )

    while len(manager.get_osd_status()['up']) < 3:
        manager.sleep(10)
    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
    manager.wait_for_clean()

    # write some data
    p = rados_start(
        ctx, mon,
        ['-p', 'rbd', 'bench', '15', 'write', '-b', '4096', '--no-cleanup'])
    err = p.wait()
    log.info('err is %d' % err)

    # mark osd.0 out to trigger a rebalance/backfill
    manager.mark_out_osd(0)

    # also mark it down to it won't be included in pg_temps
    manager.kill_osd(0)
    manager.mark_down_osd(0)

    # wait for everything to peer and be happy...
    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
    manager.wait_for_recovery()

    # write some new data
    p = rados_start(
        ctx, mon,
        ['-p', 'rbd', 'bench', '30', 'write', '-b', '4096', '--no-cleanup'])

    time.sleep(15)

    # blackhole + restart osd.1
    # this triggers a divergent backfill target
    manager.blackhole_kill_osd(1)
    time.sleep(2)
    manager.revive_osd(1)

    # wait for our writes to complete + succeed
    err = p.wait()
    log.info('err is %d' % err)

    # cluster must recover
    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
    manager.wait_for_recovery()

    # re-add osd.0
    manager.revive_osd(0)
    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
    manager.wait_for_clean()
示例#23
0
def task(ctx, config):
    """
    Test [deep] scrub
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'scrub_test task only accepts a dict for configuration'
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
    
    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
    log.info('num_osds is %s' % num_osds)

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

    while len(manager.get_osd_status()['up']) < num_osds:
        time.sleep(10)

    for i in range(num_osds):
        manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'flush_pg_stats')
    manager.wait_for_clean()

    # write some data
    p = rados_start(mon, ['-p', 'rbd', 'bench', '1', 'write', '-b', '4096'])
    err = p.exitstatus
    log.info('err is %d' % err)

    # wait for some PG to have data that we can mess with
    victim = None
    osd = None
    while victim is None:
        stats = manager.get_pg_stats()
        for pg in stats:
            size = pg['stat_sum']['num_bytes']
            if size > 0:
                victim = pg['pgid']
                osd = pg['acting'][0]
                break

        if victim is None:
            time.sleep(3)

    log.info('messing with PG %s on osd %d' % (victim, osd))


    (osd_remote,) = ctx.cluster.only('osd.%d' % osd).remotes.iterkeys()
    data_path = os.path.join('/tmp/cephtest/data',
                             'osd.{id}.data'.format(id=osd),
                             'current',
                             '{pg}_head'.format(pg=victim)
                            )


    # fuzz time
    ls_fp = StringIO()
    osd_remote.run(
        args=[ 'ls', data_path ],
        stdout=ls_fp,
    )
    ls_out = ls_fp.getvalue()
    ls_fp.close()

    # find an object file we can mess with
    file = None
    for line in ls_out.split('\n'):
        if line.find('object'):
            file = line
            break
    assert file is not None

    log.info('fuzzing %s' % file)

    # put a single \0 at the beginning of the file
    osd_remote.run(
        args=[ 'dd',
               'if=/dev/zero',
               'of=%s' % os.path.join(data_path, file),
               'bs=1', 'count=1', 'conv=notrunc'
             ]
    )

    # scrub, verify inconsistent
    manager.raw_cluster_cmd('pg', 'deep-scrub', victim)

    while True:
        stats = manager.get_single_pg_stats(victim)
        state = stats['state']

        # wait for the scrub to finish
        if state.find('scrubbing'):
            time.sleep(3)
            continue

        inconsistent = stats['state'].find('+inconsistent') != -1
        assert inconsistent
        break


    # repair, verify no longer inconsistent
    manager.raw_cluster_cmd('pg', 'repair', victim)

    while True:
        stats = manager.get_single_pg_stats(victim)
        state = stats['state']

        # wait for the scrub to finish
        if state.find('scrubbing'):
            time.sleep(3)
            continue

        inconsistent = stats['state'].find('+inconsistent') != -1
        assert not inconsistent
        break

    log.info('test successful!')
示例#24
0
def test_incomplete_pgs(ctx, config):
    """
    Test handling of incomplete pgs.  Requires 4 osds.
    """
    testdir = teuthology.get_testdir(ctx)
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'task only accepts a dict for configuration'
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()

    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
    log.info('num_osds is %s' % num_osds)
    assert num_osds == 4

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

    while len(manager.get_osd_status()['up']) < 4:
        time.sleep(10)

    manager.flush_pg_stats([0, 1, 2, 3])
    manager.wait_for_clean()

    log.info('Testing incomplete pgs...')

    for i in range(4):
        manager.set_config(
            i,
            osd_recovery_delay_start=1000)

    # move data off of osd.0, osd.1
    manager.raw_cluster_cmd('osd', 'out', '0', '1')
    manager.flush_pg_stats([0, 1, 2, 3], [0, 1])
    manager.wait_for_clean()

    # lots of objects in rbd (no pg log, will backfill)
    p = rados_start(testdir, mon,
                    ['-p', 'rbd', 'bench', '20', 'write', '-b', '1',
                     '--no-cleanup'])
    p.wait()

    # few objects in rbd pool (with pg log, normal recovery)
    for f in range(1, 20):
        p = rados_start(testdir, mon, ['-p', 'rbd', 'put',
                              'foo.%d' % f, '/etc/passwd'])
        p.wait()

    # move it back
    manager.raw_cluster_cmd('osd', 'in', '0', '1')
    manager.raw_cluster_cmd('osd', 'out', '2', '3')
    time.sleep(10)
    manager.flush_pg_stats([0, 1, 2, 3], [2, 3])
    time.sleep(10)
    manager.wait_for_active()

    assert not manager.is_clean()
    assert not manager.is_recovered()

    # kill 2 + 3
    log.info('stopping 2,3')
    manager.kill_osd(2)
    manager.kill_osd(3)
    log.info('...')
    manager.raw_cluster_cmd('osd', 'down', '2', '3')
    manager.flush_pg_stats([0, 1])
    manager.wait_for_active_or_down()

    assert manager.get_num_down() > 0

    # revive 2 + 3
    manager.revive_osd(2)
    manager.revive_osd(3)
    while len(manager.get_osd_status()['up']) < 4:
        log.info('waiting a bit...')
        time.sleep(2)
    log.info('all are up!')

    for i in range(4):
        manager.kick_recovery_wq(i)

    # cluster must recover
    manager.wait_for_clean()
示例#25
0
def task(ctx, config):
    """
    Execute a radosbench parameter sweep

    Puts radosbench in a loop, taking values from the given config at each
    iteration. If given, the min and max values below create a range, e.g.
    min_replicas=1 and max_replicas=3 implies executing with 1-3 replicas.

    Parameters:

        clients: [client list]
        time: seconds to run (default=120)
        sizes: [list of object sizes] (default=[4M])
        mode: <write|read|seq> (default=write)
        repetitions: execute the same configuration multiple times (default=1)
        min_num_replicas: minimum number of replicas to use (default = 3)
        max_num_replicas: maximum number of replicas to use (default = 3)
        min_num_osds: the minimum number of OSDs in a pool (default=all)
        max_num_osds: the maximum number of OSDs in a pool (default=all)
        file: name of CSV-formatted output file (default='radosbench.csv')
        columns: columns to include (default=all)
          - rep: execution number (takes values from 'repetitions')
          - num_osd: number of osds for pool
          - num_replica: number of replicas
          - avg_throughput: throughput
          - avg_latency: latency
          - stdev_throughput:
          - stdev_latency:

    Example:
    - radsobenchsweep:
        columns: [rep, num_osd, num_replica, avg_throughput, stdev_throughput]
    """
    log.info('Beginning radosbenchsweep...')
    assert isinstance(config, dict), 'expecting dictionary for configuration'

    # get and validate config values
    # {

    # only one client supported for now
    if len(config.get('clients', [])) != 1:
        raise Exception("Only one client can be specified")

    # only write mode
    if config.get('mode', 'write') != 'write':
        raise Exception("Only 'write' mode supported for now.")

    # OSDs
    total_osds_in_cluster = teuthology.num_instances_of_type(ctx.cluster, 'osd')
    min_num_osds = config.get('min_num_osds', total_osds_in_cluster)
    max_num_osds = config.get('max_num_osds', total_osds_in_cluster)

    if max_num_osds > total_osds_in_cluster:
        raise Exception('max_num_osds cannot be greater than total in cluster')
    if min_num_osds < 1:
        raise Exception('min_num_osds cannot be less than 1')
    if min_num_osds > max_num_osds:
        raise Exception('min_num_osds cannot be greater than max_num_osd')
    osds = range(0, (total_osds_in_cluster + 1))

    # replicas
    min_num_replicas = config.get('min_num_replicas', 3)
    max_num_replicas = config.get('max_num_replicas', 3)

    if min_num_replicas < 1:
        raise Exception('min_num_replicas cannot be less than 1')
    if min_num_replicas > max_num_replicas:
        raise Exception('min_num_replicas cannot be greater than max_replicas')
    if max_num_replicas > max_num_osds:
        raise Exception('max_num_replicas cannot be greater than max_num_osds')
    replicas = range(min_num_replicas, (max_num_replicas + 1))

    # object size
    sizes = config.get('size', [4 << 20])

    # repetitions
    reps = range(config.get('repetitions', 1))

    # file
    fname = config.get('file', 'radosbench.csv')
    f = open('{}/{}'.format(ctx.archive, fname), 'w')
    f.write(get_csv_header(config) + '\n')
    # }

    # set default pools size=1 to avoid 'unhealthy' issues
    ctx.manager.set_pool_property('data', 'size', 1)
    ctx.manager.set_pool_property('metadata', 'size', 1)
    ctx.manager.set_pool_property('rbd', 'size', 1)

    current_osds_out = 0

    # sweep through all parameters
    for osds_out, size, replica, rep in product(osds, sizes, replicas, reps):

        osds_in = total_osds_in_cluster - osds_out

        if osds_in == 0:
            # we're done
            break

        if current_osds_out != osds_out:
            # take an osd out
            ctx.manager.raw_cluster_cmd(
                'osd', 'reweight', str(osds_out-1), '0.0')
            wait_until_healthy(ctx, config)
            current_osds_out = osds_out

        if osds_in not in range(min_num_osds, (max_num_osds + 1)):
            # no need to execute with a number of osds that wasn't requested
            continue

        if osds_in < replica:
            # cannot execute with more replicas than available osds
            continue

        run_radosbench(ctx, config, f, osds_in, size, replica, rep)

    f.close()

    yield
示例#26
0
def cluster(ctx, config):
    log.info('Creating ceph cluster...')
    run.wait(
        ctx.cluster.run(
            args=[
                'install', '-d', '-m0755', '--',
                '/tmp/cephtest/data',
                ],
            wait=False,
            )
        )


    devs_to_clean = {}
    remote_to_roles_to_devs = {}
    remote_to_roles_to_journals = {}
    osds = ctx.cluster.only(teuthology.is_type('osd'))
    for remote, roles_for_host in osds.remotes.iteritems():
        devs = teuthology.get_scratch_devices(remote)
        roles_to_devs = {}
        roles_to_journals = {}
        if config.get('fs'):
            log.info('fs option selected, checkin for scratch devs')
            log.info('found devs: %s' % (str(devs),))
            roles_to_devs = assign_devs(
                teuthology.roles_of_type(roles_for_host, 'osd'), devs
                )
            if len(roles_to_devs) < len(devs):
                devs = devs[len(roles_to_devs):]
            log.info('dev map: %s' % (str(roles_to_devs),))
            devs_to_clean[remote] = []
            
        if config.get('block_journal'):
            log.info('block journal enabled')
            roles_to_journals = assign_devs(
                teuthology.roles_of_type(roles_for_host, 'osd'), devs
                )
            log.info('journal map: %s', roles_to_journals)

        if config.get('tmpfs_journal'):
            log.info('tmpfs journal enabled')
            roles_to_journals = {}
            remote.run( args=[ 'sudo', 'mount', '-t', 'tmpfs', 'tmpfs', '/mnt' ] )
            for osd in teuthology.roles_of_type(roles_for_host, 'osd'):
                tmpfs = '/mnt/osd.%s' % osd
                roles_to_journals[osd] = tmpfs
                remote.run( args=[ 'truncate', '-s', '1500M', tmpfs ] )
            log.info('journal map: %s', roles_to_journals)

        remote_to_roles_to_devs[remote] = roles_to_devs
        remote_to_roles_to_journals[remote] = roles_to_journals


    log.info('Generating config...')
    remotes_and_roles = ctx.cluster.remotes.items()
    roles = [roles for (remote, roles) in remotes_and_roles]
    ips = [host for (host, port) in (remote.ssh.get_transport().getpeername() for (remote, roles) in remotes_and_roles)]
    conf = teuthology.skeleton_config(roles=roles, ips=ips)
    for remote, roles_to_journals in remote_to_roles_to_journals.iteritems():
        for role, journal in roles_to_journals.iteritems():
            key = "osd." + str(role)
            if key not in conf:
                conf[key] = {}
            conf[key]['osd journal'] = journal
    for section, keys in config['conf'].iteritems():
        for key, value in keys.iteritems():
            log.info("[%s] %s = %s" % (section, key, value))
            if section not in conf:
                conf[section] = {}
            conf[section][key] = value

    if config.get('tmpfs_journal'):
        conf['journal dio'] = False

    ctx.ceph = argparse.Namespace()
    ctx.ceph.conf = conf

    log.info('Writing configs...')
    conf_fp = StringIO()
    conf.write(conf_fp)
    conf_fp.seek(0)
    writes = ctx.cluster.run(
        args=[
            'python',
            '-c',
            'import shutil, sys; shutil.copyfileobj(sys.stdin, file(sys.argv[1], "wb"))',
            '/tmp/cephtest/ceph.conf',
            ],
        stdin=run.PIPE,
        wait=False,
        )
    teuthology.feed_many_stdins_and_close(conf_fp, writes)
    run.wait(writes)

    coverage_dir = '/tmp/cephtest/archive/coverage'

    firstmon = teuthology.get_first_mon(ctx, config)

    log.info('Setting up %s...' % firstmon)
    ctx.cluster.only(firstmon).run(
        args=[
            '/tmp/cephtest/enable-coredump',
            '/tmp/cephtest/binary/usr/local/bin/ceph-coverage',
            coverage_dir,
            '/tmp/cephtest/binary/usr/local/bin/ceph-authtool',
            '--create-keyring',
            '/tmp/cephtest/ceph.keyring',
            ],
        )
    ctx.cluster.only(firstmon).run(
        args=[
            '/tmp/cephtest/enable-coredump',
            '/tmp/cephtest/binary/usr/local/bin/ceph-coverage',
            coverage_dir,
            '/tmp/cephtest/binary/usr/local/bin/ceph-authtool',
            '--gen-key',
            '--name=mon.',
            '/tmp/cephtest/ceph.keyring',
            ],
        )
    (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
    teuthology.create_simple_monmap(
        remote=mon0_remote,
        conf=conf,
        )

    log.info('Creating admin key on %s...' % firstmon)
    ctx.cluster.only(firstmon).run(
        args=[
            '/tmp/cephtest/enable-coredump',
            '/tmp/cephtest/binary/usr/local/bin/ceph-coverage',
            coverage_dir,
            '/tmp/cephtest/binary/usr/local/bin/ceph-authtool',
            '--gen-key',
            '--name=client.admin',
            '--set-uid=0',
            '--cap', 'mon', 'allow *',
            '--cap', 'osd', 'allow *',
            '--cap', 'mds', 'allow',
            '/tmp/cephtest/ceph.keyring',
            ],
        )

    log.info('Copying monmap to all nodes...')
    keyring = teuthology.get_file(
        remote=mon0_remote,
        path='/tmp/cephtest/ceph.keyring',
        )
    monmap = teuthology.get_file(
        remote=mon0_remote,
        path='/tmp/cephtest/monmap',
        )

    for rem in ctx.cluster.remotes.iterkeys():
        # copy mon key and initial monmap
        log.info('Sending monmap to node {remote}'.format(remote=rem))
        teuthology.write_file(
            remote=rem,
            path='/tmp/cephtest/ceph.keyring',
            data=keyring,
            )
        teuthology.write_file(
            remote=rem,
            path='/tmp/cephtest/monmap',
            data=monmap,
            )

    log.info('Setting up mon nodes...')
    mons = ctx.cluster.only(teuthology.is_type('mon'))
    run.wait(
        mons.run(
            args=[
                '/tmp/cephtest/enable-coredump',
                '/tmp/cephtest/binary/usr/local/bin/ceph-coverage',
                coverage_dir,
                '/tmp/cephtest/binary/usr/local/bin/osdmaptool',
                '--clobber',
                '--createsimple', '{num:d}'.format(
                    num=teuthology.num_instances_of_type(ctx.cluster, 'osd'),
                    ),
                '/tmp/cephtest/osdmap',
                '--pg_bits', '2',
                '--pgp_bits', '4',
                ],
            wait=False,
            ),
        )

    log.info('Setting up osd nodes...')
    for remote, roles_for_host in osds.remotes.iteritems():
        for id_ in teuthology.roles_of_type(roles_for_host, 'osd'):
            remote.run(
                args=[
                    '/tmp/cephtest/enable-coredump',
                    '/tmp/cephtest/binary/usr/local/bin/ceph-coverage',
                    coverage_dir,
                    '/tmp/cephtest/binary/usr/local/bin/ceph-authtool',
                    '--create-keyring',
                    '--gen-key',
                    '--name=osd.{id}'.format(id=id_),
                    '/tmp/cephtest/data/osd.{id}.keyring'.format(id=id_),
                    ],
                )

    log.info('Setting up mds nodes...')
    mdss = ctx.cluster.only(teuthology.is_type('mds'))
    for remote, roles_for_host in mdss.remotes.iteritems():
        for id_ in teuthology.roles_of_type(roles_for_host, 'mds'):
            remote.run(
                args=[
                    '/tmp/cephtest/enable-coredump',
                    '/tmp/cephtest/binary/usr/local/bin/ceph-coverage',
                    coverage_dir,
                    '/tmp/cephtest/binary/usr/local/bin/ceph-authtool',
                    '--create-keyring',
                    '--gen-key',
                    '--name=mds.{id}'.format(id=id_),
                    '/tmp/cephtest/data/mds.{id}.keyring'.format(id=id_),
                    ],
                )

    log.info('Setting up client nodes...')
    clients = ctx.cluster.only(teuthology.is_type('client'))
    for remote, roles_for_host in clients.remotes.iteritems():
        for id_ in teuthology.roles_of_type(roles_for_host, 'client'):
            remote.run(
                args=[
                    '/tmp/cephtest/enable-coredump',
                    '/tmp/cephtest/binary/usr/local/bin/ceph-coverage',
                    coverage_dir,
                    '/tmp/cephtest/binary/usr/local/bin/ceph-authtool',
                    '--create-keyring',
                    '--gen-key',
                    # TODO this --name= is not really obeyed, all unknown "types" are munged to "client"
                    '--name=client.{id}'.format(id=id_),
                    '/tmp/cephtest/data/client.{id}.keyring'.format(id=id_),
                    ],
                )

    log.info('Reading keys from all nodes...')
    keys_fp = StringIO()
    keys = []
    for remote, roles_for_host in ctx.cluster.remotes.iteritems():
        for type_ in ['osd', 'mds', 'client']:
            for id_ in teuthology.roles_of_type(roles_for_host, type_):
                data = teuthology.get_file(
                    remote=remote,
                    path='/tmp/cephtest/data/{type}.{id}.keyring'.format(
                        type=type_,
                        id=id_,
                        ),
                    )
                keys.append((type_, id_, data))
                keys_fp.write(data)

    log.info('Adding keys to all mons...')
    writes = mons.run(
        args=[
            'cat',
            run.Raw('>>'),
            '/tmp/cephtest/ceph.keyring',
            ],
        stdin=run.PIPE,
        wait=False,
        )
    keys_fp.seek(0)
    teuthology.feed_many_stdins_and_close(keys_fp, writes)
    run.wait(writes)
    for type_, id_, data in keys:
        run.wait(
            mons.run(
                args=[
                    '/tmp/cephtest/enable-coredump',
                    '/tmp/cephtest/binary/usr/local/bin/ceph-coverage',
                    coverage_dir,
                    '/tmp/cephtest/binary/usr/local/bin/ceph-authtool',
                    '/tmp/cephtest/ceph.keyring',
                    '--name={type}.{id}'.format(
                        type=type_,
                        id=id_,
                        ),
                    ] + list(teuthology.generate_caps(type_)),
                wait=False,
                ),
            )

    log.info('Running mkfs on mon nodes...')
    for remote, roles_for_host in mons.remotes.iteritems():
        for id_ in teuthology.roles_of_type(roles_for_host, 'mon'):
            remote.run(
                args=[
                    '/tmp/cephtest/enable-coredump',
                    '/tmp/cephtest/binary/usr/local/bin/ceph-coverage',
                    coverage_dir,
                    '/tmp/cephtest/binary/usr/local/bin/ceph-mon',
                    '--mkfs',
                    '-i', id_,
                    '-c', '/tmp/cephtest/ceph.conf',
                    '--monmap=/tmp/cephtest/monmap',
                    '--osdmap=/tmp/cephtest/osdmap',
                    '--keyring=/tmp/cephtest/ceph.keyring',
                    ],
                )

    log.info('Running mkfs on osd nodes...')
    for remote, roles_for_host in osds.remotes.iteritems():
        roles_to_devs = remote_to_roles_to_devs[remote]
        roles_to_journals = remote_to_roles_to_journals[remote]
        ctx.disk_config = argparse.Namespace()
        ctx.disk_config.remote_to_roles_to_dev = remote_to_roles_to_devs
        ctx.disk_config.remote_to_roles_to_journals = remote_to_roles_to_journals

        for id_ in teuthology.roles_of_type(roles_for_host, 'osd'):
            log.info(str(roles_to_journals))
            log.info(id_)
            remote.run(
                args=[
                    'mkdir',
                    os.path.join('/tmp/cephtest/data', 'osd.{id}.data'.format(id=id_)),
                    ],
                )
            if roles_to_devs.get(id_):
                dev = roles_to_devs[id_]
                fs = config.get('fs')
                package = None
                mkfs_options = config.get('mkfs_options')
                mount_options = config.get('mount_options')
                if fs == 'btrfs':
                    package = 'btrfs-tools'
                    if mount_options is None:
                        mount_options = ['noatime','user_subvol_rm_allowed']
                    if mkfs_options is None:
                        mkfs_options = ['-m', 'single',
                                        '-l', '32768',
                                        '-n', '32768']
                if fs == 'xfs':
                    package = 'xfsprogs'
                    if mount_options is None:
                        mount_options = ['noatime']
                    if mkfs_options is None:
                        mkfs_options = ['-f', '-i', 'size=2048']
                if fs == 'ext4' or fs == 'ext3':
                    if mount_options is None:
                        mount_options = ['noatime','user_xattr']

                if mount_options is None:
                    mount_options = []
                if mkfs_options is None:
                    mkfs_options = []
                mkfs = ['mkfs.%s' % fs] + mkfs_options
                log.info('%s on %s on %s' % (mkfs, dev, remote))
                if package is not None:
                    remote.run(
                        args=[
                            'sudo',
                            'apt-get', 'install', '-y', package
                            ]
                        )
                remote.run(args= ['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev])
                log.info('mount %s on %s -o %s' % (dev, remote,
                                                   ','.join(mount_options)))
                remote.run(
                    args=[
                        'sudo',
                        'mount',
                        '-t', fs,
                        '-o', ','.join(mount_options),
                        dev,
                        os.path.join('/tmp/cephtest/data', 'osd.{id}.data'.format(id=id_)),
                        ]
                    )
                remote.run(
                    args=[
                        'sudo', 'chown', '-R', 'ubuntu.ubuntu',
                        os.path.join('/tmp/cephtest/data', 'osd.{id}.data'.format(id=id_))
                        ]
                    )
                remote.run(
                    args=[
                        'sudo', 'chmod', '-R', '755',
                        os.path.join('/tmp/cephtest/data', 'osd.{id}.data'.format(id=id_))
                        ]
                    )
                devs_to_clean[remote].append(
                    os.path.join(
                        '/tmp/cephtest/data', 'osd.{id}.data'.format(id=id_)
                        )
                    )

        for id_ in teuthology.roles_of_type(roles_for_host, 'osd'):
            remote.run(
                args=[
                    '/tmp/cephtest/enable-coredump',
                    '/tmp/cephtest/binary/usr/local/bin/ceph-coverage',
                    coverage_dir,
                    '/tmp/cephtest/binary/usr/local/bin/ceph-osd',
                    '--mkfs',
                    '-i', id_,
                    '-c', '/tmp/cephtest/ceph.conf',
                    '--monmap', '/tmp/cephtest/monmap',
                    ],
                )
    run.wait(
        mons.run(
            args=[
                'rm',
                '--',
                '/tmp/cephtest/monmap',
                '/tmp/cephtest/osdmap',
                ],
            wait=False,
            ),
        )

    try:
        yield
    finally:
        (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()

        log.info('Checking cluster log for badness...')
        def first_in_ceph_log(pattern, excludes):
            args = [
                'egrep', pattern,
                '/tmp/cephtest/archive/log/cluster.%s.log' % firstmon,
                ]
            for exclude in excludes:
                args.extend([run.Raw('|'), 'egrep', '-v', exclude])
            args.extend([
                    run.Raw('|'), 'head', '-n', '1',
                    ])
            r = mon0_remote.run(
                stdout=StringIO(),
                args=args,
                )
            stdout = r.stdout.getvalue()
            if stdout != '':
                return stdout
            return None

        if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]',
                             config['log_whitelist']) is not None:
            log.warning('Found errors (ERR|WRN|SEC) in cluster log')
            ctx.summary['success'] = False
            # use the most severe problem as the failure reason
            if 'failure_reason' not in ctx.summary:
                for pattern in ['\[SEC\]', '\[ERR\]', '\[WRN\]']:
                    match = first_in_ceph_log(pattern, config['log_whitelist'])
                    if match is not None:
                        ctx.summary['failure_reason'] = \
                            '"{match}" in cluster log'.format(
                            match=match.rstrip('\n'),
                            )
                        break

        for remote, dirs in devs_to_clean.iteritems():
            for dir_ in dirs:
                log.info('Unmounting %s on %s' % (dir_, remote))
                remote.run(
                    args=[
                        'sync',
                        run.Raw('&&'),
                        'sudo',
                        'umount',
                        '-f',
                        dir_
                        ]
                    )

        if config.get('tmpfs_journal'):
            log.info('tmpfs journal enabled - unmounting tmpfs at /mnt')
            for remote, roles_for_host in osds.remotes.iteritems():
                remote.run(
                    args=[ 'sudo', 'umount', '-f', '/mnt' ],
                    check_status=False,
                )

        if ctx.archive is not None:
            # archive mon data, too
            log.info('Archiving mon data...')
            path = os.path.join(ctx.archive, 'data')
            os.makedirs(path)
            for remote, roles in mons.remotes.iteritems():
                for role in roles:
                    if role.startswith('mon.'):
                        teuthology.pull_directory_tarball(remote,
                                       '/tmp/cephtest/data/%s' % role,
                                       path + '/' + role + '.tgz')

        log.info('Cleaning ceph cluster...')
        run.wait(
            ctx.cluster.run(
                args=[
                    'rm',
                    '-rf',
                    '--',
                    '/tmp/cephtest/ceph.conf',
                    '/tmp/cephtest/ceph.keyring',
                    '/tmp/cephtest/data',
                    '/tmp/cephtest/monmap',
                    run.Raw('/tmp/cephtest/asok.*')
                    ],
                wait=False,
                ),
            )
示例#27
0
def cluster(ctx, config):
    """
    Handle the creation and removal of a ceph cluster.

    On startup:
        Create directories needed for the cluster.
        Create remote journals for all osds.
        Create and set keyring.
        Copy the monmap to tht test systems.
        Setup mon nodes.
        Setup mds nodes.
        Mkfs osd nodes.
        Add keyring information to monmaps
        Mkfs mon nodes.

    On exit:
        If errors occured, extract a failure message and store in ctx.summary.
        Unmount all test files and temporary journaling files.
        Save the monitor information and archive all ceph logs.
        Cleanup the keyring setup, and remove all monitor map and data files left over.

    :param ctx: Context
    :param config: Configuration
    """
    if ctx.config.get('use_existing_cluster', False) is True:
        log.info("'use_existing_cluster' is true; skipping cluster creation")
        yield

    testdir = teuthology.get_testdir(ctx)
    cluster_name = config['cluster']
    data_dir = '{tdir}/{cluster}.data'.format(tdir=testdir,
                                              cluster=cluster_name)
    log.info('Creating ceph cluster %s...', cluster_name)
    run.wait(
        ctx.cluster.run(
            args=[
                'install',
                '-d',
                '-m0755',
                '--',
                data_dir,
            ],
            wait=False,
        ))

    run.wait(
        ctx.cluster.run(
            args=[
                'sudo',
                'install',
                '-d',
                '-m0777',
                '--',
                '/var/run/ceph',
            ],
            wait=False,
        ))

    devs_to_clean = {}
    remote_to_roles_to_devs = {}
    remote_to_roles_to_journals = {}
    osds = ctx.cluster.only(teuthology.is_type('osd', cluster_name))
    for remote, roles_for_host in osds.remotes.iteritems():
        devs = teuthology.get_scratch_devices(remote)
        roles_to_devs = {}
        roles_to_journals = {}
        if config.get('fs'):
            log.info('fs option selected, checking for scratch devs')
            log.info('found devs: %s' % (str(devs), ))
            devs_id_map = teuthology.get_wwn_id_map(remote, devs)
            iddevs = devs_id_map.values()
            roles_to_devs = assign_devs(
                teuthology.cluster_roles_of_type(roles_for_host, 'osd',
                                                 cluster_name), iddevs)
            if len(roles_to_devs) < len(iddevs):
                iddevs = iddevs[len(roles_to_devs):]
            devs_to_clean[remote] = []

        if config.get('block_journal'):
            log.info('block journal enabled')
            roles_to_journals = assign_devs(
                teuthology.cluster_roles_of_type(roles_for_host, 'osd',
                                                 cluster_name), iddevs)
            log.info('journal map: %s', roles_to_journals)

        if config.get('tmpfs_journal'):
            log.info('tmpfs journal enabled')
            roles_to_journals = {}
            remote.run(args=['sudo', 'mount', '-t', 'tmpfs', 'tmpfs', '/mnt'])
            for role in teuthology.cluster_roles_of_type(
                    roles_for_host, 'osd', cluster_name):
                tmpfs = '/mnt/' + role
                roles_to_journals[role] = tmpfs
                remote.run(args=['truncate', '-s', '1500M', tmpfs])
            log.info('journal map: %s', roles_to_journals)

        log.info('dev map: %s' % (str(roles_to_devs), ))
        remote_to_roles_to_devs[remote] = roles_to_devs
        remote_to_roles_to_journals[remote] = roles_to_journals

    log.info('Generating config...')
    remotes_and_roles = ctx.cluster.remotes.items()
    roles = [role_list for (remote, role_list) in remotes_and_roles]
    ips = [
        host for (host, port) in (remote.ssh.get_transport().getpeername()
                                  for (remote, role_list) in remotes_and_roles)
    ]
    conf = teuthology.skeleton_config(ctx,
                                      roles=roles,
                                      ips=ips,
                                      cluster=cluster_name)
    for remote, roles_to_journals in remote_to_roles_to_journals.iteritems():
        for role, journal in roles_to_journals.iteritems():
            name = teuthology.ceph_role(role)
            if name not in conf:
                conf[name] = {}
            conf[name]['osd journal'] = journal
    for section, keys in config['conf'].iteritems():
        for key, value in keys.iteritems():
            log.info("[%s] %s = %s" % (section, key, value))
            if section not in conf:
                conf[section] = {}
            conf[section][key] = value

    if config.get('tmpfs_journal'):
        conf['journal dio'] = False

    if not hasattr(ctx, 'ceph'):
        ctx.ceph = {}
    ctx.ceph[cluster_name] = argparse.Namespace()
    ctx.ceph[cluster_name].conf = conf

    default_keyring = '/etc/ceph/{cluster}.keyring'.format(
        cluster=cluster_name)
    keyring_path = config.get('keyring_path', default_keyring)

    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)

    firstmon = teuthology.get_first_mon(ctx, config, cluster_name)

    log.info('Setting up %s...' % firstmon)
    ctx.cluster.only(firstmon).run(args=[
        'sudo',
        'adjust-ulimits',
        'ceph-coverage',
        coverage_dir,
        'ceph-authtool',
        '--create-keyring',
        keyring_path,
    ], )
    ctx.cluster.only(firstmon).run(args=[
        'sudo',
        'adjust-ulimits',
        'ceph-coverage',
        coverage_dir,
        'ceph-authtool',
        '--gen-key',
        '--name=mon.',
        keyring_path,
    ], )
    ctx.cluster.only(firstmon).run(args=[
        'sudo',
        'chmod',
        '0644',
        keyring_path,
    ], )
    (mon0_remote, ) = ctx.cluster.only(firstmon).remotes.keys()
    monmap_path = '{tdir}/{cluster}.monmap'.format(tdir=testdir,
                                                   cluster=cluster_name)
    fsid = teuthology.create_simple_monmap(
        ctx,
        remote=mon0_remote,
        conf=conf,
        path=monmap_path,
    )
    if not 'global' in conf:
        conf['global'] = {}
    conf['global']['fsid'] = fsid

    default_conf_path = '/etc/ceph/{cluster}.conf'.format(cluster=cluster_name)
    conf_path = config.get('conf_path', default_conf_path)
    log.info('Writing %s for FSID %s...' % (conf_path, fsid))
    write_conf(ctx, conf_path, cluster_name)

    log.info('Creating admin key on %s...' % firstmon)
    ctx.cluster.only(firstmon).run(args=[
        'sudo',
        'adjust-ulimits',
        'ceph-coverage',
        coverage_dir,
        'ceph-authtool',
        '--gen-key',
        '--name=client.admin',
        '--set-uid=0',
        '--cap',
        'mon',
        'allow *',
        '--cap',
        'osd',
        'allow *',
        '--cap',
        'mds',
        'allow *',
        keyring_path,
    ], )

    log.info('Copying monmap to all nodes...')
    keyring = teuthology.get_file(
        remote=mon0_remote,
        path=keyring_path,
    )
    monmap = teuthology.get_file(
        remote=mon0_remote,
        path=monmap_path,
    )

    for rem in ctx.cluster.remotes.iterkeys():
        # copy mon key and initial monmap
        log.info('Sending monmap to node {remote}'.format(remote=rem))
        teuthology.sudo_write_file(remote=rem,
                                   path=keyring_path,
                                   data=keyring,
                                   perms='0644')
        teuthology.write_file(
            remote=rem,
            path=monmap_path,
            data=monmap,
        )

    log.info('Setting up mon nodes...')
    mons = ctx.cluster.only(teuthology.is_type('mon', cluster_name))
    osdmap_path = '{tdir}/{cluster}.osdmap'.format(tdir=testdir,
                                                   cluster=cluster_name)
    run.wait(
        mons.run(
            args=[
                'adjust-ulimits',
                'ceph-coverage',
                coverage_dir,
                'osdmaptool',
                '-c',
                conf_path,
                '--clobber',
                '--createsimple',
                '{num:d}'.format(num=teuthology.num_instances_of_type(
                    ctx.cluster, 'osd', cluster_name), ),
                osdmap_path,
                '--pg_bits',
                '2',
                '--pgp_bits',
                '4',
            ],
            wait=False,
        ), )

    log.info('Setting up mds nodes...')
    mdss = ctx.cluster.only(teuthology.is_type('mds', cluster_name))
    for remote, roles_for_host in mdss.remotes.iteritems():
        for role in teuthology.cluster_roles_of_type(roles_for_host, 'mds',
                                                     cluster_name):
            _, _, id_ = teuthology.split_role(role)
            mds_dir = '/var/lib/ceph/mds/{cluster}-{id}'.format(
                cluster=cluster_name,
                id=id_,
            )
            remote.run(args=[
                'sudo',
                'mkdir',
                '-p',
                mds_dir,
                run.Raw('&&'),
                'sudo',
                'adjust-ulimits',
                'ceph-coverage',
                coverage_dir,
                'ceph-authtool',
                '--create-keyring',
                '--gen-key',
                '--name=mds.{id}'.format(id=id_),
                mds_dir + '/keyring',
            ], )

    cclient.create_keyring(ctx, cluster_name)
    log.info('Running mkfs on osd nodes...')

    if not hasattr(ctx, 'disk_config'):
        ctx.disk_config = argparse.Namespace()
    if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev'):
        ctx.disk_config.remote_to_roles_to_dev = {}
    if not hasattr(ctx.disk_config, 'remote_to_roles_to_journals'):
        ctx.disk_config.remote_to_roles_to_journals = {}
    if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev_mount_options'):
        ctx.disk_config.remote_to_roles_to_dev_mount_options = {}
    if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev_fstype'):
        ctx.disk_config.remote_to_roles_to_dev_fstype = {}

    teuthology.deep_merge(ctx.disk_config.remote_to_roles_to_dev,
                          remote_to_roles_to_devs)
    teuthology.deep_merge(ctx.disk_config.remote_to_roles_to_journals,
                          remote_to_roles_to_journals)

    log.info("ctx.disk_config.remote_to_roles_to_dev: {r}".format(
        r=str(ctx.disk_config.remote_to_roles_to_dev)))
    for remote, roles_for_host in osds.remotes.iteritems():
        roles_to_devs = remote_to_roles_to_devs[remote]
        roles_to_journals = remote_to_roles_to_journals[remote]

        for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd',
                                                     cluster_name):
            _, _, id_ = teuthology.split_role(role)
            mnt_point = '/var/lib/ceph/osd/{cluster}-{id}'.format(
                cluster=cluster_name, id=id_)
            remote.run(args=[
                'sudo',
                'mkdir',
                '-p',
                mnt_point,
            ])
            log.info(str(roles_to_journals))
            log.info(role)
            if roles_to_devs.get(role):
                dev = roles_to_devs[role]
                fs = config.get('fs')
                package = None
                mkfs_options = config.get('mkfs_options')
                mount_options = config.get('mount_options')
                if fs == 'btrfs':
                    # package = 'btrfs-tools'
                    if mount_options is None:
                        mount_options = ['noatime', 'user_subvol_rm_allowed']
                    if mkfs_options is None:
                        mkfs_options = [
                            '-m', 'single', '-l', '32768', '-n', '32768'
                        ]
                if fs == 'xfs':
                    # package = 'xfsprogs'
                    if mount_options is None:
                        mount_options = ['noatime']
                    if mkfs_options is None:
                        mkfs_options = ['-f', '-i', 'size=2048']
                if fs == 'ext4' or fs == 'ext3':
                    if mount_options is None:
                        mount_options = ['noatime', 'user_xattr']

                if mount_options is None:
                    mount_options = []
                if mkfs_options is None:
                    mkfs_options = []
                mkfs = ['mkfs.%s' % fs] + mkfs_options
                log.info('%s on %s on %s' % (mkfs, dev, remote))
                if package is not None:
                    remote.run(
                        args=['sudo', 'apt-get', 'install', '-y', package],
                        stdout=StringIO(),
                    )

                try:
                    remote.run(args=['yes', run.Raw('|')] + ['sudo'] + mkfs +
                               [dev])
                except run.CommandFailedError:
                    # Newer btfs-tools doesn't prompt for overwrite, use -f
                    if '-f' not in mount_options:
                        mkfs_options.append('-f')
                        mkfs = ['mkfs.%s' % fs] + mkfs_options
                        log.info('%s on %s on %s' % (mkfs, dev, remote))
                    remote.run(args=['yes', run.Raw('|')] + ['sudo'] + mkfs +
                               [dev])

                log.info('mount %s on %s -o %s' %
                         (dev, remote, ','.join(mount_options)))
                remote.run(args=[
                    'sudo',
                    'mount',
                    '-t',
                    fs,
                    '-o',
                    ','.join(mount_options),
                    dev,
                    mnt_point,
                ])
                if not remote in ctx.disk_config.remote_to_roles_to_dev_mount_options:
                    ctx.disk_config.remote_to_roles_to_dev_mount_options[
                        remote] = {}
                ctx.disk_config.remote_to_roles_to_dev_mount_options[remote][
                    role] = mount_options
                if not remote in ctx.disk_config.remote_to_roles_to_dev_fstype:
                    ctx.disk_config.remote_to_roles_to_dev_fstype[remote] = {}
                ctx.disk_config.remote_to_roles_to_dev_fstype[remote][
                    role] = fs
                devs_to_clean[remote].append(mnt_point)

        for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd',
                                                     cluster_name):
            _, _, id_ = teuthology.split_role(role)
            remote.run(args=[
                'sudo',
                'MALLOC_CHECK_=3',
                'adjust-ulimits',
                'ceph-coverage',
                coverage_dir,
                'ceph-osd',
                '--cluster',
                cluster_name,
                '--mkfs',
                '--mkkey',
                '-i',
                id_,
                '--monmap',
                monmap_path,
            ], )

    log.info('Reading keys from all nodes...')
    keys_fp = StringIO()
    keys = []
    for remote, roles_for_host in ctx.cluster.remotes.iteritems():
        for type_ in ['mds', 'osd']:
            for role in teuthology.cluster_roles_of_type(
                    roles_for_host, type_, cluster_name):
                _, _, id_ = teuthology.split_role(role)
                data = teuthology.get_file(
                    remote=remote,
                    path='/var/lib/ceph/{type}/{cluster}-{id}/keyring'.format(
                        type=type_,
                        id=id_,
                        cluster=cluster_name,
                    ),
                    sudo=True,
                )
                keys.append((type_, id_, data))
                keys_fp.write(data)
    for remote, roles_for_host in ctx.cluster.remotes.iteritems():
        for role in teuthology.cluster_roles_of_type(roles_for_host, 'client',
                                                     cluster_name):
            _, _, id_ = teuthology.split_role(role)
            data = teuthology.get_file(
                remote=remote,
                path='/etc/ceph/{cluster}.client.{id}.keyring'.format(
                    id=id_, cluster=cluster_name))
            keys.append(('client', id_, data))
            keys_fp.write(data)

    log.info('Adding keys to all mons...')
    writes = mons.run(
        args=[
            'sudo',
            'tee',
            '-a',
            keyring_path,
        ],
        stdin=run.PIPE,
        wait=False,
        stdout=StringIO(),
    )
    keys_fp.seek(0)
    teuthology.feed_many_stdins_and_close(keys_fp, writes)
    run.wait(writes)
    for type_, id_, data in keys:
        run.wait(
            mons.run(
                args=[
                    'sudo',
                    'adjust-ulimits',
                    'ceph-coverage',
                    coverage_dir,
                    'ceph-authtool',
                    keyring_path,
                    '--name={type}.{id}'.format(
                        type=type_,
                        id=id_,
                    ),
                ] + list(teuthology.generate_caps(type_)),
                wait=False,
            ), )

    log.info('Running mkfs on mon nodes...')
    for remote, roles_for_host in mons.remotes.iteritems():
        for role in teuthology.cluster_roles_of_type(roles_for_host, 'mon',
                                                     cluster_name):
            _, _, id_ = teuthology.split_role(role)
            remote.run(args=[
                'sudo',
                'mkdir',
                '-p',
                '/var/lib/ceph/mon/{cluster}-{id}'.format(
                    id=id_, cluster=cluster_name),
            ], )
            remote.run(args=[
                'sudo',
                'adjust-ulimits',
                'ceph-coverage',
                coverage_dir,
                'ceph-mon',
                '--cluster',
                cluster_name,
                '--mkfs',
                '-i',
                id_,
                '--monmap',
                monmap_path,
                '--osdmap',
                osdmap_path,
                '--keyring',
                keyring_path,
            ], )

    run.wait(
        mons.run(
            args=[
                'rm',
                '--',
                monmap_path,
                osdmap_path,
            ],
            wait=False,
        ), )

    try:
        yield
    except Exception:
        # we need to know this below
        ctx.summary['success'] = False
        raise
    finally:
        (mon0_remote, ) = ctx.cluster.only(firstmon).remotes.keys()

        log.info('Checking cluster log for badness...')

        def first_in_ceph_log(pattern, excludes):
            """
            Find the first occurence of the pattern specified in the Ceph log,
            Returns None if none found.

            :param pattern: Pattern scanned for.
            :param excludes: Patterns to ignore.
            :return: First line of text (or None if not found)
            """
            args = [
                'sudo',
                'egrep',
                pattern,
                '/var/log/ceph/{cluster}.log'.format(cluster=cluster_name),
            ]
            for exclude in excludes:
                args.extend([run.Raw('|'), 'egrep', '-v', exclude])
            args.extend([
                run.Raw('|'),
                'head',
                '-n',
                '1',
            ])
            r = mon0_remote.run(
                stdout=StringIO(),
                args=args,
            )
            stdout = r.stdout.getvalue()
            if stdout != '':
                return stdout
            return None

        if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]',
                             config['log_whitelist']) is not None:
            log.warning('Found errors (ERR|WRN|SEC) in cluster log')
            ctx.summary['success'] = False
            # use the most severe problem as the failure reason
            if 'failure_reason' not in ctx.summary:
                for pattern in ['\[SEC\]', '\[ERR\]', '\[WRN\]']:
                    match = first_in_ceph_log(pattern, config['log_whitelist'])
                    if match is not None:
                        ctx.summary['failure_reason'] = \
                            '"{match}" in cluster log'.format(
                                match=match.rstrip('\n'),
                            )
                        break

        for remote, dirs in devs_to_clean.iteritems():
            for dir_ in dirs:
                log.info('Unmounting %s on %s' % (dir_, remote))
                try:
                    remote.run(args=[
                        'sync',
                        run.Raw('&&'), 'sudo', 'umount', '-f', dir_
                    ])
                except Exception as e:
                    remote.run(args=[
                        'sudo',
                        run.Raw('PATH=/usr/sbin:$PATH'),
                        'lsof',
                        run.Raw(';'),
                        'ps',
                        'auxf',
                    ])
                    raise e

        if config.get('tmpfs_journal'):
            log.info('tmpfs journal enabled - unmounting tmpfs at /mnt')
            for remote, roles_for_host in osds.remotes.iteritems():
                remote.run(
                    args=['sudo', 'umount', '-f', '/mnt'],
                    check_status=False,
                )

        if ctx.archive is not None and \
                not (ctx.config.get('archive-on-error') and ctx.summary['success']):

            # archive mon data, too
            log.info('Archiving mon data...')
            path = os.path.join(ctx.archive, 'data')
            try:
                os.makedirs(path)
            except OSError as e:
                if e.errno == errno.EEXIST:
                    pass
                else:
                    raise
            for remote, roles in mons.remotes.iteritems():
                for role in roles:
                    is_mon = teuthology.is_type('mon', cluster_name)
                    if is_mon(role):
                        _, _, id_ = teuthology.split_role(role)
                        mon_dir = '/var/lib/ceph/mon/' + \
                                  '{0}-{1}'.format(cluster_name, id_)
                        teuthology.pull_directory_tarball(
                            remote, mon_dir, path + '/' + role + '.tgz')

        log.info('Cleaning ceph cluster...')
        run.wait(
            ctx.cluster.run(
                args=[
                    'sudo',
                    'rm',
                    '-rf',
                    '--',
                    conf_path,
                    keyring_path,
                    data_dir,
                    monmap_path,
                    osdmap_path,
                    run.Raw('{tdir}/../*.pid'.format(tdir=testdir)),
                ],
                wait=False,
            ), )
def task(ctx, config):
    """
    Test (non-backfill) recovery
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'task only accepts a dict for configuration'
    testdir = teuthology.get_testdir(ctx)
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon, ) = ctx.cluster.only(first_mon).remotes.iterkeys()

    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
    log.info('num_osds is %s' % num_osds)
    assert num_osds == 3

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
    )

    while len(manager.get_osd_status()['up']) < 3:
        time.sleep(10)
    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
    manager.wait_for_clean()

    # test some osdmap flags
    manager.raw_cluster_cmd('osd', 'set', 'noin')
    manager.raw_cluster_cmd('osd', 'set', 'noout')
    manager.raw_cluster_cmd('osd', 'set', 'noup')
    manager.raw_cluster_cmd('osd', 'set', 'nodown')
    manager.raw_cluster_cmd('osd', 'unset', 'noin')
    manager.raw_cluster_cmd('osd', 'unset', 'noout')
    manager.raw_cluster_cmd('osd', 'unset', 'noup')
    manager.raw_cluster_cmd('osd', 'unset', 'nodown')

    # write some new data
    p = rados_start(
        testdir, mon,
        ['-p', 'rbd', 'bench', '20', 'write', '-b', '4096', '--no-cleanup'])

    time.sleep(15)

    # trigger a divergent target:
    #  blackhole + restart osd.1 (shorter log)
    manager.blackhole_kill_osd(1)
    #  kill osd.2 (longer log... we'll make it divergent below)
    manager.kill_osd(2)
    time.sleep(2)
    manager.revive_osd(1)

    # wait for our writes to complete + succeed
    err = p.wait()
    log.info('err is %d' % err)

    # cluster must repeer
    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.wait_for_active_or_down()

    # write some more (make sure osd.2 really is divergent)
    p = rados_start(testdir, mon,
                    ['-p', 'rbd', 'bench', '15', 'write', '-b', '4096'])
    p.wait()

    # revive divergent osd
    manager.revive_osd(2)

    while len(manager.get_osd_status()['up']) < 3:
        log.info('waiting a bit...')
        time.sleep(2)
    log.info('3 are up!')

    # cluster must recover
    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
    manager.wait_for_clean()
示例#29
0
def task(ctx, config):
    """
    Test [deep] scrub

    tasks:
    - chef:
    - install:
    - ceph:
        log-whitelist:
        - '!= data_digest'
        - '!= omap_digest'
        - '!= size'
        - deep-scrub 0 missing, 1 inconsistent objects
        - deep-scrub [0-9]+ errors
        - repair 0 missing, 1 inconsistent objects
        - repair [0-9]+ errors, [0-9]+ fixed
        - shard [0-9]+ missing
        - deep-scrub 1 missing, 1 inconsistent objects
        - does not match object info size
        - attr name mistmatch
        - deep-scrub 1 missing, 0 inconsistent objects
        - failed to pick suitable auth object
      conf:
        osd:
          osd deep scrub update digest min age: 0
    - scrub_test:
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'scrub_test task only accepts a dict for configuration'
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()

    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
    log.info('num_osds is %s' % num_osds)

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

    while len(manager.get_osd_status()['up']) < num_osds:
        time.sleep(10)

    for i in range(num_osds):
        manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'injectargs',
                                '--', '--osd-objectstore-fuse')
    for i in range(num_osds):
        manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'flush_pg_stats')
    manager.wait_for_clean()

    # write some data
    p = manager.do_rados(mon, ['-p', 'rbd', 'bench', '--no-cleanup', '1',
                               'write', '-b', '4096'])
    log.info('err is %d' % p.exitstatus)

    # wait for some PG to have data that we can mess with
    pg, acting = wait_for_victim_pg(manager)
    osd = acting[0]

    osd_remote, obj_path, obj_name = find_victim_object(ctx, pg, osd)
    manager.do_rados(mon, ['-p', 'rbd', 'setomapval', obj_name, 'key', 'val'])
    log.info('err is %d' % p.exitstatus)
    manager.do_rados(mon, ['-p', 'rbd', 'setomapheader', obj_name, 'hdr'])
    log.info('err is %d' % p.exitstatus)

    # Update missing digests, requires "osd deep scrub update digest min age: 0"
    pgnum = get_pgnum(pg)
    manager.do_pg_scrub('rbd', pgnum, 'deep-scrub')

    log.info('messing with PG %s on osd %d' % (pg, osd))
    test_repair_corrupted_obj(ctx, manager, pg, osd_remote, obj_path, 'rbd')
    test_repair_bad_omap(ctx, manager, pg, osd, obj_name)
    test_list_inconsistent_obj(ctx, manager, osd_remote, pg, acting, osd,
                               obj_name, obj_path)
    log.info('test successful!')

    # shut down fuse mount
    for i in range(num_osds):
        manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'injectargs',
                                '--', '--no-osd-objectstore-fuse')
    time.sleep(5)
    log.info('done')
示例#30
0
def task(ctx, config):
    """
    Test [deep] scrub

    tasks:
    - chef:
    - install:
    - ceph:
        log-whitelist:
        - '!= known digest'
        - '!= known omap_digest'
        - deep-scrub 0 missing, 1 inconsistent objects
        - deep-scrub 1 errors
        - repair 0 missing, 1 inconsistent objects
        - repair 1 errors, 1 fixed
    - scrub_test:
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'scrub_test task only accepts a dict for configuration'
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()

    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
    log.info('num_osds is %s' % num_osds)

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

    while len(manager.get_osd_status()['up']) < num_osds:
        time.sleep(10)

    for i in range(num_osds):
        manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'flush_pg_stats')
    manager.wait_for_clean()

    # write some data
    p = manager.do_rados(mon, ['-p', 'rbd', 'bench', '--no-cleanup', '1',
                               'write', '-b', '4096'])
    log.info('err is %d' % p.exitstatus)

    # wait for some PG to have data that we can mess with
    pg, acting = wait_for_victim_pg(manager)
    osd = acting[0]

    osd_remote, obj_path, obj_name = find_victim_object(ctx, pg, osd)
    manager.do_rados(mon, ['-p', 'rbd', 'setomapval', obj_name, 'key', 'val'])
    log.info('err is %d' % p.exitstatus)
    manager.do_rados(mon, ['-p', 'rbd', 'setomapheader', obj_name, 'hdr'])
    log.info('err is %d' % p.exitstatus)

    log.info('messing with PG %s on osd %d' % (pg, osd))
    test_repair_corrupted_obj(ctx, manager, pg, osd_remote, obj_path)
    test_repair_bad_omap(ctx, manager, pg, osd, obj_name)
    test_list_inconsistent_obj(ctx, manager, osd_remote, pg, acting, osd,
                               obj_name, obj_path)
    log.info('test successful!')
示例#31
0
def cluster(ctx, config):
    testdir = teuthology.get_testdir(ctx)
    log.info('Creating ceph cluster...')
    run.wait(
        ctx.cluster.run(
            args=[
                'install', '-d', '-m0755', '--',
                '{tdir}/data'.format(tdir=testdir),
                ],
            wait=False,
            )
        )

    run.wait(
        ctx.cluster.run(
            args=[
                'sudo',
                'install', '-d', '-m0777', '--', '/var/run/ceph',
                ],
            wait=False,
            )
        )


    devs_to_clean = {}
    remote_to_roles_to_devs = {}
    remote_to_roles_to_journals = {}
    osds = ctx.cluster.only(teuthology.is_type('osd'))
    for remote, roles_for_host in osds.remotes.iteritems():
        devs = teuthology.get_scratch_devices(remote)
        roles_to_devs = {}
        roles_to_journals = {}
        if config.get('fs'):
            log.info('fs option selected, checking for scratch devs')
            log.info('found devs: %s' % (str(devs),))
            devs_id_map = teuthology.get_wwn_id_map(remote, devs)
            iddevs = devs_id_map.values()
            roles_to_devs = assign_devs(
                teuthology.roles_of_type(roles_for_host, 'osd'), iddevs
                )
            if len(roles_to_devs) < len(iddevs):
                iddevs = iddevs[len(roles_to_devs):]
            devs_to_clean[remote] = []

        if config.get('block_journal'):
            log.info('block journal enabled')
            roles_to_journals = assign_devs(
                teuthology.roles_of_type(roles_for_host, 'osd'), iddevs
                )
            log.info('journal map: %s', roles_to_journals)

        if config.get('tmpfs_journal'):
            log.info('tmpfs journal enabled')
            roles_to_journals = {}
            remote.run( args=[ 'sudo', 'mount', '-t', 'tmpfs', 'tmpfs', '/mnt' ] )
            for osd in teuthology.roles_of_type(roles_for_host, 'osd'):
                tmpfs = '/mnt/osd.%s' % osd
                roles_to_journals[osd] = tmpfs
                remote.run( args=[ 'truncate', '-s', '1500M', tmpfs ] )
            log.info('journal map: %s', roles_to_journals)

        log.info('dev map: %s' % (str(roles_to_devs),))
        remote_to_roles_to_devs[remote] = roles_to_devs
        remote_to_roles_to_journals[remote] = roles_to_journals


    log.info('Generating config...')
    remotes_and_roles = ctx.cluster.remotes.items()
    roles = [role_list for (remote, role_list) in remotes_and_roles]
    ips = [host for (host, port) in (remote.ssh.get_transport().getpeername() for (remote, role_list) in remotes_and_roles)]
    conf = teuthology.skeleton_config(ctx, roles=roles, ips=ips)
    for remote, roles_to_journals in remote_to_roles_to_journals.iteritems():
        for role, journal in roles_to_journals.iteritems():
            key = "osd." + str(role)
            if key not in conf:
                conf[key] = {}
            conf[key]['osd journal'] = journal
    for section, keys in config['conf'].iteritems():
        for key, value in keys.iteritems():
            log.info("[%s] %s = %s" % (section, key, value))
            if section not in conf:
                conf[section] = {}
            conf[section][key] = value

    if config.get('tmpfs_journal'):
        conf['journal dio'] = False

    ctx.ceph = argparse.Namespace()
    ctx.ceph.conf = conf

    conf_path = config.get('conf_path', '/etc/ceph/ceph.conf')
    keyring_path = config.get('keyring_path', '/etc/ceph/ceph.keyring')

    log.info('Writing configs...')
    conf_fp = StringIO()
    conf.write(conf_fp)
    conf_fp.seek(0)
    writes = ctx.cluster.run(
        args=[
            'sudo', 'mkdir', '-p', '/etc/ceph', run.Raw('&&'),
            'sudo', 'chmod', '0755', '/etc/ceph', run.Raw('&&'),
            'sudo', 'python',
            '-c',
            'import shutil, sys; shutil.copyfileobj(sys.stdin, file(sys.argv[1], "wb"))',
            conf_path,
            run.Raw('&&'),
            'sudo', 'chmod', '0644', conf_path,
            ],
        stdin=run.PIPE,
        wait=False,
        )
    teuthology.feed_many_stdins_and_close(conf_fp, writes)
    run.wait(writes)

    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)

    firstmon = teuthology.get_first_mon(ctx, config)

    log.info('Setting up %s...' % firstmon)
    ctx.cluster.only(firstmon).run(
        args=[
            'sudo',
            'adjust-ulimits',
            'ceph-coverage',
            coverage_dir,
            'ceph-authtool',
            '--create-keyring',
            keyring_path,
            ],
        )
    ctx.cluster.only(firstmon).run(
        args=[
            'sudo',
            'adjust-ulimits',
            'ceph-coverage',
            coverage_dir,
            'ceph-authtool',
            '--gen-key',
            '--name=mon.',
            keyring_path,
            ],
        )
    ctx.cluster.only(firstmon).run(
        args=[
            'sudo',
            'chmod',
            '0644',
            keyring_path,
            ],
        )
    (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
    teuthology.create_simple_monmap(
        ctx,
        remote=mon0_remote,
        conf=conf,
        )

    log.info('Creating admin key on %s...' % firstmon)
    ctx.cluster.only(firstmon).run(
        args=[
            'sudo',
            'adjust-ulimits',
            'ceph-coverage',
            coverage_dir,
            'ceph-authtool',
            '--gen-key',
            '--name=client.admin',
            '--set-uid=0',
            '--cap', 'mon', 'allow *',
            '--cap', 'osd', 'allow *',
            '--cap', 'mds', 'allow',
            keyring_path,
            ],
        )

    log.info('Copying monmap to all nodes...')
    keyring = teuthology.get_file(
        remote=mon0_remote,
        path=keyring_path,
        )
    monmap = teuthology.get_file(
        remote=mon0_remote,
        path='{tdir}/monmap'.format(tdir=testdir),
        )

    for rem in ctx.cluster.remotes.iterkeys():
        # copy mon key and initial monmap
        log.info('Sending monmap to node {remote}'.format(remote=rem))
        teuthology.sudo_write_file(
            remote=rem,
            path=keyring_path,
            data=keyring,
            perms='0644'
            )
        teuthology.write_file(
            remote=rem,
            path='{tdir}/monmap'.format(tdir=testdir),
            data=monmap,
            )

    log.info('Setting up mon nodes...')
    mons = ctx.cluster.only(teuthology.is_type('mon'))
    run.wait(
        mons.run(
            args=[
                'adjust-ulimits',
                'ceph-coverage',
                coverage_dir,
                'osdmaptool',
                '-c', conf_path,
                '--clobber',
                '--createsimple', '{num:d}'.format(
                    num=teuthology.num_instances_of_type(ctx.cluster, 'osd'),
                    ),
                '{tdir}/osdmap'.format(tdir=testdir),
                '--pg_bits', '2',
                '--pgp_bits', '4',
                ],
            wait=False,
            ),
        )

    log.info('Setting up mds nodes...')
    mdss = ctx.cluster.only(teuthology.is_type('mds'))
    for remote, roles_for_host in mdss.remotes.iteritems():
        for id_ in teuthology.roles_of_type(roles_for_host, 'mds'):
            remote.run(
                args=[
                    'sudo',
                    'mkdir',
                    '-p',
                    '/var/lib/ceph/mds/ceph-{id}'.format(id=id_),
                    run.Raw('&&'),
                    'sudo',
                    'adjust-ulimits',
                    'ceph-coverage',
                    coverage_dir,
                    'ceph-authtool',
                    '--create-keyring',
                    '--gen-key',
                    '--name=mds.{id}'.format(id=id_),
                    '/var/lib/ceph/mds/ceph-{id}/keyring'.format(id=id_),
                    ],
                )

    cclient.create_keyring(ctx)
    log.info('Running mkfs on osd nodes...')

    ctx.disk_config = argparse.Namespace()
    ctx.disk_config.remote_to_roles_to_dev = remote_to_roles_to_devs
    ctx.disk_config.remote_to_roles_to_journals = remote_to_roles_to_journals
    ctx.disk_config.remote_to_roles_to_dev_mount_options = {}
    ctx.disk_config.remote_to_roles_to_dev_fstype = {}

    log.info("ctx.disk_config.remote_to_roles_to_dev: {r}".format(r=str(ctx.disk_config.remote_to_roles_to_dev)))
    for remote, roles_for_host in osds.remotes.iteritems():
        roles_to_devs = remote_to_roles_to_devs[remote]
        roles_to_journals = remote_to_roles_to_journals[remote]


        for id_ in teuthology.roles_of_type(roles_for_host, 'osd'):
            remote.run(
                args=[
                    'sudo',
                    'mkdir',
                    '-p',
                    '/var/lib/ceph/osd/ceph-{id}'.format(id=id_),
                    ])
            log.info(str(roles_to_journals))
            log.info(id_)
            if roles_to_devs.get(id_):
                dev = roles_to_devs[id_]
                fs = config.get('fs')
                package = None
                mkfs_options = config.get('mkfs_options')
                mount_options = config.get('mount_options')
                if fs == 'btrfs':
                    #package = 'btrfs-tools'
                    if mount_options is None:
                        mount_options = ['noatime','user_subvol_rm_allowed']
                    if mkfs_options is None:
                        mkfs_options = ['-m', 'single',
                                        '-l', '32768',
                                        '-n', '32768']
                if fs == 'xfs':
                    #package = 'xfsprogs'
                    if mount_options is None:
                        mount_options = ['noatime']
                    if mkfs_options is None:
                        mkfs_options = ['-f', '-i', 'size=2048']
                if fs == 'ext4' or fs == 'ext3':
                    if mount_options is None:
                        mount_options = ['noatime','user_xattr']

                if mount_options is None:
                    mount_options = []
                if mkfs_options is None:
                    mkfs_options = []
                mkfs = ['mkfs.%s' % fs] + mkfs_options
                log.info('%s on %s on %s' % (mkfs, dev, remote))
                if package is not None:
                    remote.run(
                        args=[
                            'sudo',
                            'apt-get', 'install', '-y', package
                            ],
                        stdout=StringIO(),
                        )
                remote.run(args= ['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev])
                log.info('mount %s on %s -o %s' % (dev, remote,
                                                   ','.join(mount_options)))
                remote.run(
                    args=[
                        'sudo',
                        'mount',
                        '-t', fs,
                        '-o', ','.join(mount_options),
                        dev,
                        os.path.join('/var/lib/ceph/osd', 'ceph-{id}'.format(id=id_)),
                        ]
                    )
                if not remote in ctx.disk_config.remote_to_roles_to_dev_mount_options:
                    ctx.disk_config.remote_to_roles_to_dev_mount_options[remote] = {}
                ctx.disk_config.remote_to_roles_to_dev_mount_options[remote][id_] = mount_options
                if not remote in ctx.disk_config.remote_to_roles_to_dev_fstype:
                    ctx.disk_config.remote_to_roles_to_dev_fstype[remote] = {}
                ctx.disk_config.remote_to_roles_to_dev_fstype[remote][id_] = fs
                devs_to_clean[remote].append(
                    os.path.join(
                        os.path.join('/var/lib/ceph/osd', 'ceph-{id}'.format(id=id_)),
                        )
                    )

        for id_ in teuthology.roles_of_type(roles_for_host, 'osd'):
            remote.run(
                args=[
                    'sudo',
                    'MALLOC_CHECK_=3',
                    'adjust-ulimits',
                    'ceph-coverage',
                    coverage_dir,
                    'ceph-osd',
                    '--mkfs',
                    '--mkkey',
                    '-i', id_,
                    '--monmap', '{tdir}/monmap'.format(tdir=testdir),
                    ],
                )


    log.info('Reading keys from all nodes...')
    keys_fp = StringIO()
    keys = []
    for remote, roles_for_host in ctx.cluster.remotes.iteritems():
        for type_ in ['mds','osd']:
            for id_ in teuthology.roles_of_type(roles_for_host, type_):
                data = teuthology.get_file(
                    remote=remote,
                    path='/var/lib/ceph/{type}/ceph-{id}/keyring'.format(
                        type=type_,
                        id=id_,
                        ),
                    sudo=True,
                    )
                keys.append((type_, id_, data))
                keys_fp.write(data)
    for remote, roles_for_host in ctx.cluster.remotes.iteritems():
        for type_ in ['client']:
            for id_ in teuthology.roles_of_type(roles_for_host, type_):
                data = teuthology.get_file(
                    remote=remote,
                    path='/etc/ceph/ceph.client.{id}.keyring'.format(id=id_)
                    )
                keys.append((type_, id_, data))
                keys_fp.write(data)

    log.info('Adding keys to all mons...')
    writes = mons.run(
        args=[
            'sudo', 'tee', '-a',
            keyring_path,
            ],
        stdin=run.PIPE,
        wait=False,
        stdout=StringIO(),
        )
    keys_fp.seek(0)
    teuthology.feed_many_stdins_and_close(keys_fp, writes)
    run.wait(writes)
    for type_, id_, data in keys:
        run.wait(
            mons.run(
                args=[
                    'sudo',
                    'adjust-ulimits',
                    'ceph-coverage',
                    coverage_dir,
                    'ceph-authtool',
                    keyring_path,
                    '--name={type}.{id}'.format(
                        type=type_,
                        id=id_,
                        ),
                    ] + list(teuthology.generate_caps(type_)),
                wait=False,
                ),
            )

    log.info('Running mkfs on mon nodes...')
    for remote, roles_for_host in mons.remotes.iteritems():
        for id_ in teuthology.roles_of_type(roles_for_host, 'mon'):
            remote.run(
                args=[
                  'sudo',
                  'mkdir',
                  '-p',
                  '/var/lib/ceph/mon/ceph-{id}'.format(id=id_),
                  ],
                )
            remote.run(
                args=[
                    'sudo',
                    'adjust-ulimits',
                    'ceph-coverage',
                    coverage_dir,
                    'ceph-mon',
                    '--mkfs',
                    '-i', id_,
                    '--monmap={tdir}/monmap'.format(tdir=testdir),
                    '--osdmap={tdir}/osdmap'.format(tdir=testdir),
                    '--keyring={kpath}'.format(kpath=keyring_path),
                    ],
                )


    run.wait(
        mons.run(
            args=[
                'rm',
                '--',
                '{tdir}/monmap'.format(tdir=testdir),
                '{tdir}/osdmap'.format(tdir=testdir),
                ],
            wait=False,
            ),
        )

    try:
        yield
    except Exception:
        # we need to know this below
        ctx.summary['success'] = False
        raise
    finally:
        (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()

        log.info('Checking cluster log for badness...')
        def first_in_ceph_log(pattern, excludes):
            args = [
                'sudo',
                'egrep', pattern,
                '/var/log/ceph/ceph.log',
                ]
            for exclude in excludes:
                args.extend([run.Raw('|'), 'egrep', '-v', exclude])
            args.extend([
                    run.Raw('|'), 'head', '-n', '1',
                    ])
            r = mon0_remote.run(
                stdout=StringIO(),
                args=args,
                )
            stdout = r.stdout.getvalue()
            if stdout != '':
                return stdout
            return None

        if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]',
                             config['log_whitelist']) is not None:
            log.warning('Found errors (ERR|WRN|SEC) in cluster log')
            ctx.summary['success'] = False
            # use the most severe problem as the failure reason
            if 'failure_reason' not in ctx.summary:
                for pattern in ['\[SEC\]', '\[ERR\]', '\[WRN\]']:
                    match = first_in_ceph_log(pattern, config['log_whitelist'])
                    if match is not None:
                        ctx.summary['failure_reason'] = \
                            '"{match}" in cluster log'.format(
                            match=match.rstrip('\n'),
                            )
                        break

        for remote, dirs in devs_to_clean.iteritems():
            for dir_ in dirs:
                log.info('Unmounting %s on %s' % (dir_, remote))
                remote.run(
                    args=[
                        'sync',
                        run.Raw('&&'),
                        'sudo',
                        'umount',
                        '-f',
                        dir_
                        ]
                    )

        if config.get('tmpfs_journal'):
            log.info('tmpfs journal enabled - unmounting tmpfs at /mnt')
            for remote, roles_for_host in osds.remotes.iteritems():
                remote.run(
                    args=[ 'sudo', 'umount', '-f', '/mnt' ],
                    check_status=False,
                )

        if ctx.archive is not None and \
                not (ctx.config.get('archive-on-error') and ctx.summary['success']):
            # archive mon data, too
            log.info('Archiving mon data...')
            path = os.path.join(ctx.archive, 'data')
            os.makedirs(path)
            for remote, roles in mons.remotes.iteritems():
                for role in roles:
                    if role.startswith('mon.'):
                        teuthology.pull_directory_tarball(
                            remote,
                            '/var/lib/ceph/mon',
                            path + '/' + role + '.tgz')

            # and logs
            log.info('Compressing logs...')
            run.wait(
                ctx.cluster.run(
                    args=[
                        'sudo',
                        'find',
                        '/var/log/ceph',
                        '-name',
                        '*.log',
                        '-print0',
                        run.Raw('|'),
                        'sudo',
                        'xargs',
                        '-0',
                        '--no-run-if-empty',
                        '--',
                        'gzip',
                        '--',
                        ],
                    wait=False,
                    ),
                )

            log.info('Archiving logs...')
            path = os.path.join(ctx.archive, 'remote')
            os.makedirs(path)
            for remote in ctx.cluster.remotes.iterkeys():
                sub = os.path.join(path, remote.shortname)
                os.makedirs(sub)
                teuthology.pull_directory(remote, '/var/log/ceph',
                                          os.path.join(sub, 'log'))


        log.info('Cleaning ceph cluster...')
        run.wait(
            ctx.cluster.run(
                args=[
                    'sudo',
                    'rm',
                    '-rf',
                    '--',
                    conf_path,
                    keyring_path,
                    '{tdir}/data'.format(tdir=testdir),
                    '{tdir}/monmap'.format(tdir=testdir),
                    ],
                wait=False,
                ),
            )