Exemplo n.º 1
0
def _exec_host(barrier, barrier_queue, remote, sudo, testdir, ls):
    """
    Feed a list of shell commands to a ``bash -s`` process on one remote.

    The shell is started with ``TESTDIR`` set in its environment and with
    ``set -e`` as its first command.

    :param barrier: passed through unchanged to ``_do_barrier``
    :param barrier_queue: passed through unchanged to ``_do_barrier``
    :param remote: remote to run on; its ``name`` is logged and its
                   ``run()`` starts the shell
    :param sudo: when true, the shell is started through ``sudo``
    :param testdir: value exported as ``TESTDIR`` and substituted for the
                    literal text ``$TESTDIR`` in every command line
    :param ls: iterable of command lines; a line equal to ``barrier`` is
               not sent to the shell but triggers ``_do_barrier`` instead
    """
    log.info('Running commands on host %s', remote.name)
    args = [
        'TESTDIR={tdir}'.format(tdir=testdir),
        'bash',
        '-s'
        ]
    if sudo:
        args.insert(0, 'sudo')

    r = remote.run(args=args, stdin=tor.PIPE, wait=False)
    r.stdin.writelines(['set -e\n'])
    r.stdin.flush()
    for l in ls:
        # str.replace() returns a new string; the result must be kept or
        # the substitution is silently lost.
        l = l.replace('$TESTDIR', testdir)
        if l == "barrier":
            _do_barrier(barrier, barrier_queue, remote)
            continue

        r.stdin.writelines([l, '\n'])
        r.stdin.flush()
    # A final newline, then EOF on stdin, lets the shell finish.
    r.stdin.writelines(['\n'])
    r.stdin.flush()
    r.stdin.close()
    tor.wait([r])
Exemplo n.º 2
0
    def umount_wait(self, force=False, require_clean=False):
        """
        Unmount, wait for the ceph-fuse process to exit, then clean up.

        :param force: Complete cleanly even if the MDS is offline
        :param require_clean: Re-raise a CommandFailedError coming out of
                              the wait instead of ignoring it.  Mutually
                              exclusive with ``force``.
        :raises MaxWhileTries: re-raised, after logging, when it comes out
                               of the wait on the ceph-fuse process
        """
        if force:
            assert not require_clean  # mutually exclusive

            # When we expect to be forcing, kill the ceph-fuse process directly.
            # This should avoid hitting the more aggressive fallback killing
            # in umount() which can affect other mounts too.
            # Guarded the same way as the wait below, so that a mount with
            # no fuse process recorded does not fail with AttributeError.
            if self.fuse_daemon:
                self.fuse_daemon.stdin.close()

            # However, we will still hit the aggressive wait if there is an ongoing
            # mount -o remount (especially if the remount is stuck because MDSs
            # are unavailable)

        self.umount()

        try:
            if self.fuse_daemon:
                # Permit a timeout, so that we do not block forever
                run.wait([self.fuse_daemon], 900)
        except MaxWhileTries:
            # The two literals are concatenated; the trailing space keeps
            # "probably" and "indicates" from running together.
            log.error("process failed to terminate after unmount.  This probably "
                      "indicates a bug within ceph-fuse.")
            raise
        except CommandFailedError:
            # A failed exit status is only fatal when the caller asked for
            # a clean unmount.
            if require_clean:
                raise

        self.cleanup()
Exemplo n.º 3
0
def base(ctx, config):
    """
    Create the test directory on every remote and remove it afterwards.

    The directory is created before control is yielded.  On the way out
    its contents are listed with ``find -ls`` and it is removed with
    ``rmdir``.
    """
    log.info('Creating test directory...')
    tdir = misc.get_testdir(ctx)
    mkdir_procs = ctx.cluster.run(
        args=['mkdir', '-p', '-m0755', '--', tdir],
        wait=False,
    )
    run.wait(mkdir_procs)
    try:
        yield
    finally:
        log.info('Tidying up after the test...')
        # rmdir rather than rm -rf on purpose: if the directory is not
        # empty, one of the earlier cleanups is flawed and should be fixed
        # instead of being papered over here.
        teardown_cmd = [
            'find', tdir, '-ls',
            run.Raw(';'),
            'rmdir', '--', tdir,
        ]
        rmdir_procs = ctx.cluster.run(args=teardown_cmd, wait=False)
        run.wait(rmdir_procs)
Exemplo n.º 4
0
def sudo(ctx, config):
    """
    Edit /etc/sudoers on every remote for the test, then restore it.

    ``sed -i`` keeps a backup with the ``.orig.teuthology`` suffix; the
    backup is moved back over the edited file on exit.
    """
    log.info('Configuring sudo...')
    path = '/etc/sudoers'
    suffix = '.orig.teuthology'
    # On non-comment lines: " requiretty" becomes " !requiretty" ...
    requiretty_fix = r's/^\([^#]*\) \(requiretty\)/\1 !\2/g'
    # ... and " !visiblepw" becomes " visiblepw".
    visiblepw_fix = r's/^\([^#]*\) !\(visiblepw\)/\1 \2/g'

    edit_cmd = "sudo sed -i{ext} -e '{tty}' -e '{pw}' {path}".format(
        ext=suffix, tty=requiretty_fix, pw=visiblepw_fix,
        path=path
    )
    run.wait(ctx.cluster.run(args=edit_cmd, wait=False))
    try:
        yield
    finally:
        log.info('Restoring {0}...'.format(path))
        restore_cmd = "sudo mv -f {path}{ext} {path}".format(
            path=path, ext=suffix
        )
        ctx.cluster.run(args=restore_cmd)
Exemplo n.º 5
0
    def umount(self, force=False):
        """
        Unmount this client's mountpoint and remove the mountpoint directory.

        If the ``umount`` command raises, ``lsof`` and ``ps auxf`` are run
        on the remote before the exception is raised again.

        :param force: append ``-f`` to the umount command
        """
        log.debug('Unmounting client client.{id}...'.format(id=self.client_id))

        umount_cmd = ['sudo', 'umount', self.mountpoint]
        if force:
            umount_cmd.append('-f')

        try:
            self.client_remote.run(args=umount_cmd)
        except Exception as e:
            # Collect open files and the process tree before giving up.
            diagnostics = [
                'sudo',
                run.Raw('PATH=/usr/sbin:$PATH'),
                'lsof',
                run.Raw(';'),
                'ps', 'auxf',
            ]
            self.client_remote.run(args=diagnostics)
            raise e

        rmdir_proc = self.client_remote.run(
            args=['rmdir', '--', self.mountpoint],
            wait=False
        )
        run.wait([rmdir_proc], UMOUNT_TIMEOUT)
        self.mounted = False
Exemplo n.º 6
0
 def wait_for_exit(self):
     """
     Wait for the tracked remote command to exit, then forget it.

     Does nothing when ``self.proc`` is falsy.  ``self.proc`` is reset to
     None even if the wait raises.
     """
     if not self.proc:
         return
     try:
         run.wait([self.proc])
     finally:
         self.proc = None
Exemplo n.º 7
0
def start_apache(ctx, config, on_client = None, except_client = None):
    """
    Start apache on the selected clients; stop all of them again on exit.

    :param ctx: Context
    :param config: Configuration; its keys are the client roles used when
                   ``on_client`` is None
    :param on_client: the single client to start apache on, or None
    :param except_client: a client to skip
    """
    log.info('Starting apache...')
    testdir = teuthology.get_testdir(ctx)
    apaches = {}
    if on_client is None:
        targets = config.keys()
    else:
        targets = [on_client]
    for role in targets:
        cluster_name, daemon_type, client_id = teuthology.split_role(role)
        qualified_role = '.'.join([cluster_name, daemon_type, client_id])
        if role == except_client:
            continue
        (remote,) = ctx.cluster.only(role).remotes.keys()
        if teuthology.get_system_type(remote) == 'deb':
            apache_name = 'apache2'
        else:
            # Use httpd.worker when it exists on the remote, else httpd.
            try:
                remote.run(args=['stat', '/usr/sbin/httpd.worker'])
                apache_name = '/usr/sbin/httpd.worker'
            except CommandFailedError:
                apache_name = '/usr/sbin/httpd'

        conf_file = '{tdir}/apache/apache.{client_with_cluster}.conf'.format(
            tdir=testdir, client_with_cluster=qualified_role)
        apaches[qualified_role] = remote.run(
            args=[
                'adjust-ulimits',
                'daemon-helper',
                'kill',
                apache_name,
                '-X',
                '-f',
                conf_file,
            ],
            logger=log.getChild(role),
            stdin=run.PIPE,
            wait=False,
        )

    try:
        yield
    finally:
        log.info('Stopping apache...')
        # Close stdin on every process first, then wait for all of them.
        for proc in apaches.itervalues():
            proc.stdin.close()

        run.wait(apaches.itervalues())
Exemplo n.º 8
0
    def thread():
        """
        Run the workload command once per client, for each configured run.

        Reads ``ctx``, ``config`` and ``args`` from the enclosing scope.
        Pools created here are removed again at the end of each run, after
        waiting for snap trimming to complete.
        """
        clients = ['client.{id}'.format(id=id_) for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')]
        log.info('clients are %s' % clients)
        manager = ctx.managers['ceph']
        profile_name = None
        if config.get('ec_pool', False):
            ec_profile = config.get('erasure_code_profile', {})
            profile_name = ec_profile.get('name', 'teuthologyprofile')
            manager.create_erasure_code_profile(profile_name, ec_profile)
        for run_index in range(int(config.get('runs', '1'))):
            log.info("starting run %s out of %s", str(run_index), config.get('runs', '1'))
            tests = {}
            # NOTE(review): pop() below mutates the list stored in config,
            # so later runs see fewer existing pools - confirm intended.
            existing_pools = config.get('pools', [])
            created_pools = []
            for role in config.get('clients', clients):
                assert isinstance(role, basestring)
                PREFIX = 'client.'
                assert role.startswith(PREFIX)
                id_ = role[len(PREFIX):]

                pool = config.get('pool', None)
                # NOTE(review): a configured 'pool' also lands in the else
                # branch and is replaced by a freshly created pool.
                if not pool and existing_pools:
                    pool = existing_pools.pop()
                else:
                    use_overwrites = config.get('erasure_code_use_overwrites', False)
                    pool = manager.create_pool_with_unique_name(
                        erasure_code_profile_name=profile_name,
                        erasure_code_use_overwrites=use_overwrites
                    )
                    created_pools.append(pool)
                    if config.get('fast_read', False):
                        manager.raw_cluster_cmd(
                            'osd', 'pool', 'set', pool, 'fast_read', 'true')
                    min_size = config.get('min_size', None)
                    if min_size is not None:
                        manager.raw_cluster_cmd(
                            'osd', 'pool', 'set', pool, 'min_size', str(min_size))

                (remote,) = ctx.cluster.only(role).remotes.iterkeys()
                command = ["CEPH_CLIENT_ID={id_}".format(id_=id_)] + args + ["--pool", pool]
                tests[id_] = remote.run(
                    args=command,
                    logger=log.getChild("rados.{id}".format(id=id_)),
                    stdin=run.PIPE,
                    wait=False
                )
            run.wait(tests.itervalues())

            for pool in created_pools:
                manager.wait_snap_trimming_complete(pool)
                manager.remove_pool(pool)
Exemplo n.º 9
0
 def invoke_logrotate(self):
     """
     Run logrotate with ceph-test.conf on the cluster until told to stop.

     Each iteration waits on ``self.stop_event`` for up to 30 seconds and
     then runs logrotate on every remote, waiting for it to finish.
     """
     # ``ctx`` is not a parameter; it comes from the enclosing scope.
     while not self.stop_event.is_set():
         self.stop_event.wait(timeout=30)
         rotate_procs = ctx.cluster.run(
             args=['sudo', 'logrotate', '/etc/logrotate.d/ceph-test.conf'],
             wait=False,
         )
         run.wait(rotate_procs)
Exemplo n.º 10
0
def coredump(ctx, config):
    """
    Collect core dumps in the archive dir and fail the run if any appear.

    On entry the ``coredump`` directory is created under the archive dir
    and ``kernel.core_pattern`` is pointed at it.  On exit the pattern is
    reset to ``core`` and the directory is removed if empty.  A directory
    that still exists afterwards marks the run as failed.
    """
    log.info('Enabling coredump saving...')
    archive_dir = misc.get_archive_dir(ctx)
    core_dir = '{adir}/coredump'.format(adir=archive_dir)
    core_pattern = 'kernel.core_pattern={adir}/coredump/%t.%p.core'.format(adir=archive_dir)
    enable_cmd = [
        'install', '-d', '-m0755', '--',
        core_dir,
        run.Raw('&&'),
        'sudo', 'sysctl', '-w', core_pattern,
    ]
    run.wait(ctx.cluster.run(args=enable_cmd, wait=False))

    try:
        yield
    finally:
        disable_cmd = [
            'sudo', 'sysctl', '-w', 'kernel.core_pattern=core',
            run.Raw('&&'),
            # don't litter the archive dir if there were no cores dumped
            'rmdir',
            '--ignore-fail-on-non-empty',
            '--',
            core_dir,
        ]
        run.wait(ctx.cluster.run(args=disable_cmd, wait=False))

        # The remote prints OK only when the directory is gone; anything
        # else means the rmdir above left it behind, i.e. cores were seen.
        for rem in ctx.cluster.remotes.iterkeys():
            probe = rem.run(
                args=[
                    'if', 'test', '!', '-e', core_dir, run.Raw(';'), 'then',
                    'echo', 'OK', run.Raw(';'),
                    'fi',
                ],
                stdout=StringIO(),
            )
            if probe.stdout.getvalue() == 'OK\n':
                continue
            log.warning('Found coredumps on %s, flagging run as failed', rem)
            set_status(ctx.summary, 'fail')
            # Keep an already recorded failure reason.
            if 'failure_reason' not in ctx.summary:
                ctx.summary['failure_reason'] = \
                    'Found coredumps on {rem}'.format(rem=rem)
Exemplo n.º 11
0
def start_apache(ctx, config):
    """
    Start apache for every client named in config; stop them all on exit.

    :param ctx: Context
    :param config: Configuration; its keys are the client roles
    """
    log.info('Starting apache...')
    testdir = teuthology.get_testdir(ctx)
    apaches = {}
    for role in config.iterkeys():
        (remote,) = ctx.cluster.only(role).remotes.keys()
        if teuthology.get_system_type(remote) == 'deb':
            apache_name = 'apache2'
        else:
            # Use httpd.worker when it exists on the remote, else httpd.
            try:
                remote.run(args=['stat', '/usr/sbin/httpd.worker'])
                apache_name = '/usr/sbin/httpd.worker'
            except CommandFailedError:
                apache_name = '/usr/sbin/httpd'

        conf_file = '{tdir}/apache/apache.{client}.conf'.format(
            tdir=testdir, client=role)
        apaches[role] = remote.run(
            args=[
                'adjust-ulimits',
                'daemon-helper',
                'kill',
                apache_name,
                '-X',
                '-f',
                conf_file,
            ],
            logger=log.getChild(role),
            stdin=run.PIPE,
            wait=False,
        )

    try:
        yield
    finally:
        log.info('Stopping apache...')
        # Close stdin on every process first, then wait for all of them.
        for proc in apaches.itervalues():
            proc.stdin.close()

        run.wait(apaches.itervalues())
Exemplo n.º 12
0
    def wait(self, timeout=300):
        """
        Wait for the daemon process to exit without asking it to stop.

        Whatever the wait raises is logged as a failure and passed up to
        the caller.  In every case the daemon is marked as not running by
        clearing ``self.proc``.

        :param timeout: timeout handed to ``run.wait``
        """
        self.log.debug('waiting for process to exit')
        try:
            try:
                run.wait([self.proc], timeout=timeout)
                self.log.info('Stopped')
            except:
                # Log only; the exception itself goes up unchanged.
                self.log.info('Failed')
                raise
        finally:
            self.proc = None
Exemplo n.º 13
0
def _exec_role(remote, role, sudo, ls):
    """
    Feed shell commands to ``bash -s`` on a remote, from a role's mount dir.

    :param remote: remote to run on
    :param role: role name; the part after the first dot selects the
                 ``/tmp/cephtest/mnt.<id>`` directory to change into
    :param sudo: when true, the shell is started through ``sudo``
    :param ls: iterable of command lines to send
    """
    log.info('Running commands on role %s host %s', role, remote.name)
    cid = role.split('.')[1]
    shell_cmd = ['sudo', 'bash', '-s'] if sudo else ['bash', '-s']
    proc = remote.run(args=shell_cmd, stdin=tor.PIPE, wait=False)

    def _send(chunks):
        # Write one batch to the shell and flush it right away.
        proc.stdin.writelines(chunks)
        proc.stdin.flush()

    _send(['set -e\n'])
    _send(['cd /tmp/cephtest/mnt.{cid}\n'.format(cid=cid)])
    for line in ls:
        _send([line, '\n'])
    _send(['\n'])
    proc.stdin.close()
    tor.wait([proc])
Exemplo n.º 14
0
    def umount(self, force=False):
        """
        Unmount this client's mountpoint and remove the mountpoint directory.

        :param force: append ``-f`` to the umount command
        """
        log.debug('Unmounting client client.{id}...'.format(id=self.client_id))

        umount_cmd = ['sudo', 'umount', self.mountpoint]
        if force:
            umount_cmd.append('-f')
        self.client_remote.run(args=umount_cmd)

        # The rmdir runs in the background and is waited on with a timeout.
        rmdir_proc = self.client_remote.run(
            args=['rmdir', '--', self.mountpoint],
            wait=False
        )
        run.wait([rmdir_proc], UMOUNT_TIMEOUT)
        self.mounted = False
Exemplo n.º 15
0
def task(ctx, config):
    """
    Run chef-solo on all nodes.

    The bootstrap script is fetched with wget and piped into ``sh -x``
    with CHEF_REPO and CHEF_BRANCH set for it.

    Optional parameters:
    tasks:
    -chef
        script_url: # override default location for solo-from-scratch for Chef
        chef_repo: # override default Chef repo used by solo-from-scratch
        chef_branch: # to choose a different upstream branch for ceph-qa-chef
    """
    log.info("Running chef-solo...")

    if config is None:
        config = {}

    assert isinstance(config, dict), "chef - need config"
    default_script = "http://git.ceph.com/?p=ceph-qa-chef.git;a=blob_plain;f=solo/solo-from-scratch;hb=HEAD"
    chef_script = config.get("script_url", default_script)
    chef_repo = config.get("chef_repo", "")
    chef_branch = config.get("chef_branch", "")

    # The two assignments are passed raw so that the shell treats them as
    # environment settings for "sh".
    bootstrap_cmd = [
        "wget",
        "-O-",
        chef_script,
        run.Raw("|"),
        run.Raw("CHEF_REPO={repo}".format(repo=chef_repo)),
        run.Raw("CHEF_BRANCH={branch}".format(branch=chef_branch)),
        "sh",
        "-x",
    ]
    run.wait(ctx.cluster.run(args=bootstrap_cmd, wait=False))

    log.info("Reconnecting after ceph-qa-chef run")
    # Reconnect for ulimit and other ceph-qa-chef changes
    misc.reconnect(ctx, 10)
Exemplo n.º 16
0
def write_conf(ctx, conf_path=DEFAULT_CONF_PATH):
    """
    Write the in-memory ceph configuration to ``conf_path`` on every remote.

    The configuration is serialised once into a buffer and streamed to
    the stdin of a remote command that creates /etc/ceph (mode 0755),
    copies stdin into ``conf_path`` and sets that file's mode to 0644.

    :param ctx: Context; ``ctx.ceph.conf`` is the configuration written
    :param conf_path: destination path on the remotes
    """
    conf_fp = StringIO()
    ctx.ceph.conf.write(conf_fp)
    conf_fp.seek(0)
    writes = ctx.cluster.run(
        args=[
            'sudo', 'mkdir', '-p', '/etc/ceph', run.Raw('&&'),
            'sudo', 'chmod', '0755', '/etc/ceph', run.Raw('&&'),
            # NOTE(review): file() is a Python 2 builtin, so the remote
            # "python" has to be a Python 2 interpreter.
            'sudo', 'python',
            '-c',
            'import shutil, sys; shutil.copyfileobj(sys.stdin, file(sys.argv[1], "wb"))',
            conf_path,
            run.Raw('&&'),
            'sudo', 'chmod', '0644', conf_path,
        ],
        stdin=run.PIPE,
        wait=False)
    # Logger.warn is a deprecated alias; warning() is the supported name.
    log.warning("writes: ")
    teuthology.feed_many_stdins_and_close(conf_fp, writes)
    run.wait(writes)
Exemplo n.º 17
0
    def stop(self, timeout=300):
        """
        Stop this daemon instance.

        Closes the process's stdin and waits for it to exit.  A
        CommandFailedError raised by the wait is logged and not passed on;
        any other exception from the wait propagates to the caller.
        Stopping a daemon that is not running only logs an error.

        :param timeout: timeout to pass to orchestra.run.wait()
        """
        if self.running():
            self.proc.stdin.close()
            self.log.debug('waiting for process to exit')
            try:
                run.wait([self.proc], timeout=timeout)
            except CommandFailedError:
                log.exception("Error while waiting for process to exit")
            self.proc = None
            self.log.info('Stopped')
        else:
            self.log.error('tried to stop a non-running daemon')
Exemplo n.º 18
0
    def thread():
        """
        Run the workload command once per client, for each configured run.

        Reads ``ctx``, ``config`` and ``args`` from the enclosing scope.
        Pools created here are removed again at the end of each run.
        """
        clients = ["client.{id}".format(id=id_) for id_ in teuthology.all_roles_of_type(ctx.cluster, "client")]
        log.info("clients are %s" % clients)
        profile_name = None
        if config.get("ec_pool", False):
            ec_profile = config.get("erasure_code_profile", {})
            profile_name = ec_profile.get("name", "teuthologyprofile")
            ctx.manager.create_erasure_code_profile(profile_name, ec_profile)
        for run_index in range(int(config.get("runs", "1"))):
            log.info("starting run %s out of %s", str(run_index), config.get("runs", "1"))
            tests = {}
            existing_pools = config.get("pools", [])
            created_pools = []
            for role in config.get("clients", clients):
                assert isinstance(role, basestring)
                PREFIX = "client."
                assert role.startswith(PREFIX)
                id_ = role[len(PREFIX):]

                pool = config.get("pool", None)
                # NOTE(review): a configured "pool" also lands in the else
                # branch and is replaced by a freshly created pool.
                if not pool and existing_pools:
                    pool = existing_pools.pop()
                else:
                    pool = ctx.manager.create_pool_with_unique_name(erasure_code_profile_name=profile_name)
                    created_pools.append(pool)

                (remote,) = ctx.cluster.only(role).remotes.iterkeys()
                command = ["CEPH_CLIENT_ID={id_}".format(id_=id_)] + args + ["--pool", pool]
                tests[id_] = remote.run(
                    args=command,
                    logger=log.getChild("rados.{id}".format(id=id_)),
                    stdin=run.PIPE,
                    wait=False,
                )
            run.wait(tests.itervalues())

            for pool in created_pools:
                ctx.manager.remove_pool(pool)
Exemplo n.º 19
0
def archive(ctx, config):
    """
    Create the remote archive directory, pull it back on exit, remove it.

    An exception raised while the body runs sets the run status to
    ``fail`` before being re-raised.  Files are transferred only when
    ``ctx.archive`` is set, and are skipped when ``archive-on-error`` is
    configured and the run passed.
    """
    log.info('Creating archive directory...')
    archive_dir = misc.get_archive_dir(ctx)
    create_procs = ctx.cluster.run(
        args=['install', '-d', '-m0755', '--', archive_dir],
        wait=False,
    )
    run.wait(create_procs)

    try:
        yield
    except Exception:
        # we need to know this below
        set_status(ctx.summary, 'fail')
        raise
    finally:
        passed = get_status(ctx.summary) == 'pass'
        if ctx.archive is not None and \
                (not ctx.config.get('archive-on-error') or not passed):
            log.info('Transferring archived files...')
            logdir = os.path.join(ctx.archive, 'remote')
            if not os.path.exists(logdir):
                os.mkdir(logdir)
            for rem in ctx.cluster.remotes.iterkeys():
                local_path = os.path.join(logdir, rem.shortname)
                misc.pull_directory(rem, archive_dir, local_path)
                # Check for coredumps and pull binaries
                fetch_binaries_for_coredumps(local_path, rem)

        log.info('Removing archive directory...')
        remove_procs = ctx.cluster.run(
            args=['rm', '-rf', '--', archive_dir],
            wait=False,
        )
        run.wait(remove_procs)
Exemplo n.º 20
0
def _exec_host(barrier, barrier_queue, remote, sudo, testdir, ls):
    """
    Execute a list of shell commands remotely through ``bash -s``.

    The shell is started with ``TESTDIR`` set in its environment and with
    ``set -e`` as its first command.

    :param barrier: passed through unchanged to ``_do_barrier``
    :param barrier_queue: passed through unchanged to ``_do_barrier``
    :param remote: remote to run on; its ``name`` is logged and its
                   ``run()`` starts the shell
    :param sudo: when true, the shell is started through ``sudo``
    :param testdir: value exported as ``TESTDIR`` and substituted for the
                    literal text ``$TESTDIR`` in every command line
    :param ls: iterable of command lines; a line equal to ``barrier`` is
               not sent to the shell but triggers ``_do_barrier`` instead
    """
    log.info("Running commands on host %s", remote.name)
    args = ["TESTDIR={tdir}".format(tdir=testdir), "bash", "-s"]
    if sudo:
        args.insert(0, "sudo")

    r = remote.run(args=args, stdin=tor.PIPE, wait=False)
    r.stdin.writelines(["set -e\n"])
    r.stdin.flush()
    for l in ls:
        # str.replace() returns a new string; the result must be kept or
        # the substitution is silently lost.
        l = l.replace("$TESTDIR", testdir)
        if l == "barrier":
            _do_barrier(barrier, barrier_queue, remote)
            continue

        r.stdin.writelines([l, "\n"])
        r.stdin.flush()
    # A final newline, then EOF on stdin, lets the shell finish.
    r.stdin.writelines(["\n"])
    r.stdin.flush()
    r.stdin.close()
    tor.wait([r])
Exemplo n.º 21
0
def task(ctx, config):
    """
    Run chef-solo on all nodes.

    The solo-from-scratch script is fetched with wget and piped into
    ``sh -x`` on every remote, after which the connections are re-opened.
    """
    log.info('Running chef-solo...')

    bootstrap_cmd = [
        'wget',
        '-O-',
        'http://ceph.com/git/?p=ceph-qa-chef.git;a=blob_plain;f=solo/solo-from-scratch;hb=HEAD',
        run.Raw('|'),
        'sh',
        '-x',
    ]
    chef_procs = ctx.cluster.run(args=bootstrap_cmd, wait=False)
    run.wait(chef_procs)

    log.info('Reconnecting after ceph-qa-chef run')
    # Reconnect for ulimit and other ceph-qa-chef changes
    misc.reconnect(ctx, 10)
Exemplo n.º 22
0
def ceph_log(ctx, config):
    """
    Prepare /var/log/ceph on every remote before the test body runs.

    Makes the directory world-writable, removes the ceph logrotate
    configuration and creates the valgrind and profiling-logger
    subdirectories.  Nothing is undone on exit.

    :param ctx: Context
    :param config: Configuration
    """
    # Each step is logged, started on all remotes, and waited for before
    # the next one begins.
    steps = [
        ('Making ceph log dir writeable by non-root...',
         ['sudo', 'chmod', '777', '/var/log/ceph']),
        ('Disabling ceph logrotate...',
         ['sudo', 'rm', '-f', '--', '/etc/logrotate.d/ceph']),
        ('Creating extra log directories...',
         ['sudo', 'install', '-d', '-m0755', '--',
          '/var/log/ceph/valgrind',
          '/var/log/ceph/profiling-logger']),
    ]
    for message, command in steps:
        log.info(message)
        run.wait(ctx.cluster.run(args=command, wait=False))

    yield
Exemplo n.º 23
0
def task(ctx, config):
    """
    This task is designed to test locking. It runs an executable
    for each lock attempt you specify, at 0.1 second intervals (to
    preserve ordering of the locks).
    You can also introduce longer intervals by setting an entry
    as a number of seconds, rather than the lock dictionary.
    The config is a list of dictionaries. For each entry in the list, you
    must name the "client" to run on, the "lockfile" to lock, and
    the "holdtime" to hold the lock.
    Optional entries are the "offset" and "length" of the lock. You can also specify a
    "maxwait" timeout period which fails if the executable takes longer
    to complete, and an "expectfail" (defaults to False).

    An example::

        tasks:
        - ceph:
        - ceph-fuse: [client.0, client.1]
        - lockfile:
          [{client:client.0, lockfile:testfile, holdtime:10},
          {client:client.1, lockfile:testfile, holdtime:0, maxwait:0, expectfail:true},
          {client:client.1, lockfile:testfile, holdtime:0, maxwait:15, expectfail:false},
          10,
          {client: client.1, lockfile: testfile, holdtime: 5},
          {client: client.2, lockfile: testfile, holdtime: 5, maxwait: 1, expectfail: True}]


    In the past this test would have failed; there was a bug where waitlocks weren't
    cleaned up if the process failed. More involved scenarios are also possible.

    :param ctx: Context
    :param config: Configuration
    :raises KeyError: when a dictionary entry lacks "client", "lockfile"
                      or "holdtime"
    :raises Exception: when a lock attempt returns a false result
    """
    log.info('Starting lockfile')
    # Everything the cleanup in ``finally`` reads is bound before the
    # ``try``, so an early failure (e.g. a bad config) is reported as
    # itself instead of being masked by a NameError during cleanup.
    lock_procs = list()
    clients = list()
    files = list()
    testdir = teuthology.get_testdir(ctx)
    try:
        assert isinstance(config, list), \
            "task lockfile got invalid config"

        log.info("building executable on each host")
        buildprocs = list()
        # collect the clients and lock files named by the dictionary entries
        for op in config:
            if not isinstance(op, dict):
                continue
            log.info("got an op")
            # Validate before reading any key, so that a missing key gives
            # the "bad config" error rather than a bare KeyError.
            for required in ("client", "lockfile", "holdtime"):
                if required not in op:
                    raise KeyError("bad config {op_}".format(op_=op))
            log.info("op['client'] = %s", op['client'])
            clients.append(op['client'])
            files.append(op['lockfile'])
            if "expectfail" not in op:
                op["expectfail"] = False

        clients = set(clients)
        files = set(files)
        # build the locker executable on each client
        for client in clients:
            (client_remote, ) = ctx.cluster.only(client).remotes.keys()
            log.info("got a client remote")
            (_, _, client_id) = client.partition('.')

            proc = client_remote.run(args=[
                'mkdir', '-p', '{tdir}/archive/lockfile'.format(tdir=testdir),
                run.Raw('&&'), 'mkdir', '-p',
                '{tdir}/lockfile'.format(tdir=testdir),
                run.Raw('&&'), 'wget', '-nv', '--no-check-certificate',
                'https://raw.github.com/gregsfortytwo/FileLocker/master/sclockandhold.cpp',
                '-O', '{tdir}/lockfile/sclockandhold.cpp'.format(tdir=testdir),
                run.Raw('&&'), 'g++',
                '{tdir}/lockfile/sclockandhold.cpp'.format(tdir=testdir), '-o',
                '{tdir}/lockfile/sclockandhold'.format(tdir=testdir)
            ],
                                     logger=log.getChild(
                                         'lockfile_client.{id}'.format(
                                             id=client_id)),
                                     wait=False)
            log.info(
                'building sclockandhold on client{id}'.format(id=client_id))
            buildprocs.append(proc)

        # wait for builds to finish
        run.wait(buildprocs)
        log.info('finished building sclockandhold on all clients')

        # create the files to run these locks on, through one arbitrary
        # client (pop + add picks an element without removing it)
        client = clients.pop()
        clients.add(client)
        (client_remote, ) = ctx.cluster.only(client).remotes.keys()
        (_, _, client_id) = client.partition('.')
        file_procs = list()
        for lockfile in files:
            filepath = os.path.join(testdir, 'mnt.{id}'.format(id=client_id),
                                    lockfile)
            proc = client_remote.run(
                args=[
                    'sudo',
                    'touch',
                    filepath,
                ],
                logger=log.getChild('lockfile_createfile'),
                wait=False)
            file_procs.append(proc)
        run.wait(file_procs)
        file_procs = list()
        for lockfile in files:
            filepath = os.path.join(testdir, 'mnt.{id}'.format(id=client_id),
                                    lockfile)
            proc = client_remote.run(
                args=['sudo', 'chown', 'ubuntu.ubuntu', filepath],
                logger=log.getChild('lockfile_createfile'),
                wait=False)
            file_procs.append(proc)
        run.wait(file_procs)
        log.debug('created files to lock')

        # now actually run the locktests
        for op in config:
            if not isinstance(op, dict):
                # a bare number is a pause, in seconds
                assert isinstance(op, int) or isinstance(op, float)
                log.info("sleeping for {sleep} seconds".format(sleep=op))
                time.sleep(op)
                continue
            greenlet = gevent.spawn(lock_one, op, ctx)
            lock_procs.append((greenlet, op))
            time.sleep(0.1)  # to provide proper ordering

        for (greenlet, op) in lock_procs:
            log.debug('checking lock for op {op_}'.format(op_=op))
            result = greenlet.get()
            if not result:
                raise Exception("Got wrong result for op {op_}".format(op_=op))

    finally:
        # cleanup!
        for (greenlet, op) in lock_procs:
            log.debug('closing proc for op {op_}'.format(op_=op))
            greenlet.kill(block=True)

        # Remove the build directory and every lock file on each client.
        # The paths come from the collected file names, not from whatever
        # ``op`` the last loop left behind (which may be a number).
        for client in set(clients):
            (client_remote, ) = ctx.cluster.only(client).remotes.keys()
            (_, _, client_id) = client.partition('.')
            mountdir = os.path.join(testdir, 'mnt.{id}'.format(id=client_id))
            lockpaths = [os.path.join(mountdir, lockfile)
                         for lockfile in sorted(set(files))]
            client_remote.run(args=[
                'rm', '-rf', '{tdir}/lockfile'.format(tdir=testdir),
                run.Raw(';'), 'sudo', 'rm', '-rf'
            ] + lockpaths,
                              wait=True)
Exemplo n.º 24
0
def rook_operator(ctx, config):
    """
    Deploy the rook operator for the test body and remove it afterwards.

    Clones the rook repository on the cluster's remote, optionally
    patches the operator image, creates the CRDs, common resources and
    operator with kubectl, waits for an operator pod to be Running and
    streams its log until cleanup.

    :param ctx: Context; ``ctx.rook[cluster_name].remote`` is the remote
                the repository is cloned on
    :param config: Configuration; reads ``cluster`` (required),
                   ``rook_branch``, ``rook_git_url`` and ``rook_image``
    """
    cluster_name = config['cluster']
    rook_branch = config.get('rook_branch', 'master')
    rook_git_url = config.get('rook_git_url', 'https://github.com/rook/rook')

    log.info(f'Cloning {rook_git_url} branch {rook_branch}')
    # Start from a clean checkout: remove any previous clone first.
    ctx.rook[cluster_name].remote.run(
        args=[
            'rm', '-rf', 'rook',
            run.Raw('&&'),
            'git',
            'clone',
            '--single-branch',
            '--branch', rook_branch,
            rook_git_url,
            'rook',
        ]
    )

    # operator.yaml
    operator_yaml = ctx.rook[cluster_name].remote.read_file(
        'rook/cluster/examples/kubernetes/ceph/operator.yaml'
    )
    rook_image = config.get('rook_image')
    if rook_image:
        log.info(f'Patching operator to use image {rook_image}')
        crs = list(yaml.load_all(operator_yaml, Loader=yaml.FullLoader))
        # The patch below relies on the file holding exactly two YAML
        # documents, the second one carrying the container spec.
        assert len(crs) == 2
        crs[1]['spec']['template']['spec']['containers'][0]['image'] = rook_image
        operator_yaml = yaml.dump_all(crs)
    # The (possibly patched) copy is what gets deployed and deleted below.
    ctx.rook[cluster_name].remote.write_file('operator.yaml', operator_yaml)

    # Stays None unless the log follower was started; checked in cleanup.
    op_job = None
    try:
        log.info('Deploying operator')
        _kubectl(ctx, config, [
            'create',
            '-f', 'rook/cluster/examples/kubernetes/ceph/crds.yaml',
            '-f', 'rook/cluster/examples/kubernetes/ceph/common.yaml',
            '-f', 'operator.yaml',
        ])

        # on centos:
        if teuthology.get_distro(ctx) == 'centos':
            _kubectl(ctx, config, [
                '-n', 'rook-ceph',
                'set', 'env', 'deploy/rook-ceph-operator',
                'ROOK_HOSTPATH_REQUIRES_PRIVILEGED=true'
            ])

        # wait for operator
        op_name = None
        with safe_while(sleep=10, tries=90, action="wait for operator") as proceed:
            while not op_name and proceed():
                p = _kubectl(
                    ctx, config,
                    ['-n', 'rook-ceph', 'get', 'pods', '-l', 'app=rook-ceph-operator'],
                    stdout=BytesIO(),
                )
                # Each output line is split into name, ready, status and
                # the remainder; the first pod reported Running is taken.
                for line in p.stdout.getvalue().decode('utf-8').strip().splitlines():
                    name, ready, status, _ = line.split(None, 3)
                    if status == 'Running':
                        op_name = name
                        break

        # log operator output
        # Started without waiting, so it keeps following the log while
        # the test body runs.
        op_job = _kubectl(
            ctx,
            config,
            ['-n', 'rook-ceph', 'logs', '-f', op_name],
            wait=False,
            logger=log.getChild('operator'),
        )

        yield

    except Exception as e:
        # Log here, then let the exception continue to the caller.
        log.exception(e)
        raise

    finally:
        log.info('Cleaning up rook operator')
        _kubectl(ctx, config, [
            'delete',
            '-f', 'operator.yaml',
        ])
        # Deliberately disabled branch, kept for reference.
        if False:
            # don't bother since we'll tear down k8s anyway (and this mysteriously
            # fails sometimes when deleting some of the CRDs... not sure why!)
            _kubectl(ctx, config, [
                'delete',
                '-f', 'rook/cluster/examples/kubernetes/ceph/common.yaml',
            ])
            _kubectl(ctx, config, [
                'delete',
                '-f', 'rook/cluster/examples/kubernetes/ceph/crds.yaml',
            ])
        ctx.rook[cluster_name].remote.run(args=['rm', '-rf', 'rook', 'operator.yaml'])
        # Wait for the log follower, if it was started.
        if op_job:
            op_job.wait()
        run.wait(
            ctx.cluster.run(
                args=[
                    'sudo', 'rm', '-rf', '/var/lib/rook'
                ]
            )
        )
Exemplo n.º 25
0
def build_ceph_cluster(ctx, config):
    """
    Build a ceph cluster with ceph-deploy, yield while it is in use, then
    stop it, archive its data and purge it.

    This is a generator: everything before the ``yield`` is setup and the
    ``finally`` clause is teardown, which runs whether or not setup succeeded.

    :param ctx: Run context; ``ctx.cluster`` and ``ctx.archive`` are used.
    :param config: Task configuration dict. Keys read here: ``branch``,
                   ``conf``, ``test_mon_destroy``, ``dmcrypt`` and
                   ``wait-for-healthy``.
    :raises RuntimeError: if no monitor nodes are found, if a ceph-deploy
                          step fails, or if the final operational check
                          does not pass.
    """

    try:
        log.info('Building ceph cluster using ceph-deploy...')
        testdir = teuthology.get_testdir(ctx)

        # Translate the 'branch' mapping into a ceph-deploy install option.
        ceph_branch = None
        if config.get('branch') is not None:
            cbranch = config.get('branch')
            for var, val in cbranch.iteritems():
                if var == 'testing':
                    # 'testing' is emitted as a bare flag; without the
                    # else below it was always replaced by --testing=<val>.
                    ceph_branch = '--{var}'.format(var=var)
                else:
                    ceph_branch = '--{var}={val}'.format(var=var, val=val)

        all_nodes = get_all_nodes(ctx, config)
        mds_nodes = get_nodes_using_roles(ctx, config, 'mds')
        mds_nodes = " ".join(mds_nodes)
        mon_node = get_nodes_using_roles(ctx, config, 'mon')
        mon_nodes = " ".join(mon_node)

        # mon_nodes is a joined string, so it is never None; test for empty.
        if not mon_nodes:
            raise RuntimeError("no monitor nodes in the config file")

        new_mon = './ceph-deploy new' + " " + mon_nodes
        # Only add the branch option when one was configured; concatenating
        # None would raise TypeError.
        install_nodes = './ceph-deploy install '
        if ceph_branch is not None:
            install_nodes += ceph_branch + " "
        install_nodes += all_nodes
        mon_hostname = mon_nodes.split(' ')[0]
        mon_hostname = str(mon_hostname)
        gather_keys = './ceph-deploy gatherkeys' + " " + mon_hostname
        deploy_mds = './ceph-deploy mds create' + " " + mds_nodes
        no_of_osds = 0

        estatus_new = execute_ceph_deploy(ctx, config, new_mon)
        if estatus_new != 0:
            raise RuntimeError("ceph-deploy: new command failed")

        log.info('adding config inputs...')
        conf_path = '{tdir}/ceph-deploy/ceph.conf'.format(tdir=testdir)
        first_mon = teuthology.get_first_mon(ctx, config)
        (remote, ) = ctx.cluster.only(first_mon).remotes.keys()

        # Append every [section] / key = value pair from config['conf'] to
        # the ceph.conf that 'ceph-deploy new' was run to produce.
        if config.get('conf') is not None:
            confp = config.get('conf')
            for section, keys in confp.iteritems():
                lines = '[{section}]\n'.format(section=section)
                teuthology.append_lines_to_file(remote,
                                                conf_path,
                                                lines,
                                                sudo=True)
                for key, value in keys.iteritems():
                    log.info("[%s] %s = %s" % (section, key, value))
                    lines = '{key} = {value}\n'.format(key=key, value=value)
                    teuthology.append_lines_to_file(remote,
                                                    conf_path,
                                                    lines,
                                                    sudo=True)

        estatus_install = execute_ceph_deploy(ctx, config, install_nodes)
        if estatus_install != 0:
            raise RuntimeError("ceph-deploy: Failed to install ceph")

        mon_create_nodes = './ceph-deploy mon create-initial'
        # If the following fails, it is OK, it might just be that the monitors
        # are taking way more than a minute/monitor to form quorum, so lets
        # try the next block which will wait up to 15 minutes to gatherkeys.
        execute_ceph_deploy(ctx, config, mon_create_nodes)

        estatus_gather = execute_ceph_deploy(ctx, config, gather_keys)
        max_gather_tries = 90
        gather_tries = 0
        while estatus_gather != 0:
            gather_tries += 1
            if gather_tries >= max_gather_tries:
                msg = 'ceph-deploy was not able to gatherkeys after 15 minutes'
                raise RuntimeError(msg)
            # Wait before retrying, so that a successful retry is not
            # followed by a pointless sleep.
            time.sleep(10)
            estatus_gather = execute_ceph_deploy(ctx, config, gather_keys)

        if mds_nodes:
            estatus_mds = execute_ceph_deploy(ctx, config, deploy_mds)
            if estatus_mds != 0:
                raise RuntimeError("ceph-deploy: Failed to deploy mds")

        if config.get('test_mon_destroy') is not None:
            # Destroy every monitor except the first one.
            for mon_host in mon_node[1:]:
                mon_destroy_nodes = './ceph-deploy mon destroy' + " " + mon_host
                estatus_mon_d = execute_ceph_deploy(ctx, config,
                                                    mon_destroy_nodes)
                if estatus_mon_d != 0:
                    raise RuntimeError("ceph-deploy: Failed to delete monitor")

        node_dev_list = get_dev_for_osd(ctx, config)
        osd_create_cmd = './ceph-deploy osd create --zap-disk '
        for d in node_dev_list:
            if config.get('dmcrypt') is not None:
                osd_create_cmd_d = osd_create_cmd + '--dmcrypt' + " " + d
            else:
                osd_create_cmd_d = osd_create_cmd + d
            estatus_osd = execute_ceph_deploy(ctx, config, osd_create_cmd_d)
            if estatus_osd == 0:
                log.info('successfully created osd')
                no_of_osds += 1
            else:
                # First attempt failed: zap the data and journal disks
                # explicitly, then try the create exactly once more.
                disks = d.split(':')
                dev_disk = disks[0] + ":" + disks[1]
                j_disk = disks[0] + ":" + disks[2]
                zap_disk = './ceph-deploy disk zap ' + dev_disk + " " + j_disk
                execute_ceph_deploy(ctx, config, zap_disk)
                estatus_osd = execute_ceph_deploy(ctx, config,
                                                  osd_create_cmd_d)
                if estatus_osd == 0:
                    log.info('successfully created osd')
                    no_of_osds += 1
                else:
                    raise RuntimeError("ceph-deploy: Failed to create osds")

        if config.get('wait-for-healthy', True) and no_of_osds >= 2:
            is_healthy(ctx=ctx, config=None)

            log.info('Setting up client nodes...')
            conf_path = '/etc/ceph/ceph.conf'
            admin_keyring_path = '/etc/ceph/ceph.client.admin.keyring'
            first_mon = teuthology.get_first_mon(ctx, config)
            (mon0_remote, ) = ctx.cluster.only(first_mon).remotes.keys()
            conf_data = teuthology.get_file(
                remote=mon0_remote,
                path=conf_path,
                sudo=True,
            )
            admin_keyring = teuthology.get_file(
                remote=mon0_remote,
                path=admin_keyring_path,
                sudo=True,
            )

            # For every client role: create its key on the first mon, then
            # copy that keyring, the admin keyring and ceph.conf to the
            # client's host.
            clients = ctx.cluster.only(teuthology.is_type('client'))
            for remot, roles_for_host in clients.remotes.iteritems():
                for id_ in teuthology.roles_of_type(roles_for_host, 'client'):
                    client_keyring = \
                        '/etc/ceph/ceph.client.{id}.keyring'.format(id=id_)
                    mon0_remote.run(args=[
                        'cd',
                        '{tdir}'.format(tdir=testdir),
                        run.Raw('&&'),
                        'sudo',
                        'bash',
                        '-c',
                        run.Raw('"'),
                        'ceph',
                        'auth',
                        'get-or-create',
                        'client.{id}'.format(id=id_),
                        'mds',
                        'allow',
                        'mon',
                        'allow *',
                        'osd',
                        'allow *',
                        run.Raw('>'),
                        client_keyring,
                        run.Raw('"'),
                    ], )
                    key_data = teuthology.get_file(
                        remote=mon0_remote,
                        path=client_keyring,
                        sudo=True,
                    )
                    teuthology.sudo_write_file(remote=remot,
                                               path=client_keyring,
                                               data=key_data,
                                               perms='0644')
                    teuthology.sudo_write_file(remote=remot,
                                               path=admin_keyring_path,
                                               data=admin_keyring,
                                               perms='0644')
                    teuthology.sudo_write_file(remote=remot,
                                               path=conf_path,
                                               data=conf_data,
                                               perms='0644')
        else:
            # NOTE: this branch is also taken when 'wait-for-healthy' is
            # set to a false value, regardless of the OSD count.
            raise RuntimeError(
                "The cluster is NOT operational due to insufficient OSDs")
        yield

    finally:
        log.info('Stopping ceph...')
        ctx.cluster.run(args=[
            'sudo', 'stop', 'ceph-all',
            run.Raw('||'), 'sudo', 'service', 'ceph', 'stop'
        ])

        # Are you really not running anymore?
        # try first with the init tooling
        # ignoring the status so this becomes informational only
        ctx.cluster.run(args=[
            'sudo', 'status', 'ceph-all',
            run.Raw('||'), 'sudo', 'service', 'ceph', 'status'
        ],
                        check_status=False)

        # and now just check for the processes themselves, as if upstart/sysvinit
        # is lying to us. Ignore errors if the grep fails
        ctx.cluster.run(args=[
            'sudo', 'ps', 'aux',
            run.Raw('|'), 'grep', '-v', 'grep',
            run.Raw('|'), 'grep', 'ceph'
        ],
                        check_status=False)

        if ctx.archive is not None:
            # archive mon data, too
            log.info('Archiving mon data...')
            path = os.path.join(ctx.archive, 'data')
            os.makedirs(path)
            mons = ctx.cluster.only(teuthology.is_type('mon'))
            for remote, roles in mons.remotes.iteritems():
                for role in roles:
                    if role.startswith('mon.'):
                        teuthology.pull_directory_tarball(
                            remote, '/var/lib/ceph/mon',
                            path + '/' + role + '.tgz')

            log.info('Compressing logs...')
            run.wait(
                ctx.cluster.run(
                    args=[
                        'sudo',
                        'find',
                        '/var/log/ceph',
                        '-name',
                        '*.log',
                        '-print0',
                        run.Raw('|'),
                        'sudo',
                        'xargs',
                        '-0',
                        '--no-run-if-empty',
                        '--',
                        'gzip',
                        '--',
                    ],
                    wait=False,
                ), )

            log.info('Archiving logs...')
            path = os.path.join(ctx.archive, 'remote')
            os.makedirs(path)
            for remote in ctx.cluster.remotes.iterkeys():
                sub = os.path.join(path, remote.shortname)
                os.makedirs(sub)
                teuthology.pull_directory(remote, '/var/log/ceph',
                                          os.path.join(sub, 'log'))

        # Built here rather than in the try block so that they are defined
        # even when setup failed before reaching them.
        all_nodes = get_all_nodes(ctx, config)
        purge_nodes = './ceph-deploy purge' + " " + all_nodes
        purgedata_nodes = './ceph-deploy purgedata' + " " + all_nodes

        log.info('Purging package...')
        execute_ceph_deploy(ctx, config, purge_nodes)
        log.info('Purging data...')
        execute_ceph_deploy(ctx, config, purgedata_nodes)
Exemplo n.º 26
0
def build_ceph_cluster(ctx, config):
    """
    Build a ceph cluster with ceph-deploy, yield while it is in use, then
    stop it, archive its data and purge it.

    This is a generator: everything before the ``yield`` is setup and the
    ``finally`` clause is teardown. Teardown is skipped when
    ``config['keep_running']`` is set, but an exception raised during setup
    or thrown in at the ``yield`` still propagates in that case.

    :param ctx: Run context; ``ctx.cluster``, ``ctx.config`` and
                ``ctx.archive`` are used.
    :param config: Task configuration dict. Keys read here: ``branch``,
                   ``conf``, ``test_mon_destroy``, ``dmcrypt``,
                   ``wait-for-healthy``, ``only_mon`` and ``keep_running``.
    :raises RuntimeError: if no monitor nodes are found, if a required
                          ceph-deploy step fails, or if there are too few
                          OSDs and ``only_mon`` is not set.
    """

    # Expect to find ceph_admin on the first mon by ID, same place that the download task
    # puts it.  Remember this here, because subsequently IDs will change from those in
    # the test config to those that ceph-deploy invents.
    (ceph_admin,) = ctx.cluster.only(
        teuthology.get_first_mon(ctx, config)).remotes.iterkeys()

    # Bound before the helpers below because execute_ceph_deploy closes over
    # it and is also called during teardown.
    testdir = teuthology.get_testdir(ctx)

    def execute_ceph_deploy(cmd):
        """Remotely execute a ceph_deploy command and return its exit status"""
        return ceph_admin.run(
            args=[
                'cd',
                '{tdir}/ceph-deploy'.format(tdir=testdir),
                run.Raw('&&'),
                run.Raw(cmd),
            ],
            check_status=False,
        ).exitstatus

    def _teardown():
        """Stop ceph, archive mon data and logs, then purge packages and data"""
        log.info('Stopping ceph...')
        ctx.cluster.run(args=['sudo', 'stop', 'ceph-all', run.Raw('||'),
                              'sudo', 'service', 'ceph', 'stop', run.Raw('||'),
                              'sudo', 'systemctl', 'stop', 'ceph.target'])

        # Are you really not running anymore?
        # try first with the init tooling
        # ignoring the status so this becomes informational only
        ctx.cluster.run(
            args=[
                'sudo', 'status', 'ceph-all', run.Raw('||'),
                'sudo', 'service', 'ceph', 'status', run.Raw('||'),
                'sudo', 'systemctl', 'status', 'ceph.target'],
            check_status=False)

        # and now just check for the processes themselves, as if upstart/sysvinit
        # is lying to us. Ignore errors if the grep fails
        ctx.cluster.run(args=['sudo', 'ps', 'aux', run.Raw('|'),
                              'grep', '-v', 'grep', run.Raw('|'),
                              'grep', 'ceph'], check_status=False)

        if ctx.archive is not None:
            # archive mon data, too
            log.info('Archiving mon data...')
            path = os.path.join(ctx.archive, 'data')
            os.makedirs(path)
            mons = ctx.cluster.only(teuthology.is_type('mon'))
            for remote, roles in mons.remotes.iteritems():
                for role in roles:
                    if role.startswith('mon.'):
                        teuthology.pull_directory_tarball(
                            remote,
                            '/var/lib/ceph/mon',
                            path + '/' + role + '.tgz')

            log.info('Compressing logs...')
            run.wait(
                ctx.cluster.run(
                    args=[
                        'sudo',
                        'find',
                        '/var/log/ceph',
                        '-name',
                        '*.log',
                        '-print0',
                        run.Raw('|'),
                        'sudo',
                        'xargs',
                        '-0',
                        '--no-run-if-empty',
                        '--',
                        'gzip',
                        '--',
                    ],
                    wait=False,
                ),
            )

            log.info('Archiving logs...')
            path = os.path.join(ctx.archive, 'remote')
            os.makedirs(path)
            for remote in ctx.cluster.remotes.iterkeys():
                sub = os.path.join(path, remote.shortname)
                os.makedirs(sub)
                teuthology.pull_directory(remote, '/var/log/ceph',
                                          os.path.join(sub, 'log'))

        # Computed here so teardown does not depend on how far setup got.
        all_nodes = get_all_nodes(ctx, config)
        purge_nodes = './ceph-deploy purge' + " " + all_nodes
        purgedata_nodes = './ceph-deploy purgedata' + " " + all_nodes

        log.info('Purging package...')
        execute_ceph_deploy(purge_nodes)
        log.info('Purging data...')
        execute_ceph_deploy(purgedata_nodes)

    try:
        log.info('Building ceph cluster using ceph-deploy...')
        ceph_branch = None
        if config.get('branch') is not None:
            cbranch = config.get('branch')
            for var, val in cbranch.iteritems():
                ceph_branch = '--{var}={val}'.format(var=var, val=val)
        all_nodes = get_all_nodes(ctx, config)
        mds_nodes = get_nodes_using_role(ctx, 'mds')
        mds_nodes = " ".join(mds_nodes)
        mon_node = get_nodes_using_role(ctx, 'mon')
        mon_nodes = " ".join(mon_node)

        # mon_nodes is a joined string, so it is never None; test for empty.
        if not mon_nodes:
            raise RuntimeError("no monitor nodes in the config file")

        new_mon = './ceph-deploy new' + " " + mon_nodes
        mon_hostname = mon_nodes.split(' ')[0]
        mon_hostname = str(mon_hostname)
        gather_keys = './ceph-deploy gatherkeys' + " " + mon_hostname
        deploy_mds = './ceph-deploy mds create' + " " + mds_nodes
        no_of_osds = 0

        estatus_new = execute_ceph_deploy(new_mon)
        if estatus_new != 0:
            raise RuntimeError("ceph-deploy: new command failed")

        log.info('adding config inputs...')
        conf_path = '{tdir}/ceph-deploy/ceph.conf'.format(tdir=testdir)

        # Append every [section] / key = value pair from config['conf'] to
        # the ceph.conf under the ceph-deploy directory.
        if config.get('conf') is not None:
            confp = config.get('conf')
            for section, keys in confp.iteritems():
                lines = '[{section}]\n'.format(section=section)
                teuthology.append_lines_to_file(ceph_admin, conf_path, lines,
                                                sudo=True)
                for key, value in keys.iteritems():
                    log.info("[%s] %s = %s" % (section, key, value))
                    lines = '{key} = {value}\n'.format(key=key, value=value)
                    teuthology.append_lines_to_file(
                        ceph_admin, conf_path, lines, sudo=True)

        # install ceph
        if ceph_branch:
            option = ceph_branch
        else:
            # Only consult the job-level branch when the task config gave
            # none, so a missing ctx.config['branch'] is not fatal when an
            # explicit branch option is available.
            option = '--dev={branch}'.format(branch=ctx.config['branch'])
        install_nodes = './ceph-deploy install ' + option + " " + all_nodes
        estatus_install = execute_ceph_deploy(install_nodes)
        if estatus_install != 0:
            raise RuntimeError("ceph-deploy: Failed to install ceph")
        # install ceph-test package too
        install_nodes2 = './ceph-deploy install --tests ' + option + \
                         " " + all_nodes
        estatus_install = execute_ceph_deploy(install_nodes2)
        if estatus_install != 0:
            raise RuntimeError("ceph-deploy: Failed to install ceph-test")

        mon_create_nodes = './ceph-deploy mon create-initial'
        # A failure here is tolerated (the exit status is deliberately not
        # checked): the monitors may simply be slow to form quorum.
        execute_ceph_deploy(mon_create_nodes)

        # create-keys is explicit now
        # http://tracker.ceph.com/issues/16036
        mons = ctx.cluster.only(teuthology.is_type('mon'))
        for remote in mons.remotes.iterkeys():
            remote.run(args=['sudo', 'ceph-create-keys', '--cluster', 'ceph',
                             '--id', remote.shortname])

        estatus_gather = execute_ceph_deploy(gather_keys)
        if estatus_gather != 0:
            # Not fatal here, but surface it: later steps that need the
            # keys will otherwise fail with a less obvious error.
            log.warning('ceph-deploy: gatherkeys exited with status %s',
                        estatus_gather)

        if mds_nodes:
            estatus_mds = execute_ceph_deploy(deploy_mds)
            if estatus_mds != 0:
                raise RuntimeError("ceph-deploy: Failed to deploy mds")

        if config.get('test_mon_destroy') is not None:
            # Destroy every monitor except the first one.
            for mon_host in mon_node[1:]:
                mon_destroy_nodes = './ceph-deploy mon destroy' + \
                    " " + mon_host
                estatus_mon_d = execute_ceph_deploy(mon_destroy_nodes)
                if estatus_mon_d != 0:
                    raise RuntimeError("ceph-deploy: Failed to delete monitor")

        node_dev_list = get_dev_for_osd(ctx, config)
        for d in node_dev_list:
            # d[0] is used as the node name and every later element as a
            # disk on that node.
            node = d[0]
            for disk in d[1:]:
                zap = './ceph-deploy disk zap ' + node + ':' + disk
                estatus = execute_ceph_deploy(zap)
                if estatus != 0:
                    raise RuntimeError("ceph-deploy: Failed to zap osds")
            osd_create_cmd = './ceph-deploy osd create '
            if config.get('dmcrypt') is not None:
                osd_create_cmd += '--dmcrypt '
            osd_create_cmd += ":".join(d)
            estatus_osd = execute_ceph_deploy(osd_create_cmd)
            if estatus_osd == 0:
                log.info('successfully created osd')
                no_of_osds += 1
            else:
                raise RuntimeError("ceph-deploy: Failed to create osds")

        if config.get('wait-for-healthy', True) and no_of_osds >= 2:
            is_healthy(ctx=ctx, config=None)

            log.info('Setting up client nodes...')
            conf_path = '/etc/ceph/ceph.conf'
            admin_keyring_path = '/etc/ceph/ceph.client.admin.keyring'
            first_mon = teuthology.get_first_mon(ctx, config)
            (mon0_remote,) = ctx.cluster.only(first_mon).remotes.keys()
            conf_data = teuthology.get_file(
                remote=mon0_remote,
                path=conf_path,
                sudo=True,
            )
            admin_keyring = teuthology.get_file(
                remote=mon0_remote,
                path=admin_keyring_path,
                sudo=True,
            )

            # For every client role: create its key on the first mon, then
            # copy that keyring, the admin keyring and ceph.conf to the
            # client's host.
            clients = ctx.cluster.only(teuthology.is_type('client'))
            for remot, roles_for_host in clients.remotes.iteritems():
                for id_ in teuthology.roles_of_type(roles_for_host, 'client'):
                    client_keyring = \
                        '/etc/ceph/ceph.client.{id}.keyring'.format(id=id_)
                    mon0_remote.run(
                        args=[
                            'cd',
                            '{tdir}'.format(tdir=testdir),
                            run.Raw('&&'),
                            'sudo', 'bash', '-c',
                            run.Raw('"'), 'ceph',
                            'auth',
                            'get-or-create',
                            'client.{id}'.format(id=id_),
                            'mds', 'allow',
                            'mon', 'allow *',
                            'osd', 'allow *',
                            run.Raw('>'),
                            client_keyring,
                            run.Raw('"'),
                        ],
                    )
                    key_data = teuthology.get_file(
                        remote=mon0_remote,
                        path=client_keyring,
                        sudo=True,
                    )
                    teuthology.sudo_write_file(
                        remote=remot,
                        path=client_keyring,
                        data=key_data,
                        perms='0644'
                    )
                    teuthology.sudo_write_file(
                        remote=remot,
                        path=admin_keyring_path,
                        data=admin_keyring,
                        perms='0644'
                    )
                    teuthology.sudo_write_file(
                        remote=remot,
                        path=conf_path,
                        data=conf_data,
                        perms='0644'
                    )

            if mds_nodes:
                log.info('Configuring CephFS...')
                ceph_fs = Filesystem(ctx)
                if not ceph_fs.legacy_configured():
                    ceph_fs.create()
        elif not config.get('only_mon'):
            raise RuntimeError(
                "The cluster is NOT operational due to insufficient OSDs")
        yield

    except Exception:
        log.info(
            "Error encountered, logging exception before tearing down ceph-deploy")
        log.info(traceback.format_exc())
        raise
    finally:
        # Do not `return` from this clause: a return inside finally discards
        # the exception being propagated, which would hide setup and task
        # failures whenever keep_running is set.
        if not config.get('keep_running'):
            _teardown()
Exemplo n.º 27
0
def syslog(ctx, config):
    """
    Collect syslog output on every remote while the task runs.

    Setup creates kern.log and misc.log under the archive directory, writes
    an rsyslog config pointing at them and restarts rsyslog. On exit the
    config is removed, the collected logs are scanned for BUG / INFO /
    DEADLOCK lines that are not on the ignore list (a hit marks the run as
    failed), and the logs are gzipped.

    Nothing at all is set up when ctx.archive is None.
    """
    if ctx.archive is None:
        # nothing is going to be archived, so there is nothing to collect
        yield
        return

    log.info('Starting syslog monitoring...')

    archive_dir = misc.get_archive_dir(ctx)
    log_dir = '{adir}/syslog'.format(adir=archive_dir)
    mkdir_procs = ctx.cluster.run(
        args=['mkdir', '-p', '-m0755', '--', log_dir],
        wait=False,
    )
    run.wait(mkdir_procs)

    rsyslog_conf = '/etc/rsyslog.d/80-cephtest.conf'
    kern_log = '{log_dir}/kern.log'.format(log_dir=log_dir)
    misc_log = '{log_dir}/misc.log'.format(log_dir=log_dir)
    kern_rule = 'kern.* -{kern_log};RSYSLOG_FileFormat'.format(
        kern_log=kern_log)
    misc_rule = '*.*;kern.none -{misc_log};RSYSLOG_FileFormat'.format(
        misc_log=misc_log)
    conf_fp = StringIO('\n'.join([kern_rule, misc_rule]))
    file_context = 'system_u:object_r:var_log_t:s0'

    try:
        for rem in ctx.cluster.remotes.iterkeys():
            for log_path in (kern_log, misc_log):
                # create the file empty and world-writable, then label it
                rem.run(args=['install', '-m', '666', '/dev/null', log_path])
                rem.chcon(log_path, file_context)
            misc.sudo_write_file(remote=rem, path=rsyslog_conf, data=conf_fp)
            # rewind so the next remote is given the whole config again
            conf_fp.seek(0)

        # a mere reload (SIGHUP) doesn't seem to make rsyslog open the
        # files, hence a full restart
        restart_procs = ctx.cluster.run(
            args=['sudo', 'service', 'rsyslog', 'restart'],
            wait=False,
        )
        run.wait(restart_procs)

        yield
    finally:
        log.info('Shutting down syslog monitoring...')

        stop_procs = ctx.cluster.run(
            args=[
                'sudo', 'rm', '-f', '--', rsyslog_conf,
                run.Raw('&&'),
                'sudo', 'service', 'rsyslog', 'restart',
            ],
            wait=False,
        )
        run.wait(stop_procs)
        # race condition: nothing actually says rsyslog had time to
        # flush the file fully. oh well.

        # Lines matching any of these are dropped from the scan, in order.
        ignored_patterns = (
            'task .* blocked for more than .* seconds',
            'lockdep is turned off',
            'trying to register non-static key',
            'DEBUG: fsize',  # xfs_fsr
            'CRON',  # ignore cron noise
            'BUG: bad unlock balance detected',  # #6097
            'inconsistent lock state',  # FIXME see #2523
            '*** DEADLOCK ***',  # part of lockdep output
            # FIXME see #2590 and #147
            'INFO: possible irq lock inversion dependency detected',
            'INFO: NMI handler (perf_event_nmi_handler) took too long to run',  # noqa
            'INFO: recovery required on readonly',
            'ceph-create-keys: INFO',
            'INFO:ceph-create-keys',
            'Loaded datasource DataSourceOpenStack',
        )

        log.info('Checking logs for errors...')
        for rem in ctx.cluster.remotes.iterkeys():
            log.debug('Checking %s', rem.name)

            # egrep for suspicious lines, filter out the known-harmless
            # ones, and keep only the first survivor
            scan_cmd = [
                'egrep',
                '--binary-files=text',
                '\\bBUG\\b|\\bINFO\\b|\\bDEADLOCK\\b',
                run.Raw('{adir}/syslog/*.log'.format(adir=archive_dir)),
            ]
            for pattern in ignored_patterns:
                scan_cmd.extend([run.Raw('|'), 'grep', '-v', pattern])
            scan_cmd.extend([run.Raw('|'), 'head', '-n', '1'])

            r = rem.run(args=scan_cmd, stdout=StringIO())
            stdout = r.stdout.getvalue()
            if stdout == '':
                continue

            log.error('Error in syslog on %s: %s', rem.name, stdout)
            set_status(ctx.summary, 'fail')
            if 'failure_reason' not in ctx.summary:
                ctx.summary['failure_reason'] = \
                    "'{error}' in syslog".format(error=stdout)

        log.info('Compressing syslogs...')
        gzip_procs = ctx.cluster.run(
            args=[
                'find',
                '{adir}/syslog'.format(adir=archive_dir),
                '-name',
                '*.log',
                '-print0',
                run.Raw('|'),
                'sudo',
                'xargs',
                '-0',
                '--no-run-if-empty',
                '--',
                'gzip',
                '--',
            ],
            wait=False,
        )
        run.wait(gzip_procs)
Exemplo n.º 28
0
def task(ctx, config):
    """
    Create the specified number of pools and write 16 objects to them (thereby forcing
    the PG creation on each OSD). This task creates pools from all the clients,
    in parallel. It is easy to add other daemon types which have the appropriate
    permissions, but I don't think anything else does.
    The config is just the number of pools to create. I recommend setting
    "mon create pg interval" to a very low value in your ceph config to speed
    this up.

    You probably want to do this to look at memory consumption, and
    maybe to test how performance changes with the number of PGs. For example:

    tasks:
    - ceph:
        config:
          mon:
            mon create pg interval: 1
    - manypools: 3000
    - radosbench:
        clients: [client.0]
        time: 360

    :param ctx: Run context; ``ctx.cluster`` is used to find the client remotes.
    :param config: Number of pools to create (anything ``int()`` accepts).
    :raises RuntimeError: if pools were requested but no client role exists.
    """

    log.info('creating {n} pools'.format(n=config))

    # Keep the requested total separate from the per-pool counter so the
    # final summary reports the number of pools actually requested.
    total_pools = int(config)
    creator_remotes = []
    client_roles = teuthology.all_roles_of_type(ctx.cluster, 'client')
    log.info(
        'got client_roles={client_roles_}'.format(client_roles_=client_roles))
    for role in client_roles:
        log.info('role={role_}'.format(role_=role))
        role_name = 'client.{id}'.format(id=role)
        # keys() works on both Python 2 and 3 (iterkeys() is Python 2 only)
        (creator_remote, ) = ctx.cluster.only(role_name).remotes.keys()
        creator_remotes.append((creator_remote, role_name))

    if total_pools > 0 and not creator_remotes:
        # Without any client the creation loop below could never make
        # progress and would spin forever.
        raise RuntimeError('manypools needs at least one client role')

    remaining_pools = total_pools
    while remaining_pools > 0:
        log.info('{n} pools remaining to create'.format(n=remaining_pools))
        # One batch per round: at most one in-flight creation per client.
        poolprocs = dict()
        for remote, role_ in creator_remotes:
            if remaining_pools <= 0:
                break
            # Pools are named from pool<N> down to pool1
            poolnum = remaining_pools
            remaining_pools -= 1
            log.info('creating pool{num} on {role}'.format(num=poolnum,
                                                           role=role_))
            proc = remote.run(args=[
                'rados', '--name', role_, 'mkpool',
                'pool{num}'.format(num=poolnum), '-1',
                run.Raw('&&'), 'rados', '--name', role_, '--pool',
                'pool{num}'.format(num=poolnum), 'bench', '0', 'write', '-t',
                '16', '--block-size', '1'
            ],
                              wait=False)
            log.info('waiting for pool and object creates')
            poolprocs[remote] = proc

        run.wait(poolprocs.values())

    log.info(
        'created all {n} pools and wrote 16 objects to each'.format(
            n=total_pools))
Exemplo n.º 29
0
    def test_object_deletion(self):
        """
        That the MDS has a clean 'damaged' response to loss of any single metadata object

        The metadata pool is exported once after a simple write workload, and
        a list of mutations (object removal, body corruption, truncation,
        OMAP value corruption and OMAP header corruption) is built from the
        objects found in it.  For each mutation the pool is re-imported, the
        mutation is applied, the MDS is restarted and the observed outcome is
        recorded.

        :raises RuntimeError: if any mutation's outcome differs from its
                              expectation.
        """

        self._simple_workload_write()

        # Hmm, actually it would be nice to permute whether the metadata pool
        # state contains sessions or not, but for the moment close this session
        # to avoid waiting through reconnect on every MDS start.
        self.mount_a.umount_wait()
        for mds_name in self.fs.get_active_names():
            self.fs.mds_asok(["flush", "journal"], mds_name)

        self.fs.mds_stop()
        self.fs.mds_fail()

        # Snapshot of the pristine pool, re-imported before every mutation
        self.fs.rados(['export', '/tmp/metadata.bin'])

        def is_ignored(obj_id, dentry=None):
            """
            A filter to avoid redundantly mutating many similar objects (e.g.
            stray dirfrags) or similar dentries (e.g. stray dir dentries)
            """
            if re.match(r"60.\.00000000", obj_id) and obj_id != "600.00000000":
                return True

            if dentry and obj_id == "100.00000000":
                if re.match("stray.+_head",
                            dentry) and dentry != "stray0_head":
                    return True

            return False

        def get_path(obj_id, dentry=None):
            """
            What filesystem path does this object or dentry correspond to?   i.e.
            what should I poke to see EIO after damaging it?
            """

            if obj_id == "1.00000000" and dentry == "subdir_head":
                return "./subdir"
            elif obj_id == "10000000000.00000000" and dentry == "sixmegs_head":
                return "./subdir/sixmegs"

            # None means ls will do an "ls -R" in hope of seeing some errors
            return None

        objects = self.fs.rados(["ls"]).split("\n")
        objects = [o for o in objects if not is_ignored(o)]

        # Find all objects with an OMAP header
        omap_header_objs = []
        for o in objects:
            header = self.fs.rados(["getomapheader", o])
            # The rados CLI wraps the header output in a hex-printed style
            header_bytes = int(
                re.match(r"header \((.+) bytes\)", header).group(1))
            if header_bytes > 0:
                omap_header_objs.append(o)

        # Find all OMAP key/vals
        omap_keys = []
        for o in objects:
            keys_str = self.fs.rados(["listomapkeys", o])
            if keys_str:
                for key in keys_str.split("\n"):
                    if not is_ignored(o, key):
                        omap_keys.append((o, key))

        # Find objects that have data in their bodies
        data_objects = []
        for obj_id in objects:
            stat_out = self.fs.rados(["stat", obj_id])
            size = int(re.match(".+, size (.+)$", stat_out).group(1))
            if size > 0:
                data_objects.append(obj_id)

        # Define the various forms of damage we will inflict
        class MetadataMutation(object):
            def __init__(self,
                         obj_id_,
                         desc_,
                         mutate_fn_,
                         expectation_,
                         ls_path=None):
                self.obj_id = obj_id_
                self.desc = desc_
                self.mutate_fn = mutate_fn_
                self.expectation = expectation_
                if ls_path is None:
                    self.ls_path = "."
                else:
                    self.ls_path = ls_path

            # Mutations are identified by their description, which is what
            # keys the results dict below.
            def __eq__(self, other):
                return self.desc == other.desc

            def __hash__(self):
                return hash(self.desc)

        junk = "deadbeef" * 10
        mutations = []

        # Removals
        for obj_id in objects:
            if obj_id in [
                    # JournalPointers are auto-replaced if missing (same path as upgrade)
                    "400.00000000",
                    # Missing dirfrags for non-system dirs result in empty directory
                    "10000000000.00000000",
                    # PurgeQueue is auto-created if not found on startup
                    "500.00000000",
                    # open file table is auto-created if not found on startup
                    "mds0_openfiles.0"
            ]:
                expectation = NO_DAMAGE
            else:
                expectation = DAMAGED_ON_START

            log.info("Expectation on rm '{0}' will be '{1}'".format(
                obj_id, expectation))

            mutations.append(
                MetadataMutation(obj_id,
                                 "Delete {0}".format(obj_id),
                                 lambda o=obj_id: self.fs.rados(["rm", o]),
                                 expectation))

        # Blatant corruptions
        mutations.extend([
            MetadataMutation(
                o,
                "Corrupt {0}".format(o),
                lambda o=o: self.fs.rados(["put", o, "-"], stdin_data=junk),
                DAMAGED_ON_START) for o in data_objects
        ])

        # Truncations
        for obj_id in data_objects:
            if obj_id == "500.00000000":
                # The PurgeQueue is allowed to be empty: Journaler interprets
                # an empty header object as an empty journal.
                expectation = NO_DAMAGE
            else:
                expectation = DAMAGED_ON_START

            # Build the mutation from this loop's object and the expectation
            # computed above (binding obj_id as a default avoids the
            # late-binding closure pitfall).
            mutations.append(
                MetadataMutation(
                    obj_id,
                    "Truncate {0}".format(obj_id),
                    lambda o=obj_id: self.fs.rados(["truncate", o, "0"]),
                    expectation))

        # OMAP value corruptions
        for o, k in omap_keys:
            if o.startswith("100."):
                # Anything in rank 0's 'mydir'
                expectation = DAMAGED_ON_START
            else:
                expectation = EIO_ON_LS

            mutations.append(
                MetadataMutation(
                    o,
                    "Corrupt omap key {0}:{1}".format(o, k),
                    lambda o=o, k=k: self.fs.rados(["setomapval", o, k, junk]),
                    expectation,
                    get_path(o, k)))

        # OMAP header corruptions
        for obj_id in omap_header_objs:
            if re.match(r"60.\.00000000", obj_id) \
                    or obj_id in ["1.00000000", "100.00000000", "mds0_sessionmap"]:
                expectation = DAMAGED_ON_START
            else:
                expectation = NO_DAMAGE

            log.info(
                "Expectation on corrupt header '{0}' will be '{1}'".format(
                    obj_id, expectation))

            mutations.append(
                MetadataMutation(
                    obj_id,
                    "Corrupt omap header on {0}".format(obj_id),
                    lambda o=obj_id: self.fs.rados(["setomapheader", o, junk]),
                    expectation))

        # mutation -> observed outcome
        results = {}

        for mutation in mutations:
            log.info("Applying mutation '{0}'".format(mutation.desc))

            # Reset MDS state
            self.mount_a.umount_wait(force=True)
            self.fs.mds_stop()
            self.fs.mds_fail()
            self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired', '0')

            # Reset RADOS pool state
            self.fs.rados(['import', '/tmp/metadata.bin'])

            # Inject the mutation
            mutation.mutate_fn()

            # Try starting the MDS
            self.fs.mds_restart()

            # How long we'll wait between starting a daemon and expecting
            # it to make it through startup, and potentially declare itself
            # damaged to the mon cluster.
            startup_timeout = 60

            if mutation.expectation not in (EIO_ON_LS, DAMAGED_ON_LS,
                                            NO_DAMAGE):
                if mutation.expectation == DAMAGED_ON_START:
                    # The MDS may pass through active before making it to damaged
                    try:
                        self.wait_until_true(lambda: self.is_marked_damaged(0),
                                             startup_timeout)
                    except RuntimeError:
                        pass

                # Wait for MDS to either come up or go into damaged state
                try:
                    self.wait_until_true(
                        lambda: self.is_marked_damaged(0) or self.fs.
                        are_daemons_healthy(), startup_timeout)
                except RuntimeError:
                    crashed = False
                    # Didn't make it to healthy or damaged, did it crash?
                    for daemon_id, daemon in self.fs.mds_daemons.items():
                        if daemon.proc and daemon.proc.finished:
                            crashed = True
                            log.error("Daemon {0} crashed!".format(daemon_id))
                            daemon.proc = None  # So that subsequent stop() doesn't raise error
                    if not crashed:
                        # Didn't go health, didn't go damaged, didn't crash, so what?
                        raise
                    else:
                        log.info("Result: Mutation '{0}' led to crash".format(
                            mutation.desc))
                        results[mutation] = CRASHED
                        continue
                if self.is_marked_damaged(0):
                    log.info(
                        "Result: Mutation '{0}' led to DAMAGED state".format(
                            mutation.desc))
                    results[mutation] = DAMAGED_ON_START
                    continue
                else:
                    log.info(
                        "Mutation '{0}' did not prevent MDS startup, attempting ls..."
                        .format(mutation.desc))
            else:
                try:
                    self.wait_until_true(self.fs.are_daemons_healthy, 60)
                except RuntimeError:
                    log.info(
                        "Result: Mutation '{0}' should have left us healthy, actually not."
                        .format(mutation.desc))
                    if self.is_marked_damaged(0):
                        results[mutation] = DAMAGED_ON_START
                    else:
                        results[mutation] = FAILED_SERVER
                    continue
                log.info(
                    "Daemons came up after mutation '{0}', proceeding to ls".
                    format(mutation.desc))

            # MDS is up, should go damaged on ls or client mount
            self.mount_a.mount()
            self.mount_a.wait_until_mounted()
            if mutation.ls_path == ".":
                proc = self.mount_a.run_shell(["ls", "-R", mutation.ls_path],
                                              wait=False)
            else:
                proc = self.mount_a.stat(mutation.ls_path, wait=False)

            if mutation.expectation == DAMAGED_ON_LS:
                try:
                    self.wait_until_true(lambda: self.is_marked_damaged(0), 60)
                    log.info(
                        "Result: Mutation '{0}' led to DAMAGED state after ls".
                        format(mutation.desc))
                    results[mutation] = DAMAGED_ON_LS
                except RuntimeError:
                    if self.fs.are_daemons_healthy():
                        log.error(
                            "Result: Failed to go damaged on mutation '{0}', actually went active"
                            .format(mutation.desc))
                        results[mutation] = NO_DAMAGE
                    else:
                        log.error(
                            "Result: Failed to go damaged on mutation '{0}'".
                            format(mutation.desc))
                        results[mutation] = FAILED_SERVER

            else:
                try:
                    wait([proc], 20)
                    log.info(
                        "Result: Mutation '{0}' did not caused DAMAGED state".
                        format(mutation.desc))
                    results[mutation] = NO_DAMAGE
                except MaxWhileTries:
                    log.info(
                        "Result: Failed to complete client IO on mutation '{0}'"
                        .format(mutation.desc))
                    results[mutation] = FAILED_CLIENT
                except CommandFailedError as e:
                    if e.exitstatus == errno.EIO:
                        log.info("Result: EIO on client")
                        results[mutation] = EIO_ON_LS
                    else:
                        log.info(
                            "Result: unexpected error {0} on client".format(e))
                        results[mutation] = FAILED_CLIENT

            if mutation.expectation == EIO_ON_LS:
                # EIOs mean something handled by DamageTable: assert that it has
                # been populated
                damage = json.loads(
                    self.fs.mon_manager.raw_cluster_cmd(
                        'tell',
                        'mds.{0}'.format(self.fs.get_active_names()[0]),
                        "damage", "ls", '--format=json-pretty'))
                if len(damage) == 0:
                    results[mutation] = EIO_NO_DAMAGE

        # Compare every recorded outcome against what was expected
        failures = [(mutation, result)
                    for (mutation, result) in results.items()
                    if mutation.expectation != result]
        if failures:
            log.error("{0} mutations had unexpected outcomes:".format(
                len(failures)))
            for mutation, result in failures:
                log.error("  Expected '{0}' actually '{1}' from '{2}'".format(
                    mutation.expectation, result, mutation.desc))
            raise RuntimeError("{0} mutations had unexpected outcomes".format(
                len(failures)))
        else:
            log.info("All {0} mutations had expected outcomes".format(
                len(mutations)))
Exemplo n.º 30
0
def task(ctx, config):
    """
    Run watch_notify_same_primary

    The config should be as follows:

    watch_notify_same_primary:
        clients: [client list]

    The client list should contain 1 client

    The test requires 3 osds.

    example:

    tasks:
    - ceph:
    - watch_notify_same_primary:
        clients: [client.0]
    - interactive:

    Twenty objects are created and watched, every object is notified once,
    osd.0 is killed and marked down, and every object is notified again.
    On teardown each watcher's output must contain both notifications.
    """
    log.info('Beginning watch_notify_same_primary...')
    assert isinstance(config, dict), \
        "please list clients to run on"

    client_list = config.get('clients', ['client.0'])
    assert len(client_list) == 1
    role = client_list[0]
    assert isinstance(role, basestring)
    assert role.startswith('client.')
    (remote,) = ctx.cluster.only(role).remotes.keys()
    manager = ctx.managers['ceph']
    manager.raw_cluster_cmd('osd', 'set', 'noout')

    pool = manager.create_pool_with_unique_name()

    def object_name(index):
        """Name of the index-th test object."""
        return "foo-{num}".format(num=index)

    def launch_watcher(index):
        """Create the index-th object, then start a background watch on it."""
        name = object_name(index)
        remote.run(
            args=["rados", "-p", pool, "put", name, "/etc/resolv.conf"],
            logger=log.getChild('watch.{id}'.format(id=index)))
        return remote.run(
            args=["rados", "-p", pool, "watch", name],
            stdin=run.PIPE,
            stdout=StringIO(),
            stderr=StringIO(),
            wait=False)

    def send_notify(index, msg):
        """Send msg as a notification on the index-th object."""
        remote.run(
            args=["rados", "-p", pool, "notify", object_name(index), msg],
            logger=log.getChild('notify.{id}'.format(id=index)))

    watch_count = 20
    watches = [launch_watcher(index) for index in range(watch_count)]

    # wait for them all to register
    for index in range(watch_count):
        with safe_while() as proceed:
            while proceed():
                listing = remote.run(
                    args=["rados", "-p", pool, "listwatchers",
                          object_name(index)],
                    stdout=StringIO())
                watcher_total = listing.stdout.getvalue().count('watcher=')
                log.info('i see %d watchers for %s', watcher_total,
                         object_name(index))
                if watcher_total >= 1:
                    break

    for index in range(len(watches)):
        send_notify(index, 'notify1')

    manager.kill_osd(0)
    manager.mark_down_osd(0)

    for index in range(len(watches)):
        send_notify(index, 'notify2')

    try:
        yield
    finally:
        log.info('joining watch_notify_stress')
        # A newline on stdin is written to each watcher before waiting on them
        for watcher in watches:
            watcher.stdin.write("\n")

        run.wait(watches)

        for watcher in watches:
            output_lines = watcher.stdout.getvalue().split("\n")
            saw_first = any('notify1' in line for line in output_lines)
            saw_second = any('notify2' in line for line in output_lines)
            log.info(output_lines)
            assert saw_first and saw_second

        manager.revive_osd(0)
        manager.remove_pool(pool)
Exemplo n.º 31
0
def kubeadm_init_join(ctx, config):
    """
    Run ``kubeadm init`` on the first host and join the remaining hosts.

    Every remote carrying a ``host.*`` role takes part; the first one found
    becomes the bootstrap node.  The bootstrap remote, the remote -> ip map
    and a freshly generated token are stored on ``ctx.kubeadm[cluster]``.
    After the caller's body runs (or fails), ``kubeadm reset`` is run on all
    remotes of the cluster.

    :raises RuntimeError: if no remote has a ``host.*`` role.
    """
    cluster_name = config['cluster']

    first_host = None
    host_ips = {}  # remote -> ip
    for remote, roles in ctx.cluster.remotes.items():
        if not any(role.startswith('host.') for role in roles):
            continue
        if not first_host:
            first_host = remote
        if remote not in host_ips:
            peer = remote.ssh.get_transport().getpeername()
            host_ips[remote] = peer[0]
    if not first_host:
        raise RuntimeError('must define at least one host.something role')

    state = ctx.kubeadm[cluster_name]
    state.bootstrap_remote = first_host
    state.remotes = host_ips
    alphabet = '0123456789abcdefghijklmnopqrstuvwxyz'
    state.token = 'abcdef.' + ''.join(
        [random.choice(alphabet) for _ in range(16)])
    log.info(f'Token: {state.token}')
    log.info(f'Remotes: {state.remotes}')

    try:
        # init
        init_cmd = [
            'sudo', 'kubeadm', 'init',
            '--node-name', state.bootstrap_remote.shortname,
            '--token', state.token,
            '--pod-network-cidr', str(state.pod_subnet),
        ]
        first_host.run(args=init_cmd)

        # join additional nodes
        pending_joins = []
        for remote in state.remotes:
            if remote == first_host:
                continue
            join_cmd = [
                'sudo', 'kubeadm', 'join',
                state.remotes[state.bootstrap_remote] + ':6443',
                '--node-name', remote.shortname,
                '--token', state.token,
                '--discovery-token-unsafe-skip-ca-verification',
            ]
            pending_joins.append(remote.run(args=join_cmd, wait=False))
        run.wait(pending_joins)
        yield

    except Exception as e:
        log.exception(e)
        raise

    finally:
        log.info('Cleaning up node')
        reset_procs = ctx.cluster.run(
            args=['sudo', 'kubeadm', 'reset', 'cleanup-node', '-f'],
            wait=False,
        )
        run.wait(reset_procs)
Exemplo n.º 32
0
def task(ctx, config):
    """
    Test handling of lost objects.

    A pretty rigid cluster is brought up and tested by this task

    Objects are written while OSDs are selectively killed so that the only
    copy of some objects ends up on osd.1, which is then marked lost.  The
    resulting unfound objects are marked lost with 'delete', and the task
    checks that reading any of the test objects afterwards fails.
    """
    POOL = 'unfounddel_pool'
    config = {} if config is None else config
    assert isinstance(config, dict), \
        'lost_unfound task only accepts a dict for configuration'
    (mon, ) = ctx.cluster.only(
        teuthology.get_first_mon(ctx, config)).remotes.keys()

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
    )

    def pool_op(*op_args):
        """Run a rados command against the test pool and return its result."""
        return rados(ctx, mon, ['-p', POOL] + list(op_args))

    while len(manager.get_osd_status()['up']) < 3:
        time.sleep(10)
    manager.flush_pg_stats([0, 1, 2])
    manager.wait_for_clean()

    manager.create_pool(POOL)

    # something that is always there
    dummyfile = '/etc/fstab'

    # take an osd out until the very end
    manager.kill_osd(2)
    manager.mark_down_osd(2)
    manager.mark_out_osd(2)

    # kludge to make sure they get a map
    pool_op('put', 'dummy', dummyfile)

    manager.flush_pg_stats([0, 1])
    manager.wait_for_recovery()

    # create old objects
    for idx in range(1, 10):
        pool_op('put', 'existing_%d' % idx, dummyfile)
        pool_op('put', 'existed_%d' % idx, dummyfile)
        pool_op('rm', 'existed_%d' % idx)

    # delay recovery, and make the pg log very long (to prevent backfill)
    manager.raw_cluster_cmd(
        'tell', 'osd.1', 'injectargs',
        '--osd-recovery-delay-start 1000 --osd-min-pg-log-entries 100000000')

    manager.kill_osd(0)
    manager.mark_down_osd(0)

    for idx in range(1, 10):
        for template in ('new_%d', 'existed_%d', 'existing_%d'):
            pool_op('put', template % idx, dummyfile)

    # bring osd.0 back up, let it peer, but don't replicate the new
    # objects...
    log.info('osd.0 command_args is %s' % 'foo')
    log.info(ctx.daemons.get_daemon('osd', 0).command_args)
    ctx.daemons.get_daemon('osd', 0).command_kwargs['args'].extend(
        ['--osd-recovery-delay-start', '1000'])
    manager.revive_osd(0)
    manager.mark_in_osd(0)
    manager.wait_till_osd_is_up(0)

    manager.flush_pg_stats([0, 1])
    manager.wait_till_active()

    # take out osd.1 and the only copy of those objects.
    manager.kill_osd(1)
    manager.mark_down_osd(1)
    manager.mark_out_osd(1)
    manager.raw_cluster_cmd('osd', 'lost', '1', '--yes-i-really-mean-it')

    # bring up osd.2 so that things would otherwise, in theory, recovery fully
    manager.revive_osd(2)
    manager.mark_in_osd(2)
    manager.wait_till_osd_is_up(2)

    manager.flush_pg_stats([0, 2])
    manager.wait_till_active()
    manager.flush_pg_stats([0, 2])

    # verify that there are unfound objects
    unfound = manager.get_num_unfound_objects()
    log.info("there are %d unfound objects" % unfound)
    assert unfound

    testdir = teuthology.get_testdir(ctx)
    procs = []
    if config.get('parallel_bench', True):
        # Background write benchmark running while objects are marked lost
        bench_words = [
            'adjust-ulimits',
            'ceph-coverage',
            '{tdir}/archive/coverage',
            'rados',
            '--no-log-to-stderr',
            '--name',
            'client.admin',
            '-b',
            str(4 << 10),
            '-p',
            POOL,
            '-t',
            '20',
            'bench',
            '240',
            'write',
        ]
        bench_proc = mon.run(
            args=[
                "/bin/sh",
                "-c",
                " ".join(bench_words).format(tdir=testdir),
            ],
            logger=log.getChild('radosbench.{id}'.format(id='client.admin')),
            stdin=run.PIPE,
            wait=False)
        procs.append(bench_proc)
    time.sleep(10)

    # mark stuff lost
    for pg in manager.get_pg_stats():
        pg_unfound = pg['stat_sum']['num_objects_unfound']
        if pg_unfound <= 0:
            log.info("no unfound in %s", pg['pgid'])
            continue

        primary = 'osd.%d' % pg['acting'][0]

        # verify that i can list them direct from the osd
        log.info('listing missing/lost in %s state %s', pg['pgid'],
                 pg['state'])
        listing = manager.list_pg_unfound(pg['pgid'])
        assert listing['num_unfound'] == pg_unfound
        # objects with no known location are the unfound ones
        without_location = sum(
            1 for entry in listing['objects']
            if len(entry['locations']) == 0)
        assert listing['num_unfound'] == without_location

        log.info("reverting unfound in %s on %s", pg['pgid'], primary)
        manager.raw_cluster_cmd('pg', pg['pgid'], 'mark_unfound_lost',
                                'delete')

    for osd in ('osd.0', 'osd.2'):
        manager.raw_cluster_cmd('tell', osd, 'debug', 'kick_recovery_wq', '5')
    manager.flush_pg_stats([0, 2])
    manager.wait_for_recovery()

    # verify result
    for idx in range(1, 10):
        for template in ('new_%d', 'existed_%d', 'existing_%d'):
            err = pool_op('get', template % idx, '-')
            assert err

    # see if osd.1 can cope
    manager.mark_in_osd(1)
    manager.revive_osd(1)
    manager.wait_till_osd_is_up(1)
    manager.wait_for_clean()
    run.wait(procs)
Exemplo n.º 33
0
def ceph_log(ctx, config):
    """
    Check the cluster log for problems on teardown, then compress and
    archive the logs.

    After the caller's body has run, the cluster log on the bootstrap remote
    is scanned for ``[ERR]``, ``[WRN]`` and ``[SEC]`` entries, ignoring any
    pattern listed under the optional ``log-whitelist`` config key.  A hit
    marks the run as failed and, if no failure reason is recorded yet,
    stores the most severe matching line as the reason.

    If ``ctx.archive`` is set (and the run is not a success under
    ``archive-on-error``), all ``*.log`` files under ``/var/log/ceph`` are
    gzipped and the directory is pulled from every remote into the archive.

    :param ctx: Run context.
    :param config: Task config; must contain ``cluster``.
    """
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    try:
        yield

    except Exception:
        # we need to know this below
        ctx.summary['success'] = False
        raise

    finally:
        log.info('Checking cluster log for badness...')

        def first_in_ceph_log(pattern, excludes):
            """
            Find the first occurrence of the pattern specified in the Ceph log,
            Returns None if none found.

            :param pattern: Pattern scanned for.
            :param excludes: Patterns to ignore.
            :return: First line of text (or None if not found)
            """
            args = [
                'sudo',
                'egrep',
                pattern,
                '/var/log/ceph/{fsid}/ceph.log'.format(fsid=fsid),
            ]
            for exclude in excludes or []:
                args.extend([run.Raw('|'), 'egrep', '-v', exclude])
            args.extend([
                run.Raw('|'),
                'head',
                '-n',
                '1',
            ])
            r = ctx.ceph[cluster_name].bootstrap_remote.run(
                stdout=StringIO(),
                args=args,
            )
            stdout = r.stdout.getvalue()
            if stdout != '':
                return stdout
            return None

        # The whitelist is optional: look it up once with .get() so that a
        # missing key cannot raise KeyError while reporting a failure.
        whitelist = config.get('log-whitelist')

        if first_in_ceph_log(r'\[ERR\]|\[WRN\]|\[SEC\]',
                             whitelist) is not None:
            log.warning('Found errors (ERR|WRN|SEC) in cluster log')
            ctx.summary['success'] = False
            # use the most severe problem as the failure reason
            if 'failure_reason' not in ctx.summary:
                for pattern in [r'\[SEC\]', r'\[ERR\]', r'\[WRN\]']:
                    match = first_in_ceph_log(pattern, whitelist)
                    if match is not None:
                        ctx.summary['failure_reason'] = \
                            '"{match}" in cluster log'.format(
                                match=match.rstrip('\n'),
                            )
                        break

        if ctx.archive is not None and \
                not (ctx.config.get('archive-on-error') and ctx.summary['success']):
            # and logs
            log.info('Compressing logs...')
            run.wait(
                ctx.cluster.run(
                    args=[
                        'sudo',
                        'find',
                        '/var/log/ceph',  # all logs, not just for the cluster
                        '-name',
                        '*.log',
                        '-print0',
                        run.Raw('|'),
                        'sudo',
                        'xargs',
                        '-0',
                        '--no-run-if-empty',
                        '--',
                        'gzip',
                        '--',
                    ],
                    wait=False,
                ), )

            log.info('Archiving logs...')
            path = os.path.join(ctx.archive, 'remote')
            # Best effort: the directory may already exist
            try:
                os.makedirs(path)
            except OSError:
                pass
            for remote in ctx.cluster.remotes.keys():
                sub = os.path.join(path, remote.name)
                try:
                    os.makedirs(sub)
                except OSError:
                    pass
                teuthology.pull_directory(
                    remote,
                    '/var/log/ceph',  # everything
                    os.path.join(sub, 'log'))
Exemplo n.º 34
0
def coredump(ctx, config):
    """
    Collect core dumps under the archive directory while the task is active.

    Before yielding, every remote gets an ``<archive>/coredump`` directory
    and ``kernel.core_pattern`` is pointed at it (applied with ``sysctl -w``
    and appended to /etc/sysctl.conf).  After the yield the pattern is reset
    to ``core`` and the directory is removed if it is empty; any remote on
    which the directory still exists gets the run marked as failed.
    """
    log.info('Enabling coredump saving...')
    archive_dir = misc.get_archive_dir(ctx)
    coredump_dir = '{adir}/coredump'.format(adir=archive_dir)
    core_pattern = 'kernel.core_pattern={adir}/coredump/%t.%p.core'.format(
        adir=archive_dir)

    enable_cmd = [
        'install', '-d', '-m0755', '--', coredump_dir,
        run.Raw('&&'),
        'sudo', 'sysctl', '-w', core_pattern,
        run.Raw('&&'),
        'echo', core_pattern,
        run.Raw('|'),
        'sudo', 'tee', '-a', '/etc/sysctl.conf',
    ]
    run.wait(ctx.cluster.run(args=enable_cmd, wait=False))

    try:
        yield
    finally:
        disable_cmd = [
            'sudo', 'sysctl', '-w', 'kernel.core_pattern=core',
            run.Raw('&&'),
            # don't litter the archive dir if there were no cores dumped
            'rmdir', '--ignore-fail-on-non-empty', '--', coredump_dir,
        ]
        run.wait(ctx.cluster.run(args=disable_cmd, wait=False))

        # A coredump dir that survived the rmdir above is non-empty, i.e.
        # cores were written on that host: record the run as failed.
        for rem in ctx.cluster.remotes.keys():
            try:
                rem.sh("test -e " + coredump_dir)
            except run.CommandFailedError:
                # directory is gone -> nothing was dumped on this host
                pass
            else:
                log.warning('Found coredumps on %s, flagging run as failed',
                            rem)
                set_status(ctx.summary, 'fail')
                if 'failure_reason' not in ctx.summary:
                    reason = 'Found coredumps on {rem}'.format(rem=rem)
                    ctx.summary['failure_reason'] = reason
Exemplo n.º 35
0
def ceph_log(ctx, config):
    """
    Create /var/log/ceph log directory that is open to everyone.
    Add valgrind and profiling-logger directories.

    After the yield, when ``ctx.archive`` is set (and the run is not a
    success with ``archive-on-error`` enabled), the ``*.log`` files under
    /var/log/ceph are gzipped on every remote and the whole directory is
    pulled into ``<ctx.archive>/remote/<shortname>/log``.

    :param ctx: Context
    :param config: Configuration
    """

    def ensure_dir(path):
        # Create a local directory, tolerating one that already exists.
        # A bare os.makedirs() raises when the directory is present, and
        # raising from the ``finally`` below would hide the real task error.
        try:
            os.makedirs(path)
        except OSError:
            if not os.path.isdir(path):
                raise

    log.info('Making ceph log dir writeable by non-root...')
    run.wait(
        ctx.cluster.run(
            args=[
                'sudo',
                'chmod',
                '777',
                '/var/log/ceph',
            ],
            wait=False,
        ))
    log.info('Disabling ceph logrotate...')
    run.wait(
        ctx.cluster.run(
            args=[
                'sudo',
                'rm',
                '-f',
                '--',
                '/etc/logrotate.d/ceph',
            ],
            wait=False,
        ))
    log.info('Creating extra log directories...')
    run.wait(
        ctx.cluster.run(
            args=[
                'sudo',
                'install',
                '-d',
                '-m0755',
                '--',
                '/var/log/ceph/valgrind',
                '/var/log/ceph/profiling-logger',
            ],
            wait=False,
        ))

    try:
        yield

    finally:
        if ctx.archive is not None and \
                not (ctx.config.get('archive-on-error') and ctx.summary['success']):
            # and logs
            log.info('Compressing logs...')
            run.wait(
                ctx.cluster.run(
                    args=[
                        'sudo',
                        'find',
                        '/var/log/ceph',
                        '-name',
                        '*.log',
                        '-print0',
                        run.Raw('|'),
                        'sudo',
                        'xargs',
                        '-0',
                        '--no-run-if-empty',
                        '--',
                        'gzip',
                        '--',
                    ],
                    wait=False,
                ), )

            log.info('Archiving logs...')
            path = os.path.join(ctx.archive, 'remote')
            ensure_dir(path)
            # keys() rather than iterkeys(): works on Python 2 and 3 alike
            for remote in ctx.cluster.remotes.keys():
                sub = os.path.join(path, remote.shortname)
                ensure_dir(sub)
                teuthology.pull_directory(remote, '/var/log/ceph',
                                          os.path.join(sub, 'log'))
Exemplo n.º 36
0
def task(ctx, config):
    """
    Run a background task.

    Run the given command on a client, similar to exec.  However, when
    we hit the finally because the subsequent task is ready to exit, kill
    the child process.

    We do not do any error code checking here since we are forcefully killing
    off the child when we are done.

    If the command is a list, we simply join it with ;'s.

    Example::

       tasks:
       - install:
       - background_exec:
           client.0: while true ; do date ; sleep 1 ; done
           client.1:
             - while true
             - do id
             - sleep 1
             - done
       - exec:
           client.0:
             - sleep 10

    :param ctx: Context
    :param config: dict mapping a role to a command string or a list of
                   command fragments
    """
    assert isinstance(config, dict), "task background got invalid config"

    testdir = misc.get_testdir(ctx)

    tasks = {}
    for role, cmd in config.items():
        (remote, ) = ctx.cluster.only(role).remotes.keys()
        log.info('Running background command on role %s host %s', role,
                 remote.name)
        if isinstance(cmd, list):
            cmd = '; '.join(cmd)
        # str.replace() returns a new string; the result has to be kept,
        # otherwise the substitution silently does nothing.
        cmd = cmd.replace('$TESTDIR', testdir)
        tasks[remote.name] = remote.run(
            args=[
                'sudo',
                'TESTDIR=%s' % testdir,
                'daemon-helper',
                'kill',
                '--kill-group',
                'bash',
                '-c',
                cmd,
            ],
            wait=False,
            stdin=run.PIPE,
            # the child is killed on teardown, so its exit status is ignored
            check_status=False,
            logger=log.getChild(remote.name),
        )

    try:
        yield

    finally:
        for name, proc in tasks.items():
            log.info('Stopping background command on %s', name)
            # closing stdin is the stop signal for the daemon-helper wrapper
            proc.stdin.close()
        run.wait(tasks.values())
Exemplo n.º 37
0
def run_qemu(ctx, config):
    """Setup kvm environment and start qemu

    For every client in ``config`` this creates the qemu log directory,
    loads the kvm module, sets up the NFS mount used for logging, and
    starts qemu in the background with one rbd-backed virtio drive per
    configured disk.  After the yield it waits for all qemu processes,
    optionally sleeps ``time_wait`` seconds, tears down the NFS mounts,
    checks each client's ``success`` marker file and removes /export.

    :param ctx: Context
    :param config: dict mapping a client role to its per-client settings
                   (``cpus``, ``memory``, ``disks``, ``clone``,
                   ``time_wait``)
    """
    procs = []
    testdir = teuthology.get_testdir(ctx)
    # State needed again during teardown.  It is collected per client here
    # instead of relying on whatever the last loop iteration left behind
    # (which is wrong for mixed-distro clusters and undefined when config
    # is empty).
    nfs_service_names = {}
    time_wait = 0
    for client, client_config in config.items():
        (remote, ) = ctx.cluster.only(client).remotes.keys()
        log_dir = '{tdir}/archive/qemu/{client}'.format(tdir=testdir,
                                                        client=client)
        remote.run(args=[
            'mkdir',
            log_dir,
            run.Raw('&&'),
            'sudo',
            'modprobe',
            'kvm',
        ])

        nfs_service_name = 'nfs'
        if remote.os.name in ['rhel', 'centos'
                              ] and float(remote.os.version) >= 8:
            nfs_service_name = 'nfs-server'
        nfs_service_names[client] = nfs_service_name

        # make an nfs mount to use for logging and to
        # allow to test to tell teuthology the tests outcome
        _setup_nfs_mount(remote, client, nfs_service_name, log_dir)

        # Hack to make sure /dev/kvm permissions are set correctly
        # See http://tracker.ceph.com/issues/17977 and
        # https://bugzilla.redhat.com/show_bug.cgi?id=1333159
        remote.run(args='sudo udevadm control --reload')
        remote.run(args='sudo udevadm trigger /dev/kvm')
        remote.run(args='ls -l /dev/kvm')

        qemu_cmd = 'qemu-system-x86_64'
        if remote.os.package_type == "rpm":
            qemu_cmd = "/usr/libexec/qemu-kvm"
        args = [
            'adjust-ulimits',
            'ceph-coverage',
            '{tdir}/archive/coverage'.format(tdir=testdir),
            'daemon-helper',
            'term',
            qemu_cmd,
            '-enable-kvm',
            '-nographic',
            '-cpu',
            'host',
            '-smp',
            str(client_config.get('cpus', DEFAULT_CPUS)),
            '-m',
            str(client_config.get('memory', DEFAULT_MEM)),
            # cd holding metadata for cloud-init
            '-cdrom',
            '{tdir}/qemu/{client}.iso'.format(tdir=testdir, client=client),
        ]

        cachemode = 'none'
        # Merge into a private copy: calling update() on the section
        # returned by conf.get() would write client settings into the
        # shared 'global' section and leak them into later clients.
        ceph_config = dict(ctx.ceph['ceph'].conf.get('global', {}))
        ceph_config.update(ctx.ceph['ceph'].conf.get('client', {}))
        ceph_config.update(ctx.ceph['ceph'].conf.get(client, {}))
        if ceph_config.get('rbd cache', True):
            if ceph_config.get('rbd cache max dirty', 1) > 0:
                cachemode = 'writeback'
            else:
                cachemode = 'writethrough'

        clone = client_config.get('clone', False)
        num_disks = client_config.get('disks', DEFAULT_NUM_DISKS)
        if isinstance(num_disks, list):
            num_disks = len(num_disks)
        for i in range(num_disks):
            suffix = '-clone' if clone else ''
            args.extend([
                '-drive',
                'file=rbd:rbd/{img}:id={id},format=raw,if=virtio,cache={cachemode}'
                .format(
                    img='{client}.{num}{suffix}'.format(client=client,
                                                        num=i,
                                                        suffix=suffix),
                    id=client[len('client.'):],
                    cachemode=cachemode,
                ),
            ])
        # honour the longest wait requested by any client
        time_wait = max(time_wait, client_config.get('time_wait', 0))

        log.info('starting qemu...')
        procs.append(
            remote.run(
                args=args,
                logger=log.getChild(client),
                stdin=run.PIPE,
                wait=False,
            ))

    try:
        yield
    finally:
        log.info('waiting for qemu tests to finish...')
        run.wait(procs)

        if time_wait > 0:
            log.debug('waiting {time_wait} sec for workloads detect finish...'.
                      format(time_wait=time_wait))
            time.sleep(time_wait)

        log.debug('checking that qemu tests succeeded...')
        for client in config.keys():
            (remote, ) = ctx.cluster.only(client).remotes.keys()

            # ensure we have permissions to all the logs
            log_dir = '{tdir}/archive/qemu/{client}'.format(tdir=testdir,
                                                            client=client)
            remote.run(args=['sudo', 'chmod', 'a+rw', '-R', log_dir])

            # teardown nfs mount, using the service name chosen for this
            # client's remote during setup
            _teardown_nfs_mount(remote, client, nfs_service_names[client])
            # check for test status
            remote.run(args=[
                'test',
                '-f',
                '{tdir}/archive/qemu/{client}/success'.format(tdir=testdir,
                                                              client=client),
            ], )
        log.info("Deleting exported directory...")
        for client in config.keys():
            (remote, ) = ctx.cluster.only(client).remotes.keys()
            remote.run(args=['sudo', 'rm', '-r', '/export'])
Exemplo n.º 38
0
def run_qemu(ctx, config):
    """Setup kvm environment and start qemu

    For every client in ``config`` this creates the qemu log directory,
    loads the kvm module, sets up the NFS mount used for logging, and
    starts qemu in the background, booting from the client's base qcow2
    image with ``num_rbd`` additional rbd-backed virtio drives.  After the
    yield it waits for all qemu processes, tears down the NFS mounts and
    checks each client's ``success`` marker file.

    :param ctx: Context
    :param config: dict mapping a client role to its per-client settings
                   (``memory``, ``num_rbd``, ``clone``)
    """
    procs = []
    testdir = teuthology.get_testdir(ctx)
    # items()/range()/plain dict iteration behave the same on Python 2 and 3
    for client, client_config in config.items():
        (remote, ) = ctx.cluster.only(client).remotes.keys()
        log_dir = '{tdir}/archive/qemu/{client}'.format(tdir=testdir,
                                                        client=client)
        remote.run(args=[
            'mkdir',
            log_dir,
            run.Raw('&&'),
            'sudo',
            'modprobe',
            'kvm',
        ])

        # make an nfs mount to use for logging and to
        # allow to test to tell teuthology the tests outcome
        _setup_nfs_mount(remote, client, log_dir)

        base_file = '{tdir}/qemu/base.{client}.qcow2'.format(tdir=testdir,
                                                             client=client)
        # Hack to make sure /dev/kvm permissions are set correctly
        # See http://tracker.ceph.com/issues/17977 and
        # https://bugzilla.redhat.com/show_bug.cgi?id=1333159
        remote.run(args='sudo udevadm control --reload')
        remote.run(args='sudo udevadm trigger /dev/kvm')
        remote.run(args='ls -l /dev/kvm')

        qemu_cmd = 'qemu-system-x86_64'
        if remote.os.package_type == "rpm":
            qemu_cmd = "/usr/libexec/qemu-kvm"
        args = [
            'adjust-ulimits',
            'ceph-coverage',
            '{tdir}/archive/coverage'.format(tdir=testdir),
            'daemon-helper',
            'term',
            qemu_cmd,
            '-enable-kvm',
            '-nographic',
            '-m',
            str(client_config.get('memory', DEFAULT_MEM)),
            # base OS device
            '-drive',
            'file={base},format=qcow2,if=virtio'.format(base=base_file),
            # cd holding metadata for cloud-init
            '-cdrom',
            '{tdir}/qemu/{client}.iso'.format(tdir=testdir, client=client),
        ]

        cachemode = 'none'
        # Merge into a private copy: calling update() on the section
        # returned by conf.get() would write client settings into the
        # shared 'global' section and leak them into later clients.
        ceph_config = dict(ctx.ceph['ceph'].conf.get('global', {}))
        ceph_config.update(ctx.ceph['ceph'].conf.get('client', {}))
        ceph_config.update(ctx.ceph['ceph'].conf.get(client, {}))
        if ceph_config.get('rbd cache'):
            if ceph_config.get('rbd cache max dirty', 1) > 0:
                cachemode = 'writeback'
            else:
                cachemode = 'writethrough'

        clone = client_config.get('clone', False)
        for i in range(client_config.get('num_rbd', DEFAULT_NUM_RBD)):
            suffix = '-clone' if clone else ''
            args.extend([
                '-drive',
                'file=rbd:rbd/{img}:id={id},format=raw,if=virtio,cache={cachemode}'
                .format(
                    img='{client}.{num}{suffix}'.format(client=client,
                                                        num=i,
                                                        suffix=suffix),
                    id=client[len('client.'):],
                    cachemode=cachemode,
                ),
            ])

        log.info('starting qemu...')
        procs.append(
            remote.run(
                args=args,
                logger=log.getChild(client),
                stdin=run.PIPE,
                wait=False,
            ))

    try:
        yield
    finally:
        log.info('waiting for qemu tests to finish...')
        run.wait(procs)

        log.debug('checking that qemu tests succeeded...')
        for client in config:
            (remote, ) = ctx.cluster.only(client).remotes.keys()
            # teardown nfs mount
            _teardown_nfs_mount(remote, client)
            # check for test status
            remote.run(args=[
                'test',
                '-f',
                '{tdir}/archive/qemu/{client}/success'.format(tdir=testdir,
                                                              client=client),
            ], )
Exemplo n.º 39
0
def ship_utilities(ctx, config):
    """
    Copy helper files to every remote and delete them again on teardown.

    valgrind.supp (read from ``ctx.config['suite_path']``) is written into
    the test directory; daemon-helper and adjust-ulimits (read from this
    module's directory) are written to /usr/bin and chmod'ed ``a=rx``.
    After the yield, every recorded destination path is removed from all
    remotes with ``rm -f``.

    :param ctx: Context
    :param config: Configuration (must be None)
    """
    assert config is None
    testdir = teuthology.get_testdir(ctx)
    shipped = []

    log.info('Shipping valgrind.supp...')
    assert 'suite_path' in ctx.config
    try:
        supp_src = os.path.join(ctx.config['suite_path'], 'valgrind.supp')
        with open(supp_src, 'rb') as supp:
            supp_dst = os.path.join(testdir, 'valgrind.supp')
            shipped.append(supp_dst)
            for rem in ctx.cluster.remotes.keys():
                teuthology.sudo_write_file(remote=rem, path=supp_dst,
                                           data=supp)
                # rewind so the next remote receives the full file again
                supp.seek(0)
    except IOError as e:
        # a missing/unreadable suppression file is not fatal
        log.info('Cannot ship supression file for valgrind: %s...', e.strerror)

    helper_dir = os.path.dirname(__file__)
    bin_dir = '/usr/bin'
    for helper in ['daemon-helper', 'adjust-ulimits']:
        log.info('Shipping %r...', helper)
        local_path = os.path.join(helper_dir, helper)
        remote_path = os.path.join(bin_dir, helper)
        shipped.append(remote_path)
        with open(local_path, 'rb') as payload:
            for rem in ctx.cluster.remotes.keys():
                teuthology.sudo_write_file(remote=rem, path=remote_path,
                                           data=payload)
                payload.seek(0)
                chmod_cmd = ['sudo', 'chmod', 'a=rx', '--', remote_path]
                rem.run(args=chmod_cmd)

    try:
        yield
    finally:
        log.info('Removing shipped files: %s...', ' '.join(shipped))
        rm_cmd = ['sudo', 'rm', '-f', '--']
        rm_cmd.extend(shipped)
        run.wait(ctx.cluster.run(args=rm_cmd, wait=False))
Exemplo n.º 40
0
def kubeadm_install(ctx, config):
    """
    Install kubeadm, kubelet and kubectl on every remote.

    On CentOS/RHEL this also installs and starts cri-o and writes a flannel
    CNI config; on Ubuntu only the kube packages are installed.  kubelet is
    then enabled and the kubeadm images are pulled before yielding.  Any
    other distro raises RuntimeError.

    After the yield (or after a failed install) the packages are removed
    again on CentOS/RHEL unless ``config['uninstall']`` is false.

    :param ctx: Context
    :param config: dict; reads ``version`` (cri-o repo version, default
                   '1.21') and ``uninstall`` (default True)
    """
    version = config.get('version', '1.21')

    os_type = teuthology.get_distro(ctx)
    os_version = teuthology.get_distro_version(ctx)

    try:
        if os_type in ['centos', 'rhel']:
            # Repo directory name on download.opensuse.org.  Deliberately
            # not called ``os``, which would shadow the os module here.
            distro_repo = f"CentOS_{os_version.split('.')[0]}"
            log.info('Installing cri-o')
            run.wait(
                ctx.cluster.run(
                    args=[
                        'sudo',
                        'curl', '-L', '-o',
                        '/etc/yum.repos.d/devel:kubic:libcontainers:stable.repo',
                        f'https://download.opensuse.org/repositories/devel:/kubic:/libcontainers:/stable/{distro_repo}/devel:kubic:libcontainers:stable.repo',
                        run.Raw('&&'),
                        'sudo',
                        'curl', '-L', '-o',
                        f'/etc/yum.repos.d/devel:kubic:libcontainers:stable:cri-o:{version}.repo',
                        f'https://download.opensuse.org/repositories/devel:/kubic:/libcontainers:/stable:/cri-o:/{version}/{distro_repo}/devel:kubic:libcontainers:stable:cri-o:{version}.repo',
                        run.Raw('&&'),
                        'sudo', 'dnf', 'install', '-y', 'cri-o',
                    ],
                    wait=False,
                )
            )

            log.info('Installing kube{adm,ctl,let}')
            repo = """[kubernetes]
name=Kubernetes
baseurl=https://packages.cloud.google.com/yum/repos/kubernetes-el7-$basearch
enabled=1
gpgcheck=1
repo_gpgcheck=1
gpgkey=https://packages.cloud.google.com/yum/doc/yum-key.gpg https://packages.cloud.google.com/yum/doc/rpm-package-key.gpg
"""
            for remote in ctx.cluster.remotes.keys():
                remote.write_file(
                    '/etc/yum.repos.d/kubernetes.repo',
                    repo,
                    sudo=True,
                )
            run.wait(
                ctx.cluster.run(
                    args=[
                        'sudo', 'dnf', 'install', '-y',
                        'kubelet', 'kubeadm', 'kubectl',
                        'iproute-tc', 'bridge-utils',
                    ],
                    wait=False,
                )
            )

            # fix cni config
            for remote in ctx.cluster.remotes.keys():
                conf = """# from https://github.com/cri-o/cri-o/blob/master/tutorials/kubernetes.md#flannel-network
{
    "name": "crio",
    "type": "flannel"
}
"""
                remote.write_file('/etc/cni/net.d/10-crio-flannel.conf', conf, sudo=True)
                remote.run(args=[
                    'sudo', 'rm', '-f',
                    '/etc/cni/net.d/87-podman-bridge.conflist',
                    '/etc/cni/net.d/100-crio-bridge.conf',
                ])

            # start crio
            run.wait(
                ctx.cluster.run(
                    args=[
                        'sudo', 'systemctl', 'daemon-reload',
                        run.Raw('&&'),
                        'sudo', 'systemctl', 'enable', 'crio', '--now',
                    ],
                    wait=False,
                )
            )

        elif os_type == 'ubuntu':
            log.info('Installing kube{adm,ctl,let}')
            run.wait(
                ctx.cluster.run(
                    args=[
                        'sudo', 'apt', 'update',
                        run.Raw('&&'),
                        'sudo', 'apt', 'install', '-y',
                        'apt-transport-https', 'ca-certificates', 'curl',
                        run.Raw('&&'),
                        'sudo', 'curl', '-fsSLo',
                        '/usr/share/keyrings/kubernetes-archive-keyring.gpg',
                        'https://packages.cloud.google.com/apt/doc/apt-key.gpg',
                        run.Raw('&&'),
                        'echo', 'deb [signed-by=/usr/share/keyrings/kubernetes-archive-keyring.gpg] https://apt.kubernetes.io/ kubernetes-xenial main',
                        run.Raw('|'),
                        'sudo', 'tee', '/etc/apt/sources.list.d/kubernetes.list',
                        run.Raw('&&'),
                        'sudo', 'apt', 'update',
                        run.Raw('&&'),
                        'sudo', 'apt', 'install', '-y',
                        'kubelet', 'kubeadm', 'kubectl',
                        'bridge-utils',
                    ],
                    wait=False,
                )
            )

        else:
            raise RuntimeError(f'unsupported distro {os_type} for cri-o')

        run.wait(
            ctx.cluster.run(
                args=[
                    'sudo', 'systemctl', 'enable', '--now', 'kubelet',
                    run.Raw('&&'),
                    'sudo', 'kubeadm', 'config', 'images', 'pull',
                ],
                wait=False,
            )
        )

        yield

    finally:
        if config.get('uninstall', True):
            log.info('Uninstalling kube{adm,let,ctl}')
            if os_type in ['centos', 'rhel']:
                run.wait(
                    ctx.cluster.run(
                        args=[
                            'sudo', 'rm', '-f',
                            '/etc/yum.repos.d/kubernetes.repo',
                            run.Raw('&&'),
                            'sudo', 'dnf', 'remove', '-y',
                            'kubeadm', 'kubelet', 'kubectl', 'cri-o',
                        ],
                        wait=False
                    )
                )
            # NOTE(review): the ubuntu uninstall is switched off by the
            # ``and False`` below, so nothing is removed on ubuntu.  The
            # files it would delete are cri-o repo files that the ubuntu
            # install branch above never creates -- confirm the intended
            # cleanup before re-enabling.
            elif os_type == 'ubuntu' and False:
                run.wait(
                    ctx.cluster.run(
                        args=[
                            'sudo', 'rm', '-f',
                            '/etc/apt/sources.list.d/devel:kubic:libcontainers:stable.list',
                            f'/etc/apt/sources.list.d/devel:kubic:libcontainers:stable:cri-o:{version}.list',
                            '/etc/apt/trusted.gpg.d/libcontainers-cri-o.gpg',
                            run.Raw('&&'),
                            'sudo', 'apt', 'remove', '-y',
                            'kubeadm', 'kubelet', 'kubectl', 'cri-o', 'cri-o-runc',
                        ],
                        wait=False,
                    )
                )
def task(ctx, config):
    """
    Run watch_notify_same_primary

    The config should be as follows:

    watch_notify_same_primary:
        clients: [client list]

    The client list should contain 1 client

    The test requires 3 osds.

    example:

    tasks:
    - ceph:
    - watch_notify_same_primary:
        clients: [client.0]
    - interactive:

    Before yielding, 20 objects are written and watched, every object is
    notified once, osd.0 is killed and marked down, and every object is
    notified again.  After the yield each watcher is sent a newline and
    waited for, its stdout is asserted to split into exactly 4 lines, and
    osd.0 and the pool are restored/removed.
    """
    log.info('Beginning watch_notify_same_primary...')
    assert isinstance(config, dict), \
        "please list clients to run on"

    clients = config.get('clients', ['client.0'])
    assert len(clients) == 1
    role = clients[0]
    # NOTE(review): basestring exists only on Python 2 -- left untouched.
    assert isinstance(role, basestring)
    PREFIX = 'client.'
    assert role.startswith(PREFIX)
    (remote,) = ctx.cluster.only(role).remotes.keys()
    ctx.manager.raw_cluster_cmd('osd', 'set', 'noout')

    pool = ctx.manager.create_pool_with_unique_name()

    def obj(n):
        # name of the n-th test object
        return "foo-{num}".format(num=n)

    def start_watch(n):
        # create object n, then start a background 'rados watch' on it
        remote.run(
            args = [
                "rados",
                "-p", pool,
                "put",
                obj(n),
                "/etc/resolv.conf"],
            logger=log.getChild('watch.{id}'.format(id=n)))
        return remote.run(
            args = [
                "rados",
                "-p", pool,
                "watch",
                obj(n)],
            stdin=run.PIPE,
            stdout=StringIO(),
            stderr=StringIO(),
            wait=False)
    watches = [start_watch(i) for i in range(20)]

    def notify(n, msg):
        # send msg to the watcher(s) of object n and wait for completion
        remote.run(
            args = [
                "rados",
                "-p", pool,
                "notify",
                obj(n),
                msg],
            logger=log.getChild('notify.{id}'.format(id=n)))

    # plain loops: these calls are made for their side effect only
    for n in range(len(watches)):
        notify(n, 'notify1')

    ctx.manager.kill_osd(0)
    ctx.manager.mark_down_osd(0)

    for n in range(len(watches)):
        notify(n, 'notify2')

    try:
        yield
    finally:
        log.info('joining watch_notify_stress')
        for watch in watches:
            # a newline on stdin makes 'rados watch' exit
            watch.stdin.write("\n")

        run.wait(watches)

        for watch in watches:
            lines = watch.stdout.getvalue().split("\n")
            # report through the task logger rather than a bare print
            log.info('watch output: %r', lines)
            assert len(lines) == 4

        ctx.manager.revive_osd(0)
        ctx.manager.remove_pool(pool)
Exemplo n.º 42
0
def task(ctx, config):
    """
    Exercise handling of lost/unfound objects on an erasure-coded pool.

    The scenario addresses OSDs 0 through 3 by id, so it needs a fairly
    rigid cluster layout.

    Optional ``config`` keys read by this task:

    * ``erasure_code_profile`` -- dict describing the EC profile; defaults
      to k=2, m=2 with an ``osd`` crush failure domain.  Its ``name`` entry,
      when present, is used as the profile name (default ``lost_unfound``).
    * ``parallel_bench`` -- when true (the default) a ``rados bench`` write
      workload is started in the background before the unfound objects are
      marked lost, and is waited for at the very end.
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'lost_unfound task only accepts a dict for configuration'

    mon_role = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(mon_role).remotes.keys()

    manager = ceph_manager.CephManager(
        mon, ctx=ctx, logger=log.getChild('ceph_manager'))
    manager.wait_for_clean()

    fallback_profile = {
        'k': '2',
        'm': '2',
        'crush-failure-domain': 'osd',
    }
    profile = config.get('erasure_code_profile', fallback_profile)
    profile_name = profile.get('name', 'lost_unfound')
    manager.create_erasure_code_profile(profile_name, profile)
    pool = manager.create_pool_with_unique_name(
        erasure_code_profile_name=profile_name,
        min_size=2)

    def pool_op(*op_args):
        # Run one `rados -p <pool> ...` command via the first monitor's host
        # and hand back whatever the rados() helper returns.
        return rados(ctx, mon, ['-p', pool] + list(op_args))

    # OSDs that are up again once osd.1 has been declared lost
    survivors = (0, 2, 3)

    # something that is always there, readable and never empty
    dummyfile = '/etc/group'

    # kludge to make sure they get a map
    pool_op('put', 'dummy', dummyfile)

    manager.flush_pg_stats([0, 1])
    manager.wait_for_recovery()

    # create old objects: "existing_N" stays, "existed_N" is removed again
    for idx in range(1, 10):
        pool_op('put', 'existing_%d' % idx, dummyfile)
        pool_op('put', 'existed_%d' % idx, dummyfile)
        pool_op('rm', 'existed_%d' % idx)

    # delay recovery, and make the pg log very long (to prevent backfill)
    manager.raw_cluster_cmd(
        'tell', 'osd.1',
        'injectargs',
        '--osd-recovery-delay-start 1000 --osd-min-pg-log-entries 100000000'
    )

    # take osd.0 and osd.3 away before the next round of writes
    for osd in (0, 3):
        manager.kill_osd(osd)
        manager.mark_down_osd(osd)

    for idx in range(1, 10):
        for template in ('new_%d', 'existed_%d', 'existing_%d'):
            pool_op('put', template % idx, dummyfile)

    # take out osd.1 and a necessary shard of those objects.
    manager.kill_osd(1)
    manager.mark_down_osd(1)
    manager.raw_cluster_cmd('osd', 'lost', '1', '--yes-i-really-mean-it')
    for osd in (0, 3):
        manager.revive_osd(osd)
        manager.wait_till_osd_is_up(osd)

    manager.flush_pg_stats(list(survivors))
    manager.wait_till_active()
    manager.flush_pg_stats(list(survivors))

    # verify that there are unfound objects
    unfound = manager.get_num_unfound_objects()
    log.info("there are %d unfound objects" % unfound)
    assert unfound

    testdir = teuthology.get_testdir(ctx)
    procs = []
    if config.get('parallel_bench', True):
        bench_words = [
            'adjust-ulimits',
            'ceph-coverage',
            '{tdir}/archive/coverage',
            'rados',
            '--no-log-to-stderr',
            '--name', 'client.admin',
            '-b', str(4<<10),
            '-p', pool,
            '-t', '20',
            'bench', '240', 'write',
        ]
        bench_cmd = " ".join(bench_words).format(tdir=testdir)
        bench_proc = mon.run(
            args=["/bin/sh", "-c", bench_cmd],
            logger=log.getChild('radosbench.{id}'.format(id='client.admin')),
            stdin=run.PIPE,
            wait=False
        )
        procs.append(bench_proc)
    time.sleep(10)

    # mark stuff lost
    for pg in manager.get_pg_stats():
        num_unfound = pg['stat_sum']['num_objects_unfound']
        if num_unfound <= 0:
            log.info("no unfound in %s", pg['pgid'])
            continue

        # verify that i can list them direct from the osd
        log.info('listing missing/lost in %s state %s', pg['pgid'],
                 pg['state'])
        m = manager.list_pg_unfound(pg['pgid'])
        log.info('%s' % m)
        assert m['num_unfound'] == num_unfound

        log.info("reverting unfound in %s", pg['pgid'])
        manager.raw_cluster_cmd('pg', pg['pgid'],
                                'mark_unfound_lost', 'delete')

    for osd in survivors:
        manager.raw_cluster_cmd('tell', 'osd.%d' % osd,
                                'debug', 'kick_recovery_wq', '5')
    manager.flush_pg_stats(list(survivors))
    manager.wait_for_recovery()

    if not config.get('parallel_bench', True):
        time.sleep(20)

    # verify result: each read is asserted to give a truthy (error) result
    for idx in range(1, 10):
        for template in ('new_%d', 'existed_%d', 'existing_%d'):
            err = pool_op('get', template % idx, '-')
            assert err

    # see if osd.1 can cope
    manager.revive_osd(1)
    manager.wait_till_osd_is_up(1)
    manager.wait_for_clean()
    run.wait(procs)
Exemplo n.º 43
0
    def test_object_deletion(self):
        """
        That the MDS has a clean 'damaged' response to loss of any single metadata object

        The metadata pool is exported once to /tmp/metadata.bin.  Then, for
        each candidate mutation (object removal, body corruption, truncation,
        OMAP value corruption, OMAP header corruption) the pool is re-imported
        from that export, the mutation is applied, the MDS is restarted and
        the observed outcome is recorded.  After all mutations have been
        tried, recorded outcomes are compared with each mutation's
        expectation.

        :raises RuntimeError: if any recorded outcome differs from its
            expectation, or (re-raised from the wait) if the MDS neither
            became healthy, nor was marked damaged, nor had a finished
            daemon process within the startup timeout.
        """

        self._simple_workload_write()

        # Hmm, actually it would be nice to permute whether the metadata pool
        # state contains sessions or not, but for the moment close this session
        # to avoid waiting through reconnect on every MDS start.
        self.mount_a.umount_wait()
        for mds_name in self.fs.get_active_names():
            self.fs.mds_asok(["flush", "journal"], mds_name)

        self.fs.mds_stop()
        self.fs.mds_fail()

        # Reference copy of the pool; re-imported before every mutation
        self.fs.rados(['export', '/tmp/metadata.bin'])

        def is_ignored(obj_id, dentry=None):
            """
            A filter to avoid redundantly mutating many similar objects (e.g.
            stray dirfrags) or similar dentries (e.g. stray dir dentries)
            """
            # Raw string: "\." is a regex escape, not a Python string escape
            if re.match(r"60.\.00000000", obj_id) and obj_id != "600.00000000":
                return True

            if dentry and obj_id == "100.00000000":
                if re.match("stray.+_head", dentry) and dentry != "stray0_head":
                    return True

            return False

        def get_path(obj_id, dentry=None):
            """
            What filesystem path does this object or dentry correspond to?   i.e.
            what should I poke to see EIO after damaging it?
            """

            if obj_id == "1.00000000" and dentry == "subdir_head":
                return "./subdir"
            elif obj_id == "10000000000.00000000" and dentry == "sixmegs_head":
                return "./subdir/sixmegs"

            # None means ls will do an "ls -R" in hope of seeing some errors
            return None

        objects = self.fs.rados(["ls"]).split("\n")
        objects = [o for o in objects if not is_ignored(o)]

        # Find all objects with an OMAP header
        omap_header_objs = []
        for o in objects:
            header = self.fs.rados(["getomapheader", o])
            # The rados CLI wraps the header output in a hex-printed style
            header_bytes = int(re.match(r"header \((.+) bytes\)", header).group(1))
            if header_bytes > 0:
                omap_header_objs.append(o)

        # Find all OMAP key/vals
        omap_keys = []
        for o in objects:
            keys_str = self.fs.rados(["listomapkeys", o])
            if keys_str:
                for key in keys_str.split("\n"):
                    if not is_ignored(o, key):
                        omap_keys.append((o, key))

        # Find objects that have data in their bodies
        data_objects = []
        for obj_id in objects:
            stat_out = self.fs.rados(["stat", obj_id])
            size = int(re.match(".+, size (.+)$", stat_out).group(1))
            if size > 0:
                data_objects.append(obj_id)

        # Define the various forms of damage we will inflict
        class MetadataMutation(object):
            """
            One way of damaging the pool: a description, a zero-argument
            callable that applies the damage, the expected outcome and the
            path to poke from the client afterwards ("." when none is given).
            Equality and hashing are by description only, so instances can be
            used as keys of the results dict.
            """
            def __init__(self, obj_id_, desc_, mutate_fn_, expectation_, ls_path=None):
                self.obj_id = obj_id_
                self.desc = desc_
                self.mutate_fn = mutate_fn_
                self.expectation = expectation_
                if ls_path is None:
                    self.ls_path = "."
                else:
                    self.ls_path = ls_path

            def __eq__(self, other):
                return self.desc == other.desc

            def __hash__(self):
                return hash(self.desc)

        junk = "deadbeef" * 10
        mutations = []

        # NOTE: every lambda below binds its loop variables as default
        # arguments so that it keeps the values of its own iteration.

        # Removals
        for o in objects:
            if o in [
                # JournalPointers are auto-replaced if missing (same path as upgrade)
                "400.00000000",
                # Missing dirfrags for non-system dirs result in empty directory
                "10000000000.00000000",
                # PurgeQueue is auto-created if not found on startup
                "500.00000000",
                # open file table is auto-created if not found on startup
                "mds0_openfiles.0"
            ]:
                expectation = NO_DAMAGE
            else:
                expectation = DAMAGED_ON_START

            log.info("Expectation on rm '{0}' will be '{1}'".format(
                o, expectation
            ))

            mutations.append(MetadataMutation(
                o,
                "Delete {0}".format(o),
                lambda o=o: self.fs.rados(["rm", o]),
                expectation
            ))

        # Blatant corruptions
        for obj_id in data_objects:
            if obj_id == "500.00000000":
                # purge queue corruption results in read-only FS
                mutations.append(MetadataMutation(
                    obj_id,
                    "Corrupt {0}".format(obj_id),
                    lambda o=obj_id: self.fs.rados(["put", o, "-"], stdin_data=junk),
                    READONLY
                ))
            else:
                mutations.append(MetadataMutation(
                    obj_id,
                    "Corrupt {0}".format(obj_id),
                    lambda o=obj_id: self.fs.rados(["put", o, "-"], stdin_data=junk),
                    DAMAGED_ON_START
                ))

        # Truncations
        for o in data_objects:
            if o == "500.00000000":
                # The PurgeQueue is allowed to be empty: Journaler interprets
                # an empty header object as an empty journal.
                expectation = NO_DAMAGE
            else:
                expectation = DAMAGED_ON_START

            mutations.append(
                MetadataMutation(
                    o,
                    "Truncate {0}".format(o),
                    lambda o=o: self.fs.rados(["truncate", o, "0"]),
                    expectation
            ))

        # OMAP value corruptions
        for o, k in omap_keys:
            if o.startswith("100."):
                # Anything in rank 0's 'mydir'
                expectation = DAMAGED_ON_START
            else:
                expectation = EIO_ON_LS

            mutations.append(
                MetadataMutation(
                    o,
                    "Corrupt omap key {0}:{1}".format(o, k),
                    lambda o=o,k=k: self.fs.rados(["setomapval", o, k, junk]),
                    expectation,
                    get_path(o, k)
                )
            )

        # OMAP header corruptions
        for o in omap_header_objs:
            if re.match(r"60.\.00000000", o) \
                    or o in ["1.00000000", "100.00000000", "mds0_sessionmap"]:
                expectation = DAMAGED_ON_START
            else:
                expectation = NO_DAMAGE

            log.info("Expectation on corrupt header '{0}' will be '{1}'".format(
                o, expectation
            ))

            mutations.append(
                MetadataMutation(
                    o,
                    "Corrupt omap header on {0}".format(o),
                    lambda o=o: self.fs.rados(["setomapheader", o, junk]),
                    expectation
                )
            )

        # Maps each mutation to the outcome that was actually observed
        results = {}

        for mutation in mutations:
            log.info("Applying mutation '{0}'".format(mutation.desc))

            # Reset MDS state
            self.mount_a.umount_wait(force=True)
            self.fs.mds_stop()
            self.fs.mds_fail()
            self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired', '0')

            # Reset RADOS pool state
            self.fs.rados(['import', '/tmp/metadata.bin'])

            # Inject the mutation
            mutation.mutate_fn()

            # Try starting the MDS
            self.fs.mds_restart()

            # How long we'll wait between starting a daemon and expecting
            # it to make it through startup, and potentially declare itself
            # damaged to the mon cluster.
            startup_timeout = 60

            if mutation.expectation not in (EIO_ON_LS, DAMAGED_ON_LS, NO_DAMAGE):
                if mutation.expectation == DAMAGED_ON_START:
                    # The MDS may pass through active before making it to damaged
                    try:
                        self.wait_until_true(lambda: self.is_marked_damaged(0), startup_timeout)
                    except RuntimeError:
                        # Deliberate best-effort: the combined wait below
                        # decides what actually happened.
                        pass

                # Wait for MDS to either come up or go into damaged state
                try:
                    self.wait_until_true(lambda: self.is_marked_damaged(0) or self.fs.are_daemons_healthy(), startup_timeout)
                except RuntimeError:
                    crashed = False
                    # Didn't make it to healthy or damaged, did it crash?
                    for daemon_id, daemon in self.fs.mds_daemons.items():
                        if daemon.proc and daemon.proc.finished:
                            crashed = True
                            log.error("Daemon {0} crashed!".format(daemon_id))
                            daemon.proc = None  # So that subsequent stop() doesn't raise error
                    if not crashed:
                        # Didn't go health, didn't go damaged, didn't crash, so what?
                        raise
                    else:
                        log.info("Result: Mutation '{0}' led to crash".format(mutation.desc))
                        results[mutation] = CRASHED
                        continue
                if self.is_marked_damaged(0):
                    log.info("Result: Mutation '{0}' led to DAMAGED state".format(mutation.desc))
                    results[mutation] = DAMAGED_ON_START
                    continue
                else:
                    log.info("Mutation '{0}' did not prevent MDS startup, attempting ls...".format(mutation.desc))
            else:
                # Expectation says startup should succeed: wait for healthy
                try:
                    self.wait_until_true(self.fs.are_daemons_healthy, 60)
                except RuntimeError:
                    log.info("Result: Mutation '{0}' should have left us healthy, actually not.".format(mutation.desc))
                    if self.is_marked_damaged(0):
                        results[mutation] = DAMAGED_ON_START
                    else:
                        results[mutation] = FAILED_SERVER
                    continue
                log.info("Daemons came up after mutation '{0}', proceeding to ls".format(mutation.desc))

            # MDS is up, should go damaged on ls or client mount
            self.mount_a.mount()
            self.mount_a.wait_until_mounted()
            if mutation.ls_path == ".":
                proc = self.mount_a.run_shell(["ls", "-R", mutation.ls_path], wait=False)
            else:
                proc = self.mount_a.stat(mutation.ls_path, wait=False)

            if mutation.expectation == DAMAGED_ON_LS:
                try:
                    self.wait_until_true(lambda: self.is_marked_damaged(0), 60)
                    log.info("Result: Mutation '{0}' led to DAMAGED state after ls".format(mutation.desc))
                    results[mutation] = DAMAGED_ON_LS
                except RuntimeError:
                    if self.fs.are_daemons_healthy():
                        log.error("Result: Failed to go damaged on mutation '{0}', actually went active".format(
                            mutation.desc))
                        results[mutation] = NO_DAMAGE
                    else:
                        log.error("Result: Failed to go damaged on mutation '{0}'".format(mutation.desc))
                        results[mutation] = FAILED_SERVER
            elif mutation.expectation == READONLY:
                # NOTE(review): nothing is written to `results` on this path,
                # so a READONLY mutation never appears in the failure summary
                # below, even when the mkdir succeeds -- confirm that this is
                # intended.
                proc = self.mount_a.run_shell(["mkdir", "foo"], wait=False)
                try:
                    proc.wait()
                except CommandFailedError:
                    stderr = proc.stderr.getvalue()
                    log.info(stderr)
                    if "Read-only file system".lower() in stderr.lower():
                        pass
                    else:
                        raise
            else:
                try:
                    wait([proc], 20)
                    log.info("Result: Mutation '{0}' did not caused DAMAGED state".format(mutation.desc))
                    results[mutation] = NO_DAMAGE
                except MaxWhileTries:
                    log.info("Result: Failed to complete client IO on mutation '{0}'".format(mutation.desc))
                    results[mutation] = FAILED_CLIENT
                except CommandFailedError as e:
                    if e.exitstatus == errno.EIO:
                        log.info("Result: EIO on client")
                        results[mutation] = EIO_ON_LS
                    else:
                        log.info("Result: unexpected error {0} on client".format(e))
                        results[mutation] = FAILED_CLIENT

            if mutation.expectation == EIO_ON_LS:
                # EIOs mean something handled by DamageTable: assert that it has
                # been populated
                damage = json.loads(
                    self.fs.mon_manager.raw_cluster_cmd(
                        'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]), "damage", "ls", '--format=json-pretty'))
                if len(damage) == 0:
                    results[mutation] = EIO_NO_DAMAGE

        # Only mutations that recorded an outcome are checked here
        failures = [(mutation, result) for (mutation, result) in results.items() if mutation.expectation != result]
        if failures:
            log.error("{0} mutations had unexpected outcomes:".format(len(failures)))
            for mutation, result in failures:
                log.error("  Expected '{0}' actually '{1}' from '{2}'".format(
                    mutation.expectation, result, mutation.desc
                ))
            raise RuntimeError("{0} mutations had unexpected outcomes".format(len(failures)))
        else:
            log.info("All {0} mutations had expected outcomes".format(len(mutations)))
Exemplo n.º 44
0
    def test_object_deletion(self):
        """
        That the MDS has a clean 'damaged' response to loss of any single metadata object

        The metadata pool is exported once to /tmp/metadata.bin.  Then, for
        each candidate mutation (object removal, body corruption, truncation,
        OMAP value corruption, OMAP header corruption) the pool is re-imported
        from that export, the mutation is applied, the MDS is restarted and
        the observed outcome is recorded.  After all mutations have been
        tried, recorded outcomes are compared with each mutation's
        expectation.

        :raises RuntimeError: if any recorded outcome differs from its
            expectation, or (re-raised from the wait) if the MDS neither
            became healthy, nor was marked damaged, nor had a finished
            daemon process within 60 seconds.
        """

        self._simple_workload_write()

        # Hmm, actually it would be nice to permute whether the metadata pool
        # state contains sessions or not, but for the moment close this session
        # to avoid waiting through reconnect on every MDS start.
        self.mount_a.umount_wait()
        for mds_name in self.fs.get_active_names():
            self.fs.mds_asok(["flush", "journal"], mds_name)

        self.fs.mds_stop()
        self.fs.mds_fail()

        # Reference copy of the pool; re-imported before every mutation
        self.fs.rados(['export', '/tmp/metadata.bin'])

        def is_ignored(obj_id):
            """
            A filter to avoid redundantly mutating many similar objects (e.g.
            stray dirfrags)
            """
            # Raw string: "\." is a regex escape, not a Python string escape
            if re.match(r"60.\.00000000", obj_id) and obj_id != "600.00000000":
                return True

            return False

        objects = self.fs.rados(["ls"]).split("\n")
        objects = [o for o in objects if not is_ignored(o)]

        # Find all objects with an OMAP header
        omap_header_objs = []
        for o in objects:
            header = self.fs.rados(["getomapheader", o])
            # The rados CLI wraps the header output in a hex-printed style
            header_bytes = int(re.match(r"header \((.+) bytes\)", header).group(1))
            if header_bytes > 0:
                omap_header_objs.append(o)

        # Find all OMAP key/vals
        omap_keys = []
        for o in objects:
            keys_str = self.fs.rados(["listomapkeys", o])
            if keys_str:
                for key in keys_str.split("\n"):
                    omap_keys.append((o, key))

        # Find objects that have data in their bodies
        data_objects = []
        for obj_id in objects:
            stat_out = self.fs.rados(["stat", obj_id])
            size = int(re.match(".+, size (.+)$", stat_out).group(1))
            if size > 0:
                data_objects.append(obj_id)

        # Define the various forms of damage we will inflict
        class MetadataMutation(object):
            """
            One way of damaging the pool: a description, a zero-argument
            callable that applies the damage and the expected outcome.
            Equality and hashing are by description only, so instances can be
            used as keys of the results dict.
            """
            def __init__(self, obj_id_, desc_, mutate_fn_, expectation_):
                self.obj_id = obj_id_
                self.desc = desc_
                self.mutate_fn = mutate_fn_
                self.expectation = expectation_

            def __eq__(self, other):
                return self.desc == other.desc

            def __hash__(self):
                return hash(self.desc)

        # NOTE: every lambda below must bind its loop variables as default
        # arguments; a lambda that merely references a loop variable would
        # see the value from the final iteration when it is called later.

        # Removals
        mutations = []
        for obj_id in objects:
            if obj_id in [
                "400.00000000",
                "100.00000000",
                "10000000000.00000000",
                "1.00000000"
            ]:
                expectation = NO_DAMAGE
            else:
                expectation = DAMAGED_ON_START

            log.info("Expectation on rm '{0}' will be '{1}'".format(
                obj_id, expectation
            ))

            mutations.append(MetadataMutation(
                obj_id,
                "Delete {0}".format(obj_id),
                lambda o=obj_id: self.fs.rados(["rm", o]),
                expectation
            ))

        junk = "deadbeef" * 10

        # Blatant corruptions
        mutations.extend([
            MetadataMutation(
                o,
                "Corrupt {0}".format(o),
                lambda o=o: self.fs.rados(["put", o, "-"], stdin_data=junk),
                DAMAGED_ON_START
            ) for o in data_objects
        ])

        # Truncations
        mutations.extend([
            MetadataMutation(
                o,
                "Truncate {0}".format(o),
                lambda o=o: self.fs.rados(["truncate", o, "0"]),
                DAMAGED_ON_START
            ) for o in data_objects
        ])

        # OMAP value corruptions
        for o, k in omap_keys:
            if o.startswith("1.") or o.startswith("100."):
                expectation = DAMAGED_ON_START
            else:
                expectation = DAMAGED_ON_LS

            mutations.append(
                MetadataMutation(
                    o,
                    "Corrupt omap key {0}:{1}".format(o, k),
                    # Bind k as well as o: without k=k every mutation would
                    # corrupt whichever key the loop visited last.
                    lambda o=o, k=k: self.fs.rados(["setomapval", o, k, junk]),
                    expectation
                )
            )

        # OMAP header corruptions
        for obj_id in omap_header_objs:
            if obj_id == "mds0_sessionmap" or re.match(r"60.\.00000000", obj_id):
                expectation = DAMAGED_ON_START
            else:
                expectation = NO_DAMAGE

            log.info("Expectation on corrupt header '{0}' will be '{1}'".format(
                obj_id, expectation
            ))

            mutations.append(
                MetadataMutation(
                    obj_id,
                    "Corrupt omap header on {0}".format(obj_id),
                    lambda o=obj_id: self.fs.rados(["setomapheader", o, junk]),
                    expectation
                )
            )

        # Maps each mutation to the outcome that was actually observed
        results = {}

        for mutation in mutations:
            log.info("Applying mutation '{0}'".format(mutation.desc))

            # Reset MDS state
            self.mount_a.umount_wait(force=True)
            self.fs.mds_stop()
            self.fs.mds_fail()
            self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired', '0')

            # Reset RADOS pool state
            self.fs.rados(['import', '/tmp/metadata.bin'])

            # Inject the mutation
            mutation.mutate_fn()

            # Try starting the MDS
            self.fs.mds_restart()

            if mutation.expectation not in (DAMAGED_ON_LS, NO_DAMAGE):
                # Wait for MDS to either come up or go into damaged state
                try:
                    self.wait_until_true(lambda: self.is_marked_damaged(0) or self.fs.are_daemons_healthy(), 60)
                except RuntimeError:
                    crashed = False
                    # Didn't make it to healthy or damaged, did it crash?
                    for daemon_id, daemon in self.fs.mds_daemons.items():
                        # proc can be None (this loop sets it so below), so
                        # guard before reading .finished
                        if daemon.proc and daemon.proc.finished:
                            crashed = True
                            log.error("Daemon {0} crashed!".format(daemon_id))
                            daemon.proc = None  # So that subsequent stop() doesn't raise error
                    if not crashed:
                        # Didn't go health, didn't go damaged, didn't crash, so what?
                        raise
                    else:
                        log.info("Result: Mutation '{0}' led to crash".format(mutation.desc))
                        results[mutation] = CRASHED
                        continue
                if self.is_marked_damaged(0):
                    log.info("Result: Mutation '{0}' led to DAMAGED state".format(mutation.desc))
                    results[mutation] = DAMAGED_ON_START
                    continue
                else:
                    log.info("Mutation '{0}' did not prevent MDS startup, attempting ls...".format(mutation.desc))
            else:
                # Expectation says startup should succeed: wait for healthy
                try:
                    self.wait_until_true(self.fs.are_daemons_healthy, 60)
                except RuntimeError:
                    log.info("Result: Mutation '{0}' should have left us healthy, actually not.".format(mutation.desc))
                    if self.is_marked_damaged(0):
                        results[mutation] = DAMAGED_ON_START
                    else:
                        results[mutation] = FAILED_SERVER
                    continue
                log.info("Daemons came up after mutation '{0}', proceeding to ls".format(mutation.desc))

            # MDS is up, should go damaged on ls or client mount
            self.mount_a.mount()
            self.mount_a.wait_until_mounted()
            proc = self.mount_a.run_shell(["ls", "-R"], wait=False)

            if mutation.expectation == DAMAGED_ON_LS:
                try:
                    self.wait_until_true(lambda: self.is_marked_damaged(0), 60)
                    log.info("Result: Mutation '{0}' led to DAMAGED state after ls".format(mutation.desc))
                    results[mutation] = DAMAGED_ON_LS
                except RuntimeError:
                    if self.fs.are_daemons_healthy():
                        log.error("Result: Failed to go damaged on mutation '{0}', actually went active".format(
                            mutation.desc))
                        results[mutation] = NO_DAMAGE
                    else:
                        log.error("Result: Failed to go damaged on mutation '{0}'".format(mutation.desc))
                        results[mutation] = FAILED_SERVER

            else:
                try:
                    wait([proc], 20)
                    log.info("Result: As expected, mutation '{0}' did not caused DAMAGED state".format(mutation.desc))
                    results[mutation] = NO_DAMAGE
                except MaxWhileTries:
                    log.info("Result: Failed to complete client IO on mutation '{0}'".format(mutation.desc))
                    results[mutation] = FAILED_CLIENT

        # Only mutations that recorded an outcome are checked here
        failures = [(mutation, result) for (mutation, result) in results.items() if mutation.expectation != result]
        if failures:
            log.error("{0} mutations had unexpected outcomes:".format(len(failures)))
            for mutation, result in failures:
                log.error("  Expected '{0}' actually '{1}' from '{2}'".format(
                    mutation.expectation, result, mutation.desc
                ))
            raise RuntimeError("{0} mutations had unexpected outcomes".format(len(failures)))
        else:
            log.info("All mutations had expected outcomes")
Exemplo n.º 45
0
def task(ctx, config):
    """
    Set up and tear down a Ceph cluster.

    For example::

        tasks:
        - ceph:
        - interactive:

    You can also specify what branch to run::

        tasks:
        - ceph:
            branch: foo

    Or a tag::

        tasks:
        - ceph:
            tag: v0.42.13

    Or a sha1::

        tasks:
        - ceph:
            sha1: 1376a5ab0c89780eab39ffbbe436f6a6092314ed

    Or a local source dir::

        tasks:
        - ceph:
            path: /home/sage/ceph

    To capture code coverage data, use::

        tasks:
        - ceph:
            coverage: true

    To use btrfs, ext4, or xfs on the target's scratch disks, use::

        tasks:
        - ceph:
            fs: xfs
            mkfs_options: [-b,size=65536,-l,logdev=/dev/sdc1]
            mount_options: [nobarrier, inode64]

    Note, this will cause the task to check the /scratch_devs file on each node
    for available devices.  If no such file is found, /dev/sdb will be used.

    To run some daemons under valgrind, include their names
    and the tool/args to use in a valgrind section::

        tasks:
        - ceph:
          valgrind:
            mds.1: --tool=memcheck
            osd.1: [--tool=memcheck, --leak-check=no]

    Those nodes which are using memcheck or valgrind will get
    checked for bad results.

    To adjust or modify config options, use::

        tasks:
        - ceph:
            conf:
              section:
                key: value

    For example::

        tasks:
        - ceph:
            conf:
              mds.0:
                some option: value
                other key: other value
              client.0:
                debug client: 10
                debug ms: 1

    By default, the cluster log is checked for errors and warnings,
    and the run marked failed if any appear. You can ignore log
    entries by giving a list of egrep compatible regexes, e.g.::

        tasks:
        - ceph:
            log-whitelist: ['foo.*bar', 'bad message']

    Two further boolean keys are read here, both defaulting to true:
    ``wait-for-healthy`` (call ``healthy()`` once the daemons have been
    started) and ``wait-for-scrub`` (call ``osd_scrub_pgs()`` during
    teardown).

    Values from the ``ceph`` section of ``ctx.config['overrides']`` are
    merged into ``config`` before anything else is done.

    This is a generator that yields exactly once, while the cluster is up.
    As side effects it sets ``ctx.daemons`` and ``ctx.manager``.

    :param ctx: Context
    :param config: Configuration
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        "task ceph only supports a dictionary for configuration"

    overrides = ctx.config.get('overrides', {})
    teuthology.deep_merge(config, overrides.get('ceph', {}))

    ctx.daemons = DaemonGroup()

    testdir = teuthology.get_testdir(ctx)
    if config.get('coverage'):
        coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
        log.info('Creating coverage directory...')
        run.wait(
            ctx.cluster.run(
                args=[
                    'install', '-d', '-m0755', '--',
                    coverage_dir,
                    ],
                wait=False,
                )
            )

    # Each stage is wrapped in a lambda and handed to contextutil.nested in
    # this order: logging, valgrind checks, cluster files, then the mon, osd
    # and mds daemons with crush and cephfs setup in between.
    with contextutil.nested(
        lambda: ceph_log(ctx=ctx, config=None),
        lambda: valgrind_post(ctx=ctx, config=config),
        lambda: cluster(ctx=ctx, config=dict(
                conf=config.get('conf', {}),
                fs=config.get('fs', None),
                mkfs_options=config.get('mkfs_options', None),
                mount_options=config.get('mount_options', None),
                block_journal=config.get('block_journal', None),
                tmpfs_journal=config.get('tmpfs_journal', None),
                log_whitelist=config.get('log-whitelist', []),
                cpu_profile=set(config.get('cpu_profile', [])),
                )),
        lambda: run_daemon(ctx=ctx, config=config, type_='mon'),
        lambda: crush_setup(ctx=ctx, config=config),
        lambda: run_daemon(ctx=ctx, config=config, type_='osd'),
        lambda: cephfs_setup(ctx=ctx, config=config),
        lambda: run_daemon(ctx=ctx, config=config, type_='mds'),
        ):
        try:
            if config.get('wait-for-healthy', True):
                healthy(ctx=ctx, config=None)
            first_mon = teuthology.get_first_mon(ctx, config)
            # keys() rather than the Python 2-only iterkeys(): the
            # single-element unpacking works with either a list or a view.
            (mon,) = ctx.cluster.only(first_mon).remotes.keys()
            ctx.manager = CephManager(
                mon,
                ctx=ctx,
                logger=log.getChild('ceph_manager'),
            )
            yield
        finally:
            if config.get('wait-for-scrub', True):
                osd_scrub_pgs(ctx, config)
Exemplo n.º 46
0
def ceph_log(ctx, config):
    """
    Prepare the ceph log directories on every node, then compress and
    archive the logs once the enclosed work is done.

    Before yielding, /var/log/ceph is made world-writeable, the ceph
    logrotate configuration is removed, and the valgrind and
    profiling-logger sub-directories are created.  After the yield, when
    archiving applies, the ``*.log`` files under /var/log/ceph are gzipped
    and the directory is pulled from each remote into
    ``<ctx.archive>/remote/<shortname>/log``.

    :param ctx: Context
    :param config: Configuration (not read by this function)
    """
    # Each entry is (log message, command); the commands run on the whole
    # cluster, one after the other, in this order.
    setup_steps = (
        ('Making ceph log dir writeable by non-root...',
         ['sudo', 'chmod', '777', '/var/log/ceph']),
        ('Disabling ceph logrotate...',
         ['sudo', 'rm', '-f', '--', '/etc/logrotate.d/ceph']),
        ('Creating extra log directories...',
         ['sudo', 'install', '-d', '-m0777', '--',
          '/var/log/ceph/valgrind',
          '/var/log/ceph/profiling-logger']),
    )
    for message, command in setup_steps:
        log.info(message)
        run.wait(ctx.cluster.run(args=command, wait=False))

    try:
        yield

    finally:
        # Archiving is skipped when there is no archive directory, or when
        # 'archive-on-error' is set and the run succeeded.
        skip_archive = ctx.archive is None or (
            ctx.config.get('archive-on-error') and ctx.summary['success'])
        if not skip_archive:
            # and logs
            log.info('Compressing logs...')
            compress_cmd = [
                'sudo', 'find', '/var/log/ceph',
                '-name', '*.log', '-print0',
                run.Raw('|'),
                'sudo', 'xargs', '-0', '--no-run-if-empty', '--',
                'gzip', '--',
            ]
            run.wait(ctx.cluster.run(args=compress_cmd, wait=False))

            log.info('Archiving logs...')
            archive_root = os.path.join(ctx.archive, 'remote')
            os.makedirs(archive_root)
            for node in ctx.cluster.remotes.iterkeys():
                node_dir = os.path.join(archive_root, node.shortname)
                os.makedirs(node_dir)
                teuthology.pull_directory(
                    node,
                    '/var/log/ceph',
                    os.path.join(node_dir, 'log'),
                )
Exemplo n.º 47
0
def task(ctx, config):
    """
    Run radosbench

    The config should be as follows:

    radosbench:
        clients: [client list] (defaults to [client.0])
        time: <seconds to run> (defaults to 360)
        pool: <pool to use> (defaults to data)
        size: write size to use (defaults to 4 MiB)
        unique_pool: listed for completeness; not read by this task
        ec_pool: create an ec pool, defaults to False
        create_pool: create pool, defaults to True
        erasure_code_profile:
          name: teuthologyprofile
          k: 2
          m: 1
          ruleset-failure-domain: osd
        cleanup: false (defaults to true)
    example:

    tasks:
    - ceph:
    - radosbench:
        clients: [client.0]
        time: 360
    - interactive:

    When ``create_pool`` is true and ``pool`` is 'data', a uniquely named
    pool is created for each client instead of using 'data' itself.  Every
    pool created here is removed again once the benchmarks have been joined.

    :param ctx: Context
    :param config: Configuration (must be a dict)
    """
    log.info('Beginning radosbench...')
    assert isinstance(config, dict), \
        "please list clients to run on"
    radosbench = {}
    # Names of the pools this task created, in creation order, so that all
    # of them (not just the last one) can be removed on exit.
    created_pools = []

    testdir = teuthology.get_testdir(ctx)

    create_pool = config.get('create_pool', True)
    for role in config.get('clients', ['client.0']):
        assert isinstance(role, basestring)
        PREFIX = 'client.'
        assert role.startswith(PREFIX)
        id_ = role[len(PREFIX):]
        (remote,) = ctx.cluster.only(role).remotes.iterkeys()

        if config.get('ec_pool', False):
            profile = config.get('erasure_code_profile', {})
            profile_name = profile.get('name', 'teuthologyprofile')
            ctx.manager.create_erasure_code_profile(profile_name, profile)
        else:
            profile_name = None

        cleanup = []
        if not config.get('cleanup', True):
            cleanup = ['--no-cleanup']

        pool = config.get('pool', 'data')
        if create_pool:
            if pool != 'data':
                ctx.manager.create_pool(pool, erasure_code_profile_name=profile_name)
            else:
                pool = ctx.manager.create_pool_with_unique_name(erasure_code_profile_name=profile_name)
            if pool not in created_pools:
                created_pools.append(pool)

        proc = remote.run(
            args=[
                "/bin/sh", "-c",
                # The whole joined command line is passed through format()
                # so that the {tdir} placeholder is expanded.
                " ".join(['adjust-ulimits',
                          'ceph-coverage',
                          '{tdir}/archive/coverage',
                          'rados',
                          '--no-log-to-stderr',
                          '--name', role,
                          '-b', str(config.get('size', 4<<20)),
                          '-p' , pool,
                          'bench', str(config.get('time', 360)), 'write',
                          ] + cleanup).format(tdir=testdir),
                ],
            logger=log.getChild('radosbench.{id}'.format(id=id_)),
            stdin=run.PIPE,
            wait=False
            )
        radosbench[id_] = proc

    try:
        yield
    finally:
        timeout = config.get('time', 360) * 5 + 180
        log.info('joining radosbench (timing out after %ss)', timeout)
        run.wait(radosbench.itervalues(), timeout=timeout)

        # Pool names are compared by value when they are recorded above;
        # nothing is recorded (and so nothing removed) when create_pool is
        # false or when no client was started.
        for created in created_pools:
            ctx.manager.remove_pool(created)
Exemplo n.º 48
0
def cluster(ctx, config):
    """
    Handle the creation and removal of a ceph cluster.

    On startup:
        Create directories needed for the cluster.
        Create remote journals for all osds.
        Create and set keyring.
        Copy the monmap to the test systems.
        Setup mon nodes.
        Setup mds nodes.
        Mkfs osd nodes.
        Add keyring information to monmaps
        Mkfs mon nodes.

    On exit:
        If errors occurred, extract a failure message and store in ctx.summary.
        Unmount all test files and temporary journaling files.
        Save the monitor information and archive all ceph logs.
        Cleanup the keyring setup, and remove all monitor map and data files left over.

    If ``ctx.config['use_existing_cluster']`` is True, nothing is created
    or cleaned up: the function yields once and finishes.

    Keys read from ``config``: 'conf' (required), 'log_whitelist'
    (required on exit), and optionally 'fs', 'block_journal',
    'tmpfs_journal', 'mkfs_options', 'mount_options', 'keyring_path' and
    'conf_path'.

    :param ctx: Context
    :param config: Configuration
    """
    if ctx.config.get('use_existing_cluster', False) is True:
        log.info("'use_existing_cluster' is true; skipping cluster creation")
        yield
        # Stop here.  Without this return, resuming after the yield would
        # fall through into the cluster creation below and reach a second
        # yield.
        return

    testdir = teuthology.get_testdir(ctx)
    log.info('Creating ceph cluster...')
    run.wait(
        ctx.cluster.run(
            args=[
                'install', '-d', '-m0755', '--',
                '{tdir}/data'.format(tdir=testdir),
                ],
            wait=False,
            )
        )

    run.wait(
        ctx.cluster.run(
            args=[
                'sudo',
                'install', '-d', '-m0777', '--', '/var/run/ceph',
                ],
            wait=False,
            )
        )


    # remote -> list of OSD data directories mounted here (unmounted on exit)
    devs_to_clean = {}
    # remote -> {osd id -> data device}
    remote_to_roles_to_devs = {}
    # remote -> {osd id -> journal device or file}
    remote_to_roles_to_journals = {}
    osds = ctx.cluster.only(teuthology.is_type('osd'))
    for remote, roles_for_host in osds.remotes.iteritems():
        devs = teuthology.get_scratch_devices(remote)
        roles_to_devs = {}
        roles_to_journals = {}
        if config.get('fs'):
            log.info('fs option selected, checking for scratch devs')
            log.info('found devs: %s' % (str(devs),))
            devs_id_map = teuthology.get_wwn_id_map(remote, devs)
            iddevs = devs_id_map.values()
            roles_to_devs = assign_devs(
                teuthology.roles_of_type(roles_for_host, 'osd'), iddevs
                )
            # Keep only the devices left over after the data assignment.
            if len(roles_to_devs) < len(iddevs):
                iddevs = iddevs[len(roles_to_devs):]
            devs_to_clean[remote] = []

        if config.get('block_journal'):
            log.info('block journal enabled')
            # NOTE(review): iddevs is only bound in the 'fs' branch above, so
            # 'block_journal' without 'fs' raises NameError here -- confirm
            # whether that combination is meant to be supported.
            roles_to_journals = assign_devs(
                teuthology.roles_of_type(roles_for_host, 'osd'), iddevs
                )
            log.info('journal map: %s', roles_to_journals)

        if config.get('tmpfs_journal'):
            # Replaces any block journal assignment made just above.
            log.info('tmpfs journal enabled')
            roles_to_journals = {}
            remote.run( args=[ 'sudo', 'mount', '-t', 'tmpfs', 'tmpfs', '/mnt' ] )
            for osd in teuthology.roles_of_type(roles_for_host, 'osd'):
                tmpfs = '/mnt/osd.%s' % osd
                roles_to_journals[osd] = tmpfs
                remote.run( args=[ 'truncate', '-s', '1500M', tmpfs ] )
            log.info('journal map: %s', roles_to_journals)

        log.info('dev map: %s' % (str(roles_to_devs),))
        remote_to_roles_to_devs[remote] = roles_to_devs
        remote_to_roles_to_journals[remote] = roles_to_journals


    log.info('Generating config...')
    remotes_and_roles = ctx.cluster.remotes.items()
    roles = [role_list for (remote, role_list) in remotes_and_roles]
    # Peer address of each remote's ssh transport, in the same order as roles.
    ips = [host for (host, port) in
           (remote.ssh.get_transport().getpeername()
            for (remote, role_list) in remotes_and_roles)]
    conf = teuthology.skeleton_config(ctx, roles=roles, ips=ips)
    for remote, roles_to_journals in remote_to_roles_to_journals.iteritems():
        for role, journal in roles_to_journals.iteritems():
            key = "osd." + str(role)
            if key not in conf:
                conf[key] = {}
            conf[key]['osd journal'] = journal
    # Overlay the caller-supplied sections on top of the skeleton config.
    for section, keys in config['conf'].iteritems():
        for key, value in keys.iteritems():
            log.info("[%s] %s = %s" % (section, key, value))
            if section not in conf:
                conf[section] = {}
            conf[section][key] = value

    if config.get('tmpfs_journal'):
        conf['journal dio'] = False

    ctx.ceph = argparse.Namespace()
    ctx.ceph.conf = conf

    keyring_path = config.get('keyring_path', '/etc/ceph/ceph.keyring')

    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)

    firstmon = teuthology.get_first_mon(ctx, config)

    log.info('Setting up %s...' % firstmon)
    ctx.cluster.only(firstmon).run(
        args=[
            'sudo',
            'adjust-ulimits',
            'ceph-coverage',
            coverage_dir,
            'ceph-authtool',
            '--create-keyring',
            keyring_path,
            ],
        )
    ctx.cluster.only(firstmon).run(
        args=[
            'sudo',
            'adjust-ulimits',
            'ceph-coverage',
            coverage_dir,
            'ceph-authtool',
            '--gen-key',
            '--name=mon.',
            keyring_path,
            ],
        )
    ctx.cluster.only(firstmon).run(
        args=[
            'sudo',
            'chmod',
            '0644',
            keyring_path,
            ],
        )
    (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
    fsid = teuthology.create_simple_monmap(
        ctx,
        remote=mon0_remote,
        conf=conf,
        )
    if 'global' not in conf:
        conf['global'] = {}
    conf['global']['fsid'] = fsid

    log.info('Writing ceph.conf for FSID %s...' % fsid)
    conf_path = config.get('conf_path', DEFAULT_CONF_PATH)
    write_conf(ctx, conf_path)

    log.info('Creating admin key on %s...' % firstmon)
    ctx.cluster.only(firstmon).run(
        args=[
            'sudo',
            'adjust-ulimits',
            'ceph-coverage',
            coverage_dir,
            'ceph-authtool',
            '--gen-key',
            '--name=client.admin',
            '--set-uid=0',
            '--cap', 'mon', 'allow *',
            '--cap', 'osd', 'allow *',
            '--cap', 'mds', 'allow *',
            keyring_path,
            ],
        )

    log.info('Copying monmap to all nodes...')
    keyring = teuthology.get_file(
        remote=mon0_remote,
        path=keyring_path,
        )
    monmap = teuthology.get_file(
        remote=mon0_remote,
        path='{tdir}/monmap'.format(tdir=testdir),
        )

    for rem in ctx.cluster.remotes.iterkeys():
        # copy mon key and initial monmap
        log.info('Sending monmap to node {remote}'.format(remote=rem))
        teuthology.sudo_write_file(
            remote=rem,
            path=keyring_path,
            data=keyring,
            perms='0644'
            )
        teuthology.write_file(
            remote=rem,
            path='{tdir}/monmap'.format(tdir=testdir),
            data=monmap,
            )

    log.info('Setting up mon nodes...')
    mons = ctx.cluster.only(teuthology.is_type('mon'))
    run.wait(
        mons.run(
            args=[
                'adjust-ulimits',
                'ceph-coverage',
                coverage_dir,
                'osdmaptool',
                '-c', conf_path,
                '--clobber',
                '--createsimple', '{num:d}'.format(
                    num=teuthology.num_instances_of_type(ctx.cluster, 'osd'),
                    ),
                '{tdir}/osdmap'.format(tdir=testdir),
                '--pg_bits', '2',
                '--pgp_bits', '4',
                ],
            wait=False,
            ),
        )

    log.info('Setting up mds nodes...')
    mdss = ctx.cluster.only(teuthology.is_type('mds'))
    for remote, roles_for_host in mdss.remotes.iteritems():
        for id_ in teuthology.roles_of_type(roles_for_host, 'mds'):
            remote.run(
                args=[
                    'sudo',
                    'mkdir',
                    '-p',
                    '/var/lib/ceph/mds/ceph-{id}'.format(id=id_),
                    run.Raw('&&'),
                    'sudo',
                    'adjust-ulimits',
                    'ceph-coverage',
                    coverage_dir,
                    'ceph-authtool',
                    '--create-keyring',
                    '--gen-key',
                    '--name=mds.{id}'.format(id=id_),
                    '/var/lib/ceph/mds/ceph-{id}/keyring'.format(id=id_),
                    ],
                )

    cclient.create_keyring(ctx)
    log.info('Running mkfs on osd nodes...')

    ctx.disk_config = argparse.Namespace()
    ctx.disk_config.remote_to_roles_to_dev = remote_to_roles_to_devs
    ctx.disk_config.remote_to_roles_to_journals = remote_to_roles_to_journals
    ctx.disk_config.remote_to_roles_to_dev_mount_options = {}
    ctx.disk_config.remote_to_roles_to_dev_fstype = {}

    log.info("ctx.disk_config.remote_to_roles_to_dev: {r}".format(r=str(ctx.disk_config.remote_to_roles_to_dev)))
    for remote, roles_for_host in osds.remotes.iteritems():
        roles_to_devs = remote_to_roles_to_devs[remote]
        roles_to_journals = remote_to_roles_to_journals[remote]


        for id_ in teuthology.roles_of_type(roles_for_host, 'osd'):
            remote.run(
                args=[
                    'sudo',
                    'mkdir',
                    '-p',
                    '/var/lib/ceph/osd/ceph-{id}'.format(id=id_),
                    ])
            log.info(str(roles_to_journals))
            log.info(id_)
            # Only OSDs that were assigned a scratch device get a freshly
            # made and mounted filesystem.
            if roles_to_devs.get(id_):
                dev = roles_to_devs[id_]
                fs = config.get('fs')
                package = None
                mkfs_options = config.get('mkfs_options')
                mount_options = config.get('mount_options')
                # Per-filesystem defaults apply only where the config gave
                # no value (None).
                if fs == 'btrfs':
                    #package = 'btrfs-tools'
                    if mount_options is None:
                        mount_options = ['noatime','user_subvol_rm_allowed']
                    if mkfs_options is None:
                        mkfs_options = ['-m', 'single',
                                        '-l', '32768',
                                        '-n', '32768']
                if fs == 'xfs':
                    #package = 'xfsprogs'
                    if mount_options is None:
                        mount_options = ['noatime']
                    if mkfs_options is None:
                        mkfs_options = ['-f', '-i', 'size=2048']
                if fs == 'ext4' or fs == 'ext3':
                    if mount_options is None:
                        mount_options = ['noatime','user_xattr']

                if mount_options is None:
                    mount_options = []
                if mkfs_options is None:
                    mkfs_options = []
                mkfs = ['mkfs.%s' % fs] + mkfs_options
                log.info('%s on %s on %s' % (mkfs, dev, remote))
                if package is not None:
                    remote.run(
                        args=[
                            'sudo',
                            'apt-get', 'install', '-y', package
                            ],
                        stdout=StringIO(),
                        )

                try:
                    remote.run(args= ['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev])
                except run.CommandFailedError:
                    # Newer btfs-tools doesn't prompt for overwrite, use -f.
                    # The force flag belongs to the mkfs options, so that is
                    # the list that must be checked before adding it.
                    if '-f' not in mkfs_options:
                        mkfs_options.append('-f')
                        mkfs = ['mkfs.%s' % fs] + mkfs_options
                        log.info('%s on %s on %s' % (mkfs, dev, remote))
                    remote.run(args= ['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev])

                osd_dir = os.path.join('/var/lib/ceph/osd',
                                       'ceph-{id}'.format(id=id_))
                log.info('mount %s on %s -o %s' % (dev, remote,
                                                   ','.join(mount_options)))
                remote.run(
                    args=[
                        'sudo',
                        'mount',
                        '-t', fs,
                        '-o', ','.join(mount_options),
                        dev,
                        osd_dir,
                        ]
                    )
                if remote not in ctx.disk_config.remote_to_roles_to_dev_mount_options:
                    ctx.disk_config.remote_to_roles_to_dev_mount_options[remote] = {}
                ctx.disk_config.remote_to_roles_to_dev_mount_options[remote][id_] = mount_options
                if remote not in ctx.disk_config.remote_to_roles_to_dev_fstype:
                    ctx.disk_config.remote_to_roles_to_dev_fstype[remote] = {}
                ctx.disk_config.remote_to_roles_to_dev_fstype[remote][id_] = fs
                # Remember the mount point so it is unmounted on exit.
                devs_to_clean[remote].append(osd_dir)

        for id_ in teuthology.roles_of_type(roles_for_host, 'osd'):
            remote.run(
                args=[
                    'sudo',
                    'MALLOC_CHECK_=3',
                    'adjust-ulimits',
                    'ceph-coverage',
                    coverage_dir,
                    'ceph-osd',
                    '--mkfs',
                    '--mkkey',
                    '-i', id_,
                    '--monmap', '{tdir}/monmap'.format(tdir=testdir),
                    ],
                )


    log.info('Reading keys from all nodes...')
    # keys_fp accumulates the raw keyring text; keys remembers which
    # (type, id) each piece belongs to so caps can be set afterwards.
    keys_fp = StringIO()
    keys = []
    for remote, roles_for_host in ctx.cluster.remotes.iteritems():
        for type_ in ['mds','osd']:
            for id_ in teuthology.roles_of_type(roles_for_host, type_):
                data = teuthology.get_file(
                    remote=remote,
                    path='/var/lib/ceph/{type}/ceph-{id}/keyring'.format(
                        type=type_,
                        id=id_,
                        ),
                    sudo=True,
                    )
                keys.append((type_, id_, data))
                keys_fp.write(data)
    for remote, roles_for_host in ctx.cluster.remotes.iteritems():
        for type_ in ['client']:
            for id_ in teuthology.roles_of_type(roles_for_host, type_):
                data = teuthology.get_file(
                    remote=remote,
                    path='/etc/ceph/ceph.client.{id}.keyring'.format(id=id_)
                    )
                keys.append((type_, id_, data))
                keys_fp.write(data)

    log.info('Adding keys to all mons...')
    # Append the collected keys to the keyring on every mon via 'tee -a'.
    writes = mons.run(
        args=[
            'sudo', 'tee', '-a',
            keyring_path,
            ],
        stdin=run.PIPE,
        wait=False,
        stdout=StringIO(),
        )
    keys_fp.seek(0)
    teuthology.feed_many_stdins_and_close(keys_fp, writes)
    run.wait(writes)
    for type_, id_, data in keys:
        run.wait(
            mons.run(
                args=[
                    'sudo',
                    'adjust-ulimits',
                    'ceph-coverage',
                    coverage_dir,
                    'ceph-authtool',
                    keyring_path,
                    '--name={type}.{id}'.format(
                        type=type_,
                        id=id_,
                        ),
                    ] + list(teuthology.generate_caps(type_)),
                wait=False,
                ),
            )

    log.info('Running mkfs on mon nodes...')
    for remote, roles_for_host in mons.remotes.iteritems():
        for id_ in teuthology.roles_of_type(roles_for_host, 'mon'):
            remote.run(
                args=[
                  'sudo',
                  'mkdir',
                  '-p',
                  '/var/lib/ceph/mon/ceph-{id}'.format(id=id_),
                  ],
                )
            remote.run(
                args=[
                    'sudo',
                    'adjust-ulimits',
                    'ceph-coverage',
                    coverage_dir,
                    'ceph-mon',
                    '--mkfs',
                    '-i', id_,
                    '--monmap={tdir}/monmap'.format(tdir=testdir),
                    '--osdmap={tdir}/osdmap'.format(tdir=testdir),
                    '--keyring={kpath}'.format(kpath=keyring_path),
                    ],
                )


    # The map files were only needed for the mkfs steps above.
    run.wait(
        mons.run(
            args=[
                'rm',
                '--',
                '{tdir}/monmap'.format(tdir=testdir),
                '{tdir}/osdmap'.format(tdir=testdir),
                ],
            wait=False,
            ),
        )

    try:
        yield
    except Exception:
        # we need to know this below
        ctx.summary['success'] = False
        raise
    finally:
        (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()

        log.info('Checking cluster log for badness...')
        def first_in_ceph_log(pattern, excludes):
            """
            Find the first occurrence of the pattern specified in the Ceph log,
            Returns None if none found.

            :param pattern: Pattern scanned for.
            :param excludes: Patterns to ignore.
            :return: First line of text (or None if not found)
            """
            args = [
                'sudo',
                'egrep', pattern,
                '/var/log/ceph/ceph.log',
                ]
            for exclude in excludes:
                args.extend([run.Raw('|'), 'egrep', '-v', exclude])
            args.extend([
                    run.Raw('|'), 'head', '-n', '1',
                    ])
            r = mon0_remote.run(
                stdout=StringIO(),
                args=args,
                )
            stdout = r.stdout.getvalue()
            if stdout != '':
                return stdout
            return None

        if first_in_ceph_log(r'\[ERR\]|\[WRN\]|\[SEC\]',
                             config['log_whitelist']) is not None:
            log.warning('Found errors (ERR|WRN|SEC) in cluster log')
            ctx.summary['success'] = False
            # use the most severe problem as the failure reason
            if 'failure_reason' not in ctx.summary:
                for pattern in [r'\[SEC\]', r'\[ERR\]', r'\[WRN\]']:
                    match = first_in_ceph_log(pattern, config['log_whitelist'])
                    if match is not None:
                        ctx.summary['failure_reason'] = \
                            '"{match}" in cluster log'.format(
                            match=match.rstrip('\n'),
                            )
                        break

        for remote, dirs in devs_to_clean.iteritems():
            for dir_ in dirs:
                log.info('Unmounting %s on %s' % (dir_, remote))
                try:
                    remote.run(
                        args=[
                            'sync',
                            run.Raw('&&'),
                            'sudo',
                            'umount',
                            '-f',
                            dir_
                        ]
                    )
                except Exception as e:
                    # Dump open files and the process tree for diagnosis
                    # before propagating the unmount failure.
                    remote.run(args=[
                            'sudo',
                            run.Raw('PATH=/usr/sbin:$PATH'),
                            'lsof',
                            run.Raw(';'),
                            'ps', 'auxf',
                            ])
                    raise e

        if config.get('tmpfs_journal'):
            log.info('tmpfs journal enabled - unmounting tmpfs at /mnt')
            for remote, roles_for_host in osds.remotes.iteritems():
                remote.run(
                    args=[ 'sudo', 'umount', '-f', '/mnt' ],
                    check_status=False,
                )

        # Archiving is skipped when there is no archive directory, or when
        # 'archive-on-error' is set and the run succeeded.
        if ctx.archive is not None and \
           not (ctx.config.get('archive-on-error') and ctx.summary['success']):

            # archive mon data, too
            log.info('Archiving mon data...')
            path = os.path.join(ctx.archive, 'data')
            os.makedirs(path)
            for remote, roles in mons.remotes.iteritems():
                for role in roles:
                    if role.startswith('mon.'):
                        teuthology.pull_directory_tarball(
                            remote,
                            '/var/lib/ceph/mon',
                            path + '/' + role + '.tgz')

        log.info('Cleaning ceph cluster...')
        run.wait(
            ctx.cluster.run(
                args=[
                    'sudo',
                    'rm',
                    '-rf',
                    '--',
                    conf_path,
                    keyring_path,
                    '{tdir}/data'.format(tdir=testdir),
                    '{tdir}/monmap'.format(tdir=testdir),
                    ],
                wait=False,
                ),
            )
Exemplo n.º 49
0
def task(ctx, config):
    """
    Run omapbench

    The config should be as follows::

        omapbench:
            clients: [client list]
            threads: <threads at once>
            objects: <number of objects to write>
            entries: <number of entries per object map>
            keysize: <number of characters per object map key>
            valsize: <number of characters per object map val>
            increment: <interval to show in histogram (in ms)>
            omaptype: <how the omaps should be generated>

    example::

        tasks:
        - ceph:
        - omapbench:
            clients: [client.0]
            threads: 30
            objects: 1000
            entries: 10
            keysize: 10
            valsize: 100
            increment: 100
            omaptype: uniform
        - interactive:

    One omapbench process is started per client before yielding; all of
    them are joined after the yield.
    """
    log.info('Beginning omapbench...')
    assert isinstance(config, dict), \
        "please list clients to run on"
    testdir = teuthology.get_testdir(ctx)
    print(str(config.get('increment', -1)))

    prefix = 'client.'
    # client id -> running omapbench process
    running = {}
    for role in config.get('clients', ['client.0']):
        assert isinstance(role, basestring)
        assert role.startswith(prefix)
        client_id = role[len(prefix):]
        (remote, ) = ctx.cluster.only(role).remotes.iterkeys()

        # (flag, config key, default) for every numeric/string option, in
        # the order they appear on the command line.
        option_table = (
            ('-t', 'threads', 30),
            ('-o', 'objects', 1000),
            ('--entries', 'entries', 10),
            ('--keysize', 'keysize', 10),
            ('--valsize', 'valsize', 1000),
            ('--inc', 'increment', 10),
            ('--omaptype', 'omaptype', 'uniform'),
        )
        words = [
            'adjust-ulimits', 'ceph-coverage', '{tdir}/archive/coverage',
            'omapbench', '--name', client_id,
        ]
        for flag, key, default in option_table:
            words.append(flag)
            words.append(str(config.get(key, default)))

        # format() is applied to the whole joined command line, which is
        # what expands the {tdir} placeholder.
        command_line = " ".join(words).format(tdir=testdir)
        running[client_id] = remote.run(
            args=["/bin/sh", "-c", command_line],
            logger=log.getChild('omapbench.{id}'.format(id=client_id)),
            stdin=run.PIPE,
            wait=False,
        )

    try:
        yield
    finally:
        log.info('joining omapbench')
        run.wait(running.itervalues())
Exemplo n.º 50
0
    def do_thrash(self):
        """
        Kill and revive daemons at random until ``self.stopping`` is set.

        Each round optionally waits, sends SIGTERM to a random selection of
        ``self.daemons`` (at most ``self.max_thrash`` of them), waits again,
        then waits for every killed daemon to exit, resets it and starts it
        again.  The number of kills is logged when the loop ends.
        """

        self.log('starting thrash for cluster {cluster}'.format(
            cluster=self.cluster))
        stats = {"kill": 0}

        while not self.stopping.is_set():
            if self.randomize:
                pause = random.randrange(self.min_thrash_delay,
                                         self.max_thrash_delay)
            else:
                pause = self.max_thrash_delay

            if pause > 0.0:
                self.log('waiting for {delay} secs before thrashing'.format(
                    delay=pause))
                self.stopping.wait(pause)
                if self.stopping.is_set():
                    continue

            victims = []

            # Every daemon is killed with probability 1/len(daemons).
            weight = 1.0 / len(self.daemons)
            for daemon in self.daemons:
                roll = random.uniform(0.0, 1.0)
                if roll >= weight:
                    self.log(
                        'skipping daemon {label} with skip ({skip}) > weight ({weight})'
                        .format(label=daemon.id_, skip=roll, weight=weight))
                    continue

                self.log('kill {label}'.format(label=daemon.id_))
                try:
                    daemon.signal(signal.SIGTERM)
                except socket.error:
                    pass
                victims.append(daemon)
                stats['kill'] += 1

                # if we've reached max_thrash, we're done
                if len(victims) >= self.max_thrash:
                    break

            if not victims:
                # nothing was killed this round, so nothing to revive
                continue

            # wait for a while before restarting
            if self.randomize:
                revive_pause = random.randrange(0.0, self.max_revive_delay)
            else:
                revive_pause = self.max_revive_delay

            self.log(
                'waiting for {delay} secs before reviving daemons'.format(
                    delay=revive_pause))
            sleep(revive_pause)

            for daemon in victims:
                self.log('waiting for {label}'.format(label=daemon.id_))
                try:
                    run.wait([daemon.proc], timeout=600)
                except CommandFailedError:
                    # a non-zero exit status is tolerated for a killed daemon
                    pass
                except:
                    self.log(
                        'Failed to stop {label}'.format(label=daemon.id_))

                    try:
                        # try to capture a core dump
                        daemon.signal(signal.SIGABRT)
                    except socket.error:
                        pass
                    raise
                finally:
                    daemon.reset()

            for daemon in victims:
                self.log('reviving {label}'.format(label=daemon.id_))
                daemon.start()

        for name in stats:
            self.log("stat['{key}'] = {value}".format(key=name,
                                                      value=stats[name]))
Exemplo n.º 51
0
def ceph_log(ctx, config):
    """
    Create /var/log/ceph log directory that is open to everyone.
    Add valgrind and profiling-logger directories.

    On startup, if ``ctx.config`` has a ``log-rotate`` entry, a logrotate
    configuration is written to every remote and a background greenlet is
    spawned that invokes logrotate on the whole cluster between 30 second
    waits.

    On exit, log rotation is stopped (if it was started) and, when
    ``ctx.archive`` is set, the remote logs are gzipped and pulled into
    ``<archive>/remote/<shortname>/log``.  Archiving is skipped only when
    ``archive-on-error`` is set and the run succeeded.

    :param ctx: Context
    :param config: Configuration
    """
    # Local import: only needed for the EEXIST check in the helper below.
    import errno

    def makedirs_if_missing(path):
        """Create ``path`` (and parents); an already existing path is fine."""
        try:
            os.makedirs(path)
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise

    log.info('Making ceph log dir writeable by non-root...')
    run.wait(
        ctx.cluster.run(
            args=[
                'sudo',
                'chmod',
                '777',
                '/var/log/ceph',
            ],
            wait=False,
        ))
    log.info('Disabling ceph logrotate...')
    run.wait(
        ctx.cluster.run(
            args=[
                'sudo',
                'rm',
                '-f',
                '--',
                '/etc/logrotate.d/ceph',
            ],
            wait=False,
        ))
    log.info('Creating extra log directories...')
    run.wait(
        ctx.cluster.run(
            args=[
                'sudo',
                'install',
                '-d',
                '-m0777',
                '--',
                '/var/log/ceph/valgrind',
                '/var/log/ceph/profiling-logger',
            ],
            wait=False,
        ))

    class Rotater(object):
        """Runs logrotate on the cluster in a greenlet until told to stop."""
        stop_event = gevent.event.Event()

        def invoke_logrotate(self):
            # 1) install ceph-test.conf in /etc/logrotate.d
            # 2) continuously loop over logrotate invocation with ceph-test.conf
            while not self.stop_event.is_set():
                # Doubles as the sleep between rotations; returns early as
                # soon as end() sets the event.
                self.stop_event.wait(timeout=30)
                run.wait(
                    ctx.cluster.run(
                        args=[
                            'sudo', 'logrotate',
                            '/etc/logrotate.d/ceph-test.conf'
                        ],
                        wait=False,
                    ))

        def begin(self):
            self.thread = gevent.spawn(self.invoke_logrotate)

        def end(self):
            self.stop_event.set()
            # get() re-raises any exception raised inside the greenlet.
            self.thread.get()

    def write_rotate_conf(ctx, daemons):
        """
        Render one logrotate stanza per daemon type and install the result
        as /etc/logrotate.d/ceph-test.conf on every remote.

        :param ctx: Context
        :param daemons: mapping of daemon type -> maximum log size
        """
        testdir = teuthology.get_testdir(ctx)
        rotate_conf_path = os.path.join(os.path.dirname(__file__),
                                        'logrotate.conf')
        # Read the template once instead of re-reading it for every daemon.
        with open(rotate_conf_path, 'rb') as f:
            template = f.read()

        conf = ""
        for daemon, size in daemons.iteritems():
            log.info('writing logrotate stanza for {daemon}'.format(
                daemon=daemon))
            conf += template.format(daemon_type=daemon, max_size=size)

        for remote in ctx.cluster.remotes.iterkeys():
            teuthology.write_file(
                remote=remote,
                path='{tdir}/logrotate.ceph-test.conf'.format(
                    tdir=testdir),
                data=StringIO(conf))
            # Written to the test dir first, then moved into place with
            # sudo and given root ownership and 0644 permissions.
            remote.run(args=[
                'sudo', 'mv', '{tdir}/logrotate.ceph-test.conf'.format(
                    tdir=testdir), '/etc/logrotate.d/ceph-test.conf',
                run.Raw('&&'), 'sudo', 'chmod', '0644',
                '/etc/logrotate.d/ceph-test.conf',
                run.Raw('&&'), 'sudo', 'chown', 'root.root',
                '/etc/logrotate.d/ceph-test.conf'
            ])
            remote.chcon('/etc/logrotate.d/ceph-test.conf',
                         'system_u:object_r:etc_t:s0')

    if ctx.config.get('log-rotate'):
        daemons = ctx.config.get('log-rotate')
        log.info('Setting up log rotation with ' + str(daemons))
        write_rotate_conf(ctx, daemons)
        logrotater = Rotater()
        logrotater.begin()
    try:
        yield

    finally:
        if ctx.config.get('log-rotate'):
            log.info('Shutting down logrotate')
            logrotater.end()
            ctx.cluster.run(
                args=['sudo', 'rm', '/etc/logrotate.d/ceph-test.conf'])
        if ctx.archive is not None and \
                not (ctx.config.get('archive-on-error') and ctx.summary['success']):
            # and logs
            log.info('Compressing logs...')
            run.wait(
                ctx.cluster.run(
                    args=[
                        'sudo',
                        'find',
                        '/var/log/ceph',
                        '-name',
                        '*.log',
                        '-print0',
                        run.Raw('|'),
                        'sudo',
                        'xargs',
                        '-0',
                        '--no-run-if-empty',
                        '--',
                        'gzip',
                        '--',
                    ],
                    wait=False,
                ), )

            log.info('Archiving logs...')
            path = os.path.join(ctx.archive, 'remote')
            # The archive directories may already exist; failing here would
            # mask the real test result and lose the logs.
            makedirs_if_missing(path)
            for remote in ctx.cluster.remotes.iterkeys():
                sub = os.path.join(path, remote.shortname)
                makedirs_if_missing(sub)
                teuthology.pull_directory(remote, '/var/log/ceph',
                                          os.path.join(sub, 'log'))
Exemplo n.º 52
0
def cluster(ctx, config):
    """
    Handle the creation and removal of a ceph cluster.

    On startup:
        Create directories needed for the cluster.
        Create remote journals for all osds.
        Create and set keyring.
        Copy the monmap to the test systems.
        Setup mon nodes.
        Setup mds nodes.
        Mkfs osd nodes.
        Add keyring information to monmaps
        Mkfs mon nodes.

    On exit:
        If errors occurred, extract a failure message and store in ctx.summary.
        Unmount all test files and temporary journaling files.
        Save the monitor information and archive all ceph logs.
        Cleanup the keyring setup, and remove all monitor map and data files left over.

    If ``use_existing_cluster`` is true in ``ctx.config``, nothing is created
    or cleaned up: the generator yields once and finishes.

    :param ctx: Context
    :param config: Configuration
    """
    if ctx.config.get('use_existing_cluster', False) is True:
        log.info("'use_existing_cluster' is true; skipping cluster creation")
        yield
        # Stop here: without this the generator would resume into cluster
        # creation on exit and hit a second yield.
        return

    testdir = teuthology.get_testdir(ctx)
    cluster_name = config['cluster']
    data_dir = '{tdir}/{cluster}.data'.format(tdir=testdir,
                                              cluster=cluster_name)
    log.info('Creating ceph cluster %s...', cluster_name)
    run.wait(
        ctx.cluster.run(
            args=[
                'install',
                '-d',
                '-m0755',
                '--',
                data_dir,
            ],
            wait=False,
        ))

    run.wait(
        ctx.cluster.run(
            args=[
                'sudo',
                'install',
                '-d',
                '-m0777',
                '--',
                '/var/run/ceph',
            ],
            wait=False,
        ))

    # remote -> list of mount points to unmount on exit
    devs_to_clean = {}
    remote_to_roles_to_devs = {}
    remote_to_roles_to_journals = {}
    osds = ctx.cluster.only(teuthology.is_type('osd', cluster_name))
    for remote, roles_for_host in osds.remotes.iteritems():
        devs = teuthology.get_scratch_devices(remote)
        roles_to_devs = {}
        roles_to_journals = {}
        if config.get('fs'):
            log.info('fs option selected, checking for scratch devs')
            log.info('found devs: %s' % (str(devs), ))
            devs_id_map = teuthology.get_wwn_id_map(remote, devs)
            iddevs = devs_id_map.values()
            roles_to_devs = assign_devs(
                teuthology.cluster_roles_of_type(roles_for_host, 'osd',
                                                 cluster_name), iddevs)
            # Keep only the devices that were not handed out as data devs.
            if len(roles_to_devs) < len(iddevs):
                iddevs = iddevs[len(roles_to_devs):]
            devs_to_clean[remote] = []

        if config.get('block_journal'):
            log.info('block journal enabled')
            # NOTE(review): iddevs is only bound when the 'fs' option is set;
            # 'block_journal' without 'fs' raises NameError here.
            roles_to_journals = assign_devs(
                teuthology.cluster_roles_of_type(roles_for_host, 'osd',
                                                 cluster_name), iddevs)
            log.info('journal map: %s', roles_to_journals)

        if config.get('tmpfs_journal'):
            log.info('tmpfs journal enabled')
            roles_to_journals = {}
            remote.run(args=['sudo', 'mount', '-t', 'tmpfs', 'tmpfs', '/mnt'])
            for role in teuthology.cluster_roles_of_type(
                    roles_for_host, 'osd', cluster_name):
                tmpfs = '/mnt/' + role
                roles_to_journals[role] = tmpfs
                remote.run(args=['truncate', '-s', '1500M', tmpfs])
            log.info('journal map: %s', roles_to_journals)

        log.info('dev map: %s' % (str(roles_to_devs), ))
        remote_to_roles_to_devs[remote] = roles_to_devs
        remote_to_roles_to_journals[remote] = roles_to_journals

    log.info('Generating config...')
    remotes_and_roles = ctx.cluster.remotes.items()
    roles = [role_list for (remote, role_list) in remotes_and_roles]
    ips = [
        host for (host, port) in (remote.ssh.get_transport().getpeername()
                                  for (remote, role_list) in remotes_and_roles)
    ]
    conf = teuthology.skeleton_config(ctx,
                                      roles=roles,
                                      ips=ips,
                                      cluster=cluster_name)
    for remote, roles_to_journals in remote_to_roles_to_journals.iteritems():
        for role, journal in roles_to_journals.iteritems():
            name = teuthology.ceph_role(role)
            if name not in conf:
                conf[name] = {}
            conf[name]['osd journal'] = journal
    # Overrides from the task configuration win over the skeleton config.
    for section, keys in config['conf'].iteritems():
        for key, value in keys.iteritems():
            log.info("[%s] %s = %s" % (section, key, value))
            if section not in conf:
                conf[section] = {}
            conf[section][key] = value

    if config.get('tmpfs_journal'):
        conf['journal dio'] = False

    if not hasattr(ctx, 'ceph'):
        ctx.ceph = {}
    ctx.ceph[cluster_name] = argparse.Namespace()
    ctx.ceph[cluster_name].conf = conf

    default_keyring = '/etc/ceph/{cluster}.keyring'.format(
        cluster=cluster_name)
    keyring_path = config.get('keyring_path', default_keyring)

    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)

    firstmon = teuthology.get_first_mon(ctx, config, cluster_name)

    log.info('Setting up %s...' % firstmon)
    ctx.cluster.only(firstmon).run(args=[
        'sudo',
        'adjust-ulimits',
        'ceph-coverage',
        coverage_dir,
        'ceph-authtool',
        '--create-keyring',
        keyring_path,
    ], )
    ctx.cluster.only(firstmon).run(args=[
        'sudo',
        'adjust-ulimits',
        'ceph-coverage',
        coverage_dir,
        'ceph-authtool',
        '--gen-key',
        '--name=mon.',
        keyring_path,
    ], )
    ctx.cluster.only(firstmon).run(args=[
        'sudo',
        'chmod',
        '0644',
        keyring_path,
    ], )
    (mon0_remote, ) = ctx.cluster.only(firstmon).remotes.keys()
    monmap_path = '{tdir}/{cluster}.monmap'.format(tdir=testdir,
                                                   cluster=cluster_name)
    fsid = teuthology.create_simple_monmap(
        ctx,
        remote=mon0_remote,
        conf=conf,
        path=monmap_path,
    )
    if 'global' not in conf:
        conf['global'] = {}
    conf['global']['fsid'] = fsid

    default_conf_path = '/etc/ceph/{cluster}.conf'.format(cluster=cluster_name)
    conf_path = config.get('conf_path', default_conf_path)
    log.info('Writing %s for FSID %s...' % (conf_path, fsid))
    write_conf(ctx, conf_path, cluster_name)

    log.info('Creating admin key on %s...' % firstmon)
    ctx.cluster.only(firstmon).run(args=[
        'sudo',
        'adjust-ulimits',
        'ceph-coverage',
        coverage_dir,
        'ceph-authtool',
        '--gen-key',
        '--name=client.admin',
        '--set-uid=0',
        '--cap',
        'mon',
        'allow *',
        '--cap',
        'osd',
        'allow *',
        '--cap',
        'mds',
        'allow *',
        keyring_path,
    ], )

    log.info('Copying monmap to all nodes...')
    keyring = teuthology.get_file(
        remote=mon0_remote,
        path=keyring_path,
    )
    monmap = teuthology.get_file(
        remote=mon0_remote,
        path=monmap_path,
    )

    for rem in ctx.cluster.remotes.iterkeys():
        # copy mon key and initial monmap
        log.info('Sending monmap to node {remote}'.format(remote=rem))
        teuthology.sudo_write_file(remote=rem,
                                   path=keyring_path,
                                   data=keyring,
                                   perms='0644')
        teuthology.write_file(
            remote=rem,
            path=monmap_path,
            data=monmap,
        )

    log.info('Setting up mon nodes...')
    mons = ctx.cluster.only(teuthology.is_type('mon', cluster_name))
    osdmap_path = '{tdir}/{cluster}.osdmap'.format(tdir=testdir,
                                                   cluster=cluster_name)
    run.wait(
        mons.run(
            args=[
                'adjust-ulimits',
                'ceph-coverage',
                coverage_dir,
                'osdmaptool',
                '-c',
                conf_path,
                '--clobber',
                '--createsimple',
                '{num:d}'.format(num=teuthology.num_instances_of_type(
                    ctx.cluster, 'osd', cluster_name), ),
                osdmap_path,
                '--pg_bits',
                '2',
                '--pgp_bits',
                '4',
            ],
            wait=False,
        ), )

    log.info('Setting up mds nodes...')
    mdss = ctx.cluster.only(teuthology.is_type('mds', cluster_name))
    for remote, roles_for_host in mdss.remotes.iteritems():
        for role in teuthology.cluster_roles_of_type(roles_for_host, 'mds',
                                                     cluster_name):
            _, _, id_ = teuthology.split_role(role)
            mds_dir = '/var/lib/ceph/mds/{cluster}-{id}'.format(
                cluster=cluster_name,
                id=id_,
            )
            remote.run(args=[
                'sudo',
                'mkdir',
                '-p',
                mds_dir,
                run.Raw('&&'),
                'sudo',
                'adjust-ulimits',
                'ceph-coverage',
                coverage_dir,
                'ceph-authtool',
                '--create-keyring',
                '--gen-key',
                '--name=mds.{id}'.format(id=id_),
                mds_dir + '/keyring',
            ], )

    cclient.create_keyring(ctx, cluster_name)
    log.info('Running mkfs on osd nodes...')

    # Make sure the shared disk bookkeeping on ctx exists before merging
    # this cluster's device and journal maps into it.
    if not hasattr(ctx, 'disk_config'):
        ctx.disk_config = argparse.Namespace()
    if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev'):
        ctx.disk_config.remote_to_roles_to_dev = {}
    if not hasattr(ctx.disk_config, 'remote_to_roles_to_journals'):
        ctx.disk_config.remote_to_roles_to_journals = {}
    if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev_mount_options'):
        ctx.disk_config.remote_to_roles_to_dev_mount_options = {}
    if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev_fstype'):
        ctx.disk_config.remote_to_roles_to_dev_fstype = {}

    teuthology.deep_merge(ctx.disk_config.remote_to_roles_to_dev,
                          remote_to_roles_to_devs)
    teuthology.deep_merge(ctx.disk_config.remote_to_roles_to_journals,
                          remote_to_roles_to_journals)

    log.info("ctx.disk_config.remote_to_roles_to_dev: {r}".format(
        r=str(ctx.disk_config.remote_to_roles_to_dev)))
    for remote, roles_for_host in osds.remotes.iteritems():
        roles_to_devs = remote_to_roles_to_devs[remote]
        roles_to_journals = remote_to_roles_to_journals[remote]

        for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd',
                                                     cluster_name):
            _, _, id_ = teuthology.split_role(role)
            mnt_point = '/var/lib/ceph/osd/{cluster}-{id}'.format(
                cluster=cluster_name, id=id_)
            remote.run(args=[
                'sudo',
                'mkdir',
                '-p',
                mnt_point,
            ])
            log.info(str(roles_to_journals))
            log.info(role)
            if roles_to_devs.get(role):
                dev = roles_to_devs[role]
                fs = config.get('fs')
                package = None
                mkfs_options = config.get('mkfs_options')
                mount_options = config.get('mount_options')
                # Per-filesystem defaults, used only where the task config
                # did not supply its own options.
                if fs == 'btrfs':
                    # package = 'btrfs-tools'
                    if mount_options is None:
                        mount_options = ['noatime', 'user_subvol_rm_allowed']
                    if mkfs_options is None:
                        mkfs_options = [
                            '-m', 'single', '-l', '32768', '-n', '32768'
                        ]
                if fs == 'xfs':
                    # package = 'xfsprogs'
                    if mount_options is None:
                        mount_options = ['noatime']
                    if mkfs_options is None:
                        mkfs_options = ['-f', '-i', 'size=2048']
                if fs == 'ext4' or fs == 'ext3':
                    if mount_options is None:
                        mount_options = ['noatime', 'user_xattr']

                if mount_options is None:
                    mount_options = []
                if mkfs_options is None:
                    mkfs_options = []
                mkfs = ['mkfs.%s' % fs] + mkfs_options
                log.info('%s on %s on %s' % (mkfs, dev, remote))
                if package is not None:
                    remote.run(
                        args=['sudo', 'apt-get', 'install', '-y', package],
                        stdout=StringIO(),
                    )

                try:
                    remote.run(args=['yes', run.Raw('|')] + ['sudo'] + mkfs +
                               [dev])
                except run.CommandFailedError:
                    # Newer btfs-tools doesn't prompt for overwrite, use -f.
                    # The flag belongs to mkfs, so look for it in the mkfs
                    # options (not the mount options) before adding it.
                    if '-f' not in mkfs_options:
                        mkfs_options.append('-f')
                        mkfs = ['mkfs.%s' % fs] + mkfs_options
                        log.info('%s on %s on %s' % (mkfs, dev, remote))
                    remote.run(args=['yes', run.Raw('|')] + ['sudo'] + mkfs +
                               [dev])

                log.info('mount %s on %s -o %s' %
                         (dev, remote, ','.join(mount_options)))
                remote.run(args=[
                    'sudo',
                    'mount',
                    '-t',
                    fs,
                    '-o',
                    ','.join(mount_options),
                    dev,
                    mnt_point,
                ])
                if remote not in ctx.disk_config.remote_to_roles_to_dev_mount_options:
                    ctx.disk_config.remote_to_roles_to_dev_mount_options[
                        remote] = {}
                ctx.disk_config.remote_to_roles_to_dev_mount_options[remote][
                    role] = mount_options
                if remote not in ctx.disk_config.remote_to_roles_to_dev_fstype:
                    ctx.disk_config.remote_to_roles_to_dev_fstype[remote] = {}
                ctx.disk_config.remote_to_roles_to_dev_fstype[remote][
                    role] = fs
                devs_to_clean[remote].append(mnt_point)

        for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd',
                                                     cluster_name):
            _, _, id_ = teuthology.split_role(role)
            remote.run(args=[
                'sudo',
                'MALLOC_CHECK_=3',
                'adjust-ulimits',
                'ceph-coverage',
                coverage_dir,
                'ceph-osd',
                '--cluster',
                cluster_name,
                '--mkfs',
                '--mkkey',
                '-i',
                id_,
                '--monmap',
                monmap_path,
            ], )

    log.info('Reading keys from all nodes...')
    # keys_fp accumulates the raw keyring data; keys remembers which
    # (type, id) each entry belongs to so caps can be set per entity below.
    keys_fp = StringIO()
    keys = []
    for remote, roles_for_host in ctx.cluster.remotes.iteritems():
        for type_ in ['mds', 'osd']:
            for role in teuthology.cluster_roles_of_type(
                    roles_for_host, type_, cluster_name):
                _, _, id_ = teuthology.split_role(role)
                data = teuthology.get_file(
                    remote=remote,
                    path='/var/lib/ceph/{type}/{cluster}-{id}/keyring'.format(
                        type=type_,
                        id=id_,
                        cluster=cluster_name,
                    ),
                    sudo=True,
                )
                keys.append((type_, id_, data))
                keys_fp.write(data)
    for remote, roles_for_host in ctx.cluster.remotes.iteritems():
        for role in teuthology.cluster_roles_of_type(roles_for_host, 'client',
                                                     cluster_name):
            _, _, id_ = teuthology.split_role(role)
            data = teuthology.get_file(
                remote=remote,
                path='/etc/ceph/{cluster}.client.{id}.keyring'.format(
                    id=id_, cluster=cluster_name))
            keys.append(('client', id_, data))
            keys_fp.write(data)

    log.info('Adding keys to all mons...')
    writes = mons.run(
        args=[
            'sudo',
            'tee',
            '-a',
            keyring_path,
        ],
        stdin=run.PIPE,
        wait=False,
        stdout=StringIO(),
    )
    keys_fp.seek(0)
    teuthology.feed_many_stdins_and_close(keys_fp, writes)
    run.wait(writes)
    for type_, id_, data in keys:
        run.wait(
            mons.run(
                args=[
                    'sudo',
                    'adjust-ulimits',
                    'ceph-coverage',
                    coverage_dir,
                    'ceph-authtool',
                    keyring_path,
                    '--name={type}.{id}'.format(
                        type=type_,
                        id=id_,
                    ),
                ] + list(teuthology.generate_caps(type_)),
                wait=False,
            ), )

    log.info('Running mkfs on mon nodes...')
    for remote, roles_for_host in mons.remotes.iteritems():
        for role in teuthology.cluster_roles_of_type(roles_for_host, 'mon',
                                                     cluster_name):
            _, _, id_ = teuthology.split_role(role)
            remote.run(args=[
                'sudo',
                'mkdir',
                '-p',
                '/var/lib/ceph/mon/{cluster}-{id}'.format(
                    id=id_, cluster=cluster_name),
            ], )
            remote.run(args=[
                'sudo',
                'adjust-ulimits',
                'ceph-coverage',
                coverage_dir,
                'ceph-mon',
                '--cluster',
                cluster_name,
                '--mkfs',
                '-i',
                id_,
                '--monmap',
                monmap_path,
                '--osdmap',
                osdmap_path,
                '--keyring',
                keyring_path,
            ], )

    run.wait(
        mons.run(
            args=[
                'rm',
                '--',
                monmap_path,
                osdmap_path,
            ],
            wait=False,
        ), )

    try:
        yield
    except Exception:
        # we need to know this below
        ctx.summary['success'] = False
        raise
    finally:
        (mon0_remote, ) = ctx.cluster.only(firstmon).remotes.keys()

        log.info('Checking cluster log for badness...')

        def first_in_ceph_log(pattern, excludes):
            """
            Find the first occurrence of the pattern specified in the Ceph log,
            Returns None if none found.

            :param pattern: Pattern scanned for.
            :param excludes: Patterns to ignore.
            :return: First line of text (or None if not found)
            """
            args = [
                'sudo',
                'egrep',
                pattern,
                '/var/log/ceph/{cluster}.log'.format(cluster=cluster_name),
            ]
            for exclude in excludes:
                args.extend([run.Raw('|'), 'egrep', '-v', exclude])
            args.extend([
                run.Raw('|'),
                'head',
                '-n',
                '1',
            ])
            r = mon0_remote.run(
                stdout=StringIO(),
                args=args,
            )
            stdout = r.stdout.getvalue()
            if stdout != '':
                return stdout
            return None

        if first_in_ceph_log(r'\[ERR\]|\[WRN\]|\[SEC\]',
                             config['log_whitelist']) is not None:
            log.warning('Found errors (ERR|WRN|SEC) in cluster log')
            ctx.summary['success'] = False
            # use the most severe problem as the failure reason
            if 'failure_reason' not in ctx.summary:
                for pattern in [r'\[SEC\]', r'\[ERR\]', r'\[WRN\]']:
                    match = first_in_ceph_log(pattern, config['log_whitelist'])
                    if match is not None:
                        ctx.summary['failure_reason'] = \
                            '"{match}" in cluster log'.format(
                                match=match.rstrip('\n'),
                            )
                        break

        for remote, dirs in devs_to_clean.iteritems():
            for dir_ in dirs:
                log.info('Unmounting %s on %s' % (dir_, remote))
                try:
                    remote.run(args=[
                        'sync',
                        run.Raw('&&'), 'sudo', 'umount', '-f', dir_
                    ])
                except Exception:
                    # Dump open files and the process tree to help find out
                    # what is keeping the mount busy, then re-raise with the
                    # original traceback.
                    remote.run(args=[
                        'sudo',
                        run.Raw('PATH=/usr/sbin:$PATH'),
                        'lsof',
                        run.Raw(';'),
                        'ps',
                        'auxf',
                    ])
                    raise

        if config.get('tmpfs_journal'):
            log.info('tmpfs journal enabled - unmounting tmpfs at /mnt')
            for remote, roles_for_host in osds.remotes.iteritems():
                remote.run(
                    args=['sudo', 'umount', '-f', '/mnt'],
                    check_status=False,
                )

        if ctx.archive is not None and \
                not (ctx.config.get('archive-on-error') and ctx.summary['success']):

            # archive mon data, too
            log.info('Archiving mon data...')
            path = os.path.join(ctx.archive, 'data')
            try:
                os.makedirs(path)
            except OSError as e:
                if e.errno == errno.EEXIST:
                    pass
                else:
                    raise
            for remote, roles in mons.remotes.iteritems():
                for role in roles:
                    is_mon = teuthology.is_type('mon', cluster_name)
                    if is_mon(role):
                        _, _, id_ = teuthology.split_role(role)
                        mon_dir = '/var/lib/ceph/mon/' + \
                                  '{0}-{1}'.format(cluster_name, id_)
                        teuthology.pull_directory_tarball(
                            remote, mon_dir, path + '/' + role + '.tgz')

        log.info('Cleaning ceph cluster...')
        run.wait(
            ctx.cluster.run(
                args=[
                    'sudo',
                    'rm',
                    '-rf',
                    '--',
                    conf_path,
                    keyring_path,
                    data_dir,
                    monmap_path,
                    osdmap_path,
                    run.Raw('{tdir}/../*.pid'.format(tdir=testdir)),
                ],
                wait=False,
            ), )
Exemplo n.º 53
0
def task(ctx, config):
    """
    Run radosbench

    The config should be as follows:

    radosbench:
        clients: [client list]
        time: <seconds to run>
        pool: <pool to use>
        size: write size to use
        concurrency: max number of outstanding writes (16)
        objectsize: object size to use
        unique_pool: use a unique pool, defaults to False
        ec_pool: create an ec pool, defaults to False
        create_pool: create pool, defaults to True
        erasure_code_profile:
          name: teuthologyprofile
          k: 2
          m: 1
          crush-failure-domain: osd
        cleanup: false (defaults to true)
        type: <write|seq|rand> (defaults to write)
    example:

    tasks:
    - ceph:
    - radosbench:
        clients: [client.0]
        time: 360
    - interactive:

    One ``rados bench`` process is started per client before yielding.  On
    exit the processes are joined and every pool created by this task is
    removed.
    """
    log.info('Beginning radosbench...')
    assert isinstance(config, dict), \
        "please list clients to run on"
    radosbench = {}
    # Names of the pools created below, in creation order and without
    # duplicates, so that all of them are removed on exit rather than only
    # the one used by the last client.
    created_pools = []

    testdir = teuthology.get_testdir(ctx)
    manager = ctx.managers['ceph']
    runtype = config.get('type', 'write')

    create_pool = config.get('create_pool', True)
    for role in config.get('clients', ['client.0']):
        assert isinstance(role, six.string_types)
        PREFIX = 'client.'
        assert role.startswith(PREFIX)
        id_ = role[len(PREFIX):]
        (remote, ) = ctx.cluster.only(role).remotes.keys()

        if config.get('ec_pool', False):
            profile = config.get('erasure_code_profile', {})
            profile_name = profile.get('name', 'teuthologyprofile')
            manager.create_erasure_code_profile(profile_name, profile)
        else:
            profile_name = None

        cleanup = []
        if not config.get('cleanup', True):
            cleanup = ['--no-cleanup']

        pool = config.get('pool', 'data')
        if create_pool:
            if pool != 'data':
                manager.create_pool(pool,
                                    erasure_code_profile_name=profile_name)
            else:
                # The default pool name means "make me a fresh pool".
                pool = manager.create_pool_with_unique_name(
                    erasure_code_profile_name=profile_name)
            if pool not in created_pools:
                created_pools.append(pool)

        concurrency = config.get('concurrency', 16)
        osize = config.get('objectsize', 65536)
        if osize == 0:
            objectsize = []
        else:
            objectsize = ['--object-size', str(osize)]
        size = ['-b', str(config.get('size', 65536))]
        # If doing a reading run then populate data
        if runtype != "write":
            remote.run(args=[
                "/bin/sh",
                "-c",
                " ".join(
                    [
                        'adjust-ulimits', 'ceph-coverage',
                        '{tdir}/archive/coverage', 'rados',
                        '--no-log-to-stderr', '--name', role
                    ] + size + objectsize + ['-t', str(concurrency)] +
                    ['-p', pool, 'bench',
                     str(60), "write", "--no-cleanup"]).format(tdir=testdir),
            ],
                       logger=log.getChild(
                           'radosbench.{id}'.format(id=id_)),
                       wait=True)
            # The size options are only passed to the populating write run;
            # the read run below gets neither.
            size = []
            objectsize = []

        # NOTE(review): '-t <concurrency>' is only passed to the populate
        # run above, not to this main run -- confirm whether that is
        # intended.
        proc = remote.run(args=[
            "/bin/sh",
            "-c",
            " ".join([
                'adjust-ulimits', 'ceph-coverage', '{tdir}/archive/coverage',
                'rados', '--no-log-to-stderr', '--name', role
            ] + size + objectsize + [
                '-p',
                pool,
                'bench',
                str(config.get('time', 360)),
                runtype,
            ] + cleanup).format(tdir=testdir),
        ],
                          logger=log.getChild(
                              'radosbench.{id}'.format(id=id_)),
                          stdin=run.PIPE,
                          wait=False)
        radosbench[id_] = proc

    try:
        yield
    finally:
        timeout = config.get('time', 360) * 30 + 300
        log.info('joining radosbench (timing out after %ss)', timeout)
        run.wait(radosbench.values(), timeout=timeout)

        # Remove every pool this task created.  Iterating the recorded names
        # also avoids touching the loop variable, which is unbound when the
        # client list is empty.
        for created in created_pools:
            manager.remove_pool(created)
Exemplo n.º 54
0
def build_ceph_cluster(ctx, config):
    """Build a ceph cluster with ceph-deploy, yield, then tear it down.

    This is a generator (presumably wrapped with
    ``contextlib.contextmanager`` -- the decorator is not visible here).
    Everything before the ``yield`` deploys the cluster by running
    ``./ceph-deploy`` commands on the ``mon.a`` remote; the teardown after
    it stops ceph, optionally archives mon data and logs into
    ``ctx.archive``, and purges packages and data from all nodes.

    :param ctx: run context. This function reads ``ctx.cluster``,
                ``ctx.config['branch']`` and ``ctx.archive``.
    :param config: task configuration mapping. Keys read here: ``branch``,
                   ``skip-mgr``, ``conf``, ``use-ceph-volume``,
                   ``filestore``, ``bluestore``, ``dmcrypt``,
                   ``test_mon_destroy``, ``wait-for-healthy``,
                   ``only_mon`` and ``keep_running``.
    :raises RuntimeError: when a mandatory ceph-deploy step exits non-zero,
                          when no monitor nodes are found, or when fewer
                          than two OSDs were created (or
                          ``wait-for-healthy`` is disabled) and
                          ``only_mon`` is not set.
    """

    # Expect to find ceph_admin on the first mon by ID, same place that the download task
    # puts it.  Remember this here, because subsequently IDs will change from those in
    # the test config to those that ceph-deploy invents.

    (ceph_admin, ) = ctx.cluster.only('mon.a').remotes.keys()

    def execute_ceph_deploy(cmd):
        """Remotely execute a ceph_deploy command and return its exit status.

        The command runs with check_status=False, so a failure is reported
        through the returned status rather than by raising.  ``testdir`` is
        picked up from the enclosing function at call time.
        """
        return ceph_admin.run(
            args=[
                'cd',
                '{tdir}/ceph-deploy'.format(tdir=testdir),
                run.Raw('&&'),
                run.Raw(cmd),
            ],
            check_status=False,
        ).exitstatus

    def ceph_disk_osd_create(ctx, config):
        """Zap the disks of every node, then create its OSD.

        Returns the number of OSDs created; raises RuntimeError as soon as
        a zap or create command fails.
        """
        node_dev_list = get_dev_for_osd(ctx, config)
        no_of_osds = 0
        for d in node_dev_list:
            # d is [node, disk, ...]; every disk after the node name is zapped
            node = d[0]
            for disk in d[1:]:
                zap = './ceph-deploy disk zap ' + node + ':' + disk
                estatus = execute_ceph_deploy(zap)
                if estatus != 0:
                    raise RuntimeError("ceph-deploy: Failed to zap osds")
            osd_create_cmd = './ceph-deploy osd create '
            # first check for filestore, default is bluestore with ceph-deploy
            if config.get('filestore') is not None:
                osd_create_cmd += '--filestore '
            elif config.get('bluestore') is not None:
                osd_create_cmd += '--bluestore '
            if config.get('dmcrypt') is not None:
                osd_create_cmd += '--dmcrypt '
            osd_create_cmd += ":".join(d)
            estatus_osd = execute_ceph_deploy(osd_create_cmd)
            if estatus_osd == 0:
                log.info('successfully created osd')
                no_of_osds += 1
            else:
                raise RuntimeError("ceph-deploy: Failed to create osds")
        return no_of_osds

    def ceph_volume_osd_create(ctx, config):
        """Create one OSD per osd role on each OSD node from scratch devices.

        Returns the number of OSDs created; raises RuntimeError when a
        create command fails or no journal device is left for filestore.
        """
        osds = ctx.cluster.only(teuthology.is_type('osd'))
        no_of_osds = 0
        for remote in osds.remotes.keys():
            # all devs should be lvm
            base_cmd = './ceph-deploy osd create --debug ' + remote.shortname + ' '
            # default is bluestore so we just need config item for filestore
            roles = ctx.cluster.remotes[remote]
            dev_needed = len(
                [role for role in roles if role.startswith('osd')])
            all_devs = teuthology.get_scratch_devices(remote)
            log.info("node={n}, need_devs={d}, available={a}".format(
                n=remote.shortname,
                d=dev_needed,
                a=all_devs,
            ))
            devs = all_devs[0:dev_needed]
            # rest of the devices can be used for journal if required
            jdevs = dev_needed
            for device in devs:
                device_split = device.split('/')
                lv_device = device_split[-2] + '/' + device_split[-1]
                # Build each device's command from the per-node prefix.
                # Appending to one shared string would leak the previous
                # device's --data/--journal flags into this command.
                if config.get('filestore') is not None:
                    osd_create_cmd = (base_cmd + '--filestore --data ' +
                                      lv_device + ' ')
                    # filestore with ceph-volume also needs journal disk
                    try:
                        # popping index dev_needed always yields the next
                        # spare device after the data devices
                        jdevice = all_devs.pop(jdevs)
                    except IndexError:
                        raise RuntimeError("No device available for \
                                            journal configuration")
                    jdevice_split = jdevice.split('/')
                    j_lv = jdevice_split[-2] + '/' + jdevice_split[-1]
                    osd_create_cmd += '--journal ' + j_lv
                else:
                    osd_create_cmd = base_cmd + ' --data ' + lv_device
                estatus_osd = execute_ceph_deploy(osd_create_cmd)
                if estatus_osd == 0:
                    log.info('successfully created osd')
                    no_of_osds += 1
                else:
                    raise RuntimeError("ceph-deploy: Failed to create osds")
        return no_of_osds

    def teardown_cluster():
        """Stop ceph, archive mon data and logs, then purge all nodes."""
        log.info('Stopping ceph...')
        ctx.cluster.run(args=['sudo', 'systemctl', 'stop', 'ceph.target'],
                        check_status=False)
        time.sleep(4)

        # and now just check for the processes themselves, as if upstart/sysvinit
        # is lying to us. Ignore errors if the grep fails
        ctx.cluster.run(args=[
            'sudo', 'ps', 'aux',
            run.Raw('|'), 'grep', '-v', 'grep',
            run.Raw('|'), 'grep', 'ceph'
        ],
                        check_status=False)
        ctx.cluster.run(
            args=['sudo', 'systemctl',
                  run.Raw('|'), 'grep', 'ceph'],
            check_status=False)

        if ctx.archive is not None:
            # archive mon data, too
            log.info('Archiving mon data...')
            path = os.path.join(ctx.archive, 'data')
            os.makedirs(path)
            mons = ctx.cluster.only(teuthology.is_type('mon'))
            for remote, roles in mons.remotes.items():
                for role in roles:
                    if role.startswith('mon.'):
                        teuthology.pull_directory_tarball(
                            remote, '/var/lib/ceph/mon',
                            path + '/' + role + '.tgz')

            log.info('Compressing logs...')
            run.wait(
                ctx.cluster.run(
                    args=[
                        'sudo',
                        'find',
                        '/var/log/ceph',
                        '-name',
                        '*.log',
                        '-print0',
                        run.Raw('|'),
                        'sudo',
                        'xargs',
                        '-0',
                        '--no-run-if-empty',
                        '--',
                        'gzip',
                        '--',
                    ],
                    wait=False,
                ), )

            log.info('Archiving logs...')
            path = os.path.join(ctx.archive, 'remote')
            os.makedirs(path)
            for remote in ctx.cluster.remotes.keys():
                sub = os.path.join(path, remote.shortname)
                os.makedirs(sub)
                teuthology.pull_directory(remote, '/var/log/ceph',
                                          os.path.join(sub, 'log'))

        # Recomputed here so the purge works even if the try block failed
        # before these were defined.
        all_nodes = get_all_nodes(ctx, config)
        purge_nodes = './ceph-deploy purge' + " " + all_nodes
        purgedata_nodes = './ceph-deploy purgedata' + " " + all_nodes

        log.info('Purging package...')
        execute_ceph_deploy(purge_nodes)
        log.info('Purging data...')
        execute_ceph_deploy(purgedata_nodes)

    try:
        log.info('Building ceph cluster using ceph-deploy...')
        testdir = teuthology.get_testdir(ctx)
        ceph_branch = None
        if config.get('branch') is not None:
            cbranch = config.get('branch')
            # if several entries are given, the last one iterated wins
            for var, val in cbranch.items():
                ceph_branch = '--{var}={val}'.format(var=var, val=val)
        all_nodes = get_all_nodes(ctx, config)
        mds_nodes = get_nodes_using_role(ctx, 'mds')
        mds_nodes = " ".join(mds_nodes)
        mon_node = get_nodes_using_role(ctx, 'mon')
        mon_nodes = " ".join(mon_node)
        # skip mgr based on config item
        # this is needed when test uses latest code to install old ceph
        # versions
        skip_mgr = config.get('skip-mgr', False)
        if not skip_mgr:
            mgr_nodes = get_nodes_using_role(ctx, 'mgr')
            mgr_nodes = " ".join(mgr_nodes)
        new_mon = './ceph-deploy new' + " " + mon_nodes
        if not skip_mgr:
            mgr_create = './ceph-deploy mgr create' + " " + mgr_nodes
        mon_hostname = mon_nodes.split(' ')[0]
        mon_hostname = str(mon_hostname)
        gather_keys = './ceph-deploy gatherkeys' + " " + mon_hostname
        deploy_mds = './ceph-deploy mds create' + " " + mds_nodes

        # " ".join() never returns None, so test for an empty string
        if not mon_nodes:
            raise RuntimeError("no monitor nodes in the config file")

        estatus_new = execute_ceph_deploy(new_mon)
        if estatus_new != 0:
            raise RuntimeError("ceph-deploy: new command failed")

        log.info('adding config inputs...')
        testdir = teuthology.get_testdir(ctx)
        conf_path = '{tdir}/ceph-deploy/ceph.conf'.format(tdir=testdir)

        if config.get('conf') is not None:
            confp = config.get('conf')
            # append each configured section header followed by its keys
            for section, keys in confp.items():
                lines = '[{section}]\n'.format(section=section)
                teuthology.append_lines_to_file(ceph_admin,
                                                conf_path,
                                                lines,
                                                sudo=True)
                for key, value in keys.items():
                    log.info("[%s] %s = %s" % (section, key, value))
                    lines = '{key} = {value}\n'.format(key=key, value=value)
                    teuthology.append_lines_to_file(ceph_admin,
                                                    conf_path,
                                                    lines,
                                                    sudo=True)

        # install ceph
        dev_branch = ctx.config['branch']
        branch = '--dev={branch}'.format(branch=dev_branch)
        # an explicit task-level branch overrides the run-level dev branch
        if ceph_branch:
            option = ceph_branch
        else:
            option = branch
        install_nodes = './ceph-deploy install ' + option + " " + all_nodes
        estatus_install = execute_ceph_deploy(install_nodes)
        if estatus_install != 0:
            raise RuntimeError("ceph-deploy: Failed to install ceph")
        # install ceph-test package too
        install_nodes2 = './ceph-deploy install --tests ' + option + \
                         " " + all_nodes
        estatus_install = execute_ceph_deploy(install_nodes2)
        if estatus_install != 0:
            raise RuntimeError("ceph-deploy: Failed to install ceph-test")

        mon_create_nodes = './ceph-deploy mon create-initial'
        # If the following fails, it is OK, it might just be that the monitors
        # are taking way more than a minute/monitor to form quorum, so lets
        # try the next block which will wait up to 15 minutes to gatherkeys.
        execute_ceph_deploy(mon_create_nodes)

        estatus_gather = execute_ceph_deploy(gather_keys)
        if estatus_gather != 0:
            raise RuntimeError("ceph-deploy: Failed during gather keys")

        # install admin key on mons (ceph-create-keys doesn't do this any more)
        mons = ctx.cluster.only(teuthology.is_type('mon'))
        for remote in mons.remotes.keys():
            execute_ceph_deploy('./ceph-deploy admin ' + remote.shortname)

        # create osd's
        if config.get('use-ceph-volume', False):
            no_of_osds = ceph_volume_osd_create(ctx, config)
        else:
            # this method will only work with ceph-deploy v1.5.39 or older
            no_of_osds = ceph_disk_osd_create(ctx, config)

        if not skip_mgr:
            execute_ceph_deploy(mgr_create)

        if mds_nodes:
            estatus_mds = execute_ceph_deploy(deploy_mds)
            if estatus_mds != 0:
                raise RuntimeError("ceph-deploy: Failed to deploy mds")

        if config.get('test_mon_destroy') is not None:
            # destroy every monitor except the first one
            for d in range(1, len(mon_node)):
                mon_destroy_nodes = './ceph-deploy mon destroy' + \
                    " " + mon_node[d]
                estatus_mon_d = execute_ceph_deploy(mon_destroy_nodes)
                if estatus_mon_d != 0:
                    raise RuntimeError("ceph-deploy: Failed to delete monitor")

        if config.get('wait-for-healthy', True) and no_of_osds >= 2:
            is_healthy(ctx=ctx, config=None)

            log.info('Setting up client nodes...')
            conf_path = '/etc/ceph/ceph.conf'
            admin_keyring_path = '/etc/ceph/ceph.client.admin.keyring'
            first_mon = teuthology.get_first_mon(ctx, config)
            (mon0_remote, ) = ctx.cluster.only(first_mon).remotes.keys()
            conf_data = teuthology.get_file(
                remote=mon0_remote,
                path=conf_path,
                sudo=True,
            )
            admin_keyring = teuthology.get_file(
                remote=mon0_remote,
                path=admin_keyring_path,
                sudo=True,
            )

            clients = ctx.cluster.only(teuthology.is_type('client'))
            for remot, roles_for_host in clients.remotes.items():
                for id_ in teuthology.roles_of_type(roles_for_host, 'client'):
                    client_keyring = \
                        '/etc/ceph/ceph.client.{id}.keyring'.format(id=id_)
                    # create the client key on the first mon, then copy the
                    # key, the admin keyring and ceph.conf to the client host
                    mon0_remote.run(args=[
                        'cd',
                        '{tdir}'.format(tdir=testdir),
                        run.Raw('&&'),
                        'sudo',
                        'bash',
                        '-c',
                        run.Raw('"'),
                        'ceph',
                        'auth',
                        'get-or-create',
                        'client.{id}'.format(id=id_),
                        'mds',
                        'allow',
                        'mon',
                        'allow *',
                        'osd',
                        'allow *',
                        run.Raw('>'),
                        client_keyring,
                        run.Raw('"'),
                    ], )
                    key_data = teuthology.get_file(
                        remote=mon0_remote,
                        path=client_keyring,
                        sudo=True,
                    )
                    teuthology.sudo_write_file(remote=remot,
                                               path=client_keyring,
                                               data=key_data,
                                               perms='0644')
                    teuthology.sudo_write_file(remote=remot,
                                               path=admin_keyring_path,
                                               data=admin_keyring,
                                               perms='0644')
                    teuthology.sudo_write_file(remote=remot,
                                               path=conf_path,
                                               data=conf_data,
                                               perms='0644')

            if mds_nodes:
                log.info('Configuring CephFS...')
                Filesystem(ctx, create=True)
        elif not config.get('only_mon'):
            raise RuntimeError(
                "The cluster is NOT operational due to insufficient OSDs")
        # create rbd pool
        ceph_admin.run(args=[
            'sudo', 'ceph', '--cluster', 'ceph', 'osd', 'pool', 'create',
            'rbd', '128', '128'
        ],
                       check_status=False)
        ceph_admin.run(args=[
            'sudo', 'ceph', '--cluster', 'ceph', 'osd', 'pool', 'application',
            'enable', 'rbd', 'rbd', '--yes-i-really-mean-it'
        ],
                       check_status=False)
        yield

    except Exception:
        log.info(
            "Error encountered, logging exception before tearing down ceph-deploy"
        )
        log.info(traceback.format_exc())
        raise
    finally:
        # Skip the teardown when asked to keep the cluster running.  This is
        # deliberately not a `return`: returning from a finally block would
        # discard the exception re-raised above and hide the failure.
        if not config.get('keep_running'):
            teardown_cluster()
Exemplo n.º 55
0
def install_hadoop(ctx, config):
    """Install Hadoop on the Hadoop nodes, yield, then remove it again.

    Setup downloads and unpacks the Hadoop tarball below the test
    directory and creates a temporary directory next to it.  Unless
    ``config['hdfs']`` is truthy, the cephfs-hadoop jar, the libcephfs JNI
    library and libcephfs.jar are also placed into the Hadoop tree.
    ``configure()`` is called last.  After the ``yield`` the install and
    temporary directories are deleted on every node.

    :raises UnsupportedPackageTypeError: for a node whose package type is
                                         neither ``rpm`` nor ``deb``.
    """
    base_dir = teuthology.get_testdir(ctx)

    def on_all_nodes(argv):
        # Start argv on every selected node without blocking, then wait for
        # all of the started processes to finish.
        run.wait(hadoop_nodes.run(args=argv, wait=False))

    log.info("Downloading Hadoop...")
    tarball = "{tdir}/hadoop.tar.gz".format(tdir=base_dir)
    hadoop_nodes = ctx.cluster.only(is_hadoop_type(''))
    on_all_nodes(['wget', '-nv', '-O', tarball, HADOOP_2x_URL])

    log.info("Create directory for Hadoop install...")
    install_dir = "{tdir}/hadoop".format(tdir=base_dir)
    on_all_nodes(['mkdir', install_dir])

    log.info("Unpacking Hadoop...")
    on_all_nodes(
        ['tar', 'xzf', tarball, '--strip-components=1', '-C', install_dir])

    log.info("Removing Hadoop download...")
    on_all_nodes(['rm', tarball])

    log.info("Create Hadoop temporary directory...")
    scratch_dir = "{tdir}/hadoop_tmp".format(tdir=base_dir)
    on_all_nodes(['mkdir', scratch_dir])

    use_hdfs = config.get('hdfs', False)
    if not use_hdfs:
        log.info("Fetching cephfs-hadoop...")

        # only the URL half of the returned pair is needed below
        _sha1, jar_base_url = teuthology.get_ceph_binary_url(
            package="hadoop",
            format="jar",
            dist="precise",
            arch="x86_64",
            flavor="basic",
            branch="master")

        downloaded_jar = "{tdir}/cephfs-hadoop.jar".format(
            tdir=base_dir)  # FIXME
        remote_jar = jar_base_url + "/cephfs-hadoop-0.80.6.jar"  # FIXME
        common_dir = "{tdir}/hadoop/share/hadoop/common/".format(
            tdir=base_dir)

        on_all_nodes(['wget', '-nv', '-O', downloaded_jar, remote_jar])
        on_all_nodes(['mv', downloaded_jar, common_dir])

        # Copy JNI native bits. Need to do this explicitly because the
        # handling is dependent on the os-type.
        jni_source_by_pkg = {
            'rpm': "/usr/lib64/libcephfs_jni.so.1.0.0",
            'deb': "/usr/lib/jni/libcephfs_jni.so",
        }
        jni_target = "{tdir}/hadoop/lib/native/{fname}".format(
            tdir=base_dir, fname="libcephfs_jni.so")
        for node in hadoop_nodes.remotes:
            pkg_type = node.os.package_type
            if pkg_type not in jni_source_by_pkg:
                raise UnsupportedPackageTypeError(node)
            node.run(args=['cp', jni_source_by_pkg[pkg_type], jni_target])

        on_all_nodes(['cp', "/usr/share/java/libcephfs.jar", common_dir])

    configure(ctx, config, hadoop_nodes)

    try:
        yield
    finally:
        # always remove what was installed, even if the body failed
        on_all_nodes(['rm', '-rf', install_dir, scratch_dir])
Exemplo n.º 56
0
def build_ceph_cluster(ctx, config):
    """Build a ceph cluster with ceph-deploy, yield, then tear it down.

    This is a generator (presumably wrapped with
    ``contextlib.contextmanager`` -- the decorator is not visible here).
    Before the ``yield`` it runs the ceph-deploy ``new``, ``install``,
    ``mon create-initial``, ``gatherkeys``, optional ``mds create`` /
    ``mon destroy`` and ``osd create`` steps and distributes keyrings and
    ceph.conf to the client hosts.  The ``finally`` block stops ceph,
    optionally archives mon data and logs into ``ctx.archive``, and purges
    packages and data from all nodes.

    :param ctx: run context; ``ctx.cluster`` and ``ctx.archive`` are read.
    :param config: task configuration mapping. Keys read here: ``branch``,
                   ``conf``, ``test_mon_destroy``, ``dmcrypt`` and
                   ``wait-for-healthy``.
    :raises RuntimeError: when a mandatory ceph-deploy step fails, when no
                          monitor nodes are found, when gatherkeys keeps
                          failing, or when fewer than two OSDs were created
                          (or ``wait-for-healthy`` is disabled).
    """

    try:
        log.info('Building ceph cluster using ceph-deploy...')
        testdir = teuthology.get_testdir(ctx)
        ceph_branch = None
        if config.get('branch') is not None:
            cbranch = config.get('branch')
            for var, val in cbranch.iteritems():
                if var == 'testing':
                    # --testing is a bare flag and takes no value
                    ceph_branch = '--{var}'.format(var=var)
                else:
                    ceph_branch = '--{var}={val}'.format(var=var, val=val)
        all_nodes = get_all_nodes(ctx, config)
        mds_nodes = get_nodes_using_roles(ctx, config, 'mds')
        mds_nodes = " ".join(mds_nodes)
        mon_node = get_nodes_using_roles(ctx, config, 'mon')
        mon_nodes = " ".join(mon_node)
        new_mon = './ceph-deploy new'+" "+mon_nodes
        # Only pass a version flag when a branch was configured;
        # concatenating None here would raise TypeError.
        install_nodes = './ceph-deploy install '
        if ceph_branch:
            install_nodes += ceph_branch+" "
        install_nodes += all_nodes
        mon_hostname = mon_nodes.split(' ')[0]
        mon_hostname = str(mon_hostname)
        gather_keys = './ceph-deploy gatherkeys'+" "+mon_hostname
        deploy_mds = './ceph-deploy mds create'+" "+mds_nodes
        no_of_osds = 0

        # " ".join() never returns None, so test for an empty string
        if not mon_nodes:
            raise RuntimeError("no monitor nodes in the config file")

        estatus_new = execute_ceph_deploy(ctx, config, new_mon)
        if estatus_new != 0:
            raise RuntimeError("ceph-deploy: new command failed")

        log.info('adding config inputs...')
        testdir = teuthology.get_testdir(ctx)
        conf_path = '{tdir}/ceph-deploy/ceph.conf'.format(tdir=testdir)
        first_mon = teuthology.get_first_mon(ctx, config)
        (remote,) = ctx.cluster.only(first_mon).remotes.keys()

        if config.get('conf') is not None:
            confp = config.get('conf')
            # append each configured section header followed by its keys
            for section, keys in confp.iteritems():
                lines = '[{section}]\n'.format(section=section)
                teuthology.append_lines_to_file(remote, conf_path, lines,
                                                sudo=True)
                for key, value in keys.iteritems():
                    log.info("[%s] %s = %s" % (section, key, value))
                    lines = '{key} = {value}\n'.format(key=key, value=value)
                    teuthology.append_lines_to_file(remote, conf_path, lines,
                                                    sudo=True)

        estatus_install = execute_ceph_deploy(ctx, config, install_nodes)
        if estatus_install != 0:
            raise RuntimeError("ceph-deploy: Failed to install ceph")

        mon_create_nodes = './ceph-deploy mon create-initial'
        # If the following fails, it is OK, it might just be that the monitors
        # are taking way more than a minute/monitor to form quorum, so lets
        # try the next block which will wait up to 15 minutes to gatherkeys.
        execute_ceph_deploy(ctx, config, mon_create_nodes)

        estatus_gather = execute_ceph_deploy(ctx, config, gather_keys)
        max_gather_tries = 90
        gather_tries = 0
        while (estatus_gather != 0):
            gather_tries += 1
            if gather_tries >= max_gather_tries:
                msg = 'ceph-deploy was not able to gatherkeys after 15 minutes'
                raise RuntimeError(msg)
            # wait before retrying, so a successful attempt is not followed
            # by a pointless sleep
            time.sleep(10)
            estatus_gather = execute_ceph_deploy(ctx, config, gather_keys)

        if mds_nodes:
            estatus_mds = execute_ceph_deploy(ctx, config, deploy_mds)
            if estatus_mds != 0:
                raise RuntimeError("ceph-deploy: Failed to deploy mds")

        if config.get('test_mon_destroy') is not None:
            # destroy every monitor except the first one
            for d in range(1, len(mon_node)):
                mon_destroy_nodes = './ceph-deploy mon destroy'+" "+mon_node[d]
                estatus_mon_d = execute_ceph_deploy(ctx, config,
                                                    mon_destroy_nodes)
                if estatus_mon_d != 0:
                    raise RuntimeError("ceph-deploy: Failed to delete monitor")

        node_dev_list = get_dev_for_osd(ctx, config)
        osd_create_cmd = './ceph-deploy osd create --zap-disk '
        for d in node_dev_list:
            if config.get('dmcrypt') is not None:
                osd_create_cmd_d = osd_create_cmd+'--dmcrypt'+" "+d
            else:
                osd_create_cmd_d = osd_create_cmd+d
            estatus_osd = execute_ceph_deploy(ctx, config, osd_create_cmd_d)
            if estatus_osd == 0:
                log.info('successfully created osd')
                no_of_osds += 1
            else:
                # First attempt failed: zap the data and journal disks
                # explicitly and retry once.  d is split on ':' and its
                # first three fields are used as node, data disk, journal.
                disks = d.split(':')
                dev_disk = disks[0]+":"+disks[1]
                j_disk = disks[0]+":"+disks[2]
                zap_disk = './ceph-deploy disk zap '+dev_disk+" "+j_disk
                execute_ceph_deploy(ctx, config, zap_disk)
                estatus_osd = execute_ceph_deploy(ctx, config, osd_create_cmd_d)
                if estatus_osd == 0:
                    log.info('successfully created osd')
                    no_of_osds += 1
                else:
                    raise RuntimeError("ceph-deploy: Failed to create osds")

        if config.get('wait-for-healthy', True) and no_of_osds >= 2:
            is_healthy(ctx=ctx, config=None)

            log.info('Setting up client nodes...')
            conf_path = '/etc/ceph/ceph.conf'
            admin_keyring_path = '/etc/ceph/ceph.client.admin.keyring'
            first_mon = teuthology.get_first_mon(ctx, config)
            (mon0_remote,) = ctx.cluster.only(first_mon).remotes.keys()
            conf_data = teuthology.get_file(
                remote=mon0_remote,
                path=conf_path,
                sudo=True,
                )
            admin_keyring = teuthology.get_file(
                remote=mon0_remote,
                path=admin_keyring_path,
                sudo=True,
                )

            clients = ctx.cluster.only(teuthology.is_type('client'))
            for remot, roles_for_host in clients.remotes.iteritems():
                for id_ in teuthology.roles_of_type(roles_for_host, 'client'):
                    client_keyring = \
                        '/etc/ceph/ceph.client.{id}.keyring'.format(id=id_)
                    # create the client key on the first mon, then copy the
                    # key, the admin keyring and ceph.conf to the client host
                    mon0_remote.run(
                        args=[
                            'cd',
                            '{tdir}'.format(tdir=testdir),
                            run.Raw('&&'),
                            'sudo', 'bash', '-c',
                            run.Raw('"'), 'ceph',
                            'auth',
                            'get-or-create',
                            'client.{id}'.format(id=id_),
                            'mds', 'allow',
                            'mon', 'allow *',
                            'osd', 'allow *',
                            run.Raw('>'),
                            client_keyring,
                            run.Raw('"'),
                            ],
                        )
                    key_data = teuthology.get_file(
                        remote=mon0_remote,
                        path=client_keyring,
                        sudo=True,
                        )
                    teuthology.sudo_write_file(
                        remote=remot,
                        path=client_keyring,
                        data=key_data,
                        perms='0644'
                    )
                    teuthology.sudo_write_file(
                        remote=remot,
                        path=admin_keyring_path,
                        data=admin_keyring,
                        perms='0644'
                    )
                    teuthology.sudo_write_file(
                        remote=remot,
                        path=conf_path,
                        data=conf_data,
                        perms='0644'
                    )
        else:
            raise RuntimeError(
                "The cluster is NOT operational due to insufficient OSDs")
        yield

    finally:
        log.info('Stopping ceph...')
        ctx.cluster.run(args=['sudo', 'stop', 'ceph-all', run.Raw('||'),
                              'sudo', 'service', 'ceph', 'stop' ])

        # Are you really not running anymore?
        # try first with the init tooling
        # ignoring the status so this becomes informational only
        ctx.cluster.run(args=['sudo', 'status', 'ceph-all', run.Raw('||'),
                              'sudo', 'service',  'ceph', 'status'],
                              check_status=False)

        # and now just check for the processes themselves, as if upstart/sysvinit
        # is lying to us. Ignore errors if the grep fails
        ctx.cluster.run(args=['sudo', 'ps', 'aux', run.Raw('|'),
                              'grep', '-v', 'grep', run.Raw('|'),
                              'grep', 'ceph'], check_status=False)

        if ctx.archive is not None:
            # archive mon data, too
            log.info('Archiving mon data...')
            path = os.path.join(ctx.archive, 'data')
            os.makedirs(path)
            mons = ctx.cluster.only(teuthology.is_type('mon'))
            for remote, roles in mons.remotes.iteritems():
                for role in roles:
                    if role.startswith('mon.'):
                        teuthology.pull_directory_tarball(
                            remote,
                            '/var/lib/ceph/mon',
                            path + '/' + role + '.tgz')

            log.info('Compressing logs...')
            run.wait(
                ctx.cluster.run(
                    args=[
                        'sudo',
                        'find',
                        '/var/log/ceph',
                        '-name',
                        '*.log',
                        '-print0',
                        run.Raw('|'),
                        'sudo',
                        'xargs',
                        '-0',
                        '--no-run-if-empty',
                        '--',
                        'gzip',
                        '--',
                        ],
                    wait=False,
                    ),
                )

            log.info('Archiving logs...')
            path = os.path.join(ctx.archive, 'remote')
            os.makedirs(path)
            for remote in ctx.cluster.remotes.iterkeys():
                sub = os.path.join(path, remote.shortname)
                os.makedirs(sub)
                teuthology.pull_directory(remote, '/var/log/ceph',
                                          os.path.join(sub, 'log'))

        # Computed here rather than in the try block so the purge still
        # works when the try block failed early.
        all_nodes = get_all_nodes(ctx, config)
        purge_nodes = './ceph-deploy purge'+" "+all_nodes
        purgedata_nodes = './ceph-deploy purgedata'+" "+all_nodes

        log.info('Purging package...')
        execute_ceph_deploy(ctx, config, purge_nodes)
        log.info('Purging data...')
        execute_ceph_deploy(ctx, config, purgedata_nodes)
Exemplo n.º 57
0
def build_ceph_cluster(ctx, config):
    """
    Build a ceph cluster with ceph-deploy, yield, then tear it down.

    Setup runs ceph-deploy from ``{testdir}/ceph-deploy`` on the remote that
    holds the first monitor: ``new``, ``install`` (and ``install --tests``),
    ``mon create-initial``, ``mgr create``, ``gatherkeys``, optionally
    ``mds create`` and ``mon destroy``, then ``disk zap`` / ``osd create``
    for every entry returned by ``get_dev_for_osd``.  When at least two OSDs
    were created and ``wait-for-healthy`` is not disabled, the function waits
    for health, distributes ceph.conf and keyrings to the client remotes and
    creates a filesystem if there are mds nodes.

    Teardown (skipped when ``keep_running`` is set) stops ceph, archives
    monitor data and logs when ``ctx.archive`` is set, and purges packages
    and data from all nodes.

    Keys read from ``config``: ``branch``, ``conf``, ``test_mon_destroy``,
    ``dmcrypt``, ``wait-for-healthy``, ``only_mon`` and ``keep_running``.

    :param ctx: Context
    :param config: Configuration
    :raises RuntimeError: if there are no monitor nodes, if a required
                          ceph-deploy step exits non-zero, or if too few
                          OSDs were created for an operational cluster
    """

    # Expect to find ceph_admin on the first mon by ID, same place that the download task
    # puts it.  Remember this here, because subsequently IDs will change from those in
    # the test config to those that ceph-deploy invents.
    (ceph_admin,) = ctx.cluster.only(
        teuthology.get_first_mon(ctx, config)).remotes.iterkeys()

    # Resolved before the helpers are defined: execute_ceph_deploy() is also
    # used by the teardown, which may run after an early setup failure.
    testdir = teuthology.get_testdir(ctx)

    def execute_ceph_deploy(cmd):
        """Run a ceph-deploy command on the admin remote, return its exit status"""
        return ceph_admin.run(
            args=[
                'cd',
                '{tdir}/ceph-deploy'.format(tdir=testdir),
                run.Raw('&&'),
                run.Raw(cmd),
            ],
            check_status=False,
        ).exitstatus

    def teardown():
        """Stop ceph, archive mon data and logs, then purge all nodes"""
        log.info('Stopping ceph...')
        ctx.cluster.run(args=['sudo', 'stop', 'ceph-all', run.Raw('||'),
                              'sudo', 'service', 'ceph', 'stop', run.Raw('||'),
                              'sudo', 'systemctl', 'stop', 'ceph.target'])

        # Are you really not running anymore?
        # try first with the init tooling
        # ignoring the status so this becomes informational only
        ctx.cluster.run(
            args=[
                'sudo', 'status', 'ceph-all', run.Raw('||'),
                'sudo', 'service', 'ceph', 'status', run.Raw('||'),
                'sudo', 'systemctl', 'status', 'ceph.target'],
            check_status=False)

        # and now just check for the processes themselves, as if upstart/sysvinit
        # is lying to us. Ignore errors if the grep fails
        ctx.cluster.run(args=['sudo', 'ps', 'aux', run.Raw('|'),
                              'grep', '-v', 'grep', run.Raw('|'),
                              'grep', 'ceph'], check_status=False)

        if ctx.archive is not None:
            # archive mon data, too
            log.info('Archiving mon data...')
            path = os.path.join(ctx.archive, 'data')
            os.makedirs(path)
            mons = ctx.cluster.only(teuthology.is_type('mon'))
            for remote, roles in mons.remotes.iteritems():
                for role in roles:
                    if role.startswith('mon.'):
                        teuthology.pull_directory_tarball(
                            remote,
                            '/var/lib/ceph/mon',
                            path + '/' + role + '.tgz')

            log.info('Compressing logs...')
            run.wait(
                ctx.cluster.run(
                    args=[
                        'sudo',
                        'find',
                        '/var/log/ceph',
                        '-name',
                        '*.log',
                        '-print0',
                        run.Raw('|'),
                        'sudo',
                        'xargs',
                        '-0',
                        '--no-run-if-empty',
                        '--',
                        'gzip',
                        '--',
                    ],
                    wait=False,
                ),
            )

            log.info('Archiving logs...')
            path = os.path.join(ctx.archive, 'remote')
            os.makedirs(path)
            for remote in ctx.cluster.remotes.iterkeys():
                sub = os.path.join(path, remote.shortname)
                os.makedirs(sub)
                teuthology.pull_directory(remote, '/var/log/ceph',
                                          os.path.join(sub, 'log'))

        # Recomputed here so the purge does not depend on how far setup got
        all_nodes = get_all_nodes(ctx, config)
        purge_nodes = './ceph-deploy purge' + " " + all_nodes
        purgedata_nodes = './ceph-deploy purgedata' + " " + all_nodes

        log.info('Purging package...')
        execute_ceph_deploy(purge_nodes)
        log.info('Purging data...')
        execute_ceph_deploy(purgedata_nodes)

    try:
        log.info('Building ceph cluster using ceph-deploy...')
        ceph_branch = None
        if config.get('branch') is not None:
            cbranch = config.get('branch')
            # If several entries are given, the last one iterated wins
            for var, val in cbranch.iteritems():
                ceph_branch = '--{var}={val}'.format(var=var, val=val)
        all_nodes = get_all_nodes(ctx, config)
        mds_nodes = " ".join(get_nodes_using_role(ctx, 'mds'))
        mon_node = get_nodes_using_role(ctx, 'mon')
        mon_nodes = " ".join(mon_node)
        mgr_nodes = " ".join(get_nodes_using_role(ctx, 'mgr'))

        # A joined string is never None, so test for emptiness instead
        if not mon_nodes:
            raise RuntimeError("no monitor nodes in the config file")

        new_mon = './ceph-deploy new' + " " + mon_nodes
        mgr_create = './ceph-deploy mgr create' + " " + mgr_nodes
        mon_hostname = str(mon_nodes.split(' ')[0])
        gather_keys = './ceph-deploy gatherkeys' + " " + mon_hostname
        deploy_mds = './ceph-deploy mds create' + " " + mds_nodes
        no_of_osds = 0

        estatus_new = execute_ceph_deploy(new_mon)
        if estatus_new != 0:
            raise RuntimeError("ceph-deploy: new command failed")

        log.info('adding config inputs...')
        conf_path = '{tdir}/ceph-deploy/ceph.conf'.format(tdir=testdir)

        if config.get('conf') is not None:
            confp = config.get('conf')
            for section, keys in confp.iteritems():
                lines = '[{section}]\n'.format(section=section)
                teuthology.append_lines_to_file(ceph_admin, conf_path, lines,
                                                sudo=True)
                for key, value in keys.iteritems():
                    log.info("[%s] %s = %s", section, key, value)
                    lines = '{key} = {value}\n'.format(key=key, value=value)
                    teuthology.append_lines_to_file(
                        ceph_admin, conf_path, lines, sudo=True)

        # install ceph
        # Only fall back to the job-level branch when the task config did
        # not provide its own install option.
        option = ceph_branch
        if not option:
            option = '--dev={branch}'.format(branch=ctx.config['branch'])
        install_nodes = './ceph-deploy install ' + option + " " + all_nodes
        estatus_install = execute_ceph_deploy(install_nodes)
        if estatus_install != 0:
            raise RuntimeError("ceph-deploy: Failed to install ceph")
        # install ceph-test package too
        install_nodes2 = './ceph-deploy install --tests ' + option + \
                         " " + all_nodes
        estatus_install = execute_ceph_deploy(install_nodes2)
        if estatus_install != 0:
            raise RuntimeError("ceph-deploy: Failed to install ceph-test")

        mon_create_nodes = './ceph-deploy mon create-initial'
        # If the following fails, it is OK, it might just be that the monitors
        # are taking way more than a minute/monitor to form quorum.  The exit
        # statuses of these two commands are deliberately ignored.
        execute_ceph_deploy(mon_create_nodes)
        execute_ceph_deploy(mgr_create)

        # create-keys is explicit now
        # http://tracker.ceph.com/issues/16036
        mons = ctx.cluster.only(teuthology.is_type('mon'))
        for remote in mons.remotes.iterkeys():
            remote.run(args=['sudo', 'ceph-create-keys', '--cluster', 'ceph',
                             '--id', remote.shortname])

        # Best effort, as before, but no longer silent about a failure
        estatus_gather = execute_ceph_deploy(gather_keys)
        if estatus_gather != 0:
            log.warning('ceph-deploy: gatherkeys exited with status %s',
                        estatus_gather)

        if mds_nodes:
            estatus_mds = execute_ceph_deploy(deploy_mds)
            if estatus_mds != 0:
                raise RuntimeError("ceph-deploy: Failed to deploy mds")

        if config.get('test_mon_destroy') is not None:
            # Destroy every monitor except the first one
            for node in mon_node[1:]:
                mon_destroy_nodes = './ceph-deploy mon destroy' + " " + node
                estatus_mon_d = execute_ceph_deploy(mon_destroy_nodes)
                if estatus_mon_d != 0:
                    raise RuntimeError("ceph-deploy: Failed to delete monitor")

        node_dev_list = get_dev_for_osd(ctx, config)
        for d in node_dev_list:
            # Each entry is the node name followed by its disks
            node = d[0]
            for disk in d[1:]:
                zap = './ceph-deploy disk zap ' + node + ':' + disk
                estatus = execute_ceph_deploy(zap)
                if estatus != 0:
                    raise RuntimeError("ceph-deploy: Failed to zap osds")
            osd_create_cmd = './ceph-deploy osd create '
            if config.get('dmcrypt') is not None:
                osd_create_cmd += '--dmcrypt '
            osd_create_cmd += ":".join(d)
            estatus_osd = execute_ceph_deploy(osd_create_cmd)
            if estatus_osd != 0:
                raise RuntimeError("ceph-deploy: Failed to create osds")
            log.info('successfully created osd')
            no_of_osds += 1

        # NOTE(review): with 'wait-for-healthy' disabled the elif below raises
        # "insufficient OSDs" regardless of the OSD count (unless 'only_mon'
        # is set) -- confirm whether that is intended.
        if config.get('wait-for-healthy', True) and no_of_osds >= 2:
            is_healthy(ctx=ctx, config=None)

            log.info('Setting up client nodes...')
            conf_path = '/etc/ceph/ceph.conf'
            admin_keyring_path = '/etc/ceph/ceph.client.admin.keyring'
            first_mon = teuthology.get_first_mon(ctx, config)
            (mon0_remote,) = ctx.cluster.only(first_mon).remotes.keys()
            conf_data = teuthology.get_file(
                remote=mon0_remote,
                path=conf_path,
                sudo=True,
            )
            admin_keyring = teuthology.get_file(
                remote=mon0_remote,
                path=admin_keyring_path,
                sudo=True,
            )

            clients = ctx.cluster.only(teuthology.is_type('client'))
            for client_remote, roles_for_host in clients.remotes.iteritems():
                for id_ in teuthology.roles_of_type(roles_for_host, 'client'):
                    client_keyring = \
                        '/etc/ceph/ceph.client.{id}.keyring'.format(id=id_)
                    # Create the client key on the first mon's remote ...
                    mon0_remote.run(
                        args=[
                            'cd',
                            '{tdir}'.format(tdir=testdir),
                            run.Raw('&&'),
                            'sudo', 'bash', '-c',
                            run.Raw('"'), 'ceph',
                            'auth',
                            'get-or-create',
                            'client.{id}'.format(id=id_),
                            'mds', 'allow',
                            'mon', 'allow *',
                            'osd', 'allow *',
                            run.Raw('>'),
                            client_keyring,
                            run.Raw('"'),
                        ],
                    )
                    key_data = teuthology.get_file(
                        remote=mon0_remote,
                        path=client_keyring,
                        sudo=True,
                    )
                    # ... then copy keyrings and ceph.conf to the client
                    teuthology.sudo_write_file(
                        remote=client_remote,
                        path=client_keyring,
                        data=key_data,
                        perms='0644'
                    )
                    teuthology.sudo_write_file(
                        remote=client_remote,
                        path=admin_keyring_path,
                        data=admin_keyring,
                        perms='0644'
                    )
                    teuthology.sudo_write_file(
                        remote=client_remote,
                        path=conf_path,
                        data=conf_data,
                        perms='0644'
                    )

            if mds_nodes:
                log.info('Configuring CephFS...')
                Filesystem(ctx, create=True)
        elif not config.get('only_mon'):
            raise RuntimeError(
                "The cluster is NOT operational due to insufficient OSDs")
        yield

    except Exception:
        log.info(
            "Error encountered, logging exception before tearing down ceph-deploy")
        log.info(traceback.format_exc())
        raise
    finally:
        # Never `return` from this clause: a return inside `finally` discards
        # the exception re-raised above and would hide the failure.
        if not config.get('keep_running'):
            teardown()
Exemplo n.º 58
0
def coredump(ctx, config):
    """
    Collect coredumps in the archive directory while the task is active.

    On entry a ``coredump`` directory is created under the archive directory
    on every remote and ``kernel.core_pattern`` is pointed at it.  On exit
    the pattern is reset to ``core`` and the directory is removed if it is
    empty.  Any remote on which the directory still exists afterwards gets
    the run flagged as failed.

    :param ctx: Context
    :param config: Configuration (not read by this task)
    """
    log.info('Enabling coredump saving...')
    archive_dir = misc.get_archive_dir(ctx)
    core_dir = '{adir}/coredump'.format(adir=archive_dir)
    core_pattern = 'kernel.core_pattern={adir}/coredump/%t.%p.core'.format(
        adir=archive_dir)

    enable_args = [
        'install', '-d', '-m0755', '--', core_dir,
        run.Raw('&&'),
        'sudo', 'sysctl', '-w', core_pattern,
    ]
    run.wait(ctx.cluster.run(args=enable_args, wait=False))

    try:
        yield
    finally:
        restore_args = [
            'sudo', 'sysctl', '-w', 'kernel.core_pattern=core',
            run.Raw('&&'),
            # don't litter the archive dir if there were no cores dumped
            'rmdir', '--ignore-fail-on-non-empty', '--', core_dir,
        ]
        run.wait(ctx.cluster.run(args=restore_args, wait=False))

        # Prints OK only when the coredump directory is gone
        check_args = [
            'if', 'test', '!', '-e', core_dir, run.Raw(';'),
            'then', 'echo', 'OK', run.Raw(';'),
            'fi',
        ]

        # set status = 'fail' if the dir is still there = coredumps were
        # seen
        for host in ctx.cluster.remotes.iterkeys():
            proc = host.run(args=check_args, stdout=StringIO())
            if proc.stdout.getvalue() == 'OK\n':
                continue
            log.warning('Found coredumps on %s, flagging run as failed',
                        host)
            set_status(ctx.summary, 'fail')
            if 'failure_reason' not in ctx.summary:
                ctx.summary['failure_reason'] = \
                    'Found coredumps on {rem}'.format(rem=host)
Exemplo n.º 59
0
def task(ctx, config):
    """
    Set up and tear down a Ceph cluster.

    The task merges the ``ceph`` section of the job-level ``overrides`` into
    its configuration, starts the setup subtasks as nested context managers,
    optionally waits for the cluster to become healthy, registers a
    ``CephManager`` for the cluster in ``ctx.managers`` and then yields.  On
    exit it scrubs the OSD placement groups unless ``wait-for-scrub`` is
    set to a false value.

    For example::

        tasks:
        - ceph:
        - interactive:

    You can also specify what branch to run::

        tasks:
        - ceph:
            branch: foo

    Or a tag::

        tasks:
        - ceph:
            tag: v0.42.13

    Or a sha1::

        tasks:
        - ceph:
            sha1: 1376a5ab0c89780eab39ffbbe436f6a6092314ed

    Or a local source dir::

        tasks:
        - ceph:
            path: /home/sage/ceph

    To capture code coverage data, use::

        tasks:
        - ceph:
            coverage: true

    To use btrfs, ext4, or xfs on the target's scratch disks, use::

        tasks:
        - ceph:
            fs: xfs
            mkfs_options: [-b,size=65536,-l,logdev=/dev/sdc1]
            mount_options: [nobarrier, inode64]

    Note, this will cause the task to check the /scratch_devs file on each node
    for available devices.  If no such file is found, /dev/sdb will be used.

    To run some daemons under valgrind, include their names
    and the tool/args to use in a valgrind section::

        tasks:
        - ceph:
          valgrind:
            mds.1: --tool=memcheck
            osd.1: [--tool=memcheck, --leak-check=no]

    Those nodes which are using memcheck or valgrind will get
    checked for bad results.

    To adjust or modify config options, use::

        tasks:
        - ceph:
            conf:
              section:
                key: value

    For example::

        tasks:
        - ceph:
            conf:
              mds.0:
                some option: value
                other key: other value
              client.0:
                debug client: 10
                debug ms: 1

    By default, the cluster log is checked for errors and warnings,
    and the run marked failed if any appear. You can ignore log
    entries by giving a list of egrep compatible regexes, e.g.::

        tasks:
        - ceph:
            log-whitelist: ['foo.*bar', 'bad message']

    To run multiple ceph clusters, use multiple ceph tasks, and roles
    with a cluster name prefix, e.g. cluster1.client.0. Roles with no
    cluster use the default cluster name, 'ceph'. OSDs from separate
    clusters must be on separate hosts. Clients and non-osd daemons
    from multiple clusters may be colocated. For each cluster, add an
    instance of the ceph task with the cluster name specified, e.g.::

        roles:
        - [mon.a, osd.0, osd.1]
        - [backup.mon.a, backup.osd.0, backup.osd.1]
        - [client.0, backup.client.0]
        tasks:
        - ceph:
            cluster: ceph
        - ceph:
            cluster: backup

    :param ctx: Context
    :param config: Configuration (a dictionary, or None for an empty one)

    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        "task ceph only supports a dictionary for configuration"

    # Fold the job-level 'ceph' overrides into this task's configuration
    overrides = ctx.config.get('overrides', {})
    teuthology.deep_merge(config, overrides.get('ceph', {}))

    # The first ceph task to run is the one that creates ctx.daemons; later
    # instances (additional clusters) find it already present.
    first_ceph_cluster = False
    if not hasattr(ctx, 'daemons'):
        first_ceph_cluster = True
        ctx.daemons = DaemonGroup()

    testdir = teuthology.get_testdir(ctx)
    if config.get('coverage'):
        coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
        log.info('Creating coverage directory...')
        run.wait(
            ctx.cluster.run(
                args=[
                    'install',
                    '-d',
                    '-m0755',
                    '--',
                    coverage_dir,
                ],
                wait=False,
            ))

    # Roles without a cluster prefix belong to the default cluster, 'ceph'
    if 'cluster' not in config:
        config['cluster'] = 'ceph'

    validate_config(ctx, config)

    # Each subtask is wrapped in a lambda so that it is only invoked when the
    # nested context below is entered; config is therefore read at that time.
    subtasks = []
    if first_ceph_cluster:
        # these tasks handle general log setup and parsing on all hosts,
        # so they should only be run once
        subtasks = [
            lambda: ceph_log(ctx=ctx, config=None),
            lambda: valgrind_post(ctx=ctx, config=config),
        ]

    subtasks += [
        # The cluster subtask receives a reduced copy of the configuration
        lambda: cluster(ctx=ctx,
                        config=dict(
                            conf=config.get('conf', {}),
                            fs=config.get('fs', None),
                            mkfs_options=config.get('mkfs_options', None),
                            mount_options=config.get('mount_options', None),
                            block_journal=config.get('block_journal', None),
                            tmpfs_journal=config.get('tmpfs_journal', None),
                            log_whitelist=config.get('log-whitelist', []),
                            cpu_profile=set(config.get('cpu_profile', []), ),
                            cluster=config['cluster'],
                        )),
        lambda: run_daemon(ctx=ctx, config=config, type_='mon'),
        lambda: crush_setup(ctx=ctx, config=config),
        lambda: run_daemon(ctx=ctx, config=config, type_='osd'),
        lambda: cephfs_setup(ctx=ctx, config=config),
        lambda: run_daemon(ctx=ctx, config=config, type_='mds'),
    ]

    # NOTE(review): presumably contextutil.nested enters the subtasks in list
    # order and exits them in reverse -- confirm against contextutil.
    with contextutil.nested(*subtasks):
        try:
            if config.get('wait-for-healthy', True):
                healthy(ctx=ctx, config=dict(cluster=config['cluster']))
            first_mon = teuthology.get_first_mon(ctx, config,
                                                 config['cluster'])
            (mon, ) = ctx.cluster.only(first_mon).remotes.iterkeys()
            # One CephManager per cluster, keyed by cluster name
            if not hasattr(ctx, 'managers'):
                ctx.managers = {}
            ctx.managers[config['cluster']] = CephManager(
                mon,
                ctx=ctx,
                logger=log.getChild('ceph_manager.' + config['cluster']),
                cluster=config['cluster'],
            )
            yield
        finally:
            # Runs whether or not the code above (or the task body) raised
            if config.get('wait-for-scrub', True):
                osd_scrub_pgs(ctx, config)