Exemplo n.º 1
0
    def __init__(self, ctx, manager, config, name, logger):
        super(MonitorThrasher, self).__init__()

        self.ctx = ctx
        self.manager = manager
        self.manager.wait_for_clean()

        self.stopping = False
        self.logger = logger
        self.config = config
        self.name = name

        if self.config is None:
            self.config = dict()

        """ Test reproducibility """
        self.random_seed = self.config.get('seed', None)

        if self.random_seed is None:
            self.random_seed = int(time.time())

        self.rng = random.Random()
        self.rng.seed(int(self.random_seed))

        """ Monitor thrashing """
        self.revive_delay = float(self.config.get('revive_delay', 10.0))
        self.thrash_delay = float(self.config.get('thrash_delay', 0.0))

        self.thrash_many = self.config.get('thrash_many', False)
        self.maintain_quorum = self.config.get('maintain_quorum', True)

        self.scrub = self.config.get('scrub', True)

        self.freeze_mon_probability = float(self.config.get('freeze_mon_probability', 10))
        self.freeze_mon_duration = float(self.config.get('freeze_mon_duration', 15.0))

        assert self.max_killable() > 0, \
            'Unable to kill at least one monitor with the current config.'

        """ Store thrashing """
        self.store_thrash = self.config.get('store_thrash', False)
        self.store_thrash_probability = int(
            self.config.get('store_thrash_probability', 50))
        if self.store_thrash:
            assert self.store_thrash_probability > 0, \
                'store_thrash is set, probability must be > 0'
            assert self.maintain_quorum, \
                'store_thrash = true must imply maintain_quorum = true'

        #MDS failover
        self.mds_failover = self.config.get('check_mds_failover', False)

        if self.mds_failover:
            self.mds_cluster = MDSCluster(ctx)

        self.thread = gevent.spawn(self.do_thrash)
Exemplo n.º 2
0
def ready(ctx, config):
    """
    That the file system is ready for clients.
    """

    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'task only accepts a dict for configuration'

    timeout = config.get('timeout', 300)

    mdsc = MDSCluster(ctx)
    status = mdsc.status()

    for filesystem in status.get_filesystems():
        fs = Filesystem(ctx, fscid=filesystem['id'])
        fs.wait_for_daemons(timeout=timeout, status=status)
Exemplo n.º 3
0
def pre_upgrade_save(ctx, config):
    """
    That the upgrade procedure doesn't clobber state: save state.
    """

    mdsc = MDSCluster(ctx)
    status = mdsc.status()

    state = {}
    ctx['mds-upgrade-state'] = state

    for fs in list(status.get_filesystems()):
        fscid = fs['id']
        mdsmap = fs['mdsmap']
        fs_state = {}
        fs_state['epoch'] = mdsmap['epoch']
        fs_state['max_mds'] = mdsmap['max_mds']
        fs_state['flags'] = mdsmap['flags'] & UPGRADE_FLAGS_MASK
        state[fscid] = fs_state
        log.debug(f"fs fscid={fscid},name={mdsmap['fs_name']} state = {fs_state}")
Exemplo n.º 4
0
def post_upgrade_checks(ctx, config):
    """
    That the upgrade procedure doesn't clobber state.
    """

    state = ctx['mds-upgrade-state']

    mdsc = MDSCluster(ctx)
    status = mdsc.status()

    for fs in list(status.get_filesystems()):
        fscid = fs['id']
        mdsmap = fs['mdsmap']
        fs_state = state[fscid]
        log.debug(f"checking fs fscid={fscid},name={mdsmap['fs_name']} state = {fs_state}")

        # check state was restored to previous values
        assert fs_state['max_mds'] == mdsmap['max_mds']
        assert fs_state['flags'] == (mdsmap['flags'] & UPGRADE_FLAGS_MASK)

        # now confirm that the upgrade procedure was followed
        epoch = mdsmap['epoch']
        pre_upgrade_epoch = fs_state['epoch']
        assert pre_upgrade_epoch < epoch
        should_decrease_max_mds = fs_state['max_mds'] > 1
        did_decrease_max_mds = False
        should_disable_allow_standby_replay = fs_state['flags'] & CEPH_MDSMAP_ALLOW_STANDBY_REPLAY
        did_disable_allow_standby_replay = False
        for i in range(pre_upgrade_epoch+1, mdsmap['epoch']):
            old_status = mdsc.status(epoch=i)
            old_fs = old_status.get_fsmap(fscid)
            old_mdsmap = old_fs['mdsmap']
            if should_decrease_max_mds and old_mdsmap['max_mds'] == 1:
                log.debug(f"max_mds reduced in epoch {i}")
                did_decrease_max_mds = True
            if should_disable_allow_standby_replay and not (old_mdsmap['flags'] & CEPH_MDSMAP_ALLOW_STANDBY_REPLAY):
                log.debug(f"allow_standby_replay disabled in epoch {i}")
                did_disable_allow_standby_replay = True
        assert not should_decrease_max_mds or did_decrease_max_mds
        assert not should_disable_allow_standby_replay or did_disable_allow_standby_replay
Exemplo n.º 5
0
def task(ctx, config):
    """
    Run the CephFS test cases.

    Run everything in tasks/cephfs/test_*.py:

    ::

        tasks:
          - install:
          - ceph:
          - ceph-fuse:
          - cephfs_test_runner:

    `modules` argument allows running only some specific modules:

    ::

        tasks:
            ...
          - cephfs_test_runner:
              modules:
                - tasks.cephfs.test_sessionmap
                - tasks.cephfs.test_auto_repair

    By default, any cases that can't be run on the current cluster configuration
    will generate a failure.  When the optional `fail_on_skip` argument is set
    to false, any tests that can't be run on the current configuration will
    simply be skipped:

    ::
        tasks:
            ...
         - cephfs_test_runner:
           fail_on_skip: false

    """

    ceph_cluster = CephCluster(ctx)

    if len(list(misc.all_roles_of_type(ctx.cluster, 'mds'))):
        mds_cluster = MDSCluster(ctx)
        fs = Filesystem(ctx)
    else:
        mds_cluster = None
        fs = None

    if len(list(misc.all_roles_of_type(ctx.cluster, 'mgr'))):
        mgr_cluster = MgrCluster(ctx)
    else:
        mgr_cluster = None

    # Mount objects, sorted by ID
    if hasattr(ctx, 'mounts'):
        mounts = [
            v for k, v in sorted(ctx.mounts.items(),
                                 lambda a, b: cmp(a[0], b[0]))
        ]
    else:
        # The test configuration has a filesystem but no fuse/kclient mounts
        mounts = []

    decorating_loader = DecoratingLoader({
        "ctx": ctx,
        "mounts": mounts,
        "fs": fs,
        "ceph_cluster": ceph_cluster,
        "mds_cluster": mds_cluster,
        "mgr_cluster": mgr_cluster,
    })

    fail_on_skip = config.get('fail_on_skip', True)

    # Put useful things onto ctx for interactive debugging
    ctx.fs = fs
    ctx.mds_cluster = mds_cluster
    ctx.mgr_cluster = mgr_cluster

    # Depending on config, either load specific modules, or scan for moduless
    if config and 'modules' in config and config['modules']:
        module_suites = []
        for mod_name in config['modules']:
            # Test names like cephfs.test_auto_repair
            module_suites.append(decorating_loader.loadTestsFromName(mod_name))
        overall_suite = suite.TestSuite(module_suites)
    else:
        # Default, run all tests
        overall_suite = decorating_loader.discover(
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         "cephfs/"))

    if ctx.config.get("interactive-on-error", False):
        InteractiveFailureResult.ctx = ctx
        result_class = InteractiveFailureResult
    else:
        result_class = unittest.TextTestResult

    class LoggingResult(result_class):
        def startTest(self, test):
            log.info("Starting test: {0}".format(self.getDescription(test)))
            return super(LoggingResult, self).startTest(test)

        def addSkip(self, test, reason):
            if fail_on_skip:
                # Don't just call addFailure because that requires a traceback
                self.failures.append((test, reason))
            else:
                super(LoggingResult, self).addSkip(test, reason)

    # Execute!
    result = unittest.TextTestRunner(stream=LogStream(),
                                     resultclass=LoggingResult,
                                     verbosity=2,
                                     failfast=True).run(overall_suite)

    if not result.wasSuccessful():
        result.printErrors()  # duplicate output at end for convenience

        bad_tests = []
        for test, error in result.errors:
            bad_tests.append(str(test))
        for test, failure in result.failures:
            bad_tests.append(str(test))

        raise RuntimeError("Test failure: {0}".format(", ".join(bad_tests)))

    yield
Exemplo n.º 6
0
def task(ctx, config):
    """
    Stress test the mds by thrashing while another task/workunit
    is running.

    Please refer to MDSThrasher class for further information on the
    available options.
    """

    mds_cluster = MDSCluster(ctx)

    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'mds_thrash task only accepts a dict for configuration'
    mdslist = list(teuthology.all_roles_of_type(ctx.cluster, 'mds'))
    assert len(mdslist) > 1, \
        'mds_thrash task requires at least 2 metadata servers'

    # choose random seed
    if 'seed' in config:
        seed = int(config['seed'])
    else:
        seed = int(time.time())
    log.info('mds thrasher using random seed: {seed}'.format(seed=seed))
    random.seed(seed)

    (first, ) = ctx.cluster.only(
        'mds.{_id}'.format(_id=mdslist[0])).remotes.keys()
    manager = ceph_manager.CephManager(
        first,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
    )

    # make sure everyone is in active, standby, or standby-replay
    log.info('Wait for all MDSs to reach steady state...')
    status = mds_cluster.status()
    while True:
        steady = True
        for info in status.get_all():
            state = info['state']
            if state not in ('up:active', 'up:standby', 'up:standby-replay'):
                steady = False
                break
        if steady:
            break
        sleep(2)
        status = mds_cluster.status()
    log.info('Ready to start thrashing')

    manager.wait_for_clean()
    assert manager.is_clean()

    if 'cluster' not in config:
        config['cluster'] = 'ceph'

    for fs in status.get_filesystems():
        thrasher = MDSThrasher(ctx, manager, config, Filesystem(ctx, fs['id']),
                               fs['mdsmap']['max_mds'])
        thrasher.start()
        ctx.ceph[config['cluster']].thrashers.append(thrasher)

    try:
        log.debug('Yielding')
        yield
    finally:
        log.info('joining mds_thrasher')
        thrasher.stop()
        if thrasher.exception is not None:
            raise RuntimeError('error during thrashing')
        thrasher.join()
        log.info('done joining')
Exemplo n.º 7
0
def task(ctx, config):
    """
    Stress test the mds by thrashing while another task/workunit
    is running.

    Please refer to MDSThrasher class for further information on the
    available options.
    """

    mds_cluster = MDSCluster(ctx)

    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'mds_thrash task only accepts a dict for configuration'
    mdslist = list(teuthology.all_roles_of_type(ctx.cluster, 'mds'))
    assert len(mdslist) > 1, \
        'mds_thrash task requires at least 2 metadata servers'

    # choose random seed
    if 'seed' in config:
        seed = int(config['seed'])
    else:
        seed = int(time.time())
    log.info('mds thrasher using random seed: {seed}'.format(seed=seed))
    random.seed(seed)

    max_thrashers = config.get('max_thrash', 1)
    thrashers = {}

    (first, ) = ctx.cluster.only(
        'mds.{_id}'.format(_id=mdslist[0])).remotes.iterkeys()
    manager = ceph_manager.CephManager(
        first,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
    )

    # make sure everyone is in active, standby, or standby-replay
    log.info('Wait for all MDSs to reach steady state...')
    statuses = None
    statuses_by_rank = None
    while True:
        statuses = {m: mds_cluster.get_mds_info(m) for m in mdslist}
        statuses_by_rank = {}
        for _, s in statuses.iteritems():
            if isinstance(s, dict):
                statuses_by_rank[s['rank']] = s

        ready = filter(
            lambda (_, s): s is not None and (s['state'] == 'up:active' or s[
                'state'] == 'up:standby' or s['state'] == 'up:standby-replay'),
            statuses.items())
        if len(ready) == len(statuses):
            break
        time.sleep(2)
    log.info('Ready to start thrashing')

    # setup failure groups
    failure_groups = {}
    actives = {
        s['name']: s
        for (_, s) in statuses.iteritems() if s['state'] == 'up:active'
    }
    log.info('Actives is: {d}'.format(d=actives))
    log.info('Statuses is: {d}'.format(d=statuses_by_rank))
    for active in actives:
        for (r, s) in statuses.iteritems():
            if s['standby_for_name'] == active:
                if not active in failure_groups:
                    failure_groups[active] = []
                log.info('Assigning mds rank {r} to failure group {g}'.format(
                    r=r, g=active))
                failure_groups[active].append(r)

    manager.wait_for_clean()
    for (active, standbys) in failure_groups.iteritems():
        weight = 1.0
        if 'thrash_weights' in config:
            weight = int(config['thrash_weights'].get(
                'mds.{_id}'.format(_id=active), '0.0'))

        failure_group = [active]
        failure_group.extend(standbys)

        thrasher = MDSThrasher(
            ctx,
            manager,
            mds_cluster,
            config,
            logger=log.getChild(
                'mds_thrasher.failure_group.[{a}, {sbs}]'.format(
                    a=active, sbs=', '.join(standbys))),
            failure_group=failure_group,
            weight=weight)
        thrasher.start()
        thrashers[active] = thrasher

        # if thrash_weights isn't specified and we've reached max_thrash,
        # we're done
        if 'thrash_weights' not in config and len(thrashers) == max_thrashers:
            break

    try:
        log.debug('Yielding')
        yield
    finally:
        log.info('joining mds_thrashers')
        for t in thrashers:
            log.info('join thrasher for failure group [{fg}]'.format(
                fg=', '.join(failure_group)))
            thrashers[t].stop()
            thrashers[t].get()  # Raise any exception from _run()
            thrashers[t].join()
        log.info('done joining')
Exemplo n.º 8
0
def task(ctx, config):
    """
    Stress test the mds by running scrub iterations while another task/workunit
    is running.
    Example config:

    - fwd_scrub:
      scrub_timeout: 300
      sleep_between_iterations: 1
    """

    mds_cluster = MDSCluster(ctx)

    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'fwd_scrub task only accepts a dict for configuration'
    mdslist = list(teuthology.all_roles_of_type(ctx.cluster, 'mds'))
    assert len(mdslist) > 0, \
        'fwd_scrub task requires at least 1 metadata server'

    (first, ) = ctx.cluster.only(f'mds.{mdslist[0]}').remotes.keys()
    manager = ceph_manager.CephManager(
        first,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
    )

    # make sure everyone is in active, standby, or standby-replay
    log.info('Wait for all MDSs to reach steady state...')
    status = mds_cluster.status()
    while True:
        steady = True
        for info in status.get_all():
            state = info['state']
            if state not in ('up:active', 'up:standby', 'up:standby-replay'):
                steady = False
                break
        if steady:
            break
        sleep(2)
        status = mds_cluster.status()

    log.info('Ready to start scrub thrashing')

    manager.wait_for_clean()
    assert manager.is_clean()

    if 'cluster' not in config:
        config['cluster'] = 'ceph'

    for fs in status.get_filesystems():
        fwd_scrubber = ForwardScrubber(Filesystem(ctx, fscid=fs['id']),
                                       config['scrub_timeout'],
                                       config['sleep_between_iterations'])
        fwd_scrubber.start()
        ctx.ceph[config['cluster']].thrashers.append(fwd_scrubber)

    try:
        log.debug('Yielding')
        yield
    finally:
        log.info('joining ForwardScrubbers')
        stop_all_fwd_scrubbers(ctx.ceph[config['cluster']].thrashers)
        log.info('done joining')
Exemplo n.º 9
0
def task(ctx, config):
    """
    Mount/unmount a ``ceph-fuse`` client.

    The config is optional and defaults to mounting on all clients. If
    a config is given, it is expected to be a list of clients to do
    this operation on. This lets you e.g. set up one client with
    ``ceph-fuse`` and another with ``kclient``.

    Example that mounts all clients::

        tasks:
        - ceph:
        - ceph-fuse:
        - interactive:

    Example that uses both ``kclient` and ``ceph-fuse``::

        tasks:
        - ceph:
        - ceph-fuse: [client.0]
        - kclient: [client.1]
        - interactive:

    Example that enables valgrind:

        tasks:
        - ceph:
        - ceph-fuse:
            client.0:
              valgrind: [--tool=memcheck, --leak-check=full, --show-reachable=yes]
        - interactive:

    Example that stops an already-mounted client:

    ::

        tasks:
            - ceph:
            - ceph-fuse: [client.0]
            - ... do something that requires the FS mounted ...
            - ceph-fuse:
                client.0:
                    mounted: false
            - ... do something that requires the FS unmounted ...

    Example that adds more generous wait time for mount (for virtual machines):

        tasks:
        - ceph:
        - ceph-fuse:
            client.0:
              mount_wait: 60 # default is 0, do not wait before checking /sys/
              mount_timeout: 120 # default is 30, give up if /sys/ is not populated
        - interactive:

    :param ctx: Context
    :param config: Configuration
    """
    log.info('Mounting ceph-fuse clients...')

    testdir = teuthology.get_testdir(ctx)
    config = get_client_configs(ctx, config)

    # List clients we will configure mounts for, default is all clients
    clients = list(
        teuthology.get_clients(ctx=ctx,
                               roles=filter(lambda x: 'client.' in x,
                                            config.keys())))

    all_mounts = getattr(ctx, 'mounts', {})
    mounted_by_me = {}

    log.info('Wait for MDS to reach steady state...')
    mds_cluster = MDSCluster(ctx)
    status = mds_cluster.status()
    for filesystem in status.get_filesystems():
        fs = Filesystem(ctx, fscid=filesystem['id'])
        fs.wait_for_daemons()
    log.info('Ready to start ceph-fuse...')

    # Construct any new FuseMount instances
    for id_, remote in clients:
        client_config = config.get("client.%s" % id_)
        if client_config is None:
            client_config = {}

        if id_ not in all_mounts:
            fuse_mount = FuseMount(client_config, testdir, id_, remote)
            all_mounts[id_] = fuse_mount
        else:
            # Catch bad configs where someone has e.g. tried to use ceph-fuse and kcephfs for the same client
            assert isinstance(all_mounts[id_], FuseMount)

        if not config.get("disabled", False) and client_config.get(
                'mounted', True):
            mounted_by_me[id_] = all_mounts[id_]

    ctx.mounts = all_mounts

    # Mount any clients we have been asked to (default to mount all)
    for mount in mounted_by_me.values():
        mount.mount()

    for mount in mounted_by_me.values():
        mount.wait_until_mounted()

    # Umount any pre-existing clients that we have not been asked to mount
    for client_id in set(all_mounts.keys()) - set(mounted_by_me.keys()):
        mount = all_mounts[client_id]
        if mount.is_mounted():
            mount.umount_wait()

    try:
        yield all_mounts
    finally:
        log.info('Unmounting ceph-fuse clients...')

        for mount in mounted_by_me.values():
            # Conditional because an inner context might have umounted it
            if mount.is_mounted():
                mount.umount_wait()