Example #1
def healthy(ctx, config):
    """
    Wait for all OSDs to be up, and for the ceph health monitor to return HEALTH_OK.

    :param ctx: Context
    :param config: Configuration
    """
    config = config if isinstance(config, dict) else dict()
    cluster_name = config.get('cluster', 'ceph')
    log.info('Waiting until ceph cluster %s is healthy...', cluster_name)
    firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
    (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
    teuthology.wait_until_osds_up(
        ctx,
        cluster=ctx.cluster,
        remote=mon0_remote,
        ceph_cluster=cluster_name,
    )
    teuthology.wait_until_healthy(
        ctx,
        remote=mon0_remote,
        ceph_cluster=cluster_name,
    )

    if ctx.cluster.only(teuthology.is_type('mds', cluster_name)).remotes:
        # Some MDSs exist, wait for them to be healthy
        ceph_fs = Filesystem(ctx)  # TODO: make Filesystem cluster-aware
        ceph_fs.wait_for_daemons(timeout=300)
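This cluster-aware variant reads an optional 'cluster' key from config, defaulting to 'ceph'. A minimal, hypothetical invocation from inside a teuthology task (ctx is supplied by the framework; the cluster name 'backup' is purely illustrative):

# Hypothetical usage; `ctx` comes from the teuthology framework.
healthy(ctx, {})                     # wait on the default cluster, 'ceph'
healthy(ctx, {'cluster': 'backup'})  # wait on a named secondary cluster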
Example #2
def healthy(ctx, config):
    """
    Wait for all OSDs to be up, and for the ceph health monitor to return HEALTH_OK.

    :param ctx: Context
    :param config: Configuration
    """
    config = config if isinstance(config, dict) else dict()
    cluster_name = config.get('cluster', 'ceph')
    log.info('Waiting until ceph cluster %s is healthy...', cluster_name)
    firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
    (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
    teuthology.wait_until_osds_up(
        ctx,
        cluster=ctx.cluster,
        remote=mon0_remote,
        ceph_cluster=cluster_name,
    )
    teuthology.wait_until_healthy(
        ctx,
        remote=mon0_remote,
        ceph_cluster=cluster_name,
    )

    if ctx.cluster.only(teuthology.is_type('mds', cluster_name)).remotes:
        # Some MDSs exist, wait for them to be healthy
        ceph_fs = Filesystem(ctx)  # TODO: make Filesystem cluster-aware
        ceph_fs.wait_for_daemons(timeout=300)
Example #3
def healthy(ctx, config):
    log.info('Waiting until ceph is healthy...')
    firstmon = teuthology.get_first_mon(ctx, config)
    (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
    teuthology.wait_until_osds_up(
        cluster=ctx.cluster,
        remote=mon0_remote
        )
    teuthology.wait_until_healthy(
        remote=mon0_remote,
        )
Example #4
def healthy(ctx, config):
    """
    Wait for all OSDs to be up, and for the ceph health monitor to return HEALTH_OK.

    :param ctx: Context
    :param config: Configuration
    """
    log.info('Waiting until ceph is healthy...')
    firstmon = teuthology.get_first_mon(ctx, config)
    (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
    teuthology.wait_until_osds_up(ctx, cluster=ctx.cluster, remote=mon0_remote)
    teuthology.wait_until_healthy(
        ctx,
        remote=mon0_remote,
    )
Example #5
def healthy(ctx, config):
    """
    Wait for all OSDs to be up, and for the ceph health monitor to return HEALTH_OK.

    :param ctx: Context
    :param config: Configuration
    """
    log.info('Waiting until ceph is healthy...')
    firstmon = teuthology.get_first_mon(ctx, config)
    (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
    teuthology.wait_until_osds_up(
        ctx,
        cluster=ctx.cluster,
        remote=mon0_remote
        )
    teuthology.wait_until_healthy(
        ctx,
        remote=mon0_remote,
        )
Example #6
def healthy(ctx, config):
    """
    Wait for all OSDs to be up, and for the ceph health monitor to return HEALTH_OK.

    :param ctx: Context
    :param config: Configuration
    """
    log.info('Waiting until ceph is healthy...')
    firstmon = teuthology.get_first_mon(ctx, config)
    (mon0_remote, ) = ctx.cluster.only(firstmon).remotes.keys()
    teuthology.wait_until_osds_up(ctx, cluster=ctx.cluster, remote=mon0_remote)
    teuthology.wait_until_healthy(
        ctx,
        remote=mon0_remote,
    )

    if ctx.cluster.only(teuthology.is_type('mds')).remotes:
        # Some MDSs exist, wait for them to be healthy
        ceph_fs = Filesystem(ctx)
        ceph_fs.wait_for_daemons(timeout=300)
Example #7
def healthy(ctx, config):
    """
    Wait for all OSDs to be up, and for the ceph health monitor to return HEALTH_OK.

    :param ctx: Context
    :param config: Configuration
    """
    log.info('Waiting until ceph is healthy...')
    firstmon = teuthology.get_first_mon(ctx, config)
    (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
    teuthology.wait_until_osds_up(
        ctx,
        cluster=ctx.cluster,
        remote=mon0_remote
    )
    teuthology.wait_until_healthy(
        ctx,
        remote=mon0_remote,
    )

    if ctx.cluster.only(teuthology.is_type('mds')).remotes:
        # Some MDSs exist, wait for them to be healthy
        ceph_fs = Filesystem(ctx)
        ceph_fs.wait_for_daemons(timeout=300)
Example #8
def task(ctx, config):
    """
      - tasks:
          ceph-deploy:
          systemd:

    Test that ceph systemd services can start, stop, and restart, and
    check for any failed services and report back errors.
    """
    for remote, roles in ctx.cluster.remotes.items():
        remote.run(args=['sudo', 'ps', '-eaf', run.Raw('|'), 'grep', 'ceph'])
        units = remote.sh('sudo systemctl list-units | grep ceph',
                          check_status=False)
        log.info(units)
        if 'failed' in units:
            log.info("Ceph services in failed state")

        # test overall service stop and start using ceph.target
        # ceph.target tests are meant for ceph systemd tests
        # and not actual process testing using 'ps'
        log.info("Stopping all Ceph services")
        remote.run(args=['sudo', 'systemctl', 'stop', 'ceph.target'])
        status = _remote_service_status(remote, 'ceph.target')
        log.info(status)
        log.info("Checking process status")
        ps_eaf = remote.sh('sudo ps -eaf | grep ceph', check_status=False)
        if 'Active: inactive' in status:
            log.info("Successfully stopped all ceph services")
        else:
            log.info("Failed to stop ceph services")

        log.info("Starting all Ceph services")
        remote.run(args=['sudo', 'systemctl', 'start', 'ceph.target'])
        status = _remote_service_status(remote, 'ceph.target')
        log.info(status)
        if 'Active: active' in status:
            log.info("Successfully started all Ceph services")
        else:
            log.info("info", "Failed to start Ceph services")
        ps_eaf = remote.sh('sudo ps -eaf | grep ceph')
        log.info(ps_eaf)
        time.sleep(4)

        # test individual services start stop
        name = remote.shortname
        mon_name = 'ceph-mon@' + name + '.service'
        mds_name = 'ceph-mds@' + name + '.service'
        mgr_name = 'ceph-mgr@' + name + '.service'
        mon_role_name = 'mon.' + name
        mds_role_name = 'mds.' + name
        mgr_role_name = 'mgr.' + name
        m_osd = re.search(r'--id (\d+) --setuser ceph', ps_eaf)
        if m_osd:
            osd_service = 'ceph-osd@{m}.service'.format(m=m_osd.group(1))
            remote.run(args=['sudo', 'systemctl', 'status', osd_service])
            remote.run(args=['sudo', 'systemctl', 'stop', osd_service])
            time.sleep(4)  # immediate check will result in deactivating state
            status = _remote_service_status(remote, osd_service)
            log.info(status)
            if 'Active: inactive' in status:
                log.info("Successfully stopped single osd ceph service")
            else:
                log.info("Failed to stop ceph osd services")
            remote.sh(['sudo', 'systemctl', 'start', osd_service])
            time.sleep(4)
        if mon_role_name in roles:
            remote.run(args=['sudo', 'systemctl', 'status', mon_name])
            remote.run(args=['sudo', 'systemctl', 'stop', mon_name])
            time.sleep(4)  # immediate check will result in deactivating state
            status = _remote_service_status(remote, mon_name)
            if 'Active: inactive' in status:
                log.info("Successfully stopped single mon ceph service")
            else:
                log.info("Failed to stop ceph mon service")
            remote.run(args=['sudo', 'systemctl', 'start', mon_name])
            time.sleep(4)
        if mgr_role_name in roles:
            remote.run(args=['sudo', 'systemctl', 'status', mgr_name])
            remote.run(args=['sudo', 'systemctl', 'stop', mgr_name])
            time.sleep(4)  # immediate check will result in deactivating state
            status = _remote_service_status(remote, mgr_name)
            if 'Active: inactive' in status:
                log.info("Successfully stopped single ceph mgr service")
            else:
                log.info("Failed to stop ceph mgr service")
            remote.run(args=['sudo', 'systemctl', 'start', mgr_name])
            time.sleep(4)
        if mds_role_name in roles:
            remote.run(args=['sudo', 'systemctl', 'status', mds_name])
            remote.run(args=['sudo', 'systemctl', 'stop', mds_name])
            time.sleep(4)  # immediate check will result in deactivating state
            status = _remote_service_status(remote, mds_name)
            if 'Active: inactive' in status:
                log.info("Successfully stopped single ceph mds service")
            else:
                log.info("Failed to stop ceph mds service")
            remote.run(args=['sudo', 'systemctl', 'start', mds_name])
            time.sleep(4)

    # reboot all nodes and verify the systemd units restart
    # the workunit that runs next would fail if any of the systemd units doesn't start
    ctx.cluster.run(args='sudo reboot', wait=False, check_status=False)
    # avoid immediate reconnect
    time.sleep(120)
    reconnect(ctx, 480)  # reconnect all nodes
    # for debug info
    ctx.cluster.run(args=['sudo', 'ps', '-eaf', run.Raw('|'), 'grep', 'ceph'])
    # wait for HEALTH_OK
    mon = get_first_mon(ctx, config)
    (mon_remote,) = ctx.cluster.only(mon).remotes.keys()
    wait_until_healthy(ctx, mon_remote, use_sudo=True)
    yield
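The task above calls a _remote_service_status helper that is not shown in the snippet. A minimal sketch, assuming it simply captures the `systemctl status` output as a string:

def _remote_service_status(remote, service):
    # Return the `systemctl status` output for `service` as a string.
    # check_status=False because systemctl exits non-zero for inactive
    # units, and the output is wanted in that case too.
    status = remote.sh('sudo systemctl status %s' % service,
                       check_status=False)
    return status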
Example #9
def upgrade(ctx, config):
    """
     Upgrade using ceph-deploy
     eg:
       ceph-deploy.upgrade:
          # to upgrade to a specific branch, use
          branch:
             stable: jewel
          # to set up a mgr node, use
          setup-mgr-node: True
          # to wait for the cluster to be healthy after the upgrade, use
          wait-for-healthy: True
          role: (upgrades the below roles serially)
             mon.a
             mon.b
             osd.0
    """
    roles = config.get('roles')
    # get the roles that are mapped as per ceph-deploy
    # roles are mapped for mon/mds eg: mon.a  => mon.host_short_name
    mapped_role = ctx.cluster.mapped_role
    log.info("roles={r}, mapped_roles={mr}".format(r=roles, mr=mapped_role))
    if config.get('branch'):
        branch = config.get('branch')
        (var, val) = next(iter(branch.items()))
        ceph_branch = '--{var}={val}'.format(var=var, val=val)
    else:
        # default to wip-branch under test
        dev_branch = ctx.config['branch']
        ceph_branch = '--dev={branch}'.format(branch=dev_branch)
    # get the node used for initial deployment which is mon.a
    mon_a = mapped_role.get('mon.a')
    (ceph_admin,) = ctx.cluster.only(mon_a).remotes.keys()
    testdir = teuthology.get_testdir(ctx)
    cmd = './ceph-deploy install ' + ceph_branch
    for role in roles:
        # check if this role is mapped (mon or mds)
        if mapped_role.get(role):
            role = mapped_role.get(role)
        remotes_and_roles = ctx.cluster.only(role).remotes
        for remote in remotes_and_roles.keys():
            nodename = remote.shortname
            # build a per-node command so earlier nodes are not reinstalled
            node_cmd = cmd + ' ' + nodename
            log.info("Upgrading ceph on %s", nodename)
            ceph_admin.run(
                args=[
                    'cd',
                    '{tdir}/ceph-deploy'.format(tdir=testdir),
                    run.Raw('&&'),
                    run.Raw(node_cmd),
                ],
            )
            # restart all ceph services; ideally the upgrade should do this, but it does not
            remote.run(
                args=[
                    'sudo', 'systemctl', 'restart', 'ceph.target'
                ]
            )
            ceph_admin.run(args=['sudo', 'ceph', '-s'])

    # workaround for http://tracker.ceph.com/issues/20950
    # write the correct mgr key to disk
    if config.get('setup-mgr-node', None):
        mons = ctx.cluster.only(teuthology.is_type('mon'))
        for remote, roles in mons.remotes.items():
            remote.run(
                args=[
                    run.Raw('sudo ceph auth get client.bootstrap-mgr'),
                    run.Raw('|'),
                    run.Raw('sudo tee'),
                    run.Raw('/var/lib/ceph/bootstrap-mgr/ceph.keyring')
                ]
            )

    if config.get('setup-mgr-node', None):
        mgr_nodes = get_nodes_using_role(ctx, 'mgr')
        mgr_nodes = " ".join(mgr_nodes)
        mgr_install = './ceph-deploy install --mgr ' + ceph_branch + " " + mgr_nodes
        mgr_create = './ceph-deploy mgr create' + " " + mgr_nodes
        # install mgr
        ceph_admin.run(
            args=[
                'cd',
                '{tdir}/ceph-deploy'.format(tdir=testdir),
                run.Raw('&&'),
                run.Raw(mgr_install),
                ],
            )
        # create mgr
        ceph_admin.run(
            args=[
                'cd',
                '{tdir}/ceph-deploy'.format(tdir=testdir),
                run.Raw('&&'),
                run.Raw(mgr_create),
                ],
            )
        ceph_admin.run(args=['sudo', 'ceph', '-s'])
    if config.get('wait-for-healthy', None):
        wait_until_healthy(ctx, ceph_admin, use_sudo=True)
    yield
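To make the branch-flag construction concrete: given the YAML above, config['branch'] arrives as a one-entry dict, and the task turns it into a ceph-deploy flag. A quick standalone check (plain Python, no teuthology required; 'wip-foo' is an illustrative branch name):

branch = {'stable': 'jewel'}
var, val = next(iter(branch.items()))
assert '--{var}={val}'.format(var=var, val=val) == '--stable=jewel'
# likewise {'dev': 'wip-foo'} would produce '--dev=wip-foo'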
Example #10
def wait_until_healthy(ctx, config):
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon_remote,) = ctx.cluster.only(first_mon).remotes.keys()
    teuthology.wait_until_healthy(ctx, mon_remote)
Example #11
def upgrade(ctx, config):
    """
     Upgrade using ceph-deploy
     eg:
       ceph-deploy.upgrade:
          # to upgrade to a specific branch, use
          branch:
             stable: jewel
          # to set up a mgr node, use
          setup-mgr-node: True
          # to wait for the cluster to be healthy after the upgrade, use
          wait-for-healthy: True
          role: (upgrades the below roles serially)
             mon.a
             mon.b
             osd.0
    """
    roles = config.get('roles')
    # get the roles that are mapped as per ceph-deploy
    # roles are mapped for mon/mds eg: mon.a  => mon.host_short_name
    mapped_role = ctx.cluster.mapped_role
    if config.get('branch'):
        branch = config.get('branch')
        (var, val) = branch.items()[0]
        ceph_branch = '--{var}={val}'.format(var=var, val=val)
    else:
        # default to master
        ceph_branch = '--dev=master'
    # get the node used for initial deployment which is mon.a
    mon_a = mapped_role.get('mon.a')
    (ceph_admin,) = ctx.cluster.only(mon_a).remotes.iterkeys()
    testdir = teuthology.get_testdir(ctx)
    cmd = './ceph-deploy install ' + ceph_branch
    for role in roles:
        # check if this role is mapped (mon or mds)
        if mapped_role.get(role):
            role = mapped_role.get(role)
        remotes_and_roles = ctx.cluster.only(role).remotes
        for remote in remotes_and_roles.iterkeys():
            nodename = remote.shortname
            # build a per-node command so earlier nodes are not reinstalled
            node_cmd = cmd + ' ' + nodename
            log.info("Upgrading ceph on %s", nodename)
            ceph_admin.run(
                args=[
                    'cd',
                    '{tdir}/ceph-deploy'.format(tdir=testdir),
                    run.Raw('&&'),
                    run.Raw(node_cmd),
                ],
            )
            # restart all ceph services; ideally the upgrade should do this, but it does not
            remote.run(
                args=[
                    'sudo', 'systemctl', 'restart', 'ceph.target'
                ]
            )
            ceph_admin.run(args=['sudo', 'ceph', '-s'])

    # workaround for http://tracker.ceph.com/issues/20950
    # write the correct mgr key to disk
    if config.get('setup-mgr-node', None):
        mons = ctx.cluster.only(teuthology.is_type('mon'))
        for remote, roles in mons.remotes.iteritems():
            remote.run(
                args=[
                    run.Raw('sudo ceph auth get client.bootstrap-mgr'),
                    run.Raw('|'),
                    run.Raw('sudo tee'),
                    run.Raw('/var/lib/ceph/bootstrap-mgr/ceph.keyring')
                ]
            )

    if config.get('setup-mgr-node', None):
        mgr_nodes = get_nodes_using_role(ctx, 'mgr')
        mgr_nodes = " ".join(mgr_nodes)
        mgr_install = './ceph-deploy install --mgr ' + ceph_branch + " " + mgr_nodes
        mgr_create = './ceph-deploy mgr create' + " " + mgr_nodes
        # install mgr
        ceph_admin.run(
            args=[
                'cd',
                '{tdir}/ceph-deploy'.format(tdir=testdir),
                run.Raw('&&'),
                run.Raw(mgr_install),
                ],
            )
        # create mgr
        ceph_admin.run(
            args=[
                'cd',
                '{tdir}/ceph-deploy'.format(tdir=testdir),
                run.Raw('&&'),
                run.Raw(mgr_create),
                ],
            )
        ceph_admin.run(args=['sudo', 'ceph', '-s'])
    if config.get('wait-for-healthy', None):
        wait_until_healthy(ctx, ceph_admin, use_sudo=True)
    yield
Example #12
def wait_until_healthy(ctx, config):
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()
    teuthology.wait_until_healthy(ctx, mon_remote)
Example #13
def task(ctx, config):
    """
      - tasks:
          ceph-deploy:
          systemd:

    Test that ceph systemd services can start, stop, and restart, and
    check for any failed services and report back errors.
    """
    for remote, roles in ctx.cluster.remotes.iteritems():
        remote.run(args=['sudo', 'ps', '-eaf', run.Raw('|'),
                         'grep', 'ceph'])
        r = remote.run(args=['sudo', 'systemctl', 'list-units', run.Raw('|'),
                             'grep', 'ceph'], stdout=StringIO(),
                       check_status=False)
        log.info(r.stdout.getvalue())
        if 'failed' in r.stdout.getvalue():
            log.info("Ceph services in failed state")

        # test overall service stop and start using ceph.target
        # ceph.target tests are meant for ceph systemd tests
        # and not actual process testing using 'ps'
        log.info("Stopping all Ceph services")
        remote.run(args=['sudo', 'systemctl', 'stop', 'ceph.target'])
        r = remote.run(args=['sudo', 'systemctl', 'status', 'ceph.target'],
                       stdout=StringIO(), check_status=False)
        status = r.stdout.getvalue()
        log.info(status)
        log.info("Checking process status")
        r = remote.run(args=['sudo', 'ps', '-eaf', run.Raw('|'),
                             'grep', 'ceph'], stdout=StringIO())
        if 'Active: inactive' in status:
            log.info("Successfully stopped all ceph services")
        else:
            log.info("Failed to stop ceph services")

        log.info("Starting all Ceph services")
        remote.run(args=['sudo', 'systemctl', 'start', 'ceph.target'])
        r = remote.run(args=['sudo', 'systemctl', 'status', 'ceph.target'],
                       stdout=StringIO())
        log.info(r.stdout.getvalue())
        if 'Active: active' in r.stdout.getvalue():
            log.info("Successfully started all Ceph services")
        else:
            log.info("info", "Failed to start Ceph services")
        r = remote.run(args=['sudo', 'ps', '-eaf', run.Raw('|'),
                             'grep', 'ceph'], stdout=StringIO())
        log.info(r.stdout.getvalue())
        time.sleep(4)

        # test individual services start stop
        name = remote.shortname
        mon_name = 'ceph-mon@' + name + '.service'
        mds_name = 'ceph-mds@' + name + '.service'
        mgr_name = 'ceph-mgr@' + name + '.service'
        mon_role_name = 'mon.' + name
        mds_role_name = 'mds.' + name
        mgr_role_name = 'mgr.' + name
        m_osd = re.search(r'--id (\d+) --setuser ceph', r.stdout.getvalue())
        if m_osd:
            osd_service = 'ceph-osd@{m}.service'.format(m=m_osd.group(1))
            remote.run(args=['sudo', 'systemctl', 'status',
                             osd_service])
            remote.run(args=['sudo', 'systemctl', 'stop',
                             osd_service])
            time.sleep(4)  # immediate check will result in deactivating state
            r = remote.run(args=['sudo', 'systemctl', 'status', osd_service],
                           stdout=StringIO(), check_status=False)
            log.info(r.stdout.getvalue())
            if 'Active: inactive' in r.stdout.getvalue():
                log.info("Successfully stopped single osd ceph service")
            else:
                log.info("Failed to stop ceph osd services")
            remote.run(args=['sudo', 'systemctl', 'start',
                             osd_service])
            time.sleep(4)
        if mon_role_name in roles:
            remote.run(args=['sudo', 'systemctl', 'status', mon_name])
            remote.run(args=['sudo', 'systemctl', 'stop', mon_name])
            time.sleep(4)  # immediate check will result in deactivating state
            r = remote.run(args=['sudo', 'systemctl', 'status', mon_name],
                           stdout=StringIO(), check_status=False)
            if 'Active: inactive' in r.stdout.getvalue():
                log.info("Successfully stopped single mon ceph service")
            else:
                log.info("Failed to stop ceph mon service")
            remote.run(args=['sudo', 'systemctl', 'start', mon_name])
            time.sleep(4)
        if mgr_role_name in roles:
            remote.run(args=['sudo', 'systemctl', 'status', mgr_name])
            remote.run(args=['sudo', 'systemctl', 'stop', mgr_name])
            time.sleep(4)  # immediate check will result in deactivating state
            r = remote.run(args=['sudo', 'systemctl', 'status', mgr_name],
                           stdout=StringIO(), check_status=False)
            if 'Active: inactive' in r.stdout.getvalue():
                log.info("Successfully stopped single ceph mgr service")
            else:
                log.info("Failed to stop ceph mgr service")
            remote.run(args=['sudo', 'systemctl', 'start', mgr_name])
            time.sleep(4)
        if mds_role_name in roles:
            remote.run(args=['sudo', 'systemctl', 'status', mds_name])
            remote.run(args=['sudo', 'systemctl', 'stop', mds_name])
            time.sleep(4)  # immediate check will result in deactivating state
            r = remote.run(args=['sudo', 'systemctl', 'status', mds_name],
                           stdout=StringIO(), check_status=False)
            if 'Active: inactive' in r.stdout.getvalue():
                log.info("Successfully stopped single ceph mds service")
            else:
                log.info("Failed to stop ceph mds service")
            remote.run(args=['sudo', 'systemctl', 'start', mds_name])
            time.sleep(4)

    # reboot all nodes and verify the systemd units restart
    # the workunit that runs next would fail if any of the systemd units doesn't start
    ctx.cluster.run(args='sudo reboot', wait=False, check_status=False)
    # avoid immediate reconnect
    time.sleep(120)
    reconnect(ctx, 480)  # reconnect all nodes
    # for debug info
    ctx.cluster.run(args=['sudo', 'ps', '-eaf', run.Raw('|'),
                          'grep', 'ceph'])
    # wait for HEALTH_OK
    mon = get_first_mon(ctx, config)
    (mon_remote,) = ctx.cluster.only(mon).remotes.iterkeys()
    wait_until_healthy(ctx, mon_remote)
    yield
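A note on the status checks in the two systemd tasks above: they use substring membership ('x' in s) rather than str.find(), because find() returns -1 when the substring is absent, and -1 is truthy:

status = 'Active: active (running)'
assert status.find('Active: inactive') == -1  # not found...
assert bool(status.find('Active: inactive'))  # ...but -1 is truthy
assert 'Active: inactive' not in status       # the reliable check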