def healthy(ctx, config): """ Wait for all osd's to be up, and for the ceph health monitor to return HEALTH_OK. :param ctx: Context :param config: Configuration """ config = config if isinstance(config, dict) else dict() cluster_name = config.get('cluster', 'ceph') log.info('Waiting until ceph cluster %s is healthy...', cluster_name) firstmon = teuthology.get_first_mon(ctx, config, cluster_name) (mon0_remote, ) = ctx.cluster.only(firstmon).remotes.keys() teuthology.wait_until_osds_up( ctx, cluster=ctx.cluster, remote=mon0_remote, ceph_cluster=cluster_name, ) teuthology.wait_until_healthy( ctx, remote=mon0_remote, ceph_cluster=cluster_name, ) if ctx.cluster.only(teuthology.is_type('mds', cluster_name)).remotes: # Some MDSs exist, wait for them to be healthy ceph_fs = Filesystem(ctx) # TODO: make Filesystem cluster-aware ceph_fs.wait_for_daemons(timeout=300)
def healthy(ctx, config): """ Wait for all osd's to be up, and for the ceph health monitor to return HEALTH_OK. :param ctx: Context :param config: Configuration """ config = config if isinstance(config, dict) else dict() cluster_name = config.get('cluster', 'ceph') log.info('Waiting until ceph cluster %s is healthy...', cluster_name) firstmon = teuthology.get_first_mon(ctx, config, cluster_name) (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys() teuthology.wait_until_osds_up( ctx, cluster=ctx.cluster, remote=mon0_remote, ceph_cluster=cluster_name, ) teuthology.wait_until_healthy( ctx, remote=mon0_remote, ceph_cluster=cluster_name, ) if ctx.cluster.only(teuthology.is_type('mds', cluster_name)).remotes: # Some MDSs exist, wait for them to be healthy ceph_fs = Filesystem(ctx) # TODO: make Filesystem cluster-aware ceph_fs.wait_for_daemons(timeout=300)
def healthy(ctx, config):
    log.info('Waiting until ceph is healthy...')
    firstmon = teuthology.get_first_mon(ctx, config)
    (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
    teuthology.wait_until_osds_up(
        cluster=ctx.cluster,
        remote=mon0_remote
    )
    teuthology.wait_until_healthy(
        remote=mon0_remote,
    )
def healthy(ctx, config): """ Wait for all osd's to be up, and for the ceph health monitor to return HEALTH_OK. :param ctx: Context :param config: Configuration """ log.info('Waiting until ceph is healthy...') firstmon = teuthology.get_first_mon(ctx, config) (mon0_remote, ) = ctx.cluster.only(firstmon).remotes.keys() teuthology.wait_until_osds_up(ctx, cluster=ctx.cluster, remote=mon0_remote) teuthology.wait_until_healthy( ctx, remote=mon0_remote, )
def healthy(ctx, config): """ Wait for all osd's to be up, and for the ceph health monitor to return HEALTH_OK. :param ctx: Context :param config: Configuration """ log.info('Waiting until ceph is healthy...') firstmon = teuthology.get_first_mon(ctx, config) (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys() teuthology.wait_until_osds_up( ctx, cluster=ctx.cluster, remote=mon0_remote ) teuthology.wait_until_healthy( ctx, remote=mon0_remote, )
def healthy(ctx, config): """ Wait for all osd's to be up, and for the ceph health monitor to return HEALTH_OK. :param ctx: Context :param config: Configuration """ log.info('Waiting until ceph is healthy...') firstmon = teuthology.get_first_mon(ctx, config) (mon0_remote, ) = ctx.cluster.only(firstmon).remotes.keys() teuthology.wait_until_osds_up(ctx, cluster=ctx.cluster, remote=mon0_remote) teuthology.wait_until_healthy( ctx, remote=mon0_remote, ) if ctx.cluster.only(teuthology.is_type('mds')).remotes: # Some MDSs exist, wait for them to be healthy ceph_fs = Filesystem(ctx) ceph_fs.wait_for_daemons(timeout=300)
def healthy(ctx, config): """ Wait for all osd's to be up, and for the ceph health monitor to return HEALTH_OK. :param ctx: Context :param config: Configuration """ log.info('Waiting until ceph is healthy...') firstmon = teuthology.get_first_mon(ctx, config) (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys() teuthology.wait_until_osds_up( ctx, cluster=ctx.cluster, remote=mon0_remote ) teuthology.wait_until_healthy( ctx, remote=mon0_remote, ) if ctx.cluster.only(teuthology.is_type('mds')).remotes: # Some MDSs exist, wait for them to be healthy ceph_fs = Filesystem(ctx) ceph_fs.wait_for_daemons(timeout=300)
def task(ctx, config): """ - tasks: ceph-deploy: systemd: Test ceph systemd services can start, stop and restart and check for any failed services and report back errors """ for remote, roles in ctx.cluster.remotes.items(): remote.run(args=['sudo', 'ps', '-eaf', run.Raw('|'), 'grep', 'ceph']) units = remote.sh('sudo systemctl list-units | grep ceph', check_status=False) log.info(units) if units.find('failed'): log.info("Ceph services in failed state") # test overall service stop and start using ceph.target # ceph.target tests are meant for ceph systemd tests # and not actual process testing using 'ps' log.info("Stopping all Ceph services") remote.run(args=['sudo', 'systemctl', 'stop', 'ceph.target']) status = _remote_service_status(remote, 'ceph.target') log.info(status) log.info("Checking process status") ps_eaf = remote.sh('sudo ps -eaf | grep ceph') if ps_eaf.find('Active: inactive'): log.info("Successfully stopped all ceph services") else: log.info("Failed to stop ceph services") log.info("Starting all Ceph services") remote.run(args=['sudo', 'systemctl', 'start', 'ceph.target']) status = _remote_service_status(remote, 'ceph.target') log.info(status) if status.find('Active: active'): log.info("Successfully started all Ceph services") else: log.info("info", "Failed to start Ceph services") ps_eaf = remote.sh('sudo ps -eaf | grep ceph') log.info(ps_eaf) time.sleep(4) # test individual services start stop name = remote.shortname mon_name = 'ceph-mon@' + name + '.service' mds_name = 'ceph-mds@' + name + '.service' mgr_name = 'ceph-mgr@' + name + '.service' mon_role_name = 'mon.' + name mds_role_name = 'mds.' + name mgr_role_name = 'mgr.' + name m_osd = re.search('--id (\d+) --setuser ceph', ps_eaf) if m_osd: osd_service = 'ceph-osd@{m}.service'.format(m=m_osd.group(1)) remote.run(args=['sudo', 'systemctl', 'status', osd_service]) remote.run(args=['sudo', 'systemctl', 'stop', osd_service]) time.sleep(4) # immediate check will result in deactivating state status = _remote_service_status(remote, osd_service) log.info(status) if status.find('Active: inactive'): log.info("Successfully stopped single osd ceph service") else: log.info("Failed to stop ceph osd services") remote.sh(['sudo', 'systemctl', 'start', osd_service]) time.sleep(4) if mon_role_name in roles: remote.run(args=['sudo', 'systemctl', 'status', mon_name]) remote.run(args=['sudo', 'systemctl', 'stop', mon_name]) time.sleep(4) # immediate check will result in deactivating state status = _remote_service_status(remote, mon_name) if status.find('Active: inactive'): log.info("Successfully stopped single mon ceph service") else: log.info("Failed to stop ceph mon service") remote.run(args=['sudo', 'systemctl', 'start', mon_name]) time.sleep(4) if mgr_role_name in roles: remote.run(args=['sudo', 'systemctl', 'status', mgr_name]) remote.run(args=['sudo', 'systemctl', 'stop', mgr_name]) time.sleep(4) # immediate check will result in deactivating state status = _remote_service_status(remote, mgr_name) if status.find('Active: inactive'): log.info("Successfully stopped single ceph mgr service") else: log.info("Failed to stop ceph mgr service") remote.run(args=['sudo', 'systemctl', 'start', mgr_name]) time.sleep(4) if mds_role_name in roles: remote.run(args=['sudo', 'systemctl', 'status', mds_name]) remote.run(args=['sudo', 'systemctl', 'stop', mds_name]) time.sleep(4) # immediate check will result in deactivating state status = _remote_service_status(remote, mds_name) if status.find('Active: inactive'): log.info("Successfully stopped single 
ceph mds service") else: log.info("Failed to stop ceph mds service") remote.run(args=['sudo', 'systemctl', 'start', mds_name]) time.sleep(4) # reboot all nodes and verify the systemd units restart # workunit that runs would fail if any of the systemd unit doesnt start ctx.cluster.run(args='sudo reboot', wait=False, check_status=False) # avoid immediate reconnect time.sleep(120) reconnect(ctx, 480) # reconnect all nodes # for debug info ctx.cluster.run(args=['sudo', 'ps', '-eaf', run.Raw('|'), 'grep', 'ceph']) # wait for HEALTH_OK mon = get_first_mon(ctx, config) (mon_remote, ) = ctx.cluster.only(mon).remotes.keys() wait_until_healthy(ctx, mon_remote, use_sudo=True) yield
def upgrade(ctx, config): """ Upgrade using ceph-deploy eg: ceph-deploy.upgrade: # to upgrade to specific branch, use branch: stable: jewel # to setup mgr node, use setup-mgr-node: True # to wait for cluster to be healthy after all upgrade, use wait-for-healthy: True role: (upgrades the below roles serially) mon.a mon.b osd.0 """ roles = config.get('roles') # get the roles that are mapped as per ceph-deploy # roles are mapped for mon/mds eg: mon.a => mon.host_short_name mapped_role = ctx.cluster.mapped_role log.info("roles={r}, mapped_roles={mr}".format(r=roles, mr=mapped_role)) if config.get('branch'): branch = config.get('branch') (var, val) = branch.items()[0] ceph_branch = '--{var}={val}'.format(var=var, val=val) else: # default to wip-branch under test dev_branch = ctx.config['branch'] ceph_branch = '--dev={branch}'.format(branch=dev_branch) # get the node used for initial deployment which is mon.a mon_a = mapped_role.get('mon.a') (ceph_admin,) = ctx.cluster.only(mon_a).remotes.keys() testdir = teuthology.get_testdir(ctx) cmd = './ceph-deploy install ' + ceph_branch for role in roles: # check if this role is mapped (mon or mds) if mapped_role.get(role): role = mapped_role.get(role) remotes_and_roles = ctx.cluster.only(role).remotes for remote, roles in remotes_and_roles.items(): nodename = remote.shortname cmd = cmd + ' ' + nodename log.info("Upgrading ceph on %s", nodename) ceph_admin.run( args=[ 'cd', '{tdir}/ceph-deploy'.format(tdir=testdir), run.Raw('&&'), run.Raw(cmd), ], ) # restart all ceph services, ideally upgrade should but it does not remote.run( args=[ 'sudo', 'systemctl', 'restart', 'ceph.target' ] ) ceph_admin.run(args=['sudo', 'ceph', '-s']) # workaround for http://tracker.ceph.com/issues/20950 # write the correct mgr key to disk if config.get('setup-mgr-node', None): mons = ctx.cluster.only(teuthology.is_type('mon')) for remote, roles in mons.remotes.items(): remote.run( args=[ run.Raw('sudo ceph auth get client.bootstrap-mgr'), run.Raw('|'), run.Raw('sudo tee'), run.Raw('/var/lib/ceph/bootstrap-mgr/ceph.keyring') ] ) if config.get('setup-mgr-node', None): mgr_nodes = get_nodes_using_role(ctx, 'mgr') mgr_nodes = " ".join(mgr_nodes) mgr_install = './ceph-deploy install --mgr ' + ceph_branch + " " + mgr_nodes mgr_create = './ceph-deploy mgr create' + " " + mgr_nodes # install mgr ceph_admin.run( args=[ 'cd', '{tdir}/ceph-deploy'.format(tdir=testdir), run.Raw('&&'), run.Raw(mgr_install), ], ) # create mgr ceph_admin.run( args=[ 'cd', '{tdir}/ceph-deploy'.format(tdir=testdir), run.Raw('&&'), run.Raw(mgr_create), ], ) ceph_admin.run(args=['sudo', 'ceph', '-s']) if config.get('wait-for-healthy', None): wait_until_healthy(ctx, ceph_admin, use_sudo=True) yield
def wait_until_healthy(ctx, config):
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon_remote,) = ctx.cluster.only(first_mon).remotes.keys()
    teuthology.wait_until_healthy(ctx, mon_remote)
def upgrade(ctx, config): """ Upgrade using ceph-deploy eg: ceph-deploy.upgrade: # to upgrade to specific branch, use branch: stable: jewel # to setup mgr node, use setup-mgr-node: True # to wait for cluster to be healthy after all upgrade, use wait-for-healthy: True role: (upgrades the below roles serially) mon.a mon.b osd.0 """ roles = config.get('roles') # get the roles that are mapped as per ceph-deploy # roles are mapped for mon/mds eg: mon.a => mon.host_short_name mapped_role = ctx.cluster.mapped_role if config.get('branch'): branch = config.get('branch') (var, val) = branch.items()[0] ceph_branch = '--{var}={val}'.format(var=var, val=val) else: # default to master ceph_branch = '--dev=master' # get the node used for initial deployment which is mon.a mon_a = mapped_role.get('mon.a') (ceph_admin,) = ctx.cluster.only(mon_a).remotes.iterkeys() testdir = teuthology.get_testdir(ctx) cmd = './ceph-deploy install ' + ceph_branch for role in roles: # check if this role is mapped (mon or mds) if mapped_role.get(role): role = mapped_role.get(role) remotes_and_roles = ctx.cluster.only(role).remotes for remote, roles in remotes_and_roles.iteritems(): nodename = remote.shortname cmd = cmd + ' ' + nodename log.info("Upgrading ceph on %s", nodename) ceph_admin.run( args=[ 'cd', '{tdir}/ceph-deploy'.format(tdir=testdir), run.Raw('&&'), run.Raw(cmd), ], ) # restart all ceph services, ideally upgrade should but it does not remote.run( args=[ 'sudo', 'systemctl', 'restart', 'ceph.target' ] ) ceph_admin.run(args=['sudo', 'ceph', '-s']) # workaround for http://tracker.ceph.com/issues/20950 # write the correct mgr key to disk if config.get('setup-mgr-node', None): mons = ctx.cluster.only(teuthology.is_type('mon')) for remote, roles in mons.remotes.iteritems(): remote.run( args=[ run.Raw('sudo ceph auth get client.bootstrap-mgr'), run.Raw('|'), run.Raw('sudo tee'), run.Raw('/var/lib/ceph/bootstrap-mgr/ceph.keyring') ] ) if config.get('setup-mgr-node', None): mgr_nodes = get_nodes_using_role(ctx, 'mgr') mgr_nodes = " ".join(mgr_nodes) mgr_install = './ceph-deploy install --mgr ' + ceph_branch + " " + mgr_nodes mgr_create = './ceph-deploy mgr create' + " " + mgr_nodes # install mgr ceph_admin.run( args=[ 'cd', '{tdir}/ceph-deploy'.format(tdir=testdir), run.Raw('&&'), run.Raw(mgr_install), ], ) # create mgr ceph_admin.run( args=[ 'cd', '{tdir}/ceph-deploy'.format(tdir=testdir), run.Raw('&&'), run.Raw(mgr_create), ], ) ceph_admin.run(args=['sudo', 'ceph', '-s']) if config.get('wait-for-healthy', None): wait_until_healthy(ctx, ceph_admin, use_sudo=True) yield
def wait_until_healthy(ctx, config):
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()
    teuthology.wait_until_healthy(ctx, mon_remote)
def task(ctx, config): """ - tasks: ceph-deploy: systemd: Test ceph systemd services can start, stop and restart and check for any failed services and report back errors """ for remote, roles in ctx.cluster.remotes.iteritems(): remote.run(args=['sudo', 'ps', '-eaf', run.Raw('|'), 'grep', 'ceph']) r = remote.run(args=['sudo', 'systemctl', 'list-units', run.Raw('|'), 'grep', 'ceph'], stdout=StringIO(), check_status=False) log.info(r.stdout.getvalue()) if r.stdout.getvalue().find('failed'): log.info("Ceph services in failed state") # test overall service stop and start using ceph.target # ceph.target tests are meant for ceph systemd tests # and not actual process testing using 'ps' log.info("Stopping all Ceph services") remote.run(args=['sudo', 'systemctl', 'stop', 'ceph.target']) r = remote.run(args=['sudo', 'systemctl', 'status', 'ceph.target'], stdout=StringIO(), check_status=False) log.info(r.stdout.getvalue()) log.info("Checking process status") r = remote.run(args=['sudo', 'ps', '-eaf', run.Raw('|'), 'grep', 'ceph'], stdout=StringIO()) if r.stdout.getvalue().find('Active: inactive'): log.info("Sucessfully stopped all ceph services") else: log.info("Failed to stop ceph services") log.info("Starting all Ceph services") remote.run(args=['sudo', 'systemctl', 'start', 'ceph.target']) r = remote.run(args=['sudo', 'systemctl', 'status', 'ceph.target'], stdout=StringIO()) log.info(r.stdout.getvalue()) if r.stdout.getvalue().find('Active: active'): log.info("Sucessfully started all Ceph services") else: log.info("info", "Failed to start Ceph services") r = remote.run(args=['sudo', 'ps', '-eaf', run.Raw('|'), 'grep', 'ceph'], stdout=StringIO()) log.info(r.stdout.getvalue()) time.sleep(4) # test individual services start stop name = remote.shortname mon_name = 'ceph-mon@' + name + '.service' mds_name = 'ceph-mds@' + name + '.service' mgr_name = 'ceph-mgr@' + name + '.service' mon_role_name = 'mon.' + name mds_role_name = 'mds.' + name mgr_role_name = 'mgr.' 
+ name m_osd = re.search('--id (\d+) --setuser ceph', r.stdout.getvalue()) if m_osd: osd_service = 'ceph-osd@{m}.service'.format(m=m_osd.group(1)) remote.run(args=['sudo', 'systemctl', 'status', osd_service]) remote.run(args=['sudo', 'systemctl', 'stop', osd_service]) time.sleep(4) # immediate check will result in deactivating state r = remote.run(args=['sudo', 'systemctl', 'status', osd_service], stdout=StringIO(), check_status=False) log.info(r.stdout.getvalue()) if r.stdout.getvalue().find('Active: inactive'): log.info("Sucessfully stopped single osd ceph service") else: log.info("Failed to stop ceph osd services") remote.run(args=['sudo', 'systemctl', 'start', osd_service]) time.sleep(4) if mon_role_name in roles: remote.run(args=['sudo', 'systemctl', 'status', mon_name]) remote.run(args=['sudo', 'systemctl', 'stop', mon_name]) time.sleep(4) # immediate check will result in deactivating state r = remote.run(args=['sudo', 'systemctl', 'status', mon_name], stdout=StringIO(), check_status=False) if r.stdout.getvalue().find('Active: inactive'): log.info("Sucessfully stopped single mon ceph service") else: log.info("Failed to stop ceph mon service") remote.run(args=['sudo', 'systemctl', 'start', mon_name]) time.sleep(4) if mgr_role_name in roles: remote.run(args=['sudo', 'systemctl', 'status', mgr_name]) remote.run(args=['sudo', 'systemctl', 'stop', mgr_name]) time.sleep(4) # immediate check will result in deactivating state r = remote.run(args=['sudo', 'systemctl', 'status', mgr_name], stdout=StringIO(), check_status=False) if r.stdout.getvalue().find('Active: inactive'): log.info("Sucessfully stopped single ceph mgr service") else: log.info("Failed to stop ceph mgr service") remote.run(args=['sudo', 'systemctl', 'start', mgr_name]) time.sleep(4) if mds_role_name in roles: remote.run(args=['sudo', 'systemctl', 'status', mds_name]) remote.run(args=['sudo', 'systemctl', 'stop', mds_name]) time.sleep(4) # immediate check will result in deactivating state r = remote.run(args=['sudo', 'systemctl', 'status', mds_name], stdout=StringIO(), check_status=False) if r.stdout.getvalue().find('Active: inactive'): log.info("Sucessfully stopped single ceph mds service") else: log.info("Failed to stop ceph mds service") remote.run(args=['sudo', 'systemctl', 'start', mds_name]) time.sleep(4) # reboot all nodes and verify the systemd units restart # workunit that runs would fail if any of the systemd unit doesnt start ctx.cluster.run(args='sudo reboot', wait=False, check_status=False) # avoid immediate reconnect time.sleep(120) reconnect(ctx, 480) # reconnect all nodes # for debug info ctx.cluster.run(args=['sudo', 'ps', '-eaf', run.Raw('|'), 'grep', 'ceph']) # wait for HEALTH_OK mon = get_first_mon(ctx, config) (mon_remote,) = ctx.cluster.only(mon).remotes.iterkeys() wait_until_healthy(ctx, mon_remote) yield