def validate_cluster(ctx):
    """
    Check that there is exactly one master and at least one slave configured
    """
    log.info('Validating Hadoop configuration')
    slaves = ctx.cluster.only(teuthology.is_type('hadoop.slave'))

    if len(slaves.remotes) < 1:
        raise Exception("At least one hadoop.slave must be specified")
    else:
        log.info(str(len(slaves.remotes)) + " slaves specified")

    masters = ctx.cluster.only(teuthology.is_type('hadoop.master'))
    if len(masters.remotes) != 1:
        raise Exception(
            "Exactly one hadoop.master must be specified. "
            "Currently there are " + str(len(masters.remotes)))

    try:
        yield
    finally:
        pass

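# NOTE (illustrative sketch, not the actual teuthology source): most functions
# in this file filter the cluster with teuthology.is_type. The helper can be
# thought of as returning a predicate that matches roles by their type prefix;
# the real implementation also understands cluster-qualified roles.
def _is_type_sketch(type_):
    def matches(role):
        # e.g. 'hadoop.slave.0' matches type 'hadoop.slave'
        return role == type_ or role.startswith(type_ + '.')
    return matches

# usage: [r for r in ['hadoop.master.0', 'hadoop.slave.0', 'hadoop.slave.1']
#         if _is_type_sketch('hadoop.slave')(r)] yields the two slave roles
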
def execute(ctx, config):
    procs = []
    osds = ctx.cluster.only(teuthology.is_type('osd'))
    for remote, roles_for_host in osds.remotes.iteritems():
        roles_to_devs = ctx.disk_config.remote_to_roles_to_dev[remote]
        for id_ in teuthology.roles_of_type(roles_for_host, 'osd'):
            if roles_to_devs.get(id_):
                dev = roles_to_devs[id_]
                log.info("running blktrace on %s: %s" % (remote.name, dev))

                # log_dir, daemon_signal and blktrace appear to come from
                # module scope in the original task file
                proc = remote.run(
                    args=[
                        'cd',
                        log_dir,
                        run.Raw(';'),
                        '/tmp/cephtest/daemon-helper',
                        daemon_signal,
                        'sudo',
                        blktrace,
                        '-o',
                        dev.rsplit("/", 1)[1],
                        '-d',
                        dev,
                    ],
                    wait=False,
                    stdin=run.PIPE,
                )
                procs.append(proc)
    try:
        yield
    finally:
        osds = ctx.cluster.only(teuthology.is_type('osd'))
        log.info('stopping blktrace processes')
        for proc in procs:
            proc.stdin.close()

def task(ctx, config):
    """
    Test monitor recovery from OSD
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'task only accepts a dict for configuration'

    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'))

    mons = ctx.cluster.only(teuthology.is_type('mon'))
    # note down the first cluster_name and mon_id
    # we will recover it later on
    cluster_name, _, mon_id = teuthology.split_role(first_mon)
    _nuke_mons(manager, mons, mon_id)
    default_keyring = '/etc/ceph/{cluster}.keyring'.format(
        cluster=cluster_name)
    keyring_path = config.get('keyring_path', default_keyring)
    _rebuild_db(ctx, manager, cluster_name, mon, mon_id, keyring_path)
    _revive_mons(manager, mons, mon_id, keyring_path)
    _revive_mgrs(ctx, manager)
    _revive_osds(ctx, manager)

def healthy(ctx, config):
    """
    Wait for all OSDs to be up, and for the ceph health monitor to return
    HEALTH_OK.

    :param ctx: Context
    :param config: Configuration
    """
    config = config if isinstance(config, dict) else dict()
    cluster_name = config.get('cluster', 'ceph')
    log.info('Waiting until ceph cluster %s is healthy...', cluster_name)
    firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
    (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
    teuthology.wait_until_osds_up(
        ctx,
        cluster=ctx.cluster,
        remote=mon0_remote,
        ceph_cluster=cluster_name,
    )
    teuthology.wait_until_healthy(
        ctx,
        remote=mon0_remote,
        ceph_cluster=cluster_name,
    )

    if ctx.cluster.only(teuthology.is_type('mds', cluster_name)).remotes:
        # Some MDSs exist, wait for them to be healthy
        ceph_fs = Filesystem(ctx)  # TODO: make Filesystem cluster-aware
        ceph_fs.wait_for_daemons(timeout=300)

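# NOTE (minimal sketch, assuming only that `ceph health` eventually prints
# HEALTH_OK): teuthology.wait_until_healthy amounts to a polling loop like the
# one below. check_health is a hypothetical callable returning the output of
# `ceph health` on the monitor remote.
import time

def _wait_until_healthy_sketch(check_health, timeout=300, interval=5):
    end = time.time() + timeout
    while time.time() < end:
        if check_health().startswith('HEALTH_OK'):
            return
        time.sleep(interval)
    raise RuntimeError('cluster not healthy after %d seconds' % timeout)
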
def get_masters_data(ctx):
    tempdir = teuthology.get_testdir(ctx)
    path = "{tdir}/hadoop/etc/hadoop/masters".format(tdir=tempdir)
    nodes = ctx.cluster.only(teuthology.is_type('hadoop.master'))
    hosts = [s.ssh.get_transport().getpeername()[0] for s in nodes.remotes]
    data = '\n'.join(hosts)
    return path, data

def get_core_site_data(ctx, config):
    tempdir = teuthology.get_testdir(ctx)
    path = "{tdir}/hadoop/etc/hadoop/core-site.xml".format(tdir=tempdir)
    nodes = ctx.cluster.only(teuthology.is_type('hadoop.master'))
    host = [s.ssh.get_transport().getpeername()[0] for s in nodes.remotes][0]

    conf = {}
    if config.get('hdfs', False):
        conf.update({
            'fs.defaultFS': 'hdfs://{namenode}:9000',
            'hadoop.tmp.dir': '{tdir}/hadoop_tmp',
        })
    else:
        conf.update({
            'fs.default.name': 'ceph://{namenode}:6789/',
            'fs.defaultFS': 'ceph://{namenode}:6789/',
            'ceph.conf.file': '/etc/ceph/ceph.conf',
            'ceph.mon.address': '{namenode}:6789',
            'ceph.auth.id': 'admin',
            #'ceph.data.pools': 'cephfs_data',
            'fs.AbstractFileSystem.ceph.impl': 'org.apache.hadoop.fs.ceph.CephFs',
            'fs.ceph.impl': 'org.apache.hadoop.fs.ceph.CephFileSystem',
        })

    data_tmpl = dict_to_hadoop_conf(conf)
    return path, data_tmpl.format(tdir=tempdir, namenode=host)

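# NOTE (assumption): dict_to_hadoop_conf is not shown in this file. Given how
# its output is consumed above, a plausible sketch renders the dict as
# Hadoop's <configuration> XML; property values such as '{namenode}' are left
# intact for the later .format() call.
def _dict_to_hadoop_conf_sketch(conf):
    props = '\n'.join(
        '  <property>\n'
        '    <name>%s</name>\n'
        '    <value>%s</value>\n'
        '  </property>' % (name, value)
        for name, value in conf.items())
    return '<?xml version="1.0"?>\n<configuration>\n%s\n</configuration>\n' % props
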
def run_rest_api_daemon(ctx, api_clients):
    if not hasattr(ctx, 'daemons'):
        ctx.daemons = CephState()
    remotes = ctx.cluster.only(teuthology.is_type('client')).remotes
    testdir = teuthology.get_testdir(ctx)
    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
    for rems, roles in remotes.iteritems():
        for whole_id_ in roles:
            if whole_id_ in api_clients:
                # len('clients') == len('client.'), so this strips the
                # 'client.' prefix from roles like 'client.0'
                id_ = whole_id_[len('clients'):]
                run_cmd = [
                    'sudo',
                    'daemon-helper',
                    'kill',
                    'ceph-rest-api',
                    '-n',
                    'client.rest{id}'.format(id=id_),
                ]
                cl_rest_id = 'client.rest{id}'.format(id=id_)
                ctx.daemons.add_daemon(rems, 'restapi',
                                       cl_rest_id,
                                       args=run_cmd,
                                       logger=log.getChild(cl_rest_id),
                                       stdin=run.PIPE,
                                       wait=False,
                                       )
    try:
        yield
    finally:
        # TODO: destroy daemons started -- modify iter_daemons_of_role
        teuthology.stop_daemons_of_type(ctx, 'restapi')

def write_mapred_site(ctx):
    mapredSiteFile = "{tdir}/apache_hadoop/conf/mapred-site.xml".format(
        tdir=teuthology.get_testdir(ctx))

    master_ip = get_hadoop_master_ip(ctx)
    log.info("adding host {remote} as jobtracker".format(remote=master_ip))

    hadoopNodes = ctx.cluster.only(teuthology.is_type("hadoop"))
    for remote, roles_for_host in hadoopNodes.remotes.iteritems():
        teuthology.write_file(
            remote,
            mapredSiteFile,
            """<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<!-- Put site-specific property overrides in this file. -->

<configuration>
    <property>
        <name>mapred.job.tracker</name>
        <value>{remote}:54311</value>
    </property>
</configuration>
""".format(remote=master_ip),
        )

        log.info("wrote file: " + mapredSiteFile + " to host: " + str(remote))

def create_keyring(ctx):
    log.info('Setting up client nodes...')
    clients = ctx.cluster.only(teuthology.is_type('client'))
    testdir = teuthology.get_testdir(ctx)
    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
    for remote, roles_for_host in clients.remotes.iteritems():
        for id_ in teuthology.roles_of_type(roles_for_host, 'client'):
            client_keyring = '/etc/ceph/ceph.client.{id}.keyring'.format(
                id=id_)
            remote.run(
                args=[
                    'sudo',
                    'adjust-ulimits',
                    'ceph-coverage',
                    coverage_dir,
                    'ceph-authtool',
                    '--create-keyring',
                    '--gen-key',
                    # TODO this --name= is not really obeyed, all unknown
                    # "types" are munged to "client"
                    '--name=client.{id}'.format(id=id_),
                    client_keyring,
                    run.Raw('&&'),
                    'sudo',
                    'chmod',
                    '0644',
                    client_keyring,
                ],
            )

def write_mapred_site(ctx):
    """
    Add required entries to conf/mapred-site.xml
    """
    mapred_site_file = "{tdir}/apache_hadoop/conf/mapred-site.xml".format(
        tdir=teuthology.get_testdir(ctx))

    master_ip = get_hadoop_master_ip(ctx)
    log.info('adding host {remote} as jobtracker'.format(remote=master_ip))

    hadoop_nodes = ctx.cluster.only(teuthology.is_type('hadoop'))
    for remote in hadoop_nodes.remotes:
        teuthology.write_file(remote, mapred_site_file,
                              '''<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<!-- Put site-specific property overrides in this file. -->

<configuration>
    <property>
        <name>mapred.job.tracker</name>
        <value>{remote}:54311</value>
    </property>
</configuration>
'''.format(remote=master_ip))
        log.info("wrote file: " + mapred_site_file + " to host: " +
                 str(remote))

def create_keyring(ctx, cluster_name):
    """
    Set up key ring on remote sites
    """
    log.info('Setting up client nodes...')
    clients = ctx.cluster.only(teuthology.is_type('client', cluster_name))
    testdir = teuthology.get_testdir(ctx)
    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
    for remote, roles_for_host in clients.remotes.iteritems():
        for role in teuthology.cluster_roles_of_type(roles_for_host, 'client',
                                                     cluster_name):
            name = teuthology.ceph_role(role)
            client_keyring = '/etc/ceph/{0}.{1}.keyring'.format(cluster_name,
                                                                name)
            remote.run(
                args=[
                    'sudo',
                    'adjust-ulimits',
                    'ceph-coverage',
                    coverage_dir,
                    'ceph-authtool',
                    '--create-keyring',
                    '--gen-key',
                    # TODO this --name= is not really obeyed, all unknown
                    # "types" are munged to "client"
                    '--name={name}'.format(name=name),
                    client_keyring,
                    run.Raw('&&'),
                    'sudo',
                    'chmod',
                    '0644',
                    client_keyring,
                ],
            )

def cephfs_setup(ctx, config):
    testdir = teuthology.get_testdir(ctx)
    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)

    first_mon = teuthology.get_first_mon(ctx, config)
    (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()
    mdss = ctx.cluster.only(teuthology.is_type('mds'))
    # If there are any MDSs, then create a filesystem for them to use
    # Do this last because it requires the mon cluster to be up and running
    if mdss.remotes:
        log.info('Setting up CephFS filesystem...')

        ceph_fs = Filesystem(ctx)
        if not ceph_fs.legacy_configured():
            ceph_fs.create()

        is_active_mds = lambda role: role.startswith('mds.') and \
            not role.endswith('-s') and role.find('-s-') == -1
        all_roles = [item for remote_roles in mdss.remotes.values()
                     for item in remote_roles]
        num_active = len([r for r in all_roles if is_active_mds(r)])
        mon_remote.run(args=[
            'adjust-ulimits',
            'ceph-coverage',
            coverage_dir,
            'ceph',
            'mds', 'set_max_mds', str(num_active)])

    yield

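# The is_active_mds lambda above relies on the role-naming convention that
# standby MDS roles end in '-s' or embed '-s-'. For example:
_is_active = lambda role: role.startswith('mds.') and \
    not role.endswith('-s') and role.find('-s-') == -1
assert _is_active('mds.a')
assert not _is_active('mds.a-s')    # standby
assert not _is_active('mds.a-s-b')  # standby ('-s-' form)
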
def write_core_site(ctx, config):
    coreSiteFile = "/tmp/cephtest/hadoop/conf/core-site.xml"
    hadoopNodes = ctx.cluster.only(teuthology.is_type('hadoop'))
    for remote, roles_for_host in hadoopNodes.remotes.iteritems():

        # check the config to see if we should use hdfs or ceph
        default_fs_string = ""
        if config.get('hdfs'):
            default_fs_string = 'hdfs://{master_ip}:54310'.format(
                master_ip=get_hadoop_master_ip(ctx))
        else:
            default_fs_string = 'ceph:///'

        teuthology.write_file(remote, coreSiteFile,
                              '''<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<!-- Put site-specific property overrides in this file. -->

<configuration>
    <property>
        <name>hadoop.tmp.dir</name>
        <value>/tmp/hadoop/tmp</value>
    </property>
    <property>
        <name>fs.default.name</name>
        <value>{default_fs}</value>
    </property>
    <property>
        <name>ceph.conf.file</name>
        <value>/tmp/cephtest/ceph.conf</value>
    </property>
</configuration>
'''.format(default_fs=default_fs_string))

        log.info("wrote file: " + coreSiteFile + " to host: " + str(remote))

def _revive_mons(manager, mons, recovered, keyring_path):
    # revive monitors
    # the initial monmap is in the ceph.conf, so we are good.
    n_mons = 0
    is_mon = teuthology.is_type('mon')
    for remote, roles in mons.remotes.iteritems():
        for role in roles:
            if not is_mon(role):
                continue
            cluster, _, m = teuthology.split_role(role)
            if recovered != m:
                log.info('running mkfs on {cluster}:mon.{mon}'.format(
                    cluster=cluster,
                    mon=m))
                remote.run(
                    args=[
                        'sudo',
                        'ceph-mon',
                        '--cluster', cluster,
                        '--mkfs',
                        '-i', m,
                        '--keyring', keyring_path])
            log.info('reviving mon.{0}'.format(m))
            manager.revive_mon(m)
            n_mons += 1
    manager.wait_for_mon_quorum_size(n_mons, timeout=30)

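# NOTE (sketch of an assumed helper): teuthology.split_role breaks a role
# string into (cluster, type, id), defaulting the cluster to 'ceph' when the
# role is unqualified. Roughly:
def _split_role_sketch(role, default_cluster='ceph'):
    cluster = default_cluster
    if role.count('.') > 1:
        cluster, role = role.split('.', 1)
    type_, id_ = role.split('.', 1)
    return cluster, type_, id_

# _split_role_sketch('mon.a')        -> ('ceph', 'mon', 'a')
# _split_role_sketch('backup.osd.2') -> ('backup', 'osd', '2')
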
def _spawn_on_all_clients(ctx, refspec, tests, env, subdir, timeout=None):
    """
    Make a scratch directory for each client in the cluster, and then for each
    test spawn _run_tests() for each role.

    See run_tests() for parameter documentation.
    """
    is_client = misc.is_type('client')
    client_remotes = {}
    created_mountpoint = {}
    for remote, roles_for_host in ctx.cluster.remotes.items():
        for role in roles_for_host:
            if is_client(role):
                client_remotes[role] = remote
                created_mountpoint[role] = _make_scratch_dir(ctx, role, subdir)

    for unit in tests:
        with parallel() as p:
            for role, remote in client_remotes.items():
                p.spawn(_run_tests, ctx, refspec, role, [unit], env, subdir,
                        timeout=timeout)

    # cleanup the generated client directories
    for role, _ in client_remotes.items():
        _delete_dir(ctx, role, created_mountpoint[role])

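# NOTE (illustrative, standard-library equivalent): the parallel() context
# manager used above runs each spawned call concurrently and re-raises any
# failure when the block exits, much like this ThreadPoolExecutor pattern:
from concurrent.futures import ThreadPoolExecutor

def _run_unit_on_all_clients(run_test, client_remotes, unit):
    with ThreadPoolExecutor(max_workers=len(client_remotes)) as pool:
        futures = [pool.submit(run_test, role, remote, unit)
                   for role, remote in client_remotes.items()]
        for f in futures:
            f.result()  # propagates the first failure, like parallel()
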
def configure(ctx, config, hadoops):
    tempdir = teuthology.get_testdir(ctx)

    log.info("Writing Hadoop slaves file...")
    for remote in hadoops.remotes:
        path, data = get_slaves_data(ctx)
        teuthology.write_file(remote, path, StringIO(data))

    log.info("Writing Hadoop masters file...")
    for remote in hadoops.remotes:
        path, data = get_masters_data(ctx)
        teuthology.write_file(remote, path, StringIO(data))

    log.info("Writing Hadoop core-site.xml file...")
    for remote in hadoops.remotes:
        path, data = get_core_site_data(ctx, config)
        teuthology.write_file(remote, path, StringIO(data))

    log.info("Writing Hadoop yarn-site.xml file...")
    for remote in hadoops.remotes:
        path, data = get_yarn_site_data(ctx)
        teuthology.write_file(remote, path, StringIO(data))

    log.info("Writing Hadoop hdfs-site.xml file...")
    for remote in hadoops.remotes:
        path, data = get_hdfs_site_data(ctx)
        teuthology.write_file(remote, path, StringIO(data))

    log.info("Writing Hadoop mapred-site.xml file...")
    for remote in hadoops.remotes:
        path, data = get_mapred_site_data(ctx)
        teuthology.write_file(remote, path, StringIO(data))

    log.info("Setting JAVA_HOME in hadoop-env.sh...")
    for remote in hadoops.remotes:
        path = "{tdir}/hadoop/etc/hadoop/hadoop-env.sh".format(tdir=tempdir)
        if remote.os.package_type == 'rpm':
            data = "JAVA_HOME=/usr/lib/jvm/java\n"
        elif remote.os.package_type == 'deb':
            data = "JAVA_HOME=/usr/lib/jvm/default-java\n"
        else:
            raise UnsupportedPackageTypeError(remote)
        teuthology.prepend_lines_to_file(remote, path, data)

    if config.get('hdfs', False):
        log.info("Formatting HDFS...")
        testdir = teuthology.get_testdir(ctx)
        hadoop_dir = "{tdir}/hadoop/".format(tdir=testdir)
        masters = ctx.cluster.only(teuthology.is_type('hadoop.master'))
        assert len(masters.remotes) == 1
        master = masters.remotes.keys()[0]
        master.run(
            args=[hadoop_dir + "bin/hadoop", "namenode", "-format"],
            wait=True,
        )

def write_slaves(ctx):
    log.info('Setting up slave nodes...')

    slavesFile = "/tmp/cephtest/hadoop/conf/slaves"
    tmpFile = StringIO()

    slaves = ctx.cluster.only(teuthology.is_type('hadoop.slave'))
    for remote, roles_for_host in slaves.remotes.iteritems():
        tmpFile.write('{remote}\n'.format(
            remote=remote.ssh.get_transport().getpeername()[0]))

    tmpFile.seek(0)

    hadoopNodes = ctx.cluster.only(teuthology.is_type('hadoop'))
    for remote, roles_for_host in hadoopNodes.remotes.iteritems():
        teuthology.write_file(remote=remote, path=slavesFile, data=tmpFile)
        tmpFile.seek(0)
        log.info("wrote file: " + slavesFile + " to host: " + str(remote))

def write_slaves(ctx):
    log.info("Setting up slave nodes...")

    slavesFile = "{tdir}/apache_hadoop/conf/slaves".format(
        tdir=teuthology.get_testdir(ctx))
    tmpFile = StringIO()

    slaves = ctx.cluster.only(teuthology.is_type("hadoop.slave"))
    for remote, roles_for_host in slaves.remotes.iteritems():
        tmpFile.write("{remote}\n".format(
            remote=remote.ssh.get_transport().getpeername()[0]))

    tmpFile.seek(0)

    hadoopNodes = ctx.cluster.only(teuthology.is_type("hadoop"))
    for remote, roles_for_host in hadoopNodes.remotes.iteritems():
        teuthology.write_file(remote=remote, path=slavesFile, data=tmpFile)
        tmpFile.seek(0)
        log.info("wrote file: " + slavesFile + " to host: " + str(remote))

def setup(ctx, config):
    osds = ctx.cluster.only(teuthology.is_type('osd'))
    for remote, roles_for_host in osds.remotes.iteritems():
        log.info('Creating %s on %s' % (log_dir, remote.name))
        remote.run(
            args=['mkdir', '-p', '-m0755', '--', log_dir],
            wait=False,
        )
    yield

def binaries(ctx, config):
    path = config.get("path")

    if path is None:
        # fetch Apache Hadoop from gitbuilder
        log.info(
            "Fetching and unpacking Apache Hadoop binaries from gitbuilder...")
        apache_sha1, apache_hadoop_bindir_url = teuthology.get_ceph_binary_url(
            package="apache-hadoop",
            branch=config.get("apache_branch"),
            tag=config.get("tag"),
            sha1=config.get("sha1"),
            flavor=config.get("flavor"),
            format=config.get("format"),
            dist=config.get("dist"),
            arch=config.get("arch"),
        )
        log.info("apache_hadoop_bindir_url %s" % (apache_hadoop_bindir_url))
        ctx.summary["apache-hadoop-sha1"] = apache_sha1

        # fetch Inktank Hadoop from gitbuilder
        log.info(
            "Fetching and unpacking Inktank Hadoop binaries from gitbuilder...")
        inktank_sha1, inktank_hadoop_bindir_url = \
            teuthology.get_ceph_binary_url(
                package="hadoop",
                branch=config.get("inktank_branch"),
                tag=config.get("tag"),
                sha1=config.get("sha1"),
                flavor=config.get("flavor"),
                format=config.get("format"),
                dist=config.get("dist"),
                arch=config.get("arch"),
            )
        log.info("inktank_hadoop_bindir_url %s" % (inktank_hadoop_bindir_url))
        ctx.summary["inktank-hadoop-sha1"] = inktank_sha1
    else:
        raise Exception(
            "The hadoop task does not support the path argument at present")

    with parallel() as p:
        hadoopNodes = ctx.cluster.only(teuthology.is_type("hadoop"))
        # these can happen independently
        for remote in hadoopNodes.remotes.iterkeys():
            p.spawn(_node_binaries, ctx, config, remote,
                    inktank_hadoop_bindir_url, apache_hadoop_bindir_url)

    try:
        yield
    finally:
        log.info("Removing hadoop binaries...")
        run.wait(
            ctx.cluster.run(
                args=[
                    "rm", "-rf", "--",
                    "{tdir}/apache_hadoop".format(
                        tdir=teuthology.get_testdir(ctx)),
                ],
                wait=False,
            ),
        )
        run.wait(
            ctx.cluster.run(
                args=[
                    "rm", "-rf", "--",
                    "{tdir}/inktank_hadoop".format(
                        tdir=teuthology.get_testdir(ctx)),
                ],
                wait=False,
            ),
        )

def _get_master(ctx):
    """
    Return the hadoop master.  If more than one is found, fail an assertion
    """
    master = ctx.cluster.only(teuthology.is_type('hadoop.master'))
    assert 1 == len(master.remotes.items()), \
        'There must be exactly 1 hadoop.master configured'
    return master.remotes.items()[0]

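# Hypothetical caller of _get_master, mirroring write_master() below: the
# returned (remote, roles) pair is unpacked and the remote's address queried.
#   master_remote, _roles = _get_master(ctx)
#   master_ip = master_remote.ssh.get_transport().getpeername()[0]
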
def _revive_osds(ctx, manager):
    is_osd = teuthology.is_type('osd')
    osds = ctx.cluster.only(is_osd)
    for _, roles in osds.remotes.iteritems():
        for role in roles:
            if not is_osd(role):
                continue
            _, _, osd_id = teuthology.split_role(role)
            log.info('reviving osd.{0}'.format(osd_id))
            manager.revive_osd(osd_id)

def tgt_devname_get(ctx, test_image):
    """
    Get the name of the newly created device by following the by-path link
    (which is symbolically linked to the appropriate /dev/sd* file).
    """
    remotes = ctx.cluster.only(teuthology.is_type('client')).remotes
    rem_name = _get_remote_name(remotes, test_image)
    lnkpath = '/dev/disk/by-path/ip-%s:3260-iscsi-rbd-lun-1' % \
        socket.gethostbyname(rem_name)
    return lnkpath

def write_master(ctx):
    mastersFile = "/tmp/cephtest/hadoop/conf/masters"
    master = _get_master(ctx)
    # keep the master's remote under a distinct name so the loop variable
    # below does not shadow it; the masters file must list the master's IP
    master_remote, _ = master

    hadoopNodes = ctx.cluster.only(teuthology.is_type('hadoop'))
    for remote, roles_for_host in hadoopNodes.remotes.iteritems():
        teuthology.write_file(remote, mastersFile, '{remote}\n'.format(
            remote=master_remote.ssh.get_transport().getpeername()[0]))
        log.info("wrote file: " + mastersFile + " to host: " + str(remote))

def _revive_mgrs(ctx, manager):
    is_mgr = teuthology.is_type('mgr')
    mgrs = ctx.cluster.only(is_mgr)
    for _, roles in mgrs.remotes.iteritems():
        for role in roles:
            if not is_mgr(role):
                continue
            _, _, mgr_id = teuthology.split_role(role)
            log.info('reviving mgr.{0}'.format(mgr_id))
            manager.revive_mgr(mgr_id)

def task(ctx, config):
    """
    Start up tgt.

    To start tgt on all clients::

        tasks:
        - ceph:
        - tgt:

    To start on certain clients::

        tasks:
        - ceph:
        - tgt: [client.0, client.3]

    or

        tasks:
        - ceph:
        - tgt:
            client.0:
            client.3:

    An image block size can also be specified::

        tasks:
        - ceph:
        - tgt:
            image_size: 20480

    The general flow of things here is:
        1. Find clients on which tgt is supposed to run (start_tgtd)
        2. Remotely start up tgt daemon

    On cleanup:
        3. Stop tgt daemon

    The iscsi administration is handled by the iscsi task.
    """
    if config:
        config = {key : val for key, val in config.items()
                  if key.startswith('client')}
    # config at this point should only contain keys starting with 'client'
    start_tgtd = []
    remotes = ctx.cluster.only(teuthology.is_type('client')).remotes
    log.info(remotes)
    if not config:
        start_tgtd = ['client.{id}'.format(id=id_)
                      for id_ in teuthology.all_roles_of_type(
                          ctx.cluster, 'client')]
    else:
        start_tgtd = config
    log.info(start_tgtd)
    with contextutil.nested(
            lambda: start_tgt_remotes(ctx=ctx, start_tgtd=start_tgtd),):
        yield

def create_ceph_conf(ctx, config):
    devs_to_clean = {}
    remote_to_roles_to_devs = {}
    remote_to_roles_to_journals = {}
    osds = ctx.cluster.only(teuthology.is_type('osd'))
    for remote, roles_for_host in osds.remotes.iteritems():
        devs = teuthology.get_scratch_devices(remote)
        roles_to_devs = {}
        roles_to_journals = {}
        if config.get('fs'):
            log.info('fs option selected, checking for scratch devs')
            log.info('found devs: %s' % (str(devs),))
            devs_id_map = teuthology.get_wwn_id_map(remote, devs)
            iddevs = devs_id_map.values()
            roles_to_devs = assign_devs(
                teuthology.roles_of_type(roles_for_host, 'osd'), iddevs
            )
            if len(roles_to_devs) < len(iddevs):
                iddevs = iddevs[len(roles_to_devs):]
            devs_to_clean[remote] = []

        if config.get('block_journal'):
            log.info('block journal enabled')
            roles_to_journals = assign_devs(
                teuthology.roles_of_type(roles_for_host, 'osd'), iddevs
            )
            log.info('journal map: %s', roles_to_journals)

        if config.get('tmpfs_journal'):
            log.info('tmpfs journal enabled')
            roles_to_journals = {}
            remote.run(args=['sudo', 'mount', '-t', 'tmpfs', 'tmpfs', '/mnt'])
            for osd in teuthology.roles_of_type(roles_for_host, 'osd'):
                tmpfs = '/mnt/osd.%s' % osd
                roles_to_journals[osd] = tmpfs
                remote.run(args=['truncate', '-s', '1500M', tmpfs])
            log.info('journal map: %s', roles_to_journals)

        log.info('dev map: %s' % (str(roles_to_devs),))
        remote_to_roles_to_devs[remote] = roles_to_devs
        remote_to_roles_to_journals[remote] = roles_to_journals

    log.info('Generating config...')
    remotes_and_roles = ctx.cluster.remotes.items()
    roles = [role_list for (remote, role_list) in remotes_and_roles]
    ips = [host for (host, port) in
           (remote.ssh.get_transport().getpeername()
            for (remote, role_list) in remotes_and_roles)]
    conf = teuthology.skeleton_config(ctx, roles=roles, ips=ips)

    ctx.ceph = argparse.Namespace()
    ctx.ceph.conf = conf
    log.info(ctx)
    yield

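# NOTE (assumption): assign_devs is not shown here. Given that its result is
# consumed as a role -> device dict, a minimal sketch would be:
def _assign_devs_sketch(roles, devs):
    # e.g. (['0', '1'], ['/dev/sdb', '/dev/sdc'])
    #      -> {'0': '/dev/sdb', '1': '/dev/sdc'}
    return dict(zip(roles, devs))
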
def execute(ctx, config):
    """
    Run the blktrace program on remote machines.
    """
    procs = []
    testdir = teuthology.get_testdir(ctx)
    log_dir = '{tdir}/archive/performance/blktrace'.format(tdir=testdir)

    osds = ctx.cluster.only(teuthology.is_type('osd'))
    for remote, roles_for_host in osds.remotes.iteritems():
        roles_to_devs = ctx.disk_config.remote_to_roles_to_dev[remote]
        for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd',
                                                     config['cluster']):
            if roles_to_devs.get(role):
                dev = roles_to_devs[role]
                log.info("running blktrace on %s: %s" % (remote.name, dev))

                proc = remote.run(
                    args=[
                        'cd',
                        log_dir,
                        run.Raw(';'),
                        'daemon-helper',
                        daemon_signal,
                        'sudo',
                        blktrace,
                        '-o',
                        dev.rsplit("/", 1)[1],
                        '-d',
                        dev,
                    ],
                    wait=False,
                    stdin=run.PIPE,
                )
                procs.append(proc)
    try:
        yield
    finally:
        osds = ctx.cluster.only(teuthology.is_type('osd'))
        log.info('stopping blktrace processes')
        for proc in procs:
            proc.stdin.close()

def validate_config(ctx, config):
    log.info("Validating Hadoop configuration")
    slaves = ctx.cluster.only(teuthology.is_type("hadoop.slave"))

    if len(slaves.remotes) < 1:
        raise Exception("At least one hadoop.slave must be specified")
    else:
        log.info(str(len(slaves.remotes)) + " slaves specified")

    masters = ctx.cluster.only(teuthology.is_type("hadoop.master"))
    if len(masters.remotes) != 1:
        raise Exception(
            "Exactly one hadoop.master must be specified. "
            "Currently there are " + str(len(masters.remotes)))

    try:
        yield
    finally:
        pass

def setup(ctx, config):
    osds = ctx.cluster.only(teuthology.is_type('osd'))
    log_dir = '{tdir}/archive/performance/blktrace'.format(
        tdir=teuthology.get_testdir(ctx))

    for remote, roles_for_host in osds.remotes.iteritems():
        log.info('Creating %s on %s' % (log_dir, remote.name))
        remote.run(
            args=['mkdir', '-p', '-m0755', '--', log_dir],
            wait=False,
        )
    yield

def task(ctx, config):
    log.info('starting nfs_ganesha_rgw tests')
    # RGW and NFS should be on the same machine
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        "task set-repo only supports a dictionary for configuration"
    test_name = config['test-name'] + ".yaml"
    script_name = tests_mapper.get(config['test-name'], None) + ".py"
    nfs_version = config['nfs-version']
    mount_dir = config['mount-dir']
    branch = config.get("branch", "master")
    log.info('got test_name: %s' % test_name)
    log.info('got nfs version: %s' % nfs_version)
    log.info('got mount dir: %s' % mount_dir)

    remotes = ctx.cluster.only(teuthology.is_type('mon'))
    mon = [remote for remote, roles_for_host in remotes.remotes.items()]
    rgw_remote = ctx.cluster.only(teuthology.is_type('rgw'))
    rgw = [remote for remote, roles_for_host in rgw_remote.remotes.items()]

    # installing nfs-ganesha-selinux package
    if rgw[0].os.version.startswith('7'):
        rgw[0].run(
            args=['sudo', 'yum', 'install', '-y', 'nfs-ganesha-selinux'])

    # clone the repo
    rgw[0].run(args=['sudo', 'rm', '-rf', 'nfs_ganesha_rgw'],
               check_status=False)
    rgw[0].run(args=['sudo', 'rm', '-rf',
                     run.Raw('/tmp/nfs-ganesh-rgw_log*')],
               check_status=False)
    rgw[0].run(args=['mkdir', '-p', 'nfs_ganesha_rgw'])

    # stop native nfs_ganesha service.
    rgw[0].run(args=['sudo', 'systemctl', 'stop', 'nfs-server.service'])
    rgw[0].run(args=['sudo', 'systemctl', 'disable', 'nfs-server.service'])

    out = io.StringIO()
    mon[0].run(args=['sudo', 'cat', '/etc/ceph/ceph.client.admin.keyring'],
               stdout=out)
    v_as_out = out.read()
    teuthology.create_file(rgw[0], '/etc/ceph/ceph.client.admin.keyring',
                           data=v_as_out, sudo=True)

    # parsing nfs_ganesha conf file
    out = io.StringIO()
    rgw[0].run(args=['sudo', 'cat', '/etc/ganesha/ganesha.conf'],
               stdout=out)
    v_as_out = out.readlines()
    clean = lambda x: re.sub('[^A-Za-z0-9]+', '', x)
    for content in v_as_out:
        if 'Access_Key_Id' in content:
            access_key = clean(content.split('=')[1])
        if 'Secret_Access_Key' in content:
            secret_key = clean(content.split('=')[1])
        if 'User_Id' in content:
            rgw_user_id = clean(content.split('=')[1])
        if 'Pseudo' in content:
            pseudo = content.split('=')[1].strip(' ').strip('\n').strip(
                ' ').strip(';').strip('/')

    rgw[0].run(args=['sudo', 'setenforce', '1'])
    log.info('restarting nfs-ganesha service')
    rgw[0].run(args=['sudo', 'systemctl', 'restart', 'nfs-ganesha.service'])
    time.sleep(60)

    rgw[0].run(args=[
        'cd', 'nfs_ganesha_rgw', run.Raw(';'), 'git', 'clone',
        'https://github.com/red-hat-storage/ceph-qe-scripts.git'
    ])
    rgw[0].run(args=[
        'cd', 'nfs_ganesha_rgw/ceph-qe-scripts', run.Raw(';'), 'git',
        'checkout', '%s' % branch
    ])
    rgw[0].run(args=['python3', '-m', 'venv', 'venv'])
    rgw[0].run(args=[
        'source', 'venv/bin/activate', run.Raw(';'),
        run.Raw('pip3 install --upgrade setuptools'), run.Raw(';'),
        'deactivate'
    ])
    rgw[0].run(args=[
        'source', 'venv/bin/activate', run.Raw(';'),
        run.Raw(
            'pip3 install boto boto3 names PyYaml psutil ConfigParser '
            'python-swiftclient swiftly simplejson rgwadmin'),
        run.Raw(';'), 'deactivate'
    ])

    # copy rgw user details (yaml format) to nfs node or rgw node
    rgw_user_config = dict(user_id=rgw_user_id,
                           access_key=access_key,
                           secret_key=secret_key,
                           rgw_hostname=rgw[0].shortname,
                           ganesha_config_exists=True,
                           already_mounted=False,
                           nfs_version=nfs_version,
                           nfs_mnt_point=mount_dir,
                           Pseudo=pseudo)
    rgw_user_config_fname = 'rgw_user.yaml'
    temp_yaml_file = rgw_user_config_fname + "_" + str(
        os.getpid()) + pwd.getpwuid(os.getuid()).pw_name
    log.info('creating rgw_user_config_fname: %s' % rgw_user_config)
    local_file = '/tmp/' + temp_yaml_file
    with open(local_file, 'w') as outfile:
        outfile.write(yaml.dump(rgw_user_config, default_flow_style=False))

    log.info('copying rgw_user_config_fname to the client node')
    destination_location = (
        'nfs_ganesha_rgw/ceph-qe-scripts/rgw/v2/tests/nfs_ganesha/config/' +
        rgw_user_config_fname)
    rgw[0].put_file(local_file, destination_location)
    rgw[0].run(args=[run.Raw('sudo rm -rf %s' % local_file)],
               check_status=False)

    # run the test
    rgw[0].run(args=[
        'source', 'venv/bin/activate', run.Raw(';'),
        run.Raw(
            'python3 nfs_ganesha_rgw/ceph-qe-scripts/rgw/v2/tests/nfs_ganesha/%s '
            '-r nfs_ganesha_rgw/ceph-qe-scripts/rgw/v2/tests/nfs_ganesha/config/rgw_user.yaml '
            '-c nfs_ganesha_rgw/ceph-qe-scripts/rgw/v2/tests/nfs_ganesha/config/%s '
            % (script_name, test_name)),
        run.Raw(';'), 'deactivate'
    ])

    try:
        yield
    finally:
        log.info("Deleting the test soot")
        rgw[0].run(args=['sudo', 'umount', run.Raw('%s' % mount_dir)])
        cleanup = lambda x: rgw[0].run(args=[run.Raw('sudo rm -rf %s' % x)])
        # note: 'test_data' and '*.json' were previously fused by a missing
        # comma; they are separate cleanup patterns
        soot = [
            'venv', 'rgw-tests', 'test_data', '*.json', 'Download.*',
            'Download', '*.mpFile', 'x*', 'key.*', 'Mp.*', '*.key.*'
        ]
        list(map(cleanup, soot))

def start_tgt_remotes(ctx, start_tgtd):
    """
    This subtask starts up a tgtd on the clients specified
    """
    remotes = ctx.cluster.only(teuthology.is_type('client')).remotes
    tgtd_list = []
    for rem, roles in remotes.items():
        for _id in roles:
            if _id in start_tgtd:
                if not rem in tgtd_list:
                    tgtd_list.append(rem)
                    size = ctx.config.get('image_size', 10240)
                    rem.run(args=[
                        'rbd', 'create', 'iscsi-image', '--size', str(size),
                    ])
                    rem.run(args=[
                        'sudo', 'tgtadm', '--lld', 'iscsi', '--mode',
                        'target', '--op', 'new', '--tid', '1',
                        '--targetname', 'rbd',
                    ])
                    rem.run(args=[
                        'sudo', 'tgtadm', '--lld', 'iscsi', '--mode',
                        'logicalunit', '--op', 'new', '--tid', '1',
                        '--lun', '1', '--backing-store', 'iscsi-image',
                        '--bstype', 'rbd',
                    ])
                    rem.run(args=[
                        'sudo', 'tgtadm', '--lld', 'iscsi', '--op', 'bind',
                        '--mode', 'target', '--tid', '1', '-I', 'ALL',
                    ])
    try:
        yield
    finally:
        for rem in tgtd_list:
            rem.run(args=[
                'sudo', 'tgtadm', '--lld', 'iscsi', '--mode', 'target',
                '--op', 'delete', '--force', '--tid', '1',
            ])
            rem.run(args=[
                'rbd', 'snap', 'purge', 'iscsi-image',
            ])
            rem.run(args=[
                'sudo', 'rbd', 'rm', 'iscsi-image',
            ])

def cluster(ctx, config):
    """
    Handle the creation and removal of a ceph cluster.

    On startup:
        Create directories needed for the cluster.
        Create remote journals for all osds.
        Create and set keyring.
        Copy the monmap to the test systems.
        Setup mon nodes.
        Setup mds nodes.
        Mkfs osd nodes.
        Add keyring information to monmaps
        Mkfs mon nodes.

    On exit:
        If errors occurred, extract a failure message and store in ctx.summary.
        Unmount all test files and temporary journaling files.
        Save the monitor information and archive all ceph logs.
        Cleanup the keyring setup, and remove all monitor map and data files
        left over.

    :param ctx: Context
    :param config: Configuration
    """
    if ctx.config.get('use_existing_cluster', False) is True:
        log.info("'use_existing_cluster' is true; skipping cluster creation")
        yield

    testdir = teuthology.get_testdir(ctx)
    log.info('Creating ceph cluster...')
    run.wait(
        ctx.cluster.run(
            args=[
                'install', '-d', '-m0755', '--',
                '{tdir}/data'.format(tdir=testdir),
            ],
            wait=False,
        ))
    run.wait(
        ctx.cluster.run(
            args=[
                'sudo', 'install', '-d', '-m0777', '--', '/var/run/ceph',
            ],
            wait=False,
        ))

    devs_to_clean = {}
    remote_to_roles_to_devs = {}
    remote_to_roles_to_journals = {}
    osds = ctx.cluster.only(teuthology.is_type('osd'))
    for remote, roles_for_host in osds.remotes.iteritems():
        devs = teuthology.get_scratch_devices(remote)
        roles_to_devs = {}
        roles_to_journals = {}
        if config.get('fs'):
            log.info('fs option selected, checking for scratch devs')
            log.info('found devs: %s' % (str(devs),))
            devs_id_map = teuthology.get_wwn_id_map(remote, devs)
            iddevs = devs_id_map.values()
            roles_to_devs = assign_devs(
                teuthology.roles_of_type(roles_for_host, 'osd'), iddevs)
            if len(roles_to_devs) < len(iddevs):
                iddevs = iddevs[len(roles_to_devs):]
            devs_to_clean[remote] = []

        if config.get('block_journal'):
            log.info('block journal enabled')
            roles_to_journals = assign_devs(
                teuthology.roles_of_type(roles_for_host, 'osd'), iddevs)
            log.info('journal map: %s', roles_to_journals)

        if config.get('tmpfs_journal'):
            log.info('tmpfs journal enabled')
            roles_to_journals = {}
            remote.run(args=['sudo', 'mount', '-t', 'tmpfs', 'tmpfs', '/mnt'])
            for osd in teuthology.roles_of_type(roles_for_host, 'osd'):
                tmpfs = '/mnt/osd.%s' % osd
                roles_to_journals[osd] = tmpfs
                remote.run(args=['truncate', '-s', '1500M', tmpfs])
            log.info('journal map: %s', roles_to_journals)

        log.info('dev map: %s' % (str(roles_to_devs),))
        remote_to_roles_to_devs[remote] = roles_to_devs
        remote_to_roles_to_journals[remote] = roles_to_journals

    log.info('Generating config...')
    remotes_and_roles = ctx.cluster.remotes.items()
    roles = [role_list for (remote, role_list) in remotes_and_roles]
    ips = [
        host for (host, port) in (remote.ssh.get_transport().getpeername()
                                  for (remote, role_list) in remotes_and_roles)
    ]
    conf = teuthology.skeleton_config(ctx, roles=roles, ips=ips)
    for remote, roles_to_journals in remote_to_roles_to_journals.iteritems():
        for role, journal in roles_to_journals.iteritems():
            key = "osd." + str(role)
            if key not in conf:
                conf[key] = {}
            conf[key]['osd journal'] = journal
    for section, keys in config['conf'].iteritems():
        for key, value in keys.iteritems():
            log.info("[%s] %s = %s" % (section, key, value))
            if section not in conf:
                conf[section] = {}
            conf[section][key] = value

    if config.get('tmpfs_journal'):
        conf['journal dio'] = False

    ctx.ceph = argparse.Namespace()
    ctx.ceph.conf = conf

    keyring_path = config.get('keyring_path', '/etc/ceph/ceph.keyring')

    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)

    firstmon = teuthology.get_first_mon(ctx, config)

    log.info('Setting up %s...' % firstmon)
    ctx.cluster.only(firstmon).run(
        args=[
            'sudo', 'adjust-ulimits', 'ceph-coverage', coverage_dir,
            'ceph-authtool', '--create-keyring', keyring_path,
        ],
    )
    ctx.cluster.only(firstmon).run(
        args=[
            'sudo', 'adjust-ulimits', 'ceph-coverage', coverage_dir,
            'ceph-authtool', '--gen-key', '--name=mon.', keyring_path,
        ],
    )
    ctx.cluster.only(firstmon).run(
        args=[
            'sudo', 'chmod', '0644', keyring_path,
        ],
    )
    (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
    fsid = teuthology.create_simple_monmap(
        ctx,
        remote=mon0_remote,
        conf=conf,
    )
    if not 'global' in conf:
        conf['global'] = {}
    conf['global']['fsid'] = fsid

    conf_path = config.get('conf_path', DEFAULT_CONF_PATH)
    log.info('Writing %s for FSID %s...' % (conf_path, fsid))
    write_conf(ctx, conf_path)

    log.info('Creating admin key on %s...' % firstmon)
    ctx.cluster.only(firstmon).run(
        args=[
            'sudo', 'adjust-ulimits', 'ceph-coverage', coverage_dir,
            'ceph-authtool', '--gen-key', '--name=client.admin',
            '--set-uid=0',
            '--cap', 'mon', 'allow *',
            '--cap', 'osd', 'allow *',
            '--cap', 'mds', 'allow *',
            keyring_path,
        ],
    )

    log.info('Copying monmap to all nodes...')
    keyring = teuthology.get_file(
        remote=mon0_remote,
        path=keyring_path,
    )
    monmap = teuthology.get_file(
        remote=mon0_remote,
        path='{tdir}/monmap'.format(tdir=testdir),
    )

    for rem in ctx.cluster.remotes.iterkeys():
        # copy mon key and initial monmap
        log.info('Sending monmap to node {remote}'.format(remote=rem))
        teuthology.sudo_write_file(
            remote=rem,
            path=keyring_path,
            data=keyring,
            perms='0644')
        teuthology.write_file(
            remote=rem,
            path='{tdir}/monmap'.format(tdir=testdir),
            data=monmap,
        )

    log.info('Setting up mon nodes...')
    mons = ctx.cluster.only(teuthology.is_type('mon'))
    run.wait(
        mons.run(
            args=[
                'adjust-ulimits', 'ceph-coverage', coverage_dir,
                'osdmaptool', '-c', conf_path, '--clobber', '--createsimple',
                '{num:d}'.format(
                    num=teuthology.num_instances_of_type(ctx.cluster, 'osd')),
                '{tdir}/osdmap'.format(tdir=testdir),
                '--pg_bits', '2',
                '--pgp_bits', '4',
            ],
            wait=False,
        ),
    )

    log.info('Setting up mds nodes...')
    mdss = ctx.cluster.only(teuthology.is_type('mds'))
    for remote, roles_for_host in mdss.remotes.iteritems():
        for id_ in teuthology.roles_of_type(roles_for_host, 'mds'):
            remote.run(
                args=[
                    'sudo', 'mkdir', '-p',
                    '/var/lib/ceph/mds/ceph-{id}'.format(id=id_),
                    run.Raw('&&'),
                    'sudo', 'adjust-ulimits', 'ceph-coverage', coverage_dir,
                    'ceph-authtool', '--create-keyring', '--gen-key',
                    '--name=mds.{id}'.format(id=id_),
                    '/var/lib/ceph/mds/ceph-{id}/keyring'.format(id=id_),
                ],
            )

    cclient.create_keyring(ctx)
    log.info('Running mkfs on osd nodes...')

    ctx.disk_config = argparse.Namespace()
    ctx.disk_config.remote_to_roles_to_dev = remote_to_roles_to_devs
    ctx.disk_config.remote_to_roles_to_journals = remote_to_roles_to_journals
    ctx.disk_config.remote_to_roles_to_dev_mount_options = {}
    ctx.disk_config.remote_to_roles_to_dev_fstype = {}

    log.info("ctx.disk_config.remote_to_roles_to_dev: {r}".format(
        r=str(ctx.disk_config.remote_to_roles_to_dev)))
    for remote, roles_for_host in osds.remotes.iteritems():
        roles_to_devs = remote_to_roles_to_devs[remote]
        roles_to_journals = remote_to_roles_to_journals[remote]

        for id_ in teuthology.roles_of_type(roles_for_host, 'osd'):
            remote.run(
                args=[
                    'sudo', 'mkdir', '-p',
                    '/var/lib/ceph/osd/ceph-{id}'.format(id=id_),
                ])
            log.info(str(roles_to_journals))
            log.info(id_)
            if roles_to_devs.get(id_):
                dev = roles_to_devs[id_]
                fs = config.get('fs')
                package = None
                mkfs_options = config.get('mkfs_options')
                mount_options = config.get('mount_options')
                if fs == 'btrfs':
                    # package = 'btrfs-tools'
                    if mount_options is None:
                        mount_options = ['noatime', 'user_subvol_rm_allowed']
                    if mkfs_options is None:
                        mkfs_options = ['-m', 'single', '-l', '32768',
                                        '-n', '32768']
                if fs == 'xfs':
                    # package = 'xfsprogs'
                    if mount_options is None:
                        mount_options = ['noatime']
                    if mkfs_options is None:
                        mkfs_options = ['-f', '-i', 'size=2048']
                if fs == 'ext4' or fs == 'ext3':
                    if mount_options is None:
                        mount_options = ['noatime', 'user_xattr']

                if mount_options is None:
                    mount_options = []
                if mkfs_options is None:
                    mkfs_options = []
                mkfs = ['mkfs.%s' % fs] + mkfs_options
                log.info('%s on %s on %s' % (mkfs, dev, remote))
                if package is not None:
                    remote.run(
                        args=['sudo', 'apt-get', 'install', '-y', package],
                        stdout=StringIO(),
                    )

                try:
                    remote.run(args=['yes', run.Raw('|')] + ['sudo'] + mkfs +
                               [dev])
                except run.CommandFailedError:
                    # Newer btrfs-tools doesn't prompt for overwrite, use -f
                    if '-f' not in mount_options:
                        mkfs_options.append('-f')
                        mkfs = ['mkfs.%s' % fs] + mkfs_options
                        log.info('%s on %s on %s' % (mkfs, dev, remote))
                    remote.run(args=['yes', run.Raw('|')] + ['sudo'] + mkfs +
                               [dev])

                log.info('mount %s on %s -o %s' % (dev, remote,
                                                   ','.join(mount_options)))
                remote.run(
                    args=[
                        'sudo', 'mount', '-t', fs, '-o',
                        ','.join(mount_options), dev,
                        os.path.join('/var/lib/ceph/osd',
                                     'ceph-{id}'.format(id=id_)),
                    ])
                if not remote in ctx.disk_config.remote_to_roles_to_dev_mount_options:
                    ctx.disk_config.remote_to_roles_to_dev_mount_options[remote] = {}
                ctx.disk_config.remote_to_roles_to_dev_mount_options[remote][id_] = mount_options
                if not remote in ctx.disk_config.remote_to_roles_to_dev_fstype:
                    ctx.disk_config.remote_to_roles_to_dev_fstype[remote] = {}
                ctx.disk_config.remote_to_roles_to_dev_fstype[remote][id_] = fs
                devs_to_clean[remote].append(
                    os.path.join(
                        os.path.join('/var/lib/ceph/osd',
                                     'ceph-{id}'.format(id=id_)),
                    ))

        for id_ in teuthology.roles_of_type(roles_for_host, 'osd'):
            remote.run(
                args=[
                    'sudo', 'MALLOC_CHECK_=3', 'adjust-ulimits',
                    'ceph-coverage', coverage_dir,
                    'ceph-osd', '--mkfs', '--mkkey', '-i', id_,
                    '--monmap', '{tdir}/monmap'.format(tdir=testdir),
                ],
            )

    log.info('Reading keys from all nodes...')
    keys_fp = StringIO()
    keys = []
    for remote, roles_for_host in ctx.cluster.remotes.iteritems():
        for type_ in ['mds', 'osd']:
            for id_ in teuthology.roles_of_type(roles_for_host, type_):
                data = teuthology.get_file(
                    remote=remote,
                    path='/var/lib/ceph/{type}/ceph-{id}/keyring'.format(
                        type=type_,
                        id=id_,
                    ),
                    sudo=True,
                )
                keys.append((type_, id_, data))
                keys_fp.write(data)
    for remote, roles_for_host in ctx.cluster.remotes.iteritems():
        for type_ in ['client']:
            for id_ in teuthology.roles_of_type(roles_for_host, type_):
                data = teuthology.get_file(
                    remote=remote,
                    path='/etc/ceph/ceph.client.{id}.keyring'.format(id=id_))
                keys.append((type_, id_, data))
                keys_fp.write(data)

    log.info('Adding keys to all mons...')
    writes = mons.run(
        args=[
            'sudo', 'tee', '-a',
            keyring_path,
        ],
        stdin=run.PIPE,
        wait=False,
        stdout=StringIO(),
    )
    keys_fp.seek(0)
    teuthology.feed_many_stdins_and_close(keys_fp, writes)
    run.wait(writes)
    for type_, id_, data in keys:
        run.wait(
            mons.run(
                args=[
                    'sudo', 'adjust-ulimits', 'ceph-coverage', coverage_dir,
                    'ceph-authtool', keyring_path,
                    '--name={type}.{id}'.format(
                        type=type_,
                        id=id_,
                    ),
                ] + list(teuthology.generate_caps(type_)),
                wait=False,
            ),
        )

    log.info('Running mkfs on mon nodes...')
    for remote, roles_for_host in mons.remotes.iteritems():
        for id_ in teuthology.roles_of_type(roles_for_host, 'mon'):
            remote.run(
                args=[
                    'sudo', 'mkdir', '-p',
                    '/var/lib/ceph/mon/ceph-{id}'.format(id=id_),
                ],
            )
            remote.run(
                args=[
                    'sudo', 'adjust-ulimits', 'ceph-coverage', coverage_dir,
                    'ceph-mon', '--mkfs', '-i', id_,
                    '--monmap={tdir}/monmap'.format(tdir=testdir),
                    '--osdmap={tdir}/osdmap'.format(tdir=testdir),
                    '--keyring={kpath}'.format(kpath=keyring_path),
                ],
            )

    run.wait(
        mons.run(
            args=[
                'rm', '--',
                '{tdir}/monmap'.format(tdir=testdir),
                '{tdir}/osdmap'.format(tdir=testdir),
            ],
            wait=False,
        ),
    )

    try:
        yield
    except Exception:
        # we need to know this below
        ctx.summary['success'] = False
        raise
    finally:
        (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()

        log.info('Checking cluster log for badness...')

        def first_in_ceph_log(pattern, excludes):
            """
            Find the first occurrence of the pattern specified in the Ceph log,
            Returns None if none found.

            :param pattern: Pattern scanned for.
            :param excludes: Patterns to ignore.
            :return: First line of text (or None if not found)
            """
            args = [
                'sudo', 'egrep', pattern,
                '/var/log/ceph/ceph.log',
            ]
            for exclude in excludes:
                args.extend([run.Raw('|'), 'egrep', '-v', exclude])
            args.extend([
                run.Raw('|'), 'head', '-n', '1',
            ])
            r = mon0_remote.run(
                stdout=StringIO(),
                args=args,
            )
            stdout = r.stdout.getvalue()
            if stdout != '':
                return stdout
            return None

        if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]',
                             config['log_whitelist']) is not None:
            log.warning('Found errors (ERR|WRN|SEC) in cluster log')
            ctx.summary['success'] = False
            # use the most severe problem as the failure reason
            if 'failure_reason' not in ctx.summary:
                for pattern in ['\[SEC\]', '\[ERR\]', '\[WRN\]']:
                    match = first_in_ceph_log(pattern, config['log_whitelist'])
                    if match is not None:
                        ctx.summary['failure_reason'] = \
                            '"{match}" in cluster log'.format(
                                match=match.rstrip('\n'),
                            )
                        break

        for remote, dirs in devs_to_clean.iteritems():
            for dir_ in dirs:
                log.info('Unmounting %s on %s' % (dir_, remote))
                try:
                    remote.run(
                        args=[
                            'sync',
                            run.Raw('&&'),
                            'sudo', 'umount', '-f', dir_,
                        ])
                except Exception as e:
                    remote.run(args=[
                        'sudo',
                        run.Raw('PATH=/usr/sbin:$PATH'),
                        'lsof',
                        run.Raw(';'),
                        'ps', 'auxf',
                    ])
                    raise e

        if config.get('tmpfs_journal'):
            log.info('tmpfs journal enabled - unmounting tmpfs at /mnt')
            for remote, roles_for_host in osds.remotes.iteritems():
                remote.run(
                    args=['sudo', 'umount', '-f', '/mnt'],
                    check_status=False,
                )

        if ctx.archive is not None and \
                not (ctx.config.get('archive-on-error') and
                     ctx.summary['success']):
            # archive mon data, too
            log.info('Archiving mon data...')
            path = os.path.join(ctx.archive, 'data')
            os.makedirs(path)
            for remote, roles in mons.remotes.iteritems():
                for role in roles:
                    if role.startswith('mon.'):
                        teuthology.pull_directory_tarball(
                            remote,
                            '/var/lib/ceph/mon',
                            path + '/' + role + '.tgz')

        log.info('Cleaning ceph cluster...')
        run.wait(
            ctx.cluster.run(
                args=[
                    'sudo', 'rm', '-rf', '--',
                    conf_path,
                    keyring_path,
                    '{tdir}/data'.format(tdir=testdir),
                    '{tdir}/monmap'.format(tdir=testdir),
                ],
                wait=False,
            ),
        )

def run_daemon(ctx, config, type_):
    """
    Run daemons for a role type.  Handle the startup and termination of
    a daemon.

    On startup -- set coverages, cpu_profile, valgrind values for all remotes,
    and a max_mds value for one mds.

    On cleanup -- Stop all existing daemons of this type.

    :param ctx: Context
    :param config: Configuration
    :param type_: Role type
    """
    log.info('Starting %s daemons...' % type_)
    testdir = teuthology.get_testdir(ctx)
    daemons = ctx.cluster.only(teuthology.is_type(type_))

    # check whether any daemons of this type are configured
    if daemons is None:
        return
    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)

    daemon_signal = 'kill'
    if config.get('coverage') or config.get('valgrind') is not None:
        daemon_signal = 'term'

    for remote, roles_for_host in daemons.remotes.iteritems():
        for id_ in teuthology.roles_of_type(roles_for_host, type_):
            name = '%s.%s' % (type_, id_)

            run_cmd = [
                'sudo',
                'adjust-ulimits',
                'ceph-coverage',
                coverage_dir,
                'daemon-helper',
                daemon_signal,
            ]
            run_cmd_tail = [
                'ceph-%s' % (type_),
                '-f',
                '-i', id_,
            ]

            if type_ in config.get('cpu_profile', []):
                profile_path = '/var/log/ceph/profiling-logger/%s.%s.prof' % (
                    type_, id_)
                run_cmd.extend(['env', 'CPUPROFILE=%s' % profile_path])

            if config.get('valgrind') is not None:
                valgrind_args = None
                if type_ in config['valgrind']:
                    valgrind_args = config['valgrind'][type_]
                if name in config['valgrind']:
                    valgrind_args = config['valgrind'][name]
                run_cmd = teuthology.get_valgrind_args(testdir, name, run_cmd,
                                                       valgrind_args)

            run_cmd.extend(run_cmd_tail)

            ctx.daemons.add_daemon(remote, type_, id_,
                                   args=run_cmd,
                                   logger=log.getChild(name),
                                   stdin=run.PIPE,
                                   wait=False,
                                   )

    try:
        yield
    finally:
        teuthology.stop_daemons_of_type(ctx, type_)

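# The valgrind lookup above checks the daemon type first and the specific
# daemon name second, so a name-level entry wins. Hypothetical job config,
# expressed as the dict this function receives:
#   config = {
#       'valgrind': {
#           'osd': ['--tool=memcheck'],    # applies to every OSD
#           'osd.1': ['--tool=helgrind'],  # overrides the type entry for osd.1
#       },
#   }
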
def task(ctx, config):
    """
    "Thrash" the OSDs by randomly marking them out/down (and then back in)
    until the task is ended. This loops, and every op_delay seconds it
    randomly chooses to add or remove an OSD (even odds) unless there are
    fewer than min_out OSDs out of the cluster, or more than min_in OSDs in
    the cluster.

    All commands are run on mon0 and it stops when __exit__ is called.

    The config is optional, and is a dict containing some or all of:

    cluster: (default 'ceph') the name of the cluster to thrash

    min_in: (default 3) the minimum number of OSDs to keep in the cluster

    min_out: (default 0) the minimum number of OSDs to keep out of the cluster

    op_delay: (5) the length of time to sleep between changing an OSD's status

    min_dead: (0) minimum number of osds to leave down/dead.

    max_dead: (0) maximum number of osds to leave down/dead before waiting
       for clean.  This should probably be num_replicas - 1.

    clean_interval: (60) the approximate length of time to loop before
       waiting until the cluster goes clean. (In reality this is used
       to probabilistically choose when to wait, and the method used makes
       it closer to -- but not identical to -- the half-life.)

    scrub_interval: (-1) the approximate length of time to loop before
       waiting until a scrub is performed while cleaning. (In reality
       this is used to probabilistically choose when to wait, and it
       only applies to the cases where cleaning is being performed).
       -1 is used to indicate that no scrubbing will be done.

    chance_down: (0.4) the probability that the thrasher will mark an
       OSD down rather than marking it out. (The thrasher will not
       consider that OSD out of the cluster, since presently an OSD
       wrongly marked down will mark itself back up again.) This value
       can be either an integer (eg, 75) or a float probability (eg 0.75).

    chance_test_min_size: (0) chance to run test_pool_min_size, which:
       - kills all but one osd
       - waits
       - kills that osd
       - revives all other osds
       - verifies that the osds fully recover

    timeout: (360) the number of seconds to wait for the cluster
       to become clean after each cluster change. If this doesn't
       happen within the timeout, an exception will be raised.

    revive_timeout: (150) number of seconds to wait for an osd asok to
       appear after attempting to revive the osd

    thrash_primary_affinity: (true) randomly adjust primary-affinity

    chance_pgnum_grow: (0) chance to increase a pool's size
    chance_pgpnum_fix: (0) chance to adjust pgpnum to pg for a pool
    pool_grow_by: (10) amount to increase pgnum by
    max_pgs_per_pool_osd: (1200) don't expand pools past this size per osd

    pause_short: (3) duration of short pause
    pause_long: (80) duration of long pause
    pause_check_after: (50) assert osd down after this long
    chance_inject_pause_short: (1) chance of injecting short stall
    chance_inject_pause_long: (0) chance of injecting long stall

    clean_wait: (0) duration to wait before resuming thrashing once clean

    sighup_delay: (0.1) duration to delay between sending signal.SIGHUP to a
       random live osd

    powercycle: (false) whether to power cycle the node instead
       of just the osd process. Note that this assumes that a single
       osd is the only important process on the node.

    bdev_inject_crash: (0) seconds to delay while inducing a synthetic crash.
       the delay lets the BlockDevice "accept" more aio operations but blocks
       any flush, and then eventually crashes (losing some or all ios).  If 0,
       no bdev failure injection is enabled.

    bdev_inject_crash_probability: (.5) probability of doing a bdev failure
       injection crash vs a normal OSD kill.

    chance_test_backfill_full: (0) chance to simulate full disks stopping
       backfill

    chance_test_map_discontinuity: (0) chance to test map discontinuity
    map_discontinuity_sleep_time: (40) time to wait for map trims

    ceph_objectstore_tool: (true) whether to export/import a pg while an osd
       is down
    chance_move_pg: (1.0) chance of moving a pg if more than 1 osd is down
       (default 100%)

    optrack_toggle_delay: (2.0) duration to delay between toggling op tracker
       enablement to all osds

    dump_ops_enable: (true) continuously dump ops on all live osds

    noscrub_toggle_delay: (2.0) duration to delay between toggling noscrub

    disable_objectstore_tool_tests: (false) disable ceph_objectstore_tool
       based tests

    chance_thrash_cluster_full: .05

    chance_thrash_pg_upmap: 1.0
    chance_thrash_pg_upmap_items: 1.0

    example:

    tasks:
    - ceph:
    - thrashosds:
        cluster: ceph
        chance_down: 10
        op_delay: 3
        min_in: 1
        timeout: 600
    - interactive:
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'thrashosds task only accepts a dict for configuration'
    # add default value for sighup_delay
    config['sighup_delay'] = config.get('sighup_delay', 0.1)
    # add default value for optrack_toggle_delay
    config['optrack_toggle_delay'] = config.get('optrack_toggle_delay', 2.0)
    # add default value for dump_ops_enable
    config['dump_ops_enable'] = config.get('dump_ops_enable', "true")
    # add default value for noscrub_toggle_delay
    config['noscrub_toggle_delay'] = config.get('noscrub_toggle_delay', 2.0)

    log.info("config is {config}".format(config=str(config)))

    overrides = ctx.config.get('overrides', {})
    log.info("overrides is {overrides}".format(overrides=str(overrides)))
    teuthology.deep_merge(config, overrides.get('thrashosds', {}))
    cluster = config.get('cluster', 'ceph')

    log.info("config is {config}".format(config=str(config)))

    if 'powercycle' in config:

        # sync everyone first to avoid collateral damage to / etc.
        log.info('Doing preliminary sync to avoid collateral damage...')
        ctx.cluster.run(args=['sync'])

        if 'ipmi_user' in ctx.teuthology_config:
            for remote in ctx.cluster.remotes.keys():
                log.debug('checking console status of %s' % remote.shortname)
                if not remote.console.check_status():
                    log.warn('Failed to get console status for %s',
                             remote.shortname)

            # check that all osd remotes have a valid console
            osds = ctx.cluster.only(teuthology.is_type('osd', cluster))
            for remote in osds.remotes.keys():
                if not remote.console.has_ipmi_credentials:
                    raise Exception(
                        'IPMI console required for powercycling, '
                        'but not available on osd role: {r}'.format(
                            r=remote.name))

    cluster_manager = ctx.managers[cluster]
    for f in ['powercycle', 'bdev_inject_crash']:
        if config.get(f):
            cluster_manager.config[f] = config.get(f)

    log.info('Beginning thrashosds...')
    thrash_proc = ceph_manager.Thrasher(
        cluster_manager,
        config,
        logger=log.getChild('thrasher'))
    try:
        yield
    finally:
        log.info('joining thrashosds')
        thrash_proc.do_join()
        cluster_manager.wait_for_all_up()
        cluster_manager.flush_all_pg_stats()
        cluster_manager.wait_for_recovery(config.get('timeout', 360))

def build_ceph_cluster(ctx, config): """Build a ceph cluster""" # Expect to find ceph_admin on the first mon by ID, same place that the download task # puts it. Remember this here, because subsequently IDs will change from those in # the test config to those that ceph-deploy invents. (ceph_admin,) = ctx.cluster.only('mon.a').remotes.keys() def execute_ceph_deploy(cmd): """Remotely execute a ceph_deploy command""" return ceph_admin.run( args=[ 'cd', '{tdir}/ceph-deploy'.format(tdir=testdir), run.Raw('&&'), run.Raw(cmd), ], check_status=False, ).exitstatus def ceph_disk_osd_create(ctx, config): node_dev_list = get_dev_for_osd(ctx, config) no_of_osds = 0 for d in node_dev_list: node = d[0] for disk in d[1:]: zap = './ceph-deploy disk zap ' + node + ':' + disk estatus = execute_ceph_deploy(zap) if estatus != 0: raise RuntimeError("ceph-deploy: Failed to zap osds") osd_create_cmd = './ceph-deploy osd create ' # first check for filestore, default is bluestore with ceph-deploy if config.get('filestore') is not None: osd_create_cmd += '--filestore ' elif config.get('bluestore') is not None: osd_create_cmd += '--bluestore ' if config.get('dmcrypt') is not None: osd_create_cmd += '--dmcrypt ' osd_create_cmd += ":".join(d) estatus_osd = execute_ceph_deploy(osd_create_cmd) if estatus_osd == 0: log.info('successfully created osd') no_of_osds += 1 else: raise RuntimeError("ceph-deploy: Failed to create osds") return no_of_osds def ceph_volume_osd_create(ctx, config): osds = ctx.cluster.only(teuthology.is_type('osd')) no_of_osds = 0 for remote in osds.remotes.keys(): # all devs should be lvm osd_create_cmd = './ceph-deploy osd create --debug ' + remote.shortname + ' ' # default is bluestore so we just need config item for filestore roles = ctx.cluster.remotes[remote] dev_needed = len([role for role in roles if role.startswith('osd')]) all_devs = teuthology.get_scratch_devices(remote) log.info("node={n}, need_devs={d}, available={a}".format( n=remote.shortname, d=dev_needed, a=all_devs, )) devs = all_devs[0:dev_needed] # rest of the devices can be used for journal if required jdevs = dev_needed for device in devs: device_split = device.split('/') lv_device = device_split[-2] + '/' + device_split[-1] if config.get('filestore') is not None: osd_create_cmd += '--filestore --data ' + lv_device + ' ' # filestore with ceph-volume also needs journal disk try: jdevice = all_devs.pop(jdevs) except IndexError: raise RuntimeError("No device available for \ journal configuration") jdevice_split = jdevice.split('/') j_lv = jdevice_split[-2] + '/' + jdevice_split[-1] osd_create_cmd += '--journal ' + j_lv else: osd_create_cmd += ' --data ' + lv_device estatus_osd = execute_ceph_deploy(osd_create_cmd) if estatus_osd == 0: log.info('successfully created osd') no_of_osds += 1 else: raise RuntimeError("ceph-deploy: Failed to create osds") return no_of_osds try: log.info('Building ceph cluster using ceph-deploy...') testdir = teuthology.get_testdir(ctx) ceph_branch = None if config.get('branch') is not None: cbranch = config.get('branch') for var, val in cbranch.items(): ceph_branch = '--{var}={val}'.format(var=var, val=val) all_nodes = get_all_nodes(ctx, config) mds_nodes = get_nodes_using_role(ctx, 'mds') mds_nodes = " ".join(mds_nodes) mon_node = get_nodes_using_role(ctx, 'mon') mon_nodes = " ".join(mon_node) # skip mgr based on config item # this is needed when test uses latest code to install old ceph # versions skip_mgr = config.get('skip-mgr', False) if not skip_mgr: mgr_nodes = get_nodes_using_role(ctx, 'mgr') 
mgr_nodes = " ".join(mgr_nodes) new_mon = './ceph-deploy new' + " " + mon_nodes if not skip_mgr: mgr_create = './ceph-deploy mgr create' + " " + mgr_nodes mon_hostname = mon_nodes.split(' ')[0] mon_hostname = str(mon_hostname) gather_keys = './ceph-deploy gatherkeys' + " " + mon_hostname deploy_mds = './ceph-deploy mds create' + " " + mds_nodes if mon_nodes is None: raise RuntimeError("no monitor nodes in the config file") estatus_new = execute_ceph_deploy(new_mon) if estatus_new != 0: raise RuntimeError("ceph-deploy: new command failed") log.info('adding config inputs...') testdir = teuthology.get_testdir(ctx) conf_path = '{tdir}/ceph-deploy/ceph.conf'.format(tdir=testdir) if config.get('conf') is not None: confp = config.get('conf') for section, keys in confp.items(): lines = '[{section}]\n'.format(section=section) teuthology.append_lines_to_file(ceph_admin, conf_path, lines, sudo=True) for key, value in keys.items(): log.info("[%s] %s = %s" % (section, key, value)) lines = '{key} = {value}\n'.format(key=key, value=value) teuthology.append_lines_to_file( ceph_admin, conf_path, lines, sudo=True) # install ceph dev_branch = ctx.config['branch'] branch = '--dev={branch}'.format(branch=dev_branch) if ceph_branch: option = ceph_branch else: option = branch install_nodes = './ceph-deploy install ' + option + " " + all_nodes estatus_install = execute_ceph_deploy(install_nodes) if estatus_install != 0: raise RuntimeError("ceph-deploy: Failed to install ceph") # install ceph-test package too install_nodes2 = './ceph-deploy install --tests ' + option + \ " " + all_nodes estatus_install = execute_ceph_deploy(install_nodes2) if estatus_install != 0: raise RuntimeError("ceph-deploy: Failed to install ceph-test") mon_create_nodes = './ceph-deploy mon create-initial' # If the following fails, it is OK, it might just be that the monitors # are taking way more than a minute/monitor to form quorum, so lets # try the next block which will wait up to 15 minutes to gatherkeys. 
execute_ceph_deploy(mon_create_nodes) estatus_gather = execute_ceph_deploy(gather_keys) if estatus_gather != 0: raise RuntimeError("ceph-deploy: Failed during gather keys") # install admin key on mons (ceph-create-keys doesn't do this any more) mons = ctx.cluster.only(teuthology.is_type('mon')) for remote in mons.remotes.keys(): execute_ceph_deploy('./ceph-deploy admin ' + remote.shortname) # create osd's if config.get('use-ceph-volume', False): no_of_osds = ceph_volume_osd_create(ctx, config) else: # this method will only work with ceph-deploy v1.5.39 or older no_of_osds = ceph_disk_osd_create(ctx, config) if not skip_mgr: execute_ceph_deploy(mgr_create) if mds_nodes: estatus_mds = execute_ceph_deploy(deploy_mds) if estatus_mds != 0: raise RuntimeError("ceph-deploy: Failed to deploy mds") if config.get('test_mon_destroy') is not None: for d in range(1, len(mon_node)): mon_destroy_nodes = './ceph-deploy mon destroy' + \ " " + mon_node[d] estatus_mon_d = execute_ceph_deploy(mon_destroy_nodes) if estatus_mon_d != 0: raise RuntimeError("ceph-deploy: Failed to delete monitor") if config.get('wait-for-healthy', True) and no_of_osds >= 2: is_healthy(ctx=ctx, config=None) log.info('Setting up client nodes...') conf_path = '/etc/ceph/ceph.conf' admin_keyring_path = '/etc/ceph/ceph.client.admin.keyring' first_mon = teuthology.get_first_mon(ctx, config) (mon0_remote,) = ctx.cluster.only(first_mon).remotes.keys() conf_data = teuthology.get_file( remote=mon0_remote, path=conf_path, sudo=True, ) admin_keyring = teuthology.get_file( remote=mon0_remote, path=admin_keyring_path, sudo=True, ) clients = ctx.cluster.only(teuthology.is_type('client')) for remot, roles_for_host in clients.remotes.items(): for id_ in teuthology.roles_of_type(roles_for_host, 'client'): client_keyring = \ '/etc/ceph/ceph.client.{id}.keyring'.format(id=id_) mon0_remote.run( args=[ 'cd', '{tdir}'.format(tdir=testdir), run.Raw('&&'), 'sudo', 'bash', '-c', run.Raw('"'), 'ceph', 'auth', 'get-or-create', 'client.{id}'.format(id=id_), 'mds', 'allow', 'mon', 'allow *', 'osd', 'allow *', run.Raw('>'), client_keyring, run.Raw('"'), ], ) key_data = teuthology.get_file( remote=mon0_remote, path=client_keyring, sudo=True, ) teuthology.sudo_write_file( remote=remot, path=client_keyring, data=key_data, perms='0644' ) teuthology.sudo_write_file( remote=remot, path=admin_keyring_path, data=admin_keyring, perms='0644' ) teuthology.sudo_write_file( remote=remot, path=conf_path, data=conf_data, perms='0644' ) if mds_nodes: log.info('Configuring CephFS...') Filesystem(ctx, create=True) elif not config.get('only_mon'): raise RuntimeError( "The cluster is NOT operational due to insufficient OSDs") # create rbd pool ceph_admin.run( args=[ 'sudo', 'ceph', '--cluster', 'ceph', 'osd', 'pool', 'create', 'rbd', '128', '128'], check_status=False) ceph_admin.run( args=[ 'sudo', 'ceph', '--cluster', 'ceph', 'osd', 'pool', 'application', 'enable', 'rbd', 'rbd', '--yes-i-really-mean-it' ], check_status=False) yield except Exception: log.info( "Error encountered, logging exception before tearing down ceph-deploy") log.info(traceback.format_exc()) raise finally: if config.get('keep_running'): return log.info('Stopping ceph...') ctx.cluster.run(args=['sudo', 'systemctl', 'stop', 'ceph.target'], check_status=False) time.sleep(4) # and now just check for the processes themselves, as if upstart/sysvinit # is lying to us. 
        # Ignore errors if the grep fails
        ctx.cluster.run(args=['sudo', 'ps', 'aux', run.Raw('|'),
                              'grep', '-v', 'grep', run.Raw('|'),
                              'grep', 'ceph'], check_status=False)
        ctx.cluster.run(args=['sudo', 'systemctl', run.Raw('|'),
                              'grep', 'ceph'], check_status=False)

        if ctx.archive is not None:
            # archive mon data, too
            log.info('Archiving mon data...')
            path = os.path.join(ctx.archive, 'data')
            os.makedirs(path)
            mons = ctx.cluster.only(teuthology.is_type('mon'))
            for remote, roles in mons.remotes.items():
                for role in roles:
                    if role.startswith('mon.'):
                        teuthology.pull_directory_tarball(
                            remote,
                            '/var/lib/ceph/mon',
                            path + '/' + role + '.tgz')

            log.info('Compressing logs...')
            run.wait(
                ctx.cluster.run(
                    args=[
                        'sudo', 'find', '/var/log/ceph', '-name', '*.log',
                        '-print0',
                        run.Raw('|'),
                        'sudo', 'xargs', '-0', '--no-run-if-empty',
                        '--', 'gzip', '--',
                    ],
                    wait=False,
                ),
            )

            log.info('Archiving logs...')
            path = os.path.join(ctx.archive, 'remote')
            os.makedirs(path)
            for remote in ctx.cluster.remotes.keys():
                sub = os.path.join(path, remote.shortname)
                os.makedirs(sub)
                teuthology.pull_directory(remote, '/var/log/ceph',
                                          os.path.join(sub, 'log'))

        # Prevent these from being undefined if the try block fails
        all_nodes = get_all_nodes(ctx, config)
        purge_nodes = './ceph-deploy purge' + " " + all_nodes
        purgedata_nodes = './ceph-deploy purgedata' + " " + all_nodes

        log.info('Purging package...')
        execute_ceph_deploy(purge_nodes)
        log.info('Purging data...')
        execute_ceph_deploy(purgedata_nodes)
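# The config-appending step above walks a {section: {key: value}} dict and
# emits INI-style lines that append_lines_to_file writes into ceph.conf. A
# minimal standalone sketch of that rendering; render_conf_lines is a
# hypothetical helper (not a teuthology API), shown only to illustrate the
# line format the loop produces.
def render_conf_lines(confp):
    """Yield ceph.conf lines for a {section: {key: value}} dict (sketch)."""
    for section, keys in confp.items():
        yield '[{section}]\n'.format(section=section)
        for key, value in keys.items():
            yield '{key} = {value}\n'.format(key=key, value=value)

# Example:
#   list(render_conf_lines({'global': {'osd pool default size': 2}}))
#   -> ['[global]\n', 'osd pool default size = 2\n']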
def build_ceph_cluster(ctx, config): """Build a ceph cluster""" # Expect to find ceph_admin on the first mon by ID, same place that the download task # puts it. Remember this here, because subsequently IDs will change from those in # the test config to those that ceph-deploy invents. (ceph_admin, ) = ctx.cluster.only(teuthology.get_first_mon( ctx, config)).remotes.iterkeys() def execute_ceph_deploy(cmd): """Remotely execute a ceph_deploy command""" return ceph_admin.run( args=[ 'cd', '{tdir}/ceph-deploy'.format(tdir=testdir), run.Raw('&&'), run.Raw(cmd), ], check_status=False, ).exitstatus try: log.info('Building ceph cluster using ceph-deploy...') testdir = teuthology.get_testdir(ctx) ceph_branch = None if config.get('branch') is not None: cbranch = config.get('branch') for var, val in cbranch.iteritems(): ceph_branch = '--{var}={val}'.format(var=var, val=val) all_nodes = get_all_nodes(ctx, config) mds_nodes = get_nodes_using_role(ctx, 'mds') mds_nodes = " ".join(mds_nodes) mon_node = get_nodes_using_role(ctx, 'mon') mon_nodes = " ".join(mon_node) new_mon = './ceph-deploy new' + " " + mon_nodes mon_hostname = mon_nodes.split(' ')[0] mon_hostname = str(mon_hostname) gather_keys = './ceph-deploy gatherkeys' + " " + mon_hostname deploy_mds = './ceph-deploy mds create' + " " + mds_nodes no_of_osds = 0 if mon_nodes is None: raise RuntimeError("no monitor nodes in the config file") estatus_new = execute_ceph_deploy(new_mon) if estatus_new != 0: raise RuntimeError("ceph-deploy: new command failed") log.info('adding config inputs...') testdir = teuthology.get_testdir(ctx) conf_path = '{tdir}/ceph-deploy/ceph.conf'.format(tdir=testdir) if config.get('conf') is not None: confp = config.get('conf') for section, keys in confp.iteritems(): lines = '[{section}]\n'.format(section=section) teuthology.append_lines_to_file(ceph_admin, conf_path, lines, sudo=True) for key, value in keys.iteritems(): log.info("[%s] %s = %s" % (section, key, value)) lines = '{key} = {value}\n'.format(key=key, value=value) teuthology.append_lines_to_file(ceph_admin, conf_path, lines, sudo=True) # install ceph install_nodes = './ceph-deploy install ' + \ (ceph_branch if ceph_branch else "--dev=master") + " " + all_nodes estatus_install = execute_ceph_deploy(install_nodes) if estatus_install != 0: raise RuntimeError("ceph-deploy: Failed to install ceph") # install ceph-test package too install_nodes2 = './ceph-deploy install --tests ' + \ (ceph_branch if ceph_branch else "--dev=master") + " " + all_nodes estatus_install = execute_ceph_deploy(install_nodes2) if estatus_install != 0: raise RuntimeError("ceph-deploy: Failed to install ceph-test") mon_create_nodes = './ceph-deploy mon create-initial' # If the following fails, it is OK, it might just be that the monitors # are taking way more than a minute/monitor to form quorum, so lets # try the next block which will wait up to 15 minutes to gatherkeys. 
execute_ceph_deploy(mon_create_nodes) estatus_gather = execute_ceph_deploy(gather_keys) max_gather_tries = 90 gather_tries = 0 while (estatus_gather != 0): gather_tries += 1 if gather_tries >= max_gather_tries: msg = 'ceph-deploy was not able to gatherkeys after 15 minutes' raise RuntimeError(msg) estatus_gather = execute_ceph_deploy(gather_keys) time.sleep(10) if mds_nodes: estatus_mds = execute_ceph_deploy(deploy_mds) if estatus_mds != 0: raise RuntimeError("ceph-deploy: Failed to deploy mds") if config.get('test_mon_destroy') is not None: for d in range(1, len(mon_node)): mon_destroy_nodes = './ceph-deploy mon destroy' + \ " " + mon_node[d] estatus_mon_d = execute_ceph_deploy(mon_destroy_nodes) if estatus_mon_d != 0: raise RuntimeError("ceph-deploy: Failed to delete monitor") node_dev_list = get_dev_for_osd(ctx, config) for d in node_dev_list: node = d[0] for disk in d[1:]: zap = './ceph-deploy disk zap ' + node + ':' + disk estatus = execute_ceph_deploy(zap) if estatus != 0: raise RuntimeError("ceph-deploy: Failed to zap osds") osd_create_cmd = './ceph-deploy osd create ' if config.get('dmcrypt') is not None: osd_create_cmd += '--dmcrypt ' osd_create_cmd += ":".join(d) estatus_osd = execute_ceph_deploy(osd_create_cmd) if estatus_osd == 0: log.info('successfully created osd') no_of_osds += 1 else: raise RuntimeError("ceph-deploy: Failed to create osds") if config.get('wait-for-healthy', True) and no_of_osds >= 2: is_healthy(ctx=ctx, config=None) log.info('Setting up client nodes...') conf_path = '/etc/ceph/ceph.conf' admin_keyring_path = '/etc/ceph/ceph.client.admin.keyring' first_mon = teuthology.get_first_mon(ctx, config) (mon0_remote, ) = ctx.cluster.only(first_mon).remotes.keys() conf_data = teuthology.get_file( remote=mon0_remote, path=conf_path, sudo=True, ) admin_keyring = teuthology.get_file( remote=mon0_remote, path=admin_keyring_path, sudo=True, ) clients = ctx.cluster.only(teuthology.is_type('client')) for remot, roles_for_host in clients.remotes.iteritems(): for id_ in teuthology.roles_of_type(roles_for_host, 'client'): client_keyring = \ '/etc/ceph/ceph.client.{id}.keyring'.format(id=id_) mon0_remote.run(args=[ 'cd', '{tdir}'.format(tdir=testdir), run.Raw('&&'), 'sudo', 'bash', '-c', run.Raw('"'), 'ceph', 'auth', 'get-or-create', 'client.{id}'.format(id=id_), 'mds', 'allow', 'mon', 'allow *', 'osd', 'allow *', run.Raw('>'), client_keyring, run.Raw('"'), ], ) key_data = teuthology.get_file( remote=mon0_remote, path=client_keyring, sudo=True, ) teuthology.sudo_write_file(remote=remot, path=client_keyring, data=key_data, perms='0644') teuthology.sudo_write_file(remote=remot, path=admin_keyring_path, data=admin_keyring, perms='0644') teuthology.sudo_write_file(remote=remot, path=conf_path, data=conf_data, perms='0644') if mds_nodes: log.info('Configuring CephFS...') ceph_fs = Filesystem(ctx, admin_remote=clients.remotes.keys()[0]) if not ceph_fs.legacy_configured(): ceph_fs.create() elif not config.get('only_mon'): raise RuntimeError( "The cluster is NOT operational due to insufficient OSDs") yield except Exception: log.info( "Error encountered, logging exception before tearing down ceph-deploy" ) log.info(traceback.format_exc()) raise finally: if config.get('keep_running'): return log.info('Stopping ceph...') ctx.cluster.run(args=[ 'sudo', 'stop', 'ceph-all', run.Raw('||'), 'sudo', 'service', 'ceph', 'stop', run.Raw('||'), 'sudo', 'systemctl', 'stop', 'ceph.target' ]) # Are you really not running anymore? 
# try first with the init tooling # ignoring the status so this becomes informational only ctx.cluster.run(args=[ 'sudo', 'status', 'ceph-all', run.Raw('||'), 'sudo', 'service', 'ceph', 'status', run.Raw('||'), 'sudo', 'systemctl', 'status', 'ceph.target' ], check_status=False) # and now just check for the processes themselves, as if upstart/sysvinit # is lying to us. Ignore errors if the grep fails ctx.cluster.run(args=[ 'sudo', 'ps', 'aux', run.Raw('|'), 'grep', '-v', 'grep', run.Raw('|'), 'grep', 'ceph' ], check_status=False) if ctx.archive is not None: # archive mon data, too log.info('Archiving mon data...') path = os.path.join(ctx.archive, 'data') os.makedirs(path) mons = ctx.cluster.only(teuthology.is_type('mon')) for remote, roles in mons.remotes.iteritems(): for role in roles: if role.startswith('mon.'): teuthology.pull_directory_tarball( remote, '/var/lib/ceph/mon', path + '/' + role + '.tgz') log.info('Compressing logs...') run.wait( ctx.cluster.run( args=[ 'sudo', 'find', '/var/log/ceph', '-name', '*.log', '-print0', run.Raw('|'), 'sudo', 'xargs', '-0', '--no-run-if-empty', '--', 'gzip', '--', ], wait=False, ), ) log.info('Archiving logs...') path = os.path.join(ctx.archive, 'remote') os.makedirs(path) for remote in ctx.cluster.remotes.iterkeys(): sub = os.path.join(path, remote.shortname) os.makedirs(sub) teuthology.pull_directory(remote, '/var/log/ceph', os.path.join(sub, 'log')) # Prevent these from being undefined if the try block fails all_nodes = get_all_nodes(ctx, config) purge_nodes = './ceph-deploy purge' + " " + all_nodes purgedata_nodes = './ceph-deploy purgedata' + " " + all_nodes log.info('Purging package...') execute_ceph_deploy(purge_nodes) log.info('Purging data...') execute_ceph_deploy(purgedata_nodes)
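# The gatherkeys loop above retries up to 90 times with a 10-second sleep,
# i.e. roughly the 15 minutes mentioned in the comment. A minimal standalone
# sketch of that retry shape, assuming `attempt` is any callable returning a
# shell-style exit status; retry_until_zero is illustrative, not a
# teuthology helper.
import time

def retry_until_zero(attempt, tries=90, delay=10):
    """Call attempt() until it returns 0, sleeping `delay`s between tries."""
    for _ in range(tries):
        if attempt() == 0:
            return
        time.sleep(delay)
    raise RuntimeError('command did not succeed after %d tries' % tries)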
def run_daemon(ctx, config, type_): log.info('Starting %s daemons...' % type_) testdir = teuthology.get_testdir(ctx) daemons = ctx.cluster.only(teuthology.is_type(type_)) coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir) daemon_signal = 'kill' if config.get('coverage') or config.get('valgrind') is not None: daemon_signal = 'term' num_active = 0 for remote, roles_for_host in daemons.remotes.iteritems(): for id_ in teuthology.roles_of_type(roles_for_host, type_): name = '%s.%s' % (type_, id_) if not (id_.endswith('-s')) and (id_.find('-s-') == -1): num_active += 1 run_cmd = [ 'sudo', 'adjust-ulimits', 'ceph-coverage', coverage_dir, 'daemon-helper', daemon_signal, ] run_cmd_tail = [ 'ceph-%s' % (type_), '-f', '-i', id_] if type_ in config.get('cpu_profile', []): profile_path = '/var/log/ceph/profiling-logger/%s.%s.prof' % (type_, id_) run_cmd.extend([ 'env', 'CPUPROFILE=%s' % profile_path ]) if config.get('valgrind') is not None: valgrind_args = None if type_ in config['valgrind']: valgrind_args = config['valgrind'][type_] if name in config['valgrind']: valgrind_args = config['valgrind'][name] run_cmd = teuthology.get_valgrind_args(testdir, name, run_cmd, valgrind_args) run_cmd.extend(run_cmd_tail) ctx.daemons.add_daemon(remote, type_, id_, args=run_cmd, logger=log.getChild(name), stdin=run.PIPE, wait=False, ) if type_ == 'mds': firstmon = teuthology.get_first_mon(ctx, config) (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys() mon0_remote.run(args=[ 'adjust-ulimits', 'ceph-coverage', coverage_dir, 'ceph', 'mds', 'set_max_mds', str(num_active)]) try: yield finally: teuthology.stop_daemons_of_type(ctx, type_)
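# run_daemon() counts only non-standby daemons toward num_active: ids ending
# in '-s' or containing '-s-' follow the standby naming convention and are
# skipped. A standalone sketch of that test (is_active_id is a hypothetical
# name mirroring the condition above):
def is_active_id(id_):
    """True unless id_ follows the standby naming convention used above."""
    return not id_.endswith('-s') and id_.find('-s-') == -1

# Example: [i for i in ['a', 'a-s', 'b-s-0', 'b'] if is_active_id(i)]
#          -> ['a', 'b']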
def task(ctx, config): """ Run ceph_objectstore_tool test The config should be as follows:: ceph_objectstore_tool: objects: 20 # <number of objects> pgnum: 12 """ if config is None: config = {} assert isinstance(config, dict), \ 'ceph_objectstore_tool task only accepts a dict for configuration' log.info('Beginning ceph_objectstore_tool...') log.debug(config) log.debug(ctx) clients = ctx.cluster.only(teuthology.is_type('client')) assert len(clients.remotes) > 0, 'Must specify at least 1 client' (cli_remote, _) = clients.remotes.popitem() log.debug(cli_remote) # clients = dict(teuthology.get_clients(ctx=ctx, roles=config.keys())) # client = clients.popitem() # log.info(client) osds = ctx.cluster.only(teuthology.is_type('osd')) log.info("OSDS") log.info(osds) log.info(osds.remotes) first_mon = teuthology.get_first_mon(ctx, config) (mon, ) = ctx.cluster.only(first_mon).remotes.iterkeys() manager = ceph_manager.CephManager( mon, ctx=ctx, config=config, logger=log.getChild('ceph_manager'), ) ctx.manager = manager while (len(manager.get_osd_status()['up']) != len( manager.get_osd_status()['raw'])): time.sleep(10) while (len(manager.get_osd_status()['in']) != len( manager.get_osd_status()['up'])): time.sleep(10) manager.raw_cluster_cmd('osd', 'set', 'noout') manager.raw_cluster_cmd('osd', 'set', 'nodown') PGNUM = config.get('pgnum', 12) log.info("pgnum: {num}".format(num=PGNUM)) ERRORS = 0 REP_POOL = "rep_pool" REP_NAME = "REPobject" create_replicated_pool(cli_remote, REP_POOL, PGNUM) ERRORS += test_objectstore(ctx, config, cli_remote, REP_POOL, REP_NAME) EC_POOL = "ec_pool" EC_NAME = "ECobject" create_ec_pool(cli_remote, EC_POOL, 'default', PGNUM) ERRORS += test_objectstore(ctx, config, cli_remote, EC_POOL, EC_NAME, ec=True) if ERRORS == 0: log.info("TEST PASSED") else: log.error("TEST FAILED WITH {errcount} ERRORS".format(errcount=ERRORS)) assert ERRORS == 0 try: yield finally: log.info('Ending ceph_objectstore_tool')
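# The two polling loops above spin forever if an OSD never comes up or in.
# A bounded variant is sketched here, assuming get_status() returns a dict
# shaped like manager.get_osd_status() with 'raw', 'up' and 'in' lists; the
# timeout handling is an assumption of this sketch, not what the task does.
import time

def wait_for_osds(get_status, timeout=300, interval=10):
    """Poll until every OSD is both up and in, or raise after `timeout`s."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        status = get_status()
        if (len(status['up']) == len(status['raw']) and
                len(status['in']) == len(status['up'])):
            return
        time.sleep(interval)
    raise RuntimeError('OSDs failed to come up/in within %ds' % timeout)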
def build_ceph_cluster(ctx, config): log.info('Building ceph cluster using ceph-deploy...') testdir = teuthology.get_testdir(ctx) ceph_branch = None if config.get('branch') is not None: cbranch = config.get('branch') for var, val in cbranch.iteritems(): if var == 'testing': ceph_branch = '--{var}'.format(var=var) ceph_branch = '--{var}={val}'.format(var=var, val=val) node_dev_list = [] all_nodes = get_all_nodes(ctx, config) mds_nodes = get_nodes_using_roles(ctx, config, 'mds') mds_nodes = " ".join(mds_nodes) mon_node = get_nodes_using_roles(ctx, config, 'mon') mon_nodes = " ".join(mon_node) new_mon = './ceph-deploy new' + " " + mon_nodes install_nodes = './ceph-deploy install ' + ceph_branch + " " + all_nodes purge_nodes = './ceph-deploy purge' + " " + all_nodes purgedata_nodes = './ceph-deploy purgedata' + " " + all_nodes mon_hostname = mon_nodes.split(' ')[0] mon_hostname = str(mon_hostname) gather_keys = './ceph-deploy gatherkeys' + " " + mon_hostname deploy_mds = './ceph-deploy mds create' + " " + mds_nodes no_of_osds = 0 if mon_nodes is None: raise RuntimeError("no monitor nodes in the config file") estatus_new = execute_ceph_deploy(ctx, config, new_mon) if estatus_new != 0: raise RuntimeError("ceph-deploy: new command failed") log.info('adding config inputs...') testdir = teuthology.get_testdir(ctx) conf_path = '{tdir}/ceph-deploy/ceph.conf'.format(tdir=testdir) first_mon = teuthology.get_first_mon(ctx, config) (remote, ) = ctx.cluster.only(first_mon).remotes.keys() lines = None if config.get('conf') is not None: confp = config.get('conf') for section, keys in confp.iteritems(): lines = '[{section}]\n'.format(section=section) teuthology.append_lines_to_file(remote, conf_path, lines, sudo=True) for key, value in keys.iteritems(): log.info("[%s] %s = %s" % (section, key, value)) lines = '{key} = {value}\n'.format(key=key, value=value) teuthology.append_lines_to_file(remote, conf_path, lines, sudo=True) estatus_install = execute_ceph_deploy(ctx, config, install_nodes) if estatus_install != 0: raise RuntimeError("ceph-deploy: Failed to install ceph") mon_no = None mon_no = config.get('mon_initial_members') if mon_no is not None: i = 0 mon1 = [] while (i < mon_no): mon1.append(mon_node[i]) i = i + 1 initial_mons = " ".join(mon1) for k in range(mon_no, len(mon_node)): mon_create_nodes = './ceph-deploy mon create' + " " + initial_mons + " " + mon_node[ k] estatus_mon = execute_ceph_deploy(ctx, config, mon_create_nodes) if estatus_mon != 0: raise RuntimeError("ceph-deploy: Failed to create monitor") else: mon_create_nodes = './ceph-deploy mon create' + " " + mon_nodes estatus_mon = execute_ceph_deploy(ctx, config, mon_create_nodes) if estatus_mon != 0: raise RuntimeError("ceph-deploy: Failed to create monitors") estatus_gather = execute_ceph_deploy(ctx, config, gather_keys) while (estatus_gather != 0): #mon_create_nodes = './ceph-deploy mon create'+" "+mon_node[0] #execute_ceph_deploy(ctx, config, mon_create_nodes) estatus_gather = execute_ceph_deploy(ctx, config, gather_keys) if mds_nodes: estatus_mds = execute_ceph_deploy(ctx, config, deploy_mds) if estatus_mds != 0: raise RuntimeError("ceph-deploy: Failed to deploy mds") if config.get('test_mon_destroy') is not None: for d in range(1, len(mon_node)): mon_destroy_nodes = './ceph-deploy mon destroy' + " " + mon_node[d] estatus_mon_d = execute_ceph_deploy(ctx, config, mon_destroy_nodes) if estatus_mon_d != 0: raise RuntimeError("ceph-deploy: Failed to delete monitor") node_dev_list = get_dev_for_osd(ctx, config) for d in node_dev_list: 
osd_create_cmds = './ceph-deploy osd create --zap-disk' + " " + d estatus_osd = execute_ceph_deploy(ctx, config, osd_create_cmds) if estatus_osd == 0: log.info('successfully created osd') no_of_osds += 1 else: zap_disk = './ceph-deploy disk zap' + " " + d execute_ceph_deploy(ctx, config, zap_disk) estatus_osd = execute_ceph_deploy(ctx, config, osd_create_cmds) if estatus_osd == 0: log.info('successfully created osd') no_of_osds += 1 else: raise RuntimeError("ceph-deploy: Failed to create osds") if config.get('wait-for-healthy', True) and no_of_osds >= 2: is_healthy(ctx=ctx, config=None) log.info('Setting up client nodes...') conf_path = '/etc/ceph/ceph.conf' admin_keyring_path = '/etc/ceph/ceph.client.admin.keyring' first_mon = teuthology.get_first_mon(ctx, config) (mon0_remote, ) = ctx.cluster.only(first_mon).remotes.keys() conf_data = teuthology.get_file( remote=mon0_remote, path=conf_path, sudo=True, ) admin_keyring = teuthology.get_file( remote=mon0_remote, path=admin_keyring_path, sudo=True, ) clients = ctx.cluster.only(teuthology.is_type('client')) for remot, roles_for_host in clients.remotes.iteritems(): for id_ in teuthology.roles_of_type(roles_for_host, 'client'): client_keyring = '/etc/ceph/ceph.client.{id}.keyring'.format( id=id_) mon0_remote.run(args=[ 'cd', '{tdir}'.format(tdir=testdir), run.Raw('&&'), 'sudo', 'bash', '-c', run.Raw('"'), 'ceph', 'auth', 'get-or-create', 'client.{id}'.format(id=id_), 'mds', 'allow', 'mon', 'allow *', 'osd', 'allow *', run.Raw('>'), client_keyring, run.Raw('"'), ], ) key_data = teuthology.get_file( remote=mon0_remote, path=client_keyring, sudo=True, ) teuthology.sudo_write_file(remote=remot, path=client_keyring, data=key_data, perms='0644') teuthology.sudo_write_file(remote=remot, path=admin_keyring_path, data=admin_keyring, perms='0644') teuthology.sudo_write_file(remote=remot, path=conf_path, data=conf_data, perms='0644') else: raise RuntimeError( "The cluster is NOT operational due to insufficient OSDs") try: yield finally: log.info('Stopping ceph...') ctx.cluster.run(args=[ 'sudo', 'stop', 'ceph-all', run.Raw('||'), 'sudo', 'service', 'ceph', 'stop' ]) if ctx.archive is not None: # archive mon data, too log.info('Archiving mon data...') path = os.path.join(ctx.archive, 'data') os.makedirs(path) mons = ctx.cluster.only(teuthology.is_type('mon')) for remote, roles in mons.remotes.iteritems(): for role in roles: if role.startswith('mon.'): teuthology.pull_directory_tarball( remote, '/var/lib/ceph/mon', path + '/' + role + '.tgz') log.info('Compressing logs...') run.wait( ctx.cluster.run( args=[ 'sudo', 'find', '/var/log/ceph', '-name', '*.log', '-print0', run.Raw('|'), 'sudo', 'xargs', '-0', '--no-run-if-empty', '--', 'gzip', '--', ], wait=False, ), ) log.info('Archiving logs...') path = os.path.join(ctx.archive, 'remote') os.makedirs(path) for remote in ctx.cluster.remotes.iterkeys(): sub = os.path.join(path, remote.shortname) os.makedirs(sub) teuthology.pull_directory(remote, '/var/log/ceph', os.path.join(sub, 'log')) log.info('Purging package...') execute_ceph_deploy(ctx, config, purge_nodes) log.info('Purging data...') execute_ceph_deploy(ctx, config, purgedata_nodes)
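# The OSD-creation path above tries `osd create --zap-disk` once and, on
# failure, zaps the disk explicitly and retries exactly once before giving
# up. A standalone sketch of that create/zap/retry shape; run_cmd stands in
# for execute_ceph_deploy and is an assumption of this sketch.
def create_osd_with_retry(run_cmd, node_disk):
    """Return True if the OSD was created, zapping and retrying once."""
    create = './ceph-deploy osd create --zap-disk ' + node_disk
    if run_cmd(create) == 0:
        return True
    run_cmd('./ceph-deploy disk zap ' + node_disk)
    return run_cmd(create) == 0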
def ceph_mons(ctx, config): """ Deploy any additional mons """ cluster_name = config['cluster'] fsid = ctx.ceph[cluster_name].fsid try: daemons = {} if config.get('add_mons_via_daemon_add'): # This is the old way of adding mons that works with the (early) octopus # cephadm scheduler. num_mons = 1 for remote, roles in ctx.cluster.remotes.items(): for mon in [ r for r in roles if teuthology.is_type('mon', cluster_name)(r) ]: c_, _, id_ = teuthology.split_role(mon) if c_ == cluster_name and id_ == ctx.ceph[ cluster_name].first_mon: continue log.info('Adding %s on %s' % (mon, remote.shortname)) num_mons += 1 _shell(ctx, cluster_name, remote, [ 'ceph', 'orch', 'daemon', 'add', 'mon', remote.shortname + ':' + ctx.ceph[cluster_name].mons[mon] + '=' + id_, ]) ctx.daemons.register_daemon( remote, 'mon', id_, cluster=cluster_name, fsid=fsid, logger=log.getChild(mon), wait=False, started=True, ) daemons[mon] = (remote, id_) with contextutil.safe_while(sleep=1, tries=180) as proceed: while proceed(): log.info('Waiting for %d mons in monmap...' % (num_mons)) r = _shell( ctx=ctx, cluster_name=cluster_name, remote=remote, args=[ 'ceph', 'mon', 'dump', '-f', 'json', ], stdout=StringIO(), ) j = json.loads(r.stdout.getvalue()) if len(j['mons']) == num_mons: break else: nodes = [] for remote, roles in ctx.cluster.remotes.items(): for mon in [ r for r in roles if teuthology.is_type('mon', cluster_name)(r) ]: c_, _, id_ = teuthology.split_role(mon) log.info('Adding %s on %s' % (mon, remote.shortname)) nodes.append(remote.shortname + ':' + ctx.ceph[cluster_name].mons[mon] + '=' + id_) if c_ == cluster_name and id_ == ctx.ceph[ cluster_name].first_mon: continue daemons[mon] = (remote, id_) _shell(ctx, cluster_name, remote, [ 'ceph', 'orch', 'apply', 'mon', str(len(nodes)) + ';' + ';'.join(nodes) ]) for mgr, i in daemons.items(): remote, id_ = i ctx.daemons.register_daemon( remote, 'mon', id_, cluster=cluster_name, fsid=fsid, logger=log.getChild(mon), wait=False, started=True, ) with contextutil.safe_while(sleep=1, tries=180) as proceed: while proceed(): log.info('Waiting for %d mons in monmap...' % (len(nodes))) r = _shell( ctx=ctx, cluster_name=cluster_name, remote=remote, args=[ 'ceph', 'mon', 'dump', '-f', 'json', ], stdout=StringIO(), ) j = json.loads(r.stdout.getvalue()) if len(j['mons']) == len(nodes): break # refresh our (final) ceph.conf file bootstrap_remote = ctx.ceph[cluster_name].bootstrap_remote log.info('Generating final ceph.conf file...') r = _shell( ctx=ctx, cluster_name=cluster_name, remote=bootstrap_remote, args=[ 'ceph', 'config', 'generate-minimal-conf', ], stdout=StringIO(), ) ctx.ceph[cluster_name].config_file = r.stdout.getvalue() yield finally: pass
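# Both branches above poll `ceph mon dump -f json` until the monmap holds
# the expected number of mons. A standalone sketch of just the parsing step,
# where dump_json is assumed to be the raw stdout of that command
# (mons_in_monmap is an illustrative helper, not part of this module):
import json

def mons_in_monmap(dump_json):
    """Count the mons recorded in `ceph mon dump -f json` output."""
    return len(json.loads(dump_json)['mons'])

# Example: mons_in_monmap('{"mons": [{"name": "a"}, {"name": "b"}]}') -> 2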
def hosts_of_type(self, type_):
    """Return the names of all remotes carrying a role of the given type
    (e.g. 'osd', 'mon')."""
    return [
        r.name
        for r in self.ctx.cluster.only(misc.is_type(type_)).remotes.keys()
    ]
def upgrade(ctx, config): """ Upgrade using ceph-deploy eg: ceph-deploy.upgrade: # to upgrade to specific branch, use branch: stable: jewel # to setup mgr node, use setup-mgr-node: True # to wait for cluster to be healthy after all upgrade, use wait-for-healthy: True role: (upgrades the below roles serially) mon.a mon.b osd.0 """ roles = config.get('roles') # get the roles that are mapped as per ceph-deploy # roles are mapped for mon/mds eg: mon.a => mon.host_short_name mapped_role = ctx.cluster.mapped_role log.info("roles={r}, mapped_roles={mr}".format(r=roles, mr=mapped_role)) if config.get('branch'): branch = config.get('branch') (var, val) = branch.items()[0] ceph_branch = '--{var}={val}'.format(var=var, val=val) else: # default to wip-branch under test dev_branch = ctx.config['branch'] ceph_branch = '--dev={branch}'.format(branch=dev_branch) # get the node used for initial deployment which is mon.a mon_a = mapped_role.get('mon.a') (ceph_admin,) = ctx.cluster.only(mon_a).remotes.keys() testdir = teuthology.get_testdir(ctx) cmd = './ceph-deploy install ' + ceph_branch for role in roles: # check if this role is mapped (mon or mds) if mapped_role.get(role): role = mapped_role.get(role) remotes_and_roles = ctx.cluster.only(role).remotes for remote, roles in remotes_and_roles.items(): nodename = remote.shortname cmd = cmd + ' ' + nodename log.info("Upgrading ceph on %s", nodename) ceph_admin.run( args=[ 'cd', '{tdir}/ceph-deploy'.format(tdir=testdir), run.Raw('&&'), run.Raw(cmd), ], ) # restart all ceph services, ideally upgrade should but it does not remote.run( args=[ 'sudo', 'systemctl', 'restart', 'ceph.target' ] ) ceph_admin.run(args=['sudo', 'ceph', '-s']) # workaround for http://tracker.ceph.com/issues/20950 # write the correct mgr key to disk if config.get('setup-mgr-node', None): mons = ctx.cluster.only(teuthology.is_type('mon')) for remote, roles in mons.remotes.items(): remote.run( args=[ run.Raw('sudo ceph auth get client.bootstrap-mgr'), run.Raw('|'), run.Raw('sudo tee'), run.Raw('/var/lib/ceph/bootstrap-mgr/ceph.keyring') ] ) if config.get('setup-mgr-node', None): mgr_nodes = get_nodes_using_role(ctx, 'mgr') mgr_nodes = " ".join(mgr_nodes) mgr_install = './ceph-deploy install --mgr ' + ceph_branch + " " + mgr_nodes mgr_create = './ceph-deploy mgr create' + " " + mgr_nodes # install mgr ceph_admin.run( args=[ 'cd', '{tdir}/ceph-deploy'.format(tdir=testdir), run.Raw('&&'), run.Raw(mgr_install), ], ) # create mgr ceph_admin.run( args=[ 'cd', '{tdir}/ceph-deploy'.format(tdir=testdir), run.Raw('&&'), run.Raw(mgr_create), ], ) ceph_admin.run(args=['sudo', 'ceph', '-s']) if config.get('wait-for-healthy', None): wait_until_healthy(ctx, ceph_admin, use_sudo=True) yield
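# upgrade() turns the `branch:` config into a single ceph-deploy flag:
# {'stable': 'jewel'} becomes '--stable=jewel', and with no branch config it
# falls back to '--dev=<branch under test>'. A standalone sketch of that
# mapping (branch_flag is a hypothetical helper mirroring the logic above):
def branch_flag(branch_conf, default_dev):
    """Map a {'stable': 'jewel'}-style dict to a ceph-deploy flag."""
    if branch_conf:
        var, val = list(branch_conf.items())[0]
        return '--{var}={val}'.format(var=var, val=val)
    return '--dev={branch}'.format(branch=default_dev)

# Example: branch_flag({'stable': 'jewel'}, 'main') -> '--stable=jewel'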
def cli_test(ctx, config): """ ceph-deploy cli to exercise most commonly use cli's and ensure all commands works and also startup the init system. """ log.info('Ceph-deploy Test') if config is None: config = {} test_branch = '' conf_dir = teuthology.get_testdir(ctx) + "/cdtest" def execute_cdeploy(admin, cmd, path): """Execute ceph-deploy commands """ """Either use git path or repo path """ args = ['cd', conf_dir, run.Raw(';')] if path: args.append('{path}/ceph-deploy/ceph-deploy'.format(path=path)) else: args.append('ceph-deploy') args.append(run.Raw(cmd)) ec = admin.run(args=args, check_status=False).exitstatus if ec != 0: raise RuntimeError( "failed during ceph-deploy cmd: {cmd} , ec={ec}".format(cmd=cmd, ec=ec)) if config.get('rhbuild'): path = None else: path = teuthology.get_testdir(ctx) # test on branch from config eg: wip-* , master or next etc # packages for all distro's should exist for wip* if ctx.config.get('branch'): branch = ctx.config.get('branch') test_branch = ' --dev={branch} '.format(branch=branch) mons = ctx.cluster.only(teuthology.is_type('mon')) for node, role in mons.remotes.items(): admin = node admin.run(args=['mkdir', conf_dir], check_status=False) nodename = admin.shortname system_type = teuthology.get_system_type(admin) if config.get('rhbuild'): admin.run(args=['sudo', 'yum', 'install', 'ceph-deploy', '-y']) log.info('system type is %s', system_type) osds = ctx.cluster.only(teuthology.is_type('osd')) for remote, roles in osds.remotes.items(): devs = teuthology.get_scratch_devices(remote) log.info("roles %s", roles) if (len(devs) < 3): log.error( 'Test needs minimum of 3 devices, only found %s', str(devs)) raise RuntimeError("Needs minimum of 3 devices ") conf_path = '{conf_dir}/ceph.conf'.format(conf_dir=conf_dir) new_cmd = 'new ' + nodename execute_cdeploy(admin, new_cmd, path) if config.get('conf') is not None: confp = config.get('conf') for section, keys in confp.items(): lines = '[{section}]\n'.format(section=section) teuthology.append_lines_to_file(admin, conf_path, lines, sudo=True) for key, value in keys.items(): log.info("[%s] %s = %s" % (section, key, value)) lines = '{key} = {value}\n'.format(key=key, value=value) teuthology.append_lines_to_file(admin, conf_path, lines, sudo=True) new_mon_install = 'install {branch} --mon '.format( branch=test_branch) + nodename new_mgr_install = 'install {branch} --mgr '.format( branch=test_branch) + nodename new_osd_install = 'install {branch} --osd '.format( branch=test_branch) + nodename new_admin = 'install {branch} --cli '.format(branch=test_branch) + nodename create_initial = 'mon create-initial ' mgr_create = 'mgr create ' + nodename # either use create-keys or push command push_keys = 'admin ' + nodename execute_cdeploy(admin, new_mon_install, path) execute_cdeploy(admin, new_mgr_install, path) execute_cdeploy(admin, new_osd_install, path) execute_cdeploy(admin, new_admin, path) execute_cdeploy(admin, create_initial, path) execute_cdeploy(admin, mgr_create, path) execute_cdeploy(admin, push_keys, path) for i in range(3): zap_disk = 'disk zap ' + "{n}:{d}".format(n=nodename, d=devs[i]) prepare = 'osd prepare ' + "{n}:{d}".format(n=nodename, d=devs[i]) execute_cdeploy(admin, zap_disk, path) execute_cdeploy(admin, prepare, path) log.info("list files for debugging purpose to check file permissions") admin.run(args=['ls', run.Raw('-lt'), conf_dir]) remote.run(args=['sudo', 'ceph', '-s'], check_status=False) r = remote.run(args=['sudo', 'ceph', 'health'], stdout=StringIO()) out = r.stdout.getvalue() log.info('Ceph 
health: %s', out.rstrip('\n')) log.info("Waiting for cluster to become healthy") with contextutil.safe_while(sleep=10, tries=6, action='check health') as proceed: while proceed(): r = remote.run(args=['sudo', 'ceph', 'health'], stdout=StringIO()) out = r.stdout.getvalue() if (out.split(None, 1)[0] == 'HEALTH_OK'): break rgw_install = 'install {branch} --rgw {node}'.format( branch=test_branch, node=nodename, ) rgw_create = 'rgw create ' + nodename execute_cdeploy(admin, rgw_install, path) execute_cdeploy(admin, rgw_create, path) log.info('All ceph-deploy cli tests passed') try: yield finally: log.info("cleaning up") ctx.cluster.run(args=['sudo', 'systemctl', 'stop', 'ceph.target'], check_status=False) time.sleep(4) for i in range(3): umount_dev = "{d}1".format(d=devs[i]) r = remote.run(args=['sudo', 'umount', run.Raw(umount_dev)]) cmd = 'purge ' + nodename execute_cdeploy(admin, cmd, path) cmd = 'purgedata ' + nodename execute_cdeploy(admin, cmd, path) log.info("Removing temporary dir") admin.run( args=[ 'rm', run.Raw('-rf'), run.Raw(conf_dir)], check_status=False) if config.get('rhbuild'): admin.run(args=['sudo', 'yum', 'remove', 'ceph-deploy', '-y'])
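# cli_test() treats the cluster as healthy when the first whitespace-
# delimited token of `ceph health` output is HEALTH_OK. A standalone sketch
# of that check (is_health_ok is illustrative; the empty-output guard is an
# addition of this sketch, the task itself would index-error there):
def is_health_ok(health_output):
    """True if `ceph health` output starts with the HEALTH_OK token."""
    fields = health_output.split(None, 1)
    return bool(fields) and fields[0] == 'HEALTH_OK'

# Example: is_health_ok('HEALTH_OK') -> True
#          is_health_ok('HEALTH_WARN 1 osds down') -> False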
def task(ctx, config):
    """
    Start up rest-api.

    To start on all clients::

        tasks:
        - ceph:
        - rest-api:

    To only run on certain clients::

        tasks:
        - ceph:
        - rest-api: [client.0, client.3]

    or

        tasks:
        - ceph:
        - rest-api:
            client.0:
            client.3:

    The general flow of things here is:
        1. Find clients on which rest-api is supposed to run (api_clients)
        2. Generate keyring values
        3. Start up ceph-rest-api daemons
    On cleanup:
        4. Stop the daemons
        5. Delete keyring value files.
    """
    api_clients = []
    remotes = ctx.cluster.only(teuthology.is_type('client')).remotes
    log.info(remotes)
    if config is None:
        api_clients = ['client.{id}'.format(id=id_)
                       for id_ in teuthology.all_roles_of_type(
                           ctx.cluster, 'client')]
    else:
        api_clients = config
    log.info(api_clients)
    testdir = teuthology.get_testdir(ctx)
    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
    for rems, roles in remotes.iteritems():
        for whole_id_ in roles:
            if whole_id_ in api_clients:
                id_ = whole_id_[len('client.'):]
                keyring = '/etc/ceph/ceph.client.rest{id}.keyring'.format(
                    id=id_)
                rems.run(
                    args=[
                        'sudo',
                        'adjust-ulimits',
                        'ceph-coverage',
                        coverage_dir,
                        'ceph-authtool',
                        '--create-keyring',
                        '--gen-key',
                        '--name=client.rest{id}'.format(id=id_),
                        '--set-uid=0',
                        '--cap', 'mon', 'allow *',
                        '--cap', 'osd', 'allow *',
                        '--cap', 'mds', 'allow',
                        keyring,
                        run.Raw('&&'),
                        'sudo',
                        'chmod',
                        '0644',
                        keyring,
                    ],
                )
                rems.run(
                    args=[
                        'sudo',
                        'sh',
                        '-c',
                        run.Raw("'"),
                        "echo",
                        '[client.rest{id}]'.format(id=id_),
                        run.Raw('>>'),
                        "/etc/ceph/ceph.conf",
                        run.Raw("'")
                    ]
                )
                rems.run(
                    args=[
                        'sudo',
                        'sh',
                        '-c',
                        run.Raw("'"),
                        'echo',
                        'restapi',
                        'keyring',
                        '=',
                        '/etc/ceph/ceph.client.rest{id}.keyring'.format(
                            id=id_),
                        run.Raw('>>'),
                        '/etc/ceph/ceph.conf',
                        run.Raw("'"),
                    ]
                )
                rems.run(
                    args=[
                        'ceph',
                        'auth',
                        'import',
                        '-i',
                        '/etc/ceph/ceph.client.rest{id}.keyring'.format(
                            id=id_),
                    ]
                )
    with contextutil.nested(
            lambda: run_rest_api_daemon(ctx=ctx, api_clients=api_clients),):
        yield
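# The rest-api task derives the bare client id by stripping the 'client.'
# prefix from the role name and uses it to build the per-client keyring
# path. A standalone sketch of that mapping (rest_keyring_path is a
# hypothetical name for illustration):
def rest_keyring_path(role):
    """Map 'client.0' -> '/etc/ceph/ceph.client.rest0.keyring'."""
    assert role.startswith('client.')
    id_ = role[len('client.'):]
    return '/etc/ceph/ceph.client.rest{id}.keyring'.format(id=id_)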
def test_objectstore(ctx, config, cli_remote, REP_POOL, REP_NAME, ec=False): manager = ctx.manager osds = ctx.cluster.only(teuthology.is_type('osd')) TEUTHDIR = teuthology.get_testdir(ctx) DATADIR = os.path.join(TEUTHDIR, "data") DATALINECOUNT = 10000 ERRORS = 0 NUM_OBJECTS = config.get('objects', 10) log.info("objects: {num}".format(num=NUM_OBJECTS)) pool_dump = manager.get_pool_dump(REP_POOL) REPID = pool_dump['pool'] log.debug("repid={num}".format(num=REPID)) db = {} LOCALDIR = tempfile.mkdtemp("cod") cod_setup_local_data(log, ctx, NUM_OBJECTS, LOCALDIR, REP_NAME, DATALINECOUNT) allremote = [] allremote.append(cli_remote) allremote += osds.remotes.keys() allremote = list(set(allremote)) for remote in allremote: cod_setup_remote_data(log, ctx, remote, NUM_OBJECTS, DATADIR, REP_NAME, DATALINECOUNT) ERRORS += cod_setup(log, ctx, cli_remote, NUM_OBJECTS, DATADIR, REP_NAME, DATALINECOUNT, REP_POOL, db, ec) pgs = {} for stats in manager.get_pg_stats(): if stats["pgid"].find(str(REPID) + ".") != 0: continue if pool_dump["type"] == ceph_manager.CephManager.REPLICATED_POOL: for osd in stats["acting"]: pgs.setdefault(osd, []).append(stats["pgid"]) elif pool_dump["type"] == ceph_manager.CephManager.ERASURE_CODED_POOL: shard = 0 for osd in stats["acting"]: pgs.setdefault(osd, []).append("{pgid}s{shard}".format( pgid=stats["pgid"], shard=shard)) shard += 1 else: raise Exception("{pool} has an unexpected type {type}".format( pool=REP_POOL, type=pool_dump["type"])) log.info(pgs) log.info(db) for osd in manager.get_osd_status()['up']: manager.kill_osd(osd) time.sleep(5) pgswithobjects = set() objsinpg = {} # Test --op list and generate json for all objects log.info("Test --op list by generating json for all objects") prefix = ("sudo ceph-objectstore-tool " "--data-path {fpath} " "--journal-path {jpath} ").format(fpath=FSPATH, jpath=JPATH) for remote in osds.remotes.iterkeys(): log.debug(remote) log.debug(osds.remotes[remote]) for role in osds.remotes[remote]: if string.find(role, "osd.") != 0: continue osdid = int(role.split('.')[1]) log.info("process osd.{id} on {remote}".format(id=osdid, remote=remote)) cmd = (prefix + "--op list").format(id=osdid) proc = remote.run(args=cmd.split(), check_status=False, stdout=StringIO()) if proc.exitstatus != 0: log.error( "Bad exit status {ret} from --op list request".format( ret=proc.exitstatus)) ERRORS += 1 else: for pgline in proc.stdout.getvalue().splitlines(): if not pgline: continue (pg, obj) = json.loads(pgline) name = obj['oid'] if name in db: pgswithobjects.add(pg) objsinpg.setdefault(pg, []).append(name) db[name].setdefault("pg2json", {})[pg] = json.dumps(obj) log.info(db) log.info(pgswithobjects) log.info(objsinpg) if pool_dump["type"] == ceph_manager.CephManager.REPLICATED_POOL: # Test get-bytes log.info("Test get-bytes and set-bytes") for basename in db.keys(): file = os.path.join(DATADIR, basename) GETNAME = os.path.join(DATADIR, "get") SETNAME = os.path.join(DATADIR, "set") for remote in osds.remotes.iterkeys(): for role in osds.remotes[remote]: if string.find(role, "osd.") != 0: continue osdid = int(role.split('.')[1]) if osdid not in pgs: continue for pg, JSON in db[basename]["pg2json"].iteritems(): if pg in pgs[osdid]: cmd = ((prefix + "--pgid {pg}").format( id=osdid, pg=pg).split()) cmd.append(run.Raw("'{json}'".format(json=JSON))) cmd += ("get-bytes {fname}".format( fname=GETNAME).split()) proc = remote.run(args=cmd, check_status=False) if proc.exitstatus != 0: remote.run(args="rm -f {getfile}".format( getfile=GETNAME).split()) log.error("Bad 
exit status {ret}".format( ret=proc.exitstatus)) ERRORS += 1 continue cmd = ("diff -q {file} {getfile}".format( file=file, getfile=GETNAME)) proc = remote.run(args=cmd.split()) if proc.exitstatus != 0: log.error("Data from get-bytes differ") # log.debug("Got:") # cat_file(logging.DEBUG, GETNAME) # log.debug("Expected:") # cat_file(logging.DEBUG, file) ERRORS += 1 remote.run(args="rm -f {getfile}".format( getfile=GETNAME).split()) data = ("put-bytes going into {file}\n".format( file=file)) teuthology.write_file(remote, SETNAME, data) cmd = ((prefix + "--pgid {pg}").format( id=osdid, pg=pg).split()) cmd.append(run.Raw("'{json}'".format(json=JSON))) cmd += ("set-bytes {fname}".format( fname=SETNAME).split()) proc = remote.run(args=cmd, check_status=False) proc.wait() if proc.exitstatus != 0: log.info( "set-bytes failed for object {obj} " "in pg {pg} osd.{id} ret={ret}".format( obj=basename, pg=pg, id=osdid, ret=proc.exitstatus)) ERRORS += 1 cmd = ((prefix + "--pgid {pg}").format( id=osdid, pg=pg).split()) cmd.append(run.Raw("'{json}'".format(json=JSON))) cmd += "get-bytes -".split() proc = remote.run(args=cmd, check_status=False, stdout=StringIO()) proc.wait() if proc.exitstatus != 0: log.error("get-bytes after " "set-bytes ret={ret}".format( ret=proc.exitstatus)) ERRORS += 1 else: if data != proc.stdout.getvalue(): log.error("Data inconsistent after " "set-bytes, got:") log.error(proc.stdout.getvalue()) ERRORS += 1 cmd = ((prefix + "--pgid {pg}").format( id=osdid, pg=pg).split()) cmd.append(run.Raw("'{json}'".format(json=JSON))) cmd += ("set-bytes {fname}".format( fname=file).split()) proc = remote.run(args=cmd, check_status=False) proc.wait() if proc.exitstatus != 0: log.info( "set-bytes failed for object {obj} " "in pg {pg} osd.{id} ret={ret}".format( obj=basename, pg=pg, id=osdid, ret=proc.exitstatus)) ERRORS += 1 log.info("Test list-attrs get-attr") for basename in db.keys(): file = os.path.join(DATADIR, basename) GETNAME = os.path.join(DATADIR, "get") SETNAME = os.path.join(DATADIR, "set") for remote in osds.remotes.iterkeys(): for role in osds.remotes[remote]: if string.find(role, "osd.") != 0: continue osdid = int(role.split('.')[1]) if osdid not in pgs: continue for pg, JSON in db[basename]["pg2json"].iteritems(): if pg in pgs[osdid]: cmd = ((prefix + "--pgid {pg}").format(id=osdid, pg=pg).split()) cmd.append(run.Raw("'{json}'".format(json=JSON))) cmd += ["list-attrs"] proc = remote.run(args=cmd, check_status=False, stdout=StringIO(), stderr=StringIO()) proc.wait() if proc.exitstatus != 0: log.error("Bad exit status {ret}".format( ret=proc.exitstatus)) ERRORS += 1 continue keys = proc.stdout.getvalue().split() values = dict(db[basename]["xattr"]) for key in keys: if (key == "_" or key == "snapset" or key == "hinfo_key"): continue key = key.strip("_") if key not in values: log.error( "The key {key} should be present".format( key=key)) ERRORS += 1 continue exp = values.pop(key) cmd = ((prefix + "--pgid {pg}").format( id=osdid, pg=pg).split()) cmd.append(run.Raw("'{json}'".format(json=JSON))) cmd += ("get-attr {key}".format(key="_" + key).split()) proc = remote.run(args=cmd, check_status=False, stdout=StringIO()) proc.wait() if proc.exitstatus != 0: log.error("get-attr failed with {ret}".format( ret=proc.exitstatus)) ERRORS += 1 continue val = proc.stdout.getvalue() if exp != val: log.error("For key {key} got value {got} " "instead of {expected}".format( key=key, got=val, expected=exp)) ERRORS += 1 if "hinfo_key" in keys: cmd_prefix = prefix.format(id=osdid) cmd = """ 
expected=$({prefix} --pgid {pg} '{json}' get-attr {key} | base64) echo placeholder | {prefix} --pgid {pg} '{json}' set-attr {key} - test $({prefix} --pgid {pg} '{json}' get-attr {key}) = placeholder echo $expected | base64 --decode | \ {prefix} --pgid {pg} '{json}' set-attr {key} - test $({prefix} --pgid {pg} '{json}' get-attr {key} | base64) = $expected """.format(prefix=cmd_prefix, pg=pg, json=JSON, key="hinfo_key") log.debug(cmd) proc = remote.run( args=['bash', '-e', '-x', '-c', cmd], check_status=False, stdout=StringIO(), stderr=StringIO()) proc.wait() if proc.exitstatus != 0: log.error("failed with " + str(proc.exitstatus)) log.error(proc.stdout.getvalue() + " " + proc.stderr.getvalue()) ERRORS += 1 if len(values) != 0: log.error("Not all keys found, remaining keys:") log.error(values) log.info("Test pg info") for remote in osds.remotes.iterkeys(): for role in osds.remotes[remote]: if string.find(role, "osd.") != 0: continue osdid = int(role.split('.')[1]) if osdid not in pgs: continue for pg in pgs[osdid]: cmd = ((prefix + "--op info --pgid {pg}").format( id=osdid, pg=pg).split()) proc = remote.run(args=cmd, check_status=False, stdout=StringIO()) proc.wait() if proc.exitstatus != 0: log.error("Failure of --op info command with {ret}".format( proc.exitstatus)) ERRORS += 1 continue info = proc.stdout.getvalue() if not str(pg) in info: log.error("Bad data from info: {info}".format(info=info)) ERRORS += 1 log.info("Test pg logging") for remote in osds.remotes.iterkeys(): for role in osds.remotes[remote]: if string.find(role, "osd.") != 0: continue osdid = int(role.split('.')[1]) if osdid not in pgs: continue for pg in pgs[osdid]: cmd = ((prefix + "--op log --pgid {pg}").format(id=osdid, pg=pg).split()) proc = remote.run(args=cmd, check_status=False, stdout=StringIO()) proc.wait() if proc.exitstatus != 0: log.error("Getting log failed for pg {pg} " "from osd.{id} with {ret}".format( pg=pg, id=osdid, ret=proc.exitstatus)) ERRORS += 1 continue HASOBJ = pg in pgswithobjects MODOBJ = "modify" in proc.stdout.getvalue() if HASOBJ != MODOBJ: log.error("Bad log for pg {pg} from osd.{id}".format( pg=pg, id=osdid)) MSG = (HASOBJ and [""] or ["NOT "])[0] log.error( "Log should {msg}have a modify entry".format(msg=MSG)) ERRORS += 1 log.info("Test pg export") EXP_ERRORS = 0 for remote in osds.remotes.iterkeys(): for role in osds.remotes[remote]: if string.find(role, "osd.") != 0: continue osdid = int(role.split('.')[1]) if osdid not in pgs: continue for pg in pgs[osdid]: fpath = os.path.join(DATADIR, "osd{id}.{pg}".format(id=osdid, pg=pg)) cmd = ((prefix + "--op export --pgid {pg} --file {file}").format( id=osdid, pg=pg, file=fpath)) proc = remote.run(args=cmd, check_status=False, stdout=StringIO()) proc.wait() if proc.exitstatus != 0: log.error("Exporting failed for pg {pg} " "on osd.{id} with {ret}".format( pg=pg, id=osdid, ret=proc.exitstatus)) EXP_ERRORS += 1 ERRORS += EXP_ERRORS log.info("Test pg removal") RM_ERRORS = 0 for remote in osds.remotes.iterkeys(): for role in osds.remotes[remote]: if string.find(role, "osd.") != 0: continue osdid = int(role.split('.')[1]) if osdid not in pgs: continue for pg in pgs[osdid]: cmd = ((prefix + "--op remove --pgid {pg}").format(pg=pg, id=osdid)) proc = remote.run(args=cmd, check_status=False, stdout=StringIO()) proc.wait() if proc.exitstatus != 0: log.error("Removing failed for pg {pg} " "on osd.{id} with {ret}".format( pg=pg, id=osdid, ret=proc.exitstatus)) RM_ERRORS += 1 ERRORS += RM_ERRORS IMP_ERRORS = 0 if EXP_ERRORS == 0 and RM_ERRORS == 0: 
log.info("Test pg import") for remote in osds.remotes.iterkeys(): for role in osds.remotes[remote]: if string.find(role, "osd.") != 0: continue osdid = int(role.split('.')[1]) if osdid not in pgs: continue for pg in pgs[osdid]: fpath = os.path.join( DATADIR, "osd{id}.{pg}".format(id=osdid, pg=pg)) cmd = ((prefix + "--op import --file {file}").format( id=osdid, file=fpath)) proc = remote.run(args=cmd, check_status=False, stdout=StringIO()) proc.wait() if proc.exitstatus != 0: log.error( "Import failed from {file} with {ret}".format( file=fpath, ret=proc.exitstatus)) IMP_ERRORS += 1 else: log.warning("SKIPPING IMPORT TESTS DUE TO PREVIOUS FAILURES") ERRORS += IMP_ERRORS if EXP_ERRORS == 0 and RM_ERRORS == 0 and IMP_ERRORS == 0: log.info("Restarting OSDs....") # They are still look to be up because of setting nodown for osd in manager.get_osd_status()['up']: manager.revive_osd(osd) # Wait for health? time.sleep(5) # Let scrub after test runs verify consistency of all copies log.info("Verify replicated import data") objects = range(1, NUM_OBJECTS + 1) for i in objects: NAME = REP_NAME + "{num}".format(num=i) TESTNAME = os.path.join(DATADIR, "gettest") REFNAME = os.path.join(DATADIR, NAME) proc = rados(ctx, cli_remote, ['-p', REP_POOL, 'get', NAME, TESTNAME], wait=False) ret = proc.wait() if ret != 0: log.error("After import, rados get failed with {ret}".format( ret=proc.exitstatus)) ERRORS += 1 continue cmd = "diff -q {gettest} {ref}".format(gettest=TESTNAME, ref=REFNAME) proc = cli_remote.run(args=cmd, check_status=False) proc.wait() if proc.exitstatus != 0: log.error("Data comparison failed for {obj}".format(obj=NAME)) ERRORS += 1 return ERRORS
def ceph_mons(ctx, config): """ Deploy any additional mons """ cluster_name = config['cluster'] fsid = ctx.ceph[cluster_name].fsid num_mons = 1 try: for remote, roles in ctx.cluster.remotes.items(): for mon in [ r for r in roles if teuthology.is_type('mon', cluster_name)(r) ]: c_, _, id_ = teuthology.split_role(mon) if c_ == cluster_name and id_ == ctx.ceph[ cluster_name].first_mon: continue log.info('Adding %s on %s' % (mon, remote.shortname)) num_mons += 1 _shell(ctx, cluster_name, remote, [ 'ceph', 'orch', 'daemon', 'add', 'mon', remote.shortname + ':' + ctx.ceph[cluster_name].mons[mon] + '=' + id_, ]) ctx.daemons.register_daemon( remote, 'mon', id_, cluster=cluster_name, fsid=fsid, logger=log.getChild(mon), wait=False, started=True, ) with contextutil.safe_while(sleep=1, tries=180) as proceed: while proceed(): log.info('Waiting for %d mons in monmap...' % (num_mons)) r = _shell( ctx=ctx, cluster_name=cluster_name, remote=remote, args=[ 'ceph', 'mon', 'dump', '-f', 'json', ], stdout=StringIO(), ) j = json.loads(r.stdout.getvalue()) if len(j['mons']) == num_mons: break # refresh our (final) ceph.conf file log.info('Generating final ceph.conf file...') r = _shell( ctx=ctx, cluster_name=cluster_name, remote=remote, args=[ 'ceph', 'config', 'generate-minimal-conf', ], stdout=StringIO(), ) ctx.ceph[cluster_name].config_file = r.stdout.getvalue() yield finally: pass
def task(ctx, config): """ pre-validation still pending """ log.info('starting rgw-longrunning') log.info('config %s' % config) if config is None: config = {} assert isinstance(config, dict), \ "task set-repo only supports a dictionary for configuration" config_file_name = config['test'] + ".yaml" log.info('test_version: %s' % config.get('test_version', 'v2')) log.info('test: %s' % config['test']) branch = config.get('branch', 'master') log.info('script: %s' % config.get('script', config['test'] + ".py")) test_root_dir = 'rgw-tests' test_base_path = os.path.join(test_root_dir, 'ceph-qe-scripts') script = os.path.join(test_base_path, DIR[config.get('test_version', 'v2')]['script'], config.get('script', config['test'] + ".py")) config_file = os.path.join(test_base_path, DIR[config.get('test_version', 'v2')]['config'], config_file_name) log.info('script: %s' % script) log.info('config_file: %s' % config_file) soot = ['venv', 'rgw-tests', 'io_info.yaml', '*.json', 'Download.*', 'Download', '*.mpFile', 'x*', 'key.*', 'Mp.*', '*.key.*'] cleanup = lambda x: clients[0].run(args=[run.Raw('sudo rm -rf %s' % x)]) remotes = ctx.cluster.only(teuthology.is_type('client')) clients = [ remote for remote, roles_for_host in remotes.remotes.items()] list(map(cleanup, soot)) clients[0].run(args=['mkdir', test_root_dir]) log.info('cloning the repo to %s' % clients[0].hostname) clients[0].run( args=[ 'cd', '%s' % test_root_dir, run.Raw(';'), 'git', 'clone', 'https://github.com/red-hat-storage/ceph-qe-scripts.git', '-b', '%s' % branch ]) mapped_sizes = do_auto_calculate_io(clients, config) test_config = {'config': config.get('config')} test_config['config']['objects_count'] = len(mapped_sizes) test_config['config']['mapped_sizes'] = mapped_sizes log.info('config: %s' % test_config) log.info('creating configuration from data: %s' % test_config) local_file = os.path.join('/tmp/', config_file_name + "_" + str(os.getpid()) + pwd.getpwuid(os.getuid()).pw_name) with open(local_file, 'w') as outfile: outfile.write(yaml.dump(test_config, default_flow_style=False)) log.info('local_file: %s' % local_file) log.info('copying temp yaml to the client node') clients[0].put_file(local_file, config_file) clients[0].run(args=['ls', '-lt', os.path.join(test_base_path, DIR[config.get('test_version', 'v2')]['config'])]) clients[0].run(args=['cat', config_file]) # os.remove(local_file) clients[0].run(args=['python3', '-m', 'venv', 'venv']) clients[0].run( args=[ 'source', 'venv/bin/activate', run.Raw(';'), run.Raw('pip3 install boto boto3 names PyYaml ConfigParser'), run.Raw(';'), 'deactivate']) time.sleep(60) log.info('trying to restart rgw service after sleep 60 secs') clients[0].run(args=[run.Raw('sudo systemctl restart ceph-radosgw.target')]) log.info('starting the tests after sleep of 60 secs') time.sleep(60) clients[0].run( args=[run.Raw( 'sudo venv/bin/python3 %s -c %s ' % (script, config_file))]) try: yield finally: log.info('Test completed') log.info('Cluster size after test completion') cluster_size = get_cluster_size_info(clients) log.info('available: %s' % cluster_size['AVAIL']) log.info("Deleting leftovers") list(map(cleanup, soot))
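# The task above serializes the generated test config to a uniquely named
# temp file before pushing it to the client with put_file(). A standalone
# sketch of that step, assuming PyYAML is available; the filename scheme
# (name + pid + username) mirrors the one used above.
import os
import pwd
import yaml

def write_test_config(test_config, config_file_name):
    """Dump test_config to /tmp/<name>_<pid><user> and return the path."""
    local_file = os.path.join(
        '/tmp/',
        config_file_name + "_" + str(os.getpid()) +
        pwd.getpwuid(os.getuid()).pw_name)
    with open(local_file, 'w') as outfile:
        outfile.write(yaml.dump(test_config, default_flow_style=False))
    return local_file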
def ceph_mons(ctx, config): """ Deploy any additional mons """ cluster_name = config['cluster'] fsid = ctx.ceph[cluster_name].fsid testdir = teuthology.get_testdir(ctx) num_mons = 1 try: for remote, roles in ctx.cluster.remotes.items(): for mon in [ r for r in roles if teuthology.is_type('mon', cluster_name)(r) ]: c_, _, id_ = teuthology.split_role(mon) if c_ == cluster_name and id_ == ctx.ceph[ cluster_name].first_mon: continue log.info('Adding %s on %s' % (mon, remote.shortname)) num_mons += 1 _shell(ctx, cluster_name, remote, [ 'ceph', 'orchestrator', 'mon', 'update', str(num_mons), remote.shortname + ':' + ctx.ceph[cluster_name].mons[mon] + '=' + id_, ]) ctx.daemons.register_daemon( remote, 'mon', id_, cluster=cluster_name, fsid=fsid, logger=log.getChild(mon), wait=False, started=True, ) with contextutil.safe_while(sleep=1, tries=180) as proceed: while proceed(): log.info('Waiting for %d mons in monmap...' % (num_mons)) r = _shell( ctx=ctx, cluster_name=cluster_name, remote=remote, args=[ 'ceph', 'mon', 'dump', '-f', 'json', ], stdout=StringIO(), ) j = json.loads(r.stdout.getvalue()) if len(j['mons']) == num_mons: break # refresh ceph.conf files for all mons + first mgr """ for remote, roles in ctx.cluster.remotes.items(): for mon in [r for r in roles if teuthology.is_type('mon', cluster_name)(r)]: c_, _, id_ = teuthology.split_role(mon) _shell(ctx, cluster_name, remote, [ 'ceph', 'orchestrator', 'service', 'redeploy', 'mon', id_, ]) _shell(ctx, cluster_name, ctx.ceph[cluster_name].bootstrap_remote, [ 'ceph', 'orchestrator', 'service', 'redeploy', 'mgr', ctx.ceph[cluster_name].first_mgr, ]) """ yield finally: pass
def binaries(ctx, config): """ Fetch the binaries from the gitbuilder, and spawn the download tasks on the remote machines. """ path = config.get('path') if path is None: # fetch Apache Hadoop from gitbuilder log.info( 'Fetching and unpacking Apache Hadoop binaries from gitbuilder...') apache_sha1, apache_hadoop_bindir_url = teuthology.get_ceph_binary_url( package='apache-hadoop', branch=config.get('apache_branch'), tag=config.get('tag'), sha1=config.get('sha1'), flavor=config.get('flavor'), format=config.get('format'), dist=config.get('dist'), arch=config.get('arch'), ) log.info('apache_hadoop_bindir_url %s' % (apache_hadoop_bindir_url)) ctx.summary['apache-hadoop-sha1'] = apache_sha1 # fetch Inktank Hadoop from gitbuilder log.info( 'Fetching and unpacking Inktank Hadoop binaries from gitbuilder...' ) inktank_sha1, inktank_hadoop_bindir_url = \ teuthology.get_ceph_binary_url( package='hadoop', branch=config.get('inktank_branch'), tag=config.get('tag'), sha1=config.get('sha1'), flavor=config.get('flavor'), format=config.get('format'), dist=config.get('dist'), arch=config.get('arch'), ) log.info('inktank_hadoop_bindir_url %s' % (inktank_hadoop_bindir_url)) ctx.summary['inktank-hadoop-sha1'] = inktank_sha1 else: raise Exception( "The hadoop task does not support the path argument at present") with parallel() as parallel_task: hadoop_nodes = ctx.cluster.only(teuthology.is_type('hadoop')) # these can happen independently for remote in hadoop_nodes.remotes.iterkeys(): parallel_task.spawn(_node_binaries, ctx, remote, inktank_hadoop_bindir_url, apache_hadoop_bindir_url) try: yield finally: log.info('Removing hadoop binaries...') run.wait( ctx.cluster.run( args=[ 'rm', '-rf', '--', '{tdir}/apache_hadoop'.format( tdir=teuthology.get_testdir(ctx)) ], wait=False, ), ) run.wait( ctx.cluster.run( args=[ 'rm', '-rf', '--', '{tdir}/inktank_hadoop'.format( tdir=teuthology.get_testdir(ctx)) ], wait=False, ), )
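# binaries() unpacks the two Hadoop trees on every hadoop node concurrently
# via teuthology's parallel() context manager. A standalone sketch of the
# same fan-out shape using only the stdlib (fan_out is illustrative, not
# what the task uses):
from concurrent.futures import ThreadPoolExecutor

def fan_out(fn, items):
    """Run fn(item) concurrently for every item and propagate errors."""
    with ThreadPoolExecutor() as pool:
        for _ in pool.map(fn, items):
            pass  # consume results so worker exceptions are re-raised here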
def _rebuild_db(ctx, manager, cluster_name, mon, mon_id, keyring_path): local_mstore = tempfile.mkdtemp() # collect the maps from all OSDs is_osd = teuthology.is_type('osd') osds = ctx.cluster.only(is_osd) assert osds for osd, roles in osds.remotes.items(): for role in roles: if not is_osd(role): continue cluster, _, osd_id = teuthology.split_role(role) assert cluster_name == cluster log.info('collecting maps from {cluster}:osd.{osd}'.format( cluster=cluster, osd=osd_id)) # push leveldb to OSD osd_mstore = os.path.join(teuthology.get_testdir(ctx), 'mon-store') osd.run(args=['sudo', 'mkdir', '-m', 'o+x', '-p', osd_mstore]) _push_directory(local_mstore, osd, osd_mstore) log.info('rm -rf {0}'.format(local_mstore)) shutil.rmtree(local_mstore) # update leveldb with OSD data options = '--no-mon-config --op update-mon-db --mon-store-path {0}' log.info('cot {0}'.format(osd_mstore)) manager.objectstore_tool(pool=None, options=options.format(osd_mstore), args='', osd=osd_id, do_revive=False) # pull the updated mon db log.info('pull dir {0} -> {1}'.format(osd_mstore, local_mstore)) local_mstore = tempfile.mkdtemp() teuthology.pull_directory(osd, osd_mstore, local_mstore) log.info('rm -rf osd:{0}'.format(osd_mstore)) osd.run(args=['sudo', 'rm', '-fr', osd_mstore]) # recover the first_mon with re-built mon db # pull from recovered leveldb from client mon_store_dir = os.path.join('/var/lib/ceph/mon', '{0}-{1}'.format(cluster_name, mon_id)) _push_directory(local_mstore, mon, mon_store_dir) mon.run(args=['sudo', 'chown', '-R', 'ceph:ceph', mon_store_dir]) shutil.rmtree(local_mstore) # fill up the caps in the keyring file mon.run(args=['sudo', 'ceph-authtool', keyring_path, '-n', 'mon.', '--cap', 'mon', 'allow *']) mon.run(args=['sudo', 'ceph-authtool', keyring_path, '-n', 'client.admin', '--cap', 'mon', 'allow *', '--cap', 'osd', 'allow *', '--cap', 'mds', 'allow *', '--cap', 'mgr', 'allow *']) mon.run(args=['sudo', '-u', 'ceph', 'CEPH_ARGS=--no-mon-config', 'ceph-monstore-tool', mon_store_dir, 'rebuild', '--', '--keyring', keyring_path, '--monmap', '/tmp/monmap', ])
def task(ctx, config): """ Run Hadoop S3A tests using Ceph usage: -tasks: ceph-ansible: s3a-hadoop: maven-version: '3.3.9' (default) hadoop-version: '2.7.3' bucket-name: 's3atest' (default) access-key: 'anykey' (uses a default value) secret-key: 'secretkey' ( uses a default value) """ if config is None: config = {} assert isinstance(config, dict), \ "task only supports a dictionary for configuration" overrides = ctx.config.get('overrides', {}) misc.deep_merge(config, overrides.get('s3a-hadoop', {})) testdir = misc.get_testdir(ctx) rgws = ctx.cluster.only(misc.is_type('rgw')) # use the first rgw node to test s3a rgw_node = rgws.remotes.keys()[0] # get versions maven_major = config.get('maven-major', 'maven-3') maven_version = config.get('maven-version', '3.3.9') hadoop_ver = config.get('hadoop-version', '2.7.3') bucket_name = config.get('bucket-name', 's3atest') access_key = config.get('access-key', 'EGAQRD2ULOIFKFSKCT4F') secret_key = config.get('secret-key', 'zi816w1vZKfaSM85Cl0BxXTwSLyN7zB4RbTswrGb') # set versions for cloning the repo apache_maven = 'apache-maven-{maven_version}-bin.tar.gz'.format( maven_version=maven_version) maven_link = 'http://mirror.jax.hugeserver.com/apache/maven/' + \ '{maven_major}/{maven_version}/binaries/'.format(maven_major=maven_major, maven_version=maven_version) + apache_maven hadoop_git = 'https://github.com/apache/hadoop' hadoop_rel = 'hadoop-{ver} rel/release-{ver}'.format(ver=hadoop_ver) install_prereq(rgw_node) rgw_node.run(args=[ 'cd', testdir, run.Raw('&&'), 'wget', maven_link, run.Raw('&&'), 'tar', '-xvf', apache_maven, run.Raw('&&'), 'git', 'clone', run.Raw(hadoop_git), run.Raw('&&'), 'cd', 'hadoop', run.Raw('&&'), 'git', 'checkout', '-b', run.Raw(hadoop_rel) ]) dnsmasq_name = 's3.ceph.com' configure_s3a(rgw_node, dnsmasq_name, access_key, secret_key, bucket_name, testdir) setup_dnsmasq(rgw_node, dnsmasq_name) fix_rgw_config(rgw_node, dnsmasq_name) setup_user_bucket(rgw_node, dnsmasq_name, access_key, secret_key, bucket_name, testdir) if hadoop_ver.startswith('2.8'): test_options = '-Dit.test=ITestS3A* -Dparallel-tests -Dscale -Dfs.s3a.scale.test.huge.filesize=128M verify' else: test_options = 'test -Dtest=S3a*,TestS3A*' try: run_s3atest(rgw_node, maven_version, testdir, test_options) yield finally: log.info("Done s3a testing, Cleaning up") for fil in ['apache*', 'hadoop*', 'venv*', 'create*']: rgw_node.run(args=[ 'rm', run.Raw('-rf'), run.Raw('{tdir}/{file}'.format(tdir=testdir, file=fil)) ]) # restart and let NM restore original config rgw_node.run(args=['sudo', 'systemctl', 'stop', 'dnsmasq']) rgw_node.run(args=['sudo', 'systemctl', 'restart', 'network.service'], check_status=False) rgw_node.run(args=['sudo', 'systemctl', 'status', 'network.service'], check_status=False)
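# task() picks the maven test options by Hadoop version: 2.8+ runs the
# ITestS3A* integration suite with parallel scale tests, older releases run
# the plain S3a unit tests. A standalone sketch of that selection
# (s3a_test_options is a hypothetical helper mirroring the branch above):
def s3a_test_options(hadoop_ver):
    """Return the maven goals/options used for the given Hadoop version."""
    if hadoop_ver.startswith('2.8'):
        return ('-Dit.test=ITestS3A* -Dparallel-tests -Dscale '
                '-Dfs.s3a.scale.test.huge.filesize=128M verify')
    return 'test -Dtest=S3a*,TestS3A*'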
def task(ctx, config):
    if config is None:
        config = {}

    assert isinstance(config, dict), \
        "task only supports a dictionary for configuration"

    overrides = ctx.config.get('overrides', {})
    teuthology.deep_merge(config, overrides.get('ceph', {}))
    log.info('Config: ' + str(config))

    testdir = teuthology.get_testdir(ctx)

    # set up cluster context
    first_ceph_cluster = False
    if not hasattr(ctx, 'daemons'):
        first_ceph_cluster = True
    if not hasattr(ctx, 'ceph'):
        ctx.ceph = {}
        ctx.managers = {}
    if 'cluster' not in config:
        config['cluster'] = 'ceph'
    cluster_name = config['cluster']
    ctx.ceph[cluster_name] = argparse.Namespace()

    ctx.ceph[cluster_name].thrashers = []
    # fixme: setup watchdog, ala ceph.py
    ctx.ceph[cluster_name].roleless = False  # see below

    # cephadm mode?
    if 'cephadm_mode' not in config:
        config['cephadm_mode'] = 'root'
    assert config['cephadm_mode'] in ['root', 'cephadm-package']
    if config['cephadm_mode'] == 'root':
        ctx.cephadm = testdir + '/cephadm'
    else:
        ctx.cephadm = 'cephadm'  # in the path

    if first_ceph_cluster:
        # FIXME: this is global for all clusters
        ctx.daemons = DaemonGroup(use_cephadm=ctx.cephadm)

    # image
    ctx.ceph[cluster_name].image = config.get('image')
    ref = None
    if not ctx.ceph[cluster_name].image:
        sha1 = config.get('sha1')
        if sha1:
            ctx.ceph[cluster_name].image = 'quay.io/ceph-ci/ceph:%s' % sha1
            ref = sha1
        else:
            # hmm, fall back to branch?
            branch = config.get('branch', 'master')
            ref = branch
            ctx.ceph[cluster_name].image = 'quay.io/ceph-ci/ceph:%s' % branch
    log.info('Cluster image is %s' % ctx.ceph[cluster_name].image)

    # uuid
    fsid = str(uuid.uuid1())
    log.info('Cluster fsid is %s' % fsid)
    ctx.ceph[cluster_name].fsid = fsid

    # mon ips
    log.info('Choosing monitor IPs and ports...')
    remotes_and_roles = ctx.cluster.remotes.items()
    roles = [role_list for (remote, role_list) in remotes_and_roles]
    ips = [host for (host, port) in
           (remote.ssh.get_transport().getpeername()
            for (remote, role_list) in remotes_and_roles)]

    if config.get('roleless', False):
        # mons will be named after hosts
        roles = []
        first_mon = None
        for remote, _ in remotes_and_roles:
            roles.append(['mon.' + remote.shortname])
            if not first_mon:
                first_mon = remote.shortname
                bootstrap_remote = remote
        log.info('No roles; fabricating mons %s' % roles)

    ctx.ceph[cluster_name].mons = get_mons(
        roles, ips, cluster_name,
        mon_bind_msgr2=config.get('mon_bind_msgr2', True),
        mon_bind_addrvec=config.get('mon_bind_addrvec', True),
    )
    log.info('Monitor IPs: %s' % ctx.ceph[cluster_name].mons)

    if config.get('roleless', False):
        ctx.ceph[cluster_name].roleless = True
        ctx.ceph[cluster_name].bootstrap_remote = bootstrap_remote
        ctx.ceph[cluster_name].first_mon = first_mon
        ctx.ceph[cluster_name].first_mon_role = 'mon.' + first_mon
    else:
        first_mon_role = sorted(ctx.ceph[cluster_name].mons.keys())[0]
        _, _, first_mon = teuthology.split_role(first_mon_role)
        (bootstrap_remote,) = ctx.cluster.only(first_mon_role).remotes.keys()
        log.info('First mon is mon.%s on %s' % (first_mon,
                                                bootstrap_remote.shortname))
        ctx.ceph[cluster_name].bootstrap_remote = bootstrap_remote
        ctx.ceph[cluster_name].first_mon = first_mon
        ctx.ceph[cluster_name].first_mon_role = first_mon_role

        others = ctx.cluster.remotes[bootstrap_remote]
        mgrs = sorted([r for r in others
                       if teuthology.is_type('mgr', cluster_name)(r)])
        if not mgrs:
            raise RuntimeError('no mgrs on the same host as first mon %s'
                               % first_mon)
        _, _, first_mgr = teuthology.split_role(mgrs[0])
        log.info('First mgr is %s' % (first_mgr))
        ctx.ceph[cluster_name].first_mgr = first_mgr

    with contextutil.nested(
            lambda: ceph_initial(),
            lambda: normalize_hostnames(ctx=ctx),
            lambda: download_cephadm(ctx=ctx, config=config, ref=ref),
            lambda: ceph_log(ctx=ctx, config=config),
            lambda: ceph_crash(ctx=ctx, config=config),
            lambda: ceph_bootstrap(ctx=ctx, config=config),
            lambda: crush_setup(ctx=ctx, config=config),
            lambda: ceph_mons(ctx=ctx, config=config),
            lambda: distribute_config_and_admin_keyring(ctx=ctx,
                                                        config=config),
            lambda: ceph_mgrs(ctx=ctx, config=config),
            lambda: ceph_osds(ctx=ctx, config=config),
            lambda: ceph_mdss(ctx=ctx, config=config),
            lambda: ceph_rgw(ctx=ctx, config=config),
            lambda: ceph_monitoring('prometheus', ctx=ctx, config=config),
            lambda: ceph_monitoring('node-exporter', ctx=ctx, config=config),
            lambda: ceph_monitoring('alertmanager', ctx=ctx, config=config),
            lambda: ceph_monitoring('grafana', ctx=ctx, config=config),
            lambda: ceph_clients(ctx=ctx, config=config),
    ):
        ctx.managers[cluster_name] = CephManager(
            ctx.ceph[cluster_name].bootstrap_remote,
            ctx=ctx,
            logger=log.getChild('ceph_manager.' + cluster_name),
            cluster=cluster_name,
            cephadm=True,
        )
        try:
            if config.get('wait-for-healthy', True):
                healthy(ctx=ctx, config=config)
            log.info('Setup complete, yielding')
            yield
        finally:
            log.info('Teardown begin')
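get_mons() is expected to return a mapping from mon role to address;
ceph_bootstrap below branches on whether the value is a plain ip:port or a
bracketed addrvec. A sketch of the assumed shape (addresses are illustrative):

    # Assumed shape of ctx.ceph[cluster_name].mons; values are either a
    # plain v1 address or an addrvec string (hence the startswith('[')
    # check in ceph_bootstrap).
    mons = {
        'mon.a': '10.0.0.1:6789',
        'mon.b': '[v2:10.0.0.2:3300,v1:10.0.0.2:6789]',
    }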
def run_haproxy(self):
    """
    task:
        ceph-ansible:
            haproxy: true
            haproxy_repo: https://github.com/smanjara/ansible-haproxy.git
            haproxy_branch: master
    """
    # Clone haproxy from https://github.com/smanjara/ansible-haproxy/,
    # use inven.yml from the ceph-ansible dir to read the haproxy node from.
    # Assumes haproxy roles such as haproxy.0, haproxy.1 and so on.
    installer_node = self.ceph_installer
    haproxy_ansible_repo = self.config['haproxy_repo']
    branch = 'master'
    if self.config.get('haproxy_branch'):
        branch = self.config.get('haproxy_branch')
    installer_node.run(
        args=[
            'cd',
            run.Raw('~/'),
            run.Raw(';'),
            'git',
            'clone',
            run.Raw('-b %s' % branch),
            run.Raw(haproxy_ansible_repo),
        ],
        timeout=4200,
        stdout=StringIO()
    )
    allhosts = self.each_cluster.only(misc.is_type('rgw')).remotes.keys()
    clients = list(set(allhosts))
    ips = []
    for each_client in clients:
        ips.append(socket.gethostbyname(each_client.hostname))

    # substitute {{ ip_varN }} placeholders in haproxy.yml with rgw node ips
    ip_vars = {}
    for i in range(len(ips)):
        ip_vars['ip_var' + str(i)] = ips.pop()

    # run haproxy playbook
    args = [
        'ANSIBLE_STDOUT_CALLBACK=debug',
        'ansible-playbook', '-vv',
        'haproxy.yml',
        '-e', "'%s'" % json.dumps(ip_vars),
        '-i', '~/ceph-ansible/inven.yml'
    ]
    log.debug("Running %s", args)
    str_args = ' '.join(args)
    installer_node.run(
        args=[
            run.Raw('cd ~/ansible-haproxy'),
            run.Raw(';'),
            run.Raw(str_args)
        ]
    )

    # run keepalived playbook
    args = [
        'ANSIBLE_STDOUT_CALLBACK=debug',
        'ansible-playbook', '-vv',
        'keepalived.yml',
        '-e', "'%s'" % json.dumps(ip_vars),
        '-i', '~/ceph-ansible/inven.yml'
    ]
    log.debug("Running %s", args)
    str_args = ' '.join(args)
    installer_node.run(
        args=[
            run.Raw('cd ~/ansible-haproxy'),
            run.Raw(';'),
            run.Raw(str_args)
        ]
    )
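Because ip_vars is filled with ips.pop(), the mapping comes out reversed
relative to the resolved host order. A self-contained demonstration with
illustrative addresses:

    import json

    ips = ['10.0.0.11', '10.0.0.12']  # assumed resolved rgw addresses
    ip_vars = {}
    for i in range(len(ips)):
        ip_vars['ip_var' + str(i)] = ips.pop()
    print(json.dumps(ip_vars))
    # -> {"ip_var0": "10.0.0.12", "ip_var1": "10.0.0.11"}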
def initialize_config(ctx, config):
    cluster_name = config['cluster']
    testdir = teuthology.get_testdir(ctx)

    ctx.ceph[cluster_name].thrashers = []
    # fixme: setup watchdog, ala ceph.py
    ctx.ceph[cluster_name].roleless = False  # see below

    first_ceph_cluster = False
    if not hasattr(ctx, 'daemons'):
        first_ceph_cluster = True

    # cephadm mode?
    if 'cephadm_mode' not in config:
        config['cephadm_mode'] = 'root'
    assert config['cephadm_mode'] in ['root', 'cephadm-package']
    if config['cephadm_mode'] == 'root':
        ctx.cephadm = testdir + '/cephadm'
    else:
        ctx.cephadm = 'cephadm'  # in the path

    if first_ceph_cluster:
        # FIXME: this is global for all clusters
        ctx.daemons = DaemonGroup(use_cephadm=ctx.cephadm)

    # uuid
    fsid = str(uuid.uuid1())
    log.info('Cluster fsid is %s' % fsid)
    ctx.ceph[cluster_name].fsid = fsid

    # mon ips
    log.info('Choosing monitor IPs and ports...')
    remotes_and_roles = ctx.cluster.remotes.items()
    ips = [host for (host, port) in
           (remote.ssh.get_transport().getpeername()
            for (remote, role_list) in remotes_and_roles)]

    if config.get('roleless', False):
        # mons will be named after hosts
        first_mon = None
        for remote, _ in remotes_and_roles:
            ctx.cluster.remotes[remote].append('mon.' + remote.shortname)
            if not first_mon:
                first_mon = remote.shortname
                bootstrap_remote = remote
        log.info('No mon roles; fabricating mons')

    roles = [role_list for (remote, role_list)
             in ctx.cluster.remotes.items()]

    ctx.ceph[cluster_name].mons = get_mons(
        roles, ips, cluster_name,
        mon_bind_msgr2=config.get('mon_bind_msgr2', True),
        mon_bind_addrvec=config.get('mon_bind_addrvec', True),
    )
    log.info('Monitor IPs: %s' % ctx.ceph[cluster_name].mons)

    if config.get('roleless', False):
        ctx.ceph[cluster_name].roleless = True
        ctx.ceph[cluster_name].bootstrap_remote = bootstrap_remote
        ctx.ceph[cluster_name].first_mon = first_mon
        ctx.ceph[cluster_name].first_mon_role = 'mon.' + first_mon
    else:
        first_mon_role = sorted(ctx.ceph[cluster_name].mons.keys())[0]
        _, _, first_mon = teuthology.split_role(first_mon_role)
        (bootstrap_remote,) = ctx.cluster.only(first_mon_role).remotes.keys()
        log.info('First mon is mon.%s on %s' % (first_mon,
                                                bootstrap_remote.shortname))
        ctx.ceph[cluster_name].bootstrap_remote = bootstrap_remote
        ctx.ceph[cluster_name].first_mon = first_mon
        ctx.ceph[cluster_name].first_mon_role = first_mon_role

        others = ctx.cluster.remotes[bootstrap_remote]
        mgrs = sorted([r for r in others
                       if teuthology.is_type('mgr', cluster_name)(r)])
        if not mgrs:
            raise RuntimeError('no mgrs on the same host as first mon %s'
                               % first_mon)
        _, _, first_mgr = teuthology.split_role(mgrs[0])
        log.info('First mgr is %s' % (first_mgr))
        ctx.ceph[cluster_name].first_mgr = first_mgr

    yield
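Unlike the task variant above, the roleless branch here mutates the remotes'
role lists in place rather than building a fresh roles list. A tiny sketch of
the effect, with assumed shortnames:

    # Illustrative effect of the roleless branch: each host gains a
    # 'mon.<shortname>' role; the first host becomes the bootstrap node.
    remotes = {'smithi042': ['osd.0'], 'smithi043': ['osd.1']}  # assumed
    for shortname in remotes:
        remotes[shortname].append('mon.' + shortname)
    # smithi042 now carries ['osd.0', 'mon.smithi042'], and so on.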
def ceph_bootstrap(ctx, config):
    cluster_name = config['cluster']
    testdir = teuthology.get_testdir(ctx)
    fsid = ctx.ceph[cluster_name].fsid

    mons = ctx.ceph[cluster_name].mons
    first_mon_role = sorted(mons.keys())[0]
    _, _, first_mon = teuthology.split_role(first_mon_role)
    (bootstrap_remote,) = ctx.cluster.only(first_mon_role).remotes.keys()
    log.info('First mon is mon.%s on %s' % (first_mon,
                                            bootstrap_remote.shortname))
    ctx.ceph[cluster_name].bootstrap_remote = bootstrap_remote
    ctx.ceph[cluster_name].first_mon = first_mon

    others = ctx.cluster.remotes[bootstrap_remote]
    log.info('others %s' % others)
    mgrs = sorted([r for r in others
                   if teuthology.is_type('mgr', cluster_name)(r)])
    if not mgrs:
        raise RuntimeError('no mgrs on the same host as first mon %s'
                           % first_mon)
    _, _, first_mgr = teuthology.split_role(mgrs[0])
    log.info('First mgr is %s' % (first_mgr))
    ctx.ceph[cluster_name].first_mgr = first_mgr

    try:
        # write seed config
        log.info('Writing seed config...')
        conf_fp = StringIO()
        seed_config = build_initial_config(ctx, config)
        seed_config.write(conf_fp)
        teuthology.write_file(
            remote=bootstrap_remote,
            path='{}/seed.{}.conf'.format(testdir, cluster_name),
            data=conf_fp.getvalue())
        log.debug('Final config:\n' + conf_fp.getvalue())

        # bootstrap
        log.info('Bootstrapping...')
        cmd = [
            'sudo',
            ctx.cephadm,
            '--image', ctx.ceph[cluster_name].image,
            'bootstrap',
            '--fsid', fsid,
            '--mon-id', first_mon,
            '--mgr-id', first_mgr,
            '--config', '{}/seed.{}.conf'.format(testdir, cluster_name),
            '--output-config', '{}/{}.conf'.format(testdir, cluster_name),
            '--output-keyring', '{}/{}.keyring'.format(testdir, cluster_name),
            '--output-pub-ssh-key', '{}/{}.pub'.format(testdir, cluster_name),
        ]
        if mons[first_mon_role].startswith('['):
            cmd += ['--mon-addrv', mons[first_mon_role]]
        else:
            cmd += ['--mon-ip', mons[first_mon_role]]
        if config.get('skip_dashboard'):
            cmd += ['--skip-dashboard']
        # bootstrap makes the keyring root 0600, so +r it for our purposes
        cmd += [
            run.Raw('&&'),
            'sudo', 'chmod', '+r',
            '{}/{}.keyring'.format(testdir, cluster_name),
        ]
        bootstrap_remote.run(args=cmd)

        # register initial daemons
        ctx.daemons.register_daemon(
            bootstrap_remote, 'mon', first_mon,
            cluster=cluster_name,
            fsid=fsid,
            logger=log.getChild('mon.' + first_mon),
            wait=False,
            started=True,
        )
        ctx.daemons.register_daemon(
            bootstrap_remote, 'mgr', first_mgr,
            cluster=cluster_name,
            fsid=fsid,
            logger=log.getChild('mgr.' + first_mgr),
            wait=False,
            started=True,
        )

        # fetch keys and configs
        log.info('Fetching config...')
        ctx.ceph[cluster_name].config_file = teuthology.get_file(
            remote=bootstrap_remote,
            path='{}/{}.conf'.format(testdir, cluster_name))
        log.info('Fetching client.admin keyring...')
        ctx.ceph[cluster_name].admin_keyring = teuthology.get_file(
            remote=bootstrap_remote,
            path='{}/{}.keyring'.format(testdir, cluster_name))
        log.info('Fetching mon keyring...')
        ctx.ceph[cluster_name].mon_keyring = teuthology.get_file(
            remote=bootstrap_remote,
            path='/var/lib/ceph/%s/mon.%s/keyring' % (fsid, first_mon),
            sudo=True)

        # fetch ssh key, distribute to additional nodes
        log.info('Fetching pub ssh key...')
        ssh_pub_key = teuthology.get_file(
            remote=bootstrap_remote,
            path='{}/{}.pub'.format(testdir, cluster_name)).strip()

        log.info('Installing pub ssh key for root users...')
        ctx.cluster.run(args=[
            'sudo', 'install', '-d', '-m', '0700', '/root/.ssh',
            run.Raw('&&'),
            'echo', ssh_pub_key,
            run.Raw('|'),
            'sudo', 'tee', '-a', '/root/.ssh/authorized_keys',
            run.Raw('&&'),
            'sudo', 'chmod', '0600', '/root/.ssh/authorized_keys',
        ])

        # add other hosts
        for remote in ctx.cluster.remotes.keys():
            if remote == bootstrap_remote:
                continue
            log.info('Writing conf and keyring to %s' % remote.shortname)
            teuthology.write_file(
                remote=remote,
                path='{}/{}.conf'.format(testdir, cluster_name),
                data=ctx.ceph[cluster_name].config_file)
            teuthology.write_file(
                remote=remote,
                path='{}/{}.keyring'.format(testdir, cluster_name),
                data=ctx.ceph[cluster_name].admin_keyring)

            log.info('Adding host %s to orchestrator...' % remote.shortname)
            _shell(ctx, cluster_name, remote, [
                'ceph', 'orchestrator', 'host', 'add',
                remote.shortname
            ])

        yield

    finally:
        log.info('Cleaning up testdir ceph.* files...')
        ctx.cluster.run(args=[
            'rm', '-f',
            '{}/seed.{}.conf'.format(testdir, cluster_name),
            '{}/{}.pub'.format(testdir, cluster_name),
            '{}/{}.conf'.format(testdir, cluster_name),
            '{}/{}.keyring'.format(testdir, cluster_name),
        ])

        log.info('Stopping all daemons...')

        # this doesn't block until they are all stopped...
        #ctx.cluster.run(args=['sudo', 'systemctl', 'stop', 'ceph.target'])

        # so, stop them individually
        for role in ctx.daemons.resolve_role_list(None, CEPH_ROLE_TYPES):
            cluster, type_, id_ = teuthology.split_role(role)
            ctx.daemons.get_daemon(type_, id_, cluster).stop()
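For illustration, the fully assembled bootstrap command, shown with assumed
testdir, daemon ids, image tag, and monitor IP (none of these literals come
from this task):

    # Illustrative expansion of cmd for cluster 'ceph', --mon-ip form:
    cmd = [
        'sudo', '/home/ubuntu/cephtest/cephadm',
        '--image', 'quay.io/ceph-ci/ceph:master',
        'bootstrap',
        '--fsid', '00000000-0000-0000-0000-000000000000',
        '--mon-id', 'a',
        '--mgr-id', 'x',
        '--config', '/home/ubuntu/cephtest/seed.ceph.conf',
        '--output-config', '/home/ubuntu/cephtest/ceph.conf',
        '--output-keyring', '/home/ubuntu/cephtest/ceph.keyring',
        '--output-pub-ssh-key', '/home/ubuntu/cephtest/ceph.pub',
        '--mon-ip', '10.0.0.1',
    ]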
def task(ctx, config):
    """
    "Thrash" the OSDs by randomly marking them out/down (and then back
    in) until the task is ended. This loops, and every op_delay seconds
    it randomly chooses to add or remove an OSD (even odds) unless there
    are fewer than min_out OSDs out of the cluster, or more than min_in
    OSDs in the cluster.

    All commands are run on mon0 and it stops when __exit__ is called.

    The config is optional, and is a dict containing some or all of:

    min_in: (default 3) the minimum number of OSDs to keep in the
       cluster

    min_out: (default 0) the minimum number of OSDs to keep out of the
       cluster

    op_delay: (5) the length of time to sleep between changing an
       OSD's status

    min_dead: (0) minimum number of osds to leave down/dead.

    max_dead: (0) maximum number of osds to leave down/dead before waiting
       for clean.  This should probably be num_replicas - 1.

    clean_interval: (60) the approximate length of time to loop before
       waiting until the cluster goes clean. (In reality this is used
       to probabilistically choose when to wait, and the method used
       makes it closer to -- but not identical to -- the half-life.)

    scrub_interval: (-1) the approximate length of time to loop before
       waiting until a scrub is performed while cleaning. (In reality
       this is used to probabilistically choose when to wait, and it
       only applies to the cases where cleaning is being performed).
       -1 is used to indicate that no scrubbing will be done.

    chance_down: (0.4) the probability that the thrasher will mark an
       OSD down rather than marking it out. (The thrasher will not
       consider that OSD out of the cluster, since presently an OSD
       wrongly marked down will mark itself back up again.) This value
       can be either an integer (eg, 75) or a float probability (eg 0.75).

    chance_test_min_size: (0) chance to run test_pool_min_size, which:
       - kills all but one osd
       - waits
       - kills that osd
       - revives all other osds
       - verifies that the osds fully recover

    timeout: (360) the number of seconds to wait for the cluster
       to become clean after each cluster change. If this doesn't
       happen within the timeout, an exception will be raised.

    revive_timeout: (150) number of seconds to wait for an osd asok to
       appear after attempting to revive the osd

    thrash_primary_affinity: (true) randomly adjust primary-affinity

    chance_pgnum_grow: (0) chance to increase a pool's size
    chance_pgpnum_fix: (0) chance to adjust pgpnum to pg for a pool
    pool_grow_by: (10) amount to increase pgnum by
    max_pgs_per_pool_osd: (1200) don't expand pools past this size per osd

    pause_short: (3) duration of short pause
    pause_long: (80) duration of long pause
    pause_check_after: (50) assert osd down after this long
    chance_inject_pause_short: (1) chance of injecting short stall
    chance_inject_pause_long: (0) chance of injecting long stall

    clean_wait: (0) duration to wait before resuming thrashing once clean

    sighup_delay: (0.1) duration to delay between sending signal.SIGHUP to a
       random live osd

    powercycle: (false) whether to power cycle the node instead
       of just the osd process. Note that this assumes that a single
       osd is the only important process on the node.

    chance_test_backfill_full: (0) chance to simulate full disks stopping
       backfill

    chance_test_map_discontinuity: (0) chance to test map discontinuity
    map_discontinuity_sleep_time: (40) time to wait for map trims

    ceph_objectstore_tool: (true) whether to export/import a pg while an osd
       is down
    chance_move_pg: (1.0) chance of moving a pg if more than 1 osd is down
       (default 100%)

    example:

    tasks:
    - ceph:
    - thrashosds:
        chance_down: 10
        op_delay: 3
        min_in: 1
        timeout: 600
    - interactive:
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'thrashosds task only accepts a dict for configuration'
    # add default value for sighup_delay
    config['sighup_delay'] = config.get('sighup_delay', 0.1)
    overrides = ctx.config.get('overrides', {})
    teuthology.deep_merge(config, overrides.get('thrashosds', {}))

    if 'powercycle' in config:

        # sync everyone first to avoid collateral damage to / etc.
        log.info('Doing preliminary sync to avoid collateral damage...')
        ctx.cluster.run(args=['sync'])

        if 'ipmi_user' in ctx.teuthology_config:
            for t, key in ctx.config['targets'].iteritems():
                host = t.split('@')[-1]
                shortname = host.split('.')[0]
                from teuthology.orchestra import remote as oremote
                console = oremote.getRemoteConsole(
                    name=host,
                    ipmiuser=ctx.teuthology_config['ipmi_user'],
                    ipmipass=ctx.teuthology_config['ipmi_password'],
                    ipmidomain=ctx.teuthology_config['ipmi_domain'])
                cname = '{host}.{domain}'.format(
                    host=shortname,
                    domain=ctx.teuthology_config['ipmi_domain'])
                log.debug('checking console status of %s' % cname)
                if not console.check_status():
                    log.info('Failed to get console status for '
                             '%s, disabling console...' % cname)
                    console = None
                else:
                    # find the remote for this console and add it
                    remotes = [
                        r for r in ctx.cluster.remotes.keys()
                        if r.name == t]
                    if len(remotes) != 1:
                        raise Exception('Too many (or too few) remotes '
                                        'found for target {t}'.format(t=t))
                    remotes[0].console = console
                    log.debug('console ready on %s' % cname)

            # check that all osd remotes have a valid console
            osds = ctx.cluster.only(teuthology.is_type('osd'))
            for remote, _ in osds.remotes.iteritems():
                if not remote.console:
                    raise Exception(
                        'IPMI console required for powercycling, '
                        'but not available on osd role: {r}'.format(
                            r=remote.name))

    log.info('Beginning thrashosds...')
    thrash_proc = ceph_manager.Thrasher(
        ctx.manager,
        config,
        logger=log.getChild('thrasher'))
    try:
        yield
    finally:
        log.info('joining thrashosds')
        thrash_proc.do_join()
        ctx.manager.wait_for_recovery(config.get('timeout', 360))