def archive(ctx, config):
    log.info('Creating archive directory...')
    run.wait(
        ctx.cluster.run(
            args=[
                'install', '-d', '-m0755', '--',
                '/tmp/cephtest/archive',
            ],
            wait=False,
        )
    )

    try:
        yield
    finally:
        if ctx.archive is not None:
            log.info('Transferring archived files...')
            logdir = os.path.join(ctx.archive, 'remote')
            os.mkdir(logdir)
            for remote in ctx.cluster.remotes.iterkeys():
                path = os.path.join(logdir, remote.shortname)
                teuthology.pull_directory(remote, '/tmp/cephtest/archive', path)

        log.info('Removing archive directory...')
        run.wait(
            ctx.cluster.run(
                args=[
                    'rm', '-rf', '--',
                    '/tmp/cephtest/archive',
                ],
                wait=False,
            ),
        )
def collect_logs(self):
    ctx = self.ctx
    if ctx.archive is not None and \
            not (ctx.config.get('archive-on-error') and ctx.summary['success']):
        log.info('Archiving logs...')
        path = os.path.join(
            ctx.archive,
            self.cluster_name if self.cluster_name else 'ceph',
            'remote')
        try:
            os.makedirs(path)
        except OSError as e:
            # tolerate an already-existing directory; re-raise anything else
            if e.errno not in (errno.EISDIR, errno.EEXIST):
                raise

        def wanted(role):
            # Only attempt to collect logs from hosts which are part of the
            # cluster
            return any(
                map(
                    lambda role_stub: role.startswith(role_stub),
                    list(self.groups_to_roles.values()),
                ))

        for remote in list(self.each_cluster.only(wanted).remotes.keys()):
            sub = os.path.join(path, remote.shortname)
            os.makedirs(sub)
            misc.pull_directory(remote, '/var/log/ceph',
                                os.path.join(sub, 'log'))
            if ctx.config.get('coverage', False):
                cover_dir = os.path.join(sub, "coverage")
                os.makedirs(cover_dir)
                misc.pull_directory(remote, '/builddir', cover_dir)
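# Illustrative sketch (not part of the task above): on Python 3 the errno
# check in collect_logs() can be expressed directly with exist_ok, which
# tolerates an already-existing directory but still raises on real errors
# (permissions, a regular file in the way, ...).  The path is hypothetical.
import os

os.makedirs('/tmp/archive/remote/host0', exist_ok=True)
os.makedirs('/tmp/archive/remote/host0', exist_ok=True)  # second call is a no-op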
def archive(ctx, config): """ Handle the creation and deletion of the archive directory. """ log.info("Creating archive directory...") archive_dir = misc.get_archive_dir(ctx) run.wait(ctx.cluster.run(args=["install", "-d", "-m0755", "--", archive_dir], wait=False)) try: yield except Exception: # we need to know this below set_status(ctx.summary, "fail") raise finally: passed = get_status(ctx.summary) == "pass" if ctx.archive is not None and not (ctx.config.get("archive-on-error") and passed): log.info("Transferring archived files...") logdir = os.path.join(ctx.archive, "remote") if not os.path.exists(logdir): os.mkdir(logdir) for rem in ctx.cluster.remotes.iterkeys(): path = os.path.join(logdir, rem.shortname) misc.pull_directory(rem, archive_dir, path) # Check for coredumps and pull binaries fetch_binaries_for_coredumps(path, rem) log.info("Removing archive directory...") run.wait(ctx.cluster.run(args=["rm", "-rf", "--", archive_dir], wait=False))
def _rebuild_db(ctx, manager, cluster_name, mon, mon_id, keyring_path):
    local_mstore = tempfile.mkdtemp()

    # collect the maps from all OSDs
    is_osd = teuthology.is_type('osd')
    osds = ctx.cluster.only(is_osd)
    assert osds
    for osd, roles in osds.remotes.iteritems():
        for role in roles:
            if not is_osd(role):
                continue
            cluster, _, osd_id = teuthology.split_role(role)
            assert cluster_name == cluster
            log.info('collecting maps from {cluster}:osd.{osd}'.format(
                cluster=cluster,
                osd=osd_id))
            # push leveldb to OSD
            osd_mstore = os.path.join(teuthology.get_testdir(ctx), 'mon-store')
            osd.run(args=['sudo', 'mkdir', '-m', 'o+x', '-p', osd_mstore])

            _push_directory(local_mstore, osd, osd_mstore)
            log.info('rm -rf {0}'.format(local_mstore))
            shutil.rmtree(local_mstore)
            # update leveldb with OSD data
            options = '--no-mon-config --op update-mon-db --mon-store-path {0}'
            log.info('cot {0}'.format(osd_mstore))
            manager.objectstore_tool(pool=None,
                                     options=options.format(osd_mstore),
                                     args='',
                                     osd=osd_id,
                                     do_revive=False)
            # pull the updated mon db
            log.info('pull dir {0} -> {1}'.format(osd_mstore, local_mstore))
            local_mstore = tempfile.mkdtemp()
            teuthology.pull_directory(osd, osd_mstore, local_mstore)
            log.info('rm -rf osd:{0}'.format(osd_mstore))
            osd.run(args=['sudo', 'rm', '-fr', osd_mstore])

    # recover the first_mon with re-built mon db
    # pull from recovered leveldb from client
    mon_store_dir = os.path.join('/var/lib/ceph/mon',
                                 '{0}-{1}'.format(cluster_name, mon_id))
    _push_directory(local_mstore, mon, mon_store_dir)
    mon.run(args=['sudo', 'chown', '-R', 'ceph:ceph', mon_store_dir])
    shutil.rmtree(local_mstore)

    # fill up the caps in the keyring file
    mon.run(args=['sudo', 'ceph-authtool', keyring_path,
                  '-n', 'mon.',
                  '--cap', 'mon', 'allow *'])
    mon.run(args=['sudo', 'ceph-authtool', keyring_path,
                  '-n', 'client.admin',
                  '--cap', 'mon', 'allow *',
                  '--cap', 'osd', 'allow *',
                  '--cap', 'mds', 'allow *',
                  '--cap', 'mgr', 'allow *'])
    mon.run(args=['sudo', '-u', 'ceph',
                  'CEPH_ARGS=--no-mon-config',
                  'ceph-monstore-tool', mon_store_dir,
                  'rebuild', '--', '--keyring', keyring_path])
def ceph_crash(ctx, config): """ Gather crash dumps from /var/lib/ceph/$fsid/crash """ cluster_name = config['cluster'] fsid = ctx.ceph[cluster_name].fsid try: yield finally: if ctx.archive is not None: log.info('Archiving crash dumps...') path = os.path.join(ctx.archive, 'remote') try: os.makedirs(path) except OSError: pass for remote in ctx.cluster.remotes.keys(): sub = os.path.join(path, remote.name) try: os.makedirs(sub) except OSError: pass try: teuthology.pull_directory(remote, '/var/lib/ceph/%s/crash' % fsid, os.path.join(sub, 'crash')) except ReadError: pass
def archive(ctx, config): """ Handle the creation and deletion of the archive directory. """ log.info('Creating archive directory...') archive_dir = misc.get_archive_dir(ctx) run.wait( ctx.cluster.run( args=['install', '-d', '-m0755', '--', archive_dir], wait=False, ) ) # Add logs directory to job's info log file misc.add_remote_path(ctx, 'init', archive_dir) try: yield except Exception: # we need to know this below set_status(ctx.summary, 'fail') raise finally: passed = get_status(ctx.summary) == 'pass' if ctx.archive is not None and \ not (ctx.config.get('archive-on-error') and passed): log.info('Transferring archived files...') logdir = os.path.join(ctx.archive, 'remote') if (not os.path.exists(logdir)): os.mkdir(logdir) for rem in ctx.cluster.remotes.keys(): path = os.path.join(logdir, rem.shortname) min_size_option = ctx.config.get('log-compress-min-size', '128MB') try: compress_min_size_bytes = \ humanfriendly.parse_size(min_size_option) except humanfriendly.InvalidSize: msg = 'invalid "log-compress-min-size": {}'.format(min_size_option) log.error(msg) raise ConfigError(msg) maybe_compress = functools.partial(gzip_if_too_large, compress_min_size_bytes) misc.pull_directory(rem, archive_dir, path, maybe_compress) # Check for coredumps and pull binaries fetch_binaries_for_coredumps(path, rem) log.info('Removing archive directory...') run.wait( ctx.cluster.run( args=['rm', '-rf', '--', archive_dir], wait=False, ), )
def collect_logs(self):
    ctx = self.ctx
    if ctx.archive is not None and \
            not (ctx.config.get('archive-on-error') and ctx.summary['success']):
        log.info('Archiving logs...')
        path = os.path.join(ctx.archive, 'remote')
        os.makedirs(path)
        for remote in ctx.cluster.remotes.iterkeys():
            sub = os.path.join(path, remote.shortname)
            os.makedirs(sub)
            misc.pull_directory(remote, '/var/log/ceph',
                                os.path.join(sub, 'log'))
def archive(ctx, config): """ Handle the creation and deletion of the archive directory. """ log.info('Creating archive directory...') archive_dir = misc.get_archive_dir(ctx) run.wait( ctx.cluster.run( args=[ 'install', '-d', '-m0755', '--', archive_dir, ], wait=False, )) try: yield except Exception: # we need to know this below set_status(ctx.summary, 'fail') raise finally: passed = get_status(ctx.summary) == 'pass' if ctx.archive is not None and \ not (ctx.config.get('archive-on-error') and passed): log.info('Transferring archived files...') logdir = os.path.join(ctx.archive, 'remote') if (not os.path.exists(logdir)): os.mkdir(logdir) for rem in ctx.cluster.remotes.iterkeys(): path = os.path.join(logdir, rem.shortname) misc.pull_directory(rem, archive_dir, path) # Check for coredumps and pull binaries fetch_binaries_for_coredumps(path, rem) log.info('Removing archive directory...') run.wait( ctx.cluster.run( args=[ 'rm', '-rf', '--', archive_dir, ], wait=False, ), )
def ceph_log(ctx, config):
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    try:
        yield

    finally:
        if ctx.archive is not None and \
                not (ctx.config.get('archive-on-error') and ctx.summary['success']):
            # and logs
            log.info('Compressing logs...')
            run.wait(
                ctx.cluster.run(
                    args=[
                        'sudo', 'find', '/var/log/ceph/' + fsid,
                        '-name', '*.log', '-print0',
                        run.Raw('|'),
                        'sudo', 'xargs', '-0', '--no-run-if-empty', '--',
                        'gzip', '--',
                    ],
                    wait=False,
                ),
            )

            log.info('Archiving logs...')
            path = os.path.join(ctx.archive, 'remote')
            try:
                os.makedirs(path)
            except OSError:
                pass
            for remote in ctx.cluster.remotes.keys():
                sub = os.path.join(path, remote.name)
                try:
                    os.makedirs(sub)
                except OSError:
                    pass
                teuthology.pull_directory(remote, '/var/log/ceph/' + fsid,
                                          os.path.join(sub, 'log'))
def archive(ctx, config): """ Handle the creation and deletion of the archive directory. """ log.info('Creating archive directory...') archive_dir = misc.get_archive_dir(ctx) run.wait( ctx.cluster.run( args=[ 'install', '-d', '-m0755', '--', archive_dir, ], wait=False, ) ) try: yield except Exception: # we need to know this below set_status(ctx.summary, 'fail') raise finally: passed = get_status(ctx.summary) == 'pass' if ctx.archive is not None and \ not (ctx.config.get('archive-on-error') and passed): log.info('Transferring archived files...') logdir = os.path.join(ctx.archive, 'remote') if (not os.path.exists(logdir)): os.mkdir(logdir) for rem in ctx.cluster.remotes.iterkeys(): path = os.path.join(logdir, rem.shortname) misc.pull_directory(rem, archive_dir, path) # Check for coredumps and pull binaries fetch_binaries_for_coredumps(path, rem) log.info('Removing archive directory...') run.wait( ctx.cluster.run( args=[ 'rm', '-rf', '--', archive_dir, ], wait=False, ), )
def archive(ctx, config): """ Handle the creation and deletion of the archive directory. """ log.info('Creating archive directory...') archive_dir = teuthology.get_archive_dir(ctx) run.wait( ctx.cluster.run( args=[ 'install', '-d', '-m0755', '--', archive_dir, ], wait=False, )) try: yield except Exception: # we need to know this below ctx.summary['success'] = False raise finally: if ctx.archive is not None and \ not (ctx.config.get('archive-on-error') and ctx.summary['success']): log.info('Transferring archived files...') logdir = os.path.join(ctx.archive, 'remote') if (not os.path.exists(logdir)): os.mkdir(logdir) for remote in ctx.cluster.remotes.iterkeys(): path = os.path.join(logdir, remote.shortname) teuthology.pull_directory(remote, archive_dir, path) log.info('Removing archive directory...') run.wait( ctx.cluster.run( args=[ 'rm', '-rf', '--', archive_dir, ], wait=False, ), )
def archive(ctx, config): """ Handle the creation and deletion of the archive directory. """ log.info('Creating archive directory...') archive_dir = teuthology.get_archive_dir(ctx) run.wait( ctx.cluster.run( args=[ 'install', '-d', '-m0755', '--', archive_dir, ], wait=False, ) ) try: yield except Exception: # we need to know this below ctx.summary['success'] = False raise finally: if ctx.archive is not None and \ not (ctx.config.get('archive-on-error') and ctx.summary['success']): log.info('Transferring archived files...') logdir = os.path.join(ctx.archive, 'remote') if (not os.path.exists(logdir)): os.mkdir(logdir) for remote in ctx.cluster.remotes.iterkeys(): path = os.path.join(logdir, remote.shortname) teuthology.pull_directory(remote, archive_dir, path) log.info('Removing archive directory...') run.wait( ctx.cluster.run( args=[ 'rm', '-rf', '--', archive_dir, ], wait=False, ), )
def collect_logs(self):
    ctx = self.ctx
    if ctx.archive is not None and \
            not (ctx.config.get('archive-on-error') and ctx.summary['success']):
        log.info('Archiving logs...')
        path = os.path.join(ctx.archive, 'remote')
        os.makedirs(path)

        def wanted(role):
            # Only attempt to collect logs from hosts which are part of the
            # cluster
            return any(map(
                lambda role_stub: role.startswith(role_stub),
                self.groups_to_roles.values(),
            ))

        for remote in ctx.cluster.only(wanted).remotes.keys():
            sub = os.path.join(path, remote.shortname)
            os.makedirs(sub)
            misc.pull_directory(remote, '/var/log/ceph',
                                os.path.join(sub, 'log'))
def ceph_crash(ctx, config): """ Gather crash dumps from /var/lib/ceph/$fsid/crash """ cluster_name = config['cluster'] fsid = ctx.ceph[cluster_name].fsid # Add logs directory to job's info log file with open(os.path.join(ctx.archive, 'info.yaml'), 'r+') as info_file: info_yaml = yaml.safe_load(info_file) info_file.seek(0) if 'archive' not in info_yaml: info_yaml['archive'] = {'crash': '/var/lib/ceph/%s/crash' % fsid} else: info_yaml['archive']['crash'] = '/var/lib/ceph/%s/crash' % fsid yaml.safe_dump(info_yaml, info_file, default_flow_style=False) try: yield finally: if ctx.archive is not None: log.info('Archiving crash dumps...') path = os.path.join(ctx.archive, 'remote') try: os.makedirs(path) except OSError: pass for remote in ctx.cluster.remotes.keys(): sub = os.path.join(path, remote.name) try: os.makedirs(sub) except OSError: pass try: teuthology.pull_directory(remote, '/var/lib/ceph/%s/crash' % fsid, os.path.join(sub, 'crash')) except ReadError: pass
def archive(ctx, config): log.info("Creating archive directory...") archive_dir = teuthology.get_archive_dir(ctx) run.wait(ctx.cluster.run(args=["install", "-d", "-m0755", "--", archive_dir], wait=False)) try: yield except Exception: # we need to know this below ctx.summary["success"] = False raise finally: if ctx.archive is not None and not (ctx.config.get("archive-on-error") and ctx.summary["success"]): log.info("Transferring archived files...") logdir = os.path.join(ctx.archive, "remote") if not os.path.exists(logdir): os.mkdir(logdir) for remote in ctx.cluster.remotes.iterkeys(): path = os.path.join(logdir, remote.shortname) teuthology.pull_directory(remote, archive_dir, path) log.info("Removing archive directory...") run.wait(ctx.cluster.run(args=["rm", "-rf", "--", archive_dir], wait=False))
def build_ceph_cluster(ctx, config): """Build a ceph cluster""" try: log.info('Building ceph cluster using ceph-deploy...') testdir = teuthology.get_testdir(ctx) ceph_branch = None if config.get('branch') is not None: cbranch = config.get('branch') for var, val in cbranch.iteritems(): if var == 'testing': ceph_branch = '--{var}'.format(var=var) ceph_branch = '--{var}={val}'.format(var=var, val=val) node_dev_list = [] all_nodes = get_all_nodes(ctx, config) mds_nodes = get_nodes_using_roles(ctx, config, 'mds') mds_nodes = " ".join(mds_nodes) mon_node = get_nodes_using_roles(ctx, config, 'mon') mon_nodes = " ".join(mon_node) new_mon = './ceph-deploy new' + " " + mon_nodes install_nodes = './ceph-deploy install ' + ceph_branch + " " + all_nodes purge_nodes = './ceph-deploy purge' + " " + all_nodes purgedata_nodes = './ceph-deploy purgedata' + " " + all_nodes mon_hostname = mon_nodes.split(' ')[0] mon_hostname = str(mon_hostname) gather_keys = './ceph-deploy gatherkeys' + " " + mon_hostname deploy_mds = './ceph-deploy mds create' + " " + mds_nodes no_of_osds = 0 if mon_nodes is None: raise RuntimeError("no monitor nodes in the config file") estatus_new = execute_ceph_deploy(ctx, config, new_mon) if estatus_new != 0: raise RuntimeError("ceph-deploy: new command failed") log.info('adding config inputs...') testdir = teuthology.get_testdir(ctx) conf_path = '{tdir}/ceph-deploy/ceph.conf'.format(tdir=testdir) first_mon = teuthology.get_first_mon(ctx, config) (remote, ) = ctx.cluster.only(first_mon).remotes.keys() lines = None if config.get('conf') is not None: confp = config.get('conf') for section, keys in confp.iteritems(): lines = '[{section}]\n'.format(section=section) teuthology.append_lines_to_file(remote, conf_path, lines, sudo=True) for key, value in keys.iteritems(): log.info("[%s] %s = %s" % (section, key, value)) lines = '{key} = {value}\n'.format(key=key, value=value) teuthology.append_lines_to_file(remote, conf_path, lines, sudo=True) estatus_install = execute_ceph_deploy(ctx, config, install_nodes) if estatus_install != 0: raise RuntimeError("ceph-deploy: Failed to install ceph") mon_no = None mon_no = config.get('mon_initial_members') if mon_no is not None: i = 0 mon1 = [] while (i < mon_no): mon1.append(mon_node[i]) i = i + 1 initial_mons = " ".join(mon1) for k in range(mon_no, len(mon_node)): mon_create_nodes = './ceph-deploy mon create' + " " + \ initial_mons + " " + mon_node[k] estatus_mon = execute_ceph_deploy(ctx, config, mon_create_nodes) if estatus_mon != 0: raise RuntimeError("ceph-deploy: Failed to create monitor") else: mon_create_nodes = './ceph-deploy mon create-initial' estatus_mon = execute_ceph_deploy(ctx, config, mon_create_nodes) if estatus_mon != 0: raise RuntimeError("ceph-deploy: Failed to create monitors") estatus_gather = execute_ceph_deploy(ctx, config, gather_keys) max_gather_tries = 90 gather_tries = 0 while (estatus_gather != 0): gather_tries += 1 if gather_tries >= max_gather_tries: msg = 'ceph-deploy was not able to gatherkeys after 15 minutes' raise RuntimeError(msg) estatus_gather = execute_ceph_deploy(ctx, config, gather_keys) time.sleep(10) if mds_nodes: estatus_mds = execute_ceph_deploy(ctx, config, deploy_mds) if estatus_mds != 0: raise RuntimeError("ceph-deploy: Failed to deploy mds") if config.get('test_mon_destroy') is not None: for d in range(1, len(mon_node)): mon_destroy_nodes = './ceph-deploy mon destroy' + " " + mon_node[ d] estatus_mon_d = execute_ceph_deploy(ctx, config, mon_destroy_nodes) if estatus_mon_d != 0: raise 
RuntimeError("ceph-deploy: Failed to delete monitor") node_dev_list = get_dev_for_osd(ctx, config) for d in node_dev_list: osd_create_cmds = './ceph-deploy osd create --zap-disk' + " " + d estatus_osd = execute_ceph_deploy(ctx, config, osd_create_cmds) if estatus_osd == 0: log.info('successfully created osd') no_of_osds += 1 else: zap_disk = './ceph-deploy disk zap' + " " + d execute_ceph_deploy(ctx, config, zap_disk) estatus_osd = execute_ceph_deploy(ctx, config, osd_create_cmds) if estatus_osd == 0: log.info('successfully created osd') no_of_osds += 1 else: raise RuntimeError("ceph-deploy: Failed to create osds") if config.get('wait-for-healthy', True) and no_of_osds >= 2: is_healthy(ctx=ctx, config=None) log.info('Setting up client nodes...') conf_path = '/etc/ceph/ceph.conf' admin_keyring_path = '/etc/ceph/ceph.client.admin.keyring' first_mon = teuthology.get_first_mon(ctx, config) (mon0_remote, ) = ctx.cluster.only(first_mon).remotes.keys() conf_data = teuthology.get_file( remote=mon0_remote, path=conf_path, sudo=True, ) admin_keyring = teuthology.get_file( remote=mon0_remote, path=admin_keyring_path, sudo=True, ) clients = ctx.cluster.only(teuthology.is_type('client')) for remot, roles_for_host in clients.remotes.iteritems(): for id_ in teuthology.roles_of_type(roles_for_host, 'client'): client_keyring = \ '/etc/ceph/ceph.client.{id}.keyring'.format(id=id_) mon0_remote.run(args=[ 'cd', '{tdir}'.format(tdir=testdir), run.Raw('&&'), 'sudo', 'bash', '-c', run.Raw('"'), 'ceph', 'auth', 'get-or-create', 'client.{id}'.format(id=id_), 'mds', 'allow', 'mon', 'allow *', 'osd', 'allow *', run.Raw('>'), client_keyring, run.Raw('"'), ], ) key_data = teuthology.get_file( remote=mon0_remote, path=client_keyring, sudo=True, ) teuthology.sudo_write_file(remote=remot, path=client_keyring, data=key_data, perms='0644') teuthology.sudo_write_file(remote=remot, path=admin_keyring_path, data=admin_keyring, perms='0644') teuthology.sudo_write_file(remote=remot, path=conf_path, data=conf_data, perms='0644') else: raise RuntimeError( "The cluster is NOT operational due to insufficient OSDs") yield finally: log.info('Stopping ceph...') ctx.cluster.run(args=[ 'sudo', 'stop', 'ceph-all', run.Raw('||'), 'sudo', 'service', 'ceph', 'stop' ]) if ctx.archive is not None: # archive mon data, too log.info('Archiving mon data...') path = os.path.join(ctx.archive, 'data') os.makedirs(path) mons = ctx.cluster.only(teuthology.is_type('mon')) for remote, roles in mons.remotes.iteritems(): for role in roles: if role.startswith('mon.'): teuthology.pull_directory_tarball( remote, '/var/lib/ceph/mon', path + '/' + role + '.tgz') log.info('Compressing logs...') run.wait( ctx.cluster.run( args=[ 'sudo', 'find', '/var/log/ceph', '-name', '*.log', '-print0', run.Raw('|'), 'sudo', 'xargs', '-0', '--no-run-if-empty', '--', 'gzip', '--', ], wait=False, ), ) log.info('Archiving logs...') path = os.path.join(ctx.archive, 'remote') os.makedirs(path) for remote in ctx.cluster.remotes.iterkeys(): sub = os.path.join(path, remote.shortname) os.makedirs(sub) teuthology.pull_directory(remote, '/var/log/ceph', os.path.join(sub, 'log')) # Prevent these from being undefined if the try block fails all_nodes = get_all_nodes(ctx, config) purge_nodes = './ceph-deploy purge' + " " + all_nodes purgedata_nodes = './ceph-deploy purgedata' + " " + all_nodes log.info('Purging package...') execute_ceph_deploy(ctx, config, purge_nodes) log.info('Purging data...') execute_ceph_deploy(ctx, config, purgedata_nodes)
def ceph_log(ctx, config): """ Create /var/log/ceph log directory that is open to everyone. Add valgrind and profiling-logger directories. :param ctx: Context :param config: Configuration """ log.info('Making ceph log dir writeable by non-root...') run.wait( ctx.cluster.run( args=[ 'sudo', 'chmod', '777', '/var/log/ceph', ], wait=False, ) ) log.info('Disabling ceph logrotate...') run.wait( ctx.cluster.run( args=[ 'sudo', 'rm', '-f', '--', '/etc/logrotate.d/ceph', ], wait=False, ) ) log.info('Creating extra log directories...') run.wait( ctx.cluster.run( args=[ 'sudo', 'install', '-d', '-m0777', '--', '/var/log/ceph/valgrind', '/var/log/ceph/profiling-logger', ], wait=False, ) ) try: yield finally: if ctx.archive is not None and \ not (ctx.config.get('archive-on-error') and ctx.summary['success']): # and logs log.info('Compressing logs...') run.wait( ctx.cluster.run( args=[ 'sudo', 'find', '/var/log/ceph', '-name', '*.log', '-print0', run.Raw('|'), 'sudo', 'xargs', '-0', '--no-run-if-empty', '--', 'gzip', '--', ], wait=False, ), ) log.info('Archiving logs...') path = os.path.join(ctx.archive, 'remote') os.makedirs(path) for remote in ctx.cluster.remotes.iterkeys(): sub = os.path.join(path, remote.shortname) os.makedirs(sub) teuthology.pull_directory(remote, '/var/log/ceph', os.path.join(sub, 'log'))
def cluster(ctx, config): """ Handle the creation and removal of a ceph cluster. On startup: Create directories needed for the cluster. Create remote journals for all osds. Create and set keyring. Copy the monmap to tht test systems. Setup mon nodes. Setup mds nodes. Mkfs osd nodes. Add keyring information to monmaps Mkfs mon nodes. On exit: If errors occured, extract a failure message and store in ctx.summary. Unmount all test files and temporary journaling files. Save the monitor information and archive all ceph logs. Cleanup the keyring setup, and remove all monitor map and data files left over. :param ctx: Context :param config: Configuration """ if ctx.config.get('use_existing_cluster', False) is True: log.info("'use_existing_cluster' is true; skipping cluster creation") yield testdir = teuthology.get_testdir(ctx) log.info('Creating ceph cluster...') run.wait( ctx.cluster.run( args=[ 'install', '-d', '-m0755', '--', '{tdir}/data'.format(tdir=testdir), ], wait=False, )) run.wait( ctx.cluster.run( args=[ 'sudo', 'install', '-d', '-m0777', '--', '/var/run/ceph', ], wait=False, )) devs_to_clean = {} remote_to_roles_to_devs = {} remote_to_roles_to_journals = {} osds = ctx.cluster.only(teuthology.is_type('osd')) for remote, roles_for_host in osds.remotes.iteritems(): devs = teuthology.get_scratch_devices(remote) roles_to_devs = {} roles_to_journals = {} if config.get('fs'): log.info('fs option selected, checking for scratch devs') log.info('found devs: %s' % (str(devs), )) devs_id_map = teuthology.get_wwn_id_map(remote, devs) iddevs = devs_id_map.values() roles_to_devs = assign_devs( teuthology.roles_of_type(roles_for_host, 'osd'), iddevs) if len(roles_to_devs) < len(iddevs): iddevs = iddevs[len(roles_to_devs):] devs_to_clean[remote] = [] if config.get('block_journal'): log.info('block journal enabled') roles_to_journals = assign_devs( teuthology.roles_of_type(roles_for_host, 'osd'), iddevs) log.info('journal map: %s', roles_to_journals) if config.get('tmpfs_journal'): log.info('tmpfs journal enabled') roles_to_journals = {} remote.run(args=['sudo', 'mount', '-t', 'tmpfs', 'tmpfs', '/mnt']) for osd in teuthology.roles_of_type(roles_for_host, 'osd'): tmpfs = '/mnt/osd.%s' % osd roles_to_journals[osd] = tmpfs remote.run(args=['truncate', '-s', '1500M', tmpfs]) log.info('journal map: %s', roles_to_journals) log.info('dev map: %s' % (str(roles_to_devs), )) remote_to_roles_to_devs[remote] = roles_to_devs remote_to_roles_to_journals[remote] = roles_to_journals log.info('Generating config...') remotes_and_roles = ctx.cluster.remotes.items() roles = [role_list for (remote, role_list) in remotes_and_roles] ips = [ host for (host, port) in (remote.ssh.get_transport().getpeername() for (remote, role_list) in remotes_and_roles) ] conf = teuthology.skeleton_config(ctx, roles=roles, ips=ips) for remote, roles_to_journals in remote_to_roles_to_journals.iteritems(): for role, journal in roles_to_journals.iteritems(): key = "osd." 
+ str(role) if key not in conf: conf[key] = {} conf[key]['osd journal'] = journal for section, keys in config['conf'].iteritems(): for key, value in keys.iteritems(): log.info("[%s] %s = %s" % (section, key, value)) if section not in conf: conf[section] = {} conf[section][key] = value if config.get('tmpfs_journal'): conf['journal dio'] = False ctx.ceph = argparse.Namespace() ctx.ceph.conf = conf keyring_path = config.get('keyring_path', '/etc/ceph/ceph.keyring') coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir) firstmon = teuthology.get_first_mon(ctx, config) log.info('Setting up %s...' % firstmon) ctx.cluster.only(firstmon).run(args=[ 'sudo', 'adjust-ulimits', 'ceph-coverage', coverage_dir, 'ceph-authtool', '--create-keyring', keyring_path, ], ) ctx.cluster.only(firstmon).run(args=[ 'sudo', 'adjust-ulimits', 'ceph-coverage', coverage_dir, 'ceph-authtool', '--gen-key', '--name=mon.', keyring_path, ], ) ctx.cluster.only(firstmon).run(args=[ 'sudo', 'chmod', '0644', keyring_path, ], ) (mon0_remote, ) = ctx.cluster.only(firstmon).remotes.keys() fsid = teuthology.create_simple_monmap( ctx, remote=mon0_remote, conf=conf, ) if not 'global' in conf: conf['global'] = {} conf['global']['fsid'] = fsid log.info('Writing ceph.conf for FSID %s...' % fsid) conf_path = config.get('conf_path', '/etc/ceph/ceph.conf') conf_fp = StringIO() conf.write(conf_fp) conf_fp.seek(0) writes = ctx.cluster.run( args=[ 'sudo', 'mkdir', '-p', '/etc/ceph', run.Raw('&&'), 'sudo', 'chmod', '0755', '/etc/ceph', run.Raw('&&'), 'sudo', 'python', '-c', 'import shutil, sys; shutil.copyfileobj(sys.stdin, file(sys.argv[1], "wb"))', conf_path, run.Raw('&&'), 'sudo', 'chmod', '0644', conf_path, ], stdin=run.PIPE, wait=False, ) teuthology.feed_many_stdins_and_close(conf_fp, writes) run.wait(writes) log.info('Creating admin key on %s...' 
% firstmon) ctx.cluster.only(firstmon).run(args=[ 'sudo', 'adjust-ulimits', 'ceph-coverage', coverage_dir, 'ceph-authtool', '--gen-key', '--name=client.admin', '--set-uid=0', '--cap', 'mon', 'allow *', '--cap', 'osd', 'allow *', '--cap', 'mds', 'allow', keyring_path, ], ) log.info('Copying monmap to all nodes...') keyring = teuthology.get_file( remote=mon0_remote, path=keyring_path, ) monmap = teuthology.get_file( remote=mon0_remote, path='{tdir}/monmap'.format(tdir=testdir), ) for rem in ctx.cluster.remotes.iterkeys(): # copy mon key and initial monmap log.info('Sending monmap to node {remote}'.format(remote=rem)) teuthology.sudo_write_file(remote=rem, path=keyring_path, data=keyring, perms='0644') teuthology.write_file( remote=rem, path='{tdir}/monmap'.format(tdir=testdir), data=monmap, ) log.info('Setting up mon nodes...') mons = ctx.cluster.only(teuthology.is_type('mon')) run.wait( mons.run( args=[ 'adjust-ulimits', 'ceph-coverage', coverage_dir, 'osdmaptool', '-c', conf_path, '--clobber', '--createsimple', '{num:d}'.format(num=teuthology.num_instances_of_type( ctx.cluster, 'osd'), ), '{tdir}/osdmap'.format(tdir=testdir), '--pg_bits', '2', '--pgp_bits', '4', ], wait=False, ), ) log.info('Setting up mds nodes...') mdss = ctx.cluster.only(teuthology.is_type('mds')) for remote, roles_for_host in mdss.remotes.iteritems(): for id_ in teuthology.roles_of_type(roles_for_host, 'mds'): remote.run(args=[ 'sudo', 'mkdir', '-p', '/var/lib/ceph/mds/ceph-{id}'.format(id=id_), run.Raw('&&'), 'sudo', 'adjust-ulimits', 'ceph-coverage', coverage_dir, 'ceph-authtool', '--create-keyring', '--gen-key', '--name=mds.{id}'.format(id=id_), '/var/lib/ceph/mds/ceph-{id}/keyring'.format(id=id_), ], ) cclient.create_keyring(ctx) log.info('Running mkfs on osd nodes...') ctx.disk_config = argparse.Namespace() ctx.disk_config.remote_to_roles_to_dev = remote_to_roles_to_devs ctx.disk_config.remote_to_roles_to_journals = remote_to_roles_to_journals ctx.disk_config.remote_to_roles_to_dev_mount_options = {} ctx.disk_config.remote_to_roles_to_dev_fstype = {} log.info("ctx.disk_config.remote_to_roles_to_dev: {r}".format( r=str(ctx.disk_config.remote_to_roles_to_dev))) for remote, roles_for_host in osds.remotes.iteritems(): roles_to_devs = remote_to_roles_to_devs[remote] roles_to_journals = remote_to_roles_to_journals[remote] for id_ in teuthology.roles_of_type(roles_for_host, 'osd'): remote.run(args=[ 'sudo', 'mkdir', '-p', '/var/lib/ceph/osd/ceph-{id}'.format(id=id_), ]) log.info(str(roles_to_journals)) log.info(id_) if roles_to_devs.get(id_): dev = roles_to_devs[id_] fs = config.get('fs') package = None mkfs_options = config.get('mkfs_options') mount_options = config.get('mount_options') if fs == 'btrfs': #package = 'btrfs-tools' if mount_options is None: mount_options = ['noatime', 'user_subvol_rm_allowed'] if mkfs_options is None: mkfs_options = [ '-m', 'single', '-l', '32768', '-n', '32768' ] if fs == 'xfs': #package = 'xfsprogs' if mount_options is None: mount_options = ['noatime'] if mkfs_options is None: mkfs_options = ['-f', '-i', 'size=2048'] if fs == 'ext4' or fs == 'ext3': if mount_options is None: mount_options = ['noatime', 'user_xattr'] if mount_options is None: mount_options = [] if mkfs_options is None: mkfs_options = [] mkfs = ['mkfs.%s' % fs] + mkfs_options log.info('%s on %s on %s' % (mkfs, dev, remote)) if package is not None: remote.run( args=['sudo', 'apt-get', 'install', '-y', package], stdout=StringIO(), ) try: remote.run(args=['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev]) except 
run.CommandFailedError: # Newer btfs-tools doesn't prompt for overwrite, use -f if '-f' not in mount_options: mkfs_options.append('-f') mkfs = ['mkfs.%s' % fs] + mkfs_options log.info('%s on %s on %s' % (mkfs, dev, remote)) remote.run(args=['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev]) log.info('mount %s on %s -o %s' % (dev, remote, ','.join(mount_options))) remote.run(args=[ 'sudo', 'mount', '-t', fs, '-o', ','.join(mount_options), dev, os.path.join('/var/lib/ceph/osd', 'ceph-{id}'.format( id=id_)), ]) if not remote in ctx.disk_config.remote_to_roles_to_dev_mount_options: ctx.disk_config.remote_to_roles_to_dev_mount_options[ remote] = {} ctx.disk_config.remote_to_roles_to_dev_mount_options[remote][ id_] = mount_options if not remote in ctx.disk_config.remote_to_roles_to_dev_fstype: ctx.disk_config.remote_to_roles_to_dev_fstype[remote] = {} ctx.disk_config.remote_to_roles_to_dev_fstype[remote][id_] = fs devs_to_clean[remote].append( os.path.join( os.path.join('/var/lib/ceph/osd', 'ceph-{id}'.format(id=id_)), )) for id_ in teuthology.roles_of_type(roles_for_host, 'osd'): remote.run(args=[ 'sudo', 'MALLOC_CHECK_=3', 'adjust-ulimits', 'ceph-coverage', coverage_dir, 'ceph-osd', '--mkfs', '--mkkey', '-i', id_, '--monmap', '{tdir}/monmap'.format(tdir=testdir), ], ) log.info('Reading keys from all nodes...') keys_fp = StringIO() keys = [] for remote, roles_for_host in ctx.cluster.remotes.iteritems(): for type_ in ['mds', 'osd']: for id_ in teuthology.roles_of_type(roles_for_host, type_): data = teuthology.get_file( remote=remote, path='/var/lib/ceph/{type}/ceph-{id}/keyring'.format( type=type_, id=id_, ), sudo=True, ) keys.append((type_, id_, data)) keys_fp.write(data) for remote, roles_for_host in ctx.cluster.remotes.iteritems(): for type_ in ['client']: for id_ in teuthology.roles_of_type(roles_for_host, type_): data = teuthology.get_file( remote=remote, path='/etc/ceph/ceph.client.{id}.keyring'.format(id=id_)) keys.append((type_, id_, data)) keys_fp.write(data) log.info('Adding keys to all mons...') writes = mons.run( args=[ 'sudo', 'tee', '-a', keyring_path, ], stdin=run.PIPE, wait=False, stdout=StringIO(), ) keys_fp.seek(0) teuthology.feed_many_stdins_and_close(keys_fp, writes) run.wait(writes) for type_, id_, data in keys: run.wait( mons.run( args=[ 'sudo', 'adjust-ulimits', 'ceph-coverage', coverage_dir, 'ceph-authtool', keyring_path, '--name={type}.{id}'.format( type=type_, id=id_, ), ] + list(teuthology.generate_caps(type_)), wait=False, ), ) log.info('Running mkfs on mon nodes...') for remote, roles_for_host in mons.remotes.iteritems(): for id_ in teuthology.roles_of_type(roles_for_host, 'mon'): remote.run(args=[ 'sudo', 'mkdir', '-p', '/var/lib/ceph/mon/ceph-{id}'.format(id=id_), ], ) remote.run(args=[ 'sudo', 'adjust-ulimits', 'ceph-coverage', coverage_dir, 'ceph-mon', '--mkfs', '-i', id_, '--monmap={tdir}/monmap'.format(tdir=testdir), '--osdmap={tdir}/osdmap'.format(tdir=testdir), '--keyring={kpath}'.format(kpath=keyring_path), ], ) run.wait( mons.run( args=[ 'rm', '--', '{tdir}/monmap'.format(tdir=testdir), '{tdir}/osdmap'.format(tdir=testdir), ], wait=False, ), ) try: yield except Exception: # we need to know this below ctx.summary['success'] = False raise finally: (mon0_remote, ) = ctx.cluster.only(firstmon).remotes.keys() log.info('Checking cluster log for badness...') def first_in_ceph_log(pattern, excludes): """ Find the first occurence of the pattern specified in the Ceph log, Returns None if none found. :param pattern: Pattern scanned for. 
:param excludes: Patterns to ignore. :return: First line of text (or None if not found) """ args = [ 'sudo', 'egrep', pattern, '/var/log/ceph/ceph.log', ] for exclude in excludes: args.extend([run.Raw('|'), 'egrep', '-v', exclude]) args.extend([ run.Raw('|'), 'head', '-n', '1', ]) r = mon0_remote.run( stdout=StringIO(), args=args, ) stdout = r.stdout.getvalue() if stdout != '': return stdout return None if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]', config['log_whitelist']) is not None: log.warning('Found errors (ERR|WRN|SEC) in cluster log') ctx.summary['success'] = False # use the most severe problem as the failure reason if 'failure_reason' not in ctx.summary: for pattern in ['\[SEC\]', '\[ERR\]', '\[WRN\]']: match = first_in_ceph_log(pattern, config['log_whitelist']) if match is not None: ctx.summary['failure_reason'] = \ '"{match}" in cluster log'.format( match=match.rstrip('\n'), ) break for remote, dirs in devs_to_clean.iteritems(): for dir_ in dirs: log.info('Unmounting %s on %s' % (dir_, remote)) remote.run( args=['sync', run.Raw('&&'), 'sudo', 'umount', '-f', dir_]) if config.get('tmpfs_journal'): log.info('tmpfs journal enabled - unmounting tmpfs at /mnt') for remote, roles_for_host in osds.remotes.iteritems(): remote.run( args=['sudo', 'umount', '-f', '/mnt'], check_status=False, ) if ctx.archive is not None and \ not (ctx.config.get('archive-on-error') and ctx.summary['success']): # archive mon data, too log.info('Archiving mon data...') path = os.path.join(ctx.archive, 'data') os.makedirs(path) for remote, roles in mons.remotes.iteritems(): for role in roles: if role.startswith('mon.'): teuthology.pull_directory_tarball( remote, '/var/lib/ceph/mon', path + '/' + role + '.tgz') # and logs log.info('Compressing logs...') run.wait( ctx.cluster.run( args=[ 'sudo', 'find', '/var/log/ceph', '-name', '*.log', '-print0', run.Raw('|'), 'sudo', 'xargs', '-0', '--no-run-if-empty', '--', 'gzip', '--', ], wait=False, ), ) log.info('Archiving logs...') path = os.path.join(ctx.archive, 'remote') os.makedirs(path) for remote in ctx.cluster.remotes.iterkeys(): sub = os.path.join(path, remote.shortname) os.makedirs(sub) teuthology.pull_directory(remote, '/var/log/ceph', os.path.join(sub, 'log')) log.info('Cleaning ceph cluster...') run.wait( ctx.cluster.run( args=[ 'sudo', 'rm', '-rf', '--', conf_path, keyring_path, '{tdir}/data'.format(tdir=testdir), '{tdir}/monmap'.format(tdir=testdir), ], wait=False, ), )
def build_ceph_cluster(ctx, config): """Build a ceph cluster""" # Expect to find ceph_admin on the first mon by ID, same place that the download task # puts it. Remember this here, because subsequently IDs will change from those in # the test config to those that ceph-deploy invents. (ceph_admin,) = ctx.cluster.only( teuthology.get_first_mon(ctx, config)).remotes.iterkeys() def execute_ceph_deploy(cmd): """Remotely execute a ceph_deploy command""" return ceph_admin.run( args=[ 'cd', '{tdir}/ceph-deploy'.format(tdir=testdir), run.Raw('&&'), run.Raw(cmd), ], check_status=False, ).exitstatus try: log.info('Building ceph cluster using ceph-deploy...') testdir = teuthology.get_testdir(ctx) ceph_branch = None if config.get('branch') is not None: cbranch = config.get('branch') for var, val in cbranch.iteritems(): ceph_branch = '--{var}={val}'.format(var=var, val=val) all_nodes = get_all_nodes(ctx, config) mds_nodes = get_nodes_using_role(ctx, 'mds') mds_nodes = " ".join(mds_nodes) mon_node = get_nodes_using_role(ctx, 'mon') mon_nodes = " ".join(mon_node) new_mon = './ceph-deploy new' + " " + mon_nodes mon_hostname = mon_nodes.split(' ')[0] mon_hostname = str(mon_hostname) gather_keys = './ceph-deploy gatherkeys' + " " + mon_hostname deploy_mds = './ceph-deploy mds create' + " " + mds_nodes no_of_osds = 0 if mon_nodes is None: raise RuntimeError("no monitor nodes in the config file") estatus_new = execute_ceph_deploy(new_mon) if estatus_new != 0: raise RuntimeError("ceph-deploy: new command failed") log.info('adding config inputs...') testdir = teuthology.get_testdir(ctx) conf_path = '{tdir}/ceph-deploy/ceph.conf'.format(tdir=testdir) if config.get('conf') is not None: confp = config.get('conf') for section, keys in confp.iteritems(): lines = '[{section}]\n'.format(section=section) teuthology.append_lines_to_file(ceph_admin, conf_path, lines, sudo=True) for key, value in keys.iteritems(): log.info("[%s] %s = %s" % (section, key, value)) lines = '{key} = {value}\n'.format(key=key, value=value) teuthology.append_lines_to_file( ceph_admin, conf_path, lines, sudo=True) # install ceph dev_branch = ctx.config['branch'] branch = '--dev={branch}'.format(branch=dev_branch) if ceph_branch: option = ceph_branch else: option = branch install_nodes = './ceph-deploy install ' + option + " " + all_nodes estatus_install = execute_ceph_deploy(install_nodes) if estatus_install != 0: raise RuntimeError("ceph-deploy: Failed to install ceph") # install ceph-test package too install_nodes2 = './ceph-deploy install --tests ' + option + \ " " + all_nodes estatus_install = execute_ceph_deploy(install_nodes2) if estatus_install != 0: raise RuntimeError("ceph-deploy: Failed to install ceph-test") mon_create_nodes = './ceph-deploy mon create-initial' # If the following fails, it is OK, it might just be that the monitors # are taking way more than a minute/monitor to form quorum, so lets # try the next block which will wait up to 15 minutes to gatherkeys. 
execute_ceph_deploy(mon_create_nodes) # create-keys is explicit now # http://tracker.ceph.com/issues/16036 mons = ctx.cluster.only(teuthology.is_type('mon')) for remote in mons.remotes.iterkeys(): remote.run(args=['sudo', 'ceph-create-keys', '--cluster', 'ceph', '--id', remote.shortname]) estatus_gather = execute_ceph_deploy(gather_keys) if mds_nodes: estatus_mds = execute_ceph_deploy(deploy_mds) if estatus_mds != 0: raise RuntimeError("ceph-deploy: Failed to deploy mds") if config.get('test_mon_destroy') is not None: for d in range(1, len(mon_node)): mon_destroy_nodes = './ceph-deploy mon destroy' + \ " " + mon_node[d] estatus_mon_d = execute_ceph_deploy(mon_destroy_nodes) if estatus_mon_d != 0: raise RuntimeError("ceph-deploy: Failed to delete monitor") node_dev_list = get_dev_for_osd(ctx, config) for d in node_dev_list: node = d[0] for disk in d[1:]: zap = './ceph-deploy disk zap ' + node + ':' + disk estatus = execute_ceph_deploy(zap) if estatus != 0: raise RuntimeError("ceph-deploy: Failed to zap osds") osd_create_cmd = './ceph-deploy osd create ' if config.get('dmcrypt') is not None: osd_create_cmd += '--dmcrypt ' osd_create_cmd += ":".join(d) estatus_osd = execute_ceph_deploy(osd_create_cmd) if estatus_osd == 0: log.info('successfully created osd') no_of_osds += 1 else: raise RuntimeError("ceph-deploy: Failed to create osds") if config.get('wait-for-healthy', True) and no_of_osds >= 2: is_healthy(ctx=ctx, config=None) log.info('Setting up client nodes...') conf_path = '/etc/ceph/ceph.conf' admin_keyring_path = '/etc/ceph/ceph.client.admin.keyring' first_mon = teuthology.get_first_mon(ctx, config) (mon0_remote,) = ctx.cluster.only(first_mon).remotes.keys() conf_data = teuthology.get_file( remote=mon0_remote, path=conf_path, sudo=True, ) admin_keyring = teuthology.get_file( remote=mon0_remote, path=admin_keyring_path, sudo=True, ) clients = ctx.cluster.only(teuthology.is_type('client')) for remot, roles_for_host in clients.remotes.iteritems(): for id_ in teuthology.roles_of_type(roles_for_host, 'client'): client_keyring = \ '/etc/ceph/ceph.client.{id}.keyring'.format(id=id_) mon0_remote.run( args=[ 'cd', '{tdir}'.format(tdir=testdir), run.Raw('&&'), 'sudo', 'bash', '-c', run.Raw('"'), 'ceph', 'auth', 'get-or-create', 'client.{id}'.format(id=id_), 'mds', 'allow', 'mon', 'allow *', 'osd', 'allow *', run.Raw('>'), client_keyring, run.Raw('"'), ], ) key_data = teuthology.get_file( remote=mon0_remote, path=client_keyring, sudo=True, ) teuthology.sudo_write_file( remote=remot, path=client_keyring, data=key_data, perms='0644' ) teuthology.sudo_write_file( remote=remot, path=admin_keyring_path, data=admin_keyring, perms='0644' ) teuthology.sudo_write_file( remote=remot, path=conf_path, data=conf_data, perms='0644' ) if mds_nodes: log.info('Configuring CephFS...') ceph_fs = Filesystem(ctx) if not ceph_fs.legacy_configured(): ceph_fs.create() elif not config.get('only_mon'): raise RuntimeError( "The cluster is NOT operational due to insufficient OSDs") yield except Exception: log.info( "Error encountered, logging exception before tearing down ceph-deploy") log.info(traceback.format_exc()) raise finally: if config.get('keep_running'): return log.info('Stopping ceph...') ctx.cluster.run(args=['sudo', 'stop', 'ceph-all', run.Raw('||'), 'sudo', 'service', 'ceph', 'stop', run.Raw('||'), 'sudo', 'systemctl', 'stop', 'ceph.target']) # Are you really not running anymore? 
# try first with the init tooling # ignoring the status so this becomes informational only ctx.cluster.run( args=[ 'sudo', 'status', 'ceph-all', run.Raw('||'), 'sudo', 'service', 'ceph', 'status', run.Raw('||'), 'sudo', 'systemctl', 'status', 'ceph.target'], check_status=False) # and now just check for the processes themselves, as if upstart/sysvinit # is lying to us. Ignore errors if the grep fails ctx.cluster.run(args=['sudo', 'ps', 'aux', run.Raw('|'), 'grep', '-v', 'grep', run.Raw('|'), 'grep', 'ceph'], check_status=False) if ctx.archive is not None: # archive mon data, too log.info('Archiving mon data...') path = os.path.join(ctx.archive, 'data') os.makedirs(path) mons = ctx.cluster.only(teuthology.is_type('mon')) for remote, roles in mons.remotes.iteritems(): for role in roles: if role.startswith('mon.'): teuthology.pull_directory_tarball( remote, '/var/lib/ceph/mon', path + '/' + role + '.tgz') log.info('Compressing logs...') run.wait( ctx.cluster.run( args=[ 'sudo', 'find', '/var/log/ceph', '-name', '*.log', '-print0', run.Raw('|'), 'sudo', 'xargs', '-0', '--no-run-if-empty', '--', 'gzip', '--', ], wait=False, ), ) log.info('Archiving logs...') path = os.path.join(ctx.archive, 'remote') os.makedirs(path) for remote in ctx.cluster.remotes.iterkeys(): sub = os.path.join(path, remote.shortname) os.makedirs(sub) teuthology.pull_directory(remote, '/var/log/ceph', os.path.join(sub, 'log')) # Prevent these from being undefined if the try block fails all_nodes = get_all_nodes(ctx, config) purge_nodes = './ceph-deploy purge' + " " + all_nodes purgedata_nodes = './ceph-deploy purgedata' + " " + all_nodes log.info('Purging package...') execute_ceph_deploy(purge_nodes) log.info('Purging data...') execute_ceph_deploy(purgedata_nodes)
def ceph_log(ctx, config): """ Create /var/log/ceph log directory that is open to everyone. Add valgrind and profiling-logger directories. :param ctx: Context :param config: Configuration """ log.info('Making ceph log dir writeable by non-root...') run.wait( ctx.cluster.run( args=[ 'sudo', 'chmod', '777', '/var/log/ceph', ], wait=False, )) log.info('Disabling ceph logrotate...') run.wait( ctx.cluster.run( args=[ 'sudo', 'rm', '-f', '--', '/etc/logrotate.d/ceph', ], wait=False, )) log.info('Creating extra log directories...') run.wait( ctx.cluster.run( args=[ 'sudo', 'install', '-d', '-m0777', '--', '/var/log/ceph/valgrind', '/var/log/ceph/profiling-logger', ], wait=False, )) class Rotater(object): stop_event = gevent.event.Event() def invoke_logrotate(self): # 1) install ceph-test.conf in /etc/logrotate.d # 2) continuously loop over logrotate invocation with ceph-test.conf while not self.stop_event.is_set(): self.stop_event.wait(timeout=30) run.wait( ctx.cluster.run( args=[ 'sudo', 'logrotate', '/etc/logrotate.d/ceph-test.conf' ], wait=False, )) def begin(self): self.thread = gevent.spawn(self.invoke_logrotate) def end(self): self.stop_event.set() self.thread.get() def write_rotate_conf(ctx, daemons): testdir = teuthology.get_testdir(ctx) rotate_conf_path = os.path.join(os.path.dirname(__file__), 'logrotate.conf') with file(rotate_conf_path, 'rb') as f: conf = "" for daemon, size in daemons.iteritems(): log.info('writing logrotate stanza for {daemon}'.format( daemon=daemon)) conf += f.read().format(daemon_type=daemon, max_size=size) f.seek(0, 0) for remote in ctx.cluster.remotes.iterkeys(): teuthology.write_file( remote=remote, path='{tdir}/logrotate.ceph-test.conf'.format( tdir=testdir), data=StringIO(conf)) remote.run(args=[ 'sudo', 'mv', '{tdir}/logrotate.ceph-test.conf'.format( tdir=testdir), '/etc/logrotate.d/ceph-test.conf', run.Raw('&&'), 'sudo', 'chmod', '0644', '/etc/logrotate.d/ceph-test.conf', run.Raw('&&'), 'sudo', 'chown', 'root.root', '/etc/logrotate.d/ceph-test.conf' ]) remote.chcon('/etc/logrotate.d/ceph-test.conf', 'system_u:object_r:etc_t:s0') if ctx.config.get('log-rotate'): daemons = ctx.config.get('log-rotate') log.info('Setting up log rotation with ' + str(daemons)) write_rotate_conf(ctx, daemons) logrotater = Rotater() logrotater.begin() try: yield finally: if ctx.config.get('log-rotate'): log.info('Shutting down logrotate') logrotater.end() ctx.cluster.run( args=['sudo', 'rm', '/etc/logrotate.d/ceph-test.conf']) if ctx.archive is not None and \ not (ctx.config.get('archive-on-error') and ctx.summary['success']): # and logs log.info('Compressing logs...') run.wait( ctx.cluster.run( args=[ 'sudo', 'find', '/var/log/ceph', '-name', '*.log', '-print0', run.Raw('|'), 'sudo', 'xargs', '-0', '--no-run-if-empty', '--', 'gzip', '--', ], wait=False, ), ) log.info('Archiving logs...') path = os.path.join(ctx.archive, 'remote') os.makedirs(path) for remote in ctx.cluster.remotes.iterkeys(): sub = os.path.join(path, remote.shortname) os.makedirs(sub) teuthology.pull_directory(remote, '/var/log/ceph', os.path.join(sub, 'log'))
def ceph_log(ctx, config):
    cluster_name = config['cluster']
    fsid = ctx.ceph[cluster_name].fsid

    try:
        yield

    except Exception:
        # we need to know this below
        ctx.summary['success'] = False
        raise

    finally:
        log.info('Checking cluster log for badness...')

        def first_in_ceph_log(pattern, excludes):
            """
            Find the first occurrence of the pattern specified in the Ceph log,
            Returns None if none found.

            :param pattern: Pattern scanned for.
            :param excludes: Patterns to ignore.
            :return: First line of text (or None if not found)
            """
            args = [
                'sudo',
                'egrep', pattern,
                '/var/log/ceph/{fsid}/ceph.log'.format(fsid=fsid),
            ]
            if excludes:
                for exclude in excludes:
                    args.extend([run.Raw('|'), 'egrep', '-v', exclude])
            args.extend([
                run.Raw('|'), 'head', '-n', '1',
            ])
            r = ctx.ceph[cluster_name].bootstrap_remote.run(
                stdout=StringIO(),
                args=args,
            )
            stdout = r.stdout.getvalue()
            if stdout != '':
                return stdout
            return None

        if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]',
                             config.get('log-whitelist')) is not None:
            log.warning('Found errors (ERR|WRN|SEC) in cluster log')
            ctx.summary['success'] = False
            # use the most severe problem as the failure reason
            if 'failure_reason' not in ctx.summary:
                for pattern in ['\[SEC\]', '\[ERR\]', '\[WRN\]']:
                    match = first_in_ceph_log(pattern, config['log-whitelist'])
                    if match is not None:
                        ctx.summary['failure_reason'] = \
                            '"{match}" in cluster log'.format(
                                match=match.rstrip('\n'),
                            )
                        break

        if ctx.archive is not None and \
                not (ctx.config.get('archive-on-error') and ctx.summary['success']):
            # and logs
            log.info('Compressing logs...')
            run.wait(
                ctx.cluster.run(
                    args=[
                        'sudo', 'find',
                        '/var/log/ceph',  # all logs, not just for the cluster
                        '-name', '*.log', '-print0',
                        run.Raw('|'),
                        'sudo', 'xargs', '-0', '--no-run-if-empty', '--',
                        'gzip', '--',
                    ],
                    wait=False,
                ),
            )

            log.info('Archiving logs...')
            path = os.path.join(ctx.archive, 'remote')
            try:
                os.makedirs(path)
            except OSError:
                pass
            for remote in ctx.cluster.remotes.keys():
                sub = os.path.join(path, remote.name)
                try:
                    os.makedirs(sub)
                except OSError:
                    pass
                try:
                    teuthology.pull_directory(remote,
                                              '/var/log/ceph',  # everything
                                              os.path.join(sub, 'log'))
                except ReadError:
                    pass
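# Illustrative sketch (not part of the task above): the same "first bad line"
# scan that first_in_ceph_log() does with egrep over ssh, done locally with
# the re module.  The sample log text and whitelist pattern are made up.
import re

def first_bad_line(log_text, pattern, excludes=()):
    """Return the first line matching `pattern` that matches no exclude."""
    for line in log_text.splitlines():
        if not re.search(pattern, line):
            continue
        if any(re.search(exclude, line) for exclude in excludes):
            continue
        return line
    return None

sample = (
    "cluster [WRN] overall HEALTH_WARN 1 osds down\n"
    "cluster [ERR] scrub mismatch\n"
)
# With the WRN line whitelisted, the ERR line becomes the failure reason.
assert first_bad_line(sample, r'\[ERR\]|\[WRN\]|\[SEC\]',
                      excludes=[r'osds down']) == "cluster [ERR] scrub mismatch"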
def build_ceph_cluster(ctx, config): """Build a ceph cluster""" try: log.info('Building ceph cluster using ceph-deploy...') testdir = teuthology.get_testdir(ctx) ceph_branch = None if config.get('branch') is not None: cbranch = config.get('branch') for var, val in cbranch.iteritems(): if var == 'testing': ceph_branch = '--{var}'.format(var=var) ceph_branch = '--{var}={val}'.format(var=var, val=val) node_dev_list = [] all_nodes = get_all_nodes(ctx, config) mds_nodes = get_nodes_using_roles(ctx, config, 'mds') mds_nodes = " ".join(mds_nodes) mon_node = get_nodes_using_roles(ctx, config, 'mon') mon_nodes = " ".join(mon_node) new_mon = './ceph-deploy new'+" "+mon_nodes install_nodes = './ceph-deploy install '+ceph_branch+" "+all_nodes purge_nodes = './ceph-deploy purge'+" "+all_nodes purgedata_nodes = './ceph-deploy purgedata'+" "+all_nodes mon_hostname = mon_nodes.split(' ')[0] mon_hostname = str(mon_hostname) gather_keys = './ceph-deploy gatherkeys'+" "+mon_hostname deploy_mds = './ceph-deploy mds create'+" "+mds_nodes no_of_osds = 0 if mon_nodes is None: raise RuntimeError("no monitor nodes in the config file") estatus_new = execute_ceph_deploy(ctx, config, new_mon) if estatus_new != 0: raise RuntimeError("ceph-deploy: new command failed") log.info('adding config inputs...') testdir = teuthology.get_testdir(ctx) conf_path = '{tdir}/ceph-deploy/ceph.conf'.format(tdir=testdir) first_mon = teuthology.get_first_mon(ctx, config) (remote,) = ctx.cluster.only(first_mon).remotes.keys() lines = None if config.get('conf') is not None: confp = config.get('conf') for section, keys in confp.iteritems(): lines = '[{section}]\n'.format(section=section) teuthology.append_lines_to_file(remote, conf_path, lines, sudo=True) for key, value in keys.iteritems(): log.info("[%s] %s = %s" % (section, key, value)) lines = '{key} = {value}\n'.format(key=key, value=value) teuthology.append_lines_to_file(remote, conf_path, lines, sudo=True) estatus_install = execute_ceph_deploy(ctx, config, install_nodes) if estatus_install != 0: raise RuntimeError("ceph-deploy: Failed to install ceph") mon_create_nodes = './ceph-deploy mon create-initial' # If the following fails, it is OK, it might just be that the monitors # are taking way more than a minute/monitor to form quorum, so lets # try the next block which will wait up to 15 minutes to gatherkeys. 
execute_ceph_deploy(ctx, config, mon_create_nodes) estatus_gather = execute_ceph_deploy(ctx, config, gather_keys) max_gather_tries = 90 gather_tries = 0 while (estatus_gather != 0): gather_tries += 1 if gather_tries >= max_gather_tries: msg = 'ceph-deploy was not able to gatherkeys after 15 minutes' raise RuntimeError(msg) estatus_gather = execute_ceph_deploy(ctx, config, gather_keys) time.sleep(10) if mds_nodes: estatus_mds = execute_ceph_deploy(ctx, config, deploy_mds) if estatus_mds != 0: raise RuntimeError("ceph-deploy: Failed to deploy mds") if config.get('test_mon_destroy') is not None: for d in range(1, len(mon_node)): mon_destroy_nodes = './ceph-deploy mon destroy'+" "+mon_node[d] estatus_mon_d = execute_ceph_deploy(ctx, config, mon_destroy_nodes) if estatus_mon_d != 0: raise RuntimeError("ceph-deploy: Failed to delete monitor") node_dev_list = get_dev_for_osd(ctx, config) osd_create_cmd = './ceph-deploy osd create --zap-disk ' for d in node_dev_list: if config.get('dmcrypt') is not None: osd_create_cmd_d = osd_create_cmd+'--dmcrypt'+" "+d else: osd_create_cmd_d = osd_create_cmd+d estatus_osd = execute_ceph_deploy(ctx, config, osd_create_cmd_d) if estatus_osd == 0: log.info('successfully created osd') no_of_osds += 1 else: disks = [] disks = d.split(':') dev_disk = disks[0]+":"+disks[1] j_disk = disks[0]+":"+disks[2] zap_disk = './ceph-deploy disk zap '+dev_disk+" "+j_disk execute_ceph_deploy(ctx, config, zap_disk) estatus_osd = execute_ceph_deploy(ctx, config, osd_create_cmd_d) if estatus_osd == 0: log.info('successfully created osd') no_of_osds += 1 else: raise RuntimeError("ceph-deploy: Failed to create osds") if config.get('wait-for-healthy', True) and no_of_osds >= 2: is_healthy(ctx=ctx, config=None) log.info('Setting up client nodes...') conf_path = '/etc/ceph/ceph.conf' admin_keyring_path = '/etc/ceph/ceph.client.admin.keyring' first_mon = teuthology.get_first_mon(ctx, config) (mon0_remote,) = ctx.cluster.only(first_mon).remotes.keys() conf_data = teuthology.get_file( remote=mon0_remote, path=conf_path, sudo=True, ) admin_keyring = teuthology.get_file( remote=mon0_remote, path=admin_keyring_path, sudo=True, ) clients = ctx.cluster.only(teuthology.is_type('client')) for remot, roles_for_host in clients.remotes.iteritems(): for id_ in teuthology.roles_of_type(roles_for_host, 'client'): client_keyring = \ '/etc/ceph/ceph.client.{id}.keyring'.format(id=id_) mon0_remote.run( args=[ 'cd', '{tdir}'.format(tdir=testdir), run.Raw('&&'), 'sudo', 'bash', '-c', run.Raw('"'), 'ceph', 'auth', 'get-or-create', 'client.{id}'.format(id=id_), 'mds', 'allow', 'mon', 'allow *', 'osd', 'allow *', run.Raw('>'), client_keyring, run.Raw('"'), ], ) key_data = teuthology.get_file( remote=mon0_remote, path=client_keyring, sudo=True, ) teuthology.sudo_write_file( remote=remot, path=client_keyring, data=key_data, perms='0644' ) teuthology.sudo_write_file( remote=remot, path=admin_keyring_path, data=admin_keyring, perms='0644' ) teuthology.sudo_write_file( remote=remot, path=conf_path, data=conf_data, perms='0644' ) else: raise RuntimeError( "The cluster is NOT operational due to insufficient OSDs") yield finally: log.info('Stopping ceph...') ctx.cluster.run(args=['sudo', 'stop', 'ceph-all', run.Raw('||'), 'sudo', 'service', 'ceph', 'stop' ]) # Are you really not running anymore? 
# try first with the init tooling # ignoring the status so this becomes informational only ctx.cluster.run(args=['sudo', 'status', 'ceph-all', run.Raw('||'), 'sudo', 'service', 'ceph', 'status'], check_status=False) # and now just check for the processes themselves, as if upstart/sysvinit # is lying to us. Ignore errors if the grep fails ctx.cluster.run(args=['sudo', 'ps', 'aux', run.Raw('|'), 'grep', '-v', 'grep', run.Raw('|'), 'grep', 'ceph'], check_status=False) if ctx.archive is not None: # archive mon data, too log.info('Archiving mon data...') path = os.path.join(ctx.archive, 'data') os.makedirs(path) mons = ctx.cluster.only(teuthology.is_type('mon')) for remote, roles in mons.remotes.iteritems(): for role in roles: if role.startswith('mon.'): teuthology.pull_directory_tarball( remote, '/var/lib/ceph/mon', path + '/' + role + '.tgz') log.info('Compressing logs...') run.wait( ctx.cluster.run( args=[ 'sudo', 'find', '/var/log/ceph', '-name', '*.log', '-print0', run.Raw('|'), 'sudo', 'xargs', '-0', '--no-run-if-empty', '--', 'gzip', '--', ], wait=False, ), ) log.info('Archiving logs...') path = os.path.join(ctx.archive, 'remote') os.makedirs(path) for remote in ctx.cluster.remotes.iterkeys(): sub = os.path.join(path, remote.shortname) os.makedirs(sub) teuthology.pull_directory(remote, '/var/log/ceph', os.path.join(sub, 'log')) # Prevent these from being undefined if the try block fails all_nodes = get_all_nodes(ctx, config) purge_nodes = './ceph-deploy purge'+" "+all_nodes purgedata_nodes = './ceph-deploy purgedata'+" "+all_nodes log.info('Purging package...') execute_ceph_deploy(ctx, config, purge_nodes) log.info('Purging data...') execute_ceph_deploy(ctx, config, purgedata_nodes)
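# Illustrative only: a minimal sketch of the task configuration that the
# build_ceph_cluster() variant above consumes. Only the key names are taken
# from the config.get() calls in the code; every value shown here is an
# assumption for illustration, not from any real suite.
example_ceph_deploy_config = {
    'branch': {'stable': 'hammer'},        # turned into '--stable=hammer' for install
    'conf': {                              # sections appended to ceph-deploy's ceph.conf
        'global': {'osd pool default size': '2'},
        'mon': {'debug mon': '10'},
    },
    'dmcrypt': None,                       # set to add '--dmcrypt' to 'osd create'
    'test_mon_destroy': None,              # set to exercise 'mon destroy' after deploy
    'wait-for-healthy': True,              # wait for HEALTH_OK once >= 2 OSDs exist
}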
def task(ctx, config): """ Test monitor recovery from OSD """ if config is None: config = {} assert isinstance(config, dict), \ 'task only accepts a dict for configuration' first_mon = teuthology.get_first_mon(ctx, config) (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys() manager = ceph_manager.CephManager( mon, ctx=ctx, logger=log.getChild('ceph_manager')) mons = ctx.cluster.only(teuthology.is_type('mon')) assert mons # note down the first cluster_name and mon_id # we will recover it later on cluster_name = None mon_id = None for remote, roles in mons.remotes.iteritems(): is_mon = teuthology.is_type('mon') for role in roles: if not is_mon(role): continue cluster, _, m = teuthology.split_role(role) if cluster_name is None: cluster_name = cluster mon_id = m assert cluster_name == cluster log.info('killing {cluster}:mon.{mon}'.format( cluster=cluster, mon=m)) manager.kill_mon(m) mon_data = os.path.join('/var/lib/ceph/mon/', '{0}-{1}'.format(cluster_name, m)) if m == mon_id: # so we will only need to recreate the store.db for the # first mon, would be easier than mkfs on it then replace # the its store.db with the recovered one store_dir = os.path.join(mon_data, 'store.db') remote.run(args=['sudo', 'rm', '-r', store_dir]) else: remote.run(args=['sudo', 'rm', '-r', mon_data]) local_mstore = tempfile.mkdtemp() # collect the maps from all OSDs osds = ctx.cluster.only(teuthology.is_type('osd')) assert osds for osd, roles in osds.remotes.iteritems(): is_osd = teuthology.is_type('osd') for role in roles: if not is_osd(role): continue cluster, _, osd_id = teuthology.split_role(role) assert cluster_name == cluster log.info('collecting maps from {cluster}:osd.{osd}'.format( cluster=cluster, osd=osd_id)) # push leveldb to OSD osd_mstore = os.path.join(teuthology.get_testdir(ctx), 'mon-store') osd.run(args=['sudo', 'mkdir', '-m', 'o+x', '-p', osd_mstore]) push_directory(local_mstore, osd, osd_mstore) log.info('rm -rf {0}'.format(local_mstore)) shutil.rmtree(local_mstore) # update leveldb with OSD data options = '--op update-mon-db --mon-store-path {0}' log.info('cot {0}'.format(osd_mstore)) manager.objectstore_tool(pool=None, options=options.format(osd_mstore), args='', osd=osd_id, do_revive=False) # pull the updated mon db log.info('pull dir {0} -> {1}'.format(osd_mstore, local_mstore)) local_mstore = tempfile.mkdtemp() teuthology.pull_directory(osd, osd_mstore, local_mstore) log.info('rm -rf osd:{0}'.format(osd_mstore)) osd.run(args=['sudo', 'rm', '-fr', osd_mstore]) # recover the first_mon with re-built mon db # pull from recovered leveldb from client mon_store_dir = os.path.join('/var/lib/ceph/mon', '{0}-{1}'.format(cluster_name, mon_id)) push_directory(local_mstore, mon, mon_store_dir) mon.run(args=['sudo', 'chown', '-R', 'ceph:ceph', mon_store_dir]) shutil.rmtree(local_mstore) default_keyring = '/etc/ceph/{cluster}.keyring'.format( cluster=cluster_name) keyring_path = config.get('keyring_path', default_keyring) # fill up the caps in the keyring file mon.run(args=['sudo', 'ceph-authtool', keyring_path, '-n', 'mon.', '--cap', 'mon', 'allow *']) mon.run(args=['sudo', 'ceph-authtool', keyring_path, '-n', 'client.admin', '--cap', 'mon', 'allow *', '--cap', 'osd', 'allow *', '--cap', 'mds', 'allow *']) mon.run(args=['sudo', '-u', 'ceph', 'ceph-monstore-tool', mon_store_dir, 'rebuild', '--', '--keyring', keyring_path]) # revive monitors # the initial monmap is in the ceph.conf, so we are good. 
    n_mons = 0
    for remote, roles in mons.remotes.iteritems():
        is_mon = teuthology.is_type('mon')
        for role in roles:
            if not is_mon(role):
                continue
            cluster, _, m = teuthology.split_role(role)
            assert cluster_name == cluster
            if mon_id != m:
                log.info('running mkfs on {cluster}:mon.{mon}'.format(
                    cluster=cluster, mon=m))
                remote.run(
                    args=[
                        'sudo', 'ceph-mon', '--cluster', cluster,
                        '--mkfs', '-i', m,
                        '--keyring', keyring_path])
            manager.revive_mon(m)
            n_mons += 1
    manager.wait_for_mon_quorum_size(n_mons, timeout=30)

    for osd, roles in osds.remotes.iteritems():
        is_osd = teuthology.is_type('osd')
        for role in roles:
            if not is_osd(role):
                continue
            _, _, osd_id = teuthology.split_role(role)
            log.info('reviving osd.{0}'.format(osd_id))
            manager.revive_osd(osd_id)
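# Illustrative stand-in showing the (cluster, type, id) shape the loops in the
# recovery task above expect from teuthology.split_role(); the real helper
# lives in teuthology/misc.py, and this sketch only mirrors its assumed
# behaviour for role strings like 'mon.a' or 'ceph.osd.0'.
def split_role_sketch(role, default_cluster='ceph'):
    parts = role.split('.')
    if len(parts) == 2:          # e.g. 'mon.a' -> ('ceph', 'mon', 'a')
        return (default_cluster, parts[0], parts[1])
    return tuple(parts[:3])      # e.g. 'ceph.osd.0' -> ('ceph', 'osd', '0')

assert split_role_sketch('mon.a') == ('ceph', 'mon', 'a')
assert split_role_sketch('ceph.osd.0') == ('ceph', 'osd', '0')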
def build_ceph_cluster(ctx, config): """Build a ceph cluster""" # Expect to find ceph_admin on the first mon by ID, same place that the download task # puts it. Remember this here, because subsequently IDs will change from those in # the test config to those that ceph-deploy invents. (ceph_admin,) = ctx.cluster.only(teuthology.get_first_mon(ctx, config)).remotes.iterkeys() def execute_ceph_deploy(cmd): """Remotely execute a ceph_deploy command""" return ceph_admin.run( args=["cd", "{tdir}/ceph-deploy".format(tdir=testdir), run.Raw("&&"), run.Raw(cmd)], check_status=False ).exitstatus try: log.info("Building ceph cluster using ceph-deploy...") testdir = teuthology.get_testdir(ctx) ceph_branch = None if config.get("branch") is not None: cbranch = config.get("branch") for var, val in cbranch.iteritems(): ceph_branch = "--{var}={val}".format(var=var, val=val) all_nodes = get_all_nodes(ctx, config) mds_nodes = get_nodes_using_role(ctx, "mds") mds_nodes = " ".join(mds_nodes) mon_node = get_nodes_using_role(ctx, "mon") mon_nodes = " ".join(mon_node) new_mon = "./ceph-deploy new" + " " + mon_nodes mon_hostname = mon_nodes.split(" ")[0] mon_hostname = str(mon_hostname) gather_keys = "./ceph-deploy gatherkeys" + " " + mon_hostname deploy_mds = "./ceph-deploy mds create" + " " + mds_nodes no_of_osds = 0 if mon_nodes is None: raise RuntimeError("no monitor nodes in the config file") estatus_new = execute_ceph_deploy(new_mon) if estatus_new != 0: raise RuntimeError("ceph-deploy: new command failed") log.info("adding config inputs...") testdir = teuthology.get_testdir(ctx) conf_path = "{tdir}/ceph-deploy/ceph.conf".format(tdir=testdir) if config.get("conf") is not None: confp = config.get("conf") for section, keys in confp.iteritems(): lines = "[{section}]\n".format(section=section) teuthology.append_lines_to_file(ceph_admin, conf_path, lines, sudo=True) for key, value in keys.iteritems(): log.info("[%s] %s = %s" % (section, key, value)) lines = "{key} = {value}\n".format(key=key, value=value) teuthology.append_lines_to_file(ceph_admin, conf_path, lines, sudo=True) # install ceph install_nodes = "./ceph-deploy install " + (ceph_branch if ceph_branch else "--dev=master") + " " + all_nodes estatus_install = execute_ceph_deploy(install_nodes) if estatus_install != 0: raise RuntimeError("ceph-deploy: Failed to install ceph") # install ceph-test package too install_nodes2 = ( "./ceph-deploy install --tests " + (ceph_branch if ceph_branch else "--dev=master") + " " + all_nodes ) estatus_install = execute_ceph_deploy(install_nodes2) if estatus_install != 0: raise RuntimeError("ceph-deploy: Failed to install ceph-test") mon_create_nodes = "./ceph-deploy mon create-initial" # If the following fails, it is OK, it might just be that the monitors # are taking way more than a minute/monitor to form quorum, so lets # try the next block which will wait up to 15 minutes to gatherkeys. 
execute_ceph_deploy(mon_create_nodes) estatus_gather = execute_ceph_deploy(gather_keys) max_gather_tries = 90 gather_tries = 0 while estatus_gather != 0: gather_tries += 1 if gather_tries >= max_gather_tries: msg = "ceph-deploy was not able to gatherkeys after 15 minutes" raise RuntimeError(msg) estatus_gather = execute_ceph_deploy(gather_keys) time.sleep(10) if mds_nodes: estatus_mds = execute_ceph_deploy(deploy_mds) if estatus_mds != 0: raise RuntimeError("ceph-deploy: Failed to deploy mds") if config.get("test_mon_destroy") is not None: for d in range(1, len(mon_node)): mon_destroy_nodes = "./ceph-deploy mon destroy" + " " + mon_node[d] estatus_mon_d = execute_ceph_deploy(mon_destroy_nodes) if estatus_mon_d != 0: raise RuntimeError("ceph-deploy: Failed to delete monitor") node_dev_list = get_dev_for_osd(ctx, config) for d in node_dev_list: node = d[0] for disk in d[1:]: zap = "./ceph-deploy disk zap " + node + ":" + disk estatus = execute_ceph_deploy(zap) if estatus != 0: raise RuntimeError("ceph-deploy: Failed to zap osds") osd_create_cmd = "./ceph-deploy osd create " if config.get("dmcrypt") is not None: osd_create_cmd += "--dmcrypt " osd_create_cmd += ":".join(d) estatus_osd = execute_ceph_deploy(osd_create_cmd) if estatus_osd == 0: log.info("successfully created osd") no_of_osds += 1 else: raise RuntimeError("ceph-deploy: Failed to create osds") if config.get("wait-for-healthy", True) and no_of_osds >= 2: is_healthy(ctx=ctx, config=None) log.info("Setting up client nodes...") conf_path = "/etc/ceph/ceph.conf" admin_keyring_path = "/etc/ceph/ceph.client.admin.keyring" first_mon = teuthology.get_first_mon(ctx, config) (mon0_remote,) = ctx.cluster.only(first_mon).remotes.keys() conf_data = teuthology.get_file(remote=mon0_remote, path=conf_path, sudo=True) admin_keyring = teuthology.get_file(remote=mon0_remote, path=admin_keyring_path, sudo=True) clients = ctx.cluster.only(teuthology.is_type("client")) for remot, roles_for_host in clients.remotes.iteritems(): for id_ in teuthology.roles_of_type(roles_for_host, "client"): client_keyring = "/etc/ceph/ceph.client.{id}.keyring".format(id=id_) mon0_remote.run( args=[ "cd", "{tdir}".format(tdir=testdir), run.Raw("&&"), "sudo", "bash", "-c", run.Raw('"'), "ceph", "auth", "get-or-create", "client.{id}".format(id=id_), "mds", "allow", "mon", "allow *", "osd", "allow *", run.Raw(">"), client_keyring, run.Raw('"'), ] ) key_data = teuthology.get_file(remote=mon0_remote, path=client_keyring, sudo=True) teuthology.sudo_write_file(remote=remot, path=client_keyring, data=key_data, perms="0644") teuthology.sudo_write_file(remote=remot, path=admin_keyring_path, data=admin_keyring, perms="0644") teuthology.sudo_write_file(remote=remot, path=conf_path, data=conf_data, perms="0644") if mds_nodes: log.info("Configuring CephFS...") ceph_fs = Filesystem(ctx, admin_remote=clients.remotes.keys()[0]) if not ceph_fs.legacy_configured(): ceph_fs.create() elif not config.get("only_mon"): raise RuntimeError("The cluster is NOT operational due to insufficient OSDs") yield except Exception: log.info("Error encountered, logging exception before tearing down ceph-deploy") log.info(traceback.format_exc()) raise finally: if config.get("keep_running"): return log.info("Stopping ceph...") ctx.cluster.run( args=[ "sudo", "stop", "ceph-all", run.Raw("||"), "sudo", "service", "ceph", "stop", run.Raw("||"), "sudo", "systemctl", "stop", "ceph.target", ] ) # Are you really not running anymore? 
# try first with the init tooling # ignoring the status so this becomes informational only ctx.cluster.run( args=[ "sudo", "status", "ceph-all", run.Raw("||"), "sudo", "service", "ceph", "status", run.Raw("||"), "sudo", "systemctl", "status", "ceph.target", ], check_status=False, ) # and now just check for the processes themselves, as if upstart/sysvinit # is lying to us. Ignore errors if the grep fails ctx.cluster.run( args=["sudo", "ps", "aux", run.Raw("|"), "grep", "-v", "grep", run.Raw("|"), "grep", "ceph"], check_status=False, ) if ctx.archive is not None: # archive mon data, too log.info("Archiving mon data...") path = os.path.join(ctx.archive, "data") os.makedirs(path) mons = ctx.cluster.only(teuthology.is_type("mon")) for remote, roles in mons.remotes.iteritems(): for role in roles: if role.startswith("mon."): teuthology.pull_directory_tarball(remote, "/var/lib/ceph/mon", path + "/" + role + ".tgz") log.info("Compressing logs...") run.wait( ctx.cluster.run( args=[ "sudo", "find", "/var/log/ceph", "-name", "*.log", "-print0", run.Raw("|"), "sudo", "xargs", "-0", "--no-run-if-empty", "--", "gzip", "--", ], wait=False, ) ) log.info("Archiving logs...") path = os.path.join(ctx.archive, "remote") os.makedirs(path) for remote in ctx.cluster.remotes.iterkeys(): sub = os.path.join(path, remote.shortname) os.makedirs(sub) teuthology.pull_directory(remote, "/var/log/ceph", os.path.join(sub, "log")) # Prevent these from being undefined if the try block fails all_nodes = get_all_nodes(ctx, config) purge_nodes = "./ceph-deploy purge" + " " + all_nodes purgedata_nodes = "./ceph-deploy purgedata" + " " + all_nodes log.info("Purging package...") execute_ceph_deploy(purge_nodes) log.info("Purging data...") execute_ceph_deploy(purgedata_nodes)
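# Sketch of the command strings the osd-creation loop in the variant above
# builds, assuming get_dev_for_osd() (not shown here) yields entries shaped
# like ['node', 'data_disk', 'journal_disk']. Hostnames and device names below
# are made up for illustration.
d = ['vpm001', 'sdb', 'sdc']
node, disks = d[0], d[1:]
zap_cmds = ['./ceph-deploy disk zap ' + node + ':' + disk for disk in disks]
osd_create_cmd = './ceph-deploy osd create ' + ':'.join(d)
assert zap_cmds == ['./ceph-deploy disk zap vpm001:sdb',
                    './ceph-deploy disk zap vpm001:sdc']
assert osd_create_cmd == './ceph-deploy osd create vpm001:sdb:sdc'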
def cluster(ctx, config): """ Handle the creation and removal of a ceph cluster. On startup: Create directories needed for the cluster. Create remote journals for all osds. Create and set keyring. Copy the monmap to tht test systems. Setup mon nodes. Setup mds nodes. Mkfs osd nodes. Add keyring information to monmaps Mkfs mon nodes. On exit: If errors occured, extract a failure message and store in ctx.summary. Unmount all test files and temporary journaling files. Save the monitor information and archive all ceph logs. Cleanup the keyring setup, and remove all monitor map and data files left over. :param ctx: Context :param config: Configuration """ testdir = teuthology.get_testdir(ctx) log.info('Creating ceph cluster...') run.wait( ctx.cluster.run( args=[ 'install', '-d', '-m0755', '--', '{tdir}/data'.format(tdir=testdir), ], wait=False, ) ) run.wait( ctx.cluster.run( args=[ 'sudo', 'install', '-d', '-m0777', '--', '/var/run/ceph', ], wait=False, ) ) devs_to_clean = {} remote_to_roles_to_devs = {} remote_to_roles_to_journals = {} osds = ctx.cluster.only(teuthology.is_type('osd')) for remote, roles_for_host in osds.remotes.iteritems(): devs = teuthology.get_scratch_devices(remote) roles_to_devs = {} roles_to_journals = {} if config.get('fs'): log.info('fs option selected, checking for scratch devs') log.info('found devs: %s' % (str(devs),)) devs_id_map = teuthology.get_wwn_id_map(remote, devs) iddevs = devs_id_map.values() roles_to_devs = assign_devs( teuthology.roles_of_type(roles_for_host, 'osd'), iddevs ) if len(roles_to_devs) < len(iddevs): iddevs = iddevs[len(roles_to_devs):] devs_to_clean[remote] = [] if config.get('block_journal'): log.info('block journal enabled') roles_to_journals = assign_devs( teuthology.roles_of_type(roles_for_host, 'osd'), iddevs ) log.info('journal map: %s', roles_to_journals) if config.get('tmpfs_journal'): log.info('tmpfs journal enabled') roles_to_journals = {} remote.run( args=[ 'sudo', 'mount', '-t', 'tmpfs', 'tmpfs', '/mnt' ] ) for osd in teuthology.roles_of_type(roles_for_host, 'osd'): tmpfs = '/mnt/osd.%s' % osd roles_to_journals[osd] = tmpfs remote.run( args=[ 'truncate', '-s', '1500M', tmpfs ] ) log.info('journal map: %s', roles_to_journals) log.info('dev map: %s' % (str(roles_to_devs),)) remote_to_roles_to_devs[remote] = roles_to_devs remote_to_roles_to_journals[remote] = roles_to_journals log.info('Generating config...') remotes_and_roles = ctx.cluster.remotes.items() roles = [role_list for (remote, role_list) in remotes_and_roles] ips = [host for (host, port) in (remote.ssh.get_transport().getpeername() for (remote, role_list) in remotes_and_roles)] conf = teuthology.skeleton_config(ctx, roles=roles, ips=ips) for remote, roles_to_journals in remote_to_roles_to_journals.iteritems(): for role, journal in roles_to_journals.iteritems(): key = "osd." 
+ str(role) if key not in conf: conf[key] = {} conf[key]['osd journal'] = journal for section, keys in config['conf'].iteritems(): for key, value in keys.iteritems(): log.info("[%s] %s = %s" % (section, key, value)) if section not in conf: conf[section] = {} conf[section][key] = value if config.get('tmpfs_journal'): conf['journal dio'] = False ctx.ceph = argparse.Namespace() ctx.ceph.conf = conf conf_path = config.get('conf_path', '/etc/ceph/ceph.conf') keyring_path = config.get('keyring_path', '/etc/ceph/ceph.keyring') log.info('Writing configs...') conf_fp = StringIO() conf.write(conf_fp) conf_fp.seek(0) writes = ctx.cluster.run( args=[ 'sudo', 'mkdir', '-p', '/etc/ceph', run.Raw('&&'), 'sudo', 'chmod', '0755', '/etc/ceph', run.Raw('&&'), 'sudo', 'python', '-c', 'import shutil, sys; shutil.copyfileobj(sys.stdin, file(sys.argv[1], "wb"))', conf_path, run.Raw('&&'), 'sudo', 'chmod', '0644', conf_path, ], stdin=run.PIPE, wait=False, ) teuthology.feed_many_stdins_and_close(conf_fp, writes) run.wait(writes) coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir) firstmon = teuthology.get_first_mon(ctx, config) log.info('Setting up %s...' % firstmon) ctx.cluster.only(firstmon).run( args=[ 'sudo', 'adjust-ulimits', 'ceph-coverage', coverage_dir, 'ceph-authtool', '--create-keyring', keyring_path, ], ) ctx.cluster.only(firstmon).run( args=[ 'sudo', 'adjust-ulimits', 'ceph-coverage', coverage_dir, 'ceph-authtool', '--gen-key', '--name=mon.', keyring_path, ], ) ctx.cluster.only(firstmon).run( args=[ 'sudo', 'chmod', '0644', keyring_path, ], ) (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys() teuthology.create_simple_monmap( ctx, remote=mon0_remote, conf=conf, ) log.info('Creating admin key on %s...' % firstmon) ctx.cluster.only(firstmon).run( args=[ 'sudo', 'adjust-ulimits', 'ceph-coverage', coverage_dir, 'ceph-authtool', '--gen-key', '--name=client.admin', '--set-uid=0', '--cap', 'mon', 'allow *', '--cap', 'osd', 'allow *', '--cap', 'mds', 'allow', keyring_path, ], ) log.info('Copying monmap to all nodes...') keyring = teuthology.get_file( remote=mon0_remote, path=keyring_path, ) monmap = teuthology.get_file( remote=mon0_remote, path='{tdir}/monmap'.format(tdir=testdir), ) for rem in ctx.cluster.remotes.iterkeys(): # copy mon key and initial monmap log.info('Sending monmap to node {remote}'.format(remote=rem)) teuthology.sudo_write_file( remote=rem, path=keyring_path, data=keyring, perms='0644' ) teuthology.write_file( remote=rem, path='{tdir}/monmap'.format(tdir=testdir), data=monmap, ) log.info('Setting up mon nodes...') mons = ctx.cluster.only(teuthology.is_type('mon')) run.wait( mons.run( args=[ 'adjust-ulimits', 'ceph-coverage', coverage_dir, 'osdmaptool', '-c', conf_path, '--clobber', '--createsimple', '{num:d}'.format( num=teuthology.num_instances_of_type(ctx.cluster, 'osd'), ), '{tdir}/osdmap'.format(tdir=testdir), '--pg_bits', '2', '--pgp_bits', '4', ], wait=False, ), ) log.info('Setting up mds nodes...') mdss = ctx.cluster.only(teuthology.is_type('mds')) for remote, roles_for_host in mdss.remotes.iteritems(): for id_ in teuthology.roles_of_type(roles_for_host, 'mds'): remote.run( args=[ 'sudo', 'mkdir', '-p', '/var/lib/ceph/mds/ceph-{id}'.format(id=id_), run.Raw('&&'), 'sudo', 'adjust-ulimits', 'ceph-coverage', coverage_dir, 'ceph-authtool', '--create-keyring', '--gen-key', '--name=mds.{id}'.format(id=id_), '/var/lib/ceph/mds/ceph-{id}/keyring'.format(id=id_), ], ) cclient.create_keyring(ctx) log.info('Running mkfs on osd nodes...') ctx.disk_config = argparse.Namespace() 
ctx.disk_config.remote_to_roles_to_dev = remote_to_roles_to_devs ctx.disk_config.remote_to_roles_to_journals = remote_to_roles_to_journals ctx.disk_config.remote_to_roles_to_dev_mount_options = {} ctx.disk_config.remote_to_roles_to_dev_fstype = {} log.info("ctx.disk_config.remote_to_roles_to_dev: {r}".format(r=str(ctx.disk_config.remote_to_roles_to_dev))) for remote, roles_for_host in osds.remotes.iteritems(): roles_to_devs = remote_to_roles_to_devs[remote] roles_to_journals = remote_to_roles_to_journals[remote] for id_ in teuthology.roles_of_type(roles_for_host, 'osd'): remote.run( args=[ 'sudo', 'mkdir', '-p', '/var/lib/ceph/osd/ceph-{id}'.format(id=id_), ]) log.info(str(roles_to_journals)) log.info(id_) if roles_to_devs.get(id_): dev = roles_to_devs[id_] fs = config.get('fs') package = None mkfs_options = config.get('mkfs_options') mount_options = config.get('mount_options') if fs == 'btrfs': #package = 'btrfs-tools' if mount_options is None: mount_options = ['noatime','user_subvol_rm_allowed'] if mkfs_options is None: mkfs_options = ['-m', 'single', '-l', '32768', '-n', '32768'] if fs == 'xfs': #package = 'xfsprogs' if mount_options is None: mount_options = ['noatime'] if mkfs_options is None: mkfs_options = ['-f', '-i', 'size=2048'] if fs == 'ext4' or fs == 'ext3': if mount_options is None: mount_options = ['noatime','user_xattr'] if mount_options is None: mount_options = [] if mkfs_options is None: mkfs_options = [] mkfs = ['mkfs.%s' % fs] + mkfs_options log.info('%s on %s on %s' % (mkfs, dev, remote)) if package is not None: remote.run( args=[ 'sudo', 'apt-get', 'install', '-y', package ], stdout=StringIO(), ) remote.run(args= ['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev]) log.info('mount %s on %s -o %s' % (dev, remote, ','.join(mount_options))) remote.run( args=[ 'sudo', 'mount', '-t', fs, '-o', ','.join(mount_options), dev, os.path.join('/var/lib/ceph/osd', 'ceph-{id}'.format(id=id_)), ] ) if not remote in ctx.disk_config.remote_to_roles_to_dev_mount_options: ctx.disk_config.remote_to_roles_to_dev_mount_options[remote] = {} ctx.disk_config.remote_to_roles_to_dev_mount_options[remote][id_] = mount_options if not remote in ctx.disk_config.remote_to_roles_to_dev_fstype: ctx.disk_config.remote_to_roles_to_dev_fstype[remote] = {} ctx.disk_config.remote_to_roles_to_dev_fstype[remote][id_] = fs devs_to_clean[remote].append( os.path.join( os.path.join('/var/lib/ceph/osd', 'ceph-{id}'.format(id=id_)), ) ) for id_ in teuthology.roles_of_type(roles_for_host, 'osd'): remote.run( args=[ 'sudo', 'MALLOC_CHECK_=3', 'adjust-ulimits', 'ceph-coverage', coverage_dir, 'ceph-osd', '--mkfs', '--mkkey', '-i', id_, '--monmap', '{tdir}/monmap'.format(tdir=testdir), ], ) log.info('Reading keys from all nodes...') keys_fp = StringIO() keys = [] for remote, roles_for_host in ctx.cluster.remotes.iteritems(): for type_ in ['mds','osd']: for id_ in teuthology.roles_of_type(roles_for_host, type_): data = teuthology.get_file( remote=remote, path='/var/lib/ceph/{type}/ceph-{id}/keyring'.format( type=type_, id=id_, ), sudo=True, ) keys.append((type_, id_, data)) keys_fp.write(data) for remote, roles_for_host in ctx.cluster.remotes.iteritems(): for type_ in ['client']: for id_ in teuthology.roles_of_type(roles_for_host, type_): data = teuthology.get_file( remote=remote, path='/etc/ceph/ceph.client.{id}.keyring'.format(id=id_) ) keys.append((type_, id_, data)) keys_fp.write(data) log.info('Adding keys to all mons...') writes = mons.run( args=[ 'sudo', 'tee', '-a', keyring_path, ], stdin=run.PIPE, wait=False, 
stdout=StringIO(), ) keys_fp.seek(0) teuthology.feed_many_stdins_and_close(keys_fp, writes) run.wait(writes) for type_, id_, data in keys: run.wait( mons.run( args=[ 'sudo', 'adjust-ulimits', 'ceph-coverage', coverage_dir, 'ceph-authtool', keyring_path, '--name={type}.{id}'.format( type=type_, id=id_, ), ] + list(teuthology.generate_caps(type_)), wait=False, ), ) log.info('Running mkfs on mon nodes...') for remote, roles_for_host in mons.remotes.iteritems(): for id_ in teuthology.roles_of_type(roles_for_host, 'mon'): remote.run( args=[ 'sudo', 'mkdir', '-p', '/var/lib/ceph/mon/ceph-{id}'.format(id=id_), ], ) remote.run( args=[ 'sudo', 'adjust-ulimits', 'ceph-coverage', coverage_dir, 'ceph-mon', '--mkfs', '-i', id_, '--monmap={tdir}/monmap'.format(tdir=testdir), '--osdmap={tdir}/osdmap'.format(tdir=testdir), '--keyring={kpath}'.format(kpath=keyring_path), ], ) run.wait( mons.run( args=[ 'rm', '--', '{tdir}/monmap'.format(tdir=testdir), '{tdir}/osdmap'.format(tdir=testdir), ], wait=False, ), ) try: yield except Exception: # we need to know this below ctx.summary['success'] = False raise finally: (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys() log.info('Checking cluster log for badness...') def first_in_ceph_log(pattern, excludes): """ Find the first occurence of the pattern specified in the Ceph log, Returns None if none found. :param pattern: Pattern scanned for. :param excludes: Patterns to ignore. :return: First line of text (or None if not found) """ args = [ 'sudo', 'egrep', pattern, '/var/log/ceph/ceph.log', ] for exclude in excludes: args.extend([run.Raw('|'), 'egrep', '-v', exclude]) args.extend([ run.Raw('|'), 'head', '-n', '1', ]) r = mon0_remote.run( stdout=StringIO(), args=args, ) stdout = r.stdout.getvalue() if stdout != '': return stdout return None if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]', config['log_whitelist']) is not None: log.warning('Found errors (ERR|WRN|SEC) in cluster log') ctx.summary['success'] = False # use the most severe problem as the failure reason if 'failure_reason' not in ctx.summary: for pattern in ['\[SEC\]', '\[ERR\]', '\[WRN\]']: match = first_in_ceph_log(pattern, config['log_whitelist']) if match is not None: ctx.summary['failure_reason'] = \ '"{match}" in cluster log'.format( match=match.rstrip('\n'), ) break for remote, dirs in devs_to_clean.iteritems(): for dir_ in dirs: log.info('Unmounting %s on %s' % (dir_, remote)) remote.run( args=[ 'sync', run.Raw('&&'), 'sudo', 'umount', '-f', dir_ ] ) if config.get('tmpfs_journal'): log.info('tmpfs journal enabled - unmounting tmpfs at /mnt') for remote, roles_for_host in osds.remotes.iteritems(): remote.run( args=[ 'sudo', 'umount', '-f', '/mnt' ], check_status=False, ) if ctx.archive is not None and \ not (ctx.config.get('archive-on-error') and ctx.summary['success']): # archive mon data, too log.info('Archiving mon data...') path = os.path.join(ctx.archive, 'data') os.makedirs(path) for remote, roles in mons.remotes.iteritems(): for role in roles: if role.startswith('mon.'): teuthology.pull_directory_tarball( remote, '/var/lib/ceph/mon', path + '/' + role + '.tgz') # and logs log.info('Compressing logs...') run.wait( ctx.cluster.run( args=[ 'sudo', 'find', '/var/log/ceph', '-name', '*.log', '-print0', run.Raw('|'), 'sudo', 'xargs', '-0', '--no-run-if-empty', '--', 'gzip', '--', ], wait=False, ), ) log.info('Archiving logs...') path = os.path.join(ctx.archive, 'remote') os.makedirs(path) for remote in ctx.cluster.remotes.iterkeys(): sub = os.path.join(path, remote.shortname) 
                os.makedirs(sub)
                teuthology.pull_directory(remote, '/var/log/ceph',
                                           os.path.join(sub, 'log'))

        log.info('Cleaning ceph cluster...')
        run.wait(
            ctx.cluster.run(
                args=[
                    'sudo', 'rm', '-rf', '--',
                    conf_path,
                    keyring_path,
                    '{tdir}/data'.format(tdir=testdir),
                    '{tdir}/monmap'.format(tdir=testdir),
                ],
                wait=False,
            ),
        )
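# Minimal sketch of the configuration read by the cluster() task above; the
# key names come from the config.get()/config[...] accesses in the code, while
# the values are illustrative assumptions only.
example_cluster_config = {
    'fs': 'xfs',                                   # mkfs/mount scratch devices as xfs
    'mkfs_options': None,                          # xfs default: ['-f', '-i', 'size=2048']
    'mount_options': None,                         # xfs default: ['noatime']
    'block_journal': None,                         # set to use raw devices as journals
    'tmpfs_journal': None,                         # set to back journals with tmpfs
    'conf': {'global': {'osd heartbeat grace': '40'}},
    'conf_path': '/etc/ceph/ceph.conf',
    'keyring_path': '/etc/ceph/ceph.keyring',
    'log_whitelist': ['wrongly marked me down'],   # egrep -v patterns for the ceph.log scan
}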
def build_ceph_cluster(ctx, config): """Build a ceph cluster""" # Expect to find ceph_admin on the first mon by ID, same place that the download task # puts it. Remember this here, because subsequently IDs will change from those in # the test config to those that ceph-deploy invents. (ceph_admin,) = ctx.cluster.only('mon.a').remotes.keys() def execute_ceph_deploy(cmd): """Remotely execute a ceph_deploy command""" return ceph_admin.run( args=[ 'cd', '{tdir}/ceph-deploy'.format(tdir=testdir), run.Raw('&&'), run.Raw(cmd), ], check_status=False, ).exitstatus def ceph_disk_osd_create(ctx, config): node_dev_list = get_dev_for_osd(ctx, config) no_of_osds = 0 for d in node_dev_list: node = d[0] for disk in d[1:]: zap = './ceph-deploy disk zap ' + node + ' ' + disk estatus = execute_ceph_deploy(zap) if estatus != 0: raise RuntimeError("ceph-deploy: Failed to zap osds") osd_create_cmd = './ceph-deploy osd create ' # first check for filestore, default is bluestore with ceph-deploy if config.get('filestore') is not None: osd_create_cmd += '--filestore ' elif config.get('bluestore') is not None: osd_create_cmd += '--bluestore ' if config.get('dmcrypt') is not None: osd_create_cmd += '--dmcrypt ' osd_create_cmd += ":".join(d) estatus_osd = execute_ceph_deploy(osd_create_cmd) if estatus_osd == 0: log.info('successfully created osd') no_of_osds += 1 else: raise RuntimeError("ceph-deploy: Failed to create osds") return no_of_osds def ceph_volume_osd_create(ctx, config): osds = ctx.cluster.only(teuthology.is_type('osd')) no_of_osds = 0 for remote in osds.remotes.keys(): # all devs should be lvm osd_create_cmd = './ceph-deploy osd create --debug ' + remote.shortname + ' ' # default is bluestore so we just need config item for filestore roles = ctx.cluster.remotes[remote] dev_needed = len([role for role in roles if role.startswith('osd')]) all_devs = teuthology.get_scratch_devices(remote) log.info("node={n}, need_devs={d}, available={a}".format( n=remote.shortname, d=dev_needed, a=all_devs, )) devs = all_devs[0:dev_needed] # rest of the devices can be used for journal if required jdevs = dev_needed for device in devs: device_split = device.split('/') lv_device = device_split[-2] + '/' + device_split[-1] if config.get('filestore') is not None: osd_create_cmd += '--filestore --data ' + lv_device + ' ' # filestore with ceph-volume also needs journal disk try: jdevice = all_devs.pop(jdevs) except IndexError: raise RuntimeError("No device available for \ journal configuration") jdevice_split = jdevice.split('/') j_lv = jdevice_split[-2] + '/' + jdevice_split[-1] osd_create_cmd += '--journal ' + j_lv else: osd_create_cmd += ' --data ' + lv_device estatus_osd = execute_ceph_deploy(osd_create_cmd) if estatus_osd == 0: log.info('successfully created osd') no_of_osds += 1 else: raise RuntimeError("ceph-deploy: Failed to create osds") return no_of_osds try: log.info('Building ceph cluster using ceph-deploy...') testdir = teuthology.get_testdir(ctx) ceph_branch = None if config.get('branch') is not None: cbranch = config.get('branch') for var, val in cbranch.items(): ceph_branch = '--{var}={val}'.format(var=var, val=val) all_nodes = get_all_nodes(ctx, config) mds_nodes = get_nodes_using_role(ctx, 'mds') mds_nodes = " ".join(mds_nodes) mon_node = get_nodes_using_role(ctx, 'mon') mon_nodes = " ".join(mon_node) # skip mgr based on config item # this is needed when test uses latest code to install old ceph # versions skip_mgr = config.get('skip-mgr', False) if not skip_mgr: mgr_nodes = get_nodes_using_role(ctx, 'mgr') 
mgr_nodes = " ".join(mgr_nodes) new_mon = './ceph-deploy new' + " " + mon_nodes if not skip_mgr: mgr_create = './ceph-deploy mgr create' + " " + mgr_nodes mon_hostname = mon_nodes.split(' ')[0] mon_hostname = str(mon_hostname) gather_keys = './ceph-deploy gatherkeys' + " " + mon_hostname deploy_mds = './ceph-deploy mds create' + " " + mds_nodes if mon_nodes is None: raise RuntimeError("no monitor nodes in the config file") estatus_new = execute_ceph_deploy(new_mon) if estatus_new != 0: raise RuntimeError("ceph-deploy: new command failed") log.info('adding config inputs...') testdir = teuthology.get_testdir(ctx) conf_path = '{tdir}/ceph-deploy/ceph.conf'.format(tdir=testdir) if config.get('conf') is not None: confp = config.get('conf') for section, keys in confp.items(): lines = '[{section}]\n'.format(section=section) teuthology.append_lines_to_file(ceph_admin, conf_path, lines, sudo=True) for key, value in keys.items(): log.info("[%s] %s = %s" % (section, key, value)) lines = '{key} = {value}\n'.format(key=key, value=value) teuthology.append_lines_to_file( ceph_admin, conf_path, lines, sudo=True) # install ceph dev_branch = ctx.config['branch'] branch = '--dev={branch}'.format(branch=dev_branch) if ceph_branch: option = ceph_branch else: option = branch install_nodes = './ceph-deploy install ' + option + " " + all_nodes estatus_install = execute_ceph_deploy(install_nodes) if estatus_install != 0: raise RuntimeError("ceph-deploy: Failed to install ceph") # install ceph-test package too install_nodes2 = './ceph-deploy install --tests ' + option + \ " " + all_nodes estatus_install = execute_ceph_deploy(install_nodes2) if estatus_install != 0: raise RuntimeError("ceph-deploy: Failed to install ceph-test") mon_create_nodes = './ceph-deploy mon create-initial' # If the following fails, it is OK, it might just be that the monitors # are taking way more than a minute/monitor to form quorum, so lets # try the next block which will wait up to 15 minutes to gatherkeys. 
execute_ceph_deploy(mon_create_nodes) estatus_gather = execute_ceph_deploy(gather_keys) if estatus_gather != 0: raise RuntimeError("ceph-deploy: Failed during gather keys") # install admin key on mons (ceph-create-keys doesn't do this any more) mons = ctx.cluster.only(teuthology.is_type('mon')) for remote in mons.remotes.keys(): execute_ceph_deploy('./ceph-deploy admin ' + remote.shortname) # create osd's if config.get('use-ceph-volume', False): no_of_osds = ceph_volume_osd_create(ctx, config) else: # this method will only work with ceph-deploy v1.5.39 or older no_of_osds = ceph_disk_osd_create(ctx, config) if not skip_mgr: execute_ceph_deploy(mgr_create) if mds_nodes: estatus_mds = execute_ceph_deploy(deploy_mds) if estatus_mds != 0: raise RuntimeError("ceph-deploy: Failed to deploy mds") if config.get('test_mon_destroy') is not None: for d in range(1, len(mon_node)): mon_destroy_nodes = './ceph-deploy mon destroy' + \ " " + mon_node[d] estatus_mon_d = execute_ceph_deploy(mon_destroy_nodes) if estatus_mon_d != 0: raise RuntimeError("ceph-deploy: Failed to delete monitor") if config.get('wait-for-healthy', True) and no_of_osds >= 2: is_healthy(ctx=ctx, config=None) log.info('Setting up client nodes...') conf_path = '/etc/ceph/ceph.conf' admin_keyring_path = '/etc/ceph/ceph.client.admin.keyring' first_mon = teuthology.get_first_mon(ctx, config) (mon0_remote,) = ctx.cluster.only(first_mon).remotes.keys() conf_data = teuthology.get_file( remote=mon0_remote, path=conf_path, sudo=True, ) admin_keyring = teuthology.get_file( remote=mon0_remote, path=admin_keyring_path, sudo=True, ) clients = ctx.cluster.only(teuthology.is_type('client')) for remot, roles_for_host in clients.remotes.items(): for id_ in teuthology.roles_of_type(roles_for_host, 'client'): client_keyring = \ '/etc/ceph/ceph.client.{id}.keyring'.format(id=id_) mon0_remote.run( args=[ 'cd', '{tdir}'.format(tdir=testdir), run.Raw('&&'), 'sudo', 'bash', '-c', run.Raw('"'), 'ceph', 'auth', 'get-or-create', 'client.{id}'.format(id=id_), 'mds', 'allow', 'mon', 'allow *', 'osd', 'allow *', run.Raw('>'), client_keyring, run.Raw('"'), ], ) key_data = teuthology.get_file( remote=mon0_remote, path=client_keyring, sudo=True, ) teuthology.sudo_write_file( remote=remot, path=client_keyring, data=key_data, perms='0644' ) teuthology.sudo_write_file( remote=remot, path=admin_keyring_path, data=admin_keyring, perms='0644' ) teuthology.sudo_write_file( remote=remot, path=conf_path, data=conf_data, perms='0644' ) if mds_nodes: log.info('Configuring CephFS...') Filesystem(ctx, create=True) elif not config.get('only_mon'): raise RuntimeError( "The cluster is NOT operational due to insufficient OSDs") # create rbd pool ceph_admin.run( args=[ 'sudo', 'ceph', '--cluster', 'ceph', 'osd', 'pool', 'create', 'rbd', '128', '128'], check_status=False) ceph_admin.run( args=[ 'sudo', 'ceph', '--cluster', 'ceph', 'osd', 'pool', 'application', 'enable', 'rbd', 'rbd', '--yes-i-really-mean-it' ], check_status=False) yield except Exception: log.info( "Error encountered, logging exception before tearing down ceph-deploy") log.info(traceback.format_exc()) raise finally: if config.get('keep_running'): return log.info('Stopping ceph...') ctx.cluster.run(args=['sudo', 'systemctl', 'stop', 'ceph.target'], check_status=False) time.sleep(4) # and now just check for the processes themselves, as if upstart/sysvinit # is lying to us. 
Ignore errors if the grep fails ctx.cluster.run(args=['sudo', 'ps', 'aux', run.Raw('|'), 'grep', '-v', 'grep', run.Raw('|'), 'grep', 'ceph'], check_status=False) ctx.cluster.run(args=['sudo', 'systemctl', run.Raw('|'), 'grep', 'ceph'], check_status=False) if ctx.archive is not None: # archive mon data, too log.info('Archiving mon data...') path = os.path.join(ctx.archive, 'data') os.makedirs(path) mons = ctx.cluster.only(teuthology.is_type('mon')) for remote, roles in mons.remotes.items(): for role in roles: if role.startswith('mon.'): teuthology.pull_directory_tarball( remote, '/var/lib/ceph/mon', path + '/' + role + '.tgz') log.info('Compressing logs...') run.wait( ctx.cluster.run( args=[ 'sudo', 'find', '/var/log/ceph', '-name', '*.log', '-print0', run.Raw('|'), 'sudo', 'xargs', '-0', '--no-run-if-empty', '--', 'gzip', '--', ], wait=False, ), ) log.info('Archiving logs...') path = os.path.join(ctx.archive, 'remote') os.makedirs(path) for remote in ctx.cluster.remotes.keys(): sub = os.path.join(path, remote.shortname) os.makedirs(sub) teuthology.pull_directory(remote, '/var/log/ceph', os.path.join(sub, 'log')) # Prevent these from being undefined if the try block fails all_nodes = get_all_nodes(ctx, config) purge_nodes = './ceph-deploy purge' + " " + all_nodes purgedata_nodes = './ceph-deploy purgedata' + " " + all_nodes log.info('Purging package...') execute_ceph_deploy(purge_nodes) log.info('Purging data...') execute_ceph_deploy(purgedata_nodes)
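# Sketch of how ceph_volume_osd_create() above turns a scratch device path into
# the 'vg/lv' form it passes after '--data' (or '--journal'), assuming the
# scratch devices are LVM logical volumes; the device path is a made-up example.
device = '/dev/vg_nvme/lv_1'
device_split = device.split('/')
lv_device = device_split[-2] + '/' + device_split[-1]
assert lv_device == 'vg_nvme/lv_1'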
def ceph_log(ctx, config): """ Create /var/log/ceph log directory that is open to everyone. Add valgrind and profiling-logger directories. :param ctx: Context :param config: Configuration """ log.info('Making ceph log dir writeable by non-root...') run.wait( ctx.cluster.run( args=[ 'sudo', 'chmod', '777', '/var/log/ceph', ], wait=False, ) ) log.info('Disabling ceph logrotate...') run.wait( ctx.cluster.run( args=[ 'sudo', 'rm', '-f', '--', '/etc/logrotate.d/ceph', ], wait=False, ) ) log.info('Creating extra log directories...') run.wait( ctx.cluster.run( args=[ 'sudo', 'install', '-d', '-m0777', '--', '/var/log/ceph/valgrind', '/var/log/ceph/profiling-logger', ], wait=False, ) ) class Rotater(object): stop_event = gevent.event.Event() def invoke_logrotate(self): # 1) install ceph-test.conf in /etc/logrotate.d # 2) continuously loop over logrotate invocation with ceph-test.conf while not self.stop_event.is_set(): self.stop_event.wait(timeout=30) run.wait( ctx.cluster.run( args=['sudo', 'logrotate', '/etc/logrotate.d/ceph-test.conf' ], wait=False, ) ) def begin(self): self.thread = gevent.spawn(self.invoke_logrotate) def end(self): self.stop_event.set() self.thread.get() def write_rotate_conf(ctx, daemons): testdir = teuthology.get_testdir(ctx) rotate_conf_path = os.path.join(os.path.dirname(__file__), 'logrotate.conf') with file(rotate_conf_path, 'rb') as f: conf = "" for daemon, size in daemons.iteritems(): log.info('writing logrotate stanza for {daemon}'.format(daemon=daemon)) conf += f.read().format(daemon_type=daemon, max_size=size) f.seek(0, 0) for remote in ctx.cluster.remotes.iterkeys(): teuthology.write_file(remote=remote, path='{tdir}/logrotate.ceph-test.conf'.format(tdir=testdir), data=StringIO(conf) ) remote.run( args=[ 'sudo', 'mv', '{tdir}/logrotate.ceph-test.conf'.format(tdir=testdir), '/etc/logrotate.d/ceph-test.conf', run.Raw('&&'), 'sudo', 'chmod', '0644', '/etc/logrotate.d/ceph-test.conf', run.Raw('&&'), 'sudo', 'chown', 'root.root', '/etc/logrotate.d/ceph-test.conf' ] ) remote.chcon('/etc/logrotate.d/ceph-test.conf', 'system_u:object_r:etc_t:s0') if ctx.config.get('log-rotate'): daemons = ctx.config.get('log-rotate') log.info('Setting up log rotation with ' + str(daemons)) write_rotate_conf(ctx, daemons) logrotater = Rotater() logrotater.begin() try: yield finally: if ctx.config.get('log-rotate'): log.info('Shutting down logrotate') logrotater.end() ctx.cluster.run( args=['sudo', 'rm', '/etc/logrotate.d/ceph-test.conf' ] ) if ctx.archive is not None and \ not (ctx.config.get('archive-on-error') and ctx.summary['success']): # and logs log.info('Compressing logs...') run.wait( ctx.cluster.run( args=[ 'sudo', 'find', '/var/log/ceph', '-name', '*.log', '-print0', run.Raw('|'), 'sudo', 'xargs', '-0', '--no-run-if-empty', '--', 'gzip', '--', ], wait=False, ), ) log.info('Archiving logs...') path = os.path.join(ctx.archive, 'remote') os.makedirs(path) for remote in ctx.cluster.remotes.iterkeys(): sub = os.path.join(path, remote.shortname) os.makedirs(sub) teuthology.pull_directory(remote, '/var/log/ceph', os.path.join(sub, 'log'))
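# Illustrative 'log-rotate' setting for the ceph_log() task above: a mapping of
# daemon type to maximum log size, which write_rotate_conf() substitutes into
# the bundled logrotate.conf template as {daemon_type}/{max_size}, and which the
# Rotater greenlet then re-runs logrotate against every 30 seconds. The daemon
# names and sizes below are assumptions, not required values.
example_log_rotate = {
    'ceph-osd': '10G',
    'ceph-mds': '10G',
}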
def build_ceph_cluster(ctx, config): """Build a ceph cluster""" log.info("Building ceph cluster using ceph-deploy...") testdir = teuthology.get_testdir(ctx) ceph_branch = None if config.get("branch") is not None: cbranch = config.get("branch") for var, val in cbranch.iteritems(): if var == "testing": ceph_branch = "--{var}".format(var=var) ceph_branch = "--{var}={val}".format(var=var, val=val) node_dev_list = [] all_nodes = get_all_nodes(ctx, config) mds_nodes = get_nodes_using_roles(ctx, config, "mds") mds_nodes = " ".join(mds_nodes) mon_node = get_nodes_using_roles(ctx, config, "mon") mon_nodes = " ".join(mon_node) new_mon = "./ceph-deploy new" + " " + mon_nodes install_nodes = "./ceph-deploy install " + ceph_branch + " " + all_nodes purge_nodes = "./ceph-deploy purge" + " " + all_nodes purgedata_nodes = "./ceph-deploy purgedata" + " " + all_nodes mon_hostname = mon_nodes.split(" ")[0] mon_hostname = str(mon_hostname) gather_keys = "./ceph-deploy gatherkeys" + " " + mon_hostname deploy_mds = "./ceph-deploy mds create" + " " + mds_nodes no_of_osds = 0 if mon_nodes is None: raise RuntimeError("no monitor nodes in the config file") estatus_new = execute_ceph_deploy(ctx, config, new_mon) if estatus_new != 0: raise RuntimeError("ceph-deploy: new command failed") log.info("adding config inputs...") testdir = teuthology.get_testdir(ctx) conf_path = "{tdir}/ceph-deploy/ceph.conf".format(tdir=testdir) first_mon = teuthology.get_first_mon(ctx, config) (remote,) = ctx.cluster.only(first_mon).remotes.keys() lines = None if config.get("conf") is not None: confp = config.get("conf") for section, keys in confp.iteritems(): lines = "[{section}]\n".format(section=section) teuthology.append_lines_to_file(remote, conf_path, lines, sudo=True) for key, value in keys.iteritems(): log.info("[%s] %s = %s" % (section, key, value)) lines = "{key} = {value}\n".format(key=key, value=value) teuthology.append_lines_to_file(remote, conf_path, lines, sudo=True) estatus_install = execute_ceph_deploy(ctx, config, install_nodes) if estatus_install != 0: raise RuntimeError("ceph-deploy: Failed to install ceph") mon_no = None mon_no = config.get("mon_initial_members") if mon_no is not None: i = 0 mon1 = [] while i < mon_no: mon1.append(mon_node[i]) i = i + 1 initial_mons = " ".join(mon1) for k in range(mon_no, len(mon_node)): mon_create_nodes = "./ceph-deploy mon create" + " " + initial_mons + " " + mon_node[k] estatus_mon = execute_ceph_deploy(ctx, config, mon_create_nodes) if estatus_mon != 0: raise RuntimeError("ceph-deploy: Failed to create monitor") else: mon_create_nodes = "./ceph-deploy mon create" + " " + mon_nodes estatus_mon = execute_ceph_deploy(ctx, config, mon_create_nodes) if estatus_mon != 0: raise RuntimeError("ceph-deploy: Failed to create monitors") estatus_gather = execute_ceph_deploy(ctx, config, gather_keys) while estatus_gather != 0: # mon_create_nodes = './ceph-deploy mon create'+" "+mon_node[0] # execute_ceph_deploy(ctx, config, mon_create_nodes) estatus_gather = execute_ceph_deploy(ctx, config, gather_keys) if mds_nodes: estatus_mds = execute_ceph_deploy(ctx, config, deploy_mds) if estatus_mds != 0: raise RuntimeError("ceph-deploy: Failed to deploy mds") if config.get("test_mon_destroy") is not None: for d in range(1, len(mon_node)): mon_destroy_nodes = "./ceph-deploy mon destroy" + " " + mon_node[d] estatus_mon_d = execute_ceph_deploy(ctx, config, mon_destroy_nodes) if estatus_mon_d != 0: raise RuntimeError("ceph-deploy: Failed to delete monitor") node_dev_list = get_dev_for_osd(ctx, config) for d 
in node_dev_list: osd_create_cmds = "./ceph-deploy osd create --zap-disk" + " " + d estatus_osd = execute_ceph_deploy(ctx, config, osd_create_cmds) if estatus_osd == 0: log.info("successfully created osd") no_of_osds += 1 else: zap_disk = "./ceph-deploy disk zap" + " " + d execute_ceph_deploy(ctx, config, zap_disk) estatus_osd = execute_ceph_deploy(ctx, config, osd_create_cmds) if estatus_osd == 0: log.info("successfully created osd") no_of_osds += 1 else: raise RuntimeError("ceph-deploy: Failed to create osds") if config.get("wait-for-healthy", True) and no_of_osds >= 2: is_healthy(ctx=ctx, config=None) log.info("Setting up client nodes...") conf_path = "/etc/ceph/ceph.conf" admin_keyring_path = "/etc/ceph/ceph.client.admin.keyring" first_mon = teuthology.get_first_mon(ctx, config) (mon0_remote,) = ctx.cluster.only(first_mon).remotes.keys() conf_data = teuthology.get_file(remote=mon0_remote, path=conf_path, sudo=True) admin_keyring = teuthology.get_file(remote=mon0_remote, path=admin_keyring_path, sudo=True) clients = ctx.cluster.only(teuthology.is_type("client")) for remot, roles_for_host in clients.remotes.iteritems(): for id_ in teuthology.roles_of_type(roles_for_host, "client"): client_keyring = "/etc/ceph/ceph.client.{id}.keyring".format(id=id_) mon0_remote.run( args=[ "cd", "{tdir}".format(tdir=testdir), run.Raw("&&"), "sudo", "bash", "-c", run.Raw('"'), "ceph", "auth", "get-or-create", "client.{id}".format(id=id_), "mds", "allow", "mon", "allow *", "osd", "allow *", run.Raw(">"), client_keyring, run.Raw('"'), ] ) key_data = teuthology.get_file(remote=mon0_remote, path=client_keyring, sudo=True) teuthology.sudo_write_file(remote=remot, path=client_keyring, data=key_data, perms="0644") teuthology.sudo_write_file(remote=remot, path=admin_keyring_path, data=admin_keyring, perms="0644") teuthology.sudo_write_file(remote=remot, path=conf_path, data=conf_data, perms="0644") else: raise RuntimeError("The cluster is NOT operational due to insufficient OSDs") try: yield finally: log.info("Stopping ceph...") ctx.cluster.run(args=["sudo", "stop", "ceph-all", run.Raw("||"), "sudo", "service", "ceph", "stop"]) if ctx.archive is not None: # archive mon data, too log.info("Archiving mon data...") path = os.path.join(ctx.archive, "data") os.makedirs(path) mons = ctx.cluster.only(teuthology.is_type("mon")) for remote, roles in mons.remotes.iteritems(): for role in roles: if role.startswith("mon."): teuthology.pull_directory_tarball(remote, "/var/lib/ceph/mon", path + "/" + role + ".tgz") log.info("Compressing logs...") run.wait( ctx.cluster.run( args=[ "sudo", "find", "/var/log/ceph", "-name", "*.log", "-print0", run.Raw("|"), "sudo", "xargs", "-0", "--no-run-if-empty", "--", "gzip", "--", ], wait=False, ) ) log.info("Archiving logs...") path = os.path.join(ctx.archive, "remote") os.makedirs(path) for remote in ctx.cluster.remotes.iterkeys(): sub = os.path.join(path, remote.shortname) os.makedirs(sub) teuthology.pull_directory(remote, "/var/log/ceph", os.path.join(sub, "log")) log.info("Purging package...") execute_ceph_deploy(ctx, config, purge_nodes) log.info("Purging data...") execute_ceph_deploy(ctx, config, purgedata_nodes)
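# Sketch of the 'mon create' sequence built in the variant above when
# 'mon_initial_members' is set: the initial members form the quorum first and
# each remaining monitor is added alongside them. Hostnames are made up.
mon_node = ['vpm001', 'vpm002', 'vpm003']
mon_no = 1
initial_mons = " ".join(mon_node[:mon_no])
cmds = ['./ceph-deploy mon create' + " " + initial_mons + " " + mon_node[k]
        for k in range(mon_no, len(mon_node))]
assert cmds == ['./ceph-deploy mon create vpm001 vpm002',
                './ceph-deploy mon create vpm001 vpm003']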
def build_ceph_cluster(ctx, config): """Build a ceph cluster""" try: log.info("Building ceph cluster using ceph-deploy...") testdir = teuthology.get_testdir(ctx) ceph_branch = None if config.get("branch") is not None: cbranch = config.get("branch") for var, val in cbranch.iteritems(): if var == "testing": ceph_branch = "--{var}".format(var=var) ceph_branch = "--{var}={val}".format(var=var, val=val) node_dev_list = [] all_nodes = get_all_nodes(ctx, config) mds_nodes = get_nodes_using_roles(ctx, config, "mds") mds_nodes = " ".join(mds_nodes) mon_node = get_nodes_using_roles(ctx, config, "mon") mon_nodes = " ".join(mon_node) new_mon = "./ceph-deploy new" + " " + mon_nodes install_nodes = "./ceph-deploy install " + ceph_branch + " " + all_nodes purge_nodes = "./ceph-deploy purge" + " " + all_nodes purgedata_nodes = "./ceph-deploy purgedata" + " " + all_nodes mon_hostname = mon_nodes.split(" ")[0] mon_hostname = str(mon_hostname) gather_keys = "./ceph-deploy gatherkeys" + " " + mon_hostname deploy_mds = "./ceph-deploy mds create" + " " + mds_nodes no_of_osds = 0 if mon_nodes is None: raise RuntimeError("no monitor nodes in the config file") estatus_new = execute_ceph_deploy(ctx, config, new_mon) if estatus_new != 0: raise RuntimeError("ceph-deploy: new command failed") log.info("adding config inputs...") testdir = teuthology.get_testdir(ctx) conf_path = "{tdir}/ceph-deploy/ceph.conf".format(tdir=testdir) first_mon = teuthology.get_first_mon(ctx, config) (remote,) = ctx.cluster.only(first_mon).remotes.keys() lines = None if config.get("conf") is not None: confp = config.get("conf") for section, keys in confp.iteritems(): lines = "[{section}]\n".format(section=section) teuthology.append_lines_to_file(remote, conf_path, lines, sudo=True) for key, value in keys.iteritems(): log.info("[%s] %s = %s" % (section, key, value)) lines = "{key} = {value}\n".format(key=key, value=value) teuthology.append_lines_to_file(remote, conf_path, lines, sudo=True) estatus_install = execute_ceph_deploy(ctx, config, install_nodes) if estatus_install != 0: raise RuntimeError("ceph-deploy: Failed to install ceph") mon_create_nodes = "./ceph-deploy mon create-initial" # If the following fails, it is OK, it might just be that the monitors # are taking way more than a minute/monitor to form quorum, so lets # try the next block which will wait up to 15 minutes to gatherkeys. 
estatus_mon = execute_ceph_deploy(ctx, config, mon_create_nodes) estatus_gather = execute_ceph_deploy(ctx, config, gather_keys) max_gather_tries = 90 gather_tries = 0 while estatus_gather != 0: gather_tries += 1 if gather_tries >= max_gather_tries: msg = "ceph-deploy was not able to gatherkeys after 15 minutes" raise RuntimeError(msg) estatus_gather = execute_ceph_deploy(ctx, config, gather_keys) time.sleep(10) if mds_nodes: estatus_mds = execute_ceph_deploy(ctx, config, deploy_mds) if estatus_mds != 0: raise RuntimeError("ceph-deploy: Failed to deploy mds") if config.get("test_mon_destroy") is not None: for d in range(1, len(mon_node)): mon_destroy_nodes = "./ceph-deploy mon destroy" + " " + mon_node[d] estatus_mon_d = execute_ceph_deploy(ctx, config, mon_destroy_nodes) if estatus_mon_d != 0: raise RuntimeError("ceph-deploy: Failed to delete monitor") node_dev_list = get_dev_for_osd(ctx, config) osd_create_cmd = "./ceph-deploy osd create --zap-disk " for d in node_dev_list: if config.get("dmcrypt") is not None: osd_create_cmd_d = osd_create_cmd + "--dmcrypt" + " " + d else: osd_create_cmd_d = osd_create_cmd + d estatus_osd = execute_ceph_deploy(ctx, config, osd_create_cmd_d) if estatus_osd == 0: log.info("successfully created osd") no_of_osds += 1 else: disks = [] disks = d.split(":") dev_disk = disks[0] + ":" + disks[1] j_disk = disks[0] + ":" + disks[2] zap_disk = "./ceph-deploy disk zap " + dev_disk + " " + j_disk execute_ceph_deploy(ctx, config, zap_disk) estatus_osd = execute_ceph_deploy(ctx, config, osd_create_cmd_d) if estatus_osd == 0: log.info("successfully created osd") no_of_osds += 1 else: raise RuntimeError("ceph-deploy: Failed to create osds") if config.get("wait-for-healthy", True) and no_of_osds >= 2: is_healthy(ctx=ctx, config=None) log.info("Setting up client nodes...") conf_path = "/etc/ceph/ceph.conf" admin_keyring_path = "/etc/ceph/ceph.client.admin.keyring" first_mon = teuthology.get_first_mon(ctx, config) (mon0_remote,) = ctx.cluster.only(first_mon).remotes.keys() conf_data = teuthology.get_file(remote=mon0_remote, path=conf_path, sudo=True) admin_keyring = teuthology.get_file(remote=mon0_remote, path=admin_keyring_path, sudo=True) clients = ctx.cluster.only(teuthology.is_type("client")) for remot, roles_for_host in clients.remotes.iteritems(): for id_ in teuthology.roles_of_type(roles_for_host, "client"): client_keyring = "/etc/ceph/ceph.client.{id}.keyring".format(id=id_) mon0_remote.run( args=[ "cd", "{tdir}".format(tdir=testdir), run.Raw("&&"), "sudo", "bash", "-c", run.Raw('"'), "ceph", "auth", "get-or-create", "client.{id}".format(id=id_), "mds", "allow", "mon", "allow *", "osd", "allow *", run.Raw(">"), client_keyring, run.Raw('"'), ] ) key_data = teuthology.get_file(remote=mon0_remote, path=client_keyring, sudo=True) teuthology.sudo_write_file(remote=remot, path=client_keyring, data=key_data, perms="0644") teuthology.sudo_write_file(remote=remot, path=admin_keyring_path, data=admin_keyring, perms="0644") teuthology.sudo_write_file(remote=remot, path=conf_path, data=conf_data, perms="0644") else: raise RuntimeError("The cluster is NOT operational due to insufficient OSDs") yield finally: log.info("Stopping ceph...") ctx.cluster.run(args=["sudo", "stop", "ceph-all", run.Raw("||"), "sudo", "service", "ceph", "stop"]) # Are you really not running anymore? 
# try first with the init tooling # ignoring the status so this becomes informational only ctx.cluster.run( args=["sudo", "status", "ceph-all", run.Raw("||"), "sudo", "service", "ceph", "status"], check_status=False ) # and now just check for the processes themselves, as if upstart/sysvinit # is lying to us. Ignore errors if the grep fails ctx.cluster.run( args=["sudo", "ps", "aux", run.Raw("|"), "grep", "-v", "grep", run.Raw("|"), "grep", "ceph"], check_status=False, ) if ctx.archive is not None: # archive mon data, too log.info("Archiving mon data...") path = os.path.join(ctx.archive, "data") os.makedirs(path) mons = ctx.cluster.only(teuthology.is_type("mon")) for remote, roles in mons.remotes.iteritems(): for role in roles: if role.startswith("mon."): teuthology.pull_directory_tarball(remote, "/var/lib/ceph/mon", path + "/" + role + ".tgz") log.info("Compressing logs...") run.wait( ctx.cluster.run( args=[ "sudo", "find", "/var/log/ceph", "-name", "*.log", "-print0", run.Raw("|"), "sudo", "xargs", "-0", "--no-run-if-empty", "--", "gzip", "--", ], wait=False, ) ) log.info("Archiving logs...") path = os.path.join(ctx.archive, "remote") os.makedirs(path) for remote in ctx.cluster.remotes.iterkeys(): sub = os.path.join(path, remote.shortname) os.makedirs(sub) teuthology.pull_directory(remote, "/var/log/ceph", os.path.join(sub, "log")) # Prevent these from being undefined if the try block fails all_nodes = get_all_nodes(ctx, config) purge_nodes = "./ceph-deploy purge" + " " + all_nodes purgedata_nodes = "./ceph-deploy purgedata" + " " + all_nodes log.info("Purging package...") execute_ceph_deploy(ctx, config, purge_nodes) log.info("Purging data...") execute_ceph_deploy(ctx, config, purgedata_nodes)
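# Sketch of the zap-and-retry strings built above when 'osd create --zap-disk'
# fails, assuming each entry has the 'host:data_disk:journal_disk' form; the
# names below are made up for illustration.
d = 'vpm001:sdb:sdc'
disks = d.split(':')
dev_disk = disks[0] + ':' + disks[1]
j_disk = disks[0] + ':' + disks[2]
zap_disk = './ceph-deploy disk zap ' + dev_disk + ' ' + j_disk
assert zap_disk == './ceph-deploy disk zap vpm001:sdb vpm001:sdc'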
def ceph_log(ctx, config): """ Create /var/log/ceph log directory that is open to everyone. Add valgrind and profiling-logger directories. :param ctx: Context :param config: Configuration """ log.info('Making ceph log dir writeable by non-root...') run.wait( ctx.cluster.run( args=[ 'sudo', 'chmod', '777', '/var/log/ceph', ], wait=False, )) log.info('Disabling ceph logrotate...') run.wait( ctx.cluster.run( args=[ 'sudo', 'rm', '-f', '--', '/etc/logrotate.d/ceph', ], wait=False, )) log.info('Creating extra log directories...') run.wait( ctx.cluster.run( args=[ 'sudo', 'install', '-d', '-m0755', '--', '/var/log/ceph/valgrind', '/var/log/ceph/profiling-logger', ], wait=False, )) try: yield finally: if ctx.archive is not None and \ not (ctx.config.get('archive-on-error') and ctx.summary['success']): # and logs log.info('Compressing logs...') run.wait( ctx.cluster.run( args=[ 'sudo', 'find', '/var/log/ceph', '-name', '*.log', '-print0', run.Raw('|'), 'sudo', 'xargs', '-0', '--no-run-if-empty', '--', 'gzip', '--', ], wait=False, ), ) log.info('Archiving logs...') path = os.path.join(ctx.archive, 'remote') os.makedirs(path) for remote in ctx.cluster.remotes.iterkeys(): sub = os.path.join(path, remote.shortname) os.makedirs(sub) teuthology.pull_directory(remote, '/var/log/ceph', os.path.join(sub, 'log'))
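# The guard in the finally block above archives logs unless 'archive-on-error'
# is set AND the run passed; a small truth table of that condition (an archive
# directory is assumed to be configured).
def should_archive(archive_on_error, success):
    return not (archive_on_error and success)

assert should_archive(False, True) is True    # archive by default
assert should_archive(True, True) is False    # skip archiving on a passing run
assert should_archive(True, False) is True    # but always keep logs for failures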
def build_ceph_cluster(ctx, config):
    """Build a ceph cluster"""

    # Expect to find ceph_admin on the first mon by ID, same place that the
    # download task puts it.  Remember this here, because subsequently IDs will
    # change from those in the test config to those that ceph-deploy invents.
    (ceph_admin,) = ctx.cluster.only(
        teuthology.get_first_mon(ctx, config)).remotes.iterkeys()

    def execute_ceph_deploy(cmd):
        """Remotely execute a ceph_deploy command"""
        return ceph_admin.run(
            args=[
                'cd',
                '{tdir}/ceph-deploy'.format(tdir=testdir),
                run.Raw('&&'),
                run.Raw(cmd),
            ],
            check_status=False,
        ).exitstatus

    try:
        log.info('Building ceph cluster using ceph-deploy...')
        testdir = teuthology.get_testdir(ctx)
        ceph_branch = None
        if config.get('branch') is not None:
            cbranch = config.get('branch')
            for var, val in cbranch.iteritems():
                ceph_branch = '--{var}={val}'.format(var=var, val=val)
        all_nodes = get_all_nodes(ctx, config)
        mds_nodes = get_nodes_using_role(ctx, 'mds')
        mds_nodes = " ".join(mds_nodes)
        mon_node = get_nodes_using_role(ctx, 'mon')
        mon_nodes = " ".join(mon_node)
        mgr_nodes = get_nodes_using_role(ctx, 'mgr')
        mgr_nodes = " ".join(mgr_nodes)
        new_mon = './ceph-deploy new' + " " + mon_nodes
        mgr_create = './ceph-deploy mgr create' + " " + mgr_nodes
        mon_hostname = mon_nodes.split(' ')[0]
        mon_hostname = str(mon_hostname)
        gather_keys = './ceph-deploy gatherkeys' + " " + mon_hostname
        deploy_mds = './ceph-deploy mds create' + " " + mds_nodes
        no_of_osds = 0

        if mon_nodes is None:
            raise RuntimeError("no monitor nodes in the config file")

        estatus_new = execute_ceph_deploy(new_mon)
        if estatus_new != 0:
            raise RuntimeError("ceph-deploy: new command failed")

        log.info('adding config inputs...')
        testdir = teuthology.get_testdir(ctx)
        conf_path = '{tdir}/ceph-deploy/ceph.conf'.format(tdir=testdir)

        if config.get('conf') is not None:
            confp = config.get('conf')
            for section, keys in confp.iteritems():
                lines = '[{section}]\n'.format(section=section)
                teuthology.append_lines_to_file(ceph_admin, conf_path, lines,
                                                sudo=True)
                for key, value in keys.iteritems():
                    log.info("[%s] %s = %s" % (section, key, value))
                    lines = '{key} = {value}\n'.format(key=key, value=value)
                    teuthology.append_lines_to_file(
                        ceph_admin, conf_path, lines, sudo=True)

        # install ceph
        dev_branch = ctx.config['branch']
        branch = '--dev={branch}'.format(branch=dev_branch)
        if ceph_branch:
            option = ceph_branch
        else:
            option = branch
        install_nodes = './ceph-deploy install ' + option + " " + all_nodes
        estatus_install = execute_ceph_deploy(install_nodes)
        if estatus_install != 0:
            raise RuntimeError("ceph-deploy: Failed to install ceph")
        # install ceph-test package too
        install_nodes2 = './ceph-deploy install --tests ' + option + \
                         " " + all_nodes
        estatus_install = execute_ceph_deploy(install_nodes2)
        if estatus_install != 0:
            raise RuntimeError("ceph-deploy: Failed to install ceph-test")

        mon_create_nodes = './ceph-deploy mon create-initial'
        # If the following fails, it is OK, it might just be that the monitors
        # are taking way more than a minute/monitor to form quorum, so lets
        # try the next block which will wait up to 15 minutes to gatherkeys.
        execute_ceph_deploy(mon_create_nodes)
        execute_ceph_deploy(mgr_create)

        # create-keys is explicit now
        # http://tracker.ceph.com/issues/16036
        mons = ctx.cluster.only(teuthology.is_type('mon'))
        for remote in mons.remotes.iterkeys():
            remote.run(args=['sudo', 'ceph-create-keys', '--cluster', 'ceph',
                             '--id', remote.shortname])

        estatus_gather = execute_ceph_deploy(gather_keys)

        if mds_nodes:
            estatus_mds = execute_ceph_deploy(deploy_mds)
            if estatus_mds != 0:
                raise RuntimeError("ceph-deploy: Failed to deploy mds")

        if config.get('test_mon_destroy') is not None:
            for d in range(1, len(mon_node)):
                mon_destroy_nodes = './ceph-deploy mon destroy' + \
                    " " + mon_node[d]
                estatus_mon_d = execute_ceph_deploy(mon_destroy_nodes)
                if estatus_mon_d != 0:
                    raise RuntimeError("ceph-deploy: Failed to delete monitor")

        node_dev_list = get_dev_for_osd(ctx, config)
        for d in node_dev_list:
            node = d[0]
            for disk in d[1:]:
                zap = './ceph-deploy disk zap ' + node + ':' + disk
                estatus = execute_ceph_deploy(zap)
                if estatus != 0:
                    raise RuntimeError("ceph-deploy: Failed to zap osds")
            osd_create_cmd = './ceph-deploy osd create '
            if config.get('dmcrypt') is not None:
                osd_create_cmd += '--dmcrypt '
            osd_create_cmd += ":".join(d)
            estatus_osd = execute_ceph_deploy(osd_create_cmd)
            if estatus_osd == 0:
                log.info('successfully created osd')
                no_of_osds += 1
            else:
                raise RuntimeError("ceph-deploy: Failed to create osds")

        if config.get('wait-for-healthy', True) and no_of_osds >= 2:
            is_healthy(ctx=ctx, config=None)

            log.info('Setting up client nodes...')
            conf_path = '/etc/ceph/ceph.conf'
            admin_keyring_path = '/etc/ceph/ceph.client.admin.keyring'
            first_mon = teuthology.get_first_mon(ctx, config)
            (mon0_remote,) = ctx.cluster.only(first_mon).remotes.keys()
            conf_data = teuthology.get_file(
                remote=mon0_remote,
                path=conf_path,
                sudo=True,
            )
            admin_keyring = teuthology.get_file(
                remote=mon0_remote,
                path=admin_keyring_path,
                sudo=True,
            )

            clients = ctx.cluster.only(teuthology.is_type('client'))
            for remot, roles_for_host in clients.remotes.iteritems():
                for id_ in teuthology.roles_of_type(roles_for_host, 'client'):
                    client_keyring = \
                        '/etc/ceph/ceph.client.{id}.keyring'.format(id=id_)
                    mon0_remote.run(
                        args=[
                            'cd', '{tdir}'.format(tdir=testdir), run.Raw('&&'),
                            'sudo', 'bash', '-c',
                            run.Raw('"'), 'ceph', 'auth', 'get-or-create',
                            'client.{id}'.format(id=id_),
                            'mds', 'allow',
                            'mon', 'allow *',
                            'osd', 'allow *',
                            run.Raw('>'), client_keyring, run.Raw('"'),
                        ],
                    )
                    key_data = teuthology.get_file(
                        remote=mon0_remote,
                        path=client_keyring,
                        sudo=True,
                    )
                    teuthology.sudo_write_file(
                        remote=remot,
                        path=client_keyring,
                        data=key_data,
                        perms='0644'
                    )
                    teuthology.sudo_write_file(
                        remote=remot,
                        path=admin_keyring_path,
                        data=admin_keyring,
                        perms='0644'
                    )
                    teuthology.sudo_write_file(
                        remote=remot,
                        path=conf_path,
                        data=conf_data,
                        perms='0644'
                    )

            if mds_nodes:
                log.info('Configuring CephFS...')
                ceph_fs = Filesystem(ctx, create=True)
        elif not config.get('only_mon'):
            raise RuntimeError(
                "The cluster is NOT operational due to insufficient OSDs")
        yield

    except Exception:
        log.info(
            "Error encountered, logging exception before tearing down ceph-deploy")
        log.info(traceback.format_exc())
        raise
    finally:
        if config.get('keep_running'):
            return
        log.info('Stopping ceph...')
        ctx.cluster.run(args=['sudo', 'stop', 'ceph-all', run.Raw('||'),
                              'sudo', 'service', 'ceph', 'stop', run.Raw('||'),
                              'sudo', 'systemctl', 'stop', 'ceph.target'])

        # Are you really not running anymore?
        # try first with the init tooling
        # ignoring the status so this becomes informational only
        ctx.cluster.run(
            args=[
                'sudo', 'status', 'ceph-all', run.Raw('||'),
                'sudo', 'service', 'ceph', 'status', run.Raw('||'),
                'sudo', 'systemctl', 'status', 'ceph.target'],
            check_status=False)

        # and now just check for the processes themselves, as if upstart/sysvinit
        # is lying to us. Ignore errors if the grep fails
        ctx.cluster.run(args=['sudo', 'ps', 'aux', run.Raw('|'),
                              'grep', '-v', 'grep', run.Raw('|'),
                              'grep', 'ceph'], check_status=False)

        if ctx.archive is not None:
            # archive mon data, too
            log.info('Archiving mon data...')
            path = os.path.join(ctx.archive, 'data')
            os.makedirs(path)
            mons = ctx.cluster.only(teuthology.is_type('mon'))
            for remote, roles in mons.remotes.iteritems():
                for role in roles:
                    if role.startswith('mon.'):
                        teuthology.pull_directory_tarball(
                            remote,
                            '/var/lib/ceph/mon',
                            path + '/' + role + '.tgz')

            log.info('Compressing logs...')
            run.wait(
                ctx.cluster.run(
                    args=['sudo', 'find', '/var/log/ceph', '-name', '*.log',
                          '-print0', run.Raw('|'),
                          'sudo', 'xargs', '-0', '--no-run-if-empty', '--',
                          'gzip', '--'],
                    wait=False,
                ),
            )

            log.info('Archiving logs...')
            path = os.path.join(ctx.archive, 'remote')
            os.makedirs(path)
            for remote in ctx.cluster.remotes.iterkeys():
                sub = os.path.join(path, remote.shortname)
                os.makedirs(sub)
                teuthology.pull_directory(remote, '/var/log/ceph',
                                          os.path.join(sub, 'log'))

        # Prevent these from being undefined if the try block fails
        all_nodes = get_all_nodes(ctx, config)
        purge_nodes = './ceph-deploy purge' + " " + all_nodes
        purgedata_nodes = './ceph-deploy purgedata' + " " + all_nodes

        log.info('Purging package...')
        execute_ceph_deploy(purge_nodes)
        log.info('Purging data...')
        execute_ceph_deploy(purgedata_nodes)
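For reference, a hedged sketch of the task configuration this newer build_ceph_cluster() consumes. The keys shown are exactly the ones the function reads ('branch', 'conf', 'dmcrypt', 'test_mon_destroy', 'wait-for-healthy', 'only_mon', 'keep_running'); every value is an illustrative assumption, not taken from a real suite.

# Illustrative config dict for build_ceph_cluster(); values are assumptions.
example_ceph_deploy_config = {
    'branch': {'dev': 'master'},      # becomes '--dev=master' for ceph-deploy install
    'conf': {                         # appended to {testdir}/ceph-deploy/ceph.conf
        'global': {'osd pool default size': 2},
        'mon': {'debug mon': 20},
    },
    'dmcrypt': True,                  # adds '--dmcrypt' to the osd create command
    'test_mon_destroy': None,         # set to exercise './ceph-deploy mon destroy'
    'wait-for-healthy': True,         # gate client setup on is_healthy()
    'only_mon': False,                # if True, skip the "insufficient OSDs" failure
    'keep_running': False,            # if True, the finally block leaves the cluster up
}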
def build_ceph_cluster(ctx, config):
    log.info('Building ceph cluster using ceph-deploy...')
    testdir = teuthology.get_testdir(ctx)
    ceph_branch = None
    if config.get('branch') is not None:
        cbranch = config.get('branch')
        for var, val in cbranch.iteritems():
            if var == 'testing':
                # --testing takes no value for ceph-deploy install
                ceph_branch = '--{var}'.format(var=var)
            else:
                ceph_branch = '--{var}={val}'.format(var=var, val=val)
    node_dev_list = []
    all_nodes = get_all_nodes(ctx, config)
    mds_nodes = get_nodes_using_roles(ctx, config, 'mds')
    mds_nodes = " ".join(mds_nodes)
    mon_node = get_nodes_using_roles(ctx, config, 'mon')
    mon_nodes = " ".join(mon_node)
    new_mon = './ceph-deploy new' + " " + mon_nodes
    install_nodes = './ceph-deploy install ' + ceph_branch + " " + all_nodes
    purge_nodes = './ceph-deploy purge' + " " + all_nodes
    purgedata_nodes = './ceph-deploy purgedata' + " " + all_nodes
    mon_hostname = mon_nodes.split(' ')[0]
    mon_hostname = str(mon_hostname)
    gather_keys = './ceph-deploy gatherkeys' + " " + mon_hostname
    deploy_mds = './ceph-deploy mds create' + " " + mds_nodes
    no_of_osds = 0

    if mon_nodes is None:
        raise RuntimeError("no monitor nodes in the config file")

    estatus_new = execute_ceph_deploy(ctx, config, new_mon)
    if estatus_new != 0:
        raise RuntimeError("ceph-deploy: new command failed")

    log.info('adding config inputs...')
    testdir = teuthology.get_testdir(ctx)
    conf_path = '{tdir}/ceph-deploy/ceph.conf'.format(tdir=testdir)
    first_mon = teuthology.get_first_mon(ctx, config)
    (remote,) = ctx.cluster.only(first_mon).remotes.keys()

    lines = None
    if config.get('conf') is not None:
        confp = config.get('conf')
        for section, keys in confp.iteritems():
            lines = '[{section}]\n'.format(section=section)
            teuthology.append_lines_to_file(remote, conf_path, lines,
                                            sudo=True)
            for key, value in keys.iteritems():
                log.info("[%s] %s = %s" % (section, key, value))
                lines = '{key} = {value}\n'.format(key=key, value=value)
                teuthology.append_lines_to_file(remote, conf_path, lines,
                                                sudo=True)

    estatus_install = execute_ceph_deploy(ctx, config, install_nodes)
    if estatus_install != 0:
        raise RuntimeError("ceph-deploy: Failed to install ceph")

    mon_no = None
    mon_no = config.get('mon_initial_members')
    if mon_no is not None:
        i = 0
        mon1 = []
        while(i < mon_no):
            mon1.append(mon_node[i])
            i = i + 1
        initial_mons = " ".join(mon1)
        for k in range(mon_no, len(mon_node)):
            mon_create_nodes = './ceph-deploy mon create' + " " + \
                initial_mons + " " + mon_node[k]
            estatus_mon = execute_ceph_deploy(ctx, config, mon_create_nodes)
            if estatus_mon != 0:
                raise RuntimeError("ceph-deploy: Failed to create monitor")
    else:
        mon_create_nodes = './ceph-deploy mon create' + " " + mon_nodes
        estatus_mon = execute_ceph_deploy(ctx, config, mon_create_nodes)
        if estatus_mon != 0:
            raise RuntimeError("ceph-deploy: Failed to create monitors")

    estatus_gather = execute_ceph_deploy(ctx, config, gather_keys)
    while (estatus_gather != 0):
        # mon_create_nodes = './ceph-deploy mon create'+" "+mon_node[0]
        # execute_ceph_deploy(ctx, config, mon_create_nodes)
        estatus_gather = execute_ceph_deploy(ctx, config, gather_keys)

    if mds_nodes:
        estatus_mds = execute_ceph_deploy(ctx, config, deploy_mds)
        if estatus_mds != 0:
            raise RuntimeError("ceph-deploy: Failed to deploy mds")

    if config.get('test_mon_destroy') is not None:
        for d in range(1, len(mon_node)):
            mon_destroy_nodes = './ceph-deploy mon destroy' + " " + mon_node[d]
            estatus_mon_d = execute_ceph_deploy(ctx, config, mon_destroy_nodes)
            if estatus_mon_d != 0:
                raise RuntimeError("ceph-deploy: Failed to delete monitor")

    node_dev_list = get_dev_for_osd(ctx, config)
    for d in node_dev_list:
        osd_create_cmds = './ceph-deploy osd create --zap-disk' + " " + d
        estatus_osd = execute_ceph_deploy(ctx, config, osd_create_cmds)
        if estatus_osd == 0:
            log.info('successfully created osd')
            no_of_osds += 1
        else:
            zap_disk = './ceph-deploy disk zap' + " " + d
            execute_ceph_deploy(ctx, config, zap_disk)
            estatus_osd = execute_ceph_deploy(ctx, config, osd_create_cmds)
            if estatus_osd == 0:
                log.info('successfully created osd')
                no_of_osds += 1
            else:
                raise RuntimeError("ceph-deploy: Failed to create osds")

    if config.get('wait-for-healthy', True) and no_of_osds >= 2:
        is_healthy(ctx=ctx, config=None)

        log.info('Setting up client nodes...')
        conf_path = '/etc/ceph/ceph.conf'
        admin_keyring_path = '/etc/ceph/ceph.client.admin.keyring'
        first_mon = teuthology.get_first_mon(ctx, config)
        (mon0_remote,) = ctx.cluster.only(first_mon).remotes.keys()
        conf_data = teuthology.get_file(
            remote=mon0_remote,
            path=conf_path,
            sudo=True,
        )
        admin_keyring = teuthology.get_file(
            remote=mon0_remote,
            path=admin_keyring_path,
            sudo=True,
        )

        clients = ctx.cluster.only(teuthology.is_type('client'))
        for remot, roles_for_host in clients.remotes.iteritems():
            for id_ in teuthology.roles_of_type(roles_for_host, 'client'):
                client_keyring = \
                    '/etc/ceph/ceph.client.{id}.keyring'.format(id=id_)
                mon0_remote.run(
                    args=[
                        'cd', '{tdir}'.format(tdir=testdir), run.Raw('&&'),
                        'sudo', 'bash', '-c',
                        run.Raw('"'), 'ceph', 'auth', 'get-or-create',
                        'client.{id}'.format(id=id_),
                        'mds', 'allow',
                        'mon', 'allow *',
                        'osd', 'allow *',
                        run.Raw('>'), client_keyring, run.Raw('"'),
                    ],
                )
                key_data = teuthology.get_file(
                    remote=mon0_remote,
                    path=client_keyring,
                    sudo=True,
                )
                teuthology.sudo_write_file(
                    remote=remot,
                    path=client_keyring,
                    data=key_data,
                    perms='0644'
                )
                teuthology.sudo_write_file(
                    remote=remot,
                    path=admin_keyring_path,
                    data=admin_keyring,
                    perms='0644'
                )
                teuthology.sudo_write_file(
                    remote=remot,
                    path=conf_path,
                    data=conf_data,
                    perms='0644'
                )
    else:
        raise RuntimeError(
            "The cluster is NOT operational due to insufficient OSDs")

    try:
        yield
    finally:
        log.info('Stopping ceph...')
        ctx.cluster.run(args=['sudo', 'stop', 'ceph-all', run.Raw('||'),
                              'sudo', 'service', 'ceph', 'stop'])

        if ctx.archive is not None:
            # archive mon data, too
            log.info('Archiving mon data...')
            path = os.path.join(ctx.archive, 'data')
            os.makedirs(path)
            mons = ctx.cluster.only(teuthology.is_type('mon'))
            for remote, roles in mons.remotes.iteritems():
                for role in roles:
                    if role.startswith('mon.'):
                        teuthology.pull_directory_tarball(
                            remote,
                            '/var/lib/ceph/mon',
                            path + '/' + role + '.tgz')

            log.info('Compressing logs...')
            run.wait(
                ctx.cluster.run(
                    args=['sudo', 'find', '/var/log/ceph', '-name', '*.log',
                          '-print0', run.Raw('|'),
                          'sudo', 'xargs', '-0', '--no-run-if-empty', '--',
                          'gzip', '--'],
                    wait=False,
                ),
            )

            log.info('Archiving logs...')
            path = os.path.join(ctx.archive, 'remote')
            os.makedirs(path)
            for remote in ctx.cluster.remotes.iterkeys():
                sub = os.path.join(path, remote.shortname)
                os.makedirs(sub)
                teuthology.pull_directory(remote, '/var/log/ceph',
                                          os.path.join(sub, 'log'))

        log.info('Purging package...')
        execute_ceph_deploy(ctx, config, purge_nodes)
        log.info('Purging data...')
        execute_ceph_deploy(ctx, config, purgedata_nodes)
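The two build_ceph_cluster() versions above consume get_dev_for_osd() differently: the older loop appends each entry directly after '--zap-disk' as a single string, while the newer loop treats d[0] as the host and d[1:] as disks before ':'.join()-ing them. get_dev_for_osd() itself is not shown in this file, so the literals below are assumed shapes, shown only to illustrate the difference.

# Assumed shapes, for illustration only; get_dev_for_osd() is not reproduced above.
node_dev_list_old_style = ['mira001:/dev/sdb']                   # older loop: d is one "host:device" string
node_dev_list_new_style = [['mira001', '/dev/sdb', '/dev/sdc']]  # newer loop: d[0] host, d[1:] devices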
def ceph_log(ctx, config):
    """
    Create /var/log/ceph log directory that is open to everyone.
    Add valgrind and profiling-logger directories.

    :param ctx: Context
    :param config: Configuration
    """
    log.info("Making ceph log dir writeable by non-root...")
    run.wait(
        ctx.cluster.run(
            args=["sudo", "chmod", "777", "/var/log/ceph"],
            wait=False,
        ))
    log.info("Disabling ceph logrotate...")
    run.wait(
        ctx.cluster.run(
            args=["sudo", "rm", "-f", "--", "/etc/logrotate.d/ceph"],
            wait=False,
        ))
    log.info("Creating extra log directories...")
    run.wait(
        ctx.cluster.run(
            args=["sudo", "install", "-d", "-m0777", "--",
                  "/var/log/ceph/valgrind", "/var/log/ceph/profiling-logger"],
            wait=False,
        ))

    class Rotater(object):
        stop_event = gevent.event.Event()

        def invoke_logrotate(self):
            # 1) install ceph-test.conf in /etc/logrotate.d
            # 2) continuously loop over logrotate invocation with ceph-test.conf
            while not self.stop_event.is_set():
                self.stop_event.wait(timeout=30)
                run.wait(
                    ctx.cluster.run(
                        args=["sudo", "logrotate", "/etc/logrotate.d/ceph-test.conf"],
                        wait=False,
                    ))

        def begin(self):
            self.thread = gevent.spawn(self.invoke_logrotate)

        def end(self):
            self.stop_event.set()
            self.thread.get()

    def write_rotate_conf(ctx, daemons):
        testdir = teuthology.get_testdir(ctx)
        rotate_conf_path = os.path.join(os.path.dirname(__file__), "logrotate.conf")
        with file(rotate_conf_path, "rb") as f:
            conf = ""
            for daemon, size in daemons.iteritems():
                log.info("writing logrotate stanza for {daemon}".format(daemon=daemon))
                conf += f.read().format(daemon_type=daemon, max_size=size)
                f.seek(0, 0)

            for remote in ctx.cluster.remotes.iterkeys():
                teuthology.write_file(
                    remote=remote,
                    path="{tdir}/logrotate.ceph-test.conf".format(tdir=testdir),
                    data=StringIO(conf)
                )
                remote.run(
                    args=[
                        "sudo", "mv",
                        "{tdir}/logrotate.ceph-test.conf".format(tdir=testdir),
                        "/etc/logrotate.d/ceph-test.conf",
                        run.Raw("&&"),
                        "sudo", "chmod", "0644", "/etc/logrotate.d/ceph-test.conf",
                        run.Raw("&&"),
                        "sudo", "chown", "root.root", "/etc/logrotate.d/ceph-test.conf",
                    ]
                )
                remote.chcon("/etc/logrotate.d/ceph-test.conf",
                             "system_u:object_r:etc_t:s0")

    if ctx.config.get("log-rotate"):
        daemons = ctx.config.get("log-rotate")
        log.info("Setting up log rotation with " + str(daemons))
        write_rotate_conf(ctx, daemons)
        logrotater = Rotater()
        logrotater.begin()
    try:
        yield

    finally:
        if ctx.config.get("log-rotate"):
            log.info("Shutting down logrotate")
            logrotater.end()
            ctx.cluster.run(args=["sudo", "rm", "/etc/logrotate.d/ceph-test.conf"])
        if ctx.archive is not None and \
                not (ctx.config.get("archive-on-error") and ctx.summary["success"]):
            # and logs
            log.info("Compressing logs...")
            run.wait(
                ctx.cluster.run(
                    args=["sudo", "find", "/var/log/ceph", "-name", "*.log",
                          "-print0", run.Raw("|"),
                          "sudo", "xargs", "-0", "--no-run-if-empty", "--",
                          "gzip", "--"],
                    wait=False,
                ))

            log.info("Archiving logs...")
            path = os.path.join(ctx.archive, "remote")
            os.makedirs(path)
            for remote in ctx.cluster.remotes.iterkeys():
                sub = os.path.join(path, remote.shortname)
                os.makedirs(sub)
                teuthology.pull_directory(remote, "/var/log/ceph",
                                          os.path.join(sub, "log"))
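write_rotate_conf() above fills a logrotate.conf template via str.format(daemon_type=..., max_size=...), driven by a 'log-rotate' mapping in ctx.config that maps daemon type to maximum size. The real template ships alongside the task and is not reproduced in this file; the literal below is only a sketch of the shape such a template could take. Note the doubled braces, which are needed so logrotate's own { } blocks survive str.format().

# Sketch of a logrotate template compatible with write_rotate_conf(); contents are assumed.
EXAMPLE_LOGROTATE_TEMPLATE = """\
/var/log/ceph/*{daemon_type}*.log {{
    rotate 100
    size {max_size}
    compress
    sharedscripts
    missingok
    notifempty
}}
"""

# A matching ctx.config entry maps daemon type to maximum size, e.g. (values assumed):
# log-rotate:
#   ceph-osd: 10G
#   ceph-mon: 10G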