def mon_status_check(conn, logger, hostname, args):
    """
    A direct check for JSON output on the monitor status.

    For newer versions of Ceph (Dumpling and newer) a mon_status command
    was added (``ceph daemon mon mon_status``); this check depends on its
    availability and should be revisited if its output ever changes.
    """
    asok_path = paths.mon.asok(args.cluster, hostname)
    out, err, code = process.check(
        conn,
        [
            'ceph',
            '--cluster={cluster}'.format(cluster=args.cluster),
            '--admin-daemon',
            asok_path,
            'mon_status',
        ],
    )
    for line in err:
        logger.error(line)

    try:
        return json.loads(''.join(out))
    except ValueError:
        return {}
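# A hedged usage sketch (not part of the original source): mon_status_check
# returns the parsed mon_status JSON, whose 'state' key reports values such
# as 'probing', 'electing', 'peon', or 'leader'. The wrapper name and log
# messages below are illustrative assumptions.
def _example_mon_quorum_check(conn, logger, hostname, args):
    status = mon_status_check(conn, logger, hostname, args)
    state = status.get('state')
    if state in ('peon', 'leader'):
        logger.info('mon.%s is in quorum (state: %s)', hostname, state)
        return True
    logger.warning('mon.%s not in quorum (state: %s)', hostname, state)
    return False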
def can_connect_passwordless(hostname):
    """
    Ensure that the current host can SSH to the remote host using the
    ``BatchMode`` option to prevent a password prompt. A failed attempt
    will error with an exit status of 255 and a ``Permission denied``
    message.
    """
    # Ensure we are not doing this for local hosts
    if not needs_ssh(hostname):
        return True

    logger = logging.getLogger(hostname)
    with get_local_connection(logger) as conn:
        # Check to see if we can login, disabling password prompts
        command = ['ssh', '-CT', '-o', 'BatchMode=yes', hostname]
        out, err, retval = process.check(conn, command, stop_on_error=False)
        # Match only the 'Permission denied ' prefix: the parenthesized
        # list of allowed authentication methods varies between servers,
        # e.g. '(publickey,password)' vs longer method lists
        expected_error = 'Permission denied '
        has_key_error = False
        for line in err:
            if expected_error in line:
                has_key_error = True

        if retval == 255 and has_key_error:
            return False

    return True
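# Usage sketch (illustrative, not from the source): callers would typically
# gate any remote work on this check so a missing SSH key surfaces as a
# clear error instead of a later failure. The error message text is ours.
def _example_require_passwordless(hostname):
    if not can_connect_passwordless(hostname):
        raise RuntimeError(
            'cannot connect to %s without a password; '
            'set up SSH keys first (e.g. with ssh-copy-id)' % hostname
        )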
def is_running(conn, args):
    """
    Run a command to check the status of a mon, return a boolean.

    We heavily depend on the format of the output; if that ever changes
    we need to modify this.

    Output of the status check should look similar to::

        mon.mira094: running {"version":"0.61.5"}

    or when it fails::

        mon.mira094: dead {"version":"0.61.5"}
        mon.mira094: not running {"version":"0.61.5"}
    """
    stdout, stderr, _ = process.check(conn, args)
    result_string = ' '.join(stdout)
    for run_check in [': running', ' start/running']:
        if run_check in result_string:
            return True
    return False
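# Illustrative only (an assumption, not the source's exact invocation): the
# args passed to is_running depend on the init system. A sysvinit-style
# status command produces the 'mon.<host>: running {...}' lines shown in
# the docstring above, while upstart reports lines like
# 'ceph-mon (ceph/node1) start/running, process 1234', which is why
# ' start/running' is also matched.
def _example_mon_is_running(conn, init, hostname):
    if init == 'sysvinit':
        status_args = ['service', 'ceph', 'status', 'mon.%s' % hostname]
    else:  # assume upstart
        status_args = ['initctl', 'status', 'ceph-mon', 'id=%s' % hostname]
    return is_running(conn, status_args)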
def osd_status_check(conn, cluster):
    """
    Check the status of the cluster's OSDs and make sure all are up and in.

    What good output would look like::

        {
            "epoch": 8,
            "num_osds": 1,
            "num_up_osds": 1,
            "num_in_osds": "1",
            "full": "false",
            "nearfull": "false"
        }

    Note how the booleans are actually strings, so we need to take that
    into account and fix it before returning the dictionary. Issue #8108
    """
    command = [
        'ceph',
        '--cluster={cluster}'.format(cluster=cluster),
        'osd',
        'stat',
        '--format=json',
    ]
    try:
        out, err, code = process.check(
            conn,
            command,
        )
    except TypeError:
        # XXX This is a bug in remoto. If the other end disconnects with
        # a timeout it will return None, and here we are expecting a
        # 3-item tuple, not None, so it will break with a TypeError.
        # Once remoto fixes this, we no longer need this try/except.
        return {}

    try:
        loaded_json = json.loads(''.join(out))
        # convert boolean strings to actual booleans because
        # --format=json fails to do this properly
        for k, v in loaded_json.items():
            if v == 'true':
                loaded_json[k] = True
            elif v == 'false':
                loaded_json[k] = False
        return loaded_json
    except ValueError:
        return {}
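# The 'true'/'false' string conversion above is repeated verbatim in
# osd_tree below; a small helper along these lines (the name is ours, it
# does not exist in the source) would remove the duplication:
def _normalize_json_booleans(loaded_json):
    # convert boolean strings to actual booleans because
    # --format=json fails to do this properly (Issue #8108)
    for k, v in loaded_json.items():
        if v == 'true':
            loaded_json[k] = True
        elif v == 'false':
            loaded_json[k] = False
    return loaded_json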
def osd_tree(conn, cluster):
    """
    Get the OSD tree (``osd tree --format=json``) from a monitor.

    The JSON output has the same boolean-as-string problem described in
    ``osd_status_check`` (Issue #8108), so string booleans are converted
    to actual booleans before the dictionary is returned.
    """
    command = [
        'ceph',
        '--cluster={cluster}'.format(cluster=cluster),
        'osd',
        'tree',
        '--format=json',
    ]
    out, err, code = process.check(
        conn,
        command,
    )

    try:
        loaded_json = json.loads(''.join(out))
        # convert boolean strings to actual booleans because
        # --format=json fails to do this properly
        for k, v in loaded_json.items():
            if v == 'true':
                loaded_json[k] = True
            elif v == 'false':
                loaded_json[k] = False
        return loaded_json
    except ValueError:
        return {}
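# Consumption sketch (illustrative): `osd tree --format=json` returns a
# 'nodes' list whose entries carry 'id', 'name', 'type', and, for OSDs, a
# 'status' of 'up' or 'down'. The reporting loop itself is an assumption.
def _example_report_down_osds(conn, cluster):
    tree = osd_tree(conn, cluster)
    for node in tree.get('nodes', []):
        if node.get('type') == 'osd' and node.get('status') != 'up':
            print('%s is %s' % (node.get('name'), node.get('status')))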
def osd_list(args, cfg):
    # FIXME: this portion should probably be abstracted. We do the same in
    # mon.py
    cfg = conf.ceph.load(args)
    mon_initial_members = cfg.safe_get('global', 'mon_initial_members')
    monitors = re.split(r'[,\s]+', mon_initial_members)
    if not monitors:
        raise exc.NeedHostError(
            'could not find `mon initial members` defined in ceph.conf'
        )

    # get the osd tree from a monitor host
    mon_host = monitors[0]
    distro = hosts.get(mon_host, username=args.username)
    tree = osd_tree(distro.conn, args.cluster)
    distro.conn.exit()

    interesting_files = ['active', 'magic', 'whoami', 'journal_uuid']

    for hostname, disk, journal in args.disk:
        distro = hosts.get(hostname, username=args.username)
        remote_module = distro.conn.remote_module
        osds = remote_module.listdir(constants.osd_path)

        output, err, exit_code = process.check(
            distro.conn,
            [
                'ceph-disk',
                'list',
            ]
        )

        for _osd in osds:
            osd_path = os.path.join(constants.osd_path, _osd)
            journal_path = os.path.join(osd_path, 'journal')
            _id = int(_osd.split('-')[-1])  # split on dash, get the id
            osd_name = 'osd.%s' % _id
            metadata = {}
            json_blob = {}

            # piggy back from ceph-disk and get the mount point
            device = get_osd_mount_point(output, osd_name)
            if device:
                metadata['device'] = device

            # read interesting metadata from files
            for f in interesting_files:
                osd_f_path = os.path.join(osd_path, f)
                if remote_module.path_exists(osd_f_path):
                    metadata[f] = remote_module.readline(osd_f_path)

            # do we have a journal path?
            if remote_module.path_exists(journal_path):
                metadata['journal path'] = remote_module.get_realpath(journal_path)

            # is this OSD in the osd tree?
            for blob in tree['nodes']:
                if blob.get('id') == _id:  # matches our OSD
                    json_blob = blob

            print_osd(
                distro.conn.logger,
                hostname,
                osd_path,
                json_blob,
                metadata,
            )

        distro.conn.exit()
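# get_osd_mount_point is called above but not defined in this section. A
# minimal hypothetical sketch, assuming `ceph-disk list` emits lines such
# as '/dev/sda1 ceph data, active, cluster ceph, osd.0, journal /dev/sda2'
# (both the parsing and the assumed output format are ours, not the
# source's):
def get_osd_mount_point(output, osd_name):
    # Hypothetical reimplementation for illustration only
    for line in output:
        parts = [part.strip() for part in line.split(',')]
        if osd_name in parts:
            return line.split()[0]  # the device, e.g. '/dev/sda1'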
def purgedata(args):
    LOG.debug(
        'Purging data from cluster %s hosts %s',
        args.cluster,
        ' '.join(args.host),
    )

    installed_hosts = []
    for hostname in args.host:
        distro = hosts.get(hostname, username=args.username)
        ceph_is_installed = distro.conn.remote_module.which('ceph')
        if ceph_is_installed:
            installed_hosts.append(hostname)
        distro.conn.exit()

    if installed_hosts:
        LOG.error("ceph is still installed on: %s", installed_hosts)
        raise RuntimeError("refusing to purge data while ceph is still installed")

    for hostname in args.host:
        distro = hosts.get(hostname, username=args.username)
        LOG.info(
            'Distro info: %s %s %s',
            distro.name,
            distro.release,
            distro.codename
        )

        rlogger = logging.getLogger(hostname)
        rlogger.info('purging data on %s' % hostname)

        # Try to remove the contents of /var/lib/ceph first, don't worry
        # about errors here, we deal with them later on
        process.check(
            distro.conn,
            [
                'rm', '-rf', '--one-file-system', '--', '/var/lib/ceph',
            ]
        )

        # If we failed in the previous call, then we probably have OSDs
        # still mounted, so we unmount them here
        if distro.conn.remote_module.path_exists('/var/lib/ceph'):
            rlogger.warning(
                'OSDs may still be mounted, trying to unmount them'
            )
            process.run(
                distro.conn,
                [
                    'find', '/var/lib/ceph',
                    '-mindepth', '1',
                    '-maxdepth', '2',
                    '-type', 'd',
                    '-exec', 'umount', '{}', ';',
                ]
            )
            # And now we try again to remove the contents, since OSDs
            # should be unmounted, but this time we do check for errors
            process.run(
                distro.conn,
                [
                    'rm', '-rf', '--one-file-system', '--', '/var/lib/ceph',
                ]
            )

        process.run(
            distro.conn,
            [
                'rm', '-rf', '--one-file-system', '--', '/etc/ceph/',
            ]
        )

        distro.conn.exit()
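# Invocation sketch (illustrative): purgedata reads 'cluster', 'host', and
# 'username' from an argparse-style namespace; the values below are
# placeholders, not defaults from the source.
def _example_purgedata():
    from argparse import Namespace
    args = Namespace(
        cluster='ceph',
        host=['node1', 'node2'],
        username=None,
    )
    # raises RuntimeError if ceph is still installed on any host
    purgedata(args)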
def create_mds(conn, name, cluster, init):
    path = '/var/lib/ceph/mds/{cluster}-{name}'.format(
        cluster=cluster,
        name=name,
    )
    conn.remote_module.safe_mkdir(path)

    bootstrap_keyring = '/var/lib/ceph/bootstrap-mds/{cluster}.keyring'.format(
        cluster=cluster,
    )
    keypath = os.path.join(path, 'keyring')

    stdout, stderr, returncode = process.check(
        conn,
        [
            'ceph',
            '--cluster', cluster,
            '--name', 'client.bootstrap-mds',
            '--keyring', bootstrap_keyring,
            'auth', 'get-or-create', 'mds.{name}'.format(name=name),
            'osd', 'allow rwx',
            'mds', 'allow',
            'mon', 'allow profile mds',
            '-o', keypath,
        ]
    )
    if returncode > 0 and returncode != errno.EACCES:
        for line in stderr:
            conn.logger.error(line)
        for line in stdout:
            # yes stdout as err because this is an error
            conn.logger.error(line)
        conn.logger.error('exit code from command was: %s' % returncode)
        raise RuntimeError('could not create mds')

    process.check(
        conn,
        [
            'ceph',
            '--cluster', cluster,
            '--name', 'client.bootstrap-mds',
            '--keyring', bootstrap_keyring,
            'auth', 'get-or-create', 'mds.{name}'.format(name=name),
            'osd', 'allow *',
            'mds', 'allow',
            'mon', 'allow rwx',
            '-o', keypath,
        ]
    )

    conn.remote_module.touch_file(os.path.join(path, 'done'))
    conn.remote_module.touch_file(os.path.join(path, init))

    if init == 'upstart':
        process.run(
            conn,
            [
                'initctl',
                'emit',
                'ceph-mds',
                'cluster={cluster}'.format(cluster=cluster),
                'id={name}'.format(name=name),
            ],
            timeout=7,
        )
    elif init == 'sysvinit':
        process.run(
            conn,
            [
                'service',
                'ceph',
                'start',
                'mds.{name}'.format(name=name),
            ],
            timeout=7,
        )
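# Usage sketch (illustrative): follows the connection pattern used by the
# other functions in this section, assuming distro.init resolves to either
# 'upstart' or 'sysvinit' as handled above.
def _example_create_mds(hostname, cluster, username=None):
    distro = hosts.get(hostname, username=username)
    create_mds(distro.conn, hostname, cluster, distro.init)
    distro.conn.exit()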