def service_filtered_ps(self):
    """ Return lines of ps output belonging to processes owned by any of
    the services in self.services.

    Process IDs are resolved from the systemd cgroup filesystem under
    DATA_ROOT and then matched against ps output.

    @return: list of ps output lines (strings).
    """
    ps_filtered = []
    path = os.path.join(HotSOSConfig.DATA_ROOT,
                        'sys/fs/cgroup/unified/system.slice')
    for svc in self.services:
        # NOTE: use a distinct name for the expanded unit so that we do
        # not shadow the outer loop variable (original code reused 'svc').
        for unit in self.get_services_expanded(svc):
            _path = os.path.join(path, "{}.service".format(unit),
                                 'cgroup.procs')
            if not os.path.exists(_path):
                # fall back to units nested under a system-*.slice
                _path = glob.glob(
                    os.path.join(path, 'system-*.slice',
                                 "{}.service".format(unit),
                                 'cgroup.procs'))
                if not _path or not os.path.exists(_path[0]):
                    continue

                _path = _path[0]

            pids = []
            with open(_path) as fd:
                for line in fd:
                    pids.append(int(line))

            for line in CLIHelper().ps():
                for pid in pids:
                    # surround with spaces to avoid substring matches
                    # against longer pids.
                    if " {} ".format(pid) in line:
                        ps_filtered.append(line)

    return ps_filtered
def __init__(self):
    """ Fetch and cache numactl output, treating command failure as
    empty output. """
    self._nodes = {}
    try:
        out = CLIHelper().numactl()
    except OSError:
        # command unavailable/failed - fall back to empty output
        out = None

    self.numactl = out or ""
def path(self):
    """ Return path to input data.

    If a filesystem path is configured it is returned (with a glob
    wildcard appended when all-logs is enabled). Otherwise, if a command
    is configured, its output is dumped to a temporary file and the path
    to that file is returned and cached for subsequent calls.

    @return: path (str) or None if no input was provided.
    """
    if self.fs_path:  # pylint: disable=W0125
        path = os.path.join(HotSOSConfig.DATA_ROOT, self.fs_path)
        if (HotSOSConfig.USE_ALL_LOGS and not
                self.options['disable-all-logs']):
            path = "{}*".format(path)

        return path

    if self.command:  # pylint: disable=W0125
        if self.cmd_tmp_path:
            # cached from a previous call
            return self.cmd_tmp_path

        args_callback = self.options['args-callback']
        if args_callback:
            args, kwargs = self.get_method(args_callback)
        else:
            args = self.options['args']
            kwargs = self.options['kwargs']

        # get command output
        out = getattr(CLIHelper(), self.command)(*args, **kwargs)
        # store in temp file to make it searchable
        # NOTE: we dont need to delete this at the end since they are
        # created in the plugin tmp dir which is wiped at the end of the
        # plugin run.
        # idiom: prefer isinstance() over type() equality checks.
        if isinstance(out, list):
            out = ''.join(out)
        elif isinstance(out, dict):
            out = str(out)

        self.cmd_tmp_path = mktemp_dump(out)
        return self.cmd_tmp_path

    log.debug("no input provided")
def __summary_rootfs(self):
    """ Return the df output line for the root filesystem, or None if
    not found. """
    lines = CLIHelper().df()
    if not lines:
        return None

    rootfs_expr = re.compile(r"(.+\/$)")
    for line in lines:
        match = rootfs_expr.match(line)
        if match:
            return match[1]
def services(self):
    """ Return a dict of identified systemd services and their state.

    Services are represented as either direct or indirect units and
    typically use one or the other. We homogenise these to present
    state based on the one we think is being used. Enabled units are
    aggregated but masked units are not so that they can be identified
    and reported.

    @return: dict of unit name -> SystemdService.
    """
    if self._service_info:
        # cached from a previous call
        return self._service_info

    svc_info = {}
    indirect_svc_info = {}
    for line in CLIHelper().systemctl_list_unit_files():
        for expr in self.service_exprs:
            # Add snap prefix/suffixes
            base_expr = r"(?:snap\.)?{}(?:\.daemon)?".format(expr)
            # NOTE: we include indirect services (ending with @) so that
            # we can search for related units later.
            unit_expr = r'^\s*({}(?:@\S*)?)\.service'.format(base_expr)
            # match entries in systemctl list-unit-files
            unit_files_expr = r'{}\s+(\S+)'.format(unit_expr)

            ret = re.compile(unit_files_expr).match(line)
            if ret:
                unit = ret.group(1)
                state = ret.group(2)
                if unit.endswith('@'):
                    # indirect or "template" units can have
                    # "instantiated" units where only the latter
                    # represents whether the unit is in use. If an
                    # indirect unit has instantiated units we use them
                    # to represent the state of the service.
                    unit_svc_expr = r"\s+({}\d*)".format(unit)
                    unit = unit.partition('@')[0]
                    if self._get_systemd_units(unit_svc_expr):
                        state = 'enabled'

                    indirect_svc_info[unit] = state
                else:
                    svc_info[unit] = SystemdService(unit, state)

    if indirect_svc_info:
        # Allow indirect unit info to override given certain conditions
        for unit, state in indirect_svc_info.items():
            if unit in svc_info:
                # BUG FIX: compare the service's state attribute rather
                # than the SystemdService object itself which would
                # never equal the string 'enabled'.
                if (state == 'disabled' or
                        svc_info[unit].state == 'enabled'):
                    continue

                svc_info[unit].state = state
            else:
                svc_info[unit] = SystemdService(unit, state)

    self._service_info = svc_info
    return self._service_info
def num_cpus(self):
    """ Return number of cpus or 0 if none found. """
    cpu_expr = re.compile(r"^CPU\(s\):\s+([0-9]+)\s*.*")
    for line in CLIHelper().lscpu() or []:
        match = cpu_expr.match(line)
        if match:
            return int(match[1])

    return 0
def virtualisation_type(self):
    """
    @return: virt type e.g. kvm or lxc if host is virtualised otherwise
             None.
    """
    for line in CLIHelper().hostnamectl():
        key, _, value = line.partition(': ')
        if 'Virtualization' in key:
            return value.strip()

    return None
def get_services_expanded(self, name):
    """ Expand a service name into all matching (e.g. templated) unit
    names found in systemctl list-units output.

    @param name: base service name.
    @return: list of matching unit names, or [name] if none matched.
    """
    unit_expr = re.compile(r'^\s*({}(@\S*)?)\.service'.format(name))
    matches = (unit_expr.match(line)
               for line in CLIHelper().systemctl_list_units())
    expanded = [m.group(1) for m in matches if m]
    return expanded or [name]
def _get_systemd_units(self, expr):
    """ Search systemd unit instances.

    @param expr: expression used to match one or more units in
                 --list-units
    @return: list of matched unit names.
    """
    matcher = re.compile(expr)
    matches = (matcher.match(line)
               for line in CLIHelper().systemctl_list_units())
    return [m.group(1) for m in matches if m]
def sysctl_all(self):
    """ Return a dict of all sysctl key/value pairs, cached after the
    first call. Whitespace runs in values are collapsed to a single
    space. """
    if self._sysctl_all is not None:
        return self._sysctl_all

    actuals = {}
    for kv in CLIHelper().sysctl_all():
        key, _, value = kv.partition("=")
        # normalise multi-whitespace into a single space
        actuals[key.strip()] = ' '.join(value.strip().split())

    self._sysctl_all = actuals
    return self._sysctl_all
def __summary_devices(self):
    """ Build a summary of bcache and nvme block devices found in sysfs
    including their dname (disk/by-dname symlink) if resolvable.

    @return: dict keyed by device type then device name, or None if no
             devices were found.
    """
    devs = {}
    for dev_type in ['bcache', 'nvme']:
        dev_expr = re.compile(
            r".+[0-9:]+\s+({}[0-9a-z]+)\s+.+".format(dev_type))
        for line in CLIHelper().ls_lanR_sys_block():
            match = dev_expr.match(line)
            if not match:
                continue

            devs.setdefault(dev_type, {})
            devname = match[1]
            devs[dev_type][devname] = {}
            dname_expr = re.compile(r'.+\s+disk/by-dname/(.+)')
            for info in CLIHelper().udevadm_info_dev(device=devname):
                dname = dname_expr.match(info)
                if dname:
                    devs[dev_type][devname]['dname'] = dname[1]
                elif 'dname' not in devs[dev_type][devname]:
                    # placeholder until/unless a by-dname link is seen
                    devs[dev_type][devname]['dname'] = '<notfound>'

    if devs:
        return devs
def unattended_upgrades_enabled(self):
    """ Return True if APT unattended-upgrades is enabled, False if
    explicitly disabled or not configured, or None if apt config output
    is unavailable.
    """
    apt_config_dump = CLIHelper().apt_config_dump()
    if not apt_config_dump:
        return

    # NOTE: raw string used for the continuation too so the regex is
    # uniformly escaped.
    expr = re.compile(r"^APT::Periodic::Unattended-Upgrade\s+"
                      r"\"([0-9]+)\";")
    for line in apt_config_dump:
        ret = expr.match(line)
        if ret:
            # a value of 0 means explicitly disabled; idiom fix -
            # return the boolean directly instead of if/else True/False.
            return int(ret[1]) != 0

    return False
class KernelLogEventChecks(KernelEventChecksBase):
    """ Event checks run against kernel log events defined in the
    'kernlog' yaml definitions group. """

    def __init__(self):
        # register the kernlog yaml-defined searches and the decorated
        # callbacks below as their handlers.
        super().__init__(yaml_defs_group='kernlog',
                         searchobj=FileSearcher(),
                         callback_helper=EVENTCALLBACKS)
        self.cli_helper = CLIHelper()
        self.hostnet_helper = HostNetworkingHelper()

    @EVENTCALLBACKS.callback()
    def over_mtu_dropped_packets(self, event):
        """ Count over-mtu dropped packet events per interface and, if
        any apply to interfaces that currently exist on the host
        (excluding ovs bridge aliases), raise a NetworkWarning issue.

        @param event: event search results object.
        @return: dict of interface -> drop count sorted by count in
                 descending order, or None if no results apply.
        """
        interfaces = {}
        for r in event.results:
            # result group 1 is the interface name
            if r.get(1) in interfaces:
                interfaces[r.get(1)] += 1
            else:
                interfaces[r.get(1)] = 1

        if interfaces:
            # only report on interfaces that currently exist
            host_interfaces = [
                iface.name
                for iface in self.hostnet_helper.host_interfaces_all]
            # filter out interfaces that are actually ovs bridge aliases
            ovs_bridges = self.cli_helper.ovs_vsctl_list_br()
            # strip trailing newline chars
            ovs_bridges = [br.strip() for br in ovs_bridges]

            interfaces_extant = {}
            for iface in interfaces:
                if iface in host_interfaces:
                    if iface not in ovs_bridges:
                        interfaces_extant[iface] = interfaces[iface]

            if interfaces_extant:
                msg = ("kernel has reported over-mtu dropped packets "
                       "for ({}) "
                       "interfaces.".format(len(interfaces_extant)))
                issue = NetworkWarning(msg)
                IssuesManager().add(issue)

                # sort by number of occurrences
                sorted_dict = {}
                for k, v in sorted(interfaces_extant.items(),
                                   key=lambda e: e[1], reverse=True):
                    sorted_dict[k] = v

                return sorted_dict
def start_time(self):
    """ Get most recent start time of this service unit.

    Scans the unit's journal for "Started"/"Starting" entries and parses
    the timestamp of the last one found.

    @returns: datetime.datetime object or None if time not found.
    """
    if self._start_time:
        # cached from a previous call
        return self._start_time

    # matches e.g. "2022-01-06T10:15:23+0000 host systemd[1]: Started x"
    cexpr = re.compile(r"^(([0-9-]+)T[\d:]+\+[\d]+)\s+.+: "
                       "(Started|Starting) .+")
    journal = CLIHelper().journalctl(unit=self.name)
    last = None
    for line in journal:
        ret = cexpr.search(line)
        if ret:
            # keep overwriting so we end up with the most recent match
            last = ret.group(1)

    if last:
        # NOTE(review): '%f' (microseconds) is being used to consume the
        # digits of the numeric utc offset matched by the regex - this
        # appears intentional but confirm journal timestamps always end
        # in '+<digits>'.
        self._start_time = datetime.strptime(last,
                                             "%Y-%m-%dT%H:%M:%S+%f")

    return self._start_time
def filter_by_age(cls, results, result_age_hours):
    """ Discard results older than result_age_hours relative to the
    current time reported by the host. If no age limit is given or the
    current time cannot be determined, all results are returned
    unfiltered.
    """
    if not result_age_hours:
        log.debug("result age filter not specified - skipping")
        return results

    current = CLIHelper().date(format='+%Y-%m-%d %H:%M:%S')
    if not current:
        log.warning("date() returned unexpected value '%s' - skipping "
                    "filter by age", current)
        return results

    current = datetime.strptime(current, "%Y-%m-%d %H:%M:%S")
    log.debug("applying search filter (result_age_hours=%s, "
              "current='%s')", result_age_hours, current)

    oldest = current - timedelta(hours=result_age_hours)
    filtered = []
    for result in results:
        ts = cls.get_datetime_from_result(result)
        if ts and ts >= oldest:
            filtered.append(result)

    return filtered
def __init__(self):
    """ Register kernlog event search definitions and set up helpers
    used by the event callbacks. """
    super().__init__(callback_helper=EVENTCALLBACKS,
                     searchobj=FileSearcher(),
                     yaml_defs_group='kernlog')
    self.cli_helper = CLIHelper()
    self.hostnet_helper = HostNetworkingHelper()
def loadavg(self):
    """ Return the load average string from uptime output or None. """
    uptime = CLIHelper().uptime()
    if not uptime:
        return None

    match = re.compile(r".+load average:\s+(.+)").match(uptime)
    return match[1] if match else None
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.cli = CLIHelper()
    # cache for udev bcache device info
    self._bcache_devs = []
class BcacheBase(StorageBase):
    """ Base class providing access to bcache device information from
    sysfs and udev. """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # cache for udev bcache device info (see udev_bcache_devs)
        self._bcache_devs = []
        self.cli = CLIHelper()

    @property
    def bcache_enabled(self):
        """ Return True if there are any backing devices configured. """
        for cset in self.get_cachesets():
            if self.get_cacheset_bdevs(cset):
                return True

    def get_cachesets(self):
        # paths of all bcache cachesets under the data root sysfs
        return glob.glob(os.path.join(HotSOSConfig.DATA_ROOT,
                                      'sys/fs/bcache/*'))

    def get_cacheset_bdevs(self, cset):
        # backing device entries within a given cacheset path
        return glob.glob(os.path.join(cset, 'bdev*'))

    def get_sysfs_cachesets(self):
        """ Return a list of cachesets, each a dict containing the
        cacheset uuid and its cache_available_percent value. """
        cachesets = []
        for entry in self.get_cachesets():
            if os.path.exists(os.path.join(entry,
                                           "cache_available_percent")):
                cachesets.append({"path": entry,
                                  "uuid": os.path.basename(entry)})

        for cset in cachesets:
            path = os.path.join(cset['path'], "cache_available_percent")
            with open(path) as fd:
                value = fd.read().strip()
                cset["cache_available_percent"] = int(value)

            # dont include in final output
            del cset["path"]

        return cachesets

    @property
    def udev_bcache_devs(self):
        """ If bcache devices exist fetch information and return as a
        list (cached after the first call). """
        if self._bcache_devs:
            return self._bcache_devs

        udevadm_info = self.cli.udevadm_info_exportdb()
        if not udevadm_info:
            return self._bcache_devs

        # search the udev exportdb dump for bcache device sections and
        # extract the device name and by-uuid link from each.
        s = FileSearcher()
        sdef = SequenceSearchDef(start=SearchDef(r"^P: .+/(bcache\S+)"),
                                 body=SearchDef(
                                     r"^S: disk/by-uuid/(\S+)"),
                                 tag="bcacheinfo")
        s.add_search_term(sdef,
                          utils.mktemp_dump('\n'.join(udevadm_info)))
        results = s.search()
        devs = []
        for section in results.find_sequence_sections(sdef).values():
            dev = {}
            for r in section:
                if r.tag == sdef.start_tag:
                    dev["name"] = r.get(1)
                else:
                    dev["by-uuid"] = r.get(1)

            devs.append(dev)

        self._bcache_devs = devs
        return self._bcache_devs

    def is_bcache_device(self, dev):
        """ Returns True if the device either is or is based on a
        bcache device e.g. dmcrypt device using bcache dev.
        """
        if dev.startswith("bcache"):
            return True

        if dev.startswith("/dev/bcache"):
            return True

        # e.g. /dev/mapper/crypt-<uuid> backed by a bcache device
        ret = re.compile(r"/dev/mapper/crypt-(\S+)").search(dev)
        if ret:
            for dev in self.udev_bcache_devs:
                if dev.get("by-uuid") == ret.group(1):
                    return True

        return False
def date(self):
    """ Return the host date, unformatted. """
    helper = CLIHelper()
    return helper.date(no_format=True)
def hostname(self):
    """ Return the hostname of the host under analysis. """
    helper = CLIHelper()
    return helper.hostname()
class OpenstackNetworkChecks(OpenstackChecksBase):
    """ Summary checks for networking used by Openstack services. """

    def __init__(self):
        super().__init__()
        self.cli = CLIHelper()

    @property
    def summary_subkey(self):
        # all output from this class goes under the 'network' subkey
        return 'network'

    def _get_port_stat_outliers(self, counters):
        """ For a given port's packet counters, identify outliers i.e.
        > 1% and create a new dict with count and percent values.

        @param counters: dict of rx/tx counter dicts.
        @return: dict of outlier stats keyed by rx/tx direction.
        """
        stats = {}
        for rxtx in counters:
            total = sum(counters[rxtx].values())
            for key, value in counters[rxtx].items():
                if key == "packets":
                    # total packet count is the baseline, not a stat
                    continue

                if value:
                    pcent = int(100 / float(total) * float(value))
                    if pcent <= 1:
                        # not an outlier
                        continue

                    if rxtx not in stats:
                        stats[rxtx] = {}

                    stats[rxtx][key] = "{} ({}%)".format(int(value),
                                                         pcent)

        return stats

    def get_config_info(self):
        """ Return dict of bind interfaces configured per project. """
        config_info = {}
        for project in ['nova', 'neutron', 'octavia']:
            _project = getattr(self, project)
            if _project and _project.bind_interfaces:
                for name, port in _project.bind_interfaces.items():
                    if project not in config_info:
                        config_info[project] = {}

                    config_info[project][name] = port.to_dict()

        return config_info

    def get_phy_port_health_info(self):
        """ Identify ports used by Openstack services, include them in
        output for informational purposes along with their health
        (dropped packets etc) for any outliers detected.
        """
        port_health_info = {}
        for project in ['nova', 'neutron', 'octavia']:
            _project = getattr(self, project)
            if _project and _project.bind_interfaces:
                for port in _project.bind_interfaces.values():
                    if port.stats:
                        stats = self._get_port_stat_outliers(port.stats)
                        if not stats:
                            continue

                        port_health_info[port.name] = stats

        return port_health_info

    def __summary_config(self):
        # summary entry: per-project bind interface config
        config_info = self.get_config_info()
        if config_info:
            return config_info

    def __summary_phy_port_health(self):
        # summary entry: physical port health outliers
        port_health_info = self.get_phy_port_health_info()
        if port_health_info:
            return port_health_info

    def __summary_namespaces(self):
        """Populate namespace information dict."""
        ns_info = {}
        for line in self.cli.ip_netns():
            # count namespaces grouped by their type prefix e.g. 'qrouter'
            ret = re.compile(
                r"^([a-z0-9]+)-([0-9a-z\-]+)\s+.+").match(line)
            if ret:
                if ret[1] in ns_info:
                    ns_info[ret[1]] += 1
                else:
                    ns_info[ret[1]] = 1

        if ns_info:
            return ns_info

    def __summary_vm_port_health(self):
        """ For each instance get its ports and check port health,
        reporting on any outliers. """
        if not self.nova.instances:
            return

        port_health_info = {}
        for guest in self.nova.instances.values():
            for port in guest.ports:
                stats = port.stats
                if stats:
                    outliers = self._get_port_stat_outliers(stats)
                    if not outliers:
                        continue

                    if guest.uuid not in port_health_info:
                        port_health_info[guest.uuid] = {}

                    port_health_info[guest.uuid][port.hwaddr] = outliers

        if port_health_info:
            health = {'num-vms-checked': len(self.nova.instances),
                      'stats': port_health_info}
            return health
def __init__(self):
    """ Initialise base class and cli helper used to run commands
    against the data root. """
    super().__init__()
    self.cli = CLIHelper()
def cli(data_root, version, defs_path, all_logs, quiet, debug, save,
        format, html_escape, user_summary, short, very_short, full,
        agent_error_key_by_time, max_logrotate_depth, max_parallel_tasks,
        list_plugins, machine_readable, **kwargs):
    """
    Run this tool on a host or against an unpacked sosreport to perform
    analysis of specific applications and the host itself. A summary of
    information is generated along with any issues or known bugs detected.
    Applications are defined as plugins and support currently includes
    Openstack, Kubernetes, Ceph and more (see --list-plugins). The
    standard output format is yaml to allow easy visual inspection and
    post-processing by other tools. Other formats are also supported.

    There a three main components to this tool; the core python library,
    plugin extensions and a library of checks written in a high level
    yaml-based language.

    \b
    DATA_ROOT
        Path to an unpacked sosreport. If none provided, will run against
        local host.
    """  # noqa
    # remember whether full output was explicitly requested since
    # 'full' may be recomputed below.
    full_mode_explicit = full
    minimal_mode = None
    if short:
        minimal_mode = 'short'
    elif very_short:
        minimal_mode = 'very-short'

    repo_info = get_repo_info()
    if repo_info:
        setup_config(REPO_INFO=repo_info)

    _version = get_version()
    setup_config(HOTSOS_VERSION=_version)
    if version:
        # just print the version and exit
        print(_version)
        return

    if not user_summary:
        if not data_root or data_root == '/':
            data_root = '/'
        elif data_root[-1] != '/':
            # Ensure trailing slash
            data_root += '/'

    setup_config(USE_ALL_LOGS=all_logs, PLUGIN_YAML_DEFS=defs_path,
                 DATA_ROOT=data_root,
                 AGENT_ERROR_KEY_BY_TIME=agent_error_key_by_time,
                 MAX_LOGROTATE_DEPTH=max_logrotate_depth,
                 MAX_PARALLEL_TASKS=max_parallel_tasks,
                 MACHINE_READABLE=machine_readable)

    if debug and quiet:
        sys.stderr.write('ERROR: cannot use both --debug and --quiet\n')
        return

    if debug:
        setup_logging(debug)

    if list_plugins:
        sys.stdout.write('\n'.join(PLUGIN_CATALOG.keys()))
        sys.stdout.write('\n')
        return

    if data_root == '/':
        analysis_target = 'localhost'
    else:
        analysis_target = 'sosreport {}'.format(data_root)

    if quiet:
        show_spinner = False
        spinner_msg = ''
    else:
        # spinner is suppressed when debug logging is on
        show_spinner = not debug
        spinner_msg = 'INFO: analysing {} '.format(analysis_target)

    with progress_spinner(show_spinner, spinner_msg):
        if user_summary:
            # a pre-generated summary was provided - just (re)format it
            log.debug("User summary provided in %s", data_root)
            with open(data_root) as fd:
                summary = yaml.safe_load(fd)
        else:
            # plugins requested via cli flags arrive through kwargs
            plugins = []
            for k, v in kwargs.items():
                if v is True:
                    plugins.append(k)

            if plugins:
                # always run these
                plugins.append('hotsos')
                if 'system' not in plugins:
                    plugins.append('system')

            summary = HotSOSClient().run(plugins)

        formatted = output_filter.apply_output_formatting(
            summary, format, html_escape, minimal_mode)

    if save:
        # derive the output file basename from the input source
        if user_summary:
            output_name = os.path.basename(data_root)
            output_name = output_name.rpartition('.')[0]
        else:
            if data_root != '/':
                if data_root.endswith('/'):
                    data_root = data_root.rpartition('/')[0]

                output_name = os.path.basename(data_root)
            else:
                output_name = "hotsos-{}".format(CLIHelper().hostname())

        if minimal_mode:
            if formatted:
                out = "{}.short.summary".format(output_name)
                with open(out, 'w', encoding='utf-8') as fd:
                    fd.write(formatted)
                    fd.write('\n')

                sys.stdout.write(
                    "INFO: short summary written to {}\n".format(out))

            if full_mode_explicit:
                # regenerate full (non-minimal) output as well
                formatted = output_filter.apply_output_formatting(
                    summary, format, html_escape)

        if not minimal_mode or full_mode_explicit:
            if formatted:
                out = "{}.summary".format(output_name)
                with open(out, 'w', encoding='utf-8') as fd:
                    fd.write(formatted)
                    fd.write('\n')

                sys.stdout.write(
                    "INFO: full summary written to {}\n".format(out))
    else:
        if debug:
            sys.stderr.write('Results:\n')

        if formatted:
            sys.stdout.write("{}\n".format(formatted))
def __init__(self, *args, **kwargs):
    """ Register neutron-router-checks event definitions and set up
    helpers used by the callbacks. """
    super().__init__(*args, yaml_defs_group='neutron-router-checks',
                     callback_helper=EVENTCALLBACKS, **kwargs)
    self.ha_info = NeutronHAInfo()
    self.cli = CLIHelper()
class NeutronL3HAEventChecks(OpenstackEventChecksBase):
    """ Event checks for Neutron L3HA router vrrp transitions. """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, callback_helper=EVENTCALLBACKS,
                         yaml_defs_group='neutron-router-checks',
                         **kwargs)
        self.cli = CLIHelper()
        self.ha_info = NeutronHAInfo()

    def check_vrrp_transitions(self, transitions):
        """ Raise an issue if any router has had more than
        VRRP_TRANSITION_WARN_THRESHOLD transitions.

        @param transitions: dict of router uuid -> {date: count}.
        """
        # there will likely be a large number of transitions if we look
        # across all time so dont run this check.
        if HotSOSConfig.USE_ALL_LOGS:
            return

        max_transitions = 0
        warn_count = 0
        threshold = VRRP_TRANSITION_WARN_THRESHOLD
        for router in transitions:
            # idiom fix: sum the counts directly rather than building a
            # throwaway list with an unused key variable.
            _transitions = sum(transitions[router].values())
            if _transitions > threshold:
                max_transitions = max(_transitions, max_transitions)
                warn_count += 1

        if warn_count:
            msg = ("{} router(s) have had more than {} vrrp transitions "
                   "(max={}) in the last 24 hours.".format(
                       warn_count, threshold, max_transitions))
            IssuesManager().add(NeutronL3HAWarning(msg))

    def journalctl_args(self):
        """ Args callback for event cli command """
        args = []
        kwargs = {'unit': 'neutron-l3-agent'}
        if not HotSOSConfig.USE_ALL_LOGS:
            kwargs['date'] = self.cli.date(format="--iso-8601")

        return args, kwargs

    @EVENTCALLBACKS.callback()
    def vrrp_transitions(self, event):
        """ Collect vrrp transition counts per router per date from
        event search results, run threshold checks and return data for
        inclusion in the summary.

        @param event: event search results object.
        @return: tuple of (results dict, output key) or None.
        """
        transitions = {}
        for r in event.results:
            ts_date = r.get(1)
            vr_id = r.get(2)
            router = self.ha_info.find_router_with_vr_id(vr_id)
            if not router:
                log.debug("no router found with vr_id %s", vr_id)
                continue

            uuid = router.uuid
            if uuid not in transitions:
                transitions[uuid] = {ts_date: 1}
            elif ts_date in transitions[uuid]:
                transitions[uuid][ts_date] += 1
            else:
                transitions[uuid][ts_date] = 1

        if transitions:
            # run checks
            self.check_vrrp_transitions(transitions)

            # add info to summary
            return {'transitions': transitions}, 'keepalived'

    def __summary_neutron_l3ha(self):
        # summary entry populated from the processed event results
        return self.final_event_results