def test_filesearcher_error(self):
    s = FileSearcher()
    with mock.patch.object(SearchResult, '__init__') as mock_init:

        def fake_init(*args, **kwargs):
            raise EOFError("some error")

        mock_init.side_effect = fake_init
        path = os.path.join(os.environ["DATA_ROOT"])
        s.add_search_term(SearchDef("."), path)
        s.search()
def test_filesearcher_network_info(self):
    filepath = os.path.join(os.environ["DATA_ROOT"], 'sos_commands',
                            'networking', 'ip_-d_address')
    filepath2 = os.path.join(os.environ["DATA_ROOT"], 'sos_commands',
                             'networking', 'ip_-s_-d_link')
    ip = "10.10.101.33"
    mac = "ac:1f:6b:9e:d8:44"
    s = FileSearcher()

    sd = SearchDef(r".+({}).+".format(ip))
    s.add_search_term(sd, filepath)

    sd = SearchDef(r"^\s+link/ether\s+({})\s+.+".format(mac))
    s.add_search_term(sd, filepath2)

    results = s.search()
    self.assertEquals(set(results.files), set([filepath, filepath2]))
    self.assertEquals(len(results.find_by_path(filepath)), 1)
    self.assertEquals(len(results.find_by_path(filepath2)), 3)

    self.assertEquals(results.find_by_path(filepath)[0].linenumber, 16)
    for result in results.find_by_path(filepath):
        self.assertEquals(result.get(1), ip)

    expected = {8: mac, 15: mac, 22: mac}
    for result in results.find_by_path(filepath2):
        ln = result.linenumber
        self.assertEquals(result.tag, None)
        self.assertEquals(result.get(1), expected[ln])
def detect_known_bugs():
    """Search the juju logs for known bugs and report any that are found."""
    known_bugs = {
        1910958: {
            "description": ("Unit fails to start complaining there are "
                            "members in the relation."),
            "pattern": (
                r'.* manifold worker returned unexpected error: failed to '
                r'initialize uniter for "[A-Za-z0-9-]+": cannot create '
                r'relation state tracker: cannot remove persisted state, '
                r'relation \d+ has members'),
            "hint": "manifold worker returned unexpected error",
        }
    }

    s = FileSearcher()
    for bug in known_bugs:
        sd = SearchDef(known_bugs[bug]["pattern"], tag=bug,
                       hint=known_bugs[bug]["hint"])
        s.add_search_term(sd, f"{JUJU_LOG_PATH}/*")

    results = s.search()
    for bug in known_bugs:
        if results.find_by_tag(bug):
            add_known_bug(bug, known_bugs[bug]["description"])
def test_sequence_searcher_section_start_end_same(self):
    """
    Test scenario:
     * multiple sections that end with start of the next
     * start def matches unique start
     * end def matches any start
    """
    with tempfile.NamedTemporaryFile(mode='w', delete=False) as ftmp:
        ftmp.write(SEQ_TEST_7)
        ftmp.close()
        s = FileSearcher()
        sd = SequenceSearchDef(start=SearchDef(r"^section (2)"),
                               body=SearchDef(r"\d_\d"),
                               end=SearchDef(r"^section (\d+)"),
                               tag="seq-search-test7")
        s.add_search_term(sd, path=ftmp.name)
        results = s.search()
        sections = results.find_sequence_sections(sd)
        self.assertEqual(len(sections), 1)
        for id in sections:
            for r in sections[id]:
                if r.tag == sd.start_tag:
                    self.assertEqual(r.get(1), "2")
                elif r.tag == sd.body_tag:
                    self.assertTrue(r.get(0) in ["2_1"])

        os.remove(ftmp.name)
def get_osd_lvm_info(self):
    if not self.ceph_volume_lvm_list:
        return

    ceph_osds = self.services.get("ceph-osd")
    if not ceph_osds:
        return 0

    f_ceph_volume_lvm_list = mktemp_dump(
        '\n'.join(self.ceph_volume_lvm_list))
    s = FileSearcher()
    sd = SequenceSearchDef(start=SearchDef(r"^=+\s+osd\.(\d+)\s+=+.*"),
                           body=SearchDef([
                               r"\s+osd\s+(fsid)\s+(\S+)\s*",
                               r"\s+(devices)\s+([\S]+)\s*"]),
                           tag="ceph-lvm")
    s.add_search_term(sd, path=f_ceph_volume_lvm_list)
    info = {}
    for results in s.search().find_sequence_sections(sd).values():
        _osd_id = None
        _info = {}
        for result in results:
            if result.tag == sd.start_tag:
                _osd_id = int(result.get(1))
            elif result.tag == sd.body_tag:
                if result.get(1) == "fsid":
                    _info["fsid"] = result.get(2)
                elif result.get(1) == "devices":
                    _info["dev"] = result.get(2)

        info[_osd_id] = _info

    os.unlink(f_ceph_volume_lvm_list)
    return info
class KernelNetworkChecks(KernelChecksBase):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.search_obj = None

    def check_mtu_dropped_packets(self):
        ifaces = {}
        for r in self.results.find_by_tag("over-mtu"):
            if r.get(1) in ifaces:
                ifaces[r.get(1)] += 1
            else:
                ifaces[r.get(1)] = 1

        if ifaces:
            helper = HostNetworkingHelper()
            # only report the issue if the interfaces actually exist
            raise_issue = False
            host_interfaces = helper.get_host_interfaces(
                include_namespaces=True)
            ifaces_extant = []
            for iface in ifaces:
                if iface in host_interfaces:
                    raise_issue = True
                    ifaces_extant.append(iface)

            if raise_issue:
                msg = ("kernel has reported over-mtu dropped packets for ({}) "
                       "interfaces".format(len(ifaces_extant)))
                issue = issue_types.NetworkWarning(msg)
                issues_utils.add_issue(issue)

            # sort by number of occurrences
            sorted_dict = {}
            for k, v in sorted(ifaces.items(), key=lambda e: e[1],
                               reverse=True):
                sorted_dict[k] = v

            KERNEL_INFO["over-mtu-dropped-packets"] = sorted_dict

    def register_mtu_dropped_packets_search(self):
        path = os.path.join(constants.DATA_ROOT, 'var/log/kern.log')
        if constants.USE_ALL_LOGS:
            path = path + "*"

        sdef = SearchDef(r".+\] (\S+): dropped over-mtu packet",
                         hint="dropped", tag="over-mtu")
        self.search_obj.add_search_term(sdef, path)

    def __call__(self):
        self.search_obj = FileSearcher()
        self.register_mtu_dropped_packets_search()
        self.results = self.search_obj.search()
        self.check_mtu_dropped_packets()
def run_agent_exception_checks():
    s = FileSearcher()
    checks = [CommonAgentChecks(s)]
    for check in checks:
        check.register_search_terms()

    results = s.search()
    for check in checks:
        check_results = check.process_results(results)
        if check_results:
            AGENT_CHECKS_RESULTS["agent-exceptions"] = check_results
def run_agent_checks():
    s = FileSearcher()
    checks = [NeutronAgentEventChecks(s, root="neutron-agent-checks"),
              NeutronAgentBugChecks(s, root="neutron")]
    for check in checks:
        check.register_search_terms()

    results = s.search()
    for check in checks:
        check_results = check.process_results(results)
        if check_results:
            AGENT_CHECKS_RESULTS["agent-checks"] = check_results
def test_search_filter_invert_match(self):
    with tempfile.NamedTemporaryFile(mode='w', delete=False) as ftmp:
        ftmp.write(FILTER_TEST_1)
        ftmp.close()
        s = FileSearcher()
        fd = FilterDef(r" (ERROR)", invert_match=True)
        s.add_filter_term(fd, path=ftmp.name)
        sd = SearchDef(r".+ INFO (.+)")
        s.add_search_term(sd, path=ftmp.name)
        results = s.search().find_by_path(ftmp.name)
        self.assertEqual(len(results), 1)
        for r in results:
            self.assertEqual(r.get(1), "blah")

        os.remove(ftmp.name)
class OpenvSwitchDaemonChecksBase(object):

    def __init__(self):
        self.search_obj = FileSearcher()
        self.results = []

    def register_search_terms(self):
        raise NotImplementedError

    def process_results(self):
        raise NotImplementedError

    def __call__(self):
        self.register_search_terms()
        self.results = self.search_obj.search()
        self.process_results()
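A minimal sketch (not from the source) of how a concrete subclass might fill in the register/process contract of OpenvSwitchDaemonChecksBase; the log path, regex and "ovs-errors" tag are illustrative assumptions, and only the FileSearcher/SearchDef calls seen elsewhere in this section are used:

# Hypothetical example only: shows the register_search_terms/process_results
# hooks being driven by the base class __call__ above.
class OpenvSwitchDaemonErrorChecks(OpenvSwitchDaemonChecksBase):

    def register_search_terms(self):
        # path and tag are assumptions made for illustration
        path = os.path.join(constants.DATA_ROOT,
                            'var/log/openvswitch/ovs-vswitchd.log')
        sdef = SearchDef(r".+\|(ERR|WARN)\|.+", tag="ovs-errors")
        self.search_obj.add_search_term(sdef, path)

    def process_results(self):
        # self.results is set by __call__ in the base class
        return len(self.results.find_by_tag("ovs-errors"))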
def run_agent_checks():
    s = FileSearcher()
    checks = [NeutronL3AgentEventChecks(s),
              NeutronOVSAgentEventChecks(s),
              NeutronAgentBugChecks(s),
              ]
    for check in checks:
        check.register_search_terms()

    results = s.search()
    for check in checks:
        check_results = check.process_results(results)
        if check_results:
            key = check.master_results_key
            AGENT_CHECKS_RESULTS["agent-checks"][key] = check_results
def detect_known_bugs():
    """Search the juju logs for any known bugs and report those found."""
    data_source = f"{JUJU_LOG_PATH}/*.log"
    if constants.USE_ALL_LOGS:
        data_source = f"{data_source}*"

    s = FileSearcher()
    for bugdef in BUG_SEARCHES:
        s.add_search_term(bugdef, data_source)

    results = s.search()
    for bugdef in BUG_SEARCHES:
        bug_results = results.find_by_tag(bugdef.tag)
        if bug_results:
            reason = bugdef.render_reason(bug_results[0])
            add_known_bug(bugdef.tag, reason)
def __call__(self):
    super().__call__()
    data_source = os.path.join(constants.DATA_ROOT, CEPH_LOGS, 'ceph*.log')
    if constants.USE_ALL_LOGS:
        data_source = "{}*".format(data_source)

    s = FileSearcher()
    for search in SEARCHES:
        s.add_search_term(search, data_source)

    self.results = s.search()
    self.process_osd_failure_reports()
    self.process_mon_elections()
    self.process_slow_requests()
    self.process_crc_bluestore()
    self.process_crc_rocksdb()
    self.process_long_heartbeat()
    self.process_heartbeat_no_reply()
def test_sequence_searcher_overlapping_incomplete(self):
    with tempfile.NamedTemporaryFile(mode='w', delete=False) as ftmp:
        ftmp.write(SEQ_TEST_3)
        ftmp.close()
        s = FileSearcher()
        sd = SequenceSearchDef(start=SearchDef(
                                        r"^(a\S*) (start\S*) point\S*"),
                               body=SearchDef(r"leads to"),
                               end=SearchDef(r"^an (ending)$"),
                               tag="seq-search-test3")
        s.add_search_term(sd, path=ftmp.name)
        results = s.search()
        sections = results.find_sequence_sections(sd)
        self.assertEqual(len(sections), 1)
        for id in sections:
            for r in sections[id]:
                if r.tag == sd.start_tag:
                    self.assertEqual(r.get(1), "another")
                elif r.tag == sd.end_tag:
                    self.assertEqual(r.get(1), "ending")

        os.remove(ftmp.name)
class RabbitMQClusterChecks(RabbitMQChecksBase):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.searcher = FileSearcher()

    def check_log_errors(self):
        path = os.path.join(constants.DATA_ROOT,
                            'var/log/rabbitmq/rabbit@*.log')
        if constants.USE_ALL_LOGS:
            path = f"{path}*"

        self.searcher.add_search_term(
            SearchDef(r".+ \S+_partitioned_network", tag="partitions"),
            path=path)
        results = self.searcher.search()
        if results.find_by_tag("partitions"):
            msg = ("cluster either has or has had partitions - check "
                   "cluster_status")
            issues_utils.add_issue(issue_types.RabbitMQWarning(msg))

    def __call__(self):
        super().__call__()
        self.check_log_errors()
def get_osd_rss(self, osd_id):
    """Return memory RSS for a given OSD.

    NOTE: this assumes we have ps auxwwwm format.
    """
    ceph_osds = self.services.get("ceph-osd")
    if not ceph_osds:
        return 0

    f_osd_ps_cmds = mktemp_dump('\n'.join(ceph_osds['ps_cmds']))

    s = FileSearcher()
    # columns: USER PID %CPU %MEM VSZ RSS TTY STAT START TIME COMMAND
    sd = SearchDef(r"\S+\s+\d+\s+\S+\s+\S+\s+\d+\s+(\d+)\s+.+/ceph-osd\s+"
                   r".+--id\s+{}\s+.+".format(osd_id))
    s.add_search_term(sd, path=f_osd_ps_cmds)

    rss = 0
    # we only expect one result
    for result in s.search().find_by_path(f_osd_ps_cmds):
        rss = int(int(result.get(1)) / 1024)
        break

    os.unlink(f_osd_ps_cmds)
    return rss
def test_sequence_searcher_multiple_sections(self):
    with tempfile.NamedTemporaryFile(mode='w', delete=False) as ftmp:
        ftmp.write(SEQ_TEST_5)
        ftmp.close()
        s = FileSearcher()
        sd = SequenceSearchDef(start=SearchDef(
                                        r"^(a\S*) (start\S*) point\S*"),
                               body=SearchDef(r"value is (\S+)"),
                               end=SearchDef(r"^$"),
                               tag="seq-search-test5")
        s.add_search_term(sd, path=ftmp.name)
        results = s.search()
        sections = results.find_sequence_sections(sd)
        self.assertEqual(len(sections), 2)
        for id in sections:
            for r in sections[id]:
                if r.tag == sd.start_tag:
                    self.assertEqual(r.get(1), "another")
                elif r.tag == sd.body_tag:
                    self.assertTrue(r.get(1) in ["3", "4"])
                elif r.tag == sd.end_tag:
                    self.assertEqual(r.get(0), "")

        os.remove(ftmp.name)
        # process the results
        for service in AGENT_DAEMON_NAMES:
            self.process_agent_results(results, service)

        self.process_bug_results(results)


if __name__ == "__main__":
    s = FileSearcher()
    common_checks = CommonAgentChecks(s)
    common_checks.add_agents_issues_search_terms()
    neutron_checks = NeutronAgentChecks(s)
    neutron_checks.add_rpc_loop_search_terms()
    neutron_checks.add_router_event_search_terms()
    results = s.search()
    neutron_checks.process_rpc_loop_results(results)
    neutron_checks.process_router_event_results(results)
    common_checks.process_agent_issues_results(results)
    AGENT_CHECKS = {"agent-checks": {}}
    if common_checks.agent_log_issues:
        AGENT_CHECKS["agent-checks"]["agent-issues"] = \
            common_checks.agent_log_issues

    if neutron_checks.ovs_agent_info:
        AGENT_CHECKS["agent-checks"]["neutron-ovs-agent"] = \
            neutron_checks.ovs_agent_info

    if neutron_checks.l3_agent_info:
def _get_port_stats(self, name=None, mac=None):
    """Get ip link stats for the given port."""
    ip_link_show = cli_helpers.get_ip_link_show()
    stats_raw = []

    if mac:
        libvirt_mac = "fe" + mac[2:]

    exprs = []
    if mac:
        for _mac in [mac, libvirt_mac]:
            exprs.append(r"\s+link/ether\s+({})\s+.+".format(_mac))
    else:
        exprs.append(r"\d+:\s+({}):\s+.+".format(name))

    with tempfile.NamedTemporaryFile(mode='w', delete=False) as ftmp:
        ftmp.write(''.join(ip_link_show))
        ftmp.close()
        s = FileSearcher()
        sd = SequenceSearchDef(
                # match start of interface
                start=SearchDef(r"^(?:{})".format('|'.join(exprs))),
                # match body of interface
                body=SearchDef(r".+"),
                # match next interface or EOF
                end=SearchDef(r"(?:^\d+:\s+\S+:.+|^$)"),
                tag="ifaces")
        s.add_search_term(sd, path=ftmp.name)
        results = s.search()
        for results in results.find_sequence_sections(sd).values():
            for result in results:
                if result.tag == sd.body_tag:
                    stats_raw.append(result.get(0))

            # stop at first match - if matching by mac address it is
            # possible for multiple interfaces to have the same mac e.g.
            # bonds and their interfaces but we don't support that so just
            # use the first.
            break

        os.unlink(ftmp.name)

    stats = {}
    total_packets = float(0)
    if stats_raw:
        for i, line in enumerate(stats_raw):
            ret = re.compile(r"\s+[RT]X:\s+.+").findall(line)
            if ret:
                ret = re.compile(r"\s*([a-z]+)\s*").findall(line)
                if ret:
                    for j, column in enumerate(ret):
                        value = int(stats_raw[i + 1].split()[j])
                        if column == "packets":
                            total_packets = float(value)
                            continue

                        for key in ["dropped", "errors"]:
                            if column == key:
                                if not value:
                                    continue

                                percentage = int(
                                    (100 / total_packets) * value)
                                # only report if > 0% drops/errors
                                if percentage > 0:
                                    stats[key] = ("{} ({}%)".format(
                                        value, percentage))

    return stats
def get_events(event_name, data_source):
    ext_event_info = {}
    events = {}

    s = FileSearcher()

    # look for sequence starter
    if event_name == "network-vif-plugged":
        sd = SearchDef(r".+\[instance: (\S+)\].+Preparing to wait for "
                       r"external event ({})-(\S+)\s+".format(event_name))
        s.add_search_term(sd, data_source)
    elif event_name == "network-changed":
        sd = SearchDef(
            r".+\[instance: (\S+)\].+Received event ({})-(\S+)\s+".format(
                event_name))
        s.add_search_term(sd, data_source)

    master_results = s.search()

    # now start a fresh one
    s = FileSearcher()

    for file, results in master_results:
        for result in results:
            instance_id = result.get(1)
            event_id = result.get(3)
            events[event_id] = {"instance_id": instance_id,
                                "data_source": file}

            for stage in EXT_EVENT_META[event_name]["stages_keys"]:
                expr = (
                    r".+\[instance: {}\]\s+{}\s.*\s?event\s+{}-{}.? .+".format(
                        instance_id, stage, event_name, event_id))
                tag = "{}_{}_{}".format(instance_id, event_id, stage)
                sd = SearchDef(expr, tag, hint=event_name)
                s.add_search_term(sd, data_source)

    results = s.search()
    for event_id in events:
        instance_id = events[event_id]["instance_id"]
        data_source = events[event_id]["data_source"]
        stages = get_state_dict(event_name)
        for stage in stages:
            tag = "{}_{}_{}".format(instance_id, event_id, stage)
            r = results.find_by_tag(tag, path=data_source)
            if r:
                stages[stage] = True

        if all([stages[stage] for stage in stages]):
            result = "succeeded"
        else:
            result = "failed"

        if event_name not in ext_event_info:
            ext_event_info[event_name] = {}

        if result not in ext_event_info[event_name]:
            ext_event_info[event_name][result] = []

        ext_event_info[event_name][result].append({
            "port": event_id,
            "instance": instance_id})

    if ext_event_info:
        for event in ext_event_info:
            if event not in EXT_EVENT_INFO:
                EXT_EVENT_INFO[event] = {}

            for result in ext_event_info[event]:
                s = ext_event_info[event][result]
                EXT_EVENT_INFO[event][result] = list(s)
class RabbitMQServiceChecks(RabbitMQServiceChecksBase):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        ftmp = tempfile.mktemp()
        _report = cli_helpers.get_rabbitmqctl_report()
        # save to file so we can search it later
        with open(ftmp, 'w') as fd:
            fd.write(''.join(_report))

        self.report_path = ftmp
        self.searcher = FileSearcher()
        self.resources = {}

    def __del__(self):
        if os.path.exists(self.report_path):
            os.unlink(self.report_path)

    def get_running_services_info(self):
        """Get string info for running services."""
        if self.services:
            RABBITMQ_INFO["services"] = self.get_service_info_str()

    def get_queues(self):
        """Get distribution of queues across cluster."""
        sd = self._sequences["queues"]["searchdef"]
        vhost_queues = {}
        raise_issues = []
        for results in self.results.find_sequence_sections(sd).values():
            vhost = None
            queues = {}
            for result in results:
                if result.tag == sd.start_tag:
                    vhost = result.get(1)
                elif result.tag == sd.body_tag:
                    info = {"pid_name": result.get(1),
                            "queue": result.get(2)}
                    if info["pid_name"] not in queues:
                        queues[info["pid_name"]] = 1
                    else:
                        queues[info["pid_name"]] += 1

            vhost_queues[vhost] = {}
            if len(queues.keys()) == 0:
                continue

            total = functools.reduce(lambda x, y: x + y,
                                     list(queues.values()), 0)
            vhost_queues[vhost] = {}
            for pid in queues:
                if total > 0:
                    fraction = queues[pid] / total
                    fraction_string = "{:.2f}%".format(fraction * 100)
                    if fraction > 2 / 3:
                        raise_issues.append(
                            "{} holds more than 2/3 of queues".format(pid))
                else:
                    fraction_string = "N/A"

                vhost_queues[vhost][pid] = "{:d} ({})".format(
                    queues[pid], fraction_string)

        for issue in raise_issues:
            issues_utils.add_issue(issue_types.RabbitMQWarning(issue))

        if vhost_queues:
            # list all vhosts but only show their queues if not []
            self.resources["vhosts"] = sorted(list(vhost_queues.keys()))
            self.resources["vhost-queue-distributions"] = \
                {k: v for k, v in vhost_queues.items() if v}

    def get_queue_connection_distribution(self):
        """Get distribution of connections across cluster."""
        sd = self._sequences["connections"]["searchdef"]
        queue_connections = {}
        for results in self.results.find_sequence_sections(sd).values():
            for result in results:
                if result.tag == sd.body_tag:
                    queue_name = result.get(1)
                    if queue_name not in queue_connections:
                        queue_connections[queue_name] = 1
                    else:
                        queue_connections[queue_name] += 1

        if queue_connections:
            self.resources["queue-connections"] = queue_connections

    def get_memory_used(self):
        """Get the memory used per broker."""
        sd = self._sequences["memory"]["searchdef"]
        memory_used = {}
        for results in self.results.find_sequence_sections(sd).values():
            for result in results:
                if result.tag == sd.start_tag:
                    node_name = result.get(1)
                elif result.tag == sd.body_tag:
                    total = result.get(1)
                    mib_used = int(total) / 1024. / 1024.
                    memory_used[node_name] = "{:.3f}".format(mib_used)

        if memory_used:
            self.resources["memory-used-mib"] = memory_used

    def register_report_searches(self):
        """Register all sequence search definitions that we will execute
        against rabbitmqctl report.
        """
        self._sequences = {
            "queues": {
                "searchdef": SequenceSearchDef(
                    start=SearchDef(r"^Queues on ([^:]+):"),
                    body=SearchDef(r"^<([^.\s]+)[.0-9]+>\s+(\S+)\s+.+"),
                    end=SearchDef(r"^$"),
                    tag="queues"),
                "callbacks": [self.get_queues]
                },
            "connections": {
                "searchdef": SequenceSearchDef(
                    start=SearchDef(r"^Connections:$"),
                    body=SearchDef(r"^<(rabbit[^>.]*)(?:[.][0-9]+)+>.*$"),
                    end=SearchDef(r"^$"),
                    tag="connections"),
                "callbacks": [self.get_queue_connection_distribution]
                },
            "memory": {
                "searchdef": SequenceSearchDef(
                    start=SearchDef(r"^Status of node '([^']*)'$"),
                    body=SearchDef(r"^\s+\[{total,([0-9]+)}.+"),
                    end=SearchDef(r"^$"),
                    tag="memory"),
                "callbacks": [self.get_memory_used]
                }
            }
        for s in self._sequences.values():
            self.searcher.add_search_term(s["searchdef"], self.report_path)

    def run_report_callbacks(self):
        for s in self._sequences.values():
            for f in s["callbacks"]:
                f()

    def run_report_searches(self):
        self.register_report_searches()
        self.results = self.searcher.search()
        self.run_report_callbacks()
        if not self.resources:
            return

        RABBITMQ_INFO["resources"] = self.resources

    def __call__(self):
        super().__call__()
        self.get_running_services_info()
        self.run_report_searches()
class NeutronL3HAChecks(object):

    def __init__(self):
        self.searcher = FileSearcher()

    def get_neutron_ha_info(self):
        ha_state_path = os.path.join(constants.DATA_ROOT, NEUTRON_HA_PATH)
        if not os.path.exists(ha_state_path):
            return

        vrrp_states = {}
        router_states = {}
        for entry in os.listdir(ha_state_path):
            entry = os.path.join(ha_state_path, entry)
            if os.path.isdir(entry):
                pid_path = "{}{}".format(entry, ".pid.keepalived-vrrp")
                keepalived_conf_path = os.path.join(entry, "keepalived.conf")
                state_path = os.path.join(entry, "state")
                if os.path.exists(state_path):
                    with open(state_path) as fd:
                        router = os.path.basename(entry)
                        state = fd.read().strip()
                        if state in router_states:
                            router_states[state].append(router)
                        else:
                            router_states[state] = [router]

                    if os.path.isfile(keepalived_conf_path):
                        with open(keepalived_conf_path) as fd:
                            for line in fd:
                                expr = ".+ virtual_router_id ([0-9]+)"
                                ret = re.compile(expr).search(line)
                                if ret:
                                    ROUTER_VR_IDS[router] = ret.group(1)

                    if os.path.isfile(pid_path):
                        with open(pid_path) as fd:
                            pid = fd.read().strip()
                            vrrp_states[router] = pid

        if router_states:
            L3HA_CHECKS["agent"] = router_states

        if vrrp_states:
            L3HA_CHECKS["keepalived"] = vrrp_states

    def get_vrrp_transitions(self):
        if "keepalived" not in L3HA_CHECKS:
            return

        transitions = {}
        with tempfile.NamedTemporaryFile(mode='w', delete=False) as ftmp:
            if not constants.USE_ALL_LOGS:
                date = cli_helpers.get_date(format="--iso-8601").rstrip()
            else:
                date = None

            out = cli_helpers.get_journalctl(unit="neutron-l3-agent",
                                             date=date)
            ftmp.write(''.join(out))
            ftmp.close()

            for router in L3HA_CHECKS["keepalived"]:
                vr_id = ROUTER_VR_IDS[router]
                expr = (r"^(\S+) [0-9]+ [0-9:]+ \S+ Keepalived_vrrp"
                        r"\[([0-9]+)\]: VRRP_Instance\(VR_{}\) .+ (\S+) "
                        "STATE.*".format(vr_id))
                d = SearchDef(expr, tag=router)
                self.searcher.add_search_term(d, ftmp.name)

            results = self.searcher.search()
            for router in L3HA_CHECKS["keepalived"]:
                transitions[router] = len(results.find_by_tag(router))

            os.unlink(ftmp.name)

        if transitions:
            L3HA_CHECKS["keepalived"] = {"transitions": {}}
            for k, v in sorted(transitions.items(), key=lambda x: x[1],
                               reverse=True):
                L3HA_CHECKS["keepalived"]["transitions"][k] = v

    def check_vrrp_transitions(self):
        if "transitions" not in L3HA_CHECKS.get("keepalived", {}):
            return

        max_transitions = 0
        warn_count = 0
        threshold = VRRP_TRANSITION_WARN_THRESHOLD
        for router in L3HA_CHECKS["keepalived"]["transitions"]:
            transitions = L3HA_CHECKS["keepalived"]["transitions"][router]
            if transitions > threshold:
                max_transitions = max(transitions, max_transitions)
                warn_count += 1

        if warn_count:
            msg = ("{} router(s) have had more than {} vrrp transitions "
                   "(max={}) in the last 24 hours".format(
                       warn_count, threshold, max_transitions))
            issues_utils.add_issue(issue_types.NeutronL3HAWarning(msg))

    def __call__(self):
        self.get_neutron_ha_info()
        self.get_vrrp_transitions()
        # there will likely be a large number of transitions if we look
        # across all time so don't run this check.
        if not constants.USE_ALL_LOGS:
            self.check_vrrp_transitions()
def test_filesearcher_logs(self):
    expected = {4: '2021-02-25 14:22:18.861',
                16: '2021-02-25 14:22:19.587'}
    logs_root = "var/log/neutron/"
    filepath = os.path.join(os.environ["DATA_ROOT"], logs_root,
                            'neutron-openvswitch-agent.log')
    globpath = os.path.join(os.environ["DATA_ROOT"], logs_root,
                            'neutron-l3-agent.log*')
    globpath_file1 = os.path.join(os.environ["DATA_ROOT"], logs_root,
                                  'neutron-l3-agent.log')
    globpath_file2 = os.path.join(os.environ["DATA_ROOT"], logs_root,
                                  'neutron-l3-agent.log.1.gz')

    s = FileSearcher()
    sd = SearchDef(r'^(\S+\s+[0-9:\.]+)\s+.+full sync.+', tag="T1")
    s.add_search_term(sd, filepath)
    sd = SearchDef(r'^(\S+\s+[0-9:\.]+)\s+.+ERROR.+', tag="T2")
    s.add_search_term(sd, filepath)
    sd = SearchDef((r'^(\S+\s+[0-9:\.]+)\s+.+ INFO .+ Router '
                    '9b8efc4c-305b-48ce-a5bd-624bc5eeee67.+'), tag="T3")
    s.add_search_term(sd, globpath)
    sd = SearchDef(r'non-existant-pattern', tag="T4")
    # search for something that doesn't exist to test that code path
    s.add_search_term(sd, globpath)

    results = s.search()
    self.assertEquals(set(results.files),
                      set([filepath, globpath_file2, globpath_file1]))
    self.assertEquals(len(results.find_by_path(filepath)), 37)

    tag_results = results.find_by_tag("T1", path=filepath)
    self.assertEquals(len(tag_results), 2)
    for result in tag_results:
        ln = result.linenumber
        self.assertEquals(result.tag, "T1")
        self.assertEquals(result.get(1), expected[ln])

    tag_results = results.find_by_tag("T1")
    self.assertEquals(len(tag_results), 2)
    for result in tag_results:
        ln = result.linenumber
        self.assertEquals(result.tag, "T1")
        self.assertEquals(result.get(1), expected[ln])

    self.assertEquals(len(results.find_by_path(globpath_file1)), 1)
    self.assertEquals(len(results.find_by_path(globpath_file2)), 0)

    # these files have the same content so expect same result from both
    expected = {81: '2021-03-25 18:10:15.179'}
    path_results = results.find_by_path(globpath_file1)
    for result in path_results:
        ln = result.linenumber
        self.assertEquals(result.tag, "T3")
        self.assertEquals(result.get(1), expected[ln])

    path_results = results.find_by_path(globpath_file2)
    for result in path_results:
        ln = result.linenumber
        self.assertEquals(result.tag, "T3")
        self.assertEquals(result.get(1), expected[ln])
class RabbitMQServiceChecks(RabbitMQChecksBase):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        out = cli_helpers.get_rabbitmqctl_report()
        # save to file so we can search it later
        self.f_report = mktemp_dump(''.join(out))
        self.searcher = FileSearcher()
        self.resources = {}

    def __del__(self):
        if os.path.exists(self.f_report):
            os.unlink(self.f_report)

    def get_running_services_info(self):
        """Get string info for running services."""
        if self.services:
            RABBITMQ_INFO["services"] = self.get_service_info_str()

    def register_report_searches(self):
        """Register all sequence search definitions that we will execute
        against rabbitmqctl report.

        NOTE: the rabbitmqctl report output differs between versions 3.6.x
              and 3.8.x and we try to account for either by providing
              optional regex expressions to match either.
        """
        self._sequences = {
            "queues": {
                "searchdef": SequenceSearchDef(
                    start=SearchDef([r"^Queues on ([^:]+):",
                                     (r"^Listing queues for vhost ([^:]+) "
                                      r"...")]),
                    # NOTE: we don't use a list for the body here because
                    # we need to know which expression matched so that we
                    # can know in which order to retrieve the columns since
                    # their order is inverted between 3.6.x and 3.8.x
                    body=SearchDef(r"^(?:<([^.\s]+)[.0-9]+>\s+(\S+)|"
                                   r"(\S+)\s+(?:\S+\s+){4}<([^.\s]+)[.0-9]"
                                   r"+>)\s+.+"),
                    end=SearchDef(r"^$"),
                    tag="queues"),
                "callbacks": [self.get_queue_info]
                },
            "connections": {
                "searchdef": SequenceSearchDef(
                    start=SearchDef([r"^Connections:$",
                                     r"^Listing connections ...$"]),
                    body=SearchDef(r"^<(rabbit[^>.]*)(?:[.][0-9]+)+>.*$"),
                    end=SearchDef(r"^$"),
                    tag="connections"),
                "callbacks": [self.get_queue_connection_distribution]
                },
            "memory": {
                "searchdef": SequenceSearchDef(
                    start=SearchDef([r"^Status of node '([^']*)'$",
                                     r"^Status of node ([^']*) ...$"]),
                    body=SearchDef(r"^\s+\[{total,([0-9]+)}.+"),
                    end=SearchDef(r"^$"),
                    tag="memory"),
                "callbacks": [self.get_memory_used]
                },
            "partitioning": {
                "searchdef": SearchDef(
                    r"^\s*{cluster_partition_handling,([^}]*)}",
                    tag="cluster_partition_handling"),
                "callbacks": [self.get_partition_handling]
                }
            }
        for s in self._sequences.values():
            self.searcher.add_search_term(s["searchdef"], self.f_report)

    def get_queue_info(self):
        """Get distribution of queues across cluster."""
        sd = self._sequences["queues"]["searchdef"]
        vhost_queues = {}
        issues_raised = {}
        skewed_queue_nodes = {}
        for results in self.results.find_sequence_sections(sd).values():
            vhost = None
            queues = {}
            for result in results:
                if result.tag == sd.start_tag:
                    # check both report formats
                    vhost = result.get(1)
                elif result.tag == sd.body_tag:
                    node_name = result.get(1) or result.get(4)
                    # if we matched the section header, skip
                    if node_name == "pid":
                        continue

                    queue = result.get(2) or result.get(3)
                    # if we matched the section header, skip
                    if queue == "name":
                        continue

                    if node_name not in queues:
                        queues[node_name] = 0

                    queues[node_name] += 1

            vhost_queues[vhost] = {}
            if not queues:
                continue

            total = sum(queues.values())
            for node_name in queues:
                if total > 0:
                    fraction = queues[node_name] / total
                    fraction_string = "{:.2f}%".format(fraction * 100)
                    if fraction > 2 / 3:
                        if node_name not in skewed_queue_nodes:
                            skewed_queue_nodes[node_name] = 0

                        skewed_queue_nodes[node_name] += 1
                else:
                    fraction_string = "N/A"

                vhost_queues[vhost][node_name] = "{:d} ({})".format(
                    queues[node_name], fraction_string)

            # Report the node with the greatest skew of queues/vhost
            if skewed_queue_nodes:
                max_node = None
                for node_name in skewed_queue_nodes:
                    if max_node is None:
                        max_node = node_name
                    elif (skewed_queue_nodes[node_name] >=
                            skewed_queue_nodes[max_node]):
                        max_node = node_name

                if (skewed_queue_nodes[max_node] >
                        issues_raised.get(max_node, 0)):
                    issues_raised[max_node] = skewed_queue_nodes[max_node]

        # this should only actually ever report one node
        for node_name in issues_raised:
            msg = ("{} holds more than 2/3 of queues for {}/{} vhost(s)"
                   .format(node_name, issues_raised[node_name],
                           len(vhost_queues)))
            issues_utils.add_issue(issue_types.RabbitMQWarning(msg))

        if vhost_queues:
            # list all vhosts but only show their queues if not []
            self.resources["vhosts"] = sorted(list(vhost_queues.keys()))
            self.resources["vhost-queue-distributions"] = \
                {k: v for k, v in vhost_queues.items() if v}

    def get_queue_connection_distribution(self):
        """Get distribution of connections across cluster."""
        sd = self._sequences["connections"]["searchdef"]
        queue_connections = {}
        for results in self.results.find_sequence_sections(sd).values():
            for result in results:
                if result.tag == sd.body_tag:
                    queue_name = result.get(1)
                    if queue_name not in queue_connections:
                        queue_connections[queue_name] = 1
                    else:
                        queue_connections[queue_name] += 1

        if queue_connections:
            self.resources["queue-connections"] = queue_connections

    def get_memory_used(self):
        """Get the memory used per broker."""
        sd = self._sequences["memory"]["searchdef"]
        memory_used = {}
        for results in self.results.find_sequence_sections(sd).values():
            for result in results:
                if result.tag == sd.start_tag:
                    # check both report formats
                    node_name = result.get(1)
                elif result.tag == sd.body_tag:
                    total = result.get(1)
                    mib_used = int(total) / 1024. / 1024.
                    memory_used[node_name] = "{:.3f}".format(mib_used)

        if memory_used:
            self.resources["memory-used-mib"] = memory_used

    def get_partition_handling(self):
        """Get the partition handling settings."""
        results = self.results.find_by_tag("cluster_partition_handling")
        if not results:
            return

        setting = results[0].get(1)
        if setting == "ignore":
            msg = ("Cluster partition handling is currently set to ignore. "
                   "This is potentially dangerous and a setting of "
                   "pause_minority is recommended.")
            issues_utils.add_issue(issue_types.RabbitMQWarning(msg))

        self.resources["cluster-partition-handling"] = setting

    def run_report_callbacks(self):
        for s in self._sequences.values():
            for f in s["callbacks"]:
                f()

    def run_report_searches(self):
        self.register_report_searches()
        self.results = self.searcher.search()
        self.run_report_callbacks()
        if not self.resources:
            return

        RABBITMQ_INFO["resources"] = self.resources

    def __call__(self):
        super().__call__()
        self.get_running_services_info()
        self.run_report_searches()
class OctaviaLBChecks(checks.APTPackageChecksBase):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.searcher = FileSearcher()
        self.logs_path = os.path.join(constants.DATA_ROOT,
                                      SERVICE_RESOURCES["octavia"]["logs"])
        self.data_sources = {}
        fname = 'octavia-health-manager.log'
        self.data_sources["health-manager"] = os.path.join(self.logs_path,
                                                           fname)
        self.data_sources["worker"] = os.path.join(self.logs_path,
                                                   'octavia-worker.log')
        if constants.USE_ALL_LOGS:
            self.data_sources["health-manager"] = (
                "{}*".format(self.data_sources["health-manager"]))
            self.data_sources["worker"] = (
                "{}*".format(self.data_sources["worker"]))

    def get_hm_amphora_missed_heartbeats(self):
        missed_heartbeats = {}

        expr = (r"^(\S+) \S+ .+ Amphora (\S+) health message was processed "
                r"too slowly:.+")
        d = SearchDef(expr, tag="amp-missed-hb", hint="health message")
        self.searcher.add_search_term(d, self.data_sources["health-manager"])

        results = self.searcher.search()
        for r in results.find_by_tag("amp-missed-hb"):
            ts_date = r.get(1)
            amp_id = r.get(2)
            if ts_date not in missed_heartbeats:
                missed_heartbeats[ts_date] = {}

            if amp_id in missed_heartbeats[ts_date]:
                missed_heartbeats[ts_date][amp_id] += 1
            else:
                missed_heartbeats[ts_date][amp_id] = 1

        # sort each amp by number of occurrences
        for ts_date in missed_heartbeats:
            d = utils.sorted_dict(missed_heartbeats[ts_date],
                                  key=lambda e: e[1], reverse=True)
            missed_heartbeats[ts_date] = d

        if missed_heartbeats:
            # now sort by date
            LB_CHECKS["amp-missed-heartbeats"] = \
                utils.sorted_dict(missed_heartbeats)

    def get_lb_failovers(self):
        """Get loadbalancer failover counts."""
        failovers = {}

        expr = (r"^(\S+) \S+ .+ Performing failover for amphora:\s+(.+)")
        d = SearchDef(expr, tag="lb-failover-auto", hint="failover")
        self.searcher.add_search_term(d, self.data_sources["health-manager"])

        expr = (r"^(\S+) \S+ .+ Performing failover for amphora:\s+(.+)")
        d = SearchDef(expr, tag="lb-failover-manual", hint="failover")
        self.searcher.add_search_term(d, self.data_sources["worker"])

        for fo_type in ["auto", "manual"]:
            results = self.searcher.search()
            for r in results.find_by_tag("lb-failover-{}".format(fo_type)):
                ts_date = r.get(1)
                payload = r.get(2)
                payload = yaml.safe_load(payload)
                lb_id = payload.get("load_balancer_id")
                if lb_id is None:
                    continue

                if fo_type not in failovers:
                    failovers[fo_type] = {}

                if ts_date not in failovers[fo_type]:
                    failovers[fo_type][ts_date] = {}

                if lb_id in failovers[fo_type][ts_date]:
                    failovers[fo_type][ts_date][lb_id] += 1
                else:
                    failovers[fo_type][ts_date][lb_id] = 1

        for fo_type in failovers:
            # sort each failover by number of occurrences
            for ts_date in failovers[fo_type]:
                d = utils.sorted_dict(failovers[fo_type][ts_date],
                                      key=lambda e: e[1], reverse=True)
                failovers[fo_type][ts_date] = d

            # now sort the dates
            failovers[fo_type] = utils.sorted_dict(failovers[fo_type])

        if failovers:
            LB_CHECKS["lb-failovers"] = failovers

    def __call__(self):
        if self.core:
            self.get_lb_failovers()
            self.get_hm_amphora_missed_heartbeats()
class NeutronL3HAChecks(object):

    def __init__(self):
        self.searcher = FileSearcher()
        self.f_journalctl = None
        self.router_vrrp_pids = {}

    def __del__(self):
        if self.f_journalctl and os.path.exists(self.f_journalctl):
            os.unlink(self.f_journalctl)

    def _get_journalctl_l3_agent(self):
        if not constants.USE_ALL_LOGS:
            date = cli_helpers.get_date(format="--iso-8601").rstrip()
        else:
            date = None

        out = cli_helpers.get_journalctl(unit="neutron-l3-agent", date=date)
        self.f_journalctl = mktemp_dump(''.join(out))

    def get_neutron_ha_info(self):
        ha_state_path = os.path.join(constants.DATA_ROOT, NEUTRON_HA_PATH)
        if not os.path.exists(ha_state_path):
            return

        router_states = {}
        for entry in os.listdir(ha_state_path):
            entry = os.path.join(ha_state_path, entry)
            if os.path.isdir(entry):
                pid_path = "{}{}".format(entry, ".pid.keepalived-vrrp")
                keepalived_conf_path = os.path.join(entry, "keepalived.conf")
                state_path = os.path.join(entry, "state")
                if os.path.exists(state_path):
                    with open(state_path) as fd:
                        router = os.path.basename(entry)
                        state = fd.read().strip()
                        if state in router_states:
                            router_states[state].append(router)
                        else:
                            router_states[state] = [router]

                    if os.path.isfile(keepalived_conf_path):
                        with open(keepalived_conf_path) as fd:
                            for line in fd:
                                expr = ".+ virtual_router_id ([0-9]+)"
                                ret = re.compile(expr).search(line)
                                if ret:
                                    ROUTER_VR_IDS[router] = ret.group(1)

                    if os.path.isfile(pid_path):
                        with open(pid_path) as fd:
                            pid = fd.read().strip()
                            self.router_vrrp_pids[router] = pid

        if router_states:
            L3HA_CHECKS["agent"] = router_states

    def get_vrrp_transitions(self):
        """
        List routers that have had a vrrp state transition along with the
        number of transitions. Excludes routers that have not had any change
        of state.
        """
        if not self.router_vrrp_pids:
            return

        self._get_journalctl_l3_agent()
        transitions = {}
        for router in self.router_vrrp_pids:
            vr_id = ROUTER_VR_IDS[router]
            expr = (r"^([0-9-]+)T\S+ \S+ Keepalived_vrrp"
                    r"\[([0-9]+)\]: VRRP_Instance\(VR_{}\) .+ (\S+) "
                    "STATE.*".format(vr_id))
            d = SearchDef(expr, tag=router)
            self.searcher.add_search_term(d, self.f_journalctl)

        results = self.searcher.search()
        for router in self.router_vrrp_pids:
            t_count = len(results.find_by_tag(router))
            if not t_count:
                continue

            for r in results.find_by_tag(router):
                ts_date = r.get(1)
                if router not in transitions:
                    transitions[router] = {}

                if ts_date in transitions[router]:
                    transitions[router][ts_date] += 1
                else:
                    transitions[router][ts_date] = 1

        if transitions:
            L3HA_CHECKS["keepalived"] = {"transitions": transitions}

    def check_vrrp_transitions(self):
        if "transitions" not in L3HA_CHECKS.get("keepalived", {}):
            return

        max_transitions = 0
        warn_count = 0
        threshold = VRRP_TRANSITION_WARN_THRESHOLD
        for router in L3HA_CHECKS["keepalived"]["transitions"]:
            r = L3HA_CHECKS["keepalived"]["transitions"][router]
            transitions = sum([t for d, t in r.items()])
            if transitions > threshold:
                max_transitions = max(transitions, max_transitions)
                warn_count += 1

        if warn_count:
            msg = ("{} router(s) have had more than {} vrrp transitions "
                   "(max={}) in the last 24 hours".format(
                       warn_count, threshold, max_transitions))
            issues_utils.add_issue(issue_types.NeutronL3HAWarning(msg))

    def __call__(self):
        self.get_neutron_ha_info()
        self.get_vrrp_transitions()
        # there will likely be a large number of transitions if we look
        # across all time so don't run this check.
        if not constants.USE_ALL_LOGS:
            self.check_vrrp_transitions()