Example #1
    def test_filesearcher_error(self):
        s = FileSearcher()
        with mock.patch.object(SearchResult, '__init__') as mock_init:

            def fake_init(*args, **kwargs):
                raise EOFError("some error")

            mock_init.side_effect = fake_init
            path = os.environ["DATA_ROOT"]
            s.add_search_term(SearchDef("."), path)
            s.search()
Example #2
    def test_filesearcher_network_info(self):
        filepath = os.path.join(os.environ["DATA_ROOT"], 'sos_commands',
                                'networking', 'ip_-d_address')
        filepath2 = os.path.join(os.environ["DATA_ROOT"], 'sos_commands',
                                 'networking', 'ip_-s_-d_link')
        ip = "10.10.101.33"
        mac = "ac:1f:6b:9e:d8:44"
        s = FileSearcher()
        sd = SearchDef(r".+({}).+".format(ip))
        s.add_search_term(sd, filepath)
        sd = SearchDef(r"^\s+link/ether\s+({})\s+.+".format(mac))
        s.add_search_term(sd, filepath2)

        results = s.search()
        self.assertEqual(set(results.files), set([filepath, filepath2]))
        self.assertEqual(len(results.find_by_path(filepath)), 1)
        self.assertEqual(len(results.find_by_path(filepath2)), 3)

        self.assertEqual(results.find_by_path(filepath)[0].linenumber, 16)
        for result in results.find_by_path(filepath):
            self.assertEqual(result.get(1), ip)

        expected = {8: mac,
                    15: mac,
                    22: mac}

        for result in results.find_by_path(filepath2):
            ln = result.linenumber
            self.assertEqual(result.tag, None)
            self.assertEqual(result.get(1), expected[ln])
Example #3
def detect_known_bugs():
    """Unit fails to start complaining there are members in the relation."""
    known_bugs = {
        1910958: {
            "description": ("Unit fails to start complaining there are "
                            "members in the relation."),
            "pattern": (
                r'.* manifold worker returned unexpected error: failed to '
                r'initialize uniter for "[A-Za-z0-9-]+": cannot create '
                r'relation state tracker: cannot remove persisted state, '
                r'relation \d+ has members'),
            "hint": "manifold worker returned unexpected error",
            }
        }

    s = FileSearcher()
    for bug in known_bugs:
        sd = SearchDef(known_bugs[bug]["pattern"],
                       tag=1910958, hint=known_bugs[bug]["hint"])
        s.add_search_term(sd, f"{JUJU_LOG_PATH}/*")

    results = s.search()

    for bug in known_bugs:
        if results.find_by_tag(bug):
            add_known_bug(bug, known_bugs[bug]["description"])
Example #4
    def test_sequence_searcher_section_start_end_same(self):
        """
        Test scenario:
         * multiple sections that end with start of the next
         * start def matches unique start
         * end def matches any start
        """
        with tempfile.NamedTemporaryFile(mode='w', delete=False) as ftmp:
            ftmp.write(SEQ_TEST_7)
            ftmp.close()
            s = FileSearcher()
            sd = SequenceSearchDef(start=SearchDef(r"^section (2)"),
                                   body=SearchDef(r"\d_\d"),
                                   end=SearchDef(r"^section (\d+)"),
                                   tag="seq-search-test7")
            s.add_search_term(sd, path=ftmp.name)
            results = s.search()
            sections = results.find_sequence_sections(sd)
            self.assertEqual(len(sections), 1)
            for id in sections:
                for r in sections[id]:
                    if r.tag == sd.start_tag:
                        self.assertEqual(r.get(1), "2")
                    elif r.tag == sd.body_tag:
                        self.assertTrue(r.get(0) in ["2_1"])

            os.remove(ftmp.name)
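
# SEQ_TEST_7 itself is defined elsewhere; a hypothetical input consistent
# with the assertions above (the real fixture may differ):
SEQ_TEST_7_EXAMPLE = ("section 1\n"
                      "1_1\n"
                      "section 2\n"
                      "2_1\n"
                      "section 3\n"
                      "3_1\n")
# only "section 2" matches the start def while the subsequent "section 3"
# matches the end def, yielding exactly one section whose body is "2_1".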
Example #5
    def get_osd_lvm_info(self):
        if not self.ceph_volume_lvm_list:
            return

        ceph_osds = self.services.get("ceph-osd")
        if not ceph_osds:
            return

        f_ceph_volume_lvm_list = mktemp_dump('\n'.join(
            self.ceph_volume_lvm_list))
        s = FileSearcher()
        sd = SequenceSearchDef(start=SearchDef(r"^=+\s+osd\.(\d+)\s+=+.*"),
                               body=SearchDef([
                                   r"\s+osd\s+(fsid)\s+(\S+)\s*",
                                   r"\s+(devices)\s+([\S]+)\s*"
                               ]),
                               tag="ceph-lvm")
        s.add_search_term(sd, path=f_ceph_volume_lvm_list)
        info = {}
        for results in s.search().find_sequence_sections(sd).values():
            _osd_id = None
            _info = {}
            for result in results:
                if result.tag == sd.start_tag:
                    _osd_id = int(result.get(1))
                elif result.tag == sd.body_tag:
                    if result.get(1) == "fsid":
                        _info["fsid"] = result.get(2)
                    elif result.get(1) == "devices":
                        _info["dev"] = result.get(2)

            info[_osd_id] = _info

        os.unlink(f_ceph_volume_lvm_list)
        return info
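
# mktemp_dump() is a helper assumed by this and later examples; a minimal
# sketch of the contract implied by its usage (write a string to a
# temporary file and return the path) - the real implementation may differ:
import tempfile


def mktemp_dump(data):
    """Write data to a temporary file and return the file's path."""
    with tempfile.NamedTemporaryFile(mode='w', delete=False) as ftmp:
        ftmp.write(data)
        return ftmp.name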
Example #6
class KernelNetworkChecks(KernelChecksBase):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.search_obj = None

    def check_mtu_dropped_packets(self):
        ifaces = {}
        for r in self.results.find_by_tag("over-mtu"):
            if r.get(1) in ifaces:
                ifaces[r.get(1)] += 1
            else:
                ifaces[r.get(1)] = 1

        if ifaces:
            helper = HostNetworkingHelper()
            # only report the issue if the interfaces actually exist
            raise_issue = False
            host_interfaces = helper.get_host_interfaces(
                                                       include_namespaces=True)

            ifaces_extant = []
            for iface in ifaces:
                if iface in host_interfaces:
                    raise_issue = True
                    ifaces_extant.append(iface)

            if raise_issue:
                msg = ("kernel has reported over-mtu dropped packets for ({}) "
                       "interfaces".format(len(ifaces_extant)))
                issue = issue_types.NetworkWarning(msg)
                issues_utils.add_issue(issue)

            # sort by number of occurrences
            sorted_dict = {}
            for k, v in sorted(ifaces.items(), key=lambda e: e[1],
                               reverse=True):
                sorted_dict[k] = v

            KERNEL_INFO["over-mtu-dropped-packets"] = sorted_dict

    def register_mtu_dropped_packets_search(self):
        path = os.path.join(constants.DATA_ROOT, 'var/log/kern.log')
        if constants.USE_ALL_LOGS:
            path = path + "*"

        sdef = SearchDef(r".+\] (\S+): dropped over-mtu packet",
                         hint="dropped", tag="over-mtu")
        self.search_obj.add_search_term(sdef, path)

    def __call__(self):
        self.search_obj = FileSearcher()
        self.register_mtu_dropped_packets_search()
        self.results = self.search_obj.search()
        self.check_mtu_dropped_packets()
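
# The tally-and-sort logic in check_mtu_dropped_packets() could equally be
# expressed with collections.Counter; an equivalent sketch (offered as an
# alternative, not the author's implementation):
from collections import Counter


def count_over_mtu_drops(results):
    ifaces = Counter(r.get(1) for r in results.find_by_tag("over-mtu"))
    # most_common() returns (iface, count) pairs sorted by count descending
    return dict(ifaces.most_common())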
Example #7
def run_agent_exception_checks():
    s = FileSearcher()
    checks = [CommonAgentChecks(s)]
    for check in checks:
        check.register_search_terms()

    results = s.search()
    for check in checks:
        check_results = check.process_results(results)
        if check_results:
            AGENT_CHECKS_RESULTS["agent-exceptions"] = check_results
Example #8
def run_agent_checks():
    s = FileSearcher()
    checks = [
        NeutronAgentEventChecks(s, root="neutron-agent-checks"),
        NeutronAgentBugChecks(s, root="neutron")
    ]
    for check in checks:
        check.register_search_terms()

    results = s.search()
    for check in checks:
        check_results = check.process_results(results)
        if check_results:
            AGENT_CHECKS_RESULTS["agent-checks"] = check_results
Example #9
    def test_search_filter_invert_match(self):
        with tempfile.NamedTemporaryFile(mode='w', delete=False) as ftmp:
            ftmp.write(FILTER_TEST_1)
            ftmp.close()
            s = FileSearcher()
            fd = FilterDef(r" (ERROR)", invert_match=True)
            s.add_filter_term(fd, path=ftmp.name)
            sd = SearchDef(r".+ INFO (.+)")
            s.add_search_term(sd, path=ftmp.name)
            results = s.search().find_by_path(ftmp.name)
            self.assertEqual(len(results), 1)
            for r in results:
                self.assertEqual(r.get(1), "blah")

            os.remove(ftmp.name)
Example #10
class OpenvSwitchDaemonChecksBase(object):
    def __init__(self):
        self.search_obj = FileSearcher()
        self.results = []

    def register_search_terms(self):
        raise NotImplementedError

    def process_results(self):
        raise NotImplementedError

    def __call__(self):
        self.register_search_terms()
        self.results = self.search_obj.search()
        self.process_results()
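
# A minimal sketch of a concrete subclass showing the contract the base
# class expects; the class name, pattern and log path here are hypothetical:
class OVSDaemonLogChecks(OpenvSwitchDaemonChecksBase):

    def register_search_terms(self):
        sd = SearchDef(r".+\|(ERR|WARN)\|.+", tag="ovs-log-issue")
        self.search_obj.add_search_term(sd, "/var/log/openvswitch/*.log")

    def process_results(self):
        return len(self.results.find_by_tag("ovs-log-issue"))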
Example #11
def run_agent_checks():
    s = FileSearcher()
    checks = [NeutronL3AgentEventChecks(s),
              NeutronOVSAgentEventChecks(s),
              NeutronAgentBugChecks(s),
              ]

    for check in checks:
        check.register_search_terms()

    results = s.search()

    for check in checks:
        check_results = check.process_results(results)
        if check_results:
            key = check.master_results_key
            AGENT_CHECKS_RESULTS["agent-checks"][key] = check_results
Example #12
def detect_known_bugs():
    """Unit fails to start complaining there are members in the relation."""
    data_source = f"{JUJU_LOG_PATH}/*.log"
    if constants.USE_ALL_LOGS:
        data_source = f"{data_source}*"

    s = FileSearcher()
    for bugdef in BUG_SEARCHES:
        s.add_search_term(bugdef, data_source)

    results = s.search()

    for bugdef in BUG_SEARCHES:
        bug_results = results.find_by_tag(bugdef.tag)
        if bug_results:
            reason = bugdef.render_reason(bug_results[0])
            add_known_bug(bugdef.tag, reason)
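
# BUG_SEARCHES is defined elsewhere; a hypothetical sketch of the minimal
# interface detect_known_bugs() relies on - a tag plus a
# render_reason(result) method - the real class may well differ:
class BugSearchDef(SearchDef):

    def __init__(self, pattern, bug_id, reason_template, **kwargs):
        super().__init__(pattern, tag=bug_id, **kwargs)
        self.reason_template = reason_template

    def render_reason(self, search_result):
        # interpolate the first captured group into the reason string
        return self.reason_template.format(search_result.get(1))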
Example #13
    def __call__(self):
        super().__call__()
        data_source = os.path.join(constants.DATA_ROOT, CEPH_LOGS, 'ceph*.log')
        if constants.USE_ALL_LOGS:
            data_source = "{}*".format(data_source)

        s = FileSearcher()
        for search in SEARCHES:
            s.add_search_term(search, data_source)

        self.results = s.search()
        self.process_osd_failure_reports()
        self.process_mon_elections()
        self.process_slow_requests()
        self.process_crc_bluestore()
        self.process_crc_rocksdb()
        self.process_long_heartbeat()
        self.process_heartbeat_no_reply()
Example #14
    def test_sequence_searcher_overlapping_incomplete(self):
        with tempfile.NamedTemporaryFile(mode='w', delete=False) as ftmp:
            ftmp.write(SEQ_TEST_3)
            ftmp.close()
            s = FileSearcher()
            sd = SequenceSearchDef(start=SearchDef(
                                               r"^(a\S*) (start\S*) point\S*"),
                                   body=SearchDef(r"leads to"),
                                   end=SearchDef(r"^an (ending)$"),
                                   tag="seq-search-test3")
            s.add_search_term(sd, path=ftmp.name)
            results = s.search()
            sections = results.find_sequence_sections(sd)
            self.assertEqual(len(sections), 1)
            for id in sections:
                for r in sections[id]:
                    if r.tag == sd.start_tag:
                        self.assertEqual(r.get(1), "another")
                    elif r.tag == sd.end_tag:
                        self.assertEqual(r.get(1), "ending")

            os.remove(ftmp.name)
Example #15
class RabbitMQClusterChecks(RabbitMQChecksBase):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.searcher = FileSearcher()

    def check_log_errors(self):
        path = os.path.join(constants.DATA_ROOT,
                            'var/log/rabbitmq/rabbit@*.log')
        if constants.USE_ALL_LOGS:
            path = f"{path}*"

        self.searcher.add_search_term(SearchDef(r".+ \S+_partitioned_network",
                                                tag="partitions"),
                                      path=path)
        results = self.searcher.search()
        if results.find_by_tag("partitions"):
            msg = ("cluster either has or has had partitions - check "
                   "cluster_status")
            issues_utils.add_issue(issue_types.RabbitMQWarning(msg))

    def __call__(self):
        super().__call__()
        self.check_log_errors()
Example #16
    def get_osd_rss(self, osd_id):
        """Return memory RSS for a given OSD.

        NOTE: this assumes we have ps auxwwwm format.
        """
        ceph_osds = self.services.get("ceph-osd")
        if not ceph_osds:
            return 0

        f_osd_ps_cmds = mktemp_dump('\n'.join(ceph_osds['ps_cmds']))

        s = FileSearcher()
        # columns: USER PID %CPU %MEM VSZ RSS TTY STAT START TIME COMMAND
        sd = SearchDef(r"\S+\s+\d+\s+\S+\s+\S+\s+\d+\s+(\d+)\s+.+/ceph-osd\s+"
                       r".+--id\s+{}\s+.+".format(osd_id))
        s.add_search_term(sd, path=f_osd_ps_cmds)
        rss = 0
        # we only expect one result
        for result in s.search().find_by_path(f_osd_ps_cmds):
            rss = int(int(result.get(1)) / 1024)
            break

        os.unlink(f_osd_ps_cmds)
        return rss
Example #17
    def test_sequence_searcher_multiple_sections(self):
        with tempfile.NamedTemporaryFile(mode='w', delete=False) as ftmp:
            ftmp.write(SEQ_TEST_5)
            ftmp.close()
            s = FileSearcher()
            sd = SequenceSearchDef(start=SearchDef(
                                               r"^(a\S*) (start\S*) point\S*"),
                                   body=SearchDef(r"value is (\S+)"),
                                   end=SearchDef(r"^$"),
                                   tag="seq-search-test5")
            s.add_search_term(sd, path=ftmp.name)
            results = s.search()
            sections = results.find_sequence_sections(sd)
            self.assertEqual(len(sections), 2)
            for id in sections:
                for r in sections[id]:
                    if r.tag == sd.start_tag:
                        self.assertEqual(r.get(1), "another")
                    elif r.tag == sd.body_tag:
                        self.assertTrue(r.get(1) in ["3", "4"])
                    elif r.tag == sd.end_tag:
                        self.assertEqual(r.get(0), "")

            os.remove(ftmp.name)
Example #18
        # process the results
        for service in AGENT_DAEMON_NAMES:
            self.process_agent_results(results, service)

        self.process_bug_results(results)


if __name__ == "__main__":
    s = FileSearcher()
    common_checks = CommonAgentChecks(s)
    common_checks.add_agents_issues_search_terms()
    neutron_checks = NeutronAgentChecks(s)
    neutron_checks.add_rpc_loop_search_terms()
    neutron_checks.add_router_event_search_terms()

    results = s.search()

    neutron_checks.process_rpc_loop_results(results)
    neutron_checks.process_router_event_results(results)
    common_checks.process_agent_issues_results(results)

    AGENT_CHECKS = {"agent-checks": {}}
    if common_checks.agent_log_issues:
        AGENT_CHECKS["agent-checks"]["agent-issues"] = \
            common_checks.agent_log_issues

    if neutron_checks.ovs_agent_info:
        AGENT_CHECKS["agent-checks"]["neutron-ovs-agent"] = \
            neutron_checks.ovs_agent_info

    if neutron_checks.l3_agent_info:
        AGENT_CHECKS["agent-checks"]["neutron-l3-agent"] = \
            neutron_checks.l3_agent_info
Example #19
    def _get_port_stats(self, name=None, mac=None):
        """Get ip link stats for the given port."""
        ip_link_show = cli_helpers.get_ip_link_show()
        stats_raw = []

        exprs = []
        if mac:
            # also check for the libvirt variant of the mac i.e. with the
            # first octet set to fe
            libvirt_mac = "fe" + mac[2:]
            for _mac in [mac, libvirt_mac]:
                exprs.append(r"\s+link/ether\s+({})\s+.+".format(_mac))
        else:
            exprs.append(r"\d+:\s+({}):\s+.+".format(name))

        with tempfile.NamedTemporaryFile(mode='w', delete=False) as ftmp:
            ftmp.write(''.join(ip_link_show))
            ftmp.close()
            s = FileSearcher()
            sd = SequenceSearchDef(
                # match start of interface
                start=SearchDef(r"^(?:{})".format('|'.join(exprs))),
                # match body of interface
                body=SearchDef(r".+"),
                # match next interface or EOF
                end=SearchDef(r"(?:^\d+:\s+\S+:.+|^$)"),
                tag="ifaces")
            s.add_search_term(sd, path=ftmp.name)
            results = s.search()
            for section in results.find_sequence_sections(sd).values():
                for result in section:
                    if result.tag == sd.body_tag:
                        stats_raw.append(result.get(0))

                # stop at first match - if matching by mac address it is
                # possible for multiple interfaces to have the same mac, e.g.
                # bonds and their interfaces, but we don't support that so
                # just use the first one found.
                break

            os.unlink(ftmp.name)

        stats = {}
        total_packets = float(0)
        if stats_raw:
            for i, line in enumerate(stats_raw):
                ret = re.compile(r"\s+[RT]X:\s+.+").findall(line)
                if ret:
                    ret = re.compile(r"\s*([a-z]+)\s*").findall(line)
                    if ret:
                        for j, column in enumerate(ret):
                            value = int(stats_raw[i + 1].split()[j])
                            if column == "packets":
                                total_packets = float(value)
                                continue

                            for key in ["dropped", "errors"]:
                                if column == key:
                                    if not value:
                                        continue

                                    percentage = int(
                                        (100 / total_packets) * value)
                                    # only report if > 0% drops/errors
                                    if percentage > 0:
                                        stats[key] = ("{} ({}%)".format(
                                            value, percentage))

        return stats
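
# The stats parser above assumes "ip -s -d link" style output in which a
# header line such as "RX: bytes packets errors dropped ..." is followed by
# a line of corresponding values. An illustrative (hypothetical) fragment:
#
#     RX:  bytes  packets  errors  dropped  overrun  mcast
#     1000000     2000     0       10       0        0
#
# For this fragment dropped=10 of 2000 packets gives
# int((100 / 2000) * 10) == 0, so no "dropped" entry would be reported.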
Example #20
def get_events(event_name, data_source):
    ext_event_info = {}
    events = {}

    s = FileSearcher()

    # look for sequence starter
    if event_name == "network-vif-plugged":
        sd = SearchDef(r".+\[instance: (\S+)\].+Preparing to wait for "
                       r"external event ({})-(\S+)\s+".format(event_name))
        s.add_search_term(sd, data_source)
    elif event_name == "network-changed":
        sd = SearchDef(
            r".+\[instance: (\S+)\].+Received event ({})-(\S+)\s+".format(
                event_name))
        s.add_search_term(sd, data_source)

    master_results = s.search()

    # now start a fresh one
    s = FileSearcher()

    for file, results in master_results:
        for result in results:
            instance_id = result.get(1)
            event_id = result.get(3)
            events[event_id] = {
                "instance_id": instance_id,
                "data_source": file
            }

            for stage in EXT_EVENT_META[event_name]["stages_keys"]:
                expr = (
                    r".+\[instance: {}\]\s+{}\s.*\s?event\s+{}-{}.? .+".format(
                        instance_id, stage, event_name, event_id))
                tag = "{}_{}_{}".format(instance_id, event_id, stage)
                sd = SearchDef(expr, tag, hint=event_name)
                s.add_search_term(sd, data_source)

    results = s.search()
    for event_id in events:
        instance_id = events[event_id]["instance_id"]
        data_source = events[event_id]["data_source"]
        stages = get_state_dict(event_name)
        for stage in stages:
            tag = "{}_{}_{}".format(instance_id, event_id, stage)
            r = results.find_by_tag(tag, path=data_source)
            if r:
                stages[stage] = True

        if all(stages.values()):
            result = "succeeded"
        else:
            result = "failed"

        if event_name not in ext_event_info:
            ext_event_info[event_name] = {}

        if result not in ext_event_info[event_name]:
            ext_event_info[event_name][result] = []

        ext_event_info[event_name][result].append({
            "port": event_id,
            "instance": instance_id
        })

    if ext_event_info:
        for event in ext_event_info:
            if event not in EXT_EVENT_INFO:
                EXT_EVENT_INFO[event] = {}
            for result in ext_event_info[event]:
                s = ext_event_info[event][result]
                EXT_EVENT_INFO[event][result] = list(s)
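
# get_state_dict() is defined elsewhere; a minimal sketch of the behaviour
# get_events() relies on - a mapping of each stage to False that is flipped
# to True as matches are found (the real helper may differ):
def get_state_dict(event_name):
    return {stage: False
            for stage in EXT_EVENT_META[event_name]["stages_keys"]}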
Example #21
class RabbitMQServiceChecks(RabbitMQServiceChecksBase):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        _report = cli_helpers.get_rabbitmqctl_report()
        # save to file so we can search it later (mkstemp() is used since
        # tempfile.mktemp() is deprecated and insecure)
        fd, ftmp = tempfile.mkstemp()
        with os.fdopen(fd, 'w') as f:
            f.write(''.join(_report))

        self.report_path = ftmp
        self.searcher = FileSearcher()
        self.resources = {}

    def __del__(self):
        if os.path.exists(self.report_path):
            os.unlink(self.report_path)

    def get_running_services_info(self):
        """Get string info for running services."""
        if self.services:
            RABBITMQ_INFO["services"] = self.get_service_info_str()

    def get_queues(self):
        """Get distribution of queues across cluster."""
        sd = self._sequences["queues"]["searchdef"]
        vhost_queues = {}
        raise_issues = []
        for results in self.results.find_sequence_sections(sd).values():
            vhost = None
            queues = {}
            for result in results:
                if result.tag == sd.start_tag:
                    vhost = result.get(1)
                elif result.tag == sd.body_tag:
                    info = {"pid_name": result.get(1), "queue": result.get(2)}
                    if info["pid_name"] not in queues:
                        queues[info["pid_name"]] = 1
                    else:
                        queues[info["pid_name"]] += 1

            vhost_queues[vhost] = {}
            if not queues:
                continue

            total = sum(queues.values())
            for pid in queues:
                if total > 0:
                    fraction = queues[pid] / total
                    fraction_string = "{:.2f}%".format(fraction * 100)
                    if fraction > 2 / 3:
                        raise_issues.append(
                            "{} holds more than 2/3 of queues".format(pid))
                else:
                    fraction_string = "N/A"

                vhost_queues[vhost][pid] = "{:d} ({})".format(
                    queues[pid], fraction_string)

        for issue in raise_issues:
            issues_utils.add_issue(issue_types.RabbitMQWarning(issue))

        if vhost_queues:
            # list all vhosts but only show their queues if not []
            self.resources["vhosts"] = sorted(list(vhost_queues.keys()))
            self.resources["vhost-queue-distributions"] = \
                {k: v for k, v in vhost_queues.items() if v}

    def get_queue_connection_distribution(self):
        """Get distribution of connections across cluster."""
        sd = self._sequences["connections"]["searchdef"]
        queue_connections = {}
        for results in self.results.find_sequence_sections(sd).values():
            for result in results:
                if result.tag == sd.body_tag:
                    queue_name = result.get(1)
                    if queue_name not in queue_connections:
                        queue_connections[queue_name] = 1
                    else:
                        queue_connections[queue_name] += 1

        if queue_connections:
            self.resources["queue-connections"] = queue_connections

    def get_memory_used(self):
        """Get the memory used per broker."""
        sd = self._sequences["memory"]["searchdef"]
        memory_used = {}
        for results in self.results.find_sequence_sections(sd).values():
            for result in results:
                if result.tag == sd.start_tag:
                    node_name = result.get(1)
                elif result.tag == sd.body_tag:
                    total = result.get(1)
                    mib_used = int(total) / 1024. / 1024.
                    memory_used[node_name] = "{:.3f}".format(mib_used)

        if memory_used:
            self.resources["memory-used-mib"] = memory_used

    def register_report_searches(self):
        """Register all sequence search definitions that we will execute
        against rabbitmqctl report.
        """
        self._sequences = {
            "queues": {
                "searchdef":
                SequenceSearchDef(
                    start=SearchDef(r"^Queues on ([^:]+):"),
                    body=SearchDef(r"^<([^.\s]+)[.0-9]+>\s+(\S+)\s+.+"),
                    end=SearchDef(r"^$"),
                    tag="queues"),
                "callbacks": [self.get_queues]
            },
            "connections": {
                "searchdef":
                SequenceSearchDef(
                    start=SearchDef(r"^Connections:$"),
                    body=SearchDef(r"^<(rabbit[^>.]*)(?:[.][0-9]+)+>.*$"),
                    end=SearchDef(r"^$"),
                    tag="connections"),
                "callbacks": [self.get_queue_connection_distribution]
            },
            "memory": {
                "searchdef":
                SequenceSearchDef(
                    start=SearchDef(r"^Status of node '([^']*)'$"),
                    body=SearchDef(r"^\s+\[{total,([0-9]+)}.+"),
                    end=SearchDef(r"^$"),
                    tag="memory"),
                "callbacks": [self.get_memory_used]
            }
        }
        for s in self._sequences.values():
            self.searcher.add_search_term(s["searchdef"], self.report_path)

    def run_report_callbacks(self):
        for s in self._sequences.values():
            for f in s["callbacks"]:
                f()

    def run_report_searches(self):
        self.register_report_searches()
        self.results = self.searcher.search()
        self.run_report_callbacks()
        if not self.resources:
            return

        RABBITMQ_INFO["resources"] = self.resources

    def __call__(self):
        super().__call__()
        self.get_running_services_info()
        self.run_report_searches()
Example #22
class NeutronL3HAChecks(object):
    def __init__(self):
        self.searcher = FileSearcher()

    def get_neutron_ha_info(self):
        ha_state_path = os.path.join(constants.DATA_ROOT, NEUTRON_HA_PATH)
        if not os.path.exists(ha_state_path):
            return

        vrrp_states = {}
        router_states = {}
        for entry in os.listdir(ha_state_path):
            entry = os.path.join(ha_state_path, entry)
            if os.path.isdir(entry):
                pid_path = "{}{}".format(entry, ".pid.keepalived-vrrp")
                keepalived_conf_path = os.path.join(entry, "keepalived.conf")
                state_path = os.path.join(entry, "state")
                if os.path.exists(state_path):
                    with open(state_path) as fd:
                        router = os.path.basename(entry)
                        state = fd.read().strip()
                        if state in router_states:
                            router_states[state].append(router)
                        else:
                            router_states[state] = [router]

                    if os.path.isfile(keepalived_conf_path):
                        with open(keepalived_conf_path) as fd:
                            for line in fd:
                                expr = ".+ virtual_router_id ([0-9]+)"
                                ret = re.compile(expr).search(line)
                                if ret:
                                    ROUTER_VR_IDS[router] = ret.group(1)

                    if os.path.isfile(pid_path):
                        with open(pid_path) as fd:
                            pid = fd.read().strip()
                            vrrp_states[router] = pid

        if router_states:
            L3HA_CHECKS["agent"] = router_states

        if vrrp_states:
            L3HA_CHECKS["keepalived"] = vrrp_states

    def get_vrrp_transitions(self):
        if "keepalived" not in L3HA_CHECKS:
            return

        transitions = {}
        with tempfile.NamedTemporaryFile(mode='w', delete=False) as ftmp:
            if not constants.USE_ALL_LOGS:
                date = cli_helpers.get_date(format="--iso-8601").rstrip()
            else:
                date = None

            out = cli_helpers.get_journalctl(unit="neutron-l3-agent",
                                             date=date)
            ftmp.write(''.join(out))
            ftmp.close()

            for router in L3HA_CHECKS["keepalived"]:
                vr_id = ROUTER_VR_IDS[router]
                expr = (r"^(\S+) [0-9]+ [0-9:]+ \S+ Keepalived_vrrp"
                        r"\[([0-9]+)\]: VRRP_Instance\(VR_{}\) .+ (\S+) "
                        "STATE.*".format(vr_id))
                d = SearchDef(expr, tag=router)
                self.searcher.add_search_term(d, ftmp.name)

            results = self.searcher.search()
            for router in L3HA_CHECKS["keepalived"]:
                transitions[router] = len(results.find_by_tag(router))

            os.unlink(ftmp.name)

        if transitions:
            L3HA_CHECKS["keepalived"] = {"transitions": {}}
            for k, v in sorted(transitions.items(),
                               key=lambda x: x[1],
                               reverse=True):
                L3HA_CHECKS["keepalived"]["transitions"][k] = v

    def check_vrrp_transitions(self):
        if "transitions" not in L3HA_CHECKS.get("keepalived", {}):
            return

        max_transitions = 0
        warn_count = 0
        threshold = VRRP_TRANSITION_WARN_THRESHOLD
        for router in L3HA_CHECKS["keepalived"]["transitions"]:
            transitions = L3HA_CHECKS["keepalived"]["transitions"][router]
            if transitions > threshold:
                max_transitions = max(transitions, max_transitions)
                warn_count += 1

        if warn_count:
            msg = ("{} router(s) have had more than {} vrrp transitions "
                   "(max={}) in the last 24 hours".format(
                       warn_count, threshold, max_transitions))
            issues_utils.add_issue(issue_types.NeutronL3HAWarning(msg))

    def __call__(self):
        self.get_neutron_ha_info()
        self.get_vrrp_transitions()

        # there will likely be a large number of transitions if we look across
        # all time so don't run this check.
        if not constants.USE_ALL_LOGS:
            self.check_vrrp_transitions()
Example #23
    def test_filesearcher_logs(self):
        expected = {4: '2021-02-25 14:22:18.861',
                    16: '2021-02-25 14:22:19.587'}

        logs_root = "var/log/neutron/"
        filepath = os.path.join(os.environ["DATA_ROOT"], logs_root,
                                'neutron-openvswitch-agent.log')
        globpath = os.path.join(os.environ["DATA_ROOT"], logs_root,
                                'neutron-l3-agent.log*')
        globpath_file1 = os.path.join(os.environ["DATA_ROOT"], logs_root,
                                      'neutron-l3-agent.log')
        globpath_file2 = os.path.join(os.environ["DATA_ROOT"], logs_root,
                                      'neutron-l3-agent.log.1.gz')

        s = FileSearcher()
        sd = SearchDef(r'^(\S+\s+[0-9:\.]+)\s+.+full sync.+', tag="T1")
        s.add_search_term(sd, filepath)
        sd = SearchDef(r'^(\S+\s+[0-9:\.]+)\s+.+ERROR.+', tag="T2")
        s.add_search_term(sd, filepath)
        sd = SearchDef((r'^(\S+\s+[0-9:\.]+)\s+.+ INFO .+ Router '
                        '9b8efc4c-305b-48ce-a5bd-624bc5eeee67.+'), tag="T3")
        s.add_search_term(sd, globpath)
        sd = SearchDef(r'non-existent-pattern', tag="T4")
        # search for something that doesn't exist to test that code path
        s.add_search_term(sd, globpath)

        results = s.search()
        self.assertEqual(set(results.files), set([filepath,
                                                  globpath_file2,
                                                  globpath_file1]))

        self.assertEqual(len(results.find_by_path(filepath)), 37)

        tag_results = results.find_by_tag("T1", path=filepath)
        self.assertEqual(len(tag_results), 2)
        for result in tag_results:
            ln = result.linenumber
            self.assertEqual(result.tag, "T1")
            self.assertEqual(result.get(1), expected[ln])

        tag_results = results.find_by_tag("T1")
        self.assertEqual(len(tag_results), 2)
        for result in tag_results:
            ln = result.linenumber
            self.assertEqual(result.tag, "T1")
            self.assertEqual(result.get(1), expected[ln])

        self.assertEqual(len(results.find_by_path(globpath_file1)), 1)
        self.assertEqual(len(results.find_by_path(globpath_file2)), 0)

        # these files have the same content so expect same result from both
        expected = {81: '2021-03-25 18:10:15.179'}
        path_results = results.find_by_path(globpath_file1)
        for result in path_results:
            ln = result.linenumber
            self.assertEquals(result.tag, "T3")
            self.assertEquals(result.get(1), expected[ln])

        path_results = results.find_by_path(globpath_file2)
        for result in path_results:
            ln = result.linenumber
            self.assertEquals(result.tag, "T3")
            self.assertEquals(result.get(1), expected[ln])
Example #24
class RabbitMQServiceChecks(RabbitMQChecksBase):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        out = cli_helpers.get_rabbitmqctl_report()
        # save to file so we can search it later
        self.f_report = mktemp_dump(''.join(out))
        self.searcher = FileSearcher()
        self.resources = {}

    def __del__(self):
        if os.path.exists(self.f_report):
            os.unlink(self.f_report)

    def get_running_services_info(self):
        """Get string info for running services."""
        if self.services:
            RABBITMQ_INFO["services"] = self.get_service_info_str()

    def register_report_searches(self):
        """Register all sequence search definitions that we will execute
        against rabbitmqctl report.

        NOTE: the rabbitmqctl report output differs between versions 3.6.x and
              3.8.x and we try to account for either by providing optional
              regex expressions to match either.
        """
        self._sequences = {
            "queues": {
                "searchdef":
                    SequenceSearchDef(
                        start=SearchDef([r"^Queues on ([^:]+):",
                                         (r"^Listing queues for vhost ([^:]+) "
                                          r"...")]),
                        # NOTE: we don't use a list for the body here because
                        # we need to know which expression matched so that we
                        # can know in which order to retrieve the columns since
                        # their order is inverted between 3.6.x and 3.8.x
                        body=SearchDef(r"^(?:<([^.\s]+)[.0-9]+>\s+(\S+)|"
                                       r"(\S+)\s+(?:\S+\s+){4}<([^.\s]+)[.0-9]"
                                       r"+>)\s+.+"),
                        end=SearchDef(r"^$"),
                        tag="queues"),
                "callbacks":
                    [self.get_queue_info]
                },
            "connections": {
                "searchdef":
                    SequenceSearchDef(
                        start=SearchDef([r"^Connections:$",
                                         r"^Listing connections ...$"]),
                        body=SearchDef(r"^<(rabbit[^>.]*)(?:[.][0-9]+)+>.*$"),
                        end=SearchDef(r"^$"),
                        tag="connections"),
                "callbacks":
                    [self.get_queue_connection_distribution]
                },
            "memory": {
                "searchdef":
                    SequenceSearchDef(
                        start=SearchDef([r"^Status of node '([^']*)'$",
                                         r"^Status of node ([^']*) ...$"]),
                        body=SearchDef(r"^\s+\[{total,([0-9]+)}.+"),
                        end=SearchDef(r"^$"),
                        tag="memory"),
                "callbacks":
                    [self.get_memory_used]
                },
            "partitioning": {
                "searchdef":
                    SearchDef(r"^\s*{cluster_partition_handling,([^}]*)}",
                              tag="cluster_partition_handling"),
                "callbacks":
                [self.get_partition_handling]
            }
        }
        for s in self._sequences.values():
            self.searcher.add_search_term(s["searchdef"], self.f_report)

    def get_queue_info(self):
        """Get distribution of queues across cluster."""
        sd = self._sequences["queues"]["searchdef"]
        vhost_queues = {}
        issues_raised = {}
        skewed_queue_nodes = {}
        for results in self.results.find_sequence_sections(sd).values():
            vhost = None
            queues = {}
            for result in results:
                if result.tag == sd.start_tag:
                    # check both report formats
                    vhost = result.get(1)
                elif result.tag == sd.body_tag:
                    node_name = result.get(1) or result.get(4)
                    # if we matched the section header, skip
                    if node_name == "pid":
                        continue

                    queue = result.get(2) or result.get(3)
                    # if we matched the section header, skip
                    if queue == "name":
                        continue

                    if node_name not in queues:
                        queues[node_name] = 0

                    queues[node_name] += 1

            vhost_queues[vhost] = {}
            if not queues:
                continue

            total = sum(queues.values())
            for node_name in queues:
                if total > 0:
                    fraction = queues[node_name] / total
                    fraction_string = "{:.2f}%".format(fraction * 100)
                    if fraction > 2 / 3:
                        if node_name not in skewed_queue_nodes:
                            skewed_queue_nodes[node_name] = 0

                        skewed_queue_nodes[node_name] += 1
                else:
                    fraction_string = "N/A"

                vhost_queues[vhost][node_name] = "{:d} ({})".format(
                    queues[node_name], fraction_string)

            # Report the node with the greatest skew of queues/vhost
            if skewed_queue_nodes:
                max_node = None
                for node_name in skewed_queue_nodes:
                    if max_node is None:
                        max_node = node_name
                    elif (skewed_queue_nodes[node_name] >=
                            skewed_queue_nodes[max_node]):
                        max_node = node_name

                if (skewed_queue_nodes[max_node] >
                        issues_raised.get(max_node, 0)):
                    issues_raised[max_node] = skewed_queue_nodes[max_node]

        # this should only actually ever report one node
        for node_name in issues_raised:
            msg = ("{} holds more than 2/3 of queues for {}/{} vhost(s)".
                   format(node_name, issues_raised[node_name],
                          len(vhost_queues)))
            issues_utils.add_issue(issue_types.RabbitMQWarning(msg))

        if vhost_queues:
            # list all vhosts but only show their queues if not []
            self.resources["vhosts"] = sorted(list(vhost_queues.keys()))
            self.resources["vhost-queue-distributions"] = \
                {k: v for k, v in vhost_queues.items() if v}

    def get_queue_connection_distribution(self):
        """Get distribution of connections across cluster."""
        sd = self._sequences["connections"]["searchdef"]
        queue_connections = {}
        for results in self.results.find_sequence_sections(sd).values():
            for result in results:
                if result.tag == sd.body_tag:
                    queue_name = result.get(1)
                    if queue_name not in queue_connections:
                        queue_connections[queue_name] = 1
                    else:
                        queue_connections[queue_name] += 1

        if queue_connections:
            self.resources["queue-connections"] = queue_connections

    def get_memory_used(self):
        """Get the memory used per broker."""
        sd = self._sequences["memory"]["searchdef"]
        memory_used = {}
        for results in self.results.find_sequence_sections(sd).values():
            for result in results:
                if result.tag == sd.start_tag:
                    # check both report formats
                    node_name = result.get(1)
                elif result.tag == sd.body_tag:
                    total = result.get(1)
                    mib_used = int(total) / 1024. / 1024.
                    memory_used[node_name] = "{:.3f}".format(mib_used)

        if memory_used:
            self.resources["memory-used-mib"] = memory_used

    def get_partition_handling(self):
        """Get the partition handling settings."""
        results = self.results.find_by_tag("cluster_partition_handling")
        if not results:
            return

        setting = results[0].get(1)
        if setting == "ignore":
            msg = "Cluster partition handling is currently set to ignore. " \
                "This is potentially dangerous and a setting of " \
                "pause_minority is recommended."
            issues_utils.add_issue(issue_types.RabbitMQWarning(msg))
            self.resources["cluster-partition-handling"] = setting

    def run_report_callbacks(self):
        for s in self._sequences.values():
            for f in s["callbacks"]:
                f()

    def run_report_searches(self):
        self.register_report_searches()
        self.results = self.searcher.search()
        self.run_report_callbacks()
        if not self.resources:
            return

        RABBITMQ_INFO["resources"] = self.resources

    def __call__(self):
        super().__call__()
        self.get_running_services_info()
        self.run_report_searches()
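
# To make the capture group ordering handled in get_queue_info() concrete,
# a small self-contained check of the "queues" body expression. The sample
# lines are illustrative of the 3.6.x and 3.8.x layouts, not real output:
import re

QUEUES_BODY = re.compile(r"^(?:<([^.\s]+)[.0-9]+>\s+(\S+)|"
                         r"(\S+)\s+(?:\S+\s+){4}<([^.\s]+)[.0-9]+>)\s+.+")

# 3.6.x layout: "<node.pid> queue ..." -> node in group 1, queue in group 2
m = QUEUES_BODY.match("<rabbit@host1.1.567.0>  my-queue  0  0")
assert (m.group(1), m.group(2)) == ("rabbit@host1", "my-queue")

# 3.8.x layout: "queue ... <node.pid> ..." -> queue in group 3, node in 4
m = QUEUES_BODY.match("my-queue  false  false  0  0  "
                      "<rabbit@host1.1.567.0>  running")
assert (m.group(3), m.group(4)) == ("my-queue", "rabbit@host1")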
Example #25
class OctaviaLBChecks(checks.APTPackageChecksBase):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.searcher = FileSearcher()
        self.logs_path = os.path.join(constants.DATA_ROOT,
                                      SERVICE_RESOURCES["octavia"]["logs"])

        self.data_sources = {}
        fname = 'octavia-health-manager.log'
        self.data_sources["health-manager"] = os.path.join(self.logs_path,
                                                           fname)
        self.data_sources["worker"] = os.path.join(self.logs_path,
                                                   'octavia-worker.log')
        if constants.USE_ALL_LOGS:
            self.data_sources["health-manager"] = (
                "{}*".format(self.data_sources["health-manager"]))
            self.data_sources["worker"] = (
                "{}*".format(self.data_sources["worker"]))

    def get_hm_amphora_missed_heartbeats(self):
        missed_heartbeats = {}
        expr = (r"^(\S+) \S+ .+ Amphora (\S+) health message was processed "
                r"too slowly:.+")
        d = SearchDef(expr, tag="amp-missed-hb", hint="health message")
        self.searcher.add_search_term(d, self.data_sources["health-manager"])

        results = self.searcher.search()
        for r in results.find_by_tag("amp-missed-hb"):
            ts_date = r.get(1)
            amp_id = r.get(2)

            if ts_date not in missed_heartbeats:
                missed_heartbeats[ts_date] = {}

            if amp_id in missed_heartbeats[ts_date]:
                missed_heartbeats[ts_date][amp_id] += 1
            else:
                missed_heartbeats[ts_date][amp_id] = 1

        # sort each amp by occurrences
        for ts_date in missed_heartbeats:
            d = utils.sorted_dict(missed_heartbeats[ts_date],
                                  key=lambda e: e[1], reverse=True)
            missed_heartbeats[ts_date] = d

        if missed_heartbeats:
            # now sort by date
            LB_CHECKS["amp-missed-heartbeats"] = \
                utils.sorted_dict(missed_heartbeats)

    def get_lb_failovers(self):
        """Get loadbalancer failover counts."""
        failovers = {}
        expr = (r"^(\S+) \S+ .+ Performing failover for amphora:\s+(.+)")
        d = SearchDef(expr, tag="lb-failover-auto", hint="failover")
        self.searcher.add_search_term(d, self.data_sources["health-manager"])

        expr = (r"^(\S+) \S+ .+ Performing failover for amphora:\s+(.+)")
        d = SearchDef(expr, tag="lb-failover-manual", hint="failover")
        self.searcher.add_search_term(d, self.data_sources["worker"])

        for fo_type in ["auto", "manual"]:
            results = self.searcher.search()
            for r in results.find_by_tag("lb-failover-{}".format(fo_type)):
                ts_date = r.get(1)
                payload = r.get(2)
                payload = yaml.safe_load(payload)
                lb_id = payload.get("load_balancer_id")
                if lb_id is None:
                    continue

                if fo_type not in failovers:
                    failovers[fo_type] = {}

                if ts_date not in failovers[fo_type]:
                    failovers[fo_type][ts_date] = {}

                if lb_id in failovers[fo_type][ts_date]:
                    failovers[fo_type][ts_date][lb_id] += 1
                else:
                    failovers[fo_type][ts_date][lb_id] = 1

        for fo_type in failovers:
            # sort each failover by occurences
            for ts_date in failovers[fo_type]:
                d = utils.sorted_dict(failovers[fo_type][ts_date],
                                      key=lambda e: e[1], reverse=True)
                failovers[fo_type][ts_date] = d

            # now sort the dates
            failovers[fo_type] = utils.sorted_dict(failovers[fo_type])

        if failovers:
            LB_CHECKS["lb-failovers"] = failovers

    def __call__(self):
        if self.core:
            self.get_lb_failovers()
            self.get_hm_amphora_missed_heartbeats()
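
# utils.sorted_dict() is a helper assumed by this example; a minimal sketch
# of the behaviour relied on above - return a new dict ordered by the given
# key function, defaulting to ordering on the dictionary keys (the real
# helper may differ):
def sorted_dict(d, key=None, reverse=False):
    return dict(sorted(d.items(),
                       key=key if key else lambda e: e[0],
                       reverse=reverse))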
Example #26
class NeutronL3HAChecks(object):
    def __init__(self):
        self.searcher = FileSearcher()
        self.f_journalctl = None
        self.router_vrrp_pids = {}

    def __del__(self):
        if self.f_journalctl and os.path.exists(self.f_journalctl):
            os.unlink(self.f_journalctl)

    def _get_journalctl_l3_agent(self):
        if not constants.USE_ALL_LOGS:
            date = cli_helpers.get_date(format="--iso-8601").rstrip()
        else:
            date = None

        out = cli_helpers.get_journalctl(unit="neutron-l3-agent", date=date)
        self.f_journalctl = mktemp_dump(''.join(out))

    def get_neutron_ha_info(self):
        ha_state_path = os.path.join(constants.DATA_ROOT, NEUTRON_HA_PATH)
        if not os.path.exists(ha_state_path):
            return

        router_states = {}
        for entry in os.listdir(ha_state_path):
            entry = os.path.join(ha_state_path, entry)
            if os.path.isdir(entry):
                pid_path = "{}{}".format(entry, ".pid.keepalived-vrrp")
                keepalived_conf_path = os.path.join(entry, "keepalived.conf")
                state_path = os.path.join(entry, "state")
                if os.path.exists(state_path):
                    with open(state_path) as fd:
                        router = os.path.basename(entry)
                        state = fd.read().strip()
                        if state in router_states:
                            router_states[state].append(router)
                        else:
                            router_states[state] = [router]

                    if os.path.isfile(keepalived_conf_path):
                        with open(keepalived_conf_path) as fd:
                            for line in fd:
                                expr = ".+ virtual_router_id ([0-9]+)"
                                ret = re.compile(expr).search(line)
                                if ret:
                                    ROUTER_VR_IDS[router] = ret.group(1)

                    if os.path.isfile(pid_path):
                        with open(pid_path) as fd:
                            pid = fd.read().strip()
                            self.router_vrrp_pids[router] = pid

        if router_states:
            L3HA_CHECKS["agent"] = router_states

    def get_vrrp_transitions(self):
        """
        List routers that have had a vrrp state transition along with the
        number of transitions. Excludes routers that have not had any change of
        state.
        """
        if not self.router_vrrp_pids:
            return

        self._get_journalctl_l3_agent()
        transitions = {}
        for router in self.router_vrrp_pids:
            vr_id = ROUTER_VR_IDS[router]
            expr = (r"^([0-9-]+)T\S+ \S+ Keepalived_vrrp"
                    r"\[([0-9]+)\]: VRRP_Instance\(VR_{}\) .+ (\S+) "
                    "STATE.*".format(vr_id))
            d = SearchDef(expr, tag=router)
            self.searcher.add_search_term(d, self.f_journalctl)

        results = self.searcher.search()
        for router in self.router_vrrp_pids:
            t_count = len(results.find_by_tag(router))
            if not t_count:
                continue

            for r in results.find_by_tag(router):
                ts_date = r.get(1)
                if router not in transitions:
                    transitions[router] = {}

                if ts_date in transitions[router]:
                    transitions[router][ts_date] += 1
                else:
                    transitions[router][ts_date] = 1

        if transitions:
            L3HA_CHECKS["keepalived"] = {"transitions": transitions}

    def check_vrrp_transitions(self):
        if "transitions" not in L3HA_CHECKS.get("keepalived", {}):
            return

        max_transitions = 0
        warn_count = 0
        threshold = VRRP_TRANSITION_WARN_THRESHOLD
        for router in L3HA_CHECKS["keepalived"]["transitions"]:
            r = L3HA_CHECKS["keepalived"]["transitions"][router]
            transitions = sum(r.values())
            if transitions > threshold:
                max_transitions = max(transitions, max_transitions)
                warn_count += 1

        if warn_count:
            msg = ("{} router(s) have had more than {} vrrp transitions "
                   "(max={}) in the last 24 hours".format(
                       warn_count, threshold, max_transitions))
            issues_utils.add_issue(issue_types.NeutronL3HAWarning(msg))

    def __call__(self):
        self.get_neutron_ha_info()
        self.get_vrrp_transitions()

        # there will likely be a large number of transitions if we look across
        # all time so don't run this check.
        if not constants.USE_ALL_LOGS:
            self.check_vrrp_transitions()