def test_add_issue(self):
        """Add a bug while a known_bugs.yaml file already exists.

        Bug 1 is written to known_bugs.yaml in the plugin tmp dir, bug 2 is
        added through a new IssuesManager, and load_bugs() is expected to
        return both entries with bug 1 first.
        """
        bugs_root = IssuesManager.SUMMARY_OUT_BUGS_ROOT
        first_bug = {
            'id': 'https://bugs.launchpad.net/bugs/1',
            'desc': None,
            'origin': 'testplugin.01part',
        }
        second_bug = {
            'id': 'https://bugs.launchpad.net/bugs/2',
            'desc': None,
            'origin': 'testplugin.01part',
        }

        seed_path = os.path.join(self.plugin_tmp_dir, 'known_bugs.yaml')
        with open(seed_path, 'w') as fd:
            fd.write(yaml.dump({bugs_root: [first_bug]}))

        manager = IssuesManager()
        manager.add(LaunchpadBug(2, None))
        loaded = manager.load_bugs()
        self.assertEqual(loaded, {bugs_root: [first_bug, second_bug]})
示例#2
0
    def run(self):
        """Evaluate each scenario and raise issues for its top conclusions.

        Every conclusion of a scenario is tested against the scenario checks.
        Reached conclusions are grouped by priority (a conclusion without a
        priority is treated as priority 1) and only the conclusions in the
        highest priority group are added to the IssuesManager.
        """
        mgr = IssuesManager()
        for scenario in self.scenarios:
            log.debug("running scenario: %s", scenario.name)
            # Bucket reached conclusions by priority. Several conclusions may
            # share a priority, in which case all of them are kept.
            by_priority = {}
            for name, conc in scenario.conclusions.items():
                if not conc.reached(scenario.checks):
                    continue

                priority = conc.priority.value if conc.priority else 1
                by_priority.setdefault(priority, []).append(conc)
                log.debug("conclusion reached: %s (priority=%s)", name,
                          priority)

            if not by_priority:
                log.debug("no conclusions reached")
                continue

            highest = max(by_priority)
            selected = by_priority[highest]
            log.debug("selecting highest priority=%s conclusions (%s)",
                      highest, len(selected))
            for conc in selected:
                mgr.add(conc.issue, context=conc.context)
    def test_get_issues(self):
        """Expect load_issues() to equal an empty dict.

        An empty mapping is dumped as yaml to a file named 'yaml' in the
        plugin tmp dir before a new IssuesManager is queried.
        """
        expected = {}
        target = os.path.join(self.plugin_tmp_dir, 'yaml')
        with open(target, 'w') as fd:
            fd.write(yaml.dump(expected))

        manager = IssuesManager()
        self.assertEqual(manager.load_issues(), expected)
 def test_issue_not_machine_readable(self):
     """Check the load_issues() output without MACHINE_READABLE being set.

     A single MemoryWarning is added and is expected back as one string,
     including its origin, listed under the 'MemoryWarnings' key.
     """
     manager = IssuesManager()
     manager.add(MemoryWarning("test"))
     loaded = manager.load_issues()
     expected = {
         IssuesManager.SUMMARY_OUT_ISSUES_ROOT: {
             'MemoryWarnings': ['test (origin=testplugin.01part)'],
         },
     }
     self.assertEqual(loaded, expected)
 def test_add_issue_first(self):
     """Add one Launchpad bug and expect load_bugs() to return only it."""
     manager = IssuesManager()
     manager.add(LaunchpadBug(1, None))
     loaded = manager.load_bugs()
     only_bug = {
         'id': 'https://bugs.launchpad.net/bugs/1',
         'desc': None,
         'origin': 'testplugin.01part',
     }
     expected = {IssuesManager.SUMMARY_OUT_BUGS_ROOT: [only_bug]}
     self.assertEqual(loaded, expected)
 def test_issue_machine_readable(self):
     """Check the load_issues() output with MACHINE_READABLE=True.

     The added MemoryWarning is expected back as a dict with 'type', 'desc'
     and 'origin' keys.
     """
     setup_config(MACHINE_READABLE=True)
     manager = IssuesManager()
     manager.add(MemoryWarning("test"))
     loaded = manager.load_issues()
     entry = {
         'type': 'MemoryWarning',
         'desc': 'test',
         'origin': 'testplugin.01part',
     }
     expected = {IssuesManager.SUMMARY_OUT_ISSUES_ROOT: [entry]}
     self.assertEqual(loaded, expected)
 def test_add_issue_w_empty_context(self):
     """Add an issue together with an IssueContext that has nothing set.

     The expected entry has no 'context' key.
     """
     setup_config(MACHINE_READABLE=True)
     empty_ctxt = IssueContext()
     manager = IssuesManager()
     manager.add(MemoryWarning("test"), empty_ctxt)
     loaded = manager.load_issues()
     entry = {
         'type': 'MemoryWarning',
         'desc': 'test',
         'origin': 'testplugin.01part',
     }
     expected = {IssuesManager.SUMMARY_OUT_ISSUES_ROOT: [entry]}
     self.assertEqual(loaded, expected)
示例#8
0
 def test_cacheset(self):
     """Run the scenarios against a bcachefs tree set up with a cacheset error.

     Expects one issue description and one bug description, each taken from
     the first group returned by load_issues() / load_bugs().
     """
     want_bug = (
         'bcache cache_available_percent is 33 (i.e. approx. 30%) '
         'which implies this node could be suffering from bug LP '
         '1900438 - please check.')
     want_issue = ('bcache cacheset config congested_write_threshold_us '
                   'expected to be eq 0 but actual=100.')
     with tempfile.TemporaryDirectory() as dtmp:
         self.setup_bcachefs(dtmp, cacheset_error=True)
         setup_config(DATA_ROOT=dtmp)
         YScenarioChecker()()

         raised = list(IssuesManager().load_issues().values())[0]
         issue_descs = [entry['desc'] for entry in raised]
         self.assertEqual(issue_descs, [want_issue])

         found = list(IssuesManager().load_bugs().values())[0]
         bug_descs = [entry['desc'] for entry in found]
         self.assertEqual(bug_descs, [want_bug])
示例#9
0
 def test_osd_messenger_v2_protocol(self):
     """Expect a single issue about an OSD not bound to a v2 address."""
     YScenarioChecker()()
     want = ("This Ceph cluster has 1 OSD(s) that do not bind to a v2 "
             "messenger address. This will cause unexpected behaviour and "
             "should be resolved asap.")
     raised = list(IssuesManager().load_issues().values())[0]
     descs = [entry['desc'] for entry in raised]
     self.assertEqual(descs, [want])
示例#10
0
    def __summary_memory_checks(self):
        _mem_info = {}
        node_results = self.check_nodes_memory("Normal")
        if not node_results:
            # only check other types of no issue detected on Normal
            node_results = self.check_nodes_memory("DMA32")

        if node_results:
            _mem_info = node_results

        # We only report on compaction errors if there is a shortage of
        # high-order zones.
        if _mem_info:
            fail_count = self.get_vmstat_value("compact_fail")
            success_count = self.get_vmstat_value("compact_success")
            # we use an arbitrary threshold of 10k to suggest that a lot of
            # compaction has occurred but noting that this is a rolling counter
            # and is not necessarily representative of current state.
            if success_count > 10000:
                pcent = int(fail_count / (success_count / 100))
                if pcent > 10:
                    msg = ("compaction failures are at {}% of successes "
                           "(see {}).".format(pcent, self.vmstat_path))
                    IssuesManager().add(MemoryWarning(msg))

            top5 = self.get_slab_major_consumers()
            if top5:
                _mem_info["slab-top-consumers"] = top5
        else:
            _mem_info = "no issues found"

        return _mem_info
示例#11
0
    def test_lp1936136(self, mocl_cli, mock_cephbase, mock_kernelbase,
                       mock_cset_config, mock_ceph_config):
        """Expect the LP 1936136 bug message for the mocked host state.

        The mocks report ceph-osd 14.2.22, a cacheset config value of 69,
        bluefs_buffered_io 'true', OSDs using bcache and kernel version 5.3.
        """
        def fake_ceph_config(key):
            # Any other key yields None.
            return 'true' if key == 'bluefs_buffered_io' else None

        cli = mock.MagicMock()
        cli.dpkg_l.return_value = [
            "ii  ceph-osd 14.2.22-0ubuntu0.20.04.2 amd64"]
        mocl_cli.return_value = cli

        cset_config = mock.MagicMock()
        cset_config.get.return_value = 69
        mock_cset_config.return_value = cset_config

        ceph_config = mock.MagicMock()
        ceph_config.get.side_effect = fake_ceph_config
        mock_ceph_config.return_value = ceph_config

        cephbase = mock.MagicMock()
        cephbase.local_osds_use_bcache = True
        mock_cephbase.return_value = cephbase

        kernelbase = mock.MagicMock()
        kernelbase.version = '5.3'
        mock_kernelbase.return_value = kernelbase

        YScenarioChecker()()

        want = ('This host has Ceph OSDs using bcache block devices and may '
                'be vulnerable to bcache bug LP 1936136 since '
                'bcache cache_available_percent is lt 70 (actual=69). The '
                'current workaround is to set bluefs_buffered_io=false in '
                'Ceph or upgrade to a kernel >= 5.4.')

        found = list(IssuesManager().load_bugs().values())[0]
        descs = [entry['desc'] for entry in found]
        self.assertEqual(descs, [want])
示例#12
0
 def test_bug_check_lp1959649(self, mock_cephdaemon, mock_helper):
     """Expect bug 1959649 to be reported for the mocked OSD state.

     The mocks report ceph-osd 15.2.7 and a
     bluestore_volume_selection_policy of ['rocksdb_original']. The whole
     load_bugs() output is compared, including the context.
     """
     helper = mock.MagicMock()
     helper.dpkg_l.return_value = [
         "ii  ceph-osd 15.2.7-0ubuntu0.20.04.2 amd64"]
     mock_helper.return_value = helper

     daemon = mock.MagicMock()
     daemon.bluestore_volume_selection_policy = ['rocksdb_original']
     mock_cephdaemon.return_value = daemon

     YScenarioChecker()()

     want_desc = ('This host is vulnerable to known bug '
                  'https://tracker.ceph.com/issues/38745. RocksDB needs more '
                  'space than the leveled space available so it is using '
                  'storage from the data disk. Please set '
                  'bluestore_volume_selection_policy of all OSDs to '
                  'use_some_extra')
     bug_entry = {
         'context': {'passes': True},
         'desc': want_desc,
         'id': 'https://bugs.launchpad.net/bugs/1959649',
         'origin': 'storage.01part',
     }
     self.assertEqual(IssuesManager().load_bugs(),
                      {'bugs-detected': [bug_entry]})
示例#13
0
    def test_1943937(self):
        """Expect bug 1943937 when the rabbitmq log has a queue timeout.

        A log file containing the timeout message is created under a
        temporary DATA_ROOT before the scenarios are run.
        """
        log_line = ("operation queue.declare caused a channel exception "
                    "not_found: failed to perform operation on queue "
                    "'test_exchange_queue' in vhost "
                    "'nagios-rabbitmq-server-0' due to timeout")
        want_desc = ('Known RabbitMQ issue where queues get stuck and clients '
                     'trying to use them will just keep timing out. This '
                     'stops many services in the cloud from working '
                     'correctly. Resolution requires you to stop all RabbitMQ '
                     'servers before starting them all again at the same '
                     'time. A rolling restart or restarting them '
                     'simultaneously will not work. See bug for more detail.')

        with tempfile.TemporaryDirectory() as dtmp:
            setup_config(DATA_ROOT=dtmp)
            # NOTE(review): this file name looks like an e-mail obfuscation
            # artifact rather than a real rabbitmq log name - confirm against
            # the upstream test before relying on it.
            logfile = os.path.join(dtmp, 'var/log/rabbitmq/[email protected]')
            os.makedirs(os.path.dirname(logfile))
            with open(logfile, 'w') as fd:
                fd.write(log_line)

            YScenarioChecker()()

            bug_entry = {
                'id': 'https://bugs.launchpad.net/bugs/1943937',
                'desc': want_desc,
                'origin': 'rabbitmq.01part',
            }
            self.assertEqual(IssuesManager().load_bugs(),
                             {'bugs-detected': [bug_entry]})
示例#14
0
    def test_yaml_def_scenario_checks_requires(self):
        """Load SCENARIO_CHECKS and verify the result of each known check.

        The definitions are written to scenarios.yaml in a temporary
        PLUGIN_YAML_DEFS directory. Exactly one scenario is expected, and
        four named checks must be seen with the expected truthiness. Running
        the scenarios afterwards must raise no issues.
        """
        with tempfile.TemporaryDirectory() as dtmp:
            setup_config(PLUGIN_YAML_DEFS=dtmp, PLUGIN_NAME='myplugin')
            # Use a context manager so the file is flushed and closed before
            # the checker reads it, instead of relying on garbage collection.
            with open(os.path.join(dtmp, 'scenarios.yaml'), 'w') as fd:
                fd.write(SCENARIO_CHECKS)

            checker = scenarios.YScenarioChecker()
            checker.load()
            self.assertEqual(len(checker.scenarios), 1)

            # Check name -> assertion applied to that check's result.
            expectations = {
                'apt_pkg_exists': self.assertTrue,
                'snap_pkg_exists': self.assertTrue,
                'service_exists_and_enabled': self.assertTrue,
                'service_exists_not_enabled': self.assertFalse,
            }
            checked = 0
            for scenario in checker.scenarios:
                for check in scenario.checks.values():
                    assert_result = expectations.get(check.name)
                    if assert_result is None:
                        # Checks not listed above are not counted.
                        continue

                    checked += 1
                    assert_result(check.result)

            self.assertEqual(checked, 4)

            # now run the scenarios
            checker()

            self.assertEqual(IssuesManager().load_issues(), {})
示例#15
0
    def _get_crc_errors(self, results, osd_type):
        if results:
            ret = self.get_timings(results, resource_osd_from_source=True)

            # If on any particular day there were > 3 crc errors for a
            # particular osd we raise an issue since that indicates they are
            # likely to reflect a real problem.
            osds_in_err = set()
            osd_err_max = 0
            # ret is keyed by day
            for osds in ret.values():
                # If we were unable to glean the osd id from the search results
                # this will not be a dict so skip.
                if type(osds) != dict:
                    continue

                for osd, num_errs in osds.items():
                    if num_errs > 3:
                        if num_errs > osd_err_max:
                            osd_err_max = num_errs

                        osds_in_err.add(osd)

            if osds_in_err:
                msg = ("{} osds ({}) found with > 3 {} crc errors (max={}) "
                       "each within a 24hr period - please investigate".
                       format(len(osds_in_err), ','.join(osds_in_err),
                              osd_type, osd_err_max))
                IssuesManager().add(CephOSDError(msg))

            return ret
示例#16
0
 def test_filestore_to_bluestore_upgrade(self, mock_ceph_config):
     """Expect an issue about a journal device left in ceph.conf.

     The mocked ceph config returns '/journal/path' for any key.
     """
     def fake_get(args):
         return '/journal/path'

     ceph_config = mock.MagicMock()
     ceph_config.get = fake_get
     mock_ceph_config.return_value = ceph_config

     YScenarioChecker()()
     want = ("Ceph Bluestore is enabled yet there is a still a journal "
             "device configured in ceph.conf - please check")
     raised = list(IssuesManager().load_issues().values())[0]
     descs = [entry['desc'] for entry in raised]
     self.assertEqual(descs, [want])
示例#17
0
    def test_ssd_osds_no_discard(self):
        """Expect an issue about ssd osds without Bluestore discard.

        The test is currently skipped by its first statement.
        """
        self.skipTest("scenario currently disabled until fixed")

        YScenarioChecker()()
        want = [("This host has osds with device_class 'ssd' but Bluestore "
                 "discard is not enabled. The recommendation is to set 'bdev "
                 "enable discard true'.")]
        raised = list(IssuesManager().load_issues().values())[0]
        descs = [entry['desc'] for entry in raised]
        self.assertEqual(descs, want)
示例#18
0
 def test_bdev(self):
     """Run the scenarios against a bcachefs tree set up with a bdev error.

     Expects a single issue about the writeback_percent value.
     """
     want = ('bcache config writeback_percent expected to be ge '
             '10 but actual=1.')
     with tempfile.TemporaryDirectory() as dtmp:
         self.setup_bcachefs(dtmp, bdev_error=True)
         setup_config(DATA_ROOT=dtmp)
         YScenarioChecker()()
         raised = list(IssuesManager().load_issues().values())[0]
         descs = [entry['desc'] for entry in raised]
         self.assertEqual(descs, [want])
示例#19
0
 def test_bluefs_size(self):
     """Expect a single bug about OSDs with metadata larger than 10G."""
     YScenarioChecker()()
     want = ('Found 3 Ceph OSDs with metadata size larger than 10G. This '
             'could be the result of a compaction failure/bug and this host '
             'may be affected by https://tracker.ceph.com/issues/45903. A '
             'workaround (>= Nautilus) is to manually compact using '
             "'ceph-bluestore-tool'.")
     found = list(IssuesManager().load_bugs().values())[0]
     descs = [entry['desc'] for entry in found]
     self.assertEqual(descs, [want])
示例#20
0
 def test_unresponsive_mgr_p1(self):
     """Expect a single issue about incomplete sosreport ceph plugin data."""
     YScenarioChecker()()
     want = ("One or more sosreport ceph plugins contain incomplete data. "
             "This usually indicates a problem with ceph mon/mgr. Please "
             "check ceph-mon.log and retry commands to see if they are "
             "still unresponsive. Restarting ceph-mon and ceph-mgr might "
             "resolve this.")
     raised = list(IssuesManager().load_issues().values())[0]
     descs = [entry['desc'] for entry in raised]
     self.assertEqual(descs, [want])
示例#21
0
 def test_juju_ceph_no_bcache_tuning(self):
     """Expect a single issue recommending the bcache-tuning charm."""
     YScenarioChecker()()
     want = ("This host is running Juju-managed Ceph OSDs that are "
             "using bcache devices yet the bcache-tuning charm was "
             "not detected. It is recommended to use the "
             "bcache-tuning charm to ensure optimal bcache "
             "configuration.")
     raised = list(IssuesManager().load_issues().values())[0]
     descs = [entry['desc'] for entry in raised]
     self.assertEqual(descs, [want])
示例#22
0
    def test_laggy_pgs(self, mock_helper):
        """Expect a single laggy/wait PG issue for the mocked pg dump.

        The mocked helper returns PG_DUMP_JSON_DECODED from
        ceph_pg_dump_json_decoded().
        """
        helper = mock.MagicMock()
        helper.ceph_pg_dump_json_decoded.return_value = PG_DUMP_JSON_DECODED
        mock_helper.return_value = helper

        YScenarioChecker()()
        want = ('Ceph cluster is reporting 1 laggy/wait PGs. This suggests a '
                'potential network or storage issue - please check.')
        raised = list(IssuesManager().load_issues().values())[0]
        descs = [entry['desc'] for entry in raised]
        self.assertEqual(descs, [want])
示例#23
0
    def test_unresponsive_mgr_p2(self):
        """Expect a single issue about ceph commands with incomplete data."""
        YScenarioChecker()()
        want = ("Some ceph commands are returning incomplete data. This "
                "usually indicates a problem with ceph mon/mgr. Please check "
                "ceph-mon.log and retry commands to see if they are still "
                "unresponsive. Restarting ceph-mon and ceph-mgr might "
                "resolve this.")

        raised = list(IssuesManager().load_issues().values())[0]
        descs = [entry['desc'] for entry in raised]
        self.assertEqual(descs, [want])
示例#24
0
    def test_required_osd_release(self, mock_helper):
        """Expect a single 'require_osd_release' issue.

        The mocked helper returns the lines of CEPH_VERSIONS_MISMATCHED_MAJOR
        from ceph_versions().
        """
        version_lines = CEPH_VERSIONS_MISMATCHED_MAJOR.split('\n')
        helper = mock.MagicMock()
        helper.ceph_versions.return_value = version_lines
        mock_helper.return_value = helper

        YScenarioChecker()()
        want = ("Ceph cluster config 'require_osd_release' is set to "
                "'octopus' but not all OSDs are on that version - please "
                "check.")
        raised = list(IssuesManager().load_issues().values())[0]
        descs = [entry['desc'] for entry in raised]
        self.assertEqual(descs, [want])
示例#25
0
 def test_add_issue_empty_context(self):
     """Add an issue with a context and expect it in the output.

     Despite the test name, the context used here has linenumber and path
     set, and the expected entry has a matching 'context' dict.
     """
     setup_config(MACHINE_READABLE=True)
     ctxt = IssueContext()
     ctxt.set(linenumber=123, path='/foo/bar')
     manager = IssuesManager()
     manager.add(MemoryWarning("test"), ctxt)
     loaded = manager.load_issues()
     entry = {
         'type': 'MemoryWarning',
         'desc': 'test',
         'context': {'path': '/foo/bar', 'linenumber': 123},
         'origin': 'testplugin.01part',
     }
     expected = {IssuesManager.SUMMARY_OUT_ISSUES_ROOT: [entry]}
     self.assertEqual(loaded, expected)
示例#26
0
 def test_ceph_versions_mismatch_p1(self, mock_helper):
     """Expect a single issue about unaligned Ceph daemon versions.

     The mocked helper returns the lines of CEPH_VERSIONS_MISMATCHED_MINOR
     from ceph_versions().
     """
     version_lines = CEPH_VERSIONS_MISMATCHED_MINOR.split('\n')
     helper = mock.MagicMock()
     helper.ceph_versions.return_value = version_lines
     mock_helper.return_value = helper

     YScenarioChecker()()
     want = ('Ceph daemon versions are not aligned across the cluster. This '
             'could be the result of an incomplete or failed cluster '
             'upgrade. All daemons, except the clients, should ideally be '
             'on the same version for ceph to function correctly.')
     raised = list(IssuesManager().load_issues().values())[0]
     descs = [entry['desc'] for entry in raised]
     self.assertEqual(descs, [want])
示例#27
0
    def test_crushmap_bucket_checks_mixed_buckets(self, mock_helper):
        """Expect a single issue about mixed crush bucket types.

        The mocked helper returns the decoded CEPH_OSD_CRUSH_DUMP from
        ceph_osd_crush_dump_json_decoded().
        """
        crush_dump = json.loads(CEPH_OSD_CRUSH_DUMP)
        helper = mock.MagicMock()
        helper.ceph_osd_crush_dump_json_decoded.return_value = crush_dump
        mock_helper.return_value = helper

        YScenarioChecker()()
        want = ("Mixed crush bucket types identified in buckets 'default'. "
                "This can cause data distribution to become skewed - please "
                "check crush map.")
        raised = list(IssuesManager().load_issues().values())[0]
        descs = [entry['desc'] for entry in raised]
        self.assertEqual(descs, [want])
示例#28
0
 def test_ceph_versions_mismatch_p2(self, mock_helper):
     """Expect a single issue about mons older than other daemons.

     The mocked helper returns the lines of
     CEPH_VERSIONS_MISMATCHED_MINOR_MONS_UNALIGNED from ceph_versions().
     """
     version_lines = CEPH_VERSIONS_MISMATCHED_MINOR_MONS_UNALIGNED.split('\n')
     helper = mock.MagicMock()
     helper.ceph_versions.return_value = version_lines
     mock_helper.return_value = helper

     YScenarioChecker()()
     want = ('One or more Ceph mons has a version lower than other daemons '
             'e.g. ceph-osd running in the cluster. This can cause '
             'unexpected behaviour and should be resolved as soon as '
             'possible. Check full summary output for current versions.')
     raised = list(IssuesManager().load_issues().values())[0]
     descs = [entry['desc'] for entry in raised]
     self.assertEqual(descs, [want])
示例#29
0
 def test_ceph_pg_imbalance(self, mock_helper):
     """Expect two pg-count issues, in order, for the imbalanced setup.

     The helper mock is prepared by setup_fake_cli_osds_imbalanced_pgs().
     """
     self.setup_fake_cli_osds_imbalanced_pgs(mock_helper)
     YScenarioChecker()()
     over_limit = ('Found some Ceph osd(s) with > 500 pgs - this is close to '
                   'the hard limit at which point they will stop creating '
                   'pgs and fail - please investigate.')
     out_of_range = ('Found some Ceph osd(s) whose pg count is > 30% outside '
                     'the optimal range of 50-200 pgs. This could indicate '
                     'poor data distribution across the cluster and result '
                     'in performance degradation.')
     raised = list(IssuesManager().load_issues().values())[0]
     descs = [entry['desc'] for entry in raised]
     self.assertEqual(descs, [over_limit, out_of_range])
示例#30
0
    def test_scenario_osd_maps_backlog_too_large(self, mock_helper):
        """Expect a single issue about 5496 pinned osdmaps.

        The mocked helper returns a report whose osdmap_manifest has a
        'pinned_maps' entry of range(5496).
        """
        report = {'osdmap_manifest': {'pinned_maps': range(5496)}}
        helper = mock.MagicMock()
        helper.ceph_report_json_decoded.return_value = report
        mock_helper.return_value = helper

        YScenarioChecker()()
        want = ("This Ceph cluster has 5496 pinned osdmaps. This can affect "
                "ceph-mon performance and may also indicate bugs such as "
                "https://tracker.ceph.com/issues/44184 and "
                "https://tracker.ceph.com/issues/47290.")
        raised = list(IssuesManager().load_issues().values())[0]
        descs = [entry['desc'] for entry in raised]
        self.assertEqual(descs, [want])