def test_add_issue(self):
    """Adding a bug must append it to the previously saved known bugs."""
    existing = {
        IssuesManager.SUMMARY_OUT_BUGS_ROOT: [
            {'id': 'https://bugs.launchpad.net/bugs/1',
             'desc': None,
             'origin': 'testplugin.01part'}]}
    path = os.path.join(self.plugin_tmp_dir, 'known_bugs.yaml')
    with open(path, 'w') as fd:
        fd.write(yaml.dump(existing))

    mgr = IssuesManager()
    mgr.add(LaunchpadBug(2, None))
    expected = {
        IssuesManager.SUMMARY_OUT_BUGS_ROOT: [
            {'id': 'https://bugs.launchpad.net/bugs/1',
             'desc': None,
             'origin': 'testplugin.01part'},
            {'id': 'https://bugs.launchpad.net/bugs/2',
             'desc': None,
             'origin': 'testplugin.01part'}]}
    self.assertEqual(mgr.load_bugs(), expected)
def run(self):
    """Run all loaded scenarios and raise issues for the conclusions
    reached at the highest priority.

    One or more conclusions may share the same priority; all matching
    conclusions at the winning (highest) priority are used.
    """
    mgr = IssuesManager()
    for scenario in self.scenarios:
        log.debug("running scenario: %s", scenario.name)
        # Group every conclusion that matches by its priority value.
        by_priority = {}
        for name, conclusion in scenario.conclusions.items():
            if not conclusion.reached(scenario.checks):
                continue

            # Conclusions with no explicit priority default to 1.
            if conclusion.priority:
                priority = conclusion.priority.value
            else:
                priority = 1

            by_priority.setdefault(priority, []).append(conclusion)
            log.debug("conclusion reached: %s (priority=%s)", name,
                      priority)

        if not by_priority:
            log.debug("no conclusions reached")
            continue

        top = max(by_priority.keys())
        log.debug("selecting highest priority=%s conclusions (%s)",
                  top, len(by_priority[top]))
        for conclusion in by_priority[top]:
            mgr.add(conclusion.issue, context=conclusion.context)
def test_get_issues(self):
    """Loading issues when none have been raised must return empty."""
    raised_issues = {}
    # NOTE(review): the filename here is literally 'yaml' which looks like
    # a truncated name (cf. 'known_bugs.yaml' used by the sibling test).
    # Since the dumped dict is empty the assertion passes either way -
    # confirm the intended filename against IssuesManager.load_issues().
    with open(os.path.join(self.plugin_tmp_dir, 'yaml'), 'w') as fd:
        fd.write(yaml.dump(raised_issues))
    mgr = IssuesManager()
    ret = mgr.load_issues()
    self.assertEqual(ret, raised_issues)
def test_issue_not_machine_readable(self):
    """Without MACHINE_READABLE, issues are summarised as plain strings
    grouped by issue type."""
    mgr = IssuesManager()
    mgr.add(MemoryWarning("test"))
    expected = {
        IssuesManager.SUMMARY_OUT_ISSUES_ROOT: {
            'MemoryWarnings': ['test (origin=testplugin.01part)']}}
    self.assertEqual(mgr.load_issues(), expected)
def test_add_issue_first(self):
    """Adding a bug when none exist yet must create a new bug list."""
    mgr = IssuesManager()
    mgr.add(LaunchpadBug(1, None))
    expected = {
        IssuesManager.SUMMARY_OUT_BUGS_ROOT: [
            {'id': 'https://bugs.launchpad.net/bugs/1',
             'desc': None,
             'origin': 'testplugin.01part'}]}
    self.assertEqual(mgr.load_bugs(), expected)
def test_issue_machine_readable(self):
    """With MACHINE_READABLE set, issues are summarised as dicts."""
    setup_config(MACHINE_READABLE=True)
    mgr = IssuesManager()
    mgr.add(MemoryWarning("test"))
    expected = {
        IssuesManager.SUMMARY_OUT_ISSUES_ROOT: [
            {'type': 'MemoryWarning',
             'desc': 'test',
             'origin': 'testplugin.01part'}]}
    self.assertEqual(mgr.load_issues(), expected)
def test_add_issue_w_empty_context(self):
    """An empty IssueContext must not add a 'context' key to the output."""
    setup_config(MACHINE_READABLE=True)
    mgr = IssuesManager()
    mgr.add(MemoryWarning("test"), IssueContext())
    expected = {
        IssuesManager.SUMMARY_OUT_ISSUES_ROOT: [
            {'type': 'MemoryWarning',
             'desc': 'test',
             'origin': 'testplugin.01part'}]}
    self.assertEqual(mgr.load_issues(), expected)
def test_cacheset(self):
    """A cacheset config error must raise both a bug and an issue."""
    with tempfile.TemporaryDirectory() as dtmp:
        self.setup_bcachefs(dtmp, cacheset_error=True)
        setup_config(DATA_ROOT=dtmp)
        YScenarioChecker()()
        bug_msg = (
            'bcache cache_available_percent is 33 (i.e. approx. 30%) '
            'which implies this node could be suffering from bug LP '
            '1900438 - please check.')
        issue_msg = ('bcache cacheset config congested_write_threshold_us '
                     'expected to be eq 0 but actual=100.')
        raised = list(IssuesManager().load_issues().values())[0]
        self.assertEqual([entry['desc'] for entry in raised], [issue_msg])
        raised_bugs = list(IssuesManager().load_bugs().values())[0]
        self.assertEqual([entry['desc'] for entry in raised_bugs],
                         [bug_msg])
def test_osd_messenger_v2_protocol(self):
    """OSDs not binding to a v2 messenger address must be reported."""
    YScenarioChecker()()
    expected = ("This Ceph cluster has 1 OSD(s) that do not bind to a v2 "
                "messenger address. This will cause unexpected behaviour "
                "and should be resolved asap.")
    raised = list(IssuesManager().load_issues().values())[0]
    self.assertEqual([entry['desc'] for entry in raised], [expected])
def __summary_memory_checks(self):
    """Check node memory for problems and report major slab consumers.

    Returns a dict of findings, or the string "no issues found" when
    nothing of interest was detected.
    """
    node_results = self.check_nodes_memory("Normal")
    if not node_results:
        # Only check other zone types if no issue was detected on the
        # Normal node.
        node_results = self.check_nodes_memory("DMA32")

    if not node_results:
        return "no issues found"

    _mem_info = node_results
    # We only report on compaction errors if there is a shortage of
    # high-order zones (i.e. the node checks above found something).
    fail_count = self.get_vmstat_value("compact_fail")
    success_count = self.get_vmstat_value("compact_success")
    # 10k is an arbitrary threshold to suggest that a lot of compaction
    # has occurred, noting that this is a rolling counter and is not
    # necessarily representative of current state.
    if success_count > 10000:
        pcent = int(fail_count / (success_count / 100))
        if pcent > 10:
            msg = ("compaction failures are at {}% of successes "
                   "(see {}).".format(pcent, self.vmstat_path))
            IssuesManager().add(MemoryWarning(msg))

    top5 = self.get_slab_major_consumers()
    if top5:
        _mem_info["slab-top-consumers"] = top5

    return _mem_info
def test_lp1936136(self, mocl_cli, mock_cephbase, mock_kernelbase,
                   mock_cset_config, mock_ceph_config):
    """Scenario: ceph-osd on bcache with cache_available_percent < 70,
    bluefs_buffered_io enabled and kernel < 5.4 must raise bug LP 1936136.
    """
    # NOTE(review): 'mocl_cli' looks like a typo for 'mock_cli'. Renaming
    # is presumably safe since mock.patch injects these positionally -
    # confirm against the decorators (outside this view) before changing.
    def fake_ceph_config(key):
        # Only this key matters; all others return None (MagicMock default
        # is bypassed via side_effect).
        if key == 'bluefs_buffered_io':
            return 'true'
    mocl_cli.return_value = mock.MagicMock()
    mocl_cli.return_value.dpkg_l.return_value = \
        ["ii ceph-osd 14.2.22-0ubuntu0.20.04.2 amd64"]
    # cache_available_percent=69 i.e. just below the 70 threshold.
    mock_cset_config.return_value = mock.MagicMock()
    mock_cset_config.return_value.get.return_value = 69
    mock_ceph_config.return_value = mock.MagicMock()
    mock_ceph_config.return_value.get.side_effect = fake_ceph_config
    mock_cephbase.return_value = mock.MagicMock()
    mock_cephbase.return_value.local_osds_use_bcache = True
    # Kernel < 5.4 i.e. not carrying the fix.
    mock_kernelbase.return_value = mock.MagicMock()
    mock_kernelbase.return_value.version = '5.3'
    YScenarioChecker()()
    msg = ('This host has Ceph OSDs using bcache block devices and may be '
           'vulnerable to bcache bug LP 1936136 since '
           'bcache cache_available_percent is lt 70 (actual=69). The '
           'current workaround is to set bluefs_buffered_io=false in Ceph '
           'or upgrade to a kernel >= 5.4.')
    issues = list(IssuesManager().load_bugs().values())[0]
    self.assertEqual([issue['desc'] for issue in issues], [msg])
def test_bug_check_lp1959649(self, mock_cephdaemon, mock_helper):
    """Affected ceph-osd version with rocksdb_original policy must raise
    bug LP 1959649 including pass context."""
    mock_helper.return_value = mock.MagicMock()
    mock_helper.return_value.dpkg_l.return_value = \
        ["ii ceph-osd 15.2.7-0ubuntu0.20.04.2 amd64"]
    mock_cephdaemon.return_value = mock.MagicMock()
    mock_cephdaemon.return_value.bluestore_volume_selection_policy = \
        ['rocksdb_original']
    YScenarioChecker()()
    desc = ('This host is vulnerable to known bug '
            'https://tracker.ceph.com/issues/38745. RocksDB needs more '
            'space than the leveled space available so it is using '
            'storage from the data disk. Please set '
            'bluestore_volume_selection_policy of all OSDs to '
            'use_some_extra')
    self.assertEqual(IssuesManager().load_bugs(), {
        'bugs-detected': [
            {'context': {'passes': True},
             'desc': desc,
             'id': 'https://bugs.launchpad.net/bugs/1959649',
             'origin': 'storage.01part'}]})
def test_1943937(self):
    """A queue.declare channel exception in the rabbitmq log must raise
    bug LP 1943937."""
    with tempfile.TemporaryDirectory() as dtmp:
        setup_config(DATA_ROOT=dtmp)
        logfile = os.path.join(dtmp,
                               'var/log/rabbitmq/[email protected]')
        os.makedirs(os.path.dirname(logfile))
        content = ("operation queue.declare caused a channel exception "
                   "not_found: failed to perform operation on queue "
                   "'test_exchange_queue' in vhost "
                   "'nagios-rabbitmq-server-0' due to timeout")
        with open(logfile, 'w') as fd:
            fd.write(content)

        YScenarioChecker()()
        desc = ('Known RabbitMQ issue where queues get stuck and clients '
                'trying to use them will just keep timing out. This stops '
                'many services in the cloud from working correctly. '
                'Resolution requires you to stop all RabbitMQ servers '
                'before starting them all again at the same time. A '
                'rolling restart or restarting them simultaneously will '
                'not work. See bug for more detail.')
        self.assertEqual(IssuesManager().load_bugs(), {
            'bugs-detected': [
                {'id': 'https://bugs.launchpad.net/bugs/1943937',
                 'desc': desc,
                 'origin': 'rabbitmq.01part'}]})
def test_yaml_def_scenario_checks_requires(self):
    """Verify that scenario 'requires' checks evaluate as expected and
    that running the scenarios raises no issues."""
    with tempfile.TemporaryDirectory() as dtmp:
        setup_config(PLUGIN_YAML_DEFS=dtmp, PLUGIN_NAME='myplugin')
        # Fix: use a context manager so the file is flushed and closed
        # deterministically before the checker reads it. The original
        # open(...).write(...) never closed the handle and relied on
        # refcount gc for the flush.
        with open(os.path.join(dtmp, 'scenarios.yaml'), 'w') as fd:
            fd.write(SCENARIO_CHECKS)

        checker = scenarios.YScenarioChecker()
        checker.load()
        self.assertEqual(len(checker.scenarios), 1)
        checked = 0
        for scenario in checker.scenarios:
            for check in scenario.checks.values():
                if check.name == 'apt_pkg_exists':
                    checked += 1
                    self.assertTrue(check.result)
                elif check.name == 'snap_pkg_exists':
                    checked += 1
                    self.assertTrue(check.result)
                elif check.name == 'service_exists_and_enabled':
                    checked += 1
                    self.assertTrue(check.result)
                elif check.name == 'service_exists_not_enabled':
                    checked += 1
                    self.assertFalse(check.result)

        # all four named checks must have been seen
        self.assertEqual(checked, 4)
        # now run the scenarios
        checker()
        self.assertEqual(IssuesManager().load_issues(), {})
def _get_crc_errors(self, results, osd_type):
    """Raise an issue for osds with an excessive number of crc errors.

    If on any particular day there were > 3 crc errors for a particular
    osd we raise an issue since that indicates they are likely to
    reflect a real problem.

    @param results: search results to extract per-osd timings from.
    @param osd_type: name of the crc error type being checked (used in
                     the issue message only).
    @return: timings dict keyed by day, or None if results is empty.
    """
    if not results:
        return None

    ret = self.get_timings(results, resource_osd_from_source=True)
    osds_in_err = set()
    osd_err_max = 0
    # ret is keyed by day
    for osds in ret.values():
        # If we were unable to glean the osd id from the search results
        # this will not be a dict so skip.
        if not isinstance(osds, dict):
            continue

        for osd, num_errs in osds.items():
            if num_errs > 3:
                osd_err_max = max(osd_err_max, num_errs)
                osds_in_err.add(osd)

    if osds_in_err:
        msg = ("{} osds ({}) found with > 3 {} crc errors (max={}) "
               "each within a 24hr period - please investigate".
               format(len(osds_in_err), ','.join(osds_in_err),
                      osd_type, osd_err_max))
        IssuesManager().add(CephOSDError(msg))

    return ret
def test_filestore_to_bluestore_upgrade(self, mock_ceph_config):
    """A journal device left configured alongside bluestore must be
    flagged."""
    mock_ceph_config.return_value = mock.MagicMock()
    mock_ceph_config.return_value.get = lambda args: '/journal/path'
    YScenarioChecker()()
    expected = ("Ceph Bluestore is enabled yet there is a still a journal "
                "device configured in ceph.conf - please check")
    raised = list(IssuesManager().load_issues().values())[0]
    self.assertEqual([entry['desc'] for entry in raised], [expected])
def test_ssd_osds_no_discard(self):
    """Scenario flagging ssd osds without bluestore discard enabled.

    Currently skipped - the scenario is disabled until fixed.
    """
    self.skipTest("scenario currently disabled until fixed")

    # Unreachable until the skip above is removed.
    YScenarioChecker()()
    expected = [("This host has osds with device_class 'ssd' but "
                 "Bluestore discard is not enabled. The recommendation "
                 "is to set 'bdev enable discard true'.")]
    raised = list(IssuesManager().load_issues().values())[0]
    self.assertEqual([entry['desc'] for entry in raised], expected)
def test_bdev(self):
    """A bdev config error must raise a writeback_percent issue."""
    with tempfile.TemporaryDirectory() as dtmp:
        self.setup_bcachefs(dtmp, bdev_error=True)
        setup_config(DATA_ROOT=dtmp)
        YScenarioChecker()()
        expected = ('bcache config writeback_percent expected to be ge '
                    '10 but actual=1.')
        raised = list(IssuesManager().load_issues().values())[0]
        self.assertEqual([entry['desc'] for entry in raised], [expected])
def test_bluefs_size(self):
    """Oversized OSD metadata must raise the compaction bug warning."""
    YScenarioChecker()()
    expected = ('Found 3 Ceph OSDs with metadata size larger than 10G. '
                'This could be the result of a compaction failure/bug and '
                'this host may be affected by '
                'https://tracker.ceph.com/issues/45903. A workaround '
                "(>= Nautilus) is to manually compact using "
                "'ceph-bluestore-tool'.")
    raised = list(IssuesManager().load_bugs().values())[0]
    self.assertEqual([entry['desc'] for entry in raised], [expected])
def test_unresponsive_mgr_p1(self):
    """Incomplete sosreport ceph plugin data must raise a mon/mgr issue."""
    YScenarioChecker()()
    expected = ("One or more sosreport ceph plugins contain incomplete "
                "data. This usually indicates a problem with ceph "
                "mon/mgr. Please check ceph-mon.log and retry commands "
                "to see if they are still unresponsive. Restarting "
                "ceph-mon and ceph-mgr might resolve this.")
    raised = list(IssuesManager().load_issues().values())[0]
    self.assertEqual([entry['desc'] for entry in raised], [expected])
def test_juju_ceph_no_bcache_tuning(self):
    """Juju-managed OSDs on bcache without the bcache-tuning charm must
    be flagged."""
    YScenarioChecker()()
    expected = ("This host is running Juju-managed Ceph OSDs that are "
                "using bcache devices yet the bcache-tuning charm was "
                "not detected. It is recommended to use the "
                "bcache-tuning charm to ensure optimal bcache "
                "configuration.")
    raised = list(IssuesManager().load_issues().values())[0]
    self.assertEqual([entry['desc'] for entry in raised], [expected])
def test_laggy_pgs(self, mock_helper):
    """Laggy/wait PGs reported in pg dump must raise an issue."""
    mock_helper.return_value = mock.MagicMock()
    mock_helper.return_value.ceph_pg_dump_json_decoded.return_value = \
        PG_DUMP_JSON_DECODED
    YScenarioChecker()()
    expected = ('Ceph cluster is reporting 1 laggy/wait PGs. This '
                'suggests a potential network or storage issue - please '
                'check.')
    raised = list(IssuesManager().load_issues().values())[0]
    self.assertEqual([entry['desc'] for entry in raised], [expected])
def test_unresponsive_mgr_p2(self):
    """Ceph commands returning incomplete data must raise a mon/mgr
    issue."""
    YScenarioChecker()()
    expected = ("Some ceph commands are returning incomplete data. This "
                "usually indicates a problem with ceph mon/mgr. Please "
                "check ceph-mon.log and retry commands to see if they "
                "are still unresponsive. Restarting ceph-mon and "
                "ceph-mgr might resolve this.")
    raised = list(IssuesManager().load_issues().values())[0]
    self.assertEqual([entry['desc'] for entry in raised], [expected])
def test_required_osd_release(self, mock_helper):
    """require_osd_release set while OSDs lag behind must be flagged."""
    mock_helper.return_value = mock.MagicMock()
    mock_helper.return_value.ceph_versions.return_value = \
        CEPH_VERSIONS_MISMATCHED_MAJOR.split('\n')
    YScenarioChecker()()
    expected = ("Ceph cluster config 'require_osd_release' is set to "
                "'octopus' but not all OSDs are on that version - please "
                "check.")
    raised = list(IssuesManager().load_issues().values())[0]
    self.assertEqual([entry['desc'] for entry in raised], [expected])
def test_add_issue_empty_context(self):
    """A populated IssueContext must appear under 'context' in the
    machine-readable output.

    NOTE(review): despite its name this test uses a *populated* context
    while test_add_issue_w_empty_context uses an empty one - the two
    names look swapped; confirm before renaming.
    """
    setup_config(MACHINE_READABLE=True)
    ctxt = IssueContext()
    ctxt.set(linenumber=123, path='/foo/bar')
    mgr = IssuesManager()
    mgr.add(MemoryWarning("test"), ctxt)
    expected = {
        IssuesManager.SUMMARY_OUT_ISSUES_ROOT: [
            {'type': 'MemoryWarning',
             'desc': 'test',
             'context': {'path': '/foo/bar', 'linenumber': 123},
             'origin': 'testplugin.01part'}]}
    self.assertEqual(mgr.load_issues(), expected)
def test_ceph_versions_mismatch_p1(self, mock_helper):
    """Daemon versions not aligned across the cluster must be flagged."""
    mock_helper.return_value = mock.MagicMock()
    mock_helper.return_value.ceph_versions.return_value = \
        CEPH_VERSIONS_MISMATCHED_MINOR.split('\n')
    YScenarioChecker()()
    expected = ('Ceph daemon versions are not aligned across the '
                'cluster. This could be the result of an incomplete or '
                'failed cluster upgrade. All daemons, except the '
                'clients, should ideally be on the same version for '
                'ceph to function correctly.')
    raised = list(IssuesManager().load_issues().values())[0]
    self.assertEqual([entry['desc'] for entry in raised], [expected])
def test_crushmap_bucket_checks_mixed_buckets(self, mock_helper):
    """Mixed crush bucket types must raise a skewed-distribution issue."""
    mock_helper.return_value = mock.MagicMock()
    mock_helper.return_value.ceph_osd_crush_dump_json_decoded.\
        return_value = json.loads(CEPH_OSD_CRUSH_DUMP)
    YScenarioChecker()()
    expected = ("Mixed crush bucket types identified in buckets "
                "'default'. This can cause data distribution to become "
                "skewed - please check crush map.")
    raised = list(IssuesManager().load_issues().values())[0]
    self.assertEqual([entry['desc'] for entry in raised], [expected])
def test_ceph_versions_mismatch_p2(self, mock_helper):
    """Mons running a lower version than other daemons must be flagged."""
    mock_helper.return_value = mock.MagicMock()
    mock_helper.return_value.ceph_versions.return_value = \
        CEPH_VERSIONS_MISMATCHED_MINOR_MONS_UNALIGNED.split('\n')
    YScenarioChecker()()
    expected = ('One or more Ceph mons has a version lower than other '
                'daemons e.g. ceph-osd running in the cluster. This can '
                'cause unexpected behaviour and should be resolved as '
                'soon as possible. Check full summary output for '
                'current versions.')
    raised = list(IssuesManager().load_issues().values())[0]
    self.assertEqual([entry['desc'] for entry in raised], [expected])
def test_ceph_pg_imbalance(self, mock_helper):
    """Imbalanced osd pg counts must raise both pg-limit and pg-range
    issues."""
    self.setup_fake_cli_osds_imbalanced_pgs(mock_helper)
    YScenarioChecker()()
    expected = [
        ('Found some Ceph osd(s) with > 500 pgs - this is close to the '
         'hard limit at which point they will stop creating pgs and '
         'fail - please investigate.'),
        ('Found some Ceph osd(s) whose pg count is > 30% outside the '
         'optimal range of 50-200 pgs. This could indicate poor data '
         'distribution across the cluster and result in '
         'performance degradation.')]
    raised = list(IssuesManager().load_issues().values())[0]
    self.assertEqual([entry['desc'] for entry in raised], expected)
def test_scenario_osd_maps_backlog_too_large(self, mock_helper):
    """A large pinned osdmap backlog must raise an issue."""
    # 5496 pinned maps i.e. well above the expected backlog.
    pinned = {'osdmap_manifest': {'pinned_maps': range(5496)}}
    mock_helper.return_value = mock.MagicMock()
    mock_helper.return_value.ceph_report_json_decoded.return_value = \
        pinned
    YScenarioChecker()()
    expected = ("This Ceph cluster has 5496 pinned osdmaps. This can "
                "affect ceph-mon performance and may also indicate bugs "
                "such as https://tracker.ceph.com/issues/44184 and "
                "https://tracker.ceph.com/issues/47290.")
    raised = list(IssuesManager().load_issues().values())[0]
    self.assertEqual([entry['desc'] for entry in raised], [expected])