def test_yaml_def_scenario_checks_requires(self):
    """Verify that each 'requires' check in the scenario defs evaluates to
    the expected result and that no issues are raised when all pass."""
    with tempfile.TemporaryDirectory() as dtmp:
        setup_config(PLUGIN_YAML_DEFS=dtmp, PLUGIN_NAME='myplugin')
        # Use a context manager so the defs file is flushed and closed
        # before the checker reads it (the original open().write()
        # leaked the handle and relied on GC to flush the data).
        with open(os.path.join(dtmp, 'scenarios.yaml'), 'w') as fd:
            fd.write(SCENARIO_CHECKS)

        checker = scenarios.YScenarioChecker()
        checker.load()
        self.assertEqual(len(checker.scenarios), 1)
        checked = 0
        for scenario in checker.scenarios:
            for check in scenario.checks.values():
                if check.name == 'apt_pkg_exists':
                    checked += 1
                    self.assertTrue(check.result)
                elif check.name == 'snap_pkg_exists':
                    checked += 1
                    self.assertTrue(check.result)
                elif check.name == 'service_exists_and_enabled':
                    checked += 1
                    self.assertTrue(check.result)
                elif check.name == 'service_exists_not_enabled':
                    checked += 1
                    self.assertFalse(check.result)

        # all four checks must have been seen
        self.assertEqual(checked, 4)

        # now run the scenarios
        checker()

        # no scenario conclusion should have raised an issue
        self.assertEqual(IssuesManager().load_issues(), {})
def test_flow_lookup_checks_p2(self, mock_cli):
    """Non-zero dpif 'lost' packets combined with vswitchd drop logs
    should raise an issue with a detailed description."""
    mock_cli.return_value = mock.MagicMock()
    mock_cli.return_value.ovs_appctl_dpctl_show.return_value = \
        ['lookups: hit:39017272903 missed:137481120 lost:54691089']
    with tempfile.TemporaryDirectory() as tmpdir:
        setup_config(DATA_ROOT=tmpdir)
        logfile = os.path.join(tmpdir,
                               'var/log/openvswitch/ovs-vswitchd.log')
        os.makedirs(os.path.dirname(logfile))
        with open(logfile, 'w') as f:
            f.write(DPIF_LOST_PACKETS_LOGS)

        YScenarioChecker()()
        expected_msg = (
            'OVS datapath is reporting a non-zero amount of "lost" '
            'packets (total=54691089) which implies that packets '
            'destined for userspace (e.g. vm tap) are being dropped. '
            'ovs-vswitchd is also reporting large numbers of dropped '
            'packets within a 24h period (look for '
            '"system@ovs-system: lost packet on port channel"). '
            'This could be caused by '
            'overloaded system cores blocking ovs threads from '
            'delivering packets in time. Please check ovs-appctl '
            'dpctl/show to see if the number of lost packets is still '
            'increasing.')
        raised = list(IssuesStore().load().values())[0]
        self.assertEqual([issue['desc'] for issue in raised],
                         [expected_msg])
def test_yaml_def_scenario_checks_expr(self, mock_cli):
    """Verify the 'logmatch' expression check matches generated search
    results and that running the scenario raises the expected issue."""
    mock_cli.return_value = mock.MagicMock()
    # pin "now" so the search-result age filtering is deterministic
    mock_cli.return_value.date.return_value = "2021-04-03 00:00:00"
    with tempfile.TemporaryDirectory() as dtmp:
        setup_config(PLUGIN_YAML_DEFS=dtmp, DATA_ROOT=dtmp,
                     PLUGIN_NAME='myplugin')
        logfile = os.path.join(dtmp, 'foo.log')
        # Use a context manager so the defs file is flushed and closed
        # before the checker loads it (the original open().write()
        # leaked the handle and relied on GC to flush the data).
        with open(os.path.join(dtmp, 'scenarios.yaml'), 'w') as fd:
            fd.write(SCENARIO_CHECKS)

        contents = ['2021-04-01 00:31:00.000 an event\n',
                    '2021-04-01 00:32:00.000 an event\n',
                    '2021-04-01 00:33:00.000 an event\n',
                    '2021-04-02 00:36:00.000 an event\n',
                    '2021-04-02 00:00:00.000 an event\n',
                    ]
        self._create_search_results(logfile, contents)
        checker = scenarios.YScenarioChecker()
        checker.load()
        self.assertEqual(len(checker.scenarios), 1)
        for scenario in checker.scenarios:
            for check in scenario.checks.values():
                if check.name == 'logmatch':
                    self.assertTrue(check.result)

        # now run the scenarios
        checker.run()

        msg = ("log matched 5 times")
        issues = list(IssuesStore().load().values())[0]
        self.assertEqual([issue['desc'] for issue in issues], [msg])
def test_1943937(self):
    """Detect LP#1943937 (stuck rabbitmq queues) from the broker log."""
    with tempfile.TemporaryDirectory() as tmpdir:
        setup_config(DATA_ROOT=tmpdir)
        logfile = os.path.join(tmpdir,
                               'var/log/rabbitmq/[email protected]')
        os.makedirs(os.path.dirname(logfile))
        with open(logfile, 'w') as f:
            f.write("operation queue.declare caused a channel exception "
                    "not_found: failed to perform operation on queue "
                    "'test_exchange_queue' in vhost "
                    "'nagios-rabbitmq-server-0' due to timeout")

        YScenarioChecker()()
        desc = ('Known RabbitMQ issue where queues get stuck and clients '
                'trying to use them will just keep timing out. This stops '
                'many services in the cloud from working correctly. '
                'Resolution requires you to stop all RabbitMQ servers '
                'before starting them all again at the same time. A '
                'rolling restart or restarting them simultaneously will '
                'not work. See bug for more detail.')
        expected = {'bugs-detected':
                    [{'id': 'https://bugs.launchpad.net/bugs/1943937',
                      'desc': desc,
                      'origin': 'rabbitmq.01part'}]}
        self.assertEqual(IssuesManager().load_bugs(), expected)
def _run(self, plugin):
    """Run every registered part of *plugin* and return the output."""
    log.debug("running plugin %s", plugin)
    if plugin not in PLUGIN_CATALOG:
        raise Exception("unknown plugin {}".format(plugin))

    setup_config(PLUGIN_NAME=plugin)
    plugin_parts = PLUGIN_CATALOG[plugin]
    return plugintools.PluginRunner().run_parts(plugin_parts)
def test_yaml_def_scenario_result_filters_by_period(self):
    """Exercise YPropertyCheck.filter_by_period over varying event spreads.

    filter_by_period(results, period, min_results) is expected to return
    the matching results when at least min_results of them fall within a
    single window of <period> hours and an empty list otherwise. The
    period is 24h throughout these cases.
    """
    with tempfile.TemporaryDirectory() as dtmp:
        setup_config(PLUGIN_YAML_DEFS=dtmp, DATA_ROOT=dtmp,
                     PLUGIN_NAME='myplugin')
        logfile = os.path.join(dtmp, 'foo.log')

        def assert_filtered(contents, min_results, expected_len):
            # De-duplicates the create-results/filter/assert cycle that
            # was repeated verbatim for every case below.
            results = self._create_search_results(logfile, contents)
            result = YPropertyCheck.filter_by_period(results, 24,
                                                     min_results)
            self.assertEqual(len(result), expected_len)

        # a single event satisfies min_results=1 ...
        single = ['2021-04-01 00:01:00.000 an event\n']
        assert_filtered(single, 1, 1)
        # ... but not min_results=2
        assert_filtered(single, 2, 0)

        # two events within 24h of each other satisfy min_results=2
        assert_filtered([
            '2021-04-01 00:01:00.000 an event\n',
            '2021-04-01 00:02:00.000 an event\n',
            '2021-04-03 00:01:00.000 an event\n',
        ], 2, 2)

        # four of these five fall within one 24h window
        assert_filtered([
            '2021-04-01 00:00:00.000 an event\n',
            '2021-04-01 00:01:00.000 an event\n',
            '2021-04-01 00:02:00.000 an event\n',
            '2021-04-02 00:00:00.000 an event\n',
            '2021-04-02 00:01:00.000 an event\n',
        ], 4, 4)

        # no 24h window contains four events here
        assert_filtered([
            '2021-04-01 00:00:00.000 an event\n',
            '2021-04-01 00:01:00.000 an event\n',
            '2021-04-02 00:01:00.000 an event\n',
            '2021-04-02 00:02:00.000 an event\n',
        ], 4, 0)

        # a window containing three events exists within this spread
        assert_filtered([
            '2021-04-01 00:00:00.000 an event\n',
            '2021-04-01 00:01:00.000 an event\n',
            '2021-04-02 02:00:00.000 an event\n',
            '2021-04-03 01:00:00.000 an event\n',
            '2021-04-04 02:00:00.000 an event\n',
            '2021-04-05 02:00:00.000 an event\n',
            '2021-04-06 01:00:00.000 an event\n',
        ], 3, 3)
def test_bdev(self):
    """A bad bcache bdev config value should raise an issue."""
    with tempfile.TemporaryDirectory() as tmpdir:
        self.setup_bcachefs(tmpdir, bdev_error=True)
        setup_config(DATA_ROOT=tmpdir)
        YScenarioChecker()()

    expected_msg = ('bcache config writeback_percent expected to be ge '
                    '10 but actual=1.')
    raised = list(IssuesManager().load_issues().values())[0]
    self.assertEqual([issue['desc'] for issue in raised], [expected_msg])
def test_get_date_w_invalid_tz(self):
    """An unparseable timezone in the sosreport date output must yield
    an empty string rather than an error."""
    with tempfile.TemporaryDirectory() as tmpdir:
        setup_config(DATA_ROOT=tmpdir)
        helper = cli.CLIHelper()
        date_dir = os.path.join(tmpdir, "sos_commands/date")
        os.makedirs(date_dir)
        with open(os.path.join(date_dir, "date"), 'w') as f:
            # '123UTC' is not a valid timezone
            f.write("Thu Mar 25 10:55:05 123UTC 2021")

        self.assertEqual(helper.date(), "")
def test_bluestore_not_enabled(self):
    """bluestore_enabled is False when ceph.conf has no bluestore config."""
    with tempfile.TemporaryDirectory() as tmpdir:
        conf_dir = os.path.join(tmpdir, 'etc/ceph')
        os.makedirs(conf_dir)
        with open(os.path.join(conf_dir, 'ceph.conf'), 'w') as f:
            f.write(CEPH_CONF_NO_BLUESTORE)

        setup_config(DATA_ROOT=tmpdir)
        self.assertFalse(ceph_core.CephChecksBase().bluestore_enabled)
def test_fs_override_inheritance(self):
    """
    When a directory is used to group definitions and overrides are
    provided in a <dirname>.yaml file, we need to make sure those
    overrides do not supersede overrides of the same type used by
    definitions in the same directory.
    """
    with tempfile.TemporaryDirectory() as dtmp:
        setup_config(PLUGIN_YAML_DEFS=dtmp, DATA_ROOT=dtmp,
                     PLUGIN_NAME='myplugin')
        overrides = os.path.join(dtmp, 'mytype', 'myplugin',
                                 'mytype.yaml')
        defs = os.path.join(dtmp, 'mytype', 'myplugin', 'defs.yaml')
        os.makedirs(os.path.dirname(overrides))
        # dir-level override file
        with open(overrides, 'w') as fd:
            fd.write("requires:\n")
            fd.write(" property: foo\n")

        # definition with no overrides of its own
        with open(defs, 'w') as fd:
            fd.write("foo: bar\n")

        # the dir-level 'requires' must not leak into the definition
        expected = {'mytype': {
                        'requires': {
                            'property': 'foo'}},
                    'defs': {'foo': 'bar'}}
        self.assertEqual(YDefsLoader('mytype').load_plugin_defs(),
                         expected)

        # now give the definition its own 'requires' which must win
        # over the dir-level one
        with open(defs, 'a') as fd:
            fd.write("requires:\n")
            fd.write(" apt: apackage\n")

        expected = {'mytype': {
                        'requires': {
                            'property': 'foo'}},
                    'defs': {
                        'foo': 'bar',
                        'requires': {
                            'apt': 'apackage'}}}
        self.assertEqual(YDefsLoader('mytype').load_plugin_defs(),
                         expected)
def test_yaml_def_nested_logic(self):
    """Nested logical groupings in scenario defs should raise only the
    conclusions whose decision trees evaluate True."""
    with tempfile.TemporaryDirectory() as dtmp:
        setup_config(PLUGIN_YAML_DEFS=dtmp, DATA_ROOT=dtmp)
        plugin_dir = os.path.join(dtmp, 'scenarios',
                                  HotSOSConfig.PLUGIN_NAME)
        os.makedirs(plugin_dir)
        # Use a context manager so the defs file is flushed and closed
        # before the checker loads it (the original open().write()
        # leaked the handle and relied on GC to flush the data).
        with open(os.path.join(plugin_dir, 'scenarios.yaml'), 'w') as fd:
            fd.write(YDEF_NESTED_LOGIC)

        scenarios.YScenarioChecker()()
        issues = list(IssuesStore().load().values())[0]
        self.assertEqual(sorted([issue['desc'] for issue in issues]),
                         sorted(['conc1', 'conc3']))
def test_oom_killer_invoked(self):
    """An oom-killer entry in kern.log should raise an issue."""
    with tempfile.TemporaryDirectory() as tmpdir:
        setup_config(DATA_ROOT=tmpdir)
        os.makedirs(os.path.join(tmpdir, 'var/log'))
        kern_log = os.path.join(tmpdir, 'var/log/kern.log')
        with open(kern_log, 'w') as f:
            f.write(KERNLOG_OOM)

        YScenarioChecker()()
        expected_msg = ('1 reports of oom-killer invoked in kern.log - '
                        'please check.')
        raised = list(IssuesStore().load().values())[0]
        self.assertEqual([issue['desc'] for issue in raised],
                         [expected_msg])
def test_issue_machine_readable(self):
    """With MACHINE_READABLE set, issues carry type/desc/origin fields."""
    setup_config(MACHINE_READABLE=True)
    mgr = IssuesManager()
    mgr.add(MemoryWarning("test"))
    expected = {IssuesManager.SUMMARY_OUT_ISSUES_ROOT:
                [{'type': 'MemoryWarning',
                  'desc': 'test',
                  'origin': 'testplugin.01part'}]}
    self.assertEqual(mgr.load_issues(), expected)
def test_nf_conntrack_full(self):
    """'nf_conntrack: table full' in kern.log should raise an issue."""
    with tempfile.TemporaryDirectory() as tmpdir:
        setup_config(DATA_ROOT=tmpdir)
        os.makedirs(os.path.join(tmpdir, 'var/log'))
        kern_log = os.path.join(tmpdir, 'var/log/kern.log')
        with open(kern_log, 'w') as f:
            f.write(KERNLOG_NF_CONNTRACK_FULL)

        YScenarioChecker()()
        expected_msg = ("1 reports of 'nf_conntrack: table full' detected "
                        "in kern.log - please check.")
        raised = list(IssuesStore().load().values())[0]
        self.assertEqual([issue['desc'] for issue in raised],
                         [expected_msg])
def setUp(self): self.maxDiff = None # ensure locale consistency wherever tests are run os.environ["LANG"] = 'C.UTF-8' self.plugin_tmp_dir = tempfile.mkdtemp() # Always reset env globals # If a test relies on loading info from defs yaml this needs to be set # to actual plugin name. setup_config(DATA_ROOT=os.path.join(TESTS_DIR, DEFAULT_FAKE_ROOT), PLUGIN_NAME="testplugin", PLUGIN_YAML_DEFS=os.path.join(TESTS_DIR, "defs"), PART_NAME="01part", PLUGIN_TMP_DIR=self.plugin_tmp_dir, USE_ALL_LOGS=True) setup_logging(debug_mode=True)
def test_add_issue_w_empty_context(self):
    """An empty IssueContext adds no 'context' key to the issue output."""
    setup_config(MACHINE_READABLE=True)
    ctxt = IssueContext()
    mgr = IssuesManager()
    mgr.add(MemoryWarning("test"), ctxt)
    expected = {IssuesManager.SUMMARY_OUT_ISSUES_ROOT:
                [{'type': 'MemoryWarning',
                  'desc': 'test',
                  'origin': 'testplugin.01part'}]}
    self.assertEqual(mgr.load_issues(), expected)
def test_cacheset(self):
    """A bad cacheset config raises both an issue and a known bug."""
    with tempfile.TemporaryDirectory() as tmpdir:
        self.setup_bcachefs(tmpdir, cacheset_error=True)
        setup_config(DATA_ROOT=tmpdir)
        YScenarioChecker()()

    bug_msg = (
        'bcache cache_available_percent is 33 (i.e. approx. 30%) '
        'which implies this node could be suffering from bug LP '
        '1900438 - please check.')
    issue_msg = ('bcache cacheset config congested_write_threshold_us '
                 'expected to be eq 0 but actual=100.')
    raised_issues = list(IssuesManager().load_issues().values())[0]
    self.assertEqual([issue['desc'] for issue in raised_issues],
                     [issue_msg])
    raised_bugs = list(IssuesManager().load_bugs().values())[0]
    self.assertEqual([issue['desc'] for issue in raised_bugs], [bug_msg])
def test_1917475(self):
    """Detect LP#1917475 (ovn db rbac errors) from ovn-controller log."""
    with tempfile.TemporaryDirectory() as tmpdir:
        setup_config(DATA_ROOT=tmpdir)
        logfile = os.path.join(tmpdir, 'var/log/ovn/ovn-controller.log')
        os.makedirs(os.path.dirname(logfile))
        with open(logfile, 'w') as f:
            f.write(LP1917475_LOG)

        YScenarioChecker()()
        expected = {'bugs-detected':
                    [{'id': 'https://bugs.launchpad.net/bugs/1917475',
                      'desc': "known ovn bug identified - db rbac errors",
                      'origin': 'openvswitch.01part'}]}
        self.assertEqual(issues.IssuesManager().load_bugs(), expected)
def test_run_log_event_checks(self):
    """Over-mtu dropped packet events in kern.log are aggregated per
    interface and raise a single issue."""
    with tempfile.TemporaryDirectory() as tmpdir:
        setup_config(DATA_ROOT=tmpdir)
        logfile = os.path.join(tmpdir, ('var/log/kern.log'))
        os.makedirs(os.path.dirname(logfile))
        with open(logfile, 'w') as f:
            f.write(EVENTS_KERN_LOG)

        expected = {'over-mtu-dropped-packets':
                    {'tap0e778df8-ca': 5}}
        inst = log_event_checks.KernelLogEventChecks()
        # checks get run when we fetch the output so do that now
        actual = self.part_output_to_actual(inst.output)
        expected_msg = ('kernel has reported over-mtu dropped packets '
                        'for (1) interfaces.')
        raised = list(IssuesStore().load().values())[0]
        self.assertEqual([issue['desc'] for issue in raised],
                         [expected_msg])
        self.assertTrue(inst.plugin_runnable)
        self.assertEqual(actual, expected)
def test_scenario_osd_slow_heartbeats(self, mock_cephbase):
    """Slow OSD heartbeats in ceph.log should raise an issue."""
    mock_cephbase.return_value = mock.MagicMock()
    mock_cephbase.return_value.plugin_runnable = True
    with tempfile.TemporaryDirectory() as tmpdir:
        log_dir = os.path.join(tmpdir, 'var/log/ceph')
        os.makedirs(log_dir)
        with open(os.path.join(log_dir, 'ceph.log'), 'w') as f:
            f.write(OSD_SLOW_HEARTBEATS)

        setup_config(DATA_ROOT=tmpdir)
        YScenarioChecker()()

    # NOTE: message text must match the scenario definition exactly.
    expected_msg = ("One or more Ceph OSDs is showing slow heartbeats. "
                    "This most "
                    "commonly a result of network issues between OSDs. "
                    "Please "
                    "check that the interfaces and network between OSDs "
                    "is not "
                    "experiencing problems.")
    raised = list(IssuesManager().load_issues().values())[0]
    self.assertEqual([issue['desc'] for issue in raised], [expected_msg])
def test_add_issue_empty_context(self):
    """Context key/values set on an IssueContext appear in the output."""
    setup_config(MACHINE_READABLE=True)
    ctxt = IssueContext()
    ctxt.set(linenumber=123, path='/foo/bar')
    mgr = IssuesManager()
    mgr.add(MemoryWarning("test"), ctxt)
    expected = {IssuesManager.SUMMARY_OUT_ISSUES_ROOT:
                [{'type': 'MemoryWarning',
                  'desc': 'test',
                  'context': {'path': '/foo/bar',
                              'linenumber': 123},
                  'origin': 'testplugin.01part'}]}
    self.assertEqual(mgr.load_issues(), expected)
def test_yaml_def_scenario_result_filters_by_age(self, mock_cli):
    """filter_by_age keeps results newer than the given number of hours
    relative to the (mocked) current date."""
    mock_cli.return_value = mock.MagicMock()
    # "now" is 24h after the single event below
    mock_cli.return_value.date.return_value = "2022-01-07 00:00:00"
    with tempfile.TemporaryDirectory() as tmpdir:
        setup_config(PLUGIN_YAML_DEFS=tmpdir, DATA_ROOT=tmpdir,
                     PLUGIN_NAME='myplugin')
        logfile = os.path.join(tmpdir, 'foo.log')
        results = self._create_search_results(
            logfile, ['2022-01-06 00:00:00.000 an event\n'])

        # within 48h and exactly on the 24h boundary -> kept
        self.assertEqual(len(YPropertyCheck.filter_by_age(results, 48)),
                         1)
        self.assertEqual(len(YPropertyCheck.filter_by_age(results, 24)),
                         1)
        # older than 23h -> dropped
        self.assertEqual(len(YPropertyCheck.filter_by_age(results, 23)),
                         0)
def test_yaml_def_scenario_checks_false(self):
    """When the log contents do not satisfy any check, all checks must be
    False and no issues raised."""
    with tempfile.TemporaryDirectory() as dtmp:
        setup_config(PLUGIN_YAML_DEFS=dtmp, DATA_ROOT=dtmp,
                     PLUGIN_NAME='myplugin')
        logfile = os.path.join(dtmp, 'foo.log')
        # Use a context manager so the defs file is flushed and closed
        # before the checker reads it (the original open().write()
        # leaked the handle and relied on GC to flush the data).
        with open(os.path.join(dtmp, 'scenarios.yaml'), 'w') as fd:
            fd.write(SCENARIO_CHECKS)

        contents = ['2021-04-01 00:31:00.000 an event\n']
        self._create_search_results(logfile, contents)
        checker = scenarios.YScenarioChecker()
        checker.load()
        self.assertEqual(len(checker.scenarios), 1)
        for scenario in checker.scenarios:
            for check in scenario.checks.values():
                self.assertFalse(check.result)

        # now run the scenarios
        checker()

        self.assertEqual(IssuesManager().load_issues(), {})
def test_1910958(self):
    """Detect LP#1910958 (stale relation members) from the unit log."""
    with tempfile.TemporaryDirectory() as tmpdir:
        setup_config(DATA_ROOT=tmpdir)
        logfile = os.path.join(tmpdir,
                               'var/log/juju/unit-rabbitmq-server-0.log')
        os.makedirs(os.path.dirname(logfile))
        with open(logfile, 'w') as f:
            f.write(RABBITMQ_CHARM_LOGS)

        YScenarioChecker()()
        expected = {'bugs-detected':
                    [{'id': 'https://bugs.launchpad.net/bugs/1910958',
                      'desc':
                          ('Unit unit-rabbitmq-server-0 failed to start '
                           'due to members in relation 236 that cannot '
                           'be removed.'),
                      'origin': 'juju.01part'}]}
        self.assertEqual(KnownBugsStore().load(), expected)
def test_scenarios_cluster_logchecks(self):
    """Cluster log scenarios should raise all three expected issues."""
    with tempfile.TemporaryDirectory() as tmpdir:
        setup_config(DATA_ROOT=tmpdir)
        logfile = os.path.join(tmpdir,
                               'var/log/rabbitmq/[email protected]')
        os.makedirs(os.path.dirname(logfile))
        with open(logfile, 'w') as f:
            f.write(RABBITMQ_LOGS)

        YScenarioChecker()()
        # NOTE: these strings must match the scenario defs exactly
        # (including their existing spelling).
        expected_msgs = [
            ('Messages were discarded because transient mirrored '
             'classic queues are not syncronized. Please stop all '
             'rabbitmq-server units and restart the cluster. '
             'Note that a rolling restart will not work.'),
            ('This rabbitmq cluster either has or has had partitions - '
             'please check rabbtimqctl cluster_status.'),
            ('Transient mirrored classic queues are not deleted when '
             'there are no replicas available for promotion. Please '
             'stop all rabbitmq-server units and restart the cluster. '
             'Note that a rolling restart will not work.'),
        ]
        raised = list(IssuesStore().load().values())[0]
        self.assertEqual(sorted([issue['desc'] for issue in raised]),
                         sorted(expected_msgs))
def test_unit_checks(self, mock_cli):
    """Leadership errors only raise an issue when within the age limit."""
    mock_cli.return_value = mock.MagicMock()
    with tempfile.TemporaryDirectory() as tmpdir:
        setup_config(DATA_ROOT=tmpdir)
        logfile = os.path.join(tmpdir, 'var/log/juju/unit-keystone-2.log')
        os.makedirs(os.path.dirname(logfile))
        with open(logfile, 'w') as f:
            f.write(UNIT_LEADERSHIP_ERROR)

        # first try outside age limit
        mock_cli.return_value.date.return_value = "2021-09-25 00:00:00"
        YScenarioChecker()()
        self.assertEqual(IssuesStore().load(), {})

        # then within
        mock_cli.return_value.date.return_value = "2021-09-17 00:00:00"
        YScenarioChecker()()
        expected_msg = ("Juju unit(s) 'keystone' are showing leadership "
                        "errors in their logs from the last 7 days. "
                        "Please investigate.")
        raised = list(IssuesStore().load().values())[0]
        self.assertEqual([issue['desc'] for issue in raised],
                         [expected_msg])
def test_scenario_osd_flapping(self, mock_cephbase):
    """OSD flapping combined with interface errors raises an issue that
    includes machine-readable context."""
    mock_cephbase.return_value = mock.MagicMock()
    mock_cephbase.return_value.plugin_runnable = True
    mock_cephbase.return_value.has_interface_errors = True
    mock_cephbase.return_value.bind_interface_names = 'ethX'
    with tempfile.TemporaryDirectory() as tmpdir:
        log_dir = os.path.join(tmpdir, 'var/log/ceph')
        os.makedirs(log_dir)
        logpath = os.path.join(log_dir, 'ceph.log')
        with open(logpath, 'w') as f:
            f.write(OSD_FLAPPING)

        setup_config(DATA_ROOT=tmpdir)
        YScenarioChecker()()
        expected_msg = ("Cluster is experiencing OSD flapping. The "
                        "network interface(s) (ethX) used by the Ceph "
                        "are showing errors - please investigate.")

        # Since we have enabled machine readable we should get some
        # context so test that as well.
        expected_context = {
            logpath: 3,
            'ops': 'truth',
            'passes': True,
            'property': ('hotsos.core.plugins.storage.ceph.'
                         'CephChecksBase.has_interface_errors'),
            'value_actual': True}
        raised = list(IssuesManager().load_issues().values())[0]
        self.assertEqual([issue['desc'] for issue in raised],
                         [expected_msg])
        self.assertEqual([issue['context'] for issue in raised],
                         [expected_context])
def test_ovs_ofctl_bin_w_errors(self):
    """ovs_ofctl_show retries across OpenFlow versions and always
    returns an iterable even when every attempt fails."""

    def fake_check_output(cmd, *_args, **_kwargs):
        # only the OpenFlow13 invocation succeeds
        if 'OpenFlow13' in cmd:
            return 'testdata'.encode(encoding='utf_8', errors='strict')

        raise subprocess.CalledProcessError(1, 'ofctl')

    setup_config(DATA_ROOT='/')
    with mock.patch.object(cli.subprocess, 'check_output') as \
            mock_check_output:
        # Test errors with eventual success
        mock_check_output.side_effect = fake_check_output
        helper = cli.CLIHelper()
        self.assertEqual(helper.ovs_ofctl_show(bridge='br-int'),
                         ['testdata'])

        # Ensure that if all fails the result is always iterable
        mock_check_output.side_effect = \
            subprocess.CalledProcessError(1, 'ofctl')
        helper = cli.CLIHelper()
        self.assertEqual(helper.ovs_ofctl_show(bridge='br-int'), [])
def test_systemd_config(self):
    """CPUAffinity is read from system.conf (ignoring commented lines)
    and can be expanded from range syntax to a list of cpu ids."""
    with tempfile.TemporaryDirectory() as tmpdir:
        setup_config(DATA_ROOT=tmpdir)
        conf_path = os.path.join(tmpdir, 'etc/systemd/system.conf')
        os.makedirs(os.path.dirname(conf_path))
        with open(conf_path, 'w') as f:
            f.write("[Manager]\n")
            f.write("#CPUAffinity=1 2\n")
            f.write("CPUAffinity=0-7,32-39\n")

        # range syntax: raw string and expanded list
        self.assertEqual(SystemdConfig().get('CPUAffinity'), '0-7,32-39')
        self.assertEqual(SystemdConfig().get('CPUAffinity',
                                             expand_to_list=True),
                         [0, 1, 2, 3, 4, 5, 6, 7, 32, 33, 34, 35, 36,
                          37, 38, 39])
        self.assertTrue(SystemdConfig().cpuaffinity_enabled)

        # space-separated syntax is returned verbatim
        with open(conf_path, 'w') as f:
            f.write("[Manager]\n")
            f.write("#CPUAffinity=1 2\n")
            f.write("CPUAffinity=0 1 2 3 8 9 10 11\n")

        self.assertEqual(SystemdConfig().get('CPUAffinity'),
                         '0 1 2 3 8 9 10 11')
def setup_env(self):
    """Point the plugin tmp dir at a fresh temporary directory."""
    log.debug("setting up env")
    setup_config(PLUGIN_TMP_DIR=tempfile.mkdtemp())