Example 1
 def test_scenarios_cluster_config(self):
     YScenarioChecker()()
     msg = ('Cluster partition handling is currently set to "ignore". This '
            'is potentially dangerous and a setting of '
            '"pause_minority" is recommended.')
     issues = list(IssuesStore().load().values())[0]
     self.assertEqual([issue['desc'] for issue in issues], [msg])
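All of the examples that follow share the same pattern: invoke YScenarioChecker()() against a real or mocked data root, then assert on the issues or bugs that were raised. Below is a minimal sketch of the imports and helpers these snippets appear to rely on; the commented-out hotsos import paths are assumptions inferred from the names used in the tests, not something shown in the excerpts themselves.

    import os
    import json
    import tempfile
    from unittest import mock

    # Assumed to come from the hotsos test/package tree (paths are guesses
    # based on usage in the examples; adjust to the real layout):
    # from hotsos.core.config import setup_config
    # from hotsos.core.issues import IssuesManager, IssuesStore
    # from hotsos.core.ycheck.scenarios import YScenarioChecker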
Example 2
    def test_lp1936136(self, mock_cli, mock_cephbase, mock_kernelbase,
                       mock_cset_config, mock_ceph_config):
        def fake_ceph_config(key):
            if key == 'bluefs_buffered_io':
                return 'true'

        mock_cli.return_value = mock.MagicMock()
        mock_cli.return_value.dpkg_l.return_value = \
            ["ii  ceph-osd 14.2.22-0ubuntu0.20.04.2 amd64"]

        mock_cset_config.return_value = mock.MagicMock()
        mock_cset_config.return_value.get.return_value = 69

        mock_ceph_config.return_value = mock.MagicMock()
        mock_ceph_config.return_value.get.side_effect = fake_ceph_config

        mock_cephbase.return_value = mock.MagicMock()
        mock_cephbase.return_value.local_osds_use_bcache = True
        mock_kernelbase.return_value = mock.MagicMock()
        mock_kernelbase.return_value.version = '5.3'

        YScenarioChecker()()

        msg = ('This host has Ceph OSDs using bcache block devices and may be '
               'vulnerable to bcache bug LP 1936136 since '
               'bcache cache_available_percent is lt 70 (actual=69). The '
               'current workaround is to set bluefs_buffered_io=false in Ceph '
               'or upgrade to a kernel >= 5.4.')

        issues = list(IssuesManager().load_bugs().values())[0]
        self.assertEqual([issue['desc'] for issue in issues], [msg])
Example 3
    def test_flow_lookup_checks_p2(self, mock_cli):
        mock_cli.return_value = mock.MagicMock()
        mock_cli.return_value.ovs_appctl_dpctl_show.return_value = \
            ['lookups: hit:39017272903 missed:137481120 lost:54691089']

        with tempfile.TemporaryDirectory() as dtmp:
            setup_config(DATA_ROOT=dtmp)
            logfile = os.path.join(dtmp,
                                   'var/log/openvswitch/ovs-vswitchd.log')
            os.makedirs(os.path.dirname(logfile))
            with open(logfile, 'w') as fd:
                fd.write(DPIF_LOST_PACKETS_LOGS)

            YScenarioChecker()()
            msg = ('OVS datapath is reporting a non-zero amount of "lost" '
                   'packets (total=54691089) which implies that packets '
                   'destined for userspace (e.g. vm tap) are being dropped. '
                   'ovs-vswitchd is also reporting large numbers of dropped '
                   'packets within a 24h period (look for '
                   '"system@ovs-system: lost packet on port channel"). '
                   'This could be caused by '
                   'overloaded system cores blocking ovs threads from '
                   'delivering packets in time. Please check ovs-appctl '
                   'dpctl/show to see if the number of lost packets is still '
                   'increasing.')
            issues = list(IssuesStore().load().values())[0]
            self.assertEqual([issue['desc'] for issue in issues], [msg])
Example 4
 def test_bug_check_lp1959649(self, mock_cephdaemon, mock_helper):
     mock_helper.return_value = mock.MagicMock()
     mock_helper.return_value.dpkg_l.return_value = \
         ["ii  ceph-osd 15.2.7-0ubuntu0.20.04.2 amd64"]
     mock_cephdaemon.return_value = mock.MagicMock()
     mock_cephdaemon.return_value.bluestore_volume_selection_policy = \
         ['rocksdb_original']
     YScenarioChecker()()
     msg = ('This host is vulnerable to known bug '
            'https://tracker.ceph.com/issues/38745. RocksDB needs more '
            'space than the leveled space available so it is using storage '
            'from the data disk. Please set '
            'bluestore_volume_selection_policy of all OSDs to '
            'use_some_extra')
     expected = {
         'bugs-detected': [{
             'context': {
                 'passes': True
             },
             'desc': msg,
             'id': 'https://bugs.launchpad.net/bugs/1959649',
             'origin': 'storage.01part'
         }]
     }
     self.assertEqual(IssuesManager().load_bugs(), expected)
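Example 4 (like Example 6 further down) asserts against the full mapping returned by load_bugs(), which clarifies the extraction used everywhere else: the loaders appear to return a dict keyed by issue category whose values are lists of entries carrying at least a 'desc' field. A self-contained sketch of that pattern, using purely illustrative data rather than real hotsos output:

    # Hypothetical loader output, shaped like the 'expected' dict above.
    bugs = {
        'bugs-detected': [{
            'context': {'passes': True},
            'desc': 'example issue description',
            'id': 'https://bugs.launchpad.net/bugs/1959649',
            'origin': 'storage.01part',
        }]
    }
    # The other tests take the first value and compare each entry's 'desc'.
    issues = list(bugs.values())[0]
    assert [issue['desc'] for issue in issues] == ['example issue description']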
Example 5
 def test_osd_messenger_v2_protocol(self):
     YScenarioChecker()()
     msg = ("This Ceph cluster has 1 OSD(s) that do not bind to a v2 "
            "messenger address. This will cause unexpected behaviour and "
            "should be resolved asap.")
     issues = list(IssuesManager().load_issues().values())[0]
     self.assertEqual([issue['desc'] for issue in issues], [msg])
Example 6
    def test_1943937(self):
        with tempfile.TemporaryDirectory() as dtmp:
            setup_config(DATA_ROOT=dtmp)
            logfile = os.path.join(dtmp, 'var/log/rabbitmq/[email protected]')
            os.makedirs(os.path.dirname(logfile))
            with open(logfile, 'w') as fd:
                fd.write("operation queue.declare caused a channel exception "
                         "not_found: failed to perform operation on queue "
                         "'test_exchange_queue' in vhost "
                         "'nagios-rabbitmq-server-0' due to timeout")

            YScenarioChecker()()
            msg = ('Known RabbitMQ issue where queues get stuck and clients '
                   'trying to use them will just keep timing out. This stops '
                   'many services in the cloud from working correctly. '
                   'Resolution requires you to stop all RabbitMQ servers '
                   'before starting them all again at the same time. A '
                   'rolling restart or restarting them simultaneously will '
                   'not work. See bug for more detail.')

            expected = {
                'bugs-detected': [{
                    'id': 'https://bugs.launchpad.net/bugs/1943937',
                    'desc': msg,
                    'origin': 'rabbitmq.01part'
                }]
            }
            self.assertEqual(IssuesManager().load_bugs(), expected)
Example 7
 def test_unattended_upgrades(self):
     YScenarioChecker()()
     msg = ('Unattended upgrades are enabled which can lead to '
            'uncontrolled changes to this environment. If maintenance '
            'windows are required please consider disabling unattended '
            'upgrades.')
     issues = list(IssuesStore().load().values())[0]
     self.assertEqual([issue['desc'] for issue in issues], [msg])
Example 8
 def test_filestore_to_bluestore_upgrade(self, mock_ceph_config):
     mock_ceph_config.return_value = mock.MagicMock()
     mock_ceph_config.return_value.get = lambda args: '/journal/path'
     YScenarioChecker()()
     msg = ("Ceph Bluestore is enabled yet there is a still a journal "
            "device configured in ceph.conf - please check")
     issues = list(IssuesManager().load_issues().values())[0]
     self.assertEqual([issue['desc'] for issue in issues], [msg])
Example 9
    def test_ssd_osds_no_discard(self):
        self.skipTest("scenario currently disabled until fixed")

        YScenarioChecker()()
        msgs = [("This host has osds with device_class 'ssd' but Bluestore "
                 "discard is not enabled. The recommendation is to set 'bdev "
                 "enable discard true'.")]
        issues = list(IssuesManager().load_issues().values())[0]
        self.assertEqual([issue['desc'] for issue in issues], msgs)
Example 10
 def test_juju_ceph_no_bcache_tuning(self):
     YScenarioChecker()()
     msg = ("This host is running Juju-managed Ceph OSDs that are "
            "using bcache devices yet the bcache-tuning charm was "
            "not detected. It is recommended to use the "
            "bcache-tuning charm to ensure optimal bcache "
            "configuration.")
     issues = list(IssuesManager().load_issues().values())[0]
     self.assertEqual([issue['desc'] for issue in issues], [msg])
Example 11
 def test_bdev(self):
     with tempfile.TemporaryDirectory() as dtmp:
         self.setup_bcachefs(dtmp, bdev_error=True)
         setup_config(DATA_ROOT=dtmp)
         YScenarioChecker()()
         msg = ('bcache config writeback_percent expected to be ge '
                '10 but actual=1.')
         issues = list(IssuesManager().load_issues().values())[0]
         self.assertEqual([issue['desc'] for issue in issues], [msg])
Example 12
 def test_bluefs_size(self):
     YScenarioChecker()()
     msg = ('Found 3 Ceph OSDs with metadata size larger than 10G. This '
            'could be the result of a compaction failure/bug and this host '
            'may be affected by https://tracker.ceph.com/issues/45903. A '
            'workaround (>= Nautilus) is to manually compact using '
            "'ceph-bluestore-tool'.")
     issues = list(IssuesManager().load_bugs().values())[0]
     self.assertEqual([issue['desc'] for issue in issues], [msg])
Example 13
 def test_unresponsive_mgr_p1(self):
     YScenarioChecker()()
     msg = ("One or more sosreport ceph plugins contain incomplete data. "
            "This usually indicates a problem with ceph mon/mgr. Please "
            "check ceph-mon.log and retry commands to see if they are "
            "still unresponsive. Restarting ceph-mon and ceph-mgr might "
            "resolve this.")
     issues = list(IssuesManager().load_issues().values())[0]
     self.assertEqual([issue['desc'] for issue in issues], [msg])
Example 14
    def test_required_osd_release(self, mock_helper):
        mock_helper.return_value = mock.MagicMock()
        mock_helper.return_value.ceph_versions.return_value = \
            CEPH_VERSIONS_MISMATCHED_MAJOR.split('\n')

        YScenarioChecker()()
        msg = ("Ceph cluster config 'require_osd_release' is set to 'octopus' "
               "but not all OSDs are on that version - please check.")
        issues = list(IssuesManager().load_issues().values())[0]
        self.assertEqual([issue['desc'] for issue in issues], [msg])
Example 15
    def test_laggy_pgs(self, mock_helper):
        mock_helper.return_value = mock.MagicMock()
        mock_helper.return_value.ceph_pg_dump_json_decoded.return_value = \
            PG_DUMP_JSON_DECODED

        YScenarioChecker()()
        msg = ('Ceph cluster is reporting 1 laggy/wait PGs. This suggests a '
               'potential network or storage issue - please check.')
        issues = list(IssuesManager().load_issues().values())[0]
        self.assertEqual([issue['desc'] for issue in issues], [msg])
Example 16
    def test_unresponsive_mgr_p2(self):
        YScenarioChecker()()
        msg = ("Some ceph commands are returning incomplete data. This "
               "usually indicates a problem with ceph mon/mgr. Please check "
               "ceph-mon.log and retry commands to see if they are still "
               "unresponsive. Restarting ceph-mon and ceph-mgr might "
               "resolve this.")

        issues = list(IssuesManager().load_issues().values())[0]
        self.assertEqual([issue['desc'] for issue in issues], [msg])
Example 17
 def test_ceph_versions_mismatch_p2(self, mock_helper):
     mock_helper.return_value = mock.MagicMock()
     mock_helper.return_value.ceph_versions.return_value = \
         CEPH_VERSIONS_MISMATCHED_MINOR_MONS_UNALIGNED.split('\n')
     YScenarioChecker()()
     msg = ('One or more Ceph mons has a version lower than other daemons '
            'e.g. ceph-osd running in the cluster. This can cause '
            'unexpected behaviour and should be resolved as soon as '
            'possible. Check full summary output for current versions.')
     issues = list(IssuesManager().load_issues().values())[0]
     self.assertEqual([issue['desc'] for issue in issues], [msg])
Example 18
 def test_ceph_versions_mismatch_p1(self, mock_helper):
     mock_helper.return_value = mock.MagicMock()
     mock_helper.return_value.ceph_versions.return_value = \
         CEPH_VERSIONS_MISMATCHED_MINOR.split('\n')
     YScenarioChecker()()
     msg = ('Ceph daemon versions are not aligned across the cluster. This '
            'could be the result of an incomplete or failed cluster '
            'upgrade. All daemons, except the clients, should ideally be '
            'on the same version for ceph to function correctly.')
     issues = list(IssuesManager().load_issues().values())[0]
     self.assertEqual([issue['desc'] for issue in issues], [msg])
Example 19
    def test_plugin_timeouts(self):
        with tempfile.TemporaryDirectory() as dtmp:
            self.setup_timed_out_plugins(dtmp)
            YScenarioChecker()()

        msg = ('The following sosreport plugins have timed out and may '
               'have incomplete data: networking, system')
        issues = list(IssuesStore().load().values())[0]
        self.assertEqual([issue['type'] for issue in issues],
                         [SOSReportWarning('').name])
        self.assertEqual([issue['desc'] for issue in issues], [msg])
Example 20
    def test_crushmap_bucket_checks_mixed_buckets(self, mock_helper):
        mock_helper.return_value = mock.MagicMock()
        mock_helper.return_value.ceph_osd_crush_dump_json_decoded.\
            return_value = json.loads(CEPH_OSD_CRUSH_DUMP)

        YScenarioChecker()()
        msg = ("Mixed crush bucket types identified in buckets 'default'. "
               "This can cause data distribution to become skewed - please "
               "check crush map.")
        issues = list(IssuesManager().load_issues().values())[0]
        self.assertEqual([issue['desc'] for issue in issues], [msg])
Example 21
 def test_ceph_pg_imbalance(self, mock_helper):
     self.setup_fake_cli_osds_imbalanced_pgs(mock_helper)
     YScenarioChecker()()
     msg1 = ('Found some Ceph osd(s) with > 500 pgs - this is close to the '
             'hard limit at which point they will stop creating pgs and '
             'fail - please investigate.')
     msg2 = ('Found some Ceph osd(s) whose pg count is > 30% outside the '
             'optimal range of 50-200 pgs. This could indicate poor data '
             'distribution across the cluster and result in '
             'performance degradation.')
     issues = list(IssuesManager().load_issues().values())[0]
     self.assertEqual([issue['desc'] for issue in issues], [msg1, msg2])
Example 22
    def test_scenario_bluefs_spillover(self, mock_helper):
        mock_helper.return_value = mock.MagicMock()
        mock_helper.return_value.ceph_health_detail_json_decoded.return_value \
            = " experiencing BlueFS spillover"

        YScenarioChecker()()
        msg = ('Identified known Ceph bug. RocksDB needs more space than the '
               'leveled space available. See '
               'www.mail-archive.com/[email protected]/msg05782.html '
               'for more background information.')
        issues = list(IssuesManager().load_bugs().values())[0]
        self.assertEqual([issue['desc'] for issue in issues], [msg])
Example 23
    def test_scenario_osd_maps_backlog_too_large(self, mock_helper):
        pinned = {'osdmap_manifest': {'pinned_maps': range(5496)}}
        mock_helper.return_value = mock.MagicMock()
        mock_helper.return_value.ceph_report_json_decoded.return_value = pinned

        YScenarioChecker()()
        msg = ("This Ceph cluster has 5496 pinned osdmaps. This can affect "
               "ceph-mon performance and may also indicate bugs such as "
               "https://tracker.ceph.com/issues/44184 and "
               "https://tracker.ceph.com/issues/47290.")
        issues = list(IssuesManager().load_issues().values())[0]
        self.assertEqual([issue['desc'] for issue in issues], [msg])
Example 24
 def test_scenarios_cpufreq(self):
     YScenarioChecker()()
     msg = ('This node has Ceph OSDs running on it but is not using '
            'cpufreq scaling_governor in "performance" mode '
            '(actual=powersave). This is not recommended and can result '
            'in performance degradation. To fix this you can install '
            'cpufrequtils, set "GOVERNOR=performance" in '
            '/etc/default/cpufrequtils and run systemctl restart '
            'cpufrequtils. You will also need to stop and disable the '
            'ondemand systemd service in order for changes to persist.')
     issues = list(IssuesManager().load_issues().values())[0]
     self.assertEqual([issue['desc'] for issue in issues], [msg])
Example 25
    def test_flow_lookup_checks_p1(self, mock_cli):
        mock_cli.return_value = mock.MagicMock()
        mock_cli.return_value.ovs_appctl_dpctl_show.return_value = \
            ['lookups: hit:39017272903 missed:137481120 lost:54691089']

        YScenarioChecker()()
        msg = ('OVS datapath is reporting a non-zero amount of "lost" packets '
               '(total=54691089) which implies that packets destined for '
               'userspace (e.g. vm tap) are being dropped. Please check '
               'ovs-appctl dpctl/show to see if the number of lost packets is '
               'still increasing.')
        issues = list(IssuesStore().load().values())[0]
        self.assertEqual([issue['desc'] for issue in issues], [msg])
Example 26
    def test_unit_checks(self, mock_cli):
        mock_cli.return_value = mock.MagicMock()

        with tempfile.TemporaryDirectory() as dtmp:
            setup_config(DATA_ROOT=dtmp)
            logfile = os.path.join(dtmp, 'var/log/juju/unit-keystone-2.log')
            os.makedirs(os.path.dirname(logfile))
            with open(logfile, 'w') as fd:
                fd.write(UNIT_LEADERSHIP_ERROR)

            # first try outside age limit
            mock_cli.return_value.date.return_value = "2021-09-25 00:00:00"
            YScenarioChecker()()
            self.assertEqual(IssuesStore().load(), {})

            # then within
            mock_cli.return_value.date.return_value = "2021-09-17 00:00:00"
            YScenarioChecker()()
            msg = ("Juju unit(s) 'keystone' are showing leadership errors in "
                   "their logs from the last 7 days. Please investigate.")
            issues = list(IssuesStore().load().values())[0]
            self.assertEqual([issue['desc'] for issue in issues], [msg])
Example 27
    def test_oom_killer_invoked(self):
        with tempfile.TemporaryDirectory() as dtmp:
            setup_config(DATA_ROOT=dtmp)
            os.makedirs(os.path.join(dtmp, 'var/log'))
            klog = os.path.join(dtmp, 'var/log/kern.log')
            with open(klog, 'w') as fd:
                fd.write(KERNLOG_OOM)

            YScenarioChecker()()

        msg = ('1 reports of oom-killer invoked in kern.log - please check.')
        issues = list(IssuesStore().load().values())[0]
        self.assertEqual([issue['desc'] for issue in issues], [msg])
Example 28
 def test_large_omap_objects(self, mock_cli):
     mock_cli.return_value = mock.MagicMock()
     mock_cli.return_value.ceph_pg_dump_json_decoded.return_value = \
         PG_DUMP_JSON_DECODED
     YScenarioChecker()()
     msg = ("Large omap objects found in pgs '2.f'. "
            "This is usually resolved by deep-scrubbing the pgs. Check "
            "config options "
            "'osd_deep_scrub_large_omap_object_key_threshold' and "
            "'osd_deep_scrub_large_omap_object_value_sum_threshold' to "
            "find whether the values of these keys are too high. "
            "See full summary for more detail.")
     issues = list(IssuesManager().load_issues().values())[0]
     self.assertEqual([issue['desc'] for issue in issues], [msg])
Example 29
    def test_nf_conntrack_full(self):
        with tempfile.TemporaryDirectory() as dtmp:
            setup_config(DATA_ROOT=dtmp)
            os.makedirs(os.path.join(dtmp, 'var/log'))
            klog = os.path.join(dtmp, 'var/log/kern.log')
            with open(klog, 'w') as fd:
                fd.write(KERNLOG_NF_CONNTRACK_FULL)

            YScenarioChecker()()

        msg = ("1 reports of 'nf_conntrack: table full' detected in "
               "kern.log - please check.")
        issues = list(IssuesStore().load().values())[0]
        self.assertEqual([issue['desc'] for issue in issues], [msg])
Example 30
 def test_cacheset(self):
     with tempfile.TemporaryDirectory() as dtmp:
         self.setup_bcachefs(dtmp, cacheset_error=True)
         setup_config(DATA_ROOT=dtmp)
         YScenarioChecker()()
         bug_msg = (
             'bcache cache_available_percent is 33 (i.e. approx. 30%) '
             'which implies this node could be suffering from bug LP '
             '1900438 - please check.')
         issue_msg = ('bcache cacheset config congested_write_threshold_us '
                      'expected to be eq 0 but actual=100.')
         issues = list(IssuesManager().load_issues().values())[0]
         self.assertEqual([issue['desc'] for issue in issues], [issue_msg])
         bugs = list(IssuesManager().load_bugs().values())[0]
         self.assertEqual([issue['desc'] for issue in bugs], [bug_msg])