Example #1
0
    def _get_tiers_size(self):
        try:
            resp, body = self.service.ceph_api.osd_df(body='json',
                                                      output_method='tree')
        except IOError:
            return 0
        if not resp.ok:
            LOG.error(
                _LE("Getting the cluster usage "
                    "information failed: %(reason)s - "
                    "%(body)s") % {
                        "reason": resp.reason,
                        "body": body
                    })
            return {}

        # A node is a crushmap element: root, chassis, host, osd. Create a
        # dictionary for the nodes with the key as the id used for efficient
        # searching through nodes.
        #
        # For example: storage-0's node has one child node => OSD 0
        # {
        #     "id": -4,
        #     "name": "storage-0",
        #     "type": "host",
        #     "type_id": 1,
        #     "reweight": -1.000000,
        #     "kb": 51354096,
        #     "kb_used": 1510348,
        #     "kb_avail": 49843748,
        #     "utilization": 2.941047,
        #     "var": 1.480470,
        #     "pgs": 0,
        #     "children": [
        #         0
        #     ]
        # },
        search_tree = {}
        for node in body['output']['nodes']:
            search_tree[node['id']] = node

        # Extract the tiers as we will return a dict for the size of each tier
        tiers = {k: v for k, v in search_tree.items() if v['type'] == 'root'}

        # For each tier, traverse the heirarchy from the root->chassis->host.
        # Sum the host sizes to determine the overall size of the tier
        tier_sizes = {}
        for tier in tiers.values():
            tier_size = 0
            for chassis_id in tier['children']:
                chassis_size = 0
                chassis = search_tree[chassis_id]
                for host_id in chassis['children']:
                    host = search_tree[host_id]
                    if (chassis_size == 0 or chassis_size > host['kb']):
                        chassis_size = host['kb']
                tier_size += chassis_size / (1024**2)
            tier_sizes[tier['name']] = tier_size

        return tier_sizes
Example #2
0
    def _report_fault(self, health, alarm_id):
        """Raise or refresh the Ceph cluster health alarm.

        Only handles the cluster health alarm id.  A fault is set when no
        alarm is currently raised or when its severity, reason text or
        service impact no longer matches the reported health.  Health
        transitions are additionally logged for later analysis.
        """
        if alarm_id != fm_constants.FM_ALARM_ID_STORAGE_CEPH:
            return

        new_severity = constants.SEVERITY[health['health']]
        new_reason_text = self._parse_reason(health)
        new_service_affecting = \
            constants.SERVICE_AFFECTING[health['health']]

        # The alarm must be (re)raised when none exists yet or when any
        # of its reported attributes has changed.
        stale = not self.current_health_alarm
        if not stale:
            current = self.current_health_alarm.__dict__
            stale = (
                current['severity'] != new_severity or
                current['reason_text'] != new_reason_text or
                current['service_affecting'] != str(new_service_affecting))

        if stale:
            fault = fm_api.Fault(
                alarm_id=fm_constants.FM_ALARM_ID_STORAGE_CEPH,
                alarm_type=fm_constants.FM_ALARM_TYPE_4,
                alarm_state=fm_constants.FM_ALARM_STATE_SET,
                entity_type_id=fm_constants.FM_ENTITY_TYPE_CLUSTER,
                entity_instance_id=self.service.entity_instance_id,
                severity=new_severity,
                reason_text=new_reason_text,
                probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_15,
                proposed_repair_action=constants.REPAIR_ACTION,
                service_affecting=new_service_affecting)

            alarm_uuid = self.service.fm_api.set_fault(fault)
            if alarm_uuid:
                LOG.info(_LI(
                    "Created storage alarm %(alarm_uuid)s - "
                    "severity: %(severity)s, reason: %(reason)s, "
                    "service_affecting: %(service_affecting)s") % {
                    "alarm_uuid": alarm_uuid,
                    "severity": new_severity,
                    "reason": new_reason_text,
                    "service_affecting": new_service_affecting})
            else:
                LOG.error(_LE(
                    "Failed to create storage alarm - "
                    "severity: %(severity)s, reason: %(reason)s "
                    "service_affecting: %(service_affecting)s") % {
                    "severity": new_severity,
                    "reason": new_reason_text,
                    "service_affecting": new_service_affecting})

        # Keep a detailed log trail of Ceph health transitions.
        if (health['health'] != self.current_ceph_health or
                health['detail'] != self.detailed_health_reason):
            LOG.info(_LI("Ceph status changed: %(health)s "
                         "detailed reason: %(detail)s") % health)
            self.current_ceph_health = health['health']
            self.detailed_health_reason = health['detail']
Example #3
0
    def _get_osd_pool_quota(self, pool_name):
        try:
            resp, quota = self.service.ceph_api.osd_get_pool_quota(
                pool_name, body='json')
        except IOError:
            return 0

        if not resp.ok:
            LOG.error(_LE("Getting the quota for "
                          "%(name)s pool failed:%(reason)s)") %
                      {"name": pool_name, "reason": resp.reason})
            return 0
        else:
            try:
                quota_gib = int(quota["output"]["quota_max_bytes"]) / (1024**3)
                return quota_gib
            except IOError:
                return 0
Example #4
0
    def _report_fault(self, health, alarm_id):
        """Raise storage alarms for cluster health or quota mismatches.

        For the cluster health alarm id, the fault is set/updated when no
        alarm exists or its attributes are stale, and health transitions
        are logged.  For the free-space alarm id, a quota/space mismatch
        alarm is raised for the affected tier unless one is already
        active.
        """
        if alarm_id == fm_constants.FM_ALARM_ID_STORAGE_CEPH:
            new_severity = constants.SEVERITY[health['health']]
            new_reason_text = self._parse_reason(health)
            new_service_affecting = \
                constants.SERVICE_AFFECTING[health['health']]

            # Compare the existing alarm (if any) against the freshly
            # computed values as tuples; any mismatch means the alarm
            # must be raised/refreshed.
            stale = not self.current_health_alarm
            if not stale:
                current = self.current_health_alarm.__dict__
                stale = ((current['severity'],
                          current['reason_text'],
                          current['service_affecting']) !=
                         (new_severity,
                          new_reason_text,
                          str(new_service_affecting)))

            if stale:
                fault = fm_api.Fault(
                    alarm_id=fm_constants.FM_ALARM_ID_STORAGE_CEPH,
                    alarm_type=fm_constants.FM_ALARM_TYPE_4,
                    alarm_state=fm_constants.FM_ALARM_STATE_SET,
                    entity_type_id=fm_constants.FM_ENTITY_TYPE_CLUSTER,
                    entity_instance_id=self.service.entity_instance_id,
                    severity=new_severity,
                    reason_text=new_reason_text,
                    probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_15,
                    proposed_repair_action=constants.REPAIR_ACTION,
                    service_affecting=new_service_affecting)

                alarm_uuid = self.service.fm_api.set_fault(fault)
                if alarm_uuid:
                    LOG.info(
                        _LI("Created storage alarm %(alarm_uuid)s - "
                            "severity: %(severity)s, reason: %(reason)s, "
                            "service_affecting: %(service_affecting)s") % {
                                "alarm_uuid": alarm_uuid,
                                "severity": new_severity,
                                "reason": new_reason_text,
                                "service_affecting": new_service_affecting
                            })
                else:
                    LOG.error(
                        _LE("Failed to create storage alarm - "
                            "severity: %(severity)s, reason: %(reason)s "
                            "service_affecting: %(service_affecting)s") % {
                                "severity": new_severity,
                                "reason": new_reason_text,
                                "service_affecting": new_service_affecting
                            })

            # Keep a detailed log trail of health transitions for later
            # analysis.
            if (health['health'] != self.current_ceph_health
                    or health['detail'] != self.detailed_health_reason):
                LOG.info(
                    _LI("Ceph status changed: %(health)s "
                        "detailed reason: %(detail)s") % health)
                self.current_ceph_health = health['health']
                self.detailed_health_reason = health['detail']

        elif (alarm_id == fm_constants.FM_ALARM_ID_STORAGE_CEPH_FREE_SPACE
              and health['tier_eid'] not in self.current_quota_alarms):

            tier_name = health['tier_name']
            quota_reason_text = ("Quota/Space mismatch for the %s tier. The "
                                 "sum of Ceph pool quotas does not match the "
                                 "tier size." % tier_name)
            fault = fm_api.Fault(
                alarm_id=fm_constants.FM_ALARM_ID_STORAGE_CEPH_FREE_SPACE,
                alarm_state=fm_constants.FM_ALARM_STATE_SET,
                entity_type_id=fm_constants.FM_ENTITY_TYPE_CLUSTER,
                entity_instance_id=health['tier_eid'],
                severity=fm_constants.FM_ALARM_SEVERITY_MINOR,
                reason_text=quota_reason_text,
                alarm_type=fm_constants.FM_ALARM_TYPE_7,
                probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_75,
                proposed_repair_action=(
                    "Update ceph storage pool quotas to use all available "
                    "cluster space for the %s tier." % tier_name),
                service_affecting=False)

            alarm_uuid = self.service.fm_api.set_fault(fault)
            if alarm_uuid:
                LOG.info(
                    _LI("Created storage quota storage alarm %(alarm_uuid)s. "
                        "Reason: %(reason)s") % {
                            "alarm_uuid": alarm_uuid,
                            "reason": quota_reason_text
                        })
            else:
                LOG.error(
                    _LE("Failed to create quota "
                        "storage alarm. Reason: %s") % quota_reason_text)
Example #5
0
    def _report_alarm_osds_health(self):
        """Reconcile per-peer-group OSD health alarms with Ceph state.

        Fetches the crushmap from 'ceph osd tree', collects per-peer-group
        health reports for the "storage-tier" hierarchy, then compares them
        with the currently raised major/critical storage alarms: unchanged
        alarms are kept as-is, changed ones are re-raised, and alarms with
        no matching report are cleared at the end.
        """
        response, osd_tree = self.service.ceph_api.osd_tree(body='json')
        if not response.ok:
            LOG.error(
                _LE("Failed to retrieve Ceph OSD tree: "
                    "status_code: %(status_code)s, reason: %(reason)s") % {
                        "status_code": response.status_code,
                        "reason": response.reason
                    })
            return
        # Index crushmap nodes by id for the tier check below.
        osd_tree = {n['id']: n for n in osd_tree['output']['nodes']}
        alarms = []

        # _check_storage_tier reports (peer_group, reason, severity)
        # tuples through the callback.
        self._check_storage_tier(osd_tree, "storage-tier",
                                 lambda *args: alarms.append(args))

        # Snapshot the currently raised major/critical alarms, keyed by
        # entity instance id.  Entries still present after the loop below
        # have no matching report and are cleared.
        old_alarms = {}
        for alarm_id in [
                fm_constants.FM_ALARM_ID_STORAGE_CEPH_MAJOR,
                fm_constants.FM_ALARM_ID_STORAGE_CEPH_CRITICAL
        ]:
            alarm_list = self.service.fm_api.get_faults_by_id(alarm_id)
            if not alarm_list:
                continue
            for alarm in alarm_list:
                old_alarms.setdefault(alarm.entity_instance_id, []).append(
                    (alarm.alarm_id, alarm.reason_text))

        for peer_group, reason, severity in alarms:
            if self._current_health_alarm_equals(reason, severity):
                continue
            alarm_critical_major = fm_constants.FM_ALARM_ID_STORAGE_CEPH_MAJOR
            if severity == fm_constants.FM_ALARM_SEVERITY_CRITICAL:
                alarm_critical_major = (
                    fm_constants.FM_ALARM_ID_STORAGE_CEPH_CRITICAL)
            entity_instance_id = (self.service.entity_instance_id +
                                  '.peergroup=' + peer_group)
            alarm_already_exists = False
            if entity_instance_id in old_alarms:
                # Iterate over a copy: entries are removed from the live
                # list while scanning it, and removing from the list being
                # iterated silently skips its next element (e.g. when both
                # a major and a critical alarm exist for this entity).
                for alarm_id, old_reason in list(
                        old_alarms[entity_instance_id]):
                    if (reason == old_reason
                            and alarm_id == alarm_critical_major):
                        # if the alarm is exactly the same, we don't need
                        # to recreate it
                        old_alarms[entity_instance_id].remove(
                            (alarm_id, old_reason))
                        alarm_already_exists = True
                    elif (alarm_id == alarm_critical_major):
                        # if we change just the reason, then we just remove the
                        # alarm from the list so we don't remove it at the
                        # end of the function
                        old_alarms[entity_instance_id].remove(
                            (alarm_id, old_reason))

                if not old_alarms[entity_instance_id]:
                    del old_alarms[entity_instance_id]

                # in case the alarm is exactly the same, we skip the alarm set
                if alarm_already_exists is True:
                    continue
            major_repair_action = constants.REPAIR_ACTION_MAJOR_CRITICAL_ALARM
            fault = fm_api.Fault(
                alarm_id=alarm_critical_major,
                alarm_type=fm_constants.FM_ALARM_TYPE_4,
                alarm_state=fm_constants.FM_ALARM_STATE_SET,
                entity_type_id=fm_constants.FM_ENTITY_TYPE_CLUSTER,
                entity_instance_id=entity_instance_id,
                severity=severity,
                reason_text=reason,
                probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_15,
                proposed_repair_action=major_repair_action,
                service_affecting=constants.SERVICE_AFFECTING['HEALTH_WARN'])
            alarm_uuid = self.service.fm_api.set_fault(fault)
            if alarm_uuid:
                LOG.info(
                    _LI("Created storage alarm %(alarm_uuid)s - "
                        "severity: %(severity)s, reason: %(reason)s, "
                        "service_affecting: %(service_affecting)s") % {
                            "alarm_uuid": str(alarm_uuid),
                            "severity": str(severity),
                            "reason": reason,
                            "service_affecting":
                            str(constants.SERVICE_AFFECTING['HEALTH_WARN'])
                        })
            else:
                LOG.error(
                    _LE("Failed to create storage alarm - "
                        "severity: %(severity)s, reason: %(reason)s, "
                        "service_affecting: %(service_affecting)s") % {
                            "severity": str(severity),
                            "reason": reason,
                            "service_affecting":
                            str(constants.SERVICE_AFFECTING['HEALTH_WARN'])
                        })

        # Whatever is left was raised earlier but not reported this pass:
        # clear it.
        for entity_instance_id in old_alarms:
            for alarm_id, _old_reason in old_alarms[entity_instance_id]:
                self.service.fm_api.clear_fault(alarm_id, entity_instance_id)