Example No. 1
    def get(self, request, fsid):
        servers = self.client.server_list_cluster(fsid, async=True)
        osd_data = self.client.get_sync_object(fsid, OsdMap.str, async=True)
        osds = self.client.list(fsid, OSD, {}, async=True)
        pg_summary = self.client.get_sync_object(fsid, PgSummary.str, async=True)
        osds = osds.get()
        servers = servers.get()
        osd_data = osd_data.get()
        pg_summary = pg_summary.get()

        osd_map = OsdMap(None, osd_data)

        server_info = self.client.server_by_service([ServiceId(fsid, OSD, str(osd['osd'])) for osd in osds], async=True)
        server_info = server_info.get()

        osds, osds_by_pg_state = self.generate(pg_summary, osd_map, server_info, servers)

        if not osds or not osds_by_pg_state:
            return Response([], status.HTTP_202_ACCEPTED)

        pg_states = request.QUERY_PARAMS.get('pg_states', None)
        if pg_states:
            osds = self._filter_by_pg_state(osds, pg_states, osds_by_pg_state)

        osd_list = DataObject({
            # 'osds': [DataObject({'osd': o}) for o in osds],
            'osds': osds,
            'osds_by_pg_state': osds_by_pg_state
        })

        return Response(OSDListSerializer(osd_list).data)
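Every remote read above is issued with `async=True` first and only awaited with `.get()` afterwards, so the round-trips overlap instead of running one after another. (`async` became a reserved word in Python 3.7, so these snippets target Python 2.) Below is a minimal sketch of the same fan-out-then-gather idiom using only the standard library; the names here are illustrative and are not part of the client API shown above.

# A self-contained sketch of the "issue everything, then gather" idiom.
from concurrent.futures import ThreadPoolExecutor


def fetch(name):
    # Stand-in for one remote call such as server_list_cluster or get_sync_object.
    return "result for %s" % name


with ThreadPoolExecutor() as pool:
    # Submit every request before waiting on any of them, so the
    # round-trips overlap instead of running back to back.
    futures = {name: pool.submit(fetch, name)
               for name in ("servers", "osd_map", "osds", "pg_summary")}
    results = {name: f.result() for name, f in futures.items()}

print(results["osd_map"])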
Example No. 2
    def retrieve(self, request, fsid, osd_id):
        osd = self.client.get_sync_object(
            fsid, 'osd_map', ['osds_by_id', int(osd_id)])
        crush_node = self.client.get_sync_object(
            fsid, 'osd_map',
            ['osd_tree_node_by_id', int(osd_id)])
        osd['reweight'] = float(crush_node['reweight'])
        osd['server'] = self.client.server_by_service(
            [ServiceId(fsid, OSD, osd_id)])[0][1]

        pools = self.client.get_sync_object(
            fsid, 'osd_map', ['osd_pools', int(osd_id)])
        osd['pools'] = pools

        osd_metadata = self.client.get_sync_object(
            fsid, 'osd_map', ['metadata_by_id', int(osd_id)])
        try:
            osd['backend_device_node'] = osd_metadata[
                'backend_filestore_dev_node']
        except KeyError:
            osd['backend_device_node'] = None
        try:
            osd['backend_partition_path'] = osd_metadata[
                'backend_filestore_partition_path']
        except KeyError:
            osd['backend_partition_path'] = None

        osd_commands = self.client.get_valid_commands(fsid, OSD, [int(osd_id)])
        osd.update(osd_commands[int(osd_id)])
        parent_map = self.client.get_sync_object(fsid, 'osd_map',
                                                 ['parent_bucket_by_node_id'])
        osd.update(
            {'crush_node_ancestry': lookup_ancestry(osd['osd'], parent_map)})

        return Response(self.serializer_class(DataObject(osd)).data)
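The two try/except KeyError blocks above simply default optional metadata keys to None; `dict.get` expresses the same fallback more compactly. A purely illustrative equivalent with made-up sample data:

# Equivalent of the try/except KeyError blocks: missing keys default to None.
osd_metadata = {'backend_filestore_dev_node': '/dev/sdb'}  # illustrative sample input
osd = {}
osd['backend_device_node'] = osd_metadata.get('backend_filestore_dev_node')
osd['backend_partition_path'] = osd_metadata.get('backend_filestore_partition_path')
print(osd)  # {'backend_device_node': '/dev/sdb', 'backend_partition_path': None}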
Example No. 3
    def retrieve_status(self, request, fsid, mon_id):
        service_info = self.client.status_by_service(
            [ServiceId(fsid, 'mon', mon_id)])[0]
        if service_info is None:
            raise Http404("Mon not found '%s'" % mon_id)

        return Response(service_info['status'])
Example No. 4
    def status_by_service(self, services):
        result = self._manager.servers.get_services(
            [ServiceId(*s) for s in services])
        return [({
            'running': ss.running,
            'server': ss.server_state.fqdn,
            'status': ss.status
        } if ss else None) for ss in result]
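The result is positional: one entry per requested service, in request order, with None substituted for anything get_services could not find, which is why Example 3 can index the first element and test it for None directly. A small sketch of the shape a caller sees; the data below is made up:

# Illustrative data only: one entry per requested ServiceId, in request order.
result = [
    {'running': True, 'server': 'mon-a.example.com', 'status': {'election_epoch': 12}},
    None,  # second service was not found
]
for info in result:
    if info is None:
        print('service not found')
    else:
        print('%s running=%s' % (info['server'], info['running']))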
Example No. 5
    def _get_fqdn(self, fsid, service_type, service_id):
        """
        Resolve a service to a FQDN if possible, else return None
        """
        server = self._manager.servers.get_by_service(ServiceId(fsid, service_type, str(service_id)))
        if server is None:
            log.warn("No server found for service %s %s" % (service_type, service_id))
        return server.fqdn if server else None
Example No. 6
    def server_by_service(self, services):
        """
        Return a list of 2-tuples mapping service ID to server FQDN

        Note that we would rather return a dict but tuple dict keys are awkward to serialize
        """
        result = self._manager.servers.list_by_service([ServiceId(*s) for s in services])
        return result
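The docstring explains the design choice: tuple-keyed dicts serialize poorly, so the mapping comes back as (service_id, fqdn) pairs that callers unpack, as Example 7 does with zip. A purely illustrative sketch of consuming such pairs; the values below are made up:

# (service_id, fqdn) pairs as unpacked in Example 7; the fqdn is shown as None
# here to illustrate a service with no known server.
pairs = [(('fsid-1234', 'osd', '0'), 'ceph-node-1.example.com'),
         (('fsid-1234', 'osd', '1'), None)]

fqdn_by_service = dict(pairs)          # fine in memory, awkward to serialize as JSON
for service_id, fqdn in pairs:
    print(service_id, '->', fqdn)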
Example No. 7
    def list(self, request, fsid):
        # Get data needed for filtering
        list_filter = {}

        if 'pool' in request.GET:
            try:
                pool_id = int(request.GET['pool'])
            except ValueError:
                return Response("Pool ID must be an integer", status=status.HTTP_400_BAD_REQUEST)
            list_filter['pool'] = pool_id

        if 'id__in[]' in request.GET:
            try:
                ids = request.GET.getlist("id__in[]")
                list_filter['id__in'] = [int(i) for i in ids]
            except ValueError:
                return Response("Invalid OSD ID in list", status=status.HTTP_400_BAD_REQUEST)

        # Get data
        osds = self.client.list(fsid, OSD, list_filter, async=True)
        osd_to_pools = self.client.get_sync_object(fsid, 'osd_map', ['osd_pools'], async=True)
        crush_nodes = self.client.get_sync_object(fsid, 'osd_map', ['osd_tree_node_by_id'], async=True)
        osds = osds.get()

        # Get data depending on OSD list
        server_info = self.client.server_by_service([ServiceId(fsid, OSD, str(osd['osd'])) for osd in osds], async=True)
        osd_commands = self.client.get_valid_commands(fsid, OSD, [x['osd'] for x in osds], async=True)

        # Preparation complete, await all data to serialize result
        osd_to_pools = osd_to_pools.get()
        crush_nodes = crush_nodes.get()
        server_info = server_info.get()
        osd_commands = osd_commands.get()

        # Build OSD data objects
        for o in osds:
            # An OSD being in the OSD map does not guarantee its presence in the CRUSH
            # map, as "osd crush rm" and "osd rm" are separate operations.
            try:
                o.update({'reweight': float(crush_nodes[o['osd']]['reweight'])})
            except KeyError:
                log.warning("No CRUSH data available for OSD {0}".format(o['osd']))
                o.update({'reweight': 0.0})

        for o, (service_id, fqdn) in zip(osds, server_info):
            o['server'] = fqdn

        for o in osds:
            o['pools'] = osd_to_pools[o['osd']]

        for o in osds:
            o.update(osd_commands[o['osd']])

        return Response(self.serializer_class([DataObject(o) for o in osds], many=True).data)
Example No. 8
    def on_mon_map(self, mon_map, mon_status):
        """
        When a new mon map is received, use it to eliminate any mon
        ServiceState records that no longer exist in the real world.
        """
        log.debug("ServerMonitor.on_mon_map: %s" %
                  str([m['name'] for m in mon_map['mons']]))
        # We're no longer getting these via salt so we fake them
        # based on what we know in the mon_map
        if mon_status is None:
            mon_status = {}

        for mon in mon_map['mons']:
            services = {
                mon['name']: {
                    'fsid': mon_map['fsid'],
                    'type': 'mon',
                    'status': {
                        'election_epoch': mon_status.get('election_epoch'),
                        'quorum': mon_map['quorum'],
                        'rank': mon['rank']
                    },
                    'id': mon['name']
                }
            }
            mon_addr = mon.get('addr')
            mon_name = mon['name']
            if mon_addr is not None:
                # deal with CIDR notation
                mon_addr = mon_addr.split('/')[0].split(':')[0]
                try:
                    mon_name = socket.getfqdn(mon_addr)
                except socket.gaierror:
                    pass

            self.on_server_heartbeat(mon_name, {
                'boot_time': 0,
                'ceph_version': None,
                'services': services
            })

        map_mons = set([
            ServiceId(mon_map['fsid'], 'mon', m['name'])
            for m in mon_map['mons']
        ])
        known_mons = set([
            s.id for s in self.fsid_services[mon_map['fsid']]
            if s.service_type == 'mon'
        ])

        for stale_mon_id in known_mons - map_mons:
            self.forget_service(self.services[stale_mon_id])
Example No. 9
    def retrieve(self, request, fsid, osd_id):
        osd = self.client.get_sync_object(fsid, 'osd_map', ['osds_by_id', int(osd_id)])
        crush_node = self.client.get_sync_object(fsid, 'osd_map', ['osd_tree_node_by_id', int(osd_id)])
        osd['reweight'] = float(crush_node['reweight'])
        osd['server'] = self.client.server_by_service([ServiceId(fsid, OSD, osd_id)])[0][1]

        pools = self.client.get_sync_object(fsid, 'osd_map', ['osd_pools', int(osd_id)])
        osd['pools'] = pools

        osd_commands = self.client.get_valid_commands(fsid, OSD, [int(osd_id)])
        osd.update(osd_commands[int(osd_id)])

        return Response(self.serializer_class(DataObject(osd)).data)
Example No. 10
        def fixup_osd(osd):
            osd_id = osd['osd']
            data = dict((k, osd[k]) for k in OSD_FIELDS)
            data.update({'id': osd_id})
            data.update({'osd': osd_id})
            data.update({'pg_states': dict(pg_states_by_osd[osd_id])})
            data.update({'pools': list(pools_by_osd[osd_id])})

            server = server_monitor.get_by_service(
                ServiceId(cluster_monitor.fsid, 'osd', str(osd_id)))

            data.update({'host': server.hostname if server else None})
            data.update({'fqdn': server.fqdn if server else None})
            return data
Example No. 11
    def on_mon_map(self, mon_map):
        """
        When a new mon map is received, use it to eliminate any mon
        ServiceState records that no longer exist in the real world.
        """
        map_mons = set([
            ServiceId(mon_map['fsid'], 'mon', m['name'])
            for m in mon_map['mons']
        ])
        known_mons = set([
            s.id for s in self.fsid_services[mon_map['fsid']]
            if s.service_type == 'mon'
        ])
        for stale_mon_id in known_mons - map_mons:
            self.forget_service(self.services[stale_mon_id])
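The pattern is: build the set of service IDs present in the new map, build the set of currently known service IDs of that type, and forget the difference. A self-contained sketch of the same idea, with plain tuples standing in for ServiceId:

# The "forget whatever the new map no longer mentions" pattern, also used in
# Examples 8, 12 and 17; plain tuples stand in for ServiceId here.
known_mons = {('fsid-1234', 'mon', name) for name in ('a', 'b', 'c')}
map_mons = {('fsid-1234', 'mon', name) for name in ('a', 'b')}

for stale in known_mons - map_mons:
    print('forgetting', stale)  # stands in for self.forget_service(...)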
Example No. 12
    def on_mds_map(self, fsid, mds_map):
        """
        When a new MDS map is received, use it to eliminate any MDS
        ServiceState records that no longer exist in the real world.

        :param fsid: Pass in fsid string because mds map doesn't include it
        :param mds_map: The MDS map sync object
        """
        map_mds = set([
            ServiceId(fsid, 'mds', i['name'])
            for i in mds_map['info'].values()
        ])
        known_mds = set([
            s.id for s in self.fsid_services[fsid] if s.service_type == 'mds'
        ])
        for stale_mds_id in known_mds - map_mds:
            self.forget_service(self.services[stale_mds_id])
Example No. 13
    def _get_up_mon_servers(self, fsid):
        # Resolve FSID to list of mon FQDNs
        servers = self.client.server_list_cluster(fsid)
        # Sort to get most recently contacted server first; drop any
        # for whom last_contact is None
        servers = [s for s in servers if s['last_contact']]
        servers = sorted(servers,
                         key=lambda t: dateutil_parse(t['last_contact']),
                         reverse=True)
        mon_fqdns = []
        for server in servers:
            for service in server['services']:
                service_id = ServiceId(*(service['id']))
                if (service['running'] and service_id.service_type == MON
                        and service_id.fsid == fsid):
                    mon_fqdns.append(server['fqdn'])

        return mon_fqdns
Example No. 14
    def get_cluster_log(self, request, fsid):
        """
        Retrieve the cluster log from one of a cluster's mons (expect it to be in /var/log/ceph/ceph.log)
        """

        lines = request.GET.get('lines', 40)

        # Resolve FSID to name
        name = self.client.get_cluster(fsid)['name']

        # Resolve FSID to list of mon FQDNs
        servers = self.client.server_list_cluster(fsid)
        # Sort to get most recently contacted server first; drop any
        # for whom last_contact is None
        servers = [s for s in servers if s['last_contact']]
        servers = sorted(servers,
                         key=lambda t: dateutil_parse(t['last_contact']),
                         reverse=True)
        mon_fqdns = []
        for server in servers:
            for service in server['services']:
                service_id = ServiceId(*(service['id']))
                if (service['running'] and service_id.service_type == MON
                        and service_id.fsid == fsid):
                    mon_fqdns.append(server['fqdn'])

        client = salt.client.LocalClient(
            config.get('cthulhu', 'salt_config_path'))
        log.debug("LogTailViewSet: mons for %s are %s" % (fsid, mon_fqdns))
        # For each mon FQDN, try to go get ceph/$cluster.log, if we succeed return it, if we fail try the next one
        # NB this path is actually customizable in ceph as `mon_cluster_log_file`, but we assume the user hasn't done that.
        for mon_fqdn in mon_fqdns:
            results = client.cmd(mon_fqdn, "log_tail.tail",
                                 ["ceph/{name}.log".format(name=name), lines])
            if results:
                return Response({'lines': results[mon_fqdn]})
            else:
                log.info("Failed to get log from %s" % mon_fqdn)

        # If none of the mons gave us what we wanted, return a 503 service unavailable
        return Response("mon log data unavailable",
                        status=status.HTTP_503_SERVICE_UNAVAILABLE)
Example No. 15
    def _get_mons(self, fsid):
        mon_status = self.client.get_sync_object(fsid, 'mon_status')
        if not mon_status:
            raise Http404("No mon data available")

        mons = mon_status['monmap']['mons']
        service_ids = [ServiceId(fsid, MON, mon['name']) for mon in mons]
        services_info = self.client.status_by_service(service_ids)

        # Use this to invalidate any statuses we can prove are outdated
        lowest_valid_epoch = mon_status['election_epoch']

        # Step 1: account for the possibility that our cluster-wide mon_status object
        # could be out of date with respect to local mon_status data that we get
        # from each mon service.
        for mon, service_info in zip(mons, services_info):
            if service_info and service_info['status']:
                local_epoch = service_info['status']['election_epoch']
                if local_epoch > lowest_valid_epoch:
                    # Evidence that the cluster mon status is out of date, and we have
                    # a more recent one to replace it with.
                    log.warn(
                        "Using mon '%s' local status as it is most recent" %
                        (mon['name']))
                    mon_status = service_info['status']
                    lowest_valid_epoch = mon_status['election_epoch']
                elif (local_epoch == lowest_valid_epoch and
                      service_info['status']['quorum'] != mon_status['quorum']):
                    # Evidence that the cluster mon status is out of date, and we
                    # have to assume that anyone it claimed was in quorum no longer is.
                    log.warn(
                        "Disregarding cluster mon status because '%s' disagrees"
                        % (mon['name']))
                    lowest_valid_epoch = local_epoch + 1

        # Step 2: Reconcile what the cluster mon status thinks about this mon with
        # what it thinks about itself.
        for mon, service_info in zip(mons, services_info):
            mon['server'] = service_info['server'] if service_info else None

            cluster_opinion = (mon['rank'] in mon_status['quorum'] and
                               mon_status['election_epoch'] >= lowest_valid_epoch)
            if service_info is None or service_info['status'] is None:
                # Handle local data being unavailable, e.g. if our agent
                # is not installed on one or more mons
                mon['status'] = None
                mon['in_quorum'] = cluster_opinion
                continue

            status = service_info['status']
            mon['status'] = status

            local_opinion = service_info['running'] and (status['rank'] in status['quorum']) and \
                status['election_epoch'] >= lowest_valid_epoch

            if cluster_opinion != local_opinion:
                log.warn("mon %s/%s local state disagrees with cluster state" %
                         (mon['name'], mon['rank']))

                if status['election_epoch'] == 0 or not service_info['running']:
                    # You're claiming not to be in quorum, I believe you because I have
                    # no way of knowing the cluster map is more up to date than your info.
                    mon['in_quorum'] = local_opinion
                elif status['election_epoch'] < mon_status['election_epoch']:
                    # The cluster map is unambiguously more up to date than your info, so
                    # I believe it.
                    mon['in_quorum'] = cluster_opinion
                else:
                    # Your data is newer than the cluster map, I believe you.
                    mon['in_quorum'] = local_opinion
            else:
                mon['in_quorum'] = cluster_opinion

        # Step 3: special case, handle when our local inferences about mon status
        # make it impossible for us to believe what the cluster mon status is telling us.
        if len([m for m in mons if m['in_quorum']]) < (len(mons) / 2 + 1):
            log.warn(
                "Asserting that there is no quorum even if cluster map says there is"
            )
            # I think the cluster map is lying about there being a quorum at all
            for m in mons:
                m['in_quorum'] = False

        return mons
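The final check uses the usual strict-majority rule for a monitor quorum: at least floor(n/2) + 1 of n mons must report as in quorum. A tiny sketch of that arithmetic; `//` just makes the integer division explicit (the Python 2 original gets the same truncation from `/` on ints):

# Majority threshold used in Step 3.
def quorum_threshold(n_mons):
    return n_mons // 2 + 1


assert quorum_threshold(3) == 2   # 2 of 3 mons form a quorum
assert quorum_threshold(5) == 3   # 3 of 5 mons form a quorum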
Example No. 16
    def on_server_heartbeat(self, fqdn, server_heartbeat):
        """
        Callback for when a ceph.service message is received from a salt minion.

        This is actually a fairly simple operation of updating the in-memory ServerState
        to reflect what is in the message, but it's convoluted because we may be seeing
        a new server, a known server, or a server which was known but unmanaged.
        """
        log.debug("ServerMonitor.on_server_heartbeat: %s" % fqdn)
        new_server = True
        newly_managed_server = False
        try:
            server_state = self.servers[fqdn]
            new_server = False
        except KeyError:
            # Look up the grains for this server, we need to know its hostname in order
            # to resolve this vs. the OSD map.
            hostname = self._get_grains(fqdn)['host']

            if hostname in self.hostname_to_server:
                server_state = self.hostname_to_server[hostname]
                if not server_state.managed:
                    # Take over a ServerState that was created from OSD map
                    server_state.managed = True
                    old_fqdn = server_state.fqdn
                    # OSD map servers would have faked up FQDN as hostname, so clear that out
                    del self.servers[old_fqdn]
                    server_state.fqdn = fqdn
                    self.servers[server_state.fqdn] = server_state
                    self._persister.update_server(old_fqdn,
                                                  fqdn=fqdn,
                                                  managed=True)
                    new_server = False
                    log.info("Server %s went from unmanaged to managed" % fqdn)
                    newly_managed_server = True

                else:
                    # We will go on to treat these as distinct servers even though
                    # they have the same hostname
                    log.warn(
                        "Hostname clash: FQDNs '%s' and '%s' both have hostname %s"
                        % (fqdn, server_state.fqdn, hostname))
        else:
            # This is the case where hostname == FQDN; we may already have this FQDN
            # in our map from an unmanaged server being reported by hostname.
            if not server_state.managed:
                newly_managed_server = True
                server_state.managed = True
                self._persister.update_server(server_state.fqdn, managed=True)
                log.info("Server %s went from unmanaged to managed" % fqdn)

        boot_time = datetime.datetime.fromtimestamp(
            server_heartbeat['boot_time'], tz=tz.tzutc())
        if new_server:
            hostname = self._get_grains(fqdn)['host']
            server_state = ServerState(
                fqdn,
                hostname,
                managed=True,
                last_contact=now(),
                boot_time=boot_time,
                ceph_version=server_heartbeat['ceph_version'])
            self.inject_server(server_state)
            self._persister.create_server(
                Server(fqdn=server_state.fqdn,
                       hostname=server_state.hostname,
                       managed=server_state.managed,
                       last_contact=server_state.last_contact))
            log.info("Saw server %s for the first time" % server_state)

        server_state.last_contact = now()
        self._persister.update_server(server_state.fqdn,
                                      last_contact=server_state.last_contact)

        if server_state.boot_time != boot_time:
            log.warn("{0} boot time changed, old {1} new {2}".format(
                server_state.fqdn, server_state.boot_time, boot_time))
            old_boot_time = server_state.boot_time
            server_state.boot_time = boot_time
            self._persister.update_server(server_state.fqdn,
                                          boot_time=server_state.boot_time)
            if old_boot_time is not None:  # i.e. a reboot, not an unmanaged->managed transition
                if server_state.boot_time < old_boot_time:
                    log.warn("Server boot time went backwards")
                elif server_state.boot_time - old_boot_time < REBOOT_THRESHOLD:
                    log.warn("Server boot time changed, but only a little")
                else:
                    # A substantial forward change in boot time, that's a reboot: emit
                    # a user visible event
                    log.warn("{0} rebooted!".format(fqdn))
                    self._eventer.on_reboot(server_state, False)

        if server_state.ceph_version != server_heartbeat['ceph_version']:
            # Interpret "no package installed but some services running" as meaning we're
            # in the process of upgrading.
            upgrading = (server_heartbeat['ceph_version'] is None and
                         server_heartbeat['services'])
            if server_heartbeat['ceph_version'] is None and upgrading:
                # Ignore version=None while upgrading to avoid generating spurious
                # "ceph uninstalled" events
                pass
            else:
                server_state.ceph_version = server_heartbeat['ceph_version']
                self._persister.update_server(
                    server_state.fqdn, ceph_version=server_state.ceph_version)
                if not (new_server or newly_managed_server):
                    self._eventer.on_new_version(server_state)

        seen_id_tuples = set()
        for service_name, service in server_heartbeat['services'].items():
            id_tuple = ServiceId(service['fsid'], service['type'],
                                 service['id'])
            seen_id_tuples.add(id_tuple)
            self._register_service(server_state,
                                   id_tuple,
                                   running=True,
                                   status=service['status'])

        # For any service which was last reported on this server but
        # is now gone, mark it as not running
        for unseen_id_tuple in set(server_state.services.keys()) ^ seen_id_tuples:
            service_state = self.services[unseen_id_tuple]
            if service_state.running:
                log.info("Service %s stopped on server %s" %
                         (service_state, server_state))
                service_state.running = False

        if new_server or newly_managed_server:
            # We do this at the end so that by the time we emit the event
            # the ServiceState objects have been created
            self._eventer.on_server(server_state)
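This handler reads `boot_time`, `ceph_version` and a `services` dict whose entries carry `fsid`, `type`, `id` and `status`, which matches the faked heartbeat built in Example 8. A hedged, illustrative sketch of that payload shape; the concrete values and the 'osd.0' key are made up:

# Illustrative heartbeat payload, reconstructed from the fields read above and
# from Example 8; not captured from a real minion.
server_heartbeat = {
    'boot_time': 1500000000,      # UNIX timestamp, parsed with fromtimestamp() above
    'ceph_version': '0.94.5',
    'services': {
        'osd.0': {
            'fsid': 'fsid-1234',
            'type': 'osd',
            'id': '0',
            'status': None,
        },
    },
}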
Example No. 17
    def on_osd_map(self, osd_map):
        """
        For when a new OSD map is received: we may infer the existence of
        hosts from the CRUSH map if the hosts are not all sending
        us data with salt.

        :param osd_map: The data from an OsdMap sync object
        """
        log.debug("ServerMonitor.on_osd_map: epoch %s" % osd_map['epoch'])

        hostname_to_osds = self.get_hostname_to_osds(osd_map)
        log.debug("ServerMonitor.on_osd_map: got service data for %s servers" %
                  len(hostname_to_osds))

        osds_in_map = set()
        for hostname, osds in hostname_to_osds.items():
            id_to_osd = dict([(ServiceId(osd_map['fsid'], 'osd',
                                         str(o['osd'])), o) for o in osds])
            osds_in_map |= set(id_to_osd.keys())

            # Identify if this is a CRUSH alias rather than a real hostname, by
            # checking if any of the OSDs mentioned are already recorded as children
            # of a managed host.
            crush_alias_to = None
            if hostname not in self.hostname_to_server:
                for service_id, osd in id_to_osd.items():
                    try:
                        service_state = self.services[service_id]
                        if service_state.server_state.managed:
                            crush_alias_to = service_state.server_state
                    except KeyError:
                        pass

            if crush_alias_to:
                log.info("'{0}' is a CRUSH alias to {1}".format(
                    hostname, crush_alias_to))
                continue

            # Look up or create ServerState for the server named in the CRUSH map
            try:
                server_state = self.hostname_to_server[hostname]
            except KeyError:
                # Fake FQDN to equal hostname
                server_state = ServerState(hostname,
                                           hostname,
                                           managed=False,
                                           last_contact=None,
                                           boot_time=None,
                                           ceph_version=None)
                self.inject_server(server_state)
                self._persister.create_server(
                    Server(fqdn=server_state.fqdn,
                           hostname=server_state.hostname,
                           managed=server_state.managed))

            # Register all the OSDs reported under this hostname with the ServerState
            for service_id, osd in id_to_osd.items():
                if not server_state.managed:
                    # Only pay attention to these services for unmanaged servers;
                    # for managed servers, rely on ceph/server salt messages
                    self._register_service(server_state, service_id,
                                           bool(osd['up']), None)

        # Remove ServiceState for any OSDs for this FSID which are not
        # mentioned in hostname_to_osds
        known_osds = set([
            s.id for s in self.fsid_services[osd_map['fsid']]
            if s.service_type == 'osd'
        ])
        for stale_service_id in known_osds - osds_in_map:
            self.forget_service(self.services[stale_service_id])
Example No. 18
    def id(self):
        return ServiceId(self.fsid, self.service_type, self.service_id)
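Throughout these examples ServiceId is constructed positionally as (fsid, service_type, service_id), compared by value, and used as a set member and dict key, all of which is consistent with a namedtuple. A minimal sketch of such a definition, offered as an assumption rather than a copy of the original source:

# Assumed definition, consistent with how ServiceId is used above
# (positional construction, attribute access, hashable set membership).
from collections import namedtuple

ServiceId = namedtuple('ServiceId', ['fsid', 'service_type', 'service_id'])

sid = ServiceId('fsid-1234', 'osd', '0')
assert sid.service_type == 'osd'
assert sid == ServiceId(*('fsid-1234', 'osd', '0'))   # positional unpacking as in Example 4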