def __init__(self, fsid, cluster_name):
    """Requiring cluster_name and fsid is redundant (ideally everything
    would speak in terms of fsid) but convenient, because the librados
    interface wants a cluster name when you create a client, and
    otherwise we would have to look up via ceph.conf.
    """
    # getChild isn't in 2.6
    self.requested_at = now()   # creation timestamp
    self.completed_at = None    # set when the request finishes

    # This is actually kind of overkill compared with having a counter,
    # somewhere but it's easy.
    # Idiomatic str() instead of calling __str__() directly.
    self.id = str(uuid.uuid4())

    self._minion_id = None
    self.fsid = fsid
    self._cluster_name = cluster_name

    # Salt job id; None until a job is dispatched — TODO confirm against caller
    self.jid = None

    self.state = self.NEW       # NEW/COMPLETE are class-level constants (not visible here)
    self.result = None
    self.error = False
    self.error_message = ""

    # Time at which we last believed the current JID to be really running
    self.alive_at = None
def on_sync_object(self, data):
    """Handle a newly reported cluster map.

    Persists the map via self._persister and, for osd_map updates,
    refreshes per-pool utilization records.

    :param data: dict with keys 'fsid', 'type', 'version', 'data'
                 (map payload) — schema inferred from usage; confirm
                 against the message producer.
    """
    # Reject messages from a different cluster outright.
    assert data['fsid'] == self.fsid

    sync_object = data['data']

    sync_type = SYNC_OBJECT_STR_TYPE[data['type']]
    # inject_sync_object returns a new object only when this version
    # advances our state; a falsy result means the update was stale.
    new_object = self.inject_sync_object(data['type'], data['version'],
                                         sync_object)
    if new_object:
        self._persister.update_sync_object(
            str(time.time()),
            self.fsid,
            self.name,
            sync_type.str,
            # Non-int versions (e.g. hash-style versions) are stored as None.
            new_object.version if isinstance(
                new_object.version, int) else None,
            now(),
            sync_object,
            self._manager.cluster_id)

        if sync_type.str == "osd_map":
            util_data = self._get_utilization_data()
            for raw_pool in sync_object.get('pools', []):
                LOG.info("Updating Pool %s" % raw_pool['pool_name'])
                # Match the osd_map pool against the utilization report
                # by name to pick up usage figures.
                for pool in util_data['pools']:
                    if pool['name'] == raw_pool['pool_name']:
                        pool_used = pool['used']
                        pcnt = pool['pcnt_used']
                        # NOTE(review): 'pool' is rebound here from the
                        # loop's dict to a Pool model object — confusing
                        # but harmless since the next iteration rebinds it.
                        pool = Pool(updated=str(time.time()),
                                    cluster_id=self._manager.cluster_id,
                                    pool_id=raw_pool['pool'],
                                    poolname=raw_pool['pool_name'],
                                    pg_num=raw_pool['pg_num'],
                                    min_size=raw_pool['min_size'],
                                    used=pool_used,
                                    percent_used=pcnt)
                        self._persister.update_pool(pool)
    else:
        # NOTE(review): Logger.warn is a deprecated alias of warning()
        # on stdlib loggers — confirm whether LOG is a stdlib logger.
        LOG.warn("ClusterMonitor.on_sync_object: stale object"
                 " received for %s" % data['type'])
def fetch(self, sync_type):
    """Kick off retrieval of one cluster map, recording when we started.

    The recorded start time lets on_version() detect a fetch that is
    already underway for this map type.
    """
    LOG.debug("SyncObjects.fetch: %s" % sync_type)
    started = now()
    self._fetching_at[sync_type] = started
    # TODO(Rohan) clean up unused 'since' argument
    result = ceph.get_cluster_object(self._cluster_name, sync_type.str, None)
    return result
def fetch(self, sync_type):
    """Start retrieving the given map type from the cluster.

    Stamps the fetch start time so callers can tell a request is
    already in flight for this map type.
    """
    debug_msg = Message(
        priority="debug",
        publisher=NS.publisher_id,
        payload={"message": "SyncObjects.fetch: %s" % sync_type})
    Event(debug_msg)
    self._fetching_at[sync_type] = now()
    # TODO(Rohan) clean up unused 'since' argument
    return ceph.get_cluster_object(self._cluster_name, sync_type.str)
def complete(self):
    """Mark this request finished and record the completion time.

    Must only be called once, and only after any outstanding salt job
    id has been cleared.
    """
    assert self.state != self.COMPLETE
    assert self.jid is None
    LOG.info("Request %s completed with error=%s (%s)" %
             (self.id, self.error, self.error_message))
    self.completed_at = now()
    self.state = self.COMPLETE
def on_sync_object(self, data):
    """Handle a newly reported cluster map.

    Saves the map as a SyncObject and, for osd_map updates, saves a
    Pool record for each pool found in the utilization report.

    :param data: dict with keys 'fsid', 'type', 'version', 'data'
                 — schema inferred from usage; confirm against producer.
    """
    # Reject messages from a different cluster outright.
    assert data['fsid'] == self.fsid

    sync_object = data['data']

    sync_type = SYNC_OBJECT_STR_TYPE[data['type']]
    # Falsy result means the incoming version did not advance our state.
    new_object = self.inject_sync_object(data['type'], data['version'],
                                         sync_object)
    if new_object:
        tendrl_ns.ceph_integration.objects.SyncObject(
            updated=now(),
            sync_type=sync_type.str,
            # Non-int versions (e.g. hash-style) are stored as None.
            version=new_object.version if isinstance(
                new_object.version, int) else None,
            when=now(),
            data=sync_object).save()

        if sync_type.str == "osd_map":
            util_data = self._get_utilization_data()
            for raw_pool in sync_object.get('pools', []):
                LOG.info("Updating Pool %s" % raw_pool['pool_name'])
                # Join the osd_map pool to utilization data by name.
                for pool in util_data['pools']:
                    if pool['name'] == raw_pool['pool_name']:
                        pool_used = pool['used']
                        pcnt = pool['pcnt_used']
                        tendrl_ns.ceph_integration.objects.Pool(
                            pool_id=raw_pool['pool'],
                            pool_name=raw_pool['pool_name'],
                            pg_num=raw_pool['pg_num'],
                            min_size=raw_pool['min_size'],
                            used=pool_used,
                            percent_used=pcnt).save()
    else:
        # NOTE(review): Logger.warn is a deprecated alias of warning()
        # on stdlib loggers — confirm whether LOG is a stdlib logger.
        LOG.warn("ClusterMonitor.on_sync_object: stale object"
                 " received for %s" % data['type'])
def complete(self):
    """Mark this request finished and stamp its completion time.

    Must not be called twice, and only after any outstanding salt job
    id has been cleared.
    """
    assert self.state != self.COMPLETE
    assert self.jid is None
    summary = "Request %s completed with error=%s (%s)" % (
        self.id, self.error, self.error_message)
    Event(
        Message(priority="info",
                publisher=NS.publisher_id,
                payload={"message": summary}))
    self.completed_at = now()
    self.state = self.COMPLETE
def on_version(self, sync_type, new_version):
    """Notify me that a particular version of a particular map exists.

    I may choose to initiate RPC to retrieve the map
    """
    LOG.debug(
        "SyncObjects.on_version %s/%s" % (sync_type.str, new_version)
    )
    old_version = self.get_version(sync_type)
    # Only act when the advertised version is ahead of what we hold.
    if sync_type.cmp(new_version, old_version) > 0:
        known_version = self._known_versions[sync_type]
        if sync_type.cmp(new_version, known_version) > 0:
            # We are out of date: request an up to date copy
            LOG.info("Advanced known version %s/%s %s->%s" % (
                self._cluster_name, sync_type.str, known_version,
                new_version))
            self._known_versions[sync_type] = new_version
        else:
            LOG.info(
                "on_version: %s is newer than %s" % (
                    new_version, old_version
                )
            )
        # If we already have a request out for this type of map,
        # then consider cancelling it if we've already waited for
        # a while.
        if self._fetching_at[sync_type] is not None:
            if now() - self._fetching_at[sync_type] < self.FETCH_TIMEOUT:
                # Recent fetch still in flight: don't pile up requests.
                LOG.info("Fetch already underway for %s" % sync_type.str)
                return
            else:
                # Timed out: fall through and issue a fresh fetch.
                LOG.warn("Abandoning fetch for %s started at %s" % (
                    sync_type.str, self._fetching_at[sync_type]))
        LOG.info(
            "on_version: fetching %s/%s , "
            "currently got %s, know %s" % (
                sync_type, new_version, old_version, known_version
            )
        )
        return self.fetch(sync_type)
def _emit(self, severity, message, **associations):
    """Queue a new event for later delivery.

    :param severity: One of the defined severity values
    :param message: One line human readable string
    :param associations: Optional extra attributes to associate the
        event with a particular cluster/server/service
    """
    timestamp = now()
    LOG.info("Eventer._emit: %s/%s/%s" % (timestamp,
                                          severity_str(severity),
                                          message))
    event = Event(id=str(uuid.uuid4()),
                  when=timestamp,
                  message=message,
                  severity=severity,
                  **associations)
    self._events.append(event)
def on_server_heartbeat(self, fqdn, server_heartbeat):
    """Call back for when a ceph.service message is received from a
    salt minion.

    This is actually a fairly simple operation of updating the in
    memory ServerState to reflect what is in the message, but it's
    convoluted because we may be seeing a new server, a known server,
    or a server which was known but unmanaged.

    :param fqdn: fully qualified domain name of the reporting minion
    :param server_heartbeat: dict with at least 'services' (mapping),
        'boot_time' (epoch seconds) and 'ceph_version' — schema
        inferred from usage; confirm against the heartbeat producer.
    """
    LOG.debug("ServerMonitor.on_server_heartbeat: %s" % fqdn)
    new_server = True
    newly_managed_server = False
    try:
        server_state = self.servers[fqdn]
        new_server = False
    except KeyError:
        # Look up the grains for this server, we need to know its
        # hostname in order to resolve this vs. the OSD map.
        hostname = fqdn

        if hostname in self.hostname_to_server:
            server_state = self.hostname_to_server[hostname]
            if not server_state.managed:
                # Take over a ServerState that was created from OSD map
                server_state.managed = True
                old_fqdn = server_state.fqdn
                # OSD map servers would have faked up FQDN as hostname,
                # so clear that out
                del self.servers[old_fqdn]
                server_state.fqdn = fqdn
                self.servers[server_state.fqdn] = server_state
                # NOTE(review): the break makes this persist only the
                # first service's fsid — presumably intentional since
                # all services share one fsid; confirm.
                for service_name, service in server_heartbeat[
                        'services'].items():
                    self._persister.create_server(
                        Server(
                            fsid=service['fsid'],
                            fqdn=server_state.fqdn,
                            managed=True,
                        ))
                    break
                new_server = False
                LOG.info("Server %s went from unmanaged to managed" %
                         fqdn)
                newly_managed_server = True
            else:
                # We will go on to treat these as distinct servers even
                # though they have the same hostname
                LOG.warn("Hostname clash: FQDNs '%s' and"
                         " '%s' both have hostname %s" %
                         (fqdn, server_state.fqdn, hostname))
    else:
        # The case where hostname == FQDN, we may already have this
        # FQDN in our map from an unmanaged server being reported by
        # hostname.
        if not server_state.managed:
            newly_managed_server = True
            server_state.managed = True
            # As above: only the first service's fsid is persisted.
            for service_name, service in server_heartbeat[
                    'services'].items():
                self._persister.create_server(
                    Server(
                        fsid=service['fsid'],
                        fqdn=server_state.fqdn,
                        managed=True,
                    ))
                LOG.info("Server %s went from unmanaged to managed" %
                         fqdn)
                break

    boot_time = datetime.datetime.fromtimestamp(
        server_heartbeat['boot_time'], tz=tz.tzutc())
    if new_server:
        hostname = fqdn
        server_state = ServerState(
            fqdn, hostname, managed=True,
            last_contact=now(), boot_time=boot_time,
            ceph_version=server_heartbeat['ceph_version'])
        self.inject_server(server_state)
        for service_name, service in server_heartbeat['services'].items():
            self._persister.create_server(
                Server(fsid=service['fsid'],
                       fqdn=server_state.fqdn,
                       hostname=server_state.hostname,
                       managed=server_state.managed,
                       last_contact=server_state.last_contact,
                       boot_time=boot_time,
                       ceph_version=server_heartbeat['ceph_version']))
            LOG.info("Saw server %s for the first time" % server_state)
            break

    server_state.last_contact = now()
    for service_name, service in server_heartbeat['services'].items():
        self._persister.create_server(
            Server(
                fsid=service['fsid'],
                fqdn=server_state.fqdn,
                last_contact=server_state.last_contact,
            ))
        break

    if server_state.boot_time != boot_time:
        LOG.warn("{0} boot time changed, old {1} new {2}".format(
            server_state.fqdn, server_state.boot_time, boot_time))
        old_boot_time = server_state.boot_time
        server_state.boot_time = boot_time
        for service_name, service in server_heartbeat['services'].items():
            self._persister.create_server(
                Server(
                    fsid=service['fsid'],
                    fqdn=server_state.fqdn,
                    boot_time=server_state.boot_time,
                ))
            break
        if old_boot_time is not None:
            # i.e. a reboot, not an unmanaged->managed transition
            if server_state.boot_time < old_boot_time:
                LOG.warn("Server boot time went backwards")
            elif server_state.boot_time - old_boot_time < REBOOT_THRESHOLD:
                LOG.warn("Server boot time changed, but only a little")
            else:
                # A substantial forward change in boot time, that's a
                # reboot: emit a user visible event
                LOG.warn("{0} rebooted!".format(fqdn))
                self._eventer.on_reboot(server_state, False)

    if server_state.ceph_version != server_heartbeat['ceph_version']:
        # Interpret "no package installed but some services running" as
        # meaning we're in the process of upgrading.
        upgrading = server_heartbeat[
            'ceph_version'] is None and server_heartbeat['services']
        if server_heartbeat['ceph_version'] is None and upgrading:
            # Ignore version=None while upgrading to avoid generating
            # spurious "ceph uninstalled" events
            pass
        else:
            server_state.ceph_version = server_heartbeat['ceph_version']
            for service_name, service in server_heartbeat[
                    'services'].items():
                self._persister.create_server(
                    Server(
                        fsid=service['fsid'],
                        fqdn=server_state.fqdn,
                        ceph_version=server_state.ceph_version,
                    ))
                break
            if not (new_server or newly_managed_server):
                self._eventer.on_new_version(server_state)

    seen_id_tuples = set()
    for service_name, service in server_heartbeat['services'].items():
        id_tuple = ServiceId(service['fsid'], service['type'],
                             service['id'])
        seen_id_tuples.add(id_tuple)
        self._register_service(server_state, id_tuple, running=True,
                               status=service['status'],
                               fsid=service['fsid'],
                               fqdn=fqdn)

    # For any service which was last reported on this server but
    # is now gone, mark it as not running
    # NOTE(review): this uses symmetric difference (^) rather than
    # set difference (-); it is only equivalent if _register_service
    # has added every seen id to server_state.services — otherwise the
    # self.services lookup below could KeyError. Confirm.
    for unseen_id_tuple in set(
            server_state.services.keys()) ^ seen_id_tuples:
        service_state = self.services[unseen_id_tuple]
        if service_state.running:
            LOG.info("Service %s stopped on server %s" %
                     (service_state, server_state))
            service_state.running = False

    if new_server or newly_managed_server:
        # We do this at the end so that by the time we emit the event
        # the ServiceState objects have been created
        self._eventer.on_server(server_state)
def on_sync_object(self, data):
    """Handle a newly reported cluster map.

    Saves the map as a SyncObject; for "health" updates saves the
    overall status, and for "osd_map" updates saves cluster-wide
    utilization plus one Pool record per matched pool.

    :param data: dict with keys 'fsid', 'type', 'version', 'data'
                 — schema inferred from usage; confirm against producer.
    """
    # Reject messages from a different cluster outright.
    assert data['fsid'] == self.fsid

    # Deep-copied so mutations during processing don't touch the
    # original payload (data['data'] is persisted as-is below).
    sync_object = copy.deepcopy(data['data'])

    sync_type = SYNC_OBJECT_STR_TYPE[data['type']]
    # Falsy result means the incoming version did not advance our state.
    new_object = self.inject_sync_object(data['type'], data['version'],
                                         sync_object)
    if new_object:
        NS.ceph.objects.SyncObject(
            updated=now(), sync_type=sync_type.str,
            # Non-int versions (e.g. hash-style) are stored as None.
            version=new_object.version if isinstance(
                new_object.version, int) else None,
            when=now(), data=data['data']).save()

        if sync_type.str == "health":
            NS.ceph.objects.GlobalDetails(
                status=sync_object['overall_status']).save()
        if sync_type.str == "osd_map":
            util_data = self._get_utilization_data()
            NS.ceph.objects.Utilization(
                total=util_data['cluster']['total'],
                used=util_data['cluster']['used'],
                available=util_data['cluster']['available'],
                pcnt_used=util_data['cluster']['pcnt_used']).save()
            for raw_pool in sync_object.get('pools', []):
                Event(
                    Message(priority="info",
                            publisher=NS.publisher_id,
                            payload={"message": "Updating Pool %s" %
                                     raw_pool['pool_name']}))
                # Join the osd_map pool to utilization data by name;
                # pools absent from util_data are skipped entirely.
                for pool in util_data['pools']:
                    if pool['name'] == raw_pool['pool_name']:
                        pool_used = pool['used']
                        pcnt = pool['pcnt_used']
                        # Non-empty erasure_code_profile marks an
                        # erasure coded pool.
                        pool_type = 'replicated'
                        if 'erasure_code_profile' in raw_pool and \
                                raw_pool['erasure_code_profile'] != "":
                            pool_type = 'erasure_coded'
                        # Quota is "enabled" if either max objects or
                        # max bytes is set to a positive value.
                        quota_enabled = False
                        if ('quota_max_objects' in raw_pool and
                            raw_pool['quota_max_objects'] > 0) or \
                            ('quota_max_bytes' in raw_pool and
                             raw_pool['quota_max_bytes'] > 0):
                            quota_enabled = True
                        NS.ceph.objects.Pool(
                            pool_id=raw_pool['pool'],
                            pool_name=raw_pool['pool_name'],
                            pg_num=raw_pool['pg_num'],
                            type=pool_type,
                            erasure_code_profile=raw_pool.get(
                                'erasure_code_profile'),
                            min_size=raw_pool['min_size'],
                            size=raw_pool.get('size', None),
                            quota_enabled=quota_enabled,
                            quota_max_objects=raw_pool[
                                'quota_max_objects'],
                            quota_max_bytes=raw_pool['quota_max_bytes'],
                            used=pool_used,
                            percent_used=pcnt).save()
    else:
        Event(
            Message(priority="warning",
                    publisher=NS.publisher_id,
                    payload={"message": "ClusterMonitor.on_sync_object: "
                                        "stale object received for %s" %
                                        data['type']}))
def on_sync_object(self, data):
    """Handle a newly reported cluster map.

    Feeds the map to the request collection, saves it as a SyncObject,
    and for "health"/"osd_map" updates persists global status,
    utilization, Pool and Osd records.

    :param data: dict with keys 'fsid', 'type', 'version', 'data'
                 — schema inferred from usage; confirm against producer.
    """
    # Reject messages from a different cluster outright.
    assert data['fsid'] == self.fsid

    # Deep-copied so mutations during processing don't touch the
    # original payload (data['data'] is persisted as-is below).
    sync_object = copy.deepcopy(data['data'])

    sync_type = SYNC_OBJECT_STR_TYPE[data['type']]
    new_object = self.inject_sync_object(data['type'], data['version'],
                                         sync_object)
    # Let in-flight user requests observe the new map (even when stale).
    self._request_coll.on_map(sync_type, new_object)
    if new_object:
        NS.ceph.objects.SyncObject(
            updated=now(), sync_type=sync_type.str,
            # Non-int versions (e.g. hash-style) are stored as None.
            version=new_object.version if isinstance(
                new_object.version, int) else None,
            when=now(), data=data['data']).save(update=False)

        if sync_type.str == "health":
            NS.ceph.objects.GlobalDetails(
                status=sync_object['overall_status']).save()
        if sync_type.str == "osd_map":
            util_data = self._get_utilization_data()
            NS.ceph.objects.Utilization(
                total=util_data['cluster']['total'],
                used=util_data['cluster']['used'],
                available=util_data['cluster']['available'],
                pcnt_used=util_data['cluster']['pcnt_used']).save()
            for raw_pool in sync_object.get('pools', []):
                Event(
                    Message(priority="info",
                            publisher=NS.publisher_id,
                            payload={"message": "Updating Pool %s" %
                                     raw_pool['pool_name']}))
                # Defaults cover pools missing from the utilization
                # report; the Pool record is saved either way.
                pool_used = 0
                pcnt = 0
                for pool in util_data['pools']:
                    if pool['name'] == raw_pool['pool_name']:
                        pool_used = pool['used']
                        pcnt = pool['pcnt_used']
                # Non-empty erasure_code_profile marks an erasure
                # coded pool.
                pool_type = 'replicated'
                if 'erasure_code_profile' in raw_pool and \
                        raw_pool['erasure_code_profile'] != "":
                    pool_type = 'erasure_coded'
                # Quota is "enabled" if either max objects or max
                # bytes is set to a positive value.
                quota_enabled = False
                if ('quota_max_objects' in raw_pool and
                    raw_pool['quota_max_objects'] > 0) or \
                    ('quota_max_bytes' in raw_pool and
                     raw_pool['quota_max_bytes'] > 0):
                    quota_enabled = True
                NS.ceph.objects.Pool(
                    pool_id=raw_pool['pool'],
                    pool_name=raw_pool['pool_name'],
                    pg_num=raw_pool['pg_num'],
                    type=pool_type,
                    erasure_code_profile=raw_pool.get(
                        'erasure_code_profile'),
                    min_size=raw_pool['min_size'],
                    size=raw_pool.get('size', None),
                    quota_enabled=quota_enabled,
                    quota_max_objects=raw_pool['quota_max_objects'],
                    quota_max_bytes=raw_pool['quota_max_bytes'],
                    used=pool_used,
                    percent_used=pcnt).save()
            for raw_osd in sync_object.get('osds', []):
                Event(
                    Message(priority="info",
                            publisher=NS.publisher_id,
                            payload={"message": "Updating OSD %s" %
                                     raw_osd['osd']}))
                NS.ceph.objects.Osd(
                    id=raw_osd['osd'],
                    uuid=raw_osd['uuid'],
                    public_addr=raw_osd['public_addr'],
                    cluster_addr=raw_osd['cluster_addr'],
                    heartbeat_front_addr=raw_osd['heartbeat_front_addr'],
                    heartbeat_back_addr=raw_osd['heartbeat_back_addr'],
                    down_at=raw_osd['down_at'],
                    up_from=raw_osd['up_from'],
                    lost_at=raw_osd['lost_at'],
                    osd_up=raw_osd['up'],
                    osd_in=raw_osd['in'],
                    up_thru=raw_osd['up_thru'],
                    weight=str(raw_osd['weight']),
                    primary_affinity=str(raw_osd['primary_affinity']),
                    state=raw_osd['state'],
                    last_clean_begin=raw_osd['last_clean_begin'],
                    last_clean_end=raw_osd['last_clean_end']).save()
    else:
        Event(
            Message(priority="warning",
                    publisher=NS.publisher_id,
                    payload={"message": "ClusterMonitor.on_sync_object: "
                                        "stale object received for %s" %
                                        data['type']}))
def on_version(self, sync_type, new_version):
    """Notify me that a particular version of a particular map exists.

    I may choose to initiate RPC to retrieve the map
    """
    Event(
        Message(
            priority="debug",
            publisher=NS.publisher_id,
            payload={"message": "SyncObjects.on_version %s/%s" %
                     (sync_type.str, new_version)}
        )
    )
    old_version = self.get_version(sync_type)
    # Only act when the advertised version is ahead of what we hold.
    if sync_type.cmp(new_version, old_version) > 0:
        known_version = self._known_versions[sync_type]
        if sync_type.cmp(new_version, known_version) > 0:
            # We are out of date: request an up to date copy
            Event(
                Message(
                    priority="info",
                    publisher=NS.publisher_id,
                    payload={"message": "Advanced known version %s/%s "
                                        "%s->%s" % (self._cluster_name,
                                                    sync_type.str,
                                                    known_version,
                                                    new_version)}
                )
            )
            self._known_versions[sync_type] = new_version
        else:
            Event(
                Message(
                    priority="info",
                    publisher=NS.publisher_id,
                    payload={"message": "on_version: %s is newer than"
                                        " %s" % (new_version,
                                                 old_version)}
                )
            )
        # If we already have a request out for this type of map,
        # then consider cancelling it if we've already waited for
        # a while.
        if self._fetching_at[sync_type] is not None:
            if now() - self._fetching_at[sync_type] < self.FETCH_TIMEOUT:
                # Recent fetch still in flight: don't pile up requests.
                Event(
                    Message(
                        priority="info",
                        publisher=NS.publisher_id,
                        payload={"message": "Fetch already underway"
                                            " for %s" % sync_type.str}
                    )
                )
                return
            else:
                # Timed out: fall through and issue a fresh fetch.
                Event(
                    Message(
                        priority="debug",
                        publisher=NS.publisher_id,
                        payload={"message": "Abandoning fetch for %s "
                                            "started at %s" %
                                            (sync_type.str,
                                             self._fetching_at[sync_type])}
                    )
                )
        Event(
            Message(
                priority="info",
                publisher=NS.publisher_id,
                payload={"message": "on_version: fetching %s/%s , "
                                    "currently got %s, know %s" %
                                    (sync_type, new_version,
                                     old_version, known_version)}
            )
        )
        return self.fetch(sync_type)
def test_now():
    """util.now() must return a datetime.datetime instance.

    Uses isinstance() rather than an exact type(...) identity check
    (PEP 8 idiom; also accepts datetime subclasses).
    """
    assert isinstance(util.now(), datetime.datetime)