def fetch(self, minion_id, sync_type):
    log.debug("SyncObjects.fetch: %s/%s" % (minion_id, sync_type))
    if minion_id is None:
        # We're probably being replayed to from the database
        log.warn("SyncObjects.fetch called with minion_id=None")
        return

    self._fetching_at[sync_type] = now()
    client = LocalClient(config.get('cthulhu', 'salt_config_path'))
    # TODO clean up unused 'since' argument
    pub_data = client.run_job(minion_id, 'ceph.get_cluster_object',
                              condition_kwarg([], {'cluster_name': self._cluster_name,
                                                   'sync_type': sync_type.str,
                                                   'since': None}))
    if not pub_data:
        log.error("Failed to start fetch job %s/%s" % (minion_id, sync_type))
        # Don't throw an exception because if a fetch fails we should
        # end up issuing another on the next heartbeat
    else:
        log.debug("SyncObjects.fetch: jid=%s minions=%s" % (pub_data['jid'], pub_data['minions']))
def on_sync_object(self, fsid, sync_type, new, old):
    """
    Notification that a newer version of a SyncObject is available, or
    the first version of a SyncObject is available at startup (wherein
    old will be a null SyncObject)

    :param fsid: The FSID of the cluster to which the object belongs
    :param sync_type: A SyncObject subclass
    :param new: A SyncObject
    :param old: A SyncObject (same type as new)
    """
    log.debug("Eventer.on_sync_object: %s" % sync_type.str)

    if old.data is None:
        return

    if sync_type == OsdMap:
        self._on_pool_status(fsid, new, old)
        self._on_osd_map(fsid, new, old)
    elif sync_type == Health:
        self._on_health(fsid, new, old)
    elif sync_type == MonStatus:
        self._on_mon_status(fsid, new, old)
    elif sync_type == QuorumStatus:
        self._on_quorum_status(fsid, new, old)

    self._flush()
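# A minimal sketch of the "null SyncObject" convention the guard above
# relies on: at startup `old` is a placeholder whose .data is None, so
# diff-based event generation is skipped until a real previous version
# exists. The class below is illustrative, not calamari's real SyncObject.
class _NullSyncObject(object):
    version = None
    data = None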
def on_fetch_complete(self, minion_id, sync_type, version, data):
    """
    :return A SyncObject if this version was new to us, else None
    """
    log.debug("SyncObjects.on_fetch_complete %s/%s/%s" % (minion_id, sync_type.str, version))
    self._fetching_at[sync_type] = None

    # A fetch might give us a newer version than we knew we had asked for
    if sync_type.cmp(version, self._known_versions[sync_type]) > 0:
        self._known_versions[sync_type] = version

    # Don't store this if we already got something newer
    if sync_type.cmp(version, self.get_version(sync_type)) <= 0:
        log.warn("Ignoring outdated update %s/%s from %s" % (sync_type.str, version, minion_id))
        new_object = None
    else:
        log.info("Got new version %s/%s" % (sync_type.str, version))
        new_object = self.set_map(sync_type, version, data)

    # This might not be the latest: if it's not, send out another fetch
    # right away
    if sync_type.cmp(self._known_versions[sync_type], version) > 0:
        self.fetch(minion_id, sync_type)

    return new_object
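# Hedged sketch of the sync_type.cmp() contract assumed above: it must
# return >0 when the first version is newer, 0 when equal, and <0 when
# older. For epoch-counted maps (e.g. OsdMap) this is plain integer
# comparison; hash-versioned data can only meaningfully test equality.
# The helper below is an illustration, not the real SyncObject API.
def _epoch_cmp(a, b):
    return (a > b) - (a < b)

assert _epoch_cmp(6, 5) > 0  # epoch 6 supersedes epoch 5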
def _run(self):
    log.info("%s running" % self.__class__.__name__)

    event = SaltEventSource(log, salt_config)

    while not self._complete.is_set():
        # No salt tag filtering: https://github.com/saltstack/salt/issues/11582
        ev = event.get_event(full=True)

        if ev is not None and 'tag' in ev:
            tag = ev['tag']
            data = ev['data']
            try:
                if tag.startswith("ceph/cluster/"):
                    cluster_data = data['data']
                    if not cluster_data['fsid'] in self._manager.clusters:
                        self._manager.on_discovery(data['id'], cluster_data)
                    else:
                        log.debug("%s: heartbeat from existing cluster %s" % (
                            self.__class__.__name__, cluster_data['fsid']))
                elif re.match(r"^salt/job/\d+/ret/[^/]+$", tag):
                    if data['fun'] == 'saltutil.running':
                        self._manager.requests.on_tick_response(data['id'], data['return'])
                    else:
                        self._manager.requests.on_completion(data)
                else:
                    # This does not concern us, ignore it
                    log.debug("TopLevelEvents: ignoring %s" % tag)
            except:
                log.exception("Exception handling message tag=%s" % tag)

    log.info("%s complete" % self.__class__.__name__)
def _run(self):
    log.info("%s running" % self.__class__.__name__)

    event = SaltEventSource(salt_config)

    while not self._complete.is_set():
        # No salt tag filtering: https://github.com/saltstack/salt/issues/11582
        ev = event.get_event(full=True)

        if ev is not None:
            tag = ev['tag']
            data = ev['data']
            try:
                if tag.startswith("ceph/cluster/"):
                    cluster_data = data['data']
                    if not cluster_data['fsid'] in self._manager.clusters:
                        self._manager.on_discovery(data['id'], cluster_data)
                    else:
                        log.debug("%s: heartbeat from existing cluster %s" % (
                            self.__class__.__name__, cluster_data['fsid']))
                else:
                    # This does not concern us, ignore it
                    pass
            except:
                log.debug("Message content: %s" % data)
                log.exception("Exception handling message")

    log.info("%s complete" % self.__class__.__name__)
def _is_favorite(self, minion_id):
    """
    Check if this minion is the one which we are currently treating as
    the primary source of updates, and promote it to be the favourite
    if the current favourite has not sent a heartbeat within its
    timeout (contact period * FAVORITE_TIMEOUT_FACTOR).

    :return True if this minion was the favorite or has just been
    promoted.
    """
    t_now = now()
    self._last_heartbeat[minion_id] = t_now

    if self._favorite_mon is None:
        log.debug("%s is my new favourite" % minion_id)
        self._set_favorite(minion_id)
        return True
    elif minion_id != self._favorite_mon:
        # Consider whether this minion should become my new favourite:
        # has it been too long since my current favourite reported in?
        time_since = t_now - self._last_heartbeat[self._favorite_mon]
        favorite_timeout_s = self._servers.get_contact_period(self._favorite_mon) * FAVORITE_TIMEOUT_FACTOR
        if time_since > datetime.timedelta(seconds=favorite_timeout_s):
            log.debug("My old favourite, %s, has not sent a heartbeat for %s: %s is my new favourite" % (
                self._favorite_mon, time_since, minion_id
            ))
            self._set_favorite(minion_id)

    return minion_id == self._favorite_mon
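# Worked example of the promotion rule above, with assumed numbers: if
# the favourite's expected contact period is 10s and
# FAVORITE_TIMEOUT_FACTOR is 6, another mon is promoted once the
# favourite has been silent for more than 60s. Values are illustrative.
import datetime

_contact_period_s = 10        # assumed get_contact_period() result
_FAVORITE_TIMEOUT_FACTOR = 6  # assumed; use the module's real constant
_timeout = datetime.timedelta(seconds=_contact_period_s * _FAVORITE_TIMEOUT_FACTOR)
assert datetime.timedelta(seconds=75) > _timeout  # 75s of silence triggers promotion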
def on_version(self, reported_by, sync_type, new_version):
    """
    Notify me that a particular version of a particular map exists.

    I may choose to initiate RPC to retrieve the map
    """
    log.debug("SyncObjects.on_version %s/%s/%s" % (reported_by, sync_type.str, new_version))
    old_version = self.get_version(sync_type)
    if sync_type.cmp(new_version, old_version) > 0:
        known_version = self._known_versions[sync_type]
        if sync_type.cmp(new_version, known_version) > 0:
            # We are out of date: request an up to date copy
            log.info("Advanced known version %s/%s %s->%s" % (
                self._cluster_name, sync_type.str, known_version, new_version))
            self._known_versions[sync_type] = new_version
        else:
            log.info("on_version: %s is newer than %s" % (new_version, old_version))

        # If we already have a request out for this type of map, then consider
        # cancelling it if we've already waited for a while.
        if self._fetching_at[sync_type] is not None:
            if now() - self._fetching_at[sync_type] < self.FETCH_TIMEOUT:
                log.info("Fetch already underway for %s" % sync_type.str)
                return
            else:
                log.warn("Abandoning fetch for %s started at %s" % (
                    sync_type.str, self._fetching_at[sync_type]))

        log.info("on_version: fetching %s/%s from %s, currently got %s, know %s" % (
            sync_type, new_version, reported_by, old_version, known_version
        ))
        self.fetch(reported_by, sync_type)
def create(self, attributes):
    commands = [('osd pool create', {'pool': attributes['name'], 'pg_num': attributes['pg_num']})]

    # Calculate appropriate min_size, including default if none given
    req_size = attributes.get('size', 0)
    req_min_size = attributes.get('min_size', 0)
    attributes['min_size'] = self._pool_min_size(req_size, req_min_size)

    # Which attributes must we set after the initial create?
    post_create_attrs = attributes.copy()
    del post_create_attrs['name']
    del post_create_attrs['pg_num']
    if 'pgp_num' in post_create_attrs:
        del post_create_attrs['pgp_num']

    commands.extend(self._pool_attribute_commands(
        attributes['name'],
        post_create_attrs
    ))

    log.debug("Post-create attributes: %s" % post_create_attrs)
    log.debug("Commands: %s" % commands)

    return PoolCreatingRequest(
        "Creating pool '{name}'".format(name=attributes['name']),
        self._cluster_monitor.fsid, self._cluster_monitor.name,
        attributes['name'], commands)
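# Hedged sketch of the command list shape built above, with illustrative
# values: the create command followed by one 'osd pool set' per
# post-create attribute. The exact kwargs emitted by
# _pool_attribute_commands are an assumption, not confirmed from its
# implementation.
_example_commands = [
    ('osd pool create', {'pool': 'mypool', 'pg_num': 64}),
    ('osd pool set', {'pool': 'mypool', 'var': 'size', 'val': 3}),
    ('osd pool set', {'pool': 'mypool', 'var': 'min_size', 'val': 2}),
]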
def _emit_stats(self):
    try:
        if not self._socket:
            log.info("Opening carbon socket {0}:{1}".format(self.CARBON_HOST, self.CARBON_PORT))
            self._socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            self._socket.connect((self.CARBON_HOST, self.CARBON_PORT))

        carbon_data = ""
        t = int(time.time())
        usage = resource.getrusage(resource.RUSAGE_SELF)
        for usage_field in ("utime", "stime", "maxrss", "ixrss", "idrss", "isrss",
                            "minflt", "majflt", "nswap", "inblock", "oublock",
                            "msgsnd", "msgrcv", "nsignals", "nvcsw", "nivcsw"):
            val = getattr(usage, "ru_{0}".format(usage_field))
            log.debug("{0}: {1}".format(usage_field, val))
            carbon_data += "calamari.cthulhu.ru_{0} {1} {2}\n".format(usage_field, val, t)

        self._socket.sendall(carbon_data)
    except (socket.gaierror, resource.error):
        log.exception("Failed to send debugging statistics")
        self._close()
def wrap(*args, **kwargs):
    log.debug("RpcInterface >> %s(%s, %s)" % (item, args, kwargs))
    try:
        rc = attr(*args, **kwargs)
        log.debug("RpcInterface << %s" % item)
    except:
        log.exception("RpcInterface !! %s" % item)
        raise
    return rc
def _run(self):
    self._emit(INFO, "Calamari server started")
    self._flush()
    self._complete.wait(GRACE_PERIOD)

    while not self._complete.is_set():
        self.on_tick()
        self._complete.wait(TICK_SECONDS)

    log.debug("Eventer complete")
def _run(self):
    log.debug("Eventer running")
    self._emit(INFO, "Calamari server started")
    self._emit_to_salt_bus(SEVERITIES[INFO], "Calamari server started", "ceph/calamari/started")
    self._flush()
    self._complete.wait(GRACE_PERIOD)

    while not self._complete.is_set():
        self.on_tick()
        self._complete.wait(TICK_SECONDS)

    log.debug("Eventer complete")
def on_tick(self):
    """
    Periodically call this to drive non-event-driven events (i.e.
    things which are based on walltime checks)
    """
    log.debug("Eventer.on_tick")

    now_utc = now()

    for fqdn, server_state in self._manager.servers.servers.items():
        if not server_state.managed:
            # We don't expect messages from unmanaged servers so don't
            # worry about whether they sent us one recently.
            continue

        if len(server_state.clusters) == 1:
            # Because Events can only be associated with one FSID, we only
            # make this association for servers with exactly one cluster.
            # This is a bit cheeky and kind of an unnecessary limitation
            # in the Event DB schema.
            fsid = server_state.clusters[0]
        else:
            fsid = None

        contact_threshold = CONTACT_THRESHOLD_FACTOR * self._manager.servers.get_contact_period(fqdn)
        if now_utc - server_state.last_contact > datetime.timedelta(seconds=contact_threshold):
            if fqdn not in self._servers_complained:
                self._emit(WARNING, "Server {fqdn} is late reporting in, last report at {last}".format(
                    fqdn=fqdn, last=server_state.last_contact
                ), fqdn=fqdn, fsid=fsid)
                self._servers_complained.add(fqdn)
        else:
            if fqdn in self._servers_complained:
                self._emit(RECOVERY, "Server {fqdn} regained contact".format(fqdn=fqdn),
                           fqdn=fqdn, fsid=fsid)
                self._servers_complained.discard(fqdn)

    for fsid, cluster_monitor in self._manager.clusters.items():
        if cluster_monitor.update_time is None or now_utc - cluster_monitor.update_time > datetime.timedelta(
                seconds=CLUSTER_CONTACT_THRESHOLD):
            if fsid not in self._clusters_complained:
                self._clusters_complained.add(fsid)
                self._emit(WARNING, "Cluster '{name}' is late reporting in".format(name=cluster_monitor.name),
                           fsid=fsid)
        else:
            if fsid in self._clusters_complained:
                self._emit(RECOVERY, "Cluster '{name}' regained contact".format(name=cluster_monitor.name),
                           fsid=fsid)
                self._clusters_complained.discard(fsid)

    self._flush()
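# Sketch of the edge-triggered complaint pattern used above: a set of
# fqdns we have already complained about ensures WARNING fires once when
# contact is lost and RECOVERY fires once when it returns, instead of
# re-emitting the same event on every tick. Standalone illustration only.
def _check(complained, fqdn, late):
    if late and fqdn not in complained:
        complained.add(fqdn)
        return "WARNING"
    if not late and fqdn in complained:
        complained.discard(fqdn)
        return "RECOVERY"
    return None

_c = set()
assert _check(_c, "a.example.com", True) == "WARNING"
assert _check(_c, "a.example.com", True) is None       # no repeat complaint
assert _check(_c, "a.example.com", False) == "RECOVERY"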
def _run(self):
    self._plugin_monitor.start()

    self._ready.set()
    log.debug("ClusterMonitor._run: ready")

    remote.listen(self._complete,
                  on_heartbeat=self.on_heartbeat,
                  fsid=self.fsid,
                  on_job=self.on_job_complete)

    log.info("%s complete" % self.__class__.__name__)

    self._plugin_monitor.stop()
    self._plugin_monitor.join()
    self.done.set()
def tick(self):
    """
    For walltime-based monitoring of running requests.

    Long-running requests get a periodic call to saltutil.running to
    verify that things really are still happening.
    """
    if not self._by_jid:
        return
    else:
        log.debug("RequestCollection.tick: %s JIDs underway" % len(self._by_jid))

    # Identify JIDs which haven't had a saltutil.running response for too long.
    # Kill requests in a separate phase because request:JID is not 1:1
    stale_jobs = set()
    _now = now()
    for request in self._by_jid.values():
        if _now - request.alive_at > datetime.timedelta(seconds=TICK_PERIOD * 3):
            log.error("Request %s JID %s stale: now=%s, alive_at=%s" % (
                request.id, request.jid, _now, request.alive_at
            ))
            stale_jobs.add(request)

    # Any identified stale jobs are errored out.
    for request in stale_jobs:
        with self._update_index(request):
            request.set_error("Lost contact")
            request.jid = None
            request.complete()

    # Identify minions associated with JIDs in flight
    query_minions = set()
    for jid, request in self._by_jid.items():
        query_minions.add(request.minion_id)

    # Attempt to emit a saltutil.running to ping jobs; next tick we will
    # see if we got updates to the alive_at attribute to indicate non-staleness
    if query_minions:
        log.info("RequestCollection.tick: sending saltutil.running to {0}".format(query_minions))
        client = LocalClient(config.get('cthulhu', 'salt_config_path'))
        pub_data = client.run_job(list(query_minions), 'saltutil.running', [], expr_form="list")
        if not pub_data:
            log.warning("Failed to publish saltutil.running to {0}".format(query_minions))
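# The staleness rule above in isolation: a request is stale once it has
# gone three tick periods without a saltutil.running response touching
# its alive_at timestamp. The TICK_PERIOD value here is an assumption.
import datetime

_TICK_PERIOD = 20  # assumed, seconds

def _is_stale(now_ts, alive_at):
    return now_ts - alive_at > datetime.timedelta(seconds=_TICK_PERIOD * 3)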
def on_tick_response(self, minion_id, jobs):
    """
    Update the alive_at parameter of requests to record that they are
    still running remotely.

    :param jobs: The response from a saltutil.running
    """
    log.debug("RequestCollection.on_tick_response: %s from %s" % (len(jobs), minion_id))
    for job in jobs:
        try:
            request = self._by_jid[job['jid']]
        except KeyError:
            # Not one of mine, ignore it
            pass
        else:
            request.alive_at = now()
def on_job_complete(self, fqdn, jid, success, result, cmd, args):
    # It would be much nicer to put the FSID at the start of
    # the tag, if salt would only let us add custom tags to our jobs.
    # Instead we enforce a convention that calamari jobs include
    # fsid in their return value.
    if 'fsid' not in result or result['fsid'] != self.fsid:
        # Something for a different ClusterMonitor
        log.debug("Ignoring job return, not for my FSID")
        return

    if cmd == 'ceph.get_cluster_object':
        # A ceph.get_cluster_object response
        if not success:
            log.error("on_sync_object: failure from %s: %s" % (fqdn, result))
            return

        self.on_sync_object(fqdn, result)
    else:
        log.warning("Unexpected function '%s'" % cmd)
def run_plugin(self, plugin_name, status_processor, period):
    # Slice off some time for the checks, leaving some for the status_processor
    check_timeout = int(period * .75)
    salt_name = '.'.join((plugin_name, 'status_check'))

    while not self._complete.is_set():
        start = int(time.time())
        timeout_at = start + period

        servers = [s.fqdn for s in self._servers.get_all()]
        check_data = self.filter_errors(
            self._remote_run_cmd_async(servers, salt_name, timeout=check_timeout),
            salt_name)
        self.plugin_results[plugin_name] = status_processor(check_data)
        log.debug("processed %s: %s" % (plugin_name, check_data))

        time_left = timeout_at - int(time.time())
        gevent.sleep(max(0, time_left))
def __init__(self, manager):
    super(Eventer, self).__init__()

    self._manager = manager
    self._complete = gevent.event.Event()

    # Flags for things we have complained about being out of contact
    # with, to avoid generating the same events repeatedly
    self._servers_complained = set()
    self._clusters_complained = set()

    # Check the config to decide if events have to be pushed to the salt
    # event bus. If so, initialize the salt Caller object used to push events.
    if EMIT_EVENTS_TO_SALT_EVENT_BUS:
        log.debug("Events will be emitted to salt event bus")
        __opts__ = salt.config.minion_config(MINION_CONFIG)
        __opts__['file_client'] = 'local'
        self.caller = salt.client.Caller(mopts=__opts__)

    self._events = []
def fetch(self, minion_id, sync_type):
    log.debug("SyncObjects.fetch: %s/%s" % (minion_id, sync_type))
    if minion_id is None:
        # We're probably being replayed to from the database
        log.warn("SyncObjects.fetch called with minion_id=None")
        return

    self._fetching_at[sync_type] = now()
    try:
        # TODO clean up unused 'since' argument
        jid = remote.run_job(minion_id, 'ceph.get_cluster_object',
                             {'cluster_name': self._cluster_name,
                              'sync_type': sync_type.str,
                              'since': None})
    except Unavailable:
        # Don't throw an exception because if a fetch fails we should end up
        # issuing another on next heartbeat
        log.error("Failed to start fetch job %s/%s" % (minion_id, sync_type))
    else:
        log.debug("SyncObjects.fetch: jid=%s" % jid)
def on_sync_object(self, minion_id, data):
    if minion_id != self._favorite_mon:
        log.debug("Ignoring map from %s, it is not my favourite (%s)" % (
            minion_id, self._favorite_mon))
        return

    assert data['fsid'] == self.fsid

    sync_object = data['data']

    sync_type = SYNC_OBJECT_STR_TYPE[data['type']]
    new_object = self.inject_sync_object(minion_id, data['type'], data['version'], sync_object)
    if new_object:
        self._requests.on_map(self.fsid, sync_type, new_object)
        self._persister.update_sync_object(
            self.fsid,
            self.name,
            sync_type.str,
            new_object.version if isinstance(new_object.version, int) else None,
            now(), sync_object)
    else:
        log.warn("ClusterMonitor.on_sync_object: stale object received from %s" % minion_id)
def create(self, attributes):
    commands = [('osd pool create', {'pool': attributes['name'], 'pg_num': attributes['pg_num']})]

    # Which attributes must we set after the initial create?
    post_create_attrs = attributes.copy()
    del post_create_attrs['name']
    del post_create_attrs['pg_num']
    if 'pgp_num' in post_create_attrs:
        del post_create_attrs['pgp_num']

    commands.extend(self._pool_attribute_commands(
        attributes['name'],
        post_create_attrs
    ))

    log.debug("Post-create attributes: %s" % post_create_attrs)
    log.debug("Commands: %s" % commands)

    return PoolCreatingRequest(
        "Creating pool '{name}'".format(name=attributes['name']),
        self._cluster_monitor.fsid, self._cluster_monitor.name,
        attributes['name'], commands)
def on_heartbeat(self, minion_id, cluster_data):
    """
    Handle a ceph.heartbeat from a minion.

    Heartbeats come from all servers, but we're mostly interested in
    those which come from a mon (and therefore have the 'clusters'
    attribute populated) as these tell us whether there are any new
    versions of cluster maps for us to fetch.
    """
    if not self._is_favorite(minion_id):
        log.debug('Ignoring cluster data from %s, it is not my favourite (%s)' % (
            minion_id, self._favorite_mon))
        return

    self.update_time = datetime.datetime.utcnow().replace(tzinfo=utc)

    log.debug('Checking for version increments in heartbeat from %s' % minion_id)
    for sync_type in SYNC_OBJECT_TYPES:
        self._sync_objects.on_version(
            minion_id, sync_type, cluster_data['versions'][sync_type.str])
def inject_sync_object(self, minion_id, sync_type, version, data):
    sync_type = SYNC_OBJECT_STR_TYPE[sync_type]
    old_object = self._sync_objects.get(sync_type)
    new_object = self._sync_objects.on_fetch_complete(minion_id, sync_type, version, data)

    if new_object:
        # The ServerMonitor is interested in cluster maps; do this prior
        # to updating any derived objects so that derived generators have
        # access to the latest view of server state.
        if sync_type == OsdMap:
            self._servers.on_osd_map(data)
        elif sync_type == MonMap:
            self._servers.on_mon_map(data)
        elif sync_type == MdsMap:
            self._servers.on_mds_map(self.fsid, data)

        # The frontend would like us to maintain some derived objects that
        # munge together the PG and OSD maps into an easier-to-consume form.
        for generator in derived.generators:
            if sync_type in generator.depends:
                dependency_data = {}
                for t in generator.depends:
                    obj = self._sync_objects.get(t)
                    if obj is not None:
                        dependency_data[t] = obj.data
                    else:
                        dependency_data[t] = None

                if None not in dependency_data.values():
                    log.debug("Updating %s" % generator.__name__)
                    derived_objects = generator.generate(self, self._servers, dependency_data)
                    self._derived_objects.update(derived_objects)

        self._eventer.on_sync_object(self.fsid, sync_type, new_object, old_object)

    return new_object
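# Hedged sketch of the generator contract used above: each entry in
# derived.generators declares the sync types it depends on and returns a
# dict of derived objects to merge into _derived_objects. The class
# below is illustrative, not one of calamari's real generators.
class _ExampleOsdCount(object):
    depends = [OsdMap]  # sync types whose .data must be available

    @staticmethod
    def generate(cluster_monitor, servers, dependency_data):
        osd_map = dependency_data[OsdMap]
        return {'osd_count': len(osd_map.get('osds', []))}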
def _emit_to_salt_bus(self, severity, message, tag, **tags):
    """
    Emit an event to the salt event bus, if the config value
    "emit_events_to_salt_event_bus" is set to true.
    """
    if not EMIT_EVENTS_TO_SALT_EVENT_BUS:
        return

    log.debug("Eventer running _emit_to_salt_bus")
    res = {
        "message": message,
        "severity": severity,
        "tags": tags,
    }
    tag = EVENT_TAG_PREFIX + tag
    log.debug("Eventer._emit_to_salt_bus: Tag:%s | Data: %s" % (str(tag), str(res)))
    self.caller.sminion.functions['event.send'](tag, res)
def on_heartbeat(self, fqdn, data):
    if not data['fsid'] in self._manager.clusters:
        self._manager.on_discovery(fqdn, data)
    else:
        log.debug("%s: heartbeat from existing cluster %s" % (
            self.__class__.__name__, data['fsid']))
def list_server_logs(self, fqdn):
    client = LocalClient(config.get('cthulhu', 'salt_config_path'))
    results = client.cmd(fqdn, "log_tail.list_logs", ["."])
    log.debug('list_server_logs result: {results}'.format(results=str(results)))
    return results
def stop(self):
    log.debug("Eventer stopping")
    self._complete.set()
def _run(self):
    self._plugin_monitor.start()

    self._ready.set()
    log.debug("ClusterMonitor._run: ready")

    event = SaltEventSource(log, salt_config)

    while not self._complete.is_set():
        # No salt tag filtering: https://github.com/saltstack/salt/issues/11582
        ev = event.get_event(full=True)

        if ev is not None:
            data = ev['data']
            tag = ev['tag']
            log.debug("_run.ev: %s/tag=%s" % (data['id'] if 'id' in data else None, tag))

            # I am interested in the following tags:
            # - salt/job/<jid>/ret/<minion id> where jid is one that I started
            #   (this includes ceph.rados_command and ceph.get_cluster_object)
            # - ceph/cluster/<fsid> where fsid is my fsid

            try:
                if tag.startswith("ceph/cluster/{0}".format(self.fsid)):
                    # A ceph.heartbeat beacon
                    self.on_heartbeat(data['id'], data['data'])
                elif re.match(r"^salt/job/\d+/ret/[^/]+$", tag):
                    if data['fun'] == "saltutil.running":
                        # Update on what jobs are running
                        # It would be nice to filter these down to those which
                        # really are for this cluster, but as long as N_clusters
                        # and N_jobs are reasonably small it's not an efficiency
                        # problem.
                        self._requests.on_tick_response(data['id'], data['return'])

                    # It would be much nicer to put the FSID at the start of
                    # the tag, if salt would only let us add custom tags to our
                    # jobs. Instead we enforce a convention that all calamari
                    # jobs must include fsid in their return value.
                    if (not isinstance(data, dict)) or not isinstance(data['return'], dict):
                        # Something not formatted for ClusterMonitor
                        log.warning("Ignoring event %s" % tag)
                        continue

                    if 'fsid' not in data['return'] or data['return']['fsid'] != self.fsid:
                        # Something for a different ClusterMonitor
                        log.debug("Ignoring job return, not for my FSID")
                        continue

                    if data['fun'] == 'ceph.get_cluster_object':
                        # A ceph.get_cluster_object response
                        if not data['success']:
                            log.error("on_sync_object: failure from %s: %s" % (data['id'], data['return']))
                            continue

                        self.on_sync_object(data['id'], data['return'])
                    else:
                        log.warning("Unexpected function '%s' (%s)" % (data['fun'], tag))
                else:
                    # This does not concern us, ignore it
                    pass
            except:
                # Because this is our main event handling loop, swallow exceptions
                # instead of letting them end the world.
                log.exception("Exception handling message with tag %s" % tag)
                log.debug("Message content: %s" % data)

    log.info("%s complete" % self.__class__.__name__)
    self._plugin_monitor.stop()
    self._plugin_monitor.join()
    self.done.set()
def reset_event_sink(self):
    if EMIT_EVENTS_TO_SALT_EVENT_BUS:
        log.debug("resetting minion")
        __opts__ = salt.config.minion_config(MINION_CONFIG)
        __opts__['file_client'] = 'local'
        self.caller = salt.client.Caller(mopts=__opts__)
def _recover(self):
    if sqlalchemy is None:
        return

    session = Session()
    for server in session.query(Server).all():
        log.debug("Recovered server %s" % server.fqdn)
        assert server.boot_time is None or server.boot_time.tzinfo is not None  # expect timezone-aware DB backend
        self.servers.inject_server(ServerState(
            fqdn=server.fqdn,
            hostname=server.hostname,
            managed=server.managed,
            last_contact=server.last_contact,
            boot_time=server.boot_time,
            ceph_version=server.ceph_version
        ))

    for service in session.query(Service).all():
        if service.server:
            server = session.query(Server).get(service.server)
        else:
            server = None
        log.debug("Recovered service %s/%s/%s on %s" % (
            service.fsid, service.service_type, service.service_id,
            server.fqdn if server else None
        ))
        self.servers.inject_service(ServiceState(
            fsid=service.fsid,
            service_type=service.service_type,
            service_id=service.service_id
        ), server.fqdn if server else None)

    # I want the most recent version of every sync_object
    fsids = [(row[0], row[1]) for row in
             session.query(SyncObject.fsid, SyncObject.cluster_name).distinct(SyncObject.fsid)]
    for fsid, name in fsids:
        cluster_monitor = ClusterMonitor(fsid, name, self.persister, self.servers,
                                         self.eventer, self.requests)
        self.clusters[fsid] = cluster_monitor

        object_types = [row[0] for row in
                        session.query(SyncObject.sync_type).filter_by(fsid=fsid).distinct()]
        for sync_type in object_types:
            latest_record = session.query(SyncObject).filter_by(
                fsid=fsid, sync_type=sync_type).order_by(
                SyncObject.version.desc(), SyncObject.when.desc())[0]

            # FIXME: bit of a hack because records persisted only store their
            # 'version' if it's a real counter version; underlying problem is
            # that we have underlying data (health, pg_brief) without usable
            # version counters.
            def md5(raw):
                hasher = hashlib.md5()
                hasher.update(raw)
                return hasher.hexdigest()

            if latest_record.version:
                version = latest_record.version
            else:
                version = md5(latest_record.data)

            when = latest_record.when
            when = when.replace(tzinfo=tzutc())
            if cluster_monitor.update_time is None or when > cluster_monitor.update_time:
                cluster_monitor.update_time = when

            cluster_monitor.inject_sync_object(None, sync_type, version,
                                               msgpack.unpackb(latest_record.data))

    for monitor in self.clusters.values():
        log.info("Recovery: Cluster %s with update time %s" % (monitor.fsid, monitor.update_time))
        monitor.start()
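# Why the md5() fallback above exists: maps like health and pg_brief
# carry no epoch counter, so a content hash stands in as a version that
# at least changes whenever the data changes (though it cannot be
# ordered the way OsdMap epochs can).
import hashlib

_v1 = hashlib.md5(b'{"overall_status": "HEALTH_OK"}').hexdigest()
_v2 = hashlib.md5(b'{"overall_status": "HEALTH_WARN"}').hexdigest()
assert _v1 != _v2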