class Manager(object):
    """
    Manage a collection of ClusterMonitors.

    Subscribe to ceph/cluster events, and create a ClusterMonitor
    for any FSID we haven't seen before.
    """

    def __init__(self):
        self._complete = gevent.event.Event()

        self._rpc_thread = RpcThread(self)
        self._discovery_thread = TopLevelEvents(self)
        self._process_monitor = ProcessMonitorThread()

        self.notifier = NotificationThread()

        db_path = config.get('cthulhu', 'db_path')
        if sqlalchemy is not None and db_path:
            try:
                # Prepare persistence
                engine = create_engine(db_path)  # noqa
                Session.configure(bind=engine)

                self.persister = Persister()
            except sqlalchemy.exc.ArgumentError as e:
                log.error("Database error: %s" % e)
                raise
        else:
            class NullPersister(object):
                # Stand-in used when SQLAlchemy or a DB path is
                # unavailable: the lifecycle methods are no-ops, and any
                # other public attribute resolves to a function that
                # silently swallows its arguments.
                def start(self):
                    pass

                def stop(self):
                    pass

                def join(self):
                    pass

                def __getattribute__(self, item):
                    if item.startswith('_'):
                        # Private attributes behave normally
                        return object.__getattribute__(self, item)
                    else:
                        try:
                            return object.__getattribute__(self, item)
                        except AttributeError:
                            def blackhole(*args, **kwargs):
                                pass
                            return blackhole

            self.persister = NullPersister()

        # Remote operations
        self.requests = RequestCollection(self)
        self._request_ticker = Ticker(request_collection.TICK_PERIOD,
                                      lambda: self.requests.tick())

        # FSID to ClusterMonitor
        self.clusters = {}

        # Generate events on state changes
        self.eventer = Eventer(self)

        # Handle all ceph/server messages
        self.servers = ServerMonitor(self.persister, self.eventer,
                                     self.requests)

    def delete_cluster(self, fs_id):
        """
        Note that the cluster will pop right back again if it's
        still sending heartbeats.
        """
        victim = self.clusters[fs_id]
        victim.stop()
        victim.done.wait()
        del self.clusters[fs_id]

        self._expunge(fs_id)

    def stop(self):
        log.info("%s stopping" % self.__class__.__name__)
        for monitor in self.clusters.values():
            monitor.stop()
        self._rpc_thread.stop()
        self._discovery_thread.stop()
        self._process_monitor.stop()
        self.notifier.stop()
        self.eventer.stop()
        self._request_ticker.stop()

    def _expunge(self, fsid):
        if sqlalchemy is None:
            return
        session = Session()
        session.query(SyncObject).filter_by(fsid=fsid).delete()
        session.commit()

    def _recover(self):
        if sqlalchemy is None:
            return

        session = Session()
        for server in session.query(Server).all():
            log.debug("Recovered server %s" % server.fqdn)
            assert server.boot_time is None or server.boot_time.tzinfo is not None  # expect timezone-aware DB backend
            self.servers.inject_server(ServerState(
                fqdn=server.fqdn,
                hostname=server.hostname,
                managed=server.managed,
                last_contact=server.last_contact,
                boot_time=server.boot_time,
                ceph_version=server.ceph_version))

        for service in session.query(Service).all():
            if service.server:
                server = session.query(Server).get(service.server)
            else:
                server = None
            log.debug("Recovered service %s/%s/%s on %s" % (
                service.fsid, service.service_type, service.service_id,
                server.fqdn if server else None))
            self.servers.inject_service(ServiceState(
                fsid=service.fsid,
                service_type=service.service_type,
                service_id=service.service_id),
                server.fqdn if server else None)

        # I want the most recent version of every sync_object
        fsids = [(row[0], row[1]) for row in session.query(
            SyncObject.fsid, SyncObject.cluster_name).distinct(SyncObject.fsid)]
        for fsid, name in fsids:
            cluster_monitor = ClusterMonitor(fsid, name, self.notifier,
                                             self.persister, self.servers,
                                             self.eventer, self.requests)
            self.clusters[fsid] = cluster_monitor

            object_types = [row[0] for row in session.query(
                SyncObject.sync_type).filter_by(fsid=fsid).distinct()]
            for sync_type in object_types:
                latest_record = session.query(SyncObject).filter_by(
                    fsid=fsid, sync_type=sync_type).order_by(
                    SyncObject.version.desc(), SyncObject.when.desc())[0]

                # FIXME: bit of a hack, because persisted records only
                # store their 'version' if it's a real counter version.
                # The underlying problem is that some data (health,
                # pg_brief) has no usable version counter, so we fall
                # back to hashing the payload.
                def md5(raw):
                    hasher = hashlib.md5()
                    hasher.update(raw)
                    return hasher.hexdigest()

                if latest_record.version:
                    version = latest_record.version
                else:
                    version = md5(latest_record.data)

                when = latest_record.when
                when = when.replace(tzinfo=tzutc())
                if cluster_monitor.update_time is None or when > cluster_monitor.update_time:
                    cluster_monitor.update_time = when

                cluster_monitor.inject_sync_object(
                    None, sync_type, version,
                    msgpack.unpackb(latest_record.data))

        for monitor in self.clusters.values():
            log.info("Recovery: Cluster %s with update time %s" % (
                monitor.fsid, monitor.update_time))
            monitor.start()

    def start(self):
        log.info("%s starting" % self.__class__.__name__)

        # Before we start listening to the outside world, recover
        # our last known state from persistent storage
        try:
            self._recover()
        except Exception:
            log.exception("Recovery failed")
            os._exit(-1)

        self._rpc_thread.bind()
        self._rpc_thread.start()
        self._discovery_thread.start()
        self._process_monitor.start()
        self.notifier.start()
        self.persister.start()
        self.eventer.start()
        self._request_ticker.start()

        self.servers.start()

    def join(self):
        log.info("%s joining" % self.__class__.__name__)
        self._rpc_thread.join()
        self._discovery_thread.join()
        self._process_monitor.join()
        self.notifier.join()
        self.persister.join()
        self.eventer.join()
        self._request_ticker.join()
        self.servers.join()
        for monitor in self.clusters.values():
            monitor.join()

    def on_discovery(self, minion_id, heartbeat_data):
        log.info("on_discovery: {0}/{1}".format(minion_id, heartbeat_data['fsid']))
        cluster_monitor = ClusterMonitor(heartbeat_data['fsid'],
                                         heartbeat_data['name'],
                                         self.notifier, self.persister,
                                         self.servers, self.eventer,
                                         self.requests)
        self.clusters[heartbeat_data['fsid']] = cluster_monitor

        # Run before passing on the heartbeat, because otherwise the
        # syncs resulting from the heartbeat might not be received
        # by the monitor.
        cluster_monitor.start()
        # Wait for the ClusterMonitor to start accepting events before
        # asking it to do anything
        cluster_monitor.ready()
        cluster_monitor.on_heartbeat(minion_id, heartbeat_data)
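# The NullPersister defined in __init__ above is a null-object: any public
# attribute that isn't explicitly defined resolves to a no-op callable, so
# callers can invoke persistence methods unconditionally whether or not a
# database is configured. A self-contained sketch of the same pattern
# (NullObject and the method names called on it are illustrative, not part
# of this module):
#
#     class NullObject(object):
#         """Swallows any public method call; private attrs behave normally."""
#
#         def __getattribute__(self, item):
#             if item.startswith('_'):
#                 return object.__getattribute__(self, item)
#             try:
#                 return object.__getattribute__(self, item)
#             except AttributeError:
#                 def blackhole(*args, **kwargs):
#                     pass
#                 return blackhole
#
#     p = NullObject()
#     p.save_server(fqdn="example.com")       # undefined: silently ignored
#     print(p.update("fsid", "health"))       # prints None, never raises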
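# For context, a minimal sketch of how a service entry point might drive
# this class's lifecycle. The main() function and the signal wiring below
# are illustrative assumptions, not part of this module:
#
#     import signal
#
#     import gevent
#     import gevent.event
#
#     def main():
#         manager = Manager()
#         manager.start()  # recovers persisted state, then starts threads
#
#         complete = gevent.event.Event()
#
#         def shutdown():
#             manager.stop()   # ask every thread and monitor to stop...
#             manager.join()   # ...then wait for them all to exit
#             complete.set()
#
#         # Older gevent releases expose this as gevent.signal(); newer
#         # ones spell it gevent.signal_handler().
#         gevent.signal(signal.SIGTERM, shutdown)
#         complete.wait()
#
# Note that when db_path is unset, the NullPersister fallback makes
# manager.persister.start() and all other persistence calls silent no-ops,
# so the same entry point works with or without a database.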