def test_get_osd_to_host_mapping_osd_metadata_partial_exists(self, mocket):
    """
    That every OSD is mapped when osd_metadata only covers a subset of
    the OSDs in the osd_map: osd.0 comes from its metadata entry, while
    osd.1 (no metadata) is resolved via the mocked socket lookups.
    """
    # Socket module is patched in as `mocket`; reverse lookup yields the
    # same host for the metadata-less OSD.
    mocket.getnameinfo.return_value = [OSD_HOSTNAME]
    mocket.getfqdn.return_value = OSD_FQDN
    osd_map = {
        "osds": [{
            "cluster_addr": "192.34.58.142:6808/14001122",
            "osd": 0
        }, {
            "cluster_addr": "192.34.58.142:6802/17383",
            "osd": 1
        }],
        "osd_metadata": [{
            "back_addr": "192.34.58.142:6808/14001122",
            "hostname": "gravel2",
            "id": 0,
            "hb_back_addr": "192.34.58.142:6809/14001122",
            "hb_front_addr": "192.34.58.142:6810/14001122",
            "front_addr": "192.34.58.142:6800/1122"
        }]
    }
    sm = ServerMonitor(Mock(), Mock(), Mock())
    # Both OSDs end up grouped under the same (fqdn, hostname) key.
    self.assertEqual(
        {
            ('gravel2.rockery', 'gravel2'): [{
                'cluster_addr': '192.34.58.142:6808/14001122',
                'osd': 0
            }, {
                'cluster_addr': '192.34.58.142:6802/17383',
                'osd': 1
            }]
        },
        sm.get_hostname_to_osds(osd_map))
def __init__(self):
    """
    Wire up the manager's threads, monitors and (optional) persistence.

    If sqlalchemy is unavailable or no 'db_path' is configured, a
    NullPersister that silently swallows every call is used instead of
    the real Persister.
    """
    self._complete = gevent.event.Event()
    self._rpc_thread = RpcThread(self)
    self._discovery_thread = TopLevelEvents(self)
    self._process_monitor = ProcessMonitorThread()

    db_path = config.get('cthulhu', 'db_path')
    if sqlalchemy is not None and db_path:
        try:
            # Prepare persistence
            engine = create_engine(config.get('cthulhu', 'db_path'))  # noqa
            Session.configure(bind=engine)
            self.persister = Persister()
        except sqlalchemy.exc.ArgumentError as e:
            log.error("Database error: %s" % e)
            raise
    else:
        class NullPersister(object):
            # Stand-in persister: lifecycle methods are no-ops, and any
            # other public attribute access yields a do-nothing callable.
            def start(self):
                pass

            def stop(self):
                pass

            def join(self):
                pass

            def __getattribute__(self, item):
                if item.startswith('_'):
                    return object.__getattribute__(self, item)
                else:
                    try:
                        return object.__getattribute__(self, item)
                    except AttributeError:
                        def blackhole(*args, **kwargs):
                            pass
                        return blackhole

        self.persister = NullPersister()

    # Remote operations
    self.requests = RequestCollection(self)
    self._request_ticker = Ticker(request_collection.TICK_PERIOD,
                                  lambda: self.requests.tick())

    # FSID to ClusterMonitor
    self.clusters = {}

    # Generate events on state changes
    self.eventer = Eventer(self)

    # Handle all ceph/server messages
    self.servers = ServerMonitor(self.persister, self.eventer, self.requests)
def test_get_osd_to_host_mapping_osd_down_and_out_from_epoch1(
        self, mocket):
    """
    That an OSD which has never been up (placeholder ':/0' addresses,
    down and out since epoch) and has no osd_metadata entry produces no
    hostname mapping at all.
    """
    # Mocked socket lookups are configured but should be irrelevant:
    # the ':/0' address cannot be resolved to a host.
    mocket.getnameinfo.return_value = [OSD_HOSTNAME]
    mocket.getfqdn.return_value = OSD_FQDN
    osd_map = {
        "osd_metadata": [],
        "osds": [{
            "cluster_addr": ":/0",
            "down_at": 0,
            "heartbeat_back_addr": ":/0",
            "heartbeat_front_addr": ":/0",
            "in": 0,
            "last_clean_begin": 0,
            "last_clean_end": 0,
            "lost_at": 0,
            "osd": 0,
            "primary_affinity": 1.0,
            "public_addr": ":/0",
            "state": ["exists", "new"],
            "up": 0,
            "up_from": 0,
            "up_thru": 0,
            "uuid": "f53e0a25-d29c-4aa3-9a2e-f6ebee538f8e",
            "weight": 0.0
        }]
    }
    sm = ServerMonitor(Mock(), Mock(), Mock())
    self.assertEqual({}, sm.get_hostname_to_osds(osd_map))
def test_get_osd_to_host_mapping_osd_metadata_exists(self, mocket):
    """
    That the mapping is taken directly from a complete osd_metadata
    entry, without performing any socket lookups.
    """
    osd_map = {
        "osds": [{
            "cluster_addr": "192.34.58.142:6808/14001122",
            "osd": 0
        }],
        "osd_metadata": [{
            "back_addr": "192.34.58.142:6808/14001122",
            "hostname": "gravel2.rockery",
            "id": 0,
            "hb_back_addr": "192.34.58.142:6809/14001122",
            "hb_front_addr": "192.34.58.142:6810/14001122",
            "front_addr": "192.34.58.142:6800/1122"
        }]
    }
    sm = ServerMonitor(Mock(), Mock(), Mock())
    self.assertEqual(
        {
            ('gravel2.rockery', 'gravel2'): [{
                'cluster_addr': '192.34.58.142:6808/14001122',
                'osd': 0
            }]
        },
        sm.get_hostname_to_osds(osd_map))
    # The hostname came from osd_metadata, so the socket module
    # (patched in as `mocket`) must never have been touched.
    self.assertEqual(False, mocket.called)
def test_get_osd_to_host_mapping_empty(self):
    """An osd_map with no OSDs yields an empty hostname mapping."""
    monitor = ServerMonitor(Mock(), Mock(), Mock())
    empty_map = {'tree': {'nodes': []}, 'osds': []}
    mapping = monitor.get_hostname_to_osds(empty_map)
    self.assertEqual({}, mapping)
def test_unmanaged_service_relocate(self):
    """
    That when an OSD disappears from one server's salt.services output
    and reappears on another server, this is reflected in the state.
    """
    sm = ServerMonitor(Mock(), Mock(), Mock())
    sm.on_server_heartbeat(MON_FQDN, MON_CEPH_SERVICES)
    sm.on_osd_map(OSD_MAP)

    # osd.1 initially on unmanaged server OSD
    self.assertEqual(
        sm.services[ServiceId(FSID, 'osd', "1")].server_state.fqdn,
        OSD_HOSTNAME)

    sm.on_server_heartbeat(MON_FQDN, MON_CEPH_SERVICES_MIGRATED)
    sm.on_osd_map(OSD_MAP_MIGRATED)

    # osd.1 now on managed server MON
    self.assertEqual(
        sm.services[ServiceId(FSID, 'osd', "1")].server_state.fqdn,
        MON_FQDN)

    # Check the servers' lists of services are up to date too.
    # list(...) so the comparison also works on Python 3, where
    # dict.keys() is a view, not a list.
    self.assertListEqual(list(sm.servers[MON_FQDN].services.keys()), [
        ServiceId(FSID, 'osd', '1'),
        ServiceId(FSID, 'mon', MON_HOSTNAME)
    ])
    self.assertListEqual(list(sm.servers[OSD_HOSTNAME].services.keys()), [
        ServiceId(FSID, 'osd', '0')
    ])
def __init__(self):
    """
    Wire up the manager's threads, persistence and monitors.

    NOTE(review): this variant requires sqlalchemy unconditionally
    (no NullPersister fallback) and constructs ServerMonitor with only
    persister and eventer.
    """
    self._complete = gevent.event.Event()
    self._rpc_thread = RpcThread(self)
    self._discovery_thread = DiscoveryThread(self)
    self._process_monitor = ProcessMonitorThread()
    self.notifier = NotificationThread()

    try:
        # Prepare persistence
        engine = create_engine(config.get('cthulhu', 'db_path'))
        Session.configure(bind=engine)
        self.persister = Persister()
    except sqlalchemy.exc.ArgumentError as e:
        log.error("Database error: %s" % e)
        raise

    # FSID to ClusterMonitor
    self.clusters = {}

    # Generate events on state changes
    self.eventer = Eventer(self)

    # Handle all ceph/server messages
    self.servers = ServerMonitor(self.persister, self.eventer)
def test_unmanaged_managed_transition(self):
    """
    That when a pesky user doesn't initially install salt on OSD
    servers but later adds it, we correctly transition from paying
    attention to the CRUSH config to paying attention to the salt
    data, and fill in the correct FQDNs.
    """
    sm = ServerMonitor(Mock(), Mock(), Mock())
    sm.on_server_heartbeat(MON_FQDN, MON_CEPH_SERVICES)
    sm.on_osd_map(OSD_MAP)

    # While unmanaged, the OSD server is keyed by its hostname.
    # list(...) so the assertions also hold on Python 3, where
    # dict.keys() is a view rather than a list.
    self.assertListEqual(list(sm.servers.keys()), [MON_FQDN, OSD_HOSTNAME])
    self.assertListEqual(list(sm.servers[MON_FQDN].services.keys()), [
        ServiceId(FSID, 'mon', MON_HOSTNAME)
    ])
    self.assertListEqual(list(sm.servers[OSD_HOSTNAME].services.keys()), [
        ServiceId(FSID, 'osd', '1'),
        ServiceId(FSID, 'osd', '0')
    ])

    sm.on_server_heartbeat(OSD_FQDN, OSD_CEPH_SERVICES)

    # After the salt heartbeat, the server is re-keyed by its FQDN.
    self.assertListEqual(list(sm.servers.keys()), [MON_FQDN, OSD_FQDN])
    self.assertEqual(sm.servers[OSD_FQDN].fqdn, OSD_FQDN)
    self.assertListEqual(list(sm.servers[MON_FQDN].services.keys()), [
        ServiceId(FSID, 'mon', MON_HOSTNAME)
    ])
    self.assertListEqual(list(sm.servers[OSD_FQDN].services.keys()), [
        ServiceId(FSID, 'osd', '1'),
        ServiceId(FSID, 'osd', '0')
    ])
def test_get_osd_to_host_mapping_osd_metadata_absent(self, mocket):
    """
    That we still get a mapping when the osd_map has no osd_metadata
    at all: hostname and FQDN come from the (mocked) socket lookups.
    """
    mocket.getnameinfo.return_value = [OSD_HOSTNAME]
    mocket.getfqdn.return_value = OSD_FQDN
    osd_map = {
        "osds": [{
            "cluster_addr": "192.34.58.142:6808/14001122",
            "osd": 0
        }]
    }
    sm = ServerMonitor(Mock(), Mock(), Mock())
    self.assertEqual(
        {
            ('gravel2.rockery', 'gravel2'): [{
                'cluster_addr': '192.34.58.142:6808/14001122',
                'osd': 0
            }]
        },
        sm.get_hostname_to_osds(osd_map))
def test_unmanaged_servers(self):
    """
    That when only the mons are sending salt messages, we generate a
    correct view of service locations including OSDs.
    """
    sm = ServerMonitor(Mock(), Mock(), Mock())
    sm.on_server_heartbeat(MON_FQDN, MON_CEPH_SERVICES)
    sm.on_osd_map(OSD_MAP)

    self.assertEqual(len(sm.servers), 2)
    self.assertEqual(len(sm.services), 3)
    self.assertEqual(len(sm.fsid_services), 1)
    self.assertEqual(len(sm.hostname_to_server), 2)
    # list(...) so the assertions also hold on Python 3, where
    # dict.keys() is a view rather than a list.
    self.assertListEqual(list(sm.servers[MON_FQDN].services.keys()),
                         [ServiceId(FSID, 'mon', MON_HOSTNAME)])
    self.assertListEqual(
        list(sm.servers[OSD_HOSTNAME].services.keys()),
        [ServiceId(FSID, 'osd', '1'), ServiceId(FSID, 'osd', '0')])
def test_managed_servers(self):
    """
    That managed servers (those sending salt messages) generate a
    correct view of service locations.
    """
    sm = ServerMonitor(Mock(), Mock(), Mock())
    sm.on_server_heartbeat(MON_FQDN, MON_CEPH_SERVICES)
    sm.on_server_heartbeat(OSD_FQDN, OSD_CEPH_SERVICES)

    self.assertEqual(len(sm.servers), 2)
    self.assertEqual(len(sm.services), 3)
    self.assertEqual(len(sm.fsid_services), 1)
    self.assertEqual(len(sm.hostname_to_server), 2)
    # list(...) so the assertions also hold on Python 3, where
    # dict.keys() is a view rather than a list.
    self.assertListEqual(list(sm.servers.keys()), [MON_FQDN, OSD_FQDN])
    self.assertEqual(sm.servers[OSD_FQDN].fqdn, OSD_FQDN)
    self.assertListEqual(list(sm.servers[MON_FQDN].services.keys()),
                         [ServiceId(FSID, 'mon', MON_HOSTNAME)])
    self.assertListEqual(
        list(sm.servers[OSD_FQDN].services.keys()),
        [ServiceId(FSID, 'osd', '1'), ServiceId(FSID, 'osd', '0')])
def test_delete_managed(self):
    """
    That when a managed server is removed, it no longer appears in the
    server/service data.
    """
    sm = ServerMonitor(Mock(), Mock(), Mock())
    sm.on_server_heartbeat(MON_FQDN, MON_CEPH_SERVICES)
    sm.on_server_heartbeat(OSD_FQDN, OSD_CEPH_SERVICES)

    sm.delete(OSD_FQDN)

    # The two OSD services, and the 'osd' server should be gone
    self.assertEqual(len(sm.servers), 1)
    self.assertEqual(len(sm.services), 1)
    self.assertEqual(len(sm.fsid_services), 1)
    self.assertEqual(len(sm.hostname_to_server), 1)
    # list(...) so the assertions also hold on Python 3, where
    # dict.keys() is a view rather than a list.
    self.assertListEqual(list(sm.servers.keys()), [MON_FQDN])
    self.assertListEqual(list(sm.services.keys()),
                         [ServiceId(FSID, 'mon', MON_HOSTNAME)])
    self.assertListEqual([s.id for s in sm.fsid_services[FSID]],
                         [ServiceId(FSID, 'mon', MON_HOSTNAME)])
    self.assertListEqual(list(sm.hostname_to_server.keys()),
                         [MON_HOSTNAME])
def test_remove_osd(self):
    """
    That when an OSD disappears from the OSD map, it is also removed
    from ServerMonitor's worldview.
    """
    sm = ServerMonitor(Mock(), Mock(), Mock())
    sm.on_server_heartbeat(MON_FQDN, MON_CEPH_SERVICES)
    sm.on_server_heartbeat(OSD_FQDN, OSD_CEPH_SERVICES)

    # list(...) so the assertions also hold on Python 3, where
    # dict.keys() is a view rather than a list.
    self.assertListEqual(list(sm.services.keys()), [
        ServiceId(FSID, 'osd', '0'),
        ServiceId(FSID, 'osd', '1'),
        ServiceId(FSID, 'mon', MON_HOSTNAME)
    ])

    sm.on_osd_map(OSD_MAP_1_REMOVED)

    self.assertListEqual(list(sm.services.keys()), [
        ServiceId(FSID, 'osd', '0'),
        ServiceId(FSID, 'mon', MON_HOSTNAME)
    ])
def test_managed_servers(self):
    """
    That managed servers (those sending salt messages) generate a
    correct view of service locations.
    """
    sm = ServerMonitor(Mock(), Mock(), Mock())
    sm.on_server_heartbeat(MON_FQDN, MON_CEPH_SERVICES)
    sm.on_server_heartbeat(OSD_FQDN, OSD_CEPH_SERVICES)

    self.assertEqual(len(sm.servers), 2)
    self.assertEqual(len(sm.services), 3)
    self.assertEqual(len(sm.fsid_services), 1)
    self.assertEqual(len(sm.hostname_to_server), 2)
    # list(...) so the assertions also hold on Python 3, where
    # dict.keys() is a view rather than a list.
    self.assertListEqual(list(sm.servers.keys()), [MON_FQDN, OSD_FQDN])
    self.assertEqual(sm.servers[OSD_FQDN].fqdn, OSD_FQDN)
    self.assertListEqual(list(sm.servers[MON_FQDN].services.keys()), [
        ServiceId(FSID, 'mon', MON_HOSTNAME)
    ])
    self.assertListEqual(list(sm.servers[OSD_FQDN].services.keys()), [
        ServiceId(FSID, 'osd', '1'),
        ServiceId(FSID, 'osd', '0')
    ])
def test_remove_mds(self):
    """
    That when an mds disappears from the mds map, ServerMonitor
    notices.
    """
    sm = ServerMonitor(Mock(), Mock(), Mock())
    sm.on_server_heartbeat(MDS1_FQDN, MDS1_SERVICES)
    sm.on_server_heartbeat(MDS2_FQDN, MDS2_SERVICES)
    sm.on_mds_map(FSID, MDS_MAP)

    # list(...) so the assertions also hold on Python 3, where
    # dict.keys() is a view rather than a list.
    self.assertListEqual(list(sm.services.keys()), [
        ServiceId(FSID, 'mds', MDS1_HOSTNAME),
        ServiceId(FSID, 'mds', MDS2_HOSTNAME)
    ])

    sm.on_mds_map(FSID, MDS_MAP_1_REMOVED)

    self.assertListEqual(list(sm.services.keys()),
                         [ServiceId(FSID, 'mds', MDS1_HOSTNAME)])
def test_remove_mon(self):
    """
    That when a mon disappears from the mon map, ServerMonitor
    notices.
    """
    sm = ServerMonitor(Mock(), Mock(), Mock())
    sm.on_server_heartbeat(MON_FQDN, MON_CEPH_SERVICES)
    sm.on_server_heartbeat(OSD_FQDN, OSD_CEPH_SERVICES)
    sm.on_mon_map(MON_MAP)

    # list(...) so the assertions also hold on Python 3, where
    # dict.keys() is a view rather than a list.
    self.assertListEqual(list(sm.services.keys()), [
        ServiceId(FSID, 'osd', '0'),
        ServiceId(FSID, 'osd', '1'),
        ServiceId(FSID, 'mon', MON_HOSTNAME)
    ])

    sm.on_mon_map(MON_MAP_1_REMOVED)

    self.assertListEqual(list(sm.services.keys()), [
        ServiceId(FSID, 'osd', '0'),
        ServiceId(FSID, 'osd', '1')
    ])
def test_remove_mds(self):
    """
    That when an mds disappears from the mds map, ServerMonitor
    notices.
    """
    sm = ServerMonitor(Mock(), Mock(), Mock())
    sm.on_server_heartbeat(MDS1_FQDN, MDS1_SERVICES)
    sm.on_server_heartbeat(MDS2_FQDN, MDS2_SERVICES)
    sm.on_mds_map(FSID, MDS_MAP)

    # list(...) so the assertions also hold on Python 3, where
    # dict.keys() is a view rather than a list.
    self.assertListEqual(list(sm.services.keys()), [
        ServiceId(FSID, 'mds', MDS1_HOSTNAME),
        ServiceId(FSID, 'mds', MDS2_HOSTNAME)
    ])

    sm.on_mds_map(FSID, MDS_MAP_1_REMOVED)

    self.assertListEqual(list(sm.services.keys()), [
        ServiceId(FSID, 'mds', MDS1_HOSTNAME)
    ])
def test_on_osd_map(self, mocket):
    """Feeding malformed OSD maps into on_osd_map must not raise."""
    def fake_getnameinfo(addr, _):
        # The ':/0' placeholder address resolves to an empty hostname.
        return [''] if addr == ":/0" else [OSD_HOSTNAME]

    def fake_getfqdn(thing):
        # NOTE(review): compares against the list [""] — presumably the
        # production code passes the getnameinfo result straight
        # through; confirm against ServerMonitor.
        return '' if thing == [""] else OSD_FQDN

    mocket.getnameinfo = fake_getnameinfo
    mocket.getfqdn = fake_getfqdn

    monitor = ServerMonitor(Mock(), Mock(), Mock())
    for bad_map in (BAD_MAP, BAD_MAP2, BAD_MAP3):
        monitor.on_osd_map(bad_map)
def test_unmanaged_servers(self):
    """
    That when only the mons are sending salt messages, we generate a
    correct view of service locations including OSDs.
    """
    sm = ServerMonitor(Mock(), Mock(), Mock())
    sm.on_server_heartbeat(MON_FQDN, MON_CEPH_SERVICES)
    sm.on_osd_map(OSD_MAP)

    self.assertEqual(len(sm.servers), 2)
    self.assertEqual(len(sm.services), 3)
    self.assertEqual(len(sm.fsid_services), 1)
    self.assertEqual(len(sm.hostname_to_server), 2)
    # list(...) so the assertions also hold on Python 3, where
    # dict.keys() is a view rather than a list.
    self.assertListEqual(list(sm.servers[MON_FQDN].services.keys()),
                         [ServiceId(FSID, 'mon', MON_HOSTNAME)])
    self.assertListEqual(
        list(sm.servers[OSD_HOSTNAME].services.keys()),
        [ServiceId(FSID, 'osd', '1'), ServiceId(FSID, 'osd', '0')])
class Manager(object):
    """
    Manage a collection of ClusterMonitors.

    Subscribe to ceph/cluster events, and create a ClusterMonitor
    for any FSID we haven't seen before.
    """

    def __init__(self):
        # Persistence is optional: without sqlalchemy or a configured
        # db_path we fall back to a no-op NullPersister.
        self._complete = gevent.event.Event()
        self._rpc_thread = RpcThread(self)
        self._discovery_thread = TopLevelEvents(self)
        self._process_monitor = ProcessMonitorThread()

        db_path = config.get('cthulhu', 'db_path')
        if sqlalchemy is not None and db_path:
            try:
                # Prepare persistence
                engine = create_engine(config.get('cthulhu', 'db_path'))  # noqa
                Session.configure(bind=engine)
                self.persister = Persister()
            except sqlalchemy.exc.ArgumentError as e:
                log.error("Database error: %s" % e)
                raise
        else:
            class NullPersister(object):
                # Stand-in persister: lifecycle methods are no-ops, any
                # other public attribute access yields a do-nothing
                # callable.
                def start(self):
                    pass

                def stop(self):
                    pass

                def join(self):
                    pass

                def __getattribute__(self, item):
                    if item.startswith('_'):
                        return object.__getattribute__(self, item)
                    else:
                        try:
                            return object.__getattribute__(self, item)
                        except AttributeError:
                            def blackhole(*args, **kwargs):
                                pass
                            return blackhole

            self.persister = NullPersister()

        # Remote operations
        self.requests = RequestCollection(self)
        self._request_ticker = Ticker(request_collection.TICK_PERIOD,
                                      lambda: self.requests.tick())

        # FSID to ClusterMonitor
        self.clusters = {}

        # Generate events on state changes
        self.eventer = Eventer(self)

        # Handle all ceph/server messages
        self.servers = ServerMonitor(self.persister, self.eventer,
                                     self.requests)

    def delete_cluster(self, fs_id):
        """
        Note that the cluster will pop right back again if it's
        still sending heartbeats.
        """
        victim = self.clusters[fs_id]
        victim.stop()
        victim.done.wait()
        del self.clusters[fs_id]

        self._expunge(fs_id)

    def stop(self):
        # Stop cluster monitors first, then the manager's own threads.
        log.info("%s stopping" % self.__class__.__name__)
        for monitor in self.clusters.values():
            monitor.stop()
        self._rpc_thread.stop()
        self._discovery_thread.stop()
        self._process_monitor.stop()
        self.eventer.stop()
        self._request_ticker.stop()

    def _expunge(self, fsid):
        # Drop all persisted sync objects for this FSID.  No-op when
        # persistence is unavailable.
        if sqlalchemy is None:
            return
        session = Session()
        session.query(SyncObject).filter_by(fsid=fsid).delete()
        session.commit()

    def _recover(self):
        # Rebuild in-memory state (servers, services, cluster monitors,
        # latest sync objects) from the database after a restart.
        # NOTE(review): start() below does not call _recover; presumably
        # the process entry point invokes it — confirm.
        if sqlalchemy is None:
            return
        session = Session()
        for server in session.query(Server).all():
            log.debug("Recovered server %s" % server.fqdn)
            assert server.boot_time is None or server.boot_time.tzinfo is not None  # expect timezone-aware DB backend
            self.servers.inject_server(ServerState(
                fqdn=server.fqdn,
                hostname=server.hostname,
                managed=server.managed,
                last_contact=server.last_contact,
                boot_time=server.boot_time,
                ceph_version=server.ceph_version
            ))
        for service in session.query(Service).all():
            if service.server:
                server = session.query(Server).get(service.server)
            else:
                server = None
            log.debug("Recovered service %s/%s/%s on %s" % (
                service.fsid, service.service_type, service.service_id,
                server.fqdn if server else None
            ))
            self.servers.inject_service(ServiceState(
                fsid=service.fsid,
                service_type=service.service_type,
                service_id=service.service_id
            ), server.fqdn if server else None)

        # I want the most recent version of every sync_object
        fsids = [(row[0], row[1]) for row in
                 session.query(SyncObject.fsid,
                               SyncObject.cluster_name).distinct(SyncObject.fsid)]
        for fsid, name in fsids:
            cluster_monitor = ClusterMonitor(fsid, name, self.persister,
                                             self.servers, self.eventer,
                                             self.requests)
            self.clusters[fsid] = cluster_monitor

            object_types = [row[0] for row in
                            session.query(SyncObject.sync_type).filter_by(fsid=fsid).distinct()]
            for sync_type in object_types:
                latest_record = session.query(SyncObject).filter_by(
                    fsid=fsid, sync_type=sync_type).order_by(
                    SyncObject.version.desc(), SyncObject.when.desc())[0]

                # FIXME: bit of a hack because records persisted only store their 'version'
                # if it's a real counter version, underlying problem is that we have
                # underlying data (health, pg_brief) without usable version counters.
                def md5(raw):
                    hasher = hashlib.md5()
                    hasher.update(raw)
                    return hasher.hexdigest()

                if latest_record.version:
                    version = latest_record.version
                else:
                    version = md5(latest_record.data)

                when = latest_record.when
                when = when.replace(tzinfo=tzutc())
                if cluster_monitor.update_time is None or when > cluster_monitor.update_time:
                    cluster_monitor.update_time = when
                cluster_monitor.inject_sync_object(None, sync_type, version,
                                                   msgpack.unpackb(latest_record.data))

        for monitor in self.clusters.values():
            log.info("Recovery: Cluster %s with update time %s" %
                     (monitor.fsid, monitor.update_time))
            monitor.start()

    def start(self):
        # Start the manager's worker threads; per-cluster monitors are
        # started separately (on discovery or recovery).
        log.info("%s starting" % self.__class__.__name__)
        self._rpc_thread.bind()
        self._rpc_thread.start()
        self._discovery_thread.start()
        self._process_monitor.start()
        self.persister.start()
        self.eventer.start()
        self._request_ticker.start()
        self.servers.start()
        return True

    def join(self):
        # Block until every thread and cluster monitor has exited.
        log.info("%s joining" % self.__class__.__name__)
        self._rpc_thread.join()
        self._discovery_thread.join()
        self._process_monitor.join()
        self.persister.join()
        self.eventer.join()
        self._request_ticker.join()
        self.servers.join()
        for monitor in self.clusters.values():
            monitor.join()

    def on_discovery(self, minion_id, heartbeat_data):
        # First heartbeat from an unknown FSID: spin up a ClusterMonitor
        # for it and hand the heartbeat over.
        log.info("on_discovery: {0}/{1}".format(minion_id,
                                                heartbeat_data['fsid']))
        cluster_monitor = ClusterMonitor(heartbeat_data['fsid'],
                                         heartbeat_data['name'],
                                         self.persister, self.servers,
                                         self.eventer, self.requests)
        self.clusters[heartbeat_data['fsid']] = cluster_monitor

        # Run before passing on the heartbeat, because otherwise the
        # syncs resulting from the heartbeat might not be received
        # by the monitor.
        cluster_monitor.start()
        # Wait for ClusterMonitor to start accepting events before asking it
        # to do anything
        cluster_monitor.ready()
        cluster_monitor.on_heartbeat(minion_id, heartbeat_data)
class Manager(object):
    """
    Manage a collection of ClusterMonitors.

    Subscribe to ceph/cluster events, and create a ClusterMonitor
    for any FSID we haven't seen before.
    """

    def __init__(self):
        # Persistence is optional: without sqlalchemy we fall back to a
        # no-op NullPersister.
        self._complete = gevent.event.Event()
        self._rpc_thread = RpcThread(self)
        self._discovery_thread = TopLevelEvents(self)
        self._process_monitor = ProcessMonitorThread()
        self.notifier = NotificationThread()
        if sqlalchemy is not None:
            try:
                # Prepare persistence
                engine = create_engine(config.get('cthulhu', 'db_path'))
                Session.configure(bind=engine)
                self.persister = Persister()
            except sqlalchemy.exc.ArgumentError as e:
                log.error("Database error: %s" % e)
                raise
        else:
            class NullPersister(object):
                # Stand-in persister: lifecycle methods are no-ops, any
                # other public attribute access yields a do-nothing
                # callable.
                def start(self):
                    pass

                def stop(self):
                    pass

                def join(self):
                    pass

                def __getattribute__(self, item):
                    if item.startswith('_'):
                        return object.__getattribute__(self, item)
                    else:
                        try:
                            return object.__getattribute__(self, item)
                        except AttributeError:
                            def blackhole(*args, **kwargs):
                                pass
                            return blackhole

            self.persister = NullPersister()

        # Remote operations
        self.requests = RequestCollection(self)
        self._request_ticker = Ticker(request_collection.TICK_PERIOD,
                                      lambda: self.requests.tick())

        # FSID to ClusterMonitor
        self.clusters = {}

        # Generate events on state changes
        self.eventer = Eventer(self)

        # Handle all ceph/server messages
        self.servers = ServerMonitor(self.persister, self.eventer,
                                     self.requests)

    def delete_cluster(self, fs_id):
        """
        Note that the cluster will pop right back again if it's
        still sending heartbeats.
        """
        victim = self.clusters[fs_id]
        victim.stop()
        victim.done.wait()
        del self.clusters[fs_id]

        self._expunge(fs_id)

    def stop(self):
        # Stop cluster monitors first, then the manager's own threads.
        log.info("%s stopping" % self.__class__.__name__)
        for monitor in self.clusters.values():
            monitor.stop()
        self._rpc_thread.stop()
        self._discovery_thread.stop()
        self._process_monitor.stop()
        self.notifier.stop()
        self.eventer.stop()
        self._request_ticker.stop()

    def _expunge(self, fsid):
        # Drop all persisted sync objects for this FSID.
        # Fix: guard like _recover does — on the NullPersister path
        # (sqlalchemy unavailable) Session is unconfigured and would
        # raise here.
        if sqlalchemy is None:
            return
        session = Session()
        session.query(SyncObject).filter_by(fsid=fsid).delete()
        session.commit()

    def _recover(self):
        # Rebuild in-memory state (servers, services, cluster monitors,
        # latest sync objects) from the database after a restart.
        if sqlalchemy is None:
            return
        session = Session()
        for server in session.query(Server).all():
            log.debug("Recovered server %s" % server.fqdn)
            assert server.boot_time is None or server.boot_time.tzinfo is not None  # expect timezone-aware DB backend
            self.servers.inject_server(
                ServerState(fqdn=server.fqdn,
                            hostname=server.hostname,
                            managed=server.managed,
                            last_contact=server.last_contact,
                            boot_time=server.boot_time,
                            ceph_version=server.ceph_version))
        for service in session.query(Service).all():
            if service.server:
                server = session.query(Server).get(service.server)
            else:
                server = None
            log.debug("Recovered service %s/%s/%s on %s" %
                      (service.fsid, service.service_type,
                       service.service_id,
                       server.fqdn if server else None))
            self.servers.inject_service(
                ServiceState(fsid=service.fsid,
                             service_type=service.service_type,
                             service_id=service.service_id),
                server.fqdn if server else None)

        # I want the most recent version of every sync_object
        fsids = [(row[0], row[1]) for row in session.query(
            SyncObject.fsid,
            SyncObject.cluster_name).distinct(SyncObject.fsid)
        ]
        for fsid, name in fsids:
            cluster_monitor = ClusterMonitor(fsid, name, self.notifier,
                                             self.persister, self.servers,
                                             self.eventer, self.requests)
            self.clusters[fsid] = cluster_monitor

            object_types = [
                row[0]
                for row in session.query(SyncObject.sync_type).filter_by(
                    fsid=fsid).distinct()
            ]
            for sync_type in object_types:
                latest_record = session.query(SyncObject).filter_by(
                    fsid=fsid,
                    sync_type=sync_type).order_by(SyncObject.version.desc(),
                                                  SyncObject.when.desc())[0]

                # FIXME: bit of a hack because records persisted only store their 'version'
                # if it's a real counter version, underlying problem is that we have
                # underlying data (health, pg_brief) without usable version counters.
                def md5(raw):
                    hasher = hashlib.md5()
                    hasher.update(raw)
                    return hasher.hexdigest()

                if latest_record.version:
                    version = latest_record.version
                else:
                    version = md5(latest_record.data)

                when = latest_record.when
                when = when.replace(tzinfo=tzutc())
                if cluster_monitor.update_time is None or when > cluster_monitor.update_time:
                    cluster_monitor.update_time = when
                cluster_monitor.inject_sync_object(
                    None, sync_type, version,
                    msgpack.unpackb(latest_record.data))

        for monitor in self.clusters.values():
            log.info("Recovery: Cluster %s with update time %s" %
                     (monitor.fsid, monitor.update_time))
            monitor.start()

    def start(self):
        log.info("%s starting" % self.__class__.__name__)

        # Before we start listening to the outside world, recover
        # our last known state from persistent storage
        try:
            self._recover()
        except:
            log.exception("Recovery failed")
            os._exit(-1)

        self._rpc_thread.bind()
        self._rpc_thread.start()
        self._discovery_thread.start()
        self._process_monitor.start()
        self.notifier.start()
        self.persister.start()
        self.eventer.start()
        self._request_ticker.start()
        self.servers.start()

    def join(self):
        # Block until every thread and cluster monitor has exited.
        log.info("%s joining" % self.__class__.__name__)
        self._rpc_thread.join()
        self._discovery_thread.join()
        self._process_monitor.join()
        self.notifier.join()
        self.persister.join()
        self.eventer.join()
        self._request_ticker.join()
        self.servers.join()
        for monitor in self.clusters.values():
            monitor.join()

    def on_discovery(self, minion_id, heartbeat_data):
        # First heartbeat from an unknown FSID: spin up a ClusterMonitor
        # for it and hand the heartbeat over.
        log.info("on_discovery: {0}/{1}".format(minion_id,
                                                heartbeat_data['fsid']))
        cluster_monitor = ClusterMonitor(heartbeat_data['fsid'],
                                         heartbeat_data['name'],
                                         self.notifier, self.persister,
                                         self.servers, self.eventer,
                                         self.requests)
        self.clusters[heartbeat_data['fsid']] = cluster_monitor

        # Run before passing on the heartbeat, because otherwise the
        # syncs resulting from the heartbeat might not be received
        # by the monitor.
        cluster_monitor.start()
        # Wait for ClusterMonitor to start accepting events before asking it
        # to do anything
        cluster_monitor.ready()
        cluster_monitor.on_heartbeat(minion_id, heartbeat_data)