def __init__(self):
    self._complete = gevent.event.Event()

    self._rpc_thread = RpcThread(self)
    self._discovery_thread = TopLevelEvents(self)
    self._process_monitor = ProcessMonitorThread()

    db_path = config.get('cthulhu', 'db_path')
    if sqlalchemy is not None and db_path:
        try:
            # Prepare persistence
            engine = create_engine(config.get('cthulhu', 'db_path'))  # noqa
            Session.configure(bind=engine)

            self.persister = Persister()
        except sqlalchemy.exc.ArgumentError as e:
            log.error("Database error: %s" % e)
            raise
    else:
        class NullPersister(object):
            def start(self):
                pass

            def stop(self):
                pass

            def join(self):
                pass

            def __getattribute__(self, item):
                if item.startswith('_'):
                    return object.__getattribute__(self, item)
                else:
                    try:
                        return object.__getattribute__(self, item)
                    except AttributeError:
                        def blackhole(*args, **kwargs):
                            pass
                        return blackhole

        self.persister = NullPersister()

    # Remote operations
    self.requests = RequestCollection(self)
    self._request_ticker = Ticker(request_collection.TICK_PERIOD,
                                  lambda: self.requests.tick())

    # FSID to ClusterMonitor
    self.clusters = {}

    # Generate events on state changes
    self.eventer = Eventer(self)

    # Handle all ceph/server messages
    self.servers = ServerMonitor(self.persister, self.eventer, self.requests)
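# NullPersister above is a null object: when no database is configured, any
# public attribute lookup that would otherwise fail resolves to a no-op
# "blackhole" callable, so the rest of the manager can call persister methods
# unconditionally.  A minimal standalone sketch of the same pattern (class and
# method names here are illustrative, not part of the original module):

class NullObject(object):
    def __getattribute__(self, item):
        if item.startswith('_'):
            return object.__getattribute__(self, item)
        try:
            return object.__getattribute__(self, item)
        except AttributeError:
            def blackhole(*args, **kwargs):
                pass
            return blackhole


persister = NullObject()
persister.save_events("anything")             # silently ignored
persister.update_server(fqdn="example.com")   # silently ignored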
def __init__(self):
    self._complete = gevent.event.Event()

    self._rpc_thread = RpcThread(self)
    self._discovery_thread = DiscoveryThread(self)
    self._process_monitor = ProcessMonitorThread()

    self.notifier = NotificationThread()
    try:
        # Prepare persistence
        engine = create_engine(config.get('cthulhu', 'db_path'))
        Session.configure(bind=engine)

        self.persister = Persister()
    except sqlalchemy.exc.ArgumentError as e:
        log.error("Database error: %s" % e)
        raise

    # FSID to ClusterMonitor
    self.clusters = {}

    # Generate events on state changes
    self.eventer = Eventer(self)

    # Handle all ceph/server messages
    self.servers = ServerMonitor(self.persister, self.eventer)
def cancel(self, request_id):
    """
    Immediately mark a request as cancelled, and in the background
    try and cancel any outstanding JID for it.
    """
    request = self._by_request_id[request_id]

    # Idempotent behaviour: no-op if already cancelled
    if request.state == request.COMPLETE:
        return

    with self._update_index(request):
        # I will take over cancelling the JID from the request
        cancel_jid = request.jid
        request.jid = None

        # Request is now done, no further calls
        request.set_error("Cancelled")
        request.complete()

        # In the background, try to cancel the request's JID on a
        # best-effort basis
        if cancel_jid:
            client = LocalClient(config.get('cthulhu', 'salt_config_path'))
            client.run_job(request.minion_id, 'saltutil.kill_job',
                           [cancel_jid])
def fetch(self, minion_id, sync_type):
    log.debug("SyncObjects.fetch: %s/%s" % (minion_id, sync_type))
    if minion_id is None:
        # We're probably being replayed to from the database
        log.warn("SyncObjects.fetch called with minion_id=None")
        return

    self._fetching_at[sync_type] = now()
    client = LocalClient(config.get('cthulhu', 'salt_config_path'))
    # TODO clean up unused 'since' argument
    pub_data = client.run_job(minion_id, 'ceph.get_cluster_object',
                              condition_kwarg([], {
                                  'cluster_name': self._cluster_name,
                                  'sync_type': sync_type.str,
                                  'since': None
                              }))
    if not pub_data:
        log.error("Failed to start fetch job %s/%s" % (minion_id, sync_type))
        # Don't throw an exception: if a fetch fails we should always carry
        # on and retry on a later attempt rather than crash.
    else:
        log.debug("SyncObjects.fetch: jid=%s minions=%s" %
                  (pub_data['jid'], pub_data['minions']))
def __init__(self):
    super(Persister, self).__init__()

    self._queue = gevent.queue.Queue()
    self._complete = gevent.event.Event()

    self._session = Session()

    # Plumb the sqlalchemy logger into our cthulhu logger's output
    logging.getLogger('sqlalchemy.engine').setLevel(
        logging.getLevelName(config.get('cthulhu', 'db_log_level')))
    for handler in log.handlers:
        logging.getLogger('sqlalchemy.engine').addHandler(handler)
def _submit(self):
    client = LocalClient(config.get('cthulhu', 'salt_config_path'))
    pub_data = client.run_job(self._minion_id, self._cmd, self._args)
    if not pub_data:
        # FIXME: LocalClient uses 'print' to record the
        # details of what went wrong :-(
        raise PublishError("Failed to publish job")

    self.log.info("Request %s started job %s" % (self.id, pub_data['jid']))
    self.alive_at = now()
    self.jid = pub_data['jid']

    return self.jid
def tick(self):
    """
    For walltime-based monitoring of running requests.  Long-running
    requests get a periodic call to saltutil.running to verify that
    things really are still happening.
    """
    if not self._by_jid:
        return
    else:
        log.debug("RequestCollection.tick: %s JIDs underway" % len(self._by_jid))

    # Identify JIDs which haven't had a saltutil.running response for too long.
    # Kill requests in a separate phase because request:JID is not 1:1
    stale_jobs = set()
    _now = now()
    for request in self._by_jid.values():
        if _now - request.alive_at > datetime.timedelta(seconds=TICK_PERIOD * 3):
            log.error("Request %s JID %s stale: now=%s, alive_at=%s" % (
                request.id, request.jid, _now, request.alive_at))
            stale_jobs.add(request)

    # Any identified stale jobs are errored out.
    for request in stale_jobs:
        with self._update_index(request):
            request.set_error("Lost contact")
            request.jid = None
            request.complete()

    # Identify minions associated with JIDs in flight
    query_minions = set()
    for jid, request in self._by_jid.items():
        query_minions.add(request.minion_id)

    # Attempt to emit a saltutil.running to ping jobs; next tick we will see
    # if we got updates to the alive_at attribute to indicate non-staleness
    if query_minions:
        log.info("RequestCollection.tick: sending saltutil.running to {0}".format(query_minions))
        client = LocalClient(config.get('cthulhu', 'salt_config_path'))
        pub_data = client.run_job(list(query_minions), 'saltutil.running', [],
                                  expr_form="list")
        if not pub_data:
            log.warning("Failed to publish saltutil.running to {0}".format(query_minions))
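# The staleness rule above is pure walltime arithmetic: a request is stale
# once more than three tick periods have passed without a saltutil.running
# response refreshing alive_at.  A small self-contained illustration
# (TICK_PERIOD = 10 as in these modules; the fixed timestamps are made up):

import datetime

TICK_PERIOD = 10  # seconds


def is_stale(alive_at, _now):
    # Stale after 3 missed tick periods (30 seconds here)
    return _now - alive_at > datetime.timedelta(seconds=TICK_PERIOD * 3)


_now = datetime.datetime(2014, 1, 1, 12, 0, 0)
print(is_stale(_now - datetime.timedelta(seconds=29), _now))  # False: within window
print(is_stale(_now - datetime.timedelta(seconds=31), _now))  # True: lost contact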
def clear(args):
    if not args.yes_i_am_sure:
        log.warn("This will remove all stored Calamari monitoring status and "
                 "history.  Use '--yes-i-am-sure' to proceed")
        return

    log.info("Loading configuration...")
    config = CalamariConfig()

    log.info("Dropping tables")
    db_path = config.get('cthulhu', 'db_path')
    engine = create_engine(db_path)
    Base.metadata.drop_all(engine)
    Base.metadata.reflect(engine)
    if ALEMBIC_TABLE in Base.metadata.tables:
        Base.metadata.tables[ALEMBIC_TABLE].drop(engine)
    log.info("Complete.  Now run `%s initialize`" % os.path.basename(sys.argv[0]))
def update_connected_minions():
    from cthulhu.manager import config
    from calamari_common.salt_wrapper import Key, master_config

    if len(Key(master_config(config.get('cthulhu', 'salt_config_path'))).list_keys()['minions']) == 0:
        # no minions to update
        return

    message = "Updating already connected nodes."
    log.info(message)
    p = subprocess.Popen(["salt", "*", "state.highstate"],
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    out, err = p.communicate()
    log.debug("{message} salt stdout: {out}".format(message=message, out=out))
    log.debug("{message} salt stderr: {err}".format(message=message, err=err))
    if p.returncode != 0:
        raise RuntimeError("{message} failed with rc={rc}".format(message=message, rc=p.returncode))
def _submit(self, commands):
    self.log.debug("Request._submit: %s/%s/%s" % (self._minion_id, self._cluster_name, commands))

    client = LocalClient(config.get('cthulhu', 'salt_config_path'))
    pub_data = client.run_job(self._minion_id, 'ceph.rados_commands',
                              [self._fsid, self._cluster_name, commands])
    if not pub_data:
        # FIXME: LocalClient uses 'print' to record the
        # details of what went wrong :-(
        raise PublishError("Failed to publish job")

    self.log.info("Request %s started job %s" % (self.id, pub_data['jid']))
    self.alive_at = now()
    self.jid = pub_data['jid']

    return self.jid
def cancel(self, request_id):
    """
    Immediately mark a request as cancelled, and in the background
    try and cancel any outstanding JID for it.
    """
    request = self._by_request_id[request_id]
    with self._update_index(request):
        request.set_error("Cancelled")
        request.complete()

        if request.jid:
            client = LocalClient(config.get('cthulhu', 'salt_config_path'))
            client.run_job(request.minion_id, 'saltutil.kill_job', [request.jid])
            # We don't check for completion or errors from kill_job; it's a
            # best-effort thing.  If we're cancelling something we will do
            # our best to kill any subprocess but can't make any guarantees,
            # because running nodes may be out of touch with the calamari
            # server.
            request.jid = None
def on_tick(self):
    # This procedure is to catch the annoying case of AES key changes (#7836), which are otherwise
    # ignored by minions which are doing only minion->master messaging.  To ensure they
    # pick up on key changes, we actively send them something (doesn't matter what).  To
    # avoid doing this constantly, we only send things to minions which seem to be a little
    # late.

    # After this length of time, doubt a minion enough to send it a message in case
    # it needs a kick to update its key
    def _ping_period(fqdn):
        return datetime.timedelta(seconds=self.get_contact_period(fqdn) * 2)

    t = now()
    late_servers = [s.fqdn for s in self.servers.values()
                    if s.last_contact and (t - s.last_contact) > _ping_period(s.fqdn)]
    log.debug("late servers: %s" % late_servers)
    if late_servers:
        client = LocalClient(config.get('cthulhu', 'salt_config_path'))
        pub = client.pub(late_servers, "test.ping", expr_form='list')
        log.debug(pub)
def _submit(self, commands=None):
    if commands is None:
        commands = self._commands

    self.log.debug("%s._submit: %s/%s/%s" % (self.__class__.__name__,
                                             self._minion_id,
                                             self._cluster_name,
                                             commands))

    client = LocalClient(config.get('cthulhu', 'salt_config_path'))
    pub_data = client.run_job(self._minion_id, 'ceph.rados_commands',
                              [self.fsid, self._cluster_name, commands])
    if not pub_data:
        # FIXME: LocalClient uses 'print' to record the
        # details of what went wrong :-(
        raise PublishError("Failed to publish job")

    self.log.info("Request %s started job %s" % (self.id, pub_data['jid']))
    self.alive_at = now()
    self.jid = pub_data['jid']

    return self.jid
def load_plugins(self):
    """
    Try to load a status_processor from each module in plugin_path, store keyed by module_name
    """
    loaded_plugins = []

    # FIXME this assumes that plugin_path has been added to PYTHONPATH and/or is in site-packages
    plugin_path = config.get('cthulhu', 'plugin_path')

    if os.path.exists(plugin_path):
        for plugin in os.listdir(plugin_path):
            plugin = plugin.split('.')[0]
            if plugin in ('__init__', 'README'):
                continue

            status_processor = None
            try:
                plugin_module = importlib.import_module('.'.join((plugin, 'status_processor')))
                status_processor = plugin_module.StatusProcessor()
            except ImportError as e:
                log.info("Error importing plugin %s %s" % (plugin, str(e)))

            if status_processor is not None:
                loaded_plugins.append((plugin, status_processor))

    return loaded_plugins
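# For a plugin to load, load_plugins above expects an importable package named
# after the plugin directory, containing a status_processor submodule that
# defines a StatusProcessor class.  A minimal sketch of such a module (the
# package name "myplugin" and the method body are hypothetical; only the
# module name "status_processor" and the class name "StatusProcessor" are
# what the loader looks for):
#
#   <plugin_path>/myplugin/__init__.py
#   <plugin_path>/myplugin/status_processor.py

class StatusProcessor(object):
    """Stub instantiated by load_plugins via plugin_module.StatusProcessor()."""

    def run(self, data):
        # A real processor would compute plugin status from the passed-in
        # data; this stub just returns it unchanged.
        return data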
from gevent import event
import salt.utils.event
import salt.utils.master
from salt.client import LocalClient

from cthulhu.gevent_util import nosleep
from cthulhu.log import log as cthulhu_log
from cthulhu.manager import salt_config, config
# The type name for hosts and osds in the CRUSH map (if users have their
# own crush map they may have changed this), Ceph defaults are 'host' and 'osd'
from calamari_common.types import OsdMap, MonMap, ServiceId
from cthulhu.persistence.servers import Server, Service
from cthulhu.util import now, SaltEventSource

CRUSH_HOST_TYPE = config.get('cthulhu', 'crush_host_type')
CRUSH_OSD_TYPE = config.get('cthulhu', 'crush_osd_type')

TICK_PERIOD = 10

# Ignore changes in boot time below this threshold, to avoid mistaking clock
# adjustments for reboots.
REBOOT_THRESHOLD = datetime.timedelta(seconds=10)

# getChild isn't in 2.6
log = logging.getLogger('.'.join((cthulhu_log.name, 'server_monitor')))


class GrainsNotFound(Exception):
    pass
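# The effect of REBOOT_THRESHOLD: a change in a server's reported boot time
# only counts as a reboot if it exceeds the threshold, so small clock
# adjustments (NTP steps and the like) are ignored.  An illustrative check,
# not taken from the original module:

import datetime

REBOOT_THRESHOLD = datetime.timedelta(seconds=10)


def rebooted(old_boot_time, new_boot_time):
    return abs(new_boot_time - old_boot_time) > REBOOT_THRESHOLD


boot = datetime.datetime(2014, 1, 1, 8, 0, 0)
print(rebooted(boot, boot + datetime.timedelta(seconds=3)))   # False: clock drift
print(rebooted(boot, boot + datetime.timedelta(minutes=5)))   # True: real reboot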
from sqlalchemy.orm import sessionmaker

from cthulhu.manager import config
from cthulhu.persistence.sync_objects import SyncObject
from cthulhu.persistence.servers import Server, Service
from cthulhu.util import now
from cthulhu.log import log

Session = sessionmaker()

DeferredCall = namedtuple('DeferredCall', ['fn', 'args', 'kwargs'])

CLUSTER_MAP_RETENTION = datetime.timedelta(seconds=int(config.get('cthulhu', 'cluster_map_retention')))


class Persister(gevent.greenlet.Greenlet):
    """
    Asynchronously persist a queue of updates.  This is for use by classes
    that maintain the primary copy of state in memory, but also lazily
    update the DB so that they can recover from it on restart.
    """

    def __init__(self):
        super(Persister, self).__init__()

        self._queue = gevent.queue.Queue()
        self._complete = gevent.event.Event()

        self._session = Session()
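# DeferredCall is the unit of work on the persister's queue: callers record a
# function plus its arguments instead of executing it, and the greenlet's run
# loop applies the calls against the database session later.  A minimal
# standalone sketch of that producer/consumer shape (the list, function and
# hostname here are illustrative stand-ins, not the original code):

from collections import namedtuple

DeferredCall = namedtuple('DeferredCall', ['fn', 'args', 'kwargs'])

pending = []  # stand-in for the gevent.queue.Queue used above


def save_server(name):
    print("persisting server %s" % name)


# Producer side: record the call instead of executing it immediately
pending.append(DeferredCall(save_server, ('node1.example.com',), {}))

# Consumer side: the Persister greenlet drains the queue like this
for call in pending:
    call.fn(*call.args, **call.kwargs)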
def _salt_key(self):
    return Key(master_config(config.get('cthulhu', 'salt_config_path')))
def initialize(args):
    """
    This command exists to:

    - Prevent the user having to type more than one thing
    - Prevent the user seeing internals like 'manage.py' which we would
      rather people were not messing with on production systems.
    """
    log.info("Loading configuration...")
    config = CalamariConfig()

    # Generate django's SECRET_KEY setting
    # Do this first, otherwise subsequent django ops will raise ImproperlyConfigured.
    # Write into a file instead of directly, so that package upgrades etc won't
    # spuriously prompt for modified config unless it really is modified.
    if not os.path.exists(config.get('calamari_web', 'secret_key_path')):
        chars = 'abcdefghijklmnopqrstuvwxyz0123456789!@#$%^&*(-_=+)'
        open(config.get('calamari_web', 'secret_key_path'), 'w').write(
            get_random_string(50, chars))

    run_local_salt(sls=RELAX_SALT_PERMS_SLS, message='salt')
    run_local_salt(sls=POSTGRES_SLS, message='postgres')

    # Cthulhu's database
    db_path = config.get('cthulhu', 'db_path')
    engine = create_engine(db_path)
    Base.metadata.reflect(engine)
    alembic_config = AlembicConfig()
    if ALEMBIC_TABLE in Base.metadata.tables:
        log.info("Updating database...")
        # Database already populated, migrate forward
        command.upgrade(alembic_config, "head")
    else:
        log.info("Initializing database...")
        # Blank database, do initial population
        Base.metadata.create_all(engine)
        command.stamp(alembic_config, "head")

    os.environ.setdefault("DJANGO_SETTINGS_MODULE", "calamari_web.settings")

    # Django's database
    with quiet():
        execute_from_command_line(["", "syncdb", "--noinput"])

    create_default_roles()
    create_admin_users(args)

    log.info("Initializing web interface...")
    # Django's static files
    with quiet():
        execute_from_command_line(["", "collectstatic", "--noinput"])

    # Because we've loaded Django, it will have written log files as
    # this user (probably root).  Fix it so that apache can write them later.
    apache_user = pwd.getpwnam(config.get('calamari_web', 'username'))
    os.chown(config.get('calamari_web', 'log_path'),
             apache_user.pw_uid, apache_user.pw_gid)

    # Handle SQLite case, otherwise no chown is needed
    if config.get('calamari_web', 'db_engine').endswith("sqlite3"):
        os.chown(config.get('calamari_web', 'db_name'),
                 apache_user.pw_uid, apache_user.pw_gid)

    # Start services, configure to run on boot
    run_local_salt(sls=SERVICES_SLS, message='services')

    # During an upgrade: update minions that were connected previously
    update_connected_minions()

    # Signal supervisor to restart cthulhu as we have created its database
    log.info("Restarting services...")
    subprocess.call(['supervisorctl', 'restart', 'cthulhu'],
                    stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    # TODO: optionally generate or install HTTPS certs + hand to apache
    log.info("Complete.")
def bind(self):
    log.info("%s bind..." % self.__class__.__name__)
    self._server.bind(config.get('cthulhu', 'rpc_url'))
    self._bound = True
def get_server_log(self, fqdn, log_path, lines):
    client = LocalClient(config.get('cthulhu', 'salt_config_path'))
    results = client.cmd(fqdn, "log_tail.tail", [log_path, lines])
    return results
from gevent import greenlet
from gevent import event

from cthulhu.gevent_util import nosleep
from cthulhu.log import log as cthulhu_log
from cthulhu.manager import salt_config, config
# The type name for hosts and osds in the CRUSH map (if users have their
# own crush map they may have changed this), Ceph defaults are 'host' and 'osd'
from calamari_common.types import OsdMap, MonMap, ServiceId
from calamari_common.salt_wrapper import SaltEventSource, MasterPillarUtil
from cthulhu.persistence.servers import Server, Service
from cthulhu.util import now

CRUSH_HOST_TYPE = config.get('cthulhu', 'crush_host_type')
CRUSH_OSD_TYPE = config.get('cthulhu', 'crush_osd_type')

# Ignore changes in boot time below this threshold, to avoid mistaking clock
# adjustments for reboots.
REBOOT_THRESHOLD = datetime.timedelta(seconds=10)

# getChild isn't in 2.6
log = logging.getLogger('.'.join((cthulhu_log.name, 'server_monitor')))


class GrainsNotFound(Exception):
    pass
def list_server_logs(self, fqdn):
    client = LocalClient(config.get('cthulhu', 'salt_config_path'))
    results = client.cmd(fqdn, "log_tail.list_logs", ["."])
    log.debug('list_server_logs result: {results}'.format(results=str(results)))
    return results
def salt_client(self):
    return salt.client.LocalClient(config.get('cthulhu', 'salt_config_path'))
from calamari_common.salt_wrapper import condition_kwarg, LocalClient, SaltEventSource

from cthulhu.gevent_util import nosleep, nosleep_mgr
from cthulhu.log import log
from cthulhu.manager.crush_node_request_factory import CrushNodeRequestFactory
from cthulhu.manager.crush_request_factory import CrushRequestFactory
from cthulhu.manager.osd_request_factory import OsdRequestFactory
from cthulhu.manager.pool_request_factory import PoolRequestFactory
from cthulhu.manager.plugin_monitor import PluginMonitor
from calamari_common.types import CRUSH_NODE, CRUSH_MAP, SYNC_OBJECT_STR_TYPE, SYNC_OBJECT_TYPES, OSD, POOL, OsdMap, MdsMap, MonMap
from cthulhu.manager import config, salt_config
from cthulhu.util import now

FAVORITE_TIMEOUT_FACTOR = int(config.get('cthulhu', 'favorite_timeout_factor'))


class ClusterUnavailable(Exception):
    pass


class SyncObjects(object):
    """
    A collection of versioned objects, keyed by their class (which must be
    a SyncObject subclass).

    The objects are immutable, so it is safe to hand out references: new
    versions are new objects.
    """
from cthulhu.manager import config
from cthulhu.util import now
from distutils.util import strtobool

# The tick handler is very cheap (no I/O) so we call
# it quite frequently.
TICK_SECONDS = 10

# The time-based checks don't kick in until after
# a grace period, to avoid generating complaints
# about "stale" timestamps immediately after startup
GRACE_PERIOD = 30

# How long must a [server|cluster] be out of contact before
# we generate an event?
CONTACT_THRESHOLD_FACTOR = int(config.get('cthulhu', 'server_timeout_factor'))  # multiple of contact period
CLUSTER_CONTACT_THRESHOLD = int(config.get('cthulhu', 'cluster_contact_threshold'))  # in seconds

MINION_CONFIG = str(config.get('cthulhu', 'salt_config_path')).replace('master', 'minion')
EMIT_EVENTS_TO_SALT_EVENT_BUS = bool(strtobool(config.get('cthulhu', 'emit_events_to_salt_event_bus')))
EVENT_TAG_PREFIX = str(config.get('cthulhu', 'event_tag_prefix'))

if EMIT_EVENTS_TO_SALT_EVENT_BUS:
    try:
        # TODO move this to import
        # from calamari_common import Caller
        import salt.client
    except ImportError as e:
        EMIT_EVENTS_TO_SALT_EVENT_BUS = False
        log.error("Could not import salt.client: %s. "
                  "Events cannot be emitted to salt event bus", str(e))
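# strtobool is what makes the emit_events_to_salt_event_bus config value
# tolerant of the usual spellings of booleans: distutils maps 'y', 'yes',
# 't', 'true', 'on' and '1' (case-insensitively) to 1, their negative
# counterparts to 0, and raises ValueError for anything else, which is why
# the result is wrapped in bool() above.  For example:

from distutils.util import strtobool

for value in ('yes', 'True', 'on', '1', 'no', 'off', '0'):
    print("%s -> %s" % (value, bool(strtobool(value))))

try:
    strtobool('maybe')
except ValueError:
    print("unrecognised truth value raises ValueError")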
from cthulhu.manager import config
from calamari_common.db.event import Event, ERROR, WARNING, RECOVERY, INFO, severity_str
from cthulhu.util import now

# The tick handler is very cheap (no I/O) so we call
# it quite frequently.
TICK_SECONDS = 10

# The time-based checks don't kick in until after
# a grace period, to avoid generating complaints
# about "stale" timestamps immediately after startup
GRACE_PERIOD = 30

# How long must a [server|cluster] be out of contact before
# we generate an event?
CONTACT_THRESHOLD_FACTOR = int(config.get('cthulhu', 'server_timeout_factor'))  # multiple of contact period
CLUSTER_CONTACT_THRESHOLD = int(config.get('cthulhu', 'cluster_contact_threshold'))  # in seconds


class Eventer(gevent.greenlet.Greenlet):
    """
    I listen to changes from ClusterMonitor and ServerMonitor, and feed
    events into the event log.  I also periodically check some time-based
    conditions in my on_tick method.
    """

    def __init__(self, manager):
        super(Eventer, self).__init__()

        self._manager = manager
        self._complete = gevent.event.Event()