Exemplo n.º 1
0
    def __init__(self):
        self._complete = gevent.event.Event()

        self._rpc_thread = RpcThread(self)
        self._discovery_thread = DiscoveryThread(self)
        self._process_monitor = ProcessMonitorThread()

        self.notifier = NotificationThread()
        try:
            # Prepare persistence
            engine = create_engine(config.get('cthulhu', 'db_path'))
            Session.configure(bind=engine)

            self.persister = Persister()
        except sqlalchemy.exc.ArgumentError as e:
            log.error("Database error: %s" % e)
            raise

        # FSID to ClusterMonitor
        self.clusters = {}

        # Generate events on state changes
        self.eventer = Eventer(self)

        # Handle all ceph/server messages
        self.servers = ServerMonitor(self.persister, self.eventer)
Exemplo n.º 2
0
def main():

    log.info('calamari-list: starting')
    complete = gevent.event.Event()
    ceph_argparse = None
    while not ceph_argparse:
        try:
            import ceph_argparse
        except ImportError:
            log.error('Cannot import ceph_arg_parse module -- please install ceph')
            complete.wait(timeout=50)

    from cthulhu.manager.manager import Manager

    carbon = ShallowCarbonCache()
    carbon.start()

    cthulhu = Manager()
    cthulhu_started = False

    while not cthulhu_started:
        try:
            if not cthulhu_started:
                cthulhu_started = cthulhu.start()

        except Exception, e:
            log.exception('It borked')
            log.error(str(e))
            complete.wait(timeout=5)
Exemplo n.º 3
0
    def on_map(self, sync_type, sync_object):
        if self._awaiting_pgs:
            assert sync_type == PgSummary
            pg_summary = sync_object
            pgs_not_creating = 0
            for state_tuple, count in pg_summary.data['by_pool'][self._pool_id].items():
                states = state_tuple.split("+")
                if 'creating' not in states:
                    pgs_not_creating += count

            if pgs_not_creating >= self._pg_count:
                self.complete()

        elif self._await_version:
            assert sync_type == OsdMap
            osd_map = sync_object
            if osd_map.version >= self._await_version:
                for pool_id, pool in osd_map.pools_by_id.items():
                    if pool['pool_name'] == self._pool_name:
                        self._pool_id = pool_id
                        self._pg_count = pool['pg_num']
                        break

                if self._pool_id is None:
                    log.error("'{0}' not found, pools are {1}".format(
                        self._pool_name, [p['pool_name'] for p in osd_map.pools_by_id.values()]
                    ))
                    self.set_error("Expected pool '{0}' not found".format(self._pool_name))
                    self.complete()

                self._awaiting_pgs = True
        else:
            raise NotImplementedError("Unexpected map {0}".format(sync_type))
Exemplo n.º 4
0
    def on_map(self, sync_type, sync_objects):
        """
        Callback for when a new cluster map is available, in which
        we notify any interested ongoing UserRequests of the new map
        so that they can progress if they were waiting for it.
        """
        with self._lock:
            requests = self.get_all(state=UserRequest.SUBMITTED)
            for request in requests:
                try:
                    # If this is one of the types that this request
                    # is waiting for, invoke on_map.
                    for awaited_type in request.awaiting_versions.keys():
                        if awaited_type == sync_type:
                            with self._update_index(request):
                                request.on_map(sync_type, sync_objects)
                except Exception as e:
                    log.exception("Request %s threw exception in on_map", request.id)
                    if request.jid:
                        log.error("Abandoning job {0}".format(request.jid))
                        request.jid = None

                    request.set_error("Internal error %s" % e)
                    request.complete()

                if request.state == UserRequest.COMPLETE:
                    self._eventer.on_user_request_complete(request)
Exemplo n.º 5
0
 def create(self, attributes):
     # get the text map
     crush_map = self.osd_map.data['crush_map_text']
     merged_map = _merge_rule_and_map(crush_map, attributes)
     commands = [('osd setcrushmap', {'data': merged_map})]
     log.error('setcrushmap {0} {1}'.format(merged_map, attributes))
     message = "Creating CRUSH rule in {cluster_name}".format(cluster_name=self._cluster_monitor.name)
     return OsdMapModifyingRequest(message, self._cluster_monitor.fsid, self._cluster_monitor.name, commands)
Exemplo n.º 6
0
 def __init__(self, manager):
     super(RpcThread, self).__init__()
     self._manager = manager
     self._complete = gevent.event.Event()
     if zerorpc is None:
         log.error("zerorpc package is missing")
         raise RuntimeError("Cannot run without zerorpc installed!")
     self._server = zerorpc.Server(RpcInterface(manager))
     self._bound = False
Exemplo n.º 7
0
def dump_stacks():
    """
    This is for use in debugging, especially using manhole
    """
    for ob in gc.get_objects():
        if not isinstance(ob, greenlet.greenlet):
            continue
        if not ob:
            continue
        log.error(''.join(traceback.format_stack(ob.gr_frame)))
Exemplo n.º 8
0
    def __init__(self):
        self._complete = gevent.event.Event()

        self._rpc_thread = RpcThread(self)
        self._discovery_thread = TopLevelEvents(self)
        self._process_monitor = ProcessMonitorThread()

        db_path = config.get('cthulhu', 'db_path')
        if sqlalchemy is not None and db_path:
            try:
                # Prepare persistence
                engine = create_engine(config.get('cthulhu', 'db_path'))  # noqa
                Session.configure(bind=engine)

                self.persister = Persister()
            except sqlalchemy.exc.ArgumentError as e:
                log.error("Database error: %s" % e)
                raise
        else:
            class NullPersister(object):
                def start(self):
                    pass

                def stop(self):
                    pass

                def join(self):
                    pass

                def __getattribute__(self, item):
                    if item.startswith('_'):
                        return object.__getattribute__(self, item)
                    else:
                        try:
                            return object.__getattribute__(self, item)
                        except AttributeError:
                            def blackhole(*args, **kwargs):
                                pass
                            return blackhole

            self.persister = NullPersister()

        # Remote operations
        self.requests = RequestCollection(self)
        self._request_ticker = Ticker(request_collection.TICK_PERIOD,
                                      lambda: self.requests.tick())

        # FSID to ClusterMonitor
        self.clusters = {}

        # Generate events on state changes
        self.eventer = Eventer(self)

        # Handle all ceph/server messages
        self.servers = ServerMonitor(self.persister, self.eventer, self.requests)
Exemplo n.º 9
0
    def _run(self):
        assert self._bound

        while not self._complete.is_set():
            try:
                log.info("%s run..." % self.__class__.__name__)
                self._server.run()
            except:
                log.error(traceback.format_exc())
                self._complete.wait(self.EXCEPTION_BACKOFF)

        log.info("%s complete..." % self.__class__.__name__)
Exemplo n.º 10
0
 def fail_all(self, failed_minion):
     """
     For use when we lose contact with the minion that was in use for running
     requests: assume all these requests are never going to return now.
     """
     for request in self.get_all(UserRequest.SUBMITTED):
         with self._update_index(request):
             request.set_error("Lost contact with server %s" % failed_minion)
             if request.jid:
                 log.error("Giving up on JID %s" % request.jid)
                 request.jid = None
             request.complete()
Exemplo n.º 11
0
    def on_job_complete(self, fqdn, jid, success, result, cmd, args):
        # It would be much nicer to put the FSID at the start of
        # the tag, if salt would only let us add custom tags to our jobs.
        # Instead we enforce a convention that calamari jobs include
        # fsid in their return value.
        if 'fsid' not in result or result['fsid'] != self.fsid:
            # Something for a different ClusterMonitor
            log.debug("Ignoring job return, not for my FSID")
            return

        if cmd == 'ceph.get_cluster_object':
            # A ceph.get_cluster_object response
            if not success:
                log.error("on_sync_object: failure from %s: %s" % (fqdn, result))
                return

            self.on_sync_object(fqdn, result)
        else:
            log.warning("Unexpected function '%s' (%s)" % (cmd, cmd))
Exemplo n.º 12
0
    def tick(self):
        """
        For walltime-based monitoring of running requests.  Long-running requests
        get a periodic call to saltutil.running to verify that things really
        are still happening.
        """

        if not self._by_jid:
            return
        else:
            log.debug("RequestCollection.tick: %s JIDs underway" % len(self._by_jid))

        # Identify JIDs who haven't had a saltutil.running reponse for too long.
        # Kill requests in a separate phase because request:JID is not 1:1
        stale_jobs = set()
        _now = now()
        for request in self._by_jid.values():
            if _now - request.alive_at > datetime.timedelta(seconds=TICK_PERIOD * 3):
                log.error("Request %s JID %s stale: now=%s, alive_at=%s" % (
                    request.id, request.jid, _now, request.alive_at
                ))
                stale_jobs.add(request)

        # Any identified stale jobs are errored out.
        for request in stale_jobs:
            with self._update_index(request):
                request.set_error("Lost contact")
                request.jid = None
                request.complete()

        # Identify minions associated with JIDs in flight
        query_minions = set()
        for jid, request in self._by_jid.items():
            query_minions.add(request.minion_id)

        # Attempt to emit a saltutil.running to ping jobs, next tick we
        # will see if we got updates to the alive_at attribute to indicate non-staleness
        if query_minions:
            log.info("RequestCollection.tick: sending saltutil.running to {0}".format(query_minions))
            client = LocalClient(config.get('cthulhu', 'salt_config_path'))
            pub_data = client.run_job(list(query_minions), 'saltutil.running', [], expr_form="list")
            if not pub_data:
                log.warning("Failed to publish saltutil.running to {0}".format(query_minions))
Exemplo n.º 13
0
    def fetch(self, minion_id, sync_type):
        log.debug("SyncObjects.fetch: %s/%s" % (minion_id, sync_type))
        if minion_id is None:
            # We're probably being replayed to from the database
            log.warn("SyncObjects.fetch called with minion_id=None")
            return

        self._fetching_at[sync_type] = now()
        client = LocalClient(config.get('cthulhu', 'salt_config_path'))
        # TODO clean up unused 'since' argument
        pub_data = client.run_job(minion_id, 'ceph.get_cluster_object',
                                  condition_kwarg([], {'cluster_name': self._cluster_name,
                                                       'sync_type': sync_type.str,
                                                       'since': None}))
        if not pub_data:
            log.error("Failed to start fetch job %s/%s" % (minion_id, sync_type))
            # Don't throw an exception because if a fetch fails we should always
        else:
            log.debug("SyncObjects.fetch: jid=%s minions=%s" % (pub_data['jid'], pub_data['minions']))
Exemplo n.º 14
0
    def fetch(self, minion_id, sync_type):
        log.debug("SyncObjects.fetch: %s/%s" % (minion_id, sync_type))
        if minion_id is None:
            # We're probably being replayed to from the database
            log.warn("SyncObjects.fetch called with minion_id=None")
            return

        self._fetching_at[sync_type] = now()
        try:
            # TODO clean up unused 'since' argument
            jid = remote.run_job(minion_id, 'ceph.get_cluster_object',
                                 {'cluster_name': self._cluster_name,
                                  'sync_type': sync_type.str,
                                  'since': None})
        except Unavailable:
            # Don't throw an exception because if a fetch fails we should end up
            # issuing another on next heartbeat
            log.error("Failed to start fetch job %s/%s" % (minion_id, sync_type))
        else:
            log.debug("SyncObjects.fetch: jid=%s" % jid)
Exemplo n.º 15
0
    def _run(self):
        self._plugin_monitor.start()

        self._ready.set()
        log.debug("ClusterMonitor._run: ready")

        event = SaltEventSource(log, salt_config)

        while not self._complete.is_set():
            # No salt tag filtering: https://github.com/saltstack/salt/issues/11582
            ev = event.get_event(full=True)

            if ev is not None:
                data = ev['data']
                tag = ev['tag']
                log.debug("_run.ev: %s/tag=%s" % (data['id'] if 'id' in data else None, tag))

                # I am interested in the following tags:
                # - salt/job/<jid>/ret/<minion id> where jid is one that I started
                #   (this includes ceph.rados_command and ceph.get_cluster_object)
                # - ceph/cluster/<fsid> where fsid is my fsid

                try:
                    if tag.startswith("ceph/cluster/{0}".format(self.fsid)):
                        # A ceph.heartbeat beacon
                        self.on_heartbeat(data['id'], data['data'])
                    elif re.match("^salt/job/\d+/ret/[^/]+$", tag):
                        if data['fun'] == "saltutil.running":
                            # Update on what jobs are running
                            # It would be nice to filter these down to those which really are for
                            # this cluster, but as long as N_clusters and N_jobs are reasonably small
                            # it's not an efficiency problem.
                            self._requests.on_tick_response(data['id'], data['return'])

                        # It would be much nicer to put the FSID at the start of
                        # the tag, if salt would only let us add custom tags to our jobs.
                        # Instead we enforce a convention that all calamari jobs must include
                        # fsid in their return value.
                        if (not isinstance(data, dict)) or not isinstance(data['return'], dict):
                            # Something not formatted for ClusterMonitor
                            log.warning("Ignoring event %s" % tag)
                            continue

                        if 'fsid' not in data['return'] or data['return']['fsid'] != self.fsid:
                            # Something for a different ClusterMonitor
                            log.debug("Ignoring job return, not for my FSID")
                            continue

                        if data['fun'] == 'ceph.get_cluster_object':
                            # A ceph.get_cluster_object response
                            if not data['success']:
                                log.error("on_sync_object: failure from %s: %s" % (data['id'], data['return']))
                                continue

                            self.on_sync_object(data['id'], data['return'])
                        else:
                            log.warning("Unexpected function '%s' (%s)" % (data['fun'], tag))
                    else:
                        # This does not concern us, ignore it
                        pass
                except:
                    # Because this is our main event handling loop, swallow exceptions
                    # instead of letting them end the world.
                    log.exception("Exception handling message with tag %s" % tag)
                    log.debug("Message content: %s" % data)

        log.info("%s complete" % self.__class__.__name__)
        self._plugin_monitor.stop()
        self._plugin_monitor.join()
        self.done.set()
Exemplo n.º 16
0
    def on_completion(self, data):
        """
        Callback for when a salt/job/<jid>/ret event is received, in which
        we find the UserRequest that created the job, and inform it of
        completion so that it can progress.
        """
        with self._lock:
            jid = data['jid']
            result = data['return']
            log.debug("on_completion: jid=%s data=%s" % (jid, data))

            try:
                request = self.get_by_jid(jid)
                log.debug("on_completion: jid %s belongs to request %s" % (jid, request.id))
            except KeyError:
                log.warning("on_completion: unknown jid {0}".format(jid))
                return

            if not data['success']:
                # This indicates a failure at the salt level, i.e. job threw an exception
                log.error("Remote execution failed for request %s: %s" % (request.id, result))
                if isinstance(result, dict):
                    # Handler ran and recorded an error for us
                    request.set_error(result['error_status'])
                else:
                    # An exception, probably, stringized by salt for us
                    request.set_error(result)
                request.complete()
            elif result['error']:
                # This indicates a failure within ceph.rados_commands which was caught
                # by our code, like one of our Ceph commands returned an error code.
                # NB in future there may be UserRequest subclasses which want to receive
                # and handle these errors themselves, so this branch would be refactored
                # to allow that.
                log.error("Request %s experienced an error: %s" % (request.id, result['error_status']))
                request.jid = None
                request.set_error(result['error_status'])
                request.complete()
            else:
                if request.state != UserRequest.SUBMITTED:
                    # Unexpected, ignore.
                    log.error("Received completion for request %s/%s in state %s" % (
                        request.id, request.jid, request.state
                    ))
                    return

                try:
                    with self._update_index(request):
                        old_jid = request.jid
                        request.complete_jid(result)
                        assert request.jid != old_jid

                        # After a jid completes, requests may start waiting for cluster
                        # map updates, we ask ClusterMonitor to hurry up and get them on
                        # behalf of the request.
                        if request.awaiting_versions:
                            for sync_type, version in request.awaiting_versions.items():
                                if version is not None:
                                    log.debug("Notifying SyncObjects of awaited version %s/%s" % (sync_type.str, version))
                                    self._sync_objects.on_version(data['id'], sync_type, version)

                            # The request may be waiting for an epoch that we already have, if so
                            # give it to the request right away
                            for sync_type, want_version in request.awaiting_versions.items():
                                got_version = self._sync_objects.get_version(sync_type)
                                if want_version and sync_type.cmp(got_version, want_version) >= 0:
                                    log.info("Awaited %s %s is immediately available" % (sync_type, want_version))
                                    request.on_map(sync_type, self._sync_objects)

                except Exception as e:
                    # Ensure that a misbehaving piece of code in a UserRequest subclass
                    # results in a terminated job, not a zombie job
                    log.exception("Calling complete_jid for %s/%s" % (request.id, request.jid))
                    request.jid = None
                    request.set_error("Internal error %s" % e)
                    request.complete()

        if request.state == UserRequest.COMPLETE:
            self._eventer.on_user_request_complete(request)
Exemplo n.º 17
0
    def custom_handle_error(self, context, type, value, tb):
        if not issubclass(type, Hub.SYSTEM_ERROR + Hub.NOT_ERROR):
            log.error("Uncaught exception", exc_info=(type, value, tb))

        self._origin_handle_error(context, type, value, tb)
Exemplo n.º 18
0
CONTACT_THRESHOLD_FACTOR = int(config.get('cthulhu', 'server_timeout_factor'))  # multiple of contact period
CLUSTER_CONTACT_THRESHOLD = int(config.get('cthulhu', 'cluster_contact_threshold'))  # in seconds

MINION_CONFIG = str(config.get('cthulhu', 'salt_config_path')).replace('master', 'minion')
EMIT_EVENTS_TO_SALT_EVENT_BUS = bool(strtobool(config.get('cthulhu', 'emit_events_to_salt_event_bus')))
EVENT_TAG_PREFIX = str(config.get('cthulhu', 'event_tag_prefix'))


if EMIT_EVENTS_TO_SALT_EVENT_BUS:
    try:
        # TODO move this to import
        # from calamari_common import Caller
        import salt.client
    except ImportError as e:
        EMIT_EVENTS_TO_SALT_EVENT_BUS = False
        log.error("Could not import salt.client: %s. Events cannot be emitted to salt event bus", str(e))


class Event(object):
    def __init__(self, severity, message, **associations):
        self.severity = severity
        self.message = message
        self.associations = associations
        self.when = now()


class Eventer(gevent.greenlet.Greenlet):
    """
    I listen to changes from ClusterMonitor and ServerMonitor, and feed
    events into the event log.  I also periodically check some time-based
    conditions in my on_tick method.
Exemplo n.º 19
0
def handle_exception(exc_type, exc_value, exc_traceback):
    log.error("Uncaught exception", exc_info=(exc_type, exc_value, exc_traceback))