示例#1
0
    def on_fetch_complete(self, minion_id, sync_type, version, data):
        """
        Handle the arrival of a fetched copy of a map.

        Clears the in-flight fetch marker for ``sync_type``, advances the
        highest known version if this copy exceeds it, stores the copy when
        it is newer than the one currently held, and requests another fetch
        from ``minion_id`` if a still newer version is known about.

        :param minion_id: Identifier of the minion the data came from
        :param sync_type: Map type; supplies ``str`` and ``cmp(a, b)``
        :param version: Version of the copy that was fetched
        :param data: Content of the copy that was fetched
        :return A SyncObject if this version was new to us, else None
        """
        log.debug("SyncObjects.on_fetch_complete %s/%s/%s" %
                  (minion_id, sync_type.str, version))

        # The outstanding request for this map type has been answered
        self._fetching_at[sync_type] = None

        # The reply can carry a later version than the one we asked for
        beyond_known = sync_type.cmp(version,
                                     self._known_versions[sync_type]) > 0
        if beyond_known:
            self._known_versions[sync_type] = version

        # Keep the copy only when it supersedes what we already hold
        if sync_type.cmp(version, self.get_version(sync_type)) > 0:
            log.info("Got new version %s/%s" % (sync_type.str, version))
            stored = self.set_map(sync_type, version, data)
        else:
            log.warn("Ignoring outdated update %s/%s from %s" %
                     (sync_type.str, version, minion_id))
            stored = None

        # Still behind the newest version we have heard of: ask again
        # immediately
        if sync_type.cmp(self._known_versions[sync_type], version) > 0:
            self.fetch(minion_id, sync_type)

        return stored
示例#2
0
    def _run(self):
        """
        Idle until the completion event is set, then call ``_close()``.

        Each pass waits on the completion event, passing ``MONITOR_PERIOD``.
        The call to ``_emit_stats()`` is currently disabled, so a pass does
        no other work.
        """
        log.info("Running {0}".format(self.__class__.__name__))

        while True:
            if self._complete.is_set():
                break
            # Statistics emission is disabled: self._emit_stats()
            self._complete.wait(self.MONITOR_PERIOD)

        self._close()
示例#3
0
    def _emit_stats(self):
        """
        Send this process's resource usage counters to carbon.

        Opens a TCP connection to ``CARBON_HOST``:``CARBON_PORT`` when no
        socket is held yet, then sends one
        ``calamari.rlyeh.ru_<field> <value> <timestamp>`` line per
        ``getrusage`` field in a single ``sendall``.

        Socket and ``resource`` errors are logged and followed by
        ``self._close()``; they are not raised to the caller.
        """
        try:
            if not self._socket:
                log.info("Opening carbon socket {0}:{1}".format(
                    self.CARBON_HOST, self.CARBON_PORT))
                self._socket = socket.socket(socket.AF_INET,
                                             socket.SOCK_STREAM)
                self._socket.connect((self.CARBON_HOST, self.CARBON_PORT))

            t = int(time.time())
            usage = resource.getrusage(resource.RUSAGE_SELF)
            lines = []
            for usage_field in ("utime", "stime", "maxrss", "ixrss", "idrss",
                                "isrss", "minflt", "majflt", "nswap",
                                "inblock", "oublock", "msgsnd", "msgrcv",
                                "nsignals", "nvcsw", "nivcsw"):
                val = getattr(usage, "ru_{0}".format(usage_field))
                log.debug("{0}: {1}".format(usage_field, val))
                lines.append("calamari.rlyeh.ru_{0} {1} {2}\n".format(
                    usage_field, val, t))

            self._socket.sendall("".join(lines))
        except (socket.error, resource.error):
            # The exception types must be a parenthesised tuple: the old
            # "except A, B:" form caught only A and bound the exception
            # object to B.  socket.error is the base class of
            # socket.gaierror, so connect and send failures are handled
            # here as well as address resolution failures.
            log.exception("Failed to send debugging statistics")
            self._close()
示例#4
0
文件: manager.py 项目: ceph/calamari
    def _run(self):
        """
        Wait for the completion event, then call ``_close()``.

        The loop re-checks the completion event after every wait, and each
        wait is given ``MONITOR_PERIOD``.  ``_emit_stats()`` is not called
        at present (the call is disabled).
        """
        class_name = self.__class__.__name__
        log.info("Running {0}".format(class_name))

        while True:
            if self._complete.is_set():
                break
            # Disabled: self._emit_stats()
            self._complete.wait(self.MONITOR_PERIOD)

        self._close()
示例#5
0
 def stop(self):
     """
     Call ``stop()`` on every cluster monitor and then on each service.

     The services are stopped in a fixed order: RPC thread, discovery
     thread, process monitor, eventer, request ticker.
     """
     class_name = self.__class__.__name__
     log.info("%s stopping" % class_name)

     for cluster_monitor in self.clusters.values():
         cluster_monitor.stop()

     services = (self._rpc_thread, self._discovery_thread,
                 self._process_monitor, self.eventer,
                 self._request_ticker)
     for service in services:
         service.stop()
示例#6
0
文件: manager.py 项目: ceph/calamari
 def stop(self):
     """
     Stop the cluster monitors first, then the remaining components.

     Calls ``stop()`` on each value of ``self.clusters`` and afterwards on
     the RPC thread, discovery thread, process monitor, eventer and request
     ticker, in that order.
     """
     log.info("%s stopping" % self.__class__.__name__)

     for cluster_monitor in self.clusters.values():
         cluster_monitor.stop()

     for component in (self._rpc_thread,
                       self._discovery_thread,
                       self._process_monitor,
                       self.eventer,
                       self._request_ticker):
         component.stop()
示例#7
0
文件: manager.py 项目: ceph/calamari
    def _run(self):
        """
        Listen for remote events using this object's callbacks.

        Obtains the remote via ``get_remote()`` and calls its ``listen``
        with the completion event plus handlers for heartbeats, jobs and
        running-jobs responses.  Logs before the call and after it returns.
        """
        class_name = self.__class__.__name__
        log.info("%s running" % class_name)

        listener = get_remote()
        callbacks = dict(
            on_heartbeat=self.on_heartbeat,
            on_job=self.on_job,
            on_running_jobs=self._manager.requests.on_tick_response,
        )
        listener.listen(self._complete, **callbacks)

        log.info("%s complete" % class_name)
示例#8
0
    def _run(self):
        """
        Hand this object's event handlers to the remote's ``listen`` call.

        ``listen`` receives the completion event together with
        ``on_heartbeat``, ``on_job`` and the manager's request collection's
        ``on_tick_response``.  A log line is written on entry and once
        ``listen`` has returned.
        """
        log.info("%s running" % self.__class__.__name__)

        event_source = get_remote()
        running_jobs_handler = self._manager.requests.on_tick_response
        event_source.listen(self._complete,
                            on_heartbeat=self.on_heartbeat,
                            on_job=self.on_job,
                            on_running_jobs=running_jobs_handler)

        log.info("%s complete" % self.__class__.__name__)
示例#9
0
文件: manager.py 项目: ceph/calamari
 def join(self):
     """
     Call ``join()`` on each service and then on every cluster monitor.

     Services are joined in a fixed order: RPC thread, discovery thread,
     process monitor, persister, eventer, request ticker, servers.
     """
     class_name = self.__class__.__name__
     log.info("%s joining" % class_name)

     services = (self._rpc_thread, self._discovery_thread,
                 self._process_monitor, self.persister, self.eventer,
                 self._request_ticker, self.servers)
     for service in services:
         service.join()

     for cluster_monitor in self.clusters.values():
         cluster_monitor.join()
示例#10
0
 def join(self):
     """
     Join the owned components, finishing with the cluster monitors.

     ``join()`` is called on the RPC thread, discovery thread, process
     monitor, persister, eventer, request ticker and servers, in that
     order, and then on each value of ``self.clusters``.
     """
     log.info("%s joining" % self.__class__.__name__)

     for component in (self._rpc_thread,
                       self._discovery_thread,
                       self._process_monitor,
                       self.persister,
                       self.eventer,
                       self._request_ticker,
                       self.servers):
         component.join()

     for cluster_monitor in self.clusters.values():
         cluster_monitor.join()
示例#11
0
    def _run(self):
        """
        Run the server, retrying after errors, until the completion event is set.

        Asserts that ``self._bound`` is truthy before entering the loop.
        Whenever ``self._server.run()`` raises an ``Exception`` the
        traceback is logged and the loop waits on the completion event,
        passing ``EXCEPTION_BACKOFF``, before checking the event again.
        """
        assert self._bound

        while not self._complete.is_set():
            try:
                log.info("%s run..." % self.__class__.__name__)
                self._server.run()
            except Exception:
                # Not a bare "except:": SystemExit, KeyboardInterrupt and
                # other BaseException-only exits must propagate instead of
                # being logged and retried.
                log.error(traceback.format_exc())
                self._complete.wait(self.EXCEPTION_BACKOFF)

        log.info("%s complete..." % self.__class__.__name__)
示例#12
0
    def start(self):
        """
        Bind the RPC thread, then start every owned service.

        Start order: RPC thread, discovery thread, process monitor,
        persister, eventer, request ticker, servers.

        :return True, always
        """
        class_name = self.__class__.__name__
        log.info("%s starting" % class_name)

        # bind() is called on the RPC thread before it is started
        rpc_thread = self._rpc_thread
        rpc_thread.bind()
        rpc_thread.start()

        remaining = (self._discovery_thread, self._process_monitor,
                     self.persister, self.eventer, self._request_ticker,
                     self.servers)
        for service in remaining:
            service.start()

        return True
示例#13
0
文件: manager.py 项目: ceph/calamari
    def start(self):
        """
        Start the owned components after binding the RPC thread.

        ``bind()`` and ``start()`` are called on the RPC thread, followed by
        ``start()`` on the discovery thread, process monitor, persister,
        eventer, request ticker and servers, in that order.

        :return True, always
        """
        log.info("%s starting" % self.__class__.__name__)

        self._rpc_thread.bind()
        for component in (self._rpc_thread,
                          self._discovery_thread,
                          self._process_monitor,
                          self.persister,
                          self.eventer,
                          self._request_ticker,
                          self.servers):
            component.start()

        return True
示例#14
0
文件: manager.py 项目: ceph/calamari
    def on_discovery(self, minion_id, heartbeat_data):
        """
        Create, register and start a ClusterMonitor, then pass it the heartbeat.

        :param minion_id: Identifier forwarded to the monitor's on_heartbeat
        :param heartbeat_data: Mapping read for its 'fsid' and 'name' entries
                               and forwarded to the monitor's on_heartbeat
        """
        fsid = heartbeat_data['fsid']
        log.info("on_discovery: {0}/{1}".format(minion_id, fsid))

        monitor = ClusterMonitor(fsid, heartbeat_data['name'],
                                 self.persister, self.servers,
                                 self.eventer, self.requests)
        self.clusters[fsid] = monitor

        # The monitor has to be running before it is given the heartbeat;
        # otherwise the syncs the heartbeat triggers might not reach it.
        monitor.start()
        # Block until the monitor is accepting events before asking
        # anything of it.
        monitor.ready()
        monitor.on_heartbeat(minion_id, heartbeat_data)
示例#15
0
    def _run(self):
        """
        Start the plugin monitor, signal readiness, then listen for events.

        After ``remote.listen`` returns, the plugin monitor is stopped and
        joined and the ``done`` event is set.
        """
        plugin_monitor = self._plugin_monitor
        plugin_monitor.start()

        self._ready.set()
        log.debug("ClusterMonitor._run: ready")

        listen_kwargs = dict(on_heartbeat=self.on_heartbeat,
                             fsid=self.fsid,
                             on_job=self.on_job_complete)
        remote.listen(self._complete, **listen_kwargs)

        log.info("%s complete" % self.__class__.__name__)

        # Shut the plugin monitor down before announcing completion
        plugin_monitor.stop()
        plugin_monitor.join()
        self.done.set()
示例#16
0
    def on_discovery(self, minion_id, heartbeat_data):
        """
        Set up a ClusterMonitor for the cluster named in a heartbeat.

        The monitor is stored in ``self.clusters`` under the heartbeat's
        'fsid', started, waited on via ``ready()``, and finally given the
        heartbeat itself.

        :param minion_id: Identifier passed through to on_heartbeat
        :param heartbeat_data: Mapping providing 'fsid' and 'name'
        """
        cluster_fsid = heartbeat_data['fsid']
        log.info("on_discovery: {0}/{1}".format(minion_id, cluster_fsid))

        new_monitor = ClusterMonitor(
            cluster_fsid, heartbeat_data['name'],
            self.persister, self.servers, self.eventer, self.requests)
        self.clusters[cluster_fsid] = new_monitor

        # Start first: syncs resulting from the heartbeat might otherwise
        # not be received by the monitor.
        new_monitor.start()
        # Do not ask the monitor to do anything until it accepts events.
        new_monitor.ready()
        new_monitor.on_heartbeat(minion_id, heartbeat_data)
示例#17
0
文件: manager.py 项目: ceph/calamari
    def _emit_stats(self):
        """
        Report ``getrusage`` counters for this process to carbon.

        A TCP socket to ``CARBON_HOST``:``CARBON_PORT`` is opened if none is
        held.  One ``calamari.rlyeh.ru_<field> <value> <timestamp>`` line is
        produced per usage field and the lot is sent with one ``sendall``.

        Socket and ``resource`` errors are logged and followed by
        ``self._close()``; they do not propagate to the caller.
        """
        usage_fields = ("utime", "stime", "maxrss", "ixrss", "idrss", "isrss",
                        "minflt", "majflt", "nswap", "inblock", "oublock",
                        "msgsnd", "msgrcv", "nsignals", "nvcsw", "nivcsw")
        try:
            if not self._socket:
                log.info("Opening carbon socket {0}:{1}".format(self.CARBON_HOST, self.CARBON_PORT))
                self._socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
                self._socket.connect((self.CARBON_HOST, self.CARBON_PORT))

            t = int(time.time())
            usage = resource.getrusage(resource.RUSAGE_SELF)
            lines = []
            for usage_field in usage_fields:
                val = getattr(usage, "ru_{0}".format(usage_field))
                log.debug("{0}: {1}".format(usage_field, val))
                lines.append("calamari.rlyeh.ru_{0} {1} {2}\n".format(usage_field, val, t))

            self._socket.sendall("".join(lines))
        except (socket.error, resource.error):
            # A parenthesised tuple is required here: "except A, B:" caught
            # only A and bound the exception object to B.  socket.error is
            # the base class of socket.gaierror, so connection and send
            # failures take this path too.
            log.exception("Failed to send debugging statistics")
            self._close()
示例#18
0
def main():
    """
    Command-line entry point: start a Manager and wait for a stop signal.

    Parses ``--debug`` (adds a stdout log handler), points salt's event
    module at the gevent flavour of ZMQ, applies the psycopg2 gevent patch
    when sqlalchemy is available, installs manhole when it is available,
    then starts a Manager and waits until SIGTERM or SIGINT sets the
    completion event.  The Manager is not stopped here.
    """
    parser = argparse.ArgumentParser(description='Calamari management service')
    parser.add_argument('--debug', dest='debug', action='store_true',
                        default=False, help='print log to stdout')
    options = parser.parse_args()

    if options.debug:
        stdout_handler = logging.StreamHandler(sys.stdout)
        stdout_handler.setFormatter(logging.Formatter(rlyeh.log.FORMAT))
        log.addHandler(stdout_handler)

    # Instruct salt to use the gevent version of ZMQ
    import zmq.green
    import salt.utils.event
    salt.utils.event.zmq = zmq.green

    if sqlalchemy is not None:
        # Set up gevent compatibility in psycopg2
        import psycogreen.gevent
        psycogreen.gevent.patch_psycopg()

    if manhole is not None:
        # Enable manhole for debugging.  Use oneshot mode
        # for gevent compatibility
        def report_to_log(message):
            log.info("MANHOLE: %s" % message)

        manhole.cry = report_to_log
        manhole.install(oneshot_on=signal.SIGUSR1)

    manager = Manager()
    manager.start()

    complete = gevent.event.Event()

    def shutdown():
        log.info("Signal handler: stopping")
        complete.set()

    for signum in (signal.SIGTERM, signal.SIGINT):
        gevent.signal(signum, shutdown)

    while True:
        if complete.is_set():
            break
        complete.wait(timeout=1)
示例#19
0
文件: manager.py 项目: ceph/calamari
def main():
    """
    Run the management service until SIGTERM or SIGINT is received.

    Steps, in order: parse ``--debug`` and optionally log to stdout; make
    salt's event module use ``zmq.green``; patch psycopg2 for gevent if
    sqlalchemy is present; install manhole if it is present; create and
    start a Manager; register the signal handlers; wait for the completion
    event.  The Manager is not stopped before returning.
    """
    parser = argparse.ArgumentParser(description='Calamari management service')
    parser.add_argument(
        '--debug',
        dest='debug',
        action='store_true',
        default=False,
        help='print log to stdout',
    )
    cli_args = parser.parse_args()

    if cli_args.debug:
        formatter = logging.Formatter(rlyeh.log.FORMAT)
        handler = logging.StreamHandler(sys.stdout)
        handler.setFormatter(formatter)
        log.addHandler(handler)

    # Instruct salt to use the gevent version of ZMQ
    import zmq.green
    import salt.utils.event
    salt.utils.event.zmq = zmq.green

    if sqlalchemy is not None:
        # Set up gevent compatibility in psycopg2
        import psycogreen.gevent
        psycogreen.gevent.patch_psycopg()

    if manhole is not None:
        # Enable manhole for debugging.  Use oneshot mode
        # for gevent compatibility
        def log_manhole_message(message):
            log.info("MANHOLE: %s" % message)

        manhole.cry = log_manhole_message
        manhole.install(oneshot_on=signal.SIGUSR1)

    service = Manager()
    service.start()

    complete = gevent.event.Event()

    def shutdown():
        log.info("Signal handler: stopping")
        complete.set()

    for stop_signal in (signal.SIGTERM, signal.SIGINT):
        gevent.signal(stop_signal, shutdown)

    while True:
        if complete.is_set():
            break
        complete.wait(timeout=1)
示例#20
0
    def on_version(self, reported_by, sync_type, new_version):
        """
        Notify me that a particular version of a particular map exists.

        I may choose to initiate RPC to retrieve the map: a fetch is sent to
        ``reported_by`` when ``new_version`` is newer than the version held,
        unless an earlier fetch for this map type started less than
        ``FETCH_TIMEOUT`` ago.

        :param reported_by: Who reported the version; the fetch is sent there
        :param sync_type: Map type; supplies ``str`` and ``cmp(a, b)``
        :param new_version: The version reported to exist
        """
        log.debug("SyncObjects.on_version %s/%s/%s" %
                  (reported_by, sync_type.str, new_version))

        old_version = self.get_version(sync_type)
        if sync_type.cmp(new_version, old_version) <= 0:
            # Nothing newer than what we already hold
            return

        known_version = self._known_versions[sync_type]
        if sync_type.cmp(new_version, known_version) > 0:
            # We are out of date: request an up to date copy
            log.info("Advanced known version %s/%s %s->%s" %
                     (self._cluster_name, sync_type.str, known_version,
                      new_version))
            self._known_versions[sync_type] = new_version
        else:
            log.info("on_version: %s is newer than %s" %
                     (new_version, old_version))

        # A request for this map type may already be outstanding; it is
        # only abandoned once it has been pending for FETCH_TIMEOUT.
        fetch_started_at = self._fetching_at[sync_type]
        if fetch_started_at is not None:
            if now() - fetch_started_at < self.FETCH_TIMEOUT:
                log.info("Fetch already underway for %s" % sync_type.str)
                return
            log.warn("Abandoning fetch for %s started at %s" %
                     (sync_type.str, fetch_started_at))

        log.info(
            "on_version: fetching %s/%s from %s, currently got %s, know %s"
            % (sync_type, new_version, reported_by, old_version,
               known_version))
        self.fetch(reported_by, sync_type)
示例#21
0
文件: manager.py 项目: ceph/calamari
    def _recover(self):
        """
        Rebuild in-memory state from the database.

        Injects every stored server and service into ``self.servers``,
        creates a ClusterMonitor for each fsid found among the stored sync
        objects, injects the latest record of each sync type into it, and
        finally starts every monitor in ``self.clusters``.  Returns
        immediately when sqlalchemy is unavailable.
        """
        if sqlalchemy is None:
            return

        session = Session()

        for server in session.query(Server).all():
            log.debug("Recovered server %s" % server.fqdn)
            # expect timezone-aware DB backend
            assert server.boot_time is None or server.boot_time.tzinfo is not None
            server_state = ServerState(
                fqdn=server.fqdn,
                hostname=server.hostname,
                managed=server.managed,
                last_contact=server.last_contact,
                boot_time=server.boot_time,
                ceph_version=server.ceph_version,
            )
            self.servers.inject_server(server_state)

        for service in session.query(Service).all():
            server = session.query(Server).get(service.server) if service.server else None
            server_fqdn = server.fqdn if server else None
            log.debug("Recovered service %s/%s/%s on %s" % (
                service.fsid, service.service_type, service.service_id, server_fqdn
            ))
            service_state = ServiceState(
                fsid=service.fsid,
                service_type=service.service_type,
                service_id=service.service_id,
            )
            self.servers.inject_service(service_state, server_fqdn)

        # I want the most recent version of every sync_object
        cluster_rows = session.query(SyncObject.fsid, SyncObject.cluster_name).distinct(SyncObject.fsid)
        fsids = [(row[0], row[1]) for row in cluster_rows]
        for fsid, name in fsids:
            cluster_monitor = ClusterMonitor(fsid, name, self.persister, self.servers,
                                             self.eventer, self.requests)
            self.clusters[fsid] = cluster_monitor

            type_rows = session.query(SyncObject.sync_type).filter_by(fsid=fsid).distinct()
            for sync_type in [row[0] for row in type_rows]:
                newest_first = session.query(SyncObject).filter_by(
                    fsid=fsid, sync_type=sync_type).order_by(
                    SyncObject.version.desc(), SyncObject.when.desc())
                latest_record = newest_first[0]

                # FIXME: bit of a hack because records persisted only store their 'version'
                # if it's a real counter version, underlying problem is that we have
                # underlying data (health, pg_brief) without usable version counters.
                # Records without a version use the MD5 hex digest of their data.
                version = latest_record.version or hashlib.md5(latest_record.data).hexdigest()

                when = latest_record.when.replace(tzinfo=tzutc())
                if cluster_monitor.update_time is None or when > cluster_monitor.update_time:
                    cluster_monitor.update_time = when

                payload = msgpack.unpackb(latest_record.data)
                cluster_monitor.inject_sync_object(None, sync_type, version, payload)

        for recovered in self.clusters.values():
            log.info("Recovery: Cluster %s with update time %s" % (recovered.fsid, recovered.update_time))
            recovered.start()
示例#22
0
    def _recover(self):
        """
        Restore servers, services and cluster monitors from the database.

        Does nothing when sqlalchemy is unavailable.  Otherwise each stored
        server and service is injected into ``self.servers``; one
        ClusterMonitor is created per fsid among the stored sync objects and
        given the latest record of every sync type; then every monitor in
        ``self.clusters`` is started.
        """
        if sqlalchemy is None:
            return

        def md5_hex(raw):
            # Hex MD5 digest, used as the version of records lacking one
            hasher = hashlib.md5()
            hasher.update(raw)
            return hasher.hexdigest()

        session = Session()

        for server in session.query(Server).all():
            log.debug("Recovered server %s" % server.fqdn)
            # expect timezone-aware DB backend
            assert server.boot_time is None or server.boot_time.tzinfo is not None
            self.servers.inject_server(ServerState(
                fqdn=server.fqdn,
                hostname=server.hostname,
                managed=server.managed,
                last_contact=server.last_contact,
                boot_time=server.boot_time,
                ceph_version=server.ceph_version,
            ))

        for service in session.query(Service).all():
            server = None
            if service.server:
                server = session.query(Server).get(service.server)
            host_fqdn = server.fqdn if server else None

            log.debug("Recovered service %s/%s/%s on %s" %
                      (service.fsid, service.service_type, service.service_id,
                       host_fqdn))
            self.servers.inject_service(
                ServiceState(fsid=service.fsid,
                             service_type=service.service_type,
                             service_id=service.service_id),
                host_fqdn)

        # I want the most recent version of every sync_object
        cluster_query = session.query(
            SyncObject.fsid, SyncObject.cluster_name).distinct(SyncObject.fsid)
        fsids = [(row[0], row[1]) for row in cluster_query]

        for fsid, name in fsids:
            cluster_monitor = ClusterMonitor(fsid, name, self.persister,
                                             self.servers, self.eventer,
                                             self.requests)
            self.clusters[fsid] = cluster_monitor

            type_query = session.query(SyncObject.sync_type).filter_by(
                fsid=fsid).distinct()
            object_types = [row[0] for row in type_query]

            for sync_type in object_types:
                ordered = session.query(SyncObject).filter_by(
                    fsid=fsid, sync_type=sync_type).order_by(
                        SyncObject.version.desc(), SyncObject.when.desc())
                latest_record = ordered[0]

                # FIXME: bit of a hack because records persisted only store their 'version'
                # if it's a real counter version, underlying problem is that we have
                # underlying data (health, pg_brief) without usable version counters.
                version = latest_record.version
                if not version:
                    version = md5_hex(latest_record.data)

                when = latest_record.when.replace(tzinfo=tzutc())
                if cluster_monitor.update_time is None or when > cluster_monitor.update_time:
                    cluster_monitor.update_time = when

                cluster_monitor.inject_sync_object(
                    None, sync_type, version,
                    msgpack.unpackb(latest_record.data))

        for started_monitor in self.clusters.values():
            log.info("Recovery: Cluster %s with update time %s" %
                     (started_monitor.fsid, started_monitor.update_time))
            started_monitor.start()
示例#23
0
文件: manager.py 项目: ceph/calamari
 def shutdown():
     """Log that the signal handler is stopping and set ``complete``.

     ``complete`` is not defined in this function; it is taken from the
     enclosing scope.
     """
     log.info("Signal handler: stopping")
     complete.set()
示例#24
0
 def shutdown():
     """Write the "stopping" log line, then set the ``complete`` event.

     ``complete`` comes from the enclosing scope rather than being passed
     in.
     """
     log.info("Signal handler: stopping")
     complete.set()
示例#25
0
 def bind(self):
     """
     Bind the server to the configured RPC URL and record that it is bound.

     The URL is read from the 'rpc_url' option of the 'rlyeh' section.
     """
     class_name = self.__class__.__name__
     log.info("%s bind..." % class_name)

     rpc_url = config.get('rlyeh', 'rpc_url')
     self._server.bind(rpc_url)
     self._bound = True
示例#26
0
文件: rpc.py 项目: ceph/calamari
    def stop(self):
        """
        Set the completion event and stop the server if there is one.
        """
        class_name = self.__class__.__name__
        log.info("%s stopping" % class_name)

        self._complete.set()

        server = self._server
        if server:
            server.stop()
示例#27
0
 def stop(self):
     """Log the stop request and set the completion event."""
     class_name = self.__class__.__name__
     log.info("%s stopping" % class_name)
     self._complete.set()