class PBDaemon(ZenDaemon, pb.Referenceable):
    """
    Daemon base class that talks to ZenHub over Twisted Perspective Broker.

    It connects through a ReconnectingPBClientFactory, loads the services
    named in ``initialServices``, queues events in an EventQueueManager and
    flushes them through the 'EventService' service, publishes metrics, and
    periodically pings ZenHub to maintain a health-check signal file.
    """

    name = 'pbdaemon'
    initialServices = ['EventService']
    # Template only: __init__ gives every instance its own copy.
    heartbeatEvent = {'eventClass': Heartbeat}
    heartbeatTimeout = 60 * 3
    _customexitcode = 0
    _pushEventsDeferred = None
    _eventHighWaterMark = None
    _healthMonitorInterval = 30

    def __init__(self, noopts=0, keeproot=False, name=None):
        """
        Initialise daemon state, register the shutdown trigger and start the
        ZenHub health-check loop.

        @param noopts: passed through to ZenDaemon.__init__
        @param keeproot: passed through to ZenDaemon.__init__
        @param name: optional collector name overriding the class-level one
        """
        # if we were provided our collector name via the constructor instead
        # of via code, be sure to store it correctly.
        if name is not None:
            self.name = name
            self.mname = name

        try:
            ZenDaemon.__init__(self, noopts, keeproot)
        except IOError:
            import traceback
            self.log.critical(traceback.format_exc(0))
            sys.exit(1)

        self._thresholds = None
        self._threshold_notifier = None
        self.rrdStats = DaemonStats()
        self.lastStats = 0
        self.perspective = None
        self.services = {}
        self.eventQueueManager = EventQueueManager(self.options, self.log)
        self.startEvent = startEvent.copy()
        self.stopEvent = stopEvent.copy()
        # Copy the class-level template so the update below does not leak
        # this instance's component/device into other instances or subclasses.
        self.heartbeatEvent = dict(self.heartbeatEvent)
        details = dict(component=self.name, device=self.options.monitor)
        for evt in (self.startEvent, self.stopEvent, self.heartbeatEvent):
            evt.update(details)
        self.initialConnect = defer.Deferred()
        self.stopped = False
        self.counters = collections.Counter()
        self._pingedZenhub = None
        self._connectionTimeout = None
        self._publisher = None
        self._internal_publisher = None
        self._metric_writer = None
        self._derivative_tracker = None
        self._metrologyReporter = None

        # Add a shutdown trigger to send a stop event and flush the event
        # queue
        reactor.addSystemEventTrigger('before', 'shutdown', self._stopPbDaemon)

        # Set up a looping call to support the health check.
        self.healthMonitor = task.LoopingCall(self._checkZenHub)
        self.healthMonitor.start(self._healthMonitorInterval)

    def publisher(self):
        """
        Return the lazily created redis metric publisher.

        The host and port are taken from the ``redisUrl`` option. A missing
        or non-integer port is logged and replaced by
        ``publisher.defaultRedisPort``.
        """
        if not self._publisher:
            netloc = urlparse(self.options.redisUrl).netloc
            # partition() tolerates a URL without an explicit port; the empty
            # port string then takes the ValueError path below.
            host, _, port = netloc.partition(':')
            try:
                port = int(port)
            except ValueError:
                self.log.exception(
                    "redis url contains non-integer port "
                    "value %s, defaulting to %s",
                    port, publisher.defaultRedisPort)
                port = publisher.defaultRedisPort
            self._publisher = publisher.RedisListPublisher(
                host, port, self.options.metricBufferSize,
                channel=self.options.metricsChannel,
                maxOutstandingMetrics=self.options.maxOutstandingMetrics
            )
        return self._publisher

    def internalPublisher(self):
        """
        Return the lazily created HTTP publisher, or None when the
        CONTROLPLANE_CONSUMER_URL environment variable is not set.
        """
        if not self._internal_publisher:
            url = os.environ.get("CONTROLPLANE_CONSUMER_URL", None)
            username = os.environ.get("CONTROLPLANE_CONSUMER_USERNAME", "")
            password = os.environ.get("CONTROLPLANE_CONSUMER_PASSWORD", "")
            if url:
                self._internal_publisher = publisher.HttpPostPublisher(
                    username, password, url)
        return self._internal_publisher

    def metricWriter(self):
        """
        Return the lazily created metric writer.

        When CONTROLPLANE is "1" and an internal publisher is available, the
        writer is an AggregateMetricWriter that also forwards metrics tagged
        ``internal`` to the internal publisher. Otherwise it is the plain
        redis-backed MetricWriter.
        """
        if not self._metric_writer:
            metric_writer = MetricWriter(self.publisher())
            # Always fall back to the plain writer so this never returns None,
            # even when CONTROLPLANE is set but no consumer URL is configured.
            self._metric_writer = metric_writer
            if os.environ.get("CONTROLPLANE", "0") == "1":
                internal_publisher = self.internalPublisher()
                if internal_publisher:
                    def internal_metric_filter(metric, value, timestamp, tags):
                        return tags and tags.get("internal", False)

                    internal_metric_writer = FilteredMetricWriter(
                        internal_publisher, internal_metric_filter)
                    self._metric_writer = AggregateMetricWriter(
                        [metric_writer, internal_metric_writer])
        return self._metric_writer

    def derivativeTracker(self):
        """Return the lazily created DerivativeTracker."""
        if not self._derivative_tracker:
            self._derivative_tracker = DerivativeTracker()
        return self._derivative_tracker

    def connecting(self):
        """
        Called when about to connect to zenhub
        """
        self.log.info("Attempting to connect to zenhub")

    def getZenhubInstanceId(self):
        """
        Called after we connected to zenhub.

        Asks the perspective for its hub instance id and logs the answer.
        @return: the deferred of the remote call
        """
        def callback(result):
            self.log.info("Connected to the zenhub/%s instance", result)

        def errback(result):
            self.log.info(
                "Unexpected error appeared while getting zenhub "
                "instance number %s", result)

        d = self.perspective.callRemote('getHubInstanceId')
        d.addCallback(callback)
        d.addErrback(errback)
        return d

    def gotPerspective(self, perspective):
        """
        This gets called every time we reconnect.

        @parameter perspective: Twisted perspective object
        @type perspective: Twisted perspective object
        """
        self.perspective = perspective
        self.getZenhubInstanceId()
        # Cancel the connection timeout timer as it's no longer needed.
        if self._connectionTimeout:
            try:
                self._connectionTimeout.cancel()
            except AlreadyCalled:
                pass
            self._connectionTimeout = None
        d2 = self.getInitialServices()
        if self.initialConnect:
            self.log.debug('Chaining getInitialServices with d2')
            # initialConnect fires only once; later reconnects skip this.
            self.initialConnect, d = None, self.initialConnect
            d2.chainDeferred(d)

    def connect(self):
        """
        Start the reconnecting PB client and arm the initial connect timeout.

        @return: the ``initialConnect`` deferred, which gotPerspective chains
                 to the result of getInitialServices
        """
        pingInterval = self.options.zhPingInterval
        factory = ReconnectingPBClientFactory(
            connectTimeout=60,
            pingPerspective=self.options.pingPerspective,
            pingInterval=pingInterval,
            pingtimeout=pingInterval * 5)
        self.log.info("Connecting to %s:%d",
                      self.options.hubhost, self.options.hubport)
        factory.connectTCP(self.options.hubhost, self.options.hubport)
        username = self.options.hubusername
        password = self.options.hubpassword
        self.log.debug("Logging in as %s", username)
        c = credentials.UsernamePassword(username, password)
        factory.gotPerspective = self.gotPerspective
        factory.connecting = self.connecting
        factory.setCredentials(c)

        def timeout(d):
            if not d.called:
                self.connectTimeout()

        self._connectionTimeout = reactor.callLater(
            self.options.hubtimeout, timeout, self.initialConnect)
        return self.initialConnect

    def connectTimeout(self):
        """Log that the initial ZenHub connection timed out."""
        self.log.error('Timeout connecting to zenhub: is it running?')

    def eventService(self):
        """Return the 'EventService' service via getServiceNow."""
        return self.getServiceNow('EventService')

    def getServiceNow(self, svcName):
        """
        Return the already loaded service ``svcName`` without contacting
        ZenHub; a FakeRemote is returned (and a warning logged) when the
        service is not loaded.
        """
        if svcName not in self.services:
            self.log.warning(
                'No service named %r: ZenHub may be disconnected', svcName)
        return self.services.get(svcName, None) or FakeRemote()

    def getService(self, serviceName, serviceListeningInterface=None):
        """
        Attempt to get a service from zenhub.  Returns a deferred.
        When service is retrieved it is stashed in self.services with
        serviceName as the key.  When getService is called it will first
        check self.services and if serviceName is already there it will return
        the entry from self.services wrapped in a defer.succeed
        """
        if serviceName in self.services:
            return defer.succeed(self.services[serviceName])

        def removeService(ignored):
            self.log.debug('Removing service %s', serviceName)
            self.services.pop(serviceName, None)

        def callback(result, serviceName):
            self.log.debug('Loaded service %s from zenhub', serviceName)
            self.services[serviceName] = result
            # Drop the cached reference once the remote side disconnects.
            result.notifyOnDisconnect(removeService)
            return result

        def errback(error, serviceName):
            self.log.debug('errback after getting service %s', serviceName)
            self.log.error('Could not retrieve service %s', serviceName)
            self.services.pop(serviceName, None)
            return error

        d = self.perspective.callRemote('getService',
                                        serviceName,
                                        self.options.monitor,
                                        serviceListeningInterface or self,
                                        self.options.__dict__)
        d.addCallback(callback, serviceName)
        d.addErrback(errback, serviceName)
        return d

    def getInitialServices(self):
        """
        After connecting to zenhub, gather our initial list of services.

        @return: a DeferredList over getService() for every name in
                 ``initialServices``; on failure the reactor is stopped and
                 the deferred fails with RemoteBadMonitor
        """
        def errback(error):
            if isinstance(error, Failure):
                self.log.critical(
                    "Invalid monitor: %s", self.options.monitor)
                reactor.stop()
                return defer.fail(RemoteBadMonitor(
                    "Invalid monitor: %s" % self.options.monitor, ''))
            return error

        self.log.debug('Setting up initial services: %s',
                       ', '.join(self.initialServices))
        d = defer.DeferredList(
            [self.getService(name) for name in self.initialServices],
            fireOnOneErrback=True, consumeErrors=True)
        d.addErrback(errback)
        return d

    def connected(self):
        """Hook invoked from run() once the initial connection succeeded."""
        pass

    def _getThresholdNotifier(self):
        """Return the lazily created ThresholdNotifier."""
        if not self._threshold_notifier:
            self._threshold_notifier = ThresholdNotifier(
                self.sendEvent, self.getThresholds())
        return self._threshold_notifier

    def getThresholds(self):
        """Return the lazily created Thresholds object."""
        if not self._thresholds:
            self._thresholds = Thresholds()
        return self._thresholds

    def run(self):
        """
        Configure statistics, connect to ZenHub and run the reactor.

        Exits the process with the code given to setExitCode() (if non-zero)
        after the reactor stops.
        """
        def stopReporter():
            if self._metrologyReporter:
                return self._metrologyReporter.stop()

        # Order of the shutdown triggers matters. We want to stop the reporter
        # first; calling self.metricWriter() below registers shutdown triggers
        # for the actual metric http and redis publishers.
        reactor.addSystemEventTrigger('before', 'shutdown', stopReporter)

        threshold_notifier = self._getThresholdNotifier()
        self.rrdStats.config(self.name,
                             self.options.monitor,
                             self.metricWriter(),
                             threshold_notifier,
                             self.derivativeTracker())
        self.log.debug('Starting PBDaemon initialization')
        d = self.connect()

        def callback(result):
            self.sendEvent(self.startEvent)
            self.pushEventsLoop()
            self.log.debug('Calling connected.')
            self.connected()
            return result

        def startStatsLoop():
            self.log.debug("Starting Statistic posting")
            loop = task.LoopingCall(self.postStatistics)
            loop.start(self.options.writeStatistics, now=False)
            daemonTags = {
                'zenoss_daemon': self.name,
                'zenoss_monitor': self.options.monitor,
                'internal': True
            }
            self._metrologyReporter = TwistedMetricReporter(
                self.options.writeStatistics, self.metricWriter(), daemonTags)
            self._metrologyReporter.start()

        if self.options.cycle:
            reactor.callWhenRunning(startStatsLoop)
        d.addCallback(callback)
        d.addErrback(twisted.python.log.err)
        reactor.run()
        if self._customexitcode:
            sys.exit(self._customexitcode)

    def setExitCode(self, exitcode):
        """Set the process exit code used by run() after the reactor stops."""
        self._customexitcode = exitcode

    def stop(self, ignored=''):
        """Stop the reactor if it is running; ``ignored`` is unused."""
        if reactor.running:
            try:
                reactor.stop()
            except ReactorNotRunning:
                self.log.debug("Tried to stop reactor that was stopped")
        else:
            self.log.debug("stop() called when not running")

    def _stopPbDaemon(self):
        """
        Shutdown trigger: send the stop event (when cycling) and flush the
        event queue.  Runs at most once.

        @return: a deferred for the final flush when an EventService is
                 loaded, otherwise None
        """
        if self.stopped:
            return
        self.stopped = True
        if 'EventService' in self.services:
            # send stop event if we don't have an implied --cycle,
            # or if --cycle has been specified
            if not hasattr(self.options, 'cycle') or \
                    getattr(self.options, 'cycle', True):
                self.sendEvent(self.stopEvent)
                self.log.debug("Sent a 'stop' event")
            if self._pushEventsDeferred:
                self.log.debug("Currently sending events. Queueing next call")
                d = self._pushEventsDeferred
                # Schedule another call to flush any additional queued events
                d.addBoth(lambda unused: self.pushEvents())
            else:
                d = self.pushEvents()
            return d

        self.log.debug("No event sent as no EventService available.")

    def _eventQueueAtHighWaterMark(self):
        """
        Return True when the event queue length has reached
        maxqueuelen * queueHighWaterMark.
        """
        threshold = self.options.maxqueuelen * self.options.queueHighWaterMark
        return self.eventQueueManager.event_queue_length >= threshold

    def sendEvents(self, events):
        """Queue every event in ``events`` via sendEvent()."""
        # Explicit loop: map() is meant for building results, and a lazy
        # map() would never invoke sendEvent at all.
        for event in events:
            self.sendEvent(event)

    def sendEvent(self, event, **kw):
        """
        Add event to queue of events to be sent.

        @return: the pending high-water-mark deferred if one exists, the
                 result of pushEvents() when the queue has reached the high
                 water mark, otherwise an already fired deferred
        """
        generatedEvent = self.generateEvent(event, **kw)
        self.eventQueueManager.addEvent(generatedEvent)
        self.counters['eventCount'] += 1

        if self._eventHighWaterMark:
            return self._eventHighWaterMark
        elif self._eventQueueAtHighWaterMark():
            return self.pushEvents()
        else:
            return defer.succeed(None)

    def generateEvent(self, event, **kw):
        """
        Build the event dictionary to be queued from ``event`` and ``kw``.

        String values are truncated to LIMITS[key] (DEFAULT_LIMIT when the
        key has no entry) and the 'agent', 'monitor' and 'manager' fields are
        filled in.

        @return: the new event dict, or None when the reactor is not running
        """
        if not reactor.running:
            return
        eventCopy = {}
        for k, v in chain(event.items(), kw.items()):
            if isinstance(v, basestring):
                # default max size is 512k
                size = LIMITS.get(k, DEFAULT_LIMIT)
                eventCopy[k] = v[0:size] if len(v) > size else v
            else:
                eventCopy[k] = v

        eventCopy['agent'] = self.name
        eventCopy['monitor'] = self.options.monitor
        eventCopy['manager'] = self.fqdn
        return eventCopy

    @defer.inlineCallbacks
    def pushEventsLoop(self):
        """Periodically, wake up and flush events to ZenHub.
        """
        # Re-arm first so the loop keeps going even if the flush below fails.
        reactor.callLater(self.options.eventflushseconds, self.pushEventsLoop)
        yield self.pushEvents()

        # Record the number of events in the queue up to every 2 seconds.
        now = time.time()
        if self.rrdStats.name and now >= (self.lastStats + 2):
            self.lastStats = now
            self.rrdStats.gauge(
                'eventQueueLength',
                self.eventQueueManager.event_queue_length)

    @defer.inlineCallbacks
    def pushEvents(self):
        """Flush events to ZenHub.

        Only one flush is active at a time; a concurrent call returns
        "Push Pending".  While the queue is at or above the high water mark a
        deferred is kept in ``_eventHighWaterMark`` (handed out by sendEvent)
        and fired once the queue drains below the mark or the event service
        is unavailable.
        """
        # are we already shutting down?
        if not reactor.running:
            self.log.debug("Skipping event sending - reactor not running.")
            return

        if self._eventQueueAtHighWaterMark() and not self._eventHighWaterMark:
            self.log.debug(
                "Queue length exceeded high water mark, %s ;"
                "creating high water mark deferred",
                self.eventQueueManager.event_queue_length)
            self._eventHighWaterMark = defer.Deferred()

        # are still connected to ZenHub?
        evtSvc = self.services.get('EventService', None)
        if not evtSvc:
            self.log.error("No event service: %r", evtSvc)
            yield task.deferLater(reactor, 0, lambda: None)
            if self._eventHighWaterMark:
                d, self._eventHighWaterMark = self._eventHighWaterMark, None
                # not connected, release throttle and let things queue
                d.callback("No Event Service")
            defer.returnValue(None)

        if self._pushEventsDeferred:
            self.log.debug("Skipping event sending - previous call active.")
            defer.returnValue("Push Pending")

        sent = 0
        try:
            # only set _pushEventsDeferred after we know we have an
            # evtSvc/connectivity
            self._pushEventsDeferred = defer.Deferred()

            def repush(val):
                if (self.eventQueueManager.event_queue_length >=
                        self.options.eventflushchunksize):
                    self.pushEvents()
                return val
            # conditionally push more events after this pushEvents call
            # finishes
            self._pushEventsDeferred.addCallback(repush)

            discarded_events = self.eventQueueManager.discarded_events
            if discarded_events:
                self.log.error(
                    'Discarded oldest %d events because maxqueuelen was '
                    'exceeded: %d/%d',
                    discarded_events,
                    discarded_events + self.options.maxqueuelen,
                    self.options.maxqueuelen)
                self.counters['discardedEvents'] += discarded_events
                self.eventQueueManager.discarded_events = 0

            send_events_fn = partial(evtSvc.callRemote, 'sendEvents')
            try:
                sent = yield self.eventQueueManager.sendEvents(send_events_fn)
            except ConnectionLost as ex:
                self.log.error('Error sending event: %s', ex)
                # let the reactor have time to clean up any connection errors
                # and make callbacks
                yield task.deferLater(reactor, 0, lambda: None)
        except Exception as ex:
            self.log.exception(ex)
            # let the reactor have time to clean up any connection errors and
            # make callbacks
            yield task.deferLater(reactor, 0, lambda: None)
        finally:
            if self._pushEventsDeferred:
                d, self._pushEventsDeferred = self._pushEventsDeferred, None
                d.callback('sent %s' % sent)
            if (self._eventHighWaterMark and
                    not self._eventQueueAtHighWaterMark()):
                self.log.debug("Queue restored to below high water mark: %s",
                               self.eventQueueManager.event_queue_length)
                d, self._eventHighWaterMark = self._eventHighWaterMark, None
                d.callback("Queue length below high water mark")

    def heartbeat(self):
        """if cycling, send a heartbeat, else, shutdown"""
        if not self.options.cycle:
            self.stop()
            return
        heartbeatEvent = self.generateEvent(self.heartbeatEvent,
                                            timeout=self.heartbeatTimeout)
        self.eventQueueManager.addHeartbeatEvent(heartbeatEvent)
        # heartbeat is normally 3x cycle time
        self.niceDoggie(self.heartbeatTimeout / 3)

    def postStatisticsImpl(self):
        """Hook called at the end of postStatistics(); no-op here."""
        pass

    def postStatistics(self):
        """Log every daemon counter and report it through rrdStats."""
        # save daemon counter stats
        for name, value in self.counters.items():
            self.log.info("Counter %s, value %d", name, value)
            self.rrdStats.counter(name, value)

        # persist counters values
        self.postStatisticsImpl()

    def _pickleName(self):
        """
        Return the relative path of the counters pickle, built from the
        daemon name and the CONTROLPLANE_INSTANCE_ID environment variable.
        """
        instance_id = os.environ.get('CONTROLPLANE_INSTANCE_ID')
        return 'var/%s_%s_counters.pickle' % (self.name, instance_id)

    def remote_getName(self):
        """Remote method: return this daemon's name."""
        return self.name

    def remote_shutdown(self, unused):
        """Remote method: stop the reactor and call sigTerm()."""
        self.stop()
        self.sigTerm()

    def remote_setPropertyItems(self, items):
        """Remote method: no-op here."""
        pass

    @translateError
    def remote_updateThresholdClasses(self, classes):
        """
        Remote method: import each class named in ``classes``, logging (not
        raising) any ImportError.
        """
        from Products.ZenUtils.Utils import importClass
        self.log.debug("Loading classes %s", classes)
        for c in classes:
            try:
                importClass(c)
            except ImportError:
                self.log.error("Unable to import class %s", c)

    def _checkZenHub(self):
        """
        Check status of ZenHub (using ping method of service).
        @return: if ping occurs, return deferred with result of ping attempt.
        """
        self.log.debug('_checkZenHub: entry')

        def callback(result):
            self.log.debug('ZenHub health check: Got result %s', result)
            if result == 'pong':
                self.log.debug('ZenHub health check: Success - received pong '
                               'from ZenHub ping service.')
                self._signalZenHubAnswering(True)
            else:
                self.log.error(
                    'ZenHub health check did not respond as expected.')
                self._signalZenHubAnswering(False)

        def errback(error):
            self.log.error('Error pinging ZenHub: %s (%s).',
                           error, getattr(error, 'message', ''))
            self._signalZenHubAnswering(False)

        try:
            if self.perspective:
                self.log.debug('ZenHub health check: perspective found. '
                               'attempting remote ping call.')
                d = self.perspective.callRemote('ping')
                d.addCallback(callback)
                d.addErrback(errback)
                return d
            else:
                self.log.debug('ZenHub health check: ZenHub may be down.')
                self._signalZenHubAnswering(False)
        except pb.DeadReferenceError:
            self.log.warning("ZenHub health check: DeadReferenceError - "
                             "lost connection to ZenHub.")
            self._signalZenHubAnswering(False)
        except Exception as e:
            # Log the exception itself: ``e.message`` is deprecated and is
            # not available on every exception type.
            self.log.error('ZenHub health check: caught %s exception: %s',
                           e.__class__, e)
            self._signalZenHubAnswering(False)

    def _signalZenHubAnswering(self, answering):
        """
        Write or remove file that the ZenHub_answering health check uses to
        report status.
        @param answering: true if ZenHub is answering, False, otherwise.
        """
        self.log.debug('_signalZenHubAnswering(%s)', answering)
        filename = 'zenhub_connected'
        signalFilePath = zenPath('var', filename)
        if answering:
            self.log.debug('writing file at %s', signalFilePath)
            atomicWrite(signalFilePath, '')
        else:
            try:
                self.log.debug('removing file at %s', signalFilePath)
                os.remove(signalFilePath)
            except Exception as e:
                # Best effort: the file may simply not exist.  Log the
                # exception itself; ``e.message`` is empty for OSError.
                self.log.debug('ignoring %s exception (%s) removing file %s',
                               e.__class__, e, signalFilePath)

    def buildOptions(self):
        """Register the ZenHub, event-queue and metric command line options."""
        ZenDaemon.buildOptions(self)

        self.parser.add_option('--hubhost',
                               dest='hubhost',
                               default=DEFAULT_HUB_HOST,
                               help='Host of zenhub daemon.'
                                    ' Default is %s.' % DEFAULT_HUB_HOST)
        self.parser.add_option('--hubport',
                               dest='hubport',
                               type='int',
                               default=DEFAULT_HUB_PORT,
                               help='Port zenhub listens on.'
                                    ' Default is %s.' % DEFAULT_HUB_PORT)
        self.parser.add_option('--hubusername',
                               dest='hubusername',
                               default=DEFAULT_HUB_USERNAME,
                               help='Username for zenhub login.'
                                    ' Default is %s.' % DEFAULT_HUB_USERNAME)
        self.parser.add_option('--hubpassword',
                               dest='hubpassword',
                               default=DEFAULT_HUB_PASSWORD,
                               help='Password for zenhub login.'
                                    ' Default is %s.' % DEFAULT_HUB_PASSWORD)
        self.parser.add_option('--monitor',
                               dest='monitor',
                               default=DEFAULT_HUB_MONITOR,
                               help='Name of monitor instance to use for'
                                    ' configuration.  Default is %s.'
                                    % DEFAULT_HUB_MONITOR)
        self.parser.add_option('--initialHubTimeout',
                               dest='hubtimeout',
                               type='int',
                               default=30,
                               help='Initial time to wait for a ZenHub '
                                    'connection')
        self.parser.add_option('--allowduplicateclears',
                               dest='allowduplicateclears',
                               default=False,
                               action='store_true',
                               help='Send clear events even when the most '
                                    'recent event was also a clear event.')
        self.parser.add_option('--duplicateclearinterval',
                               dest='duplicateclearinterval',
                               default=0,
                               type='int',
                               help=('Send a clear event every '
                                     '[DUPLICATECLEARINTERVAL] events.'))
        self.parser.add_option('--eventflushseconds',
                               dest='eventflushseconds',
                               default=5.,
                               type='float',
                               help='Seconds between attempts to flush '
                                    'events to ZenHub.')
        self.parser.add_option('--eventflushchunksize',
                               dest='eventflushchunksize',
                               default=50,
                               type='int',
                               help='Number of events to send to ZenHub'
                                    ' at one time')
        self.parser.add_option('--maxqueuelen',
                               dest='maxqueuelen',
                               default=5000,
                               type='int',
                               help='Maximum number of events to queue')
        self.parser.add_option('--queuehighwatermark',
                               dest='queueHighWaterMark',
                               default=0.75,
                               type='float',
                               help='The size, as a fraction of maxqueuelen, '
                                    'of the event queue when event pushback '
                                    'starts')
        self.parser.add_option('--zenhubpinginterval',
                               dest='zhPingInterval',
                               default=120,
                               type='int',
                               help='How often to ping zenhub')
        self.parser.add_option('--disable-event-deduplication',
                               dest='deduplicate_events',
                               default=True,
                               action='store_false',
                               help='Disable event de-duplication')
        self.parser.add_option('--redis-url',
                               dest='redisUrl',
                               type='string',
                               default='redis://localhost:{default}/0'.format(
                                   default=publisher.defaultRedisPort),
                               help='redis connection string: '
                                    'redis://[hostname]:[port]/[db], '
                                    'default: %default')
        self.parser.add_option('--metricBufferSize',
                               dest='metricBufferSize',
                               type='int',
                               default=publisher.defaultMetricBufferSize,
                               help='Number of metrics to buffer if redis '
                                    'goes down')
        self.parser.add_option('--metricsChannel',
                               dest='metricsChannel',
                               type='string',
                               default=publisher.defaultMetricsChannel,
                               help='redis channel to which metrics are '
                                    'published')
        self.parser.add_option('--maxOutstandingMetrics',
                               dest='maxOutstandingMetrics',
                               type='int',
                               default=publisher.defaultMaxOutstandingMetrics,
                               help='Max Number of metrics to allow in redis')
        self.parser.add_option('--disable-ping-perspective',
                               dest='pingPerspective',
                               help="Enable or disable ping perspective",
                               default=True,
                               action='store_false')
        self.parser.add_option('--writeStatistics',
                               dest='writeStatistics',
                               type='int',
                               default=30,
                               help='How often to write internal statistics '
                                    'value in seconds')
class PBDaemon(ZenDaemon, pb.Referenceable): name = 'pbdaemon' initialServices = ['EventService'] heartbeatEvent = {'eventClass': Heartbeat} heartbeatTimeout = 60 * 3 _customexitcode = 0 _pushEventsDeferred = None _eventHighWaterMark = None _healthMonitorInterval = 30 def __init__(self, noopts=0, keeproot=False, name=None): # if we were provided our collector name via the constructor instead of # via code, be sure to store it correctly. if name is not None: self.name = name self.mname = name try: ZenDaemon.__init__(self, noopts, keeproot) except IOError: import traceback self.log.critical(traceback.format_exc(0)) sys.exit(1) self._thresholds = None self._threshold_notifier = None self.rrdStats = DaemonStats() self.lastStats = 0 self.perspective = None self.services = {} self.eventQueueManager = EventQueueManager(self.options, self.log) self.startEvent = startEvent.copy() self.stopEvent = stopEvent.copy() details = dict(component=self.name, device=self.options.monitor) for evt in self.startEvent, self.stopEvent, self.heartbeatEvent: evt.update(details) self.initialConnect = defer.Deferred() self.stopped = False self.counters = collections.Counter() self._pingedZenhub = None self._connectionTimeout = None self._publisher = None self._internal_publisher = None self._metric_writer = None self._derivative_tracker = None self._metrologyReporter = None # Add a shutdown trigger to send a stop event and flush the event queue reactor.addSystemEventTrigger('before', 'shutdown', self._stopPbDaemon) # Set up a looping call to support the health check. 
self.healthMonitor = task.LoopingCall(self._checkZenHub) self.healthMonitor.start(self._healthMonitorInterval) def publisher(self): if not self._publisher: host, port = urlparse(self.options.redisUrl).netloc.split(':') try: port = int(port) except ValueError: self.log.exception( "redis url contains non-integer port " + "value {port}, defaulting to {default}".format( port=port, default=publisher.defaultRedisPort)) port = publisher.defaultRedisPort self._publisher = publisher.RedisListPublisher( host, port, self.options.metricBufferSize, channel=self.options.metricsChannel, maxOutstandingMetrics=self.options.maxOutstandingMetrics) return self._publisher def internalPublisher(self): if not self._internal_publisher: url = os.environ.get("CONTROLPLANE_CONSUMER_URL", None) username = os.environ.get("CONTROLPLANE_CONSUMER_USERNAME", "") password = os.environ.get("CONTROLPLANE_CONSUMER_PASSWORD", "") if url: self._internal_publisher = publisher.HttpPostPublisher( username, password, url) return self._internal_publisher def metricWriter(self): if not self._metric_writer: publisher = self.publisher() metric_writer = MetricWriter(publisher) if os.environ.get("CONTROLPLANE", "0") == "1": internal_publisher = self.internalPublisher() if internal_publisher: internal_metric_filter = lambda metric, value, timestamp, tags:\ tags and tags.get("internal", False) internal_metric_writer = FilteredMetricWriter( internal_publisher, internal_metric_filter) self._metric_writer = AggregateMetricWriter( [metric_writer, internal_metric_writer]) else: self._metric_writer = metric_writer return self._metric_writer def derivativeTracker(self): if not self._derivative_tracker: self._derivative_tracker = DerivativeTracker() return self._derivative_tracker def connecting(self): """ Called when about to connect to zenhub """ self.log.info("Attempting to connect to zenhub") def getZenhubInstanceId(self): """ Called after we connected to zenhub. 
""" def callback(result): self.log.info("Connected to the zenhub/%s instance", result) def errback(result): self.log.info( "Unexpected error appeared while getting zenhub instance number %s", result) d = self.perspective.callRemote('getHubInstanceId') d.addCallback(callback) d.addErrback(errback) return d def gotPerspective(self, perspective): """ This gets called every time we reconnect. @parameter perspective: Twisted perspective object @type perspective: Twisted perspective object """ self.perspective = perspective self.getZenhubInstanceId() # Cancel the connection timeout timer as it's no longer needed. if self._connectionTimeout: try: self._connectionTimeout.cancel() except AlreadyCalled: pass self._connectionTimeout = None d2 = self.getInitialServices() if self.initialConnect: self.log.debug('Chaining getInitialServices with d2') self.initialConnect, d = None, self.initialConnect d2.chainDeferred(d) def connect(self): pingInterval = self.options.zhPingInterval factory = ReconnectingPBClientFactory( connectTimeout=60, pingPerspective=self.options.pingPerspective, pingInterval=pingInterval, pingtimeout=pingInterval * 5) self.log.info("Connecting to %s:%d" % (self.options.hubhost, self.options.hubport)) factory.connectTCP(self.options.hubhost, self.options.hubport) username = self.options.hubusername password = self.options.hubpassword self.log.debug("Logging in as %s" % username) c = credentials.UsernamePassword(username, password) factory.gotPerspective = self.gotPerspective factory.connecting = self.connecting factory.setCredentials(c) def timeout(d): if not d.called: self.connectTimeout() self._connectionTimeout = reactor.callLater(self.options.hubtimeout, timeout, self.initialConnect) return self.initialConnect def connectTimeout(self): self.log.error('Timeout connecting to zenhub: is it running?') pass def eventService(self): return self.getServiceNow('EventService') def getServiceNow(self, svcName): if not svcName in self.services: self.log.warning( 'No 
service named %r: ZenHub may be disconnected' % svcName) return self.services.get(svcName, None) or FakeRemote() def getService(self, serviceName, serviceListeningInterface=None): """ Attempt to get a service from zenhub. Returns a deferred. When service is retrieved it is stashed in self.services with serviceName as the key. When getService is called it will first check self.services and if serviceName is already there it will return the entry from self.services wrapped in a defer.succeed """ if serviceName in self.services: return defer.succeed(self.services[serviceName]) def removeService(ignored): self.log.debug('Removing service %s' % serviceName) if serviceName in self.services: del self.services[serviceName] def callback(result, serviceName): self.log.debug('Loaded service %s from zenhub' % serviceName) self.services[serviceName] = result result.notifyOnDisconnect(removeService) return result def errback(error, serviceName): self.log.debug('errback after getting service %s' % serviceName) self.log.error('Could not retrieve service %s' % serviceName) if serviceName in self.services: del self.services[serviceName] return error d = self.perspective.callRemote('getService', serviceName, self.options.monitor, serviceListeningInterface or self, self.options.__dict__) d.addCallback(callback, serviceName) d.addErrback(errback, serviceName) return d def getInitialServices(self): """ After connecting to zenhub, gather our initial list of services. 
""" def errback(error): if isinstance(error, Failure): self.log.critical("Invalid monitor: %s" % self.options.monitor) reactor.stop() return defer.fail( RemoteBadMonitor( "Invalid monitor: %s" % self.options.monitor, '')) return error self.log.debug('Setting up initial services: %s' % \ ', '.join(self.initialServices)) d = defer.DeferredList( [self.getService(name) for name in self.initialServices], fireOnOneErrback=True, consumeErrors=True) d.addErrback(errback) return d def connected(self): pass def _getThresholdNotifier(self): if not self._threshold_notifier: self._threshold_notifier = ThresholdNotifier( self.sendEvent, self.getThresholds()) return self._threshold_notifier def getThresholds(self): if not self._thresholds: self._thresholds = Thresholds() return self._thresholds def run(self): def stopReporter(): if self._metrologyReporter: return self._metrologyReporter.stop() # Order of the shutdown triggers matter. Want to stop reporter first, calling self.metricWriter() below # registers shutdown triggers for the actual metric http and redis publishers. 
reactor.addSystemEventTrigger('before', 'shutdown', stopReporter) threshold_notifier = self._getThresholdNotifier() self.rrdStats.config(self.name, self.options.monitor, self.metricWriter(), threshold_notifier, self.derivativeTracker()) self.log.debug('Starting PBDaemon initialization') d = self.connect() def callback(result): self.sendEvent(self.startEvent) self.pushEventsLoop() self.log.debug('Calling connected.') self.connected() return result def startStatsLoop(): self.log.debug("Starting Statistic posting") loop = task.LoopingCall(self.postStatistics) loop.start(self.options.writeStatistics, now=False) daemonTags = { 'zenoss_daemon': self.name, 'zenoss_monitor': self.options.monitor, 'internal': True } self._metrologyReporter = TwistedMetricReporter( self.options.writeStatistics, self.metricWriter(), daemonTags) self._metrologyReporter.start() reactor.callWhenRunning(startStatsLoop) d.addCallback(callback) d.addErrback(twisted.python.log.err) reactor.run() if self._customexitcode: sys.exit(self._customexitcode) def setExitCode(self, exitcode): self._customexitcode = exitcode def stop(self, ignored=''): if reactor.running: try: reactor.stop() except ReactorNotRunning: self.log.debug("Tried to stop reactor that was stopped") else: self.log.debug("stop() called when not running") def _stopPbDaemon(self): if self.stopped: return self.stopped = True if 'EventService' in self.services: # send stop event if we don't have an implied --cycle, # or if --cycle has been specified if not hasattr(self.options, 'cycle') or \ getattr(self.options, 'cycle', True): self.sendEvent(self.stopEvent) self.log.debug("Sent a 'stop' event") if self._pushEventsDeferred: self.log.debug("Currently sending events. 
Queueing next call") d = self._pushEventsDeferred # Schedule another call to flush any additional queued events d.addBoth(lambda unused: self.pushEvents()) else: d = self.pushEvents() return d self.log.debug("No event sent as no EventService available.") def sendEvents(self, events): map(self.sendEvent, events) def sendEvent(self, event, **kw): """ Add event to queue of events to be sent. If we have an event service then process the queue. """ generatedEvent = self.generateEvent(event, **kw) self.eventQueueManager.addEvent(generatedEvent) self.counters['eventCount'] += 1 if self._eventHighWaterMark: return self._eventHighWaterMark elif self.eventQueueManager.event_queue_length >= self.options.maxqueuelen * self.options.queueHighWaterMark: return self.pushEvents() else: return defer.succeed(None) def generateEvent(self, event, **kw): """ Add event to queue of events to be sent. If we have an event service then process the queue. """ if not reactor.running: return eventCopy = {} for k, v in chain(event.items(), kw.items()): if isinstance(v, basestring): #default max size is 512k size = LIMITS.get(k, DEFAULT_LIMIT) eventCopy[k] = v[0:size] if len(v) > size else v else: eventCopy[k] = v eventCopy['agent'] = self.name eventCopy['monitor'] = self.options.monitor eventCopy['manager'] = self.fqdn return eventCopy @defer.inlineCallbacks def pushEventsLoop(self): """Periodially, wake up and flush events to ZenHub. """ reactor.callLater(self.options.eventflushseconds, self.pushEventsLoop) yield self.pushEvents() # Record the number of events in the queue up to every 2 seconds. now = time.time() if self.rrdStats.name and now >= (self.lastStats + 2): self.lastStats = now self.rrdStats.gauge('eventQueueLength', self.eventQueueManager.event_queue_length) @defer.inlineCallbacks def pushEvents(self): """Flush events to ZenHub. """ # are we already shutting down? 
if not reactor.running: self.log.debug("Skipping event sending - reactor not running.") return if self.eventQueueManager.event_queue_length >= self.options.maxqueuelen * self.options.queueHighWaterMark and not self._eventHighWaterMark: self.log.debug( "Queue length exceeded high water mark, %s ;creating high water mark deferred", self.eventQueueManager.event_queue_length) self._eventHighWaterMark = defer.Deferred() # are still connected to ZenHub? evtSvc = self.services.get('EventService', None) if not evtSvc: self.log.error("No event service: %r", evtSvc) yield task.deferLater(reactor, 0, lambda: None) if self._eventHighWaterMark: d, self._eventHighWaterMark = self._eventHighWaterMark, None #not connected, release throttle and let things queue d.callback("No Event Service") defer.returnValue(None) if self._pushEventsDeferred: self.log.debug("Skipping event sending - previous call active.") defer.returnValue("Push Pending") sent = 0 try: #only set _pushEventsDeferred after we know we have an evtSvc/connectivity self._pushEventsDeferred = defer.Deferred() def repush(val): if self.eventQueueManager.event_queue_length >= self.options.eventflushchunksize: self.pushEvents() return val # conditionally push more events after this pushEvents call finishes self._pushEventsDeferred.addCallback(repush) discarded_events = self.eventQueueManager.discarded_events if discarded_events: self.log.error( 'Discarded oldest %d events because maxqueuelen was ' 'exceeded: %d/%d', discarded_events, discarded_events + self.options.maxqueuelen, self.options.maxqueuelen) self.counters['discardedEvents'] += discarded_events self.eventQueueManager.discarded_events = 0 send_events_fn = partial(evtSvc.callRemote, 'sendEvents') try: sent = yield self.eventQueueManager.sendEvents(send_events_fn) except ConnectionLost as ex: self.log.error('Error sending event: %s', ex) #let the reactor have time to clean up any connection errors and make callbacks yield task.deferLater(reactor, 0, lambda: None) 
except Exception as ex: self.log.exception(ex) #let the reactor have time to clean up any connection errors and make callbacks yield task.deferLater(reactor, 0, lambda: None) finally: if self._pushEventsDeferred: d, self._pushEventsDeferred = self._pushEventsDeferred, None d.callback('sent %s' % sent) if self._eventHighWaterMark and self.eventQueueManager.event_queue_length < self.options.maxqueuelen * self.options.queueHighWaterMark: self.log.debug("Queue restored to below high water mark: %s", self.eventQueueManager.event_queue_length) d, self._eventHighWaterMark = self._eventHighWaterMark, None d.callback("Queue length below high water mark") def heartbeat(self): """if cycling, send a heartbeat, else, shutdown""" if not self.options.cycle: self.stop() return heartbeatEvent = self.generateEvent(self.heartbeatEvent, timeout=self.heartbeatTimeout) self.eventQueueManager.addHeartbeatEvent(heartbeatEvent) # heartbeat is normally 3x cycle time self.niceDoggie(self.heartbeatTimeout / 3) def postStatisticsImpl(self): pass def postStatistics(self): # save daemon counter stats for name, value in self.counters.items(): self.log.info("Counter %s, value %d", name, value) self.rrdStats.counter(name, value) # persist counters values self.postStatisticsImpl() def _pickleName(self): instance_id = os.environ.get('CONTROLPLANE_INSTANCE_ID') return 'var/%s_%s_counters.pickle' % (self.name, instance_id) def remote_getName(self): return self.name def remote_shutdown(self, unused): self.stop() self.sigTerm() def remote_setPropertyItems(self, items): pass @translateError def remote_updateThresholdClasses(self, classes): from Products.ZenUtils.Utils import importClass self.log.debug("Loading classes %s", classes) for c in classes: try: importClass(c) except ImportError: self.log.error("Unable to import class %s", c) def _checkZenHub(self): """ Check status of ZenHub (using ping method of service). @return: if ping occurs, return deferred with result of ping attempt. 
""" self.log.debug('_checkZenHub: entry') def callback(result): self.log.debug('ZenHub health check: Got result %s' % result) if result == 'pong': self.log.debug( 'ZenHub health check: Success - received pong from ZenHub ping service.' ) self._signalZenHubAnswering(True) else: self.log.error( 'ZenHub health check did not respond as expected.') self._signalZenHubAnswering(False) def errback(error): self.log.error('Error pinging ZenHub: %s (%s).' % (error, getattr(error, 'message', ''))) self._signalZenHubAnswering(False) try: if self.perspective: self.log.debug( 'ZenHub health check: perspective found. attempting remote ping call.' ) d = self.perspective.callRemote('ping') d.addCallback(callback) d.addErrback(errback) return d else: self.log.debug('ZenHub health check: ZenHub may be down.') self._signalZenHubAnswering(False) except pb.DeadReferenceError: self.log.warning( "ZenHub health check: DeadReferenceError - lost connection to ZenHub." ) self._signalZenHubAnswering(False) except Exception as e: self.log.error('ZenHub health check: caught %s exception: %s' % (e.__class__, e.message)) self._signalZenHubAnswering(False) def _signalZenHubAnswering(self, answering): """ Write or remove file that the ZenHub_answering health check uses to report status. @param answering: true if ZenHub is answering, False, otherwise. """ self.log.debug('_signalZenHubAnswering(%s)' % answering) filename = 'zenhub_connected' signalFilePath = zenPath('var', filename) if answering: self.log.debug('writing file at %s' % signalFilePath) atomicWrite(signalFilePath, '') else: try: self.log.debug('removing file at %s' % signalFilePath) os.remove(signalFilePath) except Exception as e: self.log.debug('ignoring %s exception (%s) removing file %s' % (e.__class__, e.message, signalFilePath)) def buildOptions(self): self.parser.add_option('--hubhost', dest='hubhost', default=DEFAULT_HUB_HOST, help='Host of zenhub daemon.' ' Default is %s.' 
% DEFAULT_HUB_HOST) self.parser.add_option('--hubport', dest='hubport', type='int', default=DEFAULT_HUB_PORT, help='Port zenhub listens on.' 'Default is %s.' % DEFAULT_HUB_PORT) self.parser.add_option('--hubusername', dest='hubusername', default=DEFAULT_HUB_USERNAME, help='Username for zenhub login.' ' Default is %s.' % DEFAULT_HUB_USERNAME) self.parser.add_option('--hubpassword', dest='hubpassword', default=DEFAULT_HUB_PASSWORD, help='Password for zenhub login.' ' Default is %s.' % DEFAULT_HUB_PASSWORD) self.parser.add_option('--monitor', dest='monitor', default=DEFAULT_HUB_MONITOR, help='Name of monitor instance to use for' ' configuration. Default is %s.' % DEFAULT_HUB_MONITOR) self.parser.add_option('--initialHubTimeout', dest='hubtimeout', type='int', default=30, help='Initial time to wait for a ZenHub ' 'connection') self.parser.add_option('--allowduplicateclears', dest='allowduplicateclears', default=False, action='store_true', help='Send clear events even when the most ' 'recent event was also a clear event.') self.parser.add_option( '--duplicateclearinterval', dest='duplicateclearinterval', default=0, type='int', help=('Send a clear event every [DUPLICATECLEARINTEVAL] ' 'events.')) self.parser.add_option('--eventflushseconds', dest='eventflushseconds', default=5., type='float', help='Seconds between attempts to flush ' 'events to ZenHub.') self.parser.add_option('--eventflushchunksize', dest='eventflushchunksize', default=50, type='int', help='Number of events to send to ZenHub' 'at one time') self.parser.add_option('--maxqueuelen', dest='maxqueuelen', default=5000, type='int', help='Maximum number of events to queue') self.parser.add_option( '--queuehighwatermark', dest='queueHighWaterMark', default=0.75, type='float', help= 'The size, in percent, of the event queue when event pushback starts' ) self.parser.add_option('--zenhubpinginterval', dest='zhPingInterval', default=120, type='int', help='How often to ping zenhub') 
self.parser.add_option('--disable-event-deduplication', dest='deduplicate_events', default=True, action='store_false', help='Disable event de-duplication') self.parser.add_option( '--redis-url', dest='redisUrl', type='string', default='redis://localhost:{default}/0'.format( default=publisher.defaultRedisPort), help= 'redis connection string: redis://[hostname]:[port]/[db], default: %default' ) self.parser.add_option( '--metricBufferSize', dest='metricBufferSize', type='int', default=publisher.defaultMetricBufferSize, help='Number of metrics to buffer if redis goes down') self.parser.add_option( '--metricsChannel', dest='metricsChannel', type='string', default=publisher.defaultMetricsChannel, help='redis channel to which metrics are published') self.parser.add_option('--maxOutstandingMetrics', dest='maxOutstandingMetrics', type='int', default=publisher.defaultMaxOutstandingMetrics, help='Max Number of metrics to allow in redis') self.parser.add_option('--disable-ping-perspective', dest='pingPerspective', help="Enable or disable ping perspective", default=True, action='store_false') self.parser.add_option( '--writeStatistics', dest='writeStatistics', type='int', default=30, help='How often to write internal statistics value in seconds') ZenDaemon.buildOptions(self)
class PBDaemon(ZenDaemon, pb.Referenceable):
    """
    Base class for daemons that talk to ZenHub over Perspective Broker.

    The daemon logs in to ZenHub through a reconnecting client factory,
    fetches the services listed in C{initialServices}, queues events in an
    C{EventQueueManager} and periodically flushes that queue to the
    'EventService' service. Daemon counters are persisted to a pickle file
    between runs.
    """

    name = 'pbdaemon'
    initialServices = ['EventService']
    heartbeatEvent = {'eventClass': Heartbeat}
    heartbeatTimeout = 60 * 3
    _customexitcode = 0
    _pushEventsDeferred = None

    def __init__(self, noopts=0, keeproot=False, name=None):
        """
        Initialize daemon state and register the shutdown trigger.

        @param noopts: passed through to ZenDaemon.__init__
        @param keeproot: passed through to ZenDaemon.__init__
        @param name: optional collector name overriding the class-level name
        """
        # if we were provided our collector name via the constructor instead of
        # via code, be sure to store it correctly.
        if name is not None:
            self.name = name
            self.mname = name

        try:
            ZenDaemon.__init__(self, noopts, keeproot)
        except IOError:
            import traceback
            self.log.critical(traceback.format_exc(0))
            sys.exit(1)

        self.rrdStats = DaemonStats()
        self.lastStats = 0
        self.perspective = None
        self.services = {}
        self.eventQueueManager = EventQueueManager(self.options, self.log)
        self.startEvent = startEvent.copy()
        self.stopEvent = stopEvent.copy()
        # Work on a per-instance copy: updating the class-level dict in place
        # would leak this instance's component/device into every other
        # instance (and subclass) sharing that dict.
        self.heartbeatEvent = self.heartbeatEvent.copy()
        details = dict(component=self.name, device=self.options.monitor)
        for evt in self.startEvent, self.stopEvent, self.heartbeatEvent:
            evt.update(details)
        self.initialConnect = defer.Deferred()
        self.stopped = False
        self.counters = collections.Counter()
        self.loadCounters()
        self._pingedZenhub = None
        self._connectionTimeout = None

        # Add a shutdown trigger to send a stop event and flush the event queue
        reactor.addSystemEventTrigger('before', 'shutdown', self._stopPbDaemon)

    def connecting(self):
        """
        Called when about to connect to zenhub
        """
        self.log.info("Attempting to connect to zenhub")

    def gotPerspective(self, perspective):
        """
        This gets called every time we reconnect.

        @parameter perspective: Twisted perspective object
        @type perspective: Twisted perspective object
        """
        self.log.info("Connected to ZenHub")
        self.perspective = perspective

        # Cancel the connection timeout timer as it's no longer needed.
        if self._connectionTimeout:
            try:
                self._connectionTimeout.cancel()
            except AlreadyCalled:
                pass
            self._connectionTimeout = None

        d2 = self.getInitialServices()
        if self.initialConnect:
            # Only the first connection fires initialConnect; clear it so
            # later reconnects do not try to fire it again.
            self.log.debug('Chaining getInitialServices with d2')
            self.initialConnect, d = None, self.initialConnect
            d2.chainDeferred(d)

    def connect(self):
        """
        Start the login to ZenHub and arm the initial connection timeout.

        @return: the C{initialConnect} deferred, which is chained to the
            result of getInitialServices() on the first connection.
        """
        factory = ReconnectingPBClientFactory(connectTimeout=60)
        self.log.info("Connecting to %s:%d",
                      self.options.hubhost, self.options.hubport)
        factory.connectTCP(self.options.hubhost, self.options.hubport)
        username = self.options.hubusername
        password = self.options.hubpassword
        self.log.debug("Logging in as %s", username)
        c = credentials.UsernamePassword(username, password)
        factory.gotPerspective = self.gotPerspective
        factory.connecting = self.connecting
        factory.startLogin(c)

        def timeout(d):
            if not d.called:
                self.connectTimeout()

        self._connectionTimeout = reactor.callLater(
            self.options.hubtimeout, timeout, self.initialConnect)
        return self.initialConnect

    def connectTimeout(self):
        """
        Called when the initial connection has not completed within
        the C{hubtimeout} option; only logs an error.
        """
        self.log.error('Timeout connecting to zenhub: is it running?')

    def eventService(self):
        """
        Return the 'EventService' service, or a FakeRemote if unavailable.
        """
        return self.getServiceNow('EventService')

    def getServiceNow(self, svcName):
        """
        Return an already-loaded service without contacting ZenHub.

        @param svcName: name of the service to look up in self.services
        @return: the cached service, or a FakeRemote() when the service is
            missing (a warning is logged) or falsy.
        """
        if svcName not in self.services:
            self.log.warning(
                'No service named %r: ZenHub may be disconnected', svcName)
        return self.services.get(svcName, None) or FakeRemote()

    def getService(self, serviceName, serviceListeningInterface=None):
        """
        Attempt to get a service from zenhub.  Returns a deferred.
        When service is retrieved it is stashed in self.services with
        serviceName as the key.  When getService is called it will first
        check self.services and if serviceName is already there it will return
        the entry from self.services wrapped in a defer.succeed
        """
        if serviceName in self.services:
            return defer.succeed(self.services[serviceName])

        def removeService(ignored):
            self.log.debug('Removing service %s', serviceName)
            if serviceName in self.services:
                del self.services[serviceName]

        def callback(result, serviceName):
            self.log.debug('Loaded service %s from zenhub', serviceName)
            self.services[serviceName] = result
            # Drop the cached reference as soon as the remote disconnects.
            result.notifyOnDisconnect(removeService)
            return result

        def errback(error, serviceName):
            self.log.debug('errback after getting service %s', serviceName)
            self.log.error('Could not retrieve service %s', serviceName)
            if serviceName in self.services:
                del self.services[serviceName]
            return error

        d = self.perspective.callRemote('getService',
                                        serviceName,
                                        self.options.monitor,
                                        serviceListeningInterface or self)
        d.addCallback(callback, serviceName)
        d.addErrback(errback, serviceName)
        return d

    def getInitialServices(self):
        """
        After connecting to zenhub, gather our initial list of services.

        @return: a DeferredList over getService() for every name in
            C{initialServices}. On failure the reactor is stopped and the
            failure is replaced by a RemoteBadMonitor.
        """
        def errback(error):
            if isinstance(error, Failure):
                message = "Invalid monitor: %s" % self.options.monitor
                self.log.critical(message)
                reactor.stop()
                return defer.fail(RemoteBadMonitor(message, ''))
            return error

        self.log.debug('Setting up initial services: %s',
                       ', '.join(self.initialServices))
        d = defer.DeferredList(
            [self.getService(name) for name in self.initialServices],
            fireOnOneErrback=True, consumeErrors=True)
        d.addErrback(errback)
        return d

    def connected(self):
        """
        Hook invoked by run() once the initial connection has completed;
        does nothing in this class.
        """
        pass

    def run(self):
        """
        Connect to ZenHub and run the reactor until it stops.

        Once connected, the start event is sent, the event flush loop is
        started and connected() is called. Exits the process with the code
        set via setExitCode() if it is non-zero.
        """
        self.rrdStats.config(self.options.monitor, self.name, [])
        self.log.debug('Starting PBDaemon initialization')
        d = self.connect()

        def callback(result):
            self.sendEvent(self.startEvent)
            self.pushEventsLoop()
            self.log.debug('Calling connected.')
            self.connected()
            return result

        d.addCallback(callback)
        d.addErrback(twisted.python.log.err)
        reactor.run()
        if self._customexitcode:
            sys.exit(self._customexitcode)

    def setExitCode(self, exitcode):
        """
        Set the process exit code used by run() after the reactor stops.
        """
        self._customexitcode = exitcode

    def stop(self, ignored=''):
        """
        Stop the reactor if it is running.

        @param ignored: unused; lets stop() be used as a deferred callback
        """
        if reactor.running:
            try:
                reactor.stop()
            except ReactorNotRunning:
                self.log.debug("Tried to stop reactor that was stopped")
        else:
            self.log.debug("stop() called when not running")

    def _stopPbDaemon(self):
        """
        Shutdown trigger: send the stop event, flush events, save counters.

        @return: a deferred when an EventService is available (so shutdown
            waits for the flush); otherwise None.
        """
        if self.stopped:
            return
        self.stopped = True
        if 'EventService' in self.services:
            # send stop event if we don't have an implied --cycle,
            # or if --cycle has been specified
            if not hasattr(self.options, 'cycle') or \
               getattr(self.options, 'cycle', True):
                self.sendEvent(self.stopEvent)
                self.log.debug("Sent a 'stop' event")
            if self._pushEventsDeferred:
                self.log.debug("Currently sending events. Queueing next call")
                d = self._pushEventsDeferred
                # Schedule another call to flush any additional queued events
                d.addBoth(lambda unused: self.pushEvents())
            else:
                d = self.pushEvents()
            d.addBoth(lambda unused: self.saveCounters())
            return d

        self.log.debug("No event sent as no EventService available.")
        self.saveCounters()

    def sendEvents(self, events):
        """
        Queue every event in C{events} via sendEvent().
        """
        # An explicit loop (rather than map()) guarantees the calls happen
        # even where map() is lazy.
        for event in events:
            self.sendEvent(event)

    def sendEvent(self, event, **kw):
        """
        Add event to queue of events to be sent.  The queue itself is
        flushed by pushEvents().

        @param event: event dictionary
        @param kw: extra fields that override those in C{event}
        """
        generatedEvent = self.generateEvent(event, **kw)
        self.eventQueueManager.addEvent(generatedEvent)
        self.counters['eventCount'] += 1

    def generateEvent(self, event, **kw):
        """
        Build the event that will be queued, without modifying C{event}.

        The copy is stamped with 'agent', 'monitor' and 'manager' and then
        updated with C{kw}, so keyword arguments take precedence.

        @return: the new event dict, or None when the reactor is not running.
        """
        if not reactor.running:
            return
        event = event.copy()
        event['agent'] = self.name
        event['monitor'] = self.options.monitor
        event['manager'] = self.fqdn
        event.update(kw)
        return event

    @defer.inlineCallbacks
    def pushEventsLoop(self):
        """Periodically, wake up and flush events to ZenHub.
        """
        reactor.callLater(self.options.eventflushseconds, self.pushEventsLoop)
        yield self.pushEvents()

        # Record the number of events in the queue every 5 minutes.
        now = time.time()
        if self.rrdStats.name and now >= (self.lastStats + 300):
            self.lastStats = now
            events = self.rrdStats.gauge(
                'eventQueueLength', 300,
                self.eventQueueManager.event_queue_length)
            for event in events:
                self.eventQueueManager.addPerformanceEvent(event)

    @defer.inlineCallbacks
    def pushEvents(self):
        """Flush events to ZenHub.
        """
        # are we already shutting down?
        if not reactor.running:
            self.log.debug("Skipping event sending - reactor not running.")
            return
        if self._pushEventsDeferred:
            self.log.debug("Skipping event sending - previous call active.")
            return
        try:
            # Marks a flush as in progress; fired in the finally clause.
            self._pushEventsDeferred = defer.Deferred()

            # are still connected to ZenHub?
            evtSvc = self.services.get('EventService', None)
            if not evtSvc:
                self.log.error("No event service: %r", evtSvc)
                return

            discarded_events = self.eventQueueManager.discarded_events
            if discarded_events:
                self.log.error(
                    'Discarded oldest %d events because maxqueuelen was '
                    'exceeded: %d/%d',
                    discarded_events,
                    discarded_events + self.options.maxqueuelen,
                    self.options.maxqueuelen)
                self.counters['discardedEvents'] += discarded_events
                self.eventQueueManager.discarded_events = 0

            send_events_fn = partial(evtSvc.callRemote, 'sendEvents')
            try:
                yield self.eventQueueManager.sendEvents(send_events_fn)
            except ConnectionLost as ex:
                self.log.error('Error sending event: %s', ex)
            except ConnectionDone:
                pass
        except Exception as ex:
            self.log.exception(ex)
        finally:
            d, self._pushEventsDeferred = self._pushEventsDeferred, None
            d.callback('sent')

    def heartbeat(self):
        """
        If cycling, send a heartbeat; otherwise shut down.

        When cycling, also posts the daemon counters through rrdStats and
        persists them with saveCounters().
        """
        if not self.options.cycle:
            self.stop()
            return
        heartbeatEvent = self.generateEvent(self.heartbeatEvent,
                                            timeout=self.heartbeatTimeout)
        self.eventQueueManager.addHeartbeatEvent(heartbeatEvent)
        # heartbeat is normally 3x cycle time
        self.niceDoggie(self.heartbeatTimeout / 3)

        events = []
        # save daemon counter stats
        for name, value in self.counters.items():
            self.log.info("Counter %s, value %d", name, value)
            events += self.rrdStats.counter(name, 300, value)
        self.sendEvents(events)

        # persist counters values
        self.saveCounters()

    def saveCounters(self):
        """
        Pickle self.counters to var/<name>_counters.pickle (best effort).
        """
        atomicWrite(
            zenPath('var/%s_counters.pickle' % self.name),
            pickle.dumps(self.counters),
            raiseException=False,
        )

    def loadCounters(self):
        """
        Restore self.counters from the file written by saveCounters().

        Best effort: on any failure the current counters are left untouched.
        """
        path = zenPath('var/%s_counters.pickle' % self.name)
        try:
            # NOTE(review): pickle is only safe because this file is written
            # by saveCounters(); never point this at untrusted data.
            with open(path, 'rb') as counterFile:
                self.counters = pickle.load(counterFile)
        except Exception as ex:
            self.log.debug("Unable to load counters from %s: %s", path, ex)

    def remote_getName(self):
        """
        Return this daemon's name.
        """
        return self.name

    def remote_shutdown(self, unused):
        """
        Stop the reactor and then call sigTerm().
        """
        self.stop()
        self.sigTerm()

    def remote_setPropertyItems(self, items):
        """
        Accept property items; does nothing in this class.
        """
        pass

    @translateError
    def remote_updateThresholdClasses(self, classes):
        """
        Import each named threshold class, logging those that fail to import.

        @param classes: iterable of class names passed to importClass
        """
        from Products.ZenUtils.Utils import importClass
        self.log.debug("Loading classes %s", classes)
        for c in classes:
            try:
                importClass(c)
            except ImportError:
                self.log.error("Unable to import class %s", c)

    def buildOptions(self):
        """
        Register the ZenHub connection and event queue command line options,
        then the options of ZenDaemon.
        """
        self.parser.add_option('--hubhost',
                               dest='hubhost',
                               default=DEFAULT_HUB_HOST,
                               help='Host of zenhub daemon.'
                                    ' Default is %s.' % DEFAULT_HUB_HOST)
        self.parser.add_option('--hubport',
                               dest='hubport',
                               type='int',
                               default=DEFAULT_HUB_PORT,
                               help='Port zenhub listens on.'
                                    ' Default is %s.' % DEFAULT_HUB_PORT)
        self.parser.add_option('--hubusername',
                               dest='hubusername',
                               default=DEFAULT_HUB_USERNAME,
                               help='Username for zenhub login.'
                                    ' Default is %s.' % DEFAULT_HUB_USERNAME)
        self.parser.add_option('--hubpassword',
                               dest='hubpassword',
                               default=DEFAULT_HUB_PASSWORD,
                               help='Password for zenhub login.'
                                    ' Default is %s.' % DEFAULT_HUB_PASSWORD)
        self.parser.add_option('--monitor',
                               dest='monitor',
                               default=DEFAULT_HUB_MONITOR,
                               help='Name of monitor instance to use for'
                                    ' configuration. Default is %s.'
                                    % DEFAULT_HUB_MONITOR)
        self.parser.add_option('--initialHubTimeout',
                               dest='hubtimeout',
                               type='int',
                               default=30,
                               help='Initial time to wait for a ZenHub '
                                    'connection')
        self.parser.add_option('--allowduplicateclears',
                               dest='allowduplicateclears',
                               default=False,
                               action='store_true',
                               help='Send clear events even when the most '
                                    'recent event was also a clear event.')
        self.parser.add_option(
            '--duplicateclearinterval',
            dest='duplicateclearinterval',
            default=0,
            type='int',
            help=('Send a clear event every [DUPLICATECLEARINTEVAL] '
                  'events.'))
        self.parser.add_option('--eventflushseconds',
                               dest='eventflushseconds',
                               default=5.,
                               type='float',
                               help='Seconds between attempts to flush '
                                    'events to ZenHub.')
        self.parser.add_option('--eventflushchunksize',
                               dest='eventflushchunksize',
                               default=50,
                               type='int',
                               help='Number of events to send to ZenHub'
                                    ' at one time')
        self.parser.add_option('--maxqueuelen',
                               dest='maxqueuelen',
                               default=5000,
                               type='int',
                               help='Maximum number of events to queue')
        self.parser.add_option('--zenhubpinginterval',
                               dest='zhPingInterval',
                               default=30,
                               type='int',
                               help='How often to ping zenhub')
        self.parser.add_option('--disable-event-deduplication',
                               dest='deduplicate_events',
                               default=True,
                               action='store_false',
                               help='Disable event de-duplication')

        ZenDaemon.buildOptions(self)
class EventServer(PBDaemon):
    """
    Base class for a daemon whose primary job is to post events
    """

    name = 'EventServer'

    def __init__(self):
        PBDaemon.__init__(self, keeproot=True)
        self.stats = Stats()
        self.rrdStats = DaemonStats()

    def connected(self):
        """
        Send an application start event, then fetch configuration.
        """
        self.sendEvent(dict(device=self.options.monitor,
                            eventClass=App_Start,
                            summary="%s started" % self.name,
                            severity=0,
                            component=self.name))
        self.log.info("started")
        self.configure()

    def model(self):
        """
        Return the 'EventService' service, or a FakeRemote if it is missing.
        """
        return self.services.get('EventService', FakeRemote())

    def configure(self):
        """
        Fetch the RRD create command, threshold classes and collector
        thresholds from the event service, configure rrdStats with them and
        start the heartbeat and report cycles.

        @return: the deferred produced by drive(); errors are logged.
        """
        def inner(driver):
            self.log.info("fetching default RRDCreateCommand")
            yield self.model().callRemote('getDefaultRRDCreateCommand')
            createCommand = driver.next()

            self.log.info("getting threshold classes")
            yield self.model().callRemote('getThresholdClasses')
            self.remote_updateThresholdClasses(driver.next())

            self.log.info("getting collector thresholds")
            yield self.model().callRemote('getCollectorThresholds')
            self.rrdStats.config(self.options.monitor,
                                 self.name,
                                 driver.next(),
                                 createCommand)
            self.heartbeat()
            self.reportCycle()

        d = drive(inner)

        def error(result):
            self.log.error("Unexpected error in configure: %s", result)

        d.addErrback(error)
        return d

    def sendEvent(self, event, **kw):
        """
        Record timing statistics for the event, then queue it.

        @param event: event dictionary; when it has a 'firstTime' key the
            time elapsed since then is added to self.stats.
        @param kw: extra fields forwarded to PBDaemon.sendEvent
        @return: whatever PBDaemon.sendEvent returns
        """
        # FIXME: get real event processing stats
        if 'firstTime' in event:
            # Clamp at zero so a 'firstTime' in the future cannot produce a
            # negative duration (min() here would discard every real sample).
            self.stats.add(max(time.time() - event['firstTime'], 0))
        return PBDaemon.sendEvent(self, event, **kw)

    def useUdpFileDescriptor(self, fd):
        """
        Adopt an already-open UDP socket file descriptor as self.transport.

        @param fd: file descriptor of the socket; it is closed here after
            being duplicated by socket.fromfd().
        """
        from twisted.internet import udp
        s = socket.fromfd(fd, socket.AF_INET, socket.SOCK_DGRAM)
        import os
        os.close(fd)
        port = s.getsockname()[1]
        transport = udp.Port(port, self)
        s.setblocking(0)
        transport.socket = s
        transport.fileno = s.fileno
        transport.connected = 1
        transport._realPortNumber = port
        self.transport = transport
        # hack around startListening not being called
        self.numPorts = 1
        transport.startReading()

    def useTcpFileDescriptor(self, fd, factory):
        """
        Make a listening TCP port backed by an already-open file descriptor.

        A placeholder port in the range 19800-19998 is opened and its socket
        is then replaced by C{fd} via os.dup2().

        @param fd: file descriptor of the listening socket; closed on success
        @param factory: factory passed to reactor.listenTCP
        @return: the listening port object
        @raise socket.error: when no port in the range could be used
        """
        import os
        import socket
        for i in range(19800, 19999):
            try:
                p = reactor.listenTCP(i, factory)
                os.dup2(fd, p.socket.fileno())
                p.socket.listen(p.backlog)
                p.socket.setblocking(False)
                p.socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
                os.close(fd)
                return p
            except socket.error:
                pass
        raise socket.error("Unable to find an open socket to listen on")

    def reportCycle(self):
        """
        Call report() every C{statcycle} seconds when that option is set.
        """
        if self.options.statcycle:
            self.report()
            reactor.callLater(self.options.statcycle, self.reportCycle)

    def heartbeat(self):
        """Since we don't do anything on a regular basis, just
        push heartbeats regularly"""
        seconds = self.heartbeatTimeout / 3
        reactor.callLater(seconds, self.heartbeat)
        PBDaemon.heartbeat(self)
        totalTime, totalEvents, maxTime = self.stats.report()
        for ev in (self.rrdStats.counter('events',
                                         seconds,
                                         totalEvents) +
                   self.rrdStats.counter('totalTime',
                                         seconds,
                                         int(totalTime * 1000))):
            self.sendEvent(ev)

    def report(self):
        """
        Report some simple diagnostics at shutdown
        """
        totalTime, totalEvents, maxTime = self.stats.report()
        self.log.info("%d events processed in %.2f seconds",
                      totalEvents,
                      totalTime)
        if totalEvents > 0:
            self.log.info("%.5f average seconds per event",
                          (totalTime / totalEvents))
            self.log.info("Maximum processing time for one event was %.5f",
                          maxTime)

    def buildOptions(self):
        """
        Register the PBDaemon options plus --statcycle.
        """
        PBDaemon.buildOptions(self)
        self.parser.add_option(
            '--statcycle',
            dest='statcycle',
            type='int',
            help='Number of seconds between the writing of statistics',
            default=0)
class PBDaemon(ZenDaemon, pb.Referenceable): name = 'pbdaemon' initialServices = ['EventService'] heartbeatEvent = {'eventClass':Heartbeat} heartbeatTimeout = 60*3 _customexitcode = 0 _pushEventsDeferred = None def __init__(self, noopts=0, keeproot=False, name=None): # if we were provided our collector name via the constructor instead of # via code, be sure to store it correctly. if name is not None: self.name = name self.mname = name try: ZenDaemon.__init__(self, noopts, keeproot) except IOError: import traceback self.log.critical( traceback.format_exc( 0 ) ) sys.exit(1) self.rrdStats = DaemonStats() self.lastStats = 0 self.perspective = None self.services = {} self.eventQueueManager = EventQueueManager(self.options, self.log) self.startEvent = startEvent.copy() self.stopEvent = stopEvent.copy() details = dict(component=self.name, device=self.options.monitor) for evt in self.startEvent, self.stopEvent, self.heartbeatEvent: evt.update(details) self.initialConnect = defer.Deferred() self.stopped = False self.counters = collections.Counter() self.loadCounters() self._pingedZenhub = None self._connectionTimeout = None # Add a shutdown trigger to send a stop event and flush the event queue reactor.addSystemEventTrigger('before', 'shutdown', self._stopPbDaemon) def connecting(self): """ Called when about to connect to zenhub """ self.log.info("Attempting to connect to zenhub") def gotPerspective(self, perspective): """ This gets called every time we reconnect. @parameter perspective: Twisted perspective object @type perspective: Twisted perspective object """ self.log.info("Connected to ZenHub") self.perspective = perspective # Cancel the connection timeout timer as it's no longer needed. 
if self._connectionTimeout: try: self._connectionTimeout.cancel() except AlreadyCalled: pass self._connectionTimeout = None d2 = self.getInitialServices() if self.initialConnect: self.log.debug('Chaining getInitialServices with d2') self.initialConnect, d = None, self.initialConnect d2.chainDeferred(d) def connect(self): factory = ReconnectingPBClientFactory(connectTimeout=60) self.log.info("Connecting to %s:%d" % (self.options.hubhost, self.options.hubport)) factory.connectTCP(self.options.hubhost, self.options.hubport) username = self.options.hubusername password = self.options.hubpassword self.log.debug("Logging in as %s" % username) c = credentials.UsernamePassword(username, password) factory.gotPerspective = self.gotPerspective factory.connecting = self.connecting factory.startLogin(c) def timeout(d): if not d.called: self.connectTimeout() self._connectionTimeout = reactor.callLater( self.options.hubtimeout, timeout, self.initialConnect) return self.initialConnect def connectTimeout(self): self.log.error('Timeout connecting to zenhub: is it running?') pass def eventService(self): return self.getServiceNow('EventService') def getServiceNow(self, svcName): if not svcName in self.services: self.log.warning('No service named %r: ZenHub may be disconnected' % svcName) return self.services.get(svcName, None) or FakeRemote() def getService(self, serviceName, serviceListeningInterface=None): """ Attempt to get a service from zenhub. Returns a deferred. When service is retrieved it is stashed in self.services with serviceName as the key. 
When getService is called it will first check self.services and if serviceName is already there it will return the entry from self.services wrapped in a defer.succeed """ if serviceName in self.services: return defer.succeed(self.services[serviceName]) def removeService(ignored): self.log.debug('Removing service %s' % serviceName) if serviceName in self.services: del self.services[serviceName] def callback(result, serviceName): self.log.debug('Loaded service %s from zenhub' % serviceName) self.services[serviceName] = result result.notifyOnDisconnect(removeService) return result def errback(error, serviceName): self.log.debug('errback after getting service %s' % serviceName) self.log.error('Could not retrieve service %s' % serviceName) if serviceName in self.services: del self.services[serviceName] return error d = self.perspective.callRemote('getService', serviceName, self.options.monitor, serviceListeningInterface or self) d.addCallback(callback, serviceName) d.addErrback(errback, serviceName) return d def getInitialServices(self): """ After connecting to zenhub, gather our initial list of services. 
""" def errback(error): if isinstance(error, Failure): self.log.critical( "Invalid monitor: %s" % self.options.monitor) reactor.stop() return defer.fail(RemoteBadMonitor( "Invalid monitor: %s" % self.options.monitor, '')) return error self.log.debug('Setting up initial services: %s' % \ ', '.join(self.initialServices)) d = defer.DeferredList( [self.getService(name) for name in self.initialServices], fireOnOneErrback=True, consumeErrors=True) d.addErrback(errback) return d def connected(self): pass def run(self): self.rrdStats.config(self.options.monitor, self.name, []) self.log.debug('Starting PBDaemon initialization') d = self.connect() def callback(result): self.sendEvent(self.startEvent) self.pushEventsLoop() self.log.debug('Calling connected.') self.connected() return result d.addCallback(callback) d.addErrback(twisted.python.log.err) reactor.run() if self._customexitcode: sys.exit(self._customexitcode) def setExitCode(self, exitcode): self._customexitcode = exitcode def stop(self, ignored=''): if reactor.running: try: reactor.stop() except ReactorNotRunning: self.log.debug("Tried to stop reactor that was stopped") else: self.log.debug("stop() called when not running") def _stopPbDaemon(self): if self.stopped: return self.stopped = True if 'EventService' in self.services: # send stop event if we don't have an implied --cycle, # or if --cycle has been specified if not hasattr(self.options, 'cycle') or\ getattr(self.options, 'cycle', True): self.sendEvent(self.stopEvent) self.log.debug("Sent a 'stop' event") if self._pushEventsDeferred: self.log.debug("Currently sending events. 
Queueing next call") d = self._pushEventsDeferred # Schedule another call to flush any additional queued events d.addBoth(lambda unused: self.pushEvents()) else: d = self.pushEvents() d.addBoth(lambda unused: self.saveCounters()) return d self.log.debug("No event sent as no EventService available.") self.saveCounters() def sendEvents(self, events): map(self.sendEvent, events) def sendEvent(self, event, **kw): ''' Add event to queue of events to be sent. If we have an event service then process the queue. ''' generatedEvent = self.generateEvent(event, **kw) self.eventQueueManager.addEvent(generatedEvent) self.counters['eventCount'] += 1 def generateEvent(self, event, **kw): ''' Add event to queue of events to be sent. If we have an event service then process the queue. ''' if not reactor.running: return event = event.copy() event['agent'] = self.name event['monitor'] = self.options.monitor event['manager'] = self.fqdn event.update(kw) return event @defer.inlineCallbacks def pushEventsLoop(self): """Periodially, wake up and flush events to ZenHub. """ reactor.callLater(self.options.eventflushseconds, self.pushEventsLoop) yield self.pushEvents() # Record the number of events in the queue every 5 minutes. now = time.time() if self.rrdStats.name and now >= (self.lastStats + 300): self.lastStats = now events = self.rrdStats.gauge('eventQueueLength', 300, self.eventQueueManager.event_queue_length) for event in events: self.eventQueueManager.addPerformanceEvent(event) @defer.inlineCallbacks def pushEvents(self): """Flush events to ZenHub. """ # are we already shutting down? if not reactor.running: self.log.debug("Skipping event sending - reactor not running.") return if self._pushEventsDeferred: self.log.debug("Skipping event sending - previous call active.") return try: self._pushEventsDeferred = defer.Deferred() # are still connected to ZenHub? 
evtSvc = self.services.get('EventService', None) if not evtSvc: self.log.error("No event service: %r", evtSvc) return discarded_events = self.eventQueueManager.discarded_events if discarded_events: self.log.error( 'Discarded oldest %d events because maxqueuelen was ' 'exceeded: %d/%d', discarded_events, discarded_events + self.options.maxqueuelen, self.options.maxqueuelen) self.counters['discardedEvents'] += discarded_events self.eventQueueManager.discarded_events = 0 send_events_fn = partial(evtSvc.callRemote, 'sendEvents') try: yield self.eventQueueManager.sendEvents(send_events_fn) except ConnectionLost as ex: self.log.error('Error sending event: %s', ex) except ConnectionDone: pass except Exception as ex: self.log.exception(ex) finally: d, self._pushEventsDeferred = self._pushEventsDeferred, None d.callback('sent') def heartbeat(self): 'if cycling, send a heartbeat, else, shutdown' if not self.options.cycle: self.stop() return heartbeatEvent = self.generateEvent(self.heartbeatEvent, timeout=self.heartbeatTimeout) self.eventQueueManager.addHeartbeatEvent(heartbeatEvent) # heartbeat is normally 3x cycle time self.niceDoggie(self.heartbeatTimeout / 3) events = [] # save daemon counter stats for name, value in self.counters.items(): self.log.info("Counter %s, value %d", name, value) events += self.rrdStats.counter(name, 300, value) self.sendEvents(events) # persist counters values self.saveCounters() def saveCounters(self): atomicWrite( zenPath('var/%s_counters.pickle' % self.name), pickle.dumps(self.counters), raiseException=False, ) def loadCounters(self): try: self.counters = pickle.load(open(zenPath('var/%s_counters.pickle'% self.name))) except Exception: pass def remote_getName(self): return self.name def remote_shutdown(self, unused): self.stop() self.sigTerm() def remote_setPropertyItems(self, items): pass @translateError def remote_updateThresholdClasses(self, classes): from Products.ZenUtils.Utils import importClass self.log.debug("Loading classes %s", 
classes) for c in classes: try: importClass(c) except ImportError: self.log.error("Unable to import class %s", c) def buildOptions(self): self.parser.add_option('--hubhost', dest='hubhost', default=DEFAULT_HUB_HOST, help='Host of zenhub daemon.' ' Default is %s.' % DEFAULT_HUB_HOST) self.parser.add_option('--hubport', dest='hubport', type='int', default=DEFAULT_HUB_PORT, help='Port zenhub listens on.' 'Default is %s.' % DEFAULT_HUB_PORT) self.parser.add_option('--hubusername', dest='hubusername', default=DEFAULT_HUB_USERNAME, help='Username for zenhub login.' ' Default is %s.' % DEFAULT_HUB_USERNAME) self.parser.add_option('--hubpassword', dest='hubpassword', default=DEFAULT_HUB_PASSWORD, help='Password for zenhub login.' ' Default is %s.' % DEFAULT_HUB_PASSWORD) self.parser.add_option('--monitor', dest='monitor', default=DEFAULT_HUB_MONITOR, help='Name of monitor instance to use for' ' configuration. Default is %s.' % DEFAULT_HUB_MONITOR) self.parser.add_option('--initialHubTimeout', dest='hubtimeout', type='int', default=30, help='Initial time to wait for a ZenHub ' 'connection') self.parser.add_option('--allowduplicateclears', dest='allowduplicateclears', default=False, action='store_true', help='Send clear events even when the most ' 'recent event was also a clear event.') self.parser.add_option('--duplicateclearinterval', dest='duplicateclearinterval', default=0, type='int', help=('Send a clear event every [DUPLICATECLEARINTEVAL] ' 'events.') ) self.parser.add_option('--eventflushseconds', dest='eventflushseconds', default=5., type='float', help='Seconds between attempts to flush ' 'events to ZenHub.') self.parser.add_option('--eventflushchunksize', dest='eventflushchunksize', default=50, type='int', help='Number of events to send to ZenHub' 'at one time') self.parser.add_option('--maxqueuelen', dest='maxqueuelen', default=5000, type='int', help='Maximum number of events to queue') self.parser.add_option('--zenhubpinginterval', dest='zhPingInterval', 
default=30, type='int', help='How often to ping zenhub') self.parser.add_option('--disable-event-deduplication', dest='deduplicate_events', default=True, action='store_false', help='Disable event de-duplication') ZenDaemon.buildOptions(self)
class EventServer(PBDaemon):
    'Base class for a daemon whose primary job is to post events'

    name = 'EventServer'

    def __init__(self):
        """Initialise the daemon with keeproot=True and fresh stats."""
        PBDaemon.__init__(self, keeproot=True)
        self.stats = Stats()
        self.rrdStats = DaemonStats()

    def connected(self):
        """Queue an App_Start event, then fetch configuration."""
        self.sendEvent(dict(device=self.options.monitor,
                            eventClass=App_Start,
                            summary="%s started" % self.name,
                            severity=0,
                            component=self.name))
        self.log.info("started")
        self.configure()

    def model(self):
        """Return the EventService, or a FakeRemote if it is not loaded."""
        return self.services.get('EventService', FakeRemote())

    def configure(self):
        """Fetch the RRD create command, threshold classes and collector
        thresholds from the service, then start the heartbeat and report
        cycles.

        Returns the deferred produced by drive(); failures are logged.
        """
        def inner(driver):
            self.log.info("fetching default RRDCreateCommand")
            yield self.model().callRemote('getDefaultRRDCreateCommand')
            createCommand = driver.next()

            self.log.info("getting threshold classes")
            yield self.model().callRemote('getThresholdClasses')
            self.remote_updateThresholdClasses(driver.next())

            self.log.info("getting collector thresholds")
            yield self.model().callRemote('getCollectorThresholds')
            self.rrdStats.config(self.options.monitor, self.name,
                                 driver.next(), createCommand)
            self.heartbeat()
            self.reportCycle()

        def error(result):
            self.log.error("Unexpected error in configure: %s", result)

        d = drive(inner)
        d.addErrback(error)
        return d

    def sendEvent(self, event, **kw):
        """Record the event's age in self.stats (when it carries
        'firstTime') and queue it through PBDaemon.sendEvent()."""
        # FIXME: get real event processing stats
        if 'firstTime' in event:
            elapsed = time.time() - event['firstTime']
            # Clamp negative ages (firstTime in the future) to zero.  The
            # previous min(elapsed, 0) recorded 0 for every normal event.
            self.stats.add(max(elapsed, 0))
        PBDaemon.sendEvent(self, event, **kw)

    def useUdpFileDescriptor(self, fd):
        """Wrap the already-open UDP descriptor `fd` in a twisted udp.Port
        with this object as its protocol, and start reading from it."""
        import os
        from twisted.internet import udp

        s = socket.fromfd(fd, socket.AF_INET, socket.SOCK_DGRAM)
        # fromfd() duplicated the descriptor, so the original can go.
        os.close(fd)
        port = s.getsockname()[1]
        transport = udp.Port(port, self)
        s.setblocking(0)
        transport.socket = s
        transport.fileno = s.fileno
        transport.connected = 1
        transport._realPortNumber = port
        self.transport = transport
        # hack around startListening not being called
        self.numPorts = 1
        transport.startReading()

    def useTcpFileDescriptor(self, fd, factory):
        """Listen for `factory` on the already-open TCP descriptor `fd`.

        Opens a listening port in the range 19800-19998 and then dup2()s
        `fd` over that port's socket descriptor.  Returns the port
        object; raises socket.error if no attempt succeeds.
        """
        import os
        import socket

        for i in range(19800, 19999):
            try:
                p = reactor.listenTCP(i, factory)
                os.dup2(fd, p.socket.fileno())
                p.socket.listen(p.backlog)
                p.socket.setblocking(False)
                p.socket.setsockopt(socket.SOL_SOCKET,
                                    socket.SO_REUSEADDR, 1)
                os.close(fd)
                return p
            except socket.error:
                # try the next port number
                pass
        raise socket.error("Unable to find an open socket to listen on")

    def reportCycle(self):
        """Log statistics every `statcycle` seconds; disabled when 0."""
        if self.options.statcycle:
            self.report()
            reactor.callLater(self.options.statcycle, self.reportCycle)

    def heartbeat(self):
        """Since we don't do anything on a regular basis, just
        push heartbeats regularly"""
        seconds = self.heartbeatTimeout / 3
        reactor.callLater(seconds, self.heartbeat)
        PBDaemon.heartbeat(self)
        totalTime, totalEvents, maxTime = self.stats.report()
        for ev in (self.rrdStats.counter('events',
                                         seconds,
                                         totalEvents) +
                   self.rrdStats.counter('totalTime',
                                         seconds,
                                         int(totalTime * 1000))):
            self.sendEvent(ev)

    def report(self):
        'report some simple diagnostics at shutdown'
        totalTime, totalEvents, maxTime = self.stats.report()
        self.log.info("%d events processed in %.2f seconds",
                      totalEvents,
                      totalTime)
        if totalEvents > 0:
            self.log.info("%.5f average seconds per event",
                          (totalTime / totalEvents))
            self.log.info("Maximum processing time for one event was %.5f",
                          maxTime)

    def buildOptions(self):
        """Add --statcycle on top of the PBDaemon options."""
        PBDaemon.buildOptions(self)
        self.parser.add_option('--statcycle',
                               dest='statcycle',
                               type='int',
                               help='Number of seconds between the writing '
                                    'of statistics',
                               default=0)