def __init__(self, noopts=0, keeproot=False, name=None):
    """
    Initialize the daemon state and register the shutdown trigger.

    @param noopts: passed through unchanged to ZenDaemon.__init__
    @param keeproot: passed through unchanged to ZenDaemon.__init__
    @param name: optional collector name; when given it overrides the
        class-level name (and mname) before ZenDaemon is initialized
    @type name: string or None
    """
    # if we were provided our collector name via the constructor instead of
    # via code, be sure to store it correctly.
    if name is not None:
        self.name = name
        self.mname = name

    try:
        ZenDaemon.__init__(self, noopts, keeproot)
    except IOError:
        # Log only the exception line (traceback limit of 0) and give up.
        import traceback
        self.log.critical(traceback.format_exc(0))
        sys.exit(1)

    self.rrdStats = DaemonStats()
    self.lastStats = 0
    self.perspective = None
    self.services = {}
    self.eventQueueManager = EventQueueManager(self.options, self.log)

    # Work on per-instance copies of the event templates.  The heartbeat
    # template is looked up through the class, so it must be copied too;
    # otherwise update() below would write this daemon's details into a
    # dict shared by every class and instance that inherits it.
    self.startEvent = startEvent.copy()
    self.stopEvent = stopEvent.copy()
    self.heartbeatEvent = dict(self.heartbeatEvent)
    details = dict(component=self.name, device=self.options.monitor)
    for evt in self.startEvent, self.stopEvent, self.heartbeatEvent:
        evt.update(details)

    self.initialConnect = defer.Deferred()
    self.stopped = False
    self.counters = collections.Counter()
    self.loadCounters()
    self._pingedZenhub = None
    self._connectionTimeout = None

    # Add a shutdown trigger to send a stop event and flush the event queue
    reactor.addSystemEventTrigger('before', 'shutdown', self._stopPbDaemon)
def __init__(self, single=False):
    """
    Set up the modeler's state on top of the PBDaemon initialization.

    @param single: collect from a single device?
    @type single: boolean
    """
    PBDaemon.__init__(self)
    opts = self.options

    # FIXME: cleanup --force option #2660
    opts.force = True
    self.start = None
    self.startat = None
    self.rrdStats = DaemonStats()

    # Naming a device on the command line always implies single mode.
    self.single = True if opts.device else single
    self.modelerCycleInterval = opts.cycletime
    # collage is given in minutes; store it as a fraction of a day
    self.collage = float(opts.collage) / 1440.0

    self.pendingNewClients = False
    self.clients = []
    self.finished = []
    self.devicegen = None
    self.counters = collections.Counter()
    self.configFilter = None
    self.configLoaded = False

    # Make sendEvent() available to plugins
    zope.component.provideUtility(self, IEventService)

    # When run as a daemon (or cycling) without "now", wait a random
    # 10 to 60 seconds before starting.
    self.started = False
    self.startDelay = 0
    self.immediate = 1
    if not (opts.daemon or opts.cycle):
        self.log.debug("Run in foreground, starting immediately.")
    elif opts.now:
        self.log.debug('option "now" specified, starting immediately.')
    else:
        self.startDelay = randint(10, 60)
        self.immediate = 0
        self.log.info(
            'option "now" not specified, waiting %s seconds to start.' %
            self.startDelay)

    # ZEN-26637
    self.collectorLoopIteration = 0
    self.mainLoopGotDeviceList = False
    self.isMainScheduled = False

    self._modeledDevicesMetric = Metrology.meter("zenmodeler.modeledDevices")
    self._failuresMetric = Metrology.counter("zenmodeler.failures")
def get_rrd_stats(self, hub_config, send_event):
    """
    Build a DaemonStats object configured for the 'zenhub' daemon.

    @param hub_config: object providing getThresholdInstances() and an id
    @param send_event: callable handed to the ThresholdNotifier
    @return: the configured DaemonStats instance
    """
    stats = DaemonStats()
    notifier = ThresholdNotifier(
        send_event,
        hub_config.getThresholdInstances(BuiltInDS.sourcetype),
    )
    tracker = DerivativeTracker()
    stats.config(
        'zenhub', hub_config.id, self.metric_writer, notifier, tracker
    )
    return stats
def __init__(self, noopts=0, keeproot=False, name=None):
    """
    Initialize the daemon state and register the shutdown trigger.

    @param noopts: passed through unchanged to ZenDaemon.__init__
    @param keeproot: passed through unchanged to ZenDaemon.__init__
    @param name: optional collector name; when given it overrides the
        class-level name (and mname) before ZenDaemon is initialized
    @type name: string or None
    """
    # if we were provided our collector name via the constructor instead of
    # via code, be sure to store it correctly.
    if name is not None:
        self.name = name
        self.mname = name

    try:
        ZenDaemon.__init__(self, noopts, keeproot)
    except IOError:
        # Log only the exception line (traceback limit of 0) and give up.
        import traceback
        self.log.critical(traceback.format_exc(0))
        sys.exit(1)

    self.rrdStats = DaemonStats()
    self.lastStats = 0
    self.perspective = None
    self.services = {}
    self.eventQueueManager = EventQueueManager(self.options, self.log)

    # Each instance gets its own copy of every event template.  Without
    # copying heartbeatEvent, the update() below would modify the dict
    # found through the class and leak this daemon's component/device
    # into everything else that shares it.
    self.startEvent = startEvent.copy()
    self.stopEvent = stopEvent.copy()
    self.heartbeatEvent = dict(self.heartbeatEvent)
    details = dict(component=self.name, device=self.options.monitor)
    for evt in self.startEvent, self.stopEvent, self.heartbeatEvent:
        evt.update(details)

    self.initialConnect = defer.Deferred()
    self.stopped = False
    self.counters = collections.Counter()
    self.loadCounters()
    self._pingedZenhub = None
    self._connectionTimeout = None

    # Add a shutdown trigger to send a stop event and flush the event queue
    reactor.addSystemEventTrigger('before', 'shutdown', self._stopPbDaemon)
def getRRDStats(self):
    """
    Build and return a DaemonStats object configured for 'zenhub'.

    The thresholds come from the configuration object returned by
    self._getConf(); threshold events are sent through self.zem.sendEvent.
    """
    stats = DaemonStats()
    conf = self._getConf()

    from Products.ZenModel.BuiltInDS import BuiltInDS
    thresholds = conf.getThresholdInstances(BuiltInDS.sourcetype)
    notifier = ThresholdNotifier(self.zem.sendEvent, thresholds)
    tracker = DerivativeTracker()

    stats.config(
        'zenhub', conf.id, self._metric_writer, notifier, tracker)
    return stats
def __init__(self, single=False):
    """
    Set up the modeler's state on top of the PBDaemon initialization.

    @param single: collect from a single device?
    @type single: boolean
    """
    PBDaemon.__init__(self)
    opts = self.options

    # FIXME: cleanup --force option #2660
    opts.force = True
    self.start = None
    self.startat = None
    self.rrdStats = DaemonStats()

    # Naming a device on the command line always implies single mode.
    self.single = True if opts.device else single
    self.modelerCycleInterval = opts.cycletime
    # collage divided by the number of minutes in a day
    self.collage = float(opts.collage) / 1440.0

    self.pendingNewClients = False
    self.clients = []
    self.finished = []
    self.devicegen = None
    self.counters = collections.Counter()

    # Make sendEvent() available to plugins
    zope.component.provideUtility(self, IEventService)

    # A daemon started without "now" waits a random 10 to 60 seconds.
    self.started = False
    self.startDelay = 0
    self.immediate = 1
    if not opts.daemon:
        self.log.debug("Run in foreground, starting immediately.")
    elif opts.now:
        self.log.debug("Run as a daemon, starting immediately.")
    else:
        self.startDelay = randint(10, 60)
        self.immediate = 0
        self.log.info("Run as a daemon, waiting %s seconds to start." %
                      self.startDelay)

    # load performance counters
    self.loadCounters()
def __init__(self, single=False):
    """
    Prepare modeler bookkeeping after the PBDaemon initialization has run.

    @param single: collect from a single device?
    @type single: boolean
    """
    PBDaemon.__init__(self)
    options = self.options

    # FIXME: cleanup --force option #2660
    options.force = True
    self.start = None
    self.startat = None
    self.rrdStats = DaemonStats()

    # An explicit device option forces single-device mode.
    if options.device:
        self.single = True
    else:
        self.single = single
    self.modelerCycleInterval = options.cycletime
    # get the minutes and convert to fraction of a day
    minutes = float(options.collage)
    self.collage = minutes / 1440.0

    self.pendingNewClients = False
    self.clients = []
    self.finished = []
    self.devicegen = None
    self.counters = collections.Counter()
    self.configFilter = None
    self.configLoaded = False

    # Make sendEvent() available to plugins
    zope.component.provideUtility(self, IEventService)

    # Daemon/cycle runs without "now" are delayed by a random
    # 10 to 60 seconds; everything else starts immediately.
    self.started = False
    self.startDelay = 0
    self.immediate = 1
    background = options.daemon or options.cycle
    if background and not options.now:
        self.startDelay = randint(10, 60)
        self.immediate = 0
        self.log.info(
            'option "now" not specified, waiting %s seconds to start.' %
            self.startDelay)
    elif background:
        self.log.debug('option "now" specified, starting immediately.')
    else:
        self.log.debug("Run in foreground, starting immediately.")

    # ZEN-26637
    self.collectorLoopIteration = 0
    self.mainLoopGotDeviceList = False

    self._modeledDevicesMetric = Metrology.meter("zenmodeler.modeledDevices")
    self._failuresMetric = Metrology.counter("zenmodeler.failures")
class DaemonStatsTest(BaseTestCase):
    """Test the DaemonStats"""

    # Environment variables these tests set or clear.
    _ENV_KEYS = (
        "CONTROLPLANE",
        "CONTROLPLANE_SERVICE_ID",
        "CONTROLPLANE_TENANT_ID",
        "CONTROLPLANE_INSTANCE_ID",
    )

    def setUp(self):
        # Remember the current values so tearDown can put them back; the
        # tests below modify the real process environment.
        self._saved_env = dict(
            (key, os.environ.get(key)) for key in self._ENV_KEYS)
        self.daemon_stats = DaemonStats()

    def tearDown(self):
        # Restore the environment first so it is put back even if the
        # inherited tearDown raises.
        for key, value in self._saved_env.items():
            if value is None:
                os.environ.pop(key, None)
            else:
                os.environ[key] = value
        super(DaemonStatsTest, self).tearDown()

    def testDaemonsTagsServiceId(self):
        """With the CONTROLPLANE variables set, the tags include them."""
        os.environ["CONTROLPLANE"] = "1"
        os.environ["CONTROLPLANE_SERVICE_ID"] = "ID"
        os.environ["CONTROLPLANE_TENANT_ID"] = "foo"
        os.environ["CONTROLPLANE_INSTANCE_ID"] = "bar"
        self.daemon_stats.config("name", "monitor", None, None, None)
        self.assertEqual(
            {'daemon': 'name', 'instance': 'bar', 'internal': True,
             'monitor': 'monitor', 'metricType': 'type',
             'serviceId': 'ID', 'tenantId': 'foo'},
            self.daemon_stats._tags("type")
        )

    def testDaemonsDoesNotTagServiceId(self):
        """Without the CONTROLPLANE variables, only the base tags appear."""
        # Clear every variable the positive test sets, not just two of
        # them, so the outcome does not depend on test ordering.
        for key in self._ENV_KEYS:
            os.environ.pop(key, None)
        self.daemon_stats.config("name", "monitor", None, None, None)
        self.assertEqual(
            {'daemon': 'name', 'internal': True, 'monitor': 'monitor',
             'metricType': 'type'},
            self.daemon_stats._tags("type")
        )
def __init__(self, single=False):
    """
    Prepare modeler bookkeeping after the PBDaemon initialization has run.

    @param single: collect from a single device?
    @type single: boolean
    """
    PBDaemon.__init__(self)
    options = self.options

    # FIXME: cleanup --force option #2660
    options.force = True
    self.start = None
    self.startat = None
    self.rrdStats = DaemonStats()

    # An explicit device option forces single-device mode.
    if options.device:
        self.single = True
    else:
        self.single = single
    self.modelerCycleInterval = options.cycletime
    # collage divided by the number of minutes in a day
    self.collage = float(options.collage) / 1440.0

    self.pendingNewClients = False
    self.clients = []
    self.finished = []
    self.devicegen = None
    self.counters = collections.Counter()

    # Make sendEvent() available to plugins
    zope.component.provideUtility(self, IEventService)

    # Only a daemon started without "now" is delayed, by a random
    # 10 to 60 seconds.
    self.started = False
    self.startDelay = 0
    self.immediate = 1
    if options.daemon and not options.now:
        self.startDelay = randint(10, 60)
        self.immediate = 0
        self.log.info("Run as a daemon, waiting %s seconds to start." %
                      self.startDelay)
    elif options.daemon:
        self.log.debug("Run as a daemon, starting immediately.")
    else:
        self.log.debug("Run in foreground, starting immediately.")

    # load performance counters
    self.loadCounters()
class DaemonStatsTest(BaseTestCase):
    """Test the DaemonStats"""

    # Environment variables these tests set or clear.
    _ENV_KEYS = (
        "CONTROLPLANE",
        "CONTROLPLANE_SERVICE_ID",
        "CONTROLPLANE_TENANT_ID",
        "CONTROLPLANE_INSTANCE_ID",
    )

    def setUp(self):
        # Snapshot the variables the tests touch; they change the real
        # process environment and must not leak into other tests.
        self._saved_env = dict(
            (key, os.environ.get(key)) for key in self._ENV_KEYS)
        self.daemon_stats = DaemonStats()

    def tearDown(self):
        # Put the environment back before delegating, so it is restored
        # even if the inherited tearDown raises.
        for key, value in self._saved_env.items():
            if value is None:
                os.environ.pop(key, None)
            else:
                os.environ[key] = value
        super(DaemonStatsTest, self).tearDown()

    def testDaemonsTagsServiceId(self):
        """With the CONTROLPLANE variables set, the tags include them."""
        os.environ["CONTROLPLANE"] = "1"
        os.environ["CONTROLPLANE_SERVICE_ID"] = "ID"
        os.environ["CONTROLPLANE_TENANT_ID"] = "foo"
        os.environ["CONTROLPLANE_INSTANCE_ID"] = "bar"
        self.daemon_stats.config("name", "monitor", None, None, None)
        self.assertEqual(
            {
                'daemon': 'name',
                'instance': 'bar',
                'internal': True,
                'monitor': 'monitor',
                'metricType': 'type',
                'serviceId': 'ID',
                'tenantId': 'foo'
            }, self.daemon_stats._tags("type"))

    def testDaemonsDoesNotTagServiceId(self):
        """Without the CONTROLPLANE variables, only the base tags appear."""
        # Remove all four variables so this test gives the same result
        # whether or not the positive test ran first.
        for key in self._ENV_KEYS:
            os.environ.pop(key, None)
        self.daemon_stats.config("name", "monitor", None, None, None)
        self.assertEqual(
            {
                'daemon': 'name',
                'internal': True,
                'monitor': 'monitor',
                'metricType': 'type'
            }, self.daemon_stats._tags("type"))
class PBDaemon(ZenDaemon, pb.Referenceable):
    """
    Base daemon that connects to ZenHub over Perspective Broker.

    It fetches the services listed in initialServices, queues and flushes
    events through the EventService, publishes metrics, and periodically
    pings ZenHub for the health check.
    """

    name = 'pbdaemon'
    initialServices = ['EventService']
    # Template only: __init__ copies it onto the instance before adding
    # the component/device details.
    heartbeatEvent = {'eventClass': Heartbeat}
    heartbeatTimeout = 60 * 3
    _customexitcode = 0
    _pushEventsDeferred = None
    _eventHighWaterMark = None
    _healthMonitorInterval = 30

    def __init__(self, noopts=0, keeproot=False, name=None):
        """
        Initialize the daemon state, register the shutdown trigger and
        start the ZenHub health-check loop.

        @param noopts: passed through unchanged to ZenDaemon.__init__
        @param keeproot: passed through unchanged to ZenDaemon.__init__
        @param name: optional collector name overriding the class-level one
        @type name: string or None
        """
        # if we were provided our collector name via the constructor instead of
        # via code, be sure to store it correctly.
        if name is not None:
            self.name = name
            self.mname = name

        try:
            ZenDaemon.__init__(self, noopts, keeproot)
        except IOError:
            # Log only the exception line (traceback limit of 0) and exit.
            import traceback
            self.log.critical(traceback.format_exc(0))
            sys.exit(1)

        self._thresholds = None
        self._threshold_notifier = None
        self.rrdStats = DaemonStats()
        self.lastStats = 0
        self.perspective = None
        self.services = {}
        self.eventQueueManager = EventQueueManager(self.options, self.log)
        self.startEvent = startEvent.copy()
        self.stopEvent = stopEvent.copy()
        # Copy the class-level template so update() below does not write
        # this daemon's details into a dict shared by all subclasses.
        self.heartbeatEvent = dict(self.heartbeatEvent)
        details = dict(component=self.name, device=self.options.monitor)
        for evt in self.startEvent, self.stopEvent, self.heartbeatEvent:
            evt.update(details)
        self.initialConnect = defer.Deferred()
        self.stopped = False
        self.counters = collections.Counter()
        self._pingedZenhub = None
        self._connectionTimeout = None
        self._publisher = None
        self._internal_publisher = None
        self._metric_writer = None
        self._derivative_tracker = None
        self._metrologyReporter = None

        # Add a shutdown trigger to send a stop event and flush the event queue
        reactor.addSystemEventTrigger('before', 'shutdown', self._stopPbDaemon)

        # Set up a looping call to support the health check.
        self.healthMonitor = task.LoopingCall(self._checkZenHub)
        self.healthMonitor.start(self._healthMonitorInterval)

    def publisher(self):
        """
        Return the redis metric publisher, creating it on first use.

        The host and port come from the redisUrl option.  A missing port
        silently falls back to publisher.defaultRedisPort; a non-integer
        port does the same after logging the problem.
        """
        if not self._publisher:
            netloc = urlparse(self.options.redisUrl).netloc
            # Drop any "user:password@" prefix, then split host from port.
            # partition() never raises, unlike unpacking split(':'), which
            # failed when the URL carried no port at all.
            host, _, port = netloc.rpartition('@')[2].partition(':')
            if port:
                try:
                    port = int(port)
                except ValueError:
                    self.log.exception(
                        "redis url contains non-integer port " +
                        "value {port}, defaulting to {default}".format(
                            port=port, default=publisher.defaultRedisPort))
                    port = publisher.defaultRedisPort
            else:
                port = publisher.defaultRedisPort
            self._publisher = publisher.RedisListPublisher(
                host, port, self.options.metricBufferSize,
                channel=self.options.metricsChannel,
                maxOutstandingMetrics=self.options.maxOutstandingMetrics)
        return self._publisher

    def internalPublisher(self):
        """
        Return the HTTP publisher for internal metrics, creating it on
        first use.  Returns None when CONTROLPLANE_CONSUMER_URL is unset
        or empty.
        """
        if not self._internal_publisher:
            url = os.environ.get("CONTROLPLANE_CONSUMER_URL", None)
            username = os.environ.get("CONTROLPLANE_CONSUMER_USERNAME", "")
            password = os.environ.get("CONTROLPLANE_CONSUMER_PASSWORD", "")
            if url:
                self._internal_publisher = publisher.HttpPostPublisher(
                    username, password, url)
        return self._internal_publisher

    def metricWriter(self):
        """
        Return the metric writer, creating it on first use.

        When the CONTROLPLANE environment variable is "1" and an internal
        publisher is available, the result is an aggregate that also sends
        metrics tagged "internal" to that publisher.  In every other case
        the plain redis-backed writer is returned.
        """
        if not self._metric_writer:
            metric_writer = MetricWriter(self.publisher())
            # Default to the plain writer.  Previously nothing was assigned
            # when CONTROLPLANE was "1" but no internal publisher existed,
            # so callers received None.
            self._metric_writer = metric_writer
            if os.environ.get("CONTROLPLANE", "0") == "1":
                internal_publisher = self.internalPublisher()
                if internal_publisher:
                    internal_metric_filter = lambda metric, value, timestamp, tags:\
                        tags and tags.get("internal", False)
                    internal_metric_writer = FilteredMetricWriter(
                        internal_publisher, internal_metric_filter)
                    self._metric_writer = AggregateMetricWriter(
                        [metric_writer, internal_metric_writer])
        return self._metric_writer

    def derivativeTracker(self):
        """Return the DerivativeTracker, creating it on first use."""
        if not self._derivative_tracker:
            self._derivative_tracker = DerivativeTracker()
        return self._derivative_tracker

    def connecting(self):
        """
        Called when about to connect to zenhub
        """
        self.log.info("Attempting to connect to zenhub")

    def getZenhubInstanceId(self):
        """
        Called after we connected to zenhub.

        Asks ZenHub for its instance id and logs the answer (or the
        error).  Returns the deferred of the remote call.
        """

        def callback(result):
            self.log.info("Connected to the zenhub/%s instance", result)

        def errback(result):
            self.log.info(
                "Unexpected error appeared while getting zenhub instance number %s",
                result)

        d = self.perspective.callRemote('getHubInstanceId')
        d.addCallback(callback)
        d.addErrback(errback)
        return d

    def gotPerspective(self, perspective):
        """
        This gets called every time we reconnect.

        @parameter perspective: Twisted perspective object
        @type perspective: Twisted perspective object
        """
        self.perspective = perspective
        self.getZenhubInstanceId()
        # Cancel the connection timeout timer as it's no longer needed.
        if self._connectionTimeout:
            try:
                self._connectionTimeout.cancel()
            except AlreadyCalled:
                pass
            self._connectionTimeout = None
        d2 = self.getInitialServices()
        if self.initialConnect:
            # Only the first connection fires initialConnect; it is cleared
            # here so later reconnects skip this branch.
            self.log.debug('Chaining getInitialServices with d2')
            self.initialConnect, d = None, self.initialConnect
            d2.chainDeferred(d)

    def connect(self):
        """
        Start connecting to ZenHub using the hub* options.

        @return: the initialConnect deferred
        """
        pingInterval = self.options.zhPingInterval
        factory = ReconnectingPBClientFactory(
            connectTimeout=60,
            pingPerspective=self.options.pingPerspective,
            pingInterval=pingInterval,
            pingtimeout=pingInterval * 5)
        self.log.info("Connecting to %s:%d" % (self.options.hubhost,
                                               self.options.hubport))
        factory.connectTCP(self.options.hubhost, self.options.hubport)
        username = self.options.hubusername
        password = self.options.hubpassword
        self.log.debug("Logging in as %s" % username)
        c = credentials.UsernamePassword(username, password)
        factory.gotPerspective = self.gotPerspective
        factory.connecting = self.connecting
        factory.setCredentials(c)

        def timeout(d):
            if not d.called:
                self.connectTimeout()

        self._connectionTimeout = reactor.callLater(
            self.options.hubtimeout, timeout, self.initialConnect)
        return self.initialConnect

    def connectTimeout(self):
        """Log that the initial ZenHub connection timed out."""
        self.log.error('Timeout connecting to zenhub: is it running?')

    def eventService(self):
        """Return the EventService (see getServiceNow)."""
        return self.getServiceNow('EventService')

    def getServiceNow(self, svcName):
        """
        Return the already-loaded service named svcName, or a FakeRemote
        (after logging a warning) when it is not available.
        """
        if svcName not in self.services:
            self.log.warning(
                'No service named %r: ZenHub may be disconnected' % svcName)
        return self.services.get(svcName, None) or FakeRemote()

    def getService(self, serviceName, serviceListeningInterface=None):
        """
        Attempt to get a service from zenhub.  Returns a deferred.
        When service is retrieved it is stashed in self.services with
        serviceName as the key.  When getService is called it will first
        check self.services and if serviceName is already there it will return
        the entry from self.services wrapped in a defer.succeed
        """
        if serviceName in self.services:
            return defer.succeed(self.services[serviceName])

        def removeService(ignored):
            self.log.debug('Removing service %s' % serviceName)
            if serviceName in self.services:
                del self.services[serviceName]

        def callback(result, serviceName):
            self.log.debug('Loaded service %s from zenhub' % serviceName)
            self.services[serviceName] = result
            result.notifyOnDisconnect(removeService)
            return result

        def errback(error, serviceName):
            self.log.debug('errback after getting service %s' % serviceName)
            self.log.error('Could not retrieve service %s' % serviceName)
            if serviceName in self.services:
                del self.services[serviceName]
            return error

        d = self.perspective.callRemote('getService',
                                        serviceName,
                                        self.options.monitor,
                                        serviceListeningInterface or self,
                                        self.options.__dict__)
        d.addCallback(callback, serviceName)
        d.addErrback(errback, serviceName)
        return d

    def getInitialServices(self):
        """
        After connecting to zenhub, gather our initial list of services.

        @return: a DeferredList over getService() for every name in
            initialServices; a failure stops the reactor and is converted
            to RemoteBadMonitor
        """

        def errback(error):
            if isinstance(error, Failure):
                self.log.critical("Invalid monitor: %s" % self.options.monitor)
                reactor.stop()
                return defer.fail(
                    RemoteBadMonitor(
                        "Invalid monitor: %s" % self.options.monitor, ''))
            return error

        self.log.debug('Setting up initial services: %s' %
                       ', '.join(self.initialServices))
        d = defer.DeferredList(
            [self.getService(name) for name in self.initialServices],
            fireOnOneErrback=True, consumeErrors=True)
        d.addErrback(errback)
        return d

    def connected(self):
        """Hook called by run() once the initial connection completes."""
        pass

    def _getThresholdNotifier(self):
        """Return the ThresholdNotifier, creating it on first use."""
        if not self._threshold_notifier:
            self._threshold_notifier = ThresholdNotifier(
                self.sendEvent, self.getThresholds())
        return self._threshold_notifier

    def getThresholds(self):
        """Return the Thresholds object, creating it on first use."""
        if not self._thresholds:
            self._thresholds = Thresholds()
        return self._thresholds

    def run(self):
        """
        Configure statistics, connect to ZenHub and run the reactor.

        Exits the process with the custom exit code, if one was set, once
        the reactor stops.
        """

        def stopReporter():
            if self._metrologyReporter:
                return self._metrologyReporter.stop()

        # Order of the shutdown triggers matter. Want to stop reporter first, calling self.metricWriter() below
        # registers shutdown triggers for the actual metric http and redis publishers.
        reactor.addSystemEventTrigger('before', 'shutdown', stopReporter)

        threshold_notifier = self._getThresholdNotifier()
        self.rrdStats.config(self.name,
                             self.options.monitor,
                             self.metricWriter(),
                             threshold_notifier,
                             self.derivativeTracker())
        self.log.debug('Starting PBDaemon initialization')
        d = self.connect()

        def callback(result):
            self.sendEvent(self.startEvent)
            self.pushEventsLoop()
            self.log.debug('Calling connected.')
            self.connected()
            return result

        def startStatsLoop():
            self.log.debug("Starting Statistic posting")
            loop = task.LoopingCall(self.postStatistics)
            loop.start(self.options.writeStatistics, now=False)
            daemonTags = {
                'zenoss_daemon': self.name,
                'zenoss_monitor': self.options.monitor,
                'internal': True
            }
            self._metrologyReporter = TwistedMetricReporter(
                self.options.writeStatistics, self.metricWriter(), daemonTags)
            self._metrologyReporter.start()

        reactor.callWhenRunning(startStatsLoop)
        d.addCallback(callback)
        d.addErrback(twisted.python.log.err)
        reactor.run()
        if self._customexitcode:
            sys.exit(self._customexitcode)

    def setExitCode(self, exitcode):
        """Set the process exit code used by run() after the reactor stops."""
        self._customexitcode = exitcode

    def stop(self, ignored=''):
        """Stop the reactor if it is running; otherwise just log."""
        if reactor.running:
            try:
                reactor.stop()
            except ReactorNotRunning:
                self.log.debug("Tried to stop reactor that was stopped")
        else:
            self.log.debug("stop() called when not running")

    def _stopPbDaemon(self):
        """
        Shutdown trigger: send the stop event and flush queued events.

        Runs at most once.  Returns a deferred for the flush when the
        EventService is available, otherwise None.
        """
        if self.stopped:
            return
        self.stopped = True
        if 'EventService' in self.services:
            # send stop event if we don't have an implied --cycle,
            # or if --cycle has been specified
            if not hasattr(self.options, 'cycle') or \
                    getattr(self.options, 'cycle', True):
                self.sendEvent(self.stopEvent)
                self.log.debug("Sent a 'stop' event")
            if self._pushEventsDeferred:
                self.log.debug("Currently sending events. Queueing next call")
                d = self._pushEventsDeferred
                # Schedule another call to flush any additional queued events
                d.addBoth(lambda unused: self.pushEvents())
            else:
                d = self.pushEvents()
            return d

        self.log.debug("No event sent as no EventService available.")

    def sendEvents(self, events):
        """Queue every event in events via sendEvent()."""
        # An explicit loop rather than map(): the calls are made purely
        # for their side effects.
        for event in events:
            self.sendEvent(event)

    def sendEvent(self, event, **kw):
        """
        Add event to queue of events to be sent.  If we have an event
        service then process the queue.

        @return: the high-water-mark deferred while one is pending, the
            pushEvents() deferred when the queue has reached the high
            water mark, otherwise an already-fired deferred
        """
        generatedEvent = self.generateEvent(event, **kw)
        self.eventQueueManager.addEvent(generatedEvent)
        self.counters['eventCount'] += 1

        if self._eventHighWaterMark:
            return self._eventHighWaterMark
        elif self.eventQueueManager.event_queue_length >= self.options.maxqueuelen * self.options.queueHighWaterMark:
            return self.pushEvents()
        else:
            return defer.succeed(None)

    def generateEvent(self, event, **kw):
        """
        Build the event dict to be queued from event plus keyword
        overrides.

        String values longer than their limit (LIMITS, falling back to
        DEFAULT_LIMIT) are truncated, and the agent, monitor and manager
        fields are set from this daemon.

        @return: the new event dict, or None when the reactor is not
            running
        """
        if not reactor.running:
            return
        eventCopy = {}
        for k, v in chain(event.items(), kw.items()):
            if isinstance(v, basestring):
                #default max size is 512k
                size = LIMITS.get(k, DEFAULT_LIMIT)
                eventCopy[k] = v[0:size] if len(v) > size else v
            else:
                eventCopy[k] = v

        eventCopy['agent'] = self.name
        eventCopy['monitor'] = self.options.monitor
        eventCopy['manager'] = self.fqdn
        return eventCopy

    @defer.inlineCallbacks
    def pushEventsLoop(self):
        """Periodially, wake up and flush events to ZenHub.
        """
        reactor.callLater(self.options.eventflushseconds, self.pushEventsLoop)
        yield self.pushEvents()

        # Record the number of events in the queue up to every 2 seconds.
        now = time.time()
        if self.rrdStats.name and now >= (self.lastStats + 2):
            self.lastStats = now
            self.rrdStats.gauge('eventQueueLength',
                                self.eventQueueManager.event_queue_length)

    @defer.inlineCallbacks
    def pushEvents(self):
        """Flush events to ZenHub.
        """
        # are we already shutting down?
        if not reactor.running:
            self.log.debug("Skipping event sending - reactor not running.")
            return

        if self.eventQueueManager.event_queue_length >= self.options.maxqueuelen * self.options.queueHighWaterMark and not self._eventHighWaterMark:
            self.log.debug(
                "Queue length exceeded high water mark, %s ;creating high water mark deferred",
                self.eventQueueManager.event_queue_length)
            self._eventHighWaterMark = defer.Deferred()

        # are still connected to ZenHub?
        evtSvc = self.services.get('EventService', None)
        if not evtSvc:
            self.log.error("No event service: %r", evtSvc)
            yield task.deferLater(reactor, 0, lambda: None)
            if self._eventHighWaterMark:
                d, self._eventHighWaterMark = self._eventHighWaterMark, None
                #not connected, release throttle and let things queue
                d.callback("No Event Service")
            defer.returnValue(None)

        if self._pushEventsDeferred:
            self.log.debug("Skipping event sending - previous call active.")
            defer.returnValue("Push Pending")

        sent = 0
        try:
            #only set _pushEventsDeferred after we know we have an evtSvc/connectivity
            self._pushEventsDeferred = defer.Deferred()

            def repush(val):
                if self.eventQueueManager.event_queue_length >= self.options.eventflushchunksize:
                    self.pushEvents()
                return val

            # conditionally push more events after this pushEvents call finishes
            self._pushEventsDeferred.addCallback(repush)

            discarded_events = self.eventQueueManager.discarded_events
            if discarded_events:
                self.log.error(
                    'Discarded oldest %d events because maxqueuelen was '
                    'exceeded: %d/%d',
                    discarded_events,
                    discarded_events + self.options.maxqueuelen,
                    self.options.maxqueuelen)
                self.counters['discardedEvents'] += discarded_events
                self.eventQueueManager.discarded_events = 0

            send_events_fn = partial(evtSvc.callRemote, 'sendEvents')
            try:
                sent = yield self.eventQueueManager.sendEvents(send_events_fn)
            except ConnectionLost as ex:
                self.log.error('Error sending event: %s', ex)
                #let the reactor have time to clean up any connection errors and make callbacks
                yield task.deferLater(reactor, 0, lambda: None)
        except Exception as ex:
            self.log.exception(ex)
            #let the reactor have time to clean up any connection errors and make callbacks
            yield task.deferLater(reactor, 0, lambda: None)
        finally:
            if self._pushEventsDeferred:
                d, self._pushEventsDeferred = self._pushEventsDeferred, None
                d.callback('sent %s' % sent)
            if self._eventHighWaterMark and self.eventQueueManager.event_queue_length < self.options.maxqueuelen * self.options.queueHighWaterMark:
                self.log.debug("Queue restored to below high water mark: %s",
                               self.eventQueueManager.event_queue_length)
                d, self._eventHighWaterMark = self._eventHighWaterMark, None
                d.callback("Queue length below high water mark")

    def heartbeat(self):
        """if cycling, send a heartbeat, else, shutdown"""
        if not self.options.cycle:
            self.stop()
            return
        heartbeatEvent = self.generateEvent(self.heartbeatEvent,
                                            timeout=self.heartbeatTimeout)
        self.eventQueueManager.addHeartbeatEvent(heartbeatEvent)
        # heartbeat is normally 3x cycle time
        self.niceDoggie(self.heartbeatTimeout / 3)

    def postStatisticsImpl(self):
        """Hook for subclasses; called at the end of postStatistics()."""
        pass

    def postStatistics(self):
        """Log and publish every daemon counter, then call the hook."""
        # save daemon counter stats
        for name, value in self.counters.items():
            self.log.info("Counter %s, value %d", name, value)
            self.rrdStats.counter(name, value)

        # persist counters values
        self.postStatisticsImpl()

    def _pickleName(self):
        """
        Return the relative path of the counters pickle, built from the
        daemon name and the CONTROLPLANE_INSTANCE_ID environment variable.
        """
        instance_id = os.environ.get('CONTROLPLANE_INSTANCE_ID')
        return 'var/%s_%s_counters.pickle' % (self.name, instance_id)

    def remote_getName(self):
        """Remote call: return this daemon's name."""
        return self.name

    def remote_shutdown(self, unused):
        """Remote call: stop the reactor and run the sigTerm handling."""
        self.stop()
        self.sigTerm()

    def remote_setPropertyItems(self, items):
        """Remote call: accepted but ignored by this base class."""
        pass

    @translateError
    def remote_updateThresholdClasses(self, classes):
        """
        Remote call: import each class named in classes, logging any that
        cannot be imported.
        """
        from Products.ZenUtils.Utils import importClass
        self.log.debug("Loading classes %s", classes)
        for c in classes:
            try:
                importClass(c)
            except ImportError:
                self.log.error("Unable to import class %s", c)

    def _checkZenHub(self):
        """
        Check status of ZenHub (using ping method of service).
        @return: if ping occurs, return deferred with result of ping attempt.
        """
        self.log.debug('_checkZenHub: entry')

        def callback(result):
            self.log.debug('ZenHub health check: Got result %s' % result)
            if result == 'pong':
                self.log.debug(
                    'ZenHub health check: Success - received pong from ZenHub ping service.'
                )
                self._signalZenHubAnswering(True)
            else:
                self.log.error(
                    'ZenHub health check did not respond as expected.')
                self._signalZenHubAnswering(False)

        def errback(error):
            self.log.error('Error pinging ZenHub: %s (%s).' %
                           (error, getattr(error, 'message', '')))
            self._signalZenHubAnswering(False)

        try:
            if self.perspective:
                self.log.debug(
                    'ZenHub health check: perspective found. attempting remote ping call.'
                )
                d = self.perspective.callRemote('ping')
                d.addCallback(callback)
                d.addErrback(errback)
                return d
            else:
                self.log.debug('ZenHub health check: ZenHub may be down.')
                self._signalZenHubAnswering(False)
        except pb.DeadReferenceError:
            self.log.warning(
                "ZenHub health check: DeadReferenceError - lost connection to ZenHub."
            )
            self._signalZenHubAnswering(False)
        except Exception as e:
            self.log.error('ZenHub health check: caught %s exception: %s' %
                           (e.__class__, e.message))
            self._signalZenHubAnswering(False)

    def _signalZenHubAnswering(self, answering):
        """
        Write or remove file that the ZenHub_answering health check uses to report status.
        @param answering: true if ZenHub is answering, False, otherwise.
        """
        self.log.debug('_signalZenHubAnswering(%s)' % answering)
        filename = 'zenhub_connected'
        signalFilePath = zenPath('var', filename)
        if answering:
            self.log.debug('writing file at %s' % signalFilePath)
            atomicWrite(signalFilePath, '')
        else:
            # Best effort: any failure to remove the file is only logged
            # at debug level.
            try:
                self.log.debug('removing file at %s' % signalFilePath)
                os.remove(signalFilePath)
            except Exception as e:
                self.log.debug('ignoring %s exception (%s) removing file %s' %
                               (e.__class__, e.message, signalFilePath))

    def buildOptions(self):
        """Register the command-line options, then ZenDaemon's own."""
        self.parser.add_option('--hubhost',
                               dest='hubhost',
                               default=DEFAULT_HUB_HOST,
                               help='Host of zenhub daemon.'
                               ' Default is %s.' % DEFAULT_HUB_HOST)
        self.parser.add_option('--hubport',
                               dest='hubport',
                               type='int',
                               default=DEFAULT_HUB_PORT,
                               help='Port zenhub listens on.'
                               ' Default is %s.' % DEFAULT_HUB_PORT)
        self.parser.add_option('--hubusername',
                               dest='hubusername',
                               default=DEFAULT_HUB_USERNAME,
                               help='Username for zenhub login.'
                               ' Default is %s.' % DEFAULT_HUB_USERNAME)
        self.parser.add_option('--hubpassword',
                               dest='hubpassword',
                               default=DEFAULT_HUB_PASSWORD,
                               help='Password for zenhub login.'
                               ' Default is %s.' % DEFAULT_HUB_PASSWORD)
        self.parser.add_option('--monitor',
                               dest='monitor',
                               default=DEFAULT_HUB_MONITOR,
                               help='Name of monitor instance to use for'
                               ' configuration.  Default is %s.'
                               % DEFAULT_HUB_MONITOR)
        self.parser.add_option('--initialHubTimeout',
                               dest='hubtimeout',
                               type='int',
                               default=30,
                               help='Initial time to wait for a ZenHub '
                               'connection')
        self.parser.add_option('--allowduplicateclears',
                               dest='allowduplicateclears',
                               default=False,
                               action='store_true',
                               help='Send clear events even when the most '
                               'recent event was also a clear event.')
        self.parser.add_option(
            '--duplicateclearinterval',
            dest='duplicateclearinterval',
            default=0,
            type='int',
            help=('Send a clear event every [DUPLICATECLEARINTEVAL] '
                  'events.'))
        self.parser.add_option('--eventflushseconds',
                               dest='eventflushseconds',
                               default=5.,
                               type='float',
                               help='Seconds between attempts to flush '
                               'events to ZenHub.')
        self.parser.add_option('--eventflushchunksize',
                               dest='eventflushchunksize',
                               default=50,
                               type='int',
                               help='Number of events to send to ZenHub '
                               'at one time')
        self.parser.add_option('--maxqueuelen',
                               dest='maxqueuelen',
                               default=5000,
                               type='int',
                               help='Maximum number of events to queue')
        self.parser.add_option(
            '--queuehighwatermark',
            dest='queueHighWaterMark',
            default=0.75,
            type='float',
            help='The size, in percent, of the event queue when event pushback starts'
        )
        self.parser.add_option('--zenhubpinginterval',
                               dest='zhPingInterval',
                               default=120,
                               type='int',
                               help='How often to ping zenhub')
        self.parser.add_option('--disable-event-deduplication',
                               dest='deduplicate_events',
                               default=True,
                               action='store_false',
                               help='Disable event de-duplication')
        self.parser.add_option(
            '--redis-url',
            dest='redisUrl',
            type='string',
            default='redis://localhost:{default}/0'.format(
                default=publisher.defaultRedisPort),
            help='redis connection string: redis://[hostname]:[port]/[db], default: %default'
        )
        self.parser.add_option(
            '--metricBufferSize',
            dest='metricBufferSize',
            type='int',
            default=publisher.defaultMetricBufferSize,
            help='Number of metrics to buffer if redis goes down')
        self.parser.add_option(
            '--metricsChannel',
            dest='metricsChannel',
            type='string',
            default=publisher.defaultMetricsChannel,
            help='redis channel to which metrics are published')
        self.parser.add_option('--maxOutstandingMetrics',
                               dest='maxOutstandingMetrics',
                               type='int',
                               default=publisher.defaultMaxOutstandingMetrics,
                               help='Max Number of metrics to allow in redis')
        self.parser.add_option('--disable-ping-perspective',
                               dest='pingPerspective',
                               help="Enable or disable ping perspective",
                               default=True,
                               action='store_false')
        self.parser.add_option(
            '--writeStatistics',
            dest='writeStatistics',
            type='int',
            default=30,
            help='How often to write internal statistics value in seconds')

        ZenDaemon.buildOptions(self)
class ZenModeler(PBDaemon): """ Daemon class to attach to zenhub and pass along device configuration information. """ name = 'zenmodeler' initialServices = PBDaemon.initialServices + ['ModelerService'] generateEvents = True configCycleInterval = 360 classCollectorPlugins = () def __init__(self, single=False ): """ Initalizer @param single: collect from a single device? @type single: boolean """ PBDaemon.__init__(self) # FIXME: cleanup --force option #2660 self.options.force = True self.start = None self.startat = None self.rrdStats = DaemonStats() self.single = single if self.options.device: self.single = True self.modelerCycleInterval = self.options.cycletime self.collage = float( self.options.collage ) / 1440.0 self.pendingNewClients = False self.clients = [] self.finished = [] self.devicegen = None self.counters = collections.Counter() # Make sendEvent() available to plugins zope.component.provideUtility(self, IEventService) # Delay start for between 10 and 60 seconds when run as a daemon. self.started = False self.startDelay = 0 self.immediate = 1 if self.options.daemon: if self.options.now: self.log.debug("Run as a daemon, starting immediately.") else: # self.startDelay = randint(10, 60) * 60 self.startDelay = randint(10, 60) * 1 self.immediate = 0 self.log.info("Run as a daemon, waiting %s seconds to start." % self.startDelay) else: self.log.debug("Run in foreground, starting immediately.") # load performance counters self.loadCounters() def reportError(self, error): """ Log errors that have occurred @param error: error message @type error: string """ self.log.error("Error occured: %s", error) def connected(self): """ Called after connected to the zenhub service """ d = self.configure() d.addCallback(self.heartbeat) d.addErrback(self.reportError) def configure(self): """ Get our configuration from zenhub """ # add in the code to fetch cycle time, etc. 
def inner(driver): """ Generator function to gather our configuration @param driver: driver object @type driver: driver object """ self.log.debug('fetching monitor properties') yield self.config().callRemote('propertyItems') items = dict(driver.next()) # If the cycletime option is not specified or zero, then use the # modelerCycleInterval value in the database. if not self.options.cycletime: self.modelerCycleInterval = items.get('modelerCycleInterval', _DEFAULT_CYCLE_INTERVAL) self.configCycleInterval = items.get('configCycleInterval', self.configCycleInterval) reactor.callLater(self.configCycleInterval * 60, self.configure) self.log.debug("Getting threshold classes...") yield self.config().callRemote('getThresholdClasses') self.remote_updateThresholdClasses(driver.next()) self.log.debug("Getting collector thresholds...") yield self.config().callRemote('getCollectorThresholds') thresholds = driver.next() threshold_notifier = ThresholdNotifier(self.sendEvent, thresholds) self.rrdStats.config(self.name, self.options.monitor, self.metricWriter(), threshold_notifier, self.derivativeTracker()) self.log.debug("Getting collector plugins for each DeviceClass") yield self.config().callRemote('getClassCollectorPlugins') self.classCollectorPlugins = driver.next() return drive(inner) def config(self): """ Get the ModelerService """ return self.services.get('ModelerService', FakeRemote()) def selectPlugins(self, device, transport): """ Build a list of active plugins for a device, based on: * the --collect command-line option which is a regex * the --ignore command-line option which is a regex * transport which is a string describing the type of plugin @param device: device to collect against @type device: string @param transport: python, ssh, snmp, telnet, cmd @type transport: string @return: results of the plugin @type: string @todo: determine if an event for the collector AND the device should be sent """ plugins = [] valid_loaders = [] for loader in device.plugins: try: 
plugin= loader.create() self.log.debug( "Loaded plugin %s" % plugin.name() ) plugins.append( plugin ) valid_loaders.append( loader ) except (SystemExit, KeyboardInterrupt), ex: self.log.info( "Interrupted by external signal (%s)" % str(ex) ) raise except Plugins.PluginImportError, import_error: import socket component, _ = os.path.splitext( os.path.basename( sys.argv[0] ) ) collector_host= socket.gethostname() # NB: an import errror affects all devices, # so report the issue against the collector # TOOD: determine if an event for the collector AND the device should be sent evt= { "eventClass":"/Status/Update", "component":component, "agent":collector_host, "device":collector_host, "severity":Error } info= "Problem loading plugin %s" % import_error.plugin self.log.error( info ) evt[ 'summary' ]= info info= import_error.traceback self.log.error( info ) evt[ 'message' ]= info info= ("Due to import errors, removing the %s plugin" " from this collection cycle.") % import_error.plugin self.log.error( info ) evt[ 'message' ] += "%s\n" % info self.sendEvent( evt )
class PBDaemon(ZenDaemon, pb.Referenceable):
    """
    Base class for daemons that talk to ZenHub over Perspective Broker.

    The daemon connects to the hub, fetches the services named in
    C{initialServices}, queues events through an EventQueueManager and
    periodically flushes that queue to the hub's EventService.
    """

    name = 'pbdaemon'
    initialServices = ['EventService']
    # NOTE(review): this dict is a class attribute and __init__ updates it in
    # place, so the component/device details are shared by every instance.
    heartbeatEvent = {'eventClass': Heartbeat}
    heartbeatTimeout = 60 * 3
    _customexitcode = 0
    _pushEventsDeferred = None

    def __init__(self, noopts=0, keeproot=False, name=None):
        """
        Initializer

        @param noopts: passed through to ZenDaemon.__init__
        @param keeproot: passed through to ZenDaemon.__init__
        @param name: optional collector name overriding the class-level name
        @type name: string
        """
        # if we were provided our collector name via the constructor instead of
        # via code, be sure to store it correctly.
        if name is not None:
            self.name = name
            self.mname = name

        try:
            ZenDaemon.__init__(self, noopts, keeproot)
        except IOError:
            import traceback
            self.log.critical(traceback.format_exc(0))
            sys.exit(1)

        self.rrdStats = DaemonStats()
        self.lastStats = 0
        self.perspective = None
        self.services = {}
        self.eventQueueManager = EventQueueManager(self.options, self.log)
        self.startEvent = startEvent.copy()
        self.stopEvent = stopEvent.copy()
        details = dict(component=self.name, device=self.options.monitor)
        for evt in self.startEvent, self.stopEvent, self.heartbeatEvent:
            evt.update(details)
        self.initialConnect = defer.Deferred()
        self.stopped = False
        self.counters = collections.Counter()
        self.loadCounters()
        self._pingedZenhub = None
        self._connectionTimeout = None

        # Add a shutdown trigger to send a stop event and flush the event queue
        reactor.addSystemEventTrigger('before', 'shutdown', self._stopPbDaemon)

    def connecting(self):
        """
        Called when about to connect to zenhub
        """
        self.log.info("Attempting to connect to zenhub")

    def gotPerspective(self, perspective):
        """
        This gets called every time we reconnect.

        @parameter perspective: Twisted perspective object
        @type perspective: Twisted perspective object
        """
        self.log.info("Connected to ZenHub")
        self.perspective = perspective

        # Cancel the connection timeout timer as it's no longer needed.
        if self._connectionTimeout:
            try:
                self._connectionTimeout.cancel()
            except AlreadyCalled:
                pass
            self._connectionTimeout = None

        d2 = self.getInitialServices()
        if self.initialConnect:
            # Only the first connection fires initialConnect; it is cleared
            # here so that later reconnects skip this branch.
            self.log.debug('Chaining getInitialServices with d2')
            self.initialConnect, d = None, self.initialConnect
            d2.chainDeferred(d)

    def connect(self):
        """
        Start connecting and logging in to ZenHub.

        @return: the initialConnect deferred, which gotPerspective chains to
                 the result of getInitialServices on the first connection
        """
        factory = ReconnectingPBClientFactory(connectTimeout=60)
        self.log.info("Connecting to %s:%d" % (self.options.hubhost,
                                               self.options.hubport))
        factory.connectTCP(self.options.hubhost, self.options.hubport)
        username = self.options.hubusername
        password = self.options.hubpassword
        self.log.debug("Logging in as %s" % username)
        c = credentials.UsernamePassword(username, password)
        factory.gotPerspective = self.gotPerspective
        factory.connecting = self.connecting
        factory.startLogin(c)

        def timeout(d):
            if not d.called:
                self.connectTimeout()

        self._connectionTimeout = reactor.callLater(
            self.options.hubtimeout, timeout, self.initialConnect)
        return self.initialConnect

    def connectTimeout(self):
        """
        Called when the initial connection has not completed within
        the --initialHubTimeout period. Only logs an error.
        """
        self.log.error('Timeout connecting to zenhub: is it running?')

    def eventService(self):
        """
        Return the EventService (or a FakeRemote when it is unavailable).
        """
        return self.getServiceNow('EventService')

    def getServiceNow(self, svcName):
        """
        Return an already-loaded service without contacting ZenHub.

        @param svcName: name of the service
        @type svcName: string
        @return: the cached service, or a FakeRemote if none is cached
        """
        if svcName not in self.services:
            self.log.warning(
                'No service named %r: ZenHub may be disconnected' % svcName)
        return self.services.get(svcName, None) or FakeRemote()

    def getService(self, serviceName, serviceListeningInterface=None):
        """
        Attempt to get a service from zenhub.  Returns a deferred.
        When service is retrieved it is stashed in self.services with
        serviceName as the key.  When getService is called it will first
        check self.services and if serviceName is already there it will return
        the entry from self.services wrapped in a defer.succeed
        """
        if serviceName in self.services:
            return defer.succeed(self.services[serviceName])

        def removeService(ignored):
            self.log.debug('Removing service %s' % serviceName)
            if serviceName in self.services:
                del self.services[serviceName]

        def callback(result, serviceName):
            self.log.debug('Loaded service %s from zenhub' % serviceName)
            self.services[serviceName] = result
            # Drop the cached reference once the remote side disconnects.
            result.notifyOnDisconnect(removeService)
            return result

        def errback(error, serviceName):
            self.log.debug('errback after getting service %s' % serviceName)
            self.log.error('Could not retrieve service %s' % serviceName)
            if serviceName in self.services:
                del self.services[serviceName]
            return error

        d = self.perspective.callRemote('getService',
                                        serviceName,
                                        self.options.monitor,
                                        serviceListeningInterface or self)
        d.addCallback(callback, serviceName)
        d.addErrback(errback, serviceName)
        return d

    def getInitialServices(self):
        """
        After connecting to zenhub, gather our initial list of services.
        """
        def errback(error):
            if isinstance(error, Failure):
                self.log.critical("Invalid monitor: %s" % self.options.monitor)
                reactor.stop()
                return defer.fail(
                    RemoteBadMonitor(
                        "Invalid monitor: %s" % self.options.monitor, ''))
            return error

        self.log.debug('Setting up initial services: %s' %
                       ', '.join(self.initialServices))
        d = defer.DeferredList(
            [self.getService(name) for name in self.initialServices],
            fireOnOneErrback=True, consumeErrors=True)
        d.addErrback(errback)
        return d

    def connected(self):
        """
        Hook invoked by run() once the initial connection succeeded.
        The base implementation does nothing.
        """
        pass

    def run(self):
        """
        Connect to ZenHub and run the reactor until it is stopped.
        Exits the process with the custom exit code when one was set.
        """
        self.rrdStats.config(self.options.monitor, self.name, [])
        self.log.debug('Starting PBDaemon initialization')
        d = self.connect()

        def callback(result):
            self.sendEvent(self.startEvent)
            self.pushEventsLoop()
            self.log.debug('Calling connected.')
            self.connected()
            return result

        d.addCallback(callback)
        d.addErrback(twisted.python.log.err)
        reactor.run()
        if self._customexitcode:
            sys.exit(self._customexitcode)

    def setExitCode(self, exitcode):
        """
        Set the process exit code used by run() after the reactor stops.
        """
        self._customexitcode = exitcode

    def stop(self, ignored=''):
        """
        Stop the reactor if it is running.

        @param ignored: unused; lets stop() be used as a deferred callback
        """
        if reactor.running:
            try:
                reactor.stop()
            except ReactorNotRunning:
                self.log.debug("Tried to stop reactor that was stopped")
        else:
            self.log.debug("stop() called when not running")

    def _stopPbDaemon(self):
        """
        Shutdown trigger: queue a stop event, flush events, save counters.

        @return: a deferred when an EventService is available, else None
        """
        if self.stopped:
            return
        self.stopped = True
        if 'EventService' in self.services:
            # send stop event if we don't have an implied --cycle,
            # or if --cycle has been specified
            if getattr(self.options, 'cycle', True):
                self.sendEvent(self.stopEvent)
                self.log.debug("Sent a 'stop' event")
            if self._pushEventsDeferred:
                self.log.debug("Currently sending events. Queueing next call")
                d = self._pushEventsDeferred
                # Schedule another call to flush any additional queued events
                d.addBoth(lambda unused: self.pushEvents())
            else:
                d = self.pushEvents()
            d.addBoth(lambda unused: self.saveCounters())
            return d

        self.log.debug("No event sent as no EventService available.")
        self.saveCounters()

    def sendEvents(self, events):
        """
        Queue every event in C{events} via sendEvent.
        """
        # Explicit loop: map() was used only for its side effects.
        for event in events:
            self.sendEvent(event)

    def sendEvent(self, event, **kw):
        '''
        Add event to queue of events to be sent.  The queue itself is
        flushed by pushEvents.
        '''
        generatedEvent = self.generateEvent(event, **kw)
        self.eventQueueManager.addEvent(generatedEvent)
        self.counters['eventCount'] += 1

    def generateEvent(self, event, **kw):
        '''
        Build a copy of event stamped with this daemon's agent, monitor and
        manager, then updated with the given keyword overrides.

        @return: the new event dict, or None when the reactor is not running
        '''
        if not reactor.running:
            return
        event = event.copy()
        event['agent'] = self.name
        event['monitor'] = self.options.monitor
        event['manager'] = self.fqdn
        event.update(kw)
        return event

    @defer.inlineCallbacks
    def pushEventsLoop(self):
        """Periodically, wake up and flush events to ZenHub.
        """
        reactor.callLater(self.options.eventflushseconds, self.pushEventsLoop)
        yield self.pushEvents()

        # Record the number of events in the queue every 5 minutes.
        now = time.time()
        if self.rrdStats.name and now >= (self.lastStats + 300):
            self.lastStats = now
            events = self.rrdStats.gauge(
                'eventQueueLength', 300,
                self.eventQueueManager.event_queue_length)
            for event in events:
                self.eventQueueManager.addPerformanceEvent(event)

    @defer.inlineCallbacks
    def pushEvents(self):
        """Flush events to ZenHub.
        """
        # are we already shutting down?
        if not reactor.running:
            self.log.debug("Skipping event sending - reactor not running.")
            return
        if self._pushEventsDeferred:
            self.log.debug("Skipping event sending - previous call active.")
            return
        try:
            self._pushEventsDeferred = defer.Deferred()

            # are still connected to ZenHub?
            evtSvc = self.services.get('EventService', None)
            if not evtSvc:
                self.log.error("No event service: %r", evtSvc)
                return

            discarded_events = self.eventQueueManager.discarded_events
            if discarded_events:
                self.log.error(
                    'Discarded oldest %d events because maxqueuelen was '
                    'exceeded: %d/%d',
                    discarded_events,
                    discarded_events + self.options.maxqueuelen,
                    self.options.maxqueuelen)
                self.counters['discardedEvents'] += discarded_events
                self.eventQueueManager.discarded_events = 0

            send_events_fn = partial(evtSvc.callRemote, 'sendEvents')
            try:
                yield self.eventQueueManager.sendEvents(send_events_fn)
            except ConnectionLost as ex:
                self.log.error('Error sending event: %s', ex)
            except ConnectionDone:
                pass
        except Exception as ex:
            self.log.exception(ex)
        finally:
            # Always release the in-progress marker and wake up anything
            # (e.g. _stopPbDaemon) chained onto it.
            d, self._pushEventsDeferred = self._pushEventsDeferred, None
            d.callback('sent')

    def heartbeat(self):
        'if cycling, send a heartbeat, else, shutdown'
        if not self.options.cycle:
            self.stop()
            return
        heartbeatEvent = self.generateEvent(self.heartbeatEvent,
                                            timeout=self.heartbeatTimeout)
        self.eventQueueManager.addHeartbeatEvent(heartbeatEvent)
        # heartbeat is normally 3x cycle time
        self.niceDoggie(self.heartbeatTimeout / 3)

        events = []
        # save daemon counter stats
        for name, value in self.counters.items():
            self.log.info("Counter %s, value %d", name, value)
            events += self.rrdStats.counter(name, 300, value)
        self.sendEvents(events)

        # persist counters values
        self.saveCounters()

    def saveCounters(self):
        """
        Persist the daemon counters to var/<name>_counters.pickle.
        """
        atomicWrite(
            zenPath('var/%s_counters.pickle' % self.name),
            pickle.dumps(self.counters),
            raiseException=False,
        )

    def loadCounters(self):
        """
        Restore the counters written by saveCounters, if any.

        Best effort: any failure (for example a missing file on first start)
        leaves the current counters untouched.
        """
        try:
            path = zenPath('var/%s_counters.pickle' % self.name)
            # The context manager closes the file even when unpickling fails.
            # NOTE(review): pickle.load executes whatever the file contains;
            # this assumes the var directory is only writable by the daemon.
            with open(path, 'rb') as counterFile:
                self.counters = pickle.load(counterFile)
        except Exception as ex:
            self.log.debug("Could not load saved counters for %s: %s",
                           self.name, ex)

    def remote_getName(self):
        """Remote call: return this daemon's name."""
        return self.name

    def remote_shutdown(self, unused):
        """Remote call: stop the reactor and invoke sigTerm()."""
        self.stop()
        self.sigTerm()

    def remote_setPropertyItems(self, items):
        """Remote call: ignored by the base class."""
        pass

    @translateError
    def remote_updateThresholdClasses(self, classes):
        """
        Remote call: import each of the named threshold classes.

        @param classes: names of the classes to import
        """
        from Products.ZenUtils.Utils import importClass
        self.log.debug("Loading classes %s", classes)
        for c in classes:
            try:
                importClass(c)
            except ImportError:
                self.log.error("Unable to import class %s", c)

    def buildOptions(self):
        """
        Register the ZenHub connection and event queue command-line options,
        then the options of ZenDaemon.
        """
        self.parser.add_option('--hubhost',
                               dest='hubhost',
                               default=DEFAULT_HUB_HOST,
                               help='Host of zenhub daemon.'
                               ' Default is %s.' % DEFAULT_HUB_HOST)
        self.parser.add_option('--hubport',
                               dest='hubport',
                               type='int',
                               default=DEFAULT_HUB_PORT,
                               help='Port zenhub listens on.'
                               ' Default is %s.' % DEFAULT_HUB_PORT)
        self.parser.add_option('--hubusername',
                               dest='hubusername',
                               default=DEFAULT_HUB_USERNAME,
                               help='Username for zenhub login.'
                               ' Default is %s.' % DEFAULT_HUB_USERNAME)
        self.parser.add_option('--hubpassword',
                               dest='hubpassword',
                               default=DEFAULT_HUB_PASSWORD,
                               help='Password for zenhub login.'
                               ' Default is %s.' % DEFAULT_HUB_PASSWORD)
        self.parser.add_option('--monitor',
                               dest='monitor',
                               default=DEFAULT_HUB_MONITOR,
                               help='Name of monitor instance to use for'
                               ' configuration.  Default is %s.'
                               % DEFAULT_HUB_MONITOR)
        self.parser.add_option('--initialHubTimeout',
                               dest='hubtimeout',
                               type='int',
                               default=30,
                               help='Initial time to wait for a ZenHub '
                               'connection')
        self.parser.add_option('--allowduplicateclears',
                               dest='allowduplicateclears',
                               default=False,
                               action='store_true',
                               help='Send clear events even when the most '
                               'recent event was also a clear event.')
        self.parser.add_option(
            '--duplicateclearinterval',
            dest='duplicateclearinterval',
            default=0,
            type='int',
            help=('Send a clear event every [DUPLICATECLEARINTEVAL] '
                  'events.'))
        self.parser.add_option('--eventflushseconds',
                               dest='eventflushseconds',
                               default=5.,
                               type='float',
                               help='Seconds between attempts to flush '
                               'events to ZenHub.')
        self.parser.add_option('--eventflushchunksize',
                               dest='eventflushchunksize',
                               default=50,
                               type='int',
                               help='Number of events to send to ZenHub '
                               'at one time')
        self.parser.add_option('--maxqueuelen',
                               dest='maxqueuelen',
                               default=5000,
                               type='int',
                               help='Maximum number of events to queue')
        self.parser.add_option('--zenhubpinginterval',
                               dest='zhPingInterval',
                               default=30,
                               type='int',
                               help='How often to ping zenhub')
        self.parser.add_option('--disable-event-deduplication',
                               dest='deduplicate_events',
                               default=True,
                               action='store_false',
                               help='Disable event de-duplication')

        ZenDaemon.buildOptions(self)
class PBDaemon(ZenDaemon, pb.Referenceable):
    """
    Base class for daemons that talk to ZenHub over Perspective Broker.

    The daemon connects to the hub, fetches the services named in
    C{initialServices}, queues events through an EventQueueManager, flushes
    that queue to the hub's EventService, publishes metrics, and runs a
    periodic ZenHub health check.
    """

    name = 'pbdaemon'
    initialServices = ['EventService']
    # NOTE(review): this dict is a class attribute and __init__ updates it in
    # place, so the component/device details are shared by every instance.
    heartbeatEvent = {'eventClass': Heartbeat}
    heartbeatTimeout = 60 * 3
    _customexitcode = 0
    _pushEventsDeferred = None
    _eventHighWaterMark = None
    _healthMonitorInterval = 30

    def __init__(self, noopts=0, keeproot=False, name=None):
        """
        Initializer

        @param noopts: passed through to ZenDaemon.__init__
        @param keeproot: passed through to ZenDaemon.__init__
        @param name: optional collector name overriding the class-level name
        @type name: string
        """
        # if we were provided our collector name via the constructor instead of
        # via code, be sure to store it correctly.
        if name is not None:
            self.name = name
            self.mname = name

        try:
            ZenDaemon.__init__(self, noopts, keeproot)
        except IOError:
            import traceback
            self.log.critical(traceback.format_exc(0))
            sys.exit(1)

        self._thresholds = None
        self._threshold_notifier = None
        self.rrdStats = DaemonStats()
        self.lastStats = 0
        self.perspective = None
        self.services = {}
        self.eventQueueManager = EventQueueManager(self.options, self.log)
        self.startEvent = startEvent.copy()
        self.stopEvent = stopEvent.copy()
        details = dict(component=self.name, device=self.options.monitor)
        for evt in self.startEvent, self.stopEvent, self.heartbeatEvent:
            evt.update(details)
        self.initialConnect = defer.Deferred()
        self.stopped = False
        self.counters = collections.Counter()
        self._pingedZenhub = None
        self._connectionTimeout = None
        self._publisher = None
        self._internal_publisher = None
        self._metric_writer = None
        self._derivative_tracker = None
        self._metrologyReporter = None
        # Add a shutdown trigger to send a stop event and flush the event queue
        reactor.addSystemEventTrigger('before', 'shutdown', self._stopPbDaemon)

        # Set up a looping call to support the health check.
        self.healthMonitor = task.LoopingCall(self._checkZenHub)
        self.healthMonitor.start(self._healthMonitorInterval)

    def publisher(self):
        """
        Return the redis metric publisher, creating it on first use.

        The host and port come from the --redis-url option.  A URL without
        a port, or with an invalid one, uses publisher.defaultRedisPort.
        """
        if not self._publisher:
            parsed = urlparse(self.options.redisUrl)
            # Use the parsed hostname/port attributes rather than splitting
            # netloc on ':'; the split raised ValueError for URLs without a
            # port, with credentials, or with an IPv6 host.
            host = parsed.hostname or ''
            try:
                port = parsed.port
            except ValueError:
                self.log.exception(
                    "redis url contains non-integer port "
                    "value {port}, defaulting to {default}".format(
                        port=parsed.netloc.rpartition(':')[2],
                        default=publisher.defaultRedisPort))
                port = None
            if port is None:
                port = publisher.defaultRedisPort
            self._publisher = publisher.RedisListPublisher(
                host, port, self.options.metricBufferSize,
                channel=self.options.metricsChannel,
                maxOutstandingMetrics=self.options.maxOutstandingMetrics
            )
        return self._publisher

    def internalPublisher(self):
        """
        Return the HTTP publisher for internal metrics, creating it on first
        use.  Returns None when CONTROLPLANE_CONSUMER_URL is not set.
        """
        if not self._internal_publisher:
            url = os.environ.get("CONTROLPLANE_CONSUMER_URL", None)
            username = os.environ.get("CONTROLPLANE_CONSUMER_USERNAME", "")
            password = os.environ.get("CONTROLPLANE_CONSUMER_PASSWORD", "")
            if url:
                self._internal_publisher = publisher.HttpPostPublisher(
                    username, password, url)
        return self._internal_publisher

    def metricWriter(self):
        """
        Return the metric writer, creating it on first use.

        When CONTROLPLANE is "1" and an internal publisher is available, the
        writer also forwards metrics tagged "internal" to that publisher.
        In every other case the plain redis-backed writer is used, so this
        never returns None.
        """
        if not self._metric_writer:
            metric_writer = MetricWriter(self.publisher())
            writer = metric_writer
            if os.environ.get("CONTROLPLANE", "0") == "1":
                internal_publisher = self.internalPublisher()
                if internal_publisher:
                    def internal_metric_filter(metric, value, timestamp, tags):
                        return tags and tags.get("internal", False)

                    internal_metric_writer = FilteredMetricWriter(
                        internal_publisher, internal_metric_filter)
                    writer = AggregateMetricWriter(
                        [metric_writer, internal_metric_writer])
            # Fall back to the plain writer when no aggregate was built;
            # previously that case could leave the writer unset.
            self._metric_writer = writer
        return self._metric_writer

    def derivativeTracker(self):
        """
        Return the DerivativeTracker, creating it on first use.
        """
        if not self._derivative_tracker:
            self._derivative_tracker = DerivativeTracker()
        return self._derivative_tracker

    def connecting(self):
        """
        Called when about to connect to zenhub
        """
        self.log.info("Attempting to connect to zenhub")

    def getZenhubInstanceId(self):
        """
        Called after we connected to zenhub.  Asks the hub for its instance
        id and logs the answer.

        @return: deferred for the remote call
        """
        def callback(result):
            self.log.info("Connected to the zenhub/%s instance", result)

        def errback(result):
            self.log.info("Unexpected error appeared while getting zenhub instance number %s", result)

        d = self.perspective.callRemote('getHubInstanceId')
        d.addCallback(callback)
        d.addErrback(errback)
        return d

    def gotPerspective(self, perspective):
        """
        This gets called every time we reconnect.

        @parameter perspective: Twisted perspective object
        @type perspective: Twisted perspective object
        """
        self.perspective = perspective
        self.getZenhubInstanceId()
        # Cancel the connection timeout timer as it's no longer needed.
        if self._connectionTimeout:
            try:
                self._connectionTimeout.cancel()
            except AlreadyCalled:
                pass
            self._connectionTimeout = None
        d2 = self.getInitialServices()
        if self.initialConnect:
            # Only the first connection fires initialConnect; it is cleared
            # here so that later reconnects skip this branch.
            self.log.debug('Chaining getInitialServices with d2')
            self.initialConnect, d = None, self.initialConnect
            d2.chainDeferred(d)

    def connect(self):
        """
        Start connecting and logging in to ZenHub.

        @return: the initialConnect deferred, which gotPerspective chains to
                 the result of getInitialServices on the first connection
        """
        pingInterval = self.options.zhPingInterval
        factory = ReconnectingPBClientFactory(
            connectTimeout=60,
            pingPerspective=self.options.pingPerspective,
            pingInterval=pingInterval,
            pingtimeout=pingInterval * 5)
        self.log.info("Connecting to %s:%d" % (self.options.hubhost,
                                               self.options.hubport))
        factory.connectTCP(self.options.hubhost, self.options.hubport)
        username = self.options.hubusername
        password = self.options.hubpassword
        self.log.debug("Logging in as %s" % username)
        c = credentials.UsernamePassword(username, password)
        factory.gotPerspective = self.gotPerspective
        factory.connecting = self.connecting
        factory.setCredentials(c)

        def timeout(d):
            if not d.called:
                self.connectTimeout()

        self._connectionTimeout = reactor.callLater(
            self.options.hubtimeout, timeout, self.initialConnect)
        return self.initialConnect

    def connectTimeout(self):
        """
        Called when the initial connection has not completed within
        the --initialHubTimeout period. Only logs an error.
        """
        self.log.error('Timeout connecting to zenhub: is it running?')

    def eventService(self):
        """
        Return the EventService (or a FakeRemote when it is unavailable).
        """
        return self.getServiceNow('EventService')

    def getServiceNow(self, svcName):
        """
        Return an already-loaded service without contacting ZenHub.

        @param svcName: name of the service
        @type svcName: string
        @return: the cached service, or a FakeRemote if none is cached
        """
        if svcName not in self.services:
            self.log.warning('No service named %r: ZenHub may be disconnected' % svcName)
        return self.services.get(svcName, None) or FakeRemote()

    def getService(self, serviceName, serviceListeningInterface=None):
        """
        Attempt to get a service from zenhub.  Returns a deferred.
        When service is retrieved it is stashed in self.services with
        serviceName as the key.  When getService is called it will first
        check self.services and if serviceName is already there it will return
        the entry from self.services wrapped in a defer.succeed
        """
        if serviceName in self.services:
            return defer.succeed(self.services[serviceName])

        def removeService(ignored):
            self.log.debug('Removing service %s' % serviceName)
            if serviceName in self.services:
                del self.services[serviceName]

        def callback(result, serviceName):
            self.log.debug('Loaded service %s from zenhub' % serviceName)
            self.services[serviceName] = result
            # Drop the cached reference once the remote side disconnects.
            result.notifyOnDisconnect(removeService)
            return result

        def errback(error, serviceName):
            self.log.debug('errback after getting service %s' % serviceName)
            self.log.error('Could not retrieve service %s' % serviceName)
            if serviceName in self.services:
                del self.services[serviceName]
            return error

        d = self.perspective.callRemote('getService',
                                        serviceName,
                                        self.options.monitor,
                                        serviceListeningInterface or self,
                                        self.options.__dict__)
        d.addCallback(callback, serviceName)
        d.addErrback(errback, serviceName)
        return d

    def getInitialServices(self):
        """
        After connecting to zenhub, gather our initial list of services.
        """
        def errback(error):
            if isinstance(error, Failure):
                self.log.critical(
                    "Invalid monitor: %s" % self.options.monitor)
                reactor.stop()
                return defer.fail(RemoteBadMonitor(
                    "Invalid monitor: %s" % self.options.monitor, ''))
            return error

        self.log.debug('Setting up initial services: %s' %
                       ', '.join(self.initialServices))
        d = defer.DeferredList(
            [self.getService(name) for name in self.initialServices],
            fireOnOneErrback=True, consumeErrors=True)
        d.addErrback(errback)
        return d

    def connected(self):
        """
        Hook invoked by run() once the initial connection succeeded.
        The base implementation does nothing.
        """
        pass

    def _getThresholdNotifier(self):
        """
        Return the ThresholdNotifier, creating it on first use.
        """
        if not self._threshold_notifier:
            self._threshold_notifier = ThresholdNotifier(self.sendEvent,
                                                         self.getThresholds())
        return self._threshold_notifier

    def getThresholds(self):
        """
        Return the Thresholds container, creating it on first use.
        """
        if not self._thresholds:
            self._thresholds = Thresholds()
        return self._thresholds

    def run(self):
        """
        Connect to ZenHub and run the reactor until it is stopped.
        Exits the process with the custom exit code when one was set.
        """
        def stopReporter():
            if self._metrologyReporter:
                return self._metrologyReporter.stop()

        # Order of the shutdown triggers matter. Want to stop reporter first, calling self.metricWriter() below
        # registers shutdown triggers for the actual metric http and redis publishers.
        reactor.addSystemEventTrigger('before', 'shutdown', stopReporter)

        threshold_notifier = self._getThresholdNotifier()
        self.rrdStats.config(self.name,
                             self.options.monitor,
                             self.metricWriter(),
                             threshold_notifier,
                             self.derivativeTracker())
        self.log.debug('Starting PBDaemon initialization')
        d = self.connect()

        def callback(result):
            self.sendEvent(self.startEvent)
            self.pushEventsLoop()
            self.log.debug('Calling connected.')
            self.connected()
            return result

        def startStatsLoop():
            self.log.debug("Starting Statistic posting")
            loop = task.LoopingCall(self.postStatistics)
            loop.start(self.options.writeStatistics, now=False)
            daemonTags = {
                'zenoss_daemon': self.name,
                'zenoss_monitor': self.options.monitor,
                'internal': True
            }
            self._metrologyReporter = TwistedMetricReporter(
                self.options.writeStatistics, self.metricWriter(), daemonTags)
            self._metrologyReporter.start()

        if self.options.cycle:
            reactor.callWhenRunning(startStatsLoop)
        d.addCallback(callback)
        d.addErrback(twisted.python.log.err)
        reactor.run()
        if self._customexitcode:
            sys.exit(self._customexitcode)

    def setExitCode(self, exitcode):
        """
        Set the process exit code used by run() after the reactor stops.
        """
        self._customexitcode = exitcode

    def stop(self, ignored=''):
        """
        Stop the reactor if it is running.

        @param ignored: unused; lets stop() be used as a deferred callback
        """
        if reactor.running:
            try:
                reactor.stop()
            except ReactorNotRunning:
                self.log.debug("Tried to stop reactor that was stopped")
        else:
            self.log.debug("stop() called when not running")

    def _stopPbDaemon(self):
        """
        Shutdown trigger: queue a stop event and flush the event queue.

        @return: a deferred when an EventService is available, else None
        """
        if self.stopped:
            return
        self.stopped = True
        if 'EventService' in self.services:
            # send stop event if we don't have an implied --cycle,
            # or if --cycle has been specified
            if getattr(self.options, 'cycle', True):
                self.sendEvent(self.stopEvent)
                self.log.debug("Sent a 'stop' event")
            if self._pushEventsDeferred:
                self.log.debug("Currently sending events. Queueing next call")
                d = self._pushEventsDeferred
                # Schedule another call to flush any additional queued events
                d.addBoth(lambda unused: self.pushEvents())
            else:
                d = self.pushEvents()
            return d

        self.log.debug("No event sent as no EventService available.")

    def sendEvents(self, events):
        """
        Queue every event in C{events} via sendEvent.
        """
        # Explicit loop: map() was used only for its side effects.
        for event in events:
            self.sendEvent(event)

    def sendEvent(self, event, **kw):
        """
        Add event to queue of events to be sent.

        @return: the pending high-water-mark deferred while one exists; the
                 result of pushEvents() when the queue has reached the high
                 water mark; otherwise an already-fired deferred
        """
        generatedEvent = self.generateEvent(event, **kw)
        self.eventQueueManager.addEvent(generatedEvent)
        self.counters['eventCount'] += 1

        if self._eventHighWaterMark:
            return self._eventHighWaterMark
        elif self.eventQueueManager.event_queue_length >= self.options.maxqueuelen * self.options.queueHighWaterMark:
            return self.pushEvents()
        else:
            return defer.succeed(None)

    def generateEvent(self, event, **kw):
        """
        Build a copy of event, merged with the keyword overrides and stamped
        with this daemon's agent, monitor and manager.  String values longer
        than their configured limit are truncated.

        @return: the new event dict, or None when the reactor is not running
        """
        if not reactor.running:
            return
        eventCopy = {}
        for k, v in chain(event.items(), kw.items()):
            if isinstance(v, basestring):
                #default max size is 512k
                size = LIMITS.get(k, DEFAULT_LIMIT)
                eventCopy[k] = v[0:size] if len(v) > size else v
            else:
                eventCopy[k] = v

        eventCopy['agent'] = self.name
        eventCopy['monitor'] = self.options.monitor
        eventCopy['manager'] = self.fqdn
        return eventCopy

    @defer.inlineCallbacks
    def pushEventsLoop(self):
        """Periodically, wake up and flush events to ZenHub.
        """
        reactor.callLater(self.options.eventflushseconds, self.pushEventsLoop)
        yield self.pushEvents()

        # Record the number of events in the queue up to every 2 seconds.
        now = time.time()
        if self.rrdStats.name and now >= (self.lastStats + 2):
            self.lastStats = now
            self.rrdStats.gauge(
                'eventQueueLength', self.eventQueueManager.event_queue_length)

    @defer.inlineCallbacks
    def pushEvents(self):
        """Flush events to ZenHub.
        """
        # are we already shutting down?
        if not reactor.running:
            self.log.debug("Skipping event sending - reactor not running.")
            return

        if self.eventQueueManager.event_queue_length >= self.options.maxqueuelen * self.options.queueHighWaterMark and not self._eventHighWaterMark:
            self.log.debug("Queue length exceeded high water mark, %s ;creating high water mark deferred", self.eventQueueManager.event_queue_length)
            self._eventHighWaterMark = defer.Deferred()

        # are still connected to ZenHub?
        evtSvc = self.services.get('EventService', None)
        if not evtSvc:
            self.log.error("No event service: %r", evtSvc)
            yield task.deferLater(reactor, 0, lambda: None)
            if self._eventHighWaterMark:
                d, self._eventHighWaterMark = self._eventHighWaterMark, None
                #not connected, release throttle and let things queue
                d.callback("No Event Service")
            defer.returnValue(None)

        if self._pushEventsDeferred:
            self.log.debug("Skipping event sending - previous call active.")
            defer.returnValue("Push Pending")

        sent = 0
        try:
            #only set _pushEventsDeferred after we know we have an evtSvc/connectivity
            self._pushEventsDeferred = defer.Deferred()

            def repush(val):
                if self.eventQueueManager.event_queue_length >= self.options.eventflushchunksize:
                    self.pushEvents()
                return val
            # conditionally push more events after this pushEvents call finishes
            self._pushEventsDeferred.addCallback(repush)

            discarded_events = self.eventQueueManager.discarded_events
            if discarded_events:
                self.log.error(
                    'Discarded oldest %d events because maxqueuelen was '
                    'exceeded: %d/%d',
                    discarded_events,
                    discarded_events + self.options.maxqueuelen,
                    self.options.maxqueuelen)
                self.counters['discardedEvents'] += discarded_events
                self.eventQueueManager.discarded_events = 0

            send_events_fn = partial(evtSvc.callRemote, 'sendEvents')
            try:
                sent = yield self.eventQueueManager.sendEvents(send_events_fn)
            except ConnectionLost as ex:
                self.log.error('Error sending event: %s', ex)
                #let the reactor have time to clean up any connection errors and make callbacks
                yield task.deferLater(reactor, 0, lambda: None)
        except Exception as ex:
            self.log.exception(ex)
            #let the reactor have time to clean up any connection errors and make callbacks
            yield task.deferLater(reactor, 0, lambda: None)
        finally:
            if self._pushEventsDeferred:
                d, self._pushEventsDeferred = self._pushEventsDeferred, None
                d.callback('sent %s' % sent)
            if self._eventHighWaterMark and self.eventQueueManager.event_queue_length < self.options.maxqueuelen * self.options.queueHighWaterMark:
                self.log.debug("Queue restored to below high water mark: %s", self.eventQueueManager.event_queue_length)
                d, self._eventHighWaterMark = self._eventHighWaterMark, None
                d.callback("Queue length below high water mark")

    def heartbeat(self):
        """if cycling, send a heartbeat, else, shutdown"""
        if not self.options.cycle:
            self.stop()
            return
        heartbeatEvent = self.generateEvent(self.heartbeatEvent,
                                            timeout=self.heartbeatTimeout)
        self.eventQueueManager.addHeartbeatEvent(heartbeatEvent)
        # heartbeat is normally 3x cycle time
        self.niceDoggie(self.heartbeatTimeout / 3)

    def postStatisticsImpl(self):
        """
        Hook called at the end of postStatistics.
        The base implementation does nothing.
        """
        pass

    def postStatistics(self):
        """
        Log and record every daemon counter, then call postStatisticsImpl.
        """
        # save daemon counter stats
        for name, value in self.counters.items():
            self.log.info("Counter %s, value %d", name, value)
            self.rrdStats.counter(name, value)

        # persist counters values
        self.postStatisticsImpl()

    def _pickleName(self):
        """
        Return the relative path of this daemon instance's counters pickle,
        built from the name and CONTROLPLANE_INSTANCE_ID.
        """
        instance_id = os.environ.get('CONTROLPLANE_INSTANCE_ID')
        return 'var/%s_%s_counters.pickle' % (self.name, instance_id)

    def remote_getName(self):
        """Remote call: return this daemon's name."""
        return self.name

    def remote_shutdown(self, unused):
        """Remote call: stop the reactor and invoke sigTerm()."""
        self.stop()
        self.sigTerm()

    def remote_setPropertyItems(self, items):
        """Remote call: ignored by the base class."""
        pass

    @translateError
    def remote_updateThresholdClasses(self, classes):
        """
        Remote call: import each of the named threshold classes.

        @param classes: names of the classes to import
        """
        from Products.ZenUtils.Utils import importClass
        self.log.debug("Loading classes %s", classes)
        for c in classes:
            try:
                importClass(c)
            except ImportError:
                self.log.error("Unable to import class %s", c)

    def _checkZenHub(self):
        """
        Check status of ZenHub (using ping method of service).
        @return: if ping occurs, return deferred with result of ping attempt.
        """
        self.log.debug('_checkZenHub: entry')

        def callback(result):
            self.log.debug('ZenHub health check: Got result %s' % result)
            if result == 'pong':
                self.log.debug('ZenHub health check: Success - received pong from ZenHub ping service.')
                self._signalZenHubAnswering(True)
            else:
                self.log.error('ZenHub health check did not respond as expected.')
                self._signalZenHubAnswering(False)

        def errback(error):
            self.log.error('Error pinging ZenHub: %s (%s).' % (error, getattr(error, 'message', '')))
            self._signalZenHubAnswering(False)

        try:
            if self.perspective:
                self.log.debug('ZenHub health check: perspective found. attempting remote ping call.')
                d = self.perspective.callRemote('ping')
                d.addCallback(callback)
                d.addErrback(errback)
                return d
            else:
                self.log.debug('ZenHub health check: ZenHub may be down.')
                self._signalZenHubAnswering(False)
        except pb.DeadReferenceError:
            self.log.warning("ZenHub health check: DeadReferenceError - lost connection to ZenHub.")
            self._signalZenHubAnswering(False)
        except Exception as e:
            self.log.error('ZenHub health check: caught %s exception: %s' % (e.__class__, e.message))
            self._signalZenHubAnswering(False)

    def _signalZenHubAnswering(self, answering):
        """
        Write or remove file that the ZenHub_answering health check uses to report status.
        @param answering: true if ZenHub is answering, False, otherwise.
        """
        self.log.debug('_signalZenHubAnswering(%s)' % answering)
        filename = 'zenhub_connected'
        signalFilePath = zenPath('var', filename)
        if answering:
            self.log.debug('writing file at %s' % signalFilePath)
            atomicWrite(signalFilePath, '')
        else:
            try:
                self.log.debug('removing file at %s' % signalFilePath)
                os.remove(signalFilePath)
            except Exception as e:
                self.log.debug('ignoring %s exception (%s) removing file %s' % (e.__class__, e.message, signalFilePath))

    def buildOptions(self):
        """
        Register the options of ZenDaemon, then the ZenHub connection, event
        queue and metric publishing command-line options.
        """
        ZenDaemon.buildOptions(self)

        self.parser.add_option('--hubhost',
                               dest='hubhost',
                               default=DEFAULT_HUB_HOST,
                               help='Host of zenhub daemon.'
                               ' Default is %s.' % DEFAULT_HUB_HOST)
        self.parser.add_option('--hubport',
                               dest='hubport',
                               type='int',
                               default=DEFAULT_HUB_PORT,
                               help='Port zenhub listens on.'
                               ' Default is %s.' % DEFAULT_HUB_PORT)
        self.parser.add_option('--hubusername',
                               dest='hubusername',
                               default=DEFAULT_HUB_USERNAME,
                               help='Username for zenhub login.'
                               ' Default is %s.' % DEFAULT_HUB_USERNAME)
        self.parser.add_option('--hubpassword',
                               dest='hubpassword',
                               default=DEFAULT_HUB_PASSWORD,
                               help='Password for zenhub login.'
                               ' Default is %s.' % DEFAULT_HUB_PASSWORD)
        self.parser.add_option('--monitor',
                               dest='monitor',
                               default=DEFAULT_HUB_MONITOR,
                               help='Name of monitor instance to use for'
                               ' configuration.  Default is %s.'
                               % DEFAULT_HUB_MONITOR)
        self.parser.add_option('--initialHubTimeout',
                               dest='hubtimeout',
                               type='int',
                               default=30,
                               help='Initial time to wait for a ZenHub '
                               'connection')
        self.parser.add_option('--allowduplicateclears',
                               dest='allowduplicateclears',
                               default=False,
                               action='store_true',
                               help='Send clear events even when the most '
                               'recent event was also a clear event.')
        self.parser.add_option('--duplicateclearinterval',
                               dest='duplicateclearinterval',
                               default=0,
                               type='int',
                               help=('Send a clear event every [DUPLICATECLEARINTEVAL] '
                                     'events.'))
        self.parser.add_option('--eventflushseconds',
                               dest='eventflushseconds',
                               default=5.,
                               type='float',
                               help='Seconds between attempts to flush '
                               'events to ZenHub.')
        self.parser.add_option('--eventflushchunksize',
                               dest='eventflushchunksize',
                               default=50,
                               type='int',
                               help='Number of events to send to ZenHub '
                               'at one time')
        self.parser.add_option('--maxqueuelen',
                               dest='maxqueuelen',
                               default=5000,
                               type='int',
                               help='Maximum number of events to queue')
        # The value multiplies maxqueuelen, so it is a fraction, not a percent.
        self.parser.add_option('--queuehighwatermark',
                               dest='queueHighWaterMark',
                               default=0.75,
                               type='float',
                               help='The size, as a fraction of maxqueuelen, '
                               'of the event queue when event pushback starts')
        self.parser.add_option('--zenhubpinginterval',
                               dest='zhPingInterval',
                               default=120,
                               type='int',
                               help='How often to ping zenhub')
        self.parser.add_option('--disable-event-deduplication',
                               dest='deduplicate_events',
                               default=True,
                               action='store_false',
                               help='Disable event de-duplication')
        self.parser.add_option('--redis-url',
                               dest='redisUrl',
                               type='string',
                               default='redis://localhost:{default}/0'.format(default=publisher.defaultRedisPort),
                               help='redis connection string: redis://[hostname]:[port]/[db], default: %default')
        self.parser.add_option('--metricBufferSize',
                               dest='metricBufferSize',
                               type='int',
                               default=publisher.defaultMetricBufferSize,
                               help='Number of metrics to buffer if redis goes down')
        self.parser.add_option('--metricsChannel',
                               dest='metricsChannel',
                               type='string',
                               default=publisher.defaultMetricsChannel,
                               help='redis channel to which metrics are published')
        self.parser.add_option('--maxOutstandingMetrics',
                               dest='maxOutstandingMetrics',
                               type='int',
                               default=publisher.defaultMaxOutstandingMetrics,
                               help='Max Number of metrics to allow in redis')
        self.parser.add_option('--disable-ping-perspective',
                               dest='pingPerspective',
                               help="Enable or disable ping perspective",
                               default=True,
                               action='store_false')
        self.parser.add_option('--writeStatistics',
                               dest='writeStatistics',
                               type='int',
                               default=30,
                               help='How often to write internal statistics value in seconds')
class EventServer(PBDaemon):
    """
    Base class for a daemon whose primary job is to post events.

    On connection to ZenHub it announces itself with an App_Start event,
    pulls its statistics configuration from the 'EventService' remote and
    then pushes heartbeats and event-processing statistics periodically.
    """

    name = 'EventServer'

    def __init__(self):
        """
        Start the daemon keeping root privileges (keeproot=True) and create
        the statistics helpers used by heartbeat() and report().
        """
        PBDaemon.__init__(self, keeproot=True)
        self.stats = Stats()
        self.rrdStats = DaemonStats()

    def connected(self):
        """
        Called once connected to ZenHub: send the start event and
        kick off configuration.
        """
        self.sendEvent(dict(device=self.options.monitor,
                            eventClass=App_Start,
                            summary="%s started" % self.name,
                            severity=0,
                            component=self.name))
        self.log.info("started")
        self.configure()

    def model(self):
        """
        Return the 'EventService' remote, or a FakeRemote placeholder when
        that service has not been registered in self.services.
        """
        return self.services.get('EventService', FakeRemote())

    def configure(self):
        """
        Fetch the RRD create command, threshold classes and collector
        thresholds from the EventService, configure rrdStats with them,
        then start the heartbeat and statistics-report cycles.

        @return: the deferred returned by drive(); failures are logged
        @rtype: Twisted deferred
        """
        def inner(driver):
            self.log.info("fetching default RRDCreateCommand")
            yield self.model().callRemote('getDefaultRRDCreateCommand')
            createCommand = driver.next()

            self.log.info("getting threshold classes")
            yield self.model().callRemote('getThresholdClasses')
            self.remote_updateThresholdClasses(driver.next())

            self.log.info("getting collector thresholds")
            yield self.model().callRemote('getCollectorThresholds')
            self.rrdStats.config(self.options.monitor, self.name,
                                 driver.next(), createCommand)
            self.heartbeat()
            self.reportCycle()

        def error(result):
            # Lazy logging arguments: no string formatting unless emitted.
            self.log.error("Unexpected error in configure: %s", result)

        d = drive(inner)
        d.addErrback(error)
        return d

    def sendEvent(self, event, **kw):
        """
        Record how long the event took to get here (when it carries a
        'firstTime' stamp) and forward it to PBDaemon.sendEvent().

        @param event: event to send
        @type event: dictionary
        """
        # FIXME: get real event processing stats
        if 'firstTime' in event:
            # Clamp at zero so a firstTime stamped slightly in the future
            # (clock skew) cannot record a negative duration.  The previous
            # min(..., 0) discarded every real (positive) measurement.
            self.stats.add(max(time.time() - event['firstTime'], 0))
        PBDaemon.sendEvent(self, event, **kw)

    def useUdpFileDescriptor(self, fd):
        """
        Adopt an already-open UDP socket, given by file descriptor, as
        this object's Twisted transport.

        @param fd: file descriptor of an open AF_INET datagram socket
        @type fd: integer
        """
        import os
        import socket
        from twisted.internet import udp

        # fromfd() duplicates the descriptor, so the original can be closed.
        s = socket.fromfd(fd, socket.AF_INET, socket.SOCK_DGRAM)
        os.close(fd)
        port = s.getsockname()[1]
        transport = udp.Port(port, self)
        s.setblocking(0)
        transport.socket = s
        transport.fileno = s.fileno
        transport.connected = 1
        transport._realPortNumber = port
        self.transport = transport
        # hack around startListening not being called
        self.numPorts = 1
        transport.startReading()

    def useTcpFileDescriptor(self, fd, factory):
        """
        Adopt an already-open TCP listening socket given by file descriptor.

        A throw-away listener is opened on the first free port in
        19800-19998 and then its descriptor is replaced (dup2) by fd.

        @param fd: file descriptor of the socket to adopt
        @type fd: integer
        @param factory: protocol factory handed to reactor.listenTCP
        @type factory: Twisted factory
        @return: the listening port object returned by reactor.listenTCP
        @raise socket.error: when no port in the range could be used
        """
        import os
        import socket

        for i in range(19800, 19999):
            try:
                p = reactor.listenTCP(i, factory)
                os.dup2(fd, p.socket.fileno())
                p.socket.listen(p.backlog)
                p.socket.setblocking(False)
                p.socket.setsockopt(socket.SOL_SOCKET,
                                    socket.SO_REUSEADDR, 1)
                os.close(fd)
                return p
            except socket.error:
                # Deliberate best-effort: try the next candidate port.
                pass
        raise socket.error("Unable to find an open socket to listen on")

    def reportCycle(self):
        """
        Log statistics every --statcycle seconds; does nothing (and does
        not reschedule) when statcycle is 0.
        """
        if self.options.statcycle:
            self.report()
            reactor.callLater(self.options.statcycle, self.reportCycle)

    def heartbeat(self):
        """
        Since we don't do anything on a regular basis, just
        push heartbeats regularly, along with the event counters.
        """
        # Beat three times per heartbeat timeout; the same interval is
        # passed to the rrdStats counters below.
        seconds = self.heartbeatTimeout / 3
        reactor.callLater(seconds, self.heartbeat)
        PBDaemon.heartbeat(self)
        totalTime, totalEvents, _ = self.stats.report()
        events = (
            self.rrdStats.counter('events', seconds, totalEvents) +
            self.rrdStats.counter('totalTime', seconds,
                                  int(totalTime * 1000))
        )
        for ev in events:
            self.sendEvent(ev)

    def report(self):
        """Log some simple event-processing diagnostics."""
        totalTime, totalEvents, maxTime = self.stats.report()
        self.log.info("%d events processed in %.2f seconds",
                      totalEvents, totalTime)
        if totalEvents > 0:
            self.log.info("%.5f average seconds per event",
                          (totalTime / totalEvents))
            self.log.info("Maximum processing time for one event was %.5f",
                          maxTime)

    def buildOptions(self):
        """Add the --statcycle option on top of the PBDaemon options."""
        PBDaemon.buildOptions(self)
        self.parser.add_option(
            '--statcycle', dest='statcycle', type='int',
            help='Number of seconds between the writing of statistics',
            default=0)
class ZenModeler(PBDaemon): """ Daemon class to attach to zenhub and pass along device configuration information. """ name = 'zenmodeler' initialServices = PBDaemon.initialServices + ['ModelerService'] generateEvents = True configCycleInterval = 360 classCollectorPlugins = () def __init__(self, single=False): """ Initalizer @param single: collect from a single device? @type single: boolean """ PBDaemon.__init__(self) # FIXME: cleanup --force option #2660 self.options.force = True self.start = None self.startat = None self.rrdStats = DaemonStats() self.single = single if self.options.device: self.single = True self.modelerCycleInterval = self.options.cycletime # get the minutes and convert to fraction of a day self.collage = float(self.options.collage) / 1440.0 self.pendingNewClients = False self.clients = [] self.finished = [] self.devicegen = None self.counters = collections.Counter() self.configFilter = None self.configLoaded = False # Make sendEvent() available to plugins zope.component.provideUtility(self, IEventService) # Delay start for between 10 and 60 seconds when run as a daemon. self.started = False self.startDelay = 0 self.immediate = 1 if self.options.daemon or self.options.cycle: if self.options.now: self.log.debug('option "now" specified, starting immediately.') else: # self.startDelay = randint(10, 60) * 60 self.startDelay = randint(10, 60) * 1 self.immediate = 0 self.log.info( 'option "now" not specified, waiting %s seconds to start.' 
% self.startDelay) else: self.log.debug("Run in foreground, starting immediately.") # ZEN-26637 self.collectorLoopIteration = 0 self.mainLoopGotDeviceList = False self.isMainScheduled = False self._modeledDevicesMetric = Metrology.meter( "zenmodeler.modeledDevices") self._failuresMetric = Metrology.counter("zenmodeler.failures") def reportError(self, error): """ Log errors that have occurred @param error: error message @type error: string """ self.log.error("Error occured: %s", error) def connected(self): """ Called after connected to the zenhub service """ reactor.callLater(_CONFIG_PULLING_TIMEOUT, self._checkConfigLoad) d = self.configure() d.addCallback(self.heartbeat) d.addErrback(self.reportError) def _checkConfigLoad(self): """ Looping call to check whether zenmodeler got configuration from ZenHub. """ if not self.configLoaded: self.log.info("Modeling has not started pending configuration " "pull from ZenHub. Is ZenHub overloaded?") reactor.callLater(_CONFIG_PULLING_TIMEOUT, self._checkConfigLoad) def configure(self): """ Get our configuration from zenhub """ # add in the code to fetch cycle time, etc. self.log.info("Getting configuration from ZenHub...") def inner(driver): """ Generator function to gather our configuration @param driver: driver object @type driver: driver object """ self.log.debug('fetching monitor properties') yield self.config().callRemote('propertyItems') items = dict(driver.next()) # If the cycletime option is not specified or zero, then use the # modelerCycleInterval value in the database. 
if not self.options.cycletime: self.modelerCycleInterval = items.get('modelerCycleInterval', _DEFAULT_CYCLE_INTERVAL) self.configCycleInterval = items.get('configCycleInterval', self.configCycleInterval) reactor.callLater(self.configCycleInterval * 60, self.configure) self.log.debug("Getting threshold classes...") yield self.config().callRemote('getThresholdClasses') self.remote_updateThresholdClasses(driver.next()) self.log.debug("Getting collector thresholds...") yield self.config().callRemote('getCollectorThresholds') thresholds = driver.next() threshold_notifier = ThresholdNotifier(self.sendEvent, thresholds) self.rrdStats.config(self.name, self.options.monitor, self.metricWriter(), threshold_notifier, self.derivativeTracker()) self.log.debug("Getting collector plugins for each DeviceClass") yield self.config().callRemote('getClassCollectorPlugins') self.classCollectorPlugins = driver.next() self.configLoaded = True return drive(inner) def config(self): """ Get the ModelerService """ return self.services.get('ModelerService', FakeRemote()) def selectPlugins(self, device, transport): """ Build a list of active plugins for a device, based on: * the --collect command-line option which is a regex * the --ignore command-line option which is a regex * transport which is a string describing the type of plugin @param device: device to collect against @type device: string @param transport: python, ssh, snmp, telnet, cmd @type transport: string @return: results of the plugin @type: string @todo: determine if an event for the collector AND the device should be sent """ plugins = [] valid_loaders = [] for loader in device.plugins: try: plugin = loader.create() self.log.debug("Loaded plugin %s" % plugin.name()) plugins.append(plugin) valid_loaders.append(loader) except Plugins.PluginImportError as import_error: import socket component, _ = os.path.splitext(os.path.basename(sys.argv[0])) collector_host = socket.gethostname() # NB: an import errror affects all devices, # so 
report the issue against the collector # TOOD: determine if an event for the collector AND the device should be sent evt = { "eventClass": "/Status/Update", "component": component, "agent": collector_host, "device": collector_host, "severity": Error } info = "Problem loading plugin %s" % import_error.plugin self.log.error(info) evt['summary'] = info info = import_error.traceback self.log.error(info) evt['message'] = info info = ("Due to import errors, removing the %s plugin" " from this collection cycle.") % import_error.plugin self.log.error(info) evt['message'] += "%s\n" % info self.sendEvent(evt) # Make sure that we don't generate messages for bad loaders again # NB: doesn't update the device's zProperties if len(device.plugins) != len(valid_loaders): device.plugins = valid_loaders # Create functions to search for what plugins we will and # won't supply to the device collectTest = lambda x: False ignoreTest = lambda x: False if self.options.collectPlugins: collectTest = re.compile(self.options.collectPlugins).search elif self.options.ignorePlugins: ignoreTest = re.compile(self.options.ignorePlugins).search result = [] for plugin in plugins: if plugin.transport != transport: continue name = plugin.name() if ignoreTest(name): self.log.debug("Ignoring %s on %s because of --ignore flag", name, device.id) elif collectTest(name): self.log.debug("Using %s on %s because of --collect flag", name, device.id) result.append(plugin) elif not self.options.collectPlugins: self.log.debug("Using %s on %s", name, device.id) result.append(plugin) return result def collectDevice(self, device): """ Collect data from a single device. 
@param device: device to collect against @type device: string """ clientTimeout = getattr(device, 'zCollectorClientTimeout', 180) ip = device.manageIp timeout = clientTimeout + time.time() if USE_WMI: self.wmiCollect(device, ip, timeout) else: self.log.info( "skipping WMI-based collection, PySamba zenpack not installed") self.log.info("Collect on device {} for collector loop #{:03d}".format( device.id, self.collectorLoopIteration)) self.pythonCollect(device, ip, timeout) self.cmdCollect(device, ip, timeout) self.snmpCollect(device, ip, timeout) self.portscanCollect(device, ip, timeout) def wmiCollect(self, device, ip, timeout): """ Start the Windows Management Instrumentation (WMI) collector @param device: device to collect against @type device: string @param ip: IP address of device to collect against @type ip: string @param timeout: timeout before failing the connection @type timeout: integer """ if self.options.nowmi: return client = None try: plugins = self.selectPlugins(device, 'wmi') if not plugins: self.log.info("No WMI plugins found for %s" % device.id) return if self.checkCollection(device): self.log.info('WMI collector method for device %s' % device.id) self.log.info("plugins: %s", ", ".join(map(lambda p: p.name(), plugins))) client = WMIClient(device, self, plugins) if not client or not plugins: self.log.warn("WMI collector creation failed") return except Exception: self.log.exception("Error opening WMI collector") self.addClient(client, timeout, 'WMI', device.id) def pythonCollect(self, device, ip, timeout): """ Start local Python collection client. 
@param device: device to collect against @type device: string @param ip: IP address of device to collect against @type ip: string @param timeout: timeout before failing the connection @type timeout: integer """ client = None try: plugins = self.selectPlugins(device, "python") if not plugins: self.log.info("No Python plugins found for %s" % device.id) return if self.checkCollection(device): self.log.info('Python collection device %s' % device.id) self.log.info("plugins: %s", ", ".join(map(lambda p: p.name(), plugins))) client = PythonClient(device, self, plugins) if not client or not plugins: self.log.warn("Python client creation failed") return except Exception: self.log.exception("Error opening pythonclient") self.addClient(client, timeout, 'python', device.id) def cmdCollect(self, device, ip, timeout): """ Start shell command collection client. @param device: device to collect against @type device: string @param ip: IP address of device to collect against @type ip: string @param timeout: timeout before failing the connection @type timeout: integer """ client = None clientType = 'snmp' # default to SNMP if we can't figure out a protocol hostname = device.id try: plugins = self.selectPlugins(device, "command") if not plugins: self.log.info("No command plugins found for %s" % hostname) return protocol = getattr(device, 'zCommandProtocol', defaultProtocol) commandPort = getattr(device, 'zCommandPort', defaultPort) # don't even create a client if we shouldn't collect/model yet if not self.checkCollection(device): return if protocol == "ssh": client = SshClient(hostname, ip, commandPort, options=self.options, plugins=plugins, device=device, datacollector=self, isLoseConnection=True) clientType = 'ssh' self.log.info('Using SSH collection method for device %s' % hostname) elif protocol == 'telnet': if commandPort == 22: commandPort = 23 #set default telnet client = TelnetClient(hostname, ip, commandPort, options=self.options, plugins=plugins, device=device, 
datacollector=self) clientType = 'telnet' self.log.info('Using telnet collection method for device %s' % hostname) else: info = ("Unknown protocol %s for device %s -- " "defaulting to %s collection method" % (protocol, hostname, clientType)) self.log.warn(info) import socket component, _ = os.path.splitext(os.path.basename(sys.argv[0])) collector_host = socket.gethostname() evt = { "eventClass": "/Status/Update", "agent": collector_host, "device": hostname, "severity": Error } evt['summary'] = info self.sendEvent(evt) return if not client: self.log.warn("Shell command collector creation failed") else: self.log.info("plugins: %s", ", ".join(map(lambda p: p.name(), plugins))) except Exception: self.log.exception("Error opening command collector") self.addClient(client, timeout, clientType, device.id) def snmpCollect(self, device, ip, timeout): """ Start SNMP collection client. @param device: device to collect against @type device: string @param ip: IP address of device to collect against @type ip: string @param timeout: timeout before failing the connection @type timeout: integer """ client = None try: hostname = device.id if getattr(device, "zSnmpMonitorIgnore", True): self.log.info("SNMP monitoring off for %s" % hostname) return if not ip: self.log.info("No manage IP for %s" % hostname) return plugins = [] plugins = self.selectPlugins(device, "snmp") if not plugins: self.log.info("No SNMP plugins found for %s" % hostname) return if self.checkCollection(device): self.log.info('SNMP collection device %s' % hostname) self.log.info("plugins: %s", ", ".join(map(lambda p: p.name(), plugins))) client = SnmpClient(device.id, ip, self.options, device, self, plugins) if not client or not plugins: self.log.warn("SNMP collector creation failed") return except Exception: self.log.exception("Error opening the SNMP collector") self.addClient(client, timeout, 'SNMP', device.id) ######## need to make async test for snmp work at some point -EAD ######### # def 
checkSnmpConnection(self, device): # """ # Check to see if our current community string is still valid # # @param device: the device against which we will check # @type device: a Device instance # @return: result is None or a tuple containing # (community, port, version, snmp name) # @rtype: deferred: Twisted deferred # """ # from pynetsnmp.twistedsnmp import AgentProxy # # def inner(driver): # self.log.debug("Checking SNMP community %s on %s", # device.zSnmpCommunity, device.id) # # oid = ".1.3.6.1.2.1.1.5.0" # proxy = AgentProxy(device.id, # device.zSnmpPort, # timeout=device.zSnmpTimeout, # community=device.zSnmpCommunity, # snmpVersion=device.zSnmpVer, # tries=2) # proxy.open() # yield proxy.get([oid]) # devname = driver.next().values()[0] # if devname: # yield succeed(True) # yield succeed(False) # # return drive(inner) def addClient(self, device, timeout, clientType, name): """ If device is not None, schedule the device to be collected. Otherwise log an error. @param device: device to collect against @type device: string @param timeout: timeout before failing the connection @type timeout: integer @param clientType: description of the plugin type @type clientType: string @param name: plugin name @type name: string """ if device: device.timeout = timeout device.timedOut = False self.clients.append(device) device.run() else: self.log.warn('Unable to create a %s collector for %s', clientType, name) # XXX double-check this, once the implementation is in place def portscanCollect(self, device, ip, timeout): """ Start portscan collection client. 
@param device: device to collect against @type device: string @param ip: IP address of device to collect against @type ip: string @param timeout: timeout before failing the connection @type timeout: integer """ client = None try: hostname = device.id plugins = self.selectPlugins(device, "portscan") if not plugins: self.log.info("No portscan plugins found for %s" % hostname) return if self.checkCollection(device): self.log.info('Portscan collector method for device %s' % hostname) self.log.info("plugins: %s", ", ".join(map(lambda p: p.name(), plugins))) client = PortscanClient(device.id, ip, self.options, device, self, plugins) if not client or not plugins: self.log.warn("Portscan collector creation failed") return except Exception: self.log.exception("Error opening portscan collector") self.addClient(client, timeout, 'portscan', device.id) def checkCollection(self, device): """ See how old the data is that we've collected @param device: device to collect against @type device: string @return: is the last collection time + collage older than now or is the SNMP status number > 0 ? @type: boolean """ delay = device.getSnmpLastCollection() + self.collage if delay >= float( DateTime.DateTime()) and device.getSnmpStatusNumber() == 0: self.log.info("Skipped collection of %s" % device.id) return False return True def clientFinished(self, collectorClient): """ Callback that processes the return values from a device. Python iterable. @param collectorClient: collector instance @type collectorClient: collector class @return: Twisted deferred object @type: Twisted deferred object """ device = collectorClient.device self.log.debug("Client for %s finished collecting", device.id) def processClient(driver): try: if (isinstance(collectorClient, SnmpClient) and collectorClient.connInfo.changed == True): self.log.info( "SNMP connection info for %s changed. 
Updating...", device.id) yield self.config().callRemote( 'setSnmpConnectionInfo', device.id, collectorClient.connInfo.zSnmpVer, collectorClient.connInfo.zSnmpPort, collectorClient.connInfo.zSnmpCommunity) driver.next() pluginStats = {} self.log.debug("Processing data for device %s", device.id) devchanged = False maps = [] for plugin, results in collectorClient.getResults(): if plugin is None: continue self.log.debug("Processing plugin %s on device %s ...", plugin.name(), device.id) if not results: self.log.warn("The plugin %s returned no results.", plugin.name()) continue if self.options.save_raw_results: self.savePluginData(device.id, plugin.name(), 'raw', results) self.log.debug("Plugin %s results = %s", plugin.name(), results) datamaps = [] try: results = plugin.preprocess(results, self.log) if results: datamaps = plugin.process(device, results, self.log) if datamaps: pluginStats.setdefault(plugin.name(), plugin.weight) except (SystemExit, KeyboardInterrupt) as ex: self.log.info("Plugin %s terminated due to external" " signal (%s)" % (plugin.name(), str(ex))) continue except Exception as ex: # NB: don't discard the plugin, as it might be a # temporary issue # Also, report it against the device, rather than at # a collector as it might be just for this device. 
import socket collector_host = socket.gethostname() evt = { "eventClass": "/Status/Update", "agent": collector_host, "device": device.id, "severity": Error } info = "Problem while executing plugin %s" % plugin.name( ) self.log.error(info) evt['summary'] = info info = traceback.format_exc() self.log.error(info) evt['message'] = info self.sendEvent(evt) continue # allow multiple maps to be returned from one plugin if not isinstance(datamaps, (list, tuple)): datamaps = [ datamaps, ] if datamaps: newmaps = [m for m in datamaps if m] for m in newmaps: setattr(m, PLUGIN_NAME_ATTR, plugin.name()) if self.options.save_processed_results: self.savePluginData(device.id, plugin.name(), 'processed', newmaps) maps += newmaps if maps: deviceClass = Classifier.classifyDevice( pluginStats, self.classCollectorPlugins) # If self.single is True, then call singleApplyDataMaps # instead of applyDataMaps. if not self.single: method = "applyDataMaps" else: method = "singleApplyDataMaps" yield self.config().callRemote(method, device.id, maps, deviceClass, True) if driver.next(): devchanged = True if devchanged: self.log.info("Changes in configuration applied") else: self.log.info("No change in configuration detected") except Exception as ex: self.log.exception(ex) raise def processClientFinished(result): """ Called after the client collection finishes @param result: object (unused) @type result: object """ self.counters['modeledDevicesCount'] += 1 self._modeledDevicesMetric.mark() # result is now the result of remote_applyDataMaps (from processClient) if result and isinstance(result, (basestring, Failure)): self.log.error("Client %s finished with message: %s" % (device.id, result)) self._failuresMetric.increment() else: self.log.debug("Client %s finished" % device.id) try: self.clients.remove(collectorClient) self.finished.append(collectorClient) except ValueError: self.log.debug( "Client %s not found in in the list" " of active clients", device.id) self.log.info( "Finished processing 
client within collector loop #{0:03d}". format(self.collectorLoopIteration)) d = drive(self.fillCollectionSlots) d.addErrback(self.fillError) d = drive(processClient) d.addBoth(processClientFinished) def savePluginData(self, deviceName, pluginName, dataType, data): filename = "/tmp/%s.%s.%s.pickle.gz" % (deviceName, pluginName, dataType) try: with gzip.open(filename, 'wb') as fd: pickle.dump(data, fd) except Exception as ex: self.log.warn("Unable to save data into file '%s': %s", filename, ex) def fillError(self, reason): """ Twisted errback routine to log an error when unable to collect some data @param reason: error message @type reason: string """ self.log.error("Unable to fill collection slots: %s" % reason) def cycleTime(self): """ Return our cycle time (in minutes) @return: cycle time @rtype: integer """ return self.modelerCycleInterval * 60 def heartbeat(self, ignored=None): """ Twisted keep-alive mechanism to ensure that we're still connected to zenhub @param ignored: object (unused) @type ignored: object """ ARBITRARY_BEAT = 30 reactor.callLater(ARBITRARY_BEAT, self.heartbeat) if self.options.cycle: evt = dict(eventClass=Heartbeat, component='zenmodeler', device=self.options.monitor, timeout=self.options.heartbeatTimeout) self.sendEvent(evt) self.niceDoggie(self.cycleTime()) # We start modeling from here to accomodate the startup delay. 
if not self.started: if self.immediate == 0 and self.startat: # This stuff relies on ARBITRARY_BEAT being < 60s if self.timeMatches(): self.started = True self.log.info("Starting modeling...") reactor.callLater(1, self.main) elif not self.isMainScheduled: self.isMainScheduled = True reactor.callLater(self.cycleTime(), self.main) else: self.started = True self.log.info("Starting modeling in %s seconds.", self.startDelay) reactor.callLater(self.startDelay, self.main) def postStatisticsImpl(self): # save modeled device rate self.rrdStats.derive('modeledDevices', self.counters['modeledDevicesCount']) # save running count self.rrdStats.gauge('modeledDevicesCount', self.counters['modeledDevicesCount']) def _getCountersFile(self): return zenPath('var/%s_%s.pickle' % ( self.name, self.options.monitor, )) @property def _devicegen_has_items(self): """check it self.devicegen (an iterator) is not empty and has at least one value. doing this check changes the iterator, so this method restores it to its original state before returning""" result = False if self.devicegen is not None: try: first = self.devicegen.next() except StopIteration: pass else: result = True self.devicegen = chain([first], self.devicegen) return result def checkStop(self, unused=None): """ Check to see if there's anything to do. If there isn't, report our statistics and exit. 
@param unused: unused (unused) @type unused: string """ if self.pendingNewClients or self.clients: return if self._devicegen_has_items: return if not self.mainLoopGotDeviceList: return # ZEN-26637 to prevent race between checkStop and mainLoop if self.start: runTime = time.time() - self.start self.start = None if not self.didCollect: self.log.info("Did not collect during collector loop") self.log.info("Scan time: %0.2f seconds for collector loop #%03d", runTime, self.collectorLoopIteration) self.log.info( "Scanned %d of %d devices during collector loop #%03d", self.processedDevicesCount, self.iterationDeviceCount, self.collectorLoopIteration) devices = len(self.finished) timedOut = len([c for c in self.finished if c.timedOut]) self.rrdStats.gauge('cycleTime', runTime) self.rrdStats.gauge('devices', devices) self.rrdStats.gauge('timedOut', timedOut) if not self.options.cycle: self.stop() self.finished = [] def fillCollectionSlots(self, driver): """ An iterator which either returns a device to collect or calls checkStop() @param driver: driver object @type driver: driver object """ count = len(self.clients) while count < self.options.parallel and self._devicegen_has_items \ and not self.pendingNewClients: self.pendingNewClients = True try: device = self.devicegen.next() yield self.config().callRemote('getDeviceConfig', [device], self.options.checkStatus) # just collect one device, and let the timer add more devices = driver.next() if devices: self.processedDevicesCount = self.processedDevicesCount + 1 self.log.info( "Filled collection slots for %d of %d devices during collector loop #%03d", self.processedDevicesCount, self.iterationDeviceCount, self.collectorLoopIteration ) #TODO should this message be logged at debug level? 
self.didCollect = True d = devices[0] if d.skipModelMsg: self.log.info(d.skipModelMsg) else: self.collectDevice(d) else: self.log.info("Device %s not returned is it down?", device) except StopIteration: self.devicegen = None finally: self.pendingNewClients = False break update = len(self.clients) if update != count and update != 1: self.log.info('Running %d clients', update) else: self.log.debug('Running %d clients', update) self.checkStop() def timeMatches(self): """ Check whether the current time matches a cron-like specification, return a straight true or false """ if self.startat is None: return True def match_entity(entity, value): if entity == '*': return True value = int(value) if entity.isdigit() and int(entity) == value: return True if entity.startswith('*/') and entity[2:].isdigit(): if value % int(entity[2:]) == 0: return True if ',' in entity and any( segment.isdigit() and int(segment) == value for segment in entity.split(',')): return True return False curtime = time.localtime() # match minutes, hours, date, and month fields if all( match_entity(self.startat[a], curtime[b]) for a, b in ((0, 4), (1, 3), (2, 2), (3, 1))): dayofweek = curtime[6] + 1 if (match_entity(self.startat[4], dayofweek) or dayofweek == 7 and match_entity(self.startat[4], 0)): return True return False def buildOptions(self): """ Build our list of command-line options """ PBDaemon.buildOptions(self) self.parser.add_option('--debug', dest='debug', action="store_true", default=False, help="Don't fork threads for processing") self.parser.add_option('--nowmi', dest='nowmi', action="store_true", default=not USE_WMI, help="Do not execute WMI plugins") self.parser.add_option( '--parallel', dest='parallel', type='int', default=defaultParallel, help="Number of devices to collect from in parallel") self.parser.add_option('--cycletime', dest='cycletime', type='int', help="Run collection every x minutes") self.parser.add_option( '--ignore', dest='ignorePlugins', default="", help="Modeler plugins 
to ignore. Takes a regular expression") self.parser.add_option( '--collect', dest='collectPlugins', default="", help="Modeler plugins to use. Takes a regular expression") self.parser.add_option( '-p', '--path', dest='path', help="Start class path for collection ie /NetworkDevices") self.parser.add_option( '-d', '--device', dest='device', help="Fully qualified device name ie www.confmon.com") self.parser.add_option('--startat', dest='startat', help="Start string in cron(8) format") self.parser.add_option( '-a', '--collage', dest='collage', default=0, type='float', help="Do not collect from devices whose collect date " + "is within this many minutes") self.parser.add_option('--writetries', dest='writetries', default=2, type='int', help="Number of times to try to write if a " "read conflict is found") # FIXME: cleanup --force option #2660 self.parser.add_option( "-F", "--force", dest="force", action='store_true', default=True, help="Force collection of config data (deprecated)") self.parser.add_option( '--portscantimeout', dest='portscantimeout', type='int', default=defaultPortScanTimeout, help="Time to wait for connection failures when port scanning") self.parser.add_option( '--now', dest='now', action="store_true", default=False, help="Start daemon now, do not sleep before starting") self.parser.add_option( '--communities', dest='discoverCommunity', action="store_true", default=False, help= "If an snmp connection fails try and rediscover it's connection info" ) self.parser.add_option( '--checkstatus', dest='checkStatus', action="store_true", default=False, help="Don't model if the device is ping or snmp down") self.parser.add_option( '--save_raw_results', dest='save_raw_results', action="store_true", default=False, help="Save raw results for replay purposes in /tmp") self.parser.add_option( '--save_processed_results', dest='save_processed_results', action="store_true", default=False, help="Save modeler plugin outputs for replay purposes in /tmp") 
# Tail of the preceding option-building method (its header lies above this
# span): register the worker and transport option groups on the parser.
addWorkerOptions(self.parser)
TCbuildOptions(self.parser, self.usage)
if USE_WMI:
    addNTLMv2Option(self.parser)

def processOptions(self):
    """
    Check what the user gave us vs what we'll accept
    for command-line options.

    Defaults the collection path to "/Devices" when neither a path nor a
    device was given, rejects the combination of --ignore and --collect,
    parses the --startat cron string into self.startat, and stores the
    worker config filter when one is produced.

    @raise SystemExit: if both ignorePlugins and collectPlugins are set
    """
    if not self.options.path and not self.options.device:
        self.options.path = "/Devices"
    if self.options.ignorePlugins and self.options.collectPlugins:
        raise SystemExit("Only one of --ignore or --collect"
                         " can be used at a time")
    if self.options.startat:
        # Five whitespace-separated fields made of digits, '*', '/' and ','
        cronmatch = re.match(
            '^\s*([\*/,\d]+)\s+([\*/,\d]+)\s+([\*/,\d]+)\s+([\*/,\d]+)\s+([\*/,\d]+)\s*$',
            self.options.startat)
        if cronmatch:
            self.startat = cronmatch.groups()
        else:
            # An invalid value is only logged; self.startat is left untouched
            self.log.error(
                'startat option "%s" was invalid, carrying on anyway',
                self.options.startat)
    if USE_WMI:
        setNTLMv2Auth(self.options)
    configFilter = parseWorkerOptions(self.options.__dict__)
    if configFilter:
        self.configFilter = configFilter

def _timeoutClients(self):
    """
    The guts of the timeoutClients method (minus the twisted reactor
    stuff). Breaking this part out as a separate method facilitates unit
    testing.

    Clients whose timeout has passed are flagged as timed out, moved to
    self.finished and stopped; the rest remain in self.clients.
    """
    active = []
    for client in self.clients:
        if client.timeout < time.time():
            self.log.warn("Client %s timeout", client.hostname)
            self.finished.append(client)
            client.timedOut = True
            try:
                client.stop()
            except AssertionError:
                pass  # session closed twice http://dev.zenoss.org/trac/ticket/6354
        else:
            active.append(client)
    self.clients = active

def timeoutClients(self, unused=None):
    """
    Check to see which clients have timed out and which ones haven't.
    Stop processing anything that's timed out.

    Reschedules itself to run again in one second, then starts another
    fillCollectionSlots pass.

    @param unused: unused (unused)
    @type unused: string
    """
    reactor.callLater(1, self.timeoutClients)
    self._timeoutClients()
    d = drive(self.fillCollectionSlots)
    d.addCallback(self.checkStop)
    d.addErrback(self.fillError)

def reactorLoop(self):
    """
    Twisted main loop

    Drives the reactor by hand; an exception raised by an iteration is
    logged and the loop resumes for as long as the reactor is running.
    """
    reactor.startRunning()
    while reactor.running:
        try:
            while reactor.running:
                reactor.runUntilCurrent()
                timeout = reactor.timeout()
                reactor.doIteration(timeout)
        except Exception:
            if reactor.running:
                self.log.exception("Unexpected error in main loop.")

def getDeviceList(self):
    """
    Get the list of devices for which we are collecting:
     * if -d devicename was used, use the devicename
     * otherwise ask the ModelerService for the devices under the
       configured organizer path for our monitor

    @return: deferred that fires with the list of devices
    @rtype: Twisted deferred object
    """
    if self.options.device:
        self.log.info("Collecting for device %s", self.options.device)
        return succeed([self.options.device])

    self.log.info("Collecting for path %s", self.options.path)

    d = self.config().callRemote('getDeviceListByOrganizer',
                                 self.options.path,
                                 self.options.monitor,
                                 self.options.__dict__)

    def handle(results):
        # A HubDown failure only aborts this collection; any other failure
        # is reported as an invalid organizer and terminates the process.
        if hasattr(results, "type") and results.type is HubDown:
            self.log.warning("Collection aborted: %s",
                             results.getErrorMessage())
            return
        self.log.critical("%s is not a valid organizer.",
                          self.options.path)
        reactor.running = False
        sys.exit(1)
    d.addErrback(handle)
    return d

def mainLoop(self, driver):
    """
    Main collection loop, written as a generator for drive()

    @param driver: driver object
    @type driver: driver object
    @return: Twisted deferred object
    @rtype: Twisted deferred object
    """
    if self.options.cycle:
        # Schedule the next cycle before doing any work for this one
        self.isMainScheduled = True
        driveLater(self.cycleTime(), self.mainLoop)

    if self.clients:
        # Clients from the previous cycle are still active; skip this one
        self.log.error("Modeling cycle taking too long")
        return

    # ZEN-26637 - did we collect during collector loop?
    self.didCollect = False
    self.mainLoopGotDeviceList = False
    self.start = time.time()
    self.collectorLoopIteration = self.collectorLoopIteration + 1

    self.log.info("Starting collector loop #{:03d}...".format(
        self.collectorLoopIteration))
    yield self.getDeviceList()
    # driver.next() hands back the result of the deferred yielded above
    deviceList = driver.next()
    self.log.debug("getDeviceList returned %s devices", len(deviceList))
    self.log.debug("getDeviceList returned %s devices", deviceList)
    self.devicegen = iter(deviceList)
    self.iterationDeviceCount = len(deviceList)
    self.processedDevicesCount = 0
    self.log.info(
        "Got %d devices to be scanned during collector loop #%03d",
        self.iterationDeviceCount, self.collectorLoopIteration)
    d = drive(self.fillCollectionSlots)
    d.addErrback(self.fillError)
    self.mainLoopGotDeviceList = True
    yield d
    driver.next()
    self.log.debug("Collection slots filled")

def main(self, unused=None):
    """
    Wrapper around the mainLoop: clears the finished list, drives
    mainLoop and then starts the client timeout check.

    @param unused: unused (unused)
    @type unused: string
    @return: Twisted deferred object
    @rtype: Twisted deferred object
    """
    self.finished = []
    d = drive(self.mainLoop)
    d.addCallback(self.timeoutClients)
    return d

def remote_deleteDevice(self, device):
    """
    Stub function; only logs the request.

    @param device: device name (unused)
    @type device: string
    @todo: implement
    """
    # we fetch the device list before every scan
    self.log.debug("Asynch deleteDevice %s" % device)

def remote_deleteDevices(self, devices):
    """
    Stub function; only logs how many device ids were received.

    @param devices: device ids (unused)
    @type devices: set
    @todo: implement
    """
    # we fetch the device list before every scan
    self.log.debug("Asynch deleteDevices {0}".format(len(devices)))
class PBDaemon(ZenDaemon, pb.Referenceable):
    """
    Daemon base class that connects to ZenHub over Twisted Perspective
    Broker, retrieves remote services, and queues and flushes events.
    """

    name = 'pbdaemon'
    initialServices = ['EventService']
    # Template only: __init__ copies it per instance before adding details.
    heartbeatEvent = {'eventClass': Heartbeat}
    heartbeatTimeout = 60 * 3
    _customexitcode = 0
    _pushEventsDeferred = None

    def __init__(self, noopts=0, keeproot=False, name=None):
        """
        Initialize daemon state and register the shutdown trigger.

        @param noopts: passed through to ZenDaemon.__init__
        @param keeproot: passed through to ZenDaemon.__init__
        @param name: optional collector name overriding the class-level name
        """
        # if we were provided our collector name via the constructor instead
        # of via code, be sure to store it correctly.
        if name is not None:
            self.name = name
            self.mname = name

        try:
            ZenDaemon.__init__(self, noopts, keeproot)
        except IOError:
            import traceback
            self.log.critical(traceback.format_exc(0))
            sys.exit(1)

        self.rrdStats = DaemonStats()
        self.lastStats = 0
        self.perspective = None
        self.services = {}
        self.eventQueueManager = EventQueueManager(self.options, self.log)
        self.startEvent = startEvent.copy()
        self.stopEvent = stopEvent.copy()
        # Copy the heartbeat template so the update below does not mutate
        # the dict shared by every instance (and subclass) of this class.
        self.heartbeatEvent = dict(self.heartbeatEvent)
        details = dict(component=self.name, device=self.options.monitor)
        for evt in self.startEvent, self.stopEvent, self.heartbeatEvent:
            evt.update(details)
        self.initialConnect = defer.Deferred()
        self.stopped = False
        self.counters = collections.Counter()
        self.loadCounters()
        self._pingedZenhub = None
        self._connectionTimeout = None

        # Add a shutdown trigger to send a stop event and flush the event
        # queue
        reactor.addSystemEventTrigger('before', 'shutdown',
                                      self._stopPbDaemon)

    def connecting(self):
        """
        Called when about to connect to zenhub
        """
        self.log.info("Attempting to connect to zenhub")

    def gotPerspective(self, perspective):
        """
        This gets called every time we reconnect.

        @param perspective: Twisted perspective object
        @type perspective: Twisted perspective object
        """
        self.log.info("Connected to ZenHub")
        self.perspective = perspective

        # Cancel the connection timeout timer as it's no longer needed.
        if self._connectionTimeout:
            try:
                self._connectionTimeout.cancel()
            except AlreadyCalled:
                pass
            self._connectionTimeout = None

        d2 = self.getInitialServices()
        if self.initialConnect:
            # Only the first connection fires initialConnect; it is cleared
            # so that later reconnects do not chain to it again.
            self.log.debug('Chaining getInitialServices with d2')
            self.initialConnect, d = None, self.initialConnect
            d2.chainDeferred(d)

    def connect(self):
        """
        Start the login to ZenHub and arm the initial connection timeout.

        @return: the initialConnect deferred
        """
        factory = ReconnectingPBClientFactory(connectTimeout=60)
        self.log.info("Connecting to %s:%d" % (self.options.hubhost,
                                               self.options.hubport))
        factory.connectTCP(self.options.hubhost, self.options.hubport)
        username = self.options.hubusername
        password = self.options.hubpassword
        self.log.debug("Logging in as %s" % username)
        c = credentials.UsernamePassword(username, password)
        factory.gotPerspective = self.gotPerspective
        factory.connecting = self.connecting
        factory.startLogin(c)

        def timeout(d):
            if not d.called:
                self.connectTimeout()
        self._connectionTimeout = reactor.callLater(
            self.options.hubtimeout, timeout, self.initialConnect)
        return self.initialConnect

    def connectTimeout(self):
        """
        Called when the initial connection deferred has not fired in time.
        """
        self.log.error('Timeout connecting to zenhub: is it running?')

    def eventService(self):
        """
        Return the EventService (or a FakeRemote stand-in).
        """
        return self.getServiceNow('EventService')

    def getServiceNow(self, svcName):
        """
        Return the already-loaded service or a FakeRemote placeholder.

        @param svcName: name of the service to look up in self.services
        """
        if svcName not in self.services:
            self.log.warning(
                'No service named %r: ZenHub may be disconnected' % svcName)
        return self.services.get(svcName, None) or FakeRemote()

    def getService(self, serviceName, serviceListeningInterface=None):
        """
        Attempt to get a service from zenhub.  Returns a deferred.
        When service is retrieved it is stashed in self.services with
        serviceName as the key.  When getService is called it will first
        check self.services and if serviceName is already there it will
        return the entry from self.services wrapped in a defer.succeed

        @param serviceName: name of the remote service
        @param serviceListeningInterface: object passed to zenhub as the
            listener; defaults to this daemon
        """
        if serviceName in self.services:
            return defer.succeed(self.services[serviceName])

        def removeService(ignored):
            self.log.debug('Removing service %s' % serviceName)
            if serviceName in self.services:
                del self.services[serviceName]

        def callback(result, serviceName):
            self.log.debug('Loaded service %s from zenhub' % serviceName)
            self.services[serviceName] = result
            result.notifyOnDisconnect(removeService)
            return result

        def errback(error, serviceName):
            self.log.debug('errback after getting service %s' % serviceName)
            self.log.error('Could not retrieve service %s' % serviceName)
            if serviceName in self.services:
                del self.services[serviceName]
            return error

        d = self.perspective.callRemote('getService',
                                        serviceName,
                                        self.options.monitor,
                                        serviceListeningInterface or self)
        d.addCallback(callback, serviceName)
        d.addErrback(errback, serviceName)
        return d

    def getInitialServices(self):
        """
        After connecting to zenhub, gather our initial list of services.

        @return: deferred list covering every name in initialServices
        """
        def errback(error):
            if isinstance(error, Failure):
                self.log.critical(
                    "Invalid monitor: %s" % self.options.monitor)
                reactor.stop()
                return defer.fail(RemoteBadMonitor(
                    "Invalid monitor: %s" % self.options.monitor, ''))
            return error

        self.log.debug('Setting up initial services: %s' %
                       ', '.join(self.initialServices))
        d = defer.DeferredList(
            [self.getService(name) for name in self.initialServices],
            fireOnOneErrback=True, consumeErrors=True)
        d.addErrback(errback)
        return d

    def connected(self):
        """
        Hook called from run() once the initial connection succeeded.
        """
        pass

    def run(self):
        """
        Connect to ZenHub and run the reactor until it stops.

        Exits with the custom exit code afterwards when one was set.
        """
        self.rrdStats.config(self.options.monitor, self.name, [])
        self.log.debug('Starting PBDaemon initialization')
        d = self.connect()

        def callback(result):
            self.sendEvent(self.startEvent)
            self.pushEventsLoop()
            self.log.debug('Calling connected.')
            self.connected()
            return result
        d.addCallback(callback)
        d.addErrback(twisted.python.log.err)
        reactor.run()
        if self._customexitcode:
            sys.exit(self._customexitcode)

    def setExitCode(self, exitcode):
        """
        Remember an exit code for run() to use after the reactor stops.
        """
        self._customexitcode = exitcode

    def stop(self, ignored=''):
        """
        Stop the reactor if it is running.

        @param ignored: unused; lets this method serve as a callback
        """
        if reactor.running:
            try:
                reactor.stop()
            except ReactorNotRunning:
                self.log.debug("Tried to stop reactor that was stopped")
        else:
            self.log.debug("stop() called when not running")

    def _stopPbDaemon(self):
        """
        Shutdown trigger: send the stop event, flush events, save counters.

        @return: a deferred when an EventService is available, else None
        """
        if self.stopped:
            return
        self.stopped = True
        if 'EventService' in self.services:
            # send stop event if we don't have an implied --cycle,
            # or if --cycle has been specified
            if getattr(self.options, 'cycle', True):
                self.sendEvent(self.stopEvent)
                self.log.debug("Sent a 'stop' event")
            if self._pushEventsDeferred:
                self.log.debug("Currently sending events. Queueing next call")
                d = self._pushEventsDeferred
                # Schedule another call to flush any additional queued events
                d.addBoth(lambda unused: self.pushEvents())
            else:
                d = self.pushEvents()
            d.addBoth(lambda unused: self.saveCounters())
            return d

        self.log.debug("No event sent as no EventService available.")
        self.saveCounters()

    def sendEvents(self, events):
        """
        Queue every event in the given iterable via sendEvent.
        """
        # An explicit loop: map() used for side effects builds a throwaway
        # list and would do nothing at all where map() is lazy.
        for event in events:
            self.sendEvent(event)

    def sendEvent(self, event, **kw):
        """
        Generate the outgoing event and add it to the event queue.

        @param event: event dictionary
        @param kw: extra fields merged into the generated event
        """
        # NOTE(review): generateEvent returns None when the reactor is not
        # running, and that None is queued as-is -- confirm that
        # EventQueueManager.addEvent tolerates it.
        generatedEvent = self.generateEvent(event, **kw)
        self.eventQueueManager.addEvent(generatedEvent)
        self.counters['eventCount'] += 1

    def generateEvent(self, event, **kw):
        """
        Return a copy of the event stamped with agent, monitor and manager.

        @param event: event dictionary (not modified)
        @param kw: extra fields applied last, so they override the stamps
        @return: the new event dict, or None if the reactor is not running
        """
        if not reactor.running:
            return
        event = event.copy()
        event['agent'] = self.name
        event['monitor'] = self.options.monitor
        event['manager'] = self.fqdn
        event.update(kw)
        return event

    @defer.inlineCallbacks
    def pushEventsLoop(self):
        """Periodically, wake up and flush events to ZenHub.
        """
        reactor.callLater(self.options.eventflushseconds,
                          self.pushEventsLoop)
        yield self.pushEvents()

        # Record the number of events in the queue every 5 minutes.
        now = time.time()
        if self.rrdStats.name and now >= (self.lastStats + 300):
            self.lastStats = now
            events = self.rrdStats.gauge(
                'eventQueueLength', 300,
                self.eventQueueManager.event_queue_length)
            for event in events:
                self.eventQueueManager.addPerformanceEvent(event)

    @defer.inlineCallbacks
    def pushEvents(self):
        """Flush events to ZenHub.
        """
        # are we already shutting down?
        if not reactor.running:
            self.log.debug("Skipping event sending - reactor not running.")
            return
        if self._pushEventsDeferred:
            self.log.debug("Skipping event sending - previous call active.")
            return
        try:
            # Marks a flush as in progress; fired in the finally clause.
            self._pushEventsDeferred = defer.Deferred()

            # are still connected to ZenHub?
            evtSvc = self.services.get('EventService', None)
            if not evtSvc:
                self.log.error("No event service: %r", evtSvc)
                return

            discarded_events = self.eventQueueManager.discarded_events
            if discarded_events:
                self.log.error(
                    'Discarded oldest %d events because maxqueuelen was '
                    'exceeded: %d/%d',
                    discarded_events,
                    discarded_events + self.options.maxqueuelen,
                    self.options.maxqueuelen)
                self.counters['discardedEvents'] += discarded_events
                self.eventQueueManager.discarded_events = 0

            send_events_fn = partial(evtSvc.callRemote, 'sendEvents')
            try:
                yield self.eventQueueManager.sendEvents(send_events_fn)
            except ConnectionLost as ex:
                self.log.error('Error sending event: %s', ex)
            except ConnectionDone:
                pass
        except Exception as ex:
            self.log.exception(ex)
        finally:
            d, self._pushEventsDeferred = self._pushEventsDeferred, None
            d.callback('sent')

    def heartbeat(self):
        """
        If cycling, send a heartbeat, else, shutdown.

        Also emits the daemon counters as events and persists them.
        """
        if not self.options.cycle:
            self.stop()
            return
        heartbeatEvent = self.generateEvent(self.heartbeatEvent,
                                            timeout=self.heartbeatTimeout)
        self.eventQueueManager.addHeartbeatEvent(heartbeatEvent)
        # heartbeat is normally 3x cycle time
        self.niceDoggie(self.heartbeatTimeout / 3)

        events = []
        # save daemon counter stats
        for name, value in self.counters.items():
            self.log.info("Counter %s, value %d", name, value)
            events += self.rrdStats.counter(name, 300, value)
        self.sendEvents(events)

        # persist counters values
        self.saveCounters()

    def saveCounters(self):
        """
        Pickle self.counters to var/<name>_counters.pickle.
        """
        atomicWrite(
            zenPath('var/%s_counters.pickle' % self.name),
            pickle.dumps(self.counters),
            raiseException=False,
        )

    def loadCounters(self):
        """
        Restore self.counters from the pickle written by saveCounters.

        Best effort: on any failure the current counters are kept.
        """
        try:
            # The context manager closes the file even when unpickling
            # fails.
            with open(zenPath('var/%s_counters.pickle' % self.name)) as fp:
                self.counters = pickle.load(fp)
        except Exception:
            self.log.debug("Unable to load saved counters for %s",
                           self.name)

    def remote_getName(self):
        """
        Return this daemon's name.
        """
        return self.name

    def remote_shutdown(self, unused):
        """
        Stop the reactor and invoke sigTerm.

        @param unused: unused
        """
        self.stop()
        self.sigTerm()

    def remote_setPropertyItems(self, items):
        """
        No-op in this base class.
        """
        pass

    @translateError
    def remote_updateThresholdClasses(self, classes):
        """
        Import each named threshold class, logging those that fail.

        @param classes: iterable of class names handed to importClass
        """
        from Products.ZenUtils.Utils import importClass
        self.log.debug("Loading classes %s", classes)
        for c in classes:
            try:
                importClass(c)
            except ImportError:
                self.log.error("Unable to import class %s", c)

    def buildOptions(self):
        """
        Register the ZenHub connection and event queue options.
        """
        self.parser.add_option('--hubhost',
                               dest='hubhost',
                               default=DEFAULT_HUB_HOST,
                               help='Host of zenhub daemon.'
                               ' Default is %s.' % DEFAULT_HUB_HOST)
        self.parser.add_option('--hubport',
                               dest='hubport',
                               type='int',
                               default=DEFAULT_HUB_PORT,
                               help='Port zenhub listens on.'
                               ' Default is %s.' % DEFAULT_HUB_PORT)
        self.parser.add_option('--hubusername',
                               dest='hubusername',
                               default=DEFAULT_HUB_USERNAME,
                               help='Username for zenhub login.'
                               ' Default is %s.' % DEFAULT_HUB_USERNAME)
        self.parser.add_option('--hubpassword',
                               dest='hubpassword',
                               default=DEFAULT_HUB_PASSWORD,
                               help='Password for zenhub login.'
                               ' Default is %s.' % DEFAULT_HUB_PASSWORD)
        self.parser.add_option('--monitor',
                               dest='monitor',
                               default=DEFAULT_HUB_MONITOR,
                               help='Name of monitor instance to use for'
                               ' configuration.  Default is %s.'
                               % DEFAULT_HUB_MONITOR)
        self.parser.add_option('--initialHubTimeout',
                               dest='hubtimeout',
                               type='int',
                               default=30,
                               help='Initial time to wait for a ZenHub '
                               'connection')
        self.parser.add_option('--allowduplicateclears',
                               dest='allowduplicateclears',
                               default=False,
                               action='store_true',
                               help='Send clear events even when the most '
                               'recent event was also a clear event.')
        self.parser.add_option('--duplicateclearinterval',
                               dest='duplicateclearinterval',
                               default=0,
                               type='int',
                               help=('Send a clear event every '
                                     '[DUPLICATECLEARINTERVAL] events.'))
        self.parser.add_option('--eventflushseconds',
                               dest='eventflushseconds',
                               default=5.,
                               type='float',
                               help='Seconds between attempts to flush '
                               'events to ZenHub.')
        self.parser.add_option('--eventflushchunksize',
                               dest='eventflushchunksize',
                               default=50,
                               type='int',
                               help='Number of events to send to ZenHub'
                               ' at one time')
        self.parser.add_option('--maxqueuelen',
                               dest='maxqueuelen',
                               default=5000,
                               type='int',
                               help='Maximum number of events to queue')
        self.parser.add_option('--zenhubpinginterval',
                               dest='zhPingInterval',
                               default=30,
                               type='int',
                               help='How often to ping zenhub')
        self.parser.add_option('--disable-event-deduplication',
                               dest='deduplicate_events',
                               default=True,
                               action='store_false',
                               help='Disable event de-duplication')

        ZenDaemon.buildOptions(self)
def setUp(self):
    """
    Store a newly constructed DaemonStats on self.daemon_stats.
    """
    stats = DaemonStats()
    self.daemon_stats = stats
def __init__(self):
    """
    Run the PBDaemon initializer with keeproot enabled, then attach new
    Stats and DaemonStats objects to the instance.
    """
    # The base initializer must run first; rrdStats is assigned after it.
    PBDaemon.__init__(self, keeproot=True)

    event_stats = Stats()
    self.stats = event_stats

    daemon_stats = DaemonStats()
    self.rrdStats = daemon_stats
class ZenModeler(PBDaemon):
    """
    Daemon class to attach to zenhub and pass along
    device configuration information.
    """

    name = 'zenmodeler'
    # In addition to the base services, the modeler needs ModelerService
    initialServices = PBDaemon.initialServices + ['ModelerService']

    generateEvents = True
    # Multiplied by 60 when rescheduling configure()
    configCycleInterval = 360

    classCollectorPlugins = ()

    def __init__(self, single=False):
        """
        Initializer

        @param single: collect from a single device?
        @type single: boolean
        """
        PBDaemon.__init__(self)
        # FIXME: cleanup --force option #2660
        self.options.force = True
        self.start = None
        self.startat = None
        self.rrdStats = DaemonStats()
        self.single = single
        if self.options.device:
            # A device given on the command line implies single-device mode
            self.single = True
        self.modelerCycleInterval = self.options.cycletime
        # get the minutes and convert to fraction of a day
        self.collage = float(self.options.collage) / 1440.0
        self.pendingNewClients = False
        self.clients = []
        self.finished = []
        self.devicegen = None
        # NOTE(review): this replaces the counters set up by
        # PBDaemon.__init__ (which also calls loadCounters) -- confirm that
        # is intended.
        self.counters = collections.Counter()
        self.configFilter = None
        self.configLoaded = False

        # Make sendEvent() available to plugins
        zope.component.provideUtility(self, IEventService)

        # Delay start for between 10 and 60 seconds when run as a daemon.
        self.started = False
        self.startDelay = 0
        self.immediate = 1
        if self.options.daemon or self.options.cycle:
            if self.options.now:
                self.log.debug('option "now" specified, starting immediately.')
            else:
                # self.startDelay = randint(10, 60) * 60
                self.startDelay = randint(10, 60) * 1
                self.immediate = 0
                self.log.info(
                    'option "now" not specified, waiting %s seconds to start.'
                    % self.startDelay)
        else:
            self.log.debug("Run in foreground, starting immediately.")

        # ZEN-26637
        self.collectorLoopIteration = 0
        self.mainLoopGotDeviceList = False

        self._modeledDevicesMetric = Metrology.meter(
            "zenmodeler.modeledDevices")
        self._failuresMetric = Metrology.counter("zenmodeler.failures")

    def reportError(self, error):
        """
        Log errors that have occurred

        @param error: error message
        @type error: string
        """
        self.log.error("Error occured: %s", error)

    def connected(self):
        """
        Called after connected to the zenhub service

        Schedules the config-load check and starts configure(), followed
        by heartbeat on success or reportError on failure.
        """
        reactor.callLater(_CONFIG_PULLING_TIMEOUT, self._checkConfigLoad)
        d = self.configure()
        d.addCallback(self.heartbeat)
        d.addErrback(self.reportError)

    def _checkConfigLoad(self):
        """
        Looping call to check whether zenmodeler got configuration
        from ZenHub.

        Reschedules itself until self.configLoaded is true.
        """
        if not self.configLoaded:
            self.log.info("Modeling has not started pending configuration "
                          "pull from ZenHub. Is ZenHub overloaded?")
            reactor.callLater(_CONFIG_PULLING_TIMEOUT, self._checkConfigLoad)

    def configure(self):
        """
        Get our configuration from zenhub

        @return: the result of drive() over the inner generator
        """
        # add in the code to fetch cycle time, etc.
        self.log.info("Getting configuration from ZenHub...")

        def inner(driver):
            """
            Generator function to gather our configuration

            @param driver: driver object
            @type driver: driver object
            """
            self.log.debug('fetching monitor properties')
            yield self.config().callRemote('propertyItems')
            # driver.next() hands back the result of the preceding yield
            items = dict(driver.next())
            # If the cycletime option is not specified or zero, then use the
            # modelerCycleInterval value in the database.
            if not self.options.cycletime:
                self.modelerCycleInterval = items.get('modelerCycleInterval',
                                                      _DEFAULT_CYCLE_INTERVAL)
            self.configCycleInterval = items.get('configCycleInterval',
                                                 self.configCycleInterval)
            # Re-run configure after configCycleInterval * 60 seconds
            reactor.callLater(self.configCycleInterval * 60, self.configure)

            self.log.debug("Getting threshold classes...")
            yield self.config().callRemote('getThresholdClasses')
            self.remote_updateThresholdClasses(driver.next())

            self.log.debug("Getting collector thresholds...")
            yield self.config().callRemote('getCollectorThresholds')
            thresholds = driver.next()
            threshold_notifier = ThresholdNotifier(self.sendEvent, thresholds)

            self.rrdStats.config(self.name,
                                 self.options.monitor,
                                 self.metricWriter(),
                                 threshold_notifier,
                                 self.derivativeTracker())

            self.log.debug("Getting collector plugins for each DeviceClass")
            yield self.config().callRemote('getClassCollectorPlugins')
            self.classCollectorPlugins = driver.next()

            self.configLoaded = True

        return drive(inner)

    def config(self):
        """
        Get the ModelerService (or a FakeRemote when it is not loaded)
        """
        return self.services.get('ModelerService', FakeRemote())

    def selectPlugins(self, device, transport):
        """
        Build a list of active plugins for a device, based on:

        * the --collect command-line option which is a regex
        * the --ignore command-line option which is a regex
        * transport which is a string describing the type of plugin

        @param device: device to collect against
        @type device: string
        @param transport: python, ssh, snmp, telnet, cmd
        @type transport: string
        @return: results of the plugin
        @type: string
        @todo: determine if an event for the collector AND the device
               should be sent
        """
        # NOTE(review): only the plugin-loading portion of this method is
        # visible here; the filtering and return value described in the
        # docstring are not shown -- confirm against the full file.
        plugins = []
        valid_loaders = []
        for loader in device.plugins:
            try:
                plugin = loader.create()
                self.log.debug("Loaded plugin %s" % plugin.name())
                plugins.append(plugin)
                valid_loaders.append(loader)

            except (SystemExit, KeyboardInterrupt), ex:
                self.log.info("Interrupted by external signal (%s)" % str(ex))
                raise

            except Plugins.PluginImportError, import_error:
                import socket
                component, _ = os.path.splitext(os.path.basename(sys.argv[0]))
                collector_host = socket.gethostname()
                # NB: an import error affects all devices,
                #     so report the issue against the collector
                # TODO: determine if an event for the collector AND the
                #       device should be sent
                evt = {
                    "eventClass": "/Status/Update",
                    "component": component,
                    "agent": collector_host,
                    "device": collector_host,
                    "severity": Error
                }

                info = "Problem loading plugin %s" % import_error.plugin
                self.log.error(info)
                evt['summary'] = info

                info = import_error.traceback
                self.log.error(info)
                evt['message'] = info

                info = ("Due to import errors, removing the %s plugin"
                        " from this collection cycle.") % import_error.plugin
                self.log.error(info)
                evt['message'] += "%s\n" % info
                self.sendEvent(evt)
class EventServer(PBDaemon):
    """
    Base class for a daemon whose primary job is to post events
    """

    name = 'EventServer'

    def __init__(self):
        """
        Initialize the daemon (keeping root) and its statistics holders.
        """
        PBDaemon.__init__(self, keeproot=True)
        self.stats = Stats()
        self.rrdStats = DaemonStats()

    def connected(self):
        """
        Send the application start event, then fetch configuration.
        """
        self.sendEvent(dict(device=self.options.monitor,
                            eventClass=App_Start,
                            summary="%s started" % self.name,
                            severity=0,
                            component=self.name))
        self.log.info("started")
        self.configure()

    def model(self):
        """
        Return the EventService (or a FakeRemote when it is not loaded).
        """
        return self.services.get('EventService', FakeRemote())

    def configure(self):
        """
        Fetch configuration from the hub, then start the heartbeat and
        report cycles.

        @return: the deferred returned by drive()
        """
        def inner(driver):
            self.log.info("fetching default RRDCreateCommand")
            yield self.model().callRemote('getDefaultRRDCreateCommand')
            # driver.next() hands back the result of the preceding yield
            createCommand = driver.next()

            self.log.info("getting threshold classes")
            yield self.model().callRemote('getThresholdClasses')
            self.remote_updateThresholdClasses(driver.next())

            self.log.info("getting collector thresholds")
            yield self.model().callRemote('getCollectorThresholds')
            self.rrdStats.config(self.options.monitor, self.name,
                                 driver.next(), createCommand)
            self.heartbeat()
            self.reportCycle()

        d = drive(inner)

        def error(result):
            self.log.error("Unexpected error in configure: %s" % result)
        d.addErrback(error)
        return d

    def sendEvent(self, event, **kw):
        """
        Record how long the event has existed, then queue it.

        @param event: event dictionary; 'firstTime' is used when present
        @param kw: extra fields forwarded to PBDaemon.sendEvent
        """
        # FIXME: get real event processing stats
        if 'firstTime' in event:
            # Clamp at zero so clock skew cannot produce a negative value.
            # (min() here would make every recorded value <= 0.)
            self.stats.add(max(time.time() - event['firstTime'], 0))
        PBDaemon.sendEvent(self, event, **kw)

    def useUdpFileDescriptor(self, fd):
        """
        Adopt an already-open UDP socket file descriptor as our transport.

        @param fd: file descriptor of the UDP socket; closed after it has
            been duplicated by socket.fromfd
        """
        import os
        import socket
        from twisted.internet import udp
        s = socket.fromfd(fd, socket.AF_INET, socket.SOCK_DGRAM)
        os.close(fd)
        port = s.getsockname()[1]
        transport = udp.Port(port, self)
        s.setblocking(0)
        transport.socket = s
        transport.fileno = s.fileno
        transport.connected = 1
        transport._realPortNumber = port
        self.transport = transport
        # hack around startListening not being called
        self.numPorts = 1
        transport.startReading()

    def useTcpFileDescriptor(self, fd, factory):
        """
        Adopt an already-open TCP listening file descriptor.

        Tries ports 19800-19998 in order; on the first one that can be
        listened on, fd is dup2'ed over that port's socket.

        @param fd: file descriptor of the listening socket
        @param factory: factory handed to reactor.listenTCP
        @return: the listening port object
        @raise socket.error: if no port in the range could be used
        """
        import os
        import socket
        for i in range(19800, 19999):
            try:
                p = reactor.listenTCP(i, factory)
                os.dup2(fd, p.socket.fileno())
                p.socket.listen(p.backlog)
                p.socket.setblocking(False)
                p.socket.setsockopt(socket.SOL_SOCKET,
                                    socket.SO_REUSEADDR, 1)
                os.close(fd)
                return p
            except socket.error:
                pass
        raise socket.error("Unable to find an open socket to listen on")

    def reportCycle(self):
        """
        Log statistics every statcycle seconds when statcycle is non-zero.
        """
        if self.options.statcycle:
            self.report()
            reactor.callLater(self.options.statcycle, self.reportCycle)

    def heartbeat(self):
        """Since we don't do anything on a regular basis, just
        push heartbeats regularly"""
        seconds = self.heartbeatTimeout / 3
        reactor.callLater(seconds, self.heartbeat)
        PBDaemon.heartbeat(self)
        totalTime, totalEvents, maxTime = self.stats.report()
        for ev in (self.rrdStats.counter('events',
                                         seconds,
                                         totalEvents) +
                   self.rrdStats.counter('totalTime',
                                         seconds,
                                         int(totalTime * 1000))):
            self.sendEvent(ev)

    def report(self):
        """
        Log some simple diagnostics about the events processed so far.
        """
        totalTime, totalEvents, maxTime = self.stats.report()
        self.log.info("%d events processed in %.2f seconds",
                      totalEvents,
                      totalTime)
        if totalEvents > 0:
            self.log.info("%.5f average seconds per event",
                          (totalTime / totalEvents))
            self.log.info("Maximum processing time for one event was %.5f",
                          maxTime)

    def buildOptions(self):
        """
        Add the --statcycle option on top of the PBDaemon options.
        """
        PBDaemon.buildOptions(self)
        self.parser.add_option(
            '--statcycle',
            dest='statcycle',
            type='int',
            help='Number of seconds between the writing of statistics',
            default=0)