def testSpeed(self):
    # Pretend to be gmetad and serve a large piece of content
    original_file = Fixtures.file('ganglia.txt')
    server = subprocess.Popen("nc -l 8651 < %s" % original_file, shell=True)
    # Wait for 1 second
    time.sleep(1)

    pfile = tempfile.NamedTemporaryFile()
    g = Ganglia(logging.getLogger(__file__))
    # Running the profiler
    # profile.runctx("g.check({'ganglia_host': 'localhost', 'ganglia_port': 8651})", {}, {"g": g}, pfile.name)
    # p = pstats.Stats(pfile.name)
    # p.sort_stats('time').print_stats()
    parsed = StringIO(g.check({'ganglia_host': 'localhost', 'ganglia_port': 8651}))
    original = Fixtures.file('ganglia.txt')
    x1 = tree.parse(parsed)
    x2 = tree.parse(original)
    # Cursory test
    self.assertEquals([c.tag for c in x1.getroot()],
                      [c.tag for c in x2.getroot()])
def testSpeed(self):
    # Pretend to be gmetad and serve a large piece of content
    server = subprocess.Popen("nc -l 8651 < %s" % TEST_FN, shell=True)
    # Wait for 1 second
    time.sleep(1)

    pfile = tempfile.NamedTemporaryFile()
    g = Ganglia(logging.getLogger(__file__))
    # Running the profiler
    # profile.runctx("g.check({'ganglia_host': 'localhost', 'ganglia_port': 8651})", {}, {"g": g}, pfile.name)
    # p = pstats.Stats(pfile.name)
    # p.sort_stats('time').print_stats()
    self.assertEquals(md5(g.check({'ganglia_host': 'localhost', 'ganglia_port': 8651})).hexdigest(),
                      md5(open(TEST_FN).read()).hexdigest())
def testSpeed(self):
    # Pretend to be gmetad and serve a large piece of content
    original_file = Fixtures.file('ganglia.txt')
    subprocess.Popen("nc -l 8651 < %s" % original_file, shell=True)
    # Wait for 1 second
    time.sleep(1)

    g = Ganglia(logging.getLogger(__file__))
    parsed = StringIO(g.check({'ganglia_host': 'localhost', 'ganglia_port': 8651}))
    original = Fixtures.file('ganglia.txt')
    x1 = tree.parse(parsed)
    x2 = tree.parse(original)
    # Cursory test
    self.assertEquals([c.tag for c in x1.getroot()],
                      [c.tag for c in x2.getroot()])
def testSpeed(self):
    # Pretend to be gmetad and serve a large piece of content
    server = subprocess.Popen("nc -l 8651 < %s" % TEST_FN, shell=True)
    # Wait for 1 second
    time.sleep(1)

    pfile = tempfile.NamedTemporaryFile()
    g = Ganglia(logging.getLogger(__file__))
    # Running the profiler
    # profile.runctx("g.check({'ganglia_host': 'localhost', 'ganglia_port': 8651})", {}, {"g": g}, pfile.name)
    # p = pstats.Stats(pfile.name)
    # p.sort_stats('time').print_stats()
    parsed = StringIO(g.check({'ganglia_host': 'localhost', 'ganglia_port': 8651}))
    original = open(TEST_FN)
    x1 = tree.parse(parsed)
    x2 = tree.parse(original)
    # Cursory test
    self.assertEquals([c.tag for c in x1.getroot()],
                      [c.tag for c in x2.getroot()])
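# A hardening sketch for the testSpeed variants above, not part of the original
# tests: they shell out to "nc -l 8651" and hope one second of sleep is enough
# for the listener to come up. A small in-process server binds synchronously,
# so no sleep is needed and no child process can leak on a failed assertion.
# Only stdlib names are used; fixture_path and the port mirror the tests above.
import socket
import threading


def serve_fixture_once(fixture_path, port=8651):
    """Serve the fixture file to the first client, then close. The socket is
    already listening when this returns, so the Ganglia check can run at once."""
    payload = open(fixture_path, 'rb').read()
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    sock.bind(('localhost', port))
    sock.listen(1)

    def _serve():
        # Accept one connection, send the whole fixture, and clean up.
        conn, _ = sock.accept()
        try:
            conn.sendall(payload)
        finally:
            conn.close()
            sock.close()

    server = threading.Thread(target=_serve)
    server.daemon = True
    server.start()
    return server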
def __init__(self, agentConfig, emitters, systemStats):
    self.emit_duration = None
    self.agentConfig = agentConfig
    # system stats is generated by config.get_system_stats
    self.agentConfig['system_stats'] = systemStats
    # agent config is used during checks, system_stats can be accessed through the config
    self.os = get_os()
    self.plugins = None
    self.emitters = emitters
    self.metadata_interval = int(agentConfig.get('metadata_interval', 10 * 60))
    self.metadata_start = time.time()
    socket.setdefaulttimeout(15)
    self.run_count = 0
    self.continue_running = True
    self.metadata_cache = None
    self.initialized_checks_d = []
    self.init_failed_checks_d = []

    # Unix System Checks
    self._unix_system_checks = {
        'disk': u.Disk(log),
        'io': u.IO(log),
        'load': u.Load(log),
        'memory': u.Memory(log),
        'processes': u.Processes(log),
        'cpu': u.Cpu(log)
    }

    # Win32 System Checks
    self._win32_system_checks = {
        'disk': w32.Disk(log),
        'io': w32.IO(log),
        'proc': w32.Processes(log),
        'memory': w32.Memory(log),
        'network': w32.Network(log),
        'cpu': w32.Cpu(log)
    }

    # Old-style metric checks
    self._ganglia = Ganglia(log)
    self._dogstream = Dogstreams.init(log, self.agentConfig)
    self._ddforwarder = DdForwarder(log, self.agentConfig)

    # Agent Metrics
    self._agent_metrics = CollectorMetrics(log)

    self._metrics_checks = []

    # Custom metric checks
    for module_spec in [s.strip() for s in self.agentConfig.get('custom_checks', '').split(',')]:
        if len(module_spec) == 0:
            continue
        try:
            self._metrics_checks.append(modules.load(module_spec, 'Check')(log))
            log.info("Registered custom check %s" % module_spec)
            log.warning("Old format custom checks are deprecated. They should be moved to the checks.d interface as old custom checks will be removed in a next version")
        except Exception:
            log.exception('Unable to load custom check module %s' % module_spec)
def __init__(self, agentConfig, emitters):
    self.agentConfig = agentConfig
    self.plugins = None
    self.emitters = emitters
    self.os = None

    self.checksLogger = logging.getLogger('checks')
    socket.setdefaulttimeout(15)

    self._apache = Apache(self.checksLogger)
    self._nginx = Nginx(self.checksLogger)
    self._disk = Disk(self.checksLogger)
    self._io = IO()
    self._load = Load(self.checksLogger)
    self._memory = Memory(self.checksLogger)
    self._network = Network(self.checksLogger)
    self._processes = Processes()
    self._cpu = Cpu()
    self._couchdb = CouchDb(self.checksLogger)
    self._mongodb = MongoDb(self.checksLogger)
    self._mysql = MySql(self.checksLogger)
    self._pgsql = PostgreSql(self.checksLogger)
    self._rabbitmq = RabbitMq()
    self._ganglia = Ganglia(self.checksLogger)
    self._cassandra = Cassandra()
    self._redis = Redis(self.checksLogger)
    self._jvm = Jvm(self.checksLogger)
    self._tomcat = Tomcat(self.checksLogger)
    self._activemq = ActiveMQ(self.checksLogger)
    self._solr = Solr(self.checksLogger)
    self._memcache = Memcache(self.checksLogger)
    self._dogstream = Dogstreams.init(self.checksLogger, self.agentConfig)
    self._ddforwarder = DdForwarder(self.checksLogger, self.agentConfig)

    # All new checks should be metrics checks:
    self._metrics_checks = [
        Cacti(self.checksLogger),
        Redis(self.checksLogger),
        Varnish(self.checksLogger),
        ElasticSearch(self.checksLogger),
    ]
    self._event_checks = [Hudson(), Nagios(socket.gethostname())]
    self._resources_checks = [ResProcesses(self.checksLogger, self.agentConfig)]
    self._ec2 = EC2(self.checksLogger)
class Collector(object):
    """
    The collector is responsible for collecting data from each check and
    passing it along to the emitters, who send it to their final destination.
    """

    def __init__(self, agentConfig, emitters, systemStats, hostname):
        self.emit_duration = None
        self.agentConfig = agentConfig
        self.hostname = hostname
        # system stats is generated by config.get_system_stats
        self.agentConfig['system_stats'] = systemStats
        # agent config is used during checks, system_stats can be accessed through the config
        self.os = get_os()
        self.plugins = None
        self.emitters = emitters
        self.check_timings = agentConfig.get('check_timings')
        self.push_times = {
            'host_metadata': {
                'start': time.time(),
                'interval': int(agentConfig.get('metadata_interval', 4 * 60 * 60))
            },
            'external_host_tags': {
                'start': time.time() - 3 * 60,  # Wait for the checks to init
                'interval': int(agentConfig.get('external_host_tags', 5 * 60))
            },
            'agent_checks': {
                'start': time.time(),
                'interval': int(agentConfig.get('agent_checks_interval', 10 * 60))
            },
            'processes': {
                'start': time.time(),
                'interval': int(agentConfig.get('processes_interval', 60))
            }
        }
        socket.setdefaulttimeout(15)
        self.run_count = 0
        self.continue_running = True
        self.hostname_metadata_cache = None
        self.initialized_checks_d = []
        self.init_failed_checks_d = {}

        # Unix System Checks
        self._unix_system_checks = {
            'io': u.IO(log),
            'load': u.Load(log),
            'memory': u.Memory(log),
            'processes': u.Processes(log),
            'cpu': u.Cpu(log),
            'system': u.System(log)
        }

        # Win32 System Checks
        self._win32_system_checks = {
            'io': w32.IO(log),
            'proc': w32.Processes(log),
            'memory': w32.Memory(log),
            'network': w32.Network(log),
            'cpu': w32.Cpu(log),
            'system': w32.System(log)
        }

        # Old-style metric checks
        self._ganglia = Ganglia(log)
        self._dogstream = Dogstreams.init(log, self.agentConfig)
        self._ddforwarder = DdForwarder(log, self.agentConfig)

        # Agent performance metrics check
        self._agent_metrics = None

        self._metrics_checks = []

        # Custom metric checks
        for module_spec in [s.strip() for s in self.agentConfig.get('custom_checks', '').split(',')]:
            if len(module_spec) == 0:
                continue
            try:
                self._metrics_checks.append(modules.load(module_spec, 'Check')(log))
                log.info("Registered custom check %s" % module_spec)
                log.warning("Old format custom checks are deprecated. They should be moved to the checks.d interface as old custom checks will be removed in a next version")
            except Exception:
                log.exception('Unable to load custom check module %s' % module_spec)

    def stop(self):
        """
        Tell the collector to stop at the next logical point.
        """
        # This is called when the process is being killed, so
        # try to stop the collector as soon as possible.
        # Most importantly, don't try to submit to the emitters
        # because the forwarder is quite possibly already killed
        # in which case we'll get a misleading error in the logs.
        # Best to not even try.
        self.continue_running = False
        for check in self.initialized_checks_d:
            check.stop()

    @staticmethod
    def _stats_for_display(raw_stats):
        return pprint.pformat(raw_stats, indent=4)

    @log_exceptions(log)
    def run(self, checksd=None, start_event=True, configs_reloaded=False):
        """
        Collect data from each check and submit their data.
        """
        log.debug("Found {num_checks} checks".format(num_checks=len(checksd['initialized_checks'])))
        timer = Timer()
        if not Platform.is_windows():
            cpu_clock = time.clock()
        self.run_count += 1
        log.debug("Starting collection run #%s" % self.run_count)

        if checksd:
            self.initialized_checks_d = checksd['initialized_checks']  # is a list of AgentCheck instances
            self.init_failed_checks_d = checksd['init_failed_checks']  # is of type {check_name: {error, traceback}}

        payload = AgentPayload()

        # Find the AgentMetrics check and pop it out
        # This check must run at the end of the loop to collect info on agent performance
        if not self._agent_metrics or configs_reloaded:
            for check in self.initialized_checks_d:
                if check.name == AGENT_METRICS_CHECK_NAME:
                    self._agent_metrics = check
                    self.initialized_checks_d.remove(check)
                    break

        # Initialize payload
        self._build_payload(payload)

        metrics = payload['metrics']
        events = payload['events']
        service_checks = payload['service_checks']

        # Run the system checks. Checks will depend on the OS
        if Platform.is_windows():
            # Win32 system checks
            try:
                metrics.extend(self._win32_system_checks['memory'].check(self.agentConfig))
                metrics.extend(self._win32_system_checks['cpu'].check(self.agentConfig))
                metrics.extend(self._win32_system_checks['network'].check(self.agentConfig))
                metrics.extend(self._win32_system_checks['io'].check(self.agentConfig))
                metrics.extend(self._win32_system_checks['proc'].check(self.agentConfig))
                metrics.extend(self._win32_system_checks['system'].check(self.agentConfig))
            except Exception:
                log.exception('Unable to fetch Windows system metrics.')
        else:
            # Unix system checks
            sys_checks = self._unix_system_checks

            load = sys_checks['load'].check(self.agentConfig)
            payload.update(load)

            system = sys_checks['system'].check(self.agentConfig)
            payload.update(system)

            memory = sys_checks['memory'].check(self.agentConfig)
            if memory:
                memstats = {
                    'memPhysUsed': memory.get('physUsed'),
                    'memPhysPctUsable': memory.get('physPctUsable'),
                    'memPhysFree': memory.get('physFree'),
                    'memPhysTotal': memory.get('physTotal'),
                    'memPhysUsable': memory.get('physUsable'),
                    'memSwapUsed': memory.get('swapUsed'),
                    'memSwapFree': memory.get('swapFree'),
                    'memSwapPctFree': memory.get('swapPctFree'),
                    'memSwapTotal': memory.get('swapTotal'),
                    'memCached': memory.get('physCached'),
                    'memBuffers': memory.get('physBuffers'),
                    'memShared': memory.get('physShared'),
                    'memSlab': memory.get('physSlab'),
                    'memPageTables': memory.get('physPageTables'),
                    'memSwapCached': memory.get('swapCached')
                }
                payload.update(memstats)

            ioStats = sys_checks['io'].check(self.agentConfig)
            if ioStats:
                payload['ioStats'] = ioStats

            processes = sys_checks['processes'].check(self.agentConfig)
            payload.update({'processes': processes})

            cpuStats = sys_checks['cpu'].check(self.agentConfig)
            if cpuStats:
                payload.update(cpuStats)

        # Run old-style checks
        gangliaData = self._ganglia.check(self.agentConfig)
        dogstreamData = self._dogstream.check(self.agentConfig)
        ddforwarderData = self._ddforwarder.check(self.agentConfig)

        if gangliaData is not False and gangliaData is not None:
            payload['ganglia'] = gangliaData

        # dogstream
        if dogstreamData:
            dogstreamEvents = dogstreamData.get('dogstreamEvents', None)
            if dogstreamEvents:
                if 'dogstream' in payload['events']:
                    events['dogstream'].extend(dogstreamEvents)
                else:
                    events['dogstream'] = dogstreamEvents
                del dogstreamData['dogstreamEvents']
            payload.update(dogstreamData)

        # metrics about the forwarder
        if ddforwarderData:
            payload['datadog'] = ddforwarderData

        # process collector of gohai (compliant with payload of legacy "resources checks")
        if not Platform.is_windows() and self._should_send_additional_data('processes'):
            gohai_processes = self._run_gohai_processes()
            if gohai_processes:
                try:
                    gohai_processes_json = json.loads(gohai_processes)
                    processes_payload = {
                        'snaps': [gohai_processes_json.get('processes')],
                        'format_version': 1
                    }
                    if self._is_first_run():
                        processes_payload['format_description'] = PROCESSES_FORMAT_DESCRIPTION

                    payload['resources'] = {
                        'processes': processes_payload,
                        'meta': {
                            'api_key': self.agentConfig['api_key'],
                            'host': payload['internalHostname'],
                        }
                    }
                except Exception:
                    log.exception("Error running gohai processes collection")

        # newer-style checks (not checks.d style)
        for metrics_check in self._metrics_checks:
            res = metrics_check.check(self.agentConfig)
            if res:
                metrics.extend(res)

        # checks.d checks
        check_statuses = []
        for check in self.initialized_checks_d:
            if not self.continue_running:
                return
            log.info("Running check %s" % check.name)
            instance_statuses = []
            metric_count = 0
            event_count = 0
            service_check_count = 0
            current_check_metadata = None  # stays None if the check raises before metadata is collected
            check_start_time = time.time()
            check_stats = None

            try:
                # Run the check.
                instance_statuses = check.run()

                # Collect the metrics and events.
                current_check_metrics = check.get_metrics()
                current_check_events = check.get_events()
                check_stats = check._get_internal_profiling_stats()

                # Collect metadata
                current_check_metadata = check.get_service_metadata()

                # Save metrics & events for the payload.
                metrics.extend(current_check_metrics)
                if current_check_events:
                    if check.name not in events:
                        events[check.name] = current_check_events
                    else:
                        events[check.name] += current_check_events

                # Save the status of the check.
                metric_count = len(current_check_metrics)
                event_count = len(current_check_events)
            except Exception:
                log.exception("Error running check %s" % check.name)

            check_status = CheckStatus(
                check.name, instance_statuses, metric_count,
                event_count, service_check_count,
                service_metadata=current_check_metadata,
                library_versions=check.get_library_info(),
                source_type_name=check.SOURCE_TYPE_NAME or check.name,
                check_stats=check_stats
            )

            # Service check for Agent checks failures
            service_check_tags = ["check:%s" % check.name]
            if check_status.status == STATUS_OK:
                status = AgentCheck.OK
            elif check_status.status == STATUS_ERROR:
                status = AgentCheck.CRITICAL
            check.service_check('datadog.agent.check_status', status, tags=service_check_tags)

            # Collect the service checks and save them in the payload
            current_check_service_checks = check.get_service_checks()
            if current_check_service_checks:
                service_checks.extend(current_check_service_checks)
            service_check_count = len(current_check_service_checks)

            # Update the check status with the correct service_check_count
            check_status.service_check_count = service_check_count
            check_statuses.append(check_status)

            check_run_time = time.time() - check_start_time
            log.debug("Check %s ran in %.2f s" % (check.name, check_run_time))

            # Instrument check run timings if enabled.
            if self.check_timings:
                metric = 'datadog.agent.check_run_time'
                meta = {'tags': ["check:%s" % check.name]}
                metrics.append((metric, time.time(), check_run_time, meta))

        for check_name, info in self.init_failed_checks_d.iteritems():
            if not self.continue_running:
                return
            check_status = CheckStatus(check_name, None, None, None, None,
                                       init_failed_error=info['error'],
                                       init_failed_traceback=info['traceback'])
            check_statuses.append(check_status)

        # Add a service check for the agent
        service_checks.append(create_service_check('datadog.agent.up', AgentCheck.OK,
                                                   hostname=self.hostname))

        # Store the metrics and events in the payload.
        payload['metrics'] = metrics
        payload['events'] = events
        payload['service_checks'] = service_checks

        # Populate metadata
        self._populate_payload_metadata(payload, check_statuses, start_event)

        collect_duration = timer.step()

        if self._agent_metrics:
            metric_context = {
                'collection_time': collect_duration,
                'emit_time': self.emit_duration,
            }
            if not Platform.is_windows():
                metric_context['cpu_time'] = time.clock() - cpu_clock

            self._agent_metrics.set_metric_context(payload, metric_context)
            self._agent_metrics.run()
            agent_stats = self._agent_metrics.get_metrics()
            payload['metrics'].extend(agent_stats)
            if self.agentConfig.get('developer_mode'):
                log.debug("\n Agent developer mode stats: \n {0}".format(
                    Collector._stats_for_display(agent_stats))
                )
            # Flush metadata for the Agent Metrics check. Otherwise they'll just accumulate and leak.
            self._agent_metrics.get_service_metadata()

        # Let's send our payload
        emitter_statuses = payload.emit(log, self.agentConfig, self.emitters,
                                        self.continue_running)
        self.emit_duration = timer.step()

        # Persist the status of the collection run.
        try:
            CollectorStatus(check_statuses, emitter_statuses,
                            self.hostname_metadata_cache).persist()
        except Exception:
            log.exception("Error persisting collector status")

        if self.run_count <= FLUSH_LOGGING_INITIAL or self.run_count % FLUSH_LOGGING_PERIOD == 0:
            log.info("Finished run #%s. Collection time: %ss. Emit time: %ss" %
                     (self.run_count, round(collect_duration, 2), round(self.emit_duration, 2)))
            if self.run_count == FLUSH_LOGGING_INITIAL:
                log.info("First flushes done, next flushes will be logged every %s flushes." % FLUSH_LOGGING_PERIOD)
        else:
            log.debug("Finished run #%s. Collection time: %ss. Emit time: %ss" %
                      (self.run_count, round(collect_duration, 2), round(self.emit_duration, 2)))

        return payload

    @staticmethod
    def run_single_check(check, verbose=True):
        log.info("Running check %s" % check.name)
        instance_statuses = []
        metric_count = 0
        event_count = 0
        service_check_count = 0
        check_stats = None

        try:
            # Run the check.
            instance_statuses = check.run()

            # Collect the metrics and events.
            current_check_metrics = check.get_metrics()
            current_check_events = check.get_events()
            current_service_checks = check.get_service_checks()
            current_service_metadata = check.get_service_metadata()

            check_stats = check._get_internal_profiling_stats()

            # Save the status of the check.
            metric_count = len(current_check_metrics)
            event_count = len(current_check_events)
            service_check_count = len(current_service_checks)

            print "Metrics: \n{0}".format(pprint.pformat(current_check_metrics))
            print "Events: \n{0}".format(pprint.pformat(current_check_events))
            print "Service Checks: \n{0}".format(pprint.pformat(current_service_checks))
            print "Service Metadata: \n{0}".format(pprint.pformat(current_service_metadata))
        except Exception:
            log.exception("Error running check %s" % check.name)

        check_status = CheckStatus(
            check.name, instance_statuses, metric_count,
            event_count, service_check_count,
            library_versions=check.get_library_info(),
            source_type_name=check.SOURCE_TYPE_NAME or check.name,
            check_stats=check_stats
        )

        return check_status

    def _emit(self, payload):
        """ Send the payload via the emitters. """
        statuses = []
        for emitter in self.emitters:
            # Don't try to send to an emitter if we're stopping.
            if not self.continue_running:
                return statuses
            name = emitter.__name__
            emitter_status = EmitterStatus(name)
            try:
                emitter(payload, log, self.agentConfig)
            except Exception as e:
                log.exception("Error running emitter: %s" % emitter.__name__)
                emitter_status = EmitterStatus(name, e)
            statuses.append(emitter_status)
        return statuses
class Collector(object):
    """
    The collector is responsible for collecting data from each check and
    passing it along to the emitters, who send it to their final destination.
    """

    def __init__(self, agentConfig, emitters, systemStats, hostname):
        self.emit_duration = None
        self.agentConfig = agentConfig
        self.hostname = hostname
        # system stats is generated by config.get_system_stats
        self.agentConfig['system_stats'] = systemStats
        # agent config is used during checks, system_stats can be accessed through the config
        self.os = get_os()
        self.plugins = None
        self.emitters = emitters
        self.check_timings = agentConfig.get('check_timings')
        self.push_times = {
            'host_metadata': {
                'start': time.time(),
                'interval': int(agentConfig.get('metadata_interval', 4 * 60 * 60))
            },
            'external_host_tags': {
                'start': time.time() - 3 * 60,  # Wait for the checks to init
                'interval': int(agentConfig.get('external_host_tags', 5 * 60))
            },
            'agent_checks': {
                'start': time.time(),
                'interval': int(agentConfig.get('agent_checks_interval', 10 * 60))
            },
        }
        socket.setdefaulttimeout(15)
        self.run_count = 0
        self.continue_running = True
        self.hostname_metadata_cache = None
        self.initialized_checks_d = []
        self.init_failed_checks_d = {}

        # Unix System Checks
        self._unix_system_checks = {
            'io': u.IO(log),
            'load': u.Load(log),
            'memory': u.Memory(log),
            'processes': u.Processes(log),
            'cpu': u.Cpu(log),
            'system': u.System(log)
        }

        # Win32 System Checks
        self._win32_system_checks = {
            'io': w32.IO(log),
            'proc': w32.Processes(log),
            'memory': w32.Memory(log),
            'network': w32.Network(log),
            'cpu': w32.Cpu(log),
            'system': w32.System(log)
        }

        # Old-style metric checks
        self._ganglia = Ganglia(log)
        self._dogstream = Dogstreams.init(log, self.agentConfig)
        self._ddforwarder = DdForwarder(log, self.agentConfig)

        # Agent performance metrics check
        self._agent_metrics = None

        self._metrics_checks = []

        # Custom metric checks
        for module_spec in [s.strip() for s in self.agentConfig.get('custom_checks', '').split(',')]:
            if len(module_spec) == 0:
                continue
            try:
                self._metrics_checks.append(modules.load(module_spec, 'Check')(log))
                log.info("Registered custom check %s" % module_spec)
                log.warning("Old format custom checks are deprecated. They should be moved to the checks.d interface as old custom checks will be removed in a next version")
            except Exception:
                log.exception('Unable to load custom check module %s' % module_spec)

        # Resource Checks
        self._resources_checks = [ResProcesses(log, self.agentConfig)]

    def stop(self):
        """
        Tell the collector to stop at the next logical point.
        """
        # This is called when the process is being killed, so
        # try to stop the collector as soon as possible.
        # Most importantly, don't try to submit to the emitters
        # because the forwarder is quite possibly already killed
        # in which case we'll get a misleading error in the logs.
        # Best to not even try.
        self.continue_running = False
        for check in self.initialized_checks_d:
            check.stop()

    @staticmethod
    def _stats_for_display(raw_stats):
        return pprint.pformat(raw_stats, indent=4)

    @log_exceptions(log)
    def run(self, checksd=None, start_event=True, configs_reloaded=False):
        """
        Collect data from each check and submit their data.
        """
        log.debug("Found {num_checks} checks".format(num_checks=len(checksd['initialized_checks'])))
        timer = Timer()
        if not Platform.is_windows():
            cpu_clock = time.clock()
        self.run_count += 1
        log.debug("Starting collection run #%s" % self.run_count)

        if checksd:
            self.initialized_checks_d = checksd['initialized_checks']  # is a list of AgentCheck instances
            self.init_failed_checks_d = checksd['init_failed_checks']  # is of type {check_name: {error, traceback}}

        payload = AgentPayload()

        # Find the AgentMetrics check and pop it out
        # This check must run at the end of the loop to collect info on agent performance
        if not self._agent_metrics or configs_reloaded:
            for check in self.initialized_checks_d:
                if check.name == AGENT_METRICS_CHECK_NAME:
                    self._agent_metrics = check
                    self.initialized_checks_d.remove(check)
                    break

        # Initialize payload
        self._build_payload(payload)

        metrics = payload['metrics']
        events = payload['events']
        service_checks = payload['service_checks']

        # Run the system checks. Checks will depend on the OS
        if Platform.is_windows():
            # Win32 system checks
            try:
                metrics.extend(self._win32_system_checks['memory'].check(self.agentConfig))
                metrics.extend(self._win32_system_checks['cpu'].check(self.agentConfig))
                metrics.extend(self._win32_system_checks['network'].check(self.agentConfig))
                metrics.extend(self._win32_system_checks['io'].check(self.agentConfig))
                metrics.extend(self._win32_system_checks['proc'].check(self.agentConfig))
                metrics.extend(self._win32_system_checks['system'].check(self.agentConfig))
            except Exception:
                log.exception('Unable to fetch Windows system metrics.')
        else:
            # Unix system checks
            sys_checks = self._unix_system_checks

            load = sys_checks['load'].check(self.agentConfig)
            payload.update(load)

            system = sys_checks['system'].check(self.agentConfig)
            payload.update(system)

            memory = sys_checks['memory'].check(self.agentConfig)
            if memory:
                memstats = {
                    'memPhysUsed': memory.get('physUsed'),
                    'memPhysPctUsable': memory.get('physPctUsable'),
                    'memPhysFree': memory.get('physFree'),
                    'memPhysTotal': memory.get('physTotal'),
                    'memPhysUsable': memory.get('physUsable'),
                    'memSwapUsed': memory.get('swapUsed'),
                    'memSwapFree': memory.get('swapFree'),
                    'memSwapPctFree': memory.get('swapPctFree'),
                    'memSwapTotal': memory.get('swapTotal'),
                    'memCached': memory.get('physCached'),
                    'memBuffers': memory.get('physBuffers'),
                    'memShared': memory.get('physShared'),
                    'memSlab': memory.get('physSlab'),
                    'memPageTables': memory.get('physPageTables'),
                    'memSwapCached': memory.get('swapCached')
                }
                payload.update(memstats)

            ioStats = sys_checks['io'].check(self.agentConfig)
            if ioStats:
                payload['ioStats'] = ioStats

            processes = sys_checks['processes'].check(self.agentConfig)
            payload.update({'processes': processes})

            cpuStats = sys_checks['cpu'].check(self.agentConfig)
            if cpuStats:
                payload.update(cpuStats)

        # Run old-style checks
        gangliaData = self._ganglia.check(self.agentConfig)
        dogstreamData = self._dogstream.check(self.agentConfig)
        ddforwarderData = self._ddforwarder.check(self.agentConfig)

        if gangliaData is not False and gangliaData is not None:
            payload['ganglia'] = gangliaData

        # dogstream
        if dogstreamData:
            dogstreamEvents = dogstreamData.get('dogstreamEvents', None)
            if dogstreamEvents:
                if 'dogstream' in payload['events']:
                    events['dogstream'].extend(dogstreamEvents)
                else:
                    events['dogstream'] = dogstreamEvents
                del dogstreamData['dogstreamEvents']
            payload.update(dogstreamData)

        # metrics about the forwarder
        if ddforwarderData:
            payload['datadog'] = ddforwarderData

        # Resources checks
        if not Platform.is_windows():
            has_resource = False
            for resources_check in self._resources_checks:
                try:
                    resources_check.check()
                    snaps = resources_check.pop_snapshots()
                    if snaps:
                        has_resource = True
                        res_value = {
                            'snaps': snaps,
                            'format_version': resources_check.get_format_version()
                        }
                        res_format = resources_check.describe_format_if_needed()
                        if res_format is not None:
                            res_value['format_description'] = res_format
                        payload['resources'][resources_check.RESOURCE_KEY] = res_value
                except Exception:
                    log.exception("Error running resource check %s" % resources_check.RESOURCE_KEY)

            if has_resource:
                payload['resources']['meta'] = {
                    'api_key': self.agentConfig['api_key'],
                    'host': payload['internalHostname'],
                }

        # newer-style checks (not checks.d style)
        for metrics_check in self._metrics_checks:
            res = metrics_check.check(self.agentConfig)
            if res:
                metrics.extend(res)

        # checks.d checks
        check_statuses = []
        for check in self.initialized_checks_d:
            if not self.continue_running:
                return
            log.info("Running check %s" % check.name)
            instance_statuses = []
            metric_count = 0
            event_count = 0
            service_check_count = 0
            current_check_metadata = None  # stays None if the check raises before metadata is collected
            check_start_time = time.time()
            check_stats = None

            try:
                # Run the check.
                instance_statuses = check.run()

                # Collect the metrics and events.
                current_check_metrics = check.get_metrics()
                current_check_events = check.get_events()
                check_stats = check._get_internal_profiling_stats()

                # Collect metadata
                current_check_metadata = check.get_service_metadata()

                # Save metrics & events for the payload.
                metrics.extend(current_check_metrics)
                if current_check_events:
                    if check.name not in events:
                        events[check.name] = current_check_events
                    else:
                        events[check.name] += current_check_events

                # Save the status of the check.
                metric_count = len(current_check_metrics)
                event_count = len(current_check_events)
            except Exception:
                log.exception("Error running check %s" % check.name)

            check_status = CheckStatus(
                check.name, instance_statuses, metric_count,
                event_count, service_check_count,
                service_metadata=current_check_metadata,
                library_versions=check.get_library_info(),
                source_type_name=check.SOURCE_TYPE_NAME or check.name,
                check_stats=check_stats)

            # Service check for Agent checks failures
            service_check_tags = ["check:%s" % check.name]
            if check_status.status == STATUS_OK:
                status = AgentCheck.OK
            elif check_status.status == STATUS_ERROR:
                status = AgentCheck.CRITICAL
            check.service_check('datadog.agent.check_status', status, tags=service_check_tags)

            # Collect the service checks and save them in the payload
            current_check_service_checks = check.get_service_checks()
            if current_check_service_checks:
                service_checks.extend(current_check_service_checks)
            service_check_count = len(current_check_service_checks)

            # Update the check status with the correct service_check_count
            check_status.service_check_count = service_check_count
            check_statuses.append(check_status)

            check_run_time = time.time() - check_start_time
            log.debug("Check %s ran in %.2f s" % (check.name, check_run_time))

            # Instrument check run timings if enabled.
            if self.check_timings:
                metric = 'datadog.agent.check_run_time'
                meta = {'tags': ["check:%s" % check.name]}
                metrics.append((metric, time.time(), check_run_time, meta))

        for check_name, info in self.init_failed_checks_d.iteritems():
            if not self.continue_running:
                return
            check_status = CheckStatus(check_name, None, None, None, None,
                                       init_failed_error=info['error'],
                                       init_failed_traceback=info['traceback'])
            check_statuses.append(check_status)

        # Add a service check for the agent
        service_checks.append(create_service_check('datadog.agent.up', AgentCheck.OK,
                                                   hostname=self.hostname))

        # Store the metrics and events in the payload.
        payload['metrics'] = metrics
        payload['events'] = events
        payload['service_checks'] = service_checks

        # Populate metadata
        self._populate_payload_metadata(payload, check_statuses, start_event)

        collect_duration = timer.step()

        if self._agent_metrics:
            metric_context = {
                'collection_time': collect_duration,
                'emit_time': self.emit_duration,
            }
            if not Platform.is_windows():
                metric_context['cpu_time'] = time.clock() - cpu_clock

            self._agent_metrics.set_metric_context(payload, metric_context)
            self._agent_metrics.run()
            agent_stats = self._agent_metrics.get_metrics()
            payload['metrics'].extend(agent_stats)
            if self.agentConfig.get('developer_mode'):
                log.debug("\n Agent developer mode stats: \n {0}".format(
                    Collector._stats_for_display(agent_stats)))

        # Let's send our payload
        emitter_statuses = payload.emit(log, self.agentConfig, self.emitters,
                                        self.continue_running)
        self.emit_duration = timer.step()

        # Persist the status of the collection run.
        try:
            CollectorStatus(check_statuses, emitter_statuses,
                            self.hostname_metadata_cache).persist()
        except Exception:
            log.exception("Error persisting collector status")

        if self.run_count <= FLUSH_LOGGING_INITIAL or self.run_count % FLUSH_LOGGING_PERIOD == 0:
            log.info("Finished run #%s. Collection time: %ss. Emit time: %ss" %
                     (self.run_count, round(collect_duration, 2), round(self.emit_duration, 2)))
            if self.run_count == FLUSH_LOGGING_INITIAL:
                log.info("First flushes done, next flushes will be logged every %s flushes." % FLUSH_LOGGING_PERIOD)
        else:
            log.debug("Finished run #%s. Collection time: %ss. Emit time: %ss" %
                      (self.run_count, round(collect_duration, 2), round(self.emit_duration, 2)))

        return payload

    @staticmethod
    def run_single_check(check, verbose=True):
        log.info("Running check %s" % check.name)
        instance_statuses = []
        metric_count = 0
        event_count = 0
        service_check_count = 0
        check_stats = None

        try:
            # Run the check.
            instance_statuses = check.run()

            # Collect the metrics and events.
            current_check_metrics = check.get_metrics()
            current_check_events = check.get_events()
            current_service_checks = check.get_service_checks()
            current_service_metadata = check.get_service_metadata()

            check_stats = check._get_internal_profiling_stats()

            # Save the status of the check.
            metric_count = len(current_check_metrics)
            event_count = len(current_check_events)
            service_check_count = len(current_service_checks)

            print "Metrics: \n{0}".format(pprint.pformat(current_check_metrics))
            print "Events: \n{0}".format(pprint.pformat(current_check_events))
            print "Service Checks: \n{0}".format(pprint.pformat(current_service_checks))
            print "Service Metadata: \n{0}".format(pprint.pformat(current_service_metadata))
        except Exception:
            log.exception("Error running check %s" % check.name)

        check_status = CheckStatus(check.name, instance_statuses, metric_count,
                                   event_count, service_check_count,
                                   library_versions=check.get_library_info(),
                                   source_type_name=check.SOURCE_TYPE_NAME or check.name,
                                   check_stats=check_stats)

        return check_status

    def _emit(self, payload):
        """ Send the payload via the emitters. """
        statuses = []
        for emitter in self.emitters:
            # Don't try to send to an emitter if we're stopping.
            if not self.continue_running:
                return statuses
            name = emitter.__name__
            emitter_status = EmitterStatus(name)
            try:
                emitter(payload, log, self.agentConfig)
            except Exception as e:
                log.exception("Error running emitter: %s" % emitter.__name__)
                emitter_status = EmitterStatus(name, e)
            statuses.append(emitter_status)
        return statuses
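# Hypothetical sketch of the interface the "Resources checks" loop in run()
# above relies on. The method names (check, pop_snapshots, get_format_version,
# describe_format_if_needed) and the RESOURCE_KEY attribute are taken from the
# call sites; the class itself and its snapshot contents are illustrative
# assumptions, not agent code.
import time


class ExampleResourceCheck(object):
    RESOURCE_KEY = "example"

    def __init__(self, logger, agentConfig):
        self.logger = logger
        self.agentConfig = agentConfig
        self._snaps = []

    def check(self):
        # Capture one snapshot per collection run.
        self._snaps.append({"ts": time.time()})

    def pop_snapshots(self):
        # Hand back what was collected and reset, so each payload only
        # carries new snapshots.
        snaps, self._snaps = self._snaps, []
        return snaps

    def get_format_version(self):
        return 1

    def describe_format_if_needed(self):
        # Return a format description the first time, None afterwards;
        # run() only attaches it to the payload when it is not None.
        return None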
class Collector(object):
    """
    The collector is responsible for collecting data from each check and
    passing it along to the emitters, who send it to their final destination.
    """

    def __init__(self, agentConfig, emitters, systemStats, hostname):
        self.emit_duration = None
        self.agentConfig = agentConfig
        self.hostname = hostname
        # system stats is generated by config.get_system_stats
        self.agentConfig['system_stats'] = systemStats
        # agent config is used during checks, system_stats can be accessed through the config
        self.os = get_os()
        self.plugins = None
        self.emitters = emitters
        self.check_timings = agentConfig.get('check_timings')
        self.push_times = {
            'host_metadata': {
                'start': time.time(),
                'interval': int(agentConfig.get('metadata_interval', 4 * 60 * 60))
            },
            'external_host_tags': {
                'start': time.time() - 3 * 60,  # Wait for the checks to init
                'interval': int(agentConfig.get('external_host_tags', 5 * 60))
            },
            'agent_checks': {
                'start': time.time(),
                'interval': int(agentConfig.get('agent_checks_interval', 10 * 60))
            },
            'processes': {
                'start': time.time(),
                'interval': int(agentConfig.get('processes_interval', 60))
            }
        }
        socket.setdefaulttimeout(15)
        self.run_count = 0
        self.continue_running = True
        self.hostname_metadata_cache = None
        self.initialized_checks_d = []
        self.init_failed_checks_d = {}

        # Unix System Checks
        self._unix_system_checks = {
            'io': u.IO(log),
            'load': u.Load(log),
            'memory': u.Memory(log),
            'processes': u.Processes(log),
            'cpu': u.Cpu(log),
            'system': u.System(log)
        }

        # Win32 System Checks
        self._win32_system_checks = {
            'io': w32.IO(log),
            'proc': w32.Processes(log),
            'memory': w32.Memory(log),
            'network': w32.Network(log),
            'cpu': w32.Cpu(log),
            'system': w32.System(log)
        }

        # Old-style metric checks
        self._ganglia = Ganglia(log)
        self._dogstream = Dogstreams.init(log, self.agentConfig)
        self._ddforwarder = DdForwarder(log, self.agentConfig)

        # Agent performance metrics check
        self._agent_metrics = None

        self._metrics_checks = []

        # Custom metric checks
        for module_spec in [s.strip() for s in self.agentConfig.get('custom_checks', '').split(',')]:
            if len(module_spec) == 0:
                continue
            try:
                self._metrics_checks.append(modules.load(module_spec, 'Check')(log))
                log.info("Registered custom check %s" % module_spec)
                log.warning("Old format custom checks are deprecated. They should be moved to the checks.d interface as old custom checks will be removed in a next version")
            except Exception:
                log.exception('Unable to load custom check module %s' % module_spec)

    def stop(self):
        """
        Tell the collector to stop at the next logical point.
        """
        # This is called when the process is being killed, so
        # try to stop the collector as soon as possible.
        # Most importantly, don't try to submit to the emitters
        # because the forwarder is quite possibly already killed
        # in which case we'll get a misleading error in the logs.
        # Best to not even try.
        self.continue_running = False
        for check in self.initialized_checks_d:
            check.stop()

    @staticmethod
    def _stats_for_display(raw_stats):
        return pprint.pformat(raw_stats, indent=4)

    @log_exceptions(log)
    def run(self, checksd=None, start_event=True, configs_reloaded=False):
        """
        Collect data from each check and submit their data.
        """
        log.debug("Found {num_checks} checks".format(num_checks=len(checksd['initialized_checks'])))
        timer = Timer()
        if not Platform.is_windows():
            cpu_clock = time.clock()
        self.run_count += 1
        log.debug("Starting collection run #%s" % self.run_count)

        if checksd:
            self.initialized_checks_d = checksd['initialized_checks']  # is a list of AgentCheck instances
            self.init_failed_checks_d = checksd['init_failed_checks']  # is of type {check_name: {error, traceback}}

        payload = AgentPayload()

        # Find the AgentMetrics check and pop it out
        # This check must run at the end of the loop to collect info on agent performance
        if not self._agent_metrics or configs_reloaded:
            for check in self.initialized_checks_d:
                if check.name == AGENT_METRICS_CHECK_NAME:
                    self._agent_metrics = check
                    self.initialized_checks_d.remove(check)
                    break

        # Initialize payload
        self._build_payload(payload)

        metrics = payload['metrics']
        events = payload['events']
        service_checks = payload['service_checks']

        # Run the system checks. Checks will depend on the OS
        if Platform.is_windows():
            # Win32 system checks
            try:
                metrics.extend(self._win32_system_checks['memory'].check(self.agentConfig))
                metrics.extend(self._win32_system_checks['cpu'].check(self.agentConfig))
                metrics.extend(self._win32_system_checks['network'].check(self.agentConfig))
                metrics.extend(self._win32_system_checks['io'].check(self.agentConfig))
                metrics.extend(self._win32_system_checks['proc'].check(self.agentConfig))
                metrics.extend(self._win32_system_checks['system'].check(self.agentConfig))
            except Exception:
                log.exception('Unable to fetch Windows system metrics.')
        else:
            # Unix system checks
            sys_checks = self._unix_system_checks

            load = sys_checks['load'].check(self.agentConfig)
            payload.update(load)

            system = sys_checks['system'].check(self.agentConfig)
            payload.update(system)

            memory = sys_checks['memory'].check(self.agentConfig)
            if memory:
                memstats = {
                    'memPhysUsed': memory.get('physUsed'),
                    'memPhysPctUsable': memory.get('physPctUsable'),
                    'memPhysFree': memory.get('physFree'),
                    'memPhysTotal': memory.get('physTotal'),
                    'memPhysUsable': memory.get('physUsable'),
                    'memSwapUsed': memory.get('swapUsed'),
                    'memSwapFree': memory.get('swapFree'),
                    'memSwapPctFree': memory.get('swapPctFree'),
                    'memSwapTotal': memory.get('swapTotal'),
                    'memCached': memory.get('physCached'),
                    'memBuffers': memory.get('physBuffers'),
                    'memShared': memory.get('physShared'),
                    'memSlab': memory.get('physSlab'),
                    'memPageTables': memory.get('physPageTables'),
                    'memSwapCached': memory.get('swapCached')
                }
                payload.update(memstats)

            ioStats = sys_checks['io'].check(self.agentConfig)
            if ioStats:
                payload['ioStats'] = ioStats

            processes = sys_checks['processes'].check(self.agentConfig)
            payload.update({'processes': processes})

            cpuStats = sys_checks['cpu'].check(self.agentConfig)
            if cpuStats:
                payload.update(cpuStats)

        # Run old-style checks
        gangliaData = self._ganglia.check(self.agentConfig)
        dogstreamData = self._dogstream.check(self.agentConfig)
        ddforwarderData = self._ddforwarder.check(self.agentConfig)

        if gangliaData is not False and gangliaData is not None:
            payload['ganglia'] = gangliaData

        # dogstream
        if dogstreamData:
            dogstreamEvents = dogstreamData.get('dogstreamEvents', None)
            if dogstreamEvents:
                if 'dogstream' in payload['events']:
                    events['dogstream'].extend(dogstreamEvents)
                else:
                    events['dogstream'] = dogstreamEvents
                del dogstreamData['dogstreamEvents']
            payload.update(dogstreamData)

        # metrics about the forwarder
        if ddforwarderData:
            payload['datadog'] = ddforwarderData

        # process collector of gohai (compliant with payload of legacy "resources checks")
        if not Platform.is_windows() and self._should_send_additional_data('processes'):
            gohai_processes = self._run_gohai_processes()
            if gohai_processes:
                try:
                    gohai_processes_json = json.loads(gohai_processes)
                    processes_payload = {
                        'snaps': [gohai_processes_json.get('processes')],
                        'format_version': 1
                    }
                    if self._is_first_run():
                        processes_payload['format_description'] = PROCESSES_FORMAT_DESCRIPTION

                    payload['resources'] = {
                        'processes': processes_payload,
                        'meta': {
                            'host': payload['internalHostname'],
                        }
                    }
                except Exception:
                    log.exception("Error running gohai processes collection")

        # newer-style checks (not checks.d style)
        for metrics_check in self._metrics_checks:
            res = metrics_check.check(self.agentConfig)
            if res:
                metrics.extend(res)

        # checks.d checks
        check_statuses = []
        for check in self.initialized_checks_d:
            if not self.continue_running:
                return
            log.info("Running check %s" % check.name)
            instance_statuses = []
            metric_count = 0
            event_count = 0
            service_check_count = 0
            current_check_metadata = None  # stays None if the check raises before metadata is collected
            check_start_time = time.time()
            check_stats = None

            try:
                # Run the check.
                instance_statuses = check.run()

                # Collect the metrics and events.
                current_check_metrics = check.get_metrics()
                current_check_events = check.get_events()
                check_stats = check._get_internal_profiling_stats()

                # Collect metadata
                current_check_metadata = check.get_service_metadata()

                # Save metrics & events for the payload.
                metrics.extend(current_check_metrics)
                if current_check_events:
                    if check.name not in events:
                        events[check.name] = current_check_events
                    else:
                        events[check.name] += current_check_events

                # Save the status of the check.
                metric_count = len(current_check_metrics)
                event_count = len(current_check_events)
            except Exception:
                log.exception("Error running check %s" % check.name)

            check_status = CheckStatus(
                check.name, instance_statuses, metric_count,
                event_count, service_check_count,
                service_metadata=current_check_metadata,
                library_versions=check.get_library_info(),
                source_type_name=check.SOURCE_TYPE_NAME or check.name,
                check_stats=check_stats
            )

            # Service check for Agent checks failures
            service_check_tags = ["check:%s" % check.name]
            if check_status.status == STATUS_OK:
                status = AgentCheck.OK
            elif check_status.status == STATUS_ERROR:
                status = AgentCheck.CRITICAL
            check.service_check('datadog.agent.check_status', status, tags=service_check_tags)

            # Collect the service checks and save them in the payload
            current_check_service_checks = check.get_service_checks()
            if current_check_service_checks:
                service_checks.extend(current_check_service_checks)
            service_check_count = len(current_check_service_checks)

            # Update the check status with the correct service_check_count
            check_status.service_check_count = service_check_count
            check_statuses.append(check_status)

            check_run_time = time.time() - check_start_time
            log.debug("Check %s ran in %.2f s" % (check.name, check_run_time))

            # Instrument check run timings if enabled.
            if self.check_timings:
                metric = 'datadog.agent.check_run_time'
                meta = {'tags': ["check:%s" % check.name]}
                metrics.append((metric, time.time(), check_run_time, meta))

        for check_name, info in self.init_failed_checks_d.iteritems():
            if not self.continue_running:
                return
            check_status = CheckStatus(check_name, None, None, None, None,
                                       init_failed_error=info['error'],
                                       init_failed_traceback=info['traceback'])
            check_statuses.append(check_status)

        # Add a service check for the agent
        service_checks.append(create_service_check('datadog.agent.up', AgentCheck.OK,
                                                   hostname=self.hostname))

        # Store the metrics and events in the payload.
        payload['metrics'] = metrics
        payload['events'] = events
        payload['service_checks'] = service_checks

        # Populate metadata
        self._populate_payload_metadata(payload, check_statuses, start_event)

        collect_duration = timer.step()

        if self._agent_metrics:
            metric_context = {
                'collection_time': collect_duration,
                'emit_time': self.emit_duration,
            }
            if not Platform.is_windows():
                metric_context['cpu_time'] = time.clock() - cpu_clock

            self._agent_metrics.set_metric_context(payload, metric_context)
            self._agent_metrics.run()
            agent_stats = self._agent_metrics.get_metrics()
            payload['metrics'].extend(agent_stats)
            if self.agentConfig.get('developer_mode'):
                log.debug("\n Agent developer mode stats: \n {0}".format(
                    Collector._stats_for_display(agent_stats))
                )
            # Flush metadata for the Agent Metrics check. Otherwise they'll just accumulate and leak.
            self._agent_metrics.get_service_metadata()

        # Let's send our payload
        emitter_statuses = payload.emit(log, self.agentConfig, self.emitters,
                                        self.continue_running)
        self.emit_duration = timer.step()

        # Persist the status of the collection run.
        try:
            CollectorStatus(check_statuses, emitter_statuses,
                            self.hostname_metadata_cache).persist()
        except Exception:
            log.exception("Error persisting collector status")

        if self.run_count <= FLUSH_LOGGING_INITIAL or self.run_count % FLUSH_LOGGING_PERIOD == 0:
            log.info("Finished run #%s. Collection time: %ss. Emit time: %ss" %
                     (self.run_count, round(collect_duration, 2), round(self.emit_duration, 2)))
            if self.run_count == FLUSH_LOGGING_INITIAL:
                log.info("First flushes done, next flushes will be logged every %s flushes." % FLUSH_LOGGING_PERIOD)
        else:
            log.debug("Finished run #%s. Collection time: %ss. Emit time: %ss" %
                      (self.run_count, round(collect_duration, 2), round(self.emit_duration, 2)))

        return payload

    @staticmethod
    def run_single_check(check, verbose=True):
        log.info("Running check %s" % check.name)
        instance_statuses = []
        metric_count = 0
        event_count = 0
        service_check_count = 0
        check_stats = None

        try:
            # Run the check.
            instance_statuses = check.run()

            # Collect the metrics and events.
            current_check_metrics = check.get_metrics()
            current_check_events = check.get_events()
            current_service_checks = check.get_service_checks()
            current_service_metadata = check.get_service_metadata()

            check_stats = check._get_internal_profiling_stats()

            # Save the status of the check.
            metric_count = len(current_check_metrics)
            event_count = len(current_check_events)
            service_check_count = len(current_service_checks)

            print "Metrics: \n{0}".format(pprint.pformat(current_check_metrics))
            print "Events: \n{0}".format(pprint.pformat(current_check_events))
            print "Service Checks: \n{0}".format(pprint.pformat(current_service_checks))
            print "Service Metadata: \n{0}".format(pprint.pformat(current_service_metadata))
        except Exception:
            log.exception("Error running check %s" % check.name)

        check_status = CheckStatus(
            check.name, instance_statuses, metric_count,
            event_count, service_check_count,
            library_versions=check.get_library_info(),
            source_type_name=check.SOURCE_TYPE_NAME or check.name,
            check_stats=check_stats
        )

        return check_status

    def _emit(self, payload):
        """ Send the payload via the emitters. """
        statuses = []
        for emitter in self.emitters:
            # Don't try to send to an emitter if we're stopping.
            if not self.continue_running:
                return statuses
            name = emitter.__name__
            emitter_status = EmitterStatus(name)
            try:
                emitter(payload, log, self.agentConfig)
            except Exception as e:
                log.exception("Error running emitter: %s" % emitter.__name__)
                emitter_status = EmitterStatus(name, e)
            statuses.append(emitter_status)
        return statuses

    def _is_first_run(self):
        return self.run_count <= 1

    def _build_payload(self, payload):
        """
        Build the payload skeleton, so it contains all of the generic payload data.
        """
        now = time.time()

        payload['collection_timestamp'] = now
        payload['os'] = self.os
        payload['python'] = sys.version
        payload['agentVersion'] = self.agentConfig['version']
        payload['apiKey'] = self.agentConfig['api_key']
        payload['events'] = {}
        payload['metrics'] = []
        payload['service_checks'] = []
        payload['resources'] = {}
        payload['internalHostname'] = self.hostname
        payload['uuid'] = get_uuid()
        payload['host-tags'] = {}
        payload['external_host_tags'] = {}

    def _populate_payload_metadata(self, payload, check_statuses, start_event=True):
        """
        Periodically populate the payload with metadata related to the system, host, and/or checks.
        """
        now = time.time()

        # Include system stats on first postback
        if start_event and self._is_first_run():
            payload['systemStats'] = self.agentConfig.get('system_stats', {})
            # Also post an event in the newsfeed
            payload['events']['System'] = [{
                'api_key': self.agentConfig['api_key'],
                'host': payload['internalHostname'],
                'timestamp': now,
                'event_type': 'Agent Startup',
                'msg_text': 'Version %s' % get_version()
            }]

        # Periodically send the host metadata.
        if self._should_send_additional_data('host_metadata'):
            # gather metadata with gohai
            gohai_metadata = self._run_gohai_metadata()
            if gohai_metadata:
                payload['gohai'] = gohai_metadata

            payload['systemStats'] = get_system_stats(
                proc_path=self.agentConfig.get('procfs_path', '/proc').rstrip('/')
            )
            payload['meta'] = self._get_hostname_metadata()

            self.hostname_metadata_cache = payload['meta']
            # Add static tags from the configuration file
            host_tags = []
            if self.agentConfig['tags'] is not None:
                host_tags.extend([unicode(tag.strip()) for tag in self.agentConfig['tags'].split(",")])

            if self.agentConfig['collect_ec2_tags']:
                host_tags.extend(EC2.get_tags(self.agentConfig))

            if host_tags:
                payload['host-tags']['system'] = host_tags

            # If required by the user, let's create the dd_check:xxx host tags
            if self.agentConfig['create_dd_check_tags']:
                app_tags_list = [DD_CHECK_TAG.format(c.name) for c in self.initialized_checks_d]
                app_tags_list.extend([DD_CHECK_TAG.format(cname) for cname in JMXFiles.get_jmx_appnames()])

                if 'system' not in payload['host-tags']:
                    payload['host-tags']['system'] = []

                payload['host-tags']['system'].extend(app_tags_list)

            GCE_tags = GCE.get_tags(self.agentConfig)
            if GCE_tags is not None:
                payload['host-tags'][GCE.SOURCE_TYPE_NAME] = GCE_tags

            # Log the metadata on the first run
            if self._is_first_run():
                log.info("Hostnames: %s, tags: %s" % (repr(self.hostname_metadata_cache), payload['host-tags']))

        # Periodically send extra hosts metadata (vsphere)
        # Metadata of hosts that are not the host where the agent runs, not all the checks use that
        external_host_tags = []
        if self._should_send_additional_data('external_host_tags'):
            for check in self.initialized_checks_d:
                try:
                    getter = getattr(check, 'get_external_host_tags')
                    check_tags = getter()
                    external_host_tags.extend(check_tags)
                except AttributeError:
                    pass

        if external_host_tags:
            payload['external_host_tags'] = external_host_tags

        # Periodically send agent_checks metadata
        if self._should_send_additional_data('agent_checks'):
            # Add agent checks statuses and error/warning messages
            agent_checks = []
            for check in check_statuses:
                if check.instance_statuses is not None:
                    for i, instance_status in enumerate(check.instance_statuses):
                        agent_checks.append(
                            (
                                check.name, check.source_type_name,
                                instance_status.instance_id,
                                instance_status.status,
                                # put error message or list of warning messages in the same field
                                # it will be handled by the UI
                                instance_status.error or instance_status.warnings or "",
                                check.service_metadata[i]
                            )
                        )
                else:
                    agent_checks.append(
                        (
                            check.name, check.source_type_name,
                            "initialization",
                            check.status, repr(check.init_failed_error)
                        )
                    )
            payload['agent_checks'] = agent_checks
            payload['meta'] = self.hostname_metadata_cache  # add hostname metadata

    def _get_hostname_metadata(self):
        """
        Returns a dictionary that contains hostname metadata.
        """
        metadata = EC2.get_metadata(self.agentConfig)
        if metadata.get('hostname'):
            metadata['ec2-hostname'] = metadata.get('hostname')
            del metadata['hostname']

        if self.agentConfig.get('hostname'):
            metadata['agent-hostname'] = self.agentConfig.get('hostname')
        else:
            try:
                metadata["socket-hostname"] = socket.gethostname()
            except Exception:
                pass
        try:
            metadata["socket-fqdn"] = socket.getfqdn()
        except Exception:
            pass

        metadata["hostname"] = self.hostname
        metadata["timezones"] = sanitize_tzname(time.tzname)

        # Add cloud provider aliases
        host_aliases = GCE.get_host_aliases(self.agentConfig)
        if host_aliases:
            metadata['host_aliases'] = host_aliases

        return metadata

    def _should_send_additional_data(self, data_name):
        if self._is_first_run():
            return True
        # If the interval has passed, send the metadata again
        now = time.time()
        if now - self.push_times[data_name]['start'] >= self.push_times[data_name]['interval']:
            log.debug('%s interval has passed. Sending it.' % data_name)
            self.push_times[data_name]['start'] = now
            return True

        return False

    def _run_gohai_metadata(self):
        return self._run_gohai(['--exclude', 'processes'])

    def _run_gohai_processes(self):
        return self._run_gohai(['--only', 'processes'])

    def _run_gohai(self, options):
        output = None
        try:
            if not Platform.is_windows():
                command = "gohai"
            else:
                command = "gohai\gohai.exe"
            output, err, _ = get_subprocess_output([command] + options, log)
            if err:
                log.warning("GOHAI LOG | {0}".format(err))
        except OSError as e:
            if e.errno == 2:  # file not found, expected when install from source
                log.info("gohai file not found")
            else:
                log.warning("Unexpected OSError when running gohai %s", e)
        except Exception as e:
            log.warning("gohai command failed with error %s", e)

        return output
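# Hypothetical example of the emitter contract used by _emit above: an emitter
# is any callable taking (payload, log, agentConfig) and exposing a __name__,
# which EmitterStatus uses for reporting. This stdout emitter is an
# illustration for local debugging, not part of the agent.
import pprint


def stdout_emitter(payload, log, agentConfig):
    """Print the payload locally instead of forwarding it."""
    log.info("Emitting %d metrics to stdout" % len(payload['metrics']))
    pprint.pprint(payload)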
def __init__(self, agentConfig, emitters, systemStats, hostname):
    self.emit_duration = None
    self.agentConfig = agentConfig
    self.hostname = hostname
    # system stats is generated by config.get_system_stats
    self.agentConfig["system_stats"] = systemStats
    # agent config is used during checks, system_stats can be accessed through the config
    self.os = get_os()
    self.plugins = None
    self.emitters = emitters
    self.check_timings = agentConfig.get("check_timings")
    self.push_times = {
        "host_metadata": {
            "start": time.time(),
            "interval": int(agentConfig.get("metadata_interval", 4 * 60 * 60)),
        },
        "external_host_tags": {
            "start": time.time() - 3 * 60,  # Wait for the checks to init
            "interval": int(agentConfig.get("external_host_tags", 5 * 60)),
        },
        "agent_checks": {
            "start": time.time(),
            "interval": int(agentConfig.get("agent_checks_interval", 10 * 60)),
        },
        "processes": {
            "start": time.time(),
            "interval": int(agentConfig.get("processes_interval", 60)),
        },
    }
    socket.setdefaulttimeout(15)
    self.run_count = 0
    self.continue_running = True
    self.hostname_metadata_cache = None
    self.initialized_checks_d = []
    self.init_failed_checks_d = {}

    # Unix System Checks
    self._unix_system_checks = {
        "io": u.IO(log),
        "load": u.Load(log),
        "memory": u.Memory(log),
        "processes": u.Processes(log),
        "cpu": u.Cpu(log),
        "system": u.System(log),
    }

    # Win32 System Checks
    self._win32_system_checks = {
        "io": w32.IO(log),
        "proc": w32.Processes(log),
        "memory": w32.Memory(log),
        "network": w32.Network(log),
        "cpu": w32.Cpu(log),
        "system": w32.System(log),
    }

    # Old-style metric checks
    self._ganglia = Ganglia(log) if self.agentConfig.get("ganglia_host", "") != "" else None
    self._dogstream = None if self.agentConfig.get("dogstreams") is None else Dogstreams.init(log, self.agentConfig)

    # Agent performance metrics check
    self._agent_metrics = None

    self._metrics_checks = []

    # Custom metric checks
    for module_spec in [s.strip() for s in self.agentConfig.get("custom_checks", "").split(",")]:
        if len(module_spec) == 0:
            continue
        try:
            self._metrics_checks.append(modules.load(module_spec, "Check")(log))
            log.info("Registered custom check %s" % module_spec)
            log.warning(
                "Old format custom checks are deprecated. They should be moved to the checks.d interface as old custom checks will be removed in a next version"
            )
        except Exception:
            log.exception("Unable to load custom check module %s" % module_spec)
def __init__(self, agentConfig, emitters, systemStats):
    self.agentConfig = agentConfig
    # system stats is generated by config.get_system_stats
    self.agentConfig['system_stats'] = systemStats
    # agent config is used during checks, system_stats can be accessed through the config
    self.os = getOS()
    self.plugins = None
    self.emitters = emitters
    self.metadata_interval = int(agentConfig.get('metadata_interval', 10 * 60))
    self.metadata_start = time.time()
    socket.setdefaulttimeout(15)
    self.run_count = 0
    self.continue_running = True

    # Unix System Checks
    self._unix_system_checks = {
        'disk': u.Disk(checks_logger),
        'io': u.IO(),
        'load': u.Load(checks_logger),
        'memory': u.Memory(checks_logger),
        'network': u.Network(checks_logger),
        'processes': u.Processes(),
        'cpu': u.Cpu(checks_logger)
    }

    # Win32 System Checks
    self._win32_system_checks = {
        'disk': w32.Disk(checks_logger),
        'io': w32.IO(checks_logger),
        'proc': w32.Processes(checks_logger),
        'memory': w32.Memory(checks_logger),
        'network': w32.Network(checks_logger),
        'cpu': w32.Cpu(checks_logger)
    }

    # Old-style metric checks
    self._couchdb = CouchDb(checks_logger)
    self._mongodb = MongoDb(checks_logger)
    self._mysql = MySql(checks_logger)
    self._rabbitmq = RabbitMq()
    self._ganglia = Ganglia(checks_logger)
    self._cassandra = Cassandra()
    self._dogstream = Dogstreams.init(checks_logger, self.agentConfig)
    self._ddforwarder = DdForwarder(checks_logger, self.agentConfig)
    self._ec2 = EC2(checks_logger)

    # Metric Checks
    self._metrics_checks = [
        ElasticSearch(checks_logger),
        Jvm(checks_logger),
        Tomcat(checks_logger),
        ActiveMQ(checks_logger),
        Solr(checks_logger),
        WMICheck(checks_logger),
        Memcache(checks_logger),
    ]

    # Custom metric checks
    for module_spec in [s.strip() for s in self.agentConfig.get('custom_checks', '').split(',')]:
        if len(module_spec) == 0:
            continue
        try:
            self._metrics_checks.append(modules.load(module_spec, 'Check')(checks_logger))
            logger.info("Registered custom check %s" % module_spec)
        except Exception:
            logger.exception('Unable to load custom check module %s' % module_spec)
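# Hypothetical illustration of the custom_checks contract used by the loaders
# above: the config value is a comma-separated list of module specs, and each
# module must expose a `Check` class whose constructor takes a logger. The
# metric tuple format (name, timestamp, value, attributes) mirrors what run()
# appends for check timings; the metric name below is made up.
import time


class Check(object):
    def __init__(self, logger):
        self.logger = logger

    def check(self, agentConfig):
        # Return a list of (metric, timestamp, value, attributes) tuples.
        return [('custom.example.up', time.time(), 1, {'tags': ['example']})]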
class checks(object):
    def __init__(self, agentConfig, emitters):
        self.agentConfig = agentConfig
        self.plugins = None
        self.emitters = emitters
        self.os = None

        self.checksLogger = logging.getLogger('checks')
        socket.setdefaulttimeout(15)

        self._apache = Apache(self.checksLogger)
        self._nginx = Nginx(self.checksLogger)
        self._disk = Disk(self.checksLogger)
        self._io = IO()
        self._load = Load(self.checksLogger)
        self._memory = Memory(self.checksLogger)
        self._network = Network(self.checksLogger)
        self._processes = Processes()
        self._cpu = Cpu()
        self._couchdb = CouchDb(self.checksLogger)
        self._mongodb = MongoDb(self.checksLogger)
        self._mysql = MySql(self.checksLogger)
        self._pgsql = PostgreSql(self.checksLogger)
        self._rabbitmq = RabbitMq()
        self._ganglia = Ganglia(self.checksLogger)
        self._cassandra = Cassandra()
        self._redis = Redis(self.checksLogger)
        self._jvm = Jvm(self.checksLogger)
        self._tomcat = Tomcat(self.checksLogger)
        self._activemq = ActiveMQ(self.checksLogger)
        self._solr = Solr(self.checksLogger)
        self._memcache = Memcache(self.checksLogger)
        self._dogstream = Dogstreams.init(self.checksLogger, self.agentConfig)
        self._ddforwarder = DdForwarder(self.checksLogger, self.agentConfig)

        # All new checks should be metrics checks:
        self._metrics_checks = [
            Cacti(self.checksLogger),
            Redis(self.checksLogger),
            Varnish(self.checksLogger),
            ElasticSearch(self.checksLogger),
        ]
        self._event_checks = [Hudson(), Nagios(socket.gethostname())]
        self._resources_checks = [ResProcesses(self.checksLogger, self.agentConfig)]
        self._ec2 = EC2(self.checksLogger)

    #
    # Checks - FIXME migrating to the new Check interface is a WIP
    #
    @recordsize
    def getApacheStatus(self):
        return self._apache.check(self.agentConfig)

    @recordsize
    def getCouchDBStatus(self):
        return self._couchdb.check(self.agentConfig)

    @recordsize
    def getDiskUsage(self):
        return self._disk.check(self.agentConfig)

    @recordsize
    def getIOStats(self):
        return self._io.check(self.checksLogger, self.agentConfig)

    @recordsize
    def getLoadAvrgs(self):
        return self._load.check(self.agentConfig)

    @recordsize
    def getMemoryUsage(self):
        return self._memory.check(self.agentConfig)

    @recordsize
    def getMongoDBStatus(self):
        return self._mongodb.check(self.agentConfig)

    @recordsize
    def getMySQLStatus(self):
        return self._mysql.check(self.agentConfig)

    @recordsize
    def getPgSQLStatus(self):
        return self._pgsql.check(self.agentConfig)

    @recordsize
    def getNetworkTraffic(self):
        return self._network.check(self.agentConfig)

    @recordsize
    def getNginxStatus(self):
        return self._nginx.check(self.agentConfig)

    @recordsize
    def getProcesses(self):
        return self._processes.check(self.checksLogger, self.agentConfig)

    @recordsize
    def getRabbitMQStatus(self):
        return self._rabbitmq.check(self.checksLogger, self.agentConfig)

    @recordsize
    def getGangliaData(self):
        return self._ganglia.check(self.agentConfig)

    @recordsize
    def getCassandraData(self):
        return self._cassandra.check(self.checksLogger, self.agentConfig)

    @recordsize
    def getJvmData(self):
        return self._jvm.check(self.agentConfig)

    @recordsize
    def getTomcatData(self):
        return self._tomcat.check(self.agentConfig)

    @recordsize
    def getActiveMQData(self):
        return self._activemq.check(self.agentConfig)

    @recordsize
    def getSolrData(self):
        return self._solr.check(self.agentConfig)

    @recordsize
    def getMemcacheData(self):
        return self._memcache.check(self.agentConfig)

    @recordsize
    def getDogstreamData(self):
        return self._dogstream.check(self.agentConfig)

    @recordsize
    def getDdforwarderData(self):
        return self._ddforwarder.check(self.agentConfig)

    @recordsize
    def getCPUStats(self):
        return self._cpu.check(self.checksLogger, self.agentConfig)

    @recordsize
    def get_metadata(self):
        metadata = self._ec2.get_metadata()
        if metadata.get('hostname'):
            metadata['ec2-hostname'] = metadata.get('hostname')
        if self.agentConfig.get('hostname'):
            metadata['agent-hostname'] = self.agentConfig.get('hostname')
        try:
            metadata["hostname"] = socket.gethostname()
        except Exception:
            pass
        try:
            metadata["fqdn"] = socket.getfqdn()
        except Exception:
            pass
        return metadata

    def doChecks(self, firstRun=False, systemStats=False):
        """Actual work
        """
        self.checksLogger.info("Starting checks")

        apacheStatus = self.getApacheStatus()
        diskUsage = self.getDiskUsage()
        loadAvrgs = self.getLoadAvrgs()
        memory = self.getMemoryUsage()
        mysqlStatus = self.getMySQLStatus()
        pgsqlStatus = self.getPgSQLStatus()
        networkTraffic = self.getNetworkTraffic()
        nginxStatus = self.getNginxStatus()
        processes = self.getProcesses()
        rabbitmq = self.getRabbitMQStatus()
        mongodb = self.getMongoDBStatus()
        couchdb = self.getCouchDBStatus()
        ioStats = self.getIOStats()
        cpuStats = self.getCPUStats()
        gangliaData = self.getGangliaData()
        cassandraData = self.getCassandraData()
        jvmData = self.getJvmData()
        tomcatData = self.getTomcatData()
        activeMQData = self.getActiveMQData()
        solrData = self.getSolrData()
        memcacheData = self.getMemcacheData()
        dogstreamData = self.getDogstreamData()
        ddforwarderData = self.getDdforwarderData()

        checksData = {
            'collection_timestamp': time.time(),
            'os': self.os,
            'python': sys.version,
            'agentVersion': self.agentConfig['version'],
            'loadAvrg1': loadAvrgs['1'],
            'loadAvrg5': loadAvrgs['5'],
            'loadAvrg15': loadAvrgs['15'],
            'memPhysUsed': memory.get('physUsed'),
            'memPhysFree': memory.get('physFree'),
            'memPhysTotal': memory.get('physTotal'),
            'memPhysUsable': memory.get('physUsable'),
            'memSwapUsed': memory.get('swapUsed'),
            'memSwapFree': memory.get('swapFree'),
            'memSwapTotal': memory.get('swapTotal'),
            'memCached': memory.get('physCached'),
            'memBuffers': memory.get('physBuffers'),
            'memShared': memory.get('physShared'),
            'networkTraffic': networkTraffic,
            'processes': processes,
            'apiKey': self.agentConfig['apiKey'],
            'events': {},
            'resources': {},
        }

        if diskUsage is not False and len(diskUsage) == 2:
            checksData["diskUsage"] = diskUsage[0]
            checksData["inodes"] = diskUsage[1]

        if cpuStats is not False and cpuStats is not None:
            checksData.update(cpuStats)

        if gangliaData is not False and gangliaData is not None:
            checksData['ganglia'] = gangliaData

        if cassandraData is not False and cassandraData is not None:
            checksData['cassandra'] = cassandraData

        # Apache Status
        if apacheStatus:
            checksData.update(apacheStatus)

        # MySQL Status
        if mysqlStatus:
            checksData.update(mysqlStatus)

        # PostgreSQL status
        if pgsqlStatus:
            checksData['postgresql'] = pgsqlStatus

        # Nginx Status
        if nginxStatus:
            checksData.update(nginxStatus)

        # RabbitMQ
        if rabbitmq:
            checksData['rabbitMQ'] = rabbitmq

        # MongoDB
        if mongodb:
            if 'events' in mongodb:
                checksData['events']['Mongo'] = mongodb['events']['Mongo']
                del mongodb['events']
            checksData['mongoDB'] = mongodb

        # CouchDB
        if couchdb:
            checksData['couchDB'] = couchdb

        if ioStats:
            checksData['ioStats'] = ioStats

        if jvmData:
            checksData['jvm'] = jvmData

        if tomcatData:
            checksData['tomcat'] = tomcatData

        if activeMQData:
            checksData['activemq'] = activeMQData

        if solrData:
            checksData['solr'] = solrData

        if memcacheData:
            checksData['memcache'] = memcacheData

        if dogstreamData:
            dogstreamEvents = dogstreamData.get('dogstreamEvents', None)
            if dogstreamEvents:
                if 'dogstream' in checksData['events']:
                    checksData['events']['dogstream'].extend(dogstreamEvents)
                else:
                    checksData['events']['dogstream'] = dogstreamEvents
                del dogstreamData['dogstreamEvents']
            checksData.update(dogstreamData)

        if ddforwarderData:
            checksData['datadog'] = ddforwarderData

        # Include server identifiers
        checksData['internalHostname'] = gethostname(self.agentConfig)
        checksData['uuid'] = getUuid()
        self.checksLogger.debug('doChecks: added uuid %s' % checksData['uuid'])

        # Process the event checks.
        for event_check in self._event_checks:
            event_data = event_check.check(self.checksLogger, self.agentConfig)
            if event_data:
                checksData['events'][event_check.key] = event_data

        # Include system stats on first postback
        if firstRun:
            checksData['systemStats'] = systemStats

        # Add static tags from the configuration file
        if self.agentConfig['tags'] is not None:
            checksData['tags'] = self.agentConfig['tags']

        # Also post an event in the newsfeed
        checksData['events']['System'] = [{
            'api_key': self.agentConfig['apiKey'],
            'host': checksData['internalHostname'],
            'timestamp': int(time.mktime(datetime.datetime.now().timetuple())),
            'event_type': 'Agent Startup',
            'msg_text': 'Version %s' % get_version()
        }]

        # Collect metadata
        checksData['meta'] = self.get_metadata()

        # Resources checks
        has_resource = False
        for resources_check in self._resources_checks:
            resources_check.check()
            snaps = resources_check.pop_snapshots()
            if snaps:
                has_resource = True
                res_value = {
                    'snaps': snaps,
                    'format_version': resources_check.get_format_version()
                }
                res_format = resources_check.describe_format_if_needed()
                if res_format is not None:
                    res_value['format_description'] = res_format
                checksData['resources'][resources_check.RESOURCE_KEY] = res_value

        if has_resource:
            checksData['resources']['meta'] = {
                'api_key': self.agentConfig['apiKey'],
                'host': checksData['internalHostname'],
            }

        metrics = []
        for metrics_check in self._metrics_checks:
            res = metrics_check.check(self.agentConfig)
            if res:
                metrics.extend(res)
        checksData['metrics'] = metrics

        # Send back data
        self.checksLogger.debug("checksData: %s" % checksData)
        for emitter in self.emitters:
            emitter(checksData, self.checksLogger, self.agentConfig)
        self.checksLogger.info("Checks done")

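# The `checks` class above decorates every getter with @recordsize, which is
# referenced but never defined in these snippets. What follows is a plausible
# reconstruction, an assumption rather than the original helper: a decorator
# that logs roughly how large each check's result is, so oversized payloads
# can be spotted in debug logs.
def recordsize(func):
    def wrapper(self, *args, **kwargs):
        result = func(self, *args, **kwargs)
        try:
            # len(str(...)) is a cheap, approximate size; the real helper may differ.
            self.checksLogger.debug("%s returned ~%s bytes" % (func.__name__, len(str(result))))
        except Exception:
            pass
        return result
    return wrapper
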
def __init__(self, agentConfig, emitters, systemStats, hostname):
    self.emit_duration = None
    self.agentConfig = agentConfig
    self.hostname = hostname
    # system stats is generated by config.get_system_stats
    self.agentConfig['system_stats'] = systemStats
    # agent config is used during checks, system_stats can be accessed through the config
    self.os = get_os()
    self.plugins = None
    self.emitters = emitters
    self.check_timings = agentConfig.get('check_timings')
    self.push_times = {
        'host_metadata': {
            'start': time.time(),
            'interval': int(agentConfig.get('metadata_interval', 4 * 60 * 60))
        },
        'external_host_tags': {
            'start': time.time() - 3 * 60,  # Wait for the checks to init
            'interval': int(agentConfig.get('external_host_tags', 5 * 60))
        },
        'agent_checks': {
            'start': time.time(),
            'interval': int(agentConfig.get('agent_checks_interval', 10 * 60))
        },
        'processes': {
            'start': time.time(),
            'interval': int(agentConfig.get('processes_interval', 60))
        }
    }
    socket.setdefaulttimeout(15)
    self.run_count = 0
    self.continue_running = True
    self.hostname_metadata_cache = None
    self.initialized_checks_d = []
    self.init_failed_checks_d = {}

    if Platform.is_linux() and psutil is not None:
        procfs_path = agentConfig.get('procfs_path', '/proc').rstrip('/')
        psutil.PROCFS_PATH = procfs_path

    # Unix System Checks
    self._unix_system_checks = {
        'io': u.IO(log),
        'load': u.Load(log),
        'memory': u.Memory(log),
        'processes': u.Processes(log),
        'cpu': u.Cpu(log),
        'system': u.System(log)
    }

    # Win32 System Checks
    self._win32_system_checks = {
        'io': w32.IO(log),
        'proc': w32.Processes(log),
        'memory': w32.Memory(log),
        'network': w32.Network(log),
        'cpu': w32.Cpu(log),
        'system': w32.System(log)
    }

    # Old-style metric checks
    self._ganglia = Ganglia(log) if self.agentConfig.get('ganglia_host', '') != '' else None
    self._dogstream = None if self.agentConfig.get('dogstreams') is None else Dogstreams.init(log, self.agentConfig)

    # Agent performance metrics check
    self._agent_metrics = None

    self._metrics_checks = []

    # Custom metric checks
    for module_spec in [s.strip() for s in self.agentConfig.get('custom_checks', '').split(',')]:
        if len(module_spec) == 0:
            continue
        try:
            self._metrics_checks.append(modules.load(module_spec, 'Check')(log))
            log.info("Registered custom check %s" % module_spec)
            log.warning("Old format custom checks are deprecated. They should be moved to the checks.d interface as old custom checks will be removed in a next version")
        except Exception:
            log.exception('Unable to load custom check module %s' % module_spec)

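# This revision adds procfs_path support: on Linux it points psutil at an
# alternate /proc before the system checks run. psutil.PROCFS_PATH is a real
# psutil setting; the /host/proc path below is only an illustration of the
# containerized case this is meant for, where the host's proc filesystem is
# bind-mounted into the agent container.
import psutil

psutil.PROCFS_PATH = "/host/proc"  # illustrative: host /proc mounted into a container
print psutil.cpu_times()           # now reads /host/proc/stat instead of /proc/stat
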
class Collector(object):
    def __init__(self, agentConfig, emitters, systemStats, hostname):
        self.ip = get_ip(agentConfig)
        self.emit_duration = None
        self.agentConfig = agentConfig
        self.hostname = hostname
        self.agentConfig['system_stats'] = systemStats
        self.os = get_os()
        self.plugins = None
        self.emitters = emitters
        self.check_timings = agentConfig.get('check_timings')
        self.push_times = {
            'host_metadata': {
                'start': time.time(),
                'interval': int(agentConfig.get('metadata_interval', 4 * 60 * 60))
            },
            'external_host_tags': {
                'start': time.time() - 3 * 60,
                'interval': int(agentConfig.get('external_host_tags', 5 * 60))
            },
            'agent_checks': {
                'start': time.time(),
                'interval': int(agentConfig.get('agent_checks_interval', 10 * 60))
            },
        }
        socket.setdefaulttimeout(15)
        self.run_count = 0
        self.continue_running = True
        self.hostname_metadata_cache = None
        self.initialized_checks_d = []
        self.init_failed_checks_d = {}

        self._unix_system_checks = {
            'io': u.IO(log),
            'load': u.Load(log),
            'memory': u.Memory(log),
            'processes': u.Processes(log),
            'cpu': u.Cpu(log),
            'system': u.System(log)
        }

        self._win32_system_checks = {
            'io': w32.IO(log),
            'proc': w32.Processes(log),
            'memory': w32.Memory(log),
            'network': w32.Network(log),
            'cpu': w32.Cpu(log),
            'system': w32.System(log)
        }

        self._ganglia = Ganglia(log)
        self._monitorstream = monitorstreams.init(log, self.agentConfig)
        self._ddforwarder = DdForwarder(log, self.agentConfig)

        self._agent_metrics = None

        self._metrics_checks = []

        for module_spec in [s.strip() for s in self.agentConfig.get('custom_checks', '').split(',')]:
            if len(module_spec) == 0:
                continue
            try:
                self._metrics_checks.append(modules.load(module_spec, 'Check')(log))
                log.info("Registered custom check %s" % module_spec)
                log.warning("Old format custom checks are deprecated. They should be moved to the checks.d interface as old custom checks will be removed in a next version")
            except Exception:
                log.exception('Unable to load custom check module %s' % module_spec)

    def stop(self):
        self.continue_running = False
        for check in self.initialized_checks_d:
            check.stop()

    @staticmethod
    def _stats_for_display(raw_stats):
        return pprint.pformat(raw_stats, indent=4)

    @log_exceptions(log)
    def run(self, checksd=None, start_event=True, configs_reloaded=False):
        log.debug("Found {num_checks} checks".format(num_checks=len(checksd['initialized_checks'])))
        timer = Timer()
        if not Platform.is_windows():
            cpu_clock = time.clock()
        self.run_count += 1
        log.debug("Starting collection run #%s" % self.run_count)

        if checksd:
            self.initialized_checks_d = checksd['initialized_checks']
            self.init_failed_checks_d = checksd['init_failed_checks']

        payload = AgentPayload()

        if not self._agent_metrics or configs_reloaded:
            for check in self.initialized_checks_d:
                if check.name == AGENT_METRICS_CHECK_NAME:
                    self._agent_metrics = check
                    self.initialized_checks_d.remove(check)
                    break

        self._build_payload(payload)

        metrics = payload['metrics']
        events = payload['events']
        service_checks = payload['service_checks']

        if Platform.is_windows():
            try:
                metrics.extend(self._win32_system_checks['memory'].check(self.agentConfig))
                metrics.extend(self._win32_system_checks['cpu'].check(self.agentConfig))
                metrics.extend(self._win32_system_checks['network'].check(self.agentConfig))
                metrics.extend(self._win32_system_checks['io'].check(self.agentConfig))
                metrics.extend(self._win32_system_checks['proc'].check(self.agentConfig))
                metrics.extend(self._win32_system_checks['system'].check(self.agentConfig))
            except Exception:
                log.exception('Unable to fetch Windows system metrics.')
        else:
            sys_checks = self._unix_system_checks

            load = sys_checks['load'].check(self.agentConfig)
            payload.update(load)

            system = sys_checks['system'].check(self.agentConfig)
            payload.update(system)

            memory = sys_checks['memory'].check(self.agentConfig)
            if memory:
                memstats = {
                    'memPhysUsed': memory.get('physUsed'),
                    'memPhysPctUsage': memory.get('physPctUsage'),
                    'memPhysFree': memory.get('physFree'),
                    'memPhysTotal': memory.get('physTotal'),
                    'memPhysUsable': memory.get('physUsable'),
                    'memSwapUsed': memory.get('swapUsed'),
                    'memSwapFree': memory.get('swapFree'),
                    'memSwapPctFree': memory.get('swapPctFree'),
                    'memSwapTotal': memory.get('swapTotal'),
                    'memCached': memory.get('physCached'),
                    'memBuffers': memory.get('physBuffers'),
                    'memShared': memory.get('physShared'),
                    'memSlab': memory.get('physSlab'),
                    'memPageTables': memory.get('physPageTables'),
                    'memSwapCached': memory.get('swapCached')
                }
                payload.update(memstats)

            ioStats = sys_checks['io'].check(self.agentConfig)
            if ioStats:
                payload['ioStats'] = ioStats

            processes = sys_checks['processes'].check(self.agentConfig)
            payload.update({'processes': processes})

            cpuStats = sys_checks['cpu'].check(self.agentConfig)
            if cpuStats:
                payload.update(cpuStats)

        gangliaData = self._ganglia.check(self.agentConfig)
        monitorstreamData = self._monitorstream.check(self.agentConfig)
        ddforwarderData = self._ddforwarder.check(self.agentConfig)

        if gangliaData is not False and gangliaData is not None:
            payload['ganglia'] = gangliaData

        if monitorstreamData:
            monitorstreamEvents = monitorstreamData.get('monitorstreamEvents', None)
            if monitorstreamEvents:
                if 'monitorstream' in payload['events']:
                    events['monitorstream'].extend(monitorstreamEvents)
                else:
                    events['monitorstream'] = monitorstreamEvents
                del monitorstreamData['monitorstreamEvents']
            payload.update(monitorstreamData)

        if ddforwarderData:
            payload['datamonitor'] = ddforwarderData

        for metrics_check in self._metrics_checks:
            res = metrics_check.check(self.agentConfig)
            if res:
                metrics.extend(res)

        check_statuses = []
        for check in self.initialized_checks_d:
            if not self.continue_running:
                return
            log.info("Running check %s" % check.name)
            instance_statuses = []
            metric_count = 0
            event_count = 0
            service_check_count = 0
            check_start_time = time.time()
            check_stats = None
            # Default the metadata so a failure in check.run() can't leave it unbound.
            current_check_metadata = None

            try:
                instance_statuses = check.run()

                current_check_metrics = check.get_metrics()
                current_check_events = check.get_events()
                check_stats = check._get_internal_profiling_stats()

                current_check_metadata = check.get_service_metadata()

                metrics.extend(current_check_metrics)
                if current_check_events:
                    if check.name not in events:
                        events[check.name] = current_check_events
                    else:
                        events[check.name] += current_check_events

                metric_count = len(current_check_metrics)
                event_count = len(current_check_events)
            except Exception:
                log.exception("Error running check %s" % check.name)

            check_status = CheckStatus(
                check.name, instance_statuses, metric_count,
                event_count, service_check_count,
                service_metadata=current_check_metadata,
                library_versions=check.get_library_info(),
                source_type_name=check.SOURCE_TYPE_NAME or check.name,
                check_stats=check_stats
            )

            service_check_tags = ["check:%s" % check.name]
            if check_status.status == STATUS_OK:
                status = AgentCheck.OK
            elif check_status.status == STATUS_ERROR:
                status = AgentCheck.CRITICAL
            check.service_check('datamonitor.agent.check_status', status, tags=service_check_tags)

            current_check_service_checks = check.get_service_checks()
            if current_check_service_checks:
                service_checks.extend(current_check_service_checks)
                service_check_count = len(current_check_service_checks)

            check_status.service_check_count = service_check_count
            check_statuses.append(check_status)

            check_run_time = time.time() - check_start_time
            log.debug("Check %s ran in %.2f s" % (check.name, check_run_time))

            if self.check_timings:
                metric = 'datamonitor.agent.check_run_time'
                meta = {'tags': ["check:%s" % check.name]}
                metrics.append((metric, time.time(), check_run_time, meta))

        for check_name, info in self.init_failed_checks_d.iteritems():
            if not self.continue_running:
                return
            check_status = CheckStatus(check_name, None, None, None, None,
                                       init_failed_error=info['error'],
                                       init_failed_traceback=info['traceback'])
            check_statuses.append(check_status)

        service_checks.append(create_service_check('datamonitor.agent.up', AgentCheck.OK, hostname=self.hostname))

        payload['metrics'] = metrics
        payload['events'] = events
        payload['service_checks'] = service_checks

        self._populate_payload_metadata(payload, check_statuses, start_event)

        collect_duration = timer.step()

        if self._agent_metrics:
            metric_context = {
                'collection_time': collect_duration,
                'emit_time': self.emit_duration,
            }
            if not Platform.is_windows():
                metric_context['cpu_time'] = time.clock() - cpu_clock

            self._agent_metrics.set_metric_context(payload, metric_context)
            self._agent_metrics.run()
            agent_stats = self._agent_metrics.get_metrics()
            payload['metrics'].extend(agent_stats)
            if self.agentConfig.get('developer_mode'):
                log.debug("\n Agent developer mode stats: \n {0}".format(
                    Collector._stats_for_display(agent_stats))
                )
            self._agent_metrics.get_service_metadata()

        emitter_statuses = payload.emit(log, self.agentConfig, self.emitters, self.continue_running)
        self.emit_duration = timer.step()

        try:
            CollectorStatus(check_statuses, emitter_statuses, self.hostname_metadata_cache).persist()
        except Exception:
            log.exception("Error persisting collector status")

        if self.run_count <= FLUSH_LOGGING_INITIAL or self.run_count % FLUSH_LOGGING_PERIOD == 0:
            log.info("Finished run #%s. Collection time: %ss. Emit time: %ss" %
                     (self.run_count, round(collect_duration, 2), round(self.emit_duration, 2)))
            if self.run_count == FLUSH_LOGGING_INITIAL:
                log.info("First flushes done, next flushes will be logged every %s flushes." % FLUSH_LOGGING_PERIOD)
        else:
            log.debug("Finished run #%s. Collection time: %ss. Emit time: %ss" %
                      (self.run_count, round(collect_duration, 2), round(self.emit_duration, 2)))

        return payload

    @staticmethod
    def run_single_check(check, verbose=True):
        log.info("Running check %s" % check.name)
        instance_statuses = []
        metric_count = 0
        event_count = 0
        service_check_count = 0
        check_stats = None

        try:
            instance_statuses = check.run()

            current_check_metrics = check.get_metrics()
            current_check_events = check.get_events()
            current_service_checks = check.get_service_checks()
            current_service_metadata = check.get_service_metadata()

            check_stats = check._get_internal_profiling_stats()

            metric_count = len(current_check_metrics)
            event_count = len(current_check_events)
            service_check_count = len(current_service_checks)

            print "Metrics: \n{0}".format(pprint.pformat(current_check_metrics))
            print "Events: \n{0}".format(pprint.pformat(current_check_events))
            print "Service Checks: \n{0}".format(pprint.pformat(current_service_checks))
            print "Service Metadata: \n{0}".format(pprint.pformat(current_service_metadata))
        except Exception:
            log.exception("Error running check %s" % check.name)

        check_status = CheckStatus(
            check.name, instance_statuses, metric_count,
            event_count, service_check_count,
            library_versions=check.get_library_info(),
            source_type_name=check.SOURCE_TYPE_NAME or check.name,
            check_stats=check_stats
        )

        return check_status

    def _emit(self, payload):
        statuses = []
        for emitter in self.emitters:
            if not self.continue_running:
                return statuses
            name = emitter.__name__
            emitter_status = EmitterStatus(name)
            try:
                emitter(payload, log, self.agentConfig)
            except Exception, e:
                log.exception("Error running emitter: %s" % emitter.__name__)
                emitter_status = EmitterStatus(name, e)
            statuses.append(emitter_status)
        return statuses

def __init__(self, agentConfig, emitters, systemStats, hostname):
    self.ip = get_ip(agentConfig)
    self.emit_duration = None
    self.agentConfig = agentConfig
    self.hostname = hostname
    self.agentConfig['system_stats'] = systemStats
    self.os = get_os()
    self.plugins = None
    self.emitters = emitters
    self.check_timings = agentConfig.get('check_timings')
    self.push_times = {
        'host_metadata': {
            'start': time.time(),
            'interval': int(agentConfig.get('metadata_interval', 4 * 60 * 60))
        },
        'external_host_tags': {
            'start': time.time() - 3 * 60,
            'interval': int(agentConfig.get('external_host_tags', 5 * 60))
        },
        'agent_checks': {
            'start': time.time(),
            'interval': int(agentConfig.get('agent_checks_interval', 10 * 60))
        },
    }
    socket.setdefaulttimeout(15)
    self.run_count = 0
    self.continue_running = True
    self.hostname_metadata_cache = None
    self.initialized_checks_d = []
    self.init_failed_checks_d = {}

    self._unix_system_checks = {
        'io': u.IO(log),
        'load': u.Load(log),
        'memory': u.Memory(log),
        'processes': u.Processes(log),
        'cpu': u.Cpu(log),
        'system': u.System(log)
    }

    self._win32_system_checks = {
        'io': w32.IO(log),
        'proc': w32.Processes(log),
        'memory': w32.Memory(log),
        'network': w32.Network(log),
        'cpu': w32.Cpu(log),
        'system': w32.System(log)
    }

    self._ganglia = Ganglia(log)
    self._monitorstream = monitorstreams.init(log, self.agentConfig)
    self._ddforwarder = DdForwarder(log, self.agentConfig)

    self._agent_metrics = None

    self._metrics_checks = []

    for module_spec in [s.strip() for s in self.agentConfig.get('custom_checks', '').split(',')]:
        if len(module_spec) == 0:
            continue
        try:
            self._metrics_checks.append(modules.load(module_spec, 'Check')(log))
            log.info("Registered custom check %s" % module_spec)
            log.warning("Old format custom checks are deprecated. They should be moved to the checks.d interface as old custom checks will be removed in a next version")
        except Exception:
            log.exception('Unable to load custom check module %s' % module_spec)

def __init__(self, agentConfig, emitters, systemStats, hostname):
    self.emit_duration = None
    self.agentConfig = agentConfig
    self.hostname = hostname
    # system stats is generated by config.get_system_stats
    self.agentConfig['system_stats'] = systemStats
    # agent config is used during checks, system_stats can be accessed through the config
    self.os = get_os()
    self.plugins = None
    self.emitters = emitters
    self.check_timings = agentConfig.get('check_timings')
    self.push_times = {
        'host_metadata': {
            'start': time.time(),
            'interval': int(agentConfig.get('metadata_interval', 4 * 60 * 60))
        },
        'external_host_tags': {
            'start': time.time() - 3 * 60,  # Wait for the checks to init
            'interval': int(agentConfig.get('external_host_tags', 5 * 60))
        },
        'agent_checks': {
            'start': time.time(),
            'interval': int(agentConfig.get('agent_checks_interval', 10 * 60))
        },
        'processes': {
            'start': time.time(),
            'interval': int(agentConfig.get('processes_interval', 60))
        }
    }
    socket.setdefaulttimeout(15)
    self.run_count = 0
    self.continue_running = True
    self.hostname_metadata_cache = None
    self.initialized_checks_d = []
    self.init_failed_checks_d = {}

    # Unix System Checks
    self._unix_system_checks = {
        'io': u.IO(log),
        'load': u.Load(log),
        'memory': u.Memory(log),
        'processes': u.Processes(log),
        'cpu': u.Cpu(log),
        'system': u.System(log)
    }

    # Win32 System Checks
    self._win32_system_checks = {
        'io': w32.IO(log),
        'proc': w32.Processes(log),
        'memory': w32.Memory(log),
        'network': w32.Network(log),
        'cpu': w32.Cpu(log),
        'system': w32.System(log)
    }

    # Old-style metric checks
    self._ganglia = Ganglia(log)
    self._dogstream = Dogstreams.init(log, self.agentConfig)
    self._ddforwarder = DdForwarder(log, self.agentConfig)

    # Agent performance metrics check
    self._agent_metrics = None

    self._metrics_checks = []

    # Custom metric checks
    for module_spec in [s.strip() for s in self.agentConfig.get('custom_checks', '').split(',')]:
        if len(module_spec) == 0:
            continue
        try:
            self._metrics_checks.append(modules.load(module_spec, 'Check')(log))
            log.info("Registered custom check %s" % module_spec)
            log.warning("Old format custom checks are deprecated. They should be moved to the checks.d interface as old custom checks will be removed in a next version")
        except Exception:
            log.exception('Unable to load custom check module %s' % module_spec)

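# push_times shows up in every recent revision above, but its consumer lies
# outside this excerpt. The gating logic is presumably shaped like the helper
# below; the method name _should_send is illustrative, not taken from the source.
def _should_send(self, data_name):
    entry = self.push_times[data_name]
    if time.time() - entry['start'] >= entry['interval']:
        entry['start'] = time.time()  # reset the window once we decide to send
        return True
    return False
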
class Collector(object):
    """
    The collector is responsible for collecting data from each check and
    passing it along to the emitters, who send it to their final destination.
    """

    def __init__(self, agentConfig, emitters, systemStats, hostname):
        self.emit_duration = None
        self.agentConfig = agentConfig
        self.hostname = hostname
        # system stats is generated by config.get_system_stats
        self.agentConfig["system_stats"] = systemStats
        # agent config is used during checks, system_stats can be accessed through the config
        self.os = get_os()
        self.plugins = None
        self.emitters = emitters
        self.check_timings = agentConfig.get("check_timings")
        self.push_times = {
            "host_metadata": {
                "start": time.time(),
                "interval": int(agentConfig.get("metadata_interval", 4 * 60 * 60)),
            },
            "external_host_tags": {
                "start": time.time() - 3 * 60,  # Wait for the checks to init
                "interval": int(agentConfig.get("external_host_tags", 5 * 60)),
            },
            "agent_checks": {
                "start": time.time(),
                "interval": int(agentConfig.get("agent_checks_interval", 10 * 60)),
            },
        }
        socket.setdefaulttimeout(15)
        self.run_count = 0
        self.continue_running = True
        self.hostname_metadata_cache = None
        self.initialized_checks_d = []
        self.init_failed_checks_d = {}

        # Unix System Checks
        self._unix_system_checks = {
            "io": u.IO(log),
            "load": u.Load(log),
            "memory": u.Memory(log),
            "processes": u.Processes(log),
            "cpu": u.Cpu(log),
            "system": common.System(log),
        }

        # Win32 System Checks
        self._win32_system_checks = {
            "io": w32.IO(log),
            "proc": w32.Processes(log),
            "memory": w32.Memory(log),
            "network": w32.Network(log),
            "cpu": w32.Cpu(log),
            "system": common.System(log),
        }

        # Old-style metric checks
        self._ganglia = Ganglia(log)
        self._dogstream = Dogstreams.init(log, self.agentConfig)
        self._ddforwarder = DdForwarder(log, self.agentConfig)

        # Agent performance metrics check
        self._agent_metrics = None

        self._metrics_checks = []

        # Custom metric checks
        for module_spec in [s.strip() for s in self.agentConfig.get("custom_checks", "").split(",")]:
            if len(module_spec) == 0:
                continue
            try:
                self._metrics_checks.append(modules.load(module_spec, "Check")(log))
                log.info("Registered custom check %s" % module_spec)
                log.warning(
                    "Old format custom checks are deprecated. They should be moved to the checks.d interface as old custom checks will be removed in a next version"
                )
            except Exception:
                log.exception("Unable to load custom check module %s" % module_spec)

        # Resource Checks
        self._resources_checks = [ResProcesses(log, self.agentConfig)]

    def stop(self):
        """
        Tell the collector to stop at the next logical point.
        """
        # This is called when the process is being killed, so
        # try to stop the collector as soon as possible.
        # Most importantly, don't try to submit to the emitters
        # because the forwarder is quite possibly already killed
        # in which case we'll get a misleading error in the logs.
        # Best to not even try.
        self.continue_running = False
        for check in self.initialized_checks_d:
            check.stop()

    @staticmethod
    def _stats_for_display(raw_stats):
        return pprint.pformat(raw_stats, indent=4)

    @log_exceptions(log)
    def run(self, checksd=None, start_event=True, configs_reloaded=False):
        """
        Collect data from each check and submit their data.
        """
        log.debug("Found {num_checks} checks".format(num_checks=len(checksd["initialized_checks"])))
        timer = Timer()
        if not Platform.is_windows():
            cpu_clock = time.clock()
        self.run_count += 1
        log.debug("Starting collection run #%s" % self.run_count)

        if checksd:
            self.initialized_checks_d = checksd["initialized_checks"]  # is a list of AgentCheck instances
            self.init_failed_checks_d = checksd["init_failed_checks"]  # is of type {check_name: {error, traceback}}

        payload = AgentPayload()

        # Find the AgentMetrics check and pop it out
        # This check must run at the end of the loop to collect info on agent performance
        if not self._agent_metrics or configs_reloaded:
            for check in self.initialized_checks_d:
                if check.name == AGENT_METRICS_CHECK_NAME:
                    self._agent_metrics = check
                    self.initialized_checks_d.remove(check)
                    break

        # Initialize payload
        self._build_payload(payload)

        metrics = payload["metrics"]
        events = payload["events"]
        service_checks = payload["service_checks"]

        # Run the system checks. Checks will depend on the OS
        if Platform.is_windows():
            # Win32 system checks
            try:
                metrics.extend(self._win32_system_checks["memory"].check(self.agentConfig))
                metrics.extend(self._win32_system_checks["cpu"].check(self.agentConfig))
                metrics.extend(self._win32_system_checks["network"].check(self.agentConfig))
                metrics.extend(self._win32_system_checks["io"].check(self.agentConfig))
                metrics.extend(self._win32_system_checks["proc"].check(self.agentConfig))
            except Exception:
                log.exception("Unable to fetch Windows system metrics.")
        else:
            # Unix system checks
            sys_checks = self._unix_system_checks

            load = sys_checks["load"].check(self.agentConfig)
            payload.update(load)

            system = sys_checks["system"].check(self.agentConfig)
            payload.update(system)

            memory = sys_checks["memory"].check(self.agentConfig)
            if memory:
                memstats = {
                    "memPhysUsed": memory.get("physUsed"),
                    "memPhysPctUsable": memory.get("physPctUsable"),
                    "memPhysFree": memory.get("physFree"),
                    "memPhysTotal": memory.get("physTotal"),
                    "memPhysUsable": memory.get("physUsable"),
                    "memSwapUsed": memory.get("swapUsed"),
                    "memSwapFree": memory.get("swapFree"),
                    "memSwapPctFree": memory.get("swapPctFree"),
                    "memSwapTotal": memory.get("swapTotal"),
                    "memCached": memory.get("physCached"),
                    "memBuffers": memory.get("physBuffers"),
                    "memShared": memory.get("physShared"),
                    "memSlab": memory.get("physSlab"),
                    "memPageTables": memory.get("physPageTables"),
                    "memSwapCached": memory.get("swapCached"),
                }
                payload.update(memstats)

            ioStats = sys_checks["io"].check(self.agentConfig)
            if ioStats:
                payload["ioStats"] = ioStats

            processes = sys_checks["processes"].check(self.agentConfig)
            payload.update({"processes": processes})

            cpuStats = sys_checks["cpu"].check(self.agentConfig)
            if cpuStats:
                payload.update(cpuStats)

        # Run old-style checks
        gangliaData = self._ganglia.check(self.agentConfig)
        dogstreamData = self._dogstream.check(self.agentConfig)
        ddforwarderData = self._ddforwarder.check(self.agentConfig)

        if gangliaData is not False and gangliaData is not None:
            payload["ganglia"] = gangliaData

        # dogstream
        if dogstreamData:
            dogstreamEvents = dogstreamData.get("dogstreamEvents", None)
            if dogstreamEvents:
                if "dogstream" in payload["events"]:
                    events["dogstream"].extend(dogstreamEvents)
                else:
                    events["dogstream"] = dogstreamEvents
                del dogstreamData["dogstreamEvents"]
            payload.update(dogstreamData)

        # metrics about the forwarder
        if ddforwarderData:
            payload["datadog"] = ddforwarderData

        # Resources checks
        if not Platform.is_windows():
            has_resource = False
            for resources_check in self._resources_checks:
                try:
                    resources_check.check()
                    snaps = resources_check.pop_snapshots()
                    if snaps:
                        has_resource = True
                        res_value = {
                            "snaps": snaps,
                            "format_version": resources_check.get_format_version(),
                        }
                        res_format = resources_check.describe_format_if_needed()
                        if res_format is not None:
                            res_value["format_description"] = res_format
                        payload["resources"][resources_check.RESOURCE_KEY] = res_value
                except Exception:
                    log.exception("Error running resource check %s" % resources_check.RESOURCE_KEY)

            if has_resource:
                payload["resources"]["meta"] = {
                    "api_key": self.agentConfig["api_key"],
                    "host": payload["internalHostname"],
                }

        # newer-style checks (not checks.d style)
        for metrics_check in self._metrics_checks:
            res = metrics_check.check(self.agentConfig)
            if res:
                metrics.extend(res)

        # checks.d checks
        check_statuses = []
        for check in self.initialized_checks_d:
            if not self.continue_running:
                return
            log.info("Running check %s" % check.name)
            instance_statuses = []
            metric_count = 0
            event_count = 0
            service_check_count = 0
            check_start_time = time.time()
            check_stats = None
            # Default the metadata so a failure in check.run() can't leave it unbound.
            current_check_metadata = None

            try:
                # Run the check.
                instance_statuses = check.run()

                # Collect the metrics and events.
                current_check_metrics = check.get_metrics()
                current_check_events = check.get_events()
                check_stats = check._get_internal_profiling_stats()

                # Collect metadata
                current_check_metadata = check.get_service_metadata()

                # Save metrics & events for the payload.
                metrics.extend(current_check_metrics)
                if current_check_events:
                    if check.name not in events:
                        events[check.name] = current_check_events
                    else:
                        events[check.name] += current_check_events

                # Save the status of the check.
                metric_count = len(current_check_metrics)
                event_count = len(current_check_events)
            except Exception:
                log.exception("Error running check %s" % check.name)

            check_status = CheckStatus(
                check.name, instance_statuses, metric_count,
                event_count, service_check_count,
                service_metadata=current_check_metadata,
                library_versions=check.get_library_info(),
                source_type_name=check.SOURCE_TYPE_NAME or check.name,
                check_stats=check_stats,
            )

            # Service check for Agent checks failures
            service_check_tags = ["check:%s" % check.name]
            if check_status.status == STATUS_OK:
                status = AgentCheck.OK
            elif check_status.status == STATUS_ERROR:
                status = AgentCheck.CRITICAL
            check.service_check("datadog.agent.check_status", status, tags=service_check_tags)

            # Collect the service checks and save them in the payload
            current_check_service_checks = check.get_service_checks()
            if current_check_service_checks:
                service_checks.extend(current_check_service_checks)
                service_check_count = len(current_check_service_checks)

            # Update the check status with the correct service_check_count
            check_status.service_check_count = service_check_count
            check_statuses.append(check_status)

            check_run_time = time.time() - check_start_time
            log.debug("Check %s ran in %.2f s" % (check.name, check_run_time))

            # Instrument check run timings if enabled.
            if self.check_timings:
                metric = "datadog.agent.check_run_time"
                meta = {"tags": ["check:%s" % check.name]}
                metrics.append((metric, time.time(), check_run_time, meta))

        for check_name, info in self.init_failed_checks_d.iteritems():
            if not self.continue_running:
                return
            check_status = CheckStatus(
                check_name, None, None, None, None,
                init_failed_error=info["error"],
                init_failed_traceback=info["traceback"],
            )
            check_statuses.append(check_status)

        # Add a service check for the agent
        service_checks.append(create_service_check("datadog.agent.up", AgentCheck.OK, hostname=self.hostname))

        # Store the metrics and events in the payload.
        payload["metrics"] = metrics
        payload["events"] = events
        payload["service_checks"] = service_checks

        # Populate metadata
        self._populate_payload_metadata(payload, check_statuses, start_event)

        collect_duration = timer.step()

        if self._agent_metrics:
            metric_context = {"collection_time": collect_duration, "emit_time": self.emit_duration}
            if not Platform.is_windows():
                metric_context["cpu_time"] = time.clock() - cpu_clock

            self._agent_metrics.set_metric_context(payload, metric_context)
            self._agent_metrics.run()
            agent_stats = self._agent_metrics.get_metrics()
            payload["metrics"].extend(agent_stats)
            if self.agentConfig.get("developer_mode"):
                log.debug("\n Agent developer mode stats: \n {0}".format(Collector._stats_for_display(agent_stats)))

        # Let's send our payload
        emitter_statuses = payload.emit(log, self.agentConfig, self.emitters, self.continue_running)
        self.emit_duration = timer.step()

        # Persist the status of the collection run.
        try:
            CollectorStatus(check_statuses, emitter_statuses, self.hostname_metadata_cache).persist()
        except Exception:
            log.exception("Error persisting collector status")

        if self.run_count <= FLUSH_LOGGING_INITIAL or self.run_count % FLUSH_LOGGING_PERIOD == 0:
            log.info(
                "Finished run #%s. Collection time: %ss. Emit time: %ss"
                % (self.run_count, round(collect_duration, 2), round(self.emit_duration, 2))
            )
            if self.run_count == FLUSH_LOGGING_INITIAL:
                log.info("First flushes done, next flushes will be logged every %s flushes." % FLUSH_LOGGING_PERIOD)
        else:
            log.debug(
                "Finished run #%s. Collection time: %ss. Emit time: %ss"
                % (self.run_count, round(collect_duration, 2), round(self.emit_duration, 2))
            )

        return payload

    @staticmethod
    def run_single_check(check, verbose=True):
        log.info("Running check %s" % check.name)
        instance_statuses = []
        metric_count = 0
        event_count = 0
        service_check_count = 0
        check_stats = None

        try:
            # Run the check.
            instance_statuses = check.run()

            # Collect the metrics and events.
            current_check_metrics = check.get_metrics()
            current_check_events = check.get_events()
            current_service_checks = check.get_service_checks()
            current_service_metadata = check.get_service_metadata()

            check_stats = check._get_internal_profiling_stats()

            # Save the status of the check.
            metric_count = len(current_check_metrics)
            event_count = len(current_check_events)
            service_check_count = len(current_service_checks)

            print "Metrics: \n{0}".format(pprint.pformat(current_check_metrics))
            print "Events: \n{0}".format(pprint.pformat(current_check_events))
            print "Service Checks: \n{0}".format(pprint.pformat(current_service_checks))
            print "Service Metadata: \n{0}".format(pprint.pformat(current_service_metadata))
        except Exception:
            log.exception("Error running check %s" % check.name)

        check_status = CheckStatus(
            check.name, instance_statuses, metric_count,
            event_count, service_check_count,
            library_versions=check.get_library_info(),
            source_type_name=check.SOURCE_TYPE_NAME or check.name,
            check_stats=check_stats,
        )

        return check_status

    def _emit(self, payload):
        """
        Send the payload via the emitters.
        """
        statuses = []
        for emitter in self.emitters:
            # Don't try to send to an emitter if we're stopping.
            if not self.continue_running:
                return statuses
            name = emitter.__name__
            emitter_status = EmitterStatus(name)
            try:
                emitter(payload, log, self.agentConfig)
            except Exception, e:
                log.exception("Error running emitter: %s" % emitter.__name__)
                emitter_status = EmitterStatus(name, e)
            statuses.append(emitter_status)
        return statuses
