def __init__(self, port, agentConfig, watchdog=True, skip_ssl_validation=False):
    """Set up the forwarder application: transaction plumbing plus an optional watchdog."""
    self._port = int(port)
    self._agentConfig = agentConfig
    self._metrics = {}

    # Wire the transaction machinery to this application instance.
    MetricTransaction.set_application(self)
    MetricTransaction.set_endpoints()
    self._tr_manager = TransactionManager(MAX_WAIT_FOR_REPLAY,
                                          MAX_QUEUE_SIZE, THROTTLING_DELAY)
    MetricTransaction.set_tr_manager(self._tr_manager)

    self._watchdog = None
    # The explicit argument wins; otherwise fall back to the config flag.
    self.skip_ssl_validation = (skip_ssl_validation
                                or agentConfig.get('skip_ssl_validation', False))
    if self.skip_ssl_validation:
        log.info("Skipping SSL hostname validation, useful when using a transparent proxy")

    if watchdog:
        timeout = TRANSACTION_FLUSH_INTERVAL * WATCHDOG_INTERVAL_MULTIPLIER
        self._watchdog = Watchdog(
            timeout,
            max_mem_mb=agentConfig.get('limit_memory_consumption', None))
def __init__(self, interval, metrics_aggregator, api_host, api_key=None, use_watchdog=False):
    """Reporter daemon-thread setup: flush cadence, aggregator handle, API target."""
    threading.Thread.__init__(self)
    self.daemon = True
    self.interval = int(interval)
    self.finished = threading.Event()
    self.metrics_aggregator = metrics_aggregator
    self.flush_count = 0

    self.watchdog = None
    if use_watchdog:
        from util import Watchdog  # lazy import: only needed on this path
        self.watchdog = Watchdog(WATCHDOG_TIMEOUT)

    self.api_key = api_key
    self.api_host = api_host

    # Default to HTTPS; honour an explicit scheme prefix on the host.
    self.http_conn_cls = http_client.HTTPSConnection
    parsed = re.match('^(https?)://(.*)', api_host)
    if parsed is not None:
        scheme, bare_host = parsed.group(1), parsed.group(2)
        self.api_host = bare_host
        if scheme == 'http':
            self.http_conn_cls = http_client.HTTPConnection
def _get_watchdog(self, check_freq, agentConfig):
    """Build and arm a Watchdog, or return None when disabled in the config."""
    # "watchdog" defaults to enabled when the key is absent.
    if not agentConfig.get("watchdog", True):
        return None
    dog = Watchdog(check_freq * WATCHDOG_MULTIPLIER,
                   max_mem_mb=agentConfig.get('limit_memory_consumption', None))
    dog.reset()
    return dog
def __init__(self, port, agentConfig, watchdog=True):
    """Forwarder application setup: transaction machinery and optional watchdog."""
    self._port = int(port)
    self._agentConfig = agentConfig
    self._metrics = {}

    # Hook the transaction layer up to this application.
    MetricTransaction.set_application(self)
    MetricTransaction.set_endpoints()
    self._tr_manager = TransactionManager(
        MAX_WAIT_FOR_REPLAY, MAX_QUEUE_SIZE, THROTTLING_DELAY)
    MetricTransaction.set_tr_manager(self._tr_manager)

    # Optional liveness monitor.
    self._watchdog = None
    if watchdog:
        self._watchdog = Watchdog(
            TRANSACTION_FLUSH_INTERVAL * WATCHDOG_INTERVAL_MULTIPLIER)
def run(self, agentConfig=None, run_forever=True):
    """Main loop of the collector.

    Loads config and checks.d checks, resolves the hostname (EC2 instance id
    fallback, deprecated), builds the emitter list, then runs checks on a
    fixed cadence until run_forever is falsified (tests pass run_forever=False
    to execute a single bootstrap pass).
    """
    agentLogger = logging.getLogger('agent')
    systemStats = get_system_stats()
    agentLogger.debug('System Properties: ' + str(systemStats))

    if agentConfig is None:
        agentConfig = get_config()

    # Load the checks.d checks
    checksd = load_check_directory(agentConfig)

    # Try to fetch instance Id from EC2 if not hostname has been set
    # in the config file.
    # DEPRECATED
    if agentConfig.get('hostname') is None and agentConfig.get('use_ec2_instance_id'):
        instanceId = EC2.get_instance_id()
        if instanceId is not None:
            agentLogger.info("Running on EC2, instanceId: %s" % instanceId)
            agentConfig['hostname'] = instanceId
        else:
            agentLogger.info('Not running on EC2, using hostname to identify this server')

    # Emitters: the built-in HTTP emitter plus any comma-separated custom ones.
    emitters = [http_emitter]
    for emitter_spec in [s.strip() for s in agentConfig.get('custom_emitters', '').split(',')]:
        if len(emitter_spec) == 0:
            continue
        emitters.append(modules.load(emitter_spec, 'emitter'))

    check_freq = int(agentConfig['check_freq'])

    # Checks instance
    c = checks(agentConfig, emitters)

    # Watchdog (enabled by default; timeout scales with the check frequency)
    watchdog = None
    if agentConfig.get("watchdog", True):
        watchdog = Watchdog(check_freq * WATCHDOG_MULTIPLIER)
        watchdog.reset()

    # Run checks once, to get once-in-a-run data
    c.doChecks(True, systemStats, checksd)

    # Main loop
    while run_forever:
        if watchdog is not None:
            watchdog.reset()
        time.sleep(check_freq)
        c.doChecks(checksd=checksd)
def __init__(self, interval, metrics_aggregator, api_host, api_key=None, use_watchdog=False, event_chunk_size=None):
    """Reporter thread setup: flush cadence, aggregator handle, API target, event chunking."""
    threading.Thread.__init__(self)
    self.interval = int(interval)
    self.finished = threading.Event()
    self.metrics_aggregator = metrics_aggregator
    self.flush_count = 0
    self.log_count = 0

    self.watchdog = None
    if use_watchdog:
        from util import Watchdog  # lazy import: only needed on this path
        self.watchdog = Watchdog(WATCHDOG_TIMEOUT)

    self.api_key = api_key
    self.api_host = api_host
    # Fall back to the module default when no (or a falsy) chunk size is given.
    self.event_chunk_size = event_chunk_size or EVENT_CHUNK_SIZE

    # Default to HTTPS; honour an explicit scheme prefix on the host.
    self.http_conn_cls = http_client.HTTPSConnection
    parsed = re.match('^(https?)://(.*)', api_host)
    if parsed is not None:
        scheme, bare_host = parsed.group(1), parsed.group(2)
        self.api_host = bare_host
        if scheme == 'http':
            self.http_conn_cls = http_client.HTTPConnection
def __init__(self, port, agentConfig, watchdog=True, skip_ssl_validation=False, use_simple_http_client=False):
    """Forwarder application setup: endpoints, transaction manager, optional watchdog."""
    self._port = int(port)
    self._agentConfig = agentConfig
    self._metrics = {}

    AgentTransaction.set_application(self)
    AgentTransaction.set_endpoints(agentConfig['endpoints'])
    AgentTransaction.set_request_timeout(agentConfig['forwarder_timeout'])

    # With more than one endpoint, flush transactions in parallel.
    if len(agentConfig['endpoints']) > 1:
        parallelism = self.DEFAULT_PARALLELISM
    else:
        parallelism = self.NO_PARALLELISM
    self._tr_manager = TransactionManager(MAX_WAIT_FOR_REPLAY, MAX_QUEUE_SIZE,
                                          THROTTLING_DELAY,
                                          max_parallelism=parallelism)
    AgentTransaction.set_tr_manager(self._tr_manager)

    self._watchdog = None
    self.skip_ssl_validation = (skip_ssl_validation
                                or agentConfig.get('skip_ssl_validation', False))
    self.use_simple_http_client = use_simple_http_client
    if self.skip_ssl_validation:
        log.info("Skipping SSL hostname validation, useful when using a transparent proxy")

    # Monitor activity
    if watchdog:
        # divided by 1000: presumably ms -> s for the watchdog -- TODO confirm
        timeout = TRANSACTION_FLUSH_INTERVAL * WATCHDOG_INTERVAL_MULTIPLIER / 1000
        self._watchdog = Watchdog(
            timeout,
            max_mem_mb=agentConfig.get('limit_memory_consumption', None),
            max_resets=WATCHDOG_HIGH_ACTIVITY_THRESHOLD
        )
def __init__(self, interval, metrics_aggregator, api_host, api_key=None, use_watchdog=False, event_chunk_size=None):
    """Reporter thread setup; api_host is used verbatim (no scheme stripping here)."""
    threading.Thread.__init__(self)
    self.interval = int(interval)
    self.finished = threading.Event()
    self.metrics_aggregator = metrics_aggregator
    self.flush_count = 0
    self.log_count = 0

    self.watchdog = None
    if use_watchdog:
        from util import Watchdog  # lazy import: only needed when enabled
        self.watchdog = Watchdog(WATCHDOG_TIMEOUT)

    self.api_key = api_key
    self.api_host = api_host
    # Fall back to the module default when no (or a falsy) chunk size is given.
    self.event_chunk_size = event_chunk_size or EVENT_CHUNK_SIZE
def __init__(self, port, agentConfig, watchdog=True, skip_ssl_validation=False, use_simple_http_client=False):
    """Forwarder application setup: transaction machinery, SSL policy, watchdog."""
    self._port = int(port)
    self._agentConfig = agentConfig
    self._metrics = {}

    # Wire the transaction layer to this application.
    AgentTransaction.set_application(self)
    AgentTransaction.set_endpoints()
    self._tr_manager = TransactionManager(MAX_WAIT_FOR_REPLAY,
                                          MAX_QUEUE_SIZE, THROTTLING_DELAY)
    AgentTransaction.set_tr_manager(self._tr_manager)

    self._watchdog = None
    self.skip_ssl_validation = (skip_ssl_validation
                                or agentConfig.get('skip_ssl_validation', False))
    self.use_simple_http_client = use_simple_http_client
    if self.skip_ssl_validation:
        log.info("Skipping SSL hostname validation, useful when using a transparent proxy")

    # Monitor activity
    if watchdog:
        # divided by 1000: presumably ms -> s for the watchdog -- TODO confirm
        timeout = TRANSACTION_FLUSH_INTERVAL * WATCHDOG_INTERVAL_MULTIPLIER / 1000
        self._watchdog = Watchdog(
            timeout,
            max_mem_mb=agentConfig.get('limit_memory_consumption', None),
            max_resets=WATCHDOG_HIGH_ACTIVITY_THRESHOLD)
def use_lots_of_memory(self):
    """Drive the forwarder with a memory-hogging tx manager to trip the watchdog's memory limit."""
    # Skip this step on travis
    if os.environ.get('TRAVIS', False):
        return
    app = Application(12345, {})
    app._watchdog = Watchdog(30, 50)
    app._tr_manager = MemoryHogTxManager()
    app.run()
def __init__(self, port, agentConfig):
    """Application setup: always-on watchdog plus transaction manager."""
    self._port = port  # NOTE(review): not coerced to int here, unlike later variants
    self._agentConfig = agentConfig
    self._metrics = {}
    timeout = TRANSACTION_FLUSH_INTERVAL * WATCHDOG_INTERVAL_MULTIPLIER
    self._watchdog = Watchdog(timeout)
    MetricTransaction.set_application(self)
    self._tr_manager = TransactionManager(MAX_WAIT_FOR_REPLAY,
                                          MAX_QUEUE_SIZE, THROTTLING_DELAY)
    MetricTransaction.set_tr_manager(self._tr_manager)
def __init__(self, port, agentConfig, watchdog=True, skip_ssl_validation=False, use_simple_http_client=False):
    """Forwarder application setup: endpoints (with empty-endpoint warning),
    transaction manager with optional parallelism, SSL policy, watchdog."""
    self._port = int(port)
    self._agentConfig = agentConfig
    self._metrics = {}

    AgentTransaction.set_application(self)
    AgentTransaction.set_endpoints(agentConfig['endpoints'])
    if agentConfig['endpoints'] == {}:
        # Nothing to forward to: every payload will be dropped.
        log.warning(u"No valid endpoint found. Forwarder will drop all incoming payloads.")
    AgentTransaction.set_request_timeout(agentConfig['forwarder_timeout'])

    # With more than one endpoint, flush transactions in parallel.
    if len(agentConfig['endpoints']) > 1:
        parallelism = self.DEFAULT_PARALLELISM
    else:
        parallelism = self.NO_PARALLELISM
    self._tr_manager = TransactionManager(MAX_WAIT_FOR_REPLAY, MAX_QUEUE_SIZE,
                                          THROTTLING_DELAY,
                                          max_parallelism=parallelism)
    AgentTransaction.set_tr_manager(self._tr_manager)

    self._watchdog = None
    self.skip_ssl_validation = (skip_ssl_validation
                                or agentConfig.get('skip_ssl_validation', False))
    self.use_simple_http_client = use_simple_http_client
    if self.skip_ssl_validation:
        log.info("Skipping SSL hostname validation, useful when using a transparent proxy")

    # Monitor activity
    if watchdog:
        # divided by 1000: presumably ms -> s for the watchdog -- TODO confirm
        timeout = TRANSACTION_FLUSH_INTERVAL * WATCHDOG_INTERVAL_MULTIPLIER / 1000
        self._watchdog = Watchdog(
            timeout,
            max_mem_mb=agentConfig.get('limit_memory_consumption', None),
            max_resets=WATCHDOG_HIGH_ACTIVITY_THRESHOLD)
def __init__(self, port, agentConfig, watchdog=True, skip_ssl_validation=False):
    """Forwarder application setup: transaction plumbing, SSL policy, optional watchdog."""
    self._port = int(port)
    self._agentConfig = agentConfig
    self._metrics = {}

    # Register this application with the transaction layer.
    MetricTransaction.set_application(self)
    MetricTransaction.set_endpoints()
    self._tr_manager = TransactionManager(
        MAX_WAIT_FOR_REPLAY, MAX_QUEUE_SIZE, THROTTLING_DELAY)
    MetricTransaction.set_tr_manager(self._tr_manager)

    self._watchdog = None
    # Explicit argument wins; otherwise consult the config flag.
    self.skip_ssl_validation = (skip_ssl_validation
                                or agentConfig.get('skip_ssl_validation', False))
    if self.skip_ssl_validation:
        log.info("Skipping SSL hostname validation, useful when using a transparent proxy")

    if watchdog:
        self._watchdog = Watchdog(
            TRANSACTION_FLUSH_INTERVAL * WATCHDOG_INTERVAL_MULTIPLIER,
            max_mem_mb=agentConfig.get('limit_memory_consumption', None))
def run(self, agentConfig=None, run_forever=True):
    """Main loop of the collector.

    Loads config and checks.d checks, resolves the hostname (EC2 instance id
    fallback, deprecated), builds the emitter list, then drives a Collector
    on a fixed cadence.  Tests pass run_forever=False for a no-op loop.
    """
    agentLogger = logging.getLogger('agent')
    systemStats = get_system_stats()
    if agentConfig is None:
        agentConfig = get_config()

    # Load the checks.d checks
    checksd = load_check_directory(agentConfig)

    # Try to fetch instance Id from EC2 if not hostname has been set
    # in the config file.
    # DEPRECATED
    if agentConfig.get('hostname') is None and agentConfig.get('use_ec2_instance_id'):
        instanceId = EC2.get_instance_id()
        if instanceId is not None:
            agentLogger.info("Running on EC2, instanceId: %s" % instanceId)
            agentConfig['hostname'] = instanceId
        else:
            agentLogger.info('Not running on EC2, using hostname to identify this server')

    # Emitters: the built-in HTTP emitter plus any comma-separated custom ones.
    emitters = [http_emitter]
    for emitter_spec in [s.strip() for s in agentConfig.get('custom_emitters', '').split(',')]:
        if len(emitter_spec) == 0:
            continue
        emitters.append(modules.load(emitter_spec, 'emitter'))

    check_freq = int(agentConfig['check_freq'])

    # Checks instance
    collector = Collector(agentConfig, emitters, systemStats)

    # Watchdog (enabled by default; timeout scales with the check frequency)
    watchdog = None
    if agentConfig.get("watchdog", True):
        watchdog = Watchdog(check_freq * WATCHDOG_MULTIPLIER)
        watchdog.reset()

    # Main loop: run checks first, then pet the watchdog, then sleep.
    while run_forever:
        collector.run(checksd=checksd)
        if watchdog is not None:
            watchdog.reset()
        time.sleep(check_freq)
def test_watchdog_frenesy_detection(self, mock_restarted): """ Watchdog restarts the process on suspicious high activity. """ # Limit the restart timeframe for test purpose Watchdog._RESTART_TIMEFRAME = 1 # Create a watchdog with a low activity tolerancy process_watchdog = Watchdog(10, max_resets=3) ping_watchdog = process_watchdog.reset with self.set_time(1): # Can be reset 3 times within the watchdog timeframe for x in xrange(0, 3): ping_watchdog() # On the 4th attempt, the watchdog detects a suspicously high activity self.assertRaises(WatchdogKill, ping_watchdog) with self.set_time(3): # Gets back to normal when the activity timeframe expires. ping_watchdog()
class Reporter(threading.Thread):
    """
    The reporter periodically sends the aggregated metrics to the
    server.
    """

    def __init__(self, interval, metrics_aggregator, api_host, api_key=None, use_watchdog=False, event_chunk_size=None):
        # interval: seconds between flushes; api_host may carry an http(s):// scheme.
        threading.Thread.__init__(self)
        self.interval = int(interval)
        self.finished = threading.Event()
        self.metrics_aggregator = metrics_aggregator
        self.flush_count = 0
        self.log_count = 0
        self.watchdog = None
        if use_watchdog:
            # Lazy import: watchdog only needed when enabled.
            from util import Watchdog
            self.watchdog = Watchdog(WATCHDOG_TIMEOUT)
        self.api_key = api_key
        self.api_host = api_host
        self.event_chunk_size = event_chunk_size or EVENT_CHUNK_SIZE
        # Default to HTTPS; an explicit scheme prefix on api_host overrides it.
        self.http_conn_cls = http_client.HTTPSConnection
        match = re.match('^(https?)://(.*)', api_host)
        if match:
            self.api_host = match.group(2)
            if match.group(1) == 'http':
                self.http_conn_cls = http_client.HTTPConnection

    def stop(self):
        """Signal the run loop to exit after the current wait."""
        log.info("Stopping reporter")
        self.finished.set()

    def run(self):
        """Flush the aggregator every `interval` seconds until stopped."""
        log.info("Reporting to %s every %ss" % (self.api_host, self.interval))
        log.debug("Watchdog enabled: %s" % bool(self.watchdog))

        # Persist a start-up message.
        DogstatsdStatus().persist()

        while not self.finished.isSet():  # Use camel case isSet for 2.4 support.
            self.finished.wait(self.interval)
            self.metrics_aggregator.send_packet_count('datadog.dogstatsd.packet.count')
            self.flush()
            if self.watchdog:
                self.watchdog.reset()

        # Clean up the status messages.
        log.debug("Stopped reporter")
        DogstatsdStatus.remove_latest_status()

    def flush(self):
        """Flush metrics and events once; logs at INFO early on, then mostly DEBUG."""
        try:
            self.flush_count += 1
            self.log_count += 1
            packets_per_second = self.metrics_aggregator.packets_per_second(self.interval)
            packet_count = self.metrics_aggregator.total_count

            metrics = self.metrics_aggregator.flush()
            count = len(metrics)
            # Periodically re-enable INFO logging for one cycle.
            if self.flush_count % FLUSH_LOGGING_PERIOD == 0:
                self.log_count = 0
            if count:
                self.submit(metrics)

            events = self.metrics_aggregator.flush_events()
            event_count = len(events)
            if event_count:
                self.submit_events(events)

            should_log = self.flush_count <= FLUSH_LOGGING_INITIAL or self.log_count <= FLUSH_LOGGING_COUNT
            log_func = log.info
            if not should_log:
                log_func = log.debug
            log_func("Flush #%s: flushed %s metric%s and %s event%s" % (self.flush_count, count, plural(count), event_count, plural(event_count)))
            if self.flush_count == FLUSH_LOGGING_INITIAL:
                log.info("First flushes done, %s flushes will be logged every %s flushes." % (FLUSH_LOGGING_COUNT, FLUSH_LOGGING_PERIOD))

            # Persist a status message.
            packet_count = self.metrics_aggregator.total_count
            DogstatsdStatus(
                flush_count=self.flush_count,
                packet_count=packet_count,
                packets_per_second=packets_per_second,
                metric_count=count,
                event_count=event_count,
            ).persist()

        except Exception:
            if self.finished.isSet():
                log.debug("Couldn't flush metrics, but that's expected as we're stopping")
            else:
                log.exception("Error flushing metrics")

    def submit(self, metrics):
        """POST serialized metrics to /api/v1/series; returns the request duration in ms."""
        # Copy and pasted from dogapi, because it's a bit of a pain to distribute python
        # dependencies with the agent.
        body, headers = serialize_metrics(metrics)
        method = 'POST'
        params = {}
        if self.api_key:
            params['api_key'] = self.api_key
        url = '/api/v1/series?%s' % urlencode(params)
        start_time = time()
        status = None
        conn = self.http_conn_cls(self.api_host)
        try:
            conn.request(method, url, body, headers)
            #FIXME: add timeout handling code here
            response = conn.getresponse()
            status = response.status
            response.close()
        finally:
            conn.close()
        duration = round((time() - start_time) * 1000.0, 4)
        log.debug("%s %s %s%s (%sms)" % (status, method, self.api_host, url, duration))
        return duration

    def submit_events(self, events):
        """POST events to /intake in chunks of self.event_chunk_size."""
        headers = {'Content-Type': 'application/json'}
        method = 'POST'
        events_len = len(events)
        event_chunk_size = self.event_chunk_size
        for chunk in chunks(events, event_chunk_size):
            payload = {
                'apiKey': self.api_key,
                'events': {'api': chunk},
                'uuid': get_uuid(),
                'internalHostname': get_hostname()
            }
            params = {}
            if self.api_key:
                params['api_key'] = self.api_key
            url = '/intake?%s' % urlencode(params)
            status = None
            conn = self.http_conn_cls(self.api_host)
            try:
                start_time = time()
                conn.request(method, url, json.dumps(payload), headers)
                response = conn.getresponse()
                status = response.status
                response.close()
                duration = round((time() - start_time) * 1000.0, 4)
                log.debug("%s %s %s%s (%sms)" % (status, method, self.api_host, url, duration))
            finally:
                conn.close()
def fast_tornado(self):
    """Run the forwarder with a mock tx manager and a generous watchdog timeout."""
    app = Application(12345, {})
    app._watchdog = Watchdog(6)
    app._tr_manager = MockTxManager()
    app.run()
class Reporter(threading.Thread):
    """
    The reporter periodically sends the aggregated metrics to the
    server.
    """

    def __init__(self, interval, metrics_aggregator, api_host, api_key=None, use_watchdog=False, event_chunk_size=None):
        # interval: seconds between flushes; api_host is used verbatim in URLs.
        threading.Thread.__init__(self)
        self.interval = int(interval)
        self.finished = threading.Event()
        self.metrics_aggregator = metrics_aggregator
        self.flush_count = 0
        self.log_count = 0
        self.watchdog = None
        if use_watchdog:
            # Lazy import: watchdog only needed when enabled.
            from util import Watchdog
            self.watchdog = Watchdog(WATCHDOG_TIMEOUT)
        self.api_key = api_key
        self.api_host = api_host
        self.event_chunk_size = event_chunk_size or EVENT_CHUNK_SIZE

    def stop(self):
        """Signal the run loop to exit after the current wait."""
        log.info("Stopping reporter")
        self.finished.set()

    def run(self):
        """Flush the aggregator every `interval` seconds until stopped."""
        log.info("Reporting to %s every %ss" % (self.api_host, self.interval))
        log.debug("Watchdog enabled: %s" % bool(self.watchdog))

        # Persist a start-up message.
        DogstatsdStatus().persist()

        while not self.finished.isSet():  # Use camel case isSet for 2.4 support.
            self.finished.wait(self.interval)
            self.metrics_aggregator.send_packet_count('datadog.dogstatsd.packet.count')
            self.flush()
            if self.watchdog:
                self.watchdog.reset()

        # Clean up the status messages.
        log.debug("Stopped reporter")
        DogstatsdStatus.remove_latest_status()

    def flush(self):
        """Flush metrics, events and service checks once; throttles INFO logging."""
        try:
            self.flush_count += 1
            self.log_count += 1
            packets_per_second = self.metrics_aggregator.packets_per_second(self.interval)
            packet_count = self.metrics_aggregator.total_count

            metrics = self.metrics_aggregator.flush()
            count = len(metrics)
            # Periodically re-enable INFO logging for one cycle.
            if self.flush_count % FLUSH_LOGGING_PERIOD == 0:
                self.log_count = 0
            if count:
                self.submit(metrics)

            events = self.metrics_aggregator.flush_events()
            event_count = len(events)
            if event_count:
                self.submit_events(events)

            service_checks = self.metrics_aggregator.flush_service_checks()
            check_count = len(service_checks)
            if check_count:
                self.submit_service_checks(service_checks)

            should_log = self.flush_count <= FLUSH_LOGGING_INITIAL or self.log_count <= FLUSH_LOGGING_COUNT
            log_func = log.info
            if not should_log:
                log_func = log.debug
            log_func("Flush #%s: flushed %s metric%s, %s event%s, and %s service check run%s" % (self.flush_count, count, plural(count), event_count, plural(event_count), check_count, plural(check_count)))
            if self.flush_count == FLUSH_LOGGING_INITIAL:
                log.info("First flushes done, %s flushes will be logged every %s flushes." % (FLUSH_LOGGING_COUNT, FLUSH_LOGGING_PERIOD))

            # Persist a status message.
            packet_count = self.metrics_aggregator.total_count
            DogstatsdStatus(
                flush_count=self.flush_count,
                packet_count=packet_count,
                packets_per_second=packets_per_second,
                metric_count=count,
                event_count=event_count,
            ).persist()

        except Exception:
            if self.finished.isSet():
                log.debug("Couldn't flush metrics, but that's expected as we're stopping")
            else:
                log.exception("Error flushing metrics")

    def submit(self, metrics):
        """POST serialized metrics to /api/v1/series via submit_http."""
        body, headers = serialize_metrics(metrics)
        params = {}
        if self.api_key:
            params['api_key'] = self.api_key
        url = '%s/api/v1/series?%s' % (self.api_host, urlencode(params))
        self.submit_http(url, body, headers)

    def submit_events(self, events):
        """POST events to /intake in chunks of self.event_chunk_size."""
        headers = {'Content-Type': 'application/json'}
        event_chunk_size = self.event_chunk_size
        for chunk in chunks(events, event_chunk_size):
            payload = {
                'apiKey': self.api_key,
                'events': {'api': chunk},
                'uuid': get_uuid(),
                'internalHostname': get_hostname()
            }
            params = {}
            if self.api_key:
                params['api_key'] = self.api_key
            url = '%s/intake?%s' % (self.api_host, urlencode(params))
            self.submit_http(url, json.dumps(payload), headers)

    def submit_http(self, url, data, headers):
        """POST a payload with a 5s timeout; failures are logged, never raised."""
        headers["DD-Dogstatsd-Version"] = get_version()
        log.debug("Posting payload to %s" % url)
        try:
            start_time = time()
            r = requests.post(url, data=data, timeout=5, headers=headers)
            r.raise_for_status()
            if r.status_code >= 200 and r.status_code < 205:
                log.debug("Payload accepted")
            status = r.status_code
            duration = round((time() - start_time) * 1000.0, 4)
            log.debug("%s POST %s (%sms)" % (status, url, duration))
        except Exception:
            log.exception("Unable to post payload.")
            try:
                # r may be unbound if the POST itself failed.
                log.error("Received status code: {0}".format(r.status_code))
            except Exception:
                pass

    def submit_service_checks(self, service_checks):
        """POST service check runs to /api/v1/check_run via submit_http."""
        headers = {'Content-Type': 'application/json'}
        params = {}
        if self.api_key:
            params['api_key'] = self.api_key
        url = '{0}/api/v1/check_run?{1}'.format(self.api_host, urlencode(params))
        self.submit_http(url, json.dumps(service_checks), headers)
class Reporter(threading.Thread):
    """
    The reporter periodically sends the aggregated metrics to the
    server.
    """

    def __init__(self, interval, metrics_aggregator, api_host, api_key=None, use_watchdog=False):
        # interval: seconds between flushes; api_host may carry an http(s):// scheme.
        threading.Thread.__init__(self)
        self.daemon = True
        self.interval = int(interval)
        self.finished = threading.Event()
        self.metrics_aggregator = metrics_aggregator
        self.flush_count = 0
        self.watchdog = None
        if use_watchdog:
            # Lazy import: watchdog only needed when enabled.
            from util import Watchdog
            self.watchdog = Watchdog(WATCHDOG_TIMEOUT)
        self.api_key = api_key
        self.api_host = api_host
        # Default to HTTPS; an explicit scheme prefix on api_host overrides it.
        self.http_conn_cls = http_client.HTTPSConnection
        match = re.match('^(https?)://(.*)', api_host)
        if match:
            self.api_host = match.group(2)
            if match.group(1) == 'http':
                self.http_conn_cls = http_client.HTTPConnection

    def stop(self):
        """Signal the run loop to exit after the current wait."""
        self.finished.set()

    def run(self):
        """Flush the aggregator every `interval` seconds until stopped."""
        logger.info("Reporting to %s every %ss" % (self.api_host, self.interval))
        logger.debug("Watchdog enabled: %s" % bool(self.watchdog))
        while True:
            if self.finished.isSet():  # Use camel case version for 2.4 support.
                break
            self.finished.wait(self.interval)
            self.metrics_aggregator.send_packet_count('datadog.dogstatsd.packet.count')
            self.flush()
            if self.watchdog:
                self.watchdog.reset()

    def flush(self):
        """Flush metrics once; errors are logged, never propagated to the loop."""
        try:
            self.flush_count += 1
            metrics = self.metrics_aggregator.flush()
            count = len(metrics)
            if not count:
                logger.info("Flush #%s: No metrics to flush." % self.flush_count)
                return
            logger.info("Flush #%s: flushing %s metrics" % (self.flush_count, count))
            self.submit(metrics)
        # FIX: was a bare `except:` which also swallowed SystemExit and
        # KeyboardInterrupt; catch Exception only, as sibling versions do.
        except Exception:
            logger.exception("Error flushing metrics")

    def submit(self, metrics):
        # HACK - Copy and pasted from dogapi, because it's a bit of a pain to distribute python
        # dependencies with the agent.
        body = json.dumps({"series": metrics})
        headers = {'Content-Type': 'application/json'}
        method = 'POST'

        params = {}
        if self.api_key:
            params['api_key'] = self.api_key
        url = '/api/v1/series?%s' % urlencode(params)

        start_time = time()
        status = None
        conn = self.http_conn_cls(self.api_host)
        # FIX: the connection and response were never closed (socket leak on
        # every flush); close them deterministically like the other versions.
        try:
            conn.request(method, url, body, headers)
            #FIXME: add timeout handling code here
            response = conn.getresponse()
            status = response.status
            response.close()
        finally:
            conn.close()
        duration = round((time() - start_time) * 1000.0, 4)
        logger.info("%s %s %s%s (%sms)" % (status, method, self.api_host, url, duration))
class Application(tornado.web.Application):
    """Tornado forwarder application: buffers metrics and flushes them
    periodically through the transaction manager."""

    def __init__(self, port, agentConfig, watchdog=True):
        self._port = int(port)
        self._agentConfig = agentConfig
        self._metrics = {}
        MetricTransaction.set_application(self)
        MetricTransaction.set_endpoints()
        self._tr_manager = TransactionManager(MAX_WAIT_FOR_REPLAY,
                                              MAX_QUEUE_SIZE, THROTTLING_DELAY)
        MetricTransaction.set_tr_manager(self._tr_manager)

        self._watchdog = None
        if watchdog:
            watchdog_timeout = TRANSACTION_FLUSH_INTERVAL * WATCHDOG_INTERVAL_MULTIPLIER
            self._watchdog = Watchdog(watchdog_timeout)

    def appendMetric(self, prefix, name, host, device, ts, value):
        """Buffer one metric point under self._metrics[prefix][name]."""
        # FIX: dict.has_key() is deprecated (removed in Python 3); use the
        # `in` operator, matching the later versions of this class.
        if prefix in self._metrics:
            metrics = self._metrics[prefix]
        else:
            metrics = {}
            self._metrics[prefix] = metrics

        if name in metrics:
            metrics[name].append([host, device, ts, value])
        else:
            metrics[name] = [[host, device, ts, value]]

    def _postMetrics(self):
        """Wrap buffered metrics in a transaction and reset the buffer."""
        if len(self._metrics) > 0:
            self._metrics["uuid"] = getUuid()
            self._metrics["internalHostname"] = gethostname(self._agentConfig)
            self._metrics["apiKey"] = self._agentConfig["api_key"]
            MetricTransaction(self._metrics, {})
            self._metrics = {}

    def run(self):
        """Start the HTTP server, the periodic flush callback, the optional
        graphite listener, and enter the IO loop (blocks)."""
        handlers = [
            (r"/intake/?", AgentInputHandler),
            (r"/api/v1/series/?", ApiInputHandler),
            (r"/status/?", StatusHandler),
        ]

        settings = dict(
            cookie_secret="12oETzKXQAGaYdkL5gEmGeJJFuYh7EQnp2XdTP1o/Vo=",
            xsrf_cookies=False,
            debug=True)

        tornado.web.Application.__init__(self, handlers, **settings)
        http_server = tornado.httpserver.HTTPServer(self)
        http_server.listen(self._port)
        logging.info("Listening on port %d" % self._port)

        # Register callbacks
        self.mloop = tornado.ioloop.IOLoop.instance()

        def flush_trs():
            if self._watchdog:
                self._watchdog.reset()
            self._postMetrics()
            self._tr_manager.flush()

        tr_sched = tornado.ioloop.PeriodicCallback(flush_trs,
                                                   TRANSACTION_FLUSH_INTERVAL,
                                                   io_loop=self.mloop)

        # Register optional Graphite listener
        gport = self._agentConfig.get("graphite_listen_port", None)
        if gport is not None:
            logging.info("Starting graphite listener on port %s" % gport)
            from graphite import GraphiteServer
            gs = GraphiteServer(self, gethostname(self._agentConfig), io_loop=self.mloop)
            gs.listen(gport)

        # Start everything
        if self._watchdog:
            self._watchdog.reset()
        tr_sched.start()
        self.mloop.start()

    def stop(self):
        """Stop the IO loop, unblocking run()."""
        self.mloop.stop()
class Application(tornado.web.Application):
    """Tornado forwarder application: buffers metrics, flushes them through
    the transaction manager, and serves the agent intake endpoints."""

    # Transaction-manager parallelism: 1 endpoint -> serial, >1 -> parallel.
    NO_PARALLELISM = 1
    DEFAULT_PARALLELISM = 5

    def __init__(self, port, agentConfig, watchdog=True, skip_ssl_validation=False, use_simple_http_client=False):
        self._port = int(port)
        self._agentConfig = agentConfig
        self._metrics = {}
        AgentTransaction.set_application(self)
        AgentTransaction.set_endpoints(agentConfig['endpoints'])
        if agentConfig['endpoints'] == {}:
            # Nothing to forward to: every payload will be dropped.
            log.warning(u"No valid endpoint found. Forwarder will drop all incoming payloads.")
        AgentTransaction.set_request_timeout(agentConfig['forwarder_timeout'])
        max_parallelism = self.NO_PARALLELISM
        # Multiple endpoints => enable parallelism
        if len(agentConfig['endpoints']) > 1:
            max_parallelism = self.DEFAULT_PARALLELISM
        self._tr_manager = TransactionManager(MAX_WAIT_FOR_REPLAY,
                                              MAX_QUEUE_SIZE, THROTTLING_DELAY,
                                              max_parallelism=max_parallelism)
        AgentTransaction.set_tr_manager(self._tr_manager)

        self._watchdog = None
        self.skip_ssl_validation = skip_ssl_validation or agentConfig.get('skip_ssl_validation', False)
        self.use_simple_http_client = use_simple_http_client
        if self.skip_ssl_validation:
            log.info("Skipping SSL hostname validation, useful when using a transparent proxy")

        # Monitor activity
        if watchdog:
            # divided by 1000: presumably ms -> s for the watchdog -- TODO confirm
            watchdog_timeout = TRANSACTION_FLUSH_INTERVAL * WATCHDOG_INTERVAL_MULTIPLIER / 1000
            self._watchdog = Watchdog(
                watchdog_timeout,
                max_mem_mb=agentConfig.get('limit_memory_consumption', None),
                max_resets=WATCHDOG_HIGH_ACTIVITY_THRESHOLD
            )

    def log_request(self, handler):
        """
        Override the tornado logging method.
        If everything goes well, log level is DEBUG.
        Otherwise it's WARNING or ERROR depending on the response code.
        """
        if handler.get_status() < 400:
            log_method = log.debug
        elif handler.get_status() < 500:
            log_method = log.warning
        else:
            log_method = log.error
        request_time = 1000.0 * handler.request.request_time()
        log_method(
            u"%d %s %.2fms",
            handler.get_status(),
            handler._request_summary(), request_time
        )

    def appendMetric(self, prefix, name, host, device, ts, value):
        """Buffer one metric point under self._metrics[prefix][name]."""
        if prefix in self._metrics:
            metrics = self._metrics[prefix]
        else:
            metrics = {}
            self._metrics[prefix] = metrics

        if name in metrics:
            metrics[name].append([host, device, ts, value])
        else:
            metrics[name] = [[host, device, ts, value]]

    def _postMetrics(self):
        """Serialize buffered metrics into a transaction and reset the buffer."""
        if len(self._metrics) > 0:
            self._metrics['uuid'] = get_uuid()
            self._metrics['internalHostname'] = get_hostname(self._agentConfig)
            self._metrics['apiKey'] = self._agentConfig['api_key']
            MetricTransaction(json.dumps(self._metrics),
                              headers={'Content-Type': 'application/json'})
            self._metrics = {}

    def run(self):
        """Bind the HTTP server (with IPv6/bind-host fallbacks), start the
        periodic flush and optional graphite listener, then block in the IO loop."""
        handlers = [
            (r"/intake/?", AgentInputHandler),
            (r"/intake/metrics?", MetricsAgentInputHandler),
            (r"/intake/metadata?", MetadataAgentInputHandler),
            (r"/api/v1/series/?", ApiInputHandler),
            (r"/api/v1/check_run/?", ApiCheckRunHandler),
            (r"/status/?", StatusHandler),
        ]

        settings = dict(
            cookie_secret="12oETzKXQAGaYdkL5gEmGeJJFuYh7EQnp2XdTP1o/Vo=",
            xsrf_cookies=False,
            debug=False,
            log_function=self.log_request)

        non_local_traffic = self._agentConfig.get("non_local_traffic", False)

        tornado.web.Application.__init__(self, handlers, **settings)
        http_server = tornado.httpserver.HTTPServer(self)

        try:
            # non_local_traffic must be == True to match, not just some non-false value
            if non_local_traffic is True:
                http_server.listen(self._port)
            else:
                # localhost in lieu of 127.0.0.1 to support IPv6
                try:
                    http_server.listen(self._port, address=self._agentConfig['bind_host'])
                except gaierror:
                    log.warning("localhost seems undefined in your host file, using 127.0.0.1 instead")
                    http_server.listen(self._port, address="127.0.0.1")
                except socket_error as e:
                    if "Errno 99" in str(e):
                        log.warning("IPv6 doesn't seem to be fully supported. Falling back to IPv4")
                        http_server.listen(self._port, address="127.0.0.1")
                    else:
                        raise
        except socket_error as e:
            log.exception("Socket error %s. Is another application listening on the same port ? Exiting", e)
            sys.exit(1)
        except Exception as e:
            log.exception("Uncaught exception. Forwarder is exiting.")
            sys.exit(1)

        log.info("Listening on port %d" % self._port)

        # Register callbacks
        self.mloop = tornado.ioloop.IOLoop.current()

        logging.getLogger().setLevel(get_logging_config()['log_level'] or logging.INFO)

        def flush_trs():
            if self._watchdog:
                self._watchdog.reset()
            self._postMetrics()
            self._tr_manager.flush()

        tr_sched = tornado.ioloop.PeriodicCallback(flush_trs,
                                                   TRANSACTION_FLUSH_INTERVAL,
                                                   io_loop=self.mloop)

        # Register optional Graphite listener
        gport = self._agentConfig.get("graphite_listen_port", None)
        if gport is not None:
            log.info("Starting graphite listener on port %s" % gport)
            from graphite import GraphiteServer
            gs = GraphiteServer(self, get_hostname(self._agentConfig), io_loop=self.mloop)
            if non_local_traffic is True:
                gs.listen(gport)
            else:
                gs.listen(gport, address="localhost")

        # Start everything
        if self._watchdog:
            self._watchdog.reset()
        tr_sched.start()

        self.mloop.start()
        log.info("Stopped")

    def stop(self):
        """Stop the IO loop, unblocking run()."""
        self.mloop.stop()
def slow_tornado(self):
    """Run the forwarder with a mock tx manager and a tight watchdog timeout."""
    app = Application(12345, self.AGENT_CONFIG)
    app._watchdog = Watchdog(4)
    app._tr_manager = MockTxManager()
    app.run()
class Reporter(threading.Thread):
    """
    The reporter periodically sends the aggregated metrics to the
    server.
    """

    def __init__(self, interval, metrics_aggregator, api_host, api_key=None, use_watchdog=False):
        # interval: seconds between flushes; api_host may carry a scheme
        # prefix which is stripped below (http selects a plain connection).
        threading.Thread.__init__(self)
        self.interval = int(interval)
        self.finished = threading.Event()
        self.metrics_aggregator = metrics_aggregator
        self.flush_count = 0

        self.watchdog = None
        if use_watchdog:
            from util import Watchdog
            self.watchdog = Watchdog(WATCHDOG_TIMEOUT)

        self.api_key = api_key
        self.api_host = api_host

        self.http_conn_cls = http_client.HTTPSConnection

        match = re.match('^(https?)://(.*)', api_host)

        if match:
            self.api_host = match.group(2)
            if match.group(1) == 'http':
                self.http_conn_cls = http_client.HTTPConnection

    def stop(self):
        log.info("Stopping reporter")
        self.finished.set()

    def run(self):
        log.info("Reporting to %s every %ss" % (self.api_host, self.interval))
        log.debug("Watchdog enabled: %s" % bool(self.watchdog))

        # Persist a start-up message.
        DogstatsdStatus().persist()

        while not self.finished.isSet():  # Use camel case isSet for 2.4 support.
            self.finished.wait(self.interval)
            self.metrics_aggregator.send_packet_count('datadog.dogstatsd.packet.count')
            self.flush()
            if self.watchdog:
                self.watchdog.reset()

        # Clean up the status messages.
        log.debug("Stopped reporter")
        DogstatsdStatus.remove_latest_status()

    def flush(self):
        """Flush aggregated metrics, submit them, and persist a status line."""
        try:
            self.flush_count += 1
            packets_per_second = self.metrics_aggregator.packets_per_second(self.interval)
            packet_count = self.metrics_aggregator.total_count
            metrics = self.metrics_aggregator.flush()
            count = len(metrics)
            should_log = self.flush_count < LOGGING_INTERVAL or self.flush_count % LOGGING_INTERVAL == 0
            if not count:
                if should_log:
                    log.info("Flush #%s: No metrics to flush." % self.flush_count)
            else:
                if should_log:
                    log.info("Flush #%s: flushing %s metrics" % (self.flush_count, count))
                self.submit(metrics)

            # Persist a status message.
            packet_count = self.metrics_aggregator.total_count
            DogstatsdStatus(
                flush_count=self.flush_count,
                packet_count=packet_count,
                packets_per_second=packets_per_second,
                metric_count=count).persist()
        except Exception:
            # FIX: was a bare `except:`, which also caught SystemExit and
            # KeyboardInterrupt and so could block shutdown; narrowed to
            # Exception so those propagate.
            log.exception("Error flushing metrics")

    def submit(self, metrics):
        """POST serialized metrics to /api/v1/series; return duration in ms."""
        # HACK - Copy and pasted from dogapi, because it's a bit of a pain to distribute python
        # dependencies with the agent.
        body = serialize(metrics)
        headers = {'Content-Type': 'application/json'}
        method = 'POST'

        params = {}
        if self.api_key:
            params['api_key'] = self.api_key
        url = '/api/v1/series?%s' % urlencode(params)

        start_time = time()
        status = None
        conn = self.http_conn_cls(self.api_host)
        try:
            conn.request(method, url, body, headers)

            #FIXME: add timeout handling code here

            response = conn.getresponse()
            status = response.status
            response.close()
        finally:
            conn.close()
        duration = round((time() - start_time) * 1000.0, 4)
        log.debug("%s %s %s%s (%sms)" % (
            status, method, self.api_host, url, duration))
        return duration
class Reporter(threading.Thread):
    """
    The reporter periodically sends the aggregated metrics to the
    server.
    """

    def __init__(self, interval, metrics_aggregator, api_host, api_key=None, use_watchdog=False):
        # api_host may carry an http(s):// prefix, stripped below; a plain
        # http scheme selects an unencrypted connection class.
        threading.Thread.__init__(self)
        self.interval = int(interval)
        self.finished = threading.Event()
        self.metrics_aggregator = metrics_aggregator
        self.flush_count = 0

        self.watchdog = None
        if use_watchdog:
            from util import Watchdog
            self.watchdog = Watchdog(WATCHDOG_TIMEOUT)

        self.api_key = api_key
        self.api_host = api_host

        self.http_conn_cls = http_client.HTTPSConnection

        match = re.match('^(https?)://(.*)', api_host)

        if match:
            self.api_host = match.group(2)
            if match.group(1) == 'http':
                self.http_conn_cls = http_client.HTTPConnection

    def stop(self):
        log.info("Stopping reporter")
        self.finished.set()

    def run(self):
        log.info("Reporting to %s every %ss" % (self.api_host, self.interval))
        log.debug("Watchdog enabled: %s" % bool(self.watchdog))

        # Persist a start-up message.
        DogstatsdStatus().persist()

        while not self.finished.isSet():  # Use camel case isSet for 2.4 support.
            self.finished.wait(self.interval)
            self.metrics_aggregator.send_packet_count('datadog.dogstatsd.packet.count')
            self.flush()
            if self.watchdog:
                self.watchdog.reset()

        # Clean up the status messages.
        log.debug("Stopped reporter")
        DogstatsdStatus.remove_latest_status()

    def flush(self):
        """Flush and submit metrics and events, then persist a status line."""
        try:
            self.flush_count += 1
            packets_per_second = self.metrics_aggregator.packets_per_second(self.interval)
            packet_count = self.metrics_aggregator.total_count
            metrics = self.metrics_aggregator.flush()
            count = len(metrics)
            should_log = self.flush_count < LOGGING_INTERVAL or self.flush_count % LOGGING_INTERVAL == 0
            if not count:
                if should_log:
                    log.info("Flush #%s: No metrics to flush." % self.flush_count)
            else:
                if should_log:
                    log.info("Flush #%s: flushing %s metrics" % (self.flush_count, count))
                self.submit(metrics)

            events = self.metrics_aggregator.flush_events()
            event_count = len(events)
            if not event_count:
                if should_log:
                    log.info("Flush #%s: No events to flush." % self.flush_count)
                else:
                    log.debug("Flush #%s: No events to flush." % self.flush_count)
            else:
                if should_log:
                    log.info("Flush #%s: flushing %s events" % (self.flush_count, len(events)))
                else:
                    log.debug("Flush #%s: flushing %s events" % (self.flush_count, len(events)))
                self.submit_events(events)

            # Persist a status message.
            packet_count = self.metrics_aggregator.total_count
            DogstatsdStatus(
                flush_count=self.flush_count,
                packet_count=packet_count,
                packets_per_second=packets_per_second,
                metric_count=count,
                event_count=event_count,
            ).persist()
        except Exception:
            # FIX: was `except Exception, e:` with `e` never used; the
            # binding is dropped (behavior unchanged, log.exception already
            # records the traceback).
            log.exception("Error flushing metrics")
def busy_run(self):
    """Spin forever doing pure CPU work so the 5-second watchdog fires."""
    dog = Watchdog(5)
    dog.reset()
    value = 0
    while True:
        value = random()
def hanging_net(self):
    # Block on a network call that never completes so the 5-second watchdog
    # fires.  NOTE(review): `url` is presumably a urllib-style module alias
    # imported elsewhere in the file -- confirm; port 31834 is assumed to
    # have no listener.
    w = Watchdog(5)
    w.reset()
    x = url.urlopen("http://localhost:31834")
    print "ERROR Net call returned", x
    return True
class Application(tornado.web.Application):

    # Transaction-manager parallelism caps: serial flushing for a single
    # endpoint, up to 5 concurrent flushes when several endpoints exist.
    NO_PARALLELISM = 1
    DEFAULT_PARALLELISM = 5

    def __init__(self, port, agentConfig, watchdog=True,
                 skip_ssl_validation=False, use_simple_http_client=False):
        # port: TCP port the forwarder listens on.
        # agentConfig: parsed agent configuration; must provide 'endpoints'
        #   and 'forwarder_timeout'.
        # watchdog: when True, arm a watchdog around the IOLoop activity.
        self._port = int(port)
        self._agentConfig = agentConfig
        self._metrics = {}
        AgentTransaction.set_application(self)
        AgentTransaction.set_endpoints(agentConfig['endpoints'])
        AgentTransaction.set_request_timeout(agentConfig['forwarder_timeout'])

        max_parallelism = self.NO_PARALLELISM
        # Multiple endpoints => enable parallelism
        if len(agentConfig['endpoints']) > 1:
            max_parallelism = self.DEFAULT_PARALLELISM

        self._tr_manager = TransactionManager(MAX_WAIT_FOR_REPLAY,
                                              MAX_QUEUE_SIZE, THROTTLING_DELAY,
                                              max_parallelism=max_parallelism)
        AgentTransaction.set_tr_manager(self._tr_manager)

        self._watchdog = None
        self.skip_ssl_validation = skip_ssl_validation or agentConfig.get('skip_ssl_validation', False)
        self.use_simple_http_client = use_simple_http_client
        if self.skip_ssl_validation:
            log.info("Skipping SSL hostname validation, useful when using a transparent proxy")

        # Monitor activity
        if watchdog:
            # NOTE(review): the /1000 suggests TRANSACTION_FLUSH_INTERVAL is
            # in milliseconds while Watchdog expects seconds -- confirm.
            watchdog_timeout = TRANSACTION_FLUSH_INTERVAL * WATCHDOG_INTERVAL_MULTIPLIER / 1000
            self._watchdog = Watchdog(
                watchdog_timeout,
                max_mem_mb=agentConfig.get('limit_memory_consumption', None),
                max_resets=WATCHDOG_HIGH_ACTIVITY_THRESHOLD
            )

    def log_request(self, handler):
        """ Override the tornado logging method.
        If everything goes well, log level is DEBUG.
        Otherwise it's WARNING or ERROR depending on the response code.
        """
        if handler.get_status() < 400:
            log_method = log.debug
        elif handler.get_status() < 500:
            log_method = log.warning
        else:
            log_method = log.error
        request_time = 1000.0 * handler.request.request_time()
        log_method(
            u"%d %s %.2fms",
            handler.get_status(),
            handler._request_summary(),
            request_time
        )

    def appendMetric(self, prefix, name, host, device, ts, value):
        # Buffer one metric point under self._metrics[prefix][name];
        # drained by _postMetrics() on the periodic flush.
        if prefix in self._metrics:
            metrics = self._metrics[prefix]
        else:
            metrics = {}
            self._metrics[prefix] = metrics

        if name in metrics:
            metrics[name].append([host, device, ts, value])
        else:
            metrics[name] = [[host, device, ts, value]]

    def _postMetrics(self):
        # Attach identity metadata and enqueue the buffered metrics as a
        # MetricTransaction, then reset the buffer.
        if len(self._metrics) > 0:
            self._metrics['uuid'] = get_uuid()
            self._metrics['internalHostname'] = get_hostname(self._agentConfig)
            self._metrics['apiKey'] = self._agentConfig['api_key']
            MetricTransaction(json.dumps(self._metrics),
                              headers={'Content-Type': 'application/json'})
            self._metrics = {}

    def run(self):
        # Wire routes, bind the server socket (with IPv6/localhost
        # fallbacks), schedule the periodic flush and optional Graphite
        # listener, then block on the IOLoop until stop().
        handlers = [
            (r"/intake/?", AgentInputHandler),
            (r"/intake/metrics?", MetricsAgentInputHandler),
            (r"/intake/metadata?", MetadataAgentInputHandler),
            (r"/api/v1/series/?", ApiInputHandler),
            (r"/api/v1/check_run/?", ApiCheckRunHandler),
            (r"/status/?", StatusHandler),
        ]

        settings = dict(
            cookie_secret="12oETzKXQAGaYdkL5gEmGeJJFuYh7EQnp2XdTP1o/Vo=",
            xsrf_cookies=False,
            debug=False,
            log_function=self.log_request
        )

        non_local_traffic = self._agentConfig.get("non_local_traffic", False)

        tornado.web.Application.__init__(self, handlers, **settings)
        http_server = tornado.httpserver.HTTPServer(self)

        try:
            # non_local_traffic must be == True to match, not just some non-false value
            if non_local_traffic is True:
                http_server.listen(self._port)
            else:
                # localhost in lieu of 127.0.0.1 to support IPv6
                try:
                    http_server.listen(self._port, address=self._agentConfig['bind_host'])
                except gaierror:
                    log.warning("localhost seems undefined in your host file, using 127.0.0.1 instead")
                    http_server.listen(self._port, address="127.0.0.1")
                except socket_error as e:
                    # "Errno 99" = cannot assign requested address; treated
                    # as "IPv6 unusable", so retry on IPv4.
                    if "Errno 99" in str(e):
                        log.warning("IPv6 doesn't seem to be fully supported. Falling back to IPv4")
                        http_server.listen(self._port, address="127.0.0.1")
                    else:
                        raise
        except socket_error as e:
            log.exception("Socket error %s. Is another application listening on the same port ? Exiting", e)
            sys.exit(1)
        except Exception as e:
            log.exception("Uncaught exception. Forwarder is exiting.")
            sys.exit(1)

        log.info("Listening on port %d" % self._port)

        # Register callbacks
        self.mloop = get_tornado_ioloop()

        logging.getLogger().setLevel(get_logging_config()['log_level'] or logging.INFO)

        def flush_trs():
            # Reset the watchdog each flush so it only fires when the IOLoop
            # stops servicing callbacks.
            if self._watchdog:
                self._watchdog.reset()
            self._postMetrics()
            self._tr_manager.flush()

        tr_sched = tornado.ioloop.PeriodicCallback(flush_trs, TRANSACTION_FLUSH_INTERVAL, io_loop=self.mloop)

        # Register optional Graphite listener
        gport = self._agentConfig.get("graphite_listen_port", None)
        if gport is not None:
            log.info("Starting graphite listener on port %s" % gport)
            from graphite import GraphiteServer
            gs = GraphiteServer(self, get_hostname(self._agentConfig), io_loop=self.mloop)
            if non_local_traffic is True:
                gs.listen(gport)
            else:
                gs.listen(gport, address="localhost")

        # Start everything
        if self._watchdog:
            self._watchdog.reset()
        tr_sched.start()

        self.mloop.start()
        log.info("Stopped")

    def stop(self):
        self.mloop.stop()
def _get_watchdog(self, check_freq, agentConfig):
    """Return an armed Watchdog scaled from the check frequency, or None
    when disabled via the 'watchdog' config key."""
    if not agentConfig.get("watchdog", True):
        return None
    dog = Watchdog(check_freq * WATCHDOG_MULTIPLIER)
    dog.reset()
    return dog
class Reporter(threading.Thread):
    """
    The reporter periodically sends the aggregated metrics to the
    server.
    """

    def __init__(self, interval, metrics_aggregator, api_host, api_key=None, use_watchdog=False):
        # api_host may carry an http(s):// prefix, stripped below; a plain
        # http scheme selects an unencrypted connection class.
        threading.Thread.__init__(self)
        self.interval = int(interval)
        self.finished = threading.Event()
        self.metrics_aggregator = metrics_aggregator
        self.flush_count = 0

        self.watchdog = None
        if use_watchdog:
            from util import Watchdog
            self.watchdog = Watchdog(WATCHDOG_TIMEOUT)

        self.api_key = api_key
        self.api_host = api_host

        self.http_conn_cls = http_client.HTTPSConnection

        match = re.match('^(https?)://(.*)', api_host)

        if match:
            self.api_host = match.group(2)
            if match.group(1) == 'http':
                self.http_conn_cls = http_client.HTTPConnection

    def stop(self):
        log.info("Stopping reporter")
        self.finished.set()

    def run(self):
        log.info("Reporting to %s every %ss" % (self.api_host, self.interval))
        log.debug("Watchdog enabled: %s" % bool(self.watchdog))

        # Persist a start-up message.
        DogstatsdStatus().persist()

        while not self.finished.isSet():  # Use camel case isSet for 2.4 support.
            self.finished.wait(self.interval)
            self.metrics_aggregator.send_packet_count('datadog.dogstatsd.packet.count')
            self.flush()
            if self.watchdog:
                self.watchdog.reset()

        # Clean up the status messages.
        log.debug("Stopped reporter")
        DogstatsdStatus.remove_latest_status()

    def flush(self):
        """Flush and submit metrics and events, then persist a status line."""
        try:
            self.flush_count += 1
            packets_per_second = self.metrics_aggregator.packets_per_second(self.interval)
            packet_count = self.metrics_aggregator.total_count
            metrics = self.metrics_aggregator.flush()
            count = len(metrics)
            should_log = self.flush_count < LOGGING_INTERVAL or self.flush_count % LOGGING_INTERVAL == 0
            if not count:
                if should_log:
                    log.info("Flush #%s: No metrics to flush." % self.flush_count)
            else:
                if should_log:
                    log.info("Flush #%s: flushing %s metrics" % (self.flush_count, count))
                self.submit(metrics)

            events = self.metrics_aggregator.flush_events()
            event_count = len(events)
            if not event_count:
                log.info("Flush #%s: No events to flush." % self.flush_count)
            else:
                log.info("Flush #%s: flushing %s events" % (self.flush_count, len(events)))
                self.submit_events(events)

            # Persist a status message.
            packet_count = self.metrics_aggregator.total_count
            DogstatsdStatus(flush_count=self.flush_count,
                            packet_count=packet_count,
                            packets_per_second=packets_per_second,
                            metric_count=count,
                            event_count=event_count).persist()
        except Exception:
            # FIX: was a bare `except:`, which also caught SystemExit and
            # KeyboardInterrupt and so could block shutdown; narrowed to
            # Exception so those propagate.
            log.exception("Error flushing metrics")

    def submit(self, metrics):
        """POST serialized metrics to /api/v1/series; return duration in ms."""
        # HACK - Copy and pasted from dogapi, because it's a bit of a pain to distribute python
        # dependencies with the agent.
        body = serialize_metrics(metrics)
        headers = {'Content-Type': 'application/json'}
        method = 'POST'

        params = {}
        if self.api_key:
            params['api_key'] = self.api_key
        url = '/api/v1/series?%s' % urlencode(params)

        start_time = time()
        status = None
        conn = self.http_conn_cls(self.api_host)
        try:
            conn.request(method, url, body, headers)

            #FIXME: add timeout handling code here

            response = conn.getresponse()
            status = response.status
            response.close()
        finally:
            conn.close()
        duration = round((time() - start_time) * 1000.0, 4)
        log.debug("%s %s %s%s (%sms)" % (status, method, self.api_host, url, duration))
        return duration

    def submit_events(self, events):
        """POST each event individually to /api/v1/events."""
        headers = {'Content-Type': 'application/json'}
        method = 'POST'

        params = {}
        if self.api_key:
            params['api_key'] = self.api_key
        url = '/api/v1/events?%s' % urlencode(params)

        status = None
        conn = self.http_conn_cls(self.api_host)
        try:
            for event in events:
                start_time = time()
                body = serialize_event(event)
                log.debug('Sending event: %s' % body)
                conn.request(method, url, body, headers)

                response = conn.getresponse()
                status = response.status
                response.close()
                duration = round((time() - start_time) * 1000.0, 4)
                log.debug("%s %s %s%s (%sms)" % (status, method, self.api_host, url, duration))
        finally:
            conn.close()
def normal_run(self):
    """Reset a 2s watchdog once per second for five seconds; it must not fire."""
    dog = Watchdog(2)
    dog.reset()
    for _ in range(5):
        time.sleep(1)
        dog.reset()
class Application(tornado.web.Application):
    """Tornado app that buffers agent metrics and forwards them periodically."""

    def __init__(self, port, agentConfig, watchdog=True):
        # port: TCP port to listen on; agentConfig: parsed agent config dict.
        self._port = int(port)
        self._agentConfig = agentConfig
        self._metrics = {}
        MetricTransaction.set_application(self)
        MetricTransaction.set_endpoints()
        self._tr_manager = TransactionManager(MAX_WAIT_FOR_REPLAY, MAX_QUEUE_SIZE,
                                              THROTTLING_DELAY)
        MetricTransaction.set_tr_manager(self._tr_manager)

        self._watchdog = None
        if watchdog:
            watchdog_timeout = TRANSACTION_FLUSH_INTERVAL * WATCHDOG_INTERVAL_MULTIPLIER
            self._watchdog = Watchdog(watchdog_timeout,
                                      max_mem_mb=agentConfig.get('limit_memory_consumption', None))

    def log_request(self, handler):
        """ Override the tornado logging method.
        If everything goes well, log level is DEBUG.
        Otherwise it's WARNING or ERROR depending on the response code.
        """
        if handler.get_status() < 400:
            log_method = log.debug
        elif handler.get_status() < 500:
            log_method = log.warning
        else:
            log_method = log.error
        request_time = 1000.0 * handler.request.request_time()
        log_method("%d %s %.2fms", handler.get_status(),
                   handler._request_summary(), request_time)

    def appendMetric(self, prefix, name, host, device, ts, value):
        # FIX: dict.has_key() is deprecated (removed in Python 3); the `in`
        # operator is the supported, equivalent spelling.
        if prefix in self._metrics:
            metrics = self._metrics[prefix]
        else:
            metrics = {}
            self._metrics[prefix] = metrics

        if name in metrics:
            metrics[name].append([host, device, ts, value])
        else:
            metrics[name] = [[host, device, ts, value]]

    def _postMetrics(self):
        # Attach identity metadata and enqueue the buffered metrics as a
        # MetricTransaction, then reset the buffer.
        if len(self._metrics) > 0:
            self._metrics['uuid'] = get_uuid()
            self._metrics['internalHostname'] = get_hostname(self._agentConfig)
            self._metrics['apiKey'] = self._agentConfig['api_key']
            MetricTransaction(json.dumps(self._metrics),
                              headers={'Content-Type': 'application/json'})
            self._metrics = {}

    def run(self):
        # Bind the HTTP server, schedule the periodic flush and optional
        # Graphite listener, then block on the IOLoop until stop().
        handlers = [
            (r"/intake/?", AgentInputHandler),
            (r"/api/v1/series/?", ApiInputHandler),
            (r"/status/?", StatusHandler),
        ]

        settings = dict(
            cookie_secret="12oETzKXQAGaYdkL5gEmGeJJFuYh7EQnp2XdTP1o/Vo=",
            xsrf_cookies=False,
            debug=False,
            log_function=self.log_request
        )

        non_local_traffic = self._agentConfig.get("non_local_traffic", False)

        tornado.web.Application.__init__(self, handlers, **settings)
        http_server = tornado.httpserver.HTTPServer(self)

        # non_local_traffic must be == True to match, not just some non-false value
        if non_local_traffic is True:
            http_server.listen(self._port)
        else:
            # localhost in lieu of 127.0.0.1 to support IPv6
            try:
                http_server.listen(self._port, address="localhost")
            except gaierror:
                log.warning("Warning localhost seems undefined in your host file, using 127.0.0.1 instead")
                http_server.listen(self._port, address="127.0.0.1")

        log.info("Listening on port %d" % self._port)

        # Register callbacks
        self.mloop = tornado.ioloop.IOLoop.instance()

        logging.getLogger().setLevel(get_logging_config()['log_level'] or logging.INFO)

        def flush_trs():
            if self._watchdog:
                self._watchdog.reset()
            self._postMetrics()
            self._tr_manager.flush()

        tr_sched = tornado.ioloop.PeriodicCallback(flush_trs, TRANSACTION_FLUSH_INTERVAL,
                                                   io_loop=self.mloop)

        # Register optional Graphite listener
        gport = self._agentConfig.get("graphite_listen_port", None)
        if gport is not None:
            log.info("Starting graphite listener on port %s" % gport)
            from graphite import GraphiteServer
            gs = GraphiteServer(self, get_hostname(self._agentConfig), io_loop=self.mloop)
            if non_local_traffic is True:
                gs.listen(gport)
            else:
                gs.listen(gport, address="localhost")

        # Start everything
        if self._watchdog:
            self._watchdog.reset()
        tr_sched.start()

        self.mloop.start()
        log.info("Stopped")

    def stop(self):
        self.mloop.stop()
class Reporter(threading.Thread):
    """
    The reporter periodically sends the aggregated metrics to the
    server.
    """

    def __init__(self, interval, metrics_aggregator, api_host, api_key=None, use_watchdog=False):
        # api_host may carry an http(s):// prefix, stripped below; a plain
        # http scheme selects an unencrypted connection class.
        threading.Thread.__init__(self)
        self.interval = int(interval)
        self.finished = threading.Event()
        self.metrics_aggregator = metrics_aggregator
        self.flush_count = 0

        self.watchdog = None
        if use_watchdog:
            from util import Watchdog
            self.watchdog = Watchdog(WATCHDOG_TIMEOUT)

        self.api_key = api_key
        self.api_host = api_host

        self.http_conn_cls = http_client.HTTPSConnection

        match = re.match('^(https?)://(.*)', api_host)

        if match:
            self.api_host = match.group(2)
            if match.group(1) == 'http':
                self.http_conn_cls = http_client.HTTPConnection

    def stop(self):
        log.info("Stopping reporter")
        self.finished.set()

    def run(self):
        log.info("Reporting to %s every %ss" % (self.api_host, self.interval))
        log.debug("Watchdog enabled: %s" % bool(self.watchdog))

        # Persist a start-up message.
        DogstatsdStatus().persist()

        while not self.finished.isSet():  # Use camel case isSet for 2.4 support.
            self.finished.wait(self.interval)
            self.metrics_aggregator.send_packet_count('datadog.dogstatsd.packet.count')
            self.flush()
            if self.watchdog:
                self.watchdog.reset()

        # Clean up the status messages.
        log.debug("Stopped reporter")
        DogstatsdStatus.remove_latest_status()

    def flush(self):
        """Flush and submit metrics and events, then persist a status line."""
        try:
            self.flush_count += 1
            packets_per_second = self.metrics_aggregator.packets_per_second(self.interval)
            packet_count = self.metrics_aggregator.total_count
            metrics = self.metrics_aggregator.flush()
            count = len(metrics)
            should_log = self.flush_count < LOGGING_INTERVAL or self.flush_count % LOGGING_INTERVAL == 0
            if not count:
                if should_log:
                    log.info("Flush #%s: No metrics to flush." % self.flush_count)
            else:
                if should_log:
                    log.info("Flush #%s: flushing %s metrics" % (self.flush_count, count))
                self.submit(metrics)

            events = self.metrics_aggregator.flush_events()
            event_count = len(events)
            if not event_count:
                if should_log:
                    log.info("Flush #%s: No events to flush." % self.flush_count)
                else:
                    log.debug("Flush #%s: No events to flush." % self.flush_count)
            else:
                if should_log:
                    log.info("Flush #%s: flushing %s events" % (self.flush_count, len(events)))
                else:
                    log.debug("Flush #%s: flushing %s events" % (self.flush_count, len(events)))
                self.submit_events(events)

            # Persist a status message.
            packet_count = self.metrics_aggregator.total_count
            DogstatsdStatus(
                flush_count=self.flush_count,
                packet_count=packet_count,
                packets_per_second=packets_per_second,
                metric_count=count,
                event_count=event_count
            ).persist()
        except Exception:
            # FIX: was `except Exception, e:` with `e` never used; binding
            # dropped (log.exception already records the traceback).
            log.exception("Error flushing metrics")
def fast_tornado(self):
    """Run a forwarder with a mock transaction manager under a 6s watchdog."""
    app = Application(12345, {"bind_host": "localhost"})
    app._tr_manager = MockTxManager()
    app._watchdog = Watchdog(6)
    app.run()
class Application(tornado.web.Application):
    """Tornado app that buffers agent metrics and forwards them periodically."""

    def __init__(self, port, agentConfig, watchdog=True):
        # port: TCP port to listen on; agentConfig: parsed agent config dict.
        self._port = int(port)
        self._agentConfig = agentConfig
        self._metrics = {}
        MetricTransaction.set_application(self)
        MetricTransaction.set_endpoints()
        self._tr_manager = TransactionManager(MAX_WAIT_FOR_REPLAY, MAX_QUEUE_SIZE,
                                              THROTTLING_DELAY)
        MetricTransaction.set_tr_manager(self._tr_manager)

        self._watchdog = None
        if watchdog:
            watchdog_timeout = TRANSACTION_FLUSH_INTERVAL * WATCHDOG_INTERVAL_MULTIPLIER
            self._watchdog = Watchdog(watchdog_timeout)

    def appendMetric(self, prefix, name, host, device, ts, value):
        # FIX: dict.has_key() is deprecated (removed in Python 3); the `in`
        # operator is the supported, equivalent spelling.
        if prefix in self._metrics:
            metrics = self._metrics[prefix]
        else:
            metrics = {}
            self._metrics[prefix] = metrics

        if name in metrics:
            metrics[name].append([host, device, ts, value])
        else:
            metrics[name] = [[host, device, ts, value]]

    def _postMetrics(self):
        # Attach identity metadata and enqueue the buffered metrics as a
        # MetricTransaction, then reset the buffer.
        if len(self._metrics) > 0:
            self._metrics['uuid'] = get_uuid()
            self._metrics['internalHostname'] = gethostname(self._agentConfig)
            self._metrics['apiKey'] = self._agentConfig['api_key']
            MetricTransaction(self._metrics, {})
            self._metrics = {}

    def run(self):
        # Bind the HTTP server, schedule the periodic flush and optional
        # Graphite listener, then block on the IOLoop until stop().
        handlers = [
            (r"/intake/?", AgentInputHandler),
            (r"/api/v1/series/?", ApiInputHandler),
            (r"/status/?", StatusHandler),
        ]

        settings = dict(
            cookie_secret="12oETzKXQAGaYdkL5gEmGeJJFuYh7EQnp2XdTP1o/Vo=",
            xsrf_cookies=False,
            debug=False,
        )

        non_local_traffic = self._agentConfig.get("non_local_traffic", False)

        tornado.web.Application.__init__(self, handlers, **settings)
        http_server = tornado.httpserver.HTTPServer(self)

        # set the root logger to warn so tornado is less chatty
        logging.getLogger().setLevel(logging.WARNING)

        # but keep the forwarder logger at the original level
        forwarder_logger = logging.getLogger('forwarder')
        log_config = get_logging_config()
        forwarder_logger.setLevel(log_config['log_level'] or logging.INFO)

        # non_local_traffic must be == True to match, not just some non-false value
        if non_local_traffic is True:
            http_server.listen(self._port)
        else:
            # localhost in lieu of 127.0.0.1 to support IPv6
            try:
                http_server.listen(self._port, address="localhost")
            except gaierror:
                log.warning("Warning localhost seems undefined in your host file, using 127.0.0.1 instead")
                http_server.listen(self._port, address="127.0.0.1")

        log.info("Listening on port %d" % self._port)

        # Register callbacks
        self.mloop = tornado.ioloop.IOLoop.instance()

        def flush_trs():
            if self._watchdog:
                self._watchdog.reset()
            self._postMetrics()
            self._tr_manager.flush()

        tr_sched = tornado.ioloop.PeriodicCallback(flush_trs, TRANSACTION_FLUSH_INTERVAL,
                                                   io_loop=self.mloop)

        # Register optional Graphite listener
        gport = self._agentConfig.get("graphite_listen_port", None)
        if gport is not None:
            log.info("Starting graphite listener on port %s" % gport)
            from graphite import GraphiteServer
            gs = GraphiteServer(self, gethostname(self._agentConfig), io_loop=self.mloop)
            if non_local_traffic is True:
                gs.listen(gport)
            else:
                gs.listen(gport, address="localhost")

        # Start everything
        if self._watchdog:
            self._watchdog.reset()
        tr_sched.start()

        self.mloop.start()
        log.info("Stopped")

    def stop(self):
        self.mloop.stop()
class Application(tornado.web.Application):
    """Tornado app that buffers agent metrics and forwards them periodically."""

    def __init__(self, port, agentConfig, watchdog=True):
        # port: TCP port to listen on; agentConfig: parsed agent config dict.
        self._port = int(port)
        self._agentConfig = agentConfig
        self._metrics = {}
        MetricTransaction.set_application(self)
        MetricTransaction.set_endpoints()
        self._tr_manager = TransactionManager(MAX_WAIT_FOR_REPLAY, MAX_QUEUE_SIZE,
                                              THROTTLING_DELAY)
        MetricTransaction.set_tr_manager(self._tr_manager)

        self._watchdog = None
        if watchdog:
            watchdog_timeout = TRANSACTION_FLUSH_INTERVAL * WATCHDOG_INTERVAL_MULTIPLIER
            self._watchdog = Watchdog(watchdog_timeout)

    def appendMetric(self, prefix, name, host, device, ts, value):
        # FIX: dict.has_key() is deprecated (removed in Python 3); the `in`
        # operator is the supported, equivalent spelling.
        if prefix in self._metrics:
            metrics = self._metrics[prefix]
        else:
            metrics = {}
            self._metrics[prefix] = metrics

        if name in metrics:
            metrics[name].append([host, device, ts, value])
        else:
            metrics[name] = [[host, device, ts, value]]

    def _postMetrics(self):
        # Attach identity metadata and enqueue the buffered metrics as a
        # MetricTransaction, then reset the buffer.
        if len(self._metrics) > 0:
            self._metrics['uuid'] = get_uuid()
            self._metrics['internalHostname'] = gethostname(self._agentConfig)
            self._metrics['apiKey'] = self._agentConfig['api_key']
            MetricTransaction(self._metrics, {})
            self._metrics = {}

    def run(self):
        # Bind the HTTP server, schedule the periodic flush and optional
        # Graphite listener, then block on the IOLoop until stop().
        handlers = [
            (r"/intake/?", AgentInputHandler),
            (r"/api/v1/series/?", ApiInputHandler),
            (r"/status/?", StatusHandler),
        ]

        settings = dict(
            cookie_secret="12oETzKXQAGaYdkL5gEmGeJJFuYh7EQnp2XdTP1o/Vo=",
            xsrf_cookies=False,
            debug=True,
        )

        tornado.web.Application.__init__(self, handlers, **settings)

        http_server = tornado.httpserver.HTTPServer(self)
        http_server.listen(self._port)
        logging.info("Listening on port %d" % self._port)

        # Register callbacks
        self.mloop = tornado.ioloop.IOLoop.instance()

        def flush_trs():
            if self._watchdog:
                self._watchdog.reset()
            self._postMetrics()
            self._tr_manager.flush()

        tr_sched = tornado.ioloop.PeriodicCallback(flush_trs, TRANSACTION_FLUSH_INTERVAL,
                                                   io_loop=self.mloop)

        # Register optional Graphite listener
        gport = self._agentConfig.get("graphite_listen_port", None)
        if gport is not None:
            logging.info("Starting graphite listener on port %s" % gport)
            from graphite import GraphiteServer
            gs = GraphiteServer(self, gethostname(self._agentConfig), io_loop=self.mloop)
            gs.listen(gport)

        # Start everything
        if self._watchdog:
            self._watchdog.reset()
        tr_sched.start()

        self.mloop.start()
        logging.info("Stopped")

    def stop(self):
        self.mloop.stop()
class Reporter(threading.Thread):
    """
    The reporter periodically sends the aggregated metrics to the
    server.
    """

    def __init__(self, interval, metrics_aggregator, api_host, api_key=None,
                 use_watchdog=False, event_chunk_size=None):
        # api_host may carry an http(s):// prefix, stripped below; a plain
        # http scheme selects an unencrypted connection class.
        threading.Thread.__init__(self)
        self.interval = int(interval)
        self.finished = threading.Event()
        self.metrics_aggregator = metrics_aggregator
        self.flush_count = 0
        self.log_count = 0

        self.watchdog = None
        if use_watchdog:
            from util import Watchdog
            self.watchdog = Watchdog(WATCHDOG_TIMEOUT)

        self.api_key = api_key
        self.api_host = api_host
        self.event_chunk_size = event_chunk_size or EVENT_CHUNK_SIZE

        self.http_conn_cls = http_client.HTTPSConnection

        match = re.match('^(https?)://(.*)', api_host)

        if match:
            self.api_host = match.group(2)
            if match.group(1) == 'http':
                self.http_conn_cls = http_client.HTTPConnection

    def stop(self):
        log.info("Stopping reporter")
        self.finished.set()

    def run(self):
        log.info("Reporting to %s every %ss" % (self.api_host, self.interval))
        log.debug("Watchdog enabled: %s" % bool(self.watchdog))

        # Persist a start-up message.
        DogstatsdStatus().persist()

        while not self.finished.isSet():  # Use camel case isSet for 2.4 support.
            self.finished.wait(self.interval)
            self.metrics_aggregator.send_packet_count('datadog.dogstatsd.packet.count')
            self.flush()
            if self.watchdog:
                self.watchdog.reset()

        # Clean up the status messages.
        log.debug("Stopped reporter")
        DogstatsdStatus.remove_latest_status()

    def flush(self):
        """Flush and submit metrics and events; log at a throttled cadence."""
        try:
            self.flush_count += 1
            self.log_count += 1
            packets_per_second = self.metrics_aggregator.packets_per_second(self.interval)
            packet_count = self.metrics_aggregator.total_count

            metrics = self.metrics_aggregator.flush()
            count = len(metrics)
            if self.flush_count % FLUSH_LOGGING_PERIOD == 0:
                self.log_count = 0
            if count:
                self.submit(metrics)

            events = self.metrics_aggregator.flush_events()
            event_count = len(events)
            if event_count:
                self.submit_events(events)

            should_log = self.flush_count <= FLUSH_LOGGING_INITIAL or self.log_count <= FLUSH_LOGGING_COUNT
            log_func = log.info
            if not should_log:
                log_func = log.debug
            log_func("Flush #%s: flushed %s metric%s and %s event%s" %
                     (self.flush_count, count, plural(count), event_count, plural(event_count)))
            if self.flush_count == FLUSH_LOGGING_INITIAL:
                log.info("First flushes done, %s flushes will be logged every %s flushes." %
                         (FLUSH_LOGGING_COUNT, FLUSH_LOGGING_PERIOD))

            # Persist a status message.
            packet_count = self.metrics_aggregator.total_count
            DogstatsdStatus(
                flush_count=self.flush_count,
                packet_count=packet_count,
                packets_per_second=packets_per_second,
                metric_count=count,
                event_count=event_count,
            ).persist()
        except Exception:
            # FIX: was `except Exception, e:` with `e` never used; binding
            # dropped (log.exception already records the traceback).
            log.exception("Error flushing metrics")
class Reporter(threading.Thread):
    """
    The reporter periodically sends the aggregated metrics to the
    server.
    """

    def __init__(self, interval, metrics_aggregator, api_host, api_key=None,
                 use_watchdog=False, event_chunk_size=None):
        # interval: seconds between flushes.
        # api_host: may carry an http(s):// prefix, stripped below; a plain
        #   http scheme selects an unencrypted connection class.
        # event_chunk_size: events per /intake POST; falls back to
        #   EVENT_CHUNK_SIZE when falsy.
        threading.Thread.__init__(self)
        self.interval = int(interval)
        self.finished = threading.Event()
        self.metrics_aggregator = metrics_aggregator
        self.flush_count = 0
        self.log_count = 0

        self.watchdog = None
        if use_watchdog:
            from util import Watchdog
            self.watchdog = Watchdog(WATCHDOG_TIMEOUT)

        self.api_key = api_key
        self.api_host = api_host
        self.event_chunk_size = event_chunk_size or EVENT_CHUNK_SIZE

        self.http_conn_cls = http_client.HTTPSConnection

        match = re.match('^(https?)://(.*)', api_host)

        if match:
            self.api_host = match.group(2)
            if match.group(1) == 'http':
                self.http_conn_cls = http_client.HTTPConnection

    def stop(self):
        log.info("Stopping reporter")
        self.finished.set()

    def run(self):
        log.info("Reporting to %s every %ss" % (self.api_host, self.interval))
        log.debug("Watchdog enabled: %s" % bool(self.watchdog))

        # Persist a start-up message.
        DogstatsdStatus().persist()

        while not self.finished.isSet():  # Use camel case isSet for 2.4 support.
            self.finished.wait(self.interval)
            self.metrics_aggregator.send_packet_count('datadog.dogstatsd.packet.count')
            self.flush()
            if self.watchdog:
                self.watchdog.reset()

        # Clean up the status messages.
        log.debug("Stopped reporter")
        DogstatsdStatus.remove_latest_status()

    def flush(self):
        # Flush aggregated metrics and events, submit them, and persist a
        # status line.  log_count throttles flush logging: after the first
        # FLUSH_LOGGING_INITIAL flushes, only FLUSH_LOGGING_COUNT flushes
        # per FLUSH_LOGGING_PERIOD are logged at INFO.
        try:
            self.flush_count += 1
            self.log_count += 1
            packets_per_second = self.metrics_aggregator.packets_per_second(self.interval)
            packet_count = self.metrics_aggregator.total_count

            metrics = self.metrics_aggregator.flush()
            count = len(metrics)
            if self.flush_count % FLUSH_LOGGING_PERIOD == 0:
                self.log_count = 0
            if count:
                self.submit(metrics)

            events = self.metrics_aggregator.flush_events()
            event_count = len(events)
            if event_count:
                self.submit_events(events)

            should_log = self.flush_count <= FLUSH_LOGGING_INITIAL or self.log_count <= FLUSH_LOGGING_COUNT
            log_func = log.info
            if not should_log:
                log_func = log.debug
            log_func("Flush #%s: flushed %s metric%s and %s event%s" %
                     (self.flush_count, count, plural(count), event_count, plural(event_count)))
            if self.flush_count == FLUSH_LOGGING_INITIAL:
                log.info("First flushes done, %s flushes will be logged every %s flushes." % (FLUSH_LOGGING_COUNT, FLUSH_LOGGING_PERIOD))

            # Persist a status message.
            packet_count = self.metrics_aggregator.total_count
            DogstatsdStatus(
                flush_count=self.flush_count,
                packet_count=packet_count,
                packets_per_second=packets_per_second,
                metric_count=count,
                event_count=event_count,
            ).persist()
        except Exception:
            log.exception("Error flushing metrics")

    def submit(self, metrics):
        # POST serialized metrics to /api/v1/series; returns duration in ms.
        # Copy and pasted from dogapi, because it's a bit of a pain to distribute python
        # dependencies with the agent.
        body, headers = serialize_metrics(metrics)
        method = 'POST'

        params = {}
        if self.api_key:
            params['api_key'] = self.api_key
        url = '/api/v1/series?%s' % urlencode(params)

        start_time = time()
        status = None
        conn = self.http_conn_cls(self.api_host)
        try:
            conn.request(method, url, body, headers)

            #FIXME: add timeout handling code here

            response = conn.getresponse()
            status = response.status
            response.close()
        finally:
            conn.close()
        duration = round((time() - start_time) * 1000.0, 4)
        log.debug("%s %s %s%s (%sms)" % (
            status, method, self.api_host, url, duration))
        return duration

    def submit_events(self, events):
        # POST events to /intake in chunks of self.event_chunk_size, each
        # chunk wrapped in an intake payload with identity metadata.
        headers = {'Content-Type':'application/json'}
        method = 'POST'

        events_len = len(events)
        event_chunk_size = self.event_chunk_size

        for chunk in chunks(events, event_chunk_size):
            payload = {
                'apiKey': self.api_key,
                'events': {
                    'api': chunk
                },
                'uuid': get_uuid(),
                'internalHostname': get_hostname()
            }
            params = {}
            if self.api_key:
                params['api_key'] = self.api_key
            url = '/intake?%s' % urlencode(params)

            status = None
            conn = self.http_conn_cls(self.api_host)
            try:
                start_time = time()
                conn.request(method, url, json.dumps(payload), headers)

                response = conn.getresponse()
                status = response.status
                response.close()
                duration = round((time() - start_time) * 1000.0, 4)
                log.debug("%s %s %s%s (%sms)" % (
                    status, method, self.api_host, url, duration))

            finally:
                conn.close()
class Reporter(threading.Thread):
    """
    The reporter periodically sends the aggregated metrics to the
    server.
    """

    def __init__(self, interval, metrics_aggregator, api_host, api_key=None,
                 use_watchdog=False, event_chunk_size=None):
        """Set up the reporting thread.

        interval         -- seconds between flushes (coerced with int()).
        metrics_aggregator -- object supplying flush()/flush_events()/counters.
        api_host         -- target host; an http(s):// prefix selects the
                            connection class and is stripped from the host.
        api_key          -- optional API key appended to request URLs.
        use_watchdog     -- enable a self-destruct watchdog on the flush loop.
        event_chunk_size -- events per intake POST (defaults to EVENT_CHUNK_SIZE).
        """
        threading.Thread.__init__(self)
        self.interval = int(interval)
        self.finished = threading.Event()
        self.metrics_aggregator = metrics_aggregator
        self.flush_count = 0
        self.log_count = 0

        self.watchdog = None
        if use_watchdog:
            # Imported lazily so the dependency is only needed when enabled.
            from util import Watchdog
            self.watchdog = Watchdog(WATCHDOG_TIMEOUT)

        self.api_key = api_key
        self.api_host = api_host
        self.event_chunk_size = event_chunk_size or EVENT_CHUNK_SIZE

        # Default to HTTPS; fall back to plain HTTP only for explicit
        # http:// endpoints. The scheme prefix is stripped from api_host.
        self.http_conn_cls = http_client.HTTPSConnection
        match = re.match('^(https?)://(.*)', api_host)
        if match:
            self.api_host = match.group(2)
            if match.group(1) == 'http':
                self.http_conn_cls = http_client.HTTPConnection

    def stop(self):
        """Signal the run() loop to exit after its current wait."""
        log.info("Stopping reporter")
        self.finished.set()

    def run(self):
        """Flush the aggregator every self.interval seconds until stopped."""
        log.info("Reporting to %s every %ss" % (self.api_host, self.interval))
        log.debug("Watchdog enabled: %s" % bool(self.watchdog))

        # Persist a start-up message.
        DogstatsdStatus().persist()

        while not self.finished.isSet():  # Use camel case isSet for 2.4 support.
            self.finished.wait(self.interval)
            self.metrics_aggregator.send_packet_count('datadog.dogstatsd.packet.count')
            self.flush()
            if self.watchdog:
                self.watchdog.reset()

        # Clean up the status messages.
        log.debug("Stopped reporter")
        DogstatsdStatus.remove_latest_status()

    def flush(self):
        """Flush metrics/events and persist a status snapshot; never raises."""
        try:
            self.flush_count += 1
            self.log_count += 1
            packets_per_second = self.metrics_aggregator.packets_per_second(self.interval)
            packet_count = self.metrics_aggregator.total_count

            metrics = self.metrics_aggregator.flush()
            count = len(metrics)
            if self.flush_count % FLUSH_LOGGING_PERIOD == 0:
                self.log_count = 0
            if count:
                self.submit(metrics)

            events = self.metrics_aggregator.flush_events()
            event_count = len(events)
            if event_count:
                self.submit_events(events)

            # INFO for the first few flushes and periodically thereafter;
            # DEBUG otherwise to keep the log quiet.
            should_log = self.flush_count <= FLUSH_LOGGING_INITIAL or self.log_count <= FLUSH_LOGGING_COUNT
            log_func = log.info
            if not should_log:
                log_func = log.debug
            log_func("Flush #%s: flushed %s metric%s and %s event%s" % (self.flush_count, count, plural(count), event_count, plural(event_count)))
            if self.flush_count == FLUSH_LOGGING_INITIAL:
                log.info("First flushes done, %s flushes will be logged every %s flushes." % (FLUSH_LOGGING_COUNT, FLUSH_LOGGING_PERIOD))

            # Persist a status message.
            packet_count = self.metrics_aggregator.total_count
            DogstatsdStatus(
                flush_count=self.flush_count,
                packet_count=packet_count,
                packets_per_second=packets_per_second,
                metric_count=count,
                event_count=event_count,
            ).persist()
        except Exception:
            # Fix: was `except Exception, e:` — Python-2-only syntax with an
            # unused binding; matched to the other copy of this method.
            log.exception("Error flushing metrics")
class Application(tornado.web.Application):
    """Tornado application that buffers metrics posted by local collectors
    and forwards them to the backend through MetricTransactions."""

    def __init__(self, port, agentConfig, watchdog=True, skip_ssl_validation=False):
        """port -- TCP port to listen on; agentConfig -- agent config dict;
        watchdog -- arm a self-destruct watchdog on the flush loop;
        skip_ssl_validation -- also honored from agentConfig."""
        self._port = int(port)
        self._agentConfig = agentConfig
        self._metrics = {}
        MetricTransaction.set_application(self)
        MetricTransaction.set_endpoints()
        self._tr_manager = TransactionManager(MAX_WAIT_FOR_REPLAY,
                                              MAX_QUEUE_SIZE, THROTTLING_DELAY)
        MetricTransaction.set_tr_manager(self._tr_manager)

        self._watchdog = None
        self.skip_ssl_validation = skip_ssl_validation or agentConfig.get('skip_ssl_validation', False)
        if self.skip_ssl_validation:
            log.info("Skipping SSL hostname validation, useful when using a transparent proxy")

        if watchdog:
            watchdog_timeout = TRANSACTION_FLUSH_INTERVAL * WATCHDOG_INTERVAL_MULTIPLIER
            self._watchdog = Watchdog(watchdog_timeout,
                                      max_mem_mb=agentConfig.get('limit_memory_consumption', None))

    def log_request(self, handler):
        """ Override the tornado logging method.
        If everything goes well, log level is DEBUG.
        Otherwise it's WARNING or ERROR depending on the response code. """
        if handler.get_status() < 400:
            log_method = log.debug
        elif handler.get_status() < 500:
            log_method = log.warning
        else:
            log_method = log.error
        request_time = 1000.0 * handler.request.request_time()
        log_method("%d %s %.2fms", handler.get_status(),
                   handler._request_summary(), request_time)

    def appendMetric(self, prefix, name, host, device, ts, value):
        """Buffer one data point under self._metrics[prefix][name]."""
        # Fix: dict.has_key() is deprecated (removed in Python 3);
        # setdefault expresses the same get-or-create logic in one step.
        metrics = self._metrics.setdefault(prefix, {})
        metrics.setdefault(name, []).append([host, device, ts, value])

    def _postMetrics(self):
        """Wrap buffered metrics in a MetricTransaction and reset the buffer."""
        if self._metrics:
            self._metrics['uuid'] = get_uuid()
            self._metrics['internalHostname'] = get_hostname(self._agentConfig)
            self._metrics['apiKey'] = self._agentConfig['api_key']
            MetricTransaction(json.dumps(self._metrics),
                              headers={'Content-Type': 'application/json'})
            self._metrics = {}

    def run(self):
        """Start the HTTP server, periodic flush callback and IOLoop.
        Blocks until stop() is called."""
        handlers = [
            (r"/intake/?", AgentInputHandler),
            (r"/api/v1/series/?", ApiInputHandler),
            (r"/status/?", StatusHandler),
        ]

        settings = dict(
            cookie_secret="12oETzKXQAGaYdkL5gEmGeJJFuYh7EQnp2XdTP1o/Vo=",
            xsrf_cookies=False,
            debug=False,
            log_function=self.log_request
        )

        non_local_traffic = self._agentConfig.get("non_local_traffic", False)

        tornado.web.Application.__init__(self, handlers, **settings)
        http_server = tornado.httpserver.HTTPServer(self)

        # non_local_traffic must be == True to match, not just some non-false value
        if non_local_traffic is True:
            http_server.listen(self._port)
        else:
            # localhost in lieu of 127.0.0.1 to support IPv6
            try:
                http_server.listen(self._port, address="localhost")
            except gaierror:
                log.warning("Warning localhost seems undefined in your host file, using 127.0.0.1 instead")
                http_server.listen(self._port, address="127.0.0.1")
        log.info("Listening on port %d" % self._port)

        # Register callbacks
        self.mloop = get_tornado_ioloop()
        logging.getLogger().setLevel(get_logging_config()['log_level'] or logging.INFO)

        def flush_trs():
            # Feed the watchdog, push buffered metrics, flush transactions.
            if self._watchdog:
                self._watchdog.reset()
            self._postMetrics()
            self._tr_manager.flush()

        tr_sched = tornado.ioloop.PeriodicCallback(flush_trs, TRANSACTION_FLUSH_INTERVAL,
                                                   io_loop=self.mloop)

        # Register optional Graphite listener
        gport = self._agentConfig.get("graphite_listen_port", None)
        if gport is not None:
            log.info("Starting graphite listener on port %s" % gport)
            from graphite import GraphiteServer
            gs = GraphiteServer(self, get_hostname(self._agentConfig), io_loop=self.mloop)
            if non_local_traffic is True:
                gs.listen(gport)
            else:
                gs.listen(gport, address="localhost")

        # Start everything
        if self._watchdog:
            self._watchdog.reset()
        tr_sched.start()

        self.mloop.start()
        log.info("Stopped")

    def stop(self):
        """Stop the IOLoop; unblocks run()."""
        self.mloop.stop()
class Reporter(threading.Thread):
    """
    The reporter periodically sends the aggregated metrics to the
    server.
    """

    def __init__(self, interval, metrics_aggregator, api_host, api_key=None, use_watchdog=False):
        """interval -- seconds between flushes (coerced with int());
        api_host -- target host, optionally prefixed with http(s)://;
        api_key -- optional key appended to request URLs;
        use_watchdog -- arm a self-destruct watchdog on the flush loop."""
        threading.Thread.__init__(self)
        self.daemon = True  # don't keep the interpreter alive on exit
        self.interval = int(interval)
        self.finished = threading.Event()
        self.metrics_aggregator = metrics_aggregator
        self.flush_count = 0

        self.watchdog = None
        if use_watchdog:
            # Imported lazily so the dependency is only needed when enabled.
            from util import Watchdog
            self.watchdog = Watchdog(WATCHDOG_TIMEOUT)

        self.api_key = api_key
        self.api_host = api_host

        # Default to HTTPS; an explicit http:// prefix selects plain HTTP.
        # The scheme prefix is stripped from api_host either way.
        self.http_conn_cls = http_client.HTTPSConnection
        match = re.match('^(https?)://(.*)', api_host)
        if match:
            self.api_host = match.group(2)
            if match.group(1) == 'http':
                self.http_conn_cls = http_client.HTTPConnection

    def stop(self):
        """Signal the run() loop to exit after its current wait."""
        self.finished.set()

    def run(self):
        """Flush the aggregator every self.interval seconds until stopped."""
        logger.info("Reporting to %s every %ss" % (self.api_host, self.interval))
        logger.debug("Watchdog enabled: %s" % bool(self.watchdog))

        while True:
            if self.finished.isSet():  # Use camel case version for 2.4 support.
                break
            self.finished.wait(self.interval)
            self.metrics_aggregator.send_packet_count('datadog.dogstatsd.packet.count')
            self.flush()
            if self.watchdog:
                self.watchdog.reset()

    def flush(self):
        """Flush aggregated metrics and submit them; never raises."""
        try:
            self.flush_count += 1
            metrics = self.metrics_aggregator.flush()
            count = len(metrics)
            if not count:
                logger.info("Flush #%s: No metrics to flush." % self.flush_count)
                return
            logger.info("Flush #%s: flushing %s metrics" % (self.flush_count, count))
            self.submit(metrics)
        except Exception:
            # Fix: was a bare `except:`, which also swallowed SystemExit and
            # KeyboardInterrupt; the sibling implementation catches Exception.
            logger.exception("Error flushing metrics")

    def submit(self, metrics):
        """POST the metric series to the API endpoint."""
        # HACK - Copy and pasted from dogapi, because it's a bit of a pain to distribute python
        # dependencies with the agent.
        body = json.dumps({"series" : metrics})
        headers = {'Content-Type':'application/json'}
        method = 'POST'
        params = {}
        if self.api_key:
            params['api_key'] = self.api_key
        url = '/api/v1/series?%s' % urlencode(params)

        start_time = time()
        status = None
        conn = self.http_conn_cls(self.api_host)
        try:
            conn.request(method, url, body, headers)
            #FIXME: add timeout handling code here
            response = conn.getresponse()
            status = response.status
            response.close()
        finally:
            # Fix: the original never closed the connection (or the response),
            # leaking a socket per flush; mirrors the newer implementation.
            conn.close()
        duration = round((time() - start_time) * 1000.0, 4)
        logger.info("%s %s %s%s (%sms)" % (
            status, method, self.api_host, url, duration))