def process(metric, datapoint):
    """Run one datapoint through the rewrite and aggregation rules.

    The metric name is passed through the pre-rewrite rules, offered to
    every aggregation rule (each match feeds the datapoint into that
    aggregate's buffer), then passed through the post-rewrite rules.  The
    rewritten metric is emitted via ``events.metricGenerated`` unless its
    name equals one of the aggregate metrics it fed.  A message is logged
    when no aggregation rule matched at all.
    """
    increment('datapointsReceived')

    for rewrite in RewriteRuleManager.preRules:
        metric = rewrite.apply(metric)

    aggregate_metrics = []
    for rule in RuleManager.rules:
        target = rule.get_aggregate_metric(metric)
        if target is not None:
            aggregate_metrics.append(target)
            values_buffer = BufferManager.get_buffer(target)
            # A buffer is configured once, by the first rule that hits it.
            if not values_buffer.configured:
                values_buffer.configure_aggregation(rule.frequency, rule.aggregation_func)
            values_buffer.input(datapoint)

    for rewrite in RewriteRuleManager.postRules:
        metric = rewrite.apply(metric)

    if metric not in aggregate_metrics:
        events.metricGenerated(metric, datapoint)

    if not aggregate_metrics:
        log.msg("Couldn't match metric %s with any aggregation rule. Passing on un-aggregated." % metric)
def processMessage(self, message):
    """Parse a message and post each of its lines as a metric.

    Every non-blank line of ``message.content.body`` must be
    ``value timestamp`` or, when the ``AMQP_METRIC_NAME_IN_BODY`` setting is
    true, ``metric value timestamp``.  Without a name in the body the metric
    name is ``message.routing_key``.  Lines that fail to parse are logged
    and skipped; blank lines are skipped silently.
    """
    if self.factory.verbose:
        log.listener("Message received: %s" % (message,))

    metric = message.routing_key
    # The setting does not depend on the line, so look it up once.
    name_in_body = settings.get("AMQP_METRIC_NAME_IN_BODY", False)

    for line in message.content.body.split("\n"):
        stripped = line.strip()
        if not stripped:
            # A body ending in "\n" produces an empty final element; that is
            # not a malformed datapoint and should not be logged as one.
            continue

        try:
            if name_in_body:
                metric, value, timestamp = stripped.split()
            else:
                value, timestamp = stripped.split()
            datapoint = (float(timestamp), float(value))
        except ValueError:
            log.listener("invalid message line: %s" % (line,))
            continue

        increment('metricsReceived')
        metricReceived(metric, datapoint)

        if self.factory.verbose:
            log.listener("Metric posted: %s %s %s" % (metric, value, timestamp,))
def compute_value(self):
    """Aggregate the buffered values, emit the result and reset the buffer.

    The datapoint is stamped with the current time minus
    ``self.aggregation_frequency``.
    """
    aggregated = self.aggregation_func(self.values)
    datapoint = (time.time() - self.aggregation_frequency, aggregated)

    # Start a fresh buffer before handing the datapoint on.
    self.values = []

    send_metric(self.metric_path, datapoint)
    increment('aggregateDatapointsSent')
def createWhisperFile(metric, dbFilePath, dbFileExists):
    """Create the whisper database file for *metric* if it does not exist.

    The archive configuration comes from the first matching entry in
    ``schemas``; xFilesFactor and aggregation method come from the first
    matching entry in ``agg_schemas`` (both stay ``None`` without a match).

    Returns:
        True when the file already exists or was created, False when
        ``whisper.create`` failed (the error is logged).

    Raises:
        Exception: when no storage schema matches the metric.
    """
    if dbFileExists:
        return True

    archiveConfig = None
    xFilesFactor, aggregationMethod = None, None

    for schema in schemas:
        if schema.matches(metric):
            log.creates('new metric %s matched schema %s' % (metric, schema.name))
            archiveConfig = [archive.getTuple() for archive in schema.archives]
            break

    for schema in agg_schemas:
        if schema.matches(metric):
            log.creates('new metric %s matched aggregation schema %s' % (metric, schema.name))
            xFilesFactor, aggregationMethod = schema.archives
            break

    if not archiveConfig:
        raise Exception("No storage schema matched the metric '%s', check your storage-schemas.conf file." % metric)

    dbDir = dirname(dbFilePath)
    try:
        os.makedirs(dbDir)
    except OSError as e:
        # An already-existing directory is expected; anything else is only
        # logged and the create below is still attempted.
        if e.errno != errno.EEXIST:
            log.err("%s" % e)

    log.creates("creating database file %s (archive=%s xff=%s agg=%s)" %
                (dbFilePath, archiveConfig, xFilesFactor, aggregationMethod))
    try:
        whisper.create(dbFilePath, archiveConfig, xFilesFactor, aggregationMethod,
                       settings.WHISPER_SPARSE_CREATE, settings.WHISPER_FALLOCATE_CREATE)
        instrumentation.increment('creates')
    except Exception as e:
        log.err("Error creating %s: %s" % (dbFilePath, e))
        return False

    # Callers test the result for truthiness, so success must not fall
    # through to an implicit None.
    return True
def connectionQualityMonitor(self):
    """Report whether this destination keeps up with what is received.

    Compares the count stored under ``self.sent`` in
    ``instrumentation.prior_stats`` with the 'metricsReceived' count.

    Returns:
        True when 'metricsReceived' is below settings.MIN_RESET_STAT_FLOW,
        or when the sent/received ratio is not below
        settings.MIN_RESET_RATIO; False when the ratio is below it.
    """
    prior = instrumentation.prior_stats
    destination_sent = float(prior.get(self.sent, 0))
    total_received = float(prior.get('metricsReceived', 0))

    # Increment by zero: presumably makes sure the counter exists without
    # changing its value -- TODO confirm against instrumentation.
    instrumentation.increment(self.slowConnectionReset, 0)

    if total_received < settings.MIN_RESET_STAT_FLOW:
        return True

    ratio = destination_sent / total_received
    return not (ratio < settings.MIN_RESET_RATIO)
def process(metric, datapoint):
    """Run one datapoint through the rewrite and aggregation rules.

    Pre-rewrite rules are applied to the metric name, every matching
    aggregation rule receives the datapoint in its aggregate buffer, and
    post-rewrite rules are applied afterwards.  The rewritten metric is
    emitted via ``events.metricGenerated`` unless its name equals one of
    the aggregate metrics it fed.
    """
    increment('datapointsReceived')

    for rewrite in RewriteRuleManager.preRules:
        metric = rewrite.apply(metric)

    aggregate_metrics = []
    for rule in RuleManager.rules:
        target = rule.get_aggregate_metric(metric)
        if target is not None:
            aggregate_metrics.append(target)
            values_buffer = BufferManager.get_buffer(target)
            # Only the first rule to reach a buffer configures it.
            if not values_buffer.configured:
                values_buffer.configure_aggregation(rule.frequency, rule.aggregation_func)
            values_buffer.input(datapoint)

    for rewrite in RewriteRuleManager.postRules:
        metric = rewrite.apply(metric)

    if metric not in aggregate_metrics:
        events.metricGenerated(metric, datapoint)
def process(self, metric, datapoint):
    """Aggregate one datapoint and yield it onward when appropriate.

    Applies the PRE rewrite rules, feeds the datapoint into the buffer of
    every matching aggregation rule, applies the POST rewrite rules and
    yields ``(metric, datapoint)`` unless the rewritten name equals one of
    the aggregate metrics it fed.
    """
    increment('datapointsReceived')

    for rewrite in RewriteRuleManager.rules(PRE):
        metric = rewrite.apply(metric)

    aggregate_metrics = set()
    for rule in RuleManager.rules:
        target = rule.get_aggregate_metric(metric)
        if target is not None:
            aggregate_metrics.add(target)
            target_buffer = BufferManager.get_buffer(target)
            # Only the first rule to reach a buffer configures it.
            if not target_buffer.configured:
                target_buffer.configure_aggregation(rule.frequency, rule.aggregation_func)
            target_buffer.input(datapoint)

    for rewrite in RewriteRuleManager.rules(POST):
        metric = rewrite.apply(metric)

    if metric not in aggregate_metrics:
        yield (metric, datapoint)
def process(self, metric, datapoint):
    """Aggregate one datapoint and optionally forward it un-aggregated.

    The datapoint is fed into the buffer of every matching aggregation
    rule.  When ``settings.FORWARD_ALL`` is set and the metric name is not
    itself one of the aggregates it fed, ``(metric, datapoint)`` is
    yielded; a miss is logged first when ``settings.LOG_AGGREGATOR_MISSES``
    is set and no rule matched.
    """
    increment('datapointsReceived')

    aggregate_metrics = set()
    for rule in RuleManager.rules:
        target = rule.get_aggregate_metric(metric)
        if target is not None:
            aggregate_metrics.add(target)
            target_buffer = BufferManager.get_buffer(target)
            # Only the first rule to reach a buffer configures it.
            if not target_buffer.configured:
                target_buffer.configure_aggregation(rule.frequency, rule.aggregation_func)
            target_buffer.input(datapoint)

    if not settings.FORWARD_ALL or metric in aggregate_metrics:
        return

    if settings.LOG_AGGREGATOR_MISSES and not aggregate_metrics:
        log.msg(
            "Couldn't match metric %s with any aggregation rule. Passing on un-aggregated." % metric)
    yield (metric, datapoint)
def writeWhisperFile(dbFilePath, datapoints):
    """Write *datapoints* to the whisper file at *dbFilePath*.

    Returns:
        True on success.  On failure the error is logged, the 'errors'
        counter is incremented and False is returned.
    """
    try:
        whisper.update_many(dbFilePath, datapoints)
    except Exception:
        # Deliberately not a bare ``except:`` so that SystemExit and
        # KeyboardInterrupt are not reported as write errors.
        log.msg("Error writing to %s" % (dbFilePath,))
        log.err()
        instrumentation.increment('errors')
        return False
    return True
def lineReceived(self, line):
    """Parse one ``metric value timestamp`` line and post it as a metric.

    Lines that do not have exactly three whitespace-separated fields, or
    whose value/timestamp are not numeric, are logged and ignored.
    """
    try:
        metric, value, timestamp = line.strip().split()
        datapoint = (float(timestamp), float(value))
    except ValueError:
        # Both a wrong field count (unpacking) and a non-numeric field
        # (float()) raise ValueError; anything else is a real bug and
        # should not be hidden behind an "invalid line" message.
        log.listener('invalid line received from client %s, ignoring' % self.peerAddr)
        return

    increment('metricsReceived')
    metricReceived(metric, datapoint)
def sendDatapoint(self, metric, datapoint):
    """Send a datapoint now, or queue it on the factory.

    While paused the datapoint is queued and counted.  When the factory
    already holds queued datapoints the new one is appended and the queue
    is drained, so it goes out behind the earlier ones.  Otherwise it is
    sent directly.
    """
    if self.paused:
        self.factory.enqueue(metric, datapoint)
        instrumentation.increment(self.queuedUntilReady)
        return

    if self.factory.hasQueuedDatapoints():
        self.factory.enqueue(metric, datapoint)
        self.sendQueued()
        return

    self._sendDatapoints([(metric, datapoint)])
def store(self, metric, datapoint):
    """Append *datapoint* to the cached list for *metric*.

    When the cache reports itself full the datapoint is dropped and the
    'cache.overflow' counter is incremented.  Empty path components are
    removed from the metric name before it is used as the key.
    """
    if self.isFull():
        increment('cache.overflow')
        return

    # Normalize the path: "a..b." and "a.b" must land on the same key.
    metric = '.'.join(part for part in metric.split('.') if part)

    # ``with`` releases the lock only if it was actually acquired; the
    # earlier acquire-inside-try form would call release() on an un-held
    # lock if acquire() itself raised.
    with self.lock:
        self.setdefault(metric, []).append(datapoint)
        self.size += 1
def sendQueued(self):
    """Drain the factory queue in pickled batches while not paused.

    After draining, paused metric receivers are resumed when flow control
    is enabled and the queue has dropped below SEND_QUEUE_LOW_WATERMARK.
    """
    while not self.paused and self.factory.hasQueuedDatapoints():
        batch = self.factory.takeSomeFromQueue()
        payload = pickle.dumps(batch, protocol=-1)
        self.sendString(payload)
        self.factory.checkQueue()
        instrumentation.increment(self.sent, len(batch))

    can_resume = (
        settings.USE_FLOW_CONTROL
        and state.metricReceiversPaused
        and self.factory.queueSize < SEND_QUEUE_LOW_WATERMARK
    )
    if can_resume:
        log.clients('send queue has space available, resuming paused clients')
        events.resumeReceivingMetrics()
def send(self, metric, datapoint):
    """Send a datapoint now, or queue it on this protocol.

    While paused the datapoint is queued and counted.  When earlier
    datapoints are still queued the new one is appended and the queue is
    flushed.  Otherwise it is pickled and sent on its own.
    """
    item = (metric, datapoint)

    if self.paused:
        self.queue.append(item)
        increment(self.queuedUntilReady)
        return

    if self.queue:
        self.queue.append(item)
        self.flushQueue()
        return

    self.sendString(pickle.dumps([item]))
    increment(self.sent)
def sendDatapoint(self, metric, datapoint):
    """Send a datapoint now, or queue it on the factory.

    While paused the datapoint is queued and counted.  When the factory
    already holds queued datapoints the new one is appended and the queue
    is drained.  Otherwise it is pickled and sent directly, after which
    the factory is asked to check its queue.
    """
    if self.paused:
        self.factory.enqueue(metric, datapoint)
        instrumentation.increment(self.queuedUntilReady)
        return

    if self.factory.hasQueuedDatapoints():
        self.factory.enqueue(metric, datapoint)
        self.sendQueued()
        return

    payload = pickle.dumps([(metric, datapoint)], protocol=-1)
    self.sendString(payload)
    instrumentation.increment(self.sent)
    self.factory.checkQueue()
def resetConnectionForQualityReasons(self, reason):
    """Disconnect the current protocol, at most once per reset interval.

    Nothing happens when fewer than settings.MIN_RESET_INTERVAL seconds
    have passed since the last reset.  *reason* is a string describing the
    quality measurement that triggered the reset; it is only logged.
    """
    elapsed = time() - self.lastResetTime
    if elapsed < float(settings.MIN_RESET_INTERVAL):
        return

    self.factory.connectedProtocol.disconnect()
    self.lastResetTime = time()
    instrumentation.increment(self.slowConnectionReset)
    log.clients("%s:: resetConnectionForQualityReasons: %s" % (self, reason))
def stringReceived(self, data):
    """Unpickle a batch of ``(metric, (timestamp, value))`` and post it.

    An undecodable payload is logged and the connection is dropped.
    Entries whose timestamp/value cannot be converted to float are
    skipped, as are NaN values.  'metricsReceived' is incremented by the
    number of entries in the batch.
    """
    try:
        # NOTE(security): pickle.loads on data read from the network can
        # execute arbitrary code; only expose this listener to trusted
        # peers.
        datapoints = pickle.loads(data)
    except Exception:
        log.listener('invalid pickle received from client %s, disconnecting' % self.peerAddr)
        self.transport.loseConnection()
        return

    for (metric, datapoint) in datapoints:
        try:
            datapoint = (float(datapoint[0]), float(datapoint[1]))  # force proper types
        except Exception:
            # One malformed entry must not abort the rest of the batch.
            continue
        if datapoint[1] == datapoint[1]:  # NaN != NaN, so this drops NaN values
            metricReceived(metric, datapoint)

    increment('metricsReceived', len(datapoints))
def writeCachedDataPoints():
    """Write datapoints until the MetricCache is completely empty.

    For each metric produced by ``optimalWriteOrder()`` the database is
    created first when it does not exist (using the first matching storage
    and aggregation schemas), then the cached datapoints are written.
    Create and write failures are logged and counted under 'errors'.

    Raises:
        Exception: when a new metric matches no storage schema.
    """
    while MetricCache:
        dataWritten = False

        for (metric, datapoints, dbFileExists) in optimalWriteOrder():
            dataWritten = True

            if not dbFileExists:
                archiveConfig = None
                xFilesFactor, aggregationMethod = None, None

                for schema in SCHEMAS:
                    if schema.matches(metric):
                        log.creates('new metric %s matched schema %s' % (metric, schema.name))
                        archiveConfig = [archive.getTuple() for archive in schema.archives]
                        break

                for schema in AGGREGATION_SCHEMAS:
                    if schema.matches(metric):
                        log.creates('new metric %s matched aggregation schema %s' % (metric, schema.name))
                        xFilesFactor, aggregationMethod = schema.archives
                        break

                if not archiveConfig:
                    raise Exception("No storage schema matched the metric '%s', check your storage-schemas.conf file." % metric)

                log.creates("creating database metric %s (archive=%s xff=%s agg=%s)" %
                            (metric, archiveConfig, xFilesFactor, aggregationMethod))
                try:
                    state.database.create(metric, archiveConfig, xFilesFactor, aggregationMethod)
                    instrumentation.increment('creates')
                except Exception as e:
                    log.err()
                    log.msg("Error creating %s: %s" % (metric, e))
                    instrumentation.increment('errors')
                    continue

            # If we've got a rate limit configured lets make sure we enforce it
            if UPDATE_BUCKET:
                UPDATE_BUCKET.drain(1, blocking=True)

            try:
                t1 = time.time()
                # If we have duplicated points, always pick the last.
                # update_many() has no guaranteed behavior for that, and in
                # fact the current implementation will keep the first point
                # in the list.
                datapoints = dict(datapoints).items()
                state.database.write(metric, datapoints)
                updateTime = time.time() - t1
            except Exception as e:
                log.err()
                log.msg("Error writing to %s: %s" % (metric, e))
                instrumentation.increment('errors')
            else:
                pointCount = len(datapoints)
                instrumentation.increment('committedPoints', pointCount)
                instrumentation.append('updateTimes', updateTime)
                if settings.LOG_UPDATES:
                    log.updates("wrote %d datapoints for %s in %.5f seconds" % (pointCount, metric, updateTime))

        # Avoid churning CPU when a pass over the cache produced nothing to
        # write (e.g. only rate-limited new metrics are cached).
        if not dataWritten:
            time.sleep(0.1)
def compute_value(self):
    """Emit an aggregate for every active interval and expire old ones.

    Each active interval buffer is aggregated, sent as
    ``(interval, value)`` and marked inactive.  Buffers whose interval is
    older than ``MAX_AGGREGATION_INTERVALS`` aggregation periods before
    the current interval are removed.
    """
    now = int(time.time())
    current_interval = now - (now % self.aggregation_frequency)
    age_threshold = current_interval - (
        settings["MAX_AGGREGATION_INTERVALS"] * self.aggregation_frequency)

    # Iterate over a snapshot: entries are deleted from the dict inside the
    # loop, which raises RuntimeError when iterating a live dict view.
    for interval_buffer in list(self.interval_buffers.values()):
        if interval_buffer.active:
            value = self.aggregation_func(interval_buffer.values)
            datapoint = (interval_buffer.interval, value)
            send_metric(self.metric_path, datapoint)
            increment("aggregateDatapointsSent")
            interval_buffer.mark_inactive()

        if interval_buffer.interval < age_threshold:
            del self.interval_buffers[interval_buffer.interval]
def process(metric, datapoint):
    """Run one datapoint through rewrite rules and the aggregation method.

    ``settings.AGGREGATOR_RULE_METHOD`` selects the method:

    * "rules"  -- every matching rule in ``RuleManager.rules`` receives
      the datapoint in its aggregate buffer.
    * "sumall" -- a metric containing ".sum." is summed into
      ``<prefix>.sum_all.hosts``; the interval is taken from the
      "aggregation-sumall-rules" setting keyed by the metric's first path
      component, defaulting to 60.

    After the post-rewrite rules the metric is emitted via
    ``events.metricGenerated`` when ``FORWARD_ALL`` is set and its name is
    not one of the aggregates it fed.
    """
    increment('datapointsReceived')

    for rewrite in RewriteRuleManager.preRules:
        metric = rewrite.apply(metric)

    aggregate_metrics = []
    method = settings.AGGREGATOR_RULE_METHOD

    if method == "rules":
        for rule in RuleManager.rules:
            target = rule.get_aggregate_metric(metric)
            if target is not None:
                aggregate_metrics.append(target)
                values_buffer = BufferManager.get_buffer(target)
                if not values_buffer.configured:
                    values_buffer.configure_aggregation(rule.frequency, rule.aggregation_func)
                values_buffer.input(datapoint)

    # Custom rule to sum metrics
    elif method == "sumall":
        sum_index = metric.find(".sum.")
        # Looked up unconditionally, before the ".sum." check.
        nsDict = settings["aggregation-sumall-rules"]

        if sum_index != -1:
            # A "." is guaranteed here because ".sum." was found.
            metric_namespace = metric.partition(".")[0]
            target = metric[:sum_index] + ".sum_all.hosts"
            aggregate_metrics.append(target)
            values_buffer = BufferManager.get_buffer(target)

            if metric_namespace in nsDict:
                aggregate_time_interval = int(nsDict[metric_namespace])
            else:
                aggregate_time_interval = 60

            if not values_buffer.configured:
                values_buffer.configure_aggregation(aggregate_time_interval, sum)
            values_buffer.input(datapoint)

    for rewrite in RewriteRuleManager.postRules:
        metric = rewrite.apply(metric)

    if settings['FORWARD_ALL'] and metric not in aggregate_metrics:
        events.metricGenerated(metric, datapoint)
def sendHighPriorityDatapoint(self, metric, datapoint):
    """Queue a datapoint about the carbon daemon itself ahead of the rest.

    The datapoint is enqueued from the left so that, when the relay is
    overwhelmed, its own stats are more likely to get through and expose
    the problem.  No queue-size limit is checked here, so these stats are
    queued even when the queue is at capacity; this relies on the queue
    not having a fixed maximum size.
    """
    instrumentation.increment(self.attemptedRelays)
    self.enqueue_from_left(metric, datapoint)

    protocol = self.connectedProtocol
    if not protocol:
        instrumentation.increment(self.queuedUntilConnected)
        return

    reactor.callLater(settings.TIME_TO_DEFER_SENDING, protocol.sendQueued)
def optimalWriteOrder():
    """Yield cached metrics, largest queue first, soft-limiting creates.

    Yields ``(metric, datapoints, dbFilePath, dbFileExists)`` tuples.  New
    metrics (no file on disk yet) beyond settings.MAX_CREATES_PER_MINUTE
    within a 60 second window have their cached datapoints dropped and are
    counted under 'droppedCreates'.
    """
    global lastCreateInterval
    global createCount

    metrics = MetricCache.counts()

    sort_started = time.time()
    metrics.sort(key=lambda item: item[1], reverse=True)  # by queue size, descending
    log.debug("Sorted %d cache queues in %.6f seconds" % (len(metrics), time.time() - sort_started))

    for metric, queueSize in metrics:
        if state.cacheTooFull and MetricCache.size < CACHE_SIZE_LOW_WATERMARK:
            events.cacheSpaceAvailable()

        dbFilePath = getFilesystemPath(metric)
        dbFileExists = exists(dbFilePath)

        if not dbFileExists:
            createCount += 1
            now = time.time()

            if now - lastCreateInterval >= 60:
                # A new one-minute window starts with this create.
                lastCreateInterval = now
                createCount = 1
            elif createCount >= settings.MAX_CREATES_PER_MINUTE:
                # Dropping queued up datapoints for new metrics prevents
                # filling up the entire cache when a bunch of new metrics
                # are received.
                try:
                    MetricCache.pop(metric)
                except KeyError:
                    pass
                instrumentation.increment('droppedCreates')
                continue

        try:
            # Metrics can momentarily disappear from the MetricCache due to
            # the implementation of MetricCache.store().
            datapoints = MetricCache.pop(metric)
        except KeyError:
            # Simply move on to the next metric when this race occurs.
            log.msg("MetricCache contention, skipping %s update for now" % metric)
            continue

        yield (metric, datapoints, dbFilePath, dbFileExists)
def stringReceived(self, data):
    """Unpickle a batch of ``(metric, (timestamp, value))`` and post it.

    An undecodable payload is logged and ignored.  Entries whose
    timestamp/value cannot be converted to float are skipped, as are NaN
    values.  'metricsReceived' is incremented by the number of entries in
    the batch.
    """
    try:
        # NOTE(security): pickle.loads on data read from the network can
        # execute arbitrary code; only expose this listener to trusted
        # peers.
        datapoints = pickle.loads(data)
    except Exception:
        log.listener('invalid pickle received from client %s, ignoring' % self.peerAddr)
        return

    for (metric, datapoint) in datapoints:
        try:
            datapoint = (float(datapoint[0]), float(datapoint[1]))  # force proper types
        except Exception:
            # Skip the malformed entry, keep processing the batch.
            continue
        if datapoint[1] == datapoint[1]:  # NaN != NaN, so this drops NaN values
            metricReceived(metric, datapoint)

    increment('metricsReceived', len(datapoints))
def writeCachedDataPoints():
    """Write datapoints until the MetricCache is completely empty.

    File creation and the write itself happen while holding
    ``write_lock``.  Successful writes are counted, optionally logged, and
    rate-limited to settings.MAX_UPDATES_PER_SECOND.
    """
    updates = 0
    lastSecond = 0

    while MetricCache:
        dataWritten = False

        for (metric, datapoints, dbFilePath, dbFileExists) in optimalWriteOrder():
            dataWritten = True

            write_lock.acquire()
            try:
                if not createWhisperFile(metric, dbFilePath, dbFileExists):
                    continue
                started = time.time()
                written = writeWhisperFile(dbFilePath, datapoints)
            finally:
                write_lock.release()

            if not written:
                continue

            finished = time.time()
            updateTime = finished - started
            pointCount = len(datapoints)
            instrumentation.increment('committedPoints', pointCount)
            instrumentation.append('updateTimes', updateTime)

            if settings.LOG_UPDATES:
                log.updates("wrote %d datapoints for %s in %.5f seconds" % (pointCount, metric, updateTime))

            # Rate limit update operations
            thisSecond = int(finished)
            if thisSecond != lastSecond:
                lastSecond = thisSecond
                updates = 0
            else:
                updates += 1
                if updates >= settings.MAX_UPDATES_PER_SECOND:
                    # Sleep until the start of the next whole second.
                    time.sleep(int(finished + 1) - finished)

        # Avoid churning CPU when only new metrics are in the cache
        if not dataWritten:
            time.sleep(0.1)
def writeCachedDataPoints():
    """Write datapoints until the MetricCache is completely empty.

    For each metric produced by ``optimalWriteOrder()`` the database is
    created first when it does not exist (using the first matching storage
    and aggregation schemas), then the cached datapoints are written.

    Raises:
        Exception: when a new metric matches no storage schema.
    """
    while MetricCache:
        dataWritten = False

        for (metric, datapoints, dbFilePath, dbFileExists) in optimalWriteOrder():
            dataWritten = True

            if not dbFileExists:
                archiveConfig = None
                xFilesFactor = None
                aggregationMethod = None

                for storage_schema in SCHEMAS:
                    if storage_schema.matches(metric):
                        log.creates('new metric %s matched schema %s' % (metric, storage_schema.name))
                        archiveConfig = [archive.getTuple() for archive in storage_schema.archives]
                        break

                for agg_schema in AGGREGATION_SCHEMAS:
                    if agg_schema.matches(metric):
                        log.creates('new metric %s matched aggregation schema %s' % (metric, agg_schema.name))
                        xFilesFactor, aggregationMethod = agg_schema.archives
                        break

                if not archiveConfig:
                    raise Exception("No storage schema matched the metric '%s', check your storage-schemas.conf file." % metric)

                log.creates("creating database file %s (archive=%s xff=%s agg=%s)" %
                            (dbFilePath, archiveConfig, xFilesFactor, aggregationMethod))
                try:
                    state.database.create(metric, archiveConfig, xFilesFactor, aggregationMethod)
                    instrumentation.increment('creates')
                except Exception:
                    log.err("Error creating %s" % (dbFilePath))
                    continue

            # If we've got a rate limit configured lets make sure we enforce it
            if UPDATE_BUCKET:
                UPDATE_BUCKET.drain(1, blocking=True)

            try:
                write_started = time.time()
                state.database.write(metric, datapoints)
                updateTime = time.time() - write_started
            except Exception:
                log.msg("Error writing to %s" % (dbFilePath))
                log.err()
                instrumentation.increment('errors')
            else:
                pointCount = len(datapoints)
                instrumentation.increment('committedPoints', pointCount)
                instrumentation.append('updateTimes', updateTime)
                if settings.LOG_UPDATES:
                    log.updates("wrote %d datapoints for %s in %.5f seconds" % (pointCount, metric, updateTime))

        # Avoid churning CPU when only new metrics are in the cache
        if not dataWritten:
            time.sleep(0.1)
def sendDatapoint(self, metric, datapoint):
    """Relay a datapoint, queue it until connected, or drop it when full.

    Every call counts as an attempted relay.  When the queue already holds
    settings.MAX_QUEUE_SIZE items the datapoint is dropped, logged and
    counted; with a connected protocol it is handed to that protocol;
    otherwise it is queued until a connection exists.
    """
    instrumentation.increment(self.attemptedRelays)

    if len(self.queue) >= settings.MAX_QUEUE_SIZE:
        # The "%s" placeholder used to be logged literally; fill it in.
        log.clients("%s::sendDatapoint send queue full, dropping datapoint" % (self,))
        instrumentation.increment(self.fullQueueDrops)
    elif self.connectedProtocol:
        self.connectedProtocol.sendDatapoint(metric, datapoint)
    else:
        self.enqueue(metric, datapoint)
        instrumentation.increment(self.queuedUntilConnected)
def send(self, metric, datapoint):
    """Relay a datapoint, queue it until connected, or drop it when full.

    Every call counts as an attempted relay.  When the queue already holds
    settings.MAX_QUEUE_SIZE items the datapoint is dropped, logged and
    counted; with a connected protocol it is handed to that protocol;
    otherwise it is queued until a connection exists.
    """
    increment(self.attemptedRelays)

    if len(self.queue) >= settings.MAX_QUEUE_SIZE:
        # Wrap in a 1-tuple: if remoteAddr is itself a (host, port) tuple,
        # "%s" % remoteAddr raises TypeError instead of logging the drop.
        log.relay('relay queue full for %s, dropping data' % (self.remoteAddr,))
        increment(self.fullQueueDrops)
    elif self.connectedProtocol:
        self.connectedProtocol.send(metric, datapoint)
    else:
        self.queue.append((metric, datapoint))
        increment(self.queuedUntilConnected)
def sendDatapoint(self, metric, datapoint):
    """Relay a datapoint, queue it until connected, or drop it when full.

    When the queue size has reached settings.MAX_QUEUE_SIZE the
    ``queueFull`` deferred is fired (once) with the queue size and the
    datapoint is dropped and counted.
    """
    instrumentation.increment(self.attemptedRelays)

    queueSize = self.queueSize
    if queueSize >= settings.MAX_QUEUE_SIZE:
        # Fire the deferred only if nothing has fired it yet.
        if not self.queueFull.called:
            self.queueFull.callback(queueSize)
        instrumentation.increment(self.fullQueueDrops)
        return

    if self.connectedProtocol:
        self.connectedProtocol.sendDatapoint(metric, datapoint)
        return

    self.enqueue(metric, datapoint)
    instrumentation.increment(self.queuedUntilConnected)
def sendDatapoint(self, metric, datapoint):
    """Queue a datapoint (or drop it when full) and schedule a send.

    When the queue size has reached settings.MAX_QUEUE_SIZE the
    ``queueFull`` deferred is fired (once) and the datapoint is dropped
    and counted; otherwise it is enqueued.  In both cases a deferred
    ``sendQueued`` call is scheduled when a protocol is connected, else
    the queued-until-connected counter is incremented.
    """
    instrumentation.increment(self.attemptedRelays)

    is_full = self.queueSize >= settings.MAX_QUEUE_SIZE
    if not is_full:
        self.enqueue(metric, datapoint)
    else:
        if not self.queueFull.called:
            self.queueFull.callback(self.queueSize)
        instrumentation.increment(self.fullQueueDrops)

    if not self.connectedProtocol:
        instrumentation.increment(self.queuedUntilConnected)
        return

    reactor.callLater(settings.TIME_TO_DEFER_SENDING, self.connectedProtocol.sendQueued)
def writeCachedDataPoints():
    """Write datapoints until the MetricCache is completely empty.

    Databases are created through ``persister.create_db`` when missing and
    written through ``persister.update_many``.  Successful writes are
    counted, optionally logged, and rate-limited to
    settings.MAX_UPDATES_PER_SECOND; failures are logged and counted under
    'errors'.
    """
    updates = 0
    lastSecond = 0

    while MetricCache:
        dataWritten = False

        for (metric, datapoints, dbIdentifier, dbExists) in optimalWriteOrder():
            dataWritten = True

            if not dbExists:
                persister.create_db(metric)
                instrumentation.increment('creates')

            try:
                t1 = time.time()
                persister.update_many(metric, datapoints, dbIdentifier)
                t2 = time.time()
                updateTime = t2 - t1
            except Exception:
                # Not a bare ``except:`` so SystemExit/KeyboardInterrupt
                # are not swallowed and reported as write errors.
                log.msg("Error writing to %s" % (dbIdentifier,))
                log.err()
                instrumentation.increment('errors')
            else:
                pointCount = len(datapoints)
                instrumentation.increment('committedPoints', pointCount)
                instrumentation.append('updateTimes', updateTime)

                if settings.LOG_UPDATES:
                    log.updates("wrote %d datapoints for %s in %.5f seconds" % (pointCount, metric, updateTime))

                # Rate limit update operations.  Kept on the success path
                # because it relies on t2 from the write above.
                thisSecond = int(t2)
                if thisSecond != lastSecond:
                    lastSecond = thisSecond
                    updates = 0
                else:
                    updates += 1
                    if updates >= settings.MAX_UPDATES_PER_SECOND:
                        time.sleep(int(t2 + 1) - t2)

        # Let the persister know it can flush
        # (depends on the implementation)
        persister.flush()

        # Avoid churning CPU when only new metrics are in the cache
        if not dataWritten:
            time.sleep(0.1)
def send(self, metric, datapoint):
    """Relay a datapoint, queue it until connected, or drop it when full.

    Every call counts as an attempted relay.  A full queue
    (settings.MAX_QUEUE_SIZE items) drops the datapoint and logs the
    remote address.
    """
    increment(self.attemptedRelays)

    if len(self.queue) >= settings.MAX_QUEUE_SIZE:
        log.relay('relay queue full for %s:%d, dropping data' % self.remoteAddr)
        increment(self.fullQueueDrops)
        return

    protocol = self.connectedProtocol
    if protocol:
        protocol.send(metric, datapoint)
        return

    self.queue.append((metric, datapoint))
    increment(self.queuedUntilConnected)
def sendDatapoint(self, metric, datapoint):
    """Queue a datapoint (or drop it when full) and schedule a send.

    Records the attempt and the current queue size under
    ``relayMaxQueueLength``.  When the queue size has reached
    settings.MAX_QUEUE_SIZE the ``queueFull`` deferred is fired (once) and
    the datapoint is dropped and counted; otherwise it is enqueued.  In
    both cases a send is scheduled when a protocol is connected, else the
    queued-until-connected counter is incremented.
    """
    instrumentation.increment(self.attemptedRelays)
    instrumentation.max(self.relayMaxQueueLength, self.queueSize)

    is_full = self.queueSize >= settings.MAX_QUEUE_SIZE
    if not is_full:
        self.enqueue(metric, datapoint)
    else:
        if not self.queueFull.called:
            self.queueFull.callback(self.queueSize)
        instrumentation.increment(self.fullQueueDrops)

    if not self.connectedProtocol:
        instrumentation.increment(self.queuedUntilConnected)
        return

    self.scheduleSend()