def __init__(self, metrics, metric_group_prefix, conns):
    """Register client-level network sensors on the given metrics registry.

    Arguments:
        metrics: metrics registry to register sensors/metrics on.
        metric_group_prefix (str): prepended to '-metrics' to form the group.
        conns: live connection container; its length is sampled lazily by the
            'connection-count' gauge.
    """
    self.metrics = metrics
    self.metric_group_name = metric_group_prefix + '-metrics'

    def _name(name, description):
        # All metrics in this block share the same group name.
        return metrics.metric_name(name, self.metric_group_name, description)

    self.connection_closed = metrics.sensor('connections-closed')
    self.connection_closed.add(
        _name('connection-close-rate',
              'Connections closed per second in the window.'),
        Rate())

    self.connection_created = metrics.sensor('connections-created')
    self.connection_created.add(
        _name('connection-creation-rate',
              'New connections established per second in the window.'),
        Rate())

    # select-time tracks both how often select() ran and how long it waited.
    self.select_time = metrics.sensor('select-time')
    self.select_time.add(
        _name('select-rate',
              'Number of times the I/O layer checked for new I/O to perform per second'),
        Rate(sampled_stat=Count()))
    self.select_time.add(
        _name('io-wait-time-ns-avg',
              'The average length of time the I/O thread spent waiting for a socket ready for reads or writes in nanoseconds.'),
        Avg())
    self.select_time.add(
        _name('io-wait-ratio',
              'The fraction of time the I/O thread spent waiting.'),
        Rate(time_unit=TimeUnit.NANOSECONDS))

    self.io_time = metrics.sensor('io-time')
    self.io_time.add(
        _name('io-time-ns-avg',
              'The average length of time for I/O per select call in nanoseconds.'),
        Avg())
    self.io_time.add(
        _name('io-ratio',
              'The fraction of time the I/O thread spent doing I/O'),
        Rate(time_unit=TimeUnit.NANOSECONDS))

    # Gauge: re-evaluated against the live `conns` container on each read.
    metrics.add_metric(
        _name('connection-count',
              'The current number of active connections.'),
        AnonMeasurable(lambda config, now: len(conns)))
def __init__(self, heartbeat, metrics, prefix, tags=None):
    """Register group-coordinator sensors: heartbeat, join, and sync metrics.

    Arguments:
        heartbeat: heartbeat state object; its ``last_send`` attribute is read
            lazily by the 'last-heartbeat-seconds-ago' gauge below.
        metrics: metrics registry to register sensors/metrics on.
        prefix (str): prepended to "-coordinator-metrics" to form the group.
        tags (dict, optional): tags applied to every metric name registered here.
    """
    self.heartbeat = heartbeat
    self.metrics = metrics
    self.metric_group_name = prefix + "-coordinator-metrics"

    # Heartbeat request round-trip time and frequency.
    self.heartbeat_latency = metrics.sensor('heartbeat-latency')
    self.heartbeat_latency.add(metrics.metric_name(
        'heartbeat-response-time-max', self.metric_group_name,
        'The max time taken to receive a response to a heartbeat request',
        tags), Max())
    self.heartbeat_latency.add(metrics.metric_name(
        'heartbeat-rate', self.metric_group_name,
        'The average number of heartbeats per second',
        tags), Rate(sampled_stat=Count()))

    # Group (re)join timing and frequency.
    self.join_latency = metrics.sensor('join-latency')
    self.join_latency.add(metrics.metric_name(
        'join-time-avg', self.metric_group_name,
        'The average time taken for a group rejoin',
        tags), Avg())
    self.join_latency.add(metrics.metric_name(
        'join-time-max', self.metric_group_name,
        'The max time taken for a group rejoin',
        tags), Max())
    self.join_latency.add(metrics.metric_name(
        'join-rate', self.metric_group_name,
        'The number of group joins per second',
        tags), Rate(sampled_stat=Count()))

    # Group sync timing and frequency.
    self.sync_latency = metrics.sensor('sync-latency')
    self.sync_latency.add(metrics.metric_name(
        'sync-time-avg', self.metric_group_name,
        'The average time taken for a group sync',
        tags), Avg())
    self.sync_latency.add(metrics.metric_name(
        'sync-time-max', self.metric_group_name,
        'The max time taken for a group sync',
        tags), Max())
    self.sync_latency.add(metrics.metric_name(
        'sync-rate', self.metric_group_name,
        'The number of group syncs per second',
        tags), Rate(sampled_stat=Count()))

    # Gauge: seconds since the last heartbeat send.
    # NOTE(review): `now` appears to be milliseconds (divided by 1000) while
    # heartbeat.last_send looks like seconds — confirm units against callers.
    metrics.add_metric(metrics.metric_name(
        'last-heartbeat-seconds-ago', self.metric_group_name,
        'The number of seconds since the last controller heartbeat was sent',
        tags), AnonMeasurable(
            lambda _, now: (now / 1000) - self.heartbeat.last_send))
def __init__(self, metrics, metric_group_prefix, subscription):
    """Register consumer-coordinator commit sensors and an assignment gauge.

    Arguments:
        metrics: metrics registry to register sensors/metrics on.
        metric_group_prefix (str): prepended to '-coordinator-metrics'.
        subscription: subscription state; assigned_partitions() is sampled
            lazily by the 'assigned-partitions' gauge.
    """
    self.metrics = metrics
    self.metric_group_name = '%s-coordinator-metrics' % (
        metric_group_prefix,)

    # All three commit statistics hang off a single sensor.
    commit = metrics.sensor('commit-latency')
    for stat_name, description, stat in (
            ('commit-latency-avg',
             'The average time taken for a commit request', Avg()),
            ('commit-latency-max',
             'The max time taken for a commit request', Max()),
            ('commit-rate',
             'The number of commit calls per second',
             Rate(sampled_stat=Count()))):
        commit.add(
            metrics.metric_name(stat_name, self.metric_group_name,
                                description),
            stat)
    self.commit_latency = commit

    # Gauge: current partition-assignment size, evaluated on read.
    metrics.add_metric(
        metrics.metric_name(
            'assigned-partitions', self.metric_group_name,
            'The number of partitions currently assigned to this consumer'),
        AnonMeasurable(
            lambda config, now: len(subscription.assigned_partitions())))
def __init__(
    self,
    memory,
    poolable_size,
    metrics=None,
    metric_group_prefix="producer-metrics",
):
    """Create a new buffer pool.

    Arguments:
        memory (int): maximum memory that this buffer pool can allocate
        poolable_size (int): memory size per buffer to cache in the free
            list rather than deallocating
        metrics (optional): metrics registry; when provided, a
            'bufferpool-wait-ratio' metric is registered.
        metric_group_prefix (str): metric group name for the wait sensor.
    """
    self._poolable_size = poolable_size
    self._lock = threading.RLock()

    # Floor division keeps the buffer count exact for large integers
    # (int(memory / poolable_size) goes through float and can lose
    # precision above 2**53); guard against poolable_size == 0.
    buffers = memory // poolable_size if poolable_size else 0
    self._free = collections.deque([io.BytesIO() for _ in range(buffers)])

    # Threads blocked waiting for a free buffer, in arrival order.
    self._waiters = collections.deque()

    self.wait_time = None
    if metrics:
        self.wait_time = metrics.sensor("bufferpool-wait-time")
        self.wait_time.add(
            metrics.metric_name(
                "bufferpool-wait-ratio",
                metric_group_prefix,
                "The fraction of time an appender waits for space allocation.",
            ),
            Rate(),
        )
def test_rate_windowing(mocker, time_keeper, metrics): mocker.patch('time.time', side_effect=time_keeper.time) # Use the default time window. Set 3 samples config = MetricConfig(samples=3) sensor = metrics.sensor('test.sensor', config) sensor.add(metrics.metric_name('test.rate', 'grp1'), Rate(TimeUnit.SECONDS)) sum_val = 0 count = config.samples - 1 # Advance 1 window after every record for i in range(count): sensor.record(100) sum_val += 100 time_keeper.sleep(config.time_window_ms / 1000.0) # Sleep for half the window. time_keeper.sleep(config.time_window_ms / 2.0 / 1000.0) # prior to any time passing elapsed_secs = (config.time_window_ms * (config.samples - 1) + config.time_window_ms / 2.0) / 1000.0 kafka_metric = metrics.metrics.get(metrics.metric_name('test.rate', 'grp1')) assert abs((sum_val / elapsed_secs) - kafka_metric.value()) < EPS, \ 'Rate(0...2) = 2.666' assert abs(elapsed_secs - (kafka_metric.measurable.window_size(config, time.time() * 1000) / 1000.0)) \ < EPS, 'Elapsed Time = 75 seconds'
def maybe_register_topic_metrics(self, topic):
    """Register the per-topic producer sensors for `topic` on first use."""

    def sensor_name(name):
        return "topic.{0}.{1}".format(topic, name)

    # Registration is all-or-nothing: if the first sensor of the set exists,
    # all the others were registered with it — nothing to do.
    if self.metrics.get_sensor(sensor_name("records-per-batch")):
        return

    group = "producer-topic-metrics." + topic
    for metric, stat, sensor_suffix, description_prefix in (
            ("record-send-rate", Rate(), "records-per-batch",
             "Records sent per second for topic "),
            ("byte-rate", Rate(), "bytes",
             "Bytes per second for topic "),
            ("compression-rate", Avg(), "compression-rate",
             "Average Compression ratio for topic "),
            ("record-retry-rate", Rate(), "record-retries",
             "Record retries per second for topic "),
            ("record-error-rate", Rate(), "record-errors",
             "Record errors per second for topic ")):
        self.add_metric(
            metric,
            stat,
            sensor_name=sensor_name(sensor_suffix),
            group_name=group,
            description=description_prefix + topic,
        )
def record_topic_fetch_metrics(self, topic, num_bytes, num_records):
    """Record per-topic fetch sizes and record counts, creating sensors lazily.

    Arguments:
        topic (str): topic the fetch response was for.
        num_bytes (int): bytes fetched for this topic in the response.
        num_records (int): records fetched for this topic in the response.
    """
    # Dots in topic names would collide with the metric-name separator.
    tags = {'topic': topic.replace('.', '_')}

    def _get_or_create(suffix, specs):
        # Sensors are keyed 'topic.<topic>.<suffix>'; attach their stats
        # only on first creation.
        full_name = '.'.join(['topic', topic, suffix])
        sensor = self.metrics.get_sensor(full_name)
        if not sensor:
            sensor = self.metrics.sensor(full_name)
            for metric, description, stat in specs:
                sensor.add(
                    self.metrics.metric_name(
                        metric, self.group_name,
                        description % (topic,), tags),
                    stat)
        return sensor

    # record bytes fetched
    _get_or_create('bytes-fetched', (
        ('fetch-size-avg',
         'The average number of bytes fetched per request for topic %s',
         Avg()),
        ('fetch-size-max',
         'The maximum number of bytes fetched per request for topic %s',
         Max()),
        ('bytes-consumed-rate',
         'The average number of bytes consumed per second for topic %s',
         Rate()),
    )).record(num_bytes)

    # record records fetched
    _get_or_create('records-fetched', (
        ('records-per-request-avg',
         'The average number of records in each request for topic %s',
         Avg()),
        ('records-consumed-rate',
         'The average number of records consumed per second for topic %s',
         Rate()),
    )).record(num_records)
def maybe_register_topic_metrics(self, topic):
    """Register the per-topic producer sensors for `topic` if not yet present."""

    def sensor_name(name):
        # Per-topic sensors are namespaced 'topic.<topic>.<name>'.
        return 'topic.{0}.{1}'.format(topic, name)

    # if one sensor of the metrics has been registered for the topic,
    # then all other sensors should have been registered; and vice versa
    if not self.metrics.get_sensor(sensor_name('records-per-batch')):
        self.add_metric('record-send-rate', Rate(),
                        sensor_name=sensor_name('records-per-batch'),
                        group_name='producer-topic-metrics.' + topic,
                        description='Records sent per second for topic ' + topic)
        self.add_metric('byte-rate', Rate(),
                        sensor_name=sensor_name('bytes'),
                        group_name='producer-topic-metrics.' + topic,
                        description='Bytes per second for topic ' + topic)
        self.add_metric(
            'compression-rate', Avg(),
            sensor_name=sensor_name('compression-rate'),
            group_name='producer-topic-metrics.' + topic,
            description='Average Compression ratio for topic ' + topic)
        self.add_metric(
            'record-retry-rate', Rate(),
            sensor_name=sensor_name('record-retries'),
            group_name='producer-topic-metrics.' + topic,
            description='Record retries per second for topic ' + topic)
        self.add_metric('record-error-rate', Rate(),
                        sensor_name=sensor_name('record-errors'),
                        group_name='producer-topic-metrics.' + topic,
                        description='Record errors per second for topic ' + topic)
def __init__(self, metrics, metric_group_prefix, node_id):
    """Register broker-connection sensors: client-wide totals plus per-node stats.

    Arguments:
        metrics: metrics registry to register sensors/metrics on.
        metric_group_prefix (str): prefix for both the client-wide
            ('<prefix>-metrics') and per-node ('<prefix>-node-metrics.node-N')
            metric groups.
        node_id: broker node id used to namespace the per-node sensors.

    The client-wide sensors ('bytes-sent', 'bytes-received',
    'request-latency') must exist before the per-node sensors are created,
    because they are passed as parents below.
    """
    self.metrics = metrics

    # Any broker may have registered summary metrics already
    # but if not, we need to create them so we can set as parents below
    all_conns_transferred = metrics.get_sensor('bytes-sent-received')
    if not all_conns_transferred:
        metric_group_name = metric_group_prefix + '-metrics'

        bytes_transferred = metrics.sensor('bytes-sent-received')
        bytes_transferred.add(metrics.metric_name(
            'network-io-rate', metric_group_name,
            'The average number of network operations (reads or writes) on all'
            ' connections per second.'), Rate(sampled_stat=Count()))

        bytes_sent = metrics.sensor('bytes-sent',
                                    parents=[bytes_transferred])
        bytes_sent.add(metrics.metric_name(
            'outgoing-byte-rate', metric_group_name,
            'The average number of outgoing bytes sent per second to all'
            ' servers.'), Rate())
        bytes_sent.add(metrics.metric_name(
            'request-rate', metric_group_name,
            'The average number of requests sent per second.'),
            Rate(sampled_stat=Count()))
        bytes_sent.add(metrics.metric_name(
            'request-size-avg', metric_group_name,
            'The average size of all requests in the window.'), Avg())
        bytes_sent.add(metrics.metric_name(
            'request-size-max', metric_group_name,
            'The maximum size of any request sent in the window.'), Max())

        bytes_received = metrics.sensor('bytes-received',
                                        parents=[bytes_transferred])
        bytes_received.add(metrics.metric_name(
            'incoming-byte-rate', metric_group_name,
            'Bytes/second read off all sockets'), Rate())
        # Fixed description typo ('Responses received sent per second.');
        # wording now matches the per-node metric below.
        bytes_received.add(metrics.metric_name(
            'response-rate', metric_group_name,
            'Responses received per second.'),
            Rate(sampled_stat=Count()))

        request_latency = metrics.sensor('request-latency')
        request_latency.add(metrics.metric_name(
            'request-latency-avg', metric_group_name,
            'The average request latency in ms.'), Avg())
        request_latency.add(metrics.metric_name(
            'request-latency-max', metric_group_name,
            'The maximum request latency in ms.'), Max())

    # if one sensor of the metrics has been registered for the connection,
    # then all other sensors should have been registered; and vice versa
    node_str = 'node-{0}'.format(node_id)
    node_sensor = metrics.get_sensor(node_str + '.bytes-sent')
    if not node_sensor:
        metric_group_name = metric_group_prefix + '-node-metrics.' + node_str

        # Per-node sensors feed the client-wide totals via parent links.
        bytes_sent = metrics.sensor(
            node_str + '.bytes-sent',
            parents=[metrics.get_sensor('bytes-sent')])
        bytes_sent.add(metrics.metric_name(
            'outgoing-byte-rate', metric_group_name,
            'The average number of outgoing bytes sent per second.'), Rate())
        bytes_sent.add(metrics.metric_name(
            'request-rate', metric_group_name,
            'The average number of requests sent per second.'),
            Rate(sampled_stat=Count()))
        bytes_sent.add(metrics.metric_name(
            'request-size-avg', metric_group_name,
            'The average size of all requests in the window.'), Avg())
        bytes_sent.add(metrics.metric_name(
            'request-size-max', metric_group_name,
            'The maximum size of any request sent in the window.'), Max())

        bytes_received = metrics.sensor(
            node_str + '.bytes-received',
            parents=[metrics.get_sensor('bytes-received')])
        bytes_received.add(metrics.metric_name(
            'incoming-byte-rate', metric_group_name,
            'Bytes/second read off node-connection socket'), Rate())
        bytes_received.add(metrics.metric_name(
            'response-rate', metric_group_name,
            'The average number of responses received per second.'),
            Rate(sampled_stat=Count()))

        request_time = metrics.sensor(
            node_str + '.latency',
            parents=[metrics.get_sensor('request-latency')])
        request_time.add(metrics.metric_name(
            'request-latency-avg', metric_group_name,
            'The average request latency in ms.'), Avg())
        request_time.add(metrics.metric_name(
            'request-latency-max', metric_group_name,
            'The maximum request latency in ms.'), Max())

    # metrics.sensor() returns the existing sensor when already registered,
    # so these lookups are safe whether or not this call created them.
    self.bytes_sent = metrics.sensor(node_str + '.bytes-sent')
    self.bytes_received = metrics.sensor(node_str + '.bytes-received')
    self.request_time = metrics.sensor(node_str + '.latency')
def __init__(self, metrics, client, metadata):
    """Register producer (sender) sensors: batch, queue, throttle, error stats.

    Arguments:
        metrics: metrics registry to register sensors/metrics on.
        client: network client; in_flight_request_count() is sampled lazily
            by the 'requests-in-flight' gauge.
        metadata: cluster metadata; _last_successful_refresh_ms is sampled
            lazily by the 'metadata-age' gauge.

    Relies on self.add_metric (defined elsewhere in this class) to attach
    metric names to the sensor named by ``sensor_name``.
    """
    self.metrics = metrics
    self._client = client
    self._metadata = metadata

    # Per-partition batch size statistics.
    sensor_name = 'batch-size'
    self.batch_size_sensor = self.metrics.sensor(sensor_name)
    self.add_metric('batch-size-avg', Avg(),
                    sensor_name=sensor_name,
                    description='The average number of bytes sent per partition per-request.')
    self.add_metric('batch-size-max', Max(),
                    sensor_name=sensor_name,
                    description='The max number of bytes sent per partition per-request.')

    # Compression effectiveness of record batches.
    sensor_name = 'compression-rate'
    self.compression_rate_sensor = self.metrics.sensor(sensor_name)
    self.add_metric('compression-rate-avg', Avg(),
                    sensor_name=sensor_name,
                    description='The average compression rate of record batches.')

    # Time batches spend queued in the record accumulator.
    sensor_name = 'queue-time'
    self.queue_time_sensor = self.metrics.sensor(sensor_name)
    self.add_metric('record-queue-time-avg', Avg(),
                    sensor_name=sensor_name,
                    description='The average time in ms record batches spent in the record accumulator.')
    self.add_metric('record-queue-time-max', Max(),
                    sensor_name=sensor_name,
                    description='The maximum time in ms record batches spent in the record accumulator.')

    # Broker-imposed produce throttling.
    sensor_name = 'produce-throttle-time'
    self.produce_throttle_time_sensor = self.metrics.sensor(sensor_name)
    self.add_metric('produce-throttle-time-avg', Avg(),
                    sensor_name=sensor_name,
                    description='The average throttle time in ms')
    self.add_metric('produce-throttle-time-max', Max(),
                    sensor_name=sensor_name,
                    description='The maximum throttle time in ms')

    # Record throughput per request.
    sensor_name = 'records-per-request'
    self.records_per_request_sensor = self.metrics.sensor(sensor_name)
    self.add_metric('record-send-rate', Rate(),
                    sensor_name=sensor_name,
                    description='The average number of records sent per second.')
    self.add_metric('records-per-request-avg', Avg(),
                    sensor_name=sensor_name,
                    description='The average number of records per request.')

    # Byte throughput.
    sensor_name = 'bytes'
    self.byte_rate_sensor = self.metrics.sensor(sensor_name)
    self.add_metric('byte-rate', Rate(),
                    sensor_name=sensor_name,
                    description='The average number of bytes sent per second.')

    # Retried sends.
    sensor_name = 'record-retries'
    self.retry_sensor = self.metrics.sensor(sensor_name)
    self.add_metric('record-retry-rate', Rate(),
                    sensor_name=sensor_name,
                    description='The average per-second number of retried record sends')

    # Failed sends.
    sensor_name = 'errors'
    self.error_sensor = self.metrics.sensor(sensor_name)
    self.add_metric('record-error-rate', Rate(),
                    sensor_name=sensor_name,
                    description='The average per-second number of record sends that resulted in errors')

    # Record size statistics (max and average of per-batch maxima).
    sensor_name = 'record-size-max'
    self.max_record_size_sensor = self.metrics.sensor(sensor_name)
    self.add_metric('record-size-max', Max(),
                    sensor_name=sensor_name,
                    description='The maximum record size across all batches')
    self.add_metric('record-size-avg', Avg(),
                    sensor_name=sensor_name,
                    description='The average maximum record size per batch')

    # Gauges evaluated lazily on each read.
    self.add_metric('requests-in-flight',
                    AnonMeasurable(lambda *_: self._client.in_flight_request_count()),
                    description='The current number of in-flight requests awaiting a response.')
    # `now` is in milliseconds here, hence the /1000 to report seconds.
    self.add_metric('metadata-age',
                    AnonMeasurable(lambda _, now: (now - self._metadata._last_successful_refresh_ms) / 1000),
                    description='The age in seconds of the current producer metadata being used.')
def __init__(self, metrics, metric_group_prefix, conns):
    """Register client-level network sensors on the given metrics registry.

    Arguments:
        metrics: metrics registry to register sensors/metrics on.
        metric_group_prefix (str): prepended to "-metrics" to form the group.
        conns: live connection container; its length is sampled lazily by
            the "connection-count" gauge.
    """
    self.metrics = metrics
    self.metric_group_name = metric_group_prefix + "-metrics"

    self.connection_closed = metrics.sensor("connections-closed")
    self.connection_closed.add(
        metrics.metric_name(
            "connection-close-rate",
            self.metric_group_name,
            "Connections closed per second in the window.",
        ),
        Rate(),
    )
    self.connection_created = metrics.sensor("connections-created")
    self.connection_created.add(
        metrics.metric_name(
            "connection-creation-rate",
            self.metric_group_name,
            "New connections established per second in the window.",
        ),
        Rate(),
    )

    # select-time tracks both how often select() ran and how long it waited.
    self.select_time = metrics.sensor("select-time")
    self.select_time.add(
        metrics.metric_name(
            "select-rate",
            self.metric_group_name,
            "Number of times the I/O layer checked for new I/O to perform per"
            " second",
        ),
        Rate(sampled_stat=Count()),
    )
    self.select_time.add(
        metrics.metric_name(
            "io-wait-time-ns-avg",
            self.metric_group_name,
            "The average length of time the I/O thread spent waiting for a"
            " socket ready for reads or writes in nanoseconds.",
        ),
        Avg(),
    )
    self.select_time.add(
        metrics.metric_name(
            "io-wait-ratio",
            self.metric_group_name,
            "The fraction of time the I/O thread spent waiting.",
        ),
        Rate(time_unit=TimeUnit.NANOSECONDS),
    )

    self.io_time = metrics.sensor("io-time")
    self.io_time.add(
        metrics.metric_name(
            "io-time-ns-avg",
            self.metric_group_name,
            "The average length of time for I/O per select call in nanoseconds.",
        ),
        Avg(),
    )
    self.io_time.add(
        metrics.metric_name(
            "io-ratio",
            self.metric_group_name,
            "The fraction of time the I/O thread spent doing I/O",
        ),
        Rate(time_unit=TimeUnit.NANOSECONDS),
    )

    # Gauge: re-evaluated against the live `conns` container on each read.
    metrics.add_metric(
        metrics.metric_name(
            "connection-count",
            self.metric_group_name,
            "The current number of active connections.",
        ),
        AnonMeasurable(lambda config, now: len(conns)),
    )
def test_simple_stats(mocker, time_keeper, config, metrics):
    """Exercise the basic stats (Avg/Max/Min/Rate/Count/Total/Percentiles)
    on one sensor and verify each reported value after a simulated delay."""
    # Drive time.time() from the fake clock so window math is deterministic.
    mocker.patch('time.time', side_effect=time_keeper.time)

    measurable = ConstantMeasurable()

    metrics.add_metric(
        metrics.metric_name(
            'direct.measurable', 'grp1',
            'The fraction of time an appender waits for space allocation.'),
        measurable)
    sensor = metrics.sensor('test.sensor')
    sensor.add(metrics.metric_name('test.avg', 'grp1'), Avg())
    sensor.add(metrics.metric_name('test.max', 'grp1'), Max())
    sensor.add(metrics.metric_name('test.min', 'grp1'), Min())
    sensor.add(metrics.metric_name('test.rate', 'grp1'),
               Rate(TimeUnit.SECONDS))
    sensor.add(metrics.metric_name('test.occurences', 'grp1'),
               Rate(TimeUnit.SECONDS, Count()))
    sensor.add(metrics.metric_name('test.count', 'grp1'), Count())
    percentiles = [
        Percentile(metrics.metric_name('test.median', 'grp1'), 50.0),
        Percentile(metrics.metric_name('test.perc99_9', 'grp1'), 99.9)
    ]
    sensor.add_compound(
        Percentiles(100, BucketSizing.CONSTANT, 100, -100,
                    percentiles=percentiles))

    # Second sensor with an accumulating Total.
    sensor2 = metrics.sensor('test.sensor2')
    sensor2.add(metrics.metric_name('s2.total', 'grp1'), Total())
    sensor2.record(5.0)

    sum_val = 0
    count = 10
    for i in range(count):
        sensor.record(i)
        sum_val += i

    # prior to any time passing
    elapsed_secs = (config.time_window_ms * (config.samples - 1)) / 1000.0
    assert abs(count / elapsed_secs -
               metrics.metrics.get(metrics.metric_name('test.occurences', 'grp1')).value()) \
        < EPS, 'Occurrences(0...%d) = %f' % (count, count / elapsed_secs)

    # pretend 2 seconds passed...
    sleep_time_seconds = 2.0
    time_keeper.sleep(sleep_time_seconds)
    elapsed_secs += sleep_time_seconds

    assert abs(5.0 -
               metrics.metrics.get(metrics.metric_name('s2.total', 'grp1')).value()) \
        < EPS, 's2 reflects the constant value'
    assert abs(4.5 -
               metrics.metrics.get(metrics.metric_name('test.avg', 'grp1')).value()) \
        < EPS, 'Avg(0...9) = 4.5'
    assert abs((count - 1) -
               metrics.metrics.get(metrics.metric_name('test.max', 'grp1')).value()) \
        < EPS, 'Max(0...9) = 9'
    assert abs(0.0 -
               metrics.metrics.get(metrics.metric_name('test.min', 'grp1')).value()) \
        < EPS, 'Min(0...9) = 0'
    assert abs((sum_val / elapsed_secs) -
               metrics.metrics.get(metrics.metric_name('test.rate', 'grp1')).value()) \
        < EPS, 'Rate(0...9) = 1.40625'
    assert abs((count / elapsed_secs) -
               metrics.metrics.get(metrics.metric_name('test.occurences', 'grp1')).value()) \
        < EPS, 'Occurrences(0...%d) = %f' % (count, count / elapsed_secs)
    assert abs(count -
               metrics.metrics.get(metrics.metric_name('test.count', 'grp1')).value()) \
        < EPS, 'Count(0...9) = 10'