def __init__(self):
    """Create the Prometheus metrics used by the redirect service."""
    # Total number of requests hitting the redirect page.
    self.main_counter = pc.Counter(
        "main_counter", "total requests to your redirect page")
    # Distribution of time spent serving a redirect.
    # BUG FIX: help text previously read "this a histogram".
    self.redirect_time = pc.Histogram(
        "redirect_time", "this is a histogram of the redirect time")
    # Per-visitor counter, labelled with client attributes.
    self.users_counter = pc.Counter(
        "users_counter", "a counter of the users in our platform",
        ["ip", "browser", "platform", "language"])
class Metric(object): """A Namespace for our metrics""" # Metrics we record in prometheus _INVALID_RECORDS = prometheus_client.Counter( 'mjolnir_bulk_invalid_records_total', "Number of requests that could not be processed", ['reason']) FAIL_VALIDATE = _INVALID_RECORDS.labels(reason='fail_validate') MISSING_INDEX = _INVALID_RECORDS.labels(reason='missing_index') SUBMIT_BATCH = prometheus_client.Summary( 'mjolnir_bulk_submit_batch_seconds', 'Time taken to submit a batch from kafka to elasticsearch') RECORDS_PROCESSED = prometheus_client.Counter( 'mjolnir_bulk_records_total', 'Number of kafka records processed') _BULK_ACTION_RESULT = prometheus_client.Counter( 'mjolnir_bulk_action_total', 'Number of bulk action somethings', ['result']) ACTION_RESULTS = { 'updated': _BULK_ACTION_RESULT.labels(result='updated'), 'created': _BULK_ACTION_RESULT.labels(result='created'), 'noop': _BULK_ACTION_RESULT.labels(result='noop'), } OK_UNKNOWN = _BULK_ACTION_RESULT.labels(result='ok_unknown') MISSING = _BULK_ACTION_RESULT.labels(result='missing') FAILED = _BULK_ACTION_RESULT.labels(result='failed')
class Metric:
    """Namespace for the swift-import Prometheus metrics."""

    # Counter of records rejected before processing, partitioned by reason.
    _INVALID_RECORDS = prometheus_client.Counter(
        'mjolnir_swift_invalid_records_total',
        "Number of requests that could not be processed",
        ['reason']
    )
    # Pre-bound label children so call sites just .inc().
    FAIL_VALIDATE = _INVALID_RECORDS.labels(reason="validate")
    FAIL_NO_CONFIG = _INVALID_RECORDS.labels(reason="no_config")

    # Timing for handling one kafka message.
    PROCESS_MESSAGE = prometheus_client.Summary(
        'mjolnir_swift_process_message_seconds',
        'Time taken to process individual kafka messages')
    # Timing for importing one swift file into elasticsearch.
    BULK_IMPORT = prometheus_client.Summary(
        'mjolnir_swift_import_file_seconds',
        'Time taken to import a file into elasticsearch'
    )

    # Outcome of individual elasticsearch bulk actions, by result type.
    _BULK_ACTION_RESULT = prometheus_client.Counter(
        'mjolnir_swift_action_total',
        'Number of bulk action responses per result type',
        ['result'])
    # Lookup table from elasticsearch result string to its metric child.
    ACTION_RESULTS = {
        'updated': _BULK_ACTION_RESULT.labels(result='updated'),
        'created': _BULK_ACTION_RESULT.labels(result='created'),
        'noop': _BULK_ACTION_RESULT.labels(result='noop'),
    }
    # Fallbacks for results not covered by ACTION_RESULTS.
    OK_UNKNOWN = _BULK_ACTION_RESULT.labels(result='ok_unknown')
    MISSING = _BULK_ACTION_RESULT.labels(result='missing')
    FAILED = _BULK_ACTION_RESULT.labels(result='failed')
    TIMEOUT = _BULK_ACTION_RESULT.labels(result='timeout')
def after_process_boot(self, broker): os.environ["prometheus_multiproc_dir"] = DB_PATH # This import MUST happen at runtime, after process boot and # after the env variable has been set up. import prometheus_client as prom self.logger.debug("Setting up metrics...") registry = prom.CollectorRegistry() self.total_messages = prom.Counter( "dramatiq_messages_total", "The total number of messages processed.", ["queue_name", "actor_name"], registry=registry, ) self.total_errored_messages = prom.Counter( "dramatiq_message_errors_total", "The total number of errored messages.", ["queue_name", "actor_name"], registry=registry, ) self.total_retried_messages = prom.Counter( "dramatiq_message_retries_total", "The total number of retried messages.", ["queue_name", "actor_name"], registry=registry, ) self.total_rejected_messages = prom.Counter( "dramatiq_message_rejects_total", "The total number of dead-lettered messages.", ["queue_name", "actor_name"], registry=registry, ) self.total_revived_messages = prom.Counter( "dramatiq_message_revives_total", "The total number of messages revived from dead workers.", ["queue_name", "actor_name"], registry=registry, ) self.inprogress_messages = prom.Gauge( "dramatiq_messages_inprogress", "The number of messages in progress.", ["queue_name", "actor_name"], registry=registry, multiprocess_mode="livesum", ) self.inprogress_delayed_messages = prom.Gauge( "dramatiq_delayed_messages_inprogress", "The number of delayed messages in memory.", ["queue_name", "actor_name"], registry=registry, ) self.message_durations = prom.Histogram( "dramatiq_message_duration_milliseconds", "The time spent processing messages.", ["queue_name", "actor_name"], buckets=(5, 10, 25, 50, 75, 100, 250, 500, 750, 1000, 2500, 5000, 7500, 10000, 30000, 60000, 600000, 900000, float("inf")), registry=registry, )
def initialize(name, host, port):
    """Build instrumented Redis queue helpers.

    :param name: application name, used as the ``app`` metric label
    :param host: redis host
    :param port: redis port
    :return: ``(push, pop)`` closures; ``push(queue, data)`` JSON-encodes and
        enqueues ``data``; ``pop(queue, function)`` blocks forever, feeding
        decoded payloads to ``function`` and dead-lettering failures.
    :raises RuntimeError: if redis cannot be reached after 10 attempts.
    """
    queue_latency = prometheus_client.Histogram('queue_latency', 'queue latency', ['app', 'queue'])
    queue_counter = prometheus_client.Counter('queue_counter', 'queue counter', ['app', 'queue'])
    dequeue_latency = prometheus_client.Histogram('dequeue_latency', 'queue latency', ['app', 'queue'])
    dequeue_counter = prometheus_client.Counter('dequeue_counter', 'queue counter', ['app', 'queue'])

    redis_conn = None
    for _ in range(10):
        try:
            redis_conn = redis.Redis(host, port, decode_responses=True)
            break
        except Exception as e:
            print('ERROR', e)
            time.sleep(1)
    if redis_conn is None:
        # BUG FIX: previously fell through silently, leaving redis_conn
        # unbound and raising NameError on first use.
        raise RuntimeError('could not connect to redis at {}:{}'.format(host, port))

    def push(queue, data):
        value = json.dumps(data)
        with queue_latency.labels(app=name, queue=queue).time():
            redis_conn.rpush(queue, value)
        queue_counter.labels(app=name, queue=queue).inc()

    def pop(queue, function):
        while True:
            try:
                _, value = redis_conn.blpop(queue)
                dequeue_counter.labels(app=name, queue=queue).inc()
                try:
                    with dequeue_latency.labels(app=name, queue=queue).time():
                        function(json.loads(value))
                except Exception as e:
                    print('ERROR:', e)
                    # BUG FIX: this used push(queue + '.dead', value), but
                    # ``value`` is already a JSON string, so the dead-letter
                    # payload got double-encoded. Store the raw value instead.
                    redis_conn.rpush(queue + '.dead', value)
                    queue_counter.labels(app=name, queue=queue + '.dead').inc()
            except Exception as e:
                print('ERROR:', e)

    return push, pop
def __init__(self):
    """Create a private registry and the YouTube video-scraping counters."""
    self.registry = prometheus_client.CollectorRegistry()
    # NOTE(review): metric name "video_precessed" looks like a typo of
    # "video_processed"; renaming would change the exported series name,
    # so confirm with dashboard owners before fixing.
    # Count of videos processed.
    self._video_processed = prometheus_client.Counter(
        "video_precessed",
        "Video processed count",
        labelnames=(),
        namespace="youtube",
        subsystem="video",
        unit="",
        registry=self.registry,
        labelvalues=None,
    )
    # Count of all email addresses extracted (including duplicates).
    self._emails_found = prometheus_client.Counter(
        "emails_found",
        "Emails found",
        labelnames=(),
        namespace="youtube",
        subsystem="video",
        unit="",
        registry=self.registry,
        labelvalues=None,
    )
    # Count of distinct email addresses extracted.
    self._unique_emails_found = prometheus_client.Counter(
        "unique_emails_found",
        "Unique emails found",
        labelnames=(),
        namespace="youtube",
        subsystem="video",
        unit="",
        registry=self.registry,
        labelvalues=None,
    )
def __init__(self, prefix, description, labels):
    """
    :param prefix: prefix to use for each metric name
    :param description: description of action to use in metric description
    :param labels: label names to define for each metric
    """
    # Full metric-name prefix: class-level prefix joined with this instance's.
    self.full_prefix = f'{self.__class__._PREFIX}_{prefix}'

    # Gauge of attempts currently in flight; livesum aggregates across
    # worker processes in multiprocess mode.
    self.progress = prometheus_client.Gauge(
        f'{self.full_prefix}_attempt_inprogress',
        f'In progress attempts to {description}',
        labels,
        registry=REGISTRY,
        multiprocess_mode='livesum')

    # Totals for attempts and failures, plus a duration histogram.
    self.attempt_total = prometheus_client.Counter(
        f'{self.full_prefix}_attempt_total',
        f'Total attempts to {description}',
        labels,
        registry=REGISTRY)
    self.failure_total = prometheus_client.Counter(
        f'{self.full_prefix}_failure_total',
        f'Total failures to {description}',
        labels,
        registry=REGISTRY)
    self.duration = prometheus_client.Histogram(
        f'{self.full_prefix}_duration_seconds',
        f'Seconds to {description}',
        labels,
        registry=REGISTRY)
def run(args):
    """Continuously sync an S3 prefix into a local directory, exporting
    Prometheus metrics on port 8087.

    :param args: parsed CLI namespace with ``s3uri``, ``localpath``,
        ``exclude`` and ``interval`` attributes.
    Runs until the signal handler set up by ``setup_signals`` fires.
    """
    s3uri = args.s3uri
    localpath = args.localpath
    excludes = args.exclude
    interval = args.interval

    # Static build/config info exposed as an Info metric.
    i = pc.Info('s3insync_version', 'Version and config information for the client')
    i.info({
        'version': s3insync.__version__,
        'aws_repo': s3uri,
        'localpath': localpath,
    })

    start_time = pc.Gauge('s3insync_start_time', 'Time the sync process was started')
    start_time.set_to_current_time()
    last_sync = pc.Gauge('s3insync_last_sync_time', 'Time the last sync completed')
    op_count = pc.Counter('s3insync_operations', 'Count of operations', labelnames=('type', ))
    failed_op_count = pc.Counter('s3insync_failed_operations', 'Count of failed operations', labelnames=('type', ))
    files_in_s3 = pc.Gauge(
        's3insync_files_in_s3',
        'Number of files in S3',
    )
    pc.start_http_server(8087)

    src = r.S3Repo('s3', s3uri)
    dest = r.LocalFSRepo('fs', localpath, os.path.join(os.getenv('HOME'), ".s3insync"))
    dest.ensure_directories()
    sync = sd.SyncDecider(excludes)

    set_exit = setup_signals()
    while not set_exit.is_set():
        logger.debug("Starting sync")
        start = time.monotonic()
        try:
            success, failures = sync.execute_sync(src, dest)
            # 'total' is the overall file count; the rest are per-op counts.
            files_in_s3.set(success.pop('total', 0))
            set_op_counts(success, op_count)
            set_op_counts(failures, failed_op_count)
            last_sync.set_to_current_time()
        except Exception:
            # BUG FIX: log message previously misspelled "excute".
            logger.exception("Failed to execute sync")
        duration = time.monotonic() - start
        logger.debug("Stopping sync after %g secs", duration)
        # NOTE(review): max(30, ...) enforces a minimum 30 s pause between
        # syncs; if the intent was "remaining time in the interval, never
        # negative", this should be max(0, interval - duration) — confirm.
        set_exit.wait(max(30, interval - duration))
def __init__(self):
    """Create the notification/trap counters, keyed by short name."""
    notifications = prometheus_client.Counter(
        'prometheus_webhook_snmp_notifications',
        'Number of processed Prometheus Alertmanager notifications.')
    traps = prometheus_client.Counter(
        'prometheus_webhook_snmp_traps',
        'Number of sent SNMP traps.')
    # Counters are looked up by short name at the call sites.
    self.metrics = {
        'notifications': notifications,
        'traps': traps,
    }
def __init__(self, bot: Life) -> None:
    """Set up process handles and the bot's Prometheus metrics.

    :param bot: the running Life bot instance.
    """
    self.bot = bot
    # Handle on our own OS process, for CPU/memory stats.
    self.process = psutil.Process()
    self.ready = False

    # NOTE(review): 'count' as a *label name* on a gauge called 'counts'
    # is unusual — each count kind becomes a separate labelled series.
    self.guild_stats = prometheus_client.Gauge(
        'counts', documentation='Guild counts', namespace='guild',
        labelnames=['guild_id', 'count'])

    # Discord gateway traffic, broken out by response/event type.
    self.socket_responses = prometheus_client.Counter(
        'socket_responses', documentation='Socket responses',
        namespace='life', labelnames=['response'])
    self.socket_events = prometheus_client.Counter(
        'socket_events', documentation='Socket events',
        namespace='life', labelnames=['event'])

    # Generic per-stat counters/gauges, labelled by stat name.
    self.counters = prometheus_client.Counter(
        'stats', documentation='Life stats', namespace='life',
        labelnames=['stat'])
    self.gauges = prometheus_client.Gauge(
        'counts', documentation='Life counts', namespace='life',
        labelnames=['count'])

    # Gateway opcode number -> human-readable name, used when recording
    # socket_responses.
    self.op_types = {
        0: 'DISPATCH',
        1: 'HEARTBEAT',
        2: 'IDENTIFY',
        3: 'PRESENCE',
        4: 'VOICE_STATE',
        5: 'VOICE_PING',
        6: 'RESUME',
        7: 'RECONNECT',
        8: 'REQUEST_MEMBERS',
        9: 'INVALIDATE_SESSION',
        10: 'HELLO',
        11: 'HEARTBEAT_ACK',
        12: 'GUILD_SYNC',
    }

    # Periodic collection tasks defined elsewhere on this class.
    self.stats_five_minutes.start()
    self.stats_thirty_seconds.start()
def _setup_prom_data(self):
    """Install the scrape middleware and create the bot's counters."""
    # Middleware that serves/collects the metrics for scraping.
    self.middleware = PromScrapeMiddleware(self)
    util_bot.bot.middleware.append(self.middleware)

    # Per-channel message traffic.
    self.messages_sent = prom.Counter('messages_sent', 'Messages sent by channel', ['channel'])
    self.messages_received = prom.Counter('messages_received', 'Messages received by channel', ['channel'])
    # Per-command execution count.
    self.commands_executed = prom.Counter('commands_executed', 'Commands executed by name', ['command'])
    # Unlabelled total of hastebin pastes created.
    self.hastebins_created = prom.Counter('hastebins_created', 'Hastebins created')
def __init__(self, client):
    """Wrap an elastalert client so rule runs and writebacks are metered.

    :param client: elastalert client whose ``run_rule`` and ``writeback``
        methods are monkey-patched with instrumented wrappers
        (``metrics_run_rule`` / ``metrics_writeback``, defined elsewhere
        on this class).
    """
    self.prometheus_port = client.prometheus_port
    # Keep the originals so the wrappers can delegate to them.
    self.run_rule = client.run_rule
    self.writeback = client.writeback
    client.run_rule = self.metrics_run_rule
    client.writeback = self.metrics_writeback

    # initialize prometheus metrics to be exposed
    self.prom_scrapes = prometheus_client.Counter(
        'elastalert_scrapes', 'Number of scrapes for rule', ['rule_name'])
    self.prom_hits = prometheus_client.Counter(
        'elastalert_hits', 'Number of hits for rule', ['rule_name'])
    self.prom_matches = prometheus_client.Counter(
        'elastalert_matches', 'Number of matches for rule', ['rule_name'])
    # NOTE(review): time_taken is accumulated in a Counter rather than a
    # Histogram/Summary, so only the running total is exported.
    self.prom_time_taken = prometheus_client.Counter(
        'elastalert_time_taken', 'Time taken to evaluate rule', ['rule_name'])
    self.prom_alerts_sent = prometheus_client.Counter(
        'elastalert_alerts_sent', 'Number of alerts sent for rule', ['rule_name'])
    self.prom_alerts_not_sent = prometheus_client.Counter(
        'elastalert_alerts_not_sent', 'Number of alerts not sent', ['rule_name'])
    # NOTE(review): unlike the others this counter has no rule_name label —
    # confirm whether that is intentional.
    self.prom_errors = prometheus_client.Counter(
        'elastalert_errors', 'Number of errors for rule')
    self.prom_alerts_silenced = prometheus_client.Counter(
        'elastalert_alerts_silenced', 'Number of silenced alerts', ['rule_name'])
def _create_metrics(self):
    """Creates a registry and records metrics.

    BUG FIX: ``labelnames`` used to be passed as set literals; set iteration
    order is undefined, so the label order of the exported series (and any
    positional ``.labels()`` calls) was nondeterministic across runs.
    Tuples keep the order stable.
    """
    self.registry = prometheus_client.CollectorRegistry()
    self.quota_free_count = prometheus_client.Gauge(
        'kuryr_quota_free_count', 'Amount of quota available'
        ' for the network resource', labelnames=('resource',),
        registry=self.registry)

    self.port_quota_per_subnet = prometheus_client.Gauge(
        'kuryr_port_quota_per_subnet', 'Amount of ports available'
        ' on Subnet', labelnames=('subnet_id', 'subnet_name'),
        registry=self.registry)

    self.lbs_members_count = prometheus_client.Gauge(
        'kuryr_critical_lb_members_count', 'Amount of members per '
        'critical Load Balancer pool',
        labelnames=('lb_name', 'lb_pool_name'),
        registry=self.registry)

    self.lbs_state = prometheus_client.Enum(
        'kuryr_critical_lb_state', 'Critical Load Balancer State',
        labelnames=('lb_name',),
        states=['ERROR', 'ACTIVE', 'DELETED', 'PENDING_CREATE',
                'PENDING_UPDATE', 'PENDING_DELETE'],
        registry=self.registry)

    # 10-second buckets out to two minutes for pod annotation latency.
    buckets = (10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, _INF)
    self.pod_creation_latency = prometheus_client.Histogram(
        'kuryr_pod_creation_latency', 'Time taken for a pod to have'
        ' Kuryr annotations set', buckets=buckets, registry=self.registry)

    self.load_balancer_readiness = prometheus_client.Counter(
        'kuryr_load_balancer_readiness', 'This counter is increased when '
        'Kuryr notices that an Octavia load balancer is stuck in an '
        'unexpected state', registry=self.registry)

    self.port_readiness = prometheus_client.Counter(
        'kuryr_port_readiness', 'This counter is increased when Kuryr '
        'times out waiting for Neutron to move port to ACTIVE',
        registry=self.registry)
def __init__(self, bot, prefix) -> None:
    """Create and register GearBot's Prometheus metrics.

    :param bot: bot instance exposing ``metrics_reg`` (a CollectorRegistry)
    :param prefix: string prepended to every metric name
    """
    # Command and message traffic.
    self.command_counter = prom.Counter(
        f"{prefix}_commands_ran", "How many times commands were ran",
        ["command_name", "cluster"])
    self.user_message_raw_count = prom.Counter(
        f"{prefix}_user_message_raw_count",
        "Raw count of how many messages we have seen from users", ["cluster"])
    self.bot_message_raw_count = prom.Counter(
        f"{prefix}_bot_message_raw_count",
        "Raw count of how many messages we have seen from bots", ["cluster"])
    # BUG FIX: help text previously read "GearBot has send".
    self.own_message_raw_count = prom.Counter(
        f"{prefix}_own_message_raw_count",
        "Raw count of how many messages GearBot has sent", ["cluster"])

    # Per-cluster population gauges.
    self.bot_guilds = prom.Gauge(
        f"{prefix}_guilds", "How many guilds the bot is in", ["cluster"])
    self.bot_users = prom.Gauge(
        f"{prefix}_users", "How many users the bot can see", ["cluster"])
    self.bot_users_unique = prom.Gauge(
        f"{prefix}_users_unique", "How many unique users the bot can see",
        ["cluster"])

    # Gateway events and latency.
    self.bot_event_counts = prom.Counter(
        f"{prefix}_event_counts", "How much each event occurred",
        ["event_name", "cluster"])
    self.bot_latency = prom.Gauge(
        f"{prefix}_latency", "Current bot latency", ["cluster"])

    # Context-menu command usage.
    self.uid_usage = prom.Counter(
        f"{prefix}_context_uid_usage",
        "Times uid was used from the context command", ["type", "cluster"])
    self.userinfo_usage = prom.Counter(
        f"{prefix}_context_userinfo_usage",
        "Times userinfo was used from the context command", ["type", "cluster"])
    # BUG FIX: help text previously read "inf serach".
    self.inf_search_usage = prom.Counter(
        f"{prefix}_context_inf_search_usage",
        "Times inf search was used from the context command", ["type", "cluster"])

    # Register everything on the bot's registry so one exporter serves all.
    bot.metrics_reg.register(self.command_counter)
    bot.metrics_reg.register(self.user_message_raw_count)
    bot.metrics_reg.register(self.bot_message_raw_count)
    bot.metrics_reg.register(self.bot_guilds)
    bot.metrics_reg.register(self.bot_users)
    bot.metrics_reg.register(self.bot_users_unique)
    bot.metrics_reg.register(self.bot_event_counts)
    bot.metrics_reg.register(self.own_message_raw_count)
    bot.metrics_reg.register(self.bot_latency)
    bot.metrics_reg.register(self.uid_usage)
    bot.metrics_reg.register(self.userinfo_usage)
    bot.metrics_reg.register(self.inf_search_usage)
def csets_filter_worker(args, config, db):
    """Consume changesets from AMQP, filter them, and forward matches.

    :param args: CLI namespace (``amqp_url`` and the ``metrics`` flag)
    :param config: filter configuration, attached to the consumer
    :param db: database handle, attached to the consumer
    Blocks in ``amqp.run()`` until interrupted.
    """
    class FilterAmqp(messagebus.Amqp):
        def on_message(self, payload, message):
            logger.info('Filter: {}'.format(payload))
            start = time.time()
            if cset_filter(self.config, self.db, payload):
                amqp.send(payload, schema_name='cset', schema_version=1,
                          routing_key='analysis_cset.osmtracker')
                # BUG FIX: the metric objects only exist when --metrics is
                # enabled; the unguarded use raised NameError otherwise.
                if args.metrics:
                    m_events.labels('analysis', 'in').inc()
                    m_events.labels('filter', 'out').inc()
            elapsed = time.time() - start
            if args.metrics:
                m_filter_time.observe(elapsed)
            logger.info('Filtering of cid {} took {:.2f}s'.format(
                payload['cid'], elapsed))
            message.ack()

    amqp = FilterAmqp(args.amqp_url, AMQP_EXCHANGE_TOPIC, 'topic',
                      AMQP_QUEUES, [AMQP_FILTER_QUEUE])
    amqp.config = config
    amqp.db = db

    if args.metrics:
        m_events = prometheus_client.Counter(
            'osmtracker_events', 'Number of events', EVENT_LABELS)
        m_filter_time = prometheus_client.Histogram(
            'osmtracker_changeset_filter_processing_time_seconds',
            'Changeset filtering time (seconds)')

    logger.debug('Starting filter worker')
    amqp.run()
def __init__(self, kubeconfig, token_path):
    """Configure kubernetes API clients and the deleted-pod counter.

    :param kubeconfig: optional path to a kubeconfig file; used if it exists
    :param token_path: optional in-cluster service-account directory; falls
        back to the standard /var/run/secrets location
    :raises Exception: if neither a kubeconfig nor a token directory exists
    """
    token_path = token_path or "/var/run/secrets/kubernetes.io/serviceaccount"
    if kubeconfig and os.path.exists(kubeconfig):
        logger.debug("Using configuration from kubeconfig %s" % kubeconfig)
        kubernetes.config.load_kube_config(config_file=kubeconfig)
    elif os.path.exists(token_path):
        logger.debug("Using configuration from token in %s" % token_path)
        loader = kubernetes.config.incluster_config.InClusterConfigLoader(
            os.path.join(token_path, "token"),
            os.path.join(token_path, "ca.crt"),
        )
        loader.load_and_set()
    else:
        raise Exception("No kubeconfig or token found")

    # BUG FIX: removed a dead branch guarded by ``if token_file:`` —
    # token_file was initialised to None and never reassigned, so the
    # hard-coded-host loader it contained could never run.

    self.v1 = kubernetes.client.CoreV1Api()
    self.appsv1 = kubernetes.client.AppsV1Api()
    self.client = kubernetes.client.ApiClient()
    self.custom = kubernetes.client.CustomObjectsApi()

    # Count of pods this guardian has deleted, by namespace and owner.
    self.deleted_total = prometheus.Counter(
        "memguardian_deleted_pod_total",
        "Total deleted pods from start.",
        ["namespace", "owner"],
    )
def setup_status(app) -> prometheus_client.CollectorRegistry:
    """Add /status to serve Prometheus-driven runtime metrics."""
    # Private registry; auto_describe lets aiohttp tooling introspect it.
    registry = prometheus_client.CollectorRegistry(auto_describe=True)

    # Request metrics are stored on the app so the middleware can reach them.
    app["request_count"] = prometheus_client.Counter(
        "requests_total", "Total Request Count",
        ["app_name", "method", "endpoint", "http_status"],
        registry=registry,
    )
    app["request_latency"] = prometheus_client.Histogram(
        "request_latency_seconds", "Request latency",
        ["app_name", "endpoint"],
        registry=registry,
    )
    app["request_in_progress"] = prometheus_client.Gauge(
        "requests_in_progress_total", "Requests in progress",
        ["app_name", "endpoint", "method"],
        registry=registry,
    )
    # Static build info from the package metadata module.
    prometheus_client.Info("server", "API server version", registry=registry).info({
        "version": metadata.__version__,
        "commit": getattr(metadata, "__commit__", "null"),
        "build_date": getattr(metadata, "__date__", "null"),
    })

    # Instrumentation middleware must run first to time the whole chain.
    app.middlewares.insert(0, instrument)
    # passing StatusRenderer(registry) without __call__ triggers a spurious DeprecationWarning
    # FIXME(vmarkovtsev): https://github.com/aio-libs/aiohttp/issues/4519
    app.router.add_get("/status", StatusRenderer(registry).__call__)
    return registry
def __init__(
        self,
        reporter_order=(
            'hdf5_reporter',
            'dashboard_reporter',
        ),
):
    """Track wepy simulation progress and component sizes as metrics.

    :param reporter_order: names identifying the reporters, in the order
        their sizes are reported.
    """
    self.reporter_order = reporter_order

    # counters
    self.cycle_counter = prom.Counter('wepy_cycle_idx', "")

    # Per-component size gauges (bytes). Created in a loop to avoid
    # repeating eight near-identical statements.
    size_gauges = (
        ('walker_size_g', 'wepy_walker_single_size_bytes'),
        ('ensemble_size_g', 'wepy_walker_ensemble_size_bytes'),
        ('runner_size_g', 'wepy_runner_size_bytes'),
        ('resampler_size_g', 'wepy_resampler_size_bytes'),
        ('bc_size_g', 'wepy_bc_size_bytes'),
        ('mapper_size_g', 'wepy_mapper_size_bytes'),
        ('sim_manager_size_g', 'wepy_sim_manager_size_bytes'),
    )
    for attr_name, metric_name in size_gauges:
        setattr(self, attr_name, prom.Gauge(metric_name, ""))

    # Reporter sizes carry a per-reporter "name" label.
    self.reporter_size_g = prom.Gauge(
        'wepy_reporters_size_bytes',
        "",
        ["name"],
    )
def consumer(name, instance, host, port):
    """Build an instrumented kafka consume helper.

    :param name: application name; also used as the kafka group id
    :param instance: instance label value for the metrics
    :param host: kafka bootstrap host
    :param port: kafka bootstrap port
    :return: ``consume(topic, function)`` which subscribes to ``topic`` and
        feeds each JSON-decoded message to ``function``, committing after
        each one.
    :raises RuntimeError: if kafka cannot be reached after 100 attempts.
    """
    consumer_latency = prometheus_client.Histogram(
        'consumer_latency', 'consumer latency', ['app', 'instance', 'topic'])
    consumer_counter = prometheus_client.Counter(
        'consumer_counter', 'consumer counter', ['app', 'instance', 'topic'])

    for _ in range(100):
        try:
            consumer = kafka.KafkaConsumer(
                bootstrap_servers='{}:{}'.format(host, port),
                group_id=name,
                auto_offset_reset='earliest',
                enable_auto_commit=False)
            break
        except Exception as e:
            print('ERROR', e)
            time.sleep(1)
    else:
        # BUG FIX: previously fell through with ``consumer`` unbound,
        # producing a confusing NameError on first use.
        raise RuntimeError('could not connect to kafka at {}:{}'.format(host, port))

    def consume(topic, function):
        consumer.subscribe([topic])
        for message in consumer:
            consumer_counter.labels(app=name, instance=instance, topic=topic).inc()
            with consumer_latency.labels(app=name, instance=instance, topic=topic).time():
                function(json.loads(message.value.decode('utf-8')))
            # Manual commit (auto-commit disabled) so a crash mid-handler
            # re-delivers the message.
            consumer.commit()

    return consume
def initialize(name):
    """Create a Flask app with request metrics and health endpoints.

    :param name: flask app name; also the ``app`` label on the metrics
    :return: the configured ``flask.Flask`` instance
    """
    app = flask.Flask(name)
    request_latency = prometheus_client.Histogram(
        'request_latency', 'request latency',
        ['app', 'method', 'path', 'status'])
    request_counter = prometheus_client.Counter(
        'request_counter', 'request counter',
        ['app', 'method', 'path', 'status'])

    @app.route('/health')
    def health():
        return 'OK'

    @app.route('/version')
    def version():
        return '0.1.0'

    # Stamp the start time on the request so after_request can compute
    # the wall-clock latency.
    @app.before_request
    def before_request():
        flask.request.start_time = time.time()

    @app.after_request
    def after_request(response):
        latency = time.time() - flask.request.start_time
        request_latency.labels(app=name,
                               method=flask.request.method,
                               path=flask.request.path,
                               status=response.status_code).observe(latency)
        request_counter.labels(app=name,
                               method=flask.request.method,
                               path=flask.request.path,
                               status=response.status_code).inc()
        return response

    return app
def initialize(name, host, port):
    """Connect to postgres and return an instrumented ``execute`` helper.

    :param name: application name, used as the ``app`` metric label
    :param host: postgres host
    :param port: postgres port
    :return: ``execute(query, values=None)`` which runs the query, commits,
        and returns fetched rows (or ``[]`` for statements with no result).
    :raises RuntimeError: if postgres cannot be reached after 10 attempts.
    """
    db_latency = prometheus_client.Histogram('db_latency', 'db latency', ['app', 'query'])
    db_counter = prometheus_client.Counter('db_counter', 'db counter', ['app', 'query'])

    for _ in range(10):
        try:
            postgres = psycopg2.connect(host=host, port=port, dbname='postgres',
                                        user='******', password='******')
            break
        except Exception as e:
            print('ERROR', e)
            time.sleep(1)
    else:
        # BUG FIX: previously fell through with ``postgres`` unbound,
        # producing a confusing NameError on first use.
        raise RuntimeError('could not connect to postgres at {}:{}'.format(host, port))

    def execute(query, values=None):
        with db_latency.labels(app=name, query=query).time():
            cursor = postgres.cursor()
            cursor.execute(query, values)
            data = cursor.fetchall() if cursor.description else []
            postgres.commit()
            cursor.close()
        db_counter.labels(app=name, query=query).inc()
        return data

    # BUG FIX: this call previously appeared *before* ``execute`` was
    # defined, raising NameError at startup.
    execute('CREATE EXTENSION IF NOT EXISTS "uuid-ossp";')

    return execute
def __init__(self):
    """Create the feed-update tracking maps and Prometheus metrics."""
    # feed PK -> (system_id, feed_id), resolved lazily elsewhere.
    self._feed_pk_to_system_id_and_feed_id = {}
    # feed PK -> (timestamp, latency) of the last successful update.
    self._feed_pk_to_successful_update_data: typing.Dict[int, typing.Tuple[
        float, float]] = {}
    self._num_updates = prometheus.Counter(
        PROMETHEUS_NUM_UPDATES,
        "Number of feed updates of a given feed, status and result",
        ["system_id", "feed_id", "status", "result"],
    )
    self._last_update = prometheus.Gauge(
        PROMETHEUS_LAST_UPDATE,
        "Time since the last update of a given feed, status and result",
        ["system_id", "feed_id", "status", "result"],
    )
    self._num_entities = prometheus.Gauge(
        PROMETHEUS_NUM_ENTITIES,
        "Number of entities of a given type present from a given feed",
        ["system_id", "feed_id", "entity_type"],
    )
    self._update_latency = prometheus.Gauge(
        PROMETHEUS_SUCCESSFUL_UPDATE_LATENCY,
        "Number of seconds between successful updates of a feed",
        ["system_id", "feed_id"],
    )
def initialize(name, host, port):
    """Connect to cassandra and return an instrumented ``execute`` helper.

    :param name: application name, used as the ``app`` metric label
    :param host: cassandra contact point
    :param port: cassandra port
    :return: ``execute(query, values=tuple())`` which prepares, binds and
        runs the query, returning rows as dicts.
    :raises RuntimeError: if cassandra cannot be reached after 100 attempts.
    """
    db_latency = prometheus_client.Histogram('db_latency', 'db latency', ['app', 'query'])
    db_counter = prometheus_client.Counter('db_counter', 'db counter', ['app', 'query'])

    for _ in range(100):
        try:
            session = cassandra.cluster.Cluster(
                [host],
                load_balancing_policy=cassandra.policies.RoundRobinPolicy(),
                port=port).connect()
            break
        except Exception as e:
            print('ERROR', e)
            time.sleep(1)
    else:
        # BUG FIX: previously fell through with ``session`` unbound,
        # producing a confusing NameError on first use.
        raise RuntimeError('could not connect to cassandra at {}:{}'.format(host, port))

    session.execute(
        "CREATE KEYSPACE IF NOT EXISTS hjalp WITH replication = { 'class': 'SimpleStrategy', 'replication_factor': '2' }"
    )
    session.set_keyspace('hjalp')

    def execute(query, values=tuple()):
        with db_latency.labels(app=name, query=query).time():
            rows = session.execute(session.prepare(query).bind(values))
            data = [r._asdict() for r in rows]
        db_counter.labels(app=name, query=query).inc()
        return data

    return execute
def install_stacksampler(interval=0.005):
    """Samples the stack every INTERVAL seconds of user time.

    We could use user+sys time but that leads to interrupting syscalls,
    which may affect performance, and we care mostly about user time anyway.
    """
    # Note we only start each next timer once the previous timer signal has been processed.
    # There are two reasons for this:
    # 1. Avoid handling a signal while already handling a signal, however unlikely,
    #    as this could lead to a deadlock due to locking inside prometheus_client.
    # 2. Avoid biasing the results by effectively not including the time taken to do the actual
    #    stack sampling.

    # One counter label per unique stack trace; value approximates seconds
    # spent in that stack.
    flamegraph = prom.Counter(
        "flamegraph",
        "Approx time consumed by each unique stack trace seen by sampling the stack",
        ["stack"])
    # HACK: It's possible to deadlock if we handle a signal during a prometheus collect
    # operation that locks our flamegraph metric. We then try to take the lock when recording the
    # metric, but can't.
    # As a hacky work around, we replace the lock with a dummy lock that doesn't actually lock anything.
    # This is reasonably safe. We know that only one copy of sample() will ever run at once,
    # and nothing else but sample() and collect() will touch the metric, leaving two possibilities:
    # 1. Multiple collects happen at once: Safe. They only do read operations.
    # 2. A sample during a collect: Safe. The collect only does a copy inside the locked part,
    #    so it just means it'll either get a copy with the new label set, or without it.
    # This presumes the implementation doesn't change to make that different, however.
    flamegraph._lock = gevent.lock.DummySemaphore()
    # There is also a lock we need to bypass on the actual counter values themselves.
    # Since they get created dynamically, this means we need to replace the lock function
    # that is used to create them.
    # This unfortunately means we go without locking for all metrics, not just this one,
    # however this is safe because we are using gevent, not threading. The lock is only
    # used to make incrementing/decrementing the counter thread-safe, which is not a concern
    # under gevent since there are no switch points under the lock.
    import prometheus_client.values
    prometheus_client.values.Lock = gevent.lock.DummySemaphore

    def sample(signum, frame):
        # Walk the frame chain from the interrupted frame to the root.
        stack = []
        while frame is not None:
            stack.append(frame)
            frame = frame.f_back
        # format each frame as FUNCTION(MODULE)
        stack = ";".join(
            "{}({})".format(frame.f_code.co_name, frame.f_globals.get('__name__'))
            for frame in stack[::-1]
        )
        # increase counter by interval, so final units are in seconds
        flamegraph.labels(stack).inc(interval)
        # schedule the next signal
        signal.setitimer(signal.ITIMER_VIRTUAL, interval)

    # Stop the timer at interpreter exit so no signal fires during teardown.
    def cancel():
        signal.setitimer(signal.ITIMER_VIRTUAL, 0)
    atexit.register(cancel)

    signal.signal(signal.SIGVTALRM, sample)
    # deliver the first signal in INTERVAL seconds
    signal.setitimer(signal.ITIMER_VIRTUAL, interval)
class Metrics(object):
    """Namespace for process-wide HTTP traffic metrics."""

    # Request/response totals, split by method+scheme and by status.
    RequestCounter = prom.Counter('http_requests_total',
                                  'Total number of HTTP requests.',
                                  ['method', 'scheme'])
    ResponseCounter = prom.Counter('http_responses_total',
                                   'Total number of HTTP responses.',
                                   ['status'])
    # End-to-end transaction latency with default buckets.
    LatencyHistogram = prom.Histogram('http_latency_seconds',
                                      'Overall HTTP transaction latency.')
    # Body-size histograms bucketed by powers of 5 (see powers_of helper).
    RequestSizeHistogram = prom.Histogram(
        'http_requests_body_bytes',
        'Breakdown of HTTP requests by content length.',
        buckets=powers_of(5, 11))
    ResponseSizeHistogram = prom.Histogram(
        'http_responses_body_bytes',
        'Breakdown of HTTP responses by content length.',
        buckets=powers_of(5, 11))
def gteCounter(name, description, labels):
    """Return the cached Counter for ``name``, creating it on first use.

    :param name: metric name (cache key into the module-level ``counters``)
    :param description: metric help text, used only on creation
    :param labels: label names, used only on creation
    """
    try:
        return counters[name]
    except KeyError:
        # First request for this metric: create, cache, and announce it.
        print("Creating Counter: {}".format(name))
        counter = prometheus_client.Counter(name, description, labels)
        counters[name] = counter
        return counter
def _recreate_metrics(self, registry):
    """Recreate the transmission metrics on a fresh registry.

    :param registry: the CollectorRegistry the new metrics attach to.
    """
    # Total failed transmissions since (re)creation.
    self._failure_counter = prometheus_client.Counter(
        'transmission_failures',
        'Number of failed transmissions',
        registry=registry)
    # Transmissions performed during the most recent job.
    self._transmission_gauge = prometheus_client.Gauge(
        'zone_transmissions',
        'Transmissions during the last job',
        registry=registry)
async def setup(
    port: int,
    consul_host: Optional[str] = "127.0.0.1",
    use_IPs=False,
):
    """Build the aiohttp application, background monitors and routes.

    :param port: port the app will serve on (stored in app config)
    :param consul_host: consul agent address for service discovery
    :param use_IPs: whether handlers should link by IP instead of hostname
    :return: the configured ``web.Application``
    """
    app = web.Application()
    app['consul_host'] = consul_host
    app['cfg'] = {
        "port": port,
        "use_ips": use_IPs,
    }
    jinja_env = Environment(loader=FileSystemLoader('templates'),
                            autoescape=select_autoescape(['html', 'xml']))
    app['jinja_env'] = jinja_env

    # Raspberry-Pi-only hardware monitors.
    if util.is_rpi3():
        low_voltage_observed = prometheus_client.Gauge(
            "rpi_low_voltage_observed",
            "Raspberry PI low voltage observed over observation window", [])
        asyncio.ensure_future(
            monitor_voltage(lambda x: low_voltage_observed.set(x)))
    if util.is_rpi3():
        journald_logged = prometheus_client.Counter(
            "journald_logged", "Message was logged to journald", [])
        asyncio.ensure_future(
            run_journalctl(
                lambda x: journald_logged.inc(),
                lambda x: None,
            ))
        # NOTE(review): original indentation was lost; temperature monitoring
        # is assumed to be inside this is_rpi3 guard — confirm.
        asyncio.ensure_future(
            read_temperature.monitor_temperatures(TEMPERATURE))
    if util.is_rpi3():
        asyncio.create_task(update_time())

    app.add_routes([
        web.get('/', handle),
        web.get('/a', host_handler),
        web.get('/static/{name}.js', static_text_handler("js")),
        web.get('/health', health.health_check),
        web.get('/metrics', handle_metrics),
        web.get('/stop', stop),
        web.get('/start', start),
        web.get('/restart', restart),
        web.get('/restart-host', restart_host),
        web.get('/shutdown-host', shutdown_host),
        web.get('/time', gettime),
    ])
    return app
def get_prometheus_counter(self):
    """Return the process-wide CE error counter, creating it once.

    The counter is stashed on the default REGISTRY so every instance
    shares the same metric.
    """
    existing = getattr(prometheus.REGISTRY, '_command_executor_counter', None)
    if existing:
        return existing
    counter = prometheus.Counter(
        'cds_ce_execution_error_total',
        'How many times CE actions (upload, prepare env and execute) got executed and failed for each CBA python script',
        ['step', 'blueprint_name', 'blueprint_version', 'script_name'])
    prometheus.REGISTRY._command_executor_counter = counter
    return counter
def test_prometheus(self, request):
    """Increment prometheus metric for testing"""
    if not pkg_is_installed('prometheus-client'):
        return Response('Not Supported', status=501)
    try:
        counter = self.test_counter
    except AttributeError:
        # Lazily create the counter on first use; import deferred so the
        # endpoint works even when the package check above is the only
        # place prometheus is touched.
        import prometheus_client
        counter = prometheus_client.Counter('test', 'test')
        self.test_counter = counter
    counter.inc()
    return Response('Incremented test counter')