def time(self, key, time):
    """ Timer metric """
    # Histogram requires both a metric name and a documentation string.
    prometheus_histogram = Histogram(key, key)
    prometheus_histogram.observe(time)
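# A self-contained sketch (not the original class) showing one way to avoid
# re-creating the same Histogram on every call: cache instances by key, since
# registering the same metric name twice in the default registry raises.
# observe_duration and the metric name below are illustrative, not from the
# original code.
import time as _time
from prometheus_client import Histogram

_histograms = {}

def observe_duration(key, seconds):
    if key not in _histograms:
        _histograms[key] = Histogram(key, "Timer metric for %s" % key)
    _histograms[key].observe(seconds)

start = _time.perf_counter()
_time.sleep(0.01)  # stand-in for the timed work
observe_duration("work_duration_seconds", _time.perf_counter() - start)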
def test_histogram(self):
    s = Histogram("hh", "A histogram", registry=self.registry)
    s.observe(0.05)
    self.assertEqual(
        b"""# HELP hh A histogram
# TYPE hh histogram
hh_bucket{le="0.005"} 0.0
hh_bucket{le="0.01"} 0.0
hh_bucket{le="0.025"} 0.0
hh_bucket{le="0.05"} 1.0
hh_bucket{le="0.075"} 1.0
hh_bucket{le="0.1"} 1.0
hh_bucket{le="0.25"} 1.0
hh_bucket{le="0.5"} 1.0
hh_bucket{le="0.75"} 1.0
hh_bucket{le="1.0"} 1.0
hh_bucket{le="2.5"} 1.0
hh_bucket{le="5.0"} 1.0
hh_bucket{le="7.5"} 1.0
hh_bucket{le="10.0"} 1.0
hh_bucket{le="+Inf"} 1.0
hh_count 1.0
hh_sum 0.05
""",
        generate_latest(self.registry),
    )
def test_customize_reducer(self):
    h = Histogram('test_value', 'Testing roller', registry=self.registry)
    roller_max = HistogramRoller(h, registry=self.registry, options={
        'reducer': 'max'
    })
    roller_sum = HistogramRoller(h, registry=self.registry, options={
        'reducer': 'sum'
    })

    def always_one(*args, **kwargs):
        return 1

    roller_one = HistogramRoller(h, registry=self.registry, options={
        'reducer': always_one
    })

    for state in [2.6, 4.7, 3.8, 2.8]:
        h.observe(state)

    roller_max.collect()
    roller_sum.collect()
    roller_one.collect()

    # Deltas = 1, 1, 1
    nchecks = 0
    for m in self.registry.collect():
        if m.name.endswith('max_rolled'):
            for name, labels, val in m.samples:
                if labels['le'] == '5.0':
                    nchecks += 1
                    self.assertEqual(val, 1.0)
    self.assertTrue(nchecks > 0)

    nchecks = 0
    for m in self.registry.collect():
        if m.name.endswith('sum_rolled'):
            for name, labels, val in m.samples:
                if labels['le'] == '5.0':
                    self.assertEqual(val, 3.0)
                    nchecks += 1
    self.assertTrue(nchecks > 0)

    nchecks = 0
    for m in self.registry.collect():
        if m.name.endswith('always_one_rolled'):
            for name, labels, val in m.samples:
                if labels['le'] == '5.0':
                    self.assertEqual(val, 1.0)
                    nchecks += 1
    self.assertTrue(nchecks > 0)
def test_collect(self):
    h = Histogram('test_value', 'Testing roller', registry=self.registry)
    roller = HistogramRoller(h, registry=self.registry)

    # Get values
    roller.collect()

    n_buckets = 0
    for _, _, _ in self.get_hist_samples():
        n_buckets += 1

    n_created_gauges = 0
    for _, _, _ in self.get_rolled_samples():
        n_created_gauges += 1

    self.assertTrue(n_buckets > 0)
    self.assertTrue(n_created_gauges > 0)
    self.assertEqual(n_buckets, n_created_gauges)

    # Check that roller values are still 0.0 after initial collection
    for name, labels, value in self.get_rolled_samples():
        self.assertEqual(value, 0.0)

    # Add some samples
    for i in range(100):
        h.observe(pow(2, i/10 - 2))

    # Collect histogram values
    hist_values = dict()
    for name, labels, value in self.get_hist_samples():
        hist_values[labels['le']] = value

    # Rolled values should still be 0.0 until the next collect()
    for name, labels, value in self.get_rolled_samples():
        self.assertEqual(value, 0.0)

    roller.collect()
    for name, labels, value in self.get_rolled_samples():
        self.assertEqual(value, hist_values[labels['le']])
them manually here.
"""
from enum import Enum

from prometheus_client import Histogram

REQUEST_DURATION_SECONDS = Histogram(
    'request_duration_seconds',
    'request duration for all HTTP requests',
    ['method', 'handler', 'code']
)

SERVER_SPAWN_DURATION_SECONDS = Histogram(
    'server_spawn_duration_seconds',
    'time taken for server spawning operation',
    ['status'],
    # Use custom bucket sizes, since the default bucket ranges
    # are meant for quick running processes. Spawns can take a while!
    buckets=[0.5, 1, 2.5, 5, 10, 15, 30, 60, 120, float("inf")]
)


class ServerSpawnStatus(Enum):
    """
    Possible values for 'status' label of SERVER_SPAWN_DURATION_SECONDS
    """
    success = 'success'
    failure = 'failure'
    already_pending = 'already-pending'
    throttled = 'throttled'
    too_many_users = 'too-many-users'

    def __str__(self):
from functools import partial

from prometheus_client import Counter, Histogram

BUCKETS = (0.01, 0.05, 0.1, 0.5, 0.75, 1.0, 2.5, 5.0, 7.5,
           10.0, 15.0, 20.0, 30.0)

requests_total = Counter(namespace='aiohttp',
                         subsystem='http',
                         name='requests_total',
                         documentation='Asyncio total Request Count',
                         labelnames=['method', 'handler', 'status'])

request_duration = Histogram(
    namespace='aiohttp',
    subsystem='http',
    name='request_duration_seconds',
    documentation='Request latency',
    labelnames=['method', 'handler'],
    buckets=BUCKETS,
)


class MetricsMiddleware:
    def __init__(self):
        pass

    async def __call__(self, app, handler):
        return partial(self.middleware, handler)

    async def middleware(self, handler, request):
        start_time = time()
        handler_name = handler.__name__
from prometheus_client import Counter, Histogram

api_exceptions = Counter("system_baseline_api_exceptions",
                         "count of exceptions raised on public API")

baseline_create_requests = Histogram("baseline_create_requests",
                                     "baseline create request stats")

baseline_fetch_requests = Histogram("baseline_fetch_requests",
                                    "baseline fetch request stats")

baseline_fetch_all_requests = Histogram("baseline_fetch_all_requests",
                                        "baseline fetch all request stats")

baseline_delete_requests = Histogram("baseline_delete_requests",
                                     "baseline delete request stats")

inventory_service_requests = Histogram("drift_inventory_service_requests",
                                       "inventory service call stats")

inventory_service_exceptions = Counter(
    "drift_inventory_service_exceptions",
    "count of exceptions raised by inv service")

systems_compared_no_sysprofile = Histogram(
    "drift_systems_compared_no_sysprofile",
    "count of systems without system profile "
    "compared in each request",
    buckets=[2, 4, 8, 16, 32, 64, 128, 256],
)
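# A brief usage sketch, not part of the original module: request histograms
# like the ones above are typically fed through the .time() helper, either as
# a decorator or a context manager. The handler body below is illustrative.
@baseline_create_requests.time()
def create_baseline(payload):
    ...  # actual request handling elided

with inventory_service_requests.time():
    ...  # call out to the inventory service here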
##########################
# jinja2 globals
##########################

def version():
    return os.environ.get("VERSION", "dev")[:6]


app.jinja_env.globals.update(version=version)

##########################
# Metrics!
##########################
REQUEST_LATENCY = Histogram("flask_request_latency_seconds", "Request Latency",
                            ['method', 'endpoint'])
REQUEST_COUNT = Counter("flask_request_count", "Request Count",
                        ["method", "endpoint", "status"])


@app.before_request
def start_timer():
    request.stats_start = time()


@app.after_request
def stop_timer(response):
    delta = time() - request.stats_start
    REQUEST_LATENCY.labels(request.method, request.endpoint).observe(delta)  #pylint: disable=no-member
    REQUEST_COUNT.labels(request.method, request.endpoint, response.status_code).inc()  #pylint: disable=no-member
def _create_histogram(name, description, value):
    METRICS[name] = METRICS.get(name) or Histogram(name, description)
    histogram = METRICS.get(name)
    histogram.observe(value)
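# A small usage sketch; METRICS is assumed to be the module-level dict the
# function above relies on (its initialisation here, and the metric name and
# values, are illustrative). Repeated calls reuse the cached Histogram instead
# of re-registering it.
METRICS = {}

_create_histogram("job_duration_seconds", "How long each job took", 1.2)
_create_histogram("job_duration_seconds", "How long each job took", 0.7)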
NotificationRecord = collections.namedtuple('NotificationRecord', [
    'subscriptionId',
    'clientState',
    'changeType',
    'resource',
    'dataType',
    'url',
    'id',
])

if PROMETHEUS:
    SUBSCR_COUNT = Counter('kopano_mfr_kopano_total_subscriptions', 'Total number of subscriptions')
    SUBSCR_EXPIRED = Counter('kopano_mfr_kopano_total_expired_subscriptions', 'Total number of subscriptions which expired')
    SUBSCR_ACTIVE = Gauge('kopano_mfr_kopano_active_subscriptions', 'Number of active subscriptions', multiprocess_mode='liveall')
    PROCESSOR_BATCH_HIST = Histogram('kopano_mfr_kopano_webhook_batch_size', 'Number of webhook posts processed in one batch')
    POST_COUNT = Counter('kopano_mfr_kopano_total_webhook_posts', 'Total number of webhook posts')
    POST_ERRORS = Counter('kopano_mfr_kopano_total_webhook_post_errors', 'Total number of webhook post errors')
    POST_HIST = Histogram('kopano_mfr_kopano_webhook_post_duration_seconds', 'Duration of webhook post requests in seconds')
    DANGLING_COUNT = Counter('kopano_mfr_kopano_total_broken_subscription_conns', 'Total number of broken subscription connections')
    QUEUE_SIZE_GAUGE = Gauge('kopano_mfr_kopano_subscription_queue_size', 'Current size of subscriptions processor queue', multiprocess_mode='liveall')
    PROCESSOR_POOL_GAUGE = Gauge('kopano_mfr_kopano_webhook_pools', 'Current number of webhook pools')


class Record:
    """Record binds subscription and connection information per user."""

    def __init__(self, server, user, store, subscriptions):
        """Python built-in method.

        Args:
import re from urllib.parse import urlparse from ...config import config from prometheus_client import Summary from prometheus_client import Histogram from prometheus_async.aio import time REQ_TIME = Summary("external_to_internal_req_time", "time spent with external_to_internal endpoint") REQ_HISTOGRAM_TIME = Histogram("external_to_internal_req_histogram", "Histogram for external_to_internal endpoint") @time(REQ_TIME) @time(REQ_HISTOGRAM_TIME) async def translate(external_to_internal_spec, repo_provider): external_url = external_to_internal_spec["external_url"] internal_url = await translate_external_to_internal(external_url) result = {"external_url": external_url, "internal_url": internal_url} return result async def translate_external_to_internal(external_git_url): """ Logic from original maitai code to do this: found in GitUrlParser.java#generateInternalGitRepoName """
LOGGER = get_logger(__name__) VMAAS_HOST = os.getenv('VMAAS_HOST', 'http://vmaas-webapp-1.vmaas-ci.svc:8080') VMAAS_VULNERABILITIES_API = os.getenv("VMAAS_VULNERABILITIES_API", "/api/v1/vulnerabilities") vmaas_vulnerabilities_endpoint = "%s%s" % (VMAAS_HOST, VMAAS_VULNERABILITIES_API) # pylint: disable=invalid-name kafka_evaluator_topic = os.getenv('EVALUATOR_TOPIC', # pylint: disable=invalid-name 'vulnerability.evaluator.upload,vulnerability.evaluator.recalc').split(",") prometheus_port = os.getenv('PROMETHEUS_PORT', '8085') # pylint: disable=invalid-name # number of worker threads WORKER_THREADS = int(os.getenv('WORKER_THREADS', '30')) MAX_QUEUE_SIZE = int(os.getenv('MAX_QUEUE_SIZE', '30')) # prometheus probes # times VMAAS_EVAL_TIME = Histogram('ve_evaluator_vmaas_evaluation_seconds', 'Time spent checking a system for vmaas hits') # counts VMAAS_COUNT = Counter('ve_evaluator_vmaas_calls', 'Number of VMaaS-evaluations attempted') INV_ID_NOT_FOUND = Counter('ve_evaluator_inventory_not_found', 'Number of times inventory-id not in SystemPlatform') UNKNOWN_MSG = Counter('ve_evaluator_unknown_msg', 'Number of unrecognized messages delivered from queue') UNKNOWN_TOPIC = Counter('ve_evaluator_unknown_topic', 'Number of times message delivered from unsupported topic') CONSUMER_QUEUE = mqueue.MQReader(kafka_evaluator_topic) WEBHOOKS_QUEUE = mqueue.MQWriter(mqueue.WEBHOOKS_TOPIC) async def terminate(_, loop): """Trigger shutdown.""" LOGGER.info("Signal received, stopping kafka consumers.") await CONSUMER_QUEUE.stop() await WEBHOOKS_QUEUE.stop()
import time

from flask import request
from prometheus_client import Counter, Histogram
from prometheus_client import start_http_server, make_wsgi_app
from werkzeug.wsgi import DispatcherMiddleware

FLASK_REQUEST_ENDPOINT_SENTINEL = '-'
FLASK_REQUEST_LATENCY = Histogram('flask_request_latency_seconds', 'Flask Request Latency',
                                  ['method', 'endpoint'])
FLASK_REQUEST_COUNT = Counter('flask_request_count', 'Flask Request Count',
                              ['method', 'endpoint', 'http_status'])


def before_request():
    request.start_time = time.time()


def after_request(response):
    request_latency = time.time() - request.start_time
    endpoint = request.url_rule.rule if request.url_rule else FLASK_REQUEST_ENDPOINT_SENTINEL
    FLASK_REQUEST_LATENCY.labels(request.method, endpoint).observe(request_latency)
    FLASK_REQUEST_COUNT.labels(request.method, endpoint, response.status_code).inc()

    return response
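# A wiring sketch (the monitor() helper name is an assumption, not from the
# original file): the hooks above only take effect once they are registered on
# the Flask app, and /metrics is typically served via make_wsgi_app().
def monitor(app):
    app.before_request(before_request)
    app.after_request(after_request)
    # Expose the default registry under /metrics alongside the Flask app.
    app.wsgi_app = DispatcherMiddleware(app.wsgi_app, {'/metrics': make_wsgi_app()})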
metrics_registry = CollectorRegistry()
multiprocess.MultiProcessCollector(metrics_registry)

APP_INFO = Info("app_info", "Application information", registry=metrics_registry)

REQUESTS_TOTAL = Counter(
    "http_requests_total",
    "Service Request Count",
    ["method", "endpoint", "http_status"],
    registry=metrics_registry,
)

REQUEST_LATENCY = Histogram(
    "request_latency_ms",
    "Request latency in milliseconds",
    ["method", "endpoint"],
    registry=metrics_registry,
)

RPCS_TOTAL = Counter(
    "rpc_requests_total",
    "Remote procedure call count",
    ["method", "endpoint", "http_status"],
    registry=metrics_registry,
)

RPC_LATENCY = Histogram(
    "rpc_request_latency_ms",
    "Remote procedure call latency in milliseconds",
    ["method", "endpoint", "http_status"],
    registry=metrics_registry,
)
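# Observation sketch (the method/endpoint values are illustrative): the latency
# histograms above are declared in milliseconds, so callers must convert from
# seconds before observing.
import time

start = time.perf_counter()
# ... handle the request ...
elapsed_ms = (time.perf_counter() - start) * 1000.0
REQUEST_LATENCY.labels("GET", "/health").observe(elapsed_ms)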
'trace_traceback_number', 'number of traceback produced by function', ['app_name', 'endpoint', 'func'], registry=registry, ) TRACEBACK_COUNTER_FUNC1 = TRACEBACK_COUNTER.labels(app_name='poc_app', endpoint='/trace', func='tracefail') TRACEBACK_COUNTER_FUNC2 = TRACEBACK_COUNTER.labels(app_name='poc_app', endpoint='/trace2', func='tracefail2') REQUEST_DECORATED = Histogram( 'root_request_processing_seconds', 'Time spent processing request', ['app_name', 'endpoint'], registry=registry, ) REQUEST_DECORATED_TIME = REQUEST_DECORATED.labels(app_name='poc_app', endpoint='/') REQUEST_DECORATED_TIME2 = REQUEST_DECORATED.labels(app_name='poc_app', endpoint='/counter') REQUEST_DECORATED_TIME3 = REQUEST_DECORATED.labels(app_name='poc_app', endpoint='/gauge') REQUEST_DECORATED_TIME4 = REQUEST_DECORATED.labels(app_name='poc_app', endpoint='/trace') REQUEST_DECORATED_TIME5 = REQUEST_DECORATED.labels(app_name='poc_app', endpoint='/trace2') INFO_TYPE = Info(
def got_histogram_observe(self, name, value):
    if self.check_enabled():
        histogram = self._monitoring_items[self.HISTOGRAM]
        if not histogram.get(name):
            histogram[name] = Histogram(name, name)
        return histogram[name].observe(value)
class ZombieCollector(Collector): logs_histogram = Histogram( "cmd_docker_logs_latency_seconds", "Command call latency for docker logs (seconds)") logs_timeout = 1 # 99th latency is 0.04s zombie_container_count = Gauge( "zombie_container_count", "number of zombie container found for this node", ["type"]) class ZombieRecorder(object): def __init__(self, type): self.type = type self.zombies = { } # key is container id, value is enter zombie time # When we first meet zombie container, we only record time of that meet, # we wait extra decay_time to report it as zombie. Because at the time # of our recording, zombie just produced, and haven't been recycled, we # wait 5 minutes to avoid possible cases of normal zombie. self.decay_time = datetime.timedelta(minutes=5) def update(self, zombie_ids, now): """ feed in new zombie ids and get id of decayed zombie """ # remove all records not exist anymore for z_id in list(self.zombies.keys()): if z_id not in zombie_ids: logger.debug("pop zombie %s that not exist anymore", z_id) self.zombies.pop(z_id) result = set() for current in zombie_ids: if current in self.zombies: enter_zombie_time = self.zombies[current] if now - enter_zombie_time > self.decay_time: result.add(current) else: logger.debug("new zombie %s", current) self.zombies[current] = now ZombieCollector.zombie_container_count.labels(self.type).set( len(result)) return result def __len__(self): return len(self.zombies) def __init__(self, name, sleep_time, atomic_ref, iteration_counter, stats_info_ref, zombie_ids_ref): Collector.__init__(self, name, sleep_time, atomic_ref, iteration_counter) self.stats_info_ref = stats_info_ref self.zombie_ids_ref = zombie_ids_ref self.type1_zombies = ZombieCollector.ZombieRecorder("job_exit_hangs") self.type2_zombies = ZombieCollector.ZombieRecorder("residual_job") self.yarn_pattern = u"container_\w{3}_[0-9]{13}_[0-9]{4}_[0-9]{2}_[0-9]{6}" self.yarn_container_reg = re.compile(u"^" + self.yarn_pattern + "$") self.job_container_reg = re.compile(u"^.+(" + self.yarn_pattern + u")$") def update_zombie_count_type1(self, exited_containers, now): """ this fn will generate zombie container count for the first type, exited_containers is container id set of which we believe exited """ return self.type1_zombies.update(exited_containers, now) def update_zombie_count_type2(self, stats, now): """ this fn will generate zombie container count for the second type """ name_to_id = {} for info in stats.values(): name_to_id[info["name"]] = info["id"] # key is job name, value is tuple of corresponding # yarn_container name and job container id job_containers = {} yarn_containers = set() zombie_ids = set() for name, id in name_to_id.items(): if re.match(self.yarn_container_reg, name) is not None: yarn_containers.add(name) elif re.match(self.job_container_reg, name) is not None: match = re.match(self.job_container_reg, name) value = match.groups()[0] job_containers[name] = (value, id) else: pass # ignore for _, val in job_containers.items(): yarn_name, job_id = val if yarn_name not in yarn_containers: zombie_ids.add(job_id) return self.type2_zombies.update(zombie_ids, now) def docker_logs(self, container_id, tail="all"): try: return utils.exec_cmd( ["docker", "logs", "--tail", str(tail), str(container_id)], histogram=ZombieCollector.logs_histogram, stderr=subprocess.STDOUT, # also capture stderr output timeout=ZombieCollector.logs_timeout) except subprocess.TimeoutExpired as e: logger.warning("docker log timeout") except subprocess.CalledProcessError as e: logger.warning("docker logs 
returns %d, output %s", e.returncode, e.output) except Exception: logger.exception("exec docker logs error") return "" def is_container_exited(self, container_id): logs = self.docker_logs(container_id, tail=50) if re.search(u"USER COMMAND END", logs): return True return False def update_zombie_count(self, stats): """ There are two types of zombie: 1. container which outputted "USER COMMAND END" but did not exist for a long period of time 2. yarn container exited but job container didn't return set of container id that deemed as zombie """ if stats is None: logger.warning("docker stats is None") return exited_containers = set(filter(self.is_container_exited, stats.keys())) now = datetime.datetime.now() type1_zombies = self.update_zombie_count_type1(exited_containers, now) type2_zombies = self.update_zombie_count_type2(stats, now) return type1_zombies.union(type2_zombies) def collect_impl(self): # set it to None so if docker-stats hangs till next time we get, # we will get None stats_info = self.stats_info_ref.get(datetime.datetime.now()) all_zombies = self.update_zombie_count(stats_info) self.zombie_ids_ref.set(all_zombies, datetime.datetime.now())
class ContainerCollector(Collector): stats_histogram = Histogram( "cmd_docker_stats_latency_seconds", "Command call latency for docker stats (seconds)") stats_timeout = 20 # 99th latency may larger than 10s, # Because prometheus's largest bucket for recording histogram is 10s, # we can not get value higher than 10s. inspect_histogram = Histogram( "cmd_docker_inspect_latency_seconds", "Command call latency for docker inspect (seconds)") inspect_timeout = 1 # 99th latency is 0.042s iftop_histogram = Histogram("cmd_iftop_latency_seconds", "Command call latency for iftop (seconds)") iftop_timeout = 10 # 99th latency is 7.4s lsof_histogram = Histogram("cmd_lsof_latency_seconds", "Command call latency for lsof (seconds)") lsof_timeout = 2 # 99th latency is 0.5s pai_services = list( map( lambda s: "k8s_" + s, [ # Run in master node "rest-server", "pylon", "webportal", "grafana", "prometheus", "alertmanager", "watchdog", "frameworkcontroller", "hivedscheduler", "framework-watcher_database-controller", "write-merger_database-controller", "poller_database-controller", "dshuttle-master", "dshuttle-job-master", "fluentd", "postgresql_postgresql", # Run as daemon set "node-exporter", "job-exporter", "log-manager-nginx", "log-cleaner", "dshuttle-worker", "dshuttle-job-worker", "dshuttle-csi-daemon", "weave", "weave-npc", "nvidia-device-plugin-ctr", "k8s-host-device", "amdgpu", "k8s-rdma", ])) def __init__(self, name, sleep_time, atomic_ref, iteration_counter, gpu_info_ref, stats_info_ref, interface): Collector.__init__(self, name, sleep_time, atomic_ref, iteration_counter) self.gpu_info_ref = gpu_info_ref self.stats_info_ref = stats_info_ref self.network_interface = network.try_to_get_right_interface(interface) logger.info( "found %s as potential network interface to listen network traffic", self.network_interface) self.gpu_vendor = utils.get_gpu_vendor() # k8s will prepend "k8s_" to pod name. There will also be a container name # prepend with "k8s_POD_" which is a docker container used to construct # network & pid namespace for specific container. These container prepend # with "k8s_POD" consume nothing. def collect_impl(self): all_conns = network.iftop(self.network_interface, ContainerCollector.iftop_histogram, ContainerCollector.iftop_timeout) stats_obj = docker_stats.stats(ContainerCollector.stats_histogram, ContainerCollector.stats_timeout) now = datetime.datetime.now() gpu_infos = self.gpu_info_ref.get(now) self.stats_info_ref.set(stats_obj, now) logger.debug("all_conns is %s", all_conns) logger.debug("gpu_info is %s", gpu_infos) logger.debug("stats_obj is %s", stats_obj) return self.collect_container_metrics(stats_obj, gpu_infos, all_conns) @staticmethod def parse_from_labels(inspect_info, gpu_infos): gpu_ids = [] result_labels = {} result_labels["username"] = inspect_info.username or "unknown" result_labels["job_name"] = inspect_info.job_name or "unknown" result_labels["role_name"] = inspect_info.role_name or "unknown" result_labels["task_index"] = inspect_info.task_index or "unknown" result_labels[ "job_instance_id"] = inspect_info.job_instance_id or "unknown" result_labels[ "virtual_cluster"] = inspect_info.virtual_cluster or "unknown" if inspect_info.gpu_ids: ids = inspect_info.gpu_ids.replace("\"", "").split(",") for id in ids: # If the container was scheduled by yarn, we get its GPU usage # info from label GPU_ID, value of the label is minor_number, and # will be digits. 
# If the container was scheduled by kube launcher, we get its GPU # usage info from environment NVIDIA_VISIBLE_DEVICES, the value # is like GPU-dc0671b0-61a4-443e-f456-f8fa6359b788. The mapping # from uuid to minor_number is get via nvidia-smi, and gpu_infos # should have key of this uuid. if id.isdigit(): gpu_ids.append(id) elif id and gpu_infos is not None: # id is in form of UUID like if gpu_infos.get(id) is not None: gpu_ids.append(gpu_infos[id].minor) else: logger.warning( "gpu uuid %s can not be found in map %s", id, gpu_infos) else: logger.warning("unknown gpu id %s, gpu_infos is %s", id, gpu_infos) return gpu_ids, result_labels @classmethod def infer_service_name(cls, container_name): """ try to infer service name from container_name, if it's container not belongs to pai service, will return None """ if container_name.startswith("k8s_POD_"): # this is empty container created by k8s for pod return None # TODO speed this up, since this is O(n^2) for service_name in cls.pai_services: if container_name.startswith(service_name): return service_name[4:] # remove "k8s_" prefix return None def process_one_container(self, container_id, stats, gpu_infos, all_conns, gauges): container_name = utils.walk_json_field_safe(stats, "name") pai_service_name = ContainerCollector.infer_service_name( container_name) inspect_info = docker_inspect.inspect( container_id, ContainerCollector.inspect_histogram, ContainerCollector.inspect_timeout, self.gpu_vendor) pid = inspect_info.pid job_name = inspect_info.job_name logger.debug("%s has inspect result %s, service_name %s", container_name, inspect_info, pai_service_name) if job_name is None and pai_service_name is None: logger.debug("%s is ignored", container_name) return # other container, maybe kubelet or api-server # get network consumption, since all our services/jobs running in host # network, and network statistic from docker is not specific to that # container. We have to get network statistic by ourselves. 
lsof_result = network.lsof(pid, ContainerCollector.lsof_histogram, ContainerCollector.lsof_timeout) net_in, net_out = network.get_container_network_metrics( all_conns, lsof_result) if logger.isEnabledFor(logging.DEBUG): debug_info = utils.exec_cmd( "ps -o cmd fp {0} | tail -n 1".format(pid), shell=True) logger.debug( "pid %s with cmd `%s` has lsof result %s, in %d, out %d", pid, debug_info.strip(), lsof_result, net_in, net_out) if pai_service_name is None: gpu_ids, container_labels = ContainerCollector.parse_from_labels( inspect_info, gpu_infos) if gpu_infos: for id in gpu_ids: if gpu_infos.get(id) is None: continue nvidia_gpu_status = gpu_infos[id] labels = copy.deepcopy(container_labels) labels["minor_number"] = id gauges.add_value("task_gpu_percent", labels, nvidia_gpu_status.gpu_util) gauges.add_value("task_gpu_mem_percent", labels, nvidia_gpu_status.gpu_mem_util) gauges.add_value("task_cpu_percent", container_labels, stats["CPUPerc"]) gauges.add_value("task_mem_usage_byte", container_labels, stats["MemUsage_Limit"]["usage"]) gauges.add_value("task_mem_limit_byte", container_labels, stats["MemUsage_Limit"]["limit"]) gauges.add_value("task_net_in_byte", container_labels, net_in) gauges.add_value("task_net_out_byte", container_labels, net_out) gauges.add_value("task_block_in_byte", container_labels, stats["BlockIO"]["in"]) gauges.add_value("task_block_out_byte", container_labels, stats["BlockIO"]["out"]) gauges.add_value("task_mem_usage_percent", container_labels, stats["MemPerc"]) else: labels = {"name": pai_service_name} gauges.add_value("service_cpu_percent", labels, stats["CPUPerc"]) gauges.add_value("service_mem_usage_byte", labels, stats["MemUsage_Limit"]["usage"]) gauges.add_value("service_mem_limit_byte", labels, stats["MemUsage_Limit"]["limit"]) gauges.add_value("service_mem_usage_percent", labels, stats["MemPerc"]) gauges.add_value("service_net_in_byte", labels, net_in) gauges.add_value("service_net_out_byte", labels, net_out) gauges.add_value("service_block_in_byte", labels, stats["BlockIO"]["in"]) gauges.add_value("service_block_out_byte", labels, stats["BlockIO"]["out"]) def collect_container_metrics(self, stats_obj, gpu_infos, all_conns): if stats_obj is None: logger.warning("docker stats returns None") return None gauges = ResourceGauges() for container_id, stats in stats_obj.items(): try: self.process_one_container(container_id, stats, gpu_infos, all_conns, gauges) except Exception: logger.exception( "error when trying to process container %s with name %s", container_id, utils.walk_json_field_safe(stats, "name")) return gauges.as_array()
class GpuCollector(Collector): nvidia_cmd_histogram = Histogram( "cmd_nvidia_smi_latency_seconds", "Command call latency for nvidia-smi (seconds)") amd_cmd_hostogram = Histogram( "cmd_rocm_smi_latency_seconds", "Command call latency for rocm-smi (seconds)") cmd_timeout = 60 # 99th latency is 0.97s def __init__(self, name, sleep_time, atomic_ref, iteration_counter, gpu_info_ref, zombie_info_ref, mem_leak_thrashold): Collector.__init__(self, name, sleep_time, atomic_ref, iteration_counter) self.gpu_info_ref = gpu_info_ref self.zombie_info_ref = zombie_info_ref self.mem_leak_thrashold = mem_leak_thrashold self.gpu_vendor = utils.get_gpu_vendor() @staticmethod def get_container_id(pid): """ return two values, the first one is if we found the corresponding container_id, the second one is the container_id if found """ path = "/proc/%d/cgroup" % (pid) if not os.path.isfile(path): return False, "" with open(path) as f: content = f.read() for line in content.split("\n"): line = line.strip() if "pids" in line: if "/docker/" in line: parts = line.split("/docker/") if len(parts) == 2 and re.match(u"[0-9a-f]+", parts[1]): return True, parts[1] elif "/kubepods/" in line: parts = line.split("/kubepods/") if len(parts) == 2 and re.match(u"pod[0-9a-f-]+", parts[1]): return True, parts[1] else: logger.info("unknown format in pid cgroup %s", line) return False, "" @staticmethod def gen_common_gpu_gauge(): return gen_gpu_util_gauge(), gen_gpu_mem_util_gauge() @staticmethod def convert_nvidia_gpu_info_to_metrics( gpu_info, zombie_info, pid_to_cid_fn, mem_leak_thrashold, node_name=os.environ.get("NODE_NAME")): """ This fn used to convert gpu_info & zombie_info into metrics, used to make it easier to do unit test """ # common gpu metrics gpu_core_util, gpu_mem_util = GpuCollector.gen_common_gpu_gauge() # nvidia metrics nvidia_core_utils = gen_nvidia_gpu_util_gauge() nvidia_mem_utils = gen_nvidia_gpu_mem_util_gauge() nvidia_gpu_temp = gen_nvidia_gpu_temperature_gauge() nvidia_ecc_errors = gen_nvidia_gpu_ecc_counter() nvidia_mem_leak = gen_nvidia_gpu_memory_leak_counter() external_process = gen_gpu_used_by_external_process_counter() zombie_container = gen_gpu_used_by_zombie_container_counter() pids_use_gpu = {} # key is gpu minor, value is an array of pid for minor, info in gpu_info.items(): if not minor.isdigit(): continue # ignore UUID gpu_core_util.add_metric([minor, GpuVendor.NVIDIA.value], info.gpu_util) gpu_mem_util.add_metric([minor, GpuVendor.NVIDIA.value], info.gpu_mem_util) nvidia_core_utils.add_metric([minor], info.gpu_util) nvidia_mem_utils.add_metric([minor], info.gpu_mem_util) if info.temperature is not None: nvidia_gpu_temp.add_metric([minor], info.temperature) nvidia_ecc_errors.add_metric([node_name, minor, "single"], info.ecc_errors.single) nvidia_ecc_errors.add_metric([node_name, minor, "double"], info.ecc_errors.double) # TODO: this piece of code seems not corret, gpu_mem_util is # a percentage number but mem_leak_thrashold is memory size. Need to fix it. 
if info.gpu_mem_util > mem_leak_thrashold and len(info.pids) == 0: # we found memory leak less than 20M can be mitigated automatically nvidia_mem_leak.add_metric([minor], 1) if len(info.pids) > 0: pids_use_gpu[minor] = info.pids logger.debug("pids_use_gpu is %s, zombie_info is %s", pids_use_gpu, zombie_info) if len(pids_use_gpu) > 0: if zombie_info is None: zombie_info = [] for minor, pids in pids_use_gpu.items(): for pid in pids: found, z_id = pid_to_cid_fn(pid) logger.debug("pid %s has found %s, z_id %s", pid, found, z_id) if found: # NOTE: zombie_info is a set of short docker container id, but # z_id is full id. for zombie_id in zombie_info: if z_id.startswith(zombie_id): # found corresponding container zombie_container.add_metric([minor, zombie_id], 1) else: external_process.add_metric([minor, str(pid)], 1) if len(zombie_container.samples) > 0 or len( external_process.samples) > 0: logger.warning( "found gpu used by external %s, zombie container %s", external_process, zombie_container) return [ nvidia_core_utils, nvidia_mem_utils, nvidia_ecc_errors, nvidia_mem_leak, external_process, zombie_container, nvidia_gpu_temp, gpu_core_util, gpu_mem_util ] @staticmethod def convert_amd_gpu_info_to_metrics(gpu_info): # common gpu metrics gpu_core_util, gpu_mem_util = GpuCollector.gen_common_gpu_gauge() # amd metrics amd_core_utils = gen_amd_gpu_util_gauge() amd_mem_utils = gen_amd_gpu_mem_util_gauge() amd_gpu_temp = gen_amd_gpu_temperature_gauge() for minor, info in gpu_info.items(): gpu_core_util.add_metric([minor, GpuVendor.AMD.value], info.gpu_util) gpu_mem_util.add_metric([minor, GpuVendor.AMD.value], info.gpu_mem_util) amd_core_utils.add_metric([minor], info.gpu_util) amd_mem_utils.add_metric([minor], info.gpu_mem_util) amd_gpu_temp.add_metric([minor], info.temperature) return [ amd_core_utils, amd_mem_utils, amd_gpu_temp, gpu_core_util, gpu_mem_util ] def collect_impl(self): if self.gpu_vendor == GpuVendor.UNKNOWN: logger.warning( "Couldn't identify the GPU vendor, please make sure the GPU driver installed correctly" ) return None if self.gpu_vendor == GpuVendor.NVIDIA: gpu_info = nvidia.nvidia_smi(GpuCollector.nvidia_cmd_histogram, GpuCollector.cmd_timeout) logger.debug("get nvidia gpu_info %s", gpu_info) now = datetime.datetime.now() self.gpu_info_ref.set(gpu_info, now) zombie_info = self.zombie_info_ref.get(now) if gpu_info: return GpuCollector.convert_nvidia_gpu_info_to_metrics( gpu_info, zombie_info, GpuCollector.get_container_id, self.mem_leak_thrashold) return None if self.gpu_vendor == GpuVendor.AMD: gpu_info = amd.rocm_smi(GpuCollector.amd_cmd_hostogram, GpuCollector.cmd_timeout) logger.debug("get amd gpu info %s", gpu_info) self.gpu_info_ref.set(gpu_info, datetime.datetime.now()) if gpu_info: return GpuCollector.convert_amd_gpu_info_to_metrics(gpu_info) return None return None
from synapse.events.snapshot import EventContext from synapse.logging.utils import log_function from synapse.state import v1, v2 from synapse.storage.data_stores.main.events_worker import EventRedactBehaviour from synapse.types import StateMap from synapse.util.async_helpers import Linearizer from synapse.util.caches import get_cache_factor_for from synapse.util.caches.expiringcache import ExpiringCache from synapse.util.metrics import Measure, measure_func logger = logging.getLogger(__name__) # Metrics for number of state groups involved in a resolution. state_groups_histogram = Histogram( "synapse_state_number_state_groups_in_resolution", "Number of state groups used when performing a state resolution", buckets=(1, 2, 3, 5, 7, 10, 15, 20, 50, 100, 200, 500, "+Inf"), ) KeyStateTuple = namedtuple("KeyStateTuple", ("context", "type", "state_key")) SIZE_OF_CACHE = 100000 * get_cache_factor_for("state_cache") EVICTION_TIMEOUT_SECONDS = 60 * 60 _NEXT_STATE_ID = 1 POWER_KEY = (EventTypes.PowerLevels, "") def _gen_state_id(): global _NEXT_STATE_ID
import argparse

from flask import Flask, render_template_string, abort
from prometheus_client import generate_latest, REGISTRY, Counter, Gauge, Histogram

app = Flask(__name__)

# A counter to count the total number of HTTP requests
REQUESTS = Counter('http_requests_total', 'Total HTTP Requests (count)',
                   ['method', 'endpoint', 'status_code'])

# A gauge (i.e. goes up and down) to monitor the total number of in progress requests
IN_PROGRESS = Gauge('http_requests_inprogress', 'Number of in progress HTTP requests')

# A histogram to measure the latency of the HTTP requests
TIMINGS = Histogram('http_request_duration_seconds', 'HTTP request latency (seconds)')

# A gauge to count the number of packages newly added
PACKAGES_NEW = Gauge('packages_newly_added', 'Packages newly added')


# Standard Flask route stuff.
@app.route('/')
# Helper annotation to measure how long a method takes and save as a histogram metric.
@TIMINGS.time()
# Helper annotation to increment a gauge when entering the method and decrementing when leaving.
@IN_PROGRESS.track_inprogress()
def hello_world():
    REQUESTS.labels(method='GET', endpoint="/", status_code=200).inc()  # Increment the counter
    return 'Hello, World!'
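# A likely companion route, sketched here rather than taken from the original
# file: the imported generate_latest/REGISTRY pair is normally exposed like this.
from prometheus_client import CONTENT_TYPE_LATEST


@app.route('/metrics')
def metrics():
    return generate_latest(REGISTRY), 200, {'Content-Type': CONTENT_TYPE_LATEST}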
def _histogram(self, var, var_help, labels, buckets):
    return Histogram(var, var_help, labels, buckets=buckets,
                     registry=self._reg)  # pylint: disable=unexpected-keyword-arg
import contextlib

from prometheus_client import Counter, Gauge, Histogram

"""
The logic of the Prometheus metrics is defined in this module
"""

IDUNN_WIKI_REQUEST_DURATION = Histogram(
    "idunn_wiki_request_duration_seconds",
    "Time spent processing a Wiki request.",
    ["target", "handler"],
)

IDUNN_WIKI_EXCEPTIONS_COUNT = Counter(
    "idunn_wiki_exceptions_count",
    "Number of exceptions caught in Idunn WikipediaBlock.",
    ["exception_type"]
)


@contextlib.contextmanager
def wiki_request_duration(target, handler):
    with IDUNN_WIKI_REQUEST_DURATION.labels(target, handler).time():
        yield


def exception(exception_type):
    IDUNN_WIKI_EXCEPTIONS_COUNT.labels(exception_type).inc()
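# Usage sketch (the target/handler values are illustrative, not from the
# original code): the context manager above times the wrapped block, and
# exception() counts failures by exception type.
try:
    with wiki_request_duration("wikipedia_api", "get_summary"):
        pass  # the actual Wikipedia call would go here
except Exception as exc:
    exception(type(exc).__name__)
    raise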
from models import sql

j2env = jinja2.Environment(
    loader=jinja2.FileSystemLoader("templates"),
    autoescape=jinja2.select_autoescape(["html", "xml"]),
)

# These things are either misconfigured to not send a static device_id
# or they're maliciously inflating their values. As such, we reject stats
# coming from them.
BLACKLIST = {"device_version": {"13.0-20180304-UNOFFICIAL-ht16": True}}

REQUEST_LATENCY = Histogram(
    "falcon_request_latency_seconds", "Request Latency", ["method", "endpoint"]
)
REQUEST_COUNT = Counter(
    "falcon_request_count", "Request Count", ["method", "endpoint", "status"]
)


class PrometheusComponent(object):
    def process_request(self, req, resp):
        req.context["start_time"] = time()

    def process_response(self, req, resp, resource, req_succeeded):
        delta = time() - req.context["start_time"]
        if req.relative_uri in ["/api/v1/stats", "/"]:
            REQUEST_LATENCY.labels(req.method, req.relative_uri).observe(delta)
            REQUEST_COUNT.labels(req.method, req.relative_uri, resp.status).inc()
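# Registration sketch, not part of the original snippet: Falcon middleware is
# attached when the app object is built, so process_request/process_response
# above run around every request. falcon.API is the pre-3.0 constructor; newer
# releases use falcon.App instead.
import falcon

app = falcon.API(middleware=[PrometheusComponent()])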
version, "min_version": util.version_string(wallet_server_version.PROTOCOL_MIN), "cpu_count": CPU_COUNT }) SESSIONS_COUNT = Gauge("session_count", "Number of connected client sessions", namespace=NAMESPACE, labelnames=("version", )) REQUESTS_COUNT = Counter("requests_count", "Number of requests received", namespace=NAMESPACE, labelnames=("method", "version")) RESPONSE_TIMES = Histogram("response_time", "Response times", namespace=NAMESPACE, labelnames=("method", "version")) NOTIFICATION_COUNT = Counter( "notification", "Number of notifications sent (for subscriptions)", namespace=NAMESPACE, labelnames=("method", "version")) REQUEST_ERRORS_COUNT = Counter("request_error", "Number of requests that returned errors", namespace=NAMESPACE, labelnames=("method", "version")) SQLITE_INTERRUPT_COUNT = Counter("interrupt", "Number of interrupted queries", namespace=NAMESPACE) SQLITE_OPERATIONAL_ERROR_COUNT = Counter( "operational_error",
def export_defaults(self, buckets=None, group_by='path', latency_as_histogram=True, prefix='flask', app=None, **kwargs): """ Export the default metrics: - HTTP request latencies - HTTP request exceptions - Number of HTTP requests :param buckets: the time buckets for request latencies (will use the default when `None`) :param group_by: group default HTTP metrics by this request property, like `path`, `endpoint`, `rule`, etc. (defaults to `path`) :param latency_as_histogram: export request latencies as a Histogram, otherwise use a Summary instead (defaults to `True` to export as a Histogram) :param prefix: prefix to start the default metrics names with or `NO_PREFIX` (to skip prefix) :param app: the Flask application """ if app is None: app = self.app or current_app if not prefix: prefix = self._defaults_prefix or 'flask' if kwargs.get('group_by_endpoint') is True: warnings.warn( 'The `group_by_endpoint` argument of ' '`PrometheusMetrics.export_defaults` is deprecated since 0.4.0, ' 'please use the new `group_by` argument.', DeprecationWarning) duration_group = 'endpoint' elif group_by: duration_group = group_by else: duration_group = 'path' if callable(duration_group): duration_group_name = duration_group.__name__ else: duration_group_name = duration_group if prefix == NO_PREFIX: prefix = "" else: prefix = prefix + "_" try: self.info('%sexporter_info' % prefix, 'Information about the Prometheus Flask exporter', version=self.version) except ValueError: return # looks like we have already exported the default metrics labels = self._get_combined_labels(None) if latency_as_histogram: # use the default buckets from prometheus_client if not given here buckets_as_kwargs = {} if buckets is not None: buckets_as_kwargs['buckets'] = buckets request_duration_metric = Histogram( '%shttp_request_duration_seconds' % prefix, 'Flask HTTP request duration in seconds', ('method', duration_group_name, 'status') + labels.keys(), registry=self.registry, **buckets_as_kwargs) else: # export as Summary instead request_duration_metric = Summary( '%shttp_request_duration_seconds' % prefix, 'Flask HTTP request duration in seconds', ('method', duration_group_name, 'status') + labels.keys(), registry=self.registry) counter_labels = ('method', 'status') + labels.keys() request_total_metric = Counter('%shttp_request_total' % prefix, 'Total number of HTTP requests', counter_labels, registry=self.registry) request_exceptions_metric = Counter( '%shttp_request_exceptions_total' % prefix, 'Total number of HTTP requests which resulted in an exception', counter_labels, registry=self.registry) def before_request(): request.prom_start_time = default_timer() def after_request(response): if hasattr(request, 'prom_do_not_track') or hasattr( request, 'prom_exclude_all'): return response if self.excluded_paths: if any( pattern.match(request.path) for pattern in self.excluded_paths): return response if hasattr(request, 'prom_start_time'): total_time = max(default_timer() - request.prom_start_time, 0) if callable(duration_group): group = duration_group(request) else: group = getattr(request, duration_group) request_duration_labels = { 'method': request.method, 'status': _to_status_code(response.status_code), duration_group_name: group } request_duration_labels.update(labels.values_for(response)) request_duration_metric.labels( **request_duration_labels).observe(total_time) request_total_metric.labels(method=request.method, status=_to_status_code( response.status_code), **labels.values_for(response)).inc() return response def 
teardown_request(exception=None): if not exception or hasattr(request, 'prom_do_not_track') or hasattr( request, 'prom_exclude_all'): return if self.excluded_paths: if any( pattern.match(request.path) for pattern in self.excluded_paths): return response = make_response('Exception: %s' % exception, 500) if callable(duration_group): group = duration_group(request) else: group = getattr(request, duration_group) request_exceptions_metric.labels( method=request.method, status=500, **labels.values_for(response)).inc() if hasattr(request, 'prom_start_time'): total_time = max(default_timer() - request.prom_start_time, 0) request_duration_labels = { 'method': request.method, 'status': 500, duration_group_name: group } request_duration_labels.update(labels.values_for(response)) request_duration_metric.labels( **request_duration_labels).observe(total_time) request_total_metric.labels(method=request.method, status=500, **labels.values_for(response)).inc() return app.before_request(before_request) app.after_request(after_request) app.teardown_request(teardown_request)
import time
import logging
from typing import Text

from flask import Blueprint, Response, request
from prometheus_client import multiprocess, Counter, Histogram, generate_latest, CollectorRegistry, REGISTRY

from pyms.flask.services.driver import DriverService

# Based on https://github.com/sbarratt/flask-prometheus
# and https://github.com/korfuri/python-logging-prometheus/

FLASK_REQUEST_LATENCY = Histogram(
    "http_server_requests_seconds", "Flask Request Latency", ["service", "method", "uri", "status"]
)
FLASK_REQUEST_COUNT = Counter(
    "http_server_requests_count", "Flask Request Count", ["service", "method", "uri", "status"]
)

LOGGER_TOTAL_MESSAGES = Counter(
    "logger_messages_total",
    "Count of log entries by service and level.",
    ["service", "level"],
)


class FlaskMetricsWrapper():
    def __init__(self, app_name):
        self.app_name = app_name

    def before_request(self):  # pylint: disable=R0201
        request.start_time = time.time()
    'hpfeeds_broker_subscriptions',
    'Number of subscriptions to a channel',
    ['ident', 'chan'],
)

RECEIVE_PUBLISH_COUNT = Counter(
    'hpfeeds_broker_receive_publish_count',
    'Number of events received by broker for a channel',
    ['ident', 'chan'],
)

RECEIVE_PUBLISH_SIZE = Histogram(
    'hpfeeds_broker_receive_publish_size',
    'Sizes of messages received by broker for a channel',
    ['ident', 'chan'],
    buckets=[
        1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072,
        262144, 524288, 1048576, 2097152, 4194304
    ],
)


def reset():
    ''' Reset the metrics to 0. This is intended for tests **only**. '''
    CLIENT_CONNECTIONS._value.set(0)
    SUBSCRIPTIONS._metrics = {}
    RECEIVE_PUBLISH_SIZE._metrics = {}
    RECEIVE_PUBLISH_COUNT._metrics = {}
    CLIENT_RECEIVE_BUFFER_FILL._metrics = {}
    CLIENT_SEND_BUFFER_FILL._metrics = {}
    CLIENT_SEND_BUFFER_DRAIN._metrics = {}
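# Observation sketch (the ident/chan values and payload are illustrative, not
# from the broker code): message size and count are typically recorded together
# when a publish event arrives for a channel.
ident, chan, payload = 'sensor-01', 'test.events', b'{"seen": true}'
RECEIVE_PUBLISH_COUNT.labels(ident, chan).inc()
RECEIVE_PUBLISH_SIZE.labels(ident, chan).observe(len(payload))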
from buildman.manager.basemanager import BaseManager from buildman.manager.executor import PopenExecutor, EC2Executor, KubernetesExecutor from buildman.component.buildcomponent import BuildComponent from buildman.jobutil.buildjob import BuildJob from buildman.server import BuildJobResult from util import slash_join from util.morecollections import AttrDict logger = logging.getLogger(__name__) build_fallback = Counter("quay_build_fallback_total", "number of times a build has been retried", labelnames=["executor"]) build_ack_duration = Histogram( "quay_build_ack_duration_seconds", "seconds taken for the builder to acknowledge a queued build", labelnames=["executor"], ) build_duration = Histogram( "quay_build_duration_seconds", "seconds taken for a build's execution", labelnames=["executor", "job_status"], ) JOB_PREFIX = "building/" LOCK_PREFIX = "lock/" REALM_PREFIX = "realm/" CANCEL_PREFIX = "cancel/" METRIC_PREFIX = "metric/" CANCELED_LOCK_PREFIX = slash_join(LOCK_PREFIX, "job-cancelled")
"Total number of RPCs started on the server.", ["doge_service", "doge_method"], ) DOGE_SERVER_HANDLED_TOTAL_COUNTER = Counter( "doge_server_handled_total", ( "Total number of RPCs completed on the server, " "regardless of success or failure." ), ["doge_service", "doge_method", "code"], ) DOGE_SERVER_HANDLED_LATENCY_SECONDS = Histogram( "doge_server_handled_latency_seconds", "Histogram of response latency (seconds) of gRPC that had been " "application-level handled by the server", ["doge_service", "doge_method"], ) class MetricsServerFilter(BaseFilter): def execute(self, req: Request) -> Response: doge_service = req.service doge_method = req.method DOGE_SERVER_STARTED_TOTAL_COUNTER.labels( doge_service=doge_service, doge_method=doge_method ).inc() with DOGE_SERVER_HANDLED_LATENCY_SECONDS.labels( doge_service=doge_service, doge_method=doge_method
class TestHistogram(unittest.TestCase): def setUp(self): self.registry = CollectorRegistry() self.histogram = Histogram('h', 'help', registry=self.registry) self.labels = Histogram('hl', 'help', ['l'], registry=self.registry) def test_histogram(self): self.assertEqual(0, self.registry.get_sample_value('h_bucket', {'le': '1.0'})) self.assertEqual(0, self.registry.get_sample_value('h_bucket', {'le': '2.5'})) self.assertEqual(0, self.registry.get_sample_value('h_bucket', {'le': '5.0'})) self.assertEqual(0, self.registry.get_sample_value('h_bucket', {'le': '+Inf'})) self.assertEqual(0, self.registry.get_sample_value('h_count')) self.assertEqual(0, self.registry.get_sample_value('h_sum')) self.histogram.observe(2) self.assertEqual(0, self.registry.get_sample_value('h_bucket', {'le': '1.0'})) self.assertEqual(1, self.registry.get_sample_value('h_bucket', {'le': '2.5'})) self.assertEqual(1, self.registry.get_sample_value('h_bucket', {'le': '5.0'})) self.assertEqual(1, self.registry.get_sample_value('h_bucket', {'le': '+Inf'})) self.assertEqual(1, self.registry.get_sample_value('h_count')) self.assertEqual(2, self.registry.get_sample_value('h_sum')) self.histogram.observe(2.5) self.assertEqual(0, self.registry.get_sample_value('h_bucket', {'le': '1.0'})) self.assertEqual(2, self.registry.get_sample_value('h_bucket', {'le': '2.5'})) self.assertEqual(2, self.registry.get_sample_value('h_bucket', {'le': '5.0'})) self.assertEqual(2, self.registry.get_sample_value('h_bucket', {'le': '+Inf'})) self.assertEqual(2, self.registry.get_sample_value('h_count')) self.assertEqual(4.5, self.registry.get_sample_value('h_sum')) self.histogram.observe(float("inf")) self.assertEqual(0, self.registry.get_sample_value('h_bucket', {'le': '1.0'})) self.assertEqual(2, self.registry.get_sample_value('h_bucket', {'le': '2.5'})) self.assertEqual(2, self.registry.get_sample_value('h_bucket', {'le': '5.0'})) self.assertEqual(3, self.registry.get_sample_value('h_bucket', {'le': '+Inf'})) self.assertEqual(3, self.registry.get_sample_value('h_count')) self.assertEqual(float("inf"), self.registry.get_sample_value('h_sum')) def test_setting_buckets(self): h = Histogram('h', 'help', registry=None, buckets=[0, 1, 2]) self.assertEqual([0.0, 1.0, 2.0, float("inf")], h._upper_bounds) h = Histogram('h', 'help', registry=None, buckets=[0, 1, 2, float("inf")]) self.assertEqual([0.0, 1.0, 2.0, float("inf")], h._upper_bounds) self.assertRaises(ValueError, Histogram, 'h', 'help', registry=None, buckets=[]) self.assertRaises(ValueError, Histogram, 'h', 'help', registry=None, buckets=[float("inf")]) self.assertRaises(ValueError, Histogram, 'h', 'help', registry=None, buckets=[3, 1]) def test_labels(self): self.labels.labels('a').observe(2) self.assertEqual(0, self.registry.get_sample_value('hl_bucket', {'le': '1.0', 'l': 'a'})) self.assertEqual(1, self.registry.get_sample_value('hl_bucket', {'le': '2.5', 'l': 'a'})) self.assertEqual(1, self.registry.get_sample_value('hl_bucket', {'le': '5.0', 'l': 'a'})) self.assertEqual(1, self.registry.get_sample_value('hl_bucket', {'le': '+Inf', 'l': 'a'})) self.assertEqual(1, self.registry.get_sample_value('hl_count', {'l': 'a'})) self.assertEqual(2, self.registry.get_sample_value('hl_sum', {'l': 'a'})) def test_function_decorator(self): self.assertEqual(0, self.registry.get_sample_value('h_count')) self.assertEqual(0, self.registry.get_sample_value('h_bucket', {'le': '+Inf'})) @self.histogram.time() def f(): pass f() self.assertEqual(1, self.registry.get_sample_value('h_count')) self.assertEqual(1, 
self.registry.get_sample_value('h_bucket', {'le': '+Inf'})) def test_block_decorator(self): self.assertEqual(0, self.registry.get_sample_value('h_count')) self.assertEqual(0, self.registry.get_sample_value('h_bucket', {'le': '+Inf'})) with self.histogram.time(): pass self.assertEqual(1, self.registry.get_sample_value('h_count')) self.assertEqual(1, self.registry.get_sample_value('h_bucket', {'le': '+Inf'}))
def update_seconds_not_divisible_by_1_exception():
    h = Histogram('test_value', 'Testing roller', registry=self.registry)
    roller = HistogramRoller(h, registry=self.registry, options={
        'update_seconds': 2.5
    })
define('PIO_MODEL_NAMESPACE', default='', help='prediction model namespace', type=str) define('PIO_MODEL_NAME', default='', help='prediction model name', type=str) define('PIO_MODEL_VERSION', default='', help='prediction model version', type=str) define('PIO_MODEL_SERVER_PORT', default='9876', help='tornado http server listen port', type=int) define('PIO_MODEL_SERVER_PROMETHEUS_PORT', default=8080, help='port to run the prometheus http metrics server on', type=int) MODEL_MODULE_NAME = 'pio_bundle' # Create a metric to track time spent and requests made. REQUEST_TIME = Summary('request_processing_seconds', 'Model Server: Time spent processing request') REQUEST_TIME.observe(1.0) # Observe 1.0 (seconds in this case) REQUESTS_IN_PROGRESS = Gauge('inprogress_requests', 'model server: request current in progress') REQUESTS_COUNT = Counter('http_requests_total', 'model server: total \ http request count since the last time the process was restarted', ['method', 'model_type', 'model_namespace', 'model_name', 'model_version']) EX_COUNT = Counter('exceptions_total', 'model server: total http request count since the last time the process was restarted') REQUEST_LATENCY = Histogram('http_request_processing_seconds', 'model server: time in seconds spent processing requests.') REQUEST_LATENCY_BUCKETS = Histogram('http_request_duration_microseconds', 'model server: \ time in microseconds spent processing requests.', ['method', 'model_type', 'model_namespace', 'model_name', 'model_version']) REGISTRY = CollectorRegistry() REGISTRY.register(REQUEST_TIME) REGISTRY.register(REQUESTS_IN_PROGRESS) REGISTRY.register(REQUESTS_COUNT) REGISTRY.register(EX_COUNT) REGISTRY.register(REQUEST_LATENCY) REGISTRY.register(REQUEST_LATENCY_BUCKETS) LOGGER = logging.getLogger(__name__) LOGGER.setLevel(logging.DEBUG) CH = logging.StreamHandler() CH.setLevel(logging.DEBUG) LOGGER.addHandler(CH)
from django_prometheus.utils import Time, TimeSince, PowersOf import django if django.VERSION >= (1, 10, 0): from django.utils.deprecation import MiddlewareMixin else: MiddlewareMixin = object requests_total = Counter( 'django_http_requests_before_middlewares_total', 'Total count of requests before middlewares run.') responses_total = Counter( 'django_http_responses_before_middlewares_total', 'Total count of responses before middlewares run.') requests_latency_before = Histogram( 'django_http_requests_latency_including_middlewares_seconds', ('Histogram of requests processing time (including middleware ' 'processing time).')) requests_unknown_latency_before = Counter( 'django_http_requests_unknown_latency_including_middlewares_total', ('Count of requests for which the latency was unknown (when computing ' 'django_http_requests_latency_including_middlewares_seconds).')) class PrometheusBeforeMiddleware(MiddlewareMixin): """Monitoring middleware that should run before other middlewares.""" def process_request(self, request): requests_total.inc() request.prometheus_before_middleware_event = Time() def process_response(self, request, response): responses_total.inc()
class _MarshalService(cls): def __init__(self, *args, **kwargs): from prometheus_client import Histogram, Counter, Gauge super(_MarshalService, self).__init__(*args, **kwargs) namespace = config('instrument').get( 'default_namespace') # its own namespace? service_name = self.bento_service_metadata_pb.name self.metrics_request_batch_size = Histogram( name=service_name + '_mb_batch_size', documentation=service_name + "microbatch request batch size", namespace=namespace, labelnames=['endpoint'], ) self.metrics_request_duration = Histogram( name=service_name + '_mb_requestmb_duration_seconds', documentation=service_name + "API HTTP request duration in seconds", namespace=namespace, labelnames=['endpoint', 'http_response_code'], ) self.metrics_request_in_progress = Gauge( name=service_name + "_mb_request_in_progress", documentation='Totoal number of HTTP requests in progress now', namespace=namespace, labelnames=['endpoint', 'http_method'], ) self.metrics_request_exception = Counter( name=service_name + "_mb_request_exception", documentation='Totoal number of service exceptions', namespace=namespace, labelnames=['endpoint', 'exception_class'], ) self.metrics_request_total = Counter( name=service_name + "_mb_request_total", documentation='Totoal number of service exceptions', namespace=namespace, labelnames=['endpoint', 'http_response_code'], ) async def request_dispatcher(self, request): func = super(_MarshalService, self).request_dispatcher api_name = request.match_info.get("name", "/") _metrics_request_in_progress = self.metrics_request_in_progress.labels( endpoint=api_name, http_method=request.method, ) _metrics_request_in_progress.inc() time_st = time.time() try: resp = await func(request) except Exception as e: # pylint: disable=broad-except self.metrics_request_exception.labels( endpoint=api_name, exception_class=e.__class__.__name__).inc() logger.error(traceback.format_exc()) resp = aiohttp.web.Response(status=500) self.metrics_request_total.labels( endpoint=api_name, http_response_code=resp.status).inc() self.metrics_request_duration.labels( endpoint=api_name, http_response_code=resp.status).observe(time.time() - time_st) _metrics_request_in_progress.dec() return resp async def _batch_handler_template(self, requests, api_name): func = super(_MarshalService, self)._batch_handler_template self.metrics_request_batch_size.labels(endpoint=api_name).observe( len(requests)) return await func(requests, api_name)
def setUp(self):
    self.registry = CollectorRegistry()
    self.histogram = Histogram('h', 'help', registry=self.registry)
    self.labels = Histogram('hl', 'help', ['l'], registry=self.registry)
'''middleware.py'''
import time

import falcon
from prometheus_client import Counter, Histogram
from prometheus_client import multiprocess, CollectorRegistry
from prometheus_client import generate_latest, CONTENT_TYPE_LATEST

REQUEST_COUNT = Counter('request_count', 'App Request Count',
                        ['app_name', 'method', 'endpoint', 'http_status'])
REQUEST_LATENCY = Histogram('request_latency_seconds', 'Request latency',
                            ['app_name', 'endpoint'])

API_PATHS = [
    '/misc/angdia', '/ct/lbb6/star', '/mt/wbh/star', '/t5/cargogen',
    '/ct/lbb2/cargogen/purchase', '/ct/lbb2/cargogen/sale', '/t5/orbit',
    '/misc/starcolor', '/metrics', '/ping'
]


class PrometheusMetrics(object):
    '''Prometheus metrics middleware'''

    @staticmethod
    def start_timer(request):
        '''Start request timer'''
        request.start_time = time.time()

    def stop_timer(self, request, response):
        '''Stop request timer'''
        metric_path = self.trim_path(request.path)
        resp_time = time.time() - request.start_time
        if metric_path: