import time

from classification_model import __version__ as live_version
from dl_classification_model import __version__ as shadow_version

from api.config import APP_NAME
from api.persistence import data_access

# Counter and Histogram are examples of default metrics
# available from the prometheus Python client.

# Request counter labelled by app name, HTTP method, endpoint and status.
REQUEST_COUNT = Counter(
    name='http_request_count',
    documentation='App Request Count',
    labelnames=['app_name', 'method', 'endpoint', 'http_status'],
)

# Request latency histogram (seconds) labelled by app name and endpoint.
REQUEST_LATENCY = Histogram(
    name='http_request_latency_seconds',
    documentation='Request latency',
    labelnames=['app_name', 'endpoint'],
)


def start_timer() -> None:
    """Stamp the current request with the wall-clock time it started at.

    The timestamp is stored on the ``request`` object so that
    ``stop_timer`` can compute the elapsed time later.
    """
    started_at = time.time()
    request._prometheus_metrics_request_start_time = started_at


def stop_timer(response: Response) -> Response:
    """Record how long the current request took and pass the response on.

    The elapsed time (now minus the timestamp stored by ``start_timer``) is
    observed on ``REQUEST_LATENCY`` under the labels ``APP_NAME`` and
    ``request.path``.

    Args:
        response: The response produced for the current request.

    Returns:
        The same ``response`` object, unmodified.
    """
    finished_at = time.time()
    elapsed = finished_at - request._prometheus_metrics_request_start_time
    latency_series = REQUEST_LATENCY.labels(app_name=APP_NAME,
                                            endpoint=request.path)
    latency_series.observe(elapsed)
    return response
import http.server
import time

from prometheus_client import Histogram, start_http_server

# Histogram of the time spent serving one "Hello World" GET request.
LATENCY = Histogram(
    'hello_world_latency_seconds',
    'Time for a request Hello World.',
)


class MyHandler(http.server.BaseHTTPRequestHandler):
    """Request handler that answers every GET with a plain ``Hello World``."""

    @LATENCY.time()
    def do_GET(self):
        """Send a 200 response with the body ``Hello World``.

        The ``LATENCY.time()`` decorator observes the duration of each call.
        """
        self.send_response(200)
        self.end_headers()
        self.wfile.write(b"Hello World")


if __name__ == "__main__":
    # Metrics are exposed on port 8000; the application listens on 8001.
    start_http_server(8000)
    server = http.server.HTTPServer(('localhost', 8001), MyHandler)
    server.serve_forever()
label_1='ping', label_2='GET', ).inc() request.app.metrics[MetricsType.TIME_LATENCY.name].labels( label_1='ping', label_2='GET', ).observe(round(float(time() - start_time), 3)) return json({'success': 'you are home'}) if __name__ == "__main__": monitor(app, multiprocess_mode='all', metrics_path='/metrics', is_middleware=False, metrics_list=[(MetricsType.COUNT.name, Counter(name=MetricsType.COUNT.value, documentation='Total count', labelnames=['label_1', 'label_2'])), (MetricsType.TIME_LATENCY.name, Histogram( name=MetricsType.TIME_LATENCY.value, documentation='Gauge', labelnames=['label_1', 'label_2'], ))]).expose_endpoint() app.add_route(ping, 'ping', methods=['GET']) app.run(host="127.0.0.1", port=8000, workers=2)
from prometheus_client import start_http_server, Summary, Gauge, Histogram import random import time import math # Create Prometheus metrics REQUEST_TIME = Summary('request_processing_seconds', 'Time spent processing request') LEVEL = Gauge('current_sea_level', 'Height of Tide at Time (Seconds)') STATUS = Gauge('current_tide_direction', 'Status of tide (Incoming Outgoing High Low)') buckets = (0, 0.05, 0.1, .15, .2, .25, .3, .35, .4, .45, .5, .55, math.inf) REQUEST_HIST = Histogram('latency_histogram', 'Latency Histogram', buckets=buckets) @REQUEST_TIME.time() def measure_sea_level(t): """A toy function that 'measures sea level' with some latency.""" t = t/10 time.sleep(max(0, random.normalvariate(0.3, 0.1))) level = 4 * math.asin(math.sin(t / 2)) * math.cos(t - 2) df = dfdt(t) # {rising: 1, falling: -1, high: 2, low: -2} if abs(df) > 0.25: status = math.copysign(1, df) elif dfdt(t - 1) > 0: status = 2 else: status = -2
# Runtime switches read from the environment.
# ``debug`` is True only when DEBUG is exactly the string "true".
debug = os.environ.get("DEBUG", "false") == "true"
# Kept as a string, exactly as read from the environment (default "19000").
metrics_port = os.environ.get("METRICS_PORT", "19000")

# Oracle-level metrics.
METRIC_MISSES = Gauge(
    "terra_oracle_misses_total",
    "Total number of oracle misses",
)
METRIC_HEIGHT = Gauge(
    "terra_oracle_height",
    "Block height of the LCD node",
)
METRIC_VOTES = Counter(
    "terra_oracle_votes",
    "Counter of oracle votes",
)

# Price metrics, labelled per denomination.
METRIC_MARKET_PRICE = Gauge(
    "terra_oracle_market_price",
    "Last market price",
    ['denom'],
)
METRIC_SWAP_PRICE = Gauge(
    "terra_oracle_swap_price",
    "Last swap price",
    ['denom'],
)

# Exchange quotes, labelled per exchange and denomination.
METRIC_EXCHANGE_ASK_PRICE = Gauge(
    "terra_oracle_exchange_ask_price",
    "Exchange ask price",
    ['exchange', 'denom'],
)
METRIC_EXCHANGE_MID_PRICE = Gauge(
    "terra_oracle_exchange_mid_price",
    "Exchange mid price",
    ['exchange', 'denom'],
)
METRIC_EXCHANGE_BID_PRICE = Gauge(
    "terra_oracle_exchange_bid_price",
    "Exchange bid price",
    ['exchange', 'denom'],
)

# Outbound HTTP request health, labelled per remote.
METRIC_OUTBOUND_ERROR = Counter(
    "terra_oracle_request_errors",
    "Outbound HTTP request error count",
    ["remote"],
)
METRIC_OUTBOUND_LATENCY = Histogram(
    "terra_oracle_request_latency",
    "Outbound HTTP request latency",
    ["remote"],
)

# parameters
# Maps each denomination to its FX pair symbol.
fx_map = dict(
    uusd="USDUSD",
    ukrw="USDKRW",
    usdr="USDSDR",
    umnt="USDMNT",
)
active_candidate = ["uusd", "ukrw", "usdr", "umnt"]
from tornado.log import app_log from tornado.queues import Queue from tornado.web import Finish, authenticated from .base import BaseHandler from .build import Build, ProgressEvent from .utils import KUBE_REQUEST_TIMEOUT # Separate buckets for builds and launches. # Builds and launches have very different characteristic times, # and there is a cost to having too many buckets in prometheus. BUILD_BUCKETS = [60, 120, 300, 600, 1800, 3600, 7200, float("inf")] LAUNCH_BUCKETS = [2, 5, 10, 20, 30, 60, 120, 300, 600, float("inf")] BUILD_TIME = Histogram( "binderhub_build_time_seconds", "Histogram of build times", ["status"], buckets=BUILD_BUCKETS, ) LAUNCH_TIME = Histogram( "binderhub_launch_time_seconds", "Histogram of launch times", ["status", "retries"], buckets=LAUNCH_BUCKETS, ) BUILD_COUNT = Counter( "binderhub_build_count", "Counter of builds by repo", ["status", "provider", "repo"], ) LAUNCH_COUNT = Counter( "binderhub_launch_count",
import time

from prometheus_client import Counter, Histogram
from prometheus_client import start_http_server
from flask import request

# Endpoint label used when the request did not match any URL rule.
FLASK_REQUEST_ENDPOINT_SENTINEL = "-"

FLASK_REQUEST_LATENCY = Histogram(
    'flask_request_latency_seconds',
    'Flask Request Latency',
    ['method', 'endpoint'])
FLASK_REQUEST_COUNT = Counter(
    'flask_request_count',
    'Flask Request Count',
    ['method', 'endpoint', 'http_status'])
FLASK_REQUEST_EXCEPTION_COUNT = Counter(
    'flask_request_exception_count',
    'Flask Request Exception Count',
    ['method', 'endpoint', 'http_status'])


def before_request():
    """Stamp the current request with its start time (seconds since epoch)."""
    request.start_time = time.time()


def after_request(response):
    """Record latency, request count and server-error count for a response.

    Args:
        response: The response object about to be returned; its
            ``status_code`` is used as the ``http_status`` label.

    Returns:
        The same ``response`` object, unmodified.
    """
    if request.url_rule:
        endpoint = request.url_rule.rule
    else:
        endpoint = FLASK_REQUEST_ENDPOINT_SENTINEL

    # before_request() may not have run for this request (for example when an
    # earlier hook already produced the response), so there may be no start
    # time to measure from; skip the latency sample instead of raising.
    started_at = getattr(request, "start_time", None)
    if started_at is not None:
        FLASK_REQUEST_LATENCY.labels(request.method, endpoint).observe(
            time.time() - started_at)

    FLASK_REQUEST_COUNT.labels(request.method, endpoint,
                               response.status_code).inc()

    # Counter.count_exceptions() only returns a context manager/decorator, so
    # calling it and discarding the result never incremented anything.  No
    # exception is in flight inside an after-request hook, so server-error
    # responses (5xx) are counted here instead.
    # NOTE(review): confirm that "5xx response" is the intended meaning of
    # this metric; exceptions mapped to 4xx responses are not counted.
    exception_series = FLASK_REQUEST_EXCEPTION_COUNT.labels(
        request.method, endpoint, response.status_code)
    if response.status_code >= 500:
        exception_series.inc()

    return response
class ZombieCollector(Collector):
    """Collector that detects zombie containers from docker stats.

    Two kinds of zombie are tracked (see ``update_zombie_count``). The ids
    found are published through ``zombie_ids_ref`` and their count through
    the ``zombie_container_count`` gauge.
    """

    logs_histogram = Histogram(
        "cmd_docker_logs_latency_seconds",
        "Command call latency for docker logs (seconds)",
        buckets=(1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0,
                 1024.0, float("inf")))
    logs_timeout = 1  # 99th latency is 0.04s

    zombie_container_count = Gauge(
        "zombie_container_count",
        "number of zombie container found for this node",
        ["type"])

    class ZombieRecorder(object):
        """Remember when each candidate was first seen and report it as a
        zombie only once it has stayed a candidate longer than decay_time."""

        def __init__(self, type):
            self.type = type
            self.zombies = {}  # key is container id, value is enter zombie time

            # When we first meet zombie container, we only record time of that
            # meet, we wait extra decay_time to report it as zombie. Because at
            # the time of our recording, zombie just produced, and haven't been
            # recycled, we wait 5 minutes to avoid possible cases of normal
            # zombie.
            self.decay_time = datetime.timedelta(minutes=5)

        def update(self, zombie_ids, now):
            """ feed in new zombie ids and get id of decayed zombie

            zombie_ids is the collection of ids currently considered zombie
            candidates, now is the current datetime. Returns the set of ids
            that have been candidates for longer than decay_time, and sets
            the zombie_container_count gauge for this type to its size. """
            # remove all records not exist anymore
            for z_id in list(self.zombies.keys()):
                if z_id not in zombie_ids:
                    logger.debug("pop zombie %s that not exist anymore", z_id)
                    self.zombies.pop(z_id)

            result = set()
            for current in zombie_ids:
                if current in self.zombies:
                    enter_zombie_time = self.zombies[current]
                    if now - enter_zombie_time > self.decay_time:
                        result.add(current)
                else:
                    logger.debug("new zombie %s", current)
                    self.zombies[current] = now

            ZombieCollector.zombie_container_count.labels(self.type).set(
                len(result))
            return result

        def __len__(self):
            return len(self.zombies)

    def __init__(self, name, sleep_time, atomic_ref, iteration_counter,
                 stats_info_ref, zombie_ids_ref):
        Collector.__init__(self, name, sleep_time, atomic_ref,
                           iteration_counter)
        self.stats_info_ref = stats_info_ref
        self.zombie_ids_ref = zombie_ids_ref

        self.type1_zombies = ZombieCollector.ZombieRecorder("job_exit_hangs")
        self.type2_zombies = ZombieCollector.ZombieRecorder("residual_job")

        # Raw string: "\w" is a regex escape, not a string escape, and is
        # reported as an invalid escape sequence in a non-raw literal.
        self.yarn_pattern = r"container_\w{3}_[0-9]{13}_[0-9]{4}_[0-9]{2}_[0-9]{6}"
        # a yarn container name is exactly the pattern
        self.yarn_container_reg = re.compile(u"^" + self.yarn_pattern + "$")
        # a job container name ends with the name of its yarn container
        self.job_container_reg = re.compile(
            u"^.+(" + self.yarn_pattern + u")$")

    def update_zombie_count_type1(self, exited_containers, now):
        """ this fn will generate zombie container count for the first type,
        exited_containers is container id set of which we believe exited """
        return self.type1_zombies.update(exited_containers, now)

    def update_zombie_count_type2(self, stats, now):
        """ this fn will generate zombie container count for the second type:
        job containers whose yarn container is not present in stats.
        Every value of stats must provide "name" and "id". """
        name_to_id = {info["name"]: info["id"] for info in stats.values()}

        # key is job name, value is tuple of corresponding
        # yarn_container name and job container id
        job_containers = {}
        yarn_containers = set()

        for name, container_id in name_to_id.items():
            if self.yarn_container_reg.match(name) is not None:
                yarn_containers.add(name)
                continue

            # match once and reuse the result instead of matching twice
            job_match = self.job_container_reg.match(name)
            if job_match is not None:
                job_containers[name] = (job_match.groups()[0], container_id)
            # every other container is ignored

        zombie_ids = set()
        for yarn_name, job_id in job_containers.values():
            if yarn_name not in yarn_containers:
                zombie_ids.add(job_id)

        return self.type2_zombies.update(zombie_ids, now)

    def docker_logs(self, container_id, tail="all"):
        """ return the output of `docker logs --tail <tail> <container_id>`,
        or an empty string when the command times out or fails """
        try:
            return utils.exec_cmd(
                ["docker", "logs", "--tail", str(tail), str(container_id)],
                histogram=ZombieCollector.logs_histogram,
                stderr=subprocess.STDOUT,  # also capture stderr output
                timeout=ZombieCollector.logs_timeout)
        except subprocess.TimeoutExpired:
            logger.warning("docker log timeout")
        except subprocess.CalledProcessError as e:
            logger.warning("docker logs returns %d, output %s",
                           e.returncode, e.output)
        except Exception:
            # best effort: a failing docker call must not stop the collector
            logger.exception("exec docker logs error")
        return ""

    def is_container_exited(self, container_id):
        """ return True if the last 50 log lines of the container contain
        the marker "USER COMMAND END" """
        logs = self.docker_logs(container_id, tail=50)
        return re.search(u"USER COMMAND END", logs) is not None

    def update_zombie_count(self, stats):
        """
        There are two types of zombie:
            1. container which outputted "USER COMMAND END" but did not exit
               for a long period of time
            2. yarn container exited but job container didn't
        return set of container id that deemed as zombie, or None when stats
        is None
        """
        if stats is None:
            logger.warning("docker stats is None")
            return

        exited_containers = set(
            filter(self.is_container_exited, stats.keys()))

        now = datetime.datetime.now()
        type1_zombies = self.update_zombie_count_type1(exited_containers, now)
        type2_zombies = self.update_zombie_count_type2(stats, now)
        return type1_zombies.union(type2_zombies)

    def collect_impl(self):
        """ read the latest docker stats from stats_info_ref, compute the
        zombie ids and publish them through zombie_ids_ref """
        # set it to None so if docker-stats hangs till next time we get,
        # we will get None
        # NOTE(review): the reset described above would have to happen inside
        # stats_info_ref.get(); confirm against its implementation.
        stats_info = self.stats_info_ref.get(datetime.datetime.now())
        all_zombies = self.update_zombie_count(stats_info)
        self.zombie_ids_ref.set(all_zombies, datetime.datetime.now())
registry=registry) freshmaker_event_failed_counter = Counter( 'freshmaker_event_failed', 'Number of events, which failed due to error(s)', registry=registry) freshmaker_event_skipped_counter = Counter( 'freshmaker_event_skipped', 'Number of events, for which no action was taken', registry=registry) freshmaker_event_canceled_counter = Counter( 'freshmaker_event_canceled', 'Number of events canceled during their handling', registry=registry) freshmaker_build_api_latency = Histogram('build_api_latency', 'BuildAPI latency', registry=registry) freshmaker_event_api_latency = Histogram('event_api_latency', 'EventAPI latency', registry=registry) def db_hook_event_listeners(target=None): # Service-specific import of db from freshmaker import db if not target: target = db.engine @event.listens_for(target, 'engine_connect') def receive_engine_connect(conn, branch):
class GpuCollector(Collector):
    """Collector that turns nvidia-smi output into GPU metrics.

    The latest GPU info is also published through ``gpu_info_ref``; zombie
    container ids are read from ``zombie_info_ref``.
    """

    cmd_histogram = Histogram(
        "cmd_nvidia_smi_latency_seconds",
        "Command call latency for nvidia-smi (seconds)",
        buckets=(1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0,
                 1024.0, float("inf")))
    cmd_timeout = 600

    def __init__(self, name, sleep_time, atomic_ref, iteration_counter,
                 gpu_info_ref, zombie_info_ref, mem_leak_thrashold):
        Collector.__init__(self, name, sleep_time, atomic_ref,
                           iteration_counter)
        self.gpu_info_ref = gpu_info_ref
        self.zombie_info_ref = zombie_info_ref
        self.mem_leak_thrashold = mem_leak_thrashold

    @staticmethod
    def get_container_id(pid):
        """ look up the container a process belongs to by reading
        /proc/<pid>/cgroup.

        return two values, the first one tells whether a container id was
        found, the second one is that id ("" when not found) """
        cgroup_path = "/proc/%d/cgroup" % (pid)
        if not os.path.isfile(cgroup_path):
            return False, ""

        with open(cgroup_path) as cgroup_file:
            content = cgroup_file.read()

        for raw_line in content.split("\n"):
            entry = raw_line.strip()
            # only lines mentioning "pids" are inspected
            if "pids" not in entry:
                continue

            if "/docker/" in entry:
                pieces = entry.split("/docker/")
                if len(pieces) == 2 and re.match(u"[0-9a-f]+", pieces[1]):
                    return True, pieces[1]
            elif "/kubepods/" in entry:
                pieces = entry.split("/kubepods/")
                if len(pieces) == 2 and re.match(u"pod[0-9a-f-]+", pieces[1]):
                    return True, pieces[1]
            else:
                logger.info("unknown format in pid cgroup %s", entry)

        return False, ""

    @staticmethod
    def convert_to_metrics(gpu_info, zombie_info, pid_to_cid_fn,
                           mem_leak_thrashold):
        """ build the metric families from gpu_info & zombie_info; kept as a
        static function so it is easy to unit test.

        gpu_info maps a key to a per-gpu info object, entries whose key is
        not all digits are skipped. zombie_info is a collection of zombie
        container ids or None. pid_to_cid_fn maps a pid to a tuple of
        (found, container id). Returns the list of metric families. """
        core_utils = gen_gpu_util_gauge()
        mem_utils = gen_gpu_mem_util_gauge()
        gpu_temp = gen_gpu_temperature_gauge()
        ecc_errors = gen_gpu_ecc_counter()
        retired_page = gen_gpu_retired_page_count()
        mem_leak = gen_gpu_memory_leak_counter()
        external_process = gen_gpu_used_by_external_process_counter()
        zombie_container = gen_gpu_used_by_zombie_container_counter()

        ecc_kinds = ("volatile_single", "volatile_double",
                     "aggregated_single", "aggregated_double")
        retirement_kinds = ("single", "double")

        pids_use_gpu = {}  # key is gpu minor, value is an array of pid

        for minor, info in gpu_info.items():
            if not minor.isdigit():
                continue  # ignore UUID

            uuid = info.uuid
            gpu_labels = [minor, uuid]

            core_utils.add_metric(list(gpu_labels), info.gpu_util)
            mem_utils.add_metric(list(gpu_labels), info.gpu_mem_util)
            if info.temperature is not None:
                gpu_temp.add_metric(list(gpu_labels), info.temperature)

            for kind in ecc_kinds:
                ecc_errors.add_metric([minor, uuid, kind],
                                      getattr(info.ecc_errors, kind))
            for kind in retirement_kinds:
                retired_page.add_metric(
                    [minor, uuid, kind],
                    getattr(info.ecc_errors, kind + "_retirement"))

            # memory in use above the threshold while no process holds the
            # gpu is reported as a leak
            if info.gpu_mem_util > mem_leak_thrashold and len(info.pids) == 0:
                # we found memory leak less than 20M can be mitigated automatically
                mem_leak.add_metric(list(gpu_labels), 1)

            if len(info.pids) > 0:
                pids_use_gpu[minor] = info.pids

        logger.debug("pids_use_gpu is %s, zombie_info is %s",
                     pids_use_gpu, zombie_info)

        if pids_use_gpu:
            if zombie_info is None:
                zombie_info = []

            for minor, pids in pids_use_gpu.items():
                for pid in pids:
                    found, z_id = pid_to_cid_fn(pid)
                    logger.debug("pid %s has found %s, z_id %s",
                                 pid, found, z_id)
                    if not found:
                        # the process does not belong to any container
                        external_process.add_metric([minor, str(pid)], 1)
                        continue

                    # NOTE: zombie_info is a set of short docker container id,
                    # but z_id is full id.
                    for zombie_id in zombie_info:
                        if z_id.startswith(zombie_id):
                            # found corresponding container
                            zombie_container.add_metric([minor, zombie_id], 1)

        if len(zombie_container.samples) > 0 or len(external_process.samples) > 0:
            logger.warning(
                "found gpu used by external %s, zombie container %s",
                external_process, zombie_container)

        return [core_utils, mem_utils, ecc_errors, mem_leak,
                external_process, zombie_container, gpu_temp, retired_page]

    def collect_impl(self):
        """ run nvidia-smi, publish the result through gpu_info_ref and
        return the metric families, or None when no gpu info is available """
        gpu_info = nvidia.nvidia_smi(GpuCollector.cmd_histogram,
                                     GpuCollector.cmd_timeout)
        logger.debug("get gpu_info %s", gpu_info)

        now = datetime.datetime.now()
        self.gpu_info_ref.set(gpu_info, now)
        zombie_info = self.zombie_info_ref.get(now)

        if gpu_info is None:
            return None

        return GpuCollector.convert_to_metrics(
            gpu_info, zombie_info, GpuCollector.get_container_id,
            self.mem_leak_thrashold)
class ContainerCollector(Collector):
    """Collector that emits per-container resource gauges.

    Each container reported by docker stats is either a job task
    (``task_*`` gauges, including GPU/NPU/DCGM device metrics), a known
    service (``service_*`` gauges), or ignored.
    """

    # Explicit buckets are supplied because the Prometheus client's default
    # buckets top out at 10s, while the 99th latency of docker stats may be
    # larger than 10s.
    stats_histogram = Histogram(
        "cmd_docker_stats_latency_seconds",
        "Command call latency for docker stats (seconds)",
        buckets=(1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0,
                 1024.0, float("inf")))
    stats_timeout = 20

    inspect_histogram = Histogram(
        "cmd_docker_inspect_latency_seconds",
        "Command call latency for docker inspect (seconds)")
    inspect_timeout = 2  # 99th latency is 0.042s

    iftop_histogram = Histogram(
        "cmd_iftop_latency_seconds",
        "Command call latency for iftop (seconds)")
    iftop_timeout = 10  # 99th latency is 7.4s

    lsof_histogram = Histogram(
        "cmd_lsof_latency_seconds",
        "Command call latency for lsof (seconds)")
    lsof_timeout = 2  # 99th latency is 0.5s

    # Container name prefixes of known services. Order matters:
    # infer_service_name() returns the first prefix that matches.
    pai_services = [
        "k8s_" + service for service in [
            "grafana",
            "prometheus",
            "alertmanager",
            "watchdog",
            "end-to-end-test",
            "yarn-frameworklauncher",
            "hadoop-jobhistory-service",
            "hadoop-name-node",
            "hadoop-node-manager",
            "hadoop-resource-manager",
            "hadoop-data-node",
            "zookeeper",
            "node-exporter",
            "job-exporter",
            "yarn-exporter",
            "nvidia-drivers",
            "docker-cleaner",
            # Below are DLTS services
            "nginx",
            "restfulapi",
            "weave",
            "weave-npc",
            "nvidia-device-plugin-ctr",
            "mysql",
            "jobmanager",
            "fluent-bit",
            "azure-blob-adapter",
            "nvidia-dcgm-exporter",
            "alert-manager",
            "reaper",
            "dashboard",
            "kubedns",
            "repairmanager",
            "redis",
        ]
    ]

    def __init__(self, name, sleep_time, atomic_ref, iteration_counter,
                 gpu_info_ref, stats_info_ref, interface, npu_info_ref,
                 dcgm_info_ref):
        Collector.__init__(self, name, sleep_time, atomic_ref,
                           iteration_counter)
        self.gpu_info_ref = gpu_info_ref
        self.npu_info_ref = npu_info_ref
        self.stats_info_ref = stats_info_ref
        self.dcgm_info_ref = dcgm_info_ref

        self.network_interface = network.try_to_get_right_interface(interface)
        logger.info(
            "found %s as potential network interface to listen network traffic",
            self.network_interface)

    # k8s will prepend "k8s_" to pod name. There will also be a container name
    # prepend with "k8s_POD_" which is a docker container used to construct
    # network & pid namespace for specific container. These container prepend
    # with "k8s_POD" consume nothing.

    def collect_impl(self):
        """ gather iftop and docker stats output, publish the docker stats
        through stats_info_ref and return the container gauges """
        all_conns = network.iftop(self.network_interface,
                                  ContainerCollector.iftop_histogram,
                                  ContainerCollector.iftop_timeout)

        stats_obj = docker_stats.stats(ContainerCollector.stats_histogram,
                                       ContainerCollector.stats_timeout)

        now = datetime.datetime.now()
        gpu_infos = self.gpu_info_ref.get(now)
        npu_infos = self.npu_info_ref.get(now)
        self.stats_info_ref.set(stats_obj, now)
        dcgm_infos = self.dcgm_info_ref.get(now)

        logger.debug("all_conns is %s", all_conns)
        logger.debug("gpu_info is %s", gpu_infos)
        logger.debug("stats_obj is %s", stats_obj)
        logger.debug("dcgm_infos is %s", dcgm_infos)

        return self.collect_container_metrics(stats_obj, gpu_infos, all_conns,
                                              npu_infos, dcgm_infos)

    @staticmethod
    def parse_from_labels(inspect_info, gpu_infos):
        """ extract device ids and metric labels from a docker inspect result.

        return a tuple of (gpu_ids, npu_ids, result_labels). Label values
        that are missing on inspect_info are replaced with "unknown". """
        gpu_ids = []
        npu_ids = []
        result_labels = {}

        result_labels["username"] = inspect_info.username or "unknown"
        result_labels["job_name"] = inspect_info.job_name or "unknown"
        result_labels["role_name"] = inspect_info.role_name or "unknown"
        result_labels["task_index"] = inspect_info.task_index or "unknown"
        result_labels["pod_name"] = inspect_info.pod_name or "unknown"
        result_labels["user_email"] = inspect_info.email or "unknown"
        result_labels["vc_name"] = inspect_info.vc_name or "unknown"

        if inspect_info.gpu_ids:
            raw_ids = inspect_info.gpu_ids.replace("\"", "").split(",")
            for gpu_id in raw_ids:
                # If the container was scheduled by yarn, we get its GPU usage
                # info from label GPU_ID, value of the label is minor_number,
                # and will be digits.
                # If the container was scheduled by kube launcher, we get its
                # GPU usage info from environment NVIDIA_VISIBLE_DEVICES, the
                # value is like GPU-dc0671b0-61a4-443e-f456-f8fa6359b788. The
                # mapping from uuid to minor_number is get via nvidia-smi, and
                # gpu_infos should have key of this uuid.
                if gpu_id.isdigit():
                    gpu_ids.append(gpu_id)
                elif gpu_id and gpu_infos is not None:
                    # id is in form of UUID like
                    if gpu_infos.get(gpu_id) is not None:
                        gpu_ids.append(gpu_infos[gpu_id].minor)
                    else:
                        logger.warning(
                            "gpu uuid %s can not be found in map %s",
                            gpu_id, gpu_infos)
                else:
                    logger.warning("unknown gpu id %s, gpu_infos is %s",
                                   gpu_id, gpu_infos)

        if inspect_info.npu_ids:
            raw_ids = inspect_info.npu_ids.replace("\"", "").split(",")
            # only numeric npu ids are kept
            npu_ids.extend(npu_id for npu_id in raw_ids if npu_id.isdigit())

        return gpu_ids, npu_ids, result_labels

    @classmethod
    def infer_service_name(cls, container_name):
        """ try to infer service name from container_name, if it's container
        not belongs to pai service, will return None """
        if container_name.startswith("k8s_POD_"):
            # this is empty container created by k8s for pod
            return None

        # TODO speed this up, since this is O(n^2)
        for service_name in cls.pai_services:
            if container_name.startswith(service_name):
                return service_name[4:]  # remove "k8s_" prefix
        return None

    def process_one_container(self, container_id, stats, gpu_infos, all_conns,
                              gauges, npu_infos, dcgm_infos):
        """ add the gauges of one container to gauges.

        stats is the docker stats entry of the container; the keys "name",
        "CPUPerc", "MemPerc", "MemUsage_Limit" ("usage"/"limit") and
        "BlockIO" ("in"/"out") are read from it. Nothing is added when the
        container cannot be inspected, is neither a job nor a known service,
        or is a job without a username. all_conns is accepted for interface
        compatibility and is not used here. """
        container_name = utils.walk_json_field_safe(stats, "name")
        pai_service_name = ContainerCollector.infer_service_name(
            container_name)

        inspect_info = docker_inspect.inspect(
            container_id,
            ContainerCollector.inspect_histogram,
            ContainerCollector.inspect_timeout)
        if inspect_info is None:
            return

        pid = inspect_info.pid
        job_name = inspect_info.job_name

        logger.debug("%s has inspect result %s, service_name %s",
                     container_name, inspect_info, pai_service_name)

        if job_name is None and pai_service_name is None:
            logger.debug("%s is ignored", container_name)
            return  # other container, maybe kubelet or api-server

        # get network consumption, if container is host network, we will treat
        # node network consumption as container consumption. If not, use data
        # from docker state.
        # This will result network consumption of service using host network
        # equals to node network consumption.
        is_host_network = inspect_info.is_host_network
        if is_host_network:
            net_in, net_out = network.get_network_consumption(
                self.network_interface)
        else:
            net_in, net_out = network.get_non_host_network_consumption(pid)

        if pai_service_name is None:
            gpu_ids, npu_ids, container_labels = \
                ContainerCollector.parse_from_labels(inspect_info, gpu_infos)
            logger.info("start to collect metric for jobId: %s",
                        container_labels["job_name"])

            if container_labels["username"] == "unknown":
                # logger.warn is a deprecated alias of logger.warning; the
                # argument is passed lazily instead of %-formatting eagerly.
                logger.warning("jobId: %s has none username,pass!",
                               container_labels["job_name"])
                return

            if gpu_infos:
                for gpu_id in gpu_ids:
                    if gpu_infos.get(gpu_id) is None:
                        continue

                    nvidia_gpu_status = gpu_infos[gpu_id]
                    uuid = nvidia_gpu_status.uuid
                    labels = copy.deepcopy(container_labels)
                    labels["minor_number"] = gpu_id
                    labels["device_type"] = inspect_info.gpu_type or "unknown"
                    labels["uuid"] = uuid
                    labels["device_str"] = "nvidia.com/gpu"

                    gauges.add_value("task_device_percent", labels,
                                     nvidia_gpu_status.gpu_util)
                    gauges.add_value("task_device_mem_percent", labels,
                                     nvidia_gpu_status.gpu_mem_util)

            if npu_infos:
                for npu_id in npu_ids:
                    if npu_infos.get(npu_id) is None:
                        continue

                    npu_status = npu_infos[npu_id]
                    labels = copy.deepcopy(container_labels)
                    labels["minor_number"] = npu_id
                    labels["device_type"] = inspect_info.gpu_type or "unknown"
                    labels["device_str"] = "npu.huawei.com/NPU"

                    # each npu device should have one unique string
                    labels["uuid"] = npu_id
                    if inspect_info.node_name:
                        labels["uuid"] = \
                            inspect_info.node_name + "_" + str(npu_id)

                    gauges.add_value("task_device_percent", labels,
                                     npu_status.npu_util)
                    gauges.add_value("task_device_mem_percent", labels,
                                     npu_status.npu_mem_util)

            if dcgm_infos:
                for gpu_id in gpu_ids:
                    if dcgm_infos.get(gpu_id) is None:
                        continue

                    dcgm_metric = dcgm_infos[gpu_id]  # will be type of DCGMMetrics
                    uuid = dcgm_metric.uuid
                    labels = copy.deepcopy(container_labels)
                    labels["minor_number"] = gpu_id
                    labels["uuid"] = uuid
                    gauges.add_dcgm_metric(dcgm_metric, labels)

            gauges.add_value("task_cpu_percent", container_labels,
                             stats["CPUPerc"])
            gauges.add_value("task_mem_usage_byte", container_labels,
                             stats["MemUsage_Limit"]["usage"])
            gauges.add_value("task_mem_limit_byte", container_labels,
                             stats["MemUsage_Limit"]["limit"])
            gauges.add_value("task_net_in_byte", container_labels, net_in)
            gauges.add_value("task_net_out_byte", container_labels, net_out)
            gauges.add_value("task_block_in_byte", container_labels,
                             stats["BlockIO"]["in"])
            gauges.add_value("task_block_out_byte", container_labels,
                             stats["BlockIO"]["out"])
            gauges.add_value("task_mem_usage_percent", container_labels,
                             stats["MemPerc"])
        else:
            labels = {"name": pai_service_name}
            gauges.add_value("service_cpu_percent", labels, stats["CPUPerc"])
            gauges.add_value("service_mem_usage_byte", labels,
                             stats["MemUsage_Limit"]["usage"])
            gauges.add_value("service_mem_limit_byte", labels,
                             stats["MemUsage_Limit"]["limit"])
            gauges.add_value("service_mem_usage_percent", labels,
                             stats["MemPerc"])
            gauges.add_value("service_net_in_byte", labels, net_in)
            gauges.add_value("service_net_out_byte", labels, net_out)
            gauges.add_value("service_block_in_byte", labels,
                             stats["BlockIO"]["in"])
            gauges.add_value("service_block_out_byte", labels,
                             stats["BlockIO"]["out"])

    def collect_container_metrics(self, stats_obj, gpu_infos, all_conns,
                                  npu_infos, dcgm_infos):
        """ process every container in stats_obj and return the gauges as an
        array, or None when stats_obj is None. A failure on one container is
        logged and does not stop the others from being processed. """
        if stats_obj is None:
            logger.warning("docker stats returns None")
            return None

        gauges = ResourceGauges()

        for container_id, stats in stats_obj.items():
            try:
                self.process_one_container(container_id, stats, gpu_infos,
                                           all_conns, gauges, npu_infos,
                                           dcgm_infos)
            except Exception:
                logger.exception(
                    "error when trying to process container %s with name %s",
                    container_id, utils.walk_json_field_safe(stats, "name"))

        return gauges.as_array()
class ContainerCollector(Collector):
    """Collector that emits per-container resource gauges.

    Each container reported by docker stats is either a job task
    (``task_*`` gauges), a known service (``service_*`` gauges), or ignored.
    """

    stats_histogram = Histogram(
        "cmd_docker_stats_latency_seconds",
        "Command call latency for docker stats (seconds)")
    stats_timeout = 20
    # 99th latency may larger than 10s,
    # Because prometheus's largest bucket for recording histogram is 10s,
    # we can not get value higher than 10s.

    inspect_histogram = Histogram(
        "cmd_docker_inspect_latency_seconds",
        "Command call latency for docker inspect (seconds)")
    inspect_timeout = 1  # 99th latency is 0.042s

    iftop_histogram = Histogram(
        "cmd_iftop_latency_seconds",
        "Command call latency for iftop (seconds)")
    iftop_timeout = 10  # 99th latency is 7.4s

    lsof_histogram = Histogram(
        "cmd_lsof_latency_seconds",
        "Command call latency for lsof (seconds)")
    lsof_timeout = 2  # 99th latency is 0.5s

    # Container name prefixes of known services, in matching order.
    pai_services = [
        "k8s_" + service for service in (
            "rest-server",
            "pylon",
            "webportal",
            "grafana",
            "prometheus",
            "alertmanager",
            "watchdog",
            "end-to-end-test",
            "yarn-frameworklauncher",
            "hadoop-jobhistory-service",
            "hadoop-name-node",
            "hadoop-node-manager",
            "hadoop-resource-manager",
            "hadoop-data-node",
            "zookeeper",
            "node-exporter",
            "job-exporter",
            "yarn-exporter",
            "nvidia-drivers",
            "docker-cleaner",
            # Below are DLTS services
            "nginx",
            "restfulapi",
            "weave",
            "weave-npc",
            "nvidia-device-plugin-ctr",
            "mysql",
            "jobmanager",
        )
    ]

    def __init__(self, name, sleep_time, atomic_ref, iteration_counter,
                 gpu_info_ref, stats_info_ref, interface):
        Collector.__init__(self, name, sleep_time, atomic_ref,
                           iteration_counter)
        self.gpu_info_ref = gpu_info_ref
        self.stats_info_ref = stats_info_ref

        self.network_interface = network.try_to_get_right_interface(interface)
        logger.info(
            "found %s as potential network interface to listen network traffic",
            self.network_interface)

    # k8s will prepend "k8s_" to pod name. There will also be a container name
    # prepend with "k8s_POD_" which is a docker container used to construct
    # network & pid namespace for specific container. These container prepend
    # with "k8s_POD" consume nothing.

    def collect_impl(self):
        """ gather iftop and docker stats output, publish the docker stats
        through stats_info_ref and return the container gauges """
        connections = network.iftop(self.network_interface,
                                    ContainerCollector.iftop_histogram,
                                    ContainerCollector.iftop_timeout)

        docker_stats_result = docker_stats.stats(
            ContainerCollector.stats_histogram,
            ContainerCollector.stats_timeout)

        timestamp = datetime.datetime.now()
        gpu_infos = self.gpu_info_ref.get(timestamp)
        self.stats_info_ref.set(docker_stats_result, timestamp)

        logger.debug("all_conns is %s", connections)
        logger.debug("gpu_info is %s", gpu_infos)
        logger.debug("stats_obj is %s", docker_stats_result)

        return self.collect_container_metrics(docker_stats_result, gpu_infos,
                                              connections)

    @staticmethod
    def parse_from_labels(inspect_info, gpu_infos):
        """ extract gpu ids and metric labels from a docker inspect result.

        return a tuple of (gpu_ids, result_labels). Label values that are
        missing on inspect_info are replaced with "unknown". """
        label_fields = ("username", "job_name", "role_name", "task_index",
                        "job_instance_id")
        result_labels = {
            field: getattr(inspect_info, field) or "unknown"
            for field in label_fields
        }

        gpu_ids = []
        if not inspect_info.gpu_ids:
            return gpu_ids, result_labels

        for raw_id in inspect_info.gpu_ids.replace("\"", "").split(","):
            # If the container was scheduled by yarn, we get its GPU usage
            # info from label GPU_ID, value of the label is minor_number, and
            # will be digits.
            # If the container was scheduled by kube launcher, we get its GPU
            # usage info from environment NVIDIA_VISIBLE_DEVICES, the value
            # is like GPU-dc0671b0-61a4-443e-f456-f8fa6359b788. The mapping
            # from uuid to minor_number is get via nvidia-smi, and gpu_infos
            # should have key of this uuid.
            if raw_id.isdigit():
                gpu_ids.append(raw_id)
                continue

            if not (raw_id and gpu_infos is not None):
                logger.warning("unknown gpu id %s, gpu_infos is %s",
                               raw_id, gpu_infos)
                continue

            # id is in form of UUID like
            if gpu_infos.get(raw_id) is None:
                logger.warning("gpu uuid %s can not be found in map %s",
                               raw_id, gpu_infos)
            else:
                gpu_ids.append(gpu_infos[raw_id].minor)

        return gpu_ids, result_labels

    @classmethod
    def infer_service_name(cls, container_name):
        """ try to infer service name from container_name, if it's container
        not belongs to pai service, will return None """
        if container_name.startswith("k8s_POD_"):
            # this is empty container created by k8s for pod
            return None

        # TODO speed this up, since this is O(n^2)
        for prefix in cls.pai_services:
            if container_name.startswith(prefix):
                return prefix[4:]  # remove "k8s_" prefix

        return None

    def process_one_container(self, container_id, stats, gpu_infos, all_conns,
                              gauges):
        """ add the gauges of one container to gauges.

        stats is the docker stats entry of the container; the keys "name",
        "CPUPerc", "MemPerc", "MemUsage_Limit" ("usage"/"limit") and
        "BlockIO" ("in"/"out") are read from it. Containers that are neither
        a job nor a known service are ignored. """
        container_name = utils.walk_json_field_safe(stats, "name")
        pai_service_name = ContainerCollector.infer_service_name(
            container_name)

        inspect_info = docker_inspect.inspect(
            container_id,
            ContainerCollector.inspect_histogram,
            ContainerCollector.inspect_timeout)

        pid = inspect_info.pid
        job_name = inspect_info.job_name

        logger.debug("%s has inspect result %s, service_name %s",
                     container_name, inspect_info, pai_service_name)

        if job_name is None and pai_service_name is None:
            logger.debug("%s is ignored", container_name)
            return  # other container, maybe kubelet or api-server

        # get network consumption, since all our services/jobs running in host
        # network, and network statistic from docker is not specific to that
        # container. We have to get network statistic by ourselves.
        lsof_result = network.lsof(pid,
                                   ContainerCollector.lsof_histogram,
                                   ContainerCollector.lsof_timeout)

        net_in, net_out = network.get_container_network_metrics(
            all_conns, lsof_result)

        if logger.isEnabledFor(logging.DEBUG):
            # the extra ps call is only made when debug logging is enabled
            debug_info = utils.exec_cmd(
                "ps -o cmd fp {0} | tail -n 1".format(pid),
                shell=True)

            logger.debug(
                "pid %s with cmd `%s` has lsof result %s, in %d, out %d",
                pid, debug_info.strip(), lsof_result, net_in, net_out)

        if pai_service_name is not None:
            service_labels = {"name": pai_service_name}
            memory = stats["MemUsage_Limit"]
            gauges.add_value("service_cpu_percent", service_labels,
                             stats["CPUPerc"])
            gauges.add_value("service_mem_usage_byte", service_labels,
                             memory["usage"])
            gauges.add_value("service_mem_limit_byte", service_labels,
                             stats["MemUsage_Limit"]["limit"])
            gauges.add_value("service_mem_usage_percent", service_labels,
                             stats["MemPerc"])
            gauges.add_value("service_net_in_byte", service_labels, net_in)
            gauges.add_value("service_net_out_byte", service_labels, net_out)
            gauges.add_value("service_block_in_byte", service_labels,
                             stats["BlockIO"]["in"])
            gauges.add_value("service_block_out_byte", service_labels,
                             stats["BlockIO"]["out"])
            return

        gpu_ids, container_labels = ContainerCollector.parse_from_labels(
            inspect_info, gpu_infos)

        if gpu_infos:
            for minor in gpu_ids:
                if gpu_infos.get(minor) is None:
                    continue

                gpu_status = gpu_infos[minor]
                gpu_labels = copy.deepcopy(container_labels)
                gpu_labels["minor_number"] = minor

                gauges.add_value("task_gpu_percent", gpu_labels,
                                 gpu_status.gpu_util)
                gauges.add_value("task_gpu_mem_percent", gpu_labels,
                                 gpu_status.gpu_mem_util)

        gauges.add_value("task_cpu_percent", container_labels,
                         stats["CPUPerc"])
        gauges.add_value("task_mem_usage_byte", container_labels,
                         stats["MemUsage_Limit"]["usage"])
        gauges.add_value("task_mem_limit_byte", container_labels,
                         stats["MemUsage_Limit"]["limit"])
        gauges.add_value("task_net_in_byte", container_labels, net_in)
        gauges.add_value("task_net_out_byte", container_labels, net_out)
        gauges.add_value("task_block_in_byte", container_labels,
                         stats["BlockIO"]["in"])
        gauges.add_value("task_block_out_byte", container_labels,
                         stats["BlockIO"]["out"])
        gauges.add_value("task_mem_usage_percent", container_labels,
                         stats["MemPerc"])

    def collect_container_metrics(self, stats_obj, gpu_infos, all_conns):
        """ process every container in stats_obj and return the gauges as an
        array, or None when stats_obj is None. A failure on one container is
        logged and does not stop the others from being processed. """
        if stats_obj is None:
            logger.warning("docker stats returns None")
            return None

        gauges = ResourceGauges()

        for container_id, container_stats in stats_obj.items():
            try:
                self.process_one_container(container_id, container_stats,
                                           gpu_infos, all_conns, gauges)
            except Exception:
                logger.exception(
                    "error when trying to process container %s with name %s",
                    container_id,
                    utils.walk_json_field_safe(container_stats, "name"))

        return gauges.as_array()
"""Prometheus metrics.""" from prometheus_client import CollectorRegistry, multiprocess from prometheus_client import Counter, Histogram, Gauge REGISTRY = CollectorRegistry() multiprocess.MultiProcessCollector(REGISTRY) TWEETS_COUNTER = Counter('tweets_counter', 'Global count of tweets', registry=REGISTRY) TWEETS_COUNTS_HISTOGRAM = Histogram('tweets_counts_histogram', 'Tweets per request histogram', registry=REGISTRY) TWEETS_AVERAGE_REQUEST_TIME = Gauge('tweets_average_fetch_time_gauge', 'Average time of fetching one tweet', ['param'], registry=REGISTRY) TWEETS_TIME_HISTOGRAM = Histogram('tweets_time_histogram', 'Histogram of time per tweet', registry=REGISTRY) TWEETS_AVERAGE_LENGHT = Gauge('tweets_average_length', 'Averge length of specific tweet', ['param'], registry=REGISTRY) def update_average_request_time(size, elapsed): """Parses script arguments."""
from prometheus_client import Counter, Histogram from django_prometheus.utils import Time, TimeSince, PowersOf import django if django.VERSION >= (1, 10, 0): from django.utils.deprecation import MiddlewareMixin else: MiddlewareMixin = object requests_total = Counter('django_http_requests_before_middlewares_total', 'Total count of requests before middlewares run.') responses_total = Counter('django_http_responses_before_middlewares_total', 'Total count of responses before middlewares run.') requests_latency_before = Histogram( 'django_http_requests_latency_including_middlewares_seconds', ('Histogram of requests processing time (including middleware ' 'processing time).')) requests_unknown_latency_before = Counter( 'django_http_requests_unknown_latency_including_middlewares_total', ('Count of requests for which the latency was unknown (when computing ' 'django_http_requests_latency_including_middlewares_seconds).')) class PrometheusBeforeMiddleware(MiddlewareMixin): """Monitoring middleware that should run before other middlewares.""" def process_request(self, request): requests_total.inc() request.prometheus_before_middleware_event = Time() def process_response(self, request, response): responses_total.inc()
from synapse.api.constants import EventTypes from synapse.api.room_versions import KNOWN_ROOM_VERSIONS, StateResolutionVersions from synapse.events.snapshot import EventContext from synapse.state import v1, v2 from synapse.util.async_helpers import Linearizer from synapse.util.caches import get_cache_factor_for from synapse.util.caches.expiringcache import ExpiringCache from synapse.util.logutils import log_function from synapse.util.metrics import Measure logger = logging.getLogger(__name__) # Metrics for number of state groups involved in a resolution. state_groups_histogram = Histogram( "synapse_state_number_state_groups_in_resolution", "Number of state groups used when performing a state resolution", buckets=(1, 2, 3, 5, 7, 10, 15, 20, 50, 100, 200, 500, "+Inf"), ) KeyStateTuple = namedtuple("KeyStateTuple", ("context", "type", "state_key")) SIZE_OF_CACHE = 100000 * get_cache_factor_for("state_cache") EVICTION_TIMEOUT_SECONDS = 60 * 60 _NEXT_STATE_ID = 1 POWER_KEY = (EventTypes.PowerLevels, "") def _gen_state_id(): global _NEXT_STATE_ID
import logging from timeit import default_timer from flask import request from prometheus_client import Counter, Histogram, Info logger = logging.getLogger(__name__) APP_NAME = "phippy_api" APP_INFO = Info("api_version", "API Version") ERRORS_COUNT = Counter("errors_total", "Number of errors", ["app", "verb", "endpoint", "status"]) REQUESTS_COUNT = Counter("request_total", "Request duration in seconds", ["app", "verb", "endpoint", "status"]) REQUEST_DURATION_HISTOGRAM = Histogram( "request_duration_seconds", "Request duration in seconds", ["app", "verb", "endpoint", "status"] ) def register_metrics(app, app_version=None, app_config=None): """Register metrics middlewares""" app.before_request(before_request) app.after_request(after_request) def record_error_metric(status=None): """Record errors""" ERRORS_COUNT.labels( APP_NAME, request.method, request.endpoint,
# Monotonically increasing call counter used to rotate through the tokens.
GITHUB_ACCESS_TOKENS_SELECTOR = 0

CODE_INVENTORY = Gauge('code_inventory',
                       'Amount of unmerged work in a repository.',
                       ['owner', 'repo', 'metric'])
FEATURES = Gauge(
    'features',
    'Counts of features in org repositories, based on number of manifest files.',
    ['owner', 'repo'])
CODE_INVENTORY_AGE = Histogram('code_inventory_age',
                               'Code inventory age in days.',
                               ['owner', 'repo'],
                               buckets=[
                                   1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16,
                                   18, 20, 25, 30, 35, 40, 50, 60, 70, 80, 90,
                                   100, 150, 200, 250, 300, 365, float("inf")
                               ])
REPO_SCRAPE_TIMES = {}


def get_access_token():
    """Return the next GitHub access token in round-robin order.

    Each call advances the module-level ``GITHUB_ACCESS_TOKENS_SELECTOR``
    counter. The rotation spans every entry of ``GITHUB_ACCESS_TOKENS``
    rather than a hard-coded two, so adding or removing a token needs no
    code change here.
    """
    global GITHUB_ACCESS_TOKENS_SELECTOR
    slot = GITHUB_ACCESS_TOKENS_SELECTOR % len(GITHUB_ACCESS_TOKENS)
    token = GITHUB_ACCESS_TOKENS[slot]
    GITHUB_ACCESS_TOKENS_SELECTOR += 1
    return token
from prometheus_client import start_http_server, Summary, Histogram, Counter, Gauge
import random
import time

# Metrics published by this demo. Only the summary and the histogram are
# attached to the handler below; the counter and the gauge are declared but
# never updated by this script.
request_time = Summary(
    name='request_processing_seconds',
    documentation='Time spent processing request',
)
histogram = Histogram(
    name='request_latency_seconds',
    documentation='Request latency',
)
counter = Counter(
    name='http_requests_total',
    documentation='Total Request Count',
)
in_progress = Gauge(
    name='requests_in_progress_total',
    documentation='Requests in progress',
)


# Both decorators time every call of the wrapped function.
@request_time.time()
@histogram.time()
def process_request(t):
    """Simulate work by sleeping ``t`` seconds and then two more seconds."""
    for delay in (t, 2):
        time.sleep(delay)


if __name__ == '__main__':
    # Serve the metrics over HTTP on port 4000.
    start_http_server(4000)
    # Keep generating requests with a random delay below one second.
    while True:
        pause = random.random()
        process_request(pause)
import time

from flask import request
from prometheus_client import Counter, Histogram, Info

#
# Metrics registration
#

METRICS_REQUEST_LATENCY = Histogram(
    name="app_request_latency_seconds",
    documentation="Application Request Latency",
    labelnames=["method", "endpoint"],
)

METRICS_REQUEST_COUNT = Counter(
    name="app_request_count",
    documentation="Application Request Count",
    labelnames=["method", "endpoint", "http_status"],
)

METRICS_INFO = Info(
    name="app_version",
    documentation="Application Version",
)

#
# Request callbacks
#


def before_request():
    """Stamp the current Flask request with the time at which it started."""
    started_at = time.time()
    request._prometheus_metrics_request_start_time = started_at
import pyshark import time import os from prometheus_client import Counter, Gauge, Histogram, start_http_server # Global variables HTTP_REQUESTS = [] # Number of http requests currently being processed. http_inprogress_requests = Gauge('http_inprogress_requests', '<description/>') http_request_latency = Histogram('http_request_latency_ms', '<description/>') http_counter = Counter('http_request_total', '<description/>', ['method', 'uri', 'response_code', 'response_time']) #, # Main def main(): #Start prometheus exporter. start_http_server(12301) #Setup of pyshark # TODO: move interface and ip to environment variables... capture = pyshark.LiveCapture( interface='eth0', bpf_filter='host ' + os.environ['ROYALNETM_IP'] + ' and not port 12301') #, display_filter='http') capture.set_debug() capture for packet in capture.sniff_continuously():
# Bucket boundaries shared by the forward-extremity histograms below; the
# stale variant additionally starts at zero.
_EXTREMITY_BUCKETS = (1, 2, 3, 5, 7, 10, 15, 20, 50, 100, 200, 500, "+Inf")

# The number of times we are recalculating state when there is only a
# single forward extremity.
state_delta_single_event_counter = Counter(
    name="synapse_storage_events_state_delta_single_event",
    documentation="",
)

# The number of times we are recalculating state when we could have
# reasonably calculated the delta when we calculated the state for an event
# we were persisting.
state_delta_reuse_delta_counter = Counter(
    name="synapse_storage_events_state_delta_reuse_delta",
    documentation="",
)

# The number of forward extremities for each new event.
forward_extremities_counter = Histogram(
    name="synapse_storage_events_forward_extremities_persisted",
    documentation="Number of forward extremities for each new event",
    buckets=_EXTREMITY_BUCKETS,
)

# The number of stale forward extremities for each new event. Stale
# extremities are those that were in the previous set of extremities as well
# as the new one.
stale_forward_extremities_counter = Histogram(
    name="synapse_storage_events_stale_forward_extremities_persisted",
    documentation="Number of unchanged forward extremities for each new event",
    buckets=(0,) + _EXTREMITY_BUCKETS,
)

# How often state resolution was needed to work out the new current state.
state_resolutions_during_persistence = Counter(
    name="synapse_storage_events_state_resolutions_during_persistence",
    documentation="Number of times we had to do state res to calculate new current state",
)
from tornado.concurrent import chain_future, Future from tornado import gen, web from tornado.queues import Queue from tornado.iostream import StreamClosedError from tornado.ioloop import IOLoop from tornado.log import app_log from prometheus_client import Histogram, Gauge from .base import BaseHandler from .build import Build, FakeBuild BUCKETS = [ 2, 5, 10, 15, 20, 25, 30, 60, 120, 240, 480, 960, 1920, float("inf") ] BUILD_TIME = Histogram('binderhub_build_time_seconds', 'Histogram of build times', ['status', 'provider', 'repo'], buckets=BUCKETS) LAUNCH_TIME = Histogram('binderhub_launch_time_seconds', 'Histogram of launch times', ['status', 'provider', 'repo', 'retries'], buckets=BUCKETS) BUILDS_INPROGRESS = Gauge('binderhub_inprogress_builds', 'Builds currently in progress') LAUNCHES_INPROGRESS = Gauge('binderhub_inprogress_launches', 'Launches currently in progress') class BuildHandler(BaseHandler): """A handler for working with GitHub.""" # emit keepalives every 25 seconds to avoid idle connections being closed KEEPALIVE_INTERVAL = 25
from flask import Response, Flask, request import prometheus_client from prometheus_client.core import CollectorRegistry from prometheus_client import Summary, Counter, Histogram, Gauge import time app = Flask(__name__) _INF = float("inf") graphs = {} graphs['c'] = Counter('python_request_operations_total', 'The total number of processed requests') graphs['h'] = Histogram('python_request_duration_seconds', 'Histogram for the duration in seconds.', buckets=(1, 2, 5, 6, 10, _INF)) @app.route("/") def hello(): start = time.time() graphs['c'].inc() time.sleep(0.600) end = time.time() graphs['h'].observe(end - start) return "Hello World!" @app.route("/metrics") def requests_count():
def set_histogram(name, *args, **kwargs):
    """Build a ``Histogram`` called ``name`` and store it in ``metrics[name]``.

    Any further positional and keyword arguments are forwarded unchanged to
    the ``Histogram`` constructor.
    """
    histogram = Histogram(name, *args, **kwargs)
    metrics[name] = histogram
# gc_unreachable = Gauge("python_gc_unreachable_total", "Unreachable GC objects", ["gen"]) gc_time = Histogram( "python_gc_time", "Time taken to GC (sec)", ["gen"], buckets=[ 0.0025, 0.005, 0.01, 0.025, 0.05, 0.10, 0.25, 0.50, 1.00, 2.50, 5.00, 7.50, 15.00, 30.00, 45.00, 60.00, ], ) class GCCounts: def collect(self):
class BlockProcessor:
    """Process blocks and update the DB state to match.

    Employ a prefetcher to prefetch blocks in batches for processing.
    Coordinate backing up in case of chain reorganisations.

    ``height``, ``tip`` and ``tx_count`` are not set by ``__init__``; they
    are loaded from the DB by ``_first_open_dbs``, which is awaited from
    ``fetch_and_process_blocks``.
    """

    block_count_metric = Gauge(
        "block_count", "Number of processed blocks", namespace=NAMESPACE)
    block_update_time_metric = Histogram(
        "block_time", "Block update times", namespace=NAMESPACE,
        buckets=HISTOGRAM_BUCKETS)
    reorg_count_metric = Gauge(
        "reorg_count", "Number of reorgs", namespace=NAMESPACE)

    def __init__(self, env, db, daemon, notifications):
        """Store the collaborators and create empty in-memory caches."""
        self.env = env
        self.db = db
        self.daemon = daemon
        self.notifications = notifications

        self.coin = env.coin
        self.blocks_event = asyncio.Event()
        self.prefetcher = Prefetcher(daemon, env.coin, self.blocks_event)
        self.logger = class_logger(__name__, self.__class__.__name__)
        # A single worker thread: blocking DB work is serialised on it.
        self.executor = ThreadPoolExecutor(1)

        # Meta
        self.next_cache_check = 0
        self.touched = set()
        self.reorg_count = 0

        # Caches of unflushed items.
        self.headers = []
        self.block_hashes = []
        self.block_txs = []
        self.undo_infos = []

        # UTXO cache
        self.utxo_cache = {}
        self.db_deletes = []

        # If the lock is successfully acquired, in-memory chain state
        # is consistent with self.height
        self.state_lock = asyncio.Lock()

        self.search_cache = {}
        self.history_cache = {}

    async def run_in_thread_with_lock(self, func, *args):
        """Run ``func(*args)`` on the executor while holding ``state_lock``.

        Returns whatever ``func`` returns.
        """
        # Run in a thread to prevent blocking.  Shielded so that
        # cancellations from shutdown don't lose work - when the task
        # completes the data will be flushed and then we shut down.
        # Take the state lock to be certain in-memory state is
        # consistent and not being updated elsewhere.
        async def run_in_thread_locked():
            async with self.state_lock:
                return await asyncio.get_event_loop().run_in_executor(
                    self.executor, func, *args)
        return await asyncio.shield(run_in_thread_locked())

    async def check_and_advance_blocks(self, raw_blocks):
        """Process the list of raw blocks passed.  Detects and handles
        reorgs.
        """
        if not raw_blocks:
            return
        first = self.height + 1
        blocks = [
            self.coin.block(raw_block, first + n)
            for n, raw_block in enumerate(raw_blocks)
        ]
        headers = [block.header for block in blocks]
        hprevs = [self.coin.header_prevhash(h) for h in headers]
        chain = [self.tip] + [self.coin.header_hash(h) for h in headers[:-1]]

        if hprevs == chain:
            # The batch connects to our tip and is internally consistent.
            start = time.perf_counter()
            await self.run_in_thread_with_lock(self.advance_blocks, blocks)
            for cache in self.search_cache.values():
                cache.clear()
            self.history_cache.clear()
            self.notifications.notified_mempool_txs.clear()
            await self._maybe_flush()
            processed_time = time.perf_counter() - start
            self.block_count_metric.set(self.height)
            self.block_update_time_metric.observe(processed_time)
            if not self.db.first_sync:
                s = '' if len(blocks) == 1 else 's'
                self.logger.info(
                    f'processed {len(blocks):,d} block{s} '
                    f'in {processed_time:.1f}s')
            if self._caught_up_event.is_set():
                await self.notifications.on_block(self.touched, self.height)
            self.touched = set()
        elif hprevs[0] != chain[0]:
            # The first new block does not build on our tip.
            await self.reorg_chain()
        else:
            # It is probably possible but extremely rare that what
            # bitcoind returns doesn't form a chain because it
            # reorg-ed the chain as it was processing the batched
            # block hash requests.  Should this happen it's simplest
            # just to reset the prefetcher and try again.
            self.logger.warning('daemon blocks do not form a chain; '
                                'resetting the prefetcher')
            await self.prefetcher.reset_height(self.height)

    async def reorg_chain(self, count: Optional[int] = None):
        """Handle a chain reorganisation.

        Count is the number of blocks to simulate a reorg, or None for
        a real reorg."""
        if count is None:
            self.logger.info('chain reorg detected')
        else:
            self.logger.info(f'faking a reorg of {count:,d} blocks')
        await self.flush(True)

        async def get_raw_blocks(last_height, hex_hashes):
            # Prefer the raw blocks stored locally; fall back to the daemon
            # when a block file is missing.
            heights = range(last_height, last_height - len(hex_hashes), -1)
            try:
                blocks = [
                    await self.db.read_raw_block(height) for height in heights
                ]
                self.logger.info(f'read {len(blocks)} blocks from disk')
                return blocks
            except FileNotFoundError:
                return await self.daemon.raw_blocks(hex_hashes)

        def flush_backup():
            # self.touched can include other addresses which is
            # harmless, but remove None.
            self.touched.discard(None)
            self.db.flush_backup(self.flush_data(), self.touched)

        _start, last, hashes = await self.reorg_hashes(count)
        # Reverse and convert to hex strings.
        hashes = [hash_to_hex_str(block_hash)
                  for block_hash in reversed(hashes)]
        for hex_hashes in chunks(hashes, 50):
            raw_blocks = await get_raw_blocks(last, hex_hashes)
            await self.run_in_thread_with_lock(self.backup_blocks, raw_blocks)
            await self.run_in_thread_with_lock(flush_backup)
            last -= len(raw_blocks)
        await self.run_in_thread_with_lock(
            self.db.sql.delete_claims_above_height, self.height)
        await self.prefetcher.reset_height(self.height)
        self.reorg_count_metric.inc()

    async def reorg_hashes(self, count):
        """Return a triple (start, last, hashes) of blocks to back up during a
        reorg.

        The hashes are returned in order of increasing height.  Start
        is the height of the first hash, last of the last.
        """
        start, count = await self.calc_reorg_range(count)
        last = start + count - 1
        s = '' if count == 1 else 's'
        self.logger.info(f'chain was reorganised replacing {count:,d} '
                         f'block{s} at heights {start:,d}-{last:,d}')

        return start, last, await self.db.fs_block_hashes(start, count)

    async def calc_reorg_range(self, count: Optional[int]):
        """Calculate the reorg range.

        Returns ``(start, count)``.  When ``count`` is None the range is
        found by comparing our stored block hashes with the daemon's;
        otherwise the last ``count`` blocks below ``self.height`` are used.
        """

        def diff_pos(hashes1, hashes2):
            """Returns the index of the first difference in the hash lists.
            If both lists match returns their length."""
            for n, (hash1, hash2) in enumerate(zip(hashes1, hashes2)):
                if hash1 != hash2:
                    return n
            # Use the argument rather than a variable captured from the
            # enclosing scope; the two always have the same length here.
            return len(hashes1)

        if count is None:
            # A real reorg
            start = self.height - 1
            count = 1
            while start > 0:
                hashes = await self.db.fs_block_hashes(start, count)
                hex_hashes = [hash_to_hex_str(block_hash)
                              for block_hash in hashes]
                d_hex_hashes = await self.daemon.block_hex_hashes(start, count)
                n = diff_pos(hex_hashes, d_hex_hashes)
                if n > 0:
                    start += n
                    break
                # No common block yet: step further back, doubling the window.
                count = min(count * 2, start)
                start -= count

            count = (self.height - start) + 1
        else:
            start = (self.height - count) + 1

        return start, count

    def estimate_txs_remaining(self):
        """Return a rough estimate of the transactions still to process."""
        # Try to estimate how many txs there are to go
        daemon_height = self.daemon.cached_height()
        coin = self.coin
        tail_count = daemon_height - max(self.height, coin.TX_COUNT_HEIGHT)
        # Damp the initial enthusiasm
        realism = max(2.0 - 0.9 * self.height / coin.TX_COUNT_HEIGHT, 1.0)
        return (tail_count * coin.TX_PER_BLOCK +
                max(coin.TX_COUNT - self.tx_count, 0)) * realism

    # - Flushing
    def flush_data(self):
        """The data for a flush.  The lock must be taken."""
        assert self.state_lock.locked()
        return FlushData(self.height, self.tx_count, self.headers,
                         self.block_hashes, self.block_txs, self.undo_infos,
                         self.utxo_cache, self.db_deletes, self.tip)

    async def flush(self, flush_utxos):
        """Flush the unflushed in-memory state to the DB under the lock."""
        def flush_dbs():
            self.db.flush_dbs(self.flush_data(), flush_utxos,
                              self.estimate_txs_remaining)
        await self.run_in_thread_with_lock(flush_dbs)

    async def _maybe_flush(self):
        """Flush when caught up, otherwise at most once every 30 seconds."""
        # If caught up, flush everything as client queries are
        # performed on the DB.
        if self._caught_up_event.is_set():
            await self.flush(True)
        elif time.perf_counter() > self.next_cache_check:
            await self.flush(True)
            self.next_cache_check = time.perf_counter() + 30

    def check_cache_size(self):
        """Log the estimated cache sizes and say whether a flush is due.

        Returns None when the caches are within their limits.  Otherwise
        returns a bool: True when the UTXO share has reached 80% of
        ``env.cache_MB``, False when only the other limits were exceeded.
        This method does not flush anything itself.
        """
        # Good average estimates based on traversal of subobjects and
        # requesting size from Python (see deep_getsizeof).
        one_MB = 1000 * 1000
        utxo_cache_size = len(self.utxo_cache) * 205
        db_deletes_size = len(self.db_deletes) * 57
        hist_cache_size = self.db.history.unflushed_memsize()
        # Roughly ntxs * 32 + nblocks * 42
        tx_hash_size = ((self.tx_count - self.db.fs_tx_count) * 32
                        + (self.height - self.db.fs_height) * 42)
        utxo_MB = (db_deletes_size + utxo_cache_size) // one_MB
        hist_MB = (hist_cache_size + tx_hash_size) // one_MB

        self.logger.info(
            f'our height: {self.height:,d} daemon: '
            f'{self.daemon.cached_height():,d} '
            f'UTXOs {utxo_MB:,d}MB hist {hist_MB:,d}MB')

        # Flush history if it takes up over 20% of cache memory.
        # Flush UTXOs once they take up 80% of cache memory.
        cache_MB = self.env.cache_MB
        if utxo_MB + hist_MB >= cache_MB or hist_MB >= cache_MB // 5:
            return utxo_MB >= cache_MB * 4 // 5
        return None

    def advance_blocks(self, blocks):
        """Synchronously advance the blocks.

        It is already verified they correctly connect onto our tip.
        """
        min_height = self.db.min_undo_height(self.daemon.cached_height())
        height = self.height

        for block in blocks:
            height += 1
            undo_info = self.advance_txs(
                height, block.transactions,
                self.coin.electrum_header(block.header, height),
                self.coin.header_hash(block.header))
            # Undo info and raw blocks are only kept from min_height upwards.
            if height >= min_height:
                self.undo_infos.append((undo_info, height))
                self.db.write_raw_block(block.raw, height)

        headers = [block.header for block in blocks]
        self.height = height
        self.headers.extend(headers)
        self.tip = self.coin.header_hash(headers[-1])

    def advance_txs(self, height, txs, header, block_hash):
        """Apply one block's transactions to the in-memory state.

        ``txs`` is a sequence of ``(tx, tx_hash)`` pairs.  Returns the undo
        information: the cache values of every UTXO spent, in spend order.
        ``height`` and ``header`` are accepted but not used here.
        """
        self.block_hashes.append(block_hash)
        self.block_txs.append(
            (b''.join(tx_hash for tx, tx_hash in txs),
             [tx.raw for tx, _ in txs]))

        undo_info = []
        tx_num = self.tx_count
        hashXs_by_tx = []

        # Use local vars for speed in the loops
        put_utxo = self.utxo_cache.__setitem__
        spend_utxo = self.spend_utxo
        undo_info_append = undo_info.append
        update_touched = self.touched.update
        append_hashX_by_tx = hashXs_by_tx.append
        hashX_from_script = self.coin.hashX_from_script

        for tx, tx_hash in txs:
            hashXs = []
            append_hashX = hashXs.append
            tx_numb = pack('<I', tx_num)

            # Spend the inputs
            for txin in tx.inputs:
                if txin.is_generation():
                    continue
                cache_value = spend_utxo(txin.prev_hash, txin.prev_idx)
                undo_info_append(cache_value)
                # Strip the trailing 12 bytes (tx_num + value) to get hashX.
                append_hashX(cache_value[:-12])

            # Add the new UTXOs
            for idx, txout in enumerate(tx.outputs):
                # Get the hashX.  Ignore unspendable outputs
                hashX = hashX_from_script(txout.pk_script)
                if hashX:
                    append_hashX(hashX)
                    put_utxo(tx_hash + pack('<H', idx),
                             hashX + tx_numb + pack('<Q', txout.value))

            append_hashX_by_tx(hashXs)
            update_touched(hashXs)
            self.db.total_transactions.append(tx_hash)
            tx_num += 1

        self.db.history.add_unflushed(hashXs_by_tx, self.tx_count)

        self.tx_count = tx_num
        self.db.tx_counts.append(tx_num)

        return undo_info

    def backup_blocks(self, raw_blocks):
        """Backup the raw blocks and flush.

        The blocks should be in order of decreasing height, starting at
        self.height.  A flush is performed once the blocks are backed up.

        Raises ChainError if a block is not the current tip.
        """
        self.db.assert_flushed(self.flush_data())
        assert self.height >= len(raw_blocks)

        coin = self.coin
        for raw_block in raw_blocks:
            # Check and update self.tip
            block = coin.block(raw_block, self.height)
            header_hash = coin.header_hash(block.header)
            if header_hash != self.tip:
                raise ChainError(
                    f'backup block {hash_to_hex_str(header_hash)} '
                    f'not tip {hash_to_hex_str(self.tip)} '
                    f'at height {self.height:,d}')
            self.tip = coin.header_prevhash(block.header)
            self.backup_txs(block.transactions)
            self.height -= 1
            self.db.tx_counts.pop()

        self.logger.info(f'backed up to height {self.height:,d}')

    def backup_txs(self, txs):
        """Undo the transactions of the block at ``self.height``.

        Raises ChainError if no undo information is stored for that height.
        """
        # Prevout values, in order down the block (coinbase first if present)
        # undo_info is in reverse block order
        undo_info = self.db.read_undo_info(self.height)
        if undo_info is None:
            raise ChainError(
                f'no undo information found for height {self.height:,d}')
        n = len(undo_info)

        # Use local vars for speed in the loops
        s_pack = pack
        undo_entry_len = 12 + HASHX_LEN

        for tx, tx_hash in reversed(txs):
            self.db.total_transactions.pop()
            for idx, txout in enumerate(tx.outputs):
                # Spend the TX outputs.  Be careful with unspendable
                # outputs - we didn't save those in the first place.
                hashX = self.coin.hashX_from_script(txout.pk_script)
                if hashX:
                    cache_value = self.spend_utxo(tx_hash, idx)
                    self.touched.add(cache_value[:-12])

            # Restore the inputs
            for txin in reversed(tx.inputs):
                if txin.is_generation():
                    continue
                n -= undo_entry_len
                undo_item = undo_info[n:n + undo_entry_len]
                self.utxo_cache[txin.prev_hash +
                                s_pack('<H', txin.prev_idx)] = undo_item
                self.touched.add(undo_item[:-12])

        # Every undo entry must have been consumed exactly once.
        assert n == 0
        self.tx_count -= len(txs)

    # An in-memory UTXO cache, representing all changes to UTXO state
    # since the last DB flush.
    #
    # We want to store millions of these in memory for optimal
    # performance during initial sync, because then it is possible to
    # spend UTXOs without ever going to the database (other than as an
    # entry in the address history, and there is only one such entry per
    # TX not per UTXO).  So store them in a Python dictionary with
    # binary keys and values.
    #
    #   Key:    TX_HASH + TX_IDX           (32 + 2 = 34 bytes)
    #   Value:  HASHX + TX_NUM + VALUE     (11 + 4 + 8 = 23 bytes)
    #
    # That's 57 bytes of raw data in-memory.  Python dictionary overhead
    # means each entry actually uses about 205 bytes of memory.  So
    # almost 5 million UTXOs can fit in 1GB of RAM.  There are
    # approximately 42 million UTXOs on bitcoin mainnet at height
    # 433,000.
    #
    # Semantics:
    #
    #   add:   Add it to the cache dictionary.
    #
    #   spend: Remove it if in the cache dictionary.  Otherwise it's
    #          been flushed to the DB.  Each UTXO is responsible for two
    #          entries in the DB.  Mark them for deletion in the next
    #          cache flush.
    #
    # The UTXO database format has to be able to do two things efficiently:
    #
    #   1.  Given an address be able to list its UTXOs and their values
    #       so its balance can be efficiently computed.
    #
    #   2.  When processing transactions, for each prevout spent - a (tx_hash,
    #       idx) pair - we have to be able to remove it from the DB.  To send
    #       notifications to clients we also need to know any address it paid
    #       to.
    #
    # To this end we maintain two "tables", one for each point above:
    #
    #   1.  Key: b'u' + address_hashX + tx_idx + tx_num
    #       Value: the UTXO value as a 64-bit unsigned integer
    #
    #   2.  Key: b'h' + compressed_tx_hash + tx_idx + tx_num
    #       Value: hashX
    #
    # The compressed tx hash is just the first few bytes of the hash of
    # the tx in which the UTXO was created.  As this is not unique there
    # will be potential collisions so tx_num is also in the key.  When
    # looking up a UTXO the prefix space of the compressed hash needs to
    # be searched and resolved if necessary with the tx_num.  The
    # collision rate is low (<0.1%).

    def spend_utxo(self, tx_hash, tx_idx):
        """Spend a UTXO and return its packed value
        (hashX + tx_num + value).

        If the UTXO is not in the cache it must be on disk.  We store
        all UTXOs so not finding one indicates a logic error or DB
        corruption.

        Raises ChainError if the UTXO cannot be found.
        """
        # Fast track is it being in the cache
        idx_packed = pack('<H', tx_idx)
        cache_value = self.utxo_cache.pop(tx_hash + idx_packed, None)
        if cache_value:
            return cache_value

        # Spend it from the DB.

        # Key: b'h' + compressed_tx_hash + tx_idx + tx_num
        # Value: hashX
        prefix = b'h' + tx_hash[:4] + idx_packed
        candidates = dict(self.db.utxo_db.iterator(prefix=prefix))
        for hdb_key, hashX in candidates.items():
            tx_num_packed = hdb_key[-4:]

            if len(candidates) > 1:
                # Several UTXOs share the compressed hash: resolve by tx_num.
                tx_num, = unpack('<I', tx_num_packed)
                found_hash, _height = self.db.fs_tx_hash(tx_num)
                if found_hash != tx_hash:
                    assert found_hash is not None  # Should always be found
                    continue

            # Key: b'u' + address_hashX + tx_idx + tx_num
            # Value: the UTXO value as a 64-bit unsigned integer
            udb_key = b'u' + hashX + hdb_key[-6:]
            utxo_value_packed = self.db.utxo_db.get(udb_key)
            if utxo_value_packed:
                # Remove both entries for this UTXO
                self.db_deletes.append(hdb_key)
                self.db_deletes.append(udb_key)
                return hashX + tx_num_packed + utxo_value_packed

        raise ChainError(
            f'UTXO {hash_to_hex_str(tx_hash)} / {tx_idx:,d} '
            f'not found in "h" table')

    async def _process_prefetched_blocks(self):
        """Loop forever processing blocks as they arrive."""
        while True:
            if self.height == self.daemon.cached_height():
                if not self._caught_up_event.is_set():
                    await self._first_caught_up()
                    self._caught_up_event.set()
            await self.blocks_event.wait()
            self.blocks_event.clear()
            if self.reorg_count:
                # A fake reorg was queued by force_chain_reorg().
                await self.reorg_chain(self.reorg_count)
                self.reorg_count = 0
            else:
                blocks = self.prefetcher.get_prefetched_blocks()
                await self.check_and_advance_blocks(blocks)

    async def _first_caught_up(self):
        """Flush and reopen the DB for serving on first catching up."""
        self.logger.info(f'caught up to height {self.height}')
        # Flush everything but with first_sync->False state.
        first_sync = self.db.first_sync
        self.db.first_sync = False
        await self.flush(True)
        if first_sync:
            self.logger.info(f'{lbry.__version__} synced to '
                             f'height {self.height:,d}')
        # Reopen for serving
        await self.db.open_for_serving()

    async def _first_open_dbs(self):
        """Open the DB for syncing and load height, tip and tx count."""
        await self.db.open_for_sync()
        self.height = self.db.db_height
        self.tip = self.db.db_tip
        self.tx_count = self.db.db_tx_count

    # --- External API

    async def fetch_and_process_blocks(self, caught_up_event):
        """Fetch, process and index blocks from the daemon.

        Sets caught_up_event when first caught up.  Flushes to disk
        and shuts down cleanly if cancelled.

        This is mainly because if, during initial sync ElectrumX is
        asked to shut down when a large number of blocks have been
        processed but not written to disk, it should write those to
        disk before exiting, as otherwise a significant amount of work
        could be lost.
        """
        self._caught_up_event = caught_up_event
        try:
            await self._first_open_dbs()
            # asyncio.wait() no longer accepts bare coroutines on newer
            # Python versions, so schedule them as tasks explicitly.
            await asyncio.wait([
                asyncio.ensure_future(self.prefetcher.main_loop(self.height)),
                asyncio.ensure_future(self._process_prefetched_blocks()),
            ])
        except asyncio.CancelledError:
            raise
        except Exception:
            self.logger.exception("Block processing failed!")
            raise
        finally:
            # Shut down block processing
            self.logger.info('flushing to DB for a clean shutdown...')
            await self.flush(True)
            self.db.close()
            self.executor.shutdown(wait=True)

    def force_chain_reorg(self, count):
        """Force a reorg of the given number of blocks.

        Returns True if a reorg is queued, false if not caught up.
        """
        if self._caught_up_event.is_set():
            self.reorg_count = count
            self.blocks_event.set()
            return True
        return False
will not actually exist until the first failure. This makes dashboarding and alerting difficult, so we explicitly list statuses and create them manually here. .. versionchanged:: 1.3 added ``jupyterhub_`` prefix to metric names. """ from enum import Enum from prometheus_client import Gauge from prometheus_client import Histogram REQUEST_DURATION_SECONDS = Histogram( 'jupyterhub_request_duration_seconds', 'request duration for all HTTP requests', ['method', 'handler', 'code'], ) SERVER_SPAWN_DURATION_SECONDS = Histogram( 'jupyterhub_server_spawn_duration_seconds', 'time taken for server spawning operation', ['status'], # Use custom bucket sizes, since the default bucket ranges # are meant for quick running processes. Spawns can take a while! buckets=[0.5, 1, 2.5, 5, 10, 15, 30, 60, 120, float("inf")], ) RUNNING_SERVERS = Gauge('jupyterhub_running_servers', 'the number of user servers currently running')
import bitmath import rehash from prometheus_client import Counter, Histogram from data.registry_model import registry_model from data.database import CloseForLongOperation, db_transaction from digest import digest_tools from util.registry.filelike import wrap_with_handler, StreamSlice from util.registry.gzipstream import calculate_size_handler logger = logging.getLogger(__name__) chunk_upload_duration = Histogram( "quay_chunk_upload_duration_seconds", "number of seconds for a chunk to be uploaded to the registry", labelnames=["region"], ) pushed_bytes_total = Counter("quay_registry_image_pushed_bytes_total", "number of bytes pushed to the registry") BLOB_CONTENT_TYPE = "application/octet-stream" class BlobUploadException(Exception): """ Base for all exceptions raised when uploading blobs. """ class BlobRangeMismatchException(BlobUploadException):
from starlette.responses import Response from starlette.routing import Match from starlette.status import HTTP_500_INTERNAL_SERVER_ERROR from starlette.types import ASGIApp REQUESTS = Counter("starlette_requests_total", "Total count of requests by method and path.", ["method", "path_template"]) RESPONSES = Counter( "starlette_responses_total", "Total count of responses by method, path and status codes.", ["method", "path_template", "status_code"], ) REQUESTS_PROCESSING_TIME = Histogram( "starlette_requests_processing_time_seconds", "Histogram of requests processing time by path (in seconds)", ["method", "path_template"], ) EXCEPTIONS = Counter( "starlette_exceptions_total", "Total count of exceptions raised by path and exception type", ["method", "path_template", "exception_type"], ) REQUESTS_IN_PROGRESS = Gauge( "starlette_requests_in_progress", "Gauge of requests by method and path currently being processed", ["method", "path_template"], ) class PrometheusMiddleware(BaseHTTPMiddleware):
from sentry_sdk.tracing import Span from structlog.stdlib import BoundLogger, get_logger from authentik.core.models import User from authentik.policies.models import Policy, PolicyBinding, PolicyBindingModel, PolicyEngineMode from authentik.policies.process import PolicyProcess, cache_key from authentik.policies.types import PolicyRequest, PolicyResult CURRENT_PROCESS = current_process() GAUGE_POLICIES_CACHED = Gauge( "authentik_policies_cached", "Cached Policies", ) HIST_POLICIES_BUILD_TIME = Histogram( "authentik_policies_build_time", "Execution times complete policy result to an object", ["object_name", "object_type", "user"], ) class PolicyProcessInfo: """Dataclass to hold all information and communication channels to a process""" process: PolicyProcess connection: Connection result: Optional[PolicyResult] binding: PolicyBinding def __init__(self, process: PolicyProcess, connection: Connection, binding: PolicyBinding): self.process = process