def test_negative_bucket_histogram(self):
    families = text_string_to_metric_families("""# TYPE a histogram
# HELP a help
a_bucket{le="-1.0"} 0
a_bucket{le="1.0"} 1
a_bucket{le="+Inf"} 3
a_count 3
# EOF
""")
    self.assertEqual(
        [HistogramMetricFamily("a", "help",
                               buckets=[("-1.0", 0.0), ("1.0", 1.0), ("+Inf", 3.0)])],
        list(families))
def test_histogram_exemplars(self):
    families = text_string_to_metric_families("""# TYPE a histogram
# HELP a help
a_bucket{le="1.0"} 0 # {a="b"} 0.5
a_bucket{le="2.0"} 2 # {a="c"} 0.5
a_bucket{le="+Inf"} 3 # {a="1234567890123456789012345678901234567890123456789012345678"} 4 123
# EOF
""")
    hfm = HistogramMetricFamily("a", "help")
    hfm.add_sample("a_bucket", {"le": "1.0"}, 0.0, None, Exemplar({"a": "b"}, 0.5))
    hfm.add_sample("a_bucket", {"le": "2.0"}, 2.0, None, Exemplar({"a": "c"}, 0.5))
    hfm.add_sample("a_bucket", {"le": "+Inf"}, 3.0, None,
                   Exemplar({"a": "1234567890123456789012345678901234567890123456789012345678"},
                            4, Timestamp(123, 0)))
    self.assertEqual([hfm], list(families))
def test_simple_histogram(self):
    families = text_string_to_metric_families("""# TYPE a histogram
# HELP a help
a_bucket{le="1.0"} 0
a_bucket{le="+Inf"} 3
a_count 3
a_sum 2
# EOF
""")
    self.assertEqual(
        [HistogramMetricFamily("a", "help", sum_value=2,
                               buckets=[("1.0", 0.0), ("+Inf", 3.0)])],
        list(families))
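These tests compare whole metric families at once. When debugging a failure, it can help to look at the individual samples the parser produces instead; a minimal sketch, assuming only the prometheus_client OpenMetrics parser already used above:

from prometheus_client.openmetrics.parser import text_string_to_metric_families

text = """# TYPE a histogram
# HELP a help
a_bucket{le="1.0"} 0
a_bucket{le="+Inf"} 3
a_count 3
a_sum 2
# EOF
"""
for family in text_string_to_metric_families(text):
    # Each histogram family expands into a_bucket (one per le), a_count, and a_sum samples.
    for sample in family.samples:
        print(sample.name, sample.labels, sample.value)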
def collect(self):
    start = time.time()
    h = HistogramMetricFamily('request_size', 'Time spent processing request',
                              labels=["job", "pool"])
    # Request data from Jenkins
    self._request_data()
    self._buckets.append(["+Inf", 1])
    h.add_metric(labels=["zpool_writes", "t03_db"], buckets=self._buckets,
                 sum_value=4096)
    yield h
    duration = time.time() - start
    COLLECTION_TIME.observe(duration)
def collect(self):
    for url in urls:
        response = requests.request(req_type, url, timeout=req_timeout)
        status_code = response.status_code
        response_time = response.elapsed.total_seconds()
        if status_code == 200:
            success_status = 1
        else:
            success_status = 0
        status_code_str = str(status_code)

        sum_response_time[url][success_status] += response_time
        sum_response_time_ms = sum_response_time[url][success_status]
        count_requests[url][success_status] += 1
        total_count = count_requests[url][success_status]
        for key in range(len(buckets) - 1):
            if response_time <= int(buckets[key]):
                count_bucket[url][success_status][key] += 1

        g = GaugeMetricFamily("sample_external_url_up",
                              'Sample external URL up status',
                              labels=['url', 'code', 'method'])
        g.add_metric([url, status_code_str, req_type], success_status)
        yield g

        c = GaugeMetricFamily("sample_external_url_response_ms",
                              'Sample external URL response in ms',
                              labels=['url', 'code', 'method'])
        c.add_metric([url, status_code_str, req_type], response_time)
        yield c

        d = HistogramMetricFamily("sample_external_url_response_ms",
                                  'Sample external URL response bucket in ms',
                                  labels=['url', 'code', 'method'])
        d.add_metric([url, status_code_str, req_type],
                     buckets=[(buckets[0], count_bucket[url][success_status][0]),
                              (buckets[1], count_bucket[url][success_status][1]),
                              (buckets[2], count_bucket[url][success_status][2]),
                              (buckets[3], total_count)],
                     sum_value=sum_response_time_ms)
        yield d
        logging.info("GET: %s", url)
def collect(self):
    yield GaugeMetricFamily('my_gauge', 'Help text', value=7)

    c = CounterMetricFamily('my_counter_total', 'Help text', labels=['foo'])
    c.add_metric(['bar'], 1.7)
    c.add_metric(['baz'], 3.8)
    yield c

    h = HistogramMetricFamily('my_histogram', 'Help text', labels=['handler'])
    h.add_metric(['prometheus'],
                 buckets=[('.025', 1), ('.05', 2), ('.075', 3), ('.1', 4),
                          ('.25', 5), ('.5', 6), ('.75', 7), ('1.0', 8), ('+Inf', 9)],
                 sum_value=45)
    yield h
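Snippets like the one above only define collect(); a minimal sketch of the wiring around such a collector, assuming only the public prometheus_client registry and exposition APIs (the class name below is illustrative):

from prometheus_client import generate_latest
from prometheus_client.core import CollectorRegistry, HistogramMetricFamily

class DemoCollector:
    # Illustrative stand-in for the collector above.
    def collect(self):
        h = HistogramMetricFamily('my_histogram', 'Help text', labels=['handler'])
        # Bucket counts are cumulative and the list must end with +Inf.
        h.add_metric(['prometheus'], buckets=[('1.0', 8), ('+Inf', 9)], sum_value=45)
        yield h

registry = CollectorRegistry()
registry.register(DemoCollector())
print(generate_latest(registry).decode())  # exposition-format text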
def scrape():
    try:
        config.load_incluster_config()
    except config.config_exception.ConfigException:
        config.load_kube_config()
    batch_v1_api = client.BatchV1Api()
    jobs = retrieve_jobs(NAMESPACE, batch_v1_api)

    kubernetes_jobs_total_family = CounterMetricFamily(
        "kubernetes_jobs_total", "Count of all kubernetes jobs", labels=[JOB_LABEL])
    for value, labels in kubernetes_jobs_total(jobs):
        kubernetes_jobs_total_family.add_metric(labels, value)

    kubernetes_job_errors_total_family = CounterMetricFamily(
        "kubernetes_job_errors_total",
        "Count of all kubernetes job errors",
        labels=[JOB_LABEL],
    )
    error_jobs = [job for job in jobs if job.status.succeeded != 1]
    for value, labels in kubernetes_jobs_total(error_jobs):
        kubernetes_job_errors_total_family.add_metric(labels, value)

    kubernetes_job_duration_seconds_family = HistogramMetricFamily(
        "kubernetes_job_duration_seconds",
        "Histogram of kubernetes job durations",
        labels=[JOB_LABEL],
    )
    succeeded_jobs = [job for job in jobs if job.status.succeeded == 1]
    for buckets, duration_sum, labels in kubernetes_job_duration_seconds(succeeded_jobs):
        kubernetes_job_duration_seconds_family.add_metric(labels, buckets,
                                                          sum_value=duration_sum)

    # Replace these in one atomic operation to avoid a race condition with the Expositor.
    metrics.update({
        "kubernetes_jobs_total": kubernetes_jobs_total_family,
        "kubernetes_job_errors_total": kubernetes_job_errors_total_family,
        "kubernetes_job_duration_seconds": kubernetes_job_duration_seconds_family,
    })
def test_histogram_labels(self):
    cmf = HistogramMetricFamily('h', 'help', labels=['a'])
    cmf.add_metric(['b'], buckets=[('0', 1), ('+Inf', 2)], sum_value=3)
    self.custom_collector(cmf)
    self.assertEqual(1, self.registry.get_sample_value('h_bucket', {'a': 'b', 'le': '0'}))
    self.assertEqual(2, self.registry.get_sample_value('h_bucket', {'a': 'b', 'le': '+Inf'}))
    self.assertEqual(2, self.registry.get_sample_value('h_count', {'a': 'b'}))
    self.assertEqual(3, self.registry.get_sample_value('h_sum', {'a': 'b'}))
def dump_frequency(
    cls,
    metric_name: str,
    documentation: str,
    bin_to_count: Mapping[TBin, int],
    sum_value: Optional[float] = None,
) -> Metric:
    """Converts a dictionary of bin to count into a Prometheus histogram.

    :param metric_name: Name of the metric (must be the same for training and serving)
    :type metric_name: str
    :param documentation: Help text describing the metric (used for documentation)
    :type documentation: str
    :param bin_to_count: Counts of items in each bin (must be inserted in ascending
        order of the bin's numerical value). The last bin can be "+Inf" to capture
        None, NaN, and inf.
    :type bin_to_count: Mapping[Union[str, float, int], int]
    :param sum_value: The total value of all samples; defaults to the sum of
        raw bucket value * count.
    :type sum_value: Optional[float], optional
    :return: The converted Prometheus histogram metric.
    :rtype: Metric
    """
    buckets = []
    accumulator = 0
    for k, v in bin_to_count.items():
        accumulator += v
        # Integer values are handled like floats by Prometheus.
        buckets.append([str(k), accumulator])
    # A Prometheus histogram requires at least 2 buckets.
    if len(bin_to_count) - int("+Inf" in bin_to_count) < 1:
        buckets.insert(0, ["0.0", 0])
    if "+Inf" not in bin_to_count:
        buckets.append(["+Inf", buckets[-1][1]])
    return HistogramMetricFamily(
        name=metric_name,
        documentation=documentation,
        buckets=buckets,
        sum_value=sum_value
        or sum(float(k) * v for k, v in bin_to_count.items() if k != "+Inf"),
    )
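Hypothetical usage of dump_frequency. The class that owns this classmethod is not shown above, so FrequencyMetrics below is an assumption, as are the metric name and bins:

# FrequencyMetrics is a hypothetical owner class for the classmethod above.
metric = FrequencyMetrics.dump_frequency(
    metric_name="feature_distribution",
    documentation="Distribution of a model feature by bin",
    bin_to_count={"1.0": 3, "2.0": 5, "+Inf": 1},
)
for sample in metric.samples:
    print(sample.name, sample.labels, sample.value)
# Yields feature_distribution_bucket samples with cumulative counts (3, 8, 9),
# plus _count 9 and a default _sum of 1.0*3 + 2.0*5 = 13.0.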
def collect(self):
    # Fetch the data -- this must be synchronous!
    data = self.data_collector()
    buckets = {}
    res = []
    # Accumulate each observation into every bound at or above it,
    # producing the cumulative counts Prometheus expects.
    for x in data.keys():
        for bound in self.buckets:
            if x <= bound:
                buckets[bound] = buckets.get(bound, 0) + data[x]
    for bound in self.buckets:
        res.append([str(bound), buckets.get(bound, 0)])
    res.append(["+Inf", sum(data.values())])
    metric = HistogramMetricFamily(
        self.name, "", buckets=res,
        sum_value=sum(x * y for x, y in data.items()))
    yield metric
def __init__(self, target, user, password, insecure):
    # Create a metric to track time spent and requests made.
    self.hist = HistogramMetricFamily(
        'request_size', 'Time spent processing request',
        buckets=[['4096.0', 0], ['8192.0', 0], ['16384.0', 0], ['32768.0', 0],
                 ['65536.0', 0], ['131072.0', 0], ['262144.0', 0], ['524288.0', 0],
                 ['1048576.0', 0], ['2097152.0', 0], ["+Inf", 0]],
        sum_value=0)
    self._target = target.rstrip("/")
    self._user = user
    self._password = password
    self._insecure = insecure
    self._buckets = []
    self.count = 0
def scrape():
    global START
    today = datetime.utcnow().date()
    START = datetime.timestamp(datetime.combine(today, datetime.min.time()))

    tasks = retrieve_recent_koji_tasks()

    koji_tasks_total_family = CounterMetricFamily('koji_tasks_total',
                                                  'Count of all koji tasks',
                                                  labels=TASK_LABELS)
    for value, labels in koji_tasks_total(tasks):
        koji_tasks_total_family.add_metric(labels, value)

    koji_task_errors_total_family = CounterMetricFamily(
        'koji_task_errors_total', 'Count of all koji task errors', labels=TASK_LABELS)
    error_tasks = only(tasks, states=error_states)
    for value, labels in koji_tasks_total(error_tasks):
        koji_task_errors_total_family.add_metric(labels, value)

    koji_in_progress_tasks_family = GaugeMetricFamily(
        'koji_in_progress_tasks',
        'Count of all in-progress koji tasks',
        labels=TASK_LABELS,
    )
    in_progress_tasks = retrieve_open_koji_tasks()
    for value, labels in koji_tasks_total(in_progress_tasks):
        koji_in_progress_tasks_family.add_metric(labels, value)

    koji_waiting_tasks_family = GaugeMetricFamily(
        'koji_waiting_tasks',
        'Count of all waiting, unscheduled koji tasks',
        labels=TASK_LABELS,
    )
    waiting_tasks = retrieve_waiting_koji_tasks()
    for value, labels in koji_tasks_total(waiting_tasks):
        koji_waiting_tasks_family.add_metric(labels, value)

    koji_task_duration_seconds_family = HistogramMetricFamily(
        'koji_task_duration_seconds',
        'Histogram of koji task durations',
        labels=TASK_LABELS,
    )
    for buckets, duration_sum, labels in koji_task_duration_seconds(tasks):
        koji_task_duration_seconds_family.add_metric(labels, buckets,
                                                     sum_value=duration_sum)

    koji_enabled_hosts_count_family = GaugeMetricFamily(
        'koji_enabled_hosts_count',
        'Count of all koji hosts by channel',
        labels=HOST_LABELS,
    )
    koji_enabled_hosts_capacity_family = GaugeMetricFamily(
        'koji_enabled_hosts_capacity',
        'Reported capacity of all koji hosts by channel',
        labels=HOST_LABELS,
    )
    hosts = retrieve_hosts_by_channel()
    for value, labels in koji_enabled_hosts_count(hosts):
        koji_enabled_hosts_count_family.add_metric(labels, value)
    for value, labels in koji_enabled_hosts_capacity(hosts):
        koji_enabled_hosts_capacity_family.add_metric(labels, value)

    koji_task_load_family = GaugeMetricFamily(
        'koji_task_load',
        'Task load of all koji builders by channel',
        labels=HOST_LABELS,
    )
    task_load = retrieve_task_load_by_channel()
    for value, labels in koji_task_load(task_load):
        koji_task_load_family.add_metric(labels, value)

    # Replace these in one atomic operation to avoid a race condition with the Expositor.
    metrics.update({
        'koji_tasks_total': koji_tasks_total_family,
        'koji_task_errors_total': koji_task_errors_total_family,
        'koji_in_progress_tasks': koji_in_progress_tasks_family,
        'koji_waiting_tasks': koji_waiting_tasks_family,
        'koji_task_duration_seconds': koji_task_duration_seconds_family,
        'koji_enabled_hosts_count': koji_enabled_hosts_count_family,
        'koji_enabled_hosts_capacity': koji_enabled_hosts_capacity_family,
        'koji_task_load': koji_task_load_family,
    })
def test_histogram(self):
    self.custom_collector(
        HistogramMetricFamily('h', 'help', buckets=[('0', 1), ('+Inf', 2)], sum_value=3))
    self.assertEqual(1, self.registry.get_sample_value('h_bucket', {'le': '0'}))
    self.assertEqual(2, self.registry.get_sample_value('h_bucket', {'le': '+Inf'}))
    self.assertEqual(2, self.registry.get_sample_value('h_count', {}))
    self.assertEqual(3, self.registry.get_sample_value('h_sum', {}))
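These tests rely on a custom_collector helper that is not shown. A plausible sketch of such a fixture, modeled on how one-shot collectors are usually registered against a test registry (an assumption, not the actual fixture):

def custom_collector(self, metric_family):
    """Register a collector that yields the prepared family on every scrape."""
    class SingleCollector:
        def collect(self):
            return [metric_family]
    self.registry.register(SingleCollector())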
def to_metric(self, desc, tag_values, agg_data):
    """Translate the data that OpenCensus collected into the Prometheus
    format, using the Prometheus Metric objects.

    :type desc: dict
    :param desc: The map that describes the view definition

    :type tag_values: tuple of :class:`~opencensus.tags.tag_value.TagValue`
    :param tag_values: TagValue objects used as label values

    :type agg_data: object of :class:`~opencensus.stats.aggregation_data.AggregationData`
    :param agg_data: Aggregated data that needs to be converted as Prometheus samples

    :rtype: :class:`~prometheus_client.core.CounterMetricFamily` or
        :class:`~prometheus_client.core.HistogramMetricFamily` or
        :class:`~prometheus_client.core.UnknownMetricFamily` or
        :class:`~prometheus_client.core.GaugeMetricFamily`
    :returns: A Prometheus metric object
    """
    metric_name = desc['name']
    metric_description = desc['documentation']
    label_keys = desc['labels']

    assert len(tag_values) == len(label_keys)
    # Prometheus requires that all tag values be strings, hence
    # the need to cast None to the empty string before exporting. See
    # https://github.com/census-instrumentation/opencensus-python/issues/480
    tag_values = [tv if tv else "" for tv in tag_values]

    if isinstance(agg_data, aggregation_data_module.CountAggregationData):
        metric = CounterMetricFamily(name=metric_name,
                                     documentation=metric_description,
                                     labels=label_keys)
        metric.add_metric(labels=tag_values, value=agg_data.count_data)
        return metric

    elif isinstance(agg_data, aggregation_data_module.DistributionAggregationData):
        assert agg_data.bounds == sorted(agg_data.bounds)
        # buckets is a list of buckets. Each bucket is another list with
        # a pair of bucket name and value, or a triple of bucket name,
        # value, and exemplar. Buckets need to be in order.
        buckets = []
        cum_count = 0  # Prometheus buckets expect cumulative counts.
        for ii, bound in enumerate(agg_data.bounds):
            cum_count += agg_data.counts_per_bucket[ii]
            bucket = [str(bound), cum_count]
            buckets.append(bucket)
        # Prometheus requires buckets to be sorted, and +Inf to be present.
        # In OpenCensus we don't have +Inf in the bucket bounds, so we need
        # to append it here.
        buckets.append(["+Inf", agg_data.count_data])
        metric = HistogramMetricFamily(name=metric_name,
                                       documentation=metric_description,
                                       labels=label_keys)
        metric.add_metric(labels=tag_values,
                          buckets=buckets,
                          sum_value=agg_data.sum)
        return metric

    elif isinstance(agg_data, aggregation_data_module.SumAggregationDataFloat):
        metric = UnknownMetricFamily(name=metric_name,
                                     documentation=metric_description,
                                     labels=label_keys)
        metric.add_metric(labels=tag_values, value=agg_data.sum_data)
        return metric

    elif isinstance(agg_data, aggregation_data_module.LastValueAggregationData):
        metric = GaugeMetricFamily(name=metric_name,
                                   documentation=metric_description,
                                   labels=label_keys)
        metric.add_metric(labels=tag_values, value=agg_data.value)
        return metric

    else:
        raise ValueError("unsupported aggregation type %s" % type(agg_data))
def to_metric(self, desc, view):
    """Translate the data that OpenCensus collected into the Prometheus
    format, using the Prometheus Metric objects.

    :type desc: str
    :param desc: The view descriptor

    :type view: object of :class:`~opencensus.stats.view.View`
    :param view: View object to translate

    :rtype: :class:`~prometheus_client.core.CounterMetricFamily` or
        :class:`~prometheus_client.core.HistogramMetricFamily` or
        :class:`~prometheus_client.core.UntypedMetricFamily` or
        :class:`~prometheus_client.core.GaugeMetricFamily`
    :returns: A Prometheus metric object
    """
    agg_data = view.aggregation.aggregation_data

    if isinstance(agg_data, aggregation_data_module.CountAggregationData):
        labels = desc['labels'] if agg_data.count_data is None else None
        return CounterMetricFamily(name=desc['name'],
                                   documentation=desc['documentation'],
                                   value=float(agg_data.count_data),
                                   labels=labels)

    elif isinstance(agg_data, aggregation_data_module.DistributionAggregationData):
        points = {}
        # Histograms are cumulative in Prometheus.
        # 1. Sort buckets in ascending order but retain
        #    their indices for reverse lookup later on.
        # TODO: If there is a guarantee that distribution elements
        # are always sorted, then skip the sorting.
        indices_map = {}
        buckets = []
        i = 0
        for boundary in view.aggregation.boundaries.boundaries:
            if boundary not in indices_map or indices_map == {}:  # pragma: NO COVER
                indices_map[str(boundary)] = i
                buckets.append(str(boundary))
            i += 1
        buckets.sort()
        # 2. Now that the buckets are sorted, build the cumulative counts,
        #    mapping each bucket back to its original count by reverse index.
        cum_count = 0
        for bucket in buckets:
            i = indices_map[bucket]
            cum_count += int(agg_data.counts_per_bucket[i])
            points[bucket] = cum_count
        labels = desc['labels'] if points is None else None
        return HistogramMetricFamily(name=desc['name'],
                                     documentation=desc['documentation'],
                                     buckets=list(points.items()),
                                     sum_value=agg_data.sum,
                                     labels=labels)

    elif isinstance(agg_data, aggregation_data_module.SumAggregationDataFloat):
        labels = desc['labels'] if agg_data.sum_data is None else None
        return UntypedMetricFamily(name=desc['name'],
                                   documentation=desc['documentation'],
                                   value=agg_data.sum_data,
                                   labels=labels)

    elif isinstance(agg_data, aggregation_data_module.LastValueAggregationData):
        labels = desc['labels'] if agg_data.value is None else None
        return GaugeMetricFamily(name=desc['name'],
                                 documentation=desc['documentation'],
                                 value=agg_data.value,
                                 labels=labels)

    else:
        raise ValueError("unsupported aggregation type %s" % type(agg_data))
def to_metric(self, desc, view):
    """Translate the data that OpenCensus collected into the Prometheus
    format, using the Prometheus Metric objects.

    :type desc: str
    :param desc: The view descriptor

    :type view: object of :class:`~opencensus.stats.view.View`
    :param view: View object to translate

    :rtype: :class:`~prometheus_client.core.CounterMetricFamily` or
        :class:`~prometheus_client.core.HistogramMetricFamily` or
        :class:`~prometheus_client.core.UntypedMetricFamily` or
        :class:`~prometheus_client.core.GaugeMetricFamily`
    :returns: A Prometheus metric object
    """
    agg_data = view.aggregation.aggregation_data

    if isinstance(agg_data, aggregation_data_module.CountAggregationData):
        labels = desc['labels'] if agg_data.count_data is None else None
        return CounterMetricFamily(name=desc['name'],
                                   documentation=desc['documentation'],
                                   value=float(agg_data.count_data),
                                   labels=labels)

    elif isinstance(agg_data, aggregation_data_module.DistributionAggregationData):
        assert agg_data.bounds == sorted(agg_data.bounds)
        points = {}
        cum_count = 0  # Prometheus buckets expect cumulative counts.
        for ii, bound in enumerate(agg_data.bounds):
            cum_count += agg_data.counts_per_bucket[ii]
            points[str(bound)] = cum_count
        labels = desc['labels'] if points is None else None
        return HistogramMetricFamily(name=desc['name'],
                                     documentation=desc['documentation'],
                                     buckets=list(points.items()),
                                     sum_value=agg_data.sum,
                                     labels=labels)

    elif isinstance(agg_data, aggregation_data_module.SumAggregationDataFloat):
        labels = desc['labels'] if agg_data.sum_data is None else None
        return UntypedMetricFamily(name=desc['name'],
                                   documentation=desc['documentation'],
                                   value=agg_data.sum_data,
                                   labels=labels)

    elif isinstance(agg_data, aggregation_data_module.LastValueAggregationData):
        labels = desc['labels'] if agg_data.value is None else None
        return GaugeMetricFamily(name=desc['name'],
                                 documentation=desc['documentation'],
                                 value=agg_data.value,
                                 labels=labels)

    else:
        raise ValueError("unsupported aggregation type %s" % type(agg_data))
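The distribution branch in all of these converters follows the same recipe: turn per-bucket counts into cumulative counts, then append a +Inf bucket carrying the total. A standalone sketch with illustrative numbers:

bounds = [1.0, 2.0, 5.0]          # explicit, sorted bucket boundaries
counts_per_bucket = [3, 1, 4, 2]  # last entry counts values above the top bound

buckets = []
cum_count = 0  # Prometheus buckets carry cumulative counts
for bound, count in zip(bounds, counts_per_bucket):
    cum_count += count
    buckets.append([str(bound), cum_count])
# +Inf must be present and equal the total observation count.
buckets.append(["+Inf", cum_count + counts_per_bucket[-1]])
print(buckets)  # [['1.0', 3], ['2.0', 4], ['5.0', 8], ['+Inf', 10]]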
def scrape():
    global START
    today = datetime.utcnow().date()
    START = datetime.timestamp(datetime.combine(today, datetime.min.time()))

    tasks = retrieve_recent_koji_tasks()

    koji_tasks_total_family = CounterMetricFamily(
        'koji_tasks_total', 'Count of all koji tasks', labels=TASK_LABELS
    )
    for value, labels in koji_tasks_total(tasks):
        koji_tasks_total_family.add_metric(labels, value)

    koji_task_errors_total_family = CounterMetricFamily(
        'koji_task_errors_total', 'Count of all koji task errors', labels=TASK_LABELS
    )
    error_tasks = only(tasks, states=error_states)
    for value, labels in koji_tasks_total(error_tasks):
        koji_task_errors_total_family.add_metric(labels, value)

    koji_task_completions_total_family = CounterMetricFamily(
        'koji_task_completions_total', 'Count of all completed koji tasks', labels=TASK_LABELS
    )
    completed_tasks = only(tasks, states=completed_states)
    for value, labels in koji_tasks_total(completed_tasks):
        koji_task_completions_total_family.add_metric(labels, value)

    koji_in_progress_tasks_family = GaugeMetricFamily(
        'koji_in_progress_tasks',
        'Count of all in-progress koji tasks',
        labels=TASK_LABELS,
    )
    in_progress_tasks = retrieve_open_koji_tasks()
    for value, labels in koji_tasks_total(in_progress_tasks):
        koji_in_progress_tasks_family.add_metric(labels, value)

    koji_waiting_tasks_family = GaugeMetricFamily(
        'koji_waiting_tasks',
        'Count of all waiting, unscheduled koji tasks',
        labels=TASK_LABELS,
    )
    waiting_tasks = retrieve_waiting_koji_tasks()
    for value, labels in koji_tasks_total(waiting_tasks):
        koji_waiting_tasks_family.add_metric(labels, value)

    koji_task_duration_seconds_family = HistogramMetricFamily(
        'koji_task_duration_seconds',
        'Histogram of koji task durations',
        labels=TASK_LABELS,
    )
    for buckets, duration_sum, labels in koji_task_duration_seconds(
        tasks, calculate_overall_duration
    ):
        koji_task_duration_seconds_family.add_metric(labels, buckets,
                                                     sum_value=duration_sum)

    koji_task_waiting_duration_seconds_family = HistogramMetricFamily(
        'koji_task_waiting_duration_seconds',
        'Histogram of koji task durations while waiting',
        labels=TASK_LABELS,
    )
    for buckets, duration_sum, labels in koji_task_duration_seconds(
        tasks, calculate_waiting_duration
    ):
        koji_task_waiting_duration_seconds_family.add_metric(
            labels, buckets, sum_value=duration_sum
        )

    koji_task_in_progress_duration_seconds_family = HistogramMetricFamily(
        'koji_task_in_progress_duration_seconds',
        'Histogram of koji task durations while in-progress',
        labels=TASK_LABELS,
    )
    for buckets, duration_sum, labels in koji_task_duration_seconds(
        tasks, calculate_in_progress_duration
    ):
        koji_task_in_progress_duration_seconds_family.add_metric(
            labels, buckets, sum_value=duration_sum
        )

    koji_enabled_hosts_count_family = GaugeMetricFamily(
        'koji_enabled_hosts_count',
        'Count of all koji hosts by channel',
        labels=HOST_LABELS,
    )
    koji_enabled_hosts_capacity_family = GaugeMetricFamily(
        'koji_enabled_hosts_capacity',
        'Reported capacity of all koji hosts by channel',
        labels=HOST_LABELS,
    )
    koji_hosts_last_update_family = GaugeMetricFamily(
        'koji_hosts_last_update',
        'Gauge of last update from host',
        labels=BUILDER_LABELS,
    )
    hosts = retrieve_hosts_by_channel()
    # result_object is a VirtualCall object from the use of MultiCallSession in the Koji API.
    for result_object, labels in koji_hosts_last_update(hosts):
        koji_hosts_last_update_family.add_metric(labels, result_object.result)
    for value, labels in koji_enabled_hosts_count(hosts):
        koji_enabled_hosts_count_family.add_metric(labels, value)
    for value, labels in koji_enabled_hosts_capacity(hosts):
        koji_enabled_hosts_capacity_family.add_metric(labels, value)

    koji_task_load_family = GaugeMetricFamily(
        'koji_task_load',
        'Task load of all koji builders by channel',
        labels=HOST_LABELS,
    )
    task_load = retrieve_task_load_by_channel()
    for value, labels in koji_task_load(task_load):
        koji_task_load_family.add_metric(labels, value)

    # Replace these in one atomic operation to avoid a race condition with the Expositor.
    metrics.update(
        {
            'koji_tasks_total': koji_tasks_total_family,
            'koji_task_errors_total': koji_task_errors_total_family,
            'koji_task_completions_total': koji_task_completions_total_family,
            'koji_in_progress_tasks': koji_in_progress_tasks_family,
            'koji_waiting_tasks': koji_waiting_tasks_family,
            'koji_task_duration_seconds': koji_task_duration_seconds_family,
            'koji_task_waiting_duration_seconds': koji_task_waiting_duration_seconds_family,
            'koji_task_in_progress_duration_seconds': koji_task_in_progress_duration_seconds_family,
            'koji_enabled_hosts_count': koji_enabled_hosts_count_family,
            'koji_enabled_hosts_capacity': koji_enabled_hosts_capacity_family,
            'koji_task_load': koji_task_load_family,
            'koji_hosts_last_update': koji_hosts_last_update_family,
        }
    )
def collect(self):
    leases = self._read_file()
    total_metric = GaugeMetricFamily("dhcpd_leases_total",
                                     "Total known DHCP leases", None, ["network"])
    in_use_metric = GaugeMetricFamily("dhcpd_leases_in_use_total",
                                      "In use DHCP leases", None, ["network"])
    expired_metric = GaugeMetricFamily("dhcpd_leases_expired",
                                       "Expired DHCP leases", None, ["network"])
    _buckets = [60, 300, 600, 900, 1200, 1500, 1800, 2100, 2400, 2700, 3600]
    # Pair each lower bound with the next upper bound, ending at +Inf.
    buckets = list(zip(chain([0], _buckets), chain(_buckets, ["+Inf"])))
    now = datetime.utcnow()

    def age(t):
        return now - t

    active_lease_age_metric = HistogramMetricFamily(
        "dhcpd_leases_in_use_age_histogram",
        "In use leases by age",
        labels=("network",),
    )
    for subnet in self._subnets:
        n = "{}".format(subnet)
        l = leases.filter(lambda lease: ipaddress.IPv4Address(lease.ip) in subnet)
        total = l.count()
        in_use = l.active.valid.count()
        expired = l.filter(lambda l: l.binding_state == "free").count()
        in_use_metric.add_metric([n], in_use)
        expired_metric.add_metric([n], expired)
        total_metric.add_metric([n], total)

        b = {}
        for (prev, bucket) in buckets:
            if bucket == "+Inf":
                prev = timedelta(seconds=prev)
                c = l.active.filter(lambda l: age(l.start) > prev).count()
            else:
                prev = timedelta(seconds=prev)
                upper = timedelta(seconds=bucket)
                c = l.active.filter(lambda l: prev < age(l.start) <= upper).count()
            b[str(bucket)] = c
        sum_age = sum(map(lambda l: age(l.start).seconds, l.active))
        active_lease_age_metric.add_metric([n], list(b.items()), sum_age)

    yield in_use_metric
    yield expired_metric
    yield total_metric
    yield active_lease_age_metric
def collect(self):
    # Loop through all metrics configured, and get datapoints
    # for them saved by the exporter.
    for daemon in self.metrics_config.keys():
        for druid_metric_name in self.metrics_config[daemon]:
            metric_type = self.metrics_config[daemon][druid_metric_name]['type']

            if metric_type == 'gauge' or metric_type == 'counter':
                try:
                    self.counters[druid_metric_name]
                    self.counters[druid_metric_name][daemon]
                except KeyError:
                    continue

                if metric_type == 'gauge':
                    metric_family_obj = GaugeMetricFamily
                else:
                    metric_family_obj = CounterMetricFamily

                prometheus_metric = metric_family_obj(
                    self.metrics_config[daemon][druid_metric_name]['prometheus_metric_name'],
                    self.metrics_config[daemon][druid_metric_name]['description'],
                    labels=map(lambda x: x.lower(),
                               self.metrics_config[daemon][druid_metric_name]['labels']))
                label_values = list(self.counters[druid_metric_name][daemon].keys())
                for label_value in label_values:
                    value = self.counters[druid_metric_name][daemon][label_value]
                    prometheus_metric.add_metric(label_value, value)

            elif metric_type == 'histogram':
                try:
                    self.histograms[druid_metric_name]
                    self.histograms[druid_metric_name][daemon]
                except KeyError:
                    continue

                prometheus_metric = HistogramMetricFamily(
                    self.metrics_config[daemon][druid_metric_name]['prometheus_metric_name'],
                    self.metrics_config[daemon][druid_metric_name]['description'],
                    labels=map(lambda x: x.lower(),
                               self.metrics_config[daemon][druid_metric_name]['labels']))
                label_values = list(self.histograms[druid_metric_name][daemon].keys())
                for label_value in label_values:
                    value = self.histograms[druid_metric_name][daemon][label_value]
                    buckets_without_sum = [[k, v] for k, v in value.items() if k != 'sum']
                    prometheus_metric.add_metric(
                        label_value, buckets=buckets_without_sum, sum_value=value['sum'])

            else:
                log.info('metric type not supported: {}'.format(metric_type))
                continue

            yield prometheus_metric

    registered = CounterMetricFamily(
        'druid_exporter_datapoints_registered',
        'Number of datapoints successfully registered by the exporter.')
    registered.add_metric([], self.datapoints_registered)
    yield registered
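The access pattern above implies a nested layout for self.histograms; an illustrative shape, inferred from the code rather than taken from the exporter's documented format:

# druid_metric_name -> daemon -> label-value tuple -> bucket dict with a 'sum' key.
histograms = {
    'query/time': {
        'broker': {
            ('wikipedia',): {'10': 1, '100': 3, 'sum': 120.0},
        },
    },
}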
def collect(self): """ Scrape /server-status url and collect metrics """ # Counters accesses_total = CounterMetricFamily( 'apache_accesses_total', 'Total requests served count since startup', labels=['exporter_name']) traffic_total = CounterMetricFamily( 'apache_traffic_bytes_total', 'Total bytes transfered since startup', labels=['exporter_name']) balancer_acc = CounterMetricFamily( 'apache_balancer_requests_total', 'Total requests count', labels=['cluster', 'host', 'route', 'exporter_name']) balancer_wr = CounterMetricFamily( 'apache_balancer_write_bytes_total', 'Total bytes written', labels=['cluster', 'host', 'route', 'exporter_name']) balancer_rd = CounterMetricFamily( 'apache_balancer_read_bytes_total', 'Total bytes read', labels=['cluster', 'host', 'route', 'exporter_name']) # Gauges requests_sec = GaugeMetricFamily('apache_requests_per_second', 'Requests per second', labels=['exporter_name']) bytes_sec = GaugeMetricFamily('apache_io_bytes_per_second', 'Bytes write/read per second', labels=['exporter_name']) bytes_request = GaugeMetricFamily('apache_io_bytes_per_request', 'Bytes write/read per request', labels=['exporter_name']) route_ok = GaugeMetricFamily( 'apache_balancer_route_ok', 'Balancing status of the route is OK', labels=['cluster', 'host', 'route', 'exporter_name']) route_dis = GaugeMetricFamily( 'apache_balancer_route_disabled', 'Balancing status of the route is DISABLED', labels=['cluster', 'host', 'route', 'exporter_name']) route_err = GaugeMetricFamily( 'apache_balancer_route_error', 'Balancing status of the route is ERROR', labels=['cluster', 'host', 'route', 'exporter_name']) route_unk = GaugeMetricFamily( 'apache_balancer_route_unknown', 'Balancing status of the route is UNKNOWN', labels=['cluster', 'host', 'route', 'exporter_name']) scoreboard = GaugeMetricFamily('apache_scoreboard_current', 'Count of workers grouped by status', labels=['status', 'exporter_name']) latest_scrape = GaugeMetricFamily( 'apache_latest_scrape_duration_seconds', 'Latest scrape duration in seconds', labels=['metric_name', 'exporter_name']) operation_duration = GaugeMetricFamily( 'apache_operation_duration_seconds', 'Operation duration in seconds', labels=['operation', 'exporter_name']) # Histograms endpoint_response_time = HistogramMetricFamily( 'apache_endpoint_response_time_seconds', 'Response time by endpoints', labels=['method', 'endpoint', 'exporter_name']) try: exporter_name = os.environ['APACHE_EXPORTER_NAME'] except: exporter_name = 'none' start = time.clock() try: page = requests.get(self.url, verify=False) page.raise_for_status() except Exception as e: self.logger.error(f'Failed to Apache status page. Exception: {e}') duration = float("%.3g" % (time.clock() - start)) operation_duration.add_metric(['load_page', exporter_name], duration) start = time.clock() try: root = html.fromstring(page.content) except Exception as e: self.logger.error(f'Failed to parse page as html. 
Exception: {e}') duration = float("%.3g" % (time.clock() - start)) operation_duration.add_metric(['parse_page', exporter_name], duration) # Total traffic and accesses and requests,bytes per second/request start = time.clock() for x in range(1, 20): tmp_str = root.xpath("/html/body/dl[2]/dt[%d]" % x)[0].text.strip() if tmp_str.find('Total accesses:') >= 0: match = re.match('Total accesses: (.*) - Total Traffic: (.*)', tmp_str) _accesses_total = match.group(1) _traffic_total = self.str_to_bytes(match.group(2)) # Update metrics if they were found if _accesses_total is not None: accesses_total.add_metric([exporter_name], _accesses_total) if _traffic_total is not None: traffic_total.add_metric([exporter_name], _traffic_total) break duration = float("%.3g" % (time.clock() - start)) latest_scrape.add_metric(['apache_accesses_total', exporter_name], duration) latest_scrape.add_metric(['apache_traffic_bytes_total', exporter_name], duration) start = time.clock() for x in range(1, 20): tmp_str = root.xpath("/html/body/dl[2]/dt[%d]" % x)[0].text.strip() if tmp_str.find('requests') >= 0 and tmp_str.find('second') >= 0: match = re.match( '(.*) requests/sec - (.*/second) - (.*/request)', tmp_str) _requests_sec = match.group(1) _bytes_sec = self.str_to_bytes(match.group(2)) _bytes_request = self.str_to_bytes(match.group(3)) # Update metrics if they were found if _requests_sec is not None: requests_sec.add_metric([exporter_name], _requests_sec) if _bytes_sec is not None: bytes_sec.add_metric([exporter_name], _bytes_sec) if _bytes_request is not None: bytes_request.add_metric([exporter_name], _bytes_request) break duration = float("%.3g" % (time.clock() - start)) latest_scrape.add_metric(['apache_requests_per_second', exporter_name], duration) latest_scrape.add_metric(['apache_io_bytes_per_second', exporter_name], duration) latest_scrape.add_metric( ['apache_io_bytes_per_request', exporter_name], duration) # Get workers statuses start = time.clock() workers_map = {} workers = root.xpath('/html/body/pre')[0].text.strip() for symbol in range(0, len(workers)): if workers[symbol] in workers_map: workers_map[workers[symbol]] += 1 else: workers_map[workers[symbol]] = 1 # Update metrics for worker_status in workers_map: if worker_status == ".": status = "Open slot" elif worker_status == "_": status = "Waiting for Connection" elif worker_status == "S": status = "Starting up" elif worker_status == "R": status = "Reading Request" elif worker_status == "W": status = "Sending Reply" elif worker_status == "K": status = "Keepalive" elif worker_status == "D": status = "DNS Lookup" elif worker_status == "C": status = "Closing connection" elif worker_status == "L": status = "Logging" elif worker_status == "G": status = "Gracefully finishing" elif worker_status == "I": status = "Idle cleanup of worker" else: status = "Unknown" if worker_status != "\n": #Update workers scoreboard scoreboard.add_metric([status, exporter_name], int(workers_map[worker_status])) duration = float("%.3g" % (time.clock() - start)) latest_scrape.add_metric(['apache_scoreboard_current', exporter_name], duration) # Get balancing and routes status start = time.clock() try: cluster_xpaths = json.loads(os.environ['APACHE_EXPORTER_CLUSTERS']) except Exception as e: self.logger.error(f'Cannot load APACHE_EXPORTER_CLUSTERS. 
{e}') cluster_xpaths = None for cluster in cluster_xpaths: h = 0 for row in root.xpath(cluster_xpaths[cluster]): if h == 0: h += 1 continue else: host = "%s" % row[1].text route = "%s" % row[3].text status = row[2].text acc = row[7].text wr = self.str_to_bytes(row[8].text) rd = self.str_to_bytes(row[9].text) # Update nodes statuses ok, dis, err, unk = 0, 0, 0, 0 if status.find('Ok') >= 0: ok = 1 elif status.find('Dis') >= 0: dis = 1 elif status.find('Err') >= 0: err = 1 else: unk = 1 # Route statuses route_ok.add_metric([cluster, host, route, exporter_name], ok) route_dis.add_metric([cluster, host, route, exporter_name], dis) route_err.add_metric([cluster, host, route, exporter_name], err) route_unk.add_metric([cluster, host, route, exporter_name], unk) # Update requests, wr, rd counters balancer_acc.add_metric([cluster, host, route, exporter_name], int(acc)) balancer_wr.add_metric([cluster, host, route, exporter_name], int(wr)) balancer_rd.add_metric([cluster, host, route, exporter_name], int(rd)) duration = float("%.3g" % (time.clock() - start)) latest_scrape.add_metric(['apache_balancer_route_ok', exporter_name], duration) latest_scrape.add_metric( ['apache_balancer_route_disabled', exporter_name], duration) latest_scrape.add_metric( ['apache_balancer_route_error', exporter_name], duration) latest_scrape.add_metric( ['apache_balancer_route_unknown', exporter_name], duration) latest_scrape.add_metric( ['apache_balancer_requests_total', exporter_name], duration) latest_scrape.add_metric( ['apache_balancer_write_bytes_total', exporter_name], duration) latest_scrape.add_metric( ['apache_balancer_read_bytes_total', exporter_name], duration) # Get response time by endpoints start = time.clock() h = 0 for row in root.xpath('/html/body/table[1]/tr'): last_column = len(row) if h == 0: h += 1 for h in range(0, last_column): header = row[h].text.upper() if header == 'REQ': req_pos = h elif header == 'REQUEST': request_pos = h continue else: try: duration = float(row[req_pos].text) / 1000 url = ("%s" % row[request_pos].text).strip() method, url = self.sanitize_url(url) if method is not None and url is not None: self.put_histogram_values(method, url, duration) except: pass # group buckets into one list url_buckets = {} for i in self.url_count: if (i[0], i[1]) not in url_buckets: url_buckets[i[0], i[1]] = [[i[2], self.url_count[i]]] else: url_buckets[i[0], i[1]].append([i[2], self.url_count[i]]) for t in url_buckets: if (t[0], t[1]) in self.url_sum: endpoint_response_time.add_metric([t[0], t[1], exporter_name], buckets=url_buckets[t], sum_value=self.url_sum[t[0], t[1]]) duration = float("%.3g" % (time.clock() - start)) latest_scrape.add_metric( ['apache_endpoint_response_time_seconds', exporter_name], duration) # counters yield accesses_total yield traffic_total yield balancer_acc yield balancer_wr yield balancer_rd # gauges yield requests_sec yield bytes_sec yield bytes_request yield route_ok yield route_dis yield route_err yield route_unk yield scoreboard yield latest_scrape yield operation_duration # histograms if self.endpoint_stats: yield endpoint_response_time
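put_histogram_values and the url_count/url_sum state consumed by the bucket-grouping code above are not shown; an illustrative sketch of the shapes that code expects, inferred from the access pattern rather than the exporter's documentation:

# url_count maps (method, endpoint, le) -> cumulative count;
# url_sum maps (method, endpoint) -> sum of observed seconds.
url_count = {
    ('GET', '/index', '0.1'): 4,
    ('GET', '/index', '+Inf'): 5,
}
url_sum = {('GET', '/index'): 0.93}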
def _setup_ipc_labels(self):
    (total_calltime_flag, response_size_flag, process_calltime_flag,
     queue_calltime_flag, request_size_flag, exception_flag) = 1, 1, 1, 1, 1, 1
    for metric in self._metrics['IPC']:
        label = ["cluster", "host"]
        snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower()
        if '_min' in metric or '_max' in metric or '_mean' in metric or 'median' in metric:
            name = "_".join(['ipc', snake_case])
            self._hadoop_hbase_metrics['IPC'][metric] = GaugeMetricFamily(
                "_".join([self._prefix, name]),
                self._metrics['IPC'][metric],
                labels=label)
        elif 'RangeCount_' in metric:
            name = metric.replace("-", "_").lower()
            self._hadoop_hbase_metrics['IPC'][metric] = GaugeMetricFamily(
                "_".join([self._prefix, 'ipc', name]),
                self._metrics['IPC'][metric],
                labels=label)
        elif 'TotalCallTime' in metric:
            if total_calltime_flag:
                total_calltime_flag = 0
                name = 'ipc_total_calltime_latency_microseconds'
                key = 'TotalCallTime'
                self._hadoop_hbase_metrics['IPC'][key] = HistogramMetricFamily(
                    "_".join([self._prefix, name]),
                    "The percentile of total calltime latency in microseconds",
                    labels=label)
            else:
                continue
        elif 'ResponseSize' in metric:
            if response_size_flag:
                response_size_flag = 0
                name = 'ipc_response_size_bytes'
                key = 'ResponseSize'
                self._hadoop_hbase_metrics['IPC'][key] = HistogramMetricFamily(
                    "_".join([self._prefix, name]),
                    "The percentile of response size in bytes",
                    labels=label)
            else:
                continue
        elif 'ProcessCallTime' in metric:
            if process_calltime_flag:
                process_calltime_flag = 0
                name = 'ipc_process_calltime_latency_microseconds'
                key = 'ProcessCallTime'
                self._hadoop_hbase_metrics['IPC'][key] = HistogramMetricFamily(
                    "_".join([self._prefix, name]),
                    "The percentile of process calltime latency in microseconds",
                    labels=label)
            else:
                continue
        elif 'RequestSize' in metric:
            if request_size_flag:
                request_size_flag = 0
                name = 'ipc_request_size_bytes'
                key = 'RequestSize'
                self._hadoop_hbase_metrics['IPC'][key] = HistogramMetricFamily(
                    "_".join([self._prefix, name]),
                    "The percentile of request size in bytes",
                    labels=label)
            else:
                continue
        elif 'QueueCallTime' in metric:
            if queue_calltime_flag:
                queue_calltime_flag = 0
                name = 'ipc_queue_calltime_latency_microseconds'
                key = 'QueueCallTime'
                self._hadoop_hbase_metrics['IPC'][key] = HistogramMetricFamily(
                    "_".join([self._prefix, name]),
                    "The percentile of queue calltime latency in microseconds",
                    labels=label)
        elif 'exceptions' in metric:
            if exception_flag:
                exception_flag = 0
                name = 'ipc_exceptions_total'
                key = 'exceptions'
                label.append("type")
                self._hadoop_hbase_metrics['IPC'][key] = GaugeMetricFamily(
                    "_".join([self._prefix, name]),
                    "Exceptions caused by requests",
                    labels=label)
            else:
                continue
        else:
            name = "_".join(['ipc', snake_case])
            self._hadoop_hbase_metrics['IPC'][metric] = GaugeMetricFamily(
                "_".join([self._prefix, name]),
                self._metrics['IPC'][metric],
                labels=label)
def to_metric(self, desc, tag_values, agg_data):
    """Translate the data that OpenCensus collected into the Prometheus
    format, using the Prometheus Metric objects.

    :type desc: dict
    :param desc: The map that describes the view definition

    :type tag_values: tuple of :class:`~opencensus.tags.tag_value.TagValue`
    :param tag_values: TagValue objects used as label values

    :type agg_data: object of :class:`~opencensus.stats.aggregation_data.AggregationData`
    :param agg_data: Aggregated data that needs to be converted as Prometheus samples

    :rtype: :class:`~prometheus_client.core.CounterMetricFamily` or
        :class:`~prometheus_client.core.HistogramMetricFamily` or
        :class:`~prometheus_client.core.UntypedMetricFamily` or
        :class:`~prometheus_client.core.GaugeMetricFamily`
    :returns: A Prometheus metric object
    """
    metric_name = desc['name']
    metric_description = desc['documentation']
    label_keys = desc['labels']

    if isinstance(agg_data, aggregation_data_module.CountAggregationData):
        metric = CounterMetricFamily(name=metric_name,
                                     documentation=metric_description,
                                     labels=label_keys)
        metric.add_metric(labels=list(tag_values), value=agg_data.count_data)
        return metric

    elif isinstance(agg_data, aggregation_data_module.DistributionAggregationData):
        assert agg_data.bounds == sorted(agg_data.bounds)
        points = {}
        cum_count = 0  # Prometheus buckets expect cumulative counts.
        for ii, bound in enumerate(agg_data.bounds):
            cum_count += agg_data.counts_per_bucket[ii]
            points[str(bound)] = cum_count
        metric = HistogramMetricFamily(name=metric_name,
                                       documentation=metric_description,
                                       labels=label_keys)
        metric.add_metric(labels=list(tag_values),
                          buckets=list(points.items()),
                          sum_value=agg_data.sum)
        return metric

    elif isinstance(agg_data, aggregation_data_module.SumAggregationDataFloat):
        metric = UntypedMetricFamily(name=metric_name,
                                     documentation=metric_description,
                                     labels=label_keys)
        metric.add_metric(labels=list(tag_values), value=agg_data.sum_data)
        return metric

    elif isinstance(agg_data, aggregation_data_module.LastValueAggregationData):
        metric = GaugeMetricFamily(name=metric_name,
                                   documentation=metric_description,
                                   labels=label_keys)
        metric.add_metric(labels=list(tag_values), value=agg_data.value)
        return metric

    else:
        raise ValueError("unsupported aggregation type %s" % type(agg_data))
def scrape():
    global START
    today = datetime.utcnow()
    START = datetime.combine(today, datetime.min.time()).isoformat()

    tasks = retrieve_recent_pulp_tasks()

    pulp_tasks_total_family = CounterMetricFamily('pulp_tasks_total',
                                                  'Count of all pulp tasks',
                                                  labels=TASK_LABELS)
    for value, labels in pulp_tasks_total(tasks):
        pulp_tasks_total_family.add_metric(labels, value)

    pulp_task_errors_total_family = CounterMetricFamily(
        'pulp_task_errors_total', 'Count of all pulp task errors', labels=TASK_LABELS)
    error_tasks = only(tasks, states=error_states)
    for value, labels in pulp_tasks_total(error_tasks):
        pulp_task_errors_total_family.add_metric(labels, value)

    pulp_in_progress_tasks_family = GaugeMetricFamily(
        'pulp_in_progress_tasks',
        'Count of all in-progress pulp tasks',
        labels=TASK_LABELS,
    )
    in_progress_tasks = retrieve_open_pulp_tasks()
    for value, labels in pulp_tasks_total(in_progress_tasks):
        pulp_in_progress_tasks_family.add_metric(labels, value)

    pulp_waiting_tasks_family = GaugeMetricFamily(
        'pulp_waiting_tasks',
        'Count of all waiting, unscheduled pulp tasks',
        labels=TASK_LABELS,
    )
    waiting_tasks = retrieve_waiting_pulp_tasks()
    for value, labels in pulp_tasks_total(waiting_tasks):
        pulp_waiting_tasks_family.add_metric(labels, value)

    pulp_task_duration_seconds_family = HistogramMetricFamily(
        'pulp_task_duration_seconds',
        'Histogram of pulp task durations',
        labels=TASK_LABELS,
    )
    for buckets, duration_sum, labels in pulp_task_duration_seconds(tasks):
        pulp_task_duration_seconds_family.add_metric(labels, buckets,
                                                     sum_value=duration_sum)

    # Replace these in one atomic operation to avoid a race condition with the Expositor.
    metrics.update({
        'pulp_tasks_total': pulp_tasks_total_family,
        'pulp_task_errors_total': pulp_task_errors_total_family,
        'pulp_in_progress_tasks': pulp_in_progress_tasks_family,
        'pulp_waiting_tasks': pulp_waiting_tasks_family,
        'pulp_task_duration_seconds': pulp_task_duration_seconds_family,
    })
def scrape():
    global START
    START = datetime.datetime.utcnow().date().strftime('%Y-%m-%d %H:%M:%S')

    pushes = retrieve_recent_pub_pushes()

    pub_pushes_total_family = CounterMetricFamily('pub_pushes_total',
                                                  'Count of all pub pushes',
                                                  labels=PUSH_LABELS)
    for value, labels in pub_pushes_total(pushes):
        pub_pushes_total_family.add_metric(labels, value)

    pub_push_errors_total_family = CounterMetricFamily(
        'pub_push_errors_total', 'Count of all pub push errors', labels=PUSH_LABELS)
    error_pushes = only(pushes, states=error_states)
    for value, labels in pub_pushes_total(error_pushes):
        pub_push_errors_total_family.add_metric(labels, value)

    pub_in_progress_pushes_family = GaugeMetricFamily(
        'pub_in_progress_pushes',
        'Count of all in-progress pub pushes',
        labels=PUSH_LABELS,
    )
    in_progress_pushes = retrieve_open_pub_pushes()
    for value, labels in pub_pushes_total(in_progress_pushes):
        pub_in_progress_pushes_family.add_metric(labels, value)

    pub_waiting_pushes_family = GaugeMetricFamily(
        'pub_waiting_pushes',
        'Count of all waiting, unscheduled pub pushes',
        labels=PUSH_LABELS,
    )
    waiting_pushes = retrieve_waiting_pub_pushes()
    for value, labels in pub_pushes_total(waiting_pushes):
        pub_waiting_pushes_family.add_metric(labels, value)

    pub_push_duration_seconds_family = HistogramMetricFamily(
        'pub_push_duration_seconds',
        'Histogram of pub push durations',
        labels=PUSH_LABELS,
    )
    for buckets, duration_sum, labels in pub_push_duration_seconds(pushes):
        pub_push_duration_seconds_family.add_metric(labels, buckets,
                                                    sum_value=duration_sum)

    # Replace these in one atomic operation to avoid a race condition with the Expositor.
    metrics.update({
        'pub_pushes_total': pub_pushes_total_family,
        'pub_push_errors_total': pub_push_errors_total_family,
        'pub_in_progress_pushes': pub_in_progress_pushes_family,
        'pub_waiting_pushes': pub_waiting_pushes_family,
        'pub_push_duration_seconds': pub_push_duration_seconds_family,
    })
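The *_duration_seconds helpers consumed by these scrape functions (koji_task_duration_seconds, pulp_task_duration_seconds, pub_push_duration_seconds) are not shown. A hedged sketch of the tuple shape they would need to yield; the function name, bounds, and input format below are illustrative assumptions:

def duration_buckets(durations, bounds=(60, 300, 1800)):
    """Build cumulative (le, count) bucket pairs ending in +Inf, plus the sum."""
    buckets = [(str(b), sum(1 for d in durations if d <= b)) for b in bounds]
    buckets.append(('+Inf', len(durations)))
    return buckets, sum(durations)

# A helper like pub_push_duration_seconds would then yield
# (buckets, duration_sum, labels) tuples, one per label combination.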