class PrometheusClient:
    """Expose sensor readings parsed from XML payloads as a Prometheus gauge."""

    def __init__(self):
        # Start up the HTTP server that exposes the metrics endpoint.
        start_http_server(EXPORT_PORT)
        self.g = Gauge('sensor_data', 'Value gathered by sensor',
                       ['sensor_id', 'ipe_id', 'category_id', 'unit'])

    def export_data(self, xml_data=None):
        """Parse one XML payload and publish its integer reading on the gauge.

        The payload is expected to contain child elements whose ``name``
        attributes are ipeId/appId/category/data/unit, each carrying the
        actual value in a ``val`` attribute.
        """
        root = ET.fromstring(xml_data)
        ipe_id = root.find('./*[@name="ipeId"]').attrib['val']
        app_id = root.find('./*[@name="appId"]').attrib['val']
        category = root.find('./*[@name="category"]').attrib['val']
        data = int(root.find('./*[@name="data"]').attrib['val'])
        unit = root.find('./*[@name="unit"]').attrib['val']
        # (A commented-out InfluxDB json_body block was removed here; it was
        # dead code left over from a previous backend.)
        self.g.labels(app_id, ipe_id, category, unit).set(data)
def export_metric(device, id, type, value):
    """Record *value* on a per-(device, id, type) temperature gauge.

    Lazily creates the shared Gauge on first use and caches one child per
    sensor id in the module-level ``gauges`` dict.
    """
    global temperature_gauge
    # BUG FIX: compare to None with `is`, not `==`.
    if temperature_gauge is None:
        temperature_gauge = Gauge('temperature', 'Temperatures from MQTT',
                                  ['device', 'id', 'type'])
    global gauges
    if id not in gauges:
        gauges[id] = temperature_gauge.labels(device, id, type)
        logging.debug('New gauge added. Device: %s Id: %s, type: %s',
                      device, id, type)
    gauges[id].set(value)
    # Lazy %-formatting also avoids the TypeError the original raised when
    # concatenating a non-str `id` into the message.
    logging.debug('Gauge temperature set with value: %s - Device: %s Id: %s Type: %s',
                  value, device, id, type)
def update_gauges(metrics):
    """Sync the module-level ``gauges`` registry with a fresh metrics snapshot.

    Groups incoming samples by formatted metric name, creates gauges on
    demand, updates every labelled child, and removes children whose label
    combinations disappeared since the previous snapshot.
    """
    grouped = {}
    for name_list, label_dict, value in metrics:
        name = format_metric_name(name_list)
        if name not in grouped:
            # Label key order is fixed by the first sample seen for a metric.
            grouped[name] = (tuple(label_dict.keys()), {})
        keys, samples = grouped[name]
        values = tuple(format_label_value(label_dict[key]) for key in keys)
        samples[values] = value

    for name, (keys, samples) in grouped.items():
        try:
            previous, gauge = gauges[name]
        except KeyError:
            previous, gauge = set(), Gauge(name, '', keys)
        current = set(samples)
        # Drop children that no longer appear in this snapshot.
        for stale in previous - current:
            gauge.remove(*stale)
        for values, value in samples.items():
            if values:
                gauge.labels(*values).set(value)
            else:
                gauge.set(value)
        gauges[name] = (current, gauge)
def get_stats(self):
    """Return the cache-refresh duration rendered in exposition format."""
    registry = CollectorRegistry()
    duration_gauge = Gauge('openstack_exporter_cache_refresh_duration_seconds',
                           'Cache refresh duration in seconds.',
                           ['region'], registry=registry)
    duration_gauge.labels(self.region).set(self.duration)
    return generate_latest(registry)
class HistogramRoller(RollerBase):
    """Accepts a Histogram object and creates a gauge with multiple labels
    tracking bucket values over a given time period.
    """

    def __init__(self, histogram, options=None, registry=REGISTRY, roller_registry=ROLLER_REGISTRY):
        self.hist = histogram
        if self.hist._type != 'histogram':
            raise ValueError('Only a Histogram object should be passed to HistogramRoller')
        options = options or {}
        self.extract_options(options)
        # Keys are 'le' values; each maps to a deque of (timestamp, value)
        # pairs used to compute rolled deltas.
        self.past_values = dict()
        full_name = ""
        for full_name, labels, _ in iter_hist_buckets(self.hist):
            le_label = labels['le']
            self.past_values[le_label] = deque()
        self.configure_with_full_name(full_name, is_histogram=True)
        # A single top level gauge with bucket labels tracks the values
        self.gauge = Gauge(
            self.name,
            self.documentation,
            labelnames=('le',),
            registry=registry
        )
        roller_registry[self.name] = self

    def collect(self):
        """Loop over current histogram bucket values and update gauges.

        Usage:
        * Collect should only be called about every second, not in a tight loop.
        * Should only be called in 1 thread at a time.
        """
        now = datetime.datetime.now()
        # Fetch values from histograms
        for _, labels, value in iter_hist_buckets(self.hist):
            sample_key = labels['le']
            # Add value
            self.past_values[sample_key].append((now, value))
            # Drop old values
            remove_old_values(self.past_values[sample_key], now - self.retention_td)
            # Calculate and record new rolled value
            v = self.reducer(values_to_deltas(self.past_values[sample_key]),
                             **self.reducer_kwargs)
            # BUG FIX: Gauge.labels() takes label values positionally or as
            # keyword args; passing a dict positionally treats the dict itself
            # as the label value (or raises on current prometheus_client).
            self.gauge.labels(le=sample_key).set(v)
def register_prometheus_gauges(export_internal_raspberry=False): g = Gauge("sensor_temperature_in_celsius", "Local room temperature around the raspberry pi", ["sensor"]) error_g = Gauge("faulty_sensor_read", "Is 1 if the sensor could not be read.", ["sensor"]) sensors = find_sensors() print "Found sensors:", ", ".join(map(lambda x: str(x), sensors)) for sensor in sensors: g.labels(str(sensor)).set_function(sensor) sensor.set_error_gauge(error_g.labels(str(sensor))) if export_internal_raspberry: g = Gauge("cpu_temperature_in_celsius", "CPU Temperature of the Raspberry Pi") g.set_function(read_raspberry_pi_temperature) return sensors
def sendGauge(self, metric, description, value, job, labels):
    """Set *value* on the gauge named *metric* (creating it on first use),
    then push this resource's registry to the gateway under *job*.
    """
    label_kwargs = dict(resource_id=self.id,
                        snmp_ip=labels['ip'],
                        snmp_port=labels['port'])
    # Reuse an already-registered gauge when one exists for this metric.
    for existing in self.metrics:
        if existing._name == metric and existing._type == 'gauge':
            existing.labels(**label_kwargs).set(value)
            self.push(job=job)
            return
    gauge = Gauge(metric, description, ["resource_id", "snmp_ip", "snmp_port"],
                  registry=self.registry)
    gauge.labels(**label_kwargs).set(value)
    self.metrics.append(gauge)
    self.push(job=job)
def create_timeseries(self, data, **config):
    """Create Prometheus timeseries.

    Args:
        data (dict): Data to send to Prometheus.
        config (dict): Metric / exporter config.

    Returns:
        object: Metric descriptor.
    """
    metric_type = config.get('metric_type', DEFAULT_METRIC_TYPE)
    metric_description = config.get('metric_description',
                                    DEFAULT_METRIC_DESCRIPTION)
    push_url = config.get('url', DEFAULT_PUSHGATEWAY_URL)
    push_job = config.get('job', DEFAULT_PUSHGATEWAY_JOB)
    burn_rate = data['error_budget_burn_rate']

    # Metric labels; everything is stringified for Prometheus.
    labels = {
        'service_name': data['service_name'],
        'feature_name': data['feature_name'],
        'slo_name': data['slo_name'],
        'window': str(data['window']),
        'error_budget_policy_step_name':
            str(data['error_budget_policy_step_name']),
        'alerting_burn_rate_threshold':
            str(data['alerting_burn_rate_threshold']),
    }
    registry = CollectorRegistry()
    gauge = Gauge(metric_type,
                  metric_description,
                  registry=registry,
                  labelnames=labels.keys())
    gauge.labels(*labels.values()).set(burn_rate)

    # Switch to basic-auth pushing when credentials were supplied.
    handler = default_handler
    if 'username' in config and 'password' in config:
        self.username = config['username']
        self.password = config['password']
        handler = PrometheusExporter.auth_handler

    return push_to_gateway(push_url,
                           job=push_job,
                           grouping_key=labels,
                           registry=registry,
                           handler=handler)
class Ping_push(Base_push):
    """Ping a list of IPs from a YAML config and push results to a Pushgateway."""

    def __init__(self):
        self.filename = os.path.join(os.path.dirname(__file__),
                                     args.filename).replace("\\", "/")
        self.f = open(self.filename)
        self.y = yaml.load(self.f, Loader=yaml.FullLoader)
        self.ips = self.y['pingip']['ip']
        self.targets = self.y['pushgateway']['targets'][0]
        self.type = self.y['pushgateway']['type'][0]
        # BUG FIX: processing() is a coroutine; calling it bare only created a
        # coroutine object that was never executed.
        import asyncio
        asyncio.run(self.processing())

    async def gather(self, ip):
        """Ping *ip* once; return (ip, status, timestamp, response_time)."""
        try:
            str_num = os.popen('ping -c1' + ' ' + ip + '>/dev/null 2>&1;echo $?').read()
            return_num = int(str_num)
            timestamp = time.time()
            if return_num == 0:
                pingResult = os.popen('ping -c1' + ' ' + ip).read()
                res_time = re.findall(r'.*time=(\d\.?\d*) ms*', pingResult)
                # BUG FIX: response_time was unbound when the regex found
                # nothing; default it to 0.
                response_time = float(res_time[0]) if res_time else 0
                status = "ok"
            else:
                # BUG FIX: any non-zero exit code (not only 1) means failure;
                # previously status/response_time stayed unbound for e.g. 2.
                status = "not ok"
                response_time = 0
            # BUG FIX: `return await ip, ...` tried to await a plain string
            # (TypeError); a coroutine simply returns the tuple.
            return ip, status, timestamp, response_time
        except Exception as e:
            logging.error(e)
            # Return a sane fallback so the caller's unpacking never fails.
            return ip, "not ok", time.time(), 0

    async def processing(self):
        """Collect ping results for every IP and push them in one registry."""
        self.registry = CollectorRegistry()
        self.g = Gauge(self.type, '状态-时间',
                       ['ip', 'status', 'timestamp', 'response_time'],
                       registry=self.registry)
        for ip in self.ips:
            ip, status, timestamp, response_time = await self.gather(ip)
            # NOTE(review): the child is created but never .set(); it is
            # exported with the default value 0 — confirm this is intended.
            self.g.labels(ip, status, timestamp, response_time)
        try:
            pushadd_to_gateway(self.targets, job='pingIP_status',
                               registry=self.registry, timeout=200)
        except Exception as e:
            # BUG FIX: corrected "Failt" typo in the log message.
            logging.error("Failed to push:" + str(e))
def get_stats(self):
    """Render one 'openstack_node_totals' sample per cached Ironic node."""
    registry = CollectorRegistry()
    node_gauge = Gauge('openstack_node_totals', 'OpenStack Ironic Nodes statistic',
                       LABELS, registry=registry)
    for stat in self.get_cache_data():
        # First label is always the region; the rest come from the node stat.
        values = [self.osclient.region]
        values.extend(stat.get(name, '') for name in LABELS[1:])
        node_gauge.labels(*values).set(1.0)
    return generate_latest(registry)
def get_stats(self):
    """Expose this exporter's cache age and cache-refresh duration."""
    registry = CollectorRegistry()
    label_names = ['cloud']
    cloud = config['cloud']
    age = Gauge('openstack_exporter_cache_age_seconds',
                'Cache age in seconds. It can reset more frequently '
                'than scraping interval so we use Gauge',
                label_names, registry=registry)
    age.labels(cloud).set(time() - path.getmtime(self.cache_file))
    duration = Gauge('openstack_exporter_cache_refresh_duration_seconds',
                     'Cache refresh duration in seconds.',
                     label_names, registry=registry)
    duration.labels(cloud).set(self.duration)
    return generate_latest(registry)
def export(self, metric_name, metric_value, metric_description="", labels=None):
    """Create a single-sample gauge and push it to the configured gateway.

    Args:
        metric_name: Prometheus metric name.
        metric_value: numeric value to set.
        metric_description: optional help text.
        labels: mapping keyed by label name (expects 'hostname'); defaults
            to an empty mapping.

    Does nothing when no gateway URI is configured.
    """
    # BUG FIX: a mutable default argument ({}) is shared across calls;
    # use None and create a fresh dict per call instead.
    if labels is None:
        labels = {}
    if self.uri:
        gauge = Gauge(metric_name, metric_description, registry=self.registry,
                      labelnames=("hostname", ))
        gauge.labels(**labels).set(metric_value)
        push_to_gateway(self.uri, job='node-backups', registry=self.registry)
def exporter():
    """Serve per-vault stats on :8800, refreshing them for each new block."""
    vault_gauge = Gauge('yearn', 'yearn stats', ['vault', 'param'])
    start_http_server(8800)
    registry = load_registry()
    vaults = load_vaults(registry)
    # Runs forever: one refresh pass per newly mined block.
    for block in chain.new_blocks():
        secho(f'{block.number}', fg='green')
        for vault in vaults:
            secho(vault.name, fg='yellow')
            for param, value in describe_vault(vault).items():
                vault_gauge.labels(vault.name, param).set(value)
def get_metrics(self, registry):
    # type: (CollectorRegistry) -> None
    """Register the HDFS file/dir count gauges on *registry*."""
    hdfs_service = 'hdfs-0.20'
    # (name, help text, value) for each gauge; registration order preserved.
    metric_specs = (
        ('hdfs_files_total_bytes',
         'Total size of files on HDFS in bytes.', self.total_bytes),
        ('hdfs_files_total_count',
         'Total number of files on HDFS.', self.total_files),
        ('hdfs_dirs_total_count',
         'Total number of directories on HDFS.', self.total_directories),
        ('hdfs_under_replicated_files_total_count',
         'Total number of files on HDFS with less than three copies.',
         self.total_under_replicated),
    )
    for name, doc, value in metric_specs:
        gauge = Gauge(name, doc, labelnames=['service'], registry=registry)
        gauge.labels(service=hdfs_service).set(value)
class PrometheusAdapter(object):
    """ Instantiates necessary prometheus objects to update metrics """

    def __init__(self, monitor_port):
        # The metrics endpoint is served as soon as the adapter is built.
        start_http_server(monitor_port)
        self.gauge = Gauge('kafka_topic_group_lag', 'topic group pair lag',
                           ['topic', 'group'])

    def update_metrics(self, topic_group_list):
        """ Updates Gauge and prints to console """
        for tg in topic_group_list:
            self.gauge.labels(tg.name, tg.group).set(tg.lag)
            BaseMetricsAdapter.update_topic_group_lag(tg.name, tg.group, tg.lag)
def get_metrics():
    """Scrape supervisord over XML-RPC and return a registry of process metrics.

    Returns:
        CollectorRegistry: auto-describing registry with state/exit_status/
        up/start_time_seconds per process. On RPC failure the registry is
        returned empty so the scrape itself still succeeds.
    """
    collect_reg = CollectorRegistry(auto_describe=True)
    try:
        s = ServerProxy(supervisord_url)
        data = s.supervisor.getAllProcessInfo()
    except Exception as e:
        print("unable to call supervisord: %s" % e)
        return collect_reg

    label_names = ('name', 'group')
    metric_state = Gauge('state', "Process State", labelnames=label_names,
                         subsystem='supervisord', registry=collect_reg)
    metric_exit_status = Gauge('exit_status', "Process Exit Status",
                               labelnames=label_names,
                               subsystem='supervisord', registry=collect_reg)
    metric_up = Gauge('up', "Process Up", labelnames=label_names,
                      subsystem='supervisord', registry=collect_reg)
    metric_start_time_seconds = Counter('start_time_seconds', "Process start time",
                                        labelnames=label_names,
                                        subsystem='supervisord',
                                        registry=collect_reg)
    for item in data:
        # Only these fields are used; the original also extracted ten unused
        # fields (now/description/logfiles/spawnerr/...), removed here.
        name = item.get('name', '')
        group = item.get('group', '')
        state = item.get('state', '')
        start = item.get('start', '')
        exitstatus = item.get('exitstatus', '')
        labels = (name, group)
        metric_state.labels(*labels).set(state)
        metric_exit_status.labels(*labels).set(exitstatus)
        if is_runing(state):
            metric_up.labels(*labels).set(1)
            metric_start_time_seconds.labels(*labels).inc(start)
        else:
            metric_up.labels(*labels).set(0)
    return collect_reg
def get_metrics(self, registry):
    # type: (CollectorRegistry) -> None
    """Register launch and target-error totals for this crawl stream."""
    stream = self.frequency
    launched = Gauge('ukwa_seeds_launched',
                     'Total number of seeds launched.',
                     labelnames=['stream'], registry=registry)
    launched.labels(stream=stream).set(self.i_launches)
    errors = Gauge('ukwa_target_errors',
                   'Total number of targets that appear malformed.',
                   labelnames=['stream'], registry=registry)
    errors.labels(stream=stream).set(self.target_errors)
def gather_data(namespace, run_event):
    """Poll Alauda instance metrics and mirror them into Prometheus gauges
    until *run_event* is cleared. Sleeps 20s between passes.
    """
    g_cpu_usage = Gauge("cpu_cumulative_usage", "CPU Cumulative Usage",
                        ["service", "instance"])
    g_cpu_utilization = Gauge('cpu_utilization', "CPU utilization",
                              ["service", "instance"])
    # BUG FIX: label name was misspelled "servie".
    g_memory_usage = Gauge('memory_usage', "Memory Usage",
                           ["service", "instance"])
    g_memory_utilization = Gauge('memory_utilization', "Memory Utilization",
                                 ["service", "instance"])
    while run_event.is_set():
        service_list = alauda_service_list(namespace)
        for service_inst in service_list:
            service_name = service_inst.name
            instance_list = alauda_instance_list(namespace, service_name)
            for instance in instance_list:
                end_time = int(time.time()) - 30
                # gather data every 1 minute, ensure we can get at least one metric
                start_time = str(end_time - 100)
                end_time = str(end_time)
                data = alauda_get_instance_metrics(namespace, service_name,
                                                   instance['uuid'], start_time,
                                                   end_time, "1m")
                if data:
                    # point layout: [ts, cpu_usage, cpu_util, mem_usage, mem_util]
                    point = data['points'][0]
                    inst_name = instance['instance_name']
                    g_cpu_usage.labels(service_name, inst_name).set(point[1])
                    g_cpu_utilization.labels(service_name, inst_name).set(point[2])
                    g_memory_usage.labels(service_name, inst_name).set(point[3])
                    g_memory_utilization.labels(service_name, inst_name).set(point[4])
        time.sleep(20)
def gen_quarantine_stats(self):
    """Collect quarantined-object counts per ring from every swift host."""
    quarantine_gauge = Gauge('swift_quarantined_objects',
                             'Number of quarantined objects',
                             ['cloud', 'hostname', 'ring'],
                             registry=self.registry)
    for host in self.swift_hosts:
        try:
            resp = requests.get(self.baseurl.format(host, 'quarantined'))
        except requests.exceptions.RequestException:
            # Unreachable host: skip it, keep collecting the others.
            continue
        for ring in ('accounts', 'objects', 'containers'):
            quarantine_gauge.labels(config['cloud'], host, ring).set(
                resp.json().get(ring))
class AllocatedRscGauge:
    """Gauge of mem/cpu allocated by running jobs, labelled by cluster,
    resource kind and project; the pseudo-cluster "'%'" carries totals.
    """

    def __init__(self, name):
        self._gauge = Gauge(
            name,
            "A gauge of allocated ressources of running jobs over time",
            ['cluster', 'rsc', 'project'])
        # Sums memory/cpu of running jobs grouped by (cluster, project).
        self._request_per_cluster = "SELECT foo.cluster_name, (SELECT name FROM project WHERE id= foo.project_id), " \
                                    "foo.mem, foo.cpu FROM (SELECT cluster_name, project_id, sum(memory) as mem, " \
                                    "sum(cpu) as cpu FROM job "\
                                    "WHERE state='running' GROUP BY cluster_name, project_id) as foo"
        # Same sums, but across all clusters, grouped by project only.
        self._request_total = "SELECT (SELECT name FROM project WHERE id = foo.project_id), foo.mem, foo.cpu " \
                              "FROM (SELECT project_id, sum(memory) as mem, sum(cpu) as cpu "\
                              "FROM job "\
                              "WHERE state='running' GROUP BY project_id) as foo"

    def update(self, conn):
        """Re-query the database and refresh every labelled child."""
        self._set_values(execute_sql(conn, self._request_per_cluster, None),
                         execute_sql(conn, self._request_total, None))

    def _set_values(self, per_cluster, total):
        # Per-cluster rows: (cluster, project, mem, cpu).
        for cluster, project, mem, cpu in per_cluster:
            self._gauge.labels(rsc="mem", cluster=cluster, project=project).set(mem)
            self._gauge.labels(rsc="cpu", cluster=cluster, project=project).set(cpu)
        # Cross-cluster rows: (project, mem, cpu) under the wildcard cluster.
        for project, mem, cpu in total:
            self._gauge.labels(rsc="mem", cluster="'%'", project=project).set(mem)
            self._gauge.labels(rsc="cpu", cluster="'%'", project=project).set(cpu)
def gen_disk_usage_stats(self):
    """Collect per-device size/used bytes from every swift host."""
    disk_gauge = Gauge('swift_disk_usage_bytes', 'Swift disk usage in bytes',
                       ['cloud', 'hostname', 'device', 'type'],
                       registry=self.registry)
    for host in self.swift_hosts:
        try:
            resp = requests.get(self.baseurl.format(host, 'diskusage'))
        except requests.exceptions.RequestException:
            # Unreachable host: skip it, keep collecting the others.
            continue
        for disk in resp.json():
            # Skip entries missing (or carrying falsy) size/used/device.
            if not all(disk.get(field, False) for field in ('size', 'used', 'device')):
                continue
            disk_gauge.labels(config['cloud'], host, disk['device'],
                              'size').set(int(disk['size']))
            disk_gauge.labels(config['cloud'], host, disk['device'],
                              'used').set(int(disk['used']))
def collector():
    """Push one demo gauge and one randomly-incremented counter to the gateway."""
    registry = CollectorRegistry()
    demo_gauge = Gauge('test_collector_gauge', 'Description of gauge',
                       labelnames=['label_name'], registry=registry)
    demo_gauge.labels('label').set(5)
    demo_counter = Counter('test_collector_counter', 'Description of counter',
                           registry=registry)
    demo_counter.inc(random.randint(0, 20))
    push_to_gateway('192.168.10.3:9091', job='test_collector', registry=registry)
def get_stats(self):
    """Render one gauge per cached Cinder service statistic."""
    registry = CollectorRegistry()
    label_names = ['region', 'host', 'service', 'state']
    for stat in self.get_cache_data():
        gauge = Gauge(self.gauge_name_sanitize(stat['stat_name']),
                      'Openstack Cinder Service statistic',
                      label_names, registry=registry)
        gauge.labels(self.osclient.region,
                     stat.get('host', ''),
                     stat.get('service', ''),
                     stat.get('state', '')).set(stat['stat_value'])
    return generate_latest(registry)
def add_programs(self):
    """Export bucketed program counts for the configured sources/names."""
    options = self.metrics_options.get("programs", {})
    sources = options.get("sources")
    names = options.get("names")
    if not sources or not names:
        # Nothing configured for this metric family.
        return
    self.all_source_names.update(sources)
    gauge = Gauge('zentral_inventory_programs_bucket',
                  'Zentral inventory programs',
                  ['name', 'version', 'source_name', 'source_id', 'le'],
                  registry=self.registry)
    buckets = ("1", "7", "14", "30", "45", "90", "+Inf")
    for row in program_count(sources, names):
        labels = {key: row[key]
                  for key in ('name', 'version', 'source_name', 'source_id')}
        for le in buckets:
            gauge.labels(le=le, **labels).set(row[le])
def add_osx_apps(self):
    """Export bucketed macOS app counts for the configured sources/bundles."""
    options = self.metrics_options.get("osx_apps", {})
    sources = options.get("sources")
    bundle_ids = options.get("bundle_ids")
    if not sources or not bundle_ids:
        # Nothing configured for this metric family.
        return
    self.all_source_names.update(sources)
    gauge = Gauge('zentral_inventory_osx_apps_bucket',
                  'Zentral inventory OSX apps',
                  ['name', 'version', 'source_name', 'source_id', 'le'],
                  registry=self.registry)
    buckets = ("1", "7", "14", "30", "45", "90", "+Inf")
    for row in osx_app_count(sources, bundle_ids):
        labels = {key: row[key]
                  for key in ('name', 'version', 'source_name', 'source_id')}
        for le in buckets:
            gauge.labels(le=le, **labels).set(row[le])
def collect(dryrun=False):
    """Push published cloud image counts.

    Builds three gauges — image count, latest serial, and serial age —
    for every image type/cloud/release (plus docker-core serials) and
    pushes them to the gateway unless *dryrun* is set.
    """
    registry = CollectorRegistry()
    count_gauge = Gauge('foundations_cloud_images_published',
                        'The number of cloud images published',
                        ['image_type', 'cloud', 'release', 'arch'],
                        registry=registry)
    latest_serial_gauge = Gauge('foundations_cloud_images_current_serial',
                                'The date portion of the latest serial',
                                ['image_type', 'cloud', 'release'],
                                registry=registry)
    latest_serial_age_gauge = Gauge(
        'foundations_cloud_images_current_serial_age',
        'The time in days between the last serial and today',
        ['image_type', 'cloud', 'release'], registry=registry)
    for image_type in ['daily', 'release']:
        for cloud_name in CLOUD_NAMES[image_type]:
            print('Counting {} images for {}...'.format(image_type, cloud_name))
            # One simplestreams query yields both per-arch counts and the
            # newest serial per release.
            image_counts, latest_serials = parse_simplestreams_for_images(
                cloud_name, image_type)
            for release in image_counts:
                for arch in image_counts[release]:
                    count = image_counts[release][arch]
                    print('Found {} {} images for {} {} {}'.format(
                        count, image_type, cloud_name, release, arch))
                    count_gauge.labels(
                        image_type, cloud_name, release, arch).set(count)
            for release in latest_serials:
                serial = latest_serials[release]
                latest_serial_gauge.labels(
                    image_type, cloud_name, release).set(serial)
                latest_serial_age_gauge.labels(
                    image_type, cloud_name, release).set(
                        _determine_serial_age(serial))
    # docker-core serials come from a download root rather than simplestreams,
    # and are always recorded under the 'daily' image type.
    print('Finding serials for docker-core...')
    docker_core_serials = get_current_download_serials(DOCKER_CORE_ROOT)
    for release, serial in docker_core_serials.items():
        age = _determine_serial_age(serial)
        print('Found {} latest serial: {} ({} days old)'.format(
            release, serial, age))
        latest_serial_gauge.labels(
            'daily', 'docker-core', release).set(serial)
        latest_serial_age_gauge.labels(
            'daily', 'docker-core', release).set(age)
    if not dryrun:
        print('Pushing data...')
        util.push2gateway('cloud-image-count-foundations', registry)
class PrometheusMetricsCollector(MetricsCollector):
    """Collects luigi task lifecycle events into Prometheus counters and a
    per-family execution-time gauge, backed by a private registry.
    """

    def __init__(self):
        super(PrometheusMetricsCollector, self).__init__()
        self.registry = CollectorRegistry()
        self.task_started_counter = Counter(
            'luigi_task_started_total',
            'number of started luigi tasks',
            ['family'],
            registry=self.registry
        )
        self.task_failed_counter = Counter(
            'luigi_task_failed_total',
            'number of failed luigi tasks',
            ['family'],
            registry=self.registry
        )
        self.task_disabled_counter = Counter(
            'luigi_task_disabled_total',
            'number of disabled luigi tasks',
            ['family'],
            registry=self.registry
        )
        self.task_done_counter = Counter(
            'luigi_task_done_total',
            'number of done luigi tasks',
            ['family'],
            registry=self.registry
        )
        self.task_execution_time = Gauge(
            'luigi_task_execution_time_seconds',
            'luigi task execution time in seconds',
            ['family'],
            registry=self.registry
        )

    def generate_latest(self):
        # Exposition text for this collector's private registry only.
        return generate_latest(self.registry)

    def handle_task_started(self, task):
        self.task_started_counter.labels(family=task.family).inc()
        # Deliberately only touches the gauge (no .set()) so the family's
        # child exists in the exposition before the first completion.
        self.task_execution_time.labels(family=task.family)

    def handle_task_failed(self, task):
        self.task_failed_counter.labels(family=task.family).inc()
        self.task_execution_time.labels(family=task.family).set(task.updated - task.time_running)

    def handle_task_disabled(self, task, config):
        self.task_disabled_counter.labels(family=task.family).inc()
        self.task_execution_time.labels(family=task.family).set(task.updated - task.time_running)

    def handle_task_done(self, task):
        self.task_done_counter.labels(family=task.family).inc()
        # time_running can be `None` if task was already complete
        if task.time_running is not None:
            self.task_execution_time.labels(family=task.family).set(task.updated - task.time_running)

    def configure_http_handler(self, http_handler):
        # Ensure the scrape response carries the Prometheus content type.
        http_handler.set_header('Content-Type', CONTENT_TYPE_LATEST)
def generate_prometheus_metrics(results) -> bytes:
    """Render the IPv6 measurement *results* as Prometheus exposition text."""
    registry = CollectorRegistry()
    per_resolver_gauge = Gauge(
        "ipv6_watch_has_ipv6",
        "AAA resolve status",
        labelnames=("resolver", "resolver_provider", "site", "host"),
        registry=registry,
    )
    summary_gauge = Gauge(
        "ipv6_watch_summary",
        "AAA resolve status",
        labelnames=("site", ),
        registry=registry,
    )
    update_timestamp = Gauge("ipv6_watch_last_update",
                             "Unix timestamp of last update",
                             registry=registry)
    update_timestamp.set(int(time.time()))

    # Map the textual summary onto a numeric score; anything else is -1.
    summary_scores = {"none": 0, "some": 0.5, "all": 1}
    for site, site_results in results.items():
        summary_gauge.labels(site=site).set(
            summary_scores.get(site_results["summary"], -1))
        for host, host_results in site_results["hosts"].items():
            for provider, resolve_results in host_results.items():
                for resolver, res in resolve_results.items():
                    per_resolver_gauge.labels(
                        site=site,
                        host=host,
                        resolver_provider=provider,
                        resolver=resolver,
                    ).set(res)
    return prometheus_generate_latest(registry)
def _metrics_child(metrics_q):
    """Consume metric events from *metrics_q* forever and mirror them into
    Prometheus counters/gauges; serves the scrape endpoint on port 8000.

    Each queued item is a dict with a 'type' key selecting the metric and a
    'count' value; some types also carry 'addr', 'message_type' or
    'hash name' label values. Unknown types are silently ignored.
    """
    # Start the prometheus HTTP client
    start_http_server(8000)
    missing_counter = Counter('missing_packets', 'Number of missing packets', ['host'])
    messages = Counter('messages', 'Number of messages')
    packets = Counter('packets', 'Number of packets')
    reorder_counter = Counter("reordered_packets", "Reordered Packets", ['host'])
    failed_user = Counter("xrootd_mon_failed_user", "Failed User Collection")
    failed_filename = Counter("xrootd_mon_failed_filename", "Failed Filename Collection")
    messages_sent = Counter("xrootd_mon_messages_sent", "Number of messages sent to the message bus", ['message_type'])
    process_died = Counter("xrootd_mon_process_died", "Number of times the process died")
    hash_size = Gauge("xrootd_mon_hash_size", "Number of items in hash", ['hash_type'])
    # Number of messages
    while True:
        # Blocks until the producer publishes the next event.
        metrics_message = metrics_q.get()
        # Number of missing messages
        if metrics_message['type'] == "missing packets":
            missing_counter.labels(metrics_message['addr']).inc(
                metrics_message['count'])
        elif metrics_message['type'] == "messages":
            messages.inc(metrics_message['count'])
        elif metrics_message['type'] == "packets":
            packets.inc(metrics_message['count'])
        elif metrics_message['type'] == "reordered packets":
            reorder_counter.labels(metrics_message['addr']).inc(
                metrics_message['count'])
        elif metrics_message['type'] == "failed user":
            failed_user.inc(metrics_message['count'])
        elif metrics_message['type'] == "failed filename":
            failed_filename.inc(metrics_message['count'])
        elif metrics_message['type'] == "message sent":
            messages_sent.labels(metrics_message['message_type']).inc(
                metrics_message['count'])
        elif metrics_message['type'] == "process died":
            process_died.inc(metrics_message['count'])
        elif metrics_message['type'] == "hash size":
            # The only gauge: tracks current hash occupancy, not a total.
            hash_size.labels(metrics_message['hash name']).set(
                metrics_message['count'])
def push_sql_metric(metricName, dremioCluster, executor, metricValue):
    """Push one SQL metric value for *executor* to the Pushgateway."""
    registry = CollectorRegistry()
    metric = Gauge(metricName, "SQL Metric, pushed via Gateway",
                   labelnames=['executor'], registry=registry)
    child = metric.labels(executor)
    # The timestamp is immediately overwritten by the metric value; kept for
    # parity with the original call sequence.
    child.set_to_current_time()
    child.set(metricValue)
    grouping = {"job": dremioCluster, "executor": executor}
    pushadd_to_gateway(pgwendpoint, job=dremioCluster, registry=registry,
                       timeout=api_timeout, grouping_key=grouping)
def get_prometheus_inventory_metrics():
    """Return exposition text for OSX app and OS version inventory counts."""
    registry = CollectorRegistry()
    apps = Gauge('zentral_inventory_osx_apps', 'Zentral inventory OSX apps',
                 ['name', 'version_str', 'source'],
                 registry=registry)
    for row in osx_app_count():
        count = row.pop('count')
        # The remaining keys match the gauge's label names exactly.
        apps.labels(**row).set(count)
    versions = Gauge('zentral_inventory_os_versions',
                     'Zentral inventory OS Versions',
                     ['name', 'major', 'minor', 'patch', 'build', 'source'],
                     registry=registry)
    for row in os_version_count():
        count = row.pop('count')
        versions.labels(**row).set(count)
    return generate_latest(registry)
def collectMetrics(self):
    """Read per-PDU power draw from xymon and update the energy gauge."""
    shell = Shell(debug=False)
    energy_gauge = Gauge('hpc_energy_usage', 'energy consumption watts',
                         ['pdu'], registry=self.registry)
    for pdu in self.XYMON_PDU_LIST:
        cmd = ("%s -c %s -q xymondlog -H %s -T watts"
               " | grep 'DevicePowerWatts' | awk '{print $NF}'"
               % (self.BIN_XYMONQ, self.CFG_XYMONQ, pdu))
        rc, output, m = shell.cmd1(cmd, allowed_exit=[0, 255], timeout=300)
        if rc == 0:
            energy_gauge.labels(pdu=pdu).set(float(output))
        else:
            logger.warn('Cannot retrieve energy consumption for %s', pdu)
    return
def push_api_current_executor_metric(dremioCluster, clusterName, runningCount):
    """Push the expected-executor count for *clusterName* to the gateway."""
    registry = CollectorRegistry()
    metric = Gauge(api_current_executor_metric,
                   "Current number of expected executors, pushed via Gateway",
                   labelnames=['cluster'], registry=registry)
    child = metric.labels(clusterName)
    # The timestamp is immediately overwritten by the count; kept for parity
    # with the original call sequence.
    child.set_to_current_time()
    child.set(runningCount)
    grouping = {"job": dremioCluster, "cluster": clusterName}
    pushadd_to_gateway(pgwendpoint, job=dremioCluster, registry=registry,
                       timeout=api_timeout, grouping_key=grouping)
def push_api_cluster_status_metric(dremioCluster, clusterName, status):
    """Push the child-cluster status for *clusterName* to the gateway."""
    registry = CollectorRegistry()
    metric = Gauge(api_cluster_status_metric,
                   "Child cluster status, pushed via Gateway",
                   labelnames=['cluster'], registry=registry)
    child = metric.labels(clusterName)
    # The timestamp is immediately overwritten by the status; kept for parity
    # with the original call sequence.
    child.set_to_current_time()
    child.set(status)
    grouping = {"job": dremioCluster, "cluster": clusterName}
    pushadd_to_gateway(pgwendpoint, job=dremioCluster, registry=registry,
                       timeout=api_timeout, grouping_key=grouping)
def info(labels: Dict[str, str], name: str = "info") -> None:
    """Creates a gauge with the given label value pairs.

    Helps with exposing static info gauges.
    """
    gauge = Gauge(
        name,
        "Info.",
        labelnames=tuple(labels),  # iterating a dict yields its keys
        namespace=s.PROMETHEUS_NAMESPACE,
        subsystem=s.PROMETHEUS_SUBSYSTEM,
    )
    gauge.labels(*labels.values()).set(1)
def hello():
    """Demo endpoint: update sample metrics, push them to the gateway, and
    return this request's registry in exposition format.
    """
    registry = CollectorRegistry()
    gauge = Gauge('my_gauge', 'an example showed how to use gauge',
                  ['machine_ip', "instance"], registry=registry)
    gauge2 = Gauge('node_cpu_seconds_total', 'an example showed how to use gauge',
                   ['cpu'], registry=registry)
    # NOTE(review): counter/histogram are module-level metrics that are NOT in
    # this registry, so they are neither pushed nor returned below — confirm.
    counter.labels('spider1').inc(1)
    gauge.labels("spider2", "consumer-yf").set(random.randint(50, 100))
    # Child is created with the default value 0; no explicit set().
    gauge2.labels("0")
    histogram.labels('Histogram').observe(1001)
    # BUG FIX: the gateway host was misspelled "localhosr"; also removed the
    # unused local `sql` query string.
    push_to_gateway("localhost:9091", job="python-spider", registry=registry)
    return Response(generate_latest(registry), mimetype='text/plain')
def gen_account_stats(self):
    """Collect per-tenant swift account usage in bytes."""
    self.keystone_tenants_map = self._read_keystone_tenants_map(
        config.get('keystone_tenants_map', None))
    account_gauge = Gauge('swift_account_bytes_used',
                          'Swift account usage in bytes',
                          ['cloud', 'swift_account', 'tenant'],
                          registry=self.registry)
    for tenant_name, tenant_id in self.keystone_tenants_map.iteritems():
        # Swift account names are the reseller prefix plus the tenant id.
        account = self.reseller_prefix + tenant_id
        account_gauge.labels(config['cloud'], account, tenant_name).set(
            self._get_account_usage(account))
class PromClient:  # pylint: disable=too-few-public-methods
    """Prometheus client."""

    REQUIRED_LABELS = ['dp_id', 'dp_name']
    _reg = REGISTRY

    def __init__(self, reg=None):
        # A caller-supplied registry overrides the global default REGISTRY.
        if reg is not None:
            self._reg = reg
        # TODO: investigate faster alternative (https://bugs.launchpad.net/pbr/+bug/1688405)
        self.version = VersionInfo(
            'faucet').semantic_version().release_string()
        # Static info gauge: value is always 1, the version rides in a label.
        self.faucet_version = PromGauge(  # pylint: disable=unexpected-keyword-arg
            'faucet_pbr_version',
            'Faucet PBR version',
            ['version'],
            registry=self._reg)
        self.faucet_version.labels(version=self.version).set(1)  # pylint: disable=no-member
        self.server = None
        self.thread = None

    def start(self, prom_port, prom_addr, use_test_thread=False):
        """Start webserver.

        With use_test_thread a plain wsgiref server runs in a daemon thread
        (for tests); otherwise the hub's WSGI server is spawned.
        Idempotent: does nothing if a server is already running.
        """
        if not self.server:
            app = make_wsgi_app(self._reg)
            if use_test_thread:
                # pylint: disable=import-outside-toplevel
                from wsgiref.simple_server import (make_server, WSGIRequestHandler)
                import threading

                class NoLoggingWSGIRequestHandler(WSGIRequestHandler):
                    """Don't log requests."""

                    def log_message(self, *_args):  # pylint: disable=arguments-differ
                        pass

                self.server = make_server(
                    prom_addr, int(prom_port), app,
                    handler_class=NoLoggingWSGIRequestHandler)
                self.thread = threading.Thread(
                    target=self.server.serve_forever)
                self.thread.daemon = True
                self.thread.start()
            else:
                self.server = hub.WSGIServer((prom_addr, int(prom_port)), app)
                self.thread = hub.spawn(self.server.serve_forever)
                self.thread.name = 'prometheus'
class ManilaShareServerNanny(ManilaNanny):
    """ Manila Share Server """

    def __init__(self, config_file, interval, prom_port, http_port, handler):
        super(ManilaShareServerNanny, self).__init__(config_file,
                                                     interval,
                                                     prom_port=prom_port,
                                                     http_port=http_port,
                                                     handler=handler)
        # Guards self.orphan_snapshots against concurrent reader access
        # from get_orphan_snapshots().
        self.orphan_snapshots_lock = Lock()
        # snapshot_id -> {'snapshot_id': ..., 'share_id': ...}
        self.orphan_snapshots: Dict[str, Dict[str, str]] = {}
        self.orphan_snapshots_gauge = Gauge(
            'manila_nanny_orphan_share_snapshots',
            'Orphan Manila Share Snapshots',
            ['share_id', 'snapshot_id'])

    def _run(self):
        """One nanny pass: refresh the orphan-snapshot gauge and records."""
        s = self.query_orphan_snapshots()
        orphan_snapshots = {
            snapshot_id: {
                'snapshot_id': snapshot_id,
                'share_id': share_id
            }
            for snapshot_id, share_id in s
        }
        # Value 1 marks each currently-orphaned snapshot.
        for snapshot_id in orphan_snapshots:
            share_id = orphan_snapshots[snapshot_id]['share_id']
            self.orphan_snapshots_gauge.labels(share_id=share_id,
                                               snapshot_id=snapshot_id).set(1)
        # Remove gauge children for snapshots no longer orphaned.
        for snapshot_id in self.orphan_snapshots:
            if snapshot_id not in orphan_snapshots:
                share_id = self.orphan_snapshots[snapshot_id]['share_id']
                self.orphan_snapshots_gauge.remove(share_id, snapshot_id)
        with self.orphan_snapshots_lock:
            self.orphan_snapshots = update_records(self.orphan_snapshots,
                                                   orphan_snapshots)

    def query_orphan_snapshots(self):
        """Return (snapshot_id, share_id) pairs of live snapshots whose
        parent share has been deleted."""
        Snapshots = Table('share_snapshots', self.db_metadata, autoload=True)
        Shares = Table('shares', self.db_metadata, autoload=True)
        q = select([Snapshots.c.id, Snapshots.c.share_id])\
            .select_from(Snapshots.join(Shares, Snapshots.c.share_id == Shares.c.id))\
            .where(Snapshots.c.deleted == 'False')\
            .where(Shares.c.deleted != 'False')
        return list(q.execute())

    @response
    def get_orphan_snapshots(self):
        # Snapshot of the current records, taken under the lock.
        with self.orphan_snapshots_lock:
            return list(self.orphan_snapshots.values())
def get_stats(self):
    """Render the cached Nova service statistics as Prometheus exposition text."""
    registry = CollectorRegistry()
    label_names = ['region', 'host', 'service', 'state']
    for entry in self.get_cache_data():
        # One gauge per cached statistic, all in the throwaway registry.
        gauge = Gauge(
            self.gauge_name_sanitize(entry['stat_name']),
            'Openstack Nova Service statistic',
            label_names,
            registry=registry)
        values = [self.osclient.region] + [
            entry.get(key, '') for key in ('host', 'service', 'state')]
        gauge.labels(*values).set(entry['stat_value'])
    return generate_latest(registry)
def get_stats(self):
    """Render the cached hypervisor statistics as Prometheus exposition text."""
    registry = CollectorRegistry()
    label_names = ['region', 'host', 'aggregate', 'aggregate_id']
    for entry in self.get_cache_data():
        # One gauge per cached statistic, all in the throwaway registry.
        gauge = Gauge(
            self.gauge_name_sanitize(entry['stat_name']),
            'Openstack Hypervisor statistic',
            label_names,
            registry=registry)
        values = [self.osclient.region] + [
            entry.get(key, '') for key in ('host', 'aggregate', 'aggregate_id')]
        gauge.labels(*values).set(entry['stat_value'])
    return generate_latest(registry)
def push_inventory_metrics():
    """Push OSX-app and OS-version inventory counts to the Prometheus push gateway.

    No-op when 'prometheus_push_gateway' is not configured under the
    zentral.contrib.inventory app settings.
    """
    ppg = settings.get('apps', {}).get('zentral.contrib.inventory', {}).get('prometheus_push_gateway', None)
    if not ppg:
        return
    registry = CollectorRegistry()
    g = Gauge('zentral_inventory_osx_apps', 'Zentral inventory OSX apps',
              ['name', 'version_str', 'source'],
              registry=registry)
    for r in osx_app_count():
        count = r.pop('count')
        # BUG FIX: labels() was called as g.labels(r), passing the dict as a
        # single positional label value — prometheus_client raises ValueError
        # for a multi-label gauge. The remaining keys must be expanded as
        # keyword label values.
        g.labels(**r).set(count)
    g = Gauge('zentral_inventory_os_versions', 'Zentral inventory OS Versions',
              ['name', 'major', 'minor', 'patch', 'build', 'source'],
              registry=registry)
    for r in os_version_count():
        count = r.pop('count')
        # Same fix as above: expand the dict into keyword label values.
        g.labels(**r).set(count)
    push_to_gateway(ppg, job='zentral_push_inventory_metrics', registry=registry)
def get_stats(self):
    """Render the cached API-check results as Prometheus exposition text."""
    registry = CollectorRegistry()
    label_names = ['region', 'url', 'service']
    for check in self.get_cache_data():
        values = [check['region'], check['url'], check['service']]
        # Metric name is derived from the checked service name.
        gauge_name = self.gauge_name_sanitize(
            "check_{}_api".format(check['service']))
        gauge = Gauge(
            gauge_name,
            'Openstack API check. fail = 0, ok = 1 and unknown = 2',
            label_names,
            registry=registry)
        gauge.labels(*values).set(check['status'])
    return generate_latest(registry)
class PromClient:  # pylint: disable=too-few-public-methods
    """Prometheus client."""

    REQUIRED_LABELS = ['dp_id', 'dp_name']
    _reg = REGISTRY

    def __init__(self, reg=None):
        if reg is not None:
            self._reg = reg
        # TODO: investigate faster alternative (https://bugs.launchpad.net/pbr/+bug/1688405)
        release = VersionInfo('faucet').semantic_version().release_string()
        # Export the running version as a constant 1-valued gauge.
        self.faucet_version = PromGauge(  # pylint: disable=unexpected-keyword-arg
            'faucet_pbr_version', 'Faucet PBR version', ['version'],
            registry=self._reg)
        self.faucet_version.labels(version=release).set(1)  # pylint: disable=no-member
        self.server = None
        self.thread = None

    def start(self, prom_port, prom_addr, use_test_thread=False):
        """Start webserver."""
        if self.server:
            return
        port = int(prom_port)
        app = make_wsgi_app(self._reg)
        if use_test_thread:
            # Test path: wsgiref server running on a daemon thread.
            from wsgiref.simple_server import make_server, WSGIRequestHandler
            import threading

            class SilentRequestHandler(WSGIRequestHandler):
                """Don't log requests."""

                def log_message(self, *_args):  # pylint: disable=arguments-differ
                    pass

            self.server = make_server(
                prom_addr, port, app, handler_class=SilentRequestHandler)
            worker = threading.Thread(target=self.server.serve_forever)
            worker.daemon = True
            worker.start()
            self.thread = worker
        else:
            # Production path: hub's green-thread WSGI server.
            self.server = hub.WSGIServer((prom_addr, port), app)
            self.thread = hub.spawn(self.server.serve_forever)
            self.thread.name = 'prometheus'
class Metric(object):
    """A per-device NVIDIA gauge whose value is pulled lazily via a collector callback."""

    __slots__ = ['_collect', '_base_metric']

    def __init__(self, name, description, collect_fn):
        # collect_fn(device_handle) -> numeric reading for one device.
        self._collect = collect_fn
        self._base_metric = Gauge('nvidia_' + name, description,
                                  ['device_index', 'device_name'])

    def metric_for(self, device_name, device_index, device_handle):
        """Bind this metric to one device and return the labeled child gauge.

        The returned child reads its value on scrape through set_function;
        a failing collector yields 0 rather than breaking the scrape.
        """
        m = self._base_metric.labels(device_index=device_index,
                                     device_name=device_name)

        def collect():
            try:
                return self._collect(device_handle)
            # BUG FIX: was a bare `except:`, which also swallowed
            # SystemExit/KeyboardInterrupt; only genuine errors should
            # degrade to a 0 reading.
            except Exception:
                return 0

        m.set_function(collect)
        return m
def main():
    """Poll NVML for per-GPU memory/utilization stats and export them to Prometheus.

    Serves an HTTP endpoint when --port is given and/or pushes to a push
    gateway when --gateway is given; loops forever until interrupted.
    """
    parser = _create_parser()
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
    registry = core.REGISTRY
    total_fb_memory = Gauge('gpu_total_fb_memory_mb',
                            'Total installed frame buffer memory (in '
                            'megabytes)',
                            ['device'], registry=registry)
    free_fb_memory = Gauge('gpu_free_fb_memory_mb',
                           'Unallocated frame buffer memory (in '
                           'megabytes)',
                           ['device'], registry=registry)
    used_fb_memory = Gauge('gpu_used_fb_memory_mb',
                           'Allocated frame buffer memory (in megabytes).'
                           ' Note that the diver/GPU will always set '
                           'a small amount of memory fore bookkeeping.',
                           ['device'], registry=registry)
    gpu_utilization = Gauge('gpu_utilization_pct',
                            'Percent of time over the past sample period '
                            'during which one or more kernels was '
                            'executing on the GPU.',
                            ['device'], registry=registry)
    memory_utilization = Gauge('gpu_mem_utilization_pct',
                               'Percent of time over the past sample '
                               'period during which global (device) memory '
                               'was being read or written',
                               ['device'], registry=registry)

    # NVML reports memory sizes in bytes; the metrics above promise
    # megabytes. BUG FIX: the previous code divided by 1024 only, which
    # exported kilobytes under *_mb metric names.
    bytes_per_mb = 1024 * 1024

    iteration = 0
    try:
        log.debug('Initializing NVML...')
        nvmlInit()
        log.info('Started with nVidia driver version = %s',
                 nvmlSystemGetDriverVersion())
        device_count = nvmlDeviceGetCount()
        log.debug('%d devices found.', device_count)
        if args.port:
            log.debug('Starting http server on port %d', args.port)
            start_http_server(args.port)
            log.info('HTTP server started on port %d', args.port)
        while True:
            iteration += 1
            log.debug('Current iteration = %d', iteration)
            for i in range(device_count):
                log.debug('Analyzing device %d...', i)
                try:
                    log.debug('Obtaining handle for device %d...', i)
                    handle = nvmlDeviceGetHandleByIndex(i)
                    log.debug('Device handle for %d is %s', i, str(handle))
                    log.debug('Querying for memory information...')
                    mem_info = nvmlDeviceGetMemoryInfo(handle)
                    log.debug('Memory information = %s', str(mem_info))
                    total_fb_memory.labels(device=i).set(mem_info.total / bytes_per_mb)
                    free_fb_memory.labels(device=i).set(mem_info.free / bytes_per_mb)
                    used_fb_memory.labels(device=i).set(mem_info.used / bytes_per_mb)
                    log.debug('Obtaining utilization statistics...')
                    utilization = nvmlDeviceGetUtilizationRates(handle)
                    log.debug('Utilization statistics = %s', str(utilization))
                    # NVML utilization is 0-100; export as a 0.0-1.0 fraction.
                    gpu_utilization.labels(device=i).set(utilization.gpu / 100.0)
                    memory_utilization.labels(device=i).set(utilization.memory / 100.0)
                except Exception as e:
                    # Best effort per device: log and continue with the rest.
                    log.warning(e, exc_info=True)
            if args.gateway:
                log.debug('Pushing metrics to gateway at %s...', args.gateway)
                hostname = platform.node()
                push_to_gateway(args.gateway, job=hostname, registry=core.REGISTRY)
                log.debug('Push complete.')
            time.sleep(args.update_period)
    except Exception as e:
        log.error('Exception thrown - %s', e, exc_info=True)
    finally:
        nvmlShutdown()
logger.info("last detected event was at offset %s timestamp %s", offset, last_ts) else: last_ts = args.from_timestamp logger.info("detecting conflicts newer than %s", datetime.utcfromtimestamp(last_ts)) start_http_server(4240 + PARTITIONS[args.collector]) client = KafkaClient(args.our_servers.split(",")) stats = defaultdict(int) for msg in detect_conflicts(**kwargs): ts = msg.get("timestamp", 0) if last_ts is not None and ts <= last_ts: continue events_latency.labels(args.collector, str(msg["peer_as"])).set((datetime.utcnow() - datetime.utcfromtimestamp(ts)).seconds) for enrich_func in funcs: enrich_func(msg) filter_out = False # skip these events that are probably legitimate if "valid" in msg: validated.labels(args.collector).inc() filter_out = True if "relation" in msg: relation.labels(args.collector).inc() filter_out = True if "direct" in msg: connected.labels(args.collector).inc() filter_out = True
class ESGaugeMetric(object): def __init__(self, name, desc, labels, value, value_converter, url, query, logger=None): ''' name -- metric name (e.g. node_network_status) desc -- metric description labels -- indexes (tuple of strings) in metric_data taken as labels value -- index in metric_data (dict) taken as value for metric value_converter -- sometime value may came in mixed format like - 5s, 3GB. we need to convert this value to numeric. pass a function reference to this converter, can be lambda as well. url -- elasticsearch url to index or GET query query -- elasticsearch query data for POST request logger -- instance of logging.Logger class ''' self.gauge = Gauge(name, desc, list(labels)) self.name = name self.labels = labels self.value = value self.value_converter = value_converter self.url = url self.query = query self.logger = logger def path_converter(self, path): ''' convert path from indexA.indexB to ['indexA']['indexB'] path -- path in dot notaion return -- path in bracket notaion ''' elems = [] for elem in path.split('.'): bracket = "['{0}']".format(elem) elems.append(bracket) return ''.join(elems) def es_query(self, url, data): ''' query Elasticsearch cluster and return raw requests.Response object url -- url to elastic search e.g. 
- http://localhost:9200/bank/_search data -- query in json format - more info reffer to Elasticsearch documentation return -- raw requests.Response object ''' headers = {'Content-Type': 'application/json'} resp = requests.post(url, headers=headers, data=data) return resp def populate(self, metric_data): ''' populate labels and value with data metric_data -- dict object return -- metric_labels - dict with label=value, metric_value - converted value ''' try: converter = getattr(self, self.value_converter) except Exception: converter = self.value_converter value_path = self.path_converter(self.value) value_var = 'metric_data{0}'.format(value_path) metric_value = converter(eval(value_var)) metric_labels = {} for label_name, label_path in self.labels.iteritems(): label_path = self.path_converter(label_path) label_var = 'metric_data{0}'.format(label_path) metric_labels[label_name] = eval(label_var) return metric_labels, metric_value def print_metric(self, metric_labels, metric_value): ''' build and print metric metric_labels -- labels to print metric_value -- value to print ''' if metric_labels: label_value = [] for label, value in metric_labels.iteritems(): label_value.append('{l}={v}'.format(l=label, v=value)) # show labels in a log text = '{n}{{{lv}}} {v}'.format(n=self.name, lv=', '.join(label_value), v=metric_value) else: # there are no labels to show text = '{n} {v}'.format(n=self.name, v=metric_value) if self.logger: self.logger.info(text) else: print '[INFO]: {t}'.format(t=text) def update(self, print_metric=False): ''' query ES and update metric with newer value print_metric -- print metric to stdout (good for dev stage) ''' resp = self.es_query(self.url, data=self.query) metric_data = json.loads(resp.text) metric_labels, metric_value = self.populate(metric_data) if print_metric: self.print_metric(metric_labels, metric_value) if self.labels: self.gauge.labels(**metric_labels).set(metric_value) else: self.gauge.set(metric_value)