class Nova:
    """Collect Prometheus metrics about Nova hypervisors, instances and quotas.

    Loads a pre-fetched OpenStack state snapshot from the pickle cache file
    named by ``config['cache_file']`` and exposes derived metrics through a
    private CollectorRegistry.
    """

    def __init__(self):
        self.registry = CollectorRegistry()
        self.prodstack = {}
        # NOTE(review): pickle.load on the cache file — ensure the cache is
        # always produced by a trusted local process (pickle is not safe on
        # untrusted data).
        with open(config['cache_file'], 'rb') as f:
            self.prodstack = pickle.load(f)[0]
        self.hypervisors = self.prodstack['hypervisors']
        self.tenant_map = {t['id']: t['name']
                           for t in self.prodstack['tenants']}
        self.flavor_map = {
            f['id']: {'ram': f['ram'], 'disk': f['disk'], 'vcpus': f['vcpus']}
            for f in self.prodstack['flavors']
        }
        self.aggregate_map = {}
        self.services_map = {}
        for s in self.prodstack['services']:
            if s['binary'] == 'nova-compute':
                self.services_map[s['host']] = s['status']
        for agg in self.prodstack['aggregates']:
            self.aggregate_map.update({i: agg['name'] for i in agg['hosts']})

    def _get_schedulable_instances(self, host):
        """Return how many more 'schedulable_instance_size' instances fit on host.

        Takes the configured overcommit ratios into account and is bounded by
        whichever of vCPU, RAM or disk headroom runs out first.
        """
        free_vcpus = (host['vcpus'] * config['openstack_allocation_ratio_vcpu']
                      - host['vcpus_used'])
        free_ram_mbs = (host['memory_mb'] * config['openstack_allocation_ratio_ram']
                        - host['memory_mb_used'])
        free_disk_gbs = (host['local_gb'] * config['openstack_allocation_ratio_disk']
                         - host['local_gb_used'])
        s = config['schedulable_instance_size']
        return min(int(free_vcpus / s['vcpu']),
                   int(free_ram_mbs / s['ram_mbs']),
                   int(free_disk_gbs / s['disk_gbs']))

    def _get_schedulable_instances_capacity(self, host):
        """Return the total (not free) instance capacity of a host."""
        capacity_vcpus = host['vcpus'] * config['openstack_allocation_ratio_vcpu']
        capacity_ram_mbs = host['memory_mb'] * config['openstack_allocation_ratio_ram']
        capacity_disk_gbs = host['local_gb'] * config['openstack_allocation_ratio_disk']
        s = config['schedulable_instance_size']
        return min(int(capacity_vcpus / s['vcpu']),
                   int(capacity_ram_mbs / s['ram_mbs']),
                   int(capacity_disk_gbs / s['disk_gbs']))

    def gen_hypervisor_stats(self):
        """Export per-hypervisor capacity/usage gauges."""
        labels = ['cloud', 'hypervisor_hostname', 'aggregate',
                  'nova_service_status', 'arch']
        vms = Gauge('hypervisor_running_vms', 'Number of running VMs',
                    labels, registry=self.registry)
        vcpus_total = Gauge('hypervisor_vcpus_total', 'Total number of vCPUs',
                            labels, registry=self.registry)
        vcpus_used = Gauge('hypervisor_vcpus_used', 'Number of used vCPUs',
                           labels, registry=self.registry)
        mem_total = Gauge('hypervisor_memory_mbs_total',
                          'Total amount of memory in MBs', labels,
                          registry=self.registry)
        mem_used = Gauge('hypervisor_memory_mbs_used', 'Used memory in MBs',
                         labels, registry=self.registry)
        disk_total = Gauge('hypervisor_disk_gbs_total',
                           'Total amount of disk space in GBs', labels,
                           registry=self.registry)
        disk_used = Gauge('hypervisor_disk_gbs_used', 'Used disk space in GBs',
                          labels, registry=self.registry)
        schedulable_instances = Gauge(
            'hypervisor_schedulable_instances',
            'Number of schedulable instances, see "schedulable_instance_size" option',
            labels, registry=self.registry)
        schedulable_instances_capacity = Gauge(
            'hypervisor_schedulable_instances_capacity',
            'Number of schedulable instances we have capacity for',
            labels, registry=self.registry)

        def squashnone(val, default=0):
            # Disabled hypervisors report None for usage numbers.
            if val is None:
                return default
            return val

        for h in self.hypervisors:
            log.debug("Hypervisor: %s", h)
            host = h['service']['host']
            log.debug("host: %s", host)
            cpu_info = h['cpu_info']
            log.debug("cpu_info: %s", cpu_info)
            arch = 'Unknown'
            if not cpu_info:
                log.info("Could not get cpu info")
            else:
                # Fix: newer Nova APIs already return cpu_info as a dict;
                # previously 'arch' was only extracted after json.loads() of
                # a string payload, so dict payloads were reported 'Unknown'.
                if not isinstance(cpu_info, dict):
                    cpu_info = json.loads(cpu_info)
                arch = cpu_info.get('arch', 'Unknown')
            label_values = [config['cloud'], host,
                            self.aggregate_map.get(host, 'unknown'),
                            self.services_map[host], arch]
            # Disabled hypervisors return None below, convert to 0
            vms.labels(*label_values).set(squashnone(h['running_vms']))
            vcpus_total.labels(*label_values).set(squashnone(h['vcpus']))
            vcpus_used.labels(*label_values).set(squashnone(h['vcpus_used']))
            mem_total.labels(*label_values).set(squashnone(h['memory_mb']))
            mem_used.labels(*label_values).set(squashnone(h['memory_mb_used']))
            disk_total.labels(*label_values).set(squashnone(h['local_gb']))
            disk_used.labels(*label_values).set(squashnone(h['local_gb_used']))
            if config.get("schedulable_instance_size", False):
                schedulable_instances.labels(*label_values).set(
                    self._get_schedulable_instances(h))
                schedulable_instances_capacity.labels(*label_values).set(
                    self._get_schedulable_instances_capacity(h))

    def gen_instance_stats(self):
        """Export per-tenant instance counts and flavor-derived resource use."""
        missing_flavors = False
        instances = Gauge('nova_instances', 'Nova instances metrics',
                          ['cloud', 'tenant', 'instance_state'],
                          registry=self.registry)
        res_ram = Gauge('nova_resources_ram_mbs', 'Nova RAM usage metric',
                        ['cloud', 'tenant'], registry=self.registry)
        res_vcpus = Gauge('nova_resources_vcpus', 'Nova vCPU usage metric',
                          ['cloud', 'tenant'], registry=self.registry)
        res_disk = Gauge('nova_resources_disk_gbs', 'Nova disk usage metric',
                         ['cloud', 'tenant'], registry=self.registry)
        for i in self.prodstack['instances']:
            if i['tenant_id'] in self.tenant_map:
                tenant = self.tenant_map[i['tenant_id']]
            else:
                tenant = 'orphaned'
            instances.labels(config['cloud'], tenant, i['status']).inc()
            if i['flavor']['id'] in self.flavor_map:
                flavor = self.flavor_map[i['flavor']['id']]
                res_ram.labels(config['cloud'], tenant).inc(flavor['ram'])
                res_vcpus.labels(config['cloud'], tenant).inc(flavor['vcpus'])
                res_disk.labels(config['cloud'], tenant).inc(flavor['disk'])
            else:
                missing_flavors = True
        # If flavors were deleted we can't reliably find out resource use
        if missing_flavors:
            self.registry.unregister(res_ram)
            self.registry.unregister(res_vcpus)
            self.registry.unregister(res_disk)
            res_ram = Gauge(
                'nova_resources_ram_mbs',
                'Nova RAM usage metric unavailable, missing flavors',
                [], registry=self.registry)
            res_vcpus = Gauge(
                'nova_resources_vcpus',
                'Nova vCPU usage metric unavailable, missing flavors',
                [], registry=self.registry)
            res_disk = Gauge(
                'nova_resources_disk_gbs',
                'Nova disk usage metric unavailable, missing flavors',
                [], registry=self.registry)

    def gen_overcommit_stats(self):
        """Export the configured vcpu/ram/disk overcommit ratios."""
        labels = ['cloud', 'resource']
        openstack_overcommit = Gauge('openstack_allocation_ratio',
                                     'Openstack overcommit ratios', labels,
                                     registry=self.registry)
        openstack_overcommit.labels(config['cloud'], 'vcpu').set(
            config['openstack_allocation_ratio_vcpu'])
        openstack_overcommit.labels(config['cloud'], 'ram').set(
            config['openstack_allocation_ratio_ram'])
        openstack_overcommit.labels(config['cloud'], 'disk').set(
            config['openstack_allocation_ratio_disk'])

    def gen_quota_stats(self):
        """Export per-tenant Nova quota limits (and usage, where available)."""
        cores = Gauge('nova_quota_cores', 'Nova cores metric',
                      ['cloud', 'tenant', 'type'], registry=self.registry)
        fips = Gauge('nova_quota_floating_ips',
                     'Nova floating IP addresses (number)',
                     ['cloud', 'tenant', 'type'], registry=self.registry)
        inst = Gauge('nova_quota_instances', 'Nova instances (number)',
                     ['cloud', 'tenant', 'type'], registry=self.registry)
        ram = Gauge('nova_quota_ram_mbs', 'Nova RAM (MB)',
                    ['cloud', 'tenant', 'type'], registry=self.registry)
        for t, q in self.prodstack['nova_quotas'].items():
            if t in self.tenant_map:
                tenant = self.tenant_map[t]
            else:
                tenant = 'orphaned'
            # we get detailed quota information only on recent OS versions
            if isinstance(q['cores'], int):
                cores.labels(config['cloud'], tenant, 'limit').set(q['cores'])
                fips.labels(config['cloud'], tenant, 'limit').set(q['floating_ips'])
                inst.labels(config['cloud'], tenant, 'limit').set(q['instances'])
                ram.labels(config['cloud'], tenant, 'limit').set(q['ram'])
            else:
                for tt in ['limit', 'in_use', 'reserved']:
                    cores.labels(config['cloud'], tenant, tt).inc(q['cores'][tt])
                    fips.labels(config['cloud'], tenant, tt).inc(q['floating_ips'][tt])
                    inst.labels(config['cloud'], tenant, tt).inc(q['instances'][tt])
                    ram.labels(config['cloud'], tenant, tt).inc(q['ram'][tt])

    def get_stats(self):
        """Generate all metric families and return them in exposition format."""
        log.debug("get_stats")
        self.gen_hypervisor_stats()
        self.gen_instance_stats()
        self.gen_overcommit_stats()
        self.gen_quota_stats()
        return generate_latest(self.registry)
class BroadcastWebsocketStats:
    """Prometheus stats for one broadcast-websocket peer connection.

    Keeps message counters, connection state and a per-minute rate for the
    connection from ``local_hostname`` to ``remote_hostname`` in a private
    CollectorRegistry.
    """

    def __init__(self, local_hostname, remote_hostname):
        self._local_hostname = local_hostname
        self._remote_hostname = remote_hostname
        self._registry = CollectorRegistry()

        # TODO: More robust replacement
        self.name = self.safe_name(self._local_hostname)
        self.remote_name = self.safe_name(self._remote_hostname)
        self._messages_received_total = Counter(
            f'awx_{self.remote_name}_messages_received_total',
            'Number of messages received, to be forwarded, by the broadcast websocket system',
            registry=self._registry)
        self._messages_received = Gauge(
            f'awx_{self.remote_name}_messages_received',
            'Number forwarded messages received by the broadcast websocket system, for the duration of the current connection',
            registry=self._registry)
        self._connection = Enum(
            f'awx_{self.remote_name}_connection',
            'Websocket broadcast connection',
            states=['disconnected', 'connected'],
            registry=self._registry)
        self._connection_start = Gauge(
            f'awx_{self.remote_name}_connection_start',
            'Time the connection was established',
            registry=self._registry)
        self._messages_received_per_minute = Gauge(
            f'awx_{self.remote_name}_messages_received_per_minute',
            'Messages received per minute',
            registry=self._registry)
        self._internal_messages_received_per_minute = FixedSlidingWindow()
        # Fix: get_connection_duration() reads this attribute but it was
        # never assigned anywhere, so that call always raised AttributeError.
        self._connection_established_ts = None

    def safe_name(self, s):
        # Replace all non alpha-numeric characters with _
        return re.sub('[^0-9a-zA-Z]+', '_', s)

    def unregister(self):
        # Fix: CollectorRegistry.unregister() expects the collector object,
        # not the metric name string (passing a string raised at runtime).
        self._registry.unregister(self._messages_received)
        self._registry.unregister(self._connection)

    def record_message_received(self):
        """Count one forwarded message (total, per-connection, and rate)."""
        self._internal_messages_received_per_minute.record()
        self._messages_received.inc()
        self._messages_received_total.inc()

    def record_connection_established(self):
        """Mark the connection up and reset the per-connection counter."""
        self._connection.state('connected')
        self._connection_start.set_to_current_time()
        self._connection_established_ts = datetime.datetime.now()
        self._messages_received.set(0)

    def record_connection_lost(self):
        self._connection.state('disconnected')

    def get_connection_duration(self):
        """Return seconds since the last record_connection_established()."""
        return (datetime.datetime.now() -
                self._connection_established_ts).total_seconds()

    def render(self):
        """Refresh the messages-per-minute gauge from the sliding window."""
        msgs_per_min = self._internal_messages_received_per_minute.render()
        self._messages_received_per_minute.set(msgs_per_min)

    def serialize(self):
        """Return this registry in Prometheus exposition format as text."""
        self.render()
        registry_data = generate_latest(self._registry).decode('UTF-8')
        return registry_data
class QMetrics:
    """Create, track and push Prometheus job metrics to a Pushgateway.

    Gauges and counters are addressed by the integer handle returned from
    create_gauge()/create_counter().
    """

    # Default job key, overridable via the ``args`` dict at construction.
    job_key = 'unknown_metrics_key'

    def __init__(self, args=None):
        # Fix: ``args={}`` was a mutable default argument, and ``counters``/
        # ``gauges`` were class-level dicts mutated via index assignment, so
        # every QMetrics instance silently shared the same metric tables.
        args = {} if args is None else args
        if 'job_key' in args:
            self.job_key = args['job_key']
        self.counters = {}
        self.gauges = {}
        self.init_pushgateway_url()
        self.registry = CollectorRegistry()

    def create_gauge(self, metrics_key, metrics_name, data_keys):
        """Register a new Gauge and return its integer handle."""
        index = len(self.gauges) + 1
        self.gauges[index] = Gauge(metrics_key, metrics_name, data_keys,
                                   registry=self.registry)
        return index

    def reinit_gauge(self, index, metrics_key, metrics_name, data_keys):
        """Replace the gauge at ``index`` with a freshly registered one."""
        self.registry.unregister(self.gauges[index])
        self.gauges[index] = Gauge(metrics_key, metrics_name, data_keys,
                                   registry=self.registry)
        return index

    def set_gauge_to_now(self, index, *labels):
        """Set the labelled gauge to the current UTC timestamp."""
        now = datetime.datetime.utcnow().timestamp()
        self.gauges[index].labels(*labels).set(now)

    def gauge_set(self, index, *labels, value):
        """Set the labelled gauge to ``value``."""
        self.gauges[index].labels(*labels).set(value)

    def create_counter(self, metrics_key, metrics_name, data_keys):
        """Register a new Counter and return its integer handle."""
        index = len(self.counters) + 1
        self.counters[index] = Counter(metrics_key, metrics_name, data_keys,
                                       registry=self.registry)
        return index

    def init_pushgateway_url(self):
        """Build the Pushgateway address from PUSHGATEWAY_HOST/PORT env vars."""
        self.pushgateway_url = '{host}:{port}'.format(
            host=os.environ['PUSHGATEWAY_HOST'],
            port=os.environ['PUSHGATEWAY_PORT'])

    def counter_inc(self, index, *labels, increment=1):
        """Increment the labelled counter by ``increment`` (default 1)."""
        self.counters[index].labels(*labels).inc(increment)

    def gauge_inc(self, index, *labels, increment=1):
        """Increment the labelled gauge by ``increment`` (default 1)."""
        self.gauges[index].labels(*labels).inc(increment)

    def push_metrics(self):
        """Push the registry under a randomized job name; return that name.

        The random suffix keeps concurrent pushes from clobbering each other
        on the gateway.
        """
        job = '{job_key}_{random}'.format(
            job_key=self.job_key,
            random=random.randint(1000000000000, 100000000000000000))
        push_to_gateway(self.pushgateway_url, job=job, registry=self.registry)
        return job

    def delete(self, job_key):
        """Delete a previously pushed job from the Pushgateway; return the response."""
        r = requests.delete(
            'http://{pushgateway_url}/metrics/job/{job_key}'.format(
                pushgateway_url=self.pushgateway_url, job_key=job_key))
        return r