def write_dataset(self, gerrit):
    """Persist the collected dataset to disk.

    Writes the YAML config (via YamlConfig) and the buffered CSV contents
    to ``self.full_csv_path`` — but only when at least one observation was
    collected; an empty dataset is skipped entirely.

    :param gerrit: gerrit handle/config passed through to YamlConfig
                   (semantics defined by YamlConfig — not visible here)
    """
    # if dataset is empty then there is no need to write it
    if self.observations != {}:
        self.create_dataset()
        # renamed from `yaml` to avoid shadowing the common yaml module name
        yaml_config = YamlConfig(gerrit, self)
        yaml_config.write_file()
        # `with` guarantees the file handle is closed even if write() raises
        # (the original open/close pair leaked the handle on error)
        with open(self.full_csv_path, 'w') as fh:
            fh.write(self.file_contents.getvalue())
def main():
    """Entry point: poll one vcenter for VM or datastore metrics and expose
    them as prometheus gauges via an embedded http server.

    Reads a YAML config (-c/--config), connects to the vcenter, registers
    the gauges, then loops forever collecting stats roughly every
    ``interval`` seconds. With -d/--datastore it collects datastore
    metrics instead of VM metrics.
    """
    # config file parsing
    parser = argparse.ArgumentParser()
    parser.add_argument("-c", "--config", help="Specify config file", metavar="FILE")
    parser.add_argument("-d", "--datastore",
                        help="Get metrics for datastores instead of vms",
                        action='store_true')
    args, remaining_argv = parser.parse_known_args()
    config = YamlConfig(args.config, defaults)
    # list of vm properties we are using and which we get via property collector later
    # see: http://goo.gl/fjTEpW for all properties.
    # once for vms and once for datastores ... and some other stuff, which differs for the two cases
    if args.datastore == False:
        my_properties = [
            "runtime.powerState", "runtime.host", "config.annotation",
            "config.name", "config.instanceUuid", "config.guestId",
            "summary.config.vmPathName"
        ]
        my_name = "vm"
        my_obj_type = vim.VirtualMachine
    else:
        my_properties = [
            "summary.accessible", "summary.capacity", "summary.freeSpace",
            "summary.maintenanceMode", "summary.name", "summary.type",
            "summary.url", "overallStatus"
        ]
        my_name = "datastore"
        my_obj_type = vim.Datastore
    # set default log level if not defined in config file
    if config.get('main').get('log'):
        logger.setLevel(
            logging.getLevelName(config.get('main').get('log').upper()))
    else:
        logger.setLevel('INFO')
    FORMAT = '[%(asctime)s] [%(levelname)s] %(message)s'
    logging.basicConfig(stream=sys.stdout, format=FORMAT)
    # check for insecure ssl option
    si = None
    context = None
    if config.get('main').get('ignore_ssl') and \
            hasattr(ssl, "_create_unverified_context"):
        context = ssl._create_unverified_context()
    # connect to vcenter
    try:
        si = SmartConnect(host=config.get('main').get('host'),
                          user=config.get('main').get('user'),
                          pwd=config.get('main').get('password'),
                          port=int(config.get('main').get('port')),
                          sslContext=context)
        atexit.register(Disconnect, si)
    except IOError as e:
        # NOTE(review): concatenating a str with an exception object raises
        # TypeError on Python 3 - should be str(e); confirm target runtime.
        logging.error("Could not connect to vcenter." + e)
    if not si:
        raise SystemExit("Unable to connect to host with supplied info.")
    content = si.RetrieveContent()
    perfManager = content.perfManager
    # get the datacenter info
    datacenter = si.content.rootFolder.childEntity[0]
    datacentername = datacenter.name
    logging.debug('datacenter name: ' + datacentername)
    # create a list of vim.VirtualMachine / vim.Datastore objects so that we can query them for statistics
    container = content.rootFolder
    viewType = [my_obj_type]  # NOTE(review): apparently unused below - verify
    recursive = True
    # initialize some variables
    counterInfo = {}
    gauge = {}
    # time intervall to average vcenter data across in seconds
    interval = int(config.get('main').get('interval'))
    # compile a regex for trying to filter out openstack generated vms - they all have the "name:" field set
    openstack_match_regex = re.compile("^name:")
    # compile a regex for stripping out not required parts of hostnames etc. to have shorter label names (for better grafana display)
    if config.get('main').get('shorter_names_regex'):
        shorter_names_regex = re.compile(
            config.get('main').get('shorter_names_regex'))
    else:
        shorter_names_regex = re.compile('')
    logging.debug("name shortening regex: " +
                  str(config.get('main').get('shorter_names_regex')))
    # compile a regex for matching the vcenter_node name, so that we can deal only with the matching node or bb with this vcenter-exporter
    if config.get('main').get('host_match_regex'):
        host_match_regex = re.compile(
            config.get('main').get('host_match_regex'))
    else:
        host_match_regex = re.compile('')
    logging.debug("vcenter_node name (host) regex: " +
                  str(config.get('main').get('host_match_regex')))
    # compile a regex for matching the vmware_name against machines we do not want to collect metrics for (canary, blackbox vms etc.)
    if config.get('main').get('ignore_match_regex'):
        ignore_match_regex = re.compile(
            config.get('main').get('ignore_match_regex'))
    else:
        # sentinel pattern: deliberately matches nothing so no vm is ignored
        ignore_match_regex = re.compile(
            'this_string_will_definitely_not_match_any_vmware_name')
    logging.debug("vmware name ignore regex: " +
                  str(config.get('main').get('ignore_match_regex')))
    # create a mapping from performance stats to their counterIDs
    # counterInfo: [performance stat => counterId]
    # performance stat example: cpu.usagemhz.LATEST
    # counterId example: 6
    # level defines the amounts of metrics available and its default setting in the vcenter here is 1
    counterids = perfManager.QueryPerfCounterByLevel(level=4)
    # start up the http server to expose the prometheus metrics
    start_http_server(int(config.get('main').get('listen_port')))
    if args.datastore == False:
        logging.debug('list of all available metrics and their counterids')
        # loop over all counterids and build their full name and a dict relating it to the ids
        for c in counterids:
            fullName = c.groupInfo.key + "." + c.nameInfo.key + "." + c.rollupType
            logging.debug(fullName + ': ' + str(c.key))
            counterInfo[fullName] = c.key
            # define a dict of vm gauges for the counter ids
            gauge['vcenter_' + fullName.replace('.', '_')] = Gauge(
                'vcenter_' + fullName.replace('.', '_'),
                'vcenter_' + fullName.replace('.', '_'), [
                    'vmware_name', 'project_id', 'vcenter_name',
                    'vcenter_node', 'instance_uuid', 'guest_id', 'datastore',
                    'metric_detail'
                ])
        # in case we have a configured set of metrics to handle, use those - otherwise use all we can get
        selected_metrics = config.get('main').get('vm_metrics')
        if selected_metrics:
            counterIDs = [
                counterInfo[i] for i in selected_metrics if i in counterInfo
            ]
        else:
            counterIDs = [i.key for i in counterids]
    else:
        # define the gauges - they have to be defined by hand for the datastores, as there is no clear pattern behind
        gauge['vcenter_datastore_accessible'] = Gauge(
            'vcenter_datastore_accessible', 'vcenter_datastore_accessible',
            ['datastore_name', 'datastore_type', 'datastore_url'])
        gauge['vcenter_datastore_capacity'] = Gauge(
            'vcenter_datastore_capacity', 'vcenter_datastore_capacity',
            ['datastore_name', 'datastore_type', 'datastore_url'])
        gauge['vcenter_datastore_freespace'] = Gauge(
            'vcenter_datastore_freespace', 'vcenter_datastore_freespace',
            ['datastore_name', 'datastore_type', 'datastore_url'])
        gauge['vcenter_datastore_maintenancemode'] = Gauge(
            'vcenter_datastore_maintenancemode',
            'vcenter_datastore_maintenancemode',
            ['datastore_name', 'datastore_type', 'datastore_url'])
        gauge['vcenter_datastore_overallstatus'] = Gauge(
            'vcenter_datastore_overallstatus',
            'vcenter_datastore_overallstatus',
            ['datastore_name', 'datastore_type', 'datastore_url'])
    # infinite loop for getting the metrics
    while True:
        logging.debug('====> total loop start: %s' % datetime.now())
        # get the start time of the loop to be able to fill it to intervall exactly at the end
        loop_start_time = int(time.time())
        # first the vm metric case
        if args.datastore == False:
            # get all the data regarding vcenter hosts
            hostView = content.viewManager.CreateContainerView(
                container, [vim.HostSystem], recursive)
            hostssystems = hostView.view
            # build a dict to lookup the hostname by its id later
            hostsystemsdict = {}
            for host in hostssystems:
                hostsystemsdict[host] = host.name
            logging.debug(
                'list of all available vcenter nodes and their internal id')
            logging.debug(hostsystemsdict)
        # collect the properties we are interested in
        view = get_container_view(si, obj_type=[my_obj_type])
        my_data = collect_properties(si,
                                     view_ref=view,
                                     obj_type=my_obj_type,
                                     path_set=my_properties,
                                     include_mors=True)
        my_count = 0
        # define the time range in seconds the metric data from the vcenter should be averaged across
        # all based on vcenter time
        vchtime = si.CurrentTime()
        startTime = vchtime - timedelta(seconds=(interval + 60))
        endTime = vchtime - timedelta(seconds=60)
        # loop over all vmware machines
        for item in my_data:
            try:
                if args.datastore == False:
                    # only consider machines which have an annotation, are powered on, match our regex for the host system and are not in the ignore list
                    if (item["runtime.powerState"] == "poweredOn"
                            and openstack_match_regex.match(
                                item["config.annotation"])
                            and host_match_regex.match(
                                hostsystemsdict[item["runtime.host"]])
                            ) and not ignore_match_regex.match(
                                item["config.name"]):
                        logging.debug('current vm processed - ' +
                                      item["config.name"])
                        logging.debug('==> running on vcenter node: ' +
                                      hostsystemsdict[item["runtime.host"]])
                        # split the multi-line annotation into a dict per property (name, project-id, ...)
                        annotation_lines = item["config.annotation"].split(
                            '\n')
                        # rename flavor: to flavor_, so that it does not break the split on : below
                        annotation_lines = [
                            w.replace('flavor:', 'flavor_')
                            for w in annotation_lines
                        ]
                        # the filter is for filtering out empty lines
                        annotations = dict(
                            s.split(':', 1)
                            for s in filter(None, annotation_lines))
                        # datastore name: text between the [] of the vm path
                        datastore = item["summary.config.vmPathName"].split(
                            '[', 1)[1].split(']')[0]
                        # get a list of metricids for this vm in preparation for the stats query
                        metricIDs = [
                            vim.PerformanceManager.MetricId(counterId=i,
                                                            instance="*")
                            for i in counterIDs
                        ]
                        # query spec for the metric stats query, the intervalId is the default one
                        logging.debug(
                            '==> vim.PerformanceManager.QuerySpec start: %s' %
                            datetime.now())
                        spec = vim.PerformanceManager.QuerySpec(
                            maxSample=1,
                            entity=item["obj"],
                            metricId=metricIDs,
                            intervalId=20,
                            startTime=startTime,
                            endTime=endTime)
                        logging.debug(
                            '==> vim.PerformanceManager.QuerySpec end: %s' %
                            datetime.now())
                        # get metric stats from vcenter
                        logging.debug('==> perfManager.QueryStats start: %s' %
                                      datetime.now())
                        result = perfManager.QueryStats(querySpec=[spec])
                        logging.debug('==> perfManager.QueryStats end: %s' %
                                      datetime.now())
                        # loop over the metrics
                        logging.debug('==> gauge loop start: %s' %
                                      datetime.now())
                        for val in result[0].value:
                            # send gauges to prometheus exporter: metricname and value with
                            # labels: vm name, project id, vcenter name, vcneter
                            # node, instance uuid and metric detail (for instance a partition
                            # for io or an interface for net metrics) - we update the gauge
                            # only if the value is not -1 which means the vcenter has no value
                            if val.value[0] != -1:
                                if val.id.instance == '':
                                    metric_detail = 'total'
                                else:
                                    metric_detail = val.id.instance
                                # NOTE(review): indexing dict views like
                                # counterInfo.keys()[...] only works on
                                # Python 2 - breaks on Python 3; confirm.
                                gauge['vcenter_' +
                                      counterInfo.keys()[counterInfo.values(
                                      ).index(val.id.counterId)].replace(
                                          '.', '_')].labels(
                                              annotations['name'],
                                              annotations['projectid'],
                                              datacentername,
                                              shorter_names_regex.sub(
                                                  '', hostsystemsdict[
                                                      item["runtime.host"]]),
                                              item["config.instanceUuid"],
                                              item["config.guestId"],
                                              datastore,
                                              metric_detail).set(val.value[0])
                        logging.debug('==> gauge loop end: %s' %
                                      datetime.now())
                # alternatively the datastore metric case
                else:
                    logging.debug('current datastore processed - ' +
                                  item["summary.name"])
                    logging.debug('==> accessible: ' +
                                  str(item["summary.accessible"]))
                    # convert strings to numbers, so that we can generate a prometheus metric from them
                    if item["summary.accessible"] == True:
                        number_accessible = 1
                    else:
                        number_accessible = 0
                    logging.debug('==> capacity: ' +
                                  str(item["summary.capacity"]))
                    logging.debug('==> freeSpace: ' +
                                  str(item["summary.freeSpace"]))
                    logging.debug('==> maintenanceMode: ' +
                                  str(item["summary.maintenanceMode"]))
                    # convert strings to numbers, so that we can generate a prometheus metric from them
                    if item["summary.maintenanceMode"] == "normal":
                        number_maintenanceMode = 0
                    else:
                        # fallback to note if we do not yet catch a value
                        number_maintenanceMode = -1
                        logging.info(
                            'unexpected maintenanceMode for datastore ' +
                            item["summary.name"])
                    logging.debug('==> type: ' + str(item["summary.type"]))
                    logging.debug('==> url: ' + str(item["summary.url"]))
                    logging.debug('==> overallStatus: ' +
                                  str(item["overallStatus"]))
                    # convert strings to numbers, so that we can generate a prometheus metric from them
                    if item["overallStatus"] == "green":
                        number_overallStatus = 0
                    elif item["overallStatus"] == "yellow":
                        number_overallStatus = 1
                    elif item["overallStatus"] == "red":
                        number_overallStatus = 2
                    else:
                        # fallback to note if we do not yet catch a value
                        number_overallStatus = -1
                        logging.info(
                            'unexpected overallStatus for datastore ' +
                            item["summary.name"])
                    # set the gauges for the datastore properties
                    logging.debug('==> gauge start: %s' % datetime.now())
                    gauge['vcenter_datastore_accessible'].labels(
                        item["summary.name"], item["summary.type"],
                        item["summary.url"]).set(number_accessible)
                    gauge['vcenter_datastore_capacity'].labels(
                        item["summary.name"], item["summary.type"],
                        item["summary.url"]).set(item["summary.capacity"])
                    gauge['vcenter_datastore_freespace'].labels(
                        item["summary.name"], item["summary.type"],
                        item["summary.url"]).set(item["summary.freeSpace"])
                    gauge['vcenter_datastore_maintenancemode'].labels(
                        item["summary.name"], item["summary.type"],
                        item["summary.url"]).set(number_maintenanceMode)
                    gauge['vcenter_datastore_overallstatus'].labels(
                        item["summary.name"], item["summary.type"],
                        item["summary.url"]).set(number_overallStatus)
                    logging.debug('==> gauge end: %s' % datetime.now())
                my_count += 1
            except IndexError:
                # objects can vanish between the property collection and the
                # stats query - just note it and carry on with the next one
                logging.info('a ' + my_name + ' disappeared during processing')
        loop_end_time = int(time.time())
        logging.info('number of ' + my_name + 's we got metrics for: ' +
                     str(my_count) + ' - actual runtime: ' +
                     str(loop_end_time - loop_start_time) + 's')
        # this is the time we sleep to fill the loop runtime until it reaches "interval"
        # the 0.9 makes sure we have some overlap to the last interval to avoid gaps in
        # metrics coverage (i.e. we get the metrics quicker than the averaging time)
        loop_sleep_time = 0.9 * interval - (loop_end_time - loop_start_time)
        if loop_sleep_time < 0:
            # NOTE(review): logging.warn is deprecated in favour of
            # logging.warning
            logging.warn(
                'getting the metrics takes around ' + str(interval) +
                ' seconds or longer - please increase the interval setting')
            loop_sleep_time = 0
        logging.debug('====> loop end before sleep: %s' % datetime.now())
        time.sleep(int(loop_sleep_time))
        logging.debug('====> total loop end: %s' % datetime.now())
def set_config():
    """Load the YAML config file named by the parsed CLI ``args`` into the
    module-level ``_CONFIG`` dict.

    Raises:
        Exception: with a descriptive message when the configured file
            does not exist (the original raised a bare ``Exception`` with
            no context, which made failures undiagnosable).
    """
    global _CONFIG
    if os.path.isfile(args.config_file):
        _CONFIG.update(YamlConfig(args.config_file, None))
    else:
        # include the offending path so the operator can see what is missing
        raise Exception("config file not found: %s" % args.config_file)
# metrics coverage (i.e. we get the metrics quicker than the averaging time) loop_sleep_time = 0.9 * \ self.configs['main']['vc_polling_interval'] - \ (loop_end_time - loop_start_time) if loop_sleep_time < 0: logging.warn('getting the metrics takes around ' + str( self.configs['main']['vc_polling_interval']) + ' seconds or longer - please increase the interval setting') loop_sleep_time = 0 logging.debug('====> loop end before sleep: %s' % datetime.now()) time.sleep(int(loop_sleep_time)) logging.debug('====> total loop end: %s' % datetime.now()) if __name__ == "__main__": # config file parsing parser = argparse.ArgumentParser() parser.add_argument( "-c", "--config", help="Specify config file", metavar="FILE", default="config.yaml") parser.add_argument( "-t", "--type", help="The type of exporter [VM, versions, datastores]", default="versionsandapi") args, remaining_argv = parser.parse_known_args() config = YamlConfig(args.config, VcenterExporter.defaults) if args.type.upper() not in VcenterExporter.supported_types: sys.exit("Current supported exporter types [--t] are " + ', '.join(VcenterExporter.supported_types)) vcenter_exporter = VcenterExporter(config, args.type) vcenter_exporter.collect_metrics()
class VMWareMetricsResource(Resource): """ VMWare twisted ``Resource`` handling multi endpoints Only handle /metrics and /healthz path """ isLeaf = True def __init__(self): """ Init Metric Resource """ Resource.__init__(self) self.threader = Threader() def configure(self, args): if args.config_file: try: self.config = YamlConfig(args.config_file) if 'default' not in self.config.keys(): log("Error, you must have a default section in config file (for now)" ) exit(1) except Exception as exception: raise SystemExit( "Error while reading configuration file: {0}".format( exception.message)) else: config_data = """ default: vsphere_host: "{0}" vsphere_user: "******" vsphere_password: "******" ignore_ssl: {3} collect_only: vms: True vmguests: True datastores: True hosts: True snapshots: True """.format(os.environ.get('VSPHERE_HOST'), os.environ.get('VSPHERE_USER'), os.environ.get('VSPHERE_PASSWORD'), os.environ.get('VSPHERE_IGNORE_SSL', False)) self.config = yaml.load(config_data) self.config['default']['collect_only']['hosts'] = os.environ.get( 'VSPHERE_COLLECT_HOSTS', True) self.config['default']['collect_only'][ 'datastores'] = os.environ.get('VSPHERE_COLLECT_DATASTORES', True) self.config['default']['collect_only']['vms'] = os.environ.get( 'VSPHERE_COLLECT_VMS', True) self.config['default']['collect_only'][ 'vmguests'] = os.environ.get('VSPHERE_COLLECT_VMGUESTS', True) self.config['default']['collect_only'][ 'snapshots'] = os.environ.get('VSPHERE_COLLECT_SNAPSHOTS', True) def render_GET(self, request): """ handles get requests for metrics, health, and everything else """ path = request.path.decode() request.setHeader("Content-Type", "text/plain; charset=UTF-8") if path == '/metrics': deferred_request = deferLater(reactor, 0, lambda: request) deferred_request.addCallback(self.generate_latest_metrics) deferred_request.addErrback(self.errback, request) return NOT_DONE_YET elif path == '/healthz': request.setResponseCode(200) log("Service is UP") return 'Server is 
UP'.encode() else: log("Uri not found: " + request.uri) request.setResponseCode(404) return '404 Not Found'.encode() def errback(self, failure, request): """ handles failures from requests """ failure.printTraceback() log(failure) request.processingFailed( failure ) # This will send a trace to the browser and close the request. return None def generate_latest_metrics(self, request): """ gets the latest metrics """ section = request.args.get('section', ['default'])[0] if self.config[section].get('vsphere_host') and self.config[ section].get('vsphere_host') != "None": vsphere_host = self.config[section].get('vsphere_host') elif request.args.get('target', [None])[0]: vsphere_host = request.args.get('target', [None])[0] elif request.args.get('vsphere_host', [None])[0]: vsphere_host = request.args.get('vsphere_host')[0] else: request.setResponseCode(500) log("No vsphere_host or target defined") request.write('No vsphere_host or target defined!\n') request.finish() output = [] for metric in self.collect(vsphere_host, section): output.append('# HELP {0} {1}'.format( metric.name, metric.documentation.replace('\\', r'\\').replace('\n', r'\n'))) output.append('\n# TYPE {0} {1}\n'.format(metric.name, metric.type)) for name, labels, value in metric.samples: if labels: labelstr = '{{{0}}}'.format(','.join([ '{0}="{1}"'.format( k, v.replace('\\', r'\\').replace('\n', r'\n').replace( '"', r'\"')) for k, v in sorted(labels.items()) ])) else: labelstr = '' if isinstance(value, int): value = float(value) if isinstance(value, long): # noqa: F821 value = float(value) if isinstance(value, float): output.append('{0}{1} {2}\n'.format( name, labelstr, _floatToGoString(value))) if output != []: request.write(''.join(output).encode('utf-8')) request.finish() else: request.setResponseCode(500, message=('cannot connect to vmware')) request.finish() return def collect(self, vsphere_host, section='default'): """ collects metrics """ if section not in self.config.keys(): log("{} is not a valid 
section, using default".format(section)) section = 'default' host_inventory = {} ds_inventory = {} metric_list = {} metric_list['vms'] = { 'vmware_vm_power_state': GaugeMetricFamily( 'vmware_vm_power_state', 'VMWare VM Power state (On / Off)', labels=['vm_name', 'host_name', 'dc_name', 'cluster_name']), 'vmware_vm_boot_timestamp_seconds': GaugeMetricFamily( 'vmware_vm_boot_timestamp_seconds', 'VMWare VM boot time in seconds', labels=['vm_name', 'host_name', 'dc_name', 'cluster_name']), 'vmware_vm_num_cpu': GaugeMetricFamily( 'vmware_vm_num_cpu', 'VMWare Number of processors in the virtual machine', labels=['vm_name', 'host_name', 'dc_name', 'cluster_name']), } metric_list['vmguests'] = { 'vmware_vm_guest_disk_free': GaugeMetricFamily('vmware_vm_guest_disk_free', 'Disk metric per partition', labels=[ 'vm_name', 'host_name', 'dc_name', 'cluster_name', 'partition', ]), 'vmware_vm_guest_disk_capacity': GaugeMetricFamily('vmware_vm_guest_disk_capacity', 'Disk capacity metric per partition', labels=[ 'vm_name', 'host_name', 'dc_name', 'cluster_name', 'partition', ]), } metric_list['snapshots'] = { 'vmware_vm_snapshots': GaugeMetricFamily( 'vmware_vm_snapshots', 'VMWare current number of existing snapshots', labels=['vm_name', 'host_name', 'dc_name', 'cluster_name']), 'vmware_vm_snapshot_timestamp_seconds': GaugeMetricFamily('vmware_vm_snapshot_timestamp_seconds', 'VMWare Snapshot creation time in seconds', labels=[ 'vm_name', 'host_name', 'dc_name', 'cluster_name', 'vm_snapshot_name' ]), } metric_list['datastores'] = { 'vmware_datastore_capacity_size': GaugeMetricFamily('vmware_datastore_capacity_size', 'VMWare Datasore capacity in bytes', labels=['ds_name', 'dc_name', 'ds_cluster']), 'vmware_datastore_freespace_size': GaugeMetricFamily('vmware_datastore_freespace_size', 'VMWare Datastore freespace in bytes', labels=['ds_name', 'dc_name', 'ds_cluster']), 'vmware_datastore_uncommited_size': GaugeMetricFamily('vmware_datastore_uncommited_size', 'VMWare Datastore 
uncommitted in bytes', labels=['ds_name', 'dc_name', 'ds_cluster']), 'vmware_datastore_provisoned_size': GaugeMetricFamily('vmware_datastore_provisoned_size', 'VMWare Datastore provisoned in bytes', labels=['ds_name', 'dc_name', 'ds_cluster']), 'vmware_datastore_hosts': GaugeMetricFamily('vmware_datastore_hosts', 'VMWare Hosts number using this datastore', labels=['ds_name', 'dc_name', 'ds_cluster']), 'vmware_datastore_vms': GaugeMetricFamily('vmware_datastore_vms', 'VMWare Virtual Machines count per datastore', labels=['ds_name', 'dc_name', 'ds_cluster']), 'vmware_datastore_maintenance_mode': GaugeMetricFamily( 'vmware_datastore_maintenance_mode', 'VMWare datastore maintenance mode (normal / inMaintenance / enteringMaintenance)', labels=['ds_name', 'dc_name', 'ds_cluster', 'mode']), 'vmware_datastore_type': GaugeMetricFamily( 'vmware_datastore_type', 'VMWare datastore type (VMFS, NetworkFileSystem, NetworkFileSystem41, CIFS, VFAT, VSAN, VFFS)', labels=['ds_name', 'dc_name', 'ds_cluster', 'ds_type']), 'vmware_datastore_accessible': GaugeMetricFamily('vmware_datastore_accessible', 'VMWare datastore accessible (true / false)', labels=['ds_name', 'dc_name', 'ds_cluster']) } metric_list['hosts'] = { 'vmware_host_power_state': GaugeMetricFamily('vmware_host_power_state', 'VMWare Host Power state (On / Off)', labels=['host_name', 'dc_name', 'cluster_name']), 'vmware_host_connection_state': GaugeMetricFamily( 'vmware_host_connection_state', 'VMWare Host connection state (connected / disconnected / notResponding)', labels=['host_name', 'dc_name', 'cluster_name', 'state']), 'vmware_host_maintenance_mode': GaugeMetricFamily('vmware_host_maintenance_mode', 'VMWare Host maintenance mode (true / false)', labels=['host_name', 'dc_name', 'cluster_name']), 'vmware_host_boot_timestamp_seconds': GaugeMetricFamily('vmware_host_boot_timestamp_seconds', 'VMWare Host boot time in seconds', labels=['host_name', 'dc_name', 'cluster_name']), 'vmware_host_cpu_usage': 
GaugeMetricFamily('vmware_host_cpu_usage', 'VMWare Host CPU usage in Mhz', labels=['host_name', 'dc_name', 'cluster_name']), 'vmware_host_cpu_max': GaugeMetricFamily('vmware_host_cpu_max', 'VMWare Host CPU max availability in Mhz', labels=['host_name', 'dc_name', 'cluster_name']), 'vmware_host_memory_usage': GaugeMetricFamily('vmware_host_memory_usage', 'VMWare Host Memory usage in Mbytes', labels=['host_name', 'dc_name', 'cluster_name']), 'vmware_host_memory_max': GaugeMetricFamily('vmware_host_memory_max', 'VMWare Host Memory Max availability in Mbytes', labels=['host_name', 'dc_name', 'cluster_name']), } metrics = {} for key, value in self.config[section]['collect_only'].items(): if value is True: metrics.update(metric_list[key]) log("Start collecting metrics from {0}".format(vsphere_host)) self.vmware_connection = self._vmware_connect(vsphere_host, section) if not self.vmware_connection: log("Cannot connect to vmware") return content = self.vmware_connection.RetrieveContent() # Generate inventory dict log("Starting inventory collection") host_inventory, ds_inventory = self._vmware_get_inventory(content) log("Finished inventory collection") # Collect VMs metrics if self.config[section]['collect_only']['vms'] is True: log("Starting VM performance metrics collection") counter_info = self._vmware_perf_metrics(content) self._vmware_get_vms(content, metrics, counter_info, host_inventory) log("Finished VM performance metrics collection") # Collect VMs metrics if self.config[section]['collect_only']['vmguests'] is True: log("Starting VM Guests metrics collection") self._vmware_get_vmguests(content, metrics, host_inventory) log("Finished VM Guests metrics collection") # Collect Snapshots (count and age) if self.config[section]['collect_only']['snapshots'] is True: log("Starting VM snapshot metric collection") vm_snap_counts, vm_snap_ages = self._vmware_get_snapshots( content, host_inventory) for v in vm_snap_counts: metrics['vmware_vm_snapshots'].add_metric([ 
v['vm_name'], v['vm_host_name'], v['vm_dc_name'], v['vm_cluster_name'] ], v['vm_snapshot_count']) for vm_snap_age in vm_snap_ages: for v in vm_snap_age: metrics['vmware_vm_snapshot_timestamp_seconds'].add_metric( [ v['vm_name'], v['vm_host_name'], v['vm_dc_name'], v['vm_cluster_name'], v['vm_snapshot_name'] ], v['vm_snapshot_timestamp_seconds']) log("Finished VM snapshot metric collection") # Collect Datastore metrics if self.config[section]['collect_only']['datastores'] is True: log("Starting datastore metrics collection") self._vmware_get_datastores(content, metrics, ds_inventory) log("Finished datastore metrics collection") # Collect Hosts metrics if self.config[section]['collect_only']['hosts'] is True: log("Starting host metrics collection") self._vmware_get_hosts(content, metrics, host_inventory) log("Finished host metrics collection") log("Finished collecting metrics from {0}".format(vsphere_host)) self.threader.join() self._vmware_disconnect() for _key, metric in metrics.items(): yield metric def _to_epoch(self, my_date): """ convert to epoch time """ return (my_date - datetime(1970, 1, 1, tzinfo=pytz.utc)).total_seconds() def _vmware_get_obj(self, content, vimtype, name=None): """ Get the vsphere object associated with a given text name """ obj = None container = content.viewManager.CreateContainerView( content.rootFolder, vimtype, True) if name: for view in container.view: if view.name == name: obj = view return [obj] else: return container.view def _vmware_connect(self, vsphere_host, section): """ Connect to Vcenter and get connection """ vsphere_user = self.config[section].get('vsphere_user') vsphere_password = self.config[section].get('vsphere_password') context = None if self.config[section].get('ignore_ssl') and \ hasattr(ssl, "_create_unverified_context"): context = ssl._create_unverified_context() try: vmware_connect = connect.SmartConnect(host=vsphere_host, user=vsphere_user, pwd=vsphere_password, sslContext=context) return vmware_connect except 
vmodl.MethodFault as error: log("Caught vmodl fault: " + error.msg) return None def _vmware_disconnect(self): """ Disconnect from Vcenter """ connect.Disconnect(self.vmware_connection) def _vmware_perf_metrics(self, content): """ create a mapping from performance stats to their counterIDs counter_info: [performance stat => counterId] performance stat example: cpu.usagemhz.LATEST """ counter_info = {} for counter in content.perfManager.perfCounter: prefix = counter.groupInfo.key counter_full = "{}.{}.{}".format(prefix, counter.nameInfo.key, counter.rollupType) counter_info[counter_full] = counter.key return counter_info def _vmware_full_snapshots_list(self, snapshots): """ Get snapshots from a VM list, recursively """ snapshot_data = [] for snapshot in snapshots: snap_timestamp = self._to_epoch(snapshot.createTime) snap_info = { 'vm_snapshot_name': snapshot.name, 'vm_snapshot_timestamp_seconds': snap_timestamp } snapshot_data.append(snap_info) snapshot_data = snapshot_data + self._vmware_full_snapshots_list( snapshot.childSnapshotList) return snapshot_data def _vmware_get_snapshot_details(self, snapshots_count_table, snapshots_age_table, virtual_machine, inventory): """ Gathers snapshot details """ snapshot_paths = self._vmware_full_snapshots_list( virtual_machine.snapshot.rootSnapshotList) _, host_name, dc_name, cluster_name = self._vmware_vm_metadata( inventory, virtual_machine) for snapshot_path in snapshot_paths: snapshot_path['vm_name'] = virtual_machine.name snapshot_path['vm_host_name'] = host_name snapshot_path['vm_dc_name'] = dc_name snapshot_path['vm_cluster_name'] = cluster_name # Add Snapshot count per VM snapshot_count = len(snapshot_paths) snapshot_count_info = { 'vm_name': virtual_machine.name, 'vm_host_name': host_name, 'vm_dc_name': dc_name, 'vm_cluster_name': cluster_name, 'vm_snapshot_count': snapshot_count } snapshots_count_table.append(snapshot_count_info) snapshots_age_table.append(snapshot_paths) def _vmware_get_snapshots(self, content, 
inventory): """ Get snapshots from all VM """ snapshots_count_table = [] snapshots_age_table = [] virtual_machines = self._vmware_get_obj(content, [vim.VirtualMachine]) for virtual_machine in virtual_machines: if not virtual_machine or virtual_machine.snapshot is None: continue else: self.threader.thread_it(self._vmware_get_snapshot_details, [ snapshots_count_table, snapshots_age_table, virtual_machine, inventory ]) return snapshots_count_table, snapshots_age_table def _vmware_get_datastores(self, content, ds_metrics, inventory): """ Get Datastore information """ datastores = self._vmware_get_obj(content, [vim.Datastore]) for datastore in datastores: # ds.RefreshDatastoreStorageInfo() summary = datastore.summary ds_name = summary.name dc_name = inventory[ds_name]['dc'] ds_cluster = inventory[ds_name]['ds_cluster'] self.threader.thread_it( self._vmware_get_datastore_metrics, [datastore, dc_name, ds_cluster, ds_metrics, summary]) def _vmware_get_datastore_metrics(self, datastore, dc_name, ds_cluster, ds_metrics, summary): """ Get datastore metrics """ ds_capacity = float(summary.capacity) ds_freespace = float(summary.freeSpace) ds_uncommitted = float( summary.uncommitted) if summary.uncommitted else 0 ds_provisioned = ds_capacity - ds_freespace + ds_uncommitted ds_metrics['vmware_datastore_capacity_size'].add_metric( [summary.name, dc_name, ds_cluster], ds_capacity) ds_metrics['vmware_datastore_freespace_size'].add_metric( [summary.name, dc_name, ds_cluster], ds_freespace) ds_metrics['vmware_datastore_uncommited_size'].add_metric( [summary.name, dc_name, ds_cluster], ds_uncommitted) ds_metrics['vmware_datastore_provisoned_size'].add_metric( [summary.name, dc_name, ds_cluster], ds_provisioned) ds_metrics['vmware_datastore_hosts'].add_metric( [summary.name, dc_name, ds_cluster], len(datastore.host)) ds_metrics['vmware_datastore_vms'].add_metric( [summary.name, dc_name, ds_cluster], len(datastore.vm)) ds_metrics['vmware_datastore_maintenance_mode'].add_metric( 
[summary.name, dc_name, ds_cluster, summary.maintenanceMode], 1) ds_metrics['vmware_datastore_type'].add_metric( [summary.name, dc_name, ds_cluster, summary.type], 1) ds_metrics['vmware_datastore_accessible'].add_metric( [summary.name, dc_name, ds_cluster], summary.accessible * 1) def _vmware_get_vms(self, content, vm_metrics, counter_info, inventory): """ Get VM information """ # List of performance counter we want perf_list = [ 'cpu.ready.summation', 'cpu.usage.average', 'cpu.usagemhz.average', 'disk.usage.average', 'disk.read.average', 'disk.write.average', 'mem.usage.average', 'net.received.average', 'net.transmitted.average', ] # Prepare gauges for p in perf_list: p_metric = 'vmware_vm_' + p.replace('.', '_') vm_metrics[p_metric] = GaugeMetricFamily( p_metric, p_metric, labels=['vm_name', 'host_name', 'dc_name', 'cluster_name']) virtual_machines = self._vmware_get_obj(content, [vim.VirtualMachine]) log("Total Virtual Machines: {0}".format(len(virtual_machines))) for virtual_machine in virtual_machines: self.threader.thread_it(self._vmware_get_vm_perf_metrics, [ content, counter_info, perf_list, virtual_machine, vm_metrics, inventory ]) def _vmware_get_vm_perf_metrics(self, content, counter_info, perf_list, virtual_machine, vm_metrics, inventory): """ Loops over metrics in perf_list on vm """ # DEBUG ME: log("Starting VM: " + vm.name) summary = virtual_machine.summary vm_power_state = 1 if summary.runtime.powerState == 'poweredOn' else 0 vm_num_cpu = summary.config.numCpu vm_name, vm_host_name, vm_dc_name, vm_cluster_name = self._vmware_vm_metadata( inventory, virtual_machine, summary) vm_metadata = [vm_name, vm_host_name, vm_dc_name, vm_cluster_name] vm_metrics['vmware_vm_power_state'].add_metric(vm_metadata, vm_power_state) vm_metrics['vmware_vm_num_cpu'].add_metric(vm_metadata, vm_num_cpu) # Get metrics for poweredOn vms only if vm_power_state: if summary.runtime.bootTime: vm_metrics['vmware_vm_boot_timestamp_seconds'].add_metric( vm_metadata, 
self._to_epoch(summary.runtime.bootTime)) for p in perf_list: self.threader.thread_it(self._vmware_get_vm_perf_metric, [ content, counter_info, p, virtual_machine, vm_metrics, vm_metadata ]) # Debug Me. log("Finished VM: " + vm.name) def _vmware_get_vm_perf_metric(self, content, counter_info, perf_metric, virtual_machine, vm_metrics, vm_metadata): """ Get vm perf metric """ perf_metric_name = 'vmware_vm_' + perf_metric.replace('.', '_') counter_key = counter_info[perf_metric] metric_id = vim.PerformanceManager.MetricId(counterId=counter_key, instance='') spec = vim.PerformanceManager.QuerySpec(maxSample=1, entity=virtual_machine, metricId=[metric_id], intervalId=20) result = content.perfManager.QueryStats(querySpec=[spec]) # DEBUG ME: log("{0} {1}: {2}".format(vm.name, p, float(sum(result[0].value[0].value)))) try: vm_metrics[perf_metric_name].add_metric( vm_metadata, float(sum(result[0].value[0].value))) except: # noqa: E722 log("Error, cannot get vm metric {0} for {1}".format( perf_metric_name, vm_metadata)) def _vmware_get_vmguests(self, content, vmguest_metrics, inventory): """ Get VM Guest information """ virtual_machines = self._vmware_get_obj(content, [vim.VirtualMachine]) log("Total Virtual Machines: {0}".format(len(virtual_machines))) for virtual_machine in virtual_machines: self.threader.thread_it( self._vmware_get_vmguests_metrics, [content, virtual_machine, vmguest_metrics, inventory]) def _vmware_get_vmguests_metrics(self, content, virtual_machine, vmguest_metrics, inventory): """ Get VM Guest Metrics """ summary = virtual_machine.summary vm_name, vm_host_name, vm_dc_name, vm_cluster_name = self._vmware_vm_metadata( inventory, virtual_machine, summary) # gather disk metrics if len(virtual_machine.guest.disk) > 0: for disk in virtual_machine.guest.disk: vmguest_metrics['vmware_vm_guest_disk_free'].add_metric([ vm_name, vm_host_name, vm_dc_name, vm_cluster_name, disk.diskPath ], disk.freeSpace) 
vmguest_metrics['vmware_vm_guest_disk_capacity'].add_metric([ vm_name, vm_host_name, vm_dc_name, vm_cluster_name, disk.diskPath ], disk.capacity) def _vmware_get_hosts(self, content, host_metrics, inventory): """ Get Host (ESXi) information """ hosts = self._vmware_get_obj(content, [vim.HostSystem]) for host in hosts: summary = host.summary host_name, host_dc_name, host_cluster_name = self._vmware_host_metadata( inventory, host) host_metadata = [host_name, host_dc_name, host_cluster_name] # Power state power_state = 1 if summary.runtime.powerState == 'poweredOn' else 0 host_metrics['vmware_host_power_state'].add_metric( host_metadata, power_state) if power_state: self.threader.thread_it(self._vmware_get_host_metrics, [ host_name, host_dc_name, host_cluster_name, host_metrics, summary ]) def _vmware_get_host_metrics(self, host_name, host_dc_name, host_cluster_name, host_metrics, summary): """ Get Host Metrics """ labels = [host_name, host_dc_name, host_cluster_name] if summary.runtime.bootTime: # Host uptime host_metrics['vmware_host_boot_timestamp_seconds'].add_metric( labels, self._to_epoch(summary.runtime.bootTime)) # Host connection state (connected, disconnected, notResponding) metric_labels = labels metric_labels.append(summary.runtime.connectionState) host_metrics['vmware_host_connection_state'].add_metric( metric_labels, 1) # Host in maintenance mode? 
host_metrics['vmware_host_maintenance_mode'].add_metric( labels, summary.runtime.inMaintenanceMode * 1) # CPU Usage (in Mhz) host_metrics['vmware_host_cpu_usage'].add_metric( labels, summary.quickStats.overallCpuUsage) cpu_core_num = summary.hardware.numCpuCores cpu_total = summary.hardware.cpuMhz * cpu_core_num host_metrics['vmware_host_cpu_max'].add_metric(labels, cpu_total) # Memory Usage (in MB) host_metrics['vmware_host_memory_usage'].add_metric( labels, summary.quickStats.overallMemoryUsage) host_metrics['vmware_host_memory_max'].add_metric( labels, float(summary.hardware.memorySize) / 1024 / 1024) def _vmware_get_inventory(self, content): """ Get host and datastore inventory (datacenter, cluster) information """ host_inventory = {} ds_inventory = {} children = content.rootFolder.childEntity for child in children: # Iterate though DataCenters dc = child hostFolders = dc.hostFolder.childEntity for folder in hostFolders: # Iterate through host folders if isinstance( folder, vim.ClusterComputeResource): # Folder is a Cluster hosts = folder.host for host in hosts: # Iterate through Hosts in the Cluster host_name = host.summary.config.name host_inventory[host_name] = {} host_inventory[host_name]['dc'] = dc.name host_inventory[host_name]['cluster'] = folder.name else: # Unclustered host host_name = folder.name host_inventory[host_name] = {} host_inventory[host_name]['dc'] = dc.name host_inventory[host_name]['cluster'] = '' dsFolders = dc.datastoreFolder.childEntity for folder in dsFolders: # Iterate through datastore folders if isinstance(folder, vim.Datastore): # Unclustered datastore ds_inventory[folder.name] = {} ds_inventory[folder.name]['dc'] = dc.name ds_inventory[folder.name]['ds_cluster'] = '' else: # Folder is a Datastore Cluster datastores = folder.childEntity for datastore in datastores: ds_inventory[datastore.name] = {} ds_inventory[datastore.name]['dc'] = dc.name ds_inventory[ datastore.name]['ds_cluster'] = folder.name return host_inventory, 
ds_inventory def _vmware_vm_metadata(self, inventory, vm, summary=None): """ Get VM metadata from inventory """ if summary is None: summary = vm.summary vm_name = vm.name vm_host = summary.runtime.host vm_host_name = vm_host.name vm_dc_name = inventory[vm_host_name]['dc'] vm_cluster_name = inventory[vm_host_name]['cluster'] return vm_name, vm_host_name, vm_dc_name, vm_cluster_name def _vmware_host_metadata(self, inventory, host): """ Get Host metadata from inventory """ host_name = host.name host_dc_name = inventory[host_name]['dc'] host_cluster_name = inventory[host_name]['cluster'] return host_name, host_dc_name, host_cluster_name
class VMWareMetricsResource(Resource):
    """
    VMWare twisted ``Resource`` handling multi endpoints
    Only handle /metrics path
    """
    # Twisted: this resource consumes the whole remaining URL path itself.
    isLeaf = True

    def __init__(self, args):
        # Load YAML configuration; a 'default' section is mandatory.
        try:
            self.config = YamlConfig(args.config_file)
            if 'default' not in self.config.keys():
                print("Error, you must have a default section in config file")
                exit(1)
        # NOTE(review): bare except also catches the SystemExit raised by
        # exit(1) above and hides the original error -- consider narrowing.
        except:
            raise SystemExit("Error, cannot read configuration file")

    def render_GET(self, request):
        # Serve only /metrics; every other path is a 404.
        path = request.path.decode()
        request.setHeader("Content-Type", "text/plain; charset=UTF-8")
        if path == '/metrics':
            # NOTE(review): on Python 3 Twisted request.args keys are bytes
            # (b'target'); this str key lookup may never match -- verify.
            if not request.args.get('target', [None])[0]:
                request.setResponseCode(404)
                return 'No target defined\r\n'.encode()
            # Hand the request to a deferred; the response is produced
            # asynchronously by generate_latest_target / errback.
            d = deferLater(reactor, 0, lambda: request)
            d.addCallback(self.generate_latest_target)
            d.addErrback(self.errback, request)
            return NOT_DONE_YET
        else:
            request.setResponseCode(404)
            return '404 Not Found'.encode()

    def errback(self, failure, request):
        # Failure path for the deferred built in render_GET.
        failure.printTraceback()
        request.processingFailed(
            failure
        )  # This will send a trace to the browser and close the request.
        return None

    def generate_latest_target(self, request):
        # Render the collected metrics in Prometheus text exposition format.
        target = request.args.get('target', [None])[0]
        section = request.args.get('section', ['default'])[0]
        output = []
        for metric in self.collect(target, section):
            # HELP/TYPE header; backslashes and newlines are escaped per the
            # exposition format.
            output.append('# HELP {0} {1}'.format(
                metric.name,
                metric.documentation.replace('\\', r'\\').replace('\n', r'\n')))
            output.append('\n# TYPE {0} {1}\n'.format(metric.name,
                                                      metric.type))
            for name, labels, value in metric.samples:
                if labels:
                    # Label values additionally need '"' escaped; labels are
                    # sorted for a stable output order.
                    labelstr = '{{{0}}}'.format(','.join([
                        '{0}="{1}"'.format(
                            k,
                            v.replace('\\', r'\\').replace('\n', r'\n').replace(
                                '"', r'\"')) for k, v in sorted(labels.items())
                    ]))
                else:
                    labelstr = ''
                output.append('{0}{1} {2}\n'.format(name, labelstr,
                                                    _floatToGoString(value)))
        # Empty output means collect() bailed out (no vcenter connection).
        if output != []:
            request.write(''.join(output).encode('utf-8'))
            request.finish()
        else:
            request.setResponseCode(500, message=('cannot connect to vmware'))
            request.finish()
        return

    def collect(self, target=None, section='default'):
        # Generator yielding one MetricFamily per exported metric.
        if section not in self.config.keys():
            print("{} is not a valid section, using default".format(section))
            section = 'default'
        # All metric families up front; filled in by the _vmware_get_*
        # helpers below, then yielded at the end.
        metrics = {
            'vmware_vm_power_state':
            GaugeMetricFamily('vmware_vm_power_state',
                              'VMWare VM Power state (On / Off)',
                              labels=['vm_name']),
            'vmware_vm_boot_timestamp_seconds':
            GaugeMetricFamily('vmware_vm_boot_timestamp_seconds',
                              'VMWare VM boot time in seconds',
                              labels=['vm_name']),
            'vmware_vm_snapshots':
            GaugeMetricFamily('vmware_vm_snapshots',
                              'VMWare current number of existing snapshots',
                              labels=['vm_name']),
            'vmware_vm_snapshot_timestamp_seconds':
            GaugeMetricFamily('vmware_vm_snapshot_timestamp_seconds',
                              'VMWare Snapshot creation time in seconds',
                              labels=['vm_name', 'vm_snapshot_name']),
            'vmware_datastore_capacity_size':
            GaugeMetricFamily('vmware_datastore_capacity_size',
                              'VMWare Datasore capacity in bytes',
                              labels=['ds_name']),
            'vmware_datastore_freespace_size':
            GaugeMetricFamily('vmware_datastore_freespace_size',
                              'VMWare Datastore freespace in bytes',
                              labels=['ds_name']),
            'vmware_datastore_uncommited_size':
            GaugeMetricFamily('vmware_datastore_uncommited_size',
                              'VMWare Datastore uncommitted in bytes',
                              labels=['ds_name']),
            'vmware_datastore_provisoned_size':
            GaugeMetricFamily('vmware_datastore_provisoned_size',
                              'VMWare Datastore provisoned in bytes',
                              labels=['ds_name']),
            'vmware_datastore_hosts':
            GaugeMetricFamily('vmware_datastore_hosts',
                              'VMWare Hosts number using this datastore',
                              labels=['ds_name']),
            'vmware_datastore_vms':
            GaugeMetricFamily(
                'vmware_datastore_vms',
                'VMWare Virtual Machines number using this datastore',
                labels=['ds_name']),
            'vmware_host_power_state':
            GaugeMetricFamily('vmware_host_power_state',
                              'VMWare Host Power state (On / Off)',
                              labels=['host_name']),
            'vmware_host_boot_timestamp_seconds':
            GaugeMetricFamily('vmware_host_boot_timestamp_seconds',
                              'VMWare Host boot time in seconds',
                              labels=['host_name']),
            'vmware_host_cpu_usage':
            GaugeMetricFamily('vmware_host_cpu_usage',
                              'VMWare Host CPU usage in Mhz',
                              labels=['host_name']),
            'vmware_host_cpu_max':
            GaugeMetricFamily('vmware_host_cpu_max',
                              'VMWare Host CPU max availability in Mhz',
                              labels=['host_name']),
            'vmware_host_memory_usage':
            GaugeMetricFamily('vmware_host_memory_usage',
                              'VMWare Host Memory usage in Mbytes',
                              labels=['host_name']),
            'vmware_host_memory_max':
            GaugeMetricFamily('vmware_host_memory_max',
                              'VMWare Host Memory Max availability in Mbytes',
                              labels=['host_name']),
        }
        print("[{0}] Start collecting vcenter metrics for {1}".format(
            datetime.utcnow().replace(tzinfo=pytz.utc), target))
        self.si = self._vmware_connect(target, section)
        if not self.si:
            # Bailing out here leaves `output` empty in the caller -> HTTP 500.
            print("Error, cannot connect to vmware")
            return

        content = self.si.RetrieveContent()
        # Get performance metrics counter information
        counter_info = self._vmware_perf_metrics(content)

        # Fill Snapshots (count and age)
        vm_counts, vm_ages = self._vmware_get_snapshots(content)
        for v in vm_counts:
            metrics['vmware_vm_snapshots'].add_metric([v['vm_name']],
                                                      v['snapshot_count'])
        for vm_age in vm_ages:
            for v in vm_age:
                metrics['vmware_vm_snapshot_timestamp_seconds'].add_metric(
                    [v['vm_name'], v['vm_snapshot_name']],
                    v['vm_snapshot_timestamp_seconds'])

        # Fill Datastore
        self._vmware_get_datastores(content, metrics)

        # Fill VM Informations
        self._vmware_get_vms(content, metrics, counter_info)

        # Fill Hosts Informations
        self._vmware_get_hosts(content, metrics)

        print("[{0}] Stop collecting vcenter metrics for {1}".format(
            datetime.utcnow().replace(tzinfo=pytz.utc), target))

        self._vmware_disconnect()

        for metricname, metric in metrics.items():
            yield metric

    def _to_unix_timestamp(self, my_date):
        # Seconds since the Unix epoch for a tz-aware datetime.
        return ((my_date -
                 datetime(1970, 1, 1, tzinfo=pytz.utc)).total_seconds())

    def _vmware_get_obj(self, content, vimtype, name=None):
        """
         Get the vsphere object associated with a given text name
        """
        # With a name filter this returns a one-element list -- [None] when
        # nothing matches; without a filter it returns the full view.
        obj = None
        container = content.viewManager.CreateContainerView(
            content.rootFolder, vimtype, True)
        if name:
            for c in container.view:
                if c.name == name:
                    obj = c
                    return [obj]
        else:
            return container.view

    def _vmware_connect(self, target, section):
        """
        Connect to Vcenter and get connection
        """
        # Optionally disable TLS verification (self-signed vcenter certs).
        context = None
        if self.config[section]['ignore_ssl'] and \
                hasattr(ssl, "_create_unverified_context"):
            context = ssl._create_unverified_context()

        try:
            si = connect.Connect(target, 443,
                                 self.config[section]['vmware_user'],
                                 self.config[section]['vmware_password'],
                                 sslContext=context)

            return si

        except vmodl.MethodFault as error:
            print("Caught vmodl fault: " + error.msg)
            return None

    def _vmware_disconnect(self):
        """
        Disconnect from Vcenter
        """
        connect.Disconnect(self.si)

    def _vmware_perf_metrics(self, content):
        # create a mapping from performance stats to their counterIDs
        # counter_info: [performance stat => counterId]
        # performance stat example: cpu.usagemhz.LATEST
        counter_info = {}
        for c in content.perfManager.perfCounter:
            # NOTE(review): `prefix` is assigned but never used.
            prefix = c.groupInfo.key
            counter_full = "{}.{}.{}".format(c.groupInfo.key, c.nameInfo.key,
                                             c.rollupType)
            counter_info[counter_full] = c.key
        return counter_info

    def _vmware_list_snapshots_recursively(self, snapshots):
        """
        Get snapshots from a VM list, recursively
        """
        # Depth-first walk over the snapshot tree, flattening name + creation
        # timestamp for every node.
        snapshot_data = []
        for snapshot in snapshots:
            snap_timestamp = self._to_unix_timestamp(snapshot.createTime)
            snap_info = {
                'vm_snapshot_name': snapshot.name,
                'vm_snapshot_timestamp_seconds': snap_timestamp
            }
            snapshot_data.append(snap_info)
            snapshot_data = snapshot_data + self._vmware_list_snapshots_recursively(
                snapshot.childSnapshotList)
        return snapshot_data

    def _vmware_get_snapshots(self, content):
        """
        Get snapshots from all VM
        """
        snapshots_count_table = []
        snapshots_age_table = []
        for vm in self._vmware_get_obj(content, [vim.VirtualMachine]):
            if not vm or vm.snapshot is None:
                continue
            else:
                snapshot_paths = self._vmware_list_snapshots_recursively(
                    vm.snapshot.rootSnapshotList)
                for sn in snapshot_paths:
                    sn['vm_name'] = vm.name
                # Add Snapshot count per VM
                snapshot_count = len(snapshot_paths)
                snapshot_count_info = {
                    'vm_name': vm.name,
                    'snapshot_count': snapshot_count
                }
                snapshots_count_table.append(snapshot_count_info)
            snapshots_age_table.append(snapshot_paths)
        return snapshots_count_table, snapshots_age_table

    def _vmware_get_datastores(self, content, ds_metrics):
        """
        Get Datastore information
        """
        for ds in self._vmware_get_obj(content, [vim.Datastore]):
            #ds.RefreshDatastoreStorageInfo()
            summary = ds.summary
            ds_capacity = summary.capacity
            ds_freespace = summary.freeSpace
            # uncommitted is unset for thick-provisioned datastores.
            ds_uncommitted = summary.uncommitted if summary.uncommitted else 0
            # Provisioned = used space plus space promised to thin disks.
            ds_provisioned = ds_capacity - ds_freespace + ds_uncommitted

            ds_metrics['vmware_datastore_capacity_size'].add_metric(
                [summary.name], ds_capacity)
            ds_metrics['vmware_datastore_freespace_size'].add_metric(
                [summary.name], ds_freespace)
            ds_metrics['vmware_datastore_uncommited_size'].add_metric(
                [summary.name], ds_uncommitted)
            ds_metrics['vmware_datastore_provisoned_size'].add_metric(
                [summary.name], ds_provisioned)
            ds_metrics['vmware_datastore_hosts'].add_metric([summary.name],
                                                            len(ds.host))
            ds_metrics['vmware_datastore_vms'].add_metric([summary.name],
                                                          len(ds.vm))

    def _vmware_get_vms(self, content, vm_metrics, counter_info):
        """
        Get VM information
        """
        # List of performance counter we want
        perf_list = [
            'cpu.ready.summation',
            'cpu.usage.average',
            'cpu.usagemhz.average',
            'disk.usage.average',
            'disk.read.average',
            'disk.write.average',
            'mem.usage.average',
            'net.received.average',
            'net.transmitted.average',
        ]

        # Prepare gauges
        for p in perf_list:
            p_metric = 'vmware_vm_' + p.replace('.', '_')
            vm_metrics[p_metric] = GaugeMetricFamily(p_metric,
                                                     p_metric,
                                                     labels=['vm_name'])

        for vm in self._vmware_get_obj(content, [vim.VirtualMachine]):
            summary = vm.summary

            power_state = 1 if summary.runtime.powerState == 'poweredOn' else 0
            vm_metrics['vmware_vm_power_state'].add_metric([vm.name],
                                                           power_state)

            # Get metrics for poweredOn vms only
            if power_state:
                if summary.runtime.bootTime:
                    vm_metrics['vmware_vm_boot_timestamp_seconds'].add_metric(
                        [vm.name],
                        self._to_unix_timestamp(summary.runtime.bootTime))

                for p in perf_list:
                    p_metric = 'vmware_vm_' + p.replace('.', '_')
                    counter_key = counter_info[p]
                    metric_id = vim.PerformanceManager.MetricId(
                        counterId=counter_key, instance='')
                    # One sample from the 20s (real-time) interval.
                    spec = vim.PerformanceManager.QuerySpec(
                        maxSample=1,
                        entity=vm,
                        metricId=[metric_id],
                        intervalId=20)
                    result = content.perfManager.QueryStats(querySpec=[spec])
                    # NOTE(review): bare except silently keeps going when a
                    # counter is missing for this VM (best effort by design).
                    try:
                        vm_metrics[p_metric].add_metric(
                            [vm.name], float(sum(result[0].value[0].value)))
                    except:
                        print(
                            "Error, cannot get vm metrics {0} for {1}".format(
                                p_metric, vm.name))
                        pass

    def _vmware_get_hosts(self, content, host_metrics):
        """
        Get Host (ESXi) information
        """
        for host in self._vmware_get_obj(content, [vim.HostSystem]):
            summary = host.summary

            # Power state
            power_state = 1 if summary.runtime.powerState == 'poweredOn' else 0
            host_metrics['vmware_host_power_state'].add_metric([host.name],
                                                               power_state)

            if power_state:
                # Uptime
                if summary.runtime.bootTime:
                    host_metrics[
                        'vmware_host_boot_timestamp_seconds'].add_metric(
                            [host.name],
                            self._to_unix_timestamp(summary.runtime.bootTime))

                # CPU Usage (in Mhz)
                host_metrics['vmware_host_cpu_usage'].add_metric(
                    [host.name], summary.quickStats.overallCpuUsage)
                cpu_core_num = summary.hardware.numCpuCores
                cpu_total = summary.hardware.cpuMhz * cpu_core_num
                host_metrics['vmware_host_cpu_max'].add_metric([host.name],
                                                               cpu_total)

                # Memory Usage (in MB)
                host_metrics['vmware_host_memory_usage'].add_metric(
                    [host.name], summary.quickStats.overallMemoryUsage)
                host_metrics['vmware_host_memory_max'].add_metric(
                    [host.name],
                    float(summary.hardware.memorySize) / 1024 / 1024)
def __init__(self, args): try: self.config = YamlConfig(args.config_file) except Exception as e: raise SystemExit(f'ERROR - {e}')
def configure(self, args): if args.config_file: try: self.config = YamlConfig(args.config_file) if 'default' not in self.config.keys(): log("Error, you must have a default section in config file (for now)" ) exit(1) return except Exception as exception: raise SystemExit( "Error while reading configuration file: {0}".format( exception.message)) self.config = { 'default': { 'vsphere_host': os.environ.get('VSPHERE_HOST'), 'vsphere_user': os.environ.get('VSPHERE_USER'), 'vsphere_password': os.environ.get('VSPHERE_PASSWORD'), 'ignore_ssl': os.environ.get('VSPHERE_IGNORE_SSL', False), 'collect_only': { 'vms': os.environ.get('VSPHERE_COLLECT_VMS', True), 'vmguests': os.environ.get('VSPHERE_COLLECT_VMGUESTS', True), 'datastores': os.environ.get('VSPHERE_COLLECT_DATASTORES', True), 'hosts': os.environ.get('VSPHERE_COLLECT_HOSTS', True), 'snapshots': os.environ.get('VSPHERE_COLLECT_SNAPSHOTS', True), } } } for key in os.environ.keys(): if key == 'VSPHERE_USER': continue if not key.startswith('VSPHERE_') or not key.endswith('_USER'): continue section = key.split('_', 1)[1].rsplit('_', 1)[0] self.config[section.lower()] = { 'vsphere_host': os.environ.get('VSPHERE_{}_HOST'.format(section)), 'vsphere_user': os.environ.get('VSPHERE_{}_USER'.format(section)), 'vsphere_password': os.environ.get('VSPHERE_{}_PASSWORD'.format(section)), 'ignore_ssl': os.environ.get('VSPHERE_{}_IGNORE_SSL'.format(section), False), 'collect_only': { 'vms': os.environ.get('VSPHERE_{}_COLLECT_VMS'.format(section), True), 'vmguests': os.environ.get( 'VSPHERE_{}_COLLECT_VMGUESTS'.format(section), True), 'datastores': os.environ.get( 'VSPHERE_{}_COLLECT_DATASTORES'.format(section), True), 'hosts': os.environ.get('VSPHERE_{}_COLLECT_HOSTS'.format(section), True), 'snapshots': os.environ.get( 'VSPHERE_{}_COLLECT_SNAPSHOTS'.format(section), True), } }
class VMWareMetricsResource(Resource):
    """Twisted ``Resource`` exposing vSphere metrics in Prometheus format.

    Configuration comes from a YAML file or VSPHERE_* environment variables;
    the target vcenter can also be supplied per request via ?target= or
    ?vsphere_host=.
    """

    # Twisted: this resource consumes the whole remaining URL path itself.
    isLeaf = True

    def __init__(self, args):
        """ Init Metric Resource """
        Resource.__init__(self)
        self.configure(args)

    def configure(self, args):
        """Build self.config from a YAML file or from VSPHERE_* env vars."""
        if args.config_file:
            try:
                self.config = YamlConfig(args.config_file)
                if 'default' not in self.config.keys():
                    log("Error, you must have a default section in config file (for now)"
                        )
                    exit(1)
                return
            except Exception as exception:
                # FIX: Exception has no `.message` attribute on Python 3 --
                # formatting the exception itself keeps the error text.
                raise SystemExit(
                    "Error while reading configuration file: {0}".format(
                        exception))
        # No config file: default section from plain VSPHERE_* variables.
        self.config = {
            'default': {
                'vsphere_host': os.environ.get('VSPHERE_HOST'),
                'vsphere_user': os.environ.get('VSPHERE_USER'),
                'vsphere_password': os.environ.get('VSPHERE_PASSWORD'),
                'ignore_ssl': os.environ.get('VSPHERE_IGNORE_SSL', False),
                'collect_only': {
                    'vms': os.environ.get('VSPHERE_COLLECT_VMS', True),
                    'vmguests': os.environ.get('VSPHERE_COLLECT_VMGUESTS',
                                               True),
                    'datastores': os.environ.get('VSPHERE_COLLECT_DATASTORES',
                                                 True),
                    'hosts': os.environ.get('VSPHERE_COLLECT_HOSTS', True),
                    'snapshots': os.environ.get('VSPHERE_COLLECT_SNAPSHOTS',
                                                True),
                }
            }
        }
        # Each VSPHERE_<SECTION>_USER variable defines an extra section.
        for key in os.environ.keys():
            if key == 'VSPHERE_USER':
                continue
            if not key.startswith('VSPHERE_') or not key.endswith('_USER'):
                continue
            # VSPHERE_FOO_USER -> section name "foo"
            section = key.split('_', 1)[1].rsplit('_', 1)[0]
            self.config[section.lower()] = {
                'vsphere_host':
                os.environ.get('VSPHERE_{}_HOST'.format(section)),
                'vsphere_user':
                os.environ.get('VSPHERE_{}_USER'.format(section)),
                'vsphere_password':
                os.environ.get('VSPHERE_{}_PASSWORD'.format(section)),
                'ignore_ssl':
                os.environ.get('VSPHERE_{}_IGNORE_SSL'.format(section), False),
                'collect_only': {
                    'vms':
                    os.environ.get('VSPHERE_{}_COLLECT_VMS'.format(section),
                                   True),
                    'vmguests':
                    os.environ.get(
                        'VSPHERE_{}_COLLECT_VMGUESTS'.format(section), True),
                    'datastores':
                    os.environ.get(
                        'VSPHERE_{}_COLLECT_DATASTORES'.format(section), True),
                    'hosts':
                    os.environ.get('VSPHERE_{}_COLLECT_HOSTS'.format(section),
                                   True),
                    'snapshots':
                    os.environ.get(
                        'VSPHERE_{}_COLLECT_SNAPSHOTS'.format(section), True),
                }
            }

    def render_GET(self, request):
        """ handles get requests for metrics, health, and everything else """
        self._async_render_GET(request)
        return NOT_DONE_YET

    @defer.inlineCallbacks
    def _async_render_GET(self, request):
        """Run the collection and answer 500 with a short line on any error."""
        try:
            yield self.generate_latest_metrics(request)
        except Exception:
            log(traceback.format_exc())
            request.setResponseCode(500)
            request.write(b'# Collection failed')
            request.finish()

        # We used to call request.processingFailed to send a traceback to browser
        # This can make sense in debug mode for a HTML site - but we don't want
        # prometheus trying to parse a python traceback

    @defer.inlineCallbacks
    def generate_latest_metrics(self, request):
        """ gets the latest metrics """
        section = request.args.get('section', ['default'])[0]
        if section not in self.config.keys():
            log("{} is not a valid section, using default".format(section))
            section = 'default'

        # Target resolution order: config file, ?target=, ?vsphere_host=.
        if self.config[section].get('vsphere_host') and self.config[
                section].get('vsphere_host') != "None":
            vsphere_host = self.config[section].get('vsphere_host')
        elif request.args.get(b'target', [None])[0]:
            vsphere_host = request.args.get(b'target',
                                            [None])[0].decode('utf-8')
        elif request.args.get(b'vsphere_host', [None])[0]:
            vsphere_host = request.args.get(b'vsphere_host')[0].decode('utf-8')
        else:
            request.setResponseCode(500)
            log("No vsphere_host or target defined")
            request.write(b'No vsphere_host or target defined!\n')
            request.finish()
            return

        collector = VmwareCollector(
            vsphere_host,
            self.config[section]['vsphere_user'],
            self.config[section]['vsphere_password'],
            self.config[section]['collect_only'],
            self.config[section]['ignore_ssl'],
        )
        metrics = yield collector.collect()

        # Hand the already-collected metrics to a throwaway registry so
        # generate_latest renders the standard exposition format.
        registry = CollectorRegistry()
        registry.register(ListCollector(metrics))
        output = generate_latest(registry)

        request.setHeader("Content-Type", "text/plain; charset=UTF-8")
        request.setResponseCode(200)
        request.write(output)
        request.finish()
class VMWareMetricsResource(Resource):
    """
    VMWare twisted ``Resource`` handling multi endpoints
    Only handle /metrics and /healthz path
    """

    # Twisted: this resource consumes the whole remaining URL path itself.
    isLeaf = True

    def __init__(self):
        """ Init Metric Resource """
        Resource.__init__(self)

    def configure(self, args):
        """Build self.config from a YAML file or from VSPHERE_* env vars."""
        if args.config_file:
            try:
                self.config = YamlConfig(args.config_file)
                if 'default' not in self.config.keys():
                    log("Error, you must have a default section in config file (for now)"
                        )
                    exit(1)
            except Exception as exception:
                # FIX: Exception has no `.message` attribute on Python 3 --
                # formatting the exception itself keeps the error text.
                raise SystemExit(
                    "Error while reading configuration file: {0}".format(
                        exception))
        else:
            # FIX: the user/password placeholders had been scrubbed to
            # literal "******" although .format() supplies them as args
            # {1} and {2}; restore the placeholders so real credentials
            # land in the config.
            config_data = """
            default:
                vsphere_host: "{0}"
                vsphere_user: "{1}"
                vsphere_password: "{2}"
                ignore_ssl: {3}
                collect_only:
                    vms: True
                    vmguests: True
                    datastores: True
                    hosts: True
                    snapshots: True
            """.format(os.environ.get('VSPHERE_HOST'),
                       os.environ.get('VSPHERE_USER'),
                       os.environ.get('VSPHERE_PASSWORD'),
                       os.environ.get('VSPHERE_IGNORE_SSL', False))
            # NOTE(review): yaml.load without an explicit Loader is unsafe on
            # untrusted input and deprecated; input here is env-derived, but
            # consider yaml.safe_load.
            self.config = yaml.load(config_data)
            # Env vars override the per-subsystem collect switches.
            self.config['default']['collect_only']['hosts'] = os.environ.get(
                'VSPHERE_COLLECT_HOSTS', True)
            self.config['default']['collect_only'][
                'datastores'] = os.environ.get('VSPHERE_COLLECT_DATASTORES',
                                               True)
            self.config['default']['collect_only']['vms'] = os.environ.get(
                'VSPHERE_COLLECT_VMS', True)
            self.config['default']['collect_only'][
                'vmguests'] = os.environ.get('VSPHERE_COLLECT_VMGUESTS', True)
            self.config['default']['collect_only'][
                'snapshots'] = os.environ.get('VSPHERE_COLLECT_SNAPSHOTS',
                                              True)

    def render_GET(self, request):
        """ handles get requests for metrics, health, and everything else """
        path = request.path.decode()
        request.setHeader("Content-Type", "text/plain; charset=UTF-8")
        if path == '/metrics':
            # Hand the request to a deferred; the response is produced
            # asynchronously by generate_latest_metrics / errback.
            deferred_request = deferLater(reactor, 0, lambda: request)
            deferred_request.addCallback(self.generate_latest_metrics)
            deferred_request.addErrback(self.errback, request)
            return NOT_DONE_YET
        elif path == '/healthz':
            request.setResponseCode(200)
            log("Service is UP")
            return 'Server is UP'.encode()
        else:
            log(b"Uri not found: " + request.uri)
            request.setResponseCode(404)
            return '404 Not Found'.encode()

    def errback(self, failure, request):
        """ handles failures from requests """
        failure.printTraceback()
        log(failure)
        request.processingFailed(
            failure
        )  # This will send a trace to the browser and close the request.
        return None

    def generate_latest_metrics(self, request):
        """ gets the latest metrics """
        section = request.args.get('section', ['default'])[0]
        if section not in self.config.keys():
            log("{} is not a valid section, using default".format(section))
            section = 'default'

        # Target resolution order: config file, ?target=, ?vsphere_host=.
        if self.config[section].get('vsphere_host') and self.config[
                section].get('vsphere_host') != "None":
            vsphere_host = self.config[section].get('vsphere_host')
        elif request.args.get(b'target', [None])[0]:
            vsphere_host = request.args.get(b'target',
                                            [None])[0].decode('utf-8')
        elif request.args.get(b'vsphere_host', [None])[0]:
            vsphere_host = request.args.get(b'vsphere_host')[0].decode('utf-8')
        else:
            request.setResponseCode(500)
            log("No vsphere_host or target defined")
            request.write(b'No vsphere_host or target defined!\n')
            request.finish()
            return

        # The registered collector runs at scrape time inside generate_latest.
        registry = CollectorRegistry()
        registry.register(
            VmwareCollector(
                vsphere_host,
                self.config[section]['vsphere_user'],
                self.config[section]['vsphere_password'],
                self.config[section]['collect_only'],
                self.config[section]['ignore_ssl'],
            ))
        output = generate_latest(registry)

        request.write(output)
        request.finish()
t = threading.Thread(target=httpd.serve_forever()) t.daemon = True t.start() except KeyboardInterrupt: logging.info("Stopping Arista eAPI Prometheus Server") def enable_logging(): # enable logging logger = logging.getLogger() app_environment = os.getenv('APP_ENV', default="production").lower() if app_environment == "production": logger.setLevel('INFO') else: logger.setLevel('DEBUG') format = '%(asctime)-15s %(process)d %(levelname)s %(filename)s:%(lineno)d %(message)s' logging.basicConfig(stream=sys.stdout, format=format) if __name__ == '__main__': # command line options parser = argparse.ArgumentParser() parser.add_argument( "-c", "--config", help="Specify config yaml file", metavar="FILE", required=False, default="config.yml") args = parser.parse_args() # get the config config = YamlConfig(args.config) enable_logging() falcon_app()