def refresh_metrics_metadata_cache(self):
    """Request the list of counters (metrics) from vSphere and store them in a cache."""
    self.log.debug(
        "Refreshing the metrics metadata cache. Collecting all counters metadata for collection_level=%d",
        self.config.collection_level,
    )
    timer = Timer()
    counters = self.api.get_perf_counter_by_level(self.config.collection_level)
    self.gauge(
        "datadog.vsphere.refresh_metrics_metadata_cache.time",
        timer.total(),
        tags=self.config.base_tags,
        raw=True,
        hostname=self._hostname,
    )
    self.log.debug("Collected %d counters metadata in %.3f seconds.", len(counters), timer.total())

    for resource_type in self.config.collected_resource_types:
        # Keep only counters that are allowed for this resource type and not
        # excluded by the user-configured metric filters.
        metadata = {}
        for counter in counters:
            name = format_metric_name(counter)
            if name not in ALLOWED_METRICS_FOR_MOR[resource_type]:
                continue
            if is_metric_excluded_by_filters(name, resource_type, self.config.metric_filters):
                continue
            metadata[counter.key] = name
        self.metrics_metadata_cache.set_metadata(resource_type, metadata)
def collect_events(self):
    """Fetch new events from the vCenter event manager and submit the ones that pass filtering."""
    self.log.debug("Starting events collection.")
    try:
        timer = Timer()
        new_events = self.api.get_new_events(start_time=self.latest_event_query)
        self.gauge(
            'datadog.vsphere.collect_events.time',
            timer.total(),
            tags=self.config.base_tags,
            raw=True,
            hostname=self._hostname,
        )
        self.log.debug("Got %s new events from the vCenter event manager", len(new_events))
        event_config = {'collect_vcenter_alarms': True}
        for event in new_events:
            normalized_event = VSphereEvent(event, event_config, self.config.base_tags)
            # get_datadog_payload returns None when the event is filtered out
            payload = normalized_event.get_datadog_payload()
            if payload is not None:
                self.event(payload)
    except Exception as e:
        # Don't get stuck on a failure to fetch an event
        # Ignore them for next pass
        self.log.warning("Unable to fetch Events %s", e)

    # Advance the query cursor past the newest known event so it is not re-fetched.
    self.latest_event_query = self.api.get_latest_event_timestamp() + timedelta(seconds=1)
def collect_tags(self, infrastructure_data):
    # type: (InfrastructureData) -> ResourceTags
    """
    Fetch all tags, build tags for each monitored resource and return the resulting mapping
    (stored by the caller into the tags cache). Returns an empty dict when the REST API is
    unavailable or the tag query fails.
    """
    if not self.api_rest:
        return {}

    # In order to be more efficient in tag collection, the infrastructure data is filtered as much as possible.
    # All filters are applied except the ones based on tags of course.
    non_tag_filters = [f for f in self._config.resource_filters if not isinstance(f, TagFilter)]
    collected_types = tuple(self._config.collected_resource_types)
    mors_list = [
        mor
        for mor in infrastructure_data
        if isinstance(mor, collected_types)
        and is_resource_collected_by_filters(mor, infrastructure_data, non_tag_filters)
    ]

    timer = Timer()
    try:
        mor_tags = self.api_rest.get_resource_tags_for_mors(mors_list)
    except Exception as e:
        self.log.error("Failed to collect tags: %s", e)
        return {}
    self.gauge(
        'datadog.vsphere.query_tags.time',
        timer.total(),
        tags=self._config.base_tags,
        raw=True,
        hostname=self._hostname,
    )
    return mor_tags
def _cache_metrics_metadata(self, instance):
    """ Get all the performance counters metadata meaning name/group/description...
    from the server instance, attached with the corresponding ID
    """
    # ## <TEST-INSTRUMENTATION>
    timer = Timer()
    # ## </TEST-INSTRUMENTATION>

    i_key = self._instance_key(instance)
    self.metadata_cache.init_instance(i_key)
    self.log.info("Warming metrics metadata cache for instance %s", i_key)
    perf_manager = self._get_server_instance(instance).content.perfManager

    new_metadata = {}
    metric_ids = []
    if self.in_compatibility_mode(instance, log_warning=True):
        # Old behaviour: walk every available counter and only collect the ones
        # listed in our constants (or everything if `all_metrics` is set).
        collect_all = instance.get("all_metrics")
        for counter in perf_manager.perfCounter:
            metric_name = self.format_metric_name(counter, compatibility=True)
            new_metadata[counter.key] = {'name': metric_name, 'unit': counter.unitInfo.key}
            if collect_all or metric_name in BASIC_METRICS:
                metric_ids.append(vim.PerformanceManager.MetricId(counterId=counter.key, instance="*"))
    else:
        # New behaviour: let vCenter return the counters matching the configured level,
        # and collect all of them.
        level = instance.get("collection_level", 1)
        for counter in perf_manager.QueryPerfCounterByLevel(level):
            new_metadata[counter.key] = {"name": self.format_metric_name(counter), "unit": counter.unitInfo.key}
            metric_ids.append(vim.PerformanceManager.MetricId(counterId=counter.key, instance="*"))

    self.log.info("Finished metadata collection for instance %s", i_key)
    # Reset metadata
    self.metadata_cache.set_metadata(i_key, new_metadata)
    self.metadata_cache.set_metric_ids(i_key, metric_ids)
    self.cache_config.set_last(CacheConfig.Metadata, i_key, time.time())

    # ## <TEST-INSTRUMENTATION>
    custom_tags = instance.get('tags', []) + ['instance:{}'.format(i_key)]
    self.histogram('datadog.agent.vsphere.metric_metadata_collection.time', timer.total(), tags=custom_tags)
def query_metrics_wrapper(self, query_specs):
    """Instrumented pass-through to VSphereAPI.query_metrics: times the call and
    reports the duration as a histogram.
    Warning: called in threads
    """
    timer = Timer()
    results = self.api.query_metrics(query_specs)
    self.histogram('datadog.vsphere.query_metrics.time', timer.total(), tags=self.config.base_tags, raw=True)
    return results
def query_metrics_wrapper(self, query_specs):
    # type: (List[vim.PerformanceManager.QuerySpec]) -> List[vim.PerformanceManager.EntityMetricBase]
    """Time the VSphereAPI.query_metrics call and report its duration as a histogram,
    then return the raw results untouched.
    Warning: called in threads
    """
    timer = Timer()
    results = self.api.query_metrics(query_specs)
    elapsed = timer.total()
    self.histogram('datadog.vsphere.query_metrics.time', elapsed, tags=self.config.base_tags, raw=True)
    return results
def collect_events(self):
    # type: () -> None
    """Fetch new events from vCenter, submit the ones that pass filtering, and advance
    `self.latest_event_query` so the next run only asks for newer events."""
    self.log.debug("Starting events collection (query start time: %s).", self.latest_event_query)
    latest_event_time = None
    collect_start_time = get_current_datetime()
    try:
        timer = Timer()
        new_events = self.api.get_new_events(start_time=self.latest_event_query)
        self.gauge(
            'datadog.vsphere.collect_events.time',
            timer.total(),
            tags=self.config.base_tags,
            raw=True,
            hostname=self._hostname,
        )
        self.log.debug("Got %s new events from the vCenter event manager", len(new_events))
        event_config = {'collect_vcenter_alarms': True}
        for event in new_events:
            self.log.debug(
                "Processing event with id:%s, type:%s: msg:%s", event.key, type(event), event.fullFormattedMessage
            )
            normalized_event = VSphereEvent(event, event_config, self.config.base_tags)
            # get_datadog_payload returns None when the event is filtered out
            event_payload = normalized_event.get_datadog_payload()
            if event_payload is not None:
                self.log.debug(
                    "Submit event with id:%s, type:%s: msg:%s", event.key, type(event), event.fullFormattedMessage
                )
                self.event(event_payload)
            # Track the newest createdTime seen so the cursor moves past every fetched event.
            if latest_event_time is None or event.createdTime > latest_event_time:
                latest_event_time = event.createdTime
    except Exception as e:
        # Don't get stuck on a failure to fetch an event
        # Ignore them for next pass
        self.log.warning("Unable to fetch Events %s", e)

    if latest_event_time is not None:
        self.latest_event_query = latest_event_time + dt.timedelta(seconds=1)
    else:
        # Let's set `self.latest_event_query` to `collect_start_time` as safeguard in case no events are reported
        # OR something bad happened (which might happen again indefinitely).
        self.latest_event_query = collect_start_time
def refresh_tags_cache(self):
    """
    Fetch all tags, build tags for each monitored resource and store everything
    into the tags_cache. No-op when the REST API client is not available; on a
    failed tag query the cache is left untouched.
    """
    if not self.api_rest:
        return

    timer = Timer()
    try:
        mor_tags = self.api_rest.get_resource_tags()
    except Exception as e:
        self.log.error("Failed to collect tags: %s", e)
        return
    self.gauge('datadog.vsphere.query_tags.time', timer.total(), tags=self.config.base_tags, raw=True)
    self.tags_cache.set_all_tags(mor_tags)
def _collect_metrics_async(self, instance, query_specs): """ Task that collects the metrics listed in the morlist for one MOR """ # ## <TEST-INSTRUMENTATION> t = Timer() # ## </TEST-INSTRUMENTATION> i_key = self._instance_key(instance) server_instance = self._get_server_instance(instance) perfManager = server_instance.content.perfManager results = perfManager.QueryPerf(query_specs) if results: for mor_perfs in results: mor_name = str(mor_perfs.entity) try: mor = self.mor_cache.get_mor(i_key, mor_name) except MorNotFoundError: self.log.error( "Trying to get metrics from object %s deleted from the cache, skipping. " "Consider increasing the parameter `clean_morlist_interval` to avoid that", mor_name, ) continue for result in mor_perfs.value: counter_id = result.id.counterId if not self.metadata_cache.contains(i_key, counter_id): self.log.debug( "Skipping value for counter %s, because there is no metadata about it", ensure_unicode(counter_id), ) continue # Metric types are absolute, delta, and rate metric_name = self.metadata_cache.get_metadata( i_key, result.id.counterId).get('name') if self.in_compatibility_mode(instance): if metric_name not in ALL_METRICS: self.log.debug("Skipping unknown `%s` metric.", ensure_unicode(metric_name)) continue if not result.value: self.log.debug( "Skipping `%s` metric because the value is empty", ensure_unicode(metric_name)) continue instance_name = result.id.instance or "none" value = self._transform_value(instance, result.id.counterId, result.value[0]) hostname = mor['hostname'] tags = [ 'instance:{}'.format(ensure_unicode(instance_name)) ] if not hostname: # no host tags available tags.extend(mor['tags']) else: hostname = to_string(hostname) tags.extend(instance.get('tags', [])) # vsphere "rates" should be submitted as gauges (rate is # precomputed). 
self.gauge("vsphere.{}".format( ensure_unicode(metric_name)), value, hostname=hostname, tags=tags) # ## <TEST-INSTRUMENTATION> custom_tags = instance.get('tags', []) + ['instance:{}'.format(i_key)] self.histogram('datadog.agent.vsphere.metric_colection.time', t.total(), tags=custom_tags)
def refresh_infrastructure_cache(self):
    # type: () -> None
    """Fetch the complete infrastructure, generate tags for each monitored resources and
    store all of that into the infrastructure_cache. It also computes the resource `hostname` property to be used
    when submitting metrics for this mor."""
    self.log.debug("Refreshing the infrastructure cache...")
    t0 = Timer()
    infrastructure_data = self.api.get_infrastructure()
    self.gauge(
        "datadog.vsphere.refresh_infrastructure_cache.time",
        t0.total(),
        tags=self.config.base_tags,
        raw=True,
        hostname=self._hostname,
    )
    self.log.debug("Infrastructure cache refreshed in %.3f seconds.", t0.total())
    self.log.debug("Infrastructure cache: %s", infrastructure_data)

    # Tags are collected (and cached) first because the resource filters below may match on them.
    all_tags = {}
    if self.config.should_collect_tags:
        all_tags = self.collect_tags(infrastructure_data)
    self.infrastructure_cache.set_all_tags(all_tags)

    for mor, properties in iteritems(infrastructure_data):
        if not isinstance(mor, tuple(self.config.collected_resource_types)):
            # Do nothing for the resource types we do not collect
            continue
        if not is_resource_collected_by_filters(
            mor, infrastructure_data, self.config.resource_filters, self.infrastructure_cache.get_mor_tags(mor)
        ):
            # The resource does not match the specified whitelist/blacklist patterns.
            continue

        mor_name = to_string(properties.get("name", "unknown"))
        mor_type_str = MOR_TYPE_AS_STRING[type(mor)]
        hostname = None
        tags = []

        if isinstance(mor, vim.VirtualMachine):
            power_state = properties.get("runtime.powerState")
            if power_state != vim.VirtualMachinePowerState.poweredOn:
                # Skipping because the VM is not powered on
                # TODO: Sometimes VM are "poweredOn" but "disconnected" and thus have no metrics
                self.log.debug("Skipping VM %s in state %s", mor_name, to_string(power_state))
                continue

            # Hosts are not considered as parents of the VMs they run, we use the `runtime.host` property
            # to get the name of the ESXi host
            runtime_host = properties.get("runtime.host")
            runtime_host_props = infrastructure_data[runtime_host] if runtime_host else {}
            runtime_hostname = to_string(runtime_host_props.get("name", "unknown"))
            tags.append('vsphere_host:{}'.format(runtime_hostname))

            if self.config.use_guest_hostname:
                hostname = properties.get("guest.hostName", mor_name)
            else:
                hostname = mor_name
        elif isinstance(mor, vim.HostSystem):
            hostname = mor_name
        else:
            # Non-host, non-VM resources (datastore, cluster, ...) are identified by a tag,
            # not a hostname.
            tags.append('vsphere_{}:{}'.format(mor_type_str, mor_name))

        tags.extend(get_parent_tags_recursively(mor, infrastructure_data))
        tags.append('vsphere_type:{}'.format(mor_type_str))

        # Attach tags from fetched attributes.
        tags.extend(properties.get('attributes', []))

        mor_payload = {"tags": tags}  # type: Dict[str, Any]

        if hostname:
            mor_payload['hostname'] = hostname

        self.infrastructure_cache.set_mor_props(mor, mor_payload)