def test_metrics_metadata_cache(): cache = MetricsMetadataCache(float('inf')) data = {k: object() for k in ALL_RESOURCES_WITH_METRICS} with cache.update(): for k, v in iteritems(data): cache.set_metadata(k, v) for k, v in iteritems(data): assert cache.get_metadata(k) == v
class VSphereCheck(AgentCheck): __NAMESPACE__ = 'vsphere' def __new__(cls, name, init_config, instances): # type: (Type[VSphereCheck], str, Dict[str, Any], List[Dict[str, Any]]) -> VSphereCheck """For backward compatibility reasons, there are two side-by-side implementations of the VSphereCheck. Instantiating this class will return an instance of the legacy integration for existing users and an instance of the new implementation for new users.""" if is_affirmative(instances[0].get('use_legacy_check_version', True)): from datadog_checks.vsphere.legacy.vsphere_legacy import VSphereLegacyCheck return VSphereLegacyCheck(name, init_config, instances) # type: ignore return super(VSphereCheck, cls).__new__(cls) def __init__(self, *args, **kwargs): # type: (*Any, **Any) -> None super(VSphereCheck, self).__init__(*args, **kwargs) instance = cast(InstanceConfig, self.instance) self.config = VSphereConfig(instance, self.log) self.latest_event_query = get_current_datetime() self.infrastructure_cache = InfrastructureCache( interval_sec=self.config.refresh_infrastructure_cache_interval) self.metrics_metadata_cache = MetricsMetadataCache( interval_sec=self.config.refresh_metrics_metadata_cache_interval) self.api = cast(VSphereAPI, None) self.api_rest = cast(VSphereRestAPI, None) # Do not override `AgentCheck.hostname` self._hostname = None self.thread_pool = ThreadPoolExecutor( max_workers=self.config.threads_count) self.check_initializations.append(self.initiate_api_connection) def initiate_api_connection(self): # type: () -> None try: self.log.debug( "Connecting to the vCenter API %s with username %s...", self.config.hostname, self.config.username) self.api = VSphereAPI(self.config, self.log) self.log.debug("Connected") except APIConnectionError: self.log.error( "Cannot authenticate to vCenter API. The check will not run.") self.service_check(SERVICE_CHECK_NAME, AgentCheck.CRITICAL, tags=self.config.base_tags, hostname=None) raise if self.config.should_collect_tags: try: self.api_rest = VSphereRestAPI(self.config, self.log) except Exception as e: self.log.error( "Cannot connect to vCenter REST API. Tags won't be collected. Error: %s", e) def refresh_metrics_metadata_cache(self): # type: () -> None """ Request the list of counters (metrics) from vSphere and store them in a cache. """ self.log.debug( "Refreshing the metrics metadata cache. Collecting all counters metadata for collection_level=%d", self.config.collection_level, ) t0 = Timer() counters = self.api.get_perf_counter_by_level( self.config.collection_level) self.gauge( "datadog.vsphere.refresh_metrics_metadata_cache.time", t0.total(), tags=self.config.base_tags, raw=True, hostname=self._hostname, ) self.log.debug("Collected %d counters metadata in %.3f seconds.", len(counters), t0.total()) for mor_type in self.config.collected_resource_types: allowed_counters = [] for c in counters: metric_name = format_metric_name(c) if metric_name in ALLOWED_METRICS_FOR_MOR[ mor_type] and not is_metric_excluded_by_filters( metric_name, mor_type, self.config.metric_filters): allowed_counters.append(c) metadata = { c.key: format_metric_name(c) for c in allowed_counters } # type: Dict[CounterId, MetricName] self.metrics_metadata_cache.set_metadata(mor_type, metadata) self.log.debug( "Set metadata for mor_type %s: %s", mor_type, metadata, ) # TODO: Later - Understand how much data actually changes between check runs # Apparently only when the server restarts? # https://pubs.vmware.com/vsphere-50/index.jsp?topic=%2Fcom.vmware.wssdk.pg.doc_50%2FPG_Ch16_Performance.18.5.html def collect_tags(self, infrastructure_data): # type: (InfrastructureData) -> ResourceTags """ Fetch the all tags, build tags for each monitored resources and store all of that into the tags_cache. """ if not self.api_rest: return {} # In order to be more efficient in tag collection, the infrastructure data is filtered as much as possible. # All filters are applied except the ones based on tags of course. resource_filters_without_tags = [ f for f in self.config.resource_filters if not isinstance(f, TagFilter) ] filtered_infra_data = { mor: props for mor, props in iteritems(infrastructure_data) if isinstance(mor, tuple(self.config.collected_resource_types)) and is_resource_collected_by_filters( mor, infrastructure_data, resource_filters_without_tags) } t0 = Timer() mors_list = list(filtered_infra_data.keys()) try: mor_tags = self.api_rest.get_resource_tags_for_mors(mors_list) except Exception as e: self.log.error("Failed to collect tags: %s", e) return {} self.gauge('datadog.vsphere.query_tags.time', t0.total(), tags=self.config.base_tags, raw=True, hostname=self._hostname) return mor_tags def refresh_infrastructure_cache(self): # type: () -> None """Fetch the complete infrastructure, generate tags for each monitored resources and store all of that into the infrastructure_cache. It also computes the resource `hostname` property to be used when submitting metrics for this mor.""" self.log.debug("Refreshing the infrastructure cache...") t0 = Timer() infrastructure_data = self.api.get_infrastructure() self.gauge( "datadog.vsphere.refresh_infrastructure_cache.time", t0.total(), tags=self.config.base_tags, raw=True, hostname=self._hostname, ) self.log.debug("Infrastructure cache refreshed in %.3f seconds.", t0.total()) self.log.debug("Infrastructure cache: %s", infrastructure_data) all_tags = {} if self.config.should_collect_tags: all_tags = self.collect_tags(infrastructure_data) self.infrastructure_cache.set_all_tags(all_tags) for mor, properties in iteritems(infrastructure_data): if not isinstance(mor, tuple( self.config.collected_resource_types)): # Do nothing for the resource types we do not collect continue if not is_resource_collected_by_filters( mor, infrastructure_data, self.config.resource_filters, self.infrastructure_cache.get_mor_tags(mor)): # The resource does not match the specified whitelist/blacklist patterns. continue mor_name = to_string(properties.get("name", "unknown")) mor_type_str = MOR_TYPE_AS_STRING[type(mor)] hostname = None tags = [] if isinstance(mor, vim.VirtualMachine): power_state = properties.get("runtime.powerState") if power_state != vim.VirtualMachinePowerState.poweredOn: # Skipping because the VM is not powered on # TODO: Sometimes VM are "poweredOn" but "disconnected" and thus have no metrics self.log.debug("Skipping VM %s in state %s", mor_name, to_string(power_state)) continue # Hosts are not considered as parents of the VMs they run, we use the `runtime.host` property # to get the name of the ESXi host runtime_host = properties.get("runtime.host") runtime_host_props = infrastructure_data[ runtime_host] if runtime_host else {} runtime_hostname = to_string( runtime_host_props.get("name", "unknown")) tags.append('vsphere_host:{}'.format(runtime_hostname)) if self.config.use_guest_hostname: hostname = properties.get("guest.hostName", mor_name) else: hostname = mor_name elif isinstance(mor, vim.HostSystem): hostname = mor_name else: tags.append('vsphere_{}:{}'.format(mor_type_str, mor_name)) tags.extend(get_parent_tags_recursively(mor, infrastructure_data)) tags.append('vsphere_type:{}'.format(mor_type_str)) # Attach tags from fetched attributes. tags.extend(properties.get('attributes', [])) mor_payload = {"tags": tags} # type: Dict[str, Any] if hostname: mor_payload['hostname'] = hostname self.infrastructure_cache.set_mor_props(mor, mor_payload) def submit_metrics_callback(self, query_results): # type: (List[vim.PerformanceManager.EntityMetricBase]) -> None """ Callback of the collection of metrics. This is run in the main thread! `query_results` currently contain results of one resource type in practice, but this function is generic and can handle results with mixed resource types. """ # `have_instance_value` is used later to avoid collecting aggregated metrics # when instance metrics are collected. have_instance_value = defaultdict( set) # type: Dict[Type[vim.ManagedEntity], Set[MetricName]] for results_per_mor in query_results: resource_type = type(results_per_mor.entity) metadata = self.metrics_metadata_cache.get_metadata(resource_type) for result in results_per_mor.value: if result.id.instance: have_instance_value[resource_type].add( metadata[result.id.counterId]) for results_per_mor in query_results: mor_props = self.infrastructure_cache.get_mor_props( results_per_mor.entity) if mor_props is None: self.log.debug( "Skipping results for mor %s because the integration is not yet aware of it. If this is a problem" " you can increase the value of 'refresh_infrastructure_cache_interval'.", results_per_mor.entity, ) continue self.log.debug( "Retrieved mor props for entity %s: %s", results_per_mor.entity, mor_props, ) resource_type = type(results_per_mor.entity) metadata = self.metrics_metadata_cache.get_metadata(resource_type) for result in results_per_mor.value: metric_name = metadata.get(result.id.counterId) if self.log.isEnabledFor(logging.DEBUG): # Use isEnabledFor to avoid unnecessary processing self.log.debug( "Processing metric `%s`: resource_type=`%s`, result=`%s`", metric_name, resource_type, str(result).replace("\n", "\\n"), ) if not metric_name: # Fail-safe self.log.debug( "Skipping value for counter %s, because the integration doesn't have metadata about it. If this" " is a problem you can increase the value of 'refresh_metrics_metadata_cache_interval'", result.id.counterId, ) continue if not result.value: self.log.debug( "Skipping metric %s because the value is empty", to_string(metric_name)) continue # Get the most recent value that isn't negative valid_values = [v for v in result.value if v >= 0] if not valid_values: self.log.debug( "Skipping metric %s because the value returned by vCenter" " is negative (i.e. the metric is not yet available). values: %s", to_string(metric_name), list(result.value), ) continue tags = [] if should_collect_per_instance_values( self.config, metric_name, resource_type) and ( metric_name in have_instance_value[resource_type]): instance_value = result.id.instance # When collecting per instance values, it's possible that both aggregated metric and per instance # metrics are received. In that case, the metric with no instance value is skipped. if not instance_value: continue instance_tag_key = get_mapped_instance_tag(metric_name) tags.append('{}:{}'.format(instance_tag_key, instance_value)) vsphere_tags = self.infrastructure_cache.get_mor_tags( results_per_mor.entity) mor_tags = mor_props['tags'] + vsphere_tags if resource_type in HISTORICAL_RESOURCES: # Tags are attached to the metrics tags.extend(mor_tags) hostname = None else: # Tags are (mostly) submitted as external host tags. hostname = to_string(mor_props.get('hostname')) if self.config.excluded_host_tags: tags.extend([ t for t in mor_tags if t.split(":", 1)[0] in self.config.excluded_host_tags ]) tags.extend(self.config.base_tags) value = valid_values[-1] if metric_name in PERCENT_METRICS: # Convert the percentage to a float. value /= 100.0 self.log.debug( "Submit metric: name=`%s`, value=`%s`, hostname=`%s`, tags=`%s`", metric_name, value, hostname, tags, ) # vSphere "rates" should be submitted as gauges (rate is precomputed). self.gauge(to_string(metric_name), value, hostname=hostname, tags=tags) def query_metrics_wrapper(self, query_specs): # type: (List[vim.PerformanceManager.QuerySpec]) -> List[vim.PerformanceManager.EntityMetricBase] """Just an instrumentation wrapper around the VSphereAPI.query_metrics method Warning: called in threads """ t0 = Timer() metrics_values = self.api.query_metrics(query_specs) self.histogram( 'datadog.vsphere.query_metrics.time', t0.total(), tags=self.config.base_tags, raw=True, hostname=self._hostname, ) return metrics_values def make_query_specs(self): # type: () -> Iterable[List[vim.PerformanceManager.QuerySpec]] """ Build query specs using MORs and metrics metadata. """ server_current_time = self.api.get_current_time() self.log.debug("Server current datetime: %s", server_current_time) for resource_type in self.config.collected_resource_types: mors = self.infrastructure_cache.get_mors(resource_type) counters = self.metrics_metadata_cache.get_metadata(resource_type) metric_ids = [] # type: List[vim.PerformanceManager.MetricId] for counter_key, metric_name in iteritems(counters): # PerformanceManager.MetricId `instance` kwarg: # - An asterisk (*) to specify all instances of the metric for the specified counterId # - Double-quotes ("") to specify aggregated statistics # More info https://code.vmware.com/apis/704/vsphere/vim.PerformanceManager.MetricId.html if should_collect_per_instance_values(self.config, metric_name, resource_type): instance = "*" else: instance = '' metric_ids.append( vim.PerformanceManager.MetricId(counterId=counter_key, instance=instance)) for batch in self.make_batch(mors, metric_ids, resource_type): query_specs = [] for mor, metrics in iteritems(batch): query_spec = vim.PerformanceManager.QuerySpec( ) # type: vim.PerformanceManager.QuerySpec query_spec.entity = mor query_spec.metricId = metrics if resource_type in REALTIME_RESOURCES: query_spec.intervalId = REALTIME_METRICS_INTERVAL_ID query_spec.maxSample = 1 # Request a single datapoint else: # We cannot use `maxSample` for historical metrics, let's specify a timewindow that will # contain at least one element query_spec.startTime = server_current_time - dt.timedelta( hours=2) query_specs.append(query_spec) if query_specs: yield query_specs def collect_metrics_async(self): # type: () -> None """Run queries in multiple threads and wait for completion.""" tasks = [] try: for query_specs in self.make_query_specs(): tasks.append( self.thread_pool.submit(self.query_metrics_wrapper, query_specs)) except Exception as e: self.log.warning( "Unable to schedule all metric collection tasks: %s", e) finally: self.log.debug("Queued all %d tasks, waiting for completion.", len(tasks)) for future in as_completed(tasks): future_exc = future.exception() if isinstance(future_exc, vmodl.fault.InvalidArgument): # The query was invalid or the resource does not have values for this metric. continue elif future_exc is not None: self.log.warning( "A metric collection API call failed with the following error: %s", future_exc) continue results = future.result() if not results: self.log.debug( "A metric collection API call did not return data.") continue try: # Callback is called in the main thread self.submit_metrics_callback(results) except Exception as e: self.log.exception( "Exception '%s' raised during the submit_metrics_callback. " "Ignoring the error and continuing execution.", e, ) def make_batch( self, mors, # type: Iterable[vim.ManagedEntity] metric_ids, # type: List[vim.PerformanceManager.MetricId] resource_type, # type: Type[vim.ManagedEntity] ): # type: (...) -> Generator[MorBatch, None, None] """Iterates over mor and generate batches with a fixed number of metrics to query. Querying multiple resource types in the same call is error prone if we query a cluster metric. Indeed, cluster metrics result in an unpredicatable number of internal metric queries which all count towards max_query_metrics. Therefore often collecting a single cluster metric can make the whole call to fail. That's why we should never batch cluster metrics with anything else. """ # Safeguard, let's avoid collecting multiple resources in the same call mors_filtered = [m for m in mors if isinstance(m, resource_type) ] # type: List[vim.ManagedEntity] if resource_type == vim.ClusterComputeResource: # Cluster metrics are unpredictable and a single call can max out the limit. Always collect them one by one. max_batch_size = 1 # type: float elif resource_type in REALTIME_RESOURCES or self.config.max_historical_metrics < 0: # Queries are not limited by vCenter max_batch_size = self.config.metrics_per_query else: # Collection is limited by the value of `max_query_metrics` if self.config.metrics_per_query < 0: max_batch_size = self.config.max_historical_metrics else: max_batch_size = min(self.config.metrics_per_query, self.config.max_historical_metrics) batch = defaultdict(list) # type: MorBatch batch_size = 0 for m in mors_filtered: for metric_id in metric_ids: if batch_size == max_batch_size: yield batch batch = defaultdict(list) batch_size = 0 batch[m].append(metric_id) batch_size += 1 # Do not yield an empty batch if batch: yield batch def submit_external_host_tags(self): # type: () -> None """Send external host tags to the Datadog backend. This is only useful for a REALTIME instance because only VMs and Hosts appear as 'datadog hosts'.""" external_host_tags = [] for resource_type in REALTIME_RESOURCES: for mor in self.infrastructure_cache.get_mors(resource_type): mor_props = self.infrastructure_cache.get_mor_props(mor) mor_tags = self.infrastructure_cache.get_mor_tags(mor) hostname = mor_props.get('hostname') # Safeguard if some mors have a None hostname if not hostname: continue mor_tags = mor_props['tags'] + mor_tags tags = [ t for t in mor_tags if t.split(':')[0] not in self.config.excluded_host_tags ] tags.extend(self.config.base_tags) external_host_tags.append((hostname, { self.__NAMESPACE__: tags })) if external_host_tags: self.set_external_tags(external_host_tags) def collect_events(self): # type: () -> None self.log.debug("Starting events collection (query start time: %s).", self.latest_event_query) latest_event_time = None collect_start_time = get_current_datetime() try: t0 = Timer() new_events = self.api.get_new_events( start_time=self.latest_event_query) self.gauge( 'datadog.vsphere.collect_events.time', t0.total(), tags=self.config.base_tags, raw=True, hostname=self._hostname, ) self.log.debug("Got %s new events from the vCenter event manager", len(new_events)) event_config = {'collect_vcenter_alarms': True} for event in new_events: self.log.debug("Processing event with id:%s, type:%s: msg:%s", event.key, type(event), event.fullFormattedMessage) normalized_event = VSphereEvent(event, event_config, self.config.base_tags) # Can return None if the event if filtered out event_payload = normalized_event.get_datadog_payload() if event_payload is not None: self.log.debug("Submit event with id:%s, type:%s: msg:%s", event.key, type(event), event.fullFormattedMessage) self.event(event_payload) if latest_event_time is None or event.createdTime > latest_event_time: latest_event_time = event.createdTime except Exception as e: # Don't get stuck on a failure to fetch an event # Ignore them for next pass self.log.warning("Unable to fetch Events %s", e) if latest_event_time is not None: self.latest_event_query = latest_event_time + dt.timedelta( seconds=1) else: # Let's set `self.latest_event_query` to `collect_start_time` as safeguard in case no events are reported # OR something bad happened (which might happen again indefinitely). self.latest_event_query = collect_start_time def check(self, _): # type: (Any) -> None self._hostname = datadog_agent.get_hostname() # Assert the health of the vCenter API by getting the version, and submit the service_check accordingly try: version_info = self.api.get_version() if self.is_metadata_collection_enabled(): self.set_metadata('version', version_info.version_str) except Exception: # Explicitly do not attach any host to the service checks. self.log.exception( "The vCenter API is not responding. The check will not run.") self.service_check(SERVICE_CHECK_NAME, AgentCheck.CRITICAL, tags=self.config.base_tags, hostname=None) raise else: self.service_check(SERVICE_CHECK_NAME, AgentCheck.OK, tags=self.config.base_tags, hostname=None) # Collect and submit events if self.config.should_collect_events: self.collect_events() if self.config.collect_events_only: return # Update the value of `max_query_metrics` if needed if self.config.is_historical(): try: vcenter_max_hist_metrics = self.api.get_max_query_metrics() if vcenter_max_hist_metrics < self.config.max_historical_metrics: self.log.warning( "The integration was configured with `max_query_metrics: %d` but your vCenter has a" "limit of %d which is lower. Ignoring your configuration in favor of the vCenter value." "To update the vCenter value, please update the `%s` field", self.config.max_historical_metrics, vcenter_max_hist_metrics, MAX_QUERY_METRICS_OPTION, ) self.config.max_historical_metrics = vcenter_max_hist_metrics except Exception: self.config.max_historical_metrics = DEFAULT_MAX_QUERY_METRICS self.log.info( "Could not fetch the value of %s, setting `max_historical_metrics` to %d.", MAX_QUERY_METRICS_OPTION, DEFAULT_MAX_QUERY_METRICS, ) pass # Refresh the metrics metadata cache if self.metrics_metadata_cache.is_expired(): with self.metrics_metadata_cache.update(): self.refresh_metrics_metadata_cache() # Refresh the infrastructure cache if self.infrastructure_cache.is_expired(): with self.infrastructure_cache.update(): self.refresh_infrastructure_cache() # Submit host tags as soon as we have fresh data self.submit_external_host_tags() # Submit the number of VMs that are monitored for resource_type in self.config.collected_resource_types: for mor in self.infrastructure_cache.get_mors(resource_type): mor_props = self.infrastructure_cache.get_mor_props(mor) # Explicitly do not attach any host to those metrics. resource_tags = mor_props.get('tags', []) self.count( '{}.count'.format(MOR_TYPE_AS_STRING[resource_type]), 1, tags=self.config.base_tags + resource_tags, hostname=None, ) # Creating a thread pool and starting metric collection self.log.debug("Starting metric collection in %d threads.", self.config.threads_count) self.collect_metrics_async() self.log.debug("Metric collection completed.")