def _discover_queues(self, queue_manager, mq_pattern_filter):
    queues = []

    for queue_type in SUPPORTED_QUEUE_TYPES:
        args = {pymqi.CMQC.MQCA_Q_NAME: ensure_bytes(mq_pattern_filter), pymqi.CMQC.MQIA_Q_TYPE: queue_type}
        try:
            pcf = pymqi.PCFExecute(queue_manager)
            response = pcf.MQCMD_INQUIRE_Q(args)
        except pymqi.MQMIError as e:
            self.warning("Error discovering queue: {}".format(e))
        else:
            for queue_info in response:
                queue = queue_info[pymqi.CMQC.MQCA_Q_NAME]
                queues.append(ensure_unicode(queue).strip())

    return queues

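# Illustrative sketch (assumed values, not from the source): the PCF arguments
# built above pair a bytes-encoded object-name filter with a queue type constant.
# MQQT_LOCAL/MQQT_REMOTE are real pymqi CMQC constants; the filter value is made up.
def _example_inquire_args():
    import pymqi

    return {
        pymqi.CMQC.MQCA_Q_NAME: b'APP.*',  # wildcard pattern, bytes as required by PCF
        pymqi.CMQC.MQIA_Q_TYPE: pymqi.CMQC.MQQT_LOCAL,
    }
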
def submit_metrics(self, child, prefix, tags):
    value = child.get(metrics.METRIC_VALUE_FIELDS[child.tag])
    metric_name = self.normalize(
        ensure_unicode(child.get('name')), prefix='{}.{}'.format(self.METRIC_PREFIX, prefix), fix_case=True
    )

    tag = child.tag
    if (
        child.get('unit') in self.custom_queries_units_gauge
        and prefix in self.custom_stats
        and tag == 'CountStatistic'
    ):
        tag = 'TimeStatistic'
    self.metric_type_mapping[tag](metric_name, value, tags=tags)

    # creates new JVM metrics correctly as gauges
    if prefix == "jvm":
        jvm_metric_name = "{}_gauge".format(metric_name)
        self.gauge(jvm_metric_name, value, tags=tags)

def _get_data(self, instance):
    host = instance.get('host')
    port = int(instance.get('port', 2222))  # 2222 is default
    tags = instance.get('tags', [])
    if tags is None:
        tags = []
    service_check_tags = ['host:{}'.format(host), 'port:{}'.format(port)] + tags
    service_check_tags = list(set(service_check_tags))

    try:
        addrs = socket.getaddrinfo(host, port, 0, 0, socket.IPPROTO_TCP)
    except socket.gaierror as e:
        self.log.warning("unable to retrieve address info for %s:%s - %s", host, port, e)
        self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL, tags=service_check_tags)
        return None

    response = ""
    for addr in addrs:
        try:
            if addr[1] == socket.SOCK_STREAM:
                client = socket.socket(*addr[0:3])
                client.connect(addr[-1])
                self.log.debug(u"Querying: {0}:{1}".format(host, port))
                while True:
                    data = ensure_unicode(client.recv(1024))
                    if not data:
                        break
                    response = ''.join([response, data])
                client.close()
                break
        except socket.error as e:
            self.log.warning("unable to connect to %s - %s", addr[-1], e)

    status = AgentCheck.OK if response else AgentCheck.CRITICAL
    self.service_check(self.SERVICE_CHECK_NAME, status, tags=service_check_tags)

    return response

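# For reference: each entry returned by socket.getaddrinfo() is a 5-tuple
# (family, type, proto, canonname, sockaddr), which is why addr[0:3] feeds
# socket.socket(...) and addr[-1] is the sockaddr passed to connect().
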
def test_version():
    """
    If the docker image is in a different repository, we check that the version
    requested in the VARNISH_VERSION env var is the one running inside the container.
    """
    varnishstat = common.get_varnish_stat_path()

    # Version info is printed to stderr
    output = subprocess.check_output(shlex.split(varnishstat) + ["-V"], stderr=subprocess.STDOUT)
    res = re.search(r"varnish-(\d+\.\d\.\d)", ensure_unicode(output))
    if res is None:
        raise Exception("Could not retrieve varnish version from docker")

    version = res.groups()[0]
    assert version == os.environ.get('VARNISH_VERSION', common.VARNISH_DEFAULT_VERSION)

def poll_mock(mock_http_response):
    registry = CollectorRegistry()
    g1 = Gauge('metric1', 'processor usage', ['matched_label', 'node', 'flavor'], registry=registry)
    g1.labels(matched_label="foobar", node="host1", flavor="test").set(99.9)
    g2 = Gauge('metric2', 'memory usage', ['matched_label', 'node', 'timestamp'], registry=registry)
    g2.labels(matched_label="foobar", node="host2", timestamp="123").set(12.2)
    c1 = Counter('counter1', 'hits', ['node'], registry=registry)
    c1.labels(node="host2").inc(42)
    g3 = Gauge('metric3', 'memory usage', ['matched_label', 'node', 'timestamp'], registry=registry)
    g3.labels(matched_label="foobar", node="host2", timestamp="456").set(float('inf'))

    mock_http_response(ensure_unicode(generate_latest(registry)), normalize_content=False)

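# For reference (format only; exact label order may differ): generate_latest(registry)
# renders the Prometheus text exposition that the mock serves, e.g.
#   # HELP metric1 processor usage
#   # TYPE metric1 gauge
#   metric1{matched_label="foobar",node="host1",flavor="test"} 99.9
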
def _process_mor_objects_queue(self, instance):
    """
    Pops `batch_morlist_size` items from the mor objects queue and runs asynchronously
    the _process_mor_objects_queue_async method to fill the Mor cache.
    """
    i_key = self._instance_key(instance)
    self.mor_cache.init_instance(i_key)

    if not self.mor_objects_queue.contains(i_key):
        self.log.debug("Objects queue is not initialized yet for instance {}, skipping processing".format(i_key))
        return

    for resource_type in RESOURCE_TYPE_METRICS:
        # Batch size can prevent querying large payloads at once if the environment is too large
        # If batch size is set to 0, process everything at once
        batch_size = self.batch_morlist_size or self.mor_objects_queue.size(i_key, resource_type)
        while self.mor_objects_queue.size(i_key, resource_type):
            mors = []
            for _ in range(batch_size):
                mor = self.mor_objects_queue.pop(i_key, resource_type)
                if mor is None:
                    self.log.debug(
                        "No more objects of type '{}' left in the queue".format(ensure_unicode(resource_type))
                    )
                    break

                mor_name = str(mor['mor'])
                mor['interval'] = REAL_TIME_INTERVAL if mor['mor_type'] in REALTIME_RESOURCES else None
                # Always update the cache to account for Mors that might have changed parent
                # in the meantime (e.g. a migrated VM).
                self.mor_cache.set_mor(i_key, mor_name, mor)

                # Only do this for non real-time resources i.e. datacenter, datastore and cluster
                # For hosts and VMs, we can rely on a precomputed list of metrics
                realtime_only = is_affirmative(instance.get("collect_realtime_only", True))
                if mor["mor_type"] not in REALTIME_RESOURCES and not realtime_only:
                    mors.append(mor)

            # We will actually schedule jobs for non realtime resources only.
            if mors:
                self.pool.apply_async(self._process_mor_objects_queue_async, args=(instance, mors))

def test_check(self, aggregator):
    instance = self.INSTANCES['main']
    c = NfsStatCheck(self.CHECK_NAME, self.INIT_CONFIG, {}, [instance])

    with open(os.path.join(FIXTURE_DIR, 'nfsiostat'), 'rb') as f:
        mock_output = ensure_unicode(f.read())

    with mock.patch('datadog_checks.nfsstat.nfsstat.get_subprocess_output', return_value=(mock_output, '', 0)):
        c.check(instance)

    tags = list(instance['tags'])
    tags.extend(['nfs_server:192.168.34.1', 'nfs_export:/exports/nfs/datadog/two', 'nfs_mount:/mnt/datadog/two'])

    for metric in metrics:
        aggregator.assert_metric(metric, tags=tags)

    assert aggregator.metrics_asserted_pct == 100.0

def poll_mock():
    registry = CollectorRegistry()
    g1 = Gauge('metric1', 'processor usage', ['matched_label', 'node', 'flavor'], registry=registry)
    g1.labels(matched_label="foobar", node="host1", flavor="test").set(99.9)
    g2 = Gauge('metric2', 'memory usage', ['matched_label', 'node', 'timestamp'], registry=registry)
    g2.labels(matched_label="foobar", node="host2", timestamp="123").set(12.2)
    c1 = Counter('counter1', 'hits', ['node'], registry=registry)
    c1.labels(node="host2").inc(42)
    g3 = Gauge('metric3', 'memory usage', ['matched_label', 'node', 'timestamp'], registry=registry)
    g3.labels(matched_label="foobar", node="host2", timestamp="456").set(float('inf'))

    poll_mock_patch = mock.patch(
        'requests.get',
        return_value=mock.MagicMock(
            status_code=200,
            iter_lines=lambda **kwargs: ensure_unicode(generate_latest(registry)).split("\n"),
            headers={'Content-Type': "text/plain"},
        ),
    )
    with poll_mock_patch:
        yield

def _end_element(self, name, tags):
    if name == "stat":
        m_name = ensure_unicode(self.normalize(self._current_metric))
        # int() is unbounded on Python 3 (and auto-promotes on Python 2),
        # replacing the Python-2-only long() the original relied on.
        if self._current_type in ("a", "c"):
            self.rate(m_name, int(self._current_value), tags=tags)
        elif self._current_type in ("i", "g"):
            self.gauge(m_name, int(self._current_value), tags=tags)
            if 'n_purges' in m_name:
                self.rate('varnish.n_purgesps', int(self._current_value), tags=tags)
        else:
            # Unsupported data type, ignore
            self._reset()
            return  # don't save

        # reset for next stat element
        self._reset()
    elif name in ("ident", "name") or (name == "type" and self._current_str != "MAIN"):
        self._current_metric += "." + self._current_str

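# For reference (hypothetical sample in the shape the handlers above parse):
# `varnishstat -x` emits elements such as
#   <stat><type>MAIN</type><name>client_req</name><value>112</value><flag>a</flag></stat>
# where flag 'a'/'c' values are submitted as rates and 'i'/'g' as gauges.
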
def _get_parent_tags(self, mor, all_objects):
    properties = all_objects.get(mor, {})
    parent = properties.get('parent')
    if parent:
        tags = []
        parent_name = ensure_unicode(all_objects.get(parent, {}).get('name', 'unknown'))
        if isinstance(parent, vim.HostSystem):
            tags.append('vsphere_host:{}'.format(parent_name))
        elif isinstance(parent, vim.Folder):
            tags.append('vsphere_folder:{}'.format(parent_name))
        elif isinstance(parent, vim.ComputeResource):
            if isinstance(parent, vim.ClusterComputeResource):
                tags.append('vsphere_cluster:{}'.format(parent_name))
            tags.append('vsphere_compute:{}'.format(parent_name))
        elif isinstance(parent, vim.Datacenter):
            tags.append('vsphere_datacenter:{}'.format(parent_name))

        parent_tags = self._get_parent_tags(parent, all_objects)
        parent_tags.extend(tags)
        return parent_tags
    return []

def _submit_channel_status(self, queue_manager, search_channel_name, tags, config, channels_to_skip=None):
    """Submit channel status

    :param search_channel_name: may contain wildcard characters
    """
    channels_to_skip = channels_to_skip or []
    search_channel_tags = tags + ["channel:{}".format(search_channel_name)]
    try:
        args = {pymqi.CMQCFC.MQCACH_CHANNEL_NAME: ensure_bytes(search_channel_name)}
        pcf = pymqi.PCFExecute(queue_manager)
        response = pcf.MQCMD_INQUIRE_CHANNEL_STATUS(args)
        self.service_check(self.CHANNEL_SERVICE_CHECK, AgentCheck.OK, search_channel_tags)
    except pymqi.MQMIError as e:
        self.log.warning("Error getting CHANNEL stats: {}".format(e))
        self.service_check(self.CHANNEL_SERVICE_CHECK, AgentCheck.CRITICAL, search_channel_tags)
    else:
        for channel_info in response:
            channel_name = ensure_unicode(channel_info[pymqi.CMQCFC.MQCACH_CHANNEL_NAME]).strip()
            if channel_name in channels_to_skip:
                continue
            channel_tags = tags + ["channel:{}".format(channel_name)]

            channel_status = channel_info[pymqi.CMQCFC.MQIACH_CHANNEL_STATUS]

            self._submit_channel_count(channel_name, channel_status, channel_tags)
            self._submit_status_check(channel_name, channel_status, channel_tags, config)

def convert_and_filter_stats(self, stats):
    """
    Converts raw query stats to native Python types.
    Drops string results as well.

    :param stats: raw query stats
    :type stats: dict(str, str)
    :return: converted results and tags
    :rtype: tuple[dict(str, float), set[str]]
    """
    results = {}
    tags = set(self.additional_tags)
    for k, v in stats.items():
        if k in self.excluded:
            continue
        found_re = False
        for r in self.excluded_re:
            if r.match(k):
                found_re = True
                break
        if found_re:
            continue
        try:
            # try a number conversion
            value = float(v.strip())
            results[k] = value
        except Exception:
            # this is a string value instead
            if k == 'ups.status':
                if v.lower().startswith('ol') or v.lower().startswith('on'):
                    results[k] = 1.0
                else:
                    results[k] = 0.0
            if k in self.string_tags:
                tags.add('{}:{}'.format(k, ensure_unicode(self.convert_to_underscore_separated(v))))
    return results, tags

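# Illustrative input/output (made-up values): with string_tags = {'ups.model'},
#   convert_and_filter_stats({'battery.charge': '100', 'ups.status': 'OL', 'ups.model': 'Smart-UPS'})
# would return roughly
#   ({'battery.charge': 100.0, 'ups.status': 1.0}, {'ups.model:smart_ups', ...additional_tags})
# since 'OL'/'ON...' statuses map to 1.0 and any other status string maps to 0.0.
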
def report_as_service_check(self, sc_name, status, instance, msg=None):
    instance_name = ensure_unicode(self.normalize(instance['name']))
    host = instance.get('host', None)
    port = instance.get('port', None)
    custom_tags = instance.get('tags', [])

    if status == Status.UP:
        msg = None

    tags = custom_tags + [
        'target_host:{}'.format(host),
        'port:{}'.format(port),
        'instance:{}'.format(instance_name),
    ]

    self.service_check(self.SERVICE_CHECK_NAME, NetworkCheck.STATUS_TO_SERVICE_CHECK[status], tags=tags, message=msg)
    # Report as a metric as well
    self.gauge("network.tcp.can_connect", 1 if status == Status.UP else 0, tags=tags)

def _get_parent_tags(self, mor, all_objects):
    tags = []
    properties = all_objects.get(mor, {})
    parent = properties.get("parent")
    if parent:
        parent_name = ensure_unicode(all_objects.get(parent, {}).get("name", "unknown"))
        tag = []
        if isinstance(parent, vim.HostSystem):
            tag.append('vsphere_host:{}'.format(parent_name))
        elif isinstance(parent, vim.Folder):
            tag.append('vsphere_folder:{}'.format(parent_name))
        elif isinstance(parent, vim.ComputeResource):
            if isinstance(parent, vim.ClusterComputeResource):
                tag.append('vsphere_cluster:{}'.format(parent_name))
            tag.append('vsphere_compute:{}'.format(parent_name))
        elif isinstance(parent, vim.Datacenter):
            tag.append('vsphere_datacenter:{}'.format(parent_name))

        tags = self._get_parent_tags(parent, all_objects)
        if tag:
            tags.extend(tag)

    return tags

def _get_server_instance(self, instance):
    i_key = self._instance_key(instance)
    tags = instance.get('tags', [])

    service_check_tags = [
        'vcenter_server:{}'.format(self._instance_key(instance)),
        'vcenter_host:{}'.format(ensure_unicode(instance.get('host'))),
    ] + tags
    service_check_tags = list(set(service_check_tags))

    with self.server_instances_lock:
        if i_key not in self.server_instances:
            self.server_instances[i_key] = self._smart_connect(instance, service_check_tags)

        # Test if the connection is working
        try:
            self.server_instances[i_key].CurrentTime()
            self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.OK, tags=service_check_tags)
        except Exception:
            # Try to reconnect. If the connection is definitely broken,
            # this will send CRITICAL service check and raise
            self.server_instances[i_key] = self._smart_connect(instance, service_check_tags)

        return self.server_instances[i_key]

def _get_all_objs(self, server_instance, regexes, include_only_marked, tags, use_guest_hostname=False):
    """
    Explore vCenter infrastructure to discover hosts, virtual machines, etc.
    and compute their associated tags.

    Start at the vCenter `rootFolder`, so as to collect every object.

    Example topology:
    ```
    rootFolder
    - datacenter1
        - compute_resource1 == cluster
            - host1
            - host2
            - host3
        - compute_resource2
            - host5
                - vm1
                - vm2
    ```

    If it's a node we want to query metrics for, it will be enqueued at the
    instance level and will be processed by a subsequent job.
    """
    start = time.time()
    obj_list = defaultdict(list)

    # Collect objects and their attributes
    all_objects = self._collect_mors_and_attributes(server_instance)

    # Add rootFolder since it is not explored by the propertyCollector
    rootFolder = server_instance.content.rootFolder
    all_objects[rootFolder] = {"name": rootFolder.name, "parent": None}

    for obj, properties in all_objects.items():
        instance_tags = []
        if not self._is_excluded(obj, properties, regexes, include_only_marked) and isinstance(
            obj, RESOURCE_TYPE_METRICS
        ):
            if use_guest_hostname:
                hostname = properties.get("guest.hostName", properties.get("name", "unknown"))
            else:
                hostname = properties.get("name", "unknown")
            if properties.get("parent"):
                instance_tags.extend(self._get_parent_tags(obj, all_objects))

            if isinstance(obj, vim.VirtualMachine):
                vsphere_type = 'vsphere_type:vm'
                vimtype = vim.VirtualMachine
                mor_type = "vm"
                power_state = properties.get("runtime.powerState")
                if power_state != vim.VirtualMachinePowerState.poweredOn:
                    self.log.debug("Skipping VM in state %s", ensure_unicode(power_state))
                    continue
                host_mor = properties.get("runtime.host")
                host_props = all_objects.get(host_mor, {})
                host = "unknown"
                if host_mor and host_props:
                    host = ensure_unicode(host_props.get("name", "unknown"))
                    if self._is_excluded(host_mor, host_props, regexes, include_only_marked):
                        self.log.debug(
                            "Skipping VM because host %s is excluded by rule %s.",
                            host,
                            regexes.get('host_include'),
                        )
                        continue
                instance_tags.append('vsphere_host:{}'.format(host))
            elif isinstance(obj, vim.HostSystem):
                vsphere_type = 'vsphere_type:host'
                vimtype = vim.HostSystem
                mor_type = "host"
            elif isinstance(obj, vim.Datastore):
                vsphere_type = 'vsphere_type:datastore'
                instance_tags.append(
                    'vsphere_datastore:{}'.format(ensure_unicode(properties.get("name", "unknown")))
                )
                hostname = None
                vimtype = vim.Datastore
                mor_type = "datastore"
            elif isinstance(obj, vim.Datacenter):
                vsphere_type = 'vsphere_type:datacenter'
                instance_tags.append(
                    "vsphere_datacenter:{}".format(ensure_unicode(properties.get("name", "unknown")))
                )
                hostname = None
                vimtype = vim.Datacenter
                mor_type = "datacenter"
            elif isinstance(obj, vim.ClusterComputeResource):
                vsphere_type = 'vsphere_type:cluster'
                instance_tags.append(
                    "vsphere_cluster:{}".format(ensure_unicode(properties.get("name", "unknown")))
                )
                hostname = None
                vimtype = vim.ClusterComputeResource
                mor_type = "cluster"
            else:
                vsphere_type = None

            if vsphere_type:
                instance_tags.append(vsphere_type)
            obj_list[vimtype].append(
                {"mor_type": mor_type, "mor": obj, "hostname": hostname, "tags": tags + instance_tags}
            )

    self.log.debug("All objects with attributes cached in %s seconds.", time.time() - start)
    return obj_list

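# Shape of the returned obj_list (illustrative): a dict keyed by vim type, e.g.
#   {vim.VirtualMachine: [{"mor_type": "vm", "mor": <mor>, "hostname": "vm1", "tags": [...]}, ...],
#    vim.Datastore: [{"mor_type": "datastore", "mor": <mor>, "hostname": None, "tags": [...]}, ...]}
# Datastores, datacenters and clusters carry hostname=None, so they are identified
# by tags rather than mapped to a host.
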
def _collect_mors_and_attributes(self, server_instance):
    resources = list(RESOURCE_TYPE_METRICS)
    resources.extend(RESOURCE_TYPE_NO_METRIC)

    content = server_instance.content
    view_ref = content.viewManager.CreateContainerView(content.rootFolder, resources, True)

    # Object used to query MORs as well as the attributes we require in one API call
    # See https://code.vmware.com/apis/358/vsphere#/doc/vmodl.query.PropertyCollector.html
    collector = content.propertyCollector

    # Specify the root object from where we collect the rest of the objects
    obj_spec = vmodl.query.PropertyCollector.ObjectSpec()
    obj_spec.obj = view_ref
    obj_spec.skip = True

    # Specify the attribute of the root object to traverse to obtain all the attributes
    traversal_spec = vmodl.query.PropertyCollector.TraversalSpec()
    traversal_spec.path = "view"
    traversal_spec.skip = False
    traversal_spec.type = view_ref.__class__
    obj_spec.selectSet = [traversal_spec]

    property_specs = []
    # Specify which attributes we want to retrieve per object
    for resource in resources:
        property_spec = vmodl.query.PropertyCollector.PropertySpec()
        property_spec.type = resource
        property_spec.pathSet = ["name", "parent", "customValue"]
        if resource == vim.VirtualMachine:
            property_spec.pathSet.append("runtime.powerState")
            property_spec.pathSet.append("runtime.host")
            property_spec.pathSet.append("guest.hostName")
        property_specs.append(property_spec)

    # Create our filter spec from the above specs
    filter_spec = vmodl.query.PropertyCollector.FilterSpec()
    filter_spec.objectSet = [obj_spec]
    filter_spec.propSet = property_specs

    retr_opts = vmodl.query.PropertyCollector.RetrieveOptions()
    # To limit the number of objects retrieved per call.
    # If batch_collector_size is 0, collect maximum number of objects.
    retr_opts.maxObjects = self.batch_collector_size or None

    # Collect the objects and their properties
    res = collector.RetrievePropertiesEx([filter_spec], retr_opts)
    objects = res.objects
    # Results can be paginated
    while res.token is not None:
        res = collector.ContinueRetrievePropertiesEx(res.token)
        objects.extend(res.objects)

    mor_attrs = {}
    error_counter = 0
    for obj in objects:
        if obj.missingSet and error_counter < 10:
            for prop in obj.missingSet:
                error_counter += 1
                self.log.error(
                    "Unable to retrieve property %s for object %s: %s",
                    ensure_unicode(prop.path),
                    ensure_unicode(obj.obj),
                    ensure_unicode(prop.fault),
                )
                if error_counter == 10:
                    self.log.error("Too many errors during object collection, stop logging")
                    break
        mor_attrs[obj.obj] = {prop.name: prop.val for prop in obj.propSet} if obj.propSet else {}

    return mor_attrs

def _instance_key(instance):
    i_key = ensure_unicode(instance.get('name'))
    if i_key is None:
        raise BadConfigError("Must define a unique 'name' per vCenter instance")
    return i_key

def submit_metrics(self, child, prefix, tags):
    value = child.get(metrics.METRIC_VALUE_FIELDS[child.tag])
    metric_name = self.normalize(
        ensure_unicode(child.get('name')), prefix='{}.{}'.format(self.METRIC_PREFIX, prefix), fix_case=True
    )
    self.metric_type_mapping[child.tag](metric_name, value, tags=tags)

def _check_key_lengths(self, conn, instance, tags):
    """
    Compute the length of the configured keys across all the databases
    """
    key_list = instance.get('keys')

    if key_list is None:
        return

    if not isinstance(key_list, list) or len(key_list) == 0:
        self.warning("keys in redis configuration is either not a list or empty")
        return

    # get all the available databases
    databases = list(conn.info('keyspace'))
    if not databases:
        self.warning("Redis database is empty")
        return

    # convert to integer the output of `keyspace`, from `db0` to `0`
    # and store items in a set
    databases = [int(dbstring[2:]) for dbstring in databases]

    # user might have configured the instance to target one specific db
    if 'db' in instance:
        db = instance['db']
        if db not in databases:
            self.warning("Cannot find database {}".format(instance['db']))
            return
        databases = [db]

    # maps a key to the total length across databases
    lengths_overall = defaultdict(int)

    # don't overwrite the configured instance, use a copy
    tmp_instance = deepcopy(instance)

    for db in databases:
        lengths = defaultdict(lambda: defaultdict(int))
        tmp_instance['db'] = db
        db_conn = self._get_conn(tmp_instance)

        for key_pattern in key_list:
            if re.search(r"(?<!\\)[*?[]", key_pattern):
                keys = db_conn.scan_iter(match=key_pattern)
            else:
                keys = [key_pattern]

            for key in keys:
                text_key = ensure_unicode(key)
                try:
                    key_type = ensure_unicode(db_conn.type(key))
                except redis.ResponseError:
                    self.log.info("key {} on remote server; skipping".format(text_key))
                    continue

                if key_type == 'list':
                    keylen = db_conn.llen(key)
                    lengths[text_key]["length"] += keylen
                    lengths_overall[text_key] += keylen
                elif key_type == 'set':
                    keylen = db_conn.scard(key)
                    lengths[text_key]["length"] += keylen
                    lengths_overall[text_key] += keylen
                elif key_type == 'zset':
                    keylen = db_conn.zcard(key)
                    lengths[text_key]["length"] += keylen
                    lengths_overall[text_key] += keylen
                elif key_type == 'hash':
                    keylen = db_conn.hlen(key)
                    lengths[text_key]["length"] += keylen
                    lengths_overall[text_key] += keylen
                elif key_type == 'string':
                    # Send 1 if the key exists as a string
                    lengths[text_key]["length"] += 1
                    lengths_overall[text_key] += 1
                else:
                    # If the type is unknown, it might be because the key doesn't exist,
                    # which can be because the list is empty. So always send 0 in that case.
                    lengths[text_key]["length"] += 0
                    lengths_overall[text_key] += 0

                # Tagging with key_type since the same key can exist with a
                # different key_type in another db
                lengths[text_key]["key_type"] = key_type

        # Send the metrics for each db in the redis instance.
        for key, total in iteritems(lengths):
            # Only send non-zeros if tagged per db.
            if total["length"] > 0:
                self.gauge(
                    'redis.key.length',
                    total["length"],
                    tags=tags
                    + ['key:{}'.format(key), 'key_type:{}'.format(total["key_type"]), 'redis_db:db{}'.format(db)],
                )

    # Warn if a key is missing from the entire redis instance.
    # Send 0 if the key is missing/empty from the entire redis instance.
    for key, total in iteritems(lengths_overall):
        if total == 0 and instance.get("warn_on_missing_keys", True):
            self.gauge('redis.key.length', total, tags=tags + ['key:{}'.format(key)])
            self.warning("{0} key not found in redis".format(key))

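# A quick sketch of the glob-detection regex used above: an unescaped *, ? or [
# sends the pattern to SCAN, anything else is looked up literally.
import re

_GLOB_CHARS = re.compile(r"(?<!\\)[*?[]")

assert _GLOB_CHARS.search("user:*:session")      # glob -> db_conn.scan_iter
assert _GLOB_CHARS.search("queue?")              # glob -> db_conn.scan_iter
assert not _GLOB_CHARS.search(r"literal\*key")   # escaped star -> direct lookup
assert not _GLOB_CHARS.search("plain_key")       # no glob chars -> direct lookup
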
def _check_slowlog(self, instance, custom_tags):
    """Retrieve length and entries from Redis' SLOWLOG

    This will parse through all entries of the SLOWLOG and select ones
    within the time range between the last seen entries and now
    """
    conn = self._get_conn(instance)

    tags = self._get_tags(custom_tags, instance)

    if not instance.get(MAX_SLOW_ENTRIES_KEY):
        try:
            max_slow_entries = int(conn.config_get(MAX_SLOW_ENTRIES_KEY)[MAX_SLOW_ENTRIES_KEY])
            if max_slow_entries > DEFAULT_MAX_SLOW_ENTRIES:
                self.warning(
                    "Redis {0} is higher than {1}. Defaulting to {1}. "  # noqa: G001
                    "If you need a higher value, please set {0} in your check config".format(
                        MAX_SLOW_ENTRIES_KEY, DEFAULT_MAX_SLOW_ENTRIES
                    )
                )
                max_slow_entries = DEFAULT_MAX_SLOW_ENTRIES
        # No config on AWS Elasticache
        except redis.ResponseError:
            max_slow_entries = DEFAULT_MAX_SLOW_ENTRIES
    else:
        max_slow_entries = int(instance.get(MAX_SLOW_ENTRIES_KEY))

    # Generate a unique id for this instance to be persisted across runs
    ts_key = self._generate_instance_key(instance)

    # Get all slowlog entries
    slowlogs = conn.slowlog_get(max_slow_entries)

    # Find slowlog entries between last timestamp and now using start_time
    slowlogs = [s for s in slowlogs if s['start_time'] > self.last_timestamp_seen[ts_key]]

    max_ts = 0
    # Slowlog entry looks like:
    #     {'command': 'LPOP somekey',
    #      'duration': 11238,
    #      'id': 496L,
    #      'start_time': 1422529869}
    for slowlog in slowlogs:
        if slowlog['start_time'] > max_ts:
            max_ts = slowlog['start_time']

        slowlog_tags = list(tags)
        command = slowlog['command'].split()
        # When the "Garantia Data" custom Redis is used, redis-py returns
        # an empty `command` field
        # FIXME when https://github.com/andymccurdy/redis-py/pull/622 is released in redis-py
        if command:
            slowlog_tags.append('command:{}'.format(ensure_unicode(command[0])))

        value = slowlog['duration']
        self.histogram('redis.slowlog.micros', value, tags=slowlog_tags)

    self.last_timestamp_seen[ts_key] = max_ts

def _check(self, instance):
    (
        addr,
        ntlm_domain,
        username,
        password,
        client_cert,
        client_key,
        method,
        data,
        http_response_status_code,
        timeout,
        include_content,
        headers,
        response_time,
        content_match,
        reverse_content_match,
        tags,
        disable_ssl_validation,
        ssl_expire,
        instance_ca_certs,
        weakcipher,
        check_hostname,
        ignore_ssl_warning,
        skip_proxy,
        allow_redirects,
        stream,
    ) = from_instance(instance, self.ca_certs)
    start = time.time()

    def send_status_up(logMsg):
        # TODO: A6 log needs bytes and cannot handle unicode
        self.log.debug(logMsg)
        service_checks.append((self.SC_STATUS, Status.UP, "UP"))

    def send_status_down(loginfo, down_msg):
        # TODO: A6 log needs bytes and cannot handle unicode
        self.log.info(loginfo)
        if include_content:
            down_msg += '\nContent: {}'.format(content[:CONTENT_LENGTH])
        service_checks.append((self.SC_STATUS, Status.DOWN, down_msg))

    # Store tags in a temporary list so that we don't modify the global tags data structure
    tags_list = list(tags)
    tags_list.append('url:{}'.format(addr))
    instance_name = self.normalize(instance['name'])
    tags_list.append("instance:{}".format(instance_name))
    service_checks = []
    r = None
    try:
        parsed_uri = urlparse(addr)
        self.log.debug("Connecting to {}".format(addr))

        suppress_warning = False
        if disable_ssl_validation and parsed_uri.scheme == "https":
            explicit_validation = 'disable_ssl_validation' in instance
            if ignore_ssl_warning:
                if explicit_validation:
                    suppress_warning = True
            else:
                # Log if we're skipping SSL validation for HTTPS URLs
                if explicit_validation:
                    self.log.debug("Skipping SSL certificate validation for {} based on configuration".format(addr))
                # Emit a warning if disable_ssl_validation is not explicitly set and we're not ignoring warnings
                else:
                    self.warning(
                        "Parameter disable_ssl_validation for {} is not explicitly set, "
                        "defaults to true".format(addr)
                    )

        instance_proxy = self.get_instance_proxy(instance, addr)
        self.log.debug("Proxies used for {} - {}".format(addr, instance_proxy))

        auth = None
        if password is not None:
            if username is not None:
                auth = (username, password)
            elif ntlm_domain is not None:
                auth = HttpNtlmAuth(ntlm_domain, password)

        sess = requests.Session()
        sess.trust_env = False
        if weakcipher:
            base_addr = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
            sess.mount(base_addr, WeakCiphersAdapter())
            self.log.debug(
                "Weak Ciphers will be used for {}. Supported Cipherlist: {}".format(
                    base_addr, WeakCiphersHTTPSConnection.SUPPORTED_CIPHERS
                )
            )

        with warnings.catch_warnings():
            # Suppress warnings from urllib3 only if disable_ssl_validation is explicitly set to True
            # and ignore_ssl_warning is True
            if suppress_warning:
                warnings.simplefilter('ignore', InsecureRequestWarning)

            # Add 'Content-Type' for non GET requests when they have not been specified in custom headers
            if method.upper() in DATA_METHODS and not headers.get('Content-Type'):
                headers['Content-Type'] = 'application/x-www-form-urlencoded'

            r = sess.request(
                method.upper(),
                addr,
                auth=auth,
                timeout=timeout,
                headers=headers,
                proxies=instance_proxy,
                allow_redirects=allow_redirects,
                stream=stream,
                verify=False if disable_ssl_validation else instance_ca_certs,
                json=data if method.upper() in DATA_METHODS and isinstance(data, dict) else None,
                data=data if method.upper() in DATA_METHODS and isinstance(data, string_types) else None,
                cert=(client_cert, client_key) if client_cert and client_key else None,
            )
    except (socket.timeout, requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e:
        length = int((time.time() - start) * 1000)
        self.log.info("{} is DOWN, error: {}. Connection failed after {} ms".format(addr, str(e), length))
        service_checks.append(
            (self.SC_STATUS, Status.DOWN, "{}. Connection failed after {} ms".format(str(e), length))
        )
    except socket.error as e:
        length = int((time.time() - start) * 1000)
        self.log.info("{} is DOWN, error: {}. Connection failed after {} ms".format(addr, repr(e), length))
        service_checks.append(
            (self.SC_STATUS, Status.DOWN, "Socket error: {}. Connection failed after {} ms".format(repr(e), length))
        )
    except Exception as e:
        length = int((time.time() - start) * 1000)
        self.log.error("Unhandled exception {}. Connection failed after {} ms".format(str(e), length))
        raise
    else:
        # Only add the URL tag if it's not already present
        if not any(filter(re.compile('^url:').match, tags_list)):
            tags_list.append('url:{}'.format(addr))

        # Only report this metric if the site is not down
        if response_time and not service_checks:
            # Stop the timer as early as possible
            running_time = time.time() - start
            self.gauge('network.http.response_time', running_time, tags=tags_list)

        content = r.text

        # Check HTTP response status code
        if not (service_checks or re.match(http_response_status_code, str(r.status_code))):
            if http_response_status_code == DEFAULT_EXPECTED_CODE:
                expected_code = "1xx or 2xx or 3xx"
            else:
                expected_code = http_response_status_code

            message = "Incorrect HTTP return code for url {}. Expected {}, got {}.".format(
                addr, expected_code, str(r.status_code)
            )
            if include_content:
                message += '\nContent: {}'.format(content[:CONTENT_LENGTH])

            self.log.info(message)
            service_checks.append((self.SC_STATUS, Status.DOWN, message))

        if not service_checks:
            # Host is UP
            # Check content matching is set
            if content_match:
                if re.search(content_match, content, re.UNICODE):
                    if reverse_content_match:
                        send_status_down(
                            '{} is found in return content with the reverse_content_match option'.format(
                                ensure_unicode(content_match)
                            ),
                            'Content "{}" found in response with the reverse_content_match'.format(
                                ensure_unicode(content_match)
                            ),
                        )
                    else:
                        send_status_up("{} is found in return content".format(ensure_unicode(content_match)))
                else:
                    if reverse_content_match:
                        send_status_up(
                            "{} is not found in return content with the reverse_content_match option".format(
                                ensure_unicode(content_match)
                            )
                        )
                    else:
                        send_status_down(
                            "{} is not found in return content".format(ensure_unicode(content_match)),
                            'Content "{}" not found in response.'.format(ensure_unicode(content_match)),
                        )
            else:
                send_status_up("{} is UP".format(addr))
    finally:
        if r is not None:
            r.close()

    # Report status metrics as well
    if service_checks:
        can_status = 1 if service_checks[0][1] == "UP" else 0
        self.gauge('network.http.can_connect', can_status, tags=tags_list)

        # cant_connect is useful for top lists
        cant_status = 0 if service_checks[0][1] == "UP" else 1
        self.gauge('network.http.cant_connect', cant_status, tags=tags_list)

    if ssl_expire and parsed_uri.scheme == "https":
        status, days_left, seconds_left, msg = self.check_cert_expiration(
            instance, timeout, instance_ca_certs, check_hostname, client_cert, client_key
        )
        tags_list = list(tags)
        tags_list.append('url:{}'.format(addr))
        tags_list.append("instance:{}".format(instance_name))
        self.gauge('http.ssl.days_left', days_left, tags=tags_list)
        self.gauge('http.ssl.seconds_left', seconds_left, tags=tags_list)
        service_checks.append((self.SC_SSL_CERT, status, msg))

    return service_checks

def from_instance(instance, default_ca_certs=None):
    """
    Create a config object from an instance dictionary
    """
    method = instance.get('method', 'get')
    data = instance.get('data', {})
    tags = instance.get('tags', [])
    client_cert = instance.get('tls_cert') or instance.get('client_cert')
    client_key = instance.get('tls_private_key') or instance.get('client_key')
    http_response_status_code = str(instance.get('http_response_status_code', DEFAULT_EXPECTED_CODE))
    config_headers = instance.get('headers', {})
    default_headers = is_affirmative(instance.get("include_default_headers", True))
    if default_headers:
        headers = agent_headers({})
    else:
        headers = {}
    headers.update(config_headers)
    url = instance.get('url')
    if url is not None:
        url = ensure_unicode(url)
    content_match = instance.get('content_match')
    if content_match is not None:
        content_match = ensure_unicode(content_match)
    reverse_content_match = is_affirmative(instance.get('reverse_content_match', False))
    response_time = is_affirmative(instance.get('collect_response_time', True))
    if not url:
        raise ConfigurationError("Bad configuration. You must specify a url")
    if not url.startswith("http"):
        raise ConfigurationError("The url {} must start with the scheme http or https".format(url))
    include_content = is_affirmative(instance.get('include_content', False))
    ssl_expire = is_affirmative(instance.get('check_certificate_expiration', True))
    instance_ca_certs = instance.get('tls_ca_cert', instance.get('ca_certs', default_ca_certs))
    weakcipher = is_affirmative(instance.get('weakciphers', False))
    check_hostname = is_affirmative(instance.get('check_hostname', True))
    allow_redirects = is_affirmative(instance.get('allow_redirects', True))
    stream = is_affirmative(instance.get('stream', False))

    return Config(
        url,
        client_cert,
        client_key,
        method,
        data,
        http_response_status_code,
        include_content,
        headers,
        response_time,
        content_match,
        reverse_content_match,
        tags,
        ssl_expire,
        instance_ca_certs,
        weakcipher,
        check_hostname,
        allow_redirects,
        stream,
    )

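# A minimal instance sketch accepted by from_instance (values illustrative):
#   config = from_instance({
#       'name': 'example-endpoint',
#       'url': 'https://example.com/health',
#       'method': 'get',
#       'tags': ['env:test'],
#   })
# Everything else falls back to the defaults above (expected status codes,
# certificate expiration check, redirects, agent headers, ...).
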
def _collect_metrics_async(self, instance, query_specs):
    """
    Task that collects the metrics listed in the morlist for one MOR
    """
    # ## <TEST-INSTRUMENTATION>
    t = Timer()
    # ## </TEST-INSTRUMENTATION>
    i_key = self._instance_key(instance)
    server_instance = self._get_server_instance(instance)
    perfManager = server_instance.content.perfManager
    results = perfManager.QueryPerf(query_specs)
    if results:
        for mor_perfs in results:
            mor_name = str(mor_perfs.entity)
            try:
                mor = self.mor_cache.get_mor(i_key, mor_name)
            except MorNotFoundError:
                self.log.error(
                    "Trying to get metrics from object %s deleted from the cache, skipping. "
                    "Consider increasing the parameter `clean_morlist_interval` to avoid that",
                    mor_name,
                )
                continue

            for result in mor_perfs.value:
                counter_id = result.id.counterId
                if not self.metadata_cache.contains(i_key, counter_id):
                    self.log.debug(
                        "Skipping value for counter %s, because there is no metadata about it",
                        ensure_unicode(counter_id),
                    )
                    continue

                # Metric types are absolute, delta, and rate
                metric_name = self.metadata_cache.get_metadata(i_key, result.id.counterId).get('name')

                if self.in_compatibility_mode(instance):
                    if metric_name not in ALL_METRICS:
                        self.log.debug("Skipping unknown `%s` metric.", ensure_unicode(metric_name))
                        continue

                if not result.value:
                    self.log.debug("Skipping `%s` metric because the value is empty", ensure_unicode(metric_name))
                    continue

                instance_name = result.id.instance or "none"
                value = self._transform_value(instance, result.id.counterId, result.value[0])

                hostname = mor['hostname']

                tags = ['instance:{}'.format(ensure_unicode(instance_name))]
                if not hostname:  # no host tags available
                    tags.extend(mor['tags'])
                else:
                    hostname = to_string(hostname)

                tags.extend(instance.get('tags', []))

                # vsphere "rates" should be submitted as gauges (rate is
                # precomputed).
                self.gauge("vsphere.{}".format(ensure_unicode(metric_name)), value, hostname=hostname, tags=tags)

    # ## <TEST-INSTRUMENTATION>
    custom_tags = instance.get('tags', []) + ['instance:{}'.format(i_key)]
    self.histogram('datadog.agent.vsphere.metric_colection.time', t.total(), tags=custom_tags)

def _process_mor_objects_queue_async(self, instance, mors):
    """
    Process a batch of items popped from the objects queue by querying the available
    metrics for these MORs and then putting them in the Mor cache
    """
    t = time.time()
    i_key = self._instance_key(instance)
    server_instance = self._get_server_instance(instance)
    perfManager = server_instance.content.perfManager

    # For non realtime metrics, we need to specifically ask which counters are available
    # for which entity, so we call perfManager.QueryAvailablePerfMetric for each cluster,
    # datacenter, datastore. This should be okay since the number of such entities
    # shouldn't be excessively large
    for mor in mors:
        mor_name = str(mor['mor'])
        available_metrics = {m.counterId for m in perfManager.QueryAvailablePerfMetric(entity=mor["mor"])}
        try:
            self.mor_cache.set_metrics(i_key, mor_name, self._compute_needed_metrics(instance, available_metrics))
        except MorNotFoundError:
            self.log.error("Object '{}' is missing from the cache, skipping.".format(ensure_unicode(mor_name)))
            continue

    # TEST-INSTRUMENTATION
    self.histogram(
        'datadog.agent.vsphere.morlist_process_atomic.time', time.time() - t, tags=instance.get('tags', [])
    )

def _compute_needed_metrics(self, instance, available_metrics):
    """
    Compare the available metrics for one MOR we have computed and intersect them
    with the set of metrics we want to report
    """
    i_key = self._instance_key(instance)
    if self.in_compatibility_mode(instance):
        if instance.get('all_metrics', False):
            return available_metrics

        wanted_metrics = []
        # Get only the basic metrics
        for counter_id in available_metrics:
            # No cache yet, skip it for now
            if not self.metadata_cache.contains(i_key, counter_id):
                self.log.debug(
                    "No metadata found for counter {}, will not collect it".format(ensure_unicode(counter_id))
                )
                continue
            metadata = self.metadata_cache.get_metadata(i_key, counter_id)
            if metadata.get('name') in BASIC_METRICS:
                wanted_metrics.append(vim.PerformanceManager.MetricId(counterId=counter_id, instance="*"))

        return wanted_metrics
    else:
        # The metadata cache contains only metrics of the desired level, so use it to filter the metrics to keep
        return [
            vim.PerformanceManager.MetricId(counterId=counter_id, instance="*")
            for counter_id in available_metrics
            if self.metadata_cache.contains(i_key, counter_id)
        ]

def collect_metrics(self, instance):
    """
    Calls asynchronously _collect_metrics_async on all MORs; as the job queue
    is processed the Aggregator will receive the metrics.
    """
    i_key = self._instance_key(instance)
    if not self.mor_cache.contains(i_key):
        self.log.debug("Not collecting metrics for instance '%s', nothing to do yet.", i_key)
        return

    server_instance = self._get_server_instance(instance)
    max_historical_metrics = DEFAULT_MAX_HIST_METRICS

    if self._should_collect_historical(instance):
        try:
            if 'max_query_metrics' in instance:
                max_historical_metrics = int(instance['max_query_metrics'])
                self.log.info("Collecting up to %d metrics", max_historical_metrics)
            else:
                vcenter_settings = server_instance.content.setting.QueryOptions("config.vpxd.stats.maxQueryMetrics")
                max_historical_metrics = int(vcenter_settings[0].value)
            if max_historical_metrics < 0:
                max_historical_metrics = float('inf')
        except Exception:
            pass

    # TODO: Remove me once the fix for `max_query_metrics` is here by default
    mors_batch_method = (
        self.mor_cache.mors_batch
        if is_affirmative(instance.get('fix_max_query_metrics'))
        else self.mor_cache.legacy_mors_batch
    )

    vm_count = 0
    custom_tags = instance.get('tags', [])
    tags = ["vcenter_server:{}".format(ensure_unicode(instance.get('name')))] + custom_tags

    n_mors = self.mor_cache.instance_size(i_key)
    if not n_mors:
        if self._is_main_instance(instance):
            self.gauge('vsphere.vm.count', vm_count, tags=tags)
        self.log.debug("No Mor objects to process for instance '%s', skip...", i_key)
        return

    self.log.debug("Collecting metrics for %s mors", ensure_unicode(n_mors))

    # Request metrics for several objects at once. We can limit the number of objects with batch_size
    # If batch_size is 0, process everything at once
    batch_size = self.batch_morlist_size or n_mors
    for batch in mors_batch_method(i_key, batch_size, max_historical_metrics):
        query_specs = []
        for mor in itervalues(batch):
            if mor['mor_type'] == 'vm':
                vm_count += 1
            if mor['mor_type'] not in REALTIME_RESOURCES and ('metrics' not in mor or not mor['metrics']):
                continue

            query_spec = vim.PerformanceManager.QuerySpec()
            query_spec.entity = mor["mor"]
            query_spec.intervalId = mor.get("interval")
            query_spec.maxSample = 1
            if mor['mor_type'] in REALTIME_RESOURCES:
                query_spec.metricId = self.metadata_cache.get_metric_ids(i_key)
            else:
                query_spec.metricId = mor["metrics"]
            query_specs.append(query_spec)

        if query_specs:
            self.pool.apply_async(self._collect_metrics_async, args=(instance, query_specs))

    if self._is_main_instance(instance):
        self.gauge('vsphere.vm.count', vm_count, tags=tags)

def _parse_tags(self):
    self.tags = []
    self.tags.append(u'nfs_server:{0}'.format(ensure_unicode(self.nfs_server)))
    self.tags.append(u'nfs_export:{0}'.format(ensure_unicode(self.nfs_export)))
    self.tags.append(u'nfs_mount:{0}'.format(ensure_unicode(self.mount)))

def check(self, instance):
    (
        addr,
        client_cert,
        client_key,
        method,
        data,
        http_response_status_code,
        include_content,
        headers,
        response_time,
        content_match,
        reverse_content_match,
        tags,
        ssl_expire,
        instance_ca_certs,
        weakcipher,
        check_hostname,
        allow_redirects,
        stream,
    ) = from_instance(instance, self.ca_certs)
    timeout = self.http.options['timeout'][0]
    start = time.time()

    # allows default headers to be included based on `include_default_headers` flag
    self.http.options['headers'] = headers

    def send_status_up(logMsg):
        # TODO: A6 log needs bytes and cannot handle unicode
        self.log.debug(logMsg)
        service_checks.append((self.SC_STATUS, AgentCheck.OK, "UP"))

    def send_status_down(loginfo, down_msg):
        # TODO: A6 log needs bytes and cannot handle unicode
        self.log.info(loginfo)
        down_msg = self._include_content(include_content, down_msg, content)
        service_checks.append((self.SC_STATUS, AgentCheck.CRITICAL, down_msg))

    # Store tags in a temporary list so that we don't modify the global tags data structure
    tags_list = list(tags)
    tags_list.append('url:{}'.format(addr))
    instance_name = self.normalize_tag(instance['name'])
    tags_list.append("instance:{}".format(instance_name))
    service_checks = []
    service_checks_tags = self._get_service_checks_tags(instance)
    r = None
    try:
        parsed_uri = urlparse(addr)
        self.log.debug("Connecting to %s", addr)

        self.http.session.trust_env = False
        if weakcipher:
            base_addr = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
            self.http.session.mount(base_addr, WeakCiphersAdapter())
            self.log.debug(
                "Weak Ciphers will be used for %s. Supported Cipherlist: %s",
                base_addr,
                WeakCiphersHTTPSConnection.SUPPORTED_CIPHERS,
            )

        # Add 'Content-Type' for non GET requests when they have not been specified in custom headers
        if method.upper() in DATA_METHODS and not headers.get('Content-Type'):
            self.http.options['headers']['Content-Type'] = 'application/x-www-form-urlencoded'

        r = getattr(self.http, method.lower())(
            addr,
            persist=True,
            allow_redirects=allow_redirects,
            stream=stream,
            json=data if method.upper() in DATA_METHODS and isinstance(data, dict) else None,
            data=data if method.upper() in DATA_METHODS and isinstance(data, string_types) else None,
        )
    except (socket.timeout, requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e:
        length = int((time.time() - start) * 1000)
        self.log.info("%s is DOWN, error: %s. Connection failed after %s ms", addr, e, length)
        service_checks.append(
            (self.SC_STATUS, AgentCheck.CRITICAL, "{}. Connection failed after {} ms".format(str(e), length))
        )
    except socket.error as e:
        length = int((time.time() - start) * 1000)
        self.log.info("%s is DOWN, error: %s. Connection failed after %s ms", addr, repr(e), length)
        service_checks.append(
            (
                self.SC_STATUS,
                AgentCheck.CRITICAL,
                "Socket error: {}. Connection failed after {} ms".format(repr(e), length),
            )
        )
    except IOError as e:  # Py2 throws IOError on invalid cert path while py3 throws a socket.error
        length = int((time.time() - start) * 1000)
        self.log.info("Host %s could not be reached: %s. Connection failed after %s ms", addr, repr(e), length)
        service_checks.append(
            (
                self.SC_STATUS,
                AgentCheck.CRITICAL,
                "Socket error: {}. Connection failed after {} ms".format(repr(e), length),
            )
        )
    except Exception as e:
        length = int((time.time() - start) * 1000)
        self.log.error("Unhandled exception %s. Connection failed after %s ms", e, length)
        raise
    else:
        # Only add the URL tag if it's not already present
        if not any(filter(re.compile('^url:').match, tags_list)):
            tags_list.append('url:{}'.format(addr))

        # Only report this metric if the site is not down
        if response_time and not service_checks:
            self.gauge('network.http.response_time', r.elapsed.total_seconds(), tags=tags_list)

        content = r.text

        # Check HTTP response status code
        if not (service_checks or re.match(http_response_status_code, str(r.status_code))):
            if http_response_status_code == DEFAULT_EXPECTED_CODE:
                expected_code = "1xx or 2xx or 3xx"
            else:
                expected_code = http_response_status_code

            message = "Incorrect HTTP return code for url {}. Expected {}, got {}.".format(
                addr, expected_code, str(r.status_code)
            )
            message = self._include_content(include_content, message, content)

            self.log.info(message)
            service_checks.append((self.SC_STATUS, AgentCheck.CRITICAL, message))

        if not service_checks:
            # Host is UP
            # Check content matching is set
            if content_match:
                if re.search(content_match, content, re.UNICODE):
                    if reverse_content_match:
                        send_status_down(
                            '{} is found in return content with the reverse_content_match option'.format(
                                ensure_unicode(content_match)
                            ),
                            'Content "{}" found in response with the reverse_content_match'.format(
                                ensure_unicode(content_match)
                            ),
                        )
                    else:
                        send_status_up("{} is found in return content".format(ensure_unicode(content_match)))
                else:
                    if reverse_content_match:
                        send_status_up(
                            "{} is not found in return content with the reverse_content_match option".format(
                                ensure_unicode(content_match)
                            )
                        )
                    else:
                        send_status_down(
                            "{} is not found in return content".format(ensure_unicode(content_match)),
                            'Content "{}" not found in response.'.format(ensure_unicode(content_match)),
                        )
            else:
                send_status_up("{} is UP".format(addr))
    finally:
        if r is not None:
            r.close()
        # resets the wrapper Session object
        self.http._session.close()
        self.http._session = None

    # Report status metrics as well
    if service_checks:
        can_status = 1 if service_checks[0][1] == AgentCheck.OK else 0
        self.gauge('network.http.can_connect', can_status, tags=tags_list)

        # cant_connect is useful for top lists
        cant_status = 0 if service_checks[0][1] == AgentCheck.OK else 1
        self.gauge('network.http.cant_connect', cant_status, tags=tags_list)

    if ssl_expire and parsed_uri.scheme == "https":
        status, days_left, seconds_left, msg = self.check_cert_expiration(
            instance, timeout, instance_ca_certs, check_hostname, client_cert, client_key
        )
        tags_list = list(tags)
        tags_list.append('url:{}'.format(addr))
        tags_list.append("instance:{}".format(instance_name))
        self.gauge('http.ssl.days_left', days_left, tags=tags_list)
        self.gauge('http.ssl.seconds_left', seconds_left, tags=tags_list)
        service_checks.append((self.SC_SSL_CERT, status, msg))

    for sc in service_checks:
        sc_name, status, msg = sc
        self.report_as_service_check(sc_name, status, service_checks_tags, msg)

def check(self, instance):
    sites = instance.get('sites')
    if sites is None:
        expected_sites = set()
    else:
        expected_sites = set(sites)

    # _Total should always be in the list of expected sites; we always report _Total
    if "_Total" not in expected_sites:
        expected_sites.add("_Total")

    self.log.debug("expected sites is {}".format(str(expected_sites)))
    key = hash_mutable(instance)
    for inst_name, dd_name, metric_func, counter in self._metrics[key]:
        try:
            try:
                vals = counter.get_all_values()
            except Exception as e:
                self.log.error("Failed to get_all_values {} {}: {}".format(inst_name, dd_name, e))
                continue

            for sitename, val in iteritems(vals):
                tags = []
                if key in self._tags:
                    tags = list(self._tags[key])

                try:
                    if not counter.is_single_instance():
                        # Skip any sites we don't specifically want.
                        if not sites:
                            tags.append("site:{0}".format(ensure_unicode(self.normalize(sitename))))
                        # always report total
                        elif sitename == "_Total":
                            tags.append("site:{0}".format(ensure_unicode(self.normalize(sitename))))
                        elif sitename not in sites:
                            continue
                        else:
                            tags.append("site:{0}".format(ensure_unicode(self.normalize(sitename))))
                except Exception as e:
                    self.log.error("Caught exception {} setting tags".format(str(e)))

                try:
                    metric_func(dd_name, val, tags)
                except Exception as e:
                    self.log.error("metric_func: {} {} {}".format(dd_name, str(val), str(e)))

                if dd_name == "iis.uptime":
                    uptime = int(val)
                    status = AgentCheck.CRITICAL if uptime == 0 else AgentCheck.OK
                    self.service_check(self.SERVICE_CHECK, status, tags)
                    if sitename in expected_sites:
                        self.log.debug("Removing {} from expected sites".format(sitename))
                        expected_sites.remove(sitename)
                    else:
                        self.log.warning("site not in expected_sites {}".format(sitename))
        except Exception as e:
            # don't give up on all of the metrics because one failed
            self.log.error("IIS Failed to get metric data for {} {}: {}".format(inst_name, dd_name, str(e)))

    for site in expected_sites:
        tags = []
        if key in self._tags:
            tags = list(self._tags[key])
        tags.append("site:{}".format(ensure_unicode(self.normalize(site))))
        self.service_check(self.SERVICE_CHECK, AgentCheck.CRITICAL, tags)