def check(self, instance):
    """Entry point of the network check: read instance options, then
    dispatch to the platform-specific collector.

    :param instance: instance config dict (may be None, treated as empty)
    """
    if instance is None:
        instance = {}

    self._excluded_ifaces = instance.get('excluded_interfaces', [])
    self._collect_cx_state = instance.get('collect_connection_state', False)
    self._collect_rate_metrics = instance.get('collect_rate_metrics', True)
    self._collect_count_metrics = instance.get('collect_count_metrics', False)

    # This decides whether we should split or combine connection states,
    # along with a few other things
    self._setup_metrics(instance)

    self._exclude_iface_re = None
    exclude_re = instance.get('excluded_interface_re', None)
    if exclude_re:
        # Use lazy logger arguments instead of eager %-formatting so the
        # string is only built when debug logging is enabled.
        self.log.debug("Excluding network devices matching: %s", exclude_re)
        self._exclude_iface_re = re.compile(exclude_re)

    # Platform dispatch: exactly one of these runs per check cycle.
    if Platform.is_linux():
        self._check_linux(instance)
    elif Platform.is_bsd():
        self._check_bsd(instance)
    elif Platform.is_solaris():
        self._check_solaris(instance)
    elif Platform.is_windows():
        self._check_psutil(instance)
def psutil_wrapper(self, process, method, accessors, try_sudo, *args, **kwargs):
    """
    A psutil wrapper that is calling
    * psutil.method(*args, **kwargs) and returns the result
    OR
    * psutil.method(*args, **kwargs).accessor[i] for each accessors
    given in a list, the result being indexed in a dictionary
    by the accessor name

    Errors are swallowed and logged at debug level so a single
    unreadable process does not abort the whole scan.
    Returns None (accessors is None) or a dict (accessors given) on failure.
    """
    # Shape of the default result mirrors the success shape so callers
    # can treat failure and "no data" uniformly.
    if accessors is None:
        result = None
    else:
        result = {}

    # Ban certain method that we know fail
    # (platform-specific psutil methods short-circuit to the empty result)
    if method == 'num_fds' and not Platform.is_unix():
        return result
    elif method == 'num_handles' and not Platform.is_win32():
        return result

    try:
        res = getattr(process, method)(*args, **kwargs)
        if accessors is None:
            result = res
        else:
            for acc in accessors:
                try:
                    result[acc] = getattr(res, acc)
                except AttributeError:
                    self.log.debug(
                        "psutil.%s().%s attribute does not exist", method, acc)
    except (NotImplementedError, AttributeError):
        self.log.debug("psutil method %s not implemented", method)
    except psutil.AccessDenied:
        self.log.debug("psutil was denied access for method %s", method)
        # Fallback: count fd entries via `sudo ls /proc/<pid>/fd/` when
        # the unprivileged call is denied and the config allows sudo.
        if method == 'num_fds' and Platform.is_unix() and try_sudo:
            try:
                # It is up the agent's packager to grant
                # corresponding sudo policy on unix platforms
                ls_args = ['sudo', 'ls', '/proc/{}/fd/'.format(process.pid)]
                process_ls = subprocess.check_output(ls_args)
                result = len(process_ls.splitlines())
            except subprocess.CalledProcessError as e:
                self.log.exception(
                    "trying to retrieve %s with sudo failed with return code %s",
                    method, e.returncode)
            except Exception:
                self.log.exception(
                    "trying to retrieve %s with sudo also failed", method)
    except psutil.NoSuchProcess:
        # Process exited between enumeration and inspection; not an error.
        self.warning("Process %s disappeared while scanning", process.pid)

    return result
def __init__(self, name, init_config, agentConfig, instances=None): AgentCheck.__init__(self, name, init_config, agentConfig, instances) # ad stands for access denied # We cache the PIDs getting this error and don't iterate on them more often than `access_denied_cache_duration`` # This cache is for all PIDs so it's global, but it should be refreshed by instance self.last_ad_cache_ts = {} self.ad_cache = set() self.access_denied_cache_duration = int( init_config.get('access_denied_cache_duration', DEFAULT_AD_CACHE_DURATION)) # By default cache the PID list for a while # Sometimes it's not wanted b/c it can mess with no-data monitoring # This cache is indexed per instance self.last_pid_cache_ts = {} self.pid_cache = {} self.pid_cache_duration = int( init_config.get('pid_cache_duration', DEFAULT_PID_CACHE_DURATION)) self._conflicting_procfs = False self._deprecated_init_procfs = False if Platform.is_linux(): procfs_path = init_config.get('procfs_path') if procfs_path: if 'procfs_path' in agentConfig and procfs_path != agentConfig.get( 'procfs_path').rstrip('/'): self._conflicting_procfs = True else: self._deprecated_init_procfs = True psutil.PROCFS_PATH = procfs_path # Process cache, indexed by instance self.process_cache = defaultdict(dict)
def _exclude_disk_psutil(self, part):
    """Return True when the given psutil partition should be skipped."""
    # On Windows, non-ready partitions (cd-rom drives with no disk in
    # them, or entries with an empty fstype) may raise ENOENT, pop up a
    # GUI error dialog, or simply hang — filter them out up front.
    if Platform.is_win32() and ('cdrom' in part.opts or part.fstype == ''):
        return True
    # Otherwise defer to the generic user-configured exclusion rules.
    return self._exclude_disk(part.device, part.fstype, part.mountpoint)
def _collect_part_metrics(self, part, usage):
    """Build the metric payload for one partition's usage numbers."""
    # For legacy reasons, the standard unit is kB, hence the /1024.0.
    metrics = {
        self.METRIC_DISK.format(attr): getattr(usage, attr) / 1024.0
        for attr in ('total', 'used', 'free')
    }
    # FIXME: 6.x, use percent, a lot more logical than in_use
    metrics[self.METRIC_DISK.format('in_use')] = usage.percent / 100.0

    # inode counters are only meaningful on unix platforms
    if Platform.is_unix():
        metrics.update(self._collect_inodes_metrics(part.mountpoint))

    return metrics
def spin_up_haproxy():
    """pytest fixture body: start haproxy via docker-compose, yield for
    the tests, then tear the environment back down.

    NOTE(review): mutates os.environ directly (no copy) — the compose
    variables leak into the test process; presumably intentional here.
    """
    env = os.environ
    env['HAPROXY_CONFIG_DIR'] = os.path.join(common.HERE, 'compose')
    env['HAPROXY_CONFIG'] = os.path.join(common.HERE, 'compose', 'haproxy.cfg')
    env['HAPROXY_CONFIG_OPEN'] = os.path.join(common.HERE, 'compose', 'haproxy-open.cfg')
    env['HAPROXY_SOCKET_DIR'] = common.UNIXSOCKET_DIR

    if Platform.is_linux() and not os.path.exists(common.UNIXSOCKET_DIR):
        # make the temp directory on linux
        os.makedirs(common.UNIXSOCKET_DIR)

    args = [
        "docker-compose",
        "-f", os.path.join(common.HERE, 'compose', 'haproxy.yaml')
    ]

    # "down" first so a leftover stack from a previous run doesn't clash.
    subprocess.check_call(args + ["down"], env=env)
    subprocess.check_call(args + ["up", "-d"], env=env)
    wait_for_haproxy()

    # subprocess.check_call(["ls", "-al", "/tmp/"], env=env)
    # subprocess.check_call(["ls", "-al", "/tmp/haproxy"], env=env)

    try:
        if Platform.is_linux():
            # on linux this needs access to the socket
            # it won't work without access
            chown_args = []
            user = getpass.getuser()
            if user != 'root':
                chown_args += ['sudo']
            chown_args += ["chown", user, common.UNIXSOCKET_PATH]
            subprocess.check_call(chown_args, env=env)
    except subprocess.CalledProcessError:
        # it's not always bad if this fails
        pass

    time.sleep(20)

    # Hand control to the tests; everything below is teardown.
    yield

    subprocess.check_call(args + ["down"], env=env)
    if Platform.is_linux():
        # clean up the temp directory on linux; ignore failure if the
        # socket dir is busy or already gone
        try:
            os.removedirs(common.UNIXSOCKET_DIR)
        except OSError:
            pass
def test_unixsocket_config(aggregator, spin_up_haproxy):
    """Run the haproxy check over the unix-socket config (linux only)."""
    # unix sockets are only exercised on linux hosts
    if not Platform.is_linux():
        return

    check = HAProxy(common.CHECK_NAME, {}, {})
    check.check(common.CONFIG_UNIXSOCKET)

    url_tag = ["instance_url:{0}".format(common.UNIXSOCKET_URL)]
    _test_frontend_metrics(aggregator, url_tag)
    _test_backend_metrics(aggregator, url_tag)
    _test_service_checks(aggregator)

    # Raises when coverage < 100%
    aggregator.assert_all_metrics_covered()
def get_pagefault_stats(self, pid):
    """Return a generator over the four page-fault counters of *pid*
    read from procfs, or None off-linux / on read failure."""
    if not Platform.is_linux():
        return None

    # http://man7.org/linux/man-pages/man5/proc.5.html
    # fields 10-13 of /proc/<pid>/stat are the fault counters we want.
    stat_path = '/{}/{}/stat'.format(psutil.PROCFS_PATH, pid)
    try:
        with open(stat_path, 'r') as stat_file:
            data = stat_file.read()
    except Exception:
        self.log.debug('error getting proc stats: file_to_string failed for /%s/%s/stat', psutil.PROCFS_PATH, pid)
        return None
    return (int(i) for i in data.split()[9:13])
def collect_metrics_psutil(self):
    """Collect usage metrics for every mounted partition via psutil,
    then collect per-device latency metrics."""
    # Rebuilt each run; maps device -> (fstype, mountpoint) for the
    # latency collection below.
    self._valid_disks = {}
    for part in psutil.disk_partitions(all=True):
        # we check all exclude conditions
        if self._exclude_disk_psutil(part):
            continue

        # Get disk metrics here to be able to exclude on total usage
        try:
            # disk_usage can hang on dead network mounts; bound it to 5s.
            disk_usage = timeout(5)(psutil.disk_usage)(part.mountpoint)
        except TimeoutException:
            self.log.warn(
                u"Timeout while retrieving the disk usage of `%s` mountpoint. Skipping...",
                part.mountpoint)
            continue
        except Exception as e:
            self.log.warn("Unable to get disk metrics for %s: %s",
                          part.mountpoint, e)
            continue

        # Exclude disks with total disk size 0
        if disk_usage.total == 0:
            continue

        # For later, latency metrics
        self._valid_disks[part.device] = (part.fstype, part.mountpoint)
        self.log.debug('Passed: {0}'.format(part.device))

        tags = [part.fstype, 'filesystem:{}'.format(part.fstype)
                ] if self._tag_by_filesystem else []
        device_name = part.mountpoint if self._use_mount else part.device

        # apply device/mountpoint specific tags
        for regex, device_tags in self._device_tag_re:
            if regex.match(device_name):
                tags += device_tags

        tags.extend(self._custom_tags)

        # legacy check names c: vs psutil name C:\\
        if Platform.is_win32():
            device_name = device_name.strip('\\').lower()
        # NOTE: iteritems() — this code is Python 2 only.
        for metric_name, metric_value in self._collect_part_metrics(
                part, disk_usage).iteritems():
            self.gauge(metric_name, metric_value,
                       tags=tags, device_name=device_name)

    self.collect_latency_metrics()
def test_check_real_process_regex(aggregator):
    "Check to specifically find this python pytest running process using regex."
    from datadog_checks.utils.platform import Platform

    instance = {
        'name': 'py',
        'search_string': ['.*python.*pytest'],
        'exact_match': False,
        'ignored_denied_access': True,
        'thresholds': {'warning': [1, 10], 'critical': [1, 100]},
    }
    process = ProcessCheck(common.CHECK_NAME, {}, {})
    expected_tags = generate_expected_tags(instance)
    process.check(instance)
    for mname in common.PROCESS_METRIC:
        # cases where we don't actually expect some metrics here:
        #  - if io_counters() is not available
        #  - if memory_info_ex() is not available
        #  - first run so no `cpu.pct`
        if ((not _PSUTIL_IO_COUNTERS and '.io' in mname)
                or (not _PSUTIL_MEM_SHARED and 'mem.real' in mname)
                or mname == 'system.processes.cpu.pct'):
            continue

        # metric names differ between unix and windows
        if Platform.is_windows():
            metric = common.UNIX_TO_WINDOWS_MAP.get(mname, mname)
        else:
            metric = mname
        aggregator.assert_metric(metric, at_least=1, tags=expected_tags)

    aggregator.assert_service_check('process.up', count=1,
                                    tags=expected_tags + ['process:py'])

    # this requires another run (cpu.pct needs two samples)
    process.check(instance)
    aggregator.assert_metric('system.processes.cpu.pct', count=1,
                             tags=expected_tags)
    aggregator.assert_metric('system.processes.cpu.normalized_pct', count=1,
                             tags=expected_tags)
def test_complex_config_replica(aggregator, spin_up_mysql):
    """Run the check against the replica with the complex config and verify
    service checks, the standard metric set, custom query metrics and
    optional metrics.

    Fix: the original computed `ver` (a map/tuple of the server version)
    and never used it — dead code removed.
    """
    mysql_check = MySql(common.CHECK_NAME, {}, {})
    config = copy.deepcopy(common_config.MYSQL_COMPLEX_CONFIG)
    config['port'] = common.SLAVE_PORT
    mysql_check.check(config)

    # self.assertMetricTag('mysql.replication.seconds_behind_master', 'channel:default')

    # Test service check
    aggregator.assert_service_check('mysql.can_connect', status=MySql.OK,
                                    tags=tags.SC_TAGS_REPLICA, count=1)

    # Travis MySQL not running replication - FIX in flavored test.
    aggregator.assert_service_check('mysql.replication.slave_running', status=MySql.OK,
                                    tags=tags.SC_TAGS_REPLICA, at_least=1)

    testable_metrics = (variables.STATUS_VARS + variables.VARIABLES_VARS +
                        variables.INNODB_VARS + variables.BINLOG_VARS +
                        variables.SYSTEM_METRICS + variables.SCHEMA_VARS +
                        variables.SYNTHETIC_VARS)

    # Test metrics
    for mname in testable_metrics:
        # These two are currently not guaranteed outside of a Linux
        # environment.
        if mname == 'mysql.performance.user_time' and not Platform.is_linux():
            continue
        if mname == 'mysql.performance.kernel_time' and not Platform.is_linux():
            continue
        if mname == 'mysql.performance.cpu_time' and Platform.is_windows():
            continue

        if mname == 'mysql.performance.query_run_time.avg':
            aggregator.assert_metric(mname, tags=tags.METRIC_TAGS+['schema:testdb'], count=1)
        elif mname == 'mysql.info.schema.size':
            aggregator.assert_metric(mname, tags=tags.METRIC_TAGS+['schema:testdb'], count=1)
            aggregator.assert_metric(mname, tags=tags.METRIC_TAGS+['schema:information_schema'], count=1)
            aggregator.assert_metric(mname, tags=tags.METRIC_TAGS+['schema:performance_schema'], count=1)
        else:
            aggregator.assert_metric(mname, tags=tags.METRIC_TAGS, at_least=0)

    # test custom query metrics
    aggregator.assert_metric('alice.age', value=25)
    aggregator.assert_metric('bob.age', value=20)

    # test optional metrics
    optional_metrics = (variables.OPTIONAL_REPLICATION_METRICS + variables.OPTIONAL_INNODB_VARS +
                        variables.OPTIONAL_STATUS_VARS + variables.OPTIONAL_STATUS_VARS_5_6_6)
    _test_optional_metrics(aggregator, optional_metrics, 1)

    # Raises when coverage < 100%
    aggregator.assert_all_metrics_covered()
def test_relocated_procfs(aggregator):
    """Verify the process check works when procfs_path points at a fake
    procfs tree, by mocking psutil's C bindings and reloading psutil.
    Python 2 only (uses reload(), __builtin__, contextlib.nested)."""
    from datadog_checks.utils.platform import Platform
    import tempfile
    import shutil
    import uuid

    already_linux = Platform.is_linux()
    unique_process_name = str(uuid.uuid4())
    my_procfs = tempfile.mkdtemp()

    # Recursively materialize a dict-of-dicts as a fake /proc tree.
    def _fake_procfs(arg, root=my_procfs):
        for key, val in arg.iteritems():
            path = os.path.join(root, key)
            if isinstance(val, dict):
                os.mkdir(path)
                _fake_procfs(val, path)
            else:
                with open(path, "w") as f:
                    f.write(str(val))

    _fake_procfs({
        '1': {
            'status': ("Name:\t{}\nThreads:\t1\n").format(unique_process_name),
            'stat': ('1 ({}) S 0 1 1 ' + ' 0' * 46).format(unique_process_name),
            'cmdline': unique_process_name,
        },
        'stat': ("cpu 13034 0 18596 380856797 2013 2 2962 0 0 0\n"
                 "btime 1448632481\n"),
    })

    config = {
        'init_config': {
            'procfs_path': my_procfs
        },
        'instances': [{
            'name': 'moved_procfs',
            'search_string': [unique_process_name],
            'exact_match': False,
            'ignored_denied_access': True,
            'thresholds': {'warning': [1, 10], 'critical': [1, 100]},
        }]
    }

    version = int(psutil.__version__.replace(".", ""))
    process = ProcessCheck(common.CHECK_NAME, config['init_config'], {}, config['instances'])

    try:
        def import_mock(name, i_globals={}, i_locals={}, fromlist=[], level=-1,
                        orig_import=__import__):
            # _psutil_linux and _psutil_posix are the
            # C bindings; use a mock for those
            if name in ('_psutil_linux', '_psutil_posix') or level >= 1 and\
                    ('_psutil_linux' in fromlist or '_psutil_posix' in fromlist):
                m = MagicMock()
                # the import system will ask us for our own name
                m._psutil_linux = m
                m._psutil_posix = m
                # there's a version safety check in psutil/__init__.py;
                # this skips it
                m.version = version
                return m

            return orig_import(name, i_globals, i_locals, fromlist, level)

        # contextlib.nested is deprecated in favor of with MGR1, MGR2, ... etc
        # but we have too many mocks to fit on one line and apparently \ line
        # continuation is not flake8 compliant, even when semantically
        # required (as here). Patch is unlikely to throw errors that are
        # suppressed, so the main downside of contextlib is avoided.
        with contextlib.nested(
                patch('sys.platform', 'linux'),
                patch('socket.AF_PACKET', create=True),
                patch('__builtin__.__import__', side_effect=import_mock)):
            if not already_linux:
                # Reloading psutil fails on linux, but we only
                # need to do so if we didn't start out on a linux platform
                reload(psutil)
            assert Platform.is_linux()

            process.check(config["instances"][0])
    finally:
        # Always clean the fake tree and undo the psutil patching.
        shutil.rmtree(my_procfs)
        if not already_linux:
            # restore the original psutil that doesn't have our mocks
            reload(psutil)
        else:
            psutil.PROCFS_PATH = '/proc'

    expected_tags = generate_expected_tags(config['instances'][0])
    expected_tags += ['process:moved_procfs']
    aggregator.assert_service_check('process.up', count=1, tags=expected_tags)
def collect_metrics_psutil(self):
    """Collect usage metrics for every mounted partition via psutil,
    optionally emit a read-write/read-only service check per device,
    then collect per-device latency metrics."""
    # Rebuilt each run; maps device -> (fstype, mountpoint) for the
    # latency collection below.
    self._valid_disks = {}
    for part in psutil.disk_partitions(all=True):
        # we check all exclude conditions
        if self._exclude_disk_psutil(part):
            continue

        # Get disk metrics here to be able to exclude on total usage
        try:
            # disk_usage can hang on dead network mounts; bound it to 5s.
            disk_usage = timeout(5)(psutil.disk_usage)(part.mountpoint)
        except TimeoutException:
            self.log.warn(
                u"Timeout while retrieving the disk usage of `%s` mountpoint. Skipping...",
                part.mountpoint)
            continue
        except Exception as e:
            self.log.warn("Unable to get disk metrics for %s: %s",
                          part.mountpoint, e)
            continue

        # Exclude disks with total disk size 0
        if disk_usage.total == 0:
            continue

        # For later, latency metrics
        self._valid_disks[part.device] = (part.fstype, part.mountpoint)
        self.log.debug('Passed: {0}'.format(part.device))

        tags = [part.fstype, 'filesystem:{}'.format(part.fstype)
                ] if self._tag_by_filesystem else []
        device_name = part.mountpoint if self._use_mount else part.device

        # apply device/mountpoint specific tags
        for regex, device_tags in self._device_tag_re:
            if regex.match(device_name):
                tags += device_tags

        tags.extend(self._custom_tags)

        # legacy check names c: vs psutil name C:\\
        if Platform.is_win32():
            device_name = device_name.strip('\\').lower()
        # NOTE: iteritems() — this code is Python 2 only.
        for metric_name, metric_value in self._collect_part_metrics(
                part, disk_usage).iteritems():
            self.gauge(metric_name, metric_value,
                       tags=tags, device_name=device_name)

        # Add in a disk read write or read only check
        if self._service_check_rw:
            # part.opts is a comma-separated mount-option string; exactly
            # one of 'rw'/'ro' is expected when the flag is present.
            rwro = list(set(['rw', 'ro']) & set(part.opts.split(',')))
            if len(rwro) == 1:
                self.service_check(
                    'disk.read_write',
                    AgentCheck.OK if rwro[0] == 'rw' else AgentCheck.CRITICAL,
                    tags=tags + ['device:%s' % (device_name)])
            else:
                # neither or both options present: status unknown
                self.service_check('disk.read_write', AgentCheck.UNKNOWN,
                                   tags=tags + ['device:%s' % (device_name)])

    self.collect_latency_metrics()
# (C) Datadog, Inc. 2019-present # All rights reserved # Licensed under a 3-clause BSD style license (see LICENSE) import psutil from datadog_checks.dev import get_here from datadog_checks.utils.platform import Platform HERE = get_here() CHECK_NAME = "system_core" INSTANCE = {"tags": ["tag1:value1"]} if Platform.is_mac(): CHECK_RATES = [ 'system.core.idle', 'system.core.nice', 'system.core.system', 'system.core.user' ] MOCK_PSUTIL_CPU_TIMES = [ psutil._psosx.scputimes(user=7877.29, nice=0.0, system=7469.72, idle=38164.81), psutil._psosx.scputimes(user=3826.74, nice=0.0, system=2701.6, idle=46981.39), psutil._psosx.scputimes(user=7486.51, nice=0.0, system=5991.36, idle=40031.88),
def _check_bsd(self, instance):
    """Collect per-interface counters and TCP stats on BSD/macOS by
    shelling out to netstat and parsing its table output."""
    netstat_flags = ['-i', '-b']
    custom_tags = instance.get('tags', [])

    # FreeBSD's netstat truncates device names unless you pass '-W'
    if Platform.is_freebsd():
        netstat_flags.append('-W')

    try:
        output, _, _ = get_subprocess_output(["netstat"] + netstat_flags, self.log)
        lines = output.splitlines()
        # Name  Mtu   Network       Address            Ipkts Ierrs     Ibytes    Opkts Oerrs     Obytes  Coll
        # lo0   16384 <Link#1>                        318258     0  428252203   318258     0  428252203     0
        # lo0   16384 localhost   fe80:1::1           318258     -  428252203   318258     -  428252203     -
        # lo0   16384 127           localhost         318258     -  428252203   318258     -  428252203     -
        # lo0   16384 localhost   ::1                 318258     -  428252203   318258     -  428252203     -
        # gif0* 1280  <Link#2>                             0     0          0        0     0          0     0
        # stf0* 1280  <Link#3>                             0     0          0        0     0          0     0
        # en0   1500  <Link#4>    04:0c:ce:db:4e:fa 20801309     0 13835457425 15149389     0 11508790198     0
        # en0   1500  seneca.loca fe80:4::60c:ceff: 20801309     - 13835457425 15149389     - 11508790198     -
        # en0   1500  2001:470:1f 2001:470:1f07:11d 20801309     - 13835457425 15149389     - 11508790198     -
        # en0   1500  2001:470:1f 2001:470:1f07:11d 20801309     - 13835457425 15149389     - 11508790198     -
        # en0   1500  192.168.1     192.168.1.63    20801309     - 13835457425 15149389     - 11508790198     -
        # en0   1500  2001:470:1f 2001:470:1f07:11d 20801309     - 13835457425 15149389     - 11508790198     -
        # p2p0  2304  <Link#5>    06:0c:ce:db:4e:fa        0     0          0        0     0          0     0
        # ham0  1404  <Link#6>    7a:79:05:4d:bf:f5    30100     0    6815204    18742     0    8494811     0
        # ham0  1404  5             5.77.191.245       30100     -    6815204    18742     -    8494811     -
        # ham0  1404  seneca.loca fe80:6::7879:5ff:    30100     -    6815204    18742     -    8494811     -
        # ham0  1404  2620:9b::54 2620:9b::54d:bff5    30100     -    6815204    18742     -    8494811     -
        headers = lines[0].split()

        # Given the irregular structure of the table above, better to parse from the end of each line
        # Verify headers first
        #          -7       -6       -5        -4       -3       -2        -1
        for h in ("Ipkts", "Ierrs", "Ibytes", "Opkts", "Oerrs", "Obytes", "Coll"):
            if h not in headers:
                self.log.error("%s not found in %s; cannot parse" % (h, headers))
                return False

        current = None
        for l in lines[1:]:
            # Another header row, abort now, this is IPv6 land
            if "Name" in l:
                break

            x = l.split()
            if len(x) == 0:
                break

            iface = x[0]
            # trailing '*' marks an interface that is down
            if iface.endswith("*"):
                iface = iface[:-1]
            if iface == current:
                # skip multiple lines of same interface
                continue
            else:
                current = iface

            # Filter inactive interfaces
            if self._parse_value(x[-5]) or self._parse_value(x[-2]):
                iface = current
                metrics = {
                    'bytes_rcvd': self._parse_value(x[-5]),
                    'bytes_sent': self._parse_value(x[-2]),
                    'packets_in.count': self._parse_value(x[-7]),
                    'packets_in.error': self._parse_value(x[-6]),
                    'packets_out.count': self._parse_value(x[-4]),
                    'packets_out.error': self._parse_value(x[-3]),
                }
                self._submit_devicemetrics(iface, metrics, custom_tags)
    except SubprocessOutputEmptyError:
        self.log.exception("Error collecting connection stats.")

    try:
        # NOTE(review): "-p" "tcp" is adjacent-literal concatenation and
        # yields the single argument "-ptcp" — valid for BSD netstat, but
        # confirm this is intentional rather than a missing comma.
        netstat, _, _ = get_subprocess_output(["netstat", "-s", "-p" "tcp"], self.log)
        # 3651535 packets sent
        #         972097 data packets (615753248 bytes)
        #         5009 data packets (2832232 bytes) retransmitted
        #         0 resends initiated by MTU discovery
        #         2086952 ack-only packets (471 delayed)
        #         0 URG only packets
        #         0 window probe packets
        #         310851 window update packets
        #         336829 control packets
        #         0 data packets sent after flow control
        #         3058232 checksummed in software
        #         3058232 segments (571218834 bytes) over IPv4
        #         0 segments (0 bytes) over IPv6
        # 4807551 packets received
        #         1143534 acks (for 616095538 bytes)
        #         165400 duplicate acks
        #         ...
        self._submit_regexed_values(netstat, BSD_TCP_METRICS, custom_tags)
    except SubprocessOutputEmptyError:
        self.log.exception("Error collecting TCP stats.")
def _check_linux(self, instance):
    """
    _check_linux can be run inside a container and still collects the network metrics from the host
    For that procfs_path can be set to something like "/host/proc"
    When a custom procfs_path is set, the collect_connection_state option is ignored
    """
    proc_location = self.agentConfig.get('procfs_path', '/proc').rstrip('/')
    custom_tags = instance.get('tags', [])

    # Inside a container with a mounted host proc, read PID 1's view.
    if Platform.is_containerized() and proc_location != "/proc":
        proc_location = "%s/1" % proc_location

    if self._is_collect_cx_state_runnable(proc_location):
        try:
            self.log.debug("Using `ss` to collect connection state")
            # Try using `ss` for increased performance over `netstat`
            for ip_version in ['4', '6']:
                for protocol in ['tcp', 'udp']:
                    # Call `ss` for each IP version because there's no built-in way of distinguishing
                    # between the IP versions in the output
                    # Also calls `ss` for each protocol, because on some systems (e.g. Ubuntu 14.04), there is a
                    # bug that print `tcp` even if it's `udp`
                    output, _, _ = get_subprocess_output(
                        ["ss", "-n", "-{0}".format(protocol[0]),
                         "-a", "-{0}".format(ip_version)], self.log)
                    lines = output.splitlines()
                    # State      Recv-Q Send-Q     Local Address:Port       Peer Address:Port
                    # UNCONN     0      0              127.0.0.1:8125                  *:*
                    # ESTAB      0      0              127.0.0.1:37036         127.0.0.1:8125
                    # UNCONN     0      0        fe80::a00:27ff:fe1c:3c4:123          :::*
                    # TIME-WAIT  0      0          90.56.111.177:56867        46.105.75.4:143
                    # LISTEN     0      0       ::ffff:127.0.0.1:33217  ::ffff:127.0.0.1:7199
                    # ESTAB      0      0       ::ffff:127.0.0.1:58975  ::ffff:127.0.0.1:2181
                    metrics = self._parse_linux_cx_state(
                        lines[1:], self.tcp_states['ss'], 0,
                        protocol=protocol, ip_version=ip_version)
                    # Only send the metrics which match the loop iteration's ip version
                    # NOTE: iteritems() — this code is Python 2 only.
                    for stat, metric in self.cx_state_gauge.iteritems():
                        if stat[0].endswith(ip_version) and stat[0].startswith(protocol):
                            self.gauge(metric, metrics.get(metric), tags=custom_tags)

        except OSError:
            self.log.info("`ss` not found: using `netstat` as a fallback")
            output, _, _ = get_subprocess_output(
                ["netstat", "-n", "-u", "-t", "-a"], self.log)
            lines = output.splitlines()
            # Active Internet connections (w/o servers)
            # Proto Recv-Q Send-Q Local Address           Foreign Address         State
            # tcp        0      0 46.105.75.4:80          79.220.227.193:2032     SYN_RECV
            # tcp        0      0 46.105.75.4:143         90.56.111.177:56867     ESTABLISHED
            # tcp        0      0 46.105.75.4:50468       107.20.207.175:443      TIME_WAIT
            # tcp6       0      0 46.105.75.4:80          93.15.237.188:58038     FIN_WAIT2
            # tcp6       0      0 46.105.75.4:80          79.220.227.193:2029     ESTABLISHED
            # udp        0      0 0.0.0.0:123             0.0.0.0:*
            # udp6       0      0 :::41458                :::*
            metrics = self._parse_linux_cx_state(lines[2:], self.tcp_states['netstat'], 5)
            for metric, value in metrics.iteritems():
                self.gauge(metric, value, tags=custom_tags)
        except SubprocessOutputEmptyError:
            self.log.exception("Error collecting connection stats.")

    # Per-interface counters from /proc/net/dev.
    proc_dev_path = "{}/net/dev".format(proc_location)
    with open(proc_dev_path, 'r') as proc:
        lines = proc.readlines()
    # Inter-|   Receive                                                 |  Transmit
    #  face |bytes     packets errs drop fifo frame compressed multicast|bytes       packets errs drop fifo colls carrier compressed # noqa: E501
    #     lo:45890956   112797   0    0    0     0          0         0    45890956   112797    0    0    0     0       0          0 # noqa: E501
    #   eth0:631947052 1042233   0   19    0   184          0      1206  1208625538  1320529    0    0    0     0       0          0 # noqa: E501
    #   eth1:       0        0   0    0    0     0          0         0           0        0    0    0    0     0       0          0 # noqa: E501
    for l in lines[2:]:
        cols = l.split(':', 1)
        x = cols[1].split()
        # Filter inactive interfaces
        if self._parse_value(x[0]) or self._parse_value(x[8]):
            iface = cols[0].strip()
            metrics = {
                'bytes_rcvd': self._parse_value(x[0]),
                'bytes_sent': self._parse_value(x[8]),
                'packets_in.count': self._parse_value(x[1]),
                'packets_in.error': self._parse_value(x[2]) + self._parse_value(x[3]),
                'packets_out.count': self._parse_value(x[9]),
                'packets_out.error': self._parse_value(x[10]) + self._parse_value(x[11]),
            }
            self._submit_devicemetrics(iface, metrics, custom_tags)

    # /proc/net/netstat and /proc/net/snmp alternate header/data line
    # pairs; fold them into {category: {counter: value}}.
    netstat_data = {}
    for f in ['netstat', 'snmp']:
        proc_data_path = "{}/net/{}".format(proc_location, f)
        try:
            with open(proc_data_path, 'r') as netstat:
                while True:
                    n_header = netstat.readline()
                    if not n_header:
                        break  # No more? Abort!
                    n_data = netstat.readline()

                    h_parts = n_header.strip().split(' ')
                    h_values = n_data.strip().split(' ')
                    # header starts with e.g. "TcpExt:" — drop the colon
                    ns_category = h_parts[0][:-1]
                    netstat_data[ns_category] = {}
                    # Turn the data into a dictionary
                    for idx, hpart in enumerate(h_parts[1:]):
                        netstat_data[ns_category][hpart] = h_values[idx + 1]
        except IOError:
            # On Openshift, /proc/net/snmp is only readable by root
            self.log.debug("Unable to read %s.", proc_data_path)

    nstat_metrics_names = {
        'Tcp': {
            'RetransSegs': 'system.net.tcp.retrans_segs',
            'InSegs': 'system.net.tcp.in_segs',
            'OutSegs': 'system.net.tcp.out_segs',
        },
        'TcpExt': {
            'ListenOverflows': 'system.net.tcp.listen_overflows',
            'ListenDrops': 'system.net.tcp.listen_drops',
            'TCPBacklogDrop': 'system.net.tcp.backlog_drops',
            'TCPRetransFail': 'system.net.tcp.failed_retransmits',
        },
        'Udp': {
            'InDatagrams': 'system.net.udp.in_datagrams',
            'NoPorts': 'system.net.udp.no_ports',
            'InErrors': 'system.net.udp.in_errors',
            'OutDatagrams': 'system.net.udp.out_datagrams',
            'RcvbufErrors': 'system.net.udp.rcv_buf_errors',
            'SndbufErrors': 'system.net.udp.snd_buf_errors',
            'InCsumErrors': 'system.net.udp.in_csum_errors'
        }
    }

    # Skip the first line, as it's junk
    for k in nstat_metrics_names:
        for met in nstat_metrics_names[k]:
            if met in netstat_data.get(k, {}):
                self._submit_netmetric(nstat_metrics_names[k][met],
                                       self._parse_value(netstat_data[k][met]),
                                       tags=custom_tags)
    # (tail of a preceding test whose definition is truncated in this chunk)
    aggregator.assert_all_metrics_covered()


def test_check_ssl(aggregator, check, openldap_server, instance_ssl):
    """SSL check must fail on cert verification, then pass with verify off."""
    tags = ["url:{}".format(instance_ssl["url"]), "test:integration"]
    # Should fail certificate verification
    with pytest.raises(ldap3.core.exceptions.LDAPExceptionError):
        check.check(instance_ssl)
    aggregator.assert_service_check("openldap.can_connect", check.CRITICAL, tags=tags)
    instance_ssl["ssl_verify"] = False
    # Should work now
    check.check(instance_ssl)
    aggregator.assert_service_check("openldap.can_connect", check.OK, tags=tags)


def test_check_connection_failure(aggregator, check, openldap_server, instance):
    """An unreachable URL must raise and emit a CRITICAL service check."""
    instance["url"] = "bad_url"
    tags = ["url:{}".format(instance["url"]), "test:integration"]
    # Should fail certificate verification
    with pytest.raises(ldap3.core.exceptions.LDAPExceptionError):
        check.check(instance)
    aggregator.assert_service_check("openldap.can_connect", check.CRITICAL, tags=tags)


@pytest.mark.skipif(not Platform.is_linux(), reason='Windows sockets are not file handles')
def test_check_socket(aggregator, check, openldap_server, instance):
    """Connect over an ldapi:// unix socket (linux only)."""
    instance["url"] = "ldapi://{}".format(openldap_server)
    tags = ["url:{}".format(instance["url"]), "test:integration"]
    check.check(instance)
    aggregator.assert_service_check("openldap.can_connect", check.OK, tags=tags)
# Should work now check.check(instance_ssl) aggregator.assert_service_check("openldap.can_connect", check.OK, tags=tags) @pytest.mark.usefixtures('dd_environment') def test_check_connection_failure(aggregator, check, instance): instance["url"] = "bad_url" tags = ["url:{}".format(instance["url"]), "test:integration"] # Should fail certificate verification with pytest.raises(ldap3.core.exceptions.LDAPExceptionError): check.check(instance) aggregator.assert_service_check("openldap.can_connect", check.CRITICAL, tags=tags) @pytest.mark.skipif(not Platform.is_linux(), reason='Windows sockets are not file handles') @pytest.mark.usefixtures('dd_environment') def test_check_socket(aggregator, check, instance): host_socket_path = os.path.join(os.environ['HOST_SOCKET_DIR'], 'ldapi') instance["url"] = "ldapi://{}".format(host_socket_path) tags = ["url:{}".format(instance["url"]), "test:integration"] check.check(instance) aggregator.assert_service_check("openldap.can_connect", check.OK, tags=tags)
# All rights reserved # Licensed under a 3-clause BSD style license (see LICENSE) from __future__ import unicode_literals import time import dns.resolver from datadog_checks.checks import NetworkCheck, Status from datadog_checks.utils.platform import Platform # These imports are necessary because otherwise dynamic type # resolution will fail on windows without it. # See more here: https://github.com/rthalley/dnspython/issues/39. if Platform.is_win32(): from dns.rdtypes.ANY import * # noqa from dns.rdtypes.IN import * # noqa # for tiny time deltas, time.time on Windows reports the same value # of the clock more than once, causing the computation of response_time # to be often 0; let's use time.clock that is more precise. time_func = time.clock else: time_func = time.time class BadConfException(Exception): pass
def test_complex_config(aggregator, spin_up_mysql):
    """Run the check with the complex config against the primary and verify
    service checks, the standard metric set, custom query metrics and
    optional metrics."""
    mysql_check = MySql(common.CHECK_NAME, {}, {},
                        instances=[common_config.MYSQL_COMPLEX_CONFIG])
    mysql_check.check(common_config.MYSQL_COMPLEX_CONFIG)

    # Test service check
    aggregator.assert_service_check('mysql.can_connect', status=MySql.OK,
                                    tags=tags.SC_TAGS, count=1)
    aggregator.assert_service_check('mysql.replication.slave_running', status=MySql.OK,
                                    tags=tags.SC_TAGS, at_least=1)

    # Server version as an int tuple, e.g. (5, 6, 0), used to gate
    # version-dependent metrics below.
    ver = map(lambda x: int(x), mysql_check.mysql_version[mysql_check._get_host_key()])
    ver = tuple(ver)

    testable_metrics = (variables.STATUS_VARS + variables.VARIABLES_VARS +
                        variables.INNODB_VARS + variables.BINLOG_VARS +
                        variables.SYSTEM_METRICS + variables.SCHEMA_VARS +
                        variables.SYNTHETIC_VARS)

    # performance_schema metrics only exist on MySQL >= 5.6 (not MariaDB)
    if ver >= (5, 6, 0) and environ.get('MYSQL_FLAVOR') != 'mariadb':
        testable_metrics.extend(variables.PERFORMANCE_VARS)

    # Test metrics
    for mname in testable_metrics:
        # These two are currently not guaranteed outside of a Linux
        # environment.
        if mname == 'mysql.performance.user_time' and not Platform.is_linux():
            continue
        if mname == 'mysql.performance.kernel_time' and not Platform.is_linux():
            continue
        if mname == 'mysql.performance.cpu_time' and Platform.is_windows():
            continue

        if mname == 'mysql.performance.query_run_time.avg':
            aggregator.assert_metric(mname, tags=tags.METRIC_TAGS+['schema:testdb'], count=1)
            aggregator.assert_metric(mname, tags=tags.METRIC_TAGS+['schema:mysql'], count=1)
        elif mname == 'mysql.info.schema.size':
            aggregator.assert_metric(mname, tags=tags.METRIC_TAGS+['schema:testdb'], count=1)
            aggregator.assert_metric(mname, tags=tags.METRIC_TAGS+['schema:information_schema'], count=1)
            aggregator.assert_metric(mname, tags=tags.METRIC_TAGS+['schema:performance_schema'], count=1)
        else:
            aggregator.assert_metric(mname, tags=tags.METRIC_TAGS, at_least=0)

    # TODO: test this if it is implemented
    # Assert service metadata
    # version_metadata = mysql_check.service_metadata['version']
    # assert len(version_metadata) == 1

    # test custom query metrics
    aggregator.assert_metric('alice.age', value=25)
    aggregator.assert_metric('bob.age', value=20)

    # test optional metrics
    optional_metrics = (variables.OPTIONAL_REPLICATION_METRICS + variables.OPTIONAL_INNODB_VARS +
                        variables.OPTIONAL_STATUS_VARS + variables.OPTIONAL_STATUS_VARS_5_6_6)
    _test_optional_metrics(aggregator, optional_metrics, 1)

    # Raises when coverage < 100%
    aggregator.assert_all_metrics_covered()