def check(self, instance):
    # Not configured? Not a problem.
    if instance.get("varnishstat", None) is None:
        raise Exception("varnishstat is not configured")
    tags = instance.get('tags', [])
    if tags is None:
        tags = []
    else:
        tags = list(set(tags))
    varnishstat_path = instance.get("varnishstat")
    name = instance.get('name')

    # Get version and version-specific args from varnishstat -V.
    version, use_xml = self._get_version_info(varnishstat_path)

    # Parse metrics from varnishstat.
    arg = '-x' if use_xml else '-1'
    cmd = [varnishstat_path, arg]
    if name is not None:
        cmd.extend(['-n', name])
        tags += [u'varnish_name:%s' % name]
    else:
        tags += [u'varnish_name:default']
    output, _, _ = get_subprocess_output(cmd, self.log)
    self._parse_varnishstat(output, use_xml, tags)

    # Parse service checks from varnishadm.
    varnishadm_path = instance.get('varnishadm')
    if varnishadm_path:
        secretfile_path = instance.get('secretfile', '/etc/varnish/secret')
        cmd = []
        if geteuid() != 0:
            cmd.append('sudo')
        if version < LooseVersion('4.1.0'):
            cmd.extend([varnishadm_path, '-S', secretfile_path, 'debug.health'])
        else:
            cmd.extend([varnishadm_path, '-S', secretfile_path, 'backend.list', '-p'])
        try:
            output, err, _ = get_subprocess_output(cmd, self.log)
        except OSError as e:
            self.log.error("There was an error running varnishadm. Make sure 'sudo' is available. %s", e)
            output = None
        if err:
            self.log.error('Error getting service check from varnishadm: %s', err)
        if output:
            self._parse_varnishadm(output)
def _collect_raw(self, ceph_cmd, instance):
    use_sudo = _is_affirmative(instance.get('use_sudo', False))
    ceph_args = []
    if use_sudo:
        test_sudo = os.system('setsid sudo -l < /dev/null')
        if test_sudo != 0:
            raise Exception('The dd-agent user does not have sudo access')
        ceph_args = ['sudo', ceph_cmd]
    else:
        ceph_args = [ceph_cmd]

    args = ceph_args + ['version']
    try:
        output, _, _ = get_subprocess_output(args, self.log)
    except Exception as e:
        raise Exception('Unable to run cmd=%s: %s' % (' '.join(args), str(e)))

    raw = {}
    for cmd in ('mon_status', 'status', 'df detail', 'osd pool stats', 'osd perf', 'health detail'):
        try:
            args = ceph_args + cmd.split() + ['-fjson']
            output, _, _ = get_subprocess_output(args, self.log)
            res = json.loads(output)
        except Exception as e:
            self.log.warning('Unable to parse data from cmd=%s: %s' % (cmd, str(e)))
            continue
        name = cmd.replace(' ', '_')
        raw[name] = res
    return raw
def _collect_raw(self, ceph_cmd, instance):
    use_sudo = _is_affirmative(instance.get('use_sudo', False))
    ceph_args = []
    if use_sudo:
        test_sudo = os.system('setsid sudo -l < /dev/null')
        if test_sudo != 0:
            raise Exception('The dd-agent user does not have sudo access')
        ceph_args = ['sudo', ceph_cmd]
    else:
        ceph_args = [ceph_cmd]

    args = ceph_args + ['version']
    try:
        output, _, _ = get_subprocess_output(args, self.log)
    except Exception as e:
        raise Exception('Unable to run cmd=%s: %s' % (' '.join(args), str(e)))

    raw = {}
    for cmd in ('mon_status', 'status', 'df detail', 'osd pool stats', 'osd perf'):
        try:
            args = ceph_args + cmd.split() + ['-fjson']
            output, _, _ = get_subprocess_output(args, self.log)
            res = json.loads(output)
        except Exception as e:
            self.log.warning('Unable to parse data from cmd=%s: %s' % (cmd, str(e)))
            continue
        name = cmd.replace(' ', '_')
        raw[name] = res
    return raw
def _check_solaris(self, instance):
    # Can't get bytes sent and received via netstat
    # Default to kstat -p link:0:
    try:
        netstat, _, _ = get_subprocess_output(["kstat", "-p", "link:0:"], self.log)
        metrics_by_interface = self._parse_solaris_netstat(netstat)
        for interface, metrics in metrics_by_interface.iteritems():
            self._submit_devicemetrics(interface, metrics)
    except SubprocessOutputEmptyError:
        self.log.exception("Error collecting kstat stats.")

    try:
        netstat, _, _ = get_subprocess_output(["netstat", "-s", "-P", "tcp"], self.log)
        # TCP: tcpRtoAlgorithm  =     4  tcpRtoMin        =   200
        #      tcpRtoMax        = 60000  tcpMaxConn       =    -1
        #      tcpActiveOpens   =    57  tcpPassiveOpens  =    50
        #      tcpAttemptFails  =     1  tcpEstabResets   =     0
        #      tcpCurrEstab     =     0  tcpOutSegs       =   254
        #      tcpOutDataSegs   =   995  tcpOutDataBytes  = 1216733
        #      tcpRetransSegs   =     0  tcpRetransBytes  =     0
        #      tcpOutAck        =   185  tcpOutAckDelayed =     4
        # ...
        self._submit_regexed_values(netstat, SOLARIS_TCP_METRICS)
    except SubprocessOutputEmptyError:
        self.log.exception("Error collecting TCP stats.")
def _check_bsd(self, instance):
    netstat_flags = ['-i', '-b']
    if Platform.is_freebsd():
        netstat_flags.append('-W')

    try:
        output, _, _ = get_subprocess_output(["netstat"] + netstat_flags, self.log)
        lines = output.splitlines()

        headers = lines[0].split()
        for h in ("Ipkts", "Ierrs", "Ibytes", "Opkts", "Oerrs", "Obytes", "Coll"):
            if h not in headers:
                self.log.error("%s not found in %s; cannot parse" % (h, headers))
                return False

        current = None
        for l in lines[1:]:
            if "Name" in l:
                break
            x = l.split()
            if len(x) == 0:
                break
            iface = x[0]
            if iface.endswith("*"):
                iface = iface[:-1]
            if iface == current:
                continue
            else:
                current = iface

            if self._parse_value(x[-5]) or self._parse_value(x[-2]):
                iface = current
                metrics = {
                    'bytes_rcvd': self._parse_value(x[-5]),
                    'bytes_sent': self._parse_value(x[-2]),
                    'packets_in.count': self._parse_value(x[-7]),
                    'packets_in.error': self._parse_value(x[-6]),
                    'packets_out.count': self._parse_value(x[-4]),
                    'packets_out.error': self._parse_value(x[-3]),
                }
                self._submit_devicemetrics(iface, metrics)
    except SubprocessOutputEmptyError:
        self.log.exception("Error collecting connection stats.")

    try:
        netstat, _, _ = get_subprocess_output(["netstat", "-s", "-p", "tcp"], self.log)
        self._submit_regexed_values(netstat, BSD_TCP_METRICS)
    except SubprocessOutputEmptyError:
        self.log.exception("Error collecting TCP stats.")
def _get_nsd_control_stats(self, tags, metrics):
    output = None
    if os.geteuid() == 0:
        # dd-agent is running as root (not recommended)
        output, _, _ = get_subprocess_output(['nsd-control', 'stats'], self.log, False)
    else:
        # can dd-agent user run sudo?
        test_sudo = os.system('setsid sudo -l < /dev/null')
        if test_sudo == 0:
            output, _, _ = get_subprocess_output(['sudo', 'nsd-control', 'stats'], self.log, False)
        else:
            raise Exception('The dd-agent user does not have sudo access')

    for metric in re.findall(r'(\S+)=(.*\d)', output):
        if len(metrics) == 0 or metric[0] in metrics:
            self.log.debug('nsd.{}:{}'.format(metric[0], metric[1]))
            if 'num.' in metric[0]:
                self.rate(METRIC_PREFIX + metric[0], float(metric[1]), tags=tags)
            else:
                self.gauge(METRIC_PREFIX + metric[0], float(metric[1]), tags=tags)
def get_system_stats():
    systemStats = {
        'machine': platform.machine(),
        'platform': sys.platform,
        'processor': platform.processor(),
        'pythonV': platform.python_version(),
    }

    platf = sys.platform

    try:
        if Platform.is_linux(platf):
            output, _, _ = get_subprocess_output(['grep', 'model name', '/proc/cpuinfo'], log)
            systemStats['cpuCores'] = len(output.splitlines())

        if Platform.is_darwin(platf) or Platform.is_freebsd(platf):
            output, _, _ = get_subprocess_output(['sysctl', 'hw.ncpu'], log)
            systemStats['cpuCores'] = int(output.split(': ')[1])
    except SubprocessOutputEmptyError as e:
        log.warning("unable to retrieve number of cpuCores. Failed with error %s", e)

    if Platform.is_linux(platf):
        systemStats['nixV'] = platform.dist()
    elif Platform.is_darwin(platf):
        systemStats['macV'] = platform.mac_ver()
    elif Platform.is_freebsd(platf):
        version = platform.uname()[2]
        systemStats['fbsdV'] = ('freebsd', version, '')  # no codename for FreeBSD
    elif Platform.is_win32(platf):
        systemStats['winV'] = platform.win32_ver()

    return systemStats
def _get_postqueue_stats(self, postfix_config_dir, tags):
    # get some interesting configuration values from postconf
    pc_output, _, _ = get_subprocess_output(['postconf', 'mail_version'], self.log, False)
    postfix_version = pc_output.strip('\n').split('=')[1].strip()

    pc_output, _, _ = get_subprocess_output(['postconf', 'authorized_mailq_users'], self.log, False)
    authorized_mailq_users = pc_output.strip('\n').split('=')[1].strip()
    self.log.debug('authorized_mailq_users : {}'.format(authorized_mailq_users))

    output, _, _ = get_subprocess_output(['postqueue', '-c', postfix_config_dir, '-p'], self.log, False)

    active_count = 0
    hold_count = 0
    deferred_count = 0

    # postqueue -p sample output
    '''
    root@postfix:/opt/datadog-agent/agent/checks.d# postqueue -p
    ----Queue ID----- --Size-- ---Arrival Time---- --Sender/Recipient------
    3xWyLP6Nmfz23fk       367 Tue Aug 15 16:17:33  [email protected]
                     (deferred transport)
                                                   [email protected]
    3xWyD86NwZz23ff!      358 Tue Aug 15 16:12:08  [email protected]
                     (deferred transport)
                                                   [email protected]
    -- 1 Kbytes in 2 Requests.
    '''
    for line in output.splitlines():
        if '*' in line:
            active_count += 1
            continue
        if '!' in line:
            hold_count += 1
            continue
        if line[0:1].isdigit():
            deferred_count += 1

    self.log.debug('Postfix Version: %s' % postfix_version)

    self.gauge('postfix.queue.size', active_count,
               tags=tags + ['queue:active', 'instance:{}'.format(postfix_config_dir)])
    self.gauge('postfix.queue.size', hold_count,
               tags=tags + ['queue:hold', 'instance:{}'.format(postfix_config_dir)])
    self.gauge('postfix.queue.size', deferred_count,
               tags=tags + ['queue:deferred', 'instance:{}'.format(postfix_config_dir)])
def check(self, instance):
    if instance.get("varnishstat", None) is None:
        raise Exception("varnishstat is not configured")
    tags = instance.get('tags', [])
    if tags is None:
        tags = []
    else:
        tags = list(set(tags))
    varnishstat_path = instance.get("varnishstat")
    name = instance.get('name')

    version, use_xml = self._get_version_info(varnishstat_path)

    arg = '-x' if use_xml else '-1'
    cmd = [varnishstat_path, arg]
    if name is not None:
        cmd.extend(['-n', name])
        tags += [u'varnish_name:%s' % name]
    else:
        tags += [u'varnish_name:default']
    output, _, _ = get_subprocess_output(cmd, self.log)
    self._parse_varnishstat(output, use_xml, tags)

    varnishadm_path = instance.get('varnishadm')
    if varnishadm_path:
        secretfile_path = instance.get('secretfile', '/etc/varnish/secret')
        cmd = ['sudo', varnishadm_path, '-S', secretfile_path, 'debug.health']
        output, _, _ = get_subprocess_output(cmd, self.log)
        if output:
            self._parse_varnishadm(output)
def check(self, agentConfig):
    io = {}
    try:
        if Platform.is_linux():
            stdout, _, _ = get_subprocess_output(['iostat', '-d', '1', '2', '-x', '-k'], self.logger)
            io.update(self._parse_linux2(stdout))
        elif sys.platform == "sunos5":
            output, _, _ = get_subprocess_output(["iostat", "-x", "-d", "1", "2"], self.logger)
            iostat = output.splitlines()

            lines = [l for l in iostat if len(l) > 0]
            lines = lines[len(lines) / 2:]

            assert "extended device statistics" in lines[0]
            headers = lines[1].split()
            assert "device" in headers
            for l in lines[2:]:
                cols = l.split()
                io[cols[0]] = {}
                for i in range(1, len(cols)):
                    io[cols[0]][self.xlate(headers[i], "sunos")] = cols[i]
        elif sys.platform.startswith("freebsd"):
            output, _, _ = get_subprocess_output(["iostat", "-x", "-d", "1", "2"], self.logger)
            iostat = output.splitlines()

            lines = [l for l in iostat if len(l) > 0]
            lines = lines[len(lines) / 2:]

            assert "extended device statistics" in lines[0]
            headers = lines[1].split()
            assert "device" in headers
            for l in lines[2:]:
                cols = l.split()
                io[cols[0]] = {}
                for i in range(1, len(cols)):
                    io[cols[0]][self.xlate(headers[i], "freebsd")] = cols[i]
        elif sys.platform == 'darwin':
            iostat, _, _ = get_subprocess_output(['iostat', '-d', '-c', '2', '-w', '1'], self.logger)
            io = self._parse_darwin(iostat)
        else:
            return False

        device_blacklist_re = agentConfig.get('device_blacklist_re', None)
        if device_blacklist_re:
            filtered_io = {}
            for device, stats in io.iteritems():
                if not device_blacklist_re.match(device):
                    filtered_io[device] = stats
        else:
            filtered_io = io
        return filtered_io
    except Exception:
        self.logger.exception("Cannot extract IO statistics")
        return False
class Ceph(AgentCheck):
    DEFAULT_CEPH_CMD = '/usr/bin/ceph'
    DEFAULT_CEPH_CLUSTER = 'ceph'
    NAMESPACE = "openstack.ceph"

    def _collect_raw(self, ceph_cmd, ceph_cluster, instance):
        use_sudo = _is_affirmative(instance.get('use_sudo', False))
        ceph_args = []
        if use_sudo:
            # os.system returns the shell exit status: 0 means sudo is allowed
            test_sudo = os.system('setsid sudo -l < /dev/null')
            if test_sudo != 0:
                raise Exception('The monitor-agent user does not have sudo access')
            ceph_args = ['sudo', ceph_cmd]
        else:
            ceph_args = [ceph_cmd]
        ceph_args += ["--cluster", ceph_cluster]

        args = ceph_args + ['version']
        try:
            output, _, _ = get_subprocess_output(args, self.log)
        except Exception as e:
            raise Exception('Unable to run cmd=%s: %s' % (' '.join(args), str(e)))

        raw = {}
        for cmd in ('mon_status', 'status', 'df detail', 'osd stat', 'osd pool stats', 'health', 'mds stat'):
            try:
                args = ceph_args + cmd.split() + ['-fjson']
                output, _, _ = get_subprocess_output(args, self.log)
                res = json.loads(output)
            except Exception as e:
                self.log.warning('Unable to parse data from cmd=%s: %s' % (cmd, str(e)))
                continue
            name = cmd.replace(' ', '_')
            raw[name] = res
        return raw
def check(self, instance):
    stat_out, err, _ = get_subprocess_output(self.nfs_cmd, self.log)
    all_devices = []
    this_device = []
    custom_tags = instance.get("tags", [])

    for l in stat_out.splitlines():
        if not l:
            continue
        elif l.find('mounted on') >= 0 and len(this_device) > 0:
            # if it's a new device, create the device and add it to the array
            device = Device(this_device, self.log)
            all_devices.append(device)
            this_device = []
        this_device.append(l.strip().split())

    # Add the last device into the array
    device = Device(this_device, self.log)
    all_devices.append(device)

    # Disregard the first half of device stats (report 1 of 2)
    # as that is the moving average
    all_devices = all_devices[len(all_devices) // 2:]

    for device in all_devices:
        device.send_metrics(self.gauge, custom_tags)
def journalctl_entries(self, args):
    out, err, exitCode = get_subprocess_output(
        [
            'journalctl',
            # One JSON object per line per entry.
            '-o', 'json',
            # No reason to look at non-system logs.
            '--system',
            # Kernel logs.
            '_TRANSPORT=kernel',
            # At the "error" level.
            'PRIORITY=3'
        ] + args,
        self.log)
    if exitCode != 0:
        self.log.error('journalctl failed, code {0}: {1}'.format(exitCode, err))
        self.increment('oom.errors.je.failure')
        return []
    try:
        return [json.loads(line) for line in out.splitlines()]
    except ValueError:
        self.log.exception('json parsing failed')
        self.increment('oom.errors.je.jsonfail')
        return []
def _get_version_info(self, varnishstat_path):
    output, error, _ = get_subprocess_output([varnishstat_path, "-V"], self.log)

    use_xml = True
    version = 3

    m1 = re.search(r"varnish-(\d+)", output, re.MULTILINE)
    m2 = re.search(r"varnish-(\d+)", error, re.MULTILINE)

    if m1 is None and m2 is None:
        self.log.warn("Cannot determine the version of varnishstat, assuming 3 or greater")
        self.warning("Cannot determine the version of varnishstat, assuming 3 or greater")
    else:
        if m1 is not None:
            version = int(m1.group(1))
        elif m2 is not None:
            version = int(m2.group(1))

    self.log.debug("Varnish version: %d" % version)

    if version <= 2:
        use_xml = False

    return version, use_xml
def check(self, agentConfig):
    process_exclude_args = agentConfig.get('exclude_process_args', False)
    if process_exclude_args:
        ps_arg = 'aux'
    else:
        ps_arg = 'auxww'

    # Get output from ps
    try:
        output, _, _ = get_subprocess_output(['ps', ps_arg], self.logger)
        processLines = output.splitlines()  # Also removes a trailing empty line
        del processLines[0]  # Removes the headers
    except Exception:
        self.logger.exception('getProcesses')
        return False

    processes = []
    for line in processLines:
        line = line.split(None, 10)
        processes.append(map(lambda s: s.strip(), line))

    return {'processes': processes,
            'apiKey': agentConfig['api_key'],
            'host': get_hostname(agentConfig)}
def _get_version_info(self, varnishstat_path):
    # Get the varnish version from varnishstat
    output, error, _ = get_subprocess_output([varnishstat_path, "-V"], self.log,
                                             raise_on_empty_output=False)

    # Assumptions regarding varnish's version
    use_xml = True
    version = 3

    m1 = re.search(r"varnish-(\d+)", output, re.MULTILINE)
    # v2 prints the version on stderr, v3 on stdout
    m2 = re.search(r"varnish-(\d+)", error, re.MULTILINE)

    if m1 is None and m2 is None:
        self.log.warn("Cannot determine the version of varnishstat, assuming 3 or greater")
        self.warning("Cannot determine the version of varnishstat, assuming 3 or greater")
    else:
        if m1 is not None:
            version = int(m1.group(1))
        elif m2 is not None:
            version = int(m2.group(1))

    self.log.debug("Varnish version: %d" % version)

    # varnishstat XML output is only available from v3 on
    if version <= 2:
        use_xml = False

    return version, use_xml
def check(self, agentConfig):
    process_exclude_args = agentConfig.get('exclude_process_args', False)
    if process_exclude_args:
        ps_arg = 'aux'
    else:
        ps_arg = 'auxww'

    # Get output from ps
    try:
        output, _, _ = get_subprocess_output(['ps', ps_arg], self.logger)
        processLines = output.splitlines()  # Also removes a trailing empty line
    except StandardError:
        self.logger.exception('getProcesses')
        return False

    del processLines[0]  # Removes the headers

    processes = []
    for line in processLines:
        line = line.split(None, 10)
        processes.append(map(lambda s: s.strip(), line))

    return {
        'processes': processes,
        'apiKey': agentConfig['api_key'],
        'host': get_hostname(agentConfig)
    }
def _get_version_info(self, varnishstat_path):
    # Get the varnish version from varnishstat
    output, error, _ = get_subprocess_output(varnishstat_path + ["-V"], self.log,
                                             raise_on_empty_output=False)

    # Assumptions regarding varnish's version
    use_xml = True
    version = LooseVersion('3.0.0')

    m1 = self.version_pattern.search(output)
    # v2 prints the version on stderr, v3 on stdout
    m2 = self.version_pattern.search(error)

    if m1 is None and m2 is None:
        self.log.warn("Cannot determine the version of varnishstat, assuming 3 or greater")
        self.warning("Cannot determine the version of varnishstat, assuming 3 or greater")
    else:
        if m1 is not None:
            version = LooseVersion(m1.group())
        elif m2 is not None:
            version = LooseVersion(m2.group())

    self.log.debug("Varnish version: %s", version)

    # varnishstat XML output is only available from v3 on
    if version < LooseVersion('3.0.0'):
        use_xml = False

    return version, use_xml
def check(self, agentConfig):
    if Platform.is_linux():
        try:
            with open('/proc/loadavg', 'r') as load_avg:
                uptime = load_avg.readline().strip()
        except Exception:
            self.logger.exception('Cannot extract load')
            return False
    elif sys.platform in ('darwin', 'sunos5') or sys.platform.startswith("freebsd"):
        try:
            uptime, _, _ = get_subprocess_output(['uptime'], self.logger)
        except Exception:
            self.logger.exception('Cannot extract load')
            return False

    load = [res.replace(',', '.') for res in re.findall(r'([0-9]+[\.,]\d+)', uptime)]

    try:
        cores = int(agentConfig.get('system_stats').get('cpuCores'))
        assert cores >= 1, "Cannot determine number of cores"
        return {'system.load.1': float(load[0]),
                'system.load.5': float(load[1]),
                'system.load.15': float(load[2]),
                'system.load.norm.1': float(load[0]) / cores,
                'system.load.norm.5': float(load[1]) / cores,
                'system.load.norm.15': float(load[2]) / cores,
                }
    except Exception:
        return {'system.load.1': float(load[0]),
                'system.load.5': float(load[1]),
                'system.load.15': float(load[2])}
def _get_proc_list(self):
    # Get output from ps
    try:
        process_exclude_args = self.config.get('exclude_process_args', False)
        if process_exclude_args:
            ps_arg = 'aux'
        else:
            ps_arg = 'auxww'
        output, _, _ = get_subprocess_output(['ps', ps_arg], self.log)
        processLines = output.splitlines()  # Also removes a trailing empty line
    except Exception:
        self.log.exception('Cannot get process list')
        raise

    # del processLines[0]  # Removes the headers
    processes = []
    for line in processLines:
        line = line.split(None, 10)
        processes.append(map(lambda s: s.strip(), line))
    return processes
def _get_queue_count(self, directory, queues, tags):
    for queue in queues:
        queue_path = os.path.join(directory, queue)
        if not os.path.exists(queue_path):
            raise Exception('%s does not exist' % queue_path)

        count = 0
        if os.geteuid() == 0:
            # dd-agent is running as root (not recommended)
            count = sum(len(files) for root, dirs, files in os.walk(queue_path))
        else:
            # can dd-agent user run sudo?
            test_sudo = os.system('setsid sudo -l < /dev/null')
            if test_sudo == 0:
                output, _, _ = get_subprocess_output(['sudo', 'find', queue_path, '-type', 'f'], self.log)
                count = len(output.splitlines())
            else:
                raise Exception('The dd-agent user does not have sudo access')

        # emit an individually tagged metric
        self.gauge('postfix.queue.size', count,
                   tags=tags + ['queue:%s' % queue,
                                'instance:%s' % os.path.basename(directory)])
def _get_version_info(self, varnishstat_path):
    # Get the varnish version from varnishstat
    output, error, _ = get_subprocess_output([varnishstat_path, "-V"], self.log)

    # Assumptions regarding varnish's version
    use_xml = True
    version = 3

    m1 = re.search(r"varnish-(\d+)", output, re.MULTILINE)
    # v2 prints the version on stderr, v3 on stdout
    m2 = re.search(r"varnish-(\d+)", error, re.MULTILINE)

    if m1 is None and m2 is None:
        self.log.warn("Cannot determine the version of varnishstat, assuming 3 or greater")
        self.warning("Cannot determine the version of varnishstat, assuming 3 or greater")
    else:
        if m1 is not None:
            version = int(m1.group(1))
        elif m2 is not None:
            version = int(m2.group(1))

    self.log.debug("Varnish version: %d" % version)

    # varnishstat XML output is only available from v3 on
    if version <= 2:
        use_xml = False

    return version, use_xml
def _get_queue_count(self, directory, queues, tags):
    for queue in queues:
        queue_path = os.path.join(directory, queue)
        if not os.path.exists(queue_path):
            raise Exception('{} does not exist'.format(queue_path))

        count = 0
        if os.geteuid() == 0:
            # dd-agent is running as root (not recommended)
            count = sum(len(files) for root, dirs, files in os.walk(queue_path))
        else:
            # can dd-agent user run sudo?
            test_sudo = os.system('setsid sudo -l < /dev/null')
            if test_sudo == 0:
                # default to `root` for backward compatibility
                postfix_user = self.init_config.get('postfix_user', 'root')
                output, _, _ = get_subprocess_output(
                    ['sudo', '-u', postfix_user, 'find', queue_path, '-type', 'f'],
                    self.log, False)
                count = len(output.splitlines())
            else:
                raise Exception('The dd-agent user does not have sudo access')

        # emit an individually tagged metric
        self.gauge('postfix.queue.size', count,
                   tags=tags + ['queue:{}'.format(queue),
                                'instance:{}'.format(os.path.basename(directory))])
def check(self, instance):
    tags = instance.get('tags', [])
    state_counts = defaultdict(int)
    prio_counts = defaultdict(int)

    proc_location = self.agentConfig.get('procfs_path', '/proc').rstrip('/')
    proc_path_map = {
        "inode_info": "sys/fs/inode-nr",
        "stat_info": "stat",
        "entropy_info": "sys/kernel/random/entropy_avail",
    }
    for key, path in proc_path_map.iteritems():
        proc_path_map[key] = "{procfs}/{path}".format(procfs=proc_location, path=path)

    with open(proc_path_map['inode_info'], 'r') as inode_info:
        inode_stats = inode_info.readline().split()
        self.gauge('system.inodes.total', float(inode_stats[0]), tags=tags)
        self.gauge('system.inodes.used', float(inode_stats[1]), tags=tags)

    with open(proc_path_map['stat_info'], 'r') as stat_info:
        lines = [line.strip() for line in stat_info.readlines()]
        for line in lines:
            if line.startswith('ctxt'):
                ctxt_count = float(line.split(' ')[1])
                self.monotonic_count('system.linux.context_switches', ctxt_count, tags=tags)
            elif line.startswith('processes'):
                process_count = int(line.split(' ')[1])
                self.monotonic_count('system.linux.processes_created', process_count, tags=tags)
            elif line.startswith('intr'):
                interrupts = int(line.split(' ')[1])
                self.monotonic_count('system.linux.interrupts', interrupts, tags=tags)

    with open(proc_path_map['entropy_info'], 'r') as entropy_info:
        entropy = entropy_info.readline()
        self.gauge('system.entropy.available', float(entropy), tags=tags)

    ps, _, _ = get_subprocess_output(['ps', '--no-header', '-eo', 'stat'], self.log)
    for line in ps.splitlines():
        # Each process state is a flag in a list of characters. See ps(1) for details.
        for flag in line:
            if flag in PROCESS_STATES:
                state_counts[PROCESS_STATES[flag]] += 1
            elif flag in PROCESS_PRIOS:
                prio_counts[PROCESS_PRIOS[flag]] += 1

    for state in state_counts:
        state_tags = list(tags)
        state_tags.append("state:" + state)
        self.gauge('system.processes.states', float(state_counts[state]), state_tags)

    for prio in prio_counts:
        prio_tags = list(tags)
        prio_tags.append("priority:" + prio)
        self.gauge('system.processes.priorities', float(prio_counts[prio]), prio_tags)
def _get_hostname_unix():
    try:
        # try fqdn
        out, _, rtcode = get_subprocess_output(['/bin/hostname', '-f'], log)
        if rtcode == 0:
            return out.strip()
    except Exception:
        return None
def _check_solaris(self, instance):
    try:
        netstat, _, _ = get_subprocess_output(["kstat", "-p", "link:0:"], self.log)
        metrics_by_interface = self._parse_solaris_netstat(netstat)
        for interface, metrics in metrics_by_interface.iteritems():
            self._submit_devicemetrics(interface, metrics)
    except SubprocessOutputEmptyError:
        self.log.exception("Error collecting kstat stats.")

    try:
        netstat, _, _ = get_subprocess_output(["netstat", "-s", "-P", "tcp"], self.log)
        self._submit_regexed_values(netstat, SOLARIS_TCP_METRICS)
    except SubprocessOutputEmptyError:
        self.log.exception("Error collecting TCP stats.")
def collect_metrics_manually(self):
    df_out, _, _ = get_subprocess_output(self.DF_COMMAND + ["-k"], self.log)
    self.log.debug(df_out)
    for device in self._list_devices(df_out):
        self.log.debug("Passed: {0}".format(device))
        tags = [device[1]] if self._tag_by_filesystem else []
        device_name = device[-1] if self._use_mount else device[0]
        for metric_name, value in self._collect_metrics_manually(device).iteritems():
            self.gauge(metric_name, value, tags=tags, device_name=device_name)
def check(self, instance):
    tags = instance.get('tags', [])
    state_counts = defaultdict(int)
    prio_counts = defaultdict(int)

    with open('/proc/sys/fs/inode-nr', 'r') as inode_info:
        inode_stats = inode_info.readline().split()
        self.gauge('system.inodes.total', float(inode_stats[0]), tags=tags)
        self.gauge('system.inodes.used', float(inode_stats[1]), tags=tags)

    with open('/proc/stat', 'r') as stat_info:
        lines = [line.strip() for line in stat_info.readlines()]
        for line in lines:
            if line.startswith('ctxt'):
                ctxt_count = float(line.split(' ')[1])
                self.monotonic_count('system.linux.context_switches', ctxt_count, tags=tags)
            elif line.startswith('processes'):
                process_count = int(line.split(' ')[1])
                self.monotonic_count('system.linux.processes_created', process_count, tags=tags)
            elif line.startswith('intr'):
                interrupts = int(line.split(' ')[1])
                self.monotonic_count('system.linux.interrupts', interrupts, tags=tags)

    with open('/proc/sys/kernel/random/entropy_avail') as entropy_info:
        entropy = entropy_info.readline()
        self.gauge('system.entropy.available', float(entropy), tags=tags)

    ps, _, _ = get_subprocess_output(['ps', '--no-header', '-eo', 'stat'], self.log)
    for line in ps.splitlines():
        # Each process state is a flag in a list of characters. See ps(1) for details.
        for flag in line:
            if flag in PROCESS_STATES:
                state_counts[PROCESS_STATES[flag]] += 1
            elif flag in PROCESS_PRIOS:
                prio_counts[PROCESS_PRIOS[flag]] += 1

    for state in state_counts:
        state_tags = list(tags)
        state_tags.append("state:" + state)
        self.gauge('system.processes.states', float(state_counts[state]), state_tags)

    for prio in prio_counts:
        prio_tags = list(tags)
        prio_tags.append("priority:" + prio)
        self.gauge('system.processes.priorities', float(prio_counts[prio]), prio_tags)
def check(self, instance):
    # Not configured? Not a problem.
    if instance.get("varnishstat", None) is None:
        raise Exception("varnishstat is not configured")
    tags = instance.get('tags', [])
    if tags is None:
        tags = []
    else:
        tags = list(set(tags))
    varnishstat_path = instance.get("varnishstat")
    name = instance.get('name')

    # Get version and version-specific args from varnishstat -V.
    version, use_xml = self._get_version_info(varnishstat_path)

    # Parse metrics from varnishstat.
    arg = '-x' if use_xml else '-1'
    cmd = [varnishstat_path, arg]
    if name is not None:
        cmd.extend(['-n', name])
        tags += [u'varnish_name:%s' % name]
    else:
        tags += [u'varnish_name:default']
    output, _, _ = get_subprocess_output(cmd, self.log)
    self._parse_varnishstat(output, use_xml, tags)

    # Parse service checks from varnishadm.
    varnishadm_path = instance.get('varnishadm')
    if varnishadm_path:
        # TODO: "debug.health" has been removed since varnish 4.1+. We
        # should check the version and use "backend.list -p" instead.
        secretfile_path = instance.get('secretfile', '/etc/varnish/secret')
        cmd = ['sudo', varnishadm_path, '-S', secretfile_path, 'debug.health']
        output, _, _ = get_subprocess_output(cmd, self.log)
        if output:
            self._parse_varnishadm(output)
def check(self, instance):
    # Check the status of Entropy
    if Platform.is_unix():
        try:
            data, _, _ = get_subprocess_output(
                ['sudo', 'cat', '/proc/sys/kernel/random/entropy_avail'],
                self.log, False)
            self.log.debug("Entropy Available: %s", data)
            self.gauge('system.entropy.available', int(data))
        except Exception as e:
            self.log.exception("Failed to collect entropy: {}".format(e))
    else:
        self.log.warning('Plugin currently only available on Linux.')
def _get_proc_list(self):
    # Get output from ps
    try:
        process_exclude_args = self.config.get('exclude_process_args', False)
        if process_exclude_args:
            ps_arg = 'aux'
        else:
            ps_arg = 'auxww'
        output, _, _ = get_subprocess_output(['ps', ps_arg], self.log)
        processLines = output.splitlines()  # Also removes a trailing empty line
    except Exception:
        self.log.exception('Cannot get process list')
        return False

    del processLines[0]  # Removes the headers
    processes = []
    for line in processLines:
        line = line.split(None, 10)
        processes.append(map(lambda s: s.strip(), line))
    return processes
def collect_metrics_manually(self):
    df_out, _, _ = get_subprocess_output(self.DF_COMMAND + ['-k'], self.log)
    self.log.debug(df_out)
    for device in self._list_devices(df_out):
        self.log.debug("Passed: {0}".format(device))
        tags = [device[1], 'filesystem:{}'.format(device[1])] if self._tag_by_filesystem else []
        device_name = device[-1] if self._use_mount else device[0]

        # apply device/mountpoint specific tags
        for regex, device_tags in self._device_tag_re:
            if regex.match(device_name):
                tags += device_tags

        for metric_name, value in self._collect_metrics_manually(device).iteritems():
            self.gauge(metric_name, value, tags=tags, device_name=device_name)
def check(self, instance):
    # Not configured? Not a problem.
    if instance.get("varnishstat", None) is None:
        raise Exception("varnishstat is not configured")
    tags = instance.get('tags', [])
    if tags is None:
        tags = []
    else:
        tags = list(set(tags))
    varnishstat_path = instance.get("varnishstat")
    name = instance.get('name')

    # Get version and version-specific args from varnishstat -V.
    version, use_xml = self._get_version_info(varnishstat_path)

    # Parse metrics from varnishstat.
    arg = '-x' if use_xml else '-1'
    cmd = [varnishstat_path, arg]
    if name is not None:
        cmd.extend(['-n', name])
        tags += [u'varnish_name:%s' % name]
    else:
        tags += [u'varnish_name:default']
    output, _, _ = get_subprocess_output(cmd, self.log)
    self._parse_varnishstat(output, use_xml, tags)

    # Parse service checks from varnishadm.
    varnishadm_path = instance.get('varnishadm')
    if varnishadm_path:
        secretfile_path = instance.get('secretfile', '/etc/varnish/secret')
        cmd = ['sudo', varnishadm_path, '-S', secretfile_path, 'debug.health']
        output, _, _ = get_subprocess_output(cmd, self.log)
        if output:
            self._parse_varnishadm(output)
def get_system_stats():
    systemStats = {
        'machine': platform.machine(),
        'platform': sys.platform,
        'processor': platform.processor(),
        'pythonV': platform.python_version(),
    }

    platf = sys.platform

    if Platform.is_linux(platf):
        output, _, _ = get_subprocess_output(['grep', 'model name', '/proc/cpuinfo'], log)
        systemStats['cpuCores'] = len(output.splitlines())

    if Platform.is_darwin(platf):
        output, _, _ = get_subprocess_output(['sysctl', 'hw.ncpu'], log)
        systemStats['cpuCores'] = int(output.split(': ')[1])

    if Platform.is_freebsd(platf):
        output, _, _ = get_subprocess_output(['sysctl', 'hw.ncpu'], log)
        systemStats['cpuCores'] = int(output.split(': ')[1])

    if Platform.is_linux(platf):
        systemStats['nixV'] = platform.dist()
    elif Platform.is_darwin(platf):
        systemStats['macV'] = platform.mac_ver()
    elif Platform.is_freebsd(platf):
        version = platform.uname()[2]
        systemStats['fbsdV'] = ('freebsd', version, '')  # no codename for FreeBSD
    elif Platform.is_win32(platf):
        systemStats['winV'] = platform.win32_ver()

    return systemStats
def _run_gohai(self, options):
    output = None
    try:
        output, err, _ = get_subprocess_output(["gohai"] + options, log)
        if err:
            log.debug("GOHAI LOG | %s", err)
    except OSError as e:
        if e.errno == 2:
            # file not found, expected when install from source
            log.info("gohai file not found")
        else:
            log.warning("Unexpected OSError when running gohai %s", e)
    except Exception as e:
        log.warning("gohai command failed with error %s", e)
    return output
def _collect_raw(self, ceph_cmd, instance):
    use_sudo = _is_affirmative(instance.get('use_sudo', False))
    ceph_args = []
    if use_sudo:
        test_sudo = os.system('setsid sudo -l < /dev/null')
        if test_sudo != 0:
            raise Exception('The dd-agent user does not have sudo access')
        ceph_args = ['sudo', ceph_cmd]
    else:
        ceph_args = [ceph_cmd]

    args = ceph_args + ['version']
    try:
        output, _, _ = get_subprocess_output(args, self.log)
    except Exception as e:
        raise Exception('Unable to run cmd=%s: %s' % (' '.join(args), str(e)))
def _run_gohai(self, options):
    # Gohai is disabled on Mac for now
    if Platform.is_mac() or not self.agentConfig.get('enable_gohai'):
        return None
    output = None
    try:
        output, err, _ = get_subprocess_output(["gohai"] + options, log)
        if err:
            log.debug("GOHAI LOG | %s", err)
    except OSError as e:
        if e.errno == 2:
            # file not found, expected when install from source
            log.info("gohai file not found")
        else:
            log.warning("Unexpected OSError when running gohai %s", e)
    except Exception as e:
        log.warning("gohai command failed with error %s", e)
    return output
def _run_gohai(self, options):
    output = None
    try:
        if not Platform.is_windows():
            command = "gohai"
        else:
            command = "gohai\\gohai.exe"
        output, err, _ = get_subprocess_output([command] + options, log)
        if err:
            log.warning("GOHAI LOG | {0}".format(err))
    except OSError as e:
        if e.errno == 2:
            # file not found, expected when install from source
            log.info("gohai file not found")
        else:
            log.warning("Unexpected OSError when running gohai %s", e)
    except Exception as e:
        log.warning("gohai command failed with error %s", e)
    return output
def __init__(self, logger):
    Check.__init__(self, logger)

    macV = None
    if sys.platform == 'darwin':
        macV = platform.mac_ver()
        macV_minor_version = int(re.match(r'10\.(\d+)\.?.*', macV[0]).group(1))

    # Output from top is slightly modified on OS X 10.6 (case #28239) and greater
    if macV and (macV_minor_version >= 6):
        self.topIndex = 6
    else:
        self.topIndex = 5

    self.pagesize = 0
    if sys.platform == 'sunos5':
        try:
            pgsz, _, _ = get_subprocess_output(['pagesize'], self.logger)
            self.pagesize = int(pgsz.strip())
        except Exception:
            # No page size available
            pass
def _get_queue_count(self, directory, queues, tags):
    for queue in queues:
        queue_path = os.path.join(directory, queue)
        if not os.path.exists(queue_path):
            raise Exception('%s does not exist' % queue_path)

        count = 0
        if os.geteuid() == 0:
            # dd-agent is running as root (not recommended)
            count = sum(len(files) for root, dirs, files in os.walk(queue_path))
        else:
            # can dd-agent user run sudo?
            test_sudo = os.system('setsid sudo -l < /dev/null')
            if test_sudo == 0:
                output, _, _ = get_subprocess_output(['sudo', 'find', queue_path, '-type', 'f'],
                                                     self.log, False)
                count = len(output.splitlines())
            else:
                raise Exception('The dd-agent user does not have sudo access')

        # emit an individually tagged metric
        self.gauge('postfix.queue.size', count,
                   tags=tags + ['queue:%s' % queue,
                                'instance:%s' % os.path.basename(directory)])
def check(self, agentConfig):
    if Platform.is_linux():
        proc_location = agentConfig.get('procfs_path', '/proc').rstrip('/')
        try:
            proc_loadavg = "{0}/loadavg".format(proc_location)
            with open(proc_loadavg, 'r') as load_avg:
                uptime = load_avg.readline().strip()
        except Exception:
            self.log.exception('Cannot extract load')
            return False
    elif sys.platform in ('darwin', 'sunos5') or sys.platform.startswith("freebsd"):
        # Get output from uptime
        try:
            uptime, _, _ = get_subprocess_output(['uptime'], self.log)
        except Exception:
            self.log.exception('Cannot extract load')
            return False
    else:
        return False

    # Split out the 3 load average values
    load = [res.replace(',', '.') for res in re.findall(r'([0-9]+[\.,]\d+)', uptime)]

    # Normalize load by number of cores
    try:
        cores = int(agentConfig.get('system_stats').get('cpuCores'))
        assert cores >= 1, "Cannot determine number of cores"
        # Compute a normalized load, named .load.norm to make it easy to find next to .load
        return {'system.load.1': float(load[0]),
                'system.load.5': float(load[1]),
                'system.load.15': float(load[2]),
                'system.load.norm.1': float(load[0]) / cores,
                'system.load.norm.5': float(load[1]) / cores,
                'system.load.norm.15': float(load[2]) / cores,
                }
    except Exception:
        # No normalized load available
        return {'system.load.1': float(load[0]),
                'system.load.5': float(load[1]),
                'system.load.15': float(load[2])}
def check(self, agentConfig):
    process_exclude_args = agentConfig.get('exclude_process_args', False)
    if process_exclude_args:
        ps_arg = 'aux'
    else:
        ps_arg = 'auxww'

    # Get output from ps
    try:
        output, _, _ = get_subprocess_output(['ps', ps_arg], self.log)
        processLines = output.splitlines()  # Also removes a trailing empty line
    except StandardError:
        self.log.exception('getProcesses')
        return False

    del processLines[0]  # Removes the headers

    processes = []
    for line in processLines:
        line = line.split(None, 10)
        processes.append(map(lambda s: s.strip(), line))

    return {'processes': processes}
def _get_proc_list(self):
    # Get output from ps
    try:
        process_exclude_args = self.config.get('exclude_process_args', False)
        if process_exclude_args:
            ps_arg = 'aux'
        else:
            ps_arg = 'auxww'
        output, _, _ = get_subprocess_output(['ps', ps_arg], self.log)
        processLines = output.splitlines()  # Also removes a trailing empty line
    except Exception:
        self.log.exception('Cannot get process list')
        raise

    del processLines[0]  # Removes the headers

    processes = []
    for line in processLines:
        line = line.split(None, 10)
        processes.append(map(lambda s: s.strip(), line))
    return processes
def _get_server_pid(self, db):
    pid = None

    # Try to get pid from pid file, it can fail for permission reason
    pid_file = None
    try:
        cursor = db.cursor()
        cursor.execute("SHOW VARIABLES LIKE 'pid_file'")
        pid_file = cursor.fetchone()[1]
        cursor.close()
        del cursor
    except Exception:
        self.warning("Error while fetching pid_file variable of MySQL.")

    if pid_file is not None:
        self.log.debug("pid file: %s" % str(pid_file))
        try:
            f = open(pid_file)
            pid = int(f.readline())
            f.close()
        except IOError:
            self.log.debug("Cannot read mysql pid file %s" % pid_file)

    # If pid has not been found, read it from ps
    if pid is None:
        try:
            if sys.platform.startswith("linux"):
                ps, _, _ = get_subprocess_output(['ps', '-C', 'mysqld', '-o', 'pid'], self.log)
                pslines = ps.strip().splitlines()
                # First line is header, second line is mysql pid
                if len(pslines) == 2:
                    pid = int(pslines[1])
        except Exception:
            self.log.exception("Error while fetching mysql pid from ps")

    return pid
def _check_linux(self, instance):
    if self._collect_cx_state:
        try:
            self.log.debug("Using `ss` to collect connection state")
            # Try using `ss` for increased performance over `netstat`
            for ip_version in ['4', '6']:
                # Call `ss` for each IP version because there's no built-in way of distinguishing
                # between the IP versions in the output
                output, _, _ = get_subprocess_output(
                    ["ss", "-n", "-u", "-t", "-a", "-{0}".format(ip_version)], self.log)
                lines = output.splitlines()
                # Netid State      Recv-Q Send-Q Local Address:Port               Peer Address:Port
                # udp   UNCONN     0      0      127.0.0.1:8125                   *:*
                # udp   ESTAB      0      0      127.0.0.1:37036                  127.0.0.1:8125
                # udp   UNCONN     0      0      fe80::a00:27ff:fe1c:3c4:123      :::*
                # tcp   TIME-WAIT  0      0      90.56.111.177:56867              46.105.75.4:143
                # tcp   LISTEN     0      0      ::ffff:127.0.0.1:33217           ::ffff:127.0.0.1:7199
                # tcp   ESTAB      0      0      ::ffff:127.0.0.1:58975           ::ffff:127.0.0.1:2181
                metrics = self._parse_linux_cx_state(lines[1:], self.TCP_STATES['ss'], 1,
                                                     ip_version=ip_version)
                # Only send the metrics which match the loop iteration's ip version
                for stat, metric in self.CX_STATE_GAUGE.iteritems():
                    if stat[0].endswith(ip_version):
                        self.gauge(metric, metrics.get(metric))
        except OSError:
            self.log.info("`ss` not found: using `netstat` as a fallback")
            output, _, _ = get_subprocess_output(["netstat", "-n", "-u", "-t", "-a"], self.log)
            lines = output.splitlines()
            # Active Internet connections (w/o servers)
            # Proto Recv-Q Send-Q Local Address           Foreign Address         State
            # tcp        0      0 46.105.75.4:80          79.220.227.193:2032     SYN_RECV
            # tcp        0      0 46.105.75.4:143         90.56.111.177:56867     ESTABLISHED
            # tcp        0      0 46.105.75.4:50468       107.20.207.175:443      TIME_WAIT
            # tcp6       0      0 46.105.75.4:80          93.15.237.188:58038     FIN_WAIT2
            # tcp6       0      0 46.105.75.4:80          79.220.227.193:2029     ESTABLISHED
            # udp        0      0 0.0.0.0:123             0.0.0.0:*
            # udp6       0      0 :::41458                :::*
            metrics = self._parse_linux_cx_state(lines[2:], self.TCP_STATES['netstat'], 5)
            for metric, value in metrics.iteritems():
                self.gauge(metric, value)

    proc = open('/proc/net/dev', 'r')
    try:
        lines = proc.readlines()
    finally:
        proc.close()
    # Inter-|   Receive                                                 |  Transmit
    #  face |bytes     packets errs drop fifo frame compressed multicast|bytes      packets errs drop fifo colls carrier compressed
    #     lo:45890956   112797   0    0    0     0          0         0    45890956   112797    0    0    0     0       0          0
    #   eth0:631947052 1042233   0   19    0   184          0      1206  1208625538  1320529    0    0    0     0       0          0
    #   eth1:       0        0   0    0    0     0          0         0           0        0    0    0    0     0       0          0
    for l in lines[2:]:
        cols = l.split(':', 1)
        x = cols[1].split()
        # Filter inactive interfaces
        if self._parse_value(x[0]) or self._parse_value(x[8]):
            iface = cols[0].strip()
            metrics = {
                'bytes_rcvd': self._parse_value(x[0]),
                'bytes_sent': self._parse_value(x[8]),
                'packets_in.count': self._parse_value(x[1]),
                'packets_in.error': self._parse_value(x[2]) + self._parse_value(x[3]),
                'packets_out.count': self._parse_value(x[9]),
                'packets_out.error': self._parse_value(x[10]) + self._parse_value(x[11]),
            }
            self._submit_devicemetrics(iface, metrics)

    try:
        proc = open('/proc/net/snmp', 'r')
        # IP:      Forwarding DefaultTTL InReceives InHdrErrors ...
        # IP:      2          64         377145470  0 ...
        # Icmp:    InMsgs  InErrors  InDestUnreachs  InTimeExcds ...
        # Icmp:    1644495 1238      1643257         0 ...
        # IcmpMsg: InType3 OutType3
        # IcmpMsg: 1643257 1643257
        # Tcp:     RtoAlgorithm RtoMin RtoMax MaxConn ...
        # Tcp:     1            200    120000 -1 ...
        # Udp:     InDatagrams NoPorts InErrors OutDatagrams ...
        # Udp:     24249494    1643257 0        25892947 ...
        # UdpLite: InDatagrams Noports InErrors OutDatagrams ...
        # UdpLite: 0           0       0        0 ...
        try:
            lines = proc.readlines()
        finally:
            proc.close()

        tcp_lines = [line for line in lines if line.startswith('Tcp:')]
        udp_lines = [line for line in lines if line.startswith('Udp:')]

        tcp_column_names = tcp_lines[0].strip().split()
        tcp_values = tcp_lines[1].strip().split()
        tcp_metrics = dict(zip(tcp_column_names, tcp_values))

        udp_column_names = udp_lines[0].strip().split()
        udp_values = udp_lines[1].strip().split()
        udp_metrics = dict(zip(udp_column_names, udp_values))

        # line start indicating what kind of metrics we're looking at
        assert(tcp_metrics['Tcp:'] == 'Tcp:')

        tcp_metrics_name = {
            'RetransSegs': 'system.net.tcp.retrans_segs',
            'InSegs': 'system.net.tcp.in_segs',
            'OutSegs': 'system.net.tcp.out_segs'
        }
        for key, metric in tcp_metrics_name.iteritems():
            self.rate(metric, self._parse_value(tcp_metrics[key]))

        assert(udp_metrics['Udp:'] == 'Udp:')

        udp_metrics_name = {
            'InDatagrams': 'system.net.udp.in_datagrams',
            'NoPorts': 'system.net.udp.no_ports',
            'InErrors': 'system.net.udp.in_errors',
            'OutDatagrams': 'system.net.udp.out_datagrams',
            'RcvbufErrors': 'system.net.udp.rcv_buf_errors',
            'SndbufErrors': 'system.net.udp.snd_buf_errors'
        }
        for key, metric in udp_metrics_name.iteritems():
            if key in udp_metrics:
                self.rate(metric, self._parse_value(udp_metrics[key]))
    except IOError:
        # On Openshift, /proc/net/snmp is only readable by root
        self.log.debug("Unable to read /proc/net/snmp.")
def check(self, agentConfig):
    """Return an aggregate of CPU stats across all CPUs
    When figures are not available, False is sent back.
    """
    def format_results(us, sy, wa, idle, st, guest=None):
        data = {'cpuUser': us, 'cpuSystem': sy, 'cpuWait': wa,
                'cpuIdle': idle, 'cpuStolen': st, 'cpuGuest': guest}
        return dict((k, v) for k, v in data.iteritems() if v is not None)

    def get_value(legend, data, name, filter_value=None):
        "Using the legend and a metric name, get the value or None from the data line"
        if name in legend:
            value = to_float(data[legend.index(name)])
            if filter_value is not None:
                if value > filter_value:
                    return None
            return value
        else:
            # FIXME return a float or False, would trigger type error if not python
            self.logger.debug("Cannot extract cpu value %s from %s (%s)" % (name, data, legend))
            return 0.0

    try:
        if Platform.is_linux():
            output, _, _ = get_subprocess_output(['mpstat', '1', '3'], self.logger)
            mpstat = output.splitlines()
            # topdog@ip:~$ mpstat 1 3
            # Linux 2.6.32-341-ec2 (ip) 01/19/2012 _x86_64_ (2 CPU)
            #
            # 04:22:41 PM  CPU  %usr %nice %sys %iowait %irq %soft %steal %guest %idle
            # 04:22:42 PM  all  0.00  0.00 0.00    0.00 0.00  0.00   0.00   0.00 100.00
            # 04:22:43 PM  all  0.00  0.00 0.00    0.00 0.00  0.00   0.00   0.00 100.00
            # 04:22:44 PM  all  0.00  0.00 0.00    0.00 0.00  0.00   0.00   0.00 100.00
            # Average:     all  0.00  0.00 0.00    0.00 0.00  0.00   0.00   0.00 100.00
            #
            # OR
            #
            # Thanks to Mart Visser for spotting this one.
            # blah:/etc/dd-agent# mpstat
            # Linux 2.6.26-2-xen-amd64 (atira) 02/17/2012 _x86_64_
            #
            # 05:27:03 PM  CPU  %user %nice %sys %iowait %irq %soft %steal %idle intr/s
            # 05:27:03 PM  all   3.59  0.00 0.68    0.69 0.00  0.00   0.01 95.03  43.65
            #
            legend = [l for l in mpstat if "%usr" in l or "%user" in l]
            avg = [l for l in mpstat if "Average" in l]
            if len(legend) == 1 and len(avg) == 1:
                headers = [h for h in legend[0].split() if h not in ("AM", "PM")]
                data = avg[0].split()

                # Userland
                # Debian lenny says %user so we look for both
                # One of them will be 0
                cpu_metrics = {
                    "%usr": None, "%user": None, "%nice": None,
                    "%iowait": None, "%idle": None, "%sys": None,
                    "%irq": None, "%soft": None, "%steal": None,
                    "%guest": None
                }

                for cpu_m in cpu_metrics:
                    cpu_metrics[cpu_m] = get_value(headers, data, cpu_m, filter_value=110)

                if any([v is None for v in cpu_metrics.values()]):
                    self.logger.warning("Invalid mpstat data: %s" % data)

                cpu_user = cpu_metrics["%usr"] + cpu_metrics["%user"] + cpu_metrics["%nice"]
                cpu_system = cpu_metrics["%sys"] + cpu_metrics["%irq"] + cpu_metrics["%soft"]
                cpu_wait = cpu_metrics["%iowait"]
                cpu_idle = cpu_metrics["%idle"]
                cpu_stolen = cpu_metrics["%steal"]
                cpu_guest = cpu_metrics["%guest"]

                return format_results(cpu_user, cpu_system, cpu_wait, cpu_idle, cpu_stolen, cpu_guest)
            else:
                return False

        elif sys.platform == 'darwin':
            # generate 3 seconds of data
            # ['          disk0           disk1       cpu     load average',
            #  '    KB/t tps  MB/s     KB/t tps  MB/s  us sy id   1m   5m   15m',
            #  '   21.23  13  0.27    17.85   7  0.13  14  7 79  1.04 1.27 1.31',
            #  '    4.00   3  0.01     5.00   8  0.04  12 10 78  1.04 1.27 1.31', '']
            iostats, _, _ = get_subprocess_output(['iostat', '-C', '-w', '3', '-c', '2'], self.logger)
            lines = [l for l in iostats.splitlines() if len(l) > 0]
            legend = [l for l in lines if "us" in l]
            if len(legend) == 1:
                headers = legend[0].split()
                data = lines[-1].split()
                cpu_user = get_value(headers, data, "us")
                cpu_sys = get_value(headers, data, "sy")
                cpu_wait = 0
                cpu_idle = get_value(headers, data, "id")
                cpu_st = 0
                return format_results(cpu_user, cpu_sys, cpu_wait, cpu_idle, cpu_st)
            else:
                self.logger.warn("Expected to get at least 4 lines of data from iostat instead of just "
                                 + str(iostats[:max(80, len(iostats))]))
                return False

        elif sys.platform.startswith("freebsd"):
            # generate 3 seconds of data
            #        tty            ada0             cd0            pass0             cpu
            # tin  tout  KB/t tps  MB/s  KB/t tps  MB/s  KB/t tps  MB/s  us ni sy in id
            #   0    69 26.71   0  0.01  0.00   0  0.00  0.00   0  0.00   2  0  0  1 97
            #   0    78  0.00   0  0.00  0.00   0  0.00  0.00   0  0.00   0  0  0  0 100
            iostats, _, _ = get_subprocess_output(['iostat', '-w', '3', '-c', '2'], self.logger)
            lines = [l for l in iostats.splitlines() if len(l) > 0]
            legend = [l for l in lines if "us" in l]
            if len(legend) == 1:
                headers = legend[0].split()
                data = lines[-1].split()
                cpu_user = get_value(headers, data, "us")
                cpu_nice = get_value(headers, data, "ni")
                cpu_sys = get_value(headers, data, "sy")
                cpu_intr = get_value(headers, data, "in")
                cpu_wait = 0
                cpu_idle = get_value(headers, data, "id")
                cpu_stol = 0
                return format_results(cpu_user + cpu_nice, cpu_sys + cpu_intr, cpu_wait, cpu_idle, cpu_stol)
            else:
                self.logger.warn("Expected to get at least 4 lines of data from iostat instead of just "
                                 + str(iostats[:max(80, len(iostats))]))
                return False

        elif sys.platform == 'sunos5':
            # mpstat -aq 1 2
            # SET minf mjf xcal  intr ithr  csw icsw migr smtx  srw syscl  usr sys  wt idl sze
            # 0 5239   0 12857 22969 5523 14628   73  546 4055    1 146856    5   6   0  89  24 <-- since boot
            # 1 ...
            # SET minf mjf xcal  intr ithr  csw icsw migr smtx  srw syscl  usr sys  wt idl sze
            # 0 20374   0 45634 57792 5786 26767   80  876 20036    2 724475   13  13   0  75  24 <-- past 1s
            # 1 ...
            # http://docs.oracle.com/cd/E23824_01/html/821-1462/mpstat-1m.html
            #
            # Will aggregate over all processor sets
            output, _, _ = get_subprocess_output(['mpstat', '-aq', '1', '2'], self.logger)
            mpstat = output.splitlines()
            lines = [l for l in mpstat if len(l) > 0]
            # discard the first len(lines)/2 lines
            lines = lines[len(lines) / 2:]
            legend = [l for l in lines if "SET" in l]
            assert len(legend) == 1
            if len(legend) == 1:
                headers = legend[0].split()
                # collect stats for each processor set
                # and aggregate them based on the relative set size
                d_lines = [l for l in lines if "SET" not in l]
                user = [get_value(headers, l.split(), "usr") for l in d_lines]
                kern = [get_value(headers, l.split(), "sys") for l in d_lines]
                wait = [get_value(headers, l.split(), "wt") for l in d_lines]
                idle = [get_value(headers, l.split(), "idl") for l in d_lines]
                size = [get_value(headers, l.split(), "sze") for l in d_lines]
                count = sum(size)
                rel_size = [s / count for s in size]
                dot = lambda v1, v2: reduce(operator.add, map(operator.mul, v1, v2))
                return format_results(dot(user, rel_size),
                                      dot(kern, rel_size),
                                      dot(wait, rel_size),
                                      dot(idle, rel_size),
                                      0.0)
        else:
            self.logger.warn("CPUStats: unsupported platform")
            return False
    except Exception:
        self.logger.exception("Cannot compute CPU stats")
        return False
def check(self, agentConfig):
    if Platform.is_linux():
        proc_location = agentConfig.get('procfs_path', '/proc').rstrip('/')
        try:
            proc_meminfo = "{}/meminfo".format(proc_location)
            with open(proc_meminfo, 'r') as mem_info:
                lines = mem_info.readlines()
        except Exception:
            self.logger.exception('Cannot get memory metrics from %s', proc_meminfo)
            return False

        # NOTE: not all of the stats below are present on all systems as
        # not all kernel versions report all of them.
        #
        # $ cat /proc/meminfo
        # MemTotal:        7995360 kB
        # MemFree:         1045120 kB
        # MemAvailable:    1253920 kB
        # Buffers:          226284 kB
        # Cached:           775516 kB
        # SwapCached:       248868 kB
        # Active:          1004816 kB
        # Inactive:        1011948 kB
        # Active(anon):     455152 kB
        # Inactive(anon):   584664 kB
        # Active(file):     549664 kB
        # Inactive(file):   427284 kB
        # Unevictable:     4392476 kB
        # Mlocked:         4392476 kB
        # SwapTotal:      11120632 kB
        # SwapFree:       10555044 kB
        # Dirty:              2948 kB
        # Writeback:             0 kB
        # AnonPages:       5203560 kB
        # Mapped:            50520 kB
        # Shmem:             10108 kB
        # Slab:             161300 kB
        # SReclaimable:     136108 kB
        # SUnreclaim:        25192 kB
        # KernelStack:        3160 kB
        # PageTables:        26776 kB
        # NFS_Unstable:          0 kB
        # Bounce:                0 kB
        # WritebackTmp:          0 kB
        # CommitLimit:    15118312 kB
        # Committed_AS:    6703508 kB
        # VmallocTotal:   34359738367 kB
        # VmallocUsed:      400668 kB
        # VmallocChunk:   34359329524 kB
        # HardwareCorrupted:     0 kB
        # HugePages_Total:       0
        # HugePages_Free:        0
        # HugePages_Rsvd:        0
        # HugePages_Surp:        0
        # Hugepagesize:       2048 kB
        # DirectMap4k:       10112 kB
        # DirectMap2M:     8243200 kB

        # We run this several times so one-time compile now
        regexp = re.compile(r'^(\w+):\s+([0-9]+)')

        meminfo = {}
        parse_error = False
        for line in lines:
            try:
                match = regexp.search(line)
                if match is not None:
                    meminfo[match.group(1)] = match.group(2)
            except Exception:
                parse_error = True
        if parse_error:
            self.logger.error("Error parsing %s", proc_meminfo)

        memData = {}

        # Physical memory
        # FIXME units are in MB, we should use bytes instead
        try:
            memData['physTotal'] = int(meminfo.get('MemTotal', 0)) / 1024
            memData['physFree'] = int(meminfo.get('MemFree', 0)) / 1024
            memData['physBuffers'] = int(meminfo.get('Buffers', 0)) / 1024
            memData['physCached'] = int(meminfo.get('Cached', 0)) / 1024
            memData['physShared'] = int(meminfo.get('Shmem', 0)) / 1024
            memData['physSlab'] = int(meminfo.get('Slab', 0)) / 1024
            memData['physPageTables'] = int(meminfo.get('PageTables', 0)) / 1024
            memData['physUsed'] = memData['physTotal'] - memData['physFree']

            if 'MemAvailable' in meminfo:
                memData['physUsable'] = int(meminfo.get('MemAvailable', 0)) / 1024
            else:
                # Usable is relative since cached and buffers are actually used to speed things up.
                memData['physUsable'] = memData['physFree'] + memData['physBuffers'] + memData['physCached']

            if memData['physTotal'] > 0:
                memData['physPctUsable'] = float(memData['physUsable']) / float(memData['physTotal'])
        except Exception:
            self.logger.exception('Cannot compute stats from %s', proc_meminfo)

        # Swap
        # FIXME units are in MB, we should use bytes instead
        try:
            memData['swapTotal'] = int(meminfo.get('SwapTotal', 0)) / 1024
            memData['swapFree'] = int(meminfo.get('SwapFree', 0)) / 1024
            memData['swapCached'] = int(meminfo.get('SwapCached', 0)) / 1024
            memData['swapUsed'] = memData['swapTotal'] - memData['swapFree']

            if memData['swapTotal'] > 0:
                memData['swapPctFree'] = float(memData['swapFree']) / float(memData['swapTotal'])
        except Exception:
            self.logger.exception('Cannot compute swap stats')

        return memData

    elif sys.platform == 'darwin':
        if psutil is None:
            self.logger.error("psutil must be installed on MacOS to collect memory metrics")
            return False

        phys_memory = psutil.virtual_memory()
        swap = psutil.swap_memory()
        return {'physUsed': phys_memory.used / float(1024**2),
                'physFree': phys_memory.free / float(1024**2),
                'physUsable': phys_memory.available / float(1024**2),
                'physPctUsable': (100 - phys_memory.percent) / 100.0,
                'swapUsed': swap.used / float(1024**2),
                'swapFree': swap.free / float(1024**2)}

    elif sys.platform.startswith("freebsd"):
        try:
            output, _, _ = get_subprocess_output(['sysctl', 'vm.stats.vm'], self.logger)
            sysctl = output.splitlines()
        except Exception:
            self.logger.exception('getMemoryUsage')
            return False

        # ...
        # vm.stats.vm.v_page_size: 4096
        # vm.stats.vm.v_page_count: 759884
        # vm.stats.vm.v_wire_count: 122726
        # vm.stats.vm.v_active_count: 109350
        # vm.stats.vm.v_cache_count: 17437
        # vm.stats.vm.v_inactive_count: 479673
        # vm.stats.vm.v_free_count: 30542
        # ...

        # We run this several times so one-time compile now
        regexp = re.compile(r'^vm\.stats\.vm\.(\w+):\s+([0-9]+)')

        meminfo = {}
        parse_error = False
        for line in sysctl:
            try:
                match = regexp.search(line)
                if match is not None:
                    meminfo[match.group(1)] = match.group(2)
            except Exception:
                parse_error = True
        if parse_error:
            self.logger.error("Error parsing vm.stats.vm output: %s", sysctl)

        memData = {}

        # Physical memory
        try:
            pageSize = int(meminfo.get('v_page_size'))

            memData['physTotal'] = (int(meminfo.get('v_page_count', 0)) * pageSize) / 1048576
            memData['physFree'] = (int(meminfo.get('v_free_count', 0)) * pageSize) / 1048576
            memData['physCached'] = (int(meminfo.get('v_cache_count', 0)) * pageSize) / 1048576
            memData['physUsed'] = ((int(meminfo.get('v_active_count', 0)) +
                                    int(meminfo.get('v_wire_count', 0))) * pageSize) / 1048576
            memData['physUsable'] = ((int(meminfo.get('v_free_count', 0)) +
                                      int(meminfo.get('v_cache_count', 0)) +
                                      int(meminfo.get('v_inactive_count', 0))) * pageSize) / 1048576

            if memData['physTotal'] > 0:
                memData['physPctUsable'] = float(memData['physUsable']) / float(memData['physTotal'])
        except Exception:
            self.logger.exception('Cannot compute stats from vm.stats.vm output')

        # Swap
        try:
            output, _, _ = get_subprocess_output(['swapinfo', '-m'], self.logger)
            sysctl = output.splitlines()
        except Exception:
            self.logger.exception('getMemoryUsage')
            return False

        # ...
        # Device          1M-blocks     Used    Avail Capacity
        # /dev/ad0s1b           570        0      570     0%
        # ...
        assert "Device" in sysctl[0]

        try:
            memData['swapTotal'] = 0
            memData['swapFree'] = 0
            memData['swapUsed'] = 0
            for line in sysctl[1:]:
                if len(line) > 0:
                    line = line.split()
                    memData['swapTotal'] += int(line[1])
                    memData['swapFree'] += int(line[3])
                    memData['swapUsed'] += int(line[2])
        except Exception:
            self.logger.exception('Cannot compute stats from swapinfo')

        return memData

    elif sys.platform == 'sunos5':
        try:
            memData = {}
            cmd = ["kstat", "-m", "memory_cap", "-c", "zone_memory_cap", "-p"]
            output, _, _ = get_subprocess_output(cmd, self.logger)
            kmem = output.splitlines()

            # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:anon_alloc_fail    0
            # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:anonpgin           0
            # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:class              zone_memory_cap
            # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:crtime             16359935.0680834
            # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:execpgin           185
            # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:fspgin             2556
            # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:n_pf_throttle      0
            # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:n_pf_throttle_usec 0
            # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:nover              0
            # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:pagedout           0
            # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:pgpgin             2741
            # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:physcap            536870912  <--
            # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:rss                115544064  <--
            # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:snaptime           16787393.9439095
            # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:swap               91828224   <--
            # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:swapcap            1073741824 <--
            # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:zonename           53aa9b7e-48ba-4152-a52b-a6368c3d9e7c

            # turn memory_cap:360:zone_name:key value
            # into { "key": value, ...}
            kv = [l.strip().split() for l in kmem if len(l) > 0]
            entries = dict([(k.split(":")[-1], v) for (k, v) in kv])

            # extract rss, physcap, swap, swapcap, turn into MB
            convert = lambda v: int(long(v)) / 2**20
            memData["physTotal"] = convert(entries["physcap"])
            memData["physUsed"] = convert(entries["rss"])
            memData["physFree"] = memData["physTotal"] - memData["physUsed"]
            memData["swapTotal"] = convert(entries["swapcap"])
            memData["swapUsed"] = convert(entries["swap"])
            memData["swapFree"] = memData["swapTotal"] - memData["swapUsed"]

            if memData['swapTotal'] > 0:
                memData['swapPctFree'] = float(memData['swapFree']) / float(memData['swapTotal'])
            return memData
        except Exception:
            self.logger.exception("Cannot compute mem stats from kstat -c zone_memory_cap")
            return False
    else:
        return False
def check(self, agentConfig):
    """Capture io stats.

    @rtype dict
    @return {"device": {"metric": value, "metric": value}, ...}
    """
    io = {}
    try:
        if Platform.is_linux():
            stdout, _, _ = get_subprocess_output(['iostat', '-d', '1', '2', '-x', '-k'], self.logger)

            # Linux 2.6.32-343-ec2 (ip-10-35-95-10)  12/11/2012  _x86_64_  (2 CPU)
            #
            # Device:  rrqm/s  wrqm/s   r/s    w/s    rkB/s   wkB/s  avgrq-sz avgqu-sz await svctm %util
            # sda1       0.00   17.61  0.26  32.63     4.23  201.04     12.48     0.16  4.81  0.53  1.73
            # sdb        0.00    2.68  0.19   3.84     5.79   26.07     15.82     0.02  4.93  0.22  0.09
            # sdg        0.00    0.13  2.29   3.84   100.53   30.61     42.78     0.05  8.41  0.88  0.54
            # sdf        0.00    0.13  2.30   3.84   100.54   30.61     42.78     0.06  9.12  0.90  0.55
            # md0        0.00    0.00  0.05   3.37     1.41   30.01     18.35     0.00  0.00  0.00  0.00
            #
            # Device:  rrqm/s  wrqm/s   r/s    w/s    rkB/s   wkB/s  avgrq-sz avgqu-sz await svctm %util
            # sda1       0.00    0.00  0.00  10.89     0.00   43.56      8.00     0.03  2.73  2.73  2.97
            # sdb        0.00    0.00  0.00   2.97     0.00   11.88      8.00     0.00  0.00  0.00  0.00
            # sdg        0.00    0.00  0.00   0.00     0.00    0.00      0.00     0.00  0.00  0.00  0.00
            # sdf        0.00    0.00  0.00   0.00     0.00    0.00      0.00     0.00  0.00  0.00  0.00
            # md0        0.00    0.00  0.00   0.00     0.00    0.00      0.00     0.00  0.00  0.00  0.00
            io.update(self._parse_linux2(stdout))

        elif sys.platform == "sunos5":
            output, _, _ = get_subprocess_output(["iostat", "-x", "-d", "1", "2"], self.logger)
            iostat = output.splitlines()

            #                  extended device statistics <-- since boot
            # device      r/s    w/s   kr/s   kw/s wait actv  svc_t  %w  %b
            # ramdisk1    0.0    0.0    0.1    0.1  0.0  0.0    0.0   0   0
            # sd0         0.0    0.0    0.0    0.0  0.0  0.0    0.0   0   0
            # sd1        79.9  149.9 1237.6 6737.9  0.0  0.5    2.3   0  11
            #                  extended device statistics <-- past second
            # device      r/s    w/s   kr/s   kw/s wait actv  svc_t  %w  %b
            # ramdisk1    0.0    0.0    0.0    0.0  0.0  0.0    0.0   0   0
            # sd0         0.0    0.0    0.0    0.0  0.0  0.0    0.0   0   0
            # sd1         0.0  139.0    0.0 1850.6  0.0  0.0    0.1   0   1

            # discard the first half of the display (stats since boot)
            lines = [l for l in iostat if len(l) > 0]
            lines = lines[len(lines)/2:]

            assert "extended device statistics" in lines[0]
            headers = lines[1].split()
            assert "device" in headers
            for l in lines[2:]:
                cols = l.split()
                # cols[0] is the device
                # cols[1:] are the values
                io[cols[0]] = {}
                for i in range(1, len(cols)):
                    io[cols[0]][self.xlate(headers[i], "sunos")] = cols[i]

        elif sys.platform.startswith("freebsd"):
            output, _, _ = get_subprocess_output(["iostat", "-x", "-d", "1", "2"], self.logger)
            iostat = output.splitlines()

            # Be careful!
            # It looks like SunOS, but some columns (wait, svc_t) have different meaning
            #                   extended device statistics
            # device     r/s   w/s    kr/s    kw/s wait svc_t  %b
            # ad0        3.1   1.3    49.9    18.8    0   0.7   0
            #                   extended device statistics
            # device     r/s   w/s    kr/s    kw/s wait svc_t  %b
            # ad0        0.0   2.0     0.0    31.8    0   0.2   0

            # discard the first half of the display (stats since boot)
            lines = [l for l in iostat if len(l) > 0]
            lines = lines[len(lines)/2:]

            assert "extended device statistics" in lines[0]
            headers = lines[1].split()
            assert "device" in headers
            for l in lines[2:]:
                cols = l.split()
                # cols[0] is the device
                # cols[1:] are the values
                io[cols[0]] = {}
                for i in range(1, len(cols)):
                    io[cols[0]][self.xlate(headers[i], "freebsd")] = cols[i]

        elif sys.platform == 'darwin':
            iostat, _, _ = get_subprocess_output(['iostat', '-d', '-c', '2', '-w', '1'], self.logger)
            #          disk0           disk1          <-- number of disks
            #    KB/t tps  MB/s     KB/t tps  MB/s
            #   21.11  23  0.47    20.01   0  0.00
            #    6.67   3  0.02     0.00   0  0.00   <-- line of interest
            io = self._parse_darwin(iostat)
        else:
            return False

        # If we filter devices, do it now.
        device_blacklist_re = agentConfig.get('device_blacklist_re', None)
        if device_blacklist_re:
            filtered_io = {}
            for device, stats in io.iteritems():
                if not device_blacklist_re.match(device):
                    filtered_io[device] = stats
        else:
            filtered_io = io
        return filtered_io
    except Exception:
        self.logger.exception("Cannot extract IO statistics")
        return False
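# xlate() is defined elsewhere in this class; judging from its use above, it
# maps SunOS/FreeBSD iostat column names onto the Linux ones so every platform
# emits the same metric keys. A plausible sketch (the real tables may differ):
def xlate_sketch(metric_name, os_name):
    if os_name == "sunos":
        names = {"wait": "await", "svc_t": "svctm", "%b": "%util",
                 "kr/s": "rkB/s", "kw/s": "wkB/s"}
    elif os_name == "freebsd":
        names = {"svc_t": "await", "%b": "%util",
                 "kr/s": "rkB/s", "kw/s": "wkB/s"}
    else:
        names = {}
    # pass through anything we don't know how to translate
    return names.get(metric_name, metric_name)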
def _check_bsd(self, instance):
    netstat_flags = ['-i', '-b']

    # FreeBSD's netstat truncates device names unless you pass '-W'
    if Platform.is_freebsd():
        netstat_flags.append('-W')

    try:
        output, _, _ = get_subprocess_output(["netstat"] + netstat_flags, self.log)
        lines = output.splitlines()
        # Name  Mtu   Network       Address            Ipkts Ierrs     Ibytes    Opkts Oerrs     Obytes  Coll
        # lo0   16384 <Link#1>                        318258     0  428252203   318258     0  428252203     0
        # lo0   16384 localhost   fe80:1::1           318258     -  428252203   318258     -  428252203     -
        # lo0   16384 127           localhost         318258     -  428252203   318258     -  428252203     -
        # lo0   16384 localhost   ::1                 318258     -  428252203   318258     -  428252203     -
        # gif0* 1280  <Link#2>                             0     0          0        0     0          0     0
        # stf0* 1280  <Link#3>                             0     0          0        0     0          0     0
        # en0   1500  <Link#4>    04:0c:ce:db:4e:fa 20801309     0 13835457425 15149389     0 11508790198     0
        # en0   1500  seneca.loca fe80:4::60c:ceff: 20801309     - 13835457425 15149389     - 11508790198     -
        # en0   1500  2001:470:1f 2001:470:1f07:11d 20801309     - 13835457425 15149389     - 11508790198     -
        # en0   1500  2001:470:1f 2001:470:1f07:11d 20801309     - 13835457425 15149389     - 11508790198     -
        # en0   1500  192.168.1     192.168.1.63    20801309     - 13835457425 15149389     - 11508790198     -
        # en0   1500  2001:470:1f 2001:470:1f07:11d 20801309     - 13835457425 15149389     - 11508790198     -
        # p2p0  2304  <Link#5>    06:0c:ce:db:4e:fa        0     0          0        0     0          0     0
        # ham0  1404  <Link#6>    7a:79:05:4d:bf:f5    30100     0    6815204    18742     0    8494811     0
        # ham0  1404  5             5.77.191.245       30100     -    6815204    18742     -    8494811     -
        # ham0  1404  seneca.loca fe80:6::7879:5ff:    30100     -    6815204    18742     -    8494811     -
        # ham0  1404  2620:9b::54 2620:9b::54d:bff5    30100     -    6815204    18742     -    8494811     -

        headers = lines[0].split()

        # Given the irregular structure of the table above, better to parse from the end of each line
        # Verify headers first
        #          -7       -6        -5       -4       -3        -2      -1
        for h in ("Ipkts", "Ierrs", "Ibytes", "Opkts", "Oerrs", "Obytes", "Coll"):
            if h not in headers:
                self.log.error("%s not found in %s; cannot parse", h, headers)
                return False

        current = None
        for l in lines[1:]:
            # Another header row, abort now, this is IPv6 land
            if "Name" in l:
                break

            x = l.split()
            if len(x) == 0:
                break

            iface = x[0]
            if iface.endswith("*"):
                iface = iface[:-1]
            if iface == current:
                # skip multiple lines of same interface
                continue
            else:
                current = iface

            # Filter inactive interfaces
            if self._parse_value(x[-5]) or self._parse_value(x[-2]):
                iface = current
                metrics = {
                    'bytes_rcvd': self._parse_value(x[-5]),
                    'bytes_sent': self._parse_value(x[-2]),
                    'packets_in.count': self._parse_value(x[-7]),
                    'packets_in.error': self._parse_value(x[-6]),
                    'packets_out.count': self._parse_value(x[-4]),
                    'packets_out.error': self._parse_value(x[-3]),
                }
                self._submit_devicemetrics(iface, metrics)
    except SubprocessOutputEmptyError:
        self.log.exception("Error collecting connection stats.")

    try:
        netstat, _, _ = get_subprocess_output(["netstat", "-s", "-p", "tcp"], self.log)
        # 3651535 packets sent
        #         972097 data packets (615753248 bytes)
        #         5009 data packets (2832232 bytes) retransmitted
        #         0 resends initiated by MTU discovery
        #         2086952 ack-only packets (471 delayed)
        #         0 URG only packets
        #         0 window probe packets
        #         310851 window update packets
        #         336829 control packets
        #         0 data packets sent after flow control
        #         3058232 checksummed in software
        #         3058232 segments (571218834 bytes) over IPv4
        #         0 segments (0 bytes) over IPv6
        # 4807551 packets received
        #         1143534 acks (for 616095538 bytes)
        #         165400 duplicate acks
        #         ...
        self._submit_regexed_values(netstat, BSD_TCP_METRICS)
    except SubprocessOutputEmptyError:
        self.log.exception("Error collecting TCP stats.")
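# _parse_value() is not shown here; the netstat table above reports '-' instead
# of a counter on some rows, so it must map non-numeric tokens to 0. A minimal
# sketch under that assumption:
def parse_value_sketch(v):
    try:
        return long(v)
    except ValueError:
        # inactive interfaces report '-' instead of a counter
        return 0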
def _populate_payload_metadata(self, payload, check_statuses, start_event=True):
    """
    Periodically populate the payload with metadata related to the
    system, host, and/or checks.
    """
    now = time.time()

    # Include system stats on first postback
    if start_event and self._is_first_run():
        payload['systemStats'] = self.agentConfig.get('system_stats', {})
        # Also post an event in the newsfeed
        payload['events']['System'] = [{
            'api_key': self.agentConfig['api_key'],
            'host': payload['internalHostname'],
            'timestamp': now,
            'event_type': 'Agent Startup',
            'msg_text': 'Version %s' % get_version()
        }]

    # Periodically send the host metadata.
    if self._should_send_additional_data('host_metadata'):
        # gather metadata with gohai
        try:
            if not Platform.is_windows():
                command = "gohai"
            else:
                command = "gohai\\gohai.exe"
            gohai_metadata, gohai_err, _ = get_subprocess_output([command], log)
            payload['gohai'] = gohai_metadata
            if gohai_err:
                log.warning("GOHAI LOG | {0}".format(gohai_err))
        except OSError as e:
            if e.errno == 2:  # file not found, expected when install from source
                log.info("gohai file not found")
            else:
                raise e
        except Exception as e:
            log.warning("gohai command failed with error %s" % str(e))

        payload['systemStats'] = get_system_stats()
        payload['meta'] = self._get_hostname_metadata()

        self.hostname_metadata_cache = payload['meta']
        # Add static tags from the configuration file
        host_tags = []
        if self.agentConfig['tags'] is not None:
            host_tags.extend([unicode(tag.strip()) for tag in self.agentConfig['tags'].split(",")])

        if self.agentConfig['collect_ec2_tags']:
            host_tags.extend(EC2.get_tags(self.agentConfig))

        if host_tags:
            payload['host-tags']['system'] = host_tags

        GCE_tags = GCE.get_tags(self.agentConfig)
        if GCE_tags is not None:
            payload['host-tags'][GCE.SOURCE_TYPE_NAME] = GCE_tags

        # Log the metadata on the first run
        if self._is_first_run():
            log.info("Hostnames: %s, tags: %s" % (repr(self.hostname_metadata_cache), payload['host-tags']))

    # Periodically send extra hosts metadata (vsphere)
    # Metadata of hosts that are not the host where the agent runs; not all the checks use that
    external_host_tags = []
    if self._should_send_additional_data('external_host_tags'):
        for check in self.initialized_checks_d:
            try:
                getter = getattr(check, 'get_external_host_tags')
                check_tags = getter()
                external_host_tags.extend(check_tags)
            except AttributeError:
                pass

    if external_host_tags:
        payload['external_host_tags'] = external_host_tags

    # Periodically send agent_checks metadata
    if self._should_send_additional_data('agent_checks'):
        # Add agent checks statuses and error/warning messages
        agent_checks = []
        for check in check_statuses:
            if check.instance_statuses is not None:
                for i, instance_status in enumerate(check.instance_statuses):
                    agent_checks.append(
                        (
                            check.name, check.source_type_name,
                            instance_status.instance_id,
                            instance_status.status,
                            # put error message or list of warning messages in the same field
                            # it will be handled by the UI
                            instance_status.error or instance_status.warnings or "",
                            check.service_metadata[i]
                        )
                    )
            else:
                agent_checks.append(
                    (
                        check.name, check.source_type_name,
                        "initialization",
                        check.status, repr(check.init_failed_error)
                    )
                )
        payload['agent_checks'] = agent_checks
        payload['meta'] = self.hostname_metadata_cache  # add hostname metadata

    # If required by the user, let's create the dd_check:xxx host tags
    if self.agentConfig['create_dd_check_tags'] and \
            self._should_send_additional_data('dd_check_tags'):
        app_tags_list = [DD_CHECK_TAG.format(c.name) for c in self.initialized_checks_d]
        app_tags_list.extend([DD_CHECK_TAG.format(cname) for cname
                              in JMXFiles.get_jmx_appnames()])

        if 'system' not in payload['host-tags']:
            payload['host-tags']['system'] = []

        payload['host-tags']['system'].extend(app_tags_list)
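# Illustration only: assuming DD_CHECK_TAG is a format string along the lines
# of 'dd_check:{0}' (its actual value lives elsewhere in the agent), a host
# running the 'nginx' and 'redisdb' checks would get these tags appended to
# the 'system' group of the payload's host-tags.
if __name__ == '__main__':
    DD_CHECK_TAG_SKETCH = 'dd_check:{0}'  # assumed value of DD_CHECK_TAG
    assert [DD_CHECK_TAG_SKETCH.format(n) for n in ('nginx', 'redisdb')] == \
        ['dd_check:nginx', 'dd_check:redisdb']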