def get_system_stats():
    """Gather static facts about the host: architecture, OS and CPU count.

    Returns:
        dict with the keys 'machine', 'platform', 'processor' and 'pythonV',
        plus 'cpuCores' on Linux, Darwin and FreeBSD, and one OS version
        entry depending on the platform: 'nixV' (Linux), 'macV' (Darwin),
        'fbsdV' (FreeBSD) or 'winV' (win32).
    """
    systemStats = {
        'machine': platform.machine(),
        'platform': sys.platform,
        'processor': platform.processor(),
        'pythonV': platform.python_version(),
    }

    platf = sys.platform

    if Platform.is_linux(platf):
        # Equivalent of: grep 'model name' /proc/cpuinfo | wc -l
        grep = subprocess.Popen(['grep', 'model name', '/proc/cpuinfo'],
                                stdout=subprocess.PIPE, close_fds=True)
        wc = subprocess.Popen(['wc', '-l'], stdin=grep.stdout,
                              stdout=subprocess.PIPE, close_fds=True)
        # `wc` owns its own copy of the pipe: release ours so the descriptor
        # is not leaked, then reap `grep` once the pipeline has finished.
        grep.stdout.close()
        wc_output = wc.communicate()[0]
        grep.wait()
        systemStats['cpuCores'] = int(wc_output)

    if Platform.is_darwin(platf) or Platform.is_freebsd(platf):
        # The output is split on ': ' and the second field is the core count
        sysctl_output = subprocess.Popen(['sysctl', 'hw.ncpu'],
                                         stdout=subprocess.PIPE,
                                         close_fds=True).communicate()[0]
        systemStats['cpuCores'] = int(sysctl_output.split(': ')[1])

    if Platform.is_linux(platf):
        systemStats['nixV'] = platform.dist()
    elif Platform.is_darwin(platf):
        systemStats['macV'] = platform.mac_ver()
    elif Platform.is_freebsd(platf):
        version = platform.uname()[2]
        systemStats['fbsdV'] = ('freebsd', version, '')  # no codename for FreeBSD
    elif Platform.is_win32(platf):
        systemStats['winV'] = platform.win32_ver()

    return systemStats
def check(self, agentConfig):
    """Take a snapshot of the process table by running `ps`.

    Returns a dict holding the parsed `ps` rows under 'processes' (each row
    is a list of at most 11 stripped fields), plus 'apiKey' and 'host'.
    Returns False when `ps` could not be run.
    """
    # The 'exclude_process_args' option selects the shorter 'aux' form
    if agentConfig.get('exclude_process_args', False):
        ps_arg = 'aux'
    else:
        ps_arg = 'auxww'

    # Get output from ps
    try:
        ps_process = sp.Popen(['ps', ps_arg], stdout=sp.PIPE, close_fds=True)
        raw_output = ps_process.communicate()[0]
    except StandardError:
        self.logger.exception('getProcesses')
        return False

    # One entry per line, minus the header row and the trailing empty entry
    rows = raw_output.split('\n')[1:]
    rows.pop()

    # Split on whitespace into at most 11 columns; the last one keeps the
    # remainder of the line intact
    processes = [
        [field.strip() for field in row.split(None, 10)]
        for row in rows
    ]

    return {
        'processes': processes,
        'apiKey': agentConfig['api_key'],
        'host': get_hostname(agentConfig)
    }
def execute(self, process_args, redirect_std_streams=None, env=None):
    """Run `process_args` as a child process and block until it exits.

    When `redirect_std_streams` is truthy, the child's stdout and stderr are
    captured in temporary files and copied to this process' `sys.stdout` and
    `sys.stderr` once the child has exited.

    Returns the child's return code. Any exception is logged and re-raised.
    """
    try:
        with nested(tempfile.TemporaryFile(), tempfile.TemporaryFile()) as (stdout_f, stderr_f):
            if redirect_std_streams:
                # close_fds is only set to True when the streams are not
                # redirected, for WIN compatibility
                popen_options = dict(close_fds=False, stdout=stdout_f, stderr=stderr_f)
            else:
                popen_options = dict(close_fds=True, stdout=None, stderr=None)

            self._process = subprocess.Popen(process_args, env=env, **popen_options)
            self._running = True

            # Register SIGINT and SIGTERM signal handlers
            self.register_signal_handlers()

            # Wait for process to return
            self._process.wait()
            self._running = False

            if redirect_std_streams:
                # Rewind the temporary files and replay what the child wrote
                stderr_f.seek(0)
                captured_err = stderr_f.read()
                stdout_f.seek(0)
                captured_out = stdout_f.read()
                sys.stdout.write(captured_out)
                sys.stderr.write(captured_err)

            return self._process.returncode
    except Exception:
        log.exception("Could not launch process")
        raise
def _get_hostname_unix(): try: # try fqdn p = subprocess.Popen(['/bin/hostname', '-f'], stdout=subprocess.PIPE) out, err = p.communicate() if p.returncode == 0: return out.strip() except Exception: return None
def check(self, agentConfig):
    """Return the system load averages.

    The raw figures are read from /proc/loadavg on Linux and from the output
    of `uptime` on Darwin, SunOS and FreeBSD.

    Returns a dict with 'system.load.1', 'system.load.5' and
    'system.load.15'. When agentConfig['system_stats']['cpuCores'] holds a
    core count >= 1, the same three figures divided by the number of cores
    are added as 'system.load.norm.*'.

    Returns False when the load cannot be read or parsed, or when the
    platform is not supported.
    """
    if Platform.is_linux():
        try:
            with open('/proc/loadavg', 'r') as load_avg:
                uptime = load_avg.readline().strip()
        except Exception:
            self.logger.exception('Cannot extract load')
            return False
    elif sys.platform in ('darwin', 'sunos5') or sys.platform.startswith("freebsd"):
        # Get output from uptime
        try:
            uptime = sp.Popen(['uptime'], stdout=sp.PIPE, close_fds=True).communicate()[0]
        except Exception:
            self.logger.exception('Cannot extract load')
            return False
    else:
        # Without this branch `uptime` would be unbound below
        self.logger.debug('Cannot extract load: unsupported platform')
        return False

    # Split out the 3 load average values (a comma may be the decimal mark)
    load = [res.replace(',', '.') for res in re.findall(r'([0-9]+[\.,]\d+)', uptime)]
    if len(load) < 3:
        self.logger.error('Cannot extract load from: %s', uptime)
        return False

    load_1, load_5, load_15 = [float(value) for value in load[:3]]
    results = {
        'system.load.1': load_1,
        'system.load.5': load_5,
        'system.load.15': load_15,
    }

    # Normalize load by number of cores
    try:
        cores = int(agentConfig.get('system_stats').get('cpuCores'))
    except Exception:
        # Best effort: no (usable) core count means no normalized load
        cores = 0

    if cores >= 1:
        # Normalized load is named .load.norm to make it easy to find next to .load
        results['system.load.norm.1'] = load_1 / cores
        results['system.load.norm.5'] = load_5 / cores
        results['system.load.norm.15'] = load_15 / cores

    return results
def __init__(self, logger):
    """Initialise the check and precompute platform-dependent settings.

    Sets:
      * self.topIndex: 6 on Mac OS X 10.6 and later, 5 otherwise.
      * self.pagesize: output of the `pagesize` command on SunOS, 0 on other
        platforms or when the command fails.
    """
    Check.__init__(self, logger)

    # Output from top is slightly modified on OS X 10.6 (case #28239) and greater
    self.topIndex = 5
    if sys.platform == 'darwin':
        macV = platform.mac_ver()
        # Compare (major, minor) so that releases that do not start with
        # "10." (e.g. "11.2.3") are handled instead of raising on a failed match.
        version = re.match(r'(\d+)(?:\.(\d+))?', macV[0])
        if version is not None:
            major = int(version.group(1))
            minor = int(version.group(2) or 0)
            if (major, minor) >= (10, 6):
                self.topIndex = 6

    self.pagesize = 0
    if sys.platform == 'sunos5':
        try:
            pgsz = sp.Popen(['pagesize'], stdout=sp.PIPE, close_fds=True).communicate()[0]
            self.pagesize = int(pgsz.strip())
        except Exception:
            # No page size available
            pass
def _start(self, path_to_java, java_run_opts, jmx_checks, command, reporter, tools_jar_path, custom_jar_paths, redirect_std_streams):
    """Build the JMXFetch command line, run it and wait for it to exit.

    Args:
        path_to_java: java binary to run; "java" is used when falsy.
        java_run_opts: string of extra JVM options; default heap settings are
            appended when it sets neither Xmx/XX:MaxHeapSize nor
            Xms/XX:InitialHeapSize.
        jmx_checks: names of the checks passed after '--check'.
        command: name of the JMXFetch command, passed as last argument.
        reporter: reporter string; when None, "statsd:<dogstatsd_port>" is used.
        tools_jar_path: optional jar prepended to the classpath.
        custom_jar_paths: optional list of jars prepended to the classpath.
        redirect_std_streams: when truthy, the child's stdout/stderr are
            captured in temporary files and copied to sys.stdout/sys.stderr
            once the child has exited.

    Returns:
        The return code of the JMXFetch process.

    Raises:
        OSError: the process could not be launched; a status file flagging
            every check as invalid is written before re-raising.
        Exception: any other failure is logged and re-raised.
    """
    import os  # local import: only needed for the classpath separator

    statsd_port = self.agentConfig.get('dogstatsd_port', "8125")
    if reporter is None:
        reporter = "statsd:%s" % str(statsd_port)

    log.info("Starting jmxfetch:")
    try:
        path_to_java = path_to_java or "java"
        java_run_opts = java_run_opts or ""
        path_to_jmxfetch = self._get_path_to_jmxfetch()
        path_to_status_file = JMXFiles.get_status_file_path()

        # os.pathsep is ':' on POSIX and ';' on Windows, which is what the
        # JVM expects between classpath entries on each platform.
        classpath = path_to_jmxfetch
        if tools_jar_path is not None:
            classpath = r"%s%s%s" % (tools_jar_path, os.pathsep, classpath)
        if custom_jar_paths:
            classpath = r"%s%s%s" % (os.pathsep.join(custom_jar_paths), os.pathsep, classpath)

        subprocess_args = [
            path_to_java,  # Path to the java bin
            '-classpath',
            classpath,
            JMXFETCH_MAIN_CLASS,
            '--check_period', str(self.check_frequency * 1000),  # Period of the main loop of jmxfetch in ms
            '--conf_directory', r"%s" % self.confd_path,  # Path of the conf.d directory that will be read by jmxfetch,
            '--log_level', JAVA_LOGGING_LEVEL.get(self.logging_config.get("log_level"), "INFO"),  # Log Level: Mapping from Python log level to log4j log levels
            '--log_location', r"%s" % self.logging_config.get('jmxfetch_log_file'),  # Path of the log file
            '--reporter', reporter,  # Reporter to use
            '--status_location', r"%s" % path_to_status_file,  # Path to the status file to write
            command,  # Name of the command
        ]

        if Platform.is_windows():
            # Signal handlers are not supported on Windows:
            # use a file to trigger JMXFetch exit instead
            path_to_exit_file = JMXFiles.get_python_exit_file_path()
            subprocess_args.insert(len(subprocess_args) - 1, '--exit_file_location')
            subprocess_args.insert(len(subprocess_args) - 1, path_to_exit_file)

        # '--check' goes right after the main class, followed by the check names
        subprocess_args.insert(4, '--check')
        for check in jmx_checks:
            subprocess_args.insert(5, check)

        # Specify a maximum memory allocation pool for the JVM
        if "Xmx" not in java_run_opts and "XX:MaxHeapSize" not in java_run_opts:
            java_run_opts += _JVM_DEFAULT_MAX_MEMORY_ALLOCATION
        # Specify the initial memory allocation pool for the JVM
        if "Xms" not in java_run_opts and "XX:InitialHeapSize" not in java_run_opts:
            java_run_opts += _JVM_DEFAULT_INITIAL_MEMORY_ALLOCATION

        # JVM options go right after the java binary
        for opt in java_run_opts.split():
            subprocess_args.insert(1, opt)

        log.info("Running %s" % " ".join(subprocess_args))

        # Launch JMXfetch subprocess manually, w/o get_subprocess_output(), since it's a special case
        with nested(tempfile.TemporaryFile(), tempfile.TemporaryFile()) as (stdout_f, stderr_f):
            jmx_process = subprocess.Popen(
                subprocess_args,
                close_fds=not redirect_std_streams,  # only set to True when the streams are not redirected, for WIN compatibility
                stdout=stdout_f if redirect_std_streams else None,
                stderr=stderr_f if redirect_std_streams else None
            )
            self.jmx_process = jmx_process

            # Register SIGINT and SIGTERM signal handlers
            self.register_signal_handlers()

            # Wait for JMXFetch to return
            jmx_process.wait()

            if redirect_std_streams:
                # The streams were sent to temporary files, not pipes, so
                # communicate() would return (None, None): read the files back
                # and write their content out to sys.stdout and sys.stderr.
                stderr_f.seek(0)
                err = stderr_f.read()
                stdout_f.seek(0)
                out = stdout_f.read()
                sys.stdout.write(out)
                sys.stderr.write(err)

            return jmx_process.returncode

    except OSError:
        java_path_msg = "Couldn't launch JMXTerm. Is Java in your PATH ?"
        log.exception(java_path_msg)
        # Report every requested check as invalid in the status file
        invalid_checks = {}
        for check in jmx_checks:
            check_name = check.split('.')[0]
            check_name = check_name.encode('ascii', 'ignore')
            invalid_checks[check_name] = java_path_msg
        JMXFiles.write_status_file(invalid_checks)
        raise
    except Exception:
        log.exception("Couldn't launch JMXFetch")
        raise
def check(self, agentConfig):
    """Return an aggregate of CPU stats across all CPUs

    When figures are not available, False is sent back.

    The figures are parsed from `mpstat` (Linux, SunOS) or `iostat` (Darwin,
    FreeBSD). On success a dict is returned with the keys 'cpuUser',
    'cpuSystem', 'cpuWait', 'cpuIdle', 'cpuStolen' and 'cpuGuest'; entries
    whose value is None are left out.
    """
    def format_results(us, sy, wa, idle, st, guest=None):
        data = {
            'cpuUser': us,
            'cpuSystem': sy,
            'cpuWait': wa,
            'cpuIdle': idle,
            'cpuStolen': st,
            'cpuGuest': guest
        }
        return dict((k, v) for k, v in data.items() if v is not None)

    def get_value(legend, data, name, filter_value=None):
        "Using the legend and a metric name, get the value or None from the data line"
        if name in legend:
            value = to_float(data[legend.index(name)])
            if filter_value is not None:
                if value > filter_value:
                    return None
            return value
        else:
            # FIXME return a float or False, would trigger type error if not python
            self.logger.debug("Cannot extract cpu value %s from %s (%s)" % (name, data, legend))
            return 0.0

    try:
        if Platform.is_linux():
            mpstat = sp.Popen(['mpstat', '1', '3'], stdout=sp.PIPE, close_fds=True).communicate()[0]
            # topdog@ip:~$ mpstat 1 3
            # Linux 2.6.32-341-ec2 (ip) 01/19/2012 _x86_64_ (2 CPU)
            #
            # 04:22:41 PM CPU %usr %nice %sys %iowait %irq %soft %steal %guest %idle
            # 04:22:42 PM all 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 100.00
            # 04:22:43 PM all 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 100.00
            # 04:22:44 PM all 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 100.00
            # Average:    all 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 100.00
            #
            # OR
            #
            # Thanks to Mart Visser to spotting this one.
            # blah:/etc/dd-agent# mpstat
            # Linux 2.6.26-2-xen-amd64 (atira) 02/17/2012 _x86_64_
            #
            # 05:27:03 PM CPU %user %nice %sys %iowait %irq %soft %steal %idle intr/s
            # 05:27:03 PM all 3.59 0.00 0.68 0.69 0.00 0.00 0.01 95.03 43.65
            #
            lines = mpstat.split("\n")
            legend = [l for l in lines if "%usr" in l or "%user" in l]
            avg = [l for l in lines if "Average" in l]
            if len(legend) == 1 and len(avg) == 1:
                # Drop the AM/PM marker so that headers and data columns line up
                headers = [h for h in legend[0].split() if h not in ("AM", "PM")]
                data = avg[0].split()

                # Userland
                # Debian lenny says %user so we look for both
                # One of them will be 0
                metric_names = ("%usr", "%user", "%nice", "%iowait", "%idle",
                                "%sys", "%irq", "%soft", "%steal", "%guest")
                # Values above 110 are discarded: get_value returns None for them
                cpu_metrics = dict(
                    (name, get_value(headers, data, name, filter_value=110))
                    for name in metric_names
                )

                if any(v is None for v in cpu_metrics.values()):
                    self.logger.warning("Invalid mpstat data: %s" % data)

                # A None among the summed metrics raises a TypeError here, which
                # the outer handler turns into False; a None in one of the
                # single metrics is simply left out by format_results.
                cpu_user = cpu_metrics["%usr"] + cpu_metrics["%user"] + cpu_metrics["%nice"]
                cpu_system = cpu_metrics["%sys"] + cpu_metrics["%irq"] + cpu_metrics["%soft"]
                cpu_wait = cpu_metrics["%iowait"]
                cpu_idle = cpu_metrics["%idle"]
                cpu_stolen = cpu_metrics["%steal"]
                cpu_guest = cpu_metrics["%guest"]

                return format_results(cpu_user, cpu_system, cpu_wait, cpu_idle, cpu_stolen, cpu_guest)
            else:
                return False

        elif sys.platform == 'darwin':
            # generate 3 seconds of data
            #           disk0           disk1       cpu     load average
            #     KB/t tps  MB/s     KB/t tps  MB/s  us sy id   1m   5m   15m
            #    21.23  13  0.27    17.85   7  0.13  14  7 79  1.04 1.27 1.31
            #     4.00   3  0.01     5.00   8  0.04  12 10 78  1.04 1.27 1.31
            iostats = sp.Popen(['iostat', '-C', '-w', '3', '-c', '2'], stdout=sp.PIPE, close_fds=True).communicate()[0]
            lines = [l for l in iostats.split("\n") if len(l) > 0]
            legend = [l for l in lines if "us" in l]
            if len(legend) == 1:
                headers = legend[0].split()
                # The last line holds the most recent sample
                data = lines[-1].split()
                cpu_user = get_value(headers, data, "us")
                cpu_sys = get_value(headers, data, "sy")
                cpu_wait = 0
                cpu_idle = get_value(headers, data, "id")
                cpu_st = 0
                return format_results(cpu_user, cpu_sys, cpu_wait, cpu_idle, cpu_st)
            else:
                # Only quote the first 80 characters of the unexpected output
                self.logger.warning("Expected to get at least 4 lines of data from iostat instead of just " + str(iostats[:80]))
                return False

        elif sys.platform.startswith("freebsd"):
            # generate 3 seconds of data
            #        tty            ada0              cd0            pass0             cpu
            #  tin  tout  KB/t tps  MB/s   KB/t tps  MB/s   KB/t tps  MB/s  us ni sy in id
            #    0    69 26.71   0  0.01   0.00   0  0.00   0.00   0  0.00   2  0  0  1 97
            #    0    78  0.00   0  0.00   0.00   0  0.00   0.00   0  0.00   0  0  0  0 100
            iostats = sp.Popen(['iostat', '-w', '3', '-c', '2'], stdout=sp.PIPE, close_fds=True).communicate()[0]
            lines = [l for l in iostats.split("\n") if len(l) > 0]
            legend = [l for l in lines if "us" in l]
            if len(legend) == 1:
                headers = legend[0].split()
                data = lines[-1].split()
                cpu_user = get_value(headers, data, "us")
                cpu_nice = get_value(headers, data, "ni")
                cpu_sys = get_value(headers, data, "sy")
                cpu_intr = get_value(headers, data, "in")
                cpu_wait = 0
                cpu_idle = get_value(headers, data, "id")
                cpu_stol = 0
                return format_results(cpu_user + cpu_nice, cpu_sys + cpu_intr, cpu_wait, cpu_idle, cpu_stol)
            else:
                # Only quote the first 80 characters of the unexpected output
                self.logger.warning("Expected to get at least 4 lines of data from iostat instead of just " + str(iostats[:80]))
                return False

        elif sys.platform == 'sunos5':
            # mpstat -aq 1 2
            # SET minf mjf xcal intr ithr csw icsw migr smtx srw syscl usr sys wt idl sze
            # 0 5239 0 12857 22969 5523 14628 73 546 4055 1 146856 5 6 0 89 24 <-- since boot
            # 1 ...
            # SET minf mjf xcal intr ithr csw icsw migr smtx srw syscl usr sys wt idl sze
            # 0 20374 0 45634 57792 5786 26767 80 876 20036 2 724475 13 13 0 75 24 <-- past 1s
            # 1 ...
            # http://docs.oracle.com/cd/E23824_01/html/821-1462/mpstat-1m.html
            #
            # Will aggregate over all processor sets
            mpstat = sp.Popen(['mpstat', '-aq', '1', '2'], stdout=sp.PIPE, close_fds=True).communicate()[0]
            lines = [l for l in mpstat.split("\n") if len(l) > 0]
            # discard the first len(lines)/2 lines
            lines = lines[len(lines) // 2:]
            legend = [l for l in lines if "SET" in l]
            if len(legend) != 1:
                # Explicit check rather than an assert, which `python -O` strips
                self.logger.warning("Cannot compute CPU stats: unexpected mpstat output")
                return False

            headers = legend[0].split()
            # collect stats for each processor set
            # and aggregate them based on the relative set size
            d_lines = [l for l in lines if "SET" not in l]
            user = [get_value(headers, l.split(), "usr") for l in d_lines]
            kern = [get_value(headers, l.split(), "sys") for l in d_lines]
            wait = [get_value(headers, l.split(), "wt") for l in d_lines]
            idle = [get_value(headers, l.split(), "idl") for l in d_lines]
            size = [get_value(headers, l.split(), "sze") for l in d_lines]
            count = sum(size)
            rel_size = [s / count for s in size]
            # Weighted sum: each set contributes in proportion to its size
            dot = lambda v1, v2: reduce(operator.add, map(operator.mul, v1, v2))
            return format_results(dot(user, rel_size),
                                  dot(kern, rel_size),
                                  dot(wait, rel_size),
                                  dot(idle, rel_size),
                                  0.0)
        else:
            self.logger.warning("CPUStats: unsupported platform")
            return False
    except Exception:
        self.logger.exception("Cannot compute CPU stats")
        return False
def check(self, agentConfig):
    """Return memory and swap usage figures for the current platform.

    Sources: /proc/meminfo (Linux), `top -l 1` and `sysctl vm.swapusage`
    (Darwin), `sysctl vm.stats.vm` and `swapinfo -m` (FreeBSD),
    `kstat -c zone_memory_cap -p` (SunOS).

    Returns a dict whose keys depend on the platform ('physTotal',
    'physFree', 'physUsed', 'swapTotal', 'swapFree', 'swapUsed', ...).
    On Linux, FreeBSD and SunOS the values are integers in MB, plus ratios
    for the '*Pct*' keys; on Darwin they are the strings extracted from the
    command output. Returns False when the data cannot be collected or the
    platform is not supported.
    """
    if Platform.is_linux():
        try:
            with open('/proc/meminfo', 'r') as mem_info:
                lines = mem_info.readlines()
        except Exception:
            self.logger.exception('Cannot get memory metrics from /proc/meminfo')
            return False

        # $ cat /proc/meminfo
        # MemTotal:        7995360 kB
        # MemFree:         1045120 kB
        # Buffers:          226284 kB
        # Cached:           775516 kB
        # ...
        # SwapTotal:      11120632 kB
        # SwapFree:       10555044 kB
        # ...
        # Shmem:             10108 kB
        # ...

        # We run this several times so one-time compile now
        regexp = re.compile(r'^(\w+):\s+([0-9]+)')
        meminfo = {}
        for line in lines:
            match = regexp.match(line)
            if match is not None:
                meminfo[match.group(1)] = match.group(2)

        memData = {}

        # Physical memory
        # FIXME units are in MB, we should use bytes instead
        try:
            memData['physTotal'] = int(meminfo.get('MemTotal', 0)) // 1024
            memData['physFree'] = int(meminfo.get('MemFree', 0)) // 1024
            memData['physBuffers'] = int(meminfo.get('Buffers', 0)) // 1024
            memData['physCached'] = int(meminfo.get('Cached', 0)) // 1024
            memData['physShared'] = int(meminfo.get('Shmem', 0)) // 1024

            memData['physUsed'] = memData['physTotal'] - memData['physFree']
            # Usable is relative since cached and buffers are actually used to speed things up.
            memData['physUsable'] = memData['physFree'] + memData['physBuffers'] + memData['physCached']

            if memData['physTotal'] > 0:
                memData['physPctUsable'] = float(memData['physUsable']) / float(memData['physTotal'])
        except Exception:
            self.logger.exception('Cannot compute stats from /proc/meminfo')

        # Swap
        # FIXME units are in MB, we should use bytes instead
        try:
            memData['swapTotal'] = int(meminfo.get('SwapTotal', 0)) // 1024
            memData['swapFree'] = int(meminfo.get('SwapFree', 0)) // 1024

            memData['swapUsed'] = memData['swapTotal'] - memData['swapFree']

            if memData['swapTotal'] > 0:
                memData['swapPctFree'] = float(memData['swapFree']) / float(memData['swapTotal'])
        except Exception:
            self.logger.exception('Cannot compute swap stats')

        return memData

    elif sys.platform == 'darwin':
        # Compare (major, minor) so that releases that do not start with
        # "10." (e.g. "11.2.3") are handled instead of raising on a failed match.
        mac_version = None
        version = re.match(r'(\d+)(?:\.(\d+))?', platform.mac_ver()[0])
        if version is not None:
            mac_version = (int(version.group(1)), int(version.group(2) or 0))

        try:
            top = sp.Popen(['top', '-l 1'], stdout=sp.PIPE, close_fds=True).communicate()[0]
            sysctl = sp.Popen(['sysctl', 'vm.swapusage'], stdout=sp.PIPE, close_fds=True).communicate()[0]
        except Exception:
            self.logger.exception('getMemoryUsage')
            return False

        # Deal with top
        lines = top.split('\n')
        physParts = re.findall(r'([0-9]\d+)', lines[self.topIndex])

        # Deal with sysctl
        swapParts = re.findall(r'([0-9]+\.\d+)', sysctl)

        # Mavericks changes the layout of physical memory format in `top`
        physUsedPartIndex = 3
        physFreePartIndex = 4
        if mac_version is not None and mac_version >= (10, 9):
            physUsedPartIndex = 0
            physFreePartIndex = 2

        return {
            'physUsed': physParts[physUsedPartIndex],
            'physFree': physParts[physFreePartIndex],
            'swapUsed': swapParts[1],
            'swapFree': swapParts[2]
        }

    elif sys.platform.startswith("freebsd"):
        try:
            sysctl = sp.Popen(['sysctl', 'vm.stats.vm'], stdout=sp.PIPE, close_fds=True).communicate()[0]
        except Exception:
            self.logger.exception('getMemoryUsage')
            return False

        lines = sysctl.split('\n')

        # ...
        # vm.stats.vm.v_page_size: 4096
        # vm.stats.vm.v_page_count: 759884
        # vm.stats.vm.v_wire_count: 122726
        # vm.stats.vm.v_active_count: 109350
        # vm.stats.vm.v_cache_count: 17437
        # vm.stats.vm.v_inactive_count: 479673
        # vm.stats.vm.v_free_count: 30542
        # ...

        # We run this several times so one-time compile now
        regexp = re.compile(r'^vm\.stats\.vm\.(\w+):\s+([0-9]+)')
        meminfo = {}
        for line in lines:
            match = regexp.match(line)
            if match is not None:
                meminfo[match.group(1)] = match.group(2)

        memData = {}

        # Physical memory: page counts times the page size, converted to MB
        try:
            pageSize = int(meminfo.get('v_page_size'))

            def pages(name):
                # A missing counter counts as 0 pages
                return int(meminfo.get(name, 0))

            memData['physTotal'] = (pages('v_page_count') * pageSize) // 1048576
            memData['physFree'] = (pages('v_free_count') * pageSize) // 1048576
            memData['physCached'] = (pages('v_cache_count') * pageSize) // 1048576
            memData['physUsed'] = ((pages('v_active_count') + pages('v_wire_count')) * pageSize) // 1048576
            memData['physUsable'] = ((pages('v_free_count') + pages('v_cache_count') +
                                      pages('v_inactive_count')) * pageSize) // 1048576

            if memData['physTotal'] > 0:
                memData['physPctUsable'] = float(memData['physUsable']) / float(memData['physTotal'])
        except Exception:
            self.logger.exception('Cannot compute stats from sysctl vm.stats.vm')

        # Swap
        try:
            sysctl = sp.Popen(['swapinfo', '-m'], stdout=sp.PIPE, close_fds=True).communicate()[0]
        except Exception:
            self.logger.exception('getMemoryUsage')
            return False

        lines = sysctl.split('\n')

        # ...
        # Device          1M-blocks     Used    Avail Capacity
        # /dev/ad0s1b           570        0      570     0%
        # ...

        try:
            # Explicit check rather than an assert, which `python -O` strips;
            # a failure is logged below and the physical figures are still returned.
            if "Device" not in lines[0]:
                raise ValueError("Unexpected swapinfo output: %r" % lines[0])

            memData['swapTotal'] = 0
            memData['swapFree'] = 0
            memData['swapUsed'] = 0
            # Skip the header and the empty entry left after the last newline
            for line in lines[1:-1]:
                line = line.split()
                memData['swapTotal'] += int(line[1])
                memData['swapFree'] += int(line[3])
                memData['swapUsed'] += int(line[2])
        except Exception:
            self.logger.exception('Cannot compute stats from swapinfo')

        return memData

    elif sys.platform == 'sunos5':
        try:
            memData = {}
            kmem = sp.Popen(["kstat", "-c", "zone_memory_cap", "-p"], stdout=sp.PIPE, close_fds=True).communicate()[0]

            # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:anon_alloc_fail 0
            # ...
            # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:physcap 536870912 <--
            # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:rss 115544064 <--
            # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:snaptime 16787393.9439095
            # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:swap 91828224 <--
            # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:swapcap 1073741824 <--
            # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:zonename 53aa9b7e-48ba-4152-a52b-a6368c3d9e7c

            # turn memory_cap:360:zone_name:key value
            # into { "key": value, ...}
            kv = [l.strip().split() for l in kmem.split("\n") if len(l) > 0]
            entries = dict((k.split(":")[-1], v) for (k, v) in kv)

            # extract rss, physcap, swap, swapcap, turn into MB
            convert = lambda v: int(v) // 2 ** 20
            memData["physTotal"] = convert(entries["physcap"])
            memData["physUsed"] = convert(entries["rss"])
            memData["physFree"] = memData["physTotal"] - memData["physUsed"]
            memData["swapTotal"] = convert(entries["swapcap"])
            memData["swapUsed"] = convert(entries["swap"])
            memData["swapFree"] = memData["swapTotal"] - memData["swapUsed"]

            if memData['swapTotal'] > 0:
                memData['swapPctFree'] = float(memData['swapFree']) / float(memData['swapTotal'])
            return memData
        except Exception:
            self.logger.exception("Cannot compute mem stats from kstat -c zone_memory_cap")
            return False
    else:
        return False
def check(self, agentConfig):
    """Capture io stats.

    @rtype dict
    @return {"device": {"metric": value, "metric": value}, ...}

    False is returned on unsupported platforms and when the statistics
    cannot be collected or parsed.
    """
    def run(command):
        # Run `command` and hand back whatever it wrote on stdout
        return sp.Popen(command, stdout=sp.PIPE, close_fds=True).communicate()[0]

    def parse_extended(output, flavor):
        # Parse `iostat -x` output as printed on SunOS and FreeBSD:
        #
        #                 extended device statistics   <-- since boot
        # device  r/s   w/s   kr/s   kw/s wait actv svc_t %w %b
        # sd1    79.9 149.9 1237.6 6737.9  0.0  0.5   2.3  0 11
        #                 extended device statistics   <-- past second
        # device  r/s   w/s   kr/s   kw/s wait actv svc_t %w %b
        # sd1     0.0 139.0    0.0 1850.6  0.0  0.0   0.1  0  1
        #
        # FreeBSD prints fewer columns:
        # device  r/s   w/s   kr/s   kw/s wait svc_t %b
        # Be careful! It looks like SunOS, but some columns (wait, svc_t)
        # have a different meaning.
        parsed = {}
        rows = [row for row in output.split("\n") if len(row) > 0]
        # discard the first half of the display (stats since boot)
        rows = rows[len(rows) // 2:]
        assert "extended device statistics" in rows[0]
        headers = rows[1].split()
        assert "device" in headers
        for row in rows[2:]:
            cols = row.split()
            # cols[0] is the device, cols[1:] are the values
            device_stats = parsed[cols[0]] = {}
            for idx in range(1, len(cols)):
                device_stats[self.xlate(headers[idx], flavor)] = cols[idx]
        return parsed

    io = {}
    try:
        if Platform.is_linux():
            # Linux 2.6.32-343-ec2 (ip-10-35-95-10) 12/11/2012 _x86_64_ (2 CPU)
            #
            # Device: rrqm/s wrqm/s  r/s   w/s  rkB/s  wkB/s avgrq-sz avgqu-sz await svctm %util
            # sda1      0.00  17.61 0.26 32.63   4.23 201.04    12.48     0.16  4.81  0.53  1.73
            # sdb       0.00   2.68 0.19  3.84   5.79  26.07    15.82     0.02  4.93  0.22  0.09
            #
            # Device: rrqm/s wrqm/s  r/s   w/s  rkB/s  wkB/s avgrq-sz avgqu-sz await svctm %util
            # sda1      0.00   0.00 0.00 10.89   0.00  43.56     8.00     0.03  2.73  2.73  2.97
            # sdb       0.00   0.00 0.00  2.97   0.00  11.88     8.00     0.00  0.00  0.00  0.00
            io.update(self._parse_linux2(run(['iostat', '-d', '1', '2', '-x', '-k'])))

        elif sys.platform == "sunos5":
            io.update(parse_extended(run(["iostat", "-x", "-d", "1", "2"]), "sunos"))

        elif sys.platform.startswith("freebsd"):
            io.update(parse_extended(run(["iostat", "-x", "-d", "1", "2"]), "freebsd"))

        elif sys.platform == 'darwin':
            #          disk0           disk1          <-- number of disks
            #    KB/t tps  MB/s     KB/t tps  MB/s
            #   21.11  23  0.47    20.01   0  0.00
            #    6.67   3  0.02     0.00   0  0.00    <-- line of interest
            io = self._parse_darwin(run(['iostat', '-d', '-c', '2', '-w', '1']))

        else:
            return False

        # Drop the devices matching the configured blacklist, if there is one
        device_blacklist_re = agentConfig.get('device_blacklist_re', None)
        if not device_blacklist_re:
            return io
        return dict(
            (device, stats)
            for device, stats in io.iteritems()
            if not device_blacklist_re.match(device)
        )

    except Exception:
        self.logger.exception("Cannot extract IO statistics")
        return False
def check(self, agentConfig):
    """Return CPU statistics parsed from `mpstat` (linux2) or `sar` (darwin).

    Returns {'cpuStats': stats}. On linux2, `stats` maps 'ALL' and
    'CPU<n>' to a {column header: value} dict built from
    `mpstat -P ALL 1 1`. On darwin, `stats` maps 'CPUs' to a
    {title: value} dict built from the 'Average:' line of `sar -u 1 2`.
    Values are the strings found in the command output.

    Returns False on other platforms, when the command cannot be run, or
    when its output cannot be parsed.
    """
    self.logger.debug('getCPUStats: start')

    cpu_stats = {}

    if sys.platform == 'linux2':
        self.logger.debug('getCPUStats: linux2')

        headerRegexp = re.compile(r'.*?([%][a-zA-Z0-9]+)[\s+]?')
        itemRegexp = re.compile(r'.*?\s+(\d+)[\s+]?')
        valueRegexp = re.compile(r'\d+\.\d+')
        proc = None
        try:
            proc = subprocess.Popen(['mpstat', '-P', 'ALL', '1', '1'], stdout=subprocess.PIPE, close_fds=True)
            # communicate() waits for mpstat to exit, so there is nothing left
            # to kill afterwards
            stats = proc.communicate()[0]

            stats = stats.split('\n')
            # The third line is parsed as the column headers
            header = stats[2]
            headerNames = headerRegexp.findall(header)
            device = None

            for row in stats[3:]:
                if not row:
                    # skip the averages
                    break

                deviceMatch = itemRegexp.match(row)
                if 'all' in row:
                    device = 'ALL'
                elif deviceMatch is not None:
                    device = 'CPU%s' % deviceMatch.groups()[0]

                # A comma may be used as the decimal mark
                values = valueRegexp.findall(row.replace(',', '.'))

                # Indexing (rather than zip) keeps a row with too few values an
                # error, which is reported by the handler below
                cpu_stats[device] = dict(
                    (headerName, values[headerIndex])
                    for headerIndex, headerName in enumerate(headerNames)
                )

        except OSError:
            # we dont have it installed return nothing
            return False

        except Exception:
            import traceback
            self.logger.error("getCPUStats: exception = %s", traceback.format_exc())

            try:
                # Only signal a child that is still running: once it has been
                # waited for, its PID may already belong to another process
                if proc is not None and proc.poll() is None:
                    proc.kill()
            except Exception:
                self.logger.debug('Process already terminated')
            return False

    elif sys.platform == 'darwin':
        self.logger.debug('getCPUStats: darwin')

        try:
            proc = subprocess.Popen(['sar', '-u', '1', '2'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True)
            stats = proc.communicate()[0]

            itemRegexp = re.compile(r'\s+(\d+)[\s+]?')
            titleRegexp = re.compile(r'.*?([%][a-zA-Z0-9]+)[\s+]?')
            titles = []
            values = []
            for line in stats.split('\n'):
                # top line with the titles in
                if '%' in line:
                    titles = titleRegexp.findall(line)
                if line and line.startswith('Average:'):
                    values = itemRegexp.findall(line)

            if values and titles:
                cpu_stats['CPUs'] = dict(zip(titles, values))

        except Exception:
            import traceback
            self.logger.error('getCPUStats: exception = %s', traceback.format_exc())
            return False

    else:
        self.logger.debug('getCPUStats: unsupported platform')
        return False

    self.logger.debug('getCPUStats: completed, returning')
    return {'cpuStats': cpu_stats}
def _populate_payload_metadata(self, payload, check_statuses, start_event=True):
    """
    Periodically populate the payload with metadata related to the system, host, and/or checks.

    `payload` is modified in place; nothing is returned. Depending on what
    `self._should_send_additional_data` reports as due, this adds:
      * 'systemStats' and an 'Agent Startup' event (first run only, and only
        when `start_event` is true),
      * 'gohai', 'systemStats', 'meta' and host tags ('host_metadata'),
      * 'external_host_tags' collected from the initialized checks,
      * 'agent_checks' built from `check_statuses`, along with 'meta',
      * dd_check host tags when agentConfig['create_dd_check_tags'] is set.

    An OSError raised while running gohai is re-raised unless its errno is 2.
    """
    now = time.time()

    # Include system stats on first postback
    if start_event and self._is_first_run():
        payload['systemStats'] = self.agentConfig.get('system_stats', {})
        # Also post an event in the newsfeed
        payload['events']['System'] = [{
            'api_key': self.agentConfig['api_key'],
            'host': payload['internalHostname'],
            'timestamp': now,
            'event_type': 'Agent Startup',
            'msg_text': 'Version %s' % get_version()
        }]

    # Periodically send the host metadata.
    if self._should_send_additional_data('host_metadata'):
        # gather metadata with gohai
        try:
            if get_os() != 'windows':
                command = "gohai"
            else:
                # Escaped backslash: same value as before, without relying on
                # "\g" being an unrecognised escape sequence
                command = "gohai\\gohai.exe"
            gohai_metadata, gohai_log = subprocess.Popen(
                [command], stdout=subprocess.PIPE, stderr=subprocess.PIPE
            ).communicate()
            payload['gohai'] = gohai_metadata
            if gohai_log:
                log.warning("GOHAI LOG | {0}".format(gohai_log))
        except OSError as e:
            if e.errno == 2:  # file not found, expected when install from source
                log.info("gohai file not found")
            else:
                # Bare raise keeps the original traceback
                raise
        except Exception as e:
            log.warning("gohai command failed with error %s" % str(e))

        payload['systemStats'] = get_system_stats()
        payload['meta'] = self._get_hostname_metadata()

        self.hostname_metadata_cache = payload['meta']
        # Add static tags from the configuration file
        host_tags = []
        if self.agentConfig['tags'] is not None:
            host_tags.extend([unicode(tag.strip())
                              for tag in self.agentConfig['tags'].split(",")])

        if self.agentConfig['collect_ec2_tags']:
            host_tags.extend(EC2.get_tags(self.agentConfig))

        if host_tags:
            payload['host-tags']['system'] = host_tags

        GCE_tags = GCE.get_tags(self.agentConfig)
        if GCE_tags is not None:
            payload['host-tags'][GCE.SOURCE_TYPE_NAME] = GCE_tags

        # Log the metadata on the first run
        if self._is_first_run():
            log.info("Hostnames: %s, tags: %s" %
                     (repr(self.hostname_metadata_cache), payload['host-tags']))

    # Periodically send extra hosts metadata (vsphere)
    # Metadata of hosts that are not the host where the agent runs, not all the checks use
    # that
    external_host_tags = []
    if self._should_send_additional_data('external_host_tags'):
        for check in self.initialized_checks_d:
            try:
                getter = getattr(check, 'get_external_host_tags')
                check_tags = getter()
                external_host_tags.extend(check_tags)
            except AttributeError:
                # Checks without a get_external_host_tags attribute are skipped
                pass

    if external_host_tags:
        payload['external_host_tags'] = external_host_tags

    # Periodically send agent_checks metadata
    if self._should_send_additional_data('agent_checks'):
        # Add agent checks statuses and error/warning messages
        agent_checks = []
        for check in check_statuses:
            if check.instance_statuses is not None:
                for i, instance_status in enumerate(check.instance_statuses):
                    agent_checks.append(
                        (
                            check.name, check.source_type_name,
                            instance_status.instance_id,
                            instance_status.status,
                            # put error message or list of warning messages in the same field
                            # it will be handled by the UI
                            instance_status.error or instance_status.warnings or "",
                            check.service_metadata[i]
                        )
                    )
            else:
                agent_checks.append(
                    (
                        check.name, check.source_type_name,
                        "initialization",
                        check.status, repr(check.init_failed_error)
                    )
                )
        payload['agent_checks'] = agent_checks
        payload['meta'] = self.hostname_metadata_cache  # add hostname metadata

    # If required by the user, let's create the dd_check:xxx host tags
    if self.agentConfig['create_dd_check_tags'] and \
            self._should_send_additional_data('dd_check_tags'):
        app_tags_list = [DD_CHECK_TAG.format(c.name) for c in self.initialized_checks_d]
        app_tags_list.extend([DD_CHECK_TAG.format(cname) for cname in JMXFiles.get_jmx_appnames()])

        if 'system' not in payload['host-tags']:
            payload['host-tags']['system'] = []

        payload['host-tags']['system'].extend(app_tags_list)
def _start(self, path_to_java, java_run_opts, jmx_checks, command, reporter, tools_jar_path, custom_jar_paths, redirect_std_streams):
    """Build the JMXFetch command line, run it and wait for it to exit.

    Args:
        path_to_java: java binary to run; "java" is used when falsy.
        java_run_opts: string of extra JVM options; default heap settings are
            appended when it sets neither Xmx/XX:MaxHeapSize nor
            Xms/XX:InitialHeapSize.
        jmx_checks: names of the checks passed after '--check'.
        command: name of the JMXFetch command, passed as last argument.
        reporter: reporter string; when None, "statsd:<host>:<port>" is built
            from the 'bind_host' and 'monitorstatsd_port' settings.
        tools_jar_path: optional jar prepended to the classpath.
        custom_jar_paths: optional list of jars prepended to the classpath.
        redirect_std_streams: when truthy, the child's stdout/stderr are
            captured in temporary files and copied to sys.stdout/sys.stderr
            once the child has exited.

    Returns:
        The return code of the JMXFetch process.

    Raises:
        OSError: the process could not be launched; a status file flagging
            every check as invalid is written before re-raising.
        Exception: any other failure is logged and re-raised.
    """
    import os  # local import: only needed for the classpath separator

    if reporter is None:
        statsd_host = self.agentConfig.get('bind_host', 'localhost')
        statsd_port = self.agentConfig.get('monitorstatsd_port', "8125")
        reporter = "statsd:%s:%s" % (statsd_host, statsd_port)

    log.info("Starting jmxfetch:")
    try:
        path_to_java = path_to_java or "java"
        java_run_opts = java_run_opts or ""
        path_to_jmxfetch = self._get_path_to_jmxfetch()
        path_to_status_file = JMXFiles.get_status_file_path()

        # os.pathsep is ':' on POSIX and ';' on Windows, which is what the
        # JVM expects between classpath entries on each platform.
        classpath = path_to_jmxfetch
        if tools_jar_path is not None:
            classpath = r"%s%s%s" % (tools_jar_path, os.pathsep, classpath)
        if custom_jar_paths:
            classpath = r"%s%s%s" % (os.pathsep.join(custom_jar_paths), os.pathsep, classpath)

        subprocess_args = [
            path_to_java,  # Path to the java bin
            '-classpath',
            classpath,
            JMXFETCH_MAIN_CLASS,
            '--check_period', str(self.check_frequency * 1000),
            '--conf_directory', r"%s" % self.confd_path,
            '--log_level', JAVA_LOGGING_LEVEL.get(self.logging_config.get("log_level"), "INFO"),
            '--log_location', r"%s" % self.logging_config.get('jmxfetch_log_file'),
            '--reporter', reporter,
            '--status_location', r"%s" % path_to_status_file,
            command,  # Name of the command, kept last
        ]

        if Platform.is_windows():
            # The exit file location is inserted just before the command
            path_to_exit_file = JMXFiles.get_python_exit_file_path()
            subprocess_args.insert(len(subprocess_args) - 1, '--exit_file_location')
            subprocess_args.insert(len(subprocess_args) - 1, path_to_exit_file)

        # '--check' goes right after the main class, followed by the check names
        subprocess_args.insert(4, '--check')
        for check in jmx_checks:
            subprocess_args.insert(5, check)

        # Add the default heap settings unless the caller already set them
        if "Xmx" not in java_run_opts and "XX:MaxHeapSize" not in java_run_opts:
            java_run_opts += _JVM_DEFAULT_MAX_MEMORY_ALLOCATION
        if "Xms" not in java_run_opts and "XX:InitialHeapSize" not in java_run_opts:
            java_run_opts += _JVM_DEFAULT_INITIAL_MEMORY_ALLOCATION

        # JVM options go right after the java binary
        for opt in java_run_opts.split():
            subprocess_args.insert(1, opt)

        log.info("Running %s" % " ".join(subprocess_args))

        with nested(tempfile.TemporaryFile(), tempfile.TemporaryFile()) as (stdout_f, stderr_f):
            jmx_process = subprocess.Popen(
                subprocess_args,
                close_fds=not redirect_std_streams,
                stdout=stdout_f if redirect_std_streams else None,
                stderr=stderr_f if redirect_std_streams else None
            )
            self.jmx_process = jmx_process

            self.register_signal_handlers()

            # Block until JMXFetch exits
            jmx_process.wait()

            if redirect_std_streams:
                # Rewind the temporary files and replay what JMXFetch wrote
                stderr_f.seek(0)
                err = stderr_f.read()
                stdout_f.seek(0)
                out = stdout_f.read()
                sys.stdout.write(out)
                sys.stderr.write(err)

            return jmx_process.returncode

    except OSError:
        java_path_msg = "Couldn't launch JMXTerm. Is Java in your PATH ?"
        log.exception(java_path_msg)
        # Report every requested check as invalid in the status file
        invalid_checks = {}
        for check in jmx_checks:
            check_name = check.split('.')[0]
            check_name = check_name.encode('ascii', 'ignore')
            invalid_checks[check_name] = java_path_msg
        JMXFiles.write_status_file(invalid_checks)
        raise
    except Exception:
        log.exception("Couldn't launch JMXFetch")
        raise