def get_tags(self, state, c_id):
    """Extract useful tags from docker or platform APIs. These are collected by default."""
    tags = []
    if Platform.is_k8s():
        pod_metadata = state.get_kube_config(c_id, 'metadata')

        if pod_metadata is None:
            log.warning("Failed to fetch pod metadata for container %s."
                        " Kubernetes tags may be missing." % c_id[:12])
            return []

        # get labels
        kube_labels = pod_metadata.get('labels', {})
        for label, value in kube_labels.iteritems():
            tags.append('%s:%s' % (label, value))

        # get replication controller
        created_by = json.loads(pod_metadata.get('annotations', {}).get('kubernetes.io/created-by', '{}'))
        if created_by.get('reference', {}).get('kind') == 'ReplicationController':
            tags.append('kube_replication_controller:%s' % created_by.get('reference', {}).get('name'))

        # get kubernetes namespace
        tags.append('kube_namespace:%s' % pod_metadata.get('namespace'))

    elif Platform.is_swarm():
        c_labels = state.inspect_container(c_id).get('Labels', {})
        swarm_svc = c_labels.get(DockerUtil.SWARM_SVC_LABEL)
        if swarm_svc:
            # tag with the service name, not the whole label dict
            tags.append('swarm_service:%s' % swarm_svc)

    return tags
def testNetwork(self):
    # FIXME: cx_state to true, but needs sysstat installed
    config = """
init_config:

instances:
    - collect_connection_state: false
      excluded_interfaces:
        - lo
        - lo0
"""
    check, instances = get_check('network', config)

    check.check(instances[0])
    check.get_metrics()

    metric_names = [m[0] for m in check.aggregator.metrics]

    assert 'system.net.bytes_rcvd' in metric_names
    assert 'system.net.bytes_sent' in metric_names
    if Platform.is_linux():
        assert 'system.net.tcp.retrans_segs' in metric_names
        assert 'system.net.tcp.in_segs' in metric_names
        assert 'system.net.tcp.out_segs' in metric_names
    elif Platform.is_bsd():
        assert 'system.net.tcp.retrans_packs' in metric_names
        assert 'system.net.tcp.sent_packs' in metric_names
        assert 'system.net.tcp.rcv_packs' in metric_names
def _get_host_address(self, state, c_id, tpl_var):
    """Extract the container IP from a docker inspect object, or the kubelet API."""
    c_inspect = state.inspect_container(c_id)
    c_id = c_inspect.get('Id', '')
    c_img = self.dockerutil.image_name_extractor(c_inspect)

    networks = c_inspect.get('NetworkSettings', {}).get('Networks') or {}
    ip_dict = {}
    for net_name, net_desc in networks.iteritems():
        ip = net_desc.get('IPAddress')
        if ip:
            ip_dict[net_name] = ip
    ip_addr = self._extract_ip_from_networks(ip_dict, tpl_var)
    if ip_addr:
        return ip_addr

    if Platform.is_k8s():
        # kubernetes case
        log.debug("Couldn't find the IP address for container %s (%s), "
                  "using the kubernetes way." % (c_id[:12], c_img))
        pod_ip = state.get_kube_config(c_id, 'status').get('podIP')
        if pod_ip:
            return pod_ip

    if Platform.is_rancher():
        # try to get the rancher IP address
        log.debug("No IP address was found in container %s (%s) "
                  "trying with the Rancher label" % (c_id[:12], c_img))

        ip_addr = c_inspect.get('Config', {}).get('Labels', {}).get(RANCHER_CONTAINER_IP)
        if ip_addr:
            return ip_addr.split('/')[0]

    log.error("No IP address was found for container %s (%s)" % (c_id[:12], c_img))
    return None
def __init__(self, parent=None):
    QMenu.__init__(self, parent)
    self.options = {}

    system_tray_menu = [
        (self.START, lambda: agent_manager("start")),
        (self.STOP, lambda: agent_manager("stop")),
        (self.RESTART, lambda: agent_manager("restart")),
    ]
    # First the version
    self.addAction(self.ABOUT.format(get_version())).setEnabled(False)
    self.addSeparator()

    for name, action in system_tray_menu:
        self.add_option(name, action)

    # enable or disable mac login
    if Platform.is_mac():
        self.add_option(self.MAC_LOGIN.format(self.enable_or_disable_mac()),
                        lambda: self.enable_or_disable_login())
    elif Platform.is_windows():
        self.add_option(self.FLARE, lambda: thread.start_new_thread(windows_flare, ()))

    # And finally the exit
    self.add_option(self.EXIT, lambda: sys.exit(0))

    self.connect(self, SIGNAL("aboutToShow()"), lambda: self.update_options())
def _get_checks_to_refresh(self, state, c_id):
    """Get the list of checks applied to a container from the identifier_to_checks cache in the config store.
    Use the DATADOG_ID label or the image."""
    inspect = state.inspect_container(c_id)

    # If the container was removed we can't tell which check is concerned
    # so we have to reload everything.
    # Same thing if it's stopped and we're on Kubernetes in auto_conf mode
    # because the pod was deleted and its template could have been in the annotations.
    if not inspect or \
            (not inspect.get('State', {}).get('Running')
             and Platform.is_k8s() and not self.agentConfig.get('sd_config_backend')):
        self.reload_check_configs = True
        return

    identifier = inspect.get('Config', {}).get('Labels', {}).get(DATADOG_ID) or \
        self.dockerutil.image_name_extractor(inspect)

    platform_kwargs = {}
    if Platform.is_k8s():
        kube_metadata = state.get_kube_config(c_id, 'metadata') or {}
        platform_kwargs = {
            'kube_annotations': kube_metadata.get('annotations'),
            'kube_container_name': state.get_kube_container_name(c_id),
        }

    return self.config_store.get_checks_to_refresh(identifier, **platform_kwargs)
def __init__(self, agentConfig):
    try:
        self.config_store = get_config_store(agentConfig=agentConfig)
    except Exception as e:
        log.error('Failed to instantiate the config store client. '
                  'Auto-config only will be used. %s' % str(e))
        agentConfig['sd_config_backend'] = None
        self.config_store = get_config_store(agentConfig=agentConfig)

    self.dockerutil = DockerUtil(config_store=self.config_store)
    self.docker_client = self.dockerutil.client
    if Platform.is_k8s():
        try:
            self.kubeutil = KubeUtil()
        except Exception as ex:
            self.kubeutil = None
            log.error("Couldn't instantiate the kubernetes client, "
                      "subsequent kubernetes calls will fail as well. Error: %s" % str(ex))

    if Platform.is_nomad():
        self.nomadutil = NomadUtil()
    elif Platform.is_ecs_instance():
        self.ecsutil = ECSUtil()

    self.VAR_MAPPING = {
        'host': self._get_host_address,
        'port': self._get_port,
        'tags': self._get_additional_tags,
    }

    AbstractSDBackend.__init__(self, agentConfig)
def get_config_path(cfg_path=None, os_name=None):
    # Check if there's an override and if it exists
    if cfg_path is not None and os.path.exists(cfg_path):
        return cfg_path

    # Check if there's a config stored in the current agent directory
    try:
        path = os.path.realpath(__file__)
        path = os.path.dirname(path)
        return _config_path(path)
    except PathNotFound as e:
        pass

    # Check for an OS-specific path, continue on not-found exceptions
    bad_path = ''
    try:
        if Platform.is_windows():
            common_data = _windows_commondata_path()
            return _config_path(os.path.join(common_data, 'Datadog'))
        elif Platform.is_mac():
            return _config_path(MAC_CONFIG_PATH)
        else:
            return _config_path(UNIX_CONFIG_PATH)
    except PathNotFound as e:
        if len(e.args) > 0:
            bad_path = e.args[0]

    # If all searches fail, exit the agent with an error
    sys.stderr.write("Please supply a configuration file at %s or in the directory where "
                     "the Agent is currently deployed.\n" % bad_path)
    sys.exit(3)
def initialize_logging(logger_name):
    try:
        logging_config = get_logging_config()

        logging.basicConfig(
            format=get_log_format(logger_name),
            level=logging_config['log_level'] or logging.INFO,
        )

        log_file = logging_config.get('%s_log_file' % logger_name)
        if log_file is not None and not logging_config['disable_file_logging']:
            # make sure the log directory is writeable
            # NOTE: the entire directory needs to be writable so that rotation works
            if os.access(os.path.dirname(log_file), os.R_OK | os.W_OK):
                file_handler = logging.handlers.RotatingFileHandler(log_file, maxBytes=LOGGING_MAX_BYTES, backupCount=1)
                formatter = logging.Formatter(get_log_format(logger_name), get_log_date_format())
                file_handler.setFormatter(formatter)

                root_log = logging.getLogger()
                root_log.addHandler(file_handler)
            else:
                sys.stderr.write("Log file is unwritable: '%s'\n" % log_file)

        # set up syslog
        if logging_config['log_to_syslog']:
            try:
                from logging.handlers import SysLogHandler

                if logging_config['syslog_host'] is not None and logging_config['syslog_port'] is not None:
                    sys_log_addr = (logging_config['syslog_host'], logging_config['syslog_port'])
                else:
                    sys_log_addr = "/dev/log"
                    # Special-case BSDs
                    if Platform.is_darwin():
                        sys_log_addr = "/var/run/syslog"
                    elif Platform.is_freebsd():
                        sys_log_addr = "/var/run/log"

                handler = SysLogHandler(address=sys_log_addr, facility=SysLogHandler.LOG_DAEMON)
                handler.setFormatter(logging.Formatter(get_syslog_format(logger_name), get_log_date_format()))
                root_log = logging.getLogger()
                root_log.addHandler(handler)
            except Exception as e:
                sys.stderr.write("Error setting up syslog: '%s'\n" % str(e))
                traceback.print_exc()

        # Setting up logging in the event viewer for windows
        if get_os() == 'windows' and logging_config['log_to_event_viewer']:
            try:
                from logging.handlers import NTEventLogHandler
                nt_event_handler = NTEventLogHandler(logger_name, get_win32service_file('windows', 'win32service.pyd'), 'Application')
                nt_event_handler.setFormatter(logging.Formatter(get_syslog_format(logger_name), get_log_date_format()))
                nt_event_handler.setLevel(logging.ERROR)
                app_log = logging.getLogger(logger_name)
                app_log.addHandler(nt_event_handler)
            except Exception as e:
                sys.stderr.write("Error setting up Event viewer logging: '%s'\n" % str(e))
                traceback.print_exc()
def test_check(self):
    config = {'instances': self.MYSQL_CONFIG}
    self.run_check_twice(config)

    # Test service check
    self.assertServiceCheck('mysql.can_connect', status=AgentCheck.OK,
                            tags=self.SC_TAGS, count=1)

    # Travis MySQL not running replication - FIX in flavored test.
    self.assertServiceCheck('mysql.replication.slave_running', status=AgentCheck.CRITICAL,
                            tags=self.SC_TAGS, count=1)

    ver = map(lambda x: int(x), self.service_metadata[0]['version'].split("."))
    ver = tuple(ver)

    testable_metrics = (self.STATUS_VARS + self.VARIABLES_VARS + self.INNODB_VARS +
                        self.BINLOG_VARS + self.SYSTEM_METRICS + self.SCHEMA_VARS + self.SYNTHETIC_VARS)

    if ver >= (5, 6, 0):
        testable_metrics.extend(self.PERFORMANCE_VARS)

    # Test metrics
    for mname in testable_metrics:
        # These two are currently not guaranteed outside of a Linux
        # environment.
        if mname == 'mysql.performance.user_time' and not Platform.is_linux():
            continue
        if mname == 'mysql.performance.kernel_time' and not Platform.is_linux():
            continue
        if mname == 'mysql.performance.cpu_time' and Platform.is_windows():
            continue

        if mname == 'mysql.performance.query_run_time.avg':
            self.assertMetric(mname, tags=self.METRIC_TAGS+['schema:testdb'], count=1)
        elif mname == 'mysql.info.schema.size':
            self.assertMetric(mname, tags=self.METRIC_TAGS+['schema:testdb'], count=1)
            self.assertMetric(mname, tags=self.METRIC_TAGS+['schema:information_schema'], count=1)
            self.assertMetric(mname, tags=self.METRIC_TAGS+['schema:performance_schema'], count=1)
        else:
            self.assertMetric(mname, tags=self.METRIC_TAGS, count=1)

    # Assert service metadata
    self.assertServiceMetadata(['version'], count=1)

    # test custom query metrics
    self.assertMetric('alice.age', value=25)
    self.assertMetric('bob.age', value=20)

    # test optional metrics
    self._test_optional_metrics((self.OPTIONAL_REPLICATION_METRICS + self.OPTIONAL_INNODB_VARS +
                                 self.OPTIONAL_STATUS_VARS + self.OPTIONAL_STATUS_VARS_5_6_6), 1)

    # Raises when COVERAGE=true and coverage < 100%
    self.coverage_report()
def get_tags(self, state, c_id):
    """Extract useful tags from docker or platform APIs. These are collected by default."""
    c_inspect = state.inspect_container(c_id)
    tags = self.dockerutil.extract_container_tags(c_inspect)

    if Platform.is_k8s():
        pod_metadata = state.get_kube_config(c_id, 'metadata')

        if pod_metadata is None:
            log.warning("Failed to fetch pod metadata for container %s."
                        " Kubernetes tags may be missing." % c_id[:12])
            return []

        # get pod labels
        kube_labels = pod_metadata.get('labels', {})
        for label, value in kube_labels.iteritems():
            tags.append('%s:%s' % (label, value))

        # get kubernetes namespace
        namespace = pod_metadata.get('namespace')
        tags.append('kube_namespace:%s' % namespace)

        # add creator tags
        creator_tags = self.kubeutil.get_pod_creator_tags(pod_metadata)
        tags.extend(creator_tags)

        # add services tags
        if self.kubeutil.collect_service_tag:
            services = self.kubeutil.match_services_for_pod(pod_metadata)
            for s in services:
                if s is not None:
                    tags.append('kube_service:%s' % s)

    elif Platform.is_swarm():
        c_labels = c_inspect.get('Config', {}).get('Labels', {})
        swarm_svc = c_labels.get(SWARM_SVC_LABEL)
        if swarm_svc:
            tags.append('swarm_service:%s' % swarm_svc)

    elif Platform.is_rancher():
        service_name = c_inspect.get('Config', {}).get('Labels', {}).get(RANCHER_SVC_NAME)
        stack_name = c_inspect.get('Config', {}).get('Labels', {}).get(RANCHER_STACK_NAME)
        container_name = c_inspect.get('Config', {}).get('Labels', {}).get(RANCHER_CONTAINER_NAME)
        if service_name:
            tags.append('rancher_service:%s' % service_name)
        if stack_name:
            tags.append('rancher_stack:%s' % stack_name)
        if container_name:
            tags.append('rancher_container:%s' % container_name)

    if self.metadata_collector.has_detected():
        orch_tags = self.metadata_collector.get_container_tags(co=c_inspect)
        tags.extend(orch_tags)

    return tags
def _get_events(self):
    """Get the list of events."""
    events, changed_container_ids = self.docker_util.get_events()
    if not self._disable_net_metrics:
        self._invalidate_network_mapping_cache(events)
    if changed_container_ids and self._service_discovery:
        get_sd_backend(self.agentConfig).update_checks(changed_container_ids)
    if changed_container_ids:
        self.metadata_collector.invalidate_cache(events)
        if Platform.is_nomad():
            self.nomadutil.invalidate_cache(events)
        elif Platform.is_ecs_instance():
            self.ecsutil.invalidate_cache(events)
    return events
def get_tags(self, state, c_id):
    """Extract useful tags from docker or platform APIs. These are collected by default."""
    tags = []
    if Platform.is_k8s():
        pod_metadata = state.get_kube_config(c_id, 'metadata')

        if pod_metadata is None:
            log.warning("Failed to fetch pod metadata for container %s."
                        " Kubernetes tags may be missing." % c_id[:12])
            return []

        # get pod labels
        kube_labels = pod_metadata.get('labels', {})
        for label, value in kube_labels.iteritems():
            tags.append('%s:%s' % (label, value))

        # get kubernetes namespace
        namespace = pod_metadata.get('namespace')
        tags.append('kube_namespace:%s' % namespace)

        # get created-by
        created_by = json.loads(pod_metadata.get('annotations', {}).get('kubernetes.io/created-by', '{}'))
        creator_kind = created_by.get('reference', {}).get('kind')
        creator_name = created_by.get('reference', {}).get('name')

        # add creator tags
        if creator_name:
            if creator_kind == 'ReplicationController':
                tags.append('kube_replication_controller:%s' % creator_name)
            elif creator_kind == 'DaemonSet':
                tags.append('kube_daemon_set:%s' % creator_name)
            elif creator_kind == 'ReplicaSet':
                tags.append('kube_replica_set:%s' % creator_name)
        else:
            log.debug('creator-name for pod %s is empty, this should not happen' % pod_metadata.get('name'))

        # FIXME haissam: for service and deployment we need to store a list of these guys
        # that we query from the apiserver and to compare their selectors with the pod labels.
        # For service it's straight forward.
        # For deployment we only need to do it if the pod creator is a ReplicaSet.
        # Details: https://kubernetes.io/docs/user-guide/deployments/#selector

    elif Platform.is_swarm():
        c_labels = state.inspect_container(c_id).get('Config', {}).get('Labels', {})
        swarm_svc = c_labels.get(SWARM_SVC_LABEL)
        if swarm_svc:
            tags.append('swarm_service:%s' % swarm_svc)

    return tags
def is_my_process(pid):
    """
    Check if the given pid corresponds to a running process and,
    if psutil is available, check whether it is a process corresponding
    to the current executable
    """
    pid_existence = pid_exists(pid)

    if not psutil or not pid_existence:
        return pid_existence

    if Platform.is_windows():
        # We can't check anything else on Windows
        return True
    else:
        try:
            command = psutil.Process(pid).cmdline() or []
        except psutil.Error:
            # If we can't communicate with the process,
            # it's not an agent one
            return False
        # Check that the second arg contains (agent|dogstatsd).py
        # see http://stackoverflow.com/a/2345265
        exec_name = os.path.basename(inspect.stack()[-1][1]).lower()
        return len(command) > 1 and exec_name in command[1].lower()
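
# A minimal usage sketch (not from the source): a supervisor-style caller could
# combine a pid file with is_my_process() to decide whether a stale pid file can
# be reclaimed. The pid-file path and helper name below are hypothetical.
def pid_file_is_stale(pid_file_path="/tmp/dd-agent.pid"):
    try:
        with open(pid_file_path) as f:
            pid = int(f.read().strip())
    except (IOError, ValueError):
        # unreadable or garbage pid file: treat it as stale
        return True
    # stale if nothing runs under that pid, or it is not one of our processes
    return not is_my_process(pid)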
def get_subprocess_output(command, log, shell=False, stdin=None, output_expected=True):
    """
    Run the given subprocess command and return its output.
    Raise an Exception if an error occurs.
    """

    # Use tempfile, allowing a larger amount of memory. The subprocess.Popen
    # docs warn that the data read is buffered in memory. They suggest not to
    # use subprocess.PIPE if the data size is large or unlimited.
    with nested(tempfile.TemporaryFile(), tempfile.TemporaryFile()) as (stdout_f, stderr_f):
        proc = subprocess.Popen(command,
                                close_fds=not Platform.is_windows(),  # only set to True when on Unix, for WIN compatibility
                                shell=shell,
                                stdin=stdin,
                                stdout=stdout_f,
                                stderr=stderr_f)
        proc.wait()
        stderr_f.seek(0)
        err = stderr_f.read()
        if err:
            log.debug("Error while running {0} : {1}".format(" ".join(command), err))

        stdout_f.seek(0)
        output = stdout_f.read()

        if output_expected and output is None:
            raise SubprocessOutputEmptyError("get_subprocess_output expected output but had none.")

    return (output, err, proc.returncode)
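
# Illustrative call (assumes a module-level `log` exists and `df` is installed);
# it shows the (stdout, stderr, returncode) tuple the helper returns. The exact
# call site is hypothetical, but this mirrors how checks shell out via this helper.
df_out, df_err, df_rc = get_subprocess_output(['df', '-k'], log)
if df_rc == 0:
    for line in df_out.splitlines()[1:]:
        log.debug("df line: %s", line)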
def check(self, instance):
    btrfs_devices = {}
    excluded_devices = instance.get('excluded_devices', [])

    if Platform.is_linux():
        procfs_path = self.agentConfig.get('procfs_path', '/proc').rstrip('/')
        psutil.PROCFS_PATH = procfs_path

    for p in psutil.disk_partitions():
        if (p.fstype == 'btrfs' and p.device not in btrfs_devices
                and p.device not in excluded_devices):
            btrfs_devices[p.device] = p.mountpoint

    if len(btrfs_devices) == 0:
        raise Exception("No btrfs device found")

    for device, mountpoint in btrfs_devices.iteritems():
        for flags, total_bytes, used_bytes in self.get_usage(mountpoint):
            replication_type, usage_type = FLAGS_MAPPER[flags]
            tags = [
                'usage_type:{0}'.format(usage_type),
                'replication_type:{0}'.format(replication_type),
            ]

            free = total_bytes - used_bytes
            usage = float(used_bytes) / float(total_bytes)

            self.gauge('system.disk.btrfs.total', total_bytes, tags=tags, device_name=device)
            self.gauge('system.disk.btrfs.used', used_bytes, tags=tags, device_name=device)
            self.gauge('system.disk.btrfs.free', free, tags=tags, device_name=device)
            self.gauge('system.disk.btrfs.usage', usage, tags=tags, device_name=device)
def _get_port(self, container_inspect, tpl_var):
    """Extract a port from a container_inspect or the k8s API given a template variable."""
    c_id = container_inspect.get('Id', '')

    try:
        ports = map(lambda x: x.split('/')[0], container_inspect['NetworkSettings']['Ports'].keys())
    except (IndexError, KeyError, AttributeError):
        # try to get ports from the docker API. Works if the image has an EXPOSE instruction
        ports = map(lambda x: x.split('/')[0], container_inspect['Config'].get('ExposedPorts', {}).keys())

        # if it failed, try with the kubernetes API
        if not ports and Platform.is_k8s():
            log.debug("Didn't find the port for container %s (%s), trying the kubernetes way." %
                      (c_id[:12], container_inspect.get('Config', {}).get('Image', '')))
            co_statuses = self._get_kube_config(c_id, 'status').get('containerStatuses', [])
            c_name = None
            for co in co_statuses:
                if co.get('containerID', '').split('//')[-1] == c_id:
                    c_name = co.get('name')
                    break
            containers = self._get_kube_config(c_id, 'spec').get('containers', [])
            for co in containers:
                if co.get('name') == c_name:
                    ports = map(lambda x: str(x.get('containerPort')), co.get('ports', []))
    ports = sorted(ports, key=lambda x: int(x))
    return self._extract_port_from_list(ports, tpl_var)
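
# For reference, a sketch of the pod 'status'/'spec' shapes the kubernetes fallback
# above walks (field names as used by the code; the values are made up):
#
#   status.containerStatuses[] -> {"name": "web", "containerID": "docker://<c_id>"}
#   spec.containers[]          -> {"name": "web", "ports": [{"containerPort": 8080}]}
#
# so the container whose ID matches c_id resolves to its declared containerPort(s).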
def _header_lines(self, indent, title=None):
    # Don't indent the header
    lines = self._title_lines(title)
    if self.created_seconds_ago() > 120:
        styles = ['red', 'bold']
    else:
        styles = []
    # We color it in red if the status is too old
    fields = [
        (
            style("Status date", *styles),
            style("%s (%ss ago)" % (
                self.created_at.strftime('%Y-%m-%d %H:%M:%S'),
                self.created_seconds_ago()), *styles)
        )
    ]

    fields += [
        ("Pid", self.created_by_pid),
        ("Platform", platform.platform()),
        ("Python Version", "%s, %s" % (
            platform.python_version(), Platform.python_architecture())),
        ("Logs", logger_info()),
    ]

    for key, value in fields:
        l = indent + "%s: %s" % (key, value)
        lines.append(l)

    return lines + [""]
def _get_check_configs(self, state, c_id, identifier):
    """Retrieve configuration templates and fill them with data pulled from docker and tags."""
    platform_kwargs = {}
    if Platform.is_k8s():
        kube_metadata = state.get_kube_config(c_id, 'metadata') or {}
        platform_kwargs = {
            'kube_container_name': state.get_kube_container_name(c_id),
            'kube_annotations': kube_metadata.get('annotations'),
        }
    config_templates = self._get_config_templates(identifier, **platform_kwargs)
    if not config_templates:
        return None

    check_configs = []
    tags = self.get_tags(state, c_id)
    for config_tpl in config_templates:
        source, config_tpl = config_tpl
        check_name, init_config_tpl, instance_tpl, variables = config_tpl

        # insert tags in instance_tpl and process values for template variables
        instance_tpl, var_values = self._fill_tpl(state, c_id, instance_tpl, variables, tags)

        tpl = self._render_template(init_config_tpl or {}, instance_tpl or {}, var_values)
        if tpl and len(tpl) == 2:
            init_config, instance = tpl
            check_configs.append((source, (check_name, init_config, instance)))

    return check_configs
def _get_host_address(self, state, c_id, tpl_var):
    """Extract the container IP from a docker inspect object, or the kubelet API."""
    c_inspect = state.inspect_container(c_id)
    c_id, c_img = c_inspect.get('Id', ''), c_inspect.get('Config', {}).get('Image', '')

    networks = c_inspect.get('NetworkSettings', {}).get('Networks') or {}
    ip_dict = {}
    for net_name, net_desc in networks.iteritems():
        ip = net_desc.get('IPAddress')
        if ip:
            ip_dict[net_name] = ip
    ip_addr = self._extract_ip_from_networks(ip_dict, tpl_var)
    if ip_addr:
        return ip_addr

    # try to get the bridge (default) IP address
    log.debug("No IP address was found in container %s (%s) "
              "networks, trying with the IPAddress field" % (c_id[:12], c_img))
    ip_addr = c_inspect.get('NetworkSettings', {}).get('IPAddress')
    if ip_addr:
        return ip_addr

    if Platform.is_k8s():
        # kubernetes case
        log.debug("Couldn't find the IP address for container %s (%s), "
                  "using the kubernetes way." % (c_id[:12], c_img))
        pod_ip = state.get_kube_config(c_id, 'status').get('podIP')
        if pod_ip:
            return pod_ip

    log.error("No IP address was found for container %s (%s)" % (c_id[:12], c_img))
    return None
def check(self, agentConfig):
    if not Platform.is_linux():
        return False

    try:
        proc_location = agentConfig.get('procfs_path', '/proc').rstrip('/')
        proc_fh = "{}/sys/fs/file-nr".format(proc_location)
        with open(proc_fh, 'r') as file_handle:
            handle_contents = file_handle.readline()
    except Exception:
        self.logger.exception("Cannot extract system file handles stats")
        return False

    handle_metrics = handle_contents.split()

    # https://www.kernel.org/doc/Documentation/sysctl/fs.txt
    allocated_fh = float(handle_metrics[0])
    allocated_unused_fh = float(handle_metrics[1])
    max_fh = float(handle_metrics[2])

    num_used = allocated_fh - allocated_unused_fh
    fh_in_use = num_used / max_fh

    return {
        'system.fs.file_handles.allocated': allocated_fh,
        'system.fs.file_handles.allocated_unused': allocated_unused_fh,
        'system.fs.file_handles.in_use': fh_in_use,
        'system.fs.file_handles.used': num_used,
        'system.fs.file_handles.max': max_fh,
    }
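
# Worked example of the arithmetic above, assuming /proc/sys/fs/file-nr reads
# "3072 96 379804" (allocated, allocated-but-unused, max); numbers are illustrative.
#   num_used  = 3072 - 96        = 2976
#   fh_in_use = 2976 / 379804.0  ~= 0.0078  (under 1% of the kernel limit)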
def get_configs(self):
    """Get the config for all docker containers running on the host."""
    configs = {}
    state = self._make_fetch_state()
    containers = [(
        self.dockerutil.image_name_extractor(container),
        container.get('Id'), container.get('Labels')
    ) for container in self.docker_client.containers()]

    if Platform.is_k8s():
        self.kubeutil.check_services_cache_freshness()

    for image, cid, labels in containers:
        try:
            # value of the DATADOG_ID tag or the image name if the label is missing
            identifier = self.get_config_id(image, labels)
            check_configs = self._get_check_configs(state, cid, identifier) or []
            for conf in check_configs:
                source, (check_name, init_config, instance) = conf

                # build instances list if needed
                if configs.get(check_name) is None:
                    configs[check_name] = (source, (init_config, [instance]))
                else:
                    conflict_init_msg = 'Different versions of `init_config` found for check {}. ' \
                        'Keeping the first one found.'
                    if configs[check_name][1][0] != init_config:
                        log.warning(conflict_init_msg.format(check_name))
                    configs[check_name][1][1].append(instance)
        except Exception:
            log.exception('Building config for container %s based on image %s using service '
                          'discovery failed, leaving it alone.' % (cid[:12], image))

    return configs
def _report_performance_metrics(self, containers_by_id):
    containers_without_proc_root = []
    for container in containers_by_id.itervalues():
        if self._is_container_excluded(container) or not self._is_container_running(container):
            continue

        tags = self._get_tags(container, PERFORMANCE)

        try:
            self._report_cgroup_metrics(container, tags)
            if "_proc_root" not in container:
                containers_without_proc_root.append(DockerUtil.container_name_extractor(container)[0])
                continue
            self._report_net_metrics(container, tags)
        except BogusPIDException as e:
            self.log.warning('Unable to report cgroup metrics: %s', e)

    if containers_without_proc_root:
        message = "Couldn't find pid directory for containers: {0}. They'll be missing network metrics".format(
            ", ".join(containers_without_proc_root))
        if not Platform.is_k8s():
            self.warning(message)
        else:
            # On kubernetes, this is kind of expected. Network metrics will be collected by the kubernetes integration anyway
            self.log.debug(message)
def get_checks():
    checks = {}
    conf_d_directory = get_confd_path()

    for filename in sorted(os.listdir(conf_d_directory)):
        module_name, ext = osp.splitext(filename)
        if Platform.is_windows():
            excluded_checks = EXCLUDED_WINDOWS_CHECKS
        else:
            excluded_checks = EXCLUDED_MAC_CHECKS
        if filename.split(".")[0] in excluded_checks:
            continue
        if ext not in (".yaml", ".example", ".disabled"):
            continue
        agent_check = AgentCheck(filename, ext, conf_d_directory)
        if (agent_check.enabled or agent_check.module_name not in checks or
                (not agent_check.is_example and not checks[agent_check.module_name].enabled)):
            checks[agent_check.module_name] = agent_check

    checks_list = checks.values()
    checks_list.sort(key=lambda c: c.module_name)

    return checks_list
def upload(self, email=None):
    self._check_size()

    if self._cmdline:
        self._ask_for_confirmation()

    if not email:
        email = self._ask_for_email()

    log.info("Uploading {0} to Datadog Support".format(self._tar_path))
    url = self._url
    if self._case_id:
        url = '{0}/{1}'.format(self._url, str(self._case_id))
    url = "{0}?api_key={1}".format(url, self._api_key)
    requests_options = {
        'data': {
            'case_id': self._case_id,
            'hostname': self._hostname,
            'email': email
        },
        'files': {'flare_file': open(self._tar_path, 'rb')},
        'timeout': self.TIMEOUT
    }
    if Platform.is_windows():
        requests_options['verify'] = os.path.realpath(os.path.join(
            os.path.dirname(os.path.realpath(__file__)),
            os.pardir, os.pardir,
            'datadog-cert.pem'
        ))

    self._resp = requests.post(url, **requests_options)
    self._analyse_result()
    return self._case_id
def _add_conf_tar(self):
    conf_path = get_config_path()
    if self._can_read(conf_path):
        self._add_file_tar(
            self._strip_comment(conf_path),
            os.path.join('etc', 'datadog.conf'),
            original_file_path=conf_path
        )

    if not Platform.is_windows():
        supervisor_path = os.path.join(
            os.path.dirname(get_config_path()),
            'supervisor.conf'
        )
        if self._can_read(supervisor_path):
            self._add_file_tar(
                self._strip_comment(supervisor_path),
                os.path.join('etc', 'supervisor.conf'),
                original_file_path=supervisor_path
            )

    for file_path in glob.glob(os.path.join(get_confd_path(), '*.yaml')) +\
            glob.glob(os.path.join(get_confd_path(), '*.yaml.default')):
        if self._can_read(file_path, output=False):
            self._add_clean_confd(file_path)
def _add_conf_tar(self):
    conf_path = get_config_path()
    if self._can_read(conf_path, output=False):
        self._add_clean_conf(
            conf_path,
            'etc',
            self.MAIN_CREDENTIALS
        )

    if not Platform.is_windows():
        supervisor_path = os.path.join(
            os.path.dirname(get_config_path()),
            'supervisor.conf'
        )
        if self._can_read(supervisor_path, output=False):
            self._add_clean_conf(
                supervisor_path,
                'etc'
            )

    for file_path in glob.glob(os.path.join(get_confd_path(), '*.yaml')) +\
            glob.glob(os.path.join(get_confd_path(), '*.yaml.default')):
        if self._can_read(file_path, output=False):
            self._add_clean_conf(
                file_path,
                os.path.join('etc', 'confd'),
                self.CHECK_CREDENTIALS
            )
def _host_matches_node(self, primary_addrs):
    """ For < 0.19, check if the current host matches the IP given
    in the cluster nodes check `/_cluster/nodes`. Uses `ip addr` on Linux
    and `ifconfig` on Mac
    """
    if Platform.is_darwin():
        ifaces = subprocess.Popen(['ifconfig'], stdout=subprocess.PIPE)
    else:
        ifaces = subprocess.Popen(['ip', 'addr'], stdout=subprocess.PIPE)
    grepper = subprocess.Popen(['grep', 'inet'], stdin=ifaces.stdout,
                               stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    ifaces.stdout.close()
    out, err = grepper.communicate()

    # Capture the list of interface IPs
    ips = []
    for iface in out.split("\n"):
        iface = iface.strip()
        if iface:
            ips.append(iface.split(' ')[1].split('/')[0])

    # Check the interface addresses against the primary address
    return primary_addrs in ips
def __init__(self, name, init_config, agentConfig, instances=None):
    AgentCheck.__init__(self, name, init_config, agentConfig, instances)

    # ad stands for access denied
    # We cache the PIDs getting this error and don't iterate on them
    # more often than `access_denied_cache_duration`
    # This cache is for all PIDs so it's global, but it should
    # be refreshed by instance
    self.last_ad_cache_ts = {}
    self.ad_cache = set()
    self.access_denied_cache_duration = int(
        init_config.get("access_denied_cache_duration", DEFAULT_AD_CACHE_DURATION)
    )

    # By default cache the PID list for a while
    # Sometimes it's not wanted b/c it can mess with no-data monitoring
    # This cache is indexed per instance
    self.last_pid_cache_ts = {}
    self.pid_cache = {}
    self.pid_cache_duration = int(init_config.get("pid_cache_duration", DEFAULT_PID_CACHE_DURATION))

    if Platform.is_linux():
        procfs_path = init_config.get("procfs_path")
        if procfs_path:
            psutil.PROCFS_PATH = procfs_path

    # Process cache, indexed by instance
    self.process_cache = defaultdict(dict)
def init(self):
    try:
        instance = self.instances[0]

        self.docker_util = DockerUtil()
        self.docker_client = self.docker_util.client
        self.docker_gateway = DockerUtil.get_gateway()

        if Platform.is_k8s():
            self.kubeutil = KubeUtil()

        # We configure the check with the right cgroup settings for this host
        # Just needs to be done once
        self._mountpoints = self.docker_util.get_mountpoints(CGROUP_METRICS)
        self.cgroup_listing_retries = 0
        self._latest_size_query = 0
        self._filtered_containers = set()
        self._disable_net_metrics = False

        # Set tagging options
        self.custom_tags = instance.get("tags", [])
        self.collect_labels_as_tags = instance.get("collect_labels_as_tags", [])
        self.kube_labels = {}

        self.use_histogram = _is_affirmative(instance.get('use_histogram', False))
        performance_tags = instance.get("performance_tags", DEFAULT_PERFORMANCE_TAGS)

        self.tag_names = {
            CONTAINER: instance.get("container_tags", DEFAULT_CONTAINER_TAGS),
            PERFORMANCE: performance_tags,
            IMAGE: instance.get('image_tags', DEFAULT_IMAGE_TAGS)
        }

        # Set filtering settings
        if not instance.get("exclude"):
            self._filtering_enabled = False
            if instance.get("include"):
                self.log.warning("You must specify an exclude section to enable filtering")
        else:
            self._filtering_enabled = True
            include = instance.get("include", [])
            exclude = instance.get("exclude", [])
            self._exclude_patterns, self._include_patterns, _filtered_tag_names = get_filters(include, exclude)
            self.tag_names[FILTERED] = _filtered_tag_names

        # Other options
        self.collect_image_stats = _is_affirmative(instance.get('collect_images_stats', False))
        self.collect_container_size = _is_affirmative(instance.get('collect_container_size', False))
        self.collect_events = _is_affirmative(instance.get('collect_events', True))
        self.collect_image_size = _is_affirmative(instance.get('collect_image_size', False))
        self.collect_disk_stats = _is_affirmative(instance.get('collect_disk_stats', False))
        self.collect_ecs_tags = _is_affirmative(instance.get('ecs_tags', True)) and Platform.is_ecs_instance()
        self.ecs_tags = {}

    except Exception as e:
        self.log.critical(e)
        self.warning("Initialization failed. Will retry at next iteration")
    else:
        self.init_success = True
def get_hostname(config=None):
    """
    Get the canonical host name this agent should identify as. This is
    the authoritative source of the host name for the agent.

    Tries, in order:

      * agent config (datadog.conf, "hostname:")
      * 'hostname -f' (on unix)
      * socket.gethostname()
    """
    hostname = None

    # first, try the config
    if config is None:
        from config import get_config
        config = get_config(parse_args=True)
    config_hostname = config.get('hostname')
    if config_hostname and is_valid_hostname(config_hostname):
        return config_hostname

    # Try to get GCE instance name
    if hostname is None:
        gce_hostname = GCE.get_hostname(config)
        if gce_hostname is not None:
            if is_valid_hostname(gce_hostname):
                return gce_hostname

    # then move on to os-specific detection
    if hostname is None:
        def _get_hostname_unix():
            try:
                # try fqdn
                p = subprocess.Popen(['/bin/hostname', '-f'], stdout=subprocess.PIPE)
                out, err = p.communicate()
                if p.returncode == 0:
                    return out.strip()
            except Exception:
                return None

        os_name = get_os()
        if os_name in ['mac', 'freebsd', 'linux', 'solaris']:
            unix_hostname = _get_hostname_unix()
            if unix_hostname and is_valid_hostname(unix_hostname):
                hostname = unix_hostname

    # if we have an ec2 default hostname, see if there's an instance-id available
    if (Platform.is_ecs_instance()) or (hostname is not None and True in
                                        [hostname.lower().startswith(p) for p in [u'ip-', u'domu']]):
        instanceid = EC2.get_instance_id(config)
        if instanceid:
            hostname = instanceid

    # fall back on socket.gethostname(), socket.getfqdn() is too unreliable
    if hostname is None:
        try:
            socket_hostname = socket.gethostname()
        except socket.error:
            socket_hostname = None
        if socket_hostname and is_valid_hostname(socket_hostname):
            hostname = socket_hostname
def get_jmx_status_path():
    if Platform.is_win32():
        path = os.path.join(_windows_commondata_path(), 'Datadog')
    else:
        path = tempfile.gettempdir()
    return path
def _start(self, path_to_java, java_run_opts, jmx_checks, command, reporter, tools_jar_path, custom_jar_paths, redirect_std_streams): if reporter is None: statsd_host = self.agent_config.get('bind_host', 'localhost') if statsd_host == "0.0.0.0": # If statsd is bound to all interfaces, just use localhost for clients statsd_host = "localhost" statsd_port = self.agent_config.get('dogstatsd_port', "8125") reporter = "statsd:%s:%s" % (statsd_host, statsd_port) log.info("Starting jmxfetch:") try: path_to_java = path_to_java or "java" java_run_opts = java_run_opts or "" path_to_jmxfetch = self._get_path_to_jmxfetch() path_to_status_file = JMXFiles.get_status_file_path() classpath = path_to_jmxfetch if tools_jar_path is not None: classpath = r"%s%s%s" % (tools_jar_path, os.pathsep, classpath) if custom_jar_paths: classpath = r"%s%s%s" % (os.pathsep.join(custom_jar_paths), os.pathsep, classpath) if self.config_jar_path: classpath = r"%s%s%s" % (self.config_jar_path, os.pathsep, classpath) subprocess_args = [ path_to_java, # Path to the java bin '-classpath', classpath, JMXFETCH_MAIN_CLASS, '--check_period', str(self.check_frequency * 1000), # Period of the main loop of jmxfetch in ms '--conf_directory', r"%s" % self. confd_path, # Path of the conf.d directory that will be read by jmxfetch, '--log_level', JAVA_LOGGING_LEVEL.get( self.logging_config.get("log_level"), "INFO" ), # Log Level: Mapping from Python log level to log4j log levels '--log_location', r"%s" % self.logging_config.get( 'jmxfetch_log_file'), # Path of the log file '--reporter', reporter, # Reporter to use '--status_location', r"%s" % path_to_status_file, # Path to the status file to write command, # Name of the command ] if Platform.is_windows(): # Signal handlers are not supported on Windows: # use a file to trigger JMXFetch exit instead path_to_exit_file = JMXFiles.get_python_exit_file_path() subprocess_args.insert( len(subprocess_args) - 1, '--exit_file_location') subprocess_args.insert( len(subprocess_args) - 1, path_to_exit_file) if self.service_discovery: pipe_path = get_jmx_pipe_path() subprocess_args.insert(4, '--tmp_directory') subprocess_args.insert(5, pipe_path) subprocess_args.insert(4, '--sd_pipe') subprocess_args.insert(5, SD_PIPE_NAME) subprocess_args.insert(4, '--sd_enabled') if self.pool_size: subprocess_args.insert(4, '--thread_pool_size') subprocess_args.insert(5, self.pool_size) if self.reconnection_pool_size: subprocess_args.insert(4, '--reconnection_thread_pool_size') subprocess_args.insert(5, self.reconnection_pool_size) if self.collection_to: subprocess_args.insert(4, '--collection_timeout') subprocess_args.insert(5, self.collection_to) if self.reconnection_to: subprocess_args.insert(4, '--reconnection_timeout') subprocess_args.insert(5, self.reconnection_to) if jmx_checks: subprocess_args.insert(4, '--check') for check in jmx_checks: subprocess_args.insert(5, check) # Specify a maximum memory allocation pool for the JVM if "Xmx" not in java_run_opts and "XX:MaxHeapSize" not in java_run_opts: java_run_opts += _JVM_DEFAULT_SD_MAX_MEMORY_ALLOCATION if self.service_discovery else _JVM_DEFAULT_MAX_MEMORY_ALLOCATION # Specify the initial memory allocation pool for the JVM if "Xms" not in java_run_opts and "XX:InitialHeapSize" not in java_run_opts: java_run_opts += _JVM_DEFAULT_INITIAL_MEMORY_ALLOCATION for opt in java_run_opts.split(): subprocess_args.insert(1, opt) log.info("Running %s" % " ".join(subprocess_args)) return self.execute(subprocess_args, redirect_std_streams) except OSError: java_path_msg = 
"Couldn't launch JMXTerm. Is Java in your PATH ?" log.exception(java_path_msg) invalid_checks = {} for check in jmx_checks: check_name = check.split('.')[0] check_name = check_name.encode('ascii', 'ignore') invalid_checks[check_name] = java_path_msg JMXFiles.write_status_file(invalid_checks) raise except Exception: log.info("unable to launch JMXFetch") raise
def run(self, config=None): """Main loop of the collector""" # Gracefully exit on sigterm. signal.signal(signal.SIGTERM, self._handle_sigterm) if not Platform.is_windows(): # A SIGUSR1 signals an exit with an autorestart signal.signal(signal.SIGUSR1, self._handle_sigusr1) # Handle Keyboard Interrupt signal.signal(signal.SIGINT, self._handle_sigterm) # A SIGHUP signals a configuration reload signal.signal(signal.SIGHUP, self._handle_sighup) else: sdk_integrations = get_sdk_integration_paths() for name, path in sdk_integrations.iteritems(): lib_path = os.path.join(path, 'lib') if os.path.exists(lib_path): sys.path.append(lib_path) # Save the agent start-up stats. CollectorStatus().persist() # Intialize the collector. if not config: try: config = get_config(parse_args=True) except: log.warning("Failed to load configuration") sys.exit(2) self._agentConfig = self._set_agent_config_hostname(config) hostname = get_hostname(self._agentConfig) systemStats = get_system_stats(proc_path=self._agentConfig.get( 'procfs_path', '/proc').rstrip('/')) emitters = self._get_emitters() # Initialize service discovery if self._agentConfig.get('service_discovery'): self.sd_backend = get_sd_backend(self._agentConfig) if self.sd_backend and _is_affirmative( self._agentConfig.get('sd_jmx_enable', False)): pipe_path = get_jmx_pipe_path() if Platform.is_windows(): pipe_name = pipe_path.format(pipename=SD_PIPE_NAME) else: pipe_name = os.path.join(pipe_path, SD_PIPE_NAME) if os.access(pipe_path, os.W_OK): if not os.path.exists(pipe_name): os.mkfifo(pipe_name) self.sd_pipe = os.open( pipe_name, os.O_RDWR) # RW to avoid blocking (will only W) # Initialize Supervisor proxy self.supervisor_proxy = self._get_supervisor_socket( self._agentConfig) else: log.debug( 'Unable to create pipe in temporary directory. JMX service discovery disabled.' ) # Load the checks.d checks self._checksd = load_check_directory(self._agentConfig, hostname) # Load JMX configs if available if self._jmx_service_discovery_enabled: self.sd_pipe_jmx_configs(hostname) # Initialize the Collector self.collector = Collector(self._agentConfig, emitters, systemStats, hostname) # In developer mode, the number of runs to be included in a single collector profile try: self.collector_profile_interval = int( self._agentConfig.get('collector_profile_interval', DEFAULT_COLLECTOR_PROFILE_INTERVAL)) except ValueError: log.warn('collector_profile_interval is invalid. ' 'Using default value instead (%s).' % DEFAULT_COLLECTOR_PROFILE_INTERVAL) self.collector_profile_interval = DEFAULT_COLLECTOR_PROFILE_INTERVAL # Configure the watchdog. self.check_frequency = int(self._agentConfig['check_freq']) watchdog = self._get_watchdog(self.check_frequency) # Initialize the auto-restarter self.restart_interval = int( self._agentConfig.get('restart_interval', RESTART_INTERVAL)) self.agent_start = time.time() self.allow_profiling = _is_affirmative( self._agentConfig.get('allow_profiling', True)) profiled = False collector_profiled_runs = 0 # Run the main loop. 
while self.run_forever: # Setup profiling if necessary if self.allow_profiling and self.in_developer_mode and not profiled: try: profiler = AgentProfiler() profiler.enable_profiling() profiled = True except Exception as e: log.warn("Cannot enable profiler: %s" % str(e)) if self.reload_configs_flag: if isinstance(self.reload_configs_flag, set): self.reload_configs( checks_to_reload=self.reload_configs_flag) else: self.reload_configs() # JMXFetch restarts should prompt re-piping *all* JMX configs if self._jmx_service_discovery_enabled and \ (not self.reload_configs_flag or isinstance(self.reload_configs_flag, set)): try: jmx_launch = JMXFetch._get_jmx_launchtime() if self.last_jmx_piped and self.last_jmx_piped < jmx_launch: self.sd_pipe_jmx_configs(hostname) except Exception as e: log.debug("could not stat JMX lunch file: %s", e) # Do the work. Pass `configs_reloaded` to let the collector know if it needs to # look for the AgentMetrics check and pop it out. self.collector.run( checksd=self._checksd, start_event=self.start_event, configs_reloaded=True if self.reload_configs_flag else False) self.reload_configs_flag = False # Look for change in the config template store. # The self.sd_backend.reload_check_configs flag is set # to True if a config reload is needed. if self._agentConfig.get('service_discovery') and self.sd_backend and \ not self.sd_backend.reload_check_configs: try: self.sd_backend.reload_check_configs = get_config_store( self._agentConfig).crawl_config_template() except Exception as e: log.warn( 'Something went wrong while looking for config template changes: %s' % str(e)) # Check if we should run service discovery # The `reload_check_configs` flag can be set through the docker_daemon check or # using ConfigStore.crawl_config_template if self._agentConfig.get('service_discovery') and self.sd_backend and \ self.sd_backend.reload_check_configs: self.reload_configs_flag = self.sd_backend.reload_check_configs self.sd_backend.reload_check_configs = False if profiled: if collector_profiled_runs >= self.collector_profile_interval: try: profiler.disable_profiling() profiled = False collector_profiled_runs = 0 except Exception as e: log.warn("Cannot disable profiler: %s" % str(e)) # Check if we should restart. if self.autorestart and self._should_restart(): self._do_restart() # Only plan for next loop if we will continue, otherwise exit quickly. if self.run_forever: if watchdog: watchdog.reset() if profiled: collector_profiled_runs += 1 log.debug("Sleeping for {0} seconds".format( self.check_frequency)) time.sleep(self.check_frequency) # Now clean-up. try: CollectorStatus.remove_latest_status() except Exception: pass # Explicitly kill the process, because it might be running as a daemon. log.info("Exiting. Bye bye.") sys.exit(0)
import logging
import traceback
from types import ListType, TupleType

# 3p
try:
    import psutil
except ImportError:
    psutil = None
import yaml

# project
from checks import check_status
from util import get_hostname, get_next_id, LaconicFilter, yLoader
from utils.platform import Platform
from utils.profile import pretty_statistics

if Platform.is_windows():
    from utils.debug import run_check  # noqa - windows debug purpose

log = logging.getLogger(__name__)

# Default methods run when collecting info about the agent in developer mode
DEFAULT_PSUTIL_METHODS = ['get_memory_info', 'get_io_counters']

AGENT_METRICS_CHECK_NAME = 'agent_metrics'


# Konstants
class CheckException(Exception):
    pass
def test_is_k8s(self):
    os.unsetenv('KUBERNETES_PORT')
    self.assertFalse(Platform.is_k8s())
    os.environ['KUBERNETES_PORT'] = '999'
    self.assertTrue(Platform.is_k8s())
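
# Based on this test, Platform.is_k8s() presumably keys off the KUBERNETES_PORT
# environment variable that Kubernetes injects into containers. A rough sketch of
# such a helper (an assumption about the implementation, not copied from it):
def _is_k8s_sketch():
    return 'KUBERNETES_PORT' in os.environ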
def check(self, agentConfig): """Return an aggregate of CPU stats across all CPUs When figures are not available, False is sent back. """ def format_results(us, sy, wa, idle, st, guest=None): data = { 'cpuUser': us, 'cpuSystem': sy, 'cpuWait': wa, 'cpuIdle': idle, 'cpuStolen': st, 'cpuGuest': guest } return dict((k, v) for k, v in data.iteritems() if v is not None) def get_value(legend, data, name, filter_value=None): "Using the legend and a metric name, get the value or None from the data line" if name in legend: value = to_float(data[legend.index(name)]) if filter_value is not None: if value > filter_value: return None return value else: # FIXME return a float or False, would trigger type error if not python self.logger.debug("Cannot extract cpu value %s from %s (%s)" % (name, data, legend)) return 0.0 try: if Platform.is_linux(): output, _, _ = get_subprocess_output(['mpstat', '1', '3'], self.logger) mpstat = output.splitlines() # topdog@ip:~$ mpstat 1 3 # Linux 2.6.32-341-ec2 (ip) 01/19/2012 _x86_64_ (2 CPU) # # 04:22:41 PM CPU %usr %nice %sys %iowait %irq %soft %steal %guest %idle # 04:22:42 PM all 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 100.00 # 04:22:43 PM all 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 100.00 # 04:22:44 PM all 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 100.00 # Average: all 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 100.00 # # OR # # Thanks to Mart Visser to spotting this one. # blah:/etc/dd-agent# mpstat # Linux 2.6.26-2-xen-amd64 (atira) 02/17/2012 _x86_64_ # # 05:27:03 PM CPU %user %nice %sys %iowait %irq %soft %steal %idle intr/s # 05:27:03 PM all 3.59 0.00 0.68 0.69 0.00 0.00 0.01 95.03 43.65 # legend = [l for l in mpstat if "%usr" in l or "%user" in l] avg = [l for l in mpstat if "Average" in l] if len(legend) == 1 and len(avg) == 1: headers = [ h for h in legend[0].split() if h not in ("AM", "PM") ] data = avg[0].split() # Userland # Debian lenny says %user so we look for both # One of them will be 0 cpu_metrics = { "%usr": None, "%user": None, "%nice": None, "%iowait": None, "%idle": None, "%sys": None, "%irq": None, "%soft": None, "%steal": None, "%guest": None } for cpu_m in cpu_metrics: cpu_metrics[cpu_m] = get_value(headers, data, cpu_m, filter_value=110) if any([v is None for v in cpu_metrics.values()]): self.logger.warning("Invalid mpstat data: %s" % data) cpu_user = cpu_metrics["%usr"] + cpu_metrics[ "%user"] + cpu_metrics["%nice"] cpu_system = cpu_metrics["%sys"] + cpu_metrics[ "%irq"] + cpu_metrics["%soft"] cpu_wait = cpu_metrics["%iowait"] cpu_idle = cpu_metrics["%idle"] cpu_stolen = cpu_metrics["%steal"] cpu_guest = cpu_metrics["%guest"] return format_results(cpu_user, cpu_system, cpu_wait, cpu_idle, cpu_stolen, cpu_guest) else: return False elif sys.platform == 'darwin': # generate 3 seconds of data # [' disk0 disk1 cpu load average', ' KB/t tps MB/s KB/t tps MB/s us sy id 1m 5m 15m', ' 21.23 13 0.27 17.85 7 0.13 14 7 79 1.04 1.27 1.31', ' 4.00 3 0.01 5.00 8 0.04 12 10 78 1.04 1.27 1.31', ''] iostats, _, _ = get_subprocess_output( ['iostat', '-C', '-w', '3', '-c', '2'], self.logger) lines = [l for l in iostats.splitlines() if len(l) > 0] legend = [l for l in lines if "us" in l] if len(legend) == 1: headers = legend[0].split() data = lines[-1].split() cpu_user = get_value(headers, data, "us") cpu_sys = get_value(headers, data, "sy") cpu_wait = 0 cpu_idle = get_value(headers, data, "id") cpu_st = 0 return format_results(cpu_user, cpu_sys, cpu_wait, cpu_idle, cpu_st) else: self.logger.warn( "Expected to get at least 4 lines of data from iostat instead of 
just " + str(iostats[:max(80, len(iostats))])) return False elif sys.platform.startswith("freebsd"): # generate 3 seconds of data # tty ada0 cd0 pass0 cpu # tin tout KB/t tps MB/s KB/t tps MB/s KB/t tps MB/s us ni sy in id # 0 69 26.71 0 0.01 0.00 0 0.00 0.00 0 0.00 2 0 0 1 97 # 0 78 0.00 0 0.00 0.00 0 0.00 0.00 0 0.00 0 0 0 0 100 iostats, _, _ = get_subprocess_output( ['iostat', '-w', '3', '-c', '2'], self.logger) lines = [l for l in iostats.splitlines() if len(l) > 0] legend = [l for l in lines if "us" in l] if len(legend) == 1: headers = legend[0].split() data = lines[-1].split() cpu_user = get_value(headers, data, "us") cpu_nice = get_value(headers, data, "ni") cpu_sys = get_value(headers, data, "sy") cpu_intr = get_value(headers, data, "in") cpu_wait = 0 cpu_idle = get_value(headers, data, "id") cpu_stol = 0 return format_results(cpu_user + cpu_nice, cpu_sys + cpu_intr, cpu_wait, cpu_idle, cpu_stol) else: self.logger.warn( "Expected to get at least 4 lines of data from iostat instead of just " + str(iostats[:max(80, len(iostats))])) return False elif sys.platform == 'sunos5': # mpstat -aq 1 2 # SET minf mjf xcal intr ithr csw icsw migr smtx srw syscl usr sys wt idl sze # 0 5239 0 12857 22969 5523 14628 73 546 4055 1 146856 5 6 0 89 24 <-- since boot # 1 ... # SET minf mjf xcal intr ithr csw icsw migr smtx srw syscl usr sys wt idl sze # 0 20374 0 45634 57792 5786 26767 80 876 20036 2 724475 13 13 0 75 24 <-- past 1s # 1 ... # http://docs.oracle.com/cd/E23824_01/html/821-1462/mpstat-1m.html # # Will aggregate over all processor sets output, _, _ = get_subprocess_output( ['mpstat', '-aq', '1', '2'], self.logger) mpstat = output.splitlines() lines = [l for l in mpstat if len(l) > 0] # discard the first len(lines)/2 lines lines = lines[len(lines) / 2:] legend = [l for l in lines if "SET" in l] assert len(legend) == 1 if len(legend) == 1: headers = legend[0].split() # collect stats for each processor set # and aggregate them based on the relative set size d_lines = [l for l in lines if "SET" not in l] user = [ get_value(headers, l.split(), "usr") for l in d_lines ] kern = [ get_value(headers, l.split(), "sys") for l in d_lines ] wait = [ get_value(headers, l.split(), "wt") for l in d_lines ] idle = [ get_value(headers, l.split(), "idl") for l in d_lines ] size = [ get_value(headers, l.split(), "sze") for l in d_lines ] count = sum(size) rel_size = [s / count for s in size] dot = lambda v1, v2: reduce(operator.add, map(operator.mul, v1, v2)) return format_results(dot(user, rel_size), dot(kern, rel_size), dot(wait, rel_size), dot(idle, rel_size), 0.0) else: self.logger.warn("CPUStats: unsupported platform") return False except Exception: self.logger.exception("Cannot compute CPU stats") return False
def get_config(parse_args=True, cfg_path=None, options=None): if parse_args: options, _ = get_parsed_args() # General config agentConfig = { 'check_freq': DEFAULT_CHECK_FREQUENCY, 'monitorstatsd_port': 8125, 'monitorstatsd_target': 'http://localhost:17123', 'graphite_listen_port': None, 'hostname': None, 'listen_port': None, 'tags': None, 'use_ec2_instance_id': False, # DEPRECATED 'version': get_version(), 'watchmonitor': True, 'additional_checksd': '/etc/monitor-agent/checks.d/', 'bind_host': get_default_bind_host(), 'statsd_metric_namespace': None, 'utf8_decoding': False } if Platform.is_mac(): agentConfig['additional_checksd'] = '/opt/datadog-agent/etc/checks.d' # Config handling try: # Find the right config file path = os.path.realpath(__file__) path = os.path.dirname(path) config_path = get_config_path(cfg_path, os_name=get_os()) config = ConfigParser.ConfigParser() config.readfp(skip_leading_wsp(open(config_path))) # bulk import for option in config.options('Main'): agentConfig[option] = config.get('Main', option) # Store developer mode setting in the agentConfig if config.has_option('Main', 'developer_mode'): agentConfig['developer_mode'] = _is_affirmative( config.get('Main', 'developer_mode')) # Allow an override with the --profile option if options is not None and options.profile: agentConfig['developer_mode'] = True # Get check frequency if config.has_option("Main", "frequency"): agentConfig['check_freq'] = config.get("Main", "frequency") # # Core config # # FIXME unnecessarily complex agentConfig['use_forwarder'] = False if options is not None and options.use_forwarder: listen_port = 17123 if config.has_option('Main', 'listen_port'): listen_port = int(config.get('Main', 'listen_port')) agentConfig['m_url'] = "http://" + agentConfig[ 'bind_host'] + ":" + str(listen_port) agentConfig['use_forwarder'] = True elif options is not None and not options.disable_dd and options.m_url: agentConfig['m_url'] = options.m_url else: agentConfig['m_url'] = config.get('Main', 'm_url') if agentConfig['m_url'].endswith('/'): agentConfig['m_url'] = agentConfig['m_url'][:-1] # Extra checks.d path # the linux directory is set by default if config.has_option('Main', 'additional_checksd'): agentConfig['additional_checksd'] = config.get( 'Main', 'additional_checksd') elif get_os() == 'windows': # default windows location common_path = _windows_commondata_path() agentConfig['additional_checksd'] = os.path.join( common_path, 'Datamonitor', 'checks.d') if config.has_option('Main', 'use_monitorstatsd'): agentConfig['use_monitorstatsd'] = config.get( 'Main', 'use_monitorstatsd').lower() in ("yes", "true") else: agentConfig['use_monitorstatsd'] = True # Concerns only Windows if config.has_option('Main', 'use_web_info_page'): agentConfig['use_web_info_page'] = config.get( 'Main', 'use_web_info_page').lower() in ("yes", "true") else: agentConfig['use_web_info_page'] = True # Which API key to use agentConfig['api_key'] = config.get('Main', 'api_key') # local traffic only? 
Default to no agentConfig['non_local_traffic'] = False if config.has_option('Main', 'non_local_traffic'): agentConfig['non_local_traffic'] = config.get( 'Main', 'non_local_traffic').lower() in ("yes", "true") # DEPRECATED if config.has_option('Main', 'use_ec2_instance_id'): use_ec2_instance_id = config.get('Main', 'use_ec2_instance_id') # translate yes into True, the rest into False agentConfig['use_ec2_instance_id'] = ( use_ec2_instance_id.lower() == 'yes') if config.has_option('Main', 'check_freq'): try: agentConfig['check_freq'] = int( config.get('Main', 'check_freq')) except Exception: pass # Custom histogram aggregate/percentile metrics if config.has_option('Main', 'histogram_aggregates'): agentConfig['histogram_aggregates'] = get_histogram_aggregates( config.get('Main', 'histogram_aggregates')) if config.has_option('Main', 'histogram_percentiles'): agentConfig['histogram_percentiles'] = get_histogram_percentiles( config.get('Main', 'histogram_percentiles')) # Disable Watchmonitor (optionally) if config.has_option('Main', 'watchmonitor'): if config.get('Main', 'watchmonitor').lower() in ('no', 'false'): agentConfig['watchmonitor'] = False # Optional graphite listener if config.has_option('Main', 'graphite_listen_port'): agentConfig['graphite_listen_port'] = \ int(config.get('Main', 'graphite_listen_port')) else: agentConfig['graphite_listen_port'] = None # monitorstatsd config monitorstatsd_defaults = { 'monitorstatsd_port': 8125, 'monitorstatsd_target': 'http://' + agentConfig['bind_host'] + ':17123', } for key, value in monitorstatsd_defaults.iteritems(): if config.has_option('Main', key): agentConfig[key] = config.get('Main', key) else: agentConfig[key] = value # Create app:xxx tags based on monitored apps agentConfig['create_dd_check_tags'] = config.has_option('Main', 'create_dd_check_tags') and \ _is_affirmative(config.get('Main', 'create_dd_check_tags')) # Forwarding to external statsd server if config.has_option('Main', 'statsd_forward_host'): agentConfig['statsd_forward_host'] = config.get( 'Main', 'statsd_forward_host') if config.has_option('Main', 'statsd_forward_port'): agentConfig['statsd_forward_port'] = int( config.get('Main', 'statsd_forward_port')) # optionally send monitorstatsd data directly to the agent. if config.has_option('Main', 'monitorstatsd_use_murl'): if _is_affirmative(config.get('Main', 'monitorstatsd_use_murl')): agentConfig['monitorstatsd_target'] = agentConfig['m_url'] # Optional config # FIXME not the prettiest code ever... 
if config.has_option('Main', 'use_mount'): agentConfig['use_mount'] = _is_affirmative( config.get('Main', 'use_mount')) if options is not None and options.autorestart: agentConfig['autorestart'] = True elif config.has_option('Main', 'autorestart'): agentConfig['autorestart'] = _is_affirmative( config.get('Main', 'autorestart')) if config.has_option('Main', 'check_timings'): agentConfig['check_timings'] = _is_affirmative( config.get('Main', 'check_timings')) if config.has_option('Main', 'exclude_process_args'): agentConfig['exclude_process_args'] = _is_affirmative( config.get('Main', 'exclude_process_args')) try: filter_device_re = config.get('Main', 'device_blacklist_re') agentConfig['device_blacklist_re'] = re.compile(filter_device_re) except ConfigParser.NoOptionError: pass if config.has_option('datamonitor', 'ddforwarder_log'): agentConfig['has_datamonitor'] = True # monitorstream config if config.has_option("Main", "monitorstream_log"): # Older version, single log support log_path = config.get("Main", "monitorstream_log") if config.has_option("Main", "monitorstream_line_parser"): agentConfig["monitorstreams"] = ':'.join([ log_path, config.get("Main", "monitorstream_line_parser") ]) else: agentConfig["monitorstreams"] = log_path elif config.has_option("Main", "monitorstreams"): agentConfig["monitorstreams"] = config.get("Main", "monitorstreams") if config.has_option("Main", "nagios_perf_cfg"): agentConfig["nagios_perf_cfg"] = config.get( "Main", "nagios_perf_cfg") if config.has_option("Main", "use_curl_http_client"): agentConfig["use_curl_http_client"] = _is_affirmative( config.get("Main", "use_curl_http_client")) else: # Default to False as there are some issues with the curl client and ELB agentConfig["use_curl_http_client"] = False if config.has_section('WMI'): agentConfig['WMI'] = {} for key, value in config.items('WMI'): agentConfig['WMI'][key] = value if (config.has_option("Main", "limit_memory_consumption") and config.get("Main", "limit_memory_consumption") is not None): agentConfig["limit_memory_consumption"] = int( config.get("Main", "limit_memory_consumption")) else: agentConfig["limit_memory_consumption"] = None if config.has_option("Main", "skip_ssl_validation"): agentConfig["skip_ssl_validation"] = _is_affirmative( config.get("Main", "skip_ssl_validation")) agentConfig["collect_instance_metadata"] = True if config.has_option("Main", "collect_instance_metadata"): agentConfig["collect_instance_metadata"] = _is_affirmative( config.get("Main", "collect_instance_metadata")) agentConfig["proxy_forbid_method_switch"] = False if config.has_option("Main", "proxy_forbid_method_switch"): agentConfig["proxy_forbid_method_switch"] = _is_affirmative( config.get("Main", "proxy_forbid_method_switch")) agentConfig["collect_ec2_tags"] = False if config.has_option("Main", "collect_ec2_tags"): agentConfig["collect_ec2_tags"] = _is_affirmative( config.get("Main", "collect_ec2_tags")) agentConfig["utf8_decoding"] = False if config.has_option("Main", "utf8_decoding"): agentConfig["utf8_decoding"] = _is_affirmative( config.get("Main", "utf8_decoding")) agentConfig["gce_updated_hostname"] = False if config.has_option("Main", "gce_updated_hostname"): agentConfig["gce_updated_hostname"] = _is_affirmative( config.get("Main", "gce_updated_hostname")) except ConfigParser.NoSectionError as e: sys.stderr.write('Config file not found or incorrectly formatted.\n') sys.exit(2) except ConfigParser.ParsingError as e: sys.stderr.write('Config file not found or incorrectly formatted.\n') sys.exit(2) except 
ConfigParser.NoOptionError as e: sys.stderr.write('There are some items missing from your config file, but nothing fatal [%s]\n' % e) # Storing proxy settings in the agentConfig agentConfig['proxy_settings'] = get_proxy(agentConfig) if agentConfig.get('ca_certs', None) is None: agentConfig['ssl_certificate'] = get_ssl_certificate( get_os(), 'datamonitor-cert.pem') else: agentConfig['ssl_certificate'] = agentConfig['ca_certs'] # self-updater related config agentConfig['interval'] = config.get('Main', 'updater_interval') return agentConfig
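The option handling above repeatedly turns "yes"/"true"-style strings into booleans through _is_affirmative. Below is a minimal sketch of that pattern as standalone helpers; the exact set of strings the real _is_affirmative accepts, and the helper name read_bool_option, are assumptions for illustration only.

def _is_affirmative(value):
    # Treat common "truthy" strings as True; everything else as False.
    # The real helper may accept a slightly different set of values.
    return str(value).strip().lower() in ('yes', 'true', '1')

def read_bool_option(config, section, option, default=False):
    # Mirror the pattern used above: only override the default when the
    # option exists, and coerce the raw string into a boolean.
    if config.has_option(section, option):
        return _is_affirmative(config.get(section, option))
    return default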
def _check_linux(self, instance): """ _check_linux can be run inside a container and still collects the network metrics from the host For that procfs_path can be set to something like "/host/proc" When a custom procfs_path is set, the collect_connection_state option is ignored """ proc_location = self.agentConfig.get('procfs_path', '/proc').rstrip('/') if Platform.is_containerized() and proc_location != "/proc": proc_location = "%s/1" % proc_location if self._is_collect_cx_state_runnable(proc_location): try: self.log.debug("Using `ss` to collect connection state") # Try using `ss` for increased performance over `netstat` for ip_version in ['4', '6']: for protocol in ['tcp', 'udp']: # Call `ss` for each IP version because there's no built-in way of distinguishing # between the IP versions in the output # Also calls `ss` for each protocol, because on some systems (e.g. Ubuntu 14.04), there is a # bug that print `tcp` even if it's `udp` output, _, _ = get_subprocess_output([ "ss", "-n", "-{0}".format(protocol[0]), "-a", "-{0}".format(ip_version) ], self.log) lines = output.splitlines() # State Recv-Q Send-Q Local Address:Port Peer Address:Port # UNCONN 0 0 127.0.0.1:8125 *:* # ESTAB 0 0 127.0.0.1:37036 127.0.0.1:8125 # UNCONN 0 0 fe80::a00:27ff:fe1c:3c4:123 :::* # TIME-WAIT 0 0 90.56.111.177:56867 46.105.75.4:143 # LISTEN 0 0 ::ffff:127.0.0.1:33217 ::ffff:127.0.0.1:7199 # ESTAB 0 0 ::ffff:127.0.0.1:58975 ::ffff:127.0.0.1:2181 metrics = self._parse_linux_cx_state( lines[1:], self.tcp_states['ss'], 0, protocol=protocol, ip_version=ip_version) # Only send the metrics which match the loop iteration's ip version for stat, metric in self.cx_state_gauge.iteritems(): if stat[0].endswith(ip_version) and stat[ 0].startswith(protocol): self.gauge(metric, metrics.get(metric)) except OSError: self.log.info("`ss` not found: using `netstat` as a fallback") output, _, _ = get_subprocess_output( ["netstat", "-n", "-u", "-t", "-a"], self.log) lines = output.splitlines() # Active Internet connections (w/o servers) # Proto Recv-Q Send-Q Local Address Foreign Address State # tcp 0 0 46.105.75.4:80 79.220.227.193:2032 SYN_RECV # tcp 0 0 46.105.75.4:143 90.56.111.177:56867 ESTABLISHED # tcp 0 0 46.105.75.4:50468 107.20.207.175:443 TIME_WAIT # tcp6 0 0 46.105.75.4:80 93.15.237.188:58038 FIN_WAIT2 # tcp6 0 0 46.105.75.4:80 79.220.227.193:2029 ESTABLISHED # udp 0 0 0.0.0.0:123 0.0.0.0:* # udp6 0 0 :::41458 :::* metrics = self._parse_linux_cx_state( lines[2:], self.tcp_states['netstat'], 5) for metric, value in metrics.iteritems(): self.gauge(metric, value) except SubprocessOutputEmptyError: self.log.exception("Error collecting connection stats.") proc_dev_path = "{}/net/dev".format(proc_location) with open(proc_dev_path, 'r') as proc: lines = proc.readlines() # Inter-| Receive | Transmit # face |bytes packets errs drop fifo frame compressed multicast|bytes packets errs drop fifo colls carrier compressed # lo:45890956 112797 0 0 0 0 0 0 45890956 112797 0 0 0 0 0 0 # eth0:631947052 1042233 0 19 0 184 0 1206 1208625538 1320529 0 0 0 0 0 0 # eth1: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 for l in lines[2:]: cols = l.split(':', 1) x = cols[1].split() # Filter inactive interfaces if self._parse_value(x[0]) or self._parse_value(x[8]): iface = cols[0].strip() metrics = { 'bytes_rcvd': self._parse_value(x[0]), 'bytes_sent': self._parse_value(x[8]), 'packets_in.count': self._parse_value(x[1]), 'packets_in.error': self._parse_value(x[2]) + self._parse_value(x[3]), 'packets_out.count': self._parse_value(x[9]), 'packets_out.error': 
self._parse_value(x[10]) + self._parse_value(x[11]), } self._submit_devicemetrics(iface, metrics) netstat_data = {} for f in ['netstat', 'snmp']: proc_data_path = "{}/net/{}".format(proc_location, f) try: with open(proc_data_path, 'r') as netstat: while True: n_header = netstat.readline() if not n_header: break # No more? Abort! n_data = netstat.readline() h_parts = n_header.strip().split(' ') h_values = n_data.strip().split(' ') ns_category = h_parts[0][:-1] netstat_data[ns_category] = {} # Turn the data into a dictionary for idx, hpart in enumerate(h_parts[1:]): netstat_data[ns_category][hpart] = h_values[idx + 1] except IOError: # On Openshift, /proc/net/snmp is only readable by root self.log.debug("Unable to read %s.", proc_data_path) nstat_metrics_names = { 'Tcp': { 'RetransSegs': 'system.net.tcp.retrans_segs', 'InSegs': 'system.net.tcp.in_segs', 'OutSegs': 'system.net.tcp.out_segs', }, 'TcpExt': { 'ListenOverflows': 'system.net.tcp.listen_overflows', 'ListenDrops': 'system.net.tcp.listen_drops', 'TCPBacklogDrop': 'system.net.tcp.backlog_drops', 'TCPRetransFail': 'system.net.tcp.failed_retransmits', }, 'Udp': { 'InDatagrams': 'system.net.udp.in_datagrams', 'NoPorts': 'system.net.udp.no_ports', 'InErrors': 'system.net.udp.in_errors', 'OutDatagrams': 'system.net.udp.out_datagrams', 'RcvbufErrors': 'system.net.udp.rcv_buf_errors', 'SndbufErrors': 'system.net.udp.snd_buf_errors', 'InCsumErrors': 'system.net.udp.in_csum_errors' } } # Skip the first line, as it's junk for k in nstat_metrics_names: for met in nstat_metrics_names[k]: if met in netstat_data.get(k, {}): self.rate(nstat_metrics_names[k][met], self._parse_value(netstat_data[k][met]))
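The interface loop above reads /proc/net/dev positionally: receive counters first, transmit counters starting at column 8. Here is a standalone sketch of that parsing, kept outside the check class so it can be run directly; the function name and the returned dict shape are illustrative, not the agent's API.

def parse_proc_net_dev(lines):
    """Turn /proc/net/dev lines into {iface: {metric: value}} dicts."""
    results = {}
    for line in lines[2:]:  # skip the two header lines
        iface, _, data = line.partition(':')
        cols = data.split()
        if not cols:
            continue
        # Receive counters are cols[0:8], transmit counters are cols[8:16].
        results[iface.strip()] = {
            'bytes_rcvd': int(cols[0]),
            'packets_in.count': int(cols[1]),
            'packets_in.error': int(cols[2]) + int(cols[3]),
            'bytes_sent': int(cols[8]),
            'packets_out.count': int(cols[9]),
            'packets_out.error': int(cols[10]) + int(cols[11]),
        }
    return results

if __name__ == '__main__':
    # Linux only: the file does not exist on other platforms.
    with open('/proc/net/dev') as f:
        print(parse_proc_net_dev(f.readlines()))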
def test_relocated_procfs(self): from utils.platform import Platform import tempfile import shutil import uuid already_linux = Platform.is_linux() unique_process_name = str(uuid.uuid4()) my_procfs = tempfile.mkdtemp() def _fake_procfs(arg, root=my_procfs): for key, val in arg.iteritems(): path = os.path.join(root, key) if isinstance(val, dict): os.mkdir(path) _fake_procfs(val, path) else: with open(path, "w") as f: f.write(str(val)) _fake_procfs({ '1': { 'status': ("Name:\t%s\nThreads:\t1\n") % unique_process_name, 'stat': ('1 (%s) S 0 1 1 ' + ' 0' * 46) % unique_process_name, 'cmdline': unique_process_name, }, 'stat': ("cpu 13034 0 18596 380856797 2013 2 2962 0 0 0\n" "btime 1448632481\n"), }) config = { 'init_config': { 'procfs_path': my_procfs }, 'instances': [{ 'name': 'moved_procfs', 'search_string': [unique_process_name], 'exact_match': False, 'ignored_denied_access': True, 'thresholds': { 'warning': [1, 10], 'critical': [1, 100] }, }] } version = int(psutil.__version__.replace(".", "")) try: def import_mock(name, i_globals={}, i_locals={}, fromlist=[], level=-1, orig_import=__import__): # _psutil_linux and _psutil_posix are the C bindings; use a mock for those if name in ('_psutil_linux', '_psutil_posix' ) or level >= 1 and ('_psutil_linux' in fromlist or '_psutil_posix' in fromlist): m = MagicMock() # the import system will ask us for our own name m._psutil_linux = m m._psutil_posix = m # there's a version safety check in psutil/__init__.py; this skips it m.version = version return m return orig_import(name, i_globals, i_locals, fromlist, level) # contextlib.nested is deprecated in favor of with MGR1, MGR2, ... etc, but we have too many mocks to fit on one line and apparently \ line # continuation is not flake8 compliant, even when semantically required (as here). Patch is unlikely to throw errors that are suppressed, so # the main downside of contextlib is avoided. with contextlib.nested( patch('sys.platform', 'linux'), patch('socket.AF_PACKET', create=True), patch('__builtin__.__import__', side_effect=import_mock)): if not already_linux: # Reloading psutil fails on linux, but we only need to do so if we didn't start out on a linux platform reload(psutil) assert Platform.is_linux() self.run_check( config, mocks={'get_pagefault_stats': noop_get_pagefault_stats}) finally: shutil.rmtree(my_procfs) if not already_linux: # restore the original psutil that doesn't have our mocks reload(psutil) else: psutil.PROCFS_PATH = '/proc' expected_tags = self.generate_expected_tags(config['instances'][0]) self.assertServiceCheckOK('process.up', count=1, tags=expected_tags + ['process:moved_procfs']) self.assertMetric('system.processes.number', at_least=1, tags=expected_tags) self.assertMetric('system.processes.threads', at_least=1, tags=expected_tags) self.assertMetric('system.processes.run_time.avg', at_least=1, tags=expected_tags) self.assertMetric('system.processes.run_time.max', at_least=1, tags=expected_tags) self.assertMetric('system.processes.run_time.min', at_least=1, tags=expected_tags) self.coverage_report()
def _check_bsd(self, instance): netstat_flags = ['-i', '-b'] # FreeBSD's netstat truncates device names unless you pass '-W' if Platform.is_freebsd(): netstat_flags.append('-W') try: output, _, _ = get_subprocess_output(["netstat"] + netstat_flags, self.log) lines = output.splitlines() # Name Mtu Network Address Ipkts Ierrs Ibytes Opkts Oerrs Obytes Coll # lo0 16384 <Link#1> 318258 0 428252203 318258 0 428252203 0 # lo0 16384 localhost fe80:1::1 318258 - 428252203 318258 - 428252203 - # lo0 16384 127 localhost 318258 - 428252203 318258 - 428252203 - # lo0 16384 localhost ::1 318258 - 428252203 318258 - 428252203 - # gif0* 1280 <Link#2> 0 0 0 0 0 0 0 # stf0* 1280 <Link#3> 0 0 0 0 0 0 0 # en0 1500 <Link#4> 04:0c:ce:db:4e:fa 20801309 0 13835457425 15149389 0 11508790198 0 # en0 1500 seneca.loca fe80:4::60c:ceff: 20801309 - 13835457425 15149389 - 11508790198 - # en0 1500 2001:470:1f 2001:470:1f07:11d 20801309 - 13835457425 15149389 - 11508790198 - # en0 1500 2001:470:1f 2001:470:1f07:11d 20801309 - 13835457425 15149389 - 11508790198 - # en0 1500 192.168.1 192.168.1.63 20801309 - 13835457425 15149389 - 11508790198 - # en0 1500 2001:470:1f 2001:470:1f07:11d 20801309 - 13835457425 15149389 - 11508790198 - # p2p0 2304 <Link#5> 06:0c:ce:db:4e:fa 0 0 0 0 0 0 0 # ham0 1404 <Link#6> 7a:79:05:4d:bf:f5 30100 0 6815204 18742 0 8494811 0 # ham0 1404 5 5.77.191.245 30100 - 6815204 18742 - 8494811 - # ham0 1404 seneca.loca fe80:6::7879:5ff: 30100 - 6815204 18742 - 8494811 - # ham0 1404 2620:9b::54 2620:9b::54d:bff5 30100 - 6815204 18742 - 8494811 - headers = lines[0].split() # Given the irregular structure of the table above, better to parse from the end of each line # Verify headers first # -7 -6 -5 -4 -3 -2 -1 for h in ("Ipkts", "Ierrs", "Ibytes", "Opkts", "Oerrs", "Obytes", "Coll"): if h not in headers: self.log.error("%s not found in %s; cannot parse" % (h, headers)) return False current = None for l in lines[1:]: # Another header row, abort now, this is IPv6 land if "Name" in l: break x = l.split() if len(x) == 0: break iface = x[0] if iface.endswith("*"): iface = iface[:-1] if iface == current: # skip multiple lines of same interface continue else: current = iface # Filter inactive interfaces if self._parse_value(x[-5]) or self._parse_value(x[-2]): iface = current metrics = { 'bytes_rcvd': self._parse_value(x[-5]), 'bytes_sent': self._parse_value(x[-2]), 'packets_in.count': self._parse_value(x[-7]), 'packets_in.error': self._parse_value(x[-6]), 'packets_out.count': self._parse_value(x[-4]), 'packets_out.error': self._parse_value(x[-3]), } self._submit_devicemetrics(iface, metrics) except SubprocessOutputEmptyError: self.log.exception("Error collecting connection stats.") try: netstat, _, _ = get_subprocess_output( ["netstat", "-s", "-p" "tcp"], self.log) #3651535 packets sent # 972097 data packets (615753248 bytes) # 5009 data packets (2832232 bytes) retransmitted # 0 resends initiated by MTU discovery # 2086952 ack-only packets (471 delayed) # 0 URG only packets # 0 window probe packets # 310851 window update packets # 336829 control packets # 0 data packets sent after flow control # 3058232 checksummed in software # 3058232 segments (571218834 bytes) over IPv4 # 0 segments (0 bytes) over IPv6 #4807551 packets received # 1143534 acks (for 616095538 bytes) # 165400 duplicate acks # ... self._submit_regexed_values(netstat, BSD_TCP_METRICS) except SubprocessOutputEmptyError: self.log.exception("Error collecting TCP stats.")
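Because the Network/Address columns of `netstat -ib` have variable width, the BSD branch above indexes each row from the end. A minimal sketch of that negative-index parsing for a single data row; the column order is taken from the sample output in the comments, and the helper name is illustrative.

def parse_bsd_netstat_row(line):
    """Extract interface counters from one `netstat -ib` row, counting from the end."""
    x = line.split()
    def value(v):
        # netstat prints '-' for counters it does not track on a given row
        return 0 if v == '-' else int(v)
    return x[0].rstrip('*'), {
        'packets_in.count': value(x[-7]),
        'packets_in.error': value(x[-6]),
        'bytes_rcvd': value(x[-5]),
        'packets_out.count': value(x[-4]),
        'packets_out.error': value(x[-3]),
        'bytes_sent': value(x[-2]),
    }

iface, metrics = parse_bsd_netstat_row(
    "en0 1500 <Link#4> 04:0c:ce:db:4e:fa 20801309 0 13835457425 15149389 0 11508790198 0")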
def __init__(self, name, init_config, agentConfig, instances=None): """ Initialize a new check. :param name: The name of the check :param init_config: The config for initializing the check :param agentConfig: The global configuration for the agent :param instances: A list of configuration objects for each instance. """ from aggregator import MetricsAggregator self._enabled_checks.append(name) self._enabled_checks = list(set(self._enabled_checks)) self.name = name self.init_config = init_config or {} self.agentConfig = agentConfig self.in_developer_mode = agentConfig.get('developer_mode') and psutil self._internal_profiling_stats = None self.hostname = agentConfig.get('checksd_hostname') or get_hostname( agentConfig) self.log = logging.getLogger('%s.%s' % (__name__, name)) self.aggregator = MetricsAggregator( self.hostname, formatter=agent_formatter, recent_point_threshold=agentConfig.get('recent_point_threshold', None), histogram_aggregates=agentConfig.get('histogram_aggregates'), histogram_percentiles=agentConfig.get('histogram_percentiles')) if Platform.is_linux() and psutil is not None: procfs_path = self.agentConfig.get('procfs_path', '/proc').rstrip('/') psutil.PROCFS_PATH = procfs_path self.events = [] self.service_checks = [] self.instances = instances or [] self.warnings = [] self.library_versions = None self.last_collection_time = defaultdict(int) self._instance_metadata = [] self.svc_metadata = [] self.historate_dict = {} # Set proxy settings self.proxy_settings = get_proxy(self.agentConfig) self._use_proxy = False if init_config is None else init_config.get( "use_agent_proxy", True) self.proxies = { "http": None, "https": None, } if self.proxy_settings and self._use_proxy: uri = "{host}:{port}".format(host=self.proxy_settings['host'], port=self.proxy_settings['port']) if self.proxy_settings['user'] and self.proxy_settings['password']: uri = "{user}:{password}@{uri}".format( user=self.proxy_settings['user'], password=self.proxy_settings['password'], uri=uri) self.proxies['http'] = "http://{uri}".format(uri=uri) self.proxies['https'] = "https://{uri}".format(uri=uri)
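The proxies mapping built at the end of __init__ follows the scheme://user:password@host:port convention. A small standalone sketch of the same construction, assuming only the 'host', 'port', 'user' and 'password' keys the code above reads from proxy_settings.

def build_proxies(proxy_settings):
    """Build an http/https proxy mapping from the agent's proxy settings dict."""
    if not proxy_settings:
        return {'http': None, 'https': None}
    uri = '{host}:{port}'.format(host=proxy_settings['host'], port=proxy_settings['port'])
    if proxy_settings.get('user') and proxy_settings.get('password'):
        uri = '{user}:{password}@{uri}'.format(
            user=proxy_settings['user'], password=proxy_settings['password'], uri=uri)
    return {'http': 'http://' + uri, 'https': 'https://' + uri}

# e.g. build_proxies({'host': 'proxy.local', 'port': 3128, 'user': None, 'password': None})
# -> {'http': 'http://proxy.local:3128', 'https': 'https://proxy.local:3128'}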
def run(self, checksd=None, start_event=True, configs_reloaded=False): """ Collect data from each check and submit their data. """ log.debug("Found {num_checks} checks".format( num_checks=len(checksd['initialized_checks']))) timer = Timer() if not Platform.is_windows(): cpu_clock = time.clock() self.run_count += 1 log.debug("Starting collection run #%s" % self.run_count) if checksd: self.initialized_checks_d = checksd[ 'initialized_checks'] # is a list of AgentCheck instances self.init_failed_checks_d = checksd[ 'init_failed_checks'] # is of type {check_name: {error, traceback}} payload = AgentPayload() # Find the AgentMetrics check and pop it out # This check must run at the end of the loop to collect info on agent performance if not self._agent_metrics or configs_reloaded: for check in self.initialized_checks_d: if check.name == AGENT_METRICS_CHECK_NAME: self._agent_metrics = check self.initialized_checks_d.remove(check) break # Initialize payload self._build_payload(payload) metrics = payload['metrics'] events = payload['events'] service_checks = payload['service_checks'] # Run the system checks. Checks will depend on the OS if Platform.is_windows(): # Win32 system checks for check_name in ['memory', 'cpu', 'io', 'proc', 'system']: try: metrics.extend(self._win32_system_checks[check_name].check( self.agentConfig)) except Exception: log.exception('Unable to get %s metrics', check_name) else: # Unix system checks sys_checks = self._unix_system_checks for check_name in ['load', 'system', 'cpu', 'file_handles']: try: result_check = sys_checks[check_name].check( self.agentConfig) if result_check: payload.update(result_check) except Exception: log.exception('Unable to get %s metrics', check_name) try: memory = sys_checks['memory'].check(self.agentConfig) except Exception: log.exception('Unable to get memory metrics') else: if memory: memstats = { 'memPhysUsed': memory.get('physUsed'), 'memPhysPctUsable': memory.get('physPctUsable'), 'memPhysFree': memory.get('physFree'), 'memPhysTotal': memory.get('physTotal'), 'memPhysUsable': memory.get('physUsable'), 'memSwapUsed': memory.get('swapUsed'), 'memSwapFree': memory.get('swapFree'), 'memSwapPctFree': memory.get('swapPctFree'), 'memSwapTotal': memory.get('swapTotal'), 'memCached': memory.get('physCached'), 'memBuffers': memory.get('physBuffers'), 'memShared': memory.get('physShared'), 'memSlab': memory.get('physSlab'), 'memPageTables': memory.get('physPageTables'), 'memSwapCached': memory.get('swapCached') } payload.update(memstats) try: ioStats = sys_checks['io'].check(self.agentConfig) except Exception: log.exception('Unable to get io metrics') else: if ioStats: payload['ioStats'] = ioStats try: processes = sys_checks['processes'].check(self.agentConfig) except Exception: log.exception('Unable to get processes metrics') else: payload.update({'processes': processes}) # Run old-style checks if self._ganglia is not None: payload['ganglia'] = self._ganglia.check(self.agentConfig) if self._dogstream is not None: dogstreamData = self._dogstream.check(self.agentConfig) dogstreamEvents = dogstreamData.get('dogstreamEvents', None) if dogstreamEvents: if 'dogstream' in payload['events']: events['dogstream'].extend(dogstreamEvents) else: events['dogstream'] = dogstreamEvents del dogstreamData['dogstreamEvents'] payload.update(dogstreamData) # process collector of gohai (compliant with payload of legacy "resources checks") if not Platform.is_windows() and self._should_send_additional_data( 'processes'): gohai_processes = self._run_gohai_processes() if 
gohai_processes: try: gohai_processes_json = json.loads(gohai_processes) processes_snaps = gohai_processes_json.get('processes') if processes_snaps: processes_payload = {'snaps': [processes_snaps]} payload['resources'] = { 'processes': processes_payload, 'meta': { 'host': self.hostname, } } except Exception: log.exception("Error running gohai processes collection") # newer-style checks (not checks.d style) for metrics_check in self._metrics_checks: res = metrics_check.check(self.agentConfig) if res: metrics.extend(res) # Use `info` log level for some messages on the first run only, then `debug` log_at_first_run = log.info if self._is_first_run() else log.debug # checks.d checks check_statuses = [] for check in self.initialized_checks_d: if not self.continue_running: return log_at_first_run("Running check %s", check.name) instance_statuses = [] metric_count = 0 event_count = 0 service_check_count = 0 check_start_time = time.time() check_stats = None try: # Run the check. instance_statuses = check.run() # Collect the metrics and events. current_check_metrics = check.get_metrics() current_check_events = check.get_events() check_stats = check._get_internal_profiling_stats() # Collect metadata current_check_metadata = check.get_service_metadata() # Save metrics & events for the payload. metrics.extend(current_check_metrics) if current_check_events: if check.name not in events: events[check.name] = current_check_events else: events[check.name] += current_check_events # Save the status of the check. metric_count = len(current_check_metrics) event_count = len(current_check_events) except Exception: log.exception("Error running check %s" % check.name) check_status = CheckStatus( check.name, instance_statuses, metric_count, event_count, service_check_count, service_metadata=current_check_metadata, library_versions=check.get_library_info(), source_type_name=check.SOURCE_TYPE_NAME or check.name, check_stats=check_stats, check_version=check.check_version) # Service check for Agent checks failures service_check_tags = ["check:%s" % check.name] if check_status.status == STATUS_OK: status = AgentCheck.OK elif check_status.status == STATUS_ERROR: status = AgentCheck.CRITICAL check.service_check('datadog.agent.check_status', status, tags=service_check_tags) # Collect the service checks and save them in the payload current_check_service_checks = check.get_service_checks() if current_check_service_checks: service_checks.extend(current_check_service_checks) # -1 because the user doesn't care about the service check for check failure service_check_count = len(current_check_service_checks) - 1 # Update the check status with the correct service_check_count check_status.service_check_count = service_check_count check_statuses.append(check_status) check_run_time = time.time() - check_start_time log.debug("Check %s ran in %.2f s" % (check.name, check_run_time)) # Intrument check run timings if enabled. 
if self.check_timings: metric = 'datadog.agent.check_run_time' meta = {'tags': ["check:%s" % check.name]} metrics.append((metric, time.time(), check_run_time, meta)) if hasattr(check, A7_COMPATIBILITY_ATTR) and isinstance( getattr(check, A7_COMPATIBILITY_ATTR), str): metric = 'datadog.agent.check_ready' status = getattr(check, A7_COMPATIBILITY_ATTR) meta = { 'tags': [ "check_name:%s" % check.name, "agent_version_major:%s" % AGENT_VERSION.split(".")[0], "agent_version_minor:%s" % AGENT_VERSION.split(".")[1], "agent_version_patch:%s" % AGENT_VERSION.split(".")[2], "status:%s" % status ] } # datadog.agent.check_ready: # 0: is not compatible with A7 (or unknown) # 1: is compatible with A7 metrics.append( (metric, time.time(), a7_compatible_to_int(status), meta)) for check_name, info in self.init_failed_checks_d.iteritems(): if not self.continue_running: return check_status = CheckStatus(check_name, None, None, None, None, check_version=info.get( 'version', 'unknown'), init_failed_error=info['error'], init_failed_traceback=info['traceback']) check_statuses.append(check_status) # Add a service check for the agent service_checks.append( create_service_check('datadog.agent.up', AgentCheck.OK, hostname=self.hostname)) # Store the metrics and events in the payload. payload['metrics'] = metrics payload['events'] = events payload['service_checks'] = service_checks # Populate metadata self._populate_payload_metadata(payload, check_statuses, start_event) collect_duration = timer.step() if self._agent_metrics: metric_context = { 'collection_time': collect_duration, 'emit_time': self.emit_duration, } if not Platform.is_windows(): metric_context['cpu_time'] = time.clock() - cpu_clock self._agent_metrics.set_metric_context(payload, metric_context) self._agent_metrics.run() agent_stats = self._agent_metrics.get_metrics() payload['metrics'].extend(agent_stats) if self.agentConfig.get('developer_mode'): log.debug("\n Agent developer mode stats: \n {0}".format( Collector._stats_for_display(agent_stats))) # Flush metadata for the Agent Metrics check. Otherwise they'll just accumulate and leak. self._agent_metrics.get_service_metadata() # Let's send our payload emitter_statuses = payload.emit(log, self.agentConfig, self.emitters, self.continue_running) self.emit_duration = timer.step() if self._is_first_run(): # This is not the exact payload sent to the backend as minor post # processing is done, but this will give us a good idea of what is sent # to the backend. data = payload.payload # deep copy and merge of meta and metric data data['apiKey'] = '*************************' + data.get( 'apiKey', '')[-5:] # removing unused keys for the metadata payload del data['metrics'] del data['events'] del data['service_checks'] if data.get('processes'): data['processes'][ 'apiKey'] = '*************************' + data[ 'processes'].get('apiKey', '')[-5:] log.debug("Metadata payload: %s", json.dumps(data)) # Persist the status of the collection run. try: CollectorStatus(check_statuses, emitter_statuses, self.hostname_metadata_cache).persist() except Exception: log.exception("Error persisting collector status") if self.run_count <= FLUSH_LOGGING_INITIAL or self.run_count % FLUSH_LOGGING_PERIOD == 0: log.info("Finished run #%s. Collection time: %ss. Emit time: %ss" % (self.run_count, round(collect_duration, 2), round(self.emit_duration, 2))) if self.run_count == FLUSH_LOGGING_INITIAL: log.info( "First flushes done, next flushes will be logged every %s flushes." % FLUSH_LOGGING_PERIOD) else: log.debug( "Finished run #%s. 
Collection time: %ss. Emit time: %ss" % (self.run_count, round(collect_duration, 2), round(self.emit_duration, 2))) return payload
def test_complex_config(self): config = {'instances': self.MYSQL_COMPLEX_CONFIG} self.run_check_twice(config) # Test service check self.assertServiceCheck('mysql.can_connect', status=AgentCheck.OK, tags=self.SC_TAGS, count=1) # Travis MySQL not running replication - FIX in flavored test. self.assertServiceCheck('mysql.replication.slave_running', status=AgentCheck.CRITICAL, tags=self.SC_TAGS, count=1) ver = map(lambda x: int(x), self.service_metadata[0]['version'].split(".")) ver = tuple(ver) testable_metrics = (self.STATUS_VARS + self.VARIABLES_VARS + self.INNODB_VARS + self.BINLOG_VARS + self.SYSTEM_METRICS + self.SCHEMA_VARS + self.SYNTHETIC_VARS) if ver >= (5, 6, 0): testable_metrics.extend(self.PERFORMANCE_VARS) # Test metrics for mname in testable_metrics: # These two are currently not guaranteed outside of a Linux # environment. if mname == 'mysql.performance.user_time' and not Platform.is_linux( ): continue if mname == 'mysql.performance.kernel_time' and not Platform.is_linux( ): continue if mname == 'mysql.performance.cpu_time' and Platform.is_windows(): continue if mname == 'mysql.performance.query_run_time.avg': self.assertMetric(mname, tags=self.METRIC_TAGS + ['schema:testdb'], count=1) elif mname == 'mysql.info.schema.size': self.assertMetric(mname, tags=self.METRIC_TAGS + ['schema:testdb'], count=1) self.assertMetric(mname, tags=self.METRIC_TAGS + ['schema:information_schema'], count=1) self.assertMetric(mname, tags=self.METRIC_TAGS + ['schema:performance_schema'], count=1) else: self.assertMetric(mname, tags=self.METRIC_TAGS, count=1) # Assert service metadata self.assertServiceMetadata(['version'], count=1) # test custom query metrics self.assertMetric('alice.age', value=25) self.assertMetric('bob.age', value=20) # test optional metrics self._test_optional_metrics( (self.OPTIONAL_REPLICATION_METRICS + self.OPTIONAL_INNODB_VARS + self.OPTIONAL_STATUS_VARS + self.OPTIONAL_STATUS_VARS_5_6_6), 1) # Raises when COVERAGE=true and coverage < 100% self.coverage_report()
def check(self, agentConfig): if Platform.is_linux(): proc_location = agentConfig.get('procfs_path', '/proc').rstrip('/') try: proc_meminfo = "{}/meminfo".format(proc_location) with open(proc_meminfo, 'r') as mem_info: lines = mem_info.readlines() except Exception: self.logger.exception('Cannot get memory metrics from %s', proc_meminfo) return False # NOTE: not all of the stats below are present on all systems as # not all kernel versions report all of them. # # $ cat /proc/meminfo # MemTotal: 7995360 kB # MemFree: 1045120 kB # MemAvailable: 1253920 kB # Buffers: 226284 kB # Cached: 775516 kB # SwapCached: 248868 kB # Active: 1004816 kB # Inactive: 1011948 kB # Active(anon): 455152 kB # Inactive(anon): 584664 kB # Active(file): 549664 kB # Inactive(file): 427284 kB # Unevictable: 4392476 kB # Mlocked: 4392476 kB # SwapTotal: 11120632 kB # SwapFree: 10555044 kB # Dirty: 2948 kB # Writeback: 0 kB # AnonPages: 5203560 kB # Mapped: 50520 kB # Shmem: 10108 kB # Slab: 161300 kB # SReclaimable: 136108 kB # SUnreclaim: 25192 kB # KernelStack: 3160 kB # PageTables: 26776 kB # NFS_Unstable: 0 kB # Bounce: 0 kB # WritebackTmp: 0 kB # CommitLimit: 15118312 kB # Committed_AS: 6703508 kB # VmallocTotal: 34359738367 kB # VmallocUsed: 400668 kB # VmallocChunk: 34359329524 kB # HardwareCorrupted: 0 kB # HugePages_Total: 0 # HugePages_Free: 0 # HugePages_Rsvd: 0 # HugePages_Surp: 0 # Hugepagesize: 2048 kB # DirectMap4k: 10112 kB # DirectMap2M: 8243200 kB regexp = re.compile( r'^(\w+):\s+([0-9]+)' ) # We run this several times so one-time compile now meminfo = {} parse_error = False for line in lines: try: match = re.search(regexp, line) if match is not None: meminfo[match.group(1)] = match.group(2) except Exception: parse_error = True if parse_error: self.logger.error("Error parsing %s", proc_meminfo) memData = {} # Physical memory # FIXME units are in MB, we should use bytes instead try: memData['physTotal'] = int(meminfo.get('MemTotal', 0)) / 1024 memData['physFree'] = int(meminfo.get('MemFree', 0)) / 1024 memData['physBuffers'] = int(meminfo.get('Buffers', 0)) / 1024 memData['physCached'] = int(meminfo.get('Cached', 0)) / 1024 memData['physShared'] = int(meminfo.get('Shmem', 0)) / 1024 memData['physSlab'] = int(meminfo.get('Slab', 0)) / 1024 memData['physPageTables'] = int(meminfo.get('PageTables', 0)) / 1024 memData[ 'physUsed'] = memData['physTotal'] - memData['physFree'] if 'MemAvailable' in meminfo: memData['physUsable'] = int(meminfo.get('MemAvailable', 0)) / 1024 else: # Usable is relative since cached and buffers are actually used to speed things up. 
memData['physUsable'] = memData['physFree'] + memData[ 'physBuffers'] + memData['physCached'] if memData['physTotal'] > 0: memData['physPctUsable'] = float( memData['physUsable']) / float(memData['physTotal']) except Exception: self.logger.exception('Cannot compute stats from %s', proc_meminfo) # Swap # FIXME units are in MB, we should use bytes instead try: memData['swapTotal'] = int(meminfo.get('SwapTotal', 0)) / 1024 memData['swapFree'] = int(meminfo.get('SwapFree', 0)) / 1024 memData['swapCached'] = int(meminfo.get('SwapCached', 0)) / 1024 memData[ 'swapUsed'] = memData['swapTotal'] - memData['swapFree'] if memData['swapTotal'] > 0: memData['swapPctFree'] = float( memData['swapFree']) / float(memData['swapTotal']) except Exception: self.logger.exception('Cannot compute swap stats') return memData elif sys.platform == 'darwin': if psutil is None: self.logger.error( "psutil must be installed on MacOS to collect memory metrics" ) return False phys_memory = psutil.virtual_memory() swap = psutil.swap_memory() return { 'physUsed': phys_memory.used / float(1024**2), 'physFree': phys_memory.free / float(1024**2), 'physUsable': phys_memory.available / float(1024**2), 'physPctUsable': (100 - phys_memory.percent) / 100.0, 'swapUsed': swap.used / float(1024**2), 'swapFree': swap.free / float(1024**2) } elif sys.platform.startswith("freebsd"): try: output, _, _ = get_subprocess_output(['sysctl', 'vm.stats.vm'], self.logger) sysctl = output.splitlines() except Exception: self.logger.exception('getMemoryUsage') return False # ... # vm.stats.vm.v_page_size: 4096 # vm.stats.vm.v_page_count: 759884 # vm.stats.vm.v_wire_count: 122726 # vm.stats.vm.v_active_count: 109350 # vm.stats.vm.v_cache_count: 17437 # vm.stats.vm.v_inactive_count: 479673 # vm.stats.vm.v_free_count: 30542 # ... # We run this several times so one-time compile now regexp = re.compile(r'^vm\.stats\.vm\.(\w+):\s+([0-9]+)') meminfo = {} parse_error = False for line in sysctl: try: match = re.search(regexp, line) if match is not None: meminfo[match.group(1)] = match.group(2) except Exception: parse_error = True if parse_error: self.logger.error("Error parsing vm.stats.vm output: %s", sysctl) memData = {} # Physical memory try: pageSize = int(meminfo.get('v_page_size')) memData['physTotal'] = (int(meminfo.get('v_page_count', 0)) * pageSize) / 1048576 memData['physFree'] = (int(meminfo.get('v_free_count', 0)) * pageSize) / 1048576 memData['physCached'] = (int(meminfo.get('v_cache_count', 0)) * pageSize) / 1048576 memData['physUsed'] = ( (int(meminfo.get('v_active_count'), 0) + int(meminfo.get('v_wire_count', 0))) * pageSize) / 1048576 memData['physUsable'] = ( (int(meminfo.get('v_free_count'), 0) + int(meminfo.get('v_cache_count', 0)) + int(meminfo.get('v_inactive_count', 0))) * pageSize) / 1048576 if memData['physTotal'] > 0: memData['physPctUsable'] = float( memData['physUsable']) / float(memData['physTotal']) except Exception: self.logger.exception('Cannot compute stats from %s', proc_meminfo) # Swap try: output, _, _ = get_subprocess_output(['swapinfo', '-m'], self.logger) sysctl = output.splitlines() except Exception: self.logger.exception('getMemoryUsage') return False # ... # Device 1M-blocks Used Avail Capacity # /dev/ad0s1b 570 0 570 0% # ... 
assert "Device" in sysctl[0] try: memData['swapTotal'] = 0 memData['swapFree'] = 0 memData['swapUsed'] = 0 for line in sysctl[1:]: if len(line) > 0: line = line.split() memData['swapTotal'] += int(line[1]) memData['swapFree'] += int(line[3]) memData['swapUsed'] += int(line[2]) except Exception: self.logger.exception('Cannot compute stats from swapinfo') return memData elif sys.platform == 'sunos5': try: memData = {} cmd = [ "kstat", "-m", "memory_cap", "-c", "zone_memory_cap", "-p" ] output, _, _ = get_subprocess_output(cmd, self.logger) kmem = output.splitlines() # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:anon_alloc_fail 0 # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:anonpgin 0 # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:class zone_memory_cap # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:crtime 16359935.0680834 # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:execpgin 185 # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:fspgin 2556 # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:n_pf_throttle 0 # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:n_pf_throttle_usec 0 # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:nover 0 # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:pagedout 0 # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:pgpgin 2741 # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:physcap 536870912 <-- # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:rss 115544064 <-- # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:snaptime 16787393.9439095 # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:swap 91828224 <-- # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:swapcap 1073741824 <-- # memory_cap:360:53aa9b7e-48ba-4152-a52b-a6368c:zonename 53aa9b7e-48ba-4152-a52b-a6368c3d9e7c # turn memory_cap:360:zone_name:key value # into { "key": value, ...} kv = [l.strip().split() for l in kmem if len(l) > 0] entries = dict([(k.split(":")[-1], v) for (k, v) in kv]) # extract rss, physcap, swap, swapcap, turn into MB convert = lambda v: int(long(v)) / 2**20 memData["physTotal"] = convert(entries["physcap"]) memData["physUsed"] = convert(entries["rss"]) memData[ "physFree"] = memData["physTotal"] - memData["physUsed"] memData["swapTotal"] = convert(entries["swapcap"]) memData["swapUsed"] = convert(entries["swap"]) memData[ "swapFree"] = memData["swapTotal"] - memData["swapUsed"] if memData['swapTotal'] > 0: memData['swapPctFree'] = float( memData['swapFree']) / float(memData['swapTotal']) return memData except Exception: self.logger.exception( "Cannot compute mem stats from kstat -c zone_memory_cap") return False else: return False
# Licensed under Simplified BSD License (see LICENSE)

# stdlib
import time

# 3p
import dns.resolver

# project
from utils.platform import Platform
from checks.network_checks import NetworkCheck, Status

# These imports are necessary because otherwise dynamic type
# resolution will fail on Windows without them.
# See more here: https://github.com/rthalley/dnspython/issues/39.
if Platform.is_win32():
    from dns.rdtypes.ANY import *  # noqa
    from dns.rdtypes.IN import *  # noqa
    # For tiny time deltas, time.time on Windows reports the same value
    # of the clock more than once, causing the computed response_time to
    # often be 0; use time.clock, which is more precise.
    time_func = time.clock
else:
    time_func = time.time


class BadConfException(Exception):
    pass


class DNSCheck(NetworkCheck):
def set_ssl_validation(self, options):
    if self._config.get('skip_ssl_validation', False):
        options['verify'] = False
    elif Platform.is_windows():
        options['verify'] = get_ssl_certificate('windows', 'stackstate-cert.pem')
def initialize_logging(logger_name): try: logging_config = get_logging_config() logging.basicConfig( format=get_log_format(logger_name), level=logging_config['log_level'] or logging.INFO, ) log_file = logging_config.get('%s_log_file' % logger_name) if log_file is not None and not logging_config['disable_file_logging']: # make sure the log directory is writeable # NOTE: the entire directory needs to be writable so that rotation works if os.access(os.path.dirname(log_file), os.R_OK | os.W_OK): file_handler = logging.handlers.RotatingFileHandler( log_file, maxBytes=LOGGING_MAX_BYTES, backupCount=1) formatter = logging.Formatter(get_log_format(logger_name), get_log_date_format()) file_handler.setFormatter(formatter) root_log = logging.getLogger() root_log.addHandler(file_handler) else: sys.stderr.write("Log file is unwritable: '%s'\n" % log_file) # set up syslog if logging_config['log_to_syslog']: try: from logging.handlers import SysLogHandler if logging_config['syslog_host'] is not None and logging_config[ 'syslog_port'] is not None: sys_log_addr = (logging_config['syslog_host'], logging_config['syslog_port']) else: sys_log_addr = "/dev/log" # Special-case BSDs if Platform.is_darwin(): sys_log_addr = "/var/run/syslog" elif Platform.is_freebsd(): sys_log_addr = "/var/run/log" handler = SysLogHandler(address=sys_log_addr, facility=SysLogHandler.LOG_DAEMON) handler.setFormatter( logging.Formatter(get_syslog_format(logger_name), get_log_date_format())) root_log = logging.getLogger() root_log.addHandler(handler) except Exception as e: sys.stderr.write("Error setting up syslog: '%s'\n" % str(e)) traceback.print_exc() # Setting up logging in the event viewer for windows if get_os() == 'windows' and logging_config['log_to_event_viewer']: try: from logging.handlers import NTEventLogHandler nt_event_handler = NTEventLogHandler( logger_name, get_win32service_file('windows', 'win32service.pyd'), 'Application') nt_event_handler.setFormatter( logging.Formatter(get_syslog_format(logger_name), get_log_date_format())) nt_event_handler.setLevel(logging.ERROR) app_log = logging.getLogger(logger_name) app_log.addHandler(nt_event_handler) except Exception as e: sys.stderr.write( "Error setting up Event viewer logging: '%s'\n" % str(e)) traceback.print_exc() except Exception as e: sys.stderr.write("Couldn't initialize logging: %s\n" % str(e)) traceback.print_exc() # if config fails entirely, enable basic stdout logging as a fallback logging.basicConfig( format=get_log_format(logger_name), level=logging.INFO, ) # re-get the log after logging is initialized global log log = logging.getLogger(__name__)
def get_hostname(config=None): """ Get the canonical host name this agent should identify as. This is the authoritative source of the host name for the agent. Tries, in order: * agent config (datadog.conf, "hostname:") * 'hostname -f' (on unix) * socket.gethostname() """ hostname = None # first, try the config if config is None: from config import get_config config = get_config(parse_args=True) config_hostname = config.get('hostname') if config_hostname and is_valid_hostname(config_hostname): return config_hostname # Try to get GCE instance name if hostname is None: gce_hostname = GCE.get_hostname(config) if gce_hostname is not None: if is_valid_hostname(gce_hostname): return gce_hostname # Try to get the docker hostname if hostname is None and is_dockerized(): docker_hostname = get_docker_hostname() if docker_hostname is not None and is_valid_hostname(docker_hostname): return docker_hostname # then move on to os-specific detection if hostname is None: def _get_hostname_unix(): try: # try fqdn out, _, rtcode = get_subprocess_output(['/bin/hostname', '-f'], log) if rtcode == 0: return out.strip() except Exception: return None os_name = get_os() if os_name in ['mac', 'freebsd', 'linux', 'solaris']: unix_hostname = _get_hostname_unix() if unix_hostname and is_valid_hostname(unix_hostname): hostname = unix_hostname # if we have an ec2 default hostname, see if there's an instance-id available if (Platform.is_ecs_instance()) or (hostname is not None and True in [ hostname.lower().startswith(p) for p in [u'ip-', u'domu'] ]): instanceid = EC2.get_instance_id(config) if instanceid: hostname = instanceid # fall back on socket.gethostname(), socket.getfqdn() is too unreliable if hostname is None: try: socket_hostname = socket.gethostname() except socket.error: socket_hostname = None if socket_hostname and is_valid_hostname(socket_hostname): hostname = socket_hostname if hostname is None: log.critical( 'Unable to reliably determine host name. You can define one in datadog.conf or in your hosts file' ) raise Exception( 'Unable to reliably determine host name. You can define one in datadog.conf or in your hosts file' ) else: return hostname
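is_valid_hostname is not shown here; the following is a hypothetical sketch of the kind of validation get_hostname relies on (non-empty, not a loopback-style name, RFC 1123 compliant, not overly long). The exact rules of the real helper may differ.

import re

MAX_HOSTNAME_LEN = 255
VALID_HOSTNAME_RFC_1123 = re.compile(
    r'^(([a-zA-Z0-9]|[a-zA-Z0-9][a-zA-Z0-9\-]*[a-zA-Z0-9])\.)*'
    r'([A-Za-z0-9]|[A-Za-z0-9][A-Za-z0-9\-]*[A-Za-z0-9])$')
LOCAL_NAMES = ('localhost', 'localhost.localdomain', 'ip6-localhost')

def is_valid_hostname_sketch(hostname):
    # Hypothetical validation, roughly what get_hostname needs: reject empty
    # names, loopback-style names, overly long names, and non-RFC-1123 names.
    if not hostname:
        return False
    if hostname.lower() in LOCAL_NAMES:
        return False
    if len(hostname) > MAX_HOSTNAME_LEN:
        return False
    return VALID_HOSTNAME_RFC_1123.match(hostname) is not None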
def _populate_payload_metadata(self, payload, check_statuses, start_event=True): """ Periodically populate the payload with metadata related to the system, host, and/or checks. """ now = time.time() # Include system stats on first postback if start_event and self._is_first_run(): payload['systemStats'] = self.agentConfig.get('system_stats', {}) # Also post an event in the newsfeed payload['events']['System'] = [{ 'api_key': self.agentConfig['api_key'], 'host': payload['internalHostname'], 'timestamp': now, 'event_type': 'Agent Startup', 'msg_text': 'Version %s' % get_version() }] # Periodically send the host metadata. if self._should_send_additional_data('host_metadata'): # gather metadata with gohai try: if not Platform.is_windows(): command = "gohai" else: command = "gohai\gohai.exe" gohai_metadata, gohai_err, _ = get_subprocess_output([command], log) payload['gohai'] = gohai_metadata if gohai_err: log.warning("GOHAI LOG | {0}".format(gohai_err)) except OSError as e: if e.errno == 2: # file not found, expected when install from source log.info("gohai file not found") else: raise e except Exception as e: log.warning("gohai command failed with error %s" % str(e)) payload['systemStats'] = get_system_stats() payload['meta'] = self._get_hostname_metadata() self.hostname_metadata_cache = payload['meta'] # Add static tags from the configuration file host_tags = [] if self.agentConfig['tags'] is not None: host_tags.extend([ unicode(tag.strip()) for tag in self.agentConfig['tags'].split(",") ]) if self.agentConfig['collect_ec2_tags']: host_tags.extend(EC2.get_tags(self.agentConfig)) if host_tags: payload['host-tags']['system'] = host_tags # If required by the user, let's create the dd_check:xxx host tags if self.agentConfig['create_dd_check_tags']: app_tags_list = [ DD_CHECK_TAG.format(c.name) for c in self.initialized_checks_d ] app_tags_list.extend([ DD_CHECK_TAG.format(cname) for cname in JMXFiles.get_jmx_appnames() ]) if 'system' not in payload['host-tags']: payload['host-tags']['system'] = [] payload['host-tags']['system'].extend(app_tags_list) GCE_tags = GCE.get_tags(self.agentConfig) if GCE_tags is not None: payload['host-tags'][GCE.SOURCE_TYPE_NAME] = GCE_tags # Log the metadata on the first run if self._is_first_run(): log.info( "Hostnames: %s, tags: %s" % (repr(self.hostname_metadata_cache), payload['host-tags'])) # Periodically send extra hosts metadata (vsphere) # Metadata of hosts that are not the host where the agent runs, not all the checks use # that external_host_tags = [] if self._should_send_additional_data('external_host_tags'): for check in self.initialized_checks_d: try: getter = getattr(check, 'get_external_host_tags') check_tags = getter() external_host_tags.extend(check_tags) except AttributeError: pass if external_host_tags: payload['external_host_tags'] = external_host_tags # Periodically send agent_checks metadata if self._should_send_additional_data('agent_checks'): # Add agent checks statuses and error/warning messages agent_checks = [] for check in check_statuses: if check.instance_statuses is not None: for i, instance_status in enumerate( check.instance_statuses): agent_checks.append(( check.name, check.source_type_name, instance_status.instance_id, instance_status.status, # put error message or list of warning messages in the same field # it will be handled by the UI instance_status.error or instance_status.warnings or "", check.service_metadata[i])) else: agent_checks.append( (check.name, check.source_type_name, "initialization", check.status, 
repr(check.init_failed_error))) payload['agent_checks'] = agent_checks payload['meta'] = self.hostname_metadata_cache # add hostname metadata
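_should_send_additional_data is not shown here. Below is a hypothetical sketch of the interval gate it implies, based on the push_times structure initialized in the Collector constructor further down (each entry keeps a 'start' timestamp and an 'interval' in seconds); the real method may track state differently.

import time

def should_send(push_times, data_name):
    # Hypothetical interval gate: emit the named metadata only when its
    # configured interval has elapsed since the last emission.
    entry = push_times[data_name]
    if time.time() - entry['start'] >= entry['interval']:
        entry['start'] = time.time()
        return True
    return False

# e.g. should_send({'host_metadata': {'start': time.time() - 3600,
#                                     'interval': 4 * 60 * 60}}, 'host_metadata')
# -> False until four hours have passed since 'start'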
def __init__(self, agentConfig, emitters, systemStats, hostname): self.emit_duration = None self.agentConfig = agentConfig self.hostname = hostname # system stats is generated by config.get_system_stats self.agentConfig['system_stats'] = systemStats # agent config is used during checks, system_stats can be accessed through the config self.os = get_os() self.plugins = None self.emitters = emitters self.check_timings = agentConfig.get('check_timings') self.push_times = { 'host_metadata': { 'start': time.time(), 'interval': int(agentConfig.get('metadata_interval', 4 * 60 * 60)) }, 'external_host_tags': { 'start': time.time() - 3 * 60, # Wait for the checks to init 'interval': int(agentConfig.get('external_host_tags', 5 * 60)) }, 'agent_checks': { 'start': time.time(), 'interval': int(agentConfig.get('agent_checks_interval', 10 * 60)) }, 'processes': { 'start': time.time(), 'interval': int(agentConfig.get('processes_interval', 60)) } } socket.setdefaulttimeout(15) self.run_count = 0 self.continue_running = True self.hostname_metadata_cache = None self.initialized_checks_d = [] self.init_failed_checks_d = {} if Platform.is_linux() and psutil is not None: procfs_path = agentConfig.get('procfs_path', '/proc').rstrip('/') psutil.PROCFS_PATH = procfs_path # Unix System Checks self._unix_system_checks = { 'io': u.IO(log), 'load': u.Load(log), 'memory': u.Memory(log), 'processes': u.Processes(log), 'cpu': u.Cpu(log), 'system': u.System(log) } # Win32 System `Checks self._win32_system_checks = { 'io': w32.IO(log), 'proc': w32.Processes(log), 'memory': w32.Memory(log), 'network': w32.Network(log), 'cpu': w32.Cpu(log), 'system': w32.System(log) } # Old-style metric checks self._ganglia = Ganglia(log) if self.agentConfig.get( 'ganglia_host', '') != '' else None self._dogstream = None if self.agentConfig.get( 'dogstreams') is None else Dogstreams.init(log, self.agentConfig) # Agent performance metrics check self._agent_metrics = None self._metrics_checks = [] # Custom metric checks for module_spec in [ s.strip() for s in self.agentConfig.get('custom_checks', '').split(',') ]: if len(module_spec) == 0: continue try: self._metrics_checks.append( modules.load(module_spec, 'Check')(log)) log.info("Registered custom check %s" % module_spec) log.warning( "Old format custom checks are deprecated. They should be moved to the checks.d interface as old custom checks will be removed in a next version" ) except Exception: log.exception('Unable to load custom check module %s' % module_spec)
def run(self, checksd=None, start_event=True, configs_reloaded=False): """ Collect data from each check and submit their data. """ log.debug("Found {num_checks} checks".format( num_checks=len(checksd['initialized_checks']))) timer = Timer() if not Platform.is_windows(): cpu_clock = time.clock() self.run_count += 1 log.debug("Starting collection run #%s" % self.run_count) if checksd: self.initialized_checks_d = checksd[ 'initialized_checks'] # is a list of AgentCheck instances self.init_failed_checks_d = checksd[ 'init_failed_checks'] # is of type {check_name: {error, traceback}} payload = AgentPayload() # Find the AgentMetrics check and pop it out # This check must run at the end of the loop to collect info on agent performance if not self._agent_metrics or configs_reloaded: for check in self.initialized_checks_d: if check.name == AGENT_METRICS_CHECK_NAME: self._agent_metrics = check self.initialized_checks_d.remove(check) break # Initialize payload self._build_payload(payload) metrics = payload['metrics'] events = payload['events'] service_checks = payload['service_checks'] # Run the system checks. Checks will depend on the OS if Platform.is_windows(): # Win32 system checks try: metrics.extend(self._win32_system_checks['memory'].check( self.agentConfig)) metrics.extend(self._win32_system_checks['cpu'].check( self.agentConfig)) metrics.extend(self._win32_system_checks['network'].check( self.agentConfig)) metrics.extend(self._win32_system_checks['io'].check( self.agentConfig)) metrics.extend(self._win32_system_checks['proc'].check( self.agentConfig)) except Exception: log.exception('Unable to fetch Windows system metrics.') else: # Unix system checks sys_checks = self._unix_system_checks load = sys_checks['load'].check(self.agentConfig) payload.update(load) system = sys_checks['system'].check(self.agentConfig) payload.update(system) memory = sys_checks['memory'].check(self.agentConfig) if memory: memstats = { 'memPhysUsed': memory.get('physUsed'), 'memPhysPctUsable': memory.get('physPctUsable'), 'memPhysFree': memory.get('physFree'), 'memPhysTotal': memory.get('physTotal'), 'memPhysUsable': memory.get('physUsable'), 'memSwapUsed': memory.get('swapUsed'), 'memSwapFree': memory.get('swapFree'), 'memSwapPctFree': memory.get('swapPctFree'), 'memSwapTotal': memory.get('swapTotal'), 'memCached': memory.get('physCached'), 'memBuffers': memory.get('physBuffers'), 'memShared': memory.get('physShared'), 'memSlab': memory.get('physSlab'), 'memPageTables': memory.get('physPageTables'), 'memSwapCached': memory.get('swapCached') } payload.update(memstats) ioStats = sys_checks['io'].check(self.agentConfig) if ioStats: payload['ioStats'] = ioStats processes = sys_checks['processes'].check(self.agentConfig) payload.update({'processes': processes}) cpuStats = sys_checks['cpu'].check(self.agentConfig) if cpuStats: payload.update(cpuStats) # Run old-style checks gangliaData = self._ganglia.check(self.agentConfig) dogstreamData = self._dogstream.check(self.agentConfig) ddforwarderData = self._ddforwarder.check(self.agentConfig) if gangliaData is not False and gangliaData is not None: payload['ganglia'] = gangliaData # dogstream if dogstreamData: dogstreamEvents = dogstreamData.get('dogstreamEvents', None) if dogstreamEvents: if 'dogstream' in payload['events']: events['dogstream'].extend(dogstreamEvents) else: events['dogstream'] = dogstreamEvents del dogstreamData['dogstreamEvents'] payload.update(dogstreamData) # metrics about the forwarder if ddforwarderData: payload['datadog'] = ddforwarderData # Resources 
checks if not Platform.is_windows(): has_resource = False for resources_check in self._resources_checks: try: resources_check.check() snaps = resources_check.pop_snapshots() if snaps: has_resource = True res_value = { 'snaps': snaps, 'format_version': resources_check.get_format_version() } res_format = resources_check.describe_format_if_needed( ) if res_format is not None: res_value['format_description'] = res_format payload['resources'][ resources_check.RESOURCE_KEY] = res_value except Exception: log.exception("Error running resource check %s" % resources_check.RESOURCE_KEY) if has_resource: payload['resources']['meta'] = { 'api_key': self.agentConfig['api_key'], 'host': payload['internalHostname'], } # newer-style checks (not checks.d style) for metrics_check in self._metrics_checks: res = metrics_check.check(self.agentConfig) if res: metrics.extend(res) # checks.d checks check_statuses = [] for check in self.initialized_checks_d: if not self.continue_running: return log.info("Running check %s" % check.name) instance_statuses = [] metric_count = 0 event_count = 0 service_check_count = 0 check_start_time = time.time() check_stats = None try: # Run the check. instance_statuses = check.run() # Collect the metrics and events. current_check_metrics = check.get_metrics() current_check_events = check.get_events() check_stats = check._get_internal_profiling_stats() # Collect metadata current_check_metadata = check.get_service_metadata() # Save metrics & events for the payload. metrics.extend(current_check_metrics) if current_check_events: if check.name not in events: events[check.name] = current_check_events else: events[check.name] += current_check_events # Save the status of the check. metric_count = len(current_check_metrics) event_count = len(current_check_events) except Exception: log.exception("Error running check %s" % check.name) check_status = CheckStatus( check.name, instance_statuses, metric_count, event_count, service_check_count, service_metadata=current_check_metadata, library_versions=check.get_library_info(), source_type_name=check.SOURCE_TYPE_NAME or check.name, check_stats=check_stats) # Service check for Agent checks failures service_check_tags = ["check:%s" % check.name] if check_status.status == STATUS_OK: status = AgentCheck.OK elif check_status.status == STATUS_ERROR: status = AgentCheck.CRITICAL check.service_check('datadog.agent.check_status', status, tags=service_check_tags) # Collect the service checks and save them in the payload current_check_service_checks = check.get_service_checks() if current_check_service_checks: service_checks.extend(current_check_service_checks) service_check_count = len(current_check_service_checks) # Update the check status with the correct service_check_count check_status.service_check_count = service_check_count check_statuses.append(check_status) check_run_time = time.time() - check_start_time log.debug("Check %s ran in %.2f s" % (check.name, check_run_time)) # Intrument check run timings if enabled. 
if self.check_timings: metric = 'datadog.agent.check_run_time' meta = {'tags': ["check:%s" % check.name]} metrics.append((metric, time.time(), check_run_time, meta)) for check_name, info in self.init_failed_checks_d.iteritems(): if not self.continue_running: return check_status = CheckStatus(check_name, None, None, None, None, init_failed_error=info['error'], init_failed_traceback=info['traceback']) check_statuses.append(check_status) # Add a service check for the agent service_checks.append( create_service_check('datadog.agent.up', AgentCheck.OK, hostname=self.hostname)) # Store the metrics and events in the payload. payload['metrics'] = metrics payload['events'] = events payload['service_checks'] = service_checks # Populate metadata self._populate_payload_metadata(payload, check_statuses, start_event) collect_duration = timer.step() if self._agent_metrics: metric_context = { 'collection_time': collect_duration, 'emit_time': self.emit_duration, } if not Platform.is_windows(): metric_context['cpu_time'] = time.clock() - cpu_clock self._agent_metrics.set_metric_context(payload, metric_context) self._agent_metrics.run() agent_stats = self._agent_metrics.get_metrics() payload['metrics'].extend(agent_stats) if self.agentConfig.get('developer_mode'): log.debug("\n Agent developer mode stats: \n {0}".format( Collector._stats_for_display(agent_stats))) # Let's send our payload emitter_statuses = payload.emit(log, self.agentConfig, self.emitters, self.continue_running) self.emit_duration = timer.step() # Persist the status of the collection run. try: CollectorStatus(check_statuses, emitter_statuses, self.hostname_metadata_cache).persist() except Exception: log.exception("Error persisting collector status") if self.run_count <= FLUSH_LOGGING_INITIAL or self.run_count % FLUSH_LOGGING_PERIOD == 0: log.info("Finished run #%s. Collection time: %ss. Emit time: %ss" % (self.run_count, round(collect_duration, 2), round(self.emit_duration, 2))) if self.run_count == FLUSH_LOGGING_INITIAL: log.info( "First flushes done, next flushes will be logged every %s flushes." % FLUSH_LOGGING_PERIOD) else: log.debug( "Finished run #%s. Collection time: %ss. Emit time: %ss" % (self.run_count, round(collect_duration, 2), round(self.emit_duration, 2))) return payload
def init(self): try: # We configure the check with the right cgroup settings for this host # Just needs to be done once instance = self.instances[0] set_docker_settings(self.init_config, instance) self.client = get_client() self._docker_root = self.init_config.get('docker_root', '/') self._mountpoints = get_mountpoints(self._docker_root) self.cgroup_listing_retries = 0 self._latest_size_query = 0 self._filtered_containers = set() self._disable_net_metrics = False # At first run we'll just collect the events from the latest 60 secs self._last_event_collection_ts = int(time.time()) - 60 # Set tagging options self.custom_tags = instance.get("tags", []) self.collect_labels_as_tags = instance.get("collect_labels_as_tags", []) if self.is_k8s(): self.collect_labels_as_tags.append("io.kubernetes.pod.name") self.use_histogram = _is_affirmative(instance.get('use_histogram', False)) performance_tags = instance.get("performance_tags", DEFAULT_PERFORMANCE_TAGS) self.tag_names = { CONTAINER: instance.get("container_tags", DEFAULT_CONTAINER_TAGS), PERFORMANCE: performance_tags, IMAGE: instance.get('image_tags', DEFAULT_IMAGE_TAGS) } # Set filtering settings if not instance.get("exclude"): self._filtering_enabled = False if instance.get("include"): self.log.warning("You must specify an exclude section to enable filtering") else: self._filtering_enabled = True include = instance.get("include", []) exclude = instance.get("exclude", []) self._exclude_patterns, self._include_patterns, _filtered_tag_names = get_filters(include, exclude) self.tag_names[FILTERED] = _filtered_tag_names # Other options self.collect_image_stats = _is_affirmative(instance.get('collect_images_stats', False)) self.collect_container_size = _is_affirmative(instance.get('collect_container_size', False)) self.collect_events = _is_affirmative(instance.get('collect_events', True)) self.collect_image_size = _is_affirmative(instance.get('collect_image_size', False)) self.collect_ecs_tags = _is_affirmative(instance.get('ecs_tags', True)) and Platform.is_ecs_instance() self.ecs_tags = {} except Exception, e: self.log.critical(e) self.warning("Initialization failed. Will retry at next iteration")
def _make_fetch_state(self):
    return _SDDockerBackendConfigFetchState(
        self.docker_client.inspect_container,
        self.kubeutil.retrieve_pods_list().get('items', []) if Platform.is_k8s() else None)
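# The surrounding snippets only consume the fetch-state object through
# inspect_container(c_id) and get_kube_config(c_id, key). The class below is a rough,
# hypothetical approximation of such a wrapper, inferred purely from that usage;
# the real _SDDockerBackendConfigFetchState may be implemented differently.
class _ApproxFetchState(object):
    def __init__(self, inspect_fn, pod_list=None):
        self._inspect_fn = inspect_fn      # e.g. docker_client.inspect_container
        self._pod_list = pod_list or []    # kubelet pod list items, when on k8s
        self._inspect_cache = {}

    def inspect_container(self, c_id):
        # Cache inspect results so each container is only inspected once per run.
        if c_id not in self._inspect_cache:
            self._inspect_cache[c_id] = self._inspect_fn(c_id)
        return self._inspect_cache[c_id]

    def get_kube_config(self, c_id, key):
        # Return the requested section of the pod owning container c_id, or None.
        for pod in self._pod_list:
            statuses = pod.get('status', {}).get('containerStatuses', [])
            if any(c_id in s.get('containerID', '') for s in statuses):
                return pod.get(key)
        return None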
def agent_status():
    if Platform.is_windows():
        return service_manager_status()
    else:
        return osx_manager_status()
from config import (get_confd_path, get_config, get_config_path,
                    get_logging_config, get_version)
from util import yLoader
from utils.flare import Flare
from utils.platform import Platform

# Constants describing the agent state
AGENT_RUNNING = 0
AGENT_START_PENDING = 1
AGENT_STOP_PENDING = 2
AGENT_STOPPED = 3
AGENT_UNKNOWN = 4

# Windows management
# Import Windows stuff only on Windows
if Platform.is_windows():
    import pywintypes
    import winerror
    import win32serviceutil
    import win32service

    # project
    from utils.pidfile import PidFile

    WIN_STATUS_TO_AGENT = {
        win32service.SERVICE_RUNNING: AGENT_RUNNING,
        win32service.SERVICE_START_PENDING: AGENT_START_PENDING,
        win32service.SERVICE_STOP_PENDING: AGENT_STOP_PENDING,
        win32service.SERVICE_STOPPED: AGENT_STOPPED,
    }
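# Hypothetical helper (not part of the source above, only usable on Windows where the
# imports above succeeded): a sketch of how WIN_STATUS_TO_AGENT could translate a
# Windows service query into one of the AGENT_* constants. The service name is an
# assumption for illustration only.
def windows_agent_state(service_name="DatadogAgent"):
    try:
        status = win32serviceutil.QueryServiceStatus(service_name)
    except pywintypes.error:
        return AGENT_UNKNOWN
    # status[1] holds dwCurrentState (SERVICE_RUNNING, SERVICE_STOPPED, ...)
    return WIN_STATUS_TO_AGENT.get(status[1], AGENT_UNKNOWN)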
def check(self, agentConfig):
    """Capture io stats.

    @rtype dict
    @return {"device": {"metric": value, "metric": value}, ...}
    """
    io = {}
    try:
        if Platform.is_linux():
            stdout, _, _ = get_subprocess_output(
                ['iostat', '-d', '1', '2', '-x', '-k'],
                self.logger
            )

            # Linux 2.6.32-343-ec2 (ip-10-35-95-10)  12/11/2012  _x86_64_  (2 CPU)
            #
            # Device: rrqm/s wrqm/s  r/s   w/s   rkB/s  wkB/s  avgrq-sz avgqu-sz await svctm %util
            # sda1    0.00   17.61   0.26  32.63 4.23   201.04 12.48    0.16     4.81  0.53  1.73
            # sdb     0.00   2.68    0.19  3.84  5.79   26.07  15.82    0.02     4.93  0.22  0.09
            # sdg     0.00   0.13    2.29  3.84  100.53 30.61  42.78    0.05     8.41  0.88  0.54
            # sdf     0.00   0.13    2.30  3.84  100.54 30.61  42.78    0.06     9.12  0.90  0.55
            # md0     0.00   0.00    0.05  3.37  1.41   30.01  18.35    0.00     0.00  0.00  0.00
            #
            # Device: rrqm/s wrqm/s  r/s   w/s   rkB/s  wkB/s  avgrq-sz avgqu-sz await svctm %util
            # sda1    0.00   0.00    0.00  10.89 0.00   43.56  8.00     0.03     2.73  2.73  2.97
            # sdb     0.00   0.00    0.00  2.97  0.00   11.88  8.00     0.00     0.00  0.00  0.00
            # sdg     0.00   0.00    0.00  0.00  0.00   0.00   0.00     0.00     0.00  0.00  0.00
            # sdf     0.00   0.00    0.00  0.00  0.00   0.00   0.00     0.00     0.00  0.00  0.00
            # md0     0.00   0.00    0.00  0.00  0.00   0.00   0.00     0.00     0.00  0.00  0.00
            io.update(self._parse_linux2(stdout))

        elif sys.platform == "sunos5":
            output, _, _ = get_subprocess_output(
                ["iostat", "-x", "-d", "1", "2"],
                self.logger
            )
            iostat = output.splitlines()

            #                 extended device statistics <-- since boot
            # device    r/s   w/s    kr/s   kw/s   wait actv svc_t %w %b
            # ramdisk1  0.0   0.0    0.1    0.1    0.0  0.0  0.0   0  0
            # sd0       0.0   0.0    0.0    0.0    0.0  0.0  0.0   0  0
            # sd1       79.9  149.9  1237.6 6737.9 0.0  0.5  2.3   0  11
            #                 extended device statistics <-- past second
            # device    r/s   w/s    kr/s   kw/s   wait actv svc_t %w %b
            # ramdisk1  0.0   0.0    0.0    0.0    0.0  0.0  0.0   0  0
            # sd0       0.0   0.0    0.0    0.0    0.0  0.0  0.0   0  0
            # sd1       0.0   139.0  0.0    1850.6 0.0  0.0  0.1   0  1

            # discard the first half of the display (stats since boot)
            lines = [l for l in iostat if len(l) > 0]
            lines = lines[len(lines) / 2:]

            assert "extended device statistics" in lines[0]
            headers = lines[1].split()
            assert "device" in headers
            for l in lines[2:]:
                cols = l.split()
                # cols[0] is the device
                # cols[1:] are the values
                io[cols[0]] = {}
                for i in range(1, len(cols)):
                    io[cols[0]][self.xlate(headers[i], "sunos")] = cols[i]

        elif sys.platform.startswith("freebsd"):
            output, _, _ = get_subprocess_output(
                ["iostat", "-x", "-d", "1", "2"],
                self.logger
            )
            iostat = output.splitlines()

            # Be careful!
            # It looks like SunOS, but some columns (wait, svc_t) have different meaning
            #                extended device statistics
            # device  r/s  w/s  kr/s  kw/s  wait svc_t %b
            # ad0     3.1  1.3  49.9  18.8  0    0.7   0
            #                extended device statistics
            # device  r/s  w/s  kr/s  kw/s  wait svc_t %b
            # ad0     0.0  2.0  0.0   31.8  0    0.2   0

            # discard the first half of the display (stats since boot)
            lines = [l for l in iostat if len(l) > 0]
            lines = lines[len(lines) / 2:]

            assert "extended device statistics" in lines[0]
            headers = lines[1].split()
            assert "device" in headers
            for l in lines[2:]:
                cols = l.split()
                # cols[0] is the device
                # cols[1:] are the values
                io[cols[0]] = {}
                for i in range(1, len(cols)):
                    io[cols[0]][self.xlate(headers[i], "freebsd")] = cols[i]

        elif sys.platform == 'darwin':
            iostat, _, _ = get_subprocess_output(
                ['iostat', '-d', '-c', '2', '-w', '1'],
                self.logger
            )
            #          disk0           disk1          <-- number of disks
            #    KB/t tps  MB/s     KB/t tps  MB/s
            #   21.11  23  0.47    20.01   0  0.00
            #    6.67   3  0.02     0.00   0  0.00   <-- line of interest
            io = self._parse_darwin(iostat)

        else:
            return False

        # If we filter devices, do it now.
        device_blacklist_re = agentConfig.get('device_blacklist_re', None)
        if device_blacklist_re:
            filtered_io = {}
            for device, stats in io.iteritems():
                if not device_blacklist_re.match(device):
                    filtered_io[device] = stats
        else:
            filtered_io = io
        return filtered_io

    except Exception:
        self.logger.exception("Cannot extract IO statistics")
        return False
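# Illustrative only: the check above expects agentConfig['device_blacklist_re'] to be a
# pre-compiled regex object (it calls .match() on it). A raw pattern from the config
# could be turned into that object along these lines; the example pattern is an
# assumption, not taken from the source.
import re

agentConfig = {}
raw_pattern = r'^(ram|loop|fd)'  # hypothetical pattern excluding ramdisk, loop and floppy devices
agentConfig['device_blacklist_re'] = re.compile(raw_pattern)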
def init(self):
    try:
        instance = self.instances[0]

        # We configure the check with the right cgroup settings for this host
        # Just needs to be done once
        self.docker_util = DockerUtil()
        self.docker_client = self.docker_util.client
        if self.is_k8s():
            self.kubeutil = KubeUtil()

        self._mountpoints = self.docker_util.get_mountpoints(CGROUP_METRICS)
        self.cgroup_listing_retries = 0
        self._latest_size_query = 0
        self._filtered_containers = set()
        self._disable_net_metrics = False

        # Set tagging options
        self.custom_tags = instance.get("tags", [])
        self.collect_labels_as_tags = instance.get("collect_labels_as_tags", [])
        self.kube_labels = {}

        self.use_histogram = _is_affirmative(instance.get('use_histogram', False))
        performance_tags = instance.get("performance_tags", DEFAULT_PERFORMANCE_TAGS)

        self.tag_names = {
            CONTAINER: instance.get("container_tags", DEFAULT_CONTAINER_TAGS),
            PERFORMANCE: performance_tags,
            IMAGE: instance.get('image_tags', DEFAULT_IMAGE_TAGS)
        }

        # Set filtering settings
        if not instance.get("exclude"):
            self._filtering_enabled = False
            if instance.get("include"):
                self.log.warning("You must specify an exclude section to enable filtering")
        else:
            self._filtering_enabled = True
            include = instance.get("include", [])
            exclude = instance.get("exclude", [])
            self._exclude_patterns, self._include_patterns, _filtered_tag_names = get_filters(include, exclude)
            self.tag_names[FILTERED] = _filtered_tag_names

        # Other options
        self.collect_image_stats = _is_affirmative(instance.get('collect_images_stats', False))
        self.collect_container_size = _is_affirmative(instance.get('collect_container_size', False))
        self.collect_events = _is_affirmative(instance.get('collect_events', True))
        self.collect_image_size = _is_affirmative(instance.get('collect_image_size', False))
        self.collect_disk_stats = _is_affirmative(instance.get('collect_disk_stats', False))
        self.collect_ecs_tags = _is_affirmative(instance.get('ecs_tags', True)) and Platform.is_ecs_instance()
        self.ecs_tags = {}

    except Exception as e:
        self.log.critical(e)
        self.warning("Initialization failed. Will retry at next iteration")
    else:
        self.init_success = True
def __init__(self, parent=None):
    log_conf = get_logging_config()

    QSplitter.__init__(self, parent)
    self.setWindowTitle(MAIN_WINDOW_TITLE)
    self.setWindowIcon(get_icon("agent.svg"))

    self.sysTray = SystemTray(self)
    self.connect(self.sysTray, SIGNAL("activated(QSystemTrayIcon::ActivationReason)"),
                 self.__icon_activated)

    checks = get_checks()
    datadog_conf = DatadogConf(get_config_path())
    self.create_logs_files_windows(log_conf)

    listwidget = QListWidget(self)
    listwidget.addItems([osp.basename(check.module_name).replace("_", " ").title()
                         for check in checks])

    self.properties = PropertiesWidget(self)

    self.setting_button = QPushButton(get_icon("info.png"), "Logs and Status", self)
    self.menu_button = QPushButton(get_icon("settings.png"), "Actions", self)
    self.settings = [
        ("Forwarder Logs", lambda: [self.properties.set_log_file(self.forwarder_log_file),
                                    self.show_html(self.properties.group_code,
                                                   self.properties.html_window, False)]),
        ("Collector Logs", lambda: [self.properties.set_log_file(self.collector_log_file),
                                    self.show_html(self.properties.group_code,
                                                   self.properties.html_window, False)]),
        ("Dogstatsd Logs", lambda: [self.properties.set_log_file(self.dogstatsd_log_file),
                                    self.show_html(self.properties.group_code,
                                                   self.properties.html_window, False)]),
        ("JMX Fetch Logs", lambda: [self.properties.set_log_file(self.jmxfetch_log_file),
                                    self.show_html(self.properties.group_code,
                                                   self.properties.html_window, False)]),
    ]

    if Platform.is_windows():
        self.settings.extend([
            ("Service Logs", lambda: [self.properties.set_log_file(self.service_log_file),
                                      self.show_html(self.properties.group_code,
                                                     self.properties.html_window, False)]),
        ])

    self.settings.extend([
        ("Agent Status", lambda: [self.properties.html_window.setHtml(
                                      self.properties.html_window.latest_status()),
                                  self.show_html(self.properties.group_code,
                                                 self.properties.html_window, True),
                                  self.properties.set_status()]),
    ])

    self.agent_settings = QPushButton(get_icon("edit.png"), "Settings", self)
    self.connect(self.agent_settings, SIGNAL("clicked()"),
                 lambda: [self.properties.set_datadog_conf(datadog_conf),
                          self.show_html(self.properties.group_code,
                                         self.properties.html_window, False)])

    self.setting_menu = SettingMenu(self.settings)
    self.connect(self.setting_button, SIGNAL("clicked()"),
                 lambda: self.setting_menu.popup(
                     self.setting_button.mapToGlobal(QPoint(0, 0))))

    self.manager_menu = Menu(self)
    self.connect(self.menu_button, SIGNAL("clicked()"),
                 lambda: self.manager_menu.popup(
                     self.menu_button.mapToGlobal(QPoint(0, 0))))

    holdingBox = QGroupBox("", self)
    Box = QVBoxLayout(self)
    Box.addWidget(self.agent_settings)
    Box.addWidget(self.setting_button)
    Box.addWidget(self.menu_button)
    Box.addWidget(listwidget)
    holdingBox.setLayout(Box)

    self.addWidget(holdingBox)
    self.addWidget(self.properties)

    self.connect(self.properties.enable_button, SIGNAL("clicked()"),
                 lambda: enable_check(self.properties))
    self.connect(self.properties.disable_button, SIGNAL("clicked()"),
                 lambda: disable_check(self.properties))
    self.connect(self.properties.save_button, SIGNAL("clicked()"),
                 lambda: save_file(self.properties))
    self.connect(self.properties.refresh_button, SIGNAL("clicked()"),
                 lambda: [self.properties.set_log_file(self.properties.current_file),
                          self.properties.html_window.setHtml(
                              self.properties.html_window.latest_status())])

    self.connect(listwidget, SIGNAL('currentRowChanged(int)'),
                 lambda row: [self.properties.set_item(checks[row]),
                              self.show_html(self.properties.group_code,
                                             self.properties.html_window, False)])

    listwidget.setCurrentRow(0)

    self.setSizes([150, 1])
    self.setStretchFactor(1, 1)
    self.resize(QSize(950, 600))
    self.properties.set_datadog_conf(datadog_conf)

    self.do_refresh()
def _get_tags(self, entity=None, tag_type=None):
    """Generate the tags for a given entity (container or image) according to a list of tag names."""
    # Start with custom tags
    tags = list(self.custom_tags)

    # Collect pod names as tags on kubernetes
    if Platform.is_k8s() and KubeUtil.POD_NAME_LABEL not in self.collect_labels_as_tags:
        self.collect_labels_as_tags.append(KubeUtil.POD_NAME_LABEL)

    if entity is not None:
        pod_name = None

        # Get labels as tags
        labels = entity.get("Labels")
        if labels is not None:
            for k in self.collect_labels_as_tags:
                if k in labels:
                    v = labels[k]
                    if k == KubeUtil.POD_NAME_LABEL and Platform.is_k8s():
                        pod_name = v
                        k = "pod_name"
                        if "-" in pod_name:
                            replication_controller = "-".join(pod_name.split("-")[:-1])
                            if "/" in replication_controller:  # k8s <= 1.1
                                namespace, replication_controller = replication_controller.split("/", 1)
                            elif KubeUtil.NAMESPACE_LABEL in labels:  # k8s >= 1.2
                                namespace = labels[KubeUtil.NAMESPACE_LABEL]
                                pod_name = "{0}/{1}".format(namespace, pod_name)

                            tags.append("kube_namespace:%s" % namespace)
                            tags.append("kube_replication_controller:%s" % replication_controller)
                            tags.append("pod_name:%s" % pod_name)
                    elif not v:
                        tags.append(k)
                    else:
                        tags.append("%s:%s" % (k, v))
                if k == KubeUtil.POD_NAME_LABEL and Platform.is_k8s() and k not in labels:
                    tags.append("pod_name:no_pod")

        # Get entity specific tags
        if tag_type is not None:
            tag_names = self.tag_names[tag_type]
            for tag_name in tag_names:
                tag_value = self._extract_tag_value(entity, tag_name)
                if tag_value is not None:
                    for t in tag_value:
                        tags.append('%s:%s' % (tag_name, str(t).strip()))

        # Add ECS tags
        if self.collect_ecs_tags:
            entity_id = entity.get("Id")
            if entity_id in self.ecs_tags:
                ecs_tags = self.ecs_tags[entity_id]
                tags.extend(ecs_tags)

        # Add kube labels
        if Platform.is_k8s():
            kube_tags = self.kube_labels.get(pod_name)
            if kube_tags:
                tags.extend(list(kube_tags))

    return tags
def get_tags(self, state, c_id):
    """Extract useful tags from docker or platform APIs. These are collected by default."""
    c_inspect = state.inspect_container(c_id)
    tags = self.dockerutil.extract_container_tags(c_inspect, self.docker_labels_as_tags)

    if Platform.is_k8s():
        if not self.kubeutil.init_success:
            log.warning("kubelet client not initialized, kubernetes tags will be missing.")
            return tags

        pod_metadata = state.get_kube_config(c_id, 'metadata')

        if pod_metadata is None:
            log.warning("Failed to fetch pod metadata for container %s."
                        " Kubernetes tags will be missing." % c_id[:12])
            return tags

        # get pod labels
        kube_labels = pod_metadata.get('labels', {})
        for label, value in kube_labels.iteritems():
            tags.append('%s:%s' % (label, value))

        # get kubernetes namespace
        namespace = pod_metadata.get('namespace')
        tags.append('kube_namespace:%s' % namespace)

        if not self.kubeutil:
            log.warning("The agent can't connect to kubelet, creator and "
                        "service tags will be missing for container %s." % c_id[:12])
        else:
            # add creator tags
            creator_tags = self.kubeutil.get_pod_creator_tags(pod_metadata)
            tags.extend(creator_tags)

            # add services tags
            if self.kubeutil.collect_service_tag:
                services = self.kubeutil.match_services_for_pod(pod_metadata)
                for s in services:
                    if s is not None:
                        tags.append('kube_service:%s' % s)

    elif Platform.is_swarm():
        c_labels = c_inspect.get('Config', {}).get('Labels', {})
        swarm_svc = c_labels.get(SWARM_SVC_LABEL)
        if swarm_svc:
            tags.append('swarm_service:%s' % swarm_svc)

    elif Platform.is_rancher():
        service_name = c_inspect.get('Config', {}).get('Labels', {}).get(RANCHER_SVC_NAME)
        stack_name = c_inspect.get('Config', {}).get('Labels', {}).get(RANCHER_STACK_NAME)
        container_name = c_inspect.get('Config', {}).get('Labels', {}).get(RANCHER_CONTAINER_NAME)
        if service_name:
            tags.append('rancher_service:%s' % service_name)
        if stack_name:
            tags.append('rancher_stack:%s' % stack_name)
        if container_name:
            tags.append('rancher_container:%s' % container_name)

    if self.metadata_collector.has_detected():
        orch_tags = self.metadata_collector.get_container_tags(co=c_inspect)
        tags.extend(orch_tags)

    return tags