def test_docker_are_tags_filtered(self):
    """Check that pause-container image tags are filtered when running on Kubernetes.

    `DockerUtil.is_k8s` is patched to report True for the duration of the test,
    and the `DockerUtil` instance is rebuilt via `_drop()` under that patch.
    """
    with mock.patch.object(DockerUtil, 'is_k8s', return_value=True):
        DockerUtil._drop()
        docker_util = DockerUtil()
        self.assertTrue(docker_util.is_k8s())

        pause_containers = [
            "docker_image:gcr.io/google_containers/pause-amd64:0.3.0",
            "docker_image:asia.gcr.io/google_containers/pause-amd64:3.0",
            "docker_image:k8s.gcr.io/pause-amd64:latest",
            "image_name:openshift/origin-pod",
            "image_name:kubernetes/pause",
        ]

        # Each pause image tag, taken on its own, must be filtered out.
        for pause_tag in pause_containers:
            is_filtered = docker_util.are_tags_filtered([pause_tag])
            self.assertTrue(is_filtered)

        # Guard against the loop above passing trivially on an empty list.
        self.assertTrue(pause_containers)

        # Regular images must not be caught by the pause-container filter.
        for regular_tag in ("docker_image:quay.io/coreos/etcd:latest", "image_name:redis"):
            self.assertFalse(docker_util.are_tags_filtered([regular_tag]))
class DockerDaemon(AgentCheck):
    """Collect metrics and events from Docker API and cgroups."""

    def __init__(self, name, init_config, agentConfig, instances=None):
        """Build the check; only a single configured instance is supported.

        Raises:
            Exception: if more than one instance is configured.
        """
        if instances is not None and len(instances) > 1:
            raise Exception("Docker check only supports one configured instance.")
        AgentCheck.__init__(self, name, init_config, agentConfig, instances=instances)

        self.init_success = False
        self._service_discovery = (
            agentConfig.get('service_discovery') and
            agentConfig.get('service_discovery_backend') == 'docker'
        )
        self.init()

    def init(self):
        """Read the instance configuration and set up the Docker helpers.

        Any failure is logged and leaves `init_success` False so that `check`
        can retry on the next run.
        """
        try:
            instance = self.instances[0]

            self.docker_util = DockerUtil()
            self.docker_client = self.docker_util.client
            self.docker_gateway = DockerUtil.get_gateway()

            if Platform.is_k8s():
                self.kubeutil = KubeUtil()

            # We configure the check with the right cgroup settings for this host
            # Just needs to be done once
            self._mountpoints = self.docker_util.get_mountpoints(CGROUP_METRICS)
            self.cgroup_listing_retries = 0
            self._latest_size_query = 0
            self._filtered_containers = set()
            self._disable_net_metrics = False

            # Set tagging options
            self.custom_tags = instance.get("tags", [])
            self.collect_labels_as_tags = instance.get("collect_labels_as_tags", [])
            self.kube_labels = {}

            self.use_histogram = _is_affirmative(instance.get('use_histogram', False))
            performance_tags = instance.get("performance_tags", DEFAULT_PERFORMANCE_TAGS)

            self.tag_names = {
                CONTAINER: instance.get("container_tags", DEFAULT_CONTAINER_TAGS),
                PERFORMANCE: performance_tags,
                IMAGE: instance.get('image_tags', DEFAULT_IMAGE_TAGS)
            }

            # Set filtering settings
            if self.docker_util.filtering_enabled:
                self.tag_names[FILTERED] = self.docker_util.filtered_tag_names

            # Get the health check whitelist. The attribute is always defined
            # because check() reads it even when no whitelist is configured.
            self.whitelist_patterns = None
            health_scs_whitelist = instance.get('health_service_check_whitelist', [])
            if health_scs_whitelist:
                patterns, whitelist_tags = compile_filter_rules(health_scs_whitelist)
                self.whitelist_patterns = set(patterns)
                self.tag_names[HEALTHCHECK] = set(whitelist_tags)

            # Other options
            self.collect_image_stats = _is_affirmative(instance.get('collect_images_stats', False))
            self.collect_container_size = _is_affirmative(instance.get('collect_container_size', False))
            self.collect_events = _is_affirmative(instance.get('collect_events', True))
            self.collect_image_size = _is_affirmative(instance.get('collect_image_size', False))
            self.collect_disk_stats = _is_affirmative(instance.get('collect_disk_stats', False))
            self.collect_ecs_tags = _is_affirmative(instance.get('ecs_tags', True)) and Platform.is_ecs_instance()

            self.ecs_tags = {}

        except Exception as e:
            self.log.critical(e)
            self.warning("Initialization failed. Will retry at next iteration")
        else:
            self.init_success = True

    def check(self, instance):
        """Run the Docker check for one instance."""
        if not self.init_success:
            # Initialization can fail if cgroups are not ready. So we retry if needed
            # https://github.com/DataDog/dd-agent/issues/1896
            self.init()
            if not self.init_success:
                # Initialization failed, will try later
                return

        # Report image metrics
        if self.collect_image_stats:
            self._count_and_weigh_images()

        if self.collect_ecs_tags:
            self.refresh_ecs_tags()

        if Platform.is_k8s():
            try:
                self.kube_labels = self.kubeutil.get_kube_labels()
            except Exception as e:
                self.log.warning('Could not retrieve kubernetes labels: %s' % str(e))
                self.kube_labels = {}

        # containers running with custom cgroups?
        custom_cgroups = _is_affirmative(instance.get('custom_cgroups', False))

        # Get the list of containers and the index of their names
        health_service_checks = bool(self.whitelist_patterns)
        containers_by_id = self._get_and_count_containers(custom_cgroups, health_service_checks)
        containers_by_id = self._crawl_container_pids(containers_by_id, custom_cgroups)

        # Send events from Docker API
        if self.collect_events or self._service_discovery:
            self._process_events(containers_by_id)

        # Report performance container metrics (cpu, mem, net, io)
        self._report_performance_metrics(containers_by_id)

        if self.collect_container_size:
            self._report_container_size(containers_by_id)

        # Collect disk stats from Docker info command
        if self.collect_disk_stats:
            self._report_disk_stats()

        if health_service_checks:
            self._send_container_healthcheck_sc(containers_by_id)

    def _count_and_weigh_images(self):
        """Report the number of available/intermediate images and, optionally, their sizes."""
        try:
            tags = self._get_tags()
            active_images = self.docker_client.images(all=False)
            active_images_len = len(active_images)
            all_images_len = len(self.docker_client.images(quiet=True, all=True))
            self.gauge("docker.images.available", active_images_len, tags=tags)
            self.gauge("docker.images.intermediate", (all_images_len - active_images_len), tags=tags)

            if self.collect_image_size:
                self._report_image_size(active_images)

        except Exception as e:
            # It's not an important metric, keep going if it fails
            self.warning("Failed to count Docker images. Exception: {0}".format(e))

    def _get_and_count_containers(self, custom_cgroups=False, healthchecks=False):
        """List all the containers from the API, filter and count them.

        Returns a dict of the non-excluded containers keyed by container id.
        Raises an Exception (after sending a CRITICAL service check) when the
        container listing fails.
        """
        # Querying the size of containers is slow, we don't do it at each run
        must_query_size = self.collect_container_size and self._latest_size_query == 0
        self._latest_size_query = (self._latest_size_query + 1) % SIZE_REFRESH_RATE

        running_containers_count = Counter()
        all_containers_count = Counter()

        try:
            containers = self.docker_client.containers(all=True, size=must_query_size)
        except Exception as e:
            message = "Unable to list Docker containers: {0}".format(e)
            self.service_check(SERVICE_CHECK_NAME, AgentCheck.CRITICAL, message=message)
            raise Exception(message)
        else:
            self.service_check(SERVICE_CHECK_NAME, AgentCheck.OK)

        # Create a set of filtered containers based on the exclude/include rules
        # and cache these rules in docker_util
        self._filter_containers(containers)

        containers_by_id = {}

        for container in containers:
            container_name = DockerUtil.container_name_extractor(container)[0]

            # Counters are keyed by the sorted tag tuple so containers sharing
            # the same tag set are aggregated together.
            status_key = tuple(sorted(self._get_tags(container, CONTAINER)))

            all_containers_count[status_key] += 1
            if self._is_container_running(container):
                running_containers_count[status_key] += 1

            # Check if the container is included/excluded via its tags
            if self._is_container_excluded(container):
                self.log.debug("Container {0} is excluded".format(container_name))
                continue

            containers_by_id[container['Id']] = container

            # grab pid via API if custom cgroups - otherwise we won't find process when
            # crawling for pids.
            if custom_cgroups or healthchecks:
                try:
                    inspect_dict = self.docker_client.inspect_container(container_name)
                    container['_pid'] = inspect_dict['State']['Pid']
                    container['health'] = inspect_dict['State'].get('Health', {})
                except Exception as e:
                    self.log.debug("Unable to inspect Docker container: %s", e)

        for tags, count in running_containers_count.iteritems():
            self.gauge("docker.containers.running", count, tags=list(tags))

        for tags, count in all_containers_count.iteritems():
            stopped_count = count - running_containers_count[tags]
            self.gauge("docker.containers.stopped", stopped_count, tags=list(tags))

        return containers_by_id

    def _is_container_running(self, container):
        """Tell if a container is running, according to its status.

        There is no "nice" API field to figure it out. We just look at the "Status" field, knowing how it is generated.
        See: https://github.com/docker/docker/blob/v1.6.2/daemon/state.go#L35
        """
        return container["Status"].startswith("Up") or container["Status"].startswith("Restarting")

    def _get_tags(self, entity=None, tag_type=None):
        """Generate the tags for a given entity (container or image) according to a list of tag names."""
        # Start with custom tags
        tags = list(self.custom_tags)

        # Collect pod names as tags on kubernetes
        if Platform.is_k8s() and KubeUtil.POD_NAME_LABEL not in self.collect_labels_as_tags:
            self.collect_labels_as_tags.append(KubeUtil.POD_NAME_LABEL)

        if entity is not None:
            pod_name = None

            # Get labels as tags
            labels = entity.get("Labels")
            if labels is not None:
                for k in self.collect_labels_as_tags:
                    if k in labels:
                        v = labels[k]
                        if k == KubeUtil.POD_NAME_LABEL and Platform.is_k8s():
                            pod_name = v
                            k = "pod_name"
                            if "-" in pod_name:
                                replication_controller = "-".join(pod_name.split("-")[:-1])
                                if "/" in replication_controller:  # k8s <= 1.1
                                    namespace, replication_controller = replication_controller.split("/", 1)

                                elif KubeUtil.NAMESPACE_LABEL in labels:  # k8s >= 1.2
                                    namespace = labels[KubeUtil.NAMESPACE_LABEL]
                                    pod_name = "{0}/{1}".format(namespace, pod_name)

                                # NOTE(review): `namespace` is unbound here when neither
                                # branch above matched - confirm that cannot happen.
                                tags.append("kube_namespace:%s" % namespace)
                                tags.append("kube_replication_controller:%s" % replication_controller)
                                tags.append("pod_name:%s" % pod_name)

                        elif not v:
                            tags.append(k)

                        else:
                            tags.append("%s:%s" % (k, v))

                    if k == KubeUtil.POD_NAME_LABEL and Platform.is_k8s() and k not in labels:
                        tags.append("pod_name:no_pod")

            # Get entity specific tags
            if tag_type is not None:
                tag_names = self.tag_names[tag_type]
                for tag_name in tag_names:
                    tag_value = self._extract_tag_value(entity, tag_name)
                    if tag_value is not None:
                        for t in tag_value:
                            tags.append('%s:%s' % (tag_name, str(t).strip()))

            # Add ECS tags
            if self.collect_ecs_tags:
                entity_id = entity.get("Id")
                if entity_id in self.ecs_tags:
                    ecs_tags = self.ecs_tags[entity_id]
                    tags.extend(ecs_tags)

            # Add kube labels
            if Platform.is_k8s():
                kube_tags = self.kube_labels.get(pod_name)
                if kube_tags:
                    tags.extend(list(kube_tags))

        return tags

    def _extract_tag_value(self, entity, tag_name):
        """Extract tag information from the API result (containers or images).

        Cache extracted tags inside the entity object. Returns None (after a
        warning) when `tag_name` has no registered extractor.
        """
        if tag_name not in TAG_EXTRACTORS:
            self.warning("{0} isn't a supported tag".format(tag_name))
            return

        # Check for already extracted tags
        if "_tag_values" not in entity:
            entity["_tag_values"] = {}

        if tag_name not in entity["_tag_values"]:
            entity["_tag_values"][tag_name] = TAG_EXTRACTORS[tag_name](entity)

        return entity["_tag_values"][tag_name]

    def refresh_ecs_tags(self):
        """Rebuild `self.ecs_tags` (container id -> task tags) from the ECS agent tasks endpoint."""
        ecs_config = self.docker_client.inspect_container('ecs-agent')
        ip = ecs_config.get('NetworkSettings', {}).get('IPAddress')
        ports = ecs_config.get('NetworkSettings', {}).get('Ports')
        port = ports.keys()[0].split('/')[0] if ports else None
        if not ip:
            # No IP reported by the inspect call: fall back to the default
            # introspection port, reached through the gateway or localhost.
            port = ECS_INTROSPECT_DEFAULT_PORT
            if Platform.is_containerized() and self.docker_gateway:
                ip = self.docker_gateway
            else:
                ip = "localhost"

        ecs_tags = {}
        try:
            if ip and port:
                tasks = requests.get('http://%s:%s/v1/tasks' % (ip, port)).json()
                for task in tasks.get('Tasks', []):
                    for container in task.get('Containers', []):
                        tags = ['task_name:%s' % task['Family'], 'task_version:%s' % task['Version']]
                        ecs_tags[container['DockerId']] = tags
        # NOTE(review): only HTTPError is handled; connection failures still
        # propagate to the caller - confirm whether that is intended.
        except requests.exceptions.HTTPError as e:
            self.log.warning("Unable to collect ECS task names: %s" % e)

        self.ecs_tags = ecs_tags

    def _filter_containers(self, containers):
        """Recompute the set of filtered container names when filtering is enabled."""
        if not self.docker_util.filtering_enabled:
            return

        self._filtered_containers = set()
        for container in containers:
            container_tags = self._get_tags(container, FILTERED)
            # exclude/include patterns are stored in docker_util to share them with other container-related checks
            if self.docker_util.are_tags_filtered(container_tags):
                container_name = DockerUtil.container_name_extractor(container)[0]
                self._filtered_containers.add(container_name)
                self.log.debug("Container {0} is filtered".format(container_name))

    def _is_container_excluded(self, container):
        """Check if a container is excluded according to the filter rules.

        Requires _filter_containers to run first.
        """
        container_name = DockerUtil.container_name_extractor(container)[0]
        return container_name in self._filtered_containers

    def _report_container_size(self, containers_by_id):
        """Report the writable-layer and root filesystem sizes when present on the container."""
        for container in containers_by_id.itervalues():
            if self._is_container_excluded(container):
                continue

            tags = self._get_tags(container, PERFORMANCE)
            m_func = FUNC_MAP[GAUGE][self.use_histogram]
            if "SizeRw" in container:
                m_func(self, 'docker.container.size_rw', container['SizeRw'], tags=tags)
            if "SizeRootFs" in container:
                m_func(self, 'docker.container.size_rootfs', container['SizeRootFs'], tags=tags)

    def _send_container_healthcheck_sc(self, containers_by_id):
        """Send health service checks for containers."""
        for container in containers_by_id.itervalues():
            healthcheck_tags = self._get_tags(container, HEALTHCHECK)
            # One service check per container, as soon as any tag matches any rule.
            matched = any(
                re.match(rule, tag)
                for tag in healthcheck_tags
                for rule in self.whitelist_patterns
            )
            if matched:
                self._submit_healthcheck_sc(container)

    def _submit_healthcheck_sc(self, container):
        """Map the container health status to a service check status and submit it."""
        health = container.get('health', {})
        status = AgentCheck.UNKNOWN
        if health:
            _health = health.get('Status', '')
            if _health == 'unhealthy':
                status = AgentCheck.CRITICAL
            elif _health == 'healthy':
                status = AgentCheck.OK

        tags = self._get_tags(container, CONTAINER)
        self.service_check(HEALTHCHECK_SERVICE_CHECK_NAME, status, tags=tags)

    def _report_image_size(self, images):
        """Report the virtual size and size of each image when those fields are present."""
        for image in images:
            tags = self._get_tags(image, IMAGE)
            if 'VirtualSize' in image:
                self.gauge('docker.image.virtual_size', image['VirtualSize'], tags=tags)
            if 'Size' in image:
                self.gauge('docker.image.size', image['Size'], tags=tags)

    # Performance metrics

    def _report_performance_metrics(self, containers_by_id):
        """Report cgroup and network metrics for each running, non-excluded container."""
        containers_without_proc_root = []
        for container in containers_by_id.itervalues():
            if self._is_container_excluded(container) or not self._is_container_running(container):
                continue

            tags = self._get_tags(container, PERFORMANCE)
            self._report_cgroup_metrics(container, tags)
            if "_proc_root" not in container:
                containers_without_proc_root.append(DockerUtil.container_name_extractor(container)[0])
                continue
            self._report_net_metrics(container, tags)

        if containers_without_proc_root:
            message = "Couldn't find pid directory for containers: {0}. They'll be missing network metrics".format(
                ", ".join(containers_without_proc_root))
            if not Platform.is_k8s():
                self.warning(message)
            else:
                # On kubernetes, this is kind of expected. Network metrics will be collected by the kubernetes integration anyway
                self.log.debug(message)

    def _report_cgroup_metrics(self, container, tags):
        """Read the cgroup stat files of a container and submit the configured metrics.

        A MountException is tolerated (with a warning) until the retry budget
        is exhausted, after which it is re-raised.
        """
        try:
            for cgroup in CGROUP_METRICS:
                stat_file = self._get_cgroup_from_proc(cgroup["cgroup"], container['_pid'], cgroup['file'])
                stats = self._parse_cgroup_file(stat_file)
                if stats:
                    for key, (dd_key, metric_func) in cgroup['metrics'].iteritems():
                        metric_func = FUNC_MAP[metric_func][self.use_histogram]
                        if key in stats:
                            metric_func(self, dd_key, int(stats[key]), tags=tags)

                    # Computed metrics
                    for mname, (key_list, fct, metric_func) in cgroup.get('to_compute', {}).iteritems():
                        values = [stats[key] for key in key_list if key in stats]
                        if len(values) != len(key_list):
                            self.log.debug("Couldn't compute {0}, some keys were missing.".format(mname))
                            continue
                        value = fct(*values)
                        metric_func = FUNC_MAP[metric_func][self.use_histogram]
                        if value is not None:
                            metric_func(self, mname, value, tags=tags)

        except MountException:
            if self.cgroup_listing_retries > MAX_CGROUP_LISTING_RETRIES:
                # Bare raise keeps the original traceback.
                raise
            else:
                self.warning("Couldn't find the cgroup files. Skipping the CGROUP_METRICS for now."
                             "Will retry {0} times before failing.".format(MAX_CGROUP_LISTING_RETRIES - self.cgroup_listing_retries))
                self.cgroup_listing_retries += 1
        else:
            self.cgroup_listing_retries = 0

    def _report_net_metrics(self, container, tags):
        """Find container network metrics by looking at /proc/$PID/net/dev of the container process."""
        if self._disable_net_metrics:
            self.log.debug("Network metrics are disabled. Skipping")
            return

        proc_net_file = os.path.join(container['_proc_root'], 'net/dev')
        try:
            with open(proc_net_file, 'r') as fp:
                lines = fp.readlines()
                # Two first lines are headers:
                # Inter-|   Receive                                                |  Transmit
                #  face |bytes    packets errs drop fifo frame compressed multicast|bytes    packets errs drop fifo colls carrier compressed
                for l in lines[2:]:
                    cols = l.split(':', 1)
                    interface_name = str(cols[0]).strip()
                    if interface_name == 'eth0':
                        x = cols[1].split()
                        m_func = FUNC_MAP[RATE][self.use_histogram]
                        m_func(self, "docker.net.bytes_rcvd", long(x[0]), tags)
                        m_func(self, "docker.net.bytes_sent", long(x[8]), tags)
                        break
        except Exception as e:
            # It is possible that the container got stopped between the API call and now
            self.warning("Failed to report IO metrics from file {0}. Exception: {1}".format(proc_net_file, e))

    def _process_events(self, containers_by_id):
        """Fetch Docker events and, when event collection is enabled, submit them."""
        if self.collect_events is False:
            # Crawl events for service discovery only
            self._get_events()
            return
        try:
            api_events = self._get_events()
            aggregated_events = self._pre_aggregate_events(api_events, containers_by_id)
            events = self._format_events(aggregated_events, containers_by_id)
        except (socket.timeout, urllib2.URLError):
            self.warning('Timeout when collecting events. Events will be missing.')
            return
        except Exception as e:
            self.warning("Unexpected exception when collecting events: {0}. "
                         "Events will be missing".format(e))
            return

        for ev in events:
            self.log.debug("Creating event: %s" % ev['msg_title'])
            self.event(ev)

    def _get_events(self):
        """Get the list of events."""
        events, changed_container_ids = self.docker_util.get_events()
        if changed_container_ids and self._service_discovery:
            get_sd_backend(self.agentConfig).update_checks(changed_container_ids)
        return events

    def _pre_aggregate_events(self, api_events, containers_by_id):
        """Group events by image name ('from' field), newest first, skipping excluded containers."""
        # Aggregate events, one per image. Put newer events first.
        events = defaultdict(deque)
        for event in api_events:
            # Skip events related to filtered containers
            container = containers_by_id.get(event.get('id'))
            if container is not None and self._is_container_excluded(container):
                self.log.debug("Excluded event: container {0} status changed to {1}".format(
                    event['id'], event['status']))
                continue
            # from may be missing (for network events for example)
            if 'from' in event:
                events[event['from']].appendleft(event)
        return events

    def _format_events(self, aggregated_events, containers_by_id):
        """Turn per-image event groups into at most one low and one normal priority event each."""
        events = []
        for image_name, event_group in aggregated_events.iteritems():
            container_tags = set()
            low_prio_events = []
            normal_prio_events = []

            for event in event_group:
                # Default to a truncated id; replaced by the real name when the container is known.
                container_name = event['id'][:11]

                if event['id'] in containers_by_id:
                    cont = containers_by_id[event['id']]
                    container_name = DockerUtil.container_name_extractor(cont)[0]
                    container_tags.update(self._get_tags(cont, PERFORMANCE))
                    container_tags.add('container_name:%s' % container_name)

                # health checks generate tons of these so we treat them separately and lower their priority
                if event['status'].startswith('exec_create:') or event['status'].startswith('exec_start:'):
                    low_prio_events.append((event, container_name))
                else:
                    normal_prio_events.append((event, container_name))

            exec_event = self._create_dd_event(low_prio_events, image_name, container_tags, priority='Low')
            if exec_event:
                events.append(exec_event)

            normal_event = self._create_dd_event(normal_prio_events, image_name, container_tags, priority='Normal')
            if normal_event:
                events.append(normal_event)

        return events

    def _create_dd_event(self, events, image, c_tags, priority='Normal'):
        """Create the actual event to submit from a list of similar docker events.

        Returns None when `events` is empty.
        """
        if not events:
            return

        max_timestamp = 0
        status = defaultdict(int)
        status_change = []

        for ev, c_name in events:
            max_timestamp = max(max_timestamp, int(ev['time']))
            status[ev['status']] += 1
            status_change.append([c_name, ev['status']])

        status_text = ", ".join(["%d %s" % (count, st) for st, count in status.iteritems()])
        msg_title = "%s %s on %s" % (image, status_text, self.hostname)
        msg_body = (
            "%%%\n"
            "{image_name} {status} on {hostname}\n"
            "```\n{status_changes}\n```\n"
            "%%%"
        ).format(
            image_name=image,
            status=status_text,
            hostname=self.hostname,
            status_changes="\n".join(
                ["%s \t%s" % (change[1].upper(), change[0]) for change in status_change])
        )

        if any(error in status_text for error in ERROR_ALERT_TYPE):
            alert_type = "error"
        else:
            alert_type = None

        return {
            'timestamp': max_timestamp,
            'host': self.hostname,
            'event_type': EVENT_TYPE,
            'msg_title': msg_title,
            'msg_text': msg_body,
            'source_type_name': EVENT_TYPE,
            'event_object': 'docker:%s' % image,
            'tags': list(c_tags),
            'alert_type': alert_type,
            'priority': priority
        }

    def _report_disk_stats(self):
        """Report metrics about the volume space usage"""
        stats = {
            'docker.data.used': None,
            'docker.data.total': None,
            'docker.data.free': None,
            'docker.metadata.used': None,
            'docker.metadata.total': None,
            'docker.metadata.free': None
            # these two are calculated by _calc_percent_disk_stats
            # 'docker.data.percent': None,
            # 'docker.metadata.percent': None
        }
        info = self.docker_client.info()
        driver_status = info.get('DriverStatus', [])
        if not driver_status:
            self.log.warning('Disk metrics collection is enabled but docker info did not'
                             ' report any. Your storage driver might not support them, skipping.')
            return
        for metric in driver_status:
            # only consider metrics about disk space
            if len(metric) == 2 and 'Space' in metric[0]:
                # identify Data and Metadata metrics
                mtype = 'data'
                if 'Metadata' in metric[0]:
                    mtype = 'metadata'

                if 'Used' in metric[0]:
                    stats['docker.{0}.used'.format(mtype)] = metric[1]
                elif 'Space Total' in metric[0]:
                    stats['docker.{0}.total'.format(mtype)] = metric[1]
                elif 'Space Available' in metric[0]:
                    stats['docker.{0}.free'.format(mtype)] = metric[1]
        stats = self._format_disk_metrics(stats)
        stats.update(self._calc_percent_disk_stats(stats))
        tags = self._get_tags()
        for name, val in stats.iteritems():
            if val is not None:
                self.gauge(name, val, tags)

    def _format_disk_metrics(self, metrics):
        """Cast the disk stats to float and convert them to bytes"""
        # Only existing keys are reassigned, so the dict size is stable while iterating.
        for name, raw_val in metrics.iteritems():
            if raw_val:
                val, unit = raw_val.split(' ')
                # by default some are uppercased others lowercased. That's error prone.
                unit = unit.lower()
                try:
                    val = int(float(val) * UNIT_MAP[unit])
                    metrics[name] = val
                except KeyError:
                    self.log.error('Unrecognized unit %s for disk metric %s. Dropping it.' % (unit, name))
                    metrics[name] = None
        return metrics

    def _calc_percent_disk_stats(self, stats):
        """Calculate a percentage of used disk space for data and metadata"""
        mtypes = ['data', 'metadata']
        percs = {}
        for mtype in mtypes:
            used = stats.get('docker.{0}.used'.format(mtype))
            total = stats.get('docker.{0}.total'.format(mtype))
            free = stats.get('docker.{0}.free'.format(mtype))
            if used and total and free and ceil(total) < free + used:
                self.log.debug('used, free, and total disk metrics may be wrong, '
                               'used: %s, free: %s, total: %s',
                               used, free, total)
                total = used + free
            try:
                if isinstance(used, int):
                    percs['docker.{0}.percent'.format(mtype)] = round(100 * float(used) / float(total), 2)
                elif isinstance(free, int):
                    percs['docker.{0}.percent'.format(mtype)] = round(100 * (1.0 - (float(free) / float(total))), 2)
            except ZeroDivisionError:
                self.log.error('docker.{0}.total is 0, calculating docker.{1}.percent'
                               ' is not possible.'.format(mtype, mtype))
        return percs

    # Cgroups

    def _get_cgroup_from_proc(self, cgroup, pid, filename):
        """Find a specific cgroup file, containing metrics to extract."""
        params = {
            "file": filename,
        }
        return DockerUtil.find_cgroup_from_proc(self._mountpoints, pid, cgroup, self.docker_util._docker_root) % (params)

    def _parse_cgroup_file(self, stat_file):
        """Parse a cgroup pseudo file for key/values.

        Returns None (after an info log) when the file cannot be opened.
        """
        self.log.debug("Opening cgroup file: %s" % stat_file)
        try:
            with open(stat_file, 'r') as fp:
                if 'blkio' in stat_file:
                    return self._parse_blkio_metrics(fp.read().splitlines())
                else:
                    return dict(map(lambda x: x.split(' ', 1), fp.read().splitlines()))
        except IOError:
            # It is possible that the container got stopped between the API call and now.
            # Some files can also be missing (like cpu.stat) and that's fine.
            self.log.info("Can't open %s. Some metrics for this container may be missing." % stat_file)

    def _parse_blkio_metrics(self, stats):
        """Parse the blkio metrics."""
        metrics = {
            'io_read': 0,
            'io_write': 0,
        }
        for line in stats:
            if 'Read' in line:
                metrics['io_read'] += int(line.split()[2])
            if 'Write' in line:
                metrics['io_write'] += int(line.split()[2])
        return metrics

    def _is_container_cgroup(self, line, selinux_policy):
        """Tell whether a split /proc/<pid>/cgroup line belongs to a container."""
        if line[1] not in ('cpu,cpuacct', 'cpuacct,cpu', 'cpuacct') or line[2] == '/docker-daemon':
            return False
        if 'docker' in line[2]:  # general case
            return True
        if 'docker' in selinux_policy:  # selinux
            return True
        if line[2].startswith('/') and re.match(CONTAINER_ID_RE, line[2][1:]):  # kubernetes
            return True
        return False

    # proc files
    def _crawl_container_pids(self, container_dict, custom_cgroups=False):
        """Crawl `/proc` to find container PIDs and add them to `containers_by_id`.

        Returns `container_dict`, with `_pid` and `_proc_root` set on the
        containers whose process could be located.
        """
        proc_path = os.path.join(self.docker_util._docker_root, 'proc')
        pid_dirs = [_dir for _dir in os.listdir(proc_path) if _dir.isdigit()]

        if len(pid_dirs) == 0:
            self.warning("Unable to find any pid directory in {0}. "
                         "If you are running the agent in a container, make sure to "
                         'share the volume properly: "/proc:/host/proc:ro". '
                         "See https://github.com/DataDog/docker-dd-agent/blob/master/README.md for more information. "
                         "Network metrics will be missing".format(proc_path))
            self._disable_net_metrics = True
            return container_dict

        self._disable_net_metrics = False

        for folder in pid_dirs:
            # Reset per-PID state so a failed read can never reuse the
            # cgroup content of the previously inspected process.
            content = []
            selinux_policy = ''
            try:
                path = os.path.join(proc_path, folder, 'cgroup')
                with open(path, 'r') as f:
                    content = [line.strip().split(':') for line in f.readlines()]

                path = os.path.join(proc_path, folder, 'attr', 'current')
                if os.path.exists(path):
                    with open(path, 'r') as f:
                        selinux_policy = f.readlines()[0]
            except IOError as e:
                #  Issue #2074
                self.log.debug("Cannot read %s, "
                               "process likely raced to finish : %s" %
                               (path, str(e)))
            except Exception as e:
                self.warning("Cannot read %s : %s" % (path, str(e)))
                continue

            try:
                for line in content:
                    if self._is_container_cgroup(line, selinux_policy):
                        cpuacct = line[2]
                        break
                else:
                    # No container cgroup line (or nothing could be read) for this PID.
                    continue

                matches = re.findall(CONTAINER_ID_RE, cpuacct)
                if matches:
                    container_id = matches[-1]
                    if container_id not in container_dict:
                        self.log.debug("Container %s not in container_dict, it's likely excluded", container_id)
                        continue
                    container_dict[container_id]['_pid'] = folder
                    container_dict[container_id]['_proc_root'] = os.path.join(proc_path, folder)
                elif custom_cgroups:  # if we match by pid that should be enough (?) - O(n) ugh!
                    for container in container_dict.itervalues():
                        if container.get('_pid') == int(folder):
                            container['_proc_root'] = os.path.join(proc_path, folder)
                            break

            except Exception as e:
                self.warning("Cannot parse %s content: %s" % (path, str(e)))
                continue

        # check() rebinds its container index to this return value.
        return container_dict
class KubeUtil:
    """Helper to query the kubelet, cAdvisor and the Kubernetes API server.

    Connection settings come from the given instance dict or, when none is
    passed, from the first instance of the Kubernetes check configuration.
    """
    __metaclass__ = Singleton

    DEFAULT_METHOD = 'http'
    MACHINE_INFO_PATH = '/api/v1.3/machine/'
    METRICS_PATH = '/api/v1.3/subcontainers/'
    PODS_LIST_PATH = '/pods/'
    DEFAULT_CADVISOR_PORT = 4194
    DEFAULT_KUBELET_PORT = 10255
    DEFAULT_MASTER_PORT = 8080
    DEFAULT_MASTER_NAME = 'kubernetes'  # DNS name to reach the master from a pod.
    CA_CRT_PATH = '/var/run/secrets/kubernetes.io/serviceaccount/ca.crt'
    AUTH_TOKEN_PATH = '/var/run/secrets/kubernetes.io/serviceaccount/token'

    POD_NAME_LABEL = "io.kubernetes.pod.name"
    NAMESPACE_LABEL = "io.kubernetes.pod.namespace"

    def __init__(self, instance=None):
        """Resolve the configuration and precompute the kubelet/cAdvisor/API URLs."""
        self.docker_util = DockerUtil()
        if instance is None:
            try:
                config_file_path = get_conf_path(KUBERNETES_CHECK_NAME)
                check_config = check_yaml(config_file_path)
                # An empty instance entry yields None; fall back to defaults.
                instance = check_config['instances'][0] or {}
            # kubernetes.yaml was not found
            except IOError as ex:
                # Log the exception itself: `.message` is empty for
                # errno-style IOErrors.
                log.error(ex)
                instance = {}
            except Exception:
                log.error('Kubernetes configuration file is invalid. '
                          'Trying connecting to kubelet with default settings anyway...')
                instance = {}

        self.method = instance.get('method', KubeUtil.DEFAULT_METHOD)
        self.host = instance.get("host") or self.docker_util.get_hostname()
        self.kubelet_host = os.environ.get('KUBERNETES_KUBELET_HOST') or self.host
        self._node_ip = self._node_name = None  # lazy evaluation
        self.host_name = os.environ.get('HOSTNAME')

        self.cadvisor_port = instance.get('port', KubeUtil.DEFAULT_CADVISOR_PORT)
        self.kubelet_port = instance.get('kubelet_port', KubeUtil.DEFAULT_KUBELET_PORT)

        self.kubelet_api_url = '%s://%s:%d' % (self.method, self.kubelet_host, self.kubelet_port)
        self.cadvisor_url = '%s://%s:%d' % (self.method, self.kubelet_host, self.cadvisor_port)
        self.kubernetes_api_url = 'https://%s/api/v1' % (os.environ.get('KUBERNETES_SERVICE_HOST') or self.DEFAULT_MASTER_NAME)
        self.tls_settings = self._init_tls_settings(instance)

        self.metrics_url = urljoin(self.cadvisor_url, KubeUtil.METRICS_PATH)
        self.machine_info_url = urljoin(self.cadvisor_url, KubeUtil.MACHINE_INFO_PATH)
        self.pods_list_url = urljoin(self.kubelet_api_url, KubeUtil.PODS_LIST_PATH)
        self.kube_health_url = urljoin(self.kubelet_api_url, 'healthz')

        # keep track of the latest k8s event we collected and posted
        # default value is 0 but TTL for k8s events is one hour anyways
        self.last_event_collection_ts = 0

    def _init_tls_settings(self, instance):
        """
        Initialize TLS settings for connection to apiserver and kubelet.

        Returns a dict that may contain `apiserver_client_cert`,
        `apiserver_cacert` and `bearer_token`, depending on what is configured
        and present on disk.
        """
        tls_settings = {}

        client_crt = instance.get('apiserver_client_crt')
        client_key = instance.get('apiserver_client_key')
        apiserver_cacert = instance.get('apiserver_ca_cert')

        if client_crt and client_key and os.path.exists(client_crt) and os.path.exists(client_key):
            tls_settings['apiserver_client_cert'] = (client_crt, client_key)

        if apiserver_cacert and os.path.exists(apiserver_cacert):
            tls_settings['apiserver_cacert'] = apiserver_cacert

        token = self.get_auth_token()
        if token:
            tls_settings['bearer_token'] = token

        return tls_settings

    def get_kube_labels(self, excluded_keys=None):
        """Fetch the pod list from the kubelet and return its labels as tags per pod."""
        pods = self.retrieve_pods_list()
        return self.extract_kube_labels(pods, excluded_keys=excluded_keys)

    def extract_kube_labels(self, pods_list, excluded_keys=None):
        """
        Extract labels from a list of pods coming from
        the kubelet API.

        Returns a mapping of "namespace/name" to a list of "kube_<label>:<value>" tags.
        """
        excluded_keys = excluded_keys or []
        kube_labels = defaultdict(list)
        pod_items = pods_list.get("items") or []
        for pod in pod_items:
            metadata = pod.get("metadata", {})
            name = metadata.get("name")
            namespace = metadata.get("namespace")
            labels = metadata.get("labels")
            if name and labels and namespace:
                key = "%s/%s" % (namespace, name)

                for k, v in labels.iteritems():
                    if k in excluded_keys:
                        continue

                    kube_labels[key].append(u"kube_%s:%s" % (k, v))

        return kube_labels

    def extract_meta(self, pods_list, field_name):
        """
        Extract fields like `uid` or `name` from the `metadata` section of a
        list of pods coming from the kubelet API.

        TODO: currently not in use, was added to support events filtering, consider to remove it.
        """
        uids = []
        pods = pods_list.get("items") or []
        for p in pods:
            value = p.get('metadata', {}).get(field_name)
            if value is not None:
                uids.append(value)
        return uids

    def retrieve_pods_list(self):
        """
        Retrieve the list of pods for this cluster querying the kubelet API.

        TODO: the list of pods could be cached with some policy to be decided.
        """
        return retrieve_json(self.pods_list_url)

    def retrieve_machine_info(self):
        """
        Retrieve machine info from Cadvisor.
        """
        return retrieve_json(self.machine_info_url)

    def retrieve_metrics(self):
        """
        Retrieve metrics from Cadvisor.
        """
        return retrieve_json(self.metrics_url)

    def filter_pods_list(self, pods_list, host_ip):
        """
        Filter out (in place) pods that are not running on the given host.

        TODO: currently not in use, was added to support events filtering, consider to remove it.
        """
        pod_items = pods_list.get('items') or []
        log.debug('Found {} pods to filter'.format(len(pod_items)))

        filtered_pods = []
        for pod in pod_items:
            status = pod.get('status', {})
            if status.get('hostIP') == host_ip:
                filtered_pods.append(pod)
        log.debug('Pods after filtering: {}'.format(len(filtered_pods)))

        pods_list['items'] = filtered_pods
        return pods_list

    def retrieve_json_auth(self, url, timeout=10):
        """
        Kubernetes API requires authentication using a token available in
        every pod, or with a client X509 cert/key pair.
        We authenticate using the service account token by default
        and replace this behavior with cert authentication if the user provided
        a cert/key pair in the instance.

        We try to verify the server TLS cert if the public cert is available.
        """
        verify = self.tls_settings.get('apiserver_cacert')
        if not verify:
            verify = self.CA_CRT_PATH if os.path.exists(self.CA_CRT_PATH) else False
        log.debug('ssl validation: {}'.format(verify))

        cert = self.tls_settings.get('apiserver_client_cert')
        # The bearer token is only sent when no client cert/key pair is configured.
        bearer_token = self.tls_settings.get('bearer_token') if not cert else None
        headers = {'Authorization': 'Bearer {}'.format(bearer_token)} if bearer_token else None

        r = requests.get(url, timeout=timeout, headers=headers, verify=verify, cert=cert)
        r.raise_for_status()
        return r.json()

    def get_node_info(self):
        """
        Return the IP address and the hostname of the node where
        the pod is running.
        """
        if None in (self._node_ip, self._node_name):
            self._fetch_host_data()
        return self._node_ip, self._node_name

    def _fetch_host_data(self):
        """
        Retrieve host name and IP address from the payload returned by the listing
        pods endpoints from kubelet or kubernetes API.

        The host IP address is different from the default router for the pod.
        """
        try:
            pod_items = self.retrieve_pods_list().get("items") or []
        except Exception as e:
            log.warning("Unable to retrieve pod list %s. Not fetching host data", str(e))
            return

        for pod in pod_items:
            metadata = pod.get("metadata", {})
            name = metadata.get("name")
            if name == self.host_name:
                status = pod.get('status', {})
                spec = pod.get('spec', {})
                # if not found, use an empty string - we use None as "not initialized"
                self._node_ip = status.get('hostIP', '')
                self._node_name = spec.get('nodeName', '')
                break

    def extract_event_tags(self, event):
        """
        Return a list of tags extracted from an event object
        """
        tags = []

        if 'reason' in event:
            tags.append('reason:%s' % event.get('reason', '').lower())
        if 'namespace' in event.get('metadata', {}):
            tags.append('namespace:%s' % event['metadata']['namespace'])
        if 'host' in event.get('source', {}):
            tags.append('node_name:%s' % event['source']['host'])
        if 'kind' in event.get('involvedObject', {}):
            tags.append('object_type:%s' % event['involvedObject'].get('kind', '').lower())

        return tags

    def are_tags_filtered(self, tags):
        """
        Because it is a pain to call it from the kubernetes check otherwise.
        """
        return self.docker_util.are_tags_filtered(tags)

    @classmethod
    def get_auth_token(cls):
        """
        Return a string containing the authorization token for the pod,
        or None (after logging an error) when the token file cannot be read.
        """
        try:
            with open(cls.AUTH_TOKEN_PATH) as f:
                return f.read()
        except IOError as e:
            log.error('Unable to read token from {}: {}'.format(cls.AUTH_TOKEN_PATH, e))

        return None
class KubeUtil:
    """
    Singleton helper used to talk to the kubelet, cAdvisor and the
    Kubernetes API server from within the agent.

    It locates the kubelet (with a bounded number of retries), builds the
    URLs derived from it, handles TLS / bearer token authentication, and
    exposes helpers to turn pod metadata into tags.
    """
    __metaclass__ = Singleton

    DEFAULT_METHOD = 'http'
    KUBELET_HEALTH_PATH = '/healthz'
    MACHINE_INFO_PATH = '/api/v1.3/machine/'
    METRICS_PATH = '/api/v1.3/subcontainers/'
    PODS_LIST_PATH = '/pods/'
    DEFAULT_CADVISOR_PORT = 4194
    DEFAULT_HTTP_KUBELET_PORT = 10255
    DEFAULT_HTTPS_KUBELET_PORT = 10250
    DEFAULT_MASTER_PORT = 443
    DEFAULT_MASTER_NAME = 'kubernetes'  # DNS name to reach the master from a pod.
    DEFAULT_LABEL_PREFIX = 'kube_'
    DEFAULT_COLLECT_SERVICE_TAG = True
    CA_CRT_PATH = '/var/run/secrets/kubernetes.io/serviceaccount/ca.crt'
    AUTH_TOKEN_PATH = '/var/run/secrets/kubernetes.io/serviceaccount/token'

    POD_NAME_LABEL = "io.kubernetes.pod.name"
    NAMESPACE_LABEL = "io.kubernetes.pod.namespace"
    CONTAINER_NAME_LABEL = "io.kubernetes.container.name"

    def __init__(self, **kwargs):
        """
        Build the client configuration.

        If both `init_config` and `instance` are passed as keyword arguments
        they are used as-is; otherwise the kubernetes check configuration
        file is loaded, falling back to empty settings when it is missing
        or invalid.
        """
        self.docker_util = DockerUtil()
        if 'init_config' in kwargs and 'instance' in kwargs:
            init_config = kwargs.get('init_config', {})
            instance = kwargs.get('instance', {})
        else:
            try:
                config_file_path = get_conf_path(KUBERNETES_CHECK_NAME)
                check_config = check_yaml(config_file_path)
                init_config = check_config['init_config'] or {}
                instance = check_config['instances'][0] or {}
            # kubernetes.yaml was not found
            except IOError as ex:
                # str(ex) rather than ex.message: the latter is deprecated and
                # empty for IOError instances built with (errno, strerror).
                log.error(str(ex))
                init_config, instance = {}, {}
            except Exception:
                log.error(
                    'Kubernetes configuration file is invalid. '
                    'Trying connecting to kubelet with default settings anyway...'
                )
                init_config, instance = {}, {}

        self.method = instance.get('method', KubeUtil.DEFAULT_METHOD)
        self._node_ip = self._node_name = None  # lazy evaluation
        self.host_name = os.environ.get('HOSTNAME')
        self.pod_name = os.environ.get('KUBERNETES_POD_NAME') or self.host_name
        self.tls_settings = self._init_tls_settings(instance)

        # apiserver
        if 'api_server_url' in instance:
            self.kubernetes_api_root_url = instance.get('api_server_url')
        else:
            master_host = os.environ.get(
                'KUBERNETES_SERVICE_HOST') or self.DEFAULT_MASTER_NAME
            master_port = os.environ.get(
                'KUBERNETES_SERVICE_PORT') or self.DEFAULT_MASTER_PORT
            self.kubernetes_api_root_url = 'https://%s:%s' % (master_host, master_port)

        self.kubernetes_api_url = '%s/api/v1' % self.kubernetes_api_root_url

        # Service mapping helper class
        self._service_mapper = PodServiceMapper(self)
        from config import _is_affirmative
        self.collect_service_tag = _is_affirmative(
            instance.get('collect_service_tags', KubeUtil.DEFAULT_COLLECT_SERVICE_TAG))

        # leader status triggers event collection
        self.is_leader = False
        self.leader_elector = None
        self.leader_lease_duration = instance.get('leader_lease_duration')

        # kubelet
        # If kubelet_api_url is None, init_kubelet didn't succeed yet.
        self.init_success = False
        self.kubelet_api_url = None
        self.init_retry_interval = init_config.get('init_retry_interval', DEFAULT_RETRY_INTERVAL)
        self.last_init_retry = None
        self.left_init_retries = init_config.get('init_retries', DEFAULT_INIT_RETRIES) + 1
        self.init_kubelet(instance)

        self.kube_label_prefix = instance.get('label_to_tag_prefix', KubeUtil.DEFAULT_LABEL_PREFIX)
        self.kube_node_labels = instance.get('node_labels_to_host_tags', {})

        # keep track of the latest k8s event we collected and posted
        # default value is 0 but TTL for k8s events is one hour anyways
        self.last_event_collection_ts = 0

    def _init_tls_settings(self, instance):
        """
        Initialize TLS settings for connection to apiserver and kubelet.

        Returns a dict that may contain `apiserver_client_cert`,
        `apiserver_cacert`, `kubelet_client_cert`, `kubelet_verify` and
        `bearer_token`. Certificate/key paths are only kept when the files
        exist on disk.
        """
        tls_settings = {}

        # apiserver
        client_crt = instance.get('apiserver_client_crt')
        client_key = instance.get('apiserver_client_key')
        apiserver_cacert = instance.get('apiserver_ca_cert')

        if client_crt and client_key and os.path.exists(
                client_crt) and os.path.exists(client_key):
            tls_settings['apiserver_client_cert'] = (client_crt, client_key)

        if apiserver_cacert and os.path.exists(apiserver_cacert):
            tls_settings['apiserver_cacert'] = apiserver_cacert

        # kubelet
        kubelet_client_crt = instance.get('kubelet_client_crt')
        kubelet_client_key = instance.get('kubelet_client_key')
        if kubelet_client_crt and kubelet_client_key and os.path.exists(
                kubelet_client_crt) and os.path.exists(kubelet_client_key):
            tls_settings['kubelet_client_cert'] = (kubelet_client_crt, kubelet_client_key)

        cert = instance.get('kubelet_cert')
        if cert:
            tls_settings['kubelet_verify'] = cert
        else:
            tls_settings['kubelet_verify'] = instance.get(
                'kubelet_tls_verify', DEFAULT_TLS_VERIFY)

        if ('apiserver_client_cert' not in tls_settings) or ('kubelet_client_cert' not in tls_settings):
            # Only lookup token if we don't have client certs for both
            token = self.get_auth_token(instance)
            if token:
                tls_settings['bearer_token'] = token

        return tls_settings

    def init_kubelet(self, instance):
        """
        Handles the retry logic around _locate_kubelet.
        Once _locate_kubelet succeeds, initialize all kubelet-related
        URLs and settings.

        Raises an Exception once all the configured retries are exhausted.
        Returns without doing anything if the last attempt is more recent
        than the retry interval.
        """
        if self.left_init_retries == 0:
            raise Exception(
                "Kubernetes client initialization failed permanently. "
                "Kubernetes-related features will fail.")

        now = time.time()

        # last retry was less than retry_interval ago
        if self.last_init_retry and now <= self.last_init_retry + self.init_retry_interval:
            return
        # else it's the first try, or last retry was long enough ago
        self.last_init_retry = now
        self.left_init_retries -= 1

        try:
            self.kubelet_api_url = self._locate_kubelet(instance)
        except Exception as ex:
            log.error(
                "Failed to initialize kubelet connection. Will retry %s time(s). Error: %s"
                % (self.left_init_retries, str(ex)))
            return
        if not self.kubelet_api_url:
            log.error(
                "Failed to initialize kubelet connection. Will retry %s time(s)."
                % self.left_init_retries)
            return

        self.init_success = True

        # kubelet_api_url looks like scheme://host:port, keep the host part only
        self.kubelet_host = self.kubelet_api_url.split(':')[1].lstrip('/')
        self.pods_list_url = urljoin(self.kubelet_api_url, KubeUtil.PODS_LIST_PATH)
        self.kube_health_url = urljoin(self.kubelet_api_url, KubeUtil.KUBELET_HEALTH_PATH)

        # namespace of the agent pod
        try:
            self.self_namespace = self.get_self_namespace()
        except Exception:
            log.warning(
                "Failed to get the agent pod namespace, defaulting to default."
            )
            self.self_namespace = DEFAULT_NAMESPACE

        # cadvisor
        self.cadvisor_port = instance.get('port', KubeUtil.DEFAULT_CADVISOR_PORT)
        self.cadvisor_url = '%s://%s:%d' % (self.method, self.kubelet_host, self.cadvisor_port)
        self.metrics_url = urljoin(self.cadvisor_url, KubeUtil.METRICS_PATH)
        self.machine_info_url = urljoin(self.cadvisor_url, KubeUtil.MACHINE_INFO_PATH)

    def _locate_kubelet(self, instance):
        """
        Kubelet may or may not accept un-authenticated http requests.
        If it doesn't we need to use its HTTPS API that may or may not
        require auth.
        Returns the kubelet URL or raises.
        """
        host = os.environ.get('KUBERNETES_KUBELET_HOST') or instance.get(
            "host")
        if not host:
            # if no hostname was provided, use the docker hostname if cert
            # validation is not required, the kubernetes hostname otherwise.
            docker_hostname = self.docker_util.get_hostname(
                should_resolve=True)
            if self.tls_settings.get('kubelet_verify'):
                try:
                    k8s_hostname = self.get_node_hostname(docker_hostname)
                    host = k8s_hostname or docker_hostname
                except Exception as ex:
                    log.error(str(ex))
                    host = docker_hostname
            else:
                host = docker_hostname

        # check if the no-auth endpoint is enabled
        port = instance.get('kubelet_port', KubeUtil.DEFAULT_HTTP_KUBELET_PORT)
        no_auth_url = 'http://%s:%s' % (host, port)
        test_url = urljoin(no_auth_url, KubeUtil.KUBELET_HEALTH_PATH)
        try:
            self.perform_kubelet_query(test_url)
            return no_auth_url
        except Exception:
            log.debug(
                "Couldn't query kubelet over HTTP, assuming it's not in no_auth mode."
            )

        port = instance.get('kubelet_port', KubeUtil.DEFAULT_HTTPS_KUBELET_PORT)
        https_url = 'https://%s:%s' % (host, port)
        test_url = urljoin(https_url, KubeUtil.KUBELET_HEALTH_PATH)
        try:
            self.perform_kubelet_query(test_url)
            return https_url
        except Exception as ex:
            # This is the HTTPS attempt: say so, and include the cause.
            log.warning(
                "Couldn't query kubelet over HTTPS either: %s" % str(ex))
            # bare raise keeps the original traceback
            raise

    def get_self_namespace(self):
        """
        Return the namespace of the pod named `self.pod_name` according to
        the kubelet pod list, or DEFAULT_NAMESPACE if that pod is not found.
        """
        pods = self.retrieve_pods_list()
        for pod in pods.get('items', []):
            if pod.get('metadata', {}).get('name') == self.pod_name:
                return pod['metadata']['namespace']
        log.warning(
            "Couldn't find the agent pod and namespace, using the default.")
        return DEFAULT_NAMESPACE

    def get_node_hostname(self, host):
        """
        Query the API server for the kubernetes hostname of the node
        using the docker hostname as a filter.

        Returns the address of type `Hostname` of the single matching node,
        or None when zero or several nodes match or no such address exists.
        """
        node_filter = {'labelSelector': 'kubernetes.io/hostname=%s' % host}
        node = self.retrieve_json_auth(
            self.kubernetes_api_url + '/nodes?%s' % urlencode(node_filter)).json()
        # Guard once, up front, against a null payload or a missing/null list.
        items = (node or {}).get('items') or []
        if len(items) != 1:
            log.error(
                'Error while getting node hostname: expected 1 node, got %s.'
                % len(items))
            return None

        addresses = (items[0].get('status') or {}).get('addresses') or []
        for address in addresses:
            if address.get('type') == 'Hostname':
                return address['address']
        return None

    def get_kube_pod_tags(self, excluded_keys=None):
        """
        Gets pods' labels as tags + creator and service tags.
        Returns a dict{namespace/podname: [tags]}
        (empty if the kubelet client is not initialized).
        """
        if not self.init_success:
            log.warning(
                "Kubernetes client is not initialized, can't get pod tags.")
            return {}
        pods = self.retrieve_pods_list()
        return self.extract_kube_pod_tags(pods, excluded_keys=excluded_keys)

    def extract_kube_pod_tags(self, pods_list, excluded_keys=None, label_prefix=None):
        """
        Extract labels + creator and service tags from a list of
        pods coming from the kubelet API.

        :param excluded_keys: labels to skip
        :param label_prefix: prefix for label->tag conversion, None defaults
        to the configuration option label_to_tag_prefix
        Returns a dict{namespace/podname: [tags]}
        """
        excluded_keys = excluded_keys or []
        kube_labels = defaultdict(list)
        pod_items = pods_list.get("items") or []
        label_prefix = label_prefix or self.kube_label_prefix
        for pod in pod_items:
            metadata = pod.get("metadata", {})
            name = metadata.get("name")
            namespace = metadata.get("namespace")
            # `labels` may be present but null in the payload
            labels = metadata.get("labels") or {}
            if name and namespace:
                key = "%s/%s" % (namespace, name)

                # Extract creator tags
                podtags = self.get_pod_creator_tags(metadata)

                # Extract services tags
                if self.collect_service_tag:
                    for service in self.match_services_for_pod(metadata):
                        if service is not None:
                            podtags.append(u'kube_service:%s' % service)

                # Extract labels
                for k, v in labels.iteritems():
                    if k in excluded_keys:
                        continue
                    podtags.append(u"%s%s:%s" % (label_prefix, k, v))

                kube_labels[key] = podtags

        return kube_labels

    def retrieve_pods_list(self):
        """
        Retrieve the list of pods for this cluster querying the kubelet API.

        TODO: the list of pods could be cached with some policy to be decided.
        """
        return self.perform_kubelet_query(self.pods_list_url).json()

    def retrieve_machine_info(self):
        """
        Retrieve machine info from Cadvisor.
        """
        return retrieve_json(self.machine_info_url)

    def retrieve_metrics(self):
        """
        Retrieve metrics from Cadvisor.
        """
        return retrieve_json(self.metrics_url)

    def get_deployment_for_replicaset(self, rs_name):
        """
        Get the deployment name for a given replicaset name
        For now, the rs name's first part always is the deployment's name, see
        https://github.com/kubernetes/kubernetes/blob/release-1.6/pkg/controller/deployment/sync.go#L299
        But it might change in a future k8s version. The other way to match RS and deployments is
        to parse and cache /apis/extensions/v1beta1/replicasets, mirroring PodServiceMapper
        In 1.8, the hash generation logic changed: https://github.com/kubernetes/kubernetes/pull/51538/files

        As we are matching both patterns without checking the apiserver version, we might have
        some false positives. For agent6, we plan on doing this pod->replicaset->deployment matching
        in the cluster agent, with replicaset data from the apiserver. This will address that risk.

        Returns the deployment name, or None if rs_name matches neither pattern.
        """
        end = rs_name.rfind("-")
        if end <= 0:
            return None

        suffix = rs_name[end + 1:]
        if suffix.isdigit():
            # k8s before 1.8
            return rs_name[0:end]
        if len(suffix) == 10:
            # k8s 1.8+ maybe? Check contents
            for char in suffix:
                if char not in ALLOWED_ENCODESTRING_ALPHANUMS:
                    return None
            return rs_name[0:end]
        return None

    def perform_kubelet_query(self, url, verbose=True, timeout=10):
        """
        Perform and return a GET request against kubelet. Support auth and TLS validation.

        Returns the requests response object; the HTTP status is not checked here.
        """
        tls_context = self.tls_settings

        headers = None
        cert = tls_context.get('kubelet_client_cert')
        verify = tls_context.get('kubelet_verify', DEFAULT_TLS_VERIFY)

        # if cert-based auth is enabled, don't use the token.
        if not cert and url.lower().startswith(
                'https') and 'bearer_token' in self.tls_settings:
            headers = {
                'Authorization':
                'Bearer {}'.format(self.tls_settings.get('bearer_token'))
            }

        return requests.get(url, timeout=timeout, verify=verify, cert=cert,
                            headers=headers, params={'verbose': verbose})

    def get_apiserver_auth_settings(self):
        """
        Kubernetes API requires authentication using a token available in
        every pod, or with a client X509 cert/key pair.
        We authenticate using the service account token by default
        and replace this behavior with cert authentication if the user provided
        a cert/key pair in the instance.

        We try to verify the server TLS cert if the public cert is available.

        Returns a (cert, headers, verify) tuple suitable for requests calls.
        """
        verify = self.tls_settings.get('apiserver_cacert')
        if not verify:
            verify = self.CA_CRT_PATH if os.path.exists(
                self.CA_CRT_PATH) else False
        log.debug('tls validation: {}'.format(verify))

        cert = self.tls_settings.get('apiserver_client_cert')
        bearer_token = self.tls_settings.get(
            'bearer_token') if not cert else None
        headers = {
            'Authorization': 'Bearer {}'.format(bearer_token)
        } if bearer_token else {}
        headers['content-type'] = 'application/json'
        return cert, headers, verify

    def retrieve_json_auth(self, url, params=None, timeout=3):
        """
        GET `url` with the apiserver auth settings.
        Returns the response object, raises on HTTP error status.
        """
        cert, headers, verify = self.get_apiserver_auth_settings()
        res = requests.get(url, timeout=timeout, headers=headers,
                           verify=verify, cert=cert, params=params)
        res.raise_for_status()
        return res

    def post_json_to_apiserver(self, url, data, timeout=3):
        """
        POST `data` serialized as JSON to `url` with the apiserver auth settings.
        Returns the response object, raises on HTTP error status.
        """
        cert, headers, verify = self.get_apiserver_auth_settings()
        res = requests.post(url, timeout=timeout, headers=headers,
                            verify=verify, cert=cert, data=json.dumps(data))
        res.raise_for_status()
        return res

    def put_json_to_apiserver(self, url, data, timeout=3):
        """
        PUT `data` serialized as JSON to `url` with the apiserver auth settings.
        Returns the response object, raises on HTTP error status.
        """
        cert, headers, verify = self.get_apiserver_auth_settings()
        res = requests.put(url, timeout=timeout, headers=headers,
                           verify=verify, cert=cert, data=json.dumps(data))
        res.raise_for_status()
        return res

    def delete_to_apiserver(self, url, timeout=3):
        """
        Send a DELETE to `url` with the apiserver auth settings.
        Returns the response object, raises on HTTP error status.
        """
        cert, headers, verify = self.get_apiserver_auth_settings()
        res = requests.delete(url, timeout=timeout, headers=headers,
                              verify=verify, cert=cert)
        res.raise_for_status()
        return res

    def get_node_info(self):
        """
        Return the IP address and the hostname of the node where
        the pod is running.
        """
        if None in (self._node_ip, self._node_name):
            self._fetch_host_data()
        return self._node_ip, self._node_name

    def get_node_metadata(self):
        """Returns host metadata about the local k8s node"""
        meta = {}

        # API server version
        try:
            request_url = "%s/version" % self.kubernetes_api_root_url
            master_info = self.retrieve_json_auth(request_url).json()
            version = master_info.get("gitVersion")
            meta['kube_master_version'] = version[1:]
        except Exception as ex:
            # Intentional use of non-safe lookups to get the exception in the debug logs
            # if the parsing were to fail
            log.debug("Error getting Kube master version: %s" % str(ex))

        # Kubelet version & labels
        if not self.init_success:
            log.warning(
                "Kubelet client failed to initialize, kubelet host tags will be missing for now."
            )
            return meta
        try:
            _, node_name = self.get_node_info()
            if not node_name:
                raise ValueError("node name missing or empty")
            request_url = "%s/nodes/%s" % (self.kubernetes_api_url, node_name)
            node_info = self.retrieve_json_auth(request_url).json()
            version = node_info.get("status").get("nodeInfo").get(
                "kubeletVersion")
            meta['kubelet_version'] = version[1:]
        except Exception as ex:
            log.debug("Error getting Kubelet version: %s" % str(ex))

        return meta

    def get_node_hosttags(self):
        """
        Returns node labels as tags. Tag name is transformed as defined
        in node_labels_to_host_tags in the kubernetes check configuration.
        Note: queries the API server for node info. Configure RBAC accordingly.
        """
        tags = []

        try:
            _, node_name = self.get_node_info()
            if not node_name:
                raise ValueError("node name missing or empty")

            request_url = "%s/nodes/%s" % (self.kubernetes_api_url, node_name)
            node_info = self.retrieve_json_auth(request_url).json()
            node_labels = node_info.get('metadata', {}).get('labels', {})

            for l_name, t_name in self.kube_node_labels.iteritems():
                if l_name in node_labels:
                    tags.append('%s:%s' % (t_name, node_labels[l_name]))

        except Exception as ex:
            log.debug("Error getting node labels: %s" % str(ex))

        return tags

    def _fetch_host_data(self):
        """
        Retrieve host name and IP address from the payload returned by the listing
        pods endpoints from kubelet.

        The host IP address is different from the default router for the pod.
        """
        try:
            pod_items = self.retrieve_pods_list().get("items") or []
        except Exception as e:
            log.warning(
                "Unable to retrieve pod list %s. Not fetching host data", str(e))
            return

        for pod in pod_items:
            metadata = pod.get("metadata", {})
            name = metadata.get("name")
            if name == self.pod_name:
                status = pod.get('status', {})
                spec = pod.get('spec', {})
                # if not found, use an empty string - we use None as "not initialized"
                self._node_ip = status.get('hostIP', '')
                self._node_name = spec.get('nodeName', '')
                break

    def extract_event_tags(self, event):
        """
        Return a list of tags extracted from an event object
        """
        tags = []

        if 'reason' in event:
            tags.append('reason:%s' % event.get('reason', '').lower())
        if 'namespace' in event.get('metadata', {}):
            tags.append('namespace:%s' % event['metadata']['namespace'])
        if 'host' in event.get('source', {}):
            tags.append('node_name:%s' % event['source']['host'])
        if 'kind' in event.get('involvedObject', {}):
            tags.append('object_type:%s' % event['involvedObject'].get('kind', '').lower())
        if 'name' in event.get('involvedObject', {}):
            tags.append('object_name:%s' % event['involvedObject'].get('name', '').lower())
        if 'component' in event.get('source', {}):
            tags.append('source_component:%s' % event['source'].get('component', '').lower())

        return tags

    def are_tags_filtered(self, tags):
        """
        Because it is a pain to call it from the kubernetes check otherwise.
        """
        return self.docker_util.are_tags_filtered(tags)

    @classmethod
    def get_auth_token(cls, instance):
        """
        Return a string containing the authorization token for the pod.

        The token is read from the instance's `bearer_token_path` if set,
        from AUTH_TOKEN_PATH otherwise. Returns None if the file can't be read.
        """
        token_path = instance.get('bearer_token_path', cls.AUTH_TOKEN_PATH)
        try:
            with open(token_path) as f:
                return f.read().strip()
        except IOError as e:
            log.error('Unable to read token from {}: {}'.format(token_path, e))

        return None

    def match_services_for_pod(self, pod_metadata, refresh=False):
        """
        Match the pods labels with services' label selectors to determine the list
        of services that point to that pod. Returns an array of service names.

        Pass refresh=True if you want to bypass the cached cid->services mapping (after a service change)
        """
        return self._service_mapper.match_services_for_pod(pod_metadata, refresh, names=True)

    def get_event_retriever(self, namespaces=None, kinds=None, delay=None):
        """
        Returns a KubeEventRetriever object ready for action
        """
        return KubeEventRetriever(self, namespaces, kinds, delay)

    def match_containers_for_pods(self, pod_uids, podlist=None):
        """
        Reads a set of pod uids and returns the set of docker
        container ids they manage
        podlist should be a recent self.retrieve_pods_list return value,
        if not given that method will be called
        """
        cids = set()

        if not isinstance(pod_uids, set) or len(pod_uids) < 1:
            return cids

        if podlist is None:
            podlist = self.retrieve_pods_list()

        for pod in podlist.get('items') or []:
            uid = pod.get('metadata', {}).get('uid', None)
            if uid not in pod_uids:
                continue
            # A pod may have no container statuses: treat a missing or null
            # list as empty instead of failing on iteration.
            statuses = pod.get('status', {}).get('containerStatuses') or []
            for container in statuses:
                container_id = container.get('containerID', "")
                if container_id.startswith("docker://"):
                    cids.add(container_id[9:])

        return cids

    def get_pod_creator(self, pod_metadata):
        """
        Get the pod's creator from its metadata and returns a
        tuple (creator_kind, creator_name)

        This allows for consistency across code paths.
        Returns (None, None) if the creator annotation can't be parsed.
        """
        try:
            created_by = json.loads(
                pod_metadata['annotations']['kubernetes.io/created-by'])
            creator_kind = created_by.get('reference', {}).get('kind')
            creator_name = created_by.get('reference', {}).get('name')
            return (creator_kind, creator_name)
        except Exception:
            log.debug('Could not parse creator for pod ' + pod_metadata.get('name', ''))
            return (None, None)

    def get_pod_creator_tags(self, pod_metadata, legacy_rep_controller_tag=False):
        """
        Get the pod's creator from its metadata and returns a list of tags
        in the form kube_$kind:$name, ready to add to the metrics
        """
        try:
            tags = []
            creator_kind, creator_name = self.get_pod_creator(pod_metadata)
            if creator_kind in CREATOR_KIND_TO_TAG and creator_name:
                tags.append("%s:%s" % (CREATOR_KIND_TO_TAG[creator_kind], creator_name))
                if creator_kind == 'ReplicaSet':
                    deployment = self.get_deployment_for_replicaset(
                        creator_name)
                    if deployment:
                        tags.append(
                            "%s:%s" % (CREATOR_KIND_TO_TAG['Deployment'], deployment))
            if legacy_rep_controller_tag and creator_kind != 'ReplicationController' and creator_name:
                tags.append(
                    'kube_replication_controller:{0}'.format(creator_name))

            return tags
        except Exception:
            # The pod name may be missing or null: never let the handler itself raise.
            log.warning('Could not parse creator tags for pod %s' %
                        (pod_metadata.get('name') or ''))
            return []

    def process_events(self, event_array, podlist=None):
        """
        Reads a list of kube events, invalidates caches and computes a set
        of containers impacted by the changes, to refresh service discovery
        Pod creation/deletion events are ignored for now, as docker_daemon already
        sends container creation/deletion events to SD

        Pod->containers matching is done using match_containers_for_pods
        """
        try:
            pods = set()
            if self._service_mapper:
                pods.update(self._service_mapper.process_events(event_array))
            return self.match_containers_for_pods(pods, podlist)
        except Exception as e:
            log.warning("Error processing events %s: %s" % (str(event_array), e))
            return set()

    def refresh_leader(self):
        """
        Create the leader elector on first use and try to acquire or
        refresh leadership. Does nothing if the kubelet client is not
        initialized.
        """
        if not self.init_success:
            log.warning(
                "Kubelet client is not initialized, leader election is disabled."
            )
            return
        if not self.leader_elector:
            self.leader_elector = LeaderElector(self)
        self.leader_elector.try_acquire_or_refresh()

    def image_name_resolver(self, image):
        """
        Wraps around the sibling dockerutil method and catches exceptions.
        Returns the image unchanged if resolution fails, None if image is None.
        """
        if image is None:
            return None
        try:
            return self.docker_util.image_name_resolver(image)
        except Exception as e:
            log.warning("Error resolving image name: %s", str(e))
            return image
class DockerDaemon(AgentCheck): """Collect metrics and events from Docker API and cgroups.""" def __init__(self, name, init_config, agentConfig, instances=None): if instances is not None and len(instances) > 1: raise Exception("Docker check only supports one configured instance.") AgentCheck.__init__(self, name, init_config, agentConfig, instances=instances) self.init_success = False self._service_discovery = agentConfig.get('service_discovery') and \ agentConfig.get('service_discovery_backend') == 'docker' self.init() def init(self): try: instance = self.instances[0] self.docker_util = DockerUtil() self.docker_client = self.docker_util.client self.docker_gateway = DockerUtil.get_gateway() if Platform.is_k8s(): self.kubeutil = KubeUtil() # We configure the check with the right cgroup settings for this host # Just needs to be done once self._mountpoints = self.docker_util.get_mountpoints(CGROUP_METRICS) self.cgroup_listing_retries = 0 self._latest_size_query = 0 self._filtered_containers = set() self._disable_net_metrics = False # Set tagging options self.custom_tags = instance.get("tags", []) self.collect_labels_as_tags = instance.get("collect_labels_as_tags", []) self.kube_labels = {} self.use_histogram = _is_affirmative(instance.get('use_histogram', False)) performance_tags = instance.get("performance_tags", DEFAULT_PERFORMANCE_TAGS) self.tag_names = { CONTAINER: instance.get("container_tags", DEFAULT_CONTAINER_TAGS), PERFORMANCE: performance_tags, IMAGE: instance.get('image_tags', DEFAULT_IMAGE_TAGS) } # Set filtering settings if self.docker_util.filtering_enabled: self.tag_names[FILTERED] = self.docker_util.filtered_tag_names # Other options self.collect_image_stats = _is_affirmative(instance.get('collect_images_stats', False)) self.collect_container_size = _is_affirmative(instance.get('collect_container_size', False)) self.collect_events = _is_affirmative(instance.get('collect_events', True)) self.collect_image_size = 
_is_affirmative(instance.get('collect_image_size', False)) self.collect_disk_stats = _is_affirmative(instance.get('collect_disk_stats', False)) self.collect_ecs_tags = _is_affirmative(instance.get('ecs_tags', True)) and Platform.is_ecs_instance() self.ecs_tags = {} except Exception as e: self.log.critical(e) self.warning("Initialization failed. Will retry at next iteration") else: self.init_success = True def check(self, instance): """Run the Docker check for one instance.""" if not self.init_success: # Initialization can fail if cgroups are not ready. So we retry if needed # https://github.com/DataDog/dd-agent/issues/1896 self.init() if not self.init_success: # Initialization failed, will try later return # Report image metrics if self.collect_image_stats: self._count_and_weigh_images() if self.collect_ecs_tags: self.refresh_ecs_tags() if Platform.is_k8s(): try: self.kube_labels = self.kubeutil.get_kube_labels() except Exception as e: self.log.warning('Could not retrieve kubernetes labels: %s' % str(e)) self.kube_labels = {} # containers running with custom cgroups? 
custom_cgroups = _is_affirmative(instance.get('custom_cgroups', False)) # Get the list of containers and the index of their names containers_by_id = self._get_and_count_containers(custom_cgroups) containers_by_id = self._crawl_container_pids(containers_by_id, custom_cgroups) # Send events from Docker API if self.collect_events or self._service_discovery: self._process_events(containers_by_id) # Report performance container metrics (cpu, mem, net, io) self._report_performance_metrics(containers_by_id) if self.collect_container_size: self._report_container_size(containers_by_id) # Collect disk stats from Docker info command if self.collect_disk_stats: self._report_disk_stats() def _count_and_weigh_images(self): try: tags = self._get_tags() active_images = self.docker_client.images(all=False) active_images_len = len(active_images) all_images_len = len(self.docker_client.images(quiet=True, all=True)) self.gauge("docker.images.available", active_images_len, tags=tags) self.gauge("docker.images.intermediate", (all_images_len - active_images_len), tags=tags) if self.collect_image_size: self._report_image_size(active_images) except Exception as e: # It's not an important metric, keep going if it fails self.warning("Failed to count Docker images. 
Exception: {0}".format(e)) def _get_and_count_containers(self, custom_cgroups=False): """List all the containers from the API, filter and count them.""" # Querying the size of containers is slow, we don't do it at each run must_query_size = self.collect_container_size and self._latest_size_query == 0 self._latest_size_query = (self._latest_size_query + 1) % SIZE_REFRESH_RATE running_containers_count = Counter() all_containers_count = Counter() try: containers = self.docker_client.containers(all=True, size=must_query_size) except Exception as e: message = "Unable to list Docker containers: {0}".format(e) self.service_check(SERVICE_CHECK_NAME, AgentCheck.CRITICAL, message=message) raise Exception(message) else: self.service_check(SERVICE_CHECK_NAME, AgentCheck.OK) # Create a set of filtered containers based on the exclude/include rules # and cache these rules in docker_util self._filter_containers(containers) containers_by_id = {} for container in containers: container_name = DockerUtil.container_name_extractor(container)[0] container_status_tags = self._get_tags(container, CONTAINER) all_containers_count[tuple(sorted(container_status_tags))] += 1 if self._is_container_running(container): running_containers_count[tuple(sorted(container_status_tags))] += 1 # Check if the container is included/excluded via its tags if self._is_container_excluded(container): self.log.debug("Container {0} is excluded".format(container_name)) continue containers_by_id[container['Id']] = container # grab pid via API if custom cgroups - otherwise we won't find process when # crawling for pids. 
if custom_cgroups: try: inspect_dict = self.docker_client.inspect_container(container_name) container['_pid'] = inspect_dict['State']['Pid'] except Exception as e: self.log.debug("Unable to inspect Docker container: %s", e) for tags, count in running_containers_count.iteritems(): self.gauge("docker.containers.running", count, tags=list(tags)) for tags, count in all_containers_count.iteritems(): stopped_count = count - running_containers_count[tags] self.gauge("docker.containers.stopped", stopped_count, tags=list(tags)) return containers_by_id def _is_container_running(self, container): """Tell if a container is running, according to its status. There is no "nice" API field to figure it out. We just look at the "Status" field, knowing how it is generated. See: https://github.com/docker/docker/blob/v1.6.2/daemon/state.go#L35 """ return container["Status"].startswith("Up") or container["Status"].startswith("Restarting") def _get_tags(self, entity=None, tag_type=None): """Generate the tags for a given entity (container or image) according to a list of tag names.""" # Start with custom tags tags = list(self.custom_tags) # Collect pod names as tags on kubernetes if Platform.is_k8s() and KubeUtil.POD_NAME_LABEL not in self.collect_labels_as_tags: self.collect_labels_as_tags.append(KubeUtil.POD_NAME_LABEL) if entity is not None: pod_name = None # Get labels as tags labels = entity.get("Labels") if labels is not None: for k in self.collect_labels_as_tags: if k in labels: v = labels[k] if k == KubeUtil.POD_NAME_LABEL and Platform.is_k8s(): pod_name = v k = "pod_name" if "-" in pod_name: replication_controller = "-".join(pod_name.split("-")[:-1]) if "/" in replication_controller: # k8s <= 1.1 namespace, replication_controller = replication_controller.split("/", 1) elif KubeUtil.NAMESPACE_LABEL in labels: # k8s >= 1.2 namespace = labels[KubeUtil.NAMESPACE_LABEL] pod_name = "{0}/{1}".format(namespace, pod_name) tags.append("kube_namespace:%s" % namespace) 
tags.append("kube_replication_controller:%s" % replication_controller) tags.append("pod_name:%s" % pod_name) elif not v: tags.append(k) else: tags.append("%s:%s" % (k,v)) if k == KubeUtil.POD_NAME_LABEL and Platform.is_k8s() and k not in labels: tags.append("pod_name:no_pod") # Get entity specific tags if tag_type is not None: tag_names = self.tag_names[tag_type] for tag_name in tag_names: tag_value = self._extract_tag_value(entity, tag_name) if tag_value is not None: for t in tag_value: tags.append('%s:%s' % (tag_name, str(t).strip())) # Add ECS tags if self.collect_ecs_tags: entity_id = entity.get("Id") if entity_id in self.ecs_tags: ecs_tags = self.ecs_tags[entity_id] tags.extend(ecs_tags) # Add kube labels if Platform.is_k8s(): kube_tags = self.kube_labels.get(pod_name) if kube_tags: tags.extend(list(kube_tags)) return tags def _extract_tag_value(self, entity, tag_name): """Extra tag information from the API result (containers or images). Cache extracted tags inside the entity object. 
""" if tag_name not in TAG_EXTRACTORS: self.warning("{0} isn't a supported tag".format(tag_name)) return # Check for already extracted tags if "_tag_values" not in entity: entity["_tag_values"] = {} if tag_name not in entity["_tag_values"]: entity["_tag_values"][tag_name] = TAG_EXTRACTORS[tag_name](entity) return entity["_tag_values"][tag_name] def refresh_ecs_tags(self): ecs_config = self.docker_client.inspect_container('ecs-agent') ip = ecs_config.get('NetworkSettings', {}).get('IPAddress') ports = ecs_config.get('NetworkSettings', {}).get('Ports') port = ports.keys()[0].split('/')[0] if ports else None if not ip: port = ECS_INTROSPECT_DEFAULT_PORT if Platform.is_containerized() and self.docker_gateway: ip = self.docker_gateway else: ip = "localhost" ecs_tags = {} try: if ip and port: tasks = requests.get('http://%s:%s/v1/tasks' % (ip, port)).json() for task in tasks.get('Tasks', []): for container in task.get('Containers', []): tags = ['task_name:%s' % task['Family'], 'task_version:%s' % task['Version']] ecs_tags[container['DockerId']] = tags except (requests.exceptions.HTTPError, requests.exceptions.HTTPError) as e: self.log.warning("Unable to collect ECS task names: %s" % e) self.ecs_tags = ecs_tags def _filter_containers(self, containers): if not self.docker_util.filtering_enabled: return self._filtered_containers = set() for container in containers: container_tags = self._get_tags(container, FILTERED) # exclude/include patterns are stored in docker_util to share them with other container-related checks if self.docker_util.are_tags_filtered(container_tags): container_name = DockerUtil.container_name_extractor(container)[0] self._filtered_containers.add(container_name) self.log.debug("Container {0} is filtered".format(container_name)) def _is_container_excluded(self, container): """Check if a container is excluded according to the filter rules. Requires _filter_containers to run first. 
""" container_name = DockerUtil.container_name_extractor(container)[0] return container_name in self._filtered_containers def _report_container_size(self, containers_by_id): for container in containers_by_id.itervalues(): if self._is_container_excluded(container): continue tags = self._get_tags(container, PERFORMANCE) m_func = FUNC_MAP[GAUGE][self.use_histogram] if "SizeRw" in container: m_func(self, 'docker.container.size_rw', container['SizeRw'], tags=tags) if "SizeRootFs" in container: m_func( self, 'docker.container.size_rootfs', container['SizeRootFs'], tags=tags) def _report_image_size(self, images): for image in images: tags = self._get_tags(image, IMAGE) if 'VirtualSize' in image: self.gauge('docker.image.virtual_size', image['VirtualSize'], tags=tags) if 'Size' in image: self.gauge('docker.image.size', image['Size'], tags=tags) # Performance metrics def _report_performance_metrics(self, containers_by_id): containers_without_proc_root = [] for container in containers_by_id.itervalues(): if self._is_container_excluded(container) or not self._is_container_running(container): continue tags = self._get_tags(container, PERFORMANCE) self._report_cgroup_metrics(container, tags) if "_proc_root" not in container: containers_without_proc_root.append(DockerUtil.container_name_extractor(container)[0]) continue self._report_net_metrics(container, tags) if containers_without_proc_root: message = "Couldn't find pid directory for containers: {0}. They'll be missing network metrics".format( ", ".join(containers_without_proc_root)) if not Platform.is_k8s(): self.warning(message) else: # On kubernetes, this is kind of expected. 
Network metrics will be collected by the kubernetes integration anyway self.log.debug(message) def _report_cgroup_metrics(self, container, tags): try: for cgroup in CGROUP_METRICS: stat_file = self._get_cgroup_from_proc(cgroup["cgroup"], container['_pid'], cgroup['file']) stats = self._parse_cgroup_file(stat_file) if stats: for key, (dd_key, metric_func) in cgroup['metrics'].iteritems(): metric_func = FUNC_MAP[metric_func][self.use_histogram] if key in stats: metric_func(self, dd_key, int(stats[key]), tags=tags) # Computed metrics for mname, (key_list, fct, metric_func) in cgroup.get('to_compute', {}).iteritems(): values = [stats[key] for key in key_list if key in stats] if len(values) != len(key_list): self.log.debug("Couldn't compute {0}, some keys were missing.".format(mname)) continue value = fct(*values) metric_func = FUNC_MAP[metric_func][self.use_histogram] if value is not None: metric_func(self, mname, value, tags=tags) except MountException as ex: if self.cgroup_listing_retries > MAX_CGROUP_LISTING_RETRIES: raise ex else: self.warning("Couldn't find the cgroup files. Skipping the CGROUP_METRICS for now." "Will retry {0} times before failing.".format(MAX_CGROUP_LISTING_RETRIES - self.cgroup_listing_retries)) self.cgroup_listing_retries += 1 else: self.cgroup_listing_retries = 0 def _report_net_metrics(self, container, tags): """Find container network metrics by looking at /proc/$PID/net/dev of the container process.""" if self._disable_net_metrics: self.log.debug("Network metrics are disabled. 
Skipping") return proc_net_file = os.path.join(container['_proc_root'], 'net/dev') try: with open(proc_net_file, 'r') as fp: lines = fp.readlines() """Two first lines are headers: Inter-| Receive | Transmit face |bytes packets errs drop fifo frame compressed multicast|bytes packets errs drop fifo colls carrier compressed """ for l in lines[2:]: cols = l.split(':', 1) interface_name = str(cols[0]).strip() if interface_name == 'eth0': x = cols[1].split() m_func = FUNC_MAP[RATE][self.use_histogram] m_func(self, "docker.net.bytes_rcvd", long(x[0]), tags) m_func(self, "docker.net.bytes_sent", long(x[8]), tags) break except Exception as e: # It is possible that the container got stopped between the API call and now self.warning("Failed to report IO metrics from file {0}. Exception: {1}".format(proc_net_file, e)) def _process_events(self, containers_by_id): if self.collect_events is False: # Crawl events for service discovery only self._get_events() return try: api_events = self._get_events() aggregated_events = self._pre_aggregate_events(api_events, containers_by_id) events = self._format_events(aggregated_events, containers_by_id) except (socket.timeout, urllib2.URLError): self.warning('Timeout when collecting events. Events will be missing.') return except Exception as e: self.warning("Unexpected exception when collecting events: {0}. " "Events will be missing".format(e)) return for ev in events: self.log.debug("Creating event: %s" % ev['msg_title']) self.event(ev) def _get_events(self): """Get the list of events.""" events, changed_container_ids = self.docker_util.get_events() if changed_container_ids and self._service_discovery: get_sd_backend(self.agentConfig).update_checks(changed_container_ids) return events def _pre_aggregate_events(self, api_events, containers_by_id): # Aggregate events, one per image. Put newer events first. 
events = defaultdict(deque) for event in api_events: # Skip events related to filtered containers container = containers_by_id.get(event.get('id')) if container is not None and self._is_container_excluded(container): self.log.debug("Excluded event: container {0} status changed to {1}".format( event['id'], event['status'])) continue # from may be missing (for network events for example) if 'from' in event: events[event['from']].appendleft(event) return events def _format_events(self, aggregated_events, containers_by_id): events = [] for image_name, event_group in aggregated_events.iteritems(): container_tags = set() low_prio_events = [] normal_prio_events = [] for event in event_group: container_name = event['id'][:11] if event['id'] in containers_by_id: cont = containers_by_id[event['id']] container_name = DockerUtil.container_name_extractor(cont)[0] container_tags.update(self._get_tags(cont, PERFORMANCE)) container_tags.add('container_name:%s' % container_name) # health checks generate tons of these so we treat them separately and lower their priority if event['status'].startswith('exec_create:') or event['status'].startswith('exec_start:'): low_prio_events.append((event, container_name)) else: normal_prio_events.append((event, container_name)) exec_event = self._create_dd_event(low_prio_events, image_name, container_tags, priority='Low') if exec_event: events.append(exec_event) normal_event = self._create_dd_event(normal_prio_events, image_name, container_tags, priority='Normal') if normal_event: events.append(normal_event) return events def _create_dd_event(self, events, image, c_tags, priority='Normal'): """Create the actual event to submit from a list of similar docker events""" if not events: return max_timestamp = 0 status = defaultdict(int) status_change = [] for ev, c_name in events: max_timestamp = max(max_timestamp, int(ev['time'])) status[ev['status']] += 1 status_change.append([c_name, ev['status']]) status_text = ", ".join(["%d %s" % (count, st) for 
st, count in status.iteritems()]) msg_title = "%s %s on %s" % (image, status_text, self.hostname) msg_body = ( "%%%\n" "{image_name} {status} on {hostname}\n" "```\n{status_changes}\n```\n" "%%%" ).format( image_name=image, status=status_text, hostname=self.hostname, status_changes="\n".join( ["%s \t%s" % (change[1].upper(), change[0]) for change in status_change]) ) if any(error in status_text for error in ERROR_ALERT_TYPE): alert_type = "error" else: alert_type = None return { 'timestamp': max_timestamp, 'host': self.hostname, 'event_type': EVENT_TYPE, 'msg_title': msg_title, 'msg_text': msg_body, 'source_type_name': EVENT_TYPE, 'event_object': 'docker:%s' % image, 'tags': list(c_tags), 'alert_type': alert_type, 'priority': priority } def _report_disk_stats(self): """Report metrics about the volume space usage""" stats = { 'docker.data.used': None, 'docker.data.total': None, 'docker.data.free': None, 'docker.metadata.used': None, 'docker.metadata.total': None, 'docker.metadata.free': None # these two are calculated by _calc_percent_disk_stats # 'docker.data.percent': None, # 'docker.metadata.percent': None } info = self.docker_client.info() driver_status = info.get('DriverStatus', []) if not driver_status: self.log.warning('Disk metrics collection is enabled but docker info did not' ' report any. 
Your storage driver might not support them, skipping.') return for metric in driver_status: # only consider metrics about disk space if len(metric) == 2 and 'Space' in metric[0]: # identify Data and Metadata metrics mtype = 'data' if 'Metadata' in metric[0]: mtype = 'metadata' if 'Used' in metric[0]: stats['docker.{0}.used'.format(mtype)] = metric[1] elif 'Space Total' in metric[0]: stats['docker.{0}.total'.format(mtype)] = metric[1] elif 'Space Available' in metric[0]: stats['docker.{0}.free'.format(mtype)] = metric[1] stats = self._format_disk_metrics(stats) stats.update(self._calc_percent_disk_stats(stats)) tags = self._get_tags() for name, val in stats.iteritems(): if val is not None: self.gauge(name, val, tags) def _format_disk_metrics(self, metrics): """Cast the disk stats to float and convert them to bytes""" for name, raw_val in metrics.iteritems(): if raw_val: val, unit = raw_val.split(' ') # by default some are uppercased others lowercased. That's error prone. unit = unit.lower() try: val = int(float(val) * UNIT_MAP[unit]) metrics[name] = val except KeyError: self.log.error('Unrecognized unit %s for disk metric %s. Dropping it.' 
% (unit, name)) metrics[name] = None return metrics def _calc_percent_disk_stats(self, stats): """Calculate a percentage of used disk space for data and metadata""" mtypes = ['data', 'metadata'] percs = {} for mtype in mtypes: used = stats.get('docker.{0}.used'.format(mtype)) total = stats.get('docker.{0}.total'.format(mtype)) free = stats.get('docker.{0}.free'.format(mtype)) if used and total and free and ceil(total) < free + used: self.log.debug('used, free, and total disk metrics may be wrong, ' 'used: %s, free: %s, total: %s', used, free, total) total = used + free try: if isinstance(used, int): percs['docker.{0}.percent'.format(mtype)] = round(100 * float(used) / float(total), 2) elif isinstance(free, int): percs['docker.{0}.percent'.format(mtype)] = round(100 * (1.0 - (float(free) / float(total))), 2) except ZeroDivisionError: self.log.error('docker.{0}.total is 0, calculating docker.{1}.percent' ' is not possible.'.format(mtype, mtype)) return percs # Cgroups def _get_cgroup_from_proc(self, cgroup, pid, filename): """Find a specific cgroup file, containing metrics to extract.""" params = { "file": filename, } return DockerUtil.find_cgroup_from_proc(self._mountpoints, pid, cgroup, self.docker_util._docker_root) % (params) def _parse_cgroup_file(self, stat_file): """Parse a cgroup pseudo file for key/values.""" self.log.debug("Opening cgroup file: %s" % stat_file) try: with open(stat_file, 'r') as fp: if 'blkio' in stat_file: return self._parse_blkio_metrics(fp.read().splitlines()) else: return dict(map(lambda x: x.split(' ', 1), fp.read().splitlines())) except IOError: # It is possible that the container got stopped between the API call and now. # Some files can also be missing (like cpu.stat) and that's fine. self.log.info("Can't open %s. Some metrics for this container may be missing." 
% stat_file) def _parse_blkio_metrics(self, stats): """Parse the blkio metrics.""" metrics = { 'io_read': 0, 'io_write': 0, } for line in stats: if 'Read' in line: metrics['io_read'] += int(line.split()[2]) if 'Write' in line: metrics['io_write'] += int(line.split()[2]) return metrics def _is_container_cgroup(self, line, selinux_policy): if line[1] not in ('cpu,cpuacct', 'cpuacct,cpu', 'cpuacct') or line[2] == '/docker-daemon': return False if 'docker' in line[2]: # general case return True if 'docker' in selinux_policy: # selinux return True if line[2].startswith('/') and re.match(CONTAINER_ID_RE, line[2][1:]): # kubernetes return True return False # proc files def _crawl_container_pids(self, container_dict, custom_cgroups=False): """Crawl `/proc` to find container PIDs and add them to `containers_by_id`.""" proc_path = os.path.join(self.docker_util._docker_root, 'proc') pid_dirs = [_dir for _dir in os.listdir(proc_path) if _dir.isdigit()] if len(pid_dirs) == 0: self.warning("Unable to find any pid directory in {0}. " "If you are running the agent in a container, make sure to " 'share the volume properly: "/proc:/host/proc:ro". ' "See https://github.com/DataDog/docker-dd-agent/blob/master/README.md for more information. 
" "Network metrics will be missing".format(proc_path)) self._disable_net_metrics = True return container_dict self._disable_net_metrics = False for folder in pid_dirs: try: path = os.path.join(proc_path, folder, 'cgroup') with open(path, 'r') as f: content = [line.strip().split(':') for line in f.readlines()] selinux_policy = '' path = os.path.join(proc_path, folder, 'attr', 'current') if os.path.exists(path): with open(path, 'r') as f: selinux_policy = f.readlines()[0] except IOError, e: # Issue #2074 self.log.debug("Cannot read %s, " "process likely raced to finish : %s" % (path, str(e))) except Exception as e: self.warning("Cannot read %s : %s" % (path, str(e))) continue try: for line in content: if self._is_container_cgroup(line, selinux_policy): cpuacct = line[2] break else: continue matches = re.findall(CONTAINER_ID_RE, cpuacct) if matches: container_id = matches[-1] if container_id not in container_dict: self.log.debug("Container %s not in container_dict, it's likely excluded", container_id) continue container_dict[container_id]['_pid'] = folder container_dict[container_id]['_proc_root'] = os.path.join(proc_path, folder) elif custom_cgroups: # if we match by pid that should be enough (?) - O(n) ugh! for _, container in container_dict.iteritems(): if container.get('_pid') == int(folder): container['_proc_root'] = os.path.join(proc_path, folder) break except Exception, e: self.warning("Cannot parse %s content: %s" % (path, str(e))) continue
class KubeUtil:
    """Helper giving access to the kubelet, cAdvisor and apiserver endpoints.

    URLs and TLS settings are derived from a check instance (a dict); when no
    instance is given, the first instance of the kubernetes check
    configuration file is used, falling back to defaults if it is missing or
    invalid.
    """
    __metaclass__ = Singleton

    DEFAULT_METHOD = 'http'
    MACHINE_INFO_PATH = '/api/v1.3/machine/'
    METRICS_PATH = '/api/v1.3/subcontainers/'
    PODS_LIST_PATH = '/pods/'
    DEFAULT_CADVISOR_PORT = 4194
    DEFAULT_KUBELET_PORT = 10255
    DEFAULT_MASTER_PORT = 8080
    DEFAULT_MASTER_NAME = 'kubernetes'  # DNS name to reach the master from a pod.
    CA_CRT_PATH = '/run/secrets/kubernetes.io/serviceaccount/ca.crt'
    AUTH_TOKEN_PATH = '/run/secrets/kubernetes.io/serviceaccount/token'

    POD_NAME_LABEL = "io.kubernetes.pod.name"
    NAMESPACE_LABEL = "io.kubernetes.pod.namespace"

    def __init__(self, instance=None):
        self.docker_util = DockerUtil()
        if instance is None:
            try:
                config_file_path = get_conf_path(KUBERNETES_CHECK_NAME)
                check_config = check_yaml(config_file_path)
                instance = check_config['instances'][0]
            # kubernetes.yaml was not found
            except IOError as ex:
                # str(ex) instead of the deprecated ex.message, which is empty
                # for errno-style IOErrors.
                log.error(str(ex))
                instance = {}
            except Exception:
                log.error('Kubernetes configuration file is invalid. '
                          'Trying connecting to kubelet with default settings anyway...')
                instance = {}

        self.method = instance.get('method', KubeUtil.DEFAULT_METHOD)
        self.host = instance.get("host") or self.docker_util.get_hostname()
        # The environment variable takes precedence over the configured host.
        self.kubelet_host = os.environ.get('KUBERNETES_KUBELET_HOST') or self.host
        self._node_ip = self._node_name = None  # lazy evaluation
        self.host_name = os.environ.get('HOSTNAME')

        self.cadvisor_port = instance.get('port', KubeUtil.DEFAULT_CADVISOR_PORT)
        self.kubelet_port = instance.get('kubelet_port', KubeUtil.DEFAULT_KUBELET_PORT)

        self.kubelet_api_url = '%s://%s:%d' % (self.method, self.kubelet_host, self.kubelet_port)
        self.cadvisor_url = '%s://%s:%d' % (self.method, self.kubelet_host, self.cadvisor_port)
        self.kubernetes_api_url = 'https://%s/api/v1' % (os.environ.get('KUBERNETES_SERVICE_HOST') or self.DEFAULT_MASTER_NAME)
        self.tls_settings = self._init_tls_settings(instance)

        self.metrics_url = urljoin(self.cadvisor_url, KubeUtil.METRICS_PATH)
        self.machine_info_url = urljoin(self.cadvisor_url, KubeUtil.MACHINE_INFO_PATH)
        self.pods_list_url = urljoin(self.kubelet_api_url, KubeUtil.PODS_LIST_PATH)
        self.kube_health_url = urljoin(self.kubelet_api_url, 'healthz')

        # keep track of the latest k8s event we collected and posted
        # default value is 0 but TTL for k8s events is one hour anyways
        self.last_event_collection_ts = 0

    def _init_tls_settings(self, instance):
        """
        Initialize TLS settings for connection to apiserver and kubelet.

        Returns a dict that may hold 'apiserver_client_cert' (crt, key),
        'apiserver_cacert' and 'bearer_token'. Certificate entries are only
        set when the configured files exist.
        """
        tls_settings = {}

        client_crt = instance.get('apiserver_client_crt')
        client_key = instance.get('apiserver_client_key')
        apiserver_cacert = instance.get('apiserver_ca_cert')

        if client_crt and client_key and os.path.exists(client_crt) and os.path.exists(client_key):
            tls_settings['apiserver_client_cert'] = (client_crt, client_key)

        if apiserver_cacert and os.path.exists(apiserver_cacert):
            tls_settings['apiserver_cacert'] = apiserver_cacert

        token = self.get_auth_token()
        if token:
            tls_settings['bearer_token'] = token

        return tls_settings

    def get_kube_labels(self, excluded_keys=None):
        """Fetch the pods list and return its labels (see extract_kube_labels)."""
        pods = self.retrieve_pods_list()
        return self.extract_kube_labels(pods, excluded_keys=excluded_keys)

    def extract_kube_labels(self, pods_list, excluded_keys=None):
        """
        Extract labels from a list of pods coming from the kubelet API.

        Returns a defaultdict mapping "<namespace>/<name>" to a list of
        "kube_<label>:<value>" strings. Pods lacking a name, a namespace or
        labels are ignored, as are label keys listed in excluded_keys.
        """
        excluded_keys = excluded_keys or []
        kube_labels = defaultdict(list)
        pod_items = pods_list.get("items") or []
        for pod in pod_items:
            metadata = pod.get("metadata", {})
            name = metadata.get("name")
            namespace = metadata.get("namespace")
            labels = metadata.get("labels")
            if name and labels and namespace:
                key = "%s/%s" % (namespace, name)

                for k, v in labels.iteritems():
                    if k in excluded_keys:
                        continue

                    kube_labels[key].append(u"kube_%s:%s" % (k, v))

        return kube_labels

    def extract_meta(self, pods_list, field_name):
        """
        Extract fields like `uid` or `name` from the `metadata` section of a
        list of pods coming from the kubelet API.

        TODO: currently not in use, was added to support events filtering, consider to remove it.
        """
        uids = []
        pods = pods_list.get("items") or []
        for p in pods:
            value = p.get('metadata', {}).get(field_name)
            if value is not None:
                uids.append(value)
        return uids

    def retrieve_pods_list(self):
        """
        Retrieve the list of pods for this cluster querying the kubelet API.

        TODO: the list of pods could be cached with some policy to be decided.
        """
        return retrieve_json(self.pods_list_url)

    def retrieve_machine_info(self):
        """
        Retrieve machine info from Cadvisor.
        """
        return retrieve_json(self.machine_info_url)

    def retrieve_metrics(self):
        """
        Retrieve metrics from Cadvisor.
        """
        return retrieve_json(self.metrics_url)

    def filter_pods_list(self, pods_list, host_ip):
        """
        Filter out (in place) pods that are not running on the given host.

        TODO: currently not in use, was added to support events filtering, consider to remove it.
        """
        pod_items = pods_list.get('items') or []
        log.debug('Found {} pods to filter'.format(len(pod_items)))

        filtered_pods = []
        for pod in pod_items:
            status = pod.get('status', {})
            if status.get('hostIP') == host_ip:
                filtered_pods.append(pod)
        log.debug('Pods after filtering: {}'.format(len(filtered_pods)))

        pods_list['items'] = filtered_pods
        return pods_list

    def retrieve_json_auth(self, url, timeout=10):
        """
        Kubernetes API requires authentication using a token available in
        every pod, or with a client X509 cert/key pair.
        We authenticate using the service account token by default
        and replace this behavior with cert authentication if the user provided
        a cert/key pair in the instance.

        We try to verify the server TLS cert if the public cert is available.

        Returns the decoded JSON body; raises on HTTP error statuses.
        """
        verify = self.tls_settings.get('apiserver_cacert')
        if not verify:
            verify = self.CA_CRT_PATH if os.path.exists(self.CA_CRT_PATH) else False
        log.debug('ssl validation: {}'.format(verify))

        cert = self.tls_settings.get('apiserver_client_cert')
        # The bearer token is only sent when no client certificate is configured.
        bearer_token = self.tls_settings.get('bearer_token') if not cert else None
        headers = {'Authorization': 'Bearer {}'.format(bearer_token)} if bearer_token else None

        r = requests.get(url, timeout=timeout, headers=headers, verify=verify, cert=cert)
        r.raise_for_status()
        return r.json()

    def get_node_info(self):
        """
        Return the IP address and the hostname of the node where
        the pod is running.
        """
        if None in (self._node_ip, self._node_name):
            self._fetch_host_data()
        return self._node_ip, self._node_name

    def _fetch_host_data(self):
        """
        Retrieve host name and IP address from the payload returned by the listing
        pods endpoints from kubelet or kubernetes API.

        The host IP address is different from the default router for the pod.
        """
        try:
            pod_items = self.retrieve_pods_list().get("items") or []
        except Exception as e:
            log.warning("Unable to retrieve pod list %s. Not fetching host data", str(e))
            return

        for pod in pod_items:
            metadata = pod.get("metadata", {})
            name = metadata.get("name")
            # The pod whose name equals $HOSTNAME is taken to be our own pod.
            if name == self.host_name:
                status = pod.get('status', {})
                spec = pod.get('spec', {})
                # if not found, use an empty string - we use None as "not initialized"
                self._node_ip = status.get('hostIP', '')
                self._node_name = spec.get('nodeName', '')
                break

    def extract_event_tags(self, event):
        """
        Return a list of tags extracted from an event object
        """
        tags = []

        if 'reason' in event:
            tags.append('reason:%s' % event.get('reason', '').lower())
        if 'namespace' in event.get('metadata', {}):
            tags.append('namespace:%s' % event['metadata']['namespace'])
        if 'host' in event.get('source', {}):
            tags.append('node_name:%s' % event['source']['host'])
        if 'kind' in event.get('involvedObject', {}):
            tags.append('object_type:%s' % event['involvedObject'].get('kind', '').lower())

        return tags

    def are_tags_filtered(self, tags):
        """
        Because it is a pain to call it from the kubernetes check otherwise.
        """
        return self.docker_util.are_tags_filtered(tags)

    @classmethod
    def get_auth_token(cls):
        """
        Return a string containing the authorization token for the pod,
        or None when the token file cannot be read.
        """
        try:
            with open(cls.AUTH_TOKEN_PATH) as f:
                # Strip surrounding whitespace: the value is placed verbatim
                # in an HTTP Authorization header.
                return f.read().strip()
        except IOError as e:
            log.error('Unable to read token from {}: {}'.format(cls.AUTH_TOKEN_PATH, e))

        return None
class KubeUtil:
    """Helper giving access to the kubelet, cAdvisor and apiserver endpoints.

    URLs are derived from a check instance (a dict); when no instance is
    given, the first instance of the kubernetes check configuration file is
    used, falling back to defaults if it is missing or invalid.
    """

    DEFAULT_METHOD = 'http'
    MACHINE_INFO_PATH = '/api/v1.3/machine/'
    METRICS_PATH = '/api/v1.3/subcontainers/'
    DEPLOYMENTS_LIST_PATH = 'deployments/'
    REPLICASETS_LIST_PATH = 'replicasets/'
    PODS_LIST_PATH = 'pods/'
    SERVICES_LIST_PATH = 'services/'
    NODES_LIST_PATH = 'nodes/'
    ENDPOINTS_LIST_PATH = 'endpoints/'
    DEFAULT_CADVISOR_PORT = 4194
    DEFAULT_KUBELET_PORT = 10255
    DEFAULT_MASTER_METHOD = 'https'
    DEFAULT_MASTER_PORT = 443
    DEFAULT_MASTER_NAME = 'kubernetes'  # DNS name to reach the master from a pod.
    DEFAULT_USE_KUBE_AUTH = False
    CA_CRT_PATH = '/run/secrets/kubernetes.io/serviceaccount/ca.crt'
    AUTH_TOKEN_PATH = '/run/secrets/kubernetes.io/serviceaccount/token'
    DEFAULT_TIMEOUT_SECONDS = 10

    POD_NAME_LABEL = "io.kubernetes.pod.name"
    NAMESPACE_LABEL = "io.kubernetes.pod.namespace"

    def __init__(self, instance=None):
        self.docker_util = DockerUtil()
        if instance is None:
            try:
                config_file_path = get_conf_path(KUBERNETES_CHECK_NAME)
                check_config = check_yaml(config_file_path)
                instance = check_config['instances'][0]
            # kubernetes.yaml was not found
            except IOError as ex:
                # str(ex) instead of the deprecated ex.message, which is empty
                # for errno-style IOErrors.
                log.error(str(ex))
                instance = {}
            except Exception:
                log.error(
                    'Kubernetes configuration file is invalid. '
                    'Trying connecting to kubelet with default settings anyway...'
                )
                instance = {}

        self.timeoutSeconds = instance.get("timeoutSeconds", KubeUtil.DEFAULT_TIMEOUT_SECONDS)
        self.method = instance.get('method', KubeUtil.DEFAULT_METHOD)
        self.host = instance.get("host") or self.docker_util.get_hostname()
        self._node_ip = self._node_name = None  # lazy evaluation
        self.host_name = os.environ.get('HOSTNAME')

        self.cadvisor_port = instance.get('port', KubeUtil.DEFAULT_CADVISOR_PORT)
        self.kubelet_port = instance.get('kubelet_port', KubeUtil.DEFAULT_KUBELET_PORT)
        self.master_method = instance.get('master_method', KubeUtil.DEFAULT_MASTER_METHOD)
        self.master_name = instance.get('master_name', KubeUtil.DEFAULT_MASTER_NAME)
        self.master_port = instance.get('master_port', KubeUtil.DEFAULT_MASTER_PORT)
        self.use_kube_auth = instance.get('use_kube_auth', KubeUtil.DEFAULT_USE_KUBE_AUTH)

        self.kubelet_api_url = '%s://%s:%d' % (self.method, self.host, self.kubelet_port)
        self.cadvisor_url = '%s://%s:%d' % (self.method, self.host, self.cadvisor_port)
        # The environment variable wins over the configured master name/port.
        self.master_host = os.environ.get('KUBERNETES_SERVICE_HOST') or (
            '%s:%d' % (self.master_name, self.master_port))
        self.kubernetes_api_url = '%s://%s/api/v1/' % (self.master_method, self.master_host)
        self.kubernetes_api_extension_url = '%s://%s/apis/extensions/v1beta1/' % (
            self.master_method, self.master_host)

        self.metrics_url = urljoin(self.cadvisor_url, KubeUtil.METRICS_PATH)
        self.machine_info_url = urljoin(self.cadvisor_url, KubeUtil.MACHINE_INFO_PATH)
        self.nodes_list_url = urljoin(self.kubernetes_api_url, KubeUtil.NODES_LIST_PATH)
        self.services_list_url = urljoin(self.kubernetes_api_url, KubeUtil.SERVICES_LIST_PATH)
        self.endpoints_list_url = urljoin(self.kubernetes_api_url, KubeUtil.ENDPOINTS_LIST_PATH)
        self.pods_list_url = urljoin(self.kubernetes_api_url, KubeUtil.PODS_LIST_PATH)
        self.deployments_list_url = urljoin(self.kubernetes_api_extension_url,
                                            KubeUtil.DEPLOYMENTS_LIST_PATH)
        self.kube_health_url = urljoin(self.kubelet_api_url, 'healthz')

        # keep track of the latest k8s event we collected and posted
        # default value is 0 but TTL for k8s events is one hour anyways
        self.last_event_collection_ts = defaultdict(int)

    def get_kube_labels(self, excluded_keys=None):
        """Fetch the pods list and return its labels (see extract_kube_labels)."""
        pods = self.retrieve_pods_list()
        return self.extract_kube_labels(pods, excluded_keys=excluded_keys)

    def extract_kube_labels(self, pods_list, excluded_keys=None):
        """
        Extract labels from a list of pods.

        Returns a defaultdict mapping each pod key (see
        extract_metadata_labels) to its list of "kube_<label>:<value>" tags.
        """
        excluded_keys = excluded_keys or []
        kube_labels = defaultdict(list)
        pod_items = pods_list.get("items") or []
        for pod in pod_items:
            metadata = pod.get("metadata", {})
            pod_labels = self.extract_metadata_labels(metadata, excluded_keys)
            kube_labels.update(pod_labels)

        return kube_labels

    def extract_metadata_labels(self, metadata, excluded_keys=None, add_kube_prefix=True):
        """
        Extract labels from a metadata section.

        The result is a defaultdict with at most one key: "<namespace>/<name>",
        or just "<name>" when the metadata has no namespace. Its value lists
        "<label>:<value>" strings, prefixed with "kube_" when add_kube_prefix
        is true. Label keys found in excluded_keys are skipped; metadata
        without a name or without labels yields an empty result.
        """
        # None instead of a shared mutable default argument.
        excluded_keys = excluded_keys or ()
        kube_labels = defaultdict(list)
        name = metadata.get("name")
        namespace = metadata.get("namespace")
        labels = metadata.get("labels")
        if name and labels:
            if namespace:
                key = "%s/%s" % (namespace, name)
            else:
                key = name

            for k, v in labels.items():
                if k in excluded_keys:
                    continue
                if add_kube_prefix:
                    kube_labels[key].append(u"kube_%s:%s" % (k, v))
                else:
                    kube_labels[key].append(u"%s:%s" % (k, v))

        return kube_labels

    def extract_meta(self, pods_list, field_name):
        """
        Extract fields like `uid` or `name` from the `metadata` section of a
        list of pods.

        TODO: currently not in use, was added to support events filtering, consider to remove it.
        """
        uids = []
        pods = pods_list.get("items") or []
        for p in pods:
            value = p.get('metadata', {}).get(field_name)
            if value is not None:
                uids.append(value)
        return uids

    def retrieve_pods_list(self):
        """
        Retrieve the list of pods for this cluster from self.pods_list_url
        (built on the master API URL in __init__).

        TODO: the list of pods could be cached with some policy to be decided.
        """
        return self.retrieve_json_with_optional_auth(url=self.pods_list_url)

    def retrieve_endpoints_list(self):
        """
        Retrieve the list of endpoints for this cluster from
        self.endpoints_list_url (built on the master API URL in __init__).

        TODO: the list of endpoints could be cached with some policy to be decided.
        """
        return self.retrieve_json_with_optional_auth(url=self.endpoints_list_url)

    def retrieve_machine_info(self):
        """
        Retrieve machine info from Cadvisor.
        """
        return self.retrieve_json_with_optional_auth(url=self.machine_info_url)

    def retrieve_metrics(self):
        """
        Retrieve metrics from Cadvisor.
        """
        return self.retrieve_json_with_optional_auth(url=self.metrics_url)

    def retrieve_nodes_list(self):
        """
        Retrieve the list of nodes for this cluster from self.nodes_list_url
        (built on the master API URL in __init__).
        """
        return self.retrieve_json_with_optional_auth(self.nodes_list_url)

    def retrieve_services_list(self):
        """
        Retrieve the list of services for this cluster from
        self.services_list_url (built on the master API URL in __init__).
        """
        return self.retrieve_json_with_optional_auth(url=self.services_list_url)

    def retrieve_json_with_optional_auth(self, url):
        """
        Fetch `url` as JSON, sending the service account token when
        use_kube_auth is enabled. self.timeoutSeconds is used in both cases.
        """
        if self.use_kube_auth:
            return self.retrieve_json_auth(url=url,
                                           auth_token=self.get_auth_token(),
                                           timeout=self.timeoutSeconds)
        else:
            return retrieve_json(url=url, timeout=self.timeoutSeconds)

    def retrieve_deployments_list(self):
        """
        Retrieve the list of deployments for this cluster querying the
        extensions API (self.deployments_list_url).
        https://kubernetes.io/docs/concepts/workloads/controllers/deployment/
        """
        return self.retrieve_json_with_optional_auth(url=self.deployments_list_url)

    def retrieve_replicaset_filtered_list(self, namespace=None, labels_dict=None):
        """
        Retrieve the list of replicasets for given parameters, namespace and labels selector.
        https://kubernetes.io/docs/concepts/workloads/controllers/replicaset/
        The replicaset filter is very similar to how it is implemented in kubernetes dashboard:
        https://github.com/kubernetes/dashboard/blob/master/src/app/backend/resource/deployment/detail.go
        https://github.com/kubernetes/dashboard/blob/master/src/app/backend/resource/common/resourcechannels.go
        """
        if labels_dict:
            params = "?labelSelector=%s" % self._to_label_selector(labels_dict)
        else:
            params = ""
        if namespace:
            fetch_url = "%snamespaces/%s/%s%s" % (
                self.kubernetes_api_extension_url, namespace,
                KubeUtil.REPLICASETS_LIST_PATH, params)
        else:
            fetch_url = "%s%s%s" % (self.kubernetes_api_extension_url,
                                    KubeUtil.REPLICASETS_LIST_PATH, params)
        return self._retrieve_replicaset_list(fetch_url=fetch_url)

    def _retrieve_replicaset_list(self, fetch_url):
        """
        Retrieve the list of replicasets found at the given, already built, URL.
        https://kubernetes.io/docs/concepts/workloads/controllers/replicaset/
        """
        return self.retrieve_json_with_optional_auth(url=fetch_url)

    def _to_label_selector(self, labels_dict):
        """
        Render labels dict {'app': 'nginxapp', 'pod-template-hash': 275046495} to
        a label selector in a form "app%3Dnginxapp,pod-template-hash%3D275046495"
        """
        # Only the '=' separator is percent-encoded; names and values are
        # inserted as given.
        labels = ["%s%%3D%s" % (name, value) for name, value in labels_dict.items()]
        return ",".join(labels)

    def filter_pods_list(self, pods_list, host_ip):
        """
        Filter out (in place) pods that are not running on the given host.

        TODO: currently not in use, was added to support events filtering, consider to remove it.
        """
        pod_items = pods_list.get('items') or []
        log.debug('Found {} pods to filter'.format(len(pod_items)))

        filtered_pods = []
        for pod in pod_items:
            status = pod.get('status', {})
            if status.get('hostIP') == host_ip:
                filtered_pods.append(pod)
        log.debug('Pods after filtering: {}'.format(len(filtered_pods)))

        pods_list['items'] = filtered_pods
        return pods_list

    def retrieve_json_auth(self, url, auth_token, timeout=10):
        """
        Kubernetes API requires authentication using a token available in
        every pod.

        We try to verify ssl certificate if available.

        Returns the decoded JSON body; raises on HTTP error statuses.
        """
        verify = self.CA_CRT_PATH if os.path.exists(self.CA_CRT_PATH) else False
        log.debug('ssl validation: {}'.format(verify))
        headers = {'Authorization': 'Bearer {}'.format(auth_token)}
        r = requests.get(url, timeout=timeout, headers=headers, verify=verify)
        r.raise_for_status()
        return r.json()

    def get_node_info(self):
        """
        Return the IP address and the hostname of the node where
        the pod is running.
        """
        if None in (self._node_ip, self._node_name):
            self._fetch_host_data()
        return self._node_ip, self._node_name

    def _fetch_host_data(self):
        """
        Retrieve host name and IP address from the payload returned by the listing
        pods endpoints from kubelet or kubernetes API.

        The host IP address is different from the default router for the pod.
        """
        try:
            pod_items = self.retrieve_pods_list().get("items") or []
        except Exception as e:
            log.warning("Unable to retrieve pod list %s. Not fetching host data", str(e))
            return

        for pod in pod_items:
            metadata = pod.get("metadata", {})
            name = metadata.get("name")
            # The pod whose name equals $HOSTNAME is taken to be our own pod.
            if name == self.host_name:
                status = pod.get('status', {})
                spec = pod.get('spec', {})
                # if not found, use an empty string - we use None as "not initialized"
                self._node_ip = status.get('hostIP', '')
                self._node_name = spec.get('nodeName', '')
                break

    def extract_event_tags(self, event):
        """
        Return a list of tags extracted from an event object
        """
        tags = []

        if 'reason' in event:
            tags.append('reason:%s' % event.get('reason', '').lower())
        if 'namespace' in event.get('metadata', {}):
            tags.append('namespace:%s' % event['metadata']['namespace'])
        if 'host' in event.get('source', {}):
            tags.append('node_name:%s' % event['source']['host'])
        if 'kind' in event.get('involvedObject', {}):
            tags.append('object_type:%s' % event['involvedObject'].get('kind', '').lower())

        return tags

    def are_tags_filtered(self, tags):
        """
        Because it is a pain to call it from the kubernetes check otherwise.
        """
        return self.docker_util.are_tags_filtered(tags)

    @classmethod
    def get_auth_token(cls):
        """
        Return a string containing the authorization token for the pod,
        or None when the token file cannot be read.
        """
        try:
            with open(cls.AUTH_TOKEN_PATH) as f:
                # Strip surrounding whitespace: the value is placed verbatim
                # in an HTTP Authorization header.
                return f.read().strip()
        except IOError as e:
            log.error('Unable to read token from {}: {}'.format(cls.AUTH_TOKEN_PATH, e))

        return None
class KubeUtil:
    """Helper used to talk to the kubelet, cAdvisor and the Kubernetes apiserver.

    The constructor reads the kubernetes check configuration (unless an
    ``instance`` dict is given), prepares TLS/authentication settings, probes
    the kubelet to find out whether it answers over HTTP or HTTPS, and derives
    the URLs used by the other methods.
    """
    __metaclass__ = Singleton

    DEFAULT_METHOD = 'http'
    KUBELET_HEALTH_PATH = '/healthz'
    MACHINE_INFO_PATH = '/api/v1.3/machine/'
    METRICS_PATH = '/api/v1.3/subcontainers/'
    PODS_LIST_PATH = '/pods/'
    DEFAULT_CADVISOR_PORT = 4194
    DEFAULT_HTTP_KUBELET_PORT = 10255
    DEFAULT_HTTPS_KUBELET_PORT = 10250
    DEFAULT_MASTER_PORT = 8080
    DEFAULT_MASTER_NAME = 'kubernetes'  # DNS name to reach the master from a pod.
    CA_CRT_PATH = '/var/run/secrets/kubernetes.io/serviceaccount/ca.crt'
    AUTH_TOKEN_PATH = '/var/run/secrets/kubernetes.io/serviceaccount/token'

    POD_NAME_LABEL = "io.kubernetes.pod.name"
    NAMESPACE_LABEL = "io.kubernetes.pod.namespace"

    def __init__(self, instance=None):
        """
        Build the helper from a check instance.

        :param instance: configuration dict of the kubernetes check. When None,
            the first instance of the kubernetes check configuration file is
            used; if that file is missing or invalid an empty dict (default
            settings) is used instead.
        :raises Exception: when no kubelet endpoint could be reached.
        """
        self.docker_util = DockerUtil()
        if instance is None:
            try:
                config_file_path = get_conf_path(KUBERNETES_CHECK_NAME)
                check_config = check_yaml(config_file_path)
                instance = check_config['instances'][0]
            # kubernetes.yaml was not found
            except IOError as ex:
                # str(ex) rather than ex.message: the latter is deprecated and
                # empty for IOError instances built from (errno, strerror).
                log.error(str(ex))
                instance = {}
            except Exception:
                log.error(
                    'Kubernetes configuration file is invalid. '
                    'Trying connecting to kubelet with default settings anyway...'
                )
                instance = {}

        self.method = instance.get('method', KubeUtil.DEFAULT_METHOD)
        self._node_ip = self._node_name = None  # lazy evaluation
        self.host_name = os.environ.get('HOSTNAME')
        self.tls_settings = self._init_tls_settings(instance)

        # apiserver
        self.kubernetes_api_url = 'https://%s/api/v1' % (
            os.environ.get('KUBERNETES_SERVICE_HOST') or self.DEFAULT_MASTER_NAME)

        # kubelet
        try:
            self.kubelet_api_url = self._locate_kubelet(instance)
            if not self.kubelet_api_url:
                raise Exception("Couldn't find a method to connect to kubelet.")
        except Exception:
            log.error("Kubernetes check exiting, cannot run without access to kubelet.")
            # Bare raise keeps the traceback of the original failure.
            raise

        # The URL has the form scheme://host:port, so the second ':'-separated
        # chunk is '//host'.
        self.kubelet_host = self.kubelet_api_url.split(':')[1].lstrip('/')
        self.pods_list_url = urljoin(self.kubelet_api_url, KubeUtil.PODS_LIST_PATH)
        self.kube_health_url = urljoin(self.kubelet_api_url, KubeUtil.KUBELET_HEALTH_PATH)

        # cadvisor
        self.cadvisor_port = instance.get('port', KubeUtil.DEFAULT_CADVISOR_PORT)
        self.cadvisor_url = '%s://%s:%d' % (self.method, self.kubelet_host, self.cadvisor_port)
        self.metrics_url = urljoin(self.cadvisor_url, KubeUtil.METRICS_PATH)
        self.machine_info_url = urljoin(self.cadvisor_url, KubeUtil.MACHINE_INFO_PATH)

        # keep track of the latest k8s event we collected and posted
        # default value is 0 but TTL for k8s events is one hour anyways
        self.last_event_collection_ts = 0

    def _init_tls_settings(self, instance):
        """
        Initialize TLS settings for connection to apiserver and kubelet.

        :param instance: check instance dict holding the optional
            ``apiserver_*`` and ``kubelet_*`` options.
        :return: dict that may contain the keys ``apiserver_client_cert``,
            ``apiserver_cacert``, ``bearer_token``, ``kubelet_client_cert``
            and always contains ``kubelet_verify``.
        """
        tls_settings = {}

        # apiserver: client cert/key pair is only kept if both files exist
        client_crt = instance.get('apiserver_client_crt')
        client_key = instance.get('apiserver_client_key')
        apiserver_cacert = instance.get('apiserver_ca_cert')

        if client_crt and client_key and os.path.exists(client_crt) and os.path.exists(client_key):
            tls_settings['apiserver_client_cert'] = (client_crt, client_key)

        if apiserver_cacert and os.path.exists(apiserver_cacert):
            tls_settings['apiserver_cacert'] = apiserver_cacert

        token = self.get_auth_token()
        if token:
            tls_settings['bearer_token'] = token

        # kubelet
        kubelet_client_crt = instance.get('kubelet_client_crt')
        kubelet_client_key = instance.get('kubelet_client_key')
        if kubelet_client_crt and kubelet_client_key and \
                os.path.exists(kubelet_client_crt) and os.path.exists(kubelet_client_key):
            tls_settings['kubelet_client_cert'] = (kubelet_client_crt, kubelet_client_key)

        # An explicit kubelet_cert takes precedence over kubelet_tls_verify.
        cert = instance.get('kubelet_cert')
        if cert:
            tls_settings['kubelet_verify'] = cert
        else:
            tls_settings['kubelet_verify'] = instance.get('kubelet_tls_verify', DEFAULT_TLS_VERIFY)

        return tls_settings

    def _locate_kubelet(self, instance):
        """
        Kubelet may or may not accept un-authenticated http requests.
        If it doesn't we need to use its HTTPS API that may or may not
        require auth.

        :param instance: check instance dict (``host`` and ``kubelet_port``
            are read from it).
        :return: base URL of the kubelet, ``http://host:port`` when the health
            endpoint answered over HTTP, ``https://host:port`` otherwise.
        :raises Exception: whatever the HTTPS probe raises when it fails too.
        """
        host = os.environ.get('KUBERNETES_KUBELET_HOST') or instance.get("host")
        if not host:
            # if no hostname was provided, use the docker hostname if cert
            # validation is not required, the kubernetes hostname otherwise.
            docker_hostname = self.docker_util.get_hostname(should_resolve=True)
            if self.tls_settings.get('kubelet_verify'):
                try:
                    k8s_hostname = self.get_node_hostname(docker_hostname)
                    host = k8s_hostname or docker_hostname
                except Exception as ex:
                    log.error(str(ex))
                    host = docker_hostname
            else:
                host = docker_hostname

        try:
            # check if the no-auth endpoint is enabled
            port = instance.get('kubelet_port', KubeUtil.DEFAULT_HTTP_KUBELET_PORT)
            no_auth_url = 'http://%s:%s' % (host, port)
            test_url = urljoin(no_auth_url, KubeUtil.KUBELET_HEALTH_PATH)
            self.perform_kubelet_query(test_url)
            return no_auth_url
        except Exception:
            log.debug("Couldn't query kubelet over HTTP, assuming it's not in no_auth mode.")

        # HTTP probe failed: fall back to HTTPS and let any error propagate.
        port = instance.get('kubelet_port', KubeUtil.DEFAULT_HTTPS_KUBELET_PORT)
        https_url = 'https://%s:%s' % (host, port)
        test_url = urljoin(https_url, KubeUtil.KUBELET_HEALTH_PATH)
        self.perform_kubelet_query(test_url)

        return https_url

    def get_node_hostname(self, host):
        """
        Query the API server for the kubernetes hostname of the node
        using the docker hostname as a filter.

        :param host: value matched against the ``kubernetes.io/hostname`` label.
        :return: the node address of type ``Hostname``, or None when the query
            did not return exactly one node or no such address exists.
        """
        node_filter = {'labelSelector': 'kubernetes.io/hostname=%s' % host}
        node = self.retrieve_json_auth(
            self.kubernetes_api_url + '/nodes?%s' % urlencode(node_filter)
        )
        items = node['items']
        if len(items) != 1:
            log.error('Error while getting node hostname: expected 1 node, got %s.' % len(items))
        else:
            addresses = items[0].get('status', {}).get('addresses', [])
            for address in addresses:
                if address.get('type') == 'Hostname':
                    return address['address']
        return None

    def get_kube_labels(self, excluded_keys=None):
        """
        Fetch the pod list from the kubelet and return its labels as tags.

        :param excluded_keys: label names to skip.
        :return: see extract_kube_labels.
        """
        pods = self.retrieve_pods_list()
        return self.extract_kube_labels(pods, excluded_keys=excluded_keys)

    def extract_kube_labels(self, pods_list, excluded_keys=None):
        """
        Extract labels from a list of pods coming from
        the kubelet API.

        :param pods_list: dict with an ``items`` list of pod objects.
        :param excluded_keys: label names to skip.
        :return: defaultdict mapping ``namespace/podname`` to a list of
            ``kube_<label>:<value>`` unicode tags. Pods lacking a name, a
            namespace or labels are ignored.
        """
        excluded_keys = excluded_keys or []
        kube_labels = defaultdict(list)
        pod_items = pods_list.get("items") or []
        for pod in pod_items:
            metadata = pod.get("metadata", {})
            name = metadata.get("name")
            namespace = metadata.get("namespace")
            labels = metadata.get("labels")
            if name and labels and namespace:
                key = "%s/%s" % (namespace, name)

                for k, v in labels.iteritems():
                    if k in excluded_keys:
                        continue

                    kube_labels[key].append(u"kube_%s:%s" % (k, v))

        return kube_labels

    def retrieve_pods_list(self):
        """
        Retrieve the list of pods for this cluster querying the kubelet API.

        TODO: the list of pods could be cached with some policy to be decided.
        """
        return self.perform_kubelet_query(self.pods_list_url).json()

    def retrieve_machine_info(self):
        """
        Retrieve machine info from Cadvisor.
        """
        return retrieve_json(self.machine_info_url)

    def retrieve_metrics(self):
        """
        Retrieve metrics from Cadvisor.
        """
        return retrieve_json(self.metrics_url)

    def perform_kubelet_query(self, url, verbose=True, timeout=10):
        """
        Perform and return a GET request against kubelet. Support auth and TLS validation.

        :param url: full URL to query.
        :param verbose: sent as the ``verbose`` query parameter.
        :param timeout: request timeout, passed to ``requests.get``.
        :return: the ``requests`` response object; the HTTP status is not
            checked here.
        """
        tls_context = self.tls_settings

        headers = None
        cert = tls_context.get('kubelet_client_cert')
        verify = tls_context.get('kubelet_verify', DEFAULT_TLS_VERIFY)

        # if cert-based auth is enabled, don't use the token.
        if not cert and url.lower().startswith('https'):
            token = self.get_auth_token()
            # Without a readable token, send no header at all instead of the
            # meaningless "Bearer None".
            if token:
                headers = {'Authorization': 'Bearer {}'.format(token)}

        return requests.get(url, timeout=timeout, verify=verify,
                            cert=cert, headers=headers, params={'verbose': verbose})

    def retrieve_json_auth(self, url, timeout=10, verify=None):
        """
        Kubernetes API requires authentication using a token available in
        every pod, or with a client X509 cert/key pair.
        We authenticate using the service account token by default
        and replace this behavior with cert authentication if the user provided
        a cert/key pair in the instance.

        We try to verify the server TLS cert if the public cert is available.

        :param url: full URL to query.
        :param timeout: request timeout, passed to ``requests.get``.
        :param verify: currently ignored; the value is always recomputed from
            the TLS settings and CA_CRT_PATH.
        :return: the decoded JSON body.
        :raises requests.HTTPError: on a 4xx/5xx response (``raise_for_status``).
        """
        verify = self.tls_settings.get('apiserver_cacert')
        if not verify:
            verify = self.CA_CRT_PATH if os.path.exists(self.CA_CRT_PATH) else False
        log.debug('tls validation: {}'.format(verify))

        cert = self.tls_settings.get('apiserver_client_cert')
        # The bearer token is only used when no client cert pair is configured.
        bearer_token = self.tls_settings.get('bearer_token') if not cert else None
        headers = {'Authorization': 'Bearer {}'.format(bearer_token)} if bearer_token else None

        r = requests.get(url, timeout=timeout, headers=headers, verify=verify, cert=cert)
        r.raise_for_status()
        return r.json()

    def get_node_info(self):
        """
        Return the IP address and the hostname of the node where
        the pod is running.

        Values are fetched on first use; an empty string means the lookup ran
        but the field was not found, None means the lookup has not succeeded.
        """
        if None in (self._node_ip, self._node_name):
            self._fetch_host_data()
        return self._node_ip, self._node_name

    def _fetch_host_data(self):
        """
        Retrieve host name and IP address from the payload returned by the listing
        pods endpoints from kubelet.

        The host IP address is different from the default router for the pod.
        """
        try:
            pod_items = self.retrieve_pods_list().get("items") or []
        except Exception as e:
            log.warning("Unable to retrieve pod list %s. Not fetching host data", str(e))
            return

        for pod in pod_items:
            metadata = pod.get("metadata", {})
            name = metadata.get("name")
            # The pod whose name equals $HOSTNAME is the one we run in.
            if name == self.host_name:
                status = pod.get('status', {})
                spec = pod.get('spec', {})
                # if not found, use an empty string - we use None as "not initialized"
                self._node_ip = status.get('hostIP', '')
                self._node_name = spec.get('nodeName', '')
                break

    def extract_event_tags(self, event):
        """
        Return a list of tags extracted from an event object

        Possible tags: ``reason``, ``namespace``, ``node_name`` and
        ``object_type``; each is emitted only when the matching field exists.
        """
        tags = []

        if 'reason' in event:
            tags.append('reason:%s' % event.get('reason', '').lower())
        if 'namespace' in event.get('metadata', {}):
            tags.append('namespace:%s' % event['metadata']['namespace'])
        if 'host' in event.get('source', {}):
            tags.append('node_name:%s' % event['source']['host'])
        if 'kind' in event.get('involvedObject', {}):
            tags.append('object_type:%s' % event['involvedObject'].get('kind', '').lower())

        return tags

    def are_tags_filtered(self, tags):
        """
        Delegate to DockerUtil.are_tags_filtered so that the kubernetes check
        can call it through this object.
        """
        return self.docker_util.are_tags_filtered(tags)

    @classmethod
    def get_auth_token(cls):
        """
        Return a string containing the authorization token for the pod,
        or None when the token file cannot be read.
        """
        try:
            with open(cls.AUTH_TOKEN_PATH) as f:
                return f.read()
        except IOError as e:
            log.error('Unable to read token from {}: {}'.format(cls.AUTH_TOKEN_PATH, e))

        return None
class KubeUtil:
    """Helper used to talk to the kubelet, cAdvisor and the Kubernetes apiserver.

    The constructor reads the kubernetes check configuration (unless an
    ``instance`` dict is given), prepares TLS/authentication settings, probes
    the kubelet to find out whether it answers over HTTP or HTTPS, and derives
    the URLs used by the other methods. Pod tagging (labels, creator, services)
    and event processing helpers live here as well.
    """
    __metaclass__ = Singleton

    DEFAULT_METHOD = 'http'
    KUBELET_HEALTH_PATH = '/healthz'
    MACHINE_INFO_PATH = '/api/v1.3/machine/'
    METRICS_PATH = '/api/v1.3/subcontainers/'
    PODS_LIST_PATH = '/pods/'
    DEFAULT_CADVISOR_PORT = 4194
    DEFAULT_HTTP_KUBELET_PORT = 10255
    DEFAULT_HTTPS_KUBELET_PORT = 10250
    DEFAULT_MASTER_PORT = 8080
    DEFAULT_MASTER_NAME = 'kubernetes'  # DNS name to reach the master from a pod.
    DEFAULT_LABEL_PREFIX = 'kube_'
    CA_CRT_PATH = '/var/run/secrets/kubernetes.io/serviceaccount/ca.crt'
    AUTH_TOKEN_PATH = '/var/run/secrets/kubernetes.io/serviceaccount/token'

    POD_NAME_LABEL = "io.kubernetes.pod.name"
    NAMESPACE_LABEL = "io.kubernetes.pod.namespace"

    def __init__(self, instance=None):
        """
        Build the helper from a check instance.

        :param instance: configuration dict of the kubernetes check. When None,
            the first instance of the kubernetes check configuration file is
            used; if that file is missing or invalid an empty dict (default
            settings) is used instead.
        :raises Exception: when no kubelet endpoint could be reached.
        """
        self.docker_util = DockerUtil()
        if instance is None:
            try:
                config_file_path = get_conf_path(KUBERNETES_CHECK_NAME)
                check_config = check_yaml(config_file_path)
                instance = check_config['instances'][0]
            # kubernetes.yaml was not found
            except IOError as ex:
                # str(ex) rather than ex.message: the latter is deprecated and
                # empty for IOError instances built from (errno, strerror).
                log.error(str(ex))
                instance = {}
            except Exception:
                log.error(
                    'Kubernetes configuration file is invalid. '
                    'Trying connecting to kubelet with default settings anyway...'
                )
                instance = {}

        self.method = instance.get('method', KubeUtil.DEFAULT_METHOD)
        self._node_ip = self._node_name = None  # lazy evaluation
        self.host_name = os.environ.get('HOSTNAME')
        self.tls_settings = self._init_tls_settings(instance)

        # apiserver
        self.kubernetes_api_url = 'https://%s/api/v1' % (
            os.environ.get('KUBERNETES_SERVICE_HOST') or self.DEFAULT_MASTER_NAME)

        # kubelet
        try:
            self.kubelet_api_url = self._locate_kubelet(instance)
            if not self.kubelet_api_url:
                raise Exception("Couldn't find a method to connect to kubelet.")
        except Exception:
            log.error("Kubernetes check exiting, cannot run without access to kubelet.")
            # Bare raise keeps the traceback of the original failure.
            raise

        # Service mapping helper class
        self._service_mapper = PodServiceMapper(self)

        # The URL has the form scheme://host:port, so the second ':'-separated
        # chunk is '//host'.
        self.kubelet_host = self.kubelet_api_url.split(':')[1].lstrip('/')
        self.pods_list_url = urljoin(self.kubelet_api_url, KubeUtil.PODS_LIST_PATH)
        self.kube_health_url = urljoin(self.kubelet_api_url, KubeUtil.KUBELET_HEALTH_PATH)
        self.kube_label_prefix = instance.get('label_to_tag_prefix', KubeUtil.DEFAULT_LABEL_PREFIX)

        # cadvisor
        self.cadvisor_port = instance.get('port', KubeUtil.DEFAULT_CADVISOR_PORT)
        self.cadvisor_url = '%s://%s:%d' % (self.method, self.kubelet_host, self.cadvisor_port)
        self.metrics_url = urljoin(self.cadvisor_url, KubeUtil.METRICS_PATH)
        self.machine_info_url = urljoin(self.cadvisor_url, KubeUtil.MACHINE_INFO_PATH)

        # keep track of the latest k8s event we collected and posted
        # default value is 0 but TTL for k8s events is one hour anyways
        self.last_event_collection_ts = 0

    def _init_tls_settings(self, instance):
        """
        Initialize TLS settings for connection to apiserver and kubelet.

        :param instance: check instance dict holding the optional
            ``apiserver_*`` and ``kubelet_*`` options.
        :return: dict that may contain the keys ``apiserver_client_cert``,
            ``apiserver_cacert``, ``bearer_token``, ``kubelet_client_cert``
            and always contains ``kubelet_verify``.
        """
        tls_settings = {}

        # apiserver: client cert/key pair is only kept if both files exist
        client_crt = instance.get('apiserver_client_crt')
        client_key = instance.get('apiserver_client_key')
        apiserver_cacert = instance.get('apiserver_ca_cert')

        if client_crt and client_key and os.path.exists(client_crt) and os.path.exists(client_key):
            tls_settings['apiserver_client_cert'] = (client_crt, client_key)

        if apiserver_cacert and os.path.exists(apiserver_cacert):
            tls_settings['apiserver_cacert'] = apiserver_cacert

        token = self.get_auth_token()
        if token:
            tls_settings['bearer_token'] = token

        # kubelet
        kubelet_client_crt = instance.get('kubelet_client_crt')
        kubelet_client_key = instance.get('kubelet_client_key')
        if kubelet_client_crt and kubelet_client_key and \
                os.path.exists(kubelet_client_crt) and os.path.exists(kubelet_client_key):
            tls_settings['kubelet_client_cert'] = (kubelet_client_crt, kubelet_client_key)

        # An explicit kubelet_cert takes precedence over kubelet_tls_verify.
        cert = instance.get('kubelet_cert')
        if cert:
            tls_settings['kubelet_verify'] = cert
        else:
            tls_settings['kubelet_verify'] = instance.get('kubelet_tls_verify', DEFAULT_TLS_VERIFY)

        return tls_settings

    def _locate_kubelet(self, instance):
        """
        Kubelet may or may not accept un-authenticated http requests.
        If it doesn't we need to use its HTTPS API that may or may not
        require auth.

        :param instance: check instance dict (``host`` and ``kubelet_port``
            are read from it).
        :return: base URL of the kubelet, ``http://host:port`` when the health
            endpoint answered over HTTP, ``https://host:port`` otherwise.
        :raises Exception: whatever the HTTPS probe raises when it fails too.
        """
        host = os.environ.get('KUBERNETES_KUBELET_HOST') or instance.get("host")
        if not host:
            # if no hostname was provided, use the docker hostname if cert
            # validation is not required, the kubernetes hostname otherwise.
            docker_hostname = self.docker_util.get_hostname(should_resolve=True)
            if self.tls_settings.get('kubelet_verify'):
                try:
                    k8s_hostname = self.get_node_hostname(docker_hostname)
                    host = k8s_hostname or docker_hostname
                except Exception as ex:
                    log.error(str(ex))
                    host = docker_hostname
            else:
                host = docker_hostname

        try:
            # check if the no-auth endpoint is enabled
            port = instance.get('kubelet_port', KubeUtil.DEFAULT_HTTP_KUBELET_PORT)
            no_auth_url = 'http://%s:%s' % (host, port)
            test_url = urljoin(no_auth_url, KubeUtil.KUBELET_HEALTH_PATH)
            self.perform_kubelet_query(test_url)
            return no_auth_url
        except Exception:
            log.debug("Couldn't query kubelet over HTTP, assuming it's not in no_auth mode.")

        # HTTP probe failed: fall back to HTTPS and let any error propagate.
        port = instance.get('kubelet_port', KubeUtil.DEFAULT_HTTPS_KUBELET_PORT)
        https_url = 'https://%s:%s' % (host, port)
        test_url = urljoin(https_url, KubeUtil.KUBELET_HEALTH_PATH)
        self.perform_kubelet_query(test_url)

        return https_url

    def get_node_hostname(self, host):
        """
        Query the API server for the kubernetes hostname of the node
        using the docker hostname as a filter.

        :param host: value matched against the ``kubernetes.io/hostname`` label.
        :return: the node address of type ``Hostname``, or None when the query
            did not return exactly one node or no such address exists.
        """
        node_filter = {'labelSelector': 'kubernetes.io/hostname=%s' % host}
        node = self.retrieve_json_auth(
            self.kubernetes_api_url + '/nodes?%s' % urlencode(node_filter)
        )
        items = node['items']
        if len(items) != 1:
            log.error('Error while getting node hostname: expected 1 node, got %s.' % len(items))
        else:
            addresses = items[0].get('status', {}).get('addresses', [])
            for address in addresses:
                if address.get('type') == 'Hostname':
                    return address['address']
        return None

    def get_kube_pod_tags(self, excluded_keys=None):
        """
        Gets pods' labels as tags + creator and service tags.
        Returns a dict{namespace/podname: [tags]}
        """
        pods = self.retrieve_pods_list()
        return self.extract_kube_pod_tags(pods, excluded_keys=excluded_keys)

    def extract_kube_pod_tags(self, pods_list, excluded_keys=None, label_prefix=None):
        """
        Extract labels + creator and service tags from a list of
        pods coming from the kubelet API.

        :param pods_list: dict with an ``items`` list of pod objects
        :param excluded_keys: labels to skip
        :param label_prefix: prefix for label->tag conversion, None defaults
        to the configuration option label_to_tag_prefix
        Returns a dict{namespace/podname: [tags]}
        """
        excluded_keys = excluded_keys or []
        kube_labels = defaultdict(list)
        pod_items = pods_list.get("items") or []
        label_prefix = label_prefix or self.kube_label_prefix
        for pod in pod_items:
            metadata = pod.get("metadata", {})
            name = metadata.get("name")
            namespace = metadata.get("namespace")
            # "or {}" also covers a labels key explicitly set to null.
            labels = metadata.get("labels") or {}
            if name and namespace:
                key = "%s/%s" % (namespace, name)

                # Extract creator tags
                podtags = self.get_pod_creator_tags(metadata)

                # Extract services tags
                for service in self.match_services_for_pod(metadata):
                    if service is not None:
                        podtags.append(u'kube_service:%s' % service)

                # Extract labels
                for k, v in labels.iteritems():
                    if k in excluded_keys:
                        continue
                    podtags.append(u"%s%s:%s" % (label_prefix, k, v))

                kube_labels[key] = podtags

        return kube_labels

    def retrieve_pods_list(self):
        """
        Retrieve the list of pods for this cluster querying the kubelet API.

        TODO: the list of pods could be cached with some policy to be decided.
        """
        return self.perform_kubelet_query(self.pods_list_url).json()

    def retrieve_machine_info(self):
        """
        Retrieve machine info from Cadvisor.
        """
        return retrieve_json(self.machine_info_url)

    def retrieve_metrics(self):
        """
        Retrieve metrics from Cadvisor.
        """
        return retrieve_json(self.metrics_url)

    def get_deployment_for_replicaset(self, rs_name):
        """
        Get the deployment name for a given replicaset name
        For now, the rs name's first part always is the deployment's name, see
        https://github.com/kubernetes/kubernetes/blob/release-1.6/pkg/controller/deployment/sync.go#L299
        But it might change in a future k8s version. The other way to match RS and deployments is
        to parse and cache /apis/extensions/v1beta1/replicasets, mirroring PodServiceMapper

        :return: the part of ``rs_name`` before its last '-' when the part
            after it is all digits, None otherwise.
        """
        end = rs_name.rfind("-")
        if end > 0 and rs_name[end + 1:].isdigit():
            return rs_name[0:end]
        return None

    def perform_kubelet_query(self, url, verbose=True, timeout=10):
        """
        Perform and return a GET request against kubelet. Support auth and TLS validation.

        :param url: full URL to query.
        :param verbose: sent as the ``verbose`` query parameter.
        :param timeout: request timeout, passed to ``requests.get``.
        :return: the ``requests`` response object; the HTTP status is not
            checked here.
        """
        tls_context = self.tls_settings

        headers = None
        cert = tls_context.get('kubelet_client_cert')
        verify = tls_context.get('kubelet_verify', DEFAULT_TLS_VERIFY)

        # if cert-based auth is enabled, don't use the token.
        if not cert and url.lower().startswith('https'):
            token = self.get_auth_token()
            # Without a readable token, send no header at all instead of the
            # meaningless "Bearer None".
            if token:
                headers = {'Authorization': 'Bearer {}'.format(token)}

        return requests.get(url, timeout=timeout, verify=verify,
                            cert=cert, headers=headers, params={'verbose': verbose})

    def retrieve_json_auth(self, url, timeout=10, verify=None, params=None):
        """
        Kubernetes API requires authentication using a token available in
        every pod, or with a client X509 cert/key pair.
        We authenticate using the service account token by default
        and replace this behavior with cert authentication if the user provided
        a cert/key pair in the instance.

        We try to verify the server TLS cert if the public cert is available.

        :param url: full URL to query.
        :param timeout: request timeout, passed to ``requests.get``.
        :param verify: currently ignored; the value is always recomputed from
            the TLS settings and CA_CRT_PATH.
        :param params: query parameters, passed to ``requests.get``.
        :return: the decoded JSON body.
        :raises requests.HTTPError: on a 4xx/5xx response (``raise_for_status``).
        """
        verify = self.tls_settings.get('apiserver_cacert')
        if not verify:
            verify = self.CA_CRT_PATH if os.path.exists(self.CA_CRT_PATH) else False
        log.debug('tls validation: {}'.format(verify))

        cert = self.tls_settings.get('apiserver_client_cert')
        # The bearer token is only used when no client cert pair is configured.
        bearer_token = self.tls_settings.get('bearer_token') if not cert else None
        headers = {'Authorization': 'Bearer {}'.format(bearer_token)} if bearer_token else None

        r = requests.get(url, timeout=timeout, headers=headers, verify=verify, cert=cert, params=params)
        r.raise_for_status()
        return r.json()

    def get_node_info(self):
        """
        Return the IP address and the hostname of the node where
        the pod is running.

        Values are fetched on first use; an empty string means the lookup ran
        but the field was not found, None means the lookup has not succeeded.
        """
        if None in (self._node_ip, self._node_name):
            self._fetch_host_data()
        return self._node_ip, self._node_name

    def _fetch_host_data(self):
        """
        Retrieve host name and IP address from the payload returned by the listing
        pods endpoints from kubelet.

        The host IP address is different from the default router for the pod.
        """
        try:
            pod_items = self.retrieve_pods_list().get("items") or []
        except Exception as e:
            log.warning("Unable to retrieve pod list %s. Not fetching host data", str(e))
            return

        for pod in pod_items:
            metadata = pod.get("metadata", {})
            name = metadata.get("name")
            # The pod whose name equals $HOSTNAME is the one we run in.
            if name == self.host_name:
                status = pod.get('status', {})
                spec = pod.get('spec', {})
                # if not found, use an empty string - we use None as "not initialized"
                self._node_ip = status.get('hostIP', '')
                self._node_name = spec.get('nodeName', '')
                break

    def extract_event_tags(self, event):
        """
        Return a list of tags extracted from an event object

        Possible tags: ``reason``, ``namespace``, ``node_name`` and
        ``object_type``; each is emitted only when the matching field exists.
        """
        tags = []

        if 'reason' in event:
            tags.append('reason:%s' % event.get('reason', '').lower())
        if 'namespace' in event.get('metadata', {}):
            tags.append('namespace:%s' % event['metadata']['namespace'])
        if 'host' in event.get('source', {}):
            tags.append('node_name:%s' % event['source']['host'])
        if 'kind' in event.get('involvedObject', {}):
            tags.append('object_type:%s' % event['involvedObject'].get('kind', '').lower())

        return tags

    def are_tags_filtered(self, tags):
        """
        Delegate to DockerUtil.are_tags_filtered so that the kubernetes check
        can call it through this object.
        """
        return self.docker_util.are_tags_filtered(tags)

    @classmethod
    def get_auth_token(cls):
        """
        Return a string containing the authorization token for the pod,
        or None when the token file cannot be read.
        """
        try:
            with open(cls.AUTH_TOKEN_PATH) as f:
                return f.read()
        except IOError as e:
            log.error('Unable to read token from {}: {}'.format(cls.AUTH_TOKEN_PATH, e))

        return None

    def check_services_cache_freshness(self):
        """
        Entry point for sd_docker_backend to check whether to invalidate the cached services
        For now, we remove the whole cache as the fill_service_cache logic
        doesn't handle partial lookups

        We use the event's resourceVersion, as using the service's version wouldn't catch deletion
        """
        return self._service_mapper.check_services_cache_freshness()

    def match_services_for_pod(self, pod_metadata, refresh=False):
        """
        Match the pods labels with services' label selectors to determine the list
        of services that point to that pod. Returns an array of service names.

        Pass refresh=True if you want to bypass the cached cid->services mapping (after a service change)
        """
        return self._service_mapper.match_services_for_pod(pod_metadata, refresh, names=True)

    def get_event_retriever(self, namespaces=None, kinds=None):
        """
        Returns a KubeEventRetriever object ready for action
        """
        return KubeEventRetriever(self, namespaces, kinds)

    def match_containers_for_pods(self, pod_uids, podlist=None):
        """
        Reads a set of pod uids and returns the set of docker
        container ids they manage
        podlist should be a recent self.retrieve_pods_list return value,
        if not given that method will be called

        :param pod_uids: set of pod uids; anything that is not a non-empty set
            yields an empty result.
        :return: set of container ids with the ``docker://`` prefix removed.
        """
        cids = set()

        if not isinstance(pod_uids, set) or len(pod_uids) < 1:
            return cids

        if podlist is None:
            podlist = self.retrieve_pods_list()

        docker_prefix = "docker://"
        for pod in podlist.get('items') or []:
            uid = pod.get('metadata', {}).get('uid', None)
            if uid not in pod_uids:
                continue
            # A pod may carry no containerStatuses at all; treat that as an
            # empty list rather than iterating over None.
            statuses = pod.get('status', {}).get('containerStatuses') or []
            for container in statuses:
                container_id = container.get('containerID') or ""
                if container_id.startswith(docker_prefix):
                    cids.add(container_id[len(docker_prefix):])

        return cids

    def get_pod_creator(self, pod_metadata):
        """
        Get the pod's creator from its metadata and returns a
        tuple (creator_kind, creator_name)

        This allows for consistency across code paths.
        Returns (None, None) when the ``kubernetes.io/created-by`` annotation
        is missing or cannot be parsed.
        """
        try:
            created_by = json.loads(pod_metadata['annotations']['kubernetes.io/created-by'])
            creator_kind = created_by.get('reference', {}).get('kind')
            creator_name = created_by.get('reference', {}).get('name')
            return (creator_kind, creator_name)
        except Exception:
            log.debug('Could not parse creator for pod %s', pod_metadata.get('name', ''))
            return (None, None)

    def get_pod_creator_tags(self, pod_metadata, legacy_rep_controller_tag=False):
        """
        Get the pod's creator from its metadata and returns a list of tags
        in the form kube_$kind:$name, ready to add to the metrics

        :param legacy_rep_controller_tag: when True, also emit a
            ``kube_replication_controller`` tag for creators that are not a
            ReplicationController.
        :return: list of tags, empty when the creator cannot be determined.
        """
        try:
            tags = []
            creator_kind, creator_name = self.get_pod_creator(pod_metadata)
            if creator_kind in CREATOR_KIND_TO_TAG and creator_name:
                tags.append("%s:%s" % (CREATOR_KIND_TO_TAG[creator_kind], creator_name))
                if creator_kind == 'ReplicaSet':
                    deployment = self.get_deployment_for_replicaset(creator_name)
                    if deployment:
                        tags.append("%s:%s" % (CREATOR_KIND_TO_TAG['Deployment'], deployment))
            if legacy_rep_controller_tag and creator_kind != 'ReplicationController' and creator_name:
                tags.append('kube_replication_controller:{0}'.format(creator_name))

            return tags
        except Exception:
            # Lazy %-formatting: a missing pod name must not raise a new
            # error from inside this handler.
            log.warning('Could not parse creator tags for pod %s', pod_metadata.get('name'))
            return []

    def process_events(self, event_array, podlist=None):
        """
        Reads a list of kube events, invalidates caches and and computes a set
        of containers impacted by the changes, to refresh service discovery
        Pod creation/deletion events are ignored for now, as docker_daemon already
        sends container creation/deletion events to SD

        Pod->containers matching is done using match_containers_for_pods

        :return: set of container ids, empty when processing fails.
        """
        try:
            pods = set()
            if self._service_mapper:
                pods.update(self._service_mapper.process_events(event_array))
            return self.match_containers_for_pods(pods, podlist)
        except Exception as e:
            log.warning("Error processing events %s: %s" % (str(event_array), e))
            return set()
class DockerDaemon(AgentCheck): """Collect metrics and events from Docker API and cgroups.""" def __init__(self, name, init_config, agentConfig, instances=None): if instances is not None and len(instances) > 1: raise Exception("Docker check only supports one configured instance.") AgentCheck.__init__(self, name, init_config, agentConfig, instances=instances) self.init_success = False self._service_discovery = agentConfig.get('service_discovery') and \ agentConfig.get('service_discovery_backend') == 'docker' global_labels_as_tags = agentConfig.get('docker_labels_as_tags') if global_labels_as_tags: self.collect_labels_as_tags = [label.strip() for label in global_labels_as_tags.split(',')] else: self.collect_labels_as_tags = DEFAULT_LABELS_AS_TAGS self.init() def init(self): try: instance = self.instances[0] # Getting custom tags for service checks when docker is down self.custom_tags = instance.get("tags", []) self.docker_util = DockerUtil() if not self.docker_util.client: raise Exception("Failed to initialize Docker client.") self.docker_gateway = DockerUtil.get_gateway() self.metadata_collector = MetadataCollector() self.kubeutil = None if Platform.is_k8s(): try: self.kubeutil = KubeUtil() except Exception as ex: self.log.error("Couldn't instantiate the kubernetes client, " "subsequent kubernetes calls will fail as well. Error: %s" % str(ex)) # We configure the check with the right cgroup settings for this host # Just needs to be done once self._mountpoints = self.docker_util.get_mountpoints(CGROUP_METRICS) self._latest_size_query = 0 self._filtered_containers = set() self._disable_net_metrics = False # Set tagging options # The collect_labels_as_tags is legacy, only tagging docker metrics. # It is replaced by docker_labels_as_tags in datadog.conf. # We keep this line for backward compatibility. 
if "collect_labels_as_tags" in instance: self.collect_labels_as_tags = instance.get("collect_labels_as_tags") self.kube_pod_tags = {} self.use_histogram = _is_affirmative(instance.get('use_histogram', False)) performance_tags = instance.get("performance_tags", DEFAULT_PERFORMANCE_TAGS) self.tag_names = { CONTAINER: instance.get("container_tags", DEFAULT_CONTAINER_TAGS), PERFORMANCE: performance_tags, IMAGE: instance.get('image_tags', DEFAULT_IMAGE_TAGS) } # Set filtering settings if self.docker_util.filtering_enabled: self.tag_names[FILTERED] = self.docker_util.filtered_tag_names # Container network mapping cache self.network_mappings = {} # get the health check whitelist self.whitelist_patterns = None health_scs_whitelist = instance.get('health_service_check_whitelist', []) if health_scs_whitelist: patterns, whitelist_tags = compile_filter_rules(health_scs_whitelist) self.whitelist_patterns = set(patterns) self.tag_names[HEALTHCHECK] = set(whitelist_tags) # Other options self.collect_image_stats = _is_affirmative(instance.get('collect_images_stats', False)) self.collect_container_size = _is_affirmative(instance.get('collect_container_size', False)) self.collect_container_count = _is_affirmative(instance.get('collect_container_count', False)) self.collect_volume_count = _is_affirmative(instance.get('collect_volume_count', False)) self.collect_events = _is_affirmative(instance.get('collect_events', True)) self.event_attributes_as_tags = instance.get('event_attributes_as_tags', []) self.collect_image_size = _is_affirmative(instance.get('collect_image_size', False)) self.collect_disk_stats = _is_affirmative(instance.get('collect_disk_stats', False)) self.collect_exit_codes = _is_affirmative(instance.get('collect_exit_codes', False)) self.collect_ecs_tags = _is_affirmative(instance.get('ecs_tags', True)) and Platform.is_ecs_instance() self.filtered_event_types = tuple(instance.get("filtered_event_types", DEFAULT_FILTERED_EVENT_TYPES)) self.capped_metrics = 
instance.get('capped_metrics') except Exception as e: self.log.error(e) self.warning("Initialization failed. Will retry at next iteration") else: self.init_success = True def check(self, instance): """Run the Docker check for one instance.""" if not self.init_success: # Initialization can fail if cgroups are not ready or docker daemon is down. So we retry if needed # https://github.com/DataDog/dd-agent/issues/1896 self.init() try: if self.docker_util.client is None: message = "Unable to connect to Docker daemon" self.service_check(SERVICE_CHECK_NAME, AgentCheck.CRITICAL, message=message, tags=self.custom_tags) return except Exception as ex: self.service_check(SERVICE_CHECK_NAME, AgentCheck.CRITICAL, message=str(ex), tags=self.custom_tags) return if not self.init_success: # Initialization failed, will try later return try: # Report image metrics if self.collect_image_stats: self._count_and_weigh_images() if Platform.is_k8s(): self.kube_pod_tags = {} if self.kubeutil: try: self.kube_pod_tags = self.kubeutil.get_kube_pod_tags() except Exception as e: self.log.warning('Could not retrieve kubernetes labels: %s', e) # containers running with custom cgroups? 
custom_cgroups = _is_affirmative(instance.get('custom_cgroups', False)) # Get the list of containers and the index of their names health_service_checks = True if self.whitelist_patterns else False containers_by_id = self._get_and_count_containers(custom_cgroups, health_service_checks) containers_by_id = self._crawl_container_pids(containers_by_id, custom_cgroups) # Send events from Docker API if self.collect_events or self._service_discovery or not self._disable_net_metrics or self.collect_exit_codes: self._process_events(containers_by_id) # Report performance container metrics (cpu, mem, net, io) self._report_performance_metrics(containers_by_id) if self.collect_container_size: self._report_container_size(containers_by_id) if self.collect_container_count: self._report_container_count(containers_by_id) if self.collect_volume_count: self._report_volume_count() # Collect disk stats from Docker info command if self.collect_disk_stats: self._report_disk_stats() if health_service_checks: self._send_container_healthcheck_sc(containers_by_id) except: self.log.exception("Docker_daemon check failed") self.warning("Check failed. Will retry at next iteration") if self.capped_metrics: self.filter_capped_metrics() def _count_and_weigh_images(self): try: tags = self._get_tags() active_images = self.docker_util.client.images(all=False) active_images_len = len(active_images) all_images_len = len(self.docker_util.client.images(quiet=True, all=True)) self.gauge("docker.images.available", active_images_len, tags=tags) self.gauge("docker.images.intermediate", (all_images_len - active_images_len), tags=tags) if self.collect_image_size: self._report_image_size(active_images) except Exception as e: # It's not an important metric, keep going if it fails self.warning("Failed to count Docker images. 
Exception: %s", e) def _get_and_count_containers(self, custom_cgroups=False, healthchecks=False): """List all the containers from the API, filter and count them.""" # Querying the size of containers is slow, we don't do it at each run must_query_size = self.collect_container_size and self._latest_size_query == 0 self._latest_size_query = (self._latest_size_query + 1) % SIZE_REFRESH_RATE running_containers_count = Counter() all_containers_count = Counter() try: containers = self.docker_util.client.containers(all=True, size=must_query_size) except Exception as e: message = "Unable to list Docker containers: {0}".format(e) self.service_check(SERVICE_CHECK_NAME, AgentCheck.CRITICAL, message=message, tags=self.custom_tags) raise Exception(message) else: self.service_check(SERVICE_CHECK_NAME, AgentCheck.OK, tags=self.custom_tags) # Create a set of filtered containers based on the exclude/include rules # and cache these rules in docker_util self._filter_containers(containers) containers_by_id = {} for container in containers: container_name = DockerUtil.container_name_extractor(container)[0] container_status_tags = self._get_tags(container, CONTAINER) all_containers_count[tuple(sorted(container_status_tags))] += 1 if self._is_container_running(container): running_containers_count[tuple(sorted(container_status_tags))] += 1 # Check if the container is included/excluded via its tags if self._is_container_excluded(container): self.log.debug("Container %s is excluded", container_name) continue containers_by_id[container['Id']] = container # grab pid via API if custom cgroups - otherwise we won't find process when # crawling for pids. 
if custom_cgroups or healthchecks: try: inspect_dict = self.docker_util.client.inspect_container(container_name) container['_pid'] = inspect_dict['State']['Pid'] container['health'] = inspect_dict['State'].get('Health', {}) except Exception as e: self.log.debug("Unable to inspect Docker container: %s", e) total_count = 0 # TODO: deprecate these 2, they should be replaced by _report_container_count for tags, count in running_containers_count.iteritems(): total_count += count self.gauge("docker.containers.running", count, tags=list(tags)) self.gauge("docker.containers.running.total", total_count, tags=self.custom_tags) total_count = 0 for tags, count in all_containers_count.iteritems(): stopped_count = count - running_containers_count[tags] total_count += stopped_count self.gauge("docker.containers.stopped", stopped_count, tags=list(tags)) self.gauge("docker.containers.stopped.total", total_count, tags=self.custom_tags) return containers_by_id def _is_container_running(self, container): """Tell if a container is running, according to its status. There is no "nice" API field to figure it out. We just look at the "Status" field, knowing how it is generated. 
See: https://github.com/docker/docker/blob/v1.6.2/daemon/state.go#L35 """ return container["Status"].startswith("Up") or container["Status"].startswith("Restarting") def _get_tags(self, entity=None, tag_type=None): """Generate the tags for a given entity (container or image) according to a list of tag names.""" # Start with custom tags tags = list(self.custom_tags) # Collect pod names as tags on kubernetes if Platform.is_k8s() and KubeUtil.POD_NAME_LABEL not in self.collect_labels_as_tags: self.collect_labels_as_tags.append(KubeUtil.POD_NAME_LABEL) self.collect_labels_as_tags.append(KubeUtil.CONTAINER_NAME_LABEL) # Collect container names as tags on rancher if Platform.is_rancher(): if RANCHER_CONTAINER_NAME not in self.collect_labels_as_tags: self.collect_labels_as_tags.append(RANCHER_CONTAINER_NAME) if RANCHER_SVC_NAME not in self.collect_labels_as_tags: self.collect_labels_as_tags.append(RANCHER_SVC_NAME) if RANCHER_STACK_NAME not in self.collect_labels_as_tags: self.collect_labels_as_tags.append(RANCHER_STACK_NAME) if entity is not None: pod_name = None namespace = None # Get labels as tags labels = entity.get("Labels") if labels is not None: for k in self.collect_labels_as_tags: if k in labels: v = labels[k] if k == KubeUtil.POD_NAME_LABEL and Platform.is_k8s(): pod_name = v k = "pod_name" if "-" in pod_name: replication_controller = "-".join(pod_name.split("-")[:-1]) if "/" in replication_controller: # k8s <= 1.1 namespace, replication_controller = replication_controller.split("/", 1) elif KubeUtil.NAMESPACE_LABEL in labels: # k8s >= 1.2 namespace = labels[KubeUtil.NAMESPACE_LABEL] tags.append("kube_namespace:%s" % namespace) tags.append("kube_replication_controller:%s" % replication_controller) tags.append("pod_name:%s" % pod_name) elif k == KubeUtil.CONTAINER_NAME_LABEL and Platform.is_k8s(): if v: tags.append("kube_container_name:%s" % v) elif k == SWARM_SVC_LABEL and Platform.is_swarm(): if v: tags.append("swarm_service:%s" % v) elif k == 
RANCHER_CONTAINER_NAME and Platform.is_rancher(): if v: tags.append('rancher_container:%s' % v) elif k == RANCHER_SVC_NAME and Platform.is_rancher(): if v: tags.append('rancher_service:%s' % v) elif k == RANCHER_STACK_NAME and Platform.is_rancher(): if v: tags.append('rancher_stack:%s' % v) elif not v: tags.append(k) else: tags.append("%s:%s" % (k, v)) if k == KubeUtil.POD_NAME_LABEL and Platform.is_k8s() and k not in labels: tags.append("pod_name:no_pod") # Get entity specific tags if tag_type is not None: tag_names = self.tag_names[tag_type] for tag_name in tag_names: tag_value = self._extract_tag_value(entity, tag_name) if tag_value is not None: for t in tag_value: tags.append('%s:%s' % (tag_name, str(t).strip())) # Add kube labels and creator/service tags if Platform.is_k8s() and namespace and pod_name: kube_tags = self.kube_pod_tags.get("{0}/{1}".format(namespace, pod_name)) if kube_tags: tags.extend(list(kube_tags)) if self.metadata_collector.has_detected(): orch_tags = self.metadata_collector.get_container_tags(co=entity) tags.extend(orch_tags) return tags def _extract_tag_value(self, entity, tag_name): """Extra tag information from the API result (containers or images). Cache extracted tags inside the entity object. 
""" if tag_name not in TAG_EXTRACTORS: self.warning("%s isn't a supported tag", tag_name) return # Check for already extracted tags if "_tag_values" not in entity: entity["_tag_values"] = {} if tag_name not in entity["_tag_values"]: entity["_tag_values"][tag_name] = TAG_EXTRACTORS[tag_name](entity) return entity["_tag_values"][tag_name] def _filter_containers(self, containers): if not self.docker_util.filtering_enabled: return self._filtered_containers = set() for container in containers: container_tags = self._get_tags(container, FILTERED) # exclude/include patterns are stored in docker_util to share them with other container-related checks if self.docker_util.are_tags_filtered(container_tags): container_name = DockerUtil.container_name_extractor(container)[0] self._filtered_containers.add(container_name) self.log.debug("Container %s is filtered", container_name) def _is_container_excluded(self, container): """Check if a container is excluded according to the filter rules. Requires _filter_containers to run first. 
""" container_name = DockerUtil.container_name_extractor(container)[0] return container_name in self._filtered_containers def _report_container_size(self, containers_by_id): for container in containers_by_id.itervalues(): if self._is_container_excluded(container): continue tags = self._get_tags(container, PERFORMANCE) m_func = FUNC_MAP[GAUGE][self.use_histogram] if "SizeRw" in container: m_func(self, 'docker.container.size_rw', container['SizeRw'], tags=tags) if "SizeRootFs" in container: m_func( self, 'docker.container.size_rootfs', container['SizeRootFs'], tags=tags) def _send_container_healthcheck_sc(self, containers_by_id): """Send health service checks for containers.""" for container in containers_by_id.itervalues(): healthcheck_tags = self._get_tags(container, HEALTHCHECK) match = False for tag in healthcheck_tags: for rule in self.whitelist_patterns: if re.match(rule, tag): match = True self._submit_healthcheck_sc(container) break if match: break def _submit_healthcheck_sc(self, container): health = container.get('health', {}) status = AgentCheck.UNKNOWN if health: _health = health.get('Status', '') if _health == 'unhealthy': status = AgentCheck.CRITICAL elif _health == 'healthy': status = AgentCheck.OK tags = self._get_tags(container, CONTAINER) self.service_check(HEALTHCHECK_SERVICE_CHECK_NAME, status, tags=tags) def _report_container_count(self, containers_by_id): """Report container count per state""" m_func = FUNC_MAP[GAUGE][self.use_histogram] per_state_count = defaultdict(int) filterlambda = lambda ctr: not self._is_container_excluded(ctr) containers = list(filter(filterlambda, containers_by_id.values())) for ctr in containers: per_state_count[ctr.get('State', '')] += 1 for state in per_state_count: if state: m_func(self, 'docker.container.count', per_state_count[state], tags=['container_state:%s' % state.lower()]) def _report_volume_count(self): """Report volume count per state (dangling or not)""" m_func = FUNC_MAP[GAUGE][self.use_histogram] 
attached_volumes = self.docker_util.client.volumes(filters={'dangling': False}) dangling_volumes = self.docker_util.client.volumes(filters={'dangling': True}) attached_count = len(attached_volumes.get('Volumes', []) or []) dangling_count = len(dangling_volumes.get('Volumes', []) or []) m_func(self, 'docker.volume.count', attached_count, tags=['volume_state:attached']) m_func(self, 'docker.volume.count', dangling_count, tags=['volume_state:dangling']) def _report_image_size(self, images): for image in images: tags = self._get_tags(image, IMAGE) if 'VirtualSize' in image: self.gauge('docker.image.virtual_size', image['VirtualSize'], tags=tags) if 'Size' in image: self.gauge('docker.image.size', image['Size'], tags=tags) # Performance metrics def _report_performance_metrics(self, containers_by_id): containers_without_proc_root = [] for container_id, container in containers_by_id.iteritems(): if self._is_container_excluded(container) or not self._is_container_running(container): continue tags = self._get_tags(container, PERFORMANCE) try: self._report_cgroup_metrics(container, tags) if "_proc_root" not in container: containers_without_proc_root.append(DockerUtil.container_name_extractor(container)[0]) continue self._report_net_metrics(container, tags) except BogusPIDException as e: self.log.warning('Unable to report cgroup metrics for container %s: %s', container_id[:12], e) if containers_without_proc_root: message = "Couldn't find pid directory for containers: {0}. They'll be missing network metrics".format( ", ".join(containers_without_proc_root)) if not Platform.is_k8s(): self.warning(message) else: # On kubernetes, this is kind of expected. 
Network metrics will be collected by the kubernetes integration anyway self.log.debug(message) def _report_cgroup_metrics(self, container, tags): cgroup_stat_file_failures = 0 if not container.get('_pid'): raise BogusPIDException('Cannot report on bogus pid(0)') for cgroup in CGROUP_METRICS: try: stat_file = self._get_cgroup_from_proc(cgroup["cgroup"], container['_pid'], cgroup['file']) except MountException as e: # We can't find a stat file self.warning(str(e)) cgroup_stat_file_failures += 1 if cgroup_stat_file_failures >= len(CGROUP_METRICS): self.warning("Couldn't find the cgroup files. Skipping the CGROUP_METRICS for now.") except IOError as e: self.log.debug("Cannot read cgroup file, container likely raced to finish : %s", e) else: stats = self._parse_cgroup_file(stat_file) if stats: for key, (dd_key, metric_func) in cgroup['metrics'].iteritems(): metric_func = FUNC_MAP[metric_func][self.use_histogram] if key in stats: metric_func(self, dd_key, int(stats[key]), tags=tags) # Computed metrics for mname, (key_list, fct, metric_func) in cgroup.get('to_compute', {}).iteritems(): values = [stats[key] for key in key_list if key in stats] if len(values) != len(key_list): self.log.debug("Couldn't compute %s, some keys were missing.", mname) continue value = fct(*values) metric_func = FUNC_MAP[metric_func][self.use_histogram] if value is not None: metric_func(self, mname, value, tags=tags) def _report_net_metrics(self, container, tags): """Find container network metrics by looking at /proc/$PID/net/dev of the container process.""" if self._disable_net_metrics: self.log.debug("Network metrics are disabled. 
Skipping") return proc_net_file = os.path.join(container['_proc_root'], 'net/dev') try: if container['Id'] in self.network_mappings: networks = self.network_mappings[container['Id']] else: networks = self.docker_util.get_container_network_mapping(container) if not networks: networks = {'eth0': 'bridge'} self.network_mappings[container['Id']] = networks except Exception as e: # Revert to previous behaviour if the method is missing or failing # Debug message will only appear once per container, then the cache is used self.log.debug("Failed to build docker network mapping, using failsafe. Exception: %s", e) networks = {'eth0': 'bridge'} self.network_mappings[container['Id']] = networks try: with open(proc_net_file, 'r') as fp: lines = fp.readlines() """Two first lines are headers: Inter-| Receive | Transmit face |bytes packets errs drop fifo frame compressed multicast|bytes packets errs drop fifo colls carrier compressed """ for l in lines[2:]: cols = l.split(':', 1) interface_name = str(cols[0]).strip() if interface_name in networks: net_tags = tags + ['docker_network:'+networks[interface_name]] x = cols[1].split() m_func = FUNC_MAP[RATE][self.use_histogram] m_func(self, "docker.net.bytes_rcvd", long(x[0]), net_tags) m_func(self, "docker.net.bytes_sent", long(x[8]), net_tags) except IOError as e: # It is possible that the container got stopped between the API call and now self.log.debug("Cannot read network interface file, container likely raced to finish : %s", e) def _invalidate_network_mapping_cache(self, api_events): for ev in api_events: try: if ev.get('Type') == 'network' and ev.get('Action').endswith('connect'): container_id = ev.get('Actor').get('Attributes').get('container') if container_id in self.network_mappings: self.log.debug("Removing network mapping cache for container %s", container_id) del self.network_mappings[container_id] except Exception: self.log.warning('Malformed network event: %s', str(ev)) def _process_events(self, containers_by_id): 
api_events = self._get_events() if self.collect_exit_codes: self._report_exit_codes(api_events, containers_by_id) if self.collect_events: try: aggregated_events = self._pre_aggregate_events(api_events, containers_by_id) events = self._format_events(aggregated_events, containers_by_id) except (socket.timeout, urllib2.URLError): self.warning('Timeout when collecting events. Events will be missing.') return except Exception as e: self.warning("Unexpected exception when collecting events: {0}. " "Events will be missing".format(e)) return for ev in events: self.log.debug("Creating event: %s" % ev['msg_title']) self.event(ev) def _get_events(self): """Get the list of events.""" events, changed_container_ids = self.docker_util.get_events() if not self._disable_net_metrics: self._invalidate_network_mapping_cache(events) if changed_container_ids and self._service_discovery: get_sd_backend(self.agentConfig).update_checks(changed_container_ids) if changed_container_ids: self.metadata_collector.invalidate_cache(events) return events def _pre_aggregate_events(self, api_events, containers_by_id): # Aggregate events, one per image. Put newer events first. 
events = defaultdict(deque) for event in api_events: # Skip events related to filtered containers container = containers_by_id.get(event.get('id')) if container is not None and self._is_container_excluded(container): self.log.debug("Excluded event: container {0} status changed to {1}".format( event['id'], event['status'])) continue # from may be missing (for network events for example) if 'from' in event: image_name = event['from'] if image_name.startswith('sha256:'): image_name = self.docker_util.image_name_extractor({'Image': image_name}) events[image_name].appendleft(event) return events def _format_events(self, aggregated_events, containers_by_id): events = [] for image_name, event_group in aggregated_events.iteritems(): container_tags = set() filtered_events_count = 0 normal_prio_events = [] for event in event_group: # Only keep events that are not configured to be filtered out if event['status'].startswith(self.filtered_event_types): filtered_events_count += 1 continue container_name = event['id'][:11] if event['id'] in containers_by_id: cont = containers_by_id[event['id']] container_name = DockerUtil.container_name_extractor(cont)[0] container_tags.update(self._get_tags(cont, PERFORMANCE)) container_tags.add('container_name:%s' % container_name) # Add additional docker event attributes as tag for attr in self.event_attributes_as_tags: if attr in event['Actor']['Attributes'] and attr not in EXCLUDED_ATTRIBUTES: container_tags.add('%s:%s' % (attr, event['Actor']['Attributes'][attr])) normal_prio_events.append((event, container_name)) if filtered_events_count: self.log.debug('%d events were filtered out because of ignored event type', filtered_events_count) normal_event = self._create_dd_event(normal_prio_events, image_name, container_tags, priority='Normal') if normal_event: events.append(normal_event) return events def _report_exit_codes(self, events, containers_by_id): for event in events: container_tags = set() container = 
containers_by_id.get(event.get('id')) # Skip events related to filtered containers if container is not None and self._is_container_excluded(container): continue # Report the exit code in case of a DIE event if container is not None and event['status'] == 'die': container_name = DockerUtil.container_name_extractor(container)[0] container_tags.update(self._get_tags(container, CONTAINER)) container_tags.add('container_name:%s' % container_name) try: exit_code = int(event['Actor']['Attributes']['exitCode']) message = 'Container %s exited with %s' % (container_name, exit_code) status = AgentCheck.OK if exit_code == 0 else AgentCheck.CRITICAL self.service_check(EXIT_SERVICE_CHECK_NAME, status, tags=list(container_tags), message=message) except KeyError: self.log.warning('Unable to collect the exit code for container %s', container_name) def _create_dd_event(self, events, image, c_tags, priority='Normal'): """Create the actual event to submit from a list of similar docker events""" if not events: return max_timestamp = 0 status = defaultdict(int) status_change = [] for ev, c_name in events: max_timestamp = max(max_timestamp, int(ev['time'])) status[ev['status']] += 1 status_change.append([c_name, ev['status']]) status_text = ", ".join(["%d %s" % (count, st) for st, count in status.iteritems()]) msg_title = "%s %s on %s" % (image, status_text, self.hostname) msg_body = ( "%%%\n" "{image_name} {status} on {hostname}\n" "```\n{status_changes}\n```\n" "%%%" ).format( image_name=image, status=status_text, hostname=self.hostname, status_changes="\n".join( ["%s \t%s" % (change[1].upper(), change[0]) for change in status_change]) ) if any(error in status_text for error in ERROR_ALERT_TYPE): alert_type = "error" else: alert_type = None return { 'timestamp': max_timestamp, 'host': self.hostname, 'event_type': EVENT_TYPE, 'msg_title': msg_title, 'msg_text': msg_body, 'source_type_name': EVENT_TYPE, 'event_object': 'docker:%s' % image, 'tags': list(c_tags), 'alert_type': alert_type, 
'priority': priority } def _report_disk_stats(self): """Report metrics about the volume space usage""" stats = { 'docker.data.used': None, 'docker.data.total': None, 'docker.data.free': None, 'docker.metadata.used': None, 'docker.metadata.total': None, 'docker.metadata.free': None # these two are calculated by _calc_percent_disk_stats # 'docker.data.percent': None, # 'docker.metadata.percent': None } info = self.docker_util.client.info() driver_status = info.get('DriverStatus', []) if not driver_status: self.log.warning('Disk metrics collection is enabled but docker info did not' ' report any. Your storage driver might not support them, skipping.') return for metric in driver_status: # only consider metrics about disk space if len(metric) == 2 and 'Space' in metric[0]: # identify Data and Metadata metrics mtype = 'data' if 'Metadata' in metric[0]: mtype = 'metadata' if 'Used' in metric[0]: stats['docker.{0}.used'.format(mtype)] = metric[1] elif 'Space Total' in metric[0]: stats['docker.{0}.total'.format(mtype)] = metric[1] elif 'Space Available' in metric[0]: stats['docker.{0}.free'.format(mtype)] = metric[1] stats = self._format_disk_metrics(stats) stats.update(self._calc_percent_disk_stats(stats)) tags = self._get_tags() for name, val in stats.iteritems(): if val is not None: self.gauge(name, val, tags) def _format_disk_metrics(self, metrics): """Cast the disk stats to float and convert them to bytes""" for name, raw_val in metrics.iteritems(): if raw_val: match = DISK_STATS_RE.search(raw_val) if match is None or len(match.groups()) != 2: self.log.warning('Can\'t parse value %s for disk metric %s. Dropping it.', raw_val, name) metrics[name] = None val, unit = match.groups() # by default some are uppercased others lowercased. That's error prone. unit = unit.lower() try: val = int(float(val) * UNIT_MAP[unit]) metrics[name] = val except KeyError: self.log.error('Unrecognized unit %s for disk metric %s. 
Dropping it.', unit, name) metrics[name] = None return metrics def _calc_percent_disk_stats(self, stats): """Calculate a percentage of used disk space for data and metadata""" mtypes = ['data', 'metadata'] percs = {} for mtype in mtypes: used = stats.get('docker.{0}.used'.format(mtype)) total = stats.get('docker.{0}.total'.format(mtype)) free = stats.get('docker.{0}.free'.format(mtype)) if used and total and free and ceil(total) < free + used: self.log.debug('used, free, and total disk metrics may be wrong, ' 'used: %s, free: %s, total: %s', used, free, total) total = used + free try: if isinstance(used, int): percs['docker.{0}.percent'.format(mtype)] = round(100 * float(used) / float(total), 2) elif isinstance(free, int): percs['docker.{0}.percent'.format(mtype)] = round(100 * (1.0 - (float(free) / float(total))), 2) except ZeroDivisionError: self.log.error('docker.{0}.total is 0, calculating docker.{1}.percent' ' is not possible.'.format(mtype, mtype)) return percs # Cgroups def _get_cgroup_from_proc(self, cgroup, pid, filename): """Find a specific cgroup file, containing metrics to extract.""" params = { "file": filename, } return DockerUtil.find_cgroup_from_proc(self._mountpoints, pid, cgroup, self.docker_util._docker_root) % (params) def _parse_cgroup_file(self, stat_file): """Parse a cgroup pseudo file for key/values.""" self.log.debug("Opening cgroup file: %s", stat_file) try: with open(stat_file, 'r') as fp: if 'blkio' in stat_file: return self._parse_blkio_metrics(fp.read().splitlines()) elif 'cpuacct.usage' in stat_file: return dict({'usage': str(int(fp.read())/10000000)}) elif 'memory.soft_limit_in_bytes' in stat_file: value = int(fp.read()) # do not report kernel max default value (uint64 * 4096) # see https://github.com/torvalds/linux/blob/5b36577109be007a6ecf4b65b54cbc9118463c2b/mm/memcontrol.c#L2844-L2845 # 2 ** 60 is kept for consistency of other cgroups metrics if value < 2 ** 60: return dict({'softlimit': value}) elif 'memory.kmem.usage_in_bytes' 
in stat_file: value = int(fp.read()) if value < 2 ** 60: return dict({'kmemusage': value}) elif 'cpu.shares' in stat_file: value = int(fp.read()) return {'shares': value} else: return dict(map(lambda x: x.split(' ', 1), fp.read().splitlines())) except IOError: # It is possible that the container got stopped between the API call and now. # Some files can also be missing (like cpu.stat) and that's fine. self.log.debug("Can't open %s. Its metrics will be missing.", stat_file) def _parse_blkio_metrics(self, stats): """Parse the blkio metrics.""" metrics = { 'io_read': 0, 'io_write': 0, } for line in stats: if 'Read' in line: metrics['io_read'] += int(line.split()[2]) if 'Write' in line: metrics['io_write'] += int(line.split()[2]) return metrics def _is_container_cgroup(self, line, selinux_policy): if line[1] not in ('cpu,cpuacct', 'cpuacct,cpu', 'cpuacct') or line[2] == '/docker-daemon': return False if 'docker' in line[2]: # general case return True if 'docker' in selinux_policy: # selinux return True if line[2].startswith('/') and re.match(CONTAINER_ID_RE, line[2][1:]): # kubernetes return True if line[2].startswith('/') and re.match(CONTAINER_ID_RE, line[2].split('/')[-1]): # kube 1.6+ qos hierarchy return True return False # proc files def _crawl_container_pids(self, container_dict, custom_cgroups=False): """Crawl `/proc` to find container PIDs and add them to `containers_by_id`.""" proc_path = os.path.join(self.docker_util._docker_root, 'proc') pid_dirs = [_dir for _dir in os.listdir(proc_path) if _dir.isdigit()] if len(pid_dirs) == 0: self.warning("Unable to find any pid directory in {0}. " "If you are running the agent in a container, make sure to " 'share the volume properly: "/proc:/host/proc:ro". ' "See https://github.com/DataDog/docker-dd-agent/blob/master/README.md for more information. 
" "Network metrics will be missing".format(proc_path)) self._disable_net_metrics = True return container_dict self._disable_net_metrics = False for folder in pid_dirs: try: path = os.path.join(proc_path, folder, 'cgroup') with open(path, 'r') as f: content = [line.strip().split(':') for line in f.readlines()] selinux_policy = '' path = os.path.join(proc_path, folder, 'attr', 'current') if os.path.exists(path): with open(path, 'r') as f: selinux_policy = f.readlines()[0] except IOError, e: # Issue #2074 self.log.debug("Cannot read %s, process likely raced to finish : %s", path, e) except Exception as e: self.warning("Cannot read %s : %s", path, e) continue try: for line in content: if self._is_container_cgroup(line, selinux_policy): cpuacct = line[2] break else: continue matches = re.findall(CONTAINER_ID_RE, cpuacct) if matches: container_id = matches[-1] if container_id not in container_dict: self.log.debug( "Container %s not in container_dict, it's likely excluded", container_id ) continue container_dict[container_id]['_pid'] = folder container_dict[container_id]['_proc_root'] = os.path.join(proc_path, folder) elif custom_cgroups: # if we match by pid that should be enough (?) - O(n) ugh! for _, container in container_dict.iteritems(): if container.get('_pid') == int(folder): container['_proc_root'] = os.path.join(proc_path, folder) break except Exception, e: self.warning("Cannot parse %s content: %s", path, e) continue
class KubeUtil: __metaclass__ = Singleton DEFAULT_METHOD = 'http' KUBELET_HEALTH_PATH = '/healthz' MACHINE_INFO_PATH = '/api/v1.3/machine/' METRICS_PATH = '/api/v1.3/subcontainers/' PODS_LIST_PATH = '/pods/' DEFAULT_CADVISOR_PORT = 4194 DEFAULT_HTTP_KUBELET_PORT = 10255 DEFAULT_HTTPS_KUBELET_PORT = 10250 DEFAULT_MASTER_PORT = 443 DEFAULT_MASTER_NAME = 'kubernetes' # DNS name to reach the master from a pod. DEFAULT_LABEL_PREFIX = 'kube_' DEFAULT_COLLECT_SERVICE_TAG = True CA_CRT_PATH = '/var/run/secrets/kubernetes.io/serviceaccount/ca.crt' AUTH_TOKEN_PATH = '/var/run/secrets/kubernetes.io/serviceaccount/token' POD_NAME_LABEL = "io.kubernetes.pod.name" NAMESPACE_LABEL = "io.kubernetes.pod.namespace" CONTAINER_NAME_LABEL = "io.kubernetes.container.name" def __init__(self, **kwargs): self.docker_util = DockerUtil() if 'init_config' in kwargs and 'instance' in kwargs: init_config = kwargs.get('init_config', {}) instance = kwargs.get('instance', {}) else: try: config_file_path = get_conf_path(KUBERNETES_CHECK_NAME) check_config = check_yaml(config_file_path) init_config = check_config['init_config'] or {} instance = check_config['instances'][0] or {} # kubernetes.yaml was not found except IOError as ex: log.error(ex.message) init_config, instance = {}, {} except Exception: log.error('Kubernetes configuration file is invalid. 
' 'Trying connecting to kubelet with default settings anyway...') init_config, instance = {}, {} self.method = instance.get('method', KubeUtil.DEFAULT_METHOD) self._node_ip = self._node_name = None # lazy evaluation self.host_name = os.environ.get('HOSTNAME') self.pod_name = os.environ.get('KUBERNETES_POD_NAME') or self.host_name self.tls_settings = self._init_tls_settings(instance) # apiserver if 'api_server_url' in instance: self.kubernetes_api_root_url = instance.get('api_server_url') else: master_host = os.environ.get('KUBERNETES_SERVICE_HOST') or self.DEFAULT_MASTER_NAME master_port = os.environ.get('KUBERNETES_SERVICE_PORT') or self.DEFAULT_MASTER_PORT self.kubernetes_api_root_url = 'https://%s:%s' % (master_host, master_port) self.kubernetes_api_url = '%s/api/v1' % self.kubernetes_api_root_url # Service mapping helper class self._service_mapper = PodServiceMapper(self) from config import _is_affirmative self.collect_service_tag = _is_affirmative(instance.get('collect_service_tags', KubeUtil.DEFAULT_COLLECT_SERVICE_TAG)) # leader status triggers event collection self.is_leader = False self.leader_elector = None self.leader_lease_duration = instance.get('leader_lease_duration') # kubelet # If kubelet_api_url is None, init_kubelet didn't succeed yet. self.init_success = False self.kubelet_api_url = None self.init_retry_interval = init_config.get('init_retry_interval', DEFAULT_RETRY_INTERVAL) self.last_init_retry = None self.left_init_retries = init_config.get('init_retries', DEFAULT_INIT_RETRIES) + 1 self.init_kubelet(instance) self.kube_label_prefix = instance.get('label_to_tag_prefix', KubeUtil.DEFAULT_LABEL_PREFIX) self.kube_node_labels = instance.get('node_labels_to_host_tags', {}) # keep track of the latest k8s event we collected and posted # default value is 0 but TTL for k8s events is one hour anyways self.last_event_collection_ts = 0 def _init_tls_settings(self, instance): """ Initialize TLS settings for connection to apiserver and kubelet. 
""" tls_settings = {} # apiserver client_crt = instance.get('apiserver_client_crt') client_key = instance.get('apiserver_client_key') apiserver_cacert = instance.get('apiserver_ca_cert') if client_crt and client_key and os.path.exists(client_crt) and os.path.exists(client_key): tls_settings['apiserver_client_cert'] = (client_crt, client_key) if apiserver_cacert and os.path.exists(apiserver_cacert): tls_settings['apiserver_cacert'] = apiserver_cacert # kubelet kubelet_client_crt = instance.get('kubelet_client_crt') kubelet_client_key = instance.get('kubelet_client_key') if kubelet_client_crt and kubelet_client_key and os.path.exists(kubelet_client_crt) and os.path.exists(kubelet_client_key): tls_settings['kubelet_client_cert'] = (kubelet_client_crt, kubelet_client_key) cert = instance.get('kubelet_cert') if cert: tls_settings['kubelet_verify'] = cert else: tls_settings['kubelet_verify'] = instance.get('kubelet_tls_verify', DEFAULT_TLS_VERIFY) if ('apiserver_client_cert' not in tls_settings) or ('kubelet_client_cert' not in tls_settings): # Only lookup token if we don't have client certs for both token = self.get_auth_token(instance) if token: tls_settings['bearer_token'] = token return tls_settings def init_kubelet(self, instance): """ Handles the retry logic around _locate_kubelet. Once _locate_kubelet succeeds, initialize all kubelet-related URLs and settings. """ if self.left_init_retries == 0: raise Exception("Kubernetes client initialization failed permanently. " "Kubernetes-related features will fail.") now = time.time() # last retry was less than retry_interval ago if self.last_init_retry and now <= self.last_init_retry + self.init_retry_interval: return # else it's the first try, or last retry was long enough ago self.last_init_retry = now self.left_init_retries -= 1 try: self.kubelet_api_url = self._locate_kubelet(instance) except Exception as ex: log.error("Failed to initialize kubelet connection. Will retry %s time(s). 
Error: %s" % (self.left_init_retries, str(ex))) return if not self.kubelet_api_url: log.error("Failed to initialize kubelet connection. Will retry %s time(s)." % self.left_init_retries) return self.init_success = True self.kubelet_host = self.kubelet_api_url.split(':')[1].lstrip('/') self.pods_list_url = urljoin(self.kubelet_api_url, KubeUtil.PODS_LIST_PATH) self.kube_health_url = urljoin(self.kubelet_api_url, KubeUtil.KUBELET_HEALTH_PATH) # namespace of the agent pod try: self.self_namespace = self.get_self_namespace() except Exception: log.warning("Failed to get the agent pod namespace, defaulting to default.") self.self_namespace = DEFAULT_NAMESPACE # cadvisor self.cadvisor_port = instance.get('port', KubeUtil.DEFAULT_CADVISOR_PORT) self.cadvisor_url = '%s://%s:%d' % (self.method, self.kubelet_host, self.cadvisor_port) self.metrics_url = urljoin(self.cadvisor_url, KubeUtil.METRICS_PATH) self.machine_info_url = urljoin(self.cadvisor_url, KubeUtil.MACHINE_INFO_PATH) def _locate_kubelet(self, instance): """ Kubelet may or may not accept un-authenticated http requests. If it doesn't we need to use its HTTPS API that may or may not require auth. Returns the kubelet URL or raises. """ host = os.environ.get('KUBERNETES_KUBELET_HOST') or instance.get("host") if not host: # if no hostname was provided, use the docker hostname if cert # validation is not required, the kubernetes hostname otherwise. 
docker_hostname = self.docker_util.get_hostname(should_resolve=True) if self.tls_settings.get('kubelet_verify'): try: k8s_hostname = self.get_node_hostname(docker_hostname) host = k8s_hostname or docker_hostname except Exception as ex: log.error(str(ex)) host = docker_hostname else: host = docker_hostname # check if the no-auth endpoint is enabled port = instance.get('kubelet_port', KubeUtil.DEFAULT_HTTP_KUBELET_PORT) no_auth_url = 'http://%s:%s' % (host, port) test_url = urljoin(no_auth_url, KubeUtil.KUBELET_HEALTH_PATH) try: self.perform_kubelet_query(test_url) return no_auth_url except Exception: log.debug("Couldn't query kubelet over HTTP, assuming it's not in no_auth mode.") port = instance.get('kubelet_port', KubeUtil.DEFAULT_HTTPS_KUBELET_PORT) https_url = 'https://%s:%s' % (host, port) test_url = urljoin(https_url, KubeUtil.KUBELET_HEALTH_PATH) try: self.perform_kubelet_query(test_url) return https_url except Exception as ex: log.warning("Couldn't query kubelet over HTTP, assuming it's not in no_auth mode.") raise ex def get_self_namespace(self): pods = self.retrieve_pods_list() for pod in pods.get('items', []): if pod.get('metadata', {}).get('name') == self.pod_name: return pod['metadata']['namespace'] log.warning("Couldn't find the agent pod and namespace, using the default.") return DEFAULT_NAMESPACE def get_node_hostname(self, host): """ Query the API server for the kubernetes hostname of the node using the docker hostname as a filter. """ node_filter = {'labelSelector': 'kubernetes.io/hostname=%s' % host} node = self.retrieve_json_auth( self.kubernetes_api_url + '/nodes?%s' % urlencode(node_filter) ).json() if len(node['items']) != 1: log.error('Error while getting node hostname: expected 1 node, got %s.' 
% len(node['items'])) else: addresses = (node or {}).get('items', [{}])[0].get('status', {}).get('addresses', []) for address in addresses: if address.get('type') == 'Hostname': return address['address'] return None def get_kube_pod_tags(self, excluded_keys=None): """ Gets pods' labels as tags + creator and service tags. Returns a dict{namespace/podname: [tags]} """ if not self.init_success: log.warning("Kubernetes client is not initialized, can't get pod tags.") return {} pods = self.retrieve_pods_list() return self.extract_kube_pod_tags(pods, excluded_keys=excluded_keys) def extract_kube_pod_tags(self, pods_list, excluded_keys=None, label_prefix=None): """ Extract labels + creator and service tags from a list of pods coming from the kubelet API. :param excluded_keys: labels to skip :param label_prefix: prefix for label->tag conversion, None defaults to the configuration option label_to_tag_prefix Returns a dict{namespace/podname: [tags]} """ excluded_keys = excluded_keys or [] kube_labels = defaultdict(list) pod_items = pods_list.get("items") or [] label_prefix = label_prefix or self.kube_label_prefix for pod in pod_items: metadata = pod.get("metadata", {}) name = metadata.get("name") namespace = metadata.get("namespace") labels = metadata.get("labels", {}) if name and namespace: key = "%s/%s" % (namespace, name) # Extract creator tags podtags = self.get_pod_creator_tags(metadata) # Extract services tags if self.collect_service_tag: for service in self.match_services_for_pod(metadata): if service is not None: podtags.append(u'kube_service:%s' % service) # Extract labels for k, v in labels.iteritems(): if k in excluded_keys: continue podtags.append(u"%s%s:%s" % (label_prefix, k, v)) kube_labels[key] = podtags return kube_labels def retrieve_pods_list(self): """ Retrieve the list of pods for this cluster querying the kubelet API. TODO: the list of pods could be cached with some policy to be decided. 
""" return self.perform_kubelet_query(self.pods_list_url).json() def retrieve_machine_info(self): """ Retrieve machine info from Cadvisor. """ return retrieve_json(self.machine_info_url) def retrieve_metrics(self): """ Retrieve metrics from Cadvisor. """ return retrieve_json(self.metrics_url) def get_deployment_for_replicaset(self, rs_name): """ Get the deployment name for a given replicaset name For now, the rs name's first part always is the deployment's name, see https://github.com/kubernetes/kubernetes/blob/release-1.6/pkg/controller/deployment/sync.go#L299 But it might change in a future k8s version. The other way to match RS and deployments is to parse and cache /apis/extensions/v1beta1/replicasets, mirroring PodServiceMapper In 1.8, the hash generation logic changed: https://github.com/kubernetes/kubernetes/pull/51538/files As none of these naming schemes have guaranteed suffix lenghts, we have to be pretty permissive in what kind of suffix we match. That can lead to false positives, although their impact would be limited (erroneous kube_deployment tag, but the kube_replica_set tag will be present). For example, the hardcoded replicaset name prefix-34 or prefix-cfd will match. For agent6, we plan on doing this pod->replicaset->deployment matching in the cluster agent, with replicaset data from the apiserver. This will address that risk. """ end = rs_name.rfind("-") if end > 0 and rs_name[end + 1:].isdigit(): # k8s before 1.8 return rs_name[0:end] if end > 0 and len(rs_name[end + 1:]) > 2: # k8s 1.8+ maybe? Check contents for char in rs_name[end + 1:]: if char not in ALLOWED_ENCODESTRING_ALPHANUMS: return None return rs_name[0:end] else: return None def perform_kubelet_query(self, url, verbose=True, timeout=10): """ Perform and return a GET request against kubelet. Support auth and TLS validation. 
""" tls_context = self.tls_settings headers = None cert = tls_context.get('kubelet_client_cert') verify = tls_context.get('kubelet_verify', DEFAULT_TLS_VERIFY) # if cert-based auth is enabled, don't use the token. if not cert and url.lower().startswith('https') and 'bearer_token' in self.tls_settings: headers = {'Authorization': 'Bearer {}'.format(self.tls_settings.get('bearer_token'))} return requests.get(url, timeout=timeout, verify=verify, cert=cert, headers=headers, params={'verbose': verbose}) def get_apiserver_auth_settings(self): """ Kubernetes API requires authentication using a token available in every pod, or with a client X509 cert/key pair. We authenticate using the service account token by default and replace this behavior with cert authentication if the user provided a cert/key pair in the instance. We try to verify the server TLS cert if the public cert is available. """ verify = self.tls_settings.get('apiserver_cacert') if not verify: verify = self.CA_CRT_PATH if os.path.exists(self.CA_CRT_PATH) else False log.debug('tls validation: {}'.format(verify)) cert = self.tls_settings.get('apiserver_client_cert') bearer_token = self.tls_settings.get('bearer_token') if not cert else None headers = {'Authorization': 'Bearer {}'.format(bearer_token)} if bearer_token else {} headers['content-type'] = 'application/json' return cert, headers, verify def retrieve_json_auth(self, url, params=None, timeout=3): cert, headers, verify = self.get_apiserver_auth_settings() res = requests.get(url, timeout=timeout, headers=headers, verify=verify, cert=cert, params=params) res.raise_for_status() return res def post_json_to_apiserver(self, url, data, timeout=3): cert, headers, verify = self.get_apiserver_auth_settings() res = requests.post(url, timeout=timeout, headers=headers, verify=verify, cert=cert, data=json.dumps(data)) res.raise_for_status() return res def put_json_to_apiserver(self, url, data, timeout=3): cert, headers, verify = self.get_apiserver_auth_settings() res 
= requests.put(url, timeout=timeout, headers=headers, verify=verify, cert=cert, data=json.dumps(data)) res.raise_for_status() return res def delete_to_apiserver(self, url, timeout=3): cert, headers, verify = self.get_apiserver_auth_settings() res = requests.delete(url, timeout=timeout, headers=headers, verify=verify, cert=cert) res.raise_for_status() return res def get_node_info(self): """ Return the IP address and the hostname of the node where the pod is running. """ if None in (self._node_ip, self._node_name): self._fetch_host_data() return self._node_ip, self._node_name def get_node_metadata(self): """Returns host metadata about the local k8s node""" meta = {} # API server version try: request_url = "%s/version" % self.kubernetes_api_root_url master_info = self.retrieve_json_auth(request_url).json() version = master_info.get("gitVersion") meta['kube_master_version'] = version[1:] except Exception as ex: # Intentional use of non-safe lookups to get the exception in the debug logs # if the parsing were to fail log.debug("Error getting Kube master version: %s" % str(ex)) # Kubelet version & labels if not self.init_success: log.warning("Kubelet client failed to initialize, kubelet host tags will be missing for now.") return meta try: _, node_name = self.get_node_info() if not node_name: raise ValueError("node name missing or empty") request_url = "%s/nodes/%s" % (self.kubernetes_api_url, node_name) node_info = self.retrieve_json_auth(request_url).json() version = node_info.get("status").get("nodeInfo").get("kubeletVersion") meta['kubelet_version'] = version[1:] except Exception as ex: log.debug("Error getting Kubelet version: %s" % str(ex)) return meta def get_node_hosttags(self): """ Returns node labels as tags. Tag name is transformed as defined in node_labels_to_host_tags in the kubernetes check configuration. Note: queries the API server for node info. Configure RBAC accordingly. 
""" tags = [] try: _, node_name = self.get_node_info() if not node_name: raise ValueError("node name missing or empty") request_url = "%s/nodes/%s" % (self.kubernetes_api_url, node_name) node_info = self.retrieve_json_auth(request_url).json() node_labels = node_info.get('metadata', {}).get('labels', {}) for l_name, t_name in self.kube_node_labels.iteritems(): if l_name in node_labels: tags.append('%s:%s' % (t_name, node_labels[l_name])) except Exception as ex: log.debug("Error getting node labels: %s" % str(ex)) return tags def _fetch_host_data(self): """ Retrieve host name and IP address from the payload returned by the listing pods endpoints from kubelet. The host IP address is different from the default router for the pod. """ try: pod_items = self.retrieve_pods_list().get("items") or [] except Exception as e: log.warning("Unable to retrieve pod list %s. Not fetching host data", str(e)) return # Take the first Pod with a status: # all running pods have the adapted '.spec.nodeName' # static pods doesn't have the '.status.hostIP' for pod in pod_items: node_name = pod.get('spec', {}).get('nodeName', '') if not self._node_name and node_name: self._node_name = node_name # hostIP is not fill on static Pods host_ip = pod.get('status', {}).get('hostIP', '') if not self._node_ip and host_ip: self._node_ip = host_ip if self._node_name and self._node_ip: return log.warning("Cannot set both node_name: '%s' and node_ip: '%s' from PodList with %d items", self._node_name, self._node_ip, len(pod_items)) def extract_event_tags(self, event): """ Return a list of tags extracted from an event object """ tags = [] if 'reason' in event: tags.append('reason:%s' % event.get('reason', '').lower()) if 'namespace' in event.get('metadata', {}): tags.append('namespace:%s' % event['metadata']['namespace']) if 'host' in event.get('source', {}): tags.append('node_name:%s' % event['source']['host']) if 'kind' in event.get('involvedObject', {}): tags.append('object_type:%s' % 
event['involvedObject'].get('kind', '').lower()) if 'name' in event.get('involvedObject', {}): tags.append('object_name:%s' % event['involvedObject'].get('name','').lower()) if 'component' in event.get('source', {}): tags.append('source_component:%s' % event['source'].get('component','').lower()) return tags def are_tags_filtered(self, tags): """ Because it is a pain to call it from the kubernetes check otherwise. """ return self.docker_util.are_tags_filtered(tags) @classmethod def get_auth_token(cls, instance): """ Return a string containing the authorization token for the pod. """ token_path = instance.get('bearer_token_path', cls.AUTH_TOKEN_PATH) try: with open(token_path) as f: return f.read().strip() except IOError as e: log.error('Unable to read token from {}: {}'.format(token_path, e)) return None def match_services_for_pod(self, pod_metadata, refresh=False): """ Match the pods labels with services' label selectors to determine the list of services that point to that pod. Returns an array of service names. 
Pass refresh=True if you want to bypass the cached cid->services mapping (after a service change) """ s = self._service_mapper.match_services_for_pod(pod_metadata, refresh, names=True) #log.warning("Matches for %s: %s" % (pod_metadata.get('name'), str(s))) return s def get_event_retriever(self, namespaces=None, kinds=None, delay=None): """ Returns a KubeEventRetriever object ready for action """ return KubeEventRetriever(self, namespaces, kinds, delay) def match_containers_for_pods(self, pod_uids, podlist=None): """ Reads a set of pod uids and returns the set of docker container ids they manage podlist should be a recent self.retrieve_pods_list return value, if not given that method will be called """ cids = set() if not isinstance(pod_uids, set) or len(pod_uids) < 1: return cids if podlist is None: podlist = self.retrieve_pods_list() for pod in podlist.get('items', {}): uid = pod.get('metadata', {}).get('uid', None) if uid in pod_uids: for container in pod.get('status', {}).get('containerStatuses', None): id = container.get('containerID', "") if id.startswith("docker://"): cids.add(id[9:]) return cids def get_pod_creator(self, pod_metadata): """ Get the pod's creator from its metadata and returns a tuple (creator_kind, creator_name) This allows for consitency across code path """ try: owner_references_entry = pod_metadata['ownerReferences'][0] creator_kind = owner_references_entry['kind'] creator_name = owner_references_entry['name'] return creator_kind, creator_name except LookupError as e: try: log.debug('Could not parse creator for pod %s through `OwnerReferences`, falling back to annotation: %s', pod_metadata.get('name', ''), type(e)) created_by = json.loads(pod_metadata['annotations']['kubernetes.io/created-by']) creator_kind = created_by.get('reference', {}).get('kind') creator_name = created_by.get('reference', {}).get('name') return creator_kind, creator_name except Exception as e: log.debug('Could not parse creator for pod %s: %s', 
pod_metadata.get('name', ''), type(e)) return None, None def get_pod_creator_tags(self, pod_metadata, legacy_rep_controller_tag=False): """ Get the pod's creator from its metadata and returns a list of tags in the form kube_$kind:$name, ready to add to the metrics """ try: tags = [] creator_kind, creator_name = self.get_pod_creator(pod_metadata) if creator_kind in CREATOR_KIND_TO_TAG and creator_name: tags.append("%s:%s" % (CREATOR_KIND_TO_TAG[creator_kind], creator_name)) if creator_kind == 'ReplicaSet': deployment = self.get_deployment_for_replicaset(creator_name) if deployment: tags.append("%s:%s" % (CREATOR_KIND_TO_TAG['Deployment'], deployment)) if legacy_rep_controller_tag and creator_kind != 'ReplicationController' and creator_name: tags.append('kube_replication_controller:{0}'.format(creator_name)) return tags except Exception: log.warning('Could not parse creator tags for pod ' + pod_metadata.get('name')) return [] def process_events(self, event_array, podlist=None): """ Reads a list of kube events, invalidates caches and and computes a set of containers impacted by the changes, to refresh service discovery Pod creation/deletion events are ignored for now, as docker_daemon already sends container creation/deletion events to SD Pod->containers matching is done using match_containers_for_pods """ try: pods = set() if self._service_mapper: pods.update(self._service_mapper.process_events(event_array)) return self.match_containers_for_pods(pods, podlist) except Exception as e: log.warning("Error processing events %s: %s" % (str(event_array), e)) return set() def refresh_leader(self): if not self.init_success: log.warning("Kubelet client is not initialized, leader election is disabled.") return if not self.leader_elector: self.leader_elector = LeaderElector(self) self.leader_elector.try_acquire_or_refresh() def image_name_resolver(self, image): """ Wraps around the sibling dockerutil method and catches exceptions """ if image is None: return None try: return 
self.docker_util.image_name_resolver(image) except Exception as e: log.warning("Error resolving image name: %s", str(e)) return image