def init(self):
    """Read the first configured instance and prepare the check's state.

    Builds the Docker (and, on Kubernetes, KubeUtil) helpers, resolves the
    cgroup mountpoints, then loads tagging, filtering and collection options
    from the instance. Any exception is logged and reported as a warning;
    `init_success` is only set to True when every step succeeded, so the
    caller can try again later.
    """
    try:
        cfg = self.instances[0]

        docker_util = DockerUtil()
        self.docker_util = docker_util
        self.docker_client = docker_util.client
        self.docker_gateway = DockerUtil.get_gateway()

        if Platform.is_k8s():
            self.kubeutil = KubeUtil()

        # The cgroup settings for this host only have to be resolved once.
        self._mountpoints = docker_util.get_mountpoints(CGROUP_METRICS)
        self.cgroup_listing_retries = 0
        self._latest_size_query = 0
        self._filtered_containers = set()
        self._disable_net_metrics = False

        # Tagging options
        self.custom_tags = cfg.get("tags", [])
        self.collect_labels_as_tags = cfg.get("collect_labels_as_tags", [])
        self.kube_labels = {}

        self.use_histogram = _is_affirmative(cfg.get('use_histogram', False))
        performance_tags = cfg.get("performance_tags", DEFAULT_PERFORMANCE_TAGS)

        self.tag_names = {
            CONTAINER: cfg.get("container_tags", DEFAULT_CONTAINER_TAGS),
            PERFORMANCE: performance_tags,
            IMAGE: cfg.get('image_tags', DEFAULT_IMAGE_TAGS)
        }

        # Filtering options: an "include" list is only honoured when an
        # "exclude" list is present as well.
        exclude = cfg.get("exclude")
        self._filtering_enabled = bool(exclude)
        if self._filtering_enabled:
            include = cfg.get("include", [])
            filters = get_filters(include, exclude)
            self._exclude_patterns, self._include_patterns, _filtered_tag_names = filters
            self.tag_names[FILTERED] = _filtered_tag_names
        elif cfg.get("include"):
            self.log.warning("You must specify an exclude section to enable filtering")

        # Other options: (attribute name, instance key, default value)
        boolean_options = (
            ('collect_image_stats', 'collect_images_stats', False),
            ('collect_container_size', 'collect_container_size', False),
            ('collect_events', 'collect_events', True),
            ('collect_image_size', 'collect_image_size', False),
            ('collect_disk_stats', 'collect_disk_stats', False),
        )
        for attr_name, key, default in boolean_options:
            setattr(self, attr_name, _is_affirmative(cfg.get(key, default)))

        # The ECS platform probe only runs when the option is enabled.
        ecs_requested = _is_affirmative(cfg.get('ecs_tags', True))
        self.collect_ecs_tags = ecs_requested and Platform.is_ecs_instance()

        self.ecs_tags = {}

    except Exception as e:
        self.log.critical(e)
        self.warning("Initialization failed. Will retry at next iteration")
    else:
        self.init_success = True
def __init__(self, name, init_config, agentConfig, instances=None):
    """Initialise the check and its KubeUtil helper.

    Raises:
        Exception: when more than one instance is configured, or when the
            KubeUtil helper reports no host.
    """
    too_many_instances = instances is not None and len(instances) > 1
    if too_many_instances:
        raise Exception('Kubernetes check only supports one configured instance.')

    AgentCheck.__init__(self, name, init_config, agentConfig, instances)

    self.kubeutil = KubeUtil()
    host = self.kubeutil.host
    if not host:
        raise Exception('Unable to get default router and host parameter is not set')
def __init__(self, name, init_config, agentConfig, instances=None):
    """Initialise the check and build KubeUtil from the configured instance.

    The single configured instance (if any) is handed to KubeUtil; when no
    instance is configured, KubeUtil receives None.

    Raises:
        Exception: when more than one instance is configured, or when the
            KubeUtil helper reports no host.
    """
    if instances is not None and len(instances) > 1:
        raise Exception('Kubernetes check only supports one configured instance.')

    AgentCheck.__init__(self, name, init_config, agentConfig, instances)

    # Truthiness covers both None and an empty list; indexing an empty list
    # would raise IndexError before the clearer error below could be reported.
    inst = instances[0] if instances else None
    self.kubeutil = KubeUtil(instance=inst)
    if not self.kubeutil.host:
        raise Exception('Unable to retrieve Docker hostname and host parameter is not set')
def __init__(self, name, init_config, agentConfig, instances=None):
    """Set up the base check, then attach a KubeUtil helper.

    Raises:
        Exception: if several instances are configured, or if the KubeUtil
            helper has no host.
    """
    if instances is not None:
        if len(instances) > 1:
            raise Exception('Kubernetes check only supports one configured instance.')

    AgentCheck.__init__(self, name, init_config, agentConfig, instances)
    self.kubeutil = KubeUtil()

    if self.kubeutil.host:
        return
    raise Exception('Unable to get default router and host parameter is not set')
def __init__(self, agentConfig):
    """Prepare the Docker client, config store and variable mapping.

    A KubeUtil helper is attached only when `is_k8s()` is true. If the config
    store cannot be instantiated, the error is logged,
    `agentConfig['sd_config_backend']` is set to None (the caller's dict is
    modified) and the store is requested again. `VAR_MAPPING` maps 'host',
    'port' and 'tags' to the matching `_get_*` bound methods.
    """
    self.docker_client = DockerUtil().client
    if is_k8s():
        self.kubeutil = KubeUtil()

    try:
        store = get_config_store(agentConfig=agentConfig)
    except Exception as e:
        message = ('Failed to instantiate the config store client. '
                   'Auto-config only will be used. %s' % str(e))
        log.error(message)
        # Retry with the backend switched off in the shared configuration.
        agentConfig['sd_config_backend'] = None
        store = get_config_store(agentConfig=agentConfig)
    self.config_store = store

    self.VAR_MAPPING = dict(
        host=self._get_host,
        port=self._get_ports,
        tags=self._get_additional_tags,
    )

    AbstractSDBackend.__init__(self, agentConfig)
def test_extract_event_tags(self):
    """Check the tag names extracted from every event in events.json."""
    raw_events = Fixtures.read_file("events.json", string_escape=False)
    required_names = ('reason', 'namespace', 'object_type')

    for event in json.loads(raw_events)['items']:
        tags = KubeUtil().extract_event_tags(event)

        # Four tags are expected, except for events without source.host.
        self.assertTrue(3 <= len(tags))

        names = [tag.split(':')[0] for tag in tags]
        for required in required_names:
            self.assertIn(required, names)
        if len(tags) == 4:
            self.assertIn('node_name', names)
def setUp(self):
    """Instantiate KubeUtil and store it on the test case as `self.kubeutil`."""
    self.kubeutil = KubeUtil()
class TestKubeutil(unittest.TestCase):
    """Tests for the KubeUtil helpers, driven by JSON fixtures and mocks."""

    def setUp(self):
        """Instantiate the KubeUtil object used by every test."""
        self.kubeutil = KubeUtil()

    @staticmethod
    def _load_fixture(name):
        """Read the fixture file `name` and return its decoded JSON content."""
        return json.loads(Fixtures.read_file(name, string_escape=False))

    @staticmethod
    def _distinct_labels(res):
        """Return the set of distinct labels found across all values of `res`."""
        return set(inn for out in res.values() for inn in out)

    @mock.patch('utils.kubeutil.KubeUtil.retrieve_pods_list', side_effect=['foo'])
    @mock.patch('utils.kubeutil.KubeUtil.extract_kube_labels')
    def test_get_kube_labels(self, extract_kube_labels, retrieve_pods_list):
        """get_kube_labels must fetch the pod list once and pass it on."""
        self.kubeutil.get_kube_labels(excluded_keys='bar')
        # call_count is compared explicitly: Mock.assert_called_once() is
        # missing from older mock releases, where the call can pass silently.
        self.assertEqual(retrieve_pods_list.call_count, 1)
        extract_kube_labels.assert_called_once_with('foo', excluded_keys='bar')

    def test_extract_kube_labels(self):
        """Test with both 1.1 and 1.2 version payloads."""
        res = self.kubeutil.extract_kube_labels({}, ['foo'])
        self.assertEqual(len(res), 0)

        pods = self._load_fixture("pods_list_1.1.json")
        res = self.kubeutil.extract_kube_labels(pods, ['foo'])
        self.assertEqual(len(self._distinct_labels(res)), 8)
        res = self.kubeutil.extract_kube_labels(pods, ['k8s-app'])
        self.assertEqual(len(self._distinct_labels(res)), 6)

        pods = self._load_fixture("pods_list_1.2.json")
        res = self.kubeutil.extract_kube_labels(pods, ['foo'])
        self.assertEqual(len(self._distinct_labels(res)), 3)
        res = self.kubeutil.extract_kube_labels(pods, ['k8s-app'])
        self.assertEqual(len(self._distinct_labels(res)), 3)

    def test_extract_meta(self):
        """Test with both 1.1 and 1.2 version payloads."""
        res = self.kubeutil.extract_meta({}, 'foo')
        self.assertEqual(len(res), 0)

        pods = self._load_fixture("pods_list_1.1.json")
        res = self.kubeutil.extract_meta(pods, 'foo')
        self.assertEqual(len(res), 0)
        res = self.kubeutil.extract_meta(pods, 'uid')
        self.assertEqual(len(res), 6)

        pods = self._load_fixture("pods_list_1.2.json")
        res = self.kubeutil.extract_meta(pods, 'foo')
        self.assertEqual(len(res), 0)
        res = self.kubeutil.extract_meta(pods, 'uid')
        self.assertEqual(len(res), 4)

    @mock.patch('utils.kubeutil.retrieve_json')
    def test_retrieve_pods_list(self, retrieve_json):
        """retrieve_pods_list must query the pods list URL exactly once."""
        self.kubeutil.retrieve_pods_list()
        retrieve_json.assert_called_once_with(self.kubeutil.pods_list_url)

    @mock.patch('utils.kubeutil.retrieve_json')
    def test_retrieve_metrics(self, retrieve_json):
        """retrieve_metrics must query the metrics URL exactly once."""
        self.kubeutil.retrieve_metrics()
        retrieve_json.assert_called_once_with(self.kubeutil.metrics_url)

    def test_filter_pods_list(self):
        """Test with both 1.1 and 1.2 version payloads."""
        res = self.kubeutil.filter_pods_list({}, 'foo')
        self.assertEqual(len(res.get('items')), 0)

        # The fixture is reloaded before every call, as in the original test.
        pods = self._load_fixture("pods_list_1.1.json")
        res = self.kubeutil.filter_pods_list(pods, '10.240.0.9')
        self.assertEqual(len(res.get('items')), 5)

        pods = self._load_fixture("pods_list_1.1.json")
        res = self.kubeutil.filter_pods_list(pods, 'foo')
        self.assertEqual(len(res.get('items')), 0)

        pods = self._load_fixture("pods_list_1.2.json")
        res = self.kubeutil.filter_pods_list(pods, '10.240.0.5')
        self.assertEqual(len(res.get('items')), 1)

        pods = self._load_fixture("pods_list_1.2.json")
        res = self.kubeutil.filter_pods_list(pods, 'foo')
        self.assertEqual(len(res.get('items')), 0)

    @mock.patch('utils.kubeutil.requests')
    def test_retrieve_json_auth(self, r):
        """The `verify` argument must follow CA_CRT_PATH once it is set."""
        self.kubeutil.retrieve_json_auth('url', 'foo_tok')
        r.get.assert_called_once_with(
            'url', verify=False, timeout=10,
            headers={'Authorization': 'Bearer foo_tok'})

        self.kubeutil.CA_CRT_PATH = __file__
        self.kubeutil.retrieve_json_auth('url', 'foo_tok')
        r.get.assert_called_with(
            'url', verify=__file__, timeout=10,
            headers={'Authorization': 'Bearer foo_tok'})

    def test_get_node_info(self):
        """Host data is fetched on first use and not again once it is set."""
        with mock.patch('utils.kubeutil.KubeUtil._fetch_host_data') as f:
            self.kubeutil.get_node_info()
            # Explicit counts instead of assert_called_once() and
            # assert_not_called(), which older mock releases do not provide.
            self.assertEqual(f.call_count, 1)

            f.reset_mock()

            self.kubeutil._node_ip = 'foo'
            self.kubeutil._node_name = 'bar'
            ip, name = self.kubeutil.get_node_info()
            self.assertEqual(ip, 'foo')
            self.assertEqual(name, 'bar')
            self.assertEqual(f.call_count, 0)

    def test__fetch_host_data(self):
        """Test with both 1.1 and 1.2 version payloads."""
        with mock.patch('utils.kubeutil.KubeUtil.retrieve_pods_list') as mock_pods:
            self.kubeutil.host_name = 'dd-agent-1rxlh'
            mock_pods.return_value = self._load_fixture("pods_list_1.2.json")
            self.kubeutil._fetch_host_data()
            self.assertEqual(self.kubeutil._node_ip, '10.240.0.9')
            self.assertEqual(self.kubeutil._node_name, 'kubernetes-massi-minion-k23m')

            self.kubeutil.host_name = 'heapster-v11-l8sh1'
            mock_pods.return_value = self._load_fixture("pods_list_1.1.json")
            self.kubeutil._fetch_host_data()
            self.assertEqual(self.kubeutil._node_ip, '10.240.0.9')
            self.assertEqual(self.kubeutil._node_name, 'gke-cluster-1-8046fdfa-node-ld35')

    def test_get_auth_token(self):
        """get_auth_token returns None for a missing file, a value otherwise."""
        # patch.object puts the class attribute back when the block exits, so
        # the temporary paths cannot leak into other tests.
        with mock.patch.object(KubeUtil, 'AUTH_TOKEN_PATH', '/foo/bar', create=True):
            self.assertIsNone(KubeUtil.get_auth_token())

        # any file could do the trick
        token_file = Fixtures.file('events.json')
        with mock.patch.object(KubeUtil, 'AUTH_TOKEN_PATH', token_file, create=True):
            self.assertIsNotNone(KubeUtil.get_auth_token())

    def test_is_k8s(self):
        """Platform.is_k8s must follow the KUBERNETES_PORT environment variable."""
        # patch.dict snapshots os.environ and restores it on exit. The variable
        # is removed through os.environ because os.unsetenv() does not update
        # the os.environ mapping.
        with mock.patch.dict(os.environ):
            os.environ.pop('KUBERNETES_PORT', None)
            self.assertFalse(Platform.is_k8s())
            os.environ['KUBERNETES_PORT'] = '999'
            self.assertTrue(Platform.is_k8s())

    def test_extract_event_tags(self):
        """Check the tag names extracted from every event in events.json."""
        events = self._load_fixture("events.json")['items']
        for ev in events:
            tags = KubeUtil().extract_event_tags(ev)
            # there should be 4 tags except for some events where source.host is missing
            self.assertGreaterEqual(len(tags), 3)

            tag_names = [tag.split(':')[0] for tag in tags]
            self.assertIn('reason', tag_names)
            self.assertIn('namespace', tag_names)
            self.assertIn('object_type', tag_names)
            if len(tags) == 4:
                self.assertIn('node_name', tag_names)
class TestKubeutil(unittest.TestCase):
    """Tests for the KubeUtil helpers, driven by JSON fixtures and mocks."""

    def setUp(self):
        """Instantiate the KubeUtil object used by every test."""
        self.kubeutil = KubeUtil()

    @staticmethod
    def _load_fixture(name):
        """Read the fixture file `name` and return its decoded JSON content."""
        return json.loads(Fixtures.read_file(name, string_escape=False))

    @staticmethod
    def _distinct_labels(res):
        """Return the set of distinct labels found across all values of `res`."""
        return set(inn for out in res.values() for inn in out)

    @mock.patch('utils.kubeutil.KubeUtil.retrieve_pods_list', side_effect=['foo'])
    @mock.patch('utils.kubeutil.KubeUtil.extract_kube_labels')
    def test_get_kube_labels(self, extract_kube_labels, retrieve_pods_list):
        """get_kube_labels must fetch the pod list once and pass it on."""
        self.kubeutil.get_kube_labels(excluded_keys='bar')
        # call_count is compared explicitly: Mock.assert_called_once() is
        # missing from older mock releases, where the call can pass silently.
        self.assertEqual(retrieve_pods_list.call_count, 1)
        extract_kube_labels.assert_called_once_with('foo', excluded_keys='bar')

    def test_extract_kube_labels(self):
        """Test with both 1.1 and 1.2 version payloads."""
        res = self.kubeutil.extract_kube_labels({}, ['foo'])
        self.assertEqual(len(res), 0)

        pods = self._load_fixture("pods_list_1.1.json")
        res = self.kubeutil.extract_kube_labels(pods, ['foo'])
        self.assertEqual(len(self._distinct_labels(res)), 8)
        res = self.kubeutil.extract_kube_labels(pods, ['k8s-app'])
        self.assertEqual(len(self._distinct_labels(res)), 6)

        pods = self._load_fixture("pods_list_1.2.json")
        res = self.kubeutil.extract_kube_labels(pods, ['foo'])
        self.assertEqual(len(self._distinct_labels(res)), 3)
        res = self.kubeutil.extract_kube_labels(pods, ['k8s-app'])
        self.assertEqual(len(self._distinct_labels(res)), 3)

    def test_extract_meta(self):
        """Test with both 1.1 and 1.2 version payloads."""
        res = self.kubeutil.extract_meta({}, 'foo')
        self.assertEqual(len(res), 0)

        pods = self._load_fixture("pods_list_1.1.json")
        res = self.kubeutil.extract_meta(pods, 'foo')
        self.assertEqual(len(res), 0)
        res = self.kubeutil.extract_meta(pods, 'uid')
        self.assertEqual(len(res), 6)

        pods = self._load_fixture("pods_list_1.2.json")
        res = self.kubeutil.extract_meta(pods, 'foo')
        self.assertEqual(len(res), 0)
        res = self.kubeutil.extract_meta(pods, 'uid')
        self.assertEqual(len(res), 4)

    @mock.patch('utils.kubeutil.retrieve_json')
    def test_retrieve_pods_list(self, retrieve_json):
        """retrieve_pods_list must query the pods list URL exactly once."""
        self.kubeutil.retrieve_pods_list()
        retrieve_json.assert_called_once_with(self.kubeutil.pods_list_url)

    @mock.patch('utils.kubeutil.retrieve_json')
    def test_retrieve_metrics(self, retrieve_json):
        """retrieve_metrics must query the metrics URL exactly once."""
        self.kubeutil.retrieve_metrics()
        retrieve_json.assert_called_once_with(self.kubeutil.metrics_url)

    def test_filter_pods_list(self):
        """Test with both 1.1 and 1.2 version payloads."""
        res = self.kubeutil.filter_pods_list({}, 'foo')
        self.assertEqual(len(res.get('items')), 0)

        # The fixture is reloaded before every call, as in the original test.
        pods = self._load_fixture("pods_list_1.1.json")
        res = self.kubeutil.filter_pods_list(pods, '10.240.0.9')
        self.assertEqual(len(res.get('items')), 5)

        pods = self._load_fixture("pods_list_1.1.json")
        res = self.kubeutil.filter_pods_list(pods, 'foo')
        self.assertEqual(len(res.get('items')), 0)

        pods = self._load_fixture("pods_list_1.2.json")
        res = self.kubeutil.filter_pods_list(pods, '10.240.0.5')
        self.assertEqual(len(res.get('items')), 1)

        pods = self._load_fixture("pods_list_1.2.json")
        res = self.kubeutil.filter_pods_list(pods, 'foo')
        self.assertEqual(len(res.get('items')), 0)

    @mock.patch('utils.kubeutil.requests')
    def test_retrieve_json_auth(self, r):
        """The `verify` argument must follow CA_CRT_PATH once it is set."""
        self.kubeutil.retrieve_json_auth('url', 'foo_tok')
        r.get.assert_called_once_with(
            'url', verify=False, timeout=10,
            headers={'Authorization': 'Bearer foo_tok'})

        self.kubeutil.CA_CRT_PATH = __file__
        self.kubeutil.retrieve_json_auth('url', 'foo_tok')
        r.get.assert_called_with(
            'url', verify=__file__, timeout=10,
            headers={'Authorization': 'Bearer foo_tok'})

    def test_get_node_info(self):
        """Host data is fetched on first use and not again once it is set."""
        with mock.patch('utils.kubeutil.KubeUtil._fetch_host_data') as f:
            self.kubeutil.get_node_info()
            # Explicit counts instead of assert_called_once() and
            # assert_not_called(), which older mock releases do not provide.
            self.assertEqual(f.call_count, 1)

            f.reset_mock()

            self.kubeutil._node_ip = 'foo'
            self.kubeutil._node_name = 'bar'
            ip, name = self.kubeutil.get_node_info()
            self.assertEqual(ip, 'foo')
            self.assertEqual(name, 'bar')
            self.assertEqual(f.call_count, 0)

    def test__fetch_host_data(self):
        """Test with both 1.1 and 1.2 version payloads."""
        with mock.patch('utils.kubeutil.KubeUtil.retrieve_pods_list') as mock_pods:
            self.kubeutil.host_name = 'dd-agent-1rxlh'
            mock_pods.return_value = self._load_fixture("pods_list_1.2.json")
            self.kubeutil._fetch_host_data()
            self.assertEqual(self.kubeutil._node_ip, '10.240.0.9')
            self.assertEqual(self.kubeutil._node_name, 'kubernetes-massi-minion-k23m')

            self.kubeutil.host_name = 'heapster-v11-l8sh1'
            mock_pods.return_value = self._load_fixture("pods_list_1.1.json")
            self.kubeutil._fetch_host_data()
            self.assertEqual(self.kubeutil._node_ip, '10.240.0.9')
            self.assertEqual(self.kubeutil._node_name, 'gke-cluster-1-8046fdfa-node-ld35')

    def test_get_auth_token(self):
        """get_auth_token returns None for a missing file, a value otherwise."""
        # patch.object puts the class attribute back when the block exits, so
        # the temporary paths cannot leak into other tests.
        with mock.patch.object(KubeUtil, 'AUTH_TOKEN_PATH', '/foo/bar', create=True):
            self.assertIsNone(KubeUtil.get_auth_token())

        # any file could do the trick
        token_file = Fixtures.file('events.json')
        with mock.patch.object(KubeUtil, 'AUTH_TOKEN_PATH', token_file, create=True):
            self.assertIsNotNone(KubeUtil.get_auth_token())

    def test_is_k8s(self):
        """Platform.is_k8s must follow the KUBERNETES_PORT environment variable."""
        # patch.dict snapshots os.environ and restores it on exit. The variable
        # is removed through os.environ because os.unsetenv() does not update
        # the os.environ mapping.
        with mock.patch.dict(os.environ):
            os.environ.pop('KUBERNETES_PORT', None)
            self.assertFalse(Platform.is_k8s())
            os.environ['KUBERNETES_PORT'] = '999'
            self.assertTrue(Platform.is_k8s())
class DockerDaemon(AgentCheck): """Collect metrics and events from Docker API and cgroups.""" def __init__(self, name, init_config, agentConfig, instances=None): if instances is not None and len(instances) > 1: raise Exception("Docker check only supports one configured instance.") AgentCheck.__init__(self, name, init_config, agentConfig, instances=instances) self.init_success = False self._service_discovery = agentConfig.get('service_discovery') and \ agentConfig.get('service_discovery_backend') == 'docker' self.init() def init(self): try: instance = self.instances[0] self.docker_util = DockerUtil() self.docker_client = self.docker_util.client self.docker_gateway = DockerUtil.get_gateway() if Platform.is_k8s(): self.kubeutil = KubeUtil() # We configure the check with the right cgroup settings for this host # Just needs to be done once self._mountpoints = self.docker_util.get_mountpoints(CGROUP_METRICS) self.cgroup_listing_retries = 0 self._latest_size_query = 0 self._filtered_containers = set() self._disable_net_metrics = False # Set tagging options self.custom_tags = instance.get("tags", []) self.collect_labels_as_tags = instance.get("collect_labels_as_tags", []) self.kube_labels = {} self.use_histogram = _is_affirmative(instance.get('use_histogram', False)) performance_tags = instance.get("performance_tags", DEFAULT_PERFORMANCE_TAGS) self.tag_names = { CONTAINER: instance.get("container_tags", DEFAULT_CONTAINER_TAGS), PERFORMANCE: performance_tags, IMAGE: instance.get('image_tags', DEFAULT_IMAGE_TAGS) } # Set filtering settings if not instance.get("exclude"): self._filtering_enabled = False if instance.get("include"): self.log.warning("You must specify an exclude section to enable filtering") else: self._filtering_enabled = True include = instance.get("include", []) exclude = instance.get("exclude", []) self._exclude_patterns, self._include_patterns, _filtered_tag_names = get_filters(include, exclude) self.tag_names[FILTERED] = _filtered_tag_names # Other options 
self.collect_image_stats = _is_affirmative(instance.get('collect_images_stats', False)) self.collect_container_size = _is_affirmative(instance.get('collect_container_size', False)) self.collect_events = _is_affirmative(instance.get('collect_events', True)) self.collect_image_size = _is_affirmative(instance.get('collect_image_size', False)) self.collect_disk_stats = _is_affirmative(instance.get('collect_disk_stats', False)) self.collect_ecs_tags = _is_affirmative(instance.get('ecs_tags', True)) and Platform.is_ecs_instance() self.ecs_tags = {} except Exception as e: self.log.critical(e) self.warning("Initialization failed. Will retry at next iteration") else: self.init_success = True def check(self, instance): """Run the Docker check for one instance.""" if not self.init_success: # Initialization can fail if cgroups are not ready. So we retry if needed # https://github.com/DataDog/dd-agent/issues/1896 self.init() if not self.init_success: # Initialization failed, will try later return # Report image metrics if self.collect_image_stats: self._count_and_weigh_images() if self.collect_ecs_tags: self.refresh_ecs_tags() if Platform.is_k8s(): try: self.kube_labels = self.kubeutil.get_kube_labels() except Exception as e: self.log.warning('Could not retrieve kubernetes labels: %s' % str(e)) self.kube_labels = {} # containers running with custom cgroups? 
custom_cgroups = _is_affirmative(instance.get('custom_cgroups', False)) # Get the list of containers and the index of their names containers_by_id = self._get_and_count_containers(custom_cgroups) containers_by_id = self._crawl_container_pids(containers_by_id, custom_cgroups) # Send events from Docker API if self.collect_events or self._service_discovery: self._process_events(containers_by_id) # Report performance container metrics (cpu, mem, net, io) self._report_performance_metrics(containers_by_id) if self.collect_container_size: self._report_container_size(containers_by_id) # Collect disk stats from Docker info command if self.collect_disk_stats: self._report_disk_stats() def _count_and_weigh_images(self): try: tags = self._get_tags() active_images = self.docker_client.images(all=False) active_images_len = len(active_images) all_images_len = len(self.docker_client.images(quiet=True, all=True)) self.gauge("docker.images.available", active_images_len, tags=tags) self.gauge("docker.images.intermediate", (all_images_len - active_images_len), tags=tags) if self.collect_image_size: self._report_image_size(active_images) except Exception as e: # It's not an important metric, keep going if it fails self.warning("Failed to count Docker images. 
Exception: {0}".format(e)) def _get_and_count_containers(self, custom_cgroups=False): """List all the containers from the API, filter and count them.""" # Querying the size of containers is slow, we don't do it at each run must_query_size = self.collect_container_size and self._latest_size_query == 0 self._latest_size_query = (self._latest_size_query + 1) % SIZE_REFRESH_RATE running_containers_count = Counter() all_containers_count = Counter() try: containers = self.docker_client.containers(all=True, size=must_query_size) except Exception as e: message = "Unable to list Docker containers: {0}".format(e) self.service_check(SERVICE_CHECK_NAME, AgentCheck.CRITICAL, message=message) raise Exception(message) else: self.service_check(SERVICE_CHECK_NAME, AgentCheck.OK) # Filter containers according to the exclude/include rules self._filter_containers(containers) containers_by_id = {} for container in containers: container_name = DockerUtil.container_name_extractor(container)[0] container_status_tags = self._get_tags(container, CONTAINER) all_containers_count[tuple(sorted(container_status_tags))] += 1 if self._is_container_running(container): running_containers_count[tuple(sorted(container_status_tags))] += 1 # Check if the container is included/excluded via its tags if self._is_container_excluded(container): self.log.debug("Container {0} is excluded".format(container_name)) continue containers_by_id[container['Id']] = container # grab pid via API if custom cgroups - otherwise we won't find process when # crawling for pids. 
if custom_cgroups: try: inspect_dict = self.docker_client.inspect_container(container_name) container['_pid'] = inspect_dict['State']['Pid'] except Exception as e: self.log.debug("Unable to inspect Docker container: %s", e) for tags, count in running_containers_count.iteritems(): self.gauge("docker.containers.running", count, tags=list(tags)) for tags, count in all_containers_count.iteritems(): stopped_count = count - running_containers_count[tags] self.gauge("docker.containers.stopped", stopped_count, tags=list(tags)) return containers_by_id def _is_container_running(self, container): """Tell if a container is running, according to its status. There is no "nice" API field to figure it out. We just look at the "Status" field, knowing how it is generated. See: https://github.com/docker/docker/blob/v1.6.2/daemon/state.go#L35 """ return container["Status"].startswith("Up") or container["Status"].startswith("Restarting") def _get_tags(self, entity=None, tag_type=None): """Generate the tags for a given entity (container or image) according to a list of tag names.""" # Start with custom tags tags = list(self.custom_tags) # Collect pod names as tags on kubernetes if Platform.is_k8s() and KubeUtil.POD_NAME_LABEL not in self.collect_labels_as_tags: self.collect_labels_as_tags.append(KubeUtil.POD_NAME_LABEL) if entity is not None: pod_name = None # Get labels as tags labels = entity.get("Labels") if labels is not None: for k in self.collect_labels_as_tags: if k in labels: v = labels[k] if k == KubeUtil.POD_NAME_LABEL and Platform.is_k8s(): pod_name = v k = "pod_name" if "-" in pod_name: replication_controller = "-".join(pod_name.split("-")[:-1]) if "/" in replication_controller: # k8s <= 1.1 namespace, replication_controller = replication_controller.split("/", 1) elif KubeUtil.NAMESPACE_LABEL in labels: # k8s >= 1.2 namespace = labels[KubeUtil.NAMESPACE_LABEL] pod_name = "{0}/{1}".format(namespace, pod_name) tags.append("kube_namespace:%s" % namespace) 
tags.append("kube_replication_controller:%s" % replication_controller) tags.append("pod_name:%s" % pod_name) elif not v: tags.append(k) else: tags.append("%s:%s" % (k,v)) if k == KubeUtil.POD_NAME_LABEL and Platform.is_k8s() and k not in labels: tags.append("pod_name:no_pod") # Get entity specific tags if tag_type is not None: tag_names = self.tag_names[tag_type] for tag_name in tag_names: tag_value = self._extract_tag_value(entity, tag_name) if tag_value is not None: for t in tag_value: tags.append('%s:%s' % (tag_name, str(t).strip())) # Add ECS tags if self.collect_ecs_tags: entity_id = entity.get("Id") if entity_id in self.ecs_tags: ecs_tags = self.ecs_tags[entity_id] tags.extend(ecs_tags) # Add kube labels if Platform.is_k8s(): kube_tags = self.kube_labels.get(pod_name) if kube_tags: tags.extend(list(kube_tags)) return tags def _extract_tag_value(self, entity, tag_name): """Extra tag information from the API result (containers or images). Cache extracted tags inside the entity object. 
""" if tag_name not in TAG_EXTRACTORS: self.warning("{0} isn't a supported tag".format(tag_name)) return # Check for already extracted tags if "_tag_values" not in entity: entity["_tag_values"] = {} if tag_name not in entity["_tag_values"]: entity["_tag_values"][tag_name] = TAG_EXTRACTORS[tag_name](entity) return entity["_tag_values"][tag_name] def refresh_ecs_tags(self): ecs_config = self.docker_client.inspect_container('ecs-agent') ip = ecs_config.get('NetworkSettings', {}).get('IPAddress') ports = ecs_config.get('NetworkSettings', {}).get('Ports') port = ports.keys()[0].split('/')[0] if ports else None if not ip: port = ECS_INTROSPECT_DEFAULT_PORT if Platform.is_containerized() and self.docker_gateway: ip = self.docker_gateway else: ip = "localhost" ecs_tags = {} try: if ip and port: tasks = requests.get('http://%s:%s/v1/tasks' % (ip, port)).json() for task in tasks.get('Tasks', []): for container in task.get('Containers', []): tags = ['task_name:%s' % task['Family'], 'task_version:%s' % task['Version']] ecs_tags[container['DockerId']] = tags except (requests.exceptions.HTTPError, requests.exceptions.HTTPError) as e: self.log.warning("Unable to collect ECS task names: %s" % e) self.ecs_tags = ecs_tags def _filter_containers(self, containers): if not self._filtering_enabled: return self._filtered_containers = set() for container in containers: container_tags = self._get_tags(container, FILTERED) if self._are_tags_filtered(container_tags): container_name = DockerUtil.container_name_extractor(container)[0] self._filtered_containers.add(container_name) self.log.debug("Container {0} is filtered".format(container_name)) def _are_tags_filtered(self, tags): if self._tags_match_patterns(tags, self._exclude_patterns): if self._tags_match_patterns(tags, self._include_patterns): return False return True return False def _tags_match_patterns(self, tags, filters): for rule in filters: for tag in tags: if re.match(rule, tag): return True return False def 
_is_container_excluded(self, container): """Check if a container is excluded according to the filter rules. Requires _filter_containers to run first. """ container_name = DockerUtil.container_name_extractor(container)[0] return container_name in self._filtered_containers def _report_container_size(self, containers_by_id): for container in containers_by_id.itervalues(): if self._is_container_excluded(container): continue tags = self._get_tags(container, PERFORMANCE) m_func = FUNC_MAP[GAUGE][self.use_histogram] if "SizeRw" in container: m_func(self, 'docker.container.size_rw', container['SizeRw'], tags=tags) if "SizeRootFs" in container: m_func( self, 'docker.container.size_rootfs', container['SizeRootFs'], tags=tags) def _report_image_size(self, images): for image in images: tags = self._get_tags(image, IMAGE) if 'VirtualSize' in image: self.gauge('docker.image.virtual_size', image['VirtualSize'], tags=tags) if 'Size' in image: self.gauge('docker.image.size', image['Size'], tags=tags) # Performance metrics def _report_performance_metrics(self, containers_by_id): containers_without_proc_root = [] for container in containers_by_id.itervalues(): if self._is_container_excluded(container) or not self._is_container_running(container): continue tags = self._get_tags(container, PERFORMANCE) self._report_cgroup_metrics(container, tags) if "_proc_root" not in container: containers_without_proc_root.append(DockerUtil.container_name_extractor(container)[0]) continue self._report_net_metrics(container, tags) if containers_without_proc_root: message = "Couldn't find pid directory for containers: {0}. They'll be missing network metrics".format( ", ".join(containers_without_proc_root)) if not Platform.is_k8s(): self.warning(message) else: # On kubernetes, this is kind of expected. 
Network metrics will be collected by the kubernetes integration anyway self.log.debug(message) def _report_cgroup_metrics(self, container, tags): try: for cgroup in CGROUP_METRICS: stat_file = self._get_cgroup_from_proc(cgroup["cgroup"], container['_pid'], cgroup['file']) stats = self._parse_cgroup_file(stat_file) if stats: for key, (dd_key, metric_func) in cgroup['metrics'].iteritems(): metric_func = FUNC_MAP[metric_func][self.use_histogram] if key in stats: metric_func(self, dd_key, int(stats[key]), tags=tags) # Computed metrics for mname, (key_list, fct, metric_func) in cgroup.get('to_compute', {}).iteritems(): values = [stats[key] for key in key_list if key in stats] if len(values) != len(key_list): self.log.debug("Couldn't compute {0}, some keys were missing.".format(mname)) continue value = fct(*values) metric_func = FUNC_MAP[metric_func][self.use_histogram] if value is not None: metric_func(self, mname, value, tags=tags) except MountException as ex: if self.cgroup_listing_retries > MAX_CGROUP_LISTING_RETRIES: raise ex else: self.warning("Couldn't find the cgroup files. Skipping the CGROUP_METRICS for now." "Will retry {0} times before failing.".format(MAX_CGROUP_LISTING_RETRIES - self.cgroup_listing_retries)) self.cgroup_listing_retries += 1 else: self.cgroup_listing_retries = 0 def _report_net_metrics(self, container, tags): """Find container network metrics by looking at /proc/$PID/net/dev of the container process.""" if self._disable_net_metrics: self.log.debug("Network metrics are disabled. 
Skipping") return proc_net_file = os.path.join(container['_proc_root'], 'net/dev') try: with open(proc_net_file, 'r') as fp: lines = fp.readlines() """Two first lines are headers: Inter-| Receive | Transmit face |bytes packets errs drop fifo frame compressed multicast|bytes packets errs drop fifo colls carrier compressed """ for l in lines[2:]: cols = l.split(':', 1) interface_name = str(cols[0]).strip() if interface_name == 'eth0': x = cols[1].split() m_func = FUNC_MAP[RATE][self.use_histogram] m_func(self, "docker.net.bytes_rcvd", long(x[0]), tags) m_func(self, "docker.net.bytes_sent", long(x[8]), tags) break except Exception as e: # It is possible that the container got stopped between the API call and now self.warning("Failed to report IO metrics from file {0}. Exception: {1}".format(proc_net_file, e)) def _process_events(self, containers_by_id): if self.collect_events is False: # Crawl events for service discovery only self._get_events() return try: api_events = self._get_events() aggregated_events = self._pre_aggregate_events(api_events, containers_by_id) events = self._format_events(aggregated_events, containers_by_id) except (socket.timeout, urllib2.URLError): self.warning('Timeout when collecting events. Events will be missing.') return except Exception as e: self.warning("Unexpected exception when collecting events: {0}. " "Events will be missing".format(e)) return for ev in events: self.log.debug("Creating event: %s" % ev['msg_title']) self.event(ev) def _get_events(self): """Get the list of events.""" events, changed_container_ids = self.docker_util.get_events() if changed_container_ids and self._service_discovery: get_sd_backend(self.agentConfig).update_checks(changed_container_ids) return events def _pre_aggregate_events(self, api_events, containers_by_id): # Aggregate events, one per image. Put newer events first. 
events = defaultdict(deque) for event in api_events: # Skip events related to filtered containers container = containers_by_id.get(event.get('id')) if container is not None and self._is_container_excluded(container): self.log.debug("Excluded event: container {0} status changed to {1}".format( event['id'], event['status'])) continue # from may be missing (for network events for example) if 'from' in event: events[event['from']].appendleft(event) return events def _format_events(self, aggregated_events, containers_by_id): events = [] for image_name, event_group in aggregated_events.iteritems(): container_tags = set() low_prio_events = [] normal_prio_events = [] for event in event_group: container_name = event['id'][:11] if event['id'] in containers_by_id: cont = containers_by_id[event['id']] container_name = DockerUtil.container_name_extractor(cont)[0] container_tags.update(self._get_tags(cont, PERFORMANCE)) container_tags.add('container_name:%s' % container_name) # health checks generate tons of these so we treat them separately and lower their priority if event['status'].startswith('exec_create:') or event['status'].startswith('exec_start:'): low_prio_events.append((event, container_name)) else: normal_prio_events.append((event, container_name)) exec_event = self._create_dd_event(low_prio_events, image_name, container_tags, priority='Low') events.append(exec_event) normal_event = self._create_dd_event(normal_prio_events, image_name, container_tags, priority='Normal') events.append(normal_event) return events def _create_dd_event(self, events, image, c_tags, priority='Normal'): """Create the actual event to submit from a list of similar docker events""" max_timestamp = 0 status = defaultdict(int) status_change = [] for ev, c_name in events: max_timestamp = max(max_timestamp, int(ev['time'])) status[ev['status']] += 1 status_change.append([c_name, ev['status']]) status_text = ", ".join(["%d %s" % (count, st) for st, count in status.iteritems()]) msg_title = "%s %s on 
%s" % (image, status_text, self.hostname) msg_body = ( "%%%\n" "{image_name} {status} on {hostname}\n" "```\n{status_changes}\n```\n" "%%%" ).format( image_name=image, status=status_text, hostname=self.hostname, status_changes="\n".join( ["%s \t%s" % (change[1].upper(), change[0]) for change in status_change]) ) if any(error in status_text for error in ERROR_ALERT_TYPE): alert_type = "error" else: alert_type = None return { 'timestamp': max_timestamp, 'host': self.hostname, 'event_type': EVENT_TYPE, 'msg_title': msg_title, 'msg_text': msg_body, 'source_type_name': EVENT_TYPE, 'event_object': 'docker:%s' % image, 'tags': list(c_tags), 'alert_type': alert_type, 'priority': priority } def _report_disk_stats(self): """Report metrics about the volume space usage""" stats = { 'docker.data.used': None, 'docker.data.total': None, 'docker.data.free': None, 'docker.metadata.used': None, 'docker.metadata.total': None, 'docker.metadata.free': None # these two are calculated by _calc_percent_disk_stats # 'docker.data.percent': None, # 'docker.metadata.percent': None } info = self.docker_client.info() driver_status = info.get('DriverStatus', []) if not driver_status: self.log.warning('Disk metrics collection is enabled but docker info did not' ' report any. 
Your storage driver might not support them, skipping.') return for metric in driver_status: # only consider metrics about disk space if len(metric) == 2 and 'Space' in metric[0]: # identify Data and Metadata metrics mtype = 'data' if 'Metadata' in metric[0]: mtype = 'metadata' if 'Used' in metric[0]: stats['docker.{0}.used'.format(mtype)] = metric[1] elif 'Space Total' in metric[0]: stats['docker.{0}.total'.format(mtype)] = metric[1] elif 'Space Available' in metric[0]: stats['docker.{0}.free'.format(mtype)] = metric[1] stats = self._format_disk_metrics(stats) stats.update(self._calc_percent_disk_stats(stats)) tags = self._get_tags() for name, val in stats.iteritems(): if val is not None: self.gauge(name, val, tags) def _format_disk_metrics(self, metrics): """Cast the disk stats to float and convert them to bytes""" for name, raw_val in metrics.iteritems(): if raw_val: val, unit = raw_val.split(' ') # by default some are uppercased others lowercased. That's error prone. unit = unit.lower() try: val = int(float(val) * UNIT_MAP[unit]) metrics[name] = val except KeyError: self.log.error('Unrecognized unit %s for disk metric %s. Dropping it.' 
% (unit, name)) metrics[name] = None return metrics def _calc_percent_disk_stats(self, stats): """Calculate a percentage of used disk space for data and metadata""" mtypes = ['data', 'metadata'] percs = {} for mtype in mtypes: used = stats.get('docker.{0}.used'.format(mtype)) total = stats.get('docker.{0}.total'.format(mtype)) free = stats.get('docker.{0}.free'.format(mtype)) if used and total and free and ceil(total) < free + used: self.log.debug('used, free, and total disk metrics may be wrong, ' 'used: %s, free: %s, total: %s', used, free, total) total = used + free try: if isinstance(used, int): percs['docker.{0}.percent'.format(mtype)] = round(100 * float(used) / float(total), 2) elif isinstance(free, int): percs['docker.{0}.percent'.format(mtype)] = round(100 * (1.0 - (float(free) / float(total))), 2) except ZeroDivisionError: self.log.error('docker.{0}.total is 0, calculating docker.{1}.percent' ' is not possible.'.format(mtype, mtype)) return percs # Cgroups def _get_cgroup_from_proc(self, cgroup, pid, filename): """Find a specific cgroup file, containing metrics to extract.""" params = { "file": filename, } return DockerUtil.find_cgroup_from_proc(self._mountpoints, pid, cgroup, self.docker_util._docker_root) % (params) def _parse_cgroup_file(self, stat_file): """Parse a cgroup pseudo file for key/values.""" self.log.debug("Opening cgroup file: %s" % stat_file) try: with open(stat_file, 'r') as fp: if 'blkio' in stat_file: return self._parse_blkio_metrics(fp.read().splitlines()) elif 'cpuacct.usage' in stat_file: return dict({"usage": str(int(fp.read())/10000000)}) else: return dict(map(lambda x: x.split(' ', 1), fp.read().splitlines())) except IOError: # It is possible that the container got stopped between the API call and now self.log.info("Can't open %s. Metrics for this container are skipped." 
% stat_file) def _parse_blkio_metrics(self, stats): """Parse the blkio metrics.""" metrics = { 'io_read': 0, 'io_write': 0, } for line in stats: if 'Read' in line: metrics['io_read'] += int(line.split()[2]) if 'Write' in line: metrics['io_write'] += int(line.split()[2]) return metrics def _is_container_cgroup(self, line, selinux_policy): if line[1] not in ('cpu,cpuacct', 'cpuacct,cpu', 'cpuacct') or line[2] == '/docker-daemon': return False if 'docker' in line[2]: # general case return True if 'docker' in selinux_policy: # selinux return True if line[2].startswith('/') and re.match(CONTAINER_ID_RE, line[2][1:]): # kubernetes return True return False # proc files def _crawl_container_pids(self, container_dict, custom_cgroups=False): """Crawl `/proc` to find container PIDs and add them to `containers_by_id`.""" proc_path = os.path.join(self.docker_util._docker_root, 'proc') pid_dirs = [_dir for _dir in os.listdir(proc_path) if _dir.isdigit()] if len(pid_dirs) == 0: self.warning("Unable to find any pid directory in {0}. " "If you are running the agent in a container, make sure to " 'share the volume properly: "/proc:/host/proc:ro". ' "See https://github.com/DataDog/docker-dd-agent/blob/master/README.md for more information. 
" "Network metrics will be missing".format(proc_path)) self._disable_net_metrics = True return container_dict self._disable_net_metrics = False for folder in pid_dirs: try: path = os.path.join(proc_path, folder, 'cgroup') with open(path, 'r') as f: content = [line.strip().split(':') for line in f.readlines()] selinux_policy = '' path = os.path.join(proc_path, folder, 'attr', 'current') if os.path.exists(path): with open(path, 'r') as f: selinux_policy = f.readlines()[0] except IOError, e: # Issue #2074 self.log.debug("Cannot read %s, " "process likely raced to finish : %s" % (path, str(e))) except Exception as e: self.warning("Cannot read %s : %s" % (path, str(e))) continue try: for line in content: if self._is_container_cgroup(line, selinux_policy): cpuacct = line[2] break else: continue matches = re.findall(CONTAINER_ID_RE, cpuacct) if matches: container_id = matches[-1] if container_id not in container_dict: self.log.debug("Container %s not in container_dict, it's likely excluded", container_id) continue container_dict[container_id]['_pid'] = folder container_dict[container_id]['_proc_root'] = os.path.join(proc_path, folder) elif custom_cgroups: # if we match by pid that should be enough (?) - O(n) ugh! for _, container in container_dict.iteritems(): if container.get('_pid') == int(folder): container['_proc_root'] = os.path.join(proc_path, folder) break except Exception, e: self.warning("Cannot parse %s content: %s" % (path, str(e))) continue
class SDDockerBackend(AbstractSDBackend):
    """Docker-based service discovery.

    Builds check configurations for the running containers by filling
    configuration templates (fetched from the config store) with values
    extracted from `docker inspect` and, on Kubernetes, from the pod list.
    """

    def __init__(self, agentConfig):
        """Set up the docker client, the config store and the variable resolvers.

        When the config store can't be instantiated, 'sd_config_backend' is
        reset to None in `agentConfig` and the store is requested again.
        """
        self.docker_client = DockerUtil().client
        if is_k8s():
            self.kubeutil = KubeUtil()

        try:
            self.config_store = get_config_store(agentConfig=agentConfig)
        except Exception as e:
            log.error('Failed to instantiate the config store client. '
                      'Auto-config only will be used. %s' % str(e))
            agentConfig['sd_config_backend'] = None
            self.config_store = get_config_store(agentConfig=agentConfig)

        # template variable prefix -> method resolving its value,
        # called as method(inspect, variable_name)
        self.VAR_MAPPING = {
            'host': self._get_host_address,
            'port': self._get_port,
            'tags': self._get_additional_tags,
        }
        AbstractSDBackend.__init__(self, agentConfig)

    def _get_host_address(self, c_inspect, tpl_var):
        """Extract the container IP from a docker inspect object, or the kubelet API.

        Lookup order: the named networks (only when `tpl_var` carries a
        specifier after an underscore), the 'NetworkSettings.IPAddress' field,
        then the pod IP on Kubernetes.

        :return: the IP address, or None when none was found.
        """
        c_id, c_img = c_inspect.get('Id', ''), c_inspect.get('Config', {}).get('Image', '')
        tpl_parts = tpl_var.split('_')

        # a specifier was given
        if len(tpl_parts) > 1:
            networks = c_inspect.get('NetworkSettings', {}).get('Networks') or {}
            ip_dict = {}
            for net_name, net_desc in networks.iteritems():
                ip = net_desc.get('IPAddress')
                if ip:
                    ip_dict[net_name] = ip
            ip_addr = self._extract_ip_from_networks(ip_dict, tpl_var)
            if ip_addr:
                return ip_addr

        # try to get the bridge IP address
        log.debug("No network found for container %s (%s), trying with IPAddress field" % (c_id[:12], c_img))
        ip_addr = c_inspect.get('NetworkSettings', {}).get('IPAddress')
        if ip_addr:
            return ip_addr

        if is_k8s():
            # kubernetes case
            log.debug("Couldn't find the IP address for container %s (%s), "
                      "using the kubernetes way." % (c_id[:12], c_img))
            pod_list = self.kubeutil.retrieve_pods_list().get('items', [])
            for pod in pod_list:
                pod_ip = pod.get('status', {}).get('podIP')
                if pod_ip is None:
                    continue
                c_statuses = pod.get('status', {}).get('containerStatuses', [])
                for status in c_statuses:
                    # compare the container id with those of containers in the current pod
                    if c_id == status.get('containerID', '').split('//')[-1]:
                        return pod_ip

        log.error("No IP address was found for container %s (%s)" % (c_id[:12], c_img))
        return None

    def _extract_ip_from_networks(self, ip_dict, tpl_var):
        """Extract a single IP from a dictionary made of network names and IPs.

        The part of `tpl_var` after the last underscore is used as the network
        name; `_get_fallback_ip` is used when it is missing or unknown.

        :return: an IP address, or None when `ip_dict` is empty.
        """
        if not ip_dict:
            return None
        tpl_parts = tpl_var.split('_')

        # no specifier
        if len(tpl_parts) < 2:
            log.warning("No key was passed for template variable %s." % tpl_var)
            return self._get_fallback_ip(ip_dict)

        res = ip_dict.get(tpl_parts[-1])
        if res is None:
            log.warning("The key passed for template variable %s was not found." % tpl_var)
            return self._get_fallback_ip(ip_dict)
        return res

    def _get_fallback_ip(self, ip_dict):
        """try to pick the bridge key, falls back to the value of the last key"""
        if 'bridge' in ip_dict:
            log.warning("Using the bridge network.")
            return ip_dict['bridge']

        # "last" refers to the sorted order of the network names
        last_key = sorted(ip_dict.iterkeys())[-1]
        log.warning("Trying with the last key: '%s'." % last_key)
        return ip_dict[last_key]

    def _get_port(self, container_inspect, tpl_var):
        """Extract a port from a container_inspect or the k8s API given a template variable.

        Lookup order: 'NetworkSettings.Ports', then 'Config.ExposedPorts',
        then (on Kubernetes) the container ports declared in the pod spec.
        Each fallback is tried whenever the previous source yields no port,
        whether it was missing or simply empty.

        :return: the port selected by `_extract_port_from_list` among the
            ports sorted numerically, or None when no port was found.
        """
        c_id = container_inspect.get('Id', '')

        try:
            ports = [p.split('/')[0] for p in container_inspect['NetworkSettings']['Ports'].keys()]
        except (IndexError, KeyError, AttributeError):
            ports = []

        if not ports:
            # try to get ports from the docker API. Works if the image has an EXPOSE instruction
            exposed = (container_inspect.get('Config') or {}).get('ExposedPorts') or {}
            ports = [p.split('/')[0] for p in exposed.keys()]

        # if it failed, try with the kubernetes API
        if not ports and is_k8s():
            log.debug("Didn't find the port for container %s (%s), trying the kubernetes way." %
                      (c_id[:12], container_inspect.get('Config', {}).get('Image', '')))
            # _get_kube_config returns None when no pod owns the container
            pod_status = self._get_kube_config(c_id, 'status') or {}
            c_name = None
            for co in pod_status.get('containerStatuses', []):
                if co.get('containerID', '').split('//')[-1] == c_id:
                    c_name = co.get('name')
                    break
            pod_spec = self._get_kube_config(c_id, 'spec') or {}
            for co in pod_spec.get('containers', []):
                if co.get('name') == c_name:
                    ports = [str(p.get('containerPort')) for p in co.get('ports', [])
                             if p.get('containerPort') is not None]

        ports = sorted(ports, key=int)
        return self._extract_port_from_list(ports, tpl_var)

    def _extract_port_from_list(self, ports, tpl_var):
        """Pick one port out of `ports` according to the index carried by `tpl_var`.

        The part of `tpl_var` after the last underscore is used as an index;
        the last port is returned when there is no index, or when it is not an
        integer or out of range.

        :return: the selected port, or None when `ports` is empty.
        """
        if not ports:
            return None

        tpl_parts = tpl_var.split('_')

        if len(tpl_parts) == 1:
            log.debug("No index was passed for template variable %s. "
                      "Trying with the last element." % tpl_var)
            return ports[-1]

        try:
            idx = tpl_parts[-1]
            return ports[int(idx)]
        except ValueError:
            log.error("Port index is not an integer. Using the last element instead.")
        except IndexError:
            log.error("Port index is out of range. Using the last element instead.")
        return ports[-1]

    def get_tags(self, c_inspect):
        """Extract useful tags from docker or platform APIs. These are collected by default.

        On Kubernetes: one tag per pod label, the replication controller name
        (from the 'kubernetes.io/created-by' annotation) and the namespace.

        :return: list of tags; empty outside Kubernetes or when the pod
            metadata can't be fetched.
        """
        tags = []
        if is_k8s():
            pod_metadata = self._get_kube_config(c_inspect.get('Id'), 'metadata')

            if pod_metadata is None:
                log.warning("Failed to fetch pod metadata for container %s."
                            " Kubernetes tags may be missing." % c_inspect.get('Id', '')[:12])
                return []

            # get labels
            kube_labels = pod_metadata.get('labels', {})
            for label, value in kube_labels.iteritems():
                tags.append('%s:%s' % (label, value))

            # get replication controller
            created_by = json.loads(pod_metadata.get('annotations', {}).get('kubernetes.io/created-by', '{}'))
            if created_by.get('reference', {}).get('kind') == 'ReplicationController':
                tags.append('kube_replication_controller:%s' % created_by.get('reference', {}).get('name'))

            # get kubernetes namespace
            tags.append('kube_namespace:%s' % pod_metadata.get('namespace'))

        return tags

    def _get_additional_tags(self, container_inspect, *args):
        """Return the node name and pod name tags of a container on Kubernetes.

        Extra positional arguments are accepted (this method is called through
        `VAR_MAPPING` with the variable name) and ignored.

        :return: list of tags; empty outside Kubernetes or when the pod
            metadata or spec can't be fetched.
        """
        tags = []
        if is_k8s():
            pod_metadata = self._get_kube_config(container_inspect.get('Id'), 'metadata')
            pod_spec = self._get_kube_config(container_inspect.get('Id'), 'spec')
            if pod_metadata is None or pod_spec is None:
                log.warning("Failed to fetch pod metadata or pod spec for container %s."
                            " Additional Kubernetes tags may be missing." % container_inspect.get('Id', '')[:12])
                return []
            tags.append('node_name:%s' % pod_spec.get('nodeName'))
            tags.append('pod_name:%s' % pod_metadata.get('name'))
        return tags

    def _get_kube_config(self, c_id, key):
        """Get a part of a pod config from the kubernetes API.

        :param c_id: id of a container; the pod whose container statuses
            reference this id is looked up.
        :param key: top-level key of the pod object to return.
        :return: the value stored under `key` ({} if absent), or None when no
            pod references the container.
        """
        pods = self.kubeutil.retrieve_pods_list().get('items', [])
        for pod in pods:
            c_statuses = pod.get('status', {}).get('containerStatuses', [])
            for status in c_statuses:
                if c_id == status.get('containerID', '').split('//')[-1]:
                    return pod.get(key, {})
        return None

    def get_configs(self):
        """Get the config for all docker containers running on the host.

        :return: dict of check name -> (init_config, [instances]), or
            check name -> (source, (init_config, [instances])) when the
            TRACE_CONFIG option is set. A container whose configuration fails
            to build is logged and skipped.
        """
        configs = {}
        containers = [(
            container.get('Image'),
            container.get('Id'), container.get('Labels')
        ) for container in self.docker_client.containers()]

        # used by the configcheck agent command to trace where check configs come from
        trace_config = self.agentConfig.get(TRACE_CONFIG, False)

        for image, cid, labels in containers:
            try:
                # value of the DATADOG_ID tag or the image name if the label is missing
                identifier = self.get_config_id(image, labels)
                check_configs = self._get_check_configs(cid, identifier, trace_config=trace_config) or []
                for conf in check_configs:
                    if trace_config and conf is not None:
                        source, conf = conf

                    check_name, init_config, instance = conf
                    # build instances list if needed
                    if configs.get(check_name) is None:
                        if trace_config:
                            configs[check_name] = (source, (init_config, [instance]))
                        else:
                            configs[check_name] = (init_config, [instance])
                    else:
                        conflict_init_msg = 'Different versions of `init_config` found for check {0}. ' \
                            'Keeping the first one found.'
                        if trace_config:
                            if configs[check_name][1][0] != init_config:
                                log.warning(conflict_init_msg.format(check_name))
                            configs[check_name][1][1].append(instance)
                        else:
                            if configs[check_name][0] != init_config:
                                log.warning(conflict_init_msg.format(check_name))
                            configs[check_name][1].append(instance)
            except Exception:
                log.exception('Building config for container %s based on image %s using service '
                              'discovery failed, leaving it alone.' % (cid[:12], image))
        return configs

    def get_config_id(self, image, labels):
        """Look for a DATADOG_ID label, return its value or the image name if missing.

        `labels` may be None (no labels reported for the container), in which
        case the image name is returned.
        """
        return (labels or {}).get(DATADOG_ID) or image

    def _get_check_configs(self, c_id, identifier, trace_config=False):
        """Retrieve configuration templates and fill them with data pulled from docker and tags.

        :return: list of (check_name, init_config, instance) tuples - each one
            wrapped as (source, tuple) when `trace_config` is set - or None
            when no template matches `identifier`.
        """
        inspect = self.docker_client.inspect_container(c_id)
        config_templates = self._get_config_templates(identifier, trace_config=trace_config)
        if not config_templates:
            log.debug('No config template for container %s with identifier %s. '
                      'It will be left unconfigured.' % (c_id[:12], identifier))
            return None

        check_configs = []
        tags = self.get_tags(inspect)
        for config_tpl in config_templates:
            if trace_config:
                source, config_tpl = config_tpl
            check_name, init_config_tpl, instance_tpl, variables = config_tpl

            # insert tags in instance_tpl and process values for template variables
            instance_tpl, var_values = self._fill_tpl(inspect, instance_tpl, variables, tags)

            tpl = self._render_template(init_config_tpl or {}, instance_tpl or {}, var_values)
            if tpl and len(tpl) == 2:
                init_config, instance = tpl
                if trace_config:
                    check_configs.append((source, (check_name, init_config, instance)))
                else:
                    check_configs.append((check_name, init_config, instance))

        return check_configs

    def _get_config_templates(self, identifier, trace_config=False):
        """Extract config templates for an identifier from a K/V store and returns it as a dict object.

        :return: list of (check_name, init_config_tpl, instance_tpl, variables)
            tuples - each one wrapped as (source, tuple) when `trace_config`
            is set - or None as soon as one template is missing, malformed or
            not valid JSON.
        """
        config_backend = self.agentConfig.get('sd_config_backend')
        templates = []
        if config_backend is None:
            auto_conf = True
            log.warning('No supported configuration backend was provided, using auto-config only.')
        else:
            auto_conf = False

        # format: [('ident', {init_tpl}, {instance_tpl})] without trace_config
        # or      [(source, ('ident', {init_tpl}, {instance_tpl}))] with trace_config
        raw_tpls = self.config_store.get_check_tpls(
            identifier, auto_conf=auto_conf, trace_config=trace_config)
        for tpl in raw_tpls:
            if trace_config and tpl is not None:
                # each template can come from either auto configuration or user-supplied templates
                source, tpl = tpl
            if tpl is not None and len(tpl) == 3:
                check_name, init_config_tpl, instance_tpl = tpl
            else:
                log.debug('No template was found for identifier %s, leaving it alone.' % identifier)
                return None
            try:
                # build a list of all variables to replace in the template
                variables = self.PLACEHOLDER_REGEX.findall(str(init_config_tpl)) + \
                    self.PLACEHOLDER_REGEX.findall(str(instance_tpl))
                variables = [var.strip('%') for var in variables]
                if not isinstance(init_config_tpl, dict):
                    init_config_tpl = json.loads(init_config_tpl or '{}')
                if not isinstance(instance_tpl, dict):
                    instance_tpl = json.loads(instance_tpl or '{}')
            # NOTE(review): the Python 2 standard `json` module has no
            # JSONDecodeError (simplejson does) - confirm which module is
            # imported as `json` in this file.
            except json.JSONDecodeError:
                log.exception('Failed to decode the JSON template fetched for check {0}. Its configuration'
                              ' by service discovery failed for ident  {1}.'.format(check_name, identifier))
                return None

            if trace_config:
                templates.append((source, (check_name, init_config_tpl, instance_tpl, variables)))
            else:
                templates.append((check_name, init_config_tpl, instance_tpl, variables))

        return templates

    def _fill_tpl(self, inspect, instance_tpl, variables, tags=None):
        """Add container tags to instance templates and build a dict from template variable names and their values.

        :param inspect: docker inspect object of the container.
        :param instance_tpl: instance template dict; its 'tags' entry is
            replaced by the deduplicated union of `tags` and its own tags.
        :param variables: names of the template variables to resolve.
        :param tags: default tags of the container; left untouched.
        :return: (instance_tpl, dict of variable name -> resolved value).
            Variables that can't be resolved are logged and left out.
        """
        var_values = {}
        c_id, c_image = inspect.get('Id', ''), inspect.get('Config', {}).get('Image', '')

        # add default tags to the instance
        if tags:
            tpl_tags = instance_tpl.get('tags', [])
            if not isinstance(tpl_tags, list):
                tpl_tags = [tpl_tags]
            # build a new list: the caller reuses `tags` for every template of
            # the container, so it must not accumulate template specific tags
            instance_tpl['tags'] = list(set(list(tags) + tpl_tags))

        for var in variables:
            # variables can be suffixed with an index in case several values are found
            var_prefix = var.split('_')[0]
            if var_prefix in self.VAR_MAPPING:
                try:
                    res = self.VAR_MAPPING[var_prefix](inspect, var)
                    if res is None:
                        raise ValueError("Invalid value for variable %s." % var)
                    var_values[var] = res
                except Exception as ex:
                    log.error("Could not find a value for the template variable %s for container %s "
                              "(%s): %s" % (var, c_id[:12], c_image, str(ex)))
            else:
                log.error("No method was found to interpolate template variable %s for container %s "
                          "(%s)." % (var, c_id[:12], c_image))

        return instance_tpl, var_values
def __init__(self, name, init_config, agentConfig, instances=None):
    """Initialize the check and its `KubeUtil` helper.

    Raises an Exception when more than one instance is configured.
    """
    instance_count = 0 if instances is None else len(instances)
    if instance_count > 1:
        raise Exception(
            'Kubernetes check only supports one configured instance.')

    AgentCheck.__init__(self, name, init_config, agentConfig, instances)
    self.kubeutil = KubeUtil()
class TestKubeutil(unittest.TestCase):
    """Unit tests for the `KubeUtil` helpers and `is_k8s`."""

    def setUp(self):
        self.kubeutil = KubeUtil()

    @staticmethod
    def _load_pods(fixture_name):
        """Return the decoded content of a pods list JSON fixture."""
        return json.loads(Fixtures.read_file(fixture_name, string_escape=False))

    @staticmethod
    def _flatten_labels(labels_by_pod):
        """Return the set of all labels found in the values of `labels_by_pod`."""
        return set(inn for out in labels_by_pod.values() for inn in out)

    @mock.patch("utils.kubeutil.KubeUtil.retrieve_pods_list", side_effect=["foo"])
    @mock.patch("utils.kubeutil.KubeUtil.extract_kube_labels")
    def test_get_kube_labels(self, extract_kube_labels, retrieve_pods_list):
        self.kubeutil.get_kube_labels(excluded_keys="bar")
        # call_count is checked explicitly: `assert_called_once()` is not an
        # assertion on every release of the mock library
        self.assertEqual(retrieve_pods_list.call_count, 1)
        extract_kube_labels.assert_called_once_with("foo", excluded_keys="bar")

    def test_extract_kube_labels(self):
        """
        Test with both 1.1 and 1.2 version payloads
        """
        res = self.kubeutil.extract_kube_labels({}, ["foo"])
        self.assertEqual(len(res), 0)

        pods = self._load_pods("pods_list_1.1.json")
        res = self.kubeutil.extract_kube_labels(pods, ["foo"])
        self.assertEqual(len(self._flatten_labels(res)), 8)
        res = self.kubeutil.extract_kube_labels(pods, ["k8s-app"])
        self.assertEqual(len(self._flatten_labels(res)), 6)

        pods = self._load_pods("pods_list_1.2.json")
        res = self.kubeutil.extract_kube_labels(pods, ["foo"])
        self.assertEqual(len(self._flatten_labels(res)), 3)
        res = self.kubeutil.extract_kube_labels(pods, ["k8s-app"])
        self.assertEqual(len(self._flatten_labels(res)), 3)

    def test_extract_meta(self):
        """
        Test with both 1.1 and 1.2 version payloads
        """
        res = self.kubeutil.extract_meta({}, "foo")
        self.assertEqual(len(res), 0)

        pods = self._load_pods("pods_list_1.1.json")
        res = self.kubeutil.extract_meta(pods, "foo")
        self.assertEqual(len(res), 0)
        res = self.kubeutil.extract_meta(pods, "uid")
        self.assertEqual(len(res), 6)

        pods = self._load_pods("pods_list_1.2.json")
        res = self.kubeutil.extract_meta(pods, "foo")
        self.assertEqual(len(res), 0)
        res = self.kubeutil.extract_meta(pods, "uid")
        self.assertEqual(len(res), 4)

    @mock.patch("utils.kubeutil.retrieve_json")
    def test_retrieve_pods_list(self, retrieve_json):
        self.kubeutil.retrieve_pods_list()
        retrieve_json.assert_called_once_with(self.kubeutil.pods_list_url)

    @mock.patch("utils.kubeutil.retrieve_json")
    def test_retrieve_metrics(self, retrieve_json):
        self.kubeutil.retrieve_metrics()
        retrieve_json.assert_called_once_with(self.kubeutil.metrics_url)

    def test_filter_pods_list(self):
        """
        Test with both 1.1 and 1.2 version payloads
        """
        res = self.kubeutil.filter_pods_list({}, "foo")
        self.assertEqual(len(res.get("items")), 0)

        # the fixture is loaded again before every call, as each call gets its
        # own fresh payload
        pods = self._load_pods("pods_list_1.1.json")
        res = self.kubeutil.filter_pods_list(pods, "10.240.0.9")
        self.assertEqual(len(res.get("items")), 5)

        pods = self._load_pods("pods_list_1.1.json")
        res = self.kubeutil.filter_pods_list(pods, "foo")
        self.assertEqual(len(res.get("items")), 0)

        pods = self._load_pods("pods_list_1.2.json")
        res = self.kubeutil.filter_pods_list(pods, "10.240.0.5")
        self.assertEqual(len(res.get("items")), 1)

        pods = self._load_pods("pods_list_1.2.json")
        res = self.kubeutil.filter_pods_list(pods, "foo")
        self.assertEqual(len(res.get("items")), 0)

    @mock.patch("utils.kubeutil.requests")
    def test_retrieve_json_auth(self, r):
        self.kubeutil.retrieve_json_auth("url", "foo_tok")
        r.get.assert_called_once_with("url", verify=False, timeout=10, headers={"Authorization": "Bearer foo_tok"})

        # any existing file is accepted as a CA certificate path
        self.kubeutil.CA_CRT_PATH = __file__
        self.kubeutil.retrieve_json_auth("url", "foo_tok")
        r.get.assert_called_with("url", verify=__file__, timeout=10, headers={"Authorization": "Bearer foo_tok"})

    def test_get_node_info(self):
        with mock.patch("utils.kubeutil.KubeUtil._fetch_host_data") as f:
            self.kubeutil.get_node_info()
            self.assertEqual(f.call_count, 1)

            f.reset_mock()

            # once the node info is known, it must not be fetched again
            self.kubeutil._node_ip = "foo"
            self.kubeutil._node_name = "bar"
            ip, name = self.kubeutil.get_node_info()
            self.assertEqual(ip, "foo")
            self.assertEqual(name, "bar")
            self.assertEqual(f.call_count, 0)

    def test__fetch_host_data(self):
        """
        Test with both 1.1 and 1.2 version payloads
        """
        with mock.patch("utils.kubeutil.KubeUtil.retrieve_pods_list") as mock_pods:
            self.kubeutil.host_name = "dd-agent-1rxlh"
            mock_pods.return_value = self._load_pods("pods_list_1.2.json")
            self.kubeutil._fetch_host_data()
            self.assertEqual(self.kubeutil._node_ip, "10.240.0.9")
            self.assertEqual(self.kubeutil._node_name, "kubernetes-massi-minion-k23m")

            self.kubeutil.host_name = "heapster-v11-l8sh1"
            mock_pods.return_value = self._load_pods("pods_list_1.1.json")
            self.kubeutil._fetch_host_data()
            self.assertEqual(self.kubeutil._node_ip, "10.240.0.9")
            self.assertEqual(self.kubeutil._node_name, "gke-cluster-1-8046fdfa-node-ld35")

    def test_get_auth_token(self):
        # patch.object restores the class attribute so that the fake paths
        # don't leak into the other tests
        with mock.patch.object(KubeUtil, "AUTH_TOKEN_PATH", "/foo/bar", create=True):
            self.assertIsNone(KubeUtil.get_auth_token())
        # any file could do the trick
        with mock.patch.object(KubeUtil, "AUTH_TOKEN_PATH", Fixtures.file("events.json"), create=True):
            self.assertIsNotNone(KubeUtil.get_auth_token())

    def test_is_k8s(self):
        # os.unsetenv() does not update os.environ, so the variable is removed
        # from os.environ itself; patch.dict restores the environment on exit.
        with mock.patch.dict(os.environ):
            os.environ.pop("KUBERNETES_PORT", None)
            self.assertFalse(is_k8s())
            os.environ["KUBERNETES_PORT"] = "999"
            self.assertTrue(is_k8s())
class Kubernetes(AgentCheck):
    """ Collect metrics and events from kubelet """

    pod_names_by_container = {}

    # number embedded in a resource limit/request value string
    _RESOURCE_VALUE_RE = re.compile(r'[-+]?\d+[\.]?\d*')

    # seconds to wait for the kubelet health endpoint before giving up
    _KUBELET_CHECK_TIMEOUT = 10

    def __init__(self, name, init_config, agentConfig, instances=None):
        """Initialize the check and its `KubeUtil` helper.

        Raises an Exception when more than one instance is configured or when
        `KubeUtil` has no host.
        """
        if instances is not None and len(instances) > 1:
            raise Exception('Kubernetes check only supports one configured instance.')

        AgentCheck.__init__(self, name, init_config, agentConfig, instances)

        # `instances` may be None or empty
        inst = instances[0] if instances else None
        self.kubeutil = KubeUtil(instance=inst)
        if not self.kubeutil.host:
            raise Exception('Unable to retrieve Docker hostname and host parameter is not set')

    def _perform_kubelet_checks(self, url):
        """Query the kubelet health endpoint and submit the matching service checks.

        Every line shaped like "[<status>]<name> <details>" produces a service
        check named after <name>: OK when <status> is '+', CRITICAL otherwise.
        A global service check is submitted as well: OK when every line was
        OK, CRITICAL otherwise or when the endpoint can't be queried.
        """
        service_check_base = NAMESPACE + '.kubelet.check'
        is_ok = True
        try:
            # a timeout is set so that an unresponsive kubelet can't block the check
            r = requests.get(url, timeout=self._KUBELET_CHECK_TIMEOUT)
            for line in r.iter_lines():

                # avoid noise; this check is expected to fail since we override the container hostname
                if 'hostname' in line:
                    continue

                matches = re.match(r'\[(.)\]([^\s]+) (.*)?', line)
                if not matches or len(matches.groups()) < 2:
                    continue

                service_check_name = service_check_base + '.' + matches.group(2)
                status = matches.group(1)
                if status == '+':
                    self.service_check(service_check_name, AgentCheck.OK)
                else:
                    self.service_check(service_check_name, AgentCheck.CRITICAL)
                    is_ok = False

        except Exception as e:
            self.log.warning('kubelet check %s failed: %s' % (url, str(e)))
            self.service_check(service_check_base, AgentCheck.CRITICAL,
                               message='Kubelet check %s failed: %s' % (url, str(e)))
        else:
            if is_ok:
                self.service_check(service_check_base, AgentCheck.OK)
            else:
                self.service_check(service_check_base, AgentCheck.CRITICAL)

    def check(self, instance):
        """Run the check: kubelet health checks, metrics and (optionally) events.

        The publishing options ('max_depth', 'enabled_gauges', 'enabled_rates',
        'publish_aliases', 'use_histogram') are read from `instance` and
        stored on the check before anything is collected.
        """
        self.max_depth = instance.get('max_depth', DEFAULT_MAX_DEPTH)
        enabled_gauges = instance.get('enabled_gauges', DEFAULT_ENABLED_GAUGES)
        self.enabled_gauges = ["{0}.{1}".format(NAMESPACE, x) for x in enabled_gauges]
        enabled_rates = instance.get('enabled_rates', DEFAULT_ENABLED_RATES)
        self.enabled_rates = ["{0}.{1}".format(NAMESPACE, x) for x in enabled_rates]

        self.publish_aliases = _is_affirmative(instance.get('publish_aliases', DEFAULT_PUBLISH_ALIASES))
        self.use_histogram = _is_affirmative(instance.get('use_histogram', DEFAULT_USE_HISTOGRAM))
        # functions taken from FUNC_MAP; they are called with the check as
        # first argument
        self.publish_rate = FUNC_MAP[RATE][self.use_histogram]
        self.publish_gauge = FUNC_MAP[GAUGE][self.use_histogram]

        pods_list = self.kubeutil.retrieve_pods_list()

        # kubelet health checks
        self._perform_kubelet_checks(self.kubeutil.kube_health_url)

        # kubelet metrics
        self._update_metrics(instance, pods_list)

        # kubelet events
        if _is_affirmative(instance.get('collect_events', DEFAULT_COLLECT_EVENTS)):
            self._process_events(instance, pods_list)

    def _publish_raw_metrics(self, metric, dat, tags, depth=0):
        """Recursively publish the numeric values found in `dat`.

        Numbers are published as a rate or a gauge when `metric` matches one
        of the enabled patterns; dicts are walked key by key (the lowercased
        key is appended to the metric name); only the last item of a list is
        considered. The recursion stops once `max_depth` is reached.
        """
        if depth >= self.max_depth:
            self.log.warning('Reached max depth on metric=%s' % metric)
            return

        if isinstance(dat, numbers.Number):
            if self.enabled_rates and any(fnmatch(metric, pat) for pat in self.enabled_rates):
                self.publish_rate(self, metric, float(dat), tags)
            elif self.enabled_gauges and any(fnmatch(metric, pat) for pat in self.enabled_gauges):
                self.publish_gauge(self, metric, float(dat), tags)

        elif isinstance(dat, dict):
            for k, v in dat.iteritems():
                self._publish_raw_metrics(metric + '.%s' % k.lower(), v, tags, depth + 1)

        elif isinstance(dat, list) and dat:
            # an empty list holds nothing to publish
            self._publish_raw_metrics(metric, dat[-1], tags, depth + 1)

    @staticmethod
    def _shorten_name(name):
        """Truncate every run of 64 or more hexadecimal digits in `name` to its first 12."""
        # shorten docker image id
        return re.sub(r'([0-9a-fA-F]{64,})', lambda x: x.group(1)[0:12], name)

    def _get_post_1_2_tags(self, cont_labels, subcontainer, kube_labels):
        """Build the pod related tags of a container labelled with both its pod name and namespace.

        :param cont_labels: labels of the container; must hold the pod name
            and namespace labels.
        :param subcontainer: container payload, read for its 'aliases'.
        :param kube_labels: mapping of "<namespace>/<pod name>" -> pod labels.
        :return: list of tags.
        """
        tags = []

        pod_name = cont_labels[KubeUtil.POD_NAME_LABEL]
        pod_namespace = cont_labels[KubeUtil.NAMESPACE_LABEL]
        tags.append(u"pod_name:{0}/{1}".format(pod_namespace, pod_name))
        tags.append(u"kube_namespace:{0}".format(pod_namespace))

        kube_labels_key = "{0}/{1}".format(pod_namespace, pod_name)

        pod_labels = kube_labels.get(kube_labels_key)
        if pod_labels:
            tags += list(pod_labels)

        if "-" in pod_name:
            # everything before the last dash is reported as the replication controller
            replication_controller = "-".join(pod_name.split("-")[:-1])
            tags.append("kube_replication_controller:%s" % replication_controller)

        if self.publish_aliases and subcontainer.get("aliases"):
            for alias in subcontainer['aliases'][1:]:
                # we don't add the first alias as it will be the container_name
                tags.append('container_alias:%s' % (self._shorten_name(alias)))

        return tags

    def _get_pre_1_2_tags(self, cont_labels, subcontainer, kube_labels):
        """Build the pod related tags of a container labelled with its pod name only.

        :param cont_labels: labels of the container; must hold the pod name
            label, whose value may embed the namespace ("<namespace>/<name>").
        :param subcontainer: container payload, read for its 'aliases'.
        :param kube_labels: mapping of pod name -> pod labels.
        :return: list of tags.
        """
        tags = []

        pod_name = cont_labels[KubeUtil.POD_NAME_LABEL]
        tags.append(u"pod_name:{0}".format(pod_name))

        pod_labels = kube_labels.get(pod_name)
        if pod_labels:
            tags.extend(list(pod_labels))

        if "-" in pod_name:
            # everything before the last dash is reported as the replication controller
            replication_controller = "-".join(pod_name.split("-")[:-1])
            if "/" in replication_controller:
                namespace, replication_controller = replication_controller.split("/", 1)
                tags.append(u"kube_namespace:%s" % namespace)

            tags.append(u"kube_replication_controller:%s" % replication_controller)

        if self.publish_aliases and subcontainer.get("aliases"):
            for alias in subcontainer['aliases'][1:]:
                # we don't add the first alias as it will be the container_name
                tags.append(u"container_alias:%s" % (self._shorten_name(alias)))

        return tags

    def _update_container_metrics(self, instance, subcontainer, kube_labels):
        """Publish the metrics of one container and return the tags they were sent with.

        :param instance: check instance, read for its custom 'tags'.
        :param subcontainer: container payload ('aliases', 'name', 'spec' and
            'stats' are read); the latest entry of 'stats' is published.
        :param kube_labels: pod labels, as expected by the tag builders.
        :return: list of tags attached to the container metrics.
        """
        tags = list(instance.get('tags') or [])  # add support for custom tags

        if len(subcontainer.get('aliases', [])) >= 1:
            # The first alias seems to always match the docker container name
            container_name = subcontainer['aliases'][0]
        else:
            # We default to the container id
            container_name = subcontainer['name']

        tags.append('container_name:%s' % container_name)

        try:
            cont_labels = subcontainer['spec']['labels']
        except KeyError:
            self.log.debug("Subcontainer, doesn't have any labels")
            cont_labels = {}

        # Collect pod names, namespaces, rc...
        if KubeUtil.NAMESPACE_LABEL in cont_labels and KubeUtil.POD_NAME_LABEL in cont_labels:
            # Kubernetes >= 1.2
            tags += self._get_post_1_2_tags(cont_labels, subcontainer, kube_labels)

        elif KubeUtil.POD_NAME_LABEL in cont_labels:
            # Kubernetes <= 1.1
            tags += self._get_pre_1_2_tags(cont_labels, subcontainer, kube_labels)

        else:
            # Those are containers that are not part of a pod.
            # They are top aggregate views and don't have the previous metadata.
            tags.append("pod_name:no_pod")

        stats = subcontainer['stats'][-1]  # take the latest
        self._publish_raw_metrics(NAMESPACE, stats, tags)

        if subcontainer.get("spec", {}).get("has_filesystem"):
            fs_stats = stats.get('filesystem')
            # skip the usage ratio when there is no filesystem entry or when
            # the reported capacity is 0, rather than failing the container
            if fs_stats and float(fs_stats[-1]['capacity']):
                fs = fs_stats[-1]
                fs_utilization = float(fs['usage']) / float(fs['capacity'])
                self.publish_gauge(self, NAMESPACE + '.filesystem.usage_pct', fs_utilization, tags)

        if subcontainer.get("spec", {}).get("has_network"):
            net = stats['network']
            self.publish_rate(self, NAMESPACE + '.network_errors',
                              sum(float(net[x]) for x in NET_ERRORS),
                              tags)

        return tags

    def _publish_container_resources(self, container, kind, tags):
        """Publish the values of the `kind` ('limits' or 'requests') resources of a container spec.

        Each resource is sent as a '<NAMESPACE>.<resource>.<kind>' gauge whose
        value is the single number found in the value string; value strings
        holding no number or several are logged and skipped.
        """
        c_name = container.get('name')
        try:
            for resource, value_str in container['resources'][kind].iteritems():
                values = [float(s) for s in self._RESOURCE_VALUE_RE.findall(value_str)]
                if len(values) != 1:
                    self.log.warning("Error parsing %s value string: %s", kind, value_str)
                    continue
                self.publish_gauge(self, '{}.{}.{}'.format(NAMESPACE, resource, kind), values[0], tags)
        except (KeyError, AttributeError) as e:
            # not every container declares limits/requests
            self.log.debug("Unable to retrieve container %s for %s: %s", kind, c_name, e)
            self.log.debug("Container object for {}: {}".format(c_name, container))

    def _update_metrics(self, instance, pods_list):
        """Publish the container metrics, then the limits and requests of the pod containers.

        :param instance: check instance ('excluded_labels' and 'tags' are read).
        :param pods_list: pods payload; its 'items' are walked.
        :raises Exception: when no metrics could be retrieved.
        """
        metrics = self.kubeutil.retrieve_metrics()

        excluded_labels = instance.get('excluded_labels')
        kube_labels = self.kubeutil.extract_kube_labels(pods_list, excluded_keys=excluded_labels)

        if not metrics:
            # `metrics_cmd` is not set anywhere in this class: report the
            # metrics URL of the KubeUtil helper when it is missing
            metrics_source = getattr(self, 'metrics_cmd', None) or getattr(self.kubeutil, 'metrics_url', None)
            raise Exception('No metrics retrieved cmd=%s' % metrics_source)

        # container metrics from Cadvisor
        container_tags = {}
        for subcontainer in metrics:
            c_id = subcontainer.get('id')
            try:
                tags = self._update_container_metrics(instance, subcontainer, kube_labels)
                if c_id:
                    container_tags[c_id] = tags
                # also store tags for aliases
                for alias in subcontainer.get('aliases', []):
                    container_tags[alias] = tags
            except Exception as e:
                self.log.error("Unable to collect metrics for container: {0} ({1})".format(c_id, e))

        # container metrics from kubernetes API: limits and requests
        for pod in pods_list.get('items', []):
            try:
                containers = pod['spec']['containers']
                # container name -> container id, to find the tags stored above
                name2id = {}
                for cs in pod['status'].get('containerStatuses', []):
                    c_id = cs.get('containerID', '').split('//')[-1]
                    name = cs.get('name')
                    if name:
                        name2id[name] = c_id
            except KeyError:
                self.log.debug("Pod %s does not have containers specs, skipping...",
                               pod.get('metadata', {}).get('name'))
                continue

            for container in containers:
                c_name = container.get('name')
                _tags = container_tags.get(name2id.get(c_name), [])

                self._publish_container_resources(container, 'limits', _tags)
                self._publish_container_resources(container, 'requests', _tags)

        self._update_pods_metrics(instance, pods_list)
class SDDockerBackend(AbstractSDBackend):
    """Docker-based service discovery.

    Lists the containers known to the docker daemon, looks up configuration
    templates for their images in the config store and fills the template
    variables (host, port, tags) from `docker inspect` data, falling back to
    the kubelet API when running on kubernetes.
    """

    def __init__(self, agentConfig):
        """Set up the docker client, the optional kube client and the config store.

        If the config store client cannot be instantiated, the
        `sd_config_backend` entry of `agentConfig` is reset to None and the
        store is requested again, so that only auto-config is used.
        """
        self.docker_client = DockerUtil().client
        if is_k8s():
            self.kubeutil = KubeUtil()

        try:
            self.config_store = get_config_store(agentConfig=agentConfig)
        except Exception as e:
            log.error('Failed to instantiate the config store client. '
                      'Auto-config only will be used. %s' % str(e))
            agentConfig['sd_config_backend'] = None
            self.config_store = get_config_store(agentConfig=agentConfig)

        # maps a template variable prefix to the method computing its value
        self.VAR_MAPPING = {
            'host': self._get_host,
            'port': self._get_ports,
            'tags': self._get_additional_tags,
        }
        AbstractSDBackend.__init__(self, agentConfig)

    @staticmethod
    def _extract_image_name(image):
        """Return the short image name, without registry/repository prefix nor tag.

        The repository path is stripped before the tag so that a registry
        given with a port (`registry:5000/redis:3.0`) yields `redis` rather
        than `registry`.
        """
        return image.split('/')[-1].split(':')[0]

    def _get_host(self, container_inspect):
        """Extract the host IP from a docker inspect object, or the kubelet API.

        Returns None when no address is found in the inspect data and the
        agent is not running on kubernetes.
        """
        ip_addr = container_inspect.get('NetworkSettings', {}).get('IPAddress')
        if ip_addr:
            return ip_addr

        if not is_k8s():
            return

        # kubernetes case
        log.debug(
            "Didn't find the IP address for container %s (%s), using the kubernetes way." %
            (container_inspect.get('Id', '')[:12],
             container_inspect.get('Config', {}).get('Image', '')))
        pod_list = self.kubeutil.retrieve_pods_list().get('items', [])
        c_id = container_inspect.get('Id')
        for pod in pod_list:
            pod_ip = pod.get('status', {}).get('podIP')
            if pod_ip is None:
                continue
            c_statuses = pod.get('status', {}).get('containerStatuses', [])
            for status in c_statuses:
                # compare the container id with those of containers in the current pod
                if c_id == status.get('containerID', '').split('//')[-1]:
                    ip_addr = pod_ip
        return ip_addr

    def _get_ports(self, container_inspect):
        """Extract a list of available ports from a docker inspect object. Sort them numerically.

        Sources are tried in order: `NetworkSettings.Ports`, then the image's
        `Config.ExposedPorts`, then (on kubernetes) the pod spec. The
        fallbacks are used both when the first source is missing and when it
        is present but empty.
        """
        c_id = container_inspect.get('Id', '')
        try:
            ports = [p.split('/')[0]
                     for p in container_inspect['NetworkSettings']['Ports'].keys()]
        except (KeyError, AttributeError, TypeError):
            ports = []

        if not ports:
            log.debug(
                "Didn't find the port for container %s (%s), trying the kubernetes way." %
                (c_id[:12], container_inspect.get('Config', {}).get('Image', '')))
            # first we try to get it from the docker API
            # it works if the image has an EXPOSE instruction
            # (`or {}` guards against a missing/null Config or ExposedPorts)
            exposed = (container_inspect.get('Config') or {}).get('ExposedPorts') or {}
            ports = [p.split('/')[0] for p in exposed.keys()]

            # if it failed, try with the kubernetes API
            if not ports and is_k8s():
                # _get_kube_config returns None when no pod owns the container
                pod_status = self._get_kube_config(c_id, 'status') or {}
                c_name = None
                for co in pod_status.get('containerStatuses', []):
                    if co.get('containerID', '').split('//')[-1] == c_id:
                        c_name = co.get('name')
                        break
                pod_spec = self._get_kube_config(c_id, 'spec') or {}
                for co in pod_spec.get('containers', []):
                    if co.get('name') == c_name:
                        ports = [str(p.get('containerPort')) for p in co.get('ports', [])]
        return sorted(ports, key=int)

    def get_tags(self, c_inspect):
        """Extract useful tags from docker or platform APIs. These are collected by default.

        On kubernetes the tags are the pod labels, the replication controller
        (when the pod was created by one) and the namespace. Returns an empty
        list outside kubernetes or when the pod metadata cannot be fetched.
        """
        tags = []
        if is_k8s():
            pod_metadata = self._get_kube_config(c_inspect.get('Id'), 'metadata')

            if pod_metadata is None:
                log.warning("Failed to fetch pod metadata for container %s."
                            " Kubernetes tags may be missing." % c_inspect.get('Id', '')[:12])
                return []
            # get labels
            kube_labels = pod_metadata.get('labels', {})
            for label, value in kube_labels.iteritems():
                tags.append('%s:%s' % (label, value))

            # get replication controller
            created_by = json.loads(
                pod_metadata.get('annotations', {}).get('kubernetes.io/created-by', '{}'))
            reference = created_by.get('reference', {})
            if reference.get('kind') == 'ReplicationController':
                tags.append('kube_replication_controller:%s' % reference.get('name'))

            # get kubernetes namespace
            tags.append('kube_namespace:%s' % pod_metadata.get('namespace'))

        return tags

    def _get_additional_tags(self, container_inspect):
        """Return the `node_name` and `pod_name` tags of a container on kubernetes.

        Returns an empty list outside kubernetes, or when no pod owning the
        container is found (instead of failing on a None pod config).
        """
        tags = []
        if is_k8s():
            pod_metadata = self._get_kube_config(container_inspect.get('Id'), 'metadata')
            pod_spec = self._get_kube_config(container_inspect.get('Id'), 'spec')
            if pod_metadata is None or pod_spec is None:
                return tags
            tags.append('node_name:%s' % pod_spec.get('nodeName'))
            tags.append('pod_name:%s' % pod_metadata.get('name'))
        return tags

    def _get_kube_config(self, c_id, key):
        """Get a part of a pod config from the kubernetes API.

        Returns the `key` section of the first pod that has a container
        status matching `c_id`, or None when no pod matches.
        """
        pods = self.kubeutil.retrieve_pods_list().get('items', [])
        for pod in pods:
            c_statuses = pod.get('status', {}).get('containerStatuses', [])
            for status in c_statuses:
                if c_id == status.get('containerID', '').split('//')[-1]:
                    return pod.get(key, {})
        return None

    def get_configs(self):
        """Get the config for all docker containers running on the host.

        Returns a dict keyed by check name. Values are
        `(init_config, [instances])`, or `(source, (init_config, [instances]))`
        when config tracing is enabled. A failure on one container is logged
        and does not prevent the other containers from being processed.
        """
        configs = {}
        containers = [(
            self._extract_image_name(container.get('Image')),
            container.get('Id'), container.get('Labels')
        ) for container in self.docker_client.containers()]

        # used by the configcheck agent command to trace where check configs come from
        trace_config = self.agentConfig.get(TRACE_CONFIG, False)

        conflict_init_msg = 'Different versions of `init_config` found for check {0}. ' \
            'Keeping the first one found.'

        for image, cid, labels in containers:
            try:
                check_configs = self._get_check_configs(
                    cid, image, trace_config=trace_config) or []
                for conf in check_configs:
                    if trace_config and conf is not None:
                        source, conf = conf

                    check_name, init_config, instance = conf
                    # build instances list if needed
                    if configs.get(check_name) is None:
                        if trace_config:
                            configs[check_name] = (source, (init_config, [instance]))
                        else:
                            configs[check_name] = (init_config, [instance])
                    elif trace_config:
                        if configs[check_name][1][0] != init_config:
                            log.warning(conflict_init_msg.format(check_name))
                        configs[check_name][1][1].append(instance)
                    else:
                        if configs[check_name][0] != init_config:
                            log.warning(conflict_init_msg.format(check_name))
                        configs[check_name][1].append(instance)
            except Exception:
                log.exception('Building config for container %s based on image %s using service'
                              ' discovery failed, leaving it alone.' % (cid[:12], image))
        return configs

    def _get_check_configs(self, c_id, image, trace_config=False):
        """Retrieve configuration templates and fill them with data pulled from docker and tags.

        Returns None when the image has no template, otherwise a list of
        `(check_name, init_config, instance)` tuples, each wrapped as
        `(source, (...))` when `trace_config` is set.
        """
        inspect = self.docker_client.inspect_container(c_id)
        config_templates = self._get_config_templates(image, trace_config=trace_config)
        if not config_templates:
            log.debug('No config template for container %s with image %s. '
                      'It will be left unconfigured.' % (c_id[:12], image))
            return None

        check_configs = []
        tags = self.get_tags(inspect)
        for config_tpl in config_templates:
            if trace_config:
                source, config_tpl = config_tpl
            check_name, init_config_tpl, instance_tpl, variables = config_tpl

            # insert tags in instance_tpl and process values for template variables
            instance_tpl, var_values = self._fill_tpl(inspect, instance_tpl, variables, tags)

            tpl = self._render_template(init_config_tpl or {}, instance_tpl or {}, var_values)
            if tpl and len(tpl) == 2:
                # NOTE(review): _render_template is never handed `source`, so its
                # result is taken to be an (init_config, instance) pair in both
                # modes; the source unpacked above is re-attached here.
                init_config, instance = tpl
                if trace_config:
                    check_configs.append((source, (check_name, init_config, instance)))
                else:
                    check_configs.append((check_name, init_config, instance))

        return check_configs

    def _get_config_templates(self, image_name, trace_config=False):
        """Extract config templates for an image from a K/V store and returns it as a dict object.

        Returns a list of `(check_name, init_config_tpl, instance_tpl, variables)`
        tuples (wrapped as `(source, (...))` with `trace_config`), or None as
        soon as one template is missing, malformed or not valid JSON.
        """
        config_backend = self.agentConfig.get('sd_config_backend')
        templates = []
        if config_backend is None:
            auto_conf = True
            log.warning('No supported configuration backend was provided, using auto-config only.')
        else:
            auto_conf = False

        # format: [('image', {init_tpl}, {instance_tpl})] without trace_config
        # or      [(source, ('image', {init_tpl}, {instance_tpl}))] with trace_config
        raw_tpls = self.config_store.get_check_tpls(
            image_name, auto_conf=auto_conf, trace_config=trace_config)
        for tpl in raw_tpls:
            if trace_config and tpl is not None:
                # each template can come from either auto configuration or user-supplied templates
                source, tpl = tpl
            if tpl is not None and len(tpl) == 3:
                check_name, init_config_tpl, instance_tpl = tpl
            else:
                log.debug('No template was found for image %s, leaving it alone.' % image_name)
                return None
            try:
                # build a list of all variables to replace in the template
                variables = self.PLACEHOLDER_REGEX.findall(str(init_config_tpl)) + \
                    self.PLACEHOLDER_REGEX.findall(str(instance_tpl))
                variables = [var.strip('%') for var in variables]
                if not isinstance(init_config_tpl, dict):
                    init_config_tpl = json.loads(init_config_tpl or '{}')
                if not isinstance(instance_tpl, dict):
                    instance_tpl = json.loads(instance_tpl or '{}')
            except ValueError:
                # JSON decoding errors are ValueError (or a subclass of it) in
                # every json implementation; some do not expose JSONDecodeError
                log.exception('Failed to decode the JSON template fetched for check {0}. Its configuration'
                              ' by service discovery failed for {1}.'.format(check_name, image_name))
                return None

            if trace_config:
                templates.append((source, (check_name, init_config_tpl, instance_tpl, variables)))
            else:
                templates.append((check_name, init_config_tpl, instance_tpl, variables))

        return templates

    def _fill_tpl(self, inspect, instance_tpl, variables, tags=None):
        """Add container tags to instance templates and build a dict from
        template variable names and their values.

        `tags` is not modified: the merge with the template's own tags is
        done on a copy, so a list shared between several templates does not
        accumulate tags from one template to the next.
        Returns the `(instance_tpl, var_values)` pair.
        """
        var_values = {}

        # add default tags to the instance
        if tags:
            merged_tags = list(tags)
            merged_tags += instance_tpl.get('tags', [])
            instance_tpl['tags'] = list(set(merged_tags))

        for v in variables:
            # variables can be suffixed with an index in case a list is found
            var_parts = v.split('_')
            if var_parts[0] in self.VAR_MAPPING:
                try:
                    res = self.VAR_MAPPING[var_parts[0]](inspect)
                    if not res:
                        raise ValueError("Invalid value for variable %s." % var_parts[0])
                    # if an index is found in the variable, use it to select a value
                    if len(var_parts) > 1 and isinstance(res, list) and int(var_parts[-1]) < len(res):
                        var_values[v] = res[int(var_parts[-1])]
                    # if no valid index was found but we have a list, return the last element
                    elif isinstance(res, list):
                        var_values[v] = res[-1]
                    else:
                        var_values[v] = res
                except Exception as ex:
                    log.error("Could not find a value for the template variable %s: %s" % (v, str(ex)))
            else:
                log.error("No method was found to interpolate template variable %s." % v)

        return instance_tpl, var_values
class Kubernetes(AgentCheck):
    """ Collect metrics and events from kubelet """

    pod_names_by_container = {}

    def __init__(self, name, init_config, agentConfig, instances=None):
        """Create the check; at most one instance may be configured.

        Raises an Exception when several instances are configured or when
        KubeUtil could not determine the kubelet host.
        """
        if instances is not None and len(instances) > 1:
            raise Exception('Kubernetes check only supports one configured instance.')

        AgentCheck.__init__(self, name, init_config, agentConfig, instances)

        self.kubeutil = KubeUtil()
        if not self.kubeutil.host:
            raise Exception('Unable to get default router and host parameter is not set')

    def _perform_kubelet_checks(self, url):
        """Query the kubelet health endpoint and emit one service check per reported line.

        Each `[<status>]<name> ...` line yields `<base>.<name>` as OK when the
        status is `+`, CRITICAL otherwise. The base service check is OK only
        when the request succeeded and every reported check was OK.
        """
        service_check_base = NAMESPACE + '.kubelet.check'
        is_ok = True
        try:
            r = requests.get(url)
            for line in r.iter_lines():

                # avoid noise; this check is expected to fail since we override the container hostname
                if line.find('hostname') != -1:
                    continue

                matches = re.match(r'\[(.)\]([^\s]+) (.*)?', line)
                if not matches:
                    continue

                service_check_name = service_check_base + '.' + matches.group(2)
                status = matches.group(1)
                if status == '+':
                    self.service_check(service_check_name, AgentCheck.OK)
                else:
                    self.service_check(service_check_name, AgentCheck.CRITICAL)
                    is_ok = False

        except Exception as e:
            self.log.warning('kubelet check %s failed: %s' % (url, str(e)))
            self.service_check(service_check_base, AgentCheck.CRITICAL,
                               message='Kubelet check %s failed: %s' % (url, str(e)))

        else:
            if is_ok:
                self.service_check(service_check_base, AgentCheck.OK)
            else:
                self.service_check(service_check_base, AgentCheck.CRITICAL)

    def check(self, instance):
        """Read the instance options, run the kubelet health checks and collect metrics."""
        self.max_depth = instance.get('max_depth', DEFAULT_MAX_DEPTH)
        enabled_gauges = instance.get('enabled_gauges', DEFAULT_ENABLED_GAUGES)
        self.enabled_gauges = ["{0}.{1}".format(NAMESPACE, x) for x in enabled_gauges]
        enabled_rates = instance.get('enabled_rates', DEFAULT_ENABLED_RATES)
        self.enabled_rates = ["{0}.{1}".format(NAMESPACE, x) for x in enabled_rates]

        self.publish_aliases = _is_affirmative(instance.get('publish_aliases', DEFAULT_PUBLISH_ALIASES))
        self.use_histogram = _is_affirmative(instance.get('use_histogram', DEFAULT_USE_HISTOGRAM))
        # these are plain functions stored on the instance, hence the explicit
        # `self` passed as first argument wherever they are called
        self.publish_rate = FUNC_MAP[RATE][self.use_histogram]
        self.publish_gauge = FUNC_MAP[GAUGE][self.use_histogram]

        # kubelet health checks
        self._perform_kubelet_checks(self.kubeutil.kube_health_url)

        # kubelet metrics
        self._update_metrics(instance)

    def _publish_raw_metrics(self, metric, dat, tags, depth=0):
        """Recursively walk `dat` and publish the numeric leaves matching an enabled pattern.

        Dict keys are appended (lowercased) to the metric name; for a list
        only the last element is considered. Recursion stops with a warning
        once `max_depth` is reached. Rates take precedence over gauges.
        """
        if depth >= self.max_depth:
            self.log.warning('Reached max depth on metric=%s' % metric)
            return

        if isinstance(dat, numbers.Number):
            if self.enabled_rates and any(fnmatch(metric, pat) for pat in self.enabled_rates):
                self.publish_rate(self, metric, float(dat), tags)
            elif self.enabled_gauges and any(fnmatch(metric, pat) for pat in self.enabled_gauges):
                self.publish_gauge(self, metric, float(dat), tags)

        elif isinstance(dat, dict):
            for k, v in dat.iteritems():
                self._publish_raw_metrics(metric + '.%s' % k.lower(), v, tags, depth + 1)

        elif isinstance(dat, list):
            # an empty list carries no sample: skip it rather than fail on dat[-1]
            if dat:
                self._publish_raw_metrics(metric, dat[-1], tags, depth + 1)

    @staticmethod
    def _shorten_name(name):
        """Truncate any run of 64+ hex characters in `name` to its first 12 characters."""
        # shorten docker image id
        return re.sub('([0-9a-fA-F]{64,})', lambda x: x.group(1)[0:12], name)

    def _get_post_1_2_tags(self, cont_labels, subcontainer, kube_labels):
        """Build pod tags for Kubernetes >= 1.2, where namespace and pod name are separate labels."""
        tags = []

        pod_name = cont_labels[KubeUtil.POD_NAME_LABEL]
        pod_namespace = cont_labels[KubeUtil.NAMESPACE_LABEL]
        tags.append(u"pod_name:{0}/{1}".format(pod_namespace, pod_name))
        tags.append(u"kube_namespace:{0}".format(pod_namespace))

        kube_labels_key = "{0}/{1}".format(pod_namespace, pod_name)

        pod_labels = kube_labels.get(kube_labels_key)
        if pod_labels:
            tags += list(pod_labels)

        if "-" in pod_name:
            # the controller name is the pod name minus its last dash-separated part
            replication_controller = "-".join(pod_name.split("-")[:-1])
            tags.append("kube_replication_controller:%s" % replication_controller)

        if self.publish_aliases and subcontainer.get("aliases"):
            for alias in subcontainer['aliases'][1:]:
                # we don't add the first alias as it will be the container_name
                tags.append('container_alias:%s' % (self._shorten_name(alias)))

        return tags

    def _get_pre_1_2_tags(self, cont_labels, subcontainer, kube_labels):
        """Build pod tags for Kubernetes <= 1.1, where the pod name label may embed the namespace."""
        tags = []

        pod_name = cont_labels[KubeUtil.POD_NAME_LABEL]
        tags.append(u"pod_name:{0}".format(pod_name))

        pod_labels = kube_labels.get(pod_name)
        if pod_labels:
            tags.extend(list(pod_labels))

        if "-" in pod_name:
            replication_controller = "-".join(pod_name.split("-")[:-1])
            if "/" in replication_controller:
                namespace, replication_controller = replication_controller.split("/", 1)
                tags.append(u"kube_namespace:%s" % namespace)

            tags.append(u"kube_replication_controller:%s" % replication_controller)

        if self.publish_aliases and subcontainer.get("aliases"):
            for alias in subcontainer['aliases'][1:]:
                # we don't add the first alias as it will be the container_name
                tags.append(u"container_alias:%s" % (self._shorten_name(alias)))

        return tags

    def _update_container_metrics(self, instance, subcontainer, kube_labels):
        """Publish the latest stats of one subcontainer, tagged with its pod metadata."""
        tags = list(instance.get('tags', []))  # add support for custom tags

        if len(subcontainer.get('aliases', [])) >= 1:
            # The first alias seems to always match the docker container name
            container_name = subcontainer['aliases'][0]
        else:
            # We default to the container id
            container_name = subcontainer['name']

        tags.append('container_name:%s' % container_name)

        try:
            cont_labels = subcontainer['spec']['labels']
        except KeyError:
            self.log.debug("Subcontainer, doesn't have any labels")
            cont_labels = {}

        # Collect pod names, namespaces, rc...
        if KubeUtil.NAMESPACE_LABEL in cont_labels and KubeUtil.POD_NAME_LABEL in cont_labels:
            # Kubernetes >= 1.2
            tags += self._get_post_1_2_tags(cont_labels, subcontainer, kube_labels)

        elif KubeUtil.POD_NAME_LABEL in cont_labels:
            # Kubernetes <= 1.1
            tags += self._get_pre_1_2_tags(cont_labels, subcontainer, kube_labels)

        else:
            # Those are containers that are not part of a pod.
            # They are top aggregate views and don't have the previous metadata.
            tags.append("pod_name:no_pod")

        stats = subcontainer['stats'][-1]  # take the latest
        self._publish_raw_metrics(NAMESPACE, stats, tags)

        if subcontainer.get("spec", {}).get("has_filesystem"):
            fs = stats['filesystem'][-1]
            capacity = float(fs['capacity'])
            # a zero capacity has no meaningful usage ratio; skipping it also
            # keeps the network metrics below from being lost to a ZeroDivisionError
            if capacity:
                fs_utilization = float(fs['usage']) / capacity
                self.publish_gauge(self, NAMESPACE + '.filesystem.usage_pct', fs_utilization, tags)

        if subcontainer.get("spec", {}).get("has_network"):
            net = stats['network']
            self.publish_rate(self, NAMESPACE + '.network_errors',
                              sum(float(net[x]) for x in NET_ERRORS),
                              tags)

    def _retrieve_metrics(self, url):
        """Fetch and decode the JSON document served at `url`."""
        return retrieve_json(url)

    def _update_metrics(self, instance):
        """Collect per-container metrics, then per-controller pod counts.

        Raises an Exception when no metrics could be retrieved. A failure on
        a single container is logged and does not stop the collection.
        """
        pods_list = self.kubeutil.retrieve_pods_list()
        metrics = self._retrieve_metrics(self.kubeutil.metrics_url)

        excluded_labels = instance.get('excluded_labels')
        kube_labels = self.kubeutil.extract_kube_labels(pods_list, excluded_keys=excluded_labels)

        if not metrics:
            # report the URL that was actually queried just above
            raise Exception('No metrics retrieved cmd=%s' % self.kubeutil.metrics_url)

        for subcontainer in metrics:
            try:
                self._update_container_metrics(instance, subcontainer, kube_labels)
            except Exception as e:
                self.log.error("Unable to collect metrics for container: {0} ({1})".format(
                    subcontainer.get('name'), e))

        self._update_pods_metrics(instance, pods_list)

    def _update_pods_metrics(self, instance, pods):
        """Publish the number of pods per supported controller as `<NAMESPACE>.pods.running`.

        Pods without a usable `kubernetes.io/created-by` annotation (missing,
        null or not valid JSON) are ignored.
        """
        supported_kinds = [
            "DaemonSet",
            "Deployment",
            "Job",
            "ReplicationController",
            "ReplicaSet",
        ]

        controllers_map = defaultdict(int)
        for pod in pods['items']:
            try:
                created_by = json.loads(pod['metadata']['annotations']['kubernetes.io/created-by'])
                kind = created_by['reference']['kind']
                if kind in supported_kinds:
                    controllers_map[created_by['reference']['name']] += 1
            except (KeyError, TypeError, ValueError):
                continue

        tags = instance.get('tags', [])
        for ctrl, pod_count in controllers_map.iteritems():
            _tags = tags[:]  # copy base tags
            _tags.append('kube_replication_controller:{0}'.format(ctrl))
            self.publish_gauge(self, NAMESPACE + '.pods.running', pod_count, _tags)
def test_get_auth_token(self):
    """get_auth_token returns None for an unreadable path and a value for a readable file.

    KubeUtil.AUTH_TOKEN_PATH is a class attribute, so it is restored at the
    end to avoid leaking the override into other tests.
    """
    original_path = KubeUtil.AUTH_TOKEN_PATH
    try:
        KubeUtil.AUTH_TOKEN_PATH = "/foo/bar"
        self.assertIsNone(KubeUtil.get_auth_token())
        KubeUtil.AUTH_TOKEN_PATH = Fixtures.file("events.json")  # any file could do the trick
        self.assertIsNotNone(KubeUtil.get_auth_token())
    finally:
        KubeUtil.AUTH_TOKEN_PATH = original_path
def test_get_auth_token(self):
    """get_auth_token returns None for an unreadable path and a value for a readable file."""
    # AUTH_TOKEN_PATH lives on the class: register its restoration up front so
    # the override below cannot leak into other tests, even on failure
    self.addCleanup(setattr, KubeUtil, 'AUTH_TOKEN_PATH', KubeUtil.AUTH_TOKEN_PATH)

    KubeUtil.AUTH_TOKEN_PATH = '/foo/bar'
    self.assertIsNone(KubeUtil.get_auth_token())

    KubeUtil.AUTH_TOKEN_PATH = Fixtures.file('events.json')  # any file could do the trick
    self.assertIsNotNone(KubeUtil.get_auth_token())
class DockerDaemon(AgentCheck): """Collect metrics and events from Docker API and cgroups.""" def __init__(self, name, init_config, agentConfig, instances=None): if instances is not None and len(instances) > 1: raise Exception("Docker check only supports one configured instance.") AgentCheck.__init__(self, name, init_config, agentConfig, instances=instances) self.init_success = False self._service_discovery = agentConfig.get('service_discovery') and \ agentConfig.get('service_discovery_backend') == 'docker' self.init() self._custom_cgroups = _is_affirmative(init_config.get('custom_cgroups', False)) def is_k8s(self): return 'KUBERNETES_PORT' in os.environ def init(self): try: instance = self.instances[0] # if service discovery is enabled dockerutil will need a reference to the config store if self._service_discovery: self.docker_util = DockerUtil( agentConfig=self.agentConfig, config_store=get_config_store(self.agentConfig) ) else: self.docker_util = DockerUtil() self.docker_client = self.docker_util.client self.docker_gateway = DockerUtil.get_gateway() if self.is_k8s(): self.kubeutil = KubeUtil() # We configure the check with the right cgroup settings for this host # Just needs to be done once self._mountpoints = self.docker_util.get_mountpoints(CGROUP_METRICS) self.cgroup_listing_retries = 0 self._latest_size_query = 0 self._filtered_containers = set() self._disable_net_metrics = False # Set tagging options self.custom_tags = instance.get("tags", []) self.collect_labels_as_tags = instance.get("collect_labels_as_tags", []) self.kube_labels = {} self.use_histogram = _is_affirmative(instance.get('use_histogram', False)) performance_tags = instance.get("performance_tags", DEFAULT_PERFORMANCE_TAGS) self.tag_names = { CONTAINER: instance.get("container_tags", DEFAULT_CONTAINER_TAGS), PERFORMANCE: performance_tags, IMAGE: instance.get('image_tags', DEFAULT_IMAGE_TAGS) } # Set filtering settings if not instance.get("exclude"): self._filtering_enabled = False if 
instance.get("include"): self.log.warning("You must specify an exclude section to enable filtering") else: self._filtering_enabled = True include = instance.get("include", []) exclude = instance.get("exclude", []) self._exclude_patterns, self._include_patterns, _filtered_tag_names = get_filters(include, exclude) self.tag_names[FILTERED] = _filtered_tag_names # Other options self.collect_image_stats = _is_affirmative(instance.get('collect_images_stats', False)) self.collect_container_size = _is_affirmative(instance.get('collect_container_size', False)) self.collect_events = _is_affirmative(instance.get('collect_events', True)) self.collect_image_size = _is_affirmative(instance.get('collect_image_size', False)) self.collect_disk_stats = _is_affirmative(instance.get('collect_disk_stats', False)) self.collect_ecs_tags = _is_affirmative(instance.get('ecs_tags', True)) and Platform.is_ecs_instance() self.ecs_tags = {} except Exception as e: self.log.critical(e) self.warning("Initialization failed. Will retry at next iteration") else: self.init_success = True def check(self, instance): """Run the Docker check for one instance.""" if not self.init_success: # Initialization can fail if cgroups are not ready. So we retry if needed # https://github.com/DataDog/dd-agent/issues/1896 self.init() if not self.init_success: # Initialization failed, will try later return # Report image metrics if self.collect_image_stats: self._count_and_weigh_images() if self.collect_ecs_tags: self.refresh_ecs_tags() if self.is_k8s(): try: self.kube_labels = self.kubeutil.get_kube_labels() except Exception as e: self.log.warning('Could not retrieve kubernetes labels: %s' % str(e)) self.kube_labels = {} # containers running with custom cgroups? 
custom_cgroups = _is_affirmative(instance.get('custom_cgroups', self._custom_cgroups)) # Get the list of containers and the index of their names containers_by_id = self._get_and_count_containers(custom_cgroups) containers_by_id = self._crawl_container_pids(containers_by_id) # Send events from Docker API if self.collect_events or self._service_discovery: self._process_events(containers_by_id) # Report performance container metrics (cpu, mem, net, io) self._report_performance_metrics(containers_by_id) if self.collect_container_size: self._report_container_size(containers_by_id) # Collect disk stats from Docker info command if self.collect_disk_stats: self._report_disk_stats() def _count_and_weigh_images(self): try: tags = self._get_tags() active_images = self.docker_client.images(all=False) active_images_len = len(active_images) all_images_len = len(self.docker_client.images(quiet=True, all=True)) self.gauge("docker.images.available", active_images_len, tags=tags) self.gauge("docker.images.intermediate", (all_images_len - active_images_len), tags=tags) if self.collect_image_size: self._report_image_size(active_images) except Exception as e: # It's not an important metric, keep going if it fails self.warning("Failed to count Docker images. 
Exception: {0}".format(e)) def _get_and_count_containers(self, custom_cgroups=False): """List all the containers from the API, filter and count them.""" # Querying the size of containers is slow, we don't do it at each run must_query_size = self.collect_container_size and self._latest_size_query == 0 self._latest_size_query = (self._latest_size_query + 1) % SIZE_REFRESH_RATE running_containers_count = Counter() all_containers_count = Counter() try: containers = self.docker_client.containers(all=True, size=must_query_size) except Exception as e: message = "Unable to list Docker containers: {0}".format(e) self.service_check(SERVICE_CHECK_NAME, AgentCheck.CRITICAL, message=message) raise Exception(message) else: self.service_check(SERVICE_CHECK_NAME, AgentCheck.OK) # Filter containers according to the exclude/include rules self._filter_containers(containers) containers_by_id = {} for container in containers: container_name = DockerUtil.container_name_extractor(container)[0] container_status_tags = self._get_tags(container, CONTAINER) all_containers_count[tuple(sorted(container_status_tags))] += 1 if self._is_container_running(container): running_containers_count[tuple(sorted(container_status_tags))] += 1 # Check if the container is included/excluded via its tags if self._is_container_excluded(container): self.log.debug("Container {0} is excluded".format(container_name)) continue containers_by_id[container['Id']] = container # grab pid via API if custom cgroups - otherwise we won't find process when # crawling for pids. 
if custom_cgroups: try: inspect_dict = self.docker_client.inspect_container(container_name) container['_pid'] = inspect_dict['State']['Pid'] except Exception as e: self.log.debug("Unable to inspect Docker container: %s", e) for tags, count in running_containers_count.iteritems(): self.gauge("docker.containers.running", count, tags=list(tags)) for tags, count in all_containers_count.iteritems(): stopped_count = count - running_containers_count[tags] self.gauge("docker.containers.stopped", stopped_count, tags=list(tags)) return containers_by_id def _is_container_running(self, container): """Tell if a container is running, according to its status. There is no "nice" API field to figure it out. We just look at the "Status" field, knowing how it is generated. See: https://github.com/docker/docker/blob/v1.6.2/daemon/state.go#L35 """ return container["Status"].startswith("Up") or container["Status"].startswith("Restarting") def _get_tags(self, entity=None, tag_type=None): """Generate the tags for a given entity (container or image) according to a list of tag names.""" # Start with custom tags tags = list(self.custom_tags) # Collect pod names as tags on kubernetes if self.is_k8s() and KubeUtil.POD_NAME_LABEL not in self.collect_labels_as_tags: self.collect_labels_as_tags.append(KubeUtil.POD_NAME_LABEL) if entity is not None: pod_name = None # Get labels as tags labels = entity.get("Labels") if labels is not None: for k in self.collect_labels_as_tags: if k in labels: v = labels[k] if k == KubeUtil.POD_NAME_LABEL and self.is_k8s(): pod_name = v k = "pod_name" if "-" in pod_name: replication_controller = "-".join(pod_name.split("-")[:-1]) if "/" in replication_controller: # k8s <= 1.1 namespace, replication_controller = replication_controller.split("/", 1) elif KubeUtil.NAMESPACE_LABEL in labels: # k8s >= 1.2 namespace = labels[KubeUtil.NAMESPACE_LABEL] pod_name = "{0}/{1}".format(namespace, pod_name) tags.append("kube_namespace:%s" % namespace) 
tags.append("kube_replication_controller:%s" % replication_controller) tags.append("pod_name:%s" % pod_name) elif not v: tags.append(k) else: tags.append("%s:%s" % (k,v)) if k == KubeUtil.POD_NAME_LABEL and self.is_k8s() and k not in labels: tags.append("pod_name:no_pod") # Get entity specific tags if tag_type is not None: tag_names = self.tag_names[tag_type] for tag_name in tag_names: tag_value = self._extract_tag_value(entity, tag_name) if tag_value is not None: for t in tag_value: tags.append('%s:%s' % (tag_name, str(t).strip())) # Add ECS tags if self.collect_ecs_tags: entity_id = entity.get("Id") if entity_id in self.ecs_tags: ecs_tags = self.ecs_tags[entity_id] tags.extend(ecs_tags) # Add kube labels if self.is_k8s(): kube_tags = self.kube_labels.get(pod_name) if kube_tags: tags.extend(list(kube_tags)) return tags def _extract_tag_value(self, entity, tag_name): """Extra tag information from the API result (containers or images). Cache extracted tags inside the entity object. """ if tag_name not in TAG_EXTRACTORS: self.warning("{0} isn't a supported tag".format(tag_name)) return # Check for already extracted tags if "_tag_values" not in entity: entity["_tag_values"] = {} if tag_name not in entity["_tag_values"]: entity["_tag_values"][tag_name] = TAG_EXTRACTORS[tag_name](entity) return entity["_tag_values"][tag_name] def refresh_ecs_tags(self): ecs_config = self.docker_client.inspect_container('ecs-agent') ip = ecs_config.get('NetworkSettings', {}).get('IPAddress') ports = ecs_config.get('NetworkSettings', {}).get('Ports') port = ports.keys()[0].split('/')[0] if ports else None if not ip: port = ECS_INTROSPECT_DEFAULT_PORT if DockerUtil.is_dockerized() and self.docker_gateway(): ip = self.docker_gateway else: ip = "localhost" ecs_tags = {} try: if ip and port: tasks = requests.get('http://%s:%s/v1/tasks' % (ip, port)).json() for task in tasks.get('Tasks', []): for container in task.get('Containers', []): tags = ['task_name:%s' % task['Family'], 
'task_version:%s' % task['Version']] ecs_tags[container['DockerId']] = tags except (requests.exceptions.HTTPError, requests.exceptions.HTTPError) as e: self.log.warning("Unable to collect ECS task names: %s" % e) self.ecs_tags = ecs_tags def _filter_containers(self, containers): if not self._filtering_enabled: return self._filtered_containers = set() for container in containers: container_tags = self._get_tags(container, FILTERED) if self._are_tags_filtered(container_tags): container_name = DockerUtil.container_name_extractor(container)[0] self._filtered_containers.add(container_name) self.log.debug("Container {0} is filtered".format(container_name)) def _are_tags_filtered(self, tags): if self._tags_match_patterns(tags, self._exclude_patterns): if self._tags_match_patterns(tags, self._include_patterns): return False return True return False def _tags_match_patterns(self, tags, filters): for rule in filters: for tag in tags: if re.match(rule, tag): return True return False def _is_container_excluded(self, container): """Check if a container is excluded according to the filter rules. Requires _filter_containers to run first. 
""" container_name = DockerUtil.container_name_extractor(container)[0] return container_name in self._filtered_containers def _report_container_size(self, containers_by_id): for container in containers_by_id.itervalues(): if self._is_container_excluded(container): continue tags = self._get_tags(container, PERFORMANCE) m_func = FUNC_MAP[GAUGE][self.use_histogram] if "SizeRw" in container: m_func(self, 'docker.container.size_rw', container['SizeRw'], tags=tags) if "SizeRootFs" in container: m_func( self, 'docker.container.size_rootfs', container['SizeRootFs'], tags=tags) def _report_image_size(self, images): for image in images: tags = self._get_tags(image, IMAGE) if 'VirtualSize' in image: self.gauge('docker.image.virtual_size', image['VirtualSize'], tags=tags) if 'Size' in image: self.gauge('docker.image.size', image['Size'], tags=tags) # Performance metrics def _report_performance_metrics(self, containers_by_id): containers_without_proc_root = [] for container in containers_by_id.itervalues(): if self._is_container_excluded(container) or not self._is_container_running(container): continue tags = self._get_tags(container, PERFORMANCE) self._report_cgroup_metrics(container, tags) if "_proc_root" not in container: containers_without_proc_root.append(DockerUtil.container_name_extractor(container)[0]) continue self._report_net_metrics(container, tags) if containers_without_proc_root: message = "Couldn't find pid directory for containers: {0}. They'll be missing network metrics".format( ", ".join(containers_without_proc_root)) if not self.is_k8s(): self.warning(message) else: # On kubernetes, this is kind of expected. 
Network metrics will be collected by the kubernetes integration anyway self.log.debug(message) def _report_cgroup_metrics(self, container, tags): try: for cgroup in CGROUP_METRICS: stat_file = self._get_cgroup_from_proc(cgroup["cgroup"], container['_pid'], cgroup['file']) stats = self._parse_cgroup_file(stat_file) if stats: for key, (dd_key, metric_func) in cgroup['metrics'].iteritems(): metric_func = FUNC_MAP[metric_func][self.use_histogram] if key in stats: metric_func(self, dd_key, int(stats[key]), tags=tags) # Computed metrics for mname, (key_list, fct, metric_func) in cgroup.get('to_compute', {}).iteritems(): values = [stats[key] for key in key_list if key in stats] if len(values) != len(key_list): self.log.debug("Couldn't compute {0}, some keys were missing.".format(mname)) continue value = fct(*values) metric_func = FUNC_MAP[metric_func][self.use_histogram] if value is not None: metric_func(self, mname, value, tags=tags) except MountException as ex: if self.cgroup_listing_retries > MAX_CGROUP_LISTING_RETRIES: raise ex else: self.warning("Couldn't find the cgroup files. Skipping the CGROUP_METRICS for now." "Will retry {0} times before failing.".format(MAX_CGROUP_LISTING_RETRIES - self.cgroup_listing_retries)) self.cgroup_listing_retries += 1 else: self.cgroup_listing_retries = 0 def _report_net_metrics(self, container, tags): """Find container network metrics by looking at /proc/$PID/net/dev of the container process.""" if self._disable_net_metrics: self.log.debug("Network metrics are disabled. 
Skipping") return proc_net_file = os.path.join(container['_proc_root'], 'net/dev') try: with open(proc_net_file, 'r') as fp: lines = fp.readlines() """Two first lines are headers: Inter-| Receive | Transmit face |bytes packets errs drop fifo frame compressed multicast|bytes packets errs drop fifo colls carrier compressed """ for l in lines[2:]: cols = l.split(':', 1) interface_name = str(cols[0]).strip() if interface_name == 'eth0': x = cols[1].split() m_func = FUNC_MAP[RATE][self.use_histogram] m_func(self, "docker.net.bytes_rcvd", long(x[0]), tags) m_func(self, "docker.net.bytes_sent", long(x[8]), tags) break except Exception as e: # It is possible that the container got stopped between the API call and now self.warning("Failed to report IO metrics from file {0}. Exception: {1}".format(proc_net_file, e)) def _process_events(self, containers_by_id): if self.collect_events is False: # Crawl events for service discovery only self._get_events() return try: api_events = self._get_events() aggregated_events = self._pre_aggregate_events(api_events, containers_by_id) events = self._format_events(aggregated_events, containers_by_id) except (socket.timeout, urllib2.URLError): self.warning('Timeout when collecting events. Events will be missing.') return except Exception as e: self.warning("Unexpected exception when collecting events: {0}. " "Events will be missing".format(e)) return for ev in events: self.log.debug("Creating event: %s" % ev['msg_title']) self.event(ev) def _get_events(self): """Get the list of events.""" events, conf_reload_set = self.docker_util.get_events() if conf_reload_set and self._service_discovery: get_sd_backend(self.agentConfig).reload_check_configs = conf_reload_set return events def _pre_aggregate_events(self, api_events, containers_by_id): # Aggregate events, one per image. Put newer events first. 
events = defaultdict(deque) for event in api_events: # Skip events related to filtered containers container = containers_by_id.get(event.get('id')) if container is not None and self._is_container_excluded(container): self.log.debug("Excluded event: container {0} status changed to {1}".format( event['id'], event['status'])) continue # from may be missing (for network events for example) if 'from' in event: events[event['from']].appendleft(event) return events def _format_events(self, aggregated_events, containers_by_id): events = [] for image_name, event_group in aggregated_events.iteritems(): max_timestamp = 0 status = defaultdict(int) status_change = [] container_tags = set() for event in event_group: max_timestamp = max(max_timestamp, int(event['time'])) status[event['status']] += 1 container_name = event['id'][:11] if event['id'] in containers_by_id: cont = containers_by_id[event['id']] container_name = DockerUtil.container_name_extractor(cont)[0] container_tags.update(self._get_tags(cont, PERFORMANCE)) container_tags.add('container_name:%s' % container_name) status_change.append([container_name, event['status']]) status_text = ", ".join(["%d %s" % (count, st) for st, count in status.iteritems()]) msg_title = "%s %s on %s" % (image_name, status_text, self.hostname) msg_body = ( "%%%\n" "{image_name} {status} on {hostname}\n" "```\n{status_changes}\n```\n" "%%%" ).format( image_name=image_name, status=status_text, hostname=self.hostname, status_changes="\n".join( ["%s \t%s" % (change[1].upper(), change[0]) for change in status_change]) ) events.append({ 'timestamp': max_timestamp, 'host': self.hostname, 'event_type': EVENT_TYPE, 'msg_title': msg_title, 'msg_text': msg_body, 'source_type_name': EVENT_TYPE, 'event_object': 'docker:%s' % image_name, 'tags': list(container_tags) }) return events def _report_disk_stats(self): """Report metrics about the volume space usage""" stats = { 'docker.data.used': None, 'docker.data.total': None, 'docker.data.free': None, 
'docker.metadata.used': None, 'docker.metadata.total': None, 'docker.metadata.free': None # these two are calculated by _calc_percent_disk_stats # 'docker.data.percent': None, # 'docker.metadata.percent': None } info = self.docker_client.info() driver_status = info.get('DriverStatus', []) if not driver_status: self.log.warning('Disk metrics collection is enabled but docker info did not' ' report any. Your storage driver might not support them, skipping.') return for metric in driver_status: # only consider metrics about disk space if len(metric) == 2 and 'Space' in metric[0]: # identify Data and Metadata metrics mtype = 'data' if 'Metadata' in metric[0]: mtype = 'metadata' if 'Used' in metric[0]: stats['docker.{0}.used'.format(mtype)] = metric[1] elif 'Space Total' in metric[0]: stats['docker.{0}.total'.format(mtype)] = metric[1] elif 'Space Available' in metric[0]: stats['docker.{0}.free'.format(mtype)] = metric[1] stats = self._format_disk_metrics(stats) stats.update(self._calc_percent_disk_stats(stats)) tags = self._get_tags() for name, val in stats.iteritems(): if val is not None: self.gauge(name, val, tags) def _format_disk_metrics(self, metrics): """Cast the disk stats to float and convert them to bytes""" for name, raw_val in metrics.iteritems(): if raw_val: val, unit = raw_val.split(' ') # by default some are uppercased others lowercased. That's error prone. unit = unit.lower() try: val = int(float(val) * UNIT_MAP[unit]) metrics[name] = val except KeyError: self.log.error('Unrecognized unit %s for disk metric %s. Dropping it.' 
% (unit, name)) metrics[name] = None return metrics def _calc_percent_disk_stats(self, stats): """Calculate a percentage of used disk space for data and metadata""" mtypes = ['data', 'metadata'] percs = {} for mtype in mtypes: used = stats.get('docker.{0}.used'.format(mtype)) total = stats.get('docker.{0}.total'.format(mtype)) free = stats.get('docker.{0}.free'.format(mtype)) if used and total and free and ceil(total) < free + used: self.log.debug('used, free, and total disk metrics may be wrong, ' 'used: %s, free: %s, total: %s', used, free, total) total = used + free try: if isinstance(used, int): percs['docker.{0}.percent'.format(mtype)] = round(100 * float(used) / float(total), 2) elif isinstance(free, int): percs['docker.{0}.percent'.format(mtype)] = round(100 * (1.0 - (float(free) / float(total))), 2) except ZeroDivisionError: self.log.error('docker.{0}.total is 0, calculating docker.{1}.percent' ' is not possible.'.format(mtype, mtype)) return percs # Cgroups def _get_cgroup_from_proc(self, cgroup, pid, filename): """Find a specific cgroup file, containing metrics to extract.""" params = { "file": filename, } return DockerUtil.find_cgroup_from_proc(self._mountpoints, pid, cgroup, self.docker_util._docker_root) % (params) def _parse_cgroup_file(self, stat_file): """Parse a cgroup pseudo file for key/values.""" self.log.debug("Opening cgroup file: %s" % stat_file) try: with open(stat_file, 'r') as fp: if 'blkio' in stat_file: return self._parse_blkio_metrics(fp.read().splitlines()) else: return dict(map(lambda x: x.split(' ', 1), fp.read().splitlines())) except IOError: # It is possible that the container got stopped between the API call and now self.log.info("Can't open %s. Metrics for this container are skipped." 
% stat_file) def _parse_blkio_metrics(self, stats): """Parse the blkio metrics.""" metrics = { 'io_read': 0, 'io_write': 0, } for line in stats: if 'Read' in line: metrics['io_read'] += int(line.split()[2]) if 'Write' in line: metrics['io_write'] += int(line.split()[2]) return metrics # proc files def _crawl_container_pids(self, container_dict): """Crawl `/proc` to find container PIDs and add them to `containers_by_id`.""" proc_path = os.path.join(self.docker_util._docker_root, 'proc') pid_dirs = [_dir for _dir in os.listdir(proc_path) if _dir.isdigit()] if len(pid_dirs) == 0: self.warning("Unable to find any pid directory in {0}. " "If you are running the agent in a container, make sure to " 'share the volume properly: "/proc:/host/proc:ro". ' "See https://github.com/DataDog/docker-dd-agent/blob/master/README.md for more information. " "Network metrics will be missing".format(proc_path)) self._disable_net_metrics = True return container_dict self._disable_net_metrics = False for folder in pid_dirs: try: path = os.path.join(proc_path, folder, 'cgroup') with open(path, 'r') as f: content = [line.strip().split(':') for line in f.readlines()] selinux_policy = '' path = os.path.join(proc_path, folder, 'attr', 'current') if os.path.exists(path): with open(path, 'r') as f: selinux_policy = f.readlines()[0] except IOError, e: # Issue #2074 self.log.debug("Cannot read %s, " "process likely raced to finish : %s" % (path, str(e))) except Exception as e: self.warning("Cannot read %s : %s" % (path, str(e))) continue try: for line in content: if line[1] in ('cpu,cpuacct', 'cpuacct,cpu', 'cpuacct') and \ ('docker' in line[2] or 'docker' in selinux_policy): cpuacct = line[2] break else: continue matches = re.findall(CONTAINER_ID_RE, cpuacct) if matches: container_id = matches[-1] if container_id not in container_dict: self.log.debug("Container %s not in container_dict, it's likely excluded", container_id) continue container_dict[container_id]['_pid'] = folder 
container_dict[container_id]['_proc_root'] = os.path.join(proc_path, folder) elif self._custom_cgroups: # if we match by pid that should be enough (?) - O(n) ugh! for _, container in container_dict.iteritems(): if container.get('_pid') == int(folder): container['_proc_root'] = os.path.join(proc_path, folder) break except Exception, e: self.warning("Cannot parse %s content: %s" % (path, str(e))) continue
def get_hostname(config=None):
    """
    Resolve the canonical host name this agent should identify as.

    This is the authoritative source of the host name for the agent.
    Candidates are tried in this order:
        * agent config (datadog.conf, "hostname:") -- returned immediately if valid
        * GCE instance name -- returned immediately if valid
        * Docker API, then the kubelet (only when running containerized)
        * 'hostname -f' (on unix / solaris), if nothing was found so far
        * EC2 instance id, which replaces an EC2 default hostname (or any
          value found so far when running on an ECS instance)
        * socket.gethostname()

    `config` is the agent configuration mapping; when omitted it is loaded
    through `config.get_config(parse_args=True)`.

    Raises a plain `Exception` when no valid host name could be determined.
    """
    # Load the agent configuration lazily when the caller did not provide one
    if config is None:
        from config import get_config
        config = get_config(parse_args=True)

    # An explicit and valid hostname from the configuration always wins
    configured = config.get('hostname')
    if configured and is_valid_hostname(configured):
        return configured

    # Next best: the GCE instance name
    gce_name = GCE.get_hostname(config)
    if gce_name is not None and is_valid_hostname(gce_name):
        return gce_name

    detected = None

    # Containerized agent: ask the Docker API first, the kubelet second
    if Platform.is_containerized():
        docker_name = DockerUtil().get_hostname(use_default_gw=False)
        if docker_name is not None and is_valid_hostname(docker_name):
            detected = docker_name
        elif Platform.is_k8s():
            _, kube_name = KubeUtil().get_node_info()
            if kube_name is not None and is_valid_hostname(kube_name):
                detected = kube_name

    # OS-specific detection, only when nothing was found above
    if detected is None and (Platform.is_unix() or Platform.is_solaris()):
        unix_name = _get_hostname_unix()
        if unix_name and is_valid_hostname(unix_name):
            detected = unix_name

    # An EC2 default hostname is not useful: prefer the instance id when available
    if Platform.is_ecs_instance() or (detected is not None and EC2.is_default(detected)):
        instance_id = EC2.get_instance_id(config)
        if instance_id:
            detected = instance_id

    # Last resort is socket.gethostname(); socket.getfqdn() is too unreliable
    if detected is None:
        try:
            socket_name = socket.gethostname()
        except socket.error:
            socket_name = None
        if socket_name and is_valid_hostname(socket_name):
            detected = socket_name

    if detected is None:
        failure_msg = 'Unable to reliably determine host name. You can define one in datadog.conf or in your hosts file'
        log.critical(failure_msg)
        raise Exception(failure_msg)

    return detected
def get_hostname(config=None):
    """
    Resolve the canonical host name this agent should identify as.

    This is the authoritative source of the host name for the agent.
    Candidates are tried in this order:
        * agent config (stackstate.conf, "hostname:") -- returned immediately if valid
        * GCE instance name -- returned immediately if valid
        * Docker API, then the kubelet (only when running containerized)
        * 'hostname -f' (on unix / solaris), if nothing was found so far
        * EC2 instance id, which replaces an EC2 default hostname (or any
          value found so far when running on an ECS instance)
        * socket.gethostname()

    `config` is the agent configuration mapping; when omitted it is loaded
    through `config.get_config(parse_args=True)`.

    Raises a plain `Exception` when no valid host name could be determined.
    """
    # Load the agent configuration lazily when the caller did not provide one
    if config is None:
        from config import get_config
        config = get_config(parse_args=True)

    # An explicit and valid hostname from the configuration always wins
    from_config = config.get('hostname')
    if from_config and is_valid_hostname(from_config):
        return from_config

    # Next best: the GCE instance name
    from_gce = GCE.get_hostname(config)
    if from_gce is not None and is_valid_hostname(from_gce):
        return from_gce

    resolved = None

    # Containerized agent: ask the Docker API first, the kubelet second
    if Platform.is_containerized():
        from_docker = DockerUtil().get_hostname(use_default_gw=False)
        if from_docker is not None and is_valid_hostname(from_docker):
            resolved = from_docker
        elif Platform.is_k8s():
            _, from_kubelet = KubeUtil().get_node_info()
            if from_kubelet is not None and is_valid_hostname(from_kubelet):
                resolved = from_kubelet

    # OS-specific detection, only when nothing was found above
    if resolved is None and (Platform.is_unix() or Platform.is_solaris()):
        from_unix = _get_hostname_unix()
        if from_unix and is_valid_hostname(from_unix):
            resolved = from_unix

    # An EC2 default hostname is not useful: prefer the instance id when available
    if Platform.is_ecs_instance() or (resolved is not None and EC2.is_default(resolved)):
        ec2_instance_id = EC2.get_instance_id(config)
        if ec2_instance_id:
            resolved = ec2_instance_id

    # Last resort is socket.gethostname(); socket.getfqdn() is too unreliable
    if resolved is None:
        try:
            from_socket = socket.gethostname()
        except socket.error:
            from_socket = None
        if from_socket and is_valid_hostname(from_socket):
            resolved = from_socket

    if resolved is None:
        error_text = 'Unable to reliably determine host name. You can define one in stackstate.conf or in your hosts file'
        log.critical(error_text)
        raise Exception(error_text)

    return resolved
class SDDockerBackend(AbstractSDBackend):
    """Docker-based service discovery"""

    def __init__(self, agentConfig):
        """Set up the docker client, the optional kubelet client and the config store."""
        self.docker_client = DockerUtil().client
        if is_k8s():
            self.kubeutil = KubeUtil()

        try:
            self.config_store = get_config_store(agentConfig=agentConfig)
        except Exception as e:
            log.error('Failed to instantiate the config store client. '
                      'Auto-config only will be used. %s' % str(e))
            # retry without a configuration backend
            agentConfig['sd_config_backend'] = None
            self.config_store = get_config_store(agentConfig=agentConfig)

        # template variable name -> method computing its value from a docker inspect
        self.VAR_MAPPING = {
            'host': self._get_host,
            'port': self._get_ports,
            'tags': self._get_additional_tags,
        }
        AbstractSDBackend.__init__(self, agentConfig)

    @staticmethod
    def _sd_image_name(image_ident):
        """Return the bare image name of a docker image reference.

        The registry/namespace prefix, the tag and the digest are dropped. The
        path is stripped *before* the tag so that a registry port
        (`registry:5000/ns/name:tag`) is not mistaken for the tag separator.
        An empty or missing reference yields an empty string.
        """
        if not image_ident:
            return ''
        return image_ident.split('/')[-1].split('@')[0].split(':')[0]

    def _get_host(self, container_inspect):
        """Extract the host IP from a docker inspect object, or the kubelet API."""
        ip_addr = container_inspect.get('NetworkSettings', {}).get('IPAddress')
        if not ip_addr:
            if not is_k8s():
                return
            # kubernetes case
            log.debug("Didn't find the IP address for container %s (%s), using the kubernetes way." %
                      (container_inspect.get('Id', '')[:12], container_inspect.get('Config', {}).get('Image', '')))
            pod_list = self.kubeutil.retrieve_pods_list().get('items', [])
            c_id = container_inspect.get('Id')
            for pod in pod_list:
                pod_ip = pod.get('status', {}).get('podIP')
                if pod_ip is None:
                    continue
                c_statuses = pod.get('status', {}).get('containerStatuses', [])
                for status in c_statuses:
                    # compare the container id with those of containers in the current pod
                    if c_id == status.get('containerID', '').split('//')[-1]:
                        ip_addr = pod_ip
        return ip_addr

    def _get_ports(self, container_inspect):
        """Extract a list of available ports from a docker inspect object. Sort them numerically."""
        c_id = container_inspect.get('Id', '')
        try:
            ports = [p.split('/')[0] for p in container_inspect['NetworkSettings']['Ports'].keys()]
        except (IndexError, KeyError, AttributeError):
            log.debug("Didn't find the port for container %s (%s), trying the kubernetes way."
                      % (c_id[:12], container_inspect.get('Config', {}).get('Image', '')))
            # first we try to get it from the docker API
            # it works if the image has an EXPOSE instruction
            # `ExposedPorts` (or `Config` itself) may be missing or null: treat that as
            # "no port" so that the kubernetes fallback below still gets a chance to run
            exposed_ports = (container_inspect.get('Config') or {}).get('ExposedPorts') or {}
            ports = [p.split('/')[0] for p in exposed_ports.keys()]

            # if it failed, try with the kubernetes API
            if not ports and is_k8s():
                # _get_kube_config returns None when no pod owns this container
                pod_status = self._get_kube_config(c_id, 'status') or {}
                c_name = None
                for co in pod_status.get('containerStatuses', []):
                    if co.get('containerID', '').split('//')[-1] == c_id:
                        c_name = co.get('name')
                        break
                pod_spec = self._get_kube_config(c_id, 'spec') or {}
                for co in pod_spec.get('containers', []):
                    if co.get('name') == c_name:
                        ports = [str(p.get('containerPort')) for p in co.get('ports', [])]
        ports = sorted(ports, key=lambda x: int(x))
        return ports

    def get_tags(self, c_inspect):
        """Extract useful tags from docker or platform APIs. These are collected by default."""
        tags = []
        if is_k8s():
            pod_metadata = self._get_kube_config(c_inspect.get('Id'), 'metadata')

            if pod_metadata is None:
                log.warning("Failed to fetch pod metadata for container %s."
                            " Kubernetes tags may be missing." % c_inspect.get('Id', '')[:12])
                return []
            # get labels
            kube_labels = pod_metadata.get('labels', {})
            for label, value in kube_labels.iteritems():
                tags.append('%s:%s' % (label, value))

            # get replication controller
            created_by = json.loads(pod_metadata.get('annotations', {}).get('kubernetes.io/created-by', '{}'))
            if created_by.get('reference', {}).get('kind') == 'ReplicationController':
                tags.append('kube_replication_controller:%s' % created_by.get('reference', {}).get('name'))

            # get kubernetes namespace
            tags.append('kube_namespace:%s' % pod_metadata.get('namespace'))

        return tags

    def _get_additional_tags(self, container_inspect):
        """Return the `node_name` and `pod_name` tags of a container (kubernetes only).

        An empty list is returned outside of kubernetes, or when no pod owning
        the container could be found.
        """
        tags = []
        if is_k8s():
            c_id = container_inspect.get('Id')
            pod_metadata = self._get_kube_config(c_id, 'metadata')
            pod_spec = self._get_kube_config(c_id, 'spec')
            if pod_metadata is None or pod_spec is None:
                # same guard as get_tags: no pod matched this container
                log.debug("No pod found for container %s, skipping the node_name and pod_name tags."
                          % (c_id or '')[:12])
                return tags
            tags.append('node_name:%s' % pod_spec.get('nodeName'))
            tags.append('pod_name:%s' % pod_metadata.get('name'))
        return tags

    def _get_kube_config(self, c_id, key):
        """Get a part of a pod config from the kubernetes API

        Returns `pod[key]` (or `{}` if the key is absent) for the first pod
        that lists container `c_id`, and None when no pod matches.
        """
        pods = self.kubeutil.retrieve_pods_list().get('items', [])
        for pod in pods:
            c_statuses = pod.get('status', {}).get('containerStatuses', [])
            for status in c_statuses:
                if c_id == status.get('containerID', '').split('//')[-1]:
                    return pod.get(key, {})

    def get_configs(self):
        """Get the config for all docker containers running on the host.

        Returns a dict mapping each check name to `(init_config, [instances])`,
        or to `(source, (init_config, [instances]))` when config tracing is on.
        """
        configs = {}
        containers = [
            (self._sd_image_name(container.get('Image')), container.get('Id'))
            for container in self.docker_client.containers()
        ]

        # used by the configcheck agent command to trace where check configs come from
        trace_config = self.agentConfig.get(TRACE_CONFIG, False)

        for image, cid in containers:
            try:
                check_configs = self._get_check_configs(cid, image, trace_config=trace_config) or []
                for conf in check_configs:
                    if trace_config and conf is not None:
                        source, conf = conf

                    check_name, init_config, instance = conf
                    # build instances list if needed
                    if configs.get(check_name) is None:
                        if trace_config:
                            configs[check_name] = (source, (init_config, [instance]))
                        else:
                            configs[check_name] = (init_config, [instance])
                    else:
                        conflict_init_msg = 'Different versions of `init_config` found for check {0}. ' \
                            'Keeping the first one found.'
                        if trace_config:
                            if configs[check_name][1][0] != init_config:
                                log.warning(conflict_init_msg.format(check_name))
                            configs[check_name][1][1].append(instance)
                        else:
                            if configs[check_name][0] != init_config:
                                log.warning(conflict_init_msg.format(check_name))
                            configs[check_name][1].append(instance)
            except Exception:
                log.exception('Building config for container %s based on image %s using service'
                              ' discovery failed, leaving it alone.' % (cid[:12], image))
        return configs

    def _get_check_configs(self, c_id, image, trace_config=False):
        """Retrieve configuration templates and fill them with data pulled from docker and tags."""
        inspect = self.docker_client.inspect_container(c_id)
        config_templates = self._get_config_templates(image, trace_config=trace_config)
        if not config_templates:
            log.debug('No config template for container %s with image %s. '
                      'It will be left unconfigured.' % (c_id[:12], image))
            return None

        check_configs = []
        tags = self.get_tags(inspect)
        for config_tpl in config_templates:
            if trace_config:
                source, config_tpl = config_tpl
            check_name, init_config_tpl, instance_tpl, variables = config_tpl

            # insert tags in instance_tpl and process values for template variables
            instance_tpl, var_values = self._fill_tpl(inspect, instance_tpl, variables, tags)

            tpl = self._render_template(init_config_tpl or {}, instance_tpl or {}, var_values)
            if tpl and len(tpl) == 2:
                # NOTE(review): the traced branch unpacks `tpl` as (source, (init_config, instance));
                # confirm this matches what _render_template returns when tracing is enabled.
                if trace_config and len(tpl[1]) == 2:
                    source, (init_config, instance) = tpl
                    check_configs.append((source, (check_name, init_config, instance)))
                elif not trace_config:
                    init_config, instance = tpl
                    check_configs.append((check_name, init_config, instance))

        return check_configs

    def _get_config_templates(self, image_name, trace_config=False):
        """Extract config templates for an image from a K/V store and returns it as a dict object."""
        config_backend = self.agentConfig.get('sd_config_backend')
        templates = []
        if config_backend is None:
            auto_conf = True
            log.warning('No supported configuration backend was provided, using auto-config only.')
        else:
            auto_conf = False

        # format: [('image', {init_tpl}, {instance_tpl})] without trace_config
        # or      [(source, ('image', {init_tpl}, {instance_tpl}))] with trace_config
        raw_tpls = self.config_store.get_check_tpls(image_name, auto_conf=auto_conf, trace_config=trace_config)
        for tpl in raw_tpls:
            if trace_config and tpl is not None:
                # each template can come from either auto configuration or user-supplied templates
                source, tpl = tpl
            if tpl is not None and len(tpl) == 3:
                check_name, init_config_tpl, instance_tpl = tpl
            else:
                log.debug('No template was found for image %s, leaving it alone.' % image_name)
                return None
            try:
                # build a list of all variables to replace in the template
                variables = self.PLACEHOLDER_REGEX.findall(str(init_config_tpl)) + \
                    self.PLACEHOLDER_REGEX.findall(str(instance_tpl))
                variables = [var.strip('%') for var in variables]
                if not isinstance(init_config_tpl, dict):
                    init_config_tpl = json.loads(init_config_tpl or '{}')
                if not isinstance(instance_tpl, dict):
                    instance_tpl = json.loads(instance_tpl or '{}')
            # NOTE(review): relies on the imported `json` module exposing JSONDecodeError -- confirm
            except json.JSONDecodeError:
                log.exception('Failed to decode the JSON template fetched for check {0}. Its configuration'
                              ' by service discovery failed for {1}.'.format(check_name, image_name))
                return None

            if trace_config:
                templates.append((source, (check_name, init_config_tpl, instance_tpl, variables)))
            else:
                templates.append((check_name, init_config_tpl, instance_tpl, variables))

        return templates

    def _fill_tpl(self, inspect, instance_tpl, variables, tags=None):
        """Add container tags to instance templates and build a
        dict from template variable names and their values.

        `tags` is only read: the caller's list is left untouched so that it can
        be reused for several templates without tags leaking between them.
        Returns the `(instance_tpl, var_values)` pair.
        """
        var_values = {}

        # add default tags to the instance
        if tags:
            # build a new collection instead of `tags += ...`, which would extend
            # the caller's list in place
            merged_tags = set(tags)
            merged_tags.update(instance_tpl.get('tags', []))
            instance_tpl['tags'] = list(merged_tags)

        for v in variables:
            # variables can be suffixed with an index in case a list is found
            var_parts = v.split('_')
            if var_parts[0] in self.VAR_MAPPING:
                try:
                    res = self.VAR_MAPPING[var_parts[0]](inspect)
                    if not res:
                        raise ValueError("Invalid value for variable %s." % var_parts[0])
                    # if an index is found in the variable, use it to select a value
                    if len(var_parts) > 1 and isinstance(res, list) and int(var_parts[-1]) < len(res):
                        var_values[v] = res[int(var_parts[-1])]
                    # if no valid index was found but we have a list, return the last element
                    elif isinstance(res, list):
                        var_values[v] = res[-1]
                    else:
                        var_values[v] = res
                except Exception as ex:
                    log.error("Could not find a value for the template variable %s: %s" % (v, str(ex)))
            else:
                log.error("No method was found to interpolate template variable %s." % v)

        return instance_tpl, var_values
class Kubernetes(AgentCheck):
    """ Collect metrics and events from kubelet """

    pod_names_by_container = {}

    def __init__(self, name, init_config, agentConfig, instances=None):
        """Create the check; at most one instance is supported.

        Raises an Exception when several instances are configured or when the
        kubelet host cannot be determined.
        """
        if instances is not None and len(instances) > 1:
            raise Exception('Kubernetes check only supports one configured instance.')

        AgentCheck.__init__(self, name, init_config, agentConfig, instances)
        self.kubeutil = KubeUtil()
        if not self.kubeutil.host:
            raise Exception('Unable to get default router and host parameter is not set')

    def _perform_kubelet_checks(self, url):
        """Query the kubelet health endpoint and emit one service check per reported line.

        An aggregated `<NAMESPACE>.kubelet.check` service check is also sent:
        OK only when the request succeeded and every reported check passed.
        """
        service_check_base = NAMESPACE + '.kubelet.check'
        is_ok = True
        try:
            r = requests.get(url)
            for line in r.iter_lines():

                # avoid noise; this check is expected to fail since we override the container hostname
                if line.find('hostname') != -1:
                    continue

                # lines look like "[<status char>]<check name> <details>"
                matches = re.match(r'\[(.)\]([^\s]+) (.*)?', line)
                if not matches or len(matches.groups()) < 2:
                    continue

                service_check_name = service_check_base + '.' + matches.group(2)
                status = matches.group(1)
                if status == '+':
                    self.service_check(service_check_name, AgentCheck.OK)
                else:
                    self.service_check(service_check_name, AgentCheck.CRITICAL)
                    is_ok = False

        except Exception as e:
            self.log.warning('kubelet check failed: %s' % str(e))
            self.service_check(service_check_base, AgentCheck.CRITICAL,
                               message='Kubelet check failed: %s' % str(e))

        else:
            if is_ok:
                self.service_check(service_check_base, AgentCheck.OK)
            else:
                self.service_check(service_check_base, AgentCheck.CRITICAL)

    def check(self, instance):
        """Read the instance options, then run the kubelet health checks and metrics collection."""
        self.max_depth = instance.get('max_depth', DEFAULT_MAX_DEPTH)
        enabled_gauges = instance.get('enabled_gauges', DEFAULT_ENABLED_GAUGES)
        self.enabled_gauges = ["{0}.{1}".format(NAMESPACE, x) for x in enabled_gauges]
        enabled_rates = instance.get('enabled_rates', DEFAULT_ENABLED_RATES)
        self.enabled_rates = ["{0}.{1}".format(NAMESPACE, x) for x in enabled_rates]

        self.publish_aliases = _is_affirmative(instance.get('publish_aliases', DEFAULT_PUBLISH_ALIASES))
        self.use_histogram = _is_affirmative(instance.get('use_histogram', DEFAULT_USE_HISTOGRAM))
        self.publish_rate = FUNC_MAP[RATE][self.use_histogram]
        self.publish_gauge = FUNC_MAP[GAUGE][self.use_histogram]

        # kubelet health checks
        self._perform_kubelet_checks(self.kubeutil.kube_health_url)

        # kubelet metrics
        self._update_metrics(instance)

    def _publish_raw_metrics(self, metric, dat, tags, depth=0):
        """Recursively walk `dat` and publish every number whose metric name is enabled.

        Dicts are descended key by key (keys are lowercased and appended to the
        metric name); for lists only the last element is considered. Recursion
        stops once `depth` reaches `self.max_depth`.
        """
        if depth >= self.max_depth:
            self.log.warning('Reached max depth on metric=%s' % metric)
            return

        if isinstance(dat, numbers.Number):
            # rates take precedence over gauges when both patterns match
            if self.enabled_rates and any(fnmatch(metric, pat) for pat in self.enabled_rates):
                self.publish_rate(self, metric, float(dat), tags)
            elif self.enabled_gauges and any(fnmatch(metric, pat) for pat in self.enabled_gauges):
                self.publish_gauge(self, metric, float(dat), tags)

        elif isinstance(dat, dict):
            for k, v in dat.iteritems():
                self._publish_raw_metrics(metric + '.%s' % k.lower(), v, tags, depth + 1)

        elif isinstance(dat, list):
            # an empty list carries no sample: nothing to publish
            if dat:
                self._publish_raw_metrics(metric, dat[-1], tags, depth + 1)

    @staticmethod
    def _shorten_name(name):
        """Truncate every run of 64+ hexadecimal characters in `name` to its first 12."""
        # shorten docker image id
        return re.sub('([0-9a-fA-F]{64,})', lambda x: x.group(1)[0:12], name)

    def _get_post_1_2_tags(self, cont_labels, subcontainer, kube_labels):
        """Build the pod tags when both the pod name and namespace labels are available."""
        tags = []

        pod_name = cont_labels[KubeUtil.POD_NAME_LABEL]
        pod_namespace = cont_labels[KubeUtil.NAMESPACE_LABEL]
        tags.append(u"pod_name:{0}/{1}".format(pod_namespace, pod_name))
        tags.append(u"kube_namespace:{0}".format(pod_namespace))

        kube_labels_key = "{0}/{1}".format(pod_namespace, pod_name)

        pod_labels = kube_labels.get(kube_labels_key)
        if pod_labels:
            tags += list(pod_labels)

        if "-" in pod_name:
            # the controller name is the pod name without its last dash-separated part
            replication_controller = "-".join(pod_name.split("-")[:-1])
            tags.append("kube_replication_controller:%s" % replication_controller)

        if self.publish_aliases and subcontainer.get("aliases"):
            for alias in subcontainer['aliases'][1:]:
                # we don't add the first alias as it will be the container_name
                tags.append('container_alias:%s' % (self._shorten_name(alias)))

        return tags

    def _get_pre_1_2_tags(self, cont_labels, subcontainer, kube_labels):
        """Build the pod tags when only the pod name label is available."""
        tags = []

        pod_name = cont_labels[KubeUtil.POD_NAME_LABEL]
        tags.append(u"pod_name:{0}".format(pod_name))

        pod_labels = kube_labels.get(pod_name)
        if pod_labels:
            tags.extend(list(pod_labels))

        if "-" in pod_name:
            replication_controller = "-".join(pod_name.split("-")[:-1])
            if "/" in replication_controller:
                # the pod name is prefixed with its namespace
                namespace, replication_controller = replication_controller.split("/", 1)
                tags.append(u"kube_namespace:%s" % namespace)

            tags.append(u"kube_replication_controller:%s" % replication_controller)

        if self.publish_aliases and subcontainer.get("aliases"):
            for alias in subcontainer['aliases'][1:]:
                # we don't add the first alias as it will be the container_name
                tags.append(u"container_alias:%s" % (self._shorten_name(alias)))

        return tags

    def _update_container_metrics(self, instance, subcontainer, kube_labels):
        """Tag and publish the latest stats of one subcontainer."""
        tags = list(instance.get('tags', []))  # add support for custom tags

        if len(subcontainer.get('aliases', [])) >= 1:
            # The first alias seems to always match the docker container name
            container_name = subcontainer['aliases'][0]
        else:
            # We default to the container id
            container_name = subcontainer['name']

        tags.append('container_name:%s' % container_name)

        try:
            cont_labels = subcontainer['spec']['labels']
        except KeyError:
            self.log.debug("Subcontainer, doesn't have any labels")
            cont_labels = {}

        # Collect pod names, namespaces, rc...
        if KubeUtil.NAMESPACE_LABEL in cont_labels and KubeUtil.POD_NAME_LABEL in cont_labels:
            # Kubernetes >= 1.2
            tags += self._get_post_1_2_tags(cont_labels, subcontainer, kube_labels)

        elif KubeUtil.POD_NAME_LABEL in cont_labels:
            # Kubernetes <= 1.1
            tags += self._get_pre_1_2_tags(cont_labels, subcontainer, kube_labels)

        else:
            # Those are containers that are not part of a pod.
            # They are top aggregate views and don't have the previous metadata.
            tags.append("pod_name:no_pod")

        stats = subcontainer['stats'][-1]  # take the latest
        self._publish_raw_metrics(NAMESPACE, stats, tags)

        if subcontainer.get("spec", {}).get("has_filesystem"):
            fs = stats['filesystem'][-1]
            fs_utilization = float(fs['usage'])/float(fs['capacity'])
            self.publish_gauge(self, NAMESPACE + '.filesystem.usage_pct', fs_utilization, tags)

        if subcontainer.get("spec", {}).get("has_network"):
            net = stats['network']
            self.publish_rate(self, NAMESPACE + '.network_errors',
                              sum(float(net[x]) for x in NET_ERRORS),
                              tags)

    def _retrieve_metrics(self, url):
        """Fetch and decode the JSON document served at `url`."""
        return retrieve_json(url)

    def _update_metrics(self, instance):
        """Collect the container metrics, then the per-controller pod counts.

        Raises an Exception when the metrics endpoint returned nothing. A
        failure on a single container is logged and does not stop the others.
        """
        pods_list = self.kubeutil.retrieve_pods_list()
        metrics = self._retrieve_metrics(self.kubeutil.metrics_url)

        excluded_labels = instance.get('excluded_labels')
        kube_labels = self.kubeutil.extract_kube_labels(pods_list, excluded_keys=excluded_labels)

        if not metrics:
            # report the URL that was actually queried (this class never sets `metrics_cmd`)
            raise Exception('No metrics retrieved cmd=%s' % self.kubeutil.metrics_url)

        for subcontainer in metrics:
            try:
                self._update_container_metrics(instance, subcontainer, kube_labels)
            except Exception as e:
                self.log.error("Unable to collect metrics for container: {0} ({1})".format(
                    subcontainer.get('name'), e))

        self._update_pods_metrics(instance, pods_list)

    def _update_pods_metrics(self, instance, pods):
        """Publish the number of pods created by each supported controller."""
        supported_kinds = [
            "DaemonSet",
            "Deployment",
            "Job",
            "ReplicationController",
            "ReplicaSet",
        ]

        controllers_map = defaultdict(int)
        for pod in pods['items']:
            try:
                created_by = json.loads(pod['metadata']['annotations']['kubernetes.io/created-by'])
                kind = created_by['reference']['kind']
                if kind in supported_kinds:
                    controllers_map[created_by['reference']['name']] += 1
            except (KeyError, ValueError):
                # pod without a (well-formed) created-by annotation: not counted
                continue

        tags = instance.get('tags', [])
        for ctrl, pod_count in controllers_map.iteritems():
            _tags = tags[:]  # copy base tags
            _tags.append('kube_replication_controller:{0}'.format(ctrl))
            self.publish_gauge(self, NAMESPACE + '.pods.running', pod_count, _tags)