예제 #1
0
    def __init__(self, agentConfig):
        """Build the config store, the Docker/Kubernetes helpers and the variable resolvers.

        agentConfig is the agent configuration mapping. When the config store
        client cannot be created, 'sd_config_backend' is reset to None in
        agentConfig (mutated in place) and the store is requested a second time.
        """
        try:
            store = get_config_store(agentConfig=agentConfig)
        except Exception as err:
            log.error('Failed to instantiate the config store client. '
                      'Auto-config only will be used. %s' % str(err))
            # Second attempt, this time with no backend configured
            agentConfig['sd_config_backend'] = None
            store = get_config_store(agentConfig=agentConfig)
        self.config_store = store

        self.dockerutil = DockerUtil(config_store=store)

        # Remains None outside Kubernetes or when the client cannot be built
        kube_client = None
        if Platform.is_k8s():
            try:
                kube_client = KubeUtil()
            except Exception as err:
                log.error(
                    "Couldn't instantiate the kubernetes client, "
                    "subsequent kubernetes calls will fail as well. Error: %s"
                    % str(err))
        self.kubeutil = kube_client

        self.metadata_collector = MetadataCollector()

        # Variable name -> bound method that resolves it
        resolvers = {}
        resolvers['host'] = self._get_host_address
        resolvers['pid'] = self._get_container_pid
        resolvers['port'] = self._get_port
        resolvers['container-name'] = self._get_container_name
        resolvers['tags'] = self._get_additional_tags
        self.VAR_MAPPING = resolvers

        AbstractSDBackend.__init__(self, agentConfig)
예제 #2
0
    def __init__(self, agentConfig):
        """Set up the config store, the Docker/Kubernetes clients and the variable resolvers.

        agentConfig is the agent configuration mapping. If the config store
        client cannot be instantiated, 'sd_config_backend' is set to None in
        agentConfig (mutated in place) and the store is requested again.
        """
        try:
            self.config_store = get_config_store(agentConfig=agentConfig)
        except Exception as e:
            log.error('Failed to instantiate the config store client. '
                      'Auto-config only will be used. %s' % str(e))
            agentConfig['sd_config_backend'] = None
            self.config_store = get_config_store(agentConfig=agentConfig)

        self.dockerutil = DockerUtil(config_store=self.config_store)
        self.docker_client = self.dockerutil.client

        # Always define the attribute: previously it was only assigned when
        # running on Kubernetes, so reading self.kubeutil anywhere else raised
        # AttributeError instead of yielding None.
        self.kubeutil = None
        if Platform.is_k8s():
            try:
                self.kubeutil = KubeUtil()
            except Exception as ex:
                log.error("Couldn't instantiate the kubernetes client, "
                          "subsequent kubernetes calls will fail as well. Error: %s" % str(ex))

        self.metadata_collector = MetadataCollector()

        # Variable name -> bound method that resolves it
        self.VAR_MAPPING = {
            'host': self._get_host_address,
            'pid': self._get_container_pid,
            'port': self._get_port,
            'container-name': self._get_container_name,
            'tags': self._get_additional_tags,
        }

        AbstractSDBackend.__init__(self, agentConfig)
예제 #3
0
    def __init__(self, agentConfig):
        """Build the config store, the Docker/Kubernetes helpers, the resolvers and the label list.

        agentConfig is the agent configuration mapping. When the config store
        client cannot be created, 'sd_config_backend' is reset to None in
        agentConfig (mutated in place) and the store is requested a second time.
        The comma-separated 'docker_labels_as_tags' option is split into a list
        of stripped label names (an empty list when unset).
        """
        try:
            store = get_config_store(agentConfig=agentConfig)
        except Exception as err:
            log.error('Failed to instantiate the config store client. '
                      'Auto-config only will be used. %s' % str(err))
            # Second attempt, this time with no backend configured
            agentConfig['sd_config_backend'] = None
            store = get_config_store(agentConfig=agentConfig)
        self.config_store = store

        self.dockerutil = DockerUtil(config_store=store)

        # Remains None outside Kubernetes or when the client cannot be built
        kube_client = None
        if Platform.is_k8s():
            try:
                kube_client = KubeUtil()
            except Exception as err:
                log.error("Couldn't instantiate the kubernetes client, "
                          "subsequent kubernetes calls will fail as well. Error: %s" % str(err))
        self.kubeutil = kube_client

        self.metadata_collector = MetadataCollector()

        # Variable name -> bound method that resolves it
        resolvers = {}
        resolvers['host'] = self._get_host_address
        resolvers['pid'] = self._get_container_pid
        resolvers['port'] = self._get_port
        resolvers['container-name'] = self._get_container_name
        resolvers['tags'] = self._get_additional_tags
        self.VAR_MAPPING = resolvers

        # Docker labels that get added as tags to every instance SD configures
        raw_labels = agentConfig.get('docker_labels_as_tags', '')
        self.docker_labels_as_tags = (
            [label.strip() for label in raw_labels.split(',')] if raw_labels else []
        )

        AbstractSDBackend.__init__(self, agentConfig)
예제 #4
0
    def init(self):
        """Read the first configured instance and prepare the state used by the check.

        Any exception raised along the way is logged at critical level and
        reported as a warning; ``init_success`` is set to True only when every
        step completed.
        """
        try:
            conf = self.instances[0]

            def enabled(option, default=False):
                # Interpret an instance option as an on/off flag
                return _is_affirmative(conf.get(option, default))

            # Custom tags, also attached to service checks when docker is down
            self.custom_tags = conf.get("tags", [])

            docker_util = DockerUtil()
            self.docker_util = docker_util
            if not docker_util.client:
                raise Exception("Failed to initialize Docker client.")

            self.docker_gateway = DockerUtil.get_gateway()
            self.metadata_collector = MetadataCollector()

            # Remains None outside Kubernetes or when the client cannot be built
            self.kubeutil = None
            if Platform.is_k8s():
                try:
                    self.kubeutil = KubeUtil()
                except Exception as err:
                    self.log.error("Couldn't instantiate the kubernetes client, "
                                   "subsequent kubernetes calls will fail as well. Error: %s" % str(err))

            # Resolve the cgroup mountpoints for this host; only needed once
            self._mountpoints = docker_util.get_mountpoints(CGROUP_METRICS)
            self._latest_size_query = 0
            self._filtered_containers = set()
            self._disable_net_metrics = False

            # Tagging options.
            # The per-instance collect_labels_as_tags option is legacy (it only tags
            # docker metrics) and was superseded by docker_labels_as_tags in config.cfg;
            # it is still honoured for backward compatibility.
            if "collect_labels_as_tags" in conf:
                self.collect_labels_as_tags = conf.get("collect_labels_as_tags")

            self.kube_pod_tags = {}

            self.use_histogram = enabled('use_histogram')

            self.tag_names = {
                CONTAINER: conf.get("container_tags", DEFAULT_CONTAINER_TAGS),
                PERFORMANCE: conf.get("performance_tags", DEFAULT_PERFORMANCE_TAGS),
                IMAGE: conf.get('image_tags', DEFAULT_IMAGE_TAGS)
            }

            # Filtering settings
            if docker_util.filtering_enabled:
                self.tag_names[FILTERED] = docker_util.filtered_tag_names

            # Cache of container network mappings
            self.network_mappings = {}

            # Health service check whitelist
            self.whitelist_patterns = None
            whitelist_rules = conf.get('health_service_check_whitelist', [])
            if whitelist_rules:
                patterns, whitelist_tags = compile_filter_rules(whitelist_rules)
                self.whitelist_patterns = set(patterns)
                self.tag_names[HEALTHCHECK] = set(whitelist_tags)

            # Remaining options
            self.collect_image_stats = enabled('collect_images_stats')
            self.collect_container_size = enabled('collect_container_size')
            self.collect_container_count = enabled('collect_container_count')
            self.collect_volume_count = enabled('collect_volume_count')
            self.collect_events = enabled('collect_events', True)
            self.event_attributes_as_tags = conf.get('event_attributes_as_tags', [])
            self.collect_image_size = enabled('collect_image_size')
            self.collect_disk_stats = enabled('collect_disk_stats')
            self.collect_exit_codes = enabled('collect_exit_codes')
            self.collect_ecs_tags = enabled('ecs_tags', True) and Platform.is_ecs_instance()

            self.filtered_event_types = tuple(conf.get("filtered_event_types", DEFAULT_FILTERED_EVENT_TYPES))

            self.capped_metrics = conf.get('capped_metrics')

        except Exception as err:
            self.log.critical(err)
            self.warning("Initialization failed. Will retry at next iteration")
        else:
            self.init_success = True
예제 #5
0
class DockerDaemon(AgentCheck):
    """Collect metrics and events from Docker API and cgroups."""

    def __init__(self, name, init_config, agentConfig, instances=None):
        """Create the check and run a first initialization attempt.

        Raises an Exception when more than one instance is configured.
        The labels collected as tags come from the comma-separated
        'docker_labels_as_tags' agent option, falling back to
        DEFAULT_LABELS_AS_TAGS when it is unset or empty.
        """
        if instances is not None and len(instances) > 1:
            raise Exception("Docker check only supports one configured instance.")
        AgentCheck.__init__(self, name, init_config,
                            agentConfig, instances=instances)
        self.init_success = False
        self._service_discovery = agentConfig.get('service_discovery') and \
            agentConfig.get('service_discovery_backend') == 'docker'

        global_labels_as_tags = agentConfig.get('docker_labels_as_tags')
        if global_labels_as_tags:
            self.collect_labels_as_tags = [label.strip() for label in global_labels_as_tags.split(',')]
        else:
            # Take a copy: _get_tags() appends to this list in place, which
            # would otherwise mutate the shared module-level default.
            self.collect_labels_as_tags = list(DEFAULT_LABELS_AS_TAGS)
        self.init()

    def init(self):
        """Read the first configured instance and prepare the state used by the check.

        Any exception raised along the way is logged at critical level and
        reported as a warning; ``init_success`` is set to True only when every
        step completed.
        """
        try:
            conf = self.instances[0]

            def enabled(option, default=False):
                # Interpret an instance option as an on/off flag
                return _is_affirmative(conf.get(option, default))

            # Custom tags, also attached to service checks when docker is down
            self.custom_tags = conf.get("tags", [])

            docker_util = DockerUtil()
            self.docker_util = docker_util
            if not docker_util.client:
                raise Exception("Failed to initialize Docker client.")

            self.docker_gateway = DockerUtil.get_gateway()
            self.metadata_collector = MetadataCollector()

            # Remains None outside Kubernetes or when the client cannot be built
            self.kubeutil = None
            if Platform.is_k8s():
                try:
                    self.kubeutil = KubeUtil()
                except Exception as err:
                    self.log.error("Couldn't instantiate the kubernetes client, "
                                   "subsequent kubernetes calls will fail as well. Error: %s" % str(err))

            # Resolve the cgroup mountpoints for this host; only needed once
            self._mountpoints = docker_util.get_mountpoints(CGROUP_METRICS)
            self._latest_size_query = 0
            self._filtered_containers = set()
            self._disable_net_metrics = False

            # Tagging options.
            # The per-instance collect_labels_as_tags option is legacy (it only tags
            # docker metrics) and was superseded by docker_labels_as_tags in config.cfg;
            # it is still honoured for backward compatibility.
            if "collect_labels_as_tags" in conf:
                self.collect_labels_as_tags = conf.get("collect_labels_as_tags")

            self.kube_pod_tags = {}

            self.use_histogram = enabled('use_histogram')

            self.tag_names = {
                CONTAINER: conf.get("container_tags", DEFAULT_CONTAINER_TAGS),
                PERFORMANCE: conf.get("performance_tags", DEFAULT_PERFORMANCE_TAGS),
                IMAGE: conf.get('image_tags', DEFAULT_IMAGE_TAGS)
            }

            # Filtering settings
            if docker_util.filtering_enabled:
                self.tag_names[FILTERED] = docker_util.filtered_tag_names

            # Cache of container network mappings
            self.network_mappings = {}

            # Health service check whitelist
            self.whitelist_patterns = None
            whitelist_rules = conf.get('health_service_check_whitelist', [])
            if whitelist_rules:
                patterns, whitelist_tags = compile_filter_rules(whitelist_rules)
                self.whitelist_patterns = set(patterns)
                self.tag_names[HEALTHCHECK] = set(whitelist_tags)

            # Remaining options
            self.collect_image_stats = enabled('collect_images_stats')
            self.collect_container_size = enabled('collect_container_size')
            self.collect_container_count = enabled('collect_container_count')
            self.collect_volume_count = enabled('collect_volume_count')
            self.collect_events = enabled('collect_events', True)
            self.event_attributes_as_tags = conf.get('event_attributes_as_tags', [])
            self.collect_image_size = enabled('collect_image_size')
            self.collect_disk_stats = enabled('collect_disk_stats')
            self.collect_exit_codes = enabled('collect_exit_codes')
            self.collect_ecs_tags = enabled('ecs_tags', True) and Platform.is_ecs_instance()

            self.filtered_event_types = tuple(conf.get("filtered_event_types", DEFAULT_FILTERED_EVENT_TYPES))

            self.capped_metrics = conf.get('capped_metrics')

        except Exception as err:
            self.log.critical(err)
            self.warning("Initialization failed. Will retry at next iteration")
        else:
            self.init_success = True

    def check(self, instance):
        """Run the Docker check for one instance.

        When a previous initialization failed it is retried first; if the
        Docker client is still unavailable a CRITICAL service check is sent and
        the run stops. Errors raised while collecting are logged and turned
        into a warning so the next iteration can retry.

        instance is the instance configuration; only 'custom_cgroups' is read
        from it here, the other options are read by init().
        """
        if not self.init_success:
            # Initialization can fail if cgroups are not ready or docker daemon is down. So we retry if needed
            self.init()

            try:
                if self.docker_util.client is None:
                    message = "Unable to connect to Docker daemon"
                    self.service_check(SERVICE_CHECK_NAME, AgentCheck.CRITICAL,
                                       message=message, tags=self.custom_tags)
                    return
            except Exception as ex:
                # Also reached when init() failed before docker_util was assigned
                self.service_check(SERVICE_CHECK_NAME, AgentCheck.CRITICAL,
                                   message=str(ex), tags=self.custom_tags)
                return

            if not self.init_success:
                # Initialization failed, will try later
                return

        try:
            # Report image metrics
            if self.collect_image_stats:
                self._count_and_weigh_images()

            if Platform.is_k8s():
                # Pod tags are refreshed on every run; left empty when they cannot be fetched
                self.kube_pod_tags = {}
                if self.kubeutil:
                    try:
                        self.kube_pod_tags = self.kubeutil.get_kube_pod_tags()
                    except Exception as e:
                        self.log.warning('Could not retrieve kubernetes labels: %s' % str(e))

            # containers running with custom cgroups?
            custom_cgroups = _is_affirmative(instance.get('custom_cgroups', False))

            # Get the list of containers and the index of their names
            health_service_checks = bool(self.whitelist_patterns)
            containers_by_id = self._get_and_count_containers(custom_cgroups, health_service_checks)
            containers_by_id = self._crawl_container_pids(containers_by_id, custom_cgroups)

            # Send events from Docker API
            if self.collect_events or self._service_discovery or not self._disable_net_metrics or self.collect_exit_codes:
                self._process_events(containers_by_id)

            # Report performance container metrics (cpu, mem, net, io)
            self._report_performance_metrics(containers_by_id)

            if self.collect_container_size:
                self._report_container_size(containers_by_id)

            if self.collect_container_count:
                self._report_container_count(containers_by_id)

            if self.collect_volume_count:
                self._report_volume_count()

            # Collect disk stats from Docker info command
            if self.collect_disk_stats:
                self._report_disk_stats()

            if health_service_checks:
                self._send_container_healthcheck_sc(containers_by_id)
        except Exception:
            # Deliberately not a bare except: KeyboardInterrupt and SystemExit
            # must propagate so the process can still be stopped.
            self.log.exception("Docker_daemon check failed")
            self.warning("Check failed. Will retry at next iteration")

        if self.capped_metrics:
            self.filter_capped_metrics()

    def _count_and_weigh_images(self):
        """Gauge how many images are available and intermediate, plus their sizes if enabled.

        Any failure is reported as a warning and does not propagate.
        """
        try:
            image_tags = self._get_tags()
            client = self.docker_util.client

            listed_images = client.images(all=False)
            available = len(listed_images)
            # Intermediate = everything the API lists with all=True, minus the ones above
            intermediate = len(client.images(quiet=True, all=True)) - available

            self.gauge("docker.images.available", available, tags=image_tags)
            self.gauge("docker.images.intermediate", intermediate, tags=image_tags)

            if self.collect_image_size:
                self._report_image_size(listed_images)

        except Exception as err:
            # This metric is not essential, so carry on when it fails
            self.warning("Failed to count Docker images. Exception: {0}".format(err))

    def _get_and_count_containers(self, custom_cgroups=False, healthchecks=False):
        """Fetch every container from the API, filter them and gauge running/stopped counts.

        Returns a dict mapping container id to container for the containers
        that are not excluded. When custom_cgroups or healthchecks is set, each
        kept container is also inspected to record its '_pid' and 'health'.
        Raises an Exception (after sending a CRITICAL service check) when the
        containers cannot be listed.
        """
        # Container sizes are slow to query, so they are only requested once
        # every SIZE_REFRESH_RATE runs
        with_size = self.collect_container_size and self._latest_size_query == 0
        self._latest_size_query = (self._latest_size_query + 1) % SIZE_REFRESH_RATE

        running_by_tags = Counter()
        total_by_tags = Counter()

        try:
            containers = self.docker_util.client.containers(all=True, size=with_size)
        except Exception as err:
            message = "Unable to list Docker containers: {0}".format(err)
            self.service_check(SERVICE_CHECK_NAME, AgentCheck.CRITICAL,
                               message=message, tags=self.custom_tags)
            raise Exception(message)

        # Listing succeeded
        self.service_check(SERVICE_CHECK_NAME, AgentCheck.OK, tags=self.custom_tags)

        # Compute the set of filtered containers from the exclude/include rules;
        # the rules themselves are cached in docker_util
        self._filter_containers(containers)

        need_inspect = custom_cgroups or healthchecks
        containers_by_id = {}

        for container in containers:
            container_name = DockerUtil.container_name_extractor(container)[0]

            # Counters are keyed by the sorted tuple of the container's tags
            count_key = tuple(sorted(self._get_tags(container, CONTAINER)))
            total_by_tags[count_key] += 1
            if self._is_container_running(container):
                running_by_tags[count_key] += 1

            # Excluded containers are counted above but not returned
            if self._is_container_excluded(container):
                self.log.debug("Container {0} is excluded".format(container_name))
                continue

            containers_by_id[container['Id']] = container

            if not need_inspect:
                continue

            # With custom cgroups the pid has to come from the API, otherwise
            # the process would not be found when crawling for pids
            try:
                state = self.docker_util.client.inspect_container(container_name)['State']
                container['_pid'] = state['Pid']
                container['health'] = state.get('Health', {})
            except Exception as err:
                self.log.debug("Unable to inspect Docker container: %s", err)

        # TODO: deprecate these 2, they should be replaced by _report_container_count
        for tag_key, running in running_by_tags.iteritems():
            self.gauge("docker.containers.running", running, tags=list(tag_key))
        self.gauge("docker.containers.running.total", sum(running_by_tags.values()), tags=self.custom_tags)

        stopped_total = 0
        for tag_key, total in total_by_tags.iteritems():
            stopped = total - running_by_tags[tag_key]
            stopped_total += stopped
            self.gauge("docker.containers.stopped", stopped, tags=list(tag_key))
        self.gauge("docker.containers.stopped.total", stopped_total, tags=self.custom_tags)

        return containers_by_id

    def _is_container_running(self, container):
        """Tell if a container is running, according to its status.

        There is no "nice" API field to figure it out. We just look at the "Status" field, knowing how it is generated.
        See: https://github.com/docker/docker/blob/v1.6.2/daemon/state.go#L35
        """
        return container["Status"].startswith("Up") or container["Status"].startswith("Restarting")

    def _get_tags(self, entity=None, tag_type=None):
        """Generate the tags for a given entity (container or image) according to a list of tag names.

        entity is the container or image dict; when it is None only a copy of
        the custom tags is returned. tag_type, when given, is a key of
        self.tag_names selecting the entity-specific tags to extract.

        Side effect: the Kubernetes / Rancher labels are appended to
        self.collect_labels_as_tags the first time they are found missing.

        Returns a new list of tag strings.
        """
        # Start with custom tags
        tags = list(self.custom_tags)

        # Collect pod names as tags on kubernetes
        if Platform.is_k8s() and KubeUtil.POD_NAME_LABEL not in self.collect_labels_as_tags:
            self.collect_labels_as_tags.append(KubeUtil.POD_NAME_LABEL)
            # The container name label may already be configured explicitly;
            # adding it twice would emit the kube_container_name tag twice.
            if KubeUtil.CONTAINER_NAME_LABEL not in self.collect_labels_as_tags:
                self.collect_labels_as_tags.append(KubeUtil.CONTAINER_NAME_LABEL)

        # Collect container names as tags on rancher
        if Platform.is_rancher():
            for rancher_label in (RANCHER_CONTAINER_NAME, RANCHER_SVC_NAME, RANCHER_STACK_NAME):
                if rancher_label not in self.collect_labels_as_tags:
                    self.collect_labels_as_tags.append(rancher_label)

        if entity is not None:
            pod_name = None
            namespace = None
            # Get labels as tags
            labels = entity.get("Labels")
            if labels is not None:
                for k in self.collect_labels_as_tags:
                    if k not in labels:
                        # On kubernetes, flag entities that carry no pod name label
                        if k == KubeUtil.POD_NAME_LABEL and Platform.is_k8s():
                            tags.append("pod_name:no_pod")
                        continue

                    v = labels[k]
                    if k == KubeUtil.POD_NAME_LABEL and Platform.is_k8s():
                        pod_name = v
                        if "-" in pod_name:
                            # Everything before the last dash is taken as the replication controller
                            replication_controller = "-".join(pod_name.split("-")[:-1])
                            if "/" in replication_controller:  # k8s <= 1.1
                                namespace, replication_controller = replication_controller.split("/", 1)

                            elif KubeUtil.NAMESPACE_LABEL in labels:  # k8s >= 1.2
                                namespace = labels[KubeUtil.NAMESPACE_LABEL]

                            # Skip the namespace tag when it could not be determined
                            # instead of emitting a bogus "kube_namespace:None"
                            if namespace:
                                tags.append("kube_namespace:%s" % namespace)
                            tags.append("kube_replication_controller:%s" % replication_controller)
                            tags.append("pod_name:%s" % pod_name)

                    elif k == KubeUtil.CONTAINER_NAME_LABEL and Platform.is_k8s():
                        if v:
                            tags.append("kube_container_name:%s" % v)
                    elif k == SWARM_SVC_LABEL and Platform.is_swarm():
                        if v:
                            tags.append("swarm_service:%s" % v)
                    elif k == RANCHER_CONTAINER_NAME and Platform.is_rancher():
                        if v:
                            tags.append('rancher_container:%s' % v)
                    elif k == RANCHER_SVC_NAME and Platform.is_rancher():
                        if v:
                            tags.append('rancher_service:%s' % v)
                    elif k == RANCHER_STACK_NAME and Platform.is_rancher():
                        if v:
                            tags.append('rancher_stack:%s' % v)

                    elif not v:
                        # A label without a value becomes a bare tag
                        tags.append(k)

                    else:
                        tags.append("%s:%s" % (k, v))

            # Get entity specific tags
            if tag_type is not None:
                tag_names = self.tag_names[tag_type]
                for tag_name in tag_names:
                    tag_value = self._extract_tag_value(entity, tag_name)
                    if tag_value is not None:
                        for t in tag_value:
                            tags.append('%s:%s' % (tag_name, str(t).strip()))

            # Add kube labels and creator/service tags
            if Platform.is_k8s() and namespace and pod_name:
                kube_tags = self.kube_pod_tags.get("{0}/{1}".format(namespace, pod_name))
                if kube_tags:
                    tags.extend(list(kube_tags))

            if self.metadata_collector.has_detected():
                orch_tags = self.metadata_collector.get_container_tags(co=entity)
                tags.extend(orch_tags)

        return tags

    def _extract_tag_value(self, entity, tag_name):
        """Extract tag information from the API result (containers or images).

        Extracted values are cached on the entity itself under "_tag_values".
        Returns None, after a warning, when tag_name has no extractor.
        """
        extractor = TAG_EXTRACTORS.get(tag_name) if tag_name in TAG_EXTRACTORS else None
        if tag_name not in TAG_EXTRACTORS:
            self.warning("{0} isn't a supported tag".format(tag_name))
            return

        # Reuse values extracted earlier for this entity
        cache = entity.setdefault("_tag_values", {})
        if tag_name not in cache:
            cache[tag_name] = extractor(entity)

        return cache[tag_name]

    def _filter_containers(self, containers):
        """Rebuild the set of container names rejected by the exclude/include rules.

        Does nothing, leaving the previous set untouched, when filtering is
        not enabled in docker_util.
        """
        if not self.docker_util.filtering_enabled:
            return

        rejected = set()
        self._filtered_containers = rejected
        for container in containers:
            # The exclude/include patterns live in docker_util so that other
            # container-related checks can share them
            if not self.docker_util.are_tags_filtered(self._get_tags(container, FILTERED)):
                continue
            name = DockerUtil.container_name_extractor(container)[0]
            rejected.add(name)
            self.log.debug("Container {0} is filtered".format(name))

    def _is_container_excluded(self, container):
        """Tell whether the filter rules exclude this container.

        Relies on the set built by _filter_containers, which must run first.
        """
        names = DockerUtil.container_name_extractor(container)
        return names[0] in self._filtered_containers

    def _report_container_size(self, containers_by_id):
        """Submit the size_rw / size_rootfs metrics of each non-excluded container.

        A metric is only sent when the matching field is present on the container.
        """
        size_metrics = (
            ("SizeRw", 'docker.container.size_rw'),
            ("SizeRootFs", 'docker.container.size_rootfs'),
        )
        for container in containers_by_id.itervalues():
            if self._is_container_excluded(container):
                continue

            container_tags = self._get_tags(container, PERFORMANCE)
            submit = FUNC_MAP[GAUGE][self.use_histogram]
            for field, metric in size_metrics:
                if field in container:
                    submit(self, metric, container[field], tags=container_tags)

    def _send_container_healthcheck_sc(self, containers_by_id):
        """Send a health service check for every container matching the whitelist.

        A container matches as soon as one of its HEALTHCHECK tags matches one
        of the whitelist patterns; the service check is submitted once per container.
        """
        for container in containers_by_id.itervalues():
            candidate_tags = self._get_tags(container, HEALTHCHECK)
            whitelisted = any(
                re.match(pattern, tag)
                for tag in candidate_tags
                for pattern in self.whitelist_patterns
            )
            if whitelisted:
                self._submit_healthcheck_sc(container)

    def _submit_healthcheck_sc(self, container):
        """Submit the health service check of one container.

        The container's 'health' entry maps 'unhealthy' to CRITICAL and
        'healthy' to OK; anything else, or no health data, gives UNKNOWN.
        """
        status_by_health = {
            'unhealthy': AgentCheck.CRITICAL,
            'healthy': AgentCheck.OK,
        }

        health = container.get('health', {})
        reported = health.get('Status', '') if health else ''
        status = status_by_health.get(reported, AgentCheck.UNKNOWN)

        self.service_check(HEALTHCHECK_SERVICE_CHECK_NAME, status,
                           tags=self._get_tags(container, CONTAINER))

    def _report_container_count(self, containers_by_id):
        """Submit the number of non-excluded containers in each state.

        Containers whose 'State' is missing or empty are not reported.
        """
        submit = FUNC_MAP[GAUGE][self.use_histogram]

        state_counts = defaultdict(int)
        for ctr in containers_by_id.values():
            if self._is_container_excluded(ctr):
                continue
            state_counts[ctr.get('State', '')] += 1

        for state, count in state_counts.items():
            if not state:
                continue
            submit(self, 'docker.container.count', count, tags=['container_state:%s' % state.lower()])

    def _report_volume_count(self):
        """Submit the number of attached and dangling volumes."""
        submit = FUNC_MAP[GAUGE][self.use_histogram]

        # (tag, value of the 'dangling' filter), attached first
        volume_states = (
            ('volume_state:attached', False),
            ('volume_state:dangling', True),
        )
        listings = [
            self.docker_util.client.volumes(filters={'dangling': dangling})
            for _, dangling in volume_states
        ]
        # 'Volumes' may be absent or None, both count as zero
        counts = [len(listing.get('Volumes', []) or []) for listing in listings]

        for (state_tag, _), count in zip(volume_states, counts):
            submit(self, 'docker.volume.count', count, tags=[state_tag])

    def _report_image_size(self, images):
        """Gauge the virtual size and the size of each image, when the field is present."""
        size_metrics = (
            ('VirtualSize', 'docker.image.virtual_size'),
            ('Size', 'docker.image.size'),
        )
        for image in images:
            image_tags = self._get_tags(image, IMAGE)
            for field, metric in size_metrics:
                if field in image:
                    self.gauge(metric, image[field], tags=image_tags)

    # Performance metrics

    def _report_performance_metrics(self, containers_by_id):
        """Report cgroup and network metrics for each running, non-excluded container.

        Containers without a "_proc_root" entry get no network metrics; their
        names are gathered and reported in a single message at the end
        (a warning, or a debug log on kubernetes).
        """
        missing_proc_root = []
        for cid, container in containers_by_id.iteritems():
            if self._is_container_excluded(container):
                continue
            if not self._is_container_running(container):
                continue

            perf_tags = self._get_tags(container, PERFORMANCE)

            try:
                self._report_cgroup_metrics(container, perf_tags)
                if "_proc_root" in container:
                    self._report_net_metrics(container, perf_tags)
                else:
                    missing_proc_root.append(DockerUtil.container_name_extractor(container)[0])
            except BogusPIDException as err:
                self.log.warning('Unable to report cgroup metrics for container %s: %s', cid[:12], err)

        if not missing_proc_root:
            return

        message = "Couldn't find pid directory for containers: {0}. They'll be missing network metrics".format(
            ", ".join(missing_proc_root))
        if Platform.is_k8s():
            # Somewhat expected on kubernetes, where the kubernetes integration
            # collects the network metrics anyway
            self.log.debug(message)
        else:
            self.warning(message)

    def _report_cgroup_metrics(self, container, tags):
        """Submit the metrics read from the container's cgroup stat files.

        For each entry of CGROUP_METRICS the stat file is located from the
        container's pid and parsed; the direct metrics and then the computed
        ones ('to_compute') are submitted with the given tags.
        Raises BogusPIDException when the container has no usable '_pid'.
        """
        if not container.get('_pid'):
            raise BogusPIDException('Cannot report on bogus pid(0)')

        missing_stat_files = 0
        for cgroup in CGROUP_METRICS:
            try:
                stat_file = self._get_cgroup_from_proc(cgroup["cgroup"], container['_pid'], cgroup['file'])
            except MountException as err:
                # No stat file could be found for this cgroup
                self.warning(str(err))
                missing_stat_files += 1
                if missing_stat_files >= len(CGROUP_METRICS):
                    self.warning("Couldn't find the cgroup files. Skipping the CGROUP_METRICS for now.")
                continue
            except IOError as err:
                self.log.debug("Cannot read cgroup file, container likely raced to finish : %s", err)
                continue

            stats = self._parse_cgroup_file(stat_file)
            if not stats:
                continue

            # Metrics taken directly from the stat file
            for stat_key, (metric_name, metric_type) in cgroup['metrics'].iteritems():
                submit = FUNC_MAP[metric_type][self.use_histogram]
                if stat_key in stats:
                    submit(self, metric_name, int(stats[stat_key]), tags=tags)

            # Metrics computed from several stat values
            for metric_name, (stat_keys, compute, metric_type) in cgroup.get('to_compute', {}).iteritems():
                operands = [stats[name] for name in stat_keys if name in stats]
                if len(operands) != len(stat_keys):
                    self.log.debug("Couldn't compute {0}, some keys were missing.".format(metric_name))
                    continue
                value = compute(*operands)
                submit = FUNC_MAP[metric_type][self.use_histogram]
                if value is not None:
                    submit(self, metric_name, value, tags=tags)

    def _report_net_metrics(self, container, tags):
        """Report per-interface byte counters read from ``<_proc_root>/net/dev``.

        The interface-to-docker-network mapping is cached per container id in
        ``self.network_mappings``; when it cannot be built, ``eth0 -> bridge`` is
        used. Does nothing when network metrics are disabled.
        """
        if self._disable_net_metrics:
            self.log.debug("Network metrics are disabled. Skipping")
            return

        proc_net_file = os.path.join(container['_proc_root'], 'net/dev')

        try:
            cid = container['Id']
            if cid not in self.network_mappings:
                discovered = self.docker_util.get_container_network_mapping(container)
                self.network_mappings[cid] = discovered or {'eth0': 'bridge'}
            networks = self.network_mappings[cid]
        except Exception as e:
            # Revert to previous behaviour if the method is missing or failing
            # Debug message will only appear once per container, then the cache is used
            self.log.debug("Failed to build docker network mapping, using failsafe. Exception: {0}".format(e))
            networks = {'eth0': 'bridge'}
            self.network_mappings[container['Id']] = networks

        try:
            with open(proc_net_file, 'r') as fp:
                lines = fp.readlines()

            # Two first lines are headers:
            # Inter-|   Receive                                                |  Transmit
            #  face |bytes    packets errs drop fifo frame compressed multicast|bytes    packets errs drop fifo colls carrier compressed
            for raw_line in lines[2:]:
                cols = raw_line.split(':', 1)
                interface_name = str(cols[0]).strip()
                if interface_name not in networks:
                    continue
                net_tags = tags + ['docker_network:'+networks[interface_name]]
                counters = cols[1].split()
                m_func = FUNC_MAP[RATE][self.use_histogram]
                # column 0 is received bytes, column 8 is transmitted bytes
                m_func(self, "docker.net.bytes_rcvd", long(counters[0]), net_tags)
                m_func(self, "docker.net.bytes_sent", long(counters[8]), net_tags)

        except Exception as e:
            # It is possible that the container got stopped between the API call and now
            self.warning("Failed to report IO metrics from file {0}. Exception: {1}".format(proc_net_file, e))

    def _invalidate_network_mapping_cache(self, api_events):
        for ev in api_events:
            try:
                if ev.get('Type') == 'network' and ev.get('Action').endswith('connect'):
                    container_id = ev.get('Actor').get('Attributes').get('container')
                    if container_id in self.network_mappings:
                        self.log.debug("Removing network mapping cache for container %s" % container_id)
                        del self.network_mappings[container_id]
            except Exception:
                self.log.warning('Malformed network event: %s' % str(ev))

    def _process_events(self, containers_by_id):
        """Fetch docker events, then report exit codes and/or submit agent events.

        Exit codes are reported when ``collect_exit_codes`` is set; events are
        aggregated, formatted and submitted when ``collect_events`` is set.
        A failure while building the events is reported as a warning and no
        event is submitted.
        """
        api_events = self._get_events()

        if self.collect_exit_codes:
            self._report_exit_codes(api_events, containers_by_id)

        if not self.collect_events:
            return

        try:
            aggregated = self._pre_aggregate_events(api_events, containers_by_id)
            formatted = self._format_events(aggregated, containers_by_id)
        except (socket.timeout, urllib2.URLError):
            self.warning('Timeout when collecting events. Events will be missing.')
            return
        except Exception as e:
            self.warning("Unexpected exception when collecting events: {0}. "
                         "Events will be missing".format(e))
            return

        for dd_event in formatted:
            self.log.debug("Creating event: %s" % dd_event['msg_title'])
            self.event(dd_event)

    def _get_events(self):
        """Return the docker events and propagate container changes to the caches.

        The network mapping cache is invalidated unless network metrics are
        disabled. When containers changed, service discovery checks are updated
        (if service discovery is enabled) and the metadata collector cache is
        invalidated.
        """
        events, changed_ids = self.docker_util.get_events()

        if not self._disable_net_metrics:
            self._invalidate_network_mapping_cache(events)

        if changed_ids:
            if self._service_discovery:
                get_sd_backend(self.agentConfig).update_checks(changed_ids)
            self.metadata_collector.invalidate_cache(events)

        return events

    def _pre_aggregate_events(self, api_events, containers_by_id):
        # Aggregate events, one per image. Put newer events first.
        events = defaultdict(deque)
        for event in api_events:
            # Skip events related to filtered containers
            container = containers_by_id.get(event.get('id'))
            if container is not None and self._is_container_excluded(container):
                self.log.debug("Excluded event: container {0} status changed to {1}".format(
                    event['id'], event['status']))
                continue
            # from may be missing (for network events for example)
            if 'from' in event:
                image_name = event['from']
                if image_name.startswith('sha256:'):
                    image_name = self.docker_util.image_name_extractor({'Image': image_name})
                events[image_name].appendleft(event)
        return events

    def _format_events(self, aggregated_events, containers_by_id):
        """Turn events grouped by image into a list of agent events, one per image.

        Events whose status starts with one of ``self.filtered_event_types`` are
        ignored. Tags are collected from the containers still known in
        ``containers_by_id``; for the others the first 11 characters of the
        event id are used as the container name.
        """
        formatted = []
        for image_name, event_group in aggregated_events.iteritems():
            tags = set()
            skipped = 0
            kept = []

            for event in event_group:
                # Only keep events that are not configured to be filtered out
                if event['status'].startswith(self.filtered_event_types):
                    skipped += 1
                    continue

                event_id = event['id']
                container_name = event_id[:11]

                if event_id in containers_by_id:
                    cont = containers_by_id[event_id]
                    container_name = DockerUtil.container_name_extractor(cont)[0]
                    tags.update(self._get_tags(cont, PERFORMANCE))
                    tags.add('container_name:%s' % container_name)
                    # Add additional docker event attributes as tags
                    for attr in self.event_attributes_as_tags:
                        if attr in event['Actor']['Attributes'] and attr not in EXCLUDED_ATTRIBUTES:
                            tags.add('%s:%s' % (attr, event['Actor']['Attributes'][attr]))

                kept.append((event, container_name))

            if skipped:
                self.log.debug('%d events were filtered out because of ignored event type' % skipped)

            dd_event = self._create_dd_event(kept, image_name, tags, priority='Normal')
            if dd_event:
                formatted.append(dd_event)

        return formatted

    def _report_exit_codes(self, events, containers_by_id):
        for event in events:
            container_tags = set()
            container = containers_by_id.get(event.get('id'))
            # Skip events related to filtered containers
            if container is not None and self._is_container_excluded(container):
                continue

            # Report the exit code in case of a DIE event
            if container is not None and event['status'] == 'die':
                container_name = DockerUtil.container_name_extractor(container)[0]
                container_tags.update(self._get_tags(container, CONTAINER))
                container_tags.add('container_name:%s' % container_name)
                try:
                    exit_code = int(event['Actor']['Attributes']['exitCode'])
                    message = 'Container %s exited with %s' % (container_name, exit_code)
                    status = AgentCheck.OK if exit_code == 0 else AgentCheck.CRITICAL
                    self.service_check(EXIT_SERVICE_CHECK_NAME, status, tags=list(container_tags), message=message)
                except KeyError:
                    self.log.warning('Unable to collect the exit code for container %s' % container_name)

    def _create_dd_event(self, events, image, c_tags, priority='Normal'):
        """Create the actual event to submit from a list of similar docker events"""
        if not events:
            return

        max_timestamp = 0
        status = defaultdict(int)
        status_change = []

        for ev, c_name in events:
            max_timestamp = max(max_timestamp, int(ev['time']))
            status[ev['status']] += 1
            status_change.append([c_name, ev['status']])

        status_text = ", ".join(["%d %s" % (count, st) for st, count in status.iteritems()])
        msg_title = "%s %s on %s" % (image, status_text, self.hostname)
        msg_body = (
            "%%%\n"
            "{image_name} {status} on {hostname}\n"
            "```\n{status_changes}\n```\n"
            "%%%"
        ).format(
            image_name=image,
            status=status_text,
            hostname=self.hostname,
            status_changes="\n".join(
                ["%s \t%s" % (change[1].upper(), change[0]) for change in status_change])
        )

        if any(error in status_text for error in ERROR_ALERT_TYPE):
            alert_type = "error"
        else:
            alert_type = None

        return {
            'timestamp': max_timestamp,
            'host': self.hostname,
            'event_type': EVENT_TYPE,
            'msg_title': msg_title,
            'msg_text': msg_body,
            'source_type_name': EVENT_TYPE,
            'event_object': 'docker:%s' % image,
            'tags': list(c_tags),
            'alert_type': alert_type,
            'priority': priority
        }

    def _report_disk_stats(self):
        """Report gauges about the storage driver space usage.

        Values come from the ``DriverStatus`` entries of ``docker info`` whose
        label mentions ``Space``. Nothing is reported when the driver exposes
        no status entry.
        """
        stats = {
            'docker.data.used': None,
            'docker.data.total': None,
            'docker.data.free': None,
            'docker.metadata.used': None,
            'docker.metadata.total': None,
            'docker.metadata.free': None
            # these two are calculated by _calc_percent_disk_stats
            # 'docker.data.percent': None,
            # 'docker.metadata.percent': None
        }
        driver_status = self.docker_util.client.info().get('DriverStatus', [])
        if not driver_status:
            self.log.warning('Disk metrics collection is enabled but docker info did not'
                             ' report any. Your storage driver might not support them, skipping.')
            return

        for entry in driver_status:
            # only consider metrics about disk space
            if not (len(entry) == 2 and 'Space' in entry[0]):
                continue
            label = entry[0]

            # identify Data and Metadata metrics
            mtype = 'metadata' if 'Metadata' in label else 'data'

            if 'Used' in label:
                suffix = 'used'
            elif 'Space Total' in label:
                suffix = 'total'
            elif 'Space Available' in label:
                suffix = 'free'
            else:
                continue
            stats['docker.{0}.{1}'.format(mtype, suffix)] = entry[1]

        stats = self._format_disk_metrics(stats)
        stats.update(self._calc_percent_disk_stats(stats))
        tags = self._get_tags()
        for name, val in stats.iteritems():
            if val is None:
                continue
            self.gauge(name, val, tags)

    def _format_disk_metrics(self, metrics):
        """Cast the disk stats to float and convert them to bytes.

        Every non-empty raw value is split into a number and a unit with
        DISK_STATS_RE, then scaled with UNIT_MAP and stored as an int.
        Values that cannot be parsed, or whose unit is unknown, are replaced
        by None. ``metrics`` is updated in place and returned.
        """
        for name, raw_val in metrics.iteritems():
            if not raw_val:
                continue

            match = DISK_STATS_RE.search(raw_val)
            if match is None or len(match.groups()) != 2:
                self.log.warning('Can\'t parse value %s for disk metric %s. Dropping it.' % (raw_val, name))
                metrics[name] = None
                # Move on to the next metric: calling match.groups() on a failed
                # match would raise instead of dropping the value.
                continue

            val, unit = match.groups()
            # by default some are uppercased others lowercased. That's error prone.
            unit = unit.lower()
            try:
                metrics[name] = int(float(val) * UNIT_MAP[unit])
            except KeyError:
                self.log.error('Unrecognized unit %s for disk metric %s. Dropping it.' % (unit, name))
                metrics[name] = None
        return metrics

    def _calc_percent_disk_stats(self, stats):
        """Calculate a percentage of used disk space for data and metadata"""
        mtypes = ['data', 'metadata']
        percs = {}
        for mtype in mtypes:
            used = stats.get('docker.{0}.used'.format(mtype))
            total = stats.get('docker.{0}.total'.format(mtype))
            free = stats.get('docker.{0}.free'.format(mtype))
            if used and total and free and ceil(total) < free + used:
                self.log.debug('used, free, and total disk metrics may be wrong, '
                               'used: %s, free: %s, total: %s',
                               used, free, total)
                total = used + free
            try:
                if isinstance(used, int):
                    percs['docker.{0}.percent'.format(mtype)] = round(100 * float(used) / float(total), 2)
                elif isinstance(free, int):
                    percs['docker.{0}.percent'.format(mtype)] = round(100 * (1.0 - (float(free) / float(total))), 2)
            except ZeroDivisionError:
                self.log.error('docker.{0}.total is 0, calculating docker.{1}.percent'
                               ' is not possible.'.format(mtype, mtype))
        return percs

    # Cgroups
    def _get_cgroup_from_proc(self, cgroup, pid, filename):
        """Return the path of ``filename`` in the given cgroup of process ``pid``.

        The path template comes from ``DockerUtil.find_cgroup_from_proc`` and its
        ``%(file)s`` placeholder is filled with ``filename``.
        """
        path_template = DockerUtil.find_cgroup_from_proc(
            self._mountpoints, pid, cgroup, self.docker_util._docker_root)
        return path_template % {"file": filename}

    def _parse_cgroup_file(self, stat_file):
        """Parse a cgroup pseudo file for key/values."""
        self.log.debug("Opening cgroup file: %s" % stat_file)
        try:
            with open(stat_file, 'r') as fp:
                if 'blkio' in stat_file:
                    return self._parse_blkio_metrics(fp.read().splitlines())
                elif 'cpuacct.usage' in stat_file:
                    return dict({'usage': str(int(fp.read())/10000000)})
                elif 'memory.soft_limit_in_bytes' in stat_file:
                    value = int(fp.read())
                    # do not report kernel max default value (uint64 * 4096)
                    # see https://github.com/torvalds/linux/blob/5b36577109be007a6ecf4b65b54cbc9118463c2b/mm/memcontrol.c#L2844-L2845
                    # 2 ** 60 is kept for consistency of other cgroups metrics
                    if value < 2 ** 60:
                        return dict({'softlimit': value})
                else:
                    return dict(map(lambda x: x.split(' ', 1), fp.read().splitlines()))
        except IOError:
            # It is possible that the container got stopped between the API call and now.
            # Some files can also be missing (like cpu.stat) and that's fine.
            self.log.debug("Can't open %s. Its metrics will be missing." % stat_file)

    def _parse_blkio_metrics(self, stats):
        """Parse the blkio metrics."""
        metrics = {
            'io_read': 0,
            'io_write': 0,
        }
        for line in stats:
            if 'Read' in line:
                metrics['io_read'] += int(line.split()[2])
            if 'Write' in line:
                metrics['io_write'] += int(line.split()[2])
        return metrics

    def _is_container_cgroup(self, line, selinux_policy):
        if line[1] not in ('cpu,cpuacct', 'cpuacct,cpu', 'cpuacct') or line[2] == '/docker-daemon':
            return False
        if 'docker' in line[2]:  # general case
            return True
        if 'docker' in selinux_policy:  # selinux
            return True
        if line[2].startswith('/') and re.match(CONTAINER_ID_RE, line[2][1:]):  # kubernetes
            return True
        if line[2].startswith('/') and re.match(CONTAINER_ID_RE, line[2].split('/')[-1]): # kube 1.6+ qos hierarchy
            return True
        return False

    # proc files
    def _crawl_container_pids(self, container_dict, custom_cgroups=False):
        """Crawl `/proc` to find container PIDs and add them to `containers_by_id`.

        For every numeric directory under ``<docker_root>/proc`` the ``cgroup``
        file (and the SELinux ``attr/current`` file when present) is read to find
        out which container the process belongs to. Matching containers get
        their ``_pid`` and ``_proc_root`` entries set. With ``custom_cgroups``,
        a container whose ``_pid`` equals the directory name gets ``_proc_root``
        set even though no container id was found in the cgroup path.

        Network metrics are disabled when no pid directory is found.
        Returns ``container_dict``, which is updated in place.
        """
        proc_path = os.path.join(self.docker_util._docker_root, 'proc')
        pid_dirs = [_dir for _dir in os.listdir(proc_path) if _dir.isdigit()]

        if not pid_dirs:
            self.warning("Unable to find any pid directory in {0}. "
                         "If you are running the agent in a container, make sure to "
                         'share the volume properly: "/proc:/host/proc:ro". '
                         "See https://github.com/serverdensity/docker-sd-agent/blob/master/README.md for more information. "
                         "Network metrics will be missing".format(proc_path))
            self._disable_net_metrics = True
            return container_dict

        self._disable_net_metrics = False

        for folder in pid_dirs:
            # Reset per process, so that a failed read never reuses the data
            # gathered for the previous pid.
            content = None
            selinux_policy = ''
            try:
                path = os.path.join(proc_path, folder, 'cgroup')
                with open(path, 'r') as f:
                    content = [line.strip().split(':') for line in f.readlines()]

                path = os.path.join(proc_path, folder, 'attr', 'current')
                if os.path.exists(path):
                    with open(path, 'r') as f:
                        selinux_policy = f.readlines()[0]
            except IOError as e:
                #  Issue #2074
                self.log.debug("Cannot read %s, process likely raced to finish : %s", path, e)
            except Exception as e:
                self.warning("Cannot read %s : %s" % (path, str(e)))
                continue

            if content is None:
                # The cgroup file itself could not be read: nothing to match on.
                # (A failure on attr/current only leaves selinux_policy empty.)
                continue

            try:
                for line in content:
                    if self._is_container_cgroup(line, selinux_policy):
                        cpuacct = line[2]
                        break
                else:
                    continue

                matches = re.findall(CONTAINER_ID_RE, cpuacct)
                if matches:
                    container_id = matches[-1]
                    if container_id not in container_dict:
                        self.log.debug(
                            "Container %s not in container_dict, it's likely excluded", container_id
                        )
                        continue
                    container_dict[container_id]['_pid'] = folder
                    container_dict[container_id]['_proc_root'] = os.path.join(proc_path, folder)
                elif custom_cgroups:  # if we match by pid that should be enough (?) - O(n) ugh!
                    for _, container in container_dict.iteritems():
                        if container.get('_pid') == int(folder):
                            container['_proc_root'] = os.path.join(proc_path, folder)
                            break

            except Exception as e:
                self.warning("Cannot parse %s content: %s" % (path, str(e)))
                continue

        # Same return value as the early exit above
        return container_dict
예제 #6
0
class SDDockerBackend(AbstractSDBackend):
    """Docker-based service discovery"""
    def __init__(self, agentConfig):
        """Set up the config store, docker/kubernetes clients and template variables.

        When the config store client cannot be created, ``sd_config_backend``
        is reset to None in ``agentConfig`` and the store is created again.
        ``self.kubeutil`` stays None outside Kubernetes or when the kubernetes
        client cannot be instantiated.
        """
        try:
            store = get_config_store(agentConfig=agentConfig)
        except Exception as e:
            log.error('Failed to instantiate the config store client. '
                      'Auto-config only will be used. %s' % str(e))
            agentConfig['sd_config_backend'] = None
            store = get_config_store(agentConfig=agentConfig)
        self.config_store = store

        self.dockerutil = DockerUtil(config_store=self.config_store)

        self.kubeutil = None
        if Platform.is_k8s():
            try:
                self.kubeutil = KubeUtil()
            except Exception as ex:
                error_text = str(ex)
                log.error(
                    "Couldn't instantiate the kubernetes client, "
                    "subsequent kubernetes calls will fail as well. Error: %s"
                    % error_text)

        self.metadata_collector = MetadataCollector()

        # Template variable name -> method resolving its value for a container
        self.VAR_MAPPING = dict([
            ('host', self._get_host_address),
            ('pid', self._get_container_pid),
            ('port', self._get_port),
            ('container-name', self._get_container_name),
            ('tags', self._get_additional_tags),
        ])

        AbstractSDBackend.__init__(self, agentConfig)

    def _make_fetch_state(self):
        """Create the fetch state object shared by the lookups of one run.

        On Kubernetes the pod list is retrieved from the kubelet; it is left
        empty when the client is not initialized or the request fails.
        """
        pods = []
        if Platform.is_k8s():
            kube = self.kubeutil
            if kube and kube.init_success:
                try:
                    pods = kube.retrieve_pods_list().get('items', [])
                except Exception as ex:
                    log.warning("Failed to retrieve pod list: %s" % str(ex))
            else:
                log.error(
                    "kubelet client not initialized, cannot retrieve pod list."
                )

        inspect_func = self.dockerutil.client.inspect_container
        return _SDDockerBackendConfigFetchState(inspect_func, pods)

    def update_checks(self, changed_containers):
        """
        Takes a list of container IDs that changed recently
        and marks their corresponding checks as needing a reload.

        The names of the checks to reload are stored as a set in
        ``self.reload_check_configs``. Nothing is done when the docker client
        is not initialized.
        """
        if not self.dockerutil.client:
            log.warning(
                "Docker client is not initialized, pausing auto discovery.")
            return

        state = self._make_fetch_state()

        conf_reload_set = set()
        for c_id in changed_containers:
            checks = self._get_checks_to_refresh(state, c_id)
            if checks:
                conf_reload_set.update(checks)

        # _get_checks_to_refresh sets reload_check_configs to True when every
        # check has to be reloaded; a partial set must not narrow that request.
        if conf_reload_set and getattr(self, 'reload_check_configs', False) is not True:
            self.reload_check_configs = conf_reload_set

    def _get_checks_to_refresh(self, state, c_id):
        """Get the list of checks applied to a container from the identifier_to_checks cache in the config store.
        Use the STACKSTATE_ID label or the image.

        Returns None, after setting ``self.reload_check_configs`` to True, when
        the affected checks cannot be determined.
        """
        inspect = state.inspect_container(c_id)

        # If the container was removed we can't tell which check is concerned
        # so we have to reload everything.
        # Same thing if it's stopped and we're on Kubernetes in auto_conf mode
        # because the pod was deleted and its template could have been in the annotations.
        reload_everything = not inspect
        if not reload_everything:
            is_running = inspect.get('State', {}).get('Running')
            reload_everything = (not is_running
                                 and Platform.is_k8s()
                                 and not self.agentConfig.get('sd_config_backend'))
        if reload_everything:
            self.reload_check_configs = True
            return

        labels = inspect.get('Config', {}).get('Labels', {})
        identifier = labels.get(STACKSTATE_ID)
        if not identifier:
            identifier = self.dockerutil.image_name_extractor(inspect)

        platform_kwargs = {}
        if Platform.is_k8s():
            kube_metadata = state.get_kube_config(c_id, 'metadata') or {}
            platform_kwargs = {
                'kube_annotations': kube_metadata.get('annotations'),
                'kube_container_name': state.get_kube_container_name(c_id),
            }
        if labels:
            platform_kwargs['docker_labels'] = labels

        return self.config_store.get_checks_to_refresh(identifier, **platform_kwargs)

    def _get_container_pid(self, state, cid, tpl_var):
        """Extract the host-namespace pid of the container pid 0"""
        pid = state.inspect_container(cid).get('State', {}).get('Pid')
        if not pid:
            return None

        return str(pid)

    def _get_host_address(self, state, c_id, tpl_var):
        """Extract the container IP from a docker inspect object, or the kubelet API.

        Sources are tried in this order: the per-network addresses of
        ``NetworkSettings.Networks`` (``tpl_var`` may name the network to use),
        ``NetworkSettings.IPAddress``, the pod IP on Kubernetes and the Rancher
        IP label on Rancher. Returns None when no address is found.
        """
        c_inspect = state.inspect_container(c_id)
        c_id = c_inspect.get('Id', '')
        c_img = self.dockerutil.image_name_extractor(c_inspect)

        networks = c_inspect.get('NetworkSettings', {}).get('Networks') or {}
        ip_dict = {}
        for net_name, net_desc in networks.iteritems():
            ip = net_desc.get('IPAddress')
            if ip:
                ip_dict[net_name] = ip
        ip_addr = self._extract_ip_from_networks(ip_dict, tpl_var)
        if ip_addr:
            return ip_addr

        # try to get the bridge (default) IP address
        log.debug("No IP address was found in container %s (%s) "
                  "networks, trying with the IPAddress field" %
                  (c_id[:12], c_img))
        ip_addr = c_inspect.get('NetworkSettings', {}).get('IPAddress')
        if ip_addr:
            return ip_addr

        if Platform.is_k8s():
            # kubernetes case
            log.debug("Couldn't find the IP address for container %s (%s), "
                      "using the kubernetes way." % (c_id[:12], c_img))
            # The pod status may be unavailable (None) for this container
            pod_status = state.get_kube_config(c_id, 'status') or {}
            pod_ip = pod_status.get('podIP')
            if pod_ip:
                return pod_ip

        if Platform.is_rancher():
            # try to get the rancher IP address
            log.debug("No IP address was found in container %s (%s) "
                      "trying with the Rancher label" % (c_id[:12], c_img))

            ip_addr = c_inspect.get('Config',
                                    {}).get('Labels',
                                            {}).get(RANCHER_CONTAINER_IP)
            if ip_addr:
                # the label value may carry a '/<suffix>' part, keep the address only
                return ip_addr.split('/')[0]

        log.error("No IP address was found for container %s (%s)" %
                  (c_id[:12], c_img))
        return None

    def _extract_ip_from_networks(self, ip_dict, tpl_var):
        """Extract a single IP from a dictionary made of network names and IPs."""
        if not ip_dict:
            return None
        tpl_parts = tpl_var.split('_', 1)

        # no specifier
        if len(tpl_parts) < 2:
            log.debug("No key was passed for template variable %s." % tpl_var)
            return self._get_fallback_ip(ip_dict)
        else:
            res = ip_dict.get(tpl_parts[-1])
            if res is None:
                log.warning(
                    "The key passed for template variable %s was not found." %
                    tpl_var)
                return self._get_fallback_ip(ip_dict)
            else:
                return res

    def _get_fallback_ip(self, ip_dict):
        """Return the IP of the ``bridge`` network, else the IP of the last network name in sorted order."""
        if 'bridge' not in ip_dict:
            last_key = sorted(ip_dict.iterkeys())[-1]
            log.debug("Trying with the last (sorted) network: '%s'." %
                      last_key)
            return ip_dict[last_key]

        log.debug("Using the bridge network.")
        return ip_dict['bridge']

    def _get_port(self, state, c_id, tpl_var):
        """Extract a port from a container_inspect or the k8s API given a template variable.

        Ports are looked up in ``NetworkSettings.Ports`` first, then in
        ``Config.ExposedPorts`` and finally, on Kubernetes, in the container
        spec. They are sorted numerically before ``tpl_var`` selects one.
        """
        container_inspect = state.inspect_container(c_id)

        try:
            ports = [key.split('/')[0]
                     for key in container_inspect['NetworkSettings']['Ports'].keys()]
            if not ports:
                # There might be a key Port in NetworkSettings but no ports
                # so we raise IndexError to check in ExposedPorts
                raise IndexError
        except (IndexError, KeyError, AttributeError):
            # try to get ports from the docker API. Works if the image has an EXPOSE instruction
            exposed = container_inspect['Config'].get('ExposedPorts', {})
            ports = [key.split('/')[0] for key in exposed.keys()]

            # if it failed, try with the kubernetes API
            if not ports and Platform.is_k8s():
                image = container_inspect.get('Config', {}).get('Image', '')
                log.debug(
                    "Didn't find the port for container %s (%s), trying the kubernetes way."
                    % (c_id[:12], image))
                spec = state.get_kube_container_spec(c_id)
                if spec:
                    ports = [
                        str(port_desc.get('containerPort'))
                        for port_desc in spec.get('ports', [])
                    ]

        return self._extract_port_from_list(sorted(ports, key=int), tpl_var)

    def _extract_port_from_list(self, ports, tpl_var):
        if not ports:
            return None

        tpl_parts = tpl_var.split('_', 1)

        if len(tpl_parts) == 1:
            log.debug("No index was passed for template variable %s. "
                      "Trying with the last element." % tpl_var)
            return ports[-1]

        try:
            idx = tpl_parts[-1]
            return ports[int(idx)]
        except ValueError:
            log.error(
                "Port index is not an integer. Using the last element instead."
            )
        except IndexError:
            log.error(
                "Port index is out of range. Using the last element instead.")
        return ports[-1]

    def get_tags(self, state, c_id):
        """Extract useful tags from docker or platform APIs. These are collected by default.

        Starts from the tags extracted from the container inspect, then adds
        platform-specific tags (Kubernetes, Swarm or Rancher) and finally the
        tags provided by the metadata collector when it has detected an
        orchestrator. Returns the list of tags.
        """
        c_inspect = state.inspect_container(c_id)
        tags = self.dockerutil.extract_container_tags(c_inspect)

        if Platform.is_k8s():
            # self.kubeutil is None when the client could not be instantiated;
            # only dereference it when it exists.
            if self.kubeutil is not None and not self.kubeutil.init_success:
                log.warning(
                    "kubelet client not initialized, kubernetes tags will be missing."
                )
                return tags

            pod_metadata = state.get_kube_config(c_id, 'metadata')

            if pod_metadata is None:
                log.warning("Failed to fetch pod metadata for container %s."
                            " Kubernetes tags will be missing." % c_id[:12])
                return tags

            # get pod labels
            kube_labels = pod_metadata.get('labels', {})
            for label, value in kube_labels.iteritems():
                tags.append('%s:%s' % (label, value))

            # get kubernetes namespace
            namespace = pod_metadata.get('namespace')
            tags.append('kube_namespace:%s' % namespace)

            if not self.kubeutil:
                log.warning("The agent can't connect to kubelet, creator and "
                            "service tags will be missing for container %s." %
                            c_id[:12])
            else:
                # add creator tags
                creator_tags = self.kubeutil.get_pod_creator_tags(pod_metadata)
                tags.extend(creator_tags)

                # add services tags
                if self.kubeutil.collect_service_tag:
                    services = self.kubeutil.match_services_for_pod(
                        pod_metadata)
                    for s in services:
                        if s is not None:
                            tags.append('kube_service:%s' % s)

        elif Platform.is_swarm():
            c_labels = c_inspect.get('Config', {}).get('Labels', {})
            swarm_svc = c_labels.get(SWARM_SVC_LABEL)
            if swarm_svc:
                tags.append('swarm_service:%s' % swarm_svc)

        elif Platform.is_rancher():
            service_name = c_inspect.get('Config',
                                         {}).get('Labels',
                                                 {}).get(RANCHER_SVC_NAME)
            stack_name = c_inspect.get('Config',
                                       {}).get('Labels',
                                               {}).get(RANCHER_STACK_NAME)
            container_name = c_inspect.get('Config', {}).get(
                'Labels', {}).get(RANCHER_CONTAINER_NAME)
            if service_name:
                tags.append('rancher_service:%s' % service_name)
            if stack_name:
                tags.append('rancher_stack:%s' % stack_name)
            if container_name:
                tags.append('rancher_container:%s' % container_name)

        if self.metadata_collector.has_detected():
            orch_tags = self.metadata_collector.get_container_tags(
                co=c_inspect)
            tags.extend(orch_tags)

        return tags

    def _get_container_name(self, state, c_id, tpl_var):
        container_inspect = state.inspect_container(c_id)
        return container_inspect.get('Name', '').lstrip('/')

    def _get_additional_tags(self, state, c_id, *args):
        """Build the optional Kubernetes tags (node, pod, container name) for a container.

        Returns an empty list outside Kubernetes or when the pod metadata or
        spec cannot be fetched.
        """
        if not Platform.is_k8s():
            return []

        pod_metadata = state.get_kube_config(c_id, 'metadata')
        pod_spec = state.get_kube_config(c_id, 'spec')
        if pod_metadata is None or pod_spec is None:
            log.warning(
                "Failed to fetch pod metadata or pod spec for container %s."
                " Additional Kubernetes tags may be missing." % c_id[:12])
            return []

        tags = [
            'node_name:%s' % pod_spec.get('nodeName'),
            'pod_name:%s' % pod_metadata.get('name'),
        ]

        docker_labels = state.inspect_container(c_id).get('Config', {}).get(
            'Labels', {})
        c_name = docker_labels.get(KubeUtil.CONTAINER_NAME_LABEL)
        if c_name:
            tags.append('kube_container_name:%s' % c_name)
        return tags

    def get_configs(self):
        """Build the check configurations for every docker container listed by the client.

        Returns a dict mapping each check name to
        ``(source, (init_config, [instances]))``. A container whose
        configuration cannot be built is logged and skipped.
        """
        configs = {}
        if not self.dockerutil.client:
            log.warning(
                "Docker client is not initialized, pausing auto discovery.")
            return configs

        state = self._make_fetch_state()
        extract_image = self.dockerutil.image_name_extractor
        containers = []
        for container in self.dockerutil.client.containers():
            containers.append((extract_image(container), container.get('Id'),
                               container.get('Labels')))

        conflict_init_msg = 'Different versions of `init_config` found for check {}. ' \
            'Keeping the first one found.'
        for image, cid, labels in containers:
            try:
                # value of the STACKSTATE_ID tag or the image name if the label is missing
                identifier = self.get_config_id(image, labels)
                check_configs = self._get_check_configs(
                    state, cid, identifier, labels) or []
                for source, (check_name, init_config,
                             instance) in check_configs:
                    # always handle instances as a list
                    if isinstance(instance, list):
                        instances = instance
                    else:
                        instances = [instance]

                    known = configs.get(check_name)
                    if known is None:
                        # first time this check shows up
                        configs[check_name] = (source, (init_config,
                                                        instances))
                        continue

                    kept_init_config, kept_instances = known[1]
                    if kept_init_config != init_config:
                        log.warning(conflict_init_msg.format(check_name))
                    kept_instances.extend(instances)
            except Exception:
                log.exception(
                    'Building config for container %s based on image %s using service '
                    'discovery failed, leaving it alone.' % (cid[:12], image))
        return configs

    def get_config_id(self, image, labels):
        """Look for a STACKSTATE_ID label, return its value or the image name if missing.

        :param image: image name of the container
        :param labels: docker labels of the container; may be None or empty
        :return: the label value when set and non-empty, else ``image``
        """
        if not labels:
            # no labels at all (None or empty dict): only the image can identify it
            return image
        return labels.get(STACKSTATE_ID) or image

    def _get_check_configs(self, state, c_id, identifier, labels=None):
        """Retrieve configuration templates and fill them with data pulled from docker and tags.

        :param state: fetch state used to query docker / kubernetes data
        :param c_id: ID of the container being configured
        :param identifier: key used to look up the config templates
        :param labels: docker labels of the container, forwarded to the
            template lookup when non-empty
        :return: list of ``(source, (check_name, init_config, instances))``
            tuples, or None when no template was found
        """
        platform_kwargs = {}
        if Platform.is_k8s():
            kube_metadata = state.get_kube_config(c_id, 'metadata') or {}
            platform_kwargs = {
                'kube_container_name': state.get_kube_container_name(c_id),
                'kube_annotations': kube_metadata.get('annotations'),
            }
        if labels:
            platform_kwargs['docker_labels'] = labels

        config_templates = self._get_config_templates(identifier,
                                                      **platform_kwargs)
        if not config_templates:
            return None

        check_configs = []
        tags = self.get_tags(state, c_id)
        for source, config_tpl in config_templates:
            check_name, init_config_tpl, instance_tpl, variables = config_tpl

            # covering mono-instance and multi-instances cases
            if isinstance(instance_tpl, list):
                tmpl_array = instance_tpl
            else:
                tmpl_array = [instance_tpl]

            # insert tags in each instance template and process values for template variables
            result_instances = []
            result_init_config = None
            for inst_tmpl in tmpl_array:
                filled_tpl, var_values = self._fill_tpl(
                    state, c_id, inst_tmpl, variables, tags)
                tpl = self._render_template(init_config_tpl or {}, filled_tpl
                                            or {}, var_values)
                if tpl and len(tpl) == 2:
                    init_config, instance = tpl
                    result_instances.append(instance)
                    if not result_init_config:
                        result_init_config = init_config
                    elif result_init_config != init_config:
                        # report the actual check name through the module
                        # logger, like every other message in this backend
                        log.warning(
                            "Different versions of `init_config` found for "
                            "check {}. Keeping the first one found.".format(
                                check_name))
            check_configs.append(
                (source, (check_name, result_init_config, result_instances)))

        return check_configs

    def _get_config_templates(self, identifier, **platform_kwargs):
        """Extract config templates for an identifier from a K/V store and returns it as a dict object."""
        config_backend = self.agentConfig.get('sd_config_backend')
        templates = []
        if config_backend is None:
            auto_conf = True
        else:
            auto_conf = False

        # format [(source, ('ident', {init_tpl}, {instance_tpl}))]
        raw_tpls = self.config_store.get_check_tpls(identifier,
                                                    auto_conf=auto_conf,
                                                    **platform_kwargs)
        for tpl in raw_tpls:
            # each template can come from either auto configuration or user-supplied templates
            try:
                source, (check_name, init_config_tpl, instance_tpl) = tpl
            except (TypeError, IndexError, ValueError):
                log.debug(
                    'No template was found for identifier %s, leaving it alone: %s'
                    % (identifier, tpl))
                return None
            try:
                # build a list of all variables to replace in the template
                variables = self.PLACEHOLDER_REGEX.findall(str(init_config_tpl)) + \
                    self.PLACEHOLDER_REGEX.findall(str(instance_tpl))
                variables = map(lambda x: x.strip('%'), variables)
                if not isinstance(init_config_tpl, dict):
                    init_config_tpl = json.loads(init_config_tpl or '{}')
                if not isinstance(instance_tpl, dict) and not isinstance(
                        instance_tpl, list):
                    instance_tpl = json.loads(instance_tpl or '{}')
            except json.JSONDecodeError:
                log.exception(
                    'Failed to decode the JSON template fetched for check {0}. Its configuration'
                    ' by service discovery failed for ident  {1}.'.format(
                        check_name, identifier))
                return None

            templates.append((source, (check_name, init_config_tpl,
                                       instance_tpl, variables)))

        return templates

    def _fill_tpl(self, state, c_id, instance_tpl, variables, c_tags=None):
        """Add container tags to instance templates and build a
           dict from template variable names and their values."""
        var_values = {}
        c_image = state.inspect_container(c_id).get('Config',
                                                    {}).get('Image', '')

        # add only default c_tags to the instance to avoid duplicate tags from conf
        if c_tags:
            tags = c_tags[:]  # shallow copy of the c_tags array
        else:
            tags = []
        if tags:
            tpl_tags = instance_tpl.get('tags', [])
            if isinstance(tpl_tags, dict):
                for key, val in tpl_tags.iteritems():
                    tags.append("{}:{}".format(key, val))
            else:
                tags += tpl_tags if isinstance(tpl_tags, list) else [tpl_tags]
            instance_tpl['tags'] = list(set(tags))

        for var in variables:
            # variables can be suffixed with an index in case several values are found
            if var.split('_')[0] in self.VAR_MAPPING:
                try:
                    res = self.VAR_MAPPING[var.split('_')[0]](state, c_id, var)
                    if res is None:
                        raise ValueError("Invalid value for variable %s." %
                                         var)
                    var_values[var] = res
                except Exception as ex:
                    log.error(
                        "Could not find a value for the template variable %s for container %s "
                        "(%s): %s" % (var, c_id[:12], c_image, str(ex)))
            else:
                log.error(
                    "No method was found to interpolate template variable %s for container %s "
                    "(%s)." % (var, c_id[:12], c_image))

        return instance_tpl, var_values
예제 #7
0
    def _populate_payload_metadata(self,
                                   payload,
                                   check_statuses,
                                   start_event=True):
        """
        Periodically populate the payload with metadata related to the system, host, and/or checks.

        The payload is modified in place; nothing is returned.

        :param payload: payload being built; entries such as 'systemStats',
            'meta', 'host-tags', 'external_host_tags' and 'agent_checks' are
            written into it
        :param check_statuses: iterable of check status objects used to build
            the 'agent_checks' entry
        :param start_event: when True, the first run also posts an
            'Agent Startup' event and the configured system stats
        """
        # timestamp attached to the startup event below
        now = time.time()

        # Include system stats on first postback
        if start_event and self._is_first_run():
            payload['systemStats'] = self.agentConfig.get('system_stats', {})
            # Also post an event in the newsfeed
            payload['events']['System'] = [{
                'api_key':
                self.agentConfig['api_key'],
                'host':
                self.hostname,
                'timestamp':
                now,
                'event_type':
                'Agent Startup',
                'msg_text':
                'Version %s' % get_version()
            }]

        # Periodically send the host metadata.
        if self._should_send_additional_data('host_metadata'):
            # gather metadata with gohai
            gohai_metadata = self._run_gohai_metadata()
            if gohai_metadata:
                payload['gohai'] = gohai_metadata

            # overrides the 'systemStats' value set on first run above
            payload['systemStats'] = get_system_stats(
                proc_path=self.agentConfig.get('procfs_path', '/proc').rstrip(
                    '/'))

            if self.agentConfig['collect_orchestrator_tags']:
                host_container_metadata = MetadataCollector(
                ).get_host_metadata()
                if host_container_metadata:
                    payload['container-meta'] = host_container_metadata

            payload['meta'] = self._get_hostname_metadata()

            # kept so the 'agent_checks' section below can re-attach it
            self.hostname_metadata_cache = payload['meta']
            # Add static tags from the configuration file
            host_tags = []
            if self.agentConfig['tags'] is not None:
                host_tags.extend([
                    unicode(tag.strip())
                    for tag in self.agentConfig['tags'].split(",")
                ])

            if self.agentConfig['collect_ec2_tags']:
                host_tags.extend(EC2.get_tags(self.agentConfig))

            if self.agentConfig['collect_orchestrator_tags']:
                host_docker_tags = MetadataCollector().get_host_tags()
                if host_docker_tags:
                    host_tags.extend(host_docker_tags)

            if host_tags:
                payload['host-tags']['system'] = host_tags

            # If required by the user, let's create the dd_check:xxx host tags
            if self.agentConfig['create_dd_check_tags']:
                app_tags_list = [
                    DD_CHECK_TAG.format(c.name)
                    for c in self.initialized_checks_d
                ]
                app_tags_list.extend([
                    DD_CHECK_TAG.format(cname)
                    for cname in JMXFiles.get_jmx_appnames()
                ])

                # the 'system' entry only exists if host tags were found above
                if 'system' not in payload['host-tags']:
                    payload['host-tags']['system'] = []

                payload['host-tags']['system'].extend(app_tags_list)

            GCE_tags = GCE.get_tags(self.agentConfig)
            if GCE_tags is not None:
                payload['host-tags'][GCE.SOURCE_TYPE_NAME] = GCE_tags

            # Log the metadata on the first run
            if self._is_first_run():
                log.info(
                    "Hostnames: %s, tags: %s" %
                    (repr(self.hostname_metadata_cache), payload['host-tags']))

        # Periodically send extra hosts metadata (vsphere)
        # Metadata of hosts that are not the host where the agent runs, not all the checks use
        # that
        external_host_tags = []
        if self._should_send_additional_data('external_host_tags'):
            for check in self.initialized_checks_d:
                try:
                    getter = getattr(check, 'get_external_host_tags')
                    check_tags = getter()
                    external_host_tags.extend(check_tags)
                except AttributeError:
                    # checks without a get_external_host_tags method are skipped
                    # NOTE(review): this also hides an AttributeError raised
                    # inside the getter itself
                    pass

        if external_host_tags:
            payload['external_host_tags'] = external_host_tags

        # Periodically send agent_checks metadata
        if self._should_send_additional_data('agent_checks'):
            # Add agent checks statuses and error/warning messages
            agent_checks = []
            for check in check_statuses:
                if check.instance_statuses is not None:
                    # service_metadata is indexed with the instance position;
                    # presumably it has one entry per instance status - TODO confirm
                    for i, instance_status in enumerate(
                            check.instance_statuses):
                        agent_checks.append((
                            check.name,
                            check.source_type_name,
                            instance_status.instance_id,
                            instance_status.status,
                            # put error message or list of warning messages in the same field
                            # it will be handled by the UI
                            instance_status.error or instance_status.warnings
                            or "",
                            check.service_metadata[i]))
                else:
                    # no per-instance status: report the check-level status
                    # and its initialization error
                    agent_checks.append(
                        (check.name, check.source_type_name, "initialization",
                         check.status, repr(check.init_failed_error)))
            payload['agent_checks'] = agent_checks
            payload[
                'meta'] = self.hostname_metadata_cache  # add hostname metadata
예제 #8
0
class SDDockerBackend(AbstractSDBackend):
    """Docker-based service discovery"""

    def __init__(self, agentConfig):
        """Set up the config store, the docker/kubernetes helpers and the template variable resolvers.

        When the config store client cannot be built, 'sd_config_backend' is
        forced to None in ``agentConfig`` and the store is created again.
        """
        try:
            self.config_store = get_config_store(agentConfig=agentConfig)
        except Exception as store_err:
            log.error('Failed to instantiate the config store client. '
                      'Auto-config only will be used. %s' % str(store_err))
            agentConfig['sd_config_backend'] = None
            self.config_store = get_config_store(agentConfig=agentConfig)

        self.dockerutil = DockerUtil(config_store=self.config_store)

        # stays None outside kubernetes or when the client cannot be created
        self.kubeutil = None
        if Platform.is_k8s():
            try:
                self.kubeutil = KubeUtil()
            except Exception as kube_err:
                log.error("Couldn't instantiate the kubernetes client, "
                          "subsequent kubernetes calls will fail as well. Error: %s" % str(kube_err))

        self.metadata_collector = MetadataCollector()

        # template variable prefix -> method computing its value
        self.VAR_MAPPING = dict([
            ('host', self._get_host_address),
            ('pid', self._get_container_pid),
            ('port', self._get_port),
            ('container-name', self._get_container_name),
            ('tags', self._get_additional_tags),
        ])

        # docker labels we'll add as tags to all instances SD configures
        raw_labels = agentConfig.get('docker_labels_as_tags', '')
        if raw_labels:
            label_names = [label.strip() for label in raw_labels.split(',')]
        else:
            label_names = []
        self.docker_labels_as_tags = label_names

        AbstractSDBackend.__init__(self, agentConfig)

    def _make_fetch_state(self):
        """Build a fetch state from the docker inspect function and, on Kubernetes, the pod list.

        The pod list stays empty when the kubelet client is unavailable or
        when retrieving the pods fails.
        """
        pod_list = []
        if Platform.is_k8s():
            if self.kubeutil and self.kubeutil.init_success:
                try:
                    pods = self.kubeutil.retrieve_pods_list()
                    pod_list = pods.get('items', [])
                except Exception as ex:
                    log.warning("Failed to retrieve pod list: %s" % str(ex))
            else:
                log.error("kubelet client not initialized, cannot retrieve pod list.")
        inspect_fn = self.dockerutil.client.inspect_container
        return _SDDockerBackendConfigFetchState(inspect_fn, pod_list)

    def update_checks(self, changed_containers):
        """
        Takes a list of container IDs that changed recently
        and marks their corresponding checks as needing a config reload.

        The result is stored in ``self.reload_check_configs``: either the set
        of check names to reload, or True when ``_get_checks_to_refresh``
        requested a reload of everything. Nothing is done when the docker
        client is not initialized.

        :param changed_containers: iterable of container IDs
        """
        if not self.dockerutil.client:
            log.warning("Docker client is not initialized, pausing auto discovery.")
            return

        state = self._make_fetch_state()

        conf_reload_set = set()
        for c_id in changed_containers:
            checks = self._get_checks_to_refresh(state, c_id)
            if checks:
                conf_reload_set.update(checks)

        # _get_checks_to_refresh sets reload_check_configs to True when all
        # configs must be reloaded; a partial set must not downgrade that.
        reload_everything = getattr(self, 'reload_check_configs', None) is True
        if conf_reload_set and not reload_everything:
            self.reload_check_configs = conf_reload_set

    def _get_checks_to_refresh(self, state, c_id):
        """Return the checks the config store associates with a container.

        The lookup key is the SD_ID label value, or the image name when the
        label is missing. When the checks cannot be determined,
        ``self.reload_check_configs`` is set to True and None is returned.
        """
        container = state.inspect_container(c_id)

        # If the container was removed we can't tell which check is concerned
        # so we have to reload everything.
        # Same thing if it's stopped and we're on Kubernetes in auto_conf mode
        # because the pod was deleted and its template could have been in the annotations.
        reload_everything = not container
        if not reload_everything:
            is_running = container.get('State', {}).get('Running')
            reload_everything = (not is_running and Platform.is_k8s()
                                 and not self.agentConfig.get('sd_config_backend'))
        if reload_everything:
            self.reload_check_configs = True
            return

        labels = container.get('Config', {}).get('Labels', {})
        identifier = labels.get(SD_ID)
        if not identifier:
            identifier = self.dockerutil.image_name_extractor(container)

        platform_kwargs = {}
        if Platform.is_k8s():
            kube_metadata = state.get_kube_config(c_id, 'metadata') or {}
            platform_kwargs['kube_annotations'] = kube_metadata.get('annotations')
            platform_kwargs['kube_container_name'] = state.get_kube_container_name(c_id)
        if labels:
            platform_kwargs['docker_labels'] = labels
        return self.config_store.get_checks_to_refresh(identifier, **platform_kwargs)

    def _get_container_pid(self, state, cid, tpl_var):
        """Extract the host-namespace pid of the container pid 0"""
        pid = state.inspect_container(cid).get('State', {}).get('Pid')
        if not pid:
            return None

        return str(pid)

    def _get_host_address(self, state, c_id, tpl_var):
        """Extract the container IP from a docker inspect object, or the kubelet API."""
        c_inspect = state.inspect_container(c_id)
        c_id = c_inspect.get('Id', '')
        c_img = self.dockerutil.image_name_extractor(c_inspect)

        networks = c_inspect.get('NetworkSettings', {}).get('Networks') or {}
        ip_dict = {}
        for net_name, net_desc in networks.iteritems():
            ip = net_desc.get('IPAddress')
            if ip:
                ip_dict[net_name] = ip
        ip_addr = self._extract_ip_from_networks(ip_dict, tpl_var)
        if ip_addr:
            return ip_addr

        if Platform.is_k8s():
            # kubernetes case
            log.debug("Couldn't find the IP address for container %s (%s), "
                      "using the kubernetes way." % (c_id[:12], c_img))
            pod_ip = state.get_kube_config(c_id, 'status').get('podIP')
            if pod_ip:
                return pod_ip

        if Platform.is_rancher():
            # try to get the rancher IP address
            log.debug("No IP address was found in container %s (%s) "
                      "trying with the Rancher label" % (c_id[:12], c_img))

            ip_addr = c_inspect.get('Config', {}).get('Labels', {}).get(RANCHER_CONTAINER_IP)
            if ip_addr:
                return ip_addr.split('/')[0]

        log.error("No IP address was found for container %s (%s)" % (c_id[:12], c_img))
        return None

    def _extract_ip_from_networks(self, ip_dict, tpl_var):
        """Extract a single IP from a dictionary made of network names and IPs."""
        if not ip_dict:
            return None
        tpl_parts = tpl_var.split('_', 1)

        # no specifier
        if len(tpl_parts) < 2:
            log.debug("No key was passed for template variable %s." % tpl_var)
            return self._get_fallback_ip(ip_dict)

        res = ip_dict.get(tpl_parts[-1])
        if res is None:
            log.warning("The key passed for template variable %s was not found." % tpl_var)
            return self._get_fallback_ip(ip_dict)
        return res

    def _get_fallback_ip(self, ip_dict):
        """try to pick the bridge key, falls back to the value of the last key"""
        if 'bridge' in ip_dict:
            log.debug("Using the bridge network.")
            return ip_dict['bridge']

        last_key = sorted(ip_dict.iterkeys())[-1]
        log.debug("Trying with the last (sorted) network: '%s'." % last_key)
        return ip_dict[last_key]

    def _get_port(self, state, c_id, tpl_var):
        """Extract a port from a container_inspect or the k8s API given a template variable."""
        container_inspect = state.inspect_container(c_id)
        ports = []
        try:
            ports = [x.split('/')[0] for x in container_inspect['NetworkSettings']['Ports'].keys()]
            if len(ports) == 0:
                raise IndexError
        except (IndexError, KeyError, AttributeError):
            if Platform.is_k8s():
                spec = state.get_kube_container_spec(c_id)
                if spec:
                    ports = [str(x.get('containerPort')) for x in spec.get('ports', [])]
            else:
                ports = [p.split('/')[0] for p in container_inspect['Config'].get('ExposedPorts', {}).keys()]

        ports = sorted(ports, key=int)
        return self._extract_port_from_list(ports, tpl_var)

    def _extract_port_from_list(self, ports, tpl_var):
        if not ports:
            return None

        tpl_parts = tpl_var.split('_', 1)

        if len(tpl_parts) == 1:
            log.debug("No index was passed for template variable %s. "
                      "Trying with the last element." % tpl_var)
            return ports[-1]

        try:
            idx = tpl_parts[-1]
            return ports[int(idx)]
        except ValueError:
            log.error("Port index is not an integer. Using the last element instead.")
        except IndexError:
            log.error("Port index is out of range. Using the last element instead.")
        return ports[-1]

    def get_tags(self, state, c_id):
        """Extract useful tags from docker or platform APIs. These are collected by default.

        :param state: fetch state exposing ``inspect_container``,
            ``get_kube_config`` and ``get_kube_container_name``
        :param c_id: ID of the container to build tags for
        :return: list of tag strings
        """
        c_inspect = state.inspect_container(c_id)
        tags = self.dockerutil.extract_container_tags(c_inspect, self.docker_labels_as_tags)

        if Platform.is_k8s():
            # self.kubeutil is None when the kubernetes client could not be
            # instantiated, so only read init_success on an actual client.
            if self.kubeutil and not self.kubeutil.init_success:
                log.warning("kubelet client not initialized, kubernetes tags will be missing.")
                return tags

            pod_metadata = state.get_kube_config(c_id, 'metadata')

            if pod_metadata is None:
                log.warning("Failed to fetch pod metadata for container %s."
                            " Kubernetes tags will be missing." % c_id[:12])
                return tags

            # get pod labels
            kube_labels = pod_metadata.get('labels', {})
            for label, value in kube_labels.items():
                tags.append('%s:%s' % (label, value))

            # get kubernetes namespace
            namespace = pod_metadata.get('namespace')
            tags.append('kube_namespace:%s' % namespace)

            # get kubernetes container name
            kube_container_name = state.get_kube_container_name(c_id)
            if kube_container_name:
                tags.append('kube_container_name:%s' % kube_container_name)

            if not self.kubeutil:
                log.warning("The agent can't connect to kubelet, creator and "
                            "service tags will be missing for container %s." % c_id[:12])
            else:
                # add creator tags
                creator_tags = self.kubeutil.get_pod_creator_tags(pod_metadata)
                tags.extend(creator_tags)

                # add services tags
                if self.kubeutil.collect_service_tag:
                    services = self.kubeutil.match_services_for_pod(pod_metadata)
                    for s in services:
                        if s is not None:
                            tags.append('kube_service:%s' % s)

        elif Platform.is_swarm():
            c_labels = c_inspect.get('Config', {}).get('Labels', {})
            swarm_svc = c_labels.get(SWARM_SVC_LABEL)
            if swarm_svc:
                tags.append('swarm_service:%s' % swarm_svc)

        elif Platform.is_rancher():
            # the three rancher tags all come from the same label dict
            c_labels = c_inspect.get('Config', {}).get('Labels', {})
            service_name = c_labels.get(RANCHER_SVC_NAME)
            stack_name = c_labels.get(RANCHER_STACK_NAME)
            container_name = c_labels.get(RANCHER_CONTAINER_NAME)
            if service_name:
                tags.append('rancher_service:%s' % service_name)
            if stack_name:
                tags.append('rancher_stack:%s' % stack_name)
            if container_name:
                tags.append('rancher_container:%s' % container_name)

        if self.metadata_collector.has_detected():
            orch_tags = self.metadata_collector.get_container_tags(co=c_inspect)
            tags.extend(orch_tags)

        return tags

    def _get_container_name(self, state, c_id, tpl_var):
        container_inspect = state.inspect_container(c_id)
        return container_inspect.get('Name', '').lstrip('/')

    def _get_additional_tags(self, state, c_id, *args):
        """Build the optional Kubernetes tags (node, pod, container name) for a container.

        Returns an empty list outside Kubernetes or when the pod metadata or
        spec cannot be fetched.
        """
        if not Platform.is_k8s():
            return []

        pod_metadata = state.get_kube_config(c_id, 'metadata')
        pod_spec = state.get_kube_config(c_id, 'spec')
        if pod_metadata is None or pod_spec is None:
            log.warning("Failed to fetch pod metadata or pod spec for container %s."
                        " Additional Kubernetes tags may be missing." % c_id[:12])
            return []

        tags = [
            'node_name:%s' % pod_spec.get('nodeName'),
            'pod_name:%s' % pod_metadata.get('name'),
        ]

        docker_labels = state.inspect_container(c_id).get('Config', {}).get('Labels', {})
        c_name = docker_labels.get(KubeUtil.CONTAINER_NAME_LABEL)
        if c_name:
            tags.append('kube_container_name:%s' % c_name)
        return tags

    def get_configs(self):
        """Build the check configurations for every docker container listed by the client.

        Returns a dict mapping each check name to
        ``(source, (init_config, [instances]))``. A container whose
        configuration cannot be built is logged and skipped.
        """
        configs = {}
        if not self.dockerutil.client:
            log.warning("Docker client is not initialized, pausing auto discovery.")
            return configs

        state = self._make_fetch_state()
        extract_image = self.dockerutil.image_name_extractor
        containers = []
        for container in self.dockerutil.client.containers():
            containers.append((extract_image(container), container.get('Id'), container.get('Labels')))

        conflict_init_msg = 'Different versions of `init_config` found for check {}. ' \
            'Keeping the first one found.'
        for image, cid, labels in containers:
            try:
                # value of the SD_ID tag or the image name if the label is missing
                identifier = self.get_config_id(image, labels)
                check_configs = self._get_check_configs(state, cid, identifier, labels) or []
                for source, (check_name, init_config, instance) in check_configs:
                    # always handle instances as a list
                    if isinstance(instance, list):
                        instances = instance
                    else:
                        instances = [instance]

                    known = configs.get(check_name)
                    if known is None:
                        # first time this check shows up
                        configs[check_name] = (source, (init_config, instances))
                        continue

                    kept_init_config, kept_instances = known[1]
                    if kept_init_config != init_config:
                        log.warning(conflict_init_msg.format(check_name))
                    kept_instances.extend(instances)
            except Exception:
                log.exception('Building config for container %s based on image %s using service '
                              'discovery failed, leaving it alone.' % (cid[:12], image))
        return configs

    def get_config_id(self, image, labels):
        """Return the value of the SD_ID label, or the image name if it is missing.

        `labels` may be None or empty (a container without labels); the
        image name is used as the identifier in that case.
        """
        if not labels:
            return image
        return labels.get(SD_ID) or image

    def _get_check_configs(self, state, c_id, identifier, labels=None):
        """Retrieve configuration templates and fill them with data pulled from docker and tags.

        Returns None when no template is found for `identifier`, otherwise a
        list of ``(source, (check_name, init_config, [instance, ...]))`` tuples,
        one per template.
        """
        platform_kwargs = {}
        if Platform.is_k8s():
            kube_metadata = state.get_kube_config(c_id, 'metadata') or {}
            platform_kwargs = {
                'kube_container_name': state.get_kube_container_name(c_id),
                'kube_annotations': kube_metadata.get('annotations'),
            }
        if labels:
            platform_kwargs['docker_labels'] = labels

        config_templates = self._get_config_templates(identifier, **platform_kwargs)
        if not config_templates:
            return None

        check_configs = []
        tags = self.get_tags(state, c_id)
        for config_tpl in config_templates:
            source, config_tpl = config_tpl
            check_name, init_config_tpl, instance_tpl, variables = config_tpl

            # covering mono-instance and multi-instances cases
            tmpl_array = instance_tpl
            if not isinstance(instance_tpl, list):
                tmpl_array = [instance_tpl]

            # insert tags in instance_tpl and process values for template variables
            result_instances = []
            result_init_config = None
            for inst_tmpl in tmpl_array:
                instance_tpl, var_values = self._fill_tpl(state, c_id, inst_tmpl, variables, tags)
                tpl = self._render_template(init_config_tpl or {}, instance_tpl or {}, var_values)
                # only results shaped as an (init_config, instance) pair are kept
                if tpl and len(tpl) == 2:
                    init_config, instance = tpl
                    result_instances.append(instance)
                    if not result_init_config:
                        result_init_config = init_config
                    elif result_init_config != init_config:
                        # report the actual check name, not a placeholder string
                        log.warning("Different versions of `init_config` found for "
                                    "check {}. Keeping the first one found.".format(check_name))
            check_configs.append((source, (check_name, result_init_config, result_instances)))

        return check_configs

    def _get_config_templates(self, identifier, **platform_kwargs):
        """Extract config templates for an identifier from a K/V store and returns it as a dict object."""
        config_backend = self.agentConfig.get('sd_config_backend')
        templates = []
        auto_conf = not bool(config_backend)

        # format [(source, ('ident', {init_tpl}, {instance_tpl}))]
        raw_tpls = self.config_store.get_check_tpls(identifier, auto_conf=auto_conf, **platform_kwargs)
        for tpl in raw_tpls:
            # each template can come from either auto configuration or user-supplied templates
            try:
                source, (check_name, init_config_tpl, instance_tpl) = tpl
            except (TypeError, IndexError, ValueError):
                log.debug('No template was found for identifier %s, leaving it alone: %s' % (identifier, tpl))
                return None
            try:
                # build a list of all variables to replace in the template
                variables = self.PLACEHOLDER_REGEX.findall(str(init_config_tpl)) + \
                    self.PLACEHOLDER_REGEX.findall(str(instance_tpl))
                variables = [var.strip('%') for var in variables]
                if not isinstance(init_config_tpl, dict):
                    init_config_tpl = json.loads(init_config_tpl or '{}')
                if not isinstance(instance_tpl, dict) and not isinstance(instance_tpl, list):
                    instance_tpl = json.loads(instance_tpl or '{}')
            except json.JSONDecodeError:
                log.exception('Failed to decode the JSON template fetched for check {0}. Its configuration'
                              ' by service discovery failed for ident  {1}.'.format(check_name, identifier))
                return None

            templates.append((source,
                              (check_name, init_config_tpl, instance_tpl, variables)))

        return templates

    def _fill_tpl(self, state, c_id, instance_tpl, variables, c_tags=None):
        """Add container tags to instance templates and build a
           dict from template variable names and their values."""
        var_values = {}
        c_image = state.inspect_container(c_id).get('Config', {}).get('Image', '')

        # add only default c_tags to the instance to avoid duplicate tags from conf
        if c_tags:
            tags = c_tags[:] # shallow copy of the c_tags array
        else:
            tags = []
        if tags:
            tpl_tags = instance_tpl.get('tags', [])
            if isinstance(tpl_tags, dict):
                for key, val in tpl_tags.iteritems():
                    tags.append("{}:{}".format(key, val))
            else:
                tags += tpl_tags if isinstance(tpl_tags, list) else [tpl_tags]
            instance_tpl['tags'] = list(set(tags))

        for var in variables:
            # variables can be suffixed with an index in case several values are found
            if var.split('_')[0] in self.VAR_MAPPING:
                try:
                    res = self.VAR_MAPPING[var.split('_')[0]](state, c_id, var)
                    if res is None:
                        raise ValueError("Invalid value for variable %s." % var)
                    var_values[var] = res
                except Exception as ex:
                    log.error("Could not find a value for the template variable %s for container %s "
                              "(%s): %s" % (var, c_id[:12], c_image, str(ex)))
            else:
                log.error("No method was found to interpolate template variable %s for container %s "
                          "(%s)." % (var, c_id[:12], c_image))

        return instance_tpl, var_values
예제 #9
0
    def init(self):
        """Create the Docker/Kubernetes helpers and read the instance options.

        Any exception raised during setup is logged and reported as a warning
        instead of being propagated. ``self.init_success`` is set to True only
        when the whole setup completed without error.
        """
        try:
            instance = self.instances[0]

            docker_util = DockerUtil()
            self.docker_util = docker_util
            self.docker_client = docker_util.client
            self.docker_gateway = DockerUtil.get_gateway()

            self.metadata_collector = MetadataCollector()

            if Platform.is_k8s():
                try:
                    self.kubeutil = KubeUtil()
                except Exception as ex:
                    self.kubeutil = None
                    self.log.error("Couldn't instantiate the kubernetes client, "
                        "subsequent kubernetes calls will fail as well. Error: %s" % str(ex))

            # cgroup settings for this host, resolved once here
            self._mountpoints = docker_util.get_mountpoints(CGROUP_METRICS)
            self._latest_size_query = 0
            self._filtered_containers = set()
            self._disable_net_metrics = False

            option = instance.get

            def flag(key, default=False):
                # read an instance option and normalise it with _is_affirmative
                return _is_affirmative(option(key, default))

            # Tagging options
            self.custom_tags = option("tags", [])
            self.collect_labels_as_tags = option("collect_labels_as_tags", DEFAULT_LABELS_AS_TAGS)
            self.kube_pod_tags = {}

            self.use_histogram = flag('use_histogram')

            self.tag_names = {
                CONTAINER: option("container_tags", DEFAULT_CONTAINER_TAGS),
                PERFORMANCE: option("performance_tags", DEFAULT_PERFORMANCE_TAGS),
                IMAGE: option('image_tags', DEFAULT_IMAGE_TAGS),
            }

            # Filtering settings
            if docker_util.filtering_enabled:
                self.tag_names[FILTERED] = docker_util.filtered_tag_names

            # Container network mapping cache
            self.network_mappings = {}

            # Health check whitelist
            self.whitelist_patterns = None
            health_scs_whitelist = option('health_service_check_whitelist', [])
            if health_scs_whitelist:
                patterns, whitelist_tags = compile_filter_rules(health_scs_whitelist)
                self.whitelist_patterns = set(patterns)
                self.tag_names[HEALTHCHECK] = set(whitelist_tags)

            # Other options
            self.collect_image_stats = flag('collect_images_stats')
            self.collect_container_size = flag('collect_container_size')
            self.collect_container_count = flag('collect_container_count')
            self.collect_volume_count = flag('collect_volume_count')
            self.collect_events = flag('collect_events', True)
            self.event_attributes_as_tags = option('event_attributes_as_tags', [])
            self.collect_image_size = flag('collect_image_size')
            self.collect_disk_stats = flag('collect_disk_stats')
            self.collect_exit_codes = flag('collect_exit_codes')
            self.collect_ecs_tags = flag('ecs_tags', True) and Platform.is_ecs_instance()

            self.capped_metrics = option('capped_metrics')

        except Exception as e:
            self.log.critical(e)
            self.warning("Initialization failed. Will retry at next iteration")
        else:
            self.init_success = True
예제 #10
0
class DockerDaemon(AgentCheck):
    """Collect metrics and events from Docker API and cgroups."""

    def __init__(self, name, init_config, agentConfig, instances=None):
        """Initialise the check and run a first setup attempt via init().

        Raises an Exception when more than one instance is configured.
        """
        too_many_instances = instances is not None and len(instances) > 1
        if too_many_instances:
            raise Exception("Docker check only supports one configured instance.")

        AgentCheck.__init__(self, name, init_config, agentConfig, instances=instances)

        self.init_success = False
        self.docker_client = None

        # Set only when service discovery is enabled and uses the docker backend.
        sd_enabled = agentConfig.get('service_discovery')
        self._service_discovery = sd_enabled and \
            agentConfig.get('service_discovery_backend') == 'docker'

        self.init()

    def init(self):
        """Create the Docker/Kubernetes helpers and read the instance options.

        Any exception raised during setup is logged and reported as a warning
        instead of being propagated. ``self.init_success`` is set to True only
        when the whole setup completed without error.
        """
        try:
            instance = self.instances[0]

            docker_util = DockerUtil()
            self.docker_util = docker_util
            self.docker_client = docker_util.client
            self.docker_gateway = DockerUtil.get_gateway()

            self.metadata_collector = MetadataCollector()

            if Platform.is_k8s():
                try:
                    self.kubeutil = KubeUtil()
                except Exception as ex:
                    self.kubeutil = None
                    self.log.error("Couldn't instantiate the kubernetes client, "
                        "subsequent kubernetes calls will fail as well. Error: %s" % str(ex))

            # cgroup settings for this host, resolved once here
            self._mountpoints = docker_util.get_mountpoints(CGROUP_METRICS)
            self._latest_size_query = 0
            self._filtered_containers = set()
            self._disable_net_metrics = False

            option = instance.get

            def flag(key, default=False):
                # read an instance option and normalise it with _is_affirmative
                return _is_affirmative(option(key, default))

            # Tagging options
            self.custom_tags = option("tags", [])
            self.collect_labels_as_tags = option("collect_labels_as_tags", DEFAULT_LABELS_AS_TAGS)
            self.kube_pod_tags = {}

            self.use_histogram = flag('use_histogram')

            self.tag_names = {
                CONTAINER: option("container_tags", DEFAULT_CONTAINER_TAGS),
                PERFORMANCE: option("performance_tags", DEFAULT_PERFORMANCE_TAGS),
                IMAGE: option('image_tags', DEFAULT_IMAGE_TAGS),
            }

            # Filtering settings
            if docker_util.filtering_enabled:
                self.tag_names[FILTERED] = docker_util.filtered_tag_names

            # Container network mapping cache
            self.network_mappings = {}

            # Health check whitelist
            self.whitelist_patterns = None
            health_scs_whitelist = option('health_service_check_whitelist', [])
            if health_scs_whitelist:
                patterns, whitelist_tags = compile_filter_rules(health_scs_whitelist)
                self.whitelist_patterns = set(patterns)
                self.tag_names[HEALTHCHECK] = set(whitelist_tags)

            # Other options
            self.collect_image_stats = flag('collect_images_stats')
            self.collect_container_size = flag('collect_container_size')
            self.collect_container_count = flag('collect_container_count')
            self.collect_volume_count = flag('collect_volume_count')
            self.collect_events = flag('collect_events', True)
            self.event_attributes_as_tags = option('event_attributes_as_tags', [])
            self.collect_image_size = flag('collect_image_size')
            self.collect_disk_stats = flag('collect_disk_stats')
            self.collect_exit_codes = flag('collect_exit_codes')
            self.collect_ecs_tags = flag('ecs_tags', True) and Platform.is_ecs_instance()

            self.capped_metrics = option('capped_metrics')

        except Exception as e:
            self.log.critical(e)
            self.warning("Initialization failed. Will retry at next iteration")
        else:
            self.init_success = True

    def check(self, instance):
        """Run the Docker check for one instance.

        Retries init() first when the previous initialization failed. If the
        Docker client is still missing after the retry, a CRITICAL service
        check is sent and the run stops; if initialization failed for another
        reason the run is skipped. Errors raised while collecting are logged
        and reported as a warning instead of being propagated.
        """
        if not self.init_success:
            # Initialization can fail if cgroups are not ready or docker daemon is down. So we retry if needed
            # https://github.com/DataDog/dd-agent/issues/1896
            self.init()

            if self.docker_client is None:
                message = "Unable to connect to Docker daemon"
                self.service_check(SERVICE_CHECK_NAME, AgentCheck.CRITICAL,
                                   message=message)
                return

            if not self.init_success:
                # Initialization failed, will try later
                return

        try:
            # Report image metrics
            if self.collect_image_stats:
                self._count_and_weigh_images()

            if Platform.is_k8s():
                # pod tags are refreshed at every run; kept empty when they can't be fetched
                self.kube_pod_tags = {}
                if self.kubeutil:
                    try:
                        self.kube_pod_tags = self.kubeutil.get_kube_pod_tags()
                    except Exception as e:
                        self.log.warning('Could not retrieve kubernetes labels: %s' % str(e))

            # containers running with custom cgroups?
            custom_cgroups = _is_affirmative(instance.get('custom_cgroups', False))

            # Get the list of containers and the index of their names
            health_service_checks = bool(self.whitelist_patterns)
            containers_by_id = self._get_and_count_containers(custom_cgroups, health_service_checks)
            containers_by_id = self._crawl_container_pids(containers_by_id, custom_cgroups)

            # Send events from Docker API
            if self.collect_events or self._service_discovery or not self._disable_net_metrics or self.collect_exit_codes:
                self._process_events(containers_by_id)

            # Report performance container metrics (cpu, mem, net, io)
            self._report_performance_metrics(containers_by_id)

            if self.collect_container_size:
                self._report_container_size(containers_by_id)

            if self.collect_container_count:
                self._report_container_count(containers_by_id)

            if self.collect_volume_count:
                self._report_volume_count()

            # Collect disk stats from Docker info command
            if self.collect_disk_stats:
                self._report_disk_stats()

            if health_service_checks:
                self._send_container_healthcheck_sc(containers_by_id)
        except Exception:
            # Not a bare `except:` so that KeyboardInterrupt and SystemExit
            # still propagate instead of being reported as a check failure.
            self.log.exception("Docker_daemon check failed")
            self.warning("Check failed. Will retry at next iteration")

        if self.capped_metrics:
            self.filter_capped_metrics()

    def _count_and_weigh_images(self):
        try:
            tags = self._get_tags()
            active_images = self.docker_client.images(all=False)
            active_images_len = len(active_images)
            all_images_len = len(self.docker_client.images(quiet=True, all=True))
            self.gauge("docker.images.available", active_images_len, tags=tags)
            self.gauge("docker.images.intermediate", (all_images_len - active_images_len), tags=tags)

            if self.collect_image_size:
                self._report_image_size(active_images)

        except Exception as e:
            # It's not an important metric, keep going if it fails
            self.warning("Failed to count Docker images. Exception: {0}".format(e))

    def _get_and_count_containers(self, custom_cgroups=False, healthchecks=False):
        """List all containers from the API, count them and drop the excluded ones.

        Sends the Docker service check (CRITICAL and re-raises when listing
        fails, OK otherwise) and the running/stopped container gauges.
        Returns a dict of the non-excluded containers keyed by container id.
        When `custom_cgroups` or `healthchecks` is set, each kept container
        is also inspected to record its pid and health state.
        """
        # Container sizes are slow to query, so they are only requested
        # once per refresh cycle.
        must_query_size = self.collect_container_size and self._latest_size_query == 0
        self._latest_size_query = (self._latest_size_query + 1) % SIZE_REFRESH_RATE

        running_containers_count = Counter()
        all_containers_count = Counter()

        try:
            containers = self.docker_client.containers(all=True, size=must_query_size)
        except Exception as e:
            message = "Unable to list Docker containers: {0}".format(e)
            self.service_check(SERVICE_CHECK_NAME, AgentCheck.CRITICAL,
                               message=message)
            raise Exception(message)
        self.service_check(SERVICE_CHECK_NAME, AgentCheck.OK)

        # Build the set of filtered containers from the exclude/include rules
        # (the rules themselves are cached in docker_util)
        self._filter_containers(containers)

        containers_by_id = {}
        needs_inspect = custom_cgroups or healthchecks

        for container in containers:
            container_name = DockerUtil.container_name_extractor(container)[0]

            # counters are keyed by the sorted tag tuple of the container
            count_key = tuple(sorted(self._get_tags(container, CONTAINER)))
            all_containers_count[count_key] += 1
            if self._is_container_running(container):
                running_containers_count[count_key] += 1

            # Excluded containers are counted above but not returned
            if self._is_container_excluded(container):
                self.log.debug("Container {0} is excluded".format(container_name))
                continue

            containers_by_id[container['Id']] = container

            if not needs_inspect:
                continue

            # grab pid via API if custom cgroups - otherwise we won't find process when
            # crawling for pids.
            try:
                inspect_dict = self.docker_client.inspect_container(container_name)
                container['_pid'] = inspect_dict['State']['Pid']
                container['health'] = inspect_dict['State'].get('Health', {})
            except Exception as e:
                self.log.debug("Unable to inspect Docker container: %s", e)

        # TODO: deprecate these 2, they should be replaced by _report_container_count
        for tags, count in running_containers_count.items():
            self.gauge("docker.containers.running", count, tags=list(tags))

        for tags, total in all_containers_count.items():
            stopped_count = total - running_containers_count[tags]
            self.gauge("docker.containers.stopped", stopped_count, tags=list(tags))

        return containers_by_id

    def _is_container_running(self, container):
        """Tell if a container is running, according to its status.

        There is no "nice" API field to figure it out. We just look at the "Status" field, knowing how it is generated.
        See: https://github.com/docker/docker/blob/v1.6.2/daemon/state.go#L35
        """
        return container["Status"].startswith("Up") or container["Status"].startswith("Restarting")

    def _get_tags(self, entity=None, tag_type=None):
        """Generate the tags for a given entity (container or image) according to a list of tag names.

        The result always starts with the custom tags. When `entity` is given,
        tags built from its labels, from the extractors registered for
        `tag_type`, from the cached kubernetes pod tags and from the metadata
        collector are added as well.

        Note: the platform-specific label names are appended to
        `self.collect_labels_as_tags` the first time they are found missing.
        """
        # Start with custom tags
        tags = list(self.custom_tags)

        # Collect pod names as tags on kubernetes
        if Platform.is_k8s() and KubeUtil.POD_NAME_LABEL not in self.collect_labels_as_tags:
            self.collect_labels_as_tags.append(KubeUtil.POD_NAME_LABEL)

        # Collect container names as tags on rancher
        if Platform.is_rancher():
            if RANCHER_CONTAINER_NAME not in self.collect_labels_as_tags:
                self.collect_labels_as_tags.append(RANCHER_CONTAINER_NAME)
            if RANCHER_SVC_NAME not in self.collect_labels_as_tags:
                self.collect_labels_as_tags.append(RANCHER_SVC_NAME)
            if RANCHER_STACK_NAME not in self.collect_labels_as_tags:
                self.collect_labels_as_tags.append(RANCHER_STACK_NAME)

        if entity is not None:
            pod_name = None
            # Get labels as tags
            labels = entity.get("Labels")
            if labels is not None:
                for k in self.collect_labels_as_tags:
                    if k in labels:
                        v = labels[k]
                        if k == KubeUtil.POD_NAME_LABEL and Platform.is_k8s():
                            pod_name = v
                            k = "pod_name"
                            if "-" in pod_name:
                                # everything before the last dash of the pod name
                                replication_controller = "-".join(pod_name.split("-")[:-1])
                                # The namespace may be unavailable: neither embedded in
                                # the pod name nor present as a label.
                                namespace = None
                                if "/" in replication_controller: # k8s <= 1.1
                                    namespace, replication_controller = replication_controller.split("/", 1)

                                elif KubeUtil.NAMESPACE_LABEL in labels: # k8s >= 1.2
                                    namespace = labels[KubeUtil.NAMESPACE_LABEL]
                                    pod_name = "{0}/{1}".format(namespace, pod_name)

                                # Only tag the namespace when it is known, rather than
                                # failing on an unbound variable.
                                if namespace is not None:
                                    tags.append("kube_namespace:%s" % namespace)
                                tags.append("kube_replication_controller:%s" % replication_controller)
                                tags.append("pod_name:%s" % pod_name)

                        elif k == SWARM_SVC_LABEL and Platform.is_swarm():
                            if v:
                                tags.append("swarm_service:%s" % v)
                        elif k == RANCHER_CONTAINER_NAME and Platform.is_rancher():
                            if v:
                                tags.append('rancher_container:%s' % v)
                        elif k == RANCHER_SVC_NAME and Platform.is_rancher():
                            if v:
                                tags.append('rancher_service:%s' % v)
                        elif k == RANCHER_STACK_NAME and Platform.is_rancher():
                            if v:
                                tags.append('rancher_stack:%s' % v)

                        elif not v:
                            # label without a value: use the bare label name as tag
                            tags.append(k)

                        else:
                            tags.append("%s:%s" % (k,v))

                    # on kubernetes, entities without the pod name label get a placeholder tag
                    if k == KubeUtil.POD_NAME_LABEL and Platform.is_k8s() and k not in labels:
                        tags.append("pod_name:no_pod")

            # Get entity specific tags
            if tag_type is not None:
                tag_names = self.tag_names[tag_type]
                for tag_name in tag_names:
                    tag_value = self._extract_tag_value(entity, tag_name)
                    if tag_value is not None:
                        for t in tag_value:
                            tags.append('%s:%s' % (tag_name, str(t).strip()))

            # Add kube labels and creator/service tags
            if Platform.is_k8s():
                kube_tags = self.kube_pod_tags.get(pod_name)
                if kube_tags:
                    tags.extend(list(kube_tags))

            if self.metadata_collector.has_detected():
                orch_tags = self.metadata_collector.get_container_tags(co=entity)
                tags.extend(orch_tags)

        return tags

    def _extract_tag_value(self, entity, tag_name):
        """Extract tag information from the API result (containers or images).

        Extracted values are cached in the entity under the "_tag_values" key
        so each extractor runs at most once per entity. Unsupported tag names
        produce a warning and return None.
        """
        if tag_name not in TAG_EXTRACTORS:
            self.warning("{0} isn't a supported tag".format(tag_name))
            return None

        # per-entity cache of already extracted values, created on first use
        cached_values = entity.setdefault("_tag_values", {})
        if tag_name not in cached_values:
            extractor = TAG_EXTRACTORS[tag_name]
            cached_values[tag_name] = extractor(entity)

        return cached_values[tag_name]

    def _filter_containers(self, containers):
        """Rebuild the set of names of the containers matched by the filter rules.

        Does nothing when filtering is disabled in docker_util.
        """
        if not self.docker_util.filtering_enabled:
            return

        self._filtered_containers = set()
        for container in containers:
            filter_tags = self._get_tags(container, FILTERED)
            # exclude/include patterns are stored in docker_util to share them with other container-related checks
            if not self.docker_util.are_tags_filtered(filter_tags):
                continue
            name = DockerUtil.container_name_extractor(container)[0]
            self._filtered_containers.add(name)
            self.log.debug("Container {0} is filtered".format(name))

    def _is_container_excluded(self, container):
        """Tell whether a container is excluded by the filter rules.

        Looks the container name up in the set built by _filter_containers,
        which therefore has to run first.
        """
        names = DockerUtil.container_name_extractor(container)
        return names[0] in self._filtered_containers

    def _report_container_size(self, containers_by_id):
        """Report the size metrics of every non-excluded container.

        A metric is only sent when the matching size key ("SizeRw",
        "SizeRootFs") is present in the container dict.
        """
        size_metrics = (
            ('docker.container.size_rw', 'SizeRw'),
            ('docker.container.size_rootfs', 'SizeRootFs'),
        )
        for container in containers_by_id.values():
            if self._is_container_excluded(container):
                continue

            tags = self._get_tags(container, PERFORMANCE)
            m_func = FUNC_MAP[GAUGE][self.use_histogram]
            for metric_name, size_key in size_metrics:
                if size_key in container:
                    m_func(self, metric_name, container[size_key], tags=tags)

    def _send_container_healthcheck_sc(self, containers_by_id):
        """Send health service checks for the whitelisted containers.

        A container gets one service check as soon as any of its healthcheck
        tags matches any of the whitelist patterns.
        """
        for container in containers_by_id.values():
            healthcheck_tags = self._get_tags(container, HEALTHCHECK)
            whitelisted = any(
                re.match(rule, tag)
                for tag in healthcheck_tags
                for rule in self.whitelist_patterns
            )
            if whitelisted:
                self._submit_healthcheck_sc(container)

    def _submit_healthcheck_sc(self, container):
        """Send the health service check of one container.

        The status is CRITICAL for an 'unhealthy' container, OK for a
        'healthy' one and UNKNOWN otherwise (including when no health data
        is attached to the container).
        """
        health = container.get('health', {})
        reported = health.get('Status', '') if health else ''

        if reported == 'unhealthy':
            status = AgentCheck.CRITICAL
        elif reported == 'healthy':
            status = AgentCheck.OK
        else:
            status = AgentCheck.UNKNOWN

        self.service_check(HEALTHCHECK_SERVICE_CHECK_NAME, status,
                           tags=self._get_tags(container, CONTAINER))

    def _report_container_count(self, containers_by_id):
        """Report the number of non-excluded containers per state.

        Containers without a "State" value are not reported.
        """
        m_func = FUNC_MAP[GAUGE][self.use_histogram]

        per_state_count = defaultdict(int)
        for ctr in containers_by_id.values():
            if not self._is_container_excluded(ctr):
                per_state_count[ctr.get('State', '')] += 1

        for state, count in per_state_count.items():
            if not state:
                continue
            m_func(self, 'docker.container.count', count, tags=['container_state:%s' % state.lower()])

    def _report_volume_count(self):
        """Report the number of volumes per state (attached or dangling)."""
        m_func = FUNC_MAP[GAUGE][self.use_histogram]
        client = self.docker_client

        # Query both listings and count them before sending anything.
        listings = [
            (client.volumes(filters={'dangling': False}), ['volume_state:attached']),
            (client.volumes(filters={'dangling': True}), ['volume_state:dangling']),
        ]
        counts = [(len(listing['Volumes']), state_tags) for listing, state_tags in listings]

        for volume_count, state_tags in counts:
            m_func(self, 'docker.volume.count', volume_count, tags=state_tags)

    def _report_image_size(self, images):
        """Send the size gauges of each image, for the size keys it provides."""
        size_metrics = (
            ('VirtualSize', 'docker.image.virtual_size'),
            ('Size', 'docker.image.size'),
        )
        for image in images:
            tags = self._get_tags(image, IMAGE)
            for size_key, metric_name in size_metrics:
                if size_key in image:
                    self.gauge(metric_name, image[size_key], tags=tags)

    # Performance metrics

    def _report_performance_metrics(self, containers_by_id):
        """Report cgroup and network metrics for running, non-excluded containers.

        Containers without a "_proc_root" entry get no network metrics; their
        names are gathered and reported in a single message (a warning, or a
        debug log on kubernetes).
        """
        containers_without_proc_root = []
        for container in containers_by_id.values():
            skip = self._is_container_excluded(container) or not self._is_container_running(container)
            if skip:
                continue

            tags = self._get_tags(container, PERFORMANCE)

            try:
                self._report_cgroup_metrics(container, tags)
                if "_proc_root" in container:
                    self._report_net_metrics(container, tags)
                else:
                    containers_without_proc_root.append(DockerUtil.container_name_extractor(container)[0])
            except BogusPIDException as e:
                self.log.warning('Unable to report cgroup metrics: %s', e)

        if not containers_without_proc_root:
            return

        message = "Couldn't find pid directory for containers: {0}. They'll be missing network metrics".format(
            ", ".join(containers_without_proc_root))
        if Platform.is_k8s():
            # On kubernetes, this is kind of expected. Network metrics will be collected by the kubernetes integration anyway
            self.log.debug(message)
        else:
            self.warning(message)

    def _report_cgroup_metrics(self, container, tags):
        """Report the metrics read from the cgroup stat files of a container.

        Raises BogusPIDException when the container has no usable '_pid'.
        A cgroup whose stat file cannot be found or read is skipped, with a
        warning or a debug log.
        """
        if not container.get('_pid'):
            raise BogusPIDException('Cannot report on bogus pid(0)')

        cgroup_stat_file_failures = 0
        for cgroup in CGROUP_METRICS:
            try:
                stat_file = self._get_cgroup_from_proc(cgroup["cgroup"], container['_pid'], cgroup['file'])
            except MountException as e:
                # We can't find a stat file
                self.warning(str(e))
                cgroup_stat_file_failures += 1
                if cgroup_stat_file_failures >= len(CGROUP_METRICS):
                    self.warning("Couldn't find the cgroup files. Skipping the CGROUP_METRICS for now.")
                continue
            except IOError as e:
                self.log.debug("Cannot read cgroup file, container likely raced to finish : %s", e)
                continue

            stats = self._parse_cgroup_file(stat_file)
            if not stats:
                continue

            # Metrics read directly from the stat file
            for key, (dd_key, metric_type) in cgroup['metrics'].items():
                submit = FUNC_MAP[metric_type][self.use_histogram]
                if key in stats:
                    submit(self, dd_key, int(stats[key]), tags=tags)

            # Computed metrics
            for mname, (key_list, fct, metric_type) in cgroup.get('to_compute', {}).items():
                values = [stats[key] for key in key_list if key in stats]
                if len(values) != len(key_list):
                    self.log.debug("Couldn't compute {0}, some keys were missing.".format(mname))
                    continue
                value = fct(*values)
                submit = FUNC_MAP[metric_type][self.use_histogram]
                if value is not None:
                    submit(self, mname, value, tags=tags)

    def _report_net_metrics(self, container, tags):
        """Report received/sent byte rates read from the container's `net/dev` proc file.

        The interface -> docker network mapping is cached per container id in
        `self.network_mappings`; when it cannot be built, `{'eth0': 'bridge'}` is
        used (and cached) instead. Does nothing when network metrics are disabled.
        """
        if self._disable_net_metrics:
            self.log.debug("Network metrics are disabled. Skipping")
            return

        proc_net_file = os.path.join(container['_proc_root'], 'net/dev')

        try:
            cache = self.network_mappings
            if container['Id'] in cache:
                networks = cache[container['Id']]
            else:
                networks = self.docker_util.get_container_network_mapping(container) or {'eth0': 'bridge'}
                cache[container['Id']] = networks
        except Exception as e:
            # Revert to previous behaviour if the method is missing or failing
            # Debug message will only appear once per container, then the cache is used
            self.log.debug("Failed to build docker network mapping, using failsafe. Exception: {0}".format(e))
            networks = {'eth0': 'bridge'}
            self.network_mappings[container['Id']] = networks

        try:
            with open(proc_net_file, 'r') as fp:
                lines = fp.readlines()

            # Two first lines are headers:
            # Inter-|   Receive                                                |  Transmit
            #  face |bytes    packets errs drop fifo frame compressed multicast|bytes    packets errs drop fifo colls carrier compressed
            for raw_line in lines[2:]:
                parts = raw_line.split(':', 1)
                interface_name = str(parts[0]).strip()
                if interface_name not in networks:
                    continue
                net_tags = tags + ['docker_network:'+networks[interface_name]]
                counters = parts[1].split()
                m_func = FUNC_MAP[RATE][self.use_histogram]
                # Column 0 is the receive byte counter, column 8 the transmit byte counter
                m_func(self, "docker.net.bytes_rcvd", long(counters[0]), net_tags)
                m_func(self, "docker.net.bytes_sent", long(counters[8]), net_tags)

        except Exception as e:
            # It is possible that the container got stopped between the API call and now
            self.warning("Failed to report IO metrics from file {0}. Exception: {1}".format(proc_net_file, e))

    def _invalidate_network_mapping_cache(self, api_events):
        for ev in api_events:
            try:
                if ev.get('Type') == 'network' and ev.get('Action').endswith('connect'):
                    container_id = ev.get('Actor').get('Attributes').get('container')
                    if container_id in self.network_mappings:
                        self.log.debug("Removing network mapping cache for container %s" % container_id)
                        del self.network_mappings[container_id]
            except Exception:
                self.log.warning('Malformed network event: %s' % str(ev))

    def _process_events(self, containers_by_id):
        """Fetch docker events, then report exit codes and/or submit events as configured.

        Exit codes are reported when `self.collect_exit_codes` is set; events are
        aggregated, formatted and submitted when `self.collect_events` is set.
        Any failure while building the events results in a warning and no event
        being submitted.
        """
        api_events = self._get_events()

        if self.collect_exit_codes:
            self._report_exit_codes(api_events, containers_by_id)

        if not self.collect_events:
            return

        try:
            aggregated_events = self._pre_aggregate_events(api_events, containers_by_id)
            events = self._format_events(aggregated_events, containers_by_id)
        except (socket.timeout, urllib2.URLError):
            self.warning('Timeout when collecting events. Events will be missing.')
            return
        except Exception as e:
            self.warning("Unexpected exception when collecting events: {0}. "
                         "Events will be missing".format(e))
            return

        for dd_event in events:
            self.log.debug("Creating event: %s" % dd_event['msg_title'])
            self.event(dd_event)

    def _get_events(self):
        """Get the list of events and refresh the caches that depend on them.

        Returns the events obtained from `self.docker_util.get_events()`.
        """
        events, changed_container_ids = self.docker_util.get_events()

        if not self._disable_net_metrics:
            self._invalidate_network_mapping_cache(events)

        if changed_container_ids:
            if self._service_discovery:
                get_sd_backend(self.agentConfig).update_checks(changed_container_ids)

            self.metadata_collector.invalidate_cache(events)
            # Only one of the platform-specific caches is invalidated, nomad taking precedence
            if Platform.is_nomad():
                self.nomadutil.invalidate_cache(events)
            elif Platform.is_ecs_instance():
                self.ecsutil.invalidate_cache(events)

        return events

    def _pre_aggregate_events(self, api_events, containers_by_id):
        # Aggregate events, one per image. Put newer events first.
        events = defaultdict(deque)
        for event in api_events:
            # Skip events related to filtered containers
            container = containers_by_id.get(event.get('id'))
            if container is not None and self._is_container_excluded(container):
                self.log.debug("Excluded event: container {0} status changed to {1}".format(
                    event['id'], event['status']))
                continue
            # from may be missing (for network events for example)
            if 'from' in event:
                image_name = event['from']
                if image_name.startswith('sha256:'):
                    image_name = self.docker_util.image_name_extractor({'Image': image_name})
                events[image_name].appendleft(event)
        return events

    def _format_events(self, aggregated_events, containers_by_id):
        """Build the events to submit from docker events grouped by image.

        For every image, up to two events are created through `_create_dd_event`:
        a 'Low' priority one for exec_create/exec_start statuses and a 'Normal'
        priority one for everything else. Returns the list of created events.
        """
        formatted = []
        for image_name, event_group in aggregated_events.iteritems():
            container_tags = set()
            by_priority = {'Low': [], 'Normal': []}

            for event in event_group:
                event_id = event['id']
                # Fall back to a truncated id when the container is unknown
                container_name = event_id[:11]

                if event_id in containers_by_id:
                    cont = containers_by_id[event_id]
                    container_name = DockerUtil.container_name_extractor(cont)[0]
                    container_tags.update(self._get_tags(cont, PERFORMANCE))
                    container_tags.add('container_name:%s' % container_name)
                    # Add additional docker event attributes as tags
                    for attr in self.event_attributes_as_tags:
                        attributes = event['Actor']['Attributes']
                        if attr in attributes and attr not in EXCLUDED_ATTRIBUTES:
                            container_tags.add('%s:%s' % (attr, attributes[attr]))

                # health checks generate tons of these so we treat them separately and lower their priority
                is_exec = event['status'].startswith(('exec_create:', 'exec_start:'))
                by_priority['Low' if is_exec else 'Normal'].append((event, container_name))

            for priority in ('Low', 'Normal'):
                dd_event = self._create_dd_event(by_priority[priority], image_name, container_tags, priority=priority)
                if dd_event:
                    formatted.append(dd_event)

        return formatted

    def _report_exit_codes(self, events, containers_by_id):
        for event in events:
            container_tags = set()
            container = containers_by_id.get(event.get('id'))
            # Skip events related to filtered containers
            if container is not None and self._is_container_excluded(container):
                continue

            # Report the exit code in case of a DIE event
            if container is not None and event['status'] == 'die':
                container_name = DockerUtil.container_name_extractor(container)[0]
                container_tags.update(self._get_tags(container, CONTAINER))
                container_tags.add('container_name:%s' % container_name)
                try:
                    exit_code = int(event['Actor']['Attributes']['exitCode'])
                    message = 'Container %s exited with %s' % (container_name, exit_code)
                    status = AgentCheck.OK if exit_code == 0 else AgentCheck.CRITICAL
                    self.service_check(EXIT_SERVICE_CHECK_NAME, status, tags=list(container_tags), message=message)
                except KeyError:
                    self.log.warning('Unable to collect the exit code for container %s' % container_name)

    def _create_dd_event(self, events, image, c_tags, priority='Normal'):
        """Create the actual event to submit from a list of similar docker events"""
        if not events:
            return

        max_timestamp = 0
        status = defaultdict(int)
        status_change = []

        for ev, c_name in events:
            max_timestamp = max(max_timestamp, int(ev['time']))
            status[ev['status']] += 1
            status_change.append([c_name, ev['status']])

        status_text = ", ".join(["%d %s" % (count, st) for st, count in status.iteritems()])
        msg_title = "%s %s on %s" % (image, status_text, self.hostname)
        msg_body = (
            "%%%\n"
            "{image_name} {status} on {hostname}\n"
            "```\n{status_changes}\n```\n"
            "%%%"
        ).format(
            image_name=image,
            status=status_text,
            hostname=self.hostname,
            status_changes="\n".join(
                ["%s \t%s" % (change[1].upper(), change[0]) for change in status_change])
        )

        if any(error in status_text for error in ERROR_ALERT_TYPE):
            alert_type = "error"
        else:
            alert_type = None

        return {
            'timestamp': max_timestamp,
            'host': self.hostname,
            'event_type': EVENT_TYPE,
            'msg_title': msg_title,
            'msg_text': msg_body,
            'source_type_name': EVENT_TYPE,
            'event_object': 'docker:%s' % image,
            'tags': list(c_tags),
            'alert_type': alert_type,
            'priority': priority
        }


    def _report_disk_stats(self):
        """Report metrics about the volume space usage.

        The values come from the 'DriverStatus' entry of `self.docker_client.info()`.
        Nothing is reported (besides a warning) when that entry is empty.
        """
        stats = {
            'docker.data.used': None,
            'docker.data.total': None,
            'docker.data.free': None,
            'docker.metadata.used': None,
            'docker.metadata.total': None,
            'docker.metadata.free': None
            # these two are calculated by _calc_percent_disk_stats
            # 'docker.data.percent': None,
            # 'docker.metadata.percent': None
        }
        driver_status = self.docker_client.info().get('DriverStatus', [])
        if not driver_status:
            self.log.warning('Disk metrics collection is enabled but docker info did not'
                             ' report any. Your storage driver might not support them, skipping.')
            return

        for entry in driver_status:
            # only consider metrics about disk space
            if len(entry) != 2 or 'Space' not in entry[0]:
                continue
            label, raw_value = entry[0], entry[1]

            # identify Data and Metadata metrics
            mtype = 'metadata' if 'Metadata' in label else 'data'

            if 'Used' in label:
                stats['docker.{0}.used'.format(mtype)] = raw_value
            elif 'Space Total' in label:
                stats['docker.{0}.total'.format(mtype)] = raw_value
            elif 'Space Available' in label:
                stats['docker.{0}.free'.format(mtype)] = raw_value

        stats = self._format_disk_metrics(stats)
        stats.update(self._calc_percent_disk_stats(stats))
        tags = self._get_tags()
        for name, val in stats.iteritems():
            if val is None:
                continue
            self.gauge(name, val, tags)

    def _format_disk_metrics(self, metrics):
        """Cast the disk stats to float and convert them to bytes"""
        for name, raw_val in metrics.iteritems():
            if raw_val:
                match = DISK_STATS_RE.search(raw_val)
                if match is None or len(match.groups()) != 2:
                    self.log.warning('Can\'t parse value %s for disk metric %s. Dropping it.' % (raw_val, name))
                    metrics[name] = None
                val, unit = match.groups()
                # by default some are uppercased others lowercased. That's error prone.
                unit = unit.lower()
                try:
                    val = int(float(val) * UNIT_MAP[unit])
                    metrics[name] = val
                except KeyError:
                    self.log.error('Unrecognized unit %s for disk metric %s. Dropping it.' % (unit, name))
                    metrics[name] = None
        return metrics

    def _calc_percent_disk_stats(self, stats):
        """Calculate a percentage of used disk space for data and metadata"""
        mtypes = ['data', 'metadata']
        percs = {}
        for mtype in mtypes:
            used = stats.get('docker.{0}.used'.format(mtype))
            total = stats.get('docker.{0}.total'.format(mtype))
            free = stats.get('docker.{0}.free'.format(mtype))
            if used and total and free and ceil(total) < free + used:
                self.log.debug('used, free, and total disk metrics may be wrong, '
                               'used: %s, free: %s, total: %s',
                               used, free, total)
                total = used + free
            try:
                if isinstance(used, int):
                    percs['docker.{0}.percent'.format(mtype)] = round(100 * float(used) / float(total), 2)
                elif isinstance(free, int):
                    percs['docker.{0}.percent'.format(mtype)] = round(100 * (1.0 - (float(free) / float(total))), 2)
            except ZeroDivisionError:
                self.log.error('docker.{0}.total is 0, calculating docker.{1}.percent'
                               ' is not possible.'.format(mtype, mtype))
        return percs

    # Cgroups
    def _get_cgroup_from_proc(self, cgroup, pid, filename):
        """Find a specific cgroup file, containing metrics to extract.

        The value returned by `DockerUtil.find_cgroup_from_proc` is used as a
        %-format template into which `filename` is substituted as `file`.
        """
        path_template = DockerUtil.find_cgroup_from_proc(
            self._mountpoints, pid, cgroup, self.docker_util._docker_root)
        return path_template % {"file": filename}

    def _parse_cgroup_file(self, stat_file):
        """Parse a cgroup pseudo file for key/values."""
        self.log.debug("Opening cgroup file: %s" % stat_file)
        try:
            with open(stat_file, 'r') as fp:
                if 'blkio' in stat_file:
                    return self._parse_blkio_metrics(fp.read().splitlines())
                elif 'cpuacct.usage' in stat_file:
                    return dict({'usage': str(int(fp.read())/10000000)})
                else:
                    return dict(map(lambda x: x.split(' ', 1), fp.read().splitlines()))
        except IOError:
            # It is possible that the container got stopped between the API call and now.
            # Some files can also be missing (like cpu.stat) and that's fine.
            self.log.debug("Can't open %s. Its metrics will be missing." % stat_file)

    def _parse_blkio_metrics(self, stats):
        """Parse the blkio metrics."""
        metrics = {
            'io_read': 0,
            'io_write': 0,
        }
        for line in stats:
            if 'Read' in line:
                metrics['io_read'] += int(line.split()[2])
            if 'Write' in line:
                metrics['io_write'] += int(line.split()[2])
        return metrics

    def _is_container_cgroup(self, line, selinux_policy):
        if line[1] not in ('cpu,cpuacct', 'cpuacct,cpu', 'cpuacct') or line[2] == '/docker-daemon':
            return False
        if 'docker' in line[2]: # general case
            return True
        if 'docker' in selinux_policy: # selinux
            return True
        if line[2].startswith('/') and re.match(CONTAINER_ID_RE, line[2][1:]): # kubernetes
            return True
        if line[2].startswith('/') and re.match(CONTAINER_ID_RE, line[2].split('/')[-1]): # kube 1.6+ qos hierarchy
            return True
        return False

    # proc files
    def _crawl_container_pids(self, container_dict, custom_cgroups=False):
        """Crawl `/proc` to find container PIDs and add them to `containers_by_id`."""
        proc_path = os.path.join(self.docker_util._docker_root, 'proc')
        pid_dirs = [_dir for _dir in os.listdir(proc_path) if _dir.isdigit()]

        if len(pid_dirs) == 0:
            self.warning("Unable to find any pid directory in {0}. "
                "If you are running the agent in a container, make sure to "
                'share the volume properly: "/proc:/host/proc:ro". '
                "See https://github.com/DataDog/docker-dd-agent/blob/master/README.md for more information. "
                "Network metrics will be missing".format(proc_path))
            self._disable_net_metrics = True
            return container_dict

        self._disable_net_metrics = False

        for folder in pid_dirs:
            try:
                path = os.path.join(proc_path, folder, 'cgroup')
                with open(path, 'r') as f:
                    content = [line.strip().split(':') for line in f.readlines()]

                selinux_policy = ''
                path = os.path.join(proc_path, folder, 'attr', 'current')
                if os.path.exists(path):
                    with open(path, 'r') as f:
                        selinux_policy = f.readlines()[0]
            except IOError, e:
                #  Issue #2074
                self.log.debug("Cannot read %s, process likely raced to finish : %s", path, e)
            except Exception as e:
                self.warning("Cannot read %s : %s" % (path, str(e)))
                continue

            try:
                for line in content:
                    if self._is_container_cgroup(line, selinux_policy):
                        cpuacct = line[2]
                        break
                else:
                    continue

                matches = re.findall(CONTAINER_ID_RE, cpuacct)
                if matches:
                    container_id = matches[-1]
                    if container_id not in container_dict:
                        self.log.debug(
                            "Container %s not in container_dict, it's likely excluded", container_id
                        )
                        continue
                    container_dict[container_id]['_pid'] = folder
                    container_dict[container_id]['_proc_root'] = os.path.join(proc_path, folder)
                elif custom_cgroups:  # if we match by pid that should be enough (?) - O(n) ugh!
                    for _, container in container_dict.iteritems():
                        if container.get('_pid') == int(folder):
                            container['_proc_root'] = os.path.join(proc_path, folder)
                            break

            except Exception, e:
                self.warning("Cannot parse %s content: %s" % (path, str(e)))
                continue