Exemplo n.º 1
0
    def is_excluded(self, cid, pod_uid=None):
        """
        Decide whether a container must be filtered out.

        The container name and image are looked up in ``self.containers``
        and handed, together with the namespace recorded for the container
        id, to ``c_is_excluded`` (the agent container filter interface).

        Shortcuts taken before reaching the filter:
          - a falsy ``cid`` is always excluded (and never cached)
          - a ``pod_uid`` listed in ``self.static_pod_uids`` is never excluded
          - a ``cid`` missing from ``self.containers`` is excluded
          - a container entry lacking "name" or "image" is excluded

        Every verdict for a non-empty ``cid`` is memoized in ``self.cache``
        so the filter is queried at most once per container id even when
        this method is called once per metric.
        :param cid: container id
        :param pod_uid: pod UID, used to recognise static pods
        :return: bool, True when the container is excluded
        """
        if not cid:
            return True

        if cid in self.cache:
            return self.cache[cid]

        if pod_uid and pod_uid in self.static_pod_uids:
            # Static pods are always kept
            verdict = False
        elif cid not in self.containers:
            # Metric does not come from a container (system slices)
            verdict = True
        else:
            container = self.containers[cid]
            if "name" not in container or "image" not in container:
                # Invalid container entry
                verdict = True
            else:
                verdict = c_is_excluded(
                    container.get("name"),
                    container.get("image"),
                    self.container_id_to_namespace.get(cid, ""),
                )

        self.cache[cid] = verdict
        return verdict
Exemplo n.º 2
0
    def is_namespace_excluded(self, namespace):
        """
        Queries the agent container filter interface to check whether a
        Kubernetes namespace should be excluded.

        The result is cached in ``self.cache_namespace_exclusion`` between
        calls to avoid the python-go switching cost: the filter is only
        queried the first time a given namespace is seen.
        :param namespace: namespace name; a falsy value is never excluded
        :return: bool, True when the namespace is excluded
        """
        if not namespace:
            return False

        # Serve from the cache when possible. Membership is tested explicitly
        # because a cached verdict of False is a valid hit.
        if namespace in self.cache_namespace_exclusion:
            return self.cache_namespace_exclusion[namespace]

        # Sent empty container name and image because we are interested in
        # applying only the namespace exclusion rules.
        excluded = c_is_excluded('', '', namespace)
        self.cache_namespace_exclusion[namespace] = excluded
        return excluded
Exemplo n.º 3
0
    def check(self, instance):
        """
        Run one collection cycle against the Fargate metadata and stats
        endpoints.

        The metadata endpoint is queried first: for every container that is
        not excluded by ``c_is_excluded`` the tag list is built (tagger tags,
        legacy-compatible aliases and the instance's custom tags) and the CPU
        limit gauge is submitted when a positive limit is reported. The stats
        endpoint is queried second and the stats of each non-excluded
        container are handed to ``submit_perf_metrics``.

        Any failure (timeout, request error, non-200 status, undecodable
        body, missing metadata keys) submits a non-OK ``fargate_check``
        service check and aborts the run, so the final OK status is only
        reported when both endpoints were fetched and decoded successfully.

        :param instance: check instance configuration; only the optional
                         ``tags`` list is read here, the instance itself is
                         forwarded to ``submit_perf_metrics``
        :return: None
        """
        metadata_endpoint = API_ENDPOINT + METADATA_ROUTE
        stats_endpoint = API_ENDPOINT + STATS_ROUTE
        custom_tags = instance.get('tags', [])

        try:
            request = self.http.get(metadata_endpoint)
        except requests.exceptions.Timeout:
            msg = 'Fargate {} endpoint timed out after {} seconds'.format(
                metadata_endpoint, self.http.options['timeout'])
            self.service_check('fargate_check',
                               AgentCheck.CRITICAL,
                               message=msg,
                               tags=custom_tags)
            self.log.exception(msg)
            return
        except requests.exceptions.RequestException:
            msg = 'Error fetching Fargate {} endpoint'.format(
                metadata_endpoint)
            self.service_check('fargate_check',
                               AgentCheck.CRITICAL,
                               message=msg,
                               tags=custom_tags)
            self.log.exception(msg)
            return

        if request.status_code != 200:
            msg = 'Fargate {} endpoint responded with {} HTTP code'.format(
                metadata_endpoint, request.status_code)
            self.service_check('fargate_check',
                               AgentCheck.CRITICAL,
                               message=msg,
                               tags=custom_tags)
            self.log.warning(msg)
            return

        metadata = {}
        try:
            metadata = request.json()
        except ValueError:
            msg = 'Cannot decode Fargate {} endpoint response'.format(
                metadata_endpoint)
            self.service_check('fargate_check',
                               AgentCheck.WARNING,
                               message=msg,
                               tags=custom_tags)
            self.log.warning(msg, exc_info=True)
            return

        if not all(k in metadata for k in ['Cluster', 'Containers']):
            msg = 'Missing critical metadata in {} endpoint response'.format(
                metadata_endpoint)
            self.service_check('fargate_check',
                               AgentCheck.WARNING,
                               message=msg,
                               tags=custom_tags)
            self.log.warning(msg)
            return

        # Ids of containers rejected by the filter; their stats are skipped
        # further down.
        excluded_cids = set()
        # Container id -> full tag list used for every metric of that
        # container.
        container_tags = {}
        for container in metadata['Containers']:
            c_id = container['DockerId']
            # Check if container is excluded
            if c_is_excluded(container.get("Name", ""),
                             container.get("Image", "")):
                excluded_cids.add(c_id)
                continue

            tagger_tags = get_tags('container_id://%s' % c_id, True) or []

            # Compatibility with previous versions of the check
            compat_tags = []
            for tag in tagger_tags:
                if tag.startswith(("task_family:", "task_version:")):
                    compat_tags.append("ecs_" + tag)
                elif tag.startswith("cluster_name:"):
                    compat_tags.append(
                        tag.replace("cluster_name:", "ecs_cluster:"))
                elif tag.startswith("container_name:"):
                    compat_tags.append(
                        tag.replace("container_name:", "docker_name:"))

            container_tags[c_id] = tagger_tags + compat_tags + custom_tags

            if container.get('Limits', {}).get('CPU', 0) > 0:
                self.gauge('ecs.fargate.cpu.limit', container['Limits']['CPU'],
                           container_tags[c_id])

        try:
            request = self.http.get(stats_endpoint)
        except requests.exceptions.Timeout:
            msg = 'Fargate {} endpoint timed out after {} seconds'.format(
                stats_endpoint, self.http.options['timeout'])
            self.service_check('fargate_check',
                               AgentCheck.WARNING,
                               message=msg,
                               tags=custom_tags)
            self.log.warning(msg, exc_info=True)
            return
        except requests.exceptions.RequestException:
            msg = 'Error fetching Fargate {} endpoint'.format(stats_endpoint)
            self.service_check('fargate_check',
                               AgentCheck.WARNING,
                               message=msg,
                               tags=custom_tags)
            self.log.warning(msg, exc_info=True)
            return

        if request.status_code != 200:
            msg = 'Fargate {} endpoint responded with {} HTTP code'.format(
                stats_endpoint, request.status_code)
            self.service_check('fargate_check',
                               AgentCheck.WARNING,
                               message=msg,
                               tags=custom_tags)
            self.log.warning(msg)
            return

        stats = {}
        try:
            stats = request.json()
        except ValueError:
            msg = 'Cannot decode Fargate {} endpoint response'.format(
                stats_endpoint)
            self.service_check('fargate_check',
                               AgentCheck.WARNING,
                               message=msg,
                               tags=custom_tags)
            self.log.warning(msg, exc_info=True)
            # Stop here like every other failure path: falling through would
            # submit an OK service check right after this WARNING.
            return

        for container_id, container_stats in iteritems(stats):
            if container_id not in excluded_cids:
                self.submit_perf_metrics(instance, container_tags,
                                         container_id, container_stats)

        self.service_check('fargate_check', AgentCheck.OK, tags=custom_tags)