# NOTE: these snippets use Python 2 idioms (iteritems, list-returning map,
# ports.keys()[0]) and rely on helpers defined elsewhere in the original
# codebase: DockerUtil, KubeUtil, is_k8s, get_config_store, AbstractSDBackend,
# Singleton, Platform, and the DATADOG_ID, TRACE_CONFIG,
# ECS_INTROSPECT_DEFAULT_PORT and NOMAD_*_NAME constants. Only stdlib and
# third-party imports are added here.
import logging
import re
import socket

import requests

try:
    import simplejson as json  # provides json.JSONDecodeError on Python 2
except ImportError:
    import json

log = logging.getLogger(__name__)


def agent_container_inspect():
    # Self inspection based on cgroups
    # On all platforms, the container ID is the last part of the path.
    REGEX_PATTERN = '(.*/)+([a-z0-9]{64})$'

    dockerutil = DockerUtil()
    cgroup_path = '/proc/self/cgroup'
    container_id = None

    with open(cgroup_path, 'r') as f:
        for line in f:
            id_match = re.search(REGEX_PATTERN, line)
            if id_match:
                container_id = id_match.group(2)
                break
    if container_id is None:
        print("The container_id could not be found. Refer to the docker log"
              " of the container running the agent")
        return 1
    try:
        inspect = dockerutil.inspect_container(container_id)
        key_indices = [i for i, k in enumerate(inspect['Config']['Env']) if 'API_KEY' in k]
        for ind in key_indices:
            inspect['Config']['Env'][ind] = '%s=%s' % (inspect['Config']['Env'][ind].split('=', 1)[0], 'redacted')
        print(json.dumps(inspect, indent=4))
        return 0
    except Exception as e:
        print("Could not inspect container: %s" % e)
        return 1
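
# A minimal sketch of what the regex above matches. The sample line is made
# up; real /proc/self/cgroup entries vary by platform, but the 64-character
# hex suffix is the container ID everywhere.
def _example_cgroup_match():
    sample = '4:memory:/docker/' + '0123456789abcdef' * 4
    id_match = re.search('(.*/)+([a-z0-9]{64})$', sample)
    assert id_match.group(2) == '0123456789abcdef' * 4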
class SDDockerBackend(AbstractSDBackend):
    """Docker-based service discovery"""

    def __init__(self, agentConfig):
        self.docker_client = DockerUtil().client
        if is_k8s():
            self.kubeutil = KubeUtil()
        try:
            self.config_store = get_config_store(agentConfig=agentConfig)
        except Exception as e:
            log.error('Failed to instantiate the config store client. '
                      'Auto-config only will be used. %s' % str(e))
            agentConfig['sd_config_backend'] = None
            self.config_store = get_config_store(agentConfig=agentConfig)

        self.VAR_MAPPING = {
            'host': self._get_host_address,
            'port': self._get_port,
            'tags': self._get_additional_tags,
        }
        AbstractSDBackend.__init__(self, agentConfig)

    def _get_host_address(self, c_inspect, tpl_var):
        """Extract the container IP from a docker inspect object, or the kubelet API."""
        c_id, c_img = c_inspect.get('Id', ''), c_inspect.get('Config', {}).get('Image', '')

        tpl_parts = tpl_var.split('_')
        # a specifier was given
        if len(tpl_parts) > 1:
            networks = c_inspect.get('NetworkSettings', {}).get('Networks') or {}
            ip_dict = {}
            for net_name, net_desc in networks.iteritems():
                ip = net_desc.get('IPAddress')
                if ip:
                    ip_dict[net_name] = ip
            ip_addr = self._extract_ip_from_networks(ip_dict, tpl_var)
            if ip_addr:
                return ip_addr

        # try to get the bridge IP address
        log.debug("No network found for container %s (%s), trying with IPAddress field" % (c_id[:12], c_img))
        ip_addr = c_inspect.get('NetworkSettings', {}).get('IPAddress')
        if ip_addr:
            return ip_addr

        if is_k8s():
            # kubernetes case
            log.debug("Couldn't find the IP address for container %s (%s), "
                      "using the kubernetes way." % (c_id[:12], c_img))
            pod_list = self.kubeutil.retrieve_pods_list().get('items', [])
            for pod in pod_list:
                pod_ip = pod.get('status', {}).get('podIP')
                if pod_ip is None:
                    continue
                else:
                    c_statuses = pod.get('status', {}).get('containerStatuses', [])
                    for status in c_statuses:
                        # compare the container id with those of containers in the current pod
                        if c_id == status.get('containerID', '').split('//')[-1]:
                            return pod_ip

        log.error("No IP address was found for container %s (%s)" % (c_id[:12], c_img))
        return None

    def _extract_ip_from_networks(self, ip_dict, tpl_var):
        """Extract a single IP from a dictionary made of network names and IPs."""
        if not ip_dict:
            return None
        tpl_parts = tpl_var.split('_')

        # no specifier
        if len(tpl_parts) < 2:
            log.warning("No key was passed for template variable %s." % tpl_var)
            return self._get_fallback_ip(ip_dict)
        else:
            res = ip_dict.get(tpl_parts[-1])
            if res is None:
                log.warning("The key passed for template variable %s was not found." % tpl_var)
                return self._get_fallback_ip(ip_dict)
            else:
                return res

    def _get_fallback_ip(self, ip_dict):
        """Try to pick the bridge key, fall back to the value of the last key."""
        if 'bridge' in ip_dict:
            log.warning("Using the bridge network.")
            return ip_dict['bridge']
        else:
            last_key = sorted(ip_dict.iterkeys())[-1]
            log.warning("Trying with the last key: '%s'." % last_key)
            return ip_dict[last_key]

    def _get_port(self, container_inspect, tpl_var):
        """Extract a port from a container_inspect or the k8s API given a template variable."""
        c_id = container_inspect.get('Id', '')

        try:
            ports = map(lambda x: x.split('/')[0], container_inspect['NetworkSettings']['Ports'].keys())
        except (IndexError, KeyError, AttributeError):
            # try to get ports from the docker API. Works if the image has an EXPOSE instruction
            ports = map(lambda x: x.split('/')[0], container_inspect['Config'].get('ExposedPorts', {}).keys())

            # if it failed, try with the kubernetes API
            if not ports and is_k8s():
                log.debug("Didn't find the port for container %s (%s), trying the kubernetes way." %
                          (c_id[:12], container_inspect.get('Config', {}).get('Image', '')))
                co_statuses = self._get_kube_config(c_id, 'status').get('containerStatuses', [])
                c_name = None
                for co in co_statuses:
                    if co.get('containerID', '').split('//')[-1] == c_id:
                        c_name = co.get('name')
                        break
                containers = self._get_kube_config(c_id, 'spec').get('containers', [])
                for co in containers:
                    if co.get('name') == c_name:
                        ports = map(lambda x: str(x.get('containerPort')), co.get('ports', []))
        ports = sorted(ports, key=lambda x: int(x))
        return self._extract_port_from_list(ports, tpl_var)

    def _extract_port_from_list(self, ports, tpl_var):
        if not ports:
            return None
        tpl_parts = tpl_var.split('_')

        if len(tpl_parts) == 1:
            log.debug("No index was passed for template variable %s. "
                      "Trying with the last element." % tpl_var)
            return ports[-1]

        try:
            idx = tpl_parts[-1]
            return ports[int(idx)]
        except ValueError:
            log.error("Port index is not an integer. Using the last element instead.")
        except IndexError:
            log.error("Port index is out of range. Using the last element instead.")
        return ports[-1]

    def get_tags(self, c_inspect):
        """Extract useful tags from docker or platform APIs. These are collected by default."""
        tags = []
        if is_k8s():
            pod_metadata = self._get_kube_config(c_inspect.get('Id'), 'metadata')

            if pod_metadata is None:
                log.warning("Failed to fetch pod metadata for container %s."
                            " Kubernetes tags may be missing." % c_inspect.get('Id', '')[:12])
                return []
            # get labels
            kube_labels = pod_metadata.get('labels', {})
            for label, value in kube_labels.iteritems():
                tags.append('%s:%s' % (label, value))

            # get replication controller
            created_by = json.loads(pod_metadata.get('annotations', {}).get('kubernetes.io/created-by', '{}'))
            if created_by.get('reference', {}).get('kind') == 'ReplicationController':
                tags.append('kube_replication_controller:%s' % created_by.get('reference', {}).get('name'))

            # get kubernetes namespace
            tags.append('kube_namespace:%s' % pod_metadata.get('namespace'))
        return tags

    def _get_additional_tags(self, container_inspect, *args):
        tags = []
        if is_k8s():
            pod_metadata = self._get_kube_config(container_inspect.get('Id'), 'metadata')
            pod_spec = self._get_kube_config(container_inspect.get('Id'), 'spec')
            if pod_metadata is None or pod_spec is None:
                log.warning("Failed to fetch pod metadata or pod spec for container %s."
                            " Additional Kubernetes tags may be missing." % container_inspect.get('Id', '')[:12])
                return []
            tags.append('node_name:%s' % pod_spec.get('nodeName'))
            tags.append('pod_name:%s' % pod_metadata.get('name'))
        return tags

    def _get_kube_config(self, c_id, key):
        """Get a part of a pod config from the kubernetes API"""
        pods = self.kubeutil.retrieve_pods_list().get('items', [])
        for pod in pods:
            c_statuses = pod.get('status', {}).get('containerStatuses', [])
            for status in c_statuses:
                if c_id == status.get('containerID', '').split('//')[-1]:
                    return pod.get(key, {})

    def get_configs(self):
        """Get the config for all docker containers running on the host."""
        configs = {}
        containers = [(
            container.get('Image'),
            container.get('Id'), container.get('Labels')
        ) for container in self.docker_client.containers()]

        # used by the configcheck agent command to trace where check configs come from
        trace_config = self.agentConfig.get(TRACE_CONFIG, False)

        for image, cid, labels in containers:
            try:
                # value of the DATADOG_ID label or the image name if the label is missing
                identifier = self.get_config_id(image, labels)
                check_configs = self._get_check_configs(cid, identifier, trace_config=trace_config) or []
                for conf in check_configs:
                    if trace_config and conf is not None:
                        source, conf = conf
                    check_name, init_config, instance = conf

                    # build instances list if needed
                    if configs.get(check_name) is None:
                        if trace_config:
                            configs[check_name] = (source, (init_config, [instance]))
                        else:
                            configs[check_name] = (init_config, [instance])
                    else:
                        conflict_init_msg = 'Different versions of `init_config` found for check {0}. ' \
                                            'Keeping the first one found.'
                        if trace_config:
                            if configs[check_name][1][0] != init_config:
                                log.warning(conflict_init_msg.format(check_name))
                            configs[check_name][1][1].append(instance)
                        else:
                            if configs[check_name][0] != init_config:
                                log.warning(conflict_init_msg.format(check_name))
                            configs[check_name][1].append(instance)
            except Exception:
                log.exception('Building config for container %s based on image %s using service '
                              'discovery failed, leaving it alone.' % (cid[:12], image))
        return configs

    def get_config_id(self, image, labels):
        """Look for a DATADOG_ID label, return its value or the image name if missing"""
        return labels.get(DATADOG_ID) or image

    def _get_check_configs(self, c_id, identifier, trace_config=False):
        """Retrieve configuration templates and fill them with data pulled from docker and tags."""
        inspect = self.docker_client.inspect_container(c_id)
        config_templates = self._get_config_templates(identifier, trace_config=trace_config)
        if not config_templates:
            log.debug('No config template for container %s with identifier %s. '
                      'It will be left unconfigured.' % (c_id[:12], identifier))
            return None

        check_configs = []
        tags = self.get_tags(inspect)
        for config_tpl in config_templates:
            if trace_config:
                source, config_tpl = config_tpl
            check_name, init_config_tpl, instance_tpl, variables = config_tpl

            # insert tags in instance_tpl and process values for template variables
            instance_tpl, var_values = self._fill_tpl(inspect, instance_tpl, variables, tags)

            tpl = self._render_template(init_config_tpl or {}, instance_tpl or {}, var_values)
            if tpl and len(tpl) == 2:
                init_config, instance = tpl
                if trace_config:
                    check_configs.append((source, (check_name, init_config, instance)))
                else:
                    check_configs.append((check_name, init_config, instance))

        return check_configs

    def _get_config_templates(self, identifier, trace_config=False):
        """Extract config templates for an identifier from a K/V store and return them as a list."""
        config_backend = self.agentConfig.get('sd_config_backend')
        templates = []
        if config_backend is None:
            auto_conf = True
            log.warning('No supported configuration backend was provided, using auto-config only.')
        else:
            auto_conf = False

        # format: [('ident', {init_tpl}, {instance_tpl})] without trace_config
        # or      [(source, ('ident', {init_tpl}, {instance_tpl}))] with trace_config
        raw_tpls = self.config_store.get_check_tpls(
            identifier, auto_conf=auto_conf, trace_config=trace_config)
        for tpl in raw_tpls:
            if trace_config and tpl is not None:
                # each template can come from either auto configuration or user-supplied templates
                source, tpl = tpl
            if tpl is not None and len(tpl) == 3:
                check_name, init_config_tpl, instance_tpl = tpl
            else:
                log.debug('No template was found for identifier %s, leaving it alone.' % identifier)
                return None
            try:
                # build a list of all variables to replace in the template
                variables = self.PLACEHOLDER_REGEX.findall(str(init_config_tpl)) + \
                    self.PLACEHOLDER_REGEX.findall(str(instance_tpl))
                variables = map(lambda x: x.strip('%'), variables)
                if not isinstance(init_config_tpl, dict):
                    init_config_tpl = json.loads(init_config_tpl or '{}')
                if not isinstance(instance_tpl, dict):
                    instance_tpl = json.loads(instance_tpl or '{}')
            except json.JSONDecodeError:
                log.exception('Failed to decode the JSON template fetched for check {0}. Its configuration'
                              ' by service discovery failed for ident {1}.'.format(check_name, identifier))
                return None

            if trace_config:
                templates.append((source, (check_name, init_config_tpl, instance_tpl, variables)))
            else:
                templates.append((check_name, init_config_tpl, instance_tpl, variables))

        return templates

    def _fill_tpl(self, inspect, instance_tpl, variables, tags=None):
        """Add container tags to instance templates and build a dict from
        template variable names and their values."""
        var_values = {}
        c_id, c_image = inspect.get('Id', ''), inspect.get('Config', {}).get('Image', '')

        # add default tags to the instance
        if tags:
            tpl_tags = instance_tpl.get('tags', [])
            tags += tpl_tags if isinstance(tpl_tags, list) else [tpl_tags]
            instance_tpl['tags'] = list(set(tags))

        for var in variables:
            # variables can be suffixed with an index in case several values are found
            if var.split('_')[0] in self.VAR_MAPPING:
                try:
                    res = self.VAR_MAPPING[var.split('_')[0]](inspect, var)
                    if res is None:
                        raise ValueError("Invalid value for variable %s." % var)
                    var_values[var] = res
                except Exception as ex:
                    log.error("Could not find a value for the template variable %s for container %s "
                              "(%s): %s" % (var, c_id[:12], c_image, str(ex)))
            else:
                log.error("No method was found to interpolate template variable %s for container %s "
                          "(%s)." % (var, c_id[:12], c_image))

        return instance_tpl, var_values
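
# A short sketch (made-up values) of how the VAR_MAPPING dispatch above
# resolves suffixed template variables: the prefix before the first '_'
# selects the method, which interprets the suffix itself, e.g. as a network
# name in _extract_ip_from_networks or as an index in _extract_port_from_list.
def _example_variable_resolution():
    ip_dict = {'bridge': '172.17.0.2', 'overlay': '10.0.0.2'}
    ports = ['6379', '8080']
    # 'host_bridge' -> the 'bridge' entry of the networks dict
    assert ip_dict['host_bridge'.split('_')[-1]] == '172.17.0.2'
    # 'port_1' -> index 1 of the sorted port list
    assert ports[int('port_1'.split('_')[-1])] == '8080'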
class ECSUtil:
    __metaclass__ = Singleton

    def __init__(self):
        self.docker_util = DockerUtil()
        self.ecs_agent_local = None

        self.ecs_tags = {}
        self._populate_ecs_tags()

    def _get_ecs_address(self):
        """Detect how to connect to the ecs-agent"""
        ecs_config = self.docker_util.inspect_container('ecs-agent')
        ip = ecs_config.get('NetworkSettings', {}).get('IPAddress')
        ports = ecs_config.get('NetworkSettings', {}).get('Ports')
        port = ports.keys()[0].split('/')[0] if ports else None
        if not ip:
            port = ECS_INTROSPECT_DEFAULT_PORT
            if self._is_ecs_agent_local():
                ip = "localhost"
            elif Platform.is_containerized():
                ip = self.docker_util.get_gateway()
            else:
                raise Exception("Unable to determine ecs-agent IP address")
        return ip, port

    def _populate_ecs_tags(self, skip_known=False):
        """
        Populate the cache of ecs tags. Can be called with skip_known=True
        if we just want to update new containers quickly (single task api call)
        (because we detected that a new task started, for example)
        """
        try:
            ip, port = self._get_ecs_address()
        except Exception as ex:
            log.warning("Failed to connect to ecs-agent, skipping task tagging: %s" % ex)
            return

        try:
            tasks = requests.get('http://%s:%s/v1/tasks' % (ip, port)).json()
            for task in tasks.get('Tasks', []):
                for container in task.get('Containers', []):
                    cid = container['DockerId']

                    if skip_known and cid in self.ecs_tags:
                        continue

                    tags = ['task_name:%s' % task['Family'], 'task_version:%s' % task['Version']]
                    self.ecs_tags[container['DockerId']] = tags
        except requests.exceptions.HTTPError as ex:
            log.warning("Unable to collect ECS task names: %s" % ex)

    def _get_container_tags(self, cid):
        """
        This method triggers a fast fill of the tag cache (useful when a new
        task starts and we want the new containers to be cached with a single
        api call) and returns the tags (or an empty list) from the fresh cache.
        """
        self._populate_ecs_tags(skip_known=True)

        if cid in self.ecs_tags:
            return self.ecs_tags[cid]
        else:
            log.debug("Container %s doesn't seem to be an ECS task, skipping." % cid[:12])
            self.ecs_tags[cid] = []
            return []

    def _is_ecs_agent_local(self):
        """Return True if we can reach the ecs-agent over localhost, False otherwise.
        This is needed because if the ecs-agent is started with --net=host it won't have
        an IP address attached.
        """
        if self.ecs_agent_local is not None:
            return self.ecs_agent_local

        self.ecs_agent_local = False
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        sock.settimeout(5)
        try:
            result = sock.connect_ex(('localhost', ECS_INTROSPECT_DEFAULT_PORT))
        except Exception as e:
            log.debug("Unable to connect to ecs-agent. Exception: {0}".format(e))
        else:
            if result == 0:
                self.ecs_agent_local = True
            else:
                log.debug("ecs-agent is not available locally, encountered error code: {0}".format(result))
        sock.close()
        return self.ecs_agent_local

    def extract_container_tags(self, co):
        """
        Queries the ecs-agent to get ECS tags (task and task version) for a container.
        As this is expensive, it is cached in the self.ecs_tags dict.
        The cache invalidation goes through invalidate_cache, called by the docker_daemon check.

        :param co: container dict returned by docker-py
        :return: tags as list<string>, cached
        """
        co_id = co.get('Id', None)

        if co_id is None:
            log.warning("Invalid container object in extract_container_tags")
            return []

        if co_id in self.ecs_tags:
            return self.ecs_tags[co_id]
        else:
            return self._get_container_tags(co_id)

    def invalidate_cache(self, events):
        """
        Allows cache invalidation when containers die
        :param events from self.get_events
        """
        try:
            for ev in events:
                if ev.get('status') == 'die' and ev.get('id') in self.ecs_tags:
                    del self.ecs_tags[ev.get('id')]
        except Exception as e:
            log.warning("Error when invalidating ecs cache: " + str(e))
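
# For reference, a minimal sketch of the ecs-agent introspection payload that
# _populate_ecs_tags() parses; the field values are made up, only the key
# names ('Tasks', 'Containers', 'DockerId', 'Family', 'Version') come from
# the code above.
def _example_ecs_tasks_payload():
    payload = {'Tasks': [{'Family': 'my-task', 'Version': '3',
                          'Containers': [{'DockerId': 'abc123', 'Name': 'web'}]}]}
    task = payload['Tasks'][0]
    tags = ['task_name:%s' % task['Family'], 'task_version:%s' % task['Version']]
    assert tags == ['task_name:my-task', 'task_version:3']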
class BaseUtil:
    """
    Base class for orchestrator utils. Only handles container tags for now.
    Users should go through the orchestrator.Tagger class to simplify the code.

    Children classes can implement:
      - __init__: to change self.needs_inspect
      - _get_cacheable_tags: tags will be cached for reuse
      - _get_transient_tags: tags can change and won't be cached (TODO)
      - invalidate_cache: custom cache invalidation logic
      - is_detected (staticmethod)
    """
    __metaclass__ = Singleton

    def __init__(self):
        # Whether your get___tags methods need the Config section inspect data
        self.needs_inspect_config = False
        # Whether your get___tags methods need the Labels section inspect data
        self.needs_inspect_labels = False

        self.log = logging.getLogger(__name__)
        self.docker_util = DockerUtil()

        # Tags cache as a dict {co_id: [tags]}
        self._container_tags_cache = {}

    def get_container_tags(self, cid=None, co=None):
        """
        Returns container tags for the given container, inspecting the container if needed
        :param container: either the container id or container dict returned by docker-py
        :return: tags as list<string>, cached
        """
        if (cid is not None) and (co is not None):
            self.log.error("Can only pass either a container id or object, not both, returning empty tags")
            return []
        if (cid is None) and (co is None):
            self.log.error("Need one container id or container object, returning empty tags")
            return []
        elif co is not None:
            if 'Id' in co:
                cid = co.get('Id')
            else:
                self.log.warning("Invalid container dict, returning empty tags")
                return []

        if cid in self._container_tags_cache:
            return self._container_tags_cache[cid]
        else:
            if self.needs_inspect_config and (co is None or 'Config' not in co):
                co = self.docker_util.inspect_container(cid)
            if self.needs_inspect_labels and (co is None or 'Labels' not in co):
                co = self.docker_util.inspect_container(cid)

            self._container_tags_cache[cid] = self._get_cacheable_tags(cid, co)
            return self._container_tags_cache[cid]

    def invalidate_cache(self, events):
        """
        Allows cache invalidation when containers die
        :param events from self.get_events
        """
        try:
            for ev in events:
                if ev.get('status') == 'die' and ev.get('id') in self._container_tags_cache:
                    del self._container_tags_cache[ev.get('id')]
        except Exception as e:
            self.log.warning("Error when invalidating tag cache: " + str(e))

    def reset_cache(self):
        """
        Empties all caches to reset the singleton to initial state
        """
        self._container_tags_cache = {}

    # Util methods for children classes

    def _try_urls(self, urls, validation_lambda=None, timeout=1):
        """
        When detecting orchestrator agents, one might need to try several IPs
        before finding the right one. The first url returning a 200 and passing
        the validation lambda is returned. If no lambda is provided, the first
        url to return a 200 is returned.
        :param urls: list of urls to try
        :param validation_lambda: lambda returning a boolean from a requests.Response
        :return: first url matching, or None
        """
        if not urls:
            return None
        for url in urls:
            try:
                response = requests.get(url, timeout=timeout)
                if response.status_code != requests.codes.ok:
                    continue
                if validation_lambda and not validation_lambda(response):
                    continue
                return url
            except requests.exceptions.RequestException:  # network error
                continue
            except ValueError:  # JSON parsing or dict search
                continue
            except TypeError:  # NoneType errors
                continue
        return None
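
# A usage sketch for _try_urls(): probe candidate endpoints (the URLs below
# are hypothetical) and keep the first that returns a 200 whose JSON body
# passes the validation lambda. A ValueError raised by r.json() on a
# non-JSON body is caught inside _try_urls and treated as a non-match.
def _example_try_urls(util):
    candidates = ['http://localhost:51678/v1/metadata',
                  'http://172.17.0.1:51678/v1/metadata']
    return util._try_urls(candidates,
                          validation_lambda=lambda r: 'Version' in r.json(),
                          timeout=1)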
class NomadUtil:
    __metaclass__ = Singleton

    def __init__(self):
        self.docker_util = DockerUtil()

        # Tags cache as a dict {co_id: (create_timestamp, [tags])}
        self._container_tags_cache = {}

    def extract_container_tags(self, co):
        """
        Queries docker inspect to get nomad tags in the container's environment vars.
        As this is expensive, it is cached in the self._container_tags_cache dict.
        The cache invalidation goes through invalidate_cache, called by the docker_daemon check.

        :param co: container dict returned by docker-py
        :return: tags as list<string>, cached
        """
        co_id = co.get('Id', None)
        if co_id is None:
            log.warning("Invalid container object in extract_container_tags")
            return []

        # Cache lookup on Id, verified on Created timestamp
        if co_id in self._container_tags_cache:
            created, tags = self._container_tags_cache[co_id]
            if created == co.get('Created', -1):
                return tags

        tags = []
        try:
            inspect_info = self.docker_util.inspect_container(co_id)
            envvars = inspect_info.get('Config', {}).get('Env', [])

            for var in envvars:
                if var.startswith(NOMAD_TASK_NAME):
                    tags.append('nomad_task:%s' % var[len(NOMAD_TASK_NAME) + 1:])
                elif var.startswith(NOMAD_JOB_NAME):
                    tags.append('nomad_job:%s' % var[len(NOMAD_JOB_NAME) + 1:])
                elif var.startswith(NOMAD_ALLOC_NAME):
                    try:
                        start = var.index('.', len(NOMAD_ALLOC_NAME)) + 1
                        end = var.index('[')
                        if end <= start:
                            raise ValueError("Error extracting group from %s, check format" % var)
                        tags.append('nomad_group:%s' % var[start:end])
                    except ValueError:
                        pass
            self._container_tags_cache[co_id] = (co.get('Created'), tags)
        except Exception as e:
            log.warning("Error while parsing Nomad tags: %s" % str(e))

        return tags

    def invalidate_cache(self, events):
        """
        Allows cache invalidation when containers die
        :param events from self.get_events
        """
        try:
            for ev in events:
                if ev.get('status') == 'die' and ev.get('id') in self._container_tags_cache:
                    del self._container_tags_cache[ev.get('id')]
        except Exception as e:
            log.warning("Error when invalidating nomad cache: " + str(e))
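
# A worked example of the group extraction above, assuming the NOMAD_*_NAME
# constants hold the literal variable names and a hypothetical alloc name
# 'example.cache[0]': the group is the substring between the first '.' after
# the key and the '[' that opens the allocation index.
def _example_nomad_group():
    var = 'NOMAD_ALLOC_NAME=example.cache[0]'
    start = var.index('.', len('NOMAD_ALLOC_NAME')) + 1
    end = var.index('[')
    assert var[start:end] == 'cache'  # yields the tag 'nomad_group:cache'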
# An earlier variant of SDDockerBackend: identifiers are plain image names and
# template variables are resolved without passing the variable name through.
class SDDockerBackend(AbstractSDBackend):
    """Docker-based service discovery"""

    def __init__(self, agentConfig):
        self.docker_client = DockerUtil().client
        if is_k8s():
            self.kubeutil = KubeUtil()
        try:
            self.config_store = get_config_store(agentConfig=agentConfig)
        except Exception as e:
            log.error('Failed to instantiate the config store client. '
                      'Auto-config only will be used. %s' % str(e))
            agentConfig['sd_config_backend'] = None
            self.config_store = get_config_store(agentConfig=agentConfig)

        self.VAR_MAPPING = {
            'host': self._get_host,
            'port': self._get_ports,
            'tags': self._get_additional_tags,
        }
        AbstractSDBackend.__init__(self, agentConfig)

    def _get_host(self, container_inspect):
        """Extract the host IP from a docker inspect object, or the kubelet API."""
        ip_addr = container_inspect.get('NetworkSettings', {}).get('IPAddress')
        if not ip_addr:
            if not is_k8s():
                return
            # kubernetes case
            log.debug("Didn't find the IP address for container %s (%s), using the kubernetes way." %
                      (container_inspect.get('Id', '')[:12],
                       container_inspect.get('Config', {}).get('Image', '')))
            pod_list = self.kubeutil.retrieve_pods_list().get('items', [])
            c_id = container_inspect.get('Id')
            for pod in pod_list:
                pod_ip = pod.get('status', {}).get('podIP')
                if pod_ip is None:
                    continue
                else:
                    c_statuses = pod.get('status', {}).get('containerStatuses', [])
                    for status in c_statuses:
                        # compare the container id with those of containers in the current pod
                        if c_id == status.get('containerID', '').split('//')[-1]:
                            ip_addr = pod_ip

        return ip_addr

    def _get_ports(self, container_inspect):
        """Extract a list of available ports from a docker inspect object. Sort them numerically."""
        c_id = container_inspect.get('Id', '')
        try:
            ports = map(lambda x: x.split('/')[0], container_inspect['NetworkSettings']['Ports'].keys())
        except (IndexError, KeyError, AttributeError):
            log.debug("Didn't find the port for container %s (%s), trying the kubernetes way." %
                      (c_id[:12], container_inspect.get('Config', {}).get('Image', '')))
            # first we try to get it from the docker API
            # it works if the image has an EXPOSE instruction
            ports = map(lambda x: x.split('/')[0], container_inspect['Config'].get('ExposedPorts', {}).keys())
            # if it failed, try with the kubernetes API
            if not ports and is_k8s():
                co_statuses = self._get_kube_config(c_id, 'status').get('containerStatuses', [])
                c_name = None
                for co in co_statuses:
                    if co.get('containerID', '').split('//')[-1] == c_id:
                        c_name = co.get('name')
                        break
                containers = self._get_kube_config(c_id, 'spec').get('containers', [])
                for co in containers:
                    if co.get('name') == c_name:
                        ports = map(lambda x: str(x.get('containerPort')), co.get('ports', []))
        ports = sorted(ports, key=lambda x: int(x))
        return ports

    def get_tags(self, c_inspect):
        """Extract useful tags from docker or platform APIs. These are collected by default."""
        tags = []
        if is_k8s():
            pod_metadata = self._get_kube_config(c_inspect.get('Id'), 'metadata')

            if pod_metadata is None:
                log.warning("Failed to fetch pod metadata for container %s."
                            " Kubernetes tags may be missing." % c_inspect.get('Id', '')[:12])
                return []
            # get labels
            kube_labels = pod_metadata.get('labels', {})
            for label, value in kube_labels.iteritems():
                tags.append('%s:%s' % (label, value))

            # get replication controller
            created_by = json.loads(pod_metadata.get('annotations', {}).get('kubernetes.io/created-by', '{}'))
            if created_by.get('reference', {}).get('kind') == 'ReplicationController':
                tags.append('kube_replication_controller:%s' % created_by.get('reference', {}).get('name'))

            # get kubernetes namespace
            tags.append('kube_namespace:%s' % pod_metadata.get('namespace'))
        return tags

    def _get_additional_tags(self, container_inspect):
        tags = []
        if is_k8s():
            pod_metadata = self._get_kube_config(container_inspect.get('Id'), 'metadata')
            pod_spec = self._get_kube_config(container_inspect.get('Id'), 'spec')

            tags.append('node_name:%s' % pod_spec.get('nodeName'))
            tags.append('pod_name:%s' % pod_metadata.get('name'))
        return tags

    def _get_kube_config(self, c_id, key):
        """Get a part of a pod config from the kubernetes API"""
        pods = self.kubeutil.retrieve_pods_list().get('items', [])
        for pod in pods:
            c_statuses = pod.get('status', {}).get('containerStatuses', [])
            for status in c_statuses:
                if c_id == status.get('containerID', '').split('//')[-1]:
                    return pod.get(key, {})

    def get_configs(self):
        """Get the config for all docker containers running on the host."""
        configs = {}
        containers = [(
            container.get('Image').split(':')[0].split('/')[-1],
            container.get('Id'), container.get('Labels')
        ) for container in self.docker_client.containers()]

        # used by the configcheck agent command to trace where check configs come from
        trace_config = self.agentConfig.get(TRACE_CONFIG, False)

        for image, cid, labels in containers:
            try:
                check_configs = self._get_check_configs(cid, image, trace_config=trace_config) or []
                for conf in check_configs:
                    if trace_config and conf is not None:
                        source, conf = conf
                    check_name, init_config, instance = conf

                    # build instances list if needed
                    if configs.get(check_name) is None:
                        if trace_config:
                            configs[check_name] = (source, (init_config, [instance]))
                        else:
                            configs[check_name] = (init_config, [instance])
                    else:
                        conflict_init_msg = 'Different versions of `init_config` found for check {0}. ' \
                                            'Keeping the first one found.'
                        if trace_config:
                            if configs[check_name][1][0] != init_config:
                                log.warning(conflict_init_msg.format(check_name))
                            configs[check_name][1][1].append(instance)
                        else:
                            if configs[check_name][0] != init_config:
                                log.warning(conflict_init_msg.format(check_name))
                            configs[check_name][1].append(instance)
            except Exception:
                log.exception('Building config for container %s based on image %s using service'
                              ' discovery failed, leaving it alone.' % (cid[:12], image))
        return configs

    def _get_check_configs(self, c_id, image, trace_config=False):
        """Retrieve configuration templates and fill them with data pulled from docker and tags."""
        inspect = self.docker_client.inspect_container(c_id)
        config_templates = self._get_config_templates(image, trace_config=trace_config)
        if not config_templates:
            log.debug('No config template for container %s with image %s. '
                      'It will be left unconfigured.' % (c_id[:12], image))
            return None

        check_configs = []
        tags = self.get_tags(inspect)
        for config_tpl in config_templates:
            if trace_config:
                source, config_tpl = config_tpl
            check_name, init_config_tpl, instance_tpl, variables = config_tpl

            # insert tags in instance_tpl and process values for template variables
            instance_tpl, var_values = self._fill_tpl(inspect, instance_tpl, variables, tags)

            tpl = self._render_template(init_config_tpl or {}, instance_tpl or {}, var_values)
            if tpl and len(tpl) == 2:
                if trace_config and len(tpl[1]) == 2:
                    source, (init_config, instance) = tpl
                    check_configs.append((source, (check_name, init_config, instance)))
                elif not trace_config:
                    init_config, instance = tpl
                    check_configs.append((check_name, init_config, instance))

        return check_configs

    def _get_config_templates(self, image_name, trace_config=False):
        """Extract config templates for an image from a K/V store and return them as a list."""
        config_backend = self.agentConfig.get('sd_config_backend')
        templates = []
        if config_backend is None:
            auto_conf = True
            log.warning('No supported configuration backend was provided, using auto-config only.')
        else:
            auto_conf = False

        # format: [('image', {init_tpl}, {instance_tpl})] without trace_config
        # or      [(source, ('image', {init_tpl}, {instance_tpl}))] with trace_config
        raw_tpls = self.config_store.get_check_tpls(image_name, auto_conf=auto_conf, trace_config=trace_config)
        for tpl in raw_tpls:
            if trace_config and tpl is not None:
                # each template can come from either auto configuration or user-supplied templates
                source, tpl = tpl
            if tpl is not None and len(tpl) == 3:
                check_name, init_config_tpl, instance_tpl = tpl
            else:
                log.debug('No template was found for image %s, leaving it alone.' % image_name)
                return None
            try:
                # build a list of all variables to replace in the template
                variables = self.PLACEHOLDER_REGEX.findall(str(init_config_tpl)) + \
                    self.PLACEHOLDER_REGEX.findall(str(instance_tpl))
                variables = map(lambda x: x.strip('%'), variables)
                if not isinstance(init_config_tpl, dict):
                    init_config_tpl = json.loads(init_config_tpl or '{}')
                if not isinstance(instance_tpl, dict):
                    instance_tpl = json.loads(instance_tpl or '{}')
            except json.JSONDecodeError:
                log.exception('Failed to decode the JSON template fetched for check {0}. Its configuration'
                              ' by service discovery failed for {1}.'.format(check_name, image_name))
                return None

            if trace_config:
                templates.append((source, (check_name, init_config_tpl, instance_tpl, variables)))
            else:
                templates.append((check_name, init_config_tpl, instance_tpl, variables))

        return templates

    def _fill_tpl(self, inspect, instance_tpl, variables, tags=None):
        """Add container tags to instance templates and build a dict from
        template variable names and their values."""
        var_values = {}

        # add default tags to the instance
        if tags:
            tags += instance_tpl.get('tags', [])
            instance_tpl['tags'] = list(set(tags))

        for v in variables:
            # variables can be suffixed with an index in case a list is found
            var_parts = v.split('_')
            if var_parts[0] in self.VAR_MAPPING:
                try:
                    res = self.VAR_MAPPING[var_parts[0]](inspect)
                    if not res:
                        raise ValueError("Invalid value for variable %s." % var_parts[0])
                    # if an index is found in the variable, use it to select a value
                    if len(var_parts) > 1 and isinstance(res, list) and int(var_parts[-1]) < len(res):
                        var_values[v] = res[int(var_parts[-1])]
                    # if no valid index was found but we have a list, return the last element
                    elif isinstance(res, list):
                        var_values[v] = res[-1]
                    else:
                        var_values[v] = res
                except Exception as ex:
                    log.error("Could not find a value for the template variable %s: %s" % (v, str(ex)))
            else:
                log.error("No method was found to interpolate template variable %s." % v)

        return instance_tpl, var_values
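
# A worked example (made-up port list) of the index handling in _fill_tpl
# above: '%%port_0%%' selects res[0], while a bare '%%port%%' or an
# out-of-range index falls back to the last element.
def _example_fill_tpl_index():
    res = ['6379', '8080']           # what _get_ports() might return
    var_parts = 'port_0'.split('_')  # ['port', '0'] -> res[0]
    assert res[int(var_parts[-1])] == '6379'
    # for 'port_9', 9 >= len(res), so _fill_tpl would use res[-1] == '8080'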