Пример #1
0
    def best_label(self, obj):
        """Returns the most human-readable label found in the given object.

        Candidates are tried in this order:
          1. "alternateLabel", if it has at least one non-hexadecimal
             character.
          2. "label", if it has at least one non-hexadecimal character.
          3. "alternateLabel", if it is a valid string.
          4. "label", if it is a valid string.

        This function must be called when self._lock is held.

        Args:
          obj: a dictionary containing an "annotations" attribute. The value
            of this attribute should be a dictionary, which may contain
            "alternateLabel" and "label" attributes.

        Returns:
          The selected label, or '<unknown>' if neither attribute passes
          utilities.valid_string().
        """
        # Ordered by preference: the alternate label always goes first.
        candidates = (
            utilities.get_attribute(obj, ['annotations', 'alternateLabel']),
            utilities.get_attribute(obj, ['annotations', 'label']),
        )

        # First pass: accept only labels that are not purely hexadecimal.
        for candidate in candidates:
            if (utilities.valid_string(candidate)
                    and re.search('[^0-9a-fA-F]', candidate)):
                return candidate

        # Second pass: settle for any valid label, hexadecimal or not.
        for candidate in candidates:
            if utilities.valid_string(candidate):
                return candidate

        # Should not arrive here.
        return '<unknown>'
Пример #2
0
  def add_relation(self, source, target, kind, label=None, metadata=None):
    """Adds a relation to the context graph.

    The timestamp of the relation is inherited from the previous context
    graph when the same (source, target, kind) triple was recorded there.
    Otherwise the current time is used.

    Args:
      source: the ID of the source resource. Must be a valid string.
      target: the ID of the target resource. Must be a valid string.
      kind: the type of the relation. Must be a valid string.
      label: optional label of the relation. 'kind' is used when it is None.
      metadata: optional dictionary. A deep copy of it is stored in the
        relation's annotations, so later changes made by the caller do not
        affect the graph.

    Raises:
      AssertionError: if any of the arguments is invalid.
    """
    assert utilities.valid_string(source) and utilities.valid_string(target)
    assert utilities.valid_string(kind)
    assert utilities.valid_optional_string(label)
    assert (metadata is None) or isinstance(metadata, types.DictType)

    with self._lock:
      # The timestamp of the relation should be inherited from the previous
      # context graph.
      key = (source, target, kind)
      timestamp = self._previous_relations_to_timestamps.get(key)
      if not utilities.valid_string(timestamp):
        timestamp = utilities.now()

      # Add the relation to the context graph data structure.
      relation = {
          'source': source,
          'target': target,
          'type': kind,
          'timestamp': timestamp
      }
      self._current_relations_to_timestamps[key] = timestamp

      # Add annotations as needed.
      relation['annotations'] = {}
      if metadata is not None:
        # The copy module provides deepcopy(); there is no deep_copy().
        relation['annotations']['metadata'] = copy.deepcopy(metadata)

      relation['annotations']['label'] = label if label is not None else kind
      if self._version is not None:
        relation['annotations']['createdBy'] = self._version

      self._context_relations.append(relation)
Пример #3
0
def annotate_container(project_id, container, parent_pod):
    """Attaches Heapster GCM metric information to the given container.

    On success the metrics are stored in container["annotations"]["metrics"].
    The container is left unchanged when _make_gcm_metrics() returns None.

    Args:
      project_id: the project ID
      container: the container object to annotate.
      parent_pod: the parent pod of 'container'.

    Raises:
      AssertionError: if the input arguments are invalid or if
        'parent_pod' is not the parent of 'container'
    """
    assert utilities.valid_string(project_id)
    assert utilities.is_wrapped_object(container, "Container")
    assert utilities.is_wrapped_object(parent_pod, "Pod")

    container_hostname = utilities.get_attribute(
        container, ["properties", "Config", "Hostname"])
    assert utilities.valid_string(container_hostname)
    full_pod_name = utilities.get_attribute(
        parent_pod, ["properties", "metadata", "name"])
    assert utilities.valid_string(full_pod_name)

    # The container's hostname is the pod name truncated to its first 64
    # characters, so it has to be a prefix of the full pod name.
    assert full_pod_name.startswith(container_hostname)

    labels = _get_container_labels(container, parent_pod)
    gcm_metrics = _make_gcm_metrics(project_id, labels)
    if gcm_metrics is not None:
        if container.get("annotations") is None:
            container["annotations"] = {}
        container["annotations"]["metrics"] = gcm_metrics
Пример #4
0
  def best_label(self, obj):
    """Chooses the best human-readable label of the given object.

    The "alternateLabel" is preferred over the "label", and a string that
    contains something other than hexadecimal digits is preferred over a
    string made up only of hexadecimal digits.

    This function must be called when self._lock is held.

    Args:
      obj: a dictionary containing an "annotations" attribute. The value
        of this attribute should be a dictionary, which may contain
        "alternateLabel" and "label" attributes.

    Returns:
      The best human-readable label, or '<unknown>' when neither attribute
      passes utilities.valid_string().
    """
    non_hex_pattern = '[^0-9a-fA-F]'
    alternate = utilities.get_attribute(
        obj, ['annotations', 'alternateLabel'])
    primary = utilities.get_attribute(obj, ['annotations', 'label'])

    # Labels with at least one non-hexadecimal character win.
    if utilities.valid_string(alternate) and re.search(
        non_hex_pattern, alternate):
      return alternate
    if utilities.valid_string(primary) and re.search(
        non_hex_pattern, primary):
      return primary

    # Otherwise fall back to any valid label, the alternate one first.
    if utilities.valid_string(alternate):
      return alternate
    if utilities.valid_string(primary):
      return primary

    # Should not arrive here.
    return '<unknown>'
Пример #5
0
  def add_relation(self, source, target, kind, label=None, metadata=None):
    """Appends a relation between two resources to the context graph.

    Args:
      source: the ID of the source resource. Must be a valid string.
      target: the ID of the target resource. Must be a valid string.
      kind: the type of the relation. Must be a valid string.
      label: optional label of the relation. 'kind' is used when it is None.
      metadata: optional dictionary. It is stored by reference (not copied)
        in the relation's annotations.

    Raises:
      AssertionError: if any of the arguments is invalid.
    """
    assert utilities.valid_string(source)
    assert utilities.valid_string(target)
    assert utilities.valid_string(kind)
    assert utilities.valid_optional_string(label)
    assert (metadata is None) or isinstance(metadata, types.DictType)

    with self._lock:
      # Collect the annotations first, then assemble the relation around
      # them.
      annotations = {}
      if metadata is not None:
        annotations['metadata'] = metadata

      annotations['label'] = kind if label is None else label
      if self._version is not None:
        annotations['createdBy'] = self._version

      self._context_relations.append({
          'source': source,
          'target': target,
          'type': kind,
          'annotations': annotations,
      })
Пример #6
0
  def test_cluster(self):
    """Test the '/cluster' endpoint."""
    # Exercise the collector. Read data from golden files and compute
    # a context graph.
    before = utilities.now()
    response = self.app.get('/cluster')
    after = utilities.now()
    graph = json.loads(response.data)
    self.verify_resources(graph, before, after)

    # Expected number of relations of each type, checked in this order.
    expected_counts = (
        ('contains', 23),
        ('createdFrom', 3),
        ('loadBalances', 7),
        ('monitors', 6),
        ('runs', 10),
    )
    for kind, expected in expected_counts:
      self.assertEqual(expected, self.count_relations(graph, kind))

    # Every relation must carry a timestamp in the range [before, after].
    self.assertTrue(isinstance(graph.get('relations'), types.ListType))
    for relation in graph['relations']:
      self.assertTrue(isinstance(relation, types.DictType))
      relation_time = relation.get('timestamp')
      self.assertTrue(utilities.valid_string(relation_time))
      self.assertTrue(before <= relation_time <= after)

    # The overall timestamp must be in the expected range.
    self.assertTrue(utilities.valid_string(graph.get('timestamp')))
    self.assertTrue(before <= graph['timestamp'] <= after)

    serialized = json.dumps(graph, sort_keys=True)
    self.assertEqual(2, serialized.count('"alternateLabel": '))
    self.assertEqual(85, serialized.count('"createdBy": '))
Пример #7
0
  def add_resource(self, rid, annotations, rtype, timestamp, obj):
    """Adds a resource to the context graph unless its ID is already known.

    Args:
      rid: the resource ID. Must be a valid string.
      annotations: annotations of the resource. Must contain a valid 'label'
        string. A deep copy is stored in the graph.
      rtype: the resource type. Must be a valid string.
      timestamp: the timestamp of the resource. Must be a valid string.
      obj: a dictionary stored (by reference) as the resource's properties.

    Raises:
      AssertionError: if any of the arguments is invalid.
    """
    assert utilities.valid_string(rid)
    resource_label = utilities.get_attribute(annotations, ['label'])
    assert utilities.valid_string(resource_label)
    assert utilities.valid_string(rtype)
    assert utilities.valid_string(timestamp)
    assert isinstance(obj, types.DictType)

    with self._lock:
      # The same resource may be referenced by more than one parent.
      # Only the first reference adds it to the graph.
      if rid not in self._id_set:
        own_annotations = copy.deepcopy(annotations)
        if self._version is not None:
          own_annotations['createdBy'] = self._version

        self._context_resources.append({
            'id': rid,
            'type': rtype,
            'timestamp': timestamp,
            'annotations': own_annotations,
            'properties': obj,
        })
        self._id_set.add(rid)
Пример #8
0
def get_pod_host(gs, pod_id):
  """Looks up the host name associated with the given pod.

  Args:
    gs: global state.
    pod_id: the pod name.

  Returns:
    The associated host name if a pod with this ID and a valid host
    was found. Otherwise an empty string.

  Raises:
    CollectorError in case of failure to fetch data from Kubernetes.
    Other exceptions may be raised due to execution errors.
  """
  gs.logger_info('calling get_pod_host(pod_id=%s)', pod_id)
  for candidate in get_pods(gs):
    # Pods without a valid ID are ignored.
    if utilities.valid_string(candidate.get('id')):
      host = utilities.get_attribute(
          candidate, ['properties', 'spec', 'host'])
      # The host may be missing if the pod is in "Waiting" state.
      if candidate['id'] == pod_id and utilities.valid_string(host):
        return host

  # Could not find pod.
  return ''
Пример #9
0
    def add_resource(self, rid, annotations, rtype, timestamp, obj):
        """Records a resource in the context graph.

        A resource whose ID was already added is silently ignored.

        Args:
          rid: the resource ID. Must be a valid string.
          annotations: annotations of the resource. Must contain a valid
            'label' string. A deep copy is stored in the graph.
          rtype: the resource type. Must be a valid string.
          timestamp: the timestamp of the resource. Must be a valid string.
          obj: a dictionary stored (by reference) as the resource's
            properties.

        Raises:
          AssertionError: if any of the arguments is invalid.
        """
        label = utilities.get_attribute(annotations, ['label'])
        for required_string in (rid, label, rtype, timestamp):
            assert utilities.valid_string(required_string)
        assert isinstance(obj, types.DictType)

        with self._lock:
            # The same resource may be referenced by more than one parent,
            # but it is added to the graph only once.
            if rid in self._id_set:
                return

            annotations_copy = copy.deepcopy(annotations)
            if self._version is not None:
                annotations_copy['createdBy'] = self._version

            entry = {
                'id': rid,
                'type': rtype,
                'timestamp': timestamp,
                'annotations': annotations_copy,
                'properties': obj,
            }
            self._context_resources.append(entry)
            self._id_set.add(rid)
Пример #10
0
def annotate_container(project_id, container, parent_pod):
    """Annotates the given container with Heapster GCM metric information.

    The metrics are written to container['annotations']['metrics']. Nothing
    is written when _make_gcm_metrics() returns None.

    Args:
      project_id: the project ID
      container: the container object to annotate.
      parent_pod: the parent pod of 'container'.

    Raises:
      AssertionError: if the input arguments are invalid or if
        'parent_pod' is not the parent of 'container'
    """
    assert utilities.valid_string(project_id)
    assert utilities.is_wrapped_object(container, 'Container')
    assert utilities.is_wrapped_object(parent_pod, 'Pod')

    hostname_path = ['properties', 'Config', 'Hostname']
    truncated_name = utilities.get_attribute(container, hostname_path)
    assert utilities.valid_string(truncated_name)

    pod_name_path = ['properties', 'metadata', 'name']
    complete_name = utilities.get_attribute(parent_pod, pod_name_path)
    assert utilities.valid_string(complete_name)

    # The name stored in the container is truncated to the first 64
    # characters. Thus it must be the prefix of the full pod name.
    assert complete_name.startswith(truncated_name)

    container_labels = _get_container_labels(container, parent_pod)
    metrics_annotation = _make_gcm_metrics(project_id, container_labels)
    if metrics_annotation is None:
        return

    if container.get('annotations') is None:
        container['annotations'] = {}
    container['annotations']['metrics'] = metrics_annotation
Пример #11
0
def _get_container_labels(container, parent_pod):
    """Builds the key/value pairs identifying all metrics of this container.

    Args:
      container: the container object to annotate.
      parent_pod: the parent pod of 'container'.

    Returns:
      A dictionary with the keys "pod_id", "hostname" and "container_name".
      If any error was detected, returns None.
    """
    if not (utilities.is_wrapped_object(container, "Container")
            and utilities.is_wrapped_object(parent_pod, "Pod")):
        return None

    pod_uid = utilities.get_attribute(
        parent_pod, ["properties", "metadata", "uid"])
    if not utilities.valid_string(pod_uid):
        return None

    pod_host = utilities.get_attribute(
        parent_pod, ["properties", "spec", "host"])
    if not utilities.valid_string(pod_host):
        return None

    short_name = utilities.get_short_container_name(container, parent_pod)
    if utilities.valid_string(short_name):
        return {
            "pod_id": pod_uid,
            "hostname": pod_host,
            "container_name": short_name,
        }
    return None
Пример #12
0
    def test_cluster(self):
        """Test the '/cluster' endpoint."""
        t_start = utilities.now()
        # Exercise the collector. Read data from golden files and compute
        # a context graph.
        reply = self.app.get('/cluster')
        t_end = utilities.now()
        context_graph = json.loads(reply.data)
        self.verify_resources(context_graph, t_start, t_end)

        def relation_count(kind):
            return self.count_relations(context_graph, kind)

        self.assertEqual(23, relation_count('contains'))
        self.assertEqual(3, relation_count('createdFrom'))
        self.assertEqual(7, relation_count('loadBalances'))
        self.assertEqual(6, relation_count('monitors'))
        self.assertEqual(10, relation_count('runs'))

        # All relations must contain a timestamp in the range
        # [t_start, t_end].
        self.assertTrue(
            isinstance(context_graph.get('relations'), types.ListType))
        for rel in context_graph['relations']:
            self.assertTrue(isinstance(rel, types.DictType))
            rel_timestamp = rel.get('timestamp')
            self.assertTrue(utilities.valid_string(rel_timestamp))
            self.assertTrue(t_start <= rel_timestamp <= t_end)

        # The overall timestamp must be in the expected range.
        self.assertTrue(
            utilities.valid_string(context_graph.get('timestamp')))
        self.assertTrue(t_start <= context_graph['timestamp'] <= t_end)

        dumped = json.dumps(context_graph, sort_keys=True)
        self.assertEqual(2, dumped.count('"alternateLabel": '))
        self.assertEqual(85, dumped.count('"createdBy": '))
Пример #13
0
def get_pod_host(gs, pod_id):
    """Returns the host name associated with the given pod.

    Args:
      gs: global state.
      pod_id: the pod name.

    Returns:
      If the pod was found and has a valid host, returns that host name.
      Otherwise returns an empty string.

    Raises:
      CollectorError in case of failure to fetch data from Kubernetes.
      Other exceptions may be raised due to execution errors.
    """
    gs.logger_info('calling get_pod_host(pod_id=%s)', pod_id)
    host_path = ['properties', 'spec', 'host']
    for pod in get_pods(gs):
        current_id = pod.get('id')
        if not utilities.valid_string(current_id):
            # An invalid pod without a pod ID; skip it.
            continue

        host = utilities.get_attribute(pod, host_path)
        # The host may be missing if the pod is in "Waiting" state.
        if current_id == pod_id and utilities.valid_string(host):
            return host

    # Could not find pod.
    return ''
Пример #14
0
  def add_relation(self, source, target, kind, label=None, metadata=None):
    """Adds a relation to the context graph.

    The timestamp of the relation is inherited from the previous context
    graph when the same (source, target, kind) triple was recorded there.
    Otherwise the current time is used.

    Args:
      source: the ID of the source resource. Must be a valid string.
      target: the ID of the target resource. Must be a valid string.
      kind: the type of the relation. Must be a valid string.
      label: optional label of the relation. 'kind' is used when it is None.
      metadata: optional dictionary. A deep copy of it is stored in the
        relation's annotations, so later changes made by the caller do not
        affect the graph.

    Raises:
      AssertionError: if any of the arguments is invalid.
    """
    assert utilities.valid_string(source) and utilities.valid_string(target)
    assert utilities.valid_string(kind)
    assert utilities.valid_optional_string(label)
    assert (metadata is None) or isinstance(metadata, dict)

    with self._lock:
      # The timestamp of the relation should be inherited from the previous
      # context graph.
      key = (source, target, kind)
      timestamp = self._previous_relations_to_timestamps.get(key)
      if not utilities.valid_string(timestamp):
        timestamp = utilities.now()

      # Add the relation to the context graph data structure.
      relation = {
          'source': source,
          'target': target,
          'type': kind,
          'timestamp': timestamp
      }
      self._current_relations_to_timestamps[key] = timestamp

      # Add annotations as needed.
      relation['annotations'] = {}
      if metadata is not None:
        # The copy module provides deepcopy(); there is no deep_copy().
        relation['annotations']['metadata'] = copy.deepcopy(metadata)
      relation['annotations']['label'] = label if label is not None else kind

      self._context_relations.append(relation)
Пример #15
0
def _do_compute_pod(gs, cluster_guid, node_guid, pod, g):
  """Adds a pod and the relation to its parent to the context graph.

  A pod with a node name is attached to its node with a 'runs' relation.
  A pod without a node name is attached to the cluster with a 'contains'
  relation.

  Args:
    gs: the global state.
    cluster_guid: the cluster's ID.
    node_guid: the ID of the node whose pods are being processed.
    pod: a wrapped Pod object.
    g: the context graph under construction.

  Raises:
    CollectorError: if the pod's node name does not match 'node_guid'.
    AssertionError: if any of the arguments is invalid.
  """
  assert isinstance(gs, global_state.GlobalState)
  assert utilities.valid_string(cluster_guid)
  assert utilities.valid_string(node_guid)
  assert utilities.is_wrapped_object(pod, 'Pod')
  assert isinstance(g, ContextGraph)

  pod_guid = 'Pod:' + pod['id']
  g.add_resource(pod_guid, pod['annotations'], 'Pod', pod['timestamp'],
                 pod['properties'])

  # pod.properties.spec.nodeName may be missing if the pod is waiting
  # (not running yet).
  node_name = utilities.get_attribute(pod, ['properties', 'spec', 'nodeName'])
  if not utilities.valid_string(node_name):
    # Pod is not running: Cluster contains Pod.
    g.add_relation(cluster_guid, pod_guid, 'contains')
    return

  # Pod is running. It must run on the node being processed.
  if node_guid != ('Node:' + node_name):
    msg = ('Docker host (pod.properties.spec.nodeName)=%s '
           'not matching node ID=%s' % (node_name, node_guid))
    gs.logger_error(msg)
    raise collector_error.CollectorError(msg)

  g.add_relation(node_guid, pod_guid, 'runs')  # Node runs Pod
Пример #16
0
def _do_compute_node(gs, input_queue, cluster_guid, node, g):
  """Adds a node, its pods and its containers to the context graph.

  Pods are processed inline. Containers are processed inline in test mode
  and queued on 'input_queue' otherwise.

  Args:
    gs: the global state.
    input_queue: the priority queue receiving the container work items.
    cluster_guid: the cluster's ID.
    node: a wrapped Node object.
    g: the context graph under construction.
  """
  assert isinstance(gs, global_state.GlobalState)
  assert isinstance(input_queue, Queue.PriorityQueue)
  assert utilities.valid_string(cluster_guid)
  assert utilities.is_wrapped_object(node, 'Node')
  assert isinstance(g, ContextGraph)

  node_id = node['id']
  node_guid = 'Node:' + node_id
  g.add_resource(node_guid, node['annotations'], 'Node', node['timestamp'],
                 node['properties'])
  g.add_relation(cluster_guid, node_guid, 'contains')  # Cluster contains Node

  # Pods in a Node. They are processed sequentially because calls to
  # _do_compute_pod() do not call lower-level services or wait.
  known_pod_ids = set()
  hosts = set()
  for pod in kubernetes.get_pods(gs, node_id):
    _do_compute_pod(gs, cluster_guid, node_guid, pod, g)
    known_pod_ids.add(pod['id'])
    # pod.properties.spec.nodeName may be missing if the pod is waiting.
    host = utilities.get_attribute(pod, ['properties', 'spec', 'nodeName'])
    if utilities.valid_string(host):
      hosts.add(host)

  # All pods run in the same Node, so 'hosts' should hold exactly one
  # Docker host. If it does not, the situation cannot be fixed here, so an
  # error message is logged and processing continues.
  if len(hosts) != 1:
    gs.logger_error(
        'corrupt pod data in node=%s: '
        '"docker_hosts" is empty or contains more than one entry: %s',
        node_guid, str(hosts))

  # Process containers concurrently.
  for host in hosts:
    for container in docker.get_containers_with_metrics(gs, host):
      owner_pod_id = utilities.get_parent_pod_id(container)
      in_known_pod = (utilities.valid_string(owner_pod_id) and
                      (owner_pod_id in known_pod_ids))
      # A container outside of any known pod hangs directly off the node.
      parent_guid = ('Pod:' + owner_pod_id) if in_known_pod else node_guid

      # Do not compute the containers by worker threads in test mode
      # because the order of the output will be different than the golden
      # files due to the effects of queuing the work.
      if gs.get_testing():
        _do_compute_container(gs, host, parent_guid, container, g)
        continue

      priority = gs.get_random_priority()
      work_args = {
          'gs': gs,
          'docker_host': host,
          'parent_guid': parent_guid,
          'container': container,
          'g': g,
      }
      input_queue.put((priority, _do_compute_container, work_args))
Пример #17
0
def _do_compute_other_nodes(gs, cluster_guid, nodes_list, oldest_timestamp, g):
  """Adds nodes not in the node list but running pods to the graph.

  This handles the case when there are pods running on the master node,
  in which case we add a dummy node representing the master to the graph.
  The nodes list does not include the master.

  Args:
    gs: the global state.
    cluster_guid: the cluster's ID.
    nodes_list: a list of wrapped Node objects.
    oldest_timestamp: the timestamp of the oldest Node object.
    g: the context graph under construction.
  """
  assert isinstance(gs, global_state.GlobalState)
  assert utilities.valid_string(cluster_guid)
  assert isinstance(nodes_list, list)
  assert utilities.valid_string(oldest_timestamp)
  assert isinstance(g, ContextGraph)

  # Collect the IDs of the known nodes.
  listed_ids = set()
  for listed_node in nodes_list:
    assert utilities.is_wrapped_object(listed_node, 'Node')
    listed_ids.add(listed_node['id'])

  # Collect the nodes that are referenced by pods but are not listed.
  # This set may be empty.
  unlisted_ids = set()
  for pod in kubernetes.get_pods(gs):
    assert utilities.is_wrapped_object(pod, 'Pod')
    # pod.properties.spec.nodeName may be missing if the pod is waiting.
    owner_id = utilities.get_attribute(
        pod, ['properties', 'spec', 'nodeName'])
    if utilities.valid_string(owner_id) and owner_id not in listed_ids:
      # This pod does not belong to any of the known nodes.
      unlisted_ids.add(owner_id)

  # Add a placeholder node for each unlisted node.
  for node_id in unlisted_ids:
    # The dummy node object only serves as a placeholder for metric
    # annotations.
    placeholder = utilities.wrap_object({}, 'Node', node_id, time.time())
    metrics.annotate_node(placeholder)

    node_guid = 'Node:' + node_id
    g.add_resource(node_guid, placeholder['annotations'], 'Node',
                   oldest_timestamp, {})
    g.add_relation(cluster_guid, node_guid, 'contains')  # Cluster contains Node
Пример #18
0
def _do_compute_container(parent_guid, container, g):
  """Adds a container and its image to the context graph.

  Args:
    parent_guid: the ID of the resource containing the container.
    container: a wrapped Container object.
    g: the context graph under construction.
  """
  assert utilities.valid_string(parent_guid)
  assert utilities.is_wrapped_object(container, 'Container')
  assert isinstance(g, ContextGraph)

  # TODO(vasbala): container_id is too verbose?
  container_guid = 'Container:' + container['id']
  g.add_resource(container_guid, container['annotations'], 'Container',
                 container['timestamp'], container['properties'])

  # The parent contains the Container.
  g.add_relation(parent_guid, container_guid, 'contains')

  # The image should end up in the graph only once.
  # NOTE(review): this relies on g.add_resource() ignoring an ID that was
  # already added -- confirm against ContextGraph.
  #
  # Different containers might reference the same image using different
  # names. Unfortunately, only the first name encountered is recorded.
  # TODO(rimey): Record the other names as well, and choose the primary
  # name deterministically.
  container_image = kubernetes.get_image_from_container(container)
  image_guid = 'Image:' + container_image['id']
  g.add_resource(image_guid, container_image['annotations'], 'Image',
                 container_image['timestamp'], container_image['properties'])

  # Container createdFrom Image
  g.add_relation(container_guid, image_guid, 'createdFrom')
Пример #19
0
def _do_compute_rcontroller(gs, cluster_guid, rcontroller, g):
    """Adds a replication controller and its relations to the context graph.

    The controller is attached to the cluster with a 'contains' relation and
    to every pod matched by its 'spec.selector' with a 'monitors' relation.
    A missing or empty selector is logged as an error and otherwise ignored.

    Args:
      gs: the global state.
      cluster_guid: the cluster's ID.
      rcontroller: a wrapped ReplicationController object.
      g: the context graph under construction.

    Raises:
      CollectorError: if the selector is present but is not a dictionary.
    """
    assert isinstance(gs, global_state.GlobalState)
    assert utilities.valid_string(cluster_guid)
    assert utilities.is_wrapped_object(rcontroller, 'ReplicationController')
    assert isinstance(g, ContextGraph)

    rc_id = rcontroller['id']
    rc_guid = 'ReplicationController:' + rc_id
    g.add_resource(rc_guid, rcontroller['annotations'],
                   'ReplicationController', rcontroller['timestamp'],
                   rcontroller['properties'])

    # Cluster contains Rcontroller
    g.add_relation(cluster_guid, rc_guid, 'contains')

    # The pods monitored by this replication controller are the ones that
    # match the key/value pairs of its 'spec.selector' attribute.
    pod_selector = utilities.get_attribute(
        rcontroller, ['properties', 'spec', 'selector'])
    if not pod_selector:
        gs.logger_error('Rcontroller id=%s has no "spec.selector" attribute',
                        rc_id)
        return

    if not isinstance(pod_selector, types.DictType):
        msg = ('Rcontroller id=%s has an invalid "replicaSelector" value' %
               rc_id)
        gs.logger_error(msg)
        raise collector_error.CollectorError(msg)

    for pod in kubernetes.get_selected_pods(gs, pod_selector):
        # Rcontroller monitors Pod
        g.add_relation(rc_guid, 'Pod:' + pod['id'], 'monitors')
Пример #20
0
def _do_compute_node(gs, input_queue, cluster_guid, node, g):
    """Adds a node to the context graph and processes its pods.

    Pods are processed inline in test mode and queued on 'input_queue'
    otherwise.

    Args:
      gs: the global state.
      input_queue: the priority queue receiving the pod work items.
      cluster_guid: the cluster's ID.
      node: a wrapped Node object.
      g: the context graph under construction.
    """
    assert isinstance(gs, global_state.GlobalState)
    assert isinstance(input_queue, Queue.PriorityQueue)
    assert utilities.valid_string(cluster_guid)
    assert utilities.is_wrapped_object(node, 'Node')
    assert isinstance(g, ContextGraph)

    node_id = node['id']
    node_guid = 'Node:' + node_id
    g.add_resource(node_guid, node['annotations'], 'Node', node['timestamp'],
                   node['properties'])
    # Cluster contains Node
    g.add_relation(cluster_guid, node_guid, 'contains')

    # Pods in a Node
    for pod in kubernetes.get_pods(gs, node_id):
        # Worker threads are not used in test mode because the order of the
        # output would differ from the golden files due to the effects of
        # queuing the work.
        if gs.get_testing():
            _do_compute_pod(gs, input_queue, node_guid, pod, g)
            continue

        priority = gs.get_random_priority()
        work_args = {
            'gs': gs,
            'input_queue': input_queue,
            'node_guid': node_guid,
            'pod': pod,
            'g': g,
        }
        input_queue.put((priority, _do_compute_pod, work_args))
Пример #21
0
def _container_in_pod(gs, container, pod):
    """Tells whether 'container' is a part of 'pod'.

    Args:
      gs: global state.
      container: a wrapped container object.
      pod: a wrapped pod object.

    Raises:
      CollectorError: if the parent pod ID of 'container' cannot be
        determined.

    Returns:
      True iff container 'container' is a part of 'pod'.
    """
    assert isinstance(gs, global_state.GlobalState)
    assert utilities.is_wrapped_object(container, 'Container')
    assert utilities.is_wrapped_object(pod, 'Pod')

    owner_pod_id = utilities.get_parent_pod_id(container)
    if utilities.valid_string(owner_pod_id):
        return owner_pod_id == pod['id']

    # The container does not name its parent pod.
    msg = 'could not find parent pod ID in container %s' % container['id']
    gs.logger_error(msg)
    raise collector_error.CollectorError(msg)
Пример #22
0
def _do_compute_node(gs, input_queue, cluster_guid, node, g):
  """Adds a node to the context graph and handles each of its pods.

  In test mode the pods are computed immediately. Otherwise a work item per
  pod is put on 'input_queue'.

  Args:
    gs: the global state.
    input_queue: the priority queue receiving the pod work items.
    cluster_guid: the cluster's ID.
    node: a wrapped Node object.
    g: the context graph under construction.
  """
  assert isinstance(gs, global_state.GlobalState)
  assert isinstance(input_queue, Queue.PriorityQueue)
  assert utilities.valid_string(cluster_guid)
  assert utilities.is_wrapped_object(node, 'Node')
  assert isinstance(g, ContextGraph)

  node_id = node['id']
  node_guid = 'Node:' + node_id
  g.add_resource(
      node_guid, node['annotations'], 'Node', node['timestamp'],
      node['properties'])
  g.add_relation(cluster_guid, node_guid, 'contains')  # Cluster contains Node

  # Pods in a Node
  for pod in kubernetes.get_pods(gs, node_id):
    if not gs.get_testing():
      # Let the worker threads compute the pod.
      input_queue.put((
          gs.get_random_priority(),
          _do_compute_pod,
          {
              'gs': gs,
              'input_queue': input_queue,
              'node_guid': node_guid,
              'pod': pod,
              'g': g,
          }))
    else:
      # Do not compute the pods by worker threads in test mode because the
      # order of the output will be different than the golden files due to
      # the effects of queuing the work.
      _do_compute_pod(gs, input_queue, node_guid, pod, g)
Пример #23
0
def _do_compute_service(gs, cluster_guid, service, g):
  """Adds a service and its relations to the context graph.

  The service is attached to the cluster with a 'contains' relation and to
  every pod matched by its 'spec.selector' with a 'loadBalances' relation.
  A missing or empty selector is logged as an error and otherwise ignored.

  Args:
    gs: the global state.
    cluster_guid: the cluster's ID.
    service: a wrapped Service object.
    g: the context graph under construction.

  Raises:
    CollectorError: if the selector is present but is not a dictionary.
  """
  assert isinstance(gs, global_state.GlobalState)
  assert utilities.valid_string(cluster_guid)
  assert utilities.is_wrapped_object(service, 'Service')
  assert isinstance(g, ContextGraph)

  service_id = service['id']
  service_guid = 'Service:' + service_id
  g.add_resource(
      service_guid, service['annotations'], 'Service', service['timestamp'],
      service['properties'])

  # Cluster contains Service.
  g.add_relation(cluster_guid, service_guid, 'contains')

  # The pods load balanced by this service are the ones that match the
  # key/value pairs of its 'spec.selector' attribute.
  pod_selector = utilities.get_attribute(
      service, ['properties', 'spec', 'selector'])
  if not pod_selector:
    gs.logger_error('Service id=%s has no "selector" attribute', service_id)
    return

  if not isinstance(pod_selector, types.DictType):
    msg = 'Service id=%s has an invalid "selector" value' % service_id
    gs.logger_error(msg)
    raise collector_error.CollectorError(msg)

  for pod in kubernetes.get_selected_pods(gs, pod_selector):
    # Service loadBalances Pod
    g.add_relation(service_guid, 'Pod:' + pod['id'], 'loadBalances')
Пример #24
0
def _container_in_pod(gs, container, pod):
  """Checks whether 'container' belongs to 'pod'.

  Args:
    gs: global state.
    container: a wrapped container object.
    pod: a wrapped pod object.

  Raises:
    CollectorError: if the parent pod ID of 'container' cannot be
      determined.

  Returns:
    True iff container 'container' is a part of 'pod'.
  """
  assert isinstance(gs, global_state.GlobalState)
  assert utilities.is_wrapped_object(container, 'Container')
  assert utilities.is_wrapped_object(pod, 'Pod')

  container_pod_id = utilities.get_parent_pod_id(container)
  has_parent = utilities.valid_string(container_pod_id)
  if not has_parent:
    msg = 'could not find parent pod ID in container %s' % container['id']
    gs.logger_error(msg)
    raise collector_error.CollectorError(msg)

  belongs_to_pod = (container_pod_id == pod['id'])
  return belongs_to_pod
Пример #25
0
def _do_compute_rcontroller(gs, cluster_guid, rcontroller, g):
  """Adds a replication controller to the context graph.

  Besides the resource itself, this adds a 'contains' relation from the
  cluster and a 'monitors' relation to every pod selected by the
  controller's 'spec.selector'. A missing or empty selector is only logged.

  Args:
    gs: the global state.
    cluster_guid: the cluster's ID.
    rcontroller: a wrapped ReplicationController object.
    g: the context graph under construction.

  Raises:
    CollectorError: if the selector is present but is not a dictionary.
  """
  assert isinstance(gs, global_state.GlobalState)
  assert utilities.valid_string(cluster_guid)
  assert utilities.is_wrapped_object(rcontroller, 'ReplicationController')
  assert isinstance(g, ContextGraph)

  controller_id = rcontroller['id']
  controller_guid = 'ReplicationController:' + controller_id
  g.add_resource(
      controller_guid, rcontroller['annotations'], 'ReplicationController',
      rcontroller['timestamp'], rcontroller['properties'])

  # Cluster contains Rcontroller
  g.add_relation(cluster_guid, controller_guid, 'contains')

  # Pods that are monitored by this replication controller are found with
  # the key/value pairs of rcontroller['spec']['selector'].
  selector = utilities.get_attribute(
      rcontroller, ['properties', 'spec', 'selector'])
  if not selector:
    gs.logger_error('Rcontroller id=%s has no "spec.selector" attribute',
                    controller_id)
    return

  if not isinstance(selector, types.DictType):
    msg = ('Rcontroller id=%s has an invalid "replicaSelector" value' %
           controller_id)
    gs.logger_error(msg)
    raise collector_error.CollectorError(msg)

  monitored_pods = kubernetes.get_selected_pods(gs, selector)
  for monitored_pod in monitored_pods:
    # Rcontroller monitors Pod
    monitored_guid = 'Pod:' + monitored_pod['id']
    g.add_relation(controller_guid, monitored_guid, 'monitors')
Пример #26
0
def fetch(req):
  """Fetch the output of the specified request from the Docker's socket.

  In testing mode (app.proxy_is_testing_mode) nothing is sent to Docker.
  The response is read from a file under 'testdata/localhost' whose name is
  'req' with every character outside [a-zA-Z0-9_.-] replaced by '.'.

  Args:
    req: the request to be sent to the Docker daemon. Must start with '/'.

  Returns:
  The contents of the JSON response.

  Raises:
    IOError: the Unix domain socket returns a code other than OK (200).
    ValueError: the test data file does not contain valid JSON.
  """
  assert utilities.valid_string(req)
  assert req[0] == '/'
  if app.proxy_is_testing_mode:
    fname = 'testdata/localhost' + re.sub(r'[^a-zA-Z0-9_.-]', '.', req)
    app.logger.info('reading req %s from %s', req, fname)
    # The 'with' block closes the file even when parsing the JSON fails.
    with open(fname, 'r') as f:
      return json.loads(f.read())

  r = session.get(
      '{docker_host}{url}'.format(docker_host=LOCAL_DOCKER_HOST, url=req))

  if r.status_code != requests.codes.ok:
    msg = 'Accessing %s API returns an error code %d' % (req, r.status_code)
    app.logger.error(msg)
    raise IOError(msg)

  return r.json()
Пример #27
0
def get_pods(gs, node_id=None):
  """Gets the list of all pods in the given node or in the cluster.

  When 'node_id' is None, it returns the list of pods in the cluster.
  When 'node_id' is a non-empty string, it returns the list of pods in that
  node.

  The result is looked up in gs.get_pods_cache() first (keyed by 'node_id',
  or by '' for the whole cluster). On a miss the pods are fetched from
  Kubernetes and the cache is updated.

  Args:
    gs: global state.
    node_id: the parent node of the pods or None.

  Returns:
    list of wrapped pod objects.
    Each element in the list is the result of
    utilities.wrap_object(pod, 'Pod', ...)

  Raises:
    CollectorError: in case of failure to fetch data from Kubernetes.
    Other exceptions may be raised due to execution errors.
  """
  pods_label = '' if node_id is None else node_id
  pods, timestamp_secs = gs.get_pods_cache().lookup(pods_label)
  if timestamp_secs is not None:
    gs.logger_info('get_pods(pods_label=%s) cache hit returns %d pods',
                   pods_label, len(pods))
    return pods

  pods = []
  url = '{kubernetes}/pods'.format(kubernetes=KUBERNETES_API)
  try:
    result = fetch_data(gs, url)
  except Exception:
    # Catch Exception rather than everything, so that KeyboardInterrupt and
    # SystemExit are not converted into a CollectorError.
    msg = 'fetching %s failed with exception %s' % (url, sys.exc_info()[0])
    gs.logger_exception(msg)
    raise collector_error.CollectorError(msg)

  now = time.time()
  if not (isinstance(result, types.DictType) and 'items' in result):
    msg = 'invalid result when fetching %s' % url
    # NOTE(review): no exception is being handled at this point; if
    # logger_exception() also logs a traceback, logger_error() may be the
    # better call here -- confirm.
    gs.logger_exception(msg)
    raise collector_error.CollectorError(msg)

  for pod in result['items']:
    name = utilities.get_attribute(pod, ['metadata', 'name'])
    if not utilities.valid_string(name):
      # an invalid pod without a valid pod ID value.
      continue
    wrapped_pod = utilities.wrap_object(pod, 'Pod', name, now)
    if node_id:
      # pod['spec']['host'] may be missing if the pod is in "Waiting"
      # status.
      if utilities.get_attribute(pod, ['spec', 'host']) == node_id:
        pods.append(wrapped_pod)
    else:
      # append pod to output if 'node_id' is not specified.
      pods.append(wrapped_pod)

  ret_value = gs.get_pods_cache().update(pods_label, pods, now)
  gs.logger_info('get_pods(node_id=%s) returns %d pods', pods_label, len(pods))
  return ret_value
Пример #28
0
  def add_elapsed(self, start_time, url_or_fname, elapsed_seconds):
    """Records the elapsed time of one access operation.

    An ElapsedRecord describing the operation is appended to the elapsed
    time queue. Before appending, records are removed from the queue while
    it holds constants.MAX_ELAPSED_QUEUE_SIZE elements or more.

    Args:
      start_time: the timestamp at the start of the operation (a float).
      url_or_fname: the URL or file name of the operation.
      elapsed_seconds: the elapsed time of the operation (a float).
    """
    assert isinstance(start_time, float)
    assert utilities.valid_string(url_or_fname)
    assert isinstance(elapsed_seconds, float)

    # Make room for the new record.
    while True:
      if self._elapsed_queue.qsize() < constants.MAX_ELAPSED_QUEUE_SIZE:
        break
      try:
        self._elapsed_queue.get(block=False)
      except Queue.Empty:
        # The queue may have been emptied in the meantime (for example,
        # due to concurrent access); there is nothing left to remove.
        break

    record = ElapsedRecord(
        start_time=start_time,
        what=url_or_fname,
        thread_identifier=thread.get_ident(),
        elapsed_seconds=elapsed_seconds)
    self._elapsed_queue.put(record)
Пример #29
0
def _do_compute_service(gs, cluster_guid, service, g):
    """Adds a service and its relations to the context graph.

    The service is added as a 'Service' resource, the cluster is recorded as
    containing it, and a 'loadBalances' relation is added for every pod
    returned by kubernetes.get_selected_pods() for the service's
    'properties.spec.selector' value.

    Args:
      gs: global state.
      cluster_guid: the graph ID of the cluster.
      service: a wrapped 'Service' object.
      g: the context graph to update.

    Raises:
      CollectorError: if the selector is present but is not a dictionary.
    """
    assert isinstance(gs, global_state.GlobalState)
    assert utilities.valid_string(cluster_guid)
    assert utilities.is_wrapped_object(service, 'Service')
    assert isinstance(g, ContextGraph)

    svc_id = service['id']
    svc_guid = 'Service:' + svc_id
    g.add_resource(
        svc_guid, service['annotations'], 'Service', service['timestamp'],
        service['properties'])

    # Cluster contains Service.
    g.add_relation(cluster_guid, svc_guid, 'contains')

    # The selector's key/value pairs identify the pods that are load
    # balanced by this service.
    pod_selector = utilities.get_attribute(
        service, ['properties', 'spec', 'selector'])
    if not pod_selector:
        # A missing selector is only logged; the service stays in the graph.
        gs.logger_error('Service id=%s has no "selector" attribute', svc_id)
        return

    if not isinstance(pod_selector, types.DictType):
        msg = 'Service id=%s has an invalid "selector" value' % svc_id
        gs.logger_error(msg)
        raise collector_error.CollectorError(msg)

    for pod in kubernetes.get_selected_pods(gs, pod_selector):
        # Service loadBalances Pod
        g.add_relation(svc_guid, 'Pod:' + pod['id'], 'loadBalances')
Пример #30
0
def fill_cache(cache):
  """Fill the 'cache' with information about all containers in this host.

  This routine should be called on startup and periodically every
  MAX_CONTAINER_AGE_SECONDS seconds.

  fill_cache() cannot call get_response() because get_response() must be
  called only from a running Flask application.
  fill_cache() is called from the main program before starting the Flask
  application.

  The routine is best effort: every failure is logged and never raised.
  A failure to list the containers, or a malformed entry in that list, stops
  the routine. A failure to fetch one container is logged and the remaining
  containers are still processed.

  NOTE(review): earlier documentation of this routine stated that it cannot
  call app.logger.xxx() because it may run before the application is
  initialized, yet every failure path below logs through app.logger.
  Confirm which of the two is correct.

  Args:
    cache: the containers cache.
  """
  assert cache is not None
  try:
    containers_list = fetch('/containers/json')

  except ValueError:
    app.logger.error('invalid response format from "/containers/json"')
    return

  except Exception:
    # Catch Exception rather than everything, so that KeyboardInterrupt and
    # SystemExit are not silently swallowed here.
    exc_type, value, _ = sys.exc_info()
    msg = ('Failed to fetch /containers/json with exception %s: %s' %
           (exc_type, value))
    app.logger.error(msg)
    return

  if not isinstance(containers_list, types.ListType):
    app.logger.error('invalid response format from "/containers/json"')
    return

  for container_info in containers_list:
    # Each entry must be a dictionary whose "Names" attribute is a non-empty
    # list starting with a string of the form "/<name>".
    if not (isinstance(container_info, types.DictType) and
            isinstance(container_info.get('Names'), types.ListType) and
            container_info['Names'] and
            utilities.valid_string(container_info['Names'][0]) and
            container_info['Names'][0][0] == '/'):
      app.logger.error('invalid containers data format')
      return

    # skip the leading / in the first name of the container information.
    container_id = container_info['Names'][0][1:]
    req = '/containers/{cid}/json'.format(cid=container_id)
    try:
      result = fetch(req)
      cleanup(result)
      cache.update(req, json.dumps(result))
      app.logger.info('caching result of request=%s', req)

    except Exception:
      # A failure for one container must not prevent caching the others.
      exc_type, value, _ = sys.exc_info()
      msg = ('Failed to fetch %s with exception %s: %s' %
             (req, exc_type, value))
      app.logger.error(msg)
Пример #31
0
def _do_compute_container(gs, docker_host, pod_guid, container, g):
  """Adds a container, its processes and its image to the context graph.

  Args:
    gs: global state.
    docker_host: the Docker host that is queried for the processes and the
      image of the container.
    pod_guid: the graph ID of the pod that contains the container.
    container: a wrapped 'Container' object.
    g: the context graph to update.
  """
  assert isinstance(gs, global_state.GlobalState)
  assert utilities.valid_string(docker_host)
  assert utilities.valid_string(pod_guid)
  assert utilities.is_wrapped_object(container, 'Container')
  assert isinstance(g, ContextGraph)

  cid = container['id']
  c_guid = 'Container:' + cid
  # TODO(vasbala): container_id is too verbose?
  g.add_resource(
      c_guid, container['annotations'], 'Container',
      container['timestamp'], container['properties'])

  # Pod contains Container
  g.add_relation(pod_guid, c_guid, 'contains')

  # Processes in a Container
  for process in docker.get_processes(gs, docker_host, cid):
    p_guid = 'Process:' + process['id']
    g.add_resource(
        p_guid, process['annotations'], 'Process',
        process['timestamp'], process['properties'])

    # Container contains Process
    g.add_relation(c_guid, p_guid, 'contains')

  # Image from which this Container was created. There is nothing more to
  # add when the image ID is missing or the image itself is not found.
  image_id = utilities.get_attribute(
      container, ['properties', 'Config', 'Image'])
  image = None
  if utilities.valid_string(image_id):
    image = docker.get_image(gs, docker_host, image_id)
  if image is None:
    return

  image_guid = 'Image:' + image['id']
  # The same image may be reached from several containers; this presumably
  # relies on add_resource() to ignore repeated additions -- TODO confirm.
  g.add_resource(
      image_guid, image['annotations'], 'Image',
      image['timestamp'], image['properties'])

  # Container createdFrom Image
  g.add_relation(c_guid, image_guid, 'createdFrom')
Пример #32
0
def _do_compute_node(cluster_guid, node, g):
  """Adds a node to the context graph as a member of the cluster.

  Args:
    cluster_guid: the graph ID of the cluster.
    node: a wrapped 'Node' object.
    g: the context graph to update.
  """
  assert utilities.valid_string(cluster_guid)
  assert utilities.is_wrapped_object(node, 'Node')
  assert isinstance(g, ContextGraph)

  node_guid = 'Node:' + node['id']
  g.add_resource(
      node_guid, node['annotations'], 'Node', node['timestamp'],
      node['properties'])

  # Cluster contains Node
  g.add_relation(cluster_guid, node_guid, 'contains')
Пример #33
0
def _do_compute_pod(gs, input_queue, node_guid, pod, g):
  """Adds a pod to the context graph as running on the given node.

  Args:
    gs: global state.
    input_queue: the priority queue of work items (only type-checked here).
    node_guid: the graph ID of the node that runs the pod.
    pod: a wrapped 'Pod' object.
    g: the context graph to update.

  Raises:
    CollectorError: if the pod has no valid 'properties.spec.host' value.
  """
  assert isinstance(gs, global_state.GlobalState)
  assert isinstance(input_queue, Queue.PriorityQueue)
  assert utilities.valid_string(node_guid)
  assert utilities.is_wrapped_object(pod, 'Pod')
  assert isinstance(g, ContextGraph)

  pod_id = pod['id']
  pod_guid = 'Pod:' + pod_id

  # A pod without a Docker host is rejected before anything is added to
  # the graph.
  host_path = ['properties', 'spec', 'host']
  docker_host = utilities.get_attribute(pod, host_path)
  if not utilities.valid_string(docker_host):
    msg_format = ('Docker host (pod.properties.spec.host) '
                  'not found in pod ID %s')
    msg = msg_format % pod_id
    gs.logger_error(msg)
    raise collector_error.CollectorError(msg)

  g.add_resource(
      pod_guid, pod['annotations'], 'Pod', pod['timestamp'],
      pod['properties'])

  # Node runs Pod
  g.add_relation(node_guid, pod_guid, 'runs')
Пример #34
0
  def verify_resources(self, result, start_time, end_time):
    """Verifies the resources of a context graph against the expected data.

    Checks the number of resources of every type, and that every resource
    is a valid wrapped object whose timestamp lies between 'start_time' and
    'end_time' (inclusive).

    Args:
      result: the decoded context graph (a dictionary).
      start_time: lower bound of the resource timestamps; must satisfy
        utilities.valid_string().
      end_time: upper bound of the resource timestamps; must satisfy
        utilities.valid_string().
    """
    assert isinstance(result, dict)
    assert utilities.valid_string(start_time)
    assert utilities.valid_string(end_time)

    # TODO(eran): the pods count does not include the pods running in the
    # master. Fix the count once we include pods that run in the master node.
    expected_counts = (
        ('Cluster', 1),
        ('Node', 5),
        ('Service', 6),
        ('Pod', 14),
        ('Container', 16),
        ('Image', 10),
        ('ReplicationController', 3),
    )
    for resource_type, expected in expected_counts:
      self.assertEqual(expected, self.count_resources(result, resource_type))

    # Verify that all resources are valid wrapped objects.
    resources = result.get('resources')
    assert isinstance(resources, list)
    for resource in resources:
      # all resources must be valid.
      assert utilities.is_wrapped_object(resource)
      assert start_time <= resource['timestamp'] <= end_time
Пример #35
0
def fetch(req):
  """Sends the given request to the Docker daemon and returns the response.

  The request is appended to LOCAL_DOCKER_HOST to form the URL, which is
  fetched with session.get().

  Args:
    req: the request to be sent to the Docker daemon.

  Returns:
  A Unix domain socket response object.
  """
  assert utilities.valid_string(req)
  url = '{0}{1}'.format(LOCAL_DOCKER_HOST, req)
  return session.get(url)
Пример #36
0
def get_rcontrollers(gs):
    """Gets the list of replication controllers in the current cluster.

    The result is looked up in gs.get_rcontrollers_cache() first. On a miss
    the replication controllers are fetched from Kubernetes and the cache is
    updated.

    Args:
      gs: global state.

    Returns:
      list of wrapped replication controller objects.
      Each element in the list is the result of
      utilities.wrap_object(rcontroller, 'ReplicationController', ...)

    Raises:
      CollectorError: in case of failure to fetch data from Kubernetes.
      Other exceptions may be raised due to execution errors.
    """
    rcontrollers, ts = gs.get_rcontrollers_cache().lookup('')
    if ts is not None:
        app.logger.debug(
            'get_rcontrollers() cache hit returns %d rcontrollers',
            len(rcontrollers))
        return rcontrollers

    rcontrollers = []
    url = get_kubernetes_base_url() + '/replicationcontrollers'

    try:
        result = fetch_data(gs, url)
    except Exception:
        msg = 'fetching %s failed with exception %s' % (url, sys.exc_info()[0])
        app.logger.exception(msg)
        raise collector_error.CollectorError(msg)

    now = time.time()
    if not (isinstance(result, dict) and 'items' in result):
        msg = 'invalid result when fetching %s' % url
        # No exception is being handled here, so log with error();
        # exception() would append a meaningless traceback.
        app.logger.error(msg)
        raise collector_error.CollectorError(msg)

    for rcontroller in result['items']:
        name = utilities.get_attribute(rcontroller, ['metadata', 'name'])
        if not utilities.valid_string(name):
            # an invalid replication controller without a valid rcontroller ID.
            continue

        rcontrollers.append(
            utilities.wrap_object(rcontroller, 'ReplicationController', name,
                                  now))

    ret_value = gs.get_rcontrollers_cache().update('', rcontrollers, now)
    app.logger.info('get_rcontrollers() returns %d rcontrollers',
                    len(rcontrollers))
    return ret_value
Пример #37
0
def get_rcontrollers(gs):
  """Gets the list of replication controllers in the current cluster.

  The result is looked up in gs.get_rcontrollers_cache() first. On a miss
  the replication controllers are fetched from Kubernetes and the cache is
  updated.

  Args:
    gs: global state.

  Returns:
    list of wrapped replication controller objects.
    Each element in the list is the result of
    utilities.wrap_object(rcontroller, 'ReplicationController', ...)

  Raises:
    CollectorError: in case of failure to fetch data from Kubernetes.
    Other exceptions may be raised due to execution errors.
  """
  rcontrollers, ts = gs.get_rcontrollers_cache().lookup('')
  if ts is not None:
    gs.logger_info(
        'get_rcontrollers() cache hit returns %d rcontrollers',
        len(rcontrollers))
    return rcontrollers

  rcontrollers = []
  url = get_kubernetes_base_url() + '/replicationcontrollers'

  try:
    result = fetch_data(gs, url)
  except Exception:
    # Catch Exception rather than everything, so that KeyboardInterrupt and
    # SystemExit are not converted into a CollectorError.
    msg = 'fetching %s failed with exception %s' % (url, sys.exc_info()[0])
    gs.logger_exception(msg)
    raise collector_error.CollectorError(msg)

  now = time.time()
  if not (isinstance(result, types.DictType) and 'items' in result):
    msg = 'invalid result when fetching %s' % url
    # NOTE(review): no exception is being handled at this point; if
    # logger_exception() also logs a traceback, logger_error() may be the
    # better call here -- confirm.
    gs.logger_exception(msg)
    raise collector_error.CollectorError(msg)

  for rcontroller in result['items']:
    name = utilities.get_attribute(rcontroller, ['metadata', 'name'])
    if not utilities.valid_string(name):
      # an invalid replication controller without a valid rcontroller ID.
      continue

    rcontrollers.append(utilities.wrap_object(
        rcontroller, 'ReplicationController', name, now))

  ret_value = gs.get_rcontrollers_cache().update('', rcontrollers, now)
  gs.logger_info(
      'get_rcontrollers() returns %d rcontrollers', len(rcontrollers))
  return ret_value
Пример #38
0
def get_nodes(gs):
    """Gets the list of all nodes in the current cluster.

    The result is looked up in gs.get_nodes_cache() first. On a miss the
    nodes are fetched from Kubernetes and the cache is updated.

    Args:
      gs: global state.

    Returns:
      list of wrapped node objects.
      Each element in the list is the result of
      utilities.wrap_object(node, 'Node', ...)

    Raises:
      CollectorError: in case of failure to fetch data from Kubernetes.
      Other exceptions may be raised due to execution errors.
    """
    nodes, timestamp_secs = gs.get_nodes_cache().lookup('')
    if timestamp_secs is not None:
        gs.logger_info('get_nodes() cache hit returns %d nodes', len(nodes))
        return nodes

    nodes = []
    url = '{kubernetes}/nodes'.format(kubernetes=KUBERNETES_API)
    try:
        result = fetch_data(gs, url)
    except Exception:
        # Catch Exception rather than everything, so that KeyboardInterrupt
        # and SystemExit are not converted into a CollectorError.
        msg = 'fetching %s failed with exception %s' % (url, sys.exc_info()[0])
        gs.logger_exception(msg)
        raise collector_error.CollectorError(msg)

    now = time.time()
    if not (isinstance(result, types.DictType) and 'items' in result):
        msg = 'invalid result when fetching %s' % url
        # NOTE(review): no exception is being handled at this point; if
        # logger_exception() also logs a traceback, logger_error() may be
        # the better call here -- confirm.
        gs.logger_exception(msg)
        raise collector_error.CollectorError(msg)

    for node in result['items']:
        name = utilities.get_attribute(node, ['metadata', 'name'])
        if not utilities.valid_string(name):
            # an invalid node without a valid node ID value.
            continue
        # The label of a node is the host name derived from its ID.
        wrapped_node = utilities.wrap_object(
            node,
            'Node',
            name,
            now,
            label=utilities.node_id_to_host_name(name))
        nodes.append(wrapped_node)

    ret_value = gs.get_nodes_cache().update('', nodes, now)
    gs.logger_info('get_nodes() returns %d nodes', len(nodes))
    return ret_value
Пример #39
0
def _do_compute_pod(gs, input_queue, node_guid, pod, g):
    """Adds a pod to the context graph and schedules its containers.

    The pod is added as a 'Pod' resource that the node runs. Every container
    of the pod's Docker host that belongs to the pod is then computed by
    _do_compute_container(), either immediately (testing mode) or through a
    work item placed in 'input_queue'.

    Args:
      gs: global state.
      input_queue: the priority queue that receives the work items.
      node_guid: the graph ID of the node that runs the pod.
      pod: a wrapped 'Pod' object.
      g: the context graph to update.

    Raises:
      CollectorError: if the pod has no valid 'properties.spec.host' value.
    """
    assert isinstance(gs, global_state.GlobalState)
    assert isinstance(input_queue, Queue.PriorityQueue)
    assert utilities.valid_string(node_guid)
    assert utilities.is_wrapped_object(pod, 'Pod')
    assert isinstance(g, ContextGraph)

    pod_id = pod['id']
    pod_guid = 'Pod:' + pod_id
    docker_host = utilities.get_attribute(pod, ['properties', 'spec', 'host'])
    if not utilities.valid_string(docker_host):
        msg_format = ('Docker host (pod.properties.spec.host) '
                      'not found in pod ID %s')
        msg = msg_format % pod_id
        gs.logger_error(msg)
        raise collector_error.CollectorError(msg)

    g.add_resource(
        pod_guid, pod['annotations'], 'Pod', pod['timestamp'],
        pod['properties'])

    # Node runs Pod
    g.add_relation(node_guid, pod_guid, 'runs')

    # Containers in a Pod
    for container in docker.get_containers_with_metrics(gs, docker_host):
        if _container_in_pod(gs, container, pod):
            if gs.get_testing():
                # In test mode the container is computed right here: queuing
                # the work would make the order of the output differ from
                # the golden files.
                _do_compute_container(gs, docker_host, pod_guid, container, g)
                continue

            # Otherwise a worker thread computes the container.
            priority = gs.get_random_priority()
            work_args = dict(
                gs=gs,
                docker_host=docker_host,
                pod_guid=pod_guid,
                container=container,
                g=g)
            input_queue.put((priority, _do_compute_container, work_args))
Пример #40
0
def _do_compute_container(gs, docker_host, pod_guid, container, g):
    """Adds a container, its processes and its image to the context graph.

    Args:
      gs: global state.
      docker_host: the Docker host that is queried for the processes and
        the image of the container.
      pod_guid: the graph ID of the pod that contains the container.
      container: a wrapped 'Container' object.
      g: the context graph to update.
    """
    assert isinstance(gs, global_state.GlobalState)
    assert utilities.valid_string(docker_host)
    assert utilities.valid_string(pod_guid)
    assert utilities.is_wrapped_object(container, 'Container')
    assert isinstance(g, ContextGraph)

    cid = container['id']
    c_guid = 'Container:' + cid
    # TODO(vasbala): container_id is too verbose?
    g.add_resource(
        c_guid, container['annotations'], 'Container',
        container['timestamp'], container['properties'])

    # Pod contains Container
    g.add_relation(pod_guid, c_guid, 'contains')

    # Processes in a Container
    for process in docker.get_processes(gs, docker_host, cid):
        p_guid = 'Process:' + process['id']
        g.add_resource(
            p_guid, process['annotations'], 'Process',
            process['timestamp'], process['properties'])

        # Container contains Process
        g.add_relation(c_guid, p_guid, 'contains')

    # Image from which this Container was created, if it can be found.
    image = docker.get_image(gs, docker_host, container)
    if image is not None:
        image_guid = 'Image:' + image['id']
        # The same image may be reached from several containers; this
        # presumably relies on add_resource() to ignore repeated additions
        # -- TODO confirm.
        g.add_resource(
            image_guid, image['annotations'], 'Image',
            image['timestamp'], image['properties'])

        # Container createdFrom Image
        g.add_relation(c_guid, image_guid, 'createdFrom')
Пример #41
0
def _get_container_labels(container, parent_pod):
    """Returns key/value pairs identifying all metrics of this container.

    The labels are the UID of the parent pod ('pod_id'), the host of the
    parent pod ('hostname') and the short name of the container
    ('container_name').

    Args:
      container: the container object to annotate.
      parent_pod: the parent pod of 'container'.

    Returns:
      A dictionary of key/value pairs.
      If any error was detected, returns None.
    """
    is_wrapped = utilities.is_wrapped_object
    if not (is_wrapped(container, 'Container') and
            is_wrapped(parent_pod, 'Pod')):
        return None

    uid_path = ['properties', 'metadata', 'uid']
    pod_uid = utilities.get_attribute(parent_pod, uid_path)
    if not utilities.valid_string(pod_uid):
        return None

    host_path = ['properties', 'spec', 'host']
    host = utilities.get_attribute(parent_pod, host_path)
    if not utilities.valid_string(host):
        return None

    short_name = utilities.get_short_container_name(container, parent_pod)
    if not utilities.valid_string(short_name):
        return None

    labels = {}
    labels['pod_id'] = pod_uid
    labels['hostname'] = host
    labels['container_name'] = short_name
    return labels
Пример #42
0
def _get_container_labels(container, parent_pod):
  """Returns key/value pairs identifying all metrics of this container.

  The labels are the UID of the parent pod ('pod_id'), the host of the
  parent pod ('hostname') and the short name of the container
  ('container_name').

  Args:
    container: the container object to annotate.
    parent_pod: the parent pod of 'container'.

  Returns:
    A dictionary of key/value pairs.
    If any error was detected, returns None.
  """
  if not utilities.is_wrapped_object(container, 'Container'):
    return None
  if not utilities.is_wrapped_object(parent_pod, 'Pod'):
    return None

  # Labels that are read directly from the parent pod, in lookup order.
  pod_attributes = (
      ('pod_id', ['properties', 'metadata', 'uid']),
      ('hostname', ['properties', 'spec', 'host']),
  )
  labels = {}
  for key, path in pod_attributes:
    value = utilities.get_attribute(parent_pod, path)
    if not utilities.valid_string(value):
      return None
    labels[key] = value

  short_name = utilities.get_short_container_name(container, parent_pod)
  if not utilities.valid_string(short_name):
    return None

  labels['container_name'] = short_name
  return labels
Пример #43
0
def _do_compute_pod(gs, input_queue, node_guid, pod, g):
  """Adds a pod to the context graph and schedules its containers.

  The pod is added as a 'Pod' resource that the node runs. Every container
  of the pod's Docker host that belongs to the pod is then computed by
  _do_compute_container(), either immediately (testing mode) or through a
  work item placed in 'input_queue'.

  Args:
    gs: global state.
    input_queue: the priority queue that receives the work items.
    node_guid: the graph ID of the node that runs the pod.
    pod: a wrapped 'Pod' object.
    g: the context graph to update.

  Raises:
    CollectorError: if the pod has no valid 'properties.spec.host' value.
  """
  assert isinstance(gs, global_state.GlobalState)
  assert isinstance(input_queue, Queue.PriorityQueue)
  assert utilities.valid_string(node_guid)
  assert utilities.is_wrapped_object(pod, 'Pod')
  assert isinstance(g, ContextGraph)

  pod_id = pod['id']
  pod_guid = 'Pod:' + pod_id
  host_path = ['properties', 'spec', 'host']
  docker_host = utilities.get_attribute(pod, host_path)
  if not utilities.valid_string(docker_host):
    msg_format = ('Docker host (pod.properties.spec.host) '
                  'not found in pod ID %s')
    msg = msg_format % pod_id
    gs.logger_error(msg)
    raise collector_error.CollectorError(msg)

  g.add_resource(
      pod_guid, pod['annotations'], 'Pod', pod['timestamp'],
      pod['properties'])

  # Node runs Pod
  g.add_relation(node_guid, pod_guid, 'runs')

  # Containers in a Pod
  for container in docker.get_containers_with_metrics(gs, docker_host):
    if not _container_in_pod(gs, container, pod):
      continue

    if not gs.get_testing():
      # Let a worker thread compute the container.
      priority = gs.get_random_priority()
      work_args = {
          'gs': gs,
          'docker_host': docker_host,
          'pod_guid': pod_guid,
          'container': container,
          'g': g,
      }
      input_queue.put((priority, _do_compute_container, work_args))
      continue

    # In test mode the container is computed right here: queuing the work
    # would make the order of the output differ from the golden files.
    _do_compute_container(gs, docker_host, pod_guid, container, g)
Пример #44
0
def get_services(gs):
  """Gets the list of services in the current cluster.

  The result is looked up in gs.get_services_cache() first. On a miss the
  services are fetched from Kubernetes and the cache is updated.

  Args:
    gs: global state.

  Returns:
    list of wrapped service objects.
    Each element in the list is the result of
    utilities.wrap_object(service, 'Service', ...)

  Raises:
    CollectorError: in case of failure to fetch data from Kubernetes.
    Other exceptions may be raised due to execution errors.
  """
  services, timestamp_secs = gs.get_services_cache().lookup('')
  if timestamp_secs is not None:
    gs.logger_info('get_services() cache hit returns %d services',
                   len(services))
    return services

  services = []
  url = '{kubernetes}/services'.format(kubernetes=KUBERNETES_API)
  try:
    result = fetch_data(gs, url)
  except Exception:
    # Catch Exception rather than everything, so that KeyboardInterrupt and
    # SystemExit are not converted into a CollectorError.
    msg = 'fetching %s failed with exception %s' % (url, sys.exc_info()[0])
    gs.logger_exception(msg)
    raise collector_error.CollectorError(msg)

  now = time.time()
  if not (isinstance(result, types.DictType) and 'items' in result):
    msg = 'invalid result when fetching %s' % url
    # NOTE(review): no exception is being handled at this point; if
    # logger_exception() also logs a traceback, logger_error() may be the
    # better call here -- confirm.
    gs.logger_exception(msg)
    raise collector_error.CollectorError(msg)

  for service in result['items']:
    name = utilities.get_attribute(service, ['metadata', 'name'])
    if not utilities.valid_string(name):
      # an invalid service without a valid service ID.
      continue
    services.append(
        utilities.wrap_object(service, 'Service', name, now))

  ret_value = gs.get_services_cache().update('', services, now)
  gs.logger_info('get_services() returns %d services', len(services))
  return ret_value
Пример #45
0
def _do_compute_pod(cluster_guid, pod, g):
  """Adds a pod and its containers to the context graph.

  A pod with a valid 'properties.spec.nodeName' value is attached to that
  node with a 'runs' relation. Any other pod is attached to the cluster
  with a 'contains' relation.

  Args:
    cluster_guid: the graph ID of the cluster.
    pod: a wrapped 'Pod' object.
    g: the context graph to update.
  """
  assert utilities.valid_string(cluster_guid)
  assert utilities.is_wrapped_object(pod, 'Pod')
  assert isinstance(g, ContextGraph)

  pod_guid = 'Pod:' + pod['id']
  g.add_resource(
      pod_guid, pod['annotations'], 'Pod', pod['timestamp'],
      pod['properties'])

  # pod.properties.spec.nodeName may be missing if the pod is waiting
  # (not running yet).
  node_id = utilities.get_attribute(pod, ['properties', 'spec', 'nodeName'])
  if utilities.valid_string(node_id):
    # Pod is running: Node runs Pod
    parent_guid, kind = 'Node:' + node_id, 'runs'
  else:
    # Pod is not running: Cluster contains Pod
    parent_guid, kind = cluster_guid, 'contains'
  g.add_relation(parent_guid, pod_guid, kind)

  for container in kubernetes.get_containers_from_pod(pod):
    metrics.annotate_container(container, pod)
    _do_compute_container(pod_guid, container, g)
Пример #46
0
def get_services(gs):
    """Gets the list of services in the current cluster.

    The result is looked up in gs.get_services_cache() first. On a miss the
    services are fetched from Kubernetes and the cache is updated.

    Args:
      gs: global state.

    Returns:
      list of wrapped service objects.
      Each element in the list is the result of
      utilities.wrap_object(service, 'Service', ...)

    Raises:
      CollectorError: in case of failure to fetch data from Kubernetes.
      Other exceptions may be raised due to execution errors.
    """
    services, timestamp_secs = gs.get_services_cache().lookup('')
    if timestamp_secs is not None:
        app.logger.debug('get_services() cache hit returns %d services',
                         len(services))
        return services

    services = []
    url = get_kubernetes_base_url() + '/services'
    try:
        result = fetch_data(gs, url)
    except Exception:
        msg = 'fetching %s failed with exception %s' % (url, sys.exc_info()[0])
        app.logger.exception(msg)
        raise collector_error.CollectorError(msg)

    now = time.time()
    if not (isinstance(result, dict) and 'items' in result):
        msg = 'invalid result when fetching %s' % url
        # No exception is being handled here, so log with error();
        # exception() would append a meaningless traceback.
        app.logger.error(msg)
        raise collector_error.CollectorError(msg)

    for service in result['items']:
        name = utilities.get_attribute(service, ['metadata', 'name'])
        if not utilities.valid_string(name):
            # an invalid service without a valid service ID.
            continue
        services.append(utilities.wrap_object(service, 'Service', name, now))

    ret_value = gs.get_services_cache().update('', services, now)
    app.logger.info('get_services() returns %d services', len(services))
    return ret_value
Пример #47
0
def get_pods(gs):
    """Gets the list of all pods in the cluster.

    The result is looked up in gs.get_pods_cache() first. On a miss the pods
    are fetched from Kubernetes and the cache is updated.

    Args:
      gs: global state.

    Returns:
      list of wrapped pod objects.
      Each element in the list is the result of
      utilities.wrap_object(pod, 'Pod', ...)

    Raises:
      CollectorError: in case of failure to fetch data from Kubernetes.
      Other exceptions may be raised due to execution errors.
    """
    pods, timestamp_secs = gs.get_pods_cache().lookup('')
    if timestamp_secs is not None:
        app.logger.debug('get_pods() cache hit returns %d pods', len(pods))
        return pods

    pods = []
    url = get_kubernetes_base_url() + '/pods'
    try:
        result = fetch_data(gs, url)
    except Exception:
        msg = 'fetching %s failed with exception %s' % (url, sys.exc_info()[0])
        app.logger.exception(msg)
        raise collector_error.CollectorError(msg)

    now = time.time()
    if not (isinstance(result, dict) and 'items' in result):
        msg = 'invalid result when fetching %s' % url
        # No exception is being handled here, so log with error();
        # exception() would append a meaningless traceback.
        app.logger.error(msg)
        raise collector_error.CollectorError(msg)

    for pod in result['items']:
        name = utilities.get_attribute(pod, ['metadata', 'name'])
        if not utilities.valid_string(name):
            # an invalid pod without a valid pod ID value.
            continue
        wrapped_pod = utilities.wrap_object(pod, 'Pod', name, now)
        pods.append(wrapped_pod)

    ret_value = gs.get_pods_cache().update('', pods, now)
    app.logger.info('get_pods() returns %d pods', len(pods))
    return ret_value
Пример #48
0
  def test_resources(self):
    """Test the '/resources' endpoint.

    The response must pass verify_resources(), contain zero relations of
    every kind, and carry an overall timestamp taken between the start and
    the end of the request.
    """
    start_time = utilities.now()
    response = self.app.get('/cluster/resources')
    end_time = utilities.now()
    result = json.loads(response.data)
    self.verify_resources(result, start_time, end_time)

    relation_kinds = (
        'contains', 'createdFrom', 'loadBalances', 'monitors', 'runs')
    for kind in relation_kinds:
      self.assertEqual(0, self.count_relations(result, kind))

    # The overall timestamp must be in the expected range.
    timestamp = result.get('timestamp')
    self.assertTrue(utilities.valid_string(timestamp))
    self.assertTrue(start_time <= timestamp <= end_time)
Пример #49
0
    def _max_relations_timestamp(self, initial_value):
        """Computes the maximal timestamp of all relations and 'initial_value'.

        Must be called while holding self._lock.

        Args:
          initial_value: the result should be greater or equal to this value.

        Returns:
          Maximum timestamp of all relations and 'initial_value'.
        """
        assert utilities.valid_string(initial_value)
        # 'initial_value' comes first, so it is the result when there are
        # no relations or when no relation has a greater timestamp.
        candidates = [initial_value]
        candidates.extend(r['timestamp'] for r in self._context_relations)
        return max(candidates)
Пример #50
0
def _get_node_labels(node):
    """Returns key/value pairs identifying all metrics of this node.

    The host name is the node's 'properties.metadata.name' value. The pod ID
    is always '' and the container name is always '/'.

    Args:
      node: the node object to annotate.

    Returns:
      A dictionary of key/value pairs.
      If any error was detected, returns None.
    """
    if not utilities.is_wrapped_object(node, 'Node'):
        return None

    name_path = ['properties', 'metadata', 'name']
    host = utilities.get_attribute(node, name_path)
    if utilities.valid_string(host):
        return {'pod_id': '', 'hostname': host, 'container_name': '/'}
    return None
Пример #51
0
def _make_gcm_metrics(project_id, labels_dict):
    """Generate a descriptor of GCM metrics from 'project_id' and 'labels_dict'.

  Args:
    project_id: the project ID
    labels_dict: the key/value pairs that identify all metrics of the
    current resource.

  Returns:
  A dictionary containing the descriptor of the GCM metrics.
  See below for details.
  If 'labels_dict' is None, returns None.

  Typical output is:
  {
    'gcm': {
      'names': ['.../cpu/usage', '.../memory/page_faults', ...],
      'project': PROJECT,
      'labels_prefix': PREFIX,
      'labels': {
         'pod_id': POD_ID, 'hostname': HOSTNAME,
         'container_name': CONTAINER_NAME }
    }
  }
  """
    if labels_dict is None:
        return None

    assert utilities.valid_string(project_id)
    assert isinstance(labels_dict, types.DictType)

    if not labels_dict:
        # an empty dictionary
        return None

    return {
        'gcm': {
            'names': copy.deepcopy(METRIC_NAMES),
            'project': project_id,
            'labels': copy.deepcopy(labels_dict),
            'labels_prefix': METRIC_PREFIX + 'label/'
        }
    }
Пример #52
0
def annotate_node(project_id, node):
    """Attaches Heapster GCM metric information to the given node.

    The metrics descriptor is stored in node['annotations']['metrics']. The
    node is left untouched when no descriptor could be built for it.

    Args:
      project_id: the project ID.
      node: the node object to annotate.

    Raises:
      AssertionError: if the input argument is invalid.
    """
    assert utilities.valid_string(project_id)
    assert utilities.is_wrapped_object(node, 'Node')

    metrics = _make_gcm_metrics(project_id, _get_node_labels(node))
    if metrics is not None:
        # Create the annotations dictionary when it is missing or None.
        if node.get('annotations') is None:
            node['annotations'] = {}
        node['annotations']['metrics'] = metrics
Пример #53
0
def _do_compute_graph(gs, input_queue, output_queue, output_format):
    """Computes the context graph and returns it in the specified format.

    The cluster resource is added directly. One work item per node, service
    and replication controller is placed on 'input_queue', and the function
    waits for the queue to drain before dumping the graph.

    Args:
      gs: the global state.
      input_queue: the input queue for the worker threads.
      output_queue: output queue containing exceptions data from the worker
        threads.
      output_format: one of 'graph', 'dot', 'context_graph', or 'resources'.

    Returns:
      The graph dumped in the specified format.

    Raises:
      CollectorError: inconsistent or invalid graph data.
      AssertionError: if an input argument is invalid.
    """
    assert isinstance(gs, global_state.GlobalState)
    assert isinstance(input_queue, Queue.PriorityQueue)
    assert isinstance(output_queue, Queue.Queue)
    assert utilities.valid_string(output_format)

    graph = ContextGraph()
    graph.set_version(docker.get_version(gs))
    graph.set_metadata({'timestamp': utilities.now()})
    graph.set_relations_to_timestamps(gs.get_relations_to_timestamps())

    def submit(task, arguments):
        """Queues 'task' with 'arguments' at a priority obtained from 'gs'."""
        input_queue.put((gs.get_random_priority(), task, arguments))

    # Without any node there is nothing to add to the graph.
    all_nodes = kubernetes.get_nodes_with_metrics(gs)
    if not all_nodes:
        return graph.dump(gs, output_format)

    # The cluster's timestamp is the timestamp of its oldest node, and is
    # never later than the current time. Timestamps are compared as strings.
    cluster_timestamp = utilities.now()
    for candidate in all_nodes:
        assert utilities.is_wrapped_object(candidate, 'Node')
        node_timestamp = candidate['timestamp']
        if node_timestamp < cluster_timestamp:
            cluster_timestamp = node_timestamp

    # The cluster name is derived from the first node. It is only an
    # approximation, and it is not a big deal if it is incorrect, since the
    # aggregator knows the cluster name.
    cluster_name = utilities.node_id_to_cluster_name(all_nodes[0]['id'])
    cluster_guid = 'Cluster:' + cluster_name
    graph.set_title(cluster_name)
    graph.add_resource(cluster_guid, {'label': cluster_name}, 'Cluster',
                       cluster_timestamp, {})

    # One work item per node. Only the node items receive the input queue.
    for current_node in all_nodes:
        submit(_do_compute_node, {
            'gs': gs,
            'input_queue': input_queue,
            'cluster_guid': cluster_guid,
            'node': current_node,
            'g': graph
        })

    # One work item per service.
    for current_service in kubernetes.get_services(gs):
        submit(_do_compute_service, {
            'gs': gs,
            'cluster_guid': cluster_guid,
            'service': current_service,
            'g': graph
        })

    # One work item per replication controller.
    all_rcontrollers = kubernetes.get_rcontrollers(gs)
    for current_rcontroller in all_rcontrollers:
        submit(_do_compute_rcontroller, {
            'gs': gs,
            'cluster_guid': cluster_guid,
            'rcontroller': current_rcontroller,
            'g': graph
        })

    # Block until the worker threads have processed every outstanding
    # request. After join() returns, all output has been generated.
    input_queue.join()

    # An exception caught by a worker thread is re-raised here, in the
    # current thread. Only the first queued message is consumed.
    if not output_queue.empty():
        error_message = output_queue.get_nowait()  # should not fail.
        gs.logger_error(error_message)
        raise collector_error.CollectorError(error_message)

    # Remember the relations-to-timestamps mapping for the next call.
    gs.set_relations_to_timestamps(graph.get_relations_to_timestamps())

    # Dump the resulting graph.
    return graph.dump(gs, output_format)