Exemplo n.º 1
0
def _do_compute_node(gs, input_queue, cluster_guid, node, g):
    """Adds one Node to the context graph and handles the pods it runs.

    The node is added as a 'Node' resource contained by the cluster. Each
    pod returned by kubernetes.get_pods() for this node is computed inline
    when gs.get_testing() is true and queued on 'input_queue' otherwise.

    Args:
      gs: the global state.
      input_queue: the priority queue receiving deferred pod work items.
      cluster_guid: the graph ID of the cluster containing this node.
      node: a wrapped Node object.
      g: the context graph under construction.
    """
    assert isinstance(gs, global_state.GlobalState)
    assert isinstance(input_queue, Queue.PriorityQueue)
    assert utilities.valid_string(cluster_guid)
    assert utilities.is_wrapped_object(node, 'Node')
    assert isinstance(g, ContextGraph)

    this_node_id = node['id']
    node_guid = 'Node:' + this_node_id
    g.add_resource(
        node_guid, node['annotations'], 'Node', node['timestamp'],
        node['properties'])
    # Cluster contains Node
    g.add_relation(cluster_guid, node_guid, 'contains')

    # Pods in a Node
    for pod in kubernetes.get_pods(gs, this_node_id):
        if gs.get_testing():
            # Test mode computes pods inline: queued work would come out in
            # a different order than the golden files expect.
            _do_compute_pod(gs, input_queue, node_guid, pod, g)
            continue

        priority = gs.get_random_priority()
        pod_args = {
            'gs': gs,
            'input_queue': input_queue,
            'node_guid': node_guid,
            'pod': pod,
            'g': g,
        }
        input_queue.put((priority, _do_compute_pod, pod_args))
Exemplo n.º 2
0
def _do_compute_node(gs, input_queue, cluster_guid, node, g):
  """Registers a Node in the graph and dispatches the work for its pods.

  Args:
    gs: the global state.
    input_queue: the priority queue used to hand pod work to worker threads.
    cluster_guid: the graph ID of the enclosing cluster.
    node: a wrapped Node object.
    g: the context graph under construction.
  """
  assert isinstance(gs, global_state.GlobalState)
  assert isinstance(input_queue, Queue.PriorityQueue)
  assert utilities.valid_string(cluster_guid)
  assert utilities.is_wrapped_object(node, 'Node')
  assert isinstance(g, ContextGraph)

  current_id = node['id']
  node_guid = 'Node:' + current_id
  g.add_resource(
      node_guid,
      node['annotations'],
      'Node',
      node['timestamp'],
      node['properties'])
  # Cluster contains Node
  g.add_relation(cluster_guid, node_guid, 'contains')

  # Pods in a Node.
  # Worker threads are bypassed in test mode: queuing the work changes the
  # order of the output, which would no longer match the golden files.
  for pod in kubernetes.get_pods(gs, current_id):
    if not gs.get_testing():
      work_item = (
          gs.get_random_priority(),
          _do_compute_pod,
          dict(gs=gs, input_queue=input_queue, node_guid=node_guid,
               pod=pod, g=g))
      input_queue.put(work_item)
    else:
      _do_compute_pod(gs, input_queue, node_guid, pod, g)
Exemplo n.º 3
0
def _do_compute_node(gs, input_queue, cluster_guid, node, g):
  """Adds a Node, its pods, and the containers of its Docker hosts to the graph.

  Pods are computed inline. Containers are computed inline in test mode and
  queued on 'input_queue' otherwise.

  Args:
    gs: the global state.
    input_queue: the priority queue receiving deferred container work items.
    cluster_guid: the graph ID of the enclosing cluster.
    node: a wrapped Node object.
    g: the context graph under construction.
  """
  assert isinstance(gs, global_state.GlobalState)
  assert isinstance(input_queue, Queue.PriorityQueue)
  assert utilities.valid_string(cluster_guid)
  assert utilities.is_wrapped_object(node, 'Node')
  assert isinstance(g, ContextGraph)

  node_id = node['id']
  node_guid = 'Node:' + node_id
  g.add_resource(
      node_guid, node['annotations'], 'Node', node['timestamp'],
      node['properties'])
  # Cluster contains Node
  g.add_relation(cluster_guid, node_guid, 'contains')

  # Pods in a Node
  seen_pod_ids = set()
  hosts = set()

  # Pods are handled one after another: _do_compute_pod() neither calls
  # lower-level services nor waits.
  for pod in kubernetes.get_pods(gs, node_id):
    _do_compute_pod(gs, cluster_guid, node_guid, pod, g)
    seen_pod_ids.add(pod['id'])
    # A waiting pod may have no pod.properties.spec.nodeName.
    host_name = utilities.get_attribute(
        pod, ['properties', 'spec', 'nodeName'])
    if utilities.valid_string(host_name):
      hosts.add(host_name)

  # All pods of a Node are expected to report the same Docker host. When
  # they do not, nothing can be repaired here: log the problem and go on.
  if len(hosts) != 1:
    gs.logger_error(
        'corrupt pod data in node=%s: '
        '"docker_hosts" is empty or contains more than one entry: %s',
        node_guid, str(hosts))

  # Process containers concurrently.
  for docker_host in hosts:
    for container in docker.get_containers_with_metrics(gs, docker_host):
      parent_pod_id = utilities.get_parent_pod_id(container)
      # A container whose parent pod is unknown is attached to the node.
      parent_guid = node_guid
      if utilities.valid_string(parent_pod_id) and (
          parent_pod_id in seen_pod_ids):
        parent_guid = 'Pod:' + parent_pod_id

      if gs.get_testing():
        # Test mode computes containers inline: queued work would come out
        # in a different order than the golden files expect.
        _do_compute_container(gs, docker_host, parent_guid, container, g)
        continue

      priority = gs.get_random_priority()
      container_args = {
          'gs': gs,
          'docker_host': docker_host,
          'parent_guid': parent_guid,
          'container': container,
          'g': g,
      }
      input_queue.put((priority, _do_compute_container, container_args))
Exemplo n.º 4
0
def _do_compute_other_nodes(gs, cluster_guid, nodes_list, oldest_timestamp, g):
  """Adds to the graph the nodes that run pods but are absent from the list.

  Pods may run on the master node, which the nodes list does not include.
  For every such unlisted node a placeholder Node resource is added to the
  graph and attached to the cluster.

  Args:
    gs: the global state.
    cluster_guid: the cluster's ID.
    nodes_list: a list of wrapped Node objects.
    oldest_timestamp: the timestamp of the oldest Node object.
    g: the context graph under construction.
  """
  assert isinstance(gs, global_state.GlobalState)
  assert utilities.valid_string(cluster_guid)
  assert isinstance(nodes_list, list)
  assert utilities.valid_string(oldest_timestamp)
  assert isinstance(g, ContextGraph)

  # IDs of the nodes already present in the list.
  listed_ids = set()
  for listed_node in nodes_list:
    assert utilities.is_wrapped_object(listed_node, 'Node')
    listed_ids.add(listed_node['id'])

  # IDs of nodes that some pod refers to but the list does not contain.
  # This set may end up empty.
  unlisted_ids = set()
  for pod in kubernetes.get_pods(gs):
    assert utilities.is_wrapped_object(pod, 'Pod')
    # A waiting pod may have no pod.properties.spec.nodeName.
    host_id = utilities.get_attribute(
        pod, ['properties', 'spec', 'nodeName'])
    if utilities.valid_string(host_id) and host_id not in listed_ids:
      unlisted_ids.add(host_id)

  # Add a placeholder resource for each unlisted node.
  for node_id in unlisted_ids:
    # The empty wrapped object only exists to carry metric annotations.
    placeholder = utilities.wrap_object({}, 'Node', node_id, time.time())
    metrics.annotate_node(placeholder)

    node_guid = 'Node:' + node_id
    g.add_resource(
        node_guid, placeholder['annotations'], 'Node', oldest_timestamp, {})
    # Cluster contains Node
    g.add_relation(cluster_guid, node_guid, 'contains')
Exemplo n.º 5
0
def get_pods():
  """Builds the response of the '/cluster/resources/pods' endpoint.

  Returns:
    A JSON response holding the pods of the context graph, or an error
    payload when fetching the pods raises CollectorError.
  """
  gs = app.context_graph_global_state
  try:
    pods = kubernetes.get_pods(gs)
  except collector_error.CollectorError as err:
    payload = utilities.make_error(str(err))
  else:
    payload = utilities.make_response(pods, 'resources')

  return flask.jsonify(payload)
Exemplo n.º 6
0
def get_pods():
    """Serves the '/cluster/resources/pods' endpoint.

    Returns:
      The pods of the context graph as a JSON response; a JSON error
      payload is returned instead when CollectorError is raised.
    """
    state = app.context_graph_global_state
    try:
        all_pods = kubernetes.get_pods(state)
    except collector_error.CollectorError as failure:
        error_body = utilities.make_error(str(failure))
        return flask.jsonify(error_body)
    else:
        response_body = utilities.make_response(all_pods, 'resources')
        return flask.jsonify(response_body)
Exemplo n.º 7
0
def get_pods():
  """Computes the response of the '/cluster/resources/pods' endpoint.

  Returns:
    A JSON response with the pods of the context graph on success, or a
    JSON error payload when kubernetes.get_pods() fails.
  """
  gs = app.context_graph_global_state
  try:
    pods_list = kubernetes.get_pods(gs, None)
  except collector_error.CollectorError as e:
    return flask.jsonify(utilities.make_error(str(e)))
  except Exception:
    # Catch Exception instead of using a bare 'except:' so that SystemExit
    # and KeyboardInterrupt propagate instead of being reported as an
    # endpoint error.
    msg = 'kubernetes.get_pods() failed with exception %s' % sys.exc_info()[0]
    app.logger.exception(msg)
    return flask.jsonify(utilities.make_error(msg))

  return flask.jsonify(utilities.make_response(pods_list, 'resources'))
Exemplo n.º 8
0
def get_pods():
    """Computes the response of the '/cluster/resources/pods' endpoint.

    Returns:
      A JSON response with the pods of the context graph on success, or a
      JSON error payload when kubernetes.get_pods() fails.
    """
    gs = app.context_graph_global_state
    try:
        pods_list = kubernetes.get_pods(gs, None)
    except collector_error.CollectorError as e:
        return flask.jsonify(utilities.make_error(str(e)))
    except Exception:
        # Catch Exception instead of using a bare 'except:' so that
        # SystemExit and KeyboardInterrupt propagate instead of being
        # reported as an endpoint error.
        msg = ('kubernetes.get_pods() failed with exception %s' %
               sys.exc_info()[0])
        app.logger.exception(msg)
        return flask.jsonify(utilities.make_error(msg))

    return flask.jsonify(utilities.make_response(pods_list, 'resources'))
Exemplo n.º 9
0
def get_containers_with_metrics(gs, docker_host):
  """Gets the list of all containers in 'docker_host' with metric annotations.

  Args:
    gs: global state.
    docker_host: the Docker host running the containers.

  Returns:
    list of wrapped container objects.
    Each element in the list is the result of
    utilities.wrap_object(container, 'Container', ...)
    The list is empty when the Docker host has no containers.

  Raises:
    CollectorError: in case of failure to fetch data from Docker, when the
      host has containers but no pods, or when a container's parent pod is
      missing, invalid, or unknown.
    Other exceptions may be raised due to execution errors.
  """
  containers_list = get_containers(gs, docker_host)
  if not containers_list:
    return []

  # Create a lookup table from pod IDs to pods.
  # This lookup table is needed when annotating containers with
  # metrics. Also compute the project's name.
  pod_id_to_pod = {}
  project_id = '_unknown_'

  # Populate the pod ID to pod lookup table.
  # Compute the project_id from the first pod that has a valid host name.
  for pod in kubernetes.get_pods(gs, docker_host):
    assert utilities.is_wrapped_object(pod, 'Pod')
    pod_id_to_pod[pod['id']] = pod
    if project_id != '_unknown_':
      continue
    pod_hostname = utilities.get_attribute(
        pod, ['properties', 'spec', 'host'])
    if utilities.valid_string(pod_hostname):
      project_id = utilities.node_id_to_project_id(pod_hostname)

  # We know that there are containers in this docker_host.
  if not pod_id_to_pod:
    # there are no pods in this docker_host.
    msg = 'Docker host %s has containers but no pods' % docker_host
    # No exception is being handled at this point, so report the failure
    # with logger_error(), like the other failure paths of this function.
    # NOTE(review): assumes logger_exception() wraps logging's
    # Logger.exception(), which is meant for exception handlers - confirm.
    gs.logger_error(msg)
    raise collector_error.CollectorError(msg)

  # Annotate the containers with their metrics.
  for container in containers_list:
    assert utilities.is_wrapped_object(container, 'Container')

    parent_pod_id = utilities.get_parent_pod_id(container)
    if not utilities.valid_string(parent_pod_id):
      msg = ('missing or invalid parent pod ID in container %s' %
             container['id'])
      gs.logger_error(msg)
      raise collector_error.CollectorError(msg)

    if parent_pod_id not in pod_id_to_pod:
      msg = ('could not locate parent pod %s for container %s' %
             (parent_pod_id, container['id']))
      gs.logger_error(msg)
      raise collector_error.CollectorError(msg)

    # Note that the project ID may be '_unknown_'.
    # This is not a big deal, because the aggregator knows the project ID.
    metrics.annotate_container(
        project_id, container, pod_id_to_pod[parent_pod_id])

  return containers_list
Exemplo n.º 10
0
def _do_compute_graph(gs, output_format):
  """Builds the context graph and returns it in the requested format.

  Args:
    gs: the global state.
    output_format: one of 'dot', 'context_graph', or 'resources'.

  Returns:
    A successful response in the specified format.

  Raises:
    CollectorError: inconsistent or invalid graph data.
  """
  assert isinstance(gs, global_state.GlobalState)
  assert utilities.valid_string(output_format)

  g = ContextGraph()
  previous_relations = gs.get_relations_to_timestamps()
  g.set_relations_to_timestamps(previous_relations)

  # Nodes: without any node the graph is dumped as it stands.
  nodes_list = kubernetes.get_nodes_with_metrics(gs)
  if not nodes_list:
    return g.dump(output_format)

  # The cluster's timestamp is the timestamp of its oldest node.
  oldest_timestamp = utilities.now()
  for candidate in nodes_list:
    assert utilities.is_wrapped_object(candidate, 'Node')
    # Two-argument min() compares the two values with '<' and keeps the
    # current value unless the node's timestamp is smaller.
    oldest_timestamp = min(oldest_timestamp, candidate['timestamp'])

  # The cluster name may be available through the Kubernetes API someday.
  # TODO(rimey): Determine the cluster name.
  cluster_name = '_unknown_'
  cluster_guid = 'Cluster:' + cluster_name
  g.set_title(cluster_name)
  cluster_annotations = {'label': cluster_name}
  g.add_resource(
      cluster_guid, cluster_annotations, 'Cluster', oldest_timestamp, {})

  # Nodes
  for node in nodes_list:
    _do_compute_node(cluster_guid, node, g)

  # Pods
  for pod in kubernetes.get_pods(gs):
    _do_compute_pod(cluster_guid, pod, g)

  # Services
  for service in kubernetes.get_services(gs):
    _do_compute_service(gs, cluster_guid, service, g)

  # ReplicationControllers
  for rcontroller in kubernetes.get_rcontrollers(gs):
    _do_compute_rcontroller(gs, cluster_guid, rcontroller, g)

  # Nodes missing from the list, such as the Kubernetes master.
  _do_compute_other_nodes(gs, cluster_guid, nodes_list, oldest_timestamp, g)

  # Remember the relations_to_timestamps mapping for the next call.
  current_relations = g.get_relations_to_timestamps()
  gs.set_relations_to_timestamps(current_relations)
  newest_timestamp = g.max_resources_and_relations_timestamp()
  g.set_metadata({'timestamp': newest_timestamp})

  # Dump the resulting graph
  return g.dump(output_format)
Exemplo n.º 11
0
def _do_compute_master_pods(gs, cluster_guid, nodes_list, oldest_timestamp, g):
  """Adds the pods running on unlisted nodes (the master) to the graph.

  The nodes list does not include the master node, so pods running there
  have no valid parent node. For every node that a pod refers to but the
  list lacks, this routine adds a placeholder Node resource and then the
  pods running on it. Containers, processes, and images of these nodes are
  not added, because there is no minion collector running on the master.

  Note that in some configurations (for example, GKE), there is no
  master node.

  Args:
    gs: the global state.
    cluster_guid: the cluster's ID.
    nodes_list: a list of wrapped Node objects.
    oldest_timestamp: the timestamp of the oldest Node object.
    g: the context graph under construction.
  """
  assert isinstance(gs, global_state.GlobalState)
  assert utilities.valid_string(cluster_guid)
  assert isinstance(nodes_list, types.ListType)
  assert utilities.valid_string(oldest_timestamp)
  assert isinstance(g, ContextGraph)

  # IDs of the listed nodes. The project ID is derived from each listed
  # node in turn, so the value from the last node is the one kept.
  listed_ids = set()
  project_id = '_unknown_'
  for listed_node in nodes_list:
    assert utilities.is_wrapped_object(listed_node, 'Node')
    listed_id = listed_node['id']
    listed_ids.add(listed_id)
    project_id = utilities.node_id_to_project_id(listed_id)

  assert utilities.valid_string(project_id)

  # IDs of nodes that some pod refers to but the list does not contain.
  # This set may end up empty.
  unlisted_ids = set()
  for pod in kubernetes.get_pods(gs):
    assert utilities.is_wrapped_object(pod, 'Pod')
    # A waiting pod may have no pod.properties.spec.nodeName.
    host_id = utilities.get_attribute(
        pod, ['properties', 'spec', 'nodeName'])
    if utilities.valid_string(host_id) and host_id not in listed_ids:
      unlisted_ids.add(host_id)

  # Add each unlisted node together with the pods running on it.
  for node_id in unlisted_ids:
    # The empty wrapped object only exists to carry metric annotations.
    host_label = utilities.node_id_to_host_name(node_id)
    placeholder = utilities.wrap_object(
        {}, 'Node', node_id, time.time(), label=host_label)

    # An '_unknown_' project_id is acceptable here, since the aggregator
    # knows the project ID.
    metrics.annotate_node(project_id, placeholder)

    node_guid = 'Node:' + node_id
    g.add_resource(
        node_guid, placeholder['annotations'], 'Node', oldest_timestamp, {})
    # Cluster contains Node
    g.add_relation(cluster_guid, node_guid, 'contains')

    for hosted_pod in kubernetes.get_pods(gs, node_id):
      _do_compute_pod(gs, cluster_guid, node_guid, hosted_pod, g)
Exemplo n.º 12
0
def get_version(gs):
  """Returns human-readable information about the currently running image.

  The result is looked up in, and stored into, gs.get_version_cache() under
  the key ''.

  Args:
    gs: global state.

  Returns:
    A string of the form:
    <symbolic image name> <first 12 characters of the image hex ID>
    <creation date and time>
    On a cache miss the returned value is whatever the cache's update()
    returns (presumably the version string just stored; verify against the
    cache implementation).

  Raises:
    CollectorError: in case of any error to compute the running image
      information.
  """
  version, timestamp_secs = gs.get_version_cache().lookup('')
  if timestamp_secs is not None:
    assert utilities.valid_string(version)
    gs.logger_info('get_version() cache hit')
    return version

  if gs.get_testing():
    fname = 'testdata/proc-self-cgroup.txt'
  else:
    fname = '/proc/self/cgroup'

  try:
    # The 'with' block closes the file even when read() raises.
    with open(fname, 'r') as f:
      cgroup = f.read()
  except IOError:
    # file not found
    msg = 'failed to open or read %s' % fname
    gs.logger_exception(msg)
    raise collector_error.CollectorError(msg)
  except Exception:
    # Catch Exception instead of using a bare 'except:' so that SystemExit
    # and KeyboardInterrupt are not converted into a CollectorError.
    msg = 'reading %s failed with exception %s' % (fname, sys.exc_info()[0])
    gs.logger_exception(msg)
    raise collector_error.CollectorError(msg)

  # The file must contain an entry for '\d+:cpu:/...'.
  m = re.search(r'\b\d+:cpu:/([0-9a-fA-F]+)\b', cgroup)
  if not m:
    msg = 'could not find an entry for "cpu:/docker/..." in %s' % fname
    gs.logger_error(msg)
    raise collector_error.CollectorError(msg)

  hex_container_id = m.group(1)
  if gs.get_testing():
    # This pod name is guaranteed to match a pod in the testdata directory.
    my_pod_name = 'kube-dns-bqw5e'
  else:
    # os.uname()[1] is the host name, used here as this pod's name.
    my_pod_name = os.uname()[1]
  assert utilities.valid_string(my_pod_name)

  # Find my node name from my pod.
  my_node_name = None
  for pod in kubernetes.get_pods(gs):
    assert utilities.is_wrapped_object(pod, 'Pod')
    if pod['id'] == my_pod_name:
      my_node_name = utilities.get_attribute(
          pod, ['properties', 'spec', 'nodeName'])
      break

  if not utilities.valid_string(my_node_name):
    msg = ('could not find pod %s or this pod does not contain a valid '
           'node name' % my_pod_name)
    gs.logger_error(msg)
    raise collector_error.CollectorError(msg)

  # inspect the running container.
  # Must specify an explicit host name (not "localhost").
  url = 'http://{host}:{port}/containers/{container_id}/json'.format(
      host=my_node_name, port=gs.get_docker_port(),
      container_id=hex_container_id)
  container = fetch_data(gs, url, 'container-' + hex_container_id[:12])

  # Fetch the image symbolic name and hex ID from the container information.
  symbolic_image_id = utilities.get_attribute(container, ['Config', 'Image'])
  hex_image_id = utilities.get_attribute(container, ['Image'])

  # Verify the image symbolic name and the image hex ID: the symbolic name
  # must be a valid string that is not itself a hex ID, and the hex ID must
  # be valid.
  if not (utilities.valid_string(symbolic_image_id) and
          not utilities.valid_hex_id(symbolic_image_id) and
          utilities.valid_hex_id(hex_image_id)):
    msg = 'could not find or invalid image information in container %s' % url
    gs.logger_error(msg)
    raise collector_error.CollectorError(msg)

  # Fetch image information.
  # Must specify an explicit host name (not "localhost").
  url = 'http://{host}:{port}/images/{image_id}/json'.format(
      host=my_node_name, port=gs.get_docker_port(),
      image_id=hex_image_id)
  image = fetch_data(gs, url, 'image-' + hex_image_id[:12])

  # Fetch the image creation timestamp.
  created = utilities.get_attribute(image, ['Created'])
  if not utilities.valid_string(created):
    msg = 'could not find image creation timestamp in %s' % url
    gs.logger_error(msg)
    raise collector_error.CollectorError(msg)

  # Remove the trailing subsecond part of the creation timestamp.
  created = re.sub(r'\.[0-9]+Z$', '', created)

  version = '%s %s %s' % (symbolic_image_id, hex_image_id[:12], created)
  ret_value = gs.get_version_cache().update('', version)
  gs.logger_info('get_version() returns: %s', ret_value)
  return ret_value
Exemplo n.º 13
0
def get_containers_with_metrics(gs, docker_host):
    """Gets the list of all containers in 'docker_host' with metric annotations.

    Args:
      gs: global state.
      docker_host: the Docker host running the containers.

    Returns:
      list of wrapped container objects.
      Each element in the list is the result of
      utilities.wrap_object(container, 'Container', ...)
      The list is empty when the Docker host has no containers.

    Raises:
      CollectorError: in case of failure to fetch data from Docker, when the
        host has containers but no pods, or when a container's parent pod
        is missing, invalid, or unknown.
      Other exceptions may be raised due to execution errors.
    """
    containers_list = get_containers(gs, docker_host)
    if not containers_list:
        return []

    # Create a lookup table from pod IDs to pods.
    # This lookup table is needed when annotating containers with
    # metrics. Also compute the project's name.
    pod_id_to_pod = {}
    project_id = '_unknown_'

    # Populate the pod ID to pod lookup table.
    # Compute the project_id from the first pod that has a valid host name.
    for pod in kubernetes.get_pods(gs, docker_host):
        assert utilities.is_wrapped_object(pod, 'Pod')
        pod_id_to_pod[pod['id']] = pod
        if project_id != '_unknown_':
            continue
        pod_hostname = utilities.get_attribute(pod,
                                               ['properties', 'spec', 'host'])
        if utilities.valid_string(pod_hostname):
            project_id = utilities.node_id_to_project_id(pod_hostname)

    # We know that there are containers in this docker_host.
    if not pod_id_to_pod:
        # there are no pods in this docker_host.
        msg = 'Docker host %s has containers but no pods' % docker_host
        # No exception is being handled at this point, so report the
        # failure with logger_error(), like the other failure paths of this
        # function.
        # NOTE(review): assumes logger_exception() wraps logging's
        # Logger.exception(), which is meant for exception handlers -
        # confirm.
        gs.logger_error(msg)
        raise collector_error.CollectorError(msg)

    # Annotate the containers with their metrics.
    for container in containers_list:
        assert utilities.is_wrapped_object(container, 'Container')

        parent_pod_id = utilities.get_parent_pod_id(container)
        if not utilities.valid_string(parent_pod_id):
            msg = ('missing or invalid parent pod ID in container %s' %
                   container['id'])
            gs.logger_error(msg)
            raise collector_error.CollectorError(msg)

        if parent_pod_id not in pod_id_to_pod:
            msg = ('could not locate parent pod %s for container %s' %
                   (parent_pod_id, container['id']))
            gs.logger_error(msg)
            raise collector_error.CollectorError(msg)

        # Note that the project ID may be '_unknown_'.
        # This is not a big deal, because the aggregator knows the project ID.
        metrics.annotate_container(project_id, container,
                                   pod_id_to_pod[parent_pod_id])

    return containers_list