def process_one_container(self, container_id, stats, gpu_infos, all_conns,
                          gauges):
    """Collect metrics for one container and record them in ``gauges``.

    Containers that belong neither to a PAI service nor to a job (e.g.
    kubelet or api-server) are skipped.

    Args:
        container_id: docker container id, passed to ``docker_inspect``.
        stats: one entry from ``docker stats`` output (dict with keys like
            "name", "CPUPerc", "MemUsage_Limit", "BlockIO", "MemPerc").
        gpu_infos: mapping from gpu minor number to a gpu status object
            with ``gpu_util``/``gpu_mem_util`` attributes; may be falsy.
        all_conns: current network connections, used to compute per-pid
            network traffic.
        gauges: sink with an ``add_value(metric_name, labels, value)`` API.
    """
    container_name = utils.walk_json_field_safe(stats, "name")
    pai_service_name = ContainerCollector.infer_service_name(container_name)

    inspect_info = docker_inspect.inspect(
        container_id, ContainerCollector.inspect_histogram,
        ContainerCollector.inspect_timeout, self.gpu_vendor)

    pid = inspect_info.pid
    job_name = inspect_info.job_name

    logger.debug("%s has inspect result %s, service_name %s", container_name,
                 inspect_info, pai_service_name)

    if job_name is None and pai_service_name is None:
        logger.debug("%s is ignored", container_name)
        return  # other container, maybe kubelet or api-server

    # get network consumption, since all our services/jobs running in host
    # network, and network statistic from docker is not specific to that
    # container. We have to get network statistic by ourselves.
    lsof_result = network.lsof(pid, ContainerCollector.lsof_histogram,
                               ContainerCollector.lsof_timeout)

    net_in, net_out = network.get_container_network_metrics(
        all_conns, lsof_result)

    if logger.isEnabledFor(logging.DEBUG):
        debug_info = utils.exec_cmd(
            "ps -o cmd fp {0} | tail -n 1".format(pid), shell=True)
        logger.debug(
            "pid %s with cmd `%s` has lsof result %s, in %d, out %d", pid,
            debug_info.strip(), lsof_result, net_in, net_out)

    if pai_service_name is None:
        # Job container: emit task_* metrics, one gpu metric pair per
        # assigned gpu that nvidia-smi actually reported.
        gpu_ids, container_labels = ContainerCollector.parse_from_labels(
            inspect_info, gpu_infos)

        if gpu_infos:
            for gpu_id in gpu_ids:  # renamed from `id`: avoid builtin shadow
                # Single lookup via .get() instead of a membership test
                # followed by a second indexing.
                nvidia_gpu_status = gpu_infos.get(gpu_id)
                if nvidia_gpu_status is None:
                    continue
                labels = copy.deepcopy(container_labels)
                labels["minor_number"] = gpu_id

                gauges.add_value("task_gpu_percent", labels,
                                 nvidia_gpu_status.gpu_util)
                gauges.add_value("task_gpu_mem_percent", labels,
                                 nvidia_gpu_status.gpu_mem_util)

        gauges.add_value("task_cpu_percent", container_labels,
                         stats["CPUPerc"])
        gauges.add_value("task_mem_usage_byte", container_labels,
                         stats["MemUsage_Limit"]["usage"])
        gauges.add_value("task_mem_limit_byte", container_labels,
                         stats["MemUsage_Limit"]["limit"])
        gauges.add_value("task_net_in_byte", container_labels, net_in)
        gauges.add_value("task_net_out_byte", container_labels, net_out)
        gauges.add_value("task_block_in_byte", container_labels,
                         stats["BlockIO"]["in"])
        gauges.add_value("task_block_out_byte", container_labels,
                         stats["BlockIO"]["out"])
        gauges.add_value("task_mem_usage_percent", container_labels,
                         stats["MemPerc"])
    else:
        # PAI service container: emit service_* metrics keyed by service name.
        labels = {"name": pai_service_name}
        gauges.add_value("service_cpu_percent", labels, stats["CPUPerc"])
        gauges.add_value("service_mem_usage_byte", labels,
                         stats["MemUsage_Limit"]["usage"])
        gauges.add_value("service_mem_limit_byte", labels,
                         stats["MemUsage_Limit"]["limit"])
        gauges.add_value("service_mem_usage_percent", labels, stats["MemPerc"])
        gauges.add_value("service_net_in_byte", labels, net_in)
        gauges.add_value("service_net_out_byte", labels, net_out)
        gauges.add_value("service_block_in_byte", labels,
                         stats["BlockIO"]["in"])
        gauges.add_value("service_block_out_byte", labels,
                         stats["BlockIO"]["out"])
def collect_job_metrics(gpu_infos, all_conns, type1_zombies, type2_zombies):
    """Collect container/service metrics from ``docker stats``.

    Args:
        gpu_infos: mapping from gpu minor number to a dict with
            "gpuUtil"/"gpuMemUtil" keys; may be falsy.
        all_conns: current network connections, used for per-pid traffic.
        type1_zombies, type2_zombies: zombie-container trackers forwarded to
            ``generate_zombie_count``.

    Returns:
        A list of ``Metric`` objects, or ``None`` when ``docker stats``
        produced no output.
    """
    stats_obj = docker_stats.stats()
    if stats_obj is None:
        logger.warning("docker stats returns None")
        return None

    result = []
    for container_id, stats in stats_obj.items():
        pai_service_name = None

        # TODO speed this up, since this is O(n^2)
        for service_name in pai_services:
            if stats["name"].startswith(service_name):
                pai_service_name = service_name[4:]  # remove "k8s_" prefix
                break

        inspect_info = docker_inspect.inspect(container_id)

        pid = inspect_info["pid"] if inspect_info is not None else None
        inspect_labels = utils.walk_json_field_safe(inspect_info, "labels")

        if not inspect_labels and pai_service_name is None:
            continue  # other container, maybe kubelet or api-server

        # get network consumption, since all our services/jobs running in
        # host network, network statistic from docker is not specific to
        # that container. We have to get network statistic by ourselves.
        lsof_result = network.lsof(pid)

        net_in, net_out = network.get_container_network_metrics(
            all_conns, lsof_result)

        if logger.isEnabledFor(logging.DEBUG):
            debug_info = utils.check_output(
                "ps -o cmd fp {0} | tail -n 1".format(pid), shell=True)
            logger.debug(
                "pid %s with cmd `%s` has lsof result %s, in %d, out %d",
                pid, debug_info, lsof_result, net_in, net_out)

        if pai_service_name is None:
            # snake_case locals for consistency with the rest of the file
            # (were gpuIds/otherLabels).
            gpu_ids, other_labels = parse_from_labels(inspect_info["labels"])
            other_labels.update(inspect_info["env"])

            # Hoisted loop-invariant `if gpu_infos` out of the loop.
            if gpu_infos:
                for gpu_id in gpu_ids:  # renamed from `id`: builtin shadow
                    # Guard against a gpu id that nvidia-smi did not report;
                    # bare gpu_infos[id] raised KeyError here.
                    gpu_status = gpu_infos.get(gpu_id)
                    if gpu_status is None:
                        continue
                    labels = copy.deepcopy(other_labels)
                    labels["minor_number"] = gpu_id

                    result.append(
                        Metric("container_GPUPerc", labels,
                               gpu_status["gpuUtil"]))
                    result.append(
                        Metric("container_GPUMemPerc", labels,
                               gpu_status["gpuMemUtil"]))

            result.append(
                Metric("container_CPUPerc", other_labels, stats["CPUPerc"]))
            result.append(
                Metric("container_MemUsage", other_labels,
                       stats["MemUsage_Limit"]["usage"]))
            result.append(
                Metric("container_MemLimit", other_labels,
                       stats["MemUsage_Limit"]["limit"]))
            result.append(Metric("container_NetIn", other_labels, net_in))
            result.append(Metric("container_NetOut", other_labels, net_out))
            result.append(
                Metric("container_BlockIn", other_labels,
                       stats["BlockIO"]["in"]))
            result.append(
                Metric("container_BlockOut", other_labels,
                       stats["BlockIO"]["out"]))
            result.append(
                Metric("container_MemPerc", other_labels, stats["MemPerc"]))
        else:
            labels = {"name": pai_service_name}
            result.append(
                Metric("service_cpu_percent", labels, stats["CPUPerc"]))
            result.append(
                Metric("service_mem_usage_byte", labels,
                       stats["MemUsage_Limit"]["usage"]))
            result.append(
                Metric("service_mem_limit_byte", labels,
                       stats["MemUsage_Limit"]["limit"]))
            result.append(
                Metric("service_mem_usage_percent", labels, stats["MemPerc"]))
            result.append(Metric("service_net_in_byte", labels, net_in))
            result.append(Metric("service_net_out_byte", labels, net_out))
            result.append(
                Metric("service_block_in_byte", labels,
                       stats["BlockIO"]["in"]))
            result.append(
                Metric("service_block_out_byte", labels,
                       stats["BlockIO"]["out"]))

    result.extend(
        generate_zombie_count(stats_obj, type1_zombies, type2_zombies))

    return result