def collect_job_metrics(gpuInfos):
    """Collect per-job-container metrics from ``docker stats``.

    Args:
        gpuInfos: mapping of GPU minor number -> dict with at least
            ``"gpuUtil"`` and ``"gpuMemUtil"`` entries; may be None or empty
            when the node has no GPUs.

    Returns:
        list of ``Metric`` objects, or None when ``docker stats`` failed.
    """
    stats = docker_stats.stats()
    if stats is None:
        logger.warning("docker stats returns None")
        return None

    result = []
    for container in stats:
        inspectInfo = docker_inspect.inspect(container)
        # container may have exited between stats and inspect, or carry no
        # job labels at all — skip either way
        if inspectInfo is None or not inspectInfo["labels"]:
            continue

        gpuIds, otherLabels = parse_from_labels(inspectInfo["labels"])
        otherLabels.update(inspectInfo["env"])

        if gpuIds and gpuInfos:
            # log once per container instead of once per GPU id
            logger.info(gpuInfos)
        for gpu_id in gpuIds:
            # guard: a minor number from the labels may be missing from
            # gpuInfos — indexing directly would raise KeyError
            if not gpuInfos or gpuInfos.get(gpu_id) is None:
                continue
            labels = copy.deepcopy(otherLabels)
            labels["minor_number"] = gpu_id
            result.append(Metric("container_GPUPerc", labels,
                                 gpuInfos[gpu_id]["gpuUtil"]))
            result.append(Metric("container_GPUMemPerc", labels,
                                 gpuInfos[gpu_id]["gpuMemUtil"]))

        containerStat = stats[container]
        result.append(Metric("container_CPUPerc", otherLabels,
                             containerStat["CPUPerc"]))
        result.append(Metric("container_MemUsage", otherLabels,
                             containerStat["MemUsage_Limit"]["usage"]))
        result.append(Metric("container_MemLimit", otherLabels,
                             containerStat["MemUsage_Limit"]["limit"]))
        result.append(Metric("container_NetIn", otherLabels,
                             containerStat["NetIO"]["in"]))
        result.append(Metric("container_NetOut", otherLabels,
                             containerStat["NetIO"]["out"]))
        result.append(Metric("container_BlockIn", otherLabels,
                             containerStat["BlockIO"]["in"]))
        result.append(Metric("container_BlockOut", otherLabels,
                             containerStat["BlockIO"]["out"]))
        result.append(Metric("container_MemPerc", otherLabels,
                             containerStat["MemPerc"]))
    return result
def collect_job_metrics(gpuInfos):
    """Collect metrics for both job containers and PAI service containers.

    Containers whose name matches an entry in ``pai_services`` are reported
    with ``service_*`` metric names; everything else with recognizable job
    labels is reported with ``container_*`` metric names.

    Args:
        gpuInfos: mapping of GPU minor number -> dict with ``"gpuUtil"`` and
            ``"gpuMemUtil"``; may be None/empty on non-GPU nodes.

    Returns:
        list of ``Metric`` objects, or None when ``docker stats`` failed.
    """
    all_stats = docker_stats.stats()
    if all_stats is None:
        logger.warning("docker stats returns None")
        return None

    result = []
    # renamed loop variable: it previously shadowed the stats dict itself,
    # which only worked because .items() is evaluated once
    for container_id, container_stat in all_stats.items():
        pai_service_name = None
        # TODO speed this up, since this is O(n^2)
        for service_name in pai_services:
            if container_stat["name"].startswith(service_name):
                pai_service_name = service_name[4:]  # remove "k8s_" prefix
                break

        if pai_service_name is None:
            inspectInfo = docker_inspect.inspect(container_id)
            # skip containers that vanished or carry no job labels
            if inspectInfo is None or not inspectInfo["labels"]:
                continue

            gpuIds, otherLabels = parse_from_labels(inspectInfo["labels"])
            otherLabels.update(inspectInfo["env"])

            for gpu_id in gpuIds:
                # guard: label may reference a GPU absent from gpuInfos —
                # direct indexing would raise KeyError
                if not gpuInfos or gpuInfos.get(gpu_id) is None:
                    continue
                logger.info(gpuInfos)
                labels = copy.deepcopy(otherLabels)
                labels["minor_number"] = gpu_id
                result.append(Metric("container_GPUPerc", labels,
                                     gpuInfos[gpu_id]["gpuUtil"]))
                result.append(Metric("container_GPUMemPerc", labels,
                                     gpuInfos[gpu_id]["gpuMemUtil"]))

            result.append(Metric("container_CPUPerc", otherLabels,
                                 container_stat["CPUPerc"]))
            result.append(Metric("container_MemUsage", otherLabels,
                                 container_stat["MemUsage_Limit"]["usage"]))
            result.append(Metric("container_MemLimit", otherLabels,
                                 container_stat["MemUsage_Limit"]["limit"]))
            result.append(Metric("container_NetIn", otherLabels,
                                 container_stat["NetIO"]["in"]))
            result.append(Metric("container_NetOut", otherLabels,
                                 container_stat["NetIO"]["out"]))
            result.append(Metric("container_BlockIn", otherLabels,
                                 container_stat["BlockIO"]["in"]))
            result.append(Metric("container_BlockOut", otherLabels,
                                 container_stat["BlockIO"]["out"]))
            result.append(Metric("container_MemPerc", otherLabels,
                                 container_stat["MemPerc"]))
        else:
            labels = {"name": pai_service_name}
            result.append(Metric("service_cpu_percent", labels,
                                 container_stat["CPUPerc"]))
            result.append(Metric("service_mem_usage_byte", labels,
                                 container_stat["MemUsage_Limit"]["usage"]))
            result.append(Metric("service_mem_limit_byte", labels,
                                 container_stat["MemUsage_Limit"]["limit"]))
            result.append(Metric("service_mem_usage_percent", labels,
                                 container_stat["MemPerc"]))
            result.append(Metric("service_net_in_byte", labels,
                                 container_stat["NetIO"]["in"]))
            result.append(Metric("service_net_out_byte", labels,
                                 container_stat["NetIO"]["out"]))
            result.append(Metric("service_block_in_byte", labels,
                                 container_stat["BlockIO"]["in"]))
            result.append(Metric("service_block_out_byte", labels,
                                 container_stat["BlockIO"]["out"]))
    return result
def genJobMetrics(logDir, gpuMetrics):
    """Write job metrics in Prometheus text format to ``<logDir>/job_exporter.prom``.

    Args:
        logDir: directory path the .prom file is written into.
        gpuMetrics: mapping of GPU minor number -> dict with ``"gpuUtil"``
            and ``"gpuMemUtil"``; may be None/empty on non-GPU nodes.
    """
    stats = docker_stats.stats()
    # guard added for consistency with collect_job_metrics: stats() can
    # return None, which would crash the loop below
    if stats is None:
        logger.warning("docker stats returns None")
        return

    # `with` guarantees the prom file is flushed and closed even on error;
    # the original leaked the file handle
    with open(logDir + "/job_exporter.prom", "w") as outputFile:
        for container in stats:
            inspectInfo = docker_inspect.inspect(container)
            # inspect may return None for a container that exited meanwhile
            # (the sibling collectors check this too)
            if inspectInfo is None or not inspectInfo["labels"]:
                continue

            gpuIds, labelStr = parseFromLabels(inspectInfo["labels"])
            envStr = parseFromEnv(inspectInfo["env"])
            labelStr = labelStr + envStr

            for gpuId in gpuIds:
                # use the module logger instead of bare print, matching the
                # rest of this file
                logger.debug("gpu id %s", gpuId)
                # guard: label may reference a GPU absent from gpuMetrics
                if not gpuMetrics or gpuMetrics.get(gpuId) is None:
                    continue
                logger.debug(gpuMetrics)
                outputFile.write(
                    'container_GPUPerc{{{0}minor_number="{1}"}} {2}\n'.format(
                        labelStr, gpuId, gpuMetrics[gpuId]["gpuUtil"]))
                outputFile.write(
                    'container_GPUMemPerc{{{0}minor_number="{1}"}} {2}\n'.format(
                        labelStr, gpuId, gpuMetrics[gpuId]["gpuMemUtil"]))

            containerStat = stats[container]
            outputFile.write('container_CPUPerc{{{0}}} {1}\n'.format(
                labelStr, containerStat["CPUPerc"]))
            outputFile.write('container_MemUsage{{{0}}} {1}\n'.format(
                labelStr, containerStat["MemUsage_Limit"]["usage"]))
            outputFile.write('container_MemLimit{{{0}}} {1}\n'.format(
                labelStr, containerStat["MemUsage_Limit"]["limit"]))
            outputFile.write('container_NetIn{{{0}}} {1}\n'.format(
                labelStr, containerStat["NetIO"]["in"]))
            outputFile.write('container_NetOut{{{0}}} {1}\n'.format(
                labelStr, containerStat["NetIO"]["out"]))
            outputFile.write('container_BlockIn{{{0}}} {1}\n'.format(
                labelStr, containerStat["BlockIO"]["in"]))
            outputFile.write('container_BlockOut{{{0}}} {1}\n'.format(
                labelStr, containerStat["BlockIO"]["out"]))
            outputFile.write('container_MemPerc{{{0}}} {1}\n'.format(
                labelStr, containerStat["MemPerc"]))
def process_one_container(self, container_id, stats, gpu_infos, all_conns,
                          gauges):
    """Emit task_*/service_* gauges for a single container.

    Args:
        container_id: docker container id to inspect.
        stats: the docker-stats entry for this container.
        gpu_infos: mapping of GPU minor number -> nvidia status object with
            ``gpu_util``/``gpu_mem_util``; may be None/empty.
        all_conns: node-wide connection list used to attribute network I/O.
        gauges: collector object accepting ``add_value(name, labels, value)``.
    """
    container_name = utils.walk_json_field_safe(stats, "name")
    pai_service_name = ContainerCollector.infer_service_name(container_name)

    inspect_info = docker_inspect.inspect(
        container_id, ContainerCollector.inspect_histogram,
        ContainerCollector.inspect_timeout, self.gpu_vendor)
    # inspect may return None when the container exited between the stats
    # call and here; without this guard the .pid access below raises
    # AttributeError (the other revisions of this method have this check)
    if inspect_info is None:
        logger.debug("ignore killed container %s", container_id)
        return

    pid = inspect_info.pid
    job_name = inspect_info.job_name

    logger.debug("%s has inspect result %s, service_name %s", container_name,
                 inspect_info, pai_service_name)

    if job_name is None and pai_service_name is None:
        logger.debug("%s is ignored", container_name)
        return  # other container, maybe kubelet or api-server

    # get network consumption, since all our services/jobs running in host
    # network, and network statistic from docker is not specific to that
    # container. We have to get network statistic by ourselves.
    lsof_result = network.lsof(pid, ContainerCollector.lsof_histogram,
                               ContainerCollector.lsof_timeout)
    net_in, net_out = network.get_container_network_metrics(
        all_conns, lsof_result)

    if logger.isEnabledFor(logging.DEBUG):
        debug_info = utils.exec_cmd(
            "ps -o cmd fp {0} | tail -n 1".format(pid), shell=True)
        logger.debug("pid %s with cmd `%s` has lsof result %s, in %d, out %d",
                     pid, debug_info.strip(), lsof_result, net_in, net_out)

    if pai_service_name is None:
        gpu_ids, container_labels = ContainerCollector.parse_from_labels(
            inspect_info, gpu_infos)

        if gpu_infos:
            for gpu_id in gpu_ids:
                nvidia_gpu_status = gpu_infos.get(gpu_id)
                if nvidia_gpu_status is None:
                    continue  # GPU in labels but not reported by the driver
                labels = copy.deepcopy(container_labels)
                labels["minor_number"] = gpu_id
                gauges.add_value("task_gpu_percent", labels,
                                 nvidia_gpu_status.gpu_util)
                gauges.add_value("task_gpu_mem_percent", labels,
                                 nvidia_gpu_status.gpu_mem_util)

        gauges.add_value("task_cpu_percent", container_labels,
                         stats["CPUPerc"])
        gauges.add_value("task_mem_usage_byte", container_labels,
                         stats["MemUsage_Limit"]["usage"])
        gauges.add_value("task_mem_limit_byte", container_labels,
                         stats["MemUsage_Limit"]["limit"])
        gauges.add_value("task_net_in_byte", container_labels, net_in)
        gauges.add_value("task_net_out_byte", container_labels, net_out)
        gauges.add_value("task_block_in_byte", container_labels,
                         stats["BlockIO"]["in"])
        gauges.add_value("task_block_out_byte", container_labels,
                         stats["BlockIO"]["out"])
        gauges.add_value("task_mem_usage_percent", container_labels,
                         stats["MemPerc"])
    else:
        labels = {"name": pai_service_name}
        gauges.add_value("service_cpu_percent", labels, stats["CPUPerc"])
        gauges.add_value("service_mem_usage_byte", labels,
                         stats["MemUsage_Limit"]["usage"])
        gauges.add_value("service_mem_limit_byte", labels,
                         stats["MemUsage_Limit"]["limit"])
        gauges.add_value("service_mem_usage_percent", labels,
                         stats["MemPerc"])
        gauges.add_value("service_net_in_byte", labels, net_in)
        gauges.add_value("service_net_out_byte", labels, net_out)
        gauges.add_value("service_block_in_byte", labels,
                         stats["BlockIO"]["in"])
        gauges.add_value("service_block_out_byte", labels,
                         stats["BlockIO"]["out"])
def collect_job_metrics(gpu_infos, all_conns, type1_zombies, type2_zombies):
    """Collect container/service metrics plus zombie-container counts.

    Args:
        gpu_infos: mapping of GPU minor number -> dict with ``"gpuUtil"``
            and ``"gpuMemUtil"``; may be None/empty on non-GPU nodes.
        all_conns: node-wide connection list used to attribute network I/O.
        type1_zombies, type2_zombies: trackers passed to
            ``generate_zombie_count``.

    Returns:
        list of ``Metric`` objects, or None when ``docker stats`` failed.
    """
    stats_obj = docker_stats.stats()
    if stats_obj is None:
        logger.warning("docker stats returns None")
        return None

    result = []
    for container_id, stats in stats_obj.items():
        pai_service_name = None
        # TODO speed this up, since this is O(n^2)
        for service_name in pai_services:
            if stats["name"].startswith(service_name):
                pai_service_name = service_name[4:]  # remove "k8s_" prefix
                break

        inspect_info = docker_inspect.inspect(container_id)
        pid = inspect_info["pid"] if inspect_info is not None else None
        inspect_labels = utils.walk_json_field_safe(inspect_info, "labels")

        if not inspect_labels and pai_service_name is None:
            continue  # other container, maybe kubelet or api-server

        # get network consumption, since all our services/jobs running in host network,
        # network statistic from docker is not specific to that container. We have to
        # get network statistic by ourselves.
        # NOTE(review): pid can be None when inspect failed for a service
        # container — presumably network.lsof tolerates that; confirm.
        lsof_result = network.lsof(pid)
        net_in, net_out = network.get_container_network_metrics(
            all_conns, lsof_result)

        if logger.isEnabledFor(logging.DEBUG):
            debug_info = utils.check_output(
                "ps -o cmd fp {0} | tail -n 1".format(pid), shell=True)
            logger.debug(
                "pid %s with cmd `%s` has lsof result %s, in %d, out %d",
                pid, debug_info, lsof_result, net_in, net_out)

        if pai_service_name is None:
            gpuIds, otherLabels = parse_from_labels(inspect_info["labels"])
            otherLabels.update(inspect_info["env"])

            for gpu_id in gpuIds:
                # guard: a minor number from labels may be absent from
                # gpu_infos — direct indexing would raise KeyError
                if not gpu_infos or gpu_infos.get(gpu_id) is None:
                    continue
                labels = copy.deepcopy(otherLabels)
                labels["minor_number"] = gpu_id
                result.append(
                    Metric("container_GPUPerc", labels,
                           gpu_infos[gpu_id]["gpuUtil"]))
                result.append(
                    Metric("container_GPUMemPerc", labels,
                           gpu_infos[gpu_id]["gpuMemUtil"]))

            result.append(
                Metric("container_CPUPerc", otherLabels, stats["CPUPerc"]))
            result.append(
                Metric("container_MemUsage", otherLabels,
                       stats["MemUsage_Limit"]["usage"]))
            result.append(
                Metric("container_MemLimit", otherLabels,
                       stats["MemUsage_Limit"]["limit"]))
            result.append(Metric("container_NetIn", otherLabels, net_in))
            result.append(Metric("container_NetOut", otherLabels, net_out))
            result.append(
                Metric("container_BlockIn", otherLabels,
                       stats["BlockIO"]["in"]))
            result.append(
                Metric("container_BlockOut", otherLabels,
                       stats["BlockIO"]["out"]))
            result.append(
                Metric("container_MemPerc", otherLabels, stats["MemPerc"]))
        else:
            labels = {"name": pai_service_name}
            result.append(
                Metric("service_cpu_percent", labels, stats["CPUPerc"]))
            result.append(
                Metric("service_mem_usage_byte", labels,
                       stats["MemUsage_Limit"]["usage"]))
            result.append(
                Metric("service_mem_limit_byte", labels,
                       stats["MemUsage_Limit"]["limit"]))
            result.append(
                Metric("service_mem_usage_percent", labels, stats["MemPerc"]))
            result.append(Metric("service_net_in_byte", labels, net_in))
            result.append(Metric("service_net_out_byte", labels, net_out))
            result.append(
                Metric("service_block_in_byte", labels,
                       stats["BlockIO"]["in"]))
            result.append(
                Metric("service_block_out_byte", labels,
                       stats["BlockIO"]["out"]))

    result.extend(
        generate_zombie_count(stats_obj, type1_zombies, type2_zombies))
    return result
def process_one_container(self, container_id, stats, gpu_infos, all_conns,
                          gauges, npu_infos, dcgm_infos):
    """Emit device (GPU/NPU/DCGM) and task/service gauges for one container.

    Args:
        container_id: docker container id to inspect.
        stats: the docker-stats entry for this container.
        gpu_infos: mapping of GPU minor number -> nvidia status object.
        all_conns: node-wide connection list (unused on host-network path).
        gauges: collector accepting ``add_value``/``add_dcgm_metric``.
        npu_infos: mapping of NPU id -> npu status object, or None.
        dcgm_infos: mapping of GPU minor number -> DCGMMetrics, or None.
    """
    container_name = utils.walk_json_field_safe(stats, "name")
    pai_service_name = ContainerCollector.infer_service_name(container_name)

    inspect_info = docker_inspect.inspect(
        container_id, ContainerCollector.inspect_histogram,
        ContainerCollector.inspect_timeout)
    if inspect_info is None:
        return  # container exited between stats and inspect

    pid = inspect_info.pid
    job_name = inspect_info.job_name

    logger.debug("%s has inspect result %s, service_name %s", container_name,
                 inspect_info, pai_service_name)

    if job_name is None and pai_service_name is None:
        logger.debug("%s is ignored", container_name)
        return  # other container, maybe kubelet or api-server

    # get network consumption, if container is host network, we will treat
    # node network consumption as container consumption. If not, use data
    # from docker state.
    # This will result network consumption of service using host network
    # equals to node network consumption.
    is_host_network = inspect_info.is_host_network
    if is_host_network:
        net_in, net_out = network.get_network_consumption(
            self.network_interface)
    else:
        net_in, net_out = network.get_non_host_network_consumption(pid)

    if pai_service_name is None:
        gpu_ids, npu_ids, container_labels = \
            ContainerCollector.parse_from_labels(inspect_info, gpu_infos)
        logger.info("start to collect metric for jobId: %s",
                    container_labels["job_name"])
        if container_labels["username"] == "unknown":
            # logger.warn is deprecated; use warning() with lazy args
            logger.warning("jobId: %s has none username,pass!",
                           container_labels["job_name"])
            return

        if gpu_infos:
            for gpu_id in gpu_ids:
                if gpu_infos.get(gpu_id) is None:
                    continue
                nvidia_gpu_status = gpu_infos[gpu_id]
                uuid = nvidia_gpu_status.uuid
                labels = copy.deepcopy(container_labels)
                labels["minor_number"] = gpu_id
                labels["device_type"] = inspect_info.gpu_type or "unknown"
                labels["uuid"] = uuid
                labels["device_str"] = "nvidia.com/gpu"
                gauges.add_value("task_device_percent", labels,
                                 nvidia_gpu_status.gpu_util)
                gauges.add_value("task_device_mem_percent", labels,
                                 nvidia_gpu_status.gpu_mem_util)

        if npu_infos:
            for npu_id in npu_ids:
                if npu_infos.get(npu_id) is None:
                    continue
                npu_status = npu_infos[npu_id]
                labels = copy.deepcopy(container_labels)
                labels["minor_number"] = npu_id
                labels["device_type"] = inspect_info.gpu_type or "unknown"
                labels["device_str"] = "npu.huawei.com/NPU"
                # each npu device should have one unique string
                labels["uuid"] = npu_id
                if inspect_info.node_name:
                    labels["uuid"] = inspect_info.node_name + "_" + str(npu_id)
                gauges.add_value("task_device_percent", labels,
                                 npu_status.npu_util)
                gauges.add_value("task_device_mem_percent", labels,
                                 npu_status.npu_mem_util)

        if dcgm_infos:
            for gpu_id in gpu_ids:
                if dcgm_infos.get(gpu_id) is None:
                    continue
                dcgm_metric = dcgm_infos[gpu_id]  # will be type of DCGMMetrics
                uuid = dcgm_metric.uuid
                labels = copy.deepcopy(container_labels)
                labels["minor_number"] = gpu_id
                labels["uuid"] = uuid
                gauges.add_dcgm_metric(dcgm_metric, labels)

        gauges.add_value("task_cpu_percent", container_labels,
                         stats["CPUPerc"])
        gauges.add_value("task_mem_usage_byte", container_labels,
                         stats["MemUsage_Limit"]["usage"])
        gauges.add_value("task_mem_limit_byte", container_labels,
                         stats["MemUsage_Limit"]["limit"])
        gauges.add_value("task_net_in_byte", container_labels, net_in)
        gauges.add_value("task_net_out_byte", container_labels, net_out)
        gauges.add_value("task_block_in_byte", container_labels,
                         stats["BlockIO"]["in"])
        gauges.add_value("task_block_out_byte", container_labels,
                         stats["BlockIO"]["out"])
        gauges.add_value("task_mem_usage_percent", container_labels,
                         stats["MemPerc"])
    else:
        labels = {"name": pai_service_name}
        gauges.add_value("service_cpu_percent", labels, stats["CPUPerc"])
        gauges.add_value("service_mem_usage_byte", labels,
                         stats["MemUsage_Limit"]["usage"])
        gauges.add_value("service_mem_limit_byte", labels,
                         stats["MemUsage_Limit"]["limit"])
        gauges.add_value("service_mem_usage_percent", labels,
                         stats["MemPerc"])
        gauges.add_value("service_net_in_byte", labels, net_in)
        gauges.add_value("service_net_out_byte", labels, net_out)
        gauges.add_value("service_block_in_byte", labels,
                         stats["BlockIO"]["in"])
        gauges.add_value("service_block_out_byte", labels,
                         stats["BlockIO"]["out"])
def process_one_container(self, container_id, stats, gpu_infos, all_conns,
                          gauges, dcgm_infos, infiniband_infos, ipoib_infos):
    """Collect all gauges for one container: GPU/DCGM device metrics,
    infiniband/ipoib counters (host-network jobs only), and the cgroup
    CPU/memory/network/block statistics from docker stats.
    """
    name = utils.walk_json_field_safe(stats, "name")
    service = ContainerCollector.infer_service_name(name)

    info = docker_inspect.inspect(container_id,
                                  ContainerCollector.inspect_histogram,
                                  ContainerCollector.inspect_timeout)
    if info is None:
        logger.debug("ignore killed container %s", container_id)
        return

    logger.debug("%s has inspect result %s, service_name %s", name, info,
                 service)

    if info.job_name is None and service is None:
        # other container, maybe kubelet or api-server
        logger.debug("%s is ignored", name)
        return

    # get network consumption, if container is host network, we will treat
    # node network consumption as container consumption. If not, use data
    # from docker state.
    # This will result network consumption of service using host network
    # equals to node network consumption.
    if info.is_host_network:
        net_in, net_out = network.get_network_consumption(
            self.network_interface)
    else:
        net_in, net_out = network.get_non_host_network_consumption(info.pid)

    if service is not None:
        # PAI service container: emit the service_* gauge family and stop.
        svc_labels = {"name": service}
        for gauge_name, value in (
                ("service_cpu_percent", stats["CPUPerc"]),
                ("service_mem_usage_byte", stats["MemUsage_Limit"]["usage"]),
                ("service_mem_limit_byte", stats["MemUsage_Limit"]["limit"]),
                ("service_mem_usage_percent", stats["MemPerc"]),
                ("service_net_in_byte", net_in),
                ("service_net_out_byte", net_out),
                ("service_block_in_byte", stats["BlockIO"]["in"]),
                ("service_block_out_byte", stats["BlockIO"]["out"])):
            gauges.add_value(gauge_name, svc_labels, value)
        return

    # Job container path.
    gpu_ids, task_labels = ContainerCollector.parse_from_labels(
        info, gpu_infos)

    def device_labels(minor, uuid):
        # fresh copy per device so each gauge carries its own minor/uuid
        labels = copy.deepcopy(task_labels)
        labels["minor_number"] = minor
        labels["uuid"] = uuid
        return labels

    if gpu_infos:
        for minor in gpu_ids:
            status = gpu_infos.get(minor)
            if status is None:
                continue
            labels = device_labels(minor, status.uuid)
            gauges.add_value("task_gpu_percent", labels, status.gpu_util)
            gauges.add_value("task_gpu_mem_percent", labels,
                             status.gpu_mem_util)

    if dcgm_infos:
        for minor in gpu_ids:
            dcgm_metric = dcgm_infos.get(minor)  # DCGMMetrics instance
            if dcgm_metric is None:
                continue
            gauges.add_dcgm_metric(dcgm_metric,
                                   device_labels(minor, dcgm_metric.uuid))

    if info.is_host_network:
        # RDMA counters are node-wide, so only attribute them to
        # host-network jobs.
        if infiniband_infos:
            for ib in infiniband_infos:
                labels = copy.deepcopy(task_labels)
                labels.update(ib.labels)
                gauges.add_value("task_infiniband_receive_bytes_total",
                                 labels, ib.receive_bytes)
                gauges.add_value("task_infiniband_transmit_bytes_total",
                                 labels, ib.transmit_bytes)
        if ipoib_infos:
            for ipoib in ipoib_infos:
                labels = copy.deepcopy(task_labels)
                labels.update(ipoib.labels)
                gauges.add_value("task_ipoib_receive_bytes_total", labels,
                                 ipoib.receive_bytes)
                gauges.add_value("task_ipoib_transmit_bytes_total", labels,
                                 ipoib.transmit_bytes)

    for gauge_name, value in (
            ("task_cpu_percent", stats["CPUPerc"]),
            ("task_mem_usage_byte", stats["MemUsage_Limit"]["usage"]),
            ("task_mem_limit_byte", stats["MemUsage_Limit"]["limit"]),
            ("task_net_in_byte", net_in),
            ("task_net_out_byte", net_out),
            ("task_block_in_byte", stats["BlockIO"]["in"]),
            ("task_block_out_byte", stats["BlockIO"]["out"]),
            ("task_mem_usage_percent", stats["MemPerc"])):
        gauges.add_value(gauge_name, task_labels, value)