Example #1
def parse_docker_inspect(inspect_output):
    obj = json.loads(inspect_output)
    labels = {}
    envs = {}

    obj_labels = utils.walk_json_field_safe(obj, 0, "Config", "Labels")
    if obj_labels is not None:
        for key in obj_labels:
            if key in targetLabel:
                labelKey = "container_label_{0}".format(key.replace(".", "_"))
                labelVal = obj_labels[key]
                labels[labelKey] = labelVal

    obj_env = utils.walk_json_field_safe(obj, 0, "Config", "Env")
    if obj_env:
        for env in obj_env:
            envItem = env.split("=", 1)  # split on the first "=" only, so values containing "=" stay intact
            if envItem[0] in targetEnv:
                envKey = "container_env_{0}".format(envItem[0].replace(
                    ".", "_"))
                envVal = envItem[1]
                envs[envKey] = envVal

    pid = utils.walk_json_field_safe(obj, 0, "State", "Pid")

    return {"env": envs, "labels": labels, "pid": pid}
Example #2
def parse_docker_inspect(inspect_output):
    obj = json.loads(inspect_output)

    m = {}

    obj_labels = utils.walk_json_field_safe(obj, 0, "Config", "Labels")
    if obj_labels is not None:
        for k, v in obj_labels.items():
            if k in keys:
                m[k] = v

    obj_env = utils.walk_json_field_safe(obj, 0, "Config", "Env")
    if obj_env:
        for env in obj_env:
            k, v = env.split("=", 1)
            if k in keys:
                m[k] = v

            # for kube-launcher tasks
            if k == "FC_TASK_INDEX":
                m["PAI_TASK_INDEX"] = v
            elif k == "NVIDIA_VISIBLE_DEVICES" and v != "all" and v != "void":
                m["GPU_ID"] = v

            if k == "FC_FRAMEWORK_ATTEMPT_INSTANCE_UID" or k == "APP_ID":
                m["JOB_INSTANCE_ID"] = v

    pid = utils.walk_json_field_safe(obj, 0, "State", "Pid")

    return InspectResult(
        m.get("PAI_USER_NAME") or m.get("DLWS_USER_NAME"),
        m.get("PAI_JOB_NAME") or m.get("DLWS_JOB_ID"),
        m.get("PAI_CURRENT_TASK_ROLE_NAME"), m.get("PAI_TASK_INDEX"),
        m.get("GPU_ID"), m.get("JOB_INSTANCE_ID"), pid)
Example #3
def parse_docker_inspect(inspect_output):
    obj = json.loads(inspect_output)
    labels = {}
    envs = {}

    obj_labels = utils.walk_json_field_safe(obj, 0, "Config", "Labels")
    if obj_labels is not None:
        for key in obj_labels:
            if key in target_label:
                label_key = "container_label_{0}".format(key)
                label_val = obj_labels[key]
                labels[label_key] = label_val

    obj_env = utils.walk_json_field_safe(obj, 0, "Config", "Env")
    if obj_env:
        for env in obj_env:
            k, v = env.split("=", 1)
            if k in target_env:
                key = "container_env_{0}".format(k)
                envs[key] = v

            # for kube-launcher tasks
            if k in target_label:
                label_key = "container_label_{0}".format(k)
                labels[label_key] = v
            if k == "FC_TASK_INDEX":
                envs["container_env_PAI_TASK_INDEX"] = v

    pid = utils.walk_json_field_safe(obj, 0, "State", "Pid")

    return {"env": envs, "labels": labels, "pid": pid}
Example #4
File: test_utils.py  Project: zmoon111/pai
    def test_walk_json_field_safe(self):
        self.assertIsNone(utils.walk_json_field_safe(None, 1, "abc"))
        self.assertIsNone(utils.walk_json_field_safe([], 1, "abc"))
        self.assertIsNone(utils.walk_json_field_safe([{"abc"}], 1, "abc"))
        self.assertEqual(
            "345",
            utils.walk_json_field_safe([{
                "name": "123"
            }, {
                "name": "345"
            }], 1, "name"))
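
The helper under test is not shown on this page. A minimal sketch that satisfies the assertions above (each trailing argument is either a list index or a dict key, and any miss yields None) could look like this; it is an assumption, not necessarily the real utils.walk_json_field_safe.

def walk_json_field_safe(obj, *fields):
    # Sketch only: walk nested lists/dicts by index or key, returning None
    # instead of raising when any step fails.
    try:
        for field in fields:
            obj = obj[field]
        return obj
    except (KeyError, IndexError, TypeError):
        return None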
Example #5
def parse_docker_inspect(inspect_output):
    obj = json.loads(inspect_output)

    m = {}

    obj_labels = utils.walk_json_field_safe(obj, 0, "Config", "Labels")
    if obj_labels is not None:
        for k, v in obj_labels.items():
            if k in keys:
                m[k] = v

    obj_env = utils.walk_json_field_safe(obj, 0, "Config", "Env")
    if obj_env:
        for env in obj_env:
            k, v = env.split("=", 1)
            if k in keys:
                m[k] = v

            # for kube-launcher tasks
            if k == "FC_TASK_INDEX":
                m["PAI_TASK_INDEX"] = v
            elif k == "NVIDIA_VISIBLE_DEVICES" and v != "all" and v != "void":
                m["GPU_ID"] = v

    pid = utils.walk_json_field_safe(obj, 0, "State", "Pid")
    logger.info("m is %s", m)

    return InspectResult(
        select_value_with_key(
            m, ["PAI_USER_NAME", "DLWS_USER_NAME", "DLTS_USER_NAME"]),
        select_value_with_key(m,
                              ["PAI_JOB_NAME", "DLWS_JOB_ID", "DLTS_JOB_ID"]),
        select_value_with_key(
            m,
            ["PAI_CURRENT_TASK_ROLE_NAME", "DLWS_ROLE_NAME", "DLTS_ROLE_NAME"
             ]),
        select_value_with_key(m, [
            "PAI_TASK_INDEX", "DLWS_ROLE_IDX", "DLTS_ROLE_IDX", "FC_TASK_INDEX"
        ]),
        select_value_with_key(m, ["POD_NAME", "PAI_JOB_NAME"]),
        m.get("GPU_ID"),
        pid,
        select_value_with_key(m, ["DLWS_USER_EMAIL", "DLTS_USER_EMAIL"]),
        select_value_with_key(m, ["DLWS_VC_NAME", "DLTS_VC_NAME"]),
        m.get("DLWS_HOST_NETWORK") == "enable"
        or m.get("DLTS_HOST_NETWORK") == "enable",
    )
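
select_value_with_key is not defined on this page. From the call sites above it appears to return the value of the first candidate key present in the map, so a sketch under that assumption might be:

def select_value_with_key(m, candidate_keys):
    # Assumed behavior: the first matching key wins; None if nothing matches.
    for key in candidate_keys:
        if key in m:
            return m[key]
    return None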
Example #6
    def collect_container_metrics(self, stats_obj, gpu_infos, all_conns):
        if stats_obj is None:
            logger.warning("docker stats returns None")
            return None

        gauges = ResourceGauges()

        for container_id, stats in stats_obj.items():
            try:
                self.process_one_container(container_id, stats, gpu_infos, all_conns, gauges)
            except Exception:
                logger.exception("error when trying to process container %s with name %s",
                        container_id, utils.walk_json_field_safe(stats, "name"))

        return gauges.as_array()
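
ResourceGauges is likewise not shown here. A minimal stand-in consistent with how it is used on this page (add_value plus as_array; the real class in the pai project is richer and, in later examples, also exposes add_dcgm_metric) could be:

class ResourceGauges:
    # Stand-in sketch only: accumulate (metric_name, labels, value) samples.
    def __init__(self):
        self._samples = []

    def add_value(self, metric_name, labels, value):
        self._samples.append((metric_name, dict(labels), value))

    def as_array(self):
        return list(self._samples)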
Example #7
    def process_one_container(self, container_id, stats, gpu_infos, all_conns,
                              gauges):
        container_name = utils.walk_json_field_safe(stats, "name")
        pai_service_name = ContainerCollector.infer_service_name(
            container_name)

        inspect_info = docker_inspect.inspect(
            container_id, ContainerCollector.inspect_histogram,
            ContainerCollector.inspect_timeout, self.gpu_vendor)

        pid = inspect_info.pid
        job_name = inspect_info.job_name

        logger.debug("%s has inspect result %s, service_name %s",
                     container_name, inspect_info, pai_service_name)

        if job_name is None and pai_service_name is None:
            logger.debug("%s is ignored", container_name)
            return  # other container, maybe kubelet or api-server

        # get network consumption, since all our services/jobs running in host
        # network, and network statistic from docker is not specific to that
        # container. We have to get network statistic by ourselves.
        lsof_result = network.lsof(pid, ContainerCollector.lsof_histogram,
                                   ContainerCollector.lsof_timeout)

        net_in, net_out = network.get_container_network_metrics(
            all_conns, lsof_result)
        if logger.isEnabledFor(logging.DEBUG):
            debug_info = utils.exec_cmd(
                "ps -o cmd fp {0} | tail -n 1".format(pid), shell=True)

            logger.debug(
                "pid %s with cmd `%s` has lsof result %s, in %d, out %d", pid,
                debug_info.strip(), lsof_result, net_in, net_out)

        if pai_service_name is None:
            gpu_ids, container_labels = ContainerCollector.parse_from_labels(
                inspect_info, gpu_infos)

            if gpu_infos:
                for id in gpu_ids:
                    if gpu_infos.get(id) is None:
                        continue

                    nvidia_gpu_status = gpu_infos[id]
                    labels = copy.deepcopy(container_labels)
                    labels["minor_number"] = id

                    gauges.add_value("task_gpu_percent", labels,
                                     nvidia_gpu_status.gpu_util)
                    gauges.add_value("task_gpu_mem_percent", labels,
                                     nvidia_gpu_status.gpu_mem_util)

            gauges.add_value("task_cpu_percent", container_labels,
                             stats["CPUPerc"])
            gauges.add_value("task_mem_usage_byte", container_labels,
                             stats["MemUsage_Limit"]["usage"])
            gauges.add_value("task_mem_limit_byte", container_labels,
                             stats["MemUsage_Limit"]["limit"])
            gauges.add_value("task_net_in_byte", container_labels, net_in)
            gauges.add_value("task_net_out_byte", container_labels, net_out)
            gauges.add_value("task_block_in_byte", container_labels,
                             stats["BlockIO"]["in"])
            gauges.add_value("task_block_out_byte", container_labels,
                             stats["BlockIO"]["out"])
            gauges.add_value("task_mem_usage_percent", container_labels,
                             stats["MemPerc"])
        else:
            labels = {"name": pai_service_name}
            gauges.add_value("service_cpu_percent", labels, stats["CPUPerc"])
            gauges.add_value("service_mem_usage_byte", labels,
                             stats["MemUsage_Limit"]["usage"])
            gauges.add_value("service_mem_limit_byte", labels,
                             stats["MemUsage_Limit"]["limit"])
            gauges.add_value("service_mem_usage_percent", labels,
                             stats["MemPerc"])
            gauges.add_value("service_net_in_byte", labels, net_in)
            gauges.add_value("service_net_out_byte", labels, net_out)
            gauges.add_value("service_block_in_byte", labels,
                             stats["BlockIO"]["in"])
            gauges.add_value("service_block_out_byte", labels,
                             stats["BlockIO"]["out"])
Example #8
def collect_job_metrics(gpu_infos, all_conns, type1_zombies, type2_zombies):
    stats_obj = docker_stats.stats()
    if stats_obj is None:
        logger.warning("docker stats returns None")
        return None

    result = []
    for container_id, stats in stats_obj.items():
        pai_service_name = None

        # TODO speed this up, since this is O(n^2)
        for service_name in pai_services:
            if stats["name"].startswith(service_name):
                pai_service_name = service_name[4:]  # remove "k8s_" prefix
                break

        inspect_info = docker_inspect.inspect(container_id)
        pid = inspect_info["pid"] if inspect_info is not None else None
        inspect_labels = utils.walk_json_field_safe(inspect_info, "labels")

        if not inspect_labels and pai_service_name is None:
            continue  # other container, maybe kubelet or api-server

        # get network consumption, since all our services/jobs running in host network,
        # network statistic from docker is not specific to that container. We have to
        # get network statistic by ourselves.
        lsof_result = network.lsof(pid)
        net_in, net_out = network.get_container_network_metrics(
            all_conns, lsof_result)
        if logger.isEnabledFor(logging.DEBUG):
            debug_info = utils.check_output(
                "ps -o cmd fp {0} | tail -n 1".format(pid), shell=True)

            logger.debug(
                "pid %s with cmd `%s` has lsof result %s, in %d, out %d", pid,
                debug_info, lsof_result, net_in, net_out)

        if pai_service_name is None:
            gpuIds, otherLabels = parse_from_labels(inspect_info["labels"])
            otherLabels.update(inspect_info["env"])

            for id in gpuIds:
                if gpu_infos:
                    labels = copy.deepcopy(otherLabels)
                    labels["minor_number"] = id

                    result.append(
                        Metric("container_GPUPerc", labels,
                               gpu_infos[id]["gpuUtil"]))
                    result.append(
                        Metric("container_GPUMemPerc", labels,
                               gpu_infos[id]["gpuMemUtil"]))

            result.append(
                Metric("container_CPUPerc", otherLabels, stats["CPUPerc"]))
            result.append(
                Metric("container_MemUsage", otherLabels,
                       stats["MemUsage_Limit"]["usage"]))
            result.append(
                Metric("container_MemLimit", otherLabels,
                       stats["MemUsage_Limit"]["limit"]))
            result.append(Metric("container_NetIn", otherLabels, net_in))
            result.append(Metric("container_NetOut", otherLabels, net_out))
            result.append(
                Metric("container_BlockIn", otherLabels,
                       stats["BlockIO"]["in"]))
            result.append(
                Metric("container_BlockOut", otherLabels,
                       stats["BlockIO"]["out"]))
            result.append(
                Metric("container_MemPerc", otherLabels, stats["MemPerc"]))
        else:
            labels = {"name": pai_service_name}
            result.append(
                Metric("service_cpu_percent", labels, stats["CPUPerc"]))
            result.append(
                Metric("service_mem_usage_byte", labels,
                       stats["MemUsage_Limit"]["usage"]))
            result.append(
                Metric("service_mem_limit_byte", labels,
                       stats["MemUsage_Limit"]["limit"]))
            result.append(
                Metric("service_mem_usage_percent", labels, stats["MemPerc"]))
            result.append(Metric("service_net_in_byte", labels, net_in))
            result.append(Metric("service_net_out_byte", labels, net_out))
            result.append(
                Metric("service_block_in_byte", labels,
                       stats["BlockIO"]["in"]))
            result.append(
                Metric("service_block_out_byte", labels,
                       stats["BlockIO"]["out"]))

    result.extend(
        generate_zombie_count(stats_obj, type1_zombies, type2_zombies))

    return result
Example #9
    def process_one_container(self, container_id, stats, gpu_infos, all_conns,
                              gauges, npu_infos, dcgm_infos):
        container_name = utils.walk_json_field_safe(stats, "name")
        pai_service_name = ContainerCollector.infer_service_name(container_name)

        inspect_info = docker_inspect.inspect(container_id,
                ContainerCollector.inspect_histogram,
                ContainerCollector.inspect_timeout)

        if inspect_info is None:
            return

        pid = inspect_info.pid
        job_name = inspect_info.job_name

        logger.debug("%s has inspect result %s, service_name %s",
                container_name, inspect_info, pai_service_name)

        if job_name is None and pai_service_name is None:
            logger.debug("%s is ignored", container_name)
            return # other container, maybe kubelet or api-server

        # get network consumption, if container is host network, we will treat
        # node network consumption as container consumption. If not, use data
        # from docker state.
        # This will result network consumption of service using host network
        # equals to node network consumption.
        is_host_network = inspect_info.is_host_network
        if is_host_network:
            net_in, net_out = network.get_network_consumption(
                self.network_interface)
        else:
            net_in, net_out = network.get_non_host_network_consumption(pid)

        if pai_service_name is None:
            gpu_ids, npu_ids, container_labels = ContainerCollector.parse_from_labels(
                inspect_info, gpu_infos)
            logger.info("start to collect metrics for jobId: %s",
                        container_labels["job_name"])
            if container_labels["username"] == "unknown":
                logger.warning("jobId: %s has no username, skipping",
                               container_labels["job_name"])
                return
            if gpu_infos:
                for id in gpu_ids:
                    if gpu_infos.get(id) is None:
                        continue

                    nvidia_gpu_status = gpu_infos[id]
                    uuid = nvidia_gpu_status.uuid
                    labels = copy.deepcopy(container_labels)
                    labels["minor_number"] = id
                    labels["device_type"] = inspect_info.gpu_type or "unknown"
                    labels["uuid"] = uuid
                    labels["device_str"] = "nvidia.com/gpu"

                    gauges.add_value("task_device_percent",
                            labels, nvidia_gpu_status.gpu_util)
                    gauges.add_value("task_device_mem_percent",
                            labels, nvidia_gpu_status.gpu_mem_util)

            if npu_infos:
                for id in npu_ids:
                    if npu_infos.get(id) is None:
                        continue

                    npu_status = npu_infos[id]
                    labels = copy.deepcopy(container_labels)
                    labels["minor_number"] = id
                    labels["device_type"] = inspect_info.gpu_type or "unknown"
                    labels["device_str"] = "npu.huawei.com/NPU"
                    # each NPU device should have a unique uuid string
                    labels["uuid"] = id
                    if inspect_info.node_name:
                        labels["uuid"] = inspect_info.node_name + "_" + str(id)

                    gauges.add_value("task_device_percent",
                            labels, npu_status.npu_util)
                    gauges.add_value("task_device_mem_percent",
                            labels, npu_status.npu_mem_util)
            if dcgm_infos:
                for id in gpu_ids:
                    if dcgm_infos.get(id) is None:
                        continue
                    dcgm_metric = dcgm_infos[id]  # an instance of DCGMMetrics
                    uuid = dcgm_metric.uuid
                    labels = copy.deepcopy(container_labels)
                    labels["minor_number"] = id
                    labels["uuid"] = uuid
                    gauges.add_dcgm_metric(dcgm_metric, labels)

            gauges.add_value("task_cpu_percent", container_labels, stats["CPUPerc"])
            gauges.add_value("task_mem_usage_byte", container_labels, stats["MemUsage_Limit"]["usage"])
            gauges.add_value("task_mem_limit_byte", container_labels, stats["MemUsage_Limit"]["limit"])
            gauges.add_value("task_net_in_byte", container_labels, net_in)
            gauges.add_value("task_net_out_byte", container_labels, net_out)
            gauges.add_value("task_block_in_byte", container_labels, stats["BlockIO"]["in"])
            gauges.add_value("task_block_out_byte", container_labels, stats["BlockIO"]["out"])
            gauges.add_value("task_mem_usage_percent", container_labels, stats["MemPerc"])
        else:
            labels = {"name": pai_service_name}
            gauges.add_value("service_cpu_percent", labels, stats["CPUPerc"])
            gauges.add_value("service_mem_usage_byte", labels, stats["MemUsage_Limit"]["usage"])
            gauges.add_value("service_mem_limit_byte", labels, stats["MemUsage_Limit"]["limit"])
            gauges.add_value("service_mem_usage_percent", labels, stats["MemPerc"])
            gauges.add_value("service_net_in_byte", labels, net_in)
            gauges.add_value("service_net_out_byte", labels, net_out)
            gauges.add_value("service_block_in_byte", labels, stats["BlockIO"]["in"])
            gauges.add_value("service_block_out_byte", labels, stats["BlockIO"]["out"])
Example #10
    def process_one_container(self, container_id, stats, gpu_infos, all_conns,
                              gauges, dcgm_infos, infiniband_infos,
                              ipoib_infos):
        container_name = utils.walk_json_field_safe(stats, "name")
        pai_service_name = ContainerCollector.infer_service_name(
            container_name)

        inspect_info = docker_inspect.inspect(
            container_id, ContainerCollector.inspect_histogram,
            ContainerCollector.inspect_timeout)

        if inspect_info is None:
            logger.debug("ignore killed container %s", container_id)
            return

        pid = inspect_info.pid
        job_name = inspect_info.job_name

        logger.debug("%s has inspect result %s, service_name %s",
                     container_name, inspect_info, pai_service_name)

        if job_name is None and pai_service_name is None:
            logger.debug("%s is ignored", container_name)
            return  # other container, maybe kubelet or api-server

        # get network consumption, if container is host network, we will treat
        # node network consumption as container consumption. If not, use data
        # from docker state.
        # This will result network consumption of service using host network
        # equals to node network consumption.
        is_host_network = inspect_info.is_host_network
        if is_host_network:
            net_in, net_out = network.get_network_consumption(
                self.network_interface)
        else:
            net_in, net_out = network.get_non_host_network_consumption(pid)

        if pai_service_name is None:
            gpu_ids, container_labels = ContainerCollector.parse_from_labels(
                inspect_info, gpu_infos)

            if gpu_infos:
                for id in gpu_ids:
                    if gpu_infos.get(id) is None:
                        continue

                    nvidia_gpu_status = gpu_infos[id]
                    uuid = nvidia_gpu_status.uuid
                    labels = copy.deepcopy(container_labels)
                    labels["minor_number"] = id
                    labels["uuid"] = uuid

                    gauges.add_value("task_gpu_percent", labels,
                                     nvidia_gpu_status.gpu_util)
                    gauges.add_value("task_gpu_mem_percent", labels,
                                     nvidia_gpu_status.gpu_mem_util)

            if dcgm_infos:
                for id in gpu_ids:
                    if dcgm_infos.get(id) is None:
                        continue
                    dcgm_metric = dcgm_infos[id]  # an instance of DCGMMetrics
                    uuid = dcgm_metric.uuid
                    labels = copy.deepcopy(container_labels)
                    labels["minor_number"] = id
                    labels["uuid"] = uuid
                    gauges.add_dcgm_metric(dcgm_metric, labels)

            if is_host_network:
                if infiniband_infos:
                    for infiniband_info in infiniband_infos:
                        labels = copy.deepcopy(container_labels)
                        labels.update(infiniband_info.labels)
                        gauges.add_value("task_infiniband_receive_bytes_total",
                                         labels, infiniband_info.receive_bytes)
                        gauges.add_value(
                            "task_infiniband_transmit_bytes_total", labels,
                            infiniband_info.transmit_bytes)
                if ipoib_infos:
                    for ipoib_info in ipoib_infos:
                        labels = copy.deepcopy(container_labels)
                        labels.update(ipoib_info.labels)
                        gauges.add_value("task_ipoib_receive_bytes_total",
                                         labels, ipoib_info.receive_bytes)
                        gauges.add_value("task_ipoib_transmit_bytes_total",
                                         labels, ipoib_info.transmit_bytes)

            gauges.add_value("task_cpu_percent", container_labels,
                             stats["CPUPerc"])
            gauges.add_value("task_mem_usage_byte", container_labels,
                             stats["MemUsage_Limit"]["usage"])
            gauges.add_value("task_mem_limit_byte", container_labels,
                             stats["MemUsage_Limit"]["limit"])
            gauges.add_value("task_net_in_byte", container_labels, net_in)
            gauges.add_value("task_net_out_byte", container_labels, net_out)
            gauges.add_value("task_block_in_byte", container_labels,
                             stats["BlockIO"]["in"])
            gauges.add_value("task_block_out_byte", container_labels,
                             stats["BlockIO"]["out"])
            gauges.add_value("task_mem_usage_percent", container_labels,
                             stats["MemPerc"])
        else:
            labels = {"name": pai_service_name}
            gauges.add_value("service_cpu_percent", labels, stats["CPUPerc"])
            gauges.add_value("service_mem_usage_byte", labels,
                             stats["MemUsage_Limit"]["usage"])
            gauges.add_value("service_mem_limit_byte", labels,
                             stats["MemUsage_Limit"]["limit"])
            gauges.add_value("service_mem_usage_percent", labels,
                             stats["MemPerc"])
            gauges.add_value("service_net_in_byte", labels, net_in)
            gauges.add_value("service_net_out_byte", labels, net_out)
            gauges.add_value("service_block_in_byte", labels,
                             stats["BlockIO"]["in"])
            gauges.add_value("service_block_out_byte", labels,
                             stats["BlockIO"]["out"])