def test_parse_smi_xml_result(self):
        """Parse the sample nvidia-smi XML and compare it with hand-built statuses.

        The expected mapping lists every GPU status twice: once under its
        minor number and once under its UUID.
        """
        with open("data/nvidia_smi_sample.xml", "r") as xml_file:
            raw_xml = xml_file.read()
        parsed = nvidia.parse_smi_xml_result(raw_xml)

        gpu0_uuid = "GPU-e511a7b2-f9d5-ba47-9b98-853732ca6c1b"
        gpu1_uuid = "GPU-28daffaf-8abe-aaf8-c298-4bd13aecb5e6"

        # GPU 0 is the only one expected to carry non-default ECC counters.
        gpu0_ecc = nvidia.EccError(volatile_single=1,
                                   volatile_double=2,
                                   aggregated_single=3,
                                   aggregated_double=4)
        gpu0 = nvidia.NvidiaGpuStatus(100, 25, [1357, 2384, 3093], gpu0_ecc,
                                      "0", gpu0_uuid, 60.0)
        gpu1 = nvidia.NvidiaGpuStatus(98, 50, [3093], nvidia.EccError(), "1",
                                      gpu1_uuid, 59.0)

        expected = {
            "1": gpu1,
            "0": gpu0,
            gpu0_uuid: gpu0,
            gpu1_uuid: gpu1,
        }
        self.assertEqual(expected, parsed)
예제 #2
0
 def test_parse_smi_xml_result(self):
     """Parse the sample nvidia-smi XML and compare it with hand-built statuses.

     The expected mapping is keyed by GPU minor number only.
     """
     with open("data/nvidia_smi_sample.xml", "r") as xml_file:
         raw_xml = xml_file.read()
     parsed = nvidia.parse_smi_xml_result(raw_xml)

     gpu1 = nvidia.NvidiaGpuStatus(98, 50, [3093], nvidia.EccError())
     gpu0 = nvidia.NvidiaGpuStatus(100, 25, [1357, 2384, 3093],
                                   nvidia.EccError())
     expected = {"1": gpu1, "0": gpu0}

     self.assertEqual(expected, parsed)
예제 #3
0
    def test_convert_to_metrics_with_real_id_BUGFIX(self):
        """Zombie detection must work when the pid resolves to a full-length id.

        ``zombie_info`` holds the short id ``ce5de12d6275`` while pid 22 maps
        to a long container id that starts with those characters.  Exactly one
        zombie-container sample is expected, labelled with the short id and
        minor number ``"0"``.
        """
        short_cid = "ce5de12d6275"
        cid_by_pid = {
            22:
            "ce5de12d6275dc05c9ec5b7f58484f075f4775d8f54f6a4be3dc1439344df356"
        }
        status = nvidia.NvidiaGpuStatus(20, 21, [22], nvidia.EccError(), "0",
                                        "GPU-uuid0", 50.0)

        metrics = GpuCollector.convert_to_metrics(
            nvidia.construct_gpu_info([status]), {short_cid},
            self.make_pid_to_cid_fn(cid_by_pid), 20 * 1024)

        # Full unpacking is kept so that a change in the number of returned
        # metric families still fails here.
        _, _, _, _, _, zombie_container, _, _ = metrics

        self.assertEqual(1, len(zombie_container.samples))
        labels = zombie_container.samples[0].labels
        self.assertEqual("0", labels["minor_number"])
        self.assertEqual("ce5de12d6275", labels["container_id"])
예제 #4
0
    def test_convert_to_metrics_with_no_zombie_info_BUGFIX(self):
        """External-process metrics must be emitted without any zombie info.

        The conversion is run with an empty ``zombie_info`` list and then with
        ``None``.  Both runs expect no zombie-container samples and a single
        external-process sample for pid 44 on minor number ``"0"``.
        """
        gpu_info = nvidia.construct_gpu_info([
            nvidia.NvidiaGpuStatus(20, 21, [22, 33, 44], nvidia.EccError(),
                                   "0", "GPU-uuid0", 40.0)
        ])
        cid_by_pid = {33: "def", 22: "ghi"}  # only 44 is external process

        # An empty list and None must both behave as "no zombie info".
        for zombie_info in ([], None):
            metrics = GpuCollector.convert_nvidia_gpu_info_to_metrics(
                gpu_info, zombie_info, self.make_pid_to_cid_fn(cid_by_pid),
                20 * 1024)

            _, _, _, _, external_process, zombie_container, _, _, _ = metrics

            self.assertEqual(0, len(zombie_container.samples))
            self.assertEqual(1, len(external_process.samples))
            self.assertEqual(
                "0", external_process.samples[0].labels["minor_number"])
            self.assertEqual("44", external_process.samples[0].labels["pid"])
예제 #5
0
    def test_parse_smi_new_xml_result(self):
        """Parse the "ecc unsupported" sample XML and compare with expected statuses.

        Both expected GPUs use a default ``EccError`` and appear in the
        expected mapping under their minor number as well as their UUID.
        """
        with open("data/nvidia_smi_sample_ecc_unsupported.xml",
                  "r") as xml_file:
            raw_xml = xml_file.read()
        parsed = nvidia.parse_smi_xml_result(raw_xml)

        gpu0_uuid = "GPU-57567e11-0be2-381b-5132-2ad95c262e58"
        gpu1_uuid = "GPU-ef1d0068-5bfd-f1e4-7e79-ff35d71d44b8"

        gpu0 = nvidia.NvidiaGpuStatus(0.000, 0.000, [], nvidia.EccError(),
                                      "0", gpu0_uuid, 24.0)
        gpu1 = nvidia.NvidiaGpuStatus(0.000, 0.000, [], nvidia.EccError(),
                                      "1", gpu1_uuid, 24.0)

        expected = {
            "0": gpu0,
            gpu0_uuid: gpu0,
            "1": gpu1,
            gpu1_uuid: gpu1,
        }
        self.assertEqual(expected, parsed)
예제 #6
0
    def test_convert_to_metrics_with_no_zombie_info_BUGFIX(self):
        """External-process metrics must be emitted without any zombie info.

        The conversion is run with an empty ``zombie_info`` list and then with
        ``None``.  Both runs expect no zombie-container samples and a single
        external-process sample for pid 44 on minor number ``"0"``.
        """
        gpu_info = {
            "0": nvidia.NvidiaGpuStatus(20, 21, [22, 33, 44],
                                        nvidia.EccError())
        }
        cid_by_pid = {33: "def", 22: "ghi"}  # only 44 is external process

        # An empty list and None must both behave as "no zombie info".
        for zombie_info in ([], None):
            metrics = GpuCollector.convert_to_metrics(
                gpu_info, zombie_info, self.make_pid_to_cid_fn(cid_by_pid),
                20 * 1024)

            # Only the last two of the six metric families are inspected.
            _, _, _, _, external_process, zombie_container = metrics

            self.assertEqual(0, len(zombie_container.samples))
            self.assertEqual(1, len(external_process.samples))
            self.assertEqual(
                "0", external_process.samples[0].labels["minor_number"])
            self.assertEqual(44, external_process.samples[0].labels["pid"])
    def test_get_retired_page_count(self):
        """Parse the retired-pages sample XML and check the ECC/retirement counters.

        The expected status starts from a default ``EccError`` whose counters
        are then overwritten with the values the sample file should yield.
        """
        with open("data/nvidia_smi_retired_pages.xml", "r") as xml_file:
            raw_xml = xml_file.read()
        parsed = nvidia.parse_smi_xml_result(raw_xml)

        gpu_uuid = "GPU-ef23ffa6-c9fd-93d5-aeb8-612c087255ff"
        zero = nvidia.NvidiaGpuStatus(0.0, 0.0, [], nvidia.EccError(), "0",
                                      gpu_uuid, 33.0)

        expected_counts = (
            ("single_retirement", 0),
            ("double_retirement", 2),
            ("volatile_single", 1),
            ("volatile_double", 1),
            ("aggregated_single", 291),
            ("aggregated_double", 7627),
        )
        for field, count in expected_counts:
            setattr(zero.ecc_errors, field, count)

        expected = {"0": zero, gpu_uuid: zero}
        self.assertEqual(expected, parsed)
예제 #8
0
    def test_convert_to_metrics(self):
        """Check each metric family returned by ``GpuCollector.convert_to_metrics``.

        Four single-GPU scenarios (minor numbers 0 to 3) are converted in
        turn.  For the first three, seven of the eight returned families are
        compared with families built through the ``collector.gen_*``
        factories; the last scenario only checks the memory-leak family.
        """
        # Samples may be unordered and cannot be compared in one assertEqual,
        # so every metric family is checked separately.
        zombies = {"abc", "def"}
        cid_by_pid = {33: "def", 22: "ghi"}  # only 33 is zombie

        def convert(status, leak_threshold):
            # Wrap one status into gpu_info and run the conversion under test.
            gpu_info = nvidia.construct_gpu_info([status])
            return GpuCollector.convert_to_metrics(
                gpu_info, zombies, self.make_pid_to_cid_fn(cid_by_pid),
                leak_threshold)

        def expect(factory, actual, *samples):
            # Build the expected family from (labels, value) pairs, in the
            # given order, and compare it with the converted one.
            wanted = factory()
            for labels, value in samples:
                wanted.add_metric(labels, value)
            self.assertEqual(wanted, actual)

        def ecc_samples(minor, uuid, counts):
            # One (labels, value) pair per ECC counter type, in emission order.
            kinds = ("volatile_single", "volatile_double",
                     "aggregated_single", "aggregated_double")
            return [([minor, uuid, kind], count)
                    for kind, count in zip(kinds, counts)]

        # test minor 0
        (core_utils, mem_utils, ecc_errors, mem_leak, external_process,
         zombie_container, gpu_temp, _) = convert(
             nvidia.NvidiaGpuStatus(20, 21, [22, 33, 44], nvidia.EccError(),
                                    "0", "GPU-uuid0", 37.0), 20 * 1024)

        expect(collector.gen_gpu_util_gauge, core_utils,
               (["0", "GPU-uuid0"], 20))
        expect(collector.gen_gpu_mem_util_gauge, mem_utils,
               (["0", "GPU-uuid0"], 21))
        expect(collector.gen_gpu_ecc_counter, ecc_errors,
               *ecc_samples("0", "GPU-uuid0", (0, 0, 0, 0)))
        expect(collector.gen_gpu_memory_leak_counter, mem_leak)
        expect(collector.gen_gpu_used_by_external_process_counter,
               external_process, (["0", "44"], 1))
        expect(collector.gen_gpu_used_by_zombie_container_counter,
               zombie_container, (["0", "def"], 1))
        expect(collector.gen_gpu_temperature_gauge, gpu_temp,
               (["0", "GPU-uuid0"], 37.0))

        # test minor 1
        (core_utils, mem_utils, ecc_errors, mem_leak, external_process,
         zombie_container, gpu_temp, _) = convert(
             nvidia.NvidiaGpuStatus(
                 30, 31, [55, 123],
                 nvidia.EccError(volatile_single=2,
                                 volatile_double=3,
                                 aggregated_single=4,
                                 aggregated_double=5), "1", "GPU-uuid1",
                 24.0), 20 * 1024)

        expect(collector.gen_gpu_util_gauge, core_utils,
               (["1", "GPU-uuid1"], 30))
        expect(collector.gen_gpu_mem_util_gauge, mem_utils,
               (["1", "GPU-uuid1"], 31))
        expect(collector.gen_gpu_ecc_counter, ecc_errors,
               *ecc_samples("1", "GPU-uuid1", (2, 3, 4, 5)))
        expect(collector.gen_gpu_memory_leak_counter, mem_leak)
        expect(collector.gen_gpu_used_by_external_process_counter,
               external_process, (["1", "55"], 1), (["1", "123"], 1))
        expect(collector.gen_gpu_used_by_zombie_container_counter,
               zombie_container)
        expect(collector.gen_gpu_temperature_gauge, gpu_temp,
               (["1", "GPU-uuid1"], 24.0))

        # test minor 2
        (core_utils, mem_utils, ecc_errors, mem_leak, external_process,
         zombie_container, gpu_temp, _) = convert(
             nvidia.NvidiaGpuStatus(40, 20 * 1024 * 1024, [],
                                    nvidia.EccError(), "2", "GPU-uuid2",
                                    30.0), 20 * 1024 * 1024)

        expect(collector.gen_gpu_util_gauge, core_utils,
               (["2", "GPU-uuid2"], 40))
        expect(collector.gen_gpu_mem_util_gauge, mem_utils,
               (["2", "GPU-uuid2"], 20 * 1024 * 1024))
        expect(collector.gen_gpu_ecc_counter, ecc_errors,
               *ecc_samples("2", "GPU-uuid2", (0, 0, 0, 0)))
        expect(collector.gen_gpu_memory_leak_counter, mem_leak)
        expect(collector.gen_gpu_used_by_external_process_counter,
               external_process)
        expect(collector.gen_gpu_used_by_zombie_container_counter,
               zombie_container)
        expect(collector.gen_gpu_temperature_gauge, gpu_temp,
               (["2", "GPU-uuid2"], 30.0))

        # test memory leak
        _, _, _, mem_leak, _, _, _, _ = convert(
            nvidia.NvidiaGpuStatus(40, 20 * 1024 * 1024 + 1, [],
                                   nvidia.EccError(), "3", "GPU-uuid3", 30.0),
            20 * 1024)

        expect(collector.gen_gpu_memory_leak_counter, mem_leak,
               (["3", "GPU-uuid3"], 1))
예제 #9
0
    def test_convert_to_metrics(self):
        """Check each metric family returned by ``GpuCollector.convert_to_metrics``.

        Four single-GPU scenarios (minor numbers 0 to 3) are converted in
        turn.  For the first three, all six returned families are compared
        with families built through the ``collector.gen_*`` factories; the
        last scenario only checks the memory-leak family.
        """
        # Samples may be unordered and cannot be compared in one assertEqual,
        # so every metric family is checked separately.
        zombies = {"abc", "def"}
        cid_by_pid = {
            33: "def",
            22: "ghi",
            44: "jkl"
        }  # only 33 is zombie

        def convert(minor, status, leak_threshold):
            # Key the single status by its minor number and run the conversion.
            return GpuCollector.convert_to_metrics(
                {minor: status}, zombies,
                self.make_pid_to_cid_fn(cid_by_pid), leak_threshold)

        def expect(factory, actual, *samples):
            # Build the expected family from (labels, value) pairs, in the
            # given order, and compare it with the converted one.
            wanted = factory()
            for labels, value in samples:
                wanted.add_metric(labels, value)
            self.assertEqual(wanted, actual)

        def ecc_samples(minor, single, double):
            # One (labels, value) pair per ECC counter type, in emission order.
            return (([minor, "single"], single), ([minor, "double"], double))

        # test minor 0
        (core_utils, mem_utils, ecc_errors, mem_leak, external_process,
         zombie_container) = convert(
             "0",
             nvidia.NvidiaGpuStatus(20, 21, [22, 33, 44], nvidia.EccError()),
             20 * 1024)

        expect(collector.gen_gpu_util_gauge, core_utils, (["0"], 20))
        expect(collector.gen_gpu_mem_util_gauge, mem_utils, (["0"], 21))
        expect(collector.gen_gpu_ecc_counter, ecc_errors,
               *ecc_samples("0", 0, 0))
        expect(collector.gen_gpu_memory_leak_counter, mem_leak)
        expect(collector.gen_gpu_used_by_external_process_counter,
               external_process, (["0", 22], 1), (["0", 44], 1))
        expect(collector.gen_gpu_used_by_zombie_container_counter,
               zombie_container, (["0", "def"], 1))

        # test minor 1
        (core_utils, mem_utils, ecc_errors, mem_leak, external_process,
         zombie_container) = convert(
             "1",
             nvidia.NvidiaGpuStatus(30, 31, [55, 123],
                                    nvidia.EccError(single=2, double=3)),
             20 * 1024)

        expect(collector.gen_gpu_util_gauge, core_utils, (["1"], 30))
        expect(collector.gen_gpu_mem_util_gauge, mem_utils, (["1"], 31))
        expect(collector.gen_gpu_ecc_counter, ecc_errors,
               *ecc_samples("1", 2, 3))
        expect(collector.gen_gpu_memory_leak_counter, mem_leak)
        expect(collector.gen_gpu_used_by_external_process_counter,
               external_process, (["1", 55], 1), (["1", 123], 1))
        expect(collector.gen_gpu_used_by_zombie_container_counter,
               zombie_container)

        # test minor 2
        (core_utils, mem_utils, ecc_errors, mem_leak, external_process,
         zombie_container) = convert(
             "2",
             nvidia.NvidiaGpuStatus(40, 20 * 1024 * 1024, [],
                                    nvidia.EccError()), 20 * 1024 * 1024)

        expect(collector.gen_gpu_util_gauge, core_utils, (["2"], 40))
        expect(collector.gen_gpu_mem_util_gauge, mem_utils,
               (["2"], 20 * 1024 * 1024))
        expect(collector.gen_gpu_ecc_counter, ecc_errors,
               *ecc_samples("2", 0, 0))
        expect(collector.gen_gpu_memory_leak_counter, mem_leak)
        expect(collector.gen_gpu_used_by_external_process_counter,
               external_process)
        expect(collector.gen_gpu_used_by_zombie_container_counter,
               zombie_container)

        # test memory leak
        _, _, _, mem_leak, _, _ = convert(
            "3",
            nvidia.NvidiaGpuStatus(40, 20 * 1024 * 1024 + 1, [],
                                   nvidia.EccError()), 20 * 1024)

        expect(collector.gen_gpu_memory_leak_counter, mem_leak, (["3"], 1))