예제 #1
0
    def test_process_vc_info(self):
        """Check every gauge from gen_vc_metrics for a small three-VC setup.

        "default" has both regular and preemptable usage recorded, "platform"
        has only regular usage, and "relevance" has no usage recorded at all.

        Observations about the expected values used below:
          * avail equals the quota minus the amounts passed to add_used;
            amounts passed to add_preemptable_used are not subtracted
            (default/P40 is expected at 8 even though 8 GPUs are also
            recorded as preemptably used).
          * preemptive avail is 29 for every entry, the same number assigned
            to cluster_gpu_info.available.
          * no entry is expected to report unschedulable GPUs.
        """
        vc_info = {
            "default": {
                "P40": 10,
                "P80": 10,
            },
            "platform": {
                "P40": 10,
            },
            "relevance": {
                "P80": 4,
            },
        }

        vc_usage = watchdog.VcUsage()

        vc_usage.add_preemptable_used("default", "P40", 8)
        vc_usage.add_preemptable_used("default", "P80", 2)
        vc_usage.add_used("default", "P40", 2)

        vc_usage.add_used("platform", "P40", 3)

        cluster_gpu_info = watchdog.ClusterGPUInfo()
        cluster_gpu_info.capacity = 34
        cluster_gpu_info.available = 29
        cluster_gpu_info.allocatable = 34
        vc_total, vc_avail, vc_preemptive_avail, vc_unschedulable_gauge = \
                watchdog.gen_vc_metrics(vc_info, vc_usage, cluster_gpu_info)

        def check_gauge(gauge, expected):
            # Assert the sample count first: without it a gauge that emitted
            # no samples would make the loop below pass vacuously.
            self.assertEqual(4, len(gauge.samples))
            for sample in gauge.samples:
                vc_name = sample.labels["vc_name"]
                gpu_type = sample.labels["gpu_type"]
                self.assertEqual(expected[vc_name][gpu_type], sample.value,
                                 "vc " + vc_name + ", gpu " + gpu_type)

        # The total gauge is expected to echo the configured quotas.
        check_gauge(vc_total, vc_info)

        target_vc_avail = {
            "default": {
                "P40": 8,
                "P80": 10,
            },
            "platform": {
                "P40": 7,
            },
            "relevance": {
                "P80": 4,
            },
        }
        check_gauge(vc_avail, target_vc_avail)

        target_vc_preemptive_avail = {
            "default": {
                "P40": 29,
                "P80": 29,
            },
            "platform": {
                "P40": 29,
            },
            "relevance": {
                "P80": 29,
            },
        }
        check_gauge(vc_preemptive_avail, target_vc_preemptive_avail)

        target_vc_unschedulable = {
            "default": {
                "P40": 0,
                "P80": 0,
            },
            "platform": {
                "P40": 0,
            },
            "relevance": {
                "P80": 0,
            },
        }
        check_gauge(vc_unschedulable_gauge, target_vc_unschedulable)
예제 #2
0
    def test_gpu_accounting(self):
        """Check gen_vc_metrics when allocatable GPUs are below the quota sum.

        Three VCs each hold a quota of 40 P40 GPUs (120 in total, matching
        cluster capacity) while only 100 GPUs are allocatable and 29 are
        available. In the expected values below, avail plus unschedulable
        equals quota minus used for every VC (A: 0 + 0, B: 1 + 8,
        C: 27 + 13), and preemptive avail always equals the cluster's
        available count.
        """
        quotas = {
            "A": {"P40": 40},
            "B": {"P40": 40},
            "C": {"P40": 40},
        }

        usage = watchdog.VcUsage()

        usage.add_used("A", "P40", 40)
        usage.add_used("B", "P40", 31)
        usage.add_used("C", "P40", 0)

        gpu_info = watchdog.ClusterGPUInfo()
        gpu_info.capacity = 120
        gpu_info.available = 29
        gpu_info.allocatable = 100

        (total_gauge, avail_gauge, preemptive_gauge,
         unschedulable_gauge) = watchdog.gen_vc_metrics(
             quotas, usage, gpu_info)

        def key_of(metric):
            # Every sample is identified by its VC name and GPU type labels.
            return metric.labels["vc_name"], metric.labels["gpu_type"]

        # Totals are expected to mirror the configured quotas.
        self.assertEqual(3, len(total_gauge.samples))
        for metric in total_gauge.samples:
            vc, gpu = key_of(metric)
            self.assertEqual(quotas[vc][gpu], metric.value)

        expected_avail = {
            "A": {"P40": 0},
            "B": {"P40": 1},
            "C": {"P40": 27},
        }

        self.assertEqual(3, len(avail_gauge.samples))
        for metric in avail_gauge.samples:
            vc, gpu = key_of(metric)
            self.assertEqual(expected_avail[vc][gpu], metric.value,
                             "vc " + vc + ", gpu " + gpu)

        expected_preemptive = {
            "A": {"P40": 29},
            "B": {"P40": 29},
            "C": {"P40": 29},
        }

        self.assertEqual(3, len(preemptive_gauge.samples))
        for metric in preemptive_gauge.samples:
            vc, gpu = key_of(metric)
            self.assertEqual(expected_preemptive[vc][gpu], metric.value,
                             "vc " + vc)

        expected_unschedulable = {
            "A": {"P40": 0},
            "B": {"P40": 8},
            "C": {"P40": 13},
        }

        self.assertEqual(3, len(unschedulable_gauge.samples))
        for metric in unschedulable_gauge.samples:
            vc, gpu = key_of(metric)
            self.assertEqual(expected_unschedulable[vc][gpu], metric.value,
                             "vc " + vc)
예제 #3
0
    def test_process_vc_info_real_case(self):
        """Check gen_vc_metrics with larger, production-like numbers.

        The three quotas add up to 424, the configured capacity, while
        allocatable is 423. "relevance2" has both regular and preemptable
        usage, "quantus" only regular usage, and "relevance2-inf" has none.
        In the expected values below, avail plus unschedulable equals quota
        minus used for every VC (relevance2: 2 + 1), and preemptive avail
        always equals the cluster's available count of 68.
        """
        quotas = {
            "quantus": {"P40": 150},
            "relevance2": {"P40": 234},
            "relevance2-inf": {"P40": 40},
        }

        usage = watchdog.VcUsage()

        usage.add_preemptable_used("relevance2", "P40", 24)
        usage.add_used("relevance2", "P40", 231)
        usage.add_used("quantus", "P40", 125)

        gpu_info = watchdog.ClusterGPUInfo()
        gpu_info.capacity = 424
        gpu_info.available = 68
        gpu_info.allocatable = 423

        (total_gauge, avail_gauge, preemptive_gauge,
         unschedulable_gauge) = watchdog.gen_vc_metrics(
             quotas, usage, gpu_info)

        def key_of(metric):
            # Every sample is identified by its VC name and GPU type labels.
            return metric.labels["vc_name"], metric.labels["gpu_type"]

        # Totals are expected to mirror the configured quotas.
        self.assertEqual(3, len(total_gauge.samples))
        for metric in total_gauge.samples:
            vc, gpu = key_of(metric)
            self.assertEqual(quotas[vc][gpu], metric.value)

        expected_avail = {
            "quantus": {"P40": 25},
            "relevance2": {"P40": 2},
            "relevance2-inf": {"P40": 40},
        }

        self.assertEqual(3, len(avail_gauge.samples))
        for metric in avail_gauge.samples:
            vc, gpu = key_of(metric)
            self.assertEqual(expected_avail[vc][gpu], metric.value,
                             "vc " + vc + ", gpu " + gpu)

        expected_preemptive = {
            "quantus": {"P40": 68},
            "relevance2": {"P40": 68},
            "relevance2-inf": {"P40": 68},
        }

        self.assertEqual(3, len(preemptive_gauge.samples))
        for metric in preemptive_gauge.samples:
            vc, gpu = key_of(metric)
            self.assertEqual(expected_preemptive[vc][gpu], metric.value,
                             "vc " + vc)

        expected_unschedulable = {
            "quantus": {"P40": 0},
            "relevance2": {"P40": 1},
            "relevance2-inf": {"P40": 0},
        }

        self.assertEqual(3, len(unschedulable_gauge.samples))
        for metric in unschedulable_gauge.samples:
            vc, gpu = key_of(metric)
            self.assertEqual(expected_unschedulable[vc][gpu], metric.value,
                             "vc " + vc + ", gpu " + gpu)