def test_process_vc_info(self):
    """Check gen_vc_metrics output for VCs that mix P40 and P80 quotas.

    Three VCs hold four (vc, gpu_type) quota entries in total.  The
    "default" VC has both preemptable and regular usage, "platform" has
    only regular usage and "relevance" is idle.  Each of the four gauges
    returned by ``watchdog.gen_vc_metrics`` is expected to carry one
    sample per (vc, gpu_type) entry.
    """
    vc_info = {
        "default": {"P40": 10, "P80": 10},
        "platform": {"P40": 10},
        "relevance": {"P80": 4},
    }

    vc_usage = watchdog.VcUsage()
    vc_usage.add_preemptable_used("default", "P40", 8)
    vc_usage.add_preemptable_used("default", "P80", 2)
    vc_usage.add_used("default", "P40", 2)
    vc_usage.add_used("platform", "P40", 3)

    cluster_gpu_info = watchdog.ClusterGPUInfo()
    cluster_gpu_info.capacity = 34
    cluster_gpu_info.available = 29
    cluster_gpu_info.allocatable = 34

    vc_total, vc_avail, vc_preemptive_avail, vc_unschedulable_gauge = \
        watchdog.gen_vc_metrics(vc_info, vc_usage, cluster_gpu_info)

    # The total gauge is expected to echo the configured quota.
    self.assertEqual(4, len(vc_total.samples))
    for sample in vc_total.samples:
        vc_name = sample.labels["vc_name"]
        gpu_type = sample.labels["gpu_type"]
        self.assertEqual(vc_info[vc_name][gpu_type], sample.value)

    # Expected values equal quota minus the add_used() amounts above;
    # the add_preemptable_used() amounts are not subtracted.
    target_vc_avail = {
        "default": {"P40": 8, "P80": 10},
        "platform": {"P40": 7},
        "relevance": {"P80": 4},
    }
    self.assertEqual(4, len(vc_avail.samples))
    for sample in vc_avail.samples:
        vc_name = sample.labels["vc_name"]
        gpu_type = sample.labels["gpu_type"]
        self.assertEqual(target_vc_avail[vc_name][gpu_type], sample.value,
                         "vc " + vc_name + ", gpu " + gpu_type)

    # Every expected value equals cluster_gpu_info.available (29).
    target_vc_preemptive_avail = {
        "default": {"P40": 29, "P80": 29},
        "platform": {"P40": 29},
        "relevance": {"P80": 29},
    }
    self.assertEqual(4, len(vc_preemptive_avail.samples))
    for sample in vc_preemptive_avail.samples:
        vc_name = sample.labels["vc_name"]
        gpu_type = sample.labels["gpu_type"]
        self.assertEqual(target_vc_preemptive_avail[vc_name][gpu_type],
                         sample.value, "vc " + vc_name)

    target_vc_unschedulable = {
        "default": {"P40": 0, "P80": 0},
        "platform": {"P40": 0},
        "relevance": {"P80": 0},
    }
    # Pin the sample count here as for the other gauges; without it an
    # empty gauge would let the loop below pass without checking anything.
    self.assertEqual(4, len(vc_unschedulable_gauge.samples))
    for sample in vc_unschedulable_gauge.samples:
        vc_name = sample.labels["vc_name"]
        gpu_type = sample.labels["gpu_type"]
        self.assertEqual(target_vc_unschedulable[vc_name][gpu_type],
                         sample.value,
                         "vc " + vc_name + ", gpu " + gpu_type)
def test_gpu_accounting(self):
    """Check gen_vc_metrics when total quota exceeds allocatable GPUs.

    VCs A, B and C each hold a quota of 40 P40 GPUs (120 in total) while
    the cluster reports only 100 allocatable and 29 available.  The
    expected tables below pin the available, preemptive-available and
    unschedulable figures for each VC.
    """
    quota = {"A": {"P40": 40}, "B": {"P40": 40}, "C": {"P40": 40}}

    usage = watchdog.VcUsage()
    for name, used in (("A", 40), ("B", 31), ("C", 0)):
        usage.add_used(name, "P40", used)

    cluster = watchdog.ClusterGPUInfo()
    cluster.capacity = 120
    cluster.available = 29
    cluster.allocatable = 100

    gauges = watchdog.gen_vc_metrics(quota, usage, cluster)
    total, avail, preemptive_avail, unschedulable = gauges

    # Total gauge: one sample per VC, expected to equal the quota.
    self.assertEqual(3, len(total.samples))
    for sample in total.samples:
        vc, gpu = sample.labels["vc_name"], sample.labels["gpu_type"]
        self.assertEqual(quota[vc][gpu], sample.value)

    expected_avail = {"A": {"P40": 0}, "B": {"P40": 1}, "C": {"P40": 27}}
    self.assertEqual(3, len(avail.samples))
    for sample in avail.samples:
        vc, gpu = sample.labels["vc_name"], sample.labels["gpu_type"]
        self.assertEqual(expected_avail[vc][gpu], sample.value,
                         "vc " + vc + ", gpu " + gpu)

    # Every expected value equals cluster.available (29).
    expected_preemptive = {
        "A": {"P40": 29},
        "B": {"P40": 29},
        "C": {"P40": 29},
    }
    self.assertEqual(3, len(preemptive_avail.samples))
    for sample in preemptive_avail.samples:
        vc, gpu = sample.labels["vc_name"], sample.labels["gpu_type"]
        self.assertEqual(expected_preemptive[vc][gpu], sample.value,
                         "vc " + vc)

    expected_unschedulable = {
        "A": {"P40": 0},
        "B": {"P40": 8},
        "C": {"P40": 13},
    }
    self.assertEqual(3, len(unschedulable.samples))
    for sample in unschedulable.samples:
        vc, gpu = sample.labels["vc_name"], sample.labels["gpu_type"]
        self.assertEqual(expected_unschedulable[vc][gpu], sample.value,
                         "vc " + vc)
def test_process_vc_info_real_case(self):
    """Check gen_vc_metrics against a three-VC, P40-only scenario.

    "relevance2" has both preemptable and regular usage, "quantus" has
    regular usage only and "relevance2-inf" is idle.  Each gauge must
    hold exactly three samples matching the expected tables below.
    """

    def vc_and_gpu(vc, gpu):
        # Failure message naming both labels.
        return "vc " + vc + ", gpu " + gpu

    def vc_only(vc, gpu):
        # Failure message naming the VC label only.
        return "vc " + vc

    def no_message(vc, gpu):
        # None makes assertEqual fall back to its default message.
        return None

    def check_gauge(gauge, expected, describe):
        # Assert the sample count, then compare each sample to `expected`.
        self.assertEqual(3, len(gauge.samples))
        for sample in gauge.samples:
            vc = sample.labels["vc_name"]
            gpu = sample.labels["gpu_type"]
            self.assertEqual(expected[vc][gpu], sample.value,
                             describe(vc, gpu))

    quota = {
        "quantus": {"P40": 150},
        "relevance2": {"P40": 234},
        "relevance2-inf": {"P40": 40},
    }

    usage = watchdog.VcUsage()
    usage.add_preemptable_used("relevance2", "P40", 24)
    usage.add_used("relevance2", "P40", 231)
    usage.add_used("quantus", "P40", 125)

    cluster = watchdog.ClusterGPUInfo()
    cluster.capacity = 424
    cluster.available = 68
    cluster.allocatable = 423

    gauges = watchdog.gen_vc_metrics(quota, usage, cluster)
    total, avail, preemptive_avail, unschedulable = gauges

    # The total gauge is compared against the quota itself.
    check_gauge(total, quota, no_message)

    check_gauge(
        avail,
        {
            "quantus": {"P40": 25},
            "relevance2": {"P40": 2},
            "relevance2-inf": {"P40": 40},
        },
        vc_and_gpu,
    )

    # Every expected value equals cluster.available (68).
    check_gauge(
        preemptive_avail,
        {
            "quantus": {"P40": 68},
            "relevance2": {"P40": 68},
            "relevance2-inf": {"P40": 68},
        },
        vc_only,
    )

    check_gauge(
        unschedulable,
        {
            "quantus": {"P40": 0},
            "relevance2": {"P40": 1},
            "relevance2-inf": {"P40": 0},
        },
        vc_and_gpu,
    )