def test_1_visible_devices():
    """Check the total memory from ``nvml.one_time()`` with only device 0 visible.

    The test is skipped when ``nvml.device_get_count()`` reports no device.
    ``CUDA_VISIBLE_DEVICES`` is forced to ``"0"`` for the duration of the
    check and put back to its previous state afterwards, so the override does
    not leak into tests that run later in the same process.
    """
    if nvml.device_get_count() < 1:
        pytest.skip("No GPUs available")

    # Remember the caller's setting (None means "was not set at all").
    previous = os.environ.get("CUDA_VISIBLE_DEVICES")
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    try:
        output = nvml.one_time()
        h = nvml._pynvml_handles()
        assert output["memory-total"] == pynvml.nvmlDeviceGetMemoryInfo(h).total
    finally:
        # Restore the environment even when the assertion above fails.
        if previous is None:
            os.environ.pop("CUDA_VISIBLE_DEVICES", None)
        else:
            os.environ["CUDA_VISIBLE_DEVICES"] = previous
def test_one_time():
    """``nvml.one_time()`` exposes total memory and a non-empty device name.

    Skipped when ``nvml.device_get_count()`` reports no device.
    """
    if nvml.device_get_count() < 1:
        pytest.skip("No GPUs available")

    info = nvml.one_time()
    for expected_key in ("memory-total", "name"):
        assert expected_key in info
    assert len(info["name"]) > 0
def __init__(self, n=10000):
    """Create the bounded sample histories and record a first sample.

    Parameters
    ----------
    n : int, optional
        ``maxlen`` given to every history deque created here.

    Notes
    -----
    Network and disk I/O tracking are each enabled only if the matching
    ``psutil`` counter call succeeds (and, for disks, returns something
    other than ``None``).  File-descriptor history is kept unless
    ``WINDOWS`` is true, and GPU history is kept when
    ``nvml.device_get_count()`` is positive.  ``self.quantities`` maps each
    tracked name to its deque.
    """

    def _history():
        # Every tracked quantity shares the same bounded length.
        return deque(maxlen=n)

    self.proc = psutil.Process()
    self.time = _history()
    self.cpu = _history()
    self.memory = _history()
    self.count = 0
    self.quantities = dict(cpu=self.cpu, memory=self.memory, time=self.time)

    # --- network I/O -----------------------------------------------------
    try:
        net_counters = psutil.net_io_counters()
    except Exception:
        self._collect_net_io_counters = False
    else:
        self.last_time = time()
        self.read_bytes = _history()
        self.write_bytes = _history()
        self.quantities.update(
            read_bytes=self.read_bytes,
            write_bytes=self.write_bytes,
        )
        self._last_io_counters = net_counters
        self._collect_net_io_counters = True

    # --- disk I/O --------------------------------------------------------
    try:
        disk_counters = psutil.disk_io_counters()
    except Exception:
        disk_counters = None
    # A ``None`` result (diskless machine) disables collection just like a
    # failed call does.
    self._collect_disk_io_counters = disk_counters is not None
    if self._collect_disk_io_counters:
        self.last_time_disk = time()
        self.read_bytes_disk = _history()
        self.write_bytes_disk = _history()
        self.quantities.update(
            read_bytes_disk=self.read_bytes_disk,
            write_bytes_disk=self.write_bytes_disk,
        )
        self._last_disk_io_counters = disk_counters

    # --- open file descriptors -------------------------------------------
    if not WINDOWS:
        self.num_fds = _history()
        self.quantities["num_fds"] = self.num_fds

    # --- GPU -------------------------------------------------------------
    if nvml.device_get_count() > 0:
        static_gpu_info = nvml.one_time()
        self.gpu_name = static_gpu_info["name"]
        self.gpu_memory_total = static_gpu_info["memory-total"]
        self.gpu_utilization = _history()
        self.gpu_memory_used = _history()
        self.quantities.update(
            gpu_utilization=self.gpu_utilization,
            gpu_memory_used=self.gpu_memory_used,
        )

    # Seed every history with one sample.
    self.update()
async def test_gpu_monitoring_range_query(s, a, b):
    """Monitor info for ``a`` and ``b`` contains GPU range-query data.

    ``s.get_worker_monitor_info()`` is awaited once; for each of ``a`` and
    ``b`` the entry under its ``address`` must have non-``None`` GPU
    utilization and memory series, ``count`` and ``last_time``.
    Skipped when ``nvml.device_get_count()`` reports no device.
    """
    if nvml.device_get_count() < 1:
        pytest.skip("No GPUs available")

    info = await s.get_worker_monitor_info()
    for worker in (a, b):
        for metric in ("gpu_utilization", "gpu_memory_used"):
            assert info[worker.address]["range_query"][metric] is not None
        for field in ("count", "last_time"):
            assert info[worker.address][field] is not None
async def test_gpu_metrics(s, a, b):
    """GPU metrics and startup info recorded for ``a`` match live NVML values.

    Compares the ``"gpu"`` entries stored under ``s.workers[a.address]``
    with values read through ``pynvml`` using the handle returned by
    ``nvml._pynvml_handles()``.  Skipped when ``nvml.device_get_count()``
    reports no device.
    """
    if nvml.device_get_count() < 1:
        pytest.skip("No GPUs available")

    handle = nvml._pynvml_handles()

    # Periodic metrics: memory currently in use.
    assert "gpu" in a.metrics
    recorded_used = s.workers[a.address].metrics["gpu"]["memory-used"]
    assert recorded_used == pynvml.nvmlDeviceGetMemoryInfo(handle).used

    # One-off startup information: device name.
    assert "gpu" in a.startup_information
    recorded_name = s.workers[a.address].extra["gpu"]["name"]
    assert recorded_name == pynvml.nvmlDeviceGetName(handle).decode()
async def test_gpu_monitoring_recent(s, a, b):
    """The ``recent=True`` monitor info for ``a`` matches live NVML readings.

    Checks GPU utilization, used memory, device name and total memory
    against ``pynvml`` queries on the handle from
    ``nvml._pynvml_handles()``.  Skipped when ``nvml.device_get_count()``
    reports no device.
    """
    if nvml.device_get_count() < 1:
        pytest.skip("No GPUs available")

    handle = nvml._pynvml_handles()
    info = await s.get_worker_monitor_info(recent=True)
    worker_info = info[a.address]

    utilization = worker_info["range_query"]["gpu_utilization"]
    assert utilization == pynvml.nvmlDeviceGetUtilizationRates(handle).gpu

    memory_used = worker_info["range_query"]["gpu_memory_used"]
    assert memory_used == pynvml.nvmlDeviceGetMemoryInfo(handle).used

    assert worker_info["gpu_name"] == pynvml.nvmlDeviceGetName(handle).decode()

    memory_total = worker_info["gpu_memory_total"]
    assert memory_total == pynvml.nvmlDeviceGetMemoryInfo(handle).total
def test_2_visible_devices(CVD):
    """The NVML handle follows the first entry of ``CUDA_VISIBLE_DEVICES``.

    Parameters
    ----------
    CVD : str
        Comma-separated device indices to place in ``CUDA_VISIBLE_DEVICES``;
        the first entry is parsed as an integer index.

    The serial of the handle returned by ``nvml._pynvml_handles()`` must
    equal the serial of the device at that index.  Skipped when fewer than
    two devices are reported.  ``CUDA_VISIBLE_DEVICES`` is put back to its
    previous state afterwards so the override does not leak into tests that
    run later in the same process.
    """
    if nvml.device_get_count() < 2:
        pytest.skip("Less than two GPUs available")

    # Remember the caller's setting (None means "was not set at all").
    previous = os.environ.get("CUDA_VISIBLE_DEVICES")
    os.environ["CUDA_VISIBLE_DEVICES"] = CVD
    try:
        idx = int(CVD.split(",")[0])

        h = nvml._pynvml_handles()
        h2 = pynvml.nvmlDeviceGetHandleByIndex(idx)

        s = pynvml.nvmlDeviceGetSerial(h)
        s2 = pynvml.nvmlDeviceGetSerial(h2)

        assert s == s2
    finally:
        # Restore the environment even when the check above fails.
        if previous is None:
            os.environ.pop("CUDA_VISIBLE_DEVICES", None)
        else:
            os.environ["CUDA_VISIBLE_DEVICES"] = previous
def test_has_cuda_context():
    """Run ``run_has_cuda_context`` in a spawned process and surface its result.

    The child receives a queue and is expected to put either ``None`` or an
    exception on it; a non-``None`` item is re-raised here.  Skipped when
    ``nvml.device_get_count()`` reports no device.
    """
    if nvml.device_get_count() < 1:
        pytest.skip("No GPUs available")

    # This test should be run in a new process so that it definitely doesn't
    # have a CUDA context and uses a queue to pass exceptions back
    ctx = mp.get_context("spawn")
    queue = ctx.Queue()
    p = ctx.Process(target=run_has_cuda_context, args=(queue,))
    p.start()

    # Drain the queue *before* joining: a child that has put an item on a
    # multiprocessing queue does not terminate until that item has been
    # flushed to the pipe, so joining first can deadlock.
    e = queue.get()
    p.join()  # this blocks until the process terminates

    if e is not None:
        raise e
NumeralTickFormatter, OpenURL, TapTool, ) from bokeh.plotting import figure from tornado import escape from dask.utils import format_bytes from distributed.dashboard.components import DashboardComponent, add_periodic_callback from distributed.dashboard.components.scheduler import BOKEH_THEME, TICKS_1024, env from distributed.dashboard.utils import update, without_property_validation from distributed.diagnostics import nvml from distributed.utils import log_errors NVML_ENABLED = nvml.device_get_count() > 0 class GPUCurrentLoad(DashboardComponent): """How many tasks are on each worker""" def __init__(self, scheduler, width=600, **kwargs): with log_errors(): self.last = 0 self.scheduler = scheduler self.source = ColumnDataSource({ "memory": [1, 2], "memory-half": [0.5, 1], "memory_text": ["1B", "2B"], "utilization": [1, 2], "utilization-half": [0.5, 1], "worker": ["a", "b"],
def update(self):
    """Take one sample of every tracked quantity, store it and return it.

    Returns
    -------
    dict
        Always has ``"cpu"``, ``"memory"``, ``"time"`` and ``"count"``.
        Network and disk read/write rates, ``"num_fds"`` and the GPU
        readings are added only when the corresponding collection is
        active.

    Notes
    -----
    I/O rates are the change in the byte counters since the previous
    sample divided by the elapsed time; an elapsed time of zero is replaced
    by ``0.5`` so the division cannot fail.  A failing ``psutil`` counter
    call simply leaves that pair of quantities out of this sample.
    """
    with self.proc.oneshot():
        cpu_pct = self.proc.cpu_percent()
        mem = self.get_process_memory()
    now = time()

    self.cpu.append(cpu_pct)
    self.memory.append(mem)
    self.time.append(now)
    self.count += 1

    sample = dict(cpu=cpu_pct, memory=mem, time=now, count=self.count)

    # --- network I/O -----------------------------------------------------
    if self._collect_net_io_counters:
        try:
            net_now = psutil.net_io_counters()
        except Exception:
            pass
        else:
            net_prev = self._last_io_counters
            elapsed = (now - self.last_time) or 0.5
            recv_rate = (net_now.bytes_recv - net_prev.bytes_recv) / elapsed
            sent_rate = (net_now.bytes_sent - net_prev.bytes_sent) / elapsed
            self.last_time, self._last_io_counters = now, net_now
            self.read_bytes.append(recv_rate)
            self.write_bytes.append(sent_rate)
            sample.update(read_bytes=recv_rate, write_bytes=sent_rate)

    # --- disk I/O --------------------------------------------------------
    if self._collect_disk_io_counters:
        try:
            disk_now = psutil.disk_io_counters()
        except Exception:
            pass
        else:
            disk_prev = self._last_disk_io_counters
            elapsed_disk = (now - self.last_time_disk) or 0.5
            disk_read_rate = (
                disk_now.read_bytes - disk_prev.read_bytes
            ) / elapsed_disk
            disk_write_rate = (
                disk_now.write_bytes - disk_prev.write_bytes
            ) / elapsed_disk
            self.last_time_disk, self._last_disk_io_counters = now, disk_now
            self.read_bytes_disk.append(disk_read_rate)
            self.write_bytes_disk.append(disk_write_rate)
            sample.update(
                read_bytes_disk=disk_read_rate,
                write_bytes_disk=disk_write_rate,
            )

    # --- open file descriptors -------------------------------------------
    if not WINDOWS:
        fd_count = self.proc.num_fds()
        self.num_fds.append(fd_count)
        sample["num_fds"] = fd_count

    # --- GPU -------------------------------------------------------------
    if nvml.device_get_count() > 0:
        gpu_now = nvml.real_time()
        gpu_util = gpu_now["utilization"]
        self.gpu_utilization.append(gpu_util)
        gpu_mem = gpu_now["memory-used"]
        self.gpu_memory_used.append(gpu_mem)
        sample["gpu_utilization"] = gpu_util
        sample["gpu_memory_used"] = gpu_mem

    return sample