def get_graphics_processes(handle):
    """Returns the list of process ids having a graphics context on the
    device, together with the memory each one uses.

    https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g7eacf7fa7ba4f4485d166736bf31195e
    """
    processes = pynvml.nvmlDeviceGetGraphicsRunningProcesses(handle)
    return [{"pid": p.pid, "used_memory": p.usedGpuMemory} for p in processes]
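# A minimal usage sketch for the helper above (assumed driver code, not part
# of the original snippet): NVML must be initialized before any device query.
import pynvml

pynvml.nvmlInit()
try:
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)
    for proc in get_graphics_processes(handle):
        print(proc["pid"], proc["used_memory"])
finally:
    pynvml.nvmlShutdown()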
def sample_processes_information(self, handle: DeviceHandle) -> GPUProcesses:
    try:
        # List processes with a compute context (e.g. CUDA applications)
        compute_processes = nvmlDeviceGetComputeRunningProcesses(handle)
        # List processes with a graphics context (e.g. applications using
        # OpenGL or DirectX)
        graphics_processes = nvmlDeviceGetGraphicsRunningProcesses(handle)
        # Note: a single process may have both a graphics and a compute
        # context active at the same time
    except NVMLError as err:
        self.logger.warning(nvml_error_to_string(err))
        return {}

    processes = {}
    for p in compute_processes + graphics_processes:
        mem = p.usedGpuMemory
        processes[p.pid] = None if mem is None else mem / MiB

    self.log_debug(f"Sampled processes ({len(processes)}): {processes}")
    return processes
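# The snippet above assumes a MiB constant and an nvml_error_to_string
# helper that are not shown; a plausible sketch of both (hypothetical,
# inferred from usage):
from pynvml import NVMLError

MiB = 1024 * 1024  # bytes per mebibyte, so usedGpuMemory / MiB yields MiB

def nvml_error_to_string(err: NVMLError) -> str:
    # NVMLError's str() already carries a human-readable message.
    return f"NVML error while sampling processes: {err}"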
def get_usage(device_index, my_pid):
    N.nvmlInit()
    handle = N.nvmlDeviceGetHandleByIndex(int(device_index))
    usage = [
        nv_process.usedGpuMemory // MB
        for nv_process in N.nvmlDeviceGetComputeRunningProcesses(handle)
        + N.nvmlDeviceGetGraphicsRunningProcesses(handle)
        if nv_process.pid == my_pid
    ]
    # Note: a pid holding both a compute and a graphics context appears in
    # both lists, so it would show up twice here and also raise below.
    if len(usage) == 1:
        usage = usage[0]
    else:
        raise KeyError("PID not found")
    return usage
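# Callers must handle the KeyError raised when the pid does not appear
# exactly once in the device's process lists (sketch, assuming the
# function above):
import os

try:
    mb_used = get_usage(0, os.getpid())
    print(f"this process uses {mb_used} MB on GPU 0")
except KeyError:
    print("this process holds no (single) context on GPU 0")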
def gpu_in_use_by_this_process(gpu_handle):
    if not psutil:
        return False

    base_process = psutil.Process().parent() or psutil.Process()
    our_processes = base_process.children(recursive=True)
    our_processes.append(base_process)
    our_pids = {process.pid for process in our_processes}

    compute_pids = {
        process.pid
        for process in pynvml.nvmlDeviceGetComputeRunningProcesses(gpu_handle)
    }
    graphics_pids = {
        process.pid
        for process in pynvml.nvmlDeviceGetGraphicsRunningProcesses(gpu_handle)
    }

    pids_using_device = compute_pids | graphics_pids
    return len(pids_using_device & our_pids) > 0
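# Example of scanning every device with the helper above (a sketch; assumes
# pynvml.nvmlInit() has already been called):
def gpus_in_use_by_this_process():
    handles = [
        pynvml.nvmlDeviceGetHandleByIndex(i)
        for i in range(pynvml.nvmlDeviceGetCount())
    ]
    return [i for i, h in enumerate(handles) if gpu_in_use_by_this_process(h)]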
def _get_handles_by_pid(self):
    """Returns handles of GPUs running at least one process from PIDS.

    Note: GPUs need to have started work before showing any processes.
    Requires NVML to be initialized.

    Bug: Containers need to be started with --pid=host for NVML to show
    processes: https://github.com/NVIDIA/nvidia-docker/issues/179.
    """
    device_count = pynvml.nvmlDeviceGetCount()
    devices = []
    for index in range(device_count):
        handle = pynvml.nvmlDeviceGetHandleByIndex(index)
        gpu_pids = [
            p.pid
            for p in pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
            + pynvml.nvmlDeviceGetGraphicsRunningProcesses(handle)
        ]
        if set(gpu_pids).intersection(self.pids):
            devices.append(handle)
    return devices
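# self.pids is assumed to be populated elsewhere; one plausible way to build
# it is the same process-tree walk used in gpu_in_use_by_this_process above
# (hypothetical sketch):
import psutil

def _collect_own_pids():
    base = psutil.Process().parent() or psutil.Process()
    procs = base.children(recursive=True) + [base]
    return {p.pid for p in procs}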
def get_gpu_info(handle):
    """Get information for the single GPU specified by an nvml handle"""

    def get_process_info(nv_process):
        """Get the process information of a specific pid"""
        process = {}
        ps_process = psutil.Process(pid=nv_process.pid)
        process['username'] = ps_process.username()
        # cmdline returns full path;
        # as in `ps -o comm`, get short cmdnames.
        _cmdline = ps_process.cmdline()
        if not _cmdline:
            # sometimes, zombie or unknown (e.g. [kworker/8:2H])
            process['command'] = '?'
        else:
            process['command'] = os.path.basename(_cmdline[0])
        # Bytes to MBytes
        process['gpu_memory_usage'] = nv_process.usedGpuMemory // MB
        process['pid'] = nv_process.pid
        return process

    name = _decode(N.nvmlDeviceGetName(handle))
    uuid = _decode(N.nvmlDeviceGetUUID(handle))

    try:
        temperature = N.nvmlDeviceGetTemperature(
            handle, N.NVML_TEMPERATURE_GPU)
    except N.NVMLError:
        temperature = None  # Not supported
    try:
        fan_speed = N.nvmlDeviceGetFanSpeed(handle)
    except N.NVMLError:
        fan_speed = None  # Not supported
    try:
        memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
    except N.NVMLError:
        memory = None  # Not supported
    try:
        utilization = N.nvmlDeviceGetUtilizationRates(handle)
    except N.NVMLError:
        utilization = None  # Not supported
    try:
        power = N.nvmlDeviceGetPowerUsage(handle)
    except N.NVMLError:
        power = None
    try:
        power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
    except N.NVMLError:
        power_limit = None
    try:
        nv_comp_processes = \
            N.nvmlDeviceGetComputeRunningProcesses(handle)
    except N.NVMLError:
        nv_comp_processes = None  # Not supported
    try:
        nv_graphics_processes = \
            N.nvmlDeviceGetGraphicsRunningProcesses(handle)
    except N.NVMLError:
        nv_graphics_processes = None  # Not supported

    if nv_comp_processes is None and nv_graphics_processes is None:
        processes = None
    else:
        processes = []
        nv_comp_processes = nv_comp_processes or []
        nv_graphics_processes = nv_graphics_processes or []
        for nv_process in nv_comp_processes + nv_graphics_processes:
            # TODO: could be more information such as system memory
            # usage, CPU percentage, create time etc.
            try:
                process = get_process_info(nv_process)
                processes.append(process)
            except psutil.NoSuchProcess:
                # TODO: add some reminder for NVML broken context
                # e.g. nvidia-smi reset or reboot the system
                pass

    index = N.nvmlDeviceGetIndex(handle)
    gpu_info = {
        'index': index,
        'uuid': uuid,
        'name': name,
        'temperature.gpu': temperature,
        'fan.speed': fan_speed,
        'utilization.gpu': utilization.gpu if utilization else None,
        'power.draw': power // 1000 if power is not None else None,
        'enforced.power.limit': power_limit // 1000
            if power_limit is not None else None,
        # Convert bytes into MBytes
        'memory.used': memory.used // MB if memory else None,
        'memory.total': memory.total // MB if memory else None,
        'processes': processes,
    }
    return gpu_info
def get_gpu_info(handle):
    """Get information for the single GPU specified by an nvml handle"""

    def get_process_info(nv_process):
        """Get the process information of a specific pid"""
        process = {}
        if nv_process.pid not in GPUStatCollection.global_processes:
            GPUStatCollection.global_processes[nv_process.pid] = \
                psutil.Process(pid=nv_process.pid)
        ps_process = GPUStatCollection.global_processes[nv_process.pid]
        process['username'] = ps_process.username()
        # cmdline returns full path;
        # as in `ps -o comm`, get short cmdnames.
        _cmdline = ps_process.cmdline()
        if not _cmdline:
            # sometimes, zombie or unknown (e.g. [kworker/8:2H])
            process['command'] = '?'
            process['full_command'] = ['?']
        else:
            process['command'] = os.path.basename(_cmdline[0])
            process['full_command'] = _cmdline
        # Bytes to MBytes
        # if the driver is not in TCC mode this will be None.
        usedmem = nv_process.usedGpuMemory // MB if \
            nv_process.usedGpuMemory else None
        process['gpu_memory_usage'] = usedmem
        process['cpu_percent'] = ps_process.cpu_percent()
        process['cpu_memory_usage'] = \
            round((ps_process.memory_percent() / 100.0) *
                  psutil.virtual_memory().total)
        process['pid'] = nv_process.pid
        return process

    name = _decode(N.nvmlDeviceGetName(handle))
    uuid = _decode(N.nvmlDeviceGetUUID(handle))

    try:
        temperature = N.nvmlDeviceGetTemperature(
            handle, N.NVML_TEMPERATURE_GPU)
    except N.NVMLError:
        temperature = None  # Not supported
    try:
        fan_speed = N.nvmlDeviceGetFanSpeed(handle)
    except N.NVMLError:
        fan_speed = None  # Not supported
    try:
        memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
    except N.NVMLError:
        memory = None  # Not supported
    try:
        utilization = N.nvmlDeviceGetUtilizationRates(handle)
    except N.NVMLError:
        utilization = None  # Not supported
    try:
        power = N.nvmlDeviceGetPowerUsage(handle)
    except N.NVMLError:
        power = None
    try:
        power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
    except N.NVMLError:
        power_limit = None
    try:
        nv_comp_processes = \
            N.nvmlDeviceGetComputeRunningProcesses(handle)
    except N.NVMLError:
        nv_comp_processes = None  # Not supported
    try:
        nv_graphics_processes = \
            N.nvmlDeviceGetGraphicsRunningProcesses(handle)
    except N.NVMLError:
        nv_graphics_processes = None  # Not supported

    if nv_comp_processes is None and nv_graphics_processes is None:
        processes = None
    else:
        processes = []
        nv_comp_processes = nv_comp_processes or []
        nv_graphics_processes = nv_graphics_processes or []
        for nv_process in nv_comp_processes + nv_graphics_processes:
            try:
                process = get_process_info(nv_process)
                processes.append(process)
            except psutil.NoSuchProcess:
                # TODO: add some reminder for NVML broken context
                # e.g. nvidia-smi reset or reboot the system
                pass

        # TODO: Do not block if full process info is not requested
        time.sleep(0.1)
        for process in processes:
            pid = process['pid']
            cache_process = GPUStatCollection.global_processes[pid]
            process['cpu_percent'] = cache_process.cpu_percent()

    index = N.nvmlDeviceGetIndex(handle)
    gpu_info = {
        'index': index,
        'uuid': uuid,
        'name': name,
        'temperature.gpu': temperature,
        'fan.speed': fan_speed,
        'utilization.gpu': utilization.gpu if utilization else None,
        'power.draw': power // 1000 if power is not None else None,
        'enforced.power.limit': power_limit // 1000
            if power_limit is not None else None,
        # Convert bytes into MBytes
        'memory.used': memory.used // MB if memory else None,
        'memory.total': memory.total // MB if memory else None,
        'processes': processes,
    }
    GPUStatCollection.clean_processes()
    return gpu_info
def get_gpu_info(handle):
    """Get information for the single GPU specified by an nvml handle"""

    def get_process_info(nv_process):
        """Get the process information of a specific pid"""
        process = {}
        if nv_process.pid not in GPUStatCollection.global_processes:
            GPUStatCollection.global_processes[nv_process.pid] = \
                psutil.Process(pid=nv_process.pid)
        ps_process = GPUStatCollection.global_processes[nv_process.pid]

        # TODO: ps_process is being cached, but the dict below is not.
        process['username'] = ps_process.username()
        # cmdline returns full path;
        # as in `ps -o comm`, get short cmdnames.
        _cmdline = ps_process.cmdline()
        if not _cmdline:
            # sometimes, zombie or unknown (e.g. [kworker/8:2H])
            process['command'] = '?'
            process['full_command'] = ['?']
        else:
            process['command'] = os.path.basename(_cmdline[0])
            process['full_command'] = _cmdline
        # Bytes to MBytes
        # if the driver is not in TCC mode this will be None.
        usedmem = (nv_process.usedGpuMemory // MB
                   if nv_process.usedGpuMemory else None)
        process['gpu_memory_usage'] = usedmem
        # process['gpu_memory_usage'] = ("%d MiB" % usedmem
        #                                if usedmem is not None else usedmem)
        process['cpu_percent'] = ps_process.cpu_percent()
        # process['cpu_memory_usage'] = "%d MiB" % (
        #     round((ps_process.memory_percent() / 100.0) *
        #           psutil.virtual_memory().total) // MB)
        process['cpu_memory_usage'] = (
            round((ps_process.memory_percent() / 100.0) *
                  psutil.virtual_memory().total) // MB)
        process['pid'] = nv_process.pid
        return process

    name = _decode(N.nvmlDeviceGetName(handle))
    uuid = _decode(N.nvmlDeviceGetUUID(handle))

    try:
        temperature = N.nvmlDeviceGetTemperature(
            handle, N.NVML_TEMPERATURE_GPU
        )
    except N.NVMLError:
        temperature = None  # Not supported
    try:
        fan_speed = N.nvmlDeviceGetFanSpeed(handle)
    except N.NVMLError:
        fan_speed = None  # Not supported
    try:
        memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
    except N.NVMLError:
        memory = None  # Not supported
    try:
        utilization = N.nvmlDeviceGetUtilizationRates(handle)
    except N.NVMLError:
        utilization = None  # Not supported
    try:
        utilization_enc = N.nvmlDeviceGetEncoderUtilization(handle)
    except N.NVMLError:
        utilization_enc = None  # Not supported
    try:
        utilization_dec = N.nvmlDeviceGetDecoderUtilization(handle)
    except N.NVMLError:
        utilization_dec = None  # Not supported
    try:
        power = N.nvmlDeviceGetPowerUsage(handle)
    except N.NVMLError:
        power = None
    try:
        power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
    except N.NVMLError:
        power_limit = None
    try:
        nv_comp_processes = \
            N.nvmlDeviceGetComputeRunningProcesses(handle)
    except N.NVMLError:
        nv_comp_processes = None  # Not supported
    try:
        nv_graphics_processes = \
            N.nvmlDeviceGetGraphicsRunningProcesses(handle)
    except N.NVMLError:
        nv_graphics_processes = None  # Not supported

    if nv_comp_processes is None and nv_graphics_processes is None:
        processes = None
    else:
        processes = []
        nv_comp_processes = nv_comp_processes or []
        nv_graphics_processes = nv_graphics_processes or []
        # A single process might run in both graphics and compute mode;
        # however, we will display the process only once
        seen_pids = set()
        for nv_process in nv_comp_processes + nv_graphics_processes:
            if nv_process.pid in seen_pids:
                continue
            seen_pids.add(nv_process.pid)
            try:
                process = get_process_info(nv_process)
                processes.append(process)
            except psutil.NoSuchProcess:
                # TODO: add some reminder for NVML broken context
                # e.g. nvidia-smi reset or reboot the system
                pass
            except FileNotFoundError:
                # Ignore the exception, which has probably occurred
                # inside psutil due to a non-existent PID (see #95).
                # The exception should have been translated, but there
                # appears to be a bug in psutil. It is unlikely that
                # FileNotFoundError is thrown in other situations.
                pass

        # TODO: Do not block if full process info is not requested
        time.sleep(0.1)
        for process in processes:
            pid = process['pid']
            cache_process = GPUStatCollection.global_processes[pid]
            try:
                process['cpu_percent'] = cache_process.cpu_percent()
            except psutil.NoSuchProcess:
                process['cpu_percent'] = 0.0
            except FileNotFoundError:
                # Same psutil translation bug as above (see #95).
                process['cpu_percent'] = 0.0

    index = N.nvmlDeviceGetIndex(handle)
    gpu_info = {
        'index': index,
        'uuid': uuid,
        'name': name,
        'temperature.gpu': temperature,
        'fan.speed': fan_speed,
        'utilization.gpu': utilization.gpu if utilization else 0,
        'utilization.enc':
            utilization_enc[0] if utilization_enc else None,
        'utilization.dec':
            utilization_dec[0] if utilization_dec else None,
        'power.draw': power // 1000 if power is not None else 0,
        'enforced.power.limit': power_limit // 1000
            if power_limit is not None else 0,
        # Convert bytes into MBytes
        'memory.used': memory.used // MB if memory else 0,
        'memory.total': memory.total // MB if memory else 0,
        'processes': processes,
    }
    GPUStatCollection.clean_processes()
    return gpu_info
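# GPUStatCollection.clean_processes() is called above but not shown; it
# presumably evicts cached psutil.Process handles for pids that have exited.
# A standalone sketch of that idea (hypothetical, inferred from usage):
import psutil

def clean_processes(global_processes):
    """Drop cached psutil.Process objects whose pids no longer exist."""
    for pid in list(global_processes.keys()):
        if not psutil.pid_exists(pid):
            del global_processes[pid]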
def get_gpu_info(handle):
    """Get information for the single GPU specified by an nvml handle"""

    def get_last_used(index):
        last_useds = []
        if not os.path.exists('gpu_history.pkl'):
            pickle.dump({}, open('gpu_history.pkl', 'wb'))
        with open('gpu_history.pkl', 'rb') as f:
            history = pickle.load(f)
        if platform.node() in history:
            for user, last_used in history[
                    platform.node()][index].items():
                # 1 day = 24 hours, 1 hour = 3600 seconds
                used_before = \
                    (datetime.now() - last_used['last_used']).days * 24 + \
                    (datetime.now() - last_used['last_used']).seconds / 3600
                last_useds.append((user, used_before))
            return last_useds
        else:
            return []

    def get_process_info(nv_process):
        """Get the process information of a specific pid"""
        process = {}
        if nv_process.pid not in GPUStatCollection.global_processes:
            GPUStatCollection.global_processes[nv_process.pid] = \
                psutil.Process(pid=nv_process.pid)
        ps_process = GPUStatCollection.global_processes[nv_process.pid]
        process['username'] = ps_process.username()
        # cmdline returns full path;
        # as in `ps -o comm`, get short cmdnames.
        _cmdline = ps_process.cmdline()
        if not _cmdline:
            # sometimes, zombie or unknown (e.g. [kworker/8:2H])
            process['command'] = '?'
            process['full_command'] = ['?']
        else:
            process['command'] = os.path.basename(_cmdline[0])
            process['full_command'] = _cmdline
        # Bytes to MBytes
        process['gpu_memory_usage'] = nv_process.usedGpuMemory // MB
        process['cpu_percent'] = ps_process.cpu_percent()
        process['cpu_memory_usage'] = \
            round((ps_process.memory_percent() / 100.0) *
                  psutil.virtual_memory().total)
        process['pid'] = nv_process.pid
        return process

    name = _decode(N.nvmlDeviceGetName(handle))
    uuid = _decode(N.nvmlDeviceGetUUID(handle))

    try:
        temperature = N.nvmlDeviceGetTemperature(
            handle, N.NVML_TEMPERATURE_GPU)
    except N.NVMLError:
        temperature = None  # Not supported
    try:
        fan_speed = N.nvmlDeviceGetFanSpeed(handle)
    except N.NVMLError:
        fan_speed = None  # Not supported
    try:
        memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
    except N.NVMLError:
        memory = None  # Not supported
    try:
        utilization = N.nvmlDeviceGetUtilizationRates(handle)
    except N.NVMLError:
        utilization = None  # Not supported
    try:
        power = N.nvmlDeviceGetPowerUsage(handle)
    except N.NVMLError:
        power = None
    try:
        power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
    except N.NVMLError:
        power_limit = None
    try:
        nv_comp_processes = \
            N.nvmlDeviceGetComputeRunningProcesses(handle)
    except N.NVMLError:
        nv_comp_processes = None  # Not supported
    try:
        nv_graphics_processes = \
            N.nvmlDeviceGetGraphicsRunningProcesses(handle)
    except N.NVMLError:
        nv_graphics_processes = None  # Not supported

    if nv_comp_processes is None and nv_graphics_processes is None:
        processes = None
    else:
        processes = []
        nv_comp_processes = nv_comp_processes or []
        nv_graphics_processes = nv_graphics_processes or []
        for nv_process in nv_comp_processes + nv_graphics_processes:
            try:
                process = get_process_info(nv_process)
                processes.append(process)
            except psutil.NoSuchProcess:
                # TODO: add some reminder for NVML broken context
                # e.g. nvidia-smi reset or reboot the system
                pass

        # TODO: Do not block if full process info is not requested
        time.sleep(0.1)
        for process in processes:
            pid = process['pid']
            cache_process = GPUStatCollection.global_processes[pid]
            process['cpu_percent'] = cache_process.cpu_percent()

    index = N.nvmlDeviceGetIndex(handle)
    last_used = get_last_used(index)
    gpu_info = {
        'index': index,
        'uuid': uuid,
        'name': name,
        'temperature.gpu': temperature,
        'fan.speed': fan_speed,
        'utilization.gpu': utilization.gpu if utilization else None,
        'power.draw': power // 1000 if power is not None else None,
        'enforced.power.limit': power_limit // 1000
            if power_limit is not None else None,
        # Convert bytes into MBytes
        'memory.used': memory.used // MB if memory else None,
        'memory.total': memory.total // MB if memory else None,
        'processes': processes,
        'last_used': last_used,
    }
    GPUStatCollection.clean_processes()
    return gpu_info
def get_gpu_info(handle):
    """Get information for the single GPU specified by an nvml handle"""

    def get_process_info(pid):
        """Get the process information of a specific pid"""
        process = {}
        ps_process = psutil.Process(pid=pid)
        process['username'] = ps_process.username()
        # cmdline returns full path; as in `ps -o comm`, get short cmdnames.
        process['command'] = os.path.basename(ps_process.cmdline()[0])
        # Bytes to MBytes
        # Note: nv_process is the loop variable of the enclosing scope,
        # resolved via closure when this helper is called.
        process['gpu_memory_usage'] = int(nv_process.usedGpuMemory
                                          / 1024 / 1024)
        process['pid'] = nv_process.pid
        return process

    def _decode(b):
        if isinstance(b, bytes):
            return b.decode()  # for python3, to unicode
        return b

    name = _decode(N.nvmlDeviceGetName(handle))
    uuid = _decode(N.nvmlDeviceGetUUID(handle))

    try:
        temperature = N.nvmlDeviceGetTemperature(
            handle, N.NVML_TEMPERATURE_GPU)
    except N.NVMLError:
        temperature = None  # Not supported
    try:
        memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
    except N.NVMLError:
        memory = None  # Not supported
    try:
        utilization = N.nvmlDeviceGetUtilizationRates(handle)
    except N.NVMLError:
        utilization = None  # Not supported

    processes = []
    try:
        nv_comp_processes = N.nvmlDeviceGetComputeRunningProcesses(
            handle)
    except N.NVMLError:
        nv_comp_processes = None  # Not supported
    try:
        nv_graphics_processes = N.nvmlDeviceGetGraphicsRunningProcesses(
            handle)
    except N.NVMLError:
        nv_graphics_processes = None  # Not supported

    if nv_comp_processes is None and nv_graphics_processes is None:
        processes = None  # Not supported (in both cases)
    else:
        nv_comp_processes = nv_comp_processes or []
        nv_graphics_processes = nv_graphics_processes or []
        for nv_process in (nv_comp_processes + nv_graphics_processes):
            # TODO: could be more information such as system memory usage,
            # CPU percentage, create time etc.
            try:
                process = get_process_info(nv_process.pid)
                processes.append(process)
            except psutil.NoSuchProcess:
                # TODO: add some reminder for NVML broken context
                # e.g. nvidia-smi reset or reboot the system
                pass

    index = N.nvmlDeviceGetIndex(handle)
    gpu_info = {
        'index': index,
        'uuid': uuid,
        'name': name,
        'temperature.gpu': temperature,
        'utilization.gpu': utilization.gpu if utilization else None,
        # Convert bytes into MBytes
        'memory.used': int(memory.used / 1024 / 1024) if memory else None,
        'memory.total': int(memory.total / 1024 / 1024) if memory else None,
        'processes': processes,
    }
    return gpu_info
def info_refresh(self):
    try:
        stat = open("/proc/stat")
        self.statlines = stat.read().splitlines()[1:-1]
        stat.close()
    except IOError:
        print("Problem opening /proc/stat, exiting..")
        pynvml.nvmlShutdown()
        quit()

    for i in range(self.corecount):
        for j in self.statlines[i].split()[1:]:  # skip the "cpuN" label
            self.total[i] += int(j)
        self.idle[i] = int(self.statlines[i].split()[4])

    for i in range(self.corecount):
        if (self.total[i] - self.prev_total[i]) == 0:
            self.prev_idle[i] = self.idle[i]
            self.prev_total[i] = self.total[i]
            break
        self.cpu_prog_bars[i].set_fraction(
            1 - ((self.idle[i] - self.prev_idle[i]) /
                 (self.total[i] - self.prev_total[i])))
        self.prev_idle[i] = self.idle[i]
        self.prev_total[i] = self.total[i]
        self.idle[i] = 0
        self.total[i] = 0

    for i in range(self.deviceCount):
        util = pynvml.nvmlDeviceGetUtilizationRates(self.gpu_handles[i])
        temp = pynvml.nvmlDeviceGetTemperature(
            self.gpu_handles[i], pynvml.NVML_TEMPERATURE_GPU)
        memInfo = pynvml.nvmlDeviceGetMemoryInfo(self.gpu_handles[i])
        (encoder_util, sPeriod) = pynvml.nvmlDeviceGetEncoderUtilization(
            self.gpu_handles[i])
        (decoder_util, sPeriod) = pynvml.nvmlDeviceGetDecoderUtilization(
            self.gpu_handles[i])

        mem_total = memInfo.total / 1024 / 1024
        mem_used = memInfo.used / 1024 / 1024

        self.gpu_prog_bars[i*6].set_text("GPU: %d%%" % util.gpu)
        self.gpu_prog_bars[i*6].set_fraction(util.gpu / 100)

        ########
        self.util_history.append(util.gpu)
        self.util_graph.queue_draw()
        self.temp_history.append(temp)
        self.temp_graph.queue_draw()
        ########

        self.gpu_prog_bars[i*6 + 1].set_text(
            "Memory Utilization: %d%%" % util.memory)
        self.gpu_prog_bars[i*6 + 1].set_fraction(util.memory / 100)
        self.gpu_prog_bars[i*6 + 4].set_text("Encoder: %d%%" % encoder_util)
        self.gpu_prog_bars[i*6 + 5].set_text("Decoder: %d%%" % decoder_util)
        self.gpu_prog_bars[i*6 + 4].set_fraction(encoder_util / 100)
        self.gpu_prog_bars[i*6 + 5].set_fraction(decoder_util / 100)
        self.gpu_prog_bars[i*6 + 2].set_text(
            "Memory Usage: %d MiB/%d MiB" % (mem_used, mem_total))
        self.gpu_prog_bars[i*6 + 2].set_fraction(mem_used / mem_total)
        self.gpu_prog_bars[i*6 + 3].set_text("Temperature: %d °C" % temp)
        if temp > 100:
            temp = 100
        elif temp < 0:
            temp = 0
        self.gpu_prog_bars[i*6 + 3].set_fraction(temp / 100)

    # --proc-- (processes are listed for GPU 0 only)
    procs = pynvml.nvmlDeviceGetGraphicsRunningProcesses(self.gpu_handles[0])
    proc_liststore = Gtk.ListStore(int, str, int)
    for p in procs:
        pid = p.pid
        try:
            path = pynvml.nvmlSystemGetProcessName(p.pid).decode('utf-8')
        except pynvml.NVMLError:
            self.exit()
        if p.usedGpuMemory is None:
            mem = 0
        else:
            mem = p.usedGpuMemory // 1024 // 1024  # ListStore column is int
        proc_liststore.append([pid, path, mem])
    self.tree.set_model(proc_liststore)
    return True
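# info_refresh assumes self.gpu_handles and self.deviceCount were prepared
# during startup; a standalone sketch of that setup (hypothetical, not part
# of the original snippet):
import pynvml

def init_gpu_handles():
    pynvml.nvmlInit()
    count = pynvml.nvmlDeviceGetCount()
    return [pynvml.nvmlDeviceGetHandleByIndex(i) for i in range(count)]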
def get_gpu_info(handle):
    """Get information for the single GPU specified by an nvml handle"""

    def get_process_info(nv_process):
        """Get the process information of a specific pid"""
        process = {}
        ps_process = psutil.Process(pid=nv_process.pid)
        process['username'] = ps_process.username()
        # cmdline returns full path;
        # as in `ps -o comm`, get short cmdnames.
        _cmdline = ps_process.cmdline()
        if not _cmdline:
            # sometimes, zombie or unknown (e.g. [kworker/8:2H])
            process['command'] = '?'
        else:
            process['command'] = os.path.basename(_cmdline[0])
        # Bytes to MBytes
        process['gpu_memory_usage'] = nv_process.usedGpuMemory // MB
        process['pid'] = nv_process.pid
        return process

    name = _decode(N.nvmlDeviceGetName(handle))
    uuid = _decode(N.nvmlDeviceGetUUID(handle))

    try:
        temperature = N.nvmlDeviceGetTemperature(
            handle, N.NVML_TEMPERATURE_GPU
        )
    except N.NVMLError:
        temperature = None  # Not supported
    try:
        memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
    except N.NVMLError:
        memory = None  # Not supported
    try:
        utilization = N.nvmlDeviceGetUtilizationRates(handle)
    except N.NVMLError:
        utilization = None  # Not supported
    try:
        power = N.nvmlDeviceGetPowerUsage(handle)
    except N.NVMLError:
        power = None
    try:
        power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
    except N.NVMLError:
        power_limit = None
    try:
        nv_comp_processes = \
            N.nvmlDeviceGetComputeRunningProcesses(handle)
    except N.NVMLError:
        nv_comp_processes = None  # Not supported
    try:
        nv_graphics_processes = \
            N.nvmlDeviceGetGraphicsRunningProcesses(handle)
    except N.NVMLError:
        nv_graphics_processes = None  # Not supported

    if nv_comp_processes is None and nv_graphics_processes is None:
        processes = None
    else:
        processes = []
        nv_comp_processes = nv_comp_processes or []
        nv_graphics_processes = nv_graphics_processes or []
        for nv_process in nv_comp_processes + nv_graphics_processes:
            # TODO: could be more information such as system memory
            # usage, CPU percentage, create time etc.
            try:
                process = get_process_info(nv_process)
                processes.append(process)
            except psutil.NoSuchProcess:
                # TODO: add some reminder for NVML broken context
                # e.g. nvidia-smi reset or reboot the system
                pass

    index = N.nvmlDeviceGetIndex(handle)
    gpu_info = {
        'index': index,
        'uuid': uuid,
        'name': name,
        'temperature.gpu': temperature,
        'utilization.gpu': utilization.gpu if utilization else None,
        'power.draw': power // 1000 if power is not None else None,
        'enforced.power.limit': power_limit // 1000
            if power_limit is not None else None,
        # Convert bytes into MBytes
        'memory.used': memory.used // MB if memory else None,
        'memory.total': memory.total // MB if memory else None,
        'processes': processes,
    }
    return gpu_info
def get_gpu_info(handle):
    """Get information for the single GPU specified by an nvml handle"""

    def get_process_info(pid):
        """Get the process information of a specific pid"""
        process = {}
        ps_process = psutil.Process(pid=pid)
        process['username'] = ps_process.username()
        # cmdline returns full path; as in `ps -o comm`, get short cmdnames.
        _cmdline = ps_process.cmdline()
        if not _cmdline:
            # sometimes, zombie or unknown (e.g. [kworker/8:2H])
            process['command'] = '?'
        else:
            process['command'] = os.path.basename(_cmdline[0])
        # Bytes to MBytes
        # Note: nv_process is the loop variable of the enclosing scope,
        # resolved via closure when this helper is called.
        process['gpu_memory_usage'] = int(nv_process.usedGpuMemory
                                          / 1024 / 1024)
        process['pid'] = nv_process.pid
        return process

    def _decode(b):
        if isinstance(b, bytes):
            return b.decode()  # for python3, to unicode
        return b

    name = _decode(N.nvmlDeviceGetName(handle))
    uuid = _decode(N.nvmlDeviceGetUUID(handle))

    try:
        minor = int(N.nvmlDeviceGetMinorNumber(handle))
    except N.NVMLError:
        minor = None  # Not supported
    try:
        bus_id = _decode(N.nvmlDeviceGetPciInfo(handle).busId)
    except N.NVMLError:
        bus_id = None  # Not supported
    try:
        serial = _decode(N.nvmlDeviceGetSerial(handle))
    except N.NVMLError:
        serial = None  # Not supported
    try:
        temperature = N.nvmlDeviceGetTemperature(handle,
                                                 N.NVML_TEMPERATURE_GPU)
    except N.NVMLError:
        temperature = None  # Not supported
    try:
        memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
    except N.NVMLError:
        memory = None  # Not supported
    try:
        utilization = N.nvmlDeviceGetUtilizationRates(handle)
    except N.NVMLError:
        utilization = None  # Not supported
    try:
        power = N.nvmlDeviceGetPowerUsage(handle)
    except (N.NVMLError, N.NVMLError_NotSupported):
        power = None
    try:
        power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
    except (N.NVMLError, N.NVMLError_NotSupported):
        power_limit = None

    processes = []
    try:
        nv_comp_processes = N.nvmlDeviceGetComputeRunningProcesses(handle)
    except N.NVMLError:
        nv_comp_processes = None  # Not supported
    try:
        nv_graphics_processes = N.nvmlDeviceGetGraphicsRunningProcesses(
            handle)
    except N.NVMLError:
        nv_graphics_processes = None  # Not supported

    if nv_comp_processes is None and nv_graphics_processes is None:
        processes = None  # Not supported (in both cases)
    else:
        nv_comp_processes = nv_comp_processes or []
        nv_graphics_processes = nv_graphics_processes or []
        for nv_process in (nv_comp_processes + nv_graphics_processes):
            # TODO: could be more information such as system memory usage,
            # CPU percentage, create time etc.
            try:
                process = get_process_info(nv_process.pid)
                processes.append(process)
            except psutil.NoSuchProcess:
                # TODO: add some reminder for NVML broken context
                # e.g. nvidia-smi reset or reboot the system
                pass

    index = N.nvmlDeviceGetIndex(handle)
    gpu_info = {
        'index': index,
        'uuid': uuid,
        'name': name,
        'minor': minor,
        'bus_id': bus_id,
        'serial': serial,
        'temperature_gpu': temperature,
        'utilization_gpu': utilization.gpu if utilization else None,
        'power_draw': int(power / 1000) if power is not None else None,
        'power_limit': int(power_limit / 1000)
            if power_limit is not None else None,
        'memory_free': int(memory.free) if memory else None,
        'memory_used': int(memory.used) if memory else None,
        'memory_total': int(memory.total) if memory else None,
        'memory_utilization': utilization.memory if utilization else None,
        'processes': processes,
    }
    return gpu_info
def get_gpu_info(handle):
    """Get information for the single GPU specified by an nvml handle"""

    def get_process_info(pid):
        """Get the process information of a specific pid"""
        process = {}
        ps_process = psutil.Process(pid=pid)
        process['username'] = ps_process.username()
        # cmdline returns full path; as in `ps -o comm`, get short cmdnames.
        _cmdline = ps_process.cmdline()
        if not _cmdline:
            # sometimes, zombie or unknown (e.g. [kworker/8:2H])
            process['command'] = '?'
        else:
            process['command'] = os.path.basename(_cmdline[0])
        # Bytes to MBytes
        # Note: nv_process is the loop variable of the enclosing scope,
        # resolved via closure when this helper is called.
        process['gpu_memory_usage'] = int(nv_process.usedGpuMemory
                                          / 1024 / 1024)
        process['pid'] = nv_process.pid

        # For docker: map the pid to its container id and, via `docker ps`,
        # to the container name.
        cmd = 'cat /proc/{}/cgroup'.format(nv_process.pid)
        ret = subprocess.check_output(cmd.split())
        container_id = str(ret).split('/')[2][:12]
        process['container_id'] = container_id
        cmd = 'docker ps -a'
        ret = subprocess.check_output(cmd.split())
        docker_data = str(ret).split('\\n')[1:-1]
        for personal in docker_data:
            personal_data = personal.split()
            if container_id == personal_data[0]:
                process['container_user_name'] = personal_data[-1]
        return process

    def _decode(b):
        if isinstance(b, bytes):
            return b.decode()  # for python3, to unicode
        return b

    name = _decode(N.nvmlDeviceGetName(handle))
    uuid = _decode(N.nvmlDeviceGetUUID(handle))

    try:
        temperature = N.nvmlDeviceGetTemperature(
            handle, N.NVML_TEMPERATURE_GPU)
    except N.NVMLError:
        temperature = None  # Not supported
    try:
        memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
    except N.NVMLError:
        memory = None  # Not supported
    try:
        utilization = N.nvmlDeviceGetUtilizationRates(handle)
    except N.NVMLError:
        utilization = None  # Not supported
    try:
        power = N.nvmlDeviceGetPowerUsage(handle)
    except N.NVMLError:
        power = None
    try:
        power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
    except N.NVMLError:
        power_limit = None

    processes = []
    try:
        nv_comp_processes = N.nvmlDeviceGetComputeRunningProcesses(
            handle)
    except N.NVMLError:
        nv_comp_processes = None  # Not supported
    try:
        nv_graphics_processes = N.nvmlDeviceGetGraphicsRunningProcesses(
            handle)
    except N.NVMLError:
        nv_graphics_processes = None  # Not supported

    if nv_comp_processes is None and nv_graphics_processes is None:
        processes = None  # Not supported (in both cases)
    else:
        nv_comp_processes = nv_comp_processes or []
        nv_graphics_processes = nv_graphics_processes or []
        for nv_process in (nv_comp_processes + nv_graphics_processes):
            # TODO: could be more information such as system memory usage,
            # CPU percentage, create time etc.
            try:
                process = get_process_info(nv_process.pid)
                processes.append(process)
            except psutil.NoSuchProcess:
                # TODO: add some reminder for NVML broken context
                # e.g. nvidia-smi reset or reboot the system
                pass

    index = N.nvmlDeviceGetIndex(handle)
    gpu_info = {
        'index': index,
        'uuid': uuid,
        'name': name,
        'temperature.gpu': temperature,
        'utilization.gpu': utilization.gpu if utilization else None,
        'power.draw': int(power / 1000) if power is not None else None,
        'enforced.power.limit': int(power_limit / 1000)
            if power_limit is not None else None,
        # Convert bytes into MBytes
        'memory.used': int(memory.used / 1024 / 1024) if memory else None,
        'memory.total': int(memory.total / 1024 / 1024) if memory else None,
        'processes': processes,
    }
    return gpu_info
# power state
power_used = nvmlDeviceGetPowerUsage(handle) / 1000
power_limit = nvmlDeviceGetPowerManagementDefaultLimit(handle) / 1000
power_used = int(power_used)
power_limit = int(power_limit)
power_rate = int(power_used / power_limit * 100)
msg = pack_msg([power_used, power_limit], 'W')
info['Power Util'] = getBar(power_rate, msg)

# fan speed, temperature
fan_speed = nvmlDeviceGetFanSpeed(handle)
temp = nvmlDeviceGetTemperature(handle, 0)
msg = f"{temp}C"
info['Fan Speed'] = getBar(fan_speed, msg)

message = [f"{k} \t{v}" for k, v in info.items()]
print('\n'.join(message))

# graphics processes
graphic_processes = nvmlDeviceGetGraphicsRunningProcesses(handle)
header = "\n=== Graphic Processes ==="
show_process(header, graphic_processes)

# compute processes
compute_processes = nvmlDeviceGetComputeRunningProcesses(handle)
header = "\n=== Compute Processes ==="
show_process(header, compute_processes)

nvmlShutdown()
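# pack_msg, getBar and show_process are assumed by the fragment above but
# not shown; one plausible set of implementations (hypothetical, inferred
# from how they are called):
from pynvml import nvmlSystemGetProcessName

def pack_msg(values, unit):
    # e.g. [53, 250], 'W' -> "53W / 250W"
    return " / ".join(f"{v}{unit}" for v in values)

def getBar(percent, msg, width=30):
    filled = int(width * percent / 100)
    return f"[{'#' * filled}{'-' * (width - filled)}] {percent:3d}% {msg}"

def show_process(header, processes):
    print(header)
    for p in processes:
        name = nvmlSystemGetProcessName(p.pid)
        if isinstance(name, bytes):  # older pynvml returns bytes
            name = name.decode()
        mem = ("N/A" if p.usedGpuMemory is None
               else f"{p.usedGpuMemory // (1024 * 1024)} MiB")
        print(f"  pid {p.pid:>7}  {mem:>10}  {name}")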