def get_gpu_count_mig(return_uuids=False):
    """Return the number of MIG instances available.

    Parameters
    ----------
    return_uuids : bool
        If True, also return the UUIDs of the available MIG instances.
    """
    pynvml.nvmlInit()
    uuids = []
    for index in range(get_gpu_count()):
        handle = pynvml.nvmlDeviceGetHandleByIndex(index)
        try:
            is_mig_mode = pynvml.nvmlDeviceGetMigMode(handle)[0]
        except pynvml.NVMLError:
            # if not a MIG device, i.e. a normal GPU, skip
            continue
        if is_mig_mode:
            count = pynvml.nvmlDeviceGetMaxMigDeviceCount(handle)
            miguuids = []
            for i in range(count):
                try:
                    mighandle = pynvml.nvmlDeviceGetMigDeviceHandleByIndex(
                        device=handle, index=i
                    )
                    miguuids.append(mighandle)
                    uuids.append(pynvml.nvmlDeviceGetUUID(mighandle))
                except pynvml.NVMLError:
                    pass
    if return_uuids:
        return len(uuids), uuids
    return len(uuids)
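A minimal usage sketch for the helper above; it assumes pynvml is installed and that the companion get_gpu_count() helper from the same module is importable:

# Hypothetical usage: count MIG instances and, optionally, list their UUIDs.
n_mig, mig_uuids = get_gpu_count_mig(return_uuids=True)
print(f"MIG instances available: {n_mig}")
for mig_uuid in mig_uuids:
    print(mig_uuid)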
def get_uuid(handle):
    """
    Returns the globally unique GPU device UUID

    https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html#group__nvmlDeviceQueries_1g72710fb20f30f0c2725ce31579832654
    """
    uuid = pynvml.nvmlDeviceGetUUID(handle)
    return to_utf8(uuid)
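to_utf8 is defined elsewhere in that module; a minimal stand-in, offered only as an assumption about its behavior, might look like:

def to_utf8(value):
    # Hypothetical helper: older NVML bindings return bytes,
    # newer ones may already return str.
    if isinstance(value, bytes):
        return value.decode("utf-8")
    return value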
def query_device(index):
    handle = pynvml.nvmlDeviceGetHandleByIndex(index)
    return {
        'index': index,
        'name': pynvml.nvmlDeviceGetName(handle).decode(),
        'utilization': pynvml.nvmlDeviceGetUtilizationRates(handle).gpu,
        'uuid': pynvml.nvmlDeviceGetUUID(handle).decode(),
    }
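A short usage sketch: initialize NVML, query every visible device with query_device, then shut NVML down (error handling omitted for brevity):

import pynvml

pynvml.nvmlInit()
try:
    for i in range(pynvml.nvmlDeviceGetCount()):
        print(query_device(i))
finally:
    pynvml.nvmlShutdown()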
def get_gpu_status(gpu_index=0):
    # initialize NVML before querying the device
    N.nvmlInit()
    handle = N.nvmlDeviceGetHandleByIndex(gpu_index)

    def _decode(b):
        if isinstance(b, bytes):
            return b.decode()  # to unicode
        return b

    name = _decode(N.nvmlDeviceGetName(handle))
    uuid = _decode(N.nvmlDeviceGetUUID(handle))

    try:
        temperature = N.nvmlDeviceGetTemperature(handle, N.NVML_TEMPERATURE_GPU)
    except N.NVMLError:
        temperature = None

    try:
        memory = N.nvmlDeviceGetMemoryInfo(handle)
    except N.NVMLError:
        memory = None

    try:
        utilization = N.nvmlDeviceGetUtilizationRates(handle)
    except N.NVMLError:
        utilization = None

    try:
        power = N.nvmlDeviceGetPowerUsage(handle)
    except N.NVMLError:
        power = None

    try:
        power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
    except N.NVMLError:
        power_limit = None

    # real gpu index
    index = N.nvmlDeviceGetIndex(handle)

    gpu_info = {
        'index': index,
        'uuid': uuid,
        'name': name,
        'temperature': temperature,
        'utilization': utilization.gpu if utilization else None,
        'power': int(power / 1000) if power is not None else None,
        'enforced.power': int(power_limit / 1000) if power_limit is not None else None,
        # Convert bytes into MBytes
        'memory.used': int(memory.used / 1024 / 1024) if memory else None,
        'memory.total': int(memory.total / 1024 / 1024) if memory else None,
    }

    # release resource
    N.nvmlShutdown()
    return GPUStat(gpu_info)
def measure(self, batch_idx: int) -> Dict[str, Measurement]:
    measurements = {}
    timestamp = datetime.now(timezone.utc)
    for i in range(self.num_gpus):
        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
        gpu_uuid = pynvml.nvmlDeviceGetUUID(handle)
        try:
            info = pynvml.nvmlDeviceGetMemoryInfo(handle)
            measurements[gpu_uuid] = Measurement(timestamp, batch_idx, info.free)
        except pynvml.NVMLError as e:
            logging.info(f"{LOG_NAMESPACE}: failed to sample GPU memory for GPU {i}: {e}")
    return measurements
def get_gpu_uuid_from_index(device_index=0):
    """Get GPU UUID from CUDA device index.

    Parameters
    ----------
    device_index: int or str
        The index of the device from which to obtain the UUID. Default: 0.

    Examples
    --------
    >>> get_gpu_uuid_from_index()
    'GPU-9baca7f5-0f2f-01ac-6b05-8da14d6e9005'

    >>> get_gpu_uuid_from_index(3)
    'GPU-9fb42d6f-7d6b-368f-f79c-3c3e784c93f6'
    """
    import pynvml

    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)
    return pynvml.nvmlDeviceGetUUID(handle).decode("utf-8")
def __init__(self) -> None:
    self._pynvml = None  # type: Optional[Any]
    self._device_count = None  # type: Optional[int]
    self._index_to_uuid_map = {}  # type: Dict[int, str]
    try:
        import pynvml

        pynvml.nvmlInit()
        try:
            num_gpus = pynvml.nvmlDeviceGetCount()
            for i in range(num_gpus):
                handle = pynvml.nvmlDeviceGetHandleByIndex(i)
                uuid = pynvml.nvmlDeviceGetUUID(handle)
                pynvml.nvmlDeviceGetMemoryInfo(handle)
                pynvml.nvmlDeviceGetUtilizationRates(handle)
                self._index_to_uuid_map[i] = uuid
            self._pynvml = pynvml
            self._device_count = num_gpus
        except Exception as e:
            logging.warning(
                f"{LOG_NAMESPACE}: pynvml is functional, but failed to pass functionality "
                f"test due to exception. Not collecting GPU metrics. Exception details: {e}"
            )
    except ModuleNotFoundError:
        logging.info(
            f"{LOG_NAMESPACE}: pynvml not found. Not collecting GPU metrics"
        )
    except pynvml.NVMLError_LibraryNotFound:
        logging.info(
            f"{LOG_NAMESPACE}: pynvml LibraryNotFound error. Not collecting GPU metrics"
        )
    except Exception as e:
        logging.error(
            f"{LOG_NAMESPACE}: unexpected error while trying to set up pynvml. Not "
            f"collecting GPU metrics. Please report this error to "
            f"https://github.com/determined-ai/determined as it should not be "
            f"encountered by users. Error details: {e}"
        )
def uuids(ngpus, handles):
    uuids = [pynvml.nvmlDeviceGetUUID(handles[i]) for i in range(ngpus)]
    assert len(uuids) == ngpus
    return uuids
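This reads like a pytest fixture; the sketch below is an assumption about how its ngpus and handles arguments might be prepared, not part of the original test suite:

import pynvml

pynvml.nvmlInit()
ngpus = pynvml.nvmlDeviceGetCount()
handles = [pynvml.nvmlDeviceGetHandleByIndex(i) for i in range(ngpus)]
print(uuids(ngpus, handles))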
def get_gpu_info(handle):
    """Get one GPU information specified by nvml handle"""

    def get_process_info(nv_process):
        """Get the process information of specific pid"""
        process = {}
        if nv_process.pid not in GPUStatCollection.global_processes:
            GPUStatCollection.global_processes[nv_process.pid] = \
                psutil.Process(pid=nv_process.pid)
        ps_process = GPUStatCollection.global_processes[nv_process.pid]
        process['username'] = ps_process.username()
        # cmdline returns full path;
        # as in `ps -o comm`, get short cmdnames.
        _cmdline = ps_process.cmdline()
        if not _cmdline:
            # sometimes, zombie or unknown (e.g. [kworker/8:2H])
            process['command'] = '?'
            process['full_command'] = ['?']
        else:
            process['command'] = os.path.basename(_cmdline[0])
            process['full_command'] = _cmdline
        # Bytes to MBytes
        # if drivers are not TTC this will be None.
        usedmem = nv_process.usedGpuMemory // MB if \
            nv_process.usedGpuMemory else None
        process['gpu_memory_usage'] = usedmem
        process['cpu_percent'] = ps_process.cpu_percent()
        process['cpu_memory_usage'] = \
            round((ps_process.memory_percent() / 100.0) *
                  psutil.virtual_memory().total)
        process['pid'] = nv_process.pid
        return process

    name = _decode(N.nvmlDeviceGetName(handle))
    uuid = _decode(N.nvmlDeviceGetUUID(handle))

    try:
        temperature = N.nvmlDeviceGetTemperature(
            handle, N.NVML_TEMPERATURE_GPU)
    except N.NVMLError:
        temperature = None  # Not supported

    try:
        fan_speed = N.nvmlDeviceGetFanSpeed(handle)
    except N.NVMLError:
        fan_speed = None  # Not supported

    try:
        memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
    except N.NVMLError:
        memory = None  # Not supported

    try:
        utilization = N.nvmlDeviceGetUtilizationRates(handle)
    except N.NVMLError:
        utilization = None  # Not supported

    try:
        power = N.nvmlDeviceGetPowerUsage(handle)
    except N.NVMLError:
        power = None

    try:
        power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
    except N.NVMLError:
        power_limit = None

    try:
        nv_comp_processes = \
            N.nvmlDeviceGetComputeRunningProcesses(handle)
    except N.NVMLError:
        nv_comp_processes = None  # Not supported
    try:
        nv_graphics_processes = \
            N.nvmlDeviceGetGraphicsRunningProcesses(handle)
    except N.NVMLError:
        nv_graphics_processes = None  # Not supported

    if nv_comp_processes is None and nv_graphics_processes is None:
        processes = None
    else:
        processes = []
        nv_comp_processes = nv_comp_processes or []
        nv_graphics_processes = nv_graphics_processes or []
        for nv_process in nv_comp_processes + nv_graphics_processes:
            try:
                process = get_process_info(nv_process)
                processes.append(process)
            except psutil.NoSuchProcess:
                # TODO: add some reminder for NVML broken context
                # e.g. nvidia-smi reset or reboot the system
                pass

        # TODO: Do not block if full process info is not requested
        time.sleep(0.1)
        for process in processes:
            pid = process['pid']
            cache_process = GPUStatCollection.global_processes[pid]
            process['cpu_percent'] = cache_process.cpu_percent()

    index = N.nvmlDeviceGetIndex(handle)
    gpu_info = {
        'index': index,
        'uuid': uuid,
        'name': name,
        'temperature.gpu': temperature,
        'fan.speed': fan_speed,
        'utilization.gpu': utilization.gpu if utilization else None,
        'power.draw': power // 1000 if power is not None else None,
        'enforced.power.limit': power_limit // 1000
        if power_limit is not None else None,
        # Convert bytes into MBytes
        'memory.used': memory.used // MB if memory else None,
        'memory.total': memory.total // MB if memory else None,
        'processes': processes,
    }
    GPUStatCollection.clean_processes()
    return gpu_info
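A rough sketch of a calling context for this gpustat-style helper; the module-level names it relies on (N, MB, _decode, GPUStatCollection) are assumed to come from the surrounding file:

import pynvml as N

N.nvmlInit()
try:
    # Query every visible device once and collect the per-GPU dicts.
    gpu_stats = [get_gpu_info(N.nvmlDeviceGetHandleByIndex(i))
                 for i in range(N.nvmlDeviceGetCount())]
finally:
    N.nvmlShutdown()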
def get_gpu_info(handle):
    """Get one GPU information specified by nvml handle"""

    def get_process_info(nv_process):
        """Get the process information of specific pid"""
        process = {}
        if nv_process.pid not in GPUStatCollection.global_processes:
            GPUStatCollection.global_processes[nv_process.pid] = \
                psutil.Process(pid=nv_process.pid)
        ps_process = GPUStatCollection.global_processes[nv_process.pid]

        # TODO: ps_process is being cached, but the dict below is not.
        process['username'] = ps_process.username()
        # cmdline returns full path;
        # as in `ps -o comm`, get short cmdnames.
        _cmdline = ps_process.cmdline()
        if not _cmdline:
            # sometimes, zombie or unknown (e.g. [kworker/8:2H])
            process['command'] = '?'
            process['full_command'] = ['?']
        else:
            process['command'] = os.path.basename(_cmdline[0])
            process['full_command'] = _cmdline
        # Bytes to MBytes
        # if drivers are not TTC this will be None.
        usedmem = (nv_process.usedGpuMemory // MB
                   if nv_process.usedGpuMemory else None)
        process['gpu_memory_usage'] = usedmem
        # process['gpu_memory_usage'] = ("%d MiB" % usedmem
        #                                if usedmem is not None else usedmem)
        process['cpu_percent'] = ps_process.cpu_percent()
        # process['cpu_memory_usage'] = "%d MiB" % (
        #     round((ps_process.memory_percent() / 100.0) *
        #           psutil.virtual_memory().total) // MB)
        process['cpu_memory_usage'] = (
            round((ps_process.memory_percent() / 100.0) *
                  psutil.virtual_memory().total) // MB)
        process['pid'] = nv_process.pid
        return process

    name = _decode(N.nvmlDeviceGetName(handle))
    uuid = _decode(N.nvmlDeviceGetUUID(handle))

    try:
        temperature = N.nvmlDeviceGetTemperature(
            handle, N.NVML_TEMPERATURE_GPU
        )
    except N.NVMLError:
        temperature = None  # Not supported

    try:
        fan_speed = N.nvmlDeviceGetFanSpeed(handle)
    except N.NVMLError:
        fan_speed = None  # Not supported

    try:
        memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
    except N.NVMLError:
        memory = None  # Not supported

    try:
        utilization = N.nvmlDeviceGetUtilizationRates(handle)
    except N.NVMLError:
        utilization = None  # Not supported

    try:
        utilization_enc = N.nvmlDeviceGetEncoderUtilization(handle)
    except N.NVMLError:
        utilization_enc = None  # Not supported

    try:
        utilization_dec = N.nvmlDeviceGetDecoderUtilization(handle)
    except N.NVMLError:
        utilization_dec = None  # Not supported

    try:
        power = N.nvmlDeviceGetPowerUsage(handle)
    except N.NVMLError:
        power = None

    try:
        power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
    except N.NVMLError:
        power_limit = None

    try:
        nv_comp_processes = \
            N.nvmlDeviceGetComputeRunningProcesses(handle)
    except N.NVMLError:
        nv_comp_processes = None  # Not supported
    try:
        nv_graphics_processes = \
            N.nvmlDeviceGetGraphicsRunningProcesses(handle)
    except N.NVMLError:
        nv_graphics_processes = None  # Not supported

    if nv_comp_processes is None and nv_graphics_processes is None:
        processes = None
    else:
        processes = []
        nv_comp_processes = nv_comp_processes or []
        nv_graphics_processes = nv_graphics_processes or []

        # A single process might run in both graphics and compute mode;
        # however, we display the process only once.
        seen_pids = set()
        for nv_process in nv_comp_processes + nv_graphics_processes:
            if nv_process.pid in seen_pids:
                continue
            seen_pids.add(nv_process.pid)
            try:
                process = get_process_info(nv_process)
                processes.append(process)
            except psutil.NoSuchProcess:
                # TODO: add some reminder for NVML broken context
                # e.g. nvidia-smi reset or reboot the system
                pass
            except FileNotFoundError:
                # Ignore the exception which probably has occurred
                # from psutil, due to a non-existent PID (see #95).
                # The exception should have been translated, but
                # there appears to be a bug in psutil. It is unlikely
                # FileNotFoundError is thrown in different situations.
                pass

        # TODO: Do not block if full process info is not requested
        time.sleep(0.1)
        for process in processes:
            pid = process['pid']
            cache_process = GPUStatCollection.global_processes[pid]
            try:
                process['cpu_percent'] = cache_process.cpu_percent()
            except psutil.NoSuchProcess:
                process['cpu_percent'] = 0.0
            except FileNotFoundError:
                # Ignore the exception which probably has occurred
                # from psutil, due to a non-existent PID (see #95).
                # The exception should have been translated, but
                # there appears to be a bug in psutil. It is unlikely
                # FileNotFoundError is thrown in different situations.
                process['cpu_percent'] = 0.0

    index = N.nvmlDeviceGetIndex(handle)
    gpu_info = {
        'index': index,
        'uuid': uuid,
        'name': name,
        'temperature.gpu': temperature,
        'fan.speed': fan_speed,
        'utilization.gpu': utilization.gpu if utilization else 0,
        'utilization.enc': utilization_enc[0] if utilization_enc else None,
        'utilization.dec': utilization_dec[0] if utilization_dec else None,
        'power.draw': power // 1000 if power is not None else 0,
        'enforced.power.limit': power_limit // 1000
        if power_limit is not None else 0,
        # Convert bytes into MBytes
        'memory.used': memory.used // MB if memory else 0,
        'memory.total': memory.total // MB if memory else 0,
        'processes': processes,
    }
    GPUStatCollection.clean_processes()
    return gpu_info
def get_gpu_info(handle):
    """Get one GPU information specified by nvml handle"""

    def get_last_used(index):
        last_useds = []
        if not os.path.exists('gpu_history.pkl'):
            pickle.dump({}, open('gpu_history.pkl', 'wb'))
        with open('gpu_history.pkl', 'rb') as f:
            history = pickle.load(f)
            if platform.node() in history:
                for user, last_used in history[
                        platform.node()][index].items():
                    # 1 day = 24 hours, 1 hour = 3600 seconds
                    used_before = \
                        (datetime.now() - last_used['last_used']).days * 24 + \
                        (datetime.now() - last_used['last_used']).seconds / 3600
                    last_useds.append((user, used_before))
                return last_useds
            else:
                return []

    def get_process_info(nv_process):
        """Get the process information of specific pid"""
        process = {}
        if nv_process.pid not in GPUStatCollection.global_processes:
            GPUStatCollection.global_processes[nv_process.pid] = \
                psutil.Process(pid=nv_process.pid)
        ps_process = GPUStatCollection.global_processes[nv_process.pid]
        process['username'] = ps_process.username()
        # cmdline returns full path;
        # as in `ps -o comm`, get short cmdnames.
        _cmdline = ps_process.cmdline()
        if not _cmdline:
            # sometimes, zombie or unknown (e.g. [kworker/8:2H])
            process['command'] = '?'
            process['full_command'] = ['?']
        else:
            process['command'] = os.path.basename(_cmdline[0])
            process['full_command'] = _cmdline
        # Bytes to MBytes
        process['gpu_memory_usage'] = nv_process.usedGpuMemory // MB
        process['cpu_percent'] = ps_process.cpu_percent()
        process['cpu_memory_usage'] = \
            round((ps_process.memory_percent() / 100.0) *
                  psutil.virtual_memory().total)
        process['pid'] = nv_process.pid
        return process

    name = _decode(N.nvmlDeviceGetName(handle))
    uuid = _decode(N.nvmlDeviceGetUUID(handle))

    try:
        temperature = N.nvmlDeviceGetTemperature(
            handle, N.NVML_TEMPERATURE_GPU)
    except N.NVMLError:
        temperature = None  # Not supported

    try:
        fan_speed = N.nvmlDeviceGetFanSpeed(handle)
    except N.NVMLError:
        fan_speed = None  # Not supported

    try:
        memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
    except N.NVMLError:
        memory = None  # Not supported

    try:
        utilization = N.nvmlDeviceGetUtilizationRates(handle)
    except N.NVMLError:
        utilization = None  # Not supported

    try:
        power = N.nvmlDeviceGetPowerUsage(handle)
    except N.NVMLError:
        power = None

    try:
        power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
    except N.NVMLError:
        power_limit = None

    try:
        nv_comp_processes = \
            N.nvmlDeviceGetComputeRunningProcesses(handle)
    except N.NVMLError:
        nv_comp_processes = None  # Not supported
    try:
        nv_graphics_processes = \
            N.nvmlDeviceGetGraphicsRunningProcesses(handle)
    except N.NVMLError:
        nv_graphics_processes = None  # Not supported

    if nv_comp_processes is None and nv_graphics_processes is None:
        processes = None
    else:
        processes = []
        nv_comp_processes = nv_comp_processes or []
        nv_graphics_processes = nv_graphics_processes or []
        for nv_process in nv_comp_processes + nv_graphics_processes:
            try:
                process = get_process_info(nv_process)
                processes.append(process)
            except psutil.NoSuchProcess:
                # TODO: add some reminder for NVML broken context
                # e.g. nvidia-smi reset or reboot the system
                pass

        # TODO: Do not block if full process info is not requested
        time.sleep(0.1)
        for process in processes:
            pid = process['pid']
            cache_process = GPUStatCollection.global_processes[pid]
            process['cpu_percent'] = cache_process.cpu_percent()

    index = N.nvmlDeviceGetIndex(handle)
    last_used = get_last_used(index)
    gpu_info = {
        'index': index,
        'uuid': uuid,
        'name': name,
        'temperature.gpu': temperature,
        'fan.speed': fan_speed,
        'utilization.gpu': utilization.gpu if utilization else None,
        'power.draw': power // 1000 if power is not None else None,
        'enforced.power.limit': power_limit // 1000
        if power_limit is not None else None,
        # Convert bytes into MBytes
        'memory.used': memory.used // MB if memory else None,
        'memory.total': memory.total // MB if memory else None,
        'processes': processes,
        'last_used': last_used,
    }
    GPUStatCollection.clean_processes()
    return gpu_info
def get_gpu_info(handle):
    """Get one GPU information specified by nvml handle"""

    def get_process_info(pid):
        """Get the process information of specific pid"""
        process = {}
        ps_process = psutil.Process(pid=pid)
        process['username'] = ps_process.username()
        # cmdline returns full path; as in `ps -o comm`, get short cmdnames.
        process['command'] = os.path.basename(ps_process.cmdline()[0])
        # Bytes to MBytes
        # (nv_process is resolved from the enclosing loop below)
        process['gpu_memory_usage'] = int(nv_process.usedGpuMemory / 1024 / 1024)
        process['pid'] = nv_process.pid
        return process

    def _decode(b):
        if isinstance(b, bytes):
            return b.decode()  # for python3, to unicode
        return b

    name = _decode(N.nvmlDeviceGetName(handle))
    uuid = _decode(N.nvmlDeviceGetUUID(handle))

    try:
        temperature = N.nvmlDeviceGetTemperature(
            handle, N.NVML_TEMPERATURE_GPU)
    except N.NVMLError:
        temperature = None  # Not supported

    try:
        memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
    except N.NVMLError:
        memory = None  # Not supported

    try:
        utilization = N.nvmlDeviceGetUtilizationRates(handle)
    except N.NVMLError:
        utilization = None  # Not supported

    processes = []
    try:
        nv_comp_processes = N.nvmlDeviceGetComputeRunningProcesses(
            handle)
    except N.NVMLError:
        nv_comp_processes = None  # Not supported
    try:
        nv_graphics_processes = N.nvmlDeviceGetGraphicsRunningProcesses(
            handle)
    except N.NVMLError:
        nv_graphics_processes = None  # Not supported

    if nv_comp_processes is None and nv_graphics_processes is None:
        processes = None  # Not supported (in both cases)
    else:
        nv_comp_processes = nv_comp_processes or []
        nv_graphics_processes = nv_graphics_processes or []
        for nv_process in (nv_comp_processes + nv_graphics_processes):
            # TODO: could be more information such as system memory usage,
            # CPU percentage, create time etc.
            try:
                process = get_process_info(nv_process.pid)
                processes.append(process)
            except psutil.NoSuchProcess:
                # TODO: add some reminder for NVML broken context
                # e.g. nvidia-smi reset or reboot the system
                pass

    # device index reported by NVML for this handle
    index = N.nvmlDeviceGetIndex(handle)

    gpu_info = {
        'index': index,
        'uuid': uuid,
        'name': name,
        'temperature.gpu': temperature,
        'utilization.gpu': utilization.gpu if utilization else None,
        # Convert bytes into MBytes
        'memory.used': int(memory.used / 1024 / 1024) if memory else None,
        'memory.total': int(memory.total / 1024 / 1024) if memory else None,
        'processes': processes,
    }
    return gpu_info
def get_uuid(self):
    return pynvml.nvmlDeviceGetUUID(self.handle)
def uuid(self):
    """
    NVIDIA device UUID
    """
    return nv.nvmlDeviceGetUUID(self._handle).decode()
def get_gpu_info(handle):
    """Get one GPU information specified by nvml handle"""

    def get_process_info(nv_process):
        """Get the process information of specific pid"""
        process = {}
        ps_process = psutil.Process(pid=nv_process.pid)
        process['username'] = ps_process.username()
        # cmdline returns full path;
        # as in `ps -o comm`, get short cmdnames.
        _cmdline = ps_process.cmdline()
        if not _cmdline:
            # sometimes, zombie or unknown (e.g. [kworker/8:2H])
            process['command'] = '?'
        else:
            process['command'] = os.path.basename(_cmdline[0])
        # Bytes to MBytes
        process['gpu_memory_usage'] = nv_process.usedGpuMemory // MB
        process['pid'] = nv_process.pid
        return process

    name = _decode(N.nvmlDeviceGetName(handle))
    uuid = _decode(N.nvmlDeviceGetUUID(handle))

    try:
        temperature = N.nvmlDeviceGetTemperature(
            handle, N.NVML_TEMPERATURE_GPU
        )
    except N.NVMLError:
        temperature = None  # Not supported

    try:
        memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
    except N.NVMLError:
        memory = None  # Not supported

    try:
        utilization = N.nvmlDeviceGetUtilizationRates(handle)
    except N.NVMLError:
        utilization = None  # Not supported

    try:
        power = N.nvmlDeviceGetPowerUsage(handle)
    except N.NVMLError:
        power = None

    try:
        power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
    except N.NVMLError:
        power_limit = None

    try:
        nv_comp_processes = \
            N.nvmlDeviceGetComputeRunningProcesses(handle)
    except N.NVMLError:
        nv_comp_processes = None  # Not supported
    try:
        nv_graphics_processes = \
            N.nvmlDeviceGetGraphicsRunningProcesses(handle)
    except N.NVMLError:
        nv_graphics_processes = None  # Not supported

    if nv_comp_processes is None and nv_graphics_processes is None:
        processes = None
    else:
        processes = []
        nv_comp_processes = nv_comp_processes or []
        nv_graphics_processes = nv_graphics_processes or []
        for nv_process in nv_comp_processes + nv_graphics_processes:
            # TODO: could be more information such as system memory
            # usage, CPU percentage, create time etc.
            try:
                process = get_process_info(nv_process)
                processes.append(process)
            except psutil.NoSuchProcess:
                # TODO: add some reminder for NVML broken context
                # e.g. nvidia-smi reset or reboot the system
                pass

    index = N.nvmlDeviceGetIndex(handle)
    gpu_info = {
        'index': index,
        'uuid': uuid,
        'name': name,
        'temperature.gpu': temperature,
        'utilization.gpu': utilization.gpu if utilization else None,
        'power.draw': power // 1000 if power is not None else None,
        'enforced.power.limit': power_limit // 1000
        if power_limit is not None else None,
        # Convert bytes into MBytes
        'memory.used': memory.used // MB if memory else None,
        'memory.total': memory.total // MB if memory else None,
        'processes': processes,
    }
    return gpu_info
for i in range(0, deviceCount):
    tmp_dict = {}
    handle = pn.nvmlDeviceGetHandleByIndex(i)  # get the handle for the GPU at this index
    pciInfo = pn.nvmlDeviceGetPciInfo(handle)  # retrieve the PCI attributes of this device
    gpu_id = pciInfo.busId
    product_name = pn.nvmlDeviceGetName(handle)  # retrieve the device name
    try:
        # retrieve the persistence mode associated with this device, 0: disabled
        mode = pn.nvmlDeviceGetPersistenceMode(handle)
    except pn.NVMLError as err:
        mode = 'NA'
    try:
        # retrieve the current driver model
        Current_driver_model = pn.nvmlDeviceGetCurrentDriverModel(handle)
    except pn.NVMLError as err:
        Current_driver_model = 'NA'
    try:
        # retrieve the globally unique, immutable UUID associated with this device
        uuid = pn.nvmlDeviceGetUUID(handle)
    except pn.NVMLError as err:
        uuid = 'NA'
    pci_device_id = pciInfo.pciDeviceId  # the PCI device id of this GPU
    pci_bus_id = pciInfo.busId  # bus id
    try:
        # maximum PCIe link width possible for this device and system
        width = pn.nvmlDeviceGetMaxPcieLinkWidth(handle)
    except pn.NVMLError as err:
        width = 'NA'
    try:
        # retrieve the amount of used, free and total memory on the device
        memInfo = pn.nvmlDeviceGetMemoryInfo(handle)
        mem_total = str(memInfo.total // 1024 // 1024) + ' MB'
        mem_used = str(memInfo.used // 1024 // 1024) + ' MB'
        mem_free = str(memInfo.free // 1024 // 1024) + ' MB'
    except pn.NVMLError as err:
        mem_total = 'NA'
def get_gpu_info(handle):
    """Get one GPU information specified by nvml handle"""

    def get_process_info(pid):
        """Get the process information of specific pid"""
        process = {}
        ps_process = psutil.Process(pid=pid)
        process['username'] = ps_process.username()
        # cmdline returns full path; as in `ps -o comm`, get short cmdnames.
        _cmdline = ps_process.cmdline()
        if not _cmdline:
            # sometimes, zombie or unknown (e.g. [kworker/8:2H])
            process['command'] = '?'
        else:
            process['command'] = os.path.basename(_cmdline[0])
        # Bytes to MBytes
        # (nv_process is resolved from the enclosing loop below)
        process['gpu_memory_usage'] = int(nv_process.usedGpuMemory / 1024 / 1024)
        process['pid'] = nv_process.pid
        return process

    def _decode(b):
        if isinstance(b, bytes):
            return b.decode()  # for python3, to unicode
        return b

    name = _decode(N.nvmlDeviceGetName(handle))
    uuid = _decode(N.nvmlDeviceGetUUID(handle))

    try:
        minor = int(N.nvmlDeviceGetMinorNumber(handle))
    except N.NVMLError:
        minor = None  # Not supported

    try:
        bus_id = _decode(N.nvmlDeviceGetPciInfo(handle).busId)
    except N.NVMLError:
        bus_id = None  # Not supported

    try:
        serial = _decode(N.nvmlDeviceGetSerial(handle))
    except N.NVMLError:
        serial = None  # Not supported

    try:
        temperature = N.nvmlDeviceGetTemperature(handle,
                                                 N.NVML_TEMPERATURE_GPU)
    except N.NVMLError:
        temperature = None  # Not supported

    try:
        memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
    except N.NVMLError:
        memory = None  # Not supported

    try:
        utilization = N.nvmlDeviceGetUtilizationRates(handle)
    except N.NVMLError:
        utilization = None  # Not supported

    try:
        power = N.nvmlDeviceGetPowerUsage(handle)
    except (N.NVMLError, N.NVMLError_NotSupported):
        power = None

    try:
        power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
    except (N.NVMLError, N.NVMLError_NotSupported):
        power_limit = None

    processes = []
    try:
        nv_comp_processes = N.nvmlDeviceGetComputeRunningProcesses(handle)
    except N.NVMLError:
        nv_comp_processes = None  # Not supported
    try:
        nv_graphics_processes = N.nvmlDeviceGetGraphicsRunningProcesses(
            handle)
    except N.NVMLError:
        nv_graphics_processes = None  # Not supported

    if nv_comp_processes is None and nv_graphics_processes is None:
        processes = None  # Not supported (in both cases)
    else:
        nv_comp_processes = nv_comp_processes or []
        nv_graphics_processes = nv_graphics_processes or []
        for nv_process in (nv_comp_processes + nv_graphics_processes):
            # TODO: could be more information such as system memory usage,
            # CPU percentage, create time etc.
            try:
                process = get_process_info(nv_process.pid)
                processes.append(process)
            except psutil.NoSuchProcess:
                # TODO: add some reminder for NVML broken context
                # e.g. nvidia-smi reset or reboot the system
                pass

    # device index reported by NVML for this handle
    index = N.nvmlDeviceGetIndex(handle)

    gpu_info = {
        'index': index,
        'uuid': uuid,
        'name': name,
        'minor': minor,
        'bus_id': bus_id,
        'serial': serial,
        'temperature_gpu': temperature,
        'utilization_gpu': utilization.gpu if utilization else None,
        'power_draw': int(power / 1000) if power is not None else None,
        'power_limit': int(power_limit / 1000)
        if power_limit is not None else None,
        'memory_free': int(memory.free) if memory else None,
        'memory_used': int(memory.used) if memory else None,
        'memory_total': int(memory.total) if memory else None,
        'memory_utilization': utilization.memory if utilization else None,
        'processes': processes,
    }
    return gpu_info
def get_gpu_info(handle):
    """Get one GPU information specified by nvml handle"""

    def get_process_info(pid):
        """Get the process information of specific pid"""
        process = {}
        ps_process = psutil.Process(pid=pid)
        process['username'] = ps_process.username()
        # cmdline returns full path; as in `ps -o comm`, get short cmdnames.
        _cmdline = ps_process.cmdline()
        if not _cmdline:
            # sometimes, zombie or unknown (e.g. [kworker/8:2H])
            process['command'] = '?'
        else:
            process['command'] = os.path.basename(_cmdline[0])
        # Bytes to MBytes
        # (nv_process is resolved from the enclosing loop below)
        process['gpu_memory_usage'] = int(nv_process.usedGpuMemory / 1024 / 1024)
        process['pid'] = nv_process.pid
        # For docker: look up the container the process belongs to
        cmd = 'cat /proc/{}/cgroup'.format(nv_process.pid)
        ret = subprocess.check_output(cmd.split())
        container_id = str(ret).split('/')[2][:12]
        process['container_id'] = container_id
        cmd = 'docker ps -a'
        ret = subprocess.check_output(cmd.split())
        docker_data = str(ret).split('\\n')[1:-1]
        for personal in docker_data:
            personal_data = personal.split()
            if container_id == personal_data[0]:
                process['container_user_name'] = personal_data[-1]
        return process

    def _decode(b):
        if isinstance(b, bytes):
            return b.decode()  # for python3, to unicode
        return b

    name = _decode(N.nvmlDeviceGetName(handle))
    uuid = _decode(N.nvmlDeviceGetUUID(handle))

    try:
        temperature = N.nvmlDeviceGetTemperature(
            handle, N.NVML_TEMPERATURE_GPU)
    except N.NVMLError:
        temperature = None  # Not supported

    try:
        memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
    except N.NVMLError:
        memory = None  # Not supported

    try:
        utilization = N.nvmlDeviceGetUtilizationRates(handle)
    except N.NVMLError:
        utilization = None  # Not supported

    try:
        power = N.nvmlDeviceGetPowerUsage(handle)
    except N.NVMLError:
        power = None

    try:
        power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
    except N.NVMLError:
        power_limit = None

    processes = []
    try:
        nv_comp_processes = N.nvmlDeviceGetComputeRunningProcesses(
            handle)
    except N.NVMLError:
        nv_comp_processes = None  # Not supported
    try:
        nv_graphics_processes = N.nvmlDeviceGetGraphicsRunningProcesses(
            handle)
    except N.NVMLError:
        nv_graphics_processes = None  # Not supported

    if nv_comp_processes is None and nv_graphics_processes is None:
        processes = None  # Not supported (in both cases)
    else:
        nv_comp_processes = nv_comp_processes or []
        nv_graphics_processes = nv_graphics_processes or []
        for nv_process in (nv_comp_processes + nv_graphics_processes):
            # TODO: could be more information such as system memory usage,
            # CPU percentage, create time etc.
            try:
                process = get_process_info(nv_process.pid)
                processes.append(process)
            except psutil.NoSuchProcess:
                # TODO: add some reminder for NVML broken context
                # e.g. nvidia-smi reset or reboot the system
                pass

    # device index reported by NVML for this handle
    index = N.nvmlDeviceGetIndex(handle)

    gpu_info = {
        'index': index,
        'uuid': uuid,
        'name': name,
        'temperature.gpu': temperature,
        'utilization.gpu': utilization.gpu if utilization else None,
        'power.draw': int(power / 1000) if power is not None else None,
        'enforced.power.limit': int(power_limit / 1000)
        if power_limit is not None else None,
        # Convert bytes into MBytes
        'memory.used': int(memory.used / 1024 / 1024) if memory else None,
        'memory.total': int(memory.total / 1024 / 1024) if memory else None,
        'processes': processes,
    }
    return gpu_info
def get_gpu_info(handle):
    """Get one GPU information specified by nvml handle"""

    def get_process_info(nv_process):
        """Get the process information of specific pid"""
        process = {}
        ps_process = psutil.Process(pid=nv_process.pid)
        process['username'] = ps_process.username()
        # cmdline returns full path;
        # as in `ps -o comm`, get short cmdnames.
        _cmdline = ps_process.cmdline()
        if not _cmdline:
            # sometimes, zombie or unknown (e.g. [kworker/8:2H])
            process['command'] = '?'
        else:
            process['command'] = os.path.basename(_cmdline[0])
        # Bytes to MBytes
        process['gpu_memory_usage'] = nv_process.usedGpuMemory // MB
        process['pid'] = nv_process.pid
        return process

    name = _decode(N.nvmlDeviceGetName(handle))
    uuid = _decode(N.nvmlDeviceGetUUID(handle))

    try:
        temperature = N.nvmlDeviceGetTemperature(
            handle, N.NVML_TEMPERATURE_GPU)
    except N.NVMLError:
        temperature = None  # Not supported

    try:
        fan_speed = N.nvmlDeviceGetFanSpeed(handle)
    except N.NVMLError:
        fan_speed = None  # Not supported

    try:
        memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
    except N.NVMLError:
        memory = None  # Not supported

    try:
        utilization = N.nvmlDeviceGetUtilizationRates(handle)
    except N.NVMLError:
        utilization = None  # Not supported

    try:
        power = N.nvmlDeviceGetPowerUsage(handle)
    except N.NVMLError:
        power = None

    try:
        power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
    except N.NVMLError:
        power_limit = None

    try:
        nv_comp_processes = \
            N.nvmlDeviceGetComputeRunningProcesses(handle)
    except N.NVMLError:
        nv_comp_processes = None  # Not supported
    try:
        nv_graphics_processes = \
            N.nvmlDeviceGetGraphicsRunningProcesses(handle)
    except N.NVMLError:
        nv_graphics_processes = None  # Not supported

    if nv_comp_processes is None and nv_graphics_processes is None:
        processes = None
    else:
        processes = []
        nv_comp_processes = nv_comp_processes or []
        nv_graphics_processes = nv_graphics_processes or []
        for nv_process in nv_comp_processes + nv_graphics_processes:
            # TODO: could be more information such as system memory
            # usage, CPU percentage, create time etc.
            try:
                process = get_process_info(nv_process)
                processes.append(process)
            except psutil.NoSuchProcess:
                # TODO: add some reminder for NVML broken context
                # e.g. nvidia-smi reset or reboot the system
                pass

    index = N.nvmlDeviceGetIndex(handle)
    gpu_info = {
        'index': index,
        'uuid': uuid,
        'name': name,
        'temperature.gpu': temperature,
        'fan.speed': fan_speed,
        'utilization.gpu': utilization.gpu if utilization else None,
        'power.draw': power // 1000 if power is not None else None,
        'enforced.power.limit': power_limit // 1000
        if power_limit is not None else None,
        # Convert bytes into MBytes
        'memory.used': memory.used // MB if memory else None,
        'memory.total': memory.total // MB if memory else None,
        'processes': processes,
    }
    return gpu_info
def _get_device_uuid(gpu):
    return {'uuid': pynvml.nvmlDeviceGetUUID(gpu)}
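A small usage sketch: initialize NVML, collect the UUID dict for every visible device, then shut NVML down:

import pynvml

pynvml.nvmlInit()
try:
    device_uuids = [
        _get_device_uuid(pynvml.nvmlDeviceGetHandleByIndex(i))
        for i in range(pynvml.nvmlDeviceGetCount())
    ]
finally:
    pynvml.nvmlShutdown()
print(device_uuids)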