def _get_clock_info(gpu):
    return {
        'clock_freq_gpu_mhz': pynvml.nvmlDeviceGetClockInfo(gpu, pynvml.NVML_CLOCK_GRAPHICS),
        'clock_freq_sm_mhz': pynvml.nvmlDeviceGetClockInfo(gpu, pynvml.NVML_CLOCK_SM),
        'clock_freq_memory_mhz': pynvml.nvmlDeviceGetClockInfo(gpu, pynvml.NVML_CLOCK_MEM),
        'clock_freq_video_mhz': pynvml.nvmlDeviceGetClockInfo(gpu, pynvml.NVML_CLOCK_VIDEO)
    }
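A minimal usage sketch for _get_clock_info, assuming pynvml is installed and NVML is available; querying the GPU at index 0 is an assumption for illustration.

# Usage sketch (assumes pynvml is importable and at least one GPU is present).
import pynvml

pynvml.nvmlInit()
try:
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)  # hypothetical: first GPU
    print(_get_clock_info(handle))
    # e.g. {'clock_freq_gpu_mhz': 300, 'clock_freq_sm_mhz': 300, ...}
finally:
    pynvml.nvmlShutdown()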
def device_status(device_index):
    handle = nv.nvmlDeviceGetHandleByIndex(device_index)
    device_name = nv.nvmlDeviceGetName(handle)
    device_name = device_name.decode('UTF-8')
    nv_procs = nv.nvmlDeviceGetComputeRunningProcesses(handle)
    utilization = nv.nvmlDeviceGetUtilizationRates(handle).gpu
    clock_mhz = nv.nvmlDeviceGetClockInfo(handle, nv.NVML_CLOCK_SM)
    temperature = nv.nvmlDeviceGetTemperature(handle, nv.NVML_TEMPERATURE_GPU)
    pids = []
    users = []
    dates = []
    cmd = None
    for nv_proc in nv_procs:
        pid = nv_proc.pid
        pids.append(pid)
        try:
            proc = psutil.Process(pid)
            users.append(proc.username())
            dates.append(proc.create_time())
            if cmd is None:
                cmd = parse_cmd_roughly(proc.cmdline())
        except psutil.NoSuchProcess:
            users.append('?')
    return {
        'type': device_name,
        'is_available': len(pids) == 0,
        'pids': ','.join([str(pid) for pid in pids]),
        'users': ','.join(users),
        'running_since': arrow.get(min(dates)).humanize() if len(dates) > 0 else None,
        'utilization': utilization,
        'clock_mhz': clock_mhz,
        'temperature': temperature,
        'cmd': cmd,
    }
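A short usage sketch for device_status, assuming `nv` is the pynvml module and that psutil, arrow, and parse_cmd_roughly are imported as in the surrounding source; looping over nvmlDeviceGetCount() is an assumption.

# Usage sketch (assumes `nv` is pynvml, as used inside device_status).
import pynvml as nv

nv.nvmlInit()
try:
    for idx in range(nv.nvmlDeviceGetCount()):
        status = device_status(idx)
        print(idx, status['type'], status['utilization'], status['clock_mhz'], status['temperature'])
finally:
    nv.nvmlShutdown()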
def _get_data(self):
    data = {}
    if self.deviceCount:
        for i in range(self.deviceCount):
            gpuIdx = str(i)
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            name = pynvml.nvmlDeviceGetName(handle)
            brand = pynvml.nvmlDeviceGetBrand(handle)

            ### Get data ###
            ## Memory usage
            try:
                mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
            except Exception as e:
                self.debug(str(e))
                mem = None
            ## ECC errors
            try:
                eccErrors = {}
                eccCounterType = ['VOLATILE_ECC', 'AGGREGATE_ECC']
                memErrorType = ['ERROR_TYPE_CORRECTED', 'ERROR_TYPE_UNCORRECTED']
                memoryLocationType = ['L1_CACHE', 'L2_CACHE', 'DEVICE_MEMORY', 'REGISTER_FILE', 'TEXTURE_MEMORY']
                for memoryLocation in range(5):
                    # Fresh dicts per location/counter so results are not shared by reference
                    _eccCounter = {}
                    for eccCounter in range(2):
                        _memError = {}
                        for memError in range(2):
                            _memError[memErrorType[memError]] = pynvml.nvmlDeviceGetMemoryErrorCounter(
                                handle, memError, eccCounter, memoryLocation)
                        _eccCounter[eccCounterType[eccCounter]] = _memError
                    eccErrors[memoryLocationType[memoryLocation]] = _eccCounter
            except Exception as e:
                self.debug(str(e))
                eccErrors = None
            ## Temperature
            try:
                temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
            except Exception as e:
                self.debug(str(e))
                temp = None
            ## Fan
            try:
                fanspeed = pynvml.nvmlDeviceGetFanSpeed(handle)
            except Exception as e:
                self.debug(str(e))
                fanspeed = None
            ## GPU and Memory Utilization
            try:
                util = pynvml.nvmlDeviceGetUtilizationRates(handle)
                gpu_util = util.gpu
                mem_util = util.memory
            except Exception as e:
                self.debug(str(e))
                gpu_util = None
                mem_util = None
            ## Encoder Utilization
            try:
                encoder = pynvml.nvmlDeviceGetEncoderUtilization(handle)
                enc_util = encoder[0]
            except Exception as e:
                self.debug(str(e))
                enc_util = None
            ## Decoder Utilization
            try:
                decoder = pynvml.nvmlDeviceGetDecoderUtilization(handle)
                dec_util = decoder[0]
            except Exception as e:
                self.debug(str(e))
                dec_util = None
            ## Clock frequencies
            try:
                clock_core = pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_GRAPHICS)
                clock_sm = pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_SM)
                clock_mem = pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_MEM) * self.nvMemFactor
            except Exception as e:
                self.debug(str(e))
                clock_core = None
                clock_sm = None
                clock_mem = None

            ### Packing data ###
            self.debug("Device", gpuIdx, ":", str(name))
            data["device_name_" + gpuIdx] = name
            self.debug("Brand:", str(brand))
            self.debug(str(name), "Temp :", str(temp))
            data["device_temp_" + gpuIdx] = temp
            self.debug(str(name), "Mem total :", str(mem.total), 'bytes')
            data["device_mem_total_" + gpuIdx] = mem.total
            self.debug(str(name), "Mem used :", str(mem.used), 'bytes')
            data["device_mem_used_" + gpuIdx] = mem.used
            self.debug(str(name), "Mem free :", str(mem.free), 'bytes')
            data["device_mem_free_" + gpuIdx] = mem.free
            self.debug(str(name), "Load GPU :", str(gpu_util), '%')
            data["device_load_gpu_" + gpuIdx] = gpu_util
            self.debug(str(name), "Load MEM :", str(mem_util), '%')
            data["device_load_mem_" + gpuIdx] = mem_util
            self.debug(str(name), "Load ENC :", str(enc_util), '%')
            data["device_load_enc_" + gpuIdx] = enc_util
            self.debug(str(name), "Load DEC :", str(dec_util), '%')
            data["device_load_dec_" + gpuIdx] = dec_util
            self.debug(str(name), "Core clock:", str(clock_core), 'MHz')
            data["device_core_clock_" + gpuIdx] = clock_core
            self.debug(str(name), "SM clock :", str(clock_sm), 'MHz')
            data["device_sm_clock_" + gpuIdx] = clock_sm
            self.debug(str(name), "Mem clock :", str(clock_mem), 'MHz')
            data["device_mem_clock_" + gpuIdx] = clock_mem
            self.debug(str(name), "Fan speed :", str(fanspeed), '%')
            data["device_fanspeed_" + gpuIdx] = fanspeed
            self.debug(str(name), "ECC errors:", str(eccErrors))
            if eccErrors is not None:
                data["device_ecc_errors_L1_CACHE_VOLATILE_CORRECTED_" + gpuIdx] = eccErrors["L1_CACHE"]["VOLATILE_ECC"]["ERROR_TYPE_CORRECTED"]
                data["device_ecc_errors_L1_CACHE_VOLATILE_UNCORRECTED_" + gpuIdx] = eccErrors["L1_CACHE"]["VOLATILE_ECC"]["ERROR_TYPE_UNCORRECTED"]
                data["device_ecc_errors_L1_CACHE_AGGREGATE_CORRECTED_" + gpuIdx] = eccErrors["L1_CACHE"]["AGGREGATE_ECC"]["ERROR_TYPE_CORRECTED"]
                data["device_ecc_errors_L1_CACHE_AGGREGATE_UNCORRECTED_" + gpuIdx] = eccErrors["L1_CACHE"]["AGGREGATE_ECC"]["ERROR_TYPE_UNCORRECTED"]
                data["device_ecc_errors_L2_CACHE_VOLATILE_CORRECTED_" + gpuIdx] = eccErrors["L2_CACHE"]["VOLATILE_ECC"]["ERROR_TYPE_CORRECTED"]
                data["device_ecc_errors_L2_CACHE_VOLATILE_UNCORRECTED_" + gpuIdx] = eccErrors["L2_CACHE"]["VOLATILE_ECC"]["ERROR_TYPE_UNCORRECTED"]
                data["device_ecc_errors_L2_CACHE_AGGREGATE_CORRECTED_" + gpuIdx] = eccErrors["L2_CACHE"]["AGGREGATE_ECC"]["ERROR_TYPE_CORRECTED"]
                data["device_ecc_errors_L2_CACHE_AGGREGATE_UNCORRECTED_" + gpuIdx] = eccErrors["L2_CACHE"]["AGGREGATE_ECC"]["ERROR_TYPE_UNCORRECTED"]
                data["device_ecc_errors_DEVICE_MEMORY_VOLATILE_CORRECTED_" + gpuIdx] = eccErrors["DEVICE_MEMORY"]["VOLATILE_ECC"]["ERROR_TYPE_CORRECTED"]
                data["device_ecc_errors_DEVICE_MEMORY_VOLATILE_UNCORRECTED_" + gpuIdx] = eccErrors["DEVICE_MEMORY"]["VOLATILE_ECC"]["ERROR_TYPE_UNCORRECTED"]
                data["device_ecc_errors_DEVICE_MEMORY_AGGREGATE_CORRECTED_" + gpuIdx] = eccErrors["DEVICE_MEMORY"]["AGGREGATE_ECC"]["ERROR_TYPE_CORRECTED"]
                data["device_ecc_errors_DEVICE_MEMORY_AGGREGATE_UNCORRECTED_" + gpuIdx] = eccErrors["DEVICE_MEMORY"]["AGGREGATE_ECC"]["ERROR_TYPE_UNCORRECTED"]
                data["device_ecc_errors_REGISTER_FILE_VOLATILE_CORRECTED_" + gpuIdx] = eccErrors["REGISTER_FILE"]["VOLATILE_ECC"]["ERROR_TYPE_CORRECTED"]
                data["device_ecc_errors_REGISTER_FILE_VOLATILE_UNCORRECTED_" + gpuIdx] = eccErrors["REGISTER_FILE"]["VOLATILE_ECC"]["ERROR_TYPE_UNCORRECTED"]
                data["device_ecc_errors_REGISTER_FILE_AGGREGATE_CORRECTED_" + gpuIdx] = eccErrors["REGISTER_FILE"]["AGGREGATE_ECC"]["ERROR_TYPE_CORRECTED"]
                data["device_ecc_errors_REGISTER_FILE_AGGREGATE_UNCORRECTED_" + gpuIdx] = eccErrors["REGISTER_FILE"]["AGGREGATE_ECC"]["ERROR_TYPE_UNCORRECTED"]
                data["device_ecc_errors_TEXTURE_MEMORY_VOLATILE_CORRECTED_" + gpuIdx] = eccErrors["TEXTURE_MEMORY"]["VOLATILE_ECC"]["ERROR_TYPE_CORRECTED"]
                data["device_ecc_errors_TEXTURE_MEMORY_VOLATILE_UNCORRECTED_" + gpuIdx] = eccErrors["TEXTURE_MEMORY"]["VOLATILE_ECC"]["ERROR_TYPE_UNCORRECTED"]
                data["device_ecc_errors_TEXTURE_MEMORY_AGGREGATE_CORRECTED_" + gpuIdx] = eccErrors["TEXTURE_MEMORY"]["AGGREGATE_ECC"]["ERROR_TYPE_CORRECTED"]
                data["device_ecc_errors_TEXTURE_MEMORY_AGGREGATE_UNCORRECTED_" + gpuIdx] = eccErrors["TEXTURE_MEMORY"]["AGGREGATE_ECC"]["ERROR_TYPE_UNCORRECTED"]
            else:
                data["device_ecc_errors_L1_CACHE_VOLATILE_CORRECTED_" + gpuIdx] = None

    ## Get unit (S-class Nvidia cards) data
    if self.unitCount:
        for i in range(self.unitCount):
            gpuIdx = str(i)
            handle = pynvml.nvmlUnitGetHandleByIndex(i)

            try:
                fan = pynvml.nvmlUnitGetFanSpeedInfo(handle)
                fan_speed = fan.speed  # Fan speed (RPM)
                fan_state = fan.state  # Flag that indicates whether fan is working properly
            except Exception as e:
                self.debug(str(e))
                fan_speed = None
                fan_state = None
            try:
                psu = pynvml.nvmlUnitGetPsuInfo(handle)
                psu_current = psu.current  # PSU current (A)
                psu_power = psu.power  # PSU power draw (W)
                psu_state = psu.state  # The power supply state
                psu_voltage = psu.voltage  # PSU voltage (V)
            except Exception as e:
                self.debug(str(e))
                psu_current = None
                psu_power = None
                psu_state = None
                psu_voltage = None
            try:
                temp_intake = pynvml.nvmlUnitGetTemperature(handle, 0)  # Temperature at intake in C
                temp_exhaust = pynvml.nvmlUnitGetTemperature(handle, 1)  # Temperature at exhaust in C
                temp_board = pynvml.nvmlUnitGetTemperature(handle, 2)  # Temperature on board in C
            except Exception as e:
                self.debug(str(e))
                temp_intake = None
                temp_exhaust = None
                temp_board = None

            self.debug('Unit fan speed:', str(fan_speed))
            data["unit_fan_speed_" + gpuIdx] = fan_speed
            self.debug('Unit fan state:', str(fan_state))
            data["unit_fan_state_" + gpuIdx] = fan_state
            self.debug('Unit PSU current:', str(psu_current))
            data["unit_psu_current_" + gpuIdx] = psu_current
            self.debug('Unit PSU power:', str(psu_power))
            data["unit_psu_power_" + gpuIdx] = psu_power
            self.debug('Unit PSU state:', str(psu_state))
            data["unit_psu_state_" + gpuIdx] = psu_state
            self.debug('Unit PSU voltage:', str(psu_voltage))
            data["unit_psu_voltage_" + gpuIdx] = psu_voltage
            self.debug('Unit temp intake:', str(temp_intake))
            data["unit_temp_intake_" + gpuIdx] = temp_intake
            self.debug('Unit temp exhaust:', str(temp_exhaust))
            data["unit_temp_exhaust_" + gpuIdx] = temp_exhaust
            self.debug('Unit temp board:', str(temp_board))
            data["unit_temp_board_" + gpuIdx] = temp_board

    ## Get data via legacy mode
    if self.legacy:
        try:
            output, error = Popen(
                [
                    "nvidia-settings", "-c", ":0",
                    "-q", "GPUUtilization",
                    "-q", "GPUCurrentClockFreqs",
                    "-q", "GPUCoreTemp",
                    "-q", "TotalDedicatedGPUMemory",
                    "-q", "UsedDedicatedGPUMemory"
                ],
                shell=False, stdout=PIPE, stderr=PIPE).communicate()
            output = repr(str(output))
            if len(output) < 800:
                raise Exception('Error in fetching data from nvidia-settings ' + output)
            self.debug(str(error), output)
        except Exception as e:
            self.error(str(e))
            self.error('Setting legacy mode to False')
            self.legacy = False
            return data
        for i in range(self.deviceCount):
            gpuIdx = str(i)
            if data["device_temp_" + gpuIdx] is None:
                coreTemp = findall(r'GPUCoreTemp.*?(gpu:\d*).*?\s(\d*)', output)[i][1]
                try:
                    data["device_temp_" + gpuIdx] = int(coreTemp)
                    self.debug('Using legacy temp for GPU {0}: {1}'.format(gpuIdx, coreTemp))
                except Exception as e:
                    self.debug(str(e), "skipping device_temp_" + gpuIdx)
            if data["device_mem_used_" + gpuIdx] is None:
                memUsed = findall(r'UsedDedicatedGPUMemory.*?(gpu:\d*).*?\s(\d*)', output)[i][1]
                try:
                    data["device_mem_used_" + gpuIdx] = int(memUsed)
                    self.debug('Using legacy mem_used for GPU {0}: {1}'.format(gpuIdx, memUsed))
                except Exception as e:
                    self.debug(str(e), "skipping device_mem_used_" + gpuIdx)
            if data["device_load_gpu_" + gpuIdx] is None:
                gpu_util = findall(r'(gpu:\d*).*?graphics=(\d*),.*?memory=(\d*)', output)[i][1]
                try:
                    data["device_load_gpu_" + gpuIdx] = int(gpu_util)
                    self.debug('Using legacy load_gpu for GPU {0}: {1}'.format(gpuIdx, gpu_util))
                except Exception as e:
                    self.debug(str(e), "skipping device_load_gpu_" + gpuIdx)
            if data["device_load_mem_" + gpuIdx] is None:
                mem_util = findall(r'(gpu:\d*).*?graphics=(\d*),.*?memory=(\d*)', output)[i][2]
                try:
                    data["device_load_mem_" + gpuIdx] = int(mem_util)
                    self.debug('Using legacy load_mem for GPU {0}: {1}'.format(gpuIdx, mem_util))
                except Exception as e:
                    self.debug(str(e), "skipping device_load_mem_" + gpuIdx)
            if data["device_core_clock_" + gpuIdx] is None:
                clock_core = findall(r'GPUCurrentClockFreqs.*?(gpu:\d*).*?(\d*),(\d*)', output)[i][1]
                try:
                    data["device_core_clock_" + gpuIdx] = int(clock_core)
                    self.debug('Using legacy core_clock for GPU {0}: {1}'.format(gpuIdx, clock_core))
                except Exception as e:
                    self.debug(str(e), "skipping device_core_clock_" + gpuIdx)
            if data["device_mem_clock_" + gpuIdx] is None:
                clock_mem = findall(r'GPUCurrentClockFreqs.*?(gpu:\d*).*?(\d*),(\d*)', output)[i][2]
                try:
                    data["device_mem_clock_" + gpuIdx] = int(clock_mem)
                    self.debug('Using legacy mem_clock for GPU {0}: {1}'.format(gpuIdx, clock_mem))
                except Exception as e:
                    self.debug(str(e), "skipping device_mem_clock_" + gpuIdx)

    return data
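A hedged illustration of how the legacy findall patterns above are meant to behave. The sample string is invented for this sketch and only approximates nvidia-settings output; the exact format can vary by driver version.

# Hypothetical illustration of the legacy regex parsing; the sample line is an assumption,
# not captured from a real nvidia-settings run.
from re import findall

sample = "Attribute 'GPUCoreTemp' (host:0[gpu:0]): 52. Attribute 'GPUCurrentClockFreqs' (host:0[gpu:0]): 1480,3504."
print(findall(r'GPUCoreTemp.*?(gpu:\d*).*?\s(\d*)', sample))               # [('gpu:0', '52')]
print(findall(r'GPUCurrentClockFreqs.*?(gpu:\d*).*?(\d*),(\d*)', sample))  # [('gpu:0', '1480', '3504')]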
def mem_clock(self):
    """Return the current memory clock frequency in MHz."""
    return pynvml.nvmlDeviceGetClockInfo(self.dev, pynvml.NVML_CLOCK_MEM)
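A minimal sketch of the kind of wrapper class this getter could live in; the class name Device and the attribute self.dev are assumptions, not taken from the source.

# Hypothetical wrapper around an NVML device handle (names are illustrative only).
import pynvml

class Device:
    def __init__(self, index=0):
        self.dev = pynvml.nvmlDeviceGetHandleByIndex(index)

    @property
    def mem_clock(self):
        """Return the current memory clock frequency in MHz."""
        return pynvml.nvmlDeviceGetClockInfo(self.dev, pynvml.NVML_CLOCK_MEM)

pynvml.nvmlInit()
print(Device(0).mem_clock)
pynvml.nvmlShutdown()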
try:
    util = pn.nvmlDeviceGetUtilizationRates(handle)  # current utilization of the device's main subsystems
    gpu_util = str(util.gpu)      # GPU utilization (%)
    mem_util = str(util.memory)   # memory utilization (%)
except pn.NVMLError as err:
    gpu_util = 'NA'
    mem_util = 'NA'
try:
    temp = pn.nvmlDeviceGetTemperature(handle, pn.NVML_TEMPERATURE_GPU)  # current GPU temperature
except pn.NVMLError as err:
    temp = 'NA'
try:
    powMan = pn.nvmlDeviceGetPowerManagementMode(handle)  # current power management mode
except pn.NVMLError as err:
    powMan = 'NA'
try:
    graphics_clock = pn.nvmlDeviceGetClockInfo(handle, pn.NVML_CLOCK_GRAPHICS)  # current graphics clock speed
except pn.NVMLError as err:
    graphics_clock = 'NA'
try:
    mem_clock = pn.nvmlDeviceGetClockInfo(handle, pn.NVML_CLOCK_MEM)  # current memory clock speed
except pn.NVMLError as err:
    mem_clock = 'NA'
try:
    perf_stat = pn.nvmlDeviceGetPowerState(handle)  # current performance state
except pn.NVMLError as err:
    perf_stat = 'NA'
tmp_dict['Gpu_Id'] = gpu_id
tmp_dict['Product_Name'] = product_name
tmp_dict['Mode'] = mode
tmp_dict['Current_Driver_Model'] = Current_driver_model
tmp_dict['Total_Memory'] = mem_total
def get_gpu_info(handle):
    """Get one GPU information specified by nvml handle"""

    def get_process_info(nv_process):
        """Get the process information of specific pid"""
        process = {}
        if nv_process.pid not in GPUStatCollection.global_processes:
            GPUStatCollection.global_processes[nv_process.pid] = \
                psutil.Process(pid=nv_process.pid)
        ps_process = GPUStatCollection.global_processes[nv_process.pid]
        # TODO: ps_process is being cached, but the dict below is not.
        process['username'] = ps_process.username()
        # cmdline returns full path;
        # as in `ps -o comm`, get short cmdnames.
        _cmdline = ps_process.cmdline()
        if not _cmdline:
            # sometimes, zombie or unknown (e.g. [kworker/8:2H])
            process['command'] = '?'
            process['full_command'] = ['?']
        else:
            process['command'] = os.path.basename(_cmdline[0])
            process['full_command'] = _cmdline
        # Bytes to MBytes
        # if drivers are not TTC this will be None.
        usedmem = nv_process.usedGpuMemory // MB if nv_process.usedGpuMemory else None
        process['gpu_memory_usage'] = usedmem
        process['cpu_percent'] = ps_process.cpu_percent()
        process['cpu_memory_usage'] = \
            round((ps_process.memory_percent() / 100.0) * psutil.virtual_memory().total)
        process['pid'] = nv_process.pid
        return process

    name = _decode(N.nvmlDeviceGetName(handle))
    uuid = _decode(N.nvmlDeviceGetUUID(handle))

    try:
        temperature = N.nvmlDeviceGetTemperature(handle, N.NVML_TEMPERATURE_GPU)
    except N.NVMLError:
        temperature = None  # Not supported
    try:
        fan_speed = N.nvmlDeviceGetFanSpeed(handle)
    except N.NVMLError:
        fan_speed = None  # Not supported
    try:
        memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
    except N.NVMLError:
        memory = None  # Not supported
    try:
        utilization = N.nvmlDeviceGetUtilizationRates(handle)
    except N.NVMLError:
        utilization = None  # Not supported
    try:
        utilization_enc = N.nvmlDeviceGetEncoderUtilization(handle)
    except N.NVMLError:
        utilization_enc = None  # Not supported
    try:
        utilization_dec = N.nvmlDeviceGetDecoderUtilization(handle)
    except N.NVMLError:
        utilization_dec = None  # Not supported
    try:
        power = N.nvmlDeviceGetPowerUsage(handle)
    except N.NVMLError:
        power = None
    try:
        power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
    except N.NVMLError:
        power_limit = None
    try:
        mem_clock = N.nvmlDeviceGetClockInfo(handle, 2)  # 2 == NVML_CLOCK_MEM
    except N.NVMLError:
        mem_clock = None
    try:
        core_clock = N.nvmlDeviceGetClockInfo(handle, 1)  # 1 == NVML_CLOCK_SM
    except N.NVMLError:
        core_clock = None
    try:
        nv_comp_processes = N.nvmlDeviceGetComputeRunningProcesses(handle)
    except N.NVMLError:
        nv_comp_processes = None  # Not supported
    try:
        nv_graphics_processes = N.nvmlDeviceGetGraphicsRunningProcesses(handle)
    except N.NVMLError:
        nv_graphics_processes = None  # Not supported

    if nv_comp_processes is None and nv_graphics_processes is None:
        processes = None
    else:
        processes = []
        nv_comp_processes = nv_comp_processes or []
        nv_graphics_processes = nv_graphics_processes or []
        # A single process might run in both of graphics and compute mode,
        # however we will display the process only once
        seen_pids = set()
        for nv_process in nv_comp_processes + nv_graphics_processes:
            if nv_process.pid in seen_pids:
                continue
            seen_pids.add(nv_process.pid)
            try:
                process = get_process_info(nv_process)
                processes.append(process)
            except psutil.NoSuchProcess:
                # TODO: add some reminder for NVML broken context
                # e.g. nvidia-smi reset or reboot the system
                pass
            except FileNotFoundError:
                # Ignore the exception which probably has occurred
                # from psutil, due to a non-existent PID (see #95).
                # The exception should have been translated, but
                # there appears to be a bug of psutil. It is unlikely
                # FileNotFoundError is thrown in different situations.
                pass

        # TODO: Do not block if full process info is not requested
        time.sleep(0.1)
        for process in processes:
            pid = process['pid']
            cache_process = GPUStatCollection.global_processes[pid]
            process['cpu_percent'] = cache_process.cpu_percent()

    index = N.nvmlDeviceGetIndex(handle)
    gpu_info = {
        'index': index,
        'uuid': uuid,
        'name': name,
        'temperature.gpu': temperature,
        'fan.speed': fan_speed,
        'mem.clock': mem_clock,
        'core.clock': core_clock,
        'utilization.gpu': utilization.gpu if utilization else None,
        'utilization.enc': utilization_enc[0] if utilization_enc else None,
        'utilization.dec': utilization_dec[0] if utilization_dec else None,
        'power.draw': power // 1000 if power is not None else None,
        'enforced.power.limit': power_limit // 1000 if power_limit is not None else None,
        # Convert bytes into MBytes
        'memory.used': memory.used // MB if memory else None,
        'memory.total': memory.total // MB if memory else None,
        'processes': processes,
    }
    GPUStatCollection.clean_processes()
    return gpu_info
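A usage sketch for get_gpu_info, assuming the module-level names used above (N as the pynvml alias, MB, _decode, and GPUStatCollection) are defined as in the surrounding source; the init/shutdown wrapper and the printed fields are illustrative.

# Usage sketch (assumes N = pynvml and the helpers referenced above exist in the module).
import pynvml as N

N.nvmlInit()
try:
    for i in range(N.nvmlDeviceGetCount()):
        info = get_gpu_info(N.nvmlDeviceGetHandleByIndex(i))
        print(info['index'], info['name'], info['core.clock'], 'MHz /', info['mem.clock'], 'MHz')
finally:
    N.nvmlShutdown()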
for i in range(nvmlDeviceGetCount()):
    info = defaultdict()
    driver_version = nvmlSystemGetDriverVersion().decode()
    info['Driver Version'] = driver_version
    info['GPU idx'] = i
    handle = nvmlDeviceGetHandleByIndex(i)
    # device name
    device_name = nvmlDeviceGetName(handle).decode()
    info['GPU Name'] = device_name
    # clock
    clk = nvmlDeviceGetClockInfo(handle, 0)        # 0 == NVML_CLOCK_GRAPHICS
    max_clk = nvmlDeviceGetMaxClockInfo(handle, 0)
    clk_rate = int(clk / max_clk * 100)
    msg = pack_msg([clk, max_clk], 'MHz')
    info['GPU Clock'] = getBar(clk_rate, msg)
    # utilize
    util = nvmlDeviceGetUtilizationRates(handle)
    # memory
    mem_info = nvmlDeviceGetMemoryInfo(handle)
    mem_used = toMiB(mem_info.used)
    # mem_free = toMiB(mem_info.free)
    mem_total = toMiB(mem_info.total)
    info['GPU Util'] = getBar(util.gpu)
    mem_rate = int(mem_used / mem_total * 100)
    msg = pack_msg([mem_used, mem_total], 'MiB')
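The loop above relies on helpers that are not shown in this excerpt. Below is a hedged sketch of plausible stand-ins for toMiB, pack_msg, and getBar that match how they are called; the real definitions may differ.

# Hypothetical helper definitions (assumptions; only meant to make the snippet above readable).
def toMiB(num_bytes):
    # bytes -> whole MiB
    return int(num_bytes) // (1024 * 1024)

def pack_msg(values, unit):
    # e.g. pack_msg([1480, 1911], 'MHz') -> '1480 / 1911 MHz'
    return ' / '.join(str(v) for v in values) + ' ' + unit

def getBar(percent, msg='', width=20):
    # simple text progress bar, e.g. '[#####...............]  25% 1480 / 1911 MHz'
    filled = int(width * percent / 100)
    return '[' + '#' * filled + '.' * (width - filled) + '] {:3d}% {}'.format(int(percent), msg)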