def collect_metrics(self):
    """Collect NVIDIA GPU metrics (e.g. temperature, power consumption, fan speed)."""
    data_list = []
    for gpu_num in range(nvmlDeviceGetCount()):
        handle = nvmlDeviceGetHandleByIndex(gpu_num)
        device_name = DEVICE_NAME_FORMAT % gpu_num
        power_usage = float(nvmlDeviceGetPowerUsage(handle)) / 1000.0  # mW -> W
        fan_speed = nvmlDeviceGetFanSpeed(handle)
        temperature = nvmlDeviceGetTemperature(handle, NVML_TEMPERATURE_GPU)
        data_list.append({
            'measurement': device_name,
            'tags': {
                'host': 'minar',
                'gpu': device_name
            },
            'fields': {
                'power_usage': power_usage,
                'fan_speed': fan_speed,
                'temperature': temperature
            }
        })
    time.sleep(PERIOD_SECS)
    return data_list
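# A minimal sketch of the module-level setup collect_metrics() assumes. The
# constant values below and the nvmlInit() call are assumptions inferred from
# how the method uses them, not part of the original class.
import time
from pynvml import (nvmlInit, nvmlDeviceGetCount, nvmlDeviceGetHandleByIndex,
                    nvmlDeviceGetPowerUsage, nvmlDeviceGetFanSpeed,
                    nvmlDeviceGetTemperature, NVML_TEMPERATURE_GPU)

DEVICE_NAME_FORMAT = 'gpu_%d'   # assumed format string for the measurement name
PERIOD_SECS = 10                # assumed collection interval in seconds

nvmlInit()  # NVML must be initialised before any nvmlDevice* call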
def get_fan_speed(handle):
    fan = -1
    try:
        fan = pynvml.nvmlDeviceGetFanSpeed(handle)
    except Exception:
        pass
    return fan
def get_gpu_info(handle):
    """Get one GPU information specified by nvml handle"""

    def get_process_info(nv_process):
        """Get the process information of specific pid"""
        process = {}
        if nv_process.pid not in GPUStatCollection.global_processes:
            GPUStatCollection.global_processes[nv_process.pid] = \
                psutil.Process(pid=nv_process.pid)
        ps_process = GPUStatCollection.global_processes[nv_process.pid]
        process['username'] = ps_process.username()
        # cmdline returns full path;
        # as in `ps -o comm`, get short cmdnames.
        _cmdline = ps_process.cmdline()
        if not _cmdline:
            # sometimes, zombie or unknown (e.g. [kworker/8:2H])
            process['command'] = '?'
            process['full_command'] = ['?']
        else:
            process['command'] = os.path.basename(_cmdline[0])
            process['full_command'] = _cmdline
        # Bytes to MBytes
        # if drivers are not TTC this will be None.
        usedmem = nv_process.usedGpuMemory // MB if \
            nv_process.usedGpuMemory else None
        process['gpu_memory_usage'] = usedmem
        process['cpu_percent'] = ps_process.cpu_percent()
        process['cpu_memory_usage'] = \
            round((ps_process.memory_percent() / 100.0) *
                  psutil.virtual_memory().total)
        process['pid'] = nv_process.pid
        return process

    name = _decode(N.nvmlDeviceGetName(handle))
    uuid = _decode(N.nvmlDeviceGetUUID(handle))

    try:
        temperature = N.nvmlDeviceGetTemperature(
            handle, N.NVML_TEMPERATURE_GPU)
    except N.NVMLError:
        temperature = None  # Not supported

    try:
        fan_speed = N.nvmlDeviceGetFanSpeed(handle)
    except N.NVMLError:
        fan_speed = None  # Not supported

    try:
        memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
    except N.NVMLError:
        memory = None  # Not supported

    try:
        utilization = N.nvmlDeviceGetUtilizationRates(handle)
    except N.NVMLError:
        utilization = None  # Not supported

    try:
        power = N.nvmlDeviceGetPowerUsage(handle)
    except N.NVMLError:
        power = None

    try:
        power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
    except N.NVMLError:
        power_limit = None

    try:
        nv_comp_processes = \
            N.nvmlDeviceGetComputeRunningProcesses(handle)
    except N.NVMLError:
        nv_comp_processes = None  # Not supported
    try:
        nv_graphics_processes = \
            N.nvmlDeviceGetGraphicsRunningProcesses(handle)
    except N.NVMLError:
        nv_graphics_processes = None  # Not supported

    if nv_comp_processes is None and nv_graphics_processes is None:
        processes = None
    else:
        processes = []
        nv_comp_processes = nv_comp_processes or []
        nv_graphics_processes = nv_graphics_processes or []
        for nv_process in nv_comp_processes + nv_graphics_processes:
            try:
                process = get_process_info(nv_process)
                processes.append(process)
            except psutil.NoSuchProcess:
                # TODO: add some reminder for NVML broken context
                # e.g. nvidia-smi reset or reboot the system
                pass

        # TODO: Do not block if full process info is not requested
        time.sleep(0.1)
        for process in processes:
            pid = process['pid']
            cache_process = GPUStatCollection.global_processes[pid]
            process['cpu_percent'] = cache_process.cpu_percent()

    index = N.nvmlDeviceGetIndex(handle)
    gpu_info = {
        'index': index,
        'uuid': uuid,
        'name': name,
        'temperature.gpu': temperature,
        'fan.speed': fan_speed,
        'utilization.gpu': utilization.gpu if utilization else None,
        'power.draw': power // 1000 if power is not None else None,
        'enforced.power.limit': power_limit // 1000
        if power_limit is not None else None,
        # Convert bytes into MBytes
        'memory.used': memory.used // MB if memory else None,
        'memory.total': memory.total // MB if memory else None,
        'processes': processes,
    }
    GPUStatCollection.clean_processes()
    return gpu_info
def get_gpu_info(handle):
    """Get one GPU information specified by nvml handle"""

    def get_process_info(nv_process):
        """Get the process information of specific pid"""
        process = {}
        if nv_process.pid not in GPUStatCollection.global_processes:
            GPUStatCollection.global_processes[nv_process.pid] = \
                psutil.Process(pid=nv_process.pid)
        ps_process = GPUStatCollection.global_processes[nv_process.pid]
        # TODO: ps_process is being cached, but the dict below is not.
        process['username'] = ps_process.username()
        # cmdline returns full path;
        # as in `ps -o comm`, get short cmdnames.
        _cmdline = ps_process.cmdline()
        if not _cmdline:
            # sometimes, zombie or unknown (e.g. [kworker/8:2H])
            process['command'] = '?'
            process['full_command'] = ['?']
        else:
            process['command'] = os.path.basename(_cmdline[0])
            process['full_command'] = _cmdline
        # Bytes to MBytes
        # if drivers are not TTC this will be None.
        usedmem = (nv_process.usedGpuMemory // MB
                   if nv_process.usedGpuMemory else None)
        process['gpu_memory_usage'] = usedmem
        # process['gpu_memory_usage'] = ("%d MiB" % usedmem
        #                                if usedmem is not None else usedmem)
        process['cpu_percent'] = ps_process.cpu_percent()
        # process['cpu_memory_usage'] = "%d MiB" % (
        #     round((ps_process.memory_percent() / 100.0) *
        #           psutil.virtual_memory().total) // MB)
        process['cpu_memory_usage'] = (
            round((ps_process.memory_percent() / 100.0) *
                  psutil.virtual_memory().total) // MB)
        process['pid'] = nv_process.pid
        return process

    name = _decode(N.nvmlDeviceGetName(handle))
    uuid = _decode(N.nvmlDeviceGetUUID(handle))

    try:
        temperature = N.nvmlDeviceGetTemperature(
            handle, N.NVML_TEMPERATURE_GPU)
    except N.NVMLError:
        temperature = None  # Not supported

    try:
        fan_speed = N.nvmlDeviceGetFanSpeed(handle)
    except N.NVMLError:
        fan_speed = None  # Not supported

    try:
        memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
    except N.NVMLError:
        memory = None  # Not supported

    try:
        utilization = N.nvmlDeviceGetUtilizationRates(handle)
    except N.NVMLError:
        utilization = None  # Not supported

    try:
        utilization_enc = N.nvmlDeviceGetEncoderUtilization(handle)
    except N.NVMLError:
        utilization_enc = None  # Not supported

    try:
        utilization_dec = N.nvmlDeviceGetDecoderUtilization(handle)
    except N.NVMLError:
        utilization_dec = None  # Not supported

    try:
        power = N.nvmlDeviceGetPowerUsage(handle)
    except N.NVMLError:
        power = None

    try:
        power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
    except N.NVMLError:
        power_limit = None

    try:
        nv_comp_processes = \
            N.nvmlDeviceGetComputeRunningProcesses(handle)
    except N.NVMLError:
        nv_comp_processes = None  # Not supported
    try:
        nv_graphics_processes = \
            N.nvmlDeviceGetGraphicsRunningProcesses(handle)
    except N.NVMLError:
        nv_graphics_processes = None  # Not supported

    if nv_comp_processes is None and nv_graphics_processes is None:
        processes = None
    else:
        processes = []
        nv_comp_processes = nv_comp_processes or []
        nv_graphics_processes = nv_graphics_processes or []
        # A single process might run in both graphics and compute mode;
        # however, we will display the process only once.
        seen_pids = set()
        for nv_process in nv_comp_processes + nv_graphics_processes:
            if nv_process.pid in seen_pids:
                continue
            seen_pids.add(nv_process.pid)
            try:
                process = get_process_info(nv_process)
                processes.append(process)
            except psutil.NoSuchProcess:
                # TODO: add some reminder for NVML broken context
                # e.g. nvidia-smi reset or reboot the system
                pass
            except FileNotFoundError:
                # Ignore the exception which probably has occurred
                # from psutil, due to a non-existent PID (see #95).
                # The exception should have been translated, but
                # there appears to be a bug of psutil. It is unlikely
                # FileNotFoundError is thrown in different situations.
                pass

        # TODO: Do not block if full process info is not requested
        time.sleep(0.1)
        for process in processes:
            pid = process['pid']
            cache_process = GPUStatCollection.global_processes[pid]
            try:
                process['cpu_percent'] = cache_process.cpu_percent()
            except psutil.NoSuchProcess:
                process['cpu_percent'] = 0.0
            except FileNotFoundError:
                # Ignore the exception which probably has occurred
                # from psutil, due to a non-existent PID (see #95).
                # The exception should have been translated, but
                # there appears to be a bug of psutil. It is unlikely
                # FileNotFoundError is thrown in different situations.
                process['cpu_percent'] = 0.0

    index = N.nvmlDeviceGetIndex(handle)
    gpu_info = {
        'index': index,
        'uuid': uuid,
        'name': name,
        'temperature.gpu': temperature,
        'fan.speed': fan_speed,
        'utilization.gpu': utilization.gpu if utilization else 0,
        'utilization.enc': utilization_enc[0] if utilization_enc else None,
        'utilization.dec': utilization_dec[0] if utilization_dec else None,
        'power.draw': power // 1000 if power is not None else 0,
        'enforced.power.limit': power_limit // 1000
        if power_limit is not None else 0,
        # Convert bytes into MBytes
        'memory.used': memory.used // MB if memory else 0,
        'memory.total': memory.total // MB if memory else 0,
        'processes': processes,
    }
    GPUStatCollection.clean_processes()
    return gpu_info
def get_infos():
    """Get all information about all your graphics cards.

    Returns:
        dict: The returned result is a dict with 3 keys:
            count, driver_version and devices:
            count: Number of GPUs found
            driver_version: The version of the system's graphics driver
            devices: A list in which every item is a namedtuple Device
                with 10 fields, for example id, name and fan_speed.
                Note that the process field is itself a list of
                namedtuple Process with 11 fields.
    """
    infos = {}
    Device = namedtuple(
        "Device",
        [
            "id",
            "name",
            "free",
            "used",
            "total",
            "temperature",
            "fan_speed",
            "power_usage",
            "power_state",
            "process",
        ],
    )
    Process = namedtuple(
        "Process",
        [
            "pid",
            "memory_percent",
            "status",
            "username",
            "num_threads",
            "cpu_num",
            "cpu_percent",
            "name",
            "cmdline",
            "used_gpu_mem",
            "create_time",
        ],
    )
    driver_version = pynvml.nvmlSystemGetDriverVersion().decode()
    device_count = pynvml.nvmlDeviceGetCount()
    devices = []
    for i in range(device_count):
        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
        name = pynvml.nvmlDeviceGetName(handle).decode()
        mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        # Power usage in milliwatts (mW)
        power_usage = pynvml.nvmlDeviceGetPowerUsage(handle)
        # Which processes are using the GPU
        processes = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
        process_info = []
        for p in processes:
            # Build a Process namedtuple for each GPU process
            pid = p.pid
            used_gpu_mem = p.usedGpuMemory
            p = psutil.Process(pid=pid)
            _ = p.cpu_percent()
            time.sleep(0.05)
            process_info.append(
                Process(
                    pid=pid,
                    memory_percent=p.memory_percent(),
                    status=p.status(),
                    username=p.username(),
                    num_threads=p.num_threads(),
                    cpu_num=p.cpu_num(),
                    cpu_percent=p.cpu_percent(),
                    name=p.name(),
                    cmdline=" ".join(p.cmdline()),
                    used_gpu_mem=used_gpu_mem,
                    create_time=p.create_time(),
                ))
        try:
            fan_speed = pynvml.nvmlDeviceGetFanSpeed(handle)
        except pynvml.NVMLError_NotSupported:
            fan_speed = None
        power_state = pynvml.nvmlDeviceGetPowerState(handle)
        temperature = pynvml.nvmlDeviceGetTemperature(
            handle, pynvml.NVML_TEMPERATURE_GPU)
        devices.append(
            Device(
                id=i,
                name=name,
                free=mem_info.free,
                used=mem_info.used,
                total=mem_info.total,
                temperature=temperature,
                fan_speed=fan_speed,
                power_usage=power_usage,
                power_state=power_state,
                process=process_info,
            ))
    infos["count"] = device_count
    infos["driver_version"] = driver_version
    infos["devices"] = devices
    return infos
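# A minimal usage sketch for get_infos(); it only assumes pynvml is installed
# and NVML is initialised before the call. The printed formatting is illustrative.
import pynvml

pynvml.nvmlInit()
try:
    infos = get_infos()
    print("driver:", infos["driver_version"])
    for dev in infos["devices"]:
        print(f"GPU {dev.id} {dev.name}: "
              f"{dev.used / 1024**2:.0f}/{dev.total / 1024**2:.0f} MiB, "
              f"fan {dev.fan_speed}%, {dev.power_usage / 1000:.1f} W")
finally:
    pynvml.nvmlShutdown()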
def get_gpu_info(handle):
    """Get one GPU information specified by nvml handle"""

    def get_last_used(index):
        last_useds = []
        if not os.path.exists('gpu_history.pkl'):
            pickle.dump({}, open('gpu_history.pkl', 'wb'))
        with open('gpu_history.pkl', 'rb') as f:
            history = pickle.load(f)
            if platform.node() in history:
                for user, last_used in history[
                        platform.node()][index].items():
                    # 1 day = 24 hours, 1 hour = 3600 seconds
                    used_before = \
                        (datetime.now() - last_used['last_used']).days * 24 + \
                        (datetime.now() - last_used['last_used']).seconds / 3600
                    last_useds.append((user, used_before))
                return last_useds
            else:
                return []

    def get_process_info(nv_process):
        """Get the process information of specific pid"""
        process = {}
        if nv_process.pid not in GPUStatCollection.global_processes:
            GPUStatCollection.global_processes[nv_process.pid] = \
                psutil.Process(pid=nv_process.pid)
        ps_process = GPUStatCollection.global_processes[nv_process.pid]
        process['username'] = ps_process.username()
        # cmdline returns full path;
        # as in `ps -o comm`, get short cmdnames.
        _cmdline = ps_process.cmdline()
        if not _cmdline:
            # sometimes, zombie or unknown (e.g. [kworker/8:2H])
            process['command'] = '?'
            process['full_command'] = ['?']
        else:
            process['command'] = os.path.basename(_cmdline[0])
            process['full_command'] = _cmdline
        # Bytes to MBytes
        process['gpu_memory_usage'] = nv_process.usedGpuMemory // MB
        process['cpu_percent'] = ps_process.cpu_percent()
        process['cpu_memory_usage'] = \
            round((ps_process.memory_percent() / 100.0) *
                  psutil.virtual_memory().total)
        process['pid'] = nv_process.pid
        return process

    name = _decode(N.nvmlDeviceGetName(handle))
    uuid = _decode(N.nvmlDeviceGetUUID(handle))

    try:
        temperature = N.nvmlDeviceGetTemperature(
            handle, N.NVML_TEMPERATURE_GPU)
    except N.NVMLError:
        temperature = None  # Not supported

    try:
        fan_speed = N.nvmlDeviceGetFanSpeed(handle)
    except N.NVMLError:
        fan_speed = None  # Not supported

    try:
        memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
    except N.NVMLError:
        memory = None  # Not supported

    try:
        utilization = N.nvmlDeviceGetUtilizationRates(handle)
    except N.NVMLError:
        utilization = None  # Not supported

    try:
        power = N.nvmlDeviceGetPowerUsage(handle)
    except N.NVMLError:
        power = None

    try:
        power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
    except N.NVMLError:
        power_limit = None

    try:
        nv_comp_processes = \
            N.nvmlDeviceGetComputeRunningProcesses(handle)
    except N.NVMLError:
        nv_comp_processes = None  # Not supported
    try:
        nv_graphics_processes = \
            N.nvmlDeviceGetGraphicsRunningProcesses(handle)
    except N.NVMLError:
        nv_graphics_processes = None  # Not supported

    if nv_comp_processes is None and nv_graphics_processes is None:
        processes = None
    else:
        processes = []
        nv_comp_processes = nv_comp_processes or []
        nv_graphics_processes = nv_graphics_processes or []
        for nv_process in nv_comp_processes + nv_graphics_processes:
            try:
                process = get_process_info(nv_process)
                processes.append(process)
            except psutil.NoSuchProcess:
                # TODO: add some reminder for NVML broken context
                # e.g. nvidia-smi reset or reboot the system
                pass

        # TODO: Do not block if full process info is not requested
        time.sleep(0.1)
        for process in processes:
            pid = process['pid']
            cache_process = GPUStatCollection.global_processes[pid]
            process['cpu_percent'] = cache_process.cpu_percent()

    index = N.nvmlDeviceGetIndex(handle)
    last_used = get_last_used(index)
    gpu_info = {
        'index': index,
        'uuid': uuid,
        'name': name,
        'temperature.gpu': temperature,
        'fan.speed': fan_speed,
        'utilization.gpu': utilization.gpu if utilization else None,
        'power.draw': power // 1000 if power is not None else None,
        'enforced.power.limit': power_limit // 1000
        if power_limit is not None else None,
        # Convert bytes into MBytes
        'memory.used': memory.used // MB if memory else None,
        'memory.total': memory.total // MB if memory else None,
        'processes': processes,
        'last_used': last_used,
    }
    GPUStatCollection.clean_processes()
    return gpu_info
def log_system(log_file, process_pids=None):
    """ Logs system utilization metrics to log file """
    # log cpu util
    cpu_util = psutil.cpu_percent()
    cpu_util_ind = psutil.cpu_percent(percpu=True)
    ts = time.time()
    key = "INFO"
    message = "CPU util: {}% -- Individual utils 1-24: {}".format(
        cpu_util, cpu_util_ind[:24])
    write_to_log(log_file, (ts, key, message))
    message = "CPU util: {}% -- Individual utils 25-48: {}".format(
        cpu_util, cpu_util_ind[24:])
    write_to_log(log_file, (ts, key, message))

    # log GPU util and memory
    try:
        max_gpu_util = 0
        deviceCount = pynvml.nvmlDeviceGetCount()
        for idx in range(deviceCount):
            handle = pynvml.nvmlDeviceGetHandleByIndex(idx)
            board_num = pynvml.nvmlDeviceGetBoardId(handle)
            name = "GPU {}: {} (ID {})".format(
                idx,
                pynvml.nvmlDeviceGetName(handle).decode("utf-8"), board_num)
            util = pynvml.nvmlDeviceGetUtilizationRates(handle)
            fan_util = pynvml.nvmlDeviceGetFanSpeed(handle)
            pcie_counter = pynvml.nvmlDeviceGetPcieReplayCounter(handle)
            pcie_util = pynvml.nvmlDeviceGetPcieThroughput(
                handle, pcie_counter)
            gpu_util = util.gpu
            mem_util = util.memory
            message = "{}: Kernel:{}% Mem:{}% Fan:{}% PCIe: {}MB/s".format(
                name, gpu_util, mem_util, fan_util,
                round(pcie_util / 1000, 1))
            ts = time.time()
            key = "INFO"
            write_to_log(log_file, (ts, key, message))
            if gpu_util > max_gpu_util:
                max_gpu_util = gpu_util
    except pynvml.NVMLError as error:
        print(error)

    # log memory util
    mem_util = psutil.virtual_memory()
    used = round(mem_util.used / 1e+9, 2)
    total = round(mem_util.total / 1e+9, 2)
    ts = time.time()
    key = "INFO"
    message = "Memory util: {}% ({}/{}GB)".format(
        round(used / total * 100, 2), used, total)
    write_to_log(log_file, (ts, key, message))

    pid_statuses = []
    warning = False
    if process_pids is not None:
        for key in process_pids:
            pid = process_pids[key]
            try:
                os.kill(pid, 0)
                RUNNING = "running"
            except OSError:
                RUNNING = "stopped"
                warning = True
            pid_statuses.append("{} ({}): {}\n".format(key, pid, RUNNING))
        ts = time.time()
        key = "INFO"
        if warning:
            key = "WARNING"
        write_to_log(log_file, (ts, key, pid_statuses))

    last_log_time = time.time()
    return last_log_time, max_gpu_util
def get_gpu_pid_info():
    """Retrieves the process IDs of processes running on the GPU."""
    gpus = []
    device_count = -1

    try:
        nvmlInit()
        device_count = nvmlDeviceGetCount()
        gpus = [{}] * device_count
        for i in range(device_count):
            gpus[i] = {'id': i}
            handle = nvmlDeviceGetHandleByIndex(i)
            device_name = nvmlDeviceGetName(handle)
            gpus[i]['name'] = device_name

            try:
                util = nvmlDeviceGetUtilizationRates(handle)
                gpus[i]['utilization'] = util.gpu
            except NVMLError as err:
                print(f'Error while reading GPU utilization for GPU {i}: {err}',
                      file=sys.stderr)

            try:
                mem_info = nvmlDeviceGetMemoryInfo(handle)
                gpus[i]['mem_total'] = mem_info.total
                gpus[i]['mem_used'] = mem_info.used
            except NVMLError as err:
                print(f'Error while reading memory utilization for GPU {i}: {err}',
                      file=sys.stderr)

            try:
                fan_speed = nvmlDeviceGetFanSpeed(handle)
                gpus[i]['fan_speed'] = fan_speed
            except NVMLError as err:
                print(f'Error while reading fan speed for GPU {i}: {err}',
                      file=sys.stderr)

            try:
                temp = nvmlDeviceGetTemperature(handle, 0)
                gpus[i]['temp'] = temp
            except NVMLError as err:
                print(f'Error while reading temperature for GPU {i}: {err}',
                      file=sys.stderr)

            try:
                power_usage = nvmlDeviceGetPowerUsage(handle)
                gpus[i]['power_usage'] = round(power_usage / 1000.)
            except NVMLError as err:
                print(f'Error while reading power usage for GPU {i}: {err}',
                      file=sys.stderr)

            try:
                power_limit = nvmlDeviceGetEnforcedPowerLimit(handle)
                gpus[i]['power_limit'] = round(power_limit / 1000.)
            except NVMLError as err:
                print(f'Error while reading power limit for GPU {i}: {err}',
                      file=sys.stderr)

            gpus[i]['processes'] = []
            try:
                processes = nvmlDeviceGetComputeRunningProcesses(handle)
                for process in processes:
                    process_name = nvmlSystemGetProcessName(process.pid).decode()
                    gpus[i]['processes'].append(
                        {'pid': process.pid, 'name': process_name})
            except NVMLError as err:
                print(f'Error while reading processes for GPU {i}: {err}',
                      file=sys.stderr)
    except NVMLError as err:
        print(f'Error while reading GPU information: {err}', file=sys.stderr)

    nvmlShutdown()
    return gpus, device_count
deviceCount = pynvml.nvmlDeviceGetCount()
print(' Found %s GPU(s), named:' % deviceCount)
for i in range(deviceCount):
    handle = pynvml.nvmlDeviceGetHandleByIndex(i)
    print("  GPU", i, ":", pynvml.nvmlDeviceGetName(handle))
print('--------------')

for i in range(deviceCount):
    print('Memory, temperature, fan and power of GPU %s: ' % i)
    handle = pynvml.nvmlDeviceGetHandleByIndex(i)
    info = pynvml.nvmlDeviceGetMemoryInfo(handle)
    print("Memory Total: %0.2f G" % (info.total / 1024 / 1024 / 1024))  # total memory
    print("Memory Free: %0.2f G " % (info.free / 1024 / 1024 / 1024))   # free memory
    print("Memory Used: %0.2f G " % (info.used / 1024 / 1024 / 1024))
    print("Memory Used percent: %0.2f %% " % (info.used / info.total * 100))
    print("Temperature is %d C" % (pynvml.nvmlDeviceGetTemperature(handle, 0)))
    print("Fan speed is ", pynvml.nvmlDeviceGetFanSpeed(handle))
    print("Power status", pynvml.nvmlDeviceGetPowerState(handle))
    print('--------------')

# Finally, shut down the management library
pynvml.nvmlShutdown()

#%% ----------------- os ----------------------
os.listdir(r'c:\windows')
os.getcwd()                           # current working directory
os.chdir(r'C:\Users\Python_Folder')   # change working directory to dirname
os.curdir                             # string for the current directory ('.')
os.__file__                           # e.g. D:\envs\py27\lib\os.pyc
os.rename("python26", "python21")
shutil.move("python21", "python20")
def check(self, instance):
    pynvml.nvmlInit()

    msg_list = []
    try:
        deviceCount = pynvml.nvmlDeviceGetCount()
    except:
        deviceCount = 0
    # Number of active GPUs
    self.gauge('nvml.gpus.number', deviceCount)
    for device_id in range(deviceCount):
        handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
        name = pynvml.nvmlDeviceGetName(handle)
        tags = dict(name="{}-{}".format(name, device_id))
        d_tags = self._dict2list(tags)
        # temperature info
        try:
            temp = pynvml.nvmlDeviceGetTemperature(
                handle, pynvml.NVML_TEMPERATURE_GPU)
            self.gauge('nvml.temp.', temp, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append('nvmlDeviceGetTemperature:{}'.format(err))
        # power info
        try:
            pwr = pynvml.nvmlDeviceGetPowerUsage(handle) // 1000
            self.gauge('nvml.power.', pwr, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append('nvmlDeviceGetPowerUsage:{}'.format(err))
        # fan info
        try:
            fan = pynvml.nvmlDeviceGetFanSpeed(handle)
            self.gauge('nvml.fan.', fan, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append('nvmlDeviceGetFanSpeed:{}'.format(err))
        # memory info
        try:
            mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
            self.gauge('nvml.mem.total', mem.total, tags=d_tags)
            self.gauge('nvml.mem.used', mem.used, tags=d_tags)
            self.gauge('nvml.mem.free', mem.free, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append('nvmlDeviceGetMemoryInfo:{}'.format(err))
        # utilization GPU/Memory info
        try:
            util_rate = pynvml.nvmlDeviceGetUtilizationRates(handle)
            self.gauge('nvml.util.gpu', util_rate.gpu, tags=d_tags)
            self.gauge('nvml.util.memory', util_rate.memory, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(
                'nvmlDeviceGetUtilizationRates:{}'.format(err))
        # utilization Encoder info
        try:
            util_encoder = pynvml.nvmlDeviceGetEncoderUtilization(handle)
            self.log.debug('nvml.util.encoder %s' % int(util_encoder[0]))
            self.gauge('nvml.util.encoder', int(util_encoder[0]),
                       tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(
                'nvmlDeviceGetEncoderUtilization:{}'.format(err))
        # utilization Decoder info
        try:
            util_decoder = pynvml.nvmlDeviceGetDecoderUtilization(handle)
            self.log.debug('nvml.util.decoder %s' % int(util_decoder[0]))
            self.gauge('nvml.util.decoder', int(util_decoder[0]),
                       tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(
                'nvmlDeviceGetDecoderUtilization:{}'.format(err))
        # Compute running processes
        try:
            cps = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
            for ps in cps:
                p_tags = tags.copy()
                p_tags['pid'] = ps.pid
                p_tags['name'] = pynvml.nvmlSystemGetProcessName(ps.pid)
                p_tags = self._dict2list(p_tags)
                self.gauge('nvml.process.used_gpu_memory',
                           ps.usedGpuMemory, tags=p_tags)
        except pynvml.NVMLError as err:
            msg_list.append(
                'nvmlDeviceGetComputeRunningProcesses:{}'.format(err))
        # Clocks throttling info
        # Divide by the mask so that the value is either 0 or 1 per GPU
        try:
            throttle_reasons = (
                pynvml.nvmlDeviceGetCurrentClocksThrottleReasons(handle))
            self.gauge('nvml.throttle.appsettings',
                       (throttle_reasons &
                        pynvml.nvmlClocksThrottleReasonApplicationsClocksSetting) /
                       pynvml.nvmlClocksThrottleReasonApplicationsClocksSetting,
                       tags=d_tags)
            self.gauge('nvml.throttle.display',
                       (throttle_reasons &
                        GPU_THROTTLE_DISPLAY_CLOCKS_SETTINGS) /
                       GPU_THROTTLE_DISPLAY_CLOCKS_SETTINGS,
                       tags=d_tags)
            self.gauge('nvml.throttle.hardware',
                       (throttle_reasons &
                        pynvml.nvmlClocksThrottleReasonHwSlowdown) /
                       pynvml.nvmlClocksThrottleReasonHwSlowdown,
                       tags=d_tags)
            self.gauge('nvml.throttle.idle',
                       (throttle_reasons &
                        pynvml.nvmlClocksThrottleReasonGpuIdle) /
                       pynvml.nvmlClocksThrottleReasonGpuIdle,
                       tags=d_tags)
            self.gauge('nvml.throttle.power.hardware',
                       (throttle_reasons &
                        GPU_THROTTLE_POWER_BRAKE_SLOWDOWN_HARDWARE) /
                       GPU_THROTTLE_POWER_BRAKE_SLOWDOWN_HARDWARE,
                       tags=d_tags)
            self.gauge('nvml.throttle.power.software',
                       (throttle_reasons &
                        pynvml.nvmlClocksThrottleReasonSwPowerCap) /
                       pynvml.nvmlClocksThrottleReasonSwPowerCap,
                       tags=d_tags)
            self.gauge('nvml.throttle.syncboost',
                       (throttle_reasons & GPU_THROTTLE_SYNCBOOST) /
                       GPU_THROTTLE_SYNCBOOST,
                       tags=d_tags)
            self.gauge('nvml.throttle.temp.hardware',
                       (throttle_reasons &
                        GPU_THROTTLE_THERMAL_SLOWDOWN_HARDWARE) /
                       GPU_THROTTLE_THERMAL_SLOWDOWN_HARDWARE,
                       tags=d_tags)
            self.gauge('nvml.throttle.temp.software',
                       (throttle_reasons &
                        GPU_THROTTLE_THERMAL_SLOWDOWN_SOFTWARE) /
                       GPU_THROTTLE_THERMAL_SLOWDOWN_SOFTWARE,
                       tags=d_tags)
            self.gauge('nvml.throttle.unknown',
                       (throttle_reasons &
                        pynvml.nvmlClocksThrottleReasonUnknown) /
                       pynvml.nvmlClocksThrottleReasonUnknown,
                       tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(
                'nvmlDeviceGetCurrentClocksThrottleReasons:{}'.format(err))

    if msg_list:
        status = AgentCheck.CRITICAL
        msg = ','.join(msg_list)
    else:
        status = AgentCheck.OK
        msg = 'Ok'

    pynvml.nvmlShutdown()

    self.service_check('nvml.check', status, message=msg)
    pid, p.memory_full_info().pss / 1024. / 1024. / 1024.))
################# check process resource usage ################################

################# check GPU resource usage #####################################
pynvml.nvmlInit()
deviceCount = pynvml.nvmlDeviceGetCount()
for i in range(deviceCount):  # here, i is the GPU id
    handle = pynvml.nvmlDeviceGetHandleByIndex(i)
    print("GPU %d name %s" % (i, pynvml.nvmlDeviceGetName(handle)))
    print("GPU %d Driver %s" % (i, pynvml.nvmlSystemGetDriverVersion()))  # driver info
    meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
    print("GPU %d mem info total : %.3f GByte" % (i, meminfo.total / 1024. / 1024. / 1024.))
    print("GPU %d mem info used : %.3f MByte" % (i, meminfo.used / 1024. / 1024.))
    print("GPU %d mem info free : %.3f MByte" % (i, meminfo.free / 1024. / 1024.))
    print("Temperature is %d℃" % pynvml.nvmlDeviceGetTemperature(handle, 0))
    print("Fan speed is %d%%" % pynvml.nvmlDeviceGetFanSpeed(handle))
    print("Power status P%d" % pynvml.nvmlDeviceGetPowerState(handle))
    print("Power usage %.1fW" % (pynvml.nvmlDeviceGetPowerUsage(handle) / 1000.0))
    print("nvmlSystemGetProcessName is %s" % pynvml.nvmlSystemGetProcessName(pid))  # e.g. /usr/bin/python
    # print("nvmlDeviceGetAccountingStats is %s" % pynvml.nvmlDeviceGetAccountingStats(handle, pid))
# Finally, shut down the management library
pynvml.nvmlShutdown()
################# check GPU resource usage #####################################
def check(self, instance):
    pynvml.nvmlInit()

    msg_list = []
    try:
        deviceCount = pynvml.nvmlDeviceGetCount()
    except:
        deviceCount = 0
    for device_id in xrange(deviceCount):
        handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
        name = pynvml.nvmlDeviceGetName(handle)
        tags = dict(name="{}-{}".format(name, device_id))
        d_tags = self._dict2list(tags)
        # temperature info
        try:
            temp = pynvml.nvmlDeviceGetTemperature(
                handle, pynvml.NVML_TEMPERATURE_GPU)
            self.gauge('nvml.temp.', temp, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetTemperature:{}'.format(err))
        # power info
        try:
            pwr = pynvml.nvmlDeviceGetPowerUsage(handle) // 1000
            self.gauge('nvml.power.', pwr, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetPowerUsage:{}'.format(err))
        # fan info
        try:
            fan = pynvml.nvmlDeviceGetFanSpeed(handle)
            self.gauge('nvml.fan.', fan, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetFanSpeed:{}'.format(err))
        # memory info
        try:
            mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
            self.gauge('nvml.mem.total', mem.total, tags=d_tags)
            self.gauge('nvml.mem.used', mem.used, tags=d_tags)
            self.gauge('nvml.mem.free', mem.free, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetMemoryInfo:{}'.format(err))
        # utilization GPU/Memory info
        try:
            util_rate = pynvml.nvmlDeviceGetUtilizationRates(handle)
            self.gauge('nvml.util.gpu', util_rate.gpu, tags=d_tags)
            self.gauge('nvml.util.memory', util_rate.memory, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(
                u'nvmlDeviceGetUtilizationRates:{}'.format(err))
        # utilization Encoder info
        try:
            util_encoder = pynvml.nvmlDeviceGetEncoderUtilization(handle)
            self.log.debug('nvml.util.encoder %s' % long(util_encoder[0]))
            self.gauge('nvml.util.encoder', long(util_encoder[0]),
                       tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(
                u'nvmlDeviceGetEncoderUtilization:{}'.format(err))
        # utilization Decoder info
        try:
            util_decoder = pynvml.nvmlDeviceGetDecoderUtilization(handle)
            self.log.debug('nvml.util.decoder %s' % long(util_decoder[0]))
            self.gauge('nvml.util.decoder', long(util_decoder[0]),
                       tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(
                u'nvmlDeviceGetDecoderUtilization:{}'.format(err))
        # Compute running processes
        try:
            cps = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
            for ps in cps:
                p_tags = tags.copy()
                p_tags['pid'] = ps.pid
                p_tags['name'] = pynvml.nvmlSystemGetProcessName(ps.pid)
                p_tags = self._dict2list(p_tags)
                self.gauge('nvml.process.used_gpu_memory',
                           ps.usedGpuMemory, tags=p_tags)
        except pynvml.NVMLError as err:
            msg_list.append(
                u'nvmlDeviceGetComputeRunningProcesses:{}'.format(err))

    if msg_list:
        status = AgentCheck.CRITICAL
        msg = u','.join(msg_list)
    else:
        status = AgentCheck.OK
        msg = u'Ok'

    pynvml.nvmlShutdown()

    self.service_check('nvml.check', status, message=msg)
# decoder usage
utilization, samplingPeriodUs = nvmlDeviceGetDecoderUtilization(handle)
info['DEC Util'] = getBar(utilization)

# power state
power_used = nvmlDeviceGetPowerUsage(handle) / 1000
power_limit = nvmlDeviceGetPowerManagementDefaultLimit(handle) / 1000
power_used = int(power_used)
power_limit = int(power_limit)
power_rate = int(power_used / power_limit * 100)
msg = pack_msg([power_used, power_limit], 'W')
info['Power Util'] = getBar(power_rate, msg)

# fan speed, temperature
fan_speed = nvmlDeviceGetFanSpeed(handle)
temp = nvmlDeviceGetTemperature(handle, 0)
msg = f"{temp}C"
info['Fan Speed'] = getBar(fan_speed, msg)

message = [f"{k} \t{v}" for k, v in info.items()]
print('\n'.join(message))

# graphic processes
graphic_processes = nvmlDeviceGetGraphicsRunningProcesses(handle)
header = "\n=== Graphic Processes ==="
show_process(header, graphic_processes)

# compute processes
compute_processes = nvmlDeviceGetComputeRunningProcesses(handle)
header = "\n=== Compute Processes ==="
def _get_full_status_nvml():
    devices_status = []
    devices_full_status = []
    for handle in _static_info['private']['gpu']['handles']:
        util = pynvml.nvmlDeviceGetUtilizationRates(handle)
        mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        process_info = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
        devices_status.append({
            'utilization': {
                'gpu': util.gpu,
                'memory': util.memory
            },
            'memory': {
                'percent': int(1000.0 * mem_info.used / mem_info.total) / 10.0
            },
            'processes': len(process_info)
        })
        with _process_info_lock:
            process_list = []
            for p in process_info:
                info = _process_info[p.pid]
                info['gpu_memory'] = p.usedGpuMemory
                process_list.append(info)
        process_list.sort(key=lambda i: i['gpu_memory'] or 0, reverse=True)
        full_status = {
            'memory': {
                'free': mem_info.free,
                'used': mem_info.used
            },
            'process_list': process_list
        }
        try:
            full_status['fan_speed'] = pynvml.nvmlDeviceGetFanSpeed(handle)
        except pynvml.NVMLError_NotSupported:
            pass
        try:
            full_status['temperature'] = pynvml.nvmlDeviceGetTemperature(
                handle, pynvml.NVML_TEMPERATURE_GPU)
        except pynvml.NVMLError_NotSupported:
            pass
        try:
            full_status['performance'] = pynvml.nvmlDeviceGetPerformanceState(
                handle)
        except pynvml.NVMLError_NotSupported:
            pass
        try:
            full_status['power'] = {
                'usage': pynvml.nvmlDeviceGetPowerUsage(handle),
                'limit': pynvml.nvmlDeviceGetPowerManagementLimit(handle)
            }
        except pynvml.NVMLError_NotSupported:
            pass
        devices_full_status.append(full_status)
    status = {
        'basic': {
            'devices': devices_status
        },
        'full': {
            'devices': devices_full_status
        }
    }
    return status
def get_gpu_info(handle):
    """Get one GPU information specified by nvml handle"""

    def get_process_info(nv_process):
        """Get the process information of specific pid"""
        process = {}
        ps_process = psutil.Process(pid=nv_process.pid)
        process['username'] = ps_process.username()
        # cmdline returns full path;
        # as in `ps -o comm`, get short cmdnames.
        _cmdline = ps_process.cmdline()
        if not _cmdline:
            # sometimes, zombie or unknown (e.g. [kworker/8:2H])
            process['command'] = '?'
        else:
            process['command'] = os.path.basename(_cmdline[0])
        # Bytes to MBytes
        process['gpu_memory_usage'] = nv_process.usedGpuMemory // MB
        process['pid'] = nv_process.pid
        return process

    name = _decode(N.nvmlDeviceGetName(handle))
    uuid = _decode(N.nvmlDeviceGetUUID(handle))

    try:
        temperature = N.nvmlDeviceGetTemperature(
            handle, N.NVML_TEMPERATURE_GPU)
    except N.NVMLError:
        temperature = None  # Not supported

    try:
        fan_speed = N.nvmlDeviceGetFanSpeed(handle)
    except N.NVMLError:
        fan_speed = None  # Not supported

    try:
        memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
    except N.NVMLError:
        memory = None  # Not supported

    try:
        utilization = N.nvmlDeviceGetUtilizationRates(handle)
    except N.NVMLError:
        utilization = None  # Not supported

    try:
        power = N.nvmlDeviceGetPowerUsage(handle)
    except N.NVMLError:
        power = None

    try:
        power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
    except N.NVMLError:
        power_limit = None

    try:
        nv_comp_processes = \
            N.nvmlDeviceGetComputeRunningProcesses(handle)
    except N.NVMLError:
        nv_comp_processes = None  # Not supported
    try:
        nv_graphics_processes = \
            N.nvmlDeviceGetGraphicsRunningProcesses(handle)
    except N.NVMLError:
        nv_graphics_processes = None  # Not supported

    if nv_comp_processes is None and nv_graphics_processes is None:
        processes = None
    else:
        processes = []
        nv_comp_processes = nv_comp_processes or []
        nv_graphics_processes = nv_graphics_processes or []
        for nv_process in nv_comp_processes + nv_graphics_processes:
            # TODO: could be more information such as system memory
            # usage, CPU percentage, create time etc.
            try:
                process = get_process_info(nv_process)
                processes.append(process)
            except psutil.NoSuchProcess:
                # TODO: add some reminder for NVML broken context
                # e.g. nvidia-smi reset or reboot the system
                pass

    index = N.nvmlDeviceGetIndex(handle)
    gpu_info = {
        'index': index,
        'uuid': uuid,
        'name': name,
        'temperature.gpu': temperature,
        'fan.speed': fan_speed,
        'utilization.gpu': utilization.gpu if utilization else None,
        'power.draw': power // 1000 if power is not None else None,
        'enforced.power.limit': power_limit // 1000
        if power_limit is not None else None,
        # Convert bytes into MBytes
        'memory.used': memory.used // MB if memory else None,
        'memory.total': memory.total // MB if memory else None,
        'processes': processes,
    }
    return gpu_info
def get_fan_speed(handle):
    try:
        return pynvml.nvmlDeviceGetFanSpeed(handle)
    except pynvml.NVMLError_NotSupported:
        return None
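# A minimal usage sketch for the helper above, assuming only pynvml; it prints
# the fan speed of every visible GPU (None where the query is unsupported,
# e.g. on passively cooled boards).
import pynvml

pynvml.nvmlInit()
try:
    for i in range(pynvml.nvmlDeviceGetCount()):
        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
        print(f"GPU {i}: fan speed = {get_fan_speed(handle)}")
finally:
    pynvml.nvmlShutdown()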
def _get_data(self):
    data = {}

    if self.deviceCount:
        for i in range(self.deviceCount):
            gpuIdx = str(i)
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            name = pynvml.nvmlDeviceGetName(handle)
            brand = pynvml.nvmlDeviceGetBrand(handle)

            ### Get data ###
            ## Memory usage
            try:
                mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
            except Exception as e:
                self.debug(str(e))
                mem = None
            ## ECC errors
            try:
                eccErrors = {}
                eccCounterType = ['VOLATILE_ECC', 'AGGREGATE_ECC']
                memErrorType = ['ERROR_TYPE_CORRECTED',
                                'ERROR_TYPE_UNCORRECTED']
                memoryLocationType = ['L1_CACHE', 'L2_CACHE', 'DEVICE_MEMORY',
                                      'REGISTER_FILE', 'TEXTURE_MEMORY']
                for memoryLocation in range(5):
                    # Build fresh dicts per location/counter so the nested
                    # entries do not all alias the same object.
                    _eccCounter = {}
                    for eccCounter in range(2):
                        _memError = {}
                        for memError in range(2):
                            _memError[memErrorType[memError]] = \
                                pynvml.nvmlDeviceGetMemoryErrorCounter(
                                    handle, memError, eccCounter,
                                    memoryLocation)
                        _eccCounter[eccCounterType[eccCounter]] = _memError
                    eccErrors[memoryLocationType[memoryLocation]] = _eccCounter
            except Exception as e:
                self.debug(str(e))
                eccErrors = None
            ## Temperature
            try:
                temp = pynvml.nvmlDeviceGetTemperature(
                    handle, pynvml.NVML_TEMPERATURE_GPU)
            except Exception as e:
                self.debug(str(e))
                temp = None
            ## Fan
            try:
                fanspeed = pynvml.nvmlDeviceGetFanSpeed(handle)
            except Exception as e:
                self.debug(str(e))
                fanspeed = None
            ## GPU and Memory Utilization
            try:
                util = pynvml.nvmlDeviceGetUtilizationRates(handle)
                gpu_util = util.gpu
                mem_util = util.memory
            except Exception as e:
                self.debug(str(e))
                gpu_util = None
                mem_util = None
            ## Encoder Utilization
            try:
                encoder = pynvml.nvmlDeviceGetEncoderUtilization(handle)
                enc_util = encoder[0]
            except Exception as e:
                self.debug(str(e))
                enc_util = None
            ## Decoder Utilization
            try:
                decoder = pynvml.nvmlDeviceGetDecoderUtilization(handle)
                dec_util = decoder[0]
            except Exception as e:
                self.debug(str(e))
                dec_util = None
            ## Clock frequencies
            try:
                clock_core = pynvml.nvmlDeviceGetClockInfo(
                    handle, pynvml.NVML_CLOCK_GRAPHICS)
                clock_sm = pynvml.nvmlDeviceGetClockInfo(
                    handle, pynvml.NVML_CLOCK_SM)
                clock_mem = pynvml.nvmlDeviceGetClockInfo(
                    handle, pynvml.NVML_CLOCK_MEM) * self.nvMemFactor
            except Exception as e:
                self.debug(str(e))
                clock_core = None
                clock_sm = None
                clock_mem = None

            ### Packing data ###
            self.debug("Device", gpuIdx, ":", str(name))
            data["device_name_" + gpuIdx] = name
            self.debug("Brand:", str(brand))
            self.debug(str(name), "Temp      :", str(temp))
            data["device_temp_" + gpuIdx] = temp
            self.debug(str(name), "Mem total :", str(mem.total), 'bytes')
            data["device_mem_total_" + gpuIdx] = mem.total
            self.debug(str(name), "Mem used  :", str(mem.used), 'bytes')
            data["device_mem_used_" + gpuIdx] = mem.used
            self.debug(str(name), "Mem free  :", str(mem.free), 'bytes')
            data["device_mem_free_" + gpuIdx] = mem.free
            self.debug(str(name), "Load GPU  :", str(gpu_util), '%')
            data["device_load_gpu_" + gpuIdx] = gpu_util
            self.debug(str(name), "Load MEM  :", str(mem_util), '%')
            data["device_load_mem_" + gpuIdx] = mem_util
            self.debug(str(name), "Load ENC  :", str(enc_util), '%')
            data["device_load_enc_" + gpuIdx] = enc_util
            self.debug(str(name), "Load DEC  :", str(dec_util), '%')
            data["device_load_dec_" + gpuIdx] = dec_util
            self.debug(str(name), "Core clock:", str(clock_core), 'MHz')
            data["device_core_clock_" + gpuIdx] = clock_core
            self.debug(str(name), "SM clock  :", str(clock_sm), 'MHz')
            data["device_sm_clock_" + gpuIdx] = clock_sm
            self.debug(str(name), "Mem clock :", str(clock_mem), 'MHz')
            data["device_mem_clock_" + gpuIdx] = clock_mem
            self.debug(str(name), "Fan speed :", str(fanspeed), '%')
            data["device_fanspeed_" + gpuIdx] = fanspeed
            self.debug(str(name), "ECC errors:", str(eccErrors))
            if eccErrors is not None:
                # The original spelled out all twenty assignments; this loop
                # produces exactly the same keys of the form
                # device_ecc_errors_<LOCATION>_<VOLATILE|AGGREGATE>_<CORRECTED|UNCORRECTED>_<idx>
                for location in memoryLocationType:
                    for counter in eccCounterType:
                        for error in memErrorType:
                            key = "device_ecc_errors_{0}_{1}_{2}_{3}".format(
                                location,
                                counter.replace('_ECC', ''),
                                error.replace('ERROR_TYPE_', ''),
                                gpuIdx)
                            data[key] = eccErrors[location][counter][error]
            else:
                data["device_ecc_errors_L1_CACHE_VOLATILE_CORRECTED_" +
                     gpuIdx] = None

    ## Get unit (S-class Nvidia cards) data
    if self.unitCount:
        for i in range(self.unitCount):
            gpuIdx = str(i)
            handle = pynvml.nvmlUnitGetHandleByIndex(i)

            try:
                fan = pynvml.nvmlUnitGetFanSpeedInfo(handle)
                fan_speed = fan.speed  # Fan speed (RPM)
                fan_state = fan.state  # Flag that indicates whether fan is working properly
            except Exception as e:
                self.debug(str(e))
                fan_speed = None
                fan_state = None
            try:
                psu = pynvml.nvmlUnitGetPsuInfo(handle)
                psu_current = psu.current  # PSU current (A)
                psu_power = psu.power      # PSU power draw (W)
                psu_state = psu.state      # The power supply state
                psu_voltage = psu.voltage  # PSU voltage (V)
            except Exception as e:
                self.debug(str(e))
                psu_current = None
                psu_power = None
                psu_state = None
                psu_voltage = None
            try:
                temp_intake = pynvml.nvmlUnitGetTemperature(
                    handle, 0)  # Temperature at intake in C
                temp_exhaust = pynvml.nvmlUnitGetTemperature(
                    handle, 1)  # Temperature at exhaust in C
                temp_board = pynvml.nvmlUnitGetTemperature(
                    handle, 2)  # Temperature on board in C
            except Exception as e:
                self.debug(str(e))
                temp_intake = None
                temp_exhaust = None
                temp_board = None

            self.debug('Unit fan speed:', str(fan_speed))
            data["unit_fan_speed_" + gpuIdx] = fan_speed
            self.debug('Unit fan state:', str(fan_state))
            data["unit_fan_state_" + gpuIdx] = fan_state
            self.debug('Unit PSU current:', str(psu_current))
            data["unit_psu_current_" + gpuIdx] = psu_current
            self.debug('Unit PSU power:', str(psu_power))
            data["unit_psu_power_" + gpuIdx] = psu_power
            self.debug('Unit PSU state:', str(psu_state))
            data["unit_psu_state_" + gpuIdx] = psu_state
            self.debug('Unit PSU voltage:', str(psu_voltage))
            data["unit_psu_voltage_" + gpuIdx] = psu_voltage
            self.debug('Unit temp intake:', str(temp_intake))
            data["unit_temp_intake_" + gpuIdx] = temp_intake
            self.debug('Unit temp exhaust:', str(temp_exhaust))
            data["unit_temp_exhaust_" + gpuIdx] = temp_exhaust
            self.debug('Unit temp board:', str(temp_board))
            data["unit_temp_board_" + gpuIdx] = temp_board

    ## Get data via legacy mode
    if self.legacy:
        try:
            output, error = Popen(
                ["nvidia-settings", "-c", ":0",
                 "-q", "GPUUtilization",
                 "-q", "GPUCurrentClockFreqs",
                 "-q", "GPUCoreTemp",
                 "-q", "TotalDedicatedGPUMemory",
                 "-q", "UsedDedicatedGPUMemory"],
                shell=False, stdout=PIPE, stderr=PIPE).communicate()
            output = repr(str(output))
            if len(output) < 800:
                raise Exception(
                    'Error in fetching data from nvidia-settings ' + output)
            self.debug(str(error), output)
        except Exception as e:
            self.error(str(e))
            self.error('Setting legacy mode to False')
            self.legacy = False
            return data
        for i in range(self.deviceCount):
            gpuIdx = str(i)
            if data["device_temp_" + gpuIdx] is None:
                coreTemp = findall(r'GPUCoreTemp.*?(gpu:\d*).*?\s(\d*)',
                                   output)[i][1]
                try:
                    data["device_temp_" + gpuIdx] = int(coreTemp)
                    self.debug('Using legacy temp for GPU {0}: {1}'.format(
                        gpuIdx, coreTemp))
                except Exception as e:
                    self.debug(str(e), "skipping device_temp_" + gpuIdx)
            if data["device_mem_used_" + gpuIdx] is None:
                memUsed = findall(
                    r'UsedDedicatedGPUMemory.*?(gpu:\d*).*?\s(\d*)',
                    output)[i][1]
                try:
                    data["device_mem_used_" + gpuIdx] = int(memUsed)
                    self.debug(
                        'Using legacy mem_used for GPU {0}: {1}'.format(
                            gpuIdx, memUsed))
                except Exception as e:
                    self.debug(str(e), "skipping device_mem_used_" + gpuIdx)
            if data["device_load_gpu_" + gpuIdx] is None:
                gpu_util = findall(
                    r'(gpu:\d*).*?graphics=(\d*),.*?memory=(\d*)',
                    output)[i][1]
                try:
                    data["device_load_gpu_" + gpuIdx] = int(gpu_util)
                    self.debug(
                        'Using legacy load_gpu for GPU {0}: {1}'.format(
                            gpuIdx, gpu_util))
                except Exception as e:
                    self.debug(str(e), "skipping device_load_gpu_" + gpuIdx)
            if data["device_load_mem_" + gpuIdx] is None:
                mem_util = findall(
                    r'(gpu:\d*).*?graphics=(\d*),.*?memory=(\d*)',
                    output)[i][2]
                try:
                    data["device_load_mem_" + gpuIdx] = int(mem_util)
                    self.debug(
                        'Using legacy load_mem for GPU {0}: {1}'.format(
                            gpuIdx, mem_util))
                except Exception as e:
                    self.debug(str(e), "skipping device_load_mem_" + gpuIdx)
            if data["device_core_clock_" + gpuIdx] is None:
                clock_core = findall(
                    r'GPUCurrentClockFreqs.*?(gpu:\d*).*?(\d*),(\d*)',
                    output)[i][1]
                try:
                    data["device_core_clock_" + gpuIdx] = int(clock_core)
                    self.debug(
                        'Using legacy core_clock for GPU {0}: {1}'.format(
                            gpuIdx, clock_core))
                except Exception as e:
                    self.debug(str(e),
                               "skipping device_core_clock_" + gpuIdx)
            if data["device_mem_clock_" + gpuIdx] is None:
                clock_mem = findall(
                    r'GPUCurrentClockFreqs.*?(gpu:\d*).*?(\d*),(\d*)',
                    output)[i][2]
                try:
                    data["device_mem_clock_" + gpuIdx] = int(clock_mem)
                    self.debug(
                        'Using legacy mem_clock for GPU {0}: {1}'.format(
                            gpuIdx, clock_mem))
                except Exception as e:
                    self.debug(str(e),
                               "skipping device_mem_clock_" + gpuIdx)

    return data
def _get_fan_speed_percent(gpu):
    return {'fan_speed_percent': pynvml.nvmlDeviceGetFanSpeed(gpu)}