def pwr_limit(self, new_limit):
    """Apply a new power management limit to the device.

    The requested value is checked against ``self.pwr_constraints``
    (both ends inclusive) before it is handed to NVML. Afterwards the
    limit is read back from the device, so ``self._pwr_limit`` holds what
    NVML reports rather than what was requested.

    Args:
        new_limit: the limit to apply, in the same unit NVML uses for
            ``self.pwr_constraints``.

    Raises:
        ValueError: if ``new_limit`` lies outside ``self.pwr_constraints``.
    """
    bounds = self.pwr_constraints
    in_range = bounds[0] <= new_limit <= bounds[1]
    if not in_range:
        message = f"Power limit {new_limit} out of range [{self.pwr_constraints[0]}, {self.pwr_constraints[1]}]"
        raise ValueError(message)

    device = self.dev
    pynvml.nvmlDeviceSetPowerManagementLimit(device, new_limit)
    # Store the value the driver actually reports after the change.
    self._pwr_limit = pynvml.nvmlDeviceGetPowerManagementLimit(device)
def __init__(self, id=0):
    """Create object to control device using NVML

    Initializes NVML, fetches the handle of the device with index ``id``
    and caches the device's current settings on the instance. Each group
    of settings that NVML reports as not supported is stored as ``None``
    (empty containers for the supported-clock tables) instead of raising.
    """
    pynvml.nvmlInit()
    self.dev = pynvml.nvmlDeviceGetHandleByIndex(id)
    handle = self.dev

    # Power limit together with its permitted range.
    try:
        limit = pynvml.nvmlDeviceGetPowerManagementLimit(handle)
        constraints = pynvml.nvmlDeviceGetPowerManagementLimitConstraints(
            handle)
    except pynvml.NVMLError_NotSupported:
        limit = None
        # inverted range to make all range checks fail
        constraints = [1, 0]
    self._pwr_limit = limit
    self.pwr_constraints = constraints

    # Persistence mode.
    try:
        persistence = pynvml.nvmlDeviceGetPersistenceMode(handle)
    except pynvml.NVMLError_NotSupported:
        persistence = None
    self._persistence_mode = persistence

    # Auto boost; NVML returns [isEnabled, isDefaultEnabled] and only the
    # first entry is kept.
    try:
        auto_boost = pynvml.nvmlDeviceGetAutoBoostedClocksEnabled(handle)[0]
    except pynvml.NVMLError_NotSupported:
        auto_boost = None
    self._auto_boost = auto_boost

    # Default application clocks and the tables of supported clocks. If any
    # single query is unsupported, the whole group falls back to "unknown".
    try:
        gr_default = pynvml.nvmlDeviceGetDefaultApplicationsClock(
            handle, pynvml.NVML_CLOCK_GRAPHICS)
        sm_default = pynvml.nvmlDeviceGetDefaultApplicationsClock(
            handle, pynvml.NVML_CLOCK_SM)
        mem_default = pynvml.nvmlDeviceGetDefaultApplicationsClock(
            handle, pynvml.NVML_CLOCK_MEM)
        mem_clocks = pynvml.nvmlDeviceGetSupportedMemoryClocks(handle)
        # Map every supported memory clock to its supported graphics clocks.
        gr_clocks = {
            mem_clock: pynvml.nvmlDeviceGetSupportedGraphicsClocks(
                handle, mem_clock)
            for mem_clock in mem_clocks
        }
    except pynvml.NVMLError_NotSupported:
        gr_default = None
        sm_default = None
        mem_default = None
        mem_clocks = []
        gr_clocks = dict()
    self.gr_clock_default = gr_default
    self.sm_clock_default = sm_default
    self.mem_clock_default = mem_default
    self.supported_mem_clocks = mem_clocks
    self.supported_gr_clocks = gr_clocks
def get_perf(proc=None, recursive=True, children_pool=None, metrics=None):
    """Get process performance metrics

    Records a timestamp, system-wide CPU and memory figures from psutil
    and, for every entry of ``_gpu_devices``, utilisation, memory and
    power readings from NVML.

    Args:
        proc: optional process object; when given, its CPU percentage and
            RSS are recorded as well.
        recursive: when true (and ``proc`` is given), the per-process
            figures are gathered through ``_recursive_proc`` and summed,
            and ``proc_count`` is set to the number of CPU samples.
        children_pool: forwarded unchanged to ``_recursive_proc``.
        metrics: mapping to fill in place; a new ``OrderedDict`` is
            created when omitted.

    Returns:
        The ``metrics`` mapping with the collected values.
    """
    _initialize_pynvml()

    if metrics is None:
        metrics = OrderedDict()

    # --- CPU -------------------------------------------------------------
    metrics['timestamp'] = time.strftime('%Y-%m-%d %H:%M:%S')
    metrics['cpu_total'] = psutil.cpu_count()
    metrics['cpu_used'] = psutil.cpu_percent()
    if proc and recursive:
        cpu_samples = []
        _recursive_proc(cpu_samples, proc, children_pool,
                        lambda p: p.cpu_percent())
        metrics['proc_count'] = len(cpu_samples)
        metrics['cpu_used_proc'] = sum(cpu_samples)
    elif proc:
        metrics['cpu_used_proc'] = proc.cpu_percent()

    # --- Host memory -----------------------------------------------------
    host_mem = psutil.virtual_memory()
    metrics['mem_total'] = host_mem.total
    metrics['mem_used'] = host_mem.used
    if proc and recursive:
        rss_samples = []
        _recursive_proc(rss_samples, proc, children_pool,
                        lambda p: p.memory_info().rss)
        metrics['mem_used_proc'] = sum(rss_samples)
    elif proc:
        metrics['mem_used_proc'] = proc.memory_info().rss

    # --- GPUs ------------------------------------------------------------
    for i, handle in enumerate(_gpu_devices):
        utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
        gpu_mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
        metrics[f'gpu_{i}_used'] = utilization.gpu
        # Used memory is taken from the memory info, not utilization.memory.
        metrics[f'gpu_{i}_mem_used'] = gpu_mem.used
        metrics[f'gpu_{i}_mem_total'] = gpu_mem.total
        metrics[f'gpu_{i}_power_used'] = pynvml.nvmlDeviceGetPowerUsage(handle)
        metrics[f'gpu_{i}_power_total'] = (
            pynvml.nvmlDeviceGetPowerManagementLimit(handle))

    return metrics
def _get_full_status_nvml():
    """Query NVML for every known GPU handle and build the status report.

    Returns:
        A dict with two views over the same devices:
        ``status['basic']['devices']`` holds utilisation, memory percentage
        (one decimal place, truncated) and the number of compute processes;
        ``status['full']['devices']`` holds free/used memory, the process
        list sorted by GPU memory (largest first) and any of fan speed,
        temperature, performance state and power that the device supports.
    """
    # Optional readings: each is skipped for devices that do not support it.
    optional_readers = (
        ('fan_speed', lambda h: pynvml.nvmlDeviceGetFanSpeed(h)),
        ('temperature', lambda h: pynvml.nvmlDeviceGetTemperature(
            h, pynvml.NVML_TEMPERATURE_GPU)),
        ('performance', lambda h: pynvml.nvmlDeviceGetPerformanceState(h)),
        ('power', lambda h: {
            'usage': pynvml.nvmlDeviceGetPowerUsage(h),
            'limit': pynvml.nvmlDeviceGetPowerManagementLimit(h)
        }),
    )

    basic_devices = []
    full_devices = []

    for handle in _static_info['private']['gpu']['handles']:
        rates = pynvml.nvmlDeviceGetUtilizationRates(handle)
        memory = pynvml.nvmlDeviceGetMemoryInfo(handle)
        running = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)

        # Percentage with one decimal place, truncated rather than rounded.
        percent_used = int(1000.0 * memory.used / memory.total) / 10.0
        basic_devices.append({
            'utilization': {
                'gpu': rates.gpu,
                'memory': rates.memory
            },
            'memory': {
                'percent': percent_used
            },
            'processes': len(running)
        })

        # NOTE(review): the sort is kept under the lock because the records
        # are the shared entries of _process_info — confirm the intended
        # extent of the locked region.
        with _process_info_lock:
            processes = []
            for entry in running:
                record = _process_info[entry.pid]
                record['gpu_memory'] = entry.usedGpuMemory
                processes.append(record)
            # A missing (None) memory figure sorts as 0.
            processes.sort(key=lambda rec: rec['gpu_memory'] or 0,
                           reverse=True)

        full_status = {
            'memory': {
                'free': memory.free,
                'used': memory.used
            },
            'process_list': processes
        }
        for key, read in optional_readers:
            try:
                full_status[key] = read(handle)
            except pynvml.NVMLError_NotSupported:
                pass
        full_devices.append(full_status)

    return {
        'basic': {
            'devices': basic_devices
        },
        'full': {
            'devices': full_devices
        }
    }
def _get_power_limit_watts(gpu):
    """Return the power management limit of ``gpu`` in watts.

    The value NVML reports (milliwatts, per the NVML API reference) is
    divided by 1000 and returned as a float.

    Args:
        gpu: NVML device handle to query.

    Returns:
        A dict with the single key ``'power_limit_watts'``.
    """
    limit_raw = pynvml.nvmlDeviceGetPowerManagementLimit(gpu)
    return {'power_limit_watts': (limit_raw / 1000.0)}