def get_utilization_rates(handle):
    try:
        # query once and read both fields, instead of issuing two NVML calls
        rates = pynvml.nvmlDeviceGetUtilizationRates(handle)
        return dict(gpu=rates.gpu, memory=rates.memory)
    except pynvml.NVMLError_Unknown:
        return dict(gpu=None, memory=None)
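# Usage sketch (not part of the original source): a minimal driver for
# get_utilization_rates above. Assumes pynvml is installed; device index 0 and
# the nvmlInit/nvmlShutdown lifecycle are illustrative.
import pynvml

pynvml.nvmlInit()  # NVML must be initialized before any device query
try:
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)  # first GPU, for illustration
    print(get_utilization_rates(handle))  # e.g. {'gpu': 12, 'memory': 3}
finally:
    pynvml.nvmlShutdown()  # always release NVML state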
def real_time():
    # `handles` is a module-level list of NVML device handles
    return {
        "utilization": [pynvml.nvmlDeviceGetUtilizationRates(h).gpu for h in handles],
        "memory-used": [pynvml.nvmlDeviceGetMemoryInfo(h).used for h in handles],
    }
def admin_system():
    factor = 1073741824  # bytes per GiB
    vmem = psutil.virtual_memory()
    ram = {
        "percent": vmem.percent,
        "used": round(vmem.used / factor, 2),
        "total": round(vmem.total / factor, 2),
    }  # GB
    hdd = psutil.disk_usage(app.config['USERSPACE_FOLDER'])
    disk_usage = {
        "percent": round((hdd.used / hdd.total) * 100, 2),
        "used": round(hdd.used / factor, 2),
        "total": round(hdd.total / factor, 2),
    }  # GB
    gpus = []
    pynvml.nvmlInit()
    for i in range(pynvml.nvmlDeviceGetCount()):
        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
        resources = pynvml.nvmlDeviceGetUtilizationRates(handle)
        gpus.append({"id": i, "memory": resources.memory, "proc": resources.gpu})
    return render_template('system.admin.html.jinja2', page_name='admin_system',
                           page_title='System', ram=ram,
                           cpu=round(psutil.cpu_percent(), 2),
                           gpus=gpus, disk_usage=disk_usage)
def autoselect(gpu_target: List[int], min_memory: float) -> int:
    logging.info(f'GPU search space: {gpu_target}')
    nvmlInit()
    deviceCount = nvmlDeviceGetCount()
    memories = np.zeros((deviceCount, COUNT), dtype=np.float32)
    rates = np.zeros((deviceCount, COUNT), dtype=np.float32)
    for c in range(COUNT):
        for i in range(deviceCount):
            if i not in gpu_target:
                memories[i, c] = 0
                rates[i, c] = 100
            else:
                handle = nvmlDeviceGetHandleByIndex(i)
                memories[i, c] = nvmlDeviceGetMemoryInfo(handle).free / 1024**3  # GiB
                rates[i, c] = int(nvmlDeviceGetUtilizationRates(handle).gpu)
        time.sleep(INTERVAL)
    nvmlShutdown()
    memories = memories.mean(1)
    rates = rates.mean(1)
    # IDs of GPUs with enough free memory
    memory_enough_ids = np.where(memories > min_memory)[0]
    if len(memory_enough_ids) > 0:
        # GPU with minimum utilization
        gpuid = memory_enough_ids[np.argmin(rates[memory_enough_ids])]
        # if several GPUs tie on utilization, choose the one with the most free memory
        gpu_min_ids = np.where(rates[memory_enough_ids] <= rates[gpuid])[0]
        gpu_min_ids = memory_enough_ids[gpu_min_ids]
        gpuid = gpu_min_ids[np.argmax(memories[gpu_min_ids])]
        logging.info(f'Auto select GPU {gpuid}')
    else:
        raise MemoryError(str(memories))
    return int(gpuid)
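# Usage sketch (assumptions, not from the original source): autoselect above
# relies on module-level COUNT and INTERVAL sampling constants and on the NVML
# symbols being imported; plausible values and a typical call are shown here.
import logging
import os

COUNT = 5       # assumed: number of samples to average
INTERVAL = 0.2  # assumed: seconds between samples

logging.basicConfig(level=logging.INFO)
# pick among GPUs 0-3, requiring at least 4 GiB free on average
chosen = autoselect(gpu_target=[0, 1, 2, 3], min_memory=4.0)
os.environ['CUDA_VISIBLE_DEVICES'] = str(chosen)  # pin this process to it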
def cb():
    nonlocal last_time
    now = time.time()
    src_dict = {"time": [now * 1000]}
    gpu_tot = 0
    mem_tot = 0
    tx_tot = 0
    rx_tot = 0
    for i in range(ngpus):
        gpu = pynvml.nvmlDeviceGetUtilizationRates(gpu_handles[i]).gpu
        mem = pynvml.nvmlDeviceGetMemoryInfo(gpu_handles[i]).used
        tx = pynvml.nvmlDeviceGetPcieThroughput(
            gpu_handles[i], pynvml.NVML_PCIE_UTIL_TX_BYTES) * 1024
        rx = pynvml.nvmlDeviceGetPcieThroughput(
            gpu_handles[i], pynvml.NVML_PCIE_UTIL_RX_BYTES) * 1024
        gpu_tot += gpu
        mem_tot += mem / (1024 * 1024)
        rx_tot += rx
        tx_tot += tx
        src_dict["gpu-" + str(i)] = [gpu]
        src_dict["memory-" + str(i)] = [mem]
    src_dict["gpu-total"] = [gpu_tot / ngpus]
    src_dict["memory-total"] = [(mem_tot / gpu_mem_sum) * 100]
    src_dict["tx-total"] = [tx_tot]
    src_dict["rx-total"] = [rx_tot]
    source.stream(src_dict, 1000)
    last_time = now
def __query_util(handle):
    """Query information on the utilization of a GPU.

    Arguments:
        handle: NVML device handle.

    Returns:
        summaries (:obj:`dict`): Dictionary containing the utilization values
            for ['mem_util', 'gpu_util']. All values are given as integers in
            the range (0, 100).
    """
    # Query information on the GPU utilization.
    util = nvml.nvmlDeviceGetUtilizationRates(handle)
    summaries = dict()
    # Percent of time over the past sample period during which global (device)
    # memory was being read or written.
    summaries['mem_util'] = util.memory
    # Percent of time over the past sample period during which one or more
    # kernels was executing on the GPU.
    summaries['gpu_util'] = util.gpu
    return summaries
def _get_gpu_usage(gpu_count):
    import pynvml
    gpus = []
    for i in range(gpu_count):
        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
        try:
            util = pynvml.nvmlDeviceGetUtilizationRates(handle)
            memory = pynvml.nvmlDeviceGetMemoryInfo(handle)
            temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
            try:
                # power draw as a percentage of the enforced power limit
                power_usage = ((pynvml.nvmlDeviceGetPowerUsage(handle) / 1000.0)
                               / (pynvml.nvmlDeviceGetEnforcedPowerLimit(handle) / 1000.0)) * 100
            except pynvml.NVMLError as e:
                logger.error("Couldn't extract power usage due to NVML exception: {}".format(str(e)))
                power_usage = -9999
            gpus.append((handle, util.gpu, util.memory,
                         (memory.used / float(memory.total)) * 100, temp, power_usage))
        except pynvml.NVMLError as e:
            logger.error("Couldn't extract GPU usage information due to NVML exception: {}".format(str(e)))
            return None
    return gpus
def avg_gpu_info(measure_duration, print_info=False):
    """
    Input:
        measure_duration: int, seconds to sample over (one sample per second)
    Output:
        avg_free_memory: numpy.array[int], len=gpu_count, in MB
        avg_gpu_util: numpy.array[int], len=gpu_count, in percent
    """
    # Get average GPU status
    pynvml.nvmlInit()  # initialize NVML
    gpu_count = pynvml.nvmlDeviceGetCount()
    handles = [pynvml.nvmlDeviceGetHandleByIndex(i) for i in range(gpu_count)]
    avg_free_memory = [0.0] * gpu_count
    avg_gpu_util = [0.0] * gpu_count
    for _ in range(int(measure_duration)):
        for id, handle in enumerate(handles):
            avg_free_memory[id] += pynvml.nvmlDeviceGetMemoryInfo(handle).free / 1e6  # bytes -> MB
            avg_gpu_util[id] += pynvml.nvmlDeviceGetUtilizationRates(handle).gpu
        time.sleep(1)
    avg_free_memory = np.array(
        [int(memory / measure_duration) for memory in avg_free_memory])
    avg_gpu_util = np.array(
        [int(util / measure_duration) for util in avg_gpu_util])
    if print_info:
        present_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        logging.info(present_time)
        for gpu_id in range(gpu_count):
            gpu_info = 'GPU%d: gpu util:%d%% | free memory:%dMB' % (
                gpu_id, avg_gpu_util[gpu_id], avg_free_memory[gpu_id])
            logging.info(gpu_info)
    return avg_free_memory, avg_gpu_util
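# Usage sketch (hypothetical, not from the original source): avg_gpu_info
# lends itself to a simple device picker; the 5-second window and the 30%
# utilization threshold are assumed values.
import numpy as np

free_memory, gpu_util = avg_gpu_info(measure_duration=5, print_info=True)
# among GPUs under 30% average utilization, take the one with most free memory
candidates = np.where(gpu_util < 30)[0]
best = int(candidates[np.argmax(free_memory[candidates])]) if len(candidates) else None
print('chosen GPU:', best)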
def gpu_info(self):
    # pip install nvidia-ml-py3
    if len(self.gpu_ids) > 0 and torch.cuda.is_available():
        try:
            import pynvml
            pynvml.nvmlInit()
            self.config_dic['gpu_driver_version'] = pynvml.nvmlSystemGetDriverVersion()
            for gpu_id in self.gpu_ids:
                handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_id)
                gpu_id_name = "gpu%s" % gpu_id
                mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
                gpu_utilize = pynvml.nvmlDeviceGetUtilizationRates(handle)
                self.config_dic['%s_device_name' % gpu_id_name] = pynvml.nvmlDeviceGetName(handle)
                self.config_dic['%s_mem_total' % gpu_id_name] = gpu_mem_total = round(
                    mem_info.total / 1024**3, 2)
                self.config_dic['%s_mem_used' % gpu_id_name] = gpu_mem_used = round(
                    mem_info.used / 1024**3, 2)
                # self.config_dic['%s_mem_free' % gpu_id_name] = gpu_mem_free = mem_info.free // 1024 ** 2
                self.config_dic['%s_mem_percent' % gpu_id_name] = round(
                    (gpu_mem_used / gpu_mem_total) * 100, 1)
                self._set_dict_smooth('%s_utilize_gpu' % gpu_id_name, gpu_utilize.gpu, 0.8)
                # self.config_dic['%s_utilize_gpu' % gpu_id_name] = gpu_utilize.gpu
                # self.config_dic['%s_utilize_memory' % gpu_id_name] = gpu_utilize.memory
            pynvml.nvmlShutdown()
        except Exception as e:
            print(e)
def real_time():
    init_once()
    h = _pynvml_handles()
    return {
        "utilization": pynvml.nvmlDeviceGetUtilizationRates(h).gpu,
        "memory-used": pynvml.nvmlDeviceGetMemoryInfo(h).used,
    }
def load(self):
    if self.__has_gpu:
        total = 0.0
        for i in range(self.__ngpus):
            total += pynvml.nvmlDeviceGetUtilizationRates(self.__handle[i]).gpu
        return (total / self.__ngpus) / 100.0
    return 0.0
def autoset_nvgpu(metric="memory", k=1):
    """Automatically select and expose NVIDIA GPU devices.

    Args:
        metric (str): memory/utilization; select the GPUs with min(metric)
        k (int): number of devices to select
    """
    pynvml.nvmlInit()
    gpunum = pynvml.nvmlDeviceGetCount()
    assert k <= gpunum
    metric_list = []
    for idx in range(gpunum):
        handle = pynvml.nvmlDeviceGetHandleByIndex(idx)
        if metric in ["util", "utilization"]:
            # compare the numeric .gpu field, not the raw NVML struct
            util_rate = pynvml.nvmlDeviceGetUtilizationRates(handle).gpu
            metric_list.append((util_rate, idx))
        else:
            mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
            mem_use_rate = 1.0 - mem_info.free / mem_info.total
            metric_list.append((mem_use_rate, idx))
    # sort the devices by ascending metric
    metric_list = sorted(metric_list, key=lambda x: x[0])
    selected_idx = [str(x[1]) for x in metric_list[:k]]
    # set the visible devices
    os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(selected_idx)
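# Usage sketch (illustrative, not from the original source): because
# autoset_nvgpu works by exporting CUDA_VISIBLE_DEVICES, it should run before
# the framework initializes CUDA; the torch lines below are an assumption.
import os

autoset_nvgpu(metric="memory", k=1)  # pick the GPU with the lowest memory use
print(os.environ["CUDA_VISIBLE_DEVICES"])

import torch  # CUDA is initialized lazily, after the mask is already set

x = torch.zeros(1, device="cuda")  # lands on the selected physical GPU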
def device_status(device_index):
    handle = nv.nvmlDeviceGetHandleByIndex(device_index)
    device_name = nv.nvmlDeviceGetName(handle)
    device_name = device_name.decode('UTF-8')
    nv_procs = nv.nvmlDeviceGetComputeRunningProcesses(handle)
    utilization = nv.nvmlDeviceGetUtilizationRates(handle).gpu
    clock_mhz = nv.nvmlDeviceGetClockInfo(handle, nv.NVML_CLOCK_SM)
    temperature = nv.nvmlDeviceGetTemperature(handle, nv.NVML_TEMPERATURE_GPU)
    pids = []
    users = []
    dates = []
    cmd = None
    for nv_proc in nv_procs:
        pid = nv_proc.pid
        pids.append(pid)
        try:
            proc = psutil.Process(pid)
            users.append(proc.username())
            dates.append(proc.create_time())
            if cmd is None:
                cmd = parse_cmd_roughly(proc.cmdline())
        except psutil.NoSuchProcess:
            users.append('?')
    return {
        'type': device_name,
        'is_available': len(pids) == 0,
        'pids': ','.join([str(pid) for pid in pids]),
        'users': ','.join(users),
        'running_since': arrow.get(min(dates)).humanize() if len(dates) > 0 else None,
        'utilization': utilization,
        'clock_mhz': clock_mhz,
        'temperature': temperature,
        'cmd': cmd,
    }
def utilization(device: Optional[Union[Device, int]] = None) -> int:
    r"""Returns the percent of time over the past sample period during which
    one or more kernels was executing on the GPU as given by `nvidia-smi`.

    Args:
        device (torch.device or int, optional): selected device. Returns
            statistic for the current device, given by
            :func:`~torch.cuda.current_device`, if :attr:`device` is ``None``
            (default).

    Warning: Each sample period may be between 1 second and 1/6 second,
    depending on the product being queried.
    """
    try:
        import pynvml  # type: ignore[import]
    except ModuleNotFoundError:
        raise ModuleNotFoundError("pynvml module not found, please install pynvml")
    from pynvml import NVMLError_DriverNotLoaded
    try:
        pynvml.nvmlInit()
    except NVMLError_DriverNotLoaded:
        raise RuntimeError("cuda driver can't be loaded, is cuda enabled?")
    device = _get_device_index(device, optional=True)
    handle = pynvml.nvmlDeviceGetHandleByIndex(device)
    return pynvml.nvmlDeviceGetUtilizationRates(handle).gpu
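# Usage sketch: the function above matches torch.cuda.utilization, so the
# public PyTorch API can be exercised the same way (assumes PyTorch with CUDA).
import torch

if torch.cuda.is_available():
    print("GPU 0 busy %d%% over the last sample period" % torch.cuda.utilization(0))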
def get_gpu_utilization(gpu_idx):
    try:
        handle = nv.nvmlDeviceGetHandleByIndex(gpu_idx)
        util = nv.nvmlDeviceGetUtilizationRates(handle)
    except nv.NVMLError as err:
        util = err
    return util
def _get_gpu_status(self, used_gpu_indexes):
    """Get the status of the currently used GPUs.

    Args:
        used_gpu_indexes: (list)

    Returns:
        gpu_status: (list)
    """
    gpu_status = list()
    nvmlInit()
    for index in used_gpu_indexes:
        handle = nvmlDeviceGetHandleByIndex(index)
        utilization_rates = nvmlDeviceGetUtilizationRates(handle)
        mem_info = nvmlDeviceGetMemoryInfo(handle)
        mem_usage = mem_info.used / mem_info.total
        status = {
            "index": index,
            "gpu_util": utilization_rates.gpu,
            "mem_usage": mem_usage,
        }
        gpu_status.append(status)
    nvmlShutdown()
    return gpu_status
def check(self, instance):
    pynvml.nvmlInit()

    msg_list = []
    try:
        deviceCount = pynvml.nvmlDeviceGetCount()
    except pynvml.NVMLError:
        deviceCount = 0
    for device_id in range(deviceCount):
        handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
        name = pynvml.nvmlDeviceGetName(handle)
        tags = dict(name="{}-{}".format(name, device_id))
        d_tags = self._dict2list(tags)
        # temperature info
        try:
            temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
            self.gauge('nvml.temp.', temp, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetTemperature:{}'.format(err))
        # memory info
        try:
            mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
            self.gauge('nvml.mem.total', mem.total, tags=d_tags)
            self.gauge('nvml.mem.used', mem.used, tags=d_tags)
            self.gauge('nvml.mem.free', mem.free, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetMemoryInfo:{}'.format(err))
        # utilization info
        try:
            util_rate = pynvml.nvmlDeviceGetUtilizationRates(handle)
            self.gauge('nvml.util.gpu', util_rate.gpu, tags=d_tags)
            self.gauge('nvml.util.memory', util_rate.memory, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetUtilizationRates:{}'.format(err))
        # compute running processes
        try:
            cps = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
            for ps in cps:
                p_tags = tags.copy()
                p_tags['pid'] = ps.pid
                p_tags['name'] = psutil.Process(ps.pid).name()
                p_tags = self._dict2list(p_tags)
                self.gauge('nvml.process.used_gpu_memory', ps.usedGpuMemory, tags=p_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetComputeRunningProcesses:{}'.format(err))
    if msg_list:
        status = AgentCheck.CRITICAL
        msg = u','.join(msg_list)
    else:
        status = AgentCheck.OK
        msg = u'Ok'
    pynvml.nvmlShutdown()

    self.service_check('nvml.check', status, message=msg)
def query_device(index):
    handle = pynvml.nvmlDeviceGetHandleByIndex(index)
    return {
        'index': index,
        'name': pynvml.nvmlDeviceGetName(handle).decode(),
        'utilization': pynvml.nvmlDeviceGetUtilizationRates(handle).gpu,
        'uuid': pynvml.nvmlDeviceGetUUID(handle).decode(),
    }
def _get_gpu_usage(gpu_id):
    handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_id)
    gpu_usage = pynvml.nvmlDeviceGetUtilizationRates(handle)
    gpu_mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
    gpu_mem_usage = gpu_mem.used / gpu_mem.total if gpu_mem.total else 0
    return gpu_usage.gpu, gpu_mem_usage
def get_gpu_util(handle):
    # NVML errors propagate to the caller
    util = pynvml.nvmlDeviceGetUtilizationRates(handle)
    return util.gpu
def mem_utilization_for(device_handle):
    """Percent of time over the past sample period during which global
    (device) memory was being read or written.
    """
    try:
        return pynvml.nvmlDeviceGetUtilizationRates(device_handle).memory
    except pynvml.NVMLError:
        return None
def utilization_for(device_handle):
    """Get GPU device utilization in percent.

    Percent of time over the past sample period during which one or more
    kernels was executing on the GPU.
    """
    try:
        return pynvml.nvmlDeviceGetUtilizationRates(device_handle).gpu
    except pynvml.NVMLError:
        return None
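# Usage sketch (not from the original source): the two getters above degrade
# to None on NVML errors, so they are safe to poll; the one-second cadence and
# device index 0 are assumptions.
import time
import pynvml

pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)
for _ in range(3):
    print('gpu:', utilization_for(handle), '% mem:', mem_utilization_for(handle), '%')
    time.sleep(1.0)
pynvml.nvmlShutdown()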
def query_gpu(handle: int) -> Dict:
    memory = pynvml.nvmlDeviceGetMemoryInfo(handle)  # in bytes
    utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
    return {
        "gpu_{}_memory_free".format(handle): int(memory.free),
        "gpu_{}_memory_used".format(handle): int(memory.used),
        "gpu_{}_utilization".format(handle): utilization.gpu,
    }
def get_gpu_util(gpu_id=0):
    nv.nvmlInit()
    handle = nv.nvmlDeviceGetHandleByIndex(gpu_id)
    print('AWS DEBUG nvmlDeviceGetHandleByIndex', handle)
    utilization = nv.nvmlDeviceGetUtilizationRates(handle).gpu
    print('AWS DEBUG nvmlDeviceGetUtilizationRates.gpu', utilization)
    return utilization
def getFreeRatio(id):
    handle = pynvml.nvmlDeviceGetHandleByIndex(id)
    info = pynvml.nvmlDeviceGetMemoryInfo(handle)
    # print("Memory Total: ", info.total / (1024 * 1024))
    # print("Memory Free: ", info.free / (1024 * 1024))
    # print("Memory Used: ", info.used / (1024 * 1024))
    use = pynvml.nvmlDeviceGetUtilizationRates(handle)
    # average of the kernel-busy and memory-busy percentages
    ratio = 0.5 * (float(use.gpu) + float(use.memory))
    return ratio
def load(self) -> float:
    if self.__has_gpu:
        total_load = 0.0
        for i in range(self.__ngpus):
            with contextlib.suppress(Exception):
                total_load += pynvml.nvmlDeviceGetUtilizationRates(self.__handle[i]).gpu
        return (total_load / self.__ngpus) / 100.0
    return 0.0
def _crawl_in_system(self):
    '''
    nvidia-smi returns the following sections:
    MEMORY, UTILIZATION, ECC, TEMPERATURE, POWER, CLOCK, COMPUTE, PIDS,
    PERFORMANCE, SUPPORTED_CLOCKS, PAGE_RETIREMENT, ACCOUNTING

    Currently, the following are requested based on dlaas requirements:
    utilization.gpu, utilization.memory,
    memory.total, memory.free, memory.used

    nvidia-smi --query-gpu=utilization.gpu,utilization.memory,\
    memory.total,memory.free,memory.used --format=csv,noheader,nounits
    '''
    if self._init_nvml() == -1:
        return
    self.inspect_arr = exec_dockerps()
    num_gpus = pynvml.nvmlDeviceGetCount()
    for gpuid in range(num_gpus):
        gpuhandle = pynvml.nvmlDeviceGetHandleByIndex(gpuid)
        temperature = pynvml.nvmlDeviceGetTemperature(
            gpuhandle, pynvml.NVML_TEMPERATURE_GPU)
        memory = pynvml.nvmlDeviceGetMemoryInfo(gpuhandle)
        mem_total = memory.total / 1024 / 1024  # MiB
        mem_used = memory.used / 1024 / 1024
        mem_free = memory.free / 1024 / 1024
        power_draw = pynvml.nvmlDeviceGetPowerUsage(gpuhandle) / 1000  # W
        power_limit = pynvml.nvmlDeviceGetEnforcedPowerLimit(gpuhandle) / 1000
        util = pynvml.nvmlDeviceGetUtilizationRates(gpuhandle)
        entry = {
            'utilization': {'gpu': util.gpu, 'memory': util.memory},
            'memory': {'total': mem_total, 'free': mem_free, 'used': mem_used},
            'temperature': temperature,
            'power': {'draw': power_draw, 'limit': power_limit},
        }
        key = self._get_feature_key(gpuhandle, gpuid)
        if gpuid == num_gpus - 1:
            self._shutdown_nvml()
        yield (key, entry, 'gpu')
def sample_utilization_rates(self, handle: DeviceHandle) -> DeviceUtilizationRates:
    memory = nvmlDeviceGetMemoryInfo(handle)
    total = memory.total / MiB
    used = memory.used / MiB
    utilization = nvmlDeviceGetUtilizationRates(handle)
    utilization_gpu = utilization.gpu
    utilization_memory = utilization.memory
    self.log_debug(f"Sampled utilization rates: {used:.2f} MiB, "
                   f"{utilization_gpu}%, {utilization_memory}%")
    return DeviceUtilizationRates(total, used, utilization_gpu, utilization_memory)
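# Sketch of the names the snippet above assumes (DeviceHandle,
# DeviceUtilizationRates, and MiB are reconstructions, not the original
# definitions).
from dataclasses import dataclass
from typing import Any

DeviceHandle = Any   # assumed alias for the opaque pynvml handle
MiB = 1024 * 1024    # assumed constant: bytes per mebibyte

@dataclass
class DeviceUtilizationRates:
    total: float              # total device memory, MiB
    used: float               # used device memory, MiB
    utilization_gpu: int      # kernel-busy percentage
    utilization_memory: int   # memory-controller-busy percentage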
def __getitem__(self, item: int):
    if item >= len(self):
        raise IndexError
    h = nv.nvmlDeviceGetHandleByIndex(item)
    idx = nv.nvmlDeviceGetIndex(h)
    mem = nv.nvmlDeviceGetMemoryInfo(h)
    uti = nv.nvmlDeviceGetUtilizationRates(h)
    return idx, dict(free=Bytes(mem.free), used=Bytes(mem.used), util=Percent(uti.gpu))
def _log_gpu_utilization(self):
    gpu_utilizations = {}
    # Get current GPU utilizations in percent
    for gpu_name, gpu_hdl in self.gpu_handles.items():
        gpu_percentage = nvmlDeviceGetUtilizationRates(handle=gpu_hdl).gpu
        gpu_utilizations[gpu_name] = gpu_percentage
    # log GPU utilization to tensorboard
    self._tb_logger.add_scalars(main_tag='GPUs_utilization_percentage',
                                tag_scalar_dict=gpu_utilizations,
                                global_step=time() - self._start_time)
def collect_via_pynvml(self, stats_config):
    """
    Use the pynvml python bindings to collect metrics
    :param stats_config:
    :return:
    """
    try:
        NVML_TEMPERATURE_GPU = 0
        pynvml.nvmlInit()
        device_count = pynvml.nvmlDeviceGetCount()
        for device_index in range(device_count):
            handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)
            memoryInfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
            utilizationRates = pynvml.nvmlDeviceGetUtilizationRates(handle)
            metrics = {
                'memory.total': memoryInfo.total / 1024 / 1024,
                'memory.used': memoryInfo.used / 1024 / 1024,
                'memory.free': memoryInfo.free / 1024 / 1024,
                'utilization.gpu': utilizationRates.gpu,
                'utilization.memory': utilizationRates.memory,
                'temperature.gpu': pynvml.nvmlDeviceGetTemperature(handle, NVML_TEMPERATURE_GPU),
            }
            for stat_name in stats_config[1:]:
                metric = metrics.get(stat_name)
                if metric:
                    metric_name = 'gpu_{index}.{stat_name}'.format(
                        index=str(device_index), stat_name=stat_name)
                    self.publish(metric_name, metric)
    finally:
        pynvml.nvmlShutdown()
def get_gpu_info(handle):
    """Get one GPU information specified by nvml handle"""

    def get_process_info(nv_process):
        """Get the process information of specific pid"""
        process = {}
        ps_process = psutil.Process(pid=nv_process.pid)
        process['username'] = ps_process.username()
        # cmdline returns full path;
        # as in `ps -o comm`, get short cmdnames.
        _cmdline = ps_process.cmdline()
        if not _cmdline:
            # sometimes, zombie or unknown (e.g. [kworker/8:2H])
            process['command'] = '?'
        else:
            process['command'] = os.path.basename(_cmdline[0])
        # Bytes to MBytes
        process['gpu_memory_usage'] = nv_process.usedGpuMemory // MB
        process['pid'] = nv_process.pid
        return process

    name = _decode(N.nvmlDeviceGetName(handle))
    uuid = _decode(N.nvmlDeviceGetUUID(handle))

    try:
        temperature = N.nvmlDeviceGetTemperature(handle, N.NVML_TEMPERATURE_GPU)
    except N.NVMLError:
        temperature = None  # Not supported

    try:
        memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
    except N.NVMLError:
        memory = None  # Not supported

    try:
        utilization = N.nvmlDeviceGetUtilizationRates(handle)
    except N.NVMLError:
        utilization = None  # Not supported

    try:
        power = N.nvmlDeviceGetPowerUsage(handle)
    except N.NVMLError:
        power = None

    try:
        power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
    except N.NVMLError:
        power_limit = None

    try:
        nv_comp_processes = N.nvmlDeviceGetComputeRunningProcesses(handle)
    except N.NVMLError:
        nv_comp_processes = None  # Not supported

    try:
        nv_graphics_processes = N.nvmlDeviceGetGraphicsRunningProcesses(handle)
    except N.NVMLError:
        nv_graphics_processes = None  # Not supported

    if nv_comp_processes is None and nv_graphics_processes is None:
        processes = None
    else:
        processes = []
        nv_comp_processes = nv_comp_processes or []
        nv_graphics_processes = nv_graphics_processes or []
        for nv_process in nv_comp_processes + nv_graphics_processes:
            # TODO: could be more information such as system memory
            # usage, CPU percentage, create time etc.
            try:
                process = get_process_info(nv_process)
                processes.append(process)
            except psutil.NoSuchProcess:
                # TODO: add some reminder for NVML broken context
                # e.g. nvidia-smi reset or reboot the system
                pass

    index = N.nvmlDeviceGetIndex(handle)
    gpu_info = {
        'index': index,
        'uuid': uuid,
        'name': name,
        'temperature.gpu': temperature,
        'utilization.gpu': utilization.gpu if utilization else None,
        'power.draw': power // 1000 if power is not None else None,
        'enforced.power.limit': power_limit // 1000 if power_limit is not None else None,
        # Convert bytes into MBytes
        'memory.used': memory.used // MB if memory else None,
        'memory.total': memory.total // MB if memory else None,
        'processes': processes,
    }
    return gpu_info
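# Usage sketch (illustrative): get_gpu_info above is naturally iterated over
# all devices; the MB constant and _decode helper are assumed to exist in the
# surrounding module, as the snippet implies.
import pynvml as N

N.nvmlInit()
try:
    for i in range(N.nvmlDeviceGetCount()):
        info = get_gpu_info(N.nvmlDeviceGetHandleByIndex(i))
        print(info['index'], info['name'], info['utilization.gpu'], info['memory.used'])
finally:
    N.nvmlShutdown()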
def step(self):
    valuesDict = {}
    valuesDict['table'] = self._tableName
    cpu = valuesDict['cpu'] = psutil.cpu_percent(interval=0)
    mem = valuesDict['mem'] = psutil.virtual_memory().percent
    swap = valuesDict['swap'] = psutil.swap_memory().percent
    # some code examples:
    # https://github.com/ngi644/datadog_nvml/blob/master/nvml.py
    if self.doGpu:
        for i in self.gpusToUse:
            try:
                handle = nvmlDeviceGetHandleByIndex(i)
                memInfo = nvmlDeviceGetMemoryInfo(handle)
                valuesDict["gpuMem_%d" % i] = \
                    float(memInfo.used) * 100. / float(memInfo.total)
                util = nvmlDeviceGetUtilizationRates(handle)
                valuesDict["gpuUse_%d" % i] = util.gpu
                temp = nvmlDeviceGetTemperature(handle, NVML_TEMPERATURE_GPU)
                valuesDict["gpuTem_%d" % i] = temp
            except NVMLError as err:
                handle = nvmlDeviceGetHandleByIndex(i)
                msg = "Device %d -> %s not supported\n" \
                      "Remove device %d from FORM" % \
                      (i, nvmlDeviceGetName(handle), i)
                errorWindow(None, msg)
    if self.doNetwork:
        try:
            # measure a short interval
            pnic_before = psutil.net_io_counters(pernic=True)[self.nif]
            time.sleep(self.samplingTime)  # sec
            pnic_after = psutil.net_io_counters(pernic=True)[self.nif]
            bytes_sent = pnic_after.bytes_sent - pnic_before.bytes_sent
            bytes_recv = pnic_after.bytes_recv - pnic_before.bytes_recv
            valuesDict["%s_send" % self.nif] = \
                bytes_sent * self.samplingTime / 1048576
            valuesDict["%s_recv" % self.nif] = \
                bytes_recv * self.samplingTime / 1048576
        except Exception:
            msg = "cannot get information of network interface %s" % self.nif
    if self.doDiskIO:
        try:
            # measure a short interval
            disk_before = psutil.disk_io_counters(perdisk=False)
            time.sleep(self.samplingTime)  # sec
            disk_after = psutil.disk_io_counters(perdisk=False)
            bytes_read = disk_after.read_bytes - disk_before.read_bytes
            bytes_write = disk_after.write_bytes - disk_before.write_bytes
            valuesDict["disk_read"] = \
                self.samplingTime * bytes_read / self.mega
            valuesDict["disk_write"] = \
                self.samplingTime * bytes_write / self.mega
        except Exception:
            msg = "cannot get information of disk usage "

    if self.cpuAlert < 100 and cpu > self.cpuAlert:
        self.warning("CPU allocation =%f." % cpu)
        self.cpuAlert = cpu

    if self.memAlert < 100 and mem > self.memAlert:  # mem is already a percent
        self.warning("Memory allocation =%f." % mem)
        self.memAlert = mem

    if self.swapAlert < 100 and swap > self.swapAlert:  # swap is already a percent
        self.warning("SWAP allocation =%f." % swap)
        self.swapAlert = swap

    sqlCommand = "INSERT INTO %(table)s ("
    for label in self.labelList:
        sqlCommand += "%s, " % label
    # remove last comma
    sqlCommand = sqlCommand[:-2]
    sqlCommand += ") VALUES("
    for label in self.labelList:
        sqlCommand += "%" + "(%s)f, " % label
    # remove last comma
    sqlCommand = sqlCommand[:-2]
    sqlCommand += ");"

    sql = sqlCommand % valuesDict

    try:
        self.cur.execute(sql)
    except Exception as e:
        print("ERROR: saving one data point (monitor). I continue")

    # Return finished = True if all protocols have finished
    finished = []
    for prot in self.protocols:
        updatedProt = getUpdatedProtocol(prot)
        finished.append(updatedProt.getStatus() != STATUS_RUNNING)

    return all(finished)
def info_refresh(self):
    try:
        stat = open("/proc/stat")
        self.statlines = stat.read().splitlines()[1:-1]
        stat.close()
    except IOError:
        print("Problem opening /proc/stat, exiting..")
        pynvml.nvmlShutdown()
        quit()

    for i in range(self.corecount):
        for j in self.statlines[i].split()[1:]:  # remove cpu#
            self.total[i] += int(j)
        self.idle[i] = int(self.statlines[i].split()[4])

    for i in range(self.corecount):
        if (self.total[i] - self.prev_total[i]) == 0:
            self.prev_idle[i] = self.idle[i]
            self.prev_total[i] = self.total[i]
            break
        self.cpu_prog_bars[i].set_fraction(
            1 - ((self.idle[i] - self.prev_idle[i]) /
                 (self.total[i] - self.prev_total[i])))
        self.prev_idle[i] = self.idle[i]
        self.prev_total[i] = self.total[i]
        self.idle[i] = 0
        self.total[i] = 0

    for i in range(self.deviceCount):
        util = pynvml.nvmlDeviceGetUtilizationRates(self.gpu_handles[i])
        temp = pynvml.nvmlDeviceGetTemperature(self.gpu_handles[i],
                                               pynvml.NVML_TEMPERATURE_GPU)
        memInfo = pynvml.nvmlDeviceGetMemoryInfo(self.gpu_handles[i])
        (encoder_util, sPeriod) = pynvml.nvmlDeviceGetEncoderUtilization(self.gpu_handles[i])
        (decoder_util, sPeriod) = pynvml.nvmlDeviceGetDecoderUtilization(self.gpu_handles[i])
        mem_total = memInfo.total / 1024 / 1024
        mem_used = memInfo.used / 1024 / 1024

        self.gpu_prog_bars[i*6].set_text("GPU: %d%%" % util.gpu)
        self.gpu_prog_bars[i*6].set_fraction(util.gpu / 100)
        self.util_history.append(util.gpu)
        self.util_graph.queue_draw()
        self.temp_history.append(temp)
        self.temp_graph.queue_draw()
        self.gpu_prog_bars[i*6 + 1].set_text("Memory Utilization: %d%%" % util.memory)
        self.gpu_prog_bars[i*6 + 1].set_fraction(util.memory / 100)
        self.gpu_prog_bars[i*6 + 4].set_text("Encoder: %d%%" % encoder_util)
        self.gpu_prog_bars[i*6 + 5].set_text("Decoder: %d%%" % decoder_util)
        self.gpu_prog_bars[i*6 + 4].set_fraction(encoder_util / 100)
        self.gpu_prog_bars[i*6 + 5].set_fraction(decoder_util / 100)
        self.gpu_prog_bars[i*6 + 2].set_text("Memory Usage: %d MiB/%d MiB" % (mem_used, mem_total))
        self.gpu_prog_bars[i*6 + 2].set_fraction(mem_used / mem_total)
        self.gpu_prog_bars[i*6 + 3].set_text("Temperature: %d °C" % temp)
        # clamp the temperature fraction to [0, 1]
        if temp > 100:
            temp = 100
        elif temp < 0:
            temp = 0
        self.gpu_prog_bars[i*6 + 3].set_fraction(temp / 100)

    # --proc--
    procs = pynvml.nvmlDeviceGetGraphicsRunningProcesses(self.gpu_handles[0])
    proc_liststore = Gtk.ListStore(int, str, int)
    for p in procs:
        pid = p.pid
        try:
            path = pynvml.nvmlSystemGetProcessName(p.pid).decode('utf-8')
        except Exception:
            self.exit()
        if p.usedGpuMemory is None:
            mem = 0
        else:
            mem = p.usedGpuMemory / 1024 / 1024
        proc_liststore.append([pid, path, mem])
    self.tree.set_model(proc_liststore)
    return True
def get_proc(device_handle):
    """Get GPU device utilization in percent."""
    try:
        return pynvml.nvmlDeviceGetUtilizationRates(device_handle).gpu
    except pynvml.NVMLError:
        return None
def check(self, instance):
    pynvml.nvmlInit()

    msg_list = []
    try:
        deviceCount = pynvml.nvmlDeviceGetCount()
    except pynvml.NVMLError:
        deviceCount = 0
    for device_id in range(deviceCount):
        handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
        name = pynvml.nvmlDeviceGetName(handle)
        tags = dict(name="{}-{}".format(name, device_id))
        d_tags = self._dict2list(tags)
        # temperature info
        try:
            temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
            self.gauge('nvml.temp.', temp, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetTemperature:{}'.format(err))
        # memory info
        try:
            mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
            self.gauge('nvml.mem.total', mem.total, tags=d_tags)
            self.gauge('nvml.mem.used', mem.used, tags=d_tags)
            self.gauge('nvml.mem.free', mem.free, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetMemoryInfo:{}'.format(err))
        # utilization GPU/memory info
        try:
            util_rate = pynvml.nvmlDeviceGetUtilizationRates(handle)
            self.gauge('nvml.util.gpu', util_rate.gpu, tags=d_tags)
            self.gauge('nvml.util.memory', util_rate.memory, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetUtilizationRates:{}'.format(err))
        # utilization encoder info
        try:
            util_encoder = pynvml.nvmlDeviceGetEncoderUtilization(handle)
            self.log.info('nvml.util.encoder %s' % int(util_encoder[0]))
            self.gauge('nvml.util.encoder', int(util_encoder[0]), tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetEncoderUtilization:{}'.format(err))
        # utilization decoder info
        try:
            util_decoder = pynvml.nvmlDeviceGetDecoderUtilization(handle)
            self.log.info('nvml.util.decoder %s' % int(util_decoder[0]))
            self.gauge('nvml.util.decoder', int(util_decoder[0]), tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetDecoderUtilization:{}'.format(err))
        # compute running processes
        try:
            cps = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
            for ps in cps:
                p_tags = tags.copy()
                p_tags['pid'] = ps.pid
                p_tags['name'] = psutil.Process(ps.pid).name()
                p_tags = self._dict2list(p_tags)
                self.gauge('nvml.process.used_gpu_memory', ps.usedGpuMemory, tags=p_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetComputeRunningProcesses:{}'.format(err))
    if msg_list:
        status = AgentCheck.CRITICAL
        msg = u','.join(msg_list)
    else:
        status = AgentCheck.OK
        msg = u'Ok'
    pynvml.nvmlShutdown()

    self.service_check('nvml.check', status, message=msg)
def getFreeRatio(id):
    handle = pynvml.nvmlDeviceGetHandleByIndex(id)
    use = pynvml.nvmlDeviceGetUtilizationRates(handle)
    # average of the kernel-busy and memory-busy percentages
    ratio = 0.5 * (float(use.gpu) + float(use.memory))
    return ratio