def _get_gpu_status(self, used_gpu_indexes):
    """
    Get the status of the currently used GPUs.

    Args:
        used_gpu_indexes: (list)

    Returns:
        gpu_status: (list)
    """
    gpu_status = list()
    nvmlInit()
    for index in used_gpu_indexes:
        handle = nvmlDeviceGetHandleByIndex(index)
        utilization_rates = nvmlDeviceGetUtilizationRates(handle)
        mem_info = nvmlDeviceGetMemoryInfo(handle)
        mem_usage = mem_info.used / mem_info.total
        status = {
            "index": index,
            "gpu_util": utilization_rates.gpu,
            "mem_usage": mem_usage
        }
        gpu_status.append(status)
    nvmlShutdown()
    return gpu_status
def gpu_info() -> dict:
    info = dict()
    try:
        nvmlInit()
    except NVMLError:
        info['no-gpu'] = 'No Nvidia GPU detected'
        return info
    device_count = nvmlDeviceGetCount()
    info['driver_version'] = nvmlSystemGetDriverVersion().decode()
    info['device_count'] = device_count
    info['device'] = dict()
    for i in range(device_count):
        handle = nvmlDeviceGetHandleByIndex(i)
        memory = nvmlDeviceGetMemoryInfo(handle)
        info['device'][i] = dict()
        info['device'][i]['name'] = str(nvmlDeviceGetName(handle))
        info['device'][i]['memory'] = dict()
        info['device'][i]['memory']['total'] = str(size_in_gb(memory.total))
    nvmlShutdown()
    return info
def gpu_info(self):
    # pip install nvidia-ml-py3
    if len(self.gpu_ids) > 0 and torch.cuda.is_available():
        try:
            import pynvml
            pynvml.nvmlInit()
            self.config_dic['gpu_driver_version'] = pynvml.nvmlSystemGetDriverVersion()
            for gpu_id in self.gpu_ids:
                handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_id)
                gpu_id_name = "gpu%s" % gpu_id
                mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
                gpu_utilize = pynvml.nvmlDeviceGetUtilizationRates(handle)
                self.config_dic['%s_device_name' % gpu_id_name] = pynvml.nvmlDeviceGetName(handle)
                self.config_dic['%s_mem_total' % gpu_id_name] = gpu_mem_total = round(mem_info.total / 1024 ** 3, 2)
                self.config_dic['%s_mem_used' % gpu_id_name] = gpu_mem_used = round(mem_info.used / 1024 ** 3, 2)
                # self.config_dic['%s_mem_free' % gpu_id_name] = gpu_mem_free = mem_info.free // 1024 ** 2
                self.config_dic['%s_mem_percent' % gpu_id_name] = round((gpu_mem_used / gpu_mem_total) * 100, 1)
                self._set_dict_smooth('%s_utilize_gpu' % gpu_id_name, gpu_utilize.gpu, 0.8)
                # self.config_dic['%s_utilize_gpu' % gpu_id_name] = gpu_utilize.gpu
                # self.config_dic['%s_utilize_memory' % gpu_id_name] = gpu_utilize.memory
            pynvml.nvmlShutdown()
        except Exception as e:
            print(e)
def auto_select_gpu():
    """Select gpu which has largest free memory"""
    if HAS_NVML:
        pynvml.nvmlInit()
        deviceCount = pynvml.nvmlDeviceGetCount()
        largest_free_mem = 0
        largest_free_idx = 0
        for i in range(deviceCount):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            info = pynvml.nvmlDeviceGetMemoryInfo(handle)
            if info.free > largest_free_mem:
                largest_free_mem = info.free
                largest_free_idx = i
        pynvml.nvmlShutdown()
        largest_free_mem = largest_free_mem / 1024. / 1024.  # Convert to MB
        idx_to_gpu_id = {}
        for i in range(deviceCount):
            idx_to_gpu_id[i] = '{}'.format(i)
        gpu_id = idx_to_gpu_id[largest_free_idx]
        logging.info('Using largest free memory GPU {} with free memory {}MB'.format(gpu_id, largest_free_mem))
        return gpu_id
    else:
        logging.info('nvidia-ml-py is not installed, automatically select gpu is disabled!')
        return '0'
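# Usage sketch (not part of the original snippet; HAS_NVML is assumed to come from a
# guarded `import pynvml` elsewhere in that module): the returned id string can be used
# to pin the process to the chosen device, as long as this runs before any CUDA context
# is created.
import os

if __name__ == "__main__":
    os.environ["CUDA_VISIBLE_DEVICES"] = auto_select_gpu()  # must be set before CUDA init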
def main():
    parser = argparse.ArgumentParser(description="collect GPU device memory usage")
    parser.add_argument("-g", type=int, default=1, help="number of gpu devices")
    parser.add_argument("-n", type=float, default=1, help="metrics rate")
    args = parser.parse_args()
    pynvml.nvmlInit()
    n_gpus = args.g
    devices = [
        Device(pynvml.nvmlDeviceGetHandleByIndex(i)) for i in range(n_gpus)
    ]
    running = True
    while running:
        time.sleep(args.n)
        running = False
        for device in devices:
            running |= device.update()
    pynvml.nvmlShutdown()
    for i, device in enumerate(devices):
        max_mem_usage_mbytes = device.max_mem_usage / 1024 / 1024
        print(f"gpu{i} max memory usage: {max_mem_usage_mbytes:.2f}M")
def shutdown(self):
    """ Shutdown pynvml """
    if self.initialized:
        self.handles = None
        if not IS_MACOS:
            pynvml.nvmlShutdown()
        self.initialized = False
def autoselect(gpu_target: List[int], min_memory: float) -> int:
    logging.info(f'GPU search space: {gpu_target}')
    nvmlInit()
    deviceCount = nvmlDeviceGetCount()
    memories = np.zeros((deviceCount, COUNT), dtype=np.float32)
    rates = np.zeros((deviceCount, COUNT), dtype=np.float32)
    for c in range(COUNT):
        for i in range(deviceCount):
            if i not in gpu_target:
                memories[i, c] = 0
                rates[i, c] = 100
            else:
                handle = nvmlDeviceGetHandleByIndex(i)
                memories[i, c] = nvmlDeviceGetMemoryInfo(handle).free / 1024 ** 3
                rates[i, c] = int(nvmlDeviceGetUtilizationRates(handle).gpu)
        time.sleep(INTERVAL)
    nvmlShutdown()
    memories = memories.mean(1)
    rates = rates.mean(1)
    # enough memory GPU ids
    memory_enough_ids = np.where(memories > min_memory)[0]
    if len(memory_enough_ids) > 0:
        # min util GPU
        gpuid = memory_enough_ids[np.argmin(rates[memory_enough_ids])]
        # if multi GPUs' util are the same, choose one that has the most memory
        gpu_min_ids = np.where(rates[memory_enough_ids] <= rates[gpuid])[0]
        gpu_min_ids = memory_enough_ids[gpu_min_ids]
        gpuid = gpu_min_ids[np.argmax(memories[gpu_min_ids])]
        logging.info(f'Auto select GPU {gpuid}')
    else:
        raise MemoryError(str(memories))
    return int(gpuid)
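# Context sketch (assumptions, not taken from the snippet's repo): COUNT and INTERVAL are
# module-level sampling constants that autoselect() reads -- e.g. several samples a few
# hundred milliseconds apart -- and the returned index is then used to pin the process.
COUNT = 5        # assumed number of utilization/memory samples to average
INTERVAL = 0.2   # assumed seconds between samples

# chosen = autoselect(gpu_target=[0, 1], min_memory=4.0)  # wants >= 4 GiB free on average
# torch.cuda.set_device(chosen)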
def gpus_available() -> Dict[int, float]:
    if not torch.cuda.is_available():
        return dict()
    try:
        nvmlInit()
        gpus = {}
        visible_devices = os.environ.get('CUDA_VISIBLE_DEVICES', None)
        if visible_devices is None:
            visible_devices = list(range(nvmlDeviceGetCount()))
        else:
            visible_devices = {int(x.strip()) for x in visible_devices.split(',')}
        for i, real_id in enumerate(visible_devices):
            h = nvmlDeviceGetHandleByIndex(real_id)
            info = nvmlDeviceGetMemoryInfo(h)
            total = info.total
            free = info.free
            ratio = free / total
            gpus[i] = ratio
            # print(f'total : {info.total}')
            # print(f'free : {info.free}')
            # print(f'used : {info.used}')
            # t = torch.cuda.get_device_properties(0).total_memory
            # c = torch.cuda.memory_cached(0)
            # a = torch.cuda.memory_allocated(0)
            # print(t, c, a)
        nvmlShutdown()
        return dict(sorted(gpus.items(), key=lambda x: x[1], reverse=True))
    except Exception as e:
        logger.debug(f'Failed to get gpu info due to {e}')
        return dict((i, 1.0) for i in range(torch.cuda.device_count()))
def gpus_available() -> dict:
    try:
        nvmlInit()
        gpus = {}
        visible_devices = os.environ.get('CUDA_VISIBLE_DEVICES', None)
        if visible_devices:
            visible_devices = {int(x.strip()) for x in visible_devices.split(',')}
        else:
            visible_devices = list(range(nvmlDeviceGetCount()))
        for i, real_id in enumerate(visible_devices):
            h = nvmlDeviceGetHandleByIndex(real_id)
            info = nvmlDeviceGetMemoryInfo(h)
            total = info.total
            free = info.free
            ratio = free / total
            gpus[i] = ratio
            # print(f'total : {info.total}')
            # print(f'free : {info.free}')
            # print(f'used : {info.used}')
            # t = torch.cuda.get_device_properties(0).total_memory
            # c = torch.cuda.memory_cached(0)
            # a = torch.cuda.memory_allocated(0)
            # print(t, c, a)
        nvmlShutdown()
        return gpus
    except Exception as e:
        logger.debug(f'Failed to get gpu info due to {e}')
        return {}
def shutdown(self):
    """ Shutdown pynvml """
    if self.initialized:
        self.handles = list()
        if not IS_MACOS and not self.plaid:
            pynvml.nvmlShutdown()
        self.initialized = False
def auto_select_gpu():
    """Select gpu which has largest free memory"""
    if HAS_NVML:
        pynvml.nvmlInit()
        deviceCount = pynvml.nvmlDeviceGetCount()
        largest_free_mem = 0
        largest_free_idx = 0
        for i in range(deviceCount):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            info = pynvml.nvmlDeviceGetMemoryInfo(handle)
            if info.free > largest_free_mem:
                largest_free_mem = info.free
                largest_free_idx = i
        pynvml.nvmlShutdown()
        largest_free_mem = largest_free_mem / 1024. / 1024.  # Convert to MB
        idx_to_gpu_id = {}
        for i in range(deviceCount):
            idx_to_gpu_id[i] = '{}'.format(i)
        gpu_id = idx_to_gpu_id[largest_free_idx]
        logging.info(
            'Using largest free memory GPU {} with free memory {}MB'.format(
                gpu_id, largest_free_mem))
        return gpu_id
    else:
        logging.info(
            'nvidia-ml-py is not installed, automatically select gpu is disabled!'
        )
        return '0'
def get_num_gpus():
    import pynvml
    pynvml.nvmlInit()
    ngpus = pynvml.nvmlDeviceGetCount()
    pynvml.nvmlShutdown()
    return ngpus
def track(self):
    """
    Track the GPU memory usage
    """
    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(self.device)
    meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)

    self.curr_line = self.frame.f_lineno
    where_str = self.module_name + ' ' + self.func_name + ':' + ' line ' + str(self.curr_line)

    with open(self.gpu_profile_fn.replace(':', ''), 'a+') as f:
        if self.begin:
            f.write(f"GPU Memory Track | {datetime.datetime.now():%d-%b-%y-%H:%M:%S} |"
                    f" Total Used Memory:{meminfo.used/1000**2:<7.1f}Mb\n\n")
            self.begin = False

        if self.print_detail is True:
            ts_list = [tensor.size() for tensor in self.get_tensors()]
            new_tensor_sizes = {(type(x), tuple(x.size()), ts_list.count(x.size()),
                                 np.prod(np.array(x.size())) * 4 / 1000**2)
                                for x in self.get_tensors()}
            for t, s, n, m in new_tensor_sizes - self.last_tensor_sizes:
                f.write(f'+ | {str(n)} * Size:{str(s):<20} | Memory: {str(m*n)[:6]} M | {str(t):<20}\n')
            for t, s, n, m in self.last_tensor_sizes - new_tensor_sizes:
                f.write(f'- | {str(n)} * Size:{str(s):<20} | Memory: {str(m*n)[:6]} M | {str(t):<20} \n')
            self.last_tensor_sizes = new_tensor_sizes

        f.write(f"\nAt {where_str:<50}"
                f"Total Used Memory:{meminfo.used/1000**2:<7.1f}Mb\n\n")

    pynvml.nvmlShutdown()
def check(self, instance):
    pynvml.nvmlInit()

    msg_list = []
    try:
        deviceCount = pynvml.nvmlDeviceGetCount()
    except:
        deviceCount = 0
    for device_id in xrange(deviceCount):
        handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
        name = pynvml.nvmlDeviceGetName(handle)
        tags = dict(name="{}-{}".format(name, device_id))
        d_tags = self._dict2list(tags)
        # temperature info
        try:
            temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
            self.gauge('nvml.temp.', temp, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetTemperature:{}'.format(err))
        # memory info
        try:
            mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
            self.gauge('nvml.mem.total', mem.total, tags=d_tags)
            self.gauge('nvml.mem.used', mem.used, tags=d_tags)
            self.gauge('nvml.mem.free', mem.free, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetMemoryInfo:{}'.format(err))
        # utilization info
        try:
            util_rate = pynvml.nvmlDeviceGetUtilizationRates(handle)
            self.gauge('nvml.util.gpu', util_rate.gpu, tags=d_tags)
            self.gauge('nvml.util.memory', util_rate.memory, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetUtilizationRates:{}'.format(err))
        # Compute running processes
        try:
            cps = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
            for ps in cps:
                p_tags = tags.copy()
                p_tags['pid'] = ps.pid
                p_tags['name'] = psutil.Process(ps.pid).name()
                p_tags = self._dict2list(p_tags)
                self.gauge('nvml.process.used_gpu_memory', ps.usedGpuMemory, tags=p_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetComputeRunningProcesses:{}'.format(err))
    if msg_list:
        status = AgentCheck.CRITICAL
        msg = u','.join(msg_list)
    else:
        status = AgentCheck.OK
        msg = u'Ok'

    pynvml.nvmlShutdown()

    self.service_check('nvml.check', status, message=msg)
def wrap(*arg, **kwargs):
    try:
        result = func(*arg, **kwargs)
        nvmlShutdown()
    except Exception:
        pass
    else:
        return result
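# The function above reads like the inner wrapper of a decorator closing over `func`.
# A plausible outer shell (my sketch, not shown in the source) would initialise NVML,
# run the wrapped call, and always shut NVML down afterwards:
from pynvml import nvmlInit, nvmlShutdown

def with_nvml(func):
    def wrap(*args, **kwargs):
        nvmlInit()
        try:
            return func(*args, **kwargs)
        finally:
            nvmlShutdown()  # shut down even if the wrapped call raises
    return wrap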
def get_gpu_count():
    """ return the gpu number """
    pynvml.nvmlInit()
    gpu_number = pynvml.nvmlDeviceGetCount()
    pynvml.nvmlShutdown()
    return gpu_number
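# A defensive variant (a sketch, not from the original source): on machines without an
# NVIDIA driver, pynvml.nvmlInit() raises NVMLError, so returning 0 keeps callers simple.
import pynvml

def get_gpu_count_safe():
    try:
        pynvml.nvmlInit()
    except pynvml.NVMLError:
        return 0
    try:
        return pynvml.nvmlDeviceGetCount()
    finally:
        pynvml.nvmlShutdown()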
def _shutdown(self):
    """ Shutdown pynvml if it was the library used for obtaining stats and set
    :attr:`_initialized` back to ``False``. """
    if self._initialized:
        self._handles = list()
        if not IS_MACOS and not self._is_plaidml:
            pynvml.nvmlShutdown()
        self._initialized = False
def get_gpu_temperatures():
    nvmlInit()
    gpus = dict()
    for i in range(nvmlDeviceGetCount()):
        handle = nvmlDeviceGetHandleByIndex(i)
        # sensor 0 is NVML_TEMPERATURE_GPU
        gpus[i] = int(nvmlDeviceGetTemperature(handle, 0))
    nvmlShutdown()
    return gpus
def memory_info():
    """ Assumes identical GPUs in a node """
    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)
    gpu_mem = pynvml.nvmlDeviceGetMemoryInfo(handle).total
    pynvml.nvmlShutdown()
    return gpu_mem
def _nvml():
    """Enter a context manager that will init and shutdown nvml."""
    # Copyright (c) 2018 Bohumír Zámečník, Rossum Ltd., MIT license
    # from https://github.com/rossumai/nvgpu/blob/a66dda5ae816a6a8936645fe0520cb4dc6354137/nvgpu/nvml.py#L5
    # Modifications copyright 2019, Nathan Hunt, MIT license
    nv.nvmlInit()
    yield
    nv.nvmlShutdown()
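# This generator is presumably decorated with contextlib.contextmanager at its definition
# site (an assumption based on its docstring). A minimal sketch of that pattern, with a
# try/finally so NVML is shut down even if the body raises:
from contextlib import contextmanager
import pynvml as nv

@contextmanager
def nvml_session():
    nv.nvmlInit()
    try:
        yield
    finally:
        nv.nvmlShutdown()

# with nvml_session():
#     print(nv.nvmlDeviceGetCount())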
def run_logging_loop(async_task, async_loop):
    asyncio.set_event_loop(async_loop)
    pynvml.nvmlInit()
    logger = _logger()
    logger.info("Driver Version: {}".format(
        nativestr(pynvml.nvmlSystemGetDriverVersion())))
    async_loop.run_until_complete(async_task)
    logger.info("Shutting down driver")
    pynvml.nvmlShutdown()
def log_gpu_stat(logger):
    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)
    meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
    used_mem = (meminfo.used / 1024) / 1024
    pynvml.nvmlShutdown()
    gpu_info = subprocess.check_output(["nvidia-smi"])
    logger.info(("\nThe pid of current job is {} and {}, the used memory before we run is {}MB,"
                 " the <nvidia-smi> shows:\n{}").format(
                    os.getpid(), os.getppid(), used_mem, gpu_info.decode("utf-8")))
def end(self, session):
    """Called at the end of a session.

    Arguments:
        session (tf.Session): The `session` argument can be used in case the hook
            wants to run final ops, such as saving a last checkpoint.
    """
    # Shutdown the NVML interface.
    nvml.nvmlShutdown()
def exit(self):
    """Overwrite the exit method to close the GPU API."""
    if self.nvml_ready:
        try:
            pynvml.nvmlShutdown()
        except Exception as e:
            logger.debug("pynvml failed to shutdown correctly ({})".format(e))
    # Call the father exit method
    super(Plugin, self).exit()
def __customCurveSpeed(self):
    nvmlInit()
    self._handle = nvmlDeviceGetHandleByIndex(self.id)
    curve = Curve()
    while (not self.stopped()):
        current_temp = self.__getTemp()
        new_fan_speed = curve.evaluate(current_temp)
        self.__setSpeed(new_fan_speed)
        time.sleep(1.0)
    nvmlShutdown()
def run_hardware_monitor(sv: SharedValues):
    print(time.strftime(LOG_TIME), "Hardware monitoring starts")
    try:
        if GPU_MODE:
            pynvml.nvmlInit()
        hw_info.SSEUpdater.broadcast_sys_info(sv)
    except KeyboardInterrupt:
        print(time.strftime(LOG_TIME), "Hardware monitoring stops")
        if GPU_MODE:
            pynvml.nvmlShutdown()
def get_gpu_status(gpu_index=0):
    # init NVML before querying the device
    N.nvmlInit()
    handle = N.nvmlDeviceGetHandleByIndex(gpu_index)

    def _decode(b):
        if isinstance(b, bytes):
            return b.decode()  # to unicode
        return b

    name = _decode(N.nvmlDeviceGetName(handle))
    uuid = _decode(N.nvmlDeviceGetUUID(handle))

    try:
        temperature = N.nvmlDeviceGetTemperature(handle, N.NVML_TEMPERATURE_GPU)
    except N.NVMLError:
        temperature = None
    try:
        memory = N.nvmlDeviceGetMemoryInfo(handle)
    except N.NVMLError:
        memory = None
    try:
        utilization = N.nvmlDeviceGetUtilizationRates(handle)
    except N.NVMLError:
        utilization = None
    try:
        power = N.nvmlDeviceGetPowerUsage(handle)
    except:
        power = None
    try:
        power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
    except:
        power_limit = None

    # real gpu index
    index = N.nvmlDeviceGetIndex(handle)
    gpu_info = {
        'index': index,
        'uuid': uuid,
        'name': name,
        'temperature': temperature,
        'utilization': utilization.gpu if utilization else None,
        'power': int(power / 1000) if power is not None else None,
        'enforced.power': int(power_limit / 1000) if power_limit is not None else None,
        # Convert bytes into MBytes
        'memory.used': int(memory.used / 1024 / 1024) if memory else None,
        'memory.total': int(memory.total / 1024 / 1024) if memory else None,
    }
    # release resource
    N.nvmlShutdown()
    return GPUStat(gpu_info)
def func0(memory_require=128 * 1024 * 1024, tf_gpu_mem_growth=False, logger=None, console=True):
    try:
        gpu = None
        pynvml.nvmlInit()
        gpu_num = pynvml.nvmlDeviceGetCount()
        # check nvidia driver
        import tensorflow as tf
        gpus = tf.config.experimental.list_physical_devices('GPU')
        del tf
        if gpu_num <= 0 or len(gpus) <= 0:
            pynvml.nvmlShutdown()
            if len(gpus) <= 0 and gpu_num > 0:
                msg = "have {} GPU, but tensorflow can not detect, check driver or tensorflow if GPU version".format(gpu_num)
            else:
                msg = "NO GPU"
            if logger:
                logger.i(msg)
            if console:
                print(msg)
            return gpu
        for i in range(gpu_num):
            h = pynvml.nvmlDeviceGetHandleByIndex(i)
            name = pynvml.nvmlDeviceGetName(h)
            info = pynvml.nvmlDeviceGetMemoryInfo(h)
            msg = "GPU:{}, used:{}/{}MB, free:{}MB".format(
                name.decode(), info.used / 1024 / 1024,
                info.total / 1024 / 1024, info.free / 1024 / 1024)
            if logger:
                logger.i(msg)
            if console:
                print(msg)
            if info.free >= memory_require:
                gpu = GPU_info(id=i, name=name.decode(), mem_free=info.free, mem_total=info.total)
                os.environ["CUDA_VISIBLE_DEVICES"] = str(i)
                import tensorflow as tf
                tf.config.experimental.set_memory_growth(gpus[i], True)
                del tf
                break
        pynvml.nvmlShutdown()
    except Exception as e:
        msg = "select gpu fail:{}".format(e)
        if logger:
            logger.i(msg)
        if console:
            print(msg)
    return gpu
def check_nvidia_device():
    try:
        pynvml.nvmlInit()
        driver_version = float(pynvml.nvmlSystemGetDriverVersion())
        pynvml.nvmlShutdown()
        if driver_version < 367.48:
            raise OSError(
                'NVIDIA driver v.{} is not supported. The driver version must be 367.48 or newer'
                .format(driver_version))
    except pynvml.NVMLError:
        raise OSError('NVIDIA device not found')
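# Caveat worth noting (my observation, not from the source): driver strings with two dots,
# e.g. "470.57.02", make float() raise ValueError. A sketch that parses only the leading
# "major.minor" part is more robust:
import pynvml

def driver_version_as_float():
    pynvml.nvmlInit()
    try:
        raw = pynvml.nvmlSystemGetDriverVersion()
    finally:
        pynvml.nvmlShutdown()
    if isinstance(raw, bytes):
        raw = raw.decode()
    major, minor = raw.split('.')[:2]
    return float('{}.{}'.format(major, minor))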
def get_gpu_machine(self) -> GPUMachine:
    # from pynvml.smi import nvidia_smi
    # nvsmi = nvidia_smi.getInstance()
    # gpu_info = nvsmi.DeviceQuery('index, utilization.gpu, memory.free, count')
    # gpu_machine = GPUMachine(gpu_info["count"])
    #
    # for one_gpu in gpu_info["gpu"]:
    #     gpu_machine.add_gpu_state(
    #         GPUState(free=one_gpu["fb_memory_usage"]["free"],
    #                  util=one_gpu["utilization"]["gpu_util"],
    #                  index=int(one_gpu["minor_number"])
    #                  )
    #     )
    import pynvml

    MB = 1024 * 1024

    pynvml.nvmlInit()
    device_count = pynvml.nvmlDeviceGetCount()
    gpu_machine = GPUMachine(device_count)
    pynvml.nvmlShutdown()

    for index in range(device_count):
        pynvml.nvmlInit()
        handle = pynvml.nvmlDeviceGetHandleByIndex(index)
        index = pynvml.nvmlDeviceGetIndex(handle)
        try:
            utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
        except pynvml.NVMLError:
            utilization = None  # Not supported
        try:
            memory = pynvml.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
        except pynvml.NVMLError:
            memory = None  # Not supported
        pynvml.nvmlShutdown()

        gpu_machine.add_gpu_state(
            GPUState(free=memory.free // MB,
                     util=utilization.gpu,
                     index=index
                     )
        )
    return gpu_machine
def clean_up():
    global _nvml_inited
    if _nvml_inited:
        try:
            pynvml.nvmlShutdown()
            logger.info('[NVML] NVML Shutdown')
        except pynvml.NVMLError as e:
            logger.error('[NVML] NVML Failed to Shutdown: %s' % str(e))
            pass
    _nvml_inited = False
    _static_info['public'] = {}
    _static_info['private'] = {}
def get_nvml_driver_version():
    try:
        from pynvml import nvmlInit, nvmlShutdown, nvmlSystemGetDriverVersion
        try:
            nvmlInit()
            v = nvmlSystemGetDriverVersion()
            log("nvmlSystemGetDriverVersion=%s", v)
            return v.split(".")
        except Exception as e:
            log.warn("Warning: failed to query the NVidia kernel module version via NVML:")
            log.warn(" %s", e)
        finally:
            nvmlShutdown()
    except ImportError as e:
        log("cannot use nvml to query the kernel module version:")
        log(" %s", e)
    return ""
def request_mem(mem_mb, i_am_nice=True):
    # titanx' mem: 12,881,559,552 bytes
    # 12*1024*1024*1024 = 12,884,901,888
    mem = mem_mb * 1024 * 1024
    nvml.nvmlInit()
    # n = nvml.nvmlDeviceGetCount()
    try:
        handle = nvml.nvmlDeviceGetHandleByIndex(0)
        info = nvml.nvmlDeviceGetMemoryInfo(handle)
        cap = info.total * nice_ratio
        # req = cap if mem > cap and i_am_nice else mem
        req = mem
        if req > cap and i_am_nice:
            raise MemoryError('You are supposed to be polite..')
        if req > info.free:
            raise MemoryError('Cannot fullfil the gpumem request')
        return req / info.free
    finally:
        nvml.nvmlShutdown()
def collect_via_pynvml(self, stats_config):
    """
    Use pynvml python binding to collect metrics
    :param stats_config:
    :return:
    """
    try:
        NVML_TEMPERATURE_GPU = 0
        pynvml.nvmlInit()
        device_count = pynvml.nvmlDeviceGetCount()

        for device_index in xrange(device_count):
            handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)
            memoryInfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
            utilizationRates = pynvml.nvmlDeviceGetUtilizationRates(handle)

            metrics = {
                'memory.total': memoryInfo.total / 1024 / 1024,
                'memory.used': memoryInfo.used / 1024 / 1024,
                'memory.free': memoryInfo.free / 1024 / 1024,
                'utilization.gpu': utilizationRates.gpu,
                'utilization.memory': utilizationRates.memory,
                'temperature.gpu': pynvml.nvmlDeviceGetTemperature(handle, NVML_TEMPERATURE_GPU)
            }

            for stat_name in stats_config[1:]:
                metric = metrics.get(stat_name)
                if metric:
                    metric_name = 'gpu_{index}.{stat_name}'.format(
                        index=str(device_index),
                        stat_name=stat_name
                    )
                    self.publish(metric_name, metric)
    finally:
        pynvml.nvmlShutdown()
def identify_cards():
    devices = {}
    try:
        import pynvml
        from pynvml import nvmlInit, nvmlShutdown, nvmlDeviceGetCount, nvmlDeviceGetHandleByIndex
        deviceCount = None
        try:
            nvmlInit()
            deviceCount = nvmlDeviceGetCount()
            for i in range(deviceCount):
                handle = nvmlDeviceGetHandleByIndex(i)
                props = {}
                def meminfo(memory):
                    return {
                        "total": int(memory.total),
                        "free": int(memory.free),
                        "used": int(memory.used),
                    }
                def pciinfo(pci):
                    i = {}
                    for x in ("domain", "bus", "device", "pciDeviceId", "pciSubSystemId"):
                        try:
                            i[x] = int(getattr(pci, x))
                        except:
                            pass
                    try:
                        i["busId"] = str(pci.busId)
                    except:
                        pass
                    return i
                for prop, fn_name, args, conv in (
                        ("name", "nvmlDeviceGetName", (), str),
                        ("serial", "nvmlDeviceGetSerial", (), str),
                        ("uuid", "nvmlDeviceGetUUID", (), str),
                        ("pci", "nvmlDeviceGetPciInfo", (), pciinfo),
                        ("memory", "nvmlDeviceGetMemoryInfo", (), meminfo),
                        ("pcie-link-generation-max", "nvmlDeviceGetMaxPcieLinkGeneration", (), int),
                        ("pcie-link-width-max", "nvmlDeviceGetMaxPcieLinkWidth", (), int),
                        ("pcie-link-generation", "nvmlDeviceGetCurrPcieLinkGeneration", (), int),
                        ("pcie-link-width", "nvmlDeviceGetCurrPcieLinkWidth", (), int),
                        ("clock-info-graphics", "nvmlDeviceGetClockInfo", (0,), int),
                        ("clock-info-sm", "nvmlDeviceGetClockInfo", (1,), int),
                        ("clock-info-mem", "nvmlDeviceGetClockInfo", (2,), int),
                        ("clock-info-graphics-max", "nvmlDeviceGetMaxClockInfo", (0,), int),
                        ("clock-info-sm-max", "nvmlDeviceGetMaxClockInfo", (1,), int),
                        ("clock-info-mem-max", "nvmlDeviceGetMaxClockInfo", (2,), int),
                        ("fan-speed", "nvmlDeviceGetFanSpeed", (), int),
                        ("temperature", "nvmlDeviceGetTemperature", (0,), int),
                        ("power-state", "nvmlDeviceGetPowerState", (), int),
                        ("vbios-version", "nvmlDeviceGetVbiosVersion", (), str),
                ):
                    try:
                        fn = getattr(pynvml, fn_name)
                        v = fn(handle, *args)
                        if conv:
                            v = conv(v)
                        props[prop] = v
                    except Exception as e:
                        log("identify_cards() cannot query %s using %s on device %i with handle %s: %s",
                            prop, fn, i, handle, e)
                        continue
                devices[i] = props
            #unitCount = nvmlUnitGetCount()
            #log.info("unitCount=%s", unitCount)
        except Exception as e:
            log("identify_cards() pynvml error", exc_info=True)
            log.warn("Warning: failed to query the NVidia cards via NVML:")
            log.warn(" %s", e)
        finally:
            if deviceCount is not None:
                nvmlShutdown()
    except ImportError as e:
        log("cannot use nvml to query the kernel module version:")
        log(" %s", e)
    return devices
def do_GET(self):
    # checks if the server is alive
    if self.path == '/test':
        send_header(self)
        self.wfile.write(bytes('passed<br>', 'utf-8'))
        self.wfile.write(bytes('server is responding', 'utf-8'))
    # returns the running processes
    if self.path == '/runningProcesses':
        send_header(self)
        # send response:
        if modules['psutil']:
            for proc in psutil.process_iter():
                try:
                    pinfo = proc.as_dict(attrs=['pid', 'name'])
                except psutil.NoSuchProcess:
                    pass
                print(pinfo)
                self.wfile.write(bytes(str(pinfo), 'utf-8'))
        else:
            self.wfile.write('I am sorry but the Python module psutil is not installed. Therefore the running processes cannot be shown.', 'utf-8')
    # returns the CPU utilization and number of cores
    elif self.path == '/cpuInfo':
        send_header(self)
        # get CPU info
        cpuInfo = {}
        if modules['psutil']:
            cpuInfo['CPU Utilization'] = int(psutil.cpu_percent())
            cpuInfo['CPU Cores'] = int(psutil.cpu_count())
        else:
            cpuInfo['Missing Python module'] = 'I am sorry but the Python module psutil is not installed. Therefore the number of CPU cores cannot be shown.'
        json_dump = json.dumps(cpuInfo)
        self.wfile.write(bytes(json_dump, 'utf-8'))
        # get GPU info
        if modules['pynvml']:
            try:
                pynvml.nvmlInit()
                gpus = pynvml.nvmlDeviceGetCount()
            except:
                gpus = 0
                self.wfile.write(bytes('No NVIDIA GPU detected', 'utf-8'))
        else:
            gpus = 0
            self.wfile.write(bytes('I am sorry but the the Python module pynvml is not installed. Therefore info about NVIDIA GPUs cannot be shown.', 'utf-8'))
        for i in range(gpus):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            self.wfile.write(bytes("<br>GPU " + str(i + 1) + ": " + pynvml.nvmlDeviceGetName(handle).decode('utf-8'), 'utf-8'))
            try:
                self.wfile.write(bytes('<br>Temperature: ' + str(pynvml.nvmlDeviceGetTemperature(handle, 0)) + '°C', 'utf-8'))
            except:
                self.wfile.write(bytes('<br>Could not retrieve temperature', 'utf-8'))
            try:
                gpu_mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
                self.wfile.write(bytes('<br>Total memory: %i Megabytes' % (gpu_mem.total / 10**6), 'utf-8'))
                self.wfile.write(bytes(str('<br>Free memory: %i' % (gpu_mem.free / gpu_mem.total * 100)) + '%', 'utf-8'))
            except:
                self.wfile.write(bytes('<br>Could not retrieve memory information', 'utf-8'))
        if gpus > 0:
            try:
                pynvml.nvmlShutdown()
            except:
                pass
    elif self.path == '/availableComputers':
        send_header(self)
        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        s.connect(('google.com', 0))
        global myownsocket
        myownsocket = s.getsockname()[0]
        port = 8003
        available_computers = []
        for i in range(1, 256):
            host = '192.168.178.' + str(i)
            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            sock.settimeout(0.2)
            try:
                alive = sock.connect_ex((host, port))
            except:
                alive = -1
            if alive == 0:
                print('available')
                available_computers.append(host)
            else:
                print('not available')
            print(host)
        self.wfile.write(bytes('<form action="submit_job">\n', 'utf-8'))
        cmd_txt = """@echo off
call "C:\Program Files\Autodesk\Softimage 2015\Application\bin\setenv.bat"
echo ##### start_rendering
xsibatch -render "Z:\TAZ_RoterFaden\PROCESS\XSI\Scenes\SC_060\088_160523_SC_060_V007.scn" -frames #1#-#2# -pass "BEAUTY" -skip on -verbose on
echo ##### rendering_done
"""
        self.wfile.write(bytes('Command: <textarea name="command">' + cmd_txt + '</textarea><br>\n', 'utf-8'))
        self.wfile.write(bytes('<table border="1">\n', 'utf-8'))
        self.wfile.write(bytes('<tr>\n', 'utf-8'))
        self.wfile.write(bytes('<th>Computer</th>\n', 'utf-8'))
        self.wfile.write(bytes('<th>CPU cores</th>\n', 'utf-8'))
        self.wfile.write(bytes('<th>Start Frame [%]</th>\n', 'utf-8'))
        self.wfile.write(bytes('<th>End Frame [%]</th>\n</tr>\n', 'utf-8'))
        available_cpus = {}
        for host in available_computers:
            available_cpus[host] = abs(get_cpu_cores(host))
        total_cpus = sum(available_cpus.values())
        frame_list = {}
        start_frame = 0
        for host in available_computers:
            start_frame += 1
            frame_list[host] = [start_frame]
            start_frame = start_frame + int(100 * (available_cpus[host] / total_cpus))
            if start_frame > 100:
                start_frame = 100
            frame_list[host].append(start_frame)
        index = 0
        for host in available_computers:
            index += 1
            self.wfile.write(bytes('<tr>\n<td>\n<input type="checkbox" name="host' + str(index) + '" value="', 'utf-8'))
            self.wfile.write(bytes(host, 'utf-8'))
            self.wfile.write(bytes('">' + host + '</td>\n', 'utf-8'))
            self.wfile.write(bytes('<td>' + str(available_cpus[host]) + '</td>\n', 'utf-8'))
            self.wfile.write(bytes('<td><input type="text" name="start' + str(index) + '" value=" ' + str(frame_list[host][0]) + '"></td>\n', 'utf-8'))
            self.wfile.write(bytes('<td><input type="text" name="end' + str(index) + '" value=" ' + str(frame_list[host][1]) + '"></td>\n', 'utf-8'))
            self.wfile.write(bytes('</tr>', 'utf-8'))
        index = 2
        self.wfile.write(bytes('<tr>\n<td>\n<input type="checkbox" name="host' + str(index) + '" value="', 'utf-8'))
        self.wfile.write(bytes(host, 'utf-8'))
        self.wfile.write(bytes('">' + host + '</td>\n', 'utf-8'))
        self.wfile.write(bytes('<td>' + str(available_cpus[host]) + '</td>\n', 'utf-8'))
        self.wfile.write(bytes('<td><input type="text" name="start' + str(index) + '" value=" ' + str(frame_list[host][0]) + '"></td>\n', 'utf-8'))
        self.wfile.write(bytes('<td><input type="text" name="end' + str(index) + '" value=" ' + str(frame_list[host][1]) + '"></td>\n', 'utf-8'))
        self.wfile.write(bytes('</tr>', 'utf-8'))
        self.wfile.write(bytes('</table>\n', 'utf-8'))
        self.wfile.write(bytes('<input type="submit" value="Submit Job">\n', 'utf-8'))
        self.wfile.write(bytes('</form>\n', 'utf-8'))
        self.wfile.write(bytes('</body>\n', 'utf-8'))
        self.wfile.write(bytes('</html>\n', 'utf-8'))
    elif self.path == '/execute_job':
        send_header(self)
        parsed = urlparse(self.path)
        parameters = parse_qs(parsed.query)
    elif '/submit_job' in self.path:
        send_header(self)
        self.wfile.write(bytes(str(self.client_address), 'utf-8'))
        parsed = urlparse(self.path)
        parameters = parse_qs(parsed.query)
        # print(parsed)
        print(parameters)
        self.wfile.write(bytes('<body>', 'utf-8'))
        for index in range(1, 100):
            if not parameters.get('host' + str(index)).strip():
                pass
            elif not parameters.get('start' + str(index)).strip():
                pass
            elif not parameters.get('end' + str(index)).strip():
                pass
            elif parameters.get('command'):
                cmd_txt = parameters['command'][0].replace('#1#', parameters['start' + str(index)][0].strip())
                cmd_txt = cmd_txt.replace('#2#', parameters['end' + str(index)][0].strip())
                self.wfile.write(bytes(escape(cmd_txt), 'utf-8'))
                self.wfile.write(bytes('<br>', 'utf-8'))
                print(cmd_txt)
        self.wfile.write(bytes('</body></html>', 'utf-8'))
    elif '/shutdown' in self.path:
        send_header(self)
        self.wfile.write(bytes(str(self.client_address), 'utf-8'))
        self.wfile.write(bytes("Server will be shut down now......", 'utf-8'))
        server.shutdown()
        sys.exit()
    else:
        send_header(self)
        self.wfile.write(bytes(str(self.client_address), 'utf-8'))
        self.wfile.write(bytes("<br>", 'utf-8'))
        self.wfile.write(bytes(self.path, 'utf-8'))
        print(self.path)
def init(self):
    self.util_history = []
    self.temp_history = []
    pynvml.nvmlInit()
    self.gpu_handles = []
    self.deviceCount = pynvml.nvmlDeviceGetCount()
    for i in range(self.deviceCount):
        self.gpu_handles.append(pynvml.nvmlDeviceGetHandleByIndex(i))
    self.cpu_box = Gtk.Box(orientation=Gtk.Orientation.VERTICAL, spacing=6)
    self.cpu_prog_bars = []
    self.gpu_boxes = []
    self.gpu_prog_bars = []
    self.prev_idle = []
    self.prev_total = []
    self.idle = []
    self.total = []

    # ---cpu_box---
    try:
        stat = open("/proc/stat")
        statlines = stat.read().splitlines()
        stat.close()
        self.corecount = -1
        for line in statlines:
            if (line[0:2] == "cp"):
                self.corecount += 1
            else:
                break
    except IOError:
        print("Problem opening /proc/stat, exiting..")
        pynvml.nvmlShutdown()
        quit()

    for i in range(self.corecount):
        self.cpu_prog_bars.append(Gtk.ProgressBar(text="CPU %d" % i, show_text=True))
        self.cpu_box.pack_start(self.cpu_prog_bars[i], True, True, 0)
        self.prev_idle.append(0)
        self.prev_total.append(0)
        self.idle.append(0)
        self.total.append(0)

    # ---gpu_boxes---
    for i in range(self.deviceCount):
        product_name = pynvml.nvmlDeviceGetName(self.gpu_handles[i])
        product_name = product_name.decode('utf-8')
        gpu_box = Gtk.Box(orientation=Gtk.Orientation.VERTICAL, spacing=8)
        label = Gtk.Label(product_name)
        self.gpu_prog_bars.append(Gtk.ProgressBar(text="GPU", show_text=True))
        self.gpu_prog_bars.append(Gtk.ProgressBar(text="Memory Utilization", show_text=True))
        self.gpu_prog_bars.append(Gtk.ProgressBar(text="Memory Usage", show_text=True))
        self.gpu_prog_bars.append(Gtk.ProgressBar(text="Temperature", show_text=True))
        self.gpu_prog_bars.append(Gtk.ProgressBar(text="Encoder", show_text=True))
        self.gpu_prog_bars.append(Gtk.ProgressBar(text="Decoder", show_text=True))
        gpu_box.pack_start(label, True, True, 0)
        gpu_box.pack_start(self.gpu_prog_bars[i*6], True, True, 0)
        gpu_box.pack_start(self.gpu_prog_bars[i*6 + 1], True, True, 0)
        gpu_box.pack_start(self.gpu_prog_bars[i*6 + 2], True, True, 0)
        gpu_box.pack_start(self.gpu_prog_bars[i*6 + 3], True, True, 0)
        gpu_box.pack_start(self.gpu_prog_bars[i*6 + 4], True, True, 0)
        gpu_box.pack_start(self.gpu_prog_bars[i*6 + 5], True, True, 0)
        self.gpu_boxes.append(gpu_box)

    # ---proc---
    proc_liststore = Gtk.ListStore(int, str, int)
    self.tree = Gtk.TreeView(model=proc_liststore)
    renderer_pid = Gtk.CellRendererText()
    column_pid = Gtk.TreeViewColumn("Process ID", renderer_pid, text=0)
    column_pid.set_resizable(True)
    self.tree.append_column(column_pid)
    renderer_path = Gtk.CellRendererText()
    column_path = Gtk.TreeViewColumn("Command Line", renderer_path, text=1)
    column_path.set_resizable(True)
    column_path.set_fixed_width(250)
    self.tree.append_column(column_path)
    renderer_mem = Gtk.CellRendererText()
    column_mem = Gtk.TreeViewColumn("Memory (MiB)", renderer_mem, text=2)
    column_mem.set_resizable(True)
    self.tree.append_column(column_mem)
def count_gpus():
    nvmlInit()
    count = nvmlDeviceGetCount()
    nvmlShutdown()
    return count
def info_refresh(self):
    try:
        stat = open("/proc/stat")
        self.statlines = stat.read().splitlines()[1:-1]
        stat.close()
    except IOError:
        print("Problem opening /proc/stat, exiting..")
        pynvml.nvmlShutdown()
        quit()

    for i in range(self.corecount):
        for j in self.statlines[i].split()[1:]:  # remove cpu#
            self.total[i] += int(j)
        self.idle[i] = int(self.statlines[i].split()[4])

    for i in range(self.corecount):
        if (self.total[i] - self.prev_total[i]) == 0:
            self.prev_idle[i] = self.idle[i]
            self.prev_total[i] = self.total[i]
            break
        self.cpu_prog_bars[i].set_fraction(1 - ((self.idle[i] - self.prev_idle[i]) / (self.total[i] - self.prev_total[i])))
        self.prev_idle[i] = self.idle[i]
        self.prev_total[i] = self.total[i]
        self.idle[i] = 0
        self.total[i] = 0

    for i in range(self.deviceCount):
        util = pynvml.nvmlDeviceGetUtilizationRates(self.gpu_handles[i])
        temp = pynvml.nvmlDeviceGetTemperature(self.gpu_handles[i], pynvml.NVML_TEMPERATURE_GPU)
        memInfo = pynvml.nvmlDeviceGetMemoryInfo(self.gpu_handles[i])
        (encoder_util, sPeriod) = pynvml.nvmlDeviceGetEncoderUtilization(self.gpu_handles[i])
        (decoder_util, sPeriod) = pynvml.nvmlDeviceGetDecoderUtilization(self.gpu_handles[i])
        mem_total = memInfo.total / 1024 / 1024
        mem_used = memInfo.used / 1024 / 1024
        self.gpu_prog_bars[i*6].set_text("GPU: %d%%" % util.gpu)
        self.gpu_prog_bars[i*6].set_fraction(util.gpu / 100)
        ########
        self.util_history.append(util.gpu)
        self.util_graph.queue_draw()
        self.temp_history.append(temp)
        self.temp_graph.queue_draw()
        ########
        self.gpu_prog_bars[i*6 + 1].set_text("Memory Utilization: %d%%" % util.memory)
        self.gpu_prog_bars[i*6 + 1].set_fraction(util.memory / 100)
        self.gpu_prog_bars[i*6 + 4].set_text("Encoder: %d%%" % encoder_util)
        self.gpu_prog_bars[i*6 + 5].set_text("Decoder: %d%%" % decoder_util)
        self.gpu_prog_bars[i*6 + 4].set_fraction(encoder_util / 100)
        self.gpu_prog_bars[i*6 + 5].set_fraction(decoder_util / 100)
        self.gpu_prog_bars[i*6 + 2].set_text("Memory Usage: %d MiB/%d MiB" % (mem_used, mem_total))
        self.gpu_prog_bars[i*6 + 2].set_fraction(mem_used / mem_total)
        self.gpu_prog_bars[i*6 + 3].set_text("Temperature: %d °C" % temp)
        if temp > 100:
            temp = 100
        elif temp < 0:
            temp = 0
        self.gpu_prog_bars[i*6 + 3].set_fraction(temp / 100)

    # --proc--
    procs = pynvml.nvmlDeviceGetGraphicsRunningProcesses(self.gpu_handles[0])
    proc_liststore = Gtk.ListStore(int, str, int)
    for p in procs:
        pid = p.pid
        try:
            path = pynvml.nvmlSystemGetProcessName(p.pid).decode('utf-8')
        except:
            self.exit()
        if (p.usedGpuMemory == None):
            mem = 0
        else:
            mem = (p.usedGpuMemory / 1024 / 1024)
        proc_liststore.append([pid, path, mem])
    self.tree.set_model(proc_liststore)
    return True
def new_query():
    """Query the information of all the GPUs on local machine"""
    N.nvmlInit()

    def _decode(b):
        if isinstance(b, bytes):
            return b.decode()  # for python3, to unicode
        return b

    def get_gpu_info(handle):
        """Get one GPU information specified by nvml handle"""

        def get_process_info(nv_process):
            """Get the process information of specific pid"""
            process = {}
            ps_process = psutil.Process(pid=nv_process.pid)
            process['username'] = ps_process.username()
            # cmdline returns full path;
            # as in `ps -o comm`, get short cmdnames.
            _cmdline = ps_process.cmdline()
            if not _cmdline:
                # sometimes, zombie or unknown (e.g. [kworker/8:2H])
                process['command'] = '?'
            else:
                process['command'] = os.path.basename(_cmdline[0])
            # Bytes to MBytes
            process['gpu_memory_usage'] = nv_process.usedGpuMemory // MB
            process['pid'] = nv_process.pid
            return process

        name = _decode(N.nvmlDeviceGetName(handle))
        uuid = _decode(N.nvmlDeviceGetUUID(handle))

        try:
            temperature = N.nvmlDeviceGetTemperature(handle, N.NVML_TEMPERATURE_GPU)
        except N.NVMLError:
            temperature = None  # Not supported

        try:
            memory = N.nvmlDeviceGetMemoryInfo(handle)  # in Bytes
        except N.NVMLError:
            memory = None  # Not supported

        try:
            utilization = N.nvmlDeviceGetUtilizationRates(handle)
        except N.NVMLError:
            utilization = None  # Not supported

        try:
            power = N.nvmlDeviceGetPowerUsage(handle)
        except N.NVMLError:
            power = None

        try:
            power_limit = N.nvmlDeviceGetEnforcedPowerLimit(handle)
        except N.NVMLError:
            power_limit = None

        try:
            nv_comp_processes = N.nvmlDeviceGetComputeRunningProcesses(handle)
        except N.NVMLError:
            nv_comp_processes = None  # Not supported
        try:
            nv_graphics_processes = N.nvmlDeviceGetGraphicsRunningProcesses(handle)
        except N.NVMLError:
            nv_graphics_processes = None  # Not supported

        if nv_comp_processes is None and nv_graphics_processes is None:
            processes = None
        else:
            processes = []
            nv_comp_processes = nv_comp_processes or []
            nv_graphics_processes = nv_graphics_processes or []
            for nv_process in nv_comp_processes + nv_graphics_processes:
                # TODO: could be more information such as system memory
                # usage, CPU percentage, create time etc.
                try:
                    process = get_process_info(nv_process)
                    processes.append(process)
                except psutil.NoSuchProcess:
                    # TODO: add some reminder for NVML broken context
                    # e.g. nvidia-smi reset or reboot the system
                    pass

        index = N.nvmlDeviceGetIndex(handle)
        gpu_info = {
            'index': index,
            'uuid': uuid,
            'name': name,
            'temperature.gpu': temperature,
            'utilization.gpu': utilization.gpu if utilization else None,
            'power.draw': power // 1000 if power is not None else None,
            'enforced.power.limit': power_limit // 1000 if power_limit is not None else None,
            # Convert bytes into MBytes
            'memory.used': memory.used // MB if memory else None,
            'memory.total': memory.total // MB if memory else None,
            'processes': processes,
        }
        return gpu_info

    # 1. get the list of gpu and status
    gpu_list = []
    device_count = N.nvmlDeviceGetCount()
    for index in range(device_count):
        handle = N.nvmlDeviceGetHandleByIndex(index)
        gpu_info = get_gpu_info(handle)
        gpu_stat = GPUStat(gpu_info)
        gpu_list.append(gpu_stat)

    # 2. additional info (driver version, etc).
    try:
        driver_version = _decode(N.nvmlSystemGetDriverVersion())
    except N.NVMLError:
        driver_version = None  # N/A

    N.nvmlShutdown()
    return GPUStatCollection(gpu_list, driver_version=driver_version)
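# Context sketch (assumptions, mirroring gpustat-style modules): `N` is the pynvml module,
# MB converts bytes to MiB, and GPUStat/GPUStatCollection wrap the per-device dicts built
# above.
import pynvml as N

MB = 1024 * 1024

# collection = new_query()
# for gpu in collection:      # GPUStatCollection is assumed to be iterable over GPUStat
#     print(gpu.entry['name'], gpu.entry['memory.used'], 'MiB used')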
def _shutdown_nvml(self):
    try:
        pynvml.nvmlShutdown()
    except pynvml.NVMLError, err:
        logger.debug('Failed to shutdown NVML: ', err)
def check(self, instance):
    pynvml.nvmlInit()

    msg_list = []
    try:
        deviceCount = pynvml.nvmlDeviceGetCount()
    except:
        deviceCount = 0
    for device_id in xrange(deviceCount):
        handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
        name = pynvml.nvmlDeviceGetName(handle)
        tags = dict(name="{}-{}".format(name, device_id))
        d_tags = self._dict2list(tags)
        # temperature info
        try:
            temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
            self.gauge('nvml.temp.', temp, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetTemperature:{}'.format(err))
        # memory info
        try:
            mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
            self.gauge('nvml.mem.total', mem.total, tags=d_tags)
            self.gauge('nvml.mem.used', mem.used, tags=d_tags)
            self.gauge('nvml.mem.free', mem.free, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetMemoryInfo:{}'.format(err))
        # utilization GPU/Memory info
        try:
            util_rate = pynvml.nvmlDeviceGetUtilizationRates(handle)
            self.gauge('nvml.util.gpu', util_rate.gpu, tags=d_tags)
            self.gauge('nvml.util.memory', util_rate.memory, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetUtilizationRates:{}'.format(err))
        # utilization Encoder info
        try:
            util_encoder = pynvml.nvmlDeviceGetEncoderUtilization(handle)
            self.log.info('nvml.util.encoder %s' % long(util_encoder[0]))
            self.gauge('nvml.util.encoder', long(util_encoder[0]), tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetEncoderUtilization:{}'.format(err))
        # utilization Decoder info
        try:
            util_decoder = pynvml.nvmlDeviceGetDecoderUtilization(handle)
            self.log.info('nvml.util.decoder %s' % long(util_decoder[0]))
            self.gauge('nvml.util.decoder', long(util_decoder[0]), tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetDecoderUtilization:{}'.format(err))
        # Compute running processes
        try:
            cps = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
            for ps in cps:
                p_tags = tags.copy()
                p_tags['pid'] = ps.pid
                p_tags['name'] = psutil.Process(ps.pid).name()
                p_tags = self._dict2list(p_tags)
                self.gauge('nvml.process.used_gpu_memory', ps.usedGpuMemory, tags=p_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetComputeRunningProcesses:{}'.format(err))
    if msg_list:
        status = AgentCheck.CRITICAL
        msg = u','.join(msg_list)
    else:
        status = AgentCheck.OK
        msg = u'Ok'

    pynvml.nvmlShutdown()

    self.service_check('nvml.check', status, message=msg)
def exit(self, widget, ev):
    pynvml.nvmlShutdown()
    Gtk.main_quit()
    quit()