def measure_gpu_usage(self):
    from py3nvml.py3nvml import nvmlInit, nvmlDeviceGetCount, nvmlDeviceGetHandleByIndex, \
        nvmlDeviceGetMemoryInfo, nvmlDeviceGetName, nvmlShutdown, NVMLError

    max_gpu_usage = []
    gpu_name = []
    try:
        nvmlInit()
        deviceCount = nvmlDeviceGetCount()
        max_gpu_usage = [0 for i in range(deviceCount)]
        gpu_name = [
            nvmlDeviceGetName(nvmlDeviceGetHandleByIndex(i))
            for i in range(deviceCount)
        ]
        while True:
            for i in range(deviceCount):
                info = nvmlDeviceGetMemoryInfo(nvmlDeviceGetHandleByIndex(i))
                max_gpu_usage[i] = max(max_gpu_usage[i], info.used / 1024**2)
            sleep(0.005)  # 5ms
            if not self.keep_measuring:
                break
        nvmlShutdown()
        return [{
            "device_id": i,
            "name": gpu_name[i],
            "max_used_MB": max_gpu_usage[i]
        } for i in range(deviceCount)]
    except NVMLError as error:
        if not self.silent:
            self.logger.error("Error fetching GPU information using nvml: %s", error)
        return None
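# A minimal usage sketch (an assumption, not part of the original snippet):
# measure_gpu_usage above is meant to run on a worker thread while the workload
# executes, with self.keep_measuring acting as the stop flag. The names
# profile_gpu_memory, monitor and run_workload are hypothetical, used only for
# illustration of how the sampling loop is typically driven.
from concurrent.futures import ThreadPoolExecutor

def profile_gpu_memory(monitor, run_workload):
    # monitor: any object exposing keep_measuring and measure_gpu_usage() as above
    monitor.keep_measuring = True
    with ThreadPoolExecutor() as executor:
        future = executor.submit(monitor.measure_gpu_usage)  # start sampling in the background
        run_workload()                    # code whose peak GPU memory we want to observe
        monitor.keep_measuring = False    # ask the sampling loop to exit
        return future.result()            # per-device peak-memory dicts, or None on NVML error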
def _get_nvml_cuda_mapping():
    vis = os.environ.get('CUDA_VISIBLE_DEVICES')
    if vis is None:
        return [
            py3nvml.nvmlDeviceGetHandleByIndex(i)
            for i in range(torch.cuda.device_count())
        ]
    return [
        py3nvml.nvmlDeviceGetHandleByIndex(i)
        for i in map(int, vis.split(','))
    ]
def measure_gpu_usage(self) -> Optional[List[Dict[str, Any]]]:
    from py3nvml.py3nvml import (
        NVMLError,
        nvmlDeviceGetCount,
        nvmlDeviceGetHandleByIndex,
        nvmlDeviceGetMemoryInfo,
        nvmlDeviceGetName,
        nvmlInit,
        nvmlShutdown,
    )

    max_gpu_usage = []
    gpu_name = []
    try:
        nvmlInit()
        device_count = nvmlDeviceGetCount()
        if not isinstance(device_count, int):
            logger.error(f"nvmlDeviceGetCount result is not integer: {device_count}")
            return None

        max_gpu_usage = [0 for i in range(device_count)]
        gpu_name = [
            nvmlDeviceGetName(nvmlDeviceGetHandleByIndex(i))
            for i in range(device_count)
        ]
        while True:
            for i in range(device_count):
                info = nvmlDeviceGetMemoryInfo(nvmlDeviceGetHandleByIndex(i))
                if isinstance(info, str):
                    logger.error(f"nvmlDeviceGetMemoryInfo returns str: {info}")
                    return None
                max_gpu_usage[i] = max(max_gpu_usage[i], info.used / 1024**2)
            sleep(0.005)  # 5ms
            if not self.keep_measuring:
                break

        nvmlShutdown()
        return [{
            "device_id": i,
            "name": gpu_name[i],
            "max_used_MB": max_gpu_usage[i],
        } for i in range(device_count)]
    except NVMLError as error:
        logger.error("Error fetching GPU information using nvml: %s", error)
        return None
def __init__(self): """Constructor.""" nvml.nvmlInit() self._device_count = self.get_device_count() self._device_handlers = list() for i in range(self._device_count): self._device_handlers.append(nvml.nvmlDeviceGetHandleByIndex(i))
def query_gpu(index: int) -> Tuple[int, int, int]:
    h = nvml.nvmlDeviceGetHandleByIndex(index)

    #
    # Get memory info.
    #
    mem_info = try_get_info(nvml.nvmlDeviceGetMemoryInfo, h)
    if mem_info != 'N/A':
        mem_used = mem_info.used >> 20
        mem_total = mem_info.total >> 20
    else:
        mem_used = 0
        mem_total = 0

    #
    # Get utilization info
    #
    util = try_get_info(nvml.nvmlDeviceGetUtilizationRates, h)
    if util != 'N/A':
        gpu_util = util.gpu
    else:
        gpu_util = 0

    return mem_used, mem_total, gpu_util
def inference_speed_memory(self, batch_size, seq_length):
    # input_ids = np.random.randint(0, self.vocab_size, (batch_size, seq_length))
    key = jax.random.PRNGKey(0)
    input_ids = jax.random.randint(key, (batch_size, seq_length), 0, self.vocab_size)

    @jax.jit
    def ref_step():
        out = self.model(input_ids=input_ids)
        return out[0]

    if jax.local_devices()[0].platform == 'gpu':
        nvml.nvmlInit()
        ref_step().block_until_ready()
        handle = nvml.nvmlDeviceGetHandleByIndex(0)
        meminfo = nvml.nvmlDeviceGetMemoryInfo(handle)
        max_bytes_in_use = meminfo.used
        memory = Memory(max_bytes_in_use)
        # shutdown nvml
        nvml.nvmlShutdown()
    else:
        memory = None

    timeit.repeat("ref_step().block_until_ready()", repeat=1, number=2, globals=locals())

    if self.jit:
        runtimes = timeit.repeat("ref_step().block_until_ready()",
                                 repeat=self.repeat, number=3, globals=locals())
    else:
        with jax.disable_jit():
            runtimes = timeit.repeat("ref_step().block_until_ready()",
                                     repeat=self.repeat, number=3, globals=locals())

    return float(np.min(runtimes) / 3.0), memory
def getCUDAEnvironment():
    """ Get the CUDA runtime environment parameters (number of cards etc.). """
    rdict = dict()
    rdict['first_available_device_index'] = None
    rdict['device_count'] = 0

    try:
        nvml.nvmlInit()
        rdict['device_count'] = nvml.nvmlDeviceGetCount()
    except Exception:
        print('WARNING: At least one of (py3nvml.nvml, CUDA) is not available. Will continue without GPU.')
        return rdict

    for i in range(rdict['device_count']):
        memory_info = nvml.nvmlDeviceGetMemoryInfo(nvml.nvmlDeviceGetHandleByIndex(i))
        memory_usage_percentage = memory_info.used / memory_info.total
        if memory_usage_percentage <= 0.1:
            rdict['first_available_device_index'] = i
            break

    nvml.nvmlShutdown()
    return rdict
def collect(self) -> Generator:
    hashrate = GaugeMetricFamily('miner_hashrate', 'Hashrate', labels=['gpu_id', 'type'])
    efficiency = GaugeMetricFamily('miner_efficiency', 'Efficiency', labels=['gpu_id', 'type'])
    pool_shares = GaugeMetricFamily('miner_pool_shares', 'Pool Shares', labels=['type'])
    uptime = GaugeMetricFamily('miner_uptime', 'Uptime', labels=['type'])

    data = BMinerCollector.query_miner(self.host, self.port)

    uptime.add_metric(['miner'], int(time.time()) - data['start_time'])
    uptime.add_metric(['connection'], 0)
    pool_shares.add_metric(['accepted'], data['stratum']['accepted_shares'])
    pool_shares.add_metric(['rejected'], data['stratum']['rejected_shares'])

    for key, gpu in data['miners'].items():
        # setting CUDA_DEVICE_ORDER=PCI_BUS_ID env var is a must, otherwise cuda id and bminer id are different!
        gpu_id = nvml.nvmlDeviceGetUUID(nvml.nvmlDeviceGetHandleByIndex(int(key)))
        hashrate.add_metric([gpu_id, 'current'], gpu['solver']['solution_rate'])
        efficiency.add_metric([gpu_id, 'current'],
                              round(gpu['solver']['solution_rate'] / gpu['device']['power'], 2))

    yield uptime
    yield hashrate
    yield efficiency
    yield pool_shares
def get_gpu_info_by_nvml(self) -> Dict:
    """Get GPU info using nvml"""
    gpu_info_list = []
    driver_version = None
    try:
        nvmlInit()
        driver_version = nvmlSystemGetDriverVersion()
        deviceCount = nvmlDeviceGetCount()
        for i in range(deviceCount):
            handle = nvmlDeviceGetHandleByIndex(i)
            info = nvmlDeviceGetMemoryInfo(handle)
            gpu_info = {}
            gpu_info["memory_total"] = info.total
            gpu_info["memory_available"] = info.free
            gpu_info["name"] = nvmlDeviceGetName(handle)
            gpu_info_list.append(gpu_info)
        nvmlShutdown()
    except NVMLError as error:
        if not self.silent:
            self.logger.error("Error fetching GPU information using nvml: %s", error)
        return None

    result = {"driver_version": driver_version, "devices": gpu_info_list}

    if 'CUDA_VISIBLE_DEVICES' in environ:
        result["cuda_visible"] = environ['CUDA_VISIBLE_DEVICES']
    return result
def get_free_gpus():
    """ For an N gpu system, returns a list of N boolean values. The nth value
    will be True if no process was running on the nth gpu."""

    # Try to connect with the NVIDIA drivers
    logger = logging.getLogger(__name__)
    try:
        py3nvml.nvmlInit()
    except:
        str_ = """Couldn't connect to nvml drivers. Check they are installed correctly."""
        warnings.warn(str_, RuntimeWarning)
        logger.warning(str_)
        return []

    num_gpus = py3nvml.nvmlDeviceGetCount()
    gpu_free = [False] * num_gpus
    for i in range(num_gpus):
        try:
            h = py3nvml.nvmlDeviceGetHandleByIndex(i)
        except:
            continue
        procs = try_get_info(py3nvml.nvmlDeviceGetComputeRunningProcesses, h,
                             ['something'])
        if len(procs) == 0:
            gpu_free[i] = True

    return gpu_free
def gpu_profile(frame, event, arg):
    # it is _about to_ execute (!)
    global last_tensor_sizes
    global lineno, func_name, filename, module_name

    if event == 'line':
        try:
            # about _previous_ line (!)
            if lineno is not None:
                py3nvml.nvmlInit()
                handle = py3nvml.nvmlDeviceGetHandleByIndex(int(os.environ['GPU_DEBUG']))
                meminfo = py3nvml.nvmlDeviceGetMemoryInfo(handle)
                line = linecache.getline(filename, lineno)
                where_str = module_name + ' ' + func_name + ':' + str(lineno)

                with open(gpu_profile_fn, 'a+') as f:
                    f.write(f"{where_str:<50}"
                            f":{meminfo.used/1024**2:<7.1f}Mb "
                            f"{line.rstrip()}\n")

                    if print_tensor_sizes is True:
                        for tensor in get_tensors():
                            if not hasattr(tensor, 'dbg_alloc_where'):
                                tensor.dbg_alloc_where = where_str
                        new_tensor_sizes = {(type(x), tuple(x.size()), x.dbg_alloc_where)
                                            for x in get_tensors()}
                        for t, s, loc in new_tensor_sizes - last_tensor_sizes:
                            f.write(f'+ {loc:<50} {str(s):<20} {str(t):<10}\n')
                        for t, s, loc in last_tensor_sizes - new_tensor_sizes:
                            f.write(f'- {loc:<50} {str(s):<20} {str(t):<10}\n')
                        last_tensor_sizes = new_tensor_sizes
                py3nvml.nvmlShutdown()

            # save details about line _to be_ executed
            lineno = None

            func_name = frame.f_code.co_name
            filename = frame.f_globals["__file__"]
            if (filename.endswith(".pyc") or filename.endswith(".pyo")):
                filename = filename[:-1]
            module_name = frame.f_globals["__name__"]
            lineno = frame.f_lineno

            if 'gmwda-pytorch' not in os.path.dirname(os.path.abspath(filename)):
                lineno = None  # skip current line evaluation

            if ('car_datasets' in filename
                    or '_exec_config' in func_name
                    or 'gpu_profile' in module_name
                    or 'tee_stdout' in module_name):
                lineno = None  # skip current

            return gpu_profile

        except (KeyError, AttributeError) as e:
            print(e)

    return gpu_profile
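# A minimal usage sketch (an assumption, not part of the original snippet): gpu_profile
# above has the (frame, event, arg) signature of a Python trace callback, so it is
# normally installed with sys.settrace before the code to be profiled runs; GPU_DEBUG
# selects which NVML device index to sample, and run_training_step is a hypothetical
# workload name.
import os
import sys

os.environ['GPU_DEBUG'] = '0'   # NVML index of the device to watch
sys.settrace(gpu_profile)       # sample GPU memory on every executed line
try:
    run_training_step()         # hypothetical workload profiled line by line
finally:
    sys.settrace(None)          # always remove the trace hook afterwards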
def read_top_card_memory_in_bytes():
    # pylint: disable=no-member
    # pylint incorrectly detects that function nvmlDeviceGetMemoryInfo returns str
    return self.__nvml_get_or_else(lambda: [
        nvmlDeviceGetMemoryInfo(nvmlDeviceGetHandleByIndex(card_index)).total
        for card_index in range(nvmlDeviceGetCount())
    ], default=0)
def environment_info(self): if self._environment_info is None: info = {} info["transformers_version"] = version info["framework"] = self.framework if self.framework == "PyTorch": info["use_torchscript"] = self.args.torchscript if self.framework == "TensorFlow": info["eager_mode"] = self.args.eager_mode info["use_xla"] = self.args.use_xla info["framework_version"] = self.framework_version info["python_version"] = platform.python_version() info["system"] = platform.system() info["cpu"] = platform.processor() info["architecture"] = platform.architecture()[0] info["date"] = datetime.date(datetime.now()) info["time"] = datetime.time(datetime.now()) info["fp16"] = self.args.fp16 info["use_multiprocessing"] = self.args.do_multi_processing info["only_pretrain_model"] = self.args.only_pretrain_model if is_psutil_available(): info["cpu_ram_mb"] = bytes_to_mega_bytes(psutil.virtual_memory().total) else: logger.warning( "Psutil not installed, we won't log available CPU memory. " "Install psutil (pip install psutil) to log available CPU memory." ) info["cpu_ram_mb"] = "N/A" info["use_gpu"] = self.args.is_gpu if self.args.is_gpu: info["num_gpus"] = 1 # TODO(PVP) Currently only single GPU is supported if is_py3nvml_available(): nvml.nvmlInit() handle = nvml.nvmlDeviceGetHandleByIndex(self.args.device_idx) info["gpu"] = nvml.nvmlDeviceGetName(handle) info["gpu_ram_mb"] = bytes_to_mega_bytes(nvml.nvmlDeviceGetMemoryInfo(handle).total) info["gpu_power_watts"] = nvml.nvmlDeviceGetPowerManagementLimit(handle) / 1000 info["gpu_performance_state"] = nvml.nvmlDeviceGetPerformanceState(handle) nvml.nvmlShutdown() else: logger.warning( "py3nvml not installed, we won't log GPU memory usage. " "Install py3nvml (pip install py3nvml) to log information about GPU." ) info["gpu"] = "N/A" info["gpu_ram_mb"] = "N/A" info["gpu_power_watts"] = "N/A" info["gpu_performance_state"] = "N/A" info["use_tpu"] = self.args.is_tpu # TODO(PVP): See if we can add more information about TPU # see: https://github.com/pytorch/xla/issues/2180 self._environment_info = info return self._environment_info
def get_device_handles():
    """Get a list of NVML device handles, one per device.

    Can throw NVMLError.
    """
    return [
        pynvml.nvmlDeviceGetHandleByIndex(i)
        for i in range(pynvml.nvmlDeviceGetCount())
    ]
def gpu_profile(frame, event):
    global last_meminfo_used, last_tensor_sizes
    global lineno, func_name, filename, module_name

    if event == 'line':
        try:
            if lineno:
                py3nvml.nvmlInit()
                handle = py3nvml.nvmlDeviceGetHandleByIndex(int(os.environ["GPU_DEBUG"]))
                meminfo = py3nvml.nvmlDeviceGetMemoryInfo(handle)
                line = linecache.getline(filename, lineno)
                where_str = module_name + ' ' + func_name + ' ' + str(lineno)

                new_meminfo_used = meminfo.used
                mem_display = new_meminfo_used - last_meminfo_used if use_incremental else new_meminfo_used
                with open(gpu_profile_fn, "a+") as f:
                    f.write(f"{where_str:<50}"
                            f":{(mem_display) / 1024 ** 2:<7.1f}Mb "
                            f"{line.rstrip()}\n")

                    last_meminfo_used = new_meminfo_used
                    if print_tensor_sizes:
                        for tensor in get_tensors():
                            if not hasattr(tensor, 'dbg_alloc_where'):
                                tensor.dbg_alloc_where = where_str
                        new_tensor_sizes = {(type(x), tuple(x.size()), x.dbg_alloc_where)
                                            for x in get_tensors()}
                        for t, s, loc in new_tensor_sizes - last_tensor_sizes:
                            f.write(f'+ {loc:<50} {str(s):<20} {str(t):<10}\n')
                        for t, s, loc in last_tensor_sizes - new_tensor_sizes:
                            f.write(f'- {loc:<50} {str(s):<20} {str(t):<10}\n')
                        last_tensor_sizes = new_tensor_sizes
                py3nvml.nvmlShutdown()

            lineno = None

            func_name = frame.f_code.co_name
            filename = frame.f_globals["__file__"]
            module_name = frame.f_globals["__name__"]
            lineno = frame.f_lineno

            if 'Beta' not in os.path.dirname(os.path.abspath(filename)):
                lineno = None

            return gpu_profile

        except (KeyError, AttributeError):
            pass

    return gpu_profile
def __init__(self, index: int):
    self.index = index
    self.handle = py3nvml.nvmlDeviceGetHandleByIndex(index)

    self.name = py3nvml.nvmlDeviceGetName(self.handle)

    self.memory = Memory(self.handle)
    self.utilization = Utilization(self.handle)
    self.processes = Processes(self.handle)

    self.update()
def __init__(self, report=None, devices=None, quiet=False,
             always_suffix=False, output=print, verbose_once=True):
    super(self.__class__, self).__init__()
    global nvml
    self.output = output
    if nvml is not None:
        try:
            nvml.nvmlInit()
        except (OSError, nvml.NVMLError_LibraryNotFound):
            # the python library might be installed, but not the drivers...
            nvml = None
    if nvml is None:
        if not quiet:
            self.output("Could not load py3nvml, cannot report any nvidia device statistics.")
        report = []
    else:
        device_count = nvml.nvmlDeviceGetCount()
        if devices is None:
            devices = list(range(device_count))
        else:
            devices = [
                int(device) for device in devices
                if 0 <= int(device) < device_count
            ]
        self.devices = devices
        self.deviceHandles = [
            nvml.nvmlDeviceGetHandleByIndex(device) for device in devices
        ]
        if not quiet:
            for n, handle in enumerate(self.deviceHandles):
                self.output("Collecting statistics for device #% 2d: %s" %
                            (n, nvml.nvmlDeviceGetName(handle)))
    if report is None:
        report = ['temperature', 'utilization_gpu']
    elif report == 'all':
        report = list(self.reportable_values.keys())
    self.verbose_once = verbose_once
    self.report = report
    self.always_suffix = always_suffix
def gpus(self):
    out = []
    with nvml_manager():
        if not pynvml:
            return out
        cpu_to_node = self.cpu_nodes()
        n_devices = pynvml.nvmlDeviceGetCount()
        for i in range(n_devices):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            out.append(GPU(handle, cpu_to_node))
    return out
def gpu_info():
    "Returns a tuple of (GPU ID, GPU Description, GPU % Utilization)"
    nvmlInit()
    deviceCount = nvmlDeviceGetCount()
    info = []
    for i in range(0, deviceCount):
        handle = nvmlDeviceGetHandleByIndex(i)
        util = nvmlDeviceGetUtilizationRates(handle)
        desc = nvmlDeviceGetName(handle)
        info.append((i, desc, util.gpu))  # ['GPU %i - %s' % (i, desc)] = util.gpu
    return info
def test_nvidia_device(idx: int):
    from py3nvml import py3nvml as nvml

    handle = nvml.nvmlDeviceGetHandleByIndex(idx)
    pciInfo = nvml.nvmlDeviceGetPciInfo(handle)

    brands = {
        nvml.NVML_BRAND_UNKNOWN: "Unknown",
        nvml.NVML_BRAND_QUADRO: "Quadro",
        nvml.NVML_BRAND_TESLA: "Tesla",
        nvml.NVML_BRAND_NVS: "NVS",
        nvml.NVML_BRAND_GRID: "Grid",
        nvml.NVML_BRAND_GEFORCE: "GeForce"
    }

    inspect(
        idx=idx,
        # id=pciInfo.busId,
        # uuid=nvml.nvmlDeviceGetUUID(handle),
        name=nvml.nvmlDeviceGetName(handle),
        # brand=brands[nvml.nvmlDeviceGetBrand(handle)],
        # multi_gpu=nvml.nvmlDeviceGetMultiGpuBoard(handle),
        # pcie_link=nvml.nvmlDeviceGetCurrPcieLinkWidth(handle),
        fan=nvml.nvmlDeviceGetFanSpeed(handle),
        # power=nvml.nvmlDeviceGetPowerState(handle),
        mem_total=nvml.nvmlDeviceGetMemoryInfo(handle).total,
        mem_used=nvml.nvmlDeviceGetMemoryInfo(handle).used,
        util_gpu=nvml.nvmlDeviceGetUtilizationRates(handle).gpu,
        # util_mem=nvml.nvmlDeviceGetUtilizationRates(handle).memory,
        temp=nvml.nvmlDeviceGetTemperature(handle, nvml.NVML_TEMPERATURE_GPU),
        power=nvml.nvmlDeviceGetPowerUsage(handle),
        power_limit=nvml.nvmlDeviceGetPowerManagementLimit(handle),
        # display=nvml.nvmlDeviceGetDisplayMode(handle),
        display_active=nvml.nvmlDeviceGetDisplayActive(handle),
    )
    logger.log()

    procs = nvml.nvmlDeviceGetGraphicsRunningProcesses(handle)
    for p in procs:
        inspect(name=nvml.nvmlSystemGetProcessName(p.pid), pid=p.pid, mem=p.usedGpuMemory)

    procs = nvml.nvmlDeviceGetComputeRunningProcesses(handle)
    for p in procs:
        inspect(name=nvml.nvmlSystemGetProcessName(p.pid), pid=p.pid, mem=p.usedGpuMemory)

    logger.log()
def get_gpu_info() -> Optional[List[Dict[str, Any]]]:
    from py3nvml.py3nvml import (
        NVMLError,
        nvmlDeviceGetCount,
        nvmlDeviceGetHandleByIndex,
        nvmlDeviceGetMemoryInfo,
        nvmlDeviceGetName,
        nvmlInit,
        nvmlShutdown,
    )

    try:
        nvmlInit()
        result = []
        device_count = nvmlDeviceGetCount()
        if not isinstance(device_count, int):
            return None

        for i in range(device_count):
            info = nvmlDeviceGetMemoryInfo(nvmlDeviceGetHandleByIndex(i))
            if isinstance(info, str):
                return None
            result.append({
                "id": i,
                "name": nvmlDeviceGetName(nvmlDeviceGetHandleByIndex(i)),
                "total": info.total,
                "free": info.free,
                "used": info.used,
            })
        nvmlShutdown()
        return result
    except NVMLError as error:
        print("Error fetching GPU information using nvml: %s" % error)
        return None
def get_gpu_info() -> Tuple[Optional[str], Optional[List[GpuInfo]]]:
    """
    Get driver version and list of ``GpuInfo``, if available.
    """
    try:
        nvml.nvmlInit()
    except nvml.NVMLError:
        # Not available.
        return None, None

    driver_version: str = nvml.nvmlSystemGetDriverVersion()
    gpus: List[GpuInfo] = []

    device_count: int = nvml.nvmlDeviceGetCount()
    for i in range(device_count):
        handle = nvml.nvmlDeviceGetHandleByIndex(i)
        name = try_get_info(nvml.nvmlDeviceGetName, handle)
        fan_speed = try_get_info(nvml.nvmlDeviceGetFanSpeed, handle, default=0)
        temp = try_get_info(
            lambda h: nvml.nvmlDeviceGetTemperature(h, nvml.NVML_TEMPERATURE_GPU),
            handle,
            default=0,
        )
        mem_info = try_get_info(nvml.nvmlDeviceGetMemoryInfo, handle)
        if mem_info:
            mem_used = mem_info.used >> 20
            mem_total = mem_info.total >> 20
        else:
            mem_used = 0
            mem_total = 0
        util = try_get_info(nvml.nvmlDeviceGetUtilizationRates, handle)
        if util:
            gpu_util = util.gpu
        else:
            gpu_util = 0
        gpus.append(
            GpuInfo(
                id=i,
                name=name,
                mem_usage=mem_used,
                mem_capacity=mem_total,
                utilization=gpu_util,
                temp=temp,
                fan=fan_speed,
            ))

    nvml.nvmlShutdown()
    return driver_version, gpus
def environment_info(self): if self._environment_info is None: info = {} info["gluonnlp_version"] = gluonnlp.__version__ info["framework_version"] = mxnet.__version__ info["python_version"] = platform.python_version() info["system"] = platform.system() info["cpu"] = platform.processor() info["architecture"] = platform.architecture()[0] info["date"] = datetime.date(datetime.now()) info["time"] = datetime.time(datetime.now()) info["fp16"] = self._use_fp16 if is_psutil_available(): info["cpu_ram_mb"] = bytes_to_mega_bytes( psutil.virtual_memory().total) else: logger.warning( "Psutil not installed, we won't log available CPU memory." "Install psutil (pip install psutil) to log available CPU memory." ) info["cpu_ram_mb"] = "N/A" info["use_gpu"] = self._use_gpu if self._use_gpu: info["num_gpus"] = 1 if is_py3nvml_available(): nvml.nvmlInit() handle = nvml.nvmlDeviceGetHandleByIndex(self._device_idx) info["gpu"] = nvml.nvmlDeviceGetName(handle) info["gpu_ram_mb"] = bytes_to_mega_bytes( nvml.nvmlDeviceGetMemoryInfo(handle).total) info[ "gpu_power_watts"] = nvml.nvmlDeviceGetPowerManagementLimit( handle) / 1000 info[ "gpu_performance_state"] = nvml.nvmlDeviceGetPerformanceState( handle) nvml.nvmlShutdown() else: logger.warning( "py3nvml not installed, we won't log GPU memory usage. " "Install py3nvml (pip install py3nvml) to log information about GPU." ) info["gpu"] = "N/A" info["gpu_ram_mb"] = "N/A" info["gpu_power_watts"] = "N/A" info["gpu_performance_state"] = "N/A" self._environment_info = info return self._environment_info
def train_speed_memory(self, batch_size, seq_length):
    key = jax.random.PRNGKey(0)
    input_ids = jax.random.randint(key, (batch_size, seq_length), 0, self.vocab_size)
    targets = jax.random.randint(key, (batch_size, seq_length), 0, self.vocab_size)
    labels = jax.random.randint(key, (batch_size, seq_length), 0, 2)
    # input_ids = np.random.randint(0, self.vocab_size, (batch_size, seq_length))
    # targets = np.random.randint(0, self.vocab_size, (batch_size, seq_length))
    # labels = np.random.randint(0,2, (batch_size, seq_length))

    @jax.jit
    def train_step():
        def loss_fn(params):
            token_mask = jnp.where(labels > 0, 1.0, 0.0).astype(self.dtype)
            logits = self.model(input_ids=input_ids, train=True, params=params,
                                dropout_rng=jax.random.PRNGKey(0))[0]
            loss, normalizing_factor = cross_entropy(logits, targets, token_mask)
            jax.profiler.save_device_memory_profile(
                f"memory/{workload[0]}_{workload[1]}_memory.prof", "gpu")
            return loss / normalizing_factor

        if self.fp16 and jax.local_devices()[0].platform == 'gpu':
            grad_fn = self.dynamic_scale.value_and_grad(loss_fn)
            dyn_scale, is_fin, loss, grad = grad_fn(self.model.params)
        else:
            grad_fn = jax.value_and_grad(loss_fn)
            loss, grad = grad_fn(self.model.params)
        return tree_flatten(grad)[0]

    if jax.local_devices()[0].platform == 'gpu':
        nvml.nvmlInit()
        train_step()
        handle = nvml.nvmlDeviceGetHandleByIndex(0)
        meminfo = nvml.nvmlDeviceGetMemoryInfo(handle)
        max_bytes_in_use = meminfo.used
        memory = Memory(max_bytes_in_use)
        # shutdown nvml
        nvml.nvmlShutdown()
    else:
        memory = None

    # timeit.repeat(train_step,repeat=1,number=2)
    timeit.repeat("for i in train_step():i.block_until_ready()",
                  repeat=1, number=2, globals=locals())
    if self.jit:
        # runtimes = timeit.repeat(train_step,repeat=self.repeat,number=3)
        runtimes = timeit.repeat("for i in train_step():i.block_until_ready()",
                                 repeat=self.repeat, number=3, globals=locals())
    else:
        with jax.disable_jit():
            # runtimes = timeit.repeat(train_step, repeat=self.repeat, number=3)
            runtimes = timeit.repeat("for i in train_step():i.block_until_ready()",
                                     repeat=self.repeat, number=3, globals=locals())

    return float(np.min(runtimes) / 3.0), memory
def run_gpu_mem_counter(do_shutdown=False):
    # Sum used memory for all GPUs
    if not torch.cuda.is_available():
        return 0
    if do_shutdown:
        py3nvml.nvmlInit()
    devices = list(range(py3nvml.nvmlDeviceGetCount()))  # if gpus_to_trace is None else gpus_to_trace
    gpu_mem = 0
    for i in devices:
        handle = py3nvml.nvmlDeviceGetHandleByIndex(i)
        meminfo = py3nvml.nvmlDeviceGetMemoryInfo(handle)
        gpu_mem += meminfo.used
    if do_shutdown:
        py3nvml.nvmlShutdown()
    return gpu_mem
def __init__(self):
    self.labels = ['gpu', 'name', 'driver']
    self.driver = nv.nvmlSystemGetDriverVersion()
    self.n_gpu = nv.nvmlDeviceGetCount()
    self.hnds = [
        nv.nvmlDeviceGetHandleByIndex(i) for i in range(self.n_gpu)
    ]
    self.args = []
    for i, hnd in enumerate(self.hnds):
        args = OrderedDict()
        args['gpu'] = 'gpu%d' % i
        args['name'] = nv.nvmlDeviceGetName(hnd)
        args['driver'] = self.driver
        self.args.append(args)
def get_free_gpus(max_procs=0):
    """
    Checks the number of processes running on your GPUs.

    Parameters
    ----------
    max_procs : int
        Maximum number of procs allowed to run on a gpu for it to be considered
        'available'

    Returns
    -------
    availabilities : list(bool)
        List of length N for an N-gpu system. The nth value will be true, if the
        nth gpu had at most max_procs processes running on it. Set to 0 to look
        for gpus with no procs on it.

    Note
    ----
    If function can't query the driver will return an empty list rather than raise
    an Exception.
    """
    # Try connect with NVIDIA drivers
    logger = logging.getLogger(__name__)
    try:
        py3nvml.nvmlInit()
    except:
        str_ = """Couldn't connect to nvml drivers. Check they are installed correctly."""
        warnings.warn(str_, RuntimeWarning)
        logger.warning(str_)
        return []

    num_gpus = py3nvml.nvmlDeviceGetCount()
    gpu_free = [False] * num_gpus
    for i in range(num_gpus):
        try:
            h = py3nvml.nvmlDeviceGetHandleByIndex(i)
        except:
            continue
        procs = try_get_info(py3nvml.nvmlDeviceGetComputeRunningProcesses, h,
                             ['something'])
        if len(procs) <= max_procs:
            gpu_free[i] = True

    py3nvml.nvmlShutdown()
    return gpu_free
def _measure_memory(self, func: Callable[[], None]) -> [Memory, MemorySummary]:
    try:
        if self.args.trace_memory_line_by_line:
            trace = start_memory_tracing("transformers")

        if self.args.is_tpu:
            # tpu
            raise NotImplementedError(
                "Memory Benchmarking is currently not implemented for TPU. Please disable memory benchmarking with `--no-memory` or `args.memory=False`"
            )
        elif self.args.is_gpu:
            if not is_py3nvml_available():
                logger.warning(
                    "py3nvml not installed, we won't log GPU memory usage. "
                    "Install py3nvml (pip install py3nvml) to log information about GPU."
                )
                memory = "N/A"
            else:
                logger.info(
                    "Measuring total GPU usage on GPU device. Make sure to not have additional processes running on the same GPU."
                )
                # init nvml
                nvml.nvmlInit()
                func()
                handle = nvml.nvmlDeviceGetHandleByIndex(self.args.device_idx)
                meminfo = nvml.nvmlDeviceGetMemoryInfo(handle)
                max_bytes_in_use = meminfo.used
                memory = Memory(max_bytes_in_use)
                # shutdown nvml
                nvml.nvmlShutdown()
        else:
            # cpu
            memory_bytes = measure_peak_memory_cpu(func)
            memory = Memory(memory_bytes) if isinstance(memory_bytes, int) else memory_bytes

        if self.args.trace_memory_line_by_line:
            summary = stop_memory_tracing(trace)
        else:
            summary = None

        return memory, summary
    except RuntimeError as e:
        self.print_fn(f"Doesn't fit on GPU. {e}")
        return "N/A", None
def _get_current_power(self, arrange_next=True):
    # nvmlDeviceGetPowerUsage reports milliwatts; dividing by 1000 accumulates watts
    if self.gpu:
        num_gpus = nvmlDeviceGetCount()
        current_power = 0
        for i in range(num_gpus):
            h = nvmlDeviceGetHandleByIndex(i)
            power = try_get_info(nvmlDeviceGetPowerUsage, h, "-1")
            current_power += power / 1000
        if arrange_next:
            self.schedule.enter(self.interval, 1, self._get_current_power)
        else:
            pass
    else:
        current_power = 0
    self.powers.append(current_power)
    return current_power
def memory_status(msg="", reset_max=True, sync=True): rank = smp.rank() tp_rank = smp.tp_rank() pp_rank = smp.pp_rank() rdp_rank = smp.rdp_rank() local_rank = smp.local_rank() if sync: torch.cuda.synchronize() if rdp_rank != 0: return if py3nvml != None: py3nvml.nvmlInit() handle = py3nvml.nvmlDeviceGetHandleByIndex(local_rank) info = py3nvml.nvmlDeviceGetMemoryInfo(handle) total_used = info.used / 1024**3 total_used_str = f"Totally used GPU memory: {total_used}" else: total_used_str = "" alloced = torch.cuda.memory_allocated(device=local_rank) max_alloced = torch.cuda.max_memory_allocated(device=local_rank) cached = torch.cuda.memory_reserved(device=local_rank) max_cached = torch.cuda.max_memory_reserved(device=local_rank) # convert to GB for printing alloced /= 1024**3 cached /= 1024**3 max_alloced /= 1024**3 max_cached /= 1024**3 print( f'[{msg}] rank {rank} tp_rank {tp_rank} pp_rank {pp_rank} TORCH {torch.__version__}', f'device={local_rank} ' f'alloc {alloced:0.4f} max_alloced {max_alloced:0.4f} ' f'cache {cached:0.4f} max_cached {max_cached:0.4f} ' f'{total_used_str}') if reset_max: torch.cuda.reset_max_memory_cached() torch.cuda.reset_max_memory_allocated() if py3nvml != None: py3nvml.nvmlShutdown()