def checkGPUsAvailability(n_gpus=1):
    '''
    Test that the first 'n_gpus' GPUs have free memory.
    OUT:
        True: if they do
        False: if not
    '''
    # For every GPU to check
    for i_gpu in range(n_gpus):
        # Access the memory used by the i-th GPU
        try:
            nvidia_smi.nvmlInit()
            handle = nvidia_smi.nvmlDeviceGetHandleByIndex(i_gpu)
            mem_res = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
        except Exception:
            print('Warning: GPU could not be accessed')
            break
        # If more than 1 GB is taken, then stop
        if mem_res.used / (1024.**3) > 1.0:  # more than 1 GB of VRAM in use
            # Report it
            print('Memory used (gpu-%i): %.2f GB' % (i_gpu, mem_res.used / (1024**3)), end='')
            print(' - of total: %.2f GB' % (mem_res.total / (1024**3)))
            return False
    return True
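# Hedged usage sketch (not part of the original source): assumes the module does
# `import nvidia_smi` (the PyPI wrapper around pynvml) at the top level.
import nvidia_smi

if checkGPUsAvailability(n_gpus=2):
    print('First two GPUs look free; launching the job.')
else:
    print('At least one GPU is busy; try again later.')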
def __init__(self, device='cpu'):
    self.log = SummaryWriter()
    if nvidia_smi and device != 'cpu':
        nvidia_smi.nvmlInit()
        self.handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)
    else:
        self.handle = None
def __init__(self, print_time=60, print_current=False, time_step=0.01):
    # Call the Thread class's init function
    super(utilizationGPU, self).__init__()
    self.print_time = print_time
    self.print_current = print_current
    self.time_step = time_step
    self.GPUs = []
    self.occAvgTot = []
    self.occAvgStep = []
    self.memAvgTot = []
    self.memAvgStep = []
    self.running = True
    try:
        nvmlInit()
        self.deviceCount = nvmlDeviceGetCount()
        # Get list of handles #
        logging.info("[GPU] Detected devices are :")
        for i in range(self.deviceCount):
            handle = nvidia_smi.nvmlDeviceGetHandleByIndex(i)
            self.GPUs.append(handle)
            logging.info("[GPU] ..... Device %d : %s" % (i, nvmlDeviceGetName(handle)))
            # Records #
            self.occAvgTot.append(0)
            self.occAvgStep.append(0)
            self.memAvgTot.append(0)
            self.memAvgStep.append(0)
        logging.info("[GPU] Will print usage every %d seconds" % self.print_time)
    except Exception as e:
        logging.error("[GPU] *** Caught exception: %s : %s" % (str(e.__class__), str(e)))
        traceback.print_exc()
def get_gpu_memory():
    import nvidia_smi
    nvidia_smi.nvmlInit()
    handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)
    info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
    print("Used GPU memory: {}%".format((info.used * 100) // info.total))
    nvidia_smi.nvmlShutdown()
def __init__(self, gpus=[]):
    r"""CO2 consumption tracker for deep learning models.
    Look at https://arxiv.org/abs/1906.02243 for details.
    """
    # temporal variables
    self._start = None
    self._step = None
    # power variables
    self._cpu_power = 0
    self._gpu_power = 0
    self._ram_power = 0
    self.total_energy = 0
    # GPU-specific constants
    self._cuda = torch.cuda.is_available()
    print(gpus)
    if self._cuda:
        nvidia_smi.nvmlInit()
        self._handles = [
            nvidia_smi.nvmlDeviceGetHandleByIndex(gpu) for gpu in gpus
        ]
    # energy consumption constants
    self._pue_coeff = 1.58
    self._co2_coeff = 0.477
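# Sketch of how the two constants above are typically combined, following the
# paper the tracker cites (https://arxiv.org/abs/1906.02243). The helper below
# is hypothetical, not part of the original tracker: it scales measured energy
# by the data-centre PUE, then converts kWh to kg of CO2 with an average
# grid-intensity coefficient.
def estimate_co2_kg(total_energy_kwh, pue_coeff=1.58, co2_coeff=0.477):
    """Estimated emissions in kg of CO2 for a given measured energy draw."""
    return pue_coeff * total_energy_kwh * co2_coeff

# e.g. 10 kWh of measured draw -> 1.58 * 10 * 0.477 ~= 7.54 kg CO2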
def __init__(self, batch_size, validation_split=0.2, gpu=0, smooth=0.05):
    self.cuda = torch.cuda.is_available()
    self.gpu = gpu
    self.smooth = smooth
    self.device = torch.device(f"cuda:{self.gpu}" if self.cuda else "cpu")

    if NVIDIA_SMI:
        nvidia_smi.nvmlInit()
        self.handle = nvidia_smi.nvmlDeviceGetHandleByIndex(self.gpu)
        print("Computing in {0} : {1}".format(self.device, nvidia_smi.nvmlDeviceGetName(self.handle)))

    self.batch_size = batch_size
    self.validation_split = validation_split

    kwargs = {'num_workers': 2, 'pin_memory': False} if self.cuda else {}

    self.model = model.Network(95*3+1, 100, 2).to(self.device)
    print('N. total parameters : {0}'.format(sum(p.numel() for p in self.model.parameters() if p.requires_grad)))

    self.dataset = Dataset()

    # Compute the fraction of data for training/validation
    idx = np.arange(self.dataset.n_training)
    self.train_index = idx[0:int((1-validation_split)*self.dataset.n_training)]
    self.validation_index = idx[int((1-validation_split)*self.dataset.n_training):]

    # Define samplers for the training and validation sets
    self.train_sampler = torch.utils.data.sampler.SubsetRandomSampler(self.train_index)
    self.validation_sampler = torch.utils.data.sampler.SubsetRandomSampler(self.validation_index)

    # Data loaders that will inject data during training
    self.train_loader = torch.utils.data.DataLoader(self.dataset, batch_size=self.batch_size,
                                                    sampler=self.train_sampler, shuffle=False, **kwargs)
    self.validation_loader = torch.utils.data.DataLoader(self.dataset, batch_size=self.batch_size,
                                                         sampler=self.validation_sampler, shuffle=False, **kwargs)
def get_gpu_memory(device_idx):
    assert device_idx < NvidiaSmi.total_devices, "device index {} should be less than total devices {}"\
        .format(device_idx, NvidiaSmi.total_devices)
    handle = nvidia_smi.nvmlDeviceGetHandleByIndex(device_idx)
    res = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
    M = 1024**2
    # free, total and used memory, all in MiB
    return res.free / M, res.total / M, res.used / M
def check_gpu_stat():
    nvidia_smi.nvmlInit()
    deviceCount = nvidia_smi.nvmlDeviceGetCount()
    for i in range(deviceCount):
        handle = nvidia_smi.nvmlDeviceGetHandleByIndex(i)
        res = nvidia_smi.nvmlDeviceGetUtilizationRates(handle)
        print(f'gpu{i}: {res.gpu}%, gpu-mem: {res.memory}%')
def __init__(self, batch_size, validation_split=0.2, gpu=0, smooth=0.05, K=3, model_class='conv1d'):
    self.cuda = torch.cuda.is_available()
    self.gpu = gpu
    self.smooth = smooth
    self.device = torch.device(f"cuda:{self.gpu}" if self.cuda else "cpu")
    # self.device = 'cpu'

    self.batch_size = batch_size
    self.model_class = model_class
    self.K = K

    if NVIDIA_SMI:
        nvidia_smi.nvmlInit()
        self.handle = nvidia_smi.nvmlDeviceGetHandleByIndex(self.gpu)
        print("Computing in {0} : {1}".format(
            self.device, nvidia_smi.nvmlDeviceGetName(self.handle)))

    self.validation_split = validation_split

    kwargs = {'num_workers': 4, 'pin_memory': False} if self.cuda else {}

    if model_class == 'conv1d':
        self.model = model.Network(K=self.K, L=32, device=self.device, model_class=model_class).to(self.device)
    if model_class == 'conv2d':
        self.model = model.Network(K=self.K, L=32, NSIDE=16, device=self.device, model_class=model_class).to(self.device)

    print('N. total parameters : {0}'.format(
        sum(p.numel() for p in self.model.parameters() if p.requires_grad)))

    self.train_dataset = Dataset(n_training=20000)
    self.validation_dataset = Dataset(n_training=2000)

    # Data loaders that will inject data during training
    self.train_loader = torch.utils.data.DataLoader(
        self.train_dataset, batch_size=self.batch_size, shuffle=True, drop_last=True, **kwargs)
    self.validation_loader = torch.utils.data.DataLoader(
        self.validation_dataset, batch_size=self.batch_size, shuffle=True, drop_last=True, **kwargs)
def __init__(self, debug_mode=False, challenge_mode=False, track=None, timeout=20.0):
    """
    Init requires scenario as input
    """
    self.scenario = None
    self.scenario_tree = None
    self.scenario_class = None
    self.ego_vehicles = None
    self.other_actors = None

    self._debug_mode = debug_mode
    self._challenge_mode = challenge_mode
    self._track = track
    self._agent = None
    self._running = False
    self._timestamp_last_run = 0.0
    self._timeout = timeout
    self._watchdog = Watchdog(float(self._timeout))

    self.scenario_duration_system = 0.0
    self.scenario_duration_game = 0.0
    self.start_system_time = None
    self.end_system_time = None

    nvidia_smi.nvmlInit()
    self.handle = nvidia_smi.nvmlDeviceGetHandleByIndex(1)

    # Register the scenario tick as callback for the CARLA world
    # Use the callback_id inside the signal handler to allow external interrupts
    signal.signal(signal.SIGINT, self._signal_handler)
def __init__(self, basis_wavefront='zernike', npix_image=128, n_modes=44, n_frames=10, gpu=0, smooth=0.05,
             batch_size=16, arguments=None):
    self.pixel_size = 0.0303
    self.telescope_diameter = 256.0  # cm
    self.central_obscuration = 51.0  # cm
    self.wavelength = 8000.0
    self.n_frames = n_frames
    self.batch_size = batch_size
    self.arguments = arguments

    self.basis_for_wavefront = basis_wavefront
    self.npix_image = npix_image
    self.n_modes = n_modes
    self.gpu = gpu
    self.cuda = torch.cuda.is_available()
    self.device = torch.device(f"cuda:{self.gpu}" if self.cuda else "cpu")

    # Get handles to later check memory and usage of GPUs
    if NVIDIA_SMI:
        nvidia_smi.nvmlInit()
        self.handle = nvidia_smi.nvmlDeviceGetHandleByIndex(self.gpu)
        print("Computing in {0} : {1}".format(
            gpu, nvidia_smi.nvmlDeviceGetName(self.handle)))

    # Define the neural network model
    print("Defining the model...")
    self.model = model.Network(device=self.device, n_modes=self.n_modes, n_frames=self.n_frames,
                               pixel_size=self.pixel_size, telescope_diameter=self.telescope_diameter,
                               central_obscuration=self.central_obscuration, wavelength=self.wavelength,
                               basis_for_wavefront=self.basis_for_wavefront, npix_image=self.npix_image).to(self.device)
    print('N. total parameters : {0}'.format(
        sum(p.numel() for p in self.model.parameters() if p.requires_grad)))

    kwargs = {'num_workers': 1, 'pin_memory': False} if self.cuda else {}

    # Data loaders that will inject data during training
    self.training_dataset = Dataset(
        filename='/scratch1/aasensio/fastcam/training_small.h5',
        n_training_per_star=1000, n_frames=self.n_frames)
    self.train_loader = torch.utils.data.DataLoader(
        self.training_dataset, batch_size=self.batch_size, shuffle=True, drop_last=True, **kwargs)

    self.validation_dataset = Dataset(
        filename='/scratch1/aasensio/fastcam/validation_small.h5',
        n_training_per_star=100, n_frames=self.n_frames, validation=True)
    self.validation_loader = torch.utils.data.DataLoader(
        self.validation_dataset, batch_size=self.batch_size, shuffle=True, drop_last=True, **kwargs)
def get_mem_info(device_id):
    gpu_list = [device_id]
    nvidia_smi.nvmlInit()
    handle = [nvidia_smi.nvmlDeviceGetHandleByIndex(i) for i in gpu_list]
    res = [nvidia_smi.nvmlDeviceGetMemoryInfo(item) for item in handle]
    res = [100 * item.used / item.total for item in res]
    nvidia_smi.nvmlShutdown()
    return res[0]
def show_memory_usage():
    nvidia_smi.nvmlInit()
    handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)  # GPU number
    mem_res = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
    # print('=' * 50)
    # print(f'mem: {mem_res.used / (1024 ** 3)} (GiB)')  # usage in GiB
    print(f'mem usage: {100 * (mem_res.used / mem_res.total):.3f}%')  # percentage
def get_gpu_temp():
    try:
        nvmlInit()
        gpu = nvmlDeviceGetHandleByIndex(0)
        gpu_temp = nvmlDeviceGetTemperature(gpu, NVML_TEMPERATURE_GPU)
        return gpu_temp
    except NVMLError:
        return None
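# The bare NVML names above come from pynvml; a minimal import-and-call sketch
# (assumption, not from the original module, which may star-import instead).
from pynvml import (nvmlInit, nvmlDeviceGetHandleByIndex,
                    nvmlDeviceGetTemperature, NVML_TEMPERATURE_GPU, NVMLError)

temp = get_gpu_temp()
print(f'GPU 0 temperature: {temp} C' if temp is not None else 'NVML unavailable')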
def print_gpu_info(idx=0):
    nvidia_smi.nvmlInit()
    handle = nvidia_smi.nvmlDeviceGetHandleByIndex(idx)
    info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
    print("Total memory:", info.total)
    print("Free memory:", info.free)
    print("Used memory:", info.used)
    nvidia_smi.nvmlShutdown()
def gpu_memory_tracker():
    """Returns the percentage of NVIDIA GPU memory consumed."""
    nvidia_smi.nvmlInit()
    handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)
    info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
    used = info.used
    total = info.total
    percent = used / total * 100
    return percent
def memory_check():
    nvidia_smi.nvmlInit()
    # card id 0 hardcoded here; there is also a call to get all available card ids, so we could iterate
    handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)
    mem_res = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
    mbs = mem_res.used / (1024**2)
    percent = mem_res.used / mem_res.total  # note: a 0-1 fraction, not a percentage
    return mbs, percent
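# Hedged usage sketch (not in the original): the second return value is a
# 0-1 fraction, so multiply by 100 to report a percentage.
mbs, frac = memory_check()
print(f'GPU 0: {mbs:.0f} MiB in use ({100 * frac:.1f}%)')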
def get():
    handles = []
    output = []
    nvidia_smi.nvmlInit()
    # nvmlDeviceGetCount() returns an int, so iterate over range() of it
    for device_id in range(nvidia_smi.nvmlDeviceGetCount()):
        handles.append(nvidia_smi.nvmlDeviceGetHandleByIndex(device_id))
    for handle in handles:
        res = nvidia_smi.nvmlDeviceGetUtilizationRates(handle)
        output.append({'usage': res.gpu, 'memory': res.memory})
    return output
def Available_GPUs(self):
    available = []
    for i in range(self.total_gpus):
        handle = nvidia_smi.nvmlDeviceGetHandleByIndex(i)
        res = nvidia_smi.nvmlDeviceGetUtilizationRates(handle)
        mem_res = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
        if res.gpu < 30 and (mem_res.used / mem_res.total * 100) < 30:
            available.append(i)
    return available
def Watch_fin():
    nvidia_smi.nvmlInit()
    handle = nvidia_smi.nvmlDeviceGetHandleByIndex(1)
    res = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
    time.sleep(1)
    if res.used == 0:
        return 0
    else:
        return 1
def get_usage(gpu_list=None, **kwargs):
    """ Track GPU memory usage. """
    _ = kwargs
    gpu_list = gpu_list or [0]
    nvidia_smi.nvmlInit()
    handle = [nvidia_smi.nvmlDeviceGetHandleByIndex(i) for i in gpu_list]
    res = [nvidia_smi.nvmlDeviceGetMemoryInfo(item) for item in handle]
    res = [100 * item.used / item.total for item in res]
    nvidia_smi.nvmlShutdown()
    return res
def get_usage(gpu_list=None, **kwargs):
    """ Track GPU memory utilization. """
    _ = kwargs
    gpu_list = gpu_list or [0]
    nvidia_smi.nvmlInit()
    handle = [nvidia_smi.nvmlDeviceGetHandleByIndex(i) for i in gpu_list]
    res = [nvidia_smi.nvmlDeviceGetUtilizationRates(item) for item in handle]
    return [item.memory for item in res]
def gpu_usage():
    nvidia_smi.nvmlInit()
    # card id 0 hardcoded here; there is also a call to get all available card ids, so we could iterate
    handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)
    info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
    usage = info.used
    nvidia_smi.nvmlShutdown()
    return usage
def available_GPUs(total_gpus):
    available_gpus = []
    for i in range(total_gpus):
        handle = nvidia_smi.nvmlDeviceGetHandleByIndex(i)
        res = nvidia_smi.nvmlDeviceGetUtilizationRates(handle)
        mem_res = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
        # Jon heuristically defines what it means for a GPU to be available
        if res.gpu < 30 and (mem_res.used / mem_res.total * 100) < 30:
            available_gpus.append(i)
    return available_gpus
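# Hedged usage sketch (not in the original): pin the job to the first idle
# device found by the heuristic above. Assumes NVML has not been initialised
# yet and that the device count comes from nvidia_smi.nvmlDeviceGetCount().
import os
import nvidia_smi

nvidia_smi.nvmlInit()
idle = available_GPUs(nvidia_smi.nvmlDeviceGetCount())
if idle:
    os.environ['CUDA_VISIBLE_DEVICES'] = str(idle[0])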
def use_gpu():
    nvidia_smi.nvmlInit()
    handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)
    info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
    nvidia_smi.nvmlShutdown()
    # True if more than ~1 GB is already in use
    return info.used > 1000000000
def get_max_data_group_size():
    nvidia_smi.nvmlInit()
    handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)
    info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
    total_memory = info.total
    if total_memory >= 12 * (10 ** 9):
        return 2 ** 12
    elif total_memory >= 6 * (10 ** 9):
        return 2 ** 11
    else:
        os.environ["CUDA_VISIBLE_DEVICES"] = ""
        return 2 ** 12
def get_device(gpuID=False):
    """Checks available GPUs and selects the one with the most available memory

    Parameters
    ----------
    gpuID: bool or int
        whether to use GPU, or the device ID of a specific GPU to use. If False,
        use only CPU. If True, attempts to find the GPU with most available memory.

    Returns
    -------
    device : jax.device
        handle to gpu or cpu device selected
    """
    import jax

    if gpuID is False:
        return jax.devices('cpu')[0]

    try:
        gpus = jax.devices('gpu')
        # did the user request a specific GPU?
        if isinstance(gpuID, int) and gpuID < len(gpus):
            return gpus[gpuID]
        if isinstance(gpuID, int):
            from desc.backend import TextColors
            # ID was not valid
            warnings.warn(TextColors.WARNING +
                          'gpuID did not match any found devices, trying default gpu option' +
                          TextColors.ENDC)
        # find all available options and see which has the most space
        import nvidia_smi
        nvidia_smi.nvmlInit()
        maxmem = 0
        gpu = gpus[0]
        for i in range(len(gpus)):
            handle = nvidia_smi.nvmlDeviceGetHandleByIndex(i)
            info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
            if info.free > maxmem:
                maxmem = info.free
                gpu = gpus[i]
        nvidia_smi.nvmlShutdown()
        return gpu
    except Exception:
        from desc.backend import TextColors
        warnings.warn(TextColors.WARNING + 'No GPU found, falling back to CPU' + TextColors.ENDC)
        return jax.devices('cpu')[0]
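# Hedged usage sketch (not in the original): place an array on whichever
# device get_device picks; jax.device_put is the standard JAX call for this.
import jax.numpy as jnp
from jax import device_put

device = get_device(gpuID=True)   # pick the GPU with the most free memory
x = device_put(jnp.ones((1024, 1024)), device)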
def on_train_batch_begin(self, batch, logs=None):
    nvidia_smi.nvmlInit()
    # card id 0 hardcoded here; there is also a call to get all available card ids, so we could iterate
    handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)
    res = nvidia_smi.nvmlDeviceGetUtilizationRates(handle)
    res1 = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
    # GPUs = GPU.getGPUs()
    # gpu = GPUs[0]
    print(f'gpu: {res.gpu}%, gpu-mem: {res.memory}%')
def check_cuda_memory():
    nvidia_smi.nvmlInit()
    deviceCount = nvidia_smi.nvmlDeviceGetCount()
    for i in range(deviceCount):
        handle = nvidia_smi.nvmlDeviceGetHandleByIndex(i)
        info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
        print("Device {}: {}, Memory : ({:.2f}% free): {} (total), {} (free), {} (used)"
              .format(i, nvidia_smi.nvmlDeviceGetName(handle), 100 * info.free / info.total,
                      info.total, info.free, info.used))
    nvidia_smi.nvmlShutdown()
    return
def get_gpu_info(gpu_id=None):
    """
    Get gpu-info regarding gpu_id
    :param gpu_id: gpu bus id
    :return mem_used: used memory in MiB
    :return mem_total: total memory in MiB
    """
    if gpu_id is None:
        gpu_id = int(os.environ["CUDA_VISIBLE_DEVICES"])
    handle = nvidia_smi.nvmlDeviceGetHandleByIndex(int(gpu_id))
    mem_res = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
    mem_used = mem_res.used / (1024**2)
    mem_total = mem_res.total / (1024**2)
    return mem_used, mem_total, gpu_id