def allocate(self, size):
    from traceback import extract_stack
    stack = tuple(frm[2] for frm in extract_stack())
    description = self.describe(stack, size)

    histogram = {}
    for bsize, descr in self.blocks.itervalues():
        histogram[bsize, descr] = histogram.get((bsize, descr), 0) + 1

    from pytools import common_prefix
    cpfx = common_prefix(descr for bsize, descr in histogram)

    print >> self.logfile, \
            "\n  Allocation of size %d occurring " \
            "(mem: last_free:%d, free: %d, total:%d) (pool: held:%d, active:%d):" \
            "\n      at: %s" % (
                (size, self.last_free)
                + cuda.mem_get_info()
                + (self.held_blocks, self.active_blocks, description))

    hist_items = sorted(list(histogram.iteritems()))
    for (bsize, descr), count in hist_items:
        print >> self.logfile, \
                "  %s (%d bytes): %dx" % (descr[len(cpfx):], bsize, count)

    if self.interactive:
        raw_input("  [Enter]")

    result = DeviceMemoryPool.allocate(self, size)
    self.blocks[result] = size, description
    self.last_free, _ = cuda.mem_get_info()
    return result
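Nearly every snippet in this collection builds on the same call. A minimal, self-contained sketch of that contract (assuming only a CUDA-capable machine with pycuda installed; the context comes from pycuda.autoinit):

import pycuda.autoinit  # noqa: F401 -- creates a context on the default device
import pycuda.driver as cuda

# mem_get_info() takes no arguments and returns (free_bytes, total_bytes)
# for the device backing the *current* context.
free, total = cuda.mem_get_info()
print("free: %d MB / total: %d MB (%.1f%% free)"
      % (free // (1024 * 1024), total // (1024 * 1024), 100.0 * free / total))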
def test_memleak():
    log.info("test_memleak()")
    from pycuda import driver
    #use the first device for this test
    start_free_memory = None
    for i in range(100):
        d = driver.Device(0)
        context = d.make_context(flags=driver.ctx_flags.SCHED_AUTO | driver.ctx_flags.MAP_HOST)
        if start_free_memory is None:
            start_free_memory, _ = driver.mem_get_info()
        free_memory, total_memory = driver.mem_get_info()
        log.info("%s%% free_memory: %s MB, total_memory: %s MB",
                 str(i).rjust(3), free_memory/1024/1024, total_memory/1024/1024)
        context.pop()
        context.detach()
        w = random.randint(16, 128)*8
        h = random.randint(16, 128)*8
        n = random.randint(2, 10)
        test_encoder(encoder_module, options={}, dimensions=[(w, h)], n_images=n)
    d = driver.Device(0)
    context = d.make_context(flags=driver.ctx_flags.SCHED_AUTO | driver.ctx_flags.MAP_HOST)
    end_free_memory, _ = driver.mem_get_info()
    context.pop()
    context.detach()
    log.info("memory lost: %s MB", (start_free_memory-end_free_memory)/1024/1024)
def __init__(self, init_data, n_generators):

    self.ctx = curr_gpu.make_context()
    self.module = pycuda.compiler.SourceModule(kernels_cuda_src, no_extern_c=True)
    (free, total) = cuda.mem_get_info()
    print("Global memory occupancy: %f%% free" % (free * 100 / total))
    print("Global free memory: %i MB free" % (free / 10 ** 6))

    ################################################################################################################

    self.width_mat = np.int32(init_data.shape[0])
    # self.gpu_init_data = ga.to_gpu(init_data)
    self.gpu_init_data = cuda.mem_alloc(init_data.nbytes)
    cuda.memcpy_htod(self.gpu_init_data, init_data)

    self.cpu_new_data = np.zeros_like(init_data, dtype=np.float32)
    print("size new data = ", self.cpu_new_data.nbytes / 10 ** 6)
    (free, total) = cuda.mem_get_info()
    print("Global memory occupancy: %f%% free" % (free * 100 / total))
    print("Global free memory: %i MB free" % (free / 10 ** 6))

    self.gpu_new_data = cuda.mem_alloc(self.cpu_new_data.nbytes)
    cuda.memcpy_htod(self.gpu_new_data, self.cpu_new_data)
    # self.gpu_new_data = ga.to_gpu(self.cpu_new_data)

    self.cpu_vect_sum = np.zeros((self.width_mat,), dtype=np.float32)
    self.gpu_vect_sum = cuda.mem_alloc(self.cpu_vect_sum.nbytes)
    cuda.memcpy_htod(self.gpu_vect_sum, self.cpu_vect_sum)
    # self.gpu_vect_sum = ga.to_gpu(self.cpu_vect_sum)

    ################################################################################################################

    self.init_rng = self.module.get_function("init_rng")
    self.gen_rand_mat = self.module.get_function("gen_rand_mat")
    self.sum_along_axis = self.module.get_function("sum_along_axis")
    self.norm_along_axis = self.module.get_function("norm_along_axis")
    self.init_vect_sum = self.module.get_function("init_vect_sum")
    self.copy_mat = self.module.get_function("copy_mat")

    ################################################################################################################

    self.n_generators = n_generators
    seed = 1
    self.rng_states = cuda.mem_alloc(
        n_generators
        * characterize.sizeof("curandStateXORWOW", "#include <curand_kernel.h>")
    )
    self.init_rng(
        np.int32(n_generators),
        self.rng_states,
        np.uint64(seed),
        np.uint64(0),
        block=(64, 1, 1),
        grid=(n_generators // 64 + 1, 1),
    )

    (free, total) = cuda.mem_get_info()

    size_block_x = 32
    size_block_y = 32
    n_blocks_x = int(self.width_mat) // (size_block_x) + 1
    n_blocks_y = int(self.width_mat) // (size_block_y) + 1
    self.grid = (n_blocks_x, n_blocks_y, 1)
    self.block = (size_block_x, size_block_y, 1)
def getFreeMemory(show=True):
    ''' Return the free memory of the device. Useful for keeping track of available device memory. '''
    Mb = 1024.*1024.
    Mbytes = float(cuda.mem_get_info()[0])/Mb
    if show:
        print "Free Global Memory: %f Mbytes" % Mbytes
    return cuda.mem_get_info()[0]/Mb
def propagate_eager(self, wavelength, wavefront):
    """
    'Not-Too-Good' version of the propagation on the GPU
    (lots of Memory issues...) Remove in the future
    :param wavelength:
    :param wavefront:
    :return:
    """
    N = self.N_PIX

    free, total = cuda.mem_get_info()
    print("Free: %.2f percent" % (free / total * 100))

    # Pupil Plane -> Image Slicer
    complex_pupil = self.pupil_masks[wavelength] * np.exp(
        1j * 2 * np.pi * self.pupil_masks[wavelength] / wavelength)
    complex_pupil_gpu = gpuarray.to_gpu(np.asarray(complex_pupil, np.complex64))
    plan = cu_fft.Plan(complex_pupil_gpu.shape, np.complex64, np.complex64)
    cu_fft.fft(complex_pupil_gpu, complex_pupil_gpu, plan, scale=True)

    # Add N_slices copies to be Masked
    complex_slicer_cpu = complex_pupil_gpu.get()
    complex_pupil_gpu.gpudata.free()
    free, total = cuda.mem_get_info()
    print("*Free: %.2f percent" % (free / total * 100))

    complex_slicer_cpu = np.stack([complex_slicer_cpu] * self.N_slices)
    complex_slicer_gpu = gpuarray.to_gpu(complex_slicer_cpu)
    slicer_masks_gpu = gpuarray.to_gpu(self.slicer_masks_fftshift)
    clinalg.multiply(slicer_masks_gpu, complex_slicer_gpu, overwrite=True)
    slicer_masks_gpu.gpudata.free()
    free, total = cuda.mem_get_info()
    print("**Free: %.2f percent" % (free / total * 100))

    # Slicer -> Pupil Mirror
    plan = cu_fft.Plan((N, N), np.complex64, np.complex64, self.N_slices)
    cu_fft.ifft(complex_slicer_gpu, complex_slicer_gpu, plan, scale=True)
    mirror_mask_gpu = gpuarray.to_gpu(self.pupil_mirror_masks_fft)
    clinalg.multiply(mirror_mask_gpu, complex_slicer_gpu, overwrite=True)

    # Pupil Mirror -> Slits
    cu_fft.fft(complex_slicer_gpu, complex_slicer_gpu, plan)
    slits = complex_slicer_gpu.get()
    complex_slicer_gpu.gpudata.free()
    mirror_mask_gpu.gpudata.free()
    slit = fftshift(np.sum((np.abs(slits))**2, axis=0))

    free, total = cuda.mem_get_info()
    print("***Free: %.2f percent" % (free / total * 100))

    return slit
def swap_out_to_CPU(elem):
    # prepare variables
    return_flag = True
    u, ss, sp = elem
    dp = data_list[u][ss][sp]
    bytes = dp.data_bytes

    # now we will swap out this data to the CPU,
    # so first we should check that the CPU has enough free memory
    MemFree = cpu_mem_check()

    if log_type in ['memory']:
        fm, tm = cuda.mem_get_info()
        log_str = "CPU MEM CHECK Before swap out: %s Free, %s Maximum, %s Want to use" % (print_bytes(MemFree), '-', print_bytes(bytes))
        log(log_str, 'memory', log_type)

    if bytes > MemFree:
        # not enough memory for swap out to CPU
        return False

    # we have enough memory so we can swap out,
    # provided no other process allocates during this swap-out operation
    try:
        buf = numpy.empty((dp.data_memory_shape), dtype=dp.data_contents_memory_dtype)
    except:
        # we failed memory allocation in the CPU
        return False

    # do the swap out
    #cuda.memcpy_dtoh_async(buf, dp.devptr, stream=stream[1])
    cuda.memcpy_dtoh(buf, dp.devptr)
    ctx.synchronize()

    dp.devptr.free()
    dp.devptr = None
    dp.data = buf
    dp.data_dtype = numpy.ndarray
    dp.memory_type = 'memory'

    gpu_list.remove(elem)
    cpu_list.append(elem)

    if log_type in ['memory']:
        fm, tm = cuda.mem_get_info()
        log_str = "GPU MEM CHECK After swap out: %s Free, %s Maximum, %s Want to use" % (print_bytes(fm), print_bytes(tm), print_bytes(bytes))
        log(log_str, 'memory', log_type)

    return True
def swap_out_to_hard_disk(elem):
    # prepare variables
    return_flag = True
    u, ss, sp = elem
    dp = data_list[u][ss][sp]
    bytes = dp.data_bytes

    # now we will swap out this CPU data to hard disk,
    # so first we should check that the hard disk has enough free space
    file_name = '%d_temp' % (rank)
    os.system('df . > %s' % (file_name))
    f = open(file_name)
    s = f.read()
    f.close()
    # do not reuse 'ss' here: that would shadow the element index unpacked above
    fields = s.split()
    # get available bytes
    avail = int(fields[10])

    if log_type in ['memory']:
        fm, tm = cuda.mem_get_info()
        log_str = "HARD disk MEM CHECK Before swap out: %s Free, %s Maximum, %s Want to use" % (print_bytes(avail), '-', print_bytes(bytes))
        log(log_str, 'memory', log_type)

    if bytes > avail:
        # we failed to make a swap file on the hard disk
        return False

    # now we have enough hard disk space to make the swap file
    # temp file name: "temp_data, rank, u, ss, sp"
    file_name = 'temp_data, %s, %s, %s, %s' % (rank, u, ss, sp)
    f = open(file_name, 'wb')
    f.write(dp.data)
    f.close()

    dp.data = None
    dp.hard_disk = file_name
    dp.memory_type = 'hard_disk'

    cpu_list.remove(elem)
    hard_list.append(elem)

    if log_type in ['memory']:
        fm, tm = cuda.mem_get_info()
        log_str = "CPU MEM CHECK After swap out: %s Free, %s Maximum, %s Want to use" % (print_bytes(fm), print_bytes(tm), print_bytes(bytes))
        log(log_str, 'memory', log_type)

    return True
def show_GPU_mem():
    import pycuda.driver as cuda

    # query once: mem_get_info() returns (free, total) in bytes
    mem_free, mem_total = [float(x) for x in cuda.mem_get_info()]
    mem_free_per = mem_free / mem_total
    mem_used = mem_total - mem_free
    mem_used_per = mem_used / mem_total

    print '\nGPU memory available {0} Mbytes, {1} % of total \n'.format(
        mem_free / 1024**2, 100 * mem_free_per)
    print 'GPU memory used {0} Mbytes, {1} % of total \n'.format(
        mem_used / 1024**2, 100 * mem_used_per)
def create_array(n_elements, n_dims, device_array, list_array, seed=0, ftype=FTYPE):
    """Create an arbitrary array for test_GPUHist."""
    assert n_elements > 0
    assert n_dims > 0
    center = 1e3
    sigm = 1e3
    rand = np.random.RandomState(seed)
    values = rand.normal(loc=center, scale=sigm,
                         size=(n_elements, n_dims)).astype(ftype)
    if device_array or (list_array and n_dims > 3):
        try:
            d_values = cuda.mem_alloc(values.nbytes)
            cuda.memcpy_htod(d_values, values)
            return values, d_values
        except pycuda._driver.MemoryError:
            print("Error at allocating memory")
            available_memory = cuda.mem_get_info()[0]
            print("You have %d Mbytes memory. Trying to allocate %d"
                  " bytes (%d Mbytes) of memory\n"
                  % (available_memory / (1024 * 1024), values.nbytes,
                     values.nbytes / (1024 * 1024)))
            return values, values
    elif list_array and n_dims < 4:
        try:
            # We need a different shape here: Each array in a list shall
            # contain one dimension of all data.
            d_values = []
            for i in xrange(n_dims):
                tmp_values = np.asarray([v[i] for v in values])
                d_values.append(cuda.mem_alloc(tmp_values.nbytes))
                cuda.memcpy_htod(d_values[i], tmp_values)
            return values, d_values
        except pycuda._driver.MemoryError:
            print("Error at allocating memory")
            available_memory = cuda.mem_get_info()[0]
            print("You have %d Mbytes memory. Trying to allocate %d"
                  " bytes (%d Mbytes) of memory\n"
                  % (available_memory / (1024 * 1024), values.nbytes,
                     values.nbytes / (1024 * 1024)))
            return values, values
    else:
        return values, values
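The snippet above recovers from pycuda._driver.MemoryError after the fact; a complementary pattern is to consult mem_get_info() before allocating. A minimal sketch, assuming pycuda and numpy are available; alloc_if_fits and its 5% safety margin are illustrative inventions, not pycuda API:

import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.driver as cuda

def alloc_if_fits(nbytes, margin=0.05):
    """Return a device allocation of nbytes, or None if it will not fit."""
    free, _total = cuda.mem_get_info()
    if nbytes > free * (1.0 - margin):  # margin is an arbitrary head-room choice
        return None
    try:
        return cuda.mem_alloc(nbytes)
    except cuda.MemoryError:
        # another process may have allocated between the check and the malloc
        return None

values = np.random.rand(1024, 1024).astype(np.float32)
d_values = alloc_if_fits(values.nbytes)
if d_values is not None:
    cuda.memcpy_htod(d_values, values)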
def run(self):
    drv.init()
    a0 = numpy.zeros((p,), dtype=numpy.complex64)
    self.dev = drv.Device(self.number)
    self.ctx = self.dev.make_context()
    #TO VERIFY WHETHER ALL THE MEMORY IS FREED BEFORE NEXT ALLOCATION (THIS DOES NOT HAPPEN IN MULTITHREADING)
    print drv.mem_get_info()
    self.gpu_a = garray.empty((self.input_cpu.size,), dtype=numpy.complex64)
    self.gpu_b = garray.zeros_like(self.gpu_a)
    self.gpu_a = garray.to_gpu(self.input_cpu)
    plan = Plan(a0.shape, context=self.ctx)
    plan.execute(self.gpu_a, self.gpu_b, batch=p/m)
    self.temp = self.gpu_b.get()
    print output_cpu._closed
    self.output_cpu.put(self.temp)
def init_module():
    global context, context_wrapper
    if context_wrapper is not None:
        return
    log_sys_info()
    device_id, device = select_device()
    context = device.make_context(flags=driver.ctx_flags.SCHED_YIELD | driver.ctx_flags.MAP_HOST)
    debug("testing with context=%s", context)
    debug("api version=%s", context.get_api_version())
    free, total = driver.mem_get_info()
    debug("using device %s", device_info(device))
    debug("memory: free=%sMB, total=%sMB", int(free/1024/1024), int(total/1024/1024))
    context_wrapper = CudaContextWrapper(context)

    #generate kernel sources:
    for rgb_format, yuv_formats in COLORSPACES_MAP.items():
        m = gen_rgb_to_yuv_kernels(rgb_format, yuv_formats)
        KERNELS_MAP.update(m)
    _kernel_names_ = sorted(set([x[0] for x in KERNELS_MAP.values()]))
    log.info("%s csc_nvcuda kernels: %s", len(_kernel_names_), ", ".join(_kernel_names_))

    #now, pre-compile the kernels:
    for src_format, dst_format in KERNELS_MAP.keys():
        get_CUDA_kernel(device_id, src_format, dst_format)
    context.pop()
def filter(self, video_input):
    """
    Performs RF filtering on input video for all the rfs
    """
    if len(video_input.shape) == 2:
        # if input has 2 dimensions
        assert video_input.shape[1] == self.size
    else:
        # if input has 3 dimensions
        assert (video_input.shape[1] * video_input.shape[2] == self.size)

    # rasterizing inputs
    video_input.resize((video_input.shape[0], self.size))

    d_video = parray.to_gpu(video_input)
    d_output = parray.empty((self.num_neurons, video_input.shape[0]), self.dtype)

    free, total = cuda.mem_get_info()
    self.ONE_TIME_FILTERS = ((free // self.dtype.itemsize) * 3 // 4 // self.size)
    self.ONE_TIME_FILTERS -= self.ONE_TIME_FILTERS % 2
    self.ONE_TIME_FILTERS = min(self.ONE_TIME_FILTERS, self.num_neurons)

    handle = la.cublashandle()
    for i in np.arange(0, self.num_neurons, self.ONE_TIME_FILTERS):
        Nfilters = min(self.ONE_TIME_FILTERS, self.num_neurons - i)
        self.generate_filters(startbias=i, N_filters=Nfilters)
        la.dot(self.filters, d_video, opb='t',
               C=d_output[i:i + Nfilters], handle=handle)
        del self.filters
    return d_output.T()
def setDevice(ndev=None):
    ''' To use CUDA or OpenCL you need a context and a device to establish the communication '''
    cuda.init()
    nDevices = cuda.Device.count()
    print "Available Devices:"
    for i in range(nDevices):
        dev = cuda.Device(i)
        try:
            mem = cuda.mem_get_info()[-i - 1]
        except:
            mem = 0
        print "  Device {0}: {1}, Total (MB) {2:.1f}, Free (MB) {3:.1f}".format(
            i, dev.name(), dev.total_memory() / 2.**20, mem / 2.**20)
    devNumber = 0
    if nDevices > 1:
        if ndev == None:
            devNumber = int(raw_input("Select device number: "))
        else:
            devNumber = ndev
    dev = cuda.Device(devNumber)
    #cuda.Context.pop()  #Disable previous CUDA context
    ctxCUDA = dev.make_context()
    devdata = DeviceData(dev)
    print "Using device {0}: {1}".format(devNumber, dev.name())
    return ctxCUDA, dev, devdata
def filter(self, V):
    """
    Filter a video V.
    Must set up parameters of CS RF first.

    Parameters
    ----------
    V : 3D ndarray, with shape (num_frames, Px, Py)

    Returns
    -------
    the filtered output by the gabor filters specified in self
    output is a PitchArray with shape (num_neurons, num_frames),
    jth row of which is the output of jth gabor filter
    """
    d_output = parray.empty((self.num_neurons, V.shape[0]), self.dtype)
    d_video = parray.to_gpu(V.reshape(V.shape[0], V.shape[1]*V.shape[2]))
    free, total = cuda.mem_get_info()
    self.ONE_TIME_FILTERS = (free / self.dtype.itemsize) * 3/4 / self.Pxall / self.Pyall

    handle = la.cublashandle()
    for i in np.arange(0, self.num_neurons, self.ONE_TIME_FILTERS):
        Nfilters = min(self.ONE_TIME_FILTERS, self.num_neurons - i)
        self.generate_visual_receptive_fields(startbias=i, N_filters=Nfilters)
        cublasDgemm(handle.handle, 't', 'n', V.shape[0], int(Nfilters),
                    self.Pxall*self.Pyall, self.dx*self.dy,
                    d_video.gpudata, d_video.ld,
                    self.filters.gpudata, self.filters.ld, 0,
                    int(int(d_output.gpudata) + int(d_output.ld*i*d_output.dtype.itemsize)),
                    d_output.ld)
    return d_output.T()
def init_cuda():
    """Initialize CUDA functionality

    This function attempts to load the necessary interfaces
    (hardware connectivity) to run CUDA-based filtering. This
    function should only need to be run once per session.

    If the config var (set via mne.set_config or in ENV)
    MNE_USE_CUDA == 'true', this function will be executed when
    importing mne. If this variable is not set, this function can
    be manually executed.
    """
    global cuda_capable
    global cuda_multiply_inplace_c128
    global cuda_halve_c128
    global cuda_real_c128
    if cuda_capable is True:
        logger.info("CUDA previously enabled, currently %s available memory"
                    % sizeof_fmt(mem_get_info()[0]))
        return
    # Triage possible errors for informative messaging
    cuda_capable = False
    try:
        import pycuda.gpuarray
        import pycuda.driver
    except ImportError:
        logger.warning("module pycuda not found, CUDA not enabled")
        return
    try:
        # Initialize CUDA; happens with importing autoinit
        import pycuda.autoinit  # noqa, analysis:ignore
    except ImportError:
        logger.warning("pycuda.autoinit could not be imported, likely "
                       "a hardware error, CUDA not enabled")
        return
    # Make sure scikits.cuda is installed
    try:
        from scikits.cuda import fft as cudafft
    except ImportError:
        logger.warning("module scikits.cuda not found, CUDA not "
                       "enabled")
        return

    # Make our multiply inplace kernel
    from pycuda.elementwise import ElementwiseKernel

    # let's construct our own CUDA multiply in-place function
    cuda_multiply_inplace_c128 = ElementwiseKernel(
        "pycuda::complex<double> *a, pycuda::complex<double> *b",
        "b[i] *= a[i]", "multiply_inplace")
    cuda_halve_c128 = ElementwiseKernel("pycuda::complex<double> *a",
                                        "a[i] /= 2.0", "halve_value")
    cuda_real_c128 = ElementwiseKernel("pycuda::complex<double> *a",
                                       "a[i] = real(a[i])", "real_value")

    # Make sure we can use 64-bit FFTs
    try:
        cudafft.Plan(16, np.float64, np.complex128)  # will get auto-GC'ed
    except:
        logger.warning("Device does not support 64-bit FFTs, "
                       "CUDA not enabled")
        return
    cuda_capable = True
    # Figure out limit for CUDA FFT calculations
    logger.info("Enabling CUDA with %s available memory"
                % sizeof_fmt(mem_get_info()[0]))
def init_all_devices():
    global DEVICES
    if DEVICES is not None:
        return DEVICES
    log.info("CUDA initialization (this may take a few seconds)")
    driver.init()
    DEVICES = []
    log("CUDA driver version=%s", driver.get_driver_version())
    ngpus = driver.Device.count()
    log.info("CUDA %s / PyCUDA %s, found %s device(s):",
             ".".join([str(x) for x in driver.get_version()]), pycuda.VERSION_TEXT, ngpus)
    da = driver.device_attribute
    cf = driver.ctx_flags
    for i in range(ngpus):
        device = None
        context = None
        try:
            device = driver.Device(i)
            log(" + testing device %s: %s", i, device_info(device))
            host_mem = device.get_attribute(da.CAN_MAP_HOST_MEMORY)
            if not host_mem:
                log.warn("skipping device %s (cannot map host memory)", device_info(device))
                continue
            context = device.make_context(flags=cf.SCHED_YIELD | cf.MAP_HOST)
            log(" created context=%s", context)
            log(" api version=%s", context.get_api_version())
            free, total = driver.mem_get_info()
            log(" memory: free=%sMB, total=%sMB", int(free/1024/1024), int(total/1024/1024))
            log(" multi-processors: %s, clock rate: %s",
                device.get_attribute(da.MULTIPROCESSOR_COUNT), device.get_attribute(da.CLOCK_RATE))
            log(" max block sizes: (%s, %s, %s)",
                device.get_attribute(da.MAX_BLOCK_DIM_X),
                device.get_attribute(da.MAX_BLOCK_DIM_Y),
                device.get_attribute(da.MAX_BLOCK_DIM_Z))
            log(" max grid sizes: (%s, %s, %s)",
                device.get_attribute(da.MAX_GRID_DIM_X),
                device.get_attribute(da.MAX_GRID_DIM_Y),
                device.get_attribute(da.MAX_GRID_DIM_Z))
            max_width = device.get_attribute(da.MAXIMUM_TEXTURE2D_WIDTH)
            max_height = device.get_attribute(da.MAXIMUM_TEXTURE2D_HEIGHT)
            log(" maximum texture size: %sx%s", max_width, max_height)
            log(" max pitch: %s", device.get_attribute(da.MAX_PITCH))
            SMmajor, SMminor = device.compute_capability()
            compute = (SMmajor << 4) + SMminor
            log(" compute capability: %#x (%s.%s)", compute, SMmajor, SMminor)
            try:
                DEVICES.append(i)
                log.info(" + %s (memory: %s%% free, compute: %s.%s)",
                         device_info(device), 100*free/total, SMmajor, SMminor)
            finally:
                context.pop()
        except Exception, e:
            log.error("error on device %s: %s", (device or i), e)
    return DEVICES
def is_gpu_memory_enough(self, a):
    if CUDA:
        rest, total = driver.mem_get_info()
        if (sys.getsizeof(a) * 2) < rest:
            return True
    else:
        # without CUDA there is no device limit to check
        return True
def is_memory_enough(a):
    try:
        rest, total = driver.mem_get_info()
    except driver.LogicError:
        # child thread cannot use context from the main thread...
        # the following does not work yet
        from pycuda import tools
        import skcuda
        driver.init()
        # try to make a new context, but cannot deactivate the old context stack
        context = tools.make_default_context()
        device = context.get_device()
        skcuda.misc.init_context(device)
        rest, total = driver.mem_get_info()
    if (sys.getsizeof(a) * 2) < rest:
        return True
def select_device(preferred_device_id=-1, preferred_device_name=None, min_compute=0):
    if preferred_device_name is None:
        preferred_device_name = get_pref("device-name")
    if preferred_device_id < 0:
        device_id = get_pref("device-id")
        if device_id is not None and device_id >= 0:
            preferred_device_id = device_id
    devices = init_all_devices()
    global DEVICE_STATE
    free_pct = 0
    cf = driver.ctx_flags
    #split device list according to device state:
    ok_devices = [device_id for device_id in devices if DEVICE_STATE.get(device_id, True) is True]
    nok_devices = [device_id for device_id in devices if DEVICE_STATE.get(device_id, True) is not True]
    for list_name, device_list in {"OK" : ok_devices, "failing" : nok_devices}.items():
        selected_device_id = -1
        selected_device = None
        log("will test %s device%s from %s list: %s", len(device_list), engs(device_list), list_name, device_list)
        for device_id in device_list:
            context = None
            try:
                log("device %i", device_id)
                device = driver.Device(device_id)
                log("select_device: testing device %s: %s", device_id, device_info(device))
                context = device.make_context(flags=cf.SCHED_YIELD | cf.MAP_HOST)
                log("created context=%s", context)
                free, total = driver.mem_get_info()
                log("memory: free=%sMB, total=%sMB", int(free/1024/1024), int(total/1024/1024))
                tpct = 100*free/total
                SMmajor, SMminor = device.compute_capability()
                compute = (SMmajor << 4) + SMminor
                if compute < min_compute:
                    log("ignoring device %s: compute capability %#x (minimum %#x required)",
                        device_info(device), compute, min_compute)
                elif device_id == preferred_device_id:
                    l = log
                    if len(device_list) > 1:
                        l = log.info
                    l("device matches preferred device id %s: %s", preferred_device_id, device_info(device))
                    return device_id, device
                elif preferred_device_name and device_info(device).find(preferred_device_name) >= 0:
                    log("device matches preferred device name: %s", preferred_device_name)
                    return device_id, device
                elif tpct >= MIN_FREE_MEMORY and tpct > free_pct:
                    log("device has enough free memory: %i (min=%i, current best device=%i)",
                        tpct, MIN_FREE_MEMORY, free_pct)
                    selected_device = device
                    selected_device_id = device_id
                    free_pct = tpct
            finally:
                if context:
                    context.pop()
                    context.detach()
        if selected_device_id >= 0 and selected_device:
            l = log
            if len(devices) > 1:
                l = log.info
            l("selected device %s: %s", device_id, device_info(device))
            return selected_device_id, selected_device
    return -1, None
def measure_gpu_memory(ident=""):
    global DATA_MEM
    torch.cuda.synchronize()
    old = DATA_MEM[1] - DATA_MEM[0]
    mem = mem_get_info()
    now = mem[1] - mem[0]
    text = "[Memory] {} {} + {} = {}".format(ident, format_memory(old),
                                             format_memory(now - old),
                                             format_memory(now))
    DATA_MEM = mem
    return text
def gpu_stat():
    if torch.cuda.is_available():
        def pretty_bytes(bytes, precision=1):
            abbrevs = ((1 << 50, 'PB'), (1 << 40, 'TB'), (1 << 30, 'GB'),
                       (1 << 20, 'MB'), (1 << 10, 'kB'), (1, 'bytes'))
            if bytes == 1:
                return '1 byte'
            for factor, suffix in abbrevs:
                if bytes >= factor:
                    break
            return '%.*f%s' % (precision, bytes / factor, suffix)

        device = autoinit.device
        print()
        print('GPU Name: %s' % device.name())
        print('GPU Memory: %s' % pretty_bytes(device.total_memory()))
        print('CUDA Version: %s' % str(driver.get_version()))
        print('GPU Free/Total Memory: %d%%' % (
            (driver.mem_get_info()[0] / driver.mem_get_info()[1]) * 100))
def select_device(preferred_device_id=-1, preferred_device_name=None, min_compute=0):
    if preferred_device_name is None:
        preferred_device_name = get_pref("device-name")
    if preferred_device_id < 0:
        device_id = get_pref("device-id")
        # guard against an unset preference (avoids comparing None with 0):
        if device_id is not None and device_id >= 0:
            preferred_device_id = device_id
    devices = init_all_devices()
    global DEVICE_STATE
    free_pct = 0
    cf = driver.ctx_flags
    #split device list according to device state:
    ok_devices = [device_id for device_id in devices if DEVICE_STATE.get(device_id, True) is True]
    nok_devices = [device_id for device_id in devices if DEVICE_STATE.get(device_id, True) is not True]
    for list_name, device_list in {"OK" : ok_devices, "failing" : nok_devices}.items():
        selected_device_id = -1
        selected_device = None
        log("will test %s device%s from %s list: %s", len(device_list), engs(device_list), list_name, device_list)
        for device_id in device_list:
            context = None
            try:
                device = driver.Device(device_id)
                log("select_device: testing device %s: %s", device_id, device_info(device))
                context = device.make_context(flags=cf.SCHED_YIELD | cf.MAP_HOST)
                log("created context=%s", context)
                free, total = driver.mem_get_info()
                log("memory: free=%sMB, total=%sMB", int(free/1024/1024), int(total/1024/1024))
                tpct = 100*free/total
                SMmajor, SMminor = device.compute_capability()
                compute = (SMmajor << 4) + SMminor
                if compute < min_compute:
                    log("ignoring device %s: compute capability %#x (minimum %#x required)",
                        device_info(device), compute, min_compute)
                elif device_id == preferred_device_id:
                    l = log
                    if len(device_list) > 1:
                        l = log.info
                    l("device matches preferred device id %s: %s", preferred_device_id, device_info(device))
                    return device_id, device
                elif preferred_device_name and device_info(device).find(preferred_device_name) >= 0:
                    log("device matches preferred device name: %s", preferred_device_name)
                    return device_id, device
                elif tpct > free_pct:
                    selected_device = device
                    selected_device_id = device_id
                    free_pct = tpct
            finally:
                if context:
                    context.pop()
                    context.detach()
        if selected_device_id >= 0 and selected_device:
            l = log
            if len(devices) > 1:
                l = log.info
            l("selected device %s: %s", device_id, device_info(device))
            return selected_device_id, selected_device
    return -1, None
def init_all_devices():
    global DEVICES, DEVICE_INFO
    if DEVICES is not None:
        return DEVICES
    log.info("CUDA initialization (this may take a few seconds)")
    driver.init()
    DEVICES = []
    DEVICE_INFO = {}
    log("CUDA driver version=%s", driver.get_driver_version())
    ngpus = driver.Device.count()
    if ngpus == 0:
        log.info("CUDA %s / PyCUDA %s, no devices found",
                 ".".join([str(x) for x in driver.get_version()]), pycuda.VERSION_TEXT)
        return DEVICES
    da = driver.device_attribute
    cf = driver.ctx_flags
    for i in range(ngpus):
        device = None
        context = None
        devinfo = "gpu %i" % i
        try:
            device = driver.Device(i)
            devinfo = device_info(device)
            log(" + testing device %s: %s", i, devinfo)
            DEVICE_INFO[i] = devinfo
            host_mem = device.get_attribute(da.CAN_MAP_HOST_MEMORY)
            if not host_mem:
                log.warn("skipping device %s (cannot map host memory)", devinfo)
                continue
            context = device.make_context(flags=cf.SCHED_YIELD | cf.MAP_HOST)
            try:
                log(" created context=%s", context)
                log(" api version=%s", context.get_api_version())
                free, total = driver.mem_get_info()
                log(" memory: free=%sMB, total=%sMB", int(free/1024/1024), int(total/1024/1024))
                log(" multi-processors: %s, clock rate: %s",
                    device.get_attribute(da.MULTIPROCESSOR_COUNT), device.get_attribute(da.CLOCK_RATE))
                log(" max block sizes: (%s, %s, %s)",
                    device.get_attribute(da.MAX_BLOCK_DIM_X),
                    device.get_attribute(da.MAX_BLOCK_DIM_Y),
                    device.get_attribute(da.MAX_BLOCK_DIM_Z))
                log(" max grid sizes: (%s, %s, %s)",
                    device.get_attribute(da.MAX_GRID_DIM_X),
                    device.get_attribute(da.MAX_GRID_DIM_Y),
                    device.get_attribute(da.MAX_GRID_DIM_Z))
                max_width = device.get_attribute(da.MAXIMUM_TEXTURE2D_WIDTH)
                max_height = device.get_attribute(da.MAXIMUM_TEXTURE2D_HEIGHT)
                log(" maximum texture size: %sx%s", max_width, max_height)
                log(" max pitch: %s", device.get_attribute(da.MAX_PITCH))
                SMmajor, SMminor = device.compute_capability()
                compute = (SMmajor << 4) + SMminor
                log(" compute capability: %#x (%s.%s)", compute, SMmajor, SMminor)
                if i == 0:
                    #we print the list info "header" from inside the loop
                    #so that the log output is bunched up together
                    log.info("CUDA %s / PyCUDA %s, found %s device%s:",
                             ".".join([str(x) for x in driver.get_version()]),
                             pycuda.VERSION_TEXT, ngpus, engs(ngpus))
                DEVICES.append(i)
                log.info(" + %s (memory: %s%% free, compute: %s.%s)",
                         device_info(device), 100*free/total, SMmajor, SMminor)
            finally:
                context.pop()
        except Exception as e:
            log.error("error on device %s: %s", devinfo, e)
    return DEVICES
def infer(self, input_img, output_size, num_binding):
    #self.runtime = self.create_runtime()
    #self.context = self.create_context()
    assert (self.__engine.get_nb_bindings() == num_binding)
    output = np.empty(output_size, dtype=np.float32)
    d_input = cuda.mem_alloc(self.batchsize * input_img.size * input_img.dtype.itemsize)
    d_output = cuda.mem_alloc(self.batchsize * output.size * output.dtype.itemsize)
    # pointers to gpu memory
    bindings = [int(d_input), int(d_output)]
    stream = cuda.Stream()
    # transfer input data to device
    cuda.memcpy_htod_async(d_input, input_img, stream)
    # execute model
    self.context.enqueue(self.batchsize, bindings, stream.handle, None)
    # transfer predictions back
    cuda.memcpy_dtoh_async(output, d_output, stream)
    # synchronize threads
    stream.synchronize()
    print 'all activities in stream are done: {}'.format(stream.is_done())
    # free device memory
    d_input.free()
    d_output.free()
    print 1999 - cuda.mem_get_info()[0] / 1048576, cuda.mem_get_info()[1] / 1048576
    #self.context.destroy
    #self.runtime.destroy()
    return output
def ShowGPUInfo():
    (free, total) = driver.mem_get_info()
    print('Global memory occupancy:%f%% free' % (free * 100 / total))

    for devicenum in range(driver.Device.count()):
        device = driver.Device(devicenum)
        attrs = device.get_attributes()

        #Beyond this point is just pretty printing
        print('\n===Attributes for device %d' % devicenum)
        for (key, value) in attrs.iteritems():
            print('  %s:%s' % (str(key), str(value)))
def mem_check_and_malloc(bytes):
    fm, tm = cuda.mem_get_info()
    if log_type in ['memory']:
        log_str = "RANK %d, GPU MEM CHECK before malloc: %s Free, %s Maximum, %s Want to use" % (rank, print_bytes(fm), print_bytes(tm), print_bytes(bytes))
        log(log_str, 'memory', log_type)

    if fm < bytes:
        # we don't have enough memory, free the data pool
        print "BUFFER POOL"
        size = fm
        for elem in list(data_pool):
            usage = elem['usage']
            devptr = elem['devptr']
            devptr.free()
            print "FREE data", usage
            size += usage
            data_pool.remove(elem)
            if size >= bytes:
                break
        fm, tm = cuda.mem_get_info()

    if fm >= bytes:
        # we have enough memory, just malloc
        afm, tm = cuda.mem_get_info()
        devptr = cuda.mem_alloc(bytes)
        bfm, tm = cuda.mem_get_info()
        if log_type in ['memory']:
            fm, tm = cuda.mem_get_info()
            log_str = "RANK %d, GPU MALLOC AFTER: %s Free, %s Maximum, %s Want to use" % (rank, print_bytes(fm), print_bytes(tm), print_bytes(bytes))
            log(log_str, 'memory', log_type)
        return True, devptr

    # we don't have enough memory
    return False, None
def meminfo(self, kernel, k=-1, o=-1, threads=[], name=""):
    (free, total) = cuda.mem_get_info()
    shared = kernel.shared_size_bytes
    regs = kernel.num_regs
    local = kernel.local_size_bytes
    const = kernel.const_size_bytes
    mbpt = kernel.max_threads_per_block
    devdata = ctools.DeviceData()
    occupancy = ctools.OccupancyRecord(devdata, threads[0],
                                       shared_mem=shared, registers=regs)
    util.log.info("%s(%03d,%d)=L:%d,S:%d,R:%d,C:%d,MT:%d,T:%d,OC:%f,Free:%d" % (
        name, k, o, local, shared, regs, const, mbpt, threads[0],
        occupancy.occupancy, (free*100)/total))
def load_device(device_id):
    log("load_device(%i)", device_id)
    device = driver.Device(device_id)
    log("select_device: testing device %s: %s", device_id, device_info(device))
    cf = driver.ctx_flags
    context = device.make_context(flags=cf.SCHED_YIELD | cf.MAP_HOST)
    log("created context=%s", context)
    free, total = driver.mem_get_info()
    log("memory: free=%sMB, total=%sMB", int(free/1024/1024), int(total/1024/1024))
    tpct = 100*free//total
    return device, context, tpct
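load_device() above and the select_device()/init_all_devices() variants in this collection all follow the same probe cycle, because mem_get_info() only reports on the current context. A condensed sketch of that cycle (probe_free_memory is a name invented here for illustration, not part of any of the original modules):

import pycuda.driver as cuda

def probe_free_memory():
    """Yield (device_id, free_pct) for every CUDA device on the machine."""
    cuda.init()
    for device_id in range(cuda.Device.count()):
        context = cuda.Device(device_id).make_context()
        try:
            free, total = cuda.mem_get_info()  # answers for this context only
            yield device_id, 100 * free // total
        finally:
            context.pop()
            context.detach()

for device_id, free_pct in probe_free_memory():
    print("device %d: %d%% free" % (device_id, free_pct))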
def select_device(preferred_device_id=DEFAULT_CUDA_DEVICE_ID, min_compute=0):
    devices = init_all_devices()
    global DEVICE_STATE
    free_pct = 0
    cf = driver.ctx_flags
    #split device list according to device state:
    ok_devices = [device_id for device_id in devices
                  if DEVICE_STATE.get(device_id, True) is True]
    nok_devices = [device_id for device_id in devices
                   if DEVICE_STATE.get(device_id, True) is not True]
    for list_name, device_list in {"OK": ok_devices, "failing": nok_devices}.items():
        selected_device_id = -1    # was None: must be comparable with >= 0 below
        selected_device = None
        log("will test %s devices from %s list: %s", len(device_list), list_name, device_list)
        for device_id in device_list:
            context = None
            try:
                device = driver.Device(device_id)
                log("select_device: testing device %s: %s", device_id, device_info(device))
                context = device.make_context(flags=cf.SCHED_YIELD | cf.MAP_HOST)
                log("created context=%s", context)
                free, total = driver.mem_get_info()
                log("memory: free=%sMB, total=%sMB", int(free / 1024 / 1024), int(total / 1024 / 1024))
                tpct = 100 * free / total
                SMmajor, SMminor = device.compute_capability()
                compute = (SMmajor << 4) + SMminor
                if compute < min_compute:
                    log("ignoring device %s: compute capability %#x (minimum %#x required)",
                        device_info(device), compute, min_compute)
                elif device_id == preferred_device_id:
                    return device_id, device
                elif tpct > free_pct:
                    selected_device = device
                    selected_device_id = device_id
                    free_pct = tpct    # track the best candidate seen so far
            finally:
                if context:
                    context.pop()
                    context.detach()
        if selected_device_id >= 0 and selected_device:
            log("select device: %s / %s", device_id, device)
            return selected_device_id, selected_device
    return -1, None
def run(self):
    try:
        #Initialise this device
        self.local.dev = cuda.Device(self.device)
        self.local.ctx = self.local.dev.make_context()
        self.local.ctx.push()
        (free, total) = cuda.mem_get_info()
        util.log.info("Initialising CUDA device %d:(%.2f%% Free)" % (self.device, (free*100.0/total)))
    except pycuda._driver.MemoryError:
        util.log.info("Balls")
        raise
        return

    #Initialise the kernel
    self.local.kernels = SourceModule(self.r_kernels)
    gridmax = 65535

    #Kernels
    self.k_osbprepare = self.local.kernels.get_function("lk_osbprepare_permutations")
    self.k_osbsolve = self.local.kernels.get_function("solve_permutations")
    self.k_osblk = self.local.kernels.get_function("lk_max_permutations")
    self.k_solve = self.local.kernels.get_function("solve")
    self.k_isboptimise = self.local.kernels.get_function("isb_optimise_pk")
    self.k_isboptimise_inc = self.local.kernels.get_function("isb_optimise_inc")
    self.k_calcpsd = self.local.kernels.get_function("calc_psd")
    self.k_osb_optimise_p = self.local.kernels.get_function("osb_optimise_p")

    #loop to empty queue
    while True:
        #grab args from queue (block until received)
        queueitem = self.argqueue.get()
        func = queueitem[0]
        args = queueitem[1:]
        if func == 'osb_optimise_p':
            result = self.osb_optimise_p(*args)
            self.resqueue.put((func, result))
        elif func == 'isb_optimise_p':
            result = self.isb_optimise_p(*args)
            self.resqueue.put((func, result))
        elif func == 'isb_optimise_inc':
            result = self.isb_optimise_inc(*args)
            self.resqueue.put((func, result))
        elif func == 'mipb_update_cost':
            result = self.mipb_update_cost(*args)
            self.resqueue.put((func, result))
        elif func == 'calc_psd':
            result = self.calc_psd(*args)
            self.resqueue.put((func, result))
        else:
            self.resqueue.put(None)
        self.argqueue.task_done()  #nothing seems to get past this
def usage_info(self):
    (free, total) = drv.mem_get_info()
    print("Global memory occupancy:{}% free".format(free * 100 / total))

    for devicenum in range(drv.Device.count()):
        device = drv.Device(devicenum)
        attrs = device.get_attributes()

        #Beyond this point is just pretty printing
        print("\n===Attributes for device {}".format(devicenum))
        for (key, value) in attrs.items():
            print("{}:{}".format(str(key), str(value)))
def remaining_mem(N, L, flag):
    meminfo = cuda.mem_get_info()
    print("free: %s bytes, total: %s bytes" % (meminfo[0], meminfo[1]))
    available_mem = float(meminfo[0])
    available_mem /= np.dtype(np.float32).itemsize
    NL = N * L
    if flag == 0:
        available_mem -= NL
    x = available_mem
    temp = N * 2
    x /= temp
    return int(x)
def cuda_mem_check(device_dictionary, cache_size, arrays):
    """Function to check if there will be enough memory on the GPU to perform the computation"""
    module_logger.info('Checking if the system has enough memory on device.')
    input_size = 0
    for array in arrays:
        input_size = input_size + array.nbytes
    cache_size_bytes = cache_size * 4
    free, total = driver.mem_get_info()
    max_mem_size = 512 * 1000
    memory_limit = (total - input_size) / device_dictionary['MULTIPROCESSOR_COUNT'] / device_dictionary[
        'MAX_THREADS_PER_MULTIPROCESSOR']
    limitator = min(max_mem_size, memory_limit)
    if cache_size_bytes >= limitator:
        module_logger.error("Cache memory per thread (" + bytes2human(cache_size_bytes) + ") is greater than memory "
                            "limitation per thread (" + bytes2human(limitator) + ")")
        exit()
    elif input_size >= total:
        module_logger.error("The arrays to transfer (" + bytes2human(input_size) + ") are greater than global memory "
                            "limitations (" + bytes2human(total) + ")")
        exit()
    else:
        headers = ("Cache size per thread", "Maximum memory size per thread")
        printdata = (bytes2human(cache_size_bytes), bytes2human(limitator))
        stype('\n' + 'Memory limitation status on device:')
        stype(tabulate.tabulate(zip(headers, printdata), headers=['Variable Name', 'Value'], tablefmt='rst') + '\n')
        module_logger.ok('The system has enough memory to perform the calculation.')
        module_logger.info('Using ' + bytes2human(cache_size_bytes) + ' out of ' + bytes2human(limitator) + '.')
        # module_logger.warning("Warning: The cuda kernel will use max capacity of graphics processors, the screen "
        #                       "could become unresponsive during the process.")
        stype('\n' + bcolors.WARNING + "Warning: The cuda kernel will use max capacity of graphics processors,"
              + '\n the screen could become unresponsive during the process.' + bcolors.ENDC + '\n')
def showDeviceAttributes():
    (free, total) = cuda.mem_get_info()
    print("Global memory occupancy:%f%% free" % (free * 100 / total))

    for devicenum in range(cuda.Device.count()):
        device = cuda.Device(devicenum)
        attrs = device.get_attributes()

        #Beyond this point is just pretty printing
        print("\n===Attributes for device %d" % devicenum)
        for (key, value) in attrs.iteritems():
            print("%s:%s" % (str(key), str(value)))
def device_usage_str(self):
    '''Returns a formatted string displaying the memory usage.'''
    s = 'device usage:\n'
    s += '-'*10 + '\n'
    #s += format_array('vertices', self.vertices) + '\n'
    #s += format_array('triangles', self.triangles) + '\n'
    s += format_array('nodes', self.nodes) + '\n'
    s += '%-15s %6s %6s' % ('total', '', format_size(self.nodes.nbytes)) + '\n'
    s += '-'*10 + '\n'
    free, total = cuda.mem_get_info()
    s += '%-15s %6s %6s' % ('device total', '', format_size(total)) + '\n'
    s += '%-15s %6s %6s' % ('device used', '', format_size(total-free)) + '\n'
    s += '%-15s %6s %6s' % ('device free', '', format_size(free)) + '\n'
    return s
def __call__(self, tag, description):
    if api.is_gpu_api_cuda():
        gpu_free, gpu_total = cuda.mem_get_info()
    elif api.is_gpu_api_opencl():
        ctx = cltools.get_last_context()
        device = ctx.get_info(cl.context_info.DEVICES)[0]
        gpu_total = device.get_info(cl.device_info.GLOBAL_MEM_SIZE)
        gpu_free = gpu_total  # free memory info not available to opencl...
    if tag is None:
        self['gpu_total'] = gpu_total
    else:
        self['%s' % tag] = description
        self['%s_gpu_used' % tag] = gpu_total - gpu_free
def _gpuAlloc(self):
    #Get GPU information
    self.freeMem = cuda.mem_get_info()[0] * .5 * .8  # use half of free memory, then 80% of that, as the budget
    self.maxPossRows = np.int(np.floor(self.freeMem / (4 * self.totalCols)))  # divide by 4 as that is the size of a float
    #set max rows to a smaller number to save memory usage
    if self.totalRows < self.maxPossRows:
        print "reducing max rows to reduce memory use on GPU"
        self.maxPossRows = self.totalRows

    #create pagelocked buffers and GPU arrays
    self.to_gpu_buffer = cuda.pagelocked_empty((self.maxPossRows, self.totalCols), np.float32)
    self.from_gpu_buffer = cuda.pagelocked_empty((self.maxPossRows, self.totalCols), np.float32)
    self.data_gpu = cuda.mem_alloc(self.to_gpu_buffer.nbytes)
    self.result_gpu = cuda.mem_alloc(self.from_gpu_buffer.nbytes)
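_gpuAlloc() above sizes its page-locked buffers from the free-memory figure; the same capacity arithmetic can be factored into a standalone helper. A sketch under stated assumptions: max_rows and the 0.8 head-room factor are illustrative choices, not values taken from the original class:

import pycuda.autoinit  # noqa: F401
import pycuda.driver as cuda

def max_rows(total_cols, headroom=0.8, itemsize=4):
    """How many float32 rows (itemsize 4) of width total_cols fit in free device memory."""
    free, _total = cuda.mem_get_info()
    # keep some head-room so other allocations (and other processes) still fit
    return int(free * headroom) // (itemsize * total_cols)

print("budget: %d rows of 4096 float32 columns" % max_rows(4096))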
def get_cuda_memory():
    """Get the amount of free memory for CUDA operations.

    Returns
    -------
    memory : str
        The amount of available memory as a human-readable string.
    """
    if not _cuda_capable:
        warn('CUDA not enabled, returning zero for memory')
        mem = 0
    else:
        from pycuda.driver import mem_get_info
        mem = mem_get_info()[0]
    return sizeof_fmt(mem)
def free_gpu(self):
    self.rng_states.free()
    self.gpu_vect_sum.free()
    self.gpu_new_data.free()
    self.gpu_init_data.free()
    (free, total) = cuda.mem_get_info()
    print("Global memory occupancy after cleaning processes: %f%% free" % (free * 100 / total))
    print("Global free memory: %i MB free" % (free / 10 ** 6))
    del self.module
    self.ctx.detach()
def run():
    (free, total) = cuda.mem_get_info()
    #free = 2.e9
    print "Device global memory {0:.2f}GB total, {1:0.2f}GB free".format(total/1.e9, free/1.e9)
    print "Roughly {0:.2f}GB required for large box".format(float(NBYTES*4)/1.e9)
    if not os.path.exists(parent_folder+'/Boxes'):
        os.makedirs(parent_folder+"/Boxes")
    if NBYTES*4 < free:
        print "Congratulations, your GPU has enough memory, running without stitching"
        init()
    else:
        N = DIM
        while float(N)/DIM*NBYTES*8 > free:
            N /= 2
        print "Stitching with {} meta block size".format(N)
        init_stitch(np.int32(N))
def __init__(self, interactive=True, logfile=None):
    DeviceMemoryPool.__init__(self)
    self.last_free, _ = cuda.mem_get_info()
    self.interactive = interactive

    if logfile is None:
        import sys
        logfile = sys.stdout

    self.logfile = logfile

    from weakref import WeakKeyDictionary
    self.blocks = WeakKeyDictionary()

    if interactive:
        from pytools.diskdict import DiskDict
        self.stacktrace_mnemonics = DiskDict("pycuda-stacktrace-mnemonics")
def Optimize_Blocks():
    # Get memory info
    print "Memory info:"
    (free, total) = cuda.mem_get_info()
    print round(100.0 * free / total, 2), "% free of", round(total * 1.0 / pow(1024, 3), 4), "Gb"

    # Find the largest numpy array
    mem_list = [1.0]
    np_array_size = 1
    opts = 0
    counter = 0
    while opts < free:
        opts = free * 1.0 / sys.getsizeof(mem_list) * counter
        counter += 1
    print free, opts, sys.getsizeof(mem_list) * counter
    print "Max list size:", counter
    #while np_array_size < free:
    #    mem_list = mem_list.append(1.0)
    #    np_array_size = sys.getsizeof(mem_list)
    #np = array(np)
    print "Max Numpy array size:", sys.getsizeof(np)

    # Create lists to store multiple device info
    threads_per_block = []
    grid_dim_x = []

    # Loop through devices
    for devicenum in range(cuda.Device.count()):
        # Initialize the device
        device = cuda.Device(devicenum)
        # Get dictionary of device info
        attrs = device.get_attributes()
        # Get max threads per block info
        tpb = attrs[pycuda._driver.device_attribute.MAX_THREADS_PER_BLOCK]
        if tpb not in threads_per_block:
            threads_per_block.append(tpb)
        # Get max grid dimension
        mgd = attrs[pycuda._driver.device_attribute.MAX_GRID_DIM_X]
        if mgd not in grid_dim_x:
            grid_dim_x.append(mgd)

    print "You should use the following code in your Python code:"
    print "block_size =", str(min(threads_per_block))
    print "blocks =", str(min(grid_dim_x) / min(threads_per_block))
def init_all_devices():
    global DEVICES
    if DEVICES is not None:
        return DEVICES
    log.info("CUDA initialization (this may take a few seconds)")
    driver.init()
    DEVICES = []
    log("CUDA driver version=%s", driver.get_driver_version())
    log.info("PyCUDA version=%s", pycuda.VERSION_TEXT)
    ngpus = driver.Device.count()
    log.info("CUDA version=%s found %s device(s):",
             ".".join([str(x) for x in driver.get_version()]), ngpus)
    da = driver.device_attribute
    cf = driver.ctx_flags
    for i in range(ngpus):
        device = None
        context = None
        try:
            device = driver.Device(i)
            log(" + testing device %s: %s", i, device_info(device))
            host_mem = device.get_attribute(da.CAN_MAP_HOST_MEMORY)
            if not host_mem:
                log.warn("skipping device %s (cannot map host memory)", device_info(device))
                continue
            context = device.make_context(flags=cf.SCHED_YIELD | cf.MAP_HOST)
            log(" created context=%s", context)
            log(" api version=%s", context.get_api_version())
            free, total = driver.mem_get_info()
            log(" memory: free=%sMB, total=%sMB", int(free/1024/1024), int(total/1024/1024))
            log(" multi-processors: %s, clock rate: %s",
                device.get_attribute(da.MULTIPROCESSOR_COUNT), device.get_attribute(da.CLOCK_RATE))
            log(" max block sizes: (%s, %s, %s)",
                device.get_attribute(da.MAX_BLOCK_DIM_X),
                device.get_attribute(da.MAX_BLOCK_DIM_Y),
                device.get_attribute(da.MAX_BLOCK_DIM_Z))
            log(" max grid sizes: (%s, %s, %s)",
                device.get_attribute(da.MAX_GRID_DIM_X),
                device.get_attribute(da.MAX_GRID_DIM_Y),
                device.get_attribute(da.MAX_GRID_DIM_Z))
            max_width = device.get_attribute(da.MAXIMUM_TEXTURE2D_WIDTH)
            max_height = device.get_attribute(da.MAXIMUM_TEXTURE2D_HEIGHT)
            log(" maximum texture size: %sx%s", max_width, max_height)
            log(" max pitch: %s", device.get_attribute(da.MAX_PITCH))
            SMmajor, SMminor = device.compute_capability()
            compute = (SMmajor << 4) + SMminor
            log(" compute capability: %#x (%s.%s)", compute, SMmajor, SMminor)
            try:
                DEVICES.append(i)
                log.info(" + %s (memory %s%% free, compute %#x)",
                         device_info(device), 100*free/total, compute)
            finally:
                context.pop()
        except Exception, e:
            log.error("error on device %s: %s", (device or i), e)
    return DEVICES
def test_mempool(self):
    from pycuda.tools import bitlog2
    from pycuda.tools import DeviceMemoryPool

    pool = DeviceMemoryPool()
    maxlen = 10
    queue = []
    free, total = drv.mem_get_info()
    e0 = bitlog2(free)

    for e in range(e0-6, e0-4):
        for i in range(100):
            queue.append(pool.allocate(1 << e))
            if len(queue) > 10:
                queue.pop(0)
    del queue
    pool.stop_holding()
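One behaviour of the pool that test_mempool() exercises is worth spelling out: blocks freed back to a DeviceMemoryPool are held by the pool rather than returned to the driver, so mem_get_info() keeps reporting them as used until stop_holding() (or free_held()) releases them. A small usage sketch; the 64 MB size is arbitrary:

import pycuda.autoinit  # noqa: F401
import pycuda.driver as cuda
from pycuda.tools import DeviceMemoryPool

pool = DeviceMemoryPool()
before, _ = cuda.mem_get_info()
block = pool.allocate(64 * 1024 * 1024)  # 64 MB drawn from the driver
del block            # returned to the pool, *not* to the driver
pool.stop_holding()  # pool releases held blocks back to the driver
after, _ = cuda.mem_get_info()
print("free before: %d MB, free after: %d MB" % (before >> 20, after >> 20))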
def filter_image(self, image_input):
    """
    Performs RF filtering on an input image for all the rfs
    """
    # image dimensions should match screen dimensions
    # numpy's resize operation doesn't make any checks
    if len(image_input.shape) == 2:
        # if input has 2 dimensions
        assert image_input.shape[1] == self.size
    else:
        # if input has 3 dimensions
        assert (image_input.shape[1] * image_input.shape[2] == self.size)

    # rasterizing inputs
    image_input.resize((1, self.size))

    d_image = parray.to_gpu(image_input)
    d_output = parray.empty((self.num_neurons, image_input.shape[0]), self.dtype)

    free, total = cuda.mem_get_info()
    self.ONE_TIME_FILTERS = ((free // self.dtype.itemsize) * 3 // 4 // self.size)
    self.ONE_TIME_FILTERS -= self.ONE_TIME_FILTERS % 2
    self.ONE_TIME_FILTERS = min(self.ONE_TIME_FILTERS, self.num_neurons)

    handle = la.cublashandle()
    for i in np.arange(0, self.num_neurons, self.ONE_TIME_FILTERS):
        Nfilters = min(self.ONE_TIME_FILTERS, self.num_neurons - i)
        self.generate_filters(startbias=i, N_filters=Nfilters)
        la.dot(self.filters, d_image, opb='t',
               C=d_output[i:i + Nfilters], handle=handle)
        del self.filters
    return d_output.T()