def main():
    """Benchmark packeted sparse CG solves on a Matrix Market file.

    Reads the matrix named on the command line, builds a packeted SpMV
    operator with a diagonal preconditioner, runs four timed CG solves
    and reports residual norm, iteration counts and estimated gflops.

    BUGFIX: this function previously used `options` and `args` without
    defining them (NameError at runtime); the option parsing below is
    taken from the sibling `main_cg`. Python-2 print statements were
    also converted to print() calls, matching the rest of the file.
    """
    from optparse import OptionParser
    parser = OptionParser(usage="%prog [options] MATRIX-MARKET-FILE")
    parser.add_option("-s", "--is-symmetric", action="store_true",
            help="Specify that the input matrix is already symmetric")
    options, args = parser.parse_args()

    from pycuda.tools import DeviceMemoryPool, PageLockedMemoryPool
    dev_pool = DeviceMemoryPool()
    pagelocked_pool = PageLockedMemoryPool()

    from scipy.io import mmread
    csr_mat = mmread(args[0]).tocsr().astype(numpy.float32)

    inv_mat_diag = 1 / csr_mat.diagonal()

    print("building...")
    from pycuda.sparse.packeted import PacketedSpMV
    spmv = PacketedSpMV(csr_mat, options.is_symmetric, csr_mat.dtype)
    rhs = numpy.random.rand(spmv.shape[0]).astype(spmv.dtype)

    from pycuda.sparse.operator import DiagonalPreconditioner
    if True:
        precon = DiagonalPreconditioner(
                spmv.permute(
                    gpuarray.to_gpu(inv_mat_diag, allocator=dev_pool.allocate)))
    else:
        precon = None

    from pycuda.sparse.cg import solve_pkt_with_cg
    print("start solve")
    for i in range(4):
        start = drv.Event()
        stop = drv.Event()
        start.record()

        rhs_gpu = gpuarray.to_gpu(rhs, dev_pool.allocate)
        res_gpu, it_count, res_count = \
                solve_pkt_with_cg(spmv, rhs_gpu, precon,
                        tol=1e-7 if spmv.dtype == numpy.float64 else 5e-5,
                        pagelocked_allocator=pagelocked_pool.allocate)
        res = res_gpu.get()

        stop.record()
        stop.synchronize()

        elapsed = stop.time_since(start) * 1e-3
        # 2 flops per nonzero per SpMV, plus vector ops per CG iteration
        est_flops = (csr_mat.nnz * 2 * (it_count + res_count)
                + csr_mat.shape[0] * (2 + 2 + 2 + 2 + 2) * it_count)

        if precon is not None:
            est_flops += csr_mat.shape[0] * it_count

        print("residual norm: %g" % (la.norm(csr_mat * res - rhs) / la.norm(rhs)))
        print(
            "size: %d, elapsed: %g s, %d it, %d residual, it/second: %g, "
            "%g gflops/s" % (csr_mat.shape[0], elapsed, it_count, res_count,
                it_count / elapsed, est_flops / elapsed / 1e9))

    # TODO: mixed precision
    # TODO: benchmark
    pagelocked_pool.stop_holding()
    dev_pool.stop_holding()
def _solve_cuda(lap_sparse, B, return_full_prob=False, maxiter=100, tol=5e-5):
    """
    solves lap_sparse X_i = B_i for each phase i, using the conjugate
    gradient method. For each pixel, the label i corresponding to the
    maximal X_i is returned.

    NOTE(review): `maxiter` is currently unused -- the GPU CG solver
    runs until `tol` is met.
    """
    print("using gpu mode")
    dev_pool = DeviceMemoryPool()
    pagelocked_pool = PageLockedMemoryPool()
    csr_mat = lap_sparse.astype(np.float32)
    inv_mat_diag = 1 / csr_mat.diagonal()
    spmv = PacketedSpMV(csr_mat, True, csr_mat.dtype)

    # BUGFIX(perf): the diagonal preconditioner depends only on the
    # matrix, not on the right-hand side, so build it (and upload the
    # inverse diagonal to the GPU) once, instead of once per phase as
    # the previous loop body did.
    precon = DiagonalPreconditioner(
        spmv.permute(
            gpuarray.to_gpu(inv_mat_diag, allocator=dev_pool.allocate)))

    # double precision warrants a tighter tolerance; loop-invariant,
    # so computed once up front
    tol = 1e-7 if spmv.dtype == np.float64 else tol

    X = []
    for i in range(len(B)):
        rhs = -B[i].astype(spmv.dtype)
        print("start solve")
        start = drv.Event()
        stop = drv.Event()
        start.record()
        rhs_gpu = gpuarray.to_gpu(rhs, dev_pool.allocate)
        res_gpu, it_count, res_count = solve_pkt_with_cg(
            spmv, rhs_gpu, precon, tol=tol,
            pagelocked_allocator=pagelocked_pool.allocate)
        res = res_gpu.get()
        stop.record()
        stop.synchronize()
        elapsed = stop.time_since(start) * 1e-3
        # 2 flops per nonzero per SpMV, plus vector ops per iteration
        est_flops = (csr_mat.nnz * 2 * (it_count + res_count) +
                     csr_mat.shape[0] * (2 + 2 + 2 + 2 + 2) * it_count)
        if precon is not None:
            est_flops += csr_mat.shape[0] * it_count
        print("size: %d, elapsed: %g s, %d it, %d residual, it/second: %g, "
              "%g gflops/s" % (csr_mat.shape[0], elapsed, it_count, res_count,
                               it_count / elapsed, est_flops / elapsed / 1e9))
        x0 = res[0]
        X.append(x0)

    pagelocked_pool.stop_holding()
    dev_pool.stop_holding()
    if not return_full_prob:
        X = np.array(X)
        X = np.argmax(X, axis=0)
    return X
def test_mempool_2(self):
    """Round-trip random sizes through bin_number()/alloc_size()."""
    from pycuda.tools import DeviceMemoryPool as DMP
    from random import randrange

    for _ in range(2000):
        # random size spread over many orders of magnitude
        size = randrange(1 << 31) >> randrange(32)
        nr = DMP.bin_number(size)
        granted = DMP.alloc_size(nr)

        # the pool may only round sizes up, and rounding must be stable
        assert granted >= size, size
        assert DMP.bin_number(granted) == nr, size
        # NOTE(review): this compares `granted` against itself and is
        # always true for positive values; presumably the intent was to
        # bound the overhead relative to the requested size -- confirm.
        assert granted < granted * (1 + 1 / 8)
def test_mempool_2(self):
    """Verify size binning is monotone and self-consistent."""
    from pycuda.tools import DeviceMemoryPool as DMP
    from random import randrange

    for _ in range(2000):
        requested = randrange(1 << 31) >> randrange(32)

        bin_idx = DMP.bin_number(requested)
        rounded = DMP.alloc_size(bin_idx)

        assert rounded >= requested, requested
        # a rounded size must land back in the same bin
        assert DMP.bin_number(rounded) == bin_idx, requested
        # NOTE(review): tautological for rounded > 0 (compares a value
        # to 1.125x itself); likely meant to bound overhead against
        # `requested` -- confirm before changing.
        assert rounded < rounded * (1 + 1 / 8)
def calculate_intensity_from_spectrum(projections, spectrum, blocksize=50):
    """Accumulate intensity and photon probability over a spectrum on the GPU.

    Args:
        projections: dict mapping material name -> 3D array; all entries
            share one shape (only the first entry's shape is inspected).
        spectrum: (N, 2) array; column 0 is energy (divided by 1000 here
            -- presumably keV to MeV, TODO confirm units), column 1 the
            relative photon count, normalized below into a pdf.
        blocksize: number of axis-0 slices sent to the GPU per round trip.

    Returns:
        (intensity, photon_prob) float32 arrays of the projection shape.
    """
    pool = DeviceMemoryPool()
    energies = spectrum[:, 0] / 1000
    pdf = spectrum[:, 1] / np.sum(spectrum[:, 1])

    # hoisted: previously this dict lookup was repeated for photon_prob
    projection_shape = projections[next(iter(projections))].shape
    num_blocks = np.ceil(projection_shape[0] / blocksize).astype(int)
    intensity = np.zeros(projection_shape, dtype=np.float32)
    photon_prob = np.zeros(projection_shape, dtype=np.float32)

    logger.info('running mass attenuation...')
    # BUGFIX(clarity): the outer index used to be `i`, which was shadowed
    # by the spectral-bin loop below; renamed to keep the two apart.
    for block in range(0, num_blocks):
        logger.debug(f"running block: {block + 1} / {num_blocks}")
        lower_i = block * blocksize
        upper_i = min([(block + 1) * blocksize, projection_shape[0]])
        block_shape = (upper_i - lower_i, projection_shape[1],
                       projection_shape[2])
        intensity_gpu = gpuarray.zeros(block_shape, dtype=np.float32,
                                       allocator=pool.allocate)
        photon_prob_gpu = gpuarray.zeros(block_shape, dtype=np.float32,
                                         allocator=pool.allocate)
        projections_gpu = {}
        for mat in projections:
            projections_gpu[mat] = gpuarray.to_gpu(
                projections[mat][lower_i:upper_i, :, :],
                allocator=pool.allocate)

        for i, _ in enumerate(pdf):
            logger.debug(f"evaluating: {i + 1} / {len(pdf)} spectral bins")
            intensity_tmp = calculate_attenuation_gpu(
                projections_gpu, energies[i], pdf[i], pool)
            # running sums: intensity weighted by 1, photon probability
            # weighted by 1/energy
            intensity_gpu = intensity_gpu.mul_add(1, intensity_tmp, 1)
            photon_prob_gpu = photon_prob_gpu.mul_add(
                1, intensity_tmp, 1 / energies[i])

        intensity[lower_i:upper_i, :, :] = intensity_gpu.get()
        photon_prob[lower_i:upper_i, :, :] = photon_prob_gpu.get()

    return intensity, photon_prob
def test_mempool(self):
    """Exercise the device pool with a rolling window of big allocations."""
    from pycuda.tools import bitlog2
    from pycuda.tools import DeviceMemoryPool

    pool = DeviceMemoryPool()
    held = []

    free, total = drv.mem_get_info()
    top = bitlog2(free)
    # allocate blocks of 1/64th to 1/16th of free memory, keeping at
    # most 10 alive at any time
    for exp in range(top - 6, top - 4):
        for _ in range(100):
            held.append(pool.allocate(1 << exp))
            if len(held) > 10:
                held.pop(0)

    del held
    pool.stop_holding()
def test_mempool(self):
    """Exercise the device pool with a rolling window of big allocations."""
    from pycuda.tools import bitlog2
    from pycuda.tools import DeviceMemoryPool

    pool = DeviceMemoryPool()
    maxlen = 10  # cap on simultaneously held allocations
    queue = []

    free, total = drv.mem_get_info()
    e0 = bitlog2(free)
    for e in range(e0 - 6, e0 - 4):
        for i in range(100):
            queue.append(pool.allocate(1 << e))
            # BUGFIX(consistency): `maxlen` was defined but unused while
            # the literal 10 was repeated here; use the named constant
            # (behavior unchanged).
            if len(queue) > maxlen:
                queue.pop(0)

    del queue
    pool.stop_holding()
def test1d(wavelet='haar', use_float32=False, depth=1, num_rows=512,
           row_size=512, iterations=20, gpu_input=False, gpu_output=False,
           gpu_mempool=False):
    """Benchmark 1D DWT/IDWT: PyWavelets (CPU) vs PycudaWaveletTransform (GPU).

    Prints timings for both implementations and the RMSE between their
    results. `gpu_input`/`gpu_output` control whether arrays live on the
    GPU at the interface; `gpu_mempool` switches the allocator to a
    DeviceMemoryPool.
    """
    try:
        dtype = numpy.float64
        if use_float32:
            dtype = numpy.float32
        # normalize the test image to roughly [-1, 1)
        img = (numpy.array(scipy.misc.ascent(), dtype=dtype) - 128.) / 128.
        resized_img = resize(img, (num_rows, row_size), mode='constant')
        if gpu_input:
            cont_input_array = numpy.ascontiguousarray(resized_img, dtype=dtype)
            img_array_gpu = gpuarray.to_gpu(cont_input_array)
        else:
            img_array_gpu = resized_img
        if gpu_mempool:
            dev_mem_pool = DeviceMemoryPool()
            gpu_alloc = dev_mem_pool.allocate
        else:
            gpu_alloc = cuda.mem_alloc
        pwt = PycudaWaveletTransform(wavelet=wavelet, use_float32=use_float32)

        # Forward Transform
        print('---------FORWARD DWT---------')
        t = time.time()
        for _ in range(iterations):
            dec_cpu = pywt.wavedec(resized_img, wavelet=wavelet,
                                   mode='periodization', level=depth)
        t = time.time() - t
        print('PyWavelets:\t\t\t\t{:.3f} ms'.format((t * 1000.) / iterations))
        t = time.time()
        for _ in range(iterations):
            dec_gpu = pwt.dwt1d(img_array_gpu, depth=depth,
                                gpu_output=gpu_output, gpu_allocator=gpu_alloc)
        t = time.time() - t
        print('PycudaWaveletTransform:\t{:.3f} ms'.format((t * 1000.) / iterations))

        # concatenate all decomposition levels for a single RMSE figure
        for i, (d1, d2) in enumerate(zip(dec_gpu, dec_cpu)):
            if i == 0:
                result1 = d1.get() if gpu_output else d1
                result2 = d2
            else:
                result1 = numpy.concatenate(
                    (result1, d1.get() if gpu_output else d1), axis=1)
                result2 = numpy.concatenate((result2, d2), axis=1)
        print('RMSE: {} \n'.format(rmse(result1, result2)))

        dec_cpu_g = []
        if gpu_input:
            for d in dec_cpu:
                cont_array = numpy.ascontiguousarray(d, dtype=dtype)
                dec_cpu_g.append(gpuarray.to_gpu(cont_array))
        else:
            dec_cpu_g = dec_cpu

        # Inverse Transform
        print('---------INVERSE DWT---------')
        t = time.time()
        for _ in range(iterations):
            rec_cpu = pywt.waverec(dec_cpu, wavelet=wavelet, mode='periodization')
        t = time.time() - t
        print('PyWavelets:\t\t\t\t{:.3f} ms'.format((t * 1000.) / iterations))
        t = time.time()
        for _ in range(iterations):
            rec_gpu = pwt.idwt1d(dec_cpu_g, gpu_output=gpu_output,
                                 gpu_allocator=gpu_alloc)
        t = time.time() - t
        print('PycudaWaveletTransform:\t{:.3f} ms'.format((t * 1000.) / iterations))
        print('RMSE: {} '.format(
            rmse(rec_gpu.get() if gpu_output else rec_gpu, rec_cpu)))
        if gpu_mempool:
            dev_mem_pool.stop_holding()
    except Exception:
        tb = traceback.format_exc()
        # BUGFIX: was print("%s", tb), which printed a literal "%s"
        # before the traceback
        print(tb)
ctx = current_dev.make_context() #make a working context ctx.push() #let context make the lead from pycuda.compiler import SourceModule from pycuda.tools import DeviceMemoryPool from scikits.cuda.cublas import cublasSgemv from pycuda.elementwise import ElementwiseKernel from pycuda import cumath _dropout_kernel = None _saltpepper_kernel = None _rng_state = None _rng_blocks = 128 _rng_threads = 128 _mempool = DeviceMemoryPool() def init_rng(seed): global _dropout_kernel, _saltpepper_kernel, _rng_state, _rng_threads, _rng_blocks from pycuda.characterize import sizeof ds = sizeof("curandState", "#include <curand_kernel.h>") _rng_state = drv.mem_alloc(_rng_threads * _rng_blocks * ds) src = SourceModule(''' #include <curand_kernel.h> extern "C" { __global__ void setup_rng(curandState* rng_state, const unsigned seed) {
def __init__(self, mesh, poissonsolver, context, gradient=make_GPU_gradient,
             optimize_meshing_memory=True, memory_pool=None):
    '''Mesh sizes need to be powers of 2 in x (and y if it exists).

    The argument memory_pool can be used to provide a GPU pool for
    memory allocation. If it is None (default), then a new
    DeviceMemoryPool() is used.

    BUGFIX: the default used to be `memory_pool=DeviceMemoryPool()`,
    which Python evaluates once at import time -- allocating a GPU pool
    on import and silently sharing it between all instances, and
    contradicting the docstring above. The pool is now created lazily,
    as documented.
    '''
    self.mesh = mesh
    self.optimize_meshing_memory = optimize_meshing_memory
    self._context = context
    self.poissonsolver = poissonsolver
    self.kernel_call_config = {
        'p2m': {
            'block': (16, 16, 1),
            #'grid': (-1, 1, 1) # adapt to number of particles!
            'grid': (0, 1, 1)  # adapt to number of particles!
        },
        'm2p': {
            'block': (16, 16, 1),
            #'grid': (-1, 1, 1) # adapt to number of particles!
            'grid': (0, 1, 1)  # adapt to number of particles!
        },
        'sorted_p2m': {
            'block': (256, 1, 1),
            #'grid': (self.mesh.n_nodes//256, 1, 1)
            'grid': (idivup(self.mesh.n_nodes, 256), 1, 1)
        }
    }
    if memory_pool is None:
        memory_pool = DeviceMemoryPool()
    self._mempool = memory_pool

    # load kernels
    with open(where + 'p2m/p2m_kernels.cu') as stream:
        source = stream.read()
    p2m_kernels = SourceModule(source)
    with open(where + 'p2m/p2m_kernels_inclmeshing.cu') as stream:
        source = stream.read()
    p2m_kernels_inclmeshing = SourceModule(source)
    with open(where + 'm2p/m2p_kernels.cu') as stream:
        source = stream.read()
    m2p_kernels = SourceModule(source)
    with open(where + 'm2p/m2p_kernels_inclmeshing.cu') as stream:
        source = stream.read()
    m2p_kernels_inclmeshing = SourceModule(source)

    # initialize in init because otherwise it tries to compile even if
    # no instance of the class is created -> errors if you import the
    # module without having a running pycuda context.
    self._gradient = gradient(mesh, context)

    # depending on the dimension, the correct functions are loaded
    nd = str(mesh.dimension)
    self._particles_to_mesh_kernel = (
        p2m_kernels.get_function('particles_to_mesh_' + nd + 'd'))
    self._particles_to_mesh_64atomics_kernel = (
        # double precision atomics, slower
        p2m_kernels.get_function('particles_to_mesh_' + nd + 'd_64atomics'))
    self._p2m_inclmeshing_32atomics_kernel = (
        p2m_kernels_inclmeshing.get_function(
            'p2m_rectmesh' + nd + 'd_32atomics'))
    self._p2m_inclmeshing_64atomics_kernel = (
        p2m_kernels_inclmeshing.get_function(
            'p2m_rectmesh' + nd + 'd_64atomics'))
    self._sorted_particles_to_guard_mesh_kernel = (
        p2m_kernels.get_function('cic_guard_cell_weights_' + nd + 'd'))
    self._join_guard_cells_kernel = (
        p2m_kernels.get_function('join_guard_cells_' + nd + 'd'))
    self._mesh_to_particles_kernel = (
        m2p_kernels.get_function('mesh_to_particles_' + nd + 'd'))
    self._field_to_particles_kernel = (
        m2p_kernels.get_function('field_to_particles_' + nd + 'd'))
    self._m2p_scalar_inclmeshing_kernel = (
        m2p_kernels_inclmeshing.get_function(
            'm2p_rectmesh' + nd + 'd_scalar'))
    self._m2p_vector_inclmeshing_kernel = (
        m2p_kernels_inclmeshing.get_function(
            'm2p_rectmesh' + nd + 'd_vector'))

    # prepare calls to kernels!!!
    dim = mesh.dimension
    self._particles_to_mesh_kernel.prepare(
        'i' + 'P' + 'i' * dim + 'P' * 2**dim + 'P' * dim)
    self._particles_to_mesh_64atomics_kernel.prepare(
        'i' + 'P' + 'i' * dim + 'P' * 2**dim + 'P' * dim)
    self._p2m_inclmeshing_32atomics_kernel.prepare(
        'i' + 'P' * dim + 'd' * dim * 2 + 'i' * dim + 'P')
    self._p2m_inclmeshing_64atomics_kernel.prepare(
        'i' + 'P' * dim + 'd' * dim * 2 + 'i' * dim + 'P')
    self._field_to_particles_kernel.prepare(
        'i' + 'PP' + 'i' * dim + 'P' * 2**dim + 'P' * dim)
    self._mesh_to_particles_kernel.prepare(
        'i' + 'P' * dim * 2 + 'i' * dim + 'P' * 2**dim + 'P' * dim)
    self._sorted_particles_to_guard_mesh_kernel.prepare(
        'P' * dim + 'd' * 2 * dim + 'i' * (dim - 1) + 'i' + 'PP'
        + 'P' * 2**dim)
    self._join_guard_cells_kernel.prepare(
        'P' * 2**dim + 'i' + 'i' * dim + 'P')
    self._m2p_scalar_inclmeshing_kernel.prepare(
        'i' + 'P' * dim + 'd' * dim * 2 + 'i' * dim + 'P' * 2)
    self._m2p_vector_inclmeshing_kernel.prepare(
        'i' + 'P' * dim + 'd' * dim * 2 + 'i' * dim + 'P' * dim * 2)
def main_cg():
    """Benchmark packeted sparse CG solves on a Matrix Market file.

    Parses the matrix filename (and optional symmetry flag) from the
    command line, runs four timed CG solves with a diagonal
    preconditioner and prints residual norms and estimated gflops.

    FIX(consistency): converted Python-2 print statements to print()
    calls, as used everywhere else in this file; output is unchanged.
    """
    from optparse import OptionParser
    parser = OptionParser(
            usage="%prog [options] MATRIX-MARKET-FILE")
    parser.add_option("-s", "--is-symmetric", action="store_true",
            help="Specify that the input matrix is already symmetric")
    options, args = parser.parse_args()

    from pycuda.tools import DeviceMemoryPool, PageLockedMemoryPool
    dev_pool = DeviceMemoryPool()
    pagelocked_pool = PageLockedMemoryPool()

    from scipy.io import mmread
    csr_mat = mmread(args[0]).tocsr().astype(numpy.float32)

    inv_mat_diag = 1/csr_mat.diagonal()

    print("building...")
    from pycuda.sparse.packeted import PacketedSpMV
    spmv = PacketedSpMV(csr_mat, options.is_symmetric, csr_mat.dtype)
    rhs = numpy.random.rand(spmv.shape[0]).astype(spmv.dtype)

    from pycuda.sparse.operator import DiagonalPreconditioner
    if True:
        precon = DiagonalPreconditioner(
                spmv.permute(gpuarray.to_gpu(
                    inv_mat_diag, allocator=dev_pool.allocate)))
    else:
        precon = None

    from pycuda.sparse.cg import solve_pkt_with_cg
    print("start solve")
    for i in range(4):
        start = drv.Event()
        stop = drv.Event()
        start.record()

        rhs_gpu = gpuarray.to_gpu(rhs, dev_pool.allocate)
        res_gpu, it_count, res_count = \
                solve_pkt_with_cg(spmv, rhs_gpu, precon,
                        tol=1e-7 if spmv.dtype == numpy.float64 else 5e-5,
                        pagelocked_allocator=pagelocked_pool.allocate)
        res = res_gpu.get()

        stop.record()
        stop.synchronize()

        elapsed = stop.time_since(start)*1e-3
        # 2 flops per nonzero per SpMV, plus vector ops per CG iteration
        est_flops = (csr_mat.nnz*2*(it_count+res_count)
                + csr_mat.shape[0]*(2+2+2+2+2)*it_count)

        if precon is not None:
            est_flops += csr_mat.shape[0] * it_count

        print("residual norm: %g" % (la.norm(csr_mat*res - rhs)/la.norm(rhs)))
        print("size: %d, elapsed: %g s, %d it, %d residual, it/second: %g, "
                "%g gflops/s" % (
                    csr_mat.shape[0], elapsed, it_count, res_count,
                    it_count/elapsed, est_flops/elapsed/1e9))

    # TODO: mixed precision
    # TODO: benchmark
    pagelocked_pool.stop_holding()
    dev_pool.stop_holding()
def test2d(wavelet='haar', use_float32=False, depth=1, num_slices=1,
           row_size=512, col_size=512, iterations=20, gpu_input=False,
           gpu_output=False, gpu_mempool=False):
    """Benchmark 2D DWT/IDWT: PyWavelets (CPU) vs PycudaWaveletTransform (GPU).

    Builds a stack of `num_slices` identical test images, times both
    implementations over `iterations` runs and prints the RMSE between
    their results.
    """
    try:
        dtype = numpy.float64
        if use_float32:
            dtype = numpy.float32

        # Prepare Image Array (normalized to roughly [-1, 1))
        img = (numpy.array(scipy.misc.ascent(), dtype=dtype) - 128.) / 128.
        resized_img = resize(img, (col_size, row_size), mode='constant')
        img_array = numpy.empty([num_slices, col_size, row_size], dtype=dtype)
        for s in range(num_slices):
            img_array[s, :, :] = resized_img[:, :]
        if gpu_input:
            cont_input_array = numpy.ascontiguousarray(img_array, dtype=dtype)
            img_array_gpu = gpuarray.to_gpu(cont_input_array)
        else:
            img_array_gpu = img_array
        if gpu_mempool:
            dev_mem_pool = DeviceMemoryPool()
            gpu_alloc = dev_mem_pool.allocate
        else:
            gpu_alloc = cuda.mem_alloc
        pwt = PycudaWaveletTransform(wavelet=wavelet, use_float32=use_float32)

        # Forward Transform
        print('---------FORWARD 2D DWT---------')
        t = time.time()
        for _ in range(iterations):
            dec_cpu = [
                pywt.wavedec2(img_array[s], wavelet=wavelet,
                              mode='periodization', level=depth)
                for s in range(num_slices)
            ]
        t = time.time() - t
        print('PyWavelets:\t\t\t\t{:.3f} ms'.format((t * 1000.) / iterations))
        t = time.time()
        for _ in range(iterations):
            dec_gpu = pwt.dwt2d(img_array_gpu, depth=depth,
                                gpu_output=gpu_output, gpu_allocator=gpu_alloc)
        t = time.time() - t
        print('PycudaWaveletTransform:\t{:.3f} ms'.format(
            (t * 1000.) / iterations))

        # restack the per-slice CPU results to match the GPU layout
        dec_cpu_g = []
        for ig, vg in enumerate(dec_gpu):
            if ig == 0:
                a = numpy.empty_like(vg.get() if gpu_output else vg)
                for ic, vc in enumerate(dec_cpu):
                    a[ic, :, :] = vc[0]
                dec_cpu_g.append(a)
            else:
                dl = []
                # FIX: loop variable renamed from `id`, which shadowed
                # the builtin
                for idx, vd in enumerate(vg):
                    d = numpy.empty_like(vd.get() if gpu_output else vd)
                    for ic, vc in enumerate(dec_cpu):
                        d[ic, :, :] = vc[ig][idx]
                    dl.append(d)
                dec_cpu_g.append(dl)
        for i, (d1, d2) in enumerate(zip(dec_gpu, dec_cpu_g)):
            if i == 0:
                result1 = d1.get().flatten() if gpu_output else d1.flatten()
                result2 = d2.flatten()
            else:
                for d in d1:
                    result1 = numpy.concatenate(
                        (result1,
                         d.get().flatten() if gpu_output else d.flatten()))
                for d in d2:
                    result2 = numpy.concatenate((result2, d.flatten()))
        print('RMSE: {} \n'.format(rmse(result1, result2)))

        if gpu_input:
            for ig, vg in enumerate(dec_cpu_g):
                if ig == 0:
                    cont_array = numpy.ascontiguousarray(vg, dtype=dtype)
                    dec_cpu_g[ig] = gpuarray.to_gpu(cont_array)
                else:
                    for idx, vd in enumerate(vg):
                        cont_array = numpy.ascontiguousarray(vd, dtype=dtype)
                        dec_cpu_g[ig][idx] = gpuarray.to_gpu(cont_array)

        # Inverse Transform
        print('---------INVERSE 2D DWT---------')
        t = time.time()
        for _ in range(iterations):
            rec_cpu = [
                pywt.waverec2(d, wavelet=wavelet, mode='periodization')
                for d in dec_cpu
            ]
        t = time.time() - t
        print('PyWavelets:\t\t\t\t{:.3f} ms'.format((t * 1000.) / iterations))
        t = time.time()
        for _ in range(iterations):
            rec_gpu = pwt.idwt2d(dec_cpu_g, gpu_output=gpu_output,
                                 gpu_allocator=gpu_alloc)
        t = time.time() - t
        print('PycudaWaveletTransform:\t{:.3f} ms'.format(
            (t * 1000.) / iterations))
        rec_cpu_g = numpy.empty_like(rec_gpu.get() if gpu_output else rec_gpu)
        for ic, vc in enumerate(rec_cpu):
            rec_cpu_g[ic, :, :] = vc
        print('RMSE: {} '.format(
            rmse(rec_gpu.get() if gpu_output else rec_gpu, rec_cpu_g)))
        if gpu_mempool:
            dev_mem_pool.stop_holding()
    except Exception:
        tb = traceback.format_exc()
        # BUGFIX: was print("%s", tb), which printed a literal "%s"
        # before the traceback
        print(tb)
def init(self):
    """Set up the device-side memory pool used for GPU allocations."""
    # NOTE(review): presumably requires an active CUDA context -- confirm.
    self._memory_pool = DeviceMemoryPool()