def main(): from pycuda.tools import DeviceMemoryPool, PageLockedMemoryPool dev_pool = DeviceMemoryPool() pagelocked_pool = PageLockedMemoryPool() from scipy.io import mmread csr_mat = mmread(args[0]).tocsr().astype(numpy.float32) inv_mat_diag = 1 / csr_mat.diagonal() print "building..." from pycuda.sparse.packeted import PacketedSpMV spmv = PacketedSpMV(csr_mat, options.is_symmetric, csr_mat.dtype) rhs = numpy.random.rand(spmv.shape[0]).astype(spmv.dtype) from pycuda.sparse.operator import DiagonalPreconditioner if True: precon = DiagonalPreconditioner( spmv.permute( gpuarray.to_gpu(inv_mat_diag, allocator=dev_pool.allocate))) else: precon = None from pycuda.sparse.cg import solve_pkt_with_cg print "start solve" for i in range(4): start = drv.Event() stop = drv.Event() start.record() rhs_gpu = gpuarray.to_gpu(rhs, dev_pool.allocate) res_gpu, it_count, res_count = \ solve_pkt_with_cg(spmv, rhs_gpu, precon, tol=1e-7 if spmv.dtype == numpy.float64 else 5e-5, pagelocked_allocator=pagelocked_pool.allocate) res = res_gpu.get() stop.record() stop.synchronize() elapsed = stop.time_since(start) * 1e-3 est_flops = (csr_mat.nnz * 2 * (it_count + res_count) + csr_mat.shape[0] * (2 + 2 + 2 + 2 + 2) * it_count) if precon is not None: est_flops += csr_mat.shape[0] * it_count print "residual norm: %g" % (la.norm(csr_mat * res - rhs) / la.norm(rhs)) print( "size: %d, elapsed: %g s, %d it, %d residual, it/second: %g, " "%g gflops/s" % (csr_mat.shape[0], elapsed, it_count, res_count, it_count / elapsed, est_flops / elapsed / 1e9)) # TODO: mixed precision # TODO: benchmark pagelocked_pool.stop_holding() dev_pool.stop_holding()
def _solve_cuda(lap_sparse, B, return_full_prob=False, maxiter=100, tol=5e-5):
    """Solve ``lap_sparse X_i = -B_i`` for each phase i on the GPU.

    Uses a diagonally preconditioned conjugate-gradient solver from
    pycuda.sparse.  For each pixel the label i corresponding to the
    maximal X_i is returned, unless ``return_full_prob`` is set, in which
    case the list of per-phase results is returned unreduced.

    Parameters
    ----------
    lap_sparse : scipy.sparse matrix
        The Laplacian; converted to float32 before building the SpMV.
    B : sequence
        One right-hand side per phase (negated before solving).
    return_full_prob : bool
        If True, skip the argmax reduction over phases.
    maxiter : int
        Unused here; kept for signature compatibility with callers.
    tol : float
        CG tolerance; tightened to 1e-7 when the dtype is float64.
    """
    print("using gpu mode")
    dev_pool = DeviceMemoryPool()
    pagelocked_pool = PageLockedMemoryPool()

    csr_mat = lap_sparse.astype(np.float32)
    inv_mat_diag = 1 / csr_mat.diagonal()
    spmv = PacketedSpMV(csr_mat, True, csr_mat.dtype)

    # The preconditioner depends only on the matrix diagonal, so build it
    # once instead of once per right-hand side: it was loop-invariant in
    # the original (one GPU upload + permutation per phase, wasted).
    precon = DiagonalPreconditioner(
        spmv.permute(
            gpuarray.to_gpu(inv_mat_diag, allocator=dev_pool.allocate)))

    # Loop-invariant as well: the dtype never changes between phases.
    tol = 1e-7 if spmv.dtype == np.float64 else tol

    X = []
    for i in range(len(B)):
        rhs = -B[i].astype(spmv.dtype)

        print("start solve")
        start = drv.Event()
        stop = drv.Event()
        start.record()

        rhs_gpu = gpuarray.to_gpu(rhs, dev_pool.allocate)
        res_gpu, it_count, res_count = solve_pkt_with_cg(
            spmv, rhs_gpu, precon, tol=tol,
            pagelocked_allocator=pagelocked_pool.allocate)
        res = res_gpu.get()

        stop.record()
        stop.synchronize()

        # Event timing is in milliseconds; convert to seconds.
        elapsed = stop.time_since(start) * 1e-3
        # 2 flops per nonzero per SpMV, plus the CG vector operations.
        est_flops = (csr_mat.nnz * 2 * (it_count + res_count)
                     + csr_mat.shape[0] * (2 + 2 + 2 + 2 + 2) * it_count)
        if precon is not None:
            est_flops += csr_mat.shape[0] * it_count

        print("size: %d, elapsed: %g s, %d it, %d residual, it/second: %g, "
              "%g gflops/s" % (csr_mat.shape[0], elapsed, it_count, res_count,
                               it_count / elapsed, est_flops / elapsed / 1e9))

        # NOTE(review): only the first component of the solution vector is
        # kept here — verify against callers; keeping the whole ``res``
        # looks more plausible for a per-pixel argmax.  Behavior preserved
        # from the original.
        x0 = res[0]
        X.append(x0)

    pagelocked_pool.stop_holding()
    dev_pool.stop_holding()

    if not return_full_prob:
        X = np.array(X)
        X = np.argmax(X, axis=0)
    return X
def test_mempool(self):
    """Exercise DeviceMemoryPool with a rolling window of large allocations."""
    from pycuda.tools import DeviceMemoryPool, bitlog2

    pool = DeviceMemoryPool()
    held = []

    free_bytes, _total_bytes = drv.mem_get_info()
    top = bitlog2(free_bytes)

    # Repeatedly allocate blocks of two nearby power-of-two sizes while
    # keeping no more than ten of them alive at any moment.
    for exponent in range(top - 6, top - 4):
        block_size = 1 << exponent
        for _ in range(100):
            held.append(pool.allocate(block_size))
            if len(held) > 10:
                held.pop(0)

    del held
    pool.stop_holding()
def test_mempool(self):
    """Exercise DeviceMemoryPool with a bounded queue of large allocations.

    Allocates blocks of two power-of-two sizes near the free-memory size,
    holding at most ``maxlen`` of them alive at any time.
    """
    from pycuda.tools import bitlog2
    from pycuda.tools import DeviceMemoryPool
    pool = DeviceMemoryPool()
    maxlen = 10
    queue = []
    free, total = drv.mem_get_info()
    e0 = bitlog2(free)
    for e in range(e0-6, e0-4):
        for i in range(100):
            queue.append(pool.allocate(1<<e))
            # Bound the window using the named constant; the original
            # defined ``maxlen`` but then compared against a literal 10.
            if len(queue) > maxlen:
                queue.pop(0)
    del queue
    pool.stop_holding()
def test1d(wavelet='haar', use_float32=False, depth=1, num_rows=512,
           row_size=512, iterations=20, gpu_input=False, gpu_output=False,
           gpu_mempool=False):
    """Benchmark 1D DWT/IDWT: PyWavelets (CPU) vs PycudaWaveletTransform (GPU).

    Prints per-implementation timings and the RMSE between CPU and GPU
    results for both the forward and inverse transforms.
    """
    try:
        dtype = numpy.float64
        if use_float32:
            dtype = numpy.float32

        # Normalize the test image to roughly [-1, 1) and resize it to the
        # benchmark shape.
        img = (numpy.array(scipy.misc.ascent(), dtype=dtype) - 128.) / 128.
        resized_img = resize(img, (num_rows, row_size), mode='constant')

        if gpu_input:
            cont_input_array = numpy.ascontiguousarray(resized_img, dtype=dtype)
            img_array_gpu = gpuarray.to_gpu(cont_input_array)
        else:
            img_array_gpu = resized_img

        if gpu_mempool:
            dev_mem_pool = DeviceMemoryPool()
            gpu_alloc = dev_mem_pool.allocate
        else:
            gpu_alloc = cuda.mem_alloc

        pwt = PycudaWaveletTransform(wavelet=wavelet, use_float32=use_float32)

        # Forward Transform
        print('---------FORWARD DWT---------')
        t = time.time()
        for _ in range(iterations):
            dec_cpu = pywt.wavedec(resized_img, wavelet=wavelet,
                                   mode='periodization', level=depth)
        t = time.time()-t
        print('PyWavelets:\t\t\t\t{:.3f} ms'.format((t*1000.)/iterations))

        t = time.time()
        for _ in range(iterations):
            dec_gpu = pwt.dwt1d(img_array_gpu, depth=depth,
                                gpu_output=gpu_output, gpu_allocator=gpu_alloc)
        t = time.time()-t
        print('PycudaWaveletTransform:\t{:.3f} ms'.format((t*1000.)/iterations))

        # Concatenate all levels so CPU and GPU results can be compared in one
        # RMSE figure.
        for i, (d1, d2) in enumerate(zip(dec_gpu, dec_cpu)):
            if i == 0:
                result1 = d1.get() if gpu_output else d1
                result2 = d2
            else:
                result1 = numpy.concatenate(
                    (result1, d1.get() if gpu_output else d1), axis=1)
                result2 = numpy.concatenate((result2, d2), axis=1)
        print('RMSE: {} \n'.format(rmse(result1, result2)))

        # Optionally move the CPU coefficients to the GPU to feed the inverse
        # transform.
        dec_cpu_g = []
        if gpu_input:
            for d in dec_cpu:
                cont_array = numpy.ascontiguousarray(d, dtype=dtype)
                dec_cpu_g.append(gpuarray.to_gpu(cont_array))
        else:
            dec_cpu_g = dec_cpu

        # Inverse Transform
        print('---------INVERSE DWT---------')
        t = time.time()
        for _ in range(iterations):
            rec_cpu = pywt.waverec(dec_cpu, wavelet=wavelet,
                                   mode='periodization')
        t = time.time()-t
        print('PyWavelets:\t\t\t\t{:.3f} ms'.format((t*1000.)/iterations))

        t = time.time()
        for _ in range(iterations):
            rec_gpu = pwt.idwt1d(dec_cpu_g, gpu_output=gpu_output,
                                 gpu_allocator=gpu_alloc)
        t = time.time()-t
        print('PycudaWaveletTransform:\t{:.3f} ms'.format((t*1000.)/iterations))
        print('RMSE: {} '.format(
            rmse(rec_gpu.get() if gpu_output else rec_gpu, rec_cpu)))

        if gpu_mempool:
            dev_mem_pool.stop_holding()
    except Exception:
        # Print the actual traceback; the original ``print("%s", tb)`` passed
        # two arguments to print(), emitting a literal "%s" before the text.
        tb = traceback.format_exc()
        print(tb)
def main_cg():
    # Benchmark driver: read a Matrix Market file and solve A x = rhs with
    # packeted CG on the GPU, reporting timings and estimated flop rates.
    from optparse import OptionParser
    parser = OptionParser(
            usage="%prog [options] MATRIX-MARKET-FILE")
    parser.add_option("-s", "--is-symmetric", action="store_true",
            help="Specify that the input matrix is already symmetric")
    options, args = parser.parse_args()

    from pycuda.tools import DeviceMemoryPool, PageLockedMemoryPool
    dev_pool = DeviceMemoryPool()
    pagelocked_pool = PageLockedMemoryPool()

    from scipy.io import mmread
    csr_mat = mmread(args[0]).tocsr().astype(numpy.float32)

    # Jacobi (diagonal) preconditioner data.
    inv_mat_diag = 1/csr_mat.diagonal()

    print "building..."
    from pycuda.sparse.packeted import PacketedSpMV
    spmv = PacketedSpMV(csr_mat, options.is_symmetric, csr_mat.dtype)
    # Random right-hand side; only timing/convergence behavior matters here.
    rhs = numpy.random.rand(spmv.shape[0]).astype(spmv.dtype)

    from pycuda.sparse.operator import DiagonalPreconditioner
    if True:
        # The preconditioner vector must be permuted into the SpMV's
        # packet ordering before use.
        precon = DiagonalPreconditioner(
                spmv.permute(gpuarray.to_gpu(
                    inv_mat_diag, allocator=dev_pool.allocate)))
    else:
        precon = None

    from pycuda.sparse.cg import solve_pkt_with_cg
    print "start solve"
    # Run the solve several times so timings settle (warm caches/pools).
    for i in range(4):
        start = drv.Event()
        stop = drv.Event()
        start.record()

        rhs_gpu = gpuarray.to_gpu(rhs, dev_pool.allocate)
        res_gpu, it_count, res_count = \
                solve_pkt_with_cg(spmv, rhs_gpu, precon,
                        tol=1e-7 if spmv.dtype == numpy.float64 else 5e-5,
                        pagelocked_allocator=pagelocked_pool.allocate)
        res = res_gpu.get()

        stop.record()
        stop.synchronize()

        # Event timing is in milliseconds; convert to seconds.
        elapsed = stop.time_since(start)*1e-3
        # 2 flops per nonzero per SpMV, plus the CG vector operations.
        est_flops = (csr_mat.nnz*2*(it_count+res_count)
                + csr_mat.shape[0]*(2+2+2+2+2)*it_count)

        if precon is not None:
            est_flops += csr_mat.shape[0] * it_count

        print "residual norm: %g" % (la.norm(csr_mat*res - rhs)/la.norm(rhs))
        print ("size: %d, elapsed: %g s, %d it, %d residual, it/second: %g, "
                "%g gflops/s" % (
                    csr_mat.shape[0], elapsed, it_count, res_count,
                    it_count/elapsed, est_flops/elapsed/1e9))

    # TODO: mixed precision
    # TODO: benchmark
    pagelocked_pool.stop_holding()
    dev_pool.stop_holding()
def test2d(wavelet='haar', use_float32=False, depth=1, num_slices=1,
           row_size=512, col_size=512, iterations=20, gpu_input=False,
           gpu_output=False, gpu_mempool=False):
    """Benchmark 2D DWT/IDWT: PyWavelets (CPU) vs PycudaWaveletTransform (GPU).

    Builds a stack of ``num_slices`` identical test images, runs forward and
    inverse transforms on both implementations, and prints timings plus the
    RMSE between the CPU and GPU results.
    """
    try:
        dtype = numpy.float64
        if use_float32:
            dtype = numpy.float32

        # Prepare Image Array: normalize to roughly [-1, 1) and replicate
        # the resized image across all slices.
        img = (numpy.array(scipy.misc.ascent(), dtype=dtype) - 128.) / 128.
        resized_img = resize(img, (col_size, row_size), mode='constant')
        img_array = numpy.empty([num_slices, col_size, row_size], dtype=dtype)
        for s in range(num_slices):
            img_array[s, :, :] = resized_img[:, :]

        if gpu_input:
            cont_input_array = numpy.ascontiguousarray(img_array, dtype=dtype)
            img_array_gpu = gpuarray.to_gpu(cont_input_array)
        else:
            img_array_gpu = img_array

        if gpu_mempool:
            dev_mem_pool = DeviceMemoryPool()
            gpu_alloc = dev_mem_pool.allocate
        else:
            gpu_alloc = cuda.mem_alloc

        pwt = PycudaWaveletTransform(wavelet=wavelet, use_float32=use_float32)

        # Forward Transform
        print('---------FORWARD 2D DWT---------')
        t = time.time()
        for _ in range(iterations):
            dec_cpu = [
                pywt.wavedec2(img_array[s], wavelet=wavelet,
                              mode='periodization', level=depth)
                for s in range(num_slices)
            ]
        t = time.time() - t
        print('PyWavelets:\t\t\t\t{:.3f} ms'.format((t * 1000.) / iterations))

        t = time.time()
        for _ in range(iterations):
            dec_gpu = pwt.dwt2d(img_array_gpu, depth=depth,
                                gpu_output=gpu_output, gpu_allocator=gpu_alloc)
        t = time.time() - t
        print('PycudaWaveletTransform:\t{:.3f} ms'.format(
            (t * 1000.) / iterations))

        # Re-pack the per-slice CPU coefficients into the GPU layout
        # (one array per level, slices stacked along axis 0).
        dec_cpu_g = []
        for ig, vg in enumerate(dec_gpu):
            if ig == 0:
                a = numpy.empty_like(vg.get() if gpu_output else vg)
                for ic, vc in enumerate(dec_cpu):
                    a[ic, :, :] = vc[0]
                dec_cpu_g.append(a)
            else:
                dl = []
                # Loop variable renamed from ``id`` (shadowed the builtin).
                for idx, vd in enumerate(vg):
                    d = numpy.empty_like(vd.get() if gpu_output else vd)
                    for ic, vc in enumerate(dec_cpu):
                        d[ic, :, :] = vc[ig][idx]
                    dl.append(d)
                dec_cpu_g.append(dl)

        # Flatten everything so CPU and GPU results compare in one RMSE.
        for i, (d1, d2) in enumerate(zip(dec_gpu, dec_cpu_g)):
            if i == 0:
                result1 = d1.get().flatten() if gpu_output else d1.flatten()
                result2 = d2.flatten()
            else:
                for d in d1:
                    result1 = numpy.concatenate(
                        (result1,
                         d.get().flatten() if gpu_output else d.flatten()))
                for d in d2:
                    result2 = numpy.concatenate((result2, d.flatten()))
        print('RMSE: {} \n'.format(rmse(result1, result2)))

        # Optionally move the repacked CPU coefficients onto the GPU to feed
        # the inverse transform.
        if gpu_input:
            for ig, vg in enumerate(dec_cpu_g):
                if ig == 0:
                    cont_array = numpy.ascontiguousarray(vg, dtype=dtype)
                    dec_cpu_g[ig] = gpuarray.to_gpu(cont_array)
                else:
                    for idx, vd in enumerate(vg):
                        cont_array = numpy.ascontiguousarray(vd, dtype=dtype)
                        dec_cpu_g[ig][idx] = gpuarray.to_gpu(cont_array)

        # Inverse Transform
        print('---------INVERSE 2D DWT---------')
        t = time.time()
        for _ in range(iterations):
            rec_cpu = [
                pywt.waverec2(d, wavelet=wavelet, mode='periodization')
                for d in dec_cpu
            ]
        t = time.time() - t
        print('PyWavelets:\t\t\t\t{:.3f} ms'.format((t * 1000.) / iterations))

        t = time.time()
        for _ in range(iterations):
            rec_gpu = pwt.idwt2d(dec_cpu_g, gpu_output=gpu_output,
                                 gpu_allocator=gpu_alloc)
        t = time.time() - t
        print('PycudaWaveletTransform:\t{:.3f} ms'.format(
            (t * 1000.) / iterations))

        rec_cpu_g = numpy.empty_like(rec_gpu.get() if gpu_output else rec_gpu)
        for ic, vc in enumerate(rec_cpu):
            rec_cpu_g[ic, :, :] = vc
        print('RMSE: {} '.format(
            rmse(rec_gpu.get() if gpu_output else rec_gpu, rec_cpu_g)))

        if gpu_mempool:
            dev_mem_pool.stop_holding()
    except Exception:
        # Print the actual traceback; the original ``print("%s", tb)`` passed
        # two arguments to print(), emitting a literal "%s" before the text.
        tb = traceback.format_exc()
        print(tb)