def setup_texture_gpuarr(tex_ref, arr): _arr = arr.astype(np.float32) if arr.dtype != np.float32 else arr tex_ref.set_array(cuda.gpuarray_to_array(_arr, "C")) tex_ref.set_address_mode(0, cuda.address_mode.WRAP) tex_ref.set_address_mode(1, cuda.address_mode.WRAP) tex_ref.set_filter_mode(cuda.filter_mode.POINT) tex_ref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
def update_ref(self): """Needs to be called after self.img_d has been written directly""" self.debug(3, "Updating original image") self.array = cuda.gpuarray_to_array(self.devRef, 'C') # 'C' order implies tex2D(x,y) will fetch matrix(y,x): # this is where x and y are inverted to comlpy with the kernels order self.tex.set_array(self.array) self._computeGradients() self._ready = False
def updateOrig(self): """ Needs to be called after self.img_d has been written directly (without using setOrig) """ self.debug(2,"Updating original image") self.array = cuda.gpuarray_to_array(self.devOrig,"C") self.tex.set_array(self.array) self.__computeGradients() self.__ready = False
def test_2d_fp_surfaces(self): orden = "C" npoints = 32 for prec in [np.int16,np.float32,np.float64,np.complex64,np.complex128]: prec_str = dtype_to_ctype(prec) if prec == np.complex64: fpName_str = 'fp_tex_cfloat' elif prec == np.complex128: fpName_str = 'fp_tex_cdouble' elif prec == np.float64: fpName_str = 'fp_tex_double' else: fpName_str = prec_str A_cpu = np.zeros([npoints,npoints],order=orden,dtype=prec) A_cpu[:] = np.random.rand(npoints,npoints)[:] A_gpu = gpuarray.to_gpu(A_cpu) # Array randomized myKernRW = ''' #include <pycuda-helpers.hpp> surface<void, cudaSurfaceType2DLayered> mtx_tex; __global__ void copy_texture(cuPres *dest, int rw) { int row = blockIdx.x*blockDim.x + threadIdx.x; int col = blockIdx.y*blockDim.y + threadIdx.y; int layer = 1; int tid = row + col*blockDim.x*gridDim.x ; if (rw==0){ cuPres aux = dest[tid]; fp_surf2DLayeredwrite(aux, mtx_tex, row, col, layer,cudaBoundaryModeClamp);} else { cuPres aux = 0; fp_surf2DLayeredread(&aux, mtx_tex, col, row, layer, cudaBoundaryModeClamp); dest[tid] = aux; } } ''' myKernRW = myKernRW.replace('fpName',fpName_str) myKernRW = myKernRW.replace('cuPres',prec_str) modW = SourceModule(myKernRW) copy_texture = modW.get_function("copy_texture") mtx_tex = modW.get_surfref("mtx_tex") cuBlock = (8,8,1) if cuBlock[0]>npoints: cuBlock = (npoints,npoints,1) cuGrid = (npoints//cuBlock[0]+1*(npoints % cuBlock[0] != 0 ),npoints//cuBlock[1]+1*(npoints % cuBlock[1] != 0 ),1) copy_texture.prepare('Pi')#,texrefs=[mtx_tex]) A_gpu2 = gpuarray.zeros_like(A_gpu) # To initialize surface with zeros cudaArray = drv.gpuarray_to_array(A_gpu2,orden,allowSurfaceBind=True) A_cpu = A_gpu.get() # To remember original array mtx_tex.set_array(cudaArray) copy_texture.prepared_call(cuGrid,cuBlock,A_gpu.gpudata, np.int32(0)) # Write random array copy_texture.prepared_call(cuGrid,cuBlock,A_gpu.gpudata, np.int32(1)) # Read, but transposed assert np.sum(np.abs(A_gpu.get()-np.transpose(A_cpu))) == np.array(0,dtype=prec) A_gpu.gpudata.free()
def test_2d_fp_surfaces(self): orden = "C" npoints = 32 for prec in [np.int16,np.float32,np.float64,np.complex64,np.complex128]: prec_str = dtype_to_ctype(prec) if prec == np.complex64: fpName_str = 'fp_tex_cfloat' elif prec == np.complex128: fpName_str = 'fp_tex_cdouble' elif prec == np.float64: fpName_str = 'fp_tex_double' else: fpName_str = prec_str A_cpu = np.zeros([npoints,npoints],order=orden,dtype=prec) A_cpu[:] = np.random.rand(npoints,npoints)[:] A_gpu = gpuarray.to_gpu(A_cpu) # Array randomized myKernRW = ''' #include <pycuda-helpers.hpp> surface<void, cudaSurfaceType2DLayered> mtx_tex; __global__ void copy_texture(cuPres *dest, int rw) { int row = blockIdx.x*blockDim.x + threadIdx.x; int col = blockIdx.y*blockDim.y + threadIdx.y; int layer = 1; int tid = row + col*blockDim.x*gridDim.x ; if (rw==0){ cuPres aux = dest[tid]; fp_surf2DLayeredwrite(aux, mtx_tex, row, col, layer,cudaBoundaryModeClamp);} else { cuPres aux = 0; fp_surf2DLayeredread(&aux, mtx_tex, col, row, layer, cudaBoundaryModeClamp); dest[tid] = aux; } } ''' myKernRW = myKernRW.replace('fpName',fpName_str) myKernRW = myKernRW.replace('cuPres',prec_str) modW = SourceModule(myKernRW) copy_texture = modW.get_function("copy_texture") mtx_tex = modW.get_surfref("mtx_tex") cuBlock = (8,8,1) if cuBlock[0]>npoints: cuBlock = (npoints,npoints,1) cuGrid = (npoints//cuBlock[0]+1*(npoints % cuBlock[0] != 0 ),npoints//cuBlock[1]+1*(npoints % cuBlock[1] != 0 ),1) copy_texture.prepare('Pi')#,texrefs=[mtx_tex]) A_gpu2 = gpuarray.zeros_like(A_gpu) # To initialize surface with zeros cudaArray = drv.gpuarray_to_array(A_gpu2,orden,allowSurfaceBind=True) A_cpu = A_gpu.get() # To remember original array mtx_tex.set_array(cudaArray) copy_texture.prepared_call(cuGrid,cuBlock,A_gpu.gpudata, np.int32(0)) # Write random array copy_texture.prepared_call(cuGrid,cuBlock,A_gpu.gpudata, np.int32(1)) # Read, but transposed assert np.sum(np.abs(A_gpu.get()-np.transpose(A_cpu))) == np.array(0,dtype=prec) A_gpu.gpudata.free()
def set_mask(self, mask): self.debug(3, "Setting the mask") assert mask.shape == (self.h, self.w), \ "Got a {} mask in a {} routine.".format(mask.shape, (self.h, self.w)) if not mask.dtype == np.float32: self.debug(2, "Converting the mask to float32") mask = mask.astype(np.float32) if isinstance(mask, np.ndarray): self.maskArray = cuda.matrix_to_array(mask, 'C') elif isinstance(mask, gpuarray.GPUArray): self.maskArray = cuda.gpuarray_to_array(mask, 'C') else: self.debug(0, "Error! Mask data type not understood") raise ValueError self.texMask.set_array(self.maskArray)
def setImage(self,img_d): """ Set the image to compare with the original """ assert img_d.shape == (self.w,self.h),"Got a {} image in a {} correlation routine!".format(img_d.shape,(self.w,self.h)) if isinstance(img_d,np.ndarray): self.debug(3,"Creating texture from numpy array") self.array_d = cuda.matrix_to_array(img_d,"C") elif isinstance(img_d,gpuarray.GPUArray): self.debug(3,"Creating texture from gpuarray") self.array_d = cuda.gpuarray_to_array(img_d,"C") else: print("[Error] Unknown type of data given to CorrelStage.setImage()") raise ValueError self.tex_d.set_array(self.array_d) self.devX.set(np.zeros(self.Nfields,dtype=np.float32))
def set_image(self, img_d): """ Set the image to compare with the original Note that calling this method is not necessary: you can do .compute(image) This will automatically call this method first """ assert img_d.shape == (self.h, self.w), \ "Got a {} image in a {} correlation routine!".format( img_d.shape, (self.h, self.w)) if isinstance(img_d, np.ndarray): self.debug(3, "Creating texture from numpy array") self.array_d = cuda.matrix_to_array(img_d, "C") elif isinstance(img_d, gpuarray.GPUArray): self.debug(3, "Creating texture from gpuarray") self.array_d = cuda.gpuarray_to_array(img_d, "C") else: self.debug(0, "Error ! Unknown type of data given to .set_image()") raise ValueError self.tex_d.set_array(self.array_d) self.devX.set(np.zeros(self.fields_count, dtype=np.float32))
def ndarray_to_float_tex(tex_ref, ndarray, address_mode=cuda.address_mode.BORDER, filter_mode=cuda.filter_mode.LINEAR): if isinstance(ndarray, np.ndarray): cu_array = cuda.np_to_array(ndarray, 'C') elif isinstance(ndarray, gpuarray.GPUArray): cu_array = cuda.gpuarray_to_array(ndarray, 'C') else: raise TypeError( 'ndarray must be numpy.ndarray or pycuda.gpuarray.GPUArray') cuda.TextureReference.set_array(tex_ref, cu_array) cuda.TextureReference.set_address_mode( tex_ref, 0, address_mode) if ndarray.ndim >= 2: cuda.TextureReference.set_address_mode( tex_ref, 1, address_mode) if ndarray.ndim >= 3: cuda.TextureReference.set_address_mode( tex_ref, 2, address_mode) cuda.TextureReference.set_filter_mode( tex_ref, filter_mode) tex_ref.set_flags(tex_ref.get_flags( ) & ~cuda.TRSF_NORMALIZED_COORDINATES & ~cuda.TRSF_READ_AS_INTEGER)
def alm2lenmap_onGPU(lib_alm, unlalm, dx_gu, dy_gu, do_not_prefilter=False): """ Lens the input unl_CMB map on the GPU using the pyCUDA interface. dx dy displacement in grid units. (f.get_dx_ingridunits() e.g.) Can be path to arrays or arrays or memmap. Will probably crash for too large maps, with need to split the job. Works for 4096 x 4096 at least on my laptop. Cost dominated by texture setup. # FIXME : try get rid of texture Note that the first call might be substantially slower than subsequent calls, as it caches the fft and ifft plans for subsequent usage. :param unl_CMB: :param func: bicubic or bilinear :param normalized_tex: use a modified version of the GPU bicubic spline to account for periodicity of the map :return: """ if timed: ti = time.time() shape = lib_alm.ell_mat.shape rshape = (shape[0], shape[1] / 2 + 1) assert shape[0] == shape[1], shape assert IsPowerOfTwo(shape[0]), shape assert load_map(dx_gu).shape == shape, (load_map(dx_gu).shape, lib_alm.ell_mat.shape) assert load_map(dy_gu).shape == shape, (load_map(dy_gu).shape, lib_alm.ell_mat.shape) assert np.all(np.array(shape) % GPU_block[0] == 0), shape if shape[0] > 4096: print "--- Exercise caution, array shapes larger than 4096 have never been tested so far ---" GPU_grid = (shape[0] / GPU_block[0], shape[1] / GPU_block[1], 1) # Prefiltering forces the interpolant to pass through the samples and increase accuracy, but dominates the cost. rfft2_unlCMB_gpu = gpuarray.to_gpu( lib_alm.alm2rfft(unlalm / np.prod(shape)).astype(np.complex64)) coeffs_gpu = gpuarray.empty(lib_alm.ell_mat.shape, dtype=np.float32) plan, plan_inv = get_rfft_plans(shape) if not do_not_prefilter: # The prefilter makes sure the spline is exact at the nodes. # Uncomments this to put coeffs_gpu on pitched memory to allow later for 2D texture binding : # alloc,pitch = cuda.mem_alloc_pitch(shape[0] * 4,shape[1],4) # 4 bytes per float32 wx = (6. / (2. * np.cos( 2. * np.pi * Freq(np.arange(shape[0]), shape[0]) / shape[0]) + 4.)) wx_gpu = gpuarray.to_gpu(wx.astype(np.float32)) prefilter = CUDA_module.get_function("cf_outer_w") prefilter(rfft2_unlCMB_gpu, wx_gpu, np.int32(rshape[1]), np.int32(rshape[0]), block=GPU_block, grid=GPU_grid) del wx_gpu ifft(rfft2_unlCMB_gpu, coeffs_gpu, plan_inv, False) # Binding arrays to textures and getting lensing func. if texture_count == 0: lens_func = CUDA_module.get_function("bicubiclensKernel_notex") tex_refs = [] dx_gu = gpuarray.to_gpu(load_map(dx_gu).astype(np.float32)) dy_gu = gpuarray.to_gpu(load_map(dy_gu).astype(np.float32)) elif texture_count == 1: unl_CMB_tex = CUDA_module.get_texref("unl_CMB") tex_refs = [unl_CMB_tex] unl_CMB_tex.set_array(cuda.gpuarray_to_array(coeffs_gpu, "C")) del coeffs_gpu dx_gu = gpuarray.to_gpu(load_map(dx_gu).astype(np.float32)) dy_gu = gpuarray.to_gpu(load_map(dy_gu).astype(np.float32)) lens_func = CUDA_module.get_function( "bicubiclensKernel_normtex_singletex") elif texture_count == 3: unl_CMB_tex = CUDA_module.get_texref("unl_CMB") dx_tex = CUDA_module.get_texref("tex_dx") dy_tex = CUDA_module.get_texref("tex_dy") tex_refs = ([unl_CMB_tex, dx_tex, dy_tex]) unl_CMB_tex.set_array(cuda.gpuarray_to_array(coeffs_gpu, "C")) del coeffs_gpu cuda.matrix_to_texref(load_map(dx_gu).astype(np.float32), dx_tex, order="C") cuda.matrix_to_texref(load_map(dy_gu).astype(np.float32), dy_tex, order="C") lens_func = CUDA_module.get_function("bicubiclensKernel_normtex") else: tex_refs = [] lens_func = 0 assert 0 # Wraping, important for periodic boundary conditions. # Note that WRAP has not effect for unnormalized texture coordinates. for tex_ref in tex_refs: tex_ref.set_address_mode(0, cuda.address_mode.WRAP) tex_ref.set_address_mode(1, cuda.address_mode.WRAP) tex_ref.set_filter_mode(cuda.filter_mode.POINT) tex_ref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES) if timed: t0 = time.time() len_CMB = np.empty(shape, dtype=np.float32) if texture_count == 0: lens_func(cuda.Out(len_CMB), coeffs_gpu, dx_gu, dy_gu, np.int32(shape[0]), block=GPU_block, grid=GPU_grid, texrefs=tex_refs) elif texture_count == 1: lens_func(cuda.Out(len_CMB), dx_gu, dy_gu, np.int32(shape[0]), block=GPU_block, grid=GPU_grid, texrefs=tex_refs) elif texture_count == 3: lens_func(cuda.Out(len_CMB), np.int32(shape[0]), block=GPU_block, grid=GPU_grid, texrefs=tex_refs) if timed: dt = time.time() - t0 t_tot = time.time() - ti print " GPU bicubic spline and transfer at %s Mpixel / sec, time %s sec" % ( np.prod(lib_alm.ell_mat.shape) / 1e6 / dt, dt) print " Total ex. time at %s Mpixel / sec, ex. time %s sec." % ( np.prod(shape) / 1e6 / t_tot, t_tot) return len_CMB.astype(np.float64)