def setup(self, size, units, lam=0.5, n0=1.0, use_fresnel_approx=False):
    """
    sets up the internal variables e.g. propagators etc...

    :param size:  the size of the geometry in pixels (Nx,Ny,Nz)
    :param units: the physical units of each voxel in microns (dx,dy,dz)
    :param lam: the wavelength of light in microns
    :param n0:  the refractive index of the surrounding media
    :param use_fresnel_approx: if True, uses fresnel approximation for propagator
    """
    Bpm3d_Base.setup(self, size, units, lam=lam, n0=n0,
                     use_fresnel_approx=use_fresnel_approx)

    # setting up the gpu buffers and kernels
    self.program = OCLProgram(absPath("kernels/bpm_3d_kernels.cl"))

    # All three extents are needed: Nz sizes the per-plane result buffers
    # below (it was previously referenced without ever being bound).
    # NOTE(review): assumes Bpm3d_Base.setup stores `size` as self.size,
    # self._H and the two weight arrays -- confirm against the base class.
    Nx, Ny, Nz = self.size

    # The plan is built for a single (Ny, Nx) plane and kept on the instance
    # instead of being created for an empty shape and thrown away.
    self._plan = fft_plan((Ny, Nx))

    self._H_g = OCLArray.from_array(self._H.astype(np.complex64))

    self.scatter_weights_g = OCLArray.from_array(
        self.scatter_weights.astype(np.float32))
    self.gfactor_weights_g = OCLArray.from_array(
        self.gfactor_weights.astype(np.float32))

    # one float per z-plane
    self.scatter_cross_sec_g = OCLArray.zeros(Nz, "float32")
    self.gfactor_g = OCLArray.zeros(Nz, "float32")

    # weighted sum of |field - plain|^2, where `plain` is subtracted from
    # element 0 only
    self.reduce_kernel = OCLReductionKernel(
        np.float32, neutral="0",
        reduce_expr="a+b",
        map_expr="weights[i]*cfloat_abs(field[i]-(i==0)*plain)*cfloat_abs(field[i]-(i==0)*plain)",
        arguments="__global cfloat_t *field, __global float * weights,cfloat_t plain",
    )
def _setup_gpu(self):
    """Grab the OpenCL device handles and allocate the plan, buffers and kernels.

    Stores the device queue and context on the instance, builds the program
    from ``kernels/bpm_3d_kernels.cl``, allocates the per-plane buffers and
    intensity accumulators, binds the kernels as ``self._kernel_*`` attributes
    and finally calls ``self._fill_propagator(self.n0)``.
    """
    device = get_device()
    self._queue = device.queue
    self._ctx = device.context
    program = OCLProgram(absPath("kernels/bpm_3d_kernels.cl"))

    # the buffers/ images
    Nx, Ny = self.simul_xy
    _Nx0, _Ny0 = self.shape[:2]  # unpacked as before; not used further here
    plane_shape = (Ny, Nx)
    stack_shape = (1, Ny, Nx)
    real_type = Bpm3d._real_type

    self._plan = fft_plan(plane_shape, **self.fftplan_kwargs)
    self._buf_plane = OCLArray.empty(plane_shape, np.complex64)
    self._buf_H = OCLArray.empty(plane_shape, np.complex64)
    self._img_xy = OCLImage.empty(plane_shape, dtype=np.float32, num_channels=2)

    # buffer for the weighted dn average
    self.intens_g = OCLArray.empty(stack_shape, dtype=real_type)
    self.intens_dn_g = OCLArray.empty(stack_shape, dtype=real_type)
    self.intens_sum_g = OCLArray.zeros((), dtype=real_type)
    self.intens_dn_sum_g = OCLArray.zeros((), dtype=real_type)

    # the kernels: the two propagator kernels additionally get their scalar
    # argument dtypes declared
    five_floats = (np.float32,) * 5
    self._kernel_compute_propagator = program.compute_propagator
    self._kernel_compute_propagator.set_scalar_arg_dtypes((None,) + five_floats)
    self._kernel_compute_propagator_buf = program.compute_propagator_buf
    self._kernel_compute_propagator_buf.set_scalar_arg_dtypes(
        (None,) + five_floats + (None,) * 2)

    # (attribute on self, kernel name in the program), bound in this order
    kernel_table = (
        ("_kernel_mult_complex", "mult"),
        ("_kernel_im_to_buf_field", "img_to_buf_field"),
        ("_kernel_im_to_buf_intensity", "img_to_buf_intensity"),
        ("_kernel_im_to_im_intensity", "img_to_img_intensity"),
        ("_kernel_buf_to_buf_field", "buf_to_buf_field"),
        ("_kernel_buf_to_buf_intensity", "buf_to_buf_intensity"),
        ("_kernel_mult_dn_img_float", "mult_dn_image"),
        ("_kernel_mult_dn_buf_float", "mult_dn"),
        ("_kernel_mult_dn_img_complex", "mult_dn_image_complex"),
        ("_kernel_mult_dn_buf_complex", "mult_dn_complex"),
        ("_kernel_mult_dn_img_float_local", "mult_dn_image_local"),
        ("_kernel_mult_dn_buf_float_local", "mult_dn_local"),
        ("_kernel_mult_dn_img_complex_local", "mult_dn_image_complex_local"),
        ("_kernel_mult_dn_buf_complex_local", "mult_dn_complex_local"),
    )
    for attr_name, kernel_name in kernel_table:
        setattr(self, attr_name, getattr(program, kernel_name))

    # sums two float buffers in a single pass
    self._kernel_reduction = OCLMultiReductionKernel(
        np.float32,
        neutral="0",
        reduce_expr="a+b",
        map_exprs=["a[i]", "b[i]"],
        arguments="__global float *a, __global float *b")

    self._fill_propagator(self.n0)
def time_gpu(dshape, niter=100, fast_math=False):
    """Time an in-place GPU fft on a complex64 buffer of shape ``dshape``.

    :param dshape: shape of the buffer that is transformed
    :param niter: number of fft calls that are averaged over
    :param fast_math: forwarded to ``fft_plan``
    :return: the mean wall-clock time per fft call in seconds
    """
    d_g = OCLArray.empty(dshape, np.complex64)
    # drain the queue so pending work is not counted
    get_device().queue.finish()
    plan = fft_plan(dshape, fast_math=fast_math)

    t = time()
    # range/print() forms run on both Python 2 and Python 3
    for _ in range(niter):
        fft(d_g, inplace=True, plan=plan)
    # wait for all enqueued transforms before stopping the clock
    get_device().queue.finish()
    t = (time() - t) / niter

    print("GPU (fast_math = %s)\t%s\t\t%.2f ms" % (fast_math, dshape, 1000. * t))
    return t
def time_gpu(dshape, niter=100, fast_math=False):
    """Benchmark the in-place GPU fft for a complex64 buffer of shape ``dshape``.

    Runs ``niter`` transforms with one shared plan, prints the mean time per
    call in milliseconds and returns it in seconds.
    """
    buffer_g = OCLArray.empty(dshape, np.complex64)
    get_device().queue.finish()  # start from an idle queue
    shared_plan = fft_plan(dshape, fast_math=fast_math)

    started = time()
    for _ in range(niter):
        fft(buffer_g, inplace=True, plan=shared_plan)
    get_device().queue.finish()  # make sure every transform has completed
    seconds_per_call = (time() - started) / niter

    report = "GPU (fast_math = %s)\t%s\t\t%.2f ms" % (
        fast_math, dshape, 1000. * seconds_per_call)
    print(report)
    return seconds_per_call
def _deconv_rl_np_fft(data, h, Niter=10, h_is_fftshifted=False):
    """ deconvolves data with given psf (kernel) h

    data and h have to be same shape

    via lucy richardson deconvolution

    :param data: the numpy array to deconvolve
    :param h: the psf, same shape as data
    :param Niter: number of iterations
    :param h_is_fftshifted: if False, h is fftshifted before use
    :return: the absolute value of the final estimate as numpy array
    :raises ValueError: if data and h differ in shape
    """
    if data.shape != h.shape:
        raise ValueError("data and h have to be same shape")

    if not h_is_fftshifted:
        h = np.fft.fftshift(h)

    # NOTE(review): only the first two axes are mirrored -- confirm this is
    # intended for inputs with more than two dimensions.
    hflip = h[::-1, ::-1]

    # set up some gpu buffers
    y_g = OCLArray.from_array(data.astype(np.complex64))
    u_g = OCLArray.from_array(data.astype(np.complex64))
    tmp_g = OCLArray.empty(data.shape, np.complex64)

    hf_g = OCLArray.from_array(h.astype(np.complex64))
    hflip_f_g = OCLArray.from_array(hflip.astype(np.complex64))

    # NOTE(review): the plan is created but not handed to any call below.
    plan = fft_plan(data.shape)

    # transform psf
    fft(hf_g, inplace=True)
    fft(hflip_f_g, inplace=True)

    for i in range(Niter):
        # print() with a single argument behaves the same on Python 2 and 3
        print(i)
        fft_convolve(u_g, hf_g,
                     res_g=tmp_g,
                     kernel_is_fft=True)

        _complex_divide_inplace(y_g, tmp_g)

        fft_convolve(tmp_g, hflip_f_g,
                     inplace=True,
                     kernel_is_fft=True)

        _complex_multiply_inplace(u_g, tmp_g)

    return np.abs(u_g.get())
def _setup_gpu(self):
    """Prepare all OpenCL state of this object.

    Keeps the device queue/context, compiles ``kernels/bpm_3d_kernels.cl``,
    allocates the fft plan, the plane/propagator buffers, the xy image and
    the intensity accumulators, exposes the kernels as ``self._kernel_*``
    and ends by calling ``self._fill_propagator(self.n0)``.
    """
    ocl_device = get_device()
    self._queue = ocl_device.queue
    self._ctx = ocl_device.context
    kernels = OCLProgram(absPath("kernels/bpm_3d_kernels.cl"))

    # the buffers/ images
    Nx, Ny = self.simul_xy
    _Nx0, _Ny0 = self.shape[:2]  # unpacked as before; not used further here
    xy_shape = (Ny, Nx)

    self._plan = fft_plan(xy_shape, **self.fftplan_kwargs)
    self._buf_plane = OCLArray.empty(xy_shape, np.complex64)
    self._buf_H = OCLArray.empty(xy_shape, np.complex64)
    self._img_xy = OCLImage.empty(xy_shape, dtype=np.float32, num_channels=2)

    # buffer for the weighted dn average
    slab_shape = (1,) + xy_shape
    self.intens_g = OCLArray.empty(slab_shape, dtype=Bpm3d._real_type)
    self.intens_dn_g = OCLArray.empty(slab_shape, dtype=Bpm3d._real_type)
    self.intens_sum_g = OCLArray.zeros((), dtype=Bpm3d._real_type)
    self.intens_dn_sum_g = OCLArray.zeros((), dtype=Bpm3d._real_type)

    # the kernels -- propagators (with declared scalar argument dtypes)
    propagator_dtypes = (None,) + (np.float32,) * 5
    self._kernel_compute_propagator = kernels.compute_propagator
    self._kernel_compute_propagator.set_scalar_arg_dtypes(propagator_dtypes)
    self._kernel_compute_propagator_buf = kernels.compute_propagator_buf
    self._kernel_compute_propagator_buf.set_scalar_arg_dtypes(
        propagator_dtypes + (None,) * 2)

    # complex multiplication
    self._kernel_mult_complex = kernels.mult

    # image/buffer conversions
    self._kernel_im_to_buf_field = kernels.img_to_buf_field
    self._kernel_im_to_buf_intensity = kernels.img_to_buf_intensity
    self._kernel_im_to_im_intensity = kernels.img_to_img_intensity
    self._kernel_buf_to_buf_field = kernels.buf_to_buf_field
    self._kernel_buf_to_buf_intensity = kernels.buf_to_buf_intensity

    # dn multiplication, float and complex flavours
    self._kernel_mult_dn_img_float = kernels.mult_dn_image
    self._kernel_mult_dn_buf_float = kernels.mult_dn
    self._kernel_mult_dn_img_complex = kernels.mult_dn_image_complex
    self._kernel_mult_dn_buf_complex = kernels.mult_dn_complex

    # ... and their "_local" variants
    self._kernel_mult_dn_img_float_local = kernels.mult_dn_image_local
    self._kernel_mult_dn_buf_float_local = kernels.mult_dn_local
    self._kernel_mult_dn_img_complex_local = kernels.mult_dn_image_complex_local
    self._kernel_mult_dn_buf_complex_local = kernels.mult_dn_complex_local

    # two-output sum reduction over a pair of float buffers
    reduction_options = dict(
        neutral="0",
        reduce_expr="a+b",
        map_exprs=["a[i]", "b[i]"],
        arguments="__global float *a, __global float *b",
    )
    self._kernel_reduction = OCLMultiReductionKernel(np.float32,
                                                     **reduction_options)

    self._fill_propagator(self.n0)
def _deconv_rl_np_fft(data, h, Niter=10, h_is_fftshifted=False):
    """Lucy-Richardson deconvolution of ``data`` with the psf (kernel) ``h``.

    ``data`` and ``h`` have to be the same shape; a ``ValueError`` is raised
    otherwise. Unless ``h_is_fftshifted`` is set, ``h`` is fftshifted first.
    Returns the absolute value of the estimate after ``Niter`` iterations as
    a numpy array.
    """
    if data.shape != h.shape:
        raise ValueError("data and h have to be same shape")

    def _to_gpu_complex(host_array):
        # upload a complex64 copy of a host array
        return OCLArray.from_array(host_array.astype(np.complex64))

    psf = h if h_is_fftshifted else np.fft.fftshift(h)
    # NOTE(review): mirrors the first two axes only -- confirm for N-d input
    psf_mirrored = psf[::-1, ::-1]

    # gpu buffers: measured data, running estimate and a scratch array
    measured_g = _to_gpu_complex(data)
    estimate_g = _to_gpu_complex(data)
    scratch_g = OCLArray.empty(data.shape, np.complex64)

    psf_f_g = _to_gpu_complex(psf)
    psf_mirrored_f_g = _to_gpu_complex(psf_mirrored)

    plan = fft_plan(data.shape)  # created as before; not passed on below

    # both psf buffers are transformed once, up front
    for kernel_g in (psf_f_g, psf_mirrored_f_g):
        fft(kernel_g, inplace=True)

    for iteration in range(Niter):
        logger.info("Iteration: {}".format(iteration))
        fft_convolve(estimate_g, psf_f_g, res_g=scratch_g, kernel_is_fft=True)
        _complex_divide_inplace(measured_g, scratch_g)
        fft_convolve(scratch_g, psf_mirrored_f_g, inplace=True,
                     kernel_is_fft=True)
        _complex_multiply_inplace(estimate_g, scratch_g)

    estimate = estimate_g.get()
    return np.abs(estimate)
def _deconv_rl_gpu_fft(data_g, h_g, Niter=10):
    """
    using fft_convolve

    Iterative deconvolution of the gpu array ``data_g`` with the psf ``h_g``.

    :param data_g: the data as gpu array (copied into the estimate buffer)
    :param h_g: the psf as gpu array, same shape as data_g; it is
        fourier transformed in place by this function
    :param Niter: number of iterations
    :return: the final estimate as complex64 gpu array
    :raises ValueError: if data_g and h_g differ in shape
    """
    if data_g.shape != h_g.shape:
        raise ValueError("data and h have to be same shape")

    # set up some gpu buffers
    u_g = OCLArray.empty(data_g.shape, np.complex64)
    u_g.copy_buffer(data_g)

    tmp_g = OCLArray.empty(data_g.shape, np.complex64)

    # fix this
    # The mirrored psf is built on the host, hence the round trip via get().
    # NOTE(review): only the first two axes are mirrored -- confirm for N-d.
    hflip_g = OCLArray.from_array((h_g.get()[::-1, ::-1]).copy())

    # NOTE(review): the plan is created but not handed to any call below.
    plan = fft_plan(data_g.shape)

    # transform psf
    fft(h_g, inplace=True)
    fft(hflip_g, inplace=True)

    for i in range(Niter):
        # print() with a single argument behaves the same on Python 2 and 3
        print(i)
        fft_convolve(u_g, h_g,
                     res_g=tmp_g,
                     kernel_is_fft=True)

        _complex_divide_inplace(data_g, tmp_g)

        fft_convolve(tmp_g, hflip_g,
                     inplace=True,
                     kernel_is_fft=True)

        _complex_multiply_inplace(u_g, tmp_g)

    return u_g
def get_gpu(N=256, niter=100, sig=1.):
    """Relative error of repeated forward/inverse GPU fft round trips.

    A seeded random (N, N) complex64 array is transformed forth and back
    ``niter`` times; after each round trip the maximal absolute deviation
    from the initial array, relative to its maximal absolute value, is
    recorded. Returns the recorded values as a numpy array.
    """
    np.random.seed(0)
    reference = np.random.normal(0, sig, (N, N)).astype(np.complex64)
    start = (1. * reference.copy()).astype(np.complex64)

    spectrum_g = OCLArray.empty_like(start)
    signal_g = OCLArray.from_array(start)
    plan = fft_plan((N, N), fast_math=False)

    rel_errors = []
    for _ in range(niter):
        # forward into the scratch buffer, inverse back into the signal
        fft(signal_g, res_g=spectrum_g, plan=plan)
        fft(spectrum_g, res_g=signal_g, inverse=True, plan=plan)

        deviation = np.amax(np.abs(reference - signal_g.get()))
        scale = np.amax(np.abs(reference))
        rel_errors.append(deviation / scale)

    return np.array(rel_errors)
def get_gpu(N=256, niter=100, sig=1.):
    """Measure how the fft -> inverse fft round-trip error evolves on the GPU.

    Returns a numpy array of length ``niter``; entry ``k`` is the maximal
    absolute difference between the seeded random (N, N) input and the data
    after ``k + 1`` round trips, divided by the input's maximal absolute value.
    """
    np.random.seed(0)
    original = np.random.normal(0, sig, (N, N)).astype(np.complex64)
    working_copy = (1. * original.copy()).astype(np.complex64)

    freq_g = OCLArray.empty_like(working_copy)
    data_g = OCLArray.from_array(working_copy)
    fft_p = fft_plan((N, N), fast_math=False)

    def _round_trip_error():
        # one forward + inverse transform, then compare against the input
        fft(data_g, res_g=freq_g, plan=fft_p)
        fft(freq_g, res_g=data_g, inverse=True, plan=fft_p)
        return np.amax(np.abs(original - data_g.get())) / np.amax(np.abs(original))

    return np.array([_round_trip_error() for _ in range(niter)])
def _setup_impl(self):
    """Create the gpu program, fft plan and gpu copies of the host arrays.

    ``self.dn`` is only uploaded when it is set and ``self.n_volumes`` is 1.
    """
    self.bpm_program = OCLProgram(absPath("kernels/bpm_3d_kernels.cl"))

    Nx, Ny, Nz = self.size

    self._plan = fft_plan((Ny, Nx))
    self._H_g = OCLArray.from_array(self._H.astype(np.complex64))

    if self.dn is not None and self.n_volumes == 1:
        self.dn_g = OCLArray.from_array(self.dn)

    # float32 gpu copies of the weight arrays, stored as "<name>_g"
    for weights_name in ("scatter_weights", "gfactor_weights"):
        host_weights = getattr(self, weights_name)
        setattr(self, weights_name + "_g",
                OCLArray.from_array(host_weights.astype(np.float32)))

    # zero-initialised result buffers with one float32 entry per z index
    for result_name in ("scatter_cross_sec_g", "gfactor_g"):
        setattr(self, result_name, OCLArray.zeros(Nz, "float32"))
def _setup_impl(self):
    """Allocate the gpu side of the simulation.

    Builds the kernel program and the (Ny, Nx) fft plan, uploads the
    propagator and both weight arrays, and creates the length-Nz result
    buffers. ``self.dn`` is uploaded only if it is set and there is exactly
    one volume.
    """

    def _upload(host_array, dtype):
        # gpu copy of a host array converted to dtype
        return OCLArray.from_array(host_array.astype(dtype))

    self.bpm_program = OCLProgram(absPath("kernels/bpm_3d_kernels.cl"))

    Nx, Ny, Nz = self.size
    self._plan = fft_plan((Ny, Nx))

    self._H_g = _upload(self._H, np.complex64)

    single_volume_with_dn = (self.dn is not None) and self.n_volumes == 1
    if single_volume_with_dn:
        self.dn_g = OCLArray.from_array(self.dn)

    self.scatter_weights_g = _upload(self.scatter_weights, np.float32)
    self.gfactor_weights_g = _upload(self.gfactor_weights, np.float32)

    self.scatter_cross_sec_g = OCLArray.zeros(Nz, "float32")
    self.gfactor_g = OCLArray.zeros(Nz, "float32")
def __init__(self, psf: np.ndarray, psf_is_fftshifted: bool = False, n_iter=10):
    """
    setup deconvolution for a given shape

    :param psf: the point spread function; its shape is stored as self.shape
    :param psf_is_fftshifted: if False, psf is fftshifted before use
    :param n_iter: number of iterations, stored as self.n_iter
    """
    self.shape = psf.shape
    if not psf_is_fftshifted:
        psf = np.fft.fftshift(psf)

    self.n_iter = n_iter

    # The second kernel is the psf mirrored along *every* axis. Slicing with
    # a fixed [::-1, ::-1] only mirrored the first two axes, which is the
    # same thing for 2D input but leaves the last axis of a 3D psf unflipped.
    psfflip = psf[(slice(None, None, -1),) * psf.ndim]

    self.psf_g = OCLArray.from_array(psf.astype(np.complex64))
    self.psfflip_f_g = OCLArray.from_array(psfflip.astype(np.complex64))
    self.plan = fft_plan(self.shape)

    # transform psf
    fft(self.psf_g, inplace=True)
    fft(self.psfflip_f_g, inplace=True)

    # get temp
    self.tmp_g = OCLArray.empty(psf.shape, np.complex64)
def setup(self, size, units, lam=.5, n0=1., use_fresnel_approx=False):
    """
    sets up the internal variables e.g. propagators etc...

    :param size:  the size of the geometry in pixels (Nx,Ny,Nz)
    :param units: the physical units of each voxel in microns (dx,dy,dz)
    :param lam: the wavelength of light in microns
    :param n0:  the refractive index of the surrounding media
    :param use_fresnel_approx: if True, uses fresnel approximation for propagator
    """
    Bpm3d_Base.setup(self, size, units,
                     lam=lam, n0=n0,
                     use_fresnel_approx=use_fresnel_approx)

    # setting up the gpu buffers and kernels
    self.program = OCLProgram(absPath("kernels/bpm_3d_kernels.cl"))

    # Unpack all three extents: Nz is required for the result buffers below
    # and was previously used without being defined.
    # NOTE(review): relies on Bpm3d_Base.setup providing self.size, self._H
    # and the weight arrays -- confirm against the base class.
    Nx, Ny, Nz = self.size

    # plan for one (Ny, Nx) plane, kept on the instance (an empty-shape plan
    # bound to an unused local served no purpose)
    self._plan = fft_plan((Ny, Nx))

    self._H_g = OCLArray.from_array(self._H.astype(np.complex64))

    self.scatter_weights_g = OCLArray.from_array(self.scatter_weights.astype(np.float32))
    self.gfactor_weights_g = OCLArray.from_array(self.gfactor_weights.astype(np.float32))

    # one float per z-plane
    self.scatter_cross_sec_g = OCLArray.zeros(Nz, "float32")
    self.gfactor_g = OCLArray.zeros(Nz, "float32")

    # weighted sum of |field - plain|^2, `plain` being subtracted from
    # element 0 only
    self.reduce_kernel = OCLReductionKernel(
        np.float32, neutral="0",
        reduce_expr="a+b",
        map_expr="weights[i]*cfloat_abs(field[i]-(i==0)*plain)*cfloat_abs(field[i]-(i==0)*plain)",
        arguments="__global cfloat_t *field, __global float * weights,cfloat_t plain")
def _deconv_rl_gpu_fft(data_g, h_g, Niter=10):
    """Iterative deconvolution on gpu arrays, using fft_convolve.

    ``data_g`` and ``h_g`` must share their shape (``ValueError`` otherwise).
    ``h_g`` is fourier transformed in place. Returns the estimate after
    ``Niter`` iterations as a complex64 gpu array.
    """
    if data_g.shape != h_g.shape:
        raise ValueError("data and h have to be same shape")

    shape = data_g.shape

    # running estimate (initialised with the data) and a scratch buffer
    estimate_g = OCLArray.empty(shape, np.complex64)
    estimate_g.copy_buffer(data_g)
    scratch_g = OCLArray.empty(shape, np.complex64)

    # fix this
    # the mirrored psf is assembled on the host and uploaded again
    # NOTE(review): mirrors the first two axes only -- confirm for N-d input
    psf_host = h_g.get()
    psf_mirrored_g = OCLArray.from_array(psf_host[::-1, ::-1].copy())

    plan = fft_plan(shape)  # created as before; not passed on below

    # transform both psf buffers once
    for kernel_g in (h_g, psf_mirrored_g):
        fft(kernel_g, inplace=True)

    for iteration in range(Niter):
        logger.info("Iteration: {}".format(iteration))
        fft_convolve(estimate_g, h_g, res_g=scratch_g, kernel_is_fft=True)
        _complex_divide_inplace(data_g, scratch_g)
        fft_convolve(scratch_g, psf_mirrored_g, inplace=True,
                     kernel_is_fft=True)
        _complex_multiply_inplace(estimate_g, scratch_g)

    return estimate_g
def _convolve_spatial2(im, hs,
                       mode="constant",
                       grid_dim=None,
                       pad_factor=2,
                       plan=None,
                       return_plan=False):
    """
    spatial varying convolution of an 2d image with a 2d grid of psfs

    shape(im_ = (Ny,Nx)
    shape(hs) = (Gy,Gx, Hy,Hx)

    the input image im is subdivided into (Gy,Gx) blocks
    hs[j,i] is the psf at the center of each block (i,j)

    as of now each image dimension has to be divisible by the grid dim, i.e.
    Nx % Gx == 0
    Ny % Gy == 0

    mode can be:
    "constant" - assumed values to be zero
    "wrap" - periodic boundary condition

    :param grid_dim: if given, used as (Gy,Gx) and hs is copied blockwise
        into the padded psf buffer by the "fill_psf_grid2" kernel
    :param pad_factor: patches have the size of the next power of two of
        pad_factor times the block size
    :param plan: an fft plan to reuse; created for one patch if None
    :param return_plan: if True, returns (result, plan) instead of result
    """
    if grid_dim:
        Gs = tuple(grid_dim)
    else:
        Gs = hs.shape[:2]

    mode_str = {"constant": "CLK_ADDRESS_CLAMP",
                "wrap": "CLK_ADDRESS_REPEAT"}

    Ny, Nx = im.shape
    Gy, Gx = Gs

    # the size of each block within the grid
    # (floor division: the values are used as kernel work sizes and integer
    # offsets, and "/" would yield floats on Python 3)
    Nblock_y, Nblock_x = Ny // Gy, Nx // Gx

    # the size of the overlapping patches with safety padding
    Npatch_x = _next_power_of_2(pad_factor * Nblock_x)
    Npatch_y = _next_power_of_2(pad_factor * Nblock_y)

    prog = OCLProgram(abspath("kernels/conv_spatial2.cl"),
                      build_options=["-D", "ADDRESSMODE=%s" % mode_str[mode]])

    if plan is None:
        plan = fft_plan((Npatch_y, Npatch_x))

    x0s = Nblock_x * np.arange(Gx)
    y0s = Nblock_y * np.arange(Gy)

    patches_g = OCLArray.empty((Gy, Gx, Npatch_y, Npatch_x), np.complex64)

    # prepare psfs
    if grid_dim:
        h_g = OCLArray.zeros((Gy, Gx, Npatch_y, Npatch_x), np.complex64)
        tmp_g = OCLArray.from_array(hs.astype(np.float32, copy=False))
        for i, _x0 in enumerate(x0s):
            for j, _y0 in enumerate(y0s):
                prog.run_kernel("fill_psf_grid2",
                                (Nblock_x, Nblock_y), None,
                                tmp_g.data,
                                np.int32(Nx),
                                np.int32(i * Nblock_x),
                                np.int32(j * Nblock_y),
                                h_g.data,
                                np.int32(Npatch_x),
                                np.int32(Npatch_y),
                                np.int32(-Nblock_x // 2 + Npatch_x // 2),
                                np.int32(-Nblock_y // 2 + Npatch_y // 2),
                                # flat offset of patch (j,i) in the buffer
                                np.int32(i * Npatch_x * Npatch_y
                                         + j * Gx * Npatch_x * Npatch_y))
    else:
        hs = np.fft.fftshift(pad_to_shape(hs, (Gy, Gx, Npatch_y, Npatch_x)),
                             axes=(2, 3))
        h_g = OCLArray.from_array(hs.astype(np.complex64))

    # prepare image
    im_g = OCLImage.from_array(im.astype(np.float32, copy=False))

    for i, _x0 in enumerate(x0s):
        for j, _y0 in enumerate(y0s):
            prog.run_kernel("fill_patch2", (Npatch_x, Npatch_y), None,
                            im_g,
                            np.int32(_x0 + Nblock_x // 2 - Npatch_x // 2),
                            np.int32(_y0 + Nblock_y // 2 - Npatch_y // 2),
                            patches_g.data,
                            np.int32(i * Npatch_x * Npatch_y
                                     + j * Gx * Npatch_x * Npatch_y))

    # convolution
    fft(patches_g, inplace=True, batch=Gx * Gy, plan=plan)
    fft(h_g, inplace=True, batch=Gx * Gy, plan=plan)
    prog.run_kernel("mult_inplace", (Npatch_x * Npatch_y * Gx * Gy,), None,
                    patches_g.data, h_g.data)
    fft(patches_g, inplace=True, inverse=True, batch=Gx * Gy, plan=plan)

    # same output as the former print statement, on Python 2 and 3
    print("%s %s" % (Nblock_x, Npatch_x))

    # accumulate
    res_g = OCLArray.empty(im.shape, np.float32)

    for j in range(Gy + 1):
        for i in range(Gx + 1):
            prog.run_kernel("interpolate2", (Nblock_x, Nblock_y), None,
                            patches_g.data, res_g.data,
                            np.int32(i), np.int32(j),
                            np.int32(Gx), np.int32(Gy),
                            np.int32(Npatch_x), np.int32(Npatch_y))

    res = res_g.get()

    if return_plan:
        return res, plan
    else:
        return res
def _convolve_spatial2(im, hs,
                       mode="constant",
                       grid_dim=None,
                       pad_factor=2,
                       plan=None,
                       return_plan=False):
    """
    Spatially varying convolution of a 2d image with a 2d grid of psfs.

    shape(im) = (Ny,Nx)
    shape(hs) = (Gy,Gx, Hy,Hx)

    The image is split into (Gy,Gx) blocks and hs[j,i] is the psf at the
    center of block (i,j). Each image dimension has to be divisible by the
    grid dimension (Nx % Gx == 0, Ny % Gy == 0).

    mode can be:
    "constant" - assumed values to be zero
    "wrap" - periodic boundary condition

    If grid_dim is given it is used as (Gy,Gx) and the psfs are copied
    blockwise from hs on the gpu. With return_plan the used fft plan is
    returned alongside the result.
    """
    Gs = tuple(grid_dim) if grid_dim else hs.shape[:2]

    mode_str = {"constant": "CLK_ADDRESS_CLAMP",
                "wrap": "CLK_ADDRESS_REPEAT"}

    Ny, Nx = im.shape
    Gy, Gx = Gs

    # the size of each block within the grid
    Nblock_y = Ny // Gy
    Nblock_x = Nx // Gx

    # the size of the overlapping patches with safety padding
    Npatch_x = _next_power_of_2(pad_factor * Nblock_x)
    Npatch_y = _next_power_of_2(pad_factor * Nblock_y)

    prog = OCLProgram(abspath("kernels/conv_spatial2.cl"),
                      build_options=["-D", "ADDRESSMODE=%s" % mode_str[mode]])

    stack_shape = (Gy, Gx, Npatch_y, Npatch_x)
    if plan is None:
        plan = fft_plan(stack_shape, axes=(-2, -1))

    x0s = Nblock_x * np.arange(Gx)
    y0s = Nblock_y * np.arange(Gy)

    patches_g = OCLArray.empty(stack_shape, np.complex64)

    block_size = (Nblock_x, Nblock_y)
    patch_size = (Npatch_x, Npatch_y)
    patch_len = Npatch_x * Npatch_y

    def _patch_offset(i, j):
        # flat element offset of patch (j,i) inside a (Gy,Gx,...) stack
        return np.int32(i * patch_len + j * Gx * patch_len)

    # prepare psfs
    if grid_dim:
        h_g = OCLArray.zeros(stack_shape, np.complex64)
        tmp_g = OCLArray.from_array(hs.astype(np.float32, copy=False))
        shift_x = np.int32(-Nblock_x // 2 + Npatch_x // 2)
        shift_y = np.int32(-Nblock_y // 2 + Npatch_y // 2)
        for i, _x0 in enumerate(x0s):
            for j, _y0 in enumerate(y0s):
                prog.run_kernel("fill_psf_grid2", block_size, None,
                                tmp_g.data,
                                np.int32(Nx),
                                np.int32(i * Nblock_x),
                                np.int32(j * Nblock_y),
                                h_g.data,
                                np.int32(Npatch_x),
                                np.int32(Npatch_y),
                                shift_x,
                                shift_y,
                                _patch_offset(i, j))
    else:
        hs = np.fft.fftshift(pad_to_shape(hs, stack_shape), axes=(2, 3))
        h_g = OCLArray.from_array(hs.astype(np.complex64))

    # prepare image
    im_g = OCLImage.from_array(im.astype(np.float32, copy=False))

    for i, _x0 in enumerate(x0s):
        patch_x0 = np.int32(_x0 + Nblock_x // 2 - Npatch_x // 2)
        for j, _y0 in enumerate(y0s):
            patch_y0 = np.int32(_y0 + Nblock_y // 2 - Npatch_y // 2)
            prog.run_kernel("fill_patch2", patch_size, None,
                            im_g,
                            patch_x0,
                            patch_y0,
                            patches_g.data,
                            _patch_offset(i, j))

    # convolution: multiply the transformed patches with the transformed psfs
    fft(patches_g, inplace=True, plan=plan)
    fft(h_g, inplace=True, plan=plan)
    prog.run_kernel("mult_inplace", (patch_len * Gx * Gy,), None,
                    patches_g.data, h_g.data)
    fft(patches_g, inplace=True, inverse=True, plan=plan)

    logger.debug("Nblock_x: {}, Npatch_x: {}".format(Nblock_x, Npatch_x))

    # accumulate
    res_g = OCLArray.empty(im.shape, np.float32)

    for j in range(Gy + 1):
        for i in range(Gx + 1):
            prog.run_kernel("interpolate2", block_size, None,
                            patches_g.data, res_g.data,
                            np.int32(i), np.int32(j),
                            np.int32(Gx), np.int32(Gy),
                            np.int32(Npatch_x), np.int32(Npatch_y))

    res = res_g.get()

    return (res, plan) if return_plan else res
def convolve_spatial3(im, hs,
                      mode="constant",
                      plan=None,
                      return_plan=False,
                      pad_factor=2):
    """
    spatial varying convolution of an 3d image with a 3d grid of psfs

    shape(im_ = (Nz,Ny,Nx)
    shape(hs) = (Gz,Gy,Gx, Hz,Hy,Hx)

    the input image im is subdivided into (Gx,Gy,Gz) blocks
    hs[k,j,i] is the psf at the center of each block (i,j,k)

    as of now each image dimension has to be divisible by the grid dim, i.e.
    Nx % Gx == 0
    Ny % Gy == 0
    Nz % Gz == 0

    mode can be:
    "constant" - assumed values to be zero
    "wrap" - periodic boundary condition

    :param plan: an fft plan to reuse; created for one patch if None
    :param return_plan: if True, returns (result, plan) instead of result
    :param pad_factor: patches have the size of the next power of two of
        pad_factor times the block size
    :raises ValueError: if im is not 3d or hs is not 6d
    :raises NotImplementedError: if the image shape is not divisible by the
        grid shape
    """
    if im.ndim != 3 or hs.ndim != 6:
        raise ValueError("wrong dimensions of input!")

    if not np.all([n % g == 0 for n, g in zip(im.shape, hs.shape[:3])]):
        raise NotImplementedError(
            "shape of image has to be divisible by Gx Gy  = %s !" % (str(hs.shape[:3])))

    mode_str = {"constant": "CLK_ADDRESS_CLAMP",
                "wrap": "CLK_ADDRESS_REPEAT"}

    Ns = tuple(im.shape)
    Gs = tuple(hs.shape[:3])

    # the size of each block within the grid
    # (floor division: the values are used as kernel work sizes and integer
    # offsets, and "/" would yield floats on Python 3)
    Nblocks = [n // g for n, g in zip(Ns, Gs)]

    # the size of the overlapping patches with safety padding
    Npatchs = tuple([_next_power_of_2(pad_factor * nb) for nb in Nblocks])

    print(hs.shape)
    hs = np.fft.fftshift(pad_to_shape(hs, Gs + Npatchs), axes=(3, 4, 5))

    prog = OCLProgram(abspath("kernels/conv_spatial.cl"),
                      build_options=["-D", "ADDRESSMODE=%s" % mode_str[mode]])

    if plan is None:
        plan = fft_plan(Npatchs)

    patches_g = OCLArray.empty(Gs + Npatchs, np.complex64)

    h_g = OCLArray.from_array(hs.astype(np.complex64))

    im_g = OCLImage.from_array(im.astype(np.float32, copy=False))

    Xs = [nb * np.arange(g) for nb, g in zip(Nblocks, Gs)]

    print(Nblocks)

    # number of elements of a single patch (loop invariant)
    patch_len = np.prod(Npatchs)

    # this loops over all i,j,k
    for (k, _z0), (j, _y0), (i, _x0) in product(*[enumerate(X) for X in Xs]):
        prog.run_kernel("fill_patch3", Npatchs[::-1], None,
                        im_g,
                        np.int32(_x0 + Nblocks[2] // 2 - Npatchs[2] // 2),
                        np.int32(_y0 + Nblocks[1] // 2 - Npatchs[1] // 2),
                        np.int32(_z0 + Nblocks[0] // 2 - Npatchs[0] // 2),
                        patches_g.data,
                        # flat offset of patch (k,j,i) in the buffer
                        np.int32(i * patch_len +
                                 j * Gs[2] * patch_len +
                                 k * Gs[2] * Gs[1] * patch_len))

    # same output as the former print statement, on Python 2 and 3
    print("%s %s" % (patches_g.shape, h_g.shape))

    # convolution
    fft(patches_g, inplace=True, batch=np.prod(Gs), plan=plan)
    fft(h_g, inplace=True, batch=np.prod(Gs), plan=plan)
    prog.run_kernel("mult_inplace", (patch_len * np.prod(Gs),), None,
                    patches_g.data, h_g.data)

    fft(patches_g,
        inplace=True,
        inverse=True,
        batch=np.prod(Gs), plan=plan)

    # accumulate
    res_g = OCLArray.zeros(im.shape, np.float32)

    for k, j, i in product(*[range(g + 1) for g in Gs]):
        prog.run_kernel("interpolate3", Nblocks[::-1], None,
                        patches_g.data,
                        res_g.data,
                        np.int32(i), np.int32(j), np.int32(k),
                        np.int32(Gs[2]), np.int32(Gs[1]), np.int32(Gs[0]),
                        np.int32(Npatchs[2]), np.int32(Npatchs[1]), np.int32(Npatchs[0]))

    res = res_g.get()

    if return_plan:
        return res, plan
    else:
        return res
def convolve_spatial3(im, hs,
                      mode="constant",
                      plan=None,
                      return_plan=False,
                      pad_factor=2):
    """
    spatial varying convolution of an 3d image with a 3d grid of psfs

    shape(im_ = (Nz,Ny,Nx)
    shape(hs) = (Gz,Gy,Gx, Hz,Hy,Hx)

    the input image im is subdivided into (Gx,Gy,Gz) blocks
    hs[k,j,i] is the psf at the center of each block (i,j,k)

    as of now each image dimension has to be divisible by the grid dim, i.e.
    Nx % Gx == 0
    Ny % Gy == 0
    Nz % Gz == 0

    mode can be:
    "constant" - assumed values to be zero
    "wrap" - periodic boundary condition

    :param plan: an fft plan to reuse; created for one patch if None
    :param return_plan: if True, returns (result, plan) instead of result
    :param pad_factor: patches have the size of the next power of two of
        pad_factor times the block size
    :raises ValueError: if im is not 3d or hs is not 6d
    :raises NotImplementedError: if the image shape is not divisible by the
        grid shape
    """
    if im.ndim != 3 or hs.ndim != 6:
        raise ValueError("wrong dimensions of input!")

    if not np.all([n % g == 0 for n, g in zip(im.shape, hs.shape[:3])]):
        raise NotImplementedError(
            "shape of image has to be divisible by Gx Gy  = %s !" %
            (str(hs.shape[:3])))

    mode_str = {"constant": "CLK_ADDRESS_CLAMP", "wrap": "CLK_ADDRESS_REPEAT"}

    Ns = tuple(im.shape)
    Gs = tuple(hs.shape[:3])

    # the size of each block within the grid
    # Must be floor division: true division yields floats on Python 3, and
    # these values become kernel work sizes and integer patch offsets. The
    # divisibility check above guarantees the division is exact.
    Nblocks = [n // g for n, g in zip(Ns, Gs)]

    # the size of the overlapping patches with safety padding
    Npatchs = tuple([_next_power_of_2(pad_factor * nb) for nb in Nblocks])

    print(hs.shape)
    hs = np.fft.fftshift(pad_to_shape(hs, Gs + Npatchs), axes=(3, 4, 5))

    prog = OCLProgram(abspath("kernels/conv_spatial.cl"),
                      build_options=["-D", "ADDRESSMODE=%s" % mode_str[mode]])

    if plan is None:
        plan = fft_plan(Npatchs)

    patches_g = OCLArray.empty(Gs + Npatchs, np.complex64)

    h_g = OCLArray.from_array(hs.astype(np.complex64))

    im_g = OCLImage.from_array(im.astype(np.float32, copy=False))

    Xs = [nb * np.arange(g) for nb, g in zip(Nblocks, Gs)]

    print(Nblocks)

    # number of elements of a single patch (loop invariant)
    patch_len = np.prod(Npatchs)

    # this loops over all i,j,k
    for (k, _z0), (j, _y0), (i, _x0) in product(*[enumerate(X) for X in Xs]):
        prog.run_kernel(
            "fill_patch3", Npatchs[::-1], None,
            im_g,
            # integer origin of the padded patch around block (k,j,i)
            np.int32(_x0 + Nblocks[2] // 2 - Npatchs[2] // 2),
            np.int32(_y0 + Nblocks[1] // 2 - Npatchs[1] // 2),
            np.int32(_z0 + Nblocks[0] // 2 - Npatchs[0] // 2),
            patches_g.data,
            # flat offset of patch (k,j,i) in the buffer
            np.int32(i * patch_len +
                     j * Gs[2] * patch_len +
                     k * Gs[2] * Gs[1] * patch_len))

    print(patches_g.shape, h_g.shape)

    # convolution
    fft(patches_g, inplace=True, batch=np.prod(Gs), plan=plan)
    fft(h_g, inplace=True, batch=np.prod(Gs), plan=plan)
    prog.run_kernel("mult_inplace", (patch_len * np.prod(Gs),), None,
                    patches_g.data, h_g.data)

    fft(patches_g, inplace=True, inverse=True, batch=np.prod(Gs), plan=plan)

    # accumulate
    res_g = OCLArray.zeros(im.shape, np.float32)

    for k, j, i in product(*[range(g + 1) for g in Gs]):
        prog.run_kernel("interpolate3", Nblocks[::-1], None,
                        patches_g.data, res_g.data,
                        np.int32(i), np.int32(j), np.int32(k),
                        np.int32(Gs[2]), np.int32(Gs[1]), np.int32(Gs[0]),
                        np.int32(Npatchs[2]), np.int32(Npatchs[1]),
                        np.int32(Npatchs[0]))

    res = res_g.get()

    if return_plan:
        return res, plan
    else:
        return res
def convolve_spatial2(im, hs,
                      mode="constant",
                      plan=None,
                      return_plan=False):
    """
    spatial varying convolution of an 2d image with a 2d grid of psfs

    shape(im_ = (Ny,Nx)
    shape(hs) = (Gy,Gx, Hy,Hx)

    the input image im is subdivided into (Gy,Gx) blocks
    hs[j,i] is the psf at the center of each block (i,j)

    as of now each image dimension has to be divisible by the grid dim, i.e.
    Nx % Gx == 0
    Ny % Gy == 0

    mode can be:
    "constant" - assumed values to be zero
    "wrap" - periodic boundary condition

    :param plan: an fft plan to reuse; created for one patch if None
    :param return_plan: if True, returns (result, plan) instead of result
    :raises ValueError: if im is not 2d or hs is not 4d
    :raises NotImplementedError: if the image shape is not divisible by the
        grid shape
    """
    if im.ndim != 2 or hs.ndim != 4:
        raise ValueError("wrong dimensions of input!")

    if not np.all([n % g == 0 for n, g in zip(im.shape, hs.shape[:2])]):
        raise NotImplementedError(
            "shape of image has to be divisible by Gx Gy  = %s shape mismatch" %
            (str(hs.shape[:2])))

    mode_str = {"constant": "CLK_ADDRESS_CLAMP", "wrap": "CLK_ADDRESS_REPEAT"}

    Ny, Nx = im.shape
    Gy, Gx = hs.shape[:2]

    # the size of each block within the grid
    # Must be floor division: true division yields floats on Python 3, and
    # these values become kernel work sizes and integer patch offsets. The
    # divisibility check above guarantees the division is exact.
    Nblock_y, Nblock_x = Ny // Gy, Nx // Gx

    # the size of the overlapping patches with safety padding
    Npatch_x = _next_power_of_2(3 * Nblock_x)
    Npatch_y = _next_power_of_2(3 * Nblock_y)

    print(Nblock_x, Npatch_x)

    hs = np.fft.fftshift(pad_to_shape(hs, (Gy, Gx, Npatch_y, Npatch_x)),
                         axes=(2, 3))

    prog = OCLProgram(abspath("kernels/conv_spatial.cl"),
                      build_options=["-D", "ADDRESSMODE=%s" % mode_str[mode]])

    if plan is None:
        plan = fft_plan((Npatch_y, Npatch_x))

    patches_g = OCLArray.empty((Gy, Gx, Npatch_y, Npatch_x), np.complex64)

    h_g = OCLArray.from_array(hs.astype(np.complex64))

    im_g = OCLImage.from_array(im.astype(np.float32, copy=False))

    x0s = Nblock_x * np.arange(Gx)
    y0s = Nblock_y * np.arange(Gy)

    print(x0s)

    for i, _x0 in enumerate(x0s):
        for j, _y0 in enumerate(y0s):
            prog.run_kernel(
                "fill_patch2", (Npatch_x, Npatch_y), None,
                im_g,
                # integer origin of the padded patch around block (j,i)
                np.int32(_x0 + Nblock_x // 2 - Npatch_x // 2),
                np.int32(_y0 + Nblock_y // 2 - Npatch_y // 2),
                patches_g.data,
                # flat offset of patch (j,i) in the buffer
                np.int32(i * Npatch_x * Npatch_y +
                         j * Gx * Npatch_x * Npatch_y))

    # convolution
    fft(patches_g, inplace=True, batch=Gx * Gy, plan=plan)
    fft(h_g, inplace=True, batch=Gx * Gy, plan=plan)
    prog.run_kernel("mult_inplace", (Npatch_x * Npatch_y * Gx * Gy,), None,
                    patches_g.data, h_g.data)

    fft(patches_g, inplace=True, inverse=True, batch=Gx * Gy, plan=plan)

    # accumulate
    res_g = OCLArray.empty(im.shape, np.float32)

    for i in range(Gx + 1):
        for j in range(Gy + 1):
            prog.run_kernel("interpolate2", (Nblock_x, Nblock_y), None,
                            patches_g.data, res_g.data,
                            np.int32(i), np.int32(j),
                            np.int32(Gx), np.int32(Gy),
                            np.int32(Npatch_x), np.int32(Npatch_y))

    res = res_g.get()

    if return_plan:
        return res, plan
    else:
        return res
def _bpm_3d2(size, units,
             lam=.5,
             u0=None, dn=None,
             subsample=1,
             n0=1.,
             return_scattering=False,
             return_g=False,
             return_full=True,
             return_field=True,
             use_fresnel_approx=False,
             absorbing_width=0,
             scattering_plane_ind=0,
             return_last_plane=False,
             store_dn_as_half=False):
    """
    simulates the propagation of monochromatic wave of wavelength lam with initial conditions u0 along z in a media filled with dn

    size     -    the dimension of the image to be calculated in pixels (Nx,Ny,Nz)
    units    -    the unit lengths of each dimensions in microns
    lam      -    the wavelength
    u0       -    the initial field distribution, if u0 = None an incident plane wave is assumed
    dn       -    the refractive index of the medium (can be complex),
                  either a numpy array or an OCLArray
    subsample -   has to be 1
    n0       -    the refractive index used in the propagator
    return_scattering - if True, additionally returns the per-plane
                  reduction with the cos(theta) weights (p)
    return_g -    if True (and return_scattering is True), additionally
                  returns the cos(theta)^2 reduction divided by p (g)
    return_full - if True, returns all Nz planes, otherwise only the last one
    return_field - if True, returns the complex field, otherwise the intensity
    use_fresnel_approx - if True, uses the fresnel approximation for the propagator
    absorbing_width - passed on to the dn multiplication kernels
    scattering_plane_ind - plane offset in the phase of the reference plane wave
    return_last_plane - if True, the last plane is appended to the result
    store_dn_as_half - if True, a real numpy dn is uploaded as float16

    returns u, or (u, p), or (u, p, g), with the last plane appended if
    return_last_plane is set

    raises NotImplementedError if subsample != 1
    """
    if subsample != 1:
        raise NotImplementedError("subsample still has to be 1")

    clock = StopWatch()

    clock.tic("setup")

    Nx, Ny, Nz = size
    dx, dy, dz = units

    # setting up the propagator
    k0 = 2. * np.pi / lam

    kxs = 2. * np.pi * np.fft.fftfreq(Nx, dx)
    kys = 2. * np.pi * np.fft.fftfreq(Ny, dy)

    KY, KX = np.meshgrid(kys, kxs, indexing="ij")

    if use_fresnel_approx:
        H0 = 0.j + n0 * k0 - .5 * (KX**2 + KY**2) / n0 / k0
    else:
        # real sqrt on purpose: negative arguments become nan and are
        # zeroed below via outsideInds
        H0 = np.sqrt(n0**2 * k0**2 - KX**2 - KY**2)

    outsideInds = np.isnan(H0)

    H = np.exp(-1.j * dz * H0)

    H[outsideInds] = 0.
    H0[outsideInds] = 0.

    if u0 is None:
        u0 = np.ones((Ny, Nx), np.complex64)

    # setting up the gpu buffers and kernels
    program = OCLProgram(absPath("kernels/bpm_3d_kernels.cl"))

    plan = fft_plan((Ny, Nx))
    plane_g = OCLArray.from_array(u0.astype(np.complex64, copy=False))

    h_g = OCLArray.from_array(H.astype(np.complex64))

    complex_types = (np.complex64, np.complex128)
    isComplexDn = False
    if dn is not None:
        if isinstance(dn, OCLArray):
            dn_g = dn
        elif dn.dtype.type in complex_types:
            dn_g = OCLArray.from_array(dn.astype(np.complex64, copy=False))
        elif store_dn_as_half:
            dn_g = OCLArray.from_array(dn.astype(np.float16, copy=False))
        else:
            dn_g = OCLArray.from_array(dn.astype(np.float32, copy=False))
        # Derived from the uploaded buffer so that a dn handed in as
        # OCLArray is covered as well (the flag used to stay undefined in
        # that case and the propagation loop failed with a NameError).
        isComplexDn = dn_g.dtype.type in complex_types
    else:
        # dummy dn
        dn_g = OCLArray.empty((1,) * 3, np.float32)

    if return_scattering:
        cos_theta = np.real(H0) / n0 / k0

        # = cos(theta)
        scatter_weights = cos_theta
        scatter_weights_g = OCLArray.from_array(scatter_weights.astype(np.float32))

        # = cos(theta)^2
        gfactor_weights = cos_theta**2
        gfactor_weights_g = OCLArray.from_array(gfactor_weights.astype(np.float32))

        scatter_cross_sec_g = OCLArray.zeros(Nz, "float32")
        gfactor_g = OCLArray.zeros(Nz, "float32")

        # reference plane wave value per plane (subtracted from element 0
        # inside the reduction)
        plain_wave_dct = Nx * Ny * np.exp(
            -1.j * k0 * n0 * (scattering_plane_ind + np.arange(Nz)) * dz
        ).astype(np.complex64)

        reduce_kernel = OCLReductionKernel(
            np.float32, neutral="0",
            reduce_expr="a+b",
            map_expr="weights[i]*cfloat_abs(field[i]-(i==0)*plain)*cfloat_abs(field[i]-(i==0)*plain)",
            arguments="__global cfloat_t *field, __global float * weights,cfloat_t plain")

    if return_full:
        if return_field:
            u_g = OCLArray.empty((Nz, Ny, Nx), dtype=np.complex64)
            u_g[0] = plane_g
        else:
            u_g = OCLArray.empty((Nz, Ny, Nx), dtype=np.float32)
            program.run_kernel("copy_intens", (Nx * Ny,), None,
                               plane_g.data, u_g.data, np.int32(0))

    clock.toc("setup")

    clock.tic("run")

    for i in range(Nz - 1):
        fft(plane_g, inplace=True, plan=plan)

        program.run_kernel("mult", (Nx * Ny,), None,
                           plane_g.data, h_g.data)

        if return_scattering:
            # both reductions act on the plane while it is in fourier space
            scatter_cross_sec_g[i + 1] = reduce_kernel(plane_g,
                                                       scatter_weights_g,
                                                       plain_wave_dct[i + 1])
            gfactor_g[i + 1] = reduce_kernel(plane_g,
                                             gfactor_weights_g,
                                             plain_wave_dct[i + 1])

        fft(plane_g, inplace=True, inverse=True, plan=plan)

        if dn is not None:
            if isComplexDn:
                kernel_str = "mult_dn_complex"
            elif dn_g.dtype.type == np.float16:
                kernel_str = "mult_dn_half"
            else:
                kernel_str = "mult_dn"

            program.run_kernel(kernel_str, (Nx, Ny,), None,
                               plane_g.data, dn_g.data,
                               np.float32(k0 * dz),
                               np.int32(Nx * Ny * (i + 1)),
                               np.int32(absorbing_width))

        if return_full:
            if return_field:
                u_g[i + 1] = plane_g
            else:
                program.run_kernel("copy_intens", (Nx * Ny,), None,
                                   plane_g.data, u_g.data,
                                   np.int32(Nx * Ny * (i + 1)))

    clock.toc("run")

    # print() with a single argument behaves the same on Python 2 and 3
    print(clock)

    if return_full:
        u = u_g.get()
    else:
        u = plane_g.get()
        if not return_field:
            u = np.abs(u)**2

    if return_scattering:
        # normalizing prefactor
        prefac = 1. / Nx / Ny * dx * dy
        p = prefac * scatter_cross_sec_g.get()

        # g needs the buffers that only exist with return_scattering;
        # asking for return_g alone used to end in a NameError here
        if return_g:
            g = prefac * gfactor_g.get() / p
            result = u, p, g
        else:
            result = u, p
    else:
        result = u

    if return_last_plane:
        if isinstance(result, tuple):
            result = result + (plane_g.get(),)
        else:
            result = (result, plane_g.get())

    return result
def _bpm_3d_image(size, units, lam = .5, u0 = None, dn = None,
                  subsample = 1,
                  n0 = 1.,
                  return_scattering = False,
                  return_g = False,
                  return_full_last = False,
                  use_fresnel_approx = False):
    """
    Simulate the propagation of a monochromatic wave of wavelength lam with
    initial condition u0 along z through a medium described by dn.

    The refractive index distribution is handed to the kernels as an OpenCL
    image.  Internally the grid is refined by ``subsample`` (lateral pixel
    counts are multiplied, voxel sizes divided, and every z slice is
    propagated in ``subsample`` substeps); the stored field is sampled back
    onto the original (Nz,Ny,Nx) grid.

    :param size: the dimension of the image to be calculated in pixels (Nx,Ny,Nz)
    :param units: the unit lengths of each dimension in microns (dx,dy,dz)
    :param lam: the wavelength
    :param u0: the initial field distribution; if None an incident plane
        wave is assumed
    :param dn: the refractive index of the medium (can be complex), either a
        numpy array or an already uploaded OCLImage; if None no refractive
        step is applied
    :param subsample: integer refinement factor of the internal grid
    :param n0: the refractive index of the surrounding medium
    :param return_scattering: if True, append the per-slice scattering
        cross section ``p`` to the result
    :param return_g: if True (and return_scattering is True), append the
        per-slice g factor ``g`` to the result
    :param return_full_last: if True, append the last plane on the
        subsampled grid to the result
    :param use_fresnel_approx: if True, use the Fresnel branch of the
        propagator
    :return: tuple ``(u, dn)`` holding the content of the (Nz,Ny,Nx)
        complex64 field buffer and of the device side dn buffer (a dummy
        buffer when ``dn`` is None), extended in this order by ``p``, ``g``
        and the last plane when requested
    """
    clock = StopWatch()

    clock.tic("setup")

    Nx, Ny, Nz = size
    dx, dy, dz = units

    # subsampling: refine the lateral pixel count and all voxel sizes
    Nx2, Ny2 = subsample*Nx, subsample*Ny
    dx2, dy2, dz2 = (1.*d/subsample for d in units)

    # setting up the propagator
    k0 = 2.*np.pi/lam

    kxs = 2.*np.pi*np.fft.fftfreq(Nx2, dx2)
    kys = 2.*np.pi*np.fft.fftfreq(Ny2, dy2)

    KY, KX = np.meshgrid(kys, kxs, indexing = "ij")

    # real sqrt on purpose: evanescent components become nan and are
    # zeroed out below
    H0 = np.sqrt(n0**2*k0**2-KX**2-KY**2)

    if use_fresnel_approx:
        # NOTE(review): the paraxial expansion of sqrt(n0^2 k0^2 - k_perp^2)
        # is n0*k0 - k_perp^2/(2*n0*k0); the expression below differs from
        # that - confirm before relying on use_fresnel_approx
        H0 = 0.j+n0**2*k0-.5*(KX**2+KY**2)

    outsideInds = np.isnan(H0)

    H = np.exp(-1.j*dz2*H0)

    H[outsideInds] = 0.
    H0[outsideInds] = 0.

    if u0 is None:
        u0 = np.ones((Ny2, Nx2), np.complex64)
    elif subsample > 1:
        # zoom real and imaginary part separately
        u0 = zoom(np.real(u0), subsample) + 1.j*zoom(np.imag(u0), subsample)

    # setting up the gpu buffers and kernels
    program = OCLProgram(absPath("kernels/bpm_3d_kernels.cl"))

    plan = fft_plan((Ny2, Nx2))
    plane_g = OCLArray.from_array(u0.astype(np.complex64))

    h_g = OCLArray.from_array(H.astype(np.complex64))

    # always bound, so the propagation loop can never hit an undefined name
    # NOTE(review): an OCLImage passed in as dn is treated as real valued -
    # confirm whether two-channel (complex) images have to be supported
    is_complex_dn = False

    if dn is None:
        # dummy dn
        dn_g = OCLArray.empty((1,)*3, np.float16)
    elif isinstance(dn, OCLImage):
        dn_g = dn
    else:
        is_complex_dn = dn.dtype.type in (np.complex64, np.complex128)
        if is_complex_dn:
            # real and imaginary part go into the two channels of the image
            dn_complex = np.zeros(dn.shape+(2,), np.float32)
            dn_complex[..., 0] = np.real(dn)
            dn_complex[..., 1] = np.imag(dn)
            dn_g = OCLImage.from_array(dn_complex)
        else:
            dn_g = OCLImage.from_array(dn.astype(np.float32))

    if return_scattering:
        cos_theta = np.real(H0)/n0/k0

        # = cos(theta)
        scatter_weights = cos_theta
        scatter_weights_g = OCLArray.from_array(scatter_weights.astype(np.float32))

        # = cos(theta)^2
        gfactor_weights = cos_theta**2
        gfactor_weights_g = OCLArray.from_array(gfactor_weights.astype(np.float32))

        scatter_cross_sec_g = OCLArray.zeros(Nz, "float32")
        gfactor_g = OCLArray.zeros(Nz, "float32")

        # dc component of the unscattered plane wave for every slice
        plain_wave_dct = Nx2*Ny2*np.exp(-1.j*k0*n0*np.arange(Nz)*dz).astype(np.complex64)

        # weighted sum of |field - plane wave|^2, the plane wave only
        # contributing to the dc (i==0) component
        reduce_kernel = OCLReductionKernel(
            np.float32, neutral="0",
            reduce_expr="a+b",
            map_expr="weights[i]*cfloat_abs(field[i]-(i==0)*plain)*cfloat_abs(field[i]-(i==0)*plain)",
            arguments="__global cfloat_t *field, __global float * weights,cfloat_t plain")

    u_g = OCLArray.empty((Nz, Ny, Nx), dtype=np.complex64)

    program.run_kernel("copy_subsampled_buffer", (Nx, Ny), None,
                       u_g.data, plane_g.data,
                       np.int32(subsample),
                       np.int32(0))

    clock.toc("setup")

    clock.tic("run")

    # both dn kernels take the same argument list
    dn_kernel = "mult_dn_complex_image" if is_complex_dn else "mult_dn_image"

    for i in range(Nz-1):
        for substep in range(subsample):
            fft(plane_g, inplace = True, plan = plan)

            program.run_kernel("mult", (Nx2*Ny2,), None,
                               plane_g.data, h_g.data)

            # scattering is only evaluated on the last substep of a slice
            if return_scattering and substep == (subsample-1):
                scatter_cross_sec_g[i+1] = reduce_kernel(plane_g,
                                                         scatter_weights_g,
                                                         plain_wave_dct[i+1])
                gfactor_g[i+1] = reduce_kernel(plane_g,
                                               gfactor_weights_g,
                                               plain_wave_dct[i+1])

            fft(plane_g, inplace = True, inverse = True, plan = plan)

            if dn is not None:
                program.run_kernel(dn_kernel, (Nx2, Ny2), None,
                                   plane_g.data, dn_g,
                                   np.float32(k0*dz2),
                                   np.float32(n0),
                                   np.int32(subsample*(i+1.)+substep),
                                   np.int32(subsample))

        program.run_kernel("copy_subsampled_buffer", (Nx, Ny), None,
                           u_g.data, plane_g.data,
                           np.int32(subsample),
                           np.int32((i+1)*Nx*Ny))

    clock.toc("run")

    # single-argument call form prints the same under Python 2 and 3
    print(clock)

    result = (u_g.get(), dn_g.get(),)

    if return_scattering:
        # normalizing prefactor
        prefac = 1./Nx2/Ny2*dx2*dy2
        p = prefac*scatter_cross_sec_g.get()
        result += (p,)

        # the g factor is normalised by p, hence only available together
        # with the scattering cross section
        if return_g:
            g = prefac*gfactor_g.get()/p
            result += (g,)

    if return_full_last:
        result += (plane_g.get(),)

    return result
def convolve_spatial2(im, hs,
                      mode = "constant",
                      plan = None,
                      return_plan = False):
    """
    Spatially varying convolution of a 2d image with a 2d grid of psfs.

    shape(im) = (Ny,Nx)
    shape(hs) = (Gy,Gx, Hy,Hx)

    The input image im is subdivided into (Gy,Gx) blocks;
    hs[j,i] is the psf at the center of each block (i,j).

    As of now each image dimension has to be divisible by the grid dim, i.e.
    Nx % Gx == 0
    Ny % Gy == 0

    :param im: the 2d input image
    :param hs: the 4d grid of psfs
    :param mode: boundary handling, one of
        "constant" - assumed values to be zero
        "wrap" - periodic boundary condition
    :param plan: fft plan for patches of shape (Npatch_y,Npatch_x); created
        if None
    :param return_plan: if True, return the plan together with the result
    :return: the float32 result of shape im.shape, or (result, plan) if
        return_plan is True
    :raises ValueError: if im is not 2d or hs is not 4d
    :raises NotImplementedError: if the image shape is not divisible by the
        grid shape
    """
    if im.ndim != 2 or hs.ndim != 4:
        raise ValueError("wrong dimensions of input!")

    if not np.all([n%g==0 for n,g in zip(im.shape,hs.shape[:2])]):
        raise NotImplementedError("shape of image has to be divisible by Gx Gy = %s shape mismatch"%(str(hs.shape[:2])))

    mode_str = {"constant":"CLK_ADDRESS_CLAMP",
                "wrap":"CLK_ADDRESS_REPEAT"}

    Ny, Nx = im.shape
    Gy, Gx = hs.shape[:2]

    # the size of each block within the grid
    # (floor division keeps these integers under Python 2 and 3 alike)
    Nblock_y, Nblock_x = Ny//Gy, Nx//Gx

    # the size of the overlapping patches with safety padding
    Npatch_x, Npatch_y = _next_power_of_2(3*Nblock_x), _next_power_of_2(3*Nblock_y)

    # pad every psf to the patch size and move its center to the origin
    hs = np.fft.fftshift(pad_to_shape(hs, (Gy, Gx, Npatch_y, Npatch_x)), axes=(2, 3))

    prog = OCLProgram(abspath("kernels/conv_spatial.cl"),
                      build_options=["-D","ADDRESSMODE=%s"%mode_str[mode]])

    if plan is None:
        plan = fft_plan((Npatch_y, Npatch_x))

    patches_g = OCLArray.empty((Gy, Gx, Npatch_y, Npatch_x), np.complex64)

    h_g = OCLArray.from_array(hs.astype(np.complex64))

    im_g = OCLImage.from_array(im.astype(np.float32, copy=False))

    # origins of the blocks within the image
    x0s = Nblock_x*np.arange(Gx)
    y0s = Nblock_y*np.arange(Gy)

    patch_size = Npatch_x*Npatch_y

    for i, _x0 in enumerate(x0s):
        for j, _y0 in enumerate(y0s):
            # the patch is centered on its block; the last argument is the
            # offset of patch (j,i) inside the flat patches buffer
            prog.run_kernel("fill_patch2", (Npatch_x, Npatch_y), None,
                            im_g,
                            np.int32(_x0+Nblock_x//2-Npatch_x//2),
                            np.int32(_y0+Nblock_y//2-Npatch_y//2),
                            patches_g.data,
                            np.int32(i*patch_size+j*Gx*patch_size))

    # convolution
    fft(patches_g, inplace=True, batch = Gx*Gy, plan = plan)
    fft(h_g, inplace=True, batch = Gx*Gy, plan = plan)
    prog.run_kernel("mult_inplace", (Npatch_x*Npatch_y*Gx*Gy,), None,
                    patches_g.data, h_g.data)

    fft(patches_g, inplace=True, inverse = True, batch = Gx*Gy, plan = plan)

    # accumulate: one launch per grid node, i.e. Gx+1 times Gy+1 launches
    res_g = OCLArray.empty(im.shape, np.float32)

    for i in range(Gx+1):
        for j in range(Gy+1):
            prog.run_kernel("interpolate2", (Nblock_x, Nblock_y), None,
                            patches_g.data, res_g.data,
                            np.int32(i), np.int32(j),
                            np.int32(Gx), np.int32(Gy),
                            np.int32(Npatch_x), np.int32(Npatch_y))

    res = res_g.get()

    if return_plan:
        return res, plan
    else:
        return res
def _convolve_spatial3(im, hs,
                       mode="constant",
                       grid_dim=None,
                       plan=None,
                       return_plan=False,
                       pad_factor=2):
    """
    Spatially varying convolution of a 3d image with a 3d grid of psfs.

    The image is subdivided into Gs = (Gz,Gy,Gx) blocks.  Around every block
    a patch of the next power of two of ``pad_factor`` times the block size
    is cut out, multiplied with its psf in Fourier space, and the patches
    are combined into the result by the ``interpolate3`` kernel.

    :param im: the 3d input image
    :param hs: either a 6d array whose first three axes index the grid and
        whose last three axes hold the psfs, or - if grid_dim is given - a
        3d array of the same shape as im from which the psfs are cut out
        blockwise
    :param mode: boundary handling, one of "constant", "wrap", "edge",
        "reflect"
    :param grid_dim: the grid dimensions (Gz,Gy,Gx), only used with 3d hs
    :param plan: fft plan for arrays of shape Gs + patch shape; created if
        None
    :param return_plan: if True, return the plan together with the result
    :param pad_factor: patch size relative to the block size (before
        rounding up to a power of two)
    :return: the float32 result of shape im.shape, or (result, plan) if
        return_plan is True
    :raises ValueError: if the dimensions of im, hs or grid_dim do not fit
    :raises NotImplementedError: if the image shape is not divisible by the
        grid dimensions
    """
    if im.ndim != 3:
        raise ValueError("wrong dimensions of input!")

    if not (hs.ndim == 6 or (hs.ndim == 3 and grid_dim)):
        raise ValueError("wrong dimensions of psf grid!")

    if grid_dim:
        if hs.shape != im.shape:
            raise ValueError("if grid_dim is set, then im.shape = hs.shape !")
        Gs = tuple(grid_dim)
        if len(Gs) != 3:
            raise ValueError("grid_dim has to be of length 3, got %s" % str(Gs))
    else:
        # hs.ndim == 6 is guaranteed by the check above
        Gs = hs.shape[:3]

    if not np.all([n % g == 0 for n, g in zip(im.shape, Gs)]):
        raise NotImplementedError(
            "shape of image %s has to be divisible by the grid dimensions (Gz,Gy,Gx) = %s"
            % (str(im.shape), str(Gs)))

    mode_str = {
        "constant": "CLK_ADDRESS_CLAMP",
        "wrap": "CLK_ADDRESS_REPEAT",
        "edge": "CLK_ADDRESS_CLAMP_TO_EDGE",
        "reflect": "CLK_ADDRESS_MIRRORED_REPEAT"
    }

    Ns = im.shape

    # the size of each block within the grid
    Nblocks = [n // g for n, g in zip(Ns, Gs)]

    # the size of the overlapping patches with safety padding
    Npatchs = tuple([next_power_of_2(pad_factor * nb) for nb in Nblocks])

    # number of elements of a single patch, used for the buffer offsets
    patch_size = np.prod(Npatchs)

    prog = OCLProgram(abspath("kernels/conv_spatial3.cl"),
                      build_options=["-D", "ADDRESSMODE=%s" % mode_str[mode]])

    if plan is None:
        plan = fft_plan(Gs + Npatchs, axes=(-3, -2, -1))

    # origins of the blocks along every axis
    Xs = [nb * np.arange(g) for nb, g in zip(Nblocks, Gs)]

    patches_g = OCLArray.empty(Gs + Npatchs, np.complex64)

    # prepare psfs
    if grid_dim:
        h_g = OCLArray.zeros(Gs + Npatchs, np.complex64)
        tmp_g = OCLArray.from_array(hs.astype(np.float32, copy=False))
        for (k, _z0), (j, _y0), (i, _x0) in product(*[enumerate(X) for X in Xs]):
            prog.run_kernel(
                "fill_psf_grid3", Nblocks[::-1], None,
                tmp_g.data,
                np.int32(im.shape[2]),
                np.int32(im.shape[1]),
                np.int32(i * Nblocks[2]),
                np.int32(j * Nblocks[1]),
                np.int32(k * Nblocks[0]),
                h_g.data,
                np.int32(Npatchs[2]),
                np.int32(Npatchs[1]),
                np.int32(Npatchs[0]),
                np.int32(-Nblocks[2] // 2 + Npatchs[2] // 2),
                np.int32(-Nblocks[1] // 2 + Npatchs[1] // 2),
                np.int32(-Nblocks[0] // 2 + Npatchs[0] // 2),
                # offset of psf (k,j,i) inside the flat psf buffer
                np.int32(i * patch_size
                         + j * Gs[2] * patch_size
                         + k * Gs[2] * Gs[1] * patch_size))
    else:
        # pad every psf to the patch size and move its center to the origin
        hs = np.fft.fftshift(pad_to_shape(hs, Gs + Npatchs), axes=(3, 4, 5))
        h_g = OCLArray.from_array(hs.astype(np.complex64))

    im_g = OCLImage.from_array(im.astype(np.float32, copy=False))

    # this loops over all i,j,k
    for (k, _z0), (j, _y0), (i, _x0) in product(*[enumerate(X) for X in Xs]):
        prog.run_kernel(
            "fill_patch3", Npatchs[::-1], None,
            im_g,
            # the patch is centered on its block
            np.int32(_x0 + Nblocks[2] // 2 - Npatchs[2] // 2),
            np.int32(_y0 + Nblocks[1] // 2 - Npatchs[1] // 2),
            np.int32(_z0 + Nblocks[0] // 2 - Npatchs[0] // 2),
            patches_g.data,
            # offset of patch (k,j,i) inside the flat patches buffer
            np.int32(i * patch_size
                     + j * Gs[2] * patch_size
                     + k * Gs[2] * Gs[1] * patch_size))

    # convolution
    fft(patches_g, inplace=True, plan=plan)
    fft(h_g, inplace=True, plan=plan)
    prog.run_kernel("mult_inplace", (patch_size * np.prod(Gs), ), None,
                    patches_g.data, h_g.data)

    fft(patches_g, inplace=True, inverse=True, plan=plan)

    # accumulate: one launch per grid node, i.e. g+1 launches along each axis
    res_g = OCLArray.zeros(im.shape, np.float32)

    for k, j, i in product(*[list(range(g + 1)) for g in Gs]):
        prog.run_kernel("interpolate3", Nblocks[::-1], None,
                        patches_g.data,
                        res_g.data,
                        np.int32(i), np.int32(j), np.int32(k),
                        np.int32(Gs[2]), np.int32(Gs[1]), np.int32(Gs[0]),
                        np.int32(Npatchs[2]), np.int32(Npatchs[1]), np.int32(Npatchs[0]))

    res = res_g.get()

    if return_plan:
        return res, plan
    else:
        return res
def _convolve_spatial3(im, hs,
                       mode = "constant",
                       grid_dim = None,
                       plan = None,
                       return_plan = False,
                       pad_factor = 2):
    """
    Spatially varying convolution of a 3d image with a 3d grid of psfs.

    The image is subdivided into Gs = (Gz,Gy,Gx) blocks.  Around every block
    a patch of the next power of two of ``pad_factor`` times the block size
    is cut out, multiplied with its psf in Fourier space, and the patches
    are combined into the result by the ``interpolate3`` kernel.

    :param im: the 3d input image
    :param hs: either a 6d array whose first three axes index the grid and
        whose last three axes hold the psfs, or - if grid_dim is given - a
        3d array of the same shape as im from which the psfs are cut out
        blockwise
    :param mode: boundary handling, one of
        "constant" - mapped to CLK_ADDRESS_CLAMP
        "wrap" - mapped to CLK_ADDRESS_REPEAT
    :param grid_dim: the grid dimensions (Gz,Gy,Gx), only used with 3d hs
    :param plan: fft plan for a single patch, applied batched over all
        patches; created if None
    :param return_plan: if True, return the plan together with the result
    :param pad_factor: patch size relative to the block size (before
        rounding up to a power of two)
    :return: the float32 result of shape im.shape, or (result, plan) if
        return_plan is True
    :raises ValueError: if the dimensions of im, hs or grid_dim do not fit
    :raises NotImplementedError: if the image shape is not divisible by the
        grid dimensions
    """
    if im.ndim != 3:
        raise ValueError("wrong dimensions of input!")

    if not (hs.ndim == 6 or (hs.ndim == 3 and grid_dim)):
        raise ValueError("wrong dimensions of psf grid!")

    if grid_dim:
        if hs.shape != im.shape:
            raise ValueError("if grid_dim is set, then im.shape = hs.shape !")
        Gs = tuple(grid_dim)
        if len(Gs) != 3:
            raise ValueError("grid_dim has to be of length 3, got %s" % str(Gs))
    else:
        # hs.ndim == 6 is guaranteed by the check above
        Gs = hs.shape[:3]

    if not np.all([n%g==0 for n,g in zip(im.shape,Gs)]):
        raise NotImplementedError(
            "shape of image %s has to be divisible by the grid dimensions (Gz,Gy,Gx) = %s"
            % (str(im.shape), str(Gs)))

    mode_str = {"constant":"CLK_ADDRESS_CLAMP",
                "wrap":"CLK_ADDRESS_REPEAT"}

    Ns = im.shape

    # the size of each block within the grid
    # (floor division keeps these integers under Python 2 and 3 alike)
    Nblocks = [n//g for n,g in zip(Ns,Gs)]

    # the size of the overlapping patches with safety padding
    Npatchs = tuple([_next_power_of_2(pad_factor*nb) for nb in Nblocks])

    # number of elements of a single patch / number of patches
    patch_size = np.prod(Npatchs)
    n_patches = np.prod(Gs)

    prog = OCLProgram(abspath("kernels/conv_spatial3.cl"),
                      build_options=["-D","ADDRESSMODE=%s"%mode_str[mode]])

    if plan is None:
        plan = fft_plan(Npatchs)

    # origins of the blocks along every axis
    Xs = [nb*np.arange(g) for nb, g in zip(Nblocks,Gs)]

    patches_g = OCLArray.empty(Gs+Npatchs, np.complex64)

    # prepare psfs
    if grid_dim:
        h_g = OCLArray.zeros(Gs+Npatchs, np.complex64)
        tmp_g = OCLArray.from_array(hs.astype(np.float32, copy = False))
        for (k,_z0), (j,_y0), (i,_x0) in product(*[enumerate(X) for X in Xs]):
            prog.run_kernel("fill_psf_grid3",
                            Nblocks[::-1], None,
                            tmp_g.data,
                            np.int32(im.shape[2]),
                            np.int32(im.shape[1]),
                            np.int32(i*Nblocks[2]),
                            np.int32(j*Nblocks[1]),
                            np.int32(k*Nblocks[0]),
                            h_g.data,
                            np.int32(Npatchs[2]),
                            np.int32(Npatchs[1]),
                            np.int32(Npatchs[0]),
                            np.int32(-Nblocks[2]//2+Npatchs[2]//2),
                            np.int32(-Nblocks[1]//2+Npatchs[1]//2),
                            np.int32(-Nblocks[0]//2+Npatchs[0]//2),
                            # offset of psf (k,j,i) inside the flat psf buffer
                            np.int32(i*patch_size+
                                     j*Gs[2]*patch_size+
                                     k*Gs[2]*Gs[1]*patch_size))
    else:
        # pad every psf to the patch size and move its center to the origin
        hs = np.fft.fftshift(pad_to_shape(hs, Gs+Npatchs), axes=(3,4,5))
        h_g = OCLArray.from_array(hs.astype(np.complex64))

    im_g = OCLImage.from_array(im.astype(np.float32, copy=False))

    # this loops over all i,j,k
    for (k,_z0), (j,_y0), (i,_x0) in product(*[enumerate(X) for X in Xs]):
        prog.run_kernel("fill_patch3", Npatchs[::-1], None,
                        im_g,
                        # the patch is centered on its block
                        np.int32(_x0+Nblocks[2]//2-Npatchs[2]//2),
                        np.int32(_y0+Nblocks[1]//2-Npatchs[1]//2),
                        np.int32(_z0+Nblocks[0]//2-Npatchs[0]//2),
                        patches_g.data,
                        # offset of patch (k,j,i) inside the flat patches buffer
                        np.int32(i*patch_size+
                                 j*Gs[2]*patch_size+
                                 k*Gs[2]*Gs[1]*patch_size))

    # convolution
    fft(patches_g, inplace=True, batch = n_patches, plan = plan)
    fft(h_g, inplace=True, batch = n_patches, plan = plan)
    prog.run_kernel("mult_inplace", (patch_size*n_patches,), None,
                    patches_g.data, h_g.data)

    fft(patches_g,
        inplace=True,
        inverse = True,
        batch = n_patches,
        plan = plan)

    # accumulate: one launch per grid node, i.e. g+1 launches along each axis
    res_g = OCLArray.zeros(im.shape, np.float32)

    for k, j, i in product(*[range(g+1) for g in Gs]):
        prog.run_kernel("interpolate3", Nblocks[::-1], None,
                        patches_g.data,
                        res_g.data,
                        np.int32(i), np.int32(j), np.int32(k),
                        np.int32(Gs[2]), np.int32(Gs[1]), np.int32(Gs[0]),
                        np.int32(Npatchs[2]), np.int32(Npatchs[1]), np.int32(Npatchs[0]))

    res = res_g.get()

    if return_plan:
        return res, plan
    else:
        return res