def _convolve_sep2_numpy(data, hx, hy):
    """numpy front-end for ``_convolve_sep2_gpu``.

    ``hx``, ``hy`` and ``data`` are each cast to float32 and uploaded as
    OCLArrays; the device result is downloaded with ``.get()`` and returned.
    """
    hx_dev, hy_dev, data_dev = [
        OCLArray.from_array(host.astype(np.float32)) for host in (hx, hy, data)
    ]
    return _convolve_sep2_gpu(data_dev, hx_dev, hy_dev).get()
def run(self, data: np.ndarray):
    """Run ``self.n_iter`` multiplicative update iterations on ``data``.

    :param data: array whose shape must equal ``self.shape``
    :return: ``np.abs`` of the final estimate after downloading it
    :raises ValueError: if ``data.shape`` differs from ``self.shape``
    """
    if data.shape != self.shape:
        raise ValueError("data and h have to be same shape")

    # the measurement buffer and the running estimate both start from the
    # complex64 version of the input
    as_complex = data.astype(np.complex64)
    measured_g = OCLArray.from_array(as_complex)
    estimate_g = OCLArray.from_array(as_complex)

    # both convolutions share the stored plan and pass kernel_is_fft=True
    conv_opts = dict(plan=self.plan, kernel_is_fft=True)
    for _ in range(self.n_iter):
        fft_convolve(estimate_g, self.psf_g, res_g=self.tmp_g, **conv_opts)
        _complex_divide_inplace(measured_g, self.tmp_g)
        fft_convolve(self.tmp_g, self.psfflip_f_g, inplace=True, **conv_opts)
        _complex_multiply_inplace(estimate_g, self.tmp_g)

    # the magnitude is computed on the host, after the download
    return np.abs(estimate_g.get())
def _deconv_rl_np(data, h, Niter = 10, ):
    """numpy front-end for ``_deconv_rl_gpu_conv``.

    ``data`` and ``h`` are cast to float32 (``copy = False`` avoids a copy
    when the dtype already matches), uploaded, and handed to
    ``_deconv_rl_gpu_conv`` together with ``Niter``. The downloaded result
    is returned.
    """
    def _upload(host_arr):
        return OCLArray.from_array(host_arr.astype(np.float32, copy = False))

    data_dev = _upload(data)
    kernel_dev = _upload(h)
    return _deconv_rl_gpu_conv(data_dev, kernel_dev, Niter).get()
def setup(self, size, units, lam=0.5, n0=1.0, use_fresnel_approx=False):
    """
    sets up the internal variables e.g. propagators etc...

    :param size:  the size of the geometry in pixels (Nx,Ny,Nz)
    :param units: the physical units of each voxel in microns (dx,dy,dz)
    :param lam: the wavelength of light in microns
    :param n0:  the refractive index of the surrounding media
    :param use_fresnel_approx:  if True, uses fresnel approximation for propagator
    """
    Bpm3d_Base.setup(self, size, units, lam=lam, n0=n0,
                     use_fresnel_approx=use_fresnel_approx)

    # setting up the gpu buffers and kernels
    self.program = OCLProgram(absPath("kernels/bpm_3d_kernels.cl"))

    # Nz was never bound before (only Nx, Ny were unpacked), so the
    # per-slice buffers below failed with a NameError.
    Nz = self.size[2]

    # NOTE(review): the plan is requested for an empty shape and never
    # stored or used in this method -- confirm whether it should be built
    # for the slice shape and kept on self.
    plan = fft_plan(())

    self._H_g = OCLArray.from_array(self._H.astype(np.complex64))

    self.scatter_weights_g = OCLArray.from_array(self.scatter_weights.astype(np.float32))
    self.gfactor_weights_g = OCLArray.from_array(self.gfactor_weights.astype(np.float32))

    # one float32 accumulator per z index
    self.scatter_cross_sec_g = OCLArray.zeros(Nz, "float32")
    self.gfactor_g = OCLArray.zeros(Nz, "float32")

    # weighted sum of |field[i] - plain|^2, where ``plain`` is only
    # subtracted from element 0
    self.reduce_kernel = OCLReductionKernel(
        np.float32,
        neutral="0",
        reduce_expr="a+b",
        map_expr="weights[i]*cfloat_abs(field[i]-(i==0)*plain)*cfloat_abs(field[i]-(i==0)*plain)",
        arguments="__global cfloat_t *field, __global float * weights,cfloat_t plain",
    )
def _fft_convolve_numpy(data, h, plan = None,
                        kernel_is_fft = False,
                        kernel_is_fftshifted = False):
    """ convolving via opencl fft for numpy arrays

    data and h must have the same size

    Unless ``kernel_is_fftshifted`` is set, ``h`` is passed through
    ``np.fft.fftshift`` before the upload. The absolute value of the
    downloaded result is returned.

    Raises ValueError when the shapes of ``data`` and ``h`` differ.
    """
    dev = get_device()

    if data.shape != h.shape:
        raise ValueError("data and kernel must have same size! %s vs %s "%(str(data.shape),str(h.shape)))

    data_g = OCLArray.from_array(data.astype(np.complex64))

    kernel = h if kernel_is_fftshifted else np.fft.fftshift(h)
    h_g = OCLArray.from_array(kernel.astype(np.complex64))
    res_g = OCLArray.empty_like(data_g)

    _fft_convolve_gpu(data_g, h_g, res_g = res_g,
                      plan = plan, kernel_is_fft = kernel_is_fft)

    magnitude = abs(res_g.get())

    # drop the references to the device buffers before returning
    del data_g, h_g, res_g

    return magnitude
def test_3d():
    """Time ``convolve_sep3`` on a 128x128x128 volume, for numpy and OCLArray inputs.

    Prints the mean time per call in milliseconds for each variant.

    :return: tuple ``(out, out_g.get())``: the result of the numpy run and
        the downloaded result of the OCLArray run
    """
    from time import time

    Niter = 10

    # single non-zero voxel as test volume
    data = np.zeros((128,)*3,np.float32)
    data[30,30,30] = 1.

    hx = 1./5*np.ones(5)
    hy = 1./13*np.ones(13)
    # NOTE(review): 11 taps scaled by 1/13 -- confirm this is intended
    hz = 1./13*np.ones(11)

    t = time()
    for _ in range(Niter):
        out = convolve_sep3(data,hx,hy, hz)
    # single-argument print(...) is valid in Python 2 and Python 3 and
    # produces the same text in both
    print("time: %.3f ms"%(1000.*(time()-t)/Niter))

    data_g = OCLArray.from_array(data.astype(np.float32))
    hx_g = OCLArray.from_array(hx.astype(np.float32))
    hy_g = OCLArray.from_array(hy.astype(np.float32))
    hz_g = OCLArray.from_array(hz.astype(np.float32))

    t = time()
    for _ in range(Niter):
        out_g = convolve_sep3(data_g,hx_g,hy_g, hz_g)

    # download once before reading the clock
    out_g.get()
    print("time: %.3f ms"%(1000.*(time()-t)/Niter))

    return out, out_g.get()
def _convolve_sep2_numpy(data,hx,hy):
    """Run ``_convolve_sep2_gpu`` on numpy inputs and return the downloaded result.

    All three arguments are cast to float32 before they are uploaded.
    """
    def _upload(host_arr):
        return OCLArray.from_array(host_arr.astype(np.float32))

    kernel_x_g = _upload(hx)
    kernel_y_g = _upload(hy)
    image_g = _upload(data)

    result_g = _convolve_sep2_gpu(image_g, kernel_x_g, kernel_y_g)
    return result_g.get()
def test_3d():
    """Benchmark ``convolve_sep3`` with host arrays and with device arrays.

    A 128x128x128 float32 volume with one non-zero voxel is convolved
    ``Niter`` times per variant; the mean time per call is printed in
    milliseconds.

    Returns the pair ``(out, out_g.get())``.
    """
    from time import time

    Niter = 10

    data = np.zeros((128, ) * 3, np.float32)
    data[30, 30, 30] = 1.

    # constant box kernels along the three axes
    hx = np.full(5, 1. / 5)
    hy = np.full(13, 1. / 13)
    hz = np.full(11, 1. / 13)

    # --- host (numpy) inputs
    start = time()
    for _ in range(Niter):
        out = convolve_sep3(data, hx, hy, hz)
    per_call_ms = 1000. * (time() - start) / Niter
    print("time: %.3f ms" % per_call_ms)

    # --- device (OCLArray) inputs
    data_g, hx_g, hy_g, hz_g = [
        OCLArray.from_array(host.astype(np.float32))
        for host in (data, hx, hy, hz)
    ]

    start = time()
    for _ in range(Niter):
        out_g = convolve_sep3(data_g, hx_g, hy_g, hz_g)
    # download once before the clock is read
    out_g.get()
    per_call_ms = 1000. * (time() - start) / Niter
    print("time: %.3f ms" % per_call_ms)

    return out, out_g.get()
def _convolve_np(data, h):
    """
    numpy variant

    Casts ``data`` and ``h`` to float32 (no copy if the dtype already
    matches), uploads both and returns the downloaded result of
    ``_convolve_buf``.
    """
    device_args = [
        OCLArray.from_array(host.astype(np.float32, copy=False))
        for host in (data, h)
    ]
    return _convolve_buf(*device_args).get()
def _convolve_np(data, h):
    """
    numpy variant

    ``np.require`` brings ``data`` and ``h`` to C-contiguous float32 before
    they are uploaded; the result of ``_convolve_buf`` is downloaded and
    returned.
    """
    def _to_device(host_arr):
        return OCLArray.from_array(np.require(host_arr,np.float32,"C"))

    image_g = _to_device(data)
    kernel_g = _to_device(h)
    return _convolve_buf(image_g, kernel_g).get()
def _convolve_np(data, h):
    """
    numpy variant

    Uploads float32 versions of ``data`` and ``h`` and returns what
    ``_convolve_buf`` produces, downloaded with ``.get()``.
    """
    image_f32 = data.astype(np.float32, copy = False)
    image_g = OCLArray.from_array(image_f32)

    kernel_f32 = h.astype(np.float32, copy = False)
    kernel_g = OCLArray.from_array(kernel_f32)

    result_g = _convolve_buf(image_g, kernel_g)
    return result_g.get()
def _deconv_rl_np(
        data,
        h,
        Niter=10,
):
    """Call ``_deconv_rl_gpu_conv`` with numpy inputs.

    ``data`` and ``h`` are uploaded as float32 OCLArrays, ``Niter`` is
    passed through unchanged, and the downloaded result is returned.
    """
    uploads = [
        OCLArray.from_array(host.astype(np.float32, copy=False))
        for host in (data, h)
    ]
    result_g = _deconv_rl_gpu_conv(uploads[0], uploads[1], Niter)
    return result_g.get()
def fftshift(arr_obj, axes = None, res_g = None, return_buffer = False):
    """
    gpu version of fftshift for numpy arrays or OCLArrays

    Parameters
    ----------
    arr_obj: numpy array or OCLArray (float32/complex64)
        the array to be fftshifted
    axes: list or None
        the axes over which to shift (like np.fft.fftshift)
        if None, all axes are taken
    res_g:
        if given, fills it with the result (has to be same shape and dtype as arr_obj)
        else internally creates a new one
    return_buffer: bool
        selects the type of the return value, see below

    Returns
    -------
        if return_buffer, returns the result as (well :) OCLArray
        else returns the result as numpy array

    Raises
    ------
    NotImplementedError
        for an OCLArray whose dtype is not in DTYPE_KERNEL_NAMES, or if any
        selected axis has odd length
    ValueError
        if arr_obj is neither an OCLArray nor a numpy array
    """
    if axes is None:
        axes = range(arr_obj.ndim)

    if isinstance(arr_obj, OCLArray):
        if arr_obj.dtype.type not in DTYPE_KERNEL_NAMES.keys():
            raise NotImplementedError("only works for float32 or complex64")
    elif isinstance(arr_obj, np.ndarray):
        # complex input is uploaded as complex64, everything else as float32
        upload_dtype = np.complex64 if np.iscomplexobj(arr_obj) else np.float32
        arr_obj = OCLArray.from_array(arr_obj.astype(upload_dtype,copy = False))
    else:
        raise ValueError("unknown type (%s)"%(type(arr_obj)))

    has_odd_axis = not np.all([arr_obj.shape[a]%2==0 for a in axes])
    if has_odd_axis:
        raise NotImplementedError("only works on axes of even dimensions")

    if res_g is None:
        res_g = OCLArray.empty_like(arr_obj)

    # iterate over all axes
    # FIXME: this is still rather inefficient
    # the first pass reads the input, every later pass reads res_g
    source_g = arr_obj
    for ax in axes:
        _fftshift_single(source_g, res_g, ax)
        source_g = res_g

    return res_g if return_buffer else res_g.get()
def focus_field_lattice(shape, units, lam=.5, NA1=.4, NA2=.5, sigma=.1,
                        Npoly=6, n0=1., n_integration_steps=100):
    """Run the ``debye_wolf_lattice`` kernel and return the x field component.

    :param shape: grid size ``(Nx, Ny, Nz)``
    :param units: voxel size ``(dx, dy, dz)``; the grid is centred on zero
        along every axis
    :param lam: passed to the kernel as ``lam / n0``
    :param NA1: first numerical aperture, converted to ``arcsin(NA1 / n0)``
    :param NA2: second numerical aperture, converted to ``arcsin(NA2 / n0)``
    :param sigma: passed to the kernel unchanged (as float32)
    :param Npoly: argument of ``poly_points``; its result, scaled by the mean
        of ``NA1`` and ``NA2``, gives the ``kxs``/``kys`` arrays
    :param n0: refractive index used to scale ``lam`` and the apertures
    :param n_integration_steps: compiled into the kernel as ``INT_STEPS``
    :return: the downloaded complex64 array ``ex`` of shape ``(Nz, Ny, Nx)``;
        ``ey``, ``ez`` and ``u`` are computed by the kernel but not returned
    """
    kxs, kys = .5 * (NA1 + NA2) * poly_points(Npoly)

    p = OCLProgram(absPath("kernels/psf_lattice.cl"),
                   build_options=[
                       "-I",
                       absPath("kernels"), "-D",
                       "INT_STEPS=%s" % n_integration_steps
                   ])

    kxs = np.array(kxs)
    kys = np.array(kys)

    Nx, Ny, Nz = shape
    dx, dy, dz = units

    alpha1 = np.arcsin(NA1 / n0)
    alpha2 = np.arcsin(NA2 / n0)

    # output buffers, allocated as (Nz, Ny, Nx)
    u_g = OCLArray.empty((Nz, Ny, Nx), np.float32)
    ex_g = OCLArray.empty((Nz, Ny, Nx), np.complex64)
    ey_g = OCLArray.empty((Nz, Ny, Nx), np.complex64)
    ez_g = OCLArray.empty((Nz, Ny, Nx), np.complex64)

    kxs_g = OCLArray.from_array(kxs.astype(np.float32))
    kys_g = OCLArray.from_array(kys.astype(np.float32))

    t = time.time()

    p.run_kernel(
        "debye_wolf_lattice", (Nx, Ny, Nz), None,
        ex_g.data, ey_g.data, ez_g.data, u_g.data,
        np.float32(1.), np.float32(0.),
        # symmetric coordinate range along x, y and z
        np.float32(-dx * (Nx - 1) / 2.), np.float32(dx * (Nx - 1) / 2.),
        np.float32(-dy * (Ny - 1) / 2.), np.float32(dy * (Ny - 1) / 2.),
        np.float32(-dz * (Nz - 1) / 2.), np.float32(dz * (Nz - 1) / 2.),
        np.float32(1. * lam / n0),
        np.float32(alpha1), np.float32(alpha2),
        kxs_g.data, kys_g.data, np.int32(len(kxs)),
        np.float32(sigma))

    ex = ex_g.get()

    # single-argument print(...) works on Python 2 and 3 and yields the same
    # text as the former statement ``print "time in secs:", value``
    print("time in secs: %s" % (time.time() - t))
    return ex
def fftshift(arr_obj, axes = None, res_g = None, return_buffer = False):
    """
    gpu version of fftshift for numpy arrays or OCLArrays

    Parameters
    ----------
    arr_obj: numpy array or OCLArray (float32/complex64)
        the array to be fftshifted
    axes: list or None
        the axes over which to shift (like np.fft.fftshift)
        if None, all axes are taken
    res_g:
        if given, fills it with the result (has to be same shape and dtype as arr_obj)
        else internally creates a new one
    return_buffer: bool
        if True the OCLArray is returned, otherwise its downloaded copy

    Returns
    -------
        if return_buffer, returns the result as (well :) OCLArray
        else returns the result as numpy array

    Raises
    ------
    NotImplementedError
        for an OCLArray whose dtype is not in DTYPE_KERNEL_NAMES, or if any
        selected axis has odd length
    ValueError
        if arr_obj is neither an OCLArray nor a numpy array
    """
    if axes is None:
        axes = list(range(arr_obj.ndim))

    is_device_array = isinstance(arr_obj, OCLArray)
    if is_device_array:
        if arr_obj.dtype.type not in DTYPE_KERNEL_NAMES:
            raise NotImplementedError("only works for float32 or complex64")
    elif isinstance(arr_obj, np.ndarray):
        # host arrays are cast on upload: complex -> complex64, else float32
        if np.iscomplexobj(arr_obj):
            host = arr_obj.astype(np.complex64,copy = False)
        else:
            host = arr_obj.astype(np.float32,copy = False)
        arr_obj = OCLArray.from_array(host)
    else:
        raise ValueError("unknown type (%s)"%(type(arr_obj)))

    all_even = np.all([arr_obj.shape[a]%2==0 for a in axes])
    if not all_even:
        raise NotImplementedError("only works on axes of even dimensions")

    if res_g is None:
        res_g = OCLArray.empty_like(arr_obj)

    # iterate over all axes
    # FIXME: this is still rather inefficient
    # after the first axis the shift reads from and writes to res_g
    current_g = arr_obj
    for ax in axes:
        _fftshift_single(current_g, res_g, ax)
        current_g = res_g

    if return_buffer:
        return res_g
    return res_g.get()
def _deconv_rl_np_fft(data, h, Niter = 10,
                      h_is_fftshifted = False):
    """ deconvolves data with given psf (kernel) h

    data and h have to be same shape

    via lucy richardson deconvolution

    :param data: numpy array holding the measurement
    :param h: numpy array holding the psf, same shape as ``data``
    :param Niter: number of update iterations
    :param h_is_fftshifted: if False, ``np.fft.fftshift`` is applied to ``h``
    :return: ``np.abs`` of the final estimate as numpy array
    :raises ValueError: if ``data`` and ``h`` differ in shape
    """
    if data.shape != h.shape:
        raise ValueError("data and h have to be same shape")

    if not h_is_fftshifted:
        h = np.fft.fftshift(h)

    # NOTE(review): reverses the first two axes only; for arrays with more
    # axes the remaining ones stay unflipped -- confirm this is intended
    hflip = h[::-1,::-1]

    #set up some gpu buffers
    y_g = OCLArray.from_array(data.astype(np.complex64))
    u_g = OCLArray.from_array(data.astype(np.complex64))

    tmp_g = OCLArray.empty(data.shape,np.complex64)

    hf_g = OCLArray.from_array(h.astype(np.complex64))
    hflip_f_g = OCLArray.from_array(hflip.astype(np.complex64))

    # NOTE(review): the plan is created but not passed to any call below
    plan = fft_plan(data.shape)

    #transform psf
    fft(hf_g,inplace = True)
    fft(hflip_f_g,inplace = True)

    for i in range(Niter):
        # print(i) is valid on Python 2 and 3 (the bare statement form was
        # a SyntaxError on Python 3)
        print(i)
        fft_convolve(u_g, hf_g,
                     res_g = tmp_g,
                     kernel_is_fft = True)

        _complex_divide_inplace(y_g,tmp_g)

        fft_convolve(tmp_g,hflip_f_g,
                     inplace = True,
                     kernel_is_fft = True)

        _complex_multiply_inplace(u_g,tmp_g)

    return np.abs(u_g.get())
def gpu_kuwahara(data, N=5):
    """Function to convolve an image with the Kuwahara filter on GPU.

    ``data`` is uploaded as float32; the output buffer has shape
    ``(data.shape[0], data.shape[1])``. ``N`` is passed to the kernel as
    int32 and must be odd, otherwise a ValueError is raised.
    """
    if N % 2 == 0:
        raise ValueError("Data has to be a (2n+1)x(2n+1) array.")

    # device buffers for input and result
    image_g = OCLArray.from_array(data.astype(float32))
    out_shape = (data.shape[0], data.shape[1])
    res_g = OCLArray.empty(out_shape, float32)

    prog = OCLProgram("./OpenCL/gpu_kernels/gpu_kuwahara.cl")

    kernel_name = "kuwahara"          # the name of the kernel in the cl file
    global_size = image_g.shape[::-1]  # reversed array shape
    local_size = None                 # not specified
    prog.run_kernel(kernel_name, global_size, local_size,
                    image_g.data, res_g.data,
                    int32(N))

    return res_g.get()
def _deconv_rl_gpu_conv(data_g, h_g, Niter=10):
    """
    using convolve

    Iterates ``Niter`` times over device buffers: convolve the estimate with
    ``h_g``, combine with ``data_g`` via ``_divide_inplace``, convolve with
    the flipped kernel and apply ``_multiply_inplace`` to the estimate.

    Returns the estimate as OCLArray (not downloaded).
    """
    shape = data_g.shape

    # the estimate starts as a copy of the data
    estimate_g = OCLArray.empty(shape, np.float32)
    estimate_g.copy_buffer(data_g)

    # two scratch buffers
    scratch_a_g = OCLArray.empty(shape, np.float32)
    scratch_b_g = OCLArray.empty(shape, np.float32)

    # fix this: the flipped kernel is built on the host (download, reverse
    # the first two axes, upload again)
    kernel_host = h_g.get()
    hflip_g = OCLArray.from_array(kernel_host[::-1, ::-1].copy())

    for _ in range(Niter):
        convolve(estimate_g, h_g, res_g=scratch_a_g)
        _divide_inplace(data_g, scratch_a_g)
        convolve(scratch_a_g, hflip_g, res_g=scratch_b_g)
        _multiply_inplace(estimate_g, scratch_b_g)

    return estimate_g
def test_parseval():
    """Track the summed squared magnitude under repeated transforms.

    A 512x512 complex64 random array is transformed ``Nz`` times in place
    with the gpu ``fft`` (forward only; the inverse call is disabled), and
    ``Nz`` times with numpy ``fftn`` followed by ``ifftn``. After each step
    the sum of ``|d|**2`` is recorded. The elapsed time of the gpu loop is
    printed.

    :return: ``(s1, s2)``, the recorded sums of the gpu and the numpy loop
    """
    from time import time

    Nx = 512
    Nz = 10
    d = np.random.uniform(-1,1,(Nx,Nx)).astype(np.complex64)
    d_g = OCLArray.from_array(d.astype(np.complex64))

    s1, s2 = [],[]

    t = time()
    for i in range(Nz):
        # print(...) with one argument is valid on Python 2 and 3
        print(i)
        fft(d_g, inplace=True)
        s1.append(np.sum(np.abs(d_g.get())**2))

    print(time()-t)

    for i in range(Nz):
        print(i)
        d = np.fft.fftn(d).astype(np.complex64)
        d = np.fft.ifftn(d).astype(np.complex64)
        s2.append(np.sum(np.abs(d)**2))

    return s1, s2
def test_parseval():
    """Record the sum of ``|d|**2`` under repeated gpu and numpy transforms.

    The gpu loop applies the forward ``fft`` in place ``Nz`` times (the
    inverse is disabled) and prints its elapsed time; the numpy loop applies
    ``fftn`` followed by ``ifftn`` ``Nz`` times.

    Returns ``(s1, s2)``: the per-iteration sums of the gpu and numpy loop.
    """
    from time import time

    Nx = 512
    Nz = 10

    host = np.random.uniform(-1, 1, (Nx, Nx)).astype(np.complex64)
    device = OCLArray.from_array(host.astype(np.complex64))

    def _energy(arr):
        return np.sum(np.abs(arr)**2)

    gpu_sums = []
    started = time()
    for _ in range(Nz):
        fft(device, inplace=True)
        gpu_sums.append(_energy(device.get()))
    print(time() - started)

    numpy_sums = []
    for _ in range(Nz):
        host = np.fft.fftn(host).astype(np.complex64)
        host = np.fft.ifftn(host).astype(np.complex64)
        numpy_sums.append(_energy(host))

    return gpu_sums, numpy_sums
def time_multi(N, nargs, niter=100):
    """Time a single ``OCLReductionKernel2`` evaluating ``nargs`` map expressions.

    :param N: length of each all-ones float32 input array
    :param nargs: number of inputs; input ``i`` is mapped with ``i*x<i>[i]``
    :param niter: number of timed kernel calls
    :return: mean wall-clock time per call in seconds
    """
    from time import time

    # range instead of xrange: the latter does not exist on Python 3
    map_exprs = ["%s*x%s[i]" % (i, i) for i in range(nargs)]
    arguments = ",".join("__global float *x%s" % i for i in range(nargs))

    k = OCLReductionKernel2(np.float32,
                            neutral="0",
                            reduce_expr="a+b",
                            map_exprs=map_exprs,
                            arguments=arguments)

    ins = [OCLArray.from_array(np.ones(N, np.float32)) for _ in map_exprs]
    outs = [OCLArray.empty(1, np.float32) for _ in map_exprs]

    t = time()
    for _ in range(niter):
        k(*ins, outs=outs)
    # wait for the queue to drain so the timing covers the kernel execution
    get_device().queue.finish()
    t = (time() - t) / niter

    # single-argument print(...) gives the same text on Python 2 and 3
    print("multi reduction: result = %s" % ([float(out.get()) for out in outs],))
    print("multi reduction:\t\t%.2f ms" % (1000 * t))
    return t
def time_simple(N, nargs, niter=100):
    """Time ``nargs`` separate ``OCLReductionKernel`` instances run one after another.

    :param N: length of each all-ones float32 input array
    :param nargs: number of kernels; kernel ``i`` uses the map expression
        ``i*x[i]``
    :param niter: number of timed rounds (each round calls every kernel once)
    :return: mean wall-clock time per round in seconds
    """
    from time import time

    from gputools import OCLReductionKernel

    # range instead of xrange: the latter does not exist on Python 3
    map_exprs = ["%s*x[i]" % i for i in range(nargs)]

    ks = [
        OCLReductionKernel(np.float32,
                           neutral="0",
                           reduce_expr="a+b",
                           map_expr=map_expr,
                           arguments="__global float *x")
        for map_expr in map_exprs
    ]

    ins = [OCLArray.from_array(np.ones(N, np.float32)) for _ in map_exprs]
    outs = [OCLArray.empty(1, np.float32) for _ in map_exprs]

    t = time()
    for _ in range(niter):
        for k, inn, out in zip(ks, ins, outs):
            k(inn, out=out)
    # wait for the queue to drain so the timing covers the kernel execution
    get_device().queue.finish()
    t = (time() - t) / niter

    # single-argument print(...) gives the same text on Python 2 and 3
    print("simple reduction: result = %s" % ([float(out.get()) for out in outs],))
    print("simple reduction:\t\t%.2f ms" % (1000 * t))
    return t
def _deconv_rl_gpu_conv(data_g, h_g, Niter = 10):
    """
    using convolve

    Works on device buffers only. ``data_g`` is copied into a fresh float32
    estimate, which is updated ``Niter`` times and returned as OCLArray.
    """
    def _scratch():
        return OCLArray.empty(data_g.shape,np.float32)

    #set up some gpu buffers
    u_g = _scratch()
    u_g.copy_buffer(data_g)
    first_g = _scratch()
    second_g = _scratch()

    #fix this
    # kernel is downloaded, reversed along its first two axes and re-uploaded
    flipped_host = (h_g.get()[::-1,::-1]).copy()
    hflip_g = OCLArray.from_array(flipped_host)

    iteration = 0
    while iteration < Niter:
        convolve(u_g, h_g, res_g = first_g)
        _divide_inplace(data_g, first_g)
        convolve(first_g, hflip_g, res_g = second_g)
        _multiply_inplace(u_g, second_g)
        iteration += 1

    return u_g
def _transfer_dn(self, dn):
    """Upload ``dn`` after passing it through ``self._copy_arr_with_correct_type``.

    If ``self._is_subsampled`` is set the upload goes into an OCLImage
    stored as ``self._im_dn``, otherwise into an OCLArray stored as
    ``self._buf_dn``.
    """
    if self._is_subsampled:
        make_container, target_attr = OCLImage.from_array, "_im_dn"
    else:
        make_container, target_attr = OCLArray.from_array, "_buf_dn"

    setattr(self, target_attr,
            make_container(self._copy_arr_with_correct_type(dn)))
def push(any_array):
    '''
    converts a numpy array to an OpenCL array

    This method does the same as the converters in CLIJ but is less flexible
    https://github.com/clij/clij-core/tree/master/src/main/java/net/haesleinhuepf/clij/converters/implementations

    An OCLArray argument is returned unchanged. Anything else is cast to
    float32 and has axis 0 swapped with axis 1 (two-dimensional input) or
    with axis 2 (every other input) before the upload.

    :param any_array: input numpy array
    :return: opencl-array
    '''
    if isinstance(any_array, OCLArray):
        return any_array

    as_float = any_array.astype(np.float32)

    # pick the partner axis for the swap from the number of dimensions
    partner_axis = 1 if len(as_float.shape) == 2 else 2
    swapped = np.swapaxes(as_float, 0, partner_axis)

    return OCLArray.from_array(swapped)
def create_dn_buffer(size,
                     units,
                     points,
                     dn_inner=.0,
                     rad_inner=0,
                     dn_outer=.1,
                     rad_outer=.4):
    """Fill a float32 device buffer of shape ``(Nz, Ny, Nx)`` with the ``fill_dn`` kernel.

    :param size: ``(Nx, Ny, Nz)``
    :param units: ``(dx, dy, dz)``, passed to the kernel as float32
    :param points: rows of coordinates; they are sorted by their third
        column and uploaded flattened as float32
    :param dn_inner, rad_inner, dn_outer, rad_outer: passed to the kernel as
        float32 scalars
    :return: the filled OCLArray (not downloaded)
    """
    Nx, Ny, Nz = size
    dx, dy, dz = units

    program = OCLProgram(absPath("kernels/bpm_3d_spheres.cl"))
    dn_g = OCLArray.empty((Nz, Ny, Nx), dtype=np.float32)

    # sort by z
    coords = np.array(points)
    z_order = np.argsort(coords[:, 2])
    coords = coords[z_order, :]
    n_points = coords.shape[0]

    points_g = OCLArray.from_array(coords.flatten().astype(np.float32))

    scalar_args = [
        np.float32(value)
        for value in (dx, dy, dz, dn_inner, rad_inner, dn_outer, rad_outer)
    ]
    program.run_kernel("fill_dn", (Nx, Ny, Nz), None, dn_g.data,
                       points_g.data, np.int32(n_points), *scalar_args)

    return dn_g
def create_dn_buffer(size, units,points,
                     dn_inner = .0,rad_inner = 0,
                     dn_outer = .1, rad_outer = .4):
    """Run the ``fill_dn`` kernel over a ``(Nz, Ny, Nx)`` float32 buffer and return it.

    ``points`` is converted to an array, ordered by its third column and
    uploaded flattened; ``units`` and the four ``dn``/``rad`` parameters are
    handed to the kernel as float32 scalars. The result stays on the device.
    """
    Nx, Ny, Nz = size
    dx, dy, dz = units

    program = OCLProgram(absPath("kernels/bpm_3d_spheres.cl"))

    dn_g = OCLArray.empty((Nz,Ny,Nx),dtype=np.float32)

    # sort by z
    ps = np.array(points)
    ps = ps[np.argsort(ps[:,2]),:]

    Np = ps.shape[0]

    flat_points = ps.flatten().astype(np.float32)
    pointsBuf = OCLArray.from_array(flat_points)

    f32 = np.float32
    kernel_args = (dn_g.data, pointsBuf.data, np.int32(Np),
                   f32(dx), f32(dy), f32(dz),
                   f32(dn_inner), f32(rad_inner),
                   f32(dn_outer), f32(rad_outer))
    program.run_kernel("fill_dn", (Nx,Ny,Nz), None, *kernel_args)

    return dn_g
def _gaussian_buf(d_g,
                  sigma=(4., 4.),
                  res_g=None,
                  normalize=True,
                  truncate=4.0):
    """Convolve the device array ``d_g`` with one sampled Gaussian per entry of ``sigma``.

    :param d_g: 1D, 2D or 3D device array
    :param sigma: one standard deviation per kernel; the kernels are handed
        to the convolution in reversed ``sigma`` order
    :param res_g: optional result buffer, forwarded to the convolution
    :param normalize: if True every kernel is divided by its sum
    :param truncate: kernel radius is ``int(truncate * s + 0.5)`` samples
    :return: the return value of ``convolve`` / ``convolve_sep2`` /
        ``convolve_sep3``, depending on the dimensionality of ``d_g``
    :raises NotImplementedError: if ``d_g`` is not 1D, 2D or 3D
    """
    # Reject unsupported dimensionalities before anything is uploaded. The
    # exception name used to be misspelled, which surfaced as a NameError.
    ndim = len(d_g.shape)
    if ndim not in (1, 2, 3):
        raise NotImplementedError("only 1D, 2D, or 3D images supported yet")

    radius = tuple(int(truncate * s + 0.5) for s in sigma)
    ns = tuple(np.arange(-r, r + 1) for r in radius)

    hs = tuple(
        np.exp(-.5 / s**2 * n**2)
        for s, n in zip(reversed(sigma), reversed(ns)))

    if normalize:
        hs = tuple(1. * h / np.sum(h) for h in hs)

    h_gs = tuple(OCLArray.from_array(h.astype(np.float32)) for h in hs)

    if ndim == 1:
        return convolve(d_g, *h_gs, res_g=res_g)
    if ndim == 2:
        return convolve_sep2(d_g, *h_gs, res_g=res_g)
    return convolve_sep3(d_g, *h_gs, res_g=res_g)
def _deconv_rl_np_fft(data, h, Niter=10, h_is_fftshifted=False):
    """ deconvolves data with given psf (kernel) h

    data and h have to be same shape

    via lucy richardson deconvolution

    ``h`` is fftshifted first unless ``h_is_fftshifted`` is set. Every
    iteration is logged via ``logger.info``. Returns ``np.abs`` of the
    final estimate as numpy array; raises ValueError on a shape mismatch.
    """
    if data.shape != h.shape:
        raise ValueError("data and h have to be same shape")

    kernel = h if h_is_fftshifted else np.fft.fftshift(h)
    # reversed along the first two axes
    kernel_flipped = kernel[::-1, ::-1]

    def _upload_complex(host_arr):
        return OCLArray.from_array(host_arr.astype(np.complex64))

    #set up some gpu buffers
    y_g = _upload_complex(data)
    u_g = _upload_complex(data)
    tmp_g = OCLArray.empty(data.shape, np.complex64)
    hf_g = _upload_complex(kernel)
    hflip_f_g = _upload_complex(kernel_flipped)

    # created but not handed to any of the calls below
    plan = fft_plan(data.shape)

    #transform psf
    for kernel_g in (hf_g, hflip_f_g):
        fft(kernel_g, inplace=True)

    for i in range(Niter):
        logger.info("Iteration: {}".format(i))
        fft_convolve(u_g, hf_g, res_g=tmp_g, kernel_is_fft=True)
        _complex_divide_inplace(y_g, tmp_g)
        fft_convolve(tmp_g, hflip_f_g, inplace=True, kernel_is_fft=True)
        _complex_multiply_inplace(u_g, tmp_g)

    return np.abs(u_g.get())
def test_2d():
    """Call ``convolve_sep2`` once with numpy inputs and once with OCLArrays.

    The input is a 100x100 float32 image with a single non-zero pixel; the
    kernels are constant with 5 and 13 taps.

    Returns ``(out, out_g.get())``.
    """
    import time

    data = np.zeros((100,)*2,np.float32)
    data[50,50] = 1.

    hx = np.full(5, 1./5)
    hy = np.full(13, 1./13)

    # host inputs
    out = convolve_sep2(data,hx,hy)

    # device inputs
    data_g, hx_g, hy_g = [
        OCLArray.from_array(host.astype(np.float32)) for host in (data, hx, hy)
    ]
    out_g = convolve_sep2(data_g,hx_g,hy_g)

    return out, out_g.get()
def _ocl_fft_numpy(plan, arr, inverse=False, fast_math=True):
    """Transform the numpy array ``arr`` via ``_ocl_fft_gpu_inplace`` and return the result.

    ``arr`` is uploaded as complex64 (a conversion is logged when its dtype
    differs). ``fast_math`` is accepted but not used in this function.
    """
    needs_conversion = arr.dtype != np.complex64
    if needs_conversion:
        logger.info("converting %s to complex64, might slow things down..." %
                    arr.dtype)

    device_arr = OCLArray.from_array(arr.astype(np.complex64, copy=False))
    _ocl_fft_gpu_inplace(plan, device_arr, inverse=inverse)
    return device_arr.get()
def test_bessel(n,x):
    """Run the ``bessel_fill`` kernel on ``x`` with parameter ``n``.

    ``x`` is uploaded as float32, ``n`` is passed as int32. The kernel is
    launched with the input shape as global size; the downloaded result
    buffer is returned.
    """
    x_f32 = x.astype(float32)
    x_g = OCLArray.from_array(x_f32)
    res_g = OCLArray.empty_like(x.astype(float32))

    program = OCLProgram(absPath("kernels/bessel.cl"))
    kernel_args = (x_g.data, res_g.data, int32(n))
    program.run_kernel("bessel_fill", x_g.shape, None, *kernel_args)

    return res_g.get()
def test_2d():
    """Compare ``convolve_sep2`` for host and device inputs.

    Builds a 100x100 float32 image with one non-zero pixel and two constant
    kernels, then returns ``(out, out_g.get())``: the result for the numpy
    inputs and the downloaded result for the OCLArray inputs.
    """
    import time

    side = 100
    data = np.zeros((side, ) * 2, np.float32)
    data[50, 50] = 1.

    hx = 1. / 5 * np.ones(5)
    hy = 1. / 13 * np.ones(13)

    def _upload(host_arr):
        return OCLArray.from_array(host_arr.astype(np.float32))

    out = convolve_sep2(data, hx, hy)

    data_g = _upload(data)
    hx_g = _upload(hx)
    hy_g = _upload(hy)
    out_g = convolve_sep2(data_g, hx_g, hy_g)

    return out, out_g.get()
def test_bessel(n, x):
    """Evaluate the ``bessel_fill`` kernel for the values in ``x``.

    :param n: passed to the kernel as int32
    :param x: numpy array, uploaded as float32
    :return: the downloaded result buffer
    """
    def _as_float32():
        return x.astype(float32)

    source_g = OCLArray.from_array(_as_float32())
    target_g = OCLArray.empty_like(_as_float32())

    prog = OCLProgram(absPath("kernels/bessel.cl"))
    # global size is the shape of the input, local size is left to None
    prog.run_kernel("bessel_fill", source_g.shape, None,
                    source_g.data, target_g.data, int32(n))

    return target_g.get()
def focus_field_debye_at(x,y,z,lam, NA, n0 = 1., n_integration_steps = 200):
    """ the same as focus_field_debye but for the coordinates given in x, y, z (arrays of same shape)

    slower than focus_field_debye as it doesnt assume the coordinates to be on a grid

    :param x, y, z: coordinate arrays of identical shape
    :param lam: passed to the kernel as ``lam/n0``
    :param NA: scalar (treated as ``[0., NA]``) or a sequence with an even
        number of entries; converted to angles via ``arcsin(NA/n0)``
    :param n0: refractive index used to scale ``lam`` and ``NA``
    :param n_integration_steps: compiled into the kernel as ``INT_STEPS``
    :return: ``(u, ex, ey, ez)``, each reshaped to the shape of ``x``
    """
    # print(...) with a single argument is valid on Python 2 and 3
    print(absPath("kernels/psf_debye.cl"))

    p = OCLProgram(absPath("kernels/psf_debye.cl"),
            build_options = str("-I %s -D INT_STEPS=%s"%(absPath("."),n_integration_steps)))

    if np.isscalar(NA):
        NA = [0.,NA]

    alphas = np.arcsin(np.array(NA)/n0)
    assert len(alphas)%2 ==0
    assert x.shape == y.shape == z.shape

    dshape = x.shape
    N = np.prod(dshape)

    # coordinates are uploaded flattened, results are reshaped at the end
    x_g = OCLArray.from_array(x.flatten().astype(np.float32))
    y_g = OCLArray.from_array(y.flatten().astype(np.float32))
    z_g = OCLArray.from_array(z.flatten().astype(np.float32))

    u_g = OCLArray.empty(N,np.float32)
    ex_g = OCLArray.empty(N,np.complex64)
    ey_g = OCLArray.empty(N,np.complex64)
    ez_g = OCLArray.empty(N,np.complex64)

    alpha_g = OCLArray.from_array(alphas.astype(np.float32))

    p.run_kernel("debye_wolf_at",(N,),None,
                 x_g.data,y_g.data,z_g.data,
                 ex_g.data,ey_g.data,ez_g.data, u_g.data,
                 np.float32(1.),np.float32(0.),
                 np.float32(lam/n0),
                 alpha_g.data, np.int32(len(alphas)))

    u = u_g.get().reshape(dshape)
    ex = ex_g.get().reshape(dshape)
    ey = ey_g.get().reshape(dshape)
    ez = ez_g.get().reshape(dshape)

    return u, ex, ey, ez
def focus_field_debye_at(x, y, z, lam, NA, n0=1., n_integration_steps=200):
    """ the same as focus_field_debye but for the coordinates given in x, y, z (arrays of same shape)

    slower than focus_field_debye as it doesnt assume the coordinates to be on a grid

    :param x, y, z: coordinate arrays of identical shape
    :param lam: passed to the kernel as ``lam / n0``
    :param NA: scalar (treated as ``[0., NA]``) or a sequence with an even
        number of entries; converted to angles via ``arcsin(NA / n0)``
    :param n0: refractive index used to scale ``lam`` and ``NA``
    :param n_integration_steps: compiled into the kernel as ``INT_STEPS``
    :return: ``(u, ex, ey, ez)``, each reshaped to the shape of ``x``
    """
    kernel_file = absPath("kernels/psf_debye.cl")
    # print(...) with a single argument is valid on Python 2 and 3
    print(kernel_file)

    p = OCLProgram(absPath("kernels/psf_debye.cl"),
                   build_options=str("-I %s -D INT_STEPS=%s" %
                                     (absPath("."), n_integration_steps)))

    if np.isscalar(NA):
        NA = [0., NA]

    alphas = np.arcsin(np.array(NA) / n0)
    assert len(alphas) % 2 == 0
    assert x.shape == y.shape == z.shape

    dshape = x.shape
    N = np.prod(dshape)

    # coordinates are uploaded flattened, results are reshaped at the end
    x_g = OCLArray.from_array(x.flatten().astype(np.float32))
    y_g = OCLArray.from_array(y.flatten().astype(np.float32))
    z_g = OCLArray.from_array(z.flatten().astype(np.float32))

    u_g = OCLArray.empty(N, np.float32)
    ex_g = OCLArray.empty(N, np.complex64)
    ey_g = OCLArray.empty(N, np.complex64)
    ez_g = OCLArray.empty(N, np.complex64)

    alpha_g = OCLArray.from_array(alphas.astype(np.float32))

    p.run_kernel("debye_wolf_at", (N, ), None,
                 x_g.data, y_g.data, z_g.data,
                 ex_g.data, ey_g.data, ez_g.data, u_g.data,
                 np.float32(1.), np.float32(0.),
                 np.float32(lam / n0),
                 alpha_g.data, np.int32(len(alphas)))

    u = u_g.get().reshape(dshape)
    ex = ex_g.get().reshape(dshape)
    ey = ey_g.get().reshape(dshape)
    ez = ez_g.get().reshape(dshape)

    return u, ex, ey, ez
def _ocl_fft_numpy(plan, arr,inverse = False, batch = 1, fast_math = True):
    """numpy front-end for ``_ocl_fft_gpu_inplace``.

    Uploads ``arr`` as complex64 (logging a message if a dtype conversion is
    needed), runs the in-place transform with the given ``plan``,
    ``inverse`` and ``batch`` and returns the downloaded buffer.
    ``fast_math`` is accepted but not used in this function.
    """
    if arr.dtype != np.complex64:
        logger.info("converting %s to complex64, might slow things down..."%arr.dtype)

    as_complex = arr.astype(np.complex64,copy=False)
    buffer_g = OCLArray.from_array(as_complex)

    _ocl_fft_gpu_inplace(plan, buffer_g, inverse = inverse, batch = batch)

    return buffer_g.get()
def _ocl_fft_numpy(arr,inverse = False, plan = None):
    """Transform the numpy array ``arr`` on the device and return the result.

    Without a ``plan`` a new ``Plan`` for ``arr.shape`` is created on the
    queue of the current device. ``arr`` is uploaded as complex64; a
    message is logged when that requires a dtype conversion.
    """
    plan = Plan(arr.shape, queue = get_device().queue) if plan is None else plan

    if arr.dtype != np.complex64:
        logger.info("converting %s to complex64, might slow things down..."%arr.dtype)

    buffer_g = OCLArray.from_array(arr.astype(np.complex64,copy=False))
    _ocl_fft_gpu_inplace(buffer_g, inverse = inverse, plan = plan)

    return buffer_g.get()
def _setup_impl(self):
    """setting up the gpu buffers and kernels

    Compiles the kernel program, creates the fft plan for shape ``(Ny, Nx)``
    and uploads ``_H`` and the weight arrays. ``dn`` is uploaded only if it
    is set and ``n_volumes`` equals 1. Two zeroed float32 buffers of length
    ``Nz`` are allocated as well.
    """
    self.bpm_program = OCLProgram(absPath("kernels/bpm_3d_kernels.cl"))

    Nx, Ny, Nz = self.size

    self._plan = fft_plan((Ny, Nx))

    self._H_g = OCLArray.from_array(self._H.astype(np.complex64))

    if self.dn is not None and self.n_volumes == 1:
        # uploaded as is, without a dtype conversion
        self.dn_g = OCLArray.from_array(self.dn)

    # float32 copies of the weights, stored under ``<name>_g``
    for name in ("scatter_weights", "gfactor_weights"):
        host_weights = getattr(self, name)
        setattr(self, name + "_g",
                OCLArray.from_array(host_weights.astype(np.float32)))

    # zero-initialised accumulators, one entry per z index
    for name in ("scatter_cross_sec_g", "gfactor_g"):
        setattr(self, name, OCLArray.zeros(Nz, "float32"))
def _ocl_fft_numpy(arr, inverse=False, plan=None):
    """numpy front-end for ``_ocl_fft_gpu_inplace``.

    :param arr: numpy array; uploaded as complex64
    :param inverse: forwarded to ``_ocl_fft_gpu_inplace``
    :param plan: optional plan; if None a ``Plan`` for ``arr.shape`` is
        created on the queue of the current device
    :return: the downloaded, transformed buffer
    """
    if plan is None:
        queue = get_device().queue
        plan = Plan(arr.shape, queue=queue)

    already_complex64 = arr.dtype == np.complex64
    if not already_complex64:
        logger.info("converting %s to complex64, might slow things down..." %
                    arr.dtype)

    device_arr = OCLArray.from_array(arr.astype(np.complex64, copy=False))
    _ocl_fft_gpu_inplace(device_arr, inverse=inverse, plan=plan)
    return device_arr.get()
def _setup_impl(self):
    """setting up the gpu buffers and kernels

    Creates ``bpm_program``, ``_plan`` (for shape ``(Ny,Nx)``), ``_H_g``,
    the two weight buffers and two zeroed float32 buffers of length ``Nz``.
    ``dn_g`` is only created when ``dn`` is set and ``n_volumes`` is 1.
    """
    def _upload(host_arr, dtype):
        return OCLArray.from_array(host_arr.astype(dtype))

    self.bpm_program = OCLProgram(absPath("kernels/bpm_3d_kernels.cl"))

    Nx, Ny, Nz = self.size

    self._plan = fft_plan((Ny,Nx))

    self._H_g = _upload(self._H, np.complex64)

    single_volume = self.n_volumes==1
    if self.dn is not None and single_volume:
        self.dn_g = OCLArray.from_array(self.dn)

    self.scatter_weights_g = _upload(self.scatter_weights, np.float32)
    self.gfactor_weights_g = _upload(self.gfactor_weights, np.float32)

    self.scatter_cross_sec_g = OCLArray.zeros(Nz,"float32")
    self.gfactor_g = OCLArray.zeros(Nz,"float32")
def __init__(self, psf: np.ndarray, psf_is_fftshifted: bool = False, n_iter=10):
    """
    setup deconvolution for a given shape

    :param psf: point spread function; its shape is stored as ``self.shape``
    :param psf_is_fftshifted: if False, ``np.fft.fftshift`` is applied first
    :param n_iter: stored as ``self.n_iter``
    """
    self.shape = psf.shape
    shifted_psf = psf if psf_is_fftshifted else np.fft.fftshift(psf)
    self.n_iter = n_iter

    # A copy of the psf reversed along its first two axes is kept next to
    # the psf itself.
    # NOTE(review): only two axes are reversed, so for a 3D psf the third
    # axis stays unflipped -- confirm whether that is intended.
    flipped_psf = shifted_psf[::-1, ::-1]

    self.psf_g = OCLArray.from_array(shifted_psf.astype(np.complex64))
    self.psfflip_f_g = OCLArray.from_array(flipped_psf.astype(np.complex64))
    self.plan = fft_plan(self.shape)

    # transform psf
    for kernel_g in (self.psf_g, self.psfflip_f_g):
        fft(kernel_g, inplace=True)

    # get temp
    self.tmp_g = OCLArray.empty(shifted_psf.shape, np.complex64)
def setup(self, size, units, lam = .5, n0 = 1.,
          use_fresnel_approx = False):
    """
    sets up the internal variables e.g. propagators etc...

    :param size:  the size of the geometry in pixels (Nx,Ny,Nz)
    :param units: the physical units of each voxel in microns (dx,dy,dz)
    :param lam: the wavelength of light in microns
    :param n0:  the refractive index of the surrounding media
    :param use_fresnel_approx:  if True, uses fresnel approximation for propagator
    """
    Bpm3d_Base.setup(self,size, units, lam = lam, n0 = n0,
                     use_fresnel_approx = use_fresnel_approx)

    #setting up the gpu buffers and kernels
    self.program = OCLProgram(absPath("kernels/bpm_3d_kernels.cl"))

    # Nz was never bound before (only Nx, Ny were unpacked), so the
    # per-slice buffers below failed with a NameError.
    Nz = self.size[2]

    # NOTE(review): the plan is requested for an empty shape and never
    # stored or used in this method -- confirm whether it should be built
    # for the slice shape and kept on self.
    plan = fft_plan(())

    self._H_g = OCLArray.from_array(self._H.astype(np.complex64))

    self.scatter_weights_g = OCLArray.from_array(self.scatter_weights.astype(np.float32))
    self.gfactor_weights_g = OCLArray.from_array(self.gfactor_weights.astype(np.float32))

    # one float32 accumulator per z index
    self.scatter_cross_sec_g = OCLArray.zeros(Nz,"float32")
    self.gfactor_g = OCLArray.zeros(Nz,"float32")

    # weighted sum of |field[i] - plain|^2, where ``plain`` is only
    # subtracted from element 0
    self.reduce_kernel = OCLReductionKernel(
        np.float32, neutral="0",
        reduce_expr="a+b",
        map_expr="weights[i]*cfloat_abs(field[i]-(i==0)*plain)*cfloat_abs(field[i]-(i==0)*plain)",
        arguments="__global cfloat_t *field, __global float * weights,cfloat_t plain")
def transfer(data):
    """transfers data

    Copies ``data`` from an OCLArray into an OCLImage and from there into a
    second OCLArray, whose downloaded content is returned.

    :param data: numpy array of dtype float32 or complex64
    :return: the downloaded array after the buffer -> image -> buffer copy
    :raises NotImplementedError: for any other dtype
    """
    dtype = data.dtype.type
    if dtype == np.float32:
        image_kwargs = {}
    elif dtype == np.complex64:
        # complex data uses a float32 image with two channels
        image_kwargs = {"num_channels": 2}
    else:
        # used to fall through and fail later on the unbound name ``im``
        raise NotImplementedError(
            "only works for float32 or complex64 (got %s)" % data.dtype)

    d1_g = OCLArray.from_array(data)
    d2_g = OCLArray.empty_like(data)

    im = OCLImage.empty(data.shape[::1], dtype=np.float32, **image_kwargs)

    im.copy_buffer(d1_g)
    d2_g.copy_image(im)

    return d2_g.get()
def resample_buf(data, new_shape):
    """resamples d

    ``data`` is uploaded, copied into an OCLImage and read back into a
    buffer of shape ``new_shape`` via ``copy_image_resampled``.

    :param data: numpy array of dtype float32 or complex64
    :param new_shape: shape of the returned array
    :return: the downloaded, resampled array (same dtype as ``data``)
    :raises NotImplementedError: for any other dtype
    """
    dtype = data.dtype.type
    if dtype == np.float32:
        image_kwargs = {}
    elif dtype == np.complex64:
        # complex data uses a float32 image with two channels
        image_kwargs = {"num_channels": 2}
    else:
        # used to fall through and fail later on the unbound name ``im``
        raise NotImplementedError(
            "only works for float32 or complex64 (got %s)" % data.dtype)

    d1_g = OCLArray.from_array(data)
    d2_g = OCLArray.empty(new_shape,data.dtype)

    im = OCLImage.empty(data.shape[::1],dtype = np.float32, **image_kwargs)

    im.copy_buffer(d1_g)
    d2_g.copy_image_resampled(im)

    return d2_g.get()
def transfer(data):
    """transfers data

    Round trip buffer -> image -> buffer: ``data`` is uploaded as OCLArray,
    copied into an OCLImage, copied back into a second OCLArray and
    downloaded.

    :param data: numpy array of dtype float32 or complex64
    :return: the downloaded array after the round trip
    :raises NotImplementedError: for any other dtype
    """
    # Validate the dtype before any device allocation; an unsupported dtype
    # used to surface later as an error about the unbound name ``im``.
    if data.dtype.type not in (np.float32, np.complex64):
        raise NotImplementedError(
            "only works for float32 or complex64 (got %s)" % data.dtype)

    d1_g = OCLArray.from_array(data)
    d2_g = OCLArray.empty_like(data)

    if data.dtype.type == np.float32:
        im = OCLImage.empty(data.shape[::1],dtype = np.float32)
    else:
        # complex64: float32 image with two channels
        im = OCLImage.empty(data.shape[::1],dtype = np.float32, num_channels=2)

    im.copy_buffer(d1_g)
    d2_g.copy_image(im)

    return d2_g.get()
def resample_buf(data, new_shape):
    """resamples d

    Uploads ``data``, copies it into an OCLImage and fills a buffer of shape
    ``new_shape`` from that image with ``copy_image_resampled``.

    :param data: numpy array of dtype float32 or complex64
    :param new_shape: shape of the returned array
    :return: the downloaded, resampled array (same dtype as ``data``)
    :raises NotImplementedError: for any other dtype
    """
    # Validate the dtype before any device allocation; an unsupported dtype
    # used to surface later as an error about the unbound name ``im``.
    if data.dtype.type not in (np.float32, np.complex64):
        raise NotImplementedError(
            "only works for float32 or complex64 (got %s)" % data.dtype)

    d1_g = OCLArray.from_array(data)
    d2_g = OCLArray.empty(new_shape, data.dtype)

    if data.dtype.type == np.float32:
        im = OCLImage.empty(data.shape[::1], dtype=np.float32)
    else:
        # complex64: float32 image with two channels
        im = OCLImage.empty(data.shape[::1], dtype=np.float32, num_channels=2)

    im.copy_buffer(d1_g)
    d2_g.copy_image_resampled(im)

    return d2_g.get()
def _blur_buf(d_g, width=(4.0, 4.0), res_g=None):
    """Convolve the device array ``d_g`` with one sampled Gaussian per entry of ``width``.

    For every width ``w`` the kernel is sampled on ``3 * w + 1`` points
    (truncated to an integer count) with sigma ``0.5 * w``. The kernels are
    not normalised and are passed to the convolution in reversed order.

    :param d_g: 1D, 2D or 3D device array
    :param width: one width per kernel
    :param res_g: optional result buffer, forwarded to the convolution
    :return: the return value of ``convolve`` / ``convolve_sep2`` /
        ``convolve_sep3``; ``None`` if ``d_g`` has any other dimensionality
    """
    Ns = [3 * s + 1 for s in width]
    sigmas = [0.5 * s for s in width]

    # np.linspace needs an integer sample count; 3 * s + 1 is a float for
    # the default widths, so only the count is cast (the bounds keep their
    # original values).
    hs = [np.exp(-0.5 / s ** 2 * np.linspace(-N / 2, N / 2, int(N)) ** 2)
          for s, N in zip(sigmas, Ns)]

    h_gs = [OCLArray.from_array(h.astype(np.float32)) for h in hs][::-1]

    ndim = len(d_g.shape)
    if ndim == 1:
        return convolve(d_g, *h_gs, res_g=res_g)
    if ndim == 2:
        return convolve_sep2(d_g, *h_gs, res_g=res_g)
    if ndim == 3:
        return convolve_sep3(d_g, *h_gs, res_g=res_g)
    # other dimensionalities are silently ignored, as before
    return None
def _deconv_rl_gpu_fft(data_g, h_g, Niter = 10):
    """
    using fft_convolve

    :param data_g: device array holding the measurement
    :param h_g: device array holding the kernel, same shape as ``data_g``;
        it is transformed in place by this function
    :param Niter: number of update iterations
    :return: the estimate as complex64 OCLArray (not downloaded)
    :raises ValueError: if ``data_g`` and ``h_g`` differ in shape
    """
    if data_g.shape != h_g.shape:
        raise ValueError("data and h have to be same shape")

    #set up some gpu buffers
    u_g = OCLArray.empty(data_g.shape,np.complex64)
    u_g.copy_buffer(data_g)

    tmp_g = OCLArray.empty(data_g.shape,np.complex64)

    #fix this
    # the flipped kernel is built on the host from the downloaded kernel,
    # reversed along its first two axes
    hflip_g = OCLArray.from_array((h_g.get()[::-1,::-1]).copy())

    # NOTE(review): the plan is created but not passed to any call below
    plan = fft_plan(data_g.shape)

    #transform psf
    fft(h_g,inplace = True)
    fft(hflip_g,inplace = True)

    for i in range(Niter):
        # print(i) is valid on Python 2 and 3 (the bare statement form was
        # a SyntaxError on Python 3)
        print(i)
        fft_convolve(u_g, h_g,
                     res_g = tmp_g,
                     kernel_is_fft = True)

        _complex_divide_inplace(data_g,tmp_g)

        fft_convolve(tmp_g,hflip_g,
                     inplace = True,
                     kernel_is_fft = True)

        _complex_multiply_inplace(u_g,tmp_g)

    return u_g
def get_gpu(N=256, niter=100, sig=1.):
    """Measure the relative error of repeated forward/inverse gpu fft round trips.

    A seeded ``(N, N)`` complex64 random array is transformed forward and
    back ``niter`` times, using a plan built with ``fast_math=False``.
    After each round trip the largest absolute deviation from the original
    array, divided by the largest absolute value of the original, is stored.

    :return: numpy array with one relative error per round trip
    """
    np.random.seed(0)
    reference = np.random.normal(0, sig, (N, N)).astype(np.complex64)
    working = (1. * reference.copy()).astype(np.complex64)

    spectrum_g = OCLArray.empty_like(working)
    signal_g = OCLArray.from_array(working)

    plan = fft_plan((N, N), fast_math=False)

    rels = []
    for _ in range(niter):
        # forward into spectrum_g, inverse back into signal_g
        fft(signal_g, res_g=spectrum_g, plan=plan)
        fft(spectrum_g, res_g=signal_g, inverse=True, plan=plan)

        deviation = np.amax(np.abs(reference - signal_g.get()))
        rels.append(deviation / np.amax(np.abs(reference)))

    return np.array(rels)
def gpu_structure(data):
    """Function to convolve an image with a structure filter on GPU.

    ``data`` is uploaded as float32. The result buffer has shape
    ``(data.shape[0], data.shape[1], 2)`` and is returned after download.
    """
    # device buffers for input and result
    image_g = OCLArray.from_array(data.astype(float32))
    result_shape = (data.shape[0], data.shape[1], 2)
    res_g = OCLArray.empty(result_shape, float32)

    prog = OCLProgram("./OpenCL/gpu_kernels/gpu_structure.cl")

    launch = dict(
        name="structure",              # the name of the kernel in the cl file
        global_size=image_g.shape[::-1],  # reversed array shape
        local_size=None,               # not specified
    )
    prog.run_kernel(launch["name"], launch["global_size"], launch["local_size"],
                    image_g.data, res_g.data)

    return res_g.get()
def get_gpu(N = 256, niter=100, sig = 1.):
    """Relative error after each of ``niter`` forward/inverse fft round trips on the gpu.

    Uses a seeded ``(N,N)`` complex64 random array and a plan created with
    ``fast_math = False``. Returns a numpy array holding, per round trip,
    ``max|a - b| / max|a|`` with ``a`` the original and ``b`` the downloaded
    round-trip result.
    """
    np.random.seed(0)
    a = np.random.normal(0,sig,(N,N)).astype(np.complex64)
    b = (1.*a.copy()).astype(np.complex64)

    c_g = OCLArray.empty_like(b)
    b_g = OCLArray.from_array(b)
    p = fft_plan((N,N), fast_math = False)

    def _round_trip_error():
        fft(b_g,res_g = c_g, plan = p)
        fft(c_g, res_g = b_g, inverse = True, plan = p)
        return np.amax(np.abs(a-b_g.get()))/np.amax(np.abs(a))

    rels = [_round_trip_error() for _ in range(niter)]
    return np.array(rels)
def gpu_mean(data, Nx=10,Ny=10):
    """Function to convolve an image with a mean filter on GPU.

    ``data`` is uploaded as float32; ``Nx`` and ``Ny`` are passed to the
    kernel as int32. The result has the shape of ``data`` and is returned
    after download.
    """
    # device buffers for input and result
    image_g = OCLArray.from_array(data.astype(float32))
    res_g = OCLArray.empty(data.shape,float32)

    prog = OCLProgram("./OpenCL/gpu_kernels/gpu_mean.cl")

    kernel_name = "mean"               # the name of the kernel in the cl file
    global_size = image_g.shape[::-1]  # reversed array shape
    local_size = None                  # not specified
    prog.run_kernel(kernel_name, global_size, local_size,
                    image_g.data, res_g.data,
                    int32(Nx), int32(Ny))

    return res_g.get()
def test_parseval():
    """Record the signal energy after repeated FFT round trips, GPU vs numpy.

    A uniform random 512x512 complex64 array is pushed through 100
    forward/inverse FFT pairs, once in place on the GPU and once with
    ``np.fft`` on the host. After every pair ``sum(|d|**2)`` is stored.

    :returns: tuple ``(gpu_energies, numpy_energies)`` of two lists
    """
    size = 512
    rounds = 100

    host = np.random.uniform(-1, 1, (size, size)).astype(np.complex64)
    device = OCLArray.from_array(host.astype(np.complex64))

    gpu_energies = []
    numpy_energies = []

    # GPU: in-place forward and inverse transform
    for step in range(rounds):
        print(step)
        fft(device, inplace=True, fast_math=False)
        fft(device, inverse=True, inplace=True, fast_math=False)
        gpu_energies.append(np.sum(np.abs(device.get()) ** 2))

    # host: same round trip, truncated to complex64 after each transform
    for step in range(rounds):
        print(step)
        spectrum = np.fft.fftn(host).astype(np.complex64)
        host = np.fft.ifftn(spectrum).astype(np.complex64)
        numpy_energies.append(np.sum(np.abs(host) ** 2))

    return gpu_energies, numpy_energies
def test_parseval():
    """Compare energy conservation of GPU and numpy FFT round trips.

    Starting from one uniform random 512x512 complex64 array, 100
    forward + inverse transforms are applied on the device (in place) and,
    separately, on the host via ``np.fft``. The squared-magnitude sum is
    recorded after every round trip.

    :returns: ``(s_gpu, s_cpu)``, two lists of per-iteration energies
    """
    n_side, n_rounds = 512, 100

    d_host = np.random.uniform(-1, 1, (n_side, n_side)).astype(np.complex64)
    d_dev = OCLArray.from_array(d_host.astype(np.complex64))

    s_gpu, s_cpu = [], []

    for k in range(n_rounds):
        print(k)
        fft(d_dev, inplace=True, fast_math=False)
        fft(d_dev, inverse=True, inplace=True, fast_math=False)
        energy = np.sum(np.abs(d_dev.get()) ** 2)
        s_gpu.append(energy)

    for k in range(n_rounds):
        print(k)
        # cast back to complex64 after each step, as the GPU path works in that precision
        d_host = np.fft.fftn(d_host).astype(np.complex64)
        d_host = np.fft.ifftn(d_host).astype(np.complex64)
        energy = np.sum(np.abs(d_host) ** 2)
        s_cpu.append(energy)

    return s_gpu, s_cpu
def time_multi(N, nargs, niter=100):
    """Time a single multi-output reduction kernel over ``nargs`` inputs.

    One ``OCLReductionKernel2`` is built with ``nargs`` map expressions
    (``i*x_i[i]`` for input ``i``) and a ``a+b`` reduction. It is invoked
    ``niter`` times on ``nargs`` float32 arrays of ``N`` ones; the queue is
    finished once after the loop before the clock is read.

    :param N: number of elements in every input array
    :param nargs: number of inputs (and outputs) of the fused reduction
    :param niter: number of kernel invocations to average over
    :returns: mean wall-clock time per invocation in seconds
    """
    from time import time

    # ``range`` and the single-string ``print`` calls below behave the same
    # on Python 2 and 3; the previous ``xrange``/print statements were a
    # SyntaxError on Python 3.
    map_exprs = ["%s*x%s[i]" % (i, i) for i in range(nargs)]
    arguments = ",".join("__global float *x%s" % i for i in range(nargs))

    k = OCLReductionKernel2(np.float32, neutral="0", reduce_expr="a+b",
                            map_exprs=map_exprs,
                            arguments=arguments)

    ins = [OCLArray.from_array(np.ones(N, np.float32)) for _ in range(nargs)]
    outs = [OCLArray.empty(1, np.float32) for _ in range(nargs)]

    t = time()
    for _ in range(niter):
        k(*ins, outs=outs)
    # wait for all enqueued work so the measurement covers the execution
    get_device().queue.finish()
    t = (time() - t) / niter

    results = [float(out.get()) for out in outs]
    print("multi reduction: result = %s" % (results,))
    print("multi reduction:\t\t%.2f ms" % (1000 * t))
    return t
def affine(data, mat=np.identity(4), mode="linear"):
    """Apply an affine transform to ``data`` on the GPU.

    The inverse of ``mat`` is uploaded as float32 and handed to the
    ``affine`` kernel together with the input image.

    :param data: input array; uploaded as an ``OCLImage``
    :param mat: transformation matrix (default: 4x4 identity); it is inverted
        before being passed to the kernel
    :param mode: interpolation mode, ``"linear"`` or ``"nearest"``
    :returns: float32 host array with the same shape as ``data``
    :raises KeyError: if ``mode`` is not one of the known modes
    """
    # interpolation mode -> compiler define selecting it in the kernel source
    build_flags = {"linear": "", "nearest": "-D USENEAREST"}
    if mode not in build_flags:
        raise KeyError("mode = '%s' not defined ,valid: %s" % (mode, build_flags.keys()))

    source_im = OCLImage.from_array(data)
    out_g = OCLArray.empty(data.shape, np.float32)
    inverse = np.linalg.inv(mat).astype(np.float32, copy=False)
    inverse_g = OCLArray.from_array(inverse)

    program = OCLProgram(abspath("kernels/transformations.cl"),
                         build_options=[build_flags[mode]])

    # global size is the reversed data shape; local size is left to the runtime
    program.run_kernel("affine",
                       data.shape[::-1], None,
                       source_im, out_g.data, inverse_g.data)

    return out_g.get()
def _convolve3_old(data, h, dev=None):
    """Convolve 3d ``data`` with kernel ``h`` on the GPU.

    Boundary conditions are clamping to edge. ``h`` is converted to float32.

    :param data: input volume; its dtype must be ``np.float32`` or
        ``np.uint16``
    :param h: convolution kernel
    :param dev: OpenCL device; if None the default one from ``get_device()``
        is looked up
    :returns: float32 host array with the same shape as ``data``
    :raises ValueError: if no device is given and none can be found
    :raises TypeError: if the dtype of ``data`` is not supported
    """
    if dev is None:
        dev = get_device()

    if dev is None:
        raise ValueError("no OpenCLDevice found...")

    dtype = data.dtype.type

    # supported input dtype -> compiler define selecting it in the kernel source
    dtypes_options = {np.float32: "",
                      np.uint16: "-D SHORTTYPE"}

    if dtype not in dtypes_options:
        # List the supported dtypes inside the message itself; previously they
        # were passed as a second exception argument, leaving the message
        # ending in a dangling "please convert to:".
        supported = ", ".join(t.__name__ for t in dtypes_options)
        raise TypeError("data type %s not supported yet, please convert to: %s"
                        % (dtype, supported))

    prog = OCLProgram(abspath("kernels/convolve3.cl"),
                      build_options=dtypes_options[dtype])

    hbuf = OCLArray.from_array(h.astype(np.float32))
    img = OCLImage.from_array(data)
    res = OCLArray.empty(data.shape, dtype=np.float32)

    # the kernel receives the data extents followed by the kernel extents
    Ns = [np.int32(n) for n in data.shape + h.shape]

    prog.run_kernel("convolve3d", img.shape, None,
                    img, hbuf.data, res.data,
                    *Ns)

    return res.get()
def time_simple(N, nargs, niter=100):
    """Time ``nargs`` independent single-output reduction kernels.

    One ``OCLReductionKernel`` per input is built (map expression ``i*x[i]``
    for kernel ``i``, reduction ``a+b``). In every iteration each kernel is
    run on its own float32 array of ``N`` ones; the queue is finished once
    after the loop before the clock is read.

    :param N: number of elements in every input array
    :param nargs: number of kernels / inputs / outputs
    :param niter: number of iterations to average over
    :returns: mean wall-clock time per iteration in seconds
    """
    from time import time
    from gputools import OCLReductionKernel

    # ``range`` and the single-string ``print`` calls below behave the same
    # on Python 2 and 3; the previous ``xrange``/print statements were a
    # SyntaxError on Python 3.
    ks = [OCLReductionKernel(np.float32, neutral="0", reduce_expr="a+b",
                             map_expr="%s*x[i]" % i,
                             arguments="__global float *x")
          for i in range(nargs)]

    ins = [OCLArray.from_array(np.ones(N, np.float32)) for _ in range(nargs)]
    outs = [OCLArray.empty(1, np.float32) for _ in range(nargs)]

    t = time()
    for _ in range(niter):
        for k, inn, out in zip(ks, ins, outs):
            k(inn, out=out)
    # wait for all enqueued work so the measurement covers the execution
    get_device().queue.finish()
    t = (time() - t) / niter

    results = [float(out.get()) for out in outs]
    print("simple reduction: result = %s" % (results,))
    print("simple reduction:\t\t%.2f ms" % (1000 * t))
    return t