def time_multi(N, nargs, niter=100):
    map_exprs = ["%s*x%s[i]" % (i, i) for i in range(nargs)]
    arguments = ",".join("__global float *x%s" % i for i in range(nargs))

    k = OCLReductionKernel2(np.float32,
                            neutral="0", reduce_expr="a+b",
                            map_exprs=map_exprs,
                            arguments=arguments)

    ins = [OCLArray.from_array(np.ones(N, np.float32)) for _ in range(len(map_exprs))]
    outs = [OCLArray.empty(1, np.float32) for _ in range(len(map_exprs))]

    from time import time
    t = time()
    for _ in range(niter):
        k(*ins, outs=outs)
    get_device().queue.finish()
    t = (time() - t) / niter

    print("multi reduction: result =", [float(out.get()) for out in outs])
    print("multi reduction:\t\t%.2f ms" % (1000 * t))
    return t
def time_simple(N, nargs, niter=100):
    from gputools import OCLReductionKernel
    map_exprs = ["%s*x[i]" % i for i in range(nargs)]

    ks = [OCLReductionKernel(np.float32,
                             neutral="0", reduce_expr="a+b",
                             map_expr="%s*x[i]" % i,
                             arguments="__global float *x")
          for i in range(len(map_exprs))]

    ins = [OCLArray.from_array(np.ones(N, np.float32)) for _ in range(len(map_exprs))]
    outs = [OCLArray.empty(1, np.float32) for _ in range(len(map_exprs))]

    from time import time
    t = time()
    for _ in range(niter):
        for k, inn, out in zip(ks, ins, outs):
            k(inn, out=out)
    get_device().queue.finish()
    t = (time() - t) / niter

    print("simple reduction: result =", [float(out.get()) for out in outs])
    print("simple reduction:\t\t%.2f ms" % (1000 * t))
    return t
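# A hedged driver sketch (not part of the original benchmark file): assuming the
# two helpers above and their module-level dependencies (numpy as np, OCLArray,
# OCLReductionKernel/OCLReductionKernel2, get_device) are importable, the fused
# multi-output reduction can be compared against n independent reductions.
# N = 2**26 and nargs = 4 are illustrative values, not taken from the source.
if __name__ == '__main__':
    N, nargs = 2 ** 26, 4
    t_simple = time_simple(N, nargs)
    t_multi = time_multi(N, nargs)
    print("speedup of fused multi-reduction: %.1fx" % (t_simple / t_multi))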
def time_np(dshape, niter=3):
    d = np.empty(dshape, np.complex64)
    get_device().queue.finish()
    t = time()
    for _ in range(niter):
        np.fft.fftn(d)
    get_device().queue.finish()
    t = (time() - t) / niter
    print("CPU\t\t\t%s\t\t%.2f ms" % (dshape, 1000. * t))
    return t
def time_gpu(dshape, niter=100, fast_math=False):
    d_g = OCLArray.empty(dshape, np.complex64)
    get_device().queue.finish()
    plan = fft_plan(dshape, fast_math=fast_math)
    t = time()
    for _ in range(niter):
        fft(d_g, inplace=True, plan=plan)
    get_device().queue.finish()
    t = (time() - t) / niter
    print("GPU (fast_math = %s)\t%s\t\t%.2f ms" % (fast_math, dshape, 1000. * t))
    return t
def fft(arr_obj, res_g=None,
        inplace=False,
        inverse=False,
        batch=1,
        plan=None,
        fast_math=True,
        normalize=True):
    """(inverse) Fourier transform of 1-3D arrays

    creates a new plan or uses the given plan

    the transformed arr_obj should be either a
    - numpy array:
        returns the fft as a numpy array (inplace is ignored)
    - OCLArray of type complex64:
        writes the transform into res_g if given, into arr_obj if inplace,
        or returns a new OCLArray with the transform otherwise
    """
    if plan is None:
        if batch == 1:
            plan = Plan(arr_obj.shape, queue=get_device().queue,
                        normalize=normalize, fast_math=fast_math)
        else:
            plan = Plan(arr_obj.shape[1:], queue=get_device().queue,
                        normalize=normalize, fast_math=fast_math)

    if isinstance(arr_obj, np.ndarray):
        return _ocl_fft_numpy(plan, arr_obj, inverse=inverse)
    elif isinstance(arr_obj, OCLArray):
        if not arr_obj.dtype.type is np.complex64:
            raise TypeError("OCLArray arr_obj has to be of complex64 type")
        if inplace:
            _ocl_fft_gpu_inplace(plan, arr_obj, inverse=inverse, batch=batch)
        else:
            return _ocl_fft_gpu(plan, arr_obj, res_arr=res_g, inverse=inverse, batch=batch)
    else:
        raise TypeError("array argument (1) has bad type: %s" % type(arr_obj))
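# Hedged usage sketch for fft(): only behavior documented above is exercised
# (numpy in -> numpy out, complex64 OCLArray transformed in place with a
# reusable plan). The top-level imports `from gputools import ...` and the
# shapes are assumptions for illustration.
import numpy as np
from gputools import OCLArray, fft, fft_plan

d = np.random.uniform(-1, 1, (128, 128, 128)).astype(np.complex64)

d_hat = fft(d)                        # numpy array in, numpy array out

d_g = OCLArray.from_array(d)          # complex64 buffer on the GPU
plan = fft_plan(d.shape)              # build the plan once ...
fft(d_g, inplace=True, plan=plan)     # ... and reuse it for repeated transforms
fft(d_g, inplace=True, inverse=True, plan=plan)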
def func_to_time(dshape, Niter=10, **kwargs):
    # `f` (the function to benchmark) and `zeros` (numpy) are expected to be
    # defined at module level
    d = zeros(dshape, np.complex64)
    d_g = gputools.OCLArray.from_array(d)

    # burn in
    f(d_g, **kwargs)
    gputools.get_device().queue.finish()

    t = time()
    for _ in range(Niter):
        f(d_g, **kwargs)
    gputools.get_device().queue.finish()
    return (time() - t) / Niter
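# Hedged example of how the timing helper above might be driven: `f` is looked
# up at module level, so binding it to the routine under test before calling
# func_to_time is one way to use it. gputools.fft as the benchmarked function
# and the shape are assumptions for illustration.
import numpy as np
import gputools
from numpy import zeros
from time import time

f = gputools.fft
print("mean fft time: %.2f ms" %
      (1000 * func_to_time((256, 256, 256), Niter=10, inplace=True)))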
def empty(cls, shape, dtype, num_channels=1, channel_order=None):
    ctx = get_device().context
    if not len(shape) in [2, 3]:
        raise ValueError(
            "dimension of shape wrong, should be 2...3 but is %s" % len(shape))

    mem_flags = cl.mem_flags.READ_WRITE
    channel_type = cl.DTYPE_TO_CHANNEL_TYPE[np.dtype(dtype)]

    _dict_channel_order = {1: cl.channel_order.R,
                           2: cl.channel_order.RG,
                           3: cl.channel_order.RGB,
                           4: cl.channel_order.RGBA}

    if channel_order is None:
        channel_order = _dict_channel_order[num_channels]

    fmt = cl.ImageFormat(channel_order, channel_type)
    res = cls(ctx, mem_flags, fmt, shape=shape[::-1])
    res.dtype = dtype
    res.num_channels = num_channels
    return res
def write_array(self, data):
    queue = get_device().queue
    # 1d images dont have a shape but only a width
    if hasattr(self, "shape"):
        imshape = self.shape
    else:
        imshape = (self.width,)

    ndim = len(imshape)
    dshape = data.shape
    # if clImg.format.channel_order in [cl.channel_order.RGBA,
    #                                   cl.channel_order.BGRA]:
    #     dshape = dshape[:-1]

    if dshape != imshape[::-1]:
        raise ValueError("write_array: wrong shape!", data.shape[::-1], imshape)
    else:
        # cl.enqueue_write_image(queue, self, [0]*ndim, imshape, data)
        # FIXME: data.copy() is a workaround
        cl.enqueue_copy(queue, self, data.copy(),
                        origin=(0,) * ndim,
                        region=imshape)
def estimate_niter(N):
    """returns niter s.t. the time spent on the kernel is the same as for the memory transfer"""
    a = np.ones(N, np.float32)

    dev = get_device()
    context, queue = dev.context, dev.queue
    mf = cl.mem_flags

    t = time()
    copy_g = cl.Buffer(context, mf.ALLOC_HOST_PTR, size=a.nbytes)
    cl.enqueue_map_buffer(queue, copy_g, a,
                          device_offset=0,
                          is_blocking=False)
    # cl.enqueue_copy(queue, copy_g, a,
    #                 device_offset=0,
    #                 is_blocking=False)
    queue.flush()
    # a_g = OCLArray.from_array(a, async = True)
    # a_g = array.to_device(queue, a, async = False)
    print(time() - t)
def from_array(cls, arr, *args, **kwargs):
    ctx = get_device().context
    if not arr.ndim in [2, 3, 4]:
        raise ValueError(
            "dimension of array wrong, should be 2...4 but is %s" % arr.ndim)
    elif arr.ndim == 4:
        num_channels = arr.shape[-1]
    else:
        num_channels = 1

    if arr.dtype.type == np.complex64:
        num_channels = 2
        res = OCLImage.empty(arr.shape, dtype=np.float32, num_channels=num_channels)
        res.write_array(arr)
        res.dtype = np.float32
    else:
        res = cl.image_from_array(ctx, prepare(arr),
                                  num_channels=num_channels,
                                  *args, **kwargs)
        res.dtype = arr.dtype

    res.num_channels = num_channels
    return res
def empty(cls, shape, dtype, num_channels=1, channel_order=None):
    ctx = get_device().context
    if not len(shape) in [2, 3]:
        raise ValueError(
            "number of dimension = %s not supported (can be 2 or 3)" % len(shape))
    if not num_channels in [1, 2, 3, 4]:
        raise ValueError(
            "number of channels = %s not supported (can be 1, 2, 3 or 4)" % num_channels)

    mem_flags = cl.mem_flags.READ_WRITE
    channel_type = cl.DTYPE_TO_CHANNEL_TYPE[np.dtype(dtype)]

    _dict_channel_order = {1: cl.channel_order.R,
                           2: cl.channel_order.RG,
                           3: cl.channel_order.RGB,
                           4: cl.channel_order.RGBA}

    if channel_order is None:
        channel_order = _dict_channel_order[num_channels]

    fmt = cl.ImageFormat(channel_order, channel_type)
    res = cls(ctx, mem_flags, fmt, shape=shape[::-1])
    res.dtype = dtype
    res.num_channels = num_channels
    return res
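# Hedged usage sketch for the OCLImage constructors above: allocate an empty
# 2d image, upload a numpy array with write_array and read it back with get.
# The import `from gputools import OCLImage` is an assumption; the call
# pattern follows the methods shown in this file.
import numpy as np
from gputools import OCLImage

data = np.random.uniform(0, 1, (128, 256)).astype(np.float32)

im = OCLImage.empty(data.shape, np.float32)
im.write_array(data)                  # host -> device
back = im.get()                       # device -> host
assert np.allclose(data, back)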
def _fft_convolve_numpy(data, h, plan=None,
                        kernel_is_fft=False,
                        kernel_is_fftshifted=False):
    """convolving via opencl fft for numpy arrays

    data and h must have the same size
    """
    dev = get_device()

    if data.shape != h.shape:
        raise ValueError("data and kernel must have same size! %s vs %s" %
                         (str(data.shape), str(h.shape)))

    data_g = OCLArray.from_array(data.astype(np.complex64))

    if not kernel_is_fftshifted:
        h = np.fft.fftshift(h)

    h_g = OCLArray.from_array(h.astype(np.complex64))
    res_g = OCLArray.empty_like(data_g)

    _fft_convolve_gpu(data_g, h_g, res_g=res_g,
                      plan=plan,
                      kernel_is_fft=kernel_is_fft)

    res = abs(res_g.get())

    del data_g
    del h_g
    del res_g

    return res
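# Hedged usage sketch: convolving an image with a same-sized, centered kernel
# via the helper above. The Gaussian kernel and shape are illustrative; the
# only requirement taken from the source is that data and h have equal shape
# and that h is centered unless kernel_is_fftshifted=True.
import numpy as np

N = 256
x = np.linspace(-1, 1, N)
Y, X = np.meshgrid(x, x, indexing="ij")

data = np.random.uniform(0, 1, (N, N)).astype(np.float32)
h = np.exp(-50 * (X ** 2 + Y ** 2))   # centered kernel, same shape as data

res = _fft_convolve_numpy(data, h)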
def copy_image(self, img, **kwargs):
    queue = get_device().queue
    return cl.enqueue_copy(queue, self, img,
                           src_origin=(0, 0),
                           dest_origin=(0, 0),
                           region=self.shape,
                           **kwargs)
def _ocl_fft_gpu_inplace(ocl_arr, inverse=False, plan=None):
    assert_bufs_type(np.complex64, ocl_arr)

    if plan is None:
        plan = Plan(ocl_arr.shape, queue=get_device().queue)

    plan.execute(ocl_arr.data, ocl_arr.data, inverse=inverse)
def func_to_time(im_size=256, psf_size=5, Niter=10, is_pad=False):
    # `f` (the function to benchmark), `zeros` and `ones` (numpy) are expected
    # to be defined at module level
    d = zeros((im_size,) * 2, np.float32)
    if is_pad:
        h = ones((im_size,) * 2, np.float32)
    else:
        h = ones((psf_size,) * 2, np.float32)

    d_g = gputools.OCLArray.from_array(d)
    h_g = gputools.OCLArray.from_array(h)
    gputools.get_device().queue.finish()

    t = time()
    for _ in range(Niter):
        f(d_g, h_g)
    gputools.get_device().queue.finish()
    return (time() - t) / Niter
def _setup_gpu(self):
    dev = get_device()
    self._queue = dev.queue
    self._ctx = dev.context
    prog = OCLProgram(absPath("kernels/bpm_3d_kernels.cl"))

    # the buffers/images
    Nx, Ny = self.simul_xy
    Nx0, Ny0 = self.shape[:2]

    self._plan = fft_plan((Ny, Nx), **self.fftplan_kwargs)
    self._buf_plane = OCLArray.empty((Ny, Nx), np.complex64)
    self._buf_H = OCLArray.empty((Ny, Nx), np.complex64)
    self._img_xy = OCLImage.empty((Ny, Nx), dtype=np.float32, num_channels=2)

    # buffer for the weighted dn average
    self.intens_g = OCLArray.empty((1, Ny, Nx), dtype=Bpm3d._real_type)
    self.intens_dn_g = OCLArray.empty((1, Ny, Nx), dtype=Bpm3d._real_type)
    self.intens_sum_g = OCLArray.zeros((), dtype=Bpm3d._real_type)
    self.intens_dn_sum_g = OCLArray.zeros((), dtype=Bpm3d._real_type)

    # the kernels
    self._kernel_compute_propagator = prog.compute_propagator
    self._kernel_compute_propagator.set_scalar_arg_dtypes((None,) + (np.float32,) * 5)
    self._kernel_compute_propagator_buf = prog.compute_propagator_buf
    self._kernel_compute_propagator_buf.set_scalar_arg_dtypes(
        (None,) + (np.float32,) * 5 + (None,) * 2)

    self._kernel_mult_complex = prog.mult

    self._kernel_im_to_buf_field = prog.img_to_buf_field
    self._kernel_im_to_buf_intensity = prog.img_to_buf_intensity
    self._kernel_im_to_im_intensity = prog.img_to_img_intensity
    self._kernel_buf_to_buf_field = prog.buf_to_buf_field
    self._kernel_buf_to_buf_intensity = prog.buf_to_buf_intensity

    self._kernel_mult_dn_img_float = prog.mult_dn_image
    self._kernel_mult_dn_buf_float = prog.mult_dn
    self._kernel_mult_dn_img_complex = prog.mult_dn_image_complex
    self._kernel_mult_dn_buf_complex = prog.mult_dn_complex

    self._kernel_mult_dn_img_float_local = prog.mult_dn_image_local
    self._kernel_mult_dn_buf_float_local = prog.mult_dn_local
    self._kernel_mult_dn_img_complex_local = prog.mult_dn_image_complex_local
    self._kernel_mult_dn_buf_complex_local = prog.mult_dn_complex_local

    self._kernel_reduction = OCLMultiReductionKernel(
        np.float32,
        neutral="0", reduce_expr="a+b",
        map_exprs=["a[i]", "b[i]"],
        arguments="__global float *a, __global float *b")

    self._fill_propagator(self.n0)
def copy_image(self, img, **kwargs):
    queue = get_device().queue
    return cl.enqueue_copy(queue, self, img,
                           src_origin=(0,) * len(self.imshape()),
                           dest_origin=(0,) * len(self.imshape()),
                           region=self.shape,
                           **kwargs)
def copy_image(self, img, **kwargs):
    queue = get_device().queue
    return cl.enqueue_copy(queue, self.data, img,
                           offset=0,
                           origin=(0,) * len(img.shape),
                           region=img.shape,
                           **kwargs)
def copy_image(self, img, **kwargs):
    queue = get_device().queue
    return pyopencl.enqueue_copy(queue, self, img,
                                 src_origin=(0, 0),
                                 dest_origin=(0, 0),
                                 region=img.shape,
                                 **kwargs)
def copy_buffer(self, buf, **kwargs):
    queue = get_device().queue
    self.dtype = buf.dtype
    return cl.enqueue_copy(queue, self, buf.data,
                           offset=0,
                           origin=(0,) * len(self.imshape()),
                           region=self.imshape(),
                           **kwargs)
def copy_buffer(self, buf, **kwargs):
    queue = get_device().queue
    self.dtype = buf.dtype
    return pyopencl.enqueue_copy(queue, self, buf.data,
                                 offset=0,
                                 origin=(0, 0),
                                 region=self.shape,
                                 **kwargs)
def _integral3_buf(x_g, res_g=None, tmp_g=None):
    if not x_g.dtype.type in _output_type_dict:
        raise ValueError("dtype %s currently not supported! (%s)" %
                         (x_g.dtype.type, str(_output_type_dict.keys())))

    dtype_out = _output_type_dict[x_g.dtype.type]
    cl_dtype_in = cl_buffer_datatype_dict[x_g.dtype.type]
    cl_dtype_out = cl_buffer_datatype_dict[dtype_out]
    dtype_itemsize = np.dtype(dtype_out).itemsize

    max_local_size = get_device().get_info("MAX_WORK_GROUP_SIZE")
    prog = OCLProgram(abspath("kernels/integral_image.cl"),
                      build_options=["-D", "DTYPE=%s" % cl_dtype_out])

    if x_g.dtype.type != dtype_out:
        x_g = x_g.astype(dtype_out)
    if tmp_g is None:
        tmp_g = OCLArray.empty(x_g.shape, dtype_out)
    if res_g is None:
        res_g = OCLArray.empty(x_g.shape, dtype_out)

    assert_bufs_type(dtype_out, tmp_g, res_g)

    nz, ny, nx = x_g.shape

    def _scan_single(src, dst, ns, strides):
        nx, ny, nz = ns
        stride_x, stride_y, stride_z = strides
        loc = min(next_power_of_2(nx // 2), max_local_size // 2)
        nx_block = 2 * loc
        nx_pad = math.ceil(nx / nx_block) * nx_block
        nblocks = math.ceil(nx_pad // 2 / loc)
        sum_blocks = OCLArray.empty((nz, ny, nblocks), dst.dtype)
        shared = cl.LocalMemory(2 * dtype_itemsize * loc)

        for b in range(nblocks):
            offset = b * loc
            prog.run_kernel("scan3d", (loc, ny, nz), (loc, 1, 1),
                            src.data, dst.data, sum_blocks.data, shared,
                            np.int32(nx_block),
                            np.int32(stride_x), np.int32(stride_y), np.int32(stride_z),
                            np.int32(offset), np.int32(b), np.int32(nblocks),
                            np.int32(ny), np.int32(nx))

        if nblocks > 1:
            _scan_single(sum_blocks, sum_blocks, (nblocks, ny, nz), (1, nblocks, nblocks * ny))
            prog.run_kernel("add_sums3d", (nx_pad, ny, nz), (nx_block, 1, 1),
                            sum_blocks.data, dst.data,
                            np.int32(stride_x), np.int32(stride_y), np.int32(stride_z),
                            np.int32(nblocks), np.int32(ny), np.int32(nx))

    # three 1d prefix scans, one along each axis
    _scan_single(x_g, res_g, (nx, ny, nz), (1, nx, nx * ny))
    _scan_single(res_g, tmp_g, (ny, nx, nz), (nx, 1, nx * ny))
    _scan_single(tmp_g, res_g, (nz, nx, ny), (ny * nx, 1, nx))

    return res_g
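# Hedged verification sketch: an integral image is a cumulative sum along every
# axis, so the GPU result can be checked against numpy. Only _integral3_buf and
# OCLArray from the code above are used; shape and dtype are arbitrary choices.
import numpy as np
from gputools import OCLArray

x = np.random.randint(0, 100, (32, 64, 128)).astype(np.float32)

x_g = OCLArray.from_array(x)
res = _integral3_buf(x_g).get()

ref = x.cumsum(axis=0).cumsum(axis=1).cumsum(axis=2)
assert np.allclose(res, ref)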
def test_integral():
    max_size = get_device().get_info("GLOBAL_MEM_SIZE") // 32
    ndims = (2, 3)
    ns = (33, 197, 2183)
    dtypes = (np.uint8, np.uint16, np.int32, np.float32)

    for dtype, ndim in product(dtypes, ndims):
        for shape0 in combinations(ns, ndim):
            for shape in permutations(shape0):
                if np.prod(shape) > max_size:
                    continue
                single_test(shape, dtype, check=True)
def bench(description, dshape, dtype, func_cpu, func_gpu,
          func_gpu_notransfer=None, niter=2):
    x = np.random.randint(0, 100, dshape).astype(dtype)

    func_cpu(x)
    t_cpu = time()
    for _ in range(niter):
        y = func_cpu(x)
    t_cpu = (time() - t_cpu) / niter

    func_gpu(x)
    t_gpu = time()
    for _ in range(niter):
        y = func_gpu(x)
    t_gpu = (time() - t_gpu) / niter

    if func_gpu_notransfer is not None:
        x_g = OCLArray.from_array(x)
        tmp_g = OCLArray.empty_like(x)
        func_gpu_notransfer(x_g, tmp_g)
        get_device().queue.finish()
        t_gpu_notransfer = time()
        for _ in range(niter):
            func_gpu_notransfer(x_g, tmp_g)
        get_device().queue.finish()
        t_gpu_notransfer = (time() - t_gpu_notransfer) / niter
    else:
        t_gpu_notransfer = None

    # print("%s\t\t %s\t%d ms \t %d ms"%(description,dshape, 1000*t1,1000*t2))
    print("%s| %s %s | %d ms | %d ms | %s" %
          (description, dshape, type_name_dict[dtype],
           1000 * t_cpu, 1000 * t_gpu,
           "%d ms" % (1000 * t_gpu_notransfer) if t_gpu_notransfer is not None else "-"))
    return t_cpu, t_gpu, t_gpu_notransfer
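# Hedged usage sketch for bench(): any pair of callables taking a numpy array
# works for func_cpu/func_gpu. The scipy/gputools convolution pair below and
# the minimal stand-in for the module-level type_name_dict that bench reads
# are assumptions for illustration.
import numpy as np
import scipy.ndimage as ndi
import gputools

type_name_dict = {np.float32: "float32"}   # stand-in; bench expects this lookup

h = np.ones((3, 3, 3), np.float32) / 27

bench("mean 3x3x3 ", (128, 128, 128), np.float32,
      func_cpu=lambda x: ndi.convolve(x, h),
      func_gpu=lambda x: gputools.convolve(x, h))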
def _ocl_fft_numpy(arr, inverse=False, plan=None):
    if plan is None:
        plan = Plan(arr.shape, queue=get_device().queue)

    if arr.dtype != np.complex64:
        logger.info("converting %s to complex64, might slow things down..." % arr.dtype)

    ocl_arr = OCLArray.from_array(arr.astype(np.complex64, copy=False))

    _ocl_fft_gpu_inplace(ocl_arr, inverse=inverse, plan=plan)

    return ocl_arr.get()
def test_speed_multipass():
    import time
    from gputools import get_device

    N = 256
    x = np.linspace(-1, 1, N)
    Z, Y, X = np.meshgrid(x, x, x, indexing="ij")
    R = np.sqrt(X ** 2 + Y ** 2 + Z ** 2)
    d = 200 * np.exp(-10 * R ** 2)

    rend = VolumeRenderer((800,) * 2)
    rend.set_modelView(mat4_translate(0, 0, -10.))
    rend.set_data(d.astype(np.float32))
    get_device().queue.finish()

    for niter in range(1, 10):
        get_device().queue.finish()
        t = time.time()
        rend.render(method="max_project", maxVal=200.,
                    currentPart=0, numParts=niter)
        get_device().queue.finish()
        print("time to render with %s substeps:\t %.2f ms" %
              (niter, 1000 * (time.time() - t)))

    return rend
def from_array(cls, arr, *args, **kwargs):
    ctx = get_device().context
    if not arr.ndim in [1, 2, 3, 4]:
        raise ValueError(
            "dimension of array wrong, should be 1...4 but is %s" % arr.ndim)
    elif arr.ndim == 4:
        num_channels = arr.shape[-1]
    else:
        num_channels = None

    res = pyopencl.image_from_array(ctx, arr, num_channels=num_channels,
                                    *args, **kwargs)
    res.dtype = arr.dtype
    return res
def _ocl_fft_gpu(ocl_arr, res_arr=None, inverse=False, plan=None):
    assert_bufs_type(np.complex64, ocl_arr)

    if plan is None:
        plan = Plan(ocl_arr.shape, queue=get_device().queue)

    if res_arr is None:
        res_arr = OCLArray.empty(ocl_arr.shape, np.complex64)

    plan.execute(ocl_arr.data, res_arr.data, inverse=inverse)

    return res_arr
def __init__(self, file_name=None, src_str=None, build_options=[], dev=None):
    if file_name is not None:
        with open(file_name, "r") as f:
            src_str = f.read()

    if src_str is None:
        raise ValueError("empty src_str!")

    if dev is None:
        dev = get_device()

    self.dev = dev
    super(OCLProgram, self).__init__(self.dev.context, src_str)
    self.build(options=build_options)
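# Hedged usage sketch for OCLProgram: build a program from an inline kernel
# string and launch it on an OCLArray. The run_kernel call pattern
# (name, global size, local size, kernel arguments) mirrors the calls seen
# elsewhere in this file; the kernel source itself is illustrative.
import numpy as np
from gputools import OCLArray, OCLProgram

_src = """
__kernel void scale(__global float *x, const float a) {
    int i = get_global_id(0);
    x[i] = a * x[i];
}
"""

x_g = OCLArray.from_array(np.ones(1024, np.float32))
prog = OCLProgram(src_str=_src)
prog.run_kernel("scale", (1024,), None, x_g.data, np.float32(2.))
assert np.allclose(x_g.get(), 2.)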
def get(self, **kwargs):
    queue = get_device().queue

    if hasattr(self, "shape"):
        imshape = self.shape
    else:
        imshape = (self.width,)

    dshape = imshape[::-1]
    if self.format.channel_count > 1:
        dshape += (self.format.channel_count,)

    out = np.empty(dshape, dtype=self.dtype)
    pyopencl.enqueue_read_image(queue, self, [0] * len(dshape), imshape, out)
    return out.reshape(dshape)
def __init__(self, dtype_out, neutral, reduce_expr,
             arguments=None, map_exprs=[None],
             name="reduce_kernel", options=[], preamble=""):

    ctx = get_device().context
    dtype_out = self.dtype_out = np.dtype(dtype_out)
    max_group_size = None
    trip_count = 0

    self.n_exprs = len(map_exprs)
    assert self.n_exprs > 0

    while True:
        self.stage_1_inf = get_reduction_kernel(1, ctx,
                                                dtype_out,
                                                neutral, reduce_expr, arguments,
                                                name=name + "_stage1",
                                                options=options,
                                                preamble=preamble,
                                                map_exprs=map_exprs,
                                                max_group_size=max_group_size)

        kernel_max_wg_size = self.stage_1_inf.kernel.get_work_group_info(
            cl.kernel_work_group_info.WORK_GROUP_SIZE,
            ctx.devices[0])

        if self.stage_1_inf.group_size <= kernel_max_wg_size:
            break
        else:
            max_group_size = kernel_max_wg_size

        trip_count += 1
        assert trip_count <= 2

    self.stage_2_inf = get_reduction_kernel(2, ctx,
                                            dtype_out,
                                            neutral, reduce_expr,
                                            arguments=arguments,
                                            name=name + "_stage2",
                                            options=options,
                                            map_exprs=map_exprs,
                                            preamble=preamble,
                                            max_group_size=max_group_size)

    from pytools import any
    from pyopencl.tools import VectorArg

    assert any(
        isinstance(arg_tp, VectorArg)
        for arg_tp in self.stage_1_inf.arg_types), \
        "ReductionKernel can only be used with functions " \
        "that have at least one vector argument"
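# Hedged usage sketch for the multi-output reduction constructor above
# (instantiated as OCLMultiReductionKernel / OCLReductionKernel2 elsewhere in
# this file): one pass evaluates several map expressions over the same data
# and reduces each separately. The outs= keyword follows the calls in
# time_multi/_setup_gpu; array sizes are illustrative.
import numpy as np
from gputools import OCLArray

k = OCLMultiReductionKernel(np.float32,
                            neutral="0", reduce_expr="a+b",
                            map_exprs=["a[i]", "a[i]*b[i]"],
                            arguments="__global float *a, __global float *b")

a_g = OCLArray.from_array(np.ones(10000, np.float32))
b_g = OCLArray.from_array(2 * np.ones(10000, np.float32))
outs = [OCLArray.empty(1, np.float32) for _ in range(2)]

k(a_g, b_g, outs=outs)
print([float(o.get()) for o in outs])   # expected: [10000.0, 20000.0]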
def get(self, **kwargs):
    queue = get_device().queue
    imshape = self.imshape()
    dshape = imshape[::-1]
    ndim = len(imshape)

    if self.num_channels > 1:
        dshape += (self.num_channels,)
        # dshape = (self.num_channels,) + dshape

    out = np.empty(dshape, dtype=self.dtype)
    # cl.enqueue_read_image(queue, self, [0] * ndim, imshape, out)
    cl.enqueue_copy(queue, out, self,
                    origin=(0,) * ndim,
                    region=imshape)
    return out
def fft_plan(shape, dtype=np.complex64, axes=None, fast_math=True):
    """returns a reikna FFT plan for arrays of the given shape"""
    # if not axes is None and any([a<0 for a in axes]):
    #     raise NotImplementedError("indices of axes have to be non negative, but are: %s"%str(axes))

    axes = _convert_axes_to_absolute(shape, axes)

    mock_buffer = MockBuffer(dtype, shape)

    fft_plan = FFT(mock_buffer, axes=axes).compile(
        cluda.ocl_api().Thread(get_device().queue),
        fast_math=fast_math)

    return fft_plan
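# Hedged usage sketch: build a plan once and reuse it for many transforms of
# same-shaped arrays, as done in time_gpu above. Shape and iteration count are
# illustrative.
import numpy as np
from gputools import OCLArray, fft, fft_plan

d_g = OCLArray.empty((256, 256), np.complex64)
plan = fft_plan(d_g.shape)

for _ in range(100):
    fft(d_g, inplace=True, plan=plan)   # no per-call plan creation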
def test_time_to_render():
    import time
    from gputools import get_device
    get_device().print_info()

    N = 256
    x = np.linspace(-1, 1, N)
    Z, Y, X = np.meshgrid(x, x, x, indexing="ij")
    R = np.sqrt(X ** 2 + Y ** 2 + Z ** 2)
    d = 10000 * np.exp(-10 * R ** 2)

    rend = VolumeRenderer((600, 600))
    rend.set_modelView(mat4_translate(0, 0, -10.))
    # rend.set_box_boundaries(.3*np.array([-1,1,-1,1,-1,1]))

    t1 = time.time()
    get_device().queue.finish()
    rend.set_data(d, autoConvert=True)
    get_device().queue.finish()

    t2 = time.time()
    get_device().queue.finish()
    rend.render(maxVal=10000.)
    out = rend.output
    get_device().queue.finish()

    print("time to set data %s^3:\t %.2f ms" % (N, 1000 * (t2 - t1)))
    print("time to render %s^3:\t %.2f ms" % (N, 1000 * (time.time() - t2)))
    return d, rend, out
def copy_buffer(self, buf):
    """copy content of buf into the image"""
    queue = get_device().queue
    imshape = self.imshape()
    assert imshape == buf.shape[::-1]
    ndim = len(imshape)
    cl.enqueue_copy(queue, self, buf.data,
                    offset=0,
                    origin=(0,) * ndim,
                    region=imshape)
def _fft_convolve_gpu(data_g, h_g, res_g=None,
                      plan=None,
                      inplace=False,
                      kernel_is_fft=False):
    """fft convolve for gpu buffers"""
    _complex_multiply_kernel = OCLElementwiseKernel(
        "cfloat_t *a, cfloat_t * b",
        "a[i] = cfloat_mul(b[i],a[i])", "mult")

    dev = get_device()

    assert_bufs_type(np.complex64, data_g, h_g)

    if data_g.shape != h_g.shape:
        raise ValueError("data and kernel must have same size! %s vs %s" %
                         (str(data_g.shape), str(h_g.shape)))

    if plan is None:
        plan = fft_plan(data_g.shape)

    if inplace:
        res_g = data_g
    else:
        if res_g is None:
            res_g = OCLArray.empty(data_g.shape, data_g.dtype)
        res_g.copy_buffer(data_g)

    if not kernel_is_fft:
        kern_g = OCLArray.empty(h_g.shape, h_g.dtype)
        kern_g.copy_buffer(h_g)
        fft(kern_g, inplace=True, plan=plan)
    else:
        kern_g = h_g

    fft(res_g, inplace=True, plan=plan)

    # multiply in fourier domain
    _complex_multiply_kernel(res_g, kern_g)

    fft(res_g, inplace=True, inverse=True, plan=plan)

    return res_g
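# Hedged usage sketch: running the GPU-side convolution above directly on
# complex64 OCLArrays, precomputing the kernel fft once so it can be reused
# for several convolutions with the same psf. Shapes and the Gaussian psf are
# illustrative.
import numpy as np
from gputools import OCLArray, fft, fft_plan

shape = (256, 256)
x = np.linspace(-1, 1, shape[0])
Y, X = np.meshgrid(x, x, indexing="ij")

data_g = OCLArray.from_array(np.random.uniform(0, 1, shape).astype(np.complex64))
h = np.fft.fftshift(np.exp(-100 * (X ** 2 + Y ** 2)))   # centered psf, then fftshifted
h_g = OCLArray.from_array(h.astype(np.complex64))

plan = fft_plan(shape)
fft(h_g, inplace=True, plan=plan)       # kernel fft computed once

res_g = _fft_convolve_gpu(data_g, h_g, plan=plan, kernel_is_fft=True)
res = np.abs(res_g.get())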
def get(self, **kwargs):
    queue = get_device().queue

    if hasattr(self, "shape"):
        imshape = self.shape
    else:
        imshape = (self.width,)

    dshape = imshape[::-1]
    ndim = len(imshape)
    if self.num_channels > 1:
        dshape += (self.num_channels,)
        # dshape = (self.num_channels,) + dshape

    out = np.empty(dshape, dtype=self.dtype)
    cl.enqueue_read_image(queue, self, [0] * ndim, imshape, out)
    return out
def copy_buffer(self, buf):
    """copy content of buf into im"""
    queue = get_device().queue

    if hasattr(self, "shape"):
        imshape = self.shape
    else:
        imshape = (self.width,)

    assert imshape == buf.shape[::-1]
    ndim = len(imshape)
    cl.enqueue_copy(queue, self, buf.data,
                    offset=0,
                    origin=(0,) * ndim,
                    region=imshape)
def write_array(self, data):
    queue = get_device().queue
    # 1d images dont have a shape but only a width
    if hasattr(self, "shape"):
        imshape = self.shape
    else:
        imshape = (self.width,)

    ndim = len(imshape)
    dshape = data.shape
    # if clImg.format.channel_order in [cl.channel_order.RGBA,
    #                                   cl.channel_order.BGRA]:
    #     dshape = dshape[:-1]

    if dshape != imshape[::-1]:
        raise ValueError("write_array: wrong shape!", data.shape[::-1], imshape)
    else:
        pyopencl.enqueue_write_image(queue, self, [0] * ndim, imshape, data)
def empty(cls, shape, dtype, num_channels=1, channel_order=None):
    ctx = get_device().context
    if not len(shape) in [1, 2, 3]:
        raise ValueError(
            "dimension of shape wrong, should be 1...3 but is %s" % len(shape))

    mem_flags = pyopencl.mem_flags.READ_WRITE
    channel_type = pyopencl.DTYPE_TO_CHANNEL_TYPE[np.dtype(dtype)]

    _dict_channel_order = {1: pyopencl.channel_order.R,
                           2: pyopencl.channel_order.RG,
                           3: pyopencl.channel_order.RGB,
                           4: pyopencl.channel_order.RGBA}

    if channel_order is None:
        channel_order = _dict_channel_order[num_channels]

    fmt = pyopencl.ImageFormat(channel_order, channel_type)
    res = pyopencl.Image(ctx, mem_flags, fmt, shape=shape[::-1])
    res.dtype = dtype
    return res
def _convolve3_old(data, h, dev=None):
    """convolves 3d data with kernel h on the GPU device dev

    boundary conditions are clamping to edge,
    h is converted to float32

    if dev == None the default device is used
    """
    if dev is None:
        dev = get_device()

    if dev is None:
        raise ValueError("no OpenCLDevice found...")

    dtype = data.dtype.type

    dtypes_options = {np.float32: "",
                      np.uint16: "-D SHORTTYPE"}

    if not dtype in dtypes_options.keys():
        raise TypeError("data type %s not supported yet, please convert to:" % dtype,
                        dtypes_options.keys())

    prog = OCLProgram(abspath("kernels/convolve3.cl"),
                      build_options=dtypes_options[dtype])

    hbuf = OCLArray.from_array(h.astype(np.float32))
    img = OCLImage.from_array(data)
    res = OCLArray.empty(data.shape, dtype=np.float32)

    Ns = [np.int32(n) for n in data.shape + h.shape]

    prog.run_kernel("convolve3d", img.shape, None,
                    img, hbuf.data, res.data,
                    *Ns)

    return res.get()
def from_array(cls, arr, *args, **kwargs):
    ctx = get_device().context
    if not arr.ndim in [2, 3, 4]:
        raise ValueError(
            "dimension of array wrong, should be 2...4 but is %s" % arr.ndim)
    elif arr.ndim == 4:
        num_channels = arr.shape[-1]
    else:
        num_channels = None

    if arr.dtype.type == np.complex64:
        num_channels = 2
        res = OCLImage.empty(arr.shape, dtype=np.float32, num_channels=num_channels)
        res.write_array(arr)
        res.dtype = np.float32
    else:
        res = cl.image_from_array(ctx, arr, num_channels=num_channels,
                                  *args, **kwargs)
        res.dtype = arr.dtype

    res.num_channels = num_channels
    return res
def copy_image(self, img, **kwargs):
    queue = get_device().queue
    return cl.enqueue_copy(queue, self.data, img,
                           offset=0,
                           origin=(0, 0),
                           region=img.shape,
                           **kwargs)
def write_array(self, data, **kwargs):
    queue = get_device().queue
    return cl.enqueue_write_buffer(queue, self.data, data, **kwargs)
def copy_buffer(self, buf, **kwargs):
    queue = get_device().queue
    return cl.enqueue_copy(queue, self.data, buf.data, **kwargs)