def _convolve_buf(data_g, h_g, res_g=None):
    """Convolve the float32 buffer *data_g* with the float32 filter *h_g*.

    Tries the constant-memory kernel first; if the filter is too big for
    constant memory (pyopencl LogicError with code -52), retries with the
    global-memory kernel variant.  Returns the result buffer.
    """
    assert_bufs_type(np.float32, data_g, h_g)
    prog = OCLProgram(abspath("kernels/convolve.cl"))
    if res_g is None:
        res_g = OCLArray.empty(data_g.shape, dtype=np.float32)

    h_extents = [np.int32(n) for n in h_g.shape]
    ndim = len(data_g.shape)
    gsize = data_g.shape[::-1]

    try:
        prog.run_kernel("convolve%sd_buf" % ndim, gsize, None,
                        data_g.data, h_g.data, res_g.data, *h_extents)
    except cl.cffi_cl.LogicError as err:
        # -52: kernel too big for constant memory -> global-memory fallback
        if err.code != -52:
            raise err
        prog.run_kernel("convolve%sd_buf_global" % ndim, gsize, None,
                        data_g.data, h_g.data, res_g.data, *h_extents)
    return res_g
def _filt(data_g, size=(3, 3, 3), res_g=None):
    """Apply the templated generic 3d reduce filter (FUNC/DEFAULT) to a
    float32 buffer, one axis at a time (x, then y, then z).

    Renders the kernel template with the per-axis filter sizes, then
    ping-pongs the data between *res_g* and a scratch buffer.
    """
    assert_bufs_type(np.float32, data_g)

    with open(abspath("kernels/generic_reduce_filter.cl"), "r") as f:
        rendered = Template(f.read()).render(
            FSIZE_X=size[-1],
            FSIZE_Y=size[-2],
            FSIZE_Z=size[-3],
            FUNC=FUNC,
            DEFAULT=DEFAULT,
        )
    prog = OCLProgram(src_str=rendered)

    scratch_g = OCLArray.empty_like(data_g)
    if res_g is None:
        res_g = OCLArray.empty_like(data_g)

    gsize = data_g.shape[::-1]
    prog.run_kernel("filter_3_x", gsize, None, data_g.data, res_g.data)
    prog.run_kernel("filter_3_y", gsize, None, res_g.data, scratch_g.data)
    prog.run_kernel("filter_3_z", gsize, None, scratch_g.data, res_g.data)
    return res_g
def _ocl_fft_gpu(plan, ocl_arr, res_arr=None, inverse=False):
    """Out-of-place FFT of the complex64 buffer *ocl_arr* using *plan*.

    Allocates the output buffer when *res_arr* is omitted and returns it.
    """
    assert_bufs_type(np.complex64, ocl_arr)
    out = OCLArray.empty_like(ocl_arr) if res_arr is None else res_arr
    plan(ocl_arr, out, inverse=inverse)
    return out
def _ocl_fft_gpu_inplace(ocl_arr, inverse=False, plan=None):
    """In-place FFT of the complex64 buffer *ocl_arr*.

    Builds a fresh Plan on the current device queue when none is supplied.
    """
    assert_bufs_type(np.complex64, ocl_arr)
    fft_plan_ = plan if plan is not None else Plan(ocl_arr.shape, queue=get_device().queue)
    fft_plan_.execute(ocl_arr.data, ocl_arr.data, inverse=inverse)
def _ocl_fft_gpu(plan, ocl_arr, res_arr=None, inverse=False, batch=1):
    """Batched out-of-place FFT of the complex64 buffer *ocl_arr* via *plan*.

    Allocates the output when *res_arr* is omitted; returns the result buffer.
    """
    assert_bufs_type(np.complex64, ocl_arr)
    out = res_arr if res_arr is not None else OCLArray.empty(ocl_arr.shape, np.complex64)
    plan.execute(ocl_arr.data, out.data, inverse=inverse, batch=batch)
    return out
def _ocl_fft_gpu_inplace(ocl_arr, inverse=False, plan=None):
    """Transform the complex64 buffer *ocl_arr* in place.

    A Plan on the current device queue is created on demand.
    """
    assert_bufs_type(np.complex64, ocl_arr)
    if plan is None:
        plan = Plan(ocl_arr.shape, queue=get_device().queue)
    buf = ocl_arr.data
    plan.execute(buf, buf, inverse=inverse)
def _max_filter_gpu(data_g, size=5, res_g=None):
    """Maximum filter on a 2d or 3d float32 GPU buffer.

    Parameters
    ----------
    data_g : OCLArray of float32, 2d or 3d
        input buffer
    size : int or tuple of int
        filter footprint; a scalar is broadcast to every axis
        (previously the scalar default crashed the len() check)
    res_g : OCLArray, optional
        output buffer

    Returns the filtered buffer.

    Raises ValueError if size does not match the data dimensionality, and
    NotImplementedError for dimensions other than 2 or 3.
    """
    assert_bufs_type(np.float32, data_g)
    ndim = len(data_g.shape)
    # backward-compatible generalization: accept a scalar size
    if np.isscalar(size):
        size = (size,) * ndim
    # raise instead of assert so validation survives `python -O`
    if len(size) != ndim:
        raise ValueError("size must have one entry per data axis (%s vs %s)"
                         % (len(size), ndim))
    if ndim == 2:
        return _filter_max_2_gpu(data_g, size=size, res_g=res_g)
    elif ndim == 3:
        return _filter_max_3_gpu(data_g, size=size, res_g=res_g)
    else:
        raise NotImplementedError("only 2 or 3d arrays are supported for now")
def _integral3_buf(x_g, res_g = None, tmp_g = None):
    """Compute the 3d integral image (cumulative sum along all three axes)
    of *x_g* on the GPU via a recursive blocked prefix-scan.

    x_g: 3d OCLArray whose dtype must be a key of _output_type_dict.
    res_g, tmp_g: optional output / scratch buffers of the output dtype.
    Returns res_g.  Raises ValueError for unsupported input dtypes.
    """
    if not x_g.dtype.type in _output_type_dict:
        raise ValueError("dtype %s currently not supported! (%s)" % (x_g.dtype.type, str(_output_type_dict.keys())))

    # map input dtype to the output dtype and their OpenCL type names
    dtype_out = _output_type_dict[x_g.dtype.type]
    cl_dtype_in = cl_buffer_datatype_dict[x_g.dtype.type]
    cl_dtype_out = cl_buffer_datatype_dict[dtype_out]
    dtype_itemsize = np.dtype(dtype_out).itemsize
    max_local_size = get_device().get_info("MAX_WORK_GROUP_SIZE")
    prog = OCLProgram(abspath("kernels/integral_image.cl"), build_options=["-D", "DTYPE=%s" % cl_dtype_out])

    # promote the input to the output dtype before scanning
    if x_g.dtype.type != dtype_out:
        x_g = x_g.astype(dtype_out)
    if tmp_g is None:
        tmp_g = OCLArray.empty(x_g.shape, dtype_out)
    if res_g is None:
        res_g = OCLArray.empty(x_g.shape, dtype_out)
    assert_bufs_type(dtype_out, tmp_g, res_g)
    nz, ny, nx = x_g.shape

    def _scan_single(src, dst, ns, strides):
        # Blocked 1d prefix scan of src into dst along the axis selected by
        # `strides`; ns/strides are given in (x, y, z) order for that pass.
        nx, ny, nz = ns
        stride_x, stride_y, stride_z = strides
        # local work size: a power of two, bounded by half the device max
        loc = min(next_power_of_2(nx // 2), max_local_size // 2)
        nx_block = 2 * loc
        # pad the scan length up to a whole number of blocks
        nx_pad = math.ceil(nx / nx_block) * nx_block
        nblocks = math.ceil(nx_pad // 2 / loc)
        # per-block partial sums, scanned recursively below
        sum_blocks = OCLArray.empty((nz, ny, nblocks), dst.dtype)
        # local (shared) scratch: 2 elements per work item
        shared = cl.LocalMemory(2 * dtype_itemsize * loc)
        for b in range(nblocks):
            offset = b * loc
            prog.run_kernel("scan3d", (loc, ny, nz), (loc, 1, 1), src.data, dst.data, sum_blocks.data, shared, np.int32(nx_block), np.int32(stride_x), np.int32(stride_y), np.int32(stride_z), np.int32(offset), np.int32(b), np.int32(nblocks), np.int32(ny), np.int32(nx))
        if nblocks > 1:
            # scan the block sums themselves, then add them back in
            _scan_single(sum_blocks, sum_blocks, (nblocks, ny, nz), (1, nblocks, nblocks * ny))
            prog.run_kernel("add_sums3d", (nx_pad, ny, nz), (nx_block, 1, 1), sum_blocks.data, dst.data, np.int32(stride_x), np.int32(stride_y), np.int32(stride_z), np.int32(nblocks), np.int32(ny), np.int32(nx))

    # one scan pass per axis: x, then y, then z (via stride permutations)
    _scan_single(x_g, res_g, (nx, ny, nz), (1, nx, nx * ny))
    _scan_single(res_g, tmp_g, (ny, nx, nz), (nx, 1, nx * ny))
    _scan_single(tmp_g, res_g, (nz, nx, ny), (ny * nx, 1, nx))
    return res_g
def _ocl_fft_gpu(ocl_arr, res_arr=None, inverse=False, plan=None):
    """Out-of-place FFT of the complex64 buffer *ocl_arr*.

    Creates a Plan on the current device queue and/or the output buffer on
    demand; returns the result buffer.
    """
    assert_bufs_type(np.complex64, ocl_arr)
    fft_plan_ = Plan(ocl_arr.shape, queue=get_device().queue) if plan is None else plan
    out = OCLArray.empty(ocl_arr.shape, np.complex64) if res_arr is None else res_arr
    fft_plan_.execute(ocl_arr.data, out.data, inverse=inverse)
    return out
def _ocl_fft_gpu(ocl_arr, res_arr=None, inverse=False, plan=None):
    """Out-of-place FFT of the complex64 buffer *ocl_arr*; returns the
    result buffer, allocating plan and output when not supplied."""
    assert_bufs_type(np.complex64, ocl_arr)
    if plan is None:
        plan = Plan(ocl_arr.shape, queue=get_device().queue)
    if res_arr is None:
        res_arr = OCLArray.empty(ocl_arr.shape, np.complex64)
    plan.execute(ocl_arr.data, res_arr.data, inverse=inverse)
    return res_arr
def _filter_max_2_gpu(data_g, size=10, res_g=None):
    """2d maximum filter on a float32 buffer, done as two separable passes
    (x with size[-1], then y with size[-2]).  Returns the result buffer."""
    assert_bufs_type(np.float32, data_g)
    prog = OCLProgram(abspath("kernels/minmax_filter.cl"))

    inter_g = OCLArray.empty_like(data_g)
    if res_g is None:
        res_g = OCLArray.empty_like(data_g)

    gsize = data_g.shape[::-1]
    prog.run_kernel("max_2_x", gsize, None, data_g.data, inter_g.data, np.int32(size[-1]))
    prog.run_kernel("max_2_y", gsize, None, inter_g.data, res_g.data, np.int32(size[-2]))
    return res_g
def _fft_convolve_gpu(data_g, h_g, res_g=None, plan=None, inplace=False, kernel_is_fft=False):
    """FFT convolution of two complex64 GPU buffers of identical shape.

    With inplace=True the input buffer itself is overwritten and returned;
    with kernel_is_fft=True, *h_g* is taken as already transformed.
    Raises ValueError when the shapes differ.
    """
    _complex_multiply_kernel = OCLElementwiseKernel(
        "cfloat_t *a, cfloat_t * b",
        "a[i] = cfloat_mul(b[i],a[i])", "mult")

    dev = get_device()
    assert_bufs_type(np.complex64, data_g, h_g)

    if data_g.shape != h_g.shape:
        raise ValueError("data and kernel must have same size! %s vs %s " % (str(data_g.shape), str(h_g.shape)))

    if plan is None:
        plan = fft_plan(data_g.shape)

    if inplace:
        res_g = data_g
    else:
        if res_g is None:
            res_g = OCLArray.empty(data_g.shape, data_g.dtype)
        res_g.copy_buffer(data_g)

    if kernel_is_fft:
        kern_g = h_g
    else:
        kern_g = OCLArray.empty(h_g.shape, h_g.dtype)
        kern_g.copy_buffer(h_g)
        fft(kern_g, inplace=True, plan=plan)

    fft(res_g, inplace=True, plan=plan)
    # pointwise product in the fourier domain, then transform back
    _complex_multiply_kernel(res_g, kern_g)
    fft(res_g, inplace=True, inverse=True, plan=plan)
    return res_g
def _convolve_sep2_gpu(data_g, hx_g, hy_g, res_g=None):
    """Separable 2d convolution of a float32 buffer: x-pass with *hx_g*
    followed by a y-pass with *hy_g*.  Returns the result buffer."""
    assert_bufs_type(np.float32, data_g, hx_g, hy_g)
    prog = OCLProgram(abspath("kernels/convolve_sep.cl"))

    len_y, len_x = hy_g.shape[0], hx_g.shape[0]
    inter_g = OCLArray.empty_like(data_g)
    if res_g is None:
        res_g = OCLArray.empty_like(data_g)

    gsize = data_g.shape[::-1]
    prog.run_kernel("conv_sep2_x", gsize, None, data_g.data, hx_g.data, inter_g.data, np.int32(len_x))
    prog.run_kernel("conv_sep2_y", gsize, None, inter_g.data, hy_g.data, res_g.data, np.int32(len_y))
    return res_g
def _convolve_sep2_gpu(data_g, hx_g, hy_g, res_g=None):
    """Convolve a float32 buffer with a separable 2d kernel given by the 1d
    filters *hx_g* (x axis) and *hy_g* (y axis)."""
    assert_bufs_type(np.float32, data_g, hx_g, hy_g)
    prog = OCLProgram(abspath("kernels/convolve_sep.cl"))

    ny = np.int32(hy_g.shape[0])
    nx = np.int32(hx_g.shape[0])

    scratch_g = OCLArray.empty_like(data_g)
    res_g = OCLArray.empty_like(data_g) if res_g is None else res_g

    # x-pass into the scratch buffer, then y-pass into the output
    prog.run_kernel("conv_sep2_x", data_g.shape[::-1], None, data_g.data, hx_g.data, scratch_g.data, nx)
    prog.run_kernel("conv_sep2_y", data_g.shape[::-1], None, scratch_g.data, hy_g.data, res_g.data, ny)
    return res_g
def _fft_convolve_gpu(data_g, h_g, res_g = None, plan = None, inplace = False, kernel_is_fft = False):
    """FFT convolution for complex64 GPU buffers of identical shape.

    Parameters
    ----------
    data_g, h_g : OCLArray of complex64, same shape
    res_g : optional output buffer (ignored when inplace=True)
    plan : optional fft plan, created on demand
    inplace : overwrite *data_g* and return it
    kernel_is_fft : treat *h_g* as already fourier-transformed

    Returns the convolved buffer.  Raises ValueError on shape mismatch.
    """
    dev = get_device()
    assert_bufs_type(np.complex64, data_g, h_g)

    if data_g.shape != h_g.shape:
        raise ValueError("data and kernel must have same size! %s vs %s " % (str(data_g.shape), str(h_g.shape)))

    if plan is None:
        plan = fft_plan(data_g.shape)

    if inplace:
        res_g = data_g
    else:
        if res_g is None:
            res_g = OCLArray.empty(data_g.shape, data_g.dtype)
        res_g.copy_buffer(data_g)

    if not kernel_is_fft:
        kern_g = OCLArray.empty(h_g.shape, h_g.dtype)
        kern_g.copy_buffer(h_g)
        fft(kern_g, inplace=True, plan=plan)
    else:
        kern_g = h_g

    fft(res_g, inplace=True, plan=plan)

    # multiply in fourier domain
    # (removed a leftover Python-2 debug `print res_g.dtype, res_g.nbytes`,
    #  which is a SyntaxError on Python 3)
    _complex_multiply_kernel(res_g, kern_g)

    fft(res_g, inplace=True, inverse=True, plan=plan)
    return res_g
def _convolve_buf(data_g, h_g, res_g=None):
    """Convolve the float32 buffer *data_g* with the filter buffer *h_g*
    using the nd convolve kernel matching the data dimensionality."""
    assert_bufs_type(np.float32, data_g, h_g)
    prog = OCLProgram(abspath("kernels/convolve.cl"))

    if res_g is None:
        res_g = OCLArray.empty(data_g.shape, dtype=np.float32)

    h_extents = [np.int32(n) for n in h_g.shape]
    prog.run_kernel("convolve%sd_buf" % len(data_g.shape),
                    data_g.shape[::-1], None,
                    data_g.data, h_g.data, res_g.data, *h_extents)
    return res_g
def _ocl_fft_gpu_inplace(plan, ocl_arr, inverse=False):
    """Run *plan* on the complex64 buffer *ocl_arr*, writing back in place."""
    assert_bufs_type(np.complex64, ocl_arr)
    # source and destination are the same buffer -> in-place transform
    plan(ocl_arr, ocl_arr, inverse=inverse)
def _ocl_fft_gpu_inplace(plan, ocl_arr, inverse=False, batch=1):
    """Batched in-place FFT of the complex64 buffer *ocl_arr* via *plan*."""
    assert_bufs_type(np.complex64, ocl_arr)
    buf = ocl_arr.data
    plan.execute(buf, buf, inverse=inverse, batch=batch)