示例#1
0
def _convolve_buf(data_g, h_g, res_g=None):
    """Convolve the float32 buffer ``data_g`` with the kernel buffer ``h_g``.

    The kernel ``convolve<N>d_buf`` (``N = len(data_g.shape)``) from
    ``kernels/convolve.cl`` is launched first.  If that launch raises a
    ``LogicError`` with OpenCL error code -52 (``CL_INVALID_KERNEL_ARGS``),
    the kernel ``convolve<N>d_buf_global`` is launched instead with the same
    arguments.

    Parameters
    ----------
    data_g : OCLArray
        Input buffer; checked to be float32.
    h_g : OCLArray
        Convolution kernel buffer; checked to be float32.
    res_g : OCLArray, optional
        Output buffer.  When omitted, a float32 buffer with the shape of
        ``data_g`` is allocated.

    Returns
    -------
    OCLArray
        ``res_g``, which the kernel was given as its output argument.

    Raises
    ------
    cl.cffi_cl.LogicError
        Any launch error whose code is not -52 is re-raised unchanged.
    """
    assert_bufs_type(np.float32, data_g, h_g)

    prog = OCLProgram(abspath("kernels/convolve.cl"))

    if res_g is None:
        res_g = OCLArray.empty(data_g.shape, dtype=np.float32)

    ndim = len(data_g.shape)
    kernel_dims = [np.int32(n) for n in h_g.shape]

    # OpenCL error code CL_INVALID_KERNEL_ARGS.
    invalid_kernel_args = -52

    def _launch(kernel_name):
        # Both kernel variants take exactly the same launch arguments.
        prog.run_kernel(kernel_name, data_g.shape[::-1], None,
                        data_g.data, h_g.data, res_g.data,
                        *kernel_dims)

    try:
        _launch("convolve%sd_buf" % ndim)
    except cl.cffi_cl.LogicError as e:
        # Per the original author, this error shows up when the kernel is
        # too big for constant memory; anything else is not ours to handle.
        if e.code != invalid_kernel_args:
            raise
        _launch("convolve%sd_buf_global" % ndim)

    return res_g
示例#2
0
    def _filt(data_g, size=(3, 3, 3), res_g=None):
        """Run the three ``filter_3_*`` kernels on a float32 buffer.

        The OpenCL source is rendered from the template file
        ``kernels/generic_reduce_filter.cl``.  ``FSIZE_X``, ``FSIZE_Y`` and
        ``FSIZE_Z`` are taken from the last, second-to-last and
        third-to-last entries of ``size``; ``FUNC`` and ``DEFAULT`` are names
        defined outside this function.

        Parameters
        ----------
        data_g : OCLArray
            Input buffer; checked to be float32.
        size : sequence
            Filter extents; at least three entries are indexed.
        res_g : OCLArray, optional
            Output buffer; allocated like ``data_g`` when omitted.

        Returns
        -------
        OCLArray
            ``res_g``, the destination of the final (``z``) pass.
        """
        assert_bufs_type(np.float32, data_g)

        with open(abspath("kernels/generic_reduce_filter.cl"), "r") as src_file:
            template = Template(src_file.read())

        source = template.render(FSIZE_X=size[-1],
                                 FSIZE_Y=size[-2],
                                 FSIZE_Z=size[-3],
                                 FUNC=FUNC,
                                 DEFAULT=DEFAULT)
        prog = OCLProgram(src_str=source)

        scratch_g = OCLArray.empty_like(data_g)
        if res_g is None:
            res_g = OCLArray.empty_like(data_g)

        launch_size = data_g.shape[::-1]

        # The passes ping-pong between res_g and the scratch buffer so the
        # last pass ends up writing into res_g.
        passes = (
            ("filter_3_x", data_g, res_g),
            ("filter_3_y", res_g, scratch_g),
            ("filter_3_z", scratch_g, res_g),
        )
        for kernel_name, src_g, dst_g in passes:
            prog.run_kernel(kernel_name, launch_size, None,
                            src_g.data, dst_g.data)

        return res_g
示例#3
0
def _ocl_fft_gpu(plan, ocl_arr, res_arr=None, inverse=False):
    """Call ``plan`` on a complex64 buffer and return the output buffer.

    Parameters
    ----------
    plan : callable
        Invoked as ``plan(ocl_arr, out, inverse=inverse)``.
    ocl_arr : OCLArray
        Input buffer; checked to be complex64.
    res_arr : OCLArray, optional
        Output buffer; allocated like ``ocl_arr`` when omitted.
    inverse : bool
        Forwarded to ``plan`` unchanged.

    Returns
    -------
    OCLArray
        The buffer that was handed to ``plan`` as its output.
    """
    assert_bufs_type(np.complex64, ocl_arr)

    out = OCLArray.empty_like(ocl_arr) if res_arr is None else res_arr
    plan(ocl_arr, out, inverse=inverse)
    return out
示例#4
0
def _ocl_fft_gpu_inplace(ocl_arr, inverse=False, plan=None):
    """Execute a plan with ``ocl_arr`` as both input and output.

    Parameters
    ----------
    ocl_arr : OCLArray
        Buffer to transform; checked to be complex64.
    inverse : bool
        Forwarded to ``plan.execute`` unchanged.
    plan : Plan, optional
        Plan to execute.  When omitted, one is built for ``ocl_arr.shape``
        on the queue of the current device.
    """
    assert_bufs_type(np.complex64, ocl_arr)

    active_plan = (
        Plan(ocl_arr.shape, queue=get_device().queue)
        if plan is None
        else plan
    )

    # Same buffer on both sides: the result overwrites the input.
    active_plan.execute(ocl_arr.data, ocl_arr.data, inverse=inverse)
示例#5
0
def _ocl_fft_gpu(plan, ocl_arr, res_arr=None, inverse=False, batch=1):
    """Execute ``plan`` from ``ocl_arr`` into an output buffer.

    Parameters
    ----------
    plan : object
        Must provide ``execute(in_data, out_data, inverse=..., batch=...)``.
    ocl_arr : OCLArray
        Input buffer; checked to be complex64.
    res_arr : OCLArray, optional
        Output buffer.  When omitted, a complex64 buffer with the shape of
        ``ocl_arr`` is allocated.
    inverse : bool
        Forwarded to ``plan.execute`` unchanged.
    batch : int
        Forwarded to ``plan.execute`` unchanged.

    Returns
    -------
    OCLArray
        The output buffer.
    """
    assert_bufs_type(np.complex64, ocl_arr)

    out = res_arr
    if out is None:
        out = OCLArray.empty(ocl_arr.shape, np.complex64)

    plan.execute(ocl_arr.data, out.data, inverse=inverse, batch=batch)
    return out
示例#6
0
def _ocl_fft_gpu_inplace(ocl_arr, inverse=False, plan=None):
    """Execute a plan that reads from and writes to ``ocl_arr``.

    Parameters
    ----------
    ocl_arr : OCLArray
        Buffer to transform; checked to be complex64.
    inverse : bool
        Forwarded to ``plan.execute`` unchanged.
    plan : Plan, optional
        Plan to execute; built from ``ocl_arr.shape`` and the current
        device's queue when omitted.
    """
    assert_bufs_type(np.complex64, ocl_arr)

    chosen_plan = plan
    if chosen_plan is None:
        device_queue = get_device().queue
        chosen_plan = Plan(ocl_arr.shape, queue=device_queue)

    # Input and output are the same buffer, so the data is overwritten.
    chosen_plan.execute(ocl_arr.data, ocl_arr.data, inverse=inverse)
示例#7
0
def _max_filter_gpu(data_g, size=5, res_g=None):
    """Dispatch a maximum filter to the 2D or 3D GPU implementation.

    Parameters
    ----------
    data_g : OCLArray
        Input buffer; checked to be float32.
    size : scalar or sequence
        Filter extent.  A scalar (including the default) is expanded to one
        entry per dimension of ``data_g``; a sequence must have exactly one
        entry per dimension.
    res_g : OCLArray, optional
        Output buffer, forwarded to the implementation.

    Returns
    -------
    OCLArray
        Whatever the selected implementation returns.

    Raises
    ------
    ValueError
        If ``size`` is a sequence whose length differs from the number of
        dimensions of ``data_g``.
    NotImplementedError
        If ``data_g`` is neither 2D nor 3D.
    """
    assert_bufs_type(np.float32, data_g)

    ndim = len(data_g.shape)

    # The default size is a plain int; calling len() on it would raise
    # TypeError, so expand scalars to a per-axis tuple first.
    if np.isscalar(size):
        size = (size,) * ndim

    # Explicit raise rather than assert so the check survives `python -O`.
    if len(size) != ndim:
        raise ValueError(
            "size must have one entry per data dimension (got %d for %d dims)"
            % (len(size), ndim))

    if ndim == 2:
        return _filter_max_2_gpu(data_g, size=size, res_g=res_g)
    if ndim == 3:
        return _filter_max_3_gpu(data_g, size=size, res_g=res_g)
    raise NotImplementedError("only 2 or 3d arrays are supported for now")
def _integral3_buf(x_g, res_g = None, tmp_g = None):
    """Run the ``integral_image.cl`` scan kernels over a 3D buffer.

    The input is converted to the output dtype that ``_output_type_dict``
    maps its dtype to, and ``_scan_single`` is then applied three times with
    different axis orderings, ping-ponging between ``res_g`` and ``tmp_g``.
    Presumably this yields the 3D integral image (cumulative sum along every
    axis) -- the actual arithmetic lives in the OpenCL kernels, which are
    not visible here.

    Parameters
    ----------
    x_g : OCLArray
        Input buffer; its shape is unpacked into three values, so it must be
        three-dimensional.
    res_g : OCLArray, optional
        Output buffer of the output dtype; allocated when omitted.
    tmp_g : OCLArray, optional
        Scratch buffer of the output dtype; allocated when omitted.

    Returns
    -------
    OCLArray
        ``res_g``, the destination of the last scan pass.

    Raises
    ------
    ValueError
        If the dtype of ``x_g`` is not a key of ``_output_type_dict``.
    """
    if not x_g.dtype.type in _output_type_dict:
        raise ValueError("dtype %s currently not supported! (%s)" % (x_g.dtype.type, str(_output_type_dict.keys())))

    dtype_out = _output_type_dict[x_g.dtype.type]
    # NOTE(review): cl_dtype_in is assigned but never used in this function.
    cl_dtype_in = cl_buffer_datatype_dict[x_g.dtype.type]
    cl_dtype_out = cl_buffer_datatype_dict[dtype_out]

    dtype_itemsize = np.dtype(dtype_out).itemsize

    max_local_size = get_device().get_info("MAX_WORK_GROUP_SIZE")
    # The kernels are compiled with DTYPE defined as the OpenCL type name
    # looked up for the output dtype.
    prog = OCLProgram(abspath("kernels/integral_image.cl"),
                      build_options=["-D", "DTYPE=%s" % cl_dtype_out])
    # Convert the input up front so every buffer passed to the kernels has
    # the output dtype.
    if x_g.dtype.type != dtype_out:
        x_g = x_g.astype(dtype_out)

    if tmp_g is None:
        tmp_g = OCLArray.empty(x_g.shape, dtype_out)
    if res_g is None:
        res_g = OCLArray.empty(x_g.shape, dtype_out)

    assert_bufs_type(dtype_out, tmp_g, res_g)

    # Fails with ValueError unless x_g has exactly three dimensions.
    nz, ny, nx = x_g.shape

    def _scan_single(src, dst, ns, strides):
        """Launch ``scan3d`` block by block from ``src`` into ``dst``.

        ``ns`` gives the three extents and ``strides`` the matching element
        strides; the first entry of each is the axis that is split into
        blocks.  When more than one block is needed, the per-block sums are
        scanned recursively and ``add_sums3d`` is launched afterwards
        (presumably to add the block offsets back onto ``dst``).
        """
        # These names shadow the outer nx/ny/nz for the duration of the call.
        nx, ny, nz = ns
        stride_x, stride_y, stride_z = strides
        # loc is capped at half the device's MAX_WORK_GROUP_SIZE; one block
        # covers 2 * loc elements.
        # NOTE(review): for nx == 1 this calls next_power_of_2(0); confirm
        # the helper returns a usable (non-zero) value in that case.
        loc = min(next_power_of_2(nx // 2), max_local_size // 2)
        nx_block = 2 * loc
        # nx rounded up to a multiple of nx_block.
        # NOTE(review): relies on true division; under Python 2 without
        # `from __future__ import division` the `/` would floor first.
        nx_pad = math.ceil(nx / nx_block) * nx_block

        # nx_pad is a multiple of 2 * loc, so this is the number of blocks.
        nblocks = math.ceil(nx_pad // 2 / loc)
        sum_blocks = OCLArray.empty((nz, ny, nblocks), dst.dtype)
        # Local-memory argument sized for 2 * loc elements of the output dtype.
        shared = cl.LocalMemory(2 * dtype_itemsize * loc)
        # One kernel launch per block; the block's offset (b * loc) and its
        # index b are passed to the kernel.
        for b in range(nblocks):
            offset = b * loc
            prog.run_kernel("scan3d", (loc, ny, nz), (loc, 1, 1),
                            src.data, dst.data, sum_blocks.data, shared,
                            np.int32(nx_block),
                            np.int32(stride_x), np.int32(stride_y), np.int32(stride_z), np.int32(offset), np.int32(b),
                            np.int32(nblocks), np.int32(ny), np.int32(nx))
        if nblocks > 1:
            # Scan the block sums in place; the strides given match a
            # contiguous (nz, ny, nblocks) layout of sum_blocks.
            _scan_single(sum_blocks, sum_blocks, (nblocks, ny, nz), (1, nblocks, nblocks * ny))
            prog.run_kernel("add_sums3d", (nx_pad, ny, nz), (nx_block, 1, 1),
                            sum_blocks.data, dst.data,
                            np.int32(stride_x), np.int32(stride_y), np.int32(stride_z),
                            np.int32(nblocks), np.int32(ny), np.int32(nx))

    # Three passes, each with a different leading axis. Assuming a
    # C-contiguous (nz, ny, nx) layout, the strides correspond to scanning
    # along x, then y, then z -- TODO confirm against the kernel source.
    _scan_single(x_g, res_g, (nx, ny, nz), (1, nx, nx * ny))
    _scan_single(res_g, tmp_g, (ny, nx, nz), (nx, 1, nx * ny))
    _scan_single(tmp_g, res_g, (nz, nx, ny), (ny * nx, 1, nx))

    return res_g
示例#9
0
def _ocl_fft_gpu(ocl_arr, res_arr=None, inverse=False, plan=None):
    """Execute a plan from ``ocl_arr`` into an output buffer.

    Parameters
    ----------
    ocl_arr : OCLArray
        Input buffer; checked to be complex64.
    res_arr : OCLArray, optional
        Output buffer.  When omitted, a complex64 buffer with the shape of
        ``ocl_arr`` is allocated.
    inverse : bool
        Forwarded to ``plan.execute`` unchanged.
    plan : Plan, optional
        Plan to execute; built from ``ocl_arr.shape`` and the current
        device's queue when omitted.

    Returns
    -------
    OCLArray
        The output buffer.
    """
    assert_bufs_type(np.complex64, ocl_arr)

    active_plan = (
        Plan(ocl_arr.shape, queue=get_device().queue)
        if plan is None
        else plan
    )
    out = (
        OCLArray.empty(ocl_arr.shape, np.complex64)
        if res_arr is None
        else res_arr
    )

    active_plan.execute(ocl_arr.data, out.data, inverse=inverse)
    return out
示例#10
0
def _ocl_fft_gpu(ocl_arr, res_arr=None, inverse=False, plan=None):
    """Execute a plan on a complex64 buffer and return the result buffer.

    Parameters
    ----------
    ocl_arr : OCLArray
        Input buffer; checked to be complex64.
    res_arr : OCLArray, optional
        Output buffer; a complex64 buffer shaped like ``ocl_arr`` is
        allocated when omitted.
    inverse : bool
        Forwarded to ``plan.execute`` unchanged.
    plan : Plan, optional
        Plan to execute; created for ``ocl_arr.shape`` on the current
        device's queue when omitted.

    Returns
    -------
    OCLArray
        The output buffer.
    """
    assert_bufs_type(np.complex64, ocl_arr)

    chosen_plan = plan
    if chosen_plan is None:
        device_queue = get_device().queue
        chosen_plan = Plan(ocl_arr.shape, queue=device_queue)

    target = res_arr
    if target is None:
        target = OCLArray.empty(ocl_arr.shape, np.complex64)

    chosen_plan.execute(ocl_arr.data, target.data, inverse=inverse)
    return target
示例#11
0
def _filter_max_2_gpu(data_g, size=10, res_g=None):
    """Run the ``max_2_x`` and ``max_2_y`` kernels on a float32 buffer.

    Parameters
    ----------
    data_g : OCLArray
        Input buffer; checked to be float32.
    size : scalar or sequence
        Filter extent.  A scalar (including the default) is used for both
        passes; for a sequence the last entry goes to the ``x`` pass and the
        second-to-last to the ``y`` pass.
    res_g : OCLArray, optional
        Output buffer; allocated like ``data_g`` when omitted.

    Returns
    -------
    OCLArray
        ``res_g``, the destination of the ``y`` pass.
    """
    assert_bufs_type(np.float32, data_g)

    # The default size is a plain int, which cannot be indexed; expand
    # scalars so that size[-1] / size[-2] below are always valid.
    if np.isscalar(size):
        size = (size, size)

    prog = OCLProgram(abspath("kernels/minmax_filter.cl"))

    tmp_g = OCLArray.empty_like(data_g)

    if res_g is None:
        res_g = OCLArray.empty_like(data_g)

    # x pass writes to the scratch buffer, y pass reads it back into res_g.
    prog.run_kernel("max_2_x", data_g.shape[::-1], None, data_g.data,
                    tmp_g.data, np.int32(size[-1]))
    prog.run_kernel("max_2_y", data_g.shape[::-1], None, tmp_g.data,
                    res_g.data, np.int32(size[-2]))

    return res_g
示例#12
0
def _fft_convolve_gpu(data_g, h_g, res_g=None,
                      plan=None, inplace=False,
                      kernel_is_fft=False):
    """Convolve two complex64 GPU buffers via ``fft``.

    The data is transformed with ``fft``, multiplied element-wise with the
    transformed kernel, and transformed back with ``inverse=True``.

    Parameters
    ----------
    data_g : OCLArray
        Data buffer; checked to be complex64.
    h_g : OCLArray
        Kernel buffer of the same shape; checked to be complex64.
    res_g : OCLArray, optional
        Output buffer.  Ignored when ``inplace`` is true; otherwise
        allocated when omitted and filled with a copy of ``data_g``.
    plan : optional
        Plan passed to every ``fft`` call; created with
        ``fft_plan(data_g.shape)`` when omitted.
    inplace : bool
        When true, ``data_g`` itself is used as the working/output buffer.
    kernel_is_fft : bool
        When true, ``h_g`` is used as-is; otherwise a copy of it is
        transformed first.

    Returns
    -------
    OCLArray
        The working buffer holding the result.

    Raises
    ------
    ValueError
        If ``data_g`` and ``h_g`` differ in shape.
    """
    multiply = OCLElementwiseKernel(
        "cfloat_t *a, cfloat_t * b",
        "a[i] = cfloat_mul(b[i],a[i])", "mult")

    # The return value is not used; the call is kept in case it has
    # initialisation side effects -- TODO confirm.
    get_device()

    assert_bufs_type(np.complex64, data_g, h_g)

    if data_g.shape != h_g.shape:
        raise ValueError("data and kernel must have same size! %s vs %s "%(str(data_g.shape),str(h_g.shape)))

    if plan is None:
        plan = fft_plan(data_g.shape)

    if not inplace:
        if res_g is None:
            res_g = OCLArray.empty(data_g.shape, data_g.dtype)
        res_g.copy_buffer(data_g)
    else:
        res_g = data_g

    if kernel_is_fft:
        kernel_ft_g = h_g
    else:
        # Transform a private copy so the caller's kernel buffer is untouched.
        kernel_ft_g = OCLArray.empty(h_g.shape, h_g.dtype)
        kernel_ft_g.copy_buffer(h_g)
        fft(kernel_ft_g, inplace=True, plan=plan)

    fft(res_g, inplace=True, plan=plan)

    # multiply in fourier domain
    multiply(res_g, kernel_ft_g)

    fft(res_g, inplace=True, inverse=True, plan=plan)

    return res_g
示例#13
0
def _convolve_sep2_gpu(data_g, hx_g, hy_g, res_g=None):
    """Run the ``conv_sep2_x`` and ``conv_sep2_y`` kernels in sequence.

    Parameters
    ----------
    data_g : OCLArray
        Input buffer; checked to be float32.
    hx_g : OCLArray
        Kernel buffer for the ``x`` pass; checked to be float32.
    hy_g : OCLArray
        Kernel buffer for the ``y`` pass; checked to be float32.
    res_g : OCLArray, optional
        Output buffer; allocated like ``data_g`` when omitted.

    Returns
    -------
    OCLArray
        ``res_g``, the destination of the ``y`` pass.
    """
    assert_bufs_type(np.float32, data_g, hx_g, hy_g)

    prog = OCLProgram(abspath("kernels/convolve_sep.cl"))

    len_y, len_x = hy_g.shape[0], hx_g.shape[0]

    scratch_g = OCLArray.empty_like(data_g)
    if res_g is None:
        res_g = OCLArray.empty_like(data_g)

    launch_size = data_g.shape[::-1]

    # x pass writes into the scratch buffer, y pass reads it into res_g.
    passes = (
        ("conv_sep2_x", data_g, hx_g, scratch_g, len_x),
        ("conv_sep2_y", scratch_g, hy_g, res_g, len_y),
    )
    for kernel_name, src_g, h1d_g, dst_g, h_len in passes:
        prog.run_kernel(kernel_name, launch_size, None,
                        src_g.data, h1d_g.data, dst_g.data,
                        np.int32(h_len))

    return res_g
示例#14
0
def _convolve_sep2_gpu(data_g, hx_g, hy_g, res_g=None):
    """Apply the two separable-convolution kernels to a float32 buffer.

    ``conv_sep2_x`` is run with ``hx_g`` into a scratch buffer, then
    ``conv_sep2_y`` is run with ``hy_g`` from the scratch buffer into
    ``res_g``.

    Parameters
    ----------
    data_g : OCLArray
        Input buffer; checked to be float32.
    hx_g, hy_g : OCLArray
        Kernel buffers for the two passes; checked to be float32.  The
        first entry of each one's shape is passed to its kernel.
    res_g : OCLArray, optional
        Output buffer; allocated like ``data_g`` when omitted.

    Returns
    -------
    OCLArray
        ``res_g``.
    """
    assert_bufs_type(np.float32, data_g, hx_g, hy_g)

    prog = OCLProgram(abspath("kernels/convolve_sep.cl"))

    n_y, n_x = hy_g.shape[0], hx_g.shape[0]

    scratch_g = OCLArray.empty_like(data_g)
    out_g = OCLArray.empty_like(data_g) if res_g is None else res_g

    def _run_pass(kernel_name, src_g, h1d_g, dst_g, h_len):
        # Both passes share the launch size and argument layout.
        prog.run_kernel(kernel_name, data_g.shape[::-1], None, src_g.data,
                        h1d_g.data, dst_g.data, np.int32(h_len))

    _run_pass("conv_sep2_x", data_g, hx_g, scratch_g, n_x)
    _run_pass("conv_sep2_y", scratch_g, hy_g, out_g, n_y)

    return out_g
示例#15
0
def _fft_convolve_gpu(data_g, h_g, res_g=None,
                      plan=None, inplace=False,
                      kernel_is_fft=False):
    """Convolve two complex64 GPU buffers via ``fft``.

    The data is transformed with ``fft``, multiplied element-wise with the
    transformed kernel using the module-level ``_complex_multiply_kernel``,
    and transformed back with ``inverse=True``.

    Parameters
    ----------
    data_g : OCLArray
        Data buffer; checked to be complex64.
    h_g : OCLArray
        Kernel buffer of the same shape; checked to be complex64.
    res_g : OCLArray, optional
        Output buffer.  Ignored when ``inplace`` is true; otherwise
        allocated when omitted and filled with a copy of ``data_g``.
    plan : optional
        Plan passed to every ``fft`` call; created with
        ``fft_plan(data_g.shape)`` when omitted.
    inplace : bool
        When true, ``data_g`` itself is used as the working/output buffer.
    kernel_is_fft : bool
        When true, ``h_g`` is used as-is; otherwise a copy of it is
        transformed first.

    Returns
    -------
    OCLArray
        The working buffer holding the result.

    Raises
    ------
    ValueError
        If ``data_g`` and ``h_g`` differ in shape.
    """
    # The return value is not used; the call is kept in case it has
    # initialisation side effects -- TODO confirm.
    get_device()

    assert_bufs_type(np.complex64, data_g, h_g)

    if data_g.shape != h_g.shape:
        raise ValueError("data and kernel must have same size! %s vs %s "%(str(data_g.shape),str(h_g.shape)))

    if plan is None:
        plan = fft_plan(data_g.shape)

    if inplace:
        res_g = data_g
    else:
        if res_g is None:
            res_g = OCLArray.empty(data_g.shape, data_g.dtype)
        res_g.copy_buffer(data_g)

    if kernel_is_fft:
        kern_g = h_g
    else:
        # Transform a private copy so the caller's kernel buffer is untouched.
        kern_g = OCLArray.empty(h_g.shape, h_g.dtype)
        kern_g.copy_buffer(h_g)
        fft(kern_g, inplace=True, plan=plan)

    fft(res_g, inplace=True, plan=plan)

    # multiply in fourier domain
    # (the Python-2-only debug `print` that used to sit here was removed:
    # it is a SyntaxError on Python 3 and wrote to stdout on every call)
    _complex_multiply_kernel(res_g, kern_g)

    fft(res_g, inplace=True, inverse=True, plan=plan)

    return res_g
示例#16
0
def _convolve_buf(data_g, h_g, res_g=None):
    """Launch the ``convolve<N>d_buf`` kernel on float32 buffers.

    ``N`` is the number of dimensions of ``data_g``; the kernel comes from
    ``kernels/convolve.cl`` and receives the data, kernel and result
    buffers followed by each entry of ``h_g.shape`` as an int32.

    Parameters
    ----------
    data_g : OCLArray
        Input buffer; checked to be float32.
    h_g : OCLArray
        Convolution kernel buffer; checked to be float32.
    res_g : OCLArray, optional
        Output buffer.  When omitted, a float32 buffer with the shape of
        ``data_g`` is allocated.

    Returns
    -------
    OCLArray
        ``res_g``.
    """
    assert_bufs_type(np.float32, data_g, h_g)

    prog = OCLProgram(abspath("kernels/convolve.cl"))

    if res_g is None:
        res_g = OCLArray.empty(data_g.shape, dtype=np.float32)

    kernel_extents = tuple(np.int32(extent) for extent in h_g.shape)
    ndim = len(data_g.shape)

    launch_args = (data_g.data, h_g.data, res_g.data) + kernel_extents
    prog.run_kernel("convolve%sd_buf" % ndim, data_g.shape[::-1], None,
                    *launch_args)

    return res_g
示例#17
0
def _ocl_fft_gpu_inplace(plan, ocl_arr, inverse=False):
    """Call ``plan`` with ``ocl_arr`` as both input and output.

    Parameters
    ----------
    plan : callable
        Invoked as ``plan(src, dst, inverse=inverse)``.
    ocl_arr : OCLArray
        Buffer to transform; checked to be complex64.
    inverse : bool
        Forwarded to ``plan`` unchanged.
    """
    assert_bufs_type(np.complex64, ocl_arr)

    # Source and destination are the same array, so it is overwritten.
    src = dst = ocl_arr
    plan(src, dst, inverse=inverse)
示例#18
0
def _ocl_fft_gpu_inplace(plan, ocl_arr, inverse=False, batch=1):
    """Execute ``plan`` with ``ocl_arr`` as both input and output.

    Parameters
    ----------
    plan : object
        Must provide ``execute(in_data, out_data, inverse=..., batch=...)``.
    ocl_arr : OCLArray
        Buffer to transform; checked to be complex64.
    inverse : bool
        Forwarded to ``plan.execute`` unchanged.
    batch : int
        Forwarded to ``plan.execute`` unchanged.
    """
    assert_bufs_type(np.complex64, ocl_arr)

    # Same buffer on both sides: the result overwrites the input.
    execute_options = {"inverse": inverse, "batch": batch}
    plan.execute(ocl_arr.data, ocl_arr.data, **execute_options)