Exemplo n.º 1
0
    def __init__(self, m=None, n=None, k=None):
        super(BMM, self).__init__()
        self.m = m
        self.n = n
        self.k = k
        with open("kernels/bmm_kernel.cu", 'r') as f:  ###
            self.kernel = f.read()

        self.kernel = (self.kernel.replace("_M_",
                                           str(m) if m else "M").replace(
                                               "_N_",
                                               str(n) if n else "N").replace(
                                                   "_K_",
                                                   str(k) if k else "K"))

        self._fn_tt = cp.RawKernel(code=self.kernel,
                                   name="bmm_tt",
                                   backend='nvcc',
                                   options=('--maxrregcount=128',
                                            '--use_fast_math'))
        self._fn_nn = cp.RawKernel(code=self.kernel,
                                   name="bmm_nn",
                                   backend='nvcc',
                                   options=('--maxrregcount=128',
                                            '--use_fast_math'))
        self._fn_tn = cp.RawKernel(code=self.kernel,
                                   name="bmm_tn",
                                   backend='nvcc',
                                   options=('--maxrregcount=128',
                                            '--use_fast_math'))
        self._fn_nt = cp.RawKernel(code=self.kernel,
                                   name="bmm_nt",
                                   backend='nvcc',
                                   options=('--maxrregcount=128',
                                            '--use_fast_math'))
Exemplo n.º 2
0
def mexThSpkPC(Params, dataRAW, wPCA, iC):
    code, constants = get_cuda('mexThSpkPC')
    Nthreads = constants.Nthreads
    maxFR = constants.maxFR

    NT, Nchan, NchanNear, nt0, nt0min, spkTh, NrankPC = Params
    NT = int(NT)
    Nchan = int(Nchan)

    # Input GPU arrays.
    d_Params = cp.asarray(Params, dtype=np.float64, order='F')
    d_data = cp.asarray(dataRAW, dtype=np.float32, order='F')
    d_W = cp.asarray(wPCA, dtype=np.float32, order='F')
    d_iC = cp.asarray(iC, dtype=np.int32, order='F')

    # New GPU arrays.
    d_dout = cp.zeros((Nchan, NT), dtype=np.float32, order='F')
    d_dmax = cp.zeros((Nchan, NT), dtype=np.float32, order='F')
    d_st = cp.zeros(maxFR, dtype=np.int32, order='F')
    d_id = cp.zeros(maxFR, dtype=np.int32, order='F')
    d_counter = cp.zeros(1, dtype=np.int32, order='F')

    # filter the data with the temporal templates
    Conv1D = cp.RawKernel(code, 'Conv1D')
    Conv1D((Nchan, ), (Nthreads, ), (d_Params, d_data, d_W, d_dout))

    # get the max of the data
    max1D = cp.RawKernel(code, 'max1D')
    max1D((Nchan, ), (Nthreads, ), (d_Params, d_dout, d_dmax))

    # take max across nearby channels
    maxChannels = cp.RawKernel(code, 'maxChannels')
    maxChannels((int(NT // Nthreads), ), (Nthreads, ),
                (d_Params, d_dout, d_dmax, d_iC, d_st, d_id, d_counter))

    # move d_x to the CPU
    minSize = 1
    minSize = min(maxFR, int(d_counter[0]))

    d_featPC = cp.zeros((NrankPC * NchanNear, minSize),
                        dtype=np.float32,
                        order='F')

    d_id2 = cp.zeros(minSize, dtype=np.int32, order='F')

    if (minSize > 0):
        computeProjections = cp.RawKernel(code, 'computeProjections')
        computeProjections((minSize, ), (NchanNear, NrankPC),
                           (d_Params, d_data, d_iC, d_st, d_id, d_W, d_featPC))

    # TODO: check that the copy occurs on the GPU only
    d_id2[:] = d_id[:minSize]

    # Free memory.
    # TODO: unclear - does this do something special for cupy objects?
    #               - all of these go out of scope in the next line anyway
    del d_st, d_id, d_counter, d_Params, d_dmax, d_dout
    # free_gpu_memory()

    return d_featPC, d_id2
Exemplo n.º 3
0
 def __enter__(self):
     """Return self at start of a with-block."""
     # Call the __enter__ methods for any composed operators.
     # Allocate special memory objects.
     self.scatter_kernel = cp.RawKernel(_cu_source, "scatter")
     self.gather_kernel = cp.RawKernel(_cu_source, "gather")
     return self
Exemplo n.º 4
0
def _bspline_prefilter(volume):

    code = f'''
        #include "helper_math.h"
        #include "bspline.h"
    '''
    incl_path = str((Path(__file__).parent / 'kernels').absolute())
    prefilter_x = cp.RawKernel(code=code,
                               name='SamplesToCoefficients3DX',
                               options=('-I', incl_path))
    prefilter_y = cp.RawKernel(code=code,
                               name='SamplesToCoefficients3DY',
                               options=('-I', incl_path))
    prefilter_z = cp.RawKernel(code=code,
                               name='SamplesToCoefficients3DZ',
                               options=('-I', incl_path))

    slice_stride = volume.strides[1]
    dim_grid, dim_block = utils.compute_prefilter_workgroup_dims(volume.shape)
    dims = cp.asarray(volume.shape[::-1], dtype=cp.int32)

    prefilter_x(dim_grid[0], dim_block[0], (volume, slice_stride, dims))
    prefilter_y(dim_grid[1], dim_block[1], (volume, slice_stride, dims))
    prefilter_z(dim_grid[2], dim_block[2], (volume, slice_stride, dims))

    return volume
Exemplo n.º 5
0
def mexDistances2(Params, Ws, W, iMatch, iC, Wh, mus, mu):
    code, _ = get_cuda('mexDistances2')

    Nspikes = int(Params[0])
    Nfilters = int(Params[2])

    d_Params = cp.asarray(Params, dtype=np.float64, order='F')

    d_Ws = cp.asarray(Ws, dtype=np.float32, order='F')
    d_W = cp.asarray(W, dtype=np.float32, order='F')
    d_iMatch = cp.asarray(iMatch, dtype=np.bool, order='F')
    d_iC = cp.asarray(iC, dtype=np.int32, order='F')
    d_Wh = cp.asarray(Wh, dtype=np.int32, order='F')
    d_mu = cp.asarray(mu, dtype=np.float32, order='F')
    d_mus = cp.asarray(mus, dtype=np.float32, order='F')

    d_cmax = cp.zeros(Nspikes * Nfilters, dtype=np.float32, order='F')
    d_id = cp.zeros(Nspikes, dtype=np.int32, order='F')
    d_x = cp.zeros(Nspikes, dtype=np.float32, order='F')

    # get list of cmaxes for each combination of neuron and filter
    computeCost = cp.RawKernel(code, 'computeCost')
    computeCost(
        (Nfilters, ), (1024, ),
        (d_Params, d_Ws, d_mus, d_W, d_mu, d_iMatch, d_iC, d_Wh, d_cmax))

    # loop through cmax to find best template
    bestFilter = cp.RawKernel(code, 'bestFilter')
    bestFilter((40, ), (256, ),
               (d_Params, d_iMatch, d_Wh, d_cmax, d_mus, d_id, d_x))

    del d_Params, d_cmax

    return d_id, d_x
def Q_inner_product_cupy(Q, A, start_indices, window_size):
    num_time_points, num_lms = Q.shape
    num_extrinsic_samples, _ = A.shape

    assert not cupy.isfortran(Q)
    assert not cupy.isfortran(A)

    out = cupy.empty(
        (num_extrinsic_samples, window_size),
        dtype=cupy.complex128,
        order="C",
    )

    global _cuda_code
    if _cuda_code is None:
        # it's assumed that cuda_Q_inner_product.cu is placed in the same folder as this code
        path = os.path.join(os.path.dirname(__file__),
                            'cuda_Q_inner_product.cu')
        # alternative to deal with packaging in another directory
        if not (os.path.isfile(path)):
            path = os.path.join(
                os.path.split(os.path.dirname(__file__))[0],
                'cuda_Q_inner_product.cu')
        with open(path, 'r') as f:
            _cuda_code = f.read()
            Q_prod_fn = cupy.RawKernel(_cuda_code, "Q_inner")
    else:
        Q_prod_fn = cupy.RawKernel(_cuda_code, "Q_inner")

    float_prec = 16
    num_threads_x = 4
    num_threads_y = 1024 // 4
    block_size = num_threads_x, num_threads_y, 0
    grid_size = (
        (num_extrinsic_samples + num_threads_x - 1) // num_threads_x,
        0,
        0,
    )
    args = (
        Q,
        A,
        start_indices,
        window_size,
        num_time_points,
        num_extrinsic_samples,
        num_lms,
        out,
    )
    Q_prod_fn(
        grid_size,
        block_size,
        args,
        shared_mem=cupy.int32(num_threads_x * num_lms * float_prec),
    )

    return out
Exemplo n.º 7
0
def mexSVDsmall2(Params, dWU, W, iC, iW, Ka, Kb):
    code, constants = get_cuda('mexSVDsmall2')

    Nthreads = constants.Nthreads

    Nfilt = int(Params[1])
    nt0 = int(Params[4])
    Nrank = int(Params[6])
    Nchan = int(Params[9])

    d_Params = cp.asarray(Params, dtype=np.float64, order='F')

    d_dWU = cp.asarray(dWU, dtype=np.float64, order='F')
    d_iC = cp.asarray(iC, dtype=np.int32, order='F')
    d_iW = cp.asarray(iW, dtype=np.int32, order='F')

    d_A = cp.asarray(Ka, dtype=np.float64, order='F')
    d_B = cp.asarray(Kb, dtype=np.float64, order='F')

    d_U = cp.zeros((Nchan, Nfilt, Nrank), dtype=np.float64, order='F')
    d_mu = cp.zeros(Nfilt, dtype=np.float64, order='F')

    d_W = cp.asarray(W, dtype=np.float64, order='F')

    d_wtw = cp.zeros((nt0, nt0, Nfilt), dtype=np.float64, order='F')
    d_dWUb = cp.zeros((nt0, Nchan, Nfilt), dtype=np.float64, order='F')

    tpS = (nt0, int(Nthreads // nt0))
    tpK = (Nrank, int(Nthreads // Nrank))

    blankdWU = cp.RawKernel(code, 'blankdWU')
    blankdWU((Nfilt, ), tpS, (d_Params, d_dWU, d_iC, d_iW, d_dWUb))

    # compute dWU * dWU'
    getwtw = cp.RawKernel(code, 'getwtw')
    getwtw((Nfilt, ), tpS, (d_Params, d_dWUb, d_wtw))

    # get W by power svd iterations
    getW = cp.RawKernel(code, 'getW')
    getW((Nfilt, ), (nt0, ), (d_Params, d_wtw, d_W))

    # compute U by W' * dWU
    getU = cp.RawKernel(code, 'getU')
    getU((Nfilt, ), tpK, (d_Params, d_dWUb, d_W, d_U))

    # normalize U, get S, get mu, renormalize W
    reNormalize = cp.RawKernel(code, 'reNormalize')
    reNormalize((Nfilt, ), (nt0, ), (d_Params, d_A, d_B, d_W, d_U, d_mu))

    del d_wtw, d_Params, d_dWUb

    return d_W, d_U, d_mu
Exemplo n.º 8
0
def GPU_Bilateral(src, sigmaS=3.0, sigmaR=0.02, sigma=0):
    half_kernel_size = round(sigmaS * 2)
    blksize = (32, 8)
    fast = False
    snn = int(sigma > 0)  # whether to use SNN sampling strategy

    if src.format.id != vs.GRAYS:
        raise vs.Error("Bilateral: Only 32-bit float grayscale is supported!")

    w, h = src.width, src.height

    # source code of CUDA kernel
    with open(
            os.path.join(os.path.dirname(Path(__file__).resolve()),
                         'bilateral.cu'), 'r') as f:
        kernel_source_code = f.read()

    kernel_source_code = Template(kernel_source_code)
    kernel_source_code = kernel_source_code.substitute(
        width=w,
        height=h,
        sigma_s=-0.5 / (sigmaS**2),
        sigma_r=-0.5 / (sigmaR**2),
        sigma=sigma,
        snn=snn,
        half_kernel_size=half_kernel_size)

    if fast:
        kernel = cp.RawKernel(kernel_source_code,
                              'bilateral',
                              options=('--use_fast_math', ))
    else:
        kernel = cp.RawKernel(kernel_source_code, 'bilateral')

    # create NumPy function
    def bilateral_core(h_img, kernel):
        # h_img must be a 2-D image

        d_img = cp.asarray(h_img)
        d_out = cp.empty_like(d_img)

        kernel(((w + blksize[0] - 1) // blksize[0],
                (h + blksize[1] - 1) // blksize[1]), blksize, (d_img, d_out))

        h_out = cp.asnumpy(d_out)

        return h_out

    # process
    return mufnp.numpy_process(src, bilateral_core, kernel=kernel)
Exemplo n.º 9
0
    def __init__(self,
                 m=None,
                 n=None,
                 k=None,
                 write_float8=True,
                 share_mask=False):
        super(MBMM, self).__init__()
        assert type(write_float8) == bool
        assert type(share_mask) == bool
        self.m = m
        self.n = n
        self.k = k
        self.write_float8 = write_float8
        self.share_mask = share_mask
        with open("kernels/mbmm_kernel.cu", 'r') as f:  ###
            self.kernel = f.read()

        self.kernel = (self.kernel.replace(
            "_M_",
            str(m) if m else "M").replace(
                "_N_",
                str(n) if n else "N").replace(
                    "_K_",
                    str(k) if k else "K").replace(
                        "__WRITE_FLOAT8__",
                        "true" if write_float8 else "false").replace(
                            "__MASK_BATCH__", "0" if share_mask else "bid"))

        self._fn_tt = cp.RawKernel(code=self.kernel,
                                   name="mbmm_tt",
                                   backend='nvcc',
                                   options=('--maxrregcount=128',
                                            '--use_fast_math'))
        self._fn_nn = cp.RawKernel(code=self.kernel,
                                   name="mbmm_nn",
                                   backend='nvcc',
                                   options=('--maxrregcount=128',
                                            '--use_fast_math'))
        self._fn_tn = cp.RawKernel(code=self.kernel,
                                   name="mbmm_tn",
                                   backend='nvcc',
                                   options=('--maxrregcount=128',
                                            '--use_fast_math'))
        self._fn_nt = cp.RawKernel(code=self.kernel,
                                   name="mbmm_nt",
                                   backend='nvcc',
                                   options=('--maxrregcount=128',
                                            '--use_fast_math'))
Exemplo n.º 10
0
    def __init__(
        self,
        ta=1,
        tpb=256,
        sm_size=48 * 256 * 4,
    ):
        super(GetDivOfAddressV2CUDA, self).__init__()
        self.ta = ta  # how many clusters each thread is responsible of
        self.tpb = tpb
        self.sm_size = sm_size
        assert ta * tpb * 8 <= sm_size

        with open(get_absolute_path("kernels", "GetDivOfAddressV2Kernel.cu"),
                  "r") as f:
            self.kernel = f.read()
        kernel = (self.kernel.replace("_TA_",
                                      str(ta)).replace("_TPB_", str(tpb)))

        self.fn = cp.RawKernel(
            kernel,
            'get_div_of_address',
            backend='nvcc',
            # options=('--maxrregcount=255',),
        )
        self.fn.max_dynamic_shared_size_bytes = ta * tpb * 8
Exemplo n.º 11
0
def nn_gpu(ref, query):
    import cupy

    with open(cu_file) as f:
        kernel = cupy.RawKernel(f.read(), "cuComputeDistanceGlobal")

    ref_nb, ref_dim = ref.shape
    query_nb, query_dim = query.shape
    assert ref_dim == query_dim
    dim = ref_dim

    ref = ref.transpose(1, 0)
    query = query.transpose(1, 0)
    ref = cupy.ascontiguousarray(ref)
    query = cupy.ascontiguousarray(query)

    dist = cupy.empty((ref_nb, query_nb), dtype=cupy.float32)

    BLOCK_DIM = 16
    grid = (
        int(math.ceil(query_nb / BLOCK_DIM)),
        int(math.ceil(ref_nb / BLOCK_DIM)),
        1,
    )
    block = (16, 16, 1)
    args = (ref, ref_nb, query, query_nb, dim, dist)
    shared_mem = BLOCK_DIM * BLOCK_DIM + BLOCK_DIM * BLOCK_DIM + 5

    kernel(grid, block, args=args, shared_mem=shared_mem)

    indices = cupy.argmin(dist, axis=0)
    return indices
Exemplo n.º 12
0
def Overlap_GPU(image, frames, coord_x, coord_y, k_r):

    n_angles = coord_x.shape[1]
    n_rays = coord_x.shape[0]

    img_x = image.shape[0]
    img_y = image.shape[1]

    nthreads = 128
    nblocks = ((n_rays * n_angles) / nthreads) + 1

    import cupy as cp

    image = image.astype(cp.complex64)
    frames = frames.astype(cp.complex64)

    coord_x = coord_x.astype(cp.int32)
    coord_y = coord_y.astype(cp.int32)


    cp.RawKernel(convolve_raw_kernel, "Convolve") \
        ((int(nblocks),), (int(nthreads),), \
        (image, frames, coord_x, coord_y, k_r, n_angles, n_rays, img_x, img_y))

    return image
Exemplo n.º 13
0
    def __init__(
        self,
        de=16,
        dk=16,
        sm_size=48 * 256 * 4,
    ):
        super(ComputeCentroidsCUDA, self).__init__()
        self.de = de
        self.dk = dk
        assert dk * (de + 1) * 4 <= sm_size
        self.tpb = 256
        self.sm_size = sm_size

        with open(
                get_absolute_path("kmeans", "kernels",
                                  "ComputeCentroidsKernel.cu"), "r") as f:
            self.kernel = f.read()

        kernel = (self.kernel.replace("_DE_", str(de)).replace(
            "_DK_", str(dk)).replace("_TPB_", str(self.tpb)).replace(
                "_NITERS_", str(math.ceil(dk / self.tpb))))

        self.fn = cp.RawKernel(
            kernel,
            'compute_centroids',
            # options=('--maxrregcount=255',),
            # backend='nvcc',
        )

        self.fn.max_dynamic_shared_size_bytes = sm_size
Exemplo n.º 14
0
    def setUp(self):
        if hasattr(self, 'clean_up'):
            if cupy.cuda.runtime.is_hip:
                # Clearing memo triggers recompiling kernels using name
                # expressions in other tests, e.g. dot and matmul, which
                # hits a nvrtc bug. See #5843, #5945 and #6725.
                self.skipTest('Clearing memo hits a nvrtc bug in other tests')
            _util.clear_memo()
        self.dev = cupy.cuda.runtime.getDevice()
        assert self.dev != 1
        if not hasattr(self, 'jitify'):
            self.jitify = False
        if cupy.cuda.runtime.is_hip and self.jitify:
            self.skipTest('Jitify does not support ROCm/HIP')

        self.temporary_cache_dir_context = use_temporary_cache_dir()
        self.in_memory_context = compile_in_memory(self.in_memory)
        self.cache_dir = self.temporary_cache_dir_context.__enter__()
        self.in_memory_context.__enter__()

        self.kern = cupy.RawKernel(_test_source1,
                                   'test_sum',
                                   backend=self.backend,
                                   jitify=self.jitify)
        self.mod2 = cupy.RawModule(code=_test_source2,
                                   backend=self.backend,
                                   jitify=self.jitify)
        self.mod3 = cupy.RawModule(code=_test_source3,
                                   options=('-DPRECISION=2', ),
                                   backend=self.backend,
                                   jitify=self.jitify)
Exemplo n.º 15
0
def bool_spike_to_float(spike_b: torch.Tensor, s_dtype: torch.dtype, s_shape: torch.Size, s_padding: int = 0):
    device_id = spike_b.get_device()
    spike = torch.zeros(spike_b.numel() * 8, device=spike_b.device, dtype=s_dtype)
    if s_dtype == torch.float:
        kernel_codes = DataTypeConvertCUDACode.bool2float
        kernel_name = 'bool2float'
    elif s_dtype == torch.half:
        kernel_codes = DataTypeConvertCUDACode.bool2half
        kernel_name = 'bool2half'
    else:
        raise NotImplementedError
    with cuda_utils.DeviceEnvironment(device_id):
        numel = spike_b.numel()
        blocks = cuda_utils.cal_blocks(numel)
        numel = cupy.asarray(numel)
        spike_b, spike, numel = cuda_utils.get_contiguous(spike_b, spike, numel)
        kernel_args = [spike_b, spike, numel]
        kernel = cupy.RawKernel(
            kernel_codes,
            kernel_name,
            options=configure.cuda_compiler_options, backend=configure.cuda_compiler_backend
        )
        kernel(
            (blocks,), (configure.cuda_threads,),
            cuda_utils.wrap_args_to_raw_kernel(
                device_id,
                *kernel_args
            )
        )
    if s_padding != 0 and s_padding != 8:
        spike = spike[0: spike.numel() - s_padding]
    return spike.reshape(s_shape)
Exemplo n.º 16
0
    def setUp(self):
        if hasattr(self, 'clean_up'):
            _util.clear_memo()
        self.dev = cupy.cuda.runtime.getDevice()
        assert self.dev != 1
        if not hasattr(self, 'jitify'):
            self.jitify = False
        if cupy.cuda.runtime.is_hip and self.jitify:
            self.skipTest('Jitify does not support ROCm/HIP')

        self.temporary_cache_dir_context = use_temporary_cache_dir()
        self.in_memory_context = compile_in_memory(self.in_memory)
        self.cache_dir = self.temporary_cache_dir_context.__enter__()
        self.in_memory_context.__enter__()

        self.kern = cupy.RawKernel(
            _test_source1, 'test_sum',
            backend=self.backend, jitify=self.jitify)
        self.mod2 = cupy.RawModule(
            code=_test_source2,
            backend=self.backend, jitify=self.jitify)
        self.mod3 = cupy.RawModule(
            code=_test_source3,
            options=('-DPRECISION=2',),
            backend=self.backend, jitify=self.jitify)
Exemplo n.º 17
0
def sgemm(A, B,
          dim_x=16, dim_y=16, blk_m=64, blk_n=64, blk_k=4,
          dim_xa=64, dim_ya=4, dim_xb=4, dim_yb=64):
    assert A.dtype == cp.float32
    assert B.dtype == cp.float32
    assert(dim_x * dim_y == dim_xa * dim_ya == dim_xb * dim_yb)

    m, k = A.shape
    k, n = B.shape

    # Inputs matrices need to be in Fortran order.
    A = cp.asfortranarray(A)
    B = cp.asfortranarray(B)

    C = cp.empty((m, n), dtype=cp.float32, order='F')

    config = {'DIM_X': dim_x, 'DIM_Y': dim_y,
              'BLK_M': blk_m, 'BLK_N': blk_n, 'BLK_K': blk_k,
              'DIM_XA': dim_xa, 'DIM_YA': dim_ya,
              'DIM_XB': dim_xb, 'DIM_YB': dim_yb,
              'THR_M': blk_m // dim_x, 'THR_N': blk_n // dim_y}
    code = read_code(sgemm_file, params=config)
    kern = cp.RawKernel(code, 'sgemm')

    grid = (int(math.ceil(m / blk_m)), int(math.ceil(n / blk_n)), 1)
    block = (dim_x, dim_y, 1)
    args = (m, n, k, A, B, C)
    shared_mem = blk_k * (blk_m + 1) * 4 + blk_n * (blk_k + 1) * 4
    kern(grid, block, args=args, shared_mem=shared_mem)
    return C
Exemplo n.º 18
0
def setup_cupy():
    global cupy
    global cupy_stream
    global square_diff_kernel
    global mix_channels_kernel
    global gray_scale_kernel
    import cupy as cupy
    cupy_stream = cupy.cuda.Stream()
    square_diff_kernel = cupy.ElementwiseKernel('T x, T y', 'T z',
                                                'z = x*x - y*y', 'square_diff')

    mix_channels_kernel = cupy.ElementwiseKernel('uint8 x, uint8 y', 'uint8 z',
                                                 'z = (i % 3) ? x : y',
                                                 'mix_channels')

    gray_scale_kernel = cupy.RawKernel(
        r'''
    extern "C" __global__
    void gray_scale(float *output, const unsigned char *input, long long height, long long width) {
        int tidx = blockIdx.x * blockDim.x + threadIdx.x;
        int tidy = blockIdx.y * blockDim.y + threadIdx.y;
        if (tidx < width && tidy < height) {
            float r = input[tidy * width + tidx] / 255.;
            float g = input[tidy * width + tidx + 1] / 255.;
            float b = input[tidy * width + tidx + 2] / 255.;
            output[tidy * width + tidx] = 0.299 * r + 0.59 * g + 0.11 * b;
        }
    }
    ''', 'gray_scale')
Exemplo n.º 19
0
def mexWtW2(Params, W1, W2, UtU):
    code, constants = get_cuda('mexWtW2')

    nblock = constants.nblock

    Nfilt = int(Params[1])
    nt0 = int(Params[9])

    d_Params = cp.asarray(Params, dtype=np.float64, order='F')

    d_W1 = cp.asarray(W1, dtype=np.float32, order='F')
    d_W2 = cp.asarray(W2, dtype=np.float32, order='F')
    d_UtU = cp.asarray(UtU, dtype=np.float32, order='F')

    d_WtW = cp.zeros((Nfilt, Nfilt, 2 * nt0 - 1), dtype=np.float32, order='F')

    grid = (1 + int(Nfilt // nblock), 1 + int(Nfilt // nblock))
    block = (nblock, nblock)

    crossFilter = cp.RawKernel(code, 'crossFilter')
    crossFilter(grid, block, (d_Params, d_W1, d_W2, d_UtU, d_WtW))

    del d_Params, d_W1, d_W2, d_UtU

    return d_WtW
Exemplo n.º 20
0
def _generate_kernel(parallactic_angles, feed_type):
    dtype = parallactic_angles.dtype

    # Block sizes
    if dtype == np.float32:
        block = (1024, 1, 1)
    elif dtype == np.float64:
        block = (512, 1, 1)
    else:
        raise TypeError("Unhandled type %s" % dtype)

    # Create template
    render = jinja_env.get_template(_TEMPLATE_PATH).render
    name = "feed_rotation"

    code = render(kernel_name=name,
                  feed_type=feed_type,
                  sincos_fn=cuda_function('sincos', dtype),
                  pa_type=_get_typename(dtype),
                  out_type=_get_typename(dtype))

    code = code.encode('utf-8')

    # Complex output type
    out_dtype = np.result_type(dtype, np.complex64)
    return cp.RawKernel(code, name), block, out_dtype
Exemplo n.º 21
0
def get_raw_spline1d_kernel(axis,
                            ndim,
                            mode,
                            order,
                            index_type="int",
                            data_type="double",
                            pole_type="double",
                            block_size=128):
    """Generate a kernel for applying a spline prefilter along a given axis."""
    poles = get_poles(order)

    # determine number of samples for the boundary approximation
    # (SciPy uses n_boundary = n_samples but this is excessive)
    largest_pole = max([abs(p) for p in poles])
    # tol < 1e-7 fails test cases comparing to SciPy at atol = rtol = 1e-5
    tol = 1e-10 if pole_type == "float" else 1e-18
    n_boundary = math.ceil(math.log(tol, largest_pole))

    # headers and general utility function for extracting rows of data
    code = _FILTER_GENERAL.format(index_type=index_type,
                                  data_type=data_type,
                                  pole_type=pole_type)

    # generate source for a 1d function for a given boundary mode and poles
    code += _get_spline1d_code(mode, poles, n_boundary)

    # generate code handling batch operation of the 1d filter
    code += _batch_spline1d_strided_template.format(ndim=ndim,
                                                    axis=axis,
                                                    block_size=block_size)
    return cupy.RawKernel(code, "cupyx_spline_filter")
Exemplo n.º 22
0
def _generate_kernel(lm, uvw, frequency):
    # Floating point output type
    out_dtype = np.result_type(lm, uvw, frequency)

    # Block sizes
    blockdimx = 32 if frequency.dtype == np.float32 else 16
    blockdimy = 32 if uvw.dtype == np.float32 else 16
    block = (blockdimx, blockdimy, 1)

    # Create template
    render = jinja_env.get_template(_TEMPLATE_PATH).render
    name = "phase_delay"

    code = render(kernel_name=name,
                  lm_type=_get_typename(lm.dtype),
                  uvw_type=_get_typename(uvw.dtype),
                  freq_type=_get_typename(frequency.dtype),
                  out_type=_get_typename(out_dtype),
                  sqrt_fn=cuda_function('sqrt', lm.dtype),
                  sincos_fn=cuda_function('sincos', out_dtype),
                  minus_two_pi_over_c=minus_two_pi_over_c,
                  blockdimx=blockdimx,
                  blockdimy=blockdimy).encode('utf-8')

    # Complex output type
    out_dtype = np.result_type(out_dtype, np.complex64)
    return cp.RawKernel(code, name), block, out_dtype
Exemplo n.º 23
0
def get_raw_spline1d_kernel(axis,
                            ndim,
                            mode,
                            order,
                            index_type='int',
                            data_type='double',
                            pole_type='double',
                            block_size=128):
    """Generate a kernel for applying a spline prefilter along a given axis."""
    poles = get_poles(order)

    # determine number of samples for the boundary approximation
    # (SciPy uses n_boundary = n_samples but this is excessive)
    largest_pole = max([abs(p) for p in poles])
    # tol < 1e-7 fails test cases comparing to SciPy at atol = rtol = 1e-5
    tol = 1e-10 if pole_type == 'float' else 1e-18
    n_boundary = math.ceil(math.log(tol, largest_pole))

    # headers and general utility function for extracting rows of data
    code = _FILTER_GENERAL.format(index_type=index_type,
                                  data_type=data_type,
                                  pole_type=pole_type)

    # generate source for a 1d function for a given boundary mode and poles
    code += _get_spline1d_code(mode, poles, n_boundary)

    # generate code handling batch operation of the 1d filter
    mode_str = mode.replace('-', '_')  # cannot have '-' in kernel name
    kernel_name = (f'cupyx_scipy_ndimage_spline_filter_{ndim}d_ord{order}_'
                   f'axis{axis}_{mode_str}')
    code += _batch_spline1d_strided_template.format(ndim=ndim,
                                                    axis=axis,
                                                    block_size=block_size,
                                                    kernel_name=kernel_name)
    return cupy.RawKernel(code, kernel_name)
Exemplo n.º 24
0
def _generate_kernel(inputs, input_schema, output_schema):
    mapping, in_shape, out_shape, out_dtype = stokes_convert_setup(
        inputs, input_schema, output_schema)

    # Flatten input and output shapes
    # Check that number elements are the same
    in_elems = reduce(mul, in_shape, 1)
    out_elems = reduce(mul, out_shape, 1)

    if in_elems != out_elems:
        raise ValueError("Number of input_schema elements %s "
                         "and output schema elements %s "
                         "must match for CUDA kernel." % (in_shape, out_shape))

    # Infer the output data type
    if out_dtype == "real":
        if np.iscomplexobj(inputs):
            out_dtype = inputs.real.dtype
        else:
            out_dtype = inputs.dtype
    elif out_dtype == "complex":
        if np.iscomplexobj(inputs):
            out_dtype = inputs.dtype
        else:
            out_dtype = np.result_type(inputs.dtype, np.complex64)
    else:
        raise ValueError("Invalid setup dtype %s" % out_dtype)

    cuda_out_dtype = cuda_type(out_dtype)
    assign_exprs = []

    # Render the assignment expression for each element
    for (c1, c1i), (c2, c2i), outi, template_fn in mapping:
        # Flattened indices
        flat_outi = np.ravel_multi_index(outi, out_shape)
        render = jinja_env.from_string(template_fn).render
        kwargs = {
            c1: "in[%d]" % np.ravel_multi_index(c1i, in_shape),
            c2: "in[%d]" % np.ravel_multi_index(c2i, in_shape),
            "out_type": cuda_out_dtype
        }

        expr_str = render(**kwargs)
        assign_exprs.append("out[%d] = %s;" % (flat_outi, expr_str))

    # Now render the main template
    render = jinja_env.get_template(_TEMPLATE_PATH).render
    name = "stokes_convert"
    code = render(kernel_name=name,
                  input_type=cuda_type(inputs.dtype),
                  output_type=cuda_type(out_dtype),
                  assign_exprs=assign_exprs,
                  elements=in_elems).encode("utf-8")

    # cuda block, flatten non-schema dims into a single source dim
    blockdimx = 512
    block = (blockdimx, 1, 1)

    return (cp.RawKernel(code, name), block, in_shape, out_shape, out_dtype)
Exemplo n.º 25
0
def mexClustering2(Params, uproj, W, mu, call, iMatch, iC):

    code, _ = get_cuda('mexClustering2')

    Nspikes = int(Params[0])
    NrankPC = int(Params[1])
    Nfilters = int(Params[2])
    NchanNear = int(Params[6])
    Nchan = int(Params[7])

    d_Params = cp.asarray(Params, dtype=np.float64, order='F')
    d_uproj = cp.asarray(uproj, dtype=np.float32, order='F')
    d_W = cp.asarray(W, dtype=np.float32, order='F')
    d_mu = cp.asarray(mu, dtype=np.float32, order='F')
    d_call = cp.asarray(call, dtype=np.int32, order='F')
    d_iC = cp.asarray(iC, dtype=np.int32, order='F')
    d_iMatch = cp.asarray(iMatch, dtype=np.bool, order='F')

    d_dWU = cp.zeros((NrankPC * Nchan, Nfilters), dtype=np.float32, order='F')
    d_cmax = cp.zeros((Nspikes, Nfilters), dtype=np.float32, order='F')
    d_id = cp.zeros(Nspikes, dtype=np.int32, order='F')
    d_x = cp.zeros(Nspikes, dtype=np.float32, order='F')
    d_nsp = cp.zeros(Nfilters, dtype=np.int32, order='F')
    d_V = cp.zeros(Nfilters, dtype=np.float32, order='F')

    # get list of cmaxes for each combination of neuron and filter
    computeCost = cp.RawKernel(code, 'computeCost')
    computeCost((Nfilters, ), (1024, ),
                (d_Params, d_uproj, d_mu, d_W, d_iMatch, d_iC, d_call, d_cmax))

    # loop through cmax to find best template
    bestFilter = cp.RawKernel(code, 'bestFilter')
    bestFilter((40, ), (256, ),
               (d_Params, d_iMatch, d_iC, d_call, d_cmax, d_id, d_x))

    # average all spikes for same template -- ORIGINAL
    average_snips_v2 = cp.RawKernel(code, 'average_snips_v2')
    average_snips_v2((Nfilters, ), (NrankPC, NchanNear),
                     (d_Params, d_iC, d_call, d_id, d_uproj, d_cmax, d_dWU))

    count_spikes = cp.RawKernel(code, 'count_spikes')
    count_spikes((7, ), (256, ), (d_Params, d_id, d_nsp, d_x, d_V))

    del d_Params, d_V

    return d_dWU, d_id, d_x, d_nsp, d_cmax
Exemplo n.º 26
0
 def __getitem__(self, key):
     try:
         return super().__getitem__(key)
     except KeyError:
         # lazy load kernel definition, will NOT compile until called
         kernel = cp.RawKernel(self._source, key)
         super().__setitem__(key, kernel)
         return kernel
Exemplo n.º 27
0
def load_cupy_func(fname, name, **kwargs):
    try: fname = str((Path(__file__).parent / fname).resolve())
    except: pass
    with open(fname) as f:
        code = f.read()
    macros = ['#define {!s} {!s}'.format(k, v) for k,v in kwargs.items()]
    code = '\n'.join(macros + [code])
    return add_checks(cp.RawKernel(code, name))
Exemplo n.º 28
0
 def test_dynamical_parallelism(self):
     ker = cupy.RawKernel(_test_source4, 'test_kernel', options=('-dc',),
                          backend=self.backend, jitify=self.jitify)
     N = 169
     inner_chunk = 13
     x = cupy.zeros((N,), dtype=cupy.float32)
     ker((1,), (N//inner_chunk,), (x, N, inner_chunk))
     assert (x == 1.0).all()
Exemplo n.º 29
0
def conv_forward_pass(image, filt, bias):
    # print("CUDA cODE")
    (n_f, n_c_f, f, _) = filt.shape # filter dimensions
    n_c, in_dim, _ = image.shape # image dimensions

    convfilter_kernel = cp.RawKernel(r'''
        extern "C" __global__
        void my_conv(const float* img, const float * filt, const float * bias, float * out, const int depth, const int img_dim, const int f, const int out_dim ) {
            const unsigned int j = blockDim.x*blockIdx.x + threadIdx.x;
            const unsigned int i = blockDim.y*blockIdx.y + threadIdx.y;
            const unsigned int pp = blockIdx.z;
            const unsigned int dp = depth;
            //printf("%d,%d ::: ",i,j);

            if(i<img_dim && j<img_dim){
                unsigned int oPixelPos = i*out_dim + j+ pp*out_dim*out_dim;
                out[oPixelPos] = 0;
                for(int h=0;h<dp;h++){
                    for(int k=0;k<f;k++){
                        for(int l=0;l<f;l++){
                            unsigned int iPixelPos = ((i+k)*img_dim + (j+l)) + img_dim*img_dim*h;
                            unsigned int filtPos = (k*f+l) + f*f*h + f*f*dp*pp;
                            out[oPixelPos] += img[iPixelPos] * filt[filtPos] + bias[pp];
                        }
                    }
                }
            }
        }
        ''', 'my_conv')

    out_dim = int(in_dim - f)+1 # calculate output dimensions
    img_gpu = cp.asarray(image, dtype=cp.float32) #28X28
    img_dim_gpu = cp.int32(in_dim) #28
    filt_size_gpu = cp.int32(f) # 5
    out_dim_gpu = cp.int32(out_dim) #24
    depth_gpu = cp.int32(n_c)
    threads_per_block = f
    num_blocks = (in_dim//f) +1

    filt_gpu = cp.asarray(filt, dtype=cp.float32) #5X5
    # print(filt)
    # filt_gpu = cp.zeros((n_f, n_c_f, f, f), dtype=cp.float32) #5X5
    filt_gpu = cp.asarray(filt_gpu.flatten(), dtype=cp.float32)
    # print(filt_gpu)


    # out_gpu = cp.zeros((out_dim,out_dim,n_f), dtype=cp.float32) # 24 X 24
    out_gpu = cp.zeros((n_f,out_dim,out_dim), dtype=cp.float32) # 24 X 24
    out_gpu = out_gpu.flatten()

    bias_gpu = cp.asarray(bias, dtype=cp.float32) #5X5
    bias_gpu = bias_gpu.flatten()

    convfilter_kernel((num_blocks,num_blocks,8), (threads_per_block,threads_per_block), (img_gpu, filt_gpu, bias_gpu, out_gpu, depth_gpu, img_dim_gpu, filt_size_gpu, out_dim_gpu))
    # # convfilter_kernel((576,), (24,5), (img_gpu, filt_gpu, out_gpu, depth_gpu, img_dim_gpu, filt_size_gpu, out_dim_gpu))
    out_gpu = cp.reshape(out_gpu,(n_f,out_dim,out_dim))
    output = cp.asnumpy(out_gpu)
    return output
Exemplo n.º 30
0
def _get_transform_kernel(interpolation: str = 'linear'):

    if interpolation not in AVAILABLE_INTERPOLATIONS:
        raise ValueError(f'Interpolation must be one of {interpolation}')

    code = f'''
        #include "helper_math.h"
        #include "bspline.h"
        #include "helper_interpolation.h"
        
        extern "C" {{
            inline __device__ int get_z_idx(const int i, const uint4* const dims) {{
                return i / (dims[0].y * dims[0].z);
            }}
            inline __device__ int get_y_idx(const int i, const uint4* const dims) {{
                return (i % (dims[0].y * dims[0].z)) / dims[0].z;
            }}
            inline __device__ int get_x_idx(const int i, const uint4* const dims) {{
                return (i % (dims[0].y * dims[0].z)) % dims[0].z;
            }}

            __global__ void transform(float* const volume,
                                      cudaTextureObject_t texture,
                                      const float4* const xform,
                                      const uint4* const dims) {{
            
                unsigned tid = threadIdx.x;
                unsigned total_threads = gridDim.x*blockDim.x;
                unsigned cta_start = blockDim.x*blockIdx.x;
                unsigned i;
                unsigned n = dims[0].x * dims[0].y * dims[0].z;
                
                for (i = cta_start + tid; i < n; i += total_threads) {{
                    int z = get_x_idx(i, dims);
                    int y = get_y_idx(i, dims);
                    int x = get_z_idx(i, dims);
                    
                    float4 voxf = make_float4((float)x, (float)y, (float)z, 1.0f);
                    
                    float3 ndx;
                    ndx.z = dot(voxf, xform[0]) + .5f;
                    ndx.y = dot(voxf, xform[1]) + .5f;
                    ndx.x = dot(voxf, xform[2]) + .5f;
                    
                    if (ndx.x < 0 || ndx.y < 0 || ndx.z < 0 || ndx.x >= dims[0].z || ndx.y >= dims[0].y || ndx.z >= dims[0].x) {{
                        continue;
                    }}
                    
                    volume[i] = {_INTERPOLATIONS[interpolation]}(texture, ndx);
                }}
            }}
        }}
    '''
    incl_path = str((Path(__file__).parent / 'kernels').absolute())
    kernel = cp.RawKernel(code=code,
                          name='transform',
                          options=('-I', incl_path))
    return kernel