Пример #1
0
    def __init__(self, dtype_out,
            neutral, reduce_expr, map_expr=None, arguments=None,
            name="reduce_kernel", keep=False, options=None, preamble=""):

        self.dtype_out = dtype_out

        self.block_size = 512

        s1_func, self.stage1_arg_types = get_reduction_kernel_and_types(
                dtype_to_ctype(dtype_out), self.block_size,
                neutral, reduce_expr, map_expr,
                arguments, name=name+"_stage1", keep=keep, options=options,
                preamble=preamble)
        self.stage1_func = s1_func.prepared_async_call

        # stage 2 has only one input and no map expression
        s2_func, self.stage2_arg_types = get_reduction_kernel_and_types(
                dtype_to_ctype(dtype_out), self.block_size,
                neutral, reduce_expr,
                name=name+"_stage2", keep=keep, options=options,
                preamble=preamble)
        self.stage2_func = s2_func.prepared_async_call

        assert [i for i, arg_tp in enumerate(self.stage1_arg_types) if arg_tp == "P"], \
                "ReductionKernel can only be used with functions that have at least one " \
                "vector argument"
Пример #2
0
def get_divscalar_function(src_type, dest_type, pitch = True):
    type_src = dtype_to_ctype(src_type)
    type_dest = dtype_to_ctype(dest_type)
    
    name = "divscalar"
    operation = "/"
    
    if pitch:
        func = SourceModule(
                pitch_left_scalar_op_template % {
                    "name": name,
                    "src_type": type_src,
                    "dest_type": type_dest,
                    "operation": operation,
                },
                options=["--ptxas-options=-v"]).get_function(name)
        func.prepare([np.int32, np.int32, np.intp, np.int32,
                      np.intp, np.int32, _get_type(dest_type)])
    else:
        func = SourceModule(
                non_pitch_left_scalar_op_template % {
                    "name": name,
                    "src_type": type_src,
                    "dest_type": type_dest,
                    "operation": operation,
                },
                options=["--ptxas-options=-v"]).get_function(name)
        func.prepare([np.intp, np.intp, _get_type(dest_type), np.int32])
    return func
Пример #3
0
def get_dot_kernel(dtype_out, dtype_a, dtype_b):
    return ReductionKernel(dtype_out, neutral="0",
            reduce_expr="a+b", map_expr="a[i]*b[i]",
            arguments="const %(tp_a)s *a, const %(tp_b)s *b" % {
                "tp_a": dtype_to_ctype(dtype_a),
                "tp_b": dtype_to_ctype(dtype_b),
                }, keep=True)
Пример #4
0
def get_complex_from_amp_function(in_type, result_type, pitch = True):
    type_in = dtype_to_ctype(in_type)
    type_result = dtype_to_ctype(result_type)
    
    name = "makecomplex_amp_phase"
    
    if pitch:
        func = SourceModule(
                pitch_complex_amp_template % {
                    "name": name,
                    "in_type": type_in,
                    "result_type": type_result,
                    "fletter": 'f' if in_type == np.float32 else ''
                },
                options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('iiPiPPi')#[np.int32, np.int32, np.intp, np.int32,
        #              np.intp, np.intp, np.int32])
    else:
        func = SourceModule(
                non_pitch_complex_amp_template % {
                    "name": name,
                    "in_type": type_in,
                    "result_type": type_result,
                    "fletter": 'f' if in_type == np.float32 else ''
                },
                options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('PPPi')#[np.intp, np.intp, np.intp, np.int32])
    return func
Пример #5
0
def get_astype_function(dtype_dest, dtype_src, pitch = True):
    type_dest = dtype_to_ctype(dtype_dest)
    type_src = dtype_to_ctype(dtype_src)
    name = "astype"
    operation = ""
    
    if pitch:
        func = SourceModule(
                pitch_template % {
                    "name": name,
                    "dest_type": type_dest,
                    "src_type": type_src,
                    "operation": operation,
                },
                options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('iiPiPi')
        #        [np.int32, np.int32, np.intp, np.int32, np.intp, np.int32])
    else:
        func = SourceModule(
                non_pitch_template % {
                    "name": name,
                    "dest_type": type_dest,
                    "src_type": type_src,
                    "operation": operation,
                },
                options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('PPi')#[np.intp, np.intp, np.int32])
    return func
Пример #6
0
def get_divarray_function(left_dtype, right_dtype, rslt_dtype, pitch = True):
    type_left = dtype_to_ctype(left_dtype)
    type_right = dtype_to_ctype(right_dtype)
    type_rslt = dtype_to_ctype(rslt_dtype)

    name = "divarray"
    operation = "/"
    
    if pitch:
        func = SourceModule(
                pitch_array_op_template % {
                    "name": name,
                    "dest_type": type_rslt,
                    "left_type": type_left,
                    "right_type": type_right,
                    "operation": operation,
                },
                options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('iiPiPiPi')#[np.int32, np.int32, np.intp, np.int32,
        #             np.intp, np.int32, np.intp, np.int32])
    else:
        func = SourceModule(
                non_pitch_array_op_template % {
                    "name": name,
                    "dest_type": type_rslt,
                    "left_type": type_left,
                    "right_type": type_right,
                    "operation": operation,
                },
                options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('PPPi')#[np.intp, np.intp, np.intp, np.int32])
    return func
Пример #7
0
def get_take_kernel(dtype, idx_dtype, vec_count=1):
    ctx = {
            "idx_tp": dtype_to_ctype(idx_dtype),
            "tp": dtype_to_ctype(dtype),
            "tex_tp": dtype_to_ctype(dtype, with_fp_tex_hack=True),
            }

    args = [VectorArg(idx_dtype, "idx")] + [
            VectorArg(dtype, "dest"+str(i))for i in range(vec_count)] + [
                ScalarArg(np.intp, "n")
            ]
    preamble = "#include <pycuda-helpers.hpp>\n\n" + "\n".join(
        "texture <%s, 1, cudaReadModeElementType> tex_src%d;" % (ctx["tex_tp"], i)
        for i in range(vec_count))
    body = (
            ("%(idx_tp)s src_idx = idx[i];\n" % ctx)
            + "\n".join(
                "dest%d[i] = fp_tex1Dfetch(tex_src%d, src_idx);" % (i, i)
                for i in range(vec_count)))

    mod = get_elwise_module(args, body, "take", preamble=preamble)
    func = mod.get_function("take")
    tex_src = [mod.get_texref("tex_src%d" % i) for i in range(vec_count)]
    func.prepare("P"+(vec_count*"P")+np.dtype(np.uintp).char, texrefs=tex_src)
    return func, tex_src
Пример #8
0
def get_angle_function(dtypein, dtypeout, pitch = True):
    type_src = dtype_to_ctype(dtypein)
    type_dest = dtype_to_ctype(dtypeout)
    name = "angle_function"
    if dtypeout == np.float32:
        fletter = "f"
    else:
        fletter = ""
    
    if pitch:
        func = SourceModule(
                pitch_angle_template % {
                    "name": name,
                    "dest_type": type_dest,
                    "src_type": type_src,
                    "fletter": fletter,
                },
                options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('iiPiPi')
        #        [np.int32, np.int32, np.intp, np.int32, np.intp, np.int32])
    else:
        func = SourceModule(
                non_pitch_angle_template % {
                    "name": name,
                    "dest_type": type_dest,
                    "src_type": type_src,
                    "fletter": fletter,
                },
                options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('PPi')#[np.intp, np.intp, np.int32])
    return func
Пример #9
0
def get_divarray_function(left_dtype, right_dtype, rslt_dtype, pitch = True):
    type_left = dtype_to_ctype(left_dtype)
    type_right = dtype_to_ctype(right_dtype)
    type_rslt = dtype_to_ctype(rslt_dtype)

    name = "divarray"
    operation = "/"

    if pitch:
        func = func_compile(name, pitch_array_op_template % {"name": name,
                   "dest_type": type_rslt,
                   "left_type": type_left,
                   "right_type": type_right,
                   "operation": operation,
                   })
        func.prepare([np.int32, np.int32, np.intp, np.int32, np.intp, np.int32, np.intp, np.int32])
    else:
        func = func_compile(name, non_pitch_array_op_template % {"name": name,
                   "dest_type": type_rslt,
                   "left_type": type_left,
                   "right_type": type_right,
                   "operation": operation,
                   })
        func.prepare([np.intp, np.intp, np.intp, np.int32])
    return func
Пример #10
0
def get_powscalar_function(src_type, dest_type, pitch = True):
    type_src = dtype_to_ctype(src_type)
    type_dest = dtype_to_ctype(dest_type)
    name = "powscalar"
    operation = "pow"
    
    if pitch:
        func = SourceModule(
                pitch_left_scalar_func_template % {
                    "name": name,
                    "src_type": type_src,
                    "dest_type": type_dest,
                    "operation": operation,
                    "fletter": 'f' if src_type == np.float32 else '',
                },
                options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('iiPiPi'+np.dtype(dest_type).char)#[np.int32, np.int32, np.intp, np.int32,
        #              np.intp, np.int32, _get_type(dest_type)])
    else:
        func = SourceModule(
                non_pitch_left_scalar_func_template % {
                    "name": name,
                    "src_type": type_src,
                    "dest_type": type_dest,
                    "operation": operation,
                    "fletter": 'f' if src_type == np.float32 else '',
                },
                options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('PP'+np.dtype(dest_type).char+'i')#[np.intp, np.intp, _get_type(dest_type), np.int32])
    return func
Пример #11
0
def _get_eigsq_kernel(dtype_s, dtype_q):       
    template = """
#include <pycuda/pycuda-complex.hpp>

__global__ void
eigsq_Kernel(%(types)s* d_S, %(typeq)s* d_q, %(types)s thres, int size)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int total = blockDim.x * gridDim.x;

    for(int i = tid; i < size; i += total)
    {
        %(types)s s = d_S[i];
        %(typeq)s q = d_q[i];

        if(fabs%(iff)s(s) > thres)
        {
            d_q[i] = q / s;
        }else
        {
            d_q[i] = 0.0;
        }
    }
}

    """
    mod = SourceModule(template % {
                       "types": dtype_to_ctype(dtype_s),
                       "typeq": dtype_to_ctype(dtype_q),
                       "iff": "f" if dtype_q == np.float32 else ""})
    func = mod.get_function("eigsq_Kernel")
    func.prepare([np.intp, np.intp, 
                  np.double if dtype_s == np.double else np.float32,
                  np.int32])
    return func
Пример #12
0
def get_by_index(src_gpu, ind):
    """
    Get values in a GPUArray by index.

    Parameters
    ----------
    src_gpu : pycuda.gpuarray.GPUArray
        GPUArray instance from which to extract values.
    ind : pycuda.gpuarray.GPUArray or numpy.ndarray
        Array of element indices to set. Must have an integer dtype.

    Returns
    -------
    res_gpu : pycuda.gpuarray.GPUArray
        GPUArray with length of `ind` and dtype of `src_gpu` containing
        selected values.

    Examples
    --------
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import misc
    >>> src = np.random.rand(5).astype(np.float32)
    >>> src_gpu = gpuarray.to_gpu(src)
    >>> ind = gpuarray.to_gpu(np.array([0, 2, 4]))
    >>> res_gpu = misc.get_by_index(src_gpu, ind)
    >>> np.allclose(res_gpu.get(), src[[0, 2, 4]])
    True

    Notes
    -----
    Only supports 1D index arrays.

    May not be efficient for certain index patterns because of lack of inability
    to coalesce memory operations.
    """

    # Only support 1D index arrays:
    assert len(np.shape(ind)) == 1
    assert issubclass(ind.dtype.type, numbers.Integral)
    N = len(ind)
    if not isinstance(ind, gpuarray.GPUArray):
        ind = gpuarray.to_gpu(ind)
    dest_gpu = gpuarray.empty(N, dtype=src_gpu.dtype)

    # Manually handle empty index array because it will cause the kernel to
    # fail if processed:
    if N == 0:
        return dest_gpu
    try:
        func = get_by_index.cache[(src_gpu.dtype, ind.dtype)]
    except KeyError:
        data_ctype = tools.dtype_to_ctype(src_gpu.dtype)
        ind_ctype = tools.dtype_to_ctype(ind.dtype)
        v = "{data_ctype} *dest, {ind_ctype} *ind, {data_ctype} *src".format(data_ctype=data_ctype, ind_ctype=ind_ctype)
        func = elementwise.ElementwiseKernel(v, "dest[i] = src[ind[i]]")
        get_by_index.cache[(src_gpu.dtype, ind.dtype)] = func
    func(dest_gpu, ind, src_gpu, range=slice(0, N, 1))
    return dest_gpu
Пример #13
0
def get_scalardiv_function(src_type, dest_type, pitch = True):
    type_src = dtype_to_ctype(src_type)
    type_dest = dtype_to_ctype(dest_type)
    
    name = "scalardiv"
    operation = "/"
    
    if pitch:
        func = SourceModule(
                pitch_right_scalar_op_template % {
                    "name": name,
                    "src_type": type_src,
                    "dest_type": type_dest,
                    "operation": operation,
                },
                options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('iiPiPi'+np.dtype(dest_type).char)#[np.int32, np.int32, np.intp, np.int32,
        #              np.intp, np.int32, _get_type(dest_type)])
    else:
        func = SourceModule(
                non_pitch_right_scalar_op_template % {
                    "name": name,
                    "src_type": type_src,
                    "dest_type": type_dest,
                    "operation": operation,
                },
                options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('PP'+np.dtype(dest_type).char+'i')#[np.intp, np.intp, _get_type(dest_type), np.int32])
    return func
Пример #14
0
def get_complex_function(real_type, imag_type, result_type, pitch = True):
    type_real = dtype_to_ctype(real_type)
    type_imag = dtype_to_ctype(imag_type)
    type_result = dtype_to_ctype(result_type)
    
    name = "makecomplex"
    
    if pitch:
        func = SourceModule(
                pitch_complex_template % {
                    "name": name,
                    "real_type": type_real,
                    "imag_type": type_imag,
                    "result_type": type_result
                },
                options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('iiPiPPi')#[np.int32, np.int32, np.intp, np.int32,
        #              np.intp, np.intp, np.int32])
    else:
        func = SourceModule(
                non_pitch_complex_template % {
                    "name": name,
                    "real_type": type_real,
                    "imag_type": type_imag,
                    "result_type": type_result
                },
                options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('PPPi')#[np.intp, np.intp, np.intp, np.int32])
    return func
Пример #15
0
def get_norm_kernel(dtype_x, dtype_out):
    return ElementwiseKernel(
            "%(tp_x)s *x, %(tp_z)s *z" % {
                "tp_x": dtype_to_ctype(dtype_x),
                "tp_z": dtype_to_ctype(dtype_out),
                },
            "z[i] = norm(x[i])",
            "normalize")
Пример #16
0
def get_accum_diff_sq_kernel(dtype_x, dtype_z):
    return ElementwiseKernel(
            "%(tp_a)s *x,  %(tp_c)s *z" % {
                "tp_a": dtype_to_ctype(dtype_x),
                "tp_c": dtype_to_ctype(dtype_z),
                },
            "x[i] += norm(z[i]) ",
            "chisq_accum")    
Пример #17
0
def get_imag_kernel(dtype, real_dtype):
    return get_elwise_kernel(
            "%(tp)s *y, %(real_tp)s *z" % {
                "tp": dtype_to_ctype(dtype),
                "real_tp": dtype_to_ctype(real_dtype),
                },
            "z[i] = imag(y[i])",
            "imag")
Пример #18
0
def get_axpbz_kernel(dtype_x, dtype_z):
    return get_elwise_kernel(
            "%(tp_z)s a, %(tp_x)s *x,%(tp_z)s b, %(tp_z)s *z" % {
                "tp_x": dtype_to_ctype(dtype_x),
                "tp_z": dtype_to_ctype(dtype_z)
                },
            "z[i] = a * x[i] + b",
            "axpb")
Пример #19
0
def get_rdivide_elwise_kernel(dtype_x, dtype_z):
    return get_elwise_kernel(
            "%(tp_x)s *x, %(tp_z)s y, %(tp_z)s *z" % {
                "tp_x": dtype_to_ctype(dtype_x),
                "tp_z": dtype_to_ctype(dtype_z),
                },
            "z[i] = y / x[i]",
            "divide_r")
Пример #20
0
def get_copy_kernel(dtype_dest, dtype_src):
    return get_elwise_kernel(
            "%(tp_dest)s *dest, %(tp_src)s *src" % {
                "tp_dest": dtype_to_ctype(dtype_dest),
                "tp_src": dtype_to_ctype(dtype_src),
                },
            "dest[i] = src[i]",
            "copy")
Пример #21
0
def get_scalar_op_kernel(dtype_x, dtype_y, operator):
    return get_elwise_kernel(
            "%(tp_x)s *x, %(tp_a)s a, %(tp_y)s *y" % {
                "tp_x": dtype_to_ctype(dtype_x),
                "tp_y": dtype_to_ctype(dtype_y),
                "tp_a": dtype_to_ctype(dtype_x),
                },
            "y[i] = x[i] %s a" % operator,
            "scalarop_kernel")
Пример #22
0
def get_binary_func_kernel(func, dtype_x, dtype_y, dtype_z):
    return get_elwise_kernel(
            "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z" % {
                "tp_x": dtype_to_ctype(dtype_x),
                "tp_y": dtype_to_ctype(dtype_y),
                "tp_z": dtype_to_ctype(dtype_z),
                },
            "z[i] = %s(x[i], y[i])" % func,
            func+"_kernel")
Пример #23
0
def get_binary_op_kernel(dtype_x, dtype_y, dtype_z, operator):
    return get_elwise_kernel(
            "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z" % {
                "tp_x": dtype_to_ctype(dtype_x),
                "tp_y": dtype_to_ctype(dtype_y),
                "tp_z": dtype_to_ctype(dtype_z),
                },
            "z[i] = x[i] %s y[i]" % operator,
            "multiply")
Пример #24
0
def get_multiply_kernel(dtype_x, dtype_y, dtype_z):
    return get_elwise_kernel(
            "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z" % {
                "tp_x": dtype_to_ctype(dtype_x),
                "tp_y": dtype_to_ctype(dtype_y),
                "tp_z": dtype_to_ctype(dtype_z),
                },
            "z[i] = x[i] * y[i]",
            "multiply")
Пример #25
0
def get_axpbyz_kernel(dtype_x, dtype_y, dtype_z):
    return get_elwise_kernel(
            "%(tp_x)s a, %(tp_x)s *x, %(tp_y)s b, %(tp_y)s *y, %(tp_z)s *z" % {
                "tp_x": dtype_to_ctype(dtype_x),
                "tp_y": dtype_to_ctype(dtype_y),
                "tp_z": dtype_to_ctype(dtype_z),
                },
            "z[i] = a*x[i] + b*y[i]",
            "axpbyz")
Пример #26
0
def get_gt_kernel(dtype_x, dtype_y, dtype_z):
    return get_elwise_kernel(
            "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z" % {
                "tp_x": dtype_to_ctype(dtype_x),
                "tp_y": dtype_to_ctype(dtype_y),
                "tp_z": dtype_to_ctype(dtype_z),
                },
            "z[i] = x[i] > y[i]",
            "gt")       
Пример #27
0
def get_correlate_kernel(dtype_x, dtype_y,dtype_out):
    return ElementwiseKernel(
            "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z" % {
                "tp_x": dtype_to_ctype(dtype_x),
                "tp_y": dtype_to_ctype(dtype_y),
                "tp_z": dtype_to_ctype(dtype_out),
                },
            "z[i] = conj(x[i]) * y[i]",
            "correlate")
Пример #28
0
    def get_inds(self, src, dest, inds, src_shift=0):
        """
        Set `dest[i] = src[src_shift+inds[i]] for i in range(len(inds))`
        """

        assert src.dtype == dest.dtype
        inds_ctype = dtype_to_ctype(inds.dtype)
        data_ctype = dtype_to_ctype(src.dtype)
        
        func = get_inds_kernel(inds_ctype, data_ctype)
        func(dest, int(src_shift), inds, src, range=slice(0, len(inds), 1) )
Пример #29
0
def get_unary_func_kernel(func_name, in_dtype, out_dtype=None):
    if out_dtype is None:
        out_dtype = in_dtype

    return get_elwise_kernel(
            "%(tp_in)s *y, %(tp_out)s *z" % {
                "tp_in": dtype_to_ctype(in_dtype),
                "tp_out": dtype_to_ctype(out_dtype),
                },
            "z[i] = %s(y[i])" % func_name,
            "%s_kernel" % func_name)
Пример #30
0
def get_subset_sum_kernel(dtype_out, dtype_subset, dtype_in):
    if dtype_out is None:
        dtype_out = dtype_in

    return ReductionKernel(dtype_out, "0", "a+b",
            map_expr="in[lookup_tbl[i]]",
            arguments="const %(tp_lut)s *lookup_tbl, const %(tp)s *in"
            % {
                "tp": dtype_to_ctype(dtype_in),
                "tp_lut": dtype_to_ctype(dtype_subset),
                })
Пример #31
0
def get_linear_combination_kernel(summand_descriptors, dtype_z):
    from pycuda.tools import dtype_to_ctype
    from pycuda.elementwise import \
            VectorArg, ScalarArg, get_elwise_module

    args = []
    preamble = ["#include <pycuda-helpers.hpp>\n\n"]
    loop_prep = []
    summands = []
    tex_names = []

    for i, (is_gpu_scalar, scalar_dtype, vector_dtype) in \
            enumerate(summand_descriptors):
        if is_gpu_scalar:
            preamble.append(
                "texture <%s, 1, cudaReadModeElementType> tex_a%d;" %
                (dtype_to_ctype(scalar_dtype, with_fp_tex_hack=True), i))
            args.append(VectorArg(vector_dtype, "x%d" % i))
            tex_names.append("tex_a%d" % i)
            loop_prep.append("%s a%d = fp_tex1Dfetch(tex_a%d, 0)" %
                             (dtype_to_ctype(scalar_dtype), i, i))
        else:
            args.append(ScalarArg(scalar_dtype, "a%d" % i))
            args.append(VectorArg(vector_dtype, "x%d" % i))

        summands.append("a%d*x%d[i]" % (i, i))

    args.append(VectorArg(dtype_z, "z"))
    args.append(ScalarArg(np.uintp, "n"))

    mod = get_elwise_module(args,
                            "z[i] = " + " + ".join(summands),
                            "linear_combination",
                            preamble="\n".join(preamble),
                            loop_prep=";\n".join(loop_prep))

    func = mod.get_function("linear_combination")
    tex_src = [mod.get_texref(tn) for tn in tex_names]
    func.prepare("".join(arg.struct_char for arg in args), texrefs=tex_src)

    return func, tex_src
Пример #32
0
def get_subset_minmax_kernel(what, dtype, dtype_subset):
    if dtype == np.float64:
        reduce_expr = "f%s(a,b)" % what
    elif dtype == np.float32:
        reduce_expr = "f%sf(a,b)" % what
    elif dtype.kind in "iu":
        reduce_expr = "%s(a,b)" % what
    else:
        raise TypeError("unsupported dtype specified")

    return ReductionKernel(dtype,
                           neutral=get_minmax_neutral(what, dtype),
                           reduce_expr="%(reduce_expr)s" %
                           {"reduce_expr": reduce_expr},
                           map_expr="in[lookup_tbl[i]]",
                           arguments="const %(tp_lut)s *lookup_tbl, "
                           "const %(tp)s *in" % {
                               "tp": dtype_to_ctype(dtype),
                               "tp_lut": dtype_to_ctype(dtype_subset),
                           },
                           preamble="#define MY_INFINITY (1./0)")
Пример #33
0
 def get_update_func(self, dtypes):
     type_dict = {k: dtype_to_ctype(dtypes[k]) for k in dtypes}
     type_dict.update({'fletter': 'f' if type_dict['n'] == 'float' else ''})
     mod = SourceModule(self.get_update_template() % type_dict,
                        options=self.compile_options)
     func = mod.get_function("update")
     func.prepare('i' + np.dtype(dtypes['dt']).char + 'i' + 'P' *
                  (len(type_dict) - 2))
     func.block = (128, 1, 1)
     func.grid = (min(6 * cuda.Context.get_device().MULTIPROCESSOR_COUNT,
                      (self.num_comps - 1) / 128 + 1), 1)
     return func
Пример #34
0
    def __init__(
        self,
        dtype,
        scan_expr,
        neutral=None,
        name_prefix="scan",
        options=None,
        preamble="",
        devices=None,
    ):

        if isinstance(self, ExclusiveScanKernel) and neutral is None:
            raise ValueError("neutral element is required for exclusive scan")

        dtype = self.dtype = np.dtype(dtype)
        self.neutral = neutral

        # Thrust says these are good for GT200
        self.scan_wg_size = 128
        self.update_wg_size = 256
        self.scan_wg_seq_batches = 6

        kw_values = dict(
            preamble=preamble,
            name_prefix=name_prefix,
            scan_type=dtype_to_ctype(dtype),
            scan_expr=scan_expr,
            neutral=neutral,
        )

        scan_intervals_src = str(
            SCAN_INTERVALS_SOURCE.render(
                wg_size=self.scan_wg_size,
                wg_seq_batches=self.scan_wg_seq_batches,
                **kw_values))
        scan_intervals_prg = SourceModule(scan_intervals_src,
                                          options=options,
                                          no_extern_c=True)
        self.scan_intervals_knl = scan_intervals_prg.get_function(
            name_prefix + "_scan_intervals")
        self.scan_intervals_knl.prepare("PIIPP")

        final_update_src = str(
            self.final_update_tp.render(wg_size=self.update_wg_size,
                                        **kw_values))

        final_update_prg = SourceModule(final_update_src,
                                        options=options,
                                        no_extern_c=True)
        self.final_update_knl = final_update_prg.get_function(name_prefix +
                                                              "_final_update")
        self.final_update_knl.prepare("PIIP")
Пример #35
0
    def test_3d_fp_textures(self):
        orden = "C"
        npoints = 32

        for prec in [np.int16, np.float32, np.float64, np.complex64, np.complex128]:
            prec_str = dtype_to_ctype(prec)
            if prec == np.complex64:
                fpName_str = "fp_tex_cfloat"
            elif prec == np.complex128:
                fpName_str = "fp_tex_cdouble"
            elif prec == np.float64:
                fpName_str = "fp_tex_double"
            else:
                fpName_str = prec_str
            A_cpu = np.zeros([npoints, npoints, npoints], order=orden, dtype=prec)
            A_cpu[:] = np.random.rand(npoints, npoints, npoints)[:]
            A_gpu = gpuarray.zeros(A_cpu.shape, dtype=prec, order=orden)

            myKern = """
            #include <pycuda-helpers.hpp>
            texture<fpName, 3, cudaReadModeElementType> mtx_tex;

            __global__ void copy_texture(cuPres *dest)
            {
              int row   = blockIdx.x*blockDim.x + threadIdx.x;
              int col   = blockIdx.y*blockDim.y + threadIdx.y;
              int slice = blockIdx.z*blockDim.z + threadIdx.z;
              dest[row + col*blockDim.x*gridDim.x + slice*blockDim.x*gridDim.x*blockDim.y*gridDim.y] = fp_tex3D(mtx_tex, slice, col, row);
            }
            """
            myKern = myKern.replace("fpName", fpName_str)
            myKern = myKern.replace("cuPres", prec_str)
            mod = SourceModule(myKern)

            copy_texture = mod.get_function("copy_texture")
            mtx_tex = mod.get_texref("mtx_tex")
            cuBlock = (8, 8, 8)
            if cuBlock[0] > npoints:
                cuBlock = (npoints, npoints, npoints)
            cuGrid = (
                npoints // cuBlock[0] + 1 * (npoints % cuBlock[0] != 0),
                npoints // cuBlock[1] + 1 * (npoints % cuBlock[1] != 0),
                npoints // cuBlock[2] + 1 * (npoints % cuBlock[1] != 0),
            )
            copy_texture.prepare("P", texrefs=[mtx_tex])
            cudaArray = drv.np_to_array(A_cpu, orden, allowSurfaceBind=False)
            mtx_tex.set_array(cudaArray)
            copy_texture.prepared_call(cuGrid, cuBlock, A_gpu.gpudata)
            assert np.sum(np.abs(A_gpu.get() - np.transpose(A_cpu))) == np.array(
                0, dtype=prec
            )
            A_gpu.gpudata.free()
Пример #36
0
def get_dot_kernel(dtype_out, dtype_a=None, dtype_b=None):
    if dtype_out is None:
        dtype_out = dtype_a

    if dtype_b is None:
        if dtype_a is None:
            dtype_b = dtype_out
        else:
            dtype_b = dtype_a

    if dtype_a is None:
        dtype_a = dtype_out

    return ReductionKernel(dtype_out,
                           neutral="0",
                           reduce_expr="a+b",
                           map_expr="a[i]*b[i]",
                           arguments="const %(tp_a)s *a, const %(tp_b)s *b" % {
                               "tp_a": dtype_to_ctype(dtype_a),
                               "tp_b": dtype_to_ctype(dtype_b),
                           },
                           keep=True)
Пример #37
0
def get_fill_zeros_kernel(data_dtype):
    template = """
__global__ void update(%(data_ctype)s* dest,
                       %(inds_ctype)s* inds,
                       int N)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int total_threads = gridDim.x * blockDim.x;

    for(int i = tid; i < N; i += total_threads)
    {
        dest[inds[i]] = 0;
    }
}
"""
    mod = SourceModule(template % {"data_ctype": dtype_to_ctype(data_dtype),
                                   "inds_ctype": dtype_to_ctype(np.int32)})
    func = mod.get_function("update")
    func.prepare('PPi')
    func.block = (128,1,1)
    func.grid = (16 * cuda.Context.get_device().MULTIPROCESSOR_COUNT, 1)
    return func
Пример #38
0
def get_pow_kernel(dtype):
    if dtype == np.float32:
        func = "powf"
    else:
        func = "pow"

    return get_elwise_kernel(
        "%(tp)s value, %(tp)s *y, %(tp)s *z" % {
            "tp": dtype_to_ctype(dtype),
        },
        "z[i] = %s(y[i], value)" % func,
        "pow_method",
    )
Пример #39
0
def get_abs_function(dtype, pitch=True):
    type_src = dtype_to_ctype(dtype)
    if dtype == np.complex128:
        operation = "pycuda::abs"
        type_dest = "double"
    elif dtype == np.complex64:
        operation = "pycuda::abs"
        type_dest = "float"
    elif dtype == np.float64:
        operation = "fabs"
        type_dest = "double"
    elif dtype == np.float32:
        operation = "fabsf"
        type_dest = "float"
    else:
        operation = "abs"
        type_dest = dtype_to_ctype(dtype)
    name = "abs_function"

    if pitch:
        func = SourceModule(pitch_template % {
            "name": name,
            "dest_type": type_dest,
            "src_type": type_src,
            "operation": operation,
        },
                            options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('iiPiPi')
        #        [np.int32, np.int32, np.intp, np.int32, np.intp, np.int32])
    else:
        func = SourceModule(non_pitch_template % {
            "name": name,
            "dest_type": type_dest,
            "src_type": type_src,
            "operation": operation,
        },
                            options=["--ptxas-options=-v"]).get_function(name)
        func.prepare('PPi')  #[np.intp, np.intp, np.int32])
    return func
Пример #40
0
def get_resize_function(dtype):
    type_src = dtype_to_ctype(dtype)
    name = "resize"
    func = SourceModule(reshape_template % {
        "name": name,
        "dest_type": type_src,
        "src_type": type_src,
        "operation": "",
    },
                        options=["--ptxas-options=-v"]).get_function(name)
    func.prepare('iiiiPiPi')  #[np.int32, np.int32, np.int32, np.int32,
    #              np.intp, np.int32, np.intp, np.int32])
    return func
Пример #41
0
def get_subset_dot_kernel(dtype_out, dtype_subset, dtype_a=None, dtype_b=None):
    if dtype_out is None:
        dtype_out = dtype_a

    if dtype_b is None:
        if dtype_a is None:
            dtype_b = dtype_out
        else:
            dtype_b = dtype_a

    if dtype_a is None:
        dtype_a = dtype_out

    # important: lookup_tbl must be first--it controls the length
    return ReductionKernel(dtype_out, neutral="0",
            reduce_expr="a+b", map_expr="a[lookup_tbl[i]]*b[lookup_tbl[i]]",
            arguments="const %(tp_lut)s *lookup_tbl, "
            "const %(tp_a)s *a, const %(tp_b)s *b" % {
            "tp_a": dtype_to_ctype(dtype_a),
            "tp_b": dtype_to_ctype(dtype_b),
            "tp_lut": dtype_to_ctype(dtype_subset),
            })
Пример #42
0
def get_put_kernel(dtype, idx_dtype, vec_count=1):
    ctx = {
        "idx_tp": dtype_to_ctype(idx_dtype),
        "tp": dtype_to_ctype(dtype),
    }

    args = (
        [
            VectorArg(idx_dtype, "gmem_dest_idx"),
        ]
        + [VectorArg(dtype, "dest%d" % i) for i in range(vec_count)]
        + [VectorArg(dtype, "src%d" % i) for i in range(vec_count)]
        + [ScalarArg(np.intp, "n")]
    )

    body = "%(idx_tp)s dest_idx = gmem_dest_idx[i];\n" % ctx + "\n".join(
        "dest%d[dest_idx] = src%d[i];" % (i, i) for i in range(vec_count)
    )

    func = get_elwise_module(args, body, "put").get_function("put")
    func.prepare("P" + (2 * vec_count * "P") + np.dtype(np.uintp).char)
    return func
Пример #43
0
def get_pow_array_kernel(dtype_x, dtype_y, dtype_z, is_base_array,
                         is_exp_array):
    """
    Returns the kernel for the operation: ``z = x ** y``
    """
    if dtype_z == np.float32:
        func = "powf"
    else:
        # FIXME: Casting args to double-precision not
        # ideal for all cases (ex. int args)
        func = "pow"

    if not is_base_array and is_exp_array:
        x_ctype = "%(tp_x)s x"
        y_ctype = "%(tp_y)s *y"
        func = "%s(x,y[i])" % func

    elif is_base_array and is_exp_array:
        x_ctype = "%(tp_x)s *x"
        y_ctype = "%(tp_y)s *y"
        func = "%s(x[i],y[i])" % func

    elif is_base_array and not is_exp_array:
        x_ctype = "%(tp_x)s *x"
        y_ctype = "%(tp_y)s y"
        func = "%s(x[i],y)" % func

    else:
        raise AssertionError

    return get_elwise_kernel(
        (x_ctype + ", " + y_ctype + ", " + "%(tp_z)s *z") % {
            "tp_x": dtype_to_ctype(dtype_x),
            "tp_y": dtype_to_ctype(dtype_y),
            "tp_z": dtype_to_ctype(dtype_z),
        },
        "z[i] = %s" % func,
        name="pow_method")
Пример #44
0
def _get_eigsq_kernel(dtype_s, dtype_q):
    template = """
#include <pycuda/pycuda-complex.hpp>

__global__ void
eigsq_Kernel(%(types)s* d_S, %(typeq)s* d_q, %(types)s thres, int size)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int total = blockDim.x * gridDim.x;

    for(int i = tid; i < size; i += total)
    {
        %(types)s s = d_S[i];
        %(typeq)s q = d_q[i];

        if(fabs%(iff)s(s) > thres)
        {
            d_q[i] = q / s;
        }else
        {
            d_q[i] = 0.0;
        }
    }
}

    """
    mod = SourceModule(
        template % {
            "types": dtype_to_ctype(dtype_s),
            "typeq": dtype_to_ctype(dtype_q),
            "iff": "f" if dtype_q == np.float32 else ""
        })
    func = mod.get_function("eigsq_Kernel")
    func.prepare([
        np.intp, np.intp, np.double if dtype_s == np.double else np.float32,
        np.int32
    ])
    return func
Пример #45
0
def get_take_kernel(dtype, idx_dtype, vec_count=1):
    ctx = {
        "idx_tp": dtype_to_ctype(idx_dtype),
        "tp": dtype_to_ctype(dtype),
        "tex_tp": dtype_to_ctype(dtype, with_fp_tex_hack=True),
    }

    args = [VectorArg(idx_dtype, "idx")
            ] + [VectorArg(dtype, "dest" + str(i))
                 for i in range(vec_count)] + [ScalarArg(np.intp, "n")]
    preamble = "#include <pycuda-helpers.hpp>\n\n" + "\n".join(
        "texture <%s, 1, cudaReadModeElementType> tex_src%d;" %
        (ctx["tex_tp"], i) for i in range(vec_count))
    body = (("%(idx_tp)s src_idx = idx[i];\n" % ctx) +
            "\n".join("dest%d[i] = fp_tex1Dfetch(tex_src%d, src_idx);" % (i, i)
                      for i in range(vec_count)))

    mod = get_elwise_module(args, body, "take", preamble=preamble)
    func = mod.get_function("take")
    tex_src = [mod.get_texref("tex_src%d" % i) for i in range(vec_count)]
    func.prepare("P" + (vec_count * "P") + np.dtype(np.uintp).char,
                 texrefs=tex_src)
    return func, tex_src
Пример #46
0
def eye(N, dtype=np.float32):
    """
    Construct a 2D matrix with ones on the diagonal and zeros elsewhere.

    Constructs a matrix in device memory whose diagonal elements
    are set to 1 and non-diagonal elements are set to 0.

    Parameters
    ----------
    N : int
        Number of rows or columns in the output matrix.
    dtype : type
        Matrix data type.

    Returns
    -------
    e_gpu : pycuda.gpuarray.GPUArray
        Diagonal matrix of dimensions `[N, N]` with diagonal values
        set to 1.

    Examples
    --------
    >>> import pycuda.driver as drv
    >>> import pycuda.gpuarray as gpuarray
    >>> import pycuda.autoinit
    >>> import numpy as np
    >>> import linalg
    >>> linalg.init()
    >>> N = 5
    >>> e_gpu = linalg.eye(N)
    >>> np.all(e_gpu.get() == np.eye(N))
    True
    >>> e_gpu = linalg.eye(N, np.complex64)
    >>> np.all(e_gpu.get() == np.eye(N, dtype=np.complex64))
    True

    """

    if dtype not in [np.float32, np.float64, np.complex64,
                     np.complex128]:
        raise ValueError('unrecognized type')
    if N <= 0:
        raise ValueError('N must be greater than 0')
    alloc = misc._global_cublas_allocator

    e_gpu = misc.zeros((N, N), dtype, allocator=alloc)
    func = el.ElementwiseKernel("{ctype} *e".format(ctype=tools.dtype_to_ctype(dtype)),
                                "e[i] = 1")
    func(e_gpu, slice=slice(0, N*N, N+1))
    return e_gpu
Пример #47
0
    def __init__(self, model, **kwargs):
        self.dtype = dtype_to_ctype(kwargs.pop("dtype", np.float32))
        self.model = model
        self.solver = model.solver.__name__
        self.float_char = "f" if self.dtype == "float" else ""

        self.params_gdata = kwargs.pop("params_gdata", [])
        dct = kwargs.pop("inputs_gdata", dict())
        self.inputs = {k: {"value": v, "used": False} for k, v in dct.items()}

        cls = self.model.__class__

        self.has_post = np.all([cls.post != base.post for base in cls.__bases__])

        self.generate()
Пример #48
0
def _scale_inplace(a, x_gpu):
    """
    Scale an array by a specified value in-place.
    """

    # Cache the kernel to avoid invoking the compiler if the
    # specified scale factor and array type have already been encountered:
    try:
        func = _scale_inplace.cache[(a, x_gpu.dtype)]
    except KeyError:
        ctype = tools.dtype_to_ctype(x_gpu.dtype)
        func = el.ElementwiseKernel(
            "{ctype} a, {ctype} *x".format(ctype=ctype), "x[i] /= a")
        _scale_inplace.cache[(a, x_gpu.dtype)] = func
    func(x_gpu.dtype.type(a), x_gpu)
Пример #49
0
    def test_2d_fp_texturesLayered(self):
        orden = "F"
        npoints = 32

        for prec in [
                np.int16, np.float32, np.float64, np.complex64, np.complex128
        ]:
            prec_str = dtype_to_ctype(prec)
            if prec == np.complex64: fpName_str = 'fp_tex_cfloat'
            elif prec == np.complex128: fpName_str = 'fp_tex_cdouble'
            elif prec == np.float64: fpName_str = 'fp_tex_double'
            else: fpName_str = prec_str
            A_cpu = np.zeros([npoints, npoints], order=orden, dtype=prec)
            A_cpu[:] = np.random.rand(npoints, npoints)[:]
            A_gpu = gpuarray.zeros(A_cpu.shape, dtype=prec, order=orden)

            myKern = '''
            #include <pycuda-helpers.hpp>
            texture<fpName, cudaTextureType2DLayered, cudaReadModeElementType> mtx_tex;

            __global__ void copy_texture(cuPres *dest)
            {
              int row = blockIdx.x*blockDim.x + threadIdx.x;
              int col = blockIdx.y*blockDim.y + threadIdx.y;

              dest[row + col*blockDim.x*gridDim.x] = fp_tex2DLayered(mtx_tex, col, row, 1);
            }
            '''
            myKern = myKern.replace('fpName', fpName_str)
            myKern = myKern.replace('cuPres', prec_str)
            mod = SourceModule(myKern)

            copy_texture = mod.get_function("copy_texture")
            mtx_tex = mod.get_texref("mtx_tex")
            cuBlock = (16, 16, 1)
            if cuBlock[0] > npoints:
                cuBlock = (npoints, npoints, 1)
            cuGrid = (npoints // cuBlock[0] + 1 * (npoints % cuBlock[0] != 0),
                      npoints // cuBlock[1] + 1 * (npoints % cuBlock[1] != 0),
                      1)
            copy_texture.prepare('P', texrefs=[mtx_tex])
            cudaArray = drv.np_to_array(A_cpu, orden, allowSurfaceBind=True)
            mtx_tex.set_array(cudaArray)
            copy_texture.prepared_call(cuGrid, cuBlock, A_gpu.gpudata)
            assert np.sum(np.abs(A_gpu.get() -
                                 np.transpose(A_cpu))) == np.array(0,
                                                                   dtype=prec)
            A_gpu.gpudata.free()
Пример #50
0
def get_resize_function(dtype):
    type_src = dtype_to_ctype(dtype)
    name = "resize"

    func = func_compile(
        name, reshape_template % {
            "name": name,
            "dest_type": type_src,
            "src_type": type_src,
            "operation": "",
        })
    func.prepare([
        np.int32, np.int32, np.int32, np.int32, np.intp, np.int32, np.intp,
        np.int32
    ])
    return func
Пример #51
0
 def process_signature(self):
     new_signature = []
     for key in self.signature:
         val = self.inputs.get(key, None)
         if val is None:
             isArray = True
             dtype = self.dtype
         else:
             val['used'] = True
             isArray = hasattr(val['value'], '__len__')
             if isArray:
                 dtype = dtype_to_ctype(val['value'].dtype)
             else:
                 dtype = self.dtype
         new_signature.append((key, dtype, isArray))
     return new_signature
Пример #52
0
    def __init__(self, model, **kwargs):
        self.dtype = dtype_to_ctype(kwargs.pop('dtype', np.float32))
        self.model = model
        self.solver = model.solver.__name__
        self.float_char = 'f' if self.dtype == 'float' else ''

        self.params_gdata = kwargs.pop('params_gdata', [])
        dct = kwargs.pop('inputs_gdata', dict())
        self.inputs = {k: {'value': v, 'used': False} for k, v in dct.items()}

        cls = self.model.__class__

        self.has_post = np.all(
            [cls.post != base.post for base in cls.__bases__])

        self.generate()
Пример #53
0
def _get_binaryop_vecmat_kernel(dtype, binary_op):
    template = Template("""
    #include <pycuda-complex.hpp>

    __global__ void opColVecToMat(const ${type} *mat, const ${type} *vec, ${type} *out,
                                   const int n, const int m){
        const int tx = threadIdx.x;
        const int ty = threadIdx.y;
        const int tidx = blockIdx.x * blockDim.x + threadIdx.x;
        const int tidy = blockIdx.y * blockDim.y + threadIdx.y;

        extern __shared__ ${type} shared_vec[];

        if ((ty == 0) & (tidx < n))
            shared_vec[tx] = vec[tidx];
        __syncthreads();

        if ((tidy < m) & (tidx < n)) {
            out[tidx*m+tidy] = mat[tidx*m+tidy] ${binary_op} shared_vec[tx];
        }
    }

    __global__ void opRowVecToMat(const ${type}* mat, const ${type}* vec, ${type}* out,
                                   const int n, const int m){
        const int tx = threadIdx.x;
        const int ty = threadIdx.y;
        const int tidx = blockIdx.x * blockDim.x + threadIdx.x;
        const int tidy = blockIdx.y * blockDim.y + threadIdx.y;

        extern __shared__ ${type} shared_vec[];

        if ((tx == 0) & (tidy < m))
            shared_vec[ty] = vec[tidy];
        __syncthreads();

        if ((tidy < m) & (tidx < n)) {
            out[tidx*m+tidy] = mat[tidx*m+tidy] ${binary_op} shared_vec[ty];
        }
    }""")
    cache_dir=None
    ctype = dtype_to_ctype(dtype)
    tmpl = template.substitute(type=ctype, binary_op=binary_op)
    mod = SourceModule(tmpl)

    add_row_vec_kernel = mod.get_function('opRowVecToMat')
    add_col_vec_kernel = mod.get_function('opColVecToMat')
    return add_row_vec_kernel, add_col_vec_kernel
Пример #54
0
    def get_gpu_kernel(self):
        self.gpu_block = (128,1,1)
        self.gpu_grid = (min( 6*cuda.Context.get_device().MULTIPROCESSOR_COUNT,\
                              (self.num-1)/self.gpu_block[0] + 1), 1)
        mod = SourceModule( \
                cuda_src % {"type": dtype_to_ctype(np.float64)},\
                options=self.compile_options)
        func = mod.get_function("dummy_synapse")
        func.prepare('PiiiiPPP')# [ np.intp,    # neuron state buffer
#                        np.int32,   # buffer width
#                        np.int32,   # buffer position
#                        np.int32,   # buffer delay steps
#                        np.int32,   # syn_num
#                        np.intp,    # pre-synaptic neuron list
#                        np.intp,    # delay step
#                        np.intp ] ) # cond array
        return func
Пример #55
0
    def set_by_inds_array(self, inds, data):
        """
        Set mapped data with array by integer indices.

        Parameters
        ----------
        inds : array-like
            Integer indices of data elements to update.
        data : numpy.ndarray
            Data to assign.
        """

        if np.isscalar(data):
            raise ValueError('data must be array-like')
        if len(np.shape(inds)) > 1:
            raise ValueError('index array must be 1D')
        N = len(inds)
        if N == 0:
            return

        if not isinstance(inds, gpuarray.GPUArray):
            inds = gpuarray.to_gpu(inds)
        if not issubclass(inds.dtype.type, numbers.Integral):
            raise ValueError('index array must contain integers')
        if N != len(data):
            raise ValueError('len(inds) = %s != %s = len(data)' %
                             (N, len(data)))

        if not isinstance(data, gpuarray.GPUArray):
            data = gpuarray.to_gpu(data)

        # Allocate data array if it doesn't exist:
        if not self.data:
            self.data = gpuarray.empty(N, data.dtype)
        else:
            assert self.data.dtype == data.dtype
        try:
            func = self.set_by_inds_array.cache[(inds.dtype, self.data.dtype)]
        except KeyError:
            inds_ctype = tools.dtype_to_ctype(inds.dtype)
            v = "{data_ctype} *dest, {inds_ctype} *inds, {data_ctype} *src".format(
                data_ctype=self.data_ctype, inds_ctype=inds_ctype)
            func = elementwise.ElementwiseKernel(v, "dest[inds[i]] = src[i]")
            self.set_by_inds_array.cache[(inds.dtype, self.data.dtype)] = func
        func(self.data, inds, data, range=slice(0, N, 1))
Пример #56
0
def get_diag_add_kernel(dtype):
    template = """
__global__ void
diag_add_Kernel(%(type)s* d_G, int ld, int size, %(type)s addin)
{
	int tid = threadIdx.x + blockIdx.x * blockDim.x;
	int total = gridDim.x * blockDim.x;
	
	for(int i = tid; i < size; i+=total)
	{
		d_G[i * ld + i] += addin;
	}

}
"""
    func = func_compile("diag_add_Kernel",
                        template % {"type": dtype_to_ctype(dtype)})
    return func
Пример #57
0
def get_realimag_function(dtype, real=True, pitch=True):
    type_src = dtype_to_ctype(dtype)

    if dtype == np.complex64:
        type_dest = "float"
        if real:
            operation = "pycuda::real"
            name = "real"
        else:
            operation = "pycuda::imag"
            name = "imag"
    elif dtype == np.complex128:
        type_dest = "double"
        if real:
            operation = "pycuda::real"
            name = "real"
        else:
            operation = "pycuda::imag"
            name = "imag"
    else:
        raise TypeError(
            "only support complex inputs as numpy.complex64 or numpy.complex128"
        )

    if pitch:
        func = func_compile(
            name, pitch_template % {
                "name": name,
                "dest_type": type_dest,
                "src_type": type_src,
                "operation": operation,
            })
        func.prepare(
            [np.int32, np.int32, np.intp, np.int32, np.intp, np.int32])
    else:
        func = func_compile(
            name, non_pitch_template % {
                "name": name,
                "dest_type": type_dest,
                "src_type": type_src,
                "operation": operation,
            })
        func.prepare([np.intp, np.intp, np.int32])
    return func
Пример #58
0
def get_fill_function(dtype, pitch=True):
    type_dst = dtype_to_ctype(dtype)
    name = "fill"

    if pitch:
        func = func_compile(
            name, fill_pitch_template % {
                "name": name,
                "type_dst": type_dst
            })
        func.prepare([np.int32, np.int32, np.intp, np.int32, dtype.type])
    else:
        func = func_compile(
            name, fill_nonpitch_template % {
                "name": name,
                "type_dst": type_dst
            })
        func.prepare([np.int32, np.intp, dtype.type])
    return func
Пример #59
0
def get_transpose_function(dtype, conj=False):

    src_type = dtype_to_ctype(dtype)
    name = "trans"
    operation = ""
    if conj:
        if dtype == np.complex128:
            operation = "pycuda::conj"
        elif dtype == np.complex64:
            operation = "pycuda::conj"

    func = func_compile(
        name, transpose_template % {
            "name": name,
            "type": src_type,
            "operation": operation
        })
    func.prepare([np.int32, np.int32, np.intp, np.int32, np.intp, np.int32])
    return func
Пример #60
0
def get_minmax_kernel(what, dtype):
    if dtype == np.float64:
        reduce_expr = "f%s(a,b)" % what
    elif dtype == np.float32:
        reduce_expr = "f%sf(a,b)" % what
    elif dtype.kind in "iu":
        reduce_expr = "%s(a,b)" % what
    else:
        raise TypeError("unsupported dtype specified")

    return ReductionKernel(
        dtype,
        neutral=get_minmax_neutral(what, dtype),
        reduce_expr=f"{reduce_expr}",
        arguments="const %(tp)s *in" % {
            "tp": dtype_to_ctype(dtype),
        },
        preamble="#define MY_INFINITY (1./0)",
    )