예제 #1
0
def get_take_put_kernel(context, dtype, idx_dtype, with_offsets, vec_count=1):
    """Build the elementwise kernel named "take_put".

    The generated body loads ``gmem_src_idx[i]`` and ``gmem_dest_idx[i]`` and
    then, for each of the *vec_count* vector pairs, assigns
    ``src<k>[src_idx]`` (or ``src<k>[src_idx+offset<k>]`` when
    *with_offsets* is true) to ``dest<k>[dest_idx]``.

    :arg context: forwarded to ``get_elwise_kernel``; its first device is
        handed to ``dtype_to_c_struct`` to produce the preamble.
    :arg dtype: dtype of the ``dest<k>``/``src<k>`` vector arguments.
    :arg idx_dtype: dtype of both index vectors and of the scalar offsets.
    :arg with_offsets: if true, one ``offset<k>`` scalar argument per vector
        pair is appended to the argument list.
    :arg vec_count: number of dest/src vector pairs.
    :returns: the result of ``get_elwise_kernel``.
    """
    c_types = {"idx_tp": dtype_to_ctype(idx_dtype), "tp": dtype_to_ctype(dtype)}
    vec_ids = list(range(vec_count))

    # Argument order: dests, dest/src index vectors, srcs, optional offsets.
    args = [VectorArg(dtype, "dest%d" % k) for k in vec_ids]
    args.append(VectorArg(idx_dtype, "gmem_dest_idx", with_offset=True))
    args.append(VectorArg(idx_dtype, "gmem_src_idx", with_offset=True))
    args.extend(VectorArg(dtype, "src%d" % k, with_offset=True) for k in vec_ids)
    if with_offsets:
        args.extend(ScalarArg(idx_dtype, "offset%d" % k) for k in vec_ids)

    # One copy statement per vector pair.
    if with_offsets:
        copy_lines = [
            "dest%d[dest_idx] = src%d[src_idx+offset%d];" % (k, k, k)
            for k in vec_ids
        ]
    else:
        copy_lines = ["dest%d[dest_idx] = src%d[src_idx];" % (k, k) for k in vec_ids]

    index_loads = (
        "%(idx_tp)s src_idx = gmem_src_idx[i];\n"
        "%(idx_tp)s dest_idx = gmem_dest_idx[i];\n" % c_types
    )
    body = index_loads + "\n".join(copy_lines)

    preamble = dtype_to_c_struct(context.devices[0], dtype)
    return get_elwise_kernel(context, args, body, preamble=preamble, name="take_put")
예제 #2
0
def get_axpbyz_kernel(context, dtype_x, dtype_y, dtype_z):
    """Build the "axpbyz" elementwise kernel computing ``z = a*x + b*y``.

    For complex operands the products are spelled with ``<name>_mul``; when
    exactly one operand is complex the other (real) product is wrapped in
    ``<name>_fromreal``. If either operand is complex, both terms are cast to
    the complex type of *dtype_z* and combined with ``<name>_add``.

    :arg context: forwarded to ``get_elwise_kernel``.
    :arg dtype_x: dtype of ``x`` and of the scalar ``a``.
    :arg dtype_y: dtype of ``y`` and of the scalar ``b``.
    :arg dtype_z: dtype of the output ``z``.
    :returns: the result of ``get_elwise_kernel``.
    """
    x_is_complex = dtype_x.kind == "c"
    y_is_complex = dtype_y.kind == "c"

    if x_is_complex:
        x_name = complex_dtype_to_name(dtype_x)
        ax = "%s_mul(a, x[i])" % x_name
    else:
        ax = "a*x[i]"

    if y_is_complex:
        y_name = complex_dtype_to_name(dtype_y)
        by = "%s_mul(b, y[i])" % y_name
    else:
        by = "b*y[i]"

    # Mixed real/complex: lift the real product into the complex type of
    # the other operand.
    if x_is_complex and not y_is_complex:
        by = "%s_fromreal(%s)" % (x_name, by)
    elif y_is_complex and not x_is_complex:
        ax = "%s_fromreal(%s)" % (y_name, ax)

    if not (x_is_complex or y_is_complex):
        result = "%s + %s" % (ax, by)
    else:
        result = "{root}_add({root}_cast({ax}), {root}_cast({by}))".format(
            ax=ax, by=by, root=complex_dtype_to_name(dtype_z)
        )

    type_names = {
        "tp_x": dtype_to_ctype(dtype_x),
        "tp_y": dtype_to_ctype(dtype_y),
        "tp_z": dtype_to_ctype(dtype_z),
    }
    signature = "%(tp_z)s *z, %(tp_x)s a, %(tp_x)s *x, %(tp_y)s b, %(tp_y)s *y" % type_names
    return get_elwise_kernel(context, signature, "z[i] = %s" % result, name="axpbyz")
예제 #3
0
def get_weighted_inner_kernel(dtype_x, dtype_y, dtype_w, dtype_out):
    """Build the "weighted_inner" reduction kernel.

    The per-element map expression is ``x[i]*y[i]`` for non-complex
    *dtype_x*, or ``conj(x[i]) * y[i]`` (via ``<name>_mul``/``<name>_conj``,
    with a real ``y`` lifted through ``<name>_fromreal``) for complex64 /
    complex128 *dtype_x*. It is then divided by ``w[i]``: with ``/`` for a
    float32/float64 weight, otherwise with ``<name>_divide``. Results are
    reduced with ``a+b`` starting from ``0``.

    :arg dtype_x: dtype of ``x``.
    :arg dtype_y: dtype of ``y``.
    :arg dtype_w: dtype of the weights ``w``.
    :arg dtype_out: output dtype handed to ``ReductionKernel``.
    :returns: a ``ReductionKernel`` created on ``mgr.state.context``.
    """
    x_is_complex = (dtype_x == np.complex64) or (dtype_x == np.complex128)

    if x_is_complex:
        x_name = complex_dtype_to_name(dtype_x)
        y_is_real = (dtype_y == np.float64) or (dtype_y == np.float32)
        if y_is_real:
            y_term = "%s_fromreal(y[i])" % x_name
        else:
            y_term = "y[i]"
        inner_map = "%s_mul(%s_conj(x[i]), %s)" % (x_name, x_name, y_term)
    else:
        inner_map = "x[i]*y[i]"

    w_is_real = (dtype_w == np.float64) or (dtype_w == np.float32)
    if w_is_real:
        inner_map += "/w[i]"
    else:
        # NOTE(review): the divide helper is named after dtype_x even when
        # dtype_x itself is not complex -- confirm this is intended.
        inner_map = "%s_divide(%s, %s)" % (
            complex_dtype_to_name(dtype_x), inner_map, "w[i]")

    type_names = {
        "tp_x": dtype_to_ctype(dtype_x),
        "tp_y": dtype_to_ctype(dtype_y),
        "tp_w": dtype_to_ctype(dtype_w),
    }
    arguments = (
        "__global const %(tp_x)s *x, __global const %(tp_y)s *y, "
        "__global const %(tp_w)s *w" % type_names
    )

    return ReductionKernel(
        mgr.state.context,
        dtype_out,
        neutral="0",
        arguments=arguments,
        reduce_expr="a+b",
        map_expr=inner_map,
        name="weighted_inner",
    )
예제 #4
0
def _get_dot_expr(dtype_out, dtype_a, dtype_b, conjugate_first,
        has_double_support, index_expr="i"):
    """Build the map expression multiplying ``a[idx]`` by ``b[idx]``.

    Missing dtypes are filled in first: *dtype_b* defaults to *dtype_a* (or
    to *dtype_out* when *dtype_a* is also None), and *dtype_out* defaults to
    the common dtype of *dtype_a* and *dtype_b* as reported by
    ``get_common_dtype``.

    :arg dtype_out: result dtype, or None to derive it.
    :arg dtype_a: dtype of ``a``.
    :arg dtype_b: dtype of ``b``, or None.
    :arg conjugate_first: if true and ``a`` is complex, ``a`` is wrapped in
        ``<name>_conj``.
    :arg has_double_support: forwarded to ``get_common_dtype``.
    :arg index_expr: index expression placed inside ``a[...]``/``b[...]``.
    :returns: a tuple ``(map_expr, dtype_out, dtype_b)`` with the defaults
        resolved.
    """
    if dtype_b is None:
        dtype_b = dtype_out if dtype_a is None else dtype_a

    if dtype_out is None:
        from pyopencl.compyte.array import get_common_dtype
        dtype_out = get_common_dtype(
                dtype_a.type(0), dtype_b.type(0),
                has_double_support)

    a_real_dtype = dtype_a.type(0).real.dtype
    b_real_dtype = dtype_b.type(0).real.dtype
    out_real_dtype = dtype_out.type(0).real.dtype

    a_is_complex = dtype_a.kind == "c"
    b_is_complex = dtype_b.kind == "c"
    out_is_complex = dtype_out.kind == "c"
    both_complex = a_is_complex and b_is_complex

    from pyopencl.elementwise import complex_dtype_to_name

    lhs = "a[%s]" % index_expr
    rhs = "b[%s]" % index_expr

    # Complex operands are converted to the output complex type whenever
    # both operands are complex, or whenever the output itself is complex.
    if both_complex or out_is_complex:
        if a_is_complex and dtype_a != dtype_out:
            lhs = "%s_cast(%s)" % (complex_dtype_to_name(dtype_out), lhs)
        if b_is_complex and dtype_b != dtype_out:
            rhs = "%s_cast(%s)" % (complex_dtype_to_name(dtype_out), rhs)

    # Real operands feeding a complex output get a plain C cast to the
    # output's real component type.
    if out_is_complex:
        if not a_is_complex and a_real_dtype != out_real_dtype:
            lhs = "(%s) (%s)" % (dtype_to_ctype(out_real_dtype), lhs)
        if not b_is_complex and b_real_dtype != out_real_dtype:
            rhs = "(%s) (%s)" % (dtype_to_ctype(out_real_dtype), rhs)

    if conjugate_first and a_is_complex:
        lhs = "%s_conj(%s)" % (
                complex_dtype_to_name(dtype_out), lhs)

    if both_complex:
        map_expr = "%s_mul(%s, %s)" % (
                complex_dtype_to_name(dtype_out), lhs, rhs)
    else:
        map_expr = "%s*%s" % (lhs, rhs)

    return map_expr, dtype_out, dtype_b
예제 #5
0
def get_put_kernel(context, dtype, idx_dtype, vec_count=1):
    """Build the elementwise kernel named "put".

    The generated body loads ``dest_idx = gmem_dest_idx[i]`` and, for each
    vector pair ``k``, stores into ``dest<k>[dest_idx]`` either ``src<k>[0]``
    (when ``use_fill[k]`` is non-zero) or
    ``src<k>[i % val_ary_lengths[k]]``.

    :arg context: forwarded to ``get_elwise_kernel``; its first device is
        handed to ``dtype_to_c_struct`` to produce the preamble.
    :arg dtype: dtype of the ``dest<k>``/``src<k>`` vectors.
    :arg idx_dtype: dtype of ``gmem_dest_idx``.
    :arg vec_count: number of dest/src vector pairs.
    :returns: the result of ``get_elwise_kernel``.
    """
    c_types = {
        "idx_tp": dtype_to_ctype(idx_dtype),
        "tp": dtype_to_ctype(dtype),
    }
    vec_ids = list(range(vec_count))

    # Argument order: dests, index vector, srcs, use_fill, val_ary_lengths.
    args = [VectorArg(dtype, "dest%d" % k, with_offset=True) for k in vec_ids]
    args.append(VectorArg(idx_dtype, "gmem_dest_idx", with_offset=True))
    args.extend(VectorArg(dtype, "src%d" % k, with_offset=True) for k in vec_ids)
    args.append(VectorArg(np.uint8, "use_fill", with_offset=True))
    args.append(VectorArg(np.int64, "val_ary_lengths", with_offset=True))

    store_template = (
        "dest{i}[dest_idx] = (use_fill[{i}] ? src{i}[0] : "
        "src{i}[i % val_ary_lengths[{i}]]);"
    )
    stores = [store_template.format(i=k) for k in vec_ids]

    body = "%(idx_tp)s dest_idx = gmem_dest_idx[i];\n" % c_types + "\n".join(stores)

    preamble = dtype_to_c_struct(context.devices[0], dtype)
    return get_elwise_kernel(context, args, body, preamble=preamble, name="put")
예제 #6
0
def get_multiply_kernel(context, dtype_x, dtype_y, dtype_z):
    """Build the "multiply" elementwise kernel computing ``z = x * y``.

    Complex operands whose dtype differs from *dtype_z* are first wrapped in
    ``<z name>_cast``. The product uses ``_mul`` (both complex), ``_mulr``
    (complex ``x``, real ``y``), ``_rmul`` (real ``x``, complex ``y``) or the
    plain ``*`` operator (both real).

    :arg context: forwarded to ``get_elwise_kernel``.
    :arg dtype_x: dtype of ``x``.
    :arg dtype_y: dtype of ``y``.
    :arg dtype_z: dtype of the output ``z``.
    :returns: the result of ``get_elwise_kernel``.
    """
    x_is_complex = dtype_x.kind == "c"
    y_is_complex = dtype_y.kind == "c"

    lhs, rhs = "x[i]", "y[i]"

    if x_is_complex and dtype_x != dtype_z:
        lhs = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), lhs)
    if y_is_complex and dtype_y != dtype_z:
        rhs = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), rhs)

    if x_is_complex:
        if y_is_complex:
            product = "%s_mul(%s, %s)" % (complex_dtype_to_name(dtype_z), lhs, rhs)
        else:
            product = "%s_mulr(%s, %s)" % (complex_dtype_to_name(dtype_z), lhs, rhs)
    elif y_is_complex:
        product = "%s_rmul(%s, %s)" % (complex_dtype_to_name(dtype_z), lhs, rhs)
    else:
        product = "%s * %s" % (lhs, rhs)

    type_names = {
        "tp_x": dtype_to_ctype(dtype_x),
        "tp_y": dtype_to_ctype(dtype_y),
        "tp_z": dtype_to_ctype(dtype_z),
    }
    signature = "%(tp_z)s *z, %(tp_x)s *x, %(tp_y)s *y" % type_names
    return get_elwise_kernel(context, signature, "z[i] = %s" % product,
            name="multiply")
예제 #7
0
def get_divide_kernel(context, dtype_x, dtype_y, dtype_z):
    """Build the "divide" elementwise kernel computing ``z = x / y``.

    When the output is complex and the operand dtypes differ, complex
    operands not already of *dtype_z* are wrapped in ``<z name>_cast``. The
    quotient uses ``_divide`` (both complex), ``_rdivide`` (real ``x``,
    complex ``y``), ``_divider`` (complex ``x``, real ``y``) or plain ``/``;
    a complex result is finally cast to *dtype_z*.

    :arg context: forwarded to ``get_elwise_kernel``.
    :arg dtype_x: dtype of the numerator ``x``.
    :arg dtype_y: dtype of the denominator ``y``.
    :arg dtype_z: dtype of the output ``z``.
    :returns: the result of ``get_elwise_kernel``.
    """
    x_is_complex = dtype_x.kind == "c"
    y_is_complex = dtype_y.kind == "c"
    z_is_complex = dtype_z.kind == "c"

    num, den = "x[i]", "y[i]"

    if z_is_complex and dtype_x != dtype_y:
        if x_is_complex and dtype_x != dtype_z:
            num = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), num)
        if y_is_complex and dtype_y != dtype_z:
            den = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), den)

    if x_is_complex:
        if y_is_complex:
            quotient = "%s_divide(%s, %s)" % (complex_dtype_to_name(dtype_z), num, den)
        else:
            quotient = "%s_divider(%s, %s)" % (complex_dtype_to_name(dtype_z), num, den)
    elif y_is_complex:
        quotient = "%s_rdivide(%s, %s)" % (complex_dtype_to_name(dtype_z), num, den)
    else:
        quotient = "%s / %s" % (num, den)

    if z_is_complex:
        quotient = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), quotient)

    type_names = {
        "tp_x": dtype_to_ctype(dtype_x),
        "tp_y": dtype_to_ctype(dtype_y),
        "tp_z": dtype_to_ctype(dtype_z),
    }
    signature = "%(tp_z)s *z, %(tp_x)s *x, %(tp_y)s *y" % type_names
    return get_elwise_kernel(context, signature, "z[i] = %s" % quotient,
            name="divide")
예제 #8
0
def get_take_put_kernel(context, dtype, idx_dtype, with_offsets, vec_count=1):
    """Build the elementwise kernel named "take_put".

    The generated body loads ``gmem_src_idx[i]`` and ``gmem_dest_idx[i]`` and
    then, per vector pair ``k``, assigns ``src<k>[src_idx]`` (or
    ``src<k>[src_idx+offset<k>]`` when *with_offsets* is true) to
    ``dest<k>[dest_idx]``.

    :arg context: forwarded to ``get_elwise_kernel``.
    :arg dtype: dtype of the ``dest<k>``/``src<k>`` vectors.
    :arg idx_dtype: dtype of both index vectors and of the scalar offsets.
    :arg with_offsets: if true, one ``offset<k>`` scalar argument per vector
        pair is appended to the argument list.
    :arg vec_count: number of dest/src vector pairs.
    :returns: the result of ``get_elwise_kernel``.
    """
    c_types = {
        "idx_tp": dtype_to_ctype(idx_dtype),
        "tp": dtype_to_ctype(dtype),
    }
    vec_ids = list(range(vec_count))

    # Argument order: dests, dest/src index vectors, srcs, optional offsets.
    args = [VectorArg(dtype, "dest%d" % k) for k in vec_ids]
    args.append(VectorArg(idx_dtype, "gmem_dest_idx"))
    args.append(VectorArg(idx_dtype, "gmem_src_idx"))
    args.extend(VectorArg(dtype, "src%d" % k) for k in vec_ids)
    if with_offsets:
        args.extend(ScalarArg(idx_dtype, "offset%d" % k) for k in vec_ids)

    if with_offsets:
        copy_lines = [
            "dest%d[dest_idx] = src%d[src_idx+offset%d];" % (k, k, k)
            for k in vec_ids
        ]
    else:
        copy_lines = [
            "dest%d[dest_idx] = src%d[src_idx];" % (k, k)
            for k in vec_ids
        ]

    index_loads = ("%(idx_tp)s src_idx = gmem_src_idx[i];\n"
                   "%(idx_tp)s dest_idx = gmem_dest_idx[i];\n" % c_types)
    body = index_loads + "\n".join(copy_lines)

    return get_elwise_kernel(context, args, body, name="take_put")
예제 #9
0
def get_rdivide_elwise_kernel(context, dtype_x, dtype_y, dtype_z):
    """Build the "divide_r" elementwise kernel computing ``z = y / x``.

    Here ``x`` is a vector and ``y`` a scalar, so the scalar is the
    numerator. When the output is complex and the operand dtypes differ,
    complex operands not already of *dtype_z* are wrapped in
    ``<z name>_cast``. The quotient uses ``_divide`` (both complex),
    ``_rdivide`` (real ``y``, complex ``x``), ``_divider`` (complex ``y``,
    real ``x``) or plain ``/``.

    :arg context: forwarded to ``get_elwise_kernel``.
    :arg dtype_x: dtype of the vector ``x`` (the denominator).
    :arg dtype_y: dtype of the scalar ``y`` (the numerator).
    :arg dtype_z: dtype of the output ``z``.
    :returns: the result of ``get_elwise_kernel``.
    """
    x_is_complex = dtype_x.kind == "c"
    y_is_complex = dtype_y.kind == "c"
    z_is_complex = dtype_z.kind == "c"

    den, num = "x[i]", "y"

    if z_is_complex and dtype_x != dtype_y:
        if x_is_complex and dtype_x != dtype_z:
            den = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), den)
        if y_is_complex and dtype_y != dtype_z:
            num = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), num)

    if x_is_complex:
        if y_is_complex:
            quotient = "%s_divide(%s, %s)" % (complex_dtype_to_name(dtype_z), num, den)
        else:
            quotient = "%s_rdivide(%s, %s)" % (complex_dtype_to_name(dtype_z), num, den)
    elif y_is_complex:
        quotient = "%s_divider(%s, %s)" % (complex_dtype_to_name(dtype_z), num, den)
    else:
        quotient = "%s / %s" % (num, den)

    type_names = {
        "tp_x": dtype_to_ctype(dtype_x),
        "tp_y": dtype_to_ctype(dtype_y),
        "tp_z": dtype_to_ctype(dtype_z),
    }
    signature = "%(tp_z)s *z, %(tp_x)s *x, %(tp_y)s y" % type_names
    return get_elwise_kernel(context, signature, "z[i] = %s" % quotient,
            name="divide_r")
예제 #10
0
def get_axpbyz_kernel(context, dtype_x, dtype_y, dtype_z):
    """Build the "axpbyz" elementwise kernel computing ``z = a*x + b*y``.

    Complex products are spelled with ``<name>_mul``; when exactly one
    operand is complex the other (real) product is wrapped in
    ``<name>_fromreal``. The two terms are joined with ``+`` and, for a
    complex *dtype_z*, the sum is wrapped in ``<z name>_cast``.

    :arg context: forwarded to ``get_elwise_kernel``.
    :arg dtype_x: dtype of ``x`` and of the scalar ``a``.
    :arg dtype_y: dtype of ``y`` and of the scalar ``b``.
    :arg dtype_z: dtype of the output ``z``.
    :returns: the result of ``get_elwise_kernel``.
    """
    x_is_complex = dtype_x.kind == "c"
    y_is_complex = dtype_y.kind == "c"
    z_is_complex = dtype_z.kind == "c"

    if x_is_complex:
        x_name = complex_dtype_to_name(dtype_x)
        ax = "%s_mul(a, x[i])" % x_name
    else:
        ax = "a*x[i]"

    if y_is_complex:
        y_name = complex_dtype_to_name(dtype_y)
        by = "%s_mul(b, y[i])" % y_name
    else:
        by = "b*y[i]"

    # Mixed real/complex: lift the real product into the other operand's
    # complex type.
    if x_is_complex and not y_is_complex:
        by = "%s_fromreal(%s)" % (x_name, by)
    elif y_is_complex and not x_is_complex:
        ax = "%s_fromreal(%s)" % (y_name, ax)

    result = "%s + %s" % (ax, by)
    if z_is_complex:
        result = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), result)

    type_names = {
        "tp_x": dtype_to_ctype(dtype_x),
        "tp_y": dtype_to_ctype(dtype_y),
        "tp_z": dtype_to_ctype(dtype_z),
    }
    signature = (
        "%(tp_z)s *z, %(tp_x)s a, %(tp_x)s *x, %(tp_y)s b, %(tp_y)s *y"
        % type_names
    )
    return get_elwise_kernel(context, signature, "z[i] = %s" % result,
            name="axpbyz")
예제 #11
0
파일: elementwise.py 프로젝트: lichinka/cai
def get_copy_kernel(context, dtype_dest, dtype_src):
    """Build the "copy" elementwise kernel (``dest[i] = src[i]``).

    :arg context: forwarded to ``get_elwise_kernel``.
    :arg dtype_dest: dtype of the ``dest`` vector.
    :arg dtype_src: dtype of the ``src`` vector.
    :returns: the result of ``get_elwise_kernel``.
    """
    type_names = {
        "tp_dest": dtype_to_ctype(dtype_dest),
        "tp_src": dtype_to_ctype(dtype_src),
    }
    signature = "%(tp_dest)s *dest, %(tp_src)s *src" % type_names
    return get_elwise_kernel(context, signature, "dest[i] = src[i]", name="copy")
예제 #12
0
def get_axpbz_kernel(context, dtype_a, dtype_x, dtype_b, dtype_z):
    """Build the "axpb" elementwise kernel computing ``z = a*x + b``.

    The product ``a*x`` uses ``_mul`` (both complex), ``_rmul`` (real ``a``,
    complex ``x``), ``_mulr`` (complex ``a``, real ``x``) or plain ``*``;
    complex operands not already of *dtype_z* are cast first. For a complex
    output, real terms are lifted with ``_fromreal`` and both terms are cast
    to *dtype_z*. The terms are combined with ``<z name>_add`` if any input
    is complex, otherwise with ``+``.

    :arg context: forwarded to ``get_elwise_kernel``.
    :arg dtype_a: dtype of the scalar ``a``.
    :arg dtype_x: dtype of the vector ``x``.
    :arg dtype_b: dtype of the scalar ``b``.
    :arg dtype_z: dtype of the output ``z``.
    :returns: the result of ``get_elwise_kernel``.
    """
    a_is_complex = dtype_a.kind == "c"
    x_is_complex = dtype_x.kind == "c"
    b_is_complex = dtype_b.kind == "c"
    z_is_complex = dtype_z.kind == "c"

    a_expr = "a"
    x_expr = "x[i]"

    if x_is_complex:
        if dtype_x != dtype_z:
            x_expr = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), x_expr)

        if a_is_complex:
            if dtype_a != dtype_z:
                a_expr = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), a_expr)
            ax = "%s_mul(%s, %s)" % (complex_dtype_to_name(dtype_z), a_expr, x_expr)
        else:
            ax = "%s_rmul(%s, %s)" % (complex_dtype_to_name(dtype_z), a_expr, x_expr)
    elif a_is_complex:
        if dtype_a != dtype_z:
            a_expr = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), a_expr)
        ax = "%s_mulr(%s, %s)" % (complex_dtype_to_name(dtype_z), a_expr, x_expr)
    else:
        ax = "a*x[i]"

    b_expr = "b"
    if z_is_complex:
        # Lift real terms into the complex output type, then cast both.
        if not b_is_complex:
            b_expr = "%s_fromreal(%s)" % (complex_dtype_to_name(dtype_z), b_expr)
        if not (a_is_complex or x_is_complex):
            ax = "%s_fromreal(%s)" % (complex_dtype_to_name(dtype_z), ax)

        ax = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), ax)
        b_expr = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), b_expr)

    any_complex_input = a_is_complex or x_is_complex or b_is_complex
    if any_complex_input:
        expr = "{root}_add({ax}, {b})".format(
                ax=ax,
                b=b_expr,
                root=complex_dtype_to_name(dtype_z))
    else:
        expr = "%s + %s" % (ax, b_expr)

    type_names = {
        "tp_a": dtype_to_ctype(dtype_a),
        "tp_x": dtype_to_ctype(dtype_x),
        "tp_b": dtype_to_ctype(dtype_b),
        "tp_z": dtype_to_ctype(dtype_z),
    }
    signature = "%(tp_z)s *z, %(tp_a)s a, %(tp_x)s *x,%(tp_b)s b" % type_names
    return get_elwise_kernel(context, signature, "z[i] = " + expr, name="axpb")
예제 #13
0
def get_dot_kernel(ctx, dtype_out, dtype_a=None, dtype_b=None):
    """Build a reduction kernel summing ``a[i] * b[i]`` over all ``i``.

    Missing dtypes are filled in first: *dtype_a* defaults to *dtype_out*,
    *dtype_b* defaults to *dtype_a*, and *dtype_out* defaults to the common
    dtype of *dtype_a* and *dtype_b* (taking the double-precision support of
    ``ctx.devices[0]`` into account). At least one of *dtype_out* and
    *dtype_a* must therefore be given.

    When both inputs are complex the product uses ``<out name>_mul``, with
    ``<out name>_cast`` applied to operands of a different dtype. Otherwise a
    plain ``*`` is used; for a complex output, complex operands are cast with
    ``<out name>_cast`` and real operands with a C cast to the output's real
    component type.

    :arg ctx: context handed to ``ReductionKernel``.
    :arg dtype_out: result dtype, or None to derive it from the inputs.
    :arg dtype_a: dtype of ``a``, or None to use *dtype_out*.
    :arg dtype_b: dtype of ``b``, or None to use *dtype_a*.
    :returns: a ``ReductionKernel`` with neutral ``"0"`` and reduce
        expression ``"a+b"``.
    """
    # The signature advertises dtype_a=None, but the code below dereferences
    # dtype_a unconditionally; fall back to dtype_out so the default works.
    if dtype_a is None:
        dtype_a = dtype_out

    if dtype_b is None:
        dtype_b = dtype_a

    if dtype_out is None:
        from pyopencl.compyte.array import get_common_dtype
        from pyopencl.characterize import has_double_support
        dtype_out = get_common_dtype(
                dtype_a.type(0), dtype_b.type(0), has_double_support(ctx.devices[0]))

    a_real_dtype = dtype_a.type(0).real.dtype
    b_real_dtype = dtype_b.type(0).real.dtype
    out_real_dtype = dtype_out.type(0).real.dtype

    a_is_complex = dtype_a.kind == "c"
    b_is_complex = dtype_b.kind == "c"
    out_is_complex = dtype_out.kind == "c"

    from pyopencl.elementwise import complex_dtype_to_name

    a = "a[i]"
    b = "b[i]"

    if a_is_complex and b_is_complex:
        if dtype_a != dtype_out:
            a = "%s_cast(%s)" % (complex_dtype_to_name(dtype_out), a)
        if dtype_b != dtype_out:
            b = "%s_cast(%s)" % (complex_dtype_to_name(dtype_out), b)

        map_expr = "%s_mul(%s, %s)" % (
                complex_dtype_to_name(dtype_out), a, b)
    else:
        if out_is_complex:
            if a_is_complex and dtype_a != dtype_out:
                a = "%s_cast(%s)" % (complex_dtype_to_name(dtype_out), a)
            if b_is_complex and dtype_b != dtype_out:
                b = "%s_cast(%s)" % (complex_dtype_to_name(dtype_out), b)

            # Real operands are widened/narrowed to the real component type
            # of the complex output.
            if not a_is_complex and a_real_dtype != out_real_dtype:
                a = "(%s) (%s)" % (dtype_to_ctype(out_real_dtype), a)
            if not b_is_complex and b_real_dtype != out_real_dtype:
                b = "(%s) (%s)" % (dtype_to_ctype(out_real_dtype), b)

        map_expr = "%s*%s" % (a, b)

    return ReductionKernel(ctx, dtype_out, neutral="0",
            reduce_expr="a+b", map_expr=map_expr,
            arguments=
            "__global const %(tp_a)s *a, "
            "__global const %(tp_b)s *b" % {
                "tp_a": dtype_to_ctype(dtype_a),
                "tp_b": dtype_to_ctype(dtype_b),
                })
예제 #14
0
파일: elementwise.py 프로젝트: lichinka/cai
def get_multiply_kernel(context, dtype_x, dtype_y, dtype_z):
    """Build the "multiply" elementwise kernel (``z[i] = x[i] * y[i]``).

    :arg context: forwarded to ``get_elwise_kernel``.
    :arg dtype_x: dtype of ``x``.
    :arg dtype_y: dtype of ``y``.
    :arg dtype_z: dtype of the output ``z``.
    :returns: the result of ``get_elwise_kernel``.
    """
    type_names = {
        "tp_x": dtype_to_ctype(dtype_x),
        "tp_y": dtype_to_ctype(dtype_y),
        "tp_z": dtype_to_ctype(dtype_z),
    }
    signature = "%(tp_z)s *z, %(tp_x)s *x, %(tp_y)s *y" % type_names
    return get_elwise_kernel(context, signature, "z[i] = x[i] * y[i]",
            name="multiply")
예제 #15
0
파일: elementwise.py 프로젝트: lichinka/cai
def get_divide_kernel(context, dtype_x, dtype_y, dtype_z):
    """Build the "divide" elementwise kernel (``z[i] = x[i] / y[i]``).

    :arg context: forwarded to ``get_elwise_kernel``.
    :arg dtype_x: dtype of the numerator ``x``.
    :arg dtype_y: dtype of the denominator ``y``.
    :arg dtype_z: dtype of the output ``z``.
    :returns: the result of ``get_elwise_kernel``.
    """
    type_names = {
        "tp_x": dtype_to_ctype(dtype_x),
        "tp_y": dtype_to_ctype(dtype_y),
        "tp_z": dtype_to_ctype(dtype_z),
    }
    signature = "%(tp_z)s *z, %(tp_x)s *x, %(tp_y)s *y" % type_names
    return get_elwise_kernel(context, signature, "z[i] = x[i] / y[i]",
            name="divide")
예제 #16
0
파일: elementwise.py 프로젝트: lichinka/cai
def get_axpbyz_kernel(context, dtype_x, dtype_y, dtype_z):
    """Build the "axpbyz" elementwise kernel (``z[i] = a*x[i] + b*y[i]``).

    :arg context: forwarded to ``get_elwise_kernel``.
    :arg dtype_x: dtype of ``x`` and of the scalar ``a``.
    :arg dtype_y: dtype of ``y`` and of the scalar ``b``.
    :arg dtype_z: dtype of the output ``z``.
    :returns: the result of ``get_elwise_kernel``.
    """
    type_names = {
        "tp_x": dtype_to_ctype(dtype_x),
        "tp_y": dtype_to_ctype(dtype_y),
        "tp_z": dtype_to_ctype(dtype_z),
    }
    signature = (
        "%(tp_z)s *z, %(tp_x)s a, %(tp_x)s *x, %(tp_y)s b, %(tp_y)s *y"
        % type_names
    )
    return get_elwise_kernel(context, signature, "z[i] = a*x[i] + b*y[i]",
            name="axpbyz")
예제 #17
0
파일: elementwise.py 프로젝트: lichinka/cai
def get_pow_array_kernel(context, dtype_x, dtype_y, dtype_z):
    """Build the "pow_method" elementwise kernel (``z[i] = pow(x[i], y[i])``).

    :arg context: forwarded to ``get_elwise_kernel``.
    :arg dtype_x: dtype of the base vector ``x``.
    :arg dtype_y: dtype of the exponent vector ``y``.
    :arg dtype_z: dtype of the output ``z``.
    :returns: the result of ``get_elwise_kernel``.
    """
    type_names = {
        "tp_x": dtype_to_ctype(dtype_x),
        "tp_y": dtype_to_ctype(dtype_y),
        "tp_z": dtype_to_ctype(dtype_z),
    }
    signature = "%(tp_z)s *z, %(tp_x)s *x, %(tp_y)s *y" % type_names
    return get_elwise_kernel(context, signature, "z[i] = pow(x[i], y[i])",
            name="pow_method")
예제 #18
0
def get_unary_func_kernel(context, func_name, in_dtype, out_dtype=None):
    """Build an elementwise kernel applying ``func_name`` to every element.

    The generated operation is ``z[i] = <func_name>(y[i])`` and the kernel
    is named ``<func_name>_kernel``.

    :arg context: forwarded to ``get_elwise_kernel``.
    :arg func_name: name of the function spliced into the kernel source.
    :arg in_dtype: dtype of the input vector ``y``.
    :arg out_dtype: dtype of the output vector ``z``; defaults to *in_dtype*.
    :returns: the result of ``get_elwise_kernel``.
    """
    result_dtype = in_dtype if out_dtype is None else out_dtype

    type_names = {
        "tp_in": dtype_to_ctype(in_dtype),
        "tp_out": dtype_to_ctype(result_dtype),
    }
    signature = "%(tp_out)s *z, %(tp_in)s *y" % type_names
    operation = "z[i] = %s(y[i])" % func_name
    return get_elwise_kernel(context, signature, operation,
            name="%s_kernel" % func_name)
예제 #19
0
def get_copy_kernel(context, dtype_dest, dtype_src):
    """Build the "copy" elementwise kernel (``dest[i] = src[i]``).

    When the destination is complex and the source is not, the source
    element is wrapped in ``<dest name>_fromreal``.

    :arg context: forwarded to ``get_elwise_kernel``.
    :arg dtype_dest: dtype of the ``dest`` vector.
    :arg dtype_src: dtype of the ``src`` vector.
    :returns: the result of ``get_elwise_kernel``.
    """
    needs_promotion = dtype_dest.kind == "c" and dtype_src.kind != "c"
    if needs_promotion:
        src_expr = "%s_fromreal(%s)" % (complex_dtype_to_name(dtype_dest), "src[i]")
    else:
        src_expr = "src[i]"

    type_names = {
        "tp_dest": dtype_to_ctype(dtype_dest),
        "tp_src": dtype_to_ctype(dtype_src),
    }
    signature = "%(tp_dest)s *dest, %(tp_src)s *src" % type_names
    return get_elwise_kernel(context, signature, "dest[i] = %s" % src_expr,
            name="copy")
예제 #20
0
def get_norm_kernel(dtype_x, dtype_out):
    """Build the "normsq" elementwise kernel computing ``z = |x|^2``.

    For float32/float64 input the operation is ``x[i] * x[i]``; for
    complex64/complex128 input it is the sum of the squared ``.x`` and
    ``.y`` components.

    :arg dtype_x: dtype of the input vector ``x``.
    :arg dtype_out: dtype of the output vector ``z``.
    :returns: an ``ElementwiseKernel`` created on ``mgr.state.context``.
    :raises TypeError: if *dtype_x* is not one of float32, float64,
        complex64 or complex128.
    """
    if dtype_x == np.float32 or dtype_x == np.float64:
        op = "z[i] = x[i] * x[i]"
    elif dtype_x == np.complex64 or dtype_x == np.complex128:
        op = "z[i] = x[i].x*x[i].x + x[i].y*x[i].y"
    else:
        # Without this branch ``op`` would be unbound and the call below
        # would fail with an unrelated-looking UnboundLocalError.
        raise TypeError(
            "get_norm_kernel: unsupported input dtype %r "
            "(expected float32, float64, complex64 or complex128)" % (dtype_x,))

    return ElementwiseKernel(mgr.state.context,
            "%(tp_x)s *x, %(tp_z)s *z" % {
                "tp_x": dtype_to_ctype(dtype_x),
                "tp_z": dtype_to_ctype(dtype_out),
                },
            op,
            "normsq")
예제 #21
0
def get_take_kernel(context, dtype, idx_dtype, vec_count=1):
    """Build the elementwise kernel named "take".

    The generated body loads ``src_idx = idx[i]`` and, for each vector pair
    ``k``, assigns ``src<k>[src_idx]`` to ``dest<k>[i]``.

    :arg context: forwarded to ``get_elwise_kernel``; its first device is
        handed to ``dtype_to_c_struct`` to produce the preamble.
    :arg dtype: dtype of the ``dest<k>``/``src<k>`` vectors.
    :arg idx_dtype: dtype of the ``idx`` vector.
    :arg vec_count: number of dest/src vector pairs.
    :returns: the result of ``get_elwise_kernel``.
    """
    c_types = {"idx_tp": dtype_to_ctype(idx_dtype), "tp": dtype_to_ctype(dtype)}
    vec_ids = list(range(vec_count))

    # Argument order: dests, srcs, then the index vector.
    args = [VectorArg(dtype, "dest" + str(k), with_offset=True) for k in vec_ids]
    args.extend(VectorArg(dtype, "src" + str(k), with_offset=True) for k in vec_ids)
    args.append(VectorArg(idx_dtype, "idx", with_offset=True))

    gathers = ["dest%d[i] = src%d[src_idx];" % (k, k) for k in vec_ids]
    body = ("%(idx_tp)s src_idx = idx[i];\n" % c_types) + "\n".join(gathers)

    preamble = dtype_to_c_struct(context.devices[0], dtype)
    return get_elwise_kernel(context, args, body, preamble=preamble, name="take")
예제 #22
0
def get_correlate_kernel(dtype_x, dtype_y,dtype_out):
    """Build the "correlate" elementwise kernel (``z = conj(x) * y``).

    The complex helper family is chosen from *dtype_x*: ``cfloat_*`` for
    complex64 and ``cdouble_*`` for complex128.

    :arg dtype_x: dtype of the input vector ``x`` (complex64/complex128).
    :arg dtype_y: dtype of the input vector ``y``.
    :arg dtype_out: dtype of the output vector ``z``.
    :returns: an ``ElementwiseKernel`` created on ``mgr.state.context``.
    :raises TypeError: if *dtype_x* is neither complex64 nor complex128.
    """
    if dtype_x == numpy.complex64:
        op = "z[i] = cfloat_mul(cfloat_conj(x[i]), y[i])"
    elif dtype_x == numpy.complex128:
        op = "z[i] = cdouble_mul(cdouble_conj(x[i]), y[i])"
    else:
        # Without this branch ``op`` would be unbound and the call below
        # would fail with an unrelated-looking UnboundLocalError.
        raise TypeError(
            "get_correlate_kernel: unsupported input dtype %r "
            "(expected complex64 or complex128)" % (dtype_x,))

    return ElementwiseKernel(mgr.state.context,
            "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z" % {
                "tp_x": dtype_to_ctype(dtype_x),
                "tp_y": dtype_to_ctype(dtype_y),
                "tp_z": dtype_to_ctype(dtype_out),
                },
            op,
            "correlate")
예제 #23
0
def get_put_kernel(context, dtype, idx_dtype, vec_count=1):
    """Build the elementwise kernel named "put".

    The generated body loads ``dest_idx = gmem_dest_idx[i]`` and, for each
    vector pair ``k``, assigns ``src<k>[i]`` to ``dest<k>[dest_idx]``.

    :arg context: forwarded to ``get_elwise_kernel``; its first device is
        handed to ``dtype_to_c_struct`` to produce the preamble.
    :arg dtype: dtype of the ``dest<k>``/``src<k>`` vectors.
    :arg idx_dtype: dtype of ``gmem_dest_idx``.
    :arg vec_count: number of dest/src vector pairs.
    :returns: the result of ``get_elwise_kernel``.
    """
    c_types = {"idx_tp": dtype_to_ctype(idx_dtype), "tp": dtype_to_ctype(dtype)}
    vec_ids = list(range(vec_count))

    # Argument order: dests, the index vector, then srcs.
    args = [VectorArg(dtype, "dest%d" % k, with_offset=True) for k in vec_ids]
    args.append(VectorArg(idx_dtype, "gmem_dest_idx", with_offset=True))
    args.extend(VectorArg(dtype, "src%d" % k, with_offset=True) for k in vec_ids)

    scatters = ["dest%d[dest_idx] = src%d[i];" % (k, k) for k in vec_ids]
    body = "%(idx_tp)s dest_idx = gmem_dest_idx[i];\n" % c_types + "\n".join(scatters)

    preamble = dtype_to_c_struct(context.devices[0], dtype)
    return get_elwise_kernel(context, args, body, preamble=preamble, name="put")
예제 #24
0
def get_dot_kernel(ctx, dtype_out, dtype_a=None, dtype_b=None,
        conjugate_first=False):
    """Build a reduction kernel summing the products of ``a`` and ``b``.

    The map expression and the resolved *dtype_out*/*dtype_b* come from
    ``_get_dot_expr``; the double-precision capability of
    ``ctx.devices[0]`` is passed along for dtype resolution.

    :arg ctx: context handed to ``ReductionKernel``.
    :arg dtype_out: result dtype (may be None if *dtype_a* is given).
    :arg dtype_a: dtype of ``a``; None means "same as *dtype_out*".
    :arg dtype_b: dtype of ``b``; resolved by ``_get_dot_expr`` when None.
    :arg conjugate_first: forwarded to ``_get_dot_expr``.
    :returns: a ``ReductionKernel`` with neutral ``"0"`` and reduce
        expression ``"a+b"``.
    """
    from pyopencl.characterize import has_double_support

    # dtype_a is advertised as optional, yet it is passed to
    # dtype_to_ctype() below and dereferenced inside _get_dot_expr; resolve
    # the default here so a None never reaches either of them.
    if dtype_a is None:
        dtype_a = dtype_out

    map_expr, dtype_out, dtype_b = _get_dot_expr(
            dtype_out, dtype_a, dtype_b, conjugate_first,
            has_double_support=has_double_support(ctx.devices[0]))

    return ReductionKernel(ctx, dtype_out, neutral="0",
            reduce_expr="a+b", map_expr=map_expr,
            arguments=
            "const %(tp_a)s *a, "
            "const %(tp_b)s *b" % {
                "tp_a": dtype_to_ctype(dtype_a),
                "tp_b": dtype_to_ctype(dtype_b),
                })
예제 #25
0
def get_pow_kernel(context, dtype_x, dtype_y, dtype_z,
        is_base_array, is_exp_array):
    """Build the "pow_method" elementwise kernel computing ``z = x ** y``.

    Base and exponent may each be a vector (indexed with ``[i]``) or a
    scalar. When the operand dtypes differ, operands are converted to
    *dtype_z*: complex operands through ``<z name>_cast`` for a complex
    output, otherwise through a C cast. The power uses ``_pow`` (both
    complex), ``_powr`` (complex base, real exponent), ``_rpow`` (real
    base, complex exponent) or the plain ``pow`` function.

    :arg context: forwarded to ``get_elwise_kernel``.
    :arg dtype_x: dtype of the base ``x``.
    :arg dtype_y: dtype of the exponent ``y``.
    :arg dtype_z: dtype of the output ``z``.
    :arg is_base_array: whether ``x`` is a vector rather than a scalar.
    :arg is_exp_array: whether ``y`` is a vector rather than a scalar.
    :returns: the result of ``get_elwise_kernel``.
    """
    if is_base_array:
        base, base_decl = "x[i]", "%(tp_x)s *x"
    else:
        base, base_decl = "x", "%(tp_x)s x"

    if is_exp_array:
        exponent, exp_decl = "y[i]", "%(tp_y)s *y"
    else:
        exponent, exp_decl = "y", "%(tp_y)s y"

    x_is_complex = dtype_x.kind == "c"
    y_is_complex = dtype_y.kind == "c"
    z_is_complex = dtype_z.kind == "c"

    # Conversions are only needed when the two operand dtypes disagree.
    if dtype_x != dtype_y:
        if z_is_complex:
            if x_is_complex and dtype_x != dtype_z:
                base = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), base)
            if y_is_complex and dtype_y != dtype_z:
                exponent = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), exponent)
        else:
            if dtype_x != dtype_z:
                base = "(%s) (%s)" % (dtype_to_ctype(dtype_z), base)
            if dtype_y != dtype_z:
                exponent = "(%s) (%s)" % (dtype_to_ctype(dtype_z), exponent)

    if x_is_complex:
        if y_is_complex:
            result = "%s_pow(%s, %s)" % (complex_dtype_to_name(dtype_z), base, exponent)
        else:
            result = "%s_powr(%s, %s)" % (complex_dtype_to_name(dtype_z), base, exponent)
    elif y_is_complex:
        result = "%s_rpow(%s, %s)" % (complex_dtype_to_name(dtype_z), base, exponent)
    else:
        result = "pow(%s, %s)" % (base, exponent)

    type_names = {
        "tp_x": dtype_to_ctype(dtype_x),
        "tp_y": dtype_to_ctype(dtype_y),
        "tp_z": dtype_to_ctype(dtype_z),
    }
    signature = ("%(tp_z)s *z, " + base_decl + ", "+exp_decl) % type_names
    return get_elwise_kernel(context, signature, "z[i] = %s" % result,
            name="pow_method")
예제 #26
0
 def _fill_array_with_index_knl(self, context, idx_dtype, array_dtype):
     """Build the "fill_array_with_index" elementwise kernel.

     The kernel takes an ``index`` vector, an ``array`` vector and a scalar
     ``val`` and performs ``array[index[i]] = val``.

     :arg context: passed to ``ElementwiseKernel``.
     :arg idx_dtype: dtype of the ``index`` vector.
     :arg array_dtype: dtype of ``array`` and of ``val``.
     :returns: the constructed ``ElementwiseKernel``.
     """
     arg_decl = Template(r"""
             ${idx_t} *index,
             ${array_t} *array,
             ${array_t} val
         """).render(
             idx_t=dtype_to_ctype(idx_dtype),
             array_t=dtype_to_ctype(array_dtype)
         )
     operation = Template(r"""
             array[index[i]] = val;
         """).render()

     return ElementwiseKernel(
         context, arg_decl, operation, name="fill_array_with_index")
예제 #27
0
def get_fill_kernel(context, dtype):
    """Build the "fill" elementwise kernel (``z[i] = a``).

    :arg context: forwarded to ``get_elwise_kernel``; its first device is
        handed to ``dtype_to_c_struct`` to produce the preamble.
    :arg dtype: dtype of the vector ``z`` and of the scalar ``a``.
    :returns: the result of ``get_elwise_kernel``.
    """
    signature = "{tp} *z, {tp} a".format(tp=dtype_to_ctype(dtype))
    struct_preamble = dtype_to_c_struct(context.devices[0], dtype)
    return get_elwise_kernel(
        context, signature, "z[i] = a", preamble=struct_preamble, name="fill")
예제 #28
0
def get_sum_kernel(ctx, dtype_out, dtype_in):
    """Build a reduction kernel over a single input vector ``in``.

    ``ReductionKernel`` is given ``"0"`` and ``"a+b"`` as its third and
    fourth positional arguments (presumably the neutral element and the
    reduce expression -- confirm against the ``ReductionKernel`` signature).

    :arg ctx: context handed to ``ReductionKernel``.
    :arg dtype_out: result dtype; defaults to *dtype_in* when None.
    :arg dtype_in: dtype of the input vector.
    :returns: the constructed ``ReductionKernel``.
    """
    result_dtype = dtype_in if dtype_out is None else dtype_out
    arguments = "const %(tp)s *in" % {"tp": dtype_to_ctype(dtype_in)}
    return ReductionKernel(ctx, result_dtype, "0", "a+b", arguments=arguments)
예제 #29
0
def get_reverse_kernel(context, dtype):
    """Build the "reverse" elementwise kernel (``z[i] = y[n-1-i]``).

    :arg context: forwarded to ``get_elwise_kernel``.
    :arg dtype: dtype shared by the vectors ``z`` and ``y``.
    :returns: the result of ``get_elwise_kernel``.
    """
    signature = "%(tp)s *z, %(tp)s *y" % {"tp": dtype_to_ctype(dtype)}
    return get_elwise_kernel(context, signature, "z[i] = y[n-1-i]",
                             name="reverse")
예제 #30
0
    def get_compress_kernel(self, index_dtype):
        """Build the scan kernel that compacts the non-empty list counts.

        The scan input is ``1`` for every non-zero ``count[i]`` and ``0``
        otherwise, combined with ``a+b``. The output statement writes the
        running total to ``compressed_indices[i + 1]``, records the index
        and count of each entry where the total changed, and stores the
        final total in ``*num_non_empty_list``.

        :arg index_dtype: dtype used for the scan and for all five index
            buffers in the argument list.
        :returns: a ``GenericScanKernel`` built on ``self.context`` for
            ``self.devices``.
        """
        from sys import version_info
        from pyopencl.scan import GenericScanKernel

        arg_source = """
            __global ${index_t} *count,
            __global ${index_t} *compressed_counts,
            __global ${index_t} *nonempty_indices,
            __global ${index_t} *compressed_indices,
            __global ${index_t} *num_non_empty_list
        """
        # Outside Python 3 the template is created with unicode handling
        # switched off.
        if version_info > (3, 0):
            arg_template = Template(arg_source)
        else:
            arg_template = Template(arg_source, disable_unicode=True)
        rendered_args = arg_template.render(index_t=dtype_to_ctype(index_dtype))

        output_statement = """
                    if (i + 1 < N) compressed_indices[i + 1] = item;
                    if (prev_item != item) {
                        nonempty_indices[item - 1] = i;
                        compressed_counts[item - 1] = count[i];
                    }
                    if (i + 1 == N) *num_non_empty_list = item;
                    """

        return GenericScanKernel(
            self.context,
            index_dtype,
            arguments=rendered_args,
            input_expr="count[i] == 0 ? 0 : 1",
            scan_expr="a+b",
            neutral="0",
            output_statement=output_statement,
            devices=self.devices)
예제 #31
0
def get_arange_kernel(context, dtype):
    """Build the "arange" elementwise kernel (``z[i] = start + i*step``).

    :arg context: forwarded to ``get_elwise_kernel``.
    :arg dtype: dtype of the vector ``z`` and of the scalars ``start`` and
        ``step``.
    :returns: the result of ``get_elwise_kernel``.
    """
    signature = "%(tp)s *z, %(tp)s start, %(tp)s step" % {
        "tp": dtype_to_ctype(dtype),
    }
    return get_elwise_kernel(context, signature, "z[i] = start + i*step",
                             name="arange")
예제 #32
0
def python_dtype_str(dtype):
    """Return a string spelling *dtype* as a Python expression.

    If ``dtype.isbuiltin`` is truthy the result is ``_lpy_np.<dtype.name>``;
    otherwise it is a ``_lpy_cl_tools.get_or_register_dtype("<ctype>")``
    call whose argument is the C type name of *dtype*. The ``_lpy_np`` and
    ``_lpy_cl_tools`` names are presumably bound wherever the returned text
    is evaluated -- confirm against the caller.

    :arg dtype: the dtype to spell out.
    :returns: the expression as a string.
    """
    import pyopencl.tools as cl_tools

    if not dtype.isbuiltin:
        ctype_name = cl_tools.dtype_to_ctype(dtype)
        return '_lpy_cl_tools.get_or_register_dtype("%s")' % ctype_name

    return "_lpy_np." + dtype.name
예제 #33
0
def maxpool2d(q, A, f, stride, out=None, indices=None):
    """Run the ``max_pool`` kernel over a 4-d array.

    :arg q: queue used to allocate the result arrays and launch the kernel.
    :arg A: input array; ``A.shape`` is unpacked as ``(n, c, h, w)``.
    :arg f: pooling window size, used for both window dimensions.
    :arg stride: window stride, used for both directions.
    :arg out: optional pre-allocated output array of shape
        ``(n, c, out_h, out_w)``; allocated with ``A.dtype`` when None.
    :arg indices: optional pre-allocated int32 array of the same shape;
        allocated when None.
    :returns: the tuple ``(out, indices)`` after the kernel has finished.
    """
    dtype = dtype_to_ctype(A.dtype)
    n, c, h, w = A.shape
    # Floor division keeps the output extents integral. With ``/`` they
    # become floats on Python 3 and are unusable as array dimensions.
    out_h = (h - f) // stride + 1
    out_w = (w - f) // stride + 1

    if out is None:
        out = clarray.empty(q, (n, c, out_h, out_w), dtype=A.dtype)
    if indices is None:
        indices = clarray.empty(q, (n, c, out_h, out_w), dtype=np.int32)

    # The program source is specialised on the element type, so the cache
    # must be keyed on it too; otherwise a kernel built for one dtype would
    # be reused for arrays of another.
    cache_key = ('max_pool', dtype)
    if cache_key not in _kernel_cache:
        prg = cl.Program(clplatf.ctx, _maxpool_template % {
            'dtype': dtype
        }).build()
        _kernel_cache[cache_key] = prg.max_pool
    krnl = _kernel_cache[cache_key]
    # TODO better global and local dimensions (make divisible by 64 etc.)
    ev = krnl(q, (n * c * out_h * out_w, ), None, A.data, out.data,
              indices.data, np.int32(h), np.int32(w), np.int32(out_h),
              np.int32(out_w), np.int32(f), np.int32(f), np.int32(stride),
              np.int32(stride))

    ev.wait()
    return out, indices
예제 #34
0
def get_fill_kernel(context, dtype):
    """Build the "fill" elementwise kernel (``z[i] = a``).

    :arg context: forwarded to ``get_elwise_kernel``.
    :arg dtype: dtype of the vector ``z`` and of the scalar ``a``.
    :returns: the result of ``get_elwise_kernel``.
    """
    signature = "%(tp)s *z, %(tp)s a" % {"tp": dtype_to_ctype(dtype)}
    return get_elwise_kernel(context, signature, "z[i] = a",
                             name="fill")
예제 #35
0
    def __init__(self, ctx, queue, data, symmetry_modes):
        """Store the inputs, build the program and the difference kernel.

        The program source is read from ``sandpile.cl`` (relative to the
        current working directory) and built with options derived from
        ``_gen_macros(data, symmetry_modes)``. A reduction kernel counting
        the positions where ``grid`` and ``new_grid`` differ is created as
        ``self._diff_krnl``.

        :arg ctx: context used for the program and the reduction kernel.
        :arg queue: stored as ``self._queue``.
        :arg data: stored as ``self.data``; its dtype determines the C type
            of the reduction kernel's arguments.
        :arg symmetry_modes: stored and passed to ``_gen_macros``.
        """
        self._ctx = ctx
        self._queue = queue
        self.symmetry_modes = symmetry_modes
        self.data = data

        ctype = dtype_to_ctype(data.dtype)

        with open('sandpile.cl') as source_file:
            source = source_file.read()
        program = cl.Program(self._ctx, source)

        build_options = _macros_to_options(_gen_macros(data, symmetry_modes))
        self._program = program.build(options=build_options)

        from pyopencl.reduction import ReductionKernel

        diff_arguments = (
            'const __global %s *grid, const __global %s *new_grid'
            % (ctype, ctype))
        self._diff_krnl = ReductionKernel(
            self._ctx,
            numpy.uint32,
            neutral='0',
            reduce_expr='a+b',
            map_expr='grid[i]!=new_grid[i]',
            arguments=diff_arguments)
예제 #36
0
파일: compiled.py 프로젝트: navjotk/loopy
def python_dtype_str(dtype):
    """Return a string spelling *dtype* as a Python expression.

    If ``dtype.isbuiltin`` is truthy the result is ``_lpy_np.<dtype.name>``;
    otherwise it is a ``_lpy_cl_tools.get_or_register_dtype("<ctype>")``
    call whose argument is the C type name of *dtype*. The ``_lpy_np`` and
    ``_lpy_cl_tools`` names are presumably bound wherever the returned text
    is evaluated -- confirm against the caller.

    :arg dtype: the dtype to spell out.
    :returns: the expression as a string.
    """
    import pyopencl.tools as cl_tools

    if not dtype.isbuiltin:
        ctype_name = cl_tools.dtype_to_ctype(dtype)
        return '_lpy_cl_tools.get_or_register_dtype("%s")' % ctype_name

    return "_lpy_np." + dtype.name
예제 #37
0
def get_arange_kernel(context, dtype):
    """Build the "arange" elementwise kernel (``z[i] = start + i*step``).

    :arg context: forwarded to ``get_elwise_kernel``.
    :arg dtype: dtype of the vector ``z`` and of the scalars ``start`` and
        ``step``.
    :returns: the result of ``get_elwise_kernel``.
    """
    signature = "%(tp)s *z, %(tp)s start, %(tp)s step" % {
        "tp": dtype_to_ctype(dtype),
    }
    return get_elwise_kernel(context, signature, "z[i] = start + i*step",
            name="arange")
예제 #38
0
    def get_write_kernel(self, index_dtype):
        """Render, build and return the ``<name_prefix>_write`` kernel.

        For every ``(name, dtype)`` in ``self.list_names_and_dtypes`` a
        ``plb_<name>_list`` vector argument is declared. Lists whose name is
        in ``self.count_sharing`` contribute only that argument; all others
        additionally get a ``plb_<name>_start_index`` kernel argument and a
        ``plb_<name>_index`` pointer passed on to the user code. The source
        is rendered from ``_LIST_BUILDER_TEMPLATE`` with
        ``is_count_stage=False`` and compiled with ``self.options``.

        :arg index_dtype: dtype of the start-index vectors; also appended as
            the last entry of the scalar argument dtypes.
        :returns: the built kernel with its scalar argument dtypes set.
        """
        index_ctype = dtype_to_ctype(index_dtype)
        from pyopencl.tools import VectorArg, OtherArg
        from pyopencl.characterize import has_double_support
        from pyopencl.tools import get_arg_list_scalar_arg_dtypes

        kernel_list_args = []
        user_list_args = []
        arg_value_parts = []

        for name, dtype in self.list_names_and_dtypes:
            list_name = "plb_%s_list" % name
            list_arg = VectorArg(dtype, list_name)

            kernel_list_args.append(list_arg)
            user_list_args.append(list_arg)

            if name in self.count_sharing:
                # Count-sharing lists get no start-index / index arguments.
                arg_value_parts.append("%s, " % list_name)
            else:
                kernel_list_args.append(
                    VectorArg(index_dtype, "plb_%s_start_index" % name))

                index_name = "plb_%s_index" % name
                user_list_args.append(
                    OtherArg("%s *%s" % (index_ctype, index_name), index_name))

                arg_value_parts.append("%s, &%s, " % (list_name, index_name))

        kernel_list_arg_values = "".join(arg_value_parts)
        kernel_name = self.name_prefix + "_write"

        rendered = _LIST_BUILDER_TEMPLATE.render(
            is_count_stage=False,
            kernel_name=kernel_name,
            double_support=all(has_double_support(dev) for dev in self.context.devices),
            debug=self.debug,
            do_not_vectorize=self.do_not_vectorize(),
            kernel_list_arg_decl=_get_arg_decl(kernel_list_args),
            kernel_list_arg_values=kernel_list_arg_values,
            user_list_arg_decl=_get_arg_decl(user_list_args),
            user_list_args=_get_arg_list(user_list_args),
            user_arg_decl=_get_arg_decl(self.arg_decls),
            user_args=_get_arg_list(self.arg_decls),
            list_names_and_dtypes=self.list_names_and_dtypes,
            count_sharing=self.count_sharing,
            name_prefix=self.name_prefix,
            generate_template=self.generate_template,
            preamble=self.preamble,
            index_type=index_ctype,
        )
        source = str(rendered)

        program = cl.Program(self.context, source).build(self.options)
        kernel = getattr(program, kernel_name)

        scalar_dtypes = get_arg_list_scalar_arg_dtypes(
            kernel_list_args + self.arg_decls) + [index_dtype]
        kernel.set_scalar_arg_dtypes(scalar_dtypes)

        return kernel
예제 #39
0
def get_reverse_kernel(context, dtype):
    """Return the elementwise kernel named ``reverse``.

    The generated operation is ``z[i] = y[n-1-i]``: output element ``i`` is
    read from the mirrored position of ``y``.  ``n`` is not declared here;
    presumably :func:`get_elwise_kernel` provides it as the element count
    (confirm against that helper).

    :arg context: passed through to :func:`get_elwise_kernel`.
    :arg dtype: element dtype shared by ``z`` and ``y``.
    """
    elem_ctype = dtype_to_ctype(dtype)
    signature = f"{elem_ctype} *z, {elem_ctype} *y"
    return get_elwise_kernel(
        context, signature, "z[i] = y[n-1-i]", name="reverse")
예제 #40
0
def get_fill_kernel(context, dtype):
    """Return the elementwise kernel named ``fill``.

    The kernel takes a vector ``z`` and a scalar ``a`` of the same C type and
    performs ``z[i] = a``.

    :arg context: passed through to :func:`get_elwise_kernel`.
    :arg dtype: dtype of both the vector elements and the fill value.
    """
    elem_ctype = dtype_to_ctype(dtype)
    return get_elwise_kernel(
        context,
        f"{elem_ctype} *z, {elem_ctype} a",
        "z[i] = a",
        name="fill",
    )
예제 #41
0
def get_linear_combination_kernel(summand_descriptors,
        dtype_z):
    """Placeholder for a kernel computing ``z[i] = a0*x0[i] + a1*x1[i] + ...``.

    This function has not been ported and always raises.  The body it used to
    carry after the ``raise`` was unreachable and relied on CUDA-only
    facilities (``texture<...>`` declarations, ``get_function``,
    ``get_texref`` and ``prepare``), so it has been removed.  For whoever
    picks up the port, that code did the following:

    * each entry of *summand_descriptors* was unpacked as
      ``(is_gpu_scalar, scalar_dtype, vector_dtype)``;
    * when ``is_gpu_scalar`` was true, coefficient ``a<i>`` was fetched from a
      one-element texture in the loop preparation code; otherwise it was
      passed as a scalar kernel argument;
    * every summand contributed a vector argument ``x<i>`` (with offset) and
      the term ``a<i>*x<i>[i]``;
    * the output vector ``z`` (dtype *dtype_z*) and a size argument ``n``
      came last, and the function returned the prepared kernel together with
      the list of texture references.

    :arg summand_descriptors: currently unused.
    :arg dtype_z: currently unused.
    :raises NotImplementedError: always.
    """
    # TODO: Port this!
    raise NotImplementedError(
        "get_linear_combination_kernel has not been ported yet")
예제 #42
0
def get_axpbyz_kernel(context,
                      dtype_x,
                      dtype_y,
                      dtype_z,
                      x_is_scalar=False,
                      y_is_scalar=False):
    """Return the elementwise kernel ``axpbyz`` computing ``z = a*x + b*y``.

    Both coefficients ``a`` and ``b`` are declared with the C type of
    *dtype_z*.

    :arg context: passed through to :func:`get_elwise_kernel`.
    :arg dtype_x: dtype of the ``x`` buffer.
    :arg dtype_y: dtype of the ``y`` buffer.
    :arg dtype_z: dtype of the output buffer and of both coefficients.
    :arg x_is_scalar: if true, every work item reads ``x[0]`` instead of
        ``x[i]``.
    :arg y_is_scalar: same as *x_is_scalar*, for ``y``.
    """
    z_ctype = dtype_to_ctype(dtype_z)

    x_is_complex = dtype_x.kind == "c"
    y_is_complex = dtype_y.kind == "c"

    x_operand = "x[i]" if not x_is_scalar else "x[0]"
    y_operand = "y[i]" if not y_is_scalar else "y[0]"

    if dtype_z.kind != "c":
        # Real-only result: convert each operand to the output type, then
        # scale and add with the plain C operators.
        expr = f"a*(({z_ctype}) {x_operand}) + b*(({z_ctype}) {y_operand})"
    else:
        # Complex result: the coefficients are complex, so use the helper
        # functions named after the output type.
        root = complex_dtype_to_name(dtype_z)

        def scaled(coeff, operand, operand_is_complex):
            if operand_is_complex:
                return f"{root}_mul({coeff}, {root}_cast({operand}))"
            return f"{root}_mulr({coeff}, {operand})"

        first = scaled("a", x_operand, x_is_complex)
        second = scaled("b", y_operand, y_is_complex)
        expr = f"{root}_add({first}, {second})"

    x_ctype = dtype_to_ctype(dtype_x)
    y_ctype = dtype_to_ctype(dtype_y)
    signature = (
        f"{z_ctype} *z, {z_ctype} a, {x_ctype} *x, {z_ctype} b, {y_ctype} *y")

    return get_elwise_kernel(
        context, signature, "z[i] = " + expr, name="axpbyz")
예제 #43
0
def get_fill_kernel(context, dtype):
    """Return the elementwise kernel named ``fill`` (``z[i] = a``).

    The kernel preamble is whatever :func:`dtype_to_c_struct` produces for
    *dtype* on the first device of *context*.

    :arg context: supplies ``devices[0]`` for the preamble and is passed
        through to :func:`get_elwise_kernel`.
    :arg dtype: dtype of both the vector elements and the fill value.
    """
    elem_ctype = dtype_to_ctype(dtype)
    signature = f"{elem_ctype} *z, {elem_ctype} a"
    struct_decl = dtype_to_c_struct(context.devices[0], dtype)
    return get_elwise_kernel(
        context,
        signature,
        "z[i] = a",
        preamble=struct_decl,
        name="fill",
    )
예제 #44
0
def get_take_kernel(context, dtype, idx_dtype, vec_count=1):
    """Return the elementwise kernel ``take``: ``dest<k>[i] = src<k>[idx[i]]``.

    Argument order of the generated kernel is all ``dest`` vectors, then all
    ``src`` vectors, then the single ``idx`` vector; every one is declared
    with an offset.

    :arg context: passed through to :func:`get_elwise_kernel`.
    :arg dtype: element dtype of every ``dest``/``src`` vector.
    :arg idx_dtype: dtype of the index vector.
    :arg vec_count: number of ``dest``/``src`` pairs sharing the one index.
    """
    type_names = dict(
        idx_tp=dtype_to_ctype(idx_dtype),
        tp=dtype_to_ctype(dtype),
    )
    lanes = range(vec_count)

    args = []
    args.extend(
        VectorArg(dtype, f"dest{k}", with_offset=True) for k in lanes)
    args.extend(
        VectorArg(dtype, f"src{k}", with_offset=True) for k in lanes)
    args.append(VectorArg(idx_dtype, "idx", with_offset=True))

    # The index is loaded once and reused by every per-vector copy.
    load_index = "%(idx_tp)s src_idx = idx[i];\n" % type_names
    copies = "\n".join(f"dest{k}[i] = src{k}[src_idx];" for k in lanes)

    return get_elwise_kernel(context, args, load_index + copies, name="take")
예제 #45
0
    def get_count_kernel(self, index_dtype):
        """Render, build and return the ``<name_prefix>_count`` kernel.

        The source comes from ``_LIST_BUILDER_TEMPLATE`` rendered with
        ``is_count_stage=True``.  Lists named in ``self.count_sharing`` get
        neither a ``plb_<name>_count`` kernel argument nor a
        ``plb_loc_<name>_count`` user argument.

        :arg index_dtype: dtype of the count buffers; also registered as the
            dtype of the kernel's final scalar argument.
        :returns: the built kernel with its scalar argument dtypes set.
        """
        from pyopencl.characterize import has_double_support
        from pyopencl.tools import (
                VectorArg, OtherArg, get_arg_list_scalar_arg_dtypes)

        index_ctype = dtype_to_ctype(index_dtype)

        kernel_list_args = []
        for list_name, _ in self.list_names_and_dtypes:
            if list_name not in self.count_sharing:
                kernel_list_args.append(
                        VectorArg(index_dtype, "plb_%s_count" % list_name))

        def local_counter_arg(list_name):
            counter = "plb_loc_%s_count" % list_name
            return OtherArg("%s *%s" % (index_ctype, counter), counter)

        user_list_args = [
                local_counter_arg(list_name)
                for list_name, _ in self.list_names_and_dtypes
                if list_name not in self.count_sharing]

        kernel_name = self.name_prefix+"_count"

        template_vars = {
                "is_count_stage": True,
                "kernel_name": kernel_name,
                "double_support": all(
                    has_double_support(dev) for dev in self.context.devices),
                "debug": self.debug,
                "do_not_vectorize": self.do_not_vectorize(),
                "eliminate_empty_output_lists":
                    self.eliminate_empty_output_lists,

                "kernel_list_arg_decl": _get_arg_decl(kernel_list_args),
                "kernel_list_arg_values":
                    _get_arg_list(user_list_args, prefix="&"),
                "user_list_arg_decl": _get_arg_decl(user_list_args),
                "user_list_args": _get_arg_list(user_list_args),
                "user_arg_decl_with_offset": _get_arg_decl(self.arg_decls),
                "user_arg_decl_no_offset":
                    _get_arg_decl(self.arg_decls_no_offset),
                "user_args_no_offset":
                    _get_arg_list(self.arg_decls_no_offset),
                "arg_offset_adjustment":
                    get_arg_offset_adjuster_code(self.arg_decls),

                "list_names_and_dtypes": self.list_names_and_dtypes,
                "count_sharing": self.count_sharing,
                "name_prefix": self.name_prefix,
                "generate_template": self.generate_template,
                "preamble": self.preamble,

                "index_type": index_ctype,
                }
        source = str(_LIST_BUILDER_TEMPLATE.render(**template_vars))

        program = cl.Program(self.context, source).build(self.options)
        kernel = getattr(program, kernel_name)

        # Scalar dtypes follow the argument order: list counts, user
        # arguments, then one trailing scalar of the index dtype.
        scalar_dtypes = get_arg_list_scalar_arg_dtypes(
            kernel_list_args+self.arg_decls)
        kernel.set_scalar_arg_dtypes(scalar_dtypes + [index_dtype])

        return kernel
예제 #46
0
 def get_scan_kernel(self, index_dtype):
     """Return a ``GenericScanKernel`` that sums ``ary`` with ``a+b``.

     The scan starts from the neutral element ``0`` and stores each result
     one slot to the right of its input (``ary[i+1] = item``).

     :arg index_dtype: dtype of the scanned array and of the scan itself.
     """
     from pyopencl.scan import GenericScanKernel

     scan_options = {
             "arguments": f"__global {dtype_to_ctype(index_dtype)} *ary",
             "input_expr": "ary[i]",
             "scan_expr": "a+b",
             "neutral": "0",
             "output_statement": "ary[i+1] = item;",
             "devices": self.devices,
             }
     return GenericScanKernel(self.context, index_dtype, **scan_options)
예제 #47
0
def get_subset_minmax_kernel(ctx, what, dtype, dtype_subset):
    """Return a ``ReductionKernel`` reducing ``in[lookup_tbl[i]]`` with *what*.

    :arg ctx: passed through to ``ReductionKernel``.
    :arg what: name of the reduction function; it is prefixed with ``f`` for
        floating-point data (``"min"`` becomes ``fmin(a,b)``) and used
        unchanged for integer data.
    :arg dtype: dtype of the ``in`` array and of the result.
    :arg dtype_subset: dtype of the ``lookup_tbl`` index array.
    :raises TypeError: if ``dtype.kind`` is neither float nor (un)signed
        integer.
    """
    kind = dtype.kind
    if kind == "f":
        combine = f"f{what}(a,b)"
    elif kind in "iu":
        combine = f"{what}(a,b)"
    else:
        raise TypeError("unsupported dtype specified")

    neutral = get_minmax_neutral(what, dtype)
    value_ctype = dtype_to_ctype(dtype)
    lut_ctype = dtype_to_ctype(dtype_subset)

    return ReductionKernel(
        ctx,
        dtype,
        neutral=neutral,
        reduce_expr=combine,
        map_expr="in[lookup_tbl[i]]",
        arguments=f"const {lut_ctype} *lookup_tbl, const {value_ctype} *in",
        preamble="#define MY_INFINITY (1./0)",
    )
예제 #48
0
def get_divide_kernel(context,
                      dtype_x,
                      dtype_y,
                      dtype_z,
                      x_is_scalar=False,
                      y_is_scalar=False):
    """Return the elementwise kernel ``divide`` computing ``z = x / y``.

    Operands whose dtype differs from *dtype_z* are converted first: with a
    complex result and differing operand dtypes, only complex operands are
    converted (via ``<z>_cast``); in every other case a plain C cast to the
    output type is used.  The quotient itself uses ``<z>_divide``,
    ``<z>_rdivide`` or ``<z>_divider`` depending on which operands are
    complex, and ``/`` when neither is.

    :arg context: passed through to :func:`get_elwise_kernel`.
    :arg dtype_x: dtype of the numerator buffer.
    :arg dtype_y: dtype of the denominator buffer.
    :arg dtype_z: dtype of the output buffer.
    :arg x_is_scalar: if true, every work item reads ``x[0]``.
    :arg y_is_scalar: if true, every work item reads ``y[0]``.
    """
    x_is_complex = dtype_x.kind == "c"
    y_is_complex = dtype_y.kind == "c"
    z_is_complex = dtype_z.kind == "c"

    def z_call(suffix, *operands):
        # The complex type name is looked up lazily so it is only requested
        # on the paths that really emit a complex helper call.
        return "{}_{}({})".format(
            complex_dtype_to_name(dtype_z), suffix, ", ".join(operands))

    complex_with_mixed_inputs = z_is_complex and dtype_x != dtype_y

    def to_output_type(operand, operand_dtype, operand_is_complex):
        if operand_dtype == dtype_z:
            return operand
        if complex_with_mixed_inputs:
            # Real operands stay as they are; the mixed divide helpers
            # below take them directly.
            return z_call("cast", operand) if operand_is_complex else operand
        return f"({dtype_to_ctype(dtype_z)}) ({operand})"

    x = to_output_type(
        "x[0]" if x_is_scalar else "x[i]", dtype_x, x_is_complex)
    y = to_output_type(
        "y[0]" if y_is_scalar else "y[i]", dtype_y, y_is_complex)

    divide_helper = {
        (True, True): "divide",
        (False, True): "rdivide",
        (True, False): "divider",
    }.get((x_is_complex, y_is_complex))

    if divide_helper is None:
        quotient = f"{x} / {y}"
    else:
        quotient = z_call(divide_helper, x, y)

    if z_is_complex:
        quotient = z_call("cast", quotient)

    signature = "{tp_z} *z, {tp_x} *x, {tp_y} *y".format(
        tp_x=dtype_to_ctype(dtype_x),
        tp_y=dtype_to_ctype(dtype_y),
        tp_z=dtype_to_ctype(dtype_z),
    )
    return get_elwise_kernel(
        context, signature, "z[i] = " + quotient, name="divide")
예제 #49
0
def get_reduction_kernel(stage,
                         ctx,
                         dtype_out,
                         neutral,
                         reduce_expr,
                         arguments=None,
                         name="reduce_kernel",
                         preamble="",
                         map_exprs=None,
                         device=None,
                         options=None,
                         max_group_size=None):
    """Generate, build and return one stage of a multi-output reduction.

    :arg stage: ``2`` selects the second stage, whose default map
        expressions read ``pyopencl_reduction_inp_<i>[i]`` and whose argument
        list is prefixed with one such input vector per map expression.  Any
        other value uses ``in[i]`` as the default map expression; offset
        adjustment code is generated only for ``stage == 1``.
    :arg ctx: context the program is created in.
    :arg dtype_out: dtype of the reduction result.
    :arg neutral: neutral element, passed to the source generator.
    :arg reduce_expr: reduction expression, passed to the source generator.
    :arg arguments: argument list in a form accepted by ``parse_arg_list``.
        NOTE(review): when left as ``None`` it is forwarded unparsed to the
        source generator and to ``get_arg_list_scalar_arg_dtypes``; confirm
        those helpers accept that.
    :arg name: name of the generated kernel.
    :arg preamble: source text passed to the source generator.
    :arg map_exprs: list of map expressions, one per output.  ``None``
        entries are replaced **in place** by the stage's default expression,
        so the caller's list is modified.
    :arg device: passed to the source generator.
    :arg options: build options for the program; ``None`` means no options.
    :arg max_group_size: passed to the source generator.
    :returns: the object produced by ``_get_reduction_source`` with
        ``program``, ``kernel`` and ``arg_types`` attributes filled in.
    :raises ValueError: if *map_exprs* is ``None``.
    """

    if map_exprs is None:
        raise ValueError("map_exprs has to be given!")

    # A list literal as the default would be one object shared by every
    # call, so a fresh list is created per call instead.
    if options is None:
        options = []

    for i, m in enumerate(map_exprs):
        if m is None:
            if stage == 2:
                map_exprs[i] = "pyopencl_reduction_inp_%i[i]" % i
            else:
                map_exprs[i] = "in[i]"

    from pyopencl.tools import (parse_arg_list, get_arg_list_scalar_arg_dtypes,
                                get_arg_offset_adjuster_code, VectorArg)

    arg_prep = ""
    if stage == 1 and arguments is not None:
        arguments = parse_arg_list(arguments, with_offset=True)
        arg_prep = get_arg_offset_adjuster_code(arguments)

    if stage == 2 and arguments is not None:
        arguments = parse_arg_list(arguments)
        # The second stage consumes the partial results of the first stage,
        # one input vector per map expression, ahead of the user arguments.
        arguments = ([
            VectorArg(dtype_out, "pyopencl_reduction_inp_%i" % i)
            for i in range(len(map_exprs))
        ] + arguments)

    inf = _get_reduction_source(ctx, dtype_to_ctype(dtype_out),
                                dtype_out.itemsize, neutral, reduce_expr,
                                map_exprs, arguments, name, preamble, arg_prep,
                                device, max_group_size)

    inf.program = cl.Program(ctx, inf.source)
    inf.program.build(options)
    inf.kernel = getattr(inf.program, name)

    inf.arg_types = arguments

    # Kernel argument layout: one non-scalar slot per map expression, one
    # int64, the user arguments, then two trailing uint32 values.
    inf.kernel.set_scalar_arg_dtypes(
        [
            None,
        ] * len(map_exprs) + [np.int64] +
        get_arg_list_scalar_arg_dtypes(inf.arg_types) + [np.uint32] * 2)

    return inf
예제 #50
0
def get_subset_dot_kernel(ctx, dtype_out, dtype_subset, dtype_a=None, dtype_b=None,
        conjugate_first=False):
    """Return a ``ReductionKernel`` summing a dot-product map expression
    evaluated at the positions listed in ``lookup_tbl``.

    The map expression and the final *dtype_out* / *dtype_b* come from
    :func:`_get_dot_expr`, called with ``index_expr="lookup_tbl[i]"``.

    :arg ctx: context; its first device decides double support.
    :arg dtype_out: requested result dtype (may be replaced by the helper).
    :arg dtype_subset: dtype of the ``lookup_tbl`` index array.
    :arg dtype_a: dtype of ``a``.  NOTE(review): it is passed to
        :func:`dtype_to_ctype` as given, so the ``None`` default presumably
        cannot work -- confirm.
    :arg dtype_b: dtype of ``b`` (may be replaced by the helper).
    :arg conjugate_first: forwarded to :func:`_get_dot_expr`.
    """
    from pyopencl.characterize import has_double_support

    double_ok = has_double_support(ctx.devices[0])
    map_expr, dtype_out, dtype_b = _get_dot_expr(
            dtype_out, dtype_a, dtype_b, conjugate_first,
            has_double_support=double_ok,
            index_expr="lookup_tbl[i]")

    # important: lookup_tbl must be first--it controls the length
    declarations = [
            "const %s *lookup_tbl" % dtype_to_ctype(dtype_subset),
            "const %s *a" % dtype_to_ctype(dtype_a),
            "const %s *b" % dtype_to_ctype(dtype_b),
            ]

    return ReductionKernel(
            ctx, dtype_out,
            neutral="0",
            reduce_expr="a+b",
            map_expr=map_expr,
            arguments=", ".join(declarations))
예제 #51
0
 def python_dtype_str_inner(self, dtype):
     """Return source text of a Python expression that evaluates to *dtype*.

     Builtin dtypes are spelled through the ``_lpy_np`` name, for example
     ``_lpy_np.dtype(_lpy_np.float64)``.  Every other dtype is looked up by
     its C type name through ``_lpy_cl_tools.get_or_register_dtype``.

     :arg dtype: object exposing ``isbuiltin`` and ``name`` (presumably a
         ``numpy.dtype``).
     :returns: the expression as a string.
     """
     if dtype.isbuiltin:
         # ``bool8`` was removed in NumPy 2.0, and plain ``bool`` is missing
         # from some 1.x releases; ``bool_`` exists in every release.
         name = "bool_" if dtype.name == "bool" else dtype.name
         return f"_lpy_np.dtype(_lpy_np.{name})"

     # Imported here because only non-builtin dtypes need pyopencl.
     import pyopencl.tools as cl_tools
     return ('_lpy_cl_tools.get_or_register_dtype("%s")'
             % cl_tools.dtype_to_ctype(dtype))
예제 #52
0
def get_copy_kernel(context, dtype_dest, dtype_src):
    """Return the elementwise kernel ``copy`` performing ``dest[i] = src[i]``
    with conversion to a complex destination where needed.

    * A complex destination fed from a non-complex source wraps the source
      element in ``<dest>_fromreal``.
    * A complex destination whose dtype differs from the source dtype
      additionally wraps the value in ``<dest>_cast``.

    :arg context: supplies ``devices[0]`` for the struct preamble and is
        passed through to :func:`get_elwise_kernel`.
    :arg dtype_dest: dtype of the destination buffer.
    :arg dtype_src: dtype of the source buffer.
    :raises TypeError: if the dtypes differ and either one has kind ``"V"``.
    """
    dest_is_complex = dtype_dest.kind == "c"

    src = "src[i]"
    if dest_is_complex and dtype_src.kind != "c":
        src = "%s_fromreal(%s)" % (complex_dtype_to_name(dtype_dest), src)

    if dest_is_complex and dtype_src != dtype_dest:
        # No trailing comma here: it would silently turn ``src`` into a
        # one-element tuple.
        src = "%s_cast(%s)" % (complex_dtype_to_name(dtype_dest), src)

    if dtype_dest != dtype_src and (
            dtype_dest.kind == "V" or dtype_src.kind == "V"):
        raise TypeError("copying between non-identical struct types")

    return get_elwise_kernel(context,
            "%(tp_dest)s *dest, %(tp_src)s *src" % {
                "tp_dest": dtype_to_ctype(dtype_dest),
                "tp_src": dtype_to_ctype(dtype_src),
                },
            "dest[i] = %s" % src,
            preamble=dtype_to_c_struct(context.devices[0], dtype_dest),
            name="copy")
예제 #53
0
def dtype_to_ctype(dtype):
    """Return the CL type string for a numpy data type.

    This is a thin wrapper that delegates to
    ``pyopencl.tools.dtype_to_ctype``.

    Args:
        dtype (np.dtype): the numpy data type

    Returns:
        str: the CL type string for the corresponding type
    """
    # Aliased so the delegate cannot be mistaken for this wrapper.
    from pyopencl.tools import dtype_to_ctype as _cl_dtype_to_ctype

    ctype = _cl_dtype_to_ctype(dtype)
    return ctype
예제 #54
0
def get_copy_kernel(context, dtype_dest, dtype_src):
    """Return the elementwise kernel ``copy`` performing ``dest[i] = src[i]``
    with conversion to a complex destination where needed.

    * A complex destination fed from a non-complex source wraps the source
      element in ``<dest>_fromreal``.
    * A complex destination whose dtype differs from the source dtype
      additionally wraps the value in ``<dest>_cast``.

    :arg context: supplies ``devices[0]`` for the struct preamble and is
        passed through to :func:`get_elwise_kernel`.
    :arg dtype_dest: dtype of the destination buffer.
    :arg dtype_src: dtype of the source buffer.
    :raises TypeError: if the dtypes differ and either one has kind ``"V"``.
    """
    dest_is_complex = dtype_dest.kind == "c"

    src = "src[i]"
    if dest_is_complex and dtype_src.kind != "c":
        src = "{}_fromreal({})".format(complex_dtype_to_name(dtype_dest), src)

    if dest_is_complex and dtype_src != dtype_dest:
        # No trailing comma here: it would silently turn ``src`` into a
        # one-element tuple.
        src = "{}_cast({})".format(complex_dtype_to_name(dtype_dest), src)

    if dtype_dest != dtype_src and (
            dtype_dest.kind == "V" or dtype_src.kind == "V"):
        raise TypeError("copying between non-identical struct types")

    return get_elwise_kernel(context,
            "{tp_dest} *dest, {tp_src} *src".format(
                tp_dest=dtype_to_ctype(dtype_dest),
                tp_src=dtype_to_ctype(dtype_src),
                ),
            "dest[i] = %s" % src,
            preamble=dtype_to_c_struct(context.devices[0], dtype_dest),
            name="copy")
예제 #55
0
def get_axpbz_kernel(context, dtype_a, dtype_x, dtype_b, dtype_z):
    """Return the elementwise kernel ``axpb`` computing ``z = a*x + b``.

    When both ``a`` and ``x`` are complex the product uses ``<z>_mul`` (after
    casting either operand whose dtype differs from *dtype_z*); otherwise the
    plain ``a*x[i]`` is used.  For a complex result, purely real terms are
    promoted with ``<z>_fromreal`` and both terms are finally wrapped in
    ``<z>_cast``.

    :arg context: passed through to :func:`get_elwise_kernel`.
    :arg dtype_a: dtype of the scalar ``a``.
    :arg dtype_x: dtype of the ``x`` buffer.
    :arg dtype_b: dtype of the scalar ``b``.
    :arg dtype_z: dtype of the output buffer.
    """
    a_is_complex = dtype_a.kind == "c"
    x_is_complex = dtype_x.kind == "c"
    b_is_complex = dtype_b.kind == "c"
    z_is_complex = dtype_z.kind == "c"

    def z_call(suffix, *operands):
        # Looked up on demand so the complex type name is only requested on
        # the paths that emit a complex helper call.
        return "%s_%s(%s)" % (
            complex_dtype_to_name(dtype_z), suffix, ", ".join(operands))

    if a_is_complex and x_is_complex:
        a_term = "a" if dtype_a == dtype_z else z_call("cast", "a")
        x_term = "x[i]" if dtype_x == dtype_z else z_call("cast", "x[i]")
        product = z_call("mul", a_term, x_term)
    else:
        product = "a*x[i]"

    offset = "b"
    if z_is_complex:
        if not b_is_complex:
            offset = z_call("fromreal", offset)
        if not (a_is_complex or x_is_complex):
            product = z_call("fromreal", product)

        product = z_call("cast", product)
        offset = z_call("cast", offset)

    tp_a = dtype_to_ctype(dtype_a)
    tp_x = dtype_to_ctype(dtype_x)
    tp_b = dtype_to_ctype(dtype_b)
    tp_z = dtype_to_ctype(dtype_z)

    return get_elwise_kernel(
        context,
        f"{tp_z} *z, {tp_a} a, {tp_x} *x,{tp_b} b",
        f"z[i] = {product} + {offset}",
        name="axpb")
예제 #56
0
def get_reduction_kernel(stage,
                         ctx,
                         dtype_out,
                         neutral,
                         reduce_expr,
                         map_expr=None,
                         arguments=None,
                         name="reduce_kernel",
                         preamble="",
                         device=None,
                         options=None,
                         max_group_size=None):
    """Generate, build and describe one stage of a reduction.

    :arg stage: ``2`` selects the second stage, which reads
        ``pyopencl_reduction_inp[i]`` by default and gets that vector
        prepended to its arguments; any other value defaults the map
        expression to ``in[i]``.
    :arg ctx: context the program is created in.
    :arg dtype_out: dtype of the reduction result.
    :arg neutral: neutral element, passed to the source generator.
    :arg reduce_expr: reduction expression, passed to the source generator.
    :arg map_expr: map expression; ``None`` selects the stage default.
    :arg arguments: argument list in a form accepted by ``parse_arg_list``;
        required.
    :arg name: name of the generated kernel.
    :arg preamble: source text passed to the source generator.
    :arg device: passed to the source generator.
    :arg options: passed to ``Program.build``.
    :arg max_group_size: passed to the source generator.
    :returns: a ``_ReductionInfo`` holding the context, source, group size,
        program, kernel and parsed argument list.
    :raises ValueError: if *arguments* is ``None``.
    """

    if map_expr is None:
        map_expr = "pyopencl_reduction_inp[i]" if stage == 2 else "in[i]"

    from pyopencl.tools import (parse_arg_list, get_arg_list_scalar_arg_dtypes,
                                get_arg_offset_adjuster_code, VectorArg)

    if arguments is None:
        raise ValueError("arguments must not be None")

    arguments = parse_arg_list(arguments, with_offset=True)
    arg_prep = get_arg_offset_adjuster_code(arguments)

    if stage == 2 and arguments is not None:
        # The second stage consumes the first stage's partial results, so
        # that buffer goes ahead of the user arguments.
        stage_input = VectorArg(dtype_out, "pyopencl_reduction_inp")
        arguments = [stage_input] + arguments

    out_ctype = dtype_to_ctype(dtype_out)
    source, group_size = _get_reduction_source(
        ctx, out_ctype, dtype_out.itemsize, neutral, reduce_expr, map_expr,
        arguments, name, preamble, arg_prep, device, max_group_size)

    program = cl.Program(ctx, source)
    program.build(options)
    kernel = getattr(program, name)

    # Kernel argument layout: one non-scalar slot and one int64 first, the
    # user arguments, then three int64 values, one uint32 and one int64.
    leading = [None, np.int64]
    trailing = [np.int64] * 3 + [np.uint32, np.int64]
    kernel.set_scalar_arg_dtypes(
        leading + get_arg_list_scalar_arg_dtypes(arguments) + trailing)

    return _ReductionInfo(context=ctx,
                          source=source,
                          group_size=group_size,
                          program=program,
                          kernel=kernel,
                          arg_types=arguments)
예제 #57
0
def get_put_kernel(context, dtype, idx_dtype, vec_count=1):
    """Return the elementwise kernel ``put``:
    ``dest<k>[gmem_dest_idx[i]] = src<k>[i]``.

    Argument order of the generated kernel is all ``dest`` vectors, the
    single ``gmem_dest_idx`` vector, then all ``src`` vectors; every one is
    declared with an offset.

    :arg context: passed through to :func:`get_elwise_kernel`.
    :arg dtype: element dtype of every ``dest``/``src`` vector.
    :arg idx_dtype: dtype of the destination index vector.
    :arg vec_count: number of ``dest``/``src`` pairs sharing the one index.
    """
    type_names = dict(
        idx_tp=dtype_to_ctype(idx_dtype),
        tp=dtype_to_ctype(dtype),
    )
    lanes = range(vec_count)

    args = []
    args.extend(
        VectorArg(dtype, f"dest{k}", with_offset=True) for k in lanes)
    args.append(VectorArg(idx_dtype, "gmem_dest_idx", with_offset=True))
    args.extend(
        VectorArg(dtype, f"src{k}", with_offset=True) for k in lanes)

    # The destination index is loaded once and reused by every store.
    load_index = "%(idx_tp)s dest_idx = gmem_dest_idx[i];\n" % type_names
    stores = "\n".join(f"dest{k}[dest_idx] = src{k}[i];" for k in lanes)

    return get_elwise_kernel(context, args, load_index + stores, name="put")
예제 #58
0
def get_arange_kernel(context, dtype):
    """Return the elementwise kernel ``arange`` computing
    ``z[i] = start + i*step``.

    For complex dtypes the expression is built from the ``<type>_add`` and
    ``<type>_rmul`` helpers instead of the C ``+`` and ``*`` operators:
    applying ``*`` to two complex operands is not a complex multiplication,
    and struct-based complex types have no operators at all.

    :arg context: passed through to :func:`get_elwise_kernel`.
    :arg dtype: dtype of the output vector and of ``start`` / ``step``.
    """
    if dtype.kind == "c":
        root = complex_dtype_to_name(dtype)
        # real index times complex step, then complex addition
        expr = "%s_add(start, %s_rmul(i, step))" % (root, root)
    else:
        expr = "start + (%s) i*step" % dtype_to_ctype(dtype)

    return get_elwise_kernel(context, [
        VectorArg(dtype, "z", with_offset=True),
        ScalarArg(dtype, "start"),
        ScalarArg(dtype, "step"),
    ],
                             "z[i] = " + expr,
                             name="arange")
예제 #59
0
def get_arange_kernel(context, dtype):
    """Return the elementwise kernel ``arange`` computing
    ``z[i] = start + i*step``.

    Complex dtypes use the ``<type>_add`` / ``<type>_rmul`` helpers; every
    other dtype casts the index to the element type and uses the C
    operators.

    :arg context: passed through to :func:`get_elwise_kernel`.
    :arg dtype: dtype of the output vector and of ``start`` / ``step``.
    """
    if dtype.kind != "c":
        value = "start + ((%s) i)*step" % dtype_to_ctype(dtype)
    else:
        root = complex_dtype_to_name(dtype)
        value = f"{root}_add(start, {root}_rmul(i, step))"

    kernel_args = [
        VectorArg(dtype, "z", with_offset=True),
        ScalarArg(dtype, "start"),
        ScalarArg(dtype, "step"),
    ]
    return get_elwise_kernel(
        context, kernel_args, f"z[i] = {value}", name="arange")
예제 #60
0
def get_dot_kernel(ctx, dtype_out, dtype_a=None, dtype_b=None,
        conjugate_first=False):
    """Return a ``ReductionKernel`` that sums a dot-product map expression
    over ``a`` and ``b``.

    The map expression and the final *dtype_out* / *dtype_b* come from
    :func:`_get_dot_expr`.  A complex result is accumulated with
    ``<type>_add`` starting from ``<type>_new(0, 0)``; any other result uses
    ``a+b`` starting from ``0``.

    :arg ctx: context; its first device decides double support.
    :arg dtype_out: requested result dtype (may be replaced by the helper).
    :arg dtype_a: dtype of ``a``.  NOTE(review): it is passed to
        :func:`dtype_to_ctype` as given, so the ``None`` default presumably
        cannot work -- confirm.
    :arg dtype_b: dtype of ``b`` (may be replaced by the helper).
    :arg conjugate_first: forwarded to :func:`_get_dot_expr`.
    """
    from pyopencl.characterize import has_double_support

    double_ok = has_double_support(ctx.devices[0])
    map_expr, dtype_out, dtype_b = _get_dot_expr(
            dtype_out, dtype_a, dtype_b, conjugate_first,
            has_double_support=double_ok)

    if dtype_out.kind == "c":
        from pyopencl.elementwise import complex_dtype_to_name
        root = complex_dtype_to_name(dtype_out)
        combine = f"{root}_add(a, b)"
        zero = f"{root}_new(0, 0)"
    else:
        combine = "a+b"
        zero = "0"

    declarations = [
            "const %s *a" % dtype_to_ctype(dtype_a),
            "const %s *b" % dtype_to_ctype(dtype_b),
            ]

    return ReductionKernel(
            ctx, dtype_out,
            neutral=zero,
            reduce_expr=combine,
            map_expr=map_expr,
            arguments=", ".join(declarations))