Exemplo n.º 1
0
    def get_kernels(self, key_dtype, value_dtype, starts_dtype):
        """Build the device kernels needed to group values by key.

        :arg key_dtype: dtype of the entries of the ``keys`` array.
        :arg value_dtype: dtype of the entries of the ``values`` array.
        :arg starts_dtype: dtype of the entries of the group-starts array.
        :returns: a ``_KernelInfo`` holding ``by_target_sorter``,
            ``start_finder`` and ``bound_propagation_scan``.
        """
        from pyopencl.algorithm import RadixSort
        from pyopencl.tools import VectorArg, ScalarArg

        # Sorts the ("values", "keys") pair of arrays, using keys[i] as the
        # sort key.
        by_target_sorter = RadixSort(self.context, [
            VectorArg(value_dtype, "values"),
            VectorArg(key_dtype, "keys"),
        ],
                                     key_expr="keys[i]",
                                     sort_arg_names=["values", "keys"])

        # Wherever the sorted key differs from its predecessor (or at i == 0),
        # store the position i as the start of that key's group.
        from pyopencl.elementwise import ElementwiseTemplate
        start_finder = ElementwiseTemplate(arguments="""//CL//
                starts_t *key_group_starts,
                key_t *keys_sorted_by_key,
                """,
                                           operation=r"""//CL//
                key_t my_key = keys_sorted_by_key[i];

                if (i == 0 || my_key != keys_sorted_by_key[i-1])
                    key_group_starts[my_key] = i;
                """,
                                           name="find_starts").build(
                                               self.context,
                                               type_aliases=(
                                                   # key_t has to match the
                                                   # element type of the keys
                                                   # array (declared with
                                                   # key_dtype above), not the
                                                   # type of the starts array.
                                                   ("key_t", key_dtype),
                                                   ("starts_t", starts_dtype),
                                               ),
                                               var_values=())

        # Running minimum over "starts", read and written back to front
        # (index nkeys-i).
        from pyopencl.scan import GenericScanKernel
        bound_propagation_scan = GenericScanKernel(
            self.context,
            starts_dtype,
            arguments=[
                VectorArg(starts_dtype, "starts"),
                # starts has length n+1
                ScalarArg(key_dtype, "nkeys"),
            ],
            input_expr="starts[nkeys-i]",
            scan_expr="min(a, b)",
            # The largest representable value is the identity for min().
            neutral=_make_cl_int_literal(
                np.iinfo(starts_dtype).max, starts_dtype),
            output_statement="starts[nkeys-i] = item;")

        return _KernelInfo(by_target_sorter=by_target_sorter,
                           start_finder=start_finder,
                           bound_propagation_scan=bound_propagation_scan)
Exemplo n.º 2
0
def get_array_binop_kernel(context,
                           operator,
                           dtype_res,
                           dtype_a,
                           dtype_b,
                           a_is_scalar=False,
                           b_is_scalar=False):
    """Return an elementwise kernel computing ``out[i] = a <operator> b``.

    An operand flagged as scalar is read from index 0 of its array for every
    output element; otherwise it is read at the output index ``i``.
    """
    if a_is_scalar:
        lhs = "a[0]"
    else:
        lhs = "a[i]"

    if b_is_scalar:
        rhs = "b[0]"
    else:
        rhs = "b[i]"

    kernel_args = [
        VectorArg(arg_dtype, arg_name, with_offset=True)
        for arg_dtype, arg_name in (
            (dtype_res, "out"),
            (dtype_a, "a"),
            (dtype_b, "b"),
        )
    ]
    operation = f"out[i] = {lhs} {operator} {rhs}"

    return get_elwise_kernel(context, kernel_args, operation,
                             name="binop_kernel")
Exemplo n.º 3
0
def get_binary_func_kernel(context,
                           func_name,
                           x_dtype,
                           y_dtype,
                           out_dtype,
                           preamble="",
                           name=None):
    """Return an elementwise kernel computing ``z[i] = func_name(x[i], y[i])``.

    When *name* is None the kernel is named ``"<func_name>_kernel"``.
    """
    kernel_name = name
    if kernel_name is None:
        kernel_name = "%s_kernel" % func_name

    kernel_args = [
        VectorArg(out_dtype, "z", with_offset=True),
        VectorArg(x_dtype, "x", with_offset=True),
        VectorArg(y_dtype, "y", with_offset=True),
    ]
    operation = "z[i] = %s(x[i], y[i])" % func_name

    return get_elwise_kernel(context, kernel_args, operation,
                             name=kernel_name,
                             preamble=preamble)
Exemplo n.º 4
0
def test_list_builder_with_offset(ctx_factory):
    """Check that ListOfListsBuilder accepts an array argument with an offset.

    The input array is zero in its first 10 entries and one afterwards; the
    builder is fed the slice starting at entry 10, so every appended item
    must be 1.
    """
    from pytest import importorskip
    importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.algorithm import ListOfListsBuilder
    from pyopencl.tools import VectorArg

    generate_src = """//CL//
            void generate(LIST_ARG_DECL USER_ARG_DECL index_type i)
            {
                APPEND_mylist(input_list[i]);
            }
            """
    user_args = [VectorArg(float, "input_list", with_offset=True)]
    builder = ListOfListsBuilder(context, [("mylist", np.int32)],
                                 generate_src, arg_decls=user_args)

    nitems = 10000
    skipped = 10
    input_list = cl.array.zeros(queue, (nitems + skipped,), float)
    input_list[skipped:] = 1

    result, _evt = builder(queue, nitems, input_list[skipped:])

    list_info = result["mylist"]
    assert list_info.count == nitems
    assert (list_info.lists.get() == 1).all()
Exemplo n.º 5
0
def get_linear_combination_kernel(summand_descriptors,
        dtype_z):
    """Not ported yet: always raises :class:`NotImplementedError`.

    Everything after the ``raise`` is unreachable. It assembles a kernel
    computing ``z[i] = a0*x0[i] + a1*x1[i] + ...`` from
    *summand_descriptors*, an iterable of
    ``(is_gpu_scalar, scalar_dtype, vector_dtype)`` tuples, and *dtype_z*,
    the dtype of the output vector ``z``.

    NOTE(review): the unreachable code uses CUDA-style texture references
    (``cudaReadModeElementType``, ``get_texref``, ``func.prepare``);
    presumably it is kept as the starting point for the port -- confirm
    before deleting.
    """
    # TODO: Port this!
    raise NotImplementedError

    from pyopencl.tools import dtype_to_ctype
    from pyopencl.elementwise import \
            VectorArg, ScalarArg, get_elwise_module

    args = []
    preamble = []
    loop_prep = []
    summands = []
    tex_names = []

    for i, (is_gpu_scalar, scalar_dtype, vector_dtype) in \
            enumerate(summand_descriptors):
        if is_gpu_scalar:
            # Coefficient is fetched from a texture: declare the texture in
            # the preamble and load it into a local "a<i>" before the loop.
            preamble.append(
                    "texture <%s, 1, cudaReadModeElementType> tex_a%d;"
                    % (dtype_to_ctype(scalar_dtype, with_fp_tex_hack=True), i))
            args.append(VectorArg(vector_dtype, "x%d" % i, with_offset=True))
            tex_names.append("tex_a%d" % i)
            loop_prep.append(
                    "%s a%d = fp_tex1Dfetch(tex_a%d, 0)"
                    % (dtype_to_ctype(scalar_dtype), i, i))
        else:
            # Coefficient is passed directly as a scalar kernel argument.
            args.append(ScalarArg(scalar_dtype, "a%d" % i))
            args.append(VectorArg(vector_dtype, "x%d" % i, with_offset=True))

        summands.append("a%d*x%d[i]" % (i, i))

    # Output vector and element count come last in the argument list.
    args.append(VectorArg(dtype_z, "z", with_offset=True))
    args.append(ScalarArg(np.uintp, "n"))

    mod = get_elwise_module(args,
            "z[i] = " + " + ".join(summands),
            "linear_combination",
            preamble="\n".join(preamble),
            loop_prep=";\n".join(loop_prep))

    func = mod.get_function("linear_combination")
    tex_src = [mod.get_texref(tn) for tn in tex_names]
    func.prepare("".join(arg.struct_char for arg in args),
            (1, 1, 1), texrefs=tex_src)

    return func, tex_src
Exemplo n.º 6
0
def get_bessel_kernel(context,
                      which_func,
                      out_dtype=np.float64,
                      order_dtype=np.int32,
                      x_dtype=np.float64):
    """Return an elementwise kernel for ``z[i] = bessel_<which_func>n(ord_n, x[i])``.

    *which_func* is spliced into the device function name, the kernel name
    and the name of the included ``pyopencl-bessel-<which_func>.cl`` file.
    """
    kernel_args = [
        VectorArg(out_dtype, "z", with_offset=True),
        ScalarArg(order_dtype, "ord_n"),
        VectorArg(x_dtype, "x", with_offset=True),
    ]
    operation = "z[i] = bessel_%sn(ord_n, x[i])" % which_func
    kernel_name = "bessel_%sn_kernel" % which_func
    preamble = """
        #pragma OPENCL EXTENSION cl_khr_fp64: enable
        #define PYOPENCL_DEFINE_CDOUBLE
        #include <pyopencl-bessel-%s.cl>
        """ % which_func

    return get_elwise_kernel(context, kernel_args, operation,
                             name=kernel_name,
                             preamble=preamble)
Exemplo n.º 7
0
def get_all_kernel(ctx, dtype_in):
    """Return a reduction kernel that ANDs together ``(bool) in[i]``.

    The result dtype is ``np.int8``, the neutral element is ``"true"`` and
    partial results are combined with ``a && b``.
    """
    from pyopencl.tools import VectorArg

    input_args = [VectorArg(dtype_in, "in")]
    return ReductionKernel(ctx, np.int8, "true", "a && b",
                           map_expr="(bool) (in[i])",
                           arguments=input_args)
Exemplo n.º 8
0
def get_bessel_kernel(context, which_func, out_dtype=np.float64,
                      order_dtype=np.int32, x_dtype=np.float64):
    """Return an elementwise kernel evaluating a Bessel function.

    For non-complex *x_dtype* the kernel computes
    ``z[i] = bessel_<which_func>n(ord_n, x[i])``.  For complex *x_dtype*
    only ``which_func == "j"`` with ``complex128`` input and output is
    implemented; the kernel then stores the first result of
    ``bessel_j_complex`` in ``z[i]``.

    :arg which_func: function letter spliced into the device function,
        kernel and include-file names.
    :arg x_dtype: dtype (or anything :class:`numpy.dtype` accepts) of ``x``.
    :raises NotImplementedError: for complex input with
        ``which_func != "j"``, with an *x_dtype* other than ``complex128``,
        or with *out_dtype* different from *x_dtype*.
    """
    # The defaults are scalar types (np.float64), which have no ``.kind``;
    # normalise so that both types and dtype instances are accepted.
    x_dtype = np.dtype(x_dtype)

    if x_dtype.kind != "c":
        return get_elwise_kernel(context, [
            VectorArg(out_dtype, "z", with_offset=True),
            ScalarArg(order_dtype, "ord_n"),
            VectorArg(x_dtype, "x", with_offset=True),
            ],
            "z[i] = bessel_%sn(ord_n, x[i])" % which_func,
            name="bessel_%sn_kernel" % which_func,
            preamble="""
            #if __OPENCL_C_VERSION__ < 120
            #pragma OPENCL EXTENSION cl_khr_fp64: enable
            #endif
            #define PYOPENCL_DEFINE_CDOUBLE
            #include <pyopencl-bessel-%s.cl>
            """ % which_func)
    else:
        if which_func != "j":
            raise NotImplementedError("complex arguments for Bessel Y")

        if x_dtype != np.complex128:
            raise NotImplementedError("non-complex double dtype")
        if x_dtype != out_dtype:
            raise NotImplementedError("different input/output types")

        # bessel_j_complex yields two values; only the first is written out.
        return get_elwise_kernel(context, [
            VectorArg(out_dtype, "z", with_offset=True),
            ScalarArg(order_dtype, "ord_n"),
            VectorArg(x_dtype, "x", with_offset=True),
            ],
            """
            cdouble_t jv_loc;
            cdouble_t jvp1_loc;
            bessel_j_complex(ord_n, x[i], &jv_loc, &jvp1_loc);
            z[i] = jv_loc;
            """,
            name="bessel_j_complex_kernel",
            preamble="""
            #if __OPENCL_C_VERSION__ < 120
            #pragma OPENCL EXTENSION cl_khr_fp64: enable
            #endif
            #define PYOPENCL_DEFINE_CDOUBLE
            #include <pyopencl-complex.h>
            #include <pyopencl-bessel-j-complex.cl>
            """)
Exemplo n.º 9
0
def get_take_kernel(context, dtype, idx_dtype, vec_count=1):
    """Return an elementwise kernel doing ``dest<k>[i] = src<k>[idx[i]]``.

    The gather is applied to *vec_count* (dest, src) array pairs that all
    share the single index array ``idx``.
    """
    type_names = {}
    type_names["idx_tp"] = dtype_to_ctype(idx_dtype)
    type_names["tp"] = dtype_to_ctype(dtype)

    vec_numbers = range(vec_count)

    # Argument order: all destinations, then all sources, then the indices.
    args = []
    for prefix in ("dest", "src"):
        args.extend(
            VectorArg(dtype, prefix + str(k), with_offset=True)
            for k in vec_numbers)
    args.append(VectorArg(idx_dtype, "idx", with_offset=True))

    copy_lines = ["dest%d[i] = src%d[src_idx];" % (k, k) for k in vec_numbers]
    body = ("%(idx_tp)s src_idx = idx[i];\n" % type_names
            + "\n".join(copy_lines))

    return get_elwise_kernel(context, args, body, name="take")
Exemplo n.º 10
0
    def get_count_kernel(self, index_dtype):
        """Render, build and return the count-stage kernel.

        :arg index_dtype: dtype of the per-list counter arrays and of the
            trailing scalar kernel argument.
        :returns: the built kernel named ``self.name_prefix + "_count"``,
            with its scalar argument dtypes already set.
        """
        index_ctype = dtype_to_ctype(index_dtype)
        from pyopencl.tools import VectorArg, OtherArg

        # Lists named in self.count_sharing get no counter of their own.
        counted_lists = [
            list_name
            for list_name, _ in self.list_names_and_dtypes
            if list_name not in self.count_sharing]

        kernel_list_args = [
            VectorArg(index_dtype, "plb_%s_count" % list_name)
            for list_name in counted_lists]

        user_list_args = []
        for list_name in counted_lists:
            local_name = "plb_loc_%s_count" % list_name
            decl = "%s *%s" % (index_ctype, local_name)
            user_list_args.append(OtherArg(decl, local_name))

        kernel_name = self.name_prefix+"_count"

        from pyopencl.characterize import has_double_support
        render_args = dict(
            is_count_stage=True,
            kernel_name=kernel_name,
            double_support=all(
                has_double_support(dev) for dev in self.context.devices),
            debug=self.debug,
            do_not_vectorize=self.do_not_vectorize(),
            eliminate_empty_output_lists=self.eliminate_empty_output_lists,

            kernel_list_arg_decl=_get_arg_decl(kernel_list_args),
            kernel_list_arg_values=_get_arg_list(user_list_args, prefix="&"),
            user_list_arg_decl=_get_arg_decl(user_list_args),
            user_list_args=_get_arg_list(user_list_args),
            user_arg_decl_with_offset=_get_arg_decl(self.arg_decls),
            user_arg_decl_no_offset=_get_arg_decl(self.arg_decls_no_offset),
            user_args_no_offset=_get_arg_list(self.arg_decls_no_offset),
            arg_offset_adjustment=get_arg_offset_adjuster_code(self.arg_decls),

            list_names_and_dtypes=self.list_names_and_dtypes,
            count_sharing=self.count_sharing,
            name_prefix=self.name_prefix,
            generate_template=self.generate_template,
            preamble=self.preamble,

            index_type=index_ctype,
        )
        src = str(_LIST_BUILDER_TEMPLATE.render(**render_args))

        prg = cl.Program(self.context, src).build(self.options)
        knl = getattr(prg, kernel_name)

        from pyopencl.tools import get_arg_list_scalar_arg_dtypes
        scalar_arg_dtypes = get_arg_list_scalar_arg_dtypes(
            kernel_list_args+self.arg_decls)
        knl.set_scalar_arg_dtypes(scalar_arg_dtypes + [index_dtype])

        return knl
Exemplo n.º 11
0
def get_reduction_kernel(stage,
                         ctx,
                         dtype_out,
                         neutral,
                         reduce_expr,
                         arguments=None,
                         name="reduce_kernel",
                         preamble="",
                         map_exprs=None,
                         device=None,
                         options=[],
                         max_group_size=None):
    """Generate and build one stage of a multi-expression reduction kernel.

    :arg stage: 1 or 2.  For stage 1 the user arguments are parsed with
        offsets; for stage 2 one ``pyopencl_reduction_inp_<i>`` vector of
        *dtype_out* per map expression is prepended to them.
    :arg map_exprs: sequence of map expressions.  ``None`` entries are
        replaced by ``"in[i]"`` (stage 1) or
        ``"pyopencl_reduction_inp_<i>[i]"`` (stage 2).  The sequence passed
        in is not modified.
    :arg options: build options handed to ``Program.build``.
    :returns: the object produced by ``_get_reduction_source`` with
        ``program``, ``kernel`` and ``arg_types`` attached.
    :raises ValueError: if *map_exprs* is None.
    """

    if map_exprs is None:
        raise ValueError("map_exprs has to be given!")

    # Fill in the defaults on a private copy: writing them into the caller's
    # list would leak this stage's defaults into any later call that reuses
    # the same list (e.g. the other reduction stage).
    map_exprs = [
        m if m is not None
        else ("pyopencl_reduction_inp_%i[i]" % i if stage == 2 else "in[i]")
        for i, m in enumerate(map_exprs)
    ]

    from pyopencl.tools import (parse_arg_list, get_arg_list_scalar_arg_dtypes,
                                get_arg_offset_adjuster_code, VectorArg)

    arg_prep = ""
    if stage == 1 and arguments is not None:
        arguments = parse_arg_list(arguments, with_offset=True)
        arg_prep = get_arg_offset_adjuster_code(arguments)

    if stage == 2 and arguments is not None:
        arguments = parse_arg_list(arguments)
        arguments = ([
            VectorArg(dtype_out, "pyopencl_reduction_inp_%i" % i)
            for i in range(len(map_exprs))
        ] + arguments)

    inf = _get_reduction_source(ctx, dtype_to_ctype(dtype_out),
                                dtype_out.itemsize, neutral, reduce_expr,
                                map_exprs, arguments, name, preamble, arg_prep,
                                device, max_group_size)

    inf.program = cl.Program(ctx, inf.source)
    inf.program.build(options)
    inf.kernel = getattr(inf.program, name)

    inf.arg_types = arguments

    # One buffer per map expression, an int64, the user arguments, then two
    # uint32 scalars.
    inf.kernel.set_scalar_arg_dtypes(
        [
            None,
        ] * len(map_exprs) + [np.int64] +
        get_arg_list_scalar_arg_dtypes(inf.arg_types) + [np.uint32] * 2)

    return inf
Exemplo n.º 12
0
def get_float_binary_func_kernel(context, func_name, x_dtype, y_dtype,
                                 out_dtype, preamble="", name=None):
    """Return a kernel for ``z[i] = func_name((T)x[i], (T)y[i])``.

    ``T`` is ``double`` when the product of a zero of *x_dtype* and a zero
    of *y_dtype* occupies more than 4 bytes, and ``float`` otherwise.  In the
    ``double`` case the fp64 preamble is put in front of *preamble*.
    """
    promoted = np.array(0, x_dtype) * np.array(0, y_dtype)
    needs_double = promoted.itemsize > 4

    if needs_double:
        arg_type = "double"
        preamble = """
        #if __OPENCL_C_VERSION__ < 120
        #pragma OPENCL EXTENSION cl_khr_fp64: enable
        #endif
        #define PYOPENCL_DEFINE_CDOUBLE
        """ + preamble
    else:
        arg_type = "float"

    kernel_name = name
    if kernel_name is None:
        kernel_name = "%s_kernel" % func_name

    kernel_args = [
        VectorArg(out_dtype, "z", with_offset=True),
        VectorArg(x_dtype, "x", with_offset=True),
        VectorArg(y_dtype, "y", with_offset=True),
    ]
    operation = "z[i] = %s((%s)x[i], (%s)y[i])" % (
        func_name, arg_type, arg_type)

    return get_elwise_kernel(context, kernel_args, operation,
                             name=kernel_name,
                             preamble=preamble)
Exemplo n.º 13
0
def get_put_kernel(context, dtype, idx_dtype, vec_count=1):
    """Return an elementwise kernel doing ``dest<k>[gmem_dest_idx[i]] = src<k>[i]``.

    The scatter is applied to *vec_count* (dest, src) array pairs that all
    share the single index array ``gmem_dest_idx``.
    """
    type_names = {}
    type_names["idx_tp"] = dtype_to_ctype(idx_dtype)
    type_names["tp"] = dtype_to_ctype(dtype)

    vec_numbers = range(vec_count)

    # Argument order: all destinations, the index array, then all sources.
    args = [VectorArg(dtype, "dest%d" % k, with_offset=True)
            for k in vec_numbers]
    args.append(VectorArg(idx_dtype, "gmem_dest_idx", with_offset=True))
    args.extend(VectorArg(dtype, "src%d" % k, with_offset=True)
                for k in vec_numbers)

    index_load = "%(idx_tp)s dest_idx = gmem_dest_idx[i];\n" % type_names
    copy_lines = ["dest%d[dest_idx] = src%d[i];" % (k, k)
                  for k in vec_numbers]
    body = index_load + "\n".join(copy_lines)

    return get_elwise_kernel(context, args, body, name="put")
Exemplo n.º 14
0
def get_reduction_kernel(stage,
                         ctx,
                         dtype_out,
                         neutral,
                         reduce_expr,
                         map_expr=None,
                         arguments=None,
                         name="reduce_kernel",
                         preamble="",
                         device=None,
                         options=None,
                         max_group_size=None):
    """Generate and build one stage of a reduction kernel.

    When *map_expr* is None it defaults to ``"pyopencl_reduction_inp[i]"``
    for stage 2 and to ``"in[i]"`` otherwise.  For stage 2 a
    ``pyopencl_reduction_inp`` vector of *dtype_out* is prepended to the
    parsed arguments.

    :returns: a ``_ReductionInfo`` with the source, group size, built
        program, kernel and final argument list.
    :raises ValueError: if *arguments* is None.
    """

    if map_expr is None:
        map_expr = "pyopencl_reduction_inp[i]" if stage == 2 else "in[i]"

    from pyopencl.tools import (parse_arg_list, get_arg_list_scalar_arg_dtypes,
                                get_arg_offset_adjuster_code, VectorArg)

    if arguments is None:
        raise ValueError("arguments must not be None")

    parsed_args = parse_arg_list(arguments, with_offset=True)
    arg_prep = get_arg_offset_adjuster_code(parsed_args)

    if stage == 2 and parsed_args is not None:
        stage_input = VectorArg(dtype_out, "pyopencl_reduction_inp")
        parsed_args = [stage_input] + parsed_args

    out_ctype = dtype_to_ctype(dtype_out)
    source, group_size = _get_reduction_source(
        ctx, out_ctype, dtype_out.itemsize, neutral, reduce_expr, map_expr,
        parsed_args, name, preamble, arg_prep, device, max_group_size)

    program = cl.Program(ctx, source)
    program.build(options)

    kernel = getattr(program, name)

    # Leading buffer and int64, the user arguments, then the trailing
    # int64 x3, uint32 and int64 scalars.
    scalar_dtypes = [None, np.int64]
    scalar_dtypes = scalar_dtypes + get_arg_list_scalar_arg_dtypes(parsed_args)
    scalar_dtypes = scalar_dtypes + [np.int64] * 3 + [np.uint32, np.int64]
    kernel.set_scalar_arg_dtypes(scalar_dtypes)

    return _ReductionInfo(context=ctx,
                          source=source,
                          group_size=group_size,
                          program=program,
                          kernel=kernel,
                          arg_types=parsed_args)
Exemplo n.º 15
0
    def get_balls_to_leaves_kernel(self, dimensions, coord_dtype, box_id_dtype,
                                   ball_id_dtype, max_levels,
                                   stick_out_factor):
        """Render the balls-to-leaves source and wrap it in a list builder.

        :returns: a ``ListOfListsBuilder`` producing the lists
            ``ball_numbers`` and ``overlapping_leaves``.
        """
        from pyopencl.tools import dtype_to_ctype
        from boxtree import box_flags_enum

        render_vars = {
            "dimensions": dimensions,
            "dtype_to_ctype": dtype_to_ctype,
            "box_id_dtype": box_id_dtype,
            "particle_id_dtype": None,
            "ball_id_dtype": ball_id_dtype,
            "coord_dtype": coord_dtype,
            "vec_types": cl.array.vec.types,
            "max_levels": max_levels,
            "AXIS_NAMES": AXIS_NAMES,
            "box_flags_enum": box_flags_enum,
            "debug": False,
            "stick_out_factor": stick_out_factor,
        }

        logger.info("start building leaves-to-balls lookup kernel")

        from boxtree.traversal import TRAVERSAL_PREAMBLE_TEMPLATE

        template = Template(
            TRAVERSAL_PREAMBLE_TEMPLATE + BALLS_TO_LEAVES_TEMPLATE,
            strict_undefined=True)
        src = template.render(**render_vars)

        from pyopencl.tools import VectorArg, ScalarArg
        from pyopencl.algorithm import ListOfListsBuilder

        list_names_and_dtypes = [
            ("ball_numbers", ball_id_dtype),
            ("overlapping_leaves", box_id_dtype),
        ]

        arg_decls = [
            VectorArg(box_flags_enum.dtype, "box_flags"),
            VectorArg(coord_dtype, "box_centers"),
            VectorArg(box_id_dtype, "box_child_ids"),
            VectorArg(np.uint8, "box_levels"),
            ScalarArg(coord_dtype, "root_extent"),
            ScalarArg(box_id_dtype, "aligned_nboxes"),
            VectorArg(coord_dtype, "ball_radii"),
        ]
        # One coordinate array per axis in use.
        for ax in AXIS_NAMES[:dimensions]:
            arg_decls.append(VectorArg(coord_dtype, "ball_" + ax))

        # /!\ Sharing the count makes a promise that APPEND_ball_numbers will
        # always occur *before* APPEND_overlapping_leaves.
        count_sharing = {"overlapping_leaves": "ball_numbers"}

        result = ListOfListsBuilder(
            self.context,
            list_names_and_dtypes,
            str(src),
            arg_decls=arg_decls,
            name_prefix="circles_to_balls",
            count_sharing=count_sharing,
            complex_kernel=True)

        logger.info("done building leaves-to-balls lookup kernel")

        return result
Exemplo n.º 16
0
    def _get_kernel(self, dtype, src_index_dtype, map_values=False):
        """Build the ``gappy_copy_and_map`` elementwise kernel.

        The kernel takes ``input_ary``, ``output_ary`` and ``from_indices``,
        plus ``value_map`` when *map_values* is true.
        """
        from pyopencl.tools import VectorArg

        arg_specs = [
            (dtype, "input_ary"),
            (dtype, "output_ary"),
            (src_index_dtype, "from_indices"),
        ]
        if map_values:
            arg_specs.append((dtype, "value_map"))

        args = [
            VectorArg(arg_dtype, arg_name, with_offset=True)
            for arg_dtype, arg_name in arg_specs]

        from pyopencl.tools import dtype_to_ctype
        rendered = GAPPY_COPY_TPL.render(dtype=dtype,
                                         dtype_to_ctype=dtype_to_ctype,
                                         map_values=map_values)

        from pyopencl.elementwise import ElementwiseKernel
        return ElementwiseKernel(self.context, args, str(rendered),
                                 name="gappy_copy_and_map")
Exemplo n.º 17
0
def get_arange_kernel(context, dtype):
    """Return an elementwise kernel computing ``z[i] = start + <i>*step``.

    ``<i>`` is the index converted to *dtype*: through ``<name>_fromreal``
    for complex dtypes and through a C cast otherwise.
    """
    is_complex = dtype.kind == "c"
    if is_complex:
        # NOTE(review): the product below uses the C ``*`` operator; confirm
        # that this is the intended multiplication for the complex type.
        index_expr = "%s_fromreal(i)" % complex_dtype_to_name(dtype)
    else:
        index_expr = "(%s) i" % dtype_to_ctype(dtype)

    kernel_args = [
        VectorArg(dtype, "z", with_offset=True),
        ScalarArg(dtype, "start"),
        ScalarArg(dtype, "step"),
    ]
    operation = "z[i] = start + %s*step" % index_expr

    return get_elwise_kernel(context, kernel_args, operation, name="arange")
Exemplo n.º 18
0
def get_arange_kernel(context, dtype):
    """Return an elementwise kernel filling ``z`` with ``start + i*step``.

    Complex dtypes spell the sum and product with the ``<name>_add`` and
    ``<name>_rmul`` device functions; other dtypes cast the index to the C
    type of *dtype* and use plain arithmetic.
    """
    if dtype.kind != "c":
        expr = "start + ((%s) i)*step" % dtype_to_ctype(dtype)
    else:
        root = complex_dtype_to_name(dtype)
        expr = "{root}_add(start, {root}_rmul(i, step))".format(root=root)

    kernel_args = [
        VectorArg(dtype, "z", with_offset=True),
        ScalarArg(dtype, "start"),
        ScalarArg(dtype, "step"),
    ]

    return get_elwise_kernel(context, kernel_args, "z[i] = " + expr,
                             name="arange")
Exemplo n.º 19
0
    def get_filter_target_lists_in_user_order_kernel(self, particle_id_dtype,
                                                     user_order_flags_dtype):
        """Return a list builder that filters each box's targets by flag.

        For box ``i`` the generated code walks the targets
        ``box_target_starts[i] .. +box_target_counts_nonchild[i]`` and
        appends ``user_target_ids[j]`` to ``filt_tgt_list`` whenever
        ``user_order_flags`` is set for that id.
        """
        from pyopencl.tools import VectorArg, dtype_to_ctype
        from pyopencl.algorithm import ListOfListsBuilder
        from mako.template import Template

        generate_template = Template("""//CL//
            typedef ${dtype_to_ctype(particle_id_dtype)} particle_id_t;

            void generate(LIST_ARG_DECL USER_ARG_DECL index_type i)
            {
                particle_id_t b_t_start = box_target_starts[i];
                particle_id_t b_t_count = box_target_counts_nonchild[i];

                for (particle_id_t j = b_t_start; j < b_t_start+b_t_count; ++j)
                {
                    particle_id_t user_target_id = user_target_ids[j];
                    if (user_order_flags[user_target_id])
                    {
                        APPEND_filt_tgt_list(user_target_id);
                    }
                }
            }
            """,
                                     strict_undefined=True)
        generate_src = generate_template.render(
            dtype_to_ctype=dtype_to_ctype,
            particle_id_dtype=particle_id_dtype)

        arg_decls = [VectorArg(user_order_flags_dtype, "user_order_flags")]
        for arg_name in ("user_target_ids",
                         "box_target_starts",
                         "box_target_counts_nonchild"):
            arg_decls.append(VectorArg(particle_id_dtype, arg_name))

        return ListOfListsBuilder(
            self.context, [("filt_tgt_list", particle_id_dtype)],
            generate_src,
            arg_decls=arg_decls)
Exemplo n.º 20
0
def get_take_put_kernel(context, dtype, idx_dtype, with_offsets, vec_count=1):
    """Return a kernel doing ``dest<k>[gmem_dest_idx[i]] = src<k>[gmem_src_idx[i]]``.

    With *with_offsets* true each pair additionally takes a scalar
    ``offset<k>`` that is added to the source index.
    """
    type_names = {}
    type_names["idx_tp"] = dtype_to_ctype(idx_dtype)
    type_names["tp"] = dtype_to_ctype(dtype)

    vec_numbers = range(vec_count)

    # Argument order: destinations, both index arrays, sources, offsets.
    # The destination arrays are declared without with_offset.
    args = [VectorArg(dtype, "dest%d" % k) for k in vec_numbers]
    args.append(VectorArg(idx_dtype, "gmem_dest_idx", with_offset=True))
    args.append(VectorArg(idx_dtype, "gmem_src_idx", with_offset=True))
    args.extend(VectorArg(dtype, "src%d" % k, with_offset=True)
                for k in vec_numbers)
    if with_offsets:
        args.extend(ScalarArg(idx_dtype, "offset%d" % k)
                    for k in vec_numbers)

    if with_offsets:
        copy_fmt = "dest%(k)d[dest_idx] = src%(k)d[src_idx+offset%(k)d];"
    else:
        copy_fmt = "dest%(k)d[dest_idx] = src%(k)d[src_idx];"
    copy_lines = [copy_fmt % {"k": k} for k in vec_numbers]

    index_loads = ("%(idx_tp)s src_idx = gmem_src_idx[i];\n"
                   "%(idx_tp)s dest_idx = gmem_dest_idx[i];\n" % type_names)
    body = index_loads + "\n".join(copy_lines)

    return get_elwise_kernel(context, args, body,
                             preamble=dtype_to_c_struct(
                                 context.devices[0], dtype),
                             name="take_put")
Exemplo n.º 21
0
    def _get_kernel(self, dtype, src_index_dtype, dst_index_dtype,
                    have_src_indices, have_dst_indices, map_values):
        """Build the ``gappy_copy_and_map`` elementwise kernel.

        ``input_ary`` and ``output_ary`` are always arguments;
        ``from_indices``, ``to_indices`` and ``value_map`` are appended, in
        that order, when the matching flag is true.
        """
        from pyopencl.tools import VectorArg

        arg_specs = [(dtype, "input_ary"), (dtype, "output_ary")]
        optional_specs = (
            (have_src_indices, src_index_dtype, "from_indices"),
            (have_dst_indices, dst_index_dtype, "to_indices"),
            (map_values, dtype, "value_map"),
        )
        for enabled, arg_dtype, arg_name in optional_specs:
            if enabled:
                arg_specs.append((arg_dtype, arg_name))

        args = [
            VectorArg(arg_dtype, arg_name, with_offset=True)
            for arg_dtype, arg_name in arg_specs]

        from pyopencl.tools import dtype_to_ctype
        rendered = GAPPY_COPY_TPL.render(dtype=dtype,
                                         dtype_to_ctype=dtype_to_ctype,
                                         from_dtype=src_index_dtype,
                                         to_dtype=dst_index_dtype,
                                         from_indices=have_src_indices,
                                         to_indices=have_dst_indices,
                                         map_values=map_values)

        from pyopencl.elementwise import ElementwiseKernel
        return ElementwiseKernel(self.context, args, str(rendered),
                                 preamble=dtype_to_c_struct(
                                     self.context.devices[0], dtype),
                                 name="gappy_copy_and_map")
Exemplo n.º 22
0
def get_reduction_kernel(stage,
                         ctx,
                         dtype_out,
                         neutral,
                         reduce_expr,
                         map_expr=None,
                         arguments=None,
                         name="reduce_kernel",
                         preamble="",
                         device=None,
                         options=[],
                         max_group_size=None):
    """Generate and build one stage of a reduction kernel.

    When *map_expr* is None it defaults to ``"pyopencl_reduction_inp[i]"``
    for stage 2 and to ``"in[i]"`` otherwise.  Stage 1 parses *arguments*
    with offsets; stage 2 parses them without and prepends a
    ``pyopencl_reduction_inp`` vector of *dtype_out*.

    :returns: the object produced by ``_get_reduction_source`` with
        ``program``, ``kernel`` and ``arg_types`` attached.
    """

    if map_expr is None:
        map_expr = "pyopencl_reduction_inp[i]" if stage == 2 else "in[i]"

    from pyopencl.tools import (parse_arg_list, get_arg_list_scalar_arg_dtypes,
                                get_arg_offset_adjuster_code, VectorArg)

    arg_prep = ""
    have_arguments = arguments is not None

    if have_arguments and stage == 1:
        arguments = parse_arg_list(arguments, with_offset=True)
        arg_prep = get_arg_offset_adjuster_code(arguments)

    if have_arguments and stage == 2:
        stage_input = VectorArg(dtype_out, "pyopencl_reduction_inp")
        arguments = [stage_input] + parse_arg_list(arguments)

    out_ctype = dtype_to_ctype(dtype_out)
    inf = _get_reduction_source(ctx, out_ctype, dtype_out.itemsize, neutral,
                                reduce_expr, map_expr, arguments, name,
                                preamble, arg_prep, device, max_group_size)

    program = cl.Program(ctx, inf.source)
    inf.program = program
    program.build(options)
    inf.kernel = getattr(program, name)

    inf.arg_types = arguments

    # Leading buffer and int64, the user arguments, then the trailing
    # int64 x3, uint32 and int64 scalars.
    scalar_dtypes = [None, np.int64]
    scalar_dtypes = (scalar_dtypes
                     + get_arg_list_scalar_arg_dtypes(inf.arg_types))
    scalar_dtypes = scalar_dtypes + [np.int64] * 3 + [np.uint32, np.int64]
    inf.kernel.set_scalar_arg_dtypes(scalar_dtypes)

    return inf
Exemplo n.º 23
0
def extract_extra_args_types_values(extra_args):
    """Split ``(name, value)`` pairs into kernel argument types and values.

    :arg extra_args: an iterable of ``(name, value)`` pairs. Each value must
        be either a ``cl.array.Array`` (declared as a
        :class:`pyopencl.tools.VectorArg` without offset) or a numpy scalar
        (``np.generic``, declared as a :class:`pyopencl.tools.ScalarArg`).
    :returns: a tuple ``(types, values)`` where *types* is a tuple of argument
        declarations and *values* is a list of the corresponding values, both
        in input order.
    :raises RuntimeError: if a value is neither an array nor a numpy scalar.
    """
    from pyopencl.tools import VectorArg, ScalarArg

    extra_args_types = []
    extra_args_values = []
    for name, val in extra_args:
        if isinstance(val, cl.array.Array):
            extra_args_types.append(VectorArg(val.dtype, name, with_offset=False))
        elif isinstance(val, np.generic):
            extra_args_types.append(ScalarArg(val.dtype, name))
        else:
            # Argument names are strings, so the message must use %s; the
            # former %d raised TypeError here and masked the RuntimeError.
            raise RuntimeError("argument '%s' not understood" % name)
        extra_args_values.append(val)

    return tuple(extra_args_types), extra_args_values
Exemplo n.º 24
0
    def get_kernel_info(self, dimensions, particle_id_dtype, box_id_dtype,
            coord_dtype, box_level_dtype, max_levels,
            sources_are_targets, sources_have_extent, targets_have_extent,
            stick_out_factor):
        """Build the kernels used for traversal building.

        All parameters except *box_level_dtype* are passed as variables to the
        source templates rendered below; several of them additionally
        determine argument dtypes or which lists get built.

        :arg box_id_dtype: also used as the dtype of every generated list and
            of the box-id-valued kernel arguments.
        :arg coord_dtype: also used for the ``box_centers`` and
            ``root_extent`` kernel arguments.
        :arg box_level_dtype: bound to the ``box_level_t`` type alias of the
            level-start extractor kernel.
        :arg sources_are_targets: if false, a separate ``target_boxes`` list
            is built alongside the source lists.
        :arg sources_have_extent: together with *targets_have_extent*, decides
            whether the ``sep_close_smaller`` / ``sep_close_bigger`` lists are
            built.
        :returns: a ``_KernelInfo`` holding
            ``sources_parents_and_targets_builder``,
            ``level_start_box_nrs_extractor`` and one ``<list_name>_builder``
            for each of ``colleagues``, ``neighbor_source_boxes``,
            ``sep_siblings``, ``sep_smaller`` and ``sep_bigger``.
        """

        logging.info("building traversal build kernels")

        # Handed both to the templates and to every ListOfListsBuilder.
        debug = False

        from pyopencl.tools import dtype_to_ctype
        from boxtree.tree import box_flags_enum
        # Variables made available to every template rendered in this method.
        render_vars = dict(
                dimensions=dimensions,
                dtype_to_ctype=dtype_to_ctype,
                particle_id_dtype=particle_id_dtype,
                box_id_dtype=box_id_dtype,
                box_flags_enum=box_flags_enum,
                coord_dtype=coord_dtype,
                vec_types=cl.array.vec.types,
                max_levels=max_levels,
                AXIS_NAMES=AXIS_NAMES,
                debug=debug,
                sources_are_targets=sources_are_targets,
                sources_have_extent=sources_have_extent,
                targets_have_extent=targets_have_extent,
                stick_out_factor=stick_out_factor,
                )
        from pyopencl.algorithm import ListOfListsBuilder
        from pyopencl.tools import VectorArg, ScalarArg

        # Maps kernel-info field name -> built kernel/builder object.
        result = {}

        # {{{ source boxes, their parents, target boxes

        src = Template(
                TRAVERSAL_PREAMBLE_TEMPLATE
                + SOURCES_PARENTS_AND_TARGETS_TEMPLATE,
                strict_undefined=True).render(**render_vars)

        # "target_boxes" is only a separate output list when sources and
        # targets are distinct.
        result["sources_parents_and_targets_builder"] = \
                ListOfListsBuilder(self.context,
                        [
                            ("source_parent_boxes", box_id_dtype),
                            ("source_boxes", box_id_dtype),
                            ("target_or_target_parent_boxes", box_id_dtype)
                            ] + (
                                [("target_boxes", box_id_dtype)]
                                if not sources_are_targets
                                else []),
                        str(src),
                        arg_decls=[
                            VectorArg(box_flags_enum.dtype, "box_flags"),
                            ],
                        debug=debug,
                        name_prefix="sources_parents_and_targets")

        result["level_start_box_nrs_extractor"] = \
                LEVEL_START_BOX_NR_EXTRACTOR_TEMPLATE.build(self.context,
                    type_aliases=(
                        ("box_id_t", box_id_dtype),
                        ("box_level_t", box_level_dtype),
                        ),
                    )

        # }}}

        # {{{ build list N builders

        # Arguments shared by every list builder created in the loop below.
        # NOTE(review): "box_levels" is declared as np.uint8 here rather than
        # box_level_dtype -- confirm this is intentional.
        base_args = [
                VectorArg(coord_dtype, "box_centers"),
                ScalarArg(coord_dtype, "root_extent"),
                VectorArg(np.uint8, "box_levels"),
                ScalarArg(box_id_dtype, "aligned_nboxes"),
                VectorArg(box_id_dtype, "box_child_ids"),
                VectorArg(box_flags_enum.dtype, "box_flags"),
                ]

        # Each entry: (primary list name, source template, kernel arguments in
        # addition to base_args, names of additional output lists).
        for list_name, template, extra_args, extra_lists in [
                ("colleagues", COLLEAGUES_TEMPLATE, [], []),
                ("neighbor_source_boxes", NEIGBHOR_SOURCE_BOXES_TEMPLATE,
                        [
                            VectorArg(box_id_dtype, "target_boxes"),
                            ], []),
                ("sep_siblings", SEP_SIBLINGS_TEMPLATE,
                        [
                            VectorArg(box_id_dtype, "target_or_target_parent_boxes"),
                            VectorArg(box_id_dtype, "box_parent_ids"),
                            VectorArg(box_id_dtype, "colleagues_starts"),
                            VectorArg(box_id_dtype, "colleagues_list"),
                            ], []),
                ("sep_smaller", SEP_SMALLER_TEMPLATE,
                        [
                            VectorArg(box_id_dtype, "target_boxes"),
                            VectorArg(box_id_dtype, "colleagues_starts"),
                            VectorArg(box_id_dtype, "colleagues_list"),
                            ],
                            ["sep_close_smaller"]
                            if sources_have_extent or targets_have_extent
                            else []),
                ("sep_bigger", SEP_BIGGER_TEMPLATE,
                        [
                            VectorArg(box_id_dtype, "target_or_target_parent_boxes"),
                            VectorArg(box_id_dtype, "box_parent_ids"),
                            VectorArg(box_id_dtype, "colleagues_starts"),
                            VectorArg(box_id_dtype, "colleagues_list"),
                            ],
                            ["sep_close_bigger"]
                            if sources_have_extent or targets_have_extent
                            else []),
                ]:
            src = Template(
                    TRAVERSAL_PREAMBLE_TEMPLATE
                    + HELPER_FUNCTION_TEMPLATE
                    + template,
                    strict_undefined=True).render(**render_vars)

            # The builder is stored under "<list_name>_builder"; the primary
            # list comes first, followed by any extra lists.
            result[list_name+"_builder"] = ListOfListsBuilder(self.context,
                    [(list_name, box_id_dtype)]
                    + [(extra_list_name, box_id_dtype)
                        for extra_list_name in extra_lists],
                    str(src),
                    arg_decls=base_args + extra_args,
                    debug=debug, name_prefix=list_name,
                    complex_kernel=True)

        # }}}

        logging.info("traversal build kernels built")

        return _KernelInfo(**result)
Exemplo n.º 25
0
    def initialize(cls):
        """Compile the OpenCL program and construct the two sort kernels.

        Sets three class attributes:

        * ``program``: ``cls.KERNEL`` (passed through ``F``) built for
          ``cl_ctx``.
        * ``longitudinal_sort_kernel``: a ``RadixSort`` keyed on
          ``floor(theta[i]*inv_slice_len)``.
        * ``longitudinal_traverse_sort_kernel``: a ``RadixSort`` keyed on the
          ``sort_fun`` OpenCL helper, which is injected into the scan kernel
          preamble through a ``GenericScanKernel`` subclass.
        """
        cls.program = cl.Program(cl_ctx, F(cls.KERNEL)).build()

        # Arrays that are permuted by both sorts, in argument order.
        sorted_array_names = ("x", "px", "y", "py", "theta", "gamma")

        def make_array_args():
            # Fresh argument objects for each RadixSort instance.
            return [VectorArg(cl_ftype, array_name)
                    for array_name in sorted_array_names]

        cls.longitudinal_sort_kernel = RadixSort(
            cl_ctx,
            make_array_args() + [ScalarArg(cl_ftype, "inv_slice_len")],
            key_expr="(int) floor(theta[i]*inv_slice_len)",
            sort_arg_names=list(sorted_array_names),
            key_dtype=np.int32)

        # OpenCL source of the key function used by the traverse sort.
        # NOTE(review): xbin/ybin multiply the normalized coordinate by
        # inv_traverse_len a second time rather than by bins -- confirm that
        # this is the intended binning.
        sort_fun_source = '''
                            int sort_fun(FLOAT_TYPE x, 
                                         FLOAT_TYPE y, 
                                         FLOAT_TYPE theta, 
                                         FLOAT_TYPE inv_slice_len, 
                                         FLOAT_TYPE inv_traverse_len,
                                         int bins) {
                                         
                                         FLOAT_TYPE xnorm = 0.5 + (inv_traverse_len*x);
                                         FLOAT_TYPE ynorm = 0.5 + (inv_traverse_len*y);
                                         int xbin = (int) floor(xnorm * inv_traverse_len);
                                         int ybin = (int) floor(ynorm * inv_traverse_len);
                                         int zbin = (int) floor(theta*inv_slice_len);

                                         if ((xbin < 0) || (xbin >= bins) || (ybin < 0) || (ybin >= bins)) {
                                            xbin = 0;
                                            ybin = 0;

                                         }

                                         return xbin+bins*(ybin+bins*zbin);
                            }
                           '''

        class LongitudinalTraverseScanKernel(GenericScanKernel):
            """Scan kernel whose preamble starts with the ``sort_fun`` helper."""

            def __init__(self, *argl, **argd):
                """Prefix ``argd['preamble']`` with ``sort_fun`` and delegate.

                The caller's keyword dictionary is copied, not modified.
                """
                patched_kwargs = dict(argd)
                patched_kwargs['preamble'] = F(
                    sort_fun_source + argd['preamble'])
                super().__init__(*argl, **patched_kwargs)

        traverse_scalar_args = [
            ScalarArg(cl_ftype, "inv_slice_len"),
            ScalarArg(cl_ftype, "inv_traverse_len"),
            ScalarArg(np.int32, "bins"),
        ]
        cls.longitudinal_traverse_sort_kernel = RadixSort(
            cl_ctx,
            make_array_args() + traverse_scalar_args,
            key_expr="sort_fun(x[i],y[i],theta[i], inv_slice_len, inv_traverse_len, bins)",
            sort_arg_names=list(sorted_array_names),
            scan_kernel=LongitudinalTraverseScanKernel,
            key_dtype=np.int32)
Exemplo n.º 26
0
 def __init__(self, dtype, name):
     """Initialize through ``VectorArg.__init__`` with ``with_offset=True``.

     :arg dtype: forwarded unchanged to ``VectorArg.__init__``.
     :arg name: forwarded unchanged to ``VectorArg.__init__``.
     """
     VectorArg.__init__(self, dtype, name, with_offset=True)
    def get_write_kernel(self, index_dtype):
        """Render, build and return the ``<name_prefix>_write`` kernel.

        For every entry of ``self.list_names_and_dtypes`` a ``plb_<name>_list``
        vector argument is declared. Lists that are not in
        ``self.count_sharing`` additionally get a ``plb_<name>_start_index``
        kernel argument and a ``plb_<name>_index`` pointer in the user-facing
        argument list; lists that share a count only pass the list itself.

        :arg index_dtype: dtype used for the start-index arrays, for the
            ``index_type`` of the rendered source and for the trailing scalar
            kernel argument.
        :returns: the built kernel with its scalar argument dtypes set.
        """
        index_ctype = dtype_to_ctype(index_dtype)

        from pyopencl.tools import VectorArg, OtherArg
        from pyopencl.tools import get_arg_list_scalar_arg_dtypes
        from pyopencl.characterize import has_double_support

        kernel_list_args = []
        user_list_args = []
        # Pieces of the C argument-value list; joined once after the loop.
        value_fragments = []

        for name, dtype in self.list_names_and_dtypes:
            list_name = "plb_%s_list" % name
            list_arg = VectorArg(dtype, list_name)
            kernel_list_args.append(list_arg)
            user_list_args.append(list_arg)

            if name in self.count_sharing:
                # Count-sharing lists have no index of their own.
                value_fragments.append("%s, " % list_name)
            else:
                index_name = "plb_%s_index" % name
                kernel_list_args.append(
                    VectorArg(index_dtype, "plb_%s_start_index" % name))
                user_list_args.append(
                    OtherArg("%s *%s" % (index_ctype, index_name),
                             index_name))
                value_fragments.append(
                    "%s, &%s, " % (list_name, index_name))

        kernel_list_arg_values = "".join(value_fragments)
        kernel_name = self.name_prefix + "_write"

        every_device_has_doubles = all(
            has_double_support(dev) for dev in self.context.devices)

        rendered = _LIST_BUILDER_TEMPLATE.render(
            is_count_stage=False,
            kernel_name=kernel_name,
            double_support=every_device_has_doubles,
            debug=self.debug,
            do_not_vectorize=self.do_not_vectorize(),
            kernel_list_arg_decl=_get_arg_decl(kernel_list_args),
            kernel_list_arg_values=kernel_list_arg_values,
            user_list_arg_decl=_get_arg_decl(user_list_args),
            user_list_args=_get_arg_list(user_list_args),
            user_arg_decl=_get_arg_decl(self.arg_decls),
            user_args=_get_arg_list(self.arg_decls),
            list_names_and_dtypes=self.list_names_and_dtypes,
            count_sharing=self.count_sharing,
            name_prefix=self.name_prefix,
            generate_template=self.generate_template,
            preamble=self.preamble,
            index_type=index_ctype,
        )

        program = cl.Program(self.context, str(rendered)).build(self.options)
        write_kernel = getattr(program, kernel_name)

        # Scalar layout: list arguments, then user arguments, then one
        # trailing index-typed scalar.
        scalar_dtypes = get_arg_list_scalar_arg_dtypes(
            kernel_list_args + self.arg_decls) + [index_dtype]
        write_kernel.set_scalar_arg_dtypes(scalar_dtypes)

        return write_kernel
    def __init__(self,
                 context,
                 arguments,
                 key_expr,
                 sort_arg_names,
                 bits_at_a_time=2,
                 index_dtype=np.int32,
                 key_dtype=np.uint32,
                 options=[]):
        """
        :arg context: the context whose first device is used to set up the
            scan type and in which the scan kernel is created.
        :arg arguments: argument declarations in any form accepted by
            :func:`pyopencl.tools.parse_arg_list`. All types used here must
            be known to PyOpenCL.
            (see :func:`pyopencl.tools.get_or_register_dtype`).
        :arg key_expr: An integer-valued C expression returning the
            key based on which the sort is performed. The array index
            for which the key is to be computed is available as `i`.
            The expression may refer to any of the *arguments*.
        :arg sort_arg_names: A list of argument names whose corresponding
            array arguments will be sorted according to *key_expr*. For each
            of them a ``sorted_<name>`` array argument is added to the scan
            kernel.
        :arg bits_at_a_time: converted with ``int()`` and stored as
            ``self.bits``.
        :arg index_dtype: dtype of indices, stored as a :class:`numpy.dtype`.
        :arg key_dtype: dtype of the key, stored as a :class:`numpy.dtype`.
        :arg options: options forwarded to the scan kernel.
        """
        from pyopencl.tools import parse_arg_list, VectorArg, ScalarArg
        from pyopencl.scan import GenericScanKernel

        # {{{ arg processing

        self.arguments = parse_arg_list(arguments)
        self.sort_arg_names = sort_arg_names
        self.bits = int(bits_at_a_time)
        self.index_dtype = np.dtype(index_dtype)
        self.key_dtype = np.dtype(key_dtype)
        self.options = options

        # }}}

        # {{{ kernel creation

        scan_ctype, scan_dtype, scan_t_cdecl = _make_sort_scan_type(
            context.devices[0], self.bits, self.index_dtype)

        # User arguments, one output array per sorted argument, then the
        # scalar "base_bit".
        sorted_output_args = [
            VectorArg(user_arg.dtype, "sorted_" + user_arg.name)
            for user_arg in self.arguments
            if user_arg.name in sort_arg_names
        ]
        scan_arguments = (
            list(self.arguments)
            + sorted_output_args
            + [ScalarArg(np.int32, "base_bit")])

        def get_count_branch(known_bits):
            # Recursively builds a nested C conditional on "mnr", fixing one
            # more bit per level until all self.bits bits are known.
            bits_left = self.bits - len(known_bits)
            if bits_left == 0:
                return "s.c%s" % known_bits

            boundary_mnr = int(known_bits + "1" + "0" * (bits_left - 1), 2)
            below = get_count_branch(known_bits + "0")
            at_or_above = get_count_branch(known_bits + "1")
            return "((mnr < %s) ? %s : %s)" % (boundary_mnr, below, at_or_above)

        codegen_args = {
            "bits": self.bits,
            "key_ctype": dtype_to_ctype(self.key_dtype),
            "key_expr": key_expr,
            "index_ctype": dtype_to_ctype(self.index_dtype),
            "index_type_max": np.iinfo(self.index_dtype).max,
            "padded_bin": _padded_bin,
            "scan_ctype": scan_ctype,
            "sort_arg_names": sort_arg_names,
            "get_count_branch": get_count_branch,
        }

        preamble = scan_t_cdecl + RADIX_SORT_PREAMBLE_TPL.render(
            **codegen_args)
        scan_preamble = preamble + RADIX_SORT_SCAN_PREAMBLE_TPL.render(
            **codegen_args)
        output_statement = RADIX_SORT_OUTPUT_STMT_TPL.render(**codegen_args)

        self.scan_kernel = GenericScanKernel(
            context,
            scan_dtype,
            arguments=scan_arguments,
            input_expr="scan_t_from_value(%s, base_bit, i)" % key_expr,
            scan_expr="scan_t_add(a, b, across_seg_boundary)",
            neutral="scan_t_neutral()",
            output_statement=output_statement,
            preamble=scan_preamble,
            options=self.options)

        # NOTE(review): despite the attribute name, this keeps the index of
        # the *last* VectorArg (and leaves the attribute unset when there is
        # none), exactly as the original loop without `break` did.
        vector_arg_positions = [
            position for position, user_arg in enumerate(self.arguments)
            if isinstance(user_arg, VectorArg)
        ]
        if vector_arg_positions:
            self.first_array_arg_idx = vector_arg_positions[-1]
Exemplo n.º 29
0
    def __init__(self, context, list_names_and_dtypes, generate_template,
            arg_decls, count_sharing=None, devices=None,
            name_prefix="plb_build_list", options=[], preamble="",
            debug=False, complex_kernel=False,
            eliminate_empty_output_lists=[]):
        """
        :arg context: A :class:`pyopencl.Context`.
        :arg list_names_and_dtypes: a list of `(name, dtype)` tuples
            indicating the lists to be built.
        :arg generate_template: a snippet of C as described below
        :arg arg_decls: Argument declarations in a form accepted by
            :func:`pyopencl.tools.parse_arg_list`, e.g. a string of
            comma-separated C argument declarations.
        :arg count_sharing: A mapping consisting of `(child, mother)`
            indicating that `mother` and `child` will always have the
            same number of indices, and the `APPEND` to `mother`
            will always happen *before* the `APPEND` to the child.
            Defaults to an empty mapping.
        :arg devices: the devices to use; defaults to ``context.devices``.
        :arg name_prefix: the name prefix to use for the compiled kernels
        :arg options: OpenCL compilation options for kernels using
            *generate_template*.
        :arg preamble: stored as ``self.preamble`` for use when the kernel
            source is generated.
        :arg debug: stored as ``self.debug`` for use when the kernel source
            is generated.
        :arg complex_kernel: If `True`, prevents vectorization on CPUs.
        :arg eliminate_empty_output_lists: A Python list of list names
            for which the empty output lists are eliminated. For backward
            compatibility, `True` selects every list in
            *list_names_and_dtypes* and `False` selects none.

        :raises ValueError: if *eliminate_empty_output_lists* names a list
            that is not in *list_names_and_dtypes*.

        *generate_template* may use the following C macros/identifiers:

        * `index_type`: expands to C identifier for the index type used
          for the calculation
        * `USER_ARG_DECL`: expands to the C declarator for `arg_decls`
        * `USER_ARGS`: a list of C argument values corresponding to
          `user_arg_decl`
        * `LIST_ARG_DECL`: expands to a C argument list representing the
          data for the output lists. These are prefixed with
          `"plb_"` so as to not interfere with user-provided names.
        * `LIST_ARGS`: a list of C argument values corresponding to
          `LIST_ARG_DECL`
        * `APPEND_name(entry)`: inserts `entry` into the list `name`.
          *entry* must be a valid C expression of the correct type.

        All argument-list related macros have a trailing comma included
        if they are non-empty.

        *generate_template* must supply a function:

        .. code-block:: c

            void generate(USER_ARG_DECL LIST_ARG_DECL index_type i)
            {
                APPEND_mylist(5);
            }

        Internally, the `kernel_template` is expanded (at least) twice. Once,
        for a 'counting' stage where the size of all the lists is determined,
        and a second time, for a 'generation' stage where the lists are
        actually filled. A `generate` function that has side effects beyond
        calling `append` is therefore ill-formed.

        .. versionchanged:: 2018.1

            Change *eliminate_empty_output_lists* argument type from `bool` to
            `list`.
        """

        if devices is None:
            devices = context.devices

        if count_sharing is None:
            count_sharing = {}

        self.context = context
        self.devices = devices

        self.list_names_and_dtypes = list_names_and_dtypes
        self.generate_template = generate_template

        from pyopencl.tools import parse_arg_list
        self.arg_decls = parse_arg_list(arg_decls)

        # To match with the signature of the user-supplied generate(), arguments
        # can't appear to have offsets.
        arg_decls_no_offset = []
        from pyopencl.tools import VectorArg
        for arg in self.arg_decls:
            # Re-create offset-carrying vector arguments without the offset;
            # every other argument is kept as is.
            if isinstance(arg, VectorArg) and arg.with_offset:
                arg = VectorArg(arg.dtype, arg.name)
            arg_decls_no_offset.append(arg)
        self.arg_decls_no_offset = arg_decls_no_offset

        self.count_sharing = count_sharing

        self.name_prefix = name_prefix
        self.preamble = preamble
        self.options = options

        self.debug = debug

        self.complex_kernel = complex_kernel

        # Legacy boolean form: True means "all lists" ...
        if eliminate_empty_output_lists is True:
            eliminate_empty_output_lists = \
                    [name for name, _ in self.list_names_and_dtypes]

        # ... and False means "no lists".
        if eliminate_empty_output_lists is False:
            eliminate_empty_output_lists = []

        self.eliminate_empty_output_lists = eliminate_empty_output_lists
        # Reject names that do not refer to one of the lists being built.
        for list_name in self.eliminate_empty_output_lists:
            if not any(list_name == name for name, _ in self.list_names_and_dtypes):
                raise ValueError(
                    "invalid list name '%s' in eliminate_empty_output_lists"
                    % list_name)