def get_linear_combination_kernel(summand_descriptors, dtype_z):
    """Build a kernel computing ``z[i] = a0*x0[i] + a1*x1[i] + ...``.

    This routine has not been ported: it raises
    :exc:`NotImplementedError` before doing anything else, so none of the
    code following the ``raise`` is ever executed. That code is retained
    only as the starting point for the port (it still emits
    ``texture <...>`` declarations and uses ``get_texref``).

    :arg summand_descriptors: iterable of
        ``(is_gpu_scalar, scalar_dtype, vector_dtype)`` triples, one per
        summand.
    :arg dtype_z: dtype of the output vector ``z``.
    :raises NotImplementedError: always.
    """
    # TODO: Port this!
    raise NotImplementedError

    # ---- unreachable until the port is done ----
    from pyopencl.tools import dtype_to_ctype
    from pyopencl.elementwise import \
        VectorArg, ScalarArg, get_elwise_module

    kernel_args = []
    texture_decls = []
    fetch_stmts = []
    terms = []
    texture_names = []

    for idx, descriptor in enumerate(summand_descriptors):
        is_gpu_scalar, scalar_dtype, vector_dtype = descriptor

        if not is_gpu_scalar:
            # Host-side scalar: passed directly as a kernel argument.
            kernel_args.append(ScalarArg(scalar_dtype, "a%d" % idx))
            kernel_args.append(VectorArg(vector_dtype, "x%d" % idx))
        else:
            # Device-side scalar: fetched through a texture reference.
            texture_decls.append(
                "texture <%s, 1, cudaReadModeElementType> tex_a%d;"
                % (dtype_to_ctype(scalar_dtype, with_fp_tex_hack=True), idx))
            kernel_args.append(VectorArg(vector_dtype, "x%d" % idx))
            texture_names.append("tex_a%d" % idx)
            fetch_stmts.append(
                "%s a%d = fp_tex1Dfetch(tex_a%d, 0)"
                % (dtype_to_ctype(scalar_dtype), idx, idx))

        terms.append("a%d*x%d[i]" % (idx, idx))

    kernel_args.append(VectorArg(dtype_z, "z"))
    kernel_args.append(ScalarArg(np.uintp, "n"))

    operation = "z[i] = " + " + ".join(terms)
    mod = get_elwise_module(
        kernel_args,
        operation,
        "linear_combination",
        preamble="\n".join(texture_decls),
        loop_prep=";\n".join(fetch_stmts))

    func = mod.get_function("linear_combination")
    tex_src = [mod.get_texref(tex_name) for tex_name in texture_names]
    struct_format = "".join(arg.struct_char for arg in kernel_args)
    func.prepare(struct_format, (1, 1, 1), texrefs=tex_src)

    return func, tex_src
def get_array_scalar_binop_kernel(context, operator, dtype_res, dtype_a,
                                  dtype_b):
    """Return an elementwise kernel for ``out[i] = a[i] <operator> b``.

    :arg context: passed through to :func:`get_elwise_kernel`.
    :arg operator: text spliced between ``a[i]`` and ``b`` in the
        generated operation.
    :arg dtype_res: dtype of the ``out`` vector argument.
    :arg dtype_a: dtype of the ``a`` vector argument.
    :arg dtype_b: dtype of the scalar argument ``b``.
    :returns: whatever :func:`get_elwise_kernel` returns for the kernel
        named ``scalar_binop_kernel``.
    """
    kernel_args = [
        VectorArg(dtype_res, "out", with_offset=True),
        VectorArg(dtype_a, "a", with_offset=True),
        ScalarArg(dtype_b, "b"),
    ]
    operation = "out[i] = a[i] %s b" % operator

    return get_elwise_kernel(
        context, kernel_args, operation, name="scalar_binop_kernel")
def get_kernels(self, key_dtype, value_dtype, starts_dtype):
    """Build the three kernels used for key/value sorting.

    * ``by_target_sorter``: a :class:`RadixSort` over the ``values`` and
      ``keys`` arrays, keyed by ``keys[i]``.
    * ``start_finder``: an elementwise kernel (``find_starts``) that, for
      every index ``i`` at which the sorted key differs from its
      predecessor (or ``i == 0``), stores ``i`` into
      ``key_group_starts[key]``.
    * ``bound_propagation_scan``: a ``min`` scan over ``starts`` that
      walks the array through the index expression ``starts[nkeys-i]``.

    :arg key_dtype: dtype of the ``keys`` array and of the ``nkeys``
        scalar.
    :arg value_dtype: dtype of the ``values`` array.
    :arg starts_dtype: dtype of the ``starts`` array.
    :returns: a ``_KernelInfo`` with the fields ``by_target_sorter``,
        ``start_finder`` and ``bound_propagation_scan``.
    """
    from pyopencl.algorithm import RadixSort
    from pyopencl.tools import VectorArg, ScalarArg

    by_target_sorter = RadixSort(
        self.context, [
            VectorArg(value_dtype, "values"),
            VectorArg(key_dtype, "keys"),
        ],
        key_expr="keys[i]",
        sort_arg_names=["values", "keys"])

    from pyopencl.elementwise import ElementwiseTemplate
    start_finder = ElementwiseTemplate(
        arguments="""//CL//
        starts_t *key_group_starts,
        key_t *keys_sorted_by_key,
        """,

        operation=r"""//CL//
        key_t my_key = keys_sorted_by_key[i];

        if (i == 0 || my_key != keys_sorted_by_key[i-1])
            key_group_starts[my_key] = i;
        """,
        name="find_starts").build(
            self.context,
            type_aliases=(
                # The keys array is declared with key_dtype in the sorter
                # above, so key_t must alias key_dtype. Aliasing it to
                # starts_dtype (as before) misreads the buffer whenever
                # the two dtypes differ.
                ("key_t", key_dtype),
                ("starts_t", starts_dtype),
            ),
            var_values=())

    from pyopencl.scan import GenericScanKernel
    bound_propagation_scan = GenericScanKernel(
        self.context, starts_dtype,
        arguments=[
            VectorArg(starts_dtype, "starts"),
            # starts has length n+1
            ScalarArg(key_dtype, "nkeys"),
        ],
        input_expr="starts[nkeys-i]",
        scan_expr="min(a, b)",
        # The neutral element of min() is the largest representable value.
        neutral=_make_cl_int_literal(
            np.iinfo(starts_dtype).max, starts_dtype),
        output_statement="starts[nkeys-i] = item;")

    return _KernelInfo(
        by_target_sorter=by_target_sorter,
        start_finder=start_finder,
        bound_propagation_scan=bound_propagation_scan)
def extract_extra_args_types_values(extra_args):
    """Split ``(name, value)`` pairs into argument declarations and values.

    Each value that is a ``cl.array.Array`` yields a ``VectorArg``
    (declared with ``with_offset=False``); each value that is a
    ``numpy.generic`` scalar yields a ``ScalarArg``. In both cases the
    declaration uses the value's own ``dtype`` and the given name.

    :arg extra_args: iterable of ``(name, value)`` pairs.
    :returns: a tuple ``(types, values)`` where *types* is a tuple of the
        argument declarations and *values* is a list of the corresponding
        values, both in input order.
    :raises RuntimeError: if a value is neither an array nor a numpy
        scalar.
    """
    from pyopencl.tools import VectorArg, ScalarArg

    extra_args_types = []
    extra_args_values = []

    for name, val in extra_args:
        if isinstance(val, cl.array.Array):
            extra_args_types.append(
                VectorArg(val.dtype, name, with_offset=False))
        elif isinstance(val, np.generic):
            extra_args_types.append(ScalarArg(val.dtype, name))
        else:
            # *name* is a string: formatting it with %d raised TypeError
            # and hid the intended RuntimeError. Use %s.
            raise RuntimeError("argument '%s' not understood" % (name,))

        extra_args_values.append(val)

    return tuple(extra_args_types), extra_args_values
def get_bessel_kernel(context, which_func, out_dtype=np.float64,
                      order_dtype=np.int32, x_dtype=np.float64):
    """Return an elementwise kernel evaluating a Bessel function.

    The generated operation is ``z[i] = bessel_<which_func>n(ord_n, x[i])``
    and the kernel is named ``bessel_<which_func>n_kernel``. The preamble
    enables ``cl_khr_fp64``, defines ``PYOPENCL_DEFINE_CDOUBLE`` and
    includes ``pyopencl-bessel-<which_func>.cl``.

    :arg context: passed through to :func:`get_elwise_kernel`.
    :arg which_func: text substituted into the function name, kernel name
        and include file name.
    :arg out_dtype: dtype of the output vector ``z``.
    :arg order_dtype: dtype of the scalar order ``ord_n``.
    :arg x_dtype: dtype of the input vector ``x``.
    """
    kernel_args = [
        VectorArg(out_dtype, "z", with_offset=True),
        ScalarArg(order_dtype, "ord_n"),
        VectorArg(x_dtype, "x", with_offset=True),
    ]
    operation = "z[i] = bessel_%sn(ord_n, x[i])" % which_func
    kernel_name = "bessel_%sn_kernel" % which_func

    # Each preprocessor directive has to sit on its own line.
    preamble = """
        #pragma OPENCL EXTENSION cl_khr_fp64: enable
        #define PYOPENCL_DEFINE_CDOUBLE
        #include <pyopencl-bessel-%s.cl>
        """ % which_func

    return get_elwise_kernel(
        context, kernel_args, operation,
        name=kernel_name,
        preamble=preamble)
def get_elwise_kernel_and_types(context, arguments, operation,
        name="elwise_kernel", options=[], preamble="", **kwargs):
    """Build an elementwise kernel and return it with its argument list.

    :arg context: passed through to :func:`get_elwise_program`.
    :arg arguments: either a string of comma-separated C argument
        declarations (each parsed with ``parse_c_arg``) or a sequence of
        already-parsed argument objects. A sequence passed in is copied,
        never modified.
    :arg operation: the per-element operation, passed through.
    :arg name: name of the kernel to fetch from the built program.
    :arg options: passed through unchanged (never mutated here).
    :arg preamble: user preamble; automatically generated pragmas and
        includes are placed in front of it.
    :arg kwargs: forwarded to :func:`get_elwise_program`.
    :returns: ``(kernel, parsed_args)``, where *parsed_args* includes the
        trailing ``n`` size argument appended by this function.
    """
    if isinstance(arguments, str):
        from pyopencl.tools import parse_c_arg
        parsed_args = [parse_c_arg(arg) for arg in arguments.split(",")]
    else:
        # Copy: "n" is appended below, and doing that on the caller's own
        # list would grow it by one entry on every call.
        parsed_args = list(arguments)

    pragmas = []
    includes = []
    have_double_pragma = False
    have_complex_include = False

    for arg in parsed_args:
        # Double precision (real or complex) needs the fp64 extension.
        if arg.dtype in [np.float64, np.complex128]:
            if not have_double_pragma:
                pragmas.append(
                    "#pragma OPENCL EXTENSION cl_khr_fp64: enable\n"
                    "#define PYOPENCL_DEFINE_CDOUBLE\n")
                have_double_pragma = True

        # Any complex dtype needs the complex-number header.
        if arg.dtype.kind == 'c':
            if not have_complex_include:
                includes.append("#include <pyopencl-complex.h>\n")
                have_complex_include = True

    if pragmas or includes:
        preamble = "\n".join(pragmas + includes) + "\n" + preamble

    parsed_args.append(ScalarArg(np.uintp, "n"))

    prg = get_elwise_program(
        context, parsed_args, operation,
        name=name, options=options, preamble=preamble, **kwargs)

    # One entry per argument: the dtype for scalars, None for the rest.
    scalar_arg_dtypes = [
        arg.dtype if isinstance(arg, ScalarArg) else None
        for arg in parsed_args
    ]

    kernel = getattr(prg, name)
    kernel.set_scalar_arg_dtypes(scalar_arg_dtypes)

    return kernel, parsed_args
def get_take_put_kernel(context, dtype, idx_dtype, with_offsets, vec_count=1):
    """Return an elementwise kernel copying ``src[src_idx]`` to ``dest[dest_idx]``.

    For each work index ``i`` the kernel reads ``gmem_src_idx[i]`` and
    ``gmem_dest_idx[i]`` and, for every one of the *vec_count* array
    pairs, executes ``destK[dest_idx] = srcK[src_idx]`` (or
    ``srcK[src_idx+offsetK]`` when *with_offsets* is true).

    :arg context: passed to :func:`get_elwise_kernel`; its first device
        is used for :func:`dtype_to_c_struct`.
    :arg dtype: dtype of the ``destK``/``srcK`` arrays.
    :arg idx_dtype: dtype of the index arrays and of the ``offsetK``
        scalars.
    :arg with_offsets: whether per-array ``offsetK`` scalar arguments are
        declared and added to the source index.
    :arg vec_count: number of ``dest``/``src`` array pairs.
    """
    ctype_names = {
        "idx_tp": dtype_to_ctype(idx_dtype),
        "tp": dtype_to_ctype(dtype),
    }

    # Argument order: dest arrays, index arrays, source arrays, offsets.
    args = [VectorArg(dtype, "dest%d" % k) for k in range(vec_count)]
    args.extend([
        VectorArg(idx_dtype, "gmem_dest_idx", with_offset=True),
        VectorArg(idx_dtype, "gmem_src_idx", with_offset=True),
    ])
    args.extend(
        VectorArg(dtype, "src%d" % k, with_offset=True)
        for k in range(vec_count))
    if with_offsets:
        args.extend(
            ScalarArg(idx_dtype, "offset%d" % k) for k in range(vec_count))

    if with_offsets:
        copy_lines = [
            "dest%d[dest_idx] = "
            "src%d[src_idx+offset%d];" % (k, k, k)
            for k in range(vec_count)
        ]
    else:
        copy_lines = [
            "dest%d[dest_idx] = "
            "src%d[src_idx];" % (k, k)
            for k in range(vec_count)
        ]

    index_loads = (
        "%(idx_tp)s src_idx = gmem_src_idx[i];\n"
        "%(idx_tp)s dest_idx = gmem_dest_idx[i];\n" % ctype_names)
    body = index_loads + "\n".join(copy_lines)

    return get_elwise_kernel(
        context, args, body,
        preamble=dtype_to_c_struct(context.devices[0], dtype),
        name="take_put")
def render_argument_list(self, *arg_lists):
    """Flatten and parse one or more argument lists.

    Every entry of *arg_lists* is either

    * a string: it is rendered as a text template with ``self.var_dict``,
      stripped of C comments, has its newlines replaced by spaces and is
      split on commas; or
    * an iterable of individual arguments, taken as-is.

    Each resulting individual argument is then converted:

    * a string is stripped (empty strings are skipped), parsed with
      ``parse_c_arg_backend`` into a placeholder and passed through
      ``self.render_arg``;
    * an ``Argument`` instance is used unchanged;
    * a tuple ``(type, name)`` becomes
      ``ScalarArg(self.parse_type(type), name)``.

    :returns: list of parsed arguments, in input order.
    :raises TypeError: if an individual argument is none of the above.
    """
    all_args = []

    for arg_list in arg_lists:
        if isinstance(arg_list, str):
            arg_list = str(
                self.template
                .get_text_template(arg_list).render(self.var_dict))
            arg_list = self._C_COMMENT_FINDER.sub("", arg_list)
            arg_list = arg_list.replace("\n", " ")

            all_args.extend(arg_list.split(","))
        else:
            all_args.extend(arg_list)

    from pyopencl.compyte.dtypes import parse_c_arg_backend
    parsed_args = []
    for arg in all_args:
        if isinstance(arg, str):
            arg = arg.strip()
            if not arg:
                # e.g. the piece after a trailing comma
                continue

            # name_to_dtype is the identity: type names stay unresolved
            # in the placeholder and are handled by render_arg.
            ph = parse_c_arg_backend(
                arg,
                _ScalarArgPlaceholder, _VectorArgPlaceholder,
                name_to_dtype=lambda x: x)
            parsed_arg = self.render_arg(ph)

        elif isinstance(arg, Argument):
            parsed_arg = arg

        elif isinstance(arg, tuple):
            parsed_arg = ScalarArg(self.parse_type(arg[0]), arg[1])

        else:
            # Without this branch an unsupported entry either hit an
            # unbound 'parsed_arg' or silently re-appended the previous
            # entry's result.
            raise TypeError(
                "unexpected argument type: %s" % (type(arg).__name__,))

        parsed_args.append(parsed_arg)

    return parsed_args
def __init__(self, context, arguments, key_expr, sort_arg_names,
        bits_at_a_time=2, index_dtype=np.int32, key_dtype=np.uint32,
        options=[]):
    """
    :arg arguments: A string of comma-separated C argument declarations.
        All types used here must be known to PyOpenCL.
        (see :func:`pyopencl.tools.get_or_register_dtype`).
    :arg key_expr: An integer-valued C expression returning the
        key based on which the sort is performed. The array index
        for which the key is to be computed is available as `i`.
        The expression may refer to any of the *arguments*.
    :arg sort_arg_names: A list of argument names whose corresponding
        array arguments will be sorted according to *key_expr*.
    :arg bits_at_a_time: converted with :func:`int` and stored as
        ``self.bits``; handed to the code templates as ``bits``.
    :arg index_dtype: stored as ``self.index_dtype`` (a
        :class:`numpy.dtype`); used for the scan type and as
        ``index_ctype`` in the templates.
    :arg key_dtype: stored as ``self.key_dtype``; used as ``key_ctype``
        in the templates.
    :arg options: stored as ``self.options`` and passed to the scan
        kernel (never mutated here).
    """

    # {{{ arg processing

    from pyopencl.tools import parse_arg_list
    self.arguments = parse_arg_list(arguments)
    del arguments

    self.sort_arg_names = sort_arg_names
    self.bits = int(bits_at_a_time)
    self.index_dtype = np.dtype(index_dtype)
    self.key_dtype = np.dtype(key_dtype)

    self.options = options

    # }}}

    # {{{ kernel creation

    scan_ctype, scan_dtype, scan_t_cdecl = \
        _make_sort_scan_type(context.devices[0], self.bits, self.index_dtype)

    from pyopencl.tools import VectorArg, ScalarArg

    # The scan kernel sees the user arguments, one "sorted_<name>" output
    # array per sorted argument, and the bit position being processed.
    scan_arguments = (
        list(self.arguments)
        + [VectorArg(arg.dtype, "sorted_" + arg.name)
           for arg in self.arguments
           if arg.name in sort_arg_names]
        + [ScalarArg(np.int32, "base_bit")])

    def get_count_branch(known_bits):
        """Return a nested C ternary selecting ``s.c<bits>`` by ``mnr``.

        *known_bits* is the binary prefix (a string of '0'/'1') decided
        so far; once it has ``self.bits`` digits the member is named
        directly.
        """
        if len(known_bits) == self.bits:
            return "s.c%s" % known_bits

        # Smallest value whose next undecided bit is 1.
        boundary_mnr = (
            known_bits + "1" + (self.bits - len(known_bits) - 1) * "0")

        return ("((mnr < %s) ? %s : %s)" % (
            int(boundary_mnr, 2),
            get_count_branch(known_bits + "0"),
            get_count_branch(known_bits + "1")))

    codegen_args = dict(
        bits=self.bits,
        key_ctype=dtype_to_ctype(self.key_dtype),
        key_expr=key_expr,
        index_ctype=dtype_to_ctype(self.index_dtype),
        index_type_max=np.iinfo(self.index_dtype).max,
        padded_bin=_padded_bin,
        scan_ctype=scan_ctype,
        sort_arg_names=sort_arg_names,
        get_count_branch=get_count_branch,
    )

    preamble = scan_t_cdecl + RADIX_SORT_PREAMBLE_TPL.render(
        **codegen_args)
    scan_preamble = preamble \
        + RADIX_SORT_SCAN_PREAMBLE_TPL.render(**codegen_args)

    from pyopencl.scan import GenericScanKernel
    self.scan_kernel = GenericScanKernel(
        context, scan_dtype,
        arguments=scan_arguments,
        input_expr="scan_t_from_value(%s, base_bit, i)" % key_expr,
        scan_expr="scan_t_add(a, b, across_seg_boundary)",
        neutral="scan_t_neutral()",
        output_statement=RADIX_SORT_OUTPUT_STMT_TPL.render(**codegen_args),
        preamble=scan_preamble, options=self.options)

    # Record the index of the FIRST array argument. Without the break
    # the loop kept overwriting the attribute and ended up with the last
    # one, contradicting its name.
    for i, arg in enumerate(self.arguments):
        if isinstance(arg, VectorArg):
            self.first_array_arg_idx = i
            break

    # }}}
def get_elwise_kernel_and_types(context, arguments, operation,
        name="elwise_kernel", options=[], preamble="", use_range=False,
        **kwargs):
    """Build an elementwise kernel and return it with its argument list.

    :arg context: passed through to :func:`get_elwise_program`.
    :arg arguments: handed to ``parse_arg_list(..., with_offset=True)``.
    :arg operation: the per-element operation, passed through.
    :arg name: name of the kernel to fetch from the built program.
    :arg options: passed through unchanged.
    :arg preamble: user preamble; generated pragmas and includes are
        placed in front of it.
    :arg use_range: if true, ``start``/``stop``/``step`` scalar arguments
        are appended instead of a single ``n``.
    :arg kwargs: ``auto_preamble`` (default ``True``) and ``loop_prep``
        (default ``""``) are consumed here; the rest is forwarded to
        :func:`get_elwise_program`.
    :returns: ``(kernel, parsed_args)``, with the size/range arguments
        included in *parsed_args*.
    """
    from pyopencl.tools import parse_arg_list, get_arg_offset_adjuster_code
    parsed_args = parse_arg_list(arguments, with_offset=True)

    auto_preamble = kwargs.pop("auto_preamble", True)

    # Each list receives at most one entry; emptiness doubles as the
    # "not emitted yet" flag.
    fp64_pragmas = []
    complex_includes = []

    if auto_preamble:
        for arg in parsed_args:
            if arg.dtype in [np.float64, np.complex128] and not fp64_pragmas:
                fp64_pragmas.append("""
                    #if __OPENCL_C_VERSION__ < 120
                    #pragma OPENCL EXTENSION cl_khr_fp64: enable
                    #endif
                    #define PYOPENCL_DEFINE_CDOUBLE
                    """)

            if arg.dtype.kind == 'c' and not complex_includes:
                complex_includes.append("#include <pyopencl-complex.h>\n")

    if fp64_pragmas or complex_includes:
        generated = "\n".join(fp64_pragmas + complex_includes)
        preamble = generated + "\n" + preamble

    if use_range:
        size_args = [
            ScalarArg(np.intp, "start"),
            ScalarArg(np.intp, "stop"),
            ScalarArg(np.intp, "step"),
        ]
    else:
        size_args = [ScalarArg(np.intp, "n")]
    parsed_args.extend(size_args)

    # The offset-adjusting code runs before any caller-supplied loop_prep.
    user_loop_prep = kwargs.pop("loop_prep", "")
    loop_prep = get_arg_offset_adjuster_code(parsed_args) + user_loop_prep

    prg = get_elwise_program(
        context, parsed_args, operation,
        name=name, options=options, preamble=preamble,
        use_range=use_range, loop_prep=loop_prep, **kwargs)

    from pyopencl.tools import get_arg_list_scalar_arg_dtypes

    kernel = getattr(prg, name)
    kernel.set_scalar_arg_dtypes(get_arg_list_scalar_arg_dtypes(parsed_args))

    return kernel, parsed_args
def initialize(cls):
    '''
    Compile the program and build both particle sort kernels.

    Sets three class attributes:

    * ``program``: ``cls.KERNEL`` passed through ``F`` and built.
    * ``longitudinal_sort_kernel``: a RadixSort keyed by
      ``floor(theta[i]*inv_slice_len)``.
    * ``longitudinal_traverse_sort_kernel``: a RadixSort keyed by
      ``sort_fun(...)``, whose scan kernel class prepends the
      ``sort_fun`` C source to the scan preamble.
    '''
    cls.program = cl.Program(cl_ctx, F(cls.KERNEL)).build()

    # Names of the per-particle arrays that are both declared as vector
    # arguments and sorted.
    particle_fields = ["x", "px", "y", "py", "theta", "gamma"]

    def particle_vector_args():
        ''' One VectorArg of cl_ftype per particle field '''
        return [VectorArg(cl_ftype, field) for field in particle_fields]

    cls.longitudinal_sort_kernel = RadixSort(
        cl_ctx,
        particle_vector_args() + [ScalarArg(cl_ftype, "inv_slice_len")],
        key_expr="(int) floor(theta[i]*inv_slice_len)",
        sort_arg_names=list(particle_fields),
        key_dtype=np.int32)

    # NOTE(review): xbin/ybin below scale the normalized coordinate by
    # inv_traverse_len rather than by bins, yet are range-checked against
    # bins -- confirm this is intended.
    sort_fun_source = '''
    int sort_fun(FLOAT_TYPE x, FLOAT_TYPE y, FLOAT_TYPE theta,
                 FLOAT_TYPE inv_slice_len, FLOAT_TYPE inv_traverse_len,
                 int bins)
    {
        FLOAT_TYPE xnorm = 0.5 + (inv_traverse_len*x);
        FLOAT_TYPE ynorm = 0.5 + (inv_traverse_len*y);
        int xbin = (int) floor(xnorm * inv_traverse_len);
        int ybin = (int) floor(ynorm * inv_traverse_len);
        int zbin = (int) floor(theta*inv_slice_len);
        if ((xbin < 0) || (xbin >= bins) || (ybin < 0) || (ybin >= bins))
        {
            xbin = 0;
            ybin = 0;
        }
        return xbin+bins*(ybin+bins*zbin);
    }
    '''

    class LongitudinalTraverseScanKernel(GenericScanKernel):
        ''' Scan kernel whose preamble additionally defines sort_fun '''

        def __init__(self, *argl, **argd):
            ''' Prepend the sort_fun source to argd['preamble'] '''
            # Work on a copy so the caller's keyword dict stays untouched.
            patched = dict(argd)
            patched['preamble'] = F(sort_fun_source + patched['preamble'])
            super().__init__(*argl, **patched)

    traverse_args = particle_vector_args() + [
        ScalarArg(cl_ftype, "inv_slice_len"),
        ScalarArg(cl_ftype, "inv_traverse_len"),
        ScalarArg(np.int32, "bins"),
    ]
    cls.longitudinal_traverse_sort_kernel = RadixSort(
        cl_ctx,
        traverse_args,
        key_expr="sort_fun(x[i],y[i],theta[i], inv_slice_len, inv_traverse_len, bins)",
        sort_arg_names=list(particle_fields),
        scan_kernel=LongitudinalTraverseScanKernel,
        key_dtype=np.int32)
def get_kernel_info(self, dimensions, particle_id_dtype, box_id_dtype,
        coord_dtype, box_level_dtype, max_levels,
        sources_are_targets, sources_have_extent, targets_have_extent,
        stick_out_factor):
    """Build all kernels needed for traversal construction.

    Every argument except *box_level_dtype* is made available to the
    source templates as a render variable of the same name.
    *box_level_dtype* is used only as the ``box_level_t`` type alias of
    the level-start extractor.

    :returns: a ``_KernelInfo`` constructed with the keyword fields

        * ``sources_parents_and_targets_builder``
        * ``level_start_box_nrs_extractor``
        * ``<name>_builder`` for *name* in ``colleagues``,
          ``neighbor_source_boxes``, ``sep_siblings``, ``sep_smaller``
          and ``sep_bigger``.
    """
    logging.info("building traversal build kernels")

    debug = False

    from pyopencl.tools import dtype_to_ctype
    from boxtree.tree import box_flags_enum

    render_vars = {
        "dimensions": dimensions,
        "dtype_to_ctype": dtype_to_ctype,
        "particle_id_dtype": particle_id_dtype,
        "box_id_dtype": box_id_dtype,
        "box_flags_enum": box_flags_enum,
        "coord_dtype": coord_dtype,
        "vec_types": cl.array.vec.types,
        "max_levels": max_levels,
        "AXIS_NAMES": AXIS_NAMES,
        "debug": debug,
        "sources_are_targets": sources_are_targets,
        "sources_have_extent": sources_have_extent,
        "targets_have_extent": targets_have_extent,
        "stick_out_factor": stick_out_factor,
    }

    from pyopencl.algorithm import ListOfListsBuilder
    from pyopencl.tools import VectorArg, ScalarArg

    def render_source(template_text):
        # strict_undefined makes a missing render variable an error.
        rendered = Template(
            template_text, strict_undefined=True).render(**render_vars)
        return str(rendered)

    def box_id_vec(arg_name):
        return VectorArg(box_id_dtype, arg_name)

    result = {}

    # {{{ source boxes, their parents, target boxes

    src = render_source(
        TRAVERSAL_PREAMBLE_TEMPLATE
        + SOURCES_PARENTS_AND_TARGETS_TEMPLATE)

    output_lists = [
        ("source_parent_boxes", box_id_dtype),
        ("source_boxes", box_id_dtype),
        ("target_or_target_parent_boxes", box_id_dtype),
    ]
    if not sources_are_targets:
        # A separate target box list is only built when targets differ
        # from sources.
        output_lists.append(("target_boxes", box_id_dtype))

    result["sources_parents_and_targets_builder"] = ListOfListsBuilder(
        self.context,
        output_lists,
        src,
        arg_decls=[
            VectorArg(box_flags_enum.dtype, "box_flags"),
        ],
        debug=debug,
        name_prefix="sources_parents_and_targets")

    result["level_start_box_nrs_extractor"] = \
        LEVEL_START_BOX_NR_EXTRACTOR_TEMPLATE.build(
            self.context,
            type_aliases=(
                ("box_id_t", box_id_dtype),
                ("box_level_t", box_level_dtype),
            ),
        )

    # }}}

    # {{{ build list N builders

    base_args = [
        VectorArg(coord_dtype, "box_centers"),
        ScalarArg(coord_dtype, "root_extent"),
        VectorArg(np.uint8, "box_levels"),
        ScalarArg(box_id_dtype, "aligned_nboxes"),
        VectorArg(box_id_dtype, "box_child_ids"),
        VectorArg(box_flags_enum.dtype, "box_flags"),
    ]

    # The "close" companion lists exist only if some particles have extent.
    any_extent = sources_have_extent or targets_have_extent

    # (list name, template, extra kernel arguments, extra output lists)
    builder_specs = [
        ("colleagues", COLLEAGUES_TEMPLATE, [], []),
        ("neighbor_source_boxes", NEIGBHOR_SOURCE_BOXES_TEMPLATE,
            [
                box_id_vec("target_boxes"),
            ], []),
        ("sep_siblings", SEP_SIBLINGS_TEMPLATE,
            [
                box_id_vec("target_or_target_parent_boxes"),
                box_id_vec("box_parent_ids"),
                box_id_vec("colleagues_starts"),
                box_id_vec("colleagues_list"),
            ], []),
        ("sep_smaller", SEP_SMALLER_TEMPLATE,
            [
                box_id_vec("target_boxes"),
                box_id_vec("colleagues_starts"),
                box_id_vec("colleagues_list"),
            ],
            ["sep_close_smaller"] if any_extent else []),
        ("sep_bigger", SEP_BIGGER_TEMPLATE,
            [
                box_id_vec("target_or_target_parent_boxes"),
                box_id_vec("box_parent_ids"),
                box_id_vec("colleagues_starts"),
                box_id_vec("colleagues_list"),
            ],
            ["sep_close_bigger"] if any_extent else []),
    ]

    for spec in builder_specs:
        list_name, template, extra_args, extra_lists = spec

        src = render_source(
            TRAVERSAL_PREAMBLE_TEMPLATE
            + HELPER_FUNCTION_TEMPLATE
            + template)

        list_decls = [(list_name, box_id_dtype)]
        list_decls += [
            (extra_list_name, box_id_dtype)
            for extra_list_name in extra_lists
        ]

        result[list_name + "_builder"] = ListOfListsBuilder(
            self.context,
            list_decls,
            src,
            arg_decls=base_args + extra_args,
            debug=debug,
            name_prefix=list_name,
            complex_kernel=True)

    # }}}

    logging.info("traversal build kernels built")

    return _KernelInfo(**result)