def __init__(self, context, arguments, key_expr, sort_arg_names,
        bits_at_a_time=2, index_dtype=np.int32, key_dtype=np.uint32,
        options=None):
    """
    Parse the argument list and build the scan kernel used for sorting.

    :arg context: The context the scan kernel is built in. Its first
        device (``context.devices[0]``) is used to choose the scan type.
    :arg arguments: A string of comma-separated C argument declarations.
        All types used here must be known to PyOpenCL.
        (see :func:`pyopencl.tools.get_or_register_dtype`).
    :arg key_expr: An integer-valued C expression returning the
        key based on which the sort is performed. The array index
        for which the key is to be computed is available as `i`.
        The expression may refer to any of the *arguments*.
    :arg sort_arg_names: A list of argument names whose corresponding
        array arguments will be sorted according to *key_expr*.
    :arg bits_at_a_time: Converted to :class:`int` and stored as
        ``self.bits``; determines the scan type and the depth of the
        generated count-selection expression.
    :arg index_dtype: Stored as ``self.index_dtype`` (a
        :class:`numpy.dtype`).
    :arg key_dtype: Stored as ``self.key_dtype`` (a
        :class:`numpy.dtype`).
    :arg options: Forwarded to the scan kernel as its ``options``.
        *None* (the default) means an empty list.
    """

    # {{{ arg processing

    from pyopencl.tools import parse_arg_list
    self.arguments = parse_arg_list(arguments)
    del arguments

    self.sort_arg_names = sort_arg_names
    self.bits = int(bits_at_a_time)
    self.index_dtype = np.dtype(index_dtype)
    self.key_dtype = np.dtype(key_dtype)

    # A fresh list per instance: a literal ``[]`` default would be one
    # object shared by every instance that relies on the default.
    self.options = [] if options is None else options

    # }}}

    # {{{ kernel creation

    scan_ctype, scan_dtype, scan_t_cdecl = \
            _make_sort_scan_type(context.devices[0], self.bits, self.index_dtype)

    from pyopencl.tools import VectorArg, ScalarArg
    scan_arguments = (
            list(self.arguments)
            + [VectorArg(arg.dtype, "sorted_" + arg.name)
                for arg in self.arguments
                if arg.name in sort_arg_names]
            + [ScalarArg(np.int32, "base_bit")])

    def get_count_branch(known_bits):
        # Recursively emit a nested C ternary on `mnr` that ends in the
        # `s.c<bits>` member once all `self.bits` bits are decided.
        if len(known_bits) == self.bits:
            return "s.c%s" % known_bits

        boundary_mnr = known_bits + "1" + (self.bits - len(known_bits) - 1) * "0"

        return ("((mnr < %s) ? %s : %s)" % (
            int(boundary_mnr, 2),
            get_count_branch(known_bits + "0"),
            get_count_branch(known_bits + "1")))

    codegen_args = dict(
            bits=self.bits,
            key_ctype=dtype_to_ctype(self.key_dtype),
            key_expr=key_expr,
            index_ctype=dtype_to_ctype(self.index_dtype),
            index_type_max=np.iinfo(self.index_dtype).max,
            padded_bin=_padded_bin,
            scan_ctype=scan_ctype,
            sort_arg_names=sort_arg_names,
            get_count_branch=get_count_branch,
            )

    preamble = scan_t_cdecl + RADIX_SORT_PREAMBLE_TPL.render(**codegen_args)
    scan_preamble = preamble \
            + RADIX_SORT_SCAN_PREAMBLE_TPL.render(**codegen_args)

    from pyopencl.scan import GenericScanKernel
    self.scan_kernel = GenericScanKernel(
            context, scan_dtype,
            arguments=scan_arguments,
            input_expr="scan_t_from_value(%s, base_bit, i)" % key_expr,
            scan_expr="scan_t_add(a, b, across_seg_boundary)",
            neutral="scan_t_neutral()",
            output_statement=RADIX_SORT_OUTPUT_STMT_TPL.render(**codegen_args),
            preamble=scan_preamble, options=self.options)

    # Record the position of the *first* array argument. Without the
    # `break` the loop would keep overwriting the attribute and end up
    # with the last one. If there is no array argument the attribute
    # stays unset, as before.
    for i, arg in enumerate(self.arguments):
        if isinstance(arg, VectorArg):
            self.first_array_arg_idx = i
            break

    # }}}
def get_elwise_kernel_and_types(context, arguments, operation,
        name="elwise_kernel", options=[], preamble="", use_range=False,
        **kwargs):
    """Build an element-wise kernel and return it with its argument list.

    :arg context: Passed through to :func:`get_elwise_program`.
    :arg arguments: Argument declarations, parsed with
        :func:`pyopencl.tools.parse_arg_list` (``with_offset=True``).
    :arg operation: Passed through to :func:`get_elwise_program`.
    :arg name: Name of the kernel; also the attribute looked up on the
        built program.
    :arg options: Passed through unchanged to :func:`get_elwise_program`.
    :arg preamble: Source text placed after any automatically generated
        pragmas/includes.
    :arg use_range: If true, ``start``/``stop``/``step`` scalar arguments
        are appended; otherwise a single ``n`` scalar argument is.
    :arg kwargs: ``auto_preamble`` (default *True*) and ``loop_prep``
        (default ``""``) are consumed here; everything else is forwarded
        to :func:`get_elwise_program`.
    :returns: A tuple ``(kernel, parsed_args)``.
    """
    from pyopencl.tools import (
            parse_arg_list, get_arg_offset_adjuster_code,
            get_arg_list_scalar_arg_dtypes)

    parsed_args = parse_arg_list(arguments, with_offset=True)

    auto_preamble = kwargs.pop("auto_preamble", True)

    pragmas = []
    includes = []

    if auto_preamble:
        # Each preprocessor directive must sit on its own line, so the
        # block is assembled from explicitly newline-terminated pieces.
        if any(arg.dtype in [np.float64, np.complex128]
                for arg in parsed_args):
            pragmas.append(
                    "\n"
                    "#if __OPENCL_C_VERSION__ < 120\n"
                    "#pragma OPENCL EXTENSION cl_khr_fp64: enable\n"
                    "#endif\n"
                    "#define PYOPENCL_DEFINE_CDOUBLE\n")

        if any(arg.dtype.kind == 'c' for arg in parsed_args):
            includes.append("#include <pyopencl-complex.h>\n")

    if pragmas or includes:
        preamble = "\n".join(pragmas + includes) + "\n" + preamble

    if use_range:
        parsed_args.extend([
            ScalarArg(np.intp, "start"),
            ScalarArg(np.intp, "stop"),
            ScalarArg(np.intp, "step"),
            ])
    else:
        parsed_args.append(ScalarArg(np.intp, "n"))

    # The offset-adjuster code goes in front of any caller-supplied
    # loop preparation code.
    loop_prep = kwargs.pop("loop_prep", "")
    loop_prep = get_arg_offset_adjuster_code(parsed_args) + loop_prep

    prg = get_elwise_program(
        context, parsed_args, operation,
        name=name, options=options, preamble=preamble,
        use_range=use_range, loop_prep=loop_prep, **kwargs)

    kernel = getattr(prg, name)
    kernel.set_scalar_arg_dtypes(get_arg_list_scalar_arg_dtypes(parsed_args))

    return kernel, parsed_args
def get_elwise_kernel_and_types(context, arguments, operation,
        name="elwise_kernel", options=[], preamble="", use_range=False,
        **kwargs):
    """Build an element-wise kernel and return it with its argument list.

    :arg context: Passed through to :func:`get_elwise_program`.
    :arg arguments: Either a string of comma-separated C argument
        declarations (each parsed with :func:`pyopencl.tools.parse_c_arg`)
        or an iterable of already-parsed argument objects. A list passed
        in is not modified.
    :arg operation: Passed through to :func:`get_elwise_program`.
    :arg name: Name of the kernel; also the attribute looked up on the
        built program.
    :arg options: Passed through unchanged to :func:`get_elwise_program`.
    :arg preamble: Source text placed after any automatically generated
        pragmas/includes.
    :arg use_range: If true, ``start``/``stop``/``step`` scalar arguments
        are appended; otherwise a single ``n`` scalar argument is.
    :arg kwargs: ``auto_preamble`` (default *True*) is consumed here;
        everything else is forwarded to :func:`get_elwise_program`.
    :returns: A tuple ``(kernel, parsed_args)`` where *parsed_args*
        includes the appended size/range scalars.
    """
    if isinstance(arguments, str):
        from pyopencl.tools import parse_c_arg
        parsed_args = [parse_c_arg(arg) for arg in arguments.split(",")]
    else:
        # Work on a copy: the scalars appended below must not leak into
        # the caller's list (and pile up if the same list is reused).
        parsed_args = list(arguments)

    auto_preamble = kwargs.pop("auto_preamble", True)

    pragmas = []
    includes = []

    if auto_preamble:
        if any(arg.dtype in [np.float64, np.complex128]
                for arg in parsed_args):
            pragmas.append(
                    "#pragma OPENCL EXTENSION cl_khr_fp64: enable\n"
                    "#define PYOPENCL_DEFINE_CDOUBLE\n")

        if any(arg.dtype.kind == 'c' for arg in parsed_args):
            includes.append("#include <pyopencl-complex.h>\n")

    if pragmas or includes:
        preamble = "\n".join(pragmas + includes) + "\n" + preamble

    if use_range:
        parsed_args.extend([
            ScalarArg(np.intp, "start"),
            ScalarArg(np.intp, "stop"),
            ScalarArg(np.intp, "step"),
            ])
    else:
        parsed_args.append(ScalarArg(np.intp, "n"))

    prg = get_elwise_program(
        context, parsed_args, operation,
        name=name, options=options, preamble=preamble,
        use_range=use_range, **kwargs)

    # Scalar arguments report their dtype; everything else is None.
    scalar_arg_dtypes = [
            arg.dtype if isinstance(arg, ScalarArg) else None
            for arg in parsed_args]

    kernel = getattr(prg, name)
    kernel.set_scalar_arg_dtypes(scalar_arg_dtypes)

    return kernel, parsed_args
def initialize(cls):
    """Build the OpenCL program and the two radix-sort kernels on *cls*.

    Sets ``cls.program``, ``cls.longitudinal_sort_kernel`` and
    ``cls.longitudinal_traverse_sort_kernel``.
    """
    cls.program = cl.Program(cl_ctx, F(cls.KERNEL)).build()

    particle_fields = ("x", "px", "y", "py", "theta", "gamma")

    def particle_array_args():
        # Fresh argument objects for every kernel that needs them.
        return [VectorArg(cl_ftype, field) for field in particle_fields]

    cls.longitudinal_sort_kernel = RadixSort(
        cl_ctx,
        particle_array_args() + [ScalarArg(cl_ftype, "inv_slice_len")],
        key_expr="(int) floor(theta[i]*inv_slice_len)",
        sort_arg_names=list(particle_fields),
        key_dtype=np.int32)

    # OpenCL helper computing the combined x/y/z bin used as sort key.
    # NOTE(review): xbin/ybin scale the normalized coordinate by
    # inv_traverse_len, not by bins -- confirm this is intended.
    sort_fun_source = (
        " int sort_fun(FLOAT_TYPE x, FLOAT_TYPE y, FLOAT_TYPE theta,"
        " FLOAT_TYPE inv_slice_len, FLOAT_TYPE inv_traverse_len, int bins) {"
        " FLOAT_TYPE xnorm = 0.5 + (inv_traverse_len*x);"
        " FLOAT_TYPE ynorm = 0.5 + (inv_traverse_len*y);"
        " int xbin = (int) floor(xnorm * inv_traverse_len);"
        " int ybin = (int) floor(ynorm * inv_traverse_len);"
        " int zbin = (int) floor(theta*inv_slice_len);"
        " if ((xbin < 0) || (xbin >= bins) || (ybin < 0) || (ybin >= bins)) {"
        " xbin = 0; ybin = 0; }"
        " return xbin+bins*(ybin+bins*zbin); } "
    )

    class LongitudinalTraverseScanKernel(GenericScanKernel):
        """Scan kernel whose preamble is prefixed with ``sort_fun``."""

        def __init__(self, *argl, **argd):
            """Forward to the base class with a patched ``preamble``."""
            patched = {
                **argd,
                'preamble': F(sort_fun_source + argd['preamble']),
            }
            super().__init__(*argl, **patched)

    traverse_args = particle_array_args() + [
        ScalarArg(cl_ftype, "inv_slice_len"),
        ScalarArg(cl_ftype, "inv_traverse_len"),
        ScalarArg(np.int32, "bins"),
    ]
    cls.longitudinal_traverse_sort_kernel = RadixSort(
        cl_ctx,
        traverse_args,
        key_expr="sort_fun(x[i],y[i],theta[i], inv_slice_len, inv_traverse_len, bins)",
        sort_arg_names=list(particle_fields),
        scan_kernel=LongitudinalTraverseScanKernel,
        key_dtype=np.int32)
def get_kernel_info(self, dimensions, particle_id_dtype, box_id_dtype,
        coord_dtype, box_level_dtype, max_levels,
        sources_are_targets, sources_have_extent, targets_have_extent,
        stick_out_factor):
    """Construct the traversal-build kernels for the given configuration.

    Renders the traversal templates with the given dtypes and flags,
    wraps each rendered source in a ``ListOfListsBuilder`` (plus the
    level-start-box-number extractor), and returns them bundled in a
    ``_KernelInfo``.

    The resulting entries are ``sources_parents_and_targets_builder``,
    ``level_start_box_nrs_extractor`` and one ``<list_name>_builder`` for
    each of ``colleagues``, ``neighbor_source_boxes``, ``sep_siblings``,
    ``sep_smaller`` and ``sep_bigger``.
    """
    logging.info("building traversal build kernels")

    debug = False

    from pyopencl.tools import dtype_to_ctype
    from boxtree.tree import box_flags_enum
    render_vars = {
        "dimensions": dimensions,
        "dtype_to_ctype": dtype_to_ctype,
        "particle_id_dtype": particle_id_dtype,
        "box_id_dtype": box_id_dtype,
        "box_flags_enum": box_flags_enum,
        "coord_dtype": coord_dtype,
        "vec_types": cl.array.vec.types,
        "max_levels": max_levels,
        "AXIS_NAMES": AXIS_NAMES,
        "debug": debug,
        "sources_are_targets": sources_are_targets,
        "sources_have_extent": sources_have_extent,
        "targets_have_extent": targets_have_extent,
        "stick_out_factor": stick_out_factor,
    }

    from pyopencl.algorithm import ListOfListsBuilder
    from pyopencl.tools import VectorArg, ScalarArg

    def render_source(template_text):
        # Undefined template variables raise rather than render empty.
        rendered = Template(
                template_text, strict_undefined=True).render(**render_vars)
        return str(rendered)

    kernels = {}

    # {{{ source boxes, their parents, target boxes

    spt_source = render_source(
            TRAVERSAL_PREAMBLE_TEMPLATE + SOURCES_PARENTS_AND_TARGETS_TEMPLATE)

    spt_lists = [
            ("source_parent_boxes", box_id_dtype),
            ("source_boxes", box_id_dtype),
            ("target_or_target_parent_boxes", box_id_dtype),
            ]
    if not sources_are_targets:
        # A separate target list is only built when targets differ
        # from sources.
        spt_lists.append(("target_boxes", box_id_dtype))

    kernels["sources_parents_and_targets_builder"] = ListOfListsBuilder(
            self.context,
            spt_lists,
            spt_source,
            arg_decls=[
                VectorArg(box_flags_enum.dtype, "box_flags"),
                ],
            debug=debug,
            name_prefix="sources_parents_and_targets")

    kernels["level_start_box_nrs_extractor"] = \
            LEVEL_START_BOX_NR_EXTRACTOR_TEMPLATE.build(
                    self.context,
                    type_aliases=(
                        ("box_id_t", box_id_dtype),
                        ("box_level_t", box_level_dtype),
                        ),
                    )

    # }}}

    # {{{ build list N builders

    # NOTE(review): box_levels is declared as np.uint8 here although
    # box_level_dtype is available -- confirm against the templates.
    common_args = [
            VectorArg(coord_dtype, "box_centers"),
            ScalarArg(coord_dtype, "root_extent"),
            VectorArg(np.uint8, "box_levels"),
            ScalarArg(box_id_dtype, "aligned_nboxes"),
            VectorArg(box_id_dtype, "box_child_ids"),
            VectorArg(box_flags_enum.dtype, "box_flags"),
            ]

    def colleagues_args():
        return [
                VectorArg(box_id_dtype, "colleagues_starts"),
                VectorArg(box_id_dtype, "colleagues_list"),
                ]

    def parent_walk_args():
        return [
                VectorArg(box_id_dtype, "target_or_target_parent_boxes"),
                VectorArg(box_id_dtype, "box_parent_ids"),
                ] + colleagues_args()

    # The "close" companion lists exist only if some particle has extent.
    with_extent = sources_have_extent or targets_have_extent

    # (list name, template, extra kernel arguments, extra output lists)
    list_specs = [
            ("colleagues", COLLEAGUES_TEMPLATE, [], []),
            ("neighbor_source_boxes", NEIGBHOR_SOURCE_BOXES_TEMPLATE,
                [VectorArg(box_id_dtype, "target_boxes")],
                []),
            ("sep_siblings", SEP_SIBLINGS_TEMPLATE,
                parent_walk_args(),
                []),
            ("sep_smaller", SEP_SMALLER_TEMPLATE,
                [VectorArg(box_id_dtype, "target_boxes")] + colleagues_args(),
                ["sep_close_smaller"] if with_extent else []),
            ("sep_bigger", SEP_BIGGER_TEMPLATE,
                parent_walk_args(),
                ["sep_close_bigger"] if with_extent else []),
            ]

    for list_name, template, extra_args, extra_lists in list_specs:
        list_source = render_source(
                TRAVERSAL_PREAMBLE_TEMPLATE
                + HELPER_FUNCTION_TEMPLATE
                + template)

        output_lists = [(list_name, box_id_dtype)]
        output_lists = output_lists + [
                (extra_list_name, box_id_dtype)
                for extra_list_name in extra_lists]

        kernels[list_name + "_builder"] = ListOfListsBuilder(
                self.context,
                output_lists,
                list_source,
                arg_decls=common_args + extra_args,
                debug=debug,
                name_prefix=list_name,
                complex_kernel=True)

    # }}}

    logging.info("traversal build kernels built")

    return _KernelInfo(**kernels)