def _get_reduction_source(
        ctx, out_type, out_type_size,
        neutral, reduce_expr, map_expr, parsed_args,
        name="reduce_kernel", preamble="", arg_prep="",
        device=None, max_group_size=None):
    """Render the reduction kernel source and choose its work-group size.

    The module-level ``KERNEL`` Mako template is rendered with the given
    expressions and argument declarations.  The group size is the smallest
    per-device size over all targeted devices, optionally capped by
    *max_group_size*.

    :arg ctx: context whose ``devices`` are targeted when *device* is None;
        also stored on the returned record.
    :arg out_type: passed to the template as ``out_type``.
    :arg out_type_size: divisor used when deriving the per-device group
        size (presumably the size of *out_type* in bytes -- confirm with
        callers).
    :arg neutral: passed to the template as ``neutral``.
    :arg reduce_expr: run through ``_process_code_for_macro`` and passed to
        the template as ``reduce_expr``.
    :arg map_expr: run through ``_process_code_for_macro`` and passed to
        the template as ``map_expr``.
    :arg parsed_args: iterable of objects with a ``declarator()`` method;
        the results are joined with ``", "`` to form the argument list.
    :arg name: passed to the template as ``name``.
    :arg preamble: passed to the template as ``preamble``.
    :arg arg_prep: passed to the template as ``arg_prep``.
    :arg device: if not None, the only device considered; otherwise all of
        ``ctx.devices`` are considered.
    :arg max_group_size: if not None, upper bound applied to the computed
        group size.
    :returns: a ``ReductionInfo`` record with ``context``, ``source`` and
        ``group_size`` attributes.
    """
    from mako.template import Template
    from pytools import Record, all, div_ceil
    from pyopencl.characterize import has_double_support
    from pyopencl.tools import bitlog2

    target_devices = ctx.devices if device is None else [device]

    # {{{ compute group size

    def group_size_for(dev):
        wg_limit = dev.max_work_group_size
        # dirty fix for the RV770 boards
        if "RV770" in dev.name:
            wg_limit = 64

        # compute lmem limit
        # NOTE(review): this divides the work-group limit by out_type_size
        # rather than consulting the device's local memory size -- confirm
        # that this is the intended heuristic.
        lmem_limit = div_ceil(wg_limit, out_type_size)
        usable = min(wg_limit, lmem_limit)

        # round down to power of 2
        return 2**bitlog2(usable)

    group_size = min([group_size_for(dev) for dev in target_devices])

    if max_group_size is not None:
        group_size = min(max_group_size, group_size)

    # }}}

    kernel_template = Template(KERNEL)

    argument_decls = ", ".join(arg.declarator() for arg in parsed_args)
    reduce_macro = _process_code_for_macro(reduce_expr)
    map_macro = _process_code_for_macro(map_expr)
    # Double support is only reported if every targeted device has it.
    have_doubles = all(has_double_support(dev) for dev in target_devices)

    src = str(kernel_template.render(
        out_type=out_type,
        arguments=argument_decls,
        group_size=group_size,
        neutral=neutral,
        reduce_expr=reduce_macro,
        map_expr=map_macro,
        name=name,
        preamble=preamble,
        arg_prep=arg_prep,
        double_support=have_doubles,
        ))

    # Plain attribute bag for the result; a new class is created per call.
    class ReductionInfo(Record):
        pass

    return ReductionInfo(
            context=ctx,
            source=src,
            group_size=group_size)