def test_op_counter_basic():
    """Check sub-group-granularity add/mul/div counts on a two-statement kernel."""
    knl = lp.make_kernel(
            "[n,m,ell] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                """
                c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
                e[i, k+1] = -g[i,k]*h[i,k+1]
                """
            ],
            name="basic", assumptions="n,m,ell >= 1")

    arg_dtypes = {
            "a": np.float32, "b": np.float32,
            "g": np.float64, "h": np.float64}
    knl = lp.add_and_infer_dtypes(knl, arg_dtypes)
    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)

    # The expected values below are scaled by the number of sub-groups.
    n_workgroups = 1
    group_size = 1
    n_subgroups = n_workgroups*div_ceil(group_size, SGS)

    n, m, ell = 512, 256, 128
    params = {'n': n, 'm': m, 'ell': ell}

    def count(dtype, op_name):
        # Look up one (dtype, operation) entry and evaluate it at *params*.
        return op_map[lp.Op(dtype, op_name, CG.SUBGROUP)].eval_with_dict(params)

    f32add = count(np.float32, 'add')
    f32mul = count(np.float32, 'mul')
    f32div = count(np.float32, 'div')
    f64mul = count(np.dtype(np.float64), 'mul')
    i32add = count(np.dtype(np.int32), 'add')

    # (count-per-sub-group)*n_subgroups
    per_3d_point = n*m*ell*n_subgroups
    assert f32add == per_3d_point
    assert f32mul == per_3d_point
    assert f32div == per_3d_point
    assert f64mul == n*m*n_subgroups
    assert i32add == n*m*2*n_subgroups
def find_padding_multiple(kernel, variable, axis, align_bytes, allowed_waste=0.1):
    """Return the smallest multiple of an axis stride that pads cheaply.

    Looks up *variable* in ``kernel.arg_dict`` and reads the stride of its
    dim tag number *axis*. Successive multiples ``1, 2, 3, ...`` of that
    stride are rounded up to a multiple of *align_bytes*; the first multiple
    for which the relative size increase is at most *allowed_waste* is
    returned.

    :raises RuntimeError: if the argument's ``dim_tags`` are unknown, if the
        axis is not tagged fixed-stride, or if the stride is not an
        :class:`int`.
    """
    arg = kernel.arg_dict[variable]

    dim_tags = arg.dim_tags
    if dim_tags is None:
        raise RuntimeError("cannot find padding multiple--dim_tags of '%s' "
                "are not known" % variable)

    dim_tag = dim_tags[axis]

    from loopy.kernel.array import FixedStrideArrayDimTag
    if not isinstance(dim_tag, FixedStrideArrayDimTag):
        raise RuntimeError("cannot find padding multiple--"
                "axis %d of '%s' is not tagged fixed-stride"
                % (axis, variable))

    stride = dim_tag.stride
    if not isinstance(stride, int):
        raise RuntimeError("cannot find padding multiple--stride is not a "
                "known integer")

    from pytools import div_ceil

    multiple = 0
    while True:
        multiple += 1

        true_size = multiple * stride
        padded_size = div_ceil(true_size, align_bytes) * align_bytes

        waste = (padded_size - true_size) / true_size
        if waste <= allowed_waste:
            return multiple
def test_op_counter_logic():
    """Check operation counts for a kernel whose statement is an ``if(...)``."""
    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                """
                e[i,k] = if( not(k<ell-2) and k>6 or k/2==ell, g[i,k]*2, g[i,k]+h[i,k]/2)
                """
            ],
            name="logic", assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(knl, {"g": np.float32, "h": np.float64})
    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)

    n_workgroups = 1
    group_size = 1
    n_subgroups = n_workgroups*div_ceil(group_size, SGS)

    n, m, ell = 512, 256, 128
    params = {'n': n, 'm': m, 'ell': ell}

    def count(dtype, op_name):
        # Look up one (dtype, operation) entry and evaluate it at *params*.
        return op_map[lp.Op(dtype, op_name, CG.SUBGROUP)].eval_with_dict(params)

    f32mul = count(np.float32, 'mul')
    f64add = count(np.float64, 'add')
    f64div = count(np.dtype(np.float64), 'div')
    i32add = count(np.dtype(np.int32), 'add')

    # (count-per-sub-group)*n_subgroups
    per_2d_point = n*m*n_subgroups
    assert f32mul == per_2d_point
    assert f64div == 2*per_2d_point  # TODO why?
    assert f64add == per_2d_point
    assert i32add == per_2d_point
def test_op_counter_triangular_domain():
    """Check the float64 multiply count on a domain constrained by ``i<j``.

    The expected value depends on whether ``isl.BasicSet`` exposes a ``card``
    attribute.
    """
    knl = lp.make_kernel(
            "{[i,j]: 0<=i<n and 0<=j<m and i<j}",
            """
            a[i, j] = b[i,j] * 2
            """,
            name="bitwise", assumptions="n,m >= 1")

    knl = lp.add_and_infer_dtypes(knl, {"b": np.float64})

    import islpy as isl
    # Without BasicSet.card, the fallback count (144 below) is expected.
    expect_fallback = not hasattr(isl.BasicSet, "card")

    f64mul_count = lp.get_op_map(
            knl, subgroup_size=SGS, count_redundant_work=True
            )[lp.Op(np.float64, 'mul', CG.SUBGROUP)]
    flops = f64mul_count.eval_with_dict({"m": 13, "n": 200})

    n_workgroups = 1
    group_size = 1
    n_subgroups = n_workgroups * div_ceil(group_size, SGS)

    expected_per_subgroup = 144 if expect_fallback else 78
    assert flops == expected_per_subgroup * n_subgroups
def test_op_counter_reduction():
    """Check add/mul counts, and their per-dtype total, for a ``sum`` reduction."""
    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                "c[i, j] = sum(k, a[i, k]*b[k, j])"
            ],
            name="matmul_serial", assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32, "b": np.float32})
    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)

    n_workgroups = 1
    group_size = 1
    n_subgroups = n_workgroups*div_ceil(group_size, SGS)

    n, m, ell = 512, 256, 128
    params = {'n': n, 'm': m, 'ell': ell}

    add_key = lp.Op(np.float32, 'add', CG.SUBGROUP)
    f32add = op_map[add_key].eval_with_dict(params)
    mul_key = lp.Op(np.dtype(np.float32), 'mul', CG.SUBGROUP)
    f32mul = op_map[mul_key].eval_with_dict(params)

    # (count-per-sub-group)*n_subgroups
    expected = n*m*ell*n_subgroups
    assert f32add == f32mul
    assert f32mul == expected

    # Grouping by dtype must add up the individual operation counts.
    by_dtype = op_map.group_by('dtype')
    f32 = by_dtype[lp.Op(dtype=np.float32)].eval_with_dict(params)
    assert f32 == f32add + f32mul
def check_expansion_disks_undisturbed_by_sources(self,
        stage1_density_discr, tree, peer_lists,
        expansion_disturbance_tolerance,
        refine_flags,
        debug, wait_for=None):
    """Run the expansion-disk checker kernel over ``tree.nqbxcenters`` entries.

    *refine_flags* and a one-entry ``found_panel_to_refine`` array are handed
    to the kernel.

    :returns: whether the single entry of ``found_panel_to_refine`` equals 1
        after the kernel has finished.
    """

    def n_flagged():
        # Only used for debug logging.
        return cl.array.sum(refine_flags).get()

    # Avoid generating too many kernels.
    from pytools import div_ceil
    n_increments = div_ceil(tree.nlevels, MAX_LEVELS_INCREMENT)
    max_levels = MAX_LEVELS_INCREMENT * n_increments

    knl = self.code_container.expansion_disk_undisturbed_by_sources_checker(
            tree.dimensions,
            tree.coord_dtype, tree.box_id_dtype,
            peer_lists.peer_list_starts.dtype,
            tree.particle_id_dtype,
            max_levels)

    if debug:
        npanels_to_refine_prev = n_flagged()

    found_panel_to_refine = cl.array.zeros(self.queue, 1, np.int32)
    found_panel_to_refine.finish()
    unwrap_args = AreaQueryElementwiseTemplate.unwrap_args

    from pytential import bind, sym
    radii_op = bind(
            stage1_density_discr,
            sym.expansion_radii(
                stage1_density_discr.ambient_dim,
                granularity=sym.GRANULARITY_CENTER))
    center_danger_zone_radii = flatten(radii_op(self.array_context))

    evt = knl(
        *unwrap_args(
            tree, peer_lists,
            tree.box_to_qbx_source_starts,
            tree.box_to_qbx_source_lists,
            tree.qbx_panel_to_source_starts,
            tree.qbx_panel_to_center_starts,
            tree.qbx_user_source_slice.start,
            tree.qbx_user_center_slice.start,
            tree.sorted_target_ids,
            center_danger_zone_radii,
            expansion_disturbance_tolerance,
            tree.nqbxpanels,
            refine_flags,
            found_panel_to_refine,
            *tree.sources),
        range=slice(tree.nqbxcenters),
        queue=self.queue,
        wait_for=wait_for)

    cl.wait_for_events([evt])

    if debug:
        npanels_to_refine = n_flagged()
        if npanels_to_refine > npanels_to_refine_prev:
            n_new = npanels_to_refine - npanels_to_refine_prev
            logger.debug("refiner: found {} panel(s) to refine".format(n_new))

    return found_panel_to_refine.get()[0] == 1
def add_padding(kernel, variable, axis, align_bytes):
    """Round the stride of one array axis up to a multiple of *align_bytes*.

    :arg kernel: an object with an ``args`` list and a ``copy(args=...)``
        method.
    :arg variable: name of the argument in ``kernel.args`` to modify.
    :arg axis: index into that argument's ``dim_tags``.
    :arg align_bytes: the stride is rounded up to a multiple of this value.
    :returns: a copy of *kernel* in which the argument's dim tag for *axis*
        is a new :class:`FixedStrideArrayDimTag` carrying the rounded stride.
    :raises KeyError: if no argument is named *variable*.
    :raises RuntimeError: if the argument's ``dim_tags`` are unknown, if the
        axis is not tagged fixed-stride, or if the stride is not an
        :class:`int`.
    """
    arg_to_idx = {arg.name: i for i, arg in enumerate(kernel.args)}
    arg_idx = arg_to_idx[variable]

    # Work on a copy so that the argument list of *kernel* is left untouched.
    new_args = kernel.args[:]
    arg = new_args[arg_idx]

    if arg.dim_tags is None:
        raise RuntimeError("cannot add padding--dim_tags of '%s' "
                "are not known" % variable)

    new_dim_tags = list(arg.dim_tags)
    dim_tag = new_dim_tags[axis]

    from loopy.kernel.array import FixedStrideArrayDimTag
    if not isinstance(dim_tag, FixedStrideArrayDimTag):
        raise RuntimeError("cannot add padding--"
                "axis %d of '%s' is not tagged fixed-stride"
                % (axis, variable))

    stride = dim_tag.stride
    if not isinstance(stride, int):
        raise RuntimeError("cannot add padding--stride is not a "
                "known integer")

    from pytools import div_ceil
    new_dim_tags[axis] = FixedStrideArrayDimTag(
            div_ceil(stride, align_bytes) * align_bytes)

    new_args[arg_idx] = arg.copy(dim_tags=tuple(new_dim_tags))

    return kernel.copy(args=new_args)
def __call__(self, queue, tree, wait_for=None):
    """
    :arg queue: a :class:`pyopencl.CommandQueue`
    :arg tree: a :class:`boxtree.Tree`.
    :arg wait_for: either *None* or a list of :class:`pyopencl.Event`
        instances that must complete before this command starts
        executing.
    :returns: a tuple *(pl, event)*, where *pl* is an instance of
        :class:`PeerListLookup`, and *event* is a :class:`pyopencl.Event`
        for dependency management.
    """
    from pytools import div_ceil

    # Avoid generating too many kernels.
    max_levels = 10 * div_ceil(tree.nlevels, 10)

    finder = self.get_peer_list_finder_kernel(
            tree.dimensions, tree.coord_dtype, tree.box_id_dtype, max_levels)

    logger.info("peer list finder: find peer lists")

    result, evt = finder(
            queue, tree.nboxes,
            tree.box_centers.data, tree.root_extent,
            tree.box_levels.data, tree.aligned_nboxes,
            tree.box_child_ids.data, tree.box_flags.data,
            wait_for=wait_for)

    logger.info("peer list finder: done")

    lookup = PeerListLookup(
            tree=tree,
            peer_list_starts=result["peers"].starts,
            peer_lists=result["peers"].lists)

    return lookup.with_queue(None), evt
def add_padding(kernel, variable, axis, align_bytes):
    """Round the stride of one array axis up to a multiple of *align_bytes*.

    :arg kernel: an object with an ``args`` list and a ``copy(args=...)``
        method.
    :arg variable: name of the argument in ``kernel.args`` to modify.
    :arg axis: index into that argument's ``dim_tags``.
    :arg align_bytes: the stride is rounded up to a multiple of this value.
    :returns: a copy of *kernel* in which the argument's dim tag for *axis*
        is a new :class:`FixedStrideArrayDimTag` carrying the rounded stride.
    :raises KeyError: if no argument is named *variable*.
    :raises RuntimeError: if the argument's ``dim_tags`` are unknown, if the
        axis is not tagged fixed-stride, or if the stride is not an
        :class:`int`.
    """
    arg_to_idx = {arg.name: i for i, arg in enumerate(kernel.args)}
    arg_idx = arg_to_idx[variable]

    # Work on a copy so that the argument list of *kernel* is left untouched.
    new_args = kernel.args[:]
    arg = new_args[arg_idx]

    if arg.dim_tags is None:
        raise RuntimeError("cannot add padding--dim_tags of '%s' "
                "are not known" % variable)

    new_dim_tags = list(arg.dim_tags)
    dim_tag = new_dim_tags[axis]

    from loopy.kernel.array import FixedStrideArrayDimTag
    if not isinstance(dim_tag, FixedStrideArrayDimTag):
        raise RuntimeError("cannot add padding--"
                "axis %d of '%s' is not tagged fixed-stride"
                % (axis, variable))

    stride = dim_tag.stride
    if not isinstance(stride, int):
        raise RuntimeError("cannot add padding--stride is not a "
                "known integer")

    from pytools import div_ceil
    new_dim_tags[axis] = FixedStrideArrayDimTag(
            div_ceil(stride, align_bytes) * align_bytes)

    new_args[arg_idx] = arg.copy(dim_tags=tuple(new_dim_tags))

    return kernel.copy(args=new_args)
def check_sufficient_source_quadrature_resolution(self,
        stage2_density_discr, tree, peer_lists, refine_flags,
        debug, wait_for=None):
    """Run the source-quadrature-resolution checker over ``tree.nqbxsources``.

    *refine_flags* and a one-entry ``found_element_to_refine`` array are
    handed to the kernel.

    :returns: whether the single entry of ``found_element_to_refine`` equals
        1 after the kernel has finished.
    """
    actx = self.array_context

    def n_flagged():
        # Only used for debug logging.
        return actx.to_numpy(actx.np.sum(refine_flags)).item()

    # Avoid generating too many kernels.
    from pytools import div_ceil
    n_increments = div_ceil(tree.nlevels, MAX_LEVELS_INCREMENT)
    max_levels = MAX_LEVELS_INCREMENT * n_increments

    knl = self.code_container.sufficient_source_quadrature_resolution_checker(
            tree.dimensions,
            tree.coord_dtype, tree.box_id_dtype,
            peer_lists.peer_list_starts.dtype,
            tree.particle_id_dtype,
            max_levels)

    if debug:
        nelements_to_refine_prev = n_flagged()

    found_element_to_refine = actx.zeros(1, dtype=np.int32)
    found_element_to_refine.finish()

    from pytential import bind, sym
    dd = sym.as_dofdesc(sym.GRANULARITY_ELEMENT).to_stage2()
    radii_op = bind(
            stage2_density_discr,
            sym._source_danger_zone_radii(
                stage2_density_discr.ambient_dim, dofdesc=dd))
    source_danger_zone_radii_by_element = flatten(
            radii_op(self.array_context), self.array_context)

    unwrap_args = AreaQueryElementwiseTemplate.unwrap_args

    evt = knl(
        *unwrap_args(
            tree, peer_lists,
            tree.box_to_qbx_center_starts,
            tree.box_to_qbx_center_lists,
            tree.qbx_element_to_source_starts,
            tree.qbx_user_source_slice.start,
            tree.qbx_user_center_slice.start,
            tree.sorted_target_ids,
            source_danger_zone_radii_by_element,
            tree.nqbxelements,
            refine_flags,
            found_element_to_refine,
            *tree.sources),
        range=slice(tree.nqbxsources),
        queue=actx.queue,
        wait_for=wait_for)

    import pyopencl as cl
    cl.wait_for_events([evt])

    if debug:
        nelements_to_refine = n_flagged()
        if nelements_to_refine > nelements_to_refine_prev:
            logger.debug("refiner: found %d element(s) to refine",
                    nelements_to_refine - nelements_to_refine_prev)

    return actx.to_numpy(found_element_to_refine)[0] == 1
def check_expansion_disks_undisturbed_by_sources(self,
        lpot_source, tree, peer_lists,
        expansion_disturbance_tolerance,
        refine_flags,
        debug, wait_for=None):
    """Run the expansion-disk checker kernel over ``tree.nqbxcenters`` entries.

    *refine_flags* and a one-entry ``found_panel_to_refine`` array are handed
    to the kernel.

    :returns: whether the single entry of ``found_panel_to_refine`` equals 1
        after the kernel has finished.
    """

    def n_flagged():
        # Only used for debug logging.
        return cl.array.sum(refine_flags).get()

    # Avoid generating too many kernels.
    from pytools import div_ceil
    n_increments = div_ceil(tree.nlevels, MAX_LEVELS_INCREMENT)
    max_levels = MAX_LEVELS_INCREMENT * n_increments

    knl = self.code_container.expansion_disk_undisturbed_by_sources_checker(
            tree.dimensions,
            tree.coord_dtype, tree.box_id_dtype,
            peer_lists.peer_list_starts.dtype,
            tree.particle_id_dtype,
            max_levels)

    if debug:
        npanels_to_refine_prev = n_flagged()

    found_panel_to_refine = cl.array.zeros(self.queue, 1, np.int32)
    found_panel_to_refine.finish()
    unwrap_args = AreaQueryElementwiseTemplate.unwrap_args

    center_danger_zone_radii = lpot_source._expansion_radii("ncenters")

    evt = knl(
        *unwrap_args(
            tree, peer_lists,
            tree.box_to_qbx_source_starts,
            tree.box_to_qbx_source_lists,
            tree.qbx_panel_to_source_starts,
            tree.qbx_panel_to_center_starts,
            tree.qbx_user_source_slice.start,
            tree.qbx_user_center_slice.start,
            tree.sorted_target_ids,
            center_danger_zone_radii,
            expansion_disturbance_tolerance,
            tree.nqbxpanels,
            refine_flags,
            found_panel_to_refine,
            *tree.sources),
        range=slice(tree.nqbxcenters),
        queue=self.queue,
        wait_for=wait_for)

    cl.wait_for_events([evt])

    if debug:
        npanels_to_refine = n_flagged()
        if npanels_to_refine > npanels_to_refine_prev:
            n_new = npanels_to_refine - npanels_to_refine_prev
            logger.debug("refiner: found {} panel(s) to refine".format(n_new))

    return found_panel_to_refine.get()[0] == 1
def check_sufficient_source_quadrature_resolution(
        self, lpot_source, tree, peer_lists, refine_flags, debug,
        wait_for=None):
    """Run the source-quadrature-resolution checker over ``tree.nqbxsources``.

    *refine_flags* and a one-entry ``found_panel_to_refine`` array are handed
    to the kernel.

    :returns: whether the single entry of ``found_panel_to_refine`` equals 1
        after the kernel has finished.
    """

    def n_flagged():
        # Only used for debug logging.
        return cl.array.sum(refine_flags).get()

    # Avoid generating too many kernels.
    from pytools import div_ceil
    n_increments = div_ceil(tree.nlevels, MAX_LEVELS_INCREMENT)
    max_levels = MAX_LEVELS_INCREMENT * n_increments

    knl = self.code_container.sufficient_source_quadrature_resolution_checker(
            tree.dimensions,
            tree.coord_dtype, tree.box_id_dtype,
            peer_lists.peer_list_starts.dtype,
            tree.particle_id_dtype,
            max_levels)

    if debug:
        npanels_to_refine_prev = n_flagged()

    found_panel_to_refine = cl.array.zeros(self.queue, 1, np.int32)
    found_panel_to_refine.finish()

    from pytential import bind, sym
    radii_op = bind(
            lpot_source,
            sym._source_danger_zone_radii(
                lpot_source.ambient_dim,
                dofdesc=sym.GRANULARITY_ELEMENT))
    source_danger_zone_radii_by_panel = radii_op(self.queue)

    unwrap_args = AreaQueryElementwiseTemplate.unwrap_args

    evt = knl(
        *unwrap_args(
            tree, peer_lists,
            tree.box_to_qbx_center_starts,
            tree.box_to_qbx_center_lists,
            tree.qbx_panel_to_source_starts,
            tree.qbx_user_source_slice.start,
            tree.qbx_user_center_slice.start,
            tree.sorted_target_ids,
            source_danger_zone_radii_by_panel,
            tree.nqbxpanels,
            refine_flags,
            found_panel_to_refine,
            *tree.sources),
        range=slice(tree.nqbxsources),
        queue=self.queue,
        wait_for=wait_for)

    cl.wait_for_events([evt])

    if debug:
        npanels_to_refine = n_flagged()
        if npanels_to_refine > npanels_to_refine_prev:
            n_new = npanels_to_refine - npanels_to_refine_prev
            logger.debug("refiner: found {} panel(s) to refine".format(n_new))

    return found_panel_to_refine.get()[0] == 1
def test_mem_access_counter_reduction():
    """Check global load/store counts and byte totals for a ``sum`` reduction."""
    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                "c[i, j] = sum(k, a[i, k]*b[k, j])"
            ],
            name="matmul", assumptions="n,m,ell >= 1")
    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32, "b": np.float32})

    subgroup_size = 32

    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
                                    subgroup_size=subgroup_size)
    n, m, ell = 512, 256, 128
    params = {'n': n, 'm': m, 'ell': ell}

    n_workgroups = 1
    group_size = 1
    subgroups_per_group = div_ceil(group_size, subgroup_size)
    n_subgroups = n_workgroups*subgroups_per_group

    def global_access_count(dtype, direction, variable):
        # Evaluate the count of one stride-free global access at *params*.
        key = lp.MemAccess('global', dtype,
                           lid_strides={}, gid_strides={},
                           direction=direction, variable=variable,
                           count_granularity=CG.SUBGROUP)
        return mem_map[key].eval_with_dict(params)

    f32l = (global_access_count(np.float32, 'load', 'a')
            + global_access_count(np.float32, 'load', 'b'))

    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
    assert f32l == (2*n*m*ell)*n_subgroups

    f32s = global_access_count(np.dtype(np.float32), 'store', 'c')

    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
    assert f32s == (n*ell)*n_subgroups

    loads = mem_map.filter_by(mtype=['global'], direction=['load'])
    ld_bytes = loads.to_bytes().eval_and_sum(params)
    stores = mem_map.filter_by(mtype=['global'], direction=['store'])
    st_bytes = stores.to_bytes().eval_and_sum(params)

    # All accesses are float32, so the byte totals are four times the counts.
    assert ld_bytes == 4*f32l
    assert st_bytes == 4*f32s
def get_dev_group_size(device):
    """Return a power-of-two work-group size for *device*.

    Starts from ``device.max_work_group_size`` (overridden to 64 when the
    device name contains ``"RV770"``), limits it by
    ``div_ceil(max_work_group_size, out_type_size)`` and feeds the result
    through ``bitlog2``. ``out_type_size`` is not defined in this function;
    it is taken from the surrounding scope.
    """
    wg_limit = device.max_work_group_size

    # dirty fix for the RV770 boards
    if "RV770" in device.name:
        wg_limit = 64

    # compute lmem limit
    from pytools import div_ceil
    lmem_limit = div_ceil(wg_limit, out_type_size)

    # round down to power of 2
    from pyopencl.tools import bitlog2
    return 2**bitlog2(min(wg_limit, lmem_limit))
def find_padding_multiple(kernel, variable, axis, align_bytes, allowed_waste=0.1):
    """Return the smallest multiple of an axis stride that pads cheaply.

    Successive multiples ``1, 2, 3, ...`` of the stride of axis *axis* of
    *variable* are rounded up to a multiple of *align_bytes*; the first
    multiple for which the relative size increase is at most *allowed_waste*
    is returned.

    :arg kernel: a :class:`LoopKernel`, or a :class:`TranslationUnit` whose
        callables table contains exactly one :class:`CallableKernel`.
    :raises LoopyError: if a :class:`TranslationUnit` does not contain
        exactly one kernel.
    :raises RuntimeError: if the argument's ``dim_tags`` are unknown, if the
        axis is not tagged fixed-stride, or if the stride is not an
        :class:`int`.
    """
    if isinstance(kernel, TranslationUnit):
        kernel_names = [name
                for name, clbl in kernel.callables_table.items()
                if isinstance(clbl, CallableKernel)]

        # Without exactly one kernel there is no unambiguous target; say so
        # instead of raising a message-less error (or an IndexError when the
        # translation unit holds no kernel at all).
        if len(kernel_names) != 1:
            raise LoopyError("find_padding_multiple requires a translation "
                    "unit containing exactly one kernel, found %d"
                    % len(kernel_names))

        return find_padding_multiple(kernel[kernel_names[0]], variable, axis,
                align_bytes, allowed_waste)

    assert isinstance(kernel, LoopKernel)

    arg = kernel.arg_dict[variable]

    if arg.dim_tags is None:
        raise RuntimeError("cannot find padding multiple--dim_tags of '%s' "
                "are not known" % variable)

    dim_tag = arg.dim_tags[axis]

    from loopy.kernel.array import FixedStrideArrayDimTag
    if not isinstance(dim_tag, FixedStrideArrayDimTag):
        raise RuntimeError("cannot find padding multiple--"
                "axis %d of '%s' is not tagged fixed-stride"
                % (axis, variable))

    stride = dim_tag.stride

    if not isinstance(stride, int):
        raise RuntimeError("cannot find padding multiple--stride is not a "
                "known integer")

    from pytools import div_ceil

    multiple = 1
    while True:
        true_size = multiple * stride
        padded_size = div_ceil(true_size, align_bytes) * align_bytes

        if (padded_size - true_size) / true_size <= allowed_waste:
            return multiple

        multiple += 1
def test_op_counter_specialops():
    """Check counts of modulo, power and function-call operations."""
    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                """
                c[i, j, k] = (2*a[i,j,k])%(2+b[i,j,k]/3.0)
                e[i, k] = (1+g[i,k])**(1+h[i,k+1])+rsqrt(g[i,k])*sin(g[i,k])
                """
            ],
            name="specialops", assumptions="n,m,ell >= 1")

    arg_dtypes = {
            "a": np.float32, "b": np.float32,
            "g": np.float64, "h": np.float64}
    knl = lp.add_and_infer_dtypes(knl, arg_dtypes)
    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True,
                           count_within_subscripts=True)

    n_workgroups = 1
    group_size = 1
    n_subgroups = n_workgroups * div_ceil(group_size, SGS)

    n, m, ell = 512, 256, 128
    params = {'n': n, 'm': m, 'ell': ell}

    def count(dtype, op_name):
        # Look up one (dtype, operation) entry and evaluate it at *params*.
        return op_map[lp.Op(dtype, op_name, CG.SUBGROUP)].eval_with_dict(params)

    f32mul = count(np.float32, 'mul')
    f32div = count(np.float32, 'div')
    f32add = count(np.float32, 'add')
    f64pow = count(np.float64, 'pow')
    f64add = count(np.dtype(np.float64), 'add')
    i32add = count(np.dtype(np.int32), 'add')
    f64rsq = count(np.dtype(np.float64), 'func:rsqrt')
    f64sin = count(np.dtype(np.float64), 'func:sin')

    # (count-per-sub-group)*n_subgroups
    per_3d_point = n * m * ell * n_subgroups
    per_2d_point = n * m * n_subgroups

    assert f32div == 2 * per_3d_point
    assert f32mul == f32add
    assert f32add == per_3d_point
    assert f64add == 3 * per_2d_point
    assert f64pow == i32add == f64rsq == f64sin
    assert f64sin == per_2d_point
def test_mem_access_counter_logic():
    """Check global load/store counts for a kernel using ``if(...)``."""
    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                """
                e[i,k] = if(not(k<ell-2) and k>6 or k/2==ell, g[i,k]*2, g[i,k]+h[i,k]/2)
                """
            ],
            name="logic", assumptions="n,m,ell >= 1")
    knl = lp.add_and_infer_dtypes(knl, {"g": np.float32, "h": np.float64})

    subgroup_size = 32

    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
                                    subgroup_size=subgroup_size)
    n, m, ell = 512, 256, 128
    params = {'n': n, 'm': m, 'ell': ell}

    n_workgroups = 1
    group_size = 1
    subgroups_per_group = div_ceil(group_size, subgroup_size)
    n_subgroups = n_workgroups*subgroups_per_group

    # Collapse the map so that entries differ only by these three fields.
    reduced_map = mem_map.group_by('mtype', 'dtype', 'direction')

    def global_count(np_dtype, direction):
        key = lp.MemAccess('global', to_loopy_type(np_dtype),
                           direction=direction)
        return reduced_map[key].eval_with_dict(params)

    f32_g_l = global_count(np.float32, 'load')
    f64_g_l = global_count(np.float64, 'load')
    f64_g_s = global_count(np.float64, 'store')

    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
    assert f32_g_l == (2*n*m)*n_subgroups
    assert f64_g_l == (n*m)*n_subgroups
    assert f64_g_s == (n*m)*n_subgroups
def test_op_counter_bitwise():
    """Check counts of bitwise, shift and integer arithmetic operations."""
    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                """
                c[i, j, k] = (a[i,j,k] | 1) + (b[i,j,k] & 1)
                e[i, k] = (g[i,k] ^ k)*(~h[i,k+1]) + (g[i, k] << (h[i,k] >> k))
                """
            ],
            name="bitwise", assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(
            knl, dict(
                a=np.int32, b=np.int32,
                g=np.int64, h=np.int64))

    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)

    n_workgroups = 1
    group_size = 1
    subgroups_per_group = div_ceil(group_size, SGS)
    n_subgroups = n_workgroups*subgroups_per_group

    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}

    def count(dtype, op_name):
        # Look up one (dtype, operation) entry and evaluate it at *params*.
        return op_map[lp.Op(dtype, op_name, CG.SUBGROUP)].eval_with_dict(params)

    i32add = count(np.int32, 'add')
    i32bw = count(np.int32, 'bw')
    i64bw = count(np.dtype(np.int64), 'bw')
    i64mul = count(np.dtype(np.int64), 'mul')
    i64add = count(np.dtype(np.int64), 'add')
    i64shift = count(np.dtype(np.int64), 'shift')

    # (count-per-sub-group)*n_subgroups
    # Both int32 'add' contributions are looked up at sub-group granularity,
    # so the whole sum is scaled by n_subgroups, not just its second term.
    assert i32add == (n*m + n*m*ell)*n_subgroups
    assert i32bw == 2*n*m*ell*n_subgroups
    assert i64bw == 2*n*m*n_subgroups
    assert i64add == i64mul == n*m*n_subgroups
    assert i64shift == 2*n*m*n_subgroups
def __call__(self, queue, tree, wait_for=None):
    """
    :arg queue: a :class:`pyopencl.CommandQueue`
    :arg tree: a :class:`boxtree.Tree`.
    :arg wait_for: either *None* or a list of :class:`pyopencl.Event`
        instances that must complete before this command starts
        executing.
    :returns: a tuple *(pl, event)*, where *pl* is an instance of
        :class:`PeerListLookup`, and *event* is a :class:`pyopencl.Event`
        for dependency management.
    """
    from pytools import div_ceil

    # Round up level count--this gets included in the kernel as
    # a stack bound. Rounding avoids too many kernel versions.
    max_levels = 10 * div_ceil(tree.nlevels, 10)

    finder = self.get_peer_list_finder_kernel(
            tree.dimensions, tree.coord_dtype, tree.box_id_dtype, max_levels)

    pl_plog = ProcessLogger(logger, "find peer lists")

    result, evt = finder(
            queue, tree.nboxes,
            tree.box_centers.data, tree.root_extent,
            tree.box_levels, tree.aligned_nboxes,
            tree.box_child_ids.data, tree.box_flags,
            wait_for=wait_for)

    pl_plog.done()

    lookup = PeerListLookup(
            tree=tree,
            peer_list_starts=result["peers"].starts,
            peer_lists=result["peers"].lists)

    return lookup.with_queue(None), evt
def convert_computed_to_fixed_dim_tags(name, num_user_axes, num_target_axes,
        shape, dim_tags):
    """Replace computed-stride dim tags by fixed-stride ones.

    :arg name: array name, used only in error messages.
    :arg num_user_axes: not used by this function.
    :arg num_target_axes: number of target axes to process.
    :arg shape: the array shape, *None*, or ``lp.auto``.
    :arg dim_tags: a sliceable sequence of array dim tags.
    :returns: a copy of *dim_tags* in which every
        :class:`ComputedStrideArrayDimTag` with a layout nesting level has
        been replaced by a :class:`FixedStrideArrayDimTag`, or *None* if a
        needed shape is not known.
    :raises LoopyError: for more than one vector-tagged axis, an unknown dim
        tag type, or a computed stride nested outside an unknown stride.
    :raises TypeError: if the shape along the vector axis is not an integer.
    """
    # Just to clarify:
    #
    # - user axes are user-facing--what the user actually uses for indexing.
    #
    # - target axes are implementation facing. Normal in-memory arrays have one.
    #   3D images have three.

    import loopy as lp

    # {{{ pick apart arg dim tags into computed, fixed and vec

    vector_dim = None

    # a mapping from target axes to {layout_nesting_level: dim_tag_index}
    target_axis_to_nesting_level_map = {}

    for i, dim_tag in enumerate(dim_tags):
        if isinstance(dim_tag, VectorArrayDimTag):
            if vector_dim is not None:
                raise LoopyError("arg '%s' may only have one vector-tagged "
                        "argument dimension" % name)

            vector_dim = i

        elif isinstance(dim_tag, _StrideArrayDimTagBase):
            if dim_tag.layout_nesting_level is None:
                continue

            nl_map = target_axis_to_nesting_level_map \
                    .setdefault(dim_tag.target_axis, {})
            assert dim_tag.layout_nesting_level not in nl_map
            nl_map[dim_tag.layout_nesting_level] = i

        elif isinstance(dim_tag, SeparateArrayArrayDimTag):
            pass

        else:
            raise LoopyError("invalid array dim tag")

    # }}}

    # {{{ convert computed to fixed stride dim tags

    new_dim_tags = dim_tags[:]

    for target_axis in range(num_target_axes):
        if vector_dim is None:
            stride_so_far = 1
        else:
            if shape is None or shape is lp.auto:
                # unable to normalize without known shape
                return None

            if not is_integer(shape[vector_dim]):
                raise TypeError("shape along vector axis %d of array '%s' "
                        "must be an integer, not an expression ('%s')"
                        % (vector_dim, name, shape[vector_dim]))

            stride_so_far = shape[vector_dim]
            # FIXME: OpenCL-specific
            if stride_so_far == 3:
                stride_so_far = 4

        nesting_level_map = target_axis_to_nesting_level_map.get(target_axis, {})
        nl_keys = sorted(nesting_level_map.keys())

        if not nl_keys:
            continue

        # Walk the axes in order of increasing nesting level, accumulating
        # the stride as we go.
        for key in nl_keys:
            dim_tag_index = nesting_level_map[key]
            dim_tag = dim_tags[dim_tag_index]

            if isinstance(dim_tag, ComputedStrideArrayDimTag):
                if stride_so_far is None:
                    raise LoopyError("unable to determine fixed stride "
                            "for axis %d because it is nested outside of "
                            "an 'auto' stride axis"
                            % dim_tag_index)

                new_dim_tags[dim_tag_index] = FixedStrideArrayDimTag(
                        stride_so_far,
                        target_axis=dim_tag.target_axis,
                        layout_nesting_level=dim_tag.layout_nesting_level)

                if shape is None or shape is lp.auto:
                    # unable to normalize without known shape
                    return None

                shape_axis = shape[dim_tag_index]
                if shape_axis is None:
                    stride_so_far = None
                else:
                    stride_so_far *= shape_axis

                # Round the accumulated stride up to the next multiple of
                # pad_to. (Multiplying by stride_so_far here would not yield
                # a multiple of pad_to.) An unknown stride stays unknown.
                if dim_tag.pad_to is not None and stride_so_far is not None:
                    from pytools import div_ceil
                    stride_so_far = (
                            div_ceil(stride_so_far, dim_tag.pad_to)
                            * dim_tag.pad_to)

            elif isinstance(dim_tag, FixedStrideArrayDimTag):
                stride_so_far = dim_tag.stride

                if stride_so_far is lp.auto:
                    stride_so_far = None

            else:
                raise TypeError("internal error in dim_tag conversion")

    # }}}

    return new_dim_tags
def convert_computed_to_fixed_dim_tags(name, num_user_axes, num_target_axes,
        shape, dim_tags):
    """Replace computed-stride dim tags by fixed-stride ones.

    :arg name: array name, used only in error messages.
    :arg num_user_axes: not used by this function.
    :arg num_target_axes: number of target axes to process.
    :arg shape: the array shape, *None*, or ``lp.auto``.
    :arg dim_tags: a sliceable sequence of array dim tags.
    :returns: a copy of *dim_tags* in which every
        :class:`ComputedStrideArrayDimTag` with a layout nesting level has
        been replaced by a :class:`FixedStrideArrayDimTag`, or *None* if a
        needed shape is not known.
    :raises LoopyError: for more than one vector-tagged axis, an unknown dim
        tag type, or a computed stride nested outside an unknown stride.
    :raises TypeError: if the shape along the vector axis is not an integer.
    """
    # Just to clarify:
    #
    # - user axes are user-facing--what the user actually uses for indexing.
    #
    # - target axes are implementation facing. Normal in-memory arrays have one.
    #   3D images have three.

    import loopy as lp

    # {{{ pick apart arg dim tags into computed, fixed and vec

    vector_dim = None

    # a mapping from target axes to {layout_nesting_level: dim_tag_index}
    target_axis_to_nesting_level_map = {}

    for i, dim_tag in enumerate(dim_tags):
        if isinstance(dim_tag, VectorArrayDimTag):
            if vector_dim is not None:
                raise LoopyError("arg '%s' may only have one vector-tagged "
                                 "argument dimension" % name)

            vector_dim = i

        elif isinstance(dim_tag, _StrideArrayDimTagBase):
            if dim_tag.layout_nesting_level is None:
                continue

            nl_map = target_axis_to_nesting_level_map \
                .setdefault(dim_tag.target_axis, {})
            assert dim_tag.layout_nesting_level not in nl_map
            nl_map[dim_tag.layout_nesting_level] = i

        elif isinstance(dim_tag, SeparateArrayArrayDimTag):
            pass

        else:
            raise LoopyError("invalid array dim tag")

    # }}}

    # {{{ convert computed to fixed stride dim tags

    new_dim_tags = dim_tags[:]

    for target_axis in range(num_target_axes):
        if vector_dim is None:
            stride_so_far = 1
        else:
            if shape is None or shape is lp.auto:
                # unable to normalize without known shape
                return None

            if not is_integer(shape[vector_dim]):
                raise TypeError(
                    "shape along vector axis %d of array '%s' "
                    "must be an integer, not an expression ('%s')" %
                    (vector_dim, name, shape[vector_dim]))

            stride_so_far = shape[vector_dim]
            # FIXME: OpenCL-specific
            if stride_so_far == 3:
                stride_so_far = 4

        nesting_level_map = target_axis_to_nesting_level_map.get(
            target_axis, {})
        nl_keys = sorted(nesting_level_map.keys())

        if not nl_keys:
            continue

        # Walk the axes in order of increasing nesting level, accumulating
        # the stride as we go.
        for key in nl_keys:
            dim_tag_index = nesting_level_map[key]
            dim_tag = dim_tags[dim_tag_index]

            if isinstance(dim_tag, ComputedStrideArrayDimTag):
                if stride_so_far is None:
                    raise LoopyError(
                        "unable to determine fixed stride "
                        "for axis %d because it is nested outside of "
                        "an 'auto' stride axis" % dim_tag_index)

                new_dim_tags[dim_tag_index] = FixedStrideArrayDimTag(
                    stride_so_far,
                    target_axis=dim_tag.target_axis,
                    layout_nesting_level=dim_tag.layout_nesting_level)

                if shape is None or shape is lp.auto:
                    # unable to normalize without known shape
                    return None

                shape_axis = shape[dim_tag_index]
                if shape_axis is None:
                    stride_so_far = None
                else:
                    stride_so_far *= shape_axis

                # Round the accumulated stride up to the next multiple of
                # pad_to. (Multiplying by stride_so_far here would not yield
                # a multiple of pad_to.) An unknown stride stays unknown.
                if dim_tag.pad_to is not None and stride_so_far is not None:
                    from pytools import div_ceil
                    stride_so_far = (div_ceil(stride_so_far, dim_tag.pad_to) *
                                     dim_tag.pad_to)

            elif isinstance(dim_tag, FixedStrideArrayDimTag):
                stride_so_far = dim_tag.stride

                if stride_so_far is lp.auto:
                    stride_so_far = None

            else:
                raise TypeError("internal error in dim_tag conversion")

    # }}}

    return new_dim_tags
def split_array_dim(kernel, arrays_and_axes, count, auto_split_inames=True,
        split_kwargs=None):
    """
    Split one axis of each named array into an outer axis and an inner axis
    of length *count*, and rewrite all accesses to those arrays accordingly.

    :arg arrays_and_axes: a list of tuples *(array, axis_nr)* indicating
        that the index in *axis_nr* should be split. The tuples may
        also be *(array, axis_nr, "F")*, indicating that the index will
        be split as it would be according to Fortran order.

        *array* may name a temporary variable or an argument.

        If *arrays_and_axes* is a :class:`tuple`, it is automatically
        wrapped in a list, to make single splits easier.

    :arg count: The group size to use in the split. If *count* is 1,
        *kernel* is returned unchanged.
    :arg auto_split_inames: Whether to automatically split inames
        encountered in the specified indices.
    :arg split_kwargs: arguments to pass to :func:`loopy.split_iname`

    :raises RuntimeError: if an entry of *arrays_and_axes* is malformed, if
        the same array is listed more than once, if an order other than
        ``"C"`` or ``"F"`` is given, if an array's ``dim_tags`` are unknown
        or the axis is not tagged fixed-stride, or (with *auto_split_inames*)
        if an access index along a split axis is not a single variable.

    Note that splits on the corresponding inames are carried out implicitly.
    The inames may *not* be split beforehand. (There's no *really* good reason
    for this--this routine is just not smart enough to deal with this.)
    """

    if count == 1:
        return kernel

    if split_kwargs is None:
        split_kwargs = {}

    # {{{ process input into array_to_rest

    # where "rest" is the non-argument-name part of the input tuples
    # in args_and_axes
    def normalize_rest(rest):
        if len(rest) == 1:
            return (rest[0], "C")
        elif len(rest) == 2:
            return rest
        else:
            # *rest* is a tuple: wrap it so that %-formatting prints it as a
            # whole instead of failing with a TypeError.
            raise RuntimeError("split instruction '%s' not understood"
                    % (rest,))

    if isinstance(arrays_and_axes, tuple):
        arrays_and_axes = [arrays_and_axes]

    array_to_rest = {
        tup[0]: normalize_rest(tup[1:]) for tup in arrays_and_axes}

    if len(arrays_and_axes) != len(array_to_rest):
        raise RuntimeError("cannot split multiple axes of the same variable")

    del arrays_and_axes

    # }}}

    def insert_outer(seq, axis_nr, order, outer_item):
        # Insert the entry for the new outer axis next to the (now inner)
        # axis: after it for Fortran order, before it for C order.
        if order == "F":
            seq.insert(axis_nr + 1, outer_item)
        elif order == "C":
            seq.insert(axis_nr, outer_item)
        else:
            raise RuntimeError("order '%s' not understood" % order)

    # {{{ adjust arrays

    from pytools import div_ceil
    from loopy.kernel.array import FixedStrideArrayDimTag
    from loopy.kernel.tools import ArrayChanger

    for array_name, (axis, order) in array_to_rest.items():
        achng = ArrayChanger(kernel, array_name)
        ary = achng.get()

        # {{{ adjust shape

        new_shape = ary.shape
        if new_shape is not None:
            new_shape = list(new_shape)
            axis_len = new_shape[axis]
            new_shape[axis] = count
            insert_outer(new_shape, axis, order, div_ceil(axis_len, count))
            new_shape = tuple(new_shape)

        # }}}

        # {{{ adjust dim tags

        if ary.dim_tags is None:
            raise RuntimeError("dim_tags of '%s' are not known" % array_name)
        new_dim_tags = list(ary.dim_tags)

        old_dim_tag = ary.dim_tags[axis]

        if not isinstance(old_dim_tag, FixedStrideArrayDimTag):
            raise RuntimeError("axis %d of '%s' is not tagged fixed-stride"
                    % (axis, array_name))

        # The inner axis keeps its tag; the outer axis steps over *count*
        # inner entries at a time.
        outer_stride = count * old_dim_tag.stride
        insert_outer(new_dim_tags, axis, order,
                FixedStrideArrayDimTag(outer_stride))
        new_dim_tags = tuple(new_dim_tags)

        # }}}

        # {{{ adjust dim_names

        new_dim_names = ary.dim_names
        if new_dim_names is not None:
            new_dim_names = list(new_dim_names)
            existing_name = new_dim_names[axis]
            new_dim_names[axis] = existing_name + "_inner"
            insert_outer(new_dim_names, axis, order, existing_name + "_outer")
            new_dim_names = tuple(new_dim_names)

        # }}}

        kernel = achng.with_changed_array(
            ary.copy(shape=new_shape, dim_tags=new_dim_tags,
                     dim_names=new_dim_names))

    # }}}

    split_vars = {}

    var_name_gen = kernel.get_var_name_generator()

    def split_access_axis(expr):
        # Use the axis registered for *this* array. (The loop variable of
        # the array-adjustment loop above only holds the axis of whichever
        # array was processed last.)
        axis_nr, order = array_to_rest[expr.aggregate.name]

        idx = expr.index
        if not isinstance(idx, tuple):
            idx = (idx, )
        idx = list(idx)

        axis_idx = idx[axis_nr]

        if auto_split_inames:
            from pymbolic.primitives import Variable
            if not isinstance(axis_idx, Variable):
                raise RuntimeError(
                    "found access '%s' in which axis %d is not a "
                    "single variable--cannot split "
                    "(Have you tried to do the split yourself, manually, "
                    "beforehand? If so, you shouldn't.)" % (expr, axis_nr))

            split_iname = axis_idx.name
            assert split_iname in kernel.all_inames()

            # Reuse the outer/inner names if this iname was seen before.
            try:
                outer_iname, inner_iname = split_vars[split_iname]
            except KeyError:
                outer_iname = var_name_gen(split_iname + "_outer")
                inner_iname = var_name_gen(split_iname + "_inner")
                split_vars[split_iname] = outer_iname, inner_iname

            inner_index = Variable(inner_iname)
            outer_index = Variable(outer_iname)

        else:
            from loopy.symbolic import simplify_using_aff
            inner_index = simplify_using_aff(kernel, axis_idx % count)
            outer_index = simplify_using_aff(kernel, axis_idx // count)

        idx[axis_nr] = inner_index
        insert_outer(idx, axis_nr, order, outer_index)

        return expr.aggregate.index(tuple(idx))

    rule_mapping_context = SubstitutionRuleMappingContext(
        kernel.substitutions, var_name_gen)
    aash = ArrayAxisSplitHelper(rule_mapping_context,
                                set(array_to_rest.keys()), split_access_axis)
    kernel = rule_mapping_context.finish_kernel(aash.map_kernel(kernel))

    if auto_split_inames:
        from loopy import split_iname
        for iname, (outer_iname, inner_iname) in split_vars.items():
            kernel = split_iname(kernel, iname, count,
                                 outer_iname=outer_iname,
                                 inner_iname=inner_iname,
                                 **split_kwargs)

    return kernel
def mark_panels_for_refinement(self, tree, peer_lists, lpot_source,
                               target_status, refine_flags, debug,
                               wait_for=None):
    """Run the panel-marking kernel used after a failed target association.

    A space invader query over the QBX sources is run first. Its per-box
    search distances are then passed, together with the tree data, to the
    kernel obtained from
    ``code_container.refiner_for_failed_target_association``, which is
    launched over ``tree.nqbxtargets`` work items.

    :arg wait_for: forwarded to the space invader query; the marking kernel
        waits on the event returned by that query.
    :arg debug: if true, the sum of *refine_flags* is logged at debug level
        as the number of flagged panels.
    :returns: the host-side value of ``(found_panel_to_refine == 1).all()``,
        where *found_panel_to_refine* is a one-entry ``int32`` array that is
        zero-initialized here and handed to the kernel.
    """
    from pytools import div_ceil

    # The level count is baked into the kernel as a stack bound. Rounding it
    # up to a multiple of ten keeps the number of kernel versions small.
    max_levels = 10 * div_ceil(tree.nlevels, 10)

    refine_knl = self.code_container.refiner_for_failed_target_association(
        tree.dimensions,
        tree.coord_dtype,
        tree.box_id_dtype,
        peer_lists.peer_list_starts.dtype,
        tree.particle_id_dtype,
        max_levels)

    found_panel_to_refine = cl.array.zeros(self.queue, 1, np.int32)
    found_panel_to_refine.finish()

    # Space invader query over the sources. The algorithm is described in
    # the comment tagged (TGTMARK) in mark_targets.
    source_slice = tree.user_source_ids[tree.qbx_user_source_slice]
    sources = [
        axis.with_queue(self.queue)[source_slice]
        for axis in tree.sources]
    query_radii = lpot_source._close_target_tunnel_radius("nsources")
    query_radii = query_radii.with_queue(self.queue)

    run_space_invader_query = self.code_container.space_invader_query()
    box_to_search_dist, query_evt = run_space_invader_query(
        self.queue, tree, sources, query_radii, peer_lists,
        wait_for=wait_for)

    kernel_args = unwrap_args(
        tree, peer_lists,
        tree.box_to_qbx_source_starts,
        tree.box_to_qbx_source_lists,
        tree.qbx_panel_to_source_starts,
        tree.qbx_user_source_slice.start,
        tree.qbx_user_target_slice.start,
        tree.nqbxpanels,
        tree.sorted_target_ids,
        lpot_source._close_target_tunnel_radius("nsources"),
        target_status,
        box_to_search_dist,
        refine_flags,
        found_panel_to_refine,
        *tree.sources)

    mark_evt = refine_knl(
        *kernel_args,
        range=slice(tree.nqbxtargets),
        queue=self.queue,
        wait_for=[query_evt])

    if debug:
        refine_flags.finish()
        # A marked panel carries a 1, every other panel a 0.
        marked_panel_count = cl.array.sum(refine_flags).get()
        logger.debug("target association: {} panels flagged for refinement"
                     .format(marked_panel_count))

    cl.wait_for_events([mark_evt])

    return (found_panel_to_refine == 1).all().get()
def mark_targets(self, tree, peer_lists, lpot_source, target_status,
                 debug, wait_for=None):
    """Run the target-marking kernel (see TGTMARK below).

    A space invader query over the QBX sources yields a search distance per
    box. The kernel obtained from ``code_container.target_marker`` then
    receives these distances and is launched over ``tree.nqbxtargets`` work
    items.

    :arg wait_for: forwarded to the space invader query; the marking kernel
        waits on the event returned by that query.
    :arg debug: if true, the sum of *target_status* is logged at debug level
        as the number of marked targets.
    :returns: the host-side value of
        ``(found_target_close_to_panel == 1).all()``, where
        *found_target_close_to_panel* is a one-entry ``int32`` array that is
        zero-initialized here and handed to the kernel.
    """
    from pytools import div_ceil

    # The level count is baked into the kernel as a stack bound. Rounding it
    # up to a multiple of ten keeps the number of kernel versions small.
    max_levels = 10 * div_ceil(tree.nlevels, 10)

    marker_knl = self.code_container.target_marker(
        tree.dimensions,
        tree.coord_dtype,
        tree.box_id_dtype,
        peer_lists.peer_list_starts.dtype,
        tree.particle_id_dtype,
        max_levels)

    found_target_close_to_panel = cl.array.zeros(self.queue, 1, np.int32)
    found_target_close_to_panel.finish()

    # Space invader query over the sources.
    source_slice = tree.sorted_target_ids[tree.qbx_user_source_slice]
    sources = [
        axis.with_queue(self.queue)[source_slice]
        for axis in tree.sources]
    query_radii = lpot_source._close_target_tunnel_radius("nsources")
    query_radii = query_radii.with_queue(self.queue)

    # Target-marking algorithm (TGTMARK):
    #
    # (1) A space invader query tags every leaf box that intersects the
    #     "near-source-detection tunnel" with the distance to the closest
    #     source.
    #
    # (2) An area query around each target, using the radius produced by
    #     the space invader query, enumerates the sources in that vicinity.
    #     If a source is found whose distance to the target is less than
    #     the source's tunnel radius, the target is marked as pending.
    #     (In mark_panels_for_refinement, the source is marked for
    #     refinement instead.)
    #
    # NOTE: mark_panels_for_refinement refers to this comment by the tag
    # "TGTMARK". When removing the comment or changing the algorithm, keep
    # that reference accurate.
    #
    # Trade-offs of space invaders versus directly tagging targets in
    # endangered boxes:
    #
    # (-) More complicated
    # (-) More actual work
    # (+) Taking the targets' point of view could potentially expose more
    #     parallelism, if the targets are thought of as unbounded while the
    #     sources are fixed (which sort of makes sense, given that the
    #     number of targets per box is not bounded).

    run_space_invader_query = self.code_container.space_invader_query()
    box_to_search_dist, query_evt = run_space_invader_query(
        self.queue, tree, sources, query_radii, peer_lists,
        wait_for=wait_for)

    # The kernel receives the radii as returned by lpot_source, i.e. without
    # the with_queue() applied for the query above.
    kernel_radii = lpot_source._close_target_tunnel_radius("nsources")

    kernel_args = unwrap_args(
        tree, peer_lists,
        tree.box_to_qbx_source_starts,
        tree.box_to_qbx_source_lists,
        tree.qbx_user_source_slice.start,
        tree.qbx_user_target_slice.start,
        tree.sorted_target_ids,
        kernel_radii,
        box_to_search_dist,
        target_status,
        found_target_close_to_panel,
        *tree.sources)

    mark_evt = marker_knl(
        *kernel_args,
        range=slice(tree.nqbxtargets),
        queue=self.queue,
        wait_for=[query_evt])

    if debug:
        target_status.finish()
        # A marked target carries a 1, every other target a 0.
        marked_target_count = cl.array.sum(target_status).get()
        logger.debug("target association: {}/{} targets marked close to panels"
                     .format(marked_target_count, tree.nqbxtargets))

    cl.wait_for_events([mark_evt])

    return (found_target_close_to_panel == 1).all().get()
def test_mem_access_counter_basic():
    """Check sub-group-granularity global load/store counts reported by
    ``lp.get_mem_access_map`` for a two-instruction kernel."""
    knl = lp.make_kernel(
            "[n,m,ell] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                """
                c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
                e[i, k] = g[i,k]*h[i,k+1]
                """
            ],
            name="basic", assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(
            knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))

    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
                                    subgroup_size=SGS)

    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}

    n_workgroups = 1
    group_size = 1
    subgroups_per_group = div_ceil(group_size, SGS)
    n_subgroups = n_workgroups*subgroups_per_group

    def subgroup_count(dtype, direction, variable):
        # Evaluate the count of one global access with empty stride maps.
        key = lp.MemAccess('global', dtype,
                           lid_strides={}, gid_strides={},
                           direction=direction, variable=variable,
                           count_granularity=CG.SUBGROUP)
        return mem_map[key].eval_with_dict(params)

    # {{{ loads

    f32l = (subgroup_count(np.float32, 'load', 'a')
            + subgroup_count(np.float32, 'load', 'b'))
    f64l = (subgroup_count(np.float64, 'load', 'g')
            + subgroup_count(np.float64, 'load', 'h'))

    # uniform: (count-per-sub-group)*n_subgroups
    assert f32l == (3*n*m*ell)*n_subgroups
    assert f64l == (2*n*m)*n_subgroups

    # }}}

    # {{{ stores

    f32s = subgroup_count(np.dtype(np.float32), 'store', 'c')
    f64s = subgroup_count(np.dtype(np.float64), 'store', 'e')

    # uniform: (count-per-sub-group)*n_subgroups
    assert f32s == (n*m*ell)*n_subgroups
    assert f64s == (n*m)*n_subgroups

    # }}}
def test_summations_and_filters():
    """Exercise ``filter_by``, ``filter_by_func``, ``group_by``,
    ``to_bytes`` and ``eval_and_sum`` on the memory-access and operation
    maps of a two-instruction kernel."""
    knl = lp.make_kernel(
            "[n,m,ell] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                """
                c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
                e[i, k+1] = -g[i,k]*h[i,k+1]
                """
            ],
            name="basic", assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(
            knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))

    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}

    n_workgroups = 1
    group_size = 1
    subgroups_per_group = div_ceil(group_size, SGS)
    n_subgroups = n_workgroups*subgroups_per_group

    # {{{ memory access map: filtering and summation

    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
                                    subgroup_size=SGS)

    a_load_filter = mem_map.filter_by(direction=['load'], variable=['a'],
                                      count_granularity=[CG.SUBGROUP])
    loads_a = a_load_filter.eval_and_sum(params)

    # uniform: (count-per-sub-group)*n_subgroups
    assert loads_a == (2*n*m*ell)*n_subgroups

    global_store_filter = mem_map.filter_by(mtype=['global'],
                                            direction=['store'],
                                            count_granularity=[CG.SUBGROUP])
    global_stores = global_store_filter.eval_and_sum(params)

    # uniform: (count-per-sub-group)*n_subgroups
    assert global_stores == (n*m*ell + n*m)*n_subgroups

    ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load'],
                                 count_granularity=[CG.SUBGROUP]
                                 ).to_bytes().eval_and_sum(params)
    st_bytes = mem_map.filter_by(mtype=['global'], direction=['store'],
                                 count_granularity=[CG.SUBGROUP]
                                 ).to_bytes().eval_and_sum(params)

    # uniform: (count-per-sub-group)*n_subgroups
    assert ld_bytes == (4*n*m*ell*3 + 8*n*m*2)*n_subgroups
    assert st_bytes == (4*n*m*ell + 8*n*m)*n_subgroups

    # }}}

    # {{{ memory access map: grouping

    # Grouping by these three fields ignores strides and variable names.
    reduced_map = mem_map.group_by('mtype', 'dtype', 'direction')

    f32_load_key = lp.MemAccess('global', np.float32, direction='load')
    f64_load_key = lp.MemAccess('global', np.float64, direction='load')
    f32lall = reduced_map[f32_load_key].eval_with_dict(params)
    f64lall = reduced_map[f64_load_key].eval_with_dict(params)

    # uniform: (count-per-sub-group)*n_subgroups
    assert f32lall == (3*n*m*ell)*n_subgroups
    assert f64lall == (2*n*m)*n_subgroups

    # }}}

    # {{{ operation map

    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)

    op_map_dtype = op_map.group_by('dtype')
    f32 = op_map_dtype[lp.Op(dtype=np.float32)].eval_with_dict(params)
    f64 = op_map_dtype[lp.Op(dtype=np.float64)].eval_with_dict(params)
    i32 = op_map_dtype[lp.Op(dtype=np.int32)].eval_with_dict(params)
    assert f32 == n*m*ell*3
    assert f64 == n*m
    assert i32 == n*m*2

    addsub_all = op_map.filter_by(name=['add', 'sub']).eval_and_sum(params)
    f32ops_all = op_map.filter_by(dtype=[np.float32]).eval_and_sum(params)
    assert addsub_all == n*m*ell + n*m*2
    assert f32ops_all == n*m*ell*3

    # Filtering on a field name that does not exist is expected to sum to 0.
    non_field = op_map.filter_by(xxx=[np.float32]).eval_and_sum(params)
    assert non_field == 0

    ops_nodtype = op_map.group_by('name')
    ops_noname = op_map.group_by('dtype')
    mul_all = ops_nodtype[lp.Op(name='mul')].eval_with_dict(params)
    f64ops_all = ops_noname[lp.Op(dtype=np.float64)].eval_with_dict(params)
    assert mul_all == n*m*ell + n*m
    assert f64ops_all == n*m

    # }}}

    # {{{ filtering with a predicate

    def func_filter(key):
        if key.lid_strides != {}:
            return False
        if key.dtype != to_loopy_type(np.float64):
            return False
        return key.direction == 'load'

    f64l = mem_map.filter_by_func(func_filter).eval_and_sum(params)

    # uniform: (count-per-sub-group)*n_subgroups
    assert f64l == (2*n*m)*n_subgroups

    # }}}
def __call__(self, queue, tree, ball_centers, ball_radii, peer_lists=None,
             wait_for=None):
    """Run the area query kernel for a set of balls on *tree*.

    :arg queue: a :class:`pyopencl.CommandQueue`
    :arg tree: a :class:`boxtree.Tree`.
    :arg ball_centers: an object array of coordinate
        :class:`pyopencl.array.Array` instances.
        Their *dtype* must match *tree*'s
        :attr:`boxtree.Tree.coord_dtype`.
    :arg ball_radii: a :class:`pyopencl.array.Array`
        of positive numbers.
        Its *dtype* must match *tree*'s
        :attr:`boxtree.Tree.coord_dtype`.
    :arg peer_lists: may either be *None* or an instance of
        :class:`PeerListLookup` associated with `tree`. If *None*, the
        peer lists are computed here using ``self.peer_list_finder``.
    :arg wait_for: may either be *None* or a list of :class:`pyopencl.Event`
        instances for whose completion this command waits before starting
        execution.
    :returns: a tuple *(aq, event)*, where *aq* is an instance of
        :class:`AreaQueryResult`, and *event* is a :class:`pyopencl.Event`
        for dependency management.
    :raises TypeError: if the dtype of *ball_centers* or *ball_radii* does
        not match ``tree.coord_dtype``.
    :raises ValueError: if the length of ``peer_lists.peer_list_starts`` is
        not ``tree.nboxes + 1``.
    """
    from pytools import single_valued

    centers_dtype = single_valued(bc.dtype for bc in ball_centers)
    if centers_dtype != tree.coord_dtype:
        raise TypeError("ball_centers dtype must match tree.coord_dtype")
    if ball_radii.dtype != tree.coord_dtype:
        raise TypeError("ball_radii dtype must match tree.coord_dtype")

    ball_id_dtype = tree.particle_id_dtype  # ?

    from pytools import div_ceil

    # Round the level count up to a multiple of ten to avoid generating
    # too many kernels.
    max_levels = div_ceil(tree.nlevels, 10) * 10

    if peer_lists is None:
        peer_lists, peer_evt = self.peer_list_finder(
            queue, tree, wait_for=wait_for)
        wait_for = [peer_evt]

    if len(peer_lists.peer_list_starts) != tree.nboxes + 1:
        raise ValueError(
            "size of peer lists must match with number of boxes")

    area_query_kernel = self.get_area_query_kernel(
        tree.dimensions,
        tree.coord_dtype,
        tree.box_id_dtype,
        ball_id_dtype,
        peer_lists.peer_list_starts.dtype,
        max_levels)

    aq_plog = ProcessLogger(logger, "area query")

    # Trailing positional arguments: lower bounding box corner, then the
    # ball center coordinate arrays.
    trailing_args = tuple(tree.bounding_box[0]) + tuple(ball_centers)

    result, evt = area_query_kernel(
        queue, len(ball_radii),
        tree.box_centers.data, tree.root_extent,
        tree.box_levels, tree.aligned_nboxes,
        tree.box_child_ids.data, tree.box_flags,
        peer_lists.peer_list_starts,
        peer_lists.peer_lists, ball_radii,
        *trailing_args,
        wait_for=wait_for)

    aq_plog.done()

    leaves = result["leaves"]
    query_result = AreaQueryResult(
        tree=tree,
        leaves_near_ball_starts=leaves.starts,
        leaves_near_ball_lists=leaves.lists)

    return query_result.with_queue(None), evt
def __call__(self, queue, tree, ball_centers, ball_radii, peer_lists=None,
             wait_for=None):
    """Run the space invader query kernel for a set of balls on *tree*.

    :arg queue: a :class:`pyopencl.CommandQueue`
    :arg tree: a :class:`boxtree.Tree`.
    :arg ball_centers: an object array of coordinate
        :class:`pyopencl.array.Array` instances.
        Their *dtype* must match *tree*'s
        :attr:`boxtree.Tree.coord_dtype`.
    :arg ball_radii: a :class:`pyopencl.array.Array`
        of positive numbers.
        Its *dtype* must match *tree*'s
        :attr:`boxtree.Tree.coord_dtype`.
    :arg peer_lists: may either be *None* or an instance of
        :class:`PeerListLookup` associated with `tree`. If *None*, the
        peer lists are computed here using ``self.peer_list_finder``.
    :arg wait_for: may either be *None* or a list of :class:`pyopencl.Event`
        instances for whose completion this command waits before starting
        execution.
    :returns: a tuple *(sqi, event)*, where *sqi* is an instance of
        :class:`pyopencl.array.Array`, and *event* is a
        :class:`pyopencl.Event` for dependency management. The *dtype* of
        *sqi* is *tree*'s :attr:`boxtree.Tree.coord_dtype` and its shape is
        *(tree.nboxes,)* (see :attr:`boxtree.Tree.nboxes`).
        The entries of *sqi* are indexed by the global box index and are
        as follows:

        * if *i* is not the index of a leaf box, *sqi[i] = 0*.
        * if *i* is the index of a leaf box, *sqi[i]* is the
          outer space invader distance for *i*.
    :raises TypeError: if the dtype of *ball_centers* or *ball_radii* does
        not match ``tree.coord_dtype``.
    :raises ValueError: if the length of ``peer_lists.peer_list_starts`` is
        not ``tree.nboxes + 1``.
    """
    from pytools import single_valued

    centers_dtype = single_valued(bc.dtype for bc in ball_centers)
    if centers_dtype != tree.coord_dtype:
        raise TypeError("ball_centers dtype must match tree.coord_dtype")
    if ball_radii.dtype != tree.coord_dtype:
        raise TypeError("ball_radii dtype must match tree.coord_dtype")

    from pytools import div_ceil

    # Round the level count up to a multiple of ten to avoid generating
    # too many kernels.
    max_levels = div_ceil(tree.nlevels, 10) * 10

    if peer_lists is None:
        peer_lists, peer_evt = self.peer_list_finder(
            queue, tree, wait_for=wait_for)
        wait_for = [peer_evt]

    if len(peer_lists.peer_list_starts) != tree.nboxes + 1:
        raise ValueError(
            "size of peer lists must match with number of boxes")

    space_invader_query_kernel = self.get_space_invader_query_kernel(
        tree.dimensions,
        tree.coord_dtype,
        tree.box_id_dtype,
        peer_lists.peer_list_starts.dtype,
        max_levels)

    si_plog = ProcessLogger(logger, "space invader query")

    dists = cl.array.zeros(queue, tree.nboxes, np.float32)

    # The kernel must also wait for the zero-fill of the output array.
    kernel_deps = (wait_for or []) + dists.events

    kernel_args = SPACE_INVADER_QUERY_TEMPLATE.unwrap_args(
        tree, peer_lists,
        ball_radii,
        dists,
        *ball_centers)

    evt = space_invader_query_kernel(
        *kernel_args,
        wait_for=kernel_deps,
        queue=queue,
        range=slice(len(ball_radii)))

    if tree.coord_dtype != np.dtype(np.float32):
        # The kernel output is always an array of float32 due to limited
        # support for atomic operations with float64 in OpenCL.
        # Here the output is cast to match the coord dtype.
        dists.finish()
        dists = dists.astype(tree.coord_dtype)
        evt, = dists.events

    si_plog.done()

    return dists, evt
def _split_array_axis_inner(kernel, array_name, axis_nr, count, order="C"): if count == 1: return kernel # {{{ adjust arrays from loopy.kernel.tools import ArrayChanger achng = ArrayChanger(kernel, array_name) ary = achng.get() from pytools import div_ceil # {{{ adjust shape new_shape = ary.shape if new_shape is not None: new_shape = list(new_shape) axis_len = new_shape[axis_nr] new_shape[axis_nr] = count outer_len = div_ceil(axis_len, count) if order == "F": new_shape.insert(axis_nr+1, outer_len) elif order == "C": new_shape.insert(axis_nr, outer_len) else: raise RuntimeError("order '%s' not understood" % order) new_shape = tuple(new_shape) # }}} # {{{ adjust dim tags if ary.dim_tags is None: raise RuntimeError("dim_tags of '%s' are not known" % array_name) new_dim_tags = list(ary.dim_tags) old_dim_tag = ary.dim_tags[axis_nr] from loopy.kernel.array import FixedStrideArrayDimTag if not isinstance(old_dim_tag, FixedStrideArrayDimTag): raise RuntimeError("axis %d of '%s' is not tagged fixed-stride" % (axis_nr, array_name)) old_stride = old_dim_tag.stride outer_stride = count*old_stride if order == "F": new_dim_tags.insert(axis_nr+1, FixedStrideArrayDimTag(outer_stride)) elif order == "C": new_dim_tags.insert(axis_nr, FixedStrideArrayDimTag(outer_stride)) else: raise RuntimeError("order '%s' not understood" % order) new_dim_tags = tuple(new_dim_tags) # }}} # {{{ adjust dim_names new_dim_names = ary.dim_names if new_dim_names is not None: new_dim_names = list(new_dim_names) existing_name = new_dim_names[axis_nr] new_dim_names[axis_nr] = existing_name + "_inner" outer_name = existing_name + "_outer" if order == "F": new_dim_names.insert(axis_nr+1, outer_name) elif order == "C": new_dim_names.insert(axis_nr, outer_name) else: raise RuntimeError("order '%s' not understood" % order) new_dim_names = tuple(new_dim_names) # }}} kernel = achng.with_changed_array(ary.copy( shape=new_shape, dim_tags=new_dim_tags, dim_names=new_dim_names)) # }}} var_name_gen = 
kernel.get_var_name_generator() def split_access_axis(expr): idx = expr.index if not isinstance(idx, tuple): idx = (idx,) idx = list(idx) axis_idx = idx[axis_nr] from loopy.symbolic import simplify_using_aff inner_index = simplify_using_aff(kernel, axis_idx % count) outer_index = simplify_using_aff(kernel, axis_idx // count) idx[axis_nr] = inner_index if order == "F": idx.insert(axis_nr+1, outer_index) elif order == "C": idx.insert(axis_nr, outer_index) else: raise RuntimeError("order '%s' not understood" % order) return expr.aggregate.index(tuple(idx)) rule_mapping_context = SubstitutionRuleMappingContext( kernel.substitutions, var_name_gen) aash = ArrayAxisSplitHelper(rule_mapping_context, set([array_name]), split_access_axis) kernel = rule_mapping_context.finish_kernel(aash.map_kernel(kernel)) return kernel
def try_find_centers(self, tree, peer_lists, lpot_source,
                     target_status, target_flags, target_assoc,
                     target_association_tolerance, debug, wait_for=None):
    """Run the center-finding kernel over all QBX targets.

    A space invader query over the QBX centers, using the expansion radii
    enlarged by the factor ``1 + target_association_tolerance``, yields a
    search distance per box. The kernel obtained from
    ``code_container.center_finder`` then receives these distances along
    with *target_flags*, *target_status* and
    ``target_assoc.target_to_center`` and is launched over
    ``tree.nqbxtargets`` work items.

    :arg wait_for: forwarded to the space invader query; the center-finding
        kernel waits on the query's event and on the initialization of its
        scratch array.
    :arg debug: if true, the change in the sum of *target_status* across the
        kernel run is logged at debug level as the number of targets that
        were assigned centers.

    Nothing is returned; the method blocks until the kernel has completed.
    """
    from pytools import div_ceil

    # The level count is baked into the kernel as a stack bound. Rounding it
    # up to a multiple of ten keeps the number of kernel versions small.
    max_levels = 10 * div_ceil(tree.nlevels, 10)

    center_knl = self.code_container.center_finder(
        tree.dimensions,
        tree.coord_dtype,
        tree.box_id_dtype,
        peer_lists.peer_list_starts.dtype,
        tree.particle_id_dtype,
        max_levels)

    if debug:
        target_status.finish()
        marked_target_count = int(cl.array.sum(target_status).get())

    # Space invader query over the centers.
    center_slice = tree.sorted_target_ids[tree.qbx_user_center_slice]
    center_slice = center_slice.with_queue(self.queue)
    centers = [
        axis.with_queue(self.queue)[center_slice]
        for axis in tree.sources]

    radii = lpot_source._expansion_radii("ncenters").with_queue(self.queue)
    radii_with_tolerance = radii * (1 + target_association_tolerance)

    # Idea:
    #
    # (1) Tag leaf boxes around centers with max distance to usable center.
    # (2) Area query from targets with those radii to find closest eligible
    #     center.

    run_space_invader_query = self.code_container.space_invader_query()
    box_to_search_dist, query_evt = run_space_invader_query(
        self.queue, tree, centers, radii_with_tolerance, peer_lists,
        wait_for=wait_for)

    # Per-target scratch array, initialized to infinity.
    min_dist_to_center = cl.array.empty(
        self.queue, tree.nqbxtargets, tree.coord_dtype)
    min_dist_to_center.fill(np.inf)

    kernel_deps = [query_evt]
    kernel_deps.extend(min_dist_to_center.events)

    kernel_args = unwrap_args(
        tree, peer_lists,
        tree.box_to_qbx_center_starts,
        tree.box_to_qbx_center_lists,
        tree.qbx_user_center_slice.start,
        tree.qbx_user_target_slice.start,
        tree.sorted_target_ids,
        radii_with_tolerance,
        box_to_search_dist,
        target_flags,
        target_status,
        target_assoc.target_to_center,
        min_dist_to_center,
        *tree.sources)

    find_evt = center_knl(
        *kernel_args,
        range=slice(tree.nqbxtargets),
        queue=self.queue,
        wait_for=kernel_deps)

    if debug:
        target_status.finish()
        # Associated target = 2, marked target = 1
        status_sum = int(cl.array.sum(target_status).get())
        ntargets_associated = status_sum - marked_target_count
        assert ntargets_associated >= 0
        logger.debug("target association: {} targets were assigned centers"
                     .format(ntargets_associated))

    cl.wait_for_events([find_evt])
def test_mem_access_tagged_variables():
    """Check that memory accesses of tagged variables (``a$mmaload`` etc.)
    show up in the access map under their ``variable_tag``."""
    bsize = 16
    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            ["c$mmresult[i, j] = sum(k, a$mmaload[i, k]*b$mmbload[k, j])"],
            name="matmul", assumptions="n,m,ell >= 1")
    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
    knl = lp.split_iname(knl, "i", bsize, outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", bsize, outer_tag="g.1", inner_tag="l.0")
    knl = lp.split_iname(knl, "k", bsize)

    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}

    group_size = bsize * bsize
    n_workgroups = div_ceil(n, bsize) * div_ceil(ell, bsize)
    subgroups_per_group = div_ceil(group_size, SGS)
    n_subgroups = n_workgroups * subgroups_per_group

    mem_access_map = lp.get_mem_access_map(knl, count_redundant_work=True,
                                           subgroup_size=SGS)

    # {{{ loads

    b_load_key = lp.MemAccess(
        'global', np.float32,
        lid_strides={0: 1},
        gid_strides={1: bsize},
        direction='load', variable='b',
        variable_tag='mmbload',
        count_granularity=CG.WORKITEM)
    f32s1lb = mem_access_map[b_load_key].eval_with_dict(params)

    a_load_key = lp.MemAccess(
        'global', np.float32,
        lid_strides={1: Variable('m')},
        gid_strides={0: Variable('m') * bsize},
        direction='load', variable='a',
        variable_tag='mmaload',
        count_granularity=CG.SUBGROUP)
    f32s1la = mem_access_map[a_load_key].eval_with_dict(params)

    assert f32s1lb == n * m * ell

    # uniform: (count-per-sub-group)*n_subgroups
    assert f32s1la == m * n_subgroups

    # }}}

    # {{{ store

    c_store_key = lp.MemAccess(
        'global', np.float32,
        lid_strides={0: 1, 1: Variable('ell')},
        gid_strides={0: Variable('ell') * bsize, 1: bsize},
        direction='store', variable='c',
        variable_tag='mmresult',
        count_granularity=CG.WORKITEM)
    f32coal = mem_access_map[c_store_key].eval_with_dict(params)

    assert f32coal == n * ell

    # }}}
def _split_array_axis_inner(kernel, array_name, axis_nr, count, order="C"): if count == 1: return kernel # {{{ adjust arrays from loopy.kernel.tools import ArrayChanger achng = ArrayChanger(kernel, array_name) ary = achng.get() from pytools import div_ceil # {{{ adjust shape new_shape = ary.shape if new_shape is not None: new_shape = list(new_shape) axis_len = new_shape[axis_nr] new_shape[axis_nr] = count outer_len = div_ceil(axis_len, count) if order == "F": new_shape.insert(axis_nr + 1, outer_len) elif order == "C": new_shape.insert(axis_nr, outer_len) else: raise RuntimeError("order '%s' not understood" % order) new_shape = tuple(new_shape) # }}} # {{{ adjust dim tags if ary.dim_tags is None: raise RuntimeError("dim_tags of '%s' are not known" % array_name) new_dim_tags = list(ary.dim_tags) old_dim_tag = ary.dim_tags[axis_nr] from loopy.kernel.array import FixedStrideArrayDimTag if not isinstance(old_dim_tag, FixedStrideArrayDimTag): raise RuntimeError("axis %d of '%s' is not tagged fixed-stride" % (axis_nr, array_name)) old_stride = old_dim_tag.stride outer_stride = count * old_stride if order == "F": new_dim_tags.insert(axis_nr + 1, FixedStrideArrayDimTag(outer_stride)) elif order == "C": new_dim_tags.insert(axis_nr, FixedStrideArrayDimTag(outer_stride)) else: raise RuntimeError("order '%s' not understood" % order) new_dim_tags = tuple(new_dim_tags) # }}} # {{{ adjust dim_names new_dim_names = ary.dim_names if new_dim_names is not None: new_dim_names = list(new_dim_names) existing_name = new_dim_names[axis_nr] new_dim_names[axis_nr] = existing_name + "_inner" outer_name = existing_name + "_outer" if order == "F": new_dim_names.insert(axis_nr + 1, outer_name) elif order == "C": new_dim_names.insert(axis_nr, outer_name) else: raise RuntimeError("order '%s' not understood" % order) new_dim_names = tuple(new_dim_names) # }}} kernel = achng.with_changed_array( ary.copy(shape=new_shape, dim_tags=new_dim_tags, dim_names=new_dim_names)) # }}} var_name_gen = 
kernel.get_var_name_generator() def split_access_axis(expr): idx = expr.index if not isinstance(idx, tuple): idx = (idx, ) idx = list(idx) axis_idx = idx[axis_nr] from loopy.symbolic import simplify_using_aff inner_index = simplify_using_aff(kernel, axis_idx % count) outer_index = simplify_using_aff(kernel, axis_idx // count) idx[axis_nr] = inner_index if order == "F": idx.insert(axis_nr + 1, outer_index) elif order == "C": idx.insert(axis_nr, outer_index) else: raise RuntimeError("order '%s' not understood" % order) return expr.aggregate.index(tuple(idx)) rule_mapping_context = SubstitutionRuleMappingContext( kernel.substitutions, var_name_gen) aash = ArrayAxisSplitHelper(rule_mapping_context, {array_name}, split_access_axis) kernel = rule_mapping_context.finish_kernel(aash.map_kernel(kernel)) return kernel
def test_mem_access_counter_bitwise():
    """Check sub-group-granularity global load/store counts for a kernel
    whose instructions use bitwise and shift operators on int32 data."""
    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                """
                c[i, j, k] = (a[i,j,k] | 1) + (b[i,j,k] & 1)
                e[i, k] = (g[i,k] ^ k)*(~h[i,k+1]) + (g[i, k] << (h[i,k] >> k))
                """
            ],
            name="bitwise", assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(
            knl, dict(a=np.int32, b=np.int32, g=np.int32, h=np.int32))

    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
                                    subgroup_size=SGS)

    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}

    n_workgroups = 1
    group_size = 1
    subgroups_per_group = div_ceil(group_size, SGS)
    n_subgroups = n_workgroups*subgroups_per_group

    def subgroup_count(dtype, direction, variable):
        # Evaluate the count of one global access with empty stride maps.
        key = lp.MemAccess('global', dtype,
                           lid_strides={}, gid_strides={},
                           direction=direction, variable=variable,
                           count_granularity=CG.SUBGROUP)
        return mem_map[key].eval_with_dict(params)

    # {{{ loads

    i32 = subgroup_count(np.int32, 'load', 'a')
    i32 += subgroup_count(np.int32, 'load', 'b')
    i32 += subgroup_count(np.int32, 'load', 'g')
    i32 += subgroup_count(np.dtype(np.int32), 'load', 'h')

    # uniform: (count-per-sub-group)*n_subgroups
    assert i32 == (4*n*m+2*n*m*ell)*n_subgroups

    # }}}

    # {{{ stores

    i32 = subgroup_count(np.int32, 'store', 'c')
    i32 += subgroup_count(np.int32, 'store', 'e')

    # uniform: (count-per-sub-group)*n_subgroups
    assert i32 == (n*m+n*m*ell)*n_subgroups

    # }}}
def __call__(self, queue, tree, ball_centers, ball_radii, peer_lists=None,
             wait_for=None):
    """Run the area query kernel for a set of balls on *tree*.

    :arg queue: a :class:`pyopencl.CommandQueue`
    :arg tree: a :class:`boxtree.Tree`.
    :arg ball_centers: an object array of coordinate
        :class:`pyopencl.array.Array` instances.
        Their *dtype* must match *tree*'s
        :attr:`boxtree.Tree.coord_dtype`.
    :arg ball_radii: a :class:`pyopencl.array.Array`
        of positive numbers.
        Its *dtype* must match *tree*'s
        :attr:`boxtree.Tree.coord_dtype`.
    :arg peer_lists: may either be *None* or an instance of
        :class:`PeerListLookup` associated with `tree`. If *None*, the
        peer lists are computed here using ``self.peer_list_finder``.
    :arg wait_for: may either be *None* or a list of :class:`pyopencl.Event`
        instances for whose completion this command waits before starting
        execution.
    :returns: a tuple *(aq, event)*, where *aq* is an instance of
        :class:`AreaQueryResult`, and *event* is a :class:`pyopencl.Event`
        for dependency management.
    :raises TypeError: if the dtype of *ball_centers* or *ball_radii* does
        not match ``tree.coord_dtype``.
    :raises ValueError: if the length of ``peer_lists.peer_list_starts`` is
        not ``tree.nboxes + 1``.
    """
    from pytools import single_valued

    centers_dtype = single_valued(bc.dtype for bc in ball_centers)
    if centers_dtype != tree.coord_dtype:
        raise TypeError("ball_centers dtype must match tree.coord_dtype")
    if ball_radii.dtype != tree.coord_dtype:
        raise TypeError("ball_radii dtype must match tree.coord_dtype")

    ball_id_dtype = tree.particle_id_dtype  # ?

    from pytools import div_ceil

    # Round the level count up to a multiple of ten to avoid generating
    # too many kernels.
    max_levels = div_ceil(tree.nlevels, 10) * 10

    if peer_lists is None:
        peer_lists, peer_evt = self.peer_list_finder(
            queue, tree, wait_for=wait_for)
        wait_for = [peer_evt]

    if len(peer_lists.peer_list_starts) != tree.nboxes + 1:
        raise ValueError("size of peer lists must match with number of boxes")

    area_query_kernel = self.get_area_query_kernel(
        tree.dimensions,
        tree.coord_dtype,
        tree.box_id_dtype,
        ball_id_dtype,
        peer_lists.peer_list_starts.dtype,
        max_levels)

    logger.info("area query: run area query")

    # Trailing positional arguments: lower bounding box corner, then the
    # data buffers of the ball center coordinate arrays.
    center_buffers = tuple(bc.data for bc in ball_centers)
    trailing_args = tuple(tree.bounding_box[0]) + center_buffers

    result, evt = area_query_kernel(
        queue, len(ball_radii),
        tree.box_centers.data, tree.root_extent,
        tree.box_levels.data, tree.aligned_nboxes,
        tree.box_child_ids.data, tree.box_flags.data,
        peer_lists.peer_list_starts.data,
        peer_lists.peer_lists.data, ball_radii.data,
        *trailing_args,
        wait_for=wait_for)

    logger.info("area query: done")

    leaves = result["leaves"]
    query_result = AreaQueryResult(
        tree=tree,
        leaves_near_ball_starts=leaves.starts,
        leaves_near_ball_lists=leaves.lists)

    return query_result.with_queue(None), evt
def test_mem_access_counter_mixed():
    """Check global access counts for a kernel that mixes sub-group-counted
    accesses with empty strides and work-item-counted strided accesses,
    after splitting iname ``j`` into a group/local pair."""
    knl = lp.make_kernel(
            "[n,m,ell] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                """
                c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]+x[i,k]
                e[i, k] = g[i,k]*(2+h[i,k])
                """
            ],
            name="mixed", assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(
            a=np.float32, b=np.float32, g=np.float64, h=np.float64,
            x=np.float32))

    group_size_0 = 65

    knl = lp.split_iname(knl, "j", group_size_0)
    knl = lp.tag_inames(knl, {"j_inner": "l.0", "j_outer": "g.0"})

    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}

    n_workgroups = div_ceil(ell, group_size_0)
    group_size = group_size_0
    subgroups_per_group = div_ceil(group_size, SGS)
    n_subgroups = n_workgroups*subgroups_per_group

    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
                                    subgroup_size=SGS)

    def uniform_count(dtype, direction, variable):
        # Sub-group-granularity access with empty stride maps.
        key = lp.MemAccess('global', dtype,
                           lid_strides={}, gid_strides={},
                           direction=direction, variable=variable,
                           count_granularity=CG.SUBGROUP)
        return mem_map[key].eval_with_dict(params)

    def strided_count(dtype, direction, variable):
        # Work-item-granularity access strided by m along lid/gid axis 0.
        key = lp.MemAccess('global', dtype,
                           lid_strides={0: Variable('m')},
                           gid_strides={0: Variable('m')*group_size_0},
                           direction=direction, variable=variable,
                           count_granularity=CG.WORKITEM)
        return mem_map[key].eval_with_dict(params)

    # {{{ loads

    f64uniform = (uniform_count(np.float64, 'load', 'g')
                  + uniform_count(np.float64, 'load', 'h'))
    f32uniform = uniform_count(np.float32, 'load', 'x')
    f32nonconsec = (strided_count(np.dtype(np.float32), 'load', 'a')
                    + strided_count(np.dtype(np.float32), 'load', 'b'))

    # uniform: (count-per-sub-group)*n_subgroups
    assert f64uniform == (2*n*m)*n_subgroups
    assert f32uniform == (m*n)*n_subgroups

    # Different counts are expected when islpy lacks BasicSet.card.
    import islpy as isl
    expect_fallback = not hasattr(isl.BasicSet, "card")

    def expected_workitem_count(accesses_per_point):
        if not expect_fallback:
            return accesses_per_point*n*m*ell
        if ell < group_size_0:
            return accesses_per_point*n*m*ell*n_workgroups
        return accesses_per_point*n*m*n_workgroups*group_size_0

    assert f32nonconsec == expected_workitem_count(3)

    # }}}

    # {{{ stores

    f64uniform = uniform_count(np.float64, 'store', 'e')
    f32nonconsec = strided_count(np.float32, 'store', 'c')

    # uniform: (count-per-sub-group)*n_subgroups
    assert f64uniform == m*n*n_subgroups

    assert f32nonconsec == expected_workitem_count(1)

    # }}}
def __call__(self, queue, tree, ball_centers, ball_radii, wait_for=None):
    """Build a leaves-to-balls lookup for a set of balls on *tree*.

    The balls-to-leaves kernel is run first; its ``overlapping_leaves``
    lists (keys) and ``ball_numbers`` lists (values) are then passed through
    ``self.key_value_sorter`` to obtain the per-box ball lists.

    :arg queue: a :class:`pyopencl.CommandQueue`
    :arg tree: a :class:`boxtree.Tree`.
    :arg ball_centers: an object array of coordinate
        :class:`pyopencl.array.Array` instances.
        Their *dtype* must match *tree*'s
        :attr:`boxtree.Tree.coord_dtype`.
    :arg ball_radii: a :class:`pyopencl.array.Array`
        of positive numbers.
        Its *dtype* must match *tree*'s
        :attr:`boxtree.Tree.coord_dtype`.
    :arg wait_for: may either be *None* or a list of :class:`pyopencl.Event`
        instances for whose completion this command waits before starting
        execution.
    :returns: a tuple *(lbl, event)*, where *lbl* is an instance of
        :class:`LeavesToBallsLookup`, and *event* is a :class:`pyopencl.Event`
        for dependency management.
    :raises TypeError: if the dtype of *ball_centers* or *ball_radii* does
        not match ``tree.coord_dtype``.
    """
    from pytools import single_valued

    centers_dtype = single_valued(bc.dtype for bc in ball_centers)
    if centers_dtype != tree.coord_dtype:
        raise TypeError("ball_centers dtype must match tree.coord_dtype")
    if ball_radii.dtype != tree.coord_dtype:
        raise TypeError("ball_radii dtype must match tree.coord_dtype")

    ball_id_dtype = tree.particle_id_dtype  # ?

    from pytools import div_ceil

    # Round the level count up to a multiple of ten before requesting the
    # kernel.
    max_levels = div_ceil(tree.nlevels, 10) * 10

    b2l_knl = self.get_balls_to_leaves_kernel(
        tree.dimensions,
        tree.coord_dtype,
        tree.box_id_dtype,
        ball_id_dtype,
        max_levels,
        tree.stick_out_factor)

    logger.info("leaves-to-balls lookup: prepare ball list")

    nballs = len(ball_radii)
    center_buffers = tuple(bc.data for bc in ball_centers)

    result, b2l_evt = b2l_knl(
        queue, nballs,
        tree.box_flags.data, tree.box_centers.data,
        tree.box_child_ids.data, tree.box_levels.data,
        tree.root_extent, tree.aligned_nboxes,
        ball_radii.data, *center_buffers,
        wait_for=wait_for)

    logger.info("leaves-to-balls lookup: key-value sort")

    sort_keys = result["overlapping_leaves"].lists
    sort_values = result["ball_numbers"].lists

    balls_near_box_starts, balls_near_box_lists, evt = self.key_value_sorter(
        queue,
        sort_keys,
        sort_values,
        tree.nboxes, starts_dtype=tree.box_id_dtype,
        wait_for=[b2l_evt])

    logger.info("leaves-to-balls lookup: built")

    lookup = LeavesToBallsLookup(
        tree=tree,
        balls_near_box_starts=balls_near_box_starts,
        balls_near_box_lists=balls_near_box_lists)

    return lookup.with_queue(None), evt
def __call__(self, queue, tree, ball_centers, ball_radii, peer_lists=None,
             wait_for=None):
    """
    Run the space invader query for the given balls on *tree*.

    :arg queue: a :class:`pyopencl.CommandQueue`
    :arg tree: a :class:`boxtree.Tree`.
    :arg ball_centers: an object array of coordinate
        :class:`pyopencl.array.Array` instances.
        Their *dtype* must match *tree*'s
        :attr:`boxtree.Tree.coord_dtype`.
    :arg ball_radii: a
        :class:`pyopencl.array.Array`
        of positive numbers.
        Its *dtype* must match *tree*'s
        :attr:`boxtree.Tree.coord_dtype`.
    :arg peer_lists: may either be *None* or an instance of
        :class:`PeerListLookup` associated with `tree`. If *None*, the peer
        lists are computed here via ``self.peer_list_finder``.
    :arg wait_for: may either be *None* or a list of :class:`pyopencl.Event`
        instances for whose completion this command waits before starting
        execution.
    :returns: a tuple *(sqi, event)*, where *sqi* is an instance of
        :class:`pyopencl.array.Array`, and *event* is a :class:`pyopencl.Event`
        for dependency management. The *dtype* of *sqi* is
        *tree*'s :attr:`boxtree.Tree.coord_dtype` and its shape is
        *(tree.nboxes,)* (see :attr:`boxtree.Tree.nboxes`).
        The entries of *sqi* are indexed by the global box index and are
        as follows:

        * if *i* is not the index of a leaf box, *sqi[i] = 0*.
        * if *i* is the index of a leaf box, *sqi[i]* is the
          outer space invader distance for *i*.
    :raises TypeError: if the dtype of *ball_centers* or of *ball_radii*
        differs from ``tree.coord_dtype``.
    :raises ValueError: if the peer list starts array does not have
        ``tree.nboxes + 1`` entries.
    """
    from pytools import single_valued

    centers_dtype = single_valued(center.dtype for center in ball_centers)
    if centers_dtype != tree.coord_dtype:
        raise TypeError("ball_centers dtype must match tree.coord_dtype")
    if ball_radii.dtype != tree.coord_dtype:
        raise TypeError("ball_radii dtype must match tree.coord_dtype")

    from pytools import div_ceil

    # Avoid generating too many kernels: round the level count up to the
    # next multiple of ten.
    level_bound = 10 * div_ceil(tree.nlevels, 10)

    if peer_lists is None:
        peer_lists, peer_evt = self.peer_list_finder(
            queue, tree, wait_for=wait_for)
        wait_for = [peer_evt]

    if len(peer_lists.peer_list_starts) != tree.nboxes + 1:
        raise ValueError("size of peer lists must match with number of boxes")

    query_knl = self.get_space_invader_query_kernel(
        tree.dimensions,
        tree.coord_dtype,
        tree.box_id_dtype,
        peer_lists.peer_list_starts.dtype,
        level_bound)

    logger.info("space invader query: run space invader query")

    dists = cl.array.zeros(queue, tree.nboxes, np.float32)

    # The kernel must also wait for the zero-initialization of its output.
    dependencies = (wait_for or []) + dists.events

    unwrapped_args = SPACE_INVADER_QUERY_TEMPLATE.unwrap_args(
        tree, peer_lists, ball_radii, dists, *tuple(ball_centers))
    evt = query_knl(
        *unwrapped_args,
        wait_for=dependencies,
        queue=queue,
        range=slice(len(ball_radii)))

    if tree.coord_dtype != np.dtype(np.float32):
        # The kernel output is always an array of float32 due to limited
        # support for atomic operations with float64 in OpenCL.
        # Here the output is cast to match the coord dtype.
        dists.finish()
        dists = dists.astype(tree.coord_dtype)
        evt, = dists.events

    logger.info("space invader query: done")

    return dists, evt
def test_all_counters_parallel_matmul():
    """Check sync, op and memory-access counts of a tiled, prefetching matmul."""
    tile = 16
    knl = lp.make_kernel(
        "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
        ["c[i, j] = sum(k, a[i, k]*b[k, j])"],
        name="matmul",
        assumptions="n,m,ell >= 1")
    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32, "b": np.float32})
    for iname, outer, inner in (("i", "g.0", "l.1"), ("j", "g.1", "l.0")):
        knl = lp.split_iname(knl, iname, tile, outer_tag=outer, inner_tag=inner)
    knl = lp.split_iname(knl, "k", tile)
    for array, sweep_inames in (
            ("a", ["k_inner", "i_inner"]),
            ("b", ["j_inner", "k_inner"])):
        knl = lp.add_prefetch(knl, array, sweep_inames, default_tag="l.auto")

    n, m, ell = 512, 256, 128
    params = {"n": n, "m": m, "ell": ell}

    workgroup_size = tile*tile
    workgroup_count = div_ceil(n, tile)*div_ceil(ell, tile)
    subgroup_count = workgroup_count*div_ceil(workgroup_size, SGS)

    # --- synchronization ---
    sync_map = lp.get_synchronization_map(knl)
    assert len(sync_map) == 2
    assert sync_map["kernel_launch"].eval_with_dict(params) == 1
    assert sync_map["barrier_local"].eval_with_dict(params) == 2*m/tile

    # --- arithmetic operations ---
    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)

    def subgroup_op_count(dtype, op_name):
        return op_map[lp.Op(dtype, op_name, CG.SUBGROUP)].eval_with_dict(params)

    f32mul = subgroup_op_count(np.float32, "mul")
    f32add = subgroup_op_count(np.float32, "add")
    # The int32 counts are looked up and evaluated, but their sum is not
    # compared against anything below.
    i32ops = subgroup_op_count(np.int32, "add")
    i32ops += subgroup_op_count(np.dtype(np.int32), "mul")

    # (count-per-sub-group)*n_subgroups
    assert f32mul+f32add == m*2*subgroup_count

    # --- global memory accesses ---
    mem_access_map = lp.get_mem_access_map(
        knl, count_redundant_work=True, subgroup_size=SGS)

    def global_f32_workitem_count(direction, variable, lid_strides, gid_strides):
        access = lp.MemAccess(
            "global", np.float32,
            lid_strides=lid_strides,
            gid_strides=gid_strides,
            direction=direction,
            variable=variable,
            count_granularity=CG.WORKITEM)
        return mem_access_map[access].eval_with_dict(params)

    f32s1lb = global_f32_workitem_count(
        "load", "b",
        lid_strides={0: 1, 1: Variable("ell")},
        gid_strides={1: tile})
    f32s1la = global_f32_workitem_count(
        "load", "a",
        lid_strides={0: 1, 1: Variable("m")},
        gid_strides={0: Variable("m")*tile})

    assert f32s1lb == n*m*ell/tile
    assert f32s1la == n*m*ell/tile

    f32coal = global_f32_workitem_count(
        "store", "c",
        lid_strides={0: 1, 1: Variable("ell")},
        gid_strides={0: Variable("ell")*tile, 1: tile})

    assert f32coal == n*ell

    # --- local memory accesses ---
    local_mem_map = lp.get_mem_access_map(
        knl, count_redundant_work=True, subgroup_size=SGS
    ).filter_by(mtype=["local"])

    local_mem_l = local_mem_map.filter_by(direction=["load"]).eval_and_sum(params)

    # (count-per-sub-group)*n_subgroups
    assert local_mem_l == m*2*subgroup_count

    def local_f32_subgroup_load_count(variable, lid_strides):
        access = lp.MemAccess(
            "local", np.dtype(np.float32),
            direction="load",
            lid_strides=lid_strides,
            gid_strides={},
            variable=variable,
            count_granularity=CG.SUBGROUP)
        return local_mem_map[access].eval_with_dict(params)

    local_mem_l_a = local_f32_subgroup_load_count("a_fetch", {1: 16})
    local_mem_l_b = local_f32_subgroup_load_count("b_fetch", {0: 1})

    # (count-per-sub-group)*n_subgroups
    assert local_mem_l_a == local_mem_l_b == m*subgroup_count

    local_mem_s = local_mem_map.filter_by(direction=["store"]).eval_and_sum(params)

    # (count-per-sub-group)*n_subgroups
    assert local_mem_s == m*2/tile*subgroup_count
def __call__(self, queue, particles, max_particles_in_box,
             allocator=None, debug=False, targets=None,
             source_radii=None, target_radii=None, stick_out_factor=0.25,
             wait_for=None, non_adaptive=False,
             **kwargs):
    """
    Build a :class:`Tree` from the given particles.

    :arg queue: a :class:`pyopencl.CommandQueue` instance
    :arg particles: an object array of (XYZ) point coordinate arrays.
    :arg max_particles_in_box: threshold handed to the box-splitting kernels;
        the level loop is only entered if there are more particles than this.
    :arg allocator: passed on to the device array allocations made here.
    :arg debug: if true, the queue is finished before each stage and
        additional host-side consistency assertions are run.
    :arg targets: an object array of (XYZ) point coordinate arrays or ``None``.
        If ``None``, *particles* act as targets, too.
        Must have the same (inner) dtype as *particles*.
    :arg source_radii: If not *None*, a :class:`pyopencl.array.Array` of the
        same dtype as *particles*.

        If this is given, *targets* must also be given, i.e. sources and
        targets must be separate. See :ref:`extent`.

    :arg target_radii: Like *source_radii*, but for targets.
    :arg stick_out_factor: See :attr:`Tree.stick_out_factor` and :ref:`extent`.
    :arg wait_for: may either be *None* or a list of :class:`pyopencl.Event`
        instances for whose completion this command waits before starting
        execution.
    :arg non_adaptive: If *True*, return a tree in which all leaf boxes are
        on the same (last) level. The tree is pruned, in the sense that empty
        boxes have been eliminated.
    :arg kwargs: Used internally for debugging. Two keys are read:
        *nboxes_guess* (initial box count used to size the per-box arrays,
        for exercising the reallocation code) and *skip_prune* (if true,
        empty-leaf pruning is skipped).

    :returns: a tuple ``(tree, event)``, where *tree* is an instance of
        :class:`Tree`, and *event* is a :class:`pyopencl.Event` for dependency
        management.

    :raises ValueError: if radii are given without *targets*, or if a radii
        array has the wrong shape.
    :raises TypeError: if targets or radii do not share the coordinate dtype
        of *particles*.
    :raises RuntimeError: if the level count exceeds what
        ``self.box_level_dtype`` can represent.
    """

    # {{{ input processing

    # we'll modify this below, so copy it
    if wait_for is None:
        wait_for = []
    else:
        wait_for = list(wait_for)

    dimensions = len(particles)

    from boxtree.tools import AXIS_NAMES
    axis_names = AXIS_NAMES[:dimensions]

    sources_are_targets = targets is None
    sources_have_extent = source_radii is not None
    targets_have_extent = target_radii is not None
    srcntgts_have_extent = sources_have_extent or targets_have_extent

    if srcntgts_have_extent and targets is None:
        raise ValueError("must specify targets when specifying "
                "any kind of radii")

    from pytools import single_valued
    particle_id_dtype = np.int32
    box_id_dtype = np.int32
    coord_dtype = single_valued(coord.dtype for coord in particles)

    if targets is None:
        nsrcntgts = single_valued(len(coord) for coord in particles)
    else:
        nsources = single_valued(len(coord) for coord in particles)
        ntargets = single_valued(len(coord) for coord in targets)
        nsrcntgts = nsources + ntargets

    if source_radii is not None:
        if source_radii.shape != (nsources,):
            raise ValueError("source_radii has an invalid shape")

        if source_radii.dtype != coord_dtype:
            raise TypeError("dtypes of coordinate arrays and "
                    "source_radii must agree")

    if target_radii is not None:
        if target_radii.shape != (ntargets,):
            raise ValueError("target_radii has an invalid shape")

        if target_radii.dtype != coord_dtype:
            raise TypeError("dtypes of coordinate arrays and "
                    "target_radii must agree")

    # }}}

    empty = partial(cl.array.empty, queue, allocator=allocator)

    def zeros(shape, dtype):
        # Allocate a zero-filled array and return it together with the event
        # of the fill. The fill waits on whatever 'wait_for' is bound to in
        # the enclosing scope at the time of the call.
        result = (cl.array.empty(queue, shape, dtype, allocator=allocator)
                .fill(0, wait_for=wait_for))
        event, = result.events
        return result, event

    knl_info = self.get_kernel_info(dimensions, coord_dtype,
            particle_id_dtype, box_id_dtype,
            sources_are_targets, srcntgts_have_extent,
            stick_out_factor, adaptive=not non_adaptive)

    # {{{ combine sources and targets into one array, if necessary

    prep_events = []

    if targets is None:
        # Targets weren't specified. Sources are also targets. Let's
        # call them "srcntgts".

        srcntgts = particles

        assert source_radii is None
        assert target_radii is None

        srcntgt_radii = None

    else:
        # Here, we mash sources and targets into one array to give us one
        # big array of "srcntgts". In this case, a "srcntgt" is either a
        # source or a target, but not really both, as above. How will we be
        # able to tell which it was? Easy: We'll compare its 'user' id with
        # nsources. If it's >=, it's a target, otherwise it's a source.

        target_coord_dtype = single_valued(tgt_i.dtype for tgt_i in targets)

        if target_coord_dtype != coord_dtype:
            raise TypeError("sources and targets must have same coordinate "
                    "dtype")

        def combine_srcntgt_arrays(ary1, ary2=None):
            # Concatenate ary1 (sources) and ary2 (targets) into one array of
            # length nsrcntgts. Either may be None, in which case its part of
            # the result stays zero.
            if ary2 is None:
                dtype = ary1.dtype
            else:
                dtype = ary2.dtype

            result = empty(nsrcntgts, dtype)
            if (ary1 is None) or (ary2 is None):
                result.fill(0)

            if ary1 is not None and ary1.nbytes:
                result[:len(ary1)] = ary1

            if ary2 is not None and ary2.nbytes:
                result[nsources:] = ary2

            return result

        from pytools.obj_array import make_obj_array
        srcntgts = make_obj_array([
            combine_srcntgt_arrays(src_i, tgt_i)
            for src_i, tgt_i in zip(particles, targets)
            ])

        if srcntgts_have_extent:
            srcntgt_radii = combine_srcntgt_arrays(source_radii, target_radii)
        else:
            srcntgt_radii = None

    del source_radii
    del target_radii

    del particles

    user_srcntgt_ids = cl.array.arange(queue, nsrcntgts, dtype=particle_id_dtype,
            allocator=allocator)

    evt, = user_srcntgt_ids.events
    wait_for.append(evt)
    del evt

    # }}}

    # {{{ find and process bounding box

    bbox, _ = self.bbox_finder(srcntgts, srcntgt_radii, wait_for=wait_for)
    bbox = bbox.get()

    root_extent = max(
            bbox["max_"+ax] - bbox["min_"+ax]
            for ax in axis_names) * (1+1e-4)

    # make bbox square and slightly larger at the top, to ensure scaled
    # coordinates are always < 1
    bbox_min = np.empty(dimensions, coord_dtype)
    for i, ax in enumerate(axis_names):
        bbox_min[i] = bbox["min_"+ax]

    bbox_max = bbox_min + root_extent
    for i, ax in enumerate(axis_names):
        bbox["max_"+ax] = bbox_max[i]

    # }}}

    from pytools import div_ceil

    # {{{ allocate data

    logger.debug("allocating memory")

    # box-local morton bin counts for each particle at the current level
    # only valid from scan -> split'n'sort
    morton_bin_counts = empty(nsrcntgts, dtype=knl_info.morton_bin_count_dtype)

    # (local) morton nrs for each particle at the current level
    # only valid from scan -> split'n'sort
    morton_nrs = empty(nsrcntgts, dtype=self.morton_nr_dtype)

    # 0/1 segment flags
    # invariant to sorting once set
    # (particles are only reordered within a box)
    # valid throughout computation
    box_start_flags, evt = zeros(nsrcntgts, dtype=np.int8)
    prep_events.append(evt)
    srcntgt_box_ids, evt = zeros(nsrcntgts, dtype=box_id_dtype)
    prep_events.append(evt)
    split_box_ids, evt = zeros(nsrcntgts, dtype=box_id_dtype)
    prep_events.append(evt)

    # number of boxes total, and a guess
    nboxes_dev = empty((), dtype=box_id_dtype)
    nboxes_dev.fill(1)

    # /!\ If you're allocating an array here that depends on nboxes_guess,
    # you *must* also write reallocation code down below for the case when
    # nboxes_guess was too low.

    # Outside nboxes_guess feeding is solely for debugging purposes,
    # to test the reallocation code.
    nboxes_guess = kwargs.get("nboxes_guess")
    if nboxes_guess is None:
        nboxes_guess = div_ceil(nsrcntgts, max_particles_in_box) * 2**dimensions

    # per-box morton bin counts
    box_morton_bin_counts = empty(nboxes_guess,
            dtype=knl_info.morton_bin_count_dtype)

    # particle# at which each box starts
    box_srcntgt_starts, evt = zeros(nboxes_guess, dtype=particle_id_dtype)
    prep_events.append(evt)

    # pointer to parent box
    box_parent_ids, evt = zeros(nboxes_guess, dtype=box_id_dtype)
    prep_events.append(evt)

    # morton nr identifier {quadr,oct}ant of parent in which this box was created
    box_morton_nrs, evt = zeros(nboxes_guess, dtype=self.morton_nr_dtype)
    prep_events.append(evt)

    # box -> level map
    box_levels, evt = zeros(nboxes_guess, self.box_level_dtype)
    prep_events.append(evt)

    # number of particles in each box
    # needs to be globally initialized because empty boxes never get touched
    box_srcntgt_counts_cumul, evt = zeros(nboxes_guess, dtype=particle_id_dtype)
    prep_events.append(evt)

    # Initialize box 0 to contain all particles.
    # fill() hands back the (sub-)array rather than an event, so the event
    # of this fill is taken from the array's event list to make sure the
    # level loop waits for it.
    root_count = box_srcntgt_counts_cumul[0].fill(
            nsrcntgts, queue=queue, wait_for=[evt])
    prep_events.extend(root_count.events)
    del root_count

    # set parent of root box to itself
    evt = cl.enqueue_copy(
            queue, box_parent_ids.data,
            np.zeros((), dtype=box_parent_ids.dtype))
    prep_events.append(evt)

    # }}}

    def fin_debug(s):
        # In debug mode, drain the queue before announcing the next stage.
        if debug:
            queue.finish()

        logger.debug(s)

    from pytools.obj_array import make_obj_array
    have_oversize_split_box, evt = zeros((), np.int32)
    prep_events.append(evt)

    wait_for = prep_events

    # {{{ level loop

    # Level 0 starts at 0 and always contains box 0 and nothing else.
    # Level 1 therefore starts at 1.
    level_start_box_nrs = [0, 1]

    from time import time
    start_time = time()
    if nsrcntgts > max_particles_in_box:
        level = 1
    else:
        level = 0

    # INVARIANTS -- Upon entry to this loop:
    #
    # - level is the level being built.
    # - the last entry of level_start_box_nrs is the beginning of the level
    #   to be built

    # This while condition prevents entering the loop in case there's just a
    # single box, by how 'level' is set above. Read this as 'while True' with
    # an edge case.

    logger.debug("entering level loop with %s srcntgts" % nsrcntgts)

    while level:
        if debug:
            # More invariants:
            assert level == len(level_start_box_nrs) - 1

        if level > np.iinfo(self.box_level_dtype).max:
            raise RuntimeError("level count exceeded maximum")

        common_args = ((morton_bin_counts, morton_nrs,
                box_start_flags, srcntgt_box_ids, split_box_ids,
                box_morton_bin_counts,
                box_srcntgt_starts, box_srcntgt_counts_cumul,
                box_parent_ids, box_morton_nrs,
                nboxes_dev,
                level, max_particles_in_box, bbox,
                user_srcntgt_ids)
                + tuple(srcntgts)
                + ((srcntgt_radii,) if srcntgts_have_extent else ())
                )

        fin_debug("morton count scan")

        # writes: box_morton_bin_counts, morton_nrs
        evt = knl_info.morton_count_scan(
                *common_args, queue=queue, size=nsrcntgts,
                wait_for=wait_for)
        wait_for = [evt]

        fin_debug("split box id scan")

        # writes: nboxes_dev, split_box_ids
        evt = knl_info.split_box_id_scan(
                srcntgt_box_ids,
                box_srcntgt_starts,
                box_srcntgt_counts_cumul,
                max_particles_in_box,
                box_morton_bin_counts,
                box_levels,
                level,

                # input/output:
                nboxes_dev,

                # output:
                split_box_ids,
                queue=queue, size=nsrcntgts, wait_for=wait_for)
        wait_for = [evt]

        nboxes_new = int(nboxes_dev.get())

        # Assumption: Everything between here and the top of the loop must
        # be repeatable, so that in an out-of-memory situation, we can just
        # rerun this bit of the code after reallocating and a minimal reset
        # procedure.

        # {{{ reallocate and retry if nboxes_guess was too small

        if nboxes_new > nboxes_guess:
            fin_debug("starting nboxes_guess increase")

            while nboxes_guess < nboxes_new:
                nboxes_guess *= 2

            from boxtree.tools import realloc_array
            my_realloc = partial(realloc_array, new_shape=nboxes_guess,
                    zero_fill=False, queue=queue, wait_for=wait_for)
            my_realloc_zeros = partial(realloc_array, new_shape=nboxes_guess,
                    zero_fill=True, queue=queue, wait_for=wait_for)

            resize_events = []
            box_morton_bin_counts, evt = my_realloc(box_morton_bin_counts)
            resize_events.append(evt)
            box_srcntgt_starts, evt = my_realloc_zeros(box_srcntgt_starts)
            resize_events.append(evt)
            box_parent_ids, evt = my_realloc_zeros(box_parent_ids)
            resize_events.append(evt)
            box_morton_nrs, evt = my_realloc_zeros(box_morton_nrs)
            resize_events.append(evt)
            box_levels, evt = my_realloc_zeros(box_levels)
            resize_events.append(evt)
            box_srcntgt_counts_cumul, evt = \
                    my_realloc_zeros(box_srcntgt_counts_cumul)
            resize_events.append(evt)

            del my_realloc
            del my_realloc_zeros

            # reset nboxes_dev to previous value
            # 'evt' still refers to the last reallocation at this point, so
            # the event of this fill is taken from the array's event list
            # to make the retried level wait for the reset.
            nboxes_dev.fill(level_start_box_nrs[-1])
            resize_events.extend(nboxes_dev.events)

            wait_for = resize_events

            # retry
            logger.info("nboxes_guess exceeded: "
                    "enlarged allocations, restarting level")

            continue

        # }}}

        logger.info("LEVEL %d -> %d boxes" % (level, nboxes_new))

        assert level_start_box_nrs[-1] != nboxes_new or srcntgts_have_extent

        if level_start_box_nrs[-1] == nboxes_new:
            # We haven't created new boxes in this level loop trip. Unless
            # srcntgts have extent, this should never happen. (I.e., we
            # should've never entered this loop trip.)
            #
            # If srcntgts have extent, this can happen if boxes were
            # in-principle overfull, but couldn't subdivide because of
            # extent restrictions.

            assert srcntgts_have_extent

            level -= 1

            logger.debug("no new boxes created this loop trip")
            break

        level_start_box_nrs.append(nboxes_new)
        del nboxes_new

        new_user_srcntgt_ids = cl.array.empty_like(user_srcntgt_ids)
        new_srcntgt_box_ids = cl.array.empty_like(srcntgt_box_ids)
        split_and_sort_args = (
                common_args
                + (new_user_srcntgt_ids, have_oversize_split_box,
                    new_srcntgt_box_ids, box_levels))

        fin_debug("split and sort")

        evt = knl_info.split_and_sort_kernel(*split_and_sort_args,
                wait_for=wait_for)
        wait_for = [evt]

        if debug:
            level_bl_chunk = box_levels.get()[
                    level_start_box_nrs[-2]:level_start_box_nrs[-1]]
            assert ((level_bl_chunk == level) | (level_bl_chunk == 0)).all()
            del level_bl_chunk

        if debug:
            assert (box_srcntgt_starts.get() < nsrcntgts).all()

        user_srcntgt_ids = new_user_srcntgt_ids
        del new_user_srcntgt_ids
        srcntgt_box_ids = new_srcntgt_box_ids
        del new_srcntgt_box_ids

        if not int(have_oversize_split_box.get()):
            logger.debug("no overfull boxes left")
            break

        level += 1

        have_oversize_split_box.fill(0)

    end_time = time()
    elapsed = end_time-start_time
    npasses = level+1
    logger.info("elapsed time: %g s (%g s/particle/pass)" % (
        elapsed, elapsed/(npasses*nsrcntgts)))
    del npasses

    nboxes = int(nboxes_dev.get())

    # }}}

    # {{{ extract number of non-child srcntgts from box morton counts

    if srcntgts_have_extent:
        box_srcntgt_counts_nonchild = empty(nboxes, particle_id_dtype)
        fin_debug("extract non-child srcntgt count")

        assert len(level_start_box_nrs) >= 2
        highest_possibly_split_box_nr = level_start_box_nrs[-2]

        evt = knl_info.extract_nonchild_srcntgt_count_kernel(
                # input
                box_morton_bin_counts,
                box_srcntgt_counts_cumul,
                highest_possibly_split_box_nr,

                # output
                box_srcntgt_counts_nonchild,

                range=slice(nboxes), wait_for=wait_for)
        wait_for = [evt]

        del highest_possibly_split_box_nr

        if debug:
            assert (box_srcntgt_counts_nonchild.get()
                    <= box_srcntgt_counts_cumul.get()[:nboxes]).all()

    # }}}

    del morton_nrs
    del box_morton_bin_counts

    # {{{ prune empty leaf boxes

    is_pruned = not kwargs.get("skip_prune")
    if is_pruned:
        # What is the original index of this box?
        from_box_id = empty(nboxes, box_id_dtype)

        # Where should I put this box?
        to_box_id = empty(nboxes, box_id_dtype)

        fin_debug("find prune indices")

        nboxes_post_prune_dev = empty((), dtype=box_id_dtype)
        evt = knl_info.find_prune_indices_kernel(
                box_srcntgt_counts_cumul,
                to_box_id, from_box_id, nboxes_post_prune_dev,
                size=nboxes, wait_for=wait_for)
        wait_for = [evt]

        fin_debug("prune copy")

        nboxes_post_prune = int(nboxes_post_prune_dev.get())

        logger.info("%d empty leaves" % (nboxes-nboxes_post_prune))

        prune_events = []

        prune_empty = partial(self.gappy_copy_and_map,
                queue, allocator, nboxes_post_prune, from_box_id)

        box_srcntgt_starts, evt = prune_empty(box_srcntgt_starts)
        prune_events.append(evt)

        box_srcntgt_counts_cumul, evt = prune_empty(box_srcntgt_counts_cumul)
        prune_events.append(evt)

        if debug:
            assert (box_srcntgt_counts_cumul.get() > 0).all()

        srcntgt_box_ids = cl.array.take(to_box_id, srcntgt_box_ids)

        box_parent_ids, evt = prune_empty(box_parent_ids, map_values=to_box_id)
        prune_events.append(evt)
        box_morton_nrs, evt = prune_empty(box_morton_nrs)
        prune_events.append(evt)
        box_levels, evt = prune_empty(box_levels)
        prune_events.append(evt)
        if srcntgts_have_extent:
            box_srcntgt_counts_nonchild, evt = prune_empty(
                    box_srcntgt_counts_nonchild)
            prune_events.append(evt)

        # Remap level_start_box_nrs to new box IDs.
        # FIXME: It would be better to do this on the device.
        level_start_box_nrs = list(
                to_box_id.get()
                [np.array(level_start_box_nrs[:-1], box_id_dtype)])
        level_start_box_nrs = level_start_box_nrs + [nboxes_post_prune]

        wait_for = prune_events
    else:
        logger.info("skipping empty-leaf pruning")
        nboxes_post_prune = nboxes

    level_start_box_nrs = np.array(level_start_box_nrs, box_id_dtype)

    # }}}

    del nboxes

    # {{{ compute source/target particle indices and counts in each box

    if targets is None:
        from boxtree.tools import reverse_index_array
        user_source_ids = user_srcntgt_ids
        sorted_target_ids = reverse_index_array(user_srcntgt_ids)

        box_source_starts = box_target_starts = box_srcntgt_starts
        box_source_counts_cumul = box_target_counts_cumul = \
                box_srcntgt_counts_cumul
        if srcntgts_have_extent:
            box_source_counts_nonchild = box_target_counts_nonchild = \
                    box_srcntgt_counts_nonchild
    else:
        source_numbers = empty(nsrcntgts, particle_id_dtype)

        fin_debug("source counter")
        evt = knl_info.source_counter(user_srcntgt_ids, nsources,
                source_numbers, queue=queue, allocator=allocator,
                wait_for=wait_for)
        wait_for = [evt]

        user_source_ids = empty(nsources, particle_id_dtype)
        # srcntgt_target_ids is temporary until particle permutation is done
        srcntgt_target_ids = empty(ntargets, particle_id_dtype)
        sorted_target_ids = empty(ntargets, particle_id_dtype)

        # need to use zeros because parent boxes won't be initialized
        box_source_starts, evt = zeros(nboxes_post_prune, particle_id_dtype)
        wait_for.append(evt)
        box_source_counts_cumul, evt = zeros(
                nboxes_post_prune, particle_id_dtype)
        wait_for.append(evt)
        box_target_starts, evt = zeros(
                nboxes_post_prune, particle_id_dtype)
        wait_for.append(evt)
        box_target_counts_cumul, evt = zeros(
                nboxes_post_prune, particle_id_dtype)
        wait_for.append(evt)

        if srcntgts_have_extent:
            box_source_counts_nonchild, evt = zeros(
                    nboxes_post_prune, particle_id_dtype)
            wait_for.append(evt)
            box_target_counts_nonchild, evt = zeros(
                    nboxes_post_prune, particle_id_dtype)
            wait_for.append(evt)

        fin_debug("source and target index finder")
        evt = knl_info.source_and_target_index_finder(*(
            # input:
            (
                user_srcntgt_ids, nsources, srcntgt_box_ids,
                box_parent_ids,

                box_srcntgt_starts, box_srcntgt_counts_cumul,
                source_numbers,
            )
            + ((box_srcntgt_counts_nonchild,)
                if srcntgts_have_extent else ())

            # output:
            + (
                user_source_ids, srcntgt_target_ids, sorted_target_ids,
                box_source_starts, box_source_counts_cumul,
                box_target_starts, box_target_counts_cumul,
                )
            + ((
                box_source_counts_nonchild,
                box_target_counts_nonchild,
                ) if srcntgts_have_extent else ())
            ),
            queue=queue, range=slice(nsrcntgts),
            wait_for=wait_for)
        wait_for = [evt]

        if srcntgts_have_extent:
            if debug:
                assert (
                        box_srcntgt_counts_nonchild.get()
                        ==
                        (box_source_counts_nonchild
                        + box_target_counts_nonchild).get()).all()

        if debug:
            usi_host = user_source_ids.get()
            assert (usi_host < nsources).all()
            assert (0 <= usi_host).all()
            del usi_host

            sti_host = srcntgt_target_ids.get()
            assert (sti_host < nsources+ntargets).all()
            assert (nsources <= sti_host).all()
            del sti_host

            assert (box_source_counts_cumul.get()
                    + box_target_counts_cumul.get()
                    == box_srcntgt_counts_cumul.get()).all()

        del source_numbers

    del box_srcntgt_starts
    if srcntgts_have_extent:
        del box_srcntgt_counts_nonchild

    # }}}

    # {{{ permute and source/target-split (if necessary) particle array

    if targets is None:
        sources = targets = make_obj_array([
            cl.array.empty_like(pt) for pt in srcntgts])

        fin_debug("srcntgt permuter (particles)")
        evt = knl_info.srcntgt_permuter(
                user_srcntgt_ids,
                *(tuple(srcntgts) + tuple(sources)),
                wait_for=wait_for)
        wait_for = [evt]

        assert srcntgt_radii is None

    else:
        sources = make_obj_array([
            empty(nsources, coord_dtype) for i in range(dimensions)])
        fin_debug("srcntgt permuter (sources)")
        evt = knl_info.srcntgt_permuter(
                user_source_ids,
                *(tuple(srcntgts) + tuple(sources)),
                queue=queue, range=slice(nsources),
                wait_for=wait_for)
        wait_for = [evt]

        targets = make_obj_array([
            empty(ntargets, coord_dtype) for i in range(dimensions)])
        fin_debug("srcntgt permuter (targets)")
        evt = knl_info.srcntgt_permuter(
                srcntgt_target_ids,
                *(tuple(srcntgts) + tuple(targets)),
                queue=queue, range=slice(ntargets),
                wait_for=wait_for)
        wait_for = [evt]

        if srcntgt_radii is not None:
            fin_debug("srcntgt permuter (source radii)")
            source_radii = cl.array.take(
                    srcntgt_radii, user_source_ids, queue=queue,
                    wait_for=wait_for)

            fin_debug("srcntgt permuter (target radii)")
            target_radii = cl.array.take(
                    srcntgt_radii, srcntgt_target_ids, queue=queue,
                    wait_for=wait_for)

            wait_for = source_radii.events + target_radii.events

        del srcntgt_target_ids

    del srcntgt_radii

    # }}}

    del srcntgts

    nlevels = len(level_start_box_nrs) - 1
    assert level + 1 == nlevels, (level+1, nlevels)
    if debug:
        max_level = np.max(box_levels.get())

        assert max_level + 1 == nlevels

    # {{{ compute box info

    # A number of arrays below are nominally 2-dimensional and stored with
    # the box index as the fastest-moving index. To make sure that accesses
    # remain aligned, we round up the number of boxes used for indexing.
    aligned_nboxes = div_ceil(nboxes_post_prune, 32)*32

    box_child_ids, evt = zeros((2**dimensions, aligned_nboxes), box_id_dtype)
    wait_for.append(evt)
    box_centers = empty((dimensions, aligned_nboxes), coord_dtype)

    from boxtree.tree import box_flags_enum
    box_flags = empty(nboxes_post_prune, box_flags_enum.dtype)

    if not srcntgts_have_extent:
        # If srcntgts_have_extent, then non-child counts have already been
        # computed, and we have nothing to do here. But if not, then
        # we must fill these non-child counts. This amounts to copying
        # the cumulative counts and setting them to zero for non-leaves.

        # {{{ make sure box_{source,target}_counts_nonchild are not defined
        # (before we overwrite them)

        try:
            box_source_counts_nonchild
        except NameError:
            pass
        else:
            assert False

        try:
            box_target_counts_nonchild
        except NameError:
            pass
        else:
            assert False

        # }}}

        box_source_counts_nonchild, evt = zeros(
                nboxes_post_prune, particle_id_dtype)
        wait_for.append(evt)

        if sources_are_targets:
            box_target_counts_nonchild = box_source_counts_nonchild
        else:
            box_target_counts_nonchild, evt = zeros(
                    nboxes_post_prune, particle_id_dtype)
            wait_for.append(evt)

    fin_debug("compute box info")
    evt = knl_info.box_info_kernel(
            *(
                # input:
                box_parent_ids, box_morton_nrs, bbox, aligned_nboxes,
                box_srcntgt_counts_cumul,
                box_source_counts_cumul, box_target_counts_cumul,
                max_particles_in_box,
                box_levels, nlevels,

                # output if srcntgts_have_extent, input+output otherwise
                box_source_counts_nonchild, box_target_counts_nonchild,

                # output:
                box_child_ids, box_centers, box_flags,
            ),
            range=slice(nboxes_post_prune),
            wait_for=wait_for)

    # }}}

    # {{{ build output

    extra_tree_attrs = {}

    if sources_have_extent:
        extra_tree_attrs.update(source_radii=source_radii)
    if targets_have_extent:
        extra_tree_attrs.update(target_radii=target_radii)

    logger.info("tree build complete")

    return Tree(
            # If you change this, also change the documentation
            # of what's in the tree, above.

            sources_are_targets=sources_are_targets,
            sources_have_extent=sources_have_extent,
            targets_have_extent=targets_have_extent,

            particle_id_dtype=knl_info.particle_id_dtype,
            box_id_dtype=knl_info.box_id_dtype,
            coord_dtype=coord_dtype,
            box_level_dtype=self.box_level_dtype,

            root_extent=root_extent,
            stick_out_factor=stick_out_factor,

            bounding_box=(bbox_min, bbox_max),
            level_start_box_nrs=level_start_box_nrs,
            level_start_box_nrs_dev=cl.array.to_device(
                queue, level_start_box_nrs,
                allocator=allocator),

            sources=sources,
            targets=targets,

            box_source_starts=box_source_starts,
            box_source_counts_nonchild=box_source_counts_nonchild,
            box_source_counts_cumul=box_source_counts_cumul,
            box_target_starts=box_target_starts,
            box_target_counts_nonchild=box_target_counts_nonchild,
            box_target_counts_cumul=box_target_counts_cumul,

            box_parent_ids=box_parent_ids,
            box_child_ids=box_child_ids,
            box_centers=box_centers,
            box_levels=box_levels,
            box_flags=box_flags,

            user_source_ids=user_source_ids,
            sorted_target_ids=sorted_target_ids,

            _is_pruned=is_pruned,

            **extra_tree_attrs
            ).with_queue(None), evt
def __call__(self, queue, tree, wait_for=None, debug=False):
    """Build the FMM traversal (interaction lists) for a pruned *tree*.

    :arg queue: A :class:`pyopencl.CommandQueue` instance.
    :arg tree: A :class:`boxtree.Tree` instance. It must be pruned
        (``tree._is_pruned``), otherwise :exc:`ValueError` is raised.
    :arg wait_for: may either be *None* or a list of :class:`pyopencl.Event`
        instances for whose completion this command waits before starting
        execution.
    :arg debug: if true, ``queue.finish()`` is called before each debug
        log message emitted between the build stages.

    :return: A tuple *(trav, event)*, where *trav* is a new instance of
        :class:`FMMTraversalInfo` and *event* is a :class:`pyopencl.Event`
        for dependency management.
    """

    if not tree._is_pruned:
        raise ValueError("tree must be pruned for traversal generation")

    # Generated code shouldn't depend on the *exact* number of tree levels.
    # So round up to the next multiple of 5.
    from pytools import div_ceil
    max_levels = div_ceil(tree.nlevels, 5) * 5

    knl_info = self.get_kernel_info(
            tree.dimensions, tree.particle_id_dtype, tree.box_id_dtype,
            tree.coord_dtype, tree.box_level_dtype, max_levels,
            tree.sources_are_targets,
            tree.sources_have_extent, tree.targets_have_extent,
            tree.stick_out_factor)

    def fin_debug(s):
        # In debug mode, drain the queue before logging so that the message
        # is emitted only after all previously enqueued work has finished.
        if debug:
            queue.finish()

        logger.debug(s)

    logger.info("start building traversal")

    # {{{ source boxes, their parents, and target boxes

    fin_debug("building list of source boxes, their parents, and target boxes")

    result, evt = knl_info.sources_parents_and_targets_builder(
            queue, tree.nboxes, tree.box_flags.data, wait_for=wait_for)
    # From here on, each stage waits on the event(s) of the stage before it.
    wait_for = [evt]

    source_parent_boxes = result["source_parent_boxes"].lists
    source_boxes = result["source_boxes"].lists
    target_or_target_parent_boxes = result["target_or_target_parent_boxes"].lists

    # When sources and targets coincide, the source box list is reused as the
    # target box list; "target_boxes" is only read from the result otherwise.
    if not tree.sources_are_targets:
        target_boxes = result["target_boxes"].lists
    else:
        target_boxes = source_boxes

    # }}}

    # {{{ figure out level starts in *_parent_boxes

    def extract_level_start_box_nrs(box_list, wait_for):
        # One entry per level plus one trailing entry, all initialized to
        # len(box_list), i.e. "one past the end" of the list.
        result = cl.array.empty(queue,
                tree.nlevels+1, tree.box_id_dtype) \
                        .fill(len(box_list))
        # The kernel is launched over indices 1..len(box_list)-1 only;
        # index 0 is handled separately below.
        evt = knl_info.level_start_box_nrs_extractor(
                tree.level_start_box_nrs_dev,
                tree.box_levels,
                box_list,
                result,
                range=slice(1, len(box_list)),
                queue=queue, wait_for=wait_for)

        result = result.get()

        # We skipped box 0 above. This is always true, whether
        # box 0 (=level 0) is a leaf or a parent.
        result[0] = 0

        # Postprocess result for unoccupied levels: sweep from the deepest
        # level towards level 0, carrying the running minimum, so that no
        # level's start exceeds the start of any deeper level.
        prev_start = len(box_list)
        for ilev in range(tree.nlevels-1, -1, -1):
            result[ilev] = prev_start = \
                    min(result[ilev], prev_start)

        return result, evt

    # Both extractions below wait on the same preceding event; the next
    # stage then waits on both of their events.
    fin_debug("finding level starts in source parent boxes array")
    level_start_source_parent_box_nrs, evt_s = \
            extract_level_start_box_nrs(
                    source_parent_boxes, wait_for=wait_for)

    fin_debug("finding level starts in target or target parent boxes array")
    level_start_target_or_target_parent_box_nrs, evt_t = \
            extract_level_start_box_nrs(
                    target_or_target_parent_boxes, wait_for=wait_for)

    wait_for = [evt_s, evt_t]

    # }}}

    # {{{ colleagues

    fin_debug("finding colleagues")

    result, evt = knl_info.colleagues_builder(
            queue, tree.nboxes,
            tree.box_centers.data, tree.root_extent, tree.box_levels.data,
            tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags.data,
            wait_for=wait_for)
    wait_for = [evt]
    # The colleague lists are inputs to the list 2, 3 and 4 builders below.
    colleagues = result["colleagues"]

    # }}}

    # {{{ neighbor source boxes ("list 1")

    fin_debug("finding neighbor source boxes ('list 1')")

    result, evt = knl_info.neighbor_source_boxes_builder(
            queue, len(target_boxes),
            tree.box_centers.data, tree.root_extent, tree.box_levels.data,
            tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags.data,
            target_boxes.data, wait_for=wait_for)
    wait_for = [evt]
    neighbor_source_boxes = result["neighbor_source_boxes"]

    # }}}

    # {{{ well-separated siblings ("list 2")

    fin_debug("finding well-separated siblings ('list 2')")

    result, evt = knl_info.sep_siblings_builder(
            queue, len(target_or_target_parent_boxes),
            tree.box_centers.data, tree.root_extent, tree.box_levels.data,
            tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags.data,
            target_or_target_parent_boxes.data, tree.box_parent_ids.data,
            colleagues.starts.data, colleagues.lists.data, wait_for=wait_for)
    wait_for = [evt]
    sep_siblings = result["sep_siblings"]

    # }}}

    # {{{ separated smaller ("list 3")

    fin_debug("finding separated smaller ('list 3')")

    result, evt = knl_info.sep_smaller_builder(
            queue, len(target_boxes),
            tree.box_centers.data, tree.root_extent, tree.box_levels.data,
            tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags.data,
            target_boxes.data,
            colleagues.starts.data, colleagues.lists.data,
            wait_for=wait_for)
    wait_for = [evt]
    sep_smaller = result["sep_smaller"]

    # The "close" variant of list 3 is only read from the result when
    # sources or targets have extent; otherwise it is reported as None.
    if tree.sources_have_extent or tree.targets_have_extent:
        sep_close_smaller_starts = result["sep_close_smaller"].starts
        sep_close_smaller_lists = result["sep_close_smaller"].lists
    else:
        sep_close_smaller_starts = None
        sep_close_smaller_lists = None

    # }}}

    # {{{ separated bigger ("list 4")

    fin_debug("finding separated bigger ('list 4')")

    result, evt = knl_info.sep_bigger_builder(
            queue, len(target_or_target_parent_boxes),
            tree.box_centers.data, tree.root_extent, tree.box_levels.data,
            tree.aligned_nboxes, tree.box_child_ids.data, tree.box_flags.data,
            target_or_target_parent_boxes.data, tree.box_parent_ids.data,
            colleagues.starts.data, colleagues.lists.data, wait_for=wait_for)
    wait_for = [evt]
    sep_bigger = result["sep_bigger"]

    # Same treatment as for list 3: the "close" variant only exists when
    # sources or targets have extent.
    if tree.sources_have_extent or tree.targets_have_extent:
        sep_close_bigger_starts = result["sep_close_bigger"].starts
        sep_close_bigger_lists = result["sep_close_bigger"].lists
    else:
        sep_close_bigger_starts = None
        sep_close_bigger_lists = None

    # }}}

    # Single-element unpacking: fails if wait_for does not hold exactly one
    # event (it holds the event of the list 4 builder at this point).
    evt, = wait_for

    logger.info("traversal built")

    return FMMTraversalInfo(
            tree=tree,

            source_boxes=source_boxes,
            target_boxes=target_boxes,

            source_parent_boxes=source_parent_boxes,
            level_start_source_parent_box_nrs=level_start_source_parent_box_nrs,

            target_or_target_parent_boxes=target_or_target_parent_boxes,
            level_start_target_or_target_parent_box_nrs=(
                level_start_target_or_target_parent_box_nrs),

            colleagues_starts=colleagues.starts,
            colleagues_lists=colleagues.lists,

            neighbor_source_boxes_starts=neighbor_source_boxes.starts,
            neighbor_source_boxes_lists=neighbor_source_boxes.lists,

            sep_siblings_starts=sep_siblings.starts,
            sep_siblings_lists=sep_siblings.lists,

            sep_smaller_starts=sep_smaller.starts,
            sep_smaller_lists=sep_smaller.lists,

            sep_close_smaller_starts=sep_close_smaller_starts,
            sep_close_smaller_lists=sep_close_smaller_lists,

            sep_bigger_starts=sep_bigger.starts,
            sep_bigger_lists=sep_bigger.lists,

            sep_close_bigger_starts=sep_close_bigger_starts,
            sep_close_bigger_lists=sep_close_bigger_lists,
            ).with_queue(None), evt
def __call__(self, queue, tree, ball_centers, ball_radii, wait_for=None):
    """Build a lookup from the boxes of *tree* to the balls near them.

    :arg queue: a :class:`pyopencl.CommandQueue`
    :arg tree: a :class:`boxtree.Tree`.
    :arg ball_centers: an object array of coordinate
        :class:`pyopencl.array.Array` instances.
        Their *dtype* must match *tree*'s
        :attr:`boxtree.Tree.coord_dtype`.
    :arg ball_radii: a :class:`pyopencl.array.Array`
        of positive numbers.
        Its *dtype* must match *tree*'s
        :attr:`boxtree.Tree.coord_dtype`.
    :arg wait_for: may either be *None* or a list of :class:`pyopencl.Event`
        instances for whose completion this command waits before starting
        execution.

    :returns: a tuple *(lbl, event)*, where *lbl* is an instance of
        :class:`LeavesToBallsLookup`, and *event* is a :class:`pyopencl.Event`
        for dependency management.
    :raises TypeError: if the dtype of *ball_centers* or *ball_radii*
        differs from ``tree.coord_dtype``.
    """
    from pytools import div_ceil, single_valued

    # {{{ validate input dtypes

    centers_dtype = single_valued(bc.dtype for bc in ball_centers)
    if centers_dtype != tree.coord_dtype:
        raise TypeError("ball_centers dtype must match tree.coord_dtype")
    if ball_radii.dtype != tree.coord_dtype:
        raise TypeError("ball_radii dtype must match tree.coord_dtype")

    # }}}

    ball_id_dtype = tree.particle_id_dtype  # ?

    # The kernel is requested for the level count rounded up to the next
    # multiple of 10 rather than for the exact number of levels.
    max_levels = 10 * div_ceil(tree.nlevels, 10)

    b2l_knl = self.get_balls_to_leaves_kernel(
            tree.dimensions, tree.coord_dtype,
            tree.box_id_dtype, ball_id_dtype,
            max_levels, tree.stick_out_factor)

    # {{{ find the leaves overlapped by each ball

    logger.info("leaves-to-balls lookup: prepare ball list")

    # Positional kernel arguments following (queue, nballs): tree data
    # first, then the radii, then one buffer per coordinate axis.
    kernel_args = [
            tree.box_flags.data, tree.box_centers.data,
            tree.box_child_ids.data, tree.box_levels.data,
            tree.root_extent, tree.aligned_nboxes,
            ball_radii.data,
            ]
    kernel_args.extend(bc.data for bc in ball_centers)

    b2l_result, b2l_evt = b2l_knl(
            queue, len(ball_radii), *kernel_args, wait_for=wait_for)

    # }}}

    # {{{ sort (leaf, ball) pairs by leaf

    logger.info("leaves-to-balls lookup: key-value sort")

    leaf_keys = b2l_result["overlapping_leaves"].lists
    ball_values = b2l_result["ball_numbers"].lists

    starts, lists, sort_evt = self.key_value_sorter(
            queue,
            leaf_keys, ball_values,
            tree.nboxes, starts_dtype=tree.box_id_dtype,
            wait_for=[b2l_evt])

    # }}}

    logger.info("leaves-to-balls lookup: built")

    lookup = LeavesToBallsLookup(
            tree=tree,
            balls_near_box_starts=starts,
            balls_near_box_lists=lists)

    return lookup.with_queue(None), sort_evt
def _insert_outer_entry(entries, axis, order, outer_entry):
    """Insert *outer_entry* into the list *entries* next to index *axis*.

    For *order* ``"C"`` the outer entry is placed in front of *axis*, for
    ``"F"`` it is placed directly behind it. Any other *order* raises
    :exc:`RuntimeError`.
    """
    if order == "F":
        entries.insert(axis+1, outer_entry)
    elif order == "C":
        entries.insert(axis, outer_entry)
    else:
        raise RuntimeError("order '%s' not understood" % (order,))


def split_array_dim(kernel, arrays_and_axes, count, auto_split_inames=True,
        split_kwargs=None):
    """Split one axis of each given array into an inner axis of length
    *count* and an outer axis, and rewrite all accesses accordingly.

    :arg arrays_and_axes: a list of tuples *(array, axis_nr)* indicating
        that the index in *axis_nr* should be split. The tuples may
        also be *(array, axis_nr, "F")*, indicating that the index will
        be split as it would be according to Fortran order.

        *array* may name a temporary variable or an argument.

        If *arrays_and_axes* is a :class:`tuple`, it is automatically
        wrapped in a list, to make single splits easier.

    :arg count: The group size to use in the split. If it is 1, *kernel*
        is returned unchanged.
    :arg auto_split_inames: Whether to automatically split inames
        encountered in the specified indices.
    :arg split_kwargs: arguments to pass to :func:`loopy.split_iname`

    :returns: the transformed kernel.
    :raises RuntimeError: if a split instruction is malformed, if the same
        array is listed more than once, if an order other than ``"C"`` or
        ``"F"`` is given, if the array's ``dim_tags`` are unknown or the
        axis is not fixed-stride, or (with *auto_split_inames*) if the
        index along the split axis is not a single variable.

    Note that splits on the corresponding inames are carried out implicitly.
    The inames may *not* be split beforehand. (There's no *really* good reason
    for this--this routine is just not smart enough to deal with this.)
    """

    if count == 1:
        return kernel

    if split_kwargs is None:
        split_kwargs = {}

    # {{{ process input into array_to_rest

    # where "rest" is the non-argument-name part of the input tuples
    # in arrays_and_axes
    def normalize_rest(rest):
        if len(rest) == 1:
            return (rest[0], "C")
        elif len(rest) == 2:
            return rest
        else:
            # *rest* is itself a tuple, so it has to be wrapped to be
            # formatted as a single value.
            raise RuntimeError("split instruction '%s' not understood"
                    % (rest,))

    if isinstance(arrays_and_axes, tuple):
        arrays_and_axes = [arrays_and_axes]

    array_to_rest = {
            tup[0]: normalize_rest(tup[1:]) for tup in arrays_and_axes}

    # Duplicate array names collapse into one dict entry.
    if len(arrays_and_axes) != len(array_to_rest):
        raise RuntimeError("cannot split multiple axes of the same variable")

    del arrays_and_axes

    # }}}

    # {{{ adjust arrays

    from pytools import div_ceil
    from loopy.kernel.array import FixedStrideArrayDimTag
    from loopy.kernel.tools import ArrayChanger

    for array_name, (axis, order) in six.iteritems(array_to_rest):
        achng = ArrayChanger(kernel, array_name)
        ary = achng.get()

        # {{{ adjust shape

        new_shape = ary.shape
        if new_shape is not None:
            new_shape = list(new_shape)
            axis_len = new_shape[axis]
            # The existing axis becomes the inner one (length *count*);
            # the new outer axis counts the groups.
            new_shape[axis] = count
            _insert_outer_entry(
                    new_shape, axis, order, div_ceil(axis_len, count))
            new_shape = tuple(new_shape)

        # }}}

        # {{{ adjust dim tags

        if ary.dim_tags is None:
            raise RuntimeError("dim_tags of '%s' are not known" % array_name)
        new_dim_tags = list(ary.dim_tags)

        old_dim_tag = ary.dim_tags[axis]
        if not isinstance(old_dim_tag, FixedStrideArrayDimTag):
            raise RuntimeError("axis %d of '%s' is not tagged fixed-stride"
                    % (axis, array_name))

        # The inner axis keeps the old stride; one step along the outer
        # axis skips *count* inner entries.
        outer_stride = count*old_dim_tag.stride
        _insert_outer_entry(
                new_dim_tags, axis, order,
                FixedStrideArrayDimTag(outer_stride))
        new_dim_tags = tuple(new_dim_tags)

        # }}}

        # {{{ adjust dim_names

        new_dim_names = ary.dim_names
        if new_dim_names is not None:
            new_dim_names = list(new_dim_names)
            existing_name = new_dim_names[axis]
            new_dim_names[axis] = existing_name + "_inner"
            _insert_outer_entry(
                    new_dim_names, axis, order, existing_name + "_outer")
            new_dim_names = tuple(new_dim_names)

        # }}}

        kernel = achng.with_changed_array(ary.copy(
            shape=new_shape, dim_tags=new_dim_tags, dim_names=new_dim_names))

    # }}}

    # Maps each iname that needs splitting to its (outer, inner) names.
    split_vars = {}

    var_name_gen = kernel.get_var_name_generator()

    def split_access_axis(expr):
        # Look up the axis and order belonging to *this* array; different
        # arrays may be split along different axes.
        axis_nr, order = array_to_rest[expr.aggregate.name]

        idx = expr.index
        if not isinstance(idx, tuple):
            idx = (idx,)
        idx = list(idx)

        axis_idx = idx[axis_nr]

        if auto_split_inames:
            from pymbolic.primitives import Variable
            if not isinstance(axis_idx, Variable):
                raise RuntimeError("found access '%s' in which axis %d is not a "
                        "single variable--cannot split "
                        "(Have you tried to do the split yourself, manually, "
                        "beforehand? If so, you shouldn't.)"
                        % (expr, axis_nr))

            iname_to_split = axis_idx.name
            assert iname_to_split in kernel.all_inames()

            try:
                outer_iname, inner_iname = split_vars[iname_to_split]
            except KeyError:
                outer_iname = var_name_gen(iname_to_split+"_outer")
                inner_iname = var_name_gen(iname_to_split+"_inner")
                split_vars[iname_to_split] = outer_iname, inner_iname

            inner_index = Variable(inner_iname)
            outer_index = Variable(outer_iname)

        else:
            from loopy.symbolic import simplify_using_aff
            inner_index = simplify_using_aff(kernel, axis_idx % count)
            outer_index = simplify_using_aff(kernel, axis_idx // count)

        idx[axis_nr] = inner_index
        _insert_outer_entry(idx, axis_nr, order, outer_index)

        return expr.aggregate.index(tuple(idx))

    rule_mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, var_name_gen)
    aash = ArrayAxisSplitHelper(rule_mapping_context,
            set(six.iterkeys(array_to_rest)), split_access_axis)
    kernel = rule_mapping_context.finish_kernel(aash.map_kernel(kernel))

    if auto_split_inames:
        from loopy import split_iname
        for iname, (outer_iname, inner_iname) in six.iteritems(split_vars):
            kernel = split_iname(kernel, iname, count,
                    outer_iname=outer_iname, inner_iname=inner_iname,
                    **split_kwargs)

    return kernel
def test_mem_access_counter_specialops():
    """Check sub-group-granularity global memory access counts for a kernel
    whose instructions use the modulo and power operators."""

    instructions = [
        """
        c[i, j, k] = (2*a[i,j,k])%(2+b[i,j,k]/3.0)
        e[i, k] = (1+g[i,k])**(1+h[i,k+1])
        """
    ]
    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            instructions,
            name="specialops", assumptions="n,m,ell >= 1")
    knl = lp.add_and_infer_dtypes(
            knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))

    mem_map = lp.get_mem_access_map(
            knl, count_redundant_work=True, subgroup_size=SGS)

    n, m, ell = 512, 256, 128
    params = {'n': n, 'm': m, 'ell': ell}

    n_workgroups = 1
    group_size = 1
    n_subgroups = n_workgroups*div_ceil(group_size, SGS)

    def uniform_count(dtype, direction, variable):
        # Evaluate the count of one uniform (stride-free) global access,
        # counted once per sub-group.
        access = lp.MemAccess('global', dtype,
                              lid_strides={}, gid_strides={},
                              direction=direction, variable=variable,
                              count_granularity=CG.SUBGROUP)
        return mem_map[access].eval_with_dict(params)

    # {{{ loads

    f32_loads = (uniform_count(np.float32, 'load', 'a')
                 + uniform_count(np.float32, 'load', 'b'))
    f64_loads = (uniform_count(np.dtype(np.float64), 'load', 'g')
                 + uniform_count(np.dtype(np.float64), 'load', 'h'))

    # uniform: (count-per-sub-group)*n_subgroups
    assert f32_loads == (2*n*m*ell)*n_subgroups
    assert f64_loads == (2*n*m)*n_subgroups

    # }}}

    # {{{ stores

    f32_stores = uniform_count(np.float32, 'store', 'c')
    f64_stores = uniform_count(np.float64, 'store', 'e')

    # uniform: (count-per-sub-group)*n_subgroups
    assert f32_stores == (n*m*ell)*n_subgroups
    assert f64_stores == (n*m)*n_subgroups

    # }}}

    # {{{ filtered total

    loads_of_a_and_g = mem_map.filter_by(
            direction=['load'], variable=['a', 'g'],
            count_granularity=CG.SUBGROUP)

    # uniform: (count-per-sub-group)*n_subgroups
    assert loads_of_a_and_g.eval_and_sum(params) == (n*m*ell + n*m)*n_subgroups

    # }}}