def test_all_counters_parallel_matmul():

    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                "c[i, j] = sum(k, a[i, k]*b[k, j])"
            ],
            name="matmul", assumptions="n,m,l >= 1")
    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.0")
    knl = lp.split_iname(knl, "k", 16)
    knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"])
    knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"])

    n = 512
    m = 256
    l = 128
    params = {'n': n, 'm': m, 'l': l}

    sync_map = lp.get_synchronization_map(knl)
    assert len(sync_map) == 2
    assert sync_map["kernel_launch"].eval_with_dict(params) == 1
    assert sync_map["barrier_local"].eval_with_dict(params) == 2 * m / 16

    op_map = lp.get_op_map(knl)
    f32mul = op_map[lp.Op(np.float32, 'mul')].eval_with_dict(params)
    f32add = op_map[lp.Op(np.float32, 'add')].eval_with_dict(params)
    i32ops = op_map[lp.Op(np.int32, 'add')].eval_with_dict(params)
    i32ops += op_map[lp.Op(np.dtype(np.int32), 'mul')].eval_with_dict(params)

    assert f32mul + f32add == n * m * l * 2

    # (renamed from op_map: this second map counts memory accesses, not ops)
    mem_map = lp.get_mem_access_map(knl)

    f32coal = mem_map[lp.MemAccess('global', np.float32,
                      stride=1, direction='load', variable='b')
                      ].eval_with_dict(params)
    f32coal += mem_map[lp.MemAccess('global', np.float32,
                       stride=1, direction='load', variable='a')
                       ].eval_with_dict(params)

    assert f32coal == n * m + m * l

    f32coal = mem_map[lp.MemAccess('global', np.float32,
                      stride=1, direction='store', variable='c')
                      ].eval_with_dict(params)

    assert f32coal == n * l

    local_mem_map = lp.get_mem_access_map(knl).filter_by(mtype=['local'])
    local_mem_l = local_mem_map[lp.MemAccess('local', np.dtype(np.float32),
                                direction='load')
                                ].eval_with_dict(params)
    assert local_mem_l == n * m * l * 2
def test_mem_access_counter_logic():

    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                """
                e[i,k] = if(not(k<l-2) and k>6 or k/2==l, g[i,k]*2, g[i,k]+h[i,k]/2)
                """
            ],
            name="logic", assumptions="n,m,l >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))

    mem_map = lp.get_mem_access_map(knl)

    n = 512
    m = 256
    l = 128
    params = {'n': n, 'm': m, 'l': l}

    reduced_map = mem_map.group_by('mtype', 'dtype', 'direction')

    f32_g_l = reduced_map[lp.MemAccess('global', to_loopy_type(np.float32),
                          direction='load')].eval_with_dict(params)
    f64_g_l = reduced_map[lp.MemAccess('global', to_loopy_type(np.float64),
                          direction='load')].eval_with_dict(params)
    f64_g_s = reduced_map[lp.MemAccess('global', to_loopy_type(np.float64),
                          direction='store')].eval_with_dict(params)

    assert f32_g_l == 2 * n * m
    assert f64_g_l == n * m
    assert f64_g_s == n * m
def test_mem_access_counter_specialops():

    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                """
                c[i, j, k] = (2*a[i,j,k])%(2+b[i,j,k]/3.0)
                e[i, k] = (1+g[i,k])**(1+h[i,k+1])
                """
            ],
            name="specialops", assumptions="n,m,l >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32,
                                            g=np.float64, h=np.float64))

    mem_map = lp.get_mem_access_map(knl)

    n = 512
    m = 256
    l = 128
    params = {'n': n, 'm': m, 'l': l}

    f32 = mem_map[lp.MemAccess('global', np.float32,
                  stride=0, direction='load', variable='a')
                  ].eval_with_dict(params)
    f32 += mem_map[lp.MemAccess('global', np.float32,
                   stride=0, direction='load', variable='b')
                   ].eval_with_dict(params)
    f64 = mem_map[lp.MemAccess('global', np.dtype(np.float64),
                  stride=0, direction='load', variable='g')
                  ].eval_with_dict(params)
    f64 += mem_map[lp.MemAccess('global', np.dtype(np.float64),
                   stride=0, direction='load', variable='h')
                   ].eval_with_dict(params)

    assert f32 == 2 * n * m * l
    assert f64 == 2 * n * m

    f32 = mem_map[lp.MemAccess('global', np.float32,
                  stride=0, direction='store', variable='c')
                  ].eval_with_dict(params)
    f64 = mem_map[lp.MemAccess('global', np.float64,
                  stride=0, direction='store', variable='e')
                  ].eval_with_dict(params)

    assert f32 == n * m * l
    assert f64 == n * m

    filtered_map = mem_map.filter_by(direction=['load'], variable=['a', 'g'])
    #tot = lp.eval_and_sum_polys(filtered_map, params)
    tot = filtered_map.eval_and_sum(params)

    assert tot == n * m * l + n * m
def test_mem_access_counter_nonconsec():

    knl = lp.make_kernel(
            "[n,m,ell] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                """
                c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
                e[i, k] = g[i,k]*(2+h[i,k])
                """
            ],
            name="nonconsec", assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32,
                                            g=np.float64, h=np.float64))
    knl = lp.split_iname(knl, "i", 16)
    knl = lp.tag_inames(knl, {"i_inner": "l.0", "i_outer": "g.0"})

    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True)  # noqa

    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}

    f64nonconsec = mem_map[lp.MemAccess('global', np.float64,
                           stride=Variable('m'), direction='load',
                           variable='g')
                           ].eval_with_dict(params)
    f64nonconsec += mem_map[lp.MemAccess('global', np.float64,
                            stride=Variable('m'), direction='load',
                            variable='h')
                            ].eval_with_dict(params)
    f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32),
                           stride=Variable('m')*Variable('ell'),
                           direction='load', variable='a')
                           ].eval_with_dict(params)
    f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32),
                            stride=Variable('m')*Variable('ell'),
                            direction='load', variable='b')
                            ].eval_with_dict(params)

    assert f64nonconsec == 2 * n * m
    assert f32nonconsec == 3 * n * m * ell

    f64nonconsec = mem_map[lp.MemAccess('global', np.float64,
                           stride=Variable('m'), direction='store',
                           variable='e')
                           ].eval_with_dict(params)
    f32nonconsec = mem_map[lp.MemAccess('global', np.float32,
                           stride=Variable('m')*Variable('ell'),
                           direction='store', variable='c')
                           ].eval_with_dict(params)

    assert f64nonconsec == n * m
    assert f32nonconsec == n * m * ell
def test_mem_access_counter_consec():

    knl = lp.make_kernel(
            "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                """
                c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
                e[i, k] = g[i,k]*(2+h[i,k])
                """
            ],
            name="consec", assumptions="n,m,l >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32,
                                            g=np.float64, h=np.float64))
    knl = lp.tag_inames(knl, {"k": "l.0", "i": "g.0", "j": "g.1"})

    mem_map = lp.get_mem_access_map(knl)

    n = 512
    m = 256
    l = 128
    params = {'n': n, 'm': m, 'l': l}

    f64consec = mem_map[lp.MemAccess('global', np.float64,
                        stride=1, direction='load', variable='g')
                        ].eval_with_dict(params)
    f64consec += mem_map[lp.MemAccess('global', np.float64,
                         stride=1, direction='load', variable='h')
                         ].eval_with_dict(params)
    f32consec = mem_map[lp.MemAccess('global', np.float32,
                        stride=1, direction='load', variable='a')
                        ].eval_with_dict(params)
    f32consec += mem_map[lp.MemAccess('global', np.dtype(np.float32),
                         stride=1, direction='load', variable='b')
                         ].eval_with_dict(params)

    assert f64consec == 2 * n * m
    assert f32consec == 3 * n * m * l

    f64consec = mem_map[lp.MemAccess('global', np.float64,
                        stride=1, direction='store', variable='e')
                        ].eval_with_dict(params)
    f32consec = mem_map[lp.MemAccess('global', np.float32,
                        stride=1, direction='store', variable='c')
                        ].eval_with_dict(params)

    assert f64consec == n * m
    assert f32consec == n * m * l
def test_mem_access_counter_basic():

    knl = lp.make_kernel(
            "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                """
                c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
                e[i, k] = g[i,k]*h[i,k+1]
                """
            ],
            name="basic", assumptions="n,m,l >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32,
                                            g=np.float64, h=np.float64))

    mem_map = lp.get_mem_access_map(knl)

    n = 512
    m = 256
    l = 128
    params = {'n': n, 'm': m, 'l': l}

    f32l = mem_map[lp.MemAccess('global', np.float32,
                   stride=0, direction='load', variable='a')
                   ].eval_with_dict(params)
    f32l += mem_map[lp.MemAccess('global', np.float32,
                    stride=0, direction='load', variable='b')
                    ].eval_with_dict(params)
    f64l = mem_map[lp.MemAccess('global', np.float64,
                   stride=0, direction='load', variable='g')
                   ].eval_with_dict(params)
    f64l += mem_map[lp.MemAccess('global', np.float64,
                    stride=0, direction='load', variable='h')
                    ].eval_with_dict(params)

    assert f32l == 3 * n * m * l
    assert f64l == 2 * n * m

    f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32),
                   stride=0, direction='store', variable='c')
                   ].eval_with_dict(params)
    f64s = mem_map[lp.MemAccess('global', np.dtype(np.float64),
                   stride=0, direction='store', variable='e')
                   ].eval_with_dict(params)

    assert f32s == n * m * l
    assert f64s == n * m
def test_mem_access_counter_reduction():

    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                "c[i, j] = sum(k, a[i, k]*b[k, j])"
            ],
            name="matmul", assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))

    subgroup_size = 32

    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
                                    subgroup_size=subgroup_size)

    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}

    n_workgroups = 1
    group_size = 1
    subgroups_per_group = div_ceil(group_size, subgroup_size)

    f32l = mem_map[lp.MemAccess('global', np.float32,
                   lid_strides={}, gid_strides={},
                   direction='load', variable='a',
                   count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)
    f32l += mem_map[lp.MemAccess('global', np.float32,
                    lid_strides={}, gid_strides={},
                    direction='load', variable='b',
                    count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)

    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
    assert f32l == (2*n*m*ell)*n_workgroups*subgroups_per_group

    f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32),
                   lid_strides={}, gid_strides={},
                   direction='store', variable='c',
                   count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)

    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
    assert f32s == (n*ell)*n_workgroups*subgroups_per_group

    ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load']
                                 ).to_bytes().eval_and_sum(params)
    st_bytes = mem_map.filter_by(mtype=['global'], direction=['store']
                                 ).to_bytes().eval_and_sum(params)

    assert ld_bytes == 4*f32l
    assert st_bytes == 4*f32s
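def test_subgroup_rounding_sketch():
    # A small companion check (an added sketch, not part of the original
    # suite): the "uniform" assertions above rely on div_ceil rounding up,
    # so even a one-work-item workgroup is charged a full subgroup.
    from pytools import div_ceil
    assert div_ceil(1, 32) == 1   # a 1-work-item group occupies one subgroup
    assert div_ceil(33, 32) == 2  # a 33-work-item group would occupy two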
def test_mem_access_counter_bitwise():

    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                """
                c[i, j, k] = (a[i,j,k] | 1) + (b[i,j,k] & 1)
                e[i, k] = (g[i,k] ^ k)*(~h[i,k+1]) + (g[i, k] << (h[i,k] >> k))
                """
            ],
            name="bitwise", assumptions="n,m,l >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(a=np.int32, b=np.int32,
                                            g=np.int32, h=np.int32))

    mem_map = lp.get_mem_access_map(knl)

    n = 512
    m = 256
    l = 128
    params = {'n': n, 'm': m, 'l': l}

    i32 = mem_map[lp.MemAccess('global', np.int32,
                  stride=0, direction='load', variable='a')
                  ].eval_with_dict(params)
    i32 += mem_map[lp.MemAccess('global', np.int32,
                   stride=0, direction='load', variable='b')
                   ].eval_with_dict(params)
    i32 += mem_map[lp.MemAccess('global', np.int32,
                   stride=0, direction='load', variable='g')
                   ].eval_with_dict(params)
    i32 += mem_map[lp.MemAccess('global', np.dtype(np.int32),
                   stride=0, direction='load', variable='h')
                   ].eval_with_dict(params)

    assert i32 == 4 * n * m + 2 * n * m * l

    i32 = mem_map[lp.MemAccess('global', np.int32,
                  stride=0, direction='store', variable='c')
                  ].eval_with_dict(params)
    i32 += mem_map[lp.MemAccess('global', np.int32,
                   stride=0, direction='store', variable='e')
                   ].eval_with_dict(params)

    assert i32 == n * m + n * m * l
def test_mem_access_counter_logic():

    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                """
                e[i,k] = if(not(k<ell-2) and k>6 or k/2==ell, g[i,k]*2, g[i,k]+h[i,k]/2)
                """
            ],
            name="logic", assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))

    subgroup_size = 32

    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
                                    subgroup_size=subgroup_size)

    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}

    n_workgroups = 1
    group_size = 1
    subgroups_per_group = div_ceil(group_size, subgroup_size)

    reduced_map = mem_map.group_by('mtype', 'dtype', 'direction')

    f32_g_l = reduced_map[lp.MemAccess('global', to_loopy_type(np.float32),
                          direction='load')].eval_with_dict(params)
    f64_g_l = reduced_map[lp.MemAccess('global', to_loopy_type(np.float64),
                          direction='load')].eval_with_dict(params)
    f64_g_s = reduced_map[lp.MemAccess('global', to_loopy_type(np.float64),
                          direction='store')].eval_with_dict(params)

    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
    assert f32_g_l == (2*n*m)*n_workgroups*subgroups_per_group
    assert f64_g_l == (n*m)*n_workgroups*subgroups_per_group
    assert f64_g_s == (n*m)*n_workgroups*subgroups_per_group
def test_mem_access_counter_reduction():

    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                "c[i, j] = sum(k, a[i, k]*b[k, j])"
            ],
            name="matmul", assumptions="n,m,l >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))

    mem_map = lp.get_mem_access_map(knl)

    n = 512
    m = 256
    l = 128
    params = {'n': n, 'm': m, 'l': l}

    f32l = mem_map[lp.MemAccess('global', np.float32,
                   stride=0, direction='load', variable='a')
                   ].eval_with_dict(params)
    f32l += mem_map[lp.MemAccess('global', np.float32,
                    stride=0, direction='load', variable='b')
                    ].eval_with_dict(params)

    assert f32l == 2 * n * m * l

    f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32),
                   stride=0, direction='store', variable='c')
                   ].eval_with_dict(params)

    assert f32s == n * l

    ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load']
                                 ).to_bytes().eval_and_sum(params)
    st_bytes = mem_map.filter_by(mtype=['global'], direction=['store']
                                 ).to_bytes().eval_and_sum(params)

    assert ld_bytes == 4 * f32l
    assert st_bytes == 4 * f32s
def test_summations_and_filters():

    knl = lp.make_kernel(
            "[n,m,ell] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                """
                c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
                e[i, k+1] = -g[i,k]*h[i,k+1]
                """
            ],
            name="basic", assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32,
                                            g=np.float64, h=np.float64))

    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}

    n_workgroups = 1
    group_size = 1
    subgroups_per_group = div_ceil(group_size, SGS)
    n_subgroups = n_workgroups*subgroups_per_group

    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
                                    subgroup_size=SGS)

    loads_a = mem_map.filter_by(direction=['load'], variable=['a'],
                                count_granularity=[CG.SUBGROUP]
                                ).eval_and_sum(params)

    # uniform: (count-per-sub-group)*n_subgroups
    assert loads_a == (2*n*m*ell)*n_subgroups

    global_stores = mem_map.filter_by(mtype=['global'], direction=['store'],
                                      count_granularity=[CG.SUBGROUP]
                                      ).eval_and_sum(params)

    # uniform: (count-per-sub-group)*n_subgroups
    assert global_stores == (n*m*ell + n*m)*n_subgroups

    ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load'],
                                 count_granularity=[CG.SUBGROUP]
                                 ).to_bytes().eval_and_sum(params)
    st_bytes = mem_map.filter_by(mtype=['global'], direction=['store'],
                                 count_granularity=[CG.SUBGROUP]
                                 ).to_bytes().eval_and_sum(params)

    # uniform: (count-per-sub-group)*n_subgroups
    assert ld_bytes == (4*n*m*ell*3 + 8*n*m*2)*n_subgroups
    assert st_bytes == (4*n*m*ell + 8*n*m)*n_subgroups

    # ignore stride and variable names in this map
    reduced_map = mem_map.group_by('mtype', 'dtype', 'direction')

    f32lall = reduced_map[lp.MemAccess('global', np.float32,
                          direction='load')].eval_with_dict(params)
    f64lall = reduced_map[lp.MemAccess('global', np.float64,
                          direction='load')].eval_with_dict(params)

    # uniform: (count-per-sub-group)*n_subgroups
    assert f32lall == (3*n*m*ell)*n_subgroups
    assert f64lall == (2*n*m)*n_subgroups

    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)
    #for k, v in op_map.items():
    #    print(type(k), "\n", k.name, k.dtype, type(k.dtype), " :\n", v)

    op_map_dtype = op_map.group_by('dtype')
    f32 = op_map_dtype[lp.Op(dtype=np.float32)].eval_with_dict(params)
    f64 = op_map_dtype[lp.Op(dtype=np.float64)].eval_with_dict(params)
    i32 = op_map_dtype[lp.Op(dtype=np.int32)].eval_with_dict(params)
    assert f32 == n*m*ell*3
    assert f64 == n*m
    assert i32 == n*m*2

    addsub_all = op_map.filter_by(name=['add', 'sub']).eval_and_sum(params)
    f32ops_all = op_map.filter_by(dtype=[np.float32]).eval_and_sum(params)
    assert addsub_all == n*m*ell + n*m*2
    assert f32ops_all == n*m*ell*3

    non_field = op_map.filter_by(xxx=[np.float32]).eval_and_sum(params)
    assert non_field == 0

    ops_nodtype = op_map.group_by('name')
    ops_noname = op_map.group_by('dtype')

    mul_all = ops_nodtype[lp.Op(name='mul')].eval_with_dict(params)
    f64ops_all = ops_noname[lp.Op(dtype=np.float64)].eval_with_dict(params)

    assert mul_all == n*m*ell + n*m
    assert f64ops_all == n*m

    def func_filter(key):
        return key.lid_strides == {} and key.dtype == to_loopy_type(np.float64) \
                and key.direction == 'load'

    f64l = mem_map.filter_by_func(func_filter).eval_and_sum(params)

    # uniform: (count-per-sub-group)*n_subgroups
    assert f64l == (2*n*m)*n_subgroups
# peek at generated code
evt, (out, ) = knl(queue, a=x_vec_host)

knl = lp.make_kernel("{ [i]: 0<=i<n }", "a[i] = 0", assumptions="n>=1")
knl = lp.split_iname(knl, "i", 16)  # split loop variable
knl = lp.prioritize_loops(knl, "i_outer,i_inner")
knl = lp.set_options(knl, "write_cl")
evt, (out, ) = knl(queue, a=x_vec_dev)

knl = lp.make_kernel("{ [i]: 0<=i<n }", "a[i] = a[i] * b[i] + c[i]",
                     assumptions="n>=0 and n mod 4 = 0")
orig_knl = knl  # copy kernel, then test assumptions and unrolling
knl = lp.split_iname(knl, "i", 4)
knl = lp.tag_inames(knl, dict(i_inner="unr"))
knl = lp.prioritize_loops(knl, "i_outer,i_inner")
knl = lp.set_options(knl, "write_cl")
evt, (out, ) = knl(queue, a=x_vec_dev, b=y_vec_dev, c=z_vec_dev)

from warnings import resetwarnings, filterwarnings
resetwarnings()  # suppress some warnings during stats collection
filterwarnings('ignore', category=Warning)

knl = lp.add_and_infer_dtypes(knl,
                              dict(a=np.float32, b=np.float32, c=np.float32))
op_map = lp.get_op_map(knl)  # count operations
print(lp.stringify_stats_mapping(op_map))

mem_map = lp.get_mem_access_map(knl)  # count memory accesses (loads, stores)
print(lp.stringify_stats_mapping(mem_map))
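# A short follow-up sketch (uses the op_map/mem_map computed above; the
# problem size below is a hypothetical value): the counts are symbolic
# polynomials in the kernel parameters, so they can be evaluated for a
# concrete size and summed.
params = {"n": 16384}  # hypothetical problem size, satisfies n mod 4 = 0
f32_ops = op_map.filter_by(dtype=[np.float32]).eval_and_sum(params)
bytes_loaded = mem_map.filter_by(direction=["load"]).to_bytes() \
        .eval_and_sum(params)
print(f32_ops, bytes_loaded)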
def _cache_kernel_stats(self, t_unit: lp.TranslationUnit, kwargs: dict) \
        -> tuple:
    """Generate the kernel stats for a program with its args."""
    args_tuple = tuple(
        (key, value.shape) if hasattr(value, "shape") else (key, value)
        for key, value in kwargs.items())

    # Are kernel stats already in the cache?
    try:
        self.kernel_stats[t_unit][args_tuple]
        return args_tuple
    except KeyError:
        # If not, calculate and cache the stats
        ep_name = t_unit.default_entrypoint.name
        executor = t_unit.target.get_kernel_executor(t_unit, self.queue,
                                                     entrypoint=ep_name)
        info = executor.translation_unit_info(
            ep_name, executor.arg_to_dtype_set(kwargs))

        typed_t_unit = executor.get_typed_and_scheduled_translation_unit(
            ep_name, executor.arg_to_dtype_set(kwargs))
        kernel = typed_t_unit[ep_name]

        idi = info.implemented_data_info

        param_dict = kwargs.copy()
        param_dict.update({k: None for k in kernel.arg_dict.keys()
                           if k not in param_dict})

        param_dict.update(
            {d.name: None for d in idi if d.name not in param_dict})

        # Generate the wrapper code
        wrapper = executor.get_wrapper_generator()
        gen = PythonFunctionGenerator("_mcom_gen_args_profile",
                                      list(param_dict))

        wrapper.generate_integer_arg_finding_from_shapes(gen, kernel, idi)
        wrapper.generate_integer_arg_finding_from_offsets(gen, kernel, idi)
        wrapper.generate_integer_arg_finding_from_strides(gen, kernel, idi)

        param_names = kernel.all_params()
        gen("return {%s}" % ", ".join(
            f"{repr(name)}: {name}" for name in param_names))

        # Run the wrapper code, save argument values in domain_params
        domain_params = gen.get_picklable_function()(**param_dict)

        # Get flops/memory statistics
        op_map = lp.get_op_map(typed_t_unit, subgroup_size="guess")
        bytes_accessed = lp.get_mem_access_map(
            typed_t_unit, subgroup_size="guess") \
            .to_bytes().eval_and_sum(domain_params)

        flops = op_map.filter_by(
            dtype=[np.float32, np.float64]).eval_and_sum(domain_params)

        # Footprint gathering is not yet available in loopy with
        # kernel callables:
        # https://github.com/inducer/loopy/issues/399
        if 0:
            try:
                footprint = lp.gather_access_footprint_bytes(typed_t_unit)
                footprint_bytes = sum(
                    footprint[k].eval_with_dict(domain_params)
                    for k in footprint)
            except lp.symbolic.UnableToDetermineAccessRange:
                footprint_bytes = None
        else:
            footprint_bytes = None

        res = SingleCallKernelProfile(time=0, flops=flops,
                                      bytes_accessed=bytes_accessed,
                                      footprint_bytes=footprint_bytes)

        self.kernel_stats.setdefault(t_unit, {})[args_tuple] = res

    if self.logmgr:
        if f"{ep_name}_time" not in self.logmgr.quantity_data:
            self.logmgr.add_quantity(KernelProfile(self, ep_name))

    return args_tuple
def test_mem_access_counter_consec():

    knl = lp.make_kernel(
            "[n,m,ell] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                """
                c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
                e[i, k] = g[i,k]*(2+h[i,k])
                """
            ],
            name="consec", assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32,
                                            g=np.float64, h=np.float64))
    knl = lp.tag_inames(knl, {"k": "l.0", "i": "g.0", "j": "g.1"})

    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
                                    subgroup_size='guess')

    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}

    f64consec = mem_map[lp.MemAccess(
        'global', np.float64,
        lid_strides={0: 1}, gid_strides={0: Variable('m')},
        direction='load', variable='g',
        count_granularity=CG.WORKITEM)].eval_with_dict(params)
    f64consec += mem_map[lp.MemAccess(
        'global', np.float64,
        lid_strides={0: 1}, gid_strides={0: Variable('m')},
        direction='load', variable='h',
        count_granularity=CG.WORKITEM)].eval_with_dict(params)
    f32consec = mem_map[lp.MemAccess(
        'global', np.float32,
        lid_strides={0: 1},
        gid_strides={0: Variable('m')*Variable('ell'), 1: Variable('m')},
        direction='load', variable='a',
        count_granularity=CG.WORKITEM)].eval_with_dict(params)
    f32consec += mem_map[lp.MemAccess(
        'global', np.dtype(np.float32),
        lid_strides={0: 1},
        gid_strides={0: Variable('m')*Variable('ell'), 1: Variable('m')},
        direction='load', variable='b',
        count_granularity=CG.WORKITEM)].eval_with_dict(params)

    assert f64consec == 2*n*m*ell
    assert f32consec == 3*n*m*ell

    f64consec = mem_map[lp.MemAccess(
        'global', np.float64,
        lid_strides={0: 1}, gid_strides={0: Variable('m')},
        direction='store', variable='e',
        count_granularity=CG.WORKITEM)].eval_with_dict(params)
    f32consec = mem_map[lp.MemAccess(
        'global', np.float32,
        lid_strides={0: 1},
        gid_strides={0: Variable('m')*Variable('ell'), 1: Variable('m')},
        direction='store', variable='c',
        count_granularity=CG.WORKITEM)].eval_with_dict(params)

    assert f64consec == n*m*ell
    assert f32consec == n*m*ell
def test_mem_access_counter_bitwise():

    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                """
                c[i, j, k] = (a[i,j,k] | 1) + (b[i,j,k] & 1)
                e[i, k] = (g[i,k] ^ k)*(~h[i,k+1]) + (g[i, k] << (h[i,k] >> k))
                """
            ],
            name="bitwise", assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(a=np.int32, b=np.int32,
                                            g=np.int32, h=np.int32))

    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
                                    subgroup_size=SGS)

    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}

    n_workgroups = 1
    group_size = 1
    subgroups_per_group = div_ceil(group_size, SGS)
    n_subgroups = n_workgroups*subgroups_per_group

    i32 = mem_map[lp.MemAccess('global', np.int32,
                  lid_strides={}, gid_strides={},
                  direction='load', variable='a',
                  count_granularity=CG.SUBGROUP)
                  ].eval_with_dict(params)
    i32 += mem_map[lp.MemAccess('global', np.int32,
                   lid_strides={}, gid_strides={},
                   direction='load', variable='b',
                   count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)
    i32 += mem_map[lp.MemAccess('global', np.int32,
                   lid_strides={}, gid_strides={},
                   direction='load', variable='g',
                   count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)
    i32 += mem_map[lp.MemAccess('global', np.dtype(np.int32),
                   lid_strides={}, gid_strides={},
                   direction='load', variable='h',
                   count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)

    # uniform: (count-per-sub-group)*n_subgroups
    assert i32 == (4*n*m+2*n*m*ell)*n_subgroups

    i32 = mem_map[lp.MemAccess('global', np.int32,
                  lid_strides={}, gid_strides={},
                  direction='store', variable='c',
                  count_granularity=CG.SUBGROUP)
                  ].eval_with_dict(params)
    i32 += mem_map[lp.MemAccess('global', np.int32,
                   lid_strides={}, gid_strides={},
                   direction='store', variable='e',
                   count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)

    # uniform: (count-per-sub-group)*n_subgroups
    assert i32 == (n*m+n*m*ell)*n_subgroups
def test_mem_access_counter_basic():

    knl = lp.make_kernel(
            "[n,m,ell] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                """
                c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
                e[i, k] = g[i,k]*h[i,k+1]
                """
            ],
            name="basic", assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32,
                                            g=np.float64, h=np.float64))

    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
                                    subgroup_size=SGS)

    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}

    n_workgroups = 1
    group_size = 1
    subgroups_per_group = div_ceil(group_size, SGS)
    n_subgroups = n_workgroups*subgroups_per_group

    f32l = mem_map[lp.MemAccess('global', np.float32,
                   lid_strides={}, gid_strides={},
                   direction='load', variable='a',
                   count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)
    f32l += mem_map[lp.MemAccess('global', np.float32,
                    lid_strides={}, gid_strides={},
                    direction='load', variable='b',
                    count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)
    f64l = mem_map[lp.MemAccess('global', np.float64,
                   lid_strides={}, gid_strides={},
                   direction='load', variable='g',
                   count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)
    f64l += mem_map[lp.MemAccess('global', np.float64,
                    lid_strides={}, gid_strides={},
                    direction='load', variable='h',
                    count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)

    # uniform: (count-per-sub-group)*n_subgroups
    assert f32l == (3*n*m*ell)*n_subgroups
    assert f64l == (2*n*m)*n_subgroups

    f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32),
                   lid_strides={}, gid_strides={},
                   direction='store', variable='c',
                   count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)
    f64s = mem_map[lp.MemAccess('global', np.dtype(np.float64),
                   lid_strides={}, gid_strides={},
                   direction='store', variable='e',
                   count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)

    # uniform: (count-per-sub-group)*n_subgroups
    assert f32s == (n*m*ell)*n_subgroups
    assert f64s == (n*m)*n_subgroups
def test_mem_access_counter_mixed():

    knl = lp.make_kernel(
            "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                """
                c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]+x[i,k]
                e[i, k] = g[i,k]*(2+h[i,k])
                """
            ],
            name="mixed", assumptions="n,m,l >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32,
                                            g=np.float64, h=np.float64,
                                            x=np.float32))

    threads = 16
    knl = lp.split_iname(knl, "j", threads)
    knl = lp.tag_inames(knl, {"j_inner": "l.0", "j_outer": "g.0"})

    mem_map = lp.get_mem_access_map(knl)  # noqa

    n = 512
    m = 256
    l = 128
    params = {'n': n, 'm': m, 'l': l}

    f64uniform = mem_map[lp.MemAccess('global', np.float64,
                         stride=0, direction='load', variable='g')
                         ].eval_with_dict(params)
    f64uniform += mem_map[lp.MemAccess('global', np.float64,
                          stride=0, direction='load', variable='h')
                          ].eval_with_dict(params)
    f32uniform = mem_map[lp.MemAccess('global', np.float32,
                         stride=0, direction='load', variable='x')
                         ].eval_with_dict(params)
    f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32),
                           stride=Variable('m'), direction='load',
                           variable='a')
                           ].eval_with_dict(params)
    f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32),
                            stride=Variable('m'), direction='load',
                            variable='b')
                            ].eval_with_dict(params)

    assert f64uniform == 2 * n * m
    assert f32uniform == n * m * l / threads
    assert f32nonconsec == 3 * n * m * l

    f64uniform = mem_map[lp.MemAccess('global', np.float64,
                         stride=0, direction='store', variable='e')
                         ].eval_with_dict(params)
    f32nonconsec = mem_map[lp.MemAccess('global', np.float32,
                           stride=Variable('m'), direction='store',
                           variable='c')
                           ].eval_with_dict(params)

    assert f64uniform == n * m
    assert f32nonconsec == n * m * l
def test_mem_access_counter_specialops():

    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                """
                c[i, j, k] = (2*a[i,j,k])%(2+b[i,j,k]/3.0)
                e[i, k] = (1+g[i,k])**(1+h[i,k+1])
                """
            ],
            name="specialops", assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32,
                                            g=np.float64, h=np.float64))

    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
                                    subgroup_size=SGS)

    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}

    n_workgroups = 1
    group_size = 1
    subgroups_per_group = div_ceil(group_size, SGS)
    n_subgroups = n_workgroups*subgroups_per_group

    f32 = mem_map[lp.MemAccess('global', np.float32,
                  lid_strides={}, gid_strides={},
                  direction='load', variable='a',
                  count_granularity=CG.SUBGROUP)
                  ].eval_with_dict(params)
    f32 += mem_map[lp.MemAccess('global', np.float32,
                   lid_strides={}, gid_strides={},
                   direction='load', variable='b',
                   count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)
    f64 = mem_map[lp.MemAccess('global', np.dtype(np.float64),
                  lid_strides={}, gid_strides={},
                  direction='load', variable='g',
                  count_granularity=CG.SUBGROUP)
                  ].eval_with_dict(params)
    f64 += mem_map[lp.MemAccess('global', np.dtype(np.float64),
                   lid_strides={}, gid_strides={},
                   direction='load', variable='h',
                   count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)

    # uniform: (count-per-sub-group)*n_subgroups
    assert f32 == (2*n*m*ell)*n_subgroups
    assert f64 == (2*n*m)*n_subgroups

    f32 = mem_map[lp.MemAccess('global', np.float32,
                  lid_strides={}, gid_strides={},
                  direction='store', variable='c',
                  count_granularity=CG.SUBGROUP)
                  ].eval_with_dict(params)
    f64 = mem_map[lp.MemAccess('global', np.float64,
                  lid_strides={}, gid_strides={},
                  direction='store', variable='e',
                  count_granularity=CG.SUBGROUP)
                  ].eval_with_dict(params)

    # uniform: (count-per-sub-group)*n_subgroups
    assert f32 == (n*m*ell)*n_subgroups
    assert f64 == (n*m)*n_subgroups

    filtered_map = mem_map.filter_by(direction=['load'], variable=['a', 'g'],
                                     count_granularity=CG.SUBGROUP)
    tot = filtered_map.eval_and_sum(params)

    # uniform: (count-per-sub-group)*n_subgroups
    assert tot == (n*m*ell + n*m)*n_subgroups
def test_summations_and_filters():

    knl = lp.make_kernel(
            "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                """
                c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
                e[i, k+1] = -g[i,k]*h[i,k+1]
                """
            ],
            name="basic", assumptions="n,m,l >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32,
                                            g=np.float64, h=np.float64))

    n = 512
    m = 256
    l = 128
    params = {'n': n, 'm': m, 'l': l}

    mem_map = lp.get_mem_access_map(knl)

    loads_a = mem_map.filter_by(direction=['load'],
                                variable=['a']).eval_and_sum(params)
    assert loads_a == 2 * n * m * l

    global_stores = mem_map.filter_by(mtype=['global'],
                                      direction=['store']).eval_and_sum(params)
    assert global_stores == n * m * l + n * m

    ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load']
                                 ).to_bytes().eval_and_sum(params)
    st_bytes = mem_map.filter_by(mtype=['global'], direction=['store']
                                 ).to_bytes().eval_and_sum(params)

    assert ld_bytes == 4 * n * m * l * 3 + 8 * n * m * 2
    assert st_bytes == 4 * n * m * l + 8 * n * m

    # ignore stride and variable names in this map
    reduced_map = mem_map.group_by('mtype', 'dtype', 'direction')

    f32lall = reduced_map[lp.MemAccess('global', np.float32,
                          direction='load')].eval_with_dict(params)
    f64lall = reduced_map[lp.MemAccess('global', np.float64,
                          direction='load')].eval_with_dict(params)

    assert f32lall == 3 * n * m * l
    assert f64lall == 2 * n * m

    op_map = lp.get_op_map(knl)
    #for k, v in op_map.items():
    #    print(type(k), "\n", k.name, k.dtype, type(k.dtype), " :\n", v)

    op_map_dtype = op_map.group_by('dtype')
    f32 = op_map_dtype[lp.Op(dtype=np.float32)].eval_with_dict(params)
    f64 = op_map_dtype[lp.Op(dtype=np.float64)].eval_with_dict(params)
    i32 = op_map_dtype[lp.Op(dtype=np.int32)].eval_with_dict(params)
    assert f32 == n * m * l * 3
    assert f64 == n * m
    assert i32 == n * m * 2

    addsub_all = op_map.filter_by(name=['add', 'sub']).eval_and_sum(params)
    f32ops_all = op_map.filter_by(dtype=[np.float32]).eval_and_sum(params)
    assert addsub_all == n * m * l + n * m * 2
    assert f32ops_all == n * m * l * 3

    non_field = op_map.filter_by(xxx=[np.float32]).eval_and_sum(params)
    assert non_field == 0

    ops_nodtype = op_map.group_by('name')
    ops_noname = op_map.group_by('dtype')

    mul_all = ops_nodtype[lp.Op(name='mul')].eval_with_dict(params)
    f64ops_all = ops_noname[lp.Op(dtype=np.float64)].eval_with_dict(params)

    assert mul_all == n * m * l + n * m
    assert f64ops_all == n * m

    def func_filter(key):
        return key.stride < 1 and key.dtype == to_loopy_type(np.float64) \
                and key.direction == 'load'

    s1f64l = mem_map.filter_by_func(func_filter).eval_and_sum(params)
    assert s1f64l == 2 * n * m
def test_mem_access_counter_mixed():

    knl = lp.make_kernel(
            "[n,m,ell] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                """
                c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]+x[i,k]
                e[i, k] = g[i,k]*(2+h[i,k])
                """
            ],
            name="mixed", assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32,
                                            g=np.float64, h=np.float64,
                                            x=np.float32))

    group_size_0 = 65
    knl = lp.split_iname(knl, "j", group_size_0)
    knl = lp.tag_inames(knl, {"j_inner": "l.0", "j_outer": "g.0"})

    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}

    n_workgroups = div_ceil(ell, group_size_0)
    group_size = group_size_0
    subgroups_per_group = div_ceil(group_size, SGS)
    n_subgroups = n_workgroups*subgroups_per_group

    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
                                    subgroup_size=SGS)

    f64uniform = mem_map[lp.MemAccess('global', np.float64,
                         lid_strides={}, gid_strides={},
                         direction='load', variable='g',
                         count_granularity=CG.SUBGROUP)
                         ].eval_with_dict(params)
    f64uniform += mem_map[lp.MemAccess('global', np.float64,
                          lid_strides={}, gid_strides={},
                          direction='load', variable='h',
                          count_granularity=CG.SUBGROUP)
                          ].eval_with_dict(params)
    f32uniform = mem_map[lp.MemAccess('global', np.float32,
                         lid_strides={}, gid_strides={},
                         direction='load', variable='x',
                         count_granularity=CG.SUBGROUP)
                         ].eval_with_dict(params)
    f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32),
                           lid_strides={0: Variable('m')},
                           gid_strides={0: Variable('m')*group_size_0},
                           direction='load', variable='a',
                           count_granularity=CG.WORKITEM)
                           ].eval_with_dict(params)
    f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32),
                            lid_strides={0: Variable('m')},
                            gid_strides={0: Variable('m')*group_size_0},
                            direction='load', variable='b',
                            count_granularity=CG.WORKITEM)
                            ].eval_with_dict(params)

    # uniform: (count-per-sub-group)*n_subgroups
    assert f64uniform == (2*n*m)*n_subgroups
    assert f32uniform == (m*n)*n_subgroups

    expect_fallback = False
    import islpy as isl
    try:
        isl.BasicSet.card
    except AttributeError:
        expect_fallback = True
    else:
        expect_fallback = False

    if expect_fallback:
        if ell < group_size_0:
            assert f32nonconsec == 3*n*m*ell*n_workgroups
        else:
            assert f32nonconsec == 3*n*m*n_workgroups*group_size_0
    else:
        assert f32nonconsec == 3*n*m*ell

    f64uniform = mem_map[lp.MemAccess('global', np.float64,
                         lid_strides={}, gid_strides={},
                         direction='store', variable='e',
                         count_granularity=CG.SUBGROUP)
                         ].eval_with_dict(params)
    f32nonconsec = mem_map[lp.MemAccess('global', np.float32,
                           lid_strides={0: Variable('m')},
                           gid_strides={0: Variable('m')*group_size_0},
                           direction='store', variable='c',
                           count_granularity=CG.WORKITEM)
                           ].eval_with_dict(params)

    # uniform: (count-per-sub-group)*n_subgroups
    assert f64uniform == m*n*n_subgroups

    if expect_fallback:
        if ell < group_size_0:
            assert f32nonconsec == n*m*ell*n_workgroups
        else:
            assert f32nonconsec == n*m*n_workgroups*group_size_0
    else:
        assert f32nonconsec == n*m*ell
if mesh.layers:
    cells = cells * (mesh.layers - 1)

print("CELLS= {0}".format(cells))
print("DOFS= {0}".format(dofs))

from loopy.program import make_program
knl = compile_form(y_form, coffee=False)[0].ast
warnings = list(knl.silenced_warnings)
warnings.extend(["insn_count_subgroups_upper_bound", "no_lid_found"])
knl = knl.copy(silenced_warnings=warnings)
knl.options.ignore_boostable_into = True
program = make_program(knl)
op_map = lp.get_op_map(program, subgroup_size=1)
mem_map = lp.get_mem_access_map(program, subgroup_size=1)

for op in ['add', 'sub', 'mul', 'div']:
    print("{0}S= {1}".format(
        op.upper(),
        op_map.filter_by(name=[op], dtype=[np.float64]).eval_and_sum({})))

print("MEMS= {0}".format(
    mem_map.filter_by(mtype=['global'], dtype=[np.float64]).eval_and_sum({})))

print("INSTRUCTIONS= {0:d}".format(len(knl.instructions)))
print("LOOPS= {0:d}".format(len(knl.all_inames())))

for domain in knl.domains:
    if domain.get_dim_name(3, 0)[0] == "j":
        print("DOF_LOOP_EXTENT= {0:d}".format(
            int(domain.dim_max_val(0).to_str()) + 1))
        break
def test_all_counters_parallel_matmul():
    bsize = 16

    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                "c[i, j] = sum(k, a[i, k]*b[k, j])"
            ],
            name="matmul", assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))

    knl = lp.split_iname(knl, "i", bsize, outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", bsize, outer_tag="g.1", inner_tag="l.0")
    knl = lp.split_iname(knl, "k", bsize)
    knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"],
                          default_tag="l.auto")
    knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"],
                          default_tag="l.auto")

    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}

    group_size = bsize*bsize
    n_workgroups = div_ceil(n, bsize)*div_ceil(ell, bsize)
    subgroups_per_group = div_ceil(group_size, SGS)
    n_subgroups = n_workgroups*subgroups_per_group

    sync_map = lp.get_synchronization_map(knl)
    assert len(sync_map) == 2
    assert sync_map["kernel_launch"].eval_with_dict(params) == 1
    assert sync_map["barrier_local"].eval_with_dict(params) == 2*m/bsize

    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)
    f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP)
                    ].eval_with_dict(params)
    f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)
                    ].eval_with_dict(params)
    i32ops = op_map[lp.Op(np.int32, 'add', CG.SUBGROUP)
                    ].eval_with_dict(params)
    i32ops += op_map[lp.Op(np.dtype(np.int32), 'mul', CG.SUBGROUP)
                     ].eval_with_dict(params)

    # (count-per-sub-group)*n_subgroups
    assert f32mul+f32add == m*2*n_subgroups

    mem_access_map = lp.get_mem_access_map(knl, count_redundant_work=True,
                                           subgroup_size=SGS)

    f32s1lb = mem_access_map[lp.MemAccess('global', np.float32,
                             lid_strides={0: 1, 1: Variable('ell')},
                             gid_strides={1: bsize},
                             direction='load', variable='b',
                             count_granularity=CG.WORKITEM)
                             ].eval_with_dict(params)
    f32s1la = mem_access_map[lp.MemAccess('global', np.float32,
                             lid_strides={0: 1, 1: Variable('m')},
                             gid_strides={0: Variable('m')*bsize},
                             direction='load', variable='a',
                             count_granularity=CG.WORKITEM)
                             ].eval_with_dict(params)

    assert f32s1lb == n*m*ell/bsize
    assert f32s1la == n*m*ell/bsize

    f32coal = mem_access_map[lp.MemAccess('global', np.float32,
                             lid_strides={0: 1, 1: Variable('ell')},
                             gid_strides={0: Variable('ell')*bsize, 1: bsize},
                             direction='store', variable='c',
                             count_granularity=CG.WORKITEM)
                             ].eval_with_dict(params)

    assert f32coal == n*ell

    local_mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
                                          subgroup_size=SGS
                                          ).filter_by(mtype=['local'])

    local_mem_l = local_mem_map.filter_by(direction=['load']
                                          ).eval_and_sum(params)
    # (count-per-sub-group)*n_subgroups
    assert local_mem_l == m*2*n_subgroups

    local_mem_l_a = local_mem_map[lp.MemAccess('local', np.dtype(np.float32),
                                  direction='load',
                                  lid_strides={1: 16},
                                  gid_strides={},
                                  variable='a_fetch',
                                  count_granularity=CG.SUBGROUP)
                                  ].eval_with_dict(params)
    local_mem_l_b = local_mem_map[lp.MemAccess('local', np.dtype(np.float32),
                                  direction='load',
                                  lid_strides={0: 1},
                                  gid_strides={},
                                  variable='b_fetch',
                                  count_granularity=CG.SUBGROUP)
                                  ].eval_with_dict(params)

    # (count-per-sub-group)*n_subgroups
    assert local_mem_l_a == local_mem_l_b == m*n_subgroups

    local_mem_s = local_mem_map.filter_by(direction=['store']
                                          ).eval_and_sum(params)

    # (count-per-sub-group)*n_subgroups
    assert local_mem_s == m*2/bsize*n_subgroups
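def test_parallel_matmul_count_arithmetic_sketch():
    # Worked numbers behind "f32mul+f32add == m*2*n_subgroups" in the test
    # above (an added sketch using that test's values, not part of the
    # original suite): every work-item does m multiplies and m adds, and
    # SUBGROUP granularity charges each 32-wide subgroup once instead of
    # each work-item.
    from pytools import div_ceil
    n, m, ell, bsize = 512, 256, 128, 16
    n_workgroups = div_ceil(n, bsize)*div_ceil(ell, bsize)  # 32*8 = 256
    subgroups_per_group = div_ceil(bsize*bsize, 32)         # 256/32 = 8
    n_subgroups = n_workgroups*subgroups_per_group          # 2048
    # subgroup-granularity count == work-item-granularity count / 32
    assert m*2*n_subgroups == (2*n*m*ell)//32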
def test_mem_access_tagged_variables():
    bsize = 16

    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                "c$mmresult[i, j] = sum(k, a$mmaload[i, k]*b$mmbload[k, j])"
            ],
            name="matmul", assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))

    knl = lp.split_iname(knl, "i", bsize, outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", bsize, outer_tag="g.1", inner_tag="l.0")
    knl = lp.split_iname(knl, "k", bsize)
    # knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"], default_tag="l.auto")
    # knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"], default_tag="l.auto")

    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}

    group_size = bsize*bsize
    n_workgroups = div_ceil(n, bsize)*div_ceil(ell, bsize)
    subgroups_per_group = div_ceil(group_size, SGS)
    n_subgroups = n_workgroups*subgroups_per_group

    mem_access_map = lp.get_mem_access_map(knl, count_redundant_work=True,
                                           subgroup_size=SGS)

    f32s1lb = mem_access_map[lp.MemAccess(
        'global', np.float32,
        lid_strides={0: 1}, gid_strides={1: bsize},
        direction='load', variable='b', variable_tag='mmbload',
        count_granularity=CG.WORKITEM)].eval_with_dict(params)
    f32s1la = mem_access_map[lp.MemAccess(
        'global', np.float32,
        lid_strides={1: Variable('m')}, gid_strides={0: Variable('m')*bsize},
        direction='load', variable='a', variable_tag='mmaload',
        count_granularity=CG.SUBGROUP)].eval_with_dict(params)

    assert f32s1lb == n*m*ell
    # uniform: (count-per-sub-group)*n_subgroups
    assert f32s1la == m*n_subgroups

    f32coal = mem_access_map[lp.MemAccess(
        'global', np.float32,
        lid_strides={0: 1, 1: Variable('ell')},
        gid_strides={0: Variable('ell')*bsize, 1: bsize},
        direction='store', variable='c', variable_tag='mmresult',
        count_granularity=CG.WORKITEM)].eval_with_dict(params)

    assert f32coal == n*ell
def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level):  # noqa
    pytest.importorskip("fparser")
    ctx = ctx_factory()

    filename = os.path.join(os.path.dirname(__file__),
                            "strongVolumeKernels.f90")
    with open(filename) as sourcef:
        source = sourcef.read()

    source = source.replace("datafloat", "real*4")

    program = lp.parse_fortran(source, filename, seq_dependencies=False)

    hsv_r, hsv_s = program["strongVolumeKernelR"], program["strongVolumeKernelS"]

    hsv_r = lp.tag_instructions(hsv_r, "rknl")
    hsv_s = lp.tag_instructions(hsv_s, "sknl")
    hsv = lp.fuse_kernels([hsv_r, hsv_s], ["_r", "_s"])
    #hsv = hsv_s

    hsv = lp.add_nosync(hsv, "any", "writes:rhsQ", "writes:rhsQ", force=True)

    from gnuma_loopy_transforms import (
            fix_euler_parameters, set_q_storage_format, set_D_storage_format)

    hsv = lp.fix_parameters(hsv, Nq=Nq)
    hsv = lp.prioritize_loops(hsv, "e,k,j,i")
    hsv = lp.tag_inames(hsv, dict(e="g.0", j="l.1", i="l.0"))
    hsv = lp.assume(hsv, "elements >= 1")

    hsv = fix_euler_parameters(hsv, p_p0=1, p_Gamma=1.4, p_R=1)

    from loopy.frontend.fortran.translator import specialize_fortran_division
    hsv = specialize_fortran_division(hsv)

    for name in ["Q", "rhsQ"]:
        hsv = set_q_storage_format(hsv, name)

    hsv = set_D_storage_format(hsv)
    #hsv = lp.add_prefetch(hsv, "volumeGeometricFactors")

    ref_hsv = hsv

    if opt_level == 0:
        tap_hsv = hsv

    hsv = lp.add_prefetch(hsv, "D[:,:]", fetch_outer_inames="e",
                          default_tag="l.auto")

    if opt_level == 1:
        tap_hsv = hsv

    # turn the first reads into subst rules
    local_prep_var_names = set()
    for insn in lp.find_instructions(hsv, "tag:local_prep"):
        assignee, = insn.assignee_var_names()
        local_prep_var_names.add(assignee)
        hsv = lp.assignment_to_subst(hsv, assignee)

    # precompute fluxes
    hsv = lp.assignment_to_subst(hsv, "JinvD_r")
    hsv = lp.assignment_to_subst(hsv, "JinvD_s")

    r_fluxes = lp.find_instructions(hsv, "tag:compute_fluxes and tag:rknl")
    s_fluxes = lp.find_instructions(hsv, "tag:compute_fluxes and tag:sknl")

    if ilp_multiple > 1:
        hsv = lp.split_iname(hsv, "k", 2, inner_tag="ilp")
        ilp_inames = ("k_inner",)
        flux_ilp_inames = ("kk",)
    else:
        ilp_inames = ()
        flux_ilp_inames = ()

    rtmps = []
    stmps = []

    flux_store_idx = 0

    for rflux_insn, sflux_insn in zip(r_fluxes, s_fluxes):
        for knl_tag, insn, flux_inames, tmps, flux_precomp_inames in [
                ("rknl", rflux_insn, ("j", "n",), rtmps, ("jj", "ii",)),
                ("sknl", sflux_insn, ("i", "n",), stmps, ("ii", "jj",)),
                ]:
            flux_var, = insn.assignee_var_names()
            print(insn)

            reader, = lp.find_instructions(
                    hsv,
                    "tag:{knl_tag} and reads:{flux_var}".format(
                        knl_tag=knl_tag, flux_var=flux_var))

            hsv = lp.assignment_to_subst(hsv, flux_var)

            flux_store_name = "flux_store_%d" % flux_store_idx
            flux_store_idx += 1
            tmps.append(flux_store_name)

            hsv = lp.precompute(hsv, flux_var + "_subst",
                                flux_inames + ilp_inames,
                                temporary_name=flux_store_name,
                                precompute_inames=flux_precomp_inames
                                + flux_ilp_inames,
                                default_tag=None)
            if flux_var.endswith("_s"):
                hsv = lp.tag_array_axes(hsv, flux_store_name, "N0,N1,N2?")
            else:
                hsv = lp.tag_array_axes(hsv, flux_store_name, "N1,N0,N2?")

            n_iname = "n_" + flux_var.replace("_r", "").replace("_s", "")
            if n_iname.endswith("_0"):
                n_iname = n_iname[:-2]
            hsv = lp.rename_iname(hsv, "n", n_iname,
                                  within="id:" + reader.id, existing_ok=True)

    hsv = lp.tag_inames(hsv, dict(ii="l.0", jj="l.1"))
    for iname in flux_ilp_inames:
        hsv = lp.tag_inames(hsv, {iname: "ilp"})

    hsv = lp.alias_temporaries(hsv, rtmps)
    hsv = lp.alias_temporaries(hsv, stmps)

    if opt_level == 2:
        tap_hsv = hsv

    for prep_var_name in local_prep_var_names:
        if prep_var_name.startswith("Jinv") or "_s" in prep_var_name:
            continue
        hsv = lp.precompute(hsv,
                            lp.find_one_rule_matching(
                                hsv, prep_var_name + "_*subst*"),
                            default_tag="l.auto")

    if opt_level == 3:
        tap_hsv = hsv

    hsv = lp.add_prefetch(hsv, "Q[ii,jj,k,:,:,e]", sweep_inames=ilp_inames,
                          default_tag="l.auto")

    if opt_level == 4:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(tap_hsv, dict(
            Q_dim_field_inner="unr", Q_dim_field_outer="unr"))

    hsv = lp.buffer_array(hsv, "rhsQ", ilp_inames,
                          fetch_bounding_box=True, default_tag="for",
                          init_expression="0",
                          store_expression="base + buffer")

    if opt_level == 5:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(tap_hsv, dict(
            rhsQ_init_field_inner="unr", rhsQ_store_field_inner="unr",
            rhsQ_init_field_outer="unr", rhsQ_store_field_outer="unr",
            Q_dim_field_inner="unr", Q_dim_field_outer="unr"))

    # buffer axes need to be vectorized in order for this to work
    hsv = lp.tag_array_axes(hsv, "rhsQ_buf", "c?,vec,c")
    hsv = lp.tag_array_axes(hsv, "Q_fetch", "c?,vec,c")
    hsv = lp.tag_array_axes(hsv, "D_fetch", "f,f")
    hsv = lp.tag_inames(hsv,
                        {"Q_dim_k": "unr", "rhsQ_init_k": "unr",
                         "rhsQ_store_k": "unr"},
                        ignore_nonexistent=True)

    if opt_level == 6:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(tap_hsv, dict(
            rhsQ_init_field_inner="unr", rhsQ_store_field_inner="unr",
            rhsQ_init_field_outer="unr", rhsQ_store_field_outer="unr",
            Q_dim_field_inner="unr", Q_dim_field_outer="unr"))

    hsv = lp.tag_inames(hsv, dict(
        rhsQ_init_field_inner="vec", rhsQ_store_field_inner="vec",
        rhsQ_init_field_outer="unr", rhsQ_store_field_outer="unr",
        Q_dim_field_inner="vec", Q_dim_field_outer="unr"))

    if opt_level == 7:
        tap_hsv = hsv

    hsv = lp.collect_common_factors_on_increment(
            hsv, "rhsQ_buf", vary_by_axes=(0,) if ilp_multiple > 1 else ())

    if opt_level >= 8:
        tap_hsv = hsv

    hsv = tap_hsv

    hsv = lp.set_options(hsv, cl_build_options=[
        "-cl-denorms-are-zero", "-cl-fast-relaxed-math",
        "-cl-finite-math-only", "-cl-mad-enable", "-cl-no-signed-zeros"])

    if 1:
        print("OPS")
        op_map = lp.get_op_map(hsv, subgroup_size=32)
        print(lp.stringify_stats_mapping(op_map))

        print("MEM")
        gmem_map = lp.get_mem_access_map(hsv, subgroup_size=32).to_bytes()
        print(lp.stringify_stats_mapping(gmem_map))

    # FIXME: renaming's a bit tricky in this program model.
    # add a simple transformation for it
    # hsv = hsv.copy(name="horizontalStrongVolumeKernel")

    results = lp.auto_test_vs_ref(ref_hsv, ctx, hsv,
                                  parameters=dict(elements=300), quiet=True)

    elapsed = results["elapsed_wall"]
    print("elapsed", elapsed)
def _cache_kernel_stats(self, program: lp.kernel.LoopKernel, kwargs: dict) \
        -> tuple:
    """Generate the kernel stats for a program with its args."""
    args_tuple = tuple(
        (key, value.shape) if hasattr(value, "shape") else (key, value)
        for key, value in kwargs.items())

    # Are kernel stats already in the cache?
    try:
        x = self.kernel_stats[program][args_tuple]  # noqa
        return args_tuple
    except KeyError:
        # If not, calculate and cache the stats
        executor = program.target.get_kernel_executor(program, self.queue)
        info = executor.kernel_info(executor.arg_to_dtype_set(kwargs))

        kernel = executor.get_typed_and_scheduled_kernel(
            executor.arg_to_dtype_set(kwargs))

        idi = info.implemented_data_info

        types = {k: v for k, v in kwargs.items()
                 if hasattr(v, "dtype") and not v.dtype == object}

        param_dict = kwargs.copy()
        param_dict.update({k: None for k in kernel.arg_dict.keys()
                           if k not in param_dict})

        param_dict.update(
            {d.name: None for d in idi if d.name not in param_dict})

        # Generate the wrapper code
        wrapper = executor.get_wrapper_generator()
        gen = PythonFunctionGenerator("_mcom_gen_args_profile",
                                      list(param_dict))

        wrapper.generate_integer_arg_finding_from_shapes(gen, kernel, idi)
        wrapper.generate_integer_arg_finding_from_offsets(gen, kernel, idi)
        wrapper.generate_integer_arg_finding_from_strides(gen, kernel, idi)

        param_names = program.all_params()
        gen("return {%s}" % ", ".join(
            f"{repr(name)}: {name}" for name in param_names))

        # Run the wrapper code, save argument values in domain_params
        domain_params = gen.get_picklable_function()(**param_dict)

        # Get flops/memory statistics
        kernel = lp.add_and_infer_dtypes(kernel, types)
        op_map = lp.get_op_map(kernel, subgroup_size="guess")
        bytes_accessed = lp.get_mem_access_map(kernel, subgroup_size="guess") \
            .to_bytes().eval_and_sum(domain_params)

        flops = op_map.filter_by(
            dtype=[np.float32, np.float64]).eval_and_sum(domain_params)

        try:
            footprint = lp.gather_access_footprint_bytes(kernel)
            footprint_bytes = sum(footprint[k].eval_with_dict(domain_params)
                                  for k in footprint)
        except lp.symbolic.UnableToDetermineAccessRange:
            footprint_bytes = None

        res = ProfileResult(time=0, flops=flops,
                            bytes_accessed=bytes_accessed,
                            footprint_bytes=footprint_bytes)

        self.kernel_stats.setdefault(program, {})[args_tuple] = res

    return args_tuple