def get_target(lang, device=None, compiler=None):
    """
    Return the loopy target object matching the requested language.

    Parameters
    ----------
    lang : str
        One of the languages handled here, {'c', 'cuda', 'opencl', 'ispc'}.
        The value is first validated with `utils.check_lang`.
    device : :class:`pyopencl.Device`
        If supplied, and lang is 'opencl', passed to the
        :class:`loopy.PyOpenCLTarget`
    compiler: str
        If supplied, and lang is 'c', the C-compiler handed to the
        :class:`loopy.ExecutableCTarget`

    Returns
    -------
    The correct loopy target type, or None if `lang` is accepted by
    `utils.check_lang` but is not one of the languages handled here
    """

    utils.check_lang(lang)

    # set target
    if lang == 'opencl':
        return lp.PyOpenCLTarget(device=device)
    if lang == 'c':
        return lp.ExecutableCTarget(compiler=compiler)
    if lang == 'cuda':
        return lp.CudaTarget()
    if lang == 'ispc':
        return lp.ISPCTarget()

    # No branch matched: keep the historical behaviour of returning None,
    # but make it explicit instead of falling off the end of the function.
    return None
def test_recursive_nested_dependent_reduction(ctx_factory):
    """Generate code for nested reductions with a data-dependent inner bound.

    The ``isrc`` domain is bounded by ``npart``, a temporary assigned inside
    the ``isrc_box`` loop, and the resulting ``boxsum`` is itself reduced
    over ``isrc_box``. The test only prints the generated device code.
    """
    cl_ctx = ctx_factory()
    int_dtype = np.dtype(np.int32)

    domains = [
        "{[itgt]: 0 <= itgt < ntgts}",
        "{[isrc_box]: 0 <= isrc_box < nboxes}",
        "{[isrc]: 0 <= isrc < npart}",
    ]
    instructions = """
        for itgt
            for isrc_box
                <> npart = nparticles_per_box[isrc_box]
                <> boxsum = sum(isrc, isrc+isrc_box+itgt)
            end
            a[itgt] = sum(isrc_box, boxsum)
        end
        """
    kernel_data = [
        lp.ValueArg("n", np.int32),
        lp.GlobalArg("a", int_dtype, ("n", )),
        lp.GlobalArg("nparticles_per_box", np.int32, ("nboxes", )),
        lp.ValueArg("ntgts", np.int32),
        lp.ValueArg("nboxes", np.int32),
    ]

    kernel = lp.make_kernel(
        domains,
        instructions,
        kernel_data,
        assumptions="ntgts>=1",
        target=lp.PyOpenCLTarget(cl_ctx.devices[0]),
    )

    device_code = lp.generate_code_v2(kernel).device_code()
    print(device_code)
def test_dependent_loop_bounds_3(ctx_factory):
    """Exercise a domain dependency carried only by the ``row_len`` temporary.

    The ``jj`` domain is bounded by ``row_len``, which is read from
    ``a_row_lengths[i]``; this also makes sure that ``row_len`` gets read
    before any conditionals use it. After checking the domain nesting and
    printing the code for the ``i``-split kernel, the test expects code
    generation to raise ``RuntimeError`` once ``jj`` is split and tagged too.
    """
    cl_ctx = ctx_factory()
    float_dtype = np.dtype(np.float32)

    kernel_data = [
        lp.GlobalArg("a_row_lengths", np.int32, shape=lp.auto),
        lp.GlobalArg("a", float_dtype, shape="n,n", order="C"),
        lp.ValueArg("n", np.int32),
    ]
    kernel = lp.make_kernel(
        ["{[i]: 0<=i<n}", "{[jj]: 0<=jj<row_len}"],
        ["<> row_len = a_row_lengths[i]", "a[i,jj] = 1"],
        kernel_data,
        target=lp.PyOpenCLTarget(cl_ctx.devices[0]),
        name="loopy_kernel",
    )

    # The jj domain (index 1) must be nested inside the i domain (index 0).
    domain_parents = kernel["loopy_kernel"].parents_per_domain()
    assert domain_parents[1] == 0

    split_i = lp.split_iname(
        kernel, "i", 128, outer_tag="g.0", inner_tag="l.0")
    print(lp.generate_code_v2(split_i).device_code())

    split_jj = lp.split_iname(
        split_i, "jj", 128, outer_tag="g.1", inner_tag="l.1")
    with pytest.raises(RuntimeError):
        list(lp.generate_code_v2(split_jj))
def __init__(self, map_instructions, **kwargs):
    """Build, parallelize and store the loopy kernel for this object.

    :arg map_instructions: the main instructions; a :class:`dict` is
        converted to a list of its ``(key, value)`` items.

    The following keyword arguments are consumed here:

    * ``map_dict``: deprecated alias which overrides *map_instructions*.
    * ``tmp_instructions``: temporary instructions (a :class:`dict` is
      converted to a list of items). Defaults to ``[]``, or to the value of
      the deprecated ``tmp_dict`` if that was passed.
    * ``args``: stored as ``self.args``; defaults to ``[...]``.
    * ``dtype``: stored as ``self.dtype``; defaults to *None*.
    * ``lsize``: stored as ``self.lsize``; defaults to ``(16, 4, 1)``.
    * ``rank_shape``: if not *None*, fixes ``Nx``, ``Ny``, ``Nz`` to its
      first three entries.
    * ``halo_shape``: an :class:`int` fixes ``h``; a :class:`tuple` or
      :class:`list` fixes ``hx``, ``hy``, ``hz`` to its first three entries.
    * ``domains``: the kernel domain; defaults to a 3-D box over
      ``i, j, k``.

    Any remaining keyword arguments are forwarded to ``self.make_kernel``,
    taking precedence over the defaults assembled below.
    """
    from warnings import warn

    # -- main instructions (with deprecated spelling) --
    if "map_dict" in kwargs:
        warn("Passing map_dict is deprecated. Pass map_instructions instead.",
             DeprecationWarning, stacklevel=2)
        map_instructions = kwargs.pop("map_dict")
    if isinstance(map_instructions, dict):
        map_instructions = list(map_instructions.items())
    self.map_instructions = map_instructions

    # -- temporary instructions (with deprecated spelling) --
    if "tmp_dict" in kwargs:
        warn("Passing tmp_dict is deprecated. Pass tmp_instructions instead.",
             DeprecationWarning, stacklevel=2)
        tmp_fallback = kwargs.pop("tmp_dict")
    else:
        tmp_fallback = []
    tmp_instructions = kwargs.pop("tmp_instructions", tmp_fallback)
    if isinstance(tmp_instructions, dict):
        tmp_instructions = list(tmp_instructions.items())
    self.tmp_instructions = tmp_instructions

    self.args = kwargs.pop("args", [...])
    self.dtype = kwargs.pop("dtype", None)

    # default local size which saturates memory bandwidth
    self.lsize = kwargs.pop("lsize", (16, 4, 1))
    rank_shape = kwargs.pop("rank_shape", None)
    halo_shape = kwargs.pop("halo_shape", None)

    default_domain = \
        "[Nx, Ny, Nz] -> {[i,j,k]: 0<=i<Nx and 0<=j<Ny and 0<=k<Nz}"
    domains = kwargs.pop("domains", default_domain)

    # Whatever is still left in kwargs overrides these defaults.
    kernel_kwargs = {
        "seq_dependencies": True,
        "default_offset": lp.auto,
        "target": lp.PyOpenCLTarget(),
        "lang_version": (2018, 2),
        **kwargs,
    }

    kernel = self.make_kernel(
        self.map_instructions,
        self.tmp_instructions,
        self.args,
        domains,
        **kernel_kwargs
    )

    if isinstance(halo_shape, int):
        kernel = lp.fix_parameters(kernel, h=halo_shape)
    elif isinstance(halo_shape, (tuple, list)):
        hx, hy, hz = halo_shape[0], halo_shape[1], halo_shape[2]
        kernel = lp.fix_parameters(kernel, hx=hx, hy=hy, hz=hz)

    kernel = self.parallelize(kernel, self.lsize)

    if rank_shape is not None:
        nx, ny, nz = rank_shape[0], rank_shape[1], rank_shape[2]
        kernel = lp.fix_parameters(kernel, Nx=nx, Ny=ny, Nz=nz)

    self.knl = lp.remove_unused_inames(kernel)
def __test(loop_size, vec_width):
    """
    Feed a small kernel through the deep specializer and check the result.

    Parameters
    ----------
    loop_size : int
        Upper bound of the `i` loop, the shape of `a1`, and the `split_size`
        given to the specializer
    vec_width : int
        Split size for `i` and the `depth` of the options object

    Raises
    ------
    AssertionError
        If the generated code does not contain `x / <val>f`, where `<val>`
        is the smaller of `loop_size` and `vec_width` printed with one
        decimal place
    """
    domain = '{{[i]: 0 <= i < {}}}'.format(loop_size)
    instructions = """
        <> x = 1.0
        a1[0] = a1[0] + x {id=set}
        ... lbarrier {id=wait, dep=set}
        for i
            a1[0] = a1[0] + 1 {id=a1, dep=set:wait, nosync=set}
        end
        """
    kernel_data = [
        lp.GlobalArg('a1', shape=(loop_size, ), order='C', dtype=np.float32)
    ]
    kernel = lp.make_kernel(
        domain, instructions, kernel_data, target=lp.PyOpenCLTarget())

    # minimal stand-in for the options object read by the specializer
    opts_attrs = {'depth': vec_width, 'order': 'C', 'use_atomics': True}
    loopy_opts = type('', (object, ), opts_attrs)

    kernel = lp.split_iname(kernel, 'i', vec_width, inner_tag='l.0')

    # feed through deep specializer
    specializer = get_deep_specializer(
        loopy_opts, atomic_ids=['a1'], split_ids=['set'], use_atomics=True,
        is_write_race=True, split_size=loop_size)[1]
    kernel = specializer(kernel)

    divisor = np.minimum(loop_size, vec_width)
    expected = 'x / {:.1f}f'.format(divisor)
    generated = lp.generate_code(kernel)[0]
    assert expected in generated
def test_triangle_domain(ctx_factory):
    """Print a kernel over the domain ``0<=i,j<n and i <= j`` and its code."""
    cl_ctx = ctx_factory()
    target = lp.PyOpenCLTarget(cl_ctx.devices[0])

    kernel = lp.make_kernel(
        "{[i,j]: 0<=i,j<n and i <= j}",
        "a[i,j] = 17",
        assumptions="n>=1",
        target=target,
    )

    print(kernel)
    device_code = lp.generate_code_v2(kernel).device_code()
    print(device_code)
def test_rob_stroud_bernstein(ctx_factory):
    """Build, transform and print code for a kernel accumulating into ``tmp``.

    The kernel adds ``w * coeffs[aind]`` into ``tmp[el,alpha1,i2]`` while
    updating ``w`` and ``aind`` inside the ``alpha2`` loop. It is then
    specialized to ``nqp1d=7, deg=4``, split and tagged, given float32
    dtypes, and the generated code object is printed.

    NOTE: tmp would have to be zero-filled beforehand
    """
    cl_ctx = ctx_factory()

    domain = "{[el, i2, alpha1,alpha2]: \
            0 <= el < nels and \
            0 <= i2 < nqp1d and \
            0 <= alpha1 <= deg and 0 <= alpha2 <= deg-alpha1 }"

    instructions = """
        for el,i2
            <> xi = qpts[1, i2]
            <> s = 1-xi
            <> r = xi/s
            <> aind = 0 {id=aind_init}
            for alpha1
                <> w = s**(deg-alpha1) {id=init_w}
                for alpha2
                    tmp[el,alpha1,i2] = tmp[el,alpha1,i2] + w * coeffs[aind] \
                            {id=write_tmp,dep=init_w:aind_init}
                    w = w * r * ( deg - alpha1 - alpha2 ) / (1 + alpha2) \
                            {id=update_w,dep=init_w:write_tmp}
                    aind = aind + 1 \
                            {id=aind_incr,dep=aind_init:write_tmp:update_w}
                end
            end
        end
        """

    kernel_data = [
        # Must declare coeffs to have "no" shape, to keep loopy
        # from trying to figure it out the shape automatically.
        lp.GlobalArg("coeffs", None, shape=None),
        "..."
    ]

    kernel = lp.make_kernel(
        domain,
        instructions,
        kernel_data,
        assumptions="deg>=0 and nels>=1",
        target=lp.PyOpenCLTarget(cl_ctx.devices[0]),
    )

    kernel = lp.fix_parameters(kernel, nqp1d=7, deg=4)
    kernel = lp.split_iname(kernel, "el", 16, inner_tag="l.0")
    kernel = lp.split_iname(
        kernel, "el_outer", 2,
        outer_tag="g.0", inner_tag="ilp", slabs=(0, 1))

    iname_tags = {"i2": "l.1", "alpha1": "unr", "alpha2": "unr"}
    kernel = lp.tag_inames(kernel, iname_tags)

    # every array involved is single precision
    dtypes = dict.fromkeys(("qpts", "coeffs", "tmp"), np.float32)
    kernel = lp.add_dtypes(kernel, dtypes)

    print(lp.generate_code_v2(kernel))
def test_eq_constraint(ctx_factory):
    """Print device code for a 32-element copy kernel after two splits.

    ``i`` is split by 16 with its outer part tagged ``g.0``, then
    ``i_inner`` is split by 16 again with its inner part tagged ``l.0``.
    """
    logging.basicConfig(level=logging.INFO)

    cl_ctx = ctx_factory()

    # a and b are declared identically
    kernel_data = [
        lp.GlobalArg(array_name, np.float32, shape=(1000, ))
        for array_name in ("a", "b")
    ]
    kernel = lp.make_kernel(
        "{[i]: 0<= i < 32}",
        ["a[i] = b[i]"],
        kernel_data,
        target=lp.PyOpenCLTarget(cl_ctx.devices[0]),
    )

    kernel = lp.split_iname(kernel, "i", 16, outer_tag="g.0")
    kernel = lp.split_iname(
        kernel, "i_inner", 16, outer_tag=None, inner_tag="l.0")

    device_code = lp.generate_code_v2(kernel).device_code()
    print(device_code)
def test_assume(ctx_factory):
    """Check that the added assumptions leave no ``if`` in the device code.

    ``i`` is split by 16, and the kernel is told that ``n mod 16 = 0`` and
    ``n > 10`` before code generation.
    """
    cl_ctx = ctx_factory()

    kernel_data = [lp.GlobalArg("a", np.float32, shape="n"), "..."]
    kernel = lp.make_kernel(
        "{[i]: 0<=i<n}",
        "a[i] = a[i] + 1",
        kernel_data,
        target=lp.PyOpenCLTarget(cl_ctx.devices[0]),
    )

    kernel = lp.split_iname(kernel, "i", 16)
    kernel = lp.prioritize_loops(kernel, "i_outer,i_inner")

    # the assumptions are applied one after the other, in this order
    for assumption in ("n mod 16 = 0", "n > 10"):
        kernel = lp.assume(kernel, assumption)

    device_code = lp.generate_code_v2(kernel).device_code()
    assert "if" not in device_code
def test_divisibility_assumption(ctx_factory):
    """Check that assuming ``n`` is a multiple of 16 removes every ``if``.

    With ``n = 16*zz`` assumed, ``i`` is split by 16, the device code must
    not contain ``if``, and the split kernel is compared against the
    unsplit reference for ``n = 16**3``.
    """
    cl_ctx = ctx_factory()

    kernel_data = [
        lp.GlobalArg("a", np.float32, shape=("n", )),
        lp.GlobalArg("b", np.float32, shape=("n", )),
        lp.ValueArg("n", np.int32),
    ]
    reference = lp.make_kernel(
        "[n] -> {[i]: 0<=i<n}",
        ["b[i] = 2*a[i]"],
        kernel_data,
        assumptions="n>=1 and (exists zz: n = 16*zz)",
        target=lp.PyOpenCLTarget(cl_ctx.devices[0]),
    )

    split_kernel = lp.split_iname(reference, "i", 16)

    device_code = lp.generate_code_v2(split_kernel).device_code()
    assert "if" not in device_code

    lp.auto_test_vs_ref(
        reference, cl_ctx, split_kernel, parameters={"n": 16**3})
def test_dependent_loop_bounds(ctx_factory):
    """Print device code for a row sum whose length comes from ``a_rowstarts``.

    The ``jj`` domain is bounded by the temporary ``row_len``, computed as
    the difference of two consecutive ``a_rowstarts`` entries.
    """
    cl_ctx = ctx_factory()
    float_dtype = np.dtype(np.float32)

    domains = [
        "{[i]: 0<=i<n}",
        "{[jj]: 0<=jj<row_len}",
    ]
    instructions = [
        "<> row_len = a_rowstarts[i+1] - a_rowstarts[i]",
        "a_sum[i] = sum(jj, a_values[[a_rowstarts[i]+jj]])",
    ]
    kernel_data = [
        lp.GlobalArg("a_rowstarts", np.int32, shape=lp.auto),
        lp.GlobalArg("a_indices", np.int32, shape=lp.auto),
        lp.GlobalArg("a_values", float_dtype),
        lp.GlobalArg("a_sum", float_dtype, shape=lp.auto),
        lp.ValueArg("n", np.int32),
    ]

    kernel = lp.make_kernel(
        domains,
        instructions,
        kernel_data,
        assumptions="n>=1 and row_len>=1",
        target=lp.PyOpenCLTarget(cl_ctx.devices[0]),
    )

    device_code = lp.generate_code_v2(kernel).device_code()
    print(device_code)
def test_dependent_loop_bounds_2(ctx_factory):
    """Print device code for a row sum using two chained temporaries.

    ``row_start`` is read from ``a_rowstarts[i]`` and ``row_len`` is derived
    from it; ``row_len`` bounds the ``jj`` domain. ``i`` is split by 128
    with ``g.0``/``l.0`` tags before code generation.
    """
    cl_ctx = ctx_factory()
    float_dtype = np.dtype(np.float32)

    domains = [
        "{[i]: 0<=i<n}",
        "{[jj]: 0<=jj<row_len}",
    ]
    instructions = [
        "<> row_start = a_rowstarts[i]",
        "<> row_len = a_rowstarts[i+1] - row_start",
        "ax[i] = sum(jj, a_values[[row_start+jj]])",
    ]
    kernel_data = [
        lp.GlobalArg("a_rowstarts", np.int32, shape=lp.auto),
        lp.GlobalArg("a_indices", np.int32, shape=lp.auto),
        lp.GlobalArg("a_values", float_dtype, strides=(1, )),
        lp.GlobalArg("ax", float_dtype, shape=lp.auto),
        lp.ValueArg("n", np.int32),
    ]

    kernel = lp.make_kernel(
        domains,
        instructions,
        kernel_data,
        assumptions="n>=1 and row_len>=1",
        target=lp.PyOpenCLTarget(cl_ctx.devices[0]),
    )
    kernel = lp.split_iname(
        kernel, "i", 128, outer_tag="g.0", inner_tag="l.0")

    device_code = lp.generate_code_v2(kernel).device_code()
    print(device_code)
def get_loopy_target(self) -> "loopy.PyOpenCLTarget":
    """Create the :class:`loopy.PyOpenCLTarget` for this object.

    The target receives ``self.queue.device`` when ``self.queue`` is set,
    and *None* otherwise.
    """
    import loopy as lp

    return lp.PyOpenCLTarget(
        self.queue.device if self.queue is not None else None)
def __inner(queue=None):
    """
    Run each kernel call against its matching kernels and collect the outputs.

    Uses `kernel_calls`, `knl`, `editor` and `device`, which are defined
    outside of this function.

    Parameters
    ----------
    queue :
        Passed unchanged as the second argument of every kernel call
        (default None)

    Returns
    -------
    output : list
        One entry per processed kernel call; each entry is that call's list
        of merged output arrays.  If no kernel matched a call, its entry is
        the untouched list of None placeholders

    Raises
    ------
    AssertionError
        If no kernel matched a kernel call and that call does not set
        `allow_skip`
    """
    output = []
    kc_ind = 0
    oob = False
    while not oob:
        # handle weirdness between list / non-list input
        try:
            kc = kernel_calls[kc_ind]
            kc_ind += 1
        except IndexError:
            oob = True
            break  # reached end of list
        except TypeError:
            # not a list: treat `kernel_calls` itself as the single call
            oob = True  # break on next run
            kc = kernel_calls

        # create the outputs: one placeholder per entry of the output mask,
        # or a single placeholder if there is no mask
        if kc.out_mask is not None:
            out_ref = [None for i in kc.out_mask]
        else:
            out_ref = [None]

        found = False
        # run kernels
        for k in knl:
            # test that we want to run this one
            if kc.is_my_kernel(k):
                found = True
                # set the editor to avoid intel bugs
                test_knl = editor(k)
                if isinstance(test_knl.target, lp.PyOpenCLTarget):
                    # recreate with device
                    test_knl = test_knl.copy(target=lp.PyOpenCLTarget(
                        device=device))

                # check for chaining: the hook receives the call and the
                # outputs collected from the previous calls
                if kc.chain:
                    kc.chain(kc, output)

                # run!
                out = kc(test_knl, queue)

                if kc.post_process:
                    kc.post_process(kc, out)

                # output mapping
                if all(x is None for x in out_ref):
                    # if the outputs are none, we init to zeros
                    # and avoid copying zeros over later data!
                    out_ref = [np.zeros_like(x) for x in out]

                for ind in range(len(out)):
                    # get the indices of this kernel's non-zero results; only
                    # those are copied, so zeros never overwrite values that
                    # are already in `out_ref`
                    # try w/o finite check (I'm paranoid, don't want to mask)
                    # any bad data
                    copy_inds = np.where(np.logical_not(out[ind] == 0))
                    # copy_inds = np.where(np.logical_not(
                    #     np.logical_or(np.isinf(out[ind]),
                    #                   out[ind] == 0, np.isnan(out[ind]))),
                    # )
                    out_ref[ind][copy_inds] = out[ind][copy_inds]

        output.append(out_ref)
        # a call that matched no kernel is an error unless it may be skipped
        assert found or kc.allow_skip, (
            'No kernels could be found to match kernel call {}'.format(
                kc.name))
    return output
def test_rob_stroud_bernstein_full(ctx_factory):
    """Build a two-stage kernel, round-trip it through pickle, print its code.

    Inside the ``el`` loop, the first stage (loop ``i2``) writes the
    temporary ``tmp`` and the second stage (loop ``i1_2``) accumulates
    ``w2 * tmp[...]`` into ``result``. The kernel is specialized to
    ``nqp1d=7, deg=4``, pickled and unpickled, given float32 dtypes, and the
    generated code object is printed.
    """
    #logging.basicConfig(level=logging.DEBUG)
    ctx = ctx_factory()

    # NOTE: result would have to be zero-filled beforehand

    knl = lp.make_kernel(
        "{[el, i2, alpha1,alpha2, i1_2, alpha1_2, i2_2]: \
                0 <= el < nels and \
                0 <= i2 < nqp1d and \
                0 <= alpha1 <= deg and 0 <= alpha2 <= deg-alpha1 and\
                \
                0 <= i1_2 < nqp1d and \
                0 <= alpha1_2 <= deg and \
                0 <= i2_2 < nqp1d \
                }",
        """
        for el
            for i2
                <> xi = qpts[1, i2]
                <> s = 1-xi
                <> r = xi/s
                <> aind = 0 {id=aind_init}
                for alpha1
                    <> w = s**(deg-alpha1) {id=init_w}
                    <> tmp[alpha1,i2] = tmp[alpha1,i2] + w * coeffs[aind] \
                            {id=write_tmp,dep=init_w:aind_init}
                    for alpha2
                        w = w * r * ( deg - alpha1 - alpha2 ) / (1 + alpha2) \
                            {id=update_w,dep=init_w:write_tmp}
                        aind = aind + 1 \
                            {id=aind_incr,dep=aind_init:write_tmp:update_w}
                    end
                end
            end
            for i1_2
                <> xi2 = qpts[0, i1_2] {dep=aind_incr}
                <> s2 = 1-xi2
                <> r2 = xi2/s2
                <> w2 = s2**deg {id=w2_init}
                for alpha1_2
                    for i2_2
                        result[el, i1_2, i2_2] = result[el, i1_2, i2_2] + \
                                w2 * tmp[alpha1_2, i2_2] {id=res2,dep=w2_init}
                    end
                    w2 = w2 * r2 * (deg-alpha1_2) / (1+alpha1_2) \
                            {id=w2_update, dep=res2}
                end
            end
        end
        """,
        [
            # Must declare coeffs to have "no" shape, to keep loopy
            # from trying to figure it out the shape automatically.
            lp.GlobalArg("coeffs", None, shape=None),
            "..."
            ],
        assumptions="deg>=0 and nels>=1",
        target=lp.PyOpenCLTarget(ctx.devices[0])
        )

    knl = lp.fix_parameters(knl, nqp1d=7, deg=4)

    # Parallelization is switched off here: this branch never runs.
    if 0:
        knl = lp.split_iname(knl, "el", 16, inner_tag="l.0")
        knl = lp.split_iname(knl, "el_outer", 2, outer_tag="g.0", inner_tag="ilp",
                slabs=(0, 1))
        knl = lp.tag_inames(knl, dict(i2="l.1", alpha1="unr", alpha2="unr"))

    # Round-trip the kernel through pickle before continuing; the pickled
    # data is produced right here, so nothing external is loaded.
    from pickle import dumps, loads
    knl = loads(dumps(knl))

    knl = lp.add_dtypes(knl, dict(
        qpts=np.float32,
        tmp=np.float32,
        coeffs=np.float32,
        result=np.float32,
        ))
    print(lp.generate_code_v2(knl))
def get_loopy_target(self) -> "loopy.PyOpenCLTarget":
    """Return a :class:`loopy.PyOpenCLTarget` built from ``self.device``.

    The return annotation names the class that is actually instantiated
    below.
    """
    # loopy is imported inside the method rather than at module import time
    import loopy as lp

    return lp.PyOpenCLTarget(self.device)