def test_add_nosync():
    """Check the ``no_sync_with`` entries produced by ``lp.add_nosync``.

    A six-instruction kernel is built once, with ``tmp3`` and ``tmp5`` set to
    "local" temporary scope.  ``lp.add_nosync`` is then applied to that same
    base kernel in four independent ways, and the ``no_sync_with`` attribute
    of the affected instructions is compared against the expected frozenset.
    """
    instructions = """
        <>tmp[i] = 10 {id=insn1}
        <>tmp2[i] = 10 {id=insn2}
        <>tmp3[2*i] = 0 {id=insn3}
        <>tmp4 = 1 + tmp3[2*i] {id=insn4}
        <>tmp5[i] = 0 {id=insn5,groups=g1}
        tmp5[i] = 1 {id=insn6,conflicts=g1}
        """
    base_knl = lp.make_kernel("{[i]: 0<=i<10}", instructions)
    for temporary in ("tmp3", "tmp5"):
        base_knl = lp.set_temporary_scope(base_knl, temporary, "local")

    def nosync_of(kernel, insn_id):
        # Shorthand for the attribute every assertion below inspects.
        return kernel.id_to_insn[insn_id].no_sync_with

    nothing = frozenset()
    with_insn3 = frozenset([("insn3", "local")])
    with_insn4 = frozenset([("insn4", "local")])
    with_insn5 = frozenset([("insn5", "local")])

    # No dependency present - don't add nosync
    unrelated = lp.add_nosync(
        base_knl, "any", "writes:tmp", "writes:tmp2", empty_ok=True)
    assert nosync_of(unrelated, "insn2") == nothing

    # Dependency present: only the reader (insn4) receives an entry
    one_way = lp.add_nosync(base_knl, "local", "writes:tmp3", "reads:tmp3")
    assert nosync_of(one_way, "insn3") == nothing
    assert nosync_of(one_way, "insn4") == with_insn3

    # Bidirectional: writer and reader each list the other
    two_way = lp.add_nosync(
        base_knl, "local", "writes:tmp3", "reads:tmp3", bidirectional=True)
    assert nosync_of(two_way, "insn3") == with_insn4
    assert nosync_of(two_way, "insn4") == with_insn3

    # Groups
    grouped = lp.add_nosync(base_knl, "local", "insn5", "insn6")
    assert nosync_of(grouped, "insn6") == with_insn5
def test_global_temporary(ctx_factory):
    """Test a kernel whose intermediate ``c`` is a "global"-scope temporary.

    The kernel writes ``c[i] = a[i + 1]`` and then copies ``c`` into ``out``.
    After ``c`` is given "global" temporary scope and ``i`` is split across
    group/local axes, the test asserts that code generation yields exactly
    two device programs and that the transformed kernel agrees with the
    untransformed reference for ``n=5``.

    NOTE: this test used to be defined twice in a row with identical logic;
    the second definition silently replaced the first, so only one copy is
    kept.

    :arg ctx_factory: callable returning the context passed on to
        ``lp.auto_test_vs_ref``.
    """
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{ [i]: 0<=i<n}",
        """
        <> c[i] = a[i + 1]
        out[i] = c[i]
        """)

    knl = lp.add_and_infer_dtypes(knl, {
        "a": np.float32,
        "c": np.float32,
        "out": np.float32,
        "n": np.int32,
    })
    knl = lp.set_temporary_scope(knl, "c", "global")

    # Keep the unsplit kernel as the reference for the comparison below.
    ref_knl = knl

    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    cgr = lp.generate_code_v2(knl)

    assert len(cgr.device_programs) == 2

    # For debugging, cgr.device_code() and cgr.host_code() give the
    # generated sources.

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))
def test_kernel_splitting_with_loop_and_private_temporary(ctx_factory):
    """Kernel-splitting test with a scalar and an array "private" temporary.

    The test calls ``pytest.xfail`` imperatively right after creating the
    context, with the reason "spilling doesn't yet use local axes".  The
    remainder of the body describes the intended check: build the kernel,
    split ``i`` across group/local axes, preprocess and schedule it, expect
    two device programs, and compare against the unsplit reference.

    :arg ctx_factory: callable returning the context passed on to
        ``lp.auto_test_vs_ref``.
    """
    ctx = ctx_factory()

    pytest.xfail("spilling doesn't yet use local axes")

    arg_dtypes = {
        "a": np.float32,
        "c": np.float32,
        "out": np.float32,
        "n": np.int32,
    }

    kernel = lp.make_kernel(
        "{ [i,k]: 0<=i<n and 0<=k<3 }",
        """
        <> t_private_scalar = a[k,i+1]
        <> t_private_array[i % 2] = a[k,i+1]
        c[k,i] = a[k,i+1]
        out[k,i] = c[k,i] + t_private_scalar + t_private_array[i % 2]
        """)
    kernel = lp.add_and_infer_dtypes(kernel, arg_dtypes)

    for temporary in ("t_private_scalar", "t_private_array"):
        kernel = lp.set_temporary_scope(kernel, temporary, "private")

    # The kernel before splitting serves as the reference.
    reference = kernel

    kernel = lp.split_iname(
        kernel, "i", 128, outer_tag="g.0", inner_tag="l.0")

    # Preprocess, then pick one schedule.
    from loopy.preprocess import preprocess_kernel
    from loopy.schedule import get_one_scheduled_kernel

    kernel = preprocess_kernel(kernel)
    kernel = get_one_scheduled_kernel(kernel)

    # Map the schedule onto host or device.
    print(kernel)

    codegen_result = lp.generate_code_v2(kernel)

    assert len(codegen_result.device_programs) == 2

    print(codegen_result.device_code())
    print(codegen_result.host_code())

    lp.auto_test_vs_ref(reference, ctx, kernel, parameters={"n": 5})