def test_barrier_counter_barriers():
    """Check the local-barrier count reported for a kernel with a dependency.

    The kernel has two instructions: the first writes the temporary ``c``,
    the second (marked ``dep=first``) reads ``c[i,j,k+1]`` and
    ``c[i,j,k-1]``.  The ``k`` iname is split by 128 with tags ``g.0`` /
    ``l.0`` before ``lp.get_synchronization_poly`` is queried.

    The test asserts that the ``"barrier_local"`` entry evaluates to
    ``50*10*2``; 50 and 10 are the extents of ``i`` and ``j`` in the domain.
    """
    knl = lp.make_kernel(
        "[n,m,l] -> {[i,k,j]: 0<=i<50 and 1<=k<98 and 0<=j<10}",
        [
            # Each instruction must sit on its own line of the source string;
            # fusing them puts the second assignment inside the first
            # instruction's text.
            """
            c[i,j,k] = 2*a[i,j,k] {id=first}
            e[i,j,k] = c[i,j,k+1]+c[i,j,k-1] {dep=first}
            """
        ],
        [
            lp.TemporaryVariable("c", lp.auto, shape=(50, 10, 99)),
            "...",
        ],
        name="weird2",
    )
    knl = lp.add_and_infer_dtypes(knl, dict(a=np.int32))
    knl = lp.split_iname(knl, "k", 128, outer_tag="g.0", inner_tag="l.0")

    poly = lp.get_synchronization_poly(knl)
    print(poly)

    # The domain bounds are constants, but n, m and l are still declared as
    # domain parameters, so values are supplied for evaluation.
    params = {'n': 512, 'm': 256, 'l': 128}
    barrier_count = poly["barrier_local"].eval_with_dict(params)
    assert barrier_count == 50*10*2
def test_all_counters_parallel_matmul():
    """Check sync, op and global-memory counts for a tagged matmul kernel.

    Builds ``c[i, j] = sum(k, a[i, k]*b[k, j])`` with float32 inputs, splits
    ``i`` and ``j`` by 16 (tags ``g.0``/``l.1`` and ``g.1``/``l.0``), and
    compares the evaluated counts against closed-form expressions in
    ``n``, ``m`` and ``l``.
    """
    kernel = lp.make_kernel(
        "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
        [
            "c[i, j] = sum(k, a[i, k]*b[k, j])"
        ],
        name="matmul",
        assumptions="n,m,l >= 1",
    )
    kernel = lp.add_and_infer_dtypes(
        kernel, dict(a=np.float32, b=np.float32))
    kernel = lp.split_iname(kernel, "i", 16, outer_tag="g.0", inner_tag="l.1")
    kernel = lp.split_iname(kernel, "j", 16, outer_tag="g.1", inner_tag="l.0")

    n, m, l = 512, 256, 128
    sizes = {'n': n, 'm': m, 'l': l}

    def evaluate(count_map, key):
        # Evaluate one entry of a count mapping at the concrete sizes.
        return count_map[key].eval_with_dict(sizes)

    # Synchronization: the only entry is the kernel launch itself.
    sync_counts = lp.get_synchronization_poly(kernel)
    assert len(sync_counts) == 1
    assert evaluate(sync_counts, "kernel_launch") == 1

    # Arithmetic operations.
    op_counts = lp.get_op_poly(kernel)
    f32_mul = evaluate(op_counts, (np.dtype(np.float32), 'mul'))
    f32_add = evaluate(op_counts, (np.dtype(np.float32), 'add'))
    i32_total = evaluate(op_counts, (np.dtype(np.int32), 'add'))
    i32_total += evaluate(op_counts, (np.dtype(np.int32), 'mul'))

    assert f32_mul + f32_add == n*m*l*2
    assert i32_total == n*m*l*4 + l*n*4

    # Global memory loads.
    gmem_counts = lp.get_gmem_access_poly(kernel)
    strided_loads = evaluate(
        gmem_counts, (np.dtype(np.float32), 'nonconsecutive', 'load'))
    contiguous_loads = evaluate(
        gmem_counts, (np.dtype(np.float32), 'consecutive', 'load'))

    assert strided_loads == n*m*l
    assert contiguous_loads == n*m*l

    # Global memory stores.
    contiguous_stores = evaluate(
        gmem_counts, (np.dtype(np.float32), 'consecutive', 'store'))
    assert contiguous_stores == n*l
def test_all_counters_parallel_matmul():
    """Verify every counter on a matmul kernel with group/local tags.

    The kernel ``c[i, j] = sum(k, a[i, k]*b[k, j])`` takes float32 ``a`` and
    ``b``; ``i`` and ``j`` are each split by 16 and tagged.  Synchronization,
    operation and global-memory-access counts are evaluated at
    ``n=512, m=256, l=128`` and compared with the expected expressions.
    """
    domain = "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}"
    instructions = ["c[i, j] = sum(k, a[i, k]*b[k, j])"]
    knl = lp.make_kernel(
        domain, instructions, name="matmul", assumptions="n,m,l >= 1")
    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.0")

    n = 512
    m = 256
    l = 128
    values = {'n': n, 'm': m, 'l': l}

    # --- synchronization -------------------------------------------------
    sync = lp.get_synchronization_poly(knl)
    assert len(sync) == 1
    launches = sync["kernel_launch"].eval_with_dict(values)
    assert launches == 1

    # --- arithmetic operations ------------------------------------------
    ops = lp.get_op_poly(knl)
    mul_f32_key = (np.dtype(np.float32), 'mul')
    add_f32_key = (np.dtype(np.float32), 'add')
    add_i32_key = (np.dtype(np.int32), 'add')
    mul_i32_key = (np.dtype(np.int32), 'mul')

    float_muls = ops[mul_f32_key].eval_with_dict(values)
    float_adds = ops[add_f32_key].eval_with_dict(values)
    int_adds = ops[add_i32_key].eval_with_dict(values)
    int_muls = ops[mul_i32_key].eval_with_dict(values)
    int_ops = int_adds + int_muls

    assert float_muls + float_adds == n * m * l * 2
    assert int_ops == n * m * l * 4 + l * n * 4

    # --- global memory accesses -----------------------------------------
    accesses = lp.get_gmem_access_poly(knl)
    nonconsec_load_key = (np.dtype(np.float32), 'nonconsecutive', 'load')
    consec_load_key = (np.dtype(np.float32), 'consecutive', 'load')
    uncoalesced_loads = accesses[nonconsec_load_key].eval_with_dict(values)
    coalesced_loads = accesses[consec_load_key].eval_with_dict(values)

    assert uncoalesced_loads == n * m * l
    assert coalesced_loads == n * m * l

    consec_store_key = (np.dtype(np.float32), 'consecutive', 'store')
    coalesced_stores = accesses[consec_store_key].eval_with_dict(values)
    assert coalesced_stores == n * l
def test_barrier_counter_nobarriers():
    """Check that an untransformed kernel reports only a kernel launch.

    The kernel has two instructions and is passed straight from dtype
    inference to ``lp.get_synchronization_poly``, with no iname splitting or
    tagging.  The test asserts that the result has exactly one entry and
    that its ``"kernel_launch"`` count evaluates to 1.
    """
    knl = lp.make_kernel(
        "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
        [
            # Each instruction must sit on its own line of the source string;
            # fusing them puts the second assignment inside the first
            # instruction's right-hand side.
            """
            c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
            e[i, k] = g[i,k]*h[i,k+1]
            """
        ],
        name="basic",
        assumptions="n,m,l >= 1",
    )
    knl = lp.add_and_infer_dtypes(
        knl,
        dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))

    sync_poly = lp.get_synchronization_poly(knl)

    params = {'n': 512, 'm': 256, 'l': 128}
    assert len(sync_poly) == 1
    assert sync_poly["kernel_launch"].eval_with_dict(params) == 1
def test_barrier_counter_nobarriers():
    """Verify the synchronization counts of a kernel with no transformations.

    Two instructions over the ``[i,k,j]`` domain are built, dtypes are
    inferred (float32 for ``a``/``b``, float64 for ``g``/``h``), and
    ``lp.get_synchronization_poly`` is queried directly.  The only expected
    entry is ``"kernel_launch"``, evaluating to 1.
    """
    domain = "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}"
    # One instruction per line: a single fused line would make the second
    # assignment part of the first instruction's text.
    instructions = [
        """
        c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
        e[i, k] = g[i,k]*h[i,k+1]
        """
    ]
    knl = lp.make_kernel(
        domain, instructions, name="basic", assumptions="n,m,l >= 1")

    input_dtypes = dict(
        a=np.float32, b=np.float32, g=np.float64, h=np.float64)
    knl = lp.add_and_infer_dtypes(knl, input_dtypes)

    sync_poly = lp.get_synchronization_poly(knl)

    # Concrete sizes at which the symbolic counts are evaluated.
    params = {'n': 512, 'm': 256, 'l': 128}
    assert len(sync_poly) == 1
    assert sync_poly["kernel_launch"].eval_with_dict(params) == 1
def test_barrier_counter_barriers():
    """Verify the number of local barriers counted for a stencil-like kernel.

    Instruction ``first`` fills the temporary ``c``; the dependent
    instruction reads ``c`` at ``k+1`` and ``k-1``.  After splitting ``k``
    by 128 (outer tag ``g.0``, inner tag ``l.0``), the ``"barrier_local"``
    entry of ``lp.get_synchronization_poly`` must evaluate to ``50 * 10 * 2``
    (50 and 10 being the extents of ``i`` and ``j`` in the domain).
    """
    domain = "[n,m,l] -> {[i,k,j]: 0<=i<50 and 1<=k<98 and 0<=j<10}"
    # One instruction per line: a single fused line would make the second
    # assignment part of the first instruction's text.
    instructions = [
        """
        c[i,j,k] = 2*a[i,j,k] {id=first}
        e[i,j,k] = c[i,j,k+1]+c[i,j,k-1] {dep=first}
        """
    ]
    kernel_data = [
        lp.TemporaryVariable("c", lp.auto, shape=(50, 10, 99)),
        "...",
    ]
    knl = lp.make_kernel(domain, instructions, kernel_data, name="weird2")
    knl = lp.add_and_infer_dtypes(knl, dict(a=np.int32))
    knl = lp.split_iname(knl, "k", 128, outer_tag="g.0", inner_tag="l.0")

    poly = lp.get_synchronization_poly(knl)
    print(poly)

    # n, m and l are declared domain parameters even though the bounds are
    # constants, so values are passed when evaluating.
    params = {'n': 512, 'm': 256, 'l': 128}
    barrier_count = poly["barrier_local"].eval_with_dict(params)
    assert barrier_count == 50 * 10 * 2