def test_op_counter_logic():
    """Check get_op_poly counts for a kernel whose right-hand side is an if().

    The kernel reads a float32 array (g) and a float64 array (h).  Counts are
    looked up per (dtype, operation) key and evaluated at n=512, m=256, l=128.
    """
    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                """
                e[i,k] = if(not(k<l-2) and k>6 or k/2==l, g[i,k]*2, g[i,k]+h[i,k]/2)
                """
            ],
            name="logic", assumptions="n,m,l >= 1")
    knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))
    poly = get_op_poly(knl)

    n, m, l = 512, 256, 128
    params = {'n': n, 'm': m, 'l': l}

    def count(scalar_type, op_name):
        # Evaluate the entry stored under (dtype, operation) at the sizes above.
        return poly[(np.dtype(scalar_type), op_name)].eval_with_dict(params)

    f32mul = count(np.float32, 'mul')
    f64add = count(np.float64, 'add')
    f64div = count(np.float64, 'div')
    i32add = count(np.int32, 'add')

    assert f32mul == n*m
    assert f64div == 2*n*m  # TODO why?
    assert f64add == n*m
    assert i32add == n*m
def test_op_counter_bitwise():
    """Check per-dtype totals from get_op_poly for a kernel using bitwise ops.

    Uses the per-dtype lookup (``poly.dict[dtype]``) and additionally checks
    that a dtype which does not occur in the kernel (float64) evaluates to 0.
    """
    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                """
                c[i, j, k] = (a[i,j,k] | 1) + (b[i,j,k] & 1)
                e[i, k] = (g[i,k] ^ k)*(~h[i,k+1]) + (g[i, k] << (h[i,k] >> k))
                """
            ],
            name="bitwise", assumptions="n,m,l >= 1")
    knl = lp.add_and_infer_dtypes(
            knl, dict(a=np.int32, b=np.int32, g=np.int64, h=np.int64))
    poly = get_op_poly(knl)

    n = m = l = 10
    param_values = {'n': n, 'm': m, 'l': l}

    int32_type = np.dtype(np.int32)
    int64_type = np.dtype(np.int64)
    absent_type = np.dtype(np.float64)

    i32 = poly.dict[int32_type].eval_with_dict(param_values)
    i64 = poly.dict[int64_type].eval_with_dict(param_values)
    # Looked up through poly[...] rather than poly.dict[...]: the key is absent.
    not_there = poly[absent_type].eval_with_dict(param_values)
    print(poly.dict)

    assert i32 == n*m + n*m*l
    assert i64 == 2*n*m
    assert not_there == 0
def test_op_counter_triangular_domain():
    """Check the float64 multiply count on a triangular (i<j) iteration domain.

    The expected value depends on whether ``isl.BasicSet`` exposes ``card``:
    144 when the attribute is missing, 78 when it is present (m=13, n=200).
    """
    knl = lp.make_kernel(
            "{[i,j]: 0<=i<n and 0<=j<m and i<j}",
            """
            a[i, j] = b[i,j] * 2
            """,
            name="bitwise", assumptions="n,m >= 1")
    knl = lp.add_and_infer_dtypes(knl, dict(b=np.float64))

    import islpy as isl
    # Missing attribute means the fallback result is expected.
    expect_fallback = not hasattr(isl.BasicSet, "card")

    mul_poly = get_op_poly(knl)[(np.dtype(np.float64), 'mul')]
    flops = mul_poly.eval_with_dict(dict(m=13, n=200))

    expected_flops = 144 if expect_fallback else 78
    assert flops == expected_flops
def test_op_counter_bitwise():
    """Check get_op_poly counts per (dtype, operation) for bitwise operators.

    The int32 statement spans the full (i, j, k) domain while the int64
    statement only uses (i, k); counts are evaluated at n=512, m=256, l=128.
    """
    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                """
                c[i, j, k] = (a[i,j,k] | 1) + (b[i,j,k] & 1)
                e[i, k] = (g[i,k] ^ k)*(~h[i,k+1]) + (g[i, k] << (h[i,k] >> k))
                """
            ],
            name="bitwise", assumptions="n,m,l >= 1")
    knl = lp.add_and_infer_dtypes(
            knl, dict(a=np.int32, b=np.int32, g=np.int64, h=np.int64))
    poly = get_op_poly(knl)

    n, m, l = 512, 256, 128
    params = {'n': n, 'm': m, 'l': l}

    def count(scalar_type, op_name):
        # Evaluate the entry stored under (dtype, operation) at the sizes above.
        return poly[(np.dtype(scalar_type), op_name)].eval_with_dict(params)

    i32add = count(np.int32, 'add')
    i32bw = count(np.int32, 'bw')
    i64bw = count(np.int64, 'bw')
    i64mul = count(np.int64, 'mul')
    i64add = count(np.int64, 'add')
    i64shift = count(np.int64, 'shift')

    assert i32add == n*m+n*m*l
    assert i32bw == 2*n*m*l
    assert i64bw == 2*n*m
    assert i64add == i64mul == n*m
    assert i64shift == 2*n*m
def test_op_counter_specialops():
    """Check get_op_poly counts for the modulo (%) and power (**) operators.

    Counts are looked up per (dtype, operation) key and evaluated at
    n=512, m=256, l=128.
    """
    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                """
                c[i, j, k] = (2*a[i,j,k])%(2+b[i,j,k]/3.0)
                e[i, k] = (1+g[i,k])**(1+h[i,k+1])
                """
            ],
            name="specialops", assumptions="n,m,l >= 1")
    knl = lp.add_and_infer_dtypes(
            knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
    poly = get_op_poly(knl)

    n, m, l = 512, 256, 128
    params = {'n': n, 'm': m, 'l': l}

    def count(scalar_type, op_name):
        # Evaluate the entry stored under (dtype, operation) at the sizes above.
        return poly[(np.dtype(scalar_type), op_name)].eval_with_dict(params)

    f32mul = count(np.float32, 'mul')
    f32div = count(np.float32, 'div')
    f32add = count(np.float32, 'add')
    f64pow = count(np.float64, 'pow')
    f64add = count(np.float64, 'add')
    i32add = count(np.int32, 'add')

    assert f32div == 2*n*m*l
    assert f32mul == f32add == n*m*l
    assert f64add == 2*n*m
    assert f64pow == i32add == n*m
def test_op_counter_basic():
    """Check get_op_poly counts for plain add/mul/div arithmetic.

    One float32 statement covers the (i, j, k) domain and one float64
    statement covers (i, k); counts are evaluated at n=512, m=256, l=128.
    """
    knl = lp.make_kernel(
            "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                """
                c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
                e[i, k+1] = -g[i,k]*h[i,k+1]
                """
            ],
            name="basic", assumptions="n,m,l >= 1")
    knl = lp.add_and_infer_dtypes(
            knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
    poly = get_op_poly(knl)

    n, m, l = 512, 256, 128
    params = {'n': n, 'm': m, 'l': l}

    def count(scalar_type, op_name):
        # Evaluate the entry stored under (dtype, operation) at the sizes above.
        return poly[(np.dtype(scalar_type), op_name)].eval_with_dict(params)

    f32add = count(np.float32, 'add')
    f32mul = count(np.float32, 'mul')
    f32div = count(np.float32, 'div')
    f64mul = count(np.float64, 'mul')
    i32add = count(np.int32, 'add')

    assert f32add == f32mul == f32div == n*m*l
    assert f64mul == n*m
    assert i32add == n*m*2
def test_all_counters_parallel_matmul():
    """Check barrier, operation and global-memory counts for a tiled matmul.

    The i and j loops are split by 16 and mapped to group/local axes; all
    counts are evaluated at n=512, m=256, l=128.
    """
    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                "c[i, j] = sum(k, a[i, k]*b[k, j])"
            ],
            name="matmul", assumptions="n,m,l >= 1")
    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.0")

    n, m, l = 512, 256, 128
    params = {'n': n, 'm': m, 'l': l}
    f32 = np.dtype(np.float32)
    i32 = np.dtype(np.int32)

    # --- barriers ---
    barrier_count = get_barrier_poly(knl).eval_with_dict(params)
    assert barrier_count == 0

    # --- arithmetic operations ---
    op_map = get_op_poly(knl)
    f32mul = op_map[(f32, 'mul')].eval_with_dict(params)
    f32add = op_map[(f32, 'add')].eval_with_dict(params)
    i32_adds = op_map[(i32, 'add')].eval_with_dict(params)
    i32_muls = op_map[(i32, 'mul')].eval_with_dict(params)
    i32ops = i32_adds + i32_muls

    assert f32mul+f32add == n*m*l*2
    assert i32ops == n*m*l*4 + l*n*4

    # --- global memory accesses ---
    subscript_map = get_gmem_access_poly(knl)

    def accesses(pattern, direction):
        # float32 accesses with the given stride pattern and load/store kind.
        return subscript_map[(f32, pattern, direction)].eval_with_dict(params)

    f32uncoal = accesses('nonconsecutive', 'load')
    f32coal = accesses('consecutive', 'load')
    assert f32uncoal == n*m*l
    assert f32coal == n*m*l

    f32coal = accesses('consecutive', 'store')
    assert f32coal == n*l
def test_op_counter_reduction():
    """Check the float32 total from get_op_poly for a sum-reduction matmul.

    Uses the per-dtype lookup (``poly.dict[dtype]``) evaluated at
    n=512, m=256, l=128.
    """
    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                "c[i, j] = sum(k, a[i, k]*b[k, j])"
            ],
            name="matmul", assumptions="n,m,l >= 1")
    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
    poly = get_op_poly(knl)

    n, m, l = 512, 256, 128
    sizes = {'n': n, 'm': m, 'l': l}

    f32 = poly.dict[np.dtype(np.float32)].eval_with_dict(sizes)

    assert f32 == 2*n*m*l
def test_op_counter_reduction():
    """Check float32 add and mul counts for an untransformed sum-reduction matmul.

    Counts are looked up per (dtype, operation) key and evaluated at
    n=512, m=256, l=128.
    """
    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                "c[i, j] = sum(k, a[i, k]*b[k, j])"
            ],
            name="matmul_serial", assumptions="n,m,l >= 1")
    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
    poly = get_op_poly(knl)

    n, m, l = 512, 256, 128
    params = {'n': n, 'm': m, 'l': l}
    f32 = np.dtype(np.float32)

    f32add = poly[(f32, 'add')].eval_with_dict(params)
    f32mul = poly[(f32, 'mul')].eval_with_dict(params)

    assert f32add == f32mul == n*m*l
def test_op_counter_logic():
    """Check per-dtype totals from get_op_poly for a kernel with an if() RHS.

    Uses the per-dtype lookup (``poly.dict[dtype]``) evaluated at
    n=512, m=256, l=128.
    """
    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                """
                e[i,k] = if(not(k<l-2) and k>6 or k/2==l, g[i,k]*2, g[i,k]+h[i,k]/2)
                """
            ],
            name="logic", assumptions="n,m,l >= 1")
    knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))
    poly = get_op_poly(knl)

    n, m, l = 512, 256, 128
    sizes = {'n': n, 'm': m, 'l': l}

    def total(scalar_type):
        # Evaluate the per-dtype entry at the sizes above.
        return poly.dict[np.dtype(scalar_type)].eval_with_dict(sizes)

    f32 = total(np.float32)
    f64 = total(np.float64)
    i32 = total(np.int32)

    assert f32 == n*m
    assert f64 == 3*n*m
    assert i32 == n*m
def test_op_counter_basic():
    """Check per-dtype totals from get_op_poly for plain arithmetic.

    Uses the per-dtype lookup (``poly.dict[dtype]``) evaluated at
    n=512, m=256, l=128.
    """
    knl = lp.make_kernel(
            "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                """
                c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
                e[i, k] = g[i,k]*h[i,k+1]
                """
            ],
            name="weird", assumptions="n,m,l >= 1")
    knl = lp.add_and_infer_dtypes(
            knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
    poly = get_op_poly(knl)

    n, m, l = 512, 256, 128
    sizes = {'n': n, 'm': m, 'l': l}

    def total(scalar_type):
        # Evaluate the per-dtype entry at the sizes above.
        return poly.dict[np.dtype(scalar_type)].eval_with_dict(sizes)

    f32 = total(np.float32)
    f64 = total(np.float64)
    i32 = total(np.int32)

    assert f32 == 3*n*m*l
    assert f64 == n*m
    assert i32 == n*m
def test_op_counter_specialops():
    """Check per-dtype totals from get_op_poly for the % and ** operators.

    Uses the per-dtype lookup (``poly.dict[dtype]``) evaluated at
    n=512, m=256, l=128.
    """
    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                """
                c[i, j, k] = (2*a[i,j,k])%(2+b[i,j,k]/3.0)
                e[i, k] = (1+g[i,k])**(1+h[i,k+1])
                """
            ],
            name="specialops", assumptions="n,m,l >= 1")
    knl = lp.add_and_infer_dtypes(
            knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
    poly = get_op_poly(knl)

    n, m, l = 512, 256, 128
    sizes = {'n': n, 'm': m, 'l': l}

    def total(scalar_type):
        # Evaluate the per-dtype entry at the sizes above.
        return poly.dict[np.dtype(scalar_type)].eval_with_dict(sizes)

    f32 = total(np.float32)
    f64 = total(np.float64)
    i32 = total(np.int32)

    assert f32 == 4*n*m*l
    assert f64 == 3*n*m
    assert i32 == n*m
def run_conv_trials(ctx, queue, nvals, configs_t,
                    Atrain_all, Atest_all, ytrain_all, ytest_all,
                    actual_times_all, HK_predict_all, train_test_config):
    """Time the "conv" kernel for every (n, block shape) and record the results.

    For each image size ``n`` in *nvals* and each ``(BSIZEx, BSIZEy)`` pair in
    *configs_t* the kernel is split/tagged/prefetched, its barrier, operation
    and DRAM access counts are evaluated, it is run
    ``averaging_trials + warmup_trials`` times (module-level settings), and the
    mean of the non-warmup timings, the PerfModel prediction and one row of
    the least-squares matrix are collected.

    :arg ctx: not used by the active code (only by the commented-out checks).
    :arg queue: command queue used for random input data and kernel launches.
    :arg nvals: iterable of image edge lengths.
    :arg configs_t: iterable of ``(BSIZEx, BSIZEy)`` block shapes.

    The remaining arguments are forwarded unchanged to ``update_lstsq_mats``
    together with the rows, timings and predictions gathered here.
    Returns ``None``.
    """
    A = []
    HK_predict = []
    actual = []
    dtype = np.float32
    ncolors = 3
    f_w = 3
    nfeats = 3
    nimgs = 3

    # Nothing in the kernel definition depends on n, so build it once instead
    # of once per entry of nvals.
    conv_knl = lp.make_kernel(
        "{ [iimg, ifeat, icolor, im_x, im_y, f_x, f_y]: \
            -f_w <= f_x,f_y <= f_w \
            and 0 <= im_x < im_w and 0 <= im_y < im_h \
            and 0<=iimg<=nimgs and 0<=ifeat<nfeats and 0<=icolor<ncolors \
            }",
        """
        out[iimg, ifeat, im_x, im_y] = sum((f_x, f_y, icolor), \
            img[iimg, f_w+im_x-f_x, f_w+im_y-f_y, icolor] \
            * f[ifeat, f_w+f_x, f_w+f_y, icolor])
        """,
        [
            lp.GlobalArg("f", dtype, shape=lp.auto),
            lp.GlobalArg("img", dtype, shape=lp.auto),
            lp.GlobalArg("out", dtype, shape=lp.auto),
            "..."
        ],
        assumptions="f_w>=1 and im_w, im_h >= 2*f_w+1 and nfeats>=1 and nimgs>=0",
        flags="annotate_inames",
        defines=dict(ncolors=ncolors),
        name="conv")
    ref_knl = lp.fix_parameters(conv_knl, f_w=f_w)

    for n in nvals:
        im_w = n
        im_h = n

        for BSIZEx, BSIZEy in configs_t:
            f_dev = cl.clrandom.rand(
                    queue, (nfeats, 2*f_w+1, 2*f_w+1, ncolors), dtype=dtype)
            img_dev = cl.clrandom.rand(
                    queue, (nimgs+1, n+2*f_w, n+2*f_w, ncolors), dtype=dtype)

            knl = lp.split_iname(ref_knl, "im_x", BSIZEx,
                                 outer_tag="g.0", inner_tag="l.0")
            knl = lp.split_iname(knl, "im_y", BSIZEy,
                                 outer_tag="g.1", inner_tag="l.1")
            knl = lp.tag_inames(knl, dict(ifeat="g.2"))
            knl = lp.add_prefetch(knl, "f[ifeat,:,:,:]")
            knl = lp.add_prefetch(knl, "img", "im_x_inner, im_y_inner, f_x, f_y")

            params = dict(im_w=im_w, im_h=im_h, f_w=f_w,
                          nfeats=nfeats, nimgs=nimgs)

            # Static kernel statistics evaluated at the concrete sizes.
            barrier_ct = get_barrier_poly(knl).eval_with_dict(params)
            op_map = get_op_poly(knl)
            flops, iops = get_32b_ops(op_map, params)
            # TODO why do block sizes that don't fit perfectly increase total
            # flops/iops?
            sub_map = get_DRAM_access_poly(knl)  # noqa
            f32coal_l, f32coal_s, f32uncoal_l, f32uncoal_s = \
                get_DRAM_f32_accesses(sub_map, params)
            f32coal = f32coal_l + f32coal_s
            f32uncoal = f32uncoal_l + f32uncoal_s

            # execute
            print("running kernel...")
            trial_times = []
            for _ in range(averaging_trials+warmup_trials):
                evt, (_,) = knl(queue, f=f_dev, img=img_dev, im_w=im_w,
                                im_h=im_h, nfeats=nfeats, nimgs=nimgs)
                evt.wait()
                # Profiling timestamps scaled by 1e-9 (presumably ns -> s).
                trial_times.append((evt.profile.END - evt.profile.START)*1e-9)
            # Warmup runs are excluded from the average.
            avg_time = np.average(trial_times[warmup_trials:])

            gstats = GPUStats('TeslaK20')
            reg32_per_thread = 33  # hard-coded value, not derived here
            shared_mem_per_block = (ncolors * (f_w*2+1) * (f_w*2+1) +
                                    (BSIZEx+f_w*2) * (BSIZEy+f_w*2)
                                    ) * np.dtype(dtype).itemsize
            total_blocks = math.ceil(n/BSIZEx)*math.ceil(n/BSIZEy)

            kstats = KernelStats(flops/(n*n), f32uncoal/(n*n), f32coal/(n*n),
                                 barrier_ct, reg32_per_thread,
                                 shared_mem_per_block)
            tconfig = ThreadConfig(BSIZEx*BSIZEy, total_blocks)
            model = PerfModel(gstats, kstats, tconfig, np.dtype(dtype))
            cycles = model.compute_total_cycles()

            actual.append(avg_time)
            HK_predict.append(cycles/(gstats.sm_clock_freq*10**9))

            # TODO try total_blocks*BSIZEx*BSIZEy (total threads) in place of n*n
            update_LS_matrix(A, flops, iops, f32coal_l, f32coal_s,
                             f32uncoal_l, f32uncoal_s, barrier_ct,
                             total_blocks, n*n, np.dtype(dtype).itemsize, model)

    update_lstsq_mats(Atrain_all, Atest_all, ytrain_all, ytest_all,
                      actual_times_all, HK_predict_all,
                      A, actual, HK_predict, train_test_config)
def run_mm_trials(ctx, queue, nvals, configs_t,
                  Atrain_all, Atest_all, ytrain_all, ytest_all,
                  actual_times_all, HK_predict_all, train_test_config,
                  version):
    """Time the tiled "matmul" kernel for every (n, block shape) and record it.

    For each matrix size ``n`` in *nvals* and each ``(BSIZEx, BSIZEy)`` pair in
    *configs_t* the kernel is split, prefetched, analysed (barriers, operation
    counts, DRAM accesses), run ``averaging_trials + warmup_trials`` times
    (module-level settings) and the mean non-warmup time, the PerfModel
    prediction and one least-squares row (via ``update_LS_matrix2``) are kept.

    :arg ctx: not used by the active code (only by the commented-out checks).
    :arg queue: command queue used for random input data and kernel launches.
    :arg nvals: iterable of matrix edge lengths.
    :arg configs_t: iterable of ``(BSIZEx, BSIZEy)`` block shapes.
    :arg version: ``"allcoal"`` or ``"partcoal"``; selects which local axis the
        inner ``i``/``j`` loops are mapped to.

    The remaining arguments are forwarded unchanged to ``update_lstsq_mats``.

    :raises ValueError: if *version* is not one of the two supported names
        (raised when the first block shape is processed).
    """
    A = []
    HK_predict = []
    actual = []
    dtype = np.float32
    order = "C"

    # TODO figure out smem usage issue
    for n in nvals:
        a_mat_dev = cl.clrandom.rand(queue, (n, n), dtype=dtype)
        b_mat_dev = cl.clrandom.rand(queue, (n, n), dtype=dtype)
        c_mat_dev = cl.clrandom.rand(queue, (n, n), dtype=dtype)

        ref_knl = lp.make_kernel(
            "{[i,j,k]: 0<=i,j,k<%d}" % n,
            [
                "c[i, j] = sum(k, a[i, k]*b[k, j])"
            ],
            [
                lp.GlobalArg("a", dtype, shape=(n, n), order=order),
                lp.GlobalArg("b", dtype, shape=(n, n), order=order),
                lp.GlobalArg("c", dtype, shape=(n, n), order=order),
            ],
            name="matmul")

        for BSIZEx, BSIZEy in configs_t:
            # The two versions differ only in the local axis of the inner loops.
            if version == "allcoal":
                i_inner_tag, j_inner_tag = "l.1", "l.0"
            elif version == "partcoal":
                i_inner_tag, j_inner_tag = "l.0", "l.1"
            else:
                raise ValueError(
                        "unknown matmul version %r "
                        "(expected 'allcoal' or 'partcoal')" % (version,))

            knl = lp.split_iname(ref_knl, "i", BSIZEy,
                                 outer_tag="g.0", inner_tag=i_inner_tag)
            knl = lp.split_iname(knl, "j", BSIZEx,
                                 outer_tag="g.1", inner_tag=j_inner_tag)

            ksplit = BSIZEy
            knl = lp.split_iname(knl, "k", ksplit)
            knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"])
            knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner", ])

            params = {'n': n}
            barrier_ct = get_barrier_poly(knl).eval_with_dict(params)

            # Operation counts from both counters; the second one yields the
            # per-category list plus an "everything else" remainder.
            op_map = get_op_poly(knl)
            op_map2 = get_op_poly2(knl)
            flops, iops = get_32b_ops(op_map, params)
            amd_op32 = get_32b_amd_ops(op_map2, params)
            other_op32 = get_32b_ops_all(op_map2, params) - sum(amd_op32)
            if flops + iops != sum(amd_op32) + other_op32:
                # TODO remove after debug
                print("<debug> PROBLEM!, ops don't add up: ",
                      flops, iops, sum(amd_op32), other_op32)

            sub_map = get_DRAM_access_poly(knl)  # noqa
            f32coal_l, f32coal_s, f32uncoal_l, f32uncoal_s = \
                get_DRAM_f32_accesses(sub_map, params)
            f32coal = f32coal_l + f32coal_s
            f32uncoal = f32uncoal_l + f32uncoal_s

            # execute
            print("running kernel...")
            trial_times = []
            for _ in range(averaging_trials+warmup_trials):
                evt, (_,) = knl(queue, a=a_mat_dev, b=b_mat_dev, c=c_mat_dev)
                evt.wait()
                # Profiling timestamps scaled by 1e-9 (presumably ns -> s).
                trial_times.append((evt.profile.END - evt.profile.START)*1e-9)
            # Warmup runs are excluded from the average.
            avg_time = np.average(trial_times[warmup_trials:])

            gstats = GPUStats('TeslaC2070')
            # The register count comes from the estimator.  Hand-entered values
            # that were previously assigned here and then overwritten:
            #   C2070: BSIZEx 8/16 -> 20, 32 -> 19, 24 -> 12
            #   K20:   BSIZEx 8/32 -> 25, 24 -> 18, 16 -> 22
            reg32_per_thread = estimate_regs_per_thread(knl)
            shared_mem_per_block = 4*ksplit*(BSIZEx+BSIZEy)
            total_blocks = math.ceil(n/BSIZEx)*math.ceil(n/BSIZEy)

            kstats = KernelStats(flops/(n*n), f32uncoal/(n*n), f32coal/(n*n),
                                 barrier_ct, reg32_per_thread,
                                 shared_mem_per_block)
            tconfig = ThreadConfig(BSIZEx*BSIZEy, total_blocks)
            model = PerfModel(gstats, kstats, tconfig, np.dtype(dtype))
            cycles = model.compute_total_cycles()

            actual.append(avg_time)
            HK_predict.append(cycles/(gstats.sm_clock_freq*10**9))

            # One least-squares row per configuration: the per-category op
            # counts followed by the remainder.
            ops = copy.deepcopy(amd_op32)
            ops.append(other_op32)
            update_LS_matrix2(A, ops, f32coal_l, f32coal_s,
                              f32uncoal_l, f32uncoal_s, barrier_ct,
                              total_blocks, n*n,
                              np.dtype(dtype).itemsize, model)

    update_lstsq_mats(Atrain_all, Atest_all, ytrain_all, ytest_all,
                      actual_times_all, HK_predict_all,
                      A, actual, HK_predict, train_test_config)
def run_varyflops_trials(ctx, queue, nvals, configs_t,
                         Atrain_all, Atest_all, ytrain_all, ytest_all,
                         actual_times_all, HK_predict_all, train_test_config):
    """Gather statistics for the elementwise "basic" kernel (work in progress).

    For each size ``n`` in *nvals* and each ``(BSIZEx, BSIZEy)`` in *configs_t*
    the kernel is split, checked against the reference kernel with
    ``lp.auto_test_vs_ref`` and analysed (barriers, operation counts, DRAM
    accesses).

    NOTE(review): as written, the first configuration prints the kernel and
    the per-element access counts and then hits an unconditional ``1/0``, so
    the function always ends with ZeroDivisionError whenever *nvals* and
    *configs_t* are non-empty.  The timing, model and least-squares code after
    that point is currently unreachable.

    :arg ctx: context passed to ``lp.auto_test_vs_ref``.
    :arg queue: command queue used for random input data and kernel launches.
    :arg nvals: iterable of problem sizes (n, m and l are all set to n).
    :arg configs_t: iterable of ``(BSIZEx, BSIZEy)`` block shapes.

    The remaining arguments are forwarded unchanged to ``update_lstsq_mats``.
    """
    A = []
    HK_predict = []
    actual = []
    dtype = np.float32
    #TODO figure out smem usage issue
    for n in nvals:
        a_mat_dev = cl.clrandom.rand(queue, (n, n, n), dtype=dtype)
        b_mat_dev = cl.clrandom.rand(queue, (n, n, n), dtype=dtype)
        # g and h are only needed by the disabled two-statement kernel variants
        # below; the active kernel never receives them.
        g_mat_dev = cl.clrandom.rand(queue, (n, n, n), dtype=dtype)
        h_mat_dev = cl.clrandom.rand(queue, (n, n, n+1), dtype=dtype)
        knl = lp.make_kernel(
            "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                """
                c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
                """
            ],
            name="basic", assumptions="n,m,l >= 1")
        # The string literal below is disabled code (alternative kernels with a
        # second statement), kept as a no-op expression statement.
        '''
        knl = lp.make_kernel(
            "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                """
                c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
                e[i, j, k+1] = g[i,j,k]*h[i,j,k+1]
                """
            ],
            name="basic", assumptions="n,m,l >= 1")
        knl = lp.make_kernel(
            "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                """
                c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0
                e[i, j, k+1] = g[i,j,k]*h[i,j,k+1]
                """
            ],
            name="basic", assumptions="n,m,l >= 1")
        '''
        #knl = lp.add_and_infer_dtypes(knl,
        #            dict(a=dtype, b=dtype, g=dtype, h=dtype))
        knl = lp.add_and_infer_dtypes(knl, dict(a=dtype, b=dtype))
        ref_knl = knl
        for BSIZEx, BSIZEy in configs_t:
            # Start every configuration from the untransformed kernel.
            knl = ref_knl
            knl = lp.split_iname(knl, "i", BSIZEy, outer_tag="g.0", inner_tag="l.1")
            knl = lp.split_iname(knl, "j", BSIZEx, outer_tag="g.1", inner_tag="l.0")
            params = dict(n=n, m=n, l=n)
            # The result is only consumed by the commented-out print below.
            check = lp.auto_test_vs_ref(ref_knl, ctx, knl, print_code=True,
                                        parameters=params)
            #print "Correctness check: \n", check
            # use ptx src to determine resource usage
            #ptx_dump(ctx, knl, n, BSIZEx, BSIZEy)
            barrier_poly = get_barrier_poly(knl)
            barrier_ct = barrier_poly.eval_with_dict(params)
            op_map = get_op_poly(knl)
            flops, iops = get_32b_ops(op_map, params)
            sub_map = get_DRAM_access_poly(knl)  # noqa
            f32coal_l, f32coal_s, f32uncoal_l, f32uncoal_s = get_DRAM_f32_accesses(
                                                                sub_map, params)
            f32coal = f32coal_l + f32coal_s
            f32uncoal = f32uncoal_l + f32uncoal_s
            #print(sub_map)
            #print(f32coal/(n*n), f32uncoal/(n*n))
            print(knl)
            print(f32coal/(n*n), f32uncoal/(n*n))
            # NOTE(review): deliberate-looking debug stop; raises
            # ZeroDivisionError and makes everything below unreachable.
            1/0
            '''
            print_ptx_src_msg(knl.name)
            print "="*40+"KERNEL STATS"
            print "barrier count: ", barrier_ct
            print "flops: ", flops
            print(sub_map)
            print "="*40
            '''
            # execute
            #print "="*40+"TIMING RESULTS"
            print("running kernel...")
            #knl = lp.set_options(knl, write_cl=True, highlight_cl=True)
            trial_times = []
            for i in range(averaging_trials+warmup_trials):
                #evt, out = knl(queue, a=a_mat_dev, b=b_mat_dev,
                #        g=g_mat_dev, h=h_mat_dev)
                evt, out = knl(queue, a=a_mat_dev, b=b_mat_dev)
                evt.wait()
                # Profiling timestamps scaled by 1e-9 (presumably ns -> s).
                trial_times.append((evt.profile.END - evt.profile.START)*1e-9)
            # Warmup runs are excluded from the average.
            avg_time = np.average(trial_times[warmup_trials:])
            gstats = GPUStats('TeslaK20')
            '''
            if BSIZEx == 8 or BSIZEx == 32:  # TODO fix hack
                reg32_per_thread = 25
            elif BSIZEx == 24:
                reg32_per_thread = 18
            elif BSIZEx == 16:
                reg32_per_thread = 22
            '''
            reg32_per_thread = 18
            shared_mem_per_block = 0
            total_blocks = math.ceil(n/BSIZEx)*math.ceil(n/BSIZEy)
            total_threads = total_blocks*BSIZEx*BSIZEy  # TODO never used
            kstats = KernelStats(flops/(n*n), f32uncoal/(n*n), f32coal/(n*n),
                                 barrier_ct, reg32_per_thread, shared_mem_per_block)
            tconfig = ThreadConfig(BSIZEx*BSIZEy, total_blocks)
            model = PerfModel(gstats, kstats, tconfig, np.dtype(dtype))
            cycles = model.compute_total_cycles()
            actual.append(avg_time)
            HK_predict.append(cycles/(gstats.sm_clock_freq*10**9))
            '''
            print "actual runtime: ", actual[-1]
            print "total predicted time: ", predicted[-1]
            print "total predicted execution cycles: ", cycles
            print "="*40
            '''
            update_LS_matrix(A, flops, iops, f32coal_l, f32coal_s,
                             f32uncoal_l, f32uncoal_s, barrier_ct, total_blocks,
                             n*n, np.dtype(dtype).itemsize, model)
    update_lstsq_mats(Atrain_all, Atest_all, ytrain_all, ytest_all,
                      actual_times_all, HK_predict_all,
                      A, actual, HK_predict, train_test_config)
def run_axpy_trials(ctx, queue, nvals, configs_t,
                    Atrain_all, Atest_all, ytrain_all, ytest_all,
                    actual_times_all, HK_predict_all, train_test_config):
    """Time the "axpy" kernel for every (n, block shape) and record the results.

    For each vector length ``n`` in *nvals* and each ``(BSIZEx, BSIZEy)`` pair
    in *configs_t* the loop is split into groups of ``unroll*BSIZEx`` entries
    with an unrolled middle loop, analysed (barriers, operation counts, DRAM
    accesses), run ``averaging_trials + warmup_trials`` times (module-level
    settings) and the mean non-warmup time, the PerfModel prediction and one
    least-squares row are kept.

    :arg ctx: not used by the active code (only by the commented-out checks).
    :arg queue: command queue used for random input data and kernel launches.
    :arg nvals: iterable of vector lengths.
    :arg configs_t: iterable of ``(BSIZEx, BSIZEy)`` block shapes; only
        ``BSIZEx`` takes part in the loop transformation.

    The remaining arguments are forwarded unchanged to ``update_lstsq_mats``.
    Returns ``None``.
    """
    A = []
    HK_predict = []
    actual = []
    dtype = np.float32
    unroll = 4

    # TODO figure out smem usage issue
    for n in nvals:
        x_vec_dev = cl.clrandom.rand(queue, n, dtype=dtype)
        y_vec_dev = cl.clrandom.rand(queue, n, dtype=dtype)
        z_vec_dev = cl.clrandom.rand(queue, n, dtype=dtype)

        ref_knl = lp.make_kernel(
            "[n] -> {[i]: 0<=i<%d}" % n,
            [
                "z[i] = 5.0*x[i]+7.0*y[i]"
            ],
            [
                lp.GlobalArg("x", dtype, shape=n),
                lp.GlobalArg("y", dtype, shape=n),
                lp.GlobalArg("z", dtype, shape=n),
            ],
            name="axpy")
        params = {'n': n}

        for BSIZEx, BSIZEy in configs_t:
            knl = lp.split_iname(ref_knl, "i", unroll*BSIZEx,
                                 outer_tag="g.0", slabs=(0, 1))
            knl = lp.split_iname(knl, "i_inner", BSIZEx,
                                 outer_tag="unr", inner_tag="l.0")

            # Static kernel statistics evaluated at the concrete size.
            barrier_ct = get_barrier_poly(knl).eval_with_dict(params)
            op_map = get_op_poly(knl)
            flops, iops = get_32b_ops(op_map, params)
            sub_map = get_DRAM_access_poly(knl)  # noqa
            f32coal_l, f32coal_s, f32uncoal_l, f32uncoal_s = \
                get_DRAM_f32_accesses(sub_map, params)
            f32coal = f32coal_l + f32coal_s
            f32uncoal = f32uncoal_l + f32uncoal_s

            # execute
            print("running kernel...")
            trial_times = []
            for _ in range(averaging_trials+warmup_trials):
                evt, (_,) = knl(queue, x=x_vec_dev, y=y_vec_dev, z=z_vec_dev)
                evt.wait()
                # Profiling timestamps scaled by 1e-9 (presumably ns -> s).
                trial_times.append((evt.profile.END - evt.profile.START)*1e-9)
            # Warmup runs are excluded from the average.
            avg_time = np.average(trial_times[warmup_trials:])

            gstats = GPUStats('TeslaK20')
            reg32_per_thread = 20  # hard-coded value, not derived here
            shared_mem_per_block = 0
            total_blocks = math.ceil(n/(BSIZEx*unroll))

            kstats = KernelStats(flops*unroll/n, f32uncoal*unroll/n,
                                 f32coal*unroll/n, barrier_ct,
                                 reg32_per_thread, shared_mem_per_block)
            # NOTE(review): the kernel only uses BSIZEx for its local axis, yet
            # the thread count passed here is BSIZEx*BSIZEy -- confirm intended.
            tconfig = ThreadConfig(BSIZEx*BSIZEy, total_blocks)
            model = PerfModel(gstats, kstats, tconfig, np.dtype(dtype))
            cycles = model.compute_total_cycles()

            actual.append(avg_time)
            HK_predict.append(cycles/(gstats.sm_clock_freq*10**9))

            update_LS_matrix(A, flops, iops, f32coal_l, f32coal_s,
                             f32uncoal_l, f32uncoal_s, barrier_ct,
                             total_blocks, n/unroll,
                             np.dtype(dtype).itemsize, model)

    update_lstsq_mats(Atrain_all, Atest_all, ytrain_all, ytest_all,
                      actual_times_all, HK_predict_all,
                      A, actual, HK_predict, train_test_config)
def run_tp_trials(ctx, queue, nvals, configs_t,
                  Atrain_all, Atest_all, ytrain_all, ytest_all,
                  actual_times_all, HK_predict_all, train_test_config,
                  prefetch=True):
    """Time the "transpose" kernel for every (n, block shape) and record it.

    For each matrix size ``n`` in *nvals* and each ``(BSIZEx, BSIZEy)`` pair in
    *configs_t* the kernel is split (and optionally prefetched), analysed
    (barriers, operation counts, DRAM accesses), run
    ``averaging_trials + warmup_trials`` times (module-level settings) and the
    mean non-warmup time, the PerfModel prediction and one least-squares row
    are kept.

    :arg ctx: not used by the active code (only by the commented-out checks).
    :arg queue: command queue used for random input data and kernel launches.
    :arg nvals: iterable of matrix edge lengths.
    :arg configs_t: iterable of ``(BSIZEx, BSIZEy)`` block shapes.
    :arg prefetch: when true, ``a`` is prefetched over the inner loops; this
        also selects the hard-coded register and shared-memory figures.

    The remaining arguments are forwarded unchanged to ``update_lstsq_mats``.
    Returns ``None``.
    """
    A = []
    HK_predict = []
    actual = []
    dtype = np.float32
    order = "C"

    for n in nvals:
        a_mat_dev = cl.clrandom.rand(queue, (n, n), dtype=dtype)
        b_mat_dev = cl.clrandom.rand(queue, (n, n), dtype=dtype)

        ref_knl = lp.make_kernel(
            "{[i,j]: 0<=i,j<%d}" % n,
            [
                "b[i, j] = a[j, i]"
            ],
            [
                lp.GlobalArg("a", dtype, shape=(n, n), order=order),
                lp.GlobalArg("b", dtype, shape=(n, n), order=order),
            ],
            name="transpose")
        params = {'n': n}

        for BSIZEx, BSIZEy in configs_t:
            knl = lp.split_iname(ref_knl, "i", BSIZEy,
                                 outer_tag="g.0", inner_tag="l.1")
            knl = lp.split_iname(knl, "j", BSIZEx,
                                 outer_tag="g.1", inner_tag="l.0")
            if prefetch:
                knl = lp.add_prefetch(knl, 'a', ["i_inner", "j_inner"])

            # Static kernel statistics evaluated at the concrete size.
            barrier_ct = get_barrier_poly(knl).eval_with_dict(params)
            op_map = get_op_poly(knl)
            flops, iops = get_32b_ops(op_map, params)
            sub_map = get_DRAM_access_poly(knl)  # noqa
            f32coal_l, f32coal_s, f32uncoal_l, f32uncoal_s = \
                get_DRAM_f32_accesses(sub_map, params)
            f32coal = f32coal_l + f32coal_s
            f32uncoal = f32uncoal_l + f32uncoal_s

            # execute
            print("running kernel...")
            trial_times = []
            for _ in range(averaging_trials+warmup_trials):
                evt, (_,) = knl(queue, a=a_mat_dev, b=b_mat_dev)
                evt.wait()
                # Profiling timestamps scaled by 1e-9 (presumably ns -> s).
                trial_times.append((evt.profile.END - evt.profile.START)*1e-9)
            # Warmup runs are excluded from the average.
            avg_time = np.average(trial_times[warmup_trials:])

            gstats = GPUStats('TeslaK20')

            # Hard-coded resource figures; they depend on whether the blocks
            # tile the matrix exactly and on whether prefetching is enabled.
            fits_evenly = n % BSIZEx == 0 and n % BSIZEy == 0
            if prefetch:
                reg32_per_thread = 10 if fits_evenly else 8
                shared_mem_per_block = 4*BSIZEx*BSIZEy
            else:
                reg32_per_thread = 8 if fits_evenly else 9
                shared_mem_per_block = 0

            # TODO why is HK way off on the non-prefetch version?
            total_blocks = math.ceil(n/BSIZEx)*math.ceil(n/BSIZEy)

            kstats = KernelStats(flops/(n*n), f32uncoal/(n*n), f32coal/(n*n),
                                 barrier_ct, reg32_per_thread,
                                 shared_mem_per_block)
            tconfig = ThreadConfig(BSIZEx*BSIZEy, total_blocks)
            model = PerfModel(gstats, kstats, tconfig, np.dtype(dtype))
            cycles = model.compute_total_cycles()

            actual.append(avg_time)
            HK_predict.append(cycles/(gstats.sm_clock_freq*10**9))

            update_LS_matrix(A, flops, iops, f32coal_l, f32coal_s,
                             f32uncoal_l, f32uncoal_s, barrier_ct,
                             total_blocks, n*n, np.dtype(dtype).itemsize, model)

    update_lstsq_mats(Atrain_all, Atest_all, ytrain_all, ytest_all,
                      actual_times_all, HK_predict_all,
                      A, actual, HK_predict, train_test_config)
def run_fd_trials(ctx, queue, nvals, configs_t,
                  Atrain_all, Atest_all, ytrain_all, ytest_all,
                  actual_times_all, HK_predict_all, train_test_config):
    """Time the "finite_diff" kernel for every (n, block shape) and record it.

    For each grid size ``n`` in *nvals* and each ``(BSIZEx, BSIZEy)`` pair in
    *configs_t* the kernel is split, prefetched with a bounding-box fetch of
    ``u``, analysed (barriers, operation counts, DRAM accesses), run
    ``averaging_trials + warmup_trials`` times (module-level settings) and the
    mean non-warmup time, the PerfModel prediction and one least-squares row
    are kept.

    :arg ctx: not used by the active code (only by the commented-out checks).
    :arg queue: command queue used for random input data and kernel launches.
    :arg nvals: iterable of grid edge lengths; the input has shape (n+2, n+2).
    :arg configs_t: iterable of ``(BSIZEx, BSIZEy)`` block shapes.

    The remaining arguments are forwarded unchanged to ``update_lstsq_mats``.
    Returns ``None``.
    """
    A = []
    HK_predict = []
    actual = []
    dtype = np.float32

    # The kernel keeps n symbolic, so it is built once rather than per size.
    fd_knl = lp.make_kernel(
          "{[i,j]: 0<=i,j<n}",
          "result[i,j] = u[i, j]**2 + -1 + (-4)*u[i + 1, j + 1] \
                + u[i + 1 + 1, j + 1] + u[i + 1 + -1, j + 1] \
                + u[i + 1, j + 1 + 1] + u[i + 1, j + 1 + -1]",
          name="finite_diff")
    ref_knl = lp.add_and_infer_dtypes(fd_knl, {"u": dtype})

    for n in nvals:
        u_mat_dev = cl.clrandom.rand(queue, (n+2, n+2), dtype=dtype)
        params = {'n': n}

        for BSIZEx, BSIZEy in configs_t:
            knl = lp.split_iname(ref_knl, "i", BSIZEx,
                                 outer_tag="g.1", inner_tag="l.1")
            knl = lp.split_iname(knl, "j", BSIZEy,
                                 outer_tag="g.0", inner_tag="l.0")
            knl = lp.add_prefetch(knl, "u",
                                  ["i_inner", "j_inner"],
                                  fetch_bounding_box=True)

            # Static kernel statistics evaluated at the concrete size.
            barrier_ct = get_barrier_poly(knl).eval_with_dict(params)
            op_map = get_op_poly(knl)
            flops, iops = get_32b_ops(op_map, params)
            sub_map = get_DRAM_access_poly(knl)  # noqa
            f32coal_l, f32coal_s, f32uncoal_l, f32uncoal_s = \
                get_DRAM_f32_accesses(sub_map, params)
            f32coal = f32coal_l + f32coal_s
            f32uncoal = f32uncoal_l + f32uncoal_s

            # execute
            print("running kernel...")
            trial_times = []
            for _ in range(averaging_trials+warmup_trials):
                evt, (_,) = knl(queue, u=u_mat_dev)
                evt.wait()
                # Profiling timestamps scaled by 1e-9 (presumably ns -> s).
                trial_times.append((evt.profile.END - evt.profile.START)*1e-9)
            # Warmup runs are excluded from the average.
            avg_time = np.average(trial_times[warmup_trials:])

            gstats = GPUStats('TeslaK20')
            # Hard-coded register counts: 14 when the blocks tile the grid
            # exactly, 16 otherwise.
            if n % BSIZEx == 0 and n % BSIZEy == 0:
                reg32_per_thread = 14
            else:
                reg32_per_thread = 16
            shared_mem_per_block = 4*(BSIZEx+2)*(BSIZEy+2)
            total_blocks = math.ceil(n/BSIZEx)*math.ceil(n/BSIZEy)

            kstats = KernelStats(flops/(n*n), f32uncoal/(n*n), f32coal/(n*n),
                                 barrier_ct, reg32_per_thread,
                                 shared_mem_per_block)
            tconfig = ThreadConfig(BSIZEx*BSIZEy, total_blocks)
            model = PerfModel(gstats, kstats, tconfig, np.dtype(dtype))
            cycles = model.compute_total_cycles()

            actual.append(avg_time)
            HK_predict.append(cycles/(gstats.sm_clock_freq*10**9))

            update_LS_matrix(A, flops, iops, f32coal_l, f32coal_s,
                             f32uncoal_l, f32uncoal_s, barrier_ct,
                             total_blocks, n*n, np.dtype(dtype).itemsize, model)

    update_lstsq_mats(Atrain_all, Atest_all, ytrain_all, ytest_all,
                      actual_times_all, HK_predict_all,
                      A, actual, HK_predict, train_test_config)
# Map the i/j loops onto group and local axes, then prefetch "a" over the
# inner loops.
knl = lp.split_iname(knl, "i", BSIZEy, outer_tag="g.0", inner_tag="l.1")
knl = lp.split_iname(knl, "j", BSIZEx, outer_tag="g.1", inner_tag="l.0")
knl = lp.add_prefetch(knl, "a", ["i_inner", "j_inner"])

# check = lp.auto_test_vs_ref(ref_knl, ctx, knl, print_code=True)
# print "Correctness check: \n", check

# use ptx src to determine resource usage
cknl = lp.compiled.CompiledKernel(ctx, knl)
ptx_src = cknl.cl_kernel_info().cl_kernel.program.binaries[0]
# Write the first program binary next to the script; the context manager
# closes the file even if the write fails.
with open(knl.name + ".ptx", "w") as ptx_src_file:
    ptx_src_file.write(ptx_src)

barrier_poly = get_barrier_poly(knl)
barrier_count = barrier_poly.eval_with_dict({"n": n})

# Per-dtype operation totals; a missing dtype counts as the zero polynomial.
op_map = get_op_poly(knl)
flops = op_map.get(
        np.dtype(np.float32), isl.PwQPolynomial("{ 0 }")
        ).eval_with_dict({"n": n})
iops = op_map.get(
        np.dtype(np.int32), isl.PwQPolynomial("{ 0 }")
        ).eval_with_dict({"n": n})

# float32 DRAM accesses by (dtype, stride pattern, direction); missing keys
# again count as zero.
sub_map = get_DRAM_access_poly(knl)  # noqa
f32coal_l = sub_map.get(
        (np.dtype(np.float32), "consecutive", "load"),
        isl.PwQPolynomial("{ 0 }")
        ).eval_with_dict({"n": n})
f32coal_s = sub_map.get(
        (np.dtype(np.float32), "consecutive", "store"),
        isl.PwQPolynomial("{ 0 }")
        ).eval_with_dict({"n": n})
f32coal = f32coal_l + f32coal_s
# print "coalesced: %i, (stores: %i, loads: %i)" % (f32coal, f32coal_s, f32coal_l)
f32uncoal_l = sub_map.get(
        (np.dtype(np.float32), "nonconsecutive", "load"),
        isl.PwQPolynomial("{ 0 }")
        ).eval_with_dict({"n": n})
def run_empt_trials(ctx, queue, nvals, configs_t,
                    Atrain_all, Atest_all, ytrain_all, ytest_all,
                    actual_times_all, HK_predict_all, train_test_config):
    """Time a kernel with an empty instruction list and record the results.

    For each size ``n`` in *nvals* and each ``(BSIZEx, BSIZEy)`` pair in
    *configs_t* the untransformed "empty" kernel is analysed (barriers,
    operation counts, DRAM accesses), run ``averaging_trials + warmup_trials``
    times (module-level settings) and the mean non-warmup time, the PerfModel
    prediction (built from all-zero kernel statistics) and one least-squares
    row are kept.

    :arg ctx: not used by the active code (only by the commented-out checks).
    :arg queue: command queue used for kernel launches.
    :arg nvals: iterable of domain edge lengths.
    :arg configs_t: iterable of ``(BSIZEx, BSIZEy)`` block shapes; they only
        enter the thread/block figures given to the model, the kernel itself
        is launched unchanged.

    The remaining arguments are forwarded unchanged to ``update_lstsq_mats``.
    Returns ``None``.
    """
    A = []
    HK_predict = []
    actual = []
    dtype = np.float32

    for n in nvals:
        knl = lp.make_kernel(
            "{[i,j]: 0<=i,j<%d}" % n,
            [
                ""
            ],
            name="empty")

        for BSIZEx, BSIZEy in configs_t:
            # NOTE(review): n is baked into the domain string above, so this
            # dict presumably has no symbolic parameter to bind -- confirm.
            params = {'n': n}
            barrier_ct = get_barrier_poly(knl).eval_with_dict(params)
            op_map = get_op_poly(knl)
            flops, iops = get_32b_ops(op_map, params)
            sub_map = get_DRAM_access_poly(knl)  # noqa
            f32coal_l, f32coal_s, f32uncoal_l, f32uncoal_s = \
                get_DRAM_f32_accesses(sub_map, params)

            # execute
            print("running kernel...")
            trial_times = []
            for _ in range(averaging_trials+warmup_trials):
                evt, _ = knl(queue)
                evt.wait()
                # Profiling timestamps scaled by 1e-9 (presumably ns -> s).
                trial_times.append((evt.profile.END - evt.profile.START)*1e-9)
            # Warmup runs are excluded from the average.
            avg_time = np.average(trial_times[warmup_trials:])

            gstats = GPUStats('TeslaK20')
            reg32_per_thread = 2  # hard-coded value, not derived here
            shared_mem_per_block = 0
            total_blocks = math.ceil(n/BSIZEx)*math.ceil(n/BSIZEy)

            # TODO actually increase threads/blocks but expect 0 result
            kstats = KernelStats(0, 0, 0, barrier_ct, reg32_per_thread,
                                 shared_mem_per_block)
            tconfig = ThreadConfig(BSIZEx*BSIZEy, total_blocks)
            model = PerfModel(gstats, kstats, tconfig, np.dtype(dtype))
            cycles = model.compute_total_cycles()

            actual.append(avg_time)
            HK_predict.append(cycles/(gstats.sm_clock_freq*10**9))

            update_LS_matrix(A, flops, iops, f32coal_l, f32coal_s,
                             f32uncoal_l, f32uncoal_s, barrier_ct,
                             total_blocks, n*n, np.dtype(dtype).itemsize, model)

    update_lstsq_mats(Atrain_all, Atest_all, ytrain_all, ytest_all,
                      actual_times_all, HK_predict_all,
                      A, actual, HK_predict, train_test_config)