#=================================================================#
# Stage the host tensor `a` into the scratchpad.
a_buf, a_dram = nnpu.utils.CopyHtoBuf(a, 'a')

# Tile the matrix using the nnpu helper functions only:
# transpose -> reshape to 4-D -> swap the two middle axes.
trans_buf = nnpu.utils.transpose(a_buf, (1, 0))
re_buf = nnpu.utils.reshape(trans_buf, (2, 4, 2, 4), dst_scope='buffer1')
tile_buf = nnpu.utils.transpose(re_buf, (0, 2, 1, 3), dst_scope='buffer2')

# Copy the tiled result back to host.
tile_host, _ = nnpu.utils.CopyBufToH(tile_buf, 'tile')
# ------ this ends the computation description. ------

#==================================#
# ------ begin scheduling ------
#==================================#
s = nnpu.create_schedule([tile_host.op])
# Every stage is a scratchpad copy, so the only scheduling needed is the
# copy pragma; per the original note the helper functions take care of
# that, so there is nothing to do by hand here.
#==================================#
# ------ this ends the scheduling ------
#==================================#

# Dump both the plain TVM lowering and the NNPU lowering for inspection.
print(tvm.lower(s, [a, tile_host], simple_mode=True))
print(nnpu.lower(s, [a, tile_host], simple_mode=True))

func = nnpu.build(s, [a, tile_host], 'nnpu', 'llvm', name='nnpu_func')

# The first imported module is the device module; print its IR and uops.
device_module = func.imported_modules[0]
print('------------------- device module 1 TVM IR: ')
print(device_module.get_source('ir'))
print('------------------- device module 1 uop: ')
print(device_module.get_source('uop'))
# Reduction axes of the tiled GEMM.  Each axis gets its own debug name
# ('ko' / 'ki') so the two reduction loops can be told apart in the
# lowered IR.
ko = tvm.reduce_axis((0, shape1_tiled[0]), 'ko')
ki = tvm.reduce_axis((0, factor), 'ki')

res_shape = (shape1[0], shape2[0])  # (8, 64)
# res_acc[i, j] = sum over (ko, ki) of a_buf[ko, i, ki] * b_buf[ko, j, ki],
# with both operands widened to dtype_w before the multiply.
res_acc = tvm.compute(
    res_shape,
    lambda i, j: tvm.sum(
        a_buf[ko, i, ki].astype(dtype_w) * b_buf[ko, j, ki].astype(dtype_w),
        axis=[ko, ki]))
nnpu.utils.MarkScope(res_acc, 'acc')
res_buf = nnpu.utils.CopyAccToBuf(res_acc, 'res')
res_host = tvm.compute(res_shape, lambda *i: res_buf(*i), 'res_host')

s = nnpu.create_schedule(res_host.op)

# tensorize
xi, xj = s[res_acc].op.axis
xjo, xji = s[res_acc].split(xj, factor=gemm_shape[2])
# Re-read the reduce axes from the stage; from here on ko/ki refer to the
# stage's iteration variables.
ko, ki = s[res_acc].op.reduce_axis
# Keep (xi, xji, ki) innermost: that is the region handed to the GEMM
# intrinsic when tensorizing at xi.
s[res_acc].reorder(xjo, ko, xi, xji, ki)
s[res_acc].tensorize(
    xi,
    env.intrins.get('GEMM', shape=gemm_shape, mode='inc', scope_out='acc'))

# split output
core_extent = 4
xh, xw = s[res_host].op.axis
xwo, xwi = s[res_host].split(xw, nparts=core_extent)
s[res_host].reorder(xwo, xh, xwi)
# Reduction axes: ko has extent shape1[1] // factor, ki has extent factor.
ko = tvm.reduce_axis((0, shape1[1] // factor), 'ko')
ki = tvm.reduce_axis((0, factor), 'ki')

# out_buf[xo, yo, xi, yi] = sum over (ko, ki) of
#     a_buf[xo, ko, xi, ki] * b_buf[yo, ko, yi, ki]
# with both operands cast to dtype_w before multiplying.
out_buf = tvm.compute(
    out_shape_tiled,
    lambda xo, yo, xi, yi: tvm.sum(
        a_buf[xo, ko, xi, ki].astype(dtype_w)
        * b_buf[yo, ko, yi, ki].astype(dtype_w),
        axis=[ko, ki]),
    'out_buf')
# The separate accumulator stage is currently disabled, so out_acc is just
# an alias of out_buf.
out_acc = out_buf
# nnpu.utils.MarkScope(out_acc, 'acc')
# out_buf = tvm.compute(out_shape_tiled, lambda *i: out_acc(*i), 'out_host')
# nnpu.utils.MarkScope(out_buf)
out_host = tvm.compute(out_shape_tiled, lambda *i: out_buf(*i), 'out_host')

# schedule
s = nnpu.create_schedule(out_host.op)
# Explicit cache_read stages are disabled; al / bl alias the input buffers.
# al = s.cache_read(a_buf, env.get_scope('buffer1'), out_acc)
# bl = s.cache_read(b_buf, env.get_scope('buffer2'), out_acc)
al = a_buf
bl = b_buf

a_buffer_scope = 'buffer1'
b_buffer_scope = 'buffer2'

# set scope
s[a_buf].set_scope(env.get_scope(a_buffer_scope))
s[b_buf].set_scope(env.get_scope(b_buffer_scope))
s[out_buf].set_scope(env.get_scope('buffer3'))

# pragma read
s[a_buf].pragma(a_buf.op.axis[0], env.dma_copy_to_buf)
# Copy of the input; it is placed in 'buffer0' by the schedule below.
a_buf = tvm.compute(shape, lambda *i: a(*i), 'a_buf')

# Four elementwise activations of a_buf.  Every stage carries its own name
# so that the stages (and the cache_write copies created from them) are
# distinguishable in the lowered IR.
exp = tvm.compute(shape, lambda i: tvm.exp(a_buf[i]), 'exp')
log = tvm.compute(shape, lambda i: tvm.log(a_buf[i]), 'log')
tanh = tvm.compute(shape, lambda i: tvm.tanh(a_buf[i]), 'tanh')
sigmoid = tvm.compute(shape, lambda i: tvm.sigmoid(a_buf[i]), 'sigmoid')

# k = tvm.reduce_axis((0, 16), 'k0')
# sum = tvm.compute((1, ), lambda i: tvm.sum(sigmoid[k], axis=k), 'sum')
# nnpu.utils.MarkScope(sum)
# softmax = tvm.compute(shape, lambda i: sigmoid[i] / sum[0], 'softmax')
# nnpu.utils.MarkScope(softmax)
# softmax_host, _ = nnpu.utils.CopyBufToH(softmax, 'softmax')

s = nnpu.create_schedule([exp.op, log.op, tanh.op, sigmoid.op])

# cache write: compute each result in 'buffer0'; the original tensors then
# become the copy-out stages.
exp_buf = s.cache_write(exp, env.get_scope('buffer0'))
log_buf = s.cache_write(log, env.get_scope('buffer0'))
tanh_buf = s.cache_write(tanh, env.get_scope('buffer0'))
sigmoid_buf = s.cache_write(sigmoid, env.get_scope('buffer0'))

# set scope
s[a_buf].set_scope(env.get_scope('buffer0'))

# pragma: DMA the input in, DMA each result out.
s[a_buf].pragma(a_buf.op.axis[0], env.dma_copy_to_buf)
s[exp].pragma(exp.op.axis[0], env.dma_copy_from_buf)
s[log].pragma(log.op.axis[0], env.dma_copy_from_buf)
s[tanh].pragma(tanh.op.axis[0], env.dma_copy_from_buf)
s[sigmoid].pragma(sigmoid.op.axis[0], env.dma_copy_from_buf)

# tensorize
vector_unit_size = 32
conv_host = tvm.compute(conv_shape, lambda *i: conv(*i), 'conv_host') # ------ this ends the computation description. ------ #==================================# # ------ begin scheduling ------ #==================================# # set the memory scopes of tensors that should be on accelerator. # here we put feature and kernel on buffer1 and buffer2 respectively. nnpu.utils.MarkScope(feature_buf, 'buffer1') nnpu.utils.MarkScope(kernel_buf, 'buffer2') # the GEMM output is on accumulation buffer. nnpu.utils.MarkScope(conv_acc, 'acc') nnpu.utils.MarkScope(conv, 'buffer0') s = nnpu.create_schedule(conv_host.op) # reorder the GEMM compute stage. # the rule is, first make sure the axes of one GEMM instruction are the last 3 iterations. # then other reduction axes. h, wo, oco, wi, oci = s[conv_acc].op.axis s[conv_acc].reorder(h, wo, oco, kh_reduce, kw_reduce, co_reduce, wi, oci, ci_reduce) # tensorize s[conv_acc].tensorize( wi, env.intrins.get('GEMM', shape=gemm_shape, mode='inc', scope_in1='buffer1', scope_in2='buffer2',
# NOTE(review): the body of test() has been flattened onto a single line, so the nesting of the `if (False):` and `with ScheduleProcHelper():` blocks cannot be recovered from this text; the code is left untouched. As written, the flattened statements build c_buf[j, i] = sum_k a_buf[i, j*16+k] * b_buf[j*16+k] over a (16, 64) matrix and a (64,) vector, tensorize it with the 'GEMM' intrinsic (shape=(16, 16, 1), mode='inc', reduce=True), run the built function, sum the (4, 16) output over axis 0 on the host and compare it with np.dot(a_np, b_np).
def test(): pass if (False): print('-----') with ScheduleProcHelper(): env = nnpu.get_env() shape = (16, 64) a_host = tvm.placeholder(shape, env.cfg['dtype_n'], 'a_host') a_buf, _ = nnpu.utils.CopyHtoBuf(a_host, 'a') vctr_shape = (64, ) b_host = tvm.placeholder(vctr_shape, env.cfg['dtype_n'], 'b_host') b_buf, _ = nnpu.utils.CopyHtoBuf(b_host, 'b') dtype_w = env.cfg['dtype_w'] out_shape = (4, 16) k = tvm.reduce_axis((0, 16), 'k') c_buf = tvm.compute( out_shape, lambda j, i: tvm.sum(a_buf[i, j * 16 + k].astype( dtype_w) * b_buf[j * 16 + k].astype(dtype_w), axis=k)) utils.MarkScope(c_buf) c_host, _ = utils.CopyBufToH(c_buf, 'c') s = nnpu.create_schedule(c_host.op) # mark variable scopes # tensorize s[c_buf].tensorize( s[c_buf].op.axis[1], env.intrins.get('GEMM', shape=(16, 16, 1), mode='inc', reduce=True)) # build print(tvm.lower(s, [a_host, b_host, c_host], simple_mode=True)) print(nnpu.lower(s, [a_host, b_host, c_host], simple_mode=True)) #exit() func = nnpu.build(s, [a_host, b_host, c_host], 'nnpu', 'llvm', name='nnpu_exp') print('function built: ') print('------------------- device module 1 asm code: ') print(func.imported_modules[0].get_source('asm')) #print(func.get_source()) # prepare data ctx = tvm.nd.TVMContext(13, 0) a_np = np.random.randint(size=shape, dtype=a_host.dtype, low=-32, high=32) # a_np = np.ones(shape).astype(a_host.dtype) a_nd = tvm.nd.array(a_np, ctx) b_np = np.random.randint(size=vctr_shape, dtype=b_host.dtype, low=-16, high=16) # b_np = np.ones(vctr_shape).astype(b_host.dtype) b_nd = tvm.nd.array(b_np, ctx) out_nd = tvm.nd.array(np.zeros(out_shape).astype(c_host.dtype), ctx) # run func(a_nd, b_nd, out_nd) print('run finished') print('a=') print(a_np) print('b=') print(b_np) print('out=') out_np = out_nd.asnumpy() out_np = np.sum(out_np, axis=0) print(out_np) print('numpy ground truth is: ') gt = np.dot(a_np.astype(dtype_w), b_np.astype(dtype_w)) #gt = np.greater(np.dot(a_np.astype(dtype_w), b_np.astype(dtype_w)), bias_np) print(gt) 
np.testing.assert_allclose(out_np, gt)
# First reduction: one partial sum per row of the re-laid-out sigmoid
# tensor, produced in the accumulator and then copied to a buffer.
sum1 = tvm.compute(
    (nRow, ),
    lambda i: tvm.sum(sigmoid_re[ko, i, ki], axis=[ko, ki]),
    'sum1')
nnpu.utils.MarkScope(sum1, 'acc')
sum1_buf = nnpu.utils.CopyAccToBuf(sum1, 'sum1')
# sum1_buf = tvm.compute((nRow, ), lambda *i: sum1(*i))

# Second reduction: fold the nRow partial sums into a single scalar.
k = tvm.reduce_axis((0, nRow), 'k')
sum2 = tvm.compute((1, ), lambda i: tvm.sum(sum1_buf[k], axis=k), 'sum2')
nnpu.utils.MarkScope(sum2, 'buffer0')

# Normalise every sigmoid value by the total and copy the result to host.
softmax = tvm.compute(shape, lambda i: sigmoid[i] / sum2[0], 'softmax')
nnpu.utils.MarkScope(softmax)
softmax_host, _ = nnpu.utils.CopyBufToH(softmax, 'softmax')

s = nnpu.create_schedule([softmax_host.op])
s[sigmoid_re].set_scope(env.scratchpad_scope(0))
s[sigmoid_re].pragma(sigmoid_re.op.axis[0], env.scratchpad_copy)

# tensorize: each elementwise stage is split into chunks of 16 and the
# inner chunk is mapped onto the matching vector intrinsic.
for _stage_tensor, _intrin_name in ((exp, 'VExp'),
                                    (exp_p1, 'VAddI'),
                                    (sigmoid, 'VDivV')):
    xo, xi = s[_stage_tensor].split(_stage_tensor.op.axis[0], 16)
    s[_stage_tensor].tensorize(xi, env.intrins.get(_intrin_name, mode='w'))

# Move the block-level reduce axis outermost for sum1.
xblock, xcol = sum1.op.reduce_axis
xrow = sum1.op.axis[0]
s[sum1].reorder(xblock, xrow, xcol)
# Bring both operands into the scratchpad.
a_buf, a_dram = nnpu.utils.CopyHtoBuf(a, 'a')
b_buf, b_dram = nnpu.utils.CopyHtoBuf(b, 'b')

# Elementwise sum, computed at the operands' own dtype.
sum_buf = tvm.compute(shape, lambda *i: a_buf(*i) + b_buf(*i), 'sum_buf')
nnpu.utils.MarkScope(sum_buf)
sum_host, sum_dram = nnpu.utils.CopyBufToH(sum_buf, 'sum')

# Elementwise product, with both operands cast to dtype_w first.
mul_buf = tvm.compute(
    shape,
    lambda *i: a_buf(*i).astype(dtype_w) * b_buf(*i).astype(dtype_w),
    'mul_buf')
nnpu.utils.MarkScope(mul_buf)
mul_host, _ = nnpu.utils.CopyBufToH(mul_buf, 'mul')

s = nnpu.create_schedule([sum_host.op, mul_host.op])

# tensorize the sum: split both axes by the instruction shape, group the
# outer axes first, then map the inner tile onto 'MAddM'.
xo, xi = s[sum_buf].split(sum_buf.op.axis[0], factor=insn_shape[0])
yo, yi = s[sum_buf].split(sum_buf.op.axis[1], factor=insn_shape[1])
s[sum_buf].reorder(xo, yo, xi, yi)
s[sum_buf].tensorize(
    xi, env.intrins.get('MAddM', shape=insn_shape, mode='n'))

# tensorize the product: here tile() is used in place of the explicit
# split/split/reorder sequence, and the third leaf axis of the stage is
# the tensorize point for 'MMulM'.
s[mul_buf].tile(mul_buf.op.axis[0], mul_buf.op.axis[1],
                insn_shape[0], insn_shape[1])
s[mul_buf].tensorize(
    s[mul_buf].leaf_iter_vars[2],
    env.intrins.get('MMulM', shape=insn_shape, mode='inc'))
# NOTE(review): this text begins inside a call whose start is not visible (the leading `'s1')` closes it), so the code is left untouched. Visible statements: s2[t, i] = sum_k u_buf[i, k] * h_state[t - 1, k]; s3 = s1 + s2; s4 = s3 + b_buf[i]; s_scan = tvm.scan(h_init_buf, s_update_4, h_state, inputs=[x_buf]). Only s_update_1 ('GEMM', mode='inc', reduce=True) and s_update_3 ('VAddV', mode='w') are tensorized; the tensorize calls for s_update_2 and s_update_4 are commented out.
's1') nnpu.utils.MarkScope(s_update_1) k = tvm.reduce_axis((0, m), 'k1') s_update_2 = tvm.compute(h_shape, lambda t, i: tvm.sum(u_buf[i, k] * h_state[t - 1, k], axis=k), 's2') nnpu.utils.MarkScope(s_update_2) s_update_3 = tvm.compute(h_shape, lambda t, i: s_update_1[t, i] + s_update_2[t, i], 's3') nnpu.utils.MarkScope(s_update_3) s_update_4 = tvm.compute(h_shape, lambda t, i: s_update_3[t, i] + b_buf[i], 's4') nnpu.utils.MarkScope(s_update_4) s_scan = tvm.scan(h_init_buf, s_update_4, h_state, inputs=[x_buf]) nnpu.utils.MarkScope(s_scan) #res = nnpu.utils.reshape(s_scan, h_shape) #res_host, _ = nnpu.utils.CopyBufToH(res, 'sc') s = nnpu.create_schedule(s_scan.op) # tensorize s[s_update_1].tensorize(s_update_1.op.axis[1], env.intrins.get('GEMM', shape=gemm_shape, mode='inc', reduce=True)) #s[s_update_2].tensorize(s_update_2.op.axis[1], # env.intrins.get('GEMM', shape=gemm_shape, mode='w', reduce=True)) s[s_update_3].tensorize(s_update_3.op.axis[1], env.intrins.get('VAddV', mode='w')) #s[s_update_4].tensorize(s_update_4.op.axis[1], # env.intrins.get('VAddV', mode='w')) print(tvm.lower(s, [x, w, u, b, h_init, s_scan], simple_mode=True))