def conv2d_direct_simd_compute(cfg, data, kernel, strides, padding, dilation, out_dtype): """Compute function for Cortex-M7 SIMD implementation of conv2d.""" assert isinstance(strides, int) or len(strides) == 2 assert isinstance(dilation, int) or len(dilation) == 2 if isinstance(strides, int): stride_h = stride_w = strides else: stride_h, stride_w = strides if isinstance(dilation, int): dilation_h = dilation_w = dilation else: dilation_h, dilation_w = dilation batch_size, in_height, in_width, in_channels = data.shape kernel_h, kernel_w, out_channels, _ = kernel.shape # compute the output shape dilated_kernel_h = (kernel_h - 1) * dilation_h + 1 dilated_kernel_w = (kernel_w - 1) * dilation_w + 1 pad_top, pad_left, pad_down, pad_right = get_pad_tuple( padding, (dilated_kernel_h, dilated_kernel_w)) out_height = simplify( (in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1) out_width = simplify( (in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1) pad_before = [0, pad_top, pad_left, 0] pad_after = [0, pad_down, pad_right, 0] padded_data = pad(data, pad_before, pad_after, name="padded_data") rc = te.reduce_axis((0, in_channels), name="rc") ry = te.reduce_axis((0, kernel_h), name="ry") rx = te.reduce_axis((0, kernel_w), name="rx") conv = te.compute( (batch_size, out_height, out_width, out_channels), lambda nn, yy, xx, ff: te.sum( padded_data[nn, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w, rc].astype(out_dtype) * kernel[ ry, rx, ff, rc].astype(out_dtype), axis=[ry, rx, rc], ), name="conv2d", tag="conv2d_nhwc", ) ########################### # Config Space Definition # ########################### n, oh, ow, co = ( cfg.axis(batch_size.value), cfg.axis(out_height.value), cfg.axis(out_width.value), cfg.axis(out_channels.value), ) kh, kw, ci = ( cfg.reduce_axis(kernel_h.value), cfg.reduce_axis(kernel_w.value), cfg.reduce_axis(in_channels.value), ) assert in_channels.value % 4 == 0 owo, owi = cfg.define_split("tile_ow", ow, policy="factors", num_outputs=2) cio, cii = cfg.define_split("tile_ci", ci, policy="factors", num_outputs=2, filter=lambda x: x.size[-1] % 4 == 0) coo, coi = cfg.define_split("tile_co", co, policy="factors", num_outputs=2) cfg.define_reorder( "reorder_0_simd", [n, oh, owo, owi, coo, coi, kh, kw, cio, cii], policy="candidate", candidate=[ [n, oh, kh, kw, owo, coo, cio, owi, coi, cii], [n, oh, kh, kw, coo, owo, cio, owi, coi, cii], [n, kh, kw, oh, owo, coo, cio, owi, coi, cii], [n, kh, kw, oh, coo, owo, cio, owi, coi, cii], ], ) cfg.define_knob("auto_unroll_max_step", [0, 2, 4, 8, 16, 32]) cfg.define_knob("unroll_explicit", [0, 1]) return conv
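# Sketch (not part of the original function): the output-shape arithmetic used above,
# written out in plain Python with hypothetical example values.
def _conv_out_dim(in_dim, kernel, stride, pad_lo, pad_hi, dilation):
    dilated_kernel = (kernel - 1) * dilation + 1
    return (in_dim - dilated_kernel + pad_lo + pad_hi) // stride + 1

# e.g. a 32x32 input with a 3x3 kernel, stride 1, padding 1, dilation 1 keeps its spatial size
assert _conv_out_dim(32, 3, 1, 1, 1, 1) == 32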
def test_tensor_core_batch_matmal(): batch_size = 4 n = 512 m, l = n, n assert n % 32 == 0 assert m % 8 == 0 assert l % 16 == 0 nn, mm, ll = n // 32, m // 8, l // 16 A = te.placeholder((batch_size, nn, ll, 32, 16), name="A", dtype="float16") B = te.placeholder((batch_size, ll, mm, 16, 8), name="B", dtype="float16") k1 = te.reduce_axis((0, ll), name="k1") k2 = te.reduce_axis((0, 16), name="k2") C = te.compute( (batch_size, nn, mm, 32, 8), lambda b, i, j, ii, jj: te.sum( A[b, i, k1, ii, k2].astype("float") * B[b, k1, j, k2, jj].astype("float"), axis=[k1, k2] ), name="Fragment_C", ) s = te.create_schedule(C.op) warp_size = 32 kernel_size = 16 block_row_warps = 2 block_col_warps = 4 warp_row_tiles = 4 warp_col_tiles = 2 chunk = 4 block_x = te.thread_axis("blockIdx.x") block_y = te.thread_axis("blockIdx.y") block_z = te.thread_axis("blockIdx.z") thread_x = te.thread_axis("threadIdx.x") thread_y = te.thread_axis("threadIdx.y") thread_z = te.thread_axis("threadIdx.z") AS = s.cache_read(A, "shared", [C]) BS = s.cache_read(B, "shared", [C]) AF = s.cache_read(AS, "wmma.matrix_a", [C]) BF = s.cache_read(BS, "wmma.matrix_b", [C]) CF = s.cache_write(C, "wmma.accumulator") b, i, j, kernel_i, kernel_j = s[C].op.axis i, ii = s[C].split(i, factor=warp_row_tiles) block_i, i = s[C].split(i, factor=block_row_warps) j, jj = s[C].split(j, factor=warp_col_tiles) block_j, j = s[C].split(j, factor=block_col_warps) s[C].reorder(block_i, block_j, i, j, ii, jj, kernel_i, kernel_j) s[C].bind(b, block_z) s[C].bind(block_i, block_x) s[C].bind(block_j, block_y) s[C].bind(i, thread_y) s[C].bind(j, thread_z) s[CF].compute_at(s[C], j) b, warp_i, warp_j, _i, _j = s[CF].op.axis k, _k = CF.op.reduce_axis ko, ki = s[CF].split(k, factor=chunk) s[CF].reorder(ko, ki, warp_i, warp_j, _i, _j, _k) s[AF].compute_at(s[CF], ki) s[BF].compute_at(s[CF], ki) s[AS].compute_at(s[CF], ko) b, xo, yo, xi, yi = AS.op.axis tx, xo = s[AS].split(xo, nparts=block_row_warps) ty, yo = s[AS].split(yo, nparts=block_col_warps) t = s[AS].fuse(xi, yi) to, ti = s[AS].split(t, nparts=warp_size) s[AS].bind(tx, thread_y) s[AS].bind(ty, thread_z) s[AS].bind(to, thread_x) s[BS].compute_at(s[CF], ko) b, xo, yo, xi, yi = BS.op.axis tx, xo = s[BS].split(xo, nparts=block_row_warps) ty, yo = s[BS].split(yo, nparts=block_col_warps) t = s[BS].fuse(xi, yi) to, ti = s[BS].split(t, nparts=warp_size) s[BS].bind(tx, thread_y) s[BS].bind(ty, thread_z) s[BS].bind(to, thread_x) s[AF].tensorize(AF.op.axis[-2], intrin_wmma_load_matrix((32, 8, 16), "wmma.matrix_a")) s[BF].tensorize(BF.op.axis[-2], intrin_wmma_load_matrix((32, 8, 16), "wmma.matrix_b")) s[C].tensorize(kernel_i, intrin_wmma_store_matrix((32, 8, 16))) s[CF].tensorize(_i, intrin_wmma_gemm((32, 8, 16))) func = tvm.build(s, [A, B, C], "cuda") dev = tvm.gpu(0) a_np = np.random.uniform(size=(batch_size, nn, ll, 32, 16)).astype(A.dtype) b_np = np.random.uniform(size=(batch_size, ll, mm, 16, 8)).astype(B.dtype) a = tvm.nd.array(a_np, dev) b = tvm.nd.array(b_np, dev) c = tvm.nd.array(np.zeros((batch_size, nn, mm, 32, 8), dtype=C.dtype), dev) func(a, b, c) evaluator = func.time_evaluator(func.entry_name, dev, number=3) print("gemm with tensor core: %f ms" % (evaluator(a, b, c).mean * 1e3)) if VERIFY: func(a, b, c) a_np = a_np.transpose((0, 1, 3, 2, 4)).reshape(batch_size, n, n) b_np = b_np.transpose((0, 1, 3, 2, 4)).reshape(batch_size, n, n) c_np = c.asnumpy().transpose((0, 1, 3, 2, 4)).reshape(batch_size, n, n) np.testing.assert_allclose( c_np, np.matmul(a_np.astype(C.dtype), b_np.astype(C.dtype)), rtol=1e-4, atol=1e-4 )
def intrin_wmma_gemm(shape): n, m, l = shape A = te.placeholder((n, l), name="A", dtype="float16") B = te.placeholder((l, m), name="B", dtype="float16") k = te.reduce_axis((0, l), name="k") C = te.compute( (n, m), lambda ii, jj: te.sum(A[ii, k].astype("float") * B[k, jj].astype("float"), axis=k), name="C", ) BA = tvm.tir.decl_buffer( A.shape, A.dtype, name="BA", scope="wmma.matrix_a", data_alignment=32, offset_factor=n * l ) BB = tvm.tir.decl_buffer( B.shape, B.dtype, name="BB", scope="wmma.matrix_b", data_alignment=32, offset_factor=l * m ) BC = tvm.tir.decl_buffer( C.shape, C.dtype, name="BC", scope="wmma.accumulator", data_alignment=32, offset_factor=n * m, ) def intrin_func(ins, outs): BA, BB = ins (BC,) = outs def init(): ib = tvm.tir.ir_builder.create() ib.emit( tvm.tir.call_intrin( "handle", "tir.tvm_fill_fragment", BC.data, n, m, l, BC.elem_offset // (n * m), 0.0, ) ) return ib.get() def update(): ib = tvm.tir.ir_builder.create() ib.emit( tvm.tir.call_intrin( "handle", "tir.tvm_mma_sync", BC.data, BC.elem_offset // (n * m), BA.data, BA.elem_offset // (n * l), BB.data, BB.elem_offset // (l * m), BC.data, BC.elem_offset // (n * m), ) ) return ib.get() return update(), init(), update() return te.decl_tensor_intrin(C.op, intrin_func, binds={A: BA, B: BB, C: BC})
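# Sketch (hypothetical usage, mirroring the test above): the declared intrinsic is consumed by
# tensorize on the fragment axes, e.g. s[CF].tensorize(_i, intrin_wmma_gemm((32, 8, 16))).
# The (update(), init(), update()) triple returned by intrin_func supplies the body, the
# reduction-init, and the reduction-update forms of the WMMA computation, respectively.
wmma_gemm_intrin = intrin_wmma_gemm((32, 8, 16))  # n=32, m=8, l=16, as used in the test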
def _schedule(cfg, s, C): A, B = s[C].op.input_tensors batch, m_dim, k_dim = get_const_tuple(A.shape) batch, n_dim, k_dim = get_const_tuple(B.shape) out_dtype = C.dtype # inline astype fp16 s[A].compute_inline() s[B].compute_inline() # Explicit memory access AS = s.cache_read(A, "shared", [C]) BS = s.cache_read(B, "shared", [C]) AF = s.cache_read(AS, "wmma.matrix_a", [C]) BF = s.cache_read(BS, "wmma.matrix_b", [C]) CF = s.cache_write(C, "wmma.accumulator") CS = s.cache_read(CF, "shared", [C]) # fallback support target = tvm.target.Target.current() if cfg.is_fallback: ref_log = autotvm.tophub.load_reference_log( target.kind.name, target.model, "batch_matmul_tensorcore.cuda") cfg.fallback_with_reference_log(ref_log) # Deal with op fusion, such as bias/relu and slice after padding if C.op not in s.outputs and "injective" in s.outputs[0].tag: s[C].compute_inline() C = s.outputs[0].output(0) # create tuning space cfg.define_knob("block_row_warps", [1, 2, 4]) cfg.define_knob("block_col_warps", [1, 2, 4]) cfg.define_knob("warp_row_tiles", [1, 2, 4]) cfg.define_knob("warp_col_tiles", [1, 2, 4]) cfg.define_knob("chunk", [1, 2, 4, 8]) cfg.define_knob("offset", [0, 8]) cfg.define_knob("offsetCS", [0, 8]) cfg.define_knob("vec", [1, 2, 4, 8]) # Ensure that the default parameters are applicable when autotvm is not in use if m_dim % 32 == 0 and n_dim % 8 == 0: cfg.define_knob("wmma_m", [32, 16, 8]) elif m_dim % 16 == 0 and n_dim % 16 == 0: cfg.define_knob("wmma_m", [16, 8, 32]) elif m_dim % 8 == 0 and n_dim % 32 == 0: cfg.define_knob("wmma_m", [8, 16, 32]) warp_size = 32 wmma_k = 16 block_row_warps = cfg["block_row_warps"].val block_col_warps = cfg["block_col_warps"].val warp_row_tiles = cfg["warp_row_tiles"].val warp_col_tiles = cfg["warp_col_tiles"].val chunk = cfg["chunk"].val offset = cfg["offset"].val offsetCS = cfg["offsetCS"].val wmma_m = cfg["wmma_m"].val vec = cfg["vec"].val if wmma_m == 16: wmma_n = 16 elif wmma_m == 8: wmma_n = 32 elif wmma_m == 32: wmma_n = 8 # Define the stride of intrin functions AS_align = chunk * wmma_k + offset BS_align = chunk * wmma_k + offset CS_align = warp_col_tiles * block_col_warps * wmma_n + offsetCS AS_stride = [AS_align, 1] BS_stride = [BS_align, 1] AF_stride = [wmma_k, 1] BF_stride = [wmma_k, 1] CF_stride = [warp_col_tiles * wmma_n, 1] CS_stride = [CS_align, 1] block_x = te.thread_axis("blockIdx.x") block_y = te.thread_axis("blockIdx.y") block_z = te.thread_axis("blockIdx.z") thread_x = te.thread_axis("threadIdx.x") thread_y = te.thread_axis("threadIdx.y") thread_z = te.thread_axis("threadIdx.z") # Schedule for dense computation block_factor_m = wmma_m * warp_row_tiles * block_row_warps block_factor_n = wmma_n * warp_col_tiles * block_col_warps b, m, n = C.op.axis block_i, bc = s[C].split(m, factor=block_factor_m) block_j, oc = s[C].split(n, factor=block_factor_n) s[C].reorder(b, block_i, block_j, bc, oc) t = s[C].fuse(bc, oc) t, vi = s[C].split(t, factor=vec) t, tx = s[C].split(t, factor=warp_size) t, ty = s[C].split(t, factor=block_row_warps) t, tz = s[C].split(t, factor=block_col_warps) s[C].bind(block_i, block_x) s[C].bind(block_j, block_y) s[C].bind(b, block_z) s[C].bind(tz, thread_z) s[C].bind(ty, thread_y) s[C].bind(tx, thread_x) s[C].vectorize(vi) # Schedule for wmma store s[CS].compute_at(s[C], block_j) bs, bb, oo = CS.op.axis s[CS].storage_align(bb, CS_align - 1, CS_align) bb, bbi = s[CS].split(bb, factor=wmma_m) oo, ooi = s[CS].split(oo, factor=wmma_n) bb, bbii = s[CS].split(bb, factor=warp_row_tiles) oo, ooii = s[CS].split(oo, 
factor=warp_col_tiles) s[CS].reorder(bs, bb, oo, bbii, ooii, bbi, ooi) # Schedule for wmma computation s[CF].compute_at(s[CS], oo) bs, warp_i, warp_j = CF.op.axis warp_i, _ii = s[CF].split(warp_i, factor=wmma_m) warp_j, _jj = s[CF].split(warp_j, factor=wmma_n) (k, ) = CF.op.reduce_axis k, _k = s[CF].split(k, factor=wmma_k) ko, ki = s[CF].split(k, factor=chunk) s[CF].reorder(bs, ko, ki, warp_i, warp_j, _ii, _jj, _k) # Schedule for wmma_matrix_a load s[AF].compute_at(s[CF], ki) bs, b, i = AF.op.axis b, b_ii = s[AF].split(b, factor=wmma_m) i, i_jj = s[AF].split(i, factor=wmma_k) s[AF].reorder(bs, b, i, b_ii, i_jj) # Schedule for wmma_matrix_b load s[BF].compute_at(s[CF], ki) bs, o, i = BF.op.axis o, o_ii = s[BF].split(o, factor=wmma_n) i, i_ii = s[BF].split(i, factor=wmma_k) s[BF].reorder(bs, o, i, o_ii, i_ii) # Schedule for A's(B's) shared memory load def shared_shedule(stage, strides): s[stage].compute_at(s[CF], ko) bs, xo, yo = stage.op.axis s[stage].storage_align(xo, strides - 1, strides) t = s[stage].fuse(xo, yo) t, vi = s[stage].split(t, factor=vec) t, tx = s[stage].split(t, factor=warp_size) t, ty = s[stage].split(t, factor=block_row_warps) _, tz = s[stage].split(t, factor=block_col_warps) s[stage].bind(ty, thread_y) s[stage].bind(tz, thread_z) s[stage].bind(tx, thread_x) s[stage].vectorize(vi) shared_shedule(AS, AS_align) shared_shedule(BS, BS_align) shape = (wmma_m, wmma_n, wmma_k) # TODO: add checking here, datatype casting may cause precision loss in_dtype = "float16" AL_gemm = te.placeholder((wmma_m, wmma_k), name="AL_gemm", dtype=in_dtype) BL_gemm = te.placeholder((wmma_n, wmma_k), name="BL_gemm", dtype=in_dtype) k_gemm = te.reduce_axis((0, wmma_k), name="k_gemm") CL_compute = te.compute( (wmma_m, wmma_n), lambda ii, jj: te.sum( AL_gemm[ii, k_gemm].astype(out_dtype) * BL_gemm[jj, k_gemm]. astype(out_dtype), axis=k_gemm, ), name="CL_compute", ) # lower the computation loops down to TensorCore hardware intrinsics # by mapping the dense tensorcore to tensor intrinsics s[AF].tensorize( b_ii, intrin_wmma_load_matrix_A( AF_stride, AS_stride, shape, "row_major", (wmma_m, wmma_k), (wmma_m, wmma_k), "float16", ), ) s[BF].tensorize( o_ii, intrin_wmma_load_matrix_W( BF_stride, BS_stride, shape, "col_major", (wmma_n, wmma_k), (wmma_n, wmma_k), "float16", ), ) s[CF].tensorize( _ii, intrin_wmma_gemm(AL_gemm, BL_gemm, CL_compute, AF_stride, BF_stride, CF_stride, shape), ) s[CS].tensorize( bbi, intrin_wmma_store_matrix(CS_stride, CF_stride, shape, out_dtype, (wmma_m, wmma_n), (wmma_m, wmma_n)), )
def conv2d_spatial_pack_nhwc(cfg, data, kernel, strides, padding, dilation, out_dtype): """Spatial pack compute for Conv2d NHWC""" out_dtype = out_dtype or data.dtype N, IH, IW, IC = get_const_tuple(data.shape) assert len(kernel.shape) == 4, "AlterOpLayout not enabled for NHWC yet" KH, KW, _, OC = get_const_tuple(kernel.shape) if isinstance(dilation, int): dilation_h = dilation_w = dilation else: dilation_h, dilation_w = dilation dilated_kernel_h = (KH - 1) * dilation_h + 1 dilated_kernel_w = (KW - 1) * dilation_w + 1 pad_top, pad_left, pad_down, pad_right = \ get_pad_tuple(padding, (dilated_kernel_h, dilated_kernel_w)) HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides) OH = (IH + pad_top + pad_down - dilated_kernel_h) // HSTR + 1 OW = (IW + pad_left + pad_right - dilated_kernel_w) // WSTR + 1 data_pad = nn.pad(data, [0, pad_top, pad_left, 0], [0, pad_down, pad_right, 0]) # ==================== define configuration space ==================== n, oc, oh, ow = cfg.axis(N), cfg.axis(OC), cfg.axis(OH), cfg.axis(OW) ic, kh, kw = cfg.reduce_axis(IC), cfg.reduce_axis(KH), cfg.reduce_axis(KW) oco, oci = cfg.define_split('tile_co', oc, num_outputs=2) oho, ohi = cfg.define_split('tile_oh', oh, num_outputs=2) owo, owi = cfg.define_split('tile_ow', ow, num_outputs=2) cfg.define_reorder('reorder_conv', [n, oho, owo, oco, kh, kw, ic, ohi, owi, oci], policy='candidate', candidate=[ [n, oho, owo, oco, kh, kw, ic, ohi, owi, oci], [n, oho, owo, oco, ohi, kh, kw, ic, owi, oci], [n, oho, owo, oco, ohi, kh, kw, owi, ic, oci], [n, oho, owo, ohi, oco, kh, kw, owi, ic, oci]]) cfg.define_annotate("ann_reduce", [kh, kw], policy='try_unroll') cfg.define_annotate("ann_spatial", [ohi, owi, oci], policy='try_unroll_vec') # ==================================================================== OCI = cfg['tile_co'].size[-1] OHI = cfg['tile_oh'].size[-1] OWI = cfg['tile_ow'].size[-1] OCO = OC // OCI OHO = OH // OHI OWO = OW // OWI kvshape = (OCO, KH, KW, IC, OCI) ovshape = (N, OHO, OWO, OCO, OHI, OWI, OCI) oshape = (N, OH, OW, OC) if dilation_h != 1 or dilation_w != 1: # undilate input data dvshape = (N, OHO, OWO, KH, KW, IC, OHI, OWI) data_vec = te.compute(dvshape, lambda n, oho, owo, kh, kw, ic, ohi, owi: data_pad[n][(oho*OHI+ohi)*HSTR+kh*dilation_h] [(owo*OWI+owi)*WSTR+kw*dilation_w][ic], name='data_vec_undilated') else: dvshape = (N, OHO, OWO, KH + (OHI-1)*HSTR, KW + (OWI-1)*WSTR, IC) data_vec = te.compute(dvshape, lambda n, oho, owo, ohi, owi, ic: data_pad[n][oho*OHI*HSTR+ohi][owo*OWI*WSTR+owi][ic], name='data_vec') kernel_vec = te.compute(kvshape, lambda oco, kh, kw, ic, oci: \ kernel[kh][kw][ic][oco*OCI+oci], name='kernel_vec') ic = te.reduce_axis((0, IC), name='ic') kh = te.reduce_axis((0, KH), name='kh') kw = te.reduce_axis((0, KW), name='kw') if dilation_h != 1 or dilation_w != 1: conv = te.compute(ovshape, lambda n, oho, owo, oco, ohi, owi, oci: \ te.sum(data_vec[n, oho, owo, kh, kw, ohi, owi, ic].astype(out_dtype) * kernel_vec[oco, kh, kw, ic, oci].astype(out_dtype), axis=[ic, kh, kw]), name='conv') else: conv = te.compute( ovshape, lambda n, oho, owo, oco, ohi, owi, oci: \ te.sum(data_vec[n, oho, owo, ohi*HSTR+kh, owi*WSTR+kw, ic].astype(out_dtype) * kernel_vec[oco, kh, kw, ic, oci].astype(out_dtype), axis=[ic, kh, kw]), name='conv') idiv = tvm.tir.indexdiv imod = tvm.tir.indexmod output = te.compute(oshape, lambda n, oho, owo, oc: conv[n][idiv(oho, OHI)][idiv(owo, OWI)][idiv(oc, OCI)]\ [imod(oho, OHI)][imod(owo, OWI)][imod(oc, OCI)], name='output_unpack', 
tag='spatial_conv_output_NHWC') return output
    'dtype = "float32"\n'
    "a = numpy.random.rand(M, K).astype(dtype)\n"
    "b = numpy.random.rand(K, N).astype(dtype)\n",
    stmt="answer = numpy.dot(a, b)",
    number=np_repeat,
)
print("Numpy running time: %f" % (np_runing_time / np_repeat))

answer = numpy.dot(a.numpy(), b.numpy())

# Algorithm
k = te.reduce_axis((0, K), "k")
A = te.placeholder((M, K), name="A")
B = te.placeholder((K, N), name="B")
C = te.compute((M, N), lambda m, n: te.sum(A[m, k] * B[k, n], axis=k), name="C")

# Default schedule
s = te.create_schedule(C.op)
func = tvm.build(s, [A, B, C], target=target, name="mmult")
assert func

c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), dev)
func(a, b, c)
tvm.testing.assert_allclose(c.numpy(), answer, rtol=1e-5)

evaluator = func.time_evaluator(func.entry_name, dev, number=1)
print("Baseline: %f" % evaluator(a, b, c).mean)

################################################################################################
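# A possible next step (sketch, not part of the baseline above): block the loops so that each
# tile of C stays in cache, then rebuild and re-time with the same evaluator pattern. The block
# size bn = 32 is an assumption, not a tuned value.
bn = 32
s = te.create_schedule(C.op)
xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
(kaxis,) = s[C].op.reduce_axis
ko, ki = s[C].split(kaxis, factor=4)
s[C].reorder(xo, yo, ko, ki, xi, yi)
func = tvm.build(s, [A, B, C], target=target, name="mmult_blocked")
func(a, b, c)
tvm.testing.assert_allclose(c.numpy(), answer, rtol=1e-5)
print("Blocked: %f" % func.time_evaluator(func.entry_name, dev, number=10)(a, b, c).mean)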
def group_conv2d_NCHWc_int8( cfg, data, kernel, stride, padding, dilation, groups, out_dtype="float32" ): """Group convolution operator for 'group_conv2d_NCHWc_int8'. Parameters ---------- data : tvm.te.Tensor 4-D with shape [batch, in_channel, in_height, in_width] or 5-D with shape [batch, in_channel_chunk, in_height, in_width, in_channel_block] kernel : tvm.te.Tensor 4-D with shape [num_filter, in_channel // groups, filter_height, filter_width] or 6-D with shape [num_filter_chunk, in_channel_chunk // groups, filter_height, filter_width, num_filter_block, in_channel_block] stride : int or a list/tuple of two ints Stride size, or [stride_height, stride_width] padding : int or str Padding size, or ['VALID', 'SAME'] dilation : int or a list/tuple of two ints dilation size, or [dilation_height, dilation_width] groups : int number of groups out_dtype : str The output type. This is used for mixed precision. Returns ------- Output : tvm.te.Tensor 5-D with shape [batch, out_channel, out_height, out_width, out_channel_block] """ ic_block_factor = 4 oc_block_factor = 4 pre_computed = len(kernel.shape) == 6 if not pre_computed: batch, channels, height, width = get_const_tuple(data.shape) out_channels, in_channels, kernel_h, kernel_w = get_const_tuple(kernel.shape) assert channels % groups == 0, "input channels must divide group size" assert out_channels % groups == 0, "output channels must divide group size" assert ( channels % ic_block_factor == 0 ), "Number of input channels per group must divide {}".format(ic_block_factor) assert ( out_channels % oc_block_factor == 0 ), "Number of output channels per group must divide {}".format(oc_block_factor) packed_data = te.compute( (batch, channels // ic_block_factor, height, width, ic_block_factor), lambda n, c, h, w, vc: data[n, c * ic_block_factor + vc, h, w], name="packed_data", ) packed_kernel = te.compute( ( out_channels // oc_block_factor, in_channels // ic_block_factor, kernel_h, kernel_w, oc_block_factor, ic_block_factor, ), lambda oc_chunk, ic_chunk, kh, kw, oc_block, ic_block: kernel[ oc_chunk * oc_block_factor + oc_block, ic_chunk * ic_block_factor + ic_block, kh, kw ], name="packed_kernel", ) else: packed_data = data packed_kernel = kernel batch, ic_chunk, in_height, in_width, _ = get_const_tuple(packed_data.shape) oc_chunk, _, kernel_h, kernel_w, oc_block, ic_block = get_const_tuple(packed_kernel.shape) # TODO(kumasento): these assertions ensure that the number of groups # should be smaller or equal to the number of blocks, so that each # group will have at least one block. # Shall we pad the channels to avoid raising assertions? 
assert ( groups <= oc_chunk ), "Number of groups {} should be less than " "output channel chunk size {}".format( groups, oc_chunk ) assert ( groups <= ic_chunk ), "Number of groups {} should be less than " "input channel chunk size {}".format( groups, ic_chunk ) if isinstance(stride, int): stride_h = stride_w = stride else: stride_h, stride_w = stride if isinstance(dilation, int): dilation_h = dilation_w = dilation else: dilation_h, dilation_w = dilation # pad the input data pad_top, pad_left, pad_down, pad_right = get_pad_tuple(padding, (kernel_h, kernel_w)) pad_before = [0, 0, pad_top, pad_left, 0] pad_after = [0, 0, pad_down, pad_right, 0] pad_data = pad(packed_data, pad_before, pad_after, name="pad_data") # compute the output shape out_height = (in_height - (kernel_h - 1) * dilation_h - 1 + pad_top + pad_down) // stride_h + 1 out_width = (in_width - (kernel_w - 1) * dilation_w - 1 + pad_left + pad_right) // stride_w + 1 oshape = (batch, oc_chunk, out_height, out_width, oc_block) icc = te.reduce_axis((0, ic_chunk // groups), name="ic_chunk") icb = te.reduce_axis((0, ic_block_factor), name="ic_block") kh = te.reduce_axis((0, kernel_h), name="kh") kw = te.reduce_axis((0, kernel_w), name="kw") # NOTE(kumasento): explanation of this snippet - # oc_chunk//groups and ic_chunk//groups give you the number of blocks, # i.e., chunk, per group. # occ is the ID of the output channel block, so that occ//(oc_chunk//groups) # produces the ID of the group. # Multiplying that result with ic_chunk//groups resulting in the ID # of the beginning block of the corresponding input group. # Adding the block offset (icc) will give you the exact block ID. # # Compared with a normal convolution, group convolution only sums # input channels from the group that an output channel resides in. conv = te.compute( oshape, lambda n, occ, oh, ow, ocb: te.sum( pad_data[ n, occ // (oc_chunk // groups) * (ic_chunk // groups) + icc, oh * stride_h + kh * dilation_h, ow * stride_w + kw * dilation_w, icb, ].astype("int32") * packed_kernel[occ, icc, kh, kw, ocb, icb].astype("int32"), axis=[icc, kh, kw, icb], ), ) # Type conversion output = te.compute( oshape, lambda *index: conv(*index).astype(out_dtype), tag="group_conv2d_NCHWc_int8" ) num_flop = ( batch * oc_chunk * oc_block * out_height * out_width * ic_chunk * ic_block * kernel_h * kernel_w * 2 // groups ) cfg.add_flop(num_flop) return output
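# Worked example (hypothetical sizes) of the group-index arithmetic described in the note above:
# output-channel block `occ` belongs to group occ // (oc_chunk // groups), and the first input
# block of that group is group_id * (ic_chunk // groups).
oc_chunk_ex, ic_chunk_ex, groups_ex = 8, 8, 4
for occ_ex in range(oc_chunk_ex):
    group_id = occ_ex // (oc_chunk_ex // groups_ex)
    ic_base = group_id * (ic_chunk_ex // groups_ex)
    # block `occ_ex` only reduces over input blocks [ic_base, ic_base + ic_chunk_ex // groups_ex)
    assert 0 <= ic_base < ic_chunk_ex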
def conv2d_spatial_pack_nchw(cfg, data, kernel, strides, padding, dilation, out_dtype, num_tile): """compute define for Conv2d Spatial Pack with NCHW layout""" out_dtype = out_dtype or data.dtype N, CI, IH, IW = get_const_tuple(data.shape) if isinstance(N, tvm.tir.Any): N = tvm.te.size_var("n") if not isinstance(IH, int) or not isinstance(IW, int): raise RuntimeError( "ARM winograd conv2d doesn't support dynamic input height or width." ) if isinstance(dilation, int): dilation_h = dilation_w = dilation else: dilation_h, dilation_w = dilation if len(kernel.shape) == 4: pre_packed = False CO, _, KH, KW = get_const_tuple(kernel.shape) else: # kernel tensor is pre packed pre_packed = True CO, _, KH, KW, VC = get_const_tuple(kernel.shape) CO = CO * VC dilated_kernel_h = (KH - 1) * dilation_h + 1 dilated_kernel_w = (KW - 1) * dilation_w + 1 pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple( padding, (dilated_kernel_h, dilated_kernel_w)) HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides) OH = (IH + pad_top + pad_bottom - dilated_kernel_h) // HSTR + 1 OW = (IW + pad_left + pad_right - dilated_kernel_w) // WSTR + 1 data_pad = nn.pad(data, [0, 0, pad_top, pad_left], [0, 0, pad_bottom, pad_right]) # ==================== define configuration space ==================== # TODO(@kevinthesun): Support tuning/optimization for dynamic shape. n_tuning_axis = N if isinstance(N, int) else 1 n, co, oh, ow = cfg.axis(n_tuning_axis), cfg.axis(CO), cfg.axis( OH), cfg.axis(OW) ci, kh, kw = cfg.reduce_axis(CI), cfg.reduce_axis(KH), cfg.reduce_axis(KW) if num_tile == 2: # for arm cpu co, vc = cfg.define_split("tile_co", co, num_outputs=2) oh, vh = cfg.define_split("tile_oh", oh, num_outputs=2) ow, vw = cfg.define_split("tile_ow", ow, num_outputs=2) elif num_tile == 3: # for mali gpu co, _, vc = cfg.define_split("tile_co", co, num_outputs=3) oh, _, vh = cfg.define_split("tile_oh", oh, num_outputs=3) ow, _, vw = cfg.define_split("tile_ow", ow, num_outputs=3) else: raise RuntimeError("Invalid num_tile") cfg.define_reorder( "reorder_0", [n, co, oh, ow, ci, kh, kw, vh, vw, vc], policy="candidate", candidate=[ [n, co, oh, ow, ci, kh, kw, vh, vw, vc], [n, co, oh, ow, ci, kh, kw, vc, vh, vw], ], ) cfg.define_annotate("ann_reduce", [kh, kw], policy="try_unroll") cfg.define_annotate("ann_spatial", [vh, vw, vc], policy="try_unroll_vec") # fallback support if cfg.is_fallback: if num_tile == 2: # arm cpu ref_log = autotvm.tophub.load_reference_log( "arm_cpu", "rk3399", "conv2d_nchw_spatial_pack.arm_cpu") cfg.fallback_with_reference_log(ref_log) elif num_tile == 3: # mali gpu ref_log = autotvm.tophub.load_reference_log( "mali", "rk3399", "conv2d_nchw_spatial_pack.mali") cfg.fallback_with_reference_log(ref_log) # ==================================================================== VC = cfg["tile_co"].size[-1] VH = cfg["tile_oh"].size[-1] VW = cfg["tile_ow"].size[-1] kvshape = (CO // VC, CI, KH, KW, VC) ovshape = (N, CO // VC, OH // VH, OW // VW, VH, VW, VC) oshape = (N, CO, OH, OW) if dilation_h != 1 or dilation_w != 1: # undilate input data dvshape = (N, OH // VH, OW // VW, CI, KH, KW, VH, VW) data_vec = te.compute( dvshape, lambda n, h, w, ci, kh, kw, vh, vw: data_pad[n][ci][ (h * VH + vh) * HSTR + kh * dilation_h][ (w * VW + vw) * WSTR + kw * dilation_w], name="data_vec_undilated", ) else: dvshape = (N, OH // VH, OW // VW, CI, VH * HSTR + KH - 1, VW * WSTR + KW - 1) data_vec = te.compute( dvshape, lambda n, h, w, ci, vh, vw: data_pad[n][ci][h * VH * HSTR + vh][ w * VW * WSTR + vw], 
name="data_vec", ) if autotvm.GLOBAL_SCOPE.in_tuning: # use "kernel_autotvm" instead of "kernel" to avoid naming conflict with OpenCL keyword kernel_vec = tvm.te.placeholder(kvshape, kernel.dtype, name="kernel_autotvm") else: if pre_packed: kernel_vec = kernel else: kernel_vec = te.compute( kvshape, lambda co, ci, kh, kw, vc: kernel[co * VC + vc][ci][kh][kw], name="kernel_vec", ) ci = te.reduce_axis((0, CI), name="ci") kh = te.reduce_axis((0, KH), name="kh") kw = te.reduce_axis((0, KW), name="kw") if dilation_h != 1 or dilation_w != 1: conv = te.compute( ovshape, lambda n, co, h, w, vh, vw, vc: te.sum( data_vec[n, h, w, ci, kh, kw, vh, vw].astype(out_dtype) * kernel_vec[co, ci, kh, kw, vc].astype(out_dtype), axis=[ci, kh, kw], ), name="conv", ) else: conv = te.compute( ovshape, lambda n, co, h, w, vh, vw, vc: te.sum( data_vec[n, h, w, ci, vh * HSTR + kh, vw * WSTR + kw].astype( out_dtype) * kernel_vec[co, ci, kh, kw, vc].astype( out_dtype), axis=[ci, kh, kw], ), name="conv", ) idxdiv = tvm.tir.indexdiv idxmod = tvm.tir.indexmod output = te.compute( oshape, lambda n, co, h, w: conv[n, idxdiv(co, VC), idxdiv(h, VH), idxdiv(w, VW), idxmod(h, VH), idxmod(w, VW), idxmod(co, VC), ], name="output_unpack", tag="spatial_conv2d_output", ) return output
def test_gemm(): # graph nn = 2048 n = te.var('n') n = tvm.runtime.convert(nn) m, l = n, n A = te.placeholder((l, n), name='A') B = te.placeholder((l, m), name='B') k = te.reduce_axis((0, l), name='k') C = te.compute((m, n), lambda ii, jj: te.sum(A[k, jj] * B[k, ii], axis=k), name='C') # schedule s = te.create_schedule(C.op) AA = s.cache_read(A, "shared", [C]) BB = s.cache_read(B, "shared", [C]) AL = s.cache_read(AA, "local", [C]) BL = s.cache_read(BB, "local", [C]) CC = s.cache_write(C, "local") scale = 8 num_thread = 8 block_factor = scale * num_thread block_x = te.thread_axis("blockIdx.x") thread_x = te.thread_axis((0, num_thread), "threadIdx.x") block_y = te.thread_axis("blockIdx.y") thread_y = te.thread_axis((0, num_thread), "threadIdx.y") thread_xz = te.thread_axis((0, 2), "vthread", name="vx") thread_yz = te.thread_axis((0, 2), "vthread", name="vy") by, yi = s[C].split(C.op.axis[0], factor=block_factor) bx, xi = s[C].split(C.op.axis[1], factor=block_factor) s[C].bind(by, block_y) s[C].bind(bx, block_x) s[C].reorder(by, bx, yi, xi) tyz, yi = s[C].split(yi, nparts=2) ty, yi = s[C].split(yi, nparts=num_thread) txz, xi = s[C].split(xi, nparts=2) tx, xi = s[C].split(xi, nparts=num_thread) s[C].bind(tyz, thread_yz) s[C].bind(txz, thread_xz) s[C].bind(ty, thread_y) s[C].bind(tx, thread_x) s[C].reorder(tyz, txz, ty, tx, yi, xi) s[CC].compute_at(s[C], tx) yo, xo = CC.op.axis ko, ki = s[CC].split(k, factor=8) kt, ki = s[CC].split(ki, factor=1) s[CC].reorder(ko, kt, ki, yo, xo) s[AA].compute_at(s[CC], ko) s[BB].compute_at(s[CC], ko) s[CC].unroll(kt) s[AL].compute_at(s[CC], kt) s[BL].compute_at(s[CC], kt) # Schedule for A's shared memory load ty, xi = s[AA].split(s[AA].op.axis[0], nparts=num_thread) _, xi = s[AA].split(s[AA].op.axis[1], factor=num_thread * 4) tx, xi = s[AA].split(xi, nparts=num_thread) s[AA].bind(ty, thread_y) s[AA].bind(tx, thread_x) s[AA].vectorize(xi) # Schedule for B' shared memory load ty, xi = s[BB].split(s[BB].op.axis[0], nparts=num_thread) _, xi = s[BB].split(s[BB].op.axis[1], factor=num_thread * 4) tx, xi = s[BB].split(xi, nparts=num_thread) s[BB].bind(ty, thread_y) s[BB].bind(tx, thread_x) s[BB].vectorize(xi) s[AA].double_buffer() s[BB].double_buffer() # correctness def check_device(device): ctx = tvm.context(device, 0) if not ctx.exist: print("Skip because %s is not enabled" % device) return print("Device %s" % device) f = tvm.build(s, [A, B, C], device) # launch the kernel. n, m, l = nn, nn, nn a_np = np.random.uniform(size=(n, l)).astype(A.dtype) b_np = np.random.uniform(size=(m, l)).astype(B.dtype) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(b_np, ctx) c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx) for i in range(2): f(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), np.dot(b_np.T, a_np), rtol=1e-5) num_flops = 2 * nn * nn * nn num_runs = 10 timer_f = f.time_evaluator(f.entry_name, ctx, number=num_runs) t = timer_f(a, b, c).mean GFLOPS = num_flops / (t * 1e3) / 1e6 print("average time cost of %d runs = %g ms, %g GFLOPS." % (num_runs, t * 1e3, GFLOPS)) for device in ["cuda", "opencl", "rocm", "nvptx", "vulkan"]: with tvm.transform.PassContext( config={ "tir.UnrollLoop": { "auto_max_step": 128, "explicit_unroll": device != "cuda" } }): check_device(device)
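# Small numpy check (hypothetical 4x4 sizes) of the reference used in check_device above:
# C[ii, jj] = sum_k A[k, jj] * B[k, ii], which is exactly (B^T A)[ii, jj] = np.dot(b_np.T, a_np).
import numpy as np
a_chk = np.random.rand(4, 4)
b_chk = np.random.rand(4, 4)
c_chk = np.einsum("kj,ki->ij", a_chk, b_chk)
np.testing.assert_allclose(c_chk, np.dot(b_chk.T, a_chk), rtol=1e-12)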
def gemm_quantized(M, N, K, unroll, interleave, in_type, out_type): """ Use integer ARM v8 instructions in order to produce a block c of 4x4 elements given two 4xK blocks a and b' (where b' is a Kx4 block transposed). The final result is c = a*b (where '*' indicates the matrix product) Every row of the matrix c is obtained (for uint8) by a sequence of umull -> uadalp -> umull2 -> uadalp The block size is constrained by the number of registers available in arvm8. This function returns a TensorIntrin that can be used to tensorize a schedule. Parameters ---------- M: int rows of the matrix A N: int columns of the matrix B K: int columns of matrix A in_type: str, {'uint8', 'int8'} out_type: str, {'uint32', 'int32'} Returns ------- intrin : TensorIntrin The ARM uint8/int8 TensorIntrin that can be used in tensorizing schedule """ A = te.placeholder((K // 16, te.var("m"), 16), dtype=in_type, name='A') B = te.placeholder((K // 16, te.var("n"), 16), dtype=in_type, name='B') idxm = tvm.tir.indexmod k = te.reduce_axis((0, K), "k") C = te.compute((te.var("m"), te.var("n")), lambda x, y: te.sum(A[k // 16, x, idxm(k, 16)].astype( out_type) * B[k // 16, y, idxm(k, 16)].astype(out_type), axis=k), name="C") a_buffer = tvm.tir.decl_buffer(A.shape, dtype=in_type, name="a_buffer", offset_factor=1, strides=[te.var('sa_1'), te.var('sa_2'), 1]) b_buffer = tvm.tir.decl_buffer(B.shape, dtype=in_type, name="b_buffer", offset_factor=1, strides=[te.var('sb_1'), te.var('sb_2'), 1]) c_buffer = tvm.tir.decl_buffer(C.shape, dtype=out_type, name="c_buffer", offset_factor=1, strides=[te.var('sc'), 1]) def _intrin_func(ins, outs): def _instr(): ib = tvm.tir.ir_builder.create() aa, bb = ins cc = outs[0] stepA = min(4, M) stepB = min(4, N) intrin_name = "gemm_quantized_{0}_{0}_int32_{1}_{2}".format( in_type, stepA, stepB) if unroll: intrin_name += ("_" + str(K)) if interleave: intrin_name += "_interleaved" ib.emit( tvm.tir.call_extern("int32", intrin_name, outs[0].access_ptr("w"), a_buffer.access_ptr("r"), b_buffer.access_ptr("r"), K)) return ib.get() # body, reset, update return _instr() buffer_params = {"offset_factor": 1} return te.decl_tensor_intrin(C.op, _intrin_func, binds={ A: a_buffer, B: b_buffer, C: c_buffer }, default_buffer_params=buffer_params)
def dot_int8_int8_int32(int32_lanes, dtype='uint'): """ Int8 dot product by every 4 elements using ARM v8.2 udot. This function takes two arrays of int8 datatype -- data[4] and kernel[int32_lanes][4] -- and computes a dot product of data[4] with every 4 elements of kernels, resulting in output[int32_lanes] of uint32 datatype. The pseudo code is as follows. .. code-block:: c void dot_int8_int8_int32(int8 data[4], int8 kernel[16][4], int32 output[16]){ for (int i = 0; i < int32_lanes; i++){ out[i] = 0; for (int k = 0; k < 4; k++){ out[i] += data[k] * kernel[i][k] } } } Physically, the kernel array sits in a vector register and the data[4] is broadcasted to another vector register. This function returns a TensorIntrin that can be used to tensorize a schedule. Parameters ---------- int32_lanes: int How many int32/uint32 to produce dtype: str, optional, {"uint", "int"} Whether it works on unsigned int or signed int Returns ------- intrin : TensorIntrin The ARM uint8 TensorIntrin that can be used in tensorizing schedule """ num_int8_elements = 4 # 4 int8 elements in int32 data = te.placeholder((num_int8_elements, ), dtype='%s8' % dtype, name='data') kernel = te.placeholder((int32_lanes, num_int8_elements), dtype='%s8' % dtype, name='kernel') k = te.reduce_axis((0, num_int8_elements), name='k') C = te.compute((int32_lanes, ), lambda i: te.sum(data[k].astype('%s32' % dtype) * kernel[ i, k].astype('%s32' % dtype), axis=k), name="C") a_buffer = tvm.tir.decl_buffer(data.shape, dtype='%s8' % dtype, name="a_buffer", offset_factor=1, strides=[1]) b_buffer = tvm.tir.decl_buffer(kernel.shape, dtype='%s8' % dtype, name="b_buffer", offset_factor=1, strides=[te.var('s'), 1]) def _intrin_func(ins, outs): def _instr(index): ib = tvm.tir.ir_builder.create() if index == 1: ib.emit(outs[0].vstore( 0, tvm.tir.const(0, '%s32x%d' % (dtype, int32_lanes)))) return ib.get() dtype_a = '%s8x%d' % (dtype, num_int8_elements) dtype_b = '%s8x%d' % (dtype, int32_lanes * num_int8_elements) dtype_c = '%s32x%d' % (dtype, int32_lanes) a_int8 = ins[0].vload([0], dtype_a) re_int32 = tvm.tir.call_intrin('%s32' % dtype, 'tir.reinterpret', a_int8) # broadcast a vec_ai32 = re_int32.astype(dtype_c) vec_a = tvm.tir.call_intrin(dtype_b, 'tir.reinterpret', vec_ai32) vec_b = ins[1].vload([0, 0], dtype_b) vec_c = outs[0].vload([0], dtype_c) inst = 'udot' if dtype == 'uint' else 'sdot' inst = 'llvm.aarch64.neon.%s.v%di32.v%di8' % ( inst, int32_lanes, int32_lanes * num_int8_elements) vdot = tvm.tir.call_llvm_pure_intrin(dtype_c, inst, tvm.tir.const(2, 'uint32'), vec_c, vec_a, vec_b) ib.emit(outs[0].vstore(0, vdot)) return ib.get() # body, reset, update return _instr(0), _instr(1), _instr(2) buffer_params = {"offset_factor": 1} return te.decl_tensor_intrin(C.op, _intrin_func, binds={ data: a_buffer, kernel: b_buffer }, default_buffer_params=buffer_params)
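# Sketch (hypothetical usage): declare the intrinsic for 4 output lanes and apply it with
# tensorize on the innermost output axis of a compute whose inner pattern matches C above,
# e.g. s[out_stage].tensorize(inner_axis, dot_int8_int8_int32(4, dtype="uint")).
arm_dot_intrin = dot_int8_int8_int32(4, dtype="uint")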
def conv2d_NCHWc_int8(data, kernel, stride, padding, dilation, layout, out_layout, out_dtype="int32", n_elems=4): """Conv2D operator for nChw[x]c layout. Parameters ---------- data : tvm.te.Tensor 5-D with shape [batch, in_channel_chunk, in_height, in_width, in_channel_block] kernel : tvm.te.Tensor 7-D with shape [num_filter_chunk, in_channel_chunk, filter_height, filter_width, in_channel_block/4, num_filter_block, 4] stride : int or a list/tuple of two ints stride size, or [stride_height, stride_width] padding : int or a list/tuple of 2 or 4 ints padding size, or [pad_height, pad_width] for 2 ints, or [pad_top, pad_left, pad_bottom, pad_right] for 4 ints dilation: int or a list/tuple of two ints dilation size, or [dilation_height, dilation_width] layout : str Input data layout out_layout : str Output data layout out_dtype : str output data type n_elems : int numer of int8 elements accumulated Returns ------- output : tvm.te.Tensor 5-D with shape [batch, out_channel_chunk, out_height, out_width, out_channel_block] """ # layout and out_layout are not used here, # we keep them for debug convenience when dumping autotvm workload HSTR, WSTR = stride if isinstance(stride, (tuple, list)) else (stride, stride) dilation_h, dilation_w = (dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation)) n, ic_chunk, ih, iw, ic_bn = get_const_tuple(data.shape) in_channel = ic_chunk * ic_bn oc_chunk, ic_chunk_group, kernel_height, kernel_width, _, oc_bn, _ = get_const_tuple( kernel.shape) num_filter = oc_chunk * oc_bn groups = ic_chunk // ic_chunk_group dilated_kernel_h = (kernel_height - 1) * dilation_h + 1 dilated_kernel_w = (kernel_width - 1) * dilation_w + 1 pad_top, pad_left, pad_down, pad_right = get_pad_tuple( padding, (dilated_kernel_h, dilated_kernel_w)) HPAD = pad_top + pad_down WPAD = pad_left + pad_right # output shape out_height = (ih + HPAD - dilated_kernel_h) // HSTR + 1 out_width = (iw + WPAD - dilated_kernel_w) // WSTR + 1 oshape = (n, oc_chunk, out_height, out_width, oc_bn) pad_before = (0, 0, pad_top, pad_left, 0) pad_after = (0, 0, pad_down, pad_right, 0) # DOPAD DOPAD = HPAD != 0 or WPAD != 0 if DOPAD: data_pad = pad(data, pad_before, pad_after, name="data_pad") else: data_pad = data ic = te.reduce_axis((0, in_channel), name="ic") kh = te.reduce_axis((0, kernel_height), name="kh") kw = te.reduce_axis((0, kernel_width), name="kw") if groups == 1: ic_outer = te.reduce_axis((0, in_channel // ic_bn), name="ic_outer") ic_f_inner = te.reduce_axis((0, ic_bn // n_elems), name="ic_f_inner") ic_s_inner = te.reduce_axis((0, n_elems), name="ic_s_inner") return te.compute( oshape, lambda n, oc_chunk, oh, ow, oc_block: te.sum( data_pad[n, ic_outer, oh * HSTR + kh * dilation_h, ow * WSTR + kw * dilation_w, ic_f_inner * n_elems + ic_s_inner, ]. 
astype(out_dtype ) * kernel[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner].astype(out_dtype), axis=[kh, kw, ic_outer, ic_f_inner, ic_s_inner], ), name="conv2d_NCHWc_int8", tag="conv2d_NCHWc_int8", attrs={"schedule_rule": "meta_schedule.conv2d_NCHWc_int8"}, ) # for int8 group conv support ic_chunk = in_channel // ic_bn ic_outer = te.reduce_axis((0, ic_chunk // groups), name="ic_outer") ic_f_inner = te.reduce_axis((0, ic_bn // n_elems), name="ic_f_inner") ic_s_inner = te.reduce_axis((0, n_elems), name="ic_s_inner") oshape = (n, oc_chunk, out_height, out_width, oc_bn) return te.compute( oshape, lambda n, occ, oh, ow, oc_block: te.sum( data_pad[n, (occ * oc_bn // (oc_chunk * oc_bn // groups)) * (ic_chunk // groups) + ic_outer, oh * HSTR + kh, ow * WSTR + kw, ic_f_inner * n_elems + ic_s_inner, ]. astype(out_dtype) * kernel[occ, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner].astype(out_dtype), axis=[kh, kw, ic_outer, ic_f_inner, ic_s_inner], ), name="conv2d_NCHWc_int8", tag="conv2d_NCHWc_int8", attrs={"schedule_rule": "meta_schedule.conv2d_NCHWc_int8"}, )
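# Sketch (plain numpy, hypothetical sizes): the NCHW -> NCHW[x]c packing that the 5-D `data`
# argument above is assumed to already be in, i.e. packed[n, c // ic_bn, h, w, c % ic_bn].
import numpy as np
n_ex, c_ex, h_ex, w_ex, ic_bn_ex = 1, 8, 5, 5, 4
data_nchw = np.arange(n_ex * c_ex * h_ex * w_ex).reshape(n_ex, c_ex, h_ex, w_ex)
data_nchwc = data_nchw.reshape(n_ex, c_ex // ic_bn_ex, ic_bn_ex, h_ex, w_ex).transpose(0, 1, 3, 4, 2)
assert data_nchwc.shape == (n_ex, c_ex // ic_bn_ex, h_ex, w_ex, ic_bn_ex)
assert data_nchwc[0, 1, 2, 3, 2] == data_nchw[0, 1 * ic_bn_ex + 2, 2, 3]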
def conv2d_NCHWc(data, kernel, stride, padding, dilation, layout, out_layout, out_dtype="float32"): """Conv2D operator for nChw[x]c layout. Parameters ---------- data : tvm.te.Tensor 5-D with shape [batch, in_channel_chunk, in_height, in_width, in_channel_block] kernel : tvm.te.Tensor 6-D with shape [num_filter_chunk, in_channel_chunk, filter_height, filter_width, in_channel_block, num_filter_block] stride : int or a list/tuple of two ints stride size, or [stride_height, stride_width] padding : int or a list/tuple of 2 or 4 ints padding size, or [pad_height, pad_width] for 2 ints, or [pad_top, pad_left, pad_bottom, pad_right] for 4 ints dilation: int or a list/tuple of two ints dilation size, or [dilation_height, dilation_width] layout : str Input data layout out_layout : str Output data layout out_dtype : str output data type Returns ------- output : tvm.te.Tensor 5-D with shape [batch, out_channel_chunk, out_height, out_width, out_channel_block] """ # layout and out_layout are not used here, # we keep them for debug convenience when dumping autotvm workload HSTR, WSTR = stride if isinstance(stride, (tuple, list)) else (stride, stride) dilation_h, dilation_w = (dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation)) n, ic_chunk, ih, iw, ic_bn = get_const_tuple(data.shape) in_channel = ic_chunk * ic_bn target = tvm.target.Target.current(allow_none=False) oc_chunk, ic_chunk_group, kernel_height, kernel_width, _, oc_bn = get_const_tuple( kernel.shape) num_filter = oc_chunk * oc_bn groups = ic_chunk // ic_chunk_group dilated_kernel_h = (kernel_height - 1) * dilation_h + 1 dilated_kernel_w = (kernel_width - 1) * dilation_w + 1 pad_top, pad_left, pad_down, pad_right = get_pad_tuple( padding, (dilated_kernel_h, dilated_kernel_w)) HPAD = pad_top + pad_down WPAD = pad_left + pad_right # output shape out_height = (ih + HPAD - dilated_kernel_h) // HSTR + 1 out_width = (iw + WPAD - dilated_kernel_w) // WSTR + 1 oshape = (n, oc_chunk, out_height, out_width, oc_bn) pad_before = (0, 0, pad_top, pad_left, 0) pad_after = (0, 0, pad_down, pad_right, 0) # DOPAD DOPAD = HPAD != 0 or WPAD != 0 if DOPAD: data_pad = pad(data, pad_before, pad_after, name="data_pad") else: data_pad = data ic = te.reduce_axis((0, in_channel), name="ic") kh = te.reduce_axis((0, kernel_height), name="kh") kw = te.reduce_axis((0, kernel_width), name="kw") idxdiv = tvm.tir.indexdiv idxmod = tvm.tir.indexmod return te.compute( oshape, lambda n, oc_chunk, oh, ow, oc_block: te.sum( data_pad[n, idxdiv(ic, ic_bn), oh * HSTR + kh * dilation_h, ow * WSTR + kw * dilation_w, idxmod(ic, ic_bn), ].astype(out_dtype) * kernel[ oc_chunk, idxdiv(ic, ic_bn), kh, kw, idxmod(ic, ic_bn), oc_block].astype(out_dtype), axis=[ic, kh, kw], ), name="conv2d_NCHWc", tag="conv2d_NCHWc", )
def _conv2d_winograd_nhwc_impl( data, weight, strides, padding, dilation, out_dtype, tile_size, pre_computed=False, auto_scheduler_rewritten_layout="", ): """Conv2D Winograd implementation in NHWC layout. This is a clean version to be used by the auto-scheduler for both CPU and GPU. Parameters ---------- data : tvm.Tensor 4-D with shape [batch, in_height, in_width, in_channel] weight : tvm.Tensor 4-D with shape [filter_height, filter_width, in_channel, num_filter] strides : int or a list/tuple of two ints stride size, or [stride_height, stride_width] padding : int or a list/tuple of two ints padding size, or [pad_height, pad_width] dilation: int or a list/tuple of two ints dilation size, or [dilation_height, dilation_width] out_dtype : str, optional Specifies the output data type. tile_size : int The size of the tile to use for the Winograd filter pre_computed: bool = False Whether the kernel is precomputed auto_scheduler_rewritten_layout: str = "" The layout after auto-scheduler's layout rewrite pass. Returns ------- output : tvm.Tensor 4-D with shape [batch, out_height, out_width, out_channel] """ N, H, W, CI = get_const_tuple(data.shape) if isinstance(dilation, int): dilation_h = dilation_w = dilation else: dilation_h, dilation_w = dilation assert (dilation_h, dilation_w) == (1, 1), "Does not support dilation" if not pre_computed: KH, KW, CI, CO = get_const_tuple(weight.shape) else: if auto_scheduler_rewritten_layout: H_CAT, W_CAT, CO, CI = get_const_tuple( auto_scheduler.get_shape_from_rewritten_layout( auto_scheduler_rewritten_layout, ["eps", "nu", "co", "ci"])) auto_scheduler.remove_index_check(weight) else: H_CAT, W_CAT, CO, CI = get_const_tuple(weight.shape) KH, KW = H_CAT - tile_size + 1, W_CAT - tile_size + 1 pad_t, pad_l, pad_b, pad_r = get_pad_tuple(padding, (KH, KW)) HSTR, WSTR = (strides, strides) if isinstance(strides, int) else strides assert HSTR == 1 and WSTR == 1 and KH == 3 and KW == 3 r = KW m = tile_size alpha = m + r - 1 A, B, G = winograd_transform_matrices(m, r, out_dtype) H = (H + pad_t + pad_b - KH) // HSTR + 1 W = (W + pad_l + pad_r - KW) // WSTR + 1 nH, nW = (H + m - 1) // m, (W + m - 1) // m P = N * nH * nW pad_extra = (nW - 1) * m + alpha - (H + pad_t + pad_b) data_pad = pad( data, (0, pad_t, pad_l, 0), (0, pad_b + pad_extra, pad_r + pad_extra, 0), name="data_pad", attrs={"schedule_rule": "None"}, ) if not pre_computed: r_kh = te.reduce_axis((0, KH), name="r_kh") r_kw = te.reduce_axis((0, KW), name="r_kw") kernel_pack = te.compute( (alpha, alpha, CO, CI), lambda eps, nu, co, ci: te.sum(weight[r_kh][r_kw][ci][co] * G[eps][ r_kh] * G[nu][r_kw], axis=[r_kh, r_kw]), name="kernel_pack", ) attrs = {} else: kernel_pack = weight attrs = {"layout_free_placeholders": [kernel_pack]} # pack data tile input_tile = te.compute( (alpha, alpha, P, CI), lambda eps, nu, p, ci: data_pad[p // (nH * nW)][( (p // nW) % nH) * m + eps][(p % nW) * m + nu][ci], name="input_tile", attrs={"schedule_rule": "None"}, ) # transform data target = tvm.target.Target.current(allow_none=True) if target is not None: target_kind = "meta_schedule.winograd_data_pack." 
+ target.kind.name else: target_kind = "None" r_a = te.reduce_axis((0, alpha), "r_a") r_b = te.reduce_axis((0, alpha), "r_b") data_pack = te.compute( (alpha, alpha, P, CI), lambda eps, nu, p, ci: te.sum(input_tile[r_a][r_b][p][ci] * B[r_a][eps] * B[r_b][nu], axis=[r_a, r_b]), name="data_pack", attrs={ "auto_scheduler_simplify_const_tensor_indices": ["eps", "nu", "r_a", "r_b"], "schedule_rule": target_kind, }, # the attrs are necessary hints for the auto-scheduler ) # do batch gemm ci = te.reduce_axis((0, CI), name="ci") bgemm = te.compute( (alpha, alpha, P, CO), lambda eps, nu, p, co: te.sum(data_pack[eps][nu][p][ci] * kernel_pack[ eps][nu][co][ci], axis=[ci]), name="bgemm", attrs=attrs, ) if auto_scheduler_rewritten_layout: bgemm = auto_scheduler.rewrite_compute_body( bgemm, auto_scheduler_rewritten_layout) # inverse transform r_a = te.reduce_axis((0, alpha), "r_a") r_b = te.reduce_axis((0, alpha), "r_b") inverse = te.compute( (m, m, P, CO), lambda vh, vw, p, co: te.sum( bgemm[r_a][r_b][p][co] * A[r_a][vh] * A[r_b][vw], axis=[r_a, r_b]), name="inverse", attrs={ "auto_scheduler_simplify_const_tensor_indices": ["vh", "vw", "r_a", "r_b"], "schedule_rule": "meta_schedule.winograd_inverse", }, # the attrs are necessary hints for the auto-scheduler ) # output output = te.compute( (N, H, W, CO), lambda n, h, w, co: inverse[h % m, w % m, n * nH * nW + (h // m) * nW + (w // m), co], name="conv2d_winograd", ) return output
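# Worked example (hypothetical sizes) of the Winograd tiling arithmetic used above:
N_ex, H_ex, W_ex = 1, 56, 56   # output height/width
m_ex, r_ex = 4, 3              # tile_size and kernel size (KH == KW == 3 is asserted above)
alpha_ex = m_ex + r_ex - 1     # 6x6 transformed tiles
nH_ex, nW_ex = (H_ex + m_ex - 1) // m_ex, (W_ex + m_ex - 1) // m_ex
P_ex = N_ex * nH_ex * nW_ex    # number of tiles fed to the batched GEMM (bgemm)
assert (alpha_ex, nH_ex, nW_ex, P_ex) == (6, 14, 14, 196)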
def dense_pack(cfg, data, weight, bias=None, out_dtype=None): """Compute dense with transformed weight.""" if out_dtype is None: out_dtype = data.dtype M, K = get_const_tuple(data.shape) # batch, in_dim if len(weight.shape) == 3: N, _, packw_bn = get_const_tuple(weight.shape) # out_dim N = N * packw_bn else: N, _ = get_const_tuple(weight.shape) # out_dim # create tuning space cfg.define_split("tile_y", 32 if isinstance(M, (tvm.tir.Var, tvm.tir.Any)) else M, num_outputs=3) cfg.define_split("tile_x", 32 if isinstance(N, (tvm.tir.Var, tvm.tir.Any)) else N, num_outputs=3) cfg.define_split("tile_k", 32 if isinstance(K, (tvm.tir.Var, tvm.tir.Any)) else K, num_outputs=2) cfg.define_split( "tile_inner", 32 if isinstance(M, (tvm.tir.Var, tvm.tir.Any)) else M, num_outputs=2, filter=lambda y: y.size[-1] <= 16, ) if cfg.is_fallback: _default_dense_pack_config(cfg, M, N, K) if len(weight.shape) == 2: packw_bn = cfg["tile_x"].size[-1] packw_shape = (N // packw_bn, K, packw_bn) if autotvm.GLOBAL_SCOPE.in_tuning: # Directly use modified data layout placeholder. packw = tvm.te.placeholder(packw_shape, weight.dtype, name="packed_weight") else: packw = te.compute(packw_shape, lambda z, y, x: weight[z * packw_bn + x, y], name="packed_weight") else: packw = weight idxdiv = tvm.tir.indexdiv idxmod = tvm.tir.indexmod k = te.reduce_axis((0, K), name="k") C = te.compute( (M, N), lambda y, x: te.sum( data[y, k].astype(out_dtype) * packw[idxdiv( x, packw_bn), k, idxmod(x, packw_bn)].astype(out_dtype), axis=k, ), tag="dense_pack", ) if bias is not None: C = te.compute((M, N), lambda i, j: C[i, j] + bias[j].astype(out_dtype), tag=tag.BROADCAST) return C
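# Sketch (plain numpy, hypothetical sizes) of the weight packing computed above:
# packw[z, y, x] == weight[z * packw_bn + x, y].
import numpy as np
N_ex, K_ex, packw_bn_ex = 8, 6, 4
weight_np = np.arange(N_ex * K_ex).reshape(N_ex, K_ex)
packw_np = weight_np.reshape(N_ex // packw_bn_ex, packw_bn_ex, K_ex).transpose(0, 2, 1)
assert packw_np[1, 2, 3] == weight_np[1 * packw_bn_ex + 3, 2]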
import tvm
from tvm import te

n = 1024
dtype = "float32"
A = te.placeholder((n, n), dtype=dtype, name='A')
k = te.reduce_axis((0, n), name='k')
B = te.compute((n,), lambda i: te.sum(A[i, k], axis=k), name='B')

s = te.create_schedule(B.op)
print(tvm.lower(s, [A, B], simple_mode=True))
print("---------cutting line---------")

BW = s.cache_write(B, "local")
print(tvm.lower(s, [A, B], simple_mode=True))
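# A possible follow-up (sketch): attach the cache stage under the outer loop of B and lower
# again; the "local" buffer BW is then materialized once per row of A.
s[BW].compute_at(s[B], s[B].op.axis[0])
print(tvm.lower(s, [A, B], simple_mode=True))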
def _run(env, remote): # declare o = 4 n = 1 m = 4 x = te.placeholder((o, n, env.BATCH, env.BLOCK_IN), name="x", dtype=env.inp_dtype) w = te.placeholder((m, n, env.BLOCK_OUT, env.BLOCK_IN), name="w", dtype=env.wgt_dtype) x_buf = te.compute((o, n, env.BATCH, env.BLOCK_IN), lambda *i: x(*i), "x_buf") w_buf = te.compute((m, n, env.BLOCK_OUT, env.BLOCK_IN), lambda *i: w(*i), "w_buf") ko = te.reduce_axis((0, n), name="ko") ki = te.reduce_axis((0, env.BLOCK_IN), name="ki") y_gem = te.compute( (o, m, env.BATCH, env.BLOCK_OUT), lambda bo, co, bi, ci: te.sum(x_buf[bo, ko, bi, ki].astype( env.acc_dtype) * w_buf[co, ko, ci, ki].astype(env.acc_dtype), axis=[ko, ki]), name="y_gem") y_shf = te.compute((o, m, env.BATCH, env.BLOCK_OUT), lambda *i: y_gem(*i) >> 8, name="y_shf") y_max = te.compute((o, m, env.BATCH, env.BLOCK_OUT), lambda *i: tvm.te.max(y_shf(*i), 0), "y_max") #relu y_min = te.compute((o, m, env.BATCH, env.BLOCK_OUT), lambda *i: tvm.te.min(y_max(*i), (1 << (env.INP_WIDTH - 1)) - 1), "y_min") #relu y = te.compute((o, m, env.BATCH, env.BLOCK_OUT), lambda *i: y_min(*i).astype(env.inp_dtype), name="y") if not remote: return def verify(s, name=None): mod = vta.build(s, [x, w, y], "ext_dev", env.target_host) temp = util.tempdir() mod.save(temp.relpath("gemm.o")) remote.upload(temp.relpath("gemm.o")) f = remote.load_module("gemm.o") # verify ctx = remote.ext_dev(0) x_np = np.random.randint(-128, 128, size=(o, n, env.BATCH, env.BLOCK_IN)).astype(x.dtype) w_np = np.random.randint(-128, 128, size=(m, n, env.BLOCK_OUT, env.BLOCK_IN)).astype(w.dtype) y_np = np.zeros((o, m, env.BATCH, env.BLOCK_OUT)).astype(y.dtype) x_nd = tvm.nd.array(x_np, ctx) w_nd = tvm.nd.array(w_np, ctx) y_nd = tvm.nd.array(y_np, ctx) y_np = y_np.astype(env.acc_dtype) for b in range(o): for i in range(m): for j in range(n): y_np[b, i, :] += np.dot( x_np[b, j, :].astype(env.acc_dtype), w_np[i, j].T.astype(env.acc_dtype)) y_np = np.right_shift(y_np, 8) y_np = np.clip(y_np, 0, (1 << (env.INP_WIDTH - 1)) - 1).astype(y.dtype) if env.TARGET in ["sim", "tsim"]: simulator.clear_stats() f(x_nd, w_nd, y_nd) np.testing.assert_equal(y_np, y_nd.asnumpy()) if env.TARGET in ["sim", "tsim"]: sim_stats = simulator.stats() print("GEMM schedule:{} execution statistics:".format(name)) for k, v in sim_stats.items(): print("\t{:<16}: {:>16}".format(k, v)) def test_schedule1(): # default schedule with no smt s = te.create_schedule(y.op) # set the scope of the SRAM buffers s[x_buf].set_scope(env.inp_scope) s[w_buf].set_scope(env.wgt_scope) s[y_gem].set_scope(env.acc_scope) s[y_shf].set_scope(env.acc_scope) s[y_max].set_scope(env.acc_scope) s[y_min].set_scope(env.acc_scope) # set pragmas for DMA transfer and ALU ops s[x_buf].compute_at(s[y_gem], ko) s[x_buf].pragma(s[x_buf].op.axis[0], env.dma_copy) s[w_buf].compute_at(s[y_gem], ko) s[w_buf].pragma(s[w_buf].op.axis[0], env.dma_copy) s[y_shf].pragma(s[y_shf].op.axis[0], env.alu) s[y_max].pragma(s[y_max].op.axis[0], env.alu) s[y_min].pragma(s[y_min].op.axis[0], env.alu) s[y].pragma(s[y].op.axis[0], env.dma_copy) # tensorization s[y_gem].reorder(ko, s[y_gem].op.axis[0], s[y_gem].op.axis[1], s[y_gem].op.axis[2], s[y_gem].op.axis[3], ki) s[y_gem].tensorize(s[y_gem].op.axis[2], env.gemm) verify(s, name="default") def test_smt(): # test smt schedule s = te.create_schedule(y.op) s[x_buf].set_scope(env.inp_scope) s[w_buf].set_scope(env.wgt_scope) s[y_gem].set_scope(env.acc_scope) s[y_shf].set_scope(env.acc_scope) s[y_max].set_scope(env.acc_scope) s[y_min].set_scope(env.acc_scope) abo, aco, abi, aci = s[y].op.axis 
abo1, abo2 = s[y].split(abo, nparts=2) s[y].bind(abo1, te.thread_axis("cthread")) s[y_gem].compute_at(s[y], abo1) s[y_shf].compute_at(s[y], abo1) s[y_max].compute_at(s[y], abo1) s[y_min].compute_at(s[y], abo1) s[y_gem].reorder(ko, s[y_gem].op.axis[0], s[y_gem].op.axis[1], s[y_gem].op.axis[2], s[y_gem].op.axis[3], ki) s[y_gem].tensorize(s[y_gem].op.axis[2], env.gemm) s[y_shf].pragma(s[y_shf].op.axis[0], env.alu) s[y_max].pragma(s[y_max].op.axis[0], env.alu) s[y_min].pragma(s[y_min].op.axis[0], env.alu) s[x_buf].compute_at(s[y_gem], ko) s[x_buf].pragma(s[x_buf].op.axis[0], env.dma_copy) s[w_buf].compute_at(s[y_gem], ko) s[w_buf].pragma(s[w_buf].op.axis[0], env.dma_copy) s[y].pragma(abo2, env.dma_copy) verify(s, name="smt") test_schedule1() test_smt()
def dot_16x1x16_uint8_int8_int16(): """ Int8 dot product by every 2 elements using AVX512 Skylake instructions. This function takes two arrays of uint8 and int8 datatype -- data[2] and kernel[4][32][2] -- and computes a dot product of data[2] with every 2 elements of kernels, resulting in output[4][32] of int16 datatype. The pseudo code is as follows. .. code-block:: c void dot_16x1x16_uint8_int8_int16(uint8 data[2], int8 kernel[32*4][2], int16 output[32*4]){ for (int i = 0; i< 4; i++){ for (int j = 0; j < 32; j++){ output[i][i] = 0; for (int k = 0; k < 2; k++){ output[i][j][k] += data[k] * kernel[i][j][k] } } } } Physically, the kernel array sits in four AVX512 vector registers and the data[2] is broadcasted to another AVX512 vector register. This function returns a TensorIntrin that can be used to tensorize a schedule. Returns ------- intrin : TensorIntrin The Skylake int8 TensorIntrin that can be used in tensorizing schedule """ int16_lanes = 4 * 32 # 4*32 int32 lanes in 4 AVX512 vector registers num_int8_elements = 2 # 2 int8 elements in int16 data = te.placeholder((num_int8_elements, ), dtype="uint8", name="data") kernel = te.placeholder((int16_lanes, num_int8_elements), dtype="int8", name="kernel") k = te.reduce_axis((0, num_int8_elements), name="k") C = te.compute( (int16_lanes, ), lambda i: te.sum( data[k].astype("int16") * kernel[i, k].astype("int16"), axis=k), name="C", ) a_buffer = tvm.tir.decl_buffer(data.shape, dtype="uint8", name="a_buffer", offset_factor=1, strides=[1]) b_buffer = tvm.tir.decl_buffer(kernel.shape, dtype="int8", name="b_buffer", offset_factor=1) # strides=[te.var('ldw'), 1, 1]) def _intrin_func(ins, outs): def _instr(index): ib = tvm.tir.ir_builder.create() if index == 1: for i in range(4): ib.emit(outs[0].vstore([i * 32], tvm.tir.const(0, "int16x32"))) return ib.get() a_int8 = ins[0].vload([0], "uint8x2") re_int16 = tvm.tir.call_intrin("int16", "tir.reinterpret", a_int8) vec_ai16 = re_int16.astype("int16x32") vec_a = tvm.tir.call_intrin("int8x64", "tir.reinterpret", vec_ai16) for i in range(4): vec_b = ins[1].vload([i * 32, 0], "int8x64") pair_reduction = tvm.tir.call_llvm_pure_intrin( "int16x32", "llvm.x86.avx512.pmaddubs.w.512", tvm.tir.const(0, "uint32"), vec_a, vec_b, ) if index == 0: ib.emit(outs[0].vstore([i * 32], pair_reduction)) else: ib.emit(outs[0].vstore( [i * 32], pair_reduction + outs[0].vload([i * 32], "int16x32"))) return ib.get() # body, reset, update return _instr(0), _instr(1), _instr(2) buffer_params = {"offset_factor": 1} return te.decl_tensor_intrin( C.op, _intrin_func, binds={ data: a_buffer, kernel: b_buffer }, default_buffer_params=buffer_params, )
Apad = te.compute(
    (in_size + 2 * pad, in_size + 2 * pad, in_channel, batch),
    lambda yy, xx, cc, nn: tvm.tir.if_then_else(
        tvm.tir.all(yy >= pad, yy - pad < in_size, xx >= pad, xx - pad < in_size),
        A[yy - pad, xx - pad, cc, nn],
        tvm.tir.const(0., "float32")),
    name='Apad')

# Create reduction variables
rc = te.reduce_axis((0, in_channel), name='rc')
ry = te.reduce_axis((0, kernel), name='ry')
rx = te.reduce_axis((0, kernel), name='rx')

# Compute the convolution
B = te.compute(
    (out_size, out_size, out_channel, batch),
    lambda yy, xx, ff, nn: te.sum(
        Apad[yy * stride + ry, xx * stride + rx, rc, nn] * W[ry, rx, rc, ff],
        axis=[ry, rx, rc]),
    name='B')

###############################################################################
# Memory Hierarchy
# ----------------
#
# We first specify the memory hierarchy for buffers. The figure below shows the
# GPU memory hierarchy. One important difference from the CPU memory hierarchy is
# that the GPU provides a cache buffer called shared memory, which is managed by
# the programmer. Maximizing data reuse in shared memory is therefore critical to
# achieving high performance in GPU kernels.
#
# .. image:: https://github.com/dmlc/web-data/raw/master/tvm/tutorial/gpu_memory_hierarchy.png
#      :align: center
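# A sketch of how the buffers above could be mapped onto that hierarchy (the scopes "shared"
# and "local" are TVM's standard scope names; the exact schedule used later in the tutorial
# may differ in details such as virtual threads and tiling factors).
s = te.create_schedule(B.op)
s[Apad].compute_inline()                # fold the padding stage into its consumer
AA = s.cache_read(Apad, "shared", [B])  # stage input tiles into shared memory
WW = s.cache_read(W, "shared", [B])     # stage the weights into shared memory
AL = s.cache_read(AA, "local", [B])     # per-thread register tiles
WL = s.cache_read(WW, "local", [B])
BL = s.cache_write(B, "local")          # accumulate the output in registers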
def dot_16x1x16_uint8_int8_int32_cascadelake(): """ Int8 dot product by every 4 elements using AVX512VNNI Cascade Lake instructions. This function takes two arrays of uint8 and int8 datatype -- data[4] and kernel[16][4] -- and computes a dot product of data[4] with every 4 elements of kernels, resulting in output[16] of int32 datatype. The pseudo code is as follows. .. code-block:: c void dot_16x1x16_uint8_int8_int32_cascadelake(uint8 data[4], int8 kernel[16][4], int32 output[16]){ for (int i = 0; i < 16; i++){ output[i] = 0; for (int k = 0; k < 4; k++){ output[i] += data[k] * kernel[i][k] } } } Physically, the kernel array sits in an AVX512 vector register and the data[4] is broadcasted to another AVX512 vector register. This function returns a TensorIntrin that can be used to tensorize a schedule. Returns ------- intrin : TensorIntrin The Cascade Lake int8 TensorIntrin that can be used in tensorizing schedule """ int32_lanes = 16 # 16 int32 lanes in AVX512 num_int8_elements = 4 # 4 int8 elements in int32 data = te.placeholder((num_int8_elements, ), dtype="uint8", name="data") kernel = te.placeholder((int32_lanes, num_int8_elements), dtype="int8", name="kernel") k = te.reduce_axis((0, num_int8_elements), name="k") C = te.compute( (int32_lanes, ), lambda i: te.sum( data[k].astype("int32") * kernel[i, k].astype("int32"), axis=k), name="C", ) a_buffer = tvm.tir.decl_buffer(data.shape, dtype="uint8", name="a_buffer", offset_factor=1, strides=[1]) b_buffer = tvm.tir.decl_buffer(kernel.shape, dtype="int8", name="b_buffer", offset_factor=1, strides=[te.var("ldw"), 1]) def _intrin_func(ins, outs): def _instr(index): ib = tvm.tir.ir_builder.create() if index == 1: ib.emit(outs[0].vstore(0, tvm.tir.const(0, "int32x16"))) return ib.get() a_int8 = ins[0].vload([0], "uint8x4") re_int32 = tvm.tir.call_intrin("int32", "tir.reinterpret", a_int8) vec_ai32 = re_int32.astype("int32x16") vec_b = ins[1].vload([0, 0], "int8x64") vnni_inst_name = "llvm.x86.avx512.vpdpbusd.512" llvm_id = tvm.target.codegen.llvm_lookup_intrinsic_id( vnni_inst_name) if llvm_id != 0: # VNNI is available for current LLVM version vec_bi32 = tvm.tir.call_intrin("int32x16", "tir.reinterpret", vec_b) vec_zero = tvm.tir.const(0, "int32x16") quad_reduction = tvm.tir.call_llvm_pure_intrin( "int32x16", "llvm.x86.avx512.vpdpbusd.512", tvm.tir.const(0, "uint32"), vec_zero, vec_ai32, vec_bi32, ) else: # Fall back to the normal AVX512 vec_a = tvm.tir.call_intrin("int8x64", "tir.reinterpret", vec_ai32) vec_one = tvm.tir.const(1, "int16x32") pair_reduction = tvm.tir.call_llvm_pure_intrin( "int16x32", "llvm.x86.avx512.pmaddubs.w.512", tvm.tir.const(0, "uint32"), vec_a, vec_b, ) quad_reduction = tvm.tir.call_llvm_pure_intrin( "int32x16", "llvm.x86.avx512.pmaddw.d.512", tvm.tir.const(0, "uint32"), pair_reduction, vec_one, ) if index == 0: ib.emit(outs[0].vstore(0, quad_reduction)) else: ib.emit(outs[0].vstore( 0, quad_reduction + outs[0].vload([0], "int32x16"))) return ib.get() # body, reset, update return _instr(0), _instr(1), _instr(2) buffer_params = {"offset_factor": 1} return te.decl_tensor_intrin( C.op, _intrin_func, binds={ data: a_buffer, kernel: b_buffer }, default_buffer_params=buffer_params, )
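# Hypothetical NumPy reference (not from the original source) for the pseudo
# code above: 16 independent dot products of a broadcast uint8[4] vector with
# the int8[16][4] kernel rows, accumulated in int32.
import numpy as np

data_np = np.random.randint(0, 256, size=(4, )).astype("uint8")
kernel_np = np.random.randint(-128, 128, size=(16, 4)).astype("int8")
output_np = (kernel_np.astype("int32") * data_np.astype("int32")).sum(axis=1)
assert output_np.shape == (16, ) and output_np.dtype == np.int32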
from __future__ import absolute_import, print_function

import tvm
from tvm import te
import numpy as np

N, M, L = 1024, 512, 64
A = te.placeholder((N, L), name='A')
B = te.placeholder((M, L), name='B')
k = te.reduce_axis((0, L), name='k')
C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[j, k], axis=k), name="C")
s = te.create_schedule(C.op)
print(tvm.lower(s, [A, B, C], simple_mode=True))

factor = 16
x, y = C.op.axis
(z, ) = C.op.reduce_axis
yo, yi = s[C].split(y, factor=factor)
s[C].reorder(x, yo, yi, z)
print(tvm.lower(s, [A, B, C], simple_mode=True))


def intrin_gemv(m, l):
    a = te.placeholder((l, ), name="a")
    b = te.placeholder((m, l), name="b")
    k = te.reduce_axis((0, l), name="k")
    c = te.compute((m, ), lambda i: te.sum(a[k] * b[i, k], axis=k), name="c")
    Ab = tvm.tir.decl_buffer(a.shape, a.dtype,
def dot_16x1x16_uint8_int8_int32_skylake(): """ Int8 dot product by every 4 elements using AVX512 Skylake instructions. This function takes two arrays of uint8 and int8 datatype -- data[4] and kernel[16][4] -- and computes a dot product of data[4] with every 4 elements of kernels, resulting in output[16] of int32 datatype. The pseudo code is as follows. .. code-block:: c void dot_16x1x16_uint8_int8_int32(uint8 data[4], int8 kernel[16][4], int32 output[16]){ for (int i = 0; i < 16; i++){ output[i] = 0; for (int k = 0; k < 4; k++){ output[i] += data[k] * kernel[i][k] } } } Physically, the kernel array sits in an AVX512 vector register and the data[4] is broadcasted to another AVX512 vector register. This function returns a TensorIntrin that can be used to tensorize a schedule. Returns ------- intrin : TensorIntrin The Skylake int8 TensorIntrin that can be used in tensorizing schedule """ int32_lanes = get_simd_32bit_lanes() num_int8_elements = 4 # 4 int8 elements in int32 data = te.placeholder((num_int8_elements, ), dtype="uint8", name="data") kernel = te.placeholder((int32_lanes, num_int8_elements), dtype="int8", name="kernel") k = te.reduce_axis((0, num_int8_elements), name="k") C = te.compute( (int32_lanes, ), lambda i: te.sum( data[k].astype("int32") * kernel[i, k].astype("int32"), axis=k), name="C", ) a_buffer = tvm.tir.decl_buffer(data.shape, dtype="uint8", name="a_buffer", offset_factor=1, strides=[1]) b_buffer = tvm.tir.decl_buffer(kernel.shape, dtype="int8", name="b_buffer", offset_factor=1, strides=[te.var("ldw"), 1]) def _intrin_func(ins, outs): def _instr(index): # int_lx32 - output datatype after pmaddubs - 16 bits to number of lanes # int_8xl - input datatype to pmaddubs - 8 bits to number of lanes # int_32xl - output datatype after pmaddw - 32 bits per number of lanes if int32_lanes == 4: int_lx32 = "int16x8" int_8xl = "int8x16" int_32xl = "int32x4" pmaddubs = "llvm.x86.ssse3.pmadd.ub.sw.128" pmaddw = "llvm.x86.sse2.pmadd.wd" elif int32_lanes == 8: int_lx32 = "int16x16" int_8xl = "int8x32" int_32xl = "int32x8" pmaddubs = "llvm.x86.avx2.pmadd.ub.sw" pmaddw = "llvm.x86.avx2.pmadd.wd" elif int32_lanes == 16: int_lx32 = "int16x32" int_8xl = "int8x64" int_32xl = "int32x16" pmaddubs = "llvm.x86.avx512.pmaddubs.w.512" pmaddw = "llvm.x86.avx512.pmaddw.d.512" ib = tvm.tir.ir_builder.create() if index == 1: ib.emit(outs[0].vstore(0, tvm.tir.const(0, int_32xl))) return ib.get() a_int8 = ins[0].vload([0], "uint8x4") re_int32 = tvm.tir.call_intrin("int32", "tir.reinterpret", a_int8) vec_ai32 = re_int32.astype(int_32xl) vec_a = tvm.tir.call_intrin(int_8xl, "tir.reinterpret", vec_ai32) vec_b = ins[1].vload([0, 0], int_8xl) vec_one = tvm.tir.const(1, int_lx32) pair_reduction = tvm.tir.call_llvm_pure_intrin( int_lx32, pmaddubs, tvm.tir.const(0, "uint32"), vec_a, vec_b, ) quad_reduction = tvm.tir.call_llvm_pure_intrin( int_32xl, pmaddw, tvm.tir.const(0, "uint32"), pair_reduction, vec_one, ) if index == 0: ib.emit(outs[0].vstore(0, quad_reduction)) else: ib.emit(outs[0].vstore( 0, quad_reduction + outs[0].vload([0], int_32xl))) return ib.get() # body, reset, update return _instr(0), _instr(1), _instr(2) buffer_params = {"offset_factor": 1} return te.decl_tensor_intrin( C.op, _intrin_func, binds={ data: a_buffer, kernel: b_buffer }, default_buffer_params=buffer_params, )
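# Hypothetical usage sketch (not from the original source): the number of int32
# lanes above is target-dependent, so the compute being tensorized must be
# shaped to match. Here we assume an AVX512 target, for which
# get_simd_32bit_lanes() reports 16 lanes.
import tvm
from tvm import te

lanes = 16  # assumed: llvm -mcpu=skylake-avx512
data = te.placeholder((4, ), dtype="uint8", name="data")
kernel = te.placeholder((lanes, 4), dtype="int8", name="kernel")
k = te.reduce_axis((0, 4), name="k")
out = te.compute(
    (lanes, ),
    lambda i: te.sum(data[k].astype("int32") * kernel[i, k].astype("int32"), axis=k),
    name="out",
)
s = te.create_schedule(out.op)
with tvm.target.Target("llvm -mcpu=skylake-avx512"):
    s[out].tensorize(out.op.axis[0], dot_16x1x16_uint8_int8_int32_skylake())
    print(tvm.lower(s, [data, kernel, out], simple_mode=True))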
def conv2d_spatial_pack_nchw(cfg, data, kernel, strides, padding, dilation, out_dtype, num_tile): """compute define for Conv2d Spatial Pack with NCHW layout""" out_dtype = out_dtype or data.dtype N, CI, IH, IW = get_const_tuple(data.shape) if isinstance(dilation, int): dilation_h = dilation_w = dilation else: dilation_h, dilation_w = dilation if len(kernel.shape) == 4: pre_packed = False CO, _, KH, KW = get_const_tuple(kernel.shape) else: # kernel tensor is pre packed pre_packed = True CO, _, KH, KW, VC = get_const_tuple(kernel.shape) CO = CO * VC dilated_kernel_h = (KH - 1) * dilation_h + 1 dilated_kernel_w = (KW - 1) * dilation_w + 1 pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple( padding, (dilated_kernel_h, dilated_kernel_w)) HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides) OH = (IH + pad_top + pad_bottom - dilated_kernel_h) // HSTR + 1 OW = (IW + pad_left + pad_right - dilated_kernel_w) // WSTR + 1 data_pad = nn.pad(data, [0, 0, pad_top, pad_left], [0, 0, pad_bottom, pad_right]) # ==================== define configuration space ==================== n, co, oh, ow = cfg.axis(N), cfg.axis(CO), cfg.axis(OH), cfg.axis(OW) ci, kh, kw = cfg.reduce_axis(CI), cfg.reduce_axis(KH), cfg.reduce_axis(KW) if num_tile == 2: # for arm cpu co, vc = cfg.define_split('tile_co', co, num_outputs=2) oh, vh = cfg.define_split('tile_oh', oh, num_outputs=2) ow, vw = cfg.define_split('tile_ow', ow, num_outputs=2) elif num_tile == 3: # for mali gpu co, _, vc = cfg.define_split('tile_co', co, num_outputs=3) oh, _, vh = cfg.define_split('tile_oh', oh, num_outputs=3) ow, _, vw = cfg.define_split('tile_ow', ow, num_outputs=3) else: raise RuntimeError("Invalid num_tile") cfg.define_reorder("reorder_0", [n, co, oh, ow, ci, kh, kw, vh, vw, vc], policy='candidate', candidate=[ [n, co, oh, ow, ci, kh, kw, vh, vw, vc], [n, co, oh, ow, ci, kh, kw, vc, vh, vw]]) cfg.define_annotate("ann_reduce", [kh, kw], policy='try_unroll') cfg.define_annotate("ann_spatial", [vh, vw, vc], policy='try_unroll_vec') # fallback support if cfg.is_fallback: if num_tile == 2: # arm cpu ref_log = autotvm.tophub.load_reference_log( 'arm_cpu', 'rk3399', 'conv2d_nchw_spatial_pack.arm_cpu') cfg.fallback_with_reference_log(ref_log) elif num_tile == 3: # mali gpu ref_log = autotvm.tophub.load_reference_log( 'mali', 'rk3399', 'conv2d_nchw_spatial_pack.mali') cfg.fallback_with_reference_log(ref_log) # ==================================================================== VC = cfg["tile_co"].size[-1] VH = cfg["tile_oh"].size[-1] VW = cfg["tile_ow"].size[-1] kvshape = (CO // VC, CI, KH, KW, VC) ovshape = (N, CO // VC, OH // VH, OW // VW, VH, VW, VC) oshape = (N, CO, OH, OW) if dilation_h != 1 or dilation_w != 1: # undilate input data dvshape = (N, OH // VH, OW // VW, CI, KH, KW, VH, VW) data_vec = te.compute(dvshape, lambda n, h, w, ci, kh, kw, vh, vw: data_pad[n][ci][(h*VH+vh)*HSTR+kh*dilation_h] [(w*VW+vw)*WSTR+kw*dilation_w], name='data_vec_undilated') else: dvshape = (N, OH // VH, OW // VW, CI, VH*HSTR + KH-1, VW*WSTR + KW-1) data_vec = te.compute(dvshape, lambda n, h, w, ci, vh, vw: data_pad[n][ci][h*VH*HSTR+vh][w*VW*WSTR+vw], name='data_vec') if pre_packed: kernel_vec = kernel else: kernel_vec = te.compute(kvshape, lambda co, ci, kh, kw, vc: kernel[co*VC+vc][ci][kh][kw], name='kernel_vec') ci = te.reduce_axis((0, CI), name='ci') kh = te.reduce_axis((0, KH), name='kh') kw = te.reduce_axis((0, KW), name='kw') if dilation_h != 1 or dilation_w != 1: conv = te.compute(ovshape, lambda n, co, h, w, vh, vw, vc: \ 
te.sum(data_vec[n, h, w, ci, kh, kw, vh, vw].astype(out_dtype) * kernel_vec[co, ci, kh, kw, vc].astype(out_dtype), axis=[ci, kh, kw]), name='conv') else: conv = te.compute(ovshape, lambda n, co, h, w, vh, vw, vc: \ te.sum(data_vec[n, h, w, ci, vh*HSTR+kh, vw*WSTR+kw].astype(out_dtype) * kernel_vec[co, ci, kh, kw, vc].astype(out_dtype), axis=[ci, kh, kw]), name='conv') idxdiv = tvm.tir.indexdiv idxmod = tvm.tir.indexmod output = te.compute(oshape, lambda n, co, h, w: conv[n, idxdiv(co, VC), idxdiv(h, VH), idxdiv(w, VW), idxmod(h, VH), idxmod(w, VW), idxmod(co, VC)], name='output_unpack', tag='spatial_conv2d_output') return output
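# Hypothetical worked example (not from the original source): the packed shapes
# used above for a small NCHW workload with N=1, CI=16, H=W=56, CO=32, a 3x3
# kernel, stride 1, padding 1, no dilation, and tile sizes VH=VW=2, VC=4.
N, CI, IH, IW, CO, KH, KW = 1, 16, 56, 56, 32, 3, 3
HSTR = WSTR = 1
pad = 1
VH, VW, VC = 2, 2, 4
OH = (IH + 2 * pad - KH) // HSTR + 1  # 56
OW = (IW + 2 * pad - KW) // WSTR + 1  # 56
dvshape = (N, OH // VH, OW // VW, CI, VH * HSTR + KH - 1, VW * WSTR + KW - 1)
kvshape = (CO // VC, CI, KH, KW, VC)
ovshape = (N, CO // VC, OH // VH, OW // VW, VH, VW, VC)
print(dvshape)  # (1, 28, 28, 16, 4, 4)
print(kvshape)  # (8, 16, 3, 3, 4)
print(ovshape)  # (1, 8, 28, 28, 2, 2, 4)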
def _decl_winograd(cfg, data, kernel, strides, padding, dilation, out_dtype, tile_size): N, CI, IH, IW = get_const_tuple(data.shape) if isinstance(dilation, int): dilation_h = dilation_w = dilation else: dilation_h, dilation_w = dilation if len(kernel.shape) == 4: if dilation_h != 1 or dilation_w != 1: kernel = nn.dilate(kernel, (1, 1, dilation_h, dilation_w)) pre_computed = False CO, _, KH, KW = get_const_tuple(kernel.shape) else: assert (dilation_h, dilation_w) == (1, 1), "Does not support dilation" pre_computed = True H_CAT, W_CAT, CO, CI, VC = get_const_tuple(kernel.shape) CO *= VC KH, KW = H_CAT - tile_size + 1, W_CAT - tile_size + 1 HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides) pt, pl, pb, pr = nn.get_pad_tuple(padding, (KH, KW)) assert KH == 3 and KW == 3 and HSTR == 1 and WSTR == 1 data_pad = nn.pad(data, (0, 0, pt, pl), (0, 0, pb, pr), name="data_pad") r = KW m = tile_size alpha = m + r - 1 A, B, G = winograd_transform_matrices(m, r, out_dtype) H = (IH + pt + pb - 3) // HSTR + 1 W = (IW + pl + pr - 3) // WSTR + 1 nH, nW = (H + m - 1) // m, (W + m - 1) // m P = N * nH * nW ##### space definition begin ##### tile_bna_candidates = [1, 2, 4, 8, 16] factors = get_factors(CO) cfg.define_knob('tile_bna', [x for x in tile_bna_candidates if x in factors]) cfg.define_knob('tile_bnb', [1, 2, 4, 8, 16]) cfg.define_split('tile_t1', CI, num_outputs=2, max_factor=128) cfg.define_split('tile_t2', CO, num_outputs=2, max_factor=128) cfg.define_split('c_unroll', CI, num_outputs=2, max_factor=8) cfg.define_knob('yt', [1, 2, 4, 8, 16, 32]) ##### space definition end ##### if cfg.is_fallback: cfg['tile_bnb'].val = 4 cfg['tile_bna'].val = 4 while CO % cfg['tile_bna'].val != 0: cfg['tile_bna'].val //= 2 cfg['yt'].val = 8 cfg.fallback_split('tile_t1', [-1, 128]) cfg.fallback_split('tile_t2', [-1, 128]) cfg.fallback_split('c_unroll', [-1, 8]) bna = cfg['tile_bna'].val bnb = cfg['tile_bnb'].val P_round = (P + bnb - 1) // bnb * bnb assert CO % bna == 0 and P_round % bnb == 0 # pack input tile input_tile = te.compute( (CI, P_round // bnb, alpha, alpha, bnb), lambda ci, b, eps, nu, bb: \ tvm.tir.if_then_else( b * bnb + bb < P, data_pad[(b*bnb+bb) // (nH*nW)][ci][(b*bnb+bb) // nW % nH * m + eps] [(b*bnb+bb) % nW * m + nu], tvm.tir.const(0, data_pad.dtype)), name='d') if autotvm.GLOBAL_SCOPE.in_tuning: VC = cfg['tile_k'].size[-1] kvshape = (KH + tile_size - 1, KW + tile_size - 1, tvm.tir.indexdiv(CO, VC), CI, VC) U = tvm.te.placeholder(kvshape, kernel.dtype, name="U") else: # transform kernel if pre_computed: U = kernel else: r_kh = te.reduce_axis((0, KH), 'r_kh') r_kw = te.reduce_axis((0, KW), 'r_kw') U = te.compute( (alpha, alpha, CO // bna, CI, bna), lambda eps, nu, co, ci, vco: te.sum(kernel[co * bna + vco][ci][ r_kh][r_kw] * G[eps][r_kh] * G[nu][r_kw], axis=[r_kh, r_kw]), name='U') # transform image r_a = te.reduce_axis((0, alpha), 'r_a') r_b = te.reduce_axis((0, alpha), 'r_b') V = te.compute((alpha, alpha, P_round // bnb, CI, bnb), lambda eps, nu, p, ci, vp: te.sum(input_tile[ci][p][r_a][ r_b][vp] * B[r_a][eps] * B[r_b][nu], axis=[r_a, r_b]), name='V') idxdiv = tvm.tir.indexdiv idxmod = tvm.tir.indexmod # batch gemm ci = te.reduce_axis((0, CI), name='c') M = te.compute( (alpha, alpha, CO, P_round), lambda eps, nu, co, p: te.sum(U[eps][nu][idxdiv(co, bna)][ci][idxmod( co, bna)] * V[eps][nu][idxdiv(p, bnb)][ci][idxmod(p, bnb)], axis=ci), name='M') r_a = te.reduce_axis((0, alpha), 'r_a') r_b = te.reduce_axis((0, alpha), 'r_b') Y = te.compute( (CO, P, m, m), lambda co, p, 
vh, vw: te.sum( M[r_a][r_b][co][p] * A[r_a][vh] * A[r_b][vw], axis=[r_a, r_b]), name='Y') # unpack output output = te.compute( (N, CO, H, W), lambda n, co, h, w: Y[co, n * nH * nW + idxdiv(h, m) * nW + idxdiv( w, m), idxmod(h, m), idxmod(w, m)] # The following hack term is used to make the padding in batch gemm ("M") # effective, otherwise the padding will be eliminated by bound inference. # Use `tvm.tir.Mul` instead of `*` to avoid issues in const folding. + tvm.tir.Mul(tvm.tir.const(0, out_dtype), M[alpha - 1][alpha - 1][ CO - 1][P_round - 1]), name='output', tag='winograd_conv2d_output') # we have to manually assign effective GFLOP for winograd cfg.add_flop(2 * N * CO * H * W * KH * KW * CI) return output
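# Hypothetical sketch (not from the original source): inspect the Winograd
# transform matrices used above for tile_size m = 2 and r = 3, assuming the
# helper is importable from tvm.topi.nn.winograd_util.
from tvm.topi.nn.winograd_util import winograd_transform_matrices

m, r = 2, 3
alpha = m + r - 1  # 4x4 transformed tiles
A, B, G = winograd_transform_matrices(m, r, "float32")
print(A.shape, B.shape, G.shape)  # (alpha, m), (alpha, alpha), (alpha, r)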
def batch_matmul_int8(cfg, x, y, out_shape=None, out_dtype=None):
    """Batch Matmul operator for int8 on CUDA.

    Parameters
    ----------
    cfg : ConfigSpace
        Autotvm tuning space config file.
    x : tvm.te.Tensor
        3-D with shape [batch, M, K].
    y : tvm.te.Tensor
        3-D with shape [batch, N, K].
    out_shape : Optional[List[int]]
        Explicit intended output shape of the computation. Can be useful
        in cases with dynamic input shapes.
    out_dtype : Optional[str]
        Specifies the output data type for mixed precision batch matmul.

    Returns
    -------
    output : tvm.te.Tensor
        3-D with shape [batch, M, N]
    """
    if out_dtype is None:
        out_dtype = x.dtype

    x_shape = get_const_tuple(x.shape)
    y_shape = get_const_tuple(y.shape)
    assert len(x_shape) == 3 and len(y_shape) == 3, "only support 3-dim batch_matmul"

    XB, M, XK = x.shape
    YB, N, YK = y.shape
    assert XB == YB or XB == 1 or YB == 1, "batch dimension doesn't match"
    assert XK == YK, "reduction dimensions of x and y are inconsistent"

    nB = tvm.te.max(XB, YB)
    nK = ((XK + 3) // 4) * 4
    reduce_k = te.reduce_axis((0, nK), name="k")

    # pad the reduction axis to a multiple of 4 for dp4a vectorization
    pad_x = te.compute(
        (XB, M, nK),
        lambda b, i, j: tvm.te.if_then_else(
            j >= XK, tvm.runtime.convert(0).astype(x.dtype), x[b, i, j]),
    )
    pad_y = te.compute(
        (YB, N, nK),
        lambda b, i, j: tvm.te.if_then_else(
            j >= YK, tvm.runtime.convert(0).astype(y.dtype), y[b, i, j]),
    )

    out = te.compute(
        (nB, M, N),
        lambda b, i, j: te.sum(
            pad_x[b if XB != 1 else 0, i, reduce_k].astype(out_dtype) * pad_y[
                b if YB != 1 else 0, j, reduce_k].astype(out_dtype),
            axis=[reduce_k],
        ),
        tag="batch_matmul_int8",
    )
    cfg.add_flop(XB * M * N * nK * 2)
    return out
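# Hypothetical NumPy reference (not from the original source) for the compute
# above: K is padded up to a multiple of 4 for the dp4a-friendly layout, which
# does not change the result because the padded entries are zero.
import numpy as np

batch, M, N, K = 2, 8, 8, 10
x_np = np.random.randint(-128, 128, size=(batch, M, K)).astype("int8")
y_np = np.random.randint(-128, 128, size=(batch, N, K)).astype("int8")
nK = ((K + 3) // 4) * 4
pad_x = np.zeros((batch, M, nK), dtype="int32")
pad_x[:, :, :K] = x_np
pad_y = np.zeros((batch, N, nK), dtype="int32")
pad_y[:, :, :K] = y_np
out_np = np.einsum("bmk,bnk->bmn", pad_x, pad_y)
ref_np = np.einsum("bmk,bnk->bmn", x_np.astype("int32"), y_np.astype("int32"))
np.testing.assert_array_equal(out_np, ref_np)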
def compute_depthwise_conv2d_nhwc(_, data, kernel, strides, padding, dilation, out_dtype): """TOPI compute callback for depthwise_conv2d nhwc Parameters ---------- cfg: ConfigEntity The config for this template data : tvm.te.Tensor 4-D with shape [batch, in_height, in_width, in_channel] kernel : tvm.te.Tensor 4-D with shape [filter_height, filter_width, in_channel, channel_multiplier] strides : list of two ints [stride_height, stride_width] padding : list of two ints [pad_height, pad_width] dilation : list of two ints [dilation_height, dilation_width] out_dtype: str The output type. This is used for mixed precision. Returns ------- output : tvm.te.Tensor 4-D with shape [batch, out_height, out_width, out_channel] """ out_dtype = out_dtype or data.dtype N, IH, IW, IC = get_const_tuple(data.shape) if isinstance(dilation, int): dilation_h = dilation_w = dilation else: dilation_h, dilation_w = dilation KH, KW, IC, channel_multiplier = get_const_tuple(kernel.shape) dilated_kernel_h = (KH - 1) * dilation_h + 1 dilated_kernel_w = (KW - 1) * dilation_w + 1 pad_top, pad_left, pad_down, pad_right = get_pad_tuple( padding, (dilated_kernel_h, dilated_kernel_w)) HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides) OH = (IH + pad_top + pad_down - dilated_kernel_h) // HSTR + 1 OW = (IW + pad_left + pad_right - dilated_kernel_w) // WSTR + 1 if pad_top or pad_left or pad_down or pad_right: data_pad = nn.pad(data, [0, pad_top, pad_left, 0], [0, pad_down, pad_right, 0], name="data_pad") else: data_pad = data output_shape = (N, OH, OW, IC * channel_multiplier) idxdiv = tvm.tir.indexdiv idxmod = tvm.tir.indexmod reduce_h = te.reduce_axis((0, KH), name='reduce_h') reduce_w = te.reduce_axis((0, KW), name='reduce_w') out = te.compute( output_shape, lambda n, h, w, c: te.sum( data_pad[n, HSTR * h + dilation_h * reduce_h, w * WSTR + reduce_w * dilation_w, idxdiv(c, channel_multiplier)].astype(out_dtype) * kernel[ reduce_h, reduce_w, idxdiv(c, channel_multiplier), idxmod(c, channel_multiplier)].astype(out_dtype), axis=[reduce_h, reduce_w]), name='depthwise_conv2d_nhwc_output') return out
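# Hypothetical illustration (not from the original source) of the channel
# indexing used above: output channel c reads input channel c // multiplier
# and the filter slice c % multiplier.
channel_multiplier = 2
for c in range(6):  # IC * channel_multiplier with IC = 3
    print(c, "-> input channel", c // channel_multiplier,
          ", multiplier index", c % channel_multiplier)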
def test_tensor_core_batch_conv(): # The sizes of inputs and filters batch_size = 32 height = 14 width = 14 in_channels = 32 out_channels = 64 kernel_h = 3 kernel_w = 3 pad_h = 1 pad_w = 1 stride_h = 1 stride_w = 1 block_size = 16 block_row_warps = 2 block_col_warps = 4 warp_row_tiles = 4 warp_col_tiles = 2 warp_size = 32 chunk = 2 # Input feature map: (N, H, W, IC, n, ic) data_shape = ( batch_size // block_size, height, width, in_channels // block_size, block_size, block_size, ) # Kernel: (H, W, IC, OC, ic, oc) kernel_shape = ( kernel_h, kernel_w, in_channels // block_size, out_channels // block_size, block_size, block_size, ) # Output feature map: (N, H, W, OC, n, oc) output_shape = ( batch_size // block_size, height, width, out_channels // block_size, block_size, block_size, ) assert batch_size % block_size == 0 assert in_channels % block_size == 0 assert out_channels % block_size == 0 kh = te.reduce_axis((0, kernel_h), name="kh") kw = te.reduce_axis((0, kernel_w), name="kw") ic = te.reduce_axis((0, in_channels // block_size), name="ic") ii = te.reduce_axis((0, block_size), name="ii") # Algorithm A = te.placeholder(data_shape, name="A", dtype="float16") W = te.placeholder(kernel_shape, name="W", dtype="float16") Apad = te.compute( ( batch_size // block_size, height + 2 * pad_h, width + 2 * pad_w, in_channels // block_size, block_size, block_size, ), lambda n, h, w, i, nn, ii: tvm.tir.if_then_else( tvm.tir.all(h >= pad_h, h - pad_h < height, w >= pad_w, w - pad_w < width), A[n, h - pad_h, w - pad_w, i, nn, ii], tvm.tir.const(0.0, "float16"), ), name="Apad", ) Conv = te.compute( output_shape, lambda n, h, w, o, nn, oo: te.sum( Apad[n, h * stride_h + kh, w * stride_w + kw, ic, nn, ii].astype("float32") * W[kh, kw, ic, o, ii, oo].astype("float32"), axis=[ic, kh, kw, ii], ), name="Conv", ) s = te.create_schedule(Conv.op) s[Apad].compute_inline() AS = s.cache_read(Apad, "shared", [Conv]) WS = s.cache_read(W, "shared", [Conv]) AF = s.cache_read(AS, "wmma.matrix_a", [Conv]) WF = s.cache_read(WS, "wmma.matrix_b", [Conv]) ConvF = s.cache_write(Conv, "wmma.accumulator") block_x = te.thread_axis("blockIdx.x") block_y = te.thread_axis("blockIdx.y") block_z = te.thread_axis("blockIdx.z") thread_x = te.thread_axis("threadIdx.x") thread_y = te.thread_axis("threadIdx.y") thread_z = te.thread_axis("threadIdx.z") nc, hc, wc, oc, nnc, ooc = Conv.op.axis block_k = s[Conv].fuse(hc, wc) s[Conv].bind(block_k, block_z) nc, nci = s[Conv].split(nc, factor=warp_row_tiles) block_i, nc = s[Conv].split(nc, factor=block_row_warps) oc, oci = s[Conv].split(oc, factor=warp_col_tiles) block_j, oc = s[Conv].split(oc, factor=block_col_warps) s[Conv].reorder(block_k, block_i, block_j, nc, oc, nci, oci, nnc, ooc) s[Conv].bind(block_i, block_x) s[Conv].bind(block_j, block_y) s[Conv].bind(nc, thread_y) s[Conv].bind(oc, thread_z) s[ConvF].compute_at(s[Conv], oc) n, h, w, o, nnf, oof = ConvF.op.axis ko, ki = s[ConvF].split(ic, factor=chunk) s[ConvF].reorder(ko, kh, ki, kw, n, o, nnf, oof, ii) s[AF].compute_at(s[ConvF], kw) s[WF].compute_at(s[ConvF], kw) s[WS].compute_at(s[ConvF], kh) s[AS].compute_at(s[ConvF], kh) n, h, w, i, nn, ii = AS.op.axis tx, xo = s[AS].split(n, nparts=block_row_warps) ty, yo = s[AS].split(xo, nparts=block_col_warps) t = s[AS].fuse(nn, ii) to, ti = s[AS].split(t, factor=warp_size) s[AS].bind(tx, thread_y) s[AS].bind(ty, thread_z) s[AS].bind(ti, thread_x) kh, kw, ic, o, ii, oo = WS.op.axis tx, xo = s[WS].split(o, nparts=block_row_warps) ty, yo = s[WS].split(xo, nparts=block_col_warps) t = s[WS].fuse(ii, oo) 
to, ti = s[WS].split(t, nparts=warp_size) s[WS].bind(tx, thread_y) s[WS].bind(ty, thread_z) s[WS].bind(to, thread_x) s[WS].vectorize(ti) s[AF].tensorize(AF.op.axis[-2], intrin_wmma_load_matrix((16, 16, 16), "wmma.matrix_a")) s[WF].tensorize(WF.op.axis[-2], intrin_wmma_load_matrix((16, 16, 16), "wmma.matrix_b")) s[Conv].tensorize(nnc, intrin_wmma_store_matrix((16, 16, 16))) s[ConvF].tensorize(nnf, intrin_wmma_gemm((16, 16, 16))) func = tvm.build(s, [A, W, Conv], "cuda") dev = tvm.gpu(0) a_np = np.random.uniform(size=data_shape).astype(A.dtype) w_np = np.random.uniform(size=kernel_shape).astype(W.dtype) a = tvm.nd.array(a_np, dev) w = tvm.nd.array(w_np, dev) c = tvm.nd.array(np.zeros(output_shape, dtype=Conv.dtype), dev) evaluator = func.time_evaluator(func.entry_name, dev, number=3) print("conv2d with tensor core: %f ms" % (evaluator(a, w, c).mean * 1e3)) if VERIFY: func(a, w, c) a_np = a_np.transpose(0, 4, 1, 2, 3, 5).reshape(batch_size, height, width, in_channels) w_np = w_np.transpose(0, 1, 2, 4, 3, 5).reshape( kernel_h, kernel_w, in_channels, out_channels ) c_np = ( c.asnumpy() .transpose((0, 4, 1, 2, 3, 5)) .reshape(batch_size, height, width, out_channels) ) c_std = conv2d_nhwc_python( a_np.astype(Conv.dtype), w_np.astype(Conv.dtype), (stride_h, stride_w), (pad_h, pad_w) ).astype(Conv.dtype) np.testing.assert_allclose(c_np, c_std, rtol=1e-4, atol=1e-4)
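# Hypothetical guard (not from the original source): the tensor-core test above
# needs a CUDA GPU with compute capability 7.0 or higher, which can be checked
# before running it.
import tvm
from tvm.contrib import nvcc


def _tensorcore_available():
    dev = tvm.gpu(0)
    return dev.exist and nvcc.have_tensorcore(dev.compute_version)


if _tensorcore_available():
    test_tensor_core_batch_conv()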
def _decl_spatial_pack(cfg, data, kernel, strides, padding, dilation, out_dtype, num_tile): out_dtype = out_dtype or data.dtype N, C, IH, IW = get_const_tuple(data.shape) if isinstance(dilation, int): dilation_h = dilation_w = dilation else: dilation_h, dilation_w = dilation if len(kernel.shape) == 4: pre_packed = False C, M, KH, KW = get_const_tuple(kernel.shape) else: # kernel tensor is pre packed pre_packed = True C, M, KH, KW, VC = get_const_tuple(kernel.shape) C = C * VC dilated_kernel_h = (KH - 1) * dilation_h + 1 dilated_kernel_w = (KW - 1) * dilation_w + 1 pad_top, pad_left, pad_down, pad_right = get_pad_tuple( padding, (dilated_kernel_h, dilated_kernel_w)) HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides) OH = (IH + pad_top + pad_down - dilated_kernel_h) // HSTR + 1 OW = (IW + pad_left + pad_right - dilated_kernel_w) // WSTR + 1 # pack data HPAD = pad_top + pad_down WPAD = pad_left + pad_right DOPAD = (HPAD != 0 or WPAD != 0) if DOPAD: data_pad = nn.pad(data, (0, 0, pad_top, pad_left), (0, 0, pad_down, pad_right), name="data_pad") else: data_pad = data # fallback support # Currently, Mali schedule doesn't use it like conv2d. if cfg.is_fallback: ref_log = autotvm.tophub.load_reference_log( 'arm_cpu', 'rk3399', 'depthwise_conv2d_nchw_spatial_pack.arm_cpu') cfg.fallback_with_reference_log(ref_log) # ==================== define configuration space ==================== n, c, oh, ow = cfg.axis(N), cfg.axis(C), cfg.axis(OH), cfg.axis(OW) kh, kw = cfg.reduce_axis(KH), cfg.reduce_axis(KW) # Currently, Mali schedule doesn't use it like conv2d. # Leave num_tile for possible future use of Mali schedule if num_tile == 2: # for arm cpu co, vc = cfg.define_split('tile_co', c, num_outputs=2) oh, vh = cfg.define_split('tile_oh', oh, num_outputs=2) ow, vw = cfg.define_split('tile_ow', ow, num_outputs=2) else: raise RuntimeError("Invalid num_tile") cfg.define_reorder("reorder_0", [n, co, oh, ow, kh, kw, vh, vw, vc], policy='candidate', candidate=[[n, co, oh, ow, kh, kw, vh, vw, vc], [n, co, oh, ow, kh, kw, vc, vh, vw]]) cfg.define_reorder("reorder_1", [n, co, oh, ow, vh, vw, vc], policy='candidate', candidate=[[n, co, oh, ow, vh, vw, vc], [n, co, oh, ow, vc, vh, vw], [n, co, oh, ow, vh, vc, vw]]) cfg.define_annotate("ann_reduce", [kh, kw], policy='try_unroll') cfg.define_annotate("ann_spatial", [vh, vw, vc], policy='try_unroll_vec') # ==================================================================== VC = cfg["tile_co"].size[-1] VH = cfg["tile_oh"].size[-1] VW = cfg["tile_ow"].size[-1] kvshape = (C // VC, M, KH, KW, VC) ovshape = (N, C * M // VC, OH // VH, OW // VW, VH, VW, VC) oshape = (N, C * M, OH, OW) if dilation_h != 1 or dilation_w != 1: # undilate input data dvshape = (N, OH // VH, OW // VW, C, KH, KW, VH, VW) data_vec = te.compute( dvshape, lambda n, h, w, c, kh, kw, vh, vw: data_pad[n][c][ (h * VH + vh) * HSTR + kh * dilation_h][ (w * VW + vw) * WSTR + kw * dilation_w], name='data_vec_undilated') else: dvshape = (N, OH // VH, OW // VW, C, VH * HSTR + KH - 1, VW * WSTR + KW - 1) data_vec = te.compute(dvshape, lambda n, h, w, c, vh, vw: data_pad[n][c][ h * VH * HSTR + vh][w * VW * WSTR + vw], name='data_vec') if pre_packed: kernel_vec = kernel else: kernel_vec = te.compute( kvshape, lambda co, m, kh, kw, vc: kernel[co * VC + vc][m][kh][kw], name='kernel_vec') kh = te.reduce_axis((0, KH), name='kh') kw = te.reduce_axis((0, KW), name='kw') idxdiv = tvm.tir.indexdiv idxmod = tvm.tir.indexmod if dilation_h != 1 or dilation_w != 1: conv = te.compute( ovshape, 
lambda n, co, h, w, vh, vw, vc: \ te.sum(data_vec[n, h, w, idxdiv(co * VC + vc, M), kh, kw, vh, vw] .astype(out_dtype) * kernel_vec[idxdiv(co, M), idxmod(co, M), kh, kw, vc].astype(out_dtype), axis=[kh, kw]), name='depthwise_conv') else: conv = te.compute(ovshape, lambda n, co, h, w, vh, vw, vc: \ te.sum(data_vec[n, h, w, idxdiv((co * VC + vc), M), vh * HSTR + kh, vw * WSTR + kw].astype(out_dtype) * kernel_vec[idxdiv(co, M), idxmod(co, M), kh, kw, vc].astype(out_dtype), axis=[kh, kw]), name='depthwise_conv') output = te.compute(oshape, lambda n, co, h, w: conv[n, idxdiv(co, VC), idxdiv(h, VH), idxdiv(w, VW), idxmod(h, VH), idxmod(w, VW), idxmod(co, VC)], name='output_unpack', tag='spatial_depthwise_conv2d_nchw_output') return output
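# Hypothetical check (not from the original source) that the unpack step above
# is a pure layout change: packing an array with reshape/transpose and reading
# it back through the same idxdiv/idxmod pattern recovers the original values.
import numpy as np

N, C, OH, OW, VH, VW, VC = 1, 8, 4, 4, 2, 2, 4
out = np.random.randn(N, C, OH, OW)
packed = out.reshape(N, C // VC, VC, OH // VH, VH, OW // VW, VW).transpose(0, 1, 3, 5, 4, 6, 2)
unpacked = np.empty_like(out)
for co in range(C):
    for h in range(OH):
        for w in range(OW):
            unpacked[0, co, h, w] = packed[0, co // VC, h // VH, w // VW,
                                           h % VH, w % VW, co % VC]
np.testing.assert_allclose(unpacked, out)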
def test_tensorize_matmul(): n = 1024 m = n l = n A = te.placeholder((n, l), name='A') B = te.placeholder((m, l), name='B') k = te.reduce_axis((0, l), name='k') C = te.compute((n, m), lambda i, j: te.sum(B[j, k] * A[i, k], axis=k), name='C') def check(factor): s = te.create_schedule(C.op) x, y = C.op.axis yo, yi = s[C].split(y, factor=factor) gemv = intrin_gemv(factor, l) s[C].tensorize(yi, gemv) s = s.normalize() dom_map = tvm.te.schedule.InferBound(s) finfer = tvm.get_global_func("test.op.InferTensorizeRegion") out_dom, in_dom = finfer(s[C], dom_map) assert tvm.tir.ir_pass.Equal(out_dom[x].extent, 1) assert tvm.tir.ir_pass.Equal(out_dom[y].extent, factor) assert tvm.tir.ir_pass.Equal(out_dom[y].min, yo * factor) fmatch = tvm.get_global_func("test.op.MatchTensorizeBody") body = fmatch(s[C], out_dom, in_dom, gemv) assert tvm.tir.ir_pass.Equal( tvm.tir.ir_pass.CanonicalSimplify(body[0]), tvm.tir.ir_pass.CanonicalSimplify(gemv.op.body[0])) stmt = tvm.te.schedule.ScheduleOps(s, dom_map) tvm.lower(s, [A, B, C]) def check_rfactor(factor, rfactor): s = te.create_schedule(C.op) x, y = C.op.axis rk = C.op.reduce_axis[0] yo, yi = s[C].split(y, factor=factor) ro, ri = s[C].split(rk, factor=rfactor) s[C].reorder(yo, ro, yi, ri) gemv = intrin_gemv(factor, rfactor) s[C].tensorize(yi, gemv) s = s.normalize() dom_map = tvm.te.schedule.InferBound(s) finfer = tvm.get_global_func("test.op.InferTensorizeRegion") out_dom, in_dom = finfer(s[C], dom_map) assert tvm.tir.ir_pass.Equal(out_dom[x].extent, 1) assert tvm.tir.ir_pass.Equal(out_dom[y].extent, factor) assert tvm.tir.ir_pass.Equal(out_dom[y].min, yo * factor) fmatch = tvm.get_global_func("test.op.MatchTensorizeBody") body = fmatch(s[C], out_dom, in_dom, gemv) assert tvm.tir.ir_pass.Equal( tvm.tir.ir_pass.CanonicalSimplify(body[0]), tvm.tir.ir_pass.CanonicalSimplify(gemv.op.body[0])) stmt = tvm.te.schedule.ScheduleOps(s, dom_map) tvm.lower(s, [A, B, C]) def check_rfactor_no_reset(factor, rfactor): s = te.create_schedule(C.op) x, y = C.op.axis rk = C.op.reduce_axis[0] yo, yi = s[C].split(y, factor=factor) ro, ri = s[C].split(rk, factor=rfactor) s[C].reorder(yo, ro, yi, ri) gemv = intrin_gemv_no_reset(factor, rfactor) s[C].tensorize(yi, gemv) s = s.normalize() dom_map = tvm.te.schedule.InferBound(s) finfer = tvm.get_global_func("test.op.InferTensorizeRegion") out_dom, in_dom = finfer(s[C], dom_map) assert tvm.tir.ir_pass.Equal(out_dom[x].extent, 1) assert tvm.tir.ir_pass.Equal(out_dom[y].extent, factor) assert tvm.tir.ir_pass.Equal(out_dom[y].min, yo * factor) fmatch = tvm.get_global_func("test.op.MatchTensorizeBody") body = fmatch(s[C], out_dom, in_dom, gemv) assert tvm.tir.ir_pass.Equal( tvm.tir.ir_pass.CanonicalSimplify(body[0]), tvm.tir.ir_pass.CanonicalSimplify(gemv.op.body[0])) stmt = tvm.te.schedule.ScheduleOps(s, dom_map) tvm.lower(s, [A, B, C]) def check_rfactor_no_reset_multi_reduction(factor, rfactor): s = te.create_schedule(C.op) x, y = C.op.axis rk = C.op.reduce_axis[0] yo, yi = s[C].split(y, factor=factor) ro, ri = s[C].split(rk, factor=rfactor) roo, roi = s[C].split(ro, factor=2) s[C].reorder(yo, roo, roi, yi, ri) gemv = intrin_gemv_no_reset(factor, rfactor) s[C].tensorize(yi, gemv) s = s.normalize() dom_map = tvm.te.schedule.InferBound(s) finfer = tvm.get_global_func("test.op.InferTensorizeRegion") out_dom, in_dom = finfer(s[C], dom_map) assert tvm.tir.ir_pass.Equal(out_dom[x].extent, 1) assert tvm.tir.ir_pass.Equal(out_dom[y].extent, factor) assert tvm.tir.ir_pass.Equal(out_dom[y].min, yo * factor) fmatch = 
tvm.get_global_func("test.op.MatchTensorizeBody") body = fmatch(s[C], out_dom, in_dom, gemv) assert tvm.tir.ir_pass.Equal( tvm.tir.ir_pass.CanonicalSimplify(body[0]), tvm.tir.ir_pass.CanonicalSimplify(gemv.op.body[0])) stmt = tvm.te.schedule.ScheduleOps(s, dom_map) tvm.lower(s, [A, B, C]) check(16) check_rfactor(16, 16) check_rfactor_no_reset(16, 16) check_rfactor_no_reset_multi_reduction(16, 16)
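# Hypothetical helper sketch (not from the original source): a GEMV tensor
# intrinsic of the shape expected by the checks above. The real test defines
# its own intrin_gemv / intrin_gemv_no_reset; this sketch only illustrates the
# buffer declarations such an intrinsic needs, and "gemv_update" is a
# placeholder extern symbol.
def intrin_gemv_sketch(m, l):
    a = te.placeholder((l, ), name="a")
    b = te.placeholder((m, l), name="b")
    k = te.reduce_axis((0, l), name="k")
    c = te.compute((m, ), lambda i: te.sum(a[k] * b[i, k], axis=k), name="c")
    Ab = tvm.tir.decl_buffer(a.shape, a.dtype, name="A", offset_factor=1, strides=[1])
    Bb = tvm.tir.decl_buffer(b.shape, b.dtype, name="B", offset_factor=1,
                             strides=[te.var("s1"), 1])
    Cb = tvm.tir.decl_buffer(c.shape, c.dtype, name="C", offset_factor=1, strides=[1])

    def intrin_func(ins, outs):
        ib = tvm.tir.ir_builder.create()
        aa, bb = ins
        cc = outs[0]
        # delegate the m x l GEMV update to an external kernel
        ib.emit(
            tvm.tir.call_extern(
                "int32", "gemv_update",
                cc.access_ptr("w"), aa.access_ptr("r"), bb.access_ptr("r"),
                m, l, bb.strides[0],
            ))
        return ib.get()

    return te.decl_tensor_intrin(c.op, intrin_func, binds={a: Ab, b: Bb, c: Cb})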
def run_gemm_packed(env, remote, batch_size, channel, block): data_shape = (batch_size // env.BATCH, channel // env.BLOCK_IN, env.BATCH, env.BLOCK_IN) weight_shape = ( channel // env.BLOCK_OUT, channel // env.BLOCK_IN, env.BLOCK_OUT, env.BLOCK_IN, ) res_shape = (batch_size // env.BATCH, channel // env.BLOCK_OUT, env.BATCH, env.BLOCK_OUT) # To compute number of ops, use a x2 factor for FMA num_ops = 2 * channel * channel * batch_size ko = te.reduce_axis((0, channel // env.BLOCK_IN), name="ko") ki = te.reduce_axis((0, env.BLOCK_IN), name="ki") data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype) weight = te.placeholder(weight_shape, name="weight", dtype=env.wgt_dtype) data_buf = te.compute(data_shape, lambda *i: data(*i), "data_buf") weight_buf = te.compute(weight_shape, lambda *i: weight(*i), "weight_buf") res_gem = te.compute( res_shape, lambda bo, co, bi, ci: te.sum( data_buf[bo, ko, bi, ki].astype(env.acc_dtype) * weight_buf[ co, ko, ci, ki].astype(env.acc_dtype), axis=[ko, ki], ), name="res_gem", ) res_shf = te.compute(res_shape, lambda *i: res_gem(*i) >> 8, name="res_shf") res_max = te.compute(res_shape, lambda *i: tvm.te.max(res_shf(*i), 0), "res_max") # relu res_min = te.compute(res_shape, lambda *i: tvm.te.min(res_max(*i), (1 << (env.INP_WIDTH - 1)) - 1), "res_min") # relu res = te.compute(res_shape, lambda *i: res_min(*i).astype(env.inp_dtype), name="res") def verify(s): mod = vta.build(s, [data, weight, res], "ext_dev", env.target_host, name="gemm") temp = utils.tempdir() mod.save(temp.relpath("gemm.o")) remote.upload(temp.relpath("gemm.o")) f = remote.load_module("gemm.o") # verify dev = remote.ext_dev(0) # Data in original format data_orig = np.random.randint(-128, 128, size=(batch_size, channel)).astype(data.dtype) weight_orig = np.random.randint( -128, 128, size=(channel, channel)).astype(weight.dtype) data_packed = data_orig.reshape(batch_size // env.BATCH, env.BATCH, channel // env.BLOCK_IN, env.BLOCK_IN).transpose( (0, 2, 1, 3)) weight_packed = weight_orig.reshape(channel // env.BLOCK_OUT, env.BLOCK_OUT, channel // env.BLOCK_IN, env.BLOCK_IN).transpose( (0, 2, 1, 3)) res_np = np.zeros(res_shape).astype(res.dtype) data_arr = tvm.nd.array(data_packed, dev) weight_arr = tvm.nd.array(weight_packed, dev) res_arr = tvm.nd.array(res_np, dev) res_ref = np.zeros(res_shape).astype(env.acc_dtype) for b in range(batch_size // env.BATCH): for i in range(channel // env.BLOCK_OUT): for j in range(channel // env.BLOCK_IN): res_ref[b, i, :] += np.dot( data_packed[b, j, :].astype(env.acc_dtype), weight_packed[i, j].T.astype(env.acc_dtype), ) res_ref = np.right_shift(res_ref, 8) res_ref = np.clip(res_ref, 0, (1 << (env.INP_WIDTH - 1)) - 1).astype(res.dtype) time_f = f.time_evaluator("gemm", dev, number=20) if env.TARGET in ["sim", "tsim"]: simulator.clear_stats() cost = time_f(data_arr, weight_arr, res_arr) if env.TARGET in ["sim", "tsim"]: stats = simulator.stats() print("Execution statistics:") for k, v in stats.items(): print("\t{:<16}: {:>16}".format(k, v)) res_unpack = res_arr.numpy().reshape(batch_size // env.BATCH, channel // env.BLOCK_OUT, env.BATCH, env.BLOCK_OUT) return cost def run_schedule(load_inp, load_wgt, gemm, alu, store_out, print_ir): s = te.create_schedule(res.op) s[data_buf].set_scope(env.inp_scope) s[weight_buf].set_scope(env.wgt_scope) s[res_gem].set_scope(env.acc_scope) s[res_shf].set_scope(env.acc_scope) s[res_min].set_scope(env.acc_scope) s[res_max].set_scope(env.acc_scope) if block: bblock = block // env.BATCH iblock = block // env.BLOCK_IN oblock = 
block // env.BLOCK_OUT xbo, xco, xbi, xci = s[res].op.axis xb1, xco1, xb2, xco2 = s[res].tile(xbo, xco, bblock, oblock) store_pt = xb2 s[res_gem].compute_at(s[res], xco1) s[res_shf].compute_at(s[res], xco1) s[res_min].compute_at(s[res], xco1) s[res_max].compute_at(s[res], xco1) xbo, xco, xbi, xci = s[res_gem].op.axis # Compute one line at a time ko1, ko2 = s[res_gem].split(ko, iblock) s[res_gem].reorder(ko1, ko2, xbo, xco, xbi, xci, ki) s[data_buf].compute_at(s[res_gem], ko1) s[weight_buf].compute_at(s[res_gem], ko1) # Use VTA instructions s[data_buf].pragma(s[data_buf].op.axis[0], load_inp) s[weight_buf].pragma(s[weight_buf].op.axis[0], load_wgt) s[res_gem].tensorize(xbi, gemm) s[res_shf].pragma(s[res_shf].op.axis[0], alu) s[res_min].pragma(s[res_min].op.axis[0], alu) s[res_max].pragma(s[res_max].op.axis[0], alu) s[res].pragma(store_pt, store_out) else: xbo, xco, xbi, xci = s[res_gem].op.axis s[res_gem].reorder(ko, xbo, xco, xbi, xci, ki) # Use VTA instructions s[data_buf].pragma(s[data_buf].op.axis[0], load_inp) s[weight_buf].pragma(s[weight_buf].op.axis[0], load_wgt) s[res_gem].tensorize(xbi, gemm) s[res_shf].pragma(s[res_shf].op.axis[0], alu) s[res_min].pragma(s[res_min].op.axis[0], alu) s[res_max].pragma(s[res_max].op.axis[0], alu) s[res].pragma(s[res].op.axis[0], store_out) if print_ir: print(tvm.lower(s, [data, weight, res], simple_mode=True)) return verify(s) def gemm_normal(print_ir): mock = env.mock print("----- GEMM GOPS End-to-End Test-------") def run_test(header, print_ir): cost = run_schedule( env.dma_copy, env.dma_copy, env.gemm, env.alu, env.dma_copy, print_ir, ) gops = (num_ops / cost.mean) / float(10**9) print(header) print("\tTime cost = %g sec/op, %g GOPS" % (cost.mean, gops)) with vta.build_config(): run_test("NORMAL", print_ir) def gemm_unittest(print_ir): mock = env.mock print("----- GEMM Unit Test-------") def run_test(header, print_ir): cost = run_schedule(mock.dma_copy, mock.dma_copy, env.gemm, mock.alu, mock.dma_copy, print_ir) gops = (num_ops / cost.mean) / float(10**9) print(header) print("\tTime cost = %g sec/op, %g GOPS" % (cost.mean, gops)) with vta.build_config(): run_test("NORMAL", print_ir) def alu_unittest(print_ir): mock = env.mock print("----- ALU Unit Test-------") def run_test(header, print_ir): cost = run_schedule(mock.dma_copy, mock.dma_copy, mock.gemm, env.alu, mock.dma_copy, print_ir) gops = (num_ops / cost.mean) / float(10**9) print(header) print("\tTime cost = %g sec/op, %g GOPS" % (cost.mean, gops)) with vta.build_config(): run_test("NORMAL", print_ir) print("") def load_inp_unittest(print_ir): mock = env.mock print("----- LoadInp Unit Test-------") def run_test(header, print_ir): cost = run_schedule(env.dma_copy, mock.dma_copy, mock.gemm, mock.alu, mock.dma_copy, print_ir) gops = (num_ops / cost.mean) / float(10**9) bandwith = (batch_size * channel * env.INP_WIDTH / cost.mean) / float(10**9) print(header) print("\tTime cost = %g sec/op, %g GOPS, bandwidth=%g Gbits" % (cost.mean, gops, bandwith)) with vta.build_config(): run_test("NORMAL", print_ir) print("") def load_wgt_unittest(print_ir): mock = env.mock print("----- LoadWgt Unit Test-------") def run_test(header, print_ir): cost = run_schedule(mock.dma_copy, env.dma_copy, mock.gemm, mock.alu, mock.dma_copy, print_ir) gops = (num_ops / cost.mean) / float(10**9) bandwith = (channel * channel * env.WGT_WIDTH / cost.mean) / float(10**9) print(header) print("\tTime cost = %g sec/op, %g GOPS, bandwidth=%g Gbits" % (cost.mean, gops, bandwith)) with vta.build_config(): run_test("NORMAL", 
print_ir) print("") def store_out_unittest(print_ir): mock = env.mock print("----- StoreOut Unit Test-------") def run_test(header, print_ir): cost = run_schedule(mock.dma_copy, mock.dma_copy, mock.gemm, mock.alu, env.dma_copy, print_ir) gops = (num_ops / cost.mean) / float(10**9) bandwith = (batch_size * channel * env.OUT_WIDTH / cost.mean) / float(10**9) print(header) print("\tTime cost = %g sec/op, %g GOPS, bandwidth=%g Gbits" % (cost.mean, gops, bandwith)) with vta.build_config(): run_test("NORMAL", print_ir) print("") gemm_normal(False) gemm_unittest(False) alu_unittest(False)
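# Hypothetical driver (not from the original source): run the packed GEMM test
# on the VTA functional simulator through a local RPC session.
import vta
from tvm import rpc

env = vta.get_env()
assert env.TARGET in ["sim", "tsim"], "this sketch assumes the simulator target"
remote = rpc.LocalSession()
run_gemm_packed(env, remote, batch_size=16, channel=512, block=128)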