def test_conv2d( self, conv2d_impl, shape_nhwc, shape_oihw, shape_oihw8i32o4i, kernel, stride, pad, dtype, target, ): inputs = [ np.random.uniform(0, 255, size=shape_nhwc).astype(dtype), np.random.uniform(0, 255, size=shape_oihw8i32o4i).astype(dtype), ] np_filter = (inputs[1].transpose(0, 5, 1, 4, 6, 2, 3).reshape(shape_oihw).transpose( 2, 3, 1, 0)) ref_output = testing.conv2d_nhwc_python(inputs[0], np_filter, stride, pad) output = build_and_run( inputs, conv2d_impl, target, target, shape_nhwc=shape_nhwc, shape_oihw8i32o4i=shape_oihw8i32o4i, kernel_size=(kernel, kernel), stride=(stride, stride), padding=(pad, pad, pad, pad), dtype=dtype, ) return output, ref_output
def test_conv2d( self, batch, in_size, in_channel, pad, stride, kernel_size, out_channel, k_split_factor, h_split_factor, dtype, target, ): # TODO: no support for dilation dilation = 1 shape_input = [batch, in_size, in_size, in_channel] shape_filter_oihw = [out_channel, in_channel, kernel_size, kernel_size] shape_filter_oihw8i32o4i = get_packed_filter_shape(shape_filter_oihw) inputs = [ np.random.uniform(0, 255, size=shape_input).astype(dtype), np.random.uniform(0, 255, size=shape_filter_oihw8i32o4i).astype(dtype), ] np_filter = ( inputs[1] .transpose(0, 5, 1, 4, 6, 2, 3) .reshape(shape_filter_oihw) .transpose(2, 3, 1, 0) ) ref_output = testing.conv2d_nhwc_python(inputs[0], np_filter, stride, pad) output = build_and_run( inputs, conv2d_nhwc8h8w32c, target, target, shape_input=shape_input, pad=(pad, pad, pad, pad), stride=(stride, stride), dilation=(dilation, dilation), shape_filter=shape_filter_oihw8i32o4i, k_split_factor=k_split_factor, h_split_factor=h_split_factor, dtype=dtype, ) conv2d_verify(output, ref_output, dtype)
def test_conv2d(self, shape_nhwc, shape_oihw, kernel, stride, pad, dtype, target): inputs = [ np.random.uniform(0, 255, size=shape_nhwc).astype(dtype), np.random.uniform(0, 255, size=shape_oihw).astype(dtype), ] np_filter = inputs[1].transpose(2, 3, 1, 0) ref_output = testing.conv2d_nhwc_python(inputs[0], np_filter, stride, pad) output = build_and_run( inputs, conv2d_logical, target, target, shape_nhwc=shape_nhwc, shape_oihw=shape_oihw, kernel_size=(kernel, kernel), stride=(stride, stride), padding=(pad, pad, pad, pad), dtype=dtype, ) # nhwc8h8w32c -> nhwc output = output.transpose(0, 1, 4, 2, 5, 3, 6).reshape( output.shape[0], output.shape[1] * output.shape[4], output.shape[2] * output.shape[5], output.shape[3] * output.shape[6], ) # slice output to match ref_output shape # e.g. 8x8 spatial 3x3 filter = 6x6 ref output # but still 8x8 output given the blocked layout output = output[ 0 : ref_output.shape[0] : 1, 0 : ref_output.shape[1] : 1, 0 : ref_output.shape[2] : 1, 0 : ref_output.shape[3] : 1, ] if "int" in dtype: tol = {"atol": 0, "rtol": 0} elif dtype == "float32": tol = {"rtol": 1e-4, "atol": 2e-4} tvm.testing.assert_allclose(output, ref_output, **tol)
def test_conv2d( self, conv2d_impl, shape_nhwc, shape_oihw, kernel, stride, pad, dtype, target, k_split_factor, h_split_factor, ): inputs = [ np.random.uniform(0, 255, size=shape_nhwc).astype(dtype), np.random.uniform(0, 255, size=shape_oihw).astype(dtype), ] np_filter = inputs[1].transpose(2, 3, 1, 0) ref_output = testing.conv2d_nhwc_python(inputs[0], np_filter, stride, pad) output = build_and_run( inputs, conv2d_impl, target, target, shape_nhwc=shape_nhwc, shape_filter=shape_oihw, kernel_size=(kernel, kernel), stride=(stride, stride), padding=(pad, pad, pad, pad), dtype=dtype, k_split_factor=k_split_factor, h_split_factor=h_split_factor, ) verify_conv2d(output, ref_output, dtype)
def test_tensor_core_batch_conv(): if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"): print("skip because cuda is not enabled..") return if not nvcc.have_tensorcore(tvm.gpu(0).compute_version): print("skip because gpu does not support tensor core") return # The sizes of inputs and filters batch_size = 32 height = 14 width = 14 in_channels = 32 out_channels = 64 kernel_h = 3 kernel_w = 3 pad_h = 1 pad_w = 1 stride_h = 1 stride_w = 1 block_size = 16 block_row_warps = 2 block_col_warps = 4 warp_row_tiles = 4 warp_col_tiles = 2 warp_size = 32 chunk = 2 # Input feature map: (N, H, W, IC, n, ic) data_shape = (batch_size // block_size, height, width, in_channels // block_size, block_size, block_size) # Kernel: (H, W, IC, OC, ic, oc) kernel_shape = (kernel_h, kernel_w, in_channels // block_size, out_channels // block_size, block_size, block_size) # Output feature map: (N, H, W, OC, n, oc) output_shape = (batch_size // block_size, height, width, out_channels // block_size, block_size, block_size) assert (batch_size % block_size == 0) assert (in_channels % block_size == 0) assert (out_channels % block_size == 0) kh = te.reduce_axis((0, kernel_h), name='kh') kw = te.reduce_axis((0, kernel_w), name='kw') ic = te.reduce_axis((0, in_channels // block_size), name='ic') ii = te.reduce_axis((0, block_size), name='ii') # Algorithm A = te.placeholder(data_shape, name='A', dtype="float16") W = te.placeholder(kernel_shape, name='W', dtype="float16") Apad = te.compute( (batch_size // block_size, height + 2 * pad_h, width + 2 * pad_w, in_channels // block_size, block_size, block_size), lambda n, h, w, i, nn, ii: tvm.tir.if_then_else( tvm.tir.all(h >= pad_h, h - pad_h < height, w >= pad_w, w - pad_w < width), A[n, h - pad_h, w - pad_w, i, nn, ii], tvm.tir.const(0., "float16")), name='Apad') Conv = te.compute( output_shape, lambda n, h, w, o, nn, oo: te.sum(Apad[ n, h * stride_h + kh, w * stride_w + kw, ic, nn, ii].astype( "float32") * W[kh, kw, ic, o, ii, oo].astype("float32"), axis=[ic, kh, kw, ii]), name="Conv") s = te.create_schedule(Conv.op) s[Apad].compute_inline() AS = s.cache_read(Apad, 'shared', [Conv]) WS = s.cache_read(W, 'shared', [Conv]) AF = s.cache_read(AS, 'wmma.matrix_a', [Conv]) WF = s.cache_read(WS, 'wmma.matrix_b', [Conv]) ConvF = s.cache_write(Conv, 'wmma.accumulator') block_x = te.thread_axis('blockIdx.x') block_y = te.thread_axis('blockIdx.y') block_z = te.thread_axis('blockIdx.z') thread_x = te.thread_axis('threadIdx.x') thread_y = te.thread_axis('threadIdx.y') thread_z = te.thread_axis('threadIdx.z') nc, hc, wc, oc, nnc, ooc = Conv.op.axis block_k = s[Conv].fuse(hc, wc) s[Conv].bind(block_k, block_z) nc, nci = s[Conv].split(nc, factor=warp_row_tiles) block_i, nc = s[Conv].split(nc, factor=block_row_warps) oc, oci = s[Conv].split(oc, factor=warp_col_tiles) block_j, oc = s[Conv].split(oc, factor=block_col_warps) s[Conv].reorder(block_k, block_i, block_j, nc, oc, nci, oci, nnc, ooc) s[Conv].bind(block_i, block_x) s[Conv].bind(block_j, block_y) s[Conv].bind(nc, thread_y) s[Conv].bind(oc, thread_z) s[ConvF].compute_at(s[Conv], oc) n, h, w, o, nnf, oof = ConvF.op.axis ko, ki = s[ConvF].split(ic, factor=chunk) s[ConvF].reorder(ko, kh, ki, kw, n, o, nnf, oof, ii) s[AF].compute_at(s[ConvF], kw) s[WF].compute_at(s[ConvF], kw) s[WS].compute_at(s[ConvF], kh) s[AS].compute_at(s[ConvF], kh) n, h, w, i, nn, ii = AS.op.axis tx, xo = s[AS].split(n, nparts=block_row_warps) ty, yo = s[AS].split(xo, nparts=block_col_warps) t = s[AS].fuse(nn, ii) to, ti = s[AS].split(t, factor=warp_size) s[AS].bind(tx, thread_y) s[AS].bind(ty, thread_z) s[AS].bind(ti, thread_x) kh, kw, ic, o, ii, oo = WS.op.axis tx, xo = s[WS].split(o, nparts=block_row_warps) ty, yo = s[WS].split(xo, nparts=block_col_warps) t = s[WS].fuse(ii, oo) to, ti = s[WS].split(t, nparts=warp_size) s[WS].bind(tx, thread_y) s[WS].bind(ty, thread_z) s[WS].bind(to, thread_x) s[WS].vectorize(ti) s[AF].tensorize(AF.op.axis[-2], intrin_wmma_load_matrix((16, 16, 16), 'wmma.matrix_a')) s[WF].tensorize(WF.op.axis[-2], intrin_wmma_load_matrix((16, 16, 16), 'wmma.matrix_b')) s[Conv].tensorize(nnc, intrin_wmma_store_matrix((16, 16, 16))) s[ConvF].tensorize(nnf, intrin_wmma_gemm((16, 16, 16))) func = tvm.build(s, [A, W, Conv], 'cuda') ctx = tvm.gpu(0) a_np = np.random.uniform(size=data_shape).astype(A.dtype) w_np = np.random.uniform(size=kernel_shape).astype(W.dtype) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) c = tvm.nd.array(np.zeros(output_shape, dtype=Conv.dtype), ctx) evaluator = func.time_evaluator(func.entry_name, ctx, number=3) print('conv2d with tensor core: %f ms' % (evaluator(a, w, c).mean * 1e3)) if VERIFY: func(a, w, c) a_np = a_np.transpose(0, 4, 1, 2, 3, 5).reshape(batch_size, height, width, in_channels) w_np = w_np.transpose(0, 1, 2, 4, 3, 5).reshape(kernel_h, kernel_w, in_channels, out_channels) c_np = c.asnumpy().transpose( (0, 4, 1, 2, 3, 5)).reshape(batch_size, height, width, out_channels) c_std = conv2d_nhwc_python(a_np.astype(Conv.dtype), w_np.astype(Conv.dtype), (stride_h, stride_w), (pad_h, pad_w)).astype(Conv.dtype) np.testing.assert_allclose(c_np, c_std, rtol=1e-4, atol=1e-4)
def test_conv2d( self, batch, in_size, in_channel, pad1, stride1, kernel_size1, out_channel1, stride2, kernel_size2, out_channel2, k_split_factor, h_split_factor, dtype, target, ): # TODO: no support for padding in conv2d #2 pad2 = 0 # TODO: no support for dilation dilation1 = 1 dilation2 = 1 shape_input = [batch, in_size, in_size, in_channel] shape_filter1_oihw = [ out_channel1, in_channel, kernel_size1, kernel_size1 ] shape_filter1_oihw8i32o4i = get_packed_filter_shape(shape_filter1_oihw) shape_filter2_oihw = [ out_channel2, out_channel1, kernel_size2, kernel_size2 ] shape_filter2_oihw8i32o4i = get_packed_filter_shape(shape_filter2_oihw) inputs = [ np.random.uniform(0, 255, size=shape_input).astype(dtype), np.random.uniform(0, 255, size=shape_filter1_oihw8i32o4i).astype(dtype), np.random.uniform(0, 255, size=shape_filter2_oihw8i32o4i).astype(dtype), ] np_filter1 = (inputs[1].transpose( 0, 5, 1, 4, 6, 2, 3).reshape(shape_filter1_oihw).transpose(2, 3, 1, 0)) np_filter2 = (inputs[2].transpose( 0, 5, 1, 4, 6, 2, 3).reshape(shape_filter2_oihw).transpose(2, 3, 1, 0)) temp_output = testing.conv2d_nhwc_python(inputs[0], np_filter1, stride1, pad1) ref_output = testing.conv2d_nhwc_python(temp_output, np_filter2, stride2, pad2) output = build_and_run( inputs, conv2dconv2d_nhwc8h8w32c, target, target, shape_input=shape_input, pad1=(pad1, pad1, pad1, pad1), stride1=(stride1, stride1), dilation1=(dilation1, dilation1), shape_filter1=shape_filter1_oihw8i32o4i, pad2=(pad2, pad2, pad1, pad1), stride2=(stride2, stride2), dilation2=(dilation2, dilation2), shape_filter2=shape_filter2_oihw8i32o4i, k_split_factor=k_split_factor, h_split_factor=h_split_factor, dtype=dtype, ) conv2d_verify(output, ref_output, dtype)
def expected_output_np(self, input_np, dilated_weights_np, stride): ref_np = conv2d_nhwc_python(input_np.astype("float32"), dilated_weights_np.astype("float32"), stride, padding=0).astype("float16") return ref_np
def test_tensor_core_batch_conv(): # The sizes of inputs and filters batch_size = 32 height = 14 width = 14 in_channels = 32 out_channels = 64 kernel_h = 3 kernel_w = 3 pad_h = 1 pad_w = 1 stride_h = 1 stride_w = 1 block_size = 16 block_row_warps = 2 block_col_warps = 4 warp_row_tiles = 4 warp_col_tiles = 2 warp_size = 32 chunk = 2 # Input feature map: (N, H, W, IC, n, ic) data_shape = ( batch_size // block_size, height, width, in_channels // block_size, block_size, block_size, ) # Kernel: (H, W, IC, OC, ic, oc) kernel_shape = ( kernel_h, kernel_w, in_channels // block_size, out_channels // block_size, block_size, block_size, ) # Output feature map: (N, H, W, OC, n, oc) output_shape = ( batch_size // block_size, height, width, out_channels // block_size, block_size, block_size, ) assert batch_size % block_size == 0 assert in_channels % block_size == 0 assert out_channels % block_size == 0 kh = te.reduce_axis((0, kernel_h), name="kh") kw = te.reduce_axis((0, kernel_w), name="kw") ic = te.reduce_axis((0, in_channels // block_size), name="ic") ii = te.reduce_axis((0, block_size), name="ii") # Algorithm A = te.placeholder(data_shape, name="A", dtype="float16") W = te.placeholder(kernel_shape, name="W", dtype="float16") Apad = te.compute( ( batch_size // block_size, height + 2 * pad_h, width + 2 * pad_w, in_channels // block_size, block_size, block_size, ), lambda n, h, w, i, nn, ii: tvm.tir.if_then_else( tvm.tir.all(h >= pad_h, h - pad_h < height, w >= pad_w, w - pad_w < width), A[n, h - pad_h, w - pad_w, i, nn, ii], tvm.tir.const(0.0, "float16"), ), name="Apad", ) Conv = te.compute( output_shape, lambda n, h, w, o, nn, oo: te.sum( Apad[n, h * stride_h + kh, w * stride_w + kw, ic, nn, ii].astype( "float32") * W[kh, kw, ic, o, ii, oo].astype("float32"), axis=[ic, kh, kw, ii], ), name="Conv", ) s = te.create_schedule(Conv.op) s[Apad].compute_inline() AS = s.cache_read(Apad, "shared", [Conv]) WS = s.cache_read(W, "shared", [Conv]) AF = s.cache_read(AS, "wmma.matrix_a", [Conv]) WF = s.cache_read(WS, "wmma.matrix_b", [Conv]) ConvF = s.cache_write(Conv, "wmma.accumulator") block_x = te.thread_axis("blockIdx.x") block_y = te.thread_axis("blockIdx.y") block_z = te.thread_axis("blockIdx.z") thread_x = te.thread_axis("threadIdx.x") thread_y = te.thread_axis("threadIdx.y") thread_z = te.thread_axis("threadIdx.z") nc, hc, wc, oc, nnc, ooc = Conv.op.axis block_k = s[Conv].fuse(hc, wc) s[Conv].bind(block_k, block_z) nc, nci = s[Conv].split(nc, factor=warp_row_tiles) block_i, nc = s[Conv].split(nc, factor=block_row_warps) oc, oci = s[Conv].split(oc, factor=warp_col_tiles) block_j, oc = s[Conv].split(oc, factor=block_col_warps) s[Conv].reorder(block_k, block_i, block_j, nc, oc, nci, oci, nnc, ooc) s[Conv].bind(block_i, block_x) s[Conv].bind(block_j, block_y) s[Conv].bind(nc, thread_y) s[Conv].bind(oc, thread_z) s[ConvF].compute_at(s[Conv], oc) n, h, w, o, nnf, oof = ConvF.op.axis ko, ki = s[ConvF].split(ic, factor=chunk) s[ConvF].reorder(ko, kh, ki, kw, n, o, nnf, oof, ii) s[AF].compute_at(s[ConvF], kw) s[WF].compute_at(s[ConvF], kw) s[WS].compute_at(s[ConvF], kh) s[AS].compute_at(s[ConvF], kh) n, h, w, i, nn, ii = AS.op.axis tx, xo = s[AS].split(n, nparts=block_row_warps) ty, yo = s[AS].split(xo, nparts=block_col_warps) t = s[AS].fuse(nn, ii) to, ti = s[AS].split(t, factor=warp_size) s[AS].bind(tx, thread_y) s[AS].bind(ty, thread_z) s[AS].bind(ti, thread_x) kh, kw, ic, o, ii, oo = WS.op.axis tx, xo = s[WS].split(o, nparts=block_row_warps) ty, yo = s[WS].split(xo, nparts=block_col_warps) t = s[WS].fuse(ii, oo) to, ti = s[WS].split(t, nparts=warp_size) s[WS].bind(tx, thread_y) s[WS].bind(ty, thread_z) s[WS].bind(to, thread_x) s[WS].vectorize(ti) s[AF].tensorize(AF.op.axis[-2], intrin_wmma_load_matrix((16, 16, 16), "wmma.matrix_a")) s[WF].tensorize(WF.op.axis[-2], intrin_wmma_load_matrix((16, 16, 16), "wmma.matrix_b")) s[Conv].tensorize(nnc, intrin_wmma_store_matrix((16, 16, 16))) s[ConvF].tensorize(nnf, intrin_wmma_gemm((16, 16, 16))) func = tvm.build(s, [A, W, Conv], "cuda") dev = tvm.cuda(0) a_np = np.random.uniform(size=data_shape).astype(A.dtype) w_np = np.random.uniform(size=kernel_shape).astype(W.dtype) a = tvm.nd.array(a_np, dev) w = tvm.nd.array(w_np, dev) c = tvm.nd.array(np.zeros(output_shape, dtype=Conv.dtype), dev) evaluator = func.time_evaluator(func.entry_name, dev, number=3) print("conv2d with tensor core: %f ms" % (evaluator(a, w, c).mean * 1e3)) if VERIFY: func(a, w, c) a_np = a_np.transpose(0, 4, 1, 2, 3, 5).reshape(batch_size, height, width, in_channels) w_np = w_np.transpose(0, 1, 2, 4, 3, 5).reshape(kernel_h, kernel_w, in_channels, out_channels) c_np = (c.numpy().transpose( (0, 4, 1, 2, 3, 5)).reshape(batch_size, height, width, out_channels)) c_std = conv2d_nhwc_python(a_np.astype(Conv.dtype), w_np.astype(Conv.dtype), (stride_h, stride_w), (pad_h, pad_w)).astype(Conv.dtype) np.testing.assert_allclose(c_np, c_std, rtol=1e-4, atol=1e-4)