def check_device(device):
    ctx = tvm.context(device, 0)
    if not tvm.testing.device_enabled(device):
        print("Skip because %s is not enabled" % device)
        return
    if not nvcc.have_tensorcore(ctx.compute_version):
        print("skip because gpu does not support Tensor Cores")
        return
    print("Running on target: %s" % device)
    with tvm.target.create(device):
        fcompute, fschedule = tvm.topi.testing.dispatch(device, _conv2d_nhwc_tensorcore_implement)
        C = fcompute(A, W, stride, padding, dilation, 'float32')
        if add_bias:
            C = topi.add(C, bias)
        if add_relu:
            C = topi.nn.relu(C)
        s = fschedule([C])

    a = tvm.nd.array(a_np, ctx)
    w = tvm.nd.array(w_np, ctx)
    b = tvm.nd.array(b_np, ctx)
    c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
    if add_bias:
        func = tvm.build(s, [A, W, bias, C], device,
                         name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (
                             batch, in_channel, in_size, num_filter,
                             kernel, stride, padding_sum, dilation))
        func(a, w, b, c)
    else:
        func = tvm.build(s, [A, W, C], device,
                         name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (
                             batch, in_channel, in_size, num_filter,
                             kernel, stride, padding_sum, dilation))
        func(a, w, c)

    rtol = 1e-3
    tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=rtol)
def check_target(target):
    dev = tvm.device(target, 0)
    if not tvm.testing.device_enabled(target):
        print("Skip because %s is not enabled" % target)
        return
    if not nvcc.have_tensorcore(dev.compute_version):
        print("skip because gpu does not support Tensor Cores")
        return
    print("Running on target: %s" % target)
    with tvm.target.Target(target):
        fcompute, fschedule = topi.testing.dispatch(target, _conv2d_hwnc_tensorcore_implement)
        C = fcompute(A, W, stride, padding, dilation, dtype, "int32")
        s = fschedule([C])

    a = tvm.nd.array(a_np.transpose((1, 2, 0, 3)), dev)
    w = tvm.nd.array(w_np, dev)
    c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev)
    func = tvm.build(
        s,
        [A, W, C],
        target,
        name="relu_%d_%d_%d_%d_%d_%d_%d_%d"
        % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation),
    )
    func(a, w, c)
    rtol = 1e-3
    tvm.testing.assert_allclose(c.asnumpy().transpose((2, 0, 1, 3)), c_np, rtol=rtol)
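# The two checkers above repeat the same capability guard before doing any work.
# A minimal standalone sketch of that pattern (a sketch only, assuming a
# CUDA-enabled TVM build; `tensorcore_available` is an illustrative helper,
# not part of TVM's API):
import tvm
import tvm.testing
from tvm.contrib import nvcc

def tensorcore_available(target="cuda"):
    """Return True when `target` is enabled and the GPU exposes Tensor Cores."""
    if not tvm.testing.device_enabled(target):
        return False
    dev = tvm.device(target, 0)
    return nvcc.have_tensorcore(dev.compute_version)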
def batch_matmul_strategy_cuda(attrs, inputs, out_type, target):
    """batch_matmul cuda strategy"""
    strategy = _op.OpStrategy()
    strategy.add_implementation(
        wrap_compute_batch_matmul(topi.cuda.batch_matmul),
        wrap_topi_schedule(topi.cuda.schedule_batch_matmul),
        name="batch_matmul.cuda",
        plevel=10,
    )
    if target.kind.name == "cuda" and "cublas" in target.libs:
        strategy.add_implementation(
            wrap_compute_batch_matmul(topi.cuda.batch_matmul_cublas),
            wrap_topi_schedule(topi.generic.schedule_extern),
            name="batch_matmul_cublas.cuda",
            plevel=15,
        )
    if target.kind.name == "cuda" and nvcc.have_tensorcore(target=target):
        x, y = inputs
        _, M, K = get_const_tuple(x.shape)
        _, N, K = get_const_tuple(y.shape)
        if x.dtype in ["float16", "int8", "uint8"] and (
            (M % 8 == 0 and K % 16 == 0 and N % 32 == 0)
            or (M % 16 == 0 and K % 16 == 0 and N % 16 == 0)
            or (M % 32 == 0 and K % 16 == 0 and N % 8 == 0)
        ):
            strategy.add_implementation(
                wrap_compute_batch_matmul(topi.cuda.batch_matmul_tensorcore),
                wrap_topi_schedule(topi.cuda.schedule_batch_matmul_tensorcore),
                name="batch_matmul_tensorcore.cuda",
                plevel=20,
            )
    return strategy
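# The three divisibility clauses above correspond to the three wmma fragment
# shapes CUDA exposes for fp16/int8 GEMM: (m, n, k) = (8, 32, 16), (16, 16, 16),
# and (32, 8, 16). A hedged refactoring sketch of the same test (the helper name
# is illustrative, not part of TVM's API):
WMMA_FRAGMENT_SHAPES = [(8, 32, 16), (16, 16, 16), (32, 8, 16)]  # (m, n, k)

def fits_tensorcore(M, N, K):
    """True when the batch_matmul shape tiles evenly into some wmma fragment."""
    return any(M % m == 0 and N % n == 0 and K % k == 0
               for m, n, k in WMMA_FRAGMENT_SHAPES)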
def test_conv2d_nhwc_winograd_tensorcore():
    """Test the conv2d with winograd for nhwc layout"""
    if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
        print("skip because cuda is not enabled..")
        return
    if not nvcc.have_tensorcore(tvm.gpu(0).compute_version):
        return
    verify_conv2d_nhwc(8, 64, 56, 64, 3, 1, 1, bgemm="tensorcore")
    verify_conv2d_nhwc(8, 128, 28, 128, 3, 1, 1, bgemm="tensorcore")
    verify_conv2d_nhwc(8, 256, 14, 256, 3, 1, 1, bgemm="tensorcore")
    verify_conv2d_nhwc(2, 64, 56, 64, 3, 1, (1, 1), add_relu=True, bgemm="tensorcore")
    verify_conv2d_nhwc(2, 64, 56, 64, 3, 1, "SAME", add_relu=True, bgemm="tensorcore")
def conv3d_strategy_cuda(attrs, inputs, out_type, target):
    """conv3d cuda strategy"""
    strategy = _op.OpStrategy()
    data, kernel = inputs
    layout = attrs.data_layout
    _, stride_h, stride_w = attrs.get_int_tuple("strides")
    _, dilation_h, dilation_w = attrs.get_int_tuple("dilation")
    assert layout in ["NCDHW", "NDHWC"], "Not support this layout {} yet".format(layout)
    if layout == "NCDHW":
        strategy.add_implementation(
            wrap_compute_conv3d(topi.cuda.conv3d_ncdhw),
            wrap_topi_schedule(topi.cuda.schedule_conv3d_ncdhw),
            name="conv3d_ncdhw.cuda",
            plevel=10,
        )
        _, _, _, kh, kw = get_const_tuple(kernel.shape)
        if (2 < kh < 8 and 2 < kw < 8 and kh == kw
                and stride_h == 1 and stride_w == 1
                and dilation_h == 1 and dilation_w == 1
                and attrs["groups"] == 1):
            strategy.add_implementation(
                wrap_compute_conv3d(topi.cuda.conv3d_ncdhw_winograd),
                wrap_topi_schedule(topi.cuda.schedule_conv3d_ncdhw_winograd),
                name="conv3d_ncdhw_winograd.cuda",
                plevel=5,
            )
    else:  # layout == "NDHWC"
        strategy.add_implementation(
            wrap_compute_conv3d(topi.cuda.conv3d_ndhwc),
            wrap_topi_schedule(topi.cuda.schedule_conv3d_ndhwc),
            name="conv3d_ndhwc.cuda",
            plevel=10,
        )
        N, _, _, _, _ = get_const_tuple(data.shape)
        _, _, _, CI, CO = get_const_tuple(kernel.shape)
        if target.kind.name == "cuda":
            if nvcc.have_tensorcore(target=target):
                if ((N % 16 == 0 and CI % 16 == 0 and CO % 16 == 0)
                        or (N % 8 == 0 and CI % 16 == 0 and CO % 32 == 0)
                        or (N % 32 == 0 and CI % 16 == 0 and CO % 8 == 0)) \
                        and out_type == "float16":
                    strategy.add_implementation(
                        wrap_compute_conv3d(topi.cuda.conv3d_ndhwc_tensorcore),
                        wrap_topi_schedule(topi.cuda.schedule_conv3d_ndhwc_tensorcore),
                        name="conv3d_ndhwc_tensorcore.cuda",
                        plevel=20,
                    )
    if target.kind.name == "cuda" and "cudnn" in target.libs:
        strategy.add_implementation(
            wrap_compute_conv3d(topi.cuda.conv3d_cudnn, True),
            wrap_topi_schedule(topi.cuda.schedule_conv3d_cudnn),
            name="conv3d_cudnn.cuda",
            plevel=25,
        )
    return strategy
def test_tensor_core_batch_matmul():
    if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
        print("skip because cuda is not enabled..")
        return
    if not nvcc.have_tensorcore(tvm.gpu(0).compute_version):
        print("skip because gpu does not support tensor core")
        return
    tensor_core_batch_matmul()
def batch_matmul_strategy_cuda(attrs, inputs, out_type, target):
    """batch_matmul cuda strategy"""
    strategy = _op.OpStrategy()
    x, y = inputs
    if (
        x.dtype == "int8"
        and y.dtype == "int8"
        and out_type.dtype == "int32"
        and not attrs["transpose_a"]
        and attrs["transpose_b"]
    ):
        strategy.add_implementation(
            wrap_compute_batch_matmul(topi.cuda.batch_matmul_int8, need_out_dtype=True),
            wrap_topi_schedule(topi.cuda.schedule_batch_matmul_int8),
            name="batch_matmul_int8.cuda",
            plevel=10,
        )
    else:
        strategy.add_implementation(
            wrap_compute_batch_matmul(topi.cuda.batch_matmul, need_out_dtype=True),
            wrap_topi_schedule(topi.cuda.schedule_batch_matmul),
            name="batch_matmul.cuda",
            plevel=10,
        )
    if target.kind.name == "cuda" and "cublas" in target.libs:
        strategy.add_implementation(
            wrap_compute_batch_matmul(topi.cuda.batch_matmul_cublas),
            wrap_topi_schedule(topi.generic.schedule_extern),
            name="batch_matmul_cublas.cuda",
            plevel=15,
        )
    if (
        target.kind.name == "cuda"
        and nvcc.have_tensorcore(target=target)
        and not attrs["transpose_a"]
        and attrs["transpose_b"]
    ):
        x, y = inputs
        _, M, K = get_const_tuple(x.shape)
        _, N, K = get_const_tuple(y.shape)
        if (
            x.dtype in ["float16", "int8", "uint8"]
            and (
                (M % 8 == 0 and K % 16 == 0 and N % 32 == 0)
                or (M % 16 == 0 and K % 16 == 0 and N % 16 == 0)
                or (M % 32 == 0 and K % 16 == 0 and N % 8 == 0)
            )
        ) or (x.dtype in ["int4", "uint4"] and K % 32 == 0 and M % 8 == 0 and N % 8 == 0):
            strategy.add_implementation(
                wrap_compute_batch_matmul(topi.cuda.batch_matmul_tensorcore, need_out_dtype=True),
                wrap_topi_schedule(topi.cuda.schedule_batch_matmul_tensorcore),
                name="batch_matmul_tensorcore.cuda",
                plevel=20,
            )
    return strategy
def test_conv2d_nhwc_winograd_tensorcore():
    """Test the conv2d with winograd for nhwc layout"""
    if not nvcc.have_tensorcore(tvm.gpu(0).compute_version):
        return
    verify_conv2d_nhwc(8, 64, 56, 64, 3, 1, 1, bgemm="tensorcore")
    verify_conv2d_nhwc(8, 128, 28, 128, 3, 1, 1, bgemm="tensorcore")
    verify_conv2d_nhwc(8, 256, 14, 256, 3, 1, 1, bgemm="tensorcore")
    verify_conv2d_nhwc(2, 64, 56, 64, 3, 1, (1, 1), add_relu=True, bgemm="tensorcore")
    verify_conv2d_nhwc(2, 64, 56, 64, 3, 1, "SAME", add_relu=True, bgemm="tensorcore")
def test_tensor_core_matmul():
    if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
        print("skip because cuda is not enabled..")
        return
    if not nvcc.have_tensorcore(tvm.gpu(0).compute_version):
        print("skip because gpu does not support tensor core")
        return
    tensor_core_matmul(16)  # test with warp_tile 16x16x16
    tensor_core_matmul(8)   # test with warp_tile 8x32x16
    tensor_core_matmul(32)  # test with warp_tile 32x8x16
def dense_strategy_cuda(attrs, inputs, out_type, target):
    """dense cuda strategy"""
    strategy = _op.OpStrategy()
    data, weights = inputs
    b, i = get_const_tuple(data.shape)
    o, _ = get_const_tuple(weights.shape)
    if (target.kind.name in ["cuda", "vulkan", "rocm"]
            and data.dtype == "int8"
            and weights.dtype == "int8"
            and out_type.dtype == "int32"):
        strategy.add_implementation(
            wrap_compute_dense(topi.cuda.dense_int8),
            wrap_topi_schedule(topi.cuda.schedule_dense_int8),
            name="dense_int8.cuda",
        )
    else:
        strategy.add_implementation(
            wrap_compute_dense(topi.gpu.dense_small_batch),
            wrap_topi_schedule(topi.gpu.schedule_dense_small_batch),
            name="dense_small_batch.gpu",
        )
        with SpecializedCondition(b >= 32):
            strategy.add_implementation(
                wrap_compute_dense(topi.gpu.dense_large_batch),
                wrap_topi_schedule(topi.gpu.schedule_dense_large_batch),
                name="dense_large_batch.gpu",
                plevel=5,
            )
        if target.kind.name == "cuda":
            if nvcc.have_tensorcore(target=target):
                if ((data.dtype in ["float16", "int8", "uint8"]
                     and ((i % 16 == 0 and b % 16 == 0 and o % 16 == 0)
                          or (i % 16 == 0 and b % 8 == 0 and o % 32 == 0)
                          or (i % 16 == 0 and b % 32 == 0 and o % 8 == 0)))
                        or (data.dtype in ["int4", "uint4"]
                            and i % 32 == 0 and b % 8 == 0 and o % 8 == 0)
                        or (data.dtype in ["int1", "uint1"]
                            and i % 128 == 0 and b % 8 == 0 and o % 8 == 0)):
                    strategy.add_implementation(
                        wrap_compute_dense(topi.cuda.dense_tensorcore),
                        wrap_topi_schedule(topi.cuda.schedule_dense_tensorcore),
                        name="dense_tensorcore.cuda",
                        plevel=20,
                    )
    if target.kind.name == "cuda" and "cublas" in target.libs:
        strategy.add_implementation(
            wrap_compute_dense(topi.cuda.dense_cublas),
            wrap_topi_schedule(topi.cuda.schedule_dense_cublas),
            name="dense_cublas.cuda",
            plevel=25,
        )
    return strategy
def dense_strategy_cuda(attrs, inputs, out_type, target):
    """dense cuda strategy"""
    strategy = _op.OpStrategy()
    data, weights = inputs
    b, i = get_const_tuple(data.shape)
    o, _ = get_const_tuple(weights.shape)
    if out_type.dtype == "int8":
        strategy.add_implementation(
            wrap_compute_dense(topi.cuda.dense_int8),
            wrap_topi_schedule(topi.cuda.schedule_dense_int8),
            name="dense_int8.cuda",
        )
    else:
        strategy.add_implementation(
            wrap_compute_dense(topi.cuda.dense_small_batch),
            wrap_topi_schedule(topi.cuda.schedule_dense_small_batch),
            name="dense_small_batch.cuda",
        )
        strategy.add_auto_scheduler(
            wrap_compute_dense(topi.nn.dense),
            name="dense",
        )
        with SpecializedCondition(b >= 32):
            strategy.add_implementation(
                wrap_compute_dense(topi.cuda.dense_large_batch),
                wrap_topi_schedule(topi.cuda.schedule_dense_large_batch),
                name="dense_large_batch.cuda",
                plevel=5,
            )
        if target.kind.name == "cuda":
            if nvcc.have_tensorcore(tvm.gpu(0).compute_version):
                if (
                    (i % 16 == 0 and b % 16 == 0 and o % 16 == 0)
                    or (i % 16 == 0 and b % 8 == 0 and o % 32 == 0)
                    or (i % 16 == 0 and b % 32 == 0 and o % 8 == 0)
                ):
                    strategy.add_implementation(
                        wrap_compute_dense(topi.cuda.dense_tensorcore),
                        wrap_topi_schedule(topi.cuda.schedule_dense_tensorcore),
                        name="dense_tensorcore.cuda",
                        plevel=20,
                    )
    if target.kind.name == "cuda" and "cublas" in target.libs:
        strategy.add_implementation(
            wrap_compute_dense(topi.cuda.dense_cublas),
            wrap_topi_schedule(topi.cuda.schedule_dense_cublas),
            name="dense_cublas.cuda",
            plevel=25,
        )
    return strategy
def conv2d_winograd_without_weight_transfrom_strategy_cuda(attrs, inputs, out_type, target):
    """conv2d_winograd_without_weight_transfrom cuda strategy"""
    dilation = attrs.get_int_tuple("dilation")
    groups = attrs.get_int("groups")
    layout = attrs.data_layout
    data, kernel = inputs
    stride_h, stride_w = attrs.get_int_tuple("strides")
    padding = attrs.get_int_tuple("padding")
    assert dilation == (1, 1), "Do not support dilate now"
    assert groups == 1, "Do not support arbitrary group number"
    strategy = _op.OpStrategy()
    if layout == "NCHW":
        strategy.add_implementation(
            wrap_compute_conv2d(topi.cuda.conv2d_nchw_winograd_without_weight_transform),
            wrap_topi_schedule(
                topi.cuda.schedule_conv2d_nchw_winograd_without_weight_transform),
            name="conv2d_nchw_winograd_without_weight_transform.cuda")
    elif layout == "NHWC":
        N, H, W, _ = get_const_tuple(data.shape)
        alpha, _, CI, CO = get_const_tuple(kernel.shape)
        dilation_h, dilation_w = dilation
        judge_winograd_tensorcore, _ = winograd_judge(N, H, W, alpha, alpha, CI, CO,
                                                      padding, stride_h, stride_w,
                                                      dilation_h, dilation_w,
                                                      pre_flag=True)
        if target.target_name == "cuda" and \
                nvcc.have_tensorcore(tvm.gpu(0).compute_version) and \
                judge_winograd_tensorcore:
            strategy.add_implementation(
                wrap_compute_conv2d(
                    topi.cuda.conv2d_nhwc_winograd_tensorcore_without_weight_transform),
                wrap_topi_schedule(
                    topi.cuda.schedule_conv2d_nhwc_winograd_tensorcore_without_weight_transform),
                name="conv2d_nhwc_winograd_tensorcore_without_weight_transform.cuda")
        else:
            strategy.add_implementation(
                wrap_compute_conv2d(
                    topi.cuda.conv2d_nhwc_winograd_direct_without_weight_transform),
                wrap_topi_schedule(
                    topi.cuda.schedule_conv2d_nhwc_winograd_direct_without_weight_transform),
                name="conv2d_nhwc_winograd_direct_without_weight_transform.cuda")
    else:
        raise RuntimeError("Unsupported conv2d_winograd_without_weight_transfrom layout {}"
                           .format(layout))
    return strategy
def requires_tensorcore(*args):
    """Mark a test as requiring a tensorcore to run.

    Tests with this mark will not be run unless a tensorcore is present.

    Parameters
    ----------
    f : function
        Function to mark
    """
    _requires_tensorcore = [
        pytest.mark.tensorcore,
        pytest.mark.skipif(
            not tvm.cuda().exist or not nvcc.have_tensorcore(tvm.cuda(0).compute_version),
            reason="No tensorcore present",
        ),
        *requires_gpu(),
    ]
    return _compose(args, _requires_tensorcore)
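# A minimal usage sketch for the decorator above (assuming it is exposed as
# tvm.testing.requires_tensorcore, where TVM's testing utilities publish it):
import tvm.testing

@tvm.testing.requires_tensorcore
def test_dense_tensorcore():
    # Runs only when a Tensor Core capable GPU is present; skipped otherwise.
    ...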
def check_device(device):
    ctx = tvm.context(device, 0)
    if not ctx.exist:
        print("Skip because %s is not enabled" % device)
        return
    if not nvcc.have_tensorcore(ctx.compute_version):
        print("skip because gpu does not support Tensor Cores")
        return
    print("Running on target: %s" % device)
    for fcompute, fschedule in topi.testing.dispatch(device, _dense_implement):
        with tvm.target.create(device):
            D = fcompute(A, B, C if use_bias else None)
            D = topi.nn.relu(D)
            s = fschedule([D])
        a = tvm.nd.array(a_np, ctx)
        b = tvm.nd.array(b_np, ctx)
        c = tvm.nd.array(c_np, ctx)
        d = tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=dtype), ctx)
        f = tvm.build(s, [A, B, C, D], device, name="dense")
        f(a, b, c, d)
        tvm.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-3)
def test_tensor_core_batch_conv():
    if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
        print("skip because cuda is not enabled..")
        return
    if not nvcc.have_tensorcore(tvm.gpu(0).compute_version):
        print("skip because gpu does not support tensor core")
        return

    # The sizes of inputs and filters
    batch_size = 32
    height = 14
    width = 14
    in_channels = 32
    out_channels = 64
    kernel_h = 3
    kernel_w = 3
    pad_h = 1
    pad_w = 1
    stride_h = 1
    stride_w = 1
    block_size = 16

    block_row_warps = 2
    block_col_warps = 4
    warp_row_tiles = 4
    warp_col_tiles = 2
    warp_size = 32
    chunk = 2

    # Input feature map: (N, H, W, IC, n, ic)
    data_shape = (batch_size // block_size, height, width,
                  in_channels // block_size, block_size, block_size)
    # Kernel: (H, W, IC, OC, ic, oc)
    kernel_shape = (kernel_h, kernel_w, in_channels // block_size,
                    out_channels // block_size, block_size, block_size)
    # Output feature map: (N, H, W, OC, n, oc)
    output_shape = (batch_size // block_size, height, width,
                    out_channels // block_size, block_size, block_size)

    assert batch_size % block_size == 0
    assert in_channels % block_size == 0
    assert out_channels % block_size == 0

    kh = te.reduce_axis((0, kernel_h), name='kh')
    kw = te.reduce_axis((0, kernel_w), name='kw')
    ic = te.reduce_axis((0, in_channels // block_size), name='ic')
    ii = te.reduce_axis((0, block_size), name='ii')

    # Algorithm
    A = te.placeholder(data_shape, name='A', dtype="float16")
    W = te.placeholder(kernel_shape, name='W', dtype="float16")
    Apad = te.compute(
        (batch_size // block_size, height + 2 * pad_h, width + 2 * pad_w,
         in_channels // block_size, block_size, block_size),
        lambda n, h, w, i, nn, ii: tvm.tir.if_then_else(
            tvm.tir.all(h >= pad_h, h - pad_h < height, w >= pad_w, w - pad_w < width),
            A[n, h - pad_h, w - pad_w, i, nn, ii],
            tvm.tir.const(0., "float16")),
        name='Apad')
    Conv = te.compute(
        output_shape,
        lambda n, h, w, o, nn, oo: te.sum(
            Apad[n, h * stride_h + kh, w * stride_w + kw, ic, nn, ii].astype("float32")
            * W[kh, kw, ic, o, ii, oo].astype("float32"),
            axis=[ic, kh, kw, ii]),
        name="Conv")

    s = te.create_schedule(Conv.op)
    s[Apad].compute_inline()

    AS = s.cache_read(Apad, 'shared', [Conv])
    WS = s.cache_read(W, 'shared', [Conv])
    AF = s.cache_read(AS, 'wmma.matrix_a', [Conv])
    WF = s.cache_read(WS, 'wmma.matrix_b', [Conv])
    ConvF = s.cache_write(Conv, 'wmma.accumulator')

    block_x = te.thread_axis('blockIdx.x')
    block_y = te.thread_axis('blockIdx.y')
    block_z = te.thread_axis('blockIdx.z')
    thread_x = te.thread_axis('threadIdx.x')
    thread_y = te.thread_axis('threadIdx.y')
    thread_z = te.thread_axis('threadIdx.z')

    nc, hc, wc, oc, nnc, ooc = Conv.op.axis
    block_k = s[Conv].fuse(hc, wc)
    s[Conv].bind(block_k, block_z)
    nc, nci = s[Conv].split(nc, factor=warp_row_tiles)
    block_i, nc = s[Conv].split(nc, factor=block_row_warps)
    oc, oci = s[Conv].split(oc, factor=warp_col_tiles)
    block_j, oc = s[Conv].split(oc, factor=block_col_warps)
    s[Conv].reorder(block_k, block_i, block_j, nc, oc, nci, oci, nnc, ooc)
    s[Conv].bind(block_i, block_x)
    s[Conv].bind(block_j, block_y)
    s[Conv].bind(nc, thread_y)
    s[Conv].bind(oc, thread_z)

    s[ConvF].compute_at(s[Conv], oc)
    n, h, w, o, nnf, oof = ConvF.op.axis
    ko, ki = s[ConvF].split(ic, factor=chunk)
    s[ConvF].reorder(ko, kh, ki, kw, n, o, nnf, oof, ii)

    s[AF].compute_at(s[ConvF], kw)
    s[WF].compute_at(s[ConvF], kw)

    s[WS].compute_at(s[ConvF], kh)
    s[AS].compute_at(s[ConvF], kh)
    n, h, w, i, nn, ii = AS.op.axis
    tx, xo = s[AS].split(n, nparts=block_row_warps)
    ty, yo = s[AS].split(xo, nparts=block_col_warps)
    t = s[AS].fuse(nn, ii)
    to, ti = s[AS].split(t, factor=warp_size)
    s[AS].bind(tx, thread_y)
    s[AS].bind(ty, thread_z)
    s[AS].bind(ti, thread_x)

    kh, kw, ic, o, ii, oo = WS.op.axis
    tx, xo = s[WS].split(o, nparts=block_row_warps)
    ty, yo = s[WS].split(xo, nparts=block_col_warps)
    t = s[WS].fuse(ii, oo)
    to, ti = s[WS].split(t, nparts=warp_size)
    s[WS].bind(tx, thread_y)
    s[WS].bind(ty, thread_z)
    s[WS].bind(to, thread_x)
    s[WS].vectorize(ti)

    s[AF].tensorize(AF.op.axis[-2], intrin_wmma_load_matrix((16, 16, 16), 'wmma.matrix_a'))
    s[WF].tensorize(WF.op.axis[-2], intrin_wmma_load_matrix((16, 16, 16), 'wmma.matrix_b'))
    s[Conv].tensorize(nnc, intrin_wmma_store_matrix((16, 16, 16)))
    s[ConvF].tensorize(nnf, intrin_wmma_gemm((16, 16, 16)))

    func = tvm.build(s, [A, W, Conv], 'cuda')

    ctx = tvm.gpu(0)
    a_np = np.random.uniform(size=data_shape).astype(A.dtype)
    w_np = np.random.uniform(size=kernel_shape).astype(W.dtype)
    a = tvm.nd.array(a_np, ctx)
    w = tvm.nd.array(w_np, ctx)
    c = tvm.nd.array(np.zeros(output_shape, dtype=Conv.dtype), ctx)
    evaluator = func.time_evaluator(func.entry_name, ctx, number=3)
    print('conv2d with tensor core: %f ms' % (evaluator(a, w, c).mean * 1e3))

    if VERIFY:
        func(a, w, c)
        a_np = a_np.transpose(0, 4, 1, 2, 3, 5).reshape(batch_size, height, width, in_channels)
        w_np = w_np.transpose(0, 1, 2, 4, 3, 5).reshape(kernel_h, kernel_w,
                                                        in_channels, out_channels)
        c_np = c.asnumpy().transpose((0, 4, 1, 2, 3, 5)).reshape(
            batch_size, height, width, out_channels)
        c_std = conv2d_nhwc_python(a_np.astype(Conv.dtype), w_np.astype(Conv.dtype),
                                   (stride_h, stride_w), (pad_h, pad_w)).astype(Conv.dtype)
        np.testing.assert_allclose(c_np, c_std, rtol=1e-4, atol=1e-4)
def test_tensor_core_batch_matmul():
    if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
        print("skip because cuda is not enabled..")
        return
    if not nvcc.have_tensorcore(tvm.gpu(0).compute_version):
        print("skip because gpu does not support tensor core")
        return

    batch_size = 4
    n = 512
    m, l = n, n
    assert n % 32 == 0
    assert m % 8 == 0
    assert l % 16 == 0
    nn, mm, ll = n // 32, m // 8, l // 16
    A = te.placeholder((batch_size, nn, ll, 32, 16), name='A', dtype='float16')
    B = te.placeholder((batch_size, ll, mm, 16, 8), name='B', dtype='float16')
    k1 = te.reduce_axis((0, ll), name='k1')
    k2 = te.reduce_axis((0, 16), name='k2')
    C = te.compute(
        (batch_size, nn, mm, 32, 8),
        lambda b, i, j, ii, jj: te.sum(
            A[b, i, k1, ii, k2].astype('float') * B[b, k1, j, k2, jj].astype('float'),
            axis=[k1, k2]),
        name='Fragment_C')
    s = te.create_schedule(C.op)

    warp_size = 32
    kernel_size = 16
    block_row_warps = 2
    block_col_warps = 4
    warp_row_tiles = 4
    warp_col_tiles = 2
    chunk = 4

    block_x = te.thread_axis('blockIdx.x')
    block_y = te.thread_axis('blockIdx.y')
    block_z = te.thread_axis('blockIdx.z')
    thread_x = te.thread_axis('threadIdx.x')
    thread_y = te.thread_axis('threadIdx.y')
    thread_z = te.thread_axis('threadIdx.z')

    AS = s.cache_read(A, 'shared', [C])
    BS = s.cache_read(B, 'shared', [C])
    AF = s.cache_read(AS, 'wmma.matrix_a', [C])
    BF = s.cache_read(BS, 'wmma.matrix_b', [C])
    CF = s.cache_write(C, 'wmma.accumulator')

    b, i, j, kernel_i, kernel_j = s[C].op.axis
    i, ii = s[C].split(i, factor=warp_row_tiles)
    block_i, i = s[C].split(i, factor=block_row_warps)
    j, jj = s[C].split(j, factor=warp_col_tiles)
    block_j, j = s[C].split(j, factor=block_col_warps)
    s[C].reorder(block_i, block_j, i, j, ii, jj, kernel_i, kernel_j)
    s[C].bind(b, block_z)
    s[C].bind(block_i, block_x)
    s[C].bind(block_j, block_y)
    s[C].bind(i, thread_y)
    s[C].bind(j, thread_z)

    s[CF].compute_at(s[C], j)
    b, warp_i, warp_j, _i, _j = s[CF].op.axis
    k, _k = CF.op.reduce_axis
    ko, ki = s[CF].split(k, factor=chunk)
    s[CF].reorder(ko, ki, warp_i, warp_j, _i, _j, _k)

    s[AF].compute_at(s[CF], ki)
    s[BF].compute_at(s[CF], ki)

    s[AS].compute_at(s[CF], ko)
    b, xo, yo, xi, yi = AS.op.axis
    tx, xo = s[AS].split(xo, nparts=block_row_warps)
    ty, yo = s[AS].split(yo, nparts=block_col_warps)
    t = s[AS].fuse(xi, yi)
    to, ti = s[AS].split(t, nparts=warp_size)
    s[AS].bind(tx, thread_y)
    s[AS].bind(ty, thread_z)
    s[AS].bind(to, thread_x)

    s[BS].compute_at(s[CF], ko)
    b, xo, yo, xi, yi = BS.op.axis
    tx, xo = s[BS].split(xo, nparts=block_row_warps)
    ty, yo = s[BS].split(yo, nparts=block_col_warps)
    t = s[BS].fuse(xi, yi)
    to, ti = s[BS].split(t, nparts=warp_size)
    s[BS].bind(tx, thread_y)
    s[BS].bind(ty, thread_z)
    s[BS].bind(to, thread_x)

    s[AF].tensorize(AF.op.axis[-2], intrin_wmma_load_matrix((32, 8, 16), 'wmma.matrix_a'))
    s[BF].tensorize(BF.op.axis[-2], intrin_wmma_load_matrix((32, 8, 16), 'wmma.matrix_b'))
    s[C].tensorize(kernel_i, intrin_wmma_store_matrix((32, 8, 16)))
    s[CF].tensorize(_i, intrin_wmma_gemm((32, 8, 16)))

    func = tvm.build(s, [A, B, C], 'cuda')

    ctx = tvm.gpu(0)
    a_np = np.random.uniform(size=(batch_size, nn, ll, 32, 16)).astype(A.dtype)
    b_np = np.random.uniform(size=(batch_size, ll, mm, 16, 8)).astype(B.dtype)
    a = tvm.nd.array(a_np, ctx)
    b = tvm.nd.array(b_np, ctx)
    c = tvm.nd.array(np.zeros((batch_size, nn, mm, 32, 8), dtype=C.dtype), ctx)
    func(a, b, c)
    evaluator = func.time_evaluator(func.entry_name, ctx, number=3)
    print('gemm with tensor core: %f ms' % (evaluator(a, b, c).mean * 1e3))

    if VERIFY:
        func(a, b, c)
        a_np = a_np.transpose((0, 1, 3, 2, 4)).reshape(batch_size, n, n)
        b_np = b_np.transpose((0, 1, 3, 2, 4)).reshape(batch_size, n, n)
        c_np = c.asnumpy().transpose((0, 1, 3, 2, 4)).reshape(batch_size, n, n)
        np.testing.assert_allclose(
            c_np, np.matmul(a_np.astype(C.dtype), b_np.astype(C.dtype)),
            rtol=1e-4, atol=1e-4)
def conv2d_strategy_cuda(attrs, inputs, out_type, target):
    """conv2d cuda strategy"""
    strategy = _op.OpStrategy()
    data, kernel = inputs
    stride_h, stride_w = attrs.get_int_tuple("strides")
    dilation_h, dilation_w = attrs.get_int_tuple("dilation")
    padding = attrs.get_int_tuple("padding")
    groups = attrs.groups
    layout = attrs.data_layout
    kernel_layout = attrs.kernel_layout
    if dilation_h < 1 or dilation_w < 1:
        raise ValueError("dilation should be positive value")

    if groups == 1:
        if layout == "NCHW":
            assert kernel_layout == "OIHW"
            if data.dtype in ('int8', 'uint8') and kernel.dtype in ('int8', 'uint8'):
                assert data.dtype == kernel.dtype
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_nchw_int8),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw_int8),
                    name="conv2d_nchw_int8.cuda")
            else:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_nchw),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw),
                    name="conv2d_nchw.cuda")
            _, _, kh, kw = get_const_tuple(kernel.shape)
            if 2 < kh < 8 and 2 < kw < 8 and kh == kw and stride_h == 1 and stride_w == 1 and \
                    dilation_h == 1 and dilation_w == 1:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_nchw_winograd),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw_winograd),
                    name="conv2d_nchw_winograd.cuda",
                    plevel=5)
        elif layout == "HWCN":
            assert kernel_layout == "HWIO"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.conv2d_hwcn),
                wrap_topi_schedule(topi.cuda.schedule_conv2d_hwcn),
                name="conv2d_hwcn.cuda")
        elif layout == "NHWC":
            assert kernel_layout == "HWIO"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.conv2d_nhwc),
                wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc),
                name="conv2d_nhwc.cuda")
            N, H, W, _ = get_const_tuple(data.shape)
            KH, KW, CI, CO = get_const_tuple(kernel.shape)
            # Winograd shape related judgment
            judge_winograd_tensorcore, judge_winograd_shape = winograd_judge(
                N, H, W, KH, KW, CI, CO, padding, stride_h, stride_w,
                dilation_h, dilation_w, pre_flag=False)
            if judge_winograd_shape:
                if target.id.name == "cuda" and \
                        nvcc.have_tensorcore(tvm.gpu(0).compute_version) and \
                        judge_winograd_tensorcore:
                    strategy.add_implementation(
                        wrap_compute_conv2d(topi.cuda.conv2d_nhwc_winograd_tensorcore),
                        wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc_winograd_tensorcore),
                        name="conv2d_nhwc_winograd_tensorcore.cuda",
                        plevel=5)
                else:
                    strategy.add_implementation(
                        wrap_compute_conv2d(topi.cuda.conv2d_nhwc_winograd_direct),
                        wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc_winograd_direct),
                        name="conv2d_nhwc_winograd_direct.cuda",
                        plevel=5)
            if target.id.name == "cuda":
                if nvcc.have_tensorcore(tvm.gpu(0).compute_version):
                    if (N % 16 == 0 and CI % 16 == 0 and CO % 16 == 0) or \
                            (N % 8 == 0 and CI % 16 == 0 and CO % 32 == 0) or \
                            (N % 32 == 0 and CI % 16 == 0 and CO % 8 == 0):
                        strategy.add_implementation(
                            wrap_compute_conv2d(topi.cuda.conv2d_nhwc_tensorcore),
                            wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc_tensorcore),
                            name="conv2d_nhwc_tensorcore.cuda",
                            plevel=20)
        elif layout == "NCHW4c" and data.dtype in ["int8", "uint8"]:
            assert kernel_layout == "OIHW4o4i"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.conv2d_NCHWc_int8, True),
                wrap_topi_schedule(topi.cuda.schedule_conv2d_NCHWc_int8),
                name="conv2d_NCHWc_int8.cuda")
        else:
            raise RuntimeError("Unsupported conv2d layout {} for CUDA".format(layout))
        # add cudnn implementation
        if target.id.name == "cuda" and "cudnn" in target.libs:
            if layout in ["NCHW", "NHWC"] and padding[0] == padding[2] and \
                    padding[1] == padding[3]:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_cudnn,
                                        need_data_layout=True, has_groups=True),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_cudnn),
                    name="conv2d_cudnn.cuda",
                    plevel=25)
    elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups):
        if layout == "NCHW":
            assert kernel_layout == "OIHW"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.depthwise_conv2d_nchw),
                wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nchw),
                name="depthwise_conv2d_nchw.cuda")
        elif layout == "NHWC":
            assert kernel_layout == "HWOI"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc),
                wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nhwc),
                name="depthwise_conv2d_nhwc.cuda")
        else:
            raise RuntimeError("Unsupported depthwise_conv2d layout {}".format(layout))
    else:  # group_conv2d
        # add cudnn implementation, if any
        cudnn_impl = False
        if target.id.name == "cuda" and "cudnn" in target.libs:
            if layout in ["NCHW", "NHWC"] and padding[0] == padding[2] and \
                    padding[1] == padding[3]:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_cudnn,
                                        need_data_layout=True, has_groups=True),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_cudnn),
                    name="conv2d_cudnn.cuda",
                    plevel=25)
                cudnn_impl = True
        if layout == 'NCHW':
            # TODO(@vinx13, @icemelon9): Use group_conv2d_NCHWc_int8 when dtype is int8/uint8.
            assert kernel_layout == "OIHW"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.group_conv2d_nchw, has_groups=True),
                wrap_topi_schedule(topi.cuda.schedule_group_conv2d_nchw),
                name="group_conv2d_nchw.cuda")
        elif layout == 'NCHW4c' and data.dtype in ["int8", "uint8"]:
            assert kernel_layout == "OIHW4o4i"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.group_conv2d_NCHWc_int8, True),
                wrap_topi_schedule(topi.cuda.schedule_group_conv2d_NCHWc_int8),
                name="group_conv2d_NCHWc_int8.cuda")
        elif not cudnn_impl:
            raise RuntimeError("Unsupported group_conv2d layout {}".format(layout))
    return strategy
def conv2d_strategy_cuda(attrs, inputs, out_type, target):
    """conv2d cuda strategy"""
    strategy = _op.OpStrategy()
    data, kernel = inputs
    stride_h, stride_w = attrs.get_int_tuple("strides")
    dilation_h, dilation_w = attrs.get_int_tuple("dilation")
    padding = attrs.get_int_tuple("padding")
    groups = attrs.groups
    layout = attrs.data_layout
    kernel_layout = attrs.kernel_layout
    if dilation_h < 1 or dilation_w < 1:
        raise ValueError("dilation should be positive value")

    if groups == 1:
        if layout == "NCHW":
            assert kernel_layout == "OIHW"
            if data.dtype in ("int8", "uint8") and kernel.dtype in ("int8", "uint8"):
                assert data.dtype == kernel.dtype
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_nchw_int8),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw_int8),
                    name="conv2d_nchw_int8.cuda",
                )
            else:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_nchw),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw),
                    name="conv2d_nchw.cuda",
                )
            _, _, kh, kw = get_const_tuple(kernel.shape)
            if ((2 < kh < 8 and 2 < kw < 8 and kh == kw)
                    and (stride_h == 1 and stride_w == 1)
                    and (dilation_h == 1 and dilation_w == 1)):
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_nchw_winograd),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw_winograd),
                    name="conv2d_nchw_winograd.cuda",
                    plevel=5,
                )
            strategy.add_auto_scheduler(
                wrap_compute_conv2d(topi.nn.conv2d_nchw), name="conv2d_nchw")
        elif layout == "HWCN":
            assert kernel_layout == "HWIO"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.conv2d_hwcn),
                wrap_topi_schedule(topi.cuda.schedule_conv2d_hwcn),
                name="conv2d_hwcn.cuda",
            )
        elif layout == "NHWC":
            assert kernel_layout == "HWIO"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.conv2d_nhwc),
                wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc),
                name="conv2d_nhwc.cuda",
            )
            N, H, W, _ = get_const_tuple(data.shape)
            KH, KW, CI, CO = get_const_tuple(kernel.shape)
            # Winograd shape related judgment
            (
                judge_winograd_tensorcore,
                judge_winograd_autotvm,
                judge_winograd_auto_scheduler,
            ) = judge_winograd(
                N, H, W, KH, KW, CI, CO, padding,
                stride_h, stride_w, dilation_h, dilation_w,
                data.dtype, kernel.dtype, pre_flag=False,
            )
            if judge_winograd_autotvm:
                if (target.kind.name == "cuda"
                        and nvcc.have_tensorcore(tvm.gpu(0).compute_version)
                        and judge_winograd_tensorcore):
                    strategy.add_implementation(
                        wrap_compute_conv2d(topi.cuda.conv2d_nhwc_winograd_tensorcore),
                        wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc_winograd_tensorcore),
                        name="conv2d_nhwc_winograd_tensorcore.cuda",
                        plevel=5,
                    )
                else:
                    strategy.add_implementation(
                        wrap_compute_conv2d(topi.cuda.conv2d_nhwc_winograd_direct),
                        wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc_winograd_direct),
                        name="conv2d_nhwc_winograd_direct.cuda",
                        plevel=5,
                    )
            if (target.kind.name == "cuda"
                    and nvcc.have_tensorcore(tvm.gpu(0).compute_version)
                    and ((N % 16 == 0 and CI % 16 == 0 and CO % 16 == 0)
                         or (N % 8 == 0 and CI % 16 == 0 and CO % 32 == 0)
                         or (N % 32 == 0 and CI % 16 == 0 and CO % 8 == 0))):
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_nhwc_tensorcore),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc_tensorcore),
                    name="conv2d_nhwc_tensorcore.cuda",
                    plevel=20,
                )
            # register auto-scheduler implementations
            if judge_winograd_auto_scheduler:
                strategy.add_auto_scheduler(
                    wrap_compute_conv2d(topi.nn.conv2d_winograd_nhwc),
                    name="conv2d_nhwc.winograd")
            else:
                strategy.add_auto_scheduler(
                    wrap_compute_conv2d(topi.nn.conv2d_nhwc), name="conv2d_nhwc")
        elif layout == "HWNC":
            assert kernel_layout in ["HWOI", "HWOI16o16i", "HWOI8o32i", "HWOI32o16i"]
            _, _, N, in_channels = get_const_tuple(data.shape)
            pre_computed = len(kernel.shape) == 6
            if pre_computed:
                _, _, oc_chunk, _, oc_block_factor, _ = get_const_tuple(kernel.shape)
                out_channels = oc_chunk * oc_block_factor
            else:
                _, _, out_channels, _ = get_const_tuple(kernel.shape)
            tensorcore_dtypes = ["int4", "uint4", "int8", "uint8"]
            if ((N % 16 == 0 and in_channels % 16 == 0 and out_channels % 16 == 0)
                    or (N % 8 == 0 and in_channels % 16 == 0 and out_channels % 32 == 0)
                    or (N % 32 == 0 and in_channels % 16 == 0 and out_channels % 8 == 0)
                    and (data.dtype in tensorcore_dtypes
                         and kernel.dtype in tensorcore_dtypes)):
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_hwnc_tensorcore),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_hwnc_tensorcore),
                    name="conv2d_hwnc_tensorcore_direct.cuda",
                    plevel=20,
                )
            else:
                raise RuntimeError(
                    "Unsupported shape for conv2d HWNC. Need to satisfy tensor core schedule.")
        elif layout == "NCHW4c" and data.dtype in ["int8", "uint8"]:
            assert kernel_layout == "OIHW4o4i"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.conv2d_NCHWc_int8, True),
                wrap_topi_schedule(topi.cuda.schedule_conv2d_NCHWc_int8),
                name="conv2d_NCHWc_int8.cuda",
            )
        else:
            raise RuntimeError("Unsupported conv2d layout {} for CUDA".format(layout))
        # add cudnn implementation
        if target.kind.name == "cuda" and "cudnn" in target.libs:
            if (layout in ["NCHW", "NHWC"]
                    and padding[0] == padding[2] and padding[1] == padding[3]):
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_cudnn,
                                        need_data_layout=True, has_groups=True),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_cudnn),
                    name="conv2d_cudnn.cuda",
                    plevel=25,
                )
    elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups):
        if layout == "NCHW":
            assert kernel_layout == "OIHW"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.depthwise_conv2d_nchw),
                wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nchw),
                name="depthwise_conv2d_nchw.cuda",
            )
            strategy.add_auto_scheduler(
                wrap_compute_conv2d(topi.nn.depthwise_conv2d_nchw),
                name="depthwise_conv2d_nchw.cuda",
            )
        elif layout == "NHWC":
            assert kernel_layout == "HWOI"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc),
                wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nhwc),
                name="depthwise_conv2d_nhwc.cuda",
            )
            strategy.add_auto_scheduler(
                wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc),
                name="depthwise_conv2d_nhwc.cuda",
            )
        else:
            raise RuntimeError("Unsupported depthwise_conv2d layout {}".format(layout))
    else:  # group_conv2d
        # add cudnn implementation, if any
        cudnn_impl = False
        if target.kind.name == "cuda" and "cudnn" in target.libs:
            if (layout in ["NCHW", "NHWC"]
                    and padding[0] == padding[2] and padding[1] == padding[3]):
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_cudnn,
                                        need_data_layout=True, has_groups=True),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_cudnn),
                    name="conv2d_cudnn.cuda",
                    plevel=25,
                )
                cudnn_impl = True
        if layout == "NCHW":
            # TODO(@vinx13, @icemelon9): Use group_conv2d_NCHWc_int8 when dtype is int8/uint8.
            assert kernel_layout == "OIHW"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.group_conv2d_nchw, has_groups=True),
                wrap_topi_schedule(topi.cuda.schedule_group_conv2d_nchw),
                name="group_conv2d_nchw.cuda",
            )
        elif layout == "NCHW4c" and data.dtype in ["int8", "uint8"]:
            assert kernel_layout == "OIHW4o4i"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.group_conv2d_NCHWc_int8, True),
                wrap_topi_schedule(topi.cuda.schedule_group_conv2d_NCHWc_int8),
                name="group_conv2d_NCHWc_int8.cuda",
            )
        elif not cudnn_impl:
            raise RuntimeError("Unsupported group_conv2d layout {}".format(layout))
    return strategy
def test_tune_matmul_cuda_tensor_core():
    n = 512
    mod = create_prim_func(te_workload.matmul_fp16(n, n, n))
    target = Target("nvidia/geforce-rtx-3070")
    config = ReplayTraceConfig(
        num_trials_per_iter=32,
        num_trials_total=320,
    )

    class DefaultTensorCore:
        @staticmethod
        def _sch_rules():
            from tvm.meta_schedule import (  # pylint: disable=import-outside-toplevel
                schedule_rule as M,
            )

            return [
                M.AutoInline(
                    into_producer=False,
                    into_consumer=True,
                    # into_cache_only=False,
                    inline_const_tensor=True,
                    disallow_if_then_else=False,
                    require_injective=False,
                    require_ordered=False,
                    disallow_op=None,
                ),
                M.MultiLevelTiling(
                    structure="SSSRRSRS",
                    tile_binds=["blockIdx.x", "blockIdx.y", "threadIdx.y"],
                    # use_tensor_core=True,
                    max_innermost_factor=64,
                    vector_load_lens=[1, 2, 3, 4],
                    reuse_read=schedule_rule.ReuseType(
                        req="must",
                        levels=[4],
                        scope="shared",
                    ),
                    reuse_write=schedule_rule.ReuseType(
                        req="no",
                        levels=[],
                        scope="",
                    ),
                ),
                M.AutoInline(
                    into_producer=True,
                    into_consumer=True,
                    # into_cache_only=True,
                    inline_const_tensor=True,
                    disallow_if_then_else=False,
                    require_injective=False,
                    require_ordered=False,
                    disallow_op=None,
                ),
                M.ParallelizeVectorizeUnroll(
                    max_jobs_per_core=-1,  # disable parallelize
                    max_vectorize_extent=-1,  # disable vectorize
                    unroll_max_steps=[0, 16, 64, 512, 1024],
                    unroll_explicit=True,
                ),
            ]

        @staticmethod
        def _postproc():
            from tvm.meta_schedule import (  # pylint: disable=import-outside-toplevel
                postproc as M,
            )

            return [
                # M.RewriteCooperativeFetch(),
                M.RewriteParallelVectorizeUnroll(),
                M.RewriteReductionBlock(),
                # M.RewriteTensorCore(),
                M.VerifyGPUCode(),
            ]

    with tempfile.TemporaryDirectory() as work_dir:
        sch: Schedule = tune_tir(
            mod=mod,
            target=target,
            config=config,
            work_dir=work_dir,
            space=PostOrderApply(),
            sch_rules=DefaultTensorCore._sch_rules,
            postprocs=DefaultTensorCore._postproc,
            num_threads=None,
        )
        if sch is None:
            print("No valid schedule found!")
        else:
            print(sch.mod.script())
            print(sch.trace)

    from tvm.contrib import nvcc
    import numpy as np

    ctx = tvm.gpu(0)
    if nvcc.have_tensorcore(ctx.compute_version):
        with tvm.transform.PassContext():
            func = tvm.build(sch.mod["main"], [], "cuda")
            print(sch.mod.script())
            print(func.imported_modules[0].get_source())
        a_np = np.random.uniform(size=(n, n)).astype("float16")
        b_np = np.random.uniform(size=(n, n)).astype("float16")
        a = tvm.nd.array(a_np, ctx)
        b = tvm.nd.array(b_np, ctx)
        c = tvm.nd.array(np.zeros((n, n), dtype="float32"), ctx)
        evaluator = func.time_evaluator(func.entry_name, ctx,
                                        number=3, repeat=1, min_repeat_ms=40)
        print("matmul with tensor core: %f ms" % (evaluator(a, b, c).mean * 1e3))
        np.testing.assert_allclose(
            c.asnumpy(),
            np.matmul(a_np.astype("float32"), b_np.astype("float32")),
            rtol=1e-4,
            atol=1e-4,
        )
def conv2d_strategy_cuda(attrs, inputs, out_type, target):
    """conv2d cuda strategy"""
    strategy = _op.OpStrategy()
    data, kernel = inputs
    stride_h, stride_w = attrs.get_int_tuple("strides")
    dilation_h, dilation_w = attrs.get_int_tuple("dilation")
    padding = attrs.get_int_tuple("padding")
    groups = attrs.groups
    layout = attrs.data_layout
    kernel_layout = attrs.kernel_layout
    if dilation_h < 1 or dilation_w < 1:
        raise ValueError("dilation should be positive value")

    if groups == 1:
        if layout == "NCHW":
            assert kernel_layout == "OIHW"
            if ((target.kind.name in ["cuda", "vulkan", "rocm"])
                    and data.dtype in ("int8", "uint8")
                    and kernel.dtype in ("int8", "uint8")):
                assert data.dtype == kernel.dtype
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_nchw_int8),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw_int8),
                    name="conv2d_nchw_int8.cuda",
                )
            else:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_nchw),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw),
                    name="conv2d_nchw.cuda",
                )
            _, _, kh, kw = get_const_tuple(kernel.shape)
            if ((2 < kh < 8 and 2 < kw < 8 and kh == kw)
                    and (stride_h == 1 and stride_w == 1)
                    and (dilation_h == 1 and dilation_w == 1)):
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_nchw_winograd),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw_winograd),
                    name="conv2d_nchw_winograd.cuda",
                    plevel=5,
                )
        elif layout == "HWCN":
            assert kernel_layout == "HWIO"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.conv2d_hwcn),
                wrap_topi_schedule(topi.cuda.schedule_conv2d_hwcn),
                name="conv2d_hwcn.cuda",
            )
        elif layout == "NHWC" and kernel_layout == "HWIO":
            strategy.add_implementation(
                wrap_compute_conv2d(topi.gpu.conv2d_nhwc),
                wrap_topi_schedule(topi.gpu.schedule_conv2d_nhwc),
                name="conv2d_nhwc.gpu",
            )
            N, H, W, _ = get_const_tuple(data.shape)
            KH, KW, CI, CO = get_const_tuple(kernel.shape)
            # Winograd shape related judgment
            (
                judge_winograd_tensorcore,
                judge_winograd_autotvm,
                judge_winograd_auto_scheduler,
            ) = judge_winograd(
                N, H, W, KH, KW, CI, CO, padding,
                stride_h, stride_w, dilation_h, dilation_w,
                data.dtype, kernel.dtype, pre_flag=False,
            )
            if judge_winograd_autotvm:
                if (target.kind.name == "cuda"
                        and nvcc.have_tensorcore(target=target)
                        and judge_winograd_tensorcore):
                    strategy.add_implementation(
                        wrap_compute_conv2d(topi.cuda.conv2d_nhwc_winograd_tensorcore),
                        wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc_winograd_tensorcore),
                        name="conv2d_nhwc_winograd_tensorcore.cuda",
                        plevel=5,
                    )
                else:
                    strategy.add_implementation(
                        wrap_compute_conv2d(topi.cuda.conv2d_nhwc_winograd_direct),
                        wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc_winograd_direct),
                        name="conv2d_nhwc_winograd_direct.cuda",
                        plevel=5,
                    )
            if (target.kind.name == "cuda"
                    and nvcc.have_tensorcore(target=target)
                    and ((N % 16 == 0 and CI % 16 == 0 and CO % 16 == 0)
                         or (N % 8 == 0 and CI % 16 == 0 and CO % 32 == 0)
                         or (N % 32 == 0 and CI % 16 == 0 and CO % 8 == 0))):
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_nhwc_tensorcore),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc_tensorcore),
                    name="conv2d_nhwc_tensorcore.cuda",
                    plevel=20,
                )
            # register auto-scheduler implementations
            if is_auto_scheduler_enabled() and judge_winograd_auto_scheduler:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.nn.conv2d_winograd_nhwc),
                    naive_schedule,  # this implementation should never be picked by autotvm
                    name="conv2d_nhwc.winograd",
                    plevel=15,
                )
        elif layout == "HWNC":
            assert kernel_layout in ["HWOI", "HWOI16o16i", "HWOI8o32i", "HWOI32o16i"]
            _, _, N, in_channels = get_const_tuple(data.shape)
            pre_computed = len(kernel.shape) == 6
            if pre_computed:
                _, _, oc_chunk, _, oc_block_factor, _ = get_const_tuple(kernel.shape)
                out_channels = oc_chunk * oc_block_factor
            else:
                _, _, out_channels, _ = get_const_tuple(kernel.shape)
            tensorcore_dtypes = ["int4", "uint4", "int8", "uint8"]
            if (target.kind.name == "cuda"
                    and nvcc.have_tensorcore(target=target)
                    and kernel.dtype in tensorcore_dtypes
                    and ((data.dtype in ["int4", "uint4"]
                          and N % 8 == 0 and in_channels % 32 == 0
                          and out_channels % 8 == 0)
                         or (data.dtype in ["int8", "uint8"]
                             and N % 8 == 0 and in_channels % 16 == 0
                             and out_channels % 32 == 0))):
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_hwnc_tensorcore),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_hwnc_tensorcore),
                    name="conv2d_hwnc_tensorcore_direct.cuda",
                    plevel=20,
                )
            else:
                raise RuntimeError(
                    "Unsupported shape for conv2d HWNC. Need to satisfy tensor core schedule.")
        elif ((target.kind.name in ["cuda", "vulkan", "rocm"])
              and layout == "NCHW4c"
              and data.dtype in ["int8", "uint8"]):
            assert kernel_layout == "OIHW4o4i"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.conv2d_NCHWc_int8, True),
                wrap_topi_schedule(topi.cuda.schedule_conv2d_NCHWc_int8),
                name="conv2d_NCHWc_int8.cuda",
            )
        elif target.kind.name == "cuda" and "cudnn" not in target.libs:
            # No TVM native kernel applicable
            raise RuntimeError("Unsupported conv2d layout {} for CUDA".format(layout))

        if (target.kind.name == "cuda"
                and "cudnn" in target.libs
                and layout in ["NCHW", "NHWC"]
                and padding[0] == padding[2]
                and padding[1] == padding[3]
                and not (data.dtype in ["uint8", "int8"]
                         or kernel.dtype in ["uint8", "int8"])):
            # add cudnn implementation
            if layout == "NHWC":
                assert kernel_layout == "OHWI"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.conv2d_cudnn,
                                    need_data_layout=True, has_groups=True),
                wrap_topi_schedule(topi.cuda.schedule_conv2d_cudnn),
                name="conv2d_cudnn.cuda",
                plevel=25,
            )
    elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups) and (
            layout == "NCHW" or "cudnn" not in target.libs
    ):  # cuDNN requires a different kernel layout for NHWC inputs.
        if layout == "NCHW":
            assert kernel_layout == "OIHW"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.depthwise_conv2d_nchw),
                wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nchw),
                name="depthwise_conv2d_nchw.cuda",
            )
        elif layout == "NHWC":
            assert kernel_layout == "HWOI"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc),
                wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nhwc),
                name="depthwise_conv2d_nhwc.cuda",
            )
        else:
            raise RuntimeError("Unsupported depthwise_conv2d layout {}".format(layout))
    else:  # group_conv2d
        # add cudnn implementation, if any
        cudnn_impl = False
        if target.kind.name == "cuda" and "cudnn" in target.libs:
            if (layout in ["NCHW", "NHWC"]
                    and padding[0] == padding[2]
                    and padding[1] == padding[3]
                    and not (data.dtype in ["uint8", "int8"]
                             or kernel.dtype in ["uint8", "int8"])):
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_cudnn,
                                        need_data_layout=True, has_groups=True),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_cudnn),
                    name="conv2d_cudnn.cuda",
                    plevel=25,
                )
                cudnn_impl = True
        if layout == "NCHW":
            assert kernel_layout == "OIHW"
            _, channels, _, _ = get_const_tuple(data.shape)
            out_channels, in_channels, _, _ = get_const_tuple(kernel.shape)
            oc_chunk = out_channels // 4
            ic_chunk = in_channels // 4
            if ((target.kind.name in ["cuda", "vulkan", "rocm"])
                    and data.dtype in ["int8", "uint8"]
                    and kernel.dtype in ["int8", "uint8"]
                    and channels % groups == 0
                    and out_channels % groups == 0
                    and channels % 4 == 0
                    and out_channels % 4 == 0
                    and groups <= oc_chunk
                    and groups <= ic_chunk):
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.group_conv2d_nchw_int8, has_groups=True),
                    wrap_topi_schedule(topi.cuda.schedule_group_conv2d_nchw_int8),
                    name="group_conv2d_nchw_int8.cuda",
                )
            else:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.group_conv2d_nchw, has_groups=True),
                    wrap_topi_schedule(topi.cuda.schedule_group_conv2d_nchw),
                    name="group_conv2d_nchw.cuda",
                )
        elif layout == "NCHW4c" and data.dtype in ["int8", "uint8"]:
            assert kernel_layout == "OIHW4o4i"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.group_conv2d_NCHWc_int8, has_groups=True),
                wrap_topi_schedule(topi.cuda.schedule_group_conv2d_NCHWc_int8),
                name="group_conv2d_NCHWc_int8.cuda",
            )
        elif not cudnn_impl:
            raise RuntimeError("Unsupported group_conv2d layout {}".format(layout))
    return strategy
def conv2d_winograd_without_weight_transfrom_strategy_cuda(attrs, inputs, out_type, target):
    """conv2d_winograd_without_weight_transfrom cuda strategy"""
    dilation = attrs.get_int_tuple("dilation")
    groups = attrs.get_int("groups")
    layout = attrs.data_layout
    data, kernel = inputs
    stride_h, stride_w = attrs.get_int_tuple("strides")
    padding = attrs.get_int_tuple("padding")
    assert dilation == (1, 1), "Do not support dilate now"
    assert groups == 1, "Do not support arbitrary group number"
    strategy = _op.OpStrategy()
    if layout == "NCHW":
        strategy.add_implementation(
            wrap_compute_conv2d(topi.cuda.conv2d_nchw_winograd_without_weight_transform),
            wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw_winograd_without_weight_transform),
            name="conv2d_nchw_winograd_without_weight_transform.cuda",
        )
    elif layout == "NHWC":
        N, H, W, _ = get_const_tuple(data.shape)
        alpha, _, CI, CO = get_const_tuple(kernel.shape)
        dilation_h, dilation_w = dilation
        judge_winograd_tensorcore, _, _ = judge_winograd(
            N, H, W, alpha, alpha, CI, CO, padding,
            stride_h, stride_w, dilation_h, dilation_w,
            data.dtype, kernel.dtype, pre_flag=True,
        )
        if (
            target.kind.name == "cuda"
            and nvcc.have_tensorcore(target=target)
            and judge_winograd_tensorcore
        ):
            strategy.add_implementation(
                wrap_compute_conv2d(
                    topi.cuda.conv2d_nhwc_winograd_tensorcore_without_weight_transform
                ),
                wrap_topi_schedule(
                    topi.cuda.schedule_conv2d_nhwc_winograd_tensorcore_without_weight_transform
                ),
                name="conv2d_nhwc_winograd_tensorcore_without_weight_transform.cuda",
            )
        else:
            strategy.add_implementation(
                wrap_compute_conv2d(
                    topi.cuda.conv2d_nhwc_winograd_direct_without_weight_transform),
                wrap_topi_schedule(
                    topi.cuda.schedule_conv2d_nhwc_winograd_direct_without_weight_transform
                ),
                name="conv2d_nhwc_winograd_direct_without_weight_transform.cuda",
            )
        if is_auto_scheduler_enabled():
            strategy.add_implementation(
                wrap_compute_conv2d(topi.nn.conv2d_winograd_nhwc_without_weight_transform),
                naive_schedule,  # this implementation should never be picked by autotvm
                name="conv2d_nhwc_winograd_without_weight_transform",
                plevel=15,
            )
    else:
        raise RuntimeError(
            "Unsupported conv2d_winograd_without_weight_transfrom layout {}".format(layout)
        )
    return strategy
    return s, [A, B, C]

###############################################################################
# AutoTune and Test
# -----------------
# Finally we use a tuner to tune the schedule, generate code with the best config
# and run the kernel to compare with numpy to check whether the results are correct.

# check whether the gpu has tensorcore
if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
    raise Exception("skip building this tutorial because cuda is not enabled..")

dev = tvm.gpu()
if not nvcc.have_tensorcore(dev.compute_version):
    raise Exception("the gpu has no tensorcore, skipping...")

M, N, L = 512, 32, 512
dtype = "float16"
layout = "NN"
if len(sys.argv) >= 4:
    M, N, L = int(sys.argv[1]), int(sys.argv[2]), int(sys.argv[3])
if len(sys.argv) >= 5:
    dtype = sys.argv[4]
if len(sys.argv) >= 6:
    layout = sys.argv[5]

# check whether the current gpu arch supports the current dtype's wmma codegen
cuda_compute_capability = tvm.runtime._ffi_api.GetDeviceAttr(2, 0, 4)
major, minor = nvcc.parse_compute_version(cuda_compute_capability)
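# A sketch of how `major`/`minor` could gate dtype selection after the parse
# above (the thresholds here are an assumption based on CUDA wmma support:
# int8 needs compute capability 7.2+, sub-byte int4/int1 need 7.5; the
# tutorial's original check is not shown in this fragment):
if dtype in ("int8", "uint8"):
    assert (major, minor) >= (7, 2), "int8 wmma requires compute capability 7.2+"
elif dtype in ("int4", "int1"):
    assert (major, minor) >= (7, 5), "int4/int1 wmma requires compute capability 7.5+"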
s[AF].tensorize(AF.op.axis[-2], intrin_wmma_load_matrix('wmma.matrix_a'))
s[WF].tensorize(WF.op.axis[-2], intrin_wmma_load_matrix('wmma.matrix_b'))
s[Conv].tensorize(nnc, intrin_wmma_store_matrix())
s[ConvF].tensorize(nnf, intrin_wmma_gemm())
print(tvm.lower(s, [A, W, Conv], simple_mode=True))

###############################################################################
# Generate CUDA Kernel
# --------------------
# Finally we use TVM to generate and compile the CUDA kernel, and evaluate the latency of convolution.
# Since TensorCores are only supported in NVIDIA GPUs with Compute Capability 7.0 or higher, it may not
# be able to run on our build server.

ctx = tvm.gpu(0)
if nvcc.have_tensorcore(ctx.compute_version):
    with tvm.transform.PassContext(config={"tir.UnrollLoop": {"auto_max_step": 16}}):
        func = tvm.build(s, [A, W, Conv], 'cuda')
    a_np = np.random.uniform(size=data_shape).astype(A.dtype)
    w_np = np.random.uniform(size=kernel_shape).astype(W.dtype)
    a = tvm.nd.array(a_np, ctx)
    w = tvm.nd.array(w_np, ctx)
    c = tvm.nd.array(np.zeros(output_shape, dtype=Conv.dtype), ctx)
    evaluator = func.time_evaluator(func.entry_name, ctx, number=10)
    print('conv2d with tensor core: %f ms' % (evaluator(a, w, c).mean * 1e3))

###############################################################################
# Summary