def check_device(device):
        ctx = tvm.context(device, 0)
        if not tvm.testing.device_enabled(device):
            print("Skip because %s is not enabled" % device)
            return
        if not nvcc.have_tensorcore(ctx.compute_version):
            print("skip because gpu does not support Tensor Cores")
            return
        print("Running on target: %s" % device)
        with tvm.target.create(device):
            fcompute, fschedule = tvm.topi.testing.dispatch(device, _conv2d_nhwc_tensorcore_implement)
            C = fcompute(A, W, stride, padding, dilation, 'float32')
            if add_bias:
                C = topi.add(C, bias)
            if add_relu:
                C = topi.nn.relu(C)
            s = fschedule([C])

        a = tvm.nd.array(a_np, ctx)
        w = tvm.nd.array(w_np, ctx)
        b = tvm.nd.array(b_np, ctx)
        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
        if add_bias:
            func = tvm.build(s, [A, W, bias, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (
                batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation))
            func(a, w, b, c)
        else:
            func = tvm.build(s, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (
                batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation))
            func(a, w, c)

        rtol = 1e-3
        tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=rtol)
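This first snippet is the inner helper of a TVM topi test: in the upstream tests it is nested inside a verify_conv2d_nhwc-style wrapper that builds the placeholders and NumPy reference data, then calls the helper once per device. A minimal sketch of such a driver, with the wrapper name and parameter list assumed from the variables used above:

# Hypothetical driver; A, W, bias and the *_np reference arrays are
# assumed to be prepared before check_device runs.
def verify_conv2d_nhwc(batch, in_channel, in_size, num_filter, kernel,
                       stride, padding, dilation=1,
                       add_bias=False, add_relu=False):
    # ... build te.placeholder inputs A, W, bias and references
    #     a_np, w_np, b_np, c_np with topi.testing utilities ...
    for device in ["cuda"]:
        check_device(device)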
Example #2
    def check_target(target):
        dev = tvm.device(target, 0)
        if not tvm.testing.device_enabled(target):
            print("Skip because %s is not enabled" % target)
            return
        if not nvcc.have_tensorcore(dev.compute_version):
            print("skip because gpu does not support Tensor Cores")
            return
        print("Running on target: %s" % target)
        with tvm.target.Target(target):
            fcompute, fschedule = topi.testing.dispatch(target, _conv2d_hwnc_tensorcore_implement)
            C = fcompute(A, W, stride, padding, dilation, dtype, "int32")
            s = fschedule([C])

        a = tvm.nd.array(a_np.transpose((1, 2, 0, 3)), dev)
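        # a_np is generated in NHWC; the transpose above converts it to the
        # HWNC layout this schedule expects, and the final assert transposes
        # the result back to NHWC before comparing against c_np.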
        w = tvm.nd.array(w_np, dev)
        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev)

        func = tvm.build(
            s,
            [A, W, C],
            target,
            name="relu_%d_%d_%d_%d_%d_%d_%d_%d"
            % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation),
        )
        func(a, w, c)

        rtol = 1e-3
        tvm.testing.assert_allclose(c.asnumpy().transpose((2, 0, 1, 3)), c_np, rtol=rtol)
Example #3
def batch_matmul_strategy_cuda(attrs, inputs, out_type, target):
    """batch_matmul cuda strategy"""
    strategy = _op.OpStrategy()
    strategy.add_implementation(
        wrap_compute_batch_matmul(topi.cuda.batch_matmul),
        wrap_topi_schedule(topi.cuda.schedule_batch_matmul),
        name="batch_matmul.cuda",
        plevel=10,
    )
    if target.kind.name == "cuda" and "cublas" in target.libs:
        strategy.add_implementation(
            wrap_compute_batch_matmul(topi.cuda.batch_matmul_cublas),
            wrap_topi_schedule(topi.generic.schedule_extern),
            name="batch_matmul_cublas.cuda",
            plevel=15,
        )
    if target.kind.name == "cuda" and nvcc.have_tensorcore(target=target):
        x, y = inputs
        _, M, K = get_const_tuple(x.shape)
        _, N, K = get_const_tuple(y.shape)
        if x.dtype in ["float16", "int8", "uint8"
                       ] and ((M % 8 == 0 and K % 16 == 0 and N % 32 == 0) or
                              (M % 16 == 0 and K % 16 == 0 and N % 16 == 0) or
                              (M % 32 == 0 and K % 16 == 0 and N % 8 == 0)):
            strategy.add_implementation(
                wrap_compute_batch_matmul(topi.cuda.batch_matmul_tensorcore),
                wrap_topi_schedule(topi.cuda.schedule_batch_matmul_tensorcore),
                name="batch_matmul_tensorcore.cuda",
                plevel=20,
            )

    return strategy
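For context, a strategy function like this is not called directly; it is attached to the batch_matmul op for the CUDA/GPU targets through TVM's generic-function registry. A sketch of the registration, assuming the decorator API used in tvm/relay/op/strategy/cuda.py:

# Sketch only: how the strategy above is registered for CUDA targets.
from tvm.relay.op.strategy.generic import batch_matmul_strategy

@batch_matmul_strategy.register(["cuda", "gpu"])
def batch_matmul_strategy_cuda(attrs, inputs, out_type, target):
    ...  # body as in the example above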
Example #4
def test_conv2d_nhwc_winograd_tensorcore():
    """Test the conv2d with winograd for nhwc layout"""
    if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
        print("skip because cuda is not enabled..")
        return
    if not nvcc.have_tensorcore(tvm.gpu(0).compute_version):
        return
    verify_conv2d_nhwc(8, 64, 56, 64, 3, 1, 1, bgemm="tensorcore")
    verify_conv2d_nhwc(8, 128, 28, 128, 3, 1, 1, bgemm="tensorcore")
    verify_conv2d_nhwc(8, 256, 14, 256, 3, 1, 1, bgemm="tensorcore")

    verify_conv2d_nhwc(2,
                       64,
                       56,
                       64,
                       3,
                       1, (1, 1),
                       add_relu=True,
                       bgemm="tensorcore")
    verify_conv2d_nhwc(2,
                       64,
                       56,
                       64,
                       3,
                       1,
                       "SAME",
                       add_relu=True,
                       bgemm="tensorcore")
Example #5
def conv3d_strategy_cuda(attrs, inputs, out_type, target):
    """conv3d cuda strategy"""
    strategy = _op.OpStrategy()
    data, kernel = inputs
    layout = attrs.data_layout
    _, stride_h, stride_w = attrs.get_int_tuple("strides")
    _, dilation_h, dilation_w = attrs.get_int_tuple("dilation")
    assert layout in ["NCDHW",
                      "NDHWC"], "Not support this layout {} yet".format(layout)
    if layout == "NCDHW":
        strategy.add_implementation(
            wrap_compute_conv3d(topi.cuda.conv3d_ncdhw),
            wrap_topi_schedule(topi.cuda.schedule_conv3d_ncdhw),
            name="conv3d_ncdhw.cuda",
            plevel=10,
        )
        _, _, _, kh, kw = get_const_tuple(kernel.shape)
        if (2 < kh < 8 and 2 < kw < 8 and kh == kw and stride_h == 1
                and stride_w == 1 and dilation_h == 1 and dilation_w == 1
                and attrs["groups"] == 1):
            strategy.add_implementation(
                wrap_compute_conv3d(topi.cuda.conv3d_ncdhw_winograd),
                wrap_topi_schedule(topi.cuda.schedule_conv3d_ncdhw_winograd),
                name="conv3d_ncdhw_winograd.cuda",
                plevel=5,
            )
    else:  # layout == "NDHWC":
        strategy.add_implementation(
            wrap_compute_conv3d(topi.cuda.conv3d_ndhwc),
            wrap_topi_schedule(topi.cuda.schedule_conv3d_ndhwc),
            name="conv3d_ndhwc.cuda",
            plevel=10,
        )
        N, _, _, _, _ = get_const_tuple(data.shape)
        _, _, _, CI, CO = get_const_tuple(kernel.shape)
        if target.kind.name == "cuda":
            if nvcc.have_tensorcore(target=target):
                if ((N % 16 == 0 and CI % 16 == 0 and CO % 16 == 0) or
                    (N % 8 == 0 and CI % 16 == 0 and CO % 32 == 0) or
                    (N % 32 == 0 and CI % 16 == 0
                     and CO % 8 == 0)) and out_type == "float16":
                    strategy.add_implementation(
                        wrap_compute_conv3d(topi.cuda.conv3d_ndhwc_tensorcore),
                        wrap_topi_schedule(
                            topi.cuda.schedule_conv3d_ndhwc_tensorcore),
                        name="conv3d_ndhwc_tensorcore.cuda",
                        plevel=20,
                    )

    if target.kind.name == "cuda" and "cudnn" in target.libs:
        strategy.add_implementation(
            wrap_compute_conv3d(topi.cuda.conv3d_cudnn, True),
            wrap_topi_schedule(topi.cuda.schedule_conv3d_cudnn),
            name="conv3d_cudnn.cuda",
            plevel=25,
        )
    return strategy
Example #6
def test_tensor_core_batch_matmul():
    if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
        print("skip because cuda is not enabled..")
        return
    if not nvcc.have_tensorcore(tvm.gpu(0).compute_version):
        print("skip because gpu does not support tensor core")
        return

    tensor_core_batch_matmul()
Example #7
def batch_matmul_strategy_cuda(attrs, inputs, out_type, target):
    """batch_matmul cuda strategy"""
    strategy = _op.OpStrategy()
    x, y = inputs
    if (
        x.dtype == "int8"
        and y.dtype == "int8"
        and out_type.dtype == "int32"
        and not attrs["transpose_a"]
        and attrs["transpose_b"]
    ):
        strategy.add_implementation(
            wrap_compute_batch_matmul(topi.cuda.batch_matmul_int8, need_out_dtype=True),
            wrap_topi_schedule(topi.cuda.schedule_batch_matmul_int8),
            name="batch_matmul_int8.cuda",
            plevel=10,
        )
    else:
        strategy.add_implementation(
            wrap_compute_batch_matmul(topi.cuda.batch_matmul, need_out_dtype=True),
            wrap_topi_schedule(topi.cuda.schedule_batch_matmul),
            name="batch_matmul.cuda",
            plevel=10,
        )
    if target.kind.name == "cuda" and "cublas" in target.libs:
        strategy.add_implementation(
            wrap_compute_batch_matmul(topi.cuda.batch_matmul_cublas),
            wrap_topi_schedule(topi.generic.schedule_extern),
            name="batch_matmul_cublas.cuda",
            plevel=15,
        )
    if (
        target.kind.name == "cuda"
        and nvcc.have_tensorcore(target=target)
        and not attrs["transpose_a"]
        and attrs["transpose_b"]
    ):
        x, y = inputs
        _, M, K = get_const_tuple(x.shape)
        _, N, K = get_const_tuple(y.shape)
        if (
            x.dtype in ["float16", "int8", "uint8"]
            and (
                (M % 8 == 0 and K % 16 == 0 and N % 32 == 0)
                or (M % 16 == 0 and K % 16 == 0 and N % 16 == 0)
                or (M % 32 == 0 and K % 16 == 0 and N % 8 == 0)
            )
        ) or (x.dtype in ["int4", "uint4"] and K % 32 == 0 and M % 8 == 0 and N % 8 == 0):
            strategy.add_implementation(
                wrap_compute_batch_matmul(topi.cuda.batch_matmul_tensorcore, need_out_dtype=True),
                wrap_topi_schedule(topi.cuda.schedule_batch_matmul_tensorcore),
                name="batch_matmul_tensorcore.cuda",
                plevel=20,
            )

    return strategy
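A note on the plevel values above: when several registered implementations are applicable and no tuning records override the choice, TVM prefers the one with the highest priority level. An illustrative restatement of the ranking this strategy sets up (not a library API):

# Illustrative only: priorities registered by this strategy; higher wins
# among applicable implementations in the absence of tuning logs.
PLEVELS = {
    "batch_matmul.cuda": 10,             # default TOPI schedule (fallback)
    "batch_matmul_int8.cuda": 10,        # int8 path, same priority
    "batch_matmul_cublas.cuda": 15,      # when "cublas" is in target.libs
    "batch_matmul_tensorcore.cuda": 20,  # when dtype/shape checks pass
}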
Example #8
def test_conv2d_nhwc_winograd_tensorcore():
    """Test the conv2d with winograd for nhwc layout"""
    if not nvcc.have_tensorcore(tvm.gpu(0).compute_version):
        return
    verify_conv2d_nhwc(8,  64, 56,  64, 3, 1, 1, bgemm="tensorcore")
    verify_conv2d_nhwc(8, 128, 28, 128, 3, 1, 1, bgemm="tensorcore")
    verify_conv2d_nhwc(8, 256, 14, 256, 3, 1, 1, bgemm="tensorcore")

    verify_conv2d_nhwc(2,  64, 56,  64, 3, 1, (1, 1), add_relu=True, bgemm="tensorcore")
    verify_conv2d_nhwc(2,  64, 56,  64, 3, 1, "SAME", add_relu=True, bgemm="tensorcore")
Example #9
def test_tensor_core_matmul():
    if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
        print("skip because cuda is not enabled..")
        return
    if not nvcc.have_tensorcore(tvm.gpu(0).compute_version):
        print("skip because gpu does not support tensor core")
        return

    tensor_core_matmul(16)  # test with warp_tile 16x16x16
    tensor_core_matmul(8)   # test with warp_tile 8x32x16
    tensor_core_matmul(32)  # test with warp_tile 32x8x16
Example #10
def dense_strategy_cuda(attrs, inputs, out_type, target):
    """dense cuda strategy"""
    strategy = _op.OpStrategy()
    data, weights = inputs
    b, i = get_const_tuple(data.shape)
    o, _ = get_const_tuple(weights.shape)
    if (target.kind.name in ["cuda", "vulkan", "rocm"] and data.dtype == "int8"
            and weights.dtype == "int8" and out_type.dtype == "int32"):
        strategy.add_implementation(
            wrap_compute_dense(topi.cuda.dense_int8),
            wrap_topi_schedule(topi.cuda.schedule_dense_int8),
            name="dense_int8.cuda",
        )
    else:
        strategy.add_implementation(
            wrap_compute_dense(topi.gpu.dense_small_batch),
            wrap_topi_schedule(topi.gpu.schedule_dense_small_batch),
            name="dense_small_batch.gpu",
        )

        with SpecializedCondition(b >= 32):
            strategy.add_implementation(
                wrap_compute_dense(topi.gpu.dense_large_batch),
                wrap_topi_schedule(topi.gpu.schedule_dense_large_batch),
                name="dense_large_batch.gpu",
                plevel=5,
            )

    if target.kind.name == "cuda":
        if nvcc.have_tensorcore(target=target):
            if ((data.dtype in ["float16", "int8", "uint8"] and
                 ((i % 16 == 0 and b % 16 == 0 and o % 16 == 0) or
                  (i % 16 == 0 and b % 8 == 0 and o % 32 == 0) or
                  (i % 16 == 0 and b % 32 == 0 and o % 8 == 0)))
                    or (data.dtype in ["int4", "uint4"] and i % 32 == 0
                        and b % 8 == 0 and o % 8 == 0)
                    or (data.dtype in ["int1", "uint1"] and i % 128 == 0
                        and b % 8 == 0 and o % 8 == 0)):
                strategy.add_implementation(
                    wrap_compute_dense(topi.cuda.dense_tensorcore),
                    wrap_topi_schedule(topi.cuda.schedule_dense_tensorcore),
                    name="dense_tensorcore.cuda",
                    plevel=20,
                )

    if target.kind.name == "cuda" and "cublas" in target.libs:
        strategy.add_implementation(
            wrap_compute_dense(topi.cuda.dense_cublas),
            wrap_topi_schedule(topi.cuda.schedule_dense_cublas),
            name="dense_cublas.cuda",
            plevel=25,
        )
    return strategy
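One detail worth noting in this dense strategy: SpecializedCondition(b >= 32) scopes the large-batch schedule to that shape predicate, and it is registered below the default priority, so it only wins through tuning. A comment-level summary of the resulting dispatch (my reading of the code above, not library internals):

# Illustrative summary of the dense strategy above:
#   int8 data/weights, int32 out -> dense_int8.cuda
#   otherwise                    -> dense_small_batch.gpu (default plevel=10),
#                                   plus dense_large_batch.gpu when b >= 32
#                                   (plevel=5, picked via tuning records)
#   tensor-core dtype/shape OK   -> dense_tensorcore.cuda (plevel=20)
#   "cublas" in target.libs      -> dense_cublas.cuda (plevel=25, highest)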
Example #11
def dense_strategy_cuda(attrs, inputs, out_type, target):
    """dense cuda strategy"""
    strategy = _op.OpStrategy()
    data, weights = inputs
    b, i = get_const_tuple(data.shape)
    o, _ = get_const_tuple(weights.shape)
    if out_type.dtype == "int8":
        strategy.add_implementation(
            wrap_compute_dense(topi.cuda.dense_int8),
            wrap_topi_schedule(topi.cuda.schedule_dense_int8),
            name="dense_int8.cuda",
        )
    else:
        strategy.add_implementation(
            wrap_compute_dense(topi.cuda.dense_small_batch),
            wrap_topi_schedule(topi.cuda.schedule_dense_small_batch),
            name="dense_small_batch.cuda",
        )

        strategy.add_auto_scheduler(
            wrap_compute_dense(topi.nn.dense),
            name="dense",
        )

        with SpecializedCondition(b >= 32):
            strategy.add_implementation(
                wrap_compute_dense(topi.cuda.dense_large_batch),
                wrap_topi_schedule(topi.cuda.schedule_dense_large_batch),
                name="dense_large_batch.cuda",
                plevel=5,
            )
        if target.kind.name == "cuda":
            if nvcc.have_tensorcore(tvm.gpu(0).compute_version):
                if (
                    (i % 16 == 0 and b % 16 == 0 and o % 16 == 0)
                    or (i % 16 == 0 and b % 8 == 0 and o % 32 == 0)
                    or (i % 16 == 0 and b % 32 == 0 and o % 8 == 0)
                ):
                    strategy.add_implementation(
                        wrap_compute_dense(topi.cuda.dense_tensorcore),
                        wrap_topi_schedule(topi.cuda.schedule_dense_tensorcore),
                        name="dense_tensorcore.cuda",
                        plevel=20,
                    )
    if target.kind.name == "cuda" and "cublas" in target.libs:
        strategy.add_implementation(
            wrap_compute_dense(topi.cuda.dense_cublas),
            wrap_topi_schedule(topi.cuda.schedule_dense_cublas),
            name="dense_cublas.cuda",
            plevel=25,
        )
    return strategy
Example #12
def conv2d_winograd_without_weight_transfrom_strategy_cuda(attrs, inputs, out_type, target):
    """conv2d_winograd_without_weight_transfrom cuda strategy"""
    dilation = attrs.get_int_tuple("dilation")
    groups = attrs.get_int("groups")
    layout = attrs.data_layout
    data, kernel = inputs
    stride_h, stride_w = attrs.get_int_tuple("strides")
    padding = attrs.get_int_tuple("padding")
    assert dilation == (1, 1), "Do not support dilation now"
    assert groups == 1, "Do not support arbitrary group number"
    strategy = _op.OpStrategy()
    if layout == "NCHW":
        strategy.add_implementation(
            wrap_compute_conv2d(topi.cuda.conv2d_nchw_winograd_without_weight_transform),
            wrap_topi_schedule(
                topi.cuda.schedule_conv2d_nchw_winograd_without_weight_transform),
            name="conv2d_nchw_winograd_without_weight_transform.cuda")
    elif layout == "NHWC":
        N, H, W, _ = get_const_tuple(data.shape)
        alpha, _, CI, CO = get_const_tuple(kernel.shape)
        dilation_h, dilation_w = dilation
        judge_winograd_tensorcore, _ = winograd_judge(N, H, W, alpha, alpha, CI, CO,
                                                      padding, stride_h, stride_w,
                                                      dilation_h, dilation_w,
                                                      pre_flag=True)
        if target.target_name == "cuda" and \
            nvcc.have_tensorcore(tvm.gpu(0).compute_version) and \
            judge_winograd_tensorcore:
            strategy.add_implementation(
                wrap_compute_conv2d(
                    topi.cuda.conv2d_nhwc_winograd_tensorcore_without_weight_transform),
                wrap_topi_schedule(
                    topi.cuda.schedule_conv2d_nhwc_winograd_tensorcore_without_weight_transform),
                name="conv2d_nhwc_winograd_tensorcore_without_weight_transform.cuda")
        else:
            strategy.add_implementation(
                wrap_compute_conv2d(
                    topi.cuda.conv2d_nhwc_winograd_direct_without_weight_transform),
                wrap_topi_schedule(
                    topi.cuda.schedule_conv2d_nhwc_winograd_direct_without_weight_transform),
                name="conv2d_nhwc_winograd_direct_without_weight_transform.cuda")
    else:
        raise RuntimeError("Unsupported conv2d_winograd_without_weight_transfrom layout {}".
                           format(layout))
    return strategy
Example #13
def requires_tensorcore(*args):
    """Mark a test as requiring a tensorcore to run.

    Tests with this mark will not be run unless a tensorcore is present.

    Parameters
    ----------
    f : function
        Function to mark
    """
    _requires_tensorcore = [
        pytest.mark.tensorcore,
        pytest.mark.skipif(
            not tvm.cuda().exist or not nvcc.have_tensorcore(tvm.cuda(0).compute_version),
            reason="No tensorcore present",
        ),
        *requires_gpu(),
    ]
    return _compose(args, _requires_tensorcore)
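A hypothetical test using this marker; the composed pytest marks skip it on machines whose GPU lacks tensor cores (compute capability below 7.0):

import tvm.testing

@tvm.testing.requires_tensorcore
def test_my_wmma_kernel():
    # Runs only when a CUDA device exists and nvcc.have_tensorcore()
    # reports tensor-core support for its compute version.
    ...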
Example #14
def check_device(device):
    ctx = tvm.context(device, 0)
    if not ctx.exist:
        print("Skip because %s is not enabled" % device)
        return
    if not nvcc.have_tensorcore(ctx.compute_version):
        print("skip because gpu does not support Tensor Cores")
        return
    print("Running on target: %s" % device)
    for fcompute, fschedule in topi.testing.dispatch(device, _dense_implement):
        with tvm.target.create(device):
            D = fcompute(A, B, C if use_bias else None)
            D = topi.nn.relu(D)
            s = fschedule([D])
        a = tvm.nd.array(a_np, ctx)
        b = tvm.nd.array(b_np, ctx)
        c = tvm.nd.array(c_np, ctx)
        d = tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=dtype), ctx)
        f = tvm.build(s, [A, B, C, D], device, name="dense")
        f(a, b, c, d)
        tvm.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-3)
Example #15
def test_tensor_core_batch_conv():
    if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
        print("skip because cuda is not enabled..")
        return
    if not nvcc.have_tensorcore(tvm.gpu(0).compute_version):
        print("skip because gpu does not support tensor core")
        return

    # The sizes of inputs and filters
    batch_size = 32
    height = 14
    width = 14
    in_channels = 32
    out_channels = 64
    kernel_h = 3
    kernel_w = 3
    pad_h = 1
    pad_w = 1
    stride_h = 1
    stride_w = 1
    block_size = 16

    block_row_warps = 2
    block_col_warps = 4
    warp_row_tiles = 4
    warp_col_tiles = 2
    warp_size = 32
    chunk = 2

    # Input feature map: (N, H, W, IC, n, ic)
    data_shape = (batch_size // block_size, height, width,
                  in_channels // block_size, block_size, block_size)
    # Kernel: (H, W, IC, OC, ic, oc)
    kernel_shape = (kernel_h, kernel_w, in_channels // block_size,
                    out_channels // block_size, block_size, block_size)

    # Output feature map: (N, H, W, OC, n, oc)
    output_shape = (batch_size // block_size, height, width,
                    out_channels // block_size, block_size, block_size)

    assert (batch_size % block_size == 0)
    assert (in_channels % block_size == 0)
    assert (out_channels % block_size == 0)
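    # For the sizes above, the blocked shapes work out to:
    #   data_shape   = (2, 14, 14, 2, 16, 16)  # 32//16 = 2 blocks of N and IC
    #   kernel_shape = (3, 3, 2, 4, 16, 16)    # 64//16 = 4 blocks of OC
    #   output_shape = (2, 14, 14, 4, 16, 16)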

    kh = te.reduce_axis((0, kernel_h), name='kh')
    kw = te.reduce_axis((0, kernel_w), name='kw')
    ic = te.reduce_axis((0, in_channels // block_size), name='ic')
    ii = te.reduce_axis((0, block_size), name='ii')

    # Algorithm
    A = te.placeholder(data_shape, name='A', dtype="float16")
    W = te.placeholder(kernel_shape, name='W', dtype="float16")
    Apad = te.compute(
        (batch_size // block_size, height + 2 * pad_h, width + 2 * pad_w,
         in_channels // block_size, block_size, block_size),
        lambda n, h, w, i, nn, ii: tvm.tir.if_then_else(
            tvm.tir.all(h >= pad_h, h - pad_h < height, w >= pad_w, w - pad_w <
                        width), A[n, h - pad_h, w - pad_w, i, nn, ii],
            tvm.tir.const(0., "float16")),
        name='Apad')
    Conv = te.compute(
        output_shape,
        lambda n, h, w, o, nn, oo: te.sum(Apad[
            n, h * stride_h + kh, w * stride_w + kw, ic, nn, ii].astype(
                "float32") * W[kh, kw, ic, o, ii, oo].astype("float32"),
                                          axis=[ic, kh, kw, ii]),
        name="Conv")

    s = te.create_schedule(Conv.op)
    s[Apad].compute_inline()

    AS = s.cache_read(Apad, 'shared', [Conv])
    WS = s.cache_read(W, 'shared', [Conv])
    AF = s.cache_read(AS, 'wmma.matrix_a', [Conv])
    WF = s.cache_read(WS, 'wmma.matrix_b', [Conv])
    ConvF = s.cache_write(Conv, 'wmma.accumulator')

    block_x = te.thread_axis('blockIdx.x')
    block_y = te.thread_axis('blockIdx.y')
    block_z = te.thread_axis('blockIdx.z')
    thread_x = te.thread_axis('threadIdx.x')
    thread_y = te.thread_axis('threadIdx.y')
    thread_z = te.thread_axis('threadIdx.z')

    nc, hc, wc, oc, nnc, ooc = Conv.op.axis
    block_k = s[Conv].fuse(hc, wc)
    s[Conv].bind(block_k, block_z)
    nc, nci = s[Conv].split(nc, factor=warp_row_tiles)
    block_i, nc = s[Conv].split(nc, factor=block_row_warps)
    oc, oci = s[Conv].split(oc, factor=warp_col_tiles)
    block_j, oc = s[Conv].split(oc, factor=block_col_warps)
    s[Conv].reorder(block_k, block_i, block_j, nc, oc, nci, oci, nnc, ooc)
    s[Conv].bind(block_i, block_x)
    s[Conv].bind(block_j, block_y)
    s[Conv].bind(nc, thread_y)
    s[Conv].bind(oc, thread_z)

    s[ConvF].compute_at(s[Conv], oc)
    n, h, w, o, nnf, oof = ConvF.op.axis
    ko, ki = s[ConvF].split(ic, factor=chunk)
    s[ConvF].reorder(ko, kh, ki, kw, n, o, nnf, oof, ii)

    s[AF].compute_at(s[ConvF], kw)
    s[WF].compute_at(s[ConvF], kw)

    s[WS].compute_at(s[ConvF], kh)
    s[AS].compute_at(s[ConvF], kh)

    n, h, w, i, nn, ii = AS.op.axis
    tx, xo = s[AS].split(n, nparts=block_row_warps)
    ty, yo = s[AS].split(xo, nparts=block_col_warps)
    t = s[AS].fuse(nn, ii)
    to, ti = s[AS].split(t, factor=warp_size)
    s[AS].bind(tx, thread_y)
    s[AS].bind(ty, thread_z)
    s[AS].bind(ti, thread_x)

    kh, kw, ic, o, ii, oo = WS.op.axis
    tx, xo = s[WS].split(o, nparts=block_row_warps)
    ty, yo = s[WS].split(xo, nparts=block_col_warps)
    t = s[WS].fuse(ii, oo)
    to, ti = s[WS].split(t, nparts=warp_size)
    s[WS].bind(tx, thread_y)
    s[WS].bind(ty, thread_z)
    s[WS].bind(to, thread_x)
    s[WS].vectorize(ti)

    s[AF].tensorize(AF.op.axis[-2],
                    intrin_wmma_load_matrix((16, 16, 16), 'wmma.matrix_a'))
    s[WF].tensorize(WF.op.axis[-2],
                    intrin_wmma_load_matrix((16, 16, 16), 'wmma.matrix_b'))
    s[Conv].tensorize(nnc, intrin_wmma_store_matrix((16, 16, 16)))
    s[ConvF].tensorize(nnf, intrin_wmma_gemm((16, 16, 16)))

    func = tvm.build(s, [A, W, Conv], 'cuda')

    ctx = tvm.gpu(0)
    a_np = np.random.uniform(size=data_shape).astype(A.dtype)
    w_np = np.random.uniform(size=kernel_shape).astype(W.dtype)
    a = tvm.nd.array(a_np, ctx)
    w = tvm.nd.array(w_np, ctx)
    c = tvm.nd.array(np.zeros(output_shape, dtype=Conv.dtype), ctx)
    evaluator = func.time_evaluator(func.entry_name, ctx, number=3)
    print('conv2d with tensor core: %f ms' % (evaluator(a, w, c).mean * 1e3))

    if VERIFY:
        func(a, w, c)
        a_np = a_np.transpose(0, 4, 1, 2, 3,
                              5).reshape(batch_size, height, width,
                                         in_channels)
        w_np = w_np.transpose(0, 1, 2, 4, 3,
                              5).reshape(kernel_h, kernel_w, in_channels,
                                         out_channels)
        c_np = c.asnumpy().transpose(
            (0, 4, 1, 2, 3, 5)).reshape(batch_size, height, width,
                                        out_channels)
        c_std = conv2d_nhwc_python(a_np.astype(Conv.dtype),
                                   w_np.astype(Conv.dtype),
                                   (stride_h, stride_w),
                                   (pad_h, pad_w)).astype(Conv.dtype)
        np.testing.assert_allclose(c_np, c_std, rtol=1e-4, atol=1e-4)
Example #16
def test_tensor_core_batch_matmal():
    if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
        print("skip because cuda is not enabled..")
        return
    if not nvcc.have_tensorcore(tvm.gpu(0).compute_version):
        print("skip because gpu does not support tensor core")
        return

    batch_size = 4
    n = 512
    m, l = n, n
    assert (n % 32 == 0)
    assert (m % 8 == 0)
    assert (l % 16 == 0)
    nn, mm, ll = n // 32, m // 8, l // 16
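    # With n = m = l = 512: nn, mm, ll = 16, 64, 32, so the fragment tensors
    # are A: (4, 16, 32, 32, 16), B: (4, 32, 64, 16, 8), C: (4, 16, 64, 32, 8),
    # matching the 32x8x16 wmma tile tensorized below.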
    A = te.placeholder((batch_size, nn, ll, 32, 16), name='A', dtype='float16')
    B = te.placeholder((batch_size, ll, mm, 16, 8), name='B', dtype='float16')
    k1 = te.reduce_axis((0, ll), name='k1')
    k2 = te.reduce_axis((0, 16), name='k2')
    C = te.compute((batch_size, nn, mm, 32, 8),
                   lambda b, i, j, ii, jj: te.sum(A[b, i, k1, ii, k2].astype(
                       'float') * B[b, k1, j, k2, jj].astype('float'),
                                                  axis=[k1, k2]),
                   name='Fragment_C')
    s = te.create_schedule(C.op)

    warp_size = 32
    kernel_size = 16
    block_row_warps = 2
    block_col_warps = 4
    warp_row_tiles = 4
    warp_col_tiles = 2
    chunk = 4

    block_x = te.thread_axis('blockIdx.x')
    block_y = te.thread_axis('blockIdx.y')
    block_z = te.thread_axis('blockIdx.z')
    thread_x = te.thread_axis('threadIdx.x')
    thread_y = te.thread_axis('threadIdx.y')
    thread_z = te.thread_axis('threadIdx.z')

    AS = s.cache_read(A, 'shared', [C])
    BS = s.cache_read(B, 'shared', [C])
    AF = s.cache_read(AS, 'wmma.matrix_a', [C])
    BF = s.cache_read(BS, 'wmma.matrix_b', [C])
    CF = s.cache_write(C, 'wmma.accumulator')

    b, i, j, kernel_i, kernel_j = s[C].op.axis
    i, ii = s[C].split(i, factor=warp_row_tiles)
    block_i, i = s[C].split(i, factor=block_row_warps)
    j, jj = s[C].split(j, factor=warp_col_tiles)
    block_j, j = s[C].split(j, factor=block_col_warps)
    s[C].reorder(block_i, block_j, i, j, ii, jj, kernel_i, kernel_j)
    s[C].bind(b, block_z)
    s[C].bind(block_i, block_x)
    s[C].bind(block_j, block_y)
    s[C].bind(i, thread_y)
    s[C].bind(j, thread_z)

    s[CF].compute_at(s[C], j)
    b, warp_i, warp_j, _i, _j = s[CF].op.axis
    k, _k = CF.op.reduce_axis
    ko, ki = s[CF].split(k, factor=chunk)
    s[CF].reorder(ko, ki, warp_i, warp_j, _i, _j, _k)

    s[AF].compute_at(s[CF], ki)
    s[BF].compute_at(s[CF], ki)

    s[AS].compute_at(s[CF], ko)
    b, xo, yo, xi, yi = AS.op.axis
    tx, xo = s[AS].split(xo, nparts=block_row_warps)
    ty, yo = s[AS].split(yo, nparts=block_col_warps)
    t = s[AS].fuse(xi, yi)
    to, ti = s[AS].split(t, nparts=warp_size)
    s[AS].bind(tx, thread_y)
    s[AS].bind(ty, thread_z)
    s[AS].bind(to, thread_x)

    s[BS].compute_at(s[CF], ko)
    b, xo, yo, xi, yi = BS.op.axis
    tx, xo = s[BS].split(xo, nparts=block_row_warps)
    ty, yo = s[BS].split(yo, nparts=block_col_warps)
    t = s[BS].fuse(xi, yi)
    to, ti = s[BS].split(t, nparts=warp_size)
    s[BS].bind(tx, thread_y)
    s[BS].bind(ty, thread_z)
    s[BS].bind(to, thread_x)

    s[AF].tensorize(AF.op.axis[-2],
                    intrin_wmma_load_matrix((32, 8, 16), 'wmma.matrix_a'))
    s[BF].tensorize(BF.op.axis[-2],
                    intrin_wmma_load_matrix((32, 8, 16), 'wmma.matrix_b'))
    s[C].tensorize(kernel_i, intrin_wmma_store_matrix((32, 8, 16)))
    s[CF].tensorize(_i, intrin_wmma_gemm((32, 8, 16)))

    func = tvm.build(s, [A, B, C], 'cuda')

    ctx = tvm.gpu(0)
    a_np = np.random.uniform(size=(batch_size, nn, ll, 32, 16)).astype(A.dtype)
    b_np = np.random.uniform(size=(batch_size, ll, mm, 16, 8)).astype(B.dtype)
    a = tvm.nd.array(a_np, ctx)
    b = tvm.nd.array(b_np, ctx)
    c = tvm.nd.array(np.zeros((batch_size, nn, mm, 32, 8), dtype=C.dtype), ctx)
    func(a, b, c)
    evaluator = func.time_evaluator(func.entry_name, ctx, number=3)
    print('gemm with tensor core: %f ms' % (evaluator(a, b, c).mean * 1e3))

    if VERIFY:
        func(a, b, c)
        a_np = a_np.transpose((0, 1, 3, 2, 4)).reshape(batch_size, n, n)
        b_np = b_np.transpose((0, 1, 3, 2, 4)).reshape(batch_size, n, n)
        c_np = c.asnumpy().transpose((0, 1, 3, 2, 4)).reshape(batch_size, n, n)
        np.testing.assert_allclose(c_np,
                                   np.matmul(a_np.astype(C.dtype),
                                             b_np.astype(C.dtype)),
                                   rtol=1e-4,
                                   atol=1e-4)
Example #17
def conv2d_strategy_cuda(attrs, inputs, out_type, target):
    """conv2d cuda strategy"""
    strategy = _op.OpStrategy()
    data, kernel = inputs
    stride_h, stride_w = attrs.get_int_tuple("strides")
    dilation_h, dilation_w = attrs.get_int_tuple("dilation")
    padding = attrs.get_int_tuple("padding")
    groups = attrs.groups
    layout = attrs.data_layout
    kernel_layout = attrs.kernel_layout
    if dilation_h < 1 or dilation_w < 1:
        raise ValueError("dilation should be positive value")

    if groups == 1:
        if layout == "NCHW":
            assert kernel_layout == "OIHW"
            if data.dtype in ('int8', 'uint8') and kernel.dtype in ('int8',
                                                                    'uint8'):
                assert data.dtype == kernel.dtype
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_nchw_int8),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw_int8),
                    name="conv2d_nchw_int8.cuda")
            else:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_nchw),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw),
                    name="conv2d_nchw.cuda")
            _, _, kh, kw = get_const_tuple(kernel.shape)
            if 2 < kh < 8 and 2 < kw < 8 and kh == kw and stride_h == 1 and stride_w == 1 and \
                dilation_h == 1 and dilation_w == 1:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_nchw_winograd),
                    wrap_topi_schedule(
                        topi.cuda.schedule_conv2d_nchw_winograd),
                    name="conv2d_nchw_winograd.cuda",
                    plevel=5)
        elif layout == "HWCN":
            assert kernel_layout == "HWIO"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.conv2d_hwcn),
                wrap_topi_schedule(topi.cuda.schedule_conv2d_hwcn),
                name="conv2d_hwcn.cuda")
        elif layout == "NHWC":
            assert kernel_layout == "HWIO"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.conv2d_nhwc),
                wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc),
                name="conv2d_nhwc.cuda")
            N, H, W, _ = get_const_tuple(data.shape)
            KH, KW, CI, CO = get_const_tuple(kernel.shape)
            # Winograd shape related judgment
            judge_winograd_tensorcore, judge_winograd_shape = winograd_judge(
                N,
                H,
                W,
                KH,
                KW,
                CI,
                CO,
                padding,
                stride_h,
                stride_w,
                dilation_h,
                dilation_w,
                pre_flag=False)
            if judge_winograd_shape:
                if target.id.name == "cuda" and \
                    nvcc.have_tensorcore(tvm.gpu(0).compute_version) and \
                    judge_winograd_tensorcore:
                    strategy.add_implementation(
                        wrap_compute_conv2d(
                            topi.cuda.conv2d_nhwc_winograd_tensorcore),
                        wrap_topi_schedule(
                            topi.cuda.schedule_conv2d_nhwc_winograd_tensorcore
                        ),
                        name="conv2d_nhwc_winograd_tensorcore.cuda",
                        plevel=5)
                else:
                    strategy.add_implementation(
                        wrap_compute_conv2d(
                            topi.cuda.conv2d_nhwc_winograd_direct),
                        wrap_topi_schedule(
                            topi.cuda.schedule_conv2d_nhwc_winograd_direct),
                        name="conv2d_nhwc_winograd_direct.cuda",
                        plevel=5)
            if target.id.name == "cuda":
                if nvcc.have_tensorcore(tvm.gpu(0).compute_version):
                    if (N % 16 == 0 and CI % 16 == 0 and CO % 16 == 0) or \
                            (N % 8 == 0 and CI % 16 == 0 and CO % 32 == 0) or \
                            (N % 32 == 0 and CI % 16 == 0 and CO % 8 == 0):
                        strategy.add_implementation(
                            wrap_compute_conv2d(
                                topi.cuda.conv2d_nhwc_tensorcore),
                            wrap_topi_schedule(
                                topi.cuda.schedule_conv2d_nhwc_tensorcore),
                            name="conv2d_nhwc_tensorcore.cuda",
                            plevel=20)
        elif layout == "NCHW4c" and data.dtype in ["int8", "uint8"]:
            assert kernel_layout == "OIHW4o4i"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.conv2d_NCHWc_int8, True),
                wrap_topi_schedule(topi.cuda.schedule_conv2d_NCHWc_int8),
                name="conv2d_NCHWc_int8.cuda")
        else:
            raise RuntimeError(
                "Unsupported conv2d layout {} for CUDA".format(layout))
        # add cudnn implementation
        if target.id.name == "cuda" and "cudnn" in target.libs:
            if layout in ["NCHW", "NHWC"] and padding[0] == padding[2] and \
                    padding[1] == padding[3]:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_cudnn,
                                        need_data_layout=True,
                                        has_groups=True),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_cudnn),
                    name="conv2d_cudnn.cuda",
                    plevel=25)
    elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout,
                             groups):
        if layout == "NCHW":
            assert kernel_layout == "OIHW"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.depthwise_conv2d_nchw),
                wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nchw),
                name="depthwise_conv2d_nchw.cuda")
        elif layout == "NHWC":
            assert kernel_layout == "HWOI"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc),
                wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nhwc),
                name="depthwise_conv2d_nhwc.cuda")
        else:
            raise RuntimeError(
                "Unsupported depthwise_conv2d layout {}".format(layout))
    else:  # group_conv2d
        # add cudnn implementation, if any
        cudnn_impl = False
        if target.id.name == "cuda" and "cudnn" in target.libs:
            if layout in ["NCHW", "NHWC"] and padding[0] == padding[2] and \
                    padding[1] == padding[3]:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_cudnn,
                                        need_data_layout=True,
                                        has_groups=True),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_cudnn),
                    name="conv2d_cudnn.cuda",
                    plevel=25)
                cudnn_impl = True

        if layout == 'NCHW':
            # TODO(@vinx13, @icemelon9): Use group_conv2d_NCHWc_int8 when dtype is int8/uint8.
            assert kernel_layout == "OIHW"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.group_conv2d_nchw,
                                    has_groups=True),
                wrap_topi_schedule(topi.cuda.schedule_group_conv2d_nchw),
                name="group_conv2d_nchw.cuda")
        elif layout == 'NCHW4c' and data.dtype in ["int8", "uint8"]:
            assert kernel_layout == "OIHW4o4i"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.group_conv2d_NCHWc_int8, True),
                wrap_topi_schedule(topi.cuda.schedule_group_conv2d_NCHWc_int8),
                name="group_conv2d_NCHWc_int8.cuda")
        elif not cudnn_impl:
            raise RuntimeError(
                "Unsupported group_conv2d layout {}".format(layout))
    return strategy
Example #18
def conv2d_strategy_cuda(attrs, inputs, out_type, target):
    """conv2d cuda strategy"""
    strategy = _op.OpStrategy()
    data, kernel = inputs
    stride_h, stride_w = attrs.get_int_tuple("strides")
    dilation_h, dilation_w = attrs.get_int_tuple("dilation")
    padding = attrs.get_int_tuple("padding")
    groups = attrs.groups
    layout = attrs.data_layout
    kernel_layout = attrs.kernel_layout
    if dilation_h < 1 or dilation_w < 1:
        raise ValueError("dilation should be positive value")

    if groups == 1:
        if layout == "NCHW":
            assert kernel_layout == "OIHW"
            if data.dtype in ("int8", "uint8") and kernel.dtype in ("int8",
                                                                    "uint8"):
                assert data.dtype == kernel.dtype
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_nchw_int8),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw_int8),
                    name="conv2d_nchw_int8.cuda",
                )
            else:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_nchw),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw),
                    name="conv2d_nchw.cuda",
                )
            _, _, kh, kw = get_const_tuple(kernel.shape)
            if ((2 < kh < 8 and 2 < kw < 8 and kh == kw)
                    and (stride_h == 1 and stride_w == 1)
                    and (dilation_h == 1 and dilation_w == 1)):
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_nchw_winograd),
                    wrap_topi_schedule(
                        topi.cuda.schedule_conv2d_nchw_winograd),
                    name="conv2d_nchw_winograd.cuda",
                    plevel=5,
                )

            strategy.add_auto_scheduler(wrap_compute_conv2d(
                topi.nn.conv2d_nchw),
                                        name="conv2d_nchw")
        elif layout == "HWCN":
            assert kernel_layout == "HWIO"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.conv2d_hwcn),
                wrap_topi_schedule(topi.cuda.schedule_conv2d_hwcn),
                name="conv2d_hwcn.cuda",
            )
        elif layout == "NHWC":
            assert kernel_layout == "HWIO"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.conv2d_nhwc),
                wrap_topi_schedule(topi.cuda.schedule_conv2d_nhwc),
                name="conv2d_nhwc.cuda",
            )

            N, H, W, _ = get_const_tuple(data.shape)
            KH, KW, CI, CO = get_const_tuple(kernel.shape)
            # Winograd shape related judgment
            (
                judge_winograd_tensorcore,
                judge_winograd_autotvm,
                judge_winograd_auto_scheduler,
            ) = judge_winograd(
                N,
                H,
                W,
                KH,
                KW,
                CI,
                CO,
                padding,
                stride_h,
                stride_w,
                dilation_h,
                dilation_w,
                data.dtype,
                kernel.dtype,
                pre_flag=False,
            )
            if judge_winograd_autotvm:
                if (target.kind.name == "cuda"
                        and nvcc.have_tensorcore(tvm.gpu(0).compute_version)
                        and judge_winograd_tensorcore):
                    strategy.add_implementation(
                        wrap_compute_conv2d(
                            topi.cuda.conv2d_nhwc_winograd_tensorcore),
                        wrap_topi_schedule(
                            topi.cuda.schedule_conv2d_nhwc_winograd_tensorcore
                        ),
                        name="conv2d_nhwc_winograd_tensorcore.cuda",
                        plevel=5,
                    )
                else:
                    strategy.add_implementation(
                        wrap_compute_conv2d(
                            topi.cuda.conv2d_nhwc_winograd_direct),
                        wrap_topi_schedule(
                            topi.cuda.schedule_conv2d_nhwc_winograd_direct),
                        name="conv2d_nhwc_winograd_direct.cuda",
                        plevel=5,
                    )
            if (target.kind.name == "cuda"
                    and nvcc.have_tensorcore(tvm.gpu(0).compute_version)
                    and ((N % 16 == 0 and CI % 16 == 0 and CO % 16 == 0) or
                         (N % 8 == 0 and CI % 16 == 0 and CO % 32 == 0) or
                         (N % 32 == 0 and CI % 16 == 0 and CO % 8 == 0))):
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_nhwc_tensorcore),
                    wrap_topi_schedule(
                        topi.cuda.schedule_conv2d_nhwc_tensorcore),
                    name="conv2d_nhwc_tensorcore.cuda",
                    plevel=20,
                )

            # register auto-scheduler implementations
            if judge_winograd_auto_scheduler:
                strategy.add_auto_scheduler(wrap_compute_conv2d(
                    topi.nn.conv2d_winograd_nhwc),
                                            name="conv2d_nhwc.winograd")
            else:
                strategy.add_auto_scheduler(wrap_compute_conv2d(
                    topi.nn.conv2d_nhwc),
                                            name="conv2d_nhwc")

        elif layout == "HWNC":
            assert kernel_layout in [
                "HWOI", "HWOI16o16i", "HWOI8o32i", "HWOI32o16i"
            ]
            _, _, N, in_channels = get_const_tuple(data.shape)
            pre_computed = len(kernel.shape) == 6
            if pre_computed:
                _, _, oc_chunk, _, oc_block_factor, _ = get_const_tuple(
                    kernel.shape)
                out_channels = oc_chunk * oc_block_factor
            else:
                _, _, out_channels, _ = get_const_tuple(kernel.shape)

            tensorcore_dtypes = ["int4", "uint4", "int8", "uint8"]
            if ((data.dtype in tensorcore_dtypes
                 and kernel.dtype in tensorcore_dtypes)
                    and ((N % 16 == 0 and in_channels % 16 == 0
                          and out_channels % 16 == 0)
                         or (N % 8 == 0 and in_channels % 16 == 0
                             and out_channels % 32 == 0)
                         or (N % 32 == 0 and in_channels % 16 == 0
                             and out_channels % 8 == 0))):
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_hwnc_tensorcore),
                    wrap_topi_schedule(
                        topi.cuda.schedule_conv2d_hwnc_tensorcore),
                    name="conv2d_hwnc_tensorcore_direct.cuda",
                    plevel=20,
                )
            else:
                raise RuntimeError("Unsupported shape for conv2d HWNC.\
                                    Need to satisfy tensor core schedule.")
        elif layout == "NCHW4c" and data.dtype in ["int8", "uint8"]:
            assert kernel_layout == "OIHW4o4i"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.conv2d_NCHWc_int8, True),
                wrap_topi_schedule(topi.cuda.schedule_conv2d_NCHWc_int8),
                name="conv2d_NCHWc_int8.cuda",
            )
        else:
            raise RuntimeError(
                "Unsupported conv2d layout {} for CUDA".format(layout))
        # add cudnn implementation
        if target.kind.name == "cuda" and "cudnn" in target.libs:
            if layout in [
                    "NCHW", "NHWC"
            ] and padding[0] == padding[2] and padding[1] == padding[3]:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_cudnn,
                                        need_data_layout=True,
                                        has_groups=True),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_cudnn),
                    name="conv2d_cudnn.cuda",
                    plevel=25,
                )
    elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout,
                             groups):
        if layout == "NCHW":
            assert kernel_layout == "OIHW"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.depthwise_conv2d_nchw),
                wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nchw),
                name="depthwise_conv2d_nchw.cuda",
            )

            strategy.add_auto_scheduler(
                wrap_compute_conv2d(topi.nn.depthwise_conv2d_nchw),
                name="depthwise_conv2d_nchw.cuda",
            )
        elif layout == "NHWC":
            assert kernel_layout == "HWOI"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc),
                wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nhwc),
                name="depthwise_conv2d_nhwc.cuda",
            )

            strategy.add_auto_scheduler(
                wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc),
                name="depthwise_conv2d_nhwc.cuda",
            )
        else:
            raise RuntimeError(
                "Unsupported depthwise_conv2d layout {}".format(layout))
    else:  # group_conv2d
        # add cudnn implementation, if any
        cudnn_impl = False
        if target.kind.name == "cuda" and "cudnn" in target.libs:
            if layout in [
                    "NCHW", "NHWC"
            ] and padding[0] == padding[2] and padding[1] == padding[3]:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_cudnn,
                                        need_data_layout=True,
                                        has_groups=True),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_cudnn),
                    name="conv2d_cudnn.cuda",
                    plevel=25,
                )
                cudnn_impl = True

        if layout == "NCHW":
            # TODO(@vinx13, @icemelon9): Use group_conv2d_NCHWc_int8 when dtype is int8/uint8.
            assert kernel_layout == "OIHW"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.group_conv2d_nchw,
                                    has_groups=True),
                wrap_topi_schedule(topi.cuda.schedule_group_conv2d_nchw),
                name="group_conv2d_nchw.cuda",
            )
        elif layout == "NCHW4c" and data.dtype in ["int8", "uint8"]:
            assert kernel_layout == "OIHW4o4i"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.group_conv2d_NCHWc_int8, True),
                wrap_topi_schedule(topi.cuda.schedule_group_conv2d_NCHWc_int8),
                name="group_conv2d_NCHWc_int8.cuda",
            )
        elif not cudnn_impl:
            raise RuntimeError(
                "Unsupported group_conv2d layout {}".format(layout))
    return strategy
Example #19
def test_tune_matmul_cuda_tensor_core():
    n = 512
    mod = create_prim_func(te_workload.matmul_fp16(n, n, n))
    target = Target("nvidia/geforce-rtx-3070")
    config = ReplayTraceConfig(
        num_trials_per_iter=32,
        num_trials_total=320,
    )

    class DefaultTensorCore:
        @staticmethod
        def _sch_rules():
            from tvm.meta_schedule import (  # pylint: disable=import-outside-toplevel
                schedule_rule as M, )

            return [
                M.AutoInline(
                    into_producer=False,
                    into_consumer=True,
                    # into_cache_only=False,
                    inline_const_tensor=True,
                    disallow_if_then_else=False,
                    require_injective=False,
                    require_ordered=False,
                    disallow_op=None,
                ),
                M.MultiLevelTiling(
                    structure="SSSRRSRS",
                    tile_binds=["blockIdx.x", "blockIdx.y", "threadIdx.y"],
                    # use_tensor_core=True,
                    max_innermost_factor=64,
                    vector_load_lens=[1, 2, 3, 4],
                    reuse_read=schedule_rule.ReuseType(
                        req="must",
                        levels=[4],
                        scope="shared",
                    ),
                    reuse_write=schedule_rule.ReuseType(
                        req="no",
                        levels=[],
                        scope="",
                    ),
                ),
                M.AutoInline(
                    into_producer=True,
                    into_consumer=True,
                    # into_cache_only=True,
                    inline_const_tensor=True,
                    disallow_if_then_else=False,
                    require_injective=False,
                    require_ordered=False,
                    disallow_op=None,
                ),
                M.ParallelizeVectorizeUnroll(
                    max_jobs_per_core=-1,  # disable parallelize
                    max_vectorize_extent=-1,  # disable vectorize
                    unroll_max_steps=[0, 16, 64, 512, 1024],
                    unroll_explicit=True,
                ),
            ]

        @staticmethod
        def _postproc():
            from tvm.meta_schedule import (  # pylint: disable=import-outside-toplevel
                postproc as M, )

            return [
                # M.RewriteCooperativeFetch(),
                M.RewriteParallelVectorizeUnroll(),
                M.RewriteReductionBlock(),
                # M.RewriteTensorCore(),
                M.VerifyGPUCode(),
            ]

    with tempfile.TemporaryDirectory() as work_dir:
        sch: Schedule = tune_tir(
            mod=mod,
            target=target,
            config=config,
            work_dir=work_dir,
            space=PostOrderApply(),
            sch_rules=DefaultTensorCore._sch_rules,
            postprocs=DefaultTensorCore._postproc,
            num_threads=None,
        )
        if sch is None:
            print("No valid schedule found!")
        else:
            print(sch.mod.script())
            print(sch.trace)

            from tvm.contrib import nvcc
            import numpy as np

            ctx = tvm.gpu(0)
            if nvcc.have_tensorcore(ctx.compute_version):
                with tvm.transform.PassContext():
                    func = tvm.build(sch.mod["main"], [], "cuda")
                    print(sch.mod.script())
                    print(func.imported_modules[0].get_source())
                a_np = np.random.uniform(size=(n, n)).astype("float16")
                b_np = np.random.uniform(size=(n, n)).astype("float16")
                a = tvm.nd.array(a_np, ctx)
                b = tvm.nd.array(b_np, ctx)
                c = tvm.nd.array(np.zeros((n, n), dtype="float32"), ctx)
                evaluator = func.time_evaluator(func.entry_name,
                                                ctx,
                                                number=3,
                                                repeat=1,
                                                min_repeat_ms=40)
                print("matmul with tensor core: %f ms" %
                      (evaluator(a, b, c).mean * 1e3))

                np.testing.assert_allclose(
                    c.asnumpy(),
                    np.matmul(a_np.astype("float32"), b_np.astype("float32")),
                    rtol=1e-4,
                    atol=1e-4,
                )
Example #20
def conv2d_strategy_cuda(attrs, inputs, out_type, target):
    """conv2d cuda strategy"""
    strategy = _op.OpStrategy()
    data, kernel = inputs
    stride_h, stride_w = attrs.get_int_tuple("strides")
    dilation_h, dilation_w = attrs.get_int_tuple("dilation")
    padding = attrs.get_int_tuple("padding")
    groups = attrs.groups
    layout = attrs.data_layout
    kernel_layout = attrs.kernel_layout
    if dilation_h < 1 or dilation_w < 1:
        raise ValueError("dilation should be positive value")

    if groups == 1:
        if layout == "NCHW":
            assert kernel_layout == "OIHW"
            if ((target.kind.name in ["cuda", "vulkan", "rocm"])
                    and data.dtype in ("int8", "uint8")
                    and kernel.dtype in ("int8", "uint8")):
                assert data.dtype == kernel.dtype
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_nchw_int8),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw_int8),
                    name="conv2d_nchw_int8.cuda",
                )
            else:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_nchw),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw),
                    name="conv2d_nchw.cuda",
                )
            _, _, kh, kw = get_const_tuple(kernel.shape)
            if ((2 < kh < 8 and 2 < kw < 8 and kh == kw)
                    and (stride_h == 1 and stride_w == 1)
                    and (dilation_h == 1 and dilation_w == 1)):
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_nchw_winograd),
                    wrap_topi_schedule(
                        topi.cuda.schedule_conv2d_nchw_winograd),
                    name="conv2d_nchw_winograd.cuda",
                    plevel=5,
                )
        elif layout == "HWCN":
            assert kernel_layout == "HWIO"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.conv2d_hwcn),
                wrap_topi_schedule(topi.cuda.schedule_conv2d_hwcn),
                name="conv2d_hwcn.cuda",
            )
        elif layout == "NHWC" and kernel_layout == "HWIO":
            strategy.add_implementation(
                wrap_compute_conv2d(topi.gpu.conv2d_nhwc),
                wrap_topi_schedule(topi.gpu.schedule_conv2d_nhwc),
                name="conv2d_nhwc.gpu",
            )

            N, H, W, _ = get_const_tuple(data.shape)
            KH, KW, CI, CO = get_const_tuple(kernel.shape)
            # Shape-based checks for which Winograd variants are applicable
            (
                judge_winograd_tensorcore,
                judge_winograd_autotvm,
                judge_winograd_auto_scheduler,
            ) = judge_winograd(
                N,
                H,
                W,
                KH,
                KW,
                CI,
                CO,
                padding,
                stride_h,
                stride_w,
                dilation_h,
                dilation_w,
                data.dtype,
                kernel.dtype,
                pre_flag=False,
            )
            if judge_winograd_autotvm:
                if (target.kind.name == "cuda"
                        and nvcc.have_tensorcore(target=target)
                        and judge_winograd_tensorcore):
                    strategy.add_implementation(
                        wrap_compute_conv2d(
                            topi.cuda.conv2d_nhwc_winograd_tensorcore),
                        wrap_topi_schedule(
                            topi.cuda.schedule_conv2d_nhwc_winograd_tensorcore
                        ),
                        name="conv2d_nhwc_winograd_tensorcore.cuda",
                        plevel=5,
                    )
                else:
                    strategy.add_implementation(
                        wrap_compute_conv2d(
                            topi.cuda.conv2d_nhwc_winograd_direct),
                        wrap_topi_schedule(
                            topi.cuda.schedule_conv2d_nhwc_winograd_direct),
                        name="conv2d_nhwc_winograd_direct.cuda",
                        plevel=5,
                    )
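            # The divisibility constraints below mirror the WMMA fragment
            # shapes Tensor Cores support: m16n16k16, m8n32k16, and m32n8k16
            # (batch -> M, input channels -> K, output channels -> N).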
            if (target.kind.name == "cuda"
                    and nvcc.have_tensorcore(target=target)
                    and ((N % 16 == 0 and CI % 16 == 0 and CO % 16 == 0) or
                         (N % 8 == 0 and CI % 16 == 0 and CO % 32 == 0) or
                         (N % 32 == 0 and CI % 16 == 0 and CO % 8 == 0))):
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_nhwc_tensorcore),
                    wrap_topi_schedule(
                        topi.cuda.schedule_conv2d_nhwc_tensorcore),
                    name="conv2d_nhwc_tensorcore.cuda",
                    plevel=20,
                )

            # register auto-scheduler implementations
            if is_auto_scheduler_enabled() and judge_winograd_auto_scheduler:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.nn.conv2d_winograd_nhwc),
                    naive_schedule,  # this implementation should never be picked by autotvm
                    name="conv2d_nhwc.winograd",
                    plevel=15,
                )

        elif layout == "HWNC":
            assert kernel_layout in [
                "HWOI", "HWOI16o16i", "HWOI8o32i", "HWOI32o16i"
            ]
            _, _, N, in_channels = get_const_tuple(data.shape)
            pre_computed = len(kernel.shape) == 6
            if pre_computed:
                _, _, oc_chunk, _, oc_block_factor, _ = get_const_tuple(
                    kernel.shape)
                out_channels = oc_chunk * oc_block_factor
            else:
                _, _, out_channels, _ = get_const_tuple(kernel.shape)

            tensorcore_dtypes = ["int4", "uint4", "int8", "uint8"]
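            # int4/uint4 data maps to the m8n8k32 WMMA shape and int8/uint8
            # to m8n32k16, which is where the divisibility requirements below
            # come from.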
            if (target.kind.name == "cuda"
                    and nvcc.have_tensorcore(target=target)
                    and kernel.dtype in tensorcore_dtypes and
                ((data.dtype in ["int4", "uint4"] and N % 8 == 0
                  and in_channels % 32 == 0 and out_channels % 8 == 0) or
                 (data.dtype in ["int8", "uint8"] and N % 8 == 0
                  and in_channels % 16 == 0 and out_channels % 32 == 0))):
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_hwnc_tensorcore),
                    wrap_topi_schedule(
                        topi.cuda.schedule_conv2d_hwnc_tensorcore),
                    name="conv2d_hwnc_tensorcore_direct.cuda",
                    plevel=20,
                )
            else:
                raise RuntimeError("Unsupported shape for conv2d HWNC.\
                                    Need to satisfy tensor core schedule.")
        elif ((target.kind.name in ["cuda", "vulkan", "rocm"])
              and layout == "NCHW4c" and data.dtype in ["int8", "uint8"]):
            assert kernel_layout == "OIHW4o4i"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.conv2d_NCHWc_int8, True),
                wrap_topi_schedule(topi.cuda.schedule_conv2d_NCHWc_int8),
                name="conv2d_NCHWc_int8.cuda",
            )
        elif target.kind.name == "cuda" and "cudnn" not in target.libs:
            # No TVM native kernel applicable
            raise RuntimeError(
                "Unsupported conv2d layout {} for CUDA".format(layout))

        if (target.kind.name == "cuda" and "cudnn" in target.libs
                and layout in ["NCHW", "NHWC"] and padding[0] == padding[2]
                and padding[1] == padding[3]
                and not (data.dtype in ["uint8", "int8"]
                         or kernel.dtype in ["uint8", "int8"])):
            # add cudnn implementation
            if layout == "NHWC":
                assert kernel_layout == "OHWI"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.conv2d_cudnn,
                                    need_data_layout=True,
                                    has_groups=True),
                wrap_topi_schedule(topi.cuda.schedule_conv2d_cudnn),
                name="conv2d_cudnn.cuda",
                plevel=25,
            )

    elif is_depthwise_conv2d(
            data.shape, layout, kernel.shape, kernel_layout, groups) and (
                layout == "NCHW" or "cudnn" not in target.libs
            ):  # cuDNN requires a different kernel layout for NHWC inputs.
        if layout == "NCHW":
            assert kernel_layout == "OIHW"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.depthwise_conv2d_nchw),
                wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nchw),
                name="depthwise_conv2d_nchw.cuda",
            )
        elif layout == "NHWC":
            assert kernel_layout == "HWOI"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc),
                wrap_topi_schedule(topi.cuda.schedule_depthwise_conv2d_nhwc),
                name="depthwise_conv2d_nhwc.cuda",
            )
        else:
            raise RuntimeError(
                "Unsupported depthwise_conv2d layout {}".format(layout))
    else:  # group_conv2d
        # add cudnn implementation, if any
        cudnn_impl = False
        if target.kind.name == "cuda" and "cudnn" in target.libs:
            if (layout in ["NCHW", "NHWC"] and padding[0] == padding[2]
                    and padding[1] == padding[3]
                    and not (data.dtype in ["uint8", "int8"]
                             or kernel.dtype in ["uint8", "int8"])):
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.conv2d_cudnn,
                                        need_data_layout=True,
                                        has_groups=True),
                    wrap_topi_schedule(topi.cuda.schedule_conv2d_cudnn),
                    name="conv2d_cudnn.cuda",
                    plevel=25,
                )
                cudnn_impl = True

        if layout == "NCHW":
            assert kernel_layout == "OIHW"
            _, channels, _, _ = get_const_tuple(data.shape)
            out_channels, in_channels, _, _ = get_const_tuple(kernel.shape)
            oc_chunk = out_channels // 4
            ic_chunk = in_channels // 4
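            # The int8 schedule packs channels in blocks of 4, so both channel
            # counts must be divisible by 4 and every group must span at least
            # one 4-channel chunk.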

            if ((target.kind.name in ["cuda", "vulkan", "rocm"])
                    and data.dtype in ["int8", "uint8"]
                    and kernel.dtype in ["int8", "uint8"]
                    and channels % groups == 0 and out_channels % groups == 0
                    and channels % 4 == 0 and out_channels % 4 == 0
                    and groups <= oc_chunk and groups <= ic_chunk):
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.group_conv2d_nchw_int8,
                                        has_groups=True),
                    wrap_topi_schedule(
                        topi.cuda.schedule_group_conv2d_nchw_int8),
                    name="group_conv2d_nchw_int8.cuda",
                )
            else:
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.cuda.group_conv2d_nchw,
                                        has_groups=True),
                    wrap_topi_schedule(topi.cuda.schedule_group_conv2d_nchw),
                    name="group_conv2d_nchw.cuda",
                )
        elif layout == "NCHW4c" and data.dtype in ["int8", "uint8"]:
            assert kernel_layout == "OIHW4o4i"
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.group_conv2d_NCHWc_int8,
                                    has_groups=True),
                wrap_topi_schedule(topi.cuda.schedule_group_conv2d_NCHWc_int8),
                name="group_conv2d_NCHWc_int8.cuda",
            )
        elif not cudnn_impl:
            raise RuntimeError(
                "Unsupported group_conv2d layout {}".format(layout))
    return strategy
Example #21
def conv2d_winograd_without_weight_transfrom_strategy_cuda(attrs, inputs, out_type, target):
    """conv2d_winograd_without_weight_transfrom cuda strategy"""
    dilation = attrs.get_int_tuple("dilation")
    groups = attrs.get_int("groups")
    layout = attrs.data_layout
    data, kernel = inputs
    stride_h, stride_w = attrs.get_int_tuple("strides")
    padding = attrs.get_int_tuple("padding")
    assert dilation == (1, 1), "Does not support dilation for now"
    assert groups == 1, "Does not support arbitrary group number"
    strategy = _op.OpStrategy()
    if layout == "NCHW":
        strategy.add_implementation(
            wrap_compute_conv2d(topi.cuda.conv2d_nchw_winograd_without_weight_transform),
            wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw_winograd_without_weight_transform),
            name="conv2d_nchw_winograd_without_weight_transform.cuda",
        )
    elif layout == "NHWC":
        N, H, W, _ = get_const_tuple(data.shape)
        alpha, _, CI, CO = get_const_tuple(kernel.shape)
        dilation_h, dilation_w = dilation
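        # The kernel is already Winograd-transformed at this point, so its
        # leading dimension is the tile size alpha and pre_flag is True.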
        judge_winograd_tensorcore, _, _ = judge_winograd(
            N,
            H,
            W,
            alpha,
            alpha,
            CI,
            CO,
            padding,
            stride_h,
            stride_w,
            dilation_h,
            dilation_w,
            data.dtype,
            kernel.dtype,
            pre_flag=True,
        )
        if (
            target.kind.name == "cuda"
            and nvcc.have_tensorcore(target=target)
            and judge_winograd_tensorcore
        ):
            strategy.add_implementation(
                wrap_compute_conv2d(
                    topi.cuda.conv2d_nhwc_winograd_tensorcore_without_weight_transform
                ),
                wrap_topi_schedule(
                    topi.cuda.schedule_conv2d_nhwc_winograd_tensorcore_without_weight_transform
                ),
                name="conv2d_nhwc_winograd_tensorcore_without_weight_transform.cuda",
            )
        else:
            strategy.add_implementation(
                wrap_compute_conv2d(topi.cuda.conv2d_nhwc_winograd_direct_without_weight_transform),
                wrap_topi_schedule(
                    topi.cuda.schedule_conv2d_nhwc_winograd_direct_without_weight_transform
                ),
                name="conv2d_nhwc_winograd_direct_without_weight_transform.cuda",
            )

        if is_auto_scheduler_enabled():
            strategy.add_implementation(
                wrap_compute_conv2d(topi.nn.conv2d_winograd_nhwc_without_weight_transform),
                naive_schedule,  # this implementation should never be picked by autotvm
                name="conv2d_nhwc_winograd_without_weight_transform",
                plevel=15,
            )
    else:
        raise RuntimeError(
            "Unsupported conv2d_winograd_without_weight_transfrom layout {}".format(layout)
        )
    return strategy
Example #22
    return s, [A, B, C]


###############################################################################
# AutoTune and Test
# -----------------
# Finally, we use a tuner to tune the schedule, generate code with the best
# config, and run the kernel, comparing against numpy to verify the results.

# check whether the GPU has Tensor Cores
if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
    raise Exception(
        "skip building this tutorial because CUDA is not enabled.")

dev = tvm.gpu()
if not nvcc.have_tensorcore(dev.compute_version):
    raise Exception("the gpu has no tensorcore, skipping...")

M, N, L = 512, 32, 512
dtype = "float16"
layout = "NN"
if len(sys.argv) >= 4:
    M, N, L = int(sys.argv[1]), int(sys.argv[2]), int(sys.argv[3])
if len(sys.argv) >= 5:
    dtype = sys.argv[4]
if len(sys.argv) >= 6:
    layout = sys.argv[5]

# check whether the current GPU arch supports WMMA codegen for the current dtype
# (device_type 2 is CUDA; attribute kind 4 is the compute version)
cuda_compute_capability = tvm.runtime._ffi_api.GetDeviceAttr(2, 0, 4)
major, minor = nvcc.parse_compute_version(cuda_compute_capability)
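###############################################################################
# The tuning loop itself is elided from this excerpt. Below is a minimal
# sketch of what it typically looks like with AutoTVM; the task name
# "tutorial/test_gemm" and the log file "matmul.log" are assumptions for
# illustration, not part of the original example.

from tvm import autotvm
from tvm.autotvm.tuner import XGBTuner

# The template would be registered elsewhere with @autotvm.template under the
# assumed name "tutorial/test_gemm".
task = autotvm.task.create(
    "tutorial/test_gemm", args=(N, L, M, dtype, layout), target="cuda")
print(task.config_space)

measure_option = autotvm.measure_option(
    builder="local", runner=autotvm.LocalRunner(number=5))

# Tune with an XGBoost-based cost model and log the best configs to a file.
tuner = XGBTuner(task)
tuner.tune(
    n_trial=1000,
    measure_option=measure_option,
    callbacks=[autotvm.callback.log_to_file("matmul.log")],
)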
Example #23
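# Replace the marked loop nests with WMMA intrinsics: fragment loads for the
# two operands, the matrix multiply-accumulate, and the store of the result.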
s[AF].tensorize(AF.op.axis[-2], intrin_wmma_load_matrix('wmma.matrix_a'))
s[WF].tensorize(WF.op.axis[-2], intrin_wmma_load_matrix('wmma.matrix_b'))
s[Conv].tensorize(nnc, intrin_wmma_store_matrix())
s[ConvF].tensorize(nnf, intrin_wmma_gemm())
print(tvm.lower(s, [A, W, Conv], simple_mode=True))

###############################################################################
# Generate CUDA Kernel
# --------------------
# Finally, we use TVM to generate and compile the CUDA kernel and evaluate the
# latency of the convolution. Since Tensor Cores are only supported on NVIDIA
# GPUs with compute capability 7.0 or higher, it may not be possible to run
# this on our build server.

ctx = tvm.gpu(0)
if nvcc.have_tensorcore(ctx.compute_version):
    with tvm.transform.PassContext(
            config={"tir.UnrollLoop": {
                "auto_max_step": 16
            }}):
        func = tvm.build(s, [A, W, Conv], 'cuda')
    a_np = np.random.uniform(size=data_shape).astype(A.dtype)
    w_np = np.random.uniform(size=kernel_shape).astype(W.dtype)
    a = tvm.nd.array(a_np, ctx)
    w = tvm.nd.array(w_np, ctx)
    c = tvm.nd.array(np.zeros(output_shape, dtype=Conv.dtype), ctx)
    evaluator = func.time_evaluator(func.entry_name, ctx, number=10)
    print('conv2d with tensor core: %f ms' % (evaluator(a, w, c).mean * 1e3))
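###############################################################################
# The run above only measures latency. For completeness, the result can also
# be verified against a numpy reference, as in the matmul example. A minimal
# sketch, assuming plain NHWC data `a_plain_np`, HWIO weights `w_plain_np`,
# and `stride`/`padding` values are available (the packed layout used above
# would need to be rearranged back into NHWC first):
#
#     c_ref = tvm.topi.testing.conv2d_nhwc_python(a_plain_np, w_plain_np,
#                                                 stride, padding)
#     tvm.testing.assert_allclose(c_nhwc, c_ref, rtol=1e-3)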

###############################################################################
# Summary