Example No. 1
 def test_conv2d(
     self,
     conv2d_impl,
     shape_nhwc,
     shape_oihw,
     shape_oihw8i32o4i,
     kernel,
     stride,
     pad,
     dtype,
     target,
 ):
     inputs = [
         np.random.uniform(0, 255, size=shape_nhwc).astype(dtype),
         np.random.uniform(0, 255, size=shape_oihw8i32o4i).astype(dtype),
     ]
     np_filter = (
         inputs[1]
         .transpose(0, 5, 1, 4, 6, 2, 3)
         .reshape(shape_oihw)
         .transpose(2, 3, 1, 0)
     )
     ref_output = testing.conv2d_nhwc_python(inputs[0], np_filter, stride, pad)
     output = build_and_run(
         inputs,
         conv2d_impl,
         target,
         target,
         shape_nhwc=shape_nhwc,
         shape_oihw8i32o4i=shape_oihw8i32o4i,
         kernel_size=(kernel, kernel),
         stride=(stride, stride),
         padding=(pad, pad, pad, pad),
         dtype=dtype,
     )
     return output, ref_output
Example No. 2
    def test_conv2d(
        self,
        batch,
        in_size,
        in_channel,
        pad,
        stride,
        kernel_size,
        out_channel,
        k_split_factor,
        h_split_factor,
        dtype,
        target,
    ):
        # TODO: no support for dilation
        dilation = 1

        shape_input = [batch, in_size, in_size, in_channel]
        shape_filter_oihw = [out_channel, in_channel, kernel_size, kernel_size]
        shape_filter_oihw8i32o4i = get_packed_filter_shape(shape_filter_oihw)

        inputs = [
            np.random.uniform(0, 255, size=shape_input).astype(dtype),
            np.random.uniform(0, 255, size=shape_filter_oihw8i32o4i).astype(dtype),
        ]
        np_filter = (
            inputs[1]
            .transpose(0, 5, 1, 4, 6, 2, 3)
            .reshape(shape_filter_oihw)
            .transpose(2, 3, 1, 0)
        )
        ref_output = testing.conv2d_nhwc_python(inputs[0], np_filter, stride, pad)
        output = build_and_run(
            inputs,
            conv2d_nhwc8h8w32c,
            target,
            target,
            shape_input=shape_input,
            pad=(pad, pad, pad, pad),
            stride=(stride, stride),
            dilation=(dilation, dilation),
            shape_filter=shape_filter_oihw8i32o4i,
            k_split_factor=k_split_factor,
            h_split_factor=h_split_factor,
            dtype=dtype,
        )

        conv2d_verify(output, ref_output, dtype)
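
Example No. 2 calls a get_packed_filter_shape helper that lives in the project's test infrastructure and is not shown on this page. Below is a minimal NumPy sketch of what it could compute for the oihw8i32o4i layout, inferred from the transpose chain in the tests and assuming both channel counts are multiples of 32; pack_filter_oihw8i32o4i is a hypothetical helper added only to demonstrate the round trip.

import numpy as np

# Sketch only: the real get_packed_filter_shape may handle padding of channel
# counts; this version assumes O and I are exact multiples of 32.
def get_packed_filter_shape(shape_oihw):
    out_channel, in_channel, kernel_h, kernel_w = shape_oihw
    return [
        out_channel // 32,  # o outer
        in_channel // 32,   # i outer
        kernel_h,
        kernel_w,
        8,                  # i chunk
        32,                 # o chunk
        4,                  # i chunk (8 * 4 = 32 input channels per block)
    ]

# Hypothetical packing helper: the inverse of the unpacking transpose chain
# used in the tests above.
def pack_filter_oihw8i32o4i(filter_oihw):
    o, i, h, w = filter_oihw.shape
    blocked = filter_oihw.reshape(o // 32, 32, i // 32, 8, 4, h, w)
    # (o_outer, o32, i_outer, i8, i4, h, w) -> (o_outer, i_outer, h, w, i8, o32, i4)
    return blocked.transpose(0, 2, 5, 6, 3, 1, 4)

# Round trip: packing and then undoing it with the transpose chain from the
# test recovers the original OIHW filter.
f = np.arange(64 * 32 * 3 * 3).reshape(64, 32, 3, 3)
packed = pack_filter_oihw8i32o4i(f)
assert packed.shape == tuple(get_packed_filter_shape(f.shape))
restored = packed.transpose(0, 5, 1, 4, 6, 2, 3).reshape(f.shape)
assert (restored == f).all()
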
Example No. 3
    def test_conv2d(self, shape_nhwc, shape_oihw, kernel, stride, pad, dtype, target):
        inputs = [
            np.random.uniform(0, 255, size=shape_nhwc).astype(dtype),
            np.random.uniform(0, 255, size=shape_oihw).astype(dtype),
        ]
        np_filter = inputs[1].transpose(2, 3, 1, 0)
        ref_output = testing.conv2d_nhwc_python(inputs[0], np_filter, stride, pad)
        output = build_and_run(
            inputs,
            conv2d_logical,
            target,
            target,
            shape_nhwc=shape_nhwc,
            shape_oihw=shape_oihw,
            kernel_size=(kernel, kernel),
            stride=(stride, stride),
            padding=(pad, pad, pad, pad),
            dtype=dtype,
        )

        # nhwc8h8w32c -> nhwc
        output = output.transpose(0, 1, 4, 2, 5, 3, 6).reshape(
            output.shape[0],
            output.shape[1] * output.shape[4],
            output.shape[2] * output.shape[5],
            output.shape[3] * output.shape[6],
        )

        # slice output to match ref_output shape
        # e.g. 8x8 spatial 3x3 filter = 6x6 ref output
        # but still 8x8 output given the blocked layout
        output = output[
            0 : ref_output.shape[0] : 1,
            0 : ref_output.shape[1] : 1,
            0 : ref_output.shape[2] : 1,
            0 : ref_output.shape[3] : 1,
        ]

        if "int" in dtype:
            tol = {"atol": 0, "rtol": 0}
        elif dtype == "float32":
            tol = {"rtol": 1e-4, "atol": 2e-4}
        tvm.testing.assert_allclose(output, ref_output, **tol)
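
Example No. 3 unpacks a blocked nhwc8h8w32c output back to NHWC before comparing it with the reference. For orientation, here is the inverse transform as a standalone NumPy sketch (an illustration, not part of the test), assuming H and W are multiples of 8 and C is a multiple of 32.

import numpy as np

# Hypothetical helper: pack an NHWC tensor into the nhwc8h8w32c blocked layout
# that the test above unpacks.
def pack_nhwc_to_nhwc8h8w32c(x):
    n, h, w, c = x.shape
    blocked = x.reshape(n, h // 8, 8, w // 8, 8, c // 32, 32)
    # (n, ho, hi, wo, wi, co, ci) -> (n, ho, wo, co, hi, wi, ci)
    return blocked.transpose(0, 1, 3, 5, 2, 4, 6)

# Round trip with the unpacking used in the test.
x = np.random.uniform(size=(1, 16, 16, 64)).astype("float32")
packed = pack_nhwc_to_nhwc8h8w32c(x)
unpacked = packed.transpose(0, 1, 4, 2, 5, 3, 6).reshape(
    packed.shape[0],
    packed.shape[1] * packed.shape[4],
    packed.shape[2] * packed.shape[5],
    packed.shape[3] * packed.shape[6],
)
np.testing.assert_array_equal(unpacked, x)
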
Example No. 4
    def test_conv2d(
        self,
        conv2d_impl,
        shape_nhwc,
        shape_oihw,
        kernel,
        stride,
        pad,
        dtype,
        target,
        k_split_factor,
        h_split_factor,
    ):
        inputs = [
            np.random.uniform(0, 255, size=shape_nhwc).astype(dtype),
            np.random.uniform(0, 255, size=shape_oihw).astype(dtype),
        ]
        np_filter = inputs[1].transpose(2, 3, 1, 0)
        ref_output = testing.conv2d_nhwc_python(inputs[0], np_filter, stride, pad)
        output = build_and_run(
            inputs,
            conv2d_impl,
            target,
            target,
            shape_nhwc=shape_nhwc,
            shape_filter=shape_oihw,
            kernel_size=(kernel, kernel),
            stride=(stride, stride),
            padding=(pad, pad, pad, pad),
            dtype=dtype,
            k_split_factor=k_split_factor,
            h_split_factor=h_split_factor,
        )

        verify_conv2d(output, ref_output, dtype)
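
verify_conv2d (and the conv2d_verify used by Examples No. 2 and No. 6) is not shown on this page. Judging from the inline verification in Example No. 3, such a helper might look roughly like the sketch below; the name matches the call above, but the body is an assumption.

import tvm.testing

# Sketch of a verification helper in the spirit of Example No. 3: unpack the
# blocked nhwc8h8w32c output, crop it to the reference shape, and compare with
# dtype-dependent tolerances. The real helper may differ.
def verify_conv2d(output, ref_output, dtype):
    # nhwc8h8w32c -> nhwc
    output = output.transpose(0, 1, 4, 2, 5, 3, 6).reshape(
        output.shape[0],
        output.shape[1] * output.shape[4],
        output.shape[2] * output.shape[5],
        output.shape[3] * output.shape[6],
    )
    # Crop the padded blocked output down to the reference shape.
    output = output[
        : ref_output.shape[0],
        : ref_output.shape[1],
        : ref_output.shape[2],
        : ref_output.shape[3],
    ]
    if "int" in dtype:
        tol = {"atol": 0, "rtol": 0}
    elif dtype == "float32":
        tol = {"rtol": 1e-4, "atol": 2e-4}
    else:
        raise RuntimeError(f"No tolerances defined for dtype {dtype}")
    tvm.testing.assert_allclose(output, ref_output, **tol)
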
Example No. 5
def test_tensor_core_batch_conv():
    if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
        print("skip because cuda is not enabled..")
        return
    if not nvcc.have_tensorcore(tvm.gpu(0).compute_version):
        print("skip because gpu does not support tensor core")
        return

    # The sizes of inputs and filters
    batch_size = 32
    height = 14
    width = 14
    in_channels = 32
    out_channels = 64
    kernel_h = 3
    kernel_w = 3
    pad_h = 1
    pad_w = 1
    stride_h = 1
    stride_w = 1
    block_size = 16

    block_row_warps = 2
    block_col_warps = 4
    warp_row_tiles = 4
    warp_col_tiles = 2
    warp_size = 32
    chunk = 2

    # Input feature map: (N, H, W, IC, n, ic)
    data_shape = (batch_size // block_size, height, width,
                  in_channels // block_size, block_size, block_size)
    # Kernel: (H, W, IC, OC, ic, oc)
    kernel_shape = (kernel_h, kernel_w, in_channels // block_size,
                    out_channels // block_size, block_size, block_size)

    # Output feature map: (N, H, W, OC, n, oc)
    output_shape = (batch_size // block_size, height, width,
                    out_channels // block_size, block_size, block_size)

    assert (batch_size % block_size == 0)
    assert (in_channels % block_size == 0)
    assert (out_channels % block_size == 0)

    kh = te.reduce_axis((0, kernel_h), name='kh')
    kw = te.reduce_axis((0, kernel_w), name='kw')
    ic = te.reduce_axis((0, in_channels // block_size), name='ic')
    ii = te.reduce_axis((0, block_size), name='ii')

    # Algorithm
    A = te.placeholder(data_shape, name='A', dtype="float16")
    W = te.placeholder(kernel_shape, name='W', dtype="float16")
    Apad = te.compute(
        (batch_size // block_size, height + 2 * pad_h, width + 2 * pad_w,
         in_channels // block_size, block_size, block_size),
        lambda n, h, w, i, nn, ii: tvm.tir.if_then_else(
            tvm.tir.all(h >= pad_h, h - pad_h < height, w >= pad_w, w - pad_w <
                        width), A[n, h - pad_h, w - pad_w, i, nn, ii],
            tvm.tir.const(0., "float16")),
        name='Apad')
    Conv = te.compute(
        output_shape,
        lambda n, h, w, o, nn, oo: te.sum(Apad[
            n, h * stride_h + kh, w * stride_w + kw, ic, nn, ii].astype(
                "float32") * W[kh, kw, ic, o, ii, oo].astype("float32"),
                                          axis=[ic, kh, kw, ii]),
        name="Conv")

    s = te.create_schedule(Conv.op)
    s[Apad].compute_inline()

    AS = s.cache_read(Apad, 'shared', [Conv])
    WS = s.cache_read(W, 'shared', [Conv])
    AF = s.cache_read(AS, 'wmma.matrix_a', [Conv])
    WF = s.cache_read(WS, 'wmma.matrix_b', [Conv])
    ConvF = s.cache_write(Conv, 'wmma.accumulator')

    block_x = te.thread_axis('blockIdx.x')
    block_y = te.thread_axis('blockIdx.y')
    block_z = te.thread_axis('blockIdx.z')
    thread_x = te.thread_axis('threadIdx.x')
    thread_y = te.thread_axis('threadIdx.y')
    thread_z = te.thread_axis('threadIdx.z')

    nc, hc, wc, oc, nnc, ooc = Conv.op.axis
    block_k = s[Conv].fuse(hc, wc)
    s[Conv].bind(block_k, block_z)
    nc, nci = s[Conv].split(nc, factor=warp_row_tiles)
    block_i, nc = s[Conv].split(nc, factor=block_row_warps)
    oc, oci = s[Conv].split(oc, factor=warp_col_tiles)
    block_j, oc = s[Conv].split(oc, factor=block_col_warps)
    s[Conv].reorder(block_k, block_i, block_j, nc, oc, nci, oci, nnc, ooc)
    s[Conv].bind(block_i, block_x)
    s[Conv].bind(block_j, block_y)
    s[Conv].bind(nc, thread_y)
    s[Conv].bind(oc, thread_z)

    s[ConvF].compute_at(s[Conv], oc)
    n, h, w, o, nnf, oof = ConvF.op.axis
    ko, ki = s[ConvF].split(ic, factor=chunk)
    s[ConvF].reorder(ko, kh, ki, kw, n, o, nnf, oof, ii)

    s[AF].compute_at(s[ConvF], kw)
    s[WF].compute_at(s[ConvF], kw)

    s[WS].compute_at(s[ConvF], kh)
    s[AS].compute_at(s[ConvF], kh)

    n, h, w, i, nn, ii = AS.op.axis
    tx, xo = s[AS].split(n, nparts=block_row_warps)
    ty, yo = s[AS].split(xo, nparts=block_col_warps)
    t = s[AS].fuse(nn, ii)
    to, ti = s[AS].split(t, factor=warp_size)
    s[AS].bind(tx, thread_y)
    s[AS].bind(ty, thread_z)
    s[AS].bind(ti, thread_x)

    kh, kw, ic, o, ii, oo = WS.op.axis
    tx, xo = s[WS].split(o, nparts=block_row_warps)
    ty, yo = s[WS].split(xo, nparts=block_col_warps)
    t = s[WS].fuse(ii, oo)
    to, ti = s[WS].split(t, nparts=warp_size)
    s[WS].bind(tx, thread_y)
    s[WS].bind(ty, thread_z)
    s[WS].bind(to, thread_x)
    s[WS].vectorize(ti)

    s[AF].tensorize(AF.op.axis[-2],
                    intrin_wmma_load_matrix((16, 16, 16), 'wmma.matrix_a'))
    s[WF].tensorize(WF.op.axis[-2],
                    intrin_wmma_load_matrix((16, 16, 16), 'wmma.matrix_b'))
    s[Conv].tensorize(nnc, intrin_wmma_store_matrix((16, 16, 16)))
    s[ConvF].tensorize(nnf, intrin_wmma_gemm((16, 16, 16)))

    func = tvm.build(s, [A, W, Conv], 'cuda')

    ctx = tvm.gpu(0)
    a_np = np.random.uniform(size=data_shape).astype(A.dtype)
    w_np = np.random.uniform(size=kernel_shape).astype(W.dtype)
    a = tvm.nd.array(a_np, ctx)
    w = tvm.nd.array(w_np, ctx)
    c = tvm.nd.array(np.zeros(output_shape, dtype=Conv.dtype), ctx)
    evaluator = func.time_evaluator(func.entry_name, ctx, number=3)
    print('conv2d with tensor core: %f ms' % (evaluator(a, w, c).mean * 1e3))

    if VERIFY:
        func(a, w, c)
        a_np = a_np.transpose(0, 4, 1, 2, 3,
                              5).reshape(batch_size, height, width,
                                         in_channels)
        w_np = w_np.transpose(0, 1, 2, 4, 3,
                              5).reshape(kernel_h, kernel_w, in_channels,
                                         out_channels)
        c_np = c.asnumpy().transpose(
            (0, 4, 1, 2, 3, 5)).reshape(batch_size, height, width,
                                        out_channels)
        c_std = conv2d_nhwc_python(a_np.astype(Conv.dtype),
                                   w_np.astype(Conv.dtype),
                                   (stride_h, stride_w),
                                   (pad_h, pad_w)).astype(Conv.dtype)
        np.testing.assert_allclose(c_np, c_std, rtol=1e-4, atol=1e-4)
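
The test above generates its inputs directly in the blocked (N//16, H, W, IC//16, 16, 16) and (KH, KW, IC//16, OC//16, 16, 16) layouts and only converts back to NHWC/HWIO in the VERIFY branch. For completeness, here is a NumPy sketch of the forward packing, i.e. the inverse of those transposes (an illustration, not part of the test).

import numpy as np

BLOCK = 16  # block_size used above

# Hypothetical packing helpers: the inverse of the transpose/reshape pairs in
# the VERIFY branch of the test above.
def pack_nhwc_data(x):
    n, h, w, c = x.shape
    blocked = x.reshape(n // BLOCK, BLOCK, h, w, c // BLOCK, BLOCK)
    # (n_o, n_i, h, w, ic_o, ic_i) -> (n_o, h, w, ic_o, n_i, ic_i)
    return blocked.transpose(0, 2, 3, 4, 1, 5)

def pack_hwio_kernel(k):
    kh, kw, ic, oc = k.shape
    blocked = k.reshape(kh, kw, ic // BLOCK, BLOCK, oc // BLOCK, BLOCK)
    # (kh, kw, ic_o, ic_i, oc_o, oc_i) -> (kh, kw, ic_o, oc_o, ic_i, oc_i)
    return blocked.transpose(0, 1, 2, 4, 3, 5)

# Round trip against the transpose used in the VERIFY branch.
a = np.random.uniform(size=(32, 14, 14, 32)).astype("float16")
packed = pack_nhwc_data(a)
restored = packed.transpose(0, 4, 1, 2, 3, 5).reshape(32, 14, 14, 32)
np.testing.assert_array_equal(restored, a)
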
Example No. 6
    def test_conv2d(
        self,
        batch,
        in_size,
        in_channel,
        pad1,
        stride1,
        kernel_size1,
        out_channel1,
        stride2,
        kernel_size2,
        out_channel2,
        k_split_factor,
        h_split_factor,
        dtype,
        target,
    ):
        # TODO: no support for padding in conv2d #2
        pad2 = 0

        # TODO: no support for dilation
        dilation1 = 1
        dilation2 = 1

        shape_input = [batch, in_size, in_size, in_channel]
        shape_filter1_oihw = [
            out_channel1, in_channel, kernel_size1, kernel_size1
        ]
        shape_filter1_oihw8i32o4i = get_packed_filter_shape(shape_filter1_oihw)

        shape_filter2_oihw = [
            out_channel2, out_channel1, kernel_size2, kernel_size2
        ]
        shape_filter2_oihw8i32o4i = get_packed_filter_shape(shape_filter2_oihw)

        inputs = [
            np.random.uniform(0, 255, size=shape_input).astype(dtype),
            np.random.uniform(0, 255,
                              size=shape_filter1_oihw8i32o4i).astype(dtype),
            np.random.uniform(0, 255,
                              size=shape_filter2_oihw8i32o4i).astype(dtype),
        ]
        np_filter1 = (
            inputs[1]
            .transpose(0, 5, 1, 4, 6, 2, 3)
            .reshape(shape_filter1_oihw)
            .transpose(2, 3, 1, 0)
        )
        np_filter2 = (
            inputs[2]
            .transpose(0, 5, 1, 4, 6, 2, 3)
            .reshape(shape_filter2_oihw)
            .transpose(2, 3, 1, 0)
        )
        temp_output = testing.conv2d_nhwc_python(inputs[0], np_filter1, stride1, pad1)
        ref_output = testing.conv2d_nhwc_python(temp_output, np_filter2, stride2, pad2)
        output = build_and_run(
            inputs,
            conv2dconv2d_nhwc8h8w32c,
            target,
            target,
            shape_input=shape_input,
            pad1=(pad1, pad1, pad1, pad1),
            stride1=(stride1, stride1),
            dilation1=(dilation1, dilation1),
            shape_filter1=shape_filter1_oihw8i32o4i,
            pad2=(pad2, pad2, pad2, pad2),
            stride2=(stride2, stride2),
            dilation2=(dilation2, dilation2),
            shape_filter2=shape_filter2_oihw8i32o4i,
            k_split_factor=k_split_factor,
            h_split_factor=h_split_factor,
            dtype=dtype,
        )

        conv2d_verify(output, ref_output, dtype)
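
A note on the chained shapes in Example No. 6: with dilation fixed to 1, each convolution's spatial output size follows the usual floor((size + 2*pad - kernel) / stride) + 1 rule, which is also what testing.conv2d_nhwc_python produces for temp_output and ref_output. A small illustrative helper (not part of the test):

# Illustrative helper: spatial output size of a convolution with dilation = 1.
def conv_out_size(size, kernel, stride, pad):
    return (size + 2 * pad - kernel) // stride + 1

# e.g. in_size=64, kernel_size1=3, stride1=1, pad1=1 keeps 64x64 after the
# first conv2d; kernel_size2=3, stride2=2, pad2=0 then yields 31x31.
assert conv_out_size(conv_out_size(64, 3, 1, 1), 3, 2, 0) == 31
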
Example No. 7
 def expected_output_np(self, input_np, dilated_weights_np, stride):
     ref_np = conv2d_nhwc_python(input_np.astype("float32"),
                                 dilated_weights_np.astype("float32"),
                                 stride,
                                 padding=0).astype("float16")
     return ref_np
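
Example No. 7 consumes a dilated_weights_np fixture that is not shown here. A common way to build such a tensor from HWIO weights is to insert zeros between the filter taps; the helper below only illustrates that idea and is not the fixture from the source test.

import numpy as np

# Hypothetical helper: expand an HWIO weight tensor by inserting
# (dilation - 1) zero rows/columns between filter taps.
def dilate_hwio_weights(weights, dilation):
    kh, kw, ic, oc = weights.shape
    dkh = (kh - 1) * dilation + 1
    dkw = (kw - 1) * dilation + 1
    dilated = np.zeros((dkh, dkw, ic, oc), dtype=weights.dtype)
    dilated[::dilation, ::dilation, :, :] = weights
    return dilated
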
Example No. 8
def test_tensor_core_batch_conv():
    # The sizes of inputs and filters
    batch_size = 32
    height = 14
    width = 14
    in_channels = 32
    out_channels = 64
    kernel_h = 3
    kernel_w = 3
    pad_h = 1
    pad_w = 1
    stride_h = 1
    stride_w = 1
    block_size = 16

    block_row_warps = 2
    block_col_warps = 4
    warp_row_tiles = 4
    warp_col_tiles = 2
    warp_size = 32
    chunk = 2

    # Input feature map: (N, H, W, IC, n, ic)
    data_shape = (
        batch_size // block_size,
        height,
        width,
        in_channels // block_size,
        block_size,
        block_size,
    )
    # Kernel: (H, W, IC, OC, ic, oc)
    kernel_shape = (
        kernel_h,
        kernel_w,
        in_channels // block_size,
        out_channels // block_size,
        block_size,
        block_size,
    )

    # Output feature map: (N, H, W, OC, n, oc)
    output_shape = (
        batch_size // block_size,
        height,
        width,
        out_channels // block_size,
        block_size,
        block_size,
    )

    assert batch_size % block_size == 0
    assert in_channels % block_size == 0
    assert out_channels % block_size == 0

    kh = te.reduce_axis((0, kernel_h), name="kh")
    kw = te.reduce_axis((0, kernel_w), name="kw")
    ic = te.reduce_axis((0, in_channels // block_size), name="ic")
    ii = te.reduce_axis((0, block_size), name="ii")

    # Algorithm
    A = te.placeholder(data_shape, name="A", dtype="float16")
    W = te.placeholder(kernel_shape, name="W", dtype="float16")
    Apad = te.compute(
        (
            batch_size // block_size,
            height + 2 * pad_h,
            width + 2 * pad_w,
            in_channels // block_size,
            block_size,
            block_size,
        ),
        lambda n, h, w, i, nn, ii: tvm.tir.if_then_else(
            tvm.tir.all(h >= pad_h, h - pad_h < height, w >= pad_w, w - pad_w <
                        width),
            A[n, h - pad_h, w - pad_w, i, nn, ii],
            tvm.tir.const(0.0, "float16"),
        ),
        name="Apad",
    )
    Conv = te.compute(
        output_shape,
        lambda n, h, w, o, nn, oo: te.sum(
            Apad[n, h * stride_h + kh, w * stride_w + kw, ic, nn, ii].astype(
                "float32") * W[kh, kw, ic, o, ii, oo].astype("float32"),
            axis=[ic, kh, kw, ii],
        ),
        name="Conv",
    )

    s = te.create_schedule(Conv.op)
    s[Apad].compute_inline()

    AS = s.cache_read(Apad, "shared", [Conv])
    WS = s.cache_read(W, "shared", [Conv])
    AF = s.cache_read(AS, "wmma.matrix_a", [Conv])
    WF = s.cache_read(WS, "wmma.matrix_b", [Conv])
    ConvF = s.cache_write(Conv, "wmma.accumulator")

    block_x = te.thread_axis("blockIdx.x")
    block_y = te.thread_axis("blockIdx.y")
    block_z = te.thread_axis("blockIdx.z")
    thread_x = te.thread_axis("threadIdx.x")
    thread_y = te.thread_axis("threadIdx.y")
    thread_z = te.thread_axis("threadIdx.z")

    nc, hc, wc, oc, nnc, ooc = Conv.op.axis
    block_k = s[Conv].fuse(hc, wc)
    s[Conv].bind(block_k, block_z)
    nc, nci = s[Conv].split(nc, factor=warp_row_tiles)
    block_i, nc = s[Conv].split(nc, factor=block_row_warps)
    oc, oci = s[Conv].split(oc, factor=warp_col_tiles)
    block_j, oc = s[Conv].split(oc, factor=block_col_warps)
    s[Conv].reorder(block_k, block_i, block_j, nc, oc, nci, oci, nnc, ooc)
    s[Conv].bind(block_i, block_x)
    s[Conv].bind(block_j, block_y)
    s[Conv].bind(nc, thread_y)
    s[Conv].bind(oc, thread_z)

    s[ConvF].compute_at(s[Conv], oc)
    n, h, w, o, nnf, oof = ConvF.op.axis
    ko, ki = s[ConvF].split(ic, factor=chunk)
    s[ConvF].reorder(ko, kh, ki, kw, n, o, nnf, oof, ii)

    s[AF].compute_at(s[ConvF], kw)
    s[WF].compute_at(s[ConvF], kw)

    s[WS].compute_at(s[ConvF], kh)
    s[AS].compute_at(s[ConvF], kh)

    n, h, w, i, nn, ii = AS.op.axis
    tx, xo = s[AS].split(n, nparts=block_row_warps)
    ty, yo = s[AS].split(xo, nparts=block_col_warps)
    t = s[AS].fuse(nn, ii)
    to, ti = s[AS].split(t, factor=warp_size)
    s[AS].bind(tx, thread_y)
    s[AS].bind(ty, thread_z)
    s[AS].bind(ti, thread_x)

    kh, kw, ic, o, ii, oo = WS.op.axis
    tx, xo = s[WS].split(o, nparts=block_row_warps)
    ty, yo = s[WS].split(xo, nparts=block_col_warps)
    t = s[WS].fuse(ii, oo)
    to, ti = s[WS].split(t, nparts=warp_size)
    s[WS].bind(tx, thread_y)
    s[WS].bind(ty, thread_z)
    s[WS].bind(to, thread_x)
    s[WS].vectorize(ti)

    s[AF].tensorize(AF.op.axis[-2],
                    intrin_wmma_load_matrix((16, 16, 16), "wmma.matrix_a"))
    s[WF].tensorize(WF.op.axis[-2],
                    intrin_wmma_load_matrix((16, 16, 16), "wmma.matrix_b"))
    s[Conv].tensorize(nnc, intrin_wmma_store_matrix((16, 16, 16)))
    s[ConvF].tensorize(nnf, intrin_wmma_gemm((16, 16, 16)))

    func = tvm.build(s, [A, W, Conv], "cuda")

    dev = tvm.cuda(0)
    a_np = np.random.uniform(size=data_shape).astype(A.dtype)
    w_np = np.random.uniform(size=kernel_shape).astype(W.dtype)
    a = tvm.nd.array(a_np, dev)
    w = tvm.nd.array(w_np, dev)
    c = tvm.nd.array(np.zeros(output_shape, dtype=Conv.dtype), dev)
    evaluator = func.time_evaluator(func.entry_name, dev, number=3)
    print("conv2d with tensor core: %f ms" % (evaluator(a, w, c).mean * 1e3))

    if VERIFY:
        func(a, w, c)
        a_np = a_np.transpose(0, 4, 1, 2, 3,
                              5).reshape(batch_size, height, width,
                                         in_channels)
        w_np = w_np.transpose(0, 1, 2, 4, 3,
                              5).reshape(kernel_h, kernel_w, in_channels,
                                         out_channels)
        c_np = (c.numpy().transpose(
            (0, 4, 1, 2, 3, 5)).reshape(batch_size, height, width,
                                        out_channels))
        c_std = conv2d_nhwc_python(a_np.astype(Conv.dtype),
                                   w_np.astype(Conv.dtype),
                                   (stride_h, stride_w),
                                   (pad_h, pad_w)).astype(Conv.dtype)
        np.testing.assert_allclose(c_np, c_std, rtol=1e-4, atol=1e-4)
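
Unlike the earlier tensor core example, this updated-API version has no guard against missing CUDA support or missing tensor cores. A minimal guard sketch mirroring the checks at the top of Example No. 5, using the newer tvm.cuda(0) device handle (added here for illustration only):

import tvm
from tvm.contrib import nvcc

def tensorcore_available():
    # Mirrors the guard in Example No. 5: require a visible CUDA device, a
    # CUDA-enabled runtime, and tensor core support on that device.
    dev = tvm.cuda(0)
    if not dev.exist or not tvm.runtime.enabled("cuda"):
        return False
    return nvcc.have_tensorcore(dev.compute_version)
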