Example #1
def test_bfloat_add_and_cast_2():
    X = te.placeholder((3, ), name="X")
    Y = te.placeholder((3, ), name="Y")
    Z = topi.cast(topi.cast(X, dtype="custom[bfloat]16") +
                  topi.cast(Y, dtype="custom[bfloat]16"),
                  dtype="float")

    s = te.create_schedule([Z.op])
    built_cast = lower_datatypes_and_build(s, [X, Y, Z])

    ctx = tvm.context(tgt, 0)

    # Used float32 calculator at http://www.weitz.de/ieee/. Generated
    # unconstrained float32s for the operands and copied them into x and y.
    # Then, to simulate float32->bfloat16 conversion implemented by the mybfloat
    # library, I cut off all but 7 bits of the mantissa. I then added the
    # numbers. To simulate bfloat16 add implemented in mybfloat, I cut off all
    # but 7 bits of the result's mantissa. I then copied that value into
    # z_expected.
    x = tvm.nd.array(np.array([1.2348297, -1.0298302E25,
                               1.2034023E-30]).astype("float32"),
                     ctx=ctx)
    y = tvm.nd.array(np.array([-2.4992788, -9.888288E19,
                               9.342338E-29]).astype("float32"),
                     ctx=ctx)
    z_expected = np.array([-1.25, -1.027587E25,
                           9.426888E-29]).astype("float32")
    z = tvm.nd.empty(Z.shape, dtype=Z.dtype, ctx=ctx)

    built_cast(x, y, z)

    assert np.array_equal(z_expected, z.asnumpy())
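
The comment above derives the reference values by hand-truncating float32s to 7 mantissa bits. A small NumPy sketch of that procedure (the helper name is made up; it mirrors the comment's manual steps, not necessarily the mybfloat library itself):

import numpy as np

def truncate_to_bfloat(x):
    # Keep only the top 16 bits of each float32 (sign, exponent, 7 mantissa
    # bits), i.e. the truncation described in the comment above.
    bits = np.asarray(x, dtype=np.float32).view(np.uint32)
    return (bits & np.uint32(0xFFFF0000)).view(np.float32)

x = np.array([1.2348297, -1.0298302E25, 1.2034023E-30], dtype=np.float32)
y = np.array([-2.4992788, -9.888288E19, 9.342338E-29], dtype=np.float32)
# Truncate the operands, add in float32, truncate the sum -- this should
# reproduce z_expected from the test above.
print(truncate_to_bfloat(truncate_to_bfloat(x) + truncate_to_bfloat(y)))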
Example #2
def test_bfloat_add_and_cast_1():
    X = te.placeholder((3, ), name="X")
    Y = te.placeholder((3, ), name="Y")
    Z = topi.cast(topi.cast(X, dtype="custom[bfloat]16") +
                  topi.cast(Y, dtype="custom[bfloat]16"),
                  dtype="float")

    s = te.create_schedule([Z.op])
    built_cast = lower_datatypes_and_build(s, [X, Y, Z])

    ctx = tvm.context(tgt, 0)

    # Used float32 calculator at http://www.weitz.de/ieee/. Generated float32s
    # with at most 7-bit mantissas which, when added, produce a result with at
    # most a 7-bit mantissa. This is to ensure there are no errors due to
    # float32->bfloat16 conversions.
    x = tvm.nd.array(np.array([4.4103796E-32, 14942208.0,
                               1.78125]).astype("float32"),
                     ctx=ctx)
    y = tvm.nd.array(np.array([-3.330669E-14, 19660800.0,
                               2.25]).astype("float32"),
                     ctx=ctx)
    z_expected = np.array([-3.330669E-14, 34603008.0,
                           4.03125]).astype("float32")
    z = tvm.nd.empty(Z.shape, dtype=Z.dtype, ctx=ctx)

    built_cast(x, y, z)

    assert np.array_equal(z_expected, z.asnumpy())
Example #3
def get_promoted(op):
    a = te.placeholder((100, ), dtype='bfloat16')
    b = te.placeholder((100, ), dtype='bfloat16')
    c = te.compute((100, ), lambda i: topi.cast(
        op(topi.cast(a[i], 'float'), topi.cast(b[i], 'float')), 'bfloat16'))
    s = te.create_schedule(c.op)
    func = tvm.driver.build_module.form_irmodule(s, [a, b, c], "main",
                                                 None)["main"]
    return func.body
Example #4
def get_promoted(op):
    a = te.placeholder((100, ), dtype="bfloat16")
    b = te.placeholder((100, ), dtype="bfloat16")
    c = te.compute(
        (100, ),
        lambda i: topi.cast(
            op(topi.cast(a[i], "float"), topi.cast(b[i], "float")),
            "bfloat16"),
    )
    s = te.create_schedule(c.op)
    func = tvm.driver.build_module.schedule_to_module(
        s, [a, b, c], "main", None)["main"]
    return func.body
Example #5
    def check(t0, t1, factor):
        if (t0 == "float16" or t1 == "float16") and not have_fp16(
                tvm.gpu(0).compute_version):
            print("Skip because gpu does not have fp16 support")
            return

        # compute
        n = 128
        A = te.placeholder((n, ), dtype=t0, name="A")
        B = te.placeholder((n, ), dtype=t1, name="B")
        C = te.compute((n, ),
                       lambda i: A[i] + topi.cast(B[i], A.dtype),
                       name="C")

        # schedule
        s = tvm.te.create_schedule(C.op)
        ob, ib = s[C].split(s[C].op.axis[0], factor=factor)
        s[C].vectorize(ib)
        s[C].bind(ob, tx)  # tx: CUDA thread axis defined in the enclosing test
        func = tvm.build(s, [A, B, C], "cuda")

        # correctness
        dev = tvm.gpu(0)
        low, high = (0, 20) if t0.startswith("u") or t1.startswith("u") else (-10, 10)
        a_np = np.random.randint(low, high, size=n).astype(A.dtype)
        b_np = np.random.randint(low, high, size=n).astype(B.dtype)
        c_np = (a_np + b_np).astype(A.dtype)
        a_nd = tvm.nd.array(a_np, dev)
        b_nd = tvm.nd.array(b_np, dev)
        c_nd = tvm.nd.array(np.zeros(c_np.shape, dtype=c_np.dtype), dev)
        func(a_nd, b_nd, c_nd)
        tvm.testing.assert_allclose(c_nd.asnumpy(), c_np, rtol=1e-3)
Example #6
def conv2d_transpose(N, CI, H, W, CO, KH, KW, strides, padding, opadding):
    data_shape = (N // env.BATCH, CI // env.BLOCK_IN, H, W, env.BATCH,
                  env.BLOCK_IN)
    kernel_shape = (CO // env.BLOCK_OUT, CI // env.BLOCK_IN, KH, KW,
                    env.BLOCK_OUT, env.BLOCK_IN)

    data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype)
    kernel = te.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype)

    with tvm.target.vta():
        res = topi.nn.conv2d_transpose_nchw(
            Input=data,
            Filter=kernel,
            strides=strides,
            padding=padding,
            out_dtype=env.acc_dtype,
            output_padding=opadding,
        )
        res = topi.right_shift(res, env.WGT_WIDTH)
        res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1)
        res = topi.cast(res, env.out_dtype)

    if tvm.target.Target.current().device_name == "vta":
        s = topi.generic.schedule_conv2d_transpose_nchw([res])
    else:
        s = te.create_schedule([res.op])

    return s, [data, kernel, res]
Example #7
def group_conv2d(N, CI, H, W, CO, KH, KW, strides, padding, dilation, groups):

    CI_G = CI // groups
    data_shape = (N // env.BATCH, CI // env.BLOCK_IN, H, W, env.BATCH,
                  env.BLOCK_IN)
    kernel_shape = (CO // env.BLOCK_OUT, CI_G // env.BLOCK_IN, KH, KW,
                    env.BLOCK_OUT, env.BLOCK_IN)
    bias_shape = (N // env.BATCH, CO // env.BLOCK_OUT, 1, 1, env.BATCH,
                  env.BLOCK_OUT)

    data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype)
    kernel = te.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype)
    bias = te.placeholder(bias_shape, name="bias", dtype=env.acc_dtype)

    with tvm.target.vta():
        res = topi.nn.group_conv2d_nchw(data, kernel, strides, padding,
                                        dilation, groups, env.acc_dtype)
        res = topi.right_shift(res, env.WGT_WIDTH)
        res = topi.add(res, bias)
        res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1)
        res = topi.cast(res, env.out_dtype)

    if tvm.target.Target.current().device_name == "vta":
        s = topi.generic.schedule_group_conv2d_nchw([res])
    else:
        s = te.create_schedule([res.op])

    return s, [data, kernel, bias, res]
Example #8
    def verify(from_dtype, to_dtype, low=-100, high=100):
        shape = (5, 4)
        A = te.placeholder(shape, dtype=from_dtype, name="A")
        B = topi.cast(A, to_dtype)

        if from_dtype == "bool":
            a_np = np.random.choice([True, False], size=shape)
        else:
            a_np = np.random.uniform(low, high, size=shape).astype(from_dtype)
        if to_dtype == "bool":
            # Force one element to exactly zero so the bool output contains a False.
            a_np = a_np - a_np[2, 3]
        b_np = a_np.astype(to_dtype)

        for device in get_all_backend():
            ctx = tvm.context(device, 0)
            if not ctx.exist:
                print("Skip because %s is not enabled" % device)
                continue
            print("Running on target: %s" % device)
            with tvm.target.create(device):
                s = tvm.topi.testing.get_injective_schedule(device)(B)
            foo = tvm.build(s, [A, B], device)
            a = tvm.nd.array(a_np, ctx)
            b = tvm.nd.empty(shape=shape, dtype=to_dtype, ctx=ctx)
            foo(a, b)
            tvm.testing.assert_allclose(b.asnumpy(), b_np)
Example #9
def conv2d(N, CI, H, W, CO, KH, KW, strides, padding, dilation):
    data_shape = (N // env.BATCH, CI // env.BLOCK_IN, H, W, env.BATCH,
                  env.BLOCK_IN)
    kernel_shape = (CO // env.BLOCK_OUT, CI // env.BLOCK_IN, KH, KW,
                    env.BLOCK_OUT, env.BLOCK_IN)
    bias_shape = (N // env.BATCH, CO // env.BLOCK_OUT, 1, 1, env.BATCH,
                  env.BLOCK_OUT)

    data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype)
    kernel = te.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype)
    bias = te.placeholder(bias_shape, name="bias", dtype=env.acc_dtype)

    with tvm.target.vta():
        res = topi.nn.conv2d(input=data,
                             filter=kernel,
                             padding=padding,
                             strides=strides,
                             dilation=dilation,
                             layout='NCHW%dn%dc' % (env.BATCH, env.BLOCK_IN),
                             out_dtype=env.acc_dtype)
        res = topi.right_shift(res, env.WGT_WIDTH)
        res = topi.add(res, bias)
        res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1)
        res = topi.cast(res, env.out_dtype)

    if tvm.target.Target.current().device_name == 'vta':
        s = topi.generic.schedule_conv2d_nchw([res])
    else:
        s = te.create_schedule([res.op])

    return s, [data, kernel, bias, res]
Example #10
def upsampling(data, scale_h, scale_w, layout="NCHW", method='nearest_neighbor',
               align_corners=False, output_shape=None):
    """Perform upsampling on the data.
       Nearest neighbor and bilinear upsampling are supported.

    Parameters
    ----------
    data : tvm.te.Tensor
        4-D tensor with shape
        [batch, channel, in_height, in_width]
        or [batch, in_height, in_width, channel]

    scale_h : float
        Scaling factor for height

    scale_w : float
        Scaling factor for width

    layout : string, optional
        either "NCHW" or "NHWC"

    method : {"bilinear", "nearest_neighbor", "bicubic"}
        Method to be used for upsampling.

    Returns
    -------
    output : tvm.te.Tensor
        4-D with shape [batch, channel, in_height*scale_h, in_width*scale_w]
        or [batch, in_height*scale_h, in_width*scale_w, channel]
    """
    base_layout = layout[0:4]
    if base_layout == "NCHW":
        if not output_shape: #static case
            scaled_h = data.shape[2] * scale_h
            scaled_w = data.shape[3] * scale_w
            reshape_size = (simplify(topi.cast(te.round(scaled_h), data.shape[2].dtype)),
                            simplify(topi.cast(te.round(scaled_w), data.shape[3].dtype)))
        else: #dynamic case -- we don't need to scale; already done in shape func
            reshape_size = (simplify(topi.cast(te.round(output_shape[2]), output_shape[2].dtype)),
                            simplify(topi.cast(te.round(output_shape[3]), output_shape[3].dtype)))
    elif layout == "NHWC":
        if not output_shape: #static case
            scaled_h = data.shape[1] * scale_h
            scaled_w = data.shape[2] * scale_w
            reshape_size = (simplify(topi.cast(te.round(scaled_h), data.shape[1].dtype)),
                            simplify(topi.cast(te.round(scaled_w), data.shape[2].dtype)))
        else: #dynamic case
            reshape_size = (simplify(topi.cast(te.round(output_shape[1]), output_shape[1].dtype)),
                            simplify(topi.cast(te.round(output_shape[2]), output_shape[2].dtype)))

    else:
        raise ValueError("Layout {} is not supported yet".format(layout))
    coord_trans = "align_corners" if align_corners else "asymmetric"
    return topi.image.resize(data, reshape_size, layout=layout,
                             method=method, coordinate_transformation_mode=coord_trans,
                             output_shape=output_shape)
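
A minimal usage sketch for the routine above, assuming it is the tvm.topi.nn.upsampling entry point; the placeholder shape and scale factors are made up for illustration:

import tvm
from tvm import te, topi

data = te.placeholder((1, 3, 32, 32), name="data", dtype="float32")
out = topi.nn.upsampling(data, scale_h=2.0, scale_w=2.0,
                         layout="NCHW", method="nearest_neighbor")
# For this static NCHW case the output shape should work out to (1, 3, 64, 64).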
Example #11
def test_bfloat_add_and_cast_FloatImm():
    X = te.placeholder((3, ), name="X")
    Z = topi.cast(topi.add(topi.cast(X, dtype="custom[bfloat]16"),
                           tvm.tir.FloatImm("custom[bfloat]16", 1.5)),
                  dtype="float")

    s = te.create_schedule([Z.op])
    built_cast = lower_datatypes_and_build(s, [X, Z])

    ctx = tvm.context(tgt, 0)

    x = tvm.nd.array(np.array([0.0, 1.0, 1.5]).astype("float32"), ctx=ctx)
    z_expected = np.array([1.5, 2.5, 3.0]).astype("float32")
    z = tvm.nd.empty(Z.shape, dtype=Z.dtype, ctx=ctx)

    built_cast(x, z)

    assert np.array_equal(z_expected, z.asnumpy())
Example #12
def to16(v):
    # Reinterpret the float32 input as its raw uint32 bit pattern.
    uint32_v = tvm.tir.call_intrin("uint32", "tir.reinterpret", v)
    # Round to nearest even: the bias is 0x7FFF plus the bit that will become
    # the bfloat16 LSB; adding it and dropping the low 16 bits keeps the top
    # 16 bits (sign, exponent, 7 mantissa bits).
    rounding_bias = tvm.tir.call_intrin("uint32", "tir.shift_right",
                                        uint32_v,
                                        tvm.tir.const(16, "uint32"))
    rounding_bias = tvm.tir.call_intrin("uint32", "tir.bitwise_and",
                                        rounding_bias,
                                        tvm.tir.const(1, "uint32"))
    rounding_bias = rounding_bias + tvm.tir.const(0x7FFF, "uint16")
    uint32_v = uint32_v + rounding_bias
    uint32_v = tvm.tir.call_intrin("uint32", "tir.shift_right", uint32_v,
                                   tvm.tir.const(16, "uint32"))
    return topi.cast(uint32_v, "uint16")
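
The bit trick above is round-to-nearest-even conversion from float32 to the bfloat16 bit pattern. A NumPy sketch of the same rule (the helper name is made up; NaN/Inf handling is ignored):

import numpy as np

def f32_to_bf16_bits(x):
    # Bias = 0x7FFF plus the bit that will become the bfloat16 LSB, so exact
    # ties round to even; then drop the low 16 bits of the float32 pattern.
    u = np.asarray(x, dtype=np.float32).view(np.uint32)
    bias = ((u >> np.uint32(16)) & np.uint32(1)) + np.uint32(0x7FFF)
    return ((u + bias) >> np.uint32(16)).astype(np.uint16)

print(f32_to_bf16_bits([1.0, 1.5, 3.141592653589793]))
# 0x3F80, 0x3FC0, 0x4049 -- i.e. 1.0, 1.5 and 3.140625 in bfloat16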
Example #13
    def _topi_multiply(*args, **kwargs):
        assert not kwargs, "Do not support kwargs in template function call"
        A, B = args[:2]

        with tvm.target.vta():
            res = vta.top.op.multiply_packed(*args, **kwargs)
            res = my_clip(res, 0, 127)
            res = topi.cast(res, "int8")

        if tvm.target.Target.current().device_name == "vta":
            s = vta.top.op.schedule_multiply_packed([res])
        else:
            s = te.create_schedule([res.op])
        return s, [A, B, res]
Example #14
def test_cast(target, dev, cast_ref_data, from_dtype, to_dtype):
    m = te.var("m")
    l = te.var("l")
    A = te.placeholder((m, l), dtype=from_dtype, name="A")
    B = topi.cast(A, to_dtype)

    a_np, b_np = cast_ref_data

    with tvm.target.Target(target):
        s = tvm.topi.testing.get_injective_schedule(target)(B)
    foo = tvm.build(s, [A, B], target)
    a = tvm.nd.array(a_np, dev)
    b = tvm.nd.empty(b_np.shape, dtype=to_dtype, device=dev)
    foo(a, b)
    tvm.testing.assert_allclose(b.numpy(), b_np)
Example #15
    def _topi_nn_conv2d(*args, **kwargs):
        assert not kwargs, "Do not support kwargs in template function call"
        A, W = args[:2]

        with tvm.target.vta():
            res = vta.top.conv2d_packed(*args, **kwargs)
            res = topi.right_shift(res, 8)
            res = my_clip(res, 0, 127)
            res = topi.cast(res, "int8")

        if tvm.target.Target.current().device_name == 'vta':
            s = vta.top.schedule_conv2d_packed([res])
        else:
            s = te.create_schedule([res.op])
        return s, [A, W, res]
Example #16
    def _topi_nn_conv2d(*args, **kwargs):
        assert not kwargs, "Do not support kwargs in template function call"
        args = deserialize_args(args)
        A, W = args[:2]

        with tvm.target.vta():
            res = topi.nn.conv2d(*args, **kwargs)
            res = topi.right_shift(res, 8)
            res = my_clip(res, 0, 127)
            res = topi.cast(res, "int8")

        if tvm.target.Target.current().device_name == "vta":
            s = topi.generic.schedule_conv2d_nchw([res])
        else:
            s = te.create_schedule([res.op])
        return s, [A, W, res]
Example #17
def upsampling3d(data, scale_d, scale_h, scale_w, layout="NCDHW", method='nearest_neighbor',
                 coordinate_transformation_mode="half_pixel"):
    """Perform upsampling on the data.
       Nearest neighbor and bilinear upsampling are supported.

    Parameters
    ----------
    data : tvm.te.Tensor
        5-D tensor with shape
        [batch, channel, in_depth, in_height, in_width]
        or [batch, in_depth, in_height, in_width, channel]

    scale_d : float
        Scaling factor for depth

    scale_h : float
        Scaling factor for height

    scale_w : float
        Scaling factor for width

    layout : string, optional
        either "NCDHW" or "NDHWC"

    method : {"trilinear", "nearest_neighbor"}
        Method to be used for upsampling.

    coordinate_transformation_mode: string, optional
        Describes how to transform the coordinate in the resized tensor
        to the coordinate in the original tensor.
        Refer to the ONNX Resize operator specification for details.
        Available options are "half_pixel", "align_corners" and "asymmetric".

    Returns
    -------
    output : tvm.te.Tensor
        5-D with shape [batch, channel, in_depth*scale_d, in_height*scale_h, in_width*scale_w]
        or [batch, in_depth*scale_d, in_height*scale_h, in_width*scale_w, channel]
    """
    base_layout = layout[0:5]
    if base_layout == "NCDHW":
        out_shape = (simplify(topi.cast(te.round(data.shape[2] * scale_d), data.shape[2].dtype)),
                     simplify(topi.cast(te.round(data.shape[3] * scale_h), data.shape[3].dtype)),
                     simplify(topi.cast(te.round(data.shape[4] * scale_w), data.shape[4].dtype)))
    elif layout == "NDHWC":
        out_shape = (simplify(topi.cast(te.round(data.shape[1] * scale_d), data.shape[1].dtype)),
                     simplify(topi.cast(te.round(data.shape[2] * scale_h), data.shape[2].dtype)),
                     simplify(topi.cast(te.round(data.shape[3] * scale_w), data.shape[3].dtype)))

    else:
        raise ValueError("Layout {} is not supported yet".format(layout))
    return topi.image.resize3d(data, out_shape, layout=layout, method=method,
                               coordinate_transformation_mode=coordinate_transformation_mode)
Example #18
def dense(N, CI, CO):
    data_shape = (N // env.BATCH, CI // env.BLOCK_IN, env.BATCH, env.BLOCK_IN)
    kernel_shape = (CO // env.BLOCK_OUT, CI // env.BLOCK_IN, env.BLOCK_OUT, env.BLOCK_IN)

    data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype)
    kernel = te.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype)

    with tvm.target.vta():
        res = topi.nn.dense(data, kernel, None, "int32")
        res = topi.right_shift(res, 8)
        res = my_clip(res, 0, 127)
        res = topi.cast(res, "int8")

    if tvm.target.Target.current().device_name == "vta":
        s = topi.generic.schedule_dense([res])
    else:
        s = te.create_schedule([res.op])

    return s, [data, kernel, res]
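
The right_shift / clip / cast chain above (and in the other VTA examples) is essentially fixed-point requantization of the int32 accumulator into int8. A NumPy sketch of the same arithmetic, with made-up accumulator values:

import numpy as np

acc = np.array([-70000, -300, 0, 5000, 70000], dtype=np.int32)  # int32 accumulator
out = np.clip(acc >> 8, 0, 127).astype("int8")  # shift, saturate to [0, 127], narrow to int8
print(out)  # [  0   0   0  19 127]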
Example #19
    def verify(from_dtype, to_dtype, low=-100, high=100):
        shape = (5, 4)
        A = te.placeholder(shape, dtype=from_dtype, name="A")
        B = topi.cast(A, to_dtype)

        if from_dtype == "bool":
            a_np = np.random.choice([True, False], size=shape)
        else:
            a_np = np.random.uniform(low, high, size=shape).astype(from_dtype)
        if to_dtype == "bool":
            # Force one element to exactly zero so the bool output contains a False.
            a_np = a_np - a_np[2, 3]
        b_np = a_np.astype(to_dtype)

        for target, dev in tvm.testing.enabled_targets():
            print("Running on target: %s" % target)
            with tvm.target.Target(target):
                s = tvm.topi.testing.get_injective_schedule(target)(B)
            foo = tvm.build(s, [A, B], target)
            a = tvm.nd.array(a_np, dev)
            b = tvm.nd.empty(shape=shape, dtype=to_dtype, device=dev)
            foo(a, b)
            tvm.testing.assert_allclose(b.asnumpy(), b_np)
Example #20
def to16(v):
    return topi.cast(v, "bfloat16")
Example #21
def to32(v):
    return topi.cast(v, "float")
Example #22
def run_conv2d_transpose(
    env, remote, wl, target, check_correctness=True, print_ir=False, samples=4
):

    # Workload assertions
    assert wl.hpad == wl.wpad

    # Perform packing only if we are targeting the accelerator
    if "arm_cpu" in target.keys:
        data_pack = False
        layout = "NCHW"
        fcompute = topi.arm_cpu.conv2d_transpose_nchw
        fschedule = topi.arm_cpu.schedule_conv2d_transpose_nchw
    elif "vta" in target.keys:
        data_pack = True
        layout = "NCHW%dn%dc" % (env.BATCH, env.BLOCK_IN)
        fcompute = vta.top.conv2d_transpose_packed
        fschedule = vta.top.schedule_conv2d_transpose_packed

    # Derive shapes depending upon packing

    a_shape = (wl.batch, wl.in_filter, wl.height, wl.width)
    w_shape = (wl.in_filter, wl.out_filter, wl.hkernel, wl.wkernel)
    if data_pack:
        data_shape = (
            wl.batch // env.BATCH,
            wl.in_filter // env.BLOCK_IN,
            wl.height,
            wl.width,
            env.BATCH,
            env.BLOCK_IN,
        )
        kernel_shape = (
            wl.out_filter // env.BLOCK_OUT,
            wl.in_filter // env.BLOCK_IN,
            wl.hkernel,
            wl.wkernel,
            env.BLOCK_OUT,
            env.BLOCK_IN,
        )
    else:
        data_shape = a_shape
        kernel_shape = w_shape
    data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype)
    kernel = te.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype)
    padding = relay.nn.get_pad_tuple2d((wl.hpad, wl.wpad))

    # Define base computation schedule
    with target:

        res = fcompute(
            data, kernel, (wl.hstride, wl.wstride), padding, env.acc_dtype, (wl.o_hpad, wl.o_wpad)
        )
        res = topi.right_shift(res, env.WGT_WIDTH)
        res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1)
        res = topi.cast(res, env.out_dtype)
        # Derive base schedule
        s = fschedule([res])
        if print_ir:
            print(vta.lower(s, [data, kernel, res], simple_mode=True))

    # Derive number of ops
    fout_height = (wl.height - 1) * wl.hstride - 2 * wl.hpad + wl.hkernel + wl.o_hpad
    fout_width = (wl.width - 1) * wl.wstride - 2 * wl.wpad + wl.wkernel + wl.o_wpad
    num_ops = (
        2
        * wl.batch
        * fout_height
        * fout_width
        * wl.hkernel
        * wl.wkernel
        * wl.out_filter
        * wl.in_filter
    )

    # @memoize("vta.tests.test_benchmark_topi.conv2d.verify_nhwc")
    def get_ref_data():
        # derive min max for act and wgt types (max non inclusive)
        a_min, a_max = 0 - (1 << (env.INP_WIDTH - 1)), (1 << (env.INP_WIDTH - 1))
        w_min, w_max = 0 - (1 << (env.WGT_WIDTH - 1)), (1 << (env.WGT_WIDTH - 1))
        a_np = np.random.randint(a_min, a_max, size=a_shape).astype(data.dtype)
        w_np = np.random.randint(
            w_min, w_max, size=(wl.in_filter, wl.out_filter, wl.hkernel, wl.wkernel)
        ).astype(kernel.dtype)
        r_np = tvm.topi.testing.conv2d_transpose_nchw_python(
            a_np.astype(env.acc_dtype),
            w_np.astype(env.acc_dtype),
            (wl.hstride, wl.wstride),
            wl.hpad,
            (wl.o_hpad, wl.o_wpad),
        ).astype(env.acc_dtype)
        return a_np, w_np, r_np

    # Data in original format
    data_np, kernel_np, res_ref = get_ref_data()
    if data_pack:
        data_np = data_np.reshape(
            wl.batch // env.BATCH,
            env.BATCH,
            wl.in_filter // env.BLOCK_IN,
            env.BLOCK_IN,
            wl.height,
            wl.width,
        ).transpose((0, 2, 4, 5, 1, 3))
        kernel_np = kernel_np.reshape(
            wl.in_filter // env.BLOCK_IN,
            env.BLOCK_IN,
            wl.out_filter // env.BLOCK_OUT,
            env.BLOCK_OUT,
            wl.hkernel,
            wl.wkernel,
        ).transpose((2, 0, 4, 5, 3, 1))
        kernel_np = np.flip(kernel_np, 2)
        kernel_np = np.flip(kernel_np, 3)

    # Build
    if "vta" in target.keys:
        with vta.build_config(disabled_pass={"tir.CommonSubexprElimTIR"}):
            mod = vta.build(
                s,
                [data, kernel, res],
                target=target,
                target_host=env.target_host,
                name="conv2d_transpose",
            )
    else:
        mod = tvm.build(
            s,
            [data, kernel, res],
            target=target,
            target_host=env.target_host,
            name="conv2d_transpose",
        )
    temp = utils.tempdir()
    mod.save(temp.relpath("conv2d_transpose.o"))
    remote.upload(temp.relpath("conv2d_transpose.o"))
    f = remote.load_module("conv2d_transpose.o")
    dev = remote.device(str(target))

    res_np = np.zeros(topi.utils.get_const_tuple(res.shape)).astype(res.dtype)
    data_arr = tvm.nd.array(data_np, dev)
    kernel_arr = tvm.nd.array(kernel_np, dev)
    res_arr = tvm.nd.array(res_np, dev)
    time_f = f.time_evaluator("conv2d_transpose", dev, number=samples)

    # In vta sim mode, collect simulator runtime statistics
    stats = {}
    cost = None
    if env.TARGET in ["sim", "tsim"]:
        # Check if we're in local RPC mode (allows us to rebuild the
        # runtime on the fly when varying the VTA designs)
        local_rpc = int(os.environ.get("VTA_LOCAL_SIM_RPC", "0"))
        if local_rpc:
            if env.TARGET == "sim":
                remote.get_function("vta.simulator.profiler_clear")()
            else:
                remote.get_function("vta.tsim.profiler_clear")()
            cost = time_f(data_arr, kernel_arr, res_arr)
            if env.TARGET == "sim":
                stats = json.loads(remote.get_function("vta.simulator.profiler_status")())
            else:
                stats = json.loads(remote.get_function("vta.tsim.profiler_status")())
        else:
            simulator.clear_stats()
            cost = time_f(data_arr, kernel_arr, res_arr)
            stats = simulator.stats()
    else:
        cost = time_f(data_arr, kernel_arr, res_arr)

    # Check correctness
    correct = False
    if check_correctness:
        res_orig = res_arr.numpy()
        if data_pack:
            res_orig = res_orig.transpose((0, 4, 1, 5, 2, 3)).reshape(
                wl.batch, wl.out_filter, fout_height, fout_width
            )
        res_ref = res_ref >> env.WGT_WIDTH
        res_ref = np.clip(res_ref, 0, (1 << env.OUT_WIDTH - 1) - 1)
        res_ref = res_ref.astype(env.out_dtype)
        correct = np.allclose(res_orig, res_ref)

    gops = (num_ops / cost.mean) / float(10**9)
    status = "PASSED" if correct else "FAILED"
    if "arm_cpu" in target.keys:
        device = "CPU"
    elif "vta" in target.keys:
        device = "VTA"
    print("%s CONV2D TEST %s: Time cost = %g sec/op, %g GOPS" % (device, status, cost.mean, gops))

    return correct, cost, stats
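
As a quick sanity check of the fout_height / fout_width expressions above, the usual transposed-convolution output-size formula with illustrative numbers:

H, stride, pad, K, opad = 4, 2, 1, 4, 0
fout = (H - 1) * stride - 2 * pad + K + opad
print(fout)  # 8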
Example #23
def to32(v):
    # Zero-extend the bfloat16 bit pattern to uint32, move it into the high
    # 16 bits, then reinterpret the result as float32.
    uint32_v = topi.cast(v, "uint32")
    uint32_v = tvm.tir.call_intrin("uint32", "tir.shift_left", uint32_v,
                                   tvm.tir.const(16, "uint32"))
    return tvm.tir.call_intrin("float32", "tir.reinterpret", uint32_v)
Example #24
def to32(v):
    return topi.cast(v, 'float')
Example #25
def upsampling3d(data,
                 scale_d,
                 scale_h,
                 scale_w,
                 layout="NCDHW",
                 method='nearest_neighbor',
                 coordinate_transformation_mode="half_pixel",
                 output_shape=None):
    """Perform upsampling on the data.
       Nearest neighbor and bilinear upsampling are supported.

    Parameters
    ----------
    data : tvm.te.Tensor
        5-D tensor with shape
        [batch, channel, in_depth, in_height, in_width]
        or [batch, in_depth, in_height, in_width, channel]

    scale_d : float
        Scaling factor for depth

    scale_h : float
        Scaling factor for height

    scale_w : float
        Scaling factor for width

    layout : string, optional
        either "NCDHW" or "NDHWC"

    method : {"trilinear", "nearest_neighbor"}
        Method to be used for upsampling.

    coordinate_transformation_mode: string, optional
        Describes how to transform the coordinate in the resized tensor
        to the coordinate in the original tensor.
        Refer to the ONNX Resize operator specification for details.
        Available options are "half_pixel", "align_corners" and "asymmetric".

    output_shape: tvm.tir.container.Array, optional
        Shape to return. If left None will be inferred
        (If shape is determined dynamically, pass out_dtype.shape as output_shape)

    Returns
    -------
    output : tvm.te.Tensor
        5-D with shape [batch, channel, in_depth*scale_d, in_height*scale_h, in_width*scale_w]
        or [batch, in_depth*scale_d, in_height*scale_h, in_width*scale_w, channel]
    """
    base_layout = layout[0:5]
    if base_layout == "NCDHW":
        if not output_shape:  # static case
            scaled_d = data.shape[2] * scale_d
            scaled_h = data.shape[3] * scale_h
            scaled_w = data.shape[4] * scale_w
            resize_shape = (simplify(
                topi.cast(te.round(scaled_d), data.shape[2].dtype)),
                            simplify(
                                topi.cast(te.round(scaled_h),
                                          data.shape[3].dtype)),
                            simplify(
                                topi.cast(te.round(scaled_w),
                                          data.shape[4].dtype)))
        else:  # dynamic case -- don't need to scale; already done in shape func
            resize_shape = (simplify(
                topi.cast(te.round(output_shape[2]), data.shape[2].dtype)),
                            simplify(
                                topi.cast(te.round(output_shape[3]),
                                          data.shape[3].dtype)),
                            simplify(
                                topi.cast(te.round(output_shape[4]),
                                          data.shape[4].dtype)))
    elif layout == "NDHWC":
        if not output_shape:  # static case
            scaled_d = data.shape[1] * scale_d
            scaled_h = data.shape[2] * scale_h
            scaled_w = data.shape[3] * scale_w
            resize_shape = (simplify(
                topi.cast(te.round(scaled_d), data.shape[1].dtype)),
                            simplify(
                                topi.cast(te.round(scaled_h),
                                          data.shape[2].dtype)),
                            simplify(
                                topi.cast(te.round(scaled_w),
                                          data.shape[3].dtype)))
        else:  # dynamic case
            resize_shape = (simplify(
                topi.cast(te.round(output_shape[1]), data.shape[1].dtype)),
                            simplify(
                                topi.cast(te.round(output_shape[2]),
                                          data.shape[2].dtype)),
                            simplify(
                                topi.cast(te.round(output_shape[3]),
                                          data.shape[3].dtype)))
    else:
        raise ValueError("Layout {} is not supported yet".format(layout))
    return topi.image.resize3d(
        data,
        resize_shape,
        layout=layout,
        method=method,
        coordinate_transformation_mode=coordinate_transformation_mode)
Example #26
def to16(v):
    return topi.cast(v, 'bfloat16')
Example #27
def run_group_conv2d(env,
                     remote,
                     wl,
                     target,
                     check_correctness=True,
                     print_ir=False,
                     samples=4):

    # Workload assertions
    assert wl.hpad == wl.wpad

    # Perform packing only if we are targeting the accelerator
    if "arm_cpu" in target.keys:
        data_pack = False
        layout = "NCHW"
        fcompute = topi.nn.group_conv2d_nchw
        fschedule = topi.generic.schedule_group_conv2d_nchw
    elif "vta" in target.keys:
        data_pack = True
        layout = "NCHW%dn%dc" % (env.BATCH, env.BLOCK_IN)
        fcompute = vta.top.group_conv2d_packed
        fschedule = vta.top.schedule_group_conv2d_packed

    # Derive shapes depending upon packing
    CI_G = wl.in_filter // wl.groups
    a_shape = (wl.batch, wl.in_filter, wl.height, wl.width)
    w_shape = (wl.out_filter, CI_G, wl.hkernel, wl.wkernel)
    b_shape = (wl.batch, wl.out_filter, 1, 1)
    if data_pack:
        data_shape = (wl.batch // env.BATCH, wl.in_filter // env.BLOCK_IN,
                      wl.height, wl.width, env.BATCH, env.BLOCK_IN)
        kernel_shape = (wl.out_filter // env.BLOCK_OUT, CI_G // env.BLOCK_IN,
                        wl.hkernel, wl.wkernel, env.BLOCK_OUT, env.BLOCK_IN)
        bias_shape = (wl.batch // env.BATCH, wl.out_filter // env.BLOCK_OUT, 1,
                      1, env.BATCH, env.BLOCK_OUT)
    else:
        data_shape = a_shape
        kernel_shape = w_shape
        bias_shape = b_shape
    data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype)
    kernel = te.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype)
    bias = te.placeholder(bias_shape, name="bias", dtype=env.acc_dtype)
    padding = relay.nn.get_pad_tuple2d((wl.hpad, wl.wpad))

    # Define base computation schedule
    with target:
        res = fcompute(data, kernel, (wl.hstride, wl.wstride), padding, (1, 1),
                       wl.groups, env.acc_dtype)
        res = topi.right_shift(res, 8)
        res = topi.add(res, bias)
        res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1)
        res = topi.cast(res, env.out_dtype)
        # Derive base schedule
        s = fschedule([res])
        if print_ir:
            print(vta.lower(s, [data, kernel, bias, res], simple_mode=True))

    # Derive number of ops
    fout_height = (wl.height + 2 * wl.hpad - wl.hkernel) // wl.hstride + 1
    fout_width = (wl.width + 2 * wl.wpad - wl.wkernel) // wl.wstride + 1
    num_ops = 2 * wl.batch * fout_height * fout_width * wl.hkernel * wl.wkernel * \
        wl.out_filter * wl.in_filter // wl.groups

    def get_ref_data():
        # derive min max for act, wgt, and bias types (max non inclusive)
        a_min, a_max = 0 - (1 << (env.INP_WIDTH - 1)), (1 << (env.INP_WIDTH - 1))
        w_min, w_max = 0 - (1 << (env.WGT_WIDTH - 1)), (1 << (env.WGT_WIDTH - 1))
        b_min, b_max = (0 - (1 << (env.INP_WIDTH + env.WGT_WIDTH - 2)),
                        1 << (env.INP_WIDTH + env.WGT_WIDTH - 2))
        a_np = np.random.randint(a_min, a_max, size=a_shape).astype(data.dtype)
        w_np = np.random.randint(w_min, w_max,
                                 size=w_shape).astype(kernel.dtype)
        b_np = np.random.randint(b_min, b_max,
                                 size=b_shape).astype(env.acc_dtype)
        r_np = tvm.topi.testing.conv2d_nchw_python(
            a_np.astype(env.acc_dtype), w_np.astype(env.acc_dtype),
            (wl.hstride, wl.wstride), wl.hpad, wl.groups).astype(env.acc_dtype)
        return a_np, w_np, b_np, r_np

    # Data in original format
    data_np, kernel_np, bias_np, res_ref = get_ref_data()
    if data_pack:
        data_np = data_np.reshape(wl.batch // env.BATCH, env.BATCH,
                                  wl.in_filter // env.BLOCK_IN, env.BLOCK_IN,
                                  wl.height, wl.width).transpose(
                                      (0, 2, 4, 5, 1, 3))
        kernel_np = kernel_np.reshape(wl.out_filter // env.BLOCK_OUT,
                                      env.BLOCK_OUT, CI_G // env.BLOCK_IN,
                                      env.BLOCK_IN, wl.hkernel,
                                      wl.wkernel).transpose((0, 2, 4, 5, 1, 3))
        bias_np = bias_np.reshape(wl.batch // env.BATCH,
                                  wl.out_filter // env.BLOCK_OUT, 1, 1,
                                  env.BATCH, env.BLOCK_OUT)

    # Build
    if "vta" in target.keys:
        mod = vta.build(s, [data, kernel, bias, res],
                        target=target,
                        target_host=env.target_host,
                        name="conv2d")
    else:
        mod = tvm.build(s, [data, kernel, bias, res],
                        target=target,
                        target_host=env.target_host,
                        name="conv2d")
    temp = util.tempdir()
    mod.save(temp.relpath("conv2d.o"))
    remote.upload(temp.relpath("conv2d.o"))
    f = remote.load_module("conv2d.o")
    ctx = remote.context(str(target))

    res_np = np.zeros(topi.util.get_const_tuple(res.shape)).astype(res.dtype)
    data_arr = tvm.nd.array(data_np, ctx)
    kernel_arr = tvm.nd.array(kernel_np, ctx)
    bias_arr = tvm.nd.array(bias_np, ctx)
    res_arr = tvm.nd.array(res_np, ctx)
    time_f = f.time_evaluator("conv2d", ctx, number=samples)

    # In vta sim mode, collect simulator runtime statistics
    stats = {}
    cost = None
    if env.TARGET in ["sim", "tsim"]:
        # Check if we're in local RPC mode (allows us to rebuild the
        # runtime on the fly when varying the VTA designs)
        local_rpc = int(os.environ.get("VTA_LOCAL_SIM_RPC", "0"))
        if local_rpc:
            if env.TARGET == "sim":
                remote.get_function("vta.simulator.profiler_clear")()
            else:
                remote.get_function("vta.tsim.profiler_clear")()
            cost = time_f(data_arr, kernel_arr, bias_arr, res_arr)
            if env.TARGET == "sim":
                stats = json.loads(
                    remote.get_function("vta.simulator.profiler_status")())
            else:
                stats = json.loads(
                    remote.get_function("vta.tsim.profiler_status")())
        else:
            simulator.clear_stats()
            cost = time_f(data_arr, kernel_arr, bias_arr, res_arr)
            stats = simulator.stats()
    else:
        cost = time_f(data_arr, kernel_arr, bias_arr, res_arr)

    # Check correctness
    correct = False
    if check_correctness:
        res_orig = res_arr.asnumpy()
        if data_pack:
            res_orig = res_orig.transpose(
                (0, 4, 1, 5, 2, 3)).reshape(wl.batch, wl.out_filter,
                                            fout_height, fout_width)
            bias_np = bias_np.transpose(
                (0, 4, 1, 5, 2, 3)).reshape(wl.batch, wl.out_filter, 1, 1)
        res_ref = res_ref >> env.WGT_WIDTH
        res_ref += bias_np
        res_ref = np.clip(res_ref, 0, (1 << env.OUT_WIDTH - 1) - 1)
        res_ref = res_ref.astype(env.out_dtype)
        correct = np.allclose(res_orig, res_ref)

    gops = (num_ops / cost.mean) / float(10**9)
    status = "PASSED" if correct else "FAILED"
    if "arm_cpu" in target.keys:
        device = "CPU"
    elif "vta" in target.keys:
        device = "VTA"
    print("%s GROUP CONV2D TEST %s: Time cost = %g sec/op, %g GOPS" %
          (device, status, cost.mean, gops))

    return correct, cost, stats
Example #28
def run_gemm(
    env,
    remote,
    target,
    batch_size,
    in_feat,
    out_feat,
    check_correctness=True,
    print_ir=True,
    samples=4,
):

    # Perform packing only if we are targeting the accelerator
    if "arm_cpu" in target.keys:
        data_pack = False
    elif "vta" in target.keys:
        data_pack = True

    # Derive shapes depending upon packing
    a_shape = (batch_size, in_feat)
    w_shape = (out_feat, in_feat)
    if data_pack:
        data_shape = (batch_size // env.BATCH, in_feat // env.BLOCK_IN,
                      env.BATCH, env.BLOCK_IN)
        kernel_shape = (
            out_feat // env.BLOCK_OUT,
            in_feat // env.BLOCK_IN,
            env.BLOCK_OUT,
            env.BLOCK_IN,
        )
        fcompute = vta.top.dense_packed
        fschedule = vta.top.schedule_dense_packed
    else:
        data_shape = a_shape
        kernel_shape = w_shape
        fcompute = topi.x86.dense_nopack
        fschedule = topi.x86.schedule_dense_nopack
    data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype)
    kernel = te.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype)

    # Define base computation schedule
    with target:
        res = fcompute(data, kernel, None, env.acc_dtype)
        res = topi.right_shift(res, 8)
        res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1)
        res = topi.cast(res, env.out_dtype)
        # Derive base schedule
        s = fschedule([res])
        if print_ir:
            print(vta.lower(s, [data, kernel, res], simple_mode=True))

    # Derive number of ops
    num_ops = 2 * batch_size * in_feat * out_feat

    # @memoize("vta.tests.test_benchmark_topi.dense.verify")
    def get_ref_data():
        # derive min max for act, wgt types (max non inclusive)
        a_min, a_max = 0 - (1 << (env.INP_WIDTH - 1)), (1 << (env.INP_WIDTH - 1))
        w_min, w_max = 0 - (1 << (env.WGT_WIDTH - 1)), (1 << (env.WGT_WIDTH - 1))
        a_np = np.random.randint(a_min, a_max, size=a_shape).astype(data.dtype)
        w_np = np.random.randint(w_min, w_max,
                                 size=w_shape).astype(kernel.dtype)

        r_np = np.dot(a_np.astype(env.acc_dtype),
                      w_np.T.astype(env.acc_dtype)).astype(env.acc_dtype)
        return a_np, w_np, r_np

    # Data in original format
    data_np, kernel_np, res_ref = get_ref_data()
    if data_pack:
        data_np = data_np.reshape(batch_size // env.BATCH, env.BATCH,
                                  in_feat // env.BLOCK_IN,
                                  env.BLOCK_IN).transpose((0, 2, 1, 3))
        kernel_np = kernel_np.reshape(out_feat // env.BLOCK_OUT, env.BLOCK_OUT,
                                      in_feat // env.BLOCK_IN,
                                      env.BLOCK_IN).transpose((0, 2, 1, 3))

    # Build
    if "vta" in target.keys:
        mod = vta.build(s, [data, kernel, res],
                        target=target,
                        target_host=env.target_host,
                        name="dense")
    else:
        mod = tvm.build(s, [data, kernel, res],
                        target=target,
                        target_host=env.target_host,
                        name="dense")
    temp = utils.tempdir()
    mod.save(temp.relpath("dense.o"))
    remote.upload(temp.relpath("dense.o"))
    f = remote.load_module("dense.o")
    dev = remote.device(str(target))

    res_np = np.zeros(topi.utils.get_const_tuple(res.shape)).astype(res.dtype)
    data_arr = tvm.nd.array(data_np, dev)
    kernel_arr = tvm.nd.array(kernel_np, dev)
    res_arr = tvm.nd.array(res_np, dev)
    time_f = f.time_evaluator("dense", dev, number=samples)

    # In vta sim mode, collect simulator runtime statistics
    stats = {}
    cost = None
    if env.TARGET in ["sim", "tsim"]:
        # Check if we're in local RPC mode (allows us to rebuild the
        # runtime on the fly when varying the VTA designs)
        local_rpc = int(os.environ.get("VTA_LOCAL_SIM_RPC", "0"))
        if local_rpc:
            if env.TARGET == "sim":
                remote.get_function("vta.simulator.profiler_clear")()
            else:
                remote.get_function("vta.tsim.profiler_clear")()
            cost = time_f(data_arr, kernel_arr, res_arr)
            if env.TARGET == "sim":
                stats = json.loads(
                    remote.get_function("vta.simulator.profiler_status")())
            else:
                stats = json.loads(
                    remote.get_function("vta.tsim.profiler_status")())
        else:
            simulator.clear_stats()
            cost = time_f(data_arr, kernel_arr, res_arr)
            stats = simulator.stats()
    else:
        cost = time_f(data_arr, kernel_arr, res_arr)

    # Check correctness
    correct = False
    if check_correctness:
        res_orig = res_arr.numpy()
        if data_pack:
            res_orig = res_orig.reshape(batch_size, out_feat)
        res_ref = res_ref >> 8
        res_ref = np.clip(res_ref, 0, (1 << env.OUT_WIDTH - 1) - 1)
        res_ref = res_ref.astype(env.out_dtype)
        correct = np.allclose(res_orig, res_ref)

    gops = (num_ops / cost.mean) / float(10**9)
    status = "PASSED" if correct else "FAILED"
    if "arm_cpu" in target.keys:
        device = "CPU"
    elif "vta" in target.keys:
        device = "VTA"
    print("%s DENSE TEST %s: Time cost = %g sec/op, %g GOPS" %
          (device, status, cost.mean, gops))

    return correct, cost, stats