def test_bfloat_add_and_cast_1():
    X = te.placeholder((3, ), name="X")
    Y = te.placeholder((3, ), name="Y")
    Z = topi.cast(topi.cast(X, dtype="custom[bfloat]16") +
                  topi.cast(Y, dtype="custom[bfloat]16"),
                  dtype="float")

    s = te.create_schedule([Z.op])
    built_cast = lower_datatypes_and_build(s, [X, Y, Z])

    ctx = tvm.context(tgt, 0)

    # Used float32 calculator at http://www.weitz.de/ieee/. Generated float32s
    # with at most 7-bit mantissas which, when added, produce a result with at
    # most 7-bit mantissas. This is to ensure there are no errors due to
    # float32->bfloat16 conversions.
    x = tvm.nd.array(np.array([4.4103796E-32, 14942208.0,
                               1.78125]).astype("float32"),
                     ctx=ctx)
    y = tvm.nd.array(np.array([-3.330669E-14, 19660800.0,
                               2.25]).astype("float32"),
                     ctx=ctx)
    z_expected = np.array([-3.330669E-14, 34603008.0,
                           4.03125]).astype("float32")
    z = tvm.nd.empty(Z.shape, dtype=Z.dtype, ctx=ctx)

    built_cast(x, y, z)

    assert np.array_equal(z_expected, z.asnumpy())
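
The comment above relies on every input value being exactly representable in bfloat16.
A minimal NumPy sketch (ours, not part of the test; the helper name fits_in_bfloat16 is
hypothetical) of one way to check that a float32 uses at most 7 mantissa bits and so
survives a truncating float32 -> bfloat16 round trip:

import numpy as np

def fits_in_bfloat16(value):
    # bfloat16 keeps the sign bit, the 8 exponent bits and the top 7 mantissa bits of
    # a float32, so a value round-trips exactly iff its low 16 bits are all zero.
    bits = np.asarray(value, dtype=np.float32).view(np.uint32)
    return bool((bits & np.uint32(0xFFFF)) == 0)

assert all(fits_in_bfloat16(v) for v in [1.78125, 2.25, 4.03125])
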
def test_bfloat_add_and_cast_2():
    X = te.placeholder((3, ), name="X")
    Y = te.placeholder((3, ), name="Y")
    Z = topi.cast(topi.cast(X, dtype="custom[bfloat]16") +
                  topi.cast(Y, dtype="custom[bfloat]16"),
                  dtype="float")

    s = te.create_schedule([Z.op])
    built_cast = lower_datatypes_and_build(s, [X, Y, Z])

    ctx = tvm.context(tgt, 0)

    # Used float32 calculator at http://www.weitz.de/ieee/. Generated
    # unconstrained float32s for the operands and copied them in to x and y.
    # Then, to simulate float32->bfloat16 conversion implemented by the mybfloat
    # library, I cut off all but 7 bits of the mantissa. I then added the
    # numbers. To simulate bfloat16 add implemented in mybfloat, I cut off all
    # but 7 bits of the result's mantissa. I then copied that value into
    # z_expected.
    x = tvm.nd.array(np.array([1.2348297, -1.0298302E25,
                               1.2034023E-30]).astype("float32"),
                     ctx=ctx)
    y = tvm.nd.array(np.array([-2.4992788, -9.888288E19,
                               9.342338E-29]).astype("float32"),
                     ctx=ctx)
    z_expected = np.array([-1.25, -1.027587E25,
                           9.426888E-29]).astype("float32")
    z = tvm.nd.empty(Z.shape, dtype=Z.dtype, ctx=ctx)

    built_cast(x, y, z)

    assert np.array_equal(z_expected, z.asnumpy())
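
A small NumPy sketch (ours, not used by the test) of the mantissa truncation described in
the comment above: zeroing all but the top 7 mantissa bits of a float32 mimics the
truncating float32 -> bfloat16 conversion attributed to the mybfloat library.

import numpy as np

def truncate_like_bfloat16(values):
    # Keep the sign, the exponent and the top 7 mantissa bits; drop the low 16 bits.
    bits = np.asarray(values, dtype=np.float32).view(np.uint32)
    return (bits & np.uint32(0xFFFF0000)).view(np.float32)

# Truncate x[0] and y[0], add, then truncate again: this reproduces z_expected[0].
a = truncate_like_bfloat16(1.2348297) + truncate_like_bfloat16(-2.4992788)
assert truncate_like_bfloat16(a) == np.float32(-1.25)
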
    def _transform(theta, input_dim, out_size, input_shape, dtype):
        num_batch = input_shape[0]
        height = input_shape[1]
        width = input_shape[2]
        num_channels = input_shape[3]

        theta = topi.reshape(theta, (num_batch, 2, 3))
        theta = topi.cast(theta, dtype)

        out_height = out_size[0]
        out_width = out_size[1]

        grid = _meshgrid(out_height, out_width)
        grid = topi.reshape(grid, (num_batch, 3, out_height * out_width))
        grid = topi.cast(grid, dtype=dtype)

        k = tvm.reduce_axis((0, 3), 'k')
        T_g = tvm.compute(
            (num_batch, 2, out_height * out_width),
            lambda b, y, x: tvm.sum(theta[b, y, k] * grid[b, k, x], axis=k),
            name='T_g')

        x_s = tvm.compute((num_batch, 1, out_height * out_width),
                          lambda i, j, k: T_g[i, 0, k], name='x_s')
        y_s = tvm.compute((num_batch, 1, out_height * out_width),
                          lambda i, j, k: T_g[i, 1, k], name='y_s')

        x_s_flat = topi.reshape(x_s, (num_batch * out_height * out_width,))
        y_s_flat = topi.reshape(y_s, (num_batch * out_height * out_width,))

        input_transformed = _interpolate(input_dim, input_shape, x_s_flat,
                                         y_s_flat, out_size, dtype)
        output = topi.reshape(input_transformed,
                              [num_batch, out_height, out_width, num_channels])
        return output
Example #4
def upsampling(data,
               scale_h,
               scale_w,
               layout="NCHW",
               method='nearest_neighbor',
               align_corners=False):
    """Perform upsampling on the data.
       Nearest neighbor and bilinear upsampling are supported.

    Parameters
    ----------
    data : tvm.Tensor
        data is a 4-D tensor with shape
        [batch, channel, in_height, in_width]
        or  [batch, in_height, in_width, channel]

    scale_h : float
        Scaling factor for height

    scale_w : float
        Scaling factor for width

    layout : string, optional
        either "NCHW" or "NHWC"

    method : {"bilinear", "nearest_neighbor", "bicubic"}
        Method to be used for upsampling.

    Returns
    -------
    output : tvm.Tensor
        4-D with shape [batch, channel, in_height*scale_h, in_width*scale_w]
        or [batch, in_height*scale_h, in_width*scale_w, channel]
    """
    base_layout = layout[0:4]
    if base_layout == "NCHW":
        out_shape = (simplify(
            topi.cast(tvm.round(data.shape[2] * scale_h),
                      data.shape[2].dtype)),
                     simplify(
                         topi.cast(tvm.round(data.shape[3] * scale_w),
                                   data.shape[3].dtype)))
    elif base_layout == "NHWC":
        out_shape = (simplify(
            topi.cast(tvm.round(data.shape[1] * scale_h),
                      data.shape[1].dtype)),
                     simplify(
                         topi.cast(tvm.round(data.shape[2] * scale_w),
                                   data.shape[2].dtype)))

    else:
        raise ValueError("Layout {} is not supported yet".format(layout))
    coord_trans = "align_corners" if align_corners else "asymmetric"
    return topi.image.resize(data,
                             out_shape,
                             layout=layout,
                             method=method,
                             coordinate_transformation_mode=coord_trans)
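
A minimal usage sketch (ours), assuming the same old-style tvm/topi imports this snippet
uses; it upsamples an NCHW tensor by 2x in both spatial dimensions:

data = tvm.placeholder((1, 16, 32, 32), name="data", dtype="float32")
out = upsampling(data, scale_h=2.0, scale_w=2.0, layout="NCHW",
                 method="nearest_neighbor")
# out has shape (1, 16, 64, 64) and is produced via topi.image.resize
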
    def get_promoted(op):
        a = te.placeholder((100, ), dtype='bfloat16')
        b = te.placeholder((100, ), dtype='bfloat16')
        c = te.compute((100, ), lambda i: topi.cast(
            op(topi.cast(a[i], 'float'), topi.cast(b[i], 'float')), 'bfloat16'))
        s = te.create_schedule(c.op)
        func = tvm.driver.build_module.form_irmodule(s, [a, b, c], "main",
                                                     None)["main"]
        return func.body
Example #6
def conv2d(N, CI, H, W, CO, KH, KW, strides, padding, dilation):
    data_shape = (N // env.BATCH, CI // env.BLOCK_IN, H, W, env.BATCH,
                  env.BLOCK_IN)
    kernel_shape = (CO // env.BLOCK_OUT, CI // env.BLOCK_IN, KH, KW,
                    env.BLOCK_OUT, env.BLOCK_IN)
    bias_shape = (N // env.BATCH, CO // env.BLOCK_OUT, 1, 1, env.BATCH,
                  env.BLOCK_OUT)

    data = tvm.placeholder(data_shape, name="data", dtype=env.inp_dtype)
    kernel = tvm.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype)
    bias = tvm.placeholder(bias_shape, name="bias", dtype=env.acc_dtype)

    with tvm.target.vta():
        res = topi.nn.conv2d(input=data,
                             filter=kernel,
                             padding=padding,
                             strides=strides,
                             dilation=dilation,
                             layout='NCHW%dn%dc' % (env.BATCH, env.BLOCK_IN),
                             out_dtype=env.acc_dtype)
        res = topi.right_shift(res, env.WGT_WIDTH)
        res = topi.add(res, bias)
        res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1)
        res = topi.cast(res, env.out_dtype)

    if tvm.target.Target.current().device_name == 'vta':
        s = topi.generic.schedule_conv2d_nchw([res])
    else:
        s = tvm.create_schedule([res.op])

    return s, [data, kernel, bias, res]
    def verify(from_dtype, to_dtype, low=-100, high=100):
        shape = (5, 4)
        A = tvm.placeholder(shape, dtype=from_dtype, name="A")
        B = topi.cast(A, to_dtype)

        if from_dtype == "bool":
            a_np = np.random.choice([True, False], size=shape)
        else:
            a_np = np.random.uniform(low, high, size=shape).astype(from_dtype)
        if to_dtype == "bool":
            a_np = a_np - a_np[2, 3]
        b_np = a_np.astype(to_dtype)

        for device in get_all_backend():
            ctx = tvm.context(device, 0)
            if not ctx.exist:
                print("Skip because %s is not enabled" % device)
                continue
            print("Running on target: %s" % device)
            with tvm.target.create(device):
                s = topi.generic.schedule_injective(B)
            foo = tvm.build(s, [A, B], device)
            a = tvm.nd.array(a_np, ctx)
            b = tvm.nd.empty(shape=shape, dtype=to_dtype, ctx=ctx)
            foo(a, b)
            tvm.testing.assert_allclose(b.asnumpy(), b_np)
Example #8
    def check(t0, t1):
        if (t0 == "float16" or t1
                == "float16") and not have_fp16(tvm.gpu(0).compute_version):
            print("Skip because gpu does not have fp16 support")
            return

        # compute
        n = 128
        A = te.placeholder((n, ), dtype=t0, name='A')
        B = te.placeholder((n, ), dtype=t1, name='B')
        C = te.compute((n, ),
                       lambda i: A[i] + topi.cast(B[i], A.dtype),
                       name='C')

        # schedule
        s = tvm.te.create_schedule(C.op)
        ob, ib = s[C].split(s[C].op.axis[0], nparts=32)
        _, iib = s[C].split(ib, factor=4)
        s[C].vectorize(iib)
        s[C].bind(ob, tx)
        func = tvm.build(s, [A, B, C], "cuda")

        # correctness
        ctx = tvm.gpu(0)
        low, high = (0,
                     20) if t0.startswith('u') or t1.startswith('u') else (-10,
                                                                           10)
        a_np = np.random.randint(low, high, size=n).astype(A.dtype)
        b_np = np.random.randint(low, high, size=n).astype(B.dtype)
        c_np = (a_np + b_np).astype(A.dtype)
        a_nd = tvm.nd.array(a_np, ctx)
        b_nd = tvm.nd.array(b_np, ctx)
        c_nd = tvm.nd.array(np.zeros(c_np.shape, dtype=c_np.dtype), ctx)
        func(a_nd, b_nd, c_nd)
        tvm.testing.assert_allclose(c_nd.asnumpy(), c_np, rtol=1e-3)
def group_conv2d(N, CI, H, W, CO, KH, KW, strides, padding, dilation, groups):

    CI_G = CI // groups
    data_shape = (N // env.BATCH, CI // env.BLOCK_IN, H, W, env.BATCH,
                  env.BLOCK_IN)
    kernel_shape = (CO // env.BLOCK_OUT, CI_G // env.BLOCK_IN, KH, KW,
                    env.BLOCK_OUT, env.BLOCK_IN)
    bias_shape = (N // env.BATCH, CO // env.BLOCK_OUT, 1, 1, env.BATCH,
                  env.BLOCK_OUT)

    data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype)
    kernel = te.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype)
    bias = te.placeholder(bias_shape, name="bias", dtype=env.acc_dtype)

    with tvm.target.vta():
        res = topi.nn.group_conv2d_nchw(data, kernel, strides, padding,
                                        dilation, groups, env.acc_dtype)
        res = topi.right_shift(res, env.WGT_WIDTH)
        res = topi.add(res, bias)
        res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1)
        res = topi.cast(res, env.out_dtype)

    if tvm.target.Target.current().device_name == 'vta':
        s = topi.generic.schedule_group_conv2d_nchw([res])
    else:
        s = te.create_schedule([res.op])

    return s, [data, kernel, bias, res]
Example #10
def conv2d_transpose(N, CI, H, W, CO, KH, KW, strides, padding):
    data_shape = (N // env.BATCH, CI // env.BLOCK_IN, H, W, env.BATCH,
                  env.BLOCK_IN)
    kernel_shape = (CO // env.BLOCK_OUT, CI // env.BLOCK_IN, KH, KW,
                    env.BLOCK_OUT, env.BLOCK_IN)

    data = tvm.placeholder(data_shape, name="data", dtype=env.inp_dtype)
    kernel = tvm.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype)

    with tvm.target.vta():
        res = topi.nn.conv2d_transpose_nchw(Input=data,
                                            Filter=kernel,
                                            strides=strides,
                                            padding=padding,
                                            out_dtype=env.acc_dtype)
        res = topi.right_shift(res, env.WGT_WIDTH)
        res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1)
        res = topi.cast(res, env.out_dtype)

    if tvm.target.current_target().device_name == 'vta':
        s = topi.generic.schedule_conv2d_transpose_nchw([res])
    else:
        s = tvm.create_schedule([res.op])

    return s, [data, kernel, res]
Example #11
def Cast(device="llvm",
         lib_path="./",
         ndim=None,
         src_dtype=None,
         dst_dtype=None):
    '''
    Build a cast kernel that converts a tensor from src_dtype to dst_dtype.

    Args:
        device: target backend to compile for, e.g. "llvm"
        lib_path: directory in which the generated library is saved
        ndim: number of dimensions of the input tensor
        src_dtype: dtype of the input tensor
        dst_dtype: dtype of the output tensor

    Returns:
        None. The compiled operator is emitted via Genlib.
    '''
    shape = [tvm.var("n" + str(i)) for i in range(ndim)]
    opname = "Cast_ndim%d_%s_%s" % (ndim, src_dtype, dst_dtype)
    print(opname)

    # define compute
    in_tensor = tvm.placeholder(shape, dtype=src_dtype, name='in_tensor')
    out_tensor = topi.cast(in_tensor, dst_dtype)
    tensor_list = [in_tensor, out_tensor]
    s = topi.generic.schedule_injective(out_tensor)
    Genlib(s, tensor_list, device, opname, lib_path)
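
A hypothetical invocation (ours), assuming Genlib and an LLVM toolchain are available:

Cast(device="llvm", lib_path="./", ndim=2, src_dtype="float32", dst_dtype="float16")
# builds the injective schedule and emits a library for the op named
# Cast_ndim2_float32_float16
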
def test_bfloat_add_and_cast_FloatImm():
    X = te.placeholder((3, ), name="X")
    Z = topi.cast(topi.add(topi.cast(X, dtype="custom[bfloat]16"),
                           tvm.tir.FloatImm("custom[bfloat]16", 1.5)),
                  dtype="float")

    s = te.create_schedule([Z.op])
    built_cast = lower_datatypes_and_build(s, [X, Z])

    ctx = tvm.context(tgt, 0)

    x = tvm.nd.array(np.array([0.0, 1.0, 1.5]).astype("float32"), ctx=ctx)
    z_expected = np.array([1.5, 2.5, 3.0]).astype("float32")
    z = tvm.nd.empty(Z.shape, dtype=Z.dtype, ctx=ctx)

    built_cast(x, z)

    assert np.array_equal(z_expected, z.asnumpy())
Example #13
def _compute_offset(in_tensor, in_shape, out_shape, attr_list, nz_format_flag):
    """
    the compute of scale
    Parameters
    ----------
    in_tensor : input tensor

    in_shape : the shape of input tensor

    out_shape :the shape of output tensor

    attr_list : the attr list

    nz_format_flag: the format of input tensor

    Returns
    -------
    res tensor
    """
    offset = attr_list[0]
    reform_flag = attr_list[1]
    scale = attr_list[2]
    if offset != 0 or scale == 1:
        offset_value = tvm.const(offset, "float16")
        if reform_flag:
            offset_ub = _reform_by_vadds(in_tensor, in_shape, out_shape,
                                         offset_value, nz_format_flag)
        else:
            offset_ub = tvm.compute(
                out_shape,
                lambda *indice: in_tensor(*indice) + offset_value,
                name="offset_ub")
        cast_i8_ub = tvm.compute(
            out_shape,
            lambda *indice: topi.cast(offset_ub(*indice), "int8"),
            name='cast_i8_ub')
    else:
        cast_i8_ub = tvm.compute(
            out_shape,
            lambda *indice: topi.cast(in_tensor(*indice), "int8"),
            name='cast_i8_ub')
    return cast_i8_ub
    def to16(v):
        uint32_v = tvm.tir.call_intrin("uint32", "tir.reinterpret", v)
        rounding_bias = tvm.tir.call_intrin("uint32", "tir.shift_right",
                                            uint32_v,
                                            tvm.tir.const(16, "uint32"))
        rounding_bias = tvm.tir.call_intrin("uint32", "tir.bitwise_and",
                                            rounding_bias,
                                            tvm.tir.const(1, "uint32"))
        rounding_bias = rounding_bias + tvm.tir.const(0x7FFF, "uint16")
        uint32_v = uint32_v + rounding_bias
        uint32_v = tvm.tir.call_intrin("uint32", "tir.shift_right", uint32_v,
                                       tvm.tir.const(16, "uint32"))
        return topi.cast(uint32_v, 'uint16')
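
A NumPy analogue (our sketch, ignoring NaN special cases) of the rounding sequence in
to16 above: add a bias of 0x7FFF plus the lowest bit that will survive the shift, then
shift right by 16, which gives round-to-nearest-even float32 -> bfloat16 conversion.

import numpy as np

def float32_to_bfloat16_bits(values):
    u = np.asarray(values, dtype=np.float32).view(np.uint32)
    rounding_bias = ((u >> np.uint32(16)) & np.uint32(1)) + np.uint32(0x7FFF)
    return ((u + rounding_bias) >> np.uint32(16)).astype(np.uint16)

assert float32_to_bfloat16_bits(1.0) == np.uint16(0x3F80)
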
Example #15
def pick_single_out(outs):
    type_level = {"bool": 1, "int8": 2, "int32": 3, "float16": 4, "float32": 5}
    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
    ori_out = outs
    fake_out = False
    fork_node = []

    if len(outs) > 1:
        outs, fork_node = check_multi_out(outs)
        if len(outs) > 1:
            fake_op = outs[0]
            highest_type = outs[0].dtype
            for node in outs[1:]:
                if node.dtype != highest_type:
                    if type_level[highest_type] > type_level[node.dtype]:
                        node = topi.cast(node, highest_type)
                    else:
                        highest_type = node.dtype
                        fake_op = topi.cast(fake_op, highest_type)
                fake_op = topi.add(node, fake_op)
                fake_out = True
            outs = [fake_op]
    tmp_out = [op for op in ori_out if op not in outs]
    return outs, tmp_out, fake_out, fork_node
    def _topi_nn_conv2d(*args, **kwargs):
        assert not kwargs, "Do not support kwargs in template function call"
        A, W = args[:2]

        with tvm.target.vta():
            res = vta.top.conv2d_packed(*args, **kwargs)
            res = topi.right_shift(res, 8)
            res = my_clip(res, 0, 127)
            res = topi.cast(res, "int8")

        if tvm.target.Target.current().device_name == 'vta':
            s = vta.top.schedule_conv2d_packed([res])
        else:
            s = te.create_schedule([res.op])
        return s, [A, W, res]
Example #17
    def _topi_nn_conv2d(*args, **kwargs):
        assert not kwargs, "Do not support kwargs in template function call"
        args = deserialize_args(args)
        A, W = args[:2]

        with tvm.target.vta():
            res = topi.nn.conv2d(*args, **kwargs)
            res = topi.right_shift(res, 8)
            res = my_clip(res, 0, 127)
            res = topi.cast(res, "int8")

        if tvm.target.current_target().device_name == 'vta':
            s = topi.generic.schedule_conv2d_nchw([res])
        else:
            s = tvm.create_schedule([res.op])
        return s, [A, W, res]
Example #18
def dense(N, CI, CO):
    data_shape = (N//env.BATCH, CI//env.BLOCK_IN, env.BATCH, env.BLOCK_IN)
    kernel_shape = (CO//env.BLOCK_OUT, CI//env.BLOCK_IN, env.BLOCK_OUT, env.BLOCK_IN)

    data = tvm.placeholder(data_shape, name="data", dtype=env.inp_dtype)
    kernel = tvm.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype)

    with tvm.target.vta():
        res = topi.nn.dense(data, kernel, None, 'int32')
        res = topi.right_shift(res, 8)
        res = my_clip(res, 0, 127)
        res = topi.cast(res, "int8")

    if tvm.target.Target.current().device_name == 'vta':
        s = topi.generic.schedule_dense([res])
    else:
        s = tvm.create_schedule([res.op])

    return s, [data, kernel, res]
Example #19
    def run_cpu_conv2d(env, remote, key, batch_size, wl, profile=True):
        data_shape = (batch_size, wl.in_filter, wl.height, wl.width)
        kernel_shape = (wl.out_filter, wl.in_filter, wl.hkernel, wl.wkernel)

        fout_height = (wl.height + 2 * wl.hpad - wl.hkernel) // wl.hstride + 1
        fout_width = (wl.width + 2 * wl.wpad - wl.wkernel) // wl.wstride + 1
        data = tvm.placeholder(data_shape, name="data", dtype=env.inp_dtype)
        kernel = tvm.placeholder(kernel_shape,
                                 name="kernel",
                                 dtype=env.wgt_dtype)
        res_conv = topi.nn.conv2d(data,
                                  kernel,
                                  padding=(wl.hpad, wl.wpad),
                                  strides=(wl.hstride, wl.wstride),
                                  out_dtype="int32")
        res = topi.right_shift(res_conv, 8)
        res = my_clip(res, 0, 127)
        res = topi.cast(res, "int8")

        # To compute number of ops, use a x2 factor for FMA
        num_ops = 2 * batch_size * fout_height * fout_width * wl.hkernel * wl.wkernel * wl.out_filter * wl.in_filter

        a_shape = (batch_size, wl.in_filter, wl.height, wl.width)
        w_shape = (wl.out_filter, wl.in_filter, wl.hkernel, wl.wkernel)
        stride = (wl.hstride, wl.wstride)
        data_dtype = data.dtype
        kernel_dtype = kernel.dtype
        acc_dtype = env.acc_dtype
        assert wl.hpad == wl.wpad
        padding = wl.hpad

        @memoize("vta.tests.test_benchmark_topi.conv2d.cpu.verify_nhwc")
        def get_ref_data():
            a_np = (np.random.uniform(size=a_shape) * 4).astype(data_dtype)
            w_np = (np.random.uniform(size=w_shape) * 4).astype(kernel_dtype)
            a_np = np.abs(a_np)
            w_np = np.abs(w_np)
            b_np = topi.testing.conv2d_nchw_python(a_np.astype(acc_dtype),
                                                   w_np.astype(acc_dtype),
                                                   stride,
                                                   padding).astype(acc_dtype)
            return a_np, w_np, b_np

        def verify(s, check_correctness):
            mod = tvm.build(s, [data, kernel, res],
                            target_host=env.target_host,
                            name="conv2d")
            temp = util.tempdir()
            mod.save(temp.relpath("conv2d.o"))
            remote.upload(temp.relpath("conv2d.o"))
            f = remote.load_module("conv2d.o")
            # verify
            ctx = remote.cpu(0)
            # Data in original format
            data_orig, kernel_orig, res_ref = get_ref_data()
            res_shape = topi.util.get_const_tuple(res.shape)
            res_np = np.zeros(res_shape).astype(res.dtype)
            data_arr = tvm.nd.array(data_orig, ctx)
            kernel_arr = tvm.nd.array(kernel_orig, ctx)
            res_arr = tvm.nd.array(res_np, ctx)
            time_f = f.time_evaluator("conv2d", ctx, number=5)
            cost = time_f(data_arr, kernel_arr, res_arr)
            res_unpack = res_arr.asnumpy()
            if check_correctness:
                assert wl.hpad == wl.wpad
                stride = (wl.hstride, wl.wstride)
                padding = wl.hpad
                res_ref = res_ref >> 8
                res_ref = np.clip(res_ref, 0, 127).astype("int8")
                tvm.testing.assert_allclose(res_unpack, res_ref)
            return cost

        def conv_normal(print_ir):
            print("----- CONV2D CPU End-to-End Test-------")
            s = topi.generic.schedule_conv2d_nchw([res])
            if print_ir:
                print(tvm.lower(s, [data, kernel, res], simple_mode=True))
            cost = verify(s, True)
            gops = (num_ops / cost.mean) / float(10**9)
            print("\tTime cost = %g sec/op, %g GOPS" % (cost.mean, gops))

        conv_normal(False)
    def to32(v):
        uint32_v = topi.cast(v, "uint32")
        uint32_v = tvm.tir.call_pure_intrin("uint32", "tir.shift_left",
                                            uint32_v,
                                            tvm.tir.const(16, "uint32"))
        return tvm.tir.call_pure_intrin("float32", "tir.reinterpret", uint32_v)

    def to32(v):
        return topi.cast(v, 'float')
Example #22
def run_conv2d(env, remote, wl, target,
               check_correctness=True, print_ir=False,
               samples=4):

    # Workload assertions
    assert wl.hpad == wl.wpad

    # Perform packing only if we are targeting the accelerator
    if "arm_cpu" in target.keys:
        data_pack = False
        layout = "NCHW"
    elif "vta" in target.keys:
        data_pack = True
        layout = "NCHW%dn%dc" % (env.BATCH, env.BLOCK_IN)

    # Derive shapes depending upon packing
    a_shape = (wl.batch, wl.in_filter, wl.height, wl.width)
    w_shape = (wl.out_filter, wl.in_filter, wl.hkernel, wl.wkernel)
    b_shape = (wl.batch, wl.out_filter, 1, 1)
    if data_pack:
        data_shape = (wl.batch//env.BATCH, wl.in_filter//env.BLOCK_IN,
                      wl.height, wl.width, env.BATCH, env.BLOCK_IN)
        kernel_shape = (wl.out_filter//env.BLOCK_OUT, wl.in_filter//env.BLOCK_IN,
                        wl.hkernel, wl.wkernel, env.BLOCK_OUT, env.BLOCK_IN)
        bias_shape = (wl.batch//env.BATCH, wl.out_filter//env.BLOCK_OUT,
                      1, 1, env.BATCH, env.BLOCK_OUT)
    else:
        data_shape = a_shape
        kernel_shape = w_shape
        bias_shape = b_shape
    data = tvm.placeholder(data_shape, name="data", dtype=env.inp_dtype)
    kernel = tvm.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype)
    bias = tvm.placeholder(bias_shape, name="bias", dtype=env.acc_dtype)

    # Define base computation schedule
    with target:
        res = topi.nn.conv2d(
            data, kernel, (wl.hstride, wl.wstride), (wl.hpad, wl.wpad), (1, 1),
            layout, env.acc_dtype)
        res = topi.right_shift(res, 8)
        res = topi.add(res, bias)
        res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1)
        res = topi.cast(res, env.out_dtype)
        # Derive base schedule
        s = topi.generic.schedule_conv2d_nchw([res])
        if print_ir:
            print(vta.lower(s, [data, kernel, bias, res], simple_mode=True))

    # Derive number of ops
    fout_height = (wl.height + 2 * wl.hpad - wl.hkernel) // wl.hstride + 1
    fout_width = (wl.width + 2 * wl.wpad - wl.wkernel) // wl.wstride + 1
    num_ops = 2 * wl.batch * fout_height * fout_width * wl.hkernel * wl.wkernel * wl.out_filter * wl.in_filter

    # @memoize("vta.tests.test_benchmark_topi.conv2d.verify_nhwc")
    def get_ref_data():
        # derive min max for act, wgt, and bias types (max non inclusive)
        a_min, a_max = 0 - (1 << (env.INP_WIDTH - 1)), (1 << (env.INP_WIDTH - 1))
        w_min, w_max = 0 - (1 << (env.WGT_WIDTH - 1)), (1 << (env.WGT_WIDTH - 1))
        b_min, b_max = 0 - 1 << (env.INP_WIDTH + env.WGT_WIDTH - 2), 1 << (env.INP_WIDTH + env.WGT_WIDTH - 2)
        a_np = np.random.randint(a_min, a_max, size=a_shape).astype(data.dtype)
        w_np = np.random.randint(w_min, w_max, size=w_shape).astype(kernel.dtype)
        b_np = np.random.randint(b_min, b_max, size=b_shape).astype(env.acc_dtype)
        r_np = topi.testing.conv2d_nchw_python(
            a_np.astype(env.acc_dtype), w_np.astype(env.acc_dtype), (wl.hstride, wl.wstride), wl.hpad).astype(env.acc_dtype)
        return a_np, w_np, b_np, r_np

    # Data in original format
    data_np, kernel_np, bias_np, res_ref = get_ref_data()
    if data_pack:
        data_np = data_np.reshape(
            wl.batch//env.BATCH, env.BATCH,
            wl.in_filter//env.BLOCK_IN, env.BLOCK_IN,
            wl.height, wl.width).transpose((0, 2, 4, 5, 1, 3))
        kernel_np = kernel_np.reshape(
            wl.out_filter//env.BLOCK_OUT, env.BLOCK_OUT,
            wl.in_filter//env.BLOCK_IN, env.BLOCK_IN,
            wl.hkernel, wl.wkernel).transpose((0, 2, 4, 5, 1, 3))
        bias_np = bias_np.reshape(
            wl.batch//env.BATCH, wl.out_filter//env.BLOCK_OUT,
            1, 1, env.BATCH, env.BLOCK_OUT)

    # Build
    if "vta" in target.keys:
        mod = vta.build(s, [data, kernel, bias, res],
                        target=target,
                        target_host=env.target_host,
                        name="conv2d")
    else:
        mod = tvm.build(s, [data, kernel, bias, res],
                        target=target,
                        target_host=env.target_host,
                        name="conv2d")
    temp = util.tempdir()
    mod.save(temp.relpath("conv2d.o"))
    remote.upload(temp.relpath("conv2d.o"))
    f = remote.load_module("conv2d.o")
    ctx = remote.context(str(target))

    res_np = np.zeros(topi.util.get_const_tuple(res.shape)).astype(res.dtype)
    data_arr = tvm.nd.array(data_np, ctx)
    kernel_arr = tvm.nd.array(kernel_np, ctx)
    bias_arr = tvm.nd.array(bias_np, ctx)
    res_arr = tvm.nd.array(res_np, ctx)
    time_f = f.time_evaluator("conv2d", ctx, number=samples)

    # In vta sim mode, collect simulator runtime statistics
    stats = {}
    cost = None
    if env.TARGET in ["sim", "tsim"]:
        # Check if we're in local RPC mode (allows us to rebuild the
        # runtime on the fly when varying the VTA designs)
        local_rpc = int(os.environ.get("VTA_LOCAL_SIM_RPC", "0"))
        if local_rpc:
            if env.TARGET == "sim":
                remote.get_function("vta.simulator.profiler_clear")()
            else:
                remote.get_function("vta.tsim.profiler_clear")()
            cost = time_f(data_arr, kernel_arr, bias_arr, res_arr)
            if env.TARGET == "sim":
                stats = json.loads(remote.get_function("vta.simulator.profiler_status")())
            else:
                stats = json.loads(remote.get_function("vta.tsim.profiler_status")())
        else:
            simulator.clear_stats()
            cost = time_f(data_arr, kernel_arr, bias_arr, res_arr)
            stats = simulator.stats()
    else:
        cost = time_f(data_arr, kernel_arr, bias_arr, res_arr)

    # Check correctness
    correct = False
    if check_correctness:
        res_orig = res_arr.asnumpy()
        if data_pack:
            res_orig = res_orig.transpose(
                (0, 4, 1, 5, 2, 3)).reshape(wl.batch, wl.out_filter, fout_height, fout_width)
            bias_np = bias_np.transpose(
                (0, 4, 1, 5, 2, 3)).reshape(wl.batch, wl.out_filter, 1, 1)
        res_ref = res_ref >> env.WGT_WIDTH
        res_ref += bias_np
        res_ref = np.clip(res_ref, 0, (1 << env.OUT_WIDTH - 1) - 1)
        res_ref = res_ref.astype(env.out_dtype)
        correct = np.allclose(res_orig, res_ref)

    gops = (num_ops / cost.mean) / float(10 ** 9)
    status = "PASSED" if correct else "FAILED"
    if "arm_cpu" in target.keys:
        device = "CPU"
    elif "vta" in target.keys:
        device = "VTA"
    print("%s CONV2D TEST %s: Time cost = %g sec/op, %g GOPS" % (device, status, cost.mean, gops))

    return correct, cost, stats
    def to16(v):
        return topi.cast(v, 'bfloat16')
Example #24
File: tensor.py  Project: aswinjohn/tvm
def compute_cast(attrs, inputs, _):
    """Compute definition of cast"""
    dtype = attrs.get_string("dtype")
    return topi.cast(inputs[0], dtype)
def upsampling3d(data,
                 scale_d,
                 scale_h,
                 scale_w,
                 layout="NCDHW",
                 method='nearest_neighbor',
                 coordinate_transformation_mode="half_pixel"):
    """Perform upsampling on the data.
       Nearest neighbor and bilinear upsampling are supported.

    Parameters
    ----------
    data : tvm.te.Tensor
        data is a 5-D tensor with shape
        [batch, channel, in_depth, in_height, in_width]
        or  [batch, in_depth, in_height, in_width, channel]

    scale_d : float
        Scaling factor for depth

    scale_h : float
        Scaling factor for height

    scale_w : float
        Scaling factor for width

    layout : string, optional
        either "NCDHW" or "NDHWC"

    method : {"trilinear", "nearest_neighbor"}
        Method to be used for upsampling.

    coordinate_transformation_mode: string, optional
        Describes how to transform the coordinate in the resized tensor
        to the coordinate in the original tensor.
        Refer to the ONNX Resize operator specification for details.
        Available options are "half_pixel", "align_corners" and "asymmetric".

    Returns
    -------
    output : tvm.te.Tensor
        5-D with shape [batch, channel, in_depth*scale_d, in_height*scale_h, in_width*scale_w]
        or [batch, in_depth*scale_d, in_height*scale_h, in_width*scale_w, channel]
    """
    base_layout = layout[0:5]
    if base_layout == "NCDHW":
        out_shape = (simplify(
            topi.cast(te.round(data.shape[2] * scale_d), data.shape[2].dtype)),
                     simplify(
                         topi.cast(te.round(data.shape[3] * scale_h),
                                   data.shape[3].dtype)),
                     simplify(
                         topi.cast(te.round(data.shape[4] * scale_w),
                                   data.shape[4].dtype)))
    elif base_layout == "NDHWC":
        out_shape = (simplify(
            topi.cast(te.round(data.shape[1] * scale_d), data.shape[1].dtype)),
                     simplify(
                         topi.cast(te.round(data.shape[2] * scale_h),
                                   data.shape[2].dtype)),
                     simplify(
                         topi.cast(te.round(data.shape[3] * scale_w),
                                   data.shape[3].dtype)))

    else:
        raise ValueError("Layout {} is not supported yet".format(layout))
    return topi.image.resize3d(
        data,
        out_shape,
        layout=layout,
        method=method,
        coordinate_transformation_mode=coordinate_transformation_mode)
    def _interpolate(im, im_shape, x, y, out_size, dtype):
        num_batch = im_shape[0]
        height = im_shape[1]
        width = im_shape[2]
        channels = im_shape[3]

        out_height = out_size[0]
        out_width = out_size[1]
        max_y = int(im_shape[1] - 1)
        max_x = int(im_shape[2] - 1)

        # [-1, 1] -> [0, width - 1]
        x = topi.multiply(topi.add(x, tvm.const(1, dtype=dtype)),
                          width / tvm.const(2, dtype=dtype))
        y = topi.multiply(topi.add(y, tvm.const(1, dtype=dtype)),
                          height / tvm.const(2, dtype=dtype))

        # do sampling
        dim3 = out_height * out_width * num_batch

        x0 = topi.cast(topi.floor(x), 'int32')
        y0 = topi.cast(topi.floor(y), 'int32')
        x1 = topi.add(x0, tvm.const(1, dtype="int32"))
        y1 = topi.add(y0, tvm.const(1, dtype="int32"))

        x0 = topi.clip(x0, 0, max_x)
        x1 = topi.clip(x1, 0, max_x)
        y0 = topi.clip(y0, 0, max_y)
        y1 = topi.clip(y1, 0, max_y)

        dim2 = width
        dim1 = width * height

        base = tvm.compute(
            (dim3,),
            lambda i: (i // (out_height * out_width)) * width * height,
            name='base')
        base_y0 = topi.add(base, topi.multiply(y0, dim2))
        base_y1 = topi.add(base, topi.multiply(y1, dim2))

        idx_a = topi.add(base_y0, x0)
        idx_b = topi.add(base_y1, x0)
        idx_c = topi.add(base_y0, x1)
        idx_d = topi.add(base_y1, x1)

        im_flat = topi.reshape(im, (num_batch * height * width, channels))
        im_flat = topi.cast(im_flat, dtype)

        Ia = tvm.compute((dim3, channels), lambda i, j: im_flat[idx_a[i], j], name='Ia')
        Ib = tvm.compute((dim3, channels), lambda i, j: im_flat[idx_b[i], j], name='Ib')
        Ic = tvm.compute((dim3, channels), lambda i, j: im_flat[idx_c[i], j], name='Ic')
        Id = tvm.compute((dim3, channels), lambda i, j: im_flat[idx_d[i], j], name='Id')

        x0_f = topi.cast(x0, dtype)
        x1_f = topi.cast(x1, dtype)
        y0_f = topi.cast(y0, dtype)
        y1_f = topi.cast(y1, dtype)
        wa = topi.expand_dims(topi.multiply(topi.subtract(x1_f, x), topi.subtract(y1_f, y)), 1)
        wb = topi.expand_dims(topi.multiply(topi.subtract(x1_f, x), topi.subtract(y, y0_f)), 1)
        wc = topi.expand_dims(topi.multiply(topi.subtract(x, x0_f), topi.subtract(y1_f, y)), 1)
        wd = topi.expand_dims(topi.multiply(topi.subtract(x, x0_f), topi.subtract(y, y0_f)), 1)

        output = topi.add(
            topi.add(topi.add(topi.multiply(wa, Ia), topi.multiply(wb, Ib)),
                     topi.multiply(wc, Ic)), topi.multiply(wd, Id))

        return output
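
A scalar NumPy sketch (ours) of the bilinear weights wa..wd used above: each sample is a
convex combination of its four neighbouring pixels, weighted by the area of the rectangle
opposite each corner.

import numpy as np

img = np.arange(16, dtype=np.float32).reshape(4, 4)  # img[row, col] = 4*row + col
x, y = 1.3, 2.6                                      # sample point (col, row)
x0, y0 = int(np.floor(x)), int(np.floor(y))
x1, y1 = x0 + 1, y0 + 1
wa = (x1 - x) * (y1 - y)  # weight of pixel (y0, x0)
wb = (x1 - x) * (y - y0)  # weight of pixel (y1, x0)
wc = (x - x0) * (y1 - y)  # weight of pixel (y0, x1)
wd = (x - x0) * (y - y0)  # weight of pixel (y1, x1)
value = wa * img[y0, x0] + wb * img[y1, x0] + wc * img[y0, x1] + wd * img[y1, x1]
assert np.isclose(value, 4 * y + x)  # bilinear interpolation of a linear image is exact
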
# Register lowering functions for ops on the custom datatype. create_lower_func lowers
# the op to a Call to a function of the provided name, e.g. BFloat16Add_wrapper.
tvm.datatype.register_op(tvm.datatype.create_lower_func("FloatToBFloat16_wrapper"),
                         "Cast", target, "bfloat", "float")
tvm.datatype.register_op(tvm.datatype.create_lower_func("BFloat16ToFloat_wrapper"),
                         "Cast", target, "float", "bfloat")
tvm.datatype.register_op(tvm.datatype.create_lower_func("BFloat16Add_wrapper"),
                         "Add", target, "bfloat")


# The basic program, but with casts to a custom datatype.
# Note how we specify the custom datatype: we indicate it using the special
# `custom[...]` syntax.
# Additionally, note the "16" after the datatype: this is the bitwidth of the
# custom datatype. This tells TVM that each instance of bfloat is 16 bits wide.
Z = topi.cast(
    topi.cast(X, dtype="custom[bfloat]16") +
    topi.cast(Y, dtype="custom[bfloat]16"),
    dtype="float32")

# Compile for LLVM (schedule, lower, and build)
schedule = tvm.create_schedule([Z.op])
lowered_func = tvm.lower(schedule, [X, Y, Z])
# Here, we manually lower custom datatypes. Soon, this will be incorporated
# directly into the TVM lower and build process.
lowered_func = tvm.ir_pass.LowerCustomDatatypes(lowered_func, target)
built_program = tvm.build(lowered_func, target=target)


# Finally, create a new array to hold the output and run the program.
z_bfloat = tvm.nd.empty(Z.shape, dtype=Z.dtype, ctx=context)

built_program(x, y, z_bfloat)
    def run_vta_conv2d(env, remote, key, batch_size, wl, profile=True):
        data_shape = (batch_size//env.BATCH, wl.in_filter//env.BLOCK_IN,
                      wl.height, wl.width, env.BATCH, env.BLOCK_IN)
        kernel_shape = (wl.out_filter//env.BLOCK_OUT, wl.in_filter//env.BLOCK_IN,
                        wl.hkernel, wl.wkernel, env.BLOCK_OUT, env.BLOCK_IN)
        bias_shape = (1, wl.out_filter//env.BLOCK_OUT, 1, 1, env.BATCH, env.BLOCK_OUT)

        fout_height = (wl.height + 2 * wl.hpad - wl.hkernel) // wl.hstride + 1
        fout_width = (wl.width + 2 * wl.wpad - wl.wkernel) // wl.wstride + 1
        data = tvm.placeholder(data_shape, name="data", dtype=env.inp_dtype)
        kernel = tvm.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype)
        bias = tvm.placeholder(bias_shape, name="bias", dtype=env.acc_dtype)

        res_conv = vta.top.packed_conv2d(
            data, kernel, padding=(wl.hpad, wl.wpad), strides=(wl.hstride, wl.wstride))
        res = topi.right_shift(res_conv, 8)
        res = topi.add(res, bias)
        res = my_clip(res, 0, 127)
        res = topi.cast(res, "int8")

        # To compute number of ops, use a x2 factor for FMA
        num_ops = 2 * batch_size * fout_height * fout_width * wl.hkernel * wl.wkernel * wl.out_filter * wl.in_filter

        a_shape = (batch_size, wl.in_filter, wl.height, wl.width)
        w_shape = (wl.out_filter, wl.in_filter, wl.hkernel, wl.wkernel)
        stride = (wl.hstride, wl.wstride)
        data_dtype = data.dtype
        kernel_dtype = kernel.dtype
        acc_dtype = env.acc_dtype
        assert wl.hpad == wl.wpad
        padding = wl.hpad

        @memoize("vta.tests.test_benchmark_topi.conv2d.verify_nhwc")
        def get_ref_data():
            a_np = (np.random.uniform(size=a_shape) * 4).astype(data_dtype)
            w_np = (np.random.uniform(size=w_shape) * 4).astype(kernel_dtype)
            a_np = np.abs(a_np)
            w_np = np.abs(w_np)
            b_np = topi.testing.conv2d_nchw_python(
                a_np.astype(acc_dtype), w_np.astype(acc_dtype), stride, padding).astype(acc_dtype)
            return a_np, w_np, b_np

        def verify(s, check_correctness):
            mod = vta.build(s, [data, kernel, bias, res], "ext_dev",
                            env.target_host, name="conv2d")
            temp = util.tempdir()

            mod.save(temp.relpath("conv2d.o"))
            remote.upload(temp.relpath("conv2d.o"))
            f = remote.load_module("conv2d.o")
            # verify
            ctx = remote.ext_dev(0)
            # Data in original format
            data_orig, kernel_orig, res_ref = get_ref_data()
            bias_orig = (np.random.uniform(size=(wl.out_filter,)) * 4).astype("int32")
            bias_orig = np.abs(bias_orig)

            data_packed = data_orig.reshape(
                batch_size//env.BATCH, env.BATCH,
                wl.in_filter//env.BLOCK_IN, env.BLOCK_IN,
                wl.height, wl.width).transpose((0, 2, 4, 5, 1, 3))
            kernel_packed = kernel_orig.reshape(
                wl.out_filter//env.BLOCK_OUT, env.BLOCK_OUT,
                wl.in_filter//env.BLOCK_IN, env.BLOCK_IN,
                wl.hkernel, wl.wkernel).transpose((0, 2, 4, 5, 1, 3))
            bias_packed = bias_orig.reshape(
                1, wl.out_filter // env.BLOCK_OUT, 1, 1, env.BATCH, env.BLOCK_OUT)
            res_shape = topi.util.get_const_tuple(res.shape)

            res_np = np.zeros(res_shape).astype(res.dtype)
            data_arr = tvm.nd.array(data_packed, ctx)
            kernel_arr = tvm.nd.array(kernel_packed, ctx)
            bias_arr = tvm.nd.array(bias_packed, ctx)
            res_arr = tvm.nd.array(res_np, ctx)
            time_f = f.time_evaluator("conv2d", ctx, number=5)
            cost = time_f(data_arr, kernel_arr, bias_arr, res_arr)
            res_unpack = res_arr.asnumpy().transpose(
                (0, 4, 1, 5, 2, 3)).reshape(batch_size, wl.out_filter, fout_height, fout_width)
            if check_correctness:
                assert wl.hpad == wl.wpad
                stride = (wl.hstride, wl.wstride)
                padding = wl.hpad
                res_ref = res_ref >> 8
                res_ref += bias_orig.reshape(wl.out_filter, 1, 1)
                res_ref = np.clip(res_ref, 0, 127).astype("int8")
                tvm.testing.assert_allclose(res_unpack, res_ref)
            return cost

        def conv_normal(print_ir):
            print("----- CONV2D End-to-End Test-------")
            with vta.build_config():
                s = vta.top.schedule_packed_conv2d([res])
                if print_ir:
                    print(vta.lower(s, [data, kernel, bias, res], simple_mode=True))
            cost = verify(s, True)
            gops = (num_ops / cost.mean) / float(10 ** 9)
            print("\tTime cost = %g sec/op, %g GOPS" % (cost.mean, gops))

        conv_normal(False)
Example #29
def tanh_split_input_by_val(shape, input_x, symbol):
    """
    split input into two tensor by 0.5
    shape : tensor shape
    input_x : tensor
    symbol : tensor symbol
    return: res, operations, scope
    """
    res = {}
    operation = {}
    scope = {}
    dtype_x = input_x.dtype
    const_zero = tvm.const(0.0, dtype="float16")
    const_0 = tvm.const(0.5, dtype="float16")

    key = "input_abs_" + symbol
    input_abs = tvm.compute(shape, lambda *i: tvm.abs(input_x(*i)), name=key)
    res[key] = input_abs
    operation[key] = "vector_abs"
    scope[key] = cce.scope_ubuf

    # vcmp only support fp16
    if dtype_x == "float32":
        key = "cmp_val_fp16_" + symbol
        cmp_val_fp16 = tvm.compute(
            shape, lambda *i: topi.cast(input_abs(*i), "float16"), name=key)
        res[key] = cmp_val_fp16
        operation[key] = "vector_conv"
        scope[key] = cce.scope_ubuf

        key = "input_val_fp16_" + symbol
        input_val_fp16 = tvm.compute(
            shape, lambda *i: topi.cast(input_x(*i), "float16"), name=key)
        res[key] = input_val_fp16
        operation[key] = "vector_conv"
        scope[key] = cce.scope_ubuf

        key = "input_gt_fp16_" + symbol
        input_gt_fp16 = \
            tvm.compute(shape,
                        lambda *i: tvm.select(cmp_val_fp16(*i) > const_0,
                                              input_val_fp16(*i), const_zero),
                        name=key)
        res[key] = input_gt_fp16
        operation[key] = "vector_select_gt"
        scope[key] = cce.scope_ubuf

        key = "input_lt_fp16_" + symbol
        input_lt_fp16 = \
            tvm.compute(shape,
                        lambda *i: tvm.select(cmp_val_fp16(*i) <= const_0,
                                              input_val_fp16(*i), const_zero),
                        name=key)
        res[key] = input_lt_fp16
        operation[key] = "vector_select_le"
        scope[key] = cce.scope_ubuf

        key = "input_gt_" + symbol
        input_gt = tvm.compute(
            shape,
            lambda *i: topi.cast(input_gt_fp16(*i), "float32"),
            name=key)
        res[key] = input_gt
        operation[key] = "vector_conv"
        scope[key] = cce.scope_ubuf

        key = "input_lt_" + symbol
        input_lt = tvm.compute(
            shape,
            lambda *i: topi.cast(input_lt_fp16(*i), "float32"),
            name=key)
        res[key] = input_lt
        operation[key] = "vector_conv"
        scope[key] = cce.scope_ubuf
    else:
        key = "input_gt_" + symbol
        input_gt = tvm.compute(
            shape,
            lambda *i: tvm.select(
                input_abs(*i) > const_0, input_x(*i), const_zero),
            name=key)
        res[key] = input_gt
        operation[key] = "vector_select_gt"
        scope[key] = cce.scope_ubuf

        key = "input_lt_" + symbol
        input_lt = tvm.compute(
            shape,
            lambda *i: tvm.select(
                input_abs(*i) <= const_0, input_x(*i), const_zero),
            name=key)
        res[key] = input_lt
        operation[key] = "vector_select_le"
        scope[key] = cce.scope_ubuf

    return res, operation, scope
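
A NumPy sketch (ours) of the split this function performs: elements whose absolute value
exceeds 0.5 go into one tensor and the remaining elements into another, with zeros in the
vacated positions.

import numpy as np

x = np.array([-0.9, -0.3, 0.2, 0.7], dtype=np.float16)
input_gt = np.where(np.abs(x) > 0.5, x, np.float16(0.0))   # [-0.9, 0.0, 0.0, 0.7]
input_lt = np.where(np.abs(x) <= 0.5, x, np.float16(0.0))  # [0.0, -0.3, 0.2, 0.0]
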
Example #30
def tanh_compute_high_performance(shape, input_x, symbol):
    """
    the function of tanh
    Parameters
    ----------
    shape : tensor shape
    input_x : tensor
    symbol : tensor symbol
    Returns
    -------
    """
    res = {}
    operation = {}
    scope = {}

    dtype_x = input_x.dtype
    const_one = tvm.const(1, dtype=dtype_x)
    const_neg_two = tvm.const(-2, dtype=dtype_x)
    const_fp32_min = tvm.const(2**(-126), dtype=dtype_x)

    key = "input_abs_" + symbol
    input_abs = tvm.compute(shape, lambda *i: tvm.abs(input_x(*i)), name=key)
    res[key] = input_abs
    operation[key] = "vector_abs"
    scope[key] = cce.scope_ubuf

    key = "power_val_" + symbol
    power_val = tvm.compute(shape,
                            lambda *i: input_abs(*i) * const_neg_two,
                            name=key)
    res[key] = power_val
    operation[key] = "vector_muls"
    scope[key] = cce.scope_ubuf

    if dtype_x == "float32":
        key = "exp_val_fp16_" + symbol
        exp_val_fp16 = tvm.compute(
            shape, lambda *i: topi.cast(power_val(*i), "float16"), name=key)
        res[key] = exp_val_fp16
        operation[key] = "vector_conv"
        scope[key] = cce.scope_ubuf

        key = "exp_val_" + symbol
        exp_val = tvm.compute(shape,
                              lambda *i: tvm.exp(exp_val_fp16(*i)),
                              name=key)
        res[key] = exp_val
        operation[key] = "vector_exp"
        scope[key] = cce.scope_ubuf

        key = "exp_val_fp32_" + symbol
        exp_val_fp32 = tvm.compute(
            shape, lambda *i: topi.cast(exp_val(*i), "float32"), name=key)
        res[key] = exp_val_fp32
        operation[key] = "vector_conv"
        scope[key] = cce.scope_ubuf

        exp_val_true = exp_val_fp32
    else:
        key = "exp_val_" + symbol
        exp_val = tvm.compute(shape,
                              lambda *i: tvm.exp(power_val(*i)),
                              name=key)
        res[key] = exp_val
        operation[key] = "vector_exp"
        scope[key] = cce.scope_ubuf
        exp_val_true = exp_val

    key = "up_val_tmp_" + symbol
    up_val_tmp = tvm.compute(shape,
                             lambda *i: exp_val_true(*i) * input_x(*i),
                             name=key)
    res[key] = up_val_tmp
    operation[key] = "vector_mul"
    scope[key] = cce.scope_ubuf

    key = "up_val_" + symbol
    up_val = tvm.compute(shape,
                         lambda *i: input_x(*i) - up_val_tmp(*i),
                         name=key)
    res[key] = up_val
    operation[key] = "vector_sub"
    scope[key] = cce.scope_ubuf

    key = "input_tmp_" + symbol
    input_tmp = tvm.compute(shape,
                            lambda *i: input_abs(*i) + const_fp32_min,
                            name=key)
    res[key] = input_tmp
    operation[key] = "vector_adds"
    scope[key] = cce.scope_ubuf

    key = "down_val_tmp_" + symbol
    down_val_tmp = tvm.compute(shape,
                               lambda *i: exp_val_true(*i) + const_one,
                               name=key)
    res[key] = down_val_tmp
    operation[key] = "vector_adds"
    scope[key] = cce.scope_ubuf

    key = "down_val_" + symbol
    down_val = tvm.compute(shape,
                           lambda *i: down_val_tmp(*i) * input_tmp(*i),
                           name=key)
    res[key] = down_val
    operation[key] = "vector_mul"
    scope[key] = cce.scope_ubuf

    ub_rec = tvm.compute(shape,
                         lambda *i: const_one / down_val(*i),
                         name="ub_rec_" + symbol)
    res["ub_rec_" + symbol] = ub_rec
    operation["ub_rec_" + symbol] = "vector_rec"
    scope["ub_rec_" + symbol] = cce.scope_ubuf

    iter_num = 1
    tensor_list, scope_list, emit_list = newton_iteration(
        shape, ub_rec, down_val, symbol, iter_num)
    res.update(tensor_list)
    operation.update(emit_list)
    scope.update(scope_list)

    newton_res = tensor_list["tmp_" + symbol + str(iter_num - 1)]

    ub_tanh = tvm.compute(shape,
                          lambda *i: up_val(*i) * newton_res(*i),
                          name="ub_tanh_" + symbol)
    res["ub_tanh_" + symbol] = ub_tanh
    operation["ub_tanh_" + symbol] = "vector_mul"
    scope["ub_tanh_" + symbol] = cce.scope_ubuf

    return res, operation, scope
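
A NumPy check (our sketch) of the identity this kernel evaluates,
tanh(x) = x*(1 - exp(-2*|x|)) / (|x|*(1 + exp(-2*|x|))), with a tiny constant added to
|x| so that x == 0 does not divide by zero:

import numpy as np

x = np.linspace(-4.0, 4.0, 9, dtype=np.float32)
e = np.exp(-2.0 * np.abs(x))
approx = x * (1.0 - e) / ((np.abs(x) + 2.0**-126) * (1.0 + e))
assert np.allclose(approx, np.tanh(x), atol=1e-6)
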
    def run_cpu_conv2d(env, remote, key, batch_size, wl, profile=True):
        data_shape = (batch_size, wl.in_filter, wl.height, wl.width)
        kernel_shape = (wl.out_filter, wl.in_filter, wl.hkernel, wl.wkernel)

        fout_height = (wl.height + 2 * wl.hpad - wl.hkernel) // wl.hstride + 1
        fout_width = (wl.width + 2 * wl.wpad - wl.wkernel) // wl.wstride + 1
        data = tvm.placeholder(data_shape, name="data", dtype=env.inp_dtype)
        kernel = tvm.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype)
        res_conv = topi.nn.conv2d(
            data, kernel, padding=(wl.hpad, wl.wpad),
            strides=(wl.hstride, wl.wstride),
            dilation=(1, 1),
            out_dtype="int32")
        res = topi.right_shift(res_conv, 8)
        res = my_clip(res, 0, 127)
        res = topi.cast(res, "int8")

        # To compute number of ops, use a x2 factor for FMA
        num_ops = 2 * batch_size * fout_height * fout_width * wl.hkernel * wl.wkernel * wl.out_filter * wl.in_filter

        a_shape = (batch_size, wl.in_filter, wl.height, wl.width)
        w_shape = (wl.out_filter, wl.in_filter, wl.hkernel, wl.wkernel)
        stride = (wl.hstride, wl.wstride)
        data_dtype = data.dtype
        kernel_dtype = kernel.dtype
        acc_dtype = env.acc_dtype
        assert wl.hpad == wl.wpad
        padding = wl.hpad

        @memoize("vta.tests.test_benchmark_topi.conv2d.cpu.verify_nhwc")
        def get_ref_data():
            a_np = (np.random.uniform(size=a_shape) * 4).astype(data_dtype)
            w_np = (np.random.uniform(size=w_shape) * 4).astype(kernel_dtype)
            a_np = np.abs(a_np)
            w_np = np.abs(w_np)
            b_np = topi.testing.conv2d_nchw_python(
                a_np.astype(acc_dtype), w_np.astype(acc_dtype), stride, padding).astype(acc_dtype)
            return a_np, w_np, b_np


        def verify(s, check_correctness):
            mod = tvm.build(s, [data, kernel, res],
                            target_host=env.target_host,
                            name="conv2d")
            temp = util.tempdir()
            mod.save(temp.relpath("conv2d.o"))
            remote.upload(temp.relpath("conv2d.o"))
            f = remote.load_module("conv2d.o")
            # verify
            ctx = remote.cpu(0)
            # Data in original format
            data_orig, kernel_orig, res_ref = get_ref_data()
            res_shape = topi.util.get_const_tuple(res.shape)
            res_np = np.zeros(res_shape).astype(res.dtype)
            data_arr = tvm.nd.array(data_orig, ctx)
            kernel_arr = tvm.nd.array(kernel_orig, ctx)
            res_arr = tvm.nd.array(res_np, ctx)
            time_f = f.time_evaluator("conv2d", ctx, number=5)
            cost = time_f(data_arr, kernel_arr, res_arr)
            res_unpack = res_arr.asnumpy()
            if check_correctness:
                assert wl.hpad == wl.wpad
                stride = (wl.hstride, wl.wstride)
                padding = wl.hpad
                res_ref = res_ref >> 8
                res_ref = np.clip(res_ref, 0, 127).astype("int8")
                tvm.testing.assert_allclose(res_unpack, res_ref)
            return cost

        def conv_normal(print_ir):
            print("----- CONV2D CPU End-to-End Test-------")
            s = topi.generic.schedule_conv2d_nchw([res])
            if print_ir:
                print(tvm.lower(s, [data, kernel, res], simple_mode=True))
            cost = verify(s, True)
            gops = (num_ops / cost.mean) / float(10 ** 9)
            print("\tTime cost = %g sec/op, %g GOPS" % (cost.mean, gops))

        conv_normal(False)
Example #32
def run_gemm(env, remote, target,
             batch_size, in_feat, out_feat,
             check_correctness=True, print_ir=True,
             samples=4):

    # Perform packing only if we are targeting the accelerator
    if "arm_cpu" in target.keys:
        data_pack = False
    elif "vta" in target.keys:
        data_pack = True

    # Derive shapes depending upon packing
    a_shape = (batch_size, in_feat)
    w_shape = (out_feat, in_feat)
    if data_pack:
        data_shape = (batch_size//env.BATCH, in_feat//env.BLOCK_IN,
                      env.BATCH, env.BLOCK_IN)
        kernel_shape = (out_feat//env.BLOCK_OUT, in_feat//env.BLOCK_IN,
                        env.BLOCK_OUT, env.BLOCK_IN)
        fcompute = vta.top.dense_packed
        fschedule = vta.top.schedule_dense_packed
    else:
        data_shape = a_shape
        kernel_shape = w_shape
        fcompute = topi.x86.dense_nopack
        fschedule = topi.x86.schedule_dense_nopack
    data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype)
    kernel = te.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype)

    # Define base computation schedule
    with target:
        res = fcompute(
            data, kernel, None, env.acc_dtype)
        res = topi.right_shift(res, 8)
        res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1)
        res = topi.cast(res, env.out_dtype)
        # Derive base schedule
        s = fschedule([res])
        if print_ir:
            print(vta.lower(s, [data, kernel, res], simple_mode=True))

    # Derive number of ops
    num_ops = 2 * batch_size * in_feat * out_feat

    # @memoize("vta.tests.test_benchmark_topi.dense.verify")
    def get_ref_data():
        # derive min max for act, wgt types (max non inclusive)
        a_min, a_max = 0 - (1 << (env.INP_WIDTH - 1)), (1 << (env.INP_WIDTH - 1))
        w_min, w_max = 0 - (1 << (env.WGT_WIDTH - 1)), (1 << (env.WGT_WIDTH - 1))
        a_np = np.random.randint(a_min, a_max, size=a_shape).astype(data.dtype)
        w_np = np.random.randint(w_min, w_max, size=w_shape).astype(kernel.dtype)

        r_np = np.dot(a_np.astype(env.acc_dtype), w_np.T.astype(env.acc_dtype)).astype(env.acc_dtype)
        return a_np, w_np, r_np

    # Data in original format
    data_np, kernel_np, res_ref = get_ref_data()
    if data_pack:
        data_np = data_np.reshape(
            batch_size//env.BATCH, env.BATCH,
            in_feat//env.BLOCK_IN, env.BLOCK_IN).transpose((0, 2, 1, 3))
        kernel_np = kernel_np.reshape(
            out_feat//env.BLOCK_OUT, env.BLOCK_OUT,
            in_feat//env.BLOCK_IN, env.BLOCK_IN).transpose((0, 2, 1, 3))

    # Build
    if "vta" in target.keys:
        mod = vta.build(s, [data, kernel, res],
                        target=target,
                        target_host=env.target_host,
                        name="dense")
    else:
        mod = tvm.build(s, [data, kernel, res],
                        target=target,
                        target_host=env.target_host,
                        name="dense")
    temp = util.tempdir()
    mod.save(temp.relpath("dense.o"))
    remote.upload(temp.relpath("dense.o"))
    f = remote.load_module("dense.o")
    ctx = remote.context(str(target))

    res_np = np.zeros(topi.util.get_const_tuple(res.shape)).astype(res.dtype)
    data_arr = tvm.nd.array(data_np, ctx)
    kernel_arr = tvm.nd.array(kernel_np, ctx)
    res_arr = tvm.nd.array(res_np, ctx)
    time_f = f.time_evaluator("dense", ctx, number=samples)

    # In vta sim mode, collect simulator runtime statistics
    stats = {}
    cost = None
    if env.TARGET in ["sim", "tsim"]:
        # Check if we're in local RPC mode (allows us to rebuild the
        # runtime on the fly when varying the VTA designs)
        local_rpc = int(os.environ.get("VTA_LOCAL_SIM_RPC", "0"))
        if local_rpc:
            if env.TARGET == "sim":
                remote.get_function("vta.simulator.profiler_clear")()
            else:
                remote.get_function("vta.tsim.profiler_clear")()
            cost = time_f(data_arr, kernel_arr, res_arr)
            if env.TARGET == "sim":
                stats = json.loads(remote.get_function("vta.simulator.profiler_status")())
            else:
                stats = json.loads(remote.get_function("vta.tsim.profiler_status")())
        else:
            simulator.clear_stats()
            cost = time_f(data_arr, kernel_arr, res_arr)
            stats = simulator.stats()
    else:
        cost = time_f(data_arr, kernel_arr, res_arr)

    # Check correctness
    correct = False
    if check_correctness:
        res_orig = res_arr.asnumpy()
        if data_pack:
            res_orig = res_orig.reshape(batch_size, out_feat)
        res_ref = res_ref >> 8
        res_ref = np.clip(res_ref, 0, (1 << env.OUT_WIDTH - 1) - 1)
        res_ref = res_ref.astype(env.out_dtype)
        correct = np.allclose(res_orig, res_ref)

    gops = (num_ops / cost.mean) / float(10 ** 9)
    status = "PASSED" if correct else "FAILED"
    if "arm_cpu" in target.keys:
        device = "CPU"
    elif "vta" in target.keys:
        device = "VTA"
    print("%s DENSE TEST %s: Time cost = %g sec/op, %g GOPS" % (device, status, cost.mean, gops))

    return correct, cost, stats