Example No. 1
    #=================================================================#
    # copy to scratchpad.
    a_buf, a_dram = nnpu.utils.CopyHtoBuf(a, 'a')

    # here we simply use helper functions to do the transpose and reshape.
    trans_buf = nnpu.utils.transpose(a_buf, (1, 0))
    re_buf = nnpu.utils.reshape(trans_buf, (2, 4, 2, 4), dst_scope='buffer1')
    tile_buf = nnpu.utils.transpose(re_buf, (0, 2, 1, 3), dst_scope='buffer2')
    # copy back to host.
    tile_host, _ = nnpu.utils.CopyBufToH(tile_buf, 'tile')
    # ------ this ends the computation description. ------

    #==================================#
    # ------ begin scheduling ------
    #==================================#
    s = nnpu.create_schedule([tile_host.op])

    # since all operations are scratchpad copies, all we need to do is add pragmas.
    # the helper functions already did that, so there is nothing left to do here.
    
    #==================================#
    # ------ this ends the scheduling ------
    #==================================#

    print(tvm.lower(s, [a, tile_host], simple_mode=True))
    print(nnpu.lower(s, [a, tile_host], simple_mode=True))
    func = nnpu.build(s, [a, tile_host], 'nnpu', 'llvm', name='nnpu_func')
    print('------------------- device module 1 TVM IR: ')
    print(func.imported_modules[0].get_source('ir'))
    print('------------------- device module 1 uop: ')
    print(func.imported_modules[0].get_source('uop'))
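To see these copy-only stages actually produce the expected tiling, a run-and-verify step in the style of Example No. 6 could follow. This is a sketch, not part of the original snippet: it assumes a is an (8, 8) host placeholder (any 2-D shape whose transpose holds 64 elements would do), that numpy is imported as np, and it reuses the NNPU device type id 13 from Example No. 6.

    # ---- sketch only: run the module and check the tiling against NumPy ----
    ctx = tvm.nd.TVMContext(13, 0)
    a_np = np.random.randint(low=-32, high=32, size=(8, 8)).astype(a.dtype)
    a_nd = tvm.nd.array(a_np, ctx)
    tile_nd = tvm.nd.array(np.zeros((2, 2, 4, 4)).astype(tile_host.dtype), ctx)
    func(a_nd, tile_nd)
    # the scratchpad copies amount to: transpose, reshape to (2, 4, 2, 4), transpose (0, 2, 1, 3)
    gt = a_np.transpose((1, 0)).reshape((2, 4, 2, 4)).transpose((0, 2, 1, 3))
    np.testing.assert_allclose(tile_nd.asnumpy(), gt)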
Example No. 2
    ko = tvm.reduce_axis((0, shape1_tiled[0]), 'ko')
    ki = tvm.reduce_axis((0, factor), 'ki')

    res_shape = (shape1[0], shape2[0])  # (8, 64)

    res_acc = tvm.compute(
        res_shape, lambda i, j: tvm.sum(a_buf[ko, i, ki].astype(dtype_w) *
                                        b_buf[ko, j, ki].astype(dtype_w),
                                        axis=[ko, ki]))
    nnpu.utils.MarkScope(res_acc, 'acc')

    res_buf = nnpu.utils.CopyAccToBuf(res_acc, 'res')
    res_host = tvm.compute(res_shape, lambda *i: res_buf(*i), 'res_host')

    s = nnpu.create_schedule(res_host.op)

    # tensorize
    xi, xj = s[res_acc].op.axis
    xjo, xji = s[res_acc].split(xj, factor=gemm_shape[2])
    ko, ki = s[res_acc].op.reduce_axis
    s[res_acc].reorder(xjo, ko, xi, xji, ki)
    s[res_acc].tensorize(
        xi,
        env.intrins.get('GEMM', shape=gemm_shape, mode='inc', scope_out='acc'))

    # split output
    core_extent = 4
    xh, xw = s[res_host].op.axis
    xwo, xwi = s[res_host].split(xw, nparts=core_extent)
    s[res_host].reorder(xwo, xh, xwi)
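For reference, the accumulated GEMM above can be checked with a NumPy einsum over the tiled operands. A sketch, not part of the snippet: a_np_tiled and b_np_tiled are hypothetical host arrays already laid out in the same (ko, row, ki) order as a_buf and b_buf.

    # sketch: ground truth for res_acc[i, j] = sum_{ko, ki} a_buf[ko, i, ki] * b_buf[ko, j, ki]
    gt = np.einsum('kim,kjm->ij',
                   a_np_tiled.astype(dtype_w),
                   b_np_tiled.astype(dtype_w))
    # after running the built function, the device result should satisfy
    # np.testing.assert_allclose(res_nd.asnumpy(), gt), with res_nd a hypothetical output array.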
Example No. 3
    ko = tvm.reduce_axis((0, shape1[1] // factor), 'ko')
    ki = tvm.reduce_axis((0, factor), 'ki')

    out_buf = tvm.compute(
        out_shape_tiled,
        lambda xo, yo, xi, yi: tvm.sum(a_buf[xo, ko, xi, ki].astype(dtype_w) *
                                       b_buf[yo, ko, yi, ki].astype(dtype_w),
                                       axis=[ko, ki]), 'out_buf')
    out_acc = out_buf
    # nnpu.utils.MarkScope(out_acc, 'acc')
    # out_buf = tvm.compute(out_shape_tiled, lambda *i: out_acc(*i), 'out_host')
    # nnpu.utils.MarkScope(out_buf)
    out_host = tvm.compute(out_shape_tiled, lambda *i: out_buf(*i), 'out_host')

    # schedule
    s = nnpu.create_schedule(out_host.op)
    # al = s.cache_read(a_buf, env.get_scope('buffer1'), out_acc)
    # bl = s.cache_read(b_buf, env.get_scope('buffer2'), out_acc)
    al = a_buf
    bl = b_buf

    a_buffer_scope = 'buffer1'
    b_buffer_scope = 'buffer2'

    # set scope
    s[a_buf].set_scope(env.get_scope(a_buffer_scope))
    s[b_buf].set_scope(env.get_scope(b_buffer_scope))
    s[out_buf].set_scope(env.get_scope('buffer3'))

    # pragma read
    s[a_buf].pragma(a_buf.op.axis[0], env.dma_copy_to_buf)
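The snippet stops after the first read pragma. A plausible continuation, sketched after the dma_copy_to_buf / dma_copy_from_buf pragmas used in Example No. 4 (these are not the author's actual next lines):

    # sketch: the remaining copies would normally get matching pragmas
    s[b_buf].pragma(b_buf.op.axis[0], env.dma_copy_to_buf)           # read the second operand
    s[out_host].pragma(out_host.op.axis[0], env.dma_copy_from_buf)   # copy the result back to host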
Example No. 4
    a_buf = tvm.compute(shape, lambda *i: a(*i), 'a_buf')

    exp = tvm.compute(shape, lambda i: tvm.exp(a_buf[i]), 'exp')
    log = tvm.compute(shape, lambda i: tvm.log(a_buf[i]), 'log')
    tanh = tvm.compute(shape, lambda i: tvm.tanh(a_buf[i]), 'tanh')
    sigmoid = tvm.compute(shape, lambda i: tvm.sigmoid(a_buf[i]), 'sigmoid')

    # k = tvm.reduce_axis((0, 16), 'k0')
    # sum = tvm.compute((1, ), lambda i: tvm.sum(sigmoid[k], axis=k), 'sum')
    # nnpu.utils.MarkScope(sum)

    # softmax = tvm.compute(shape, lambda i: sigmoid[i] / sum[0], 'softmax')
    # nnpu.utils.MarkScope(softmax)
    # softmax_host, _ = nnpu.utils.CopyBufToH(softmax, 'softmax')

    s = nnpu.create_schedule([exp.op, log.op, tanh.op, sigmoid.op])
    # cache write
    exp_buf = s.cache_write(exp, env.get_scope('buffer0'))
    log_buf = s.cache_write(log, env.get_scope('buffer0'))
    tanh_buf = s.cache_write(tanh, env.get_scope('buffer0'))
    sigmoid_buf = s.cache_write(sigmoid, env.get_scope('buffer0'))
    # set scope
    s[a_buf].set_scope(env.get_scope('buffer0'))
    # pragma
    s[a_buf].pragma(a_buf.op.axis[0], env.dma_copy_to_buf)
    s[exp].pragma(exp.op.axis[0], env.dma_copy_from_buf)
    s[log].pragma(log.op.axis[0], env.dma_copy_from_buf)
    s[tanh].pragma(tanh.op.axis[0], env.dma_copy_from_buf)
    s[sigmoid].pragma(sigmoid.op.axis[0], env.dma_copy_from_buf)
    # tensorize
    vector_unit_size = 32
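The tensorize step itself is cut off here. The usual pattern, shown for VExp in Example No. 7, is to split each cached stage by the vector width and tensorize the inner axis; the sketch below assumes that same pattern (only the VExp intrinsic name appears elsewhere in these examples, and the right mode depends on the dtypes used here).

    # sketch: split by the vector width and tensorize the inner axis, as in Example No. 7
    xo, xi = s[exp_buf].split(s[exp_buf].op.axis[0], factor=vector_unit_size)
    s[exp_buf].tensorize(xi, env.intrins.get('VExp', mode='w'))
    # log_buf, tanh_buf and sigmoid_buf would follow the same split-and-tensorize
    # pattern with their respective vector intrinsics (names not shown in this snippet).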
Example No. 5
    conv_host = tvm.compute(conv_shape, lambda *i: conv(*i), 'conv_host')
    # ------ this ends the computation description. ------

    #==================================#
    # ------ begin scheduling ------
    #==================================#

    # set the memory scopes of tensors that should be on accelerator.
    # here we put the feature and kernel on buffer1 and buffer2, respectively.
    nnpu.utils.MarkScope(feature_buf, 'buffer1')
    nnpu.utils.MarkScope(kernel_buf, 'buffer2')
    # the GEMM output is on accumulation buffer.
    nnpu.utils.MarkScope(conv_acc, 'acc')
    nnpu.utils.MarkScope(conv, 'buffer0')

    s = nnpu.create_schedule(conv_host.op)

    # reorder the GEMM compute stage.
    # the rule is: first make sure the three axes covered by one GEMM instruction are the
    # innermost iterations, then place the remaining reduction axes just outside them.
    h, wo, oco, wi, oci = s[conv_acc].op.axis
    s[conv_acc].reorder(h, wo, oco, kh_reduce, kw_reduce, co_reduce, wi, oci,
                        ci_reduce)
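    # after this reorder, the innermost three iterations (wi, oci, ci_reduce) are
    # exactly the axes one GEMM instruction covers, matching the rule stated above.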
    # tensorize
    s[conv_acc].tensorize(
        wi,
        env.intrins.get('GEMM',
                        shape=gemm_shape,
                        mode='inc',
                        scope_in1='buffer1',
                        scope_in2='buffer2',
Example No. 6
def test():
    with ScheduleProcHelper():
        env = nnpu.get_env()

        shape = (16, 64)
        a_host = tvm.placeholder(shape, env.cfg['dtype_n'], 'a_host')
        a_buf, _ = nnpu.utils.CopyHtoBuf(a_host, 'a')

        vctr_shape = (64, )
        b_host = tvm.placeholder(vctr_shape, env.cfg['dtype_n'], 'b_host')
        b_buf, _ = nnpu.utils.CopyHtoBuf(b_host, 'b')

        dtype_w = env.cfg['dtype_w']

        out_shape = (4, 16)
        k = tvm.reduce_axis((0, 16), 'k')
        c_buf = tvm.compute(
            out_shape, lambda j, i: tvm.sum(a_buf[i, j * 16 + k].astype(
                dtype_w) * b_buf[j * 16 + k].astype(dtype_w),
                                            axis=k))
        utils.MarkScope(c_buf)
        c_host, _ = utils.CopyBufToH(c_buf, 'c')

        s = nnpu.create_schedule(c_host.op)

        # mark variable scopes

        # tensorize
        s[c_buf].tensorize(
            s[c_buf].op.axis[1],
            env.intrins.get('GEMM', shape=(16, 16, 1), mode='inc',
                            reduce=True))

        # build
        print(tvm.lower(s, [a_host, b_host, c_host], simple_mode=True))

        print(nnpu.lower(s, [a_host, b_host, c_host], simple_mode=True))
        #exit()
        func = nnpu.build(s, [a_host, b_host, c_host],
                          'nnpu',
                          'llvm',
                          name='nnpu_exp')

        print('function built: ')
        print('------------------- device module 1 asm code: ')
        print(func.imported_modules[0].get_source('asm'))
        #print(func.get_source())

        # prepare data
        ctx = tvm.nd.TVMContext(13, 0)

        a_np = np.random.randint(size=shape,
                                 dtype=a_host.dtype,
                                 low=-32,
                                 high=32)
        # a_np = np.ones(shape).astype(a_host.dtype)
        a_nd = tvm.nd.array(a_np, ctx)

        b_np = np.random.randint(size=vctr_shape,
                                 dtype=b_host.dtype,
                                 low=-16,
                                 high=16)
        # b_np = np.ones(vctr_shape).astype(b_host.dtype)
        b_nd = tvm.nd.array(b_np, ctx)

        out_nd = tvm.nd.array(np.zeros(out_shape).astype(c_host.dtype), ctx)

        # run
        func(a_nd, b_nd, out_nd)

        print('run finished')

        print('a=')
        print(a_np)
        print('b=')
        print(b_np)
        print('out=')
        out_np = out_nd.asnumpy()
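        # each of the 4 output rows holds a partial dot product over one 16-element
        # chunk of the reduce axis (index j * 16 + k above), so summing over axis 0
        # yields the full length-16 result that np.dot produces.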
        out_np = np.sum(out_np, axis=0)
        print(out_np)

        print('numpy ground truth is: ')
        gt = np.dot(a_np.astype(dtype_w), b_np.astype(dtype_w))
        #gt = np.greater(np.dot(a_np.astype(dtype_w), b_np.astype(dtype_w)), bias_np)
        print(gt)

        np.testing.assert_allclose(out_np, gt)
Example No. 7
    sum1 = tvm.compute((nRow, ),
                       lambda i: tvm.sum(sigmoid_re[ko, i, ki], axis=[ko, ki]),
                       'sum1')
    nnpu.utils.MarkScope(sum1, 'acc')
    sum1_buf = nnpu.utils.CopyAccToBuf(sum1, 'sum1')
    # sum1_buf = tvm.compute((nRow, ), lambda *i: sum1(*i))

    k = tvm.reduce_axis((0, nRow), 'k')
    sum2 = tvm.compute((1, ), lambda i: tvm.sum(sum1_buf[k], axis=k), 'sum2')
    nnpu.utils.MarkScope(sum2, 'buffer0')

    softmax = tvm.compute(shape, lambda i: sigmoid[i] / sum2[0], 'softmax')
    nnpu.utils.MarkScope(softmax)
    softmax_host, _ = nnpu.utils.CopyBufToH(softmax, 'softmax')

    s = nnpu.create_schedule([softmax_host.op])

    s[sigmoid_re].set_scope(env.scratchpad_scope(0))
    s[sigmoid_re].pragma(sigmoid_re.op.axis[0], env.scratchpad_copy)

    # tensorize
    xo, xi = s[exp].split(exp.op.axis[0], 16)
    s[exp].tensorize(xi, env.intrins.get('VExp', mode='w'))
    xo, xi = s[exp_p1].split(exp_p1.op.axis[0], 16)
    s[exp_p1].tensorize(xi, env.intrins.get('VAddI', mode='w'))
    xo, xi = s[sigmoid].split(sigmoid.op.axis[0], 16)
    s[sigmoid].tensorize(xi, env.intrins.get('VDivV', mode='w'))

    xblock, xcol = sum1.op.reduce_axis
    xrow = sum1.op.axis[0]
    s[sum1].reorder(xblock, xrow, xcol)
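For reference, the sum1 / sum2 / softmax chain above just normalizes the sigmoid values by their grand total. A hedged NumPy one-liner, assuming sigmoid_re is simply a reshaped copy of sigmoid as the name and the scratchpad_copy pragma suggest; sigmoid_np stands for a hypothetical host-side copy of the sigmoid values.

    # sketch: sum2[0] ends up being the total of all sigmoid values, so
    softmax_ref = sigmoid_np / sigmoid_np.sum()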
Example No. 8
    a_buf, a_dram = nnpu.utils.CopyHtoBuf(a, 'a')
    b_buf, b_dram = nnpu.utils.CopyHtoBuf(b, 'b')

    sum_buf = tvm.compute(shape, lambda *i: a_buf(*i) + b_buf(*i), 'sum_buf')
    nnpu.utils.MarkScope(sum_buf)
    sum_host, sum_dram = nnpu.utils.CopyBufToH(sum_buf, 'sum')

    mul_buf = tvm.compute(
        shape,
        lambda *i: a_buf(*i).astype(dtype_w) * b_buf(*i).astype(dtype_w),
        'mul_buf')
    nnpu.utils.MarkScope(mul_buf)
    mul_host, _ = nnpu.utils.CopyBufToH(mul_buf, 'mul')

    s = nnpu.create_schedule([sum_host.op, mul_host.op])
    # tensorize
    xo, xi = s[sum_buf].split(sum_buf.op.axis[0], factor=insn_shape[0])
    yo, yi = s[sum_buf].split(sum_buf.op.axis[1], factor=insn_shape[1])
    s[sum_buf].reorder(xo, yo, xi, yi)
    s[sum_buf].tensorize(xi,
                         env.intrins.get('MAddM', shape=insn_shape, mode='n'))

    # xo, xi = s[sum_buf].mul_buf(mul_buf.op.axis[0], factor=insn_shape[0])
    # yo, yi = s[sum_buf].split(sum_buf.op.axis[1], factor=insn_shape[1])
    # s[sum_buf].reorder(xo, yo, xi, yi)
    s[mul_buf].tile(mul_buf.op.axis[0], mul_buf.op.axis[1], insn_shape[0],
                    insn_shape[1])
    s[mul_buf].tensorize(
        s[mul_buf].leaf_iter_vars[2],
        env.intrins.get('MMulM', shape=insn_shape, mode='inc'))
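A note on the two tiling styles above: tile is just shorthand for two splits plus a reorder, so mul_buf ends up with the same loop nest that was built by hand for sum_buf. A NumPy check for both outputs could look like the sketch below; the *_np / *_nd names are hypothetical host-side arrays.

    # sketch: ground truth for the two element-wise results above
    gt_sum = a_np + b_np                                   # narrow add, as in MAddM mode='n'
    gt_mul = a_np.astype(dtype_w) * b_np.astype(dtype_w)   # widened multiply, as in MMulM mode='inc'
    # np.testing.assert_allclose(sum_nd.asnumpy(), gt_sum)
    # np.testing.assert_allclose(mul_nd.asnumpy(), gt_mul)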
Example No. 9
                        's1')
    nnpu.utils.MarkScope(s_update_1)
    k = tvm.reduce_axis((0, m), 'k1')
    s_update_2 = tvm.compute(h_shape, 
                        lambda t, i: tvm.sum(u_buf[i, k] * h_state[t - 1, k], axis=k),
                        's2')
    nnpu.utils.MarkScope(s_update_2)
    s_update_3 = tvm.compute(h_shape,
                        lambda t, i: s_update_1[t, i] + s_update_2[t, i], 
                        's3')
    nnpu.utils.MarkScope(s_update_3)
    s_update_4 = tvm.compute(h_shape,
                        lambda t, i: s_update_3[t, i] + b_buf[i],
                        's4')
    nnpu.utils.MarkScope(s_update_4)
    s_scan = tvm.scan(h_init_buf, s_update_4, h_state, inputs=[x_buf])
    nnpu.utils.MarkScope(s_scan)

    #res = nnpu.utils.reshape(s_scan, h_shape)
    #res_host, _ = nnpu.utils.CopyBufToH(res, 'sc')
    s = nnpu.create_schedule(s_scan.op)
    # tensorize
    s[s_update_1].tensorize(s_update_1.op.axis[1], 
                            env.intrins.get('GEMM', shape=gemm_shape, mode='inc', reduce=True))
    #s[s_update_2].tensorize(s_update_2.op.axis[1],
    #                        env.intrins.get('GEMM', shape=gemm_shape, mode='w', reduce=True))
    s[s_update_3].tensorize(s_update_3.op.axis[1],
                            env.intrins.get('VAddV', mode='w'))
    #s[s_update_4].tensorize(s_update_4.op.axis[1],
    #                        env.intrins.get('VAddV', mode='w'))
    print(tvm.lower(s, [x, w, u, b, h_init, s_scan], simple_mode=True))
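For reference, the scan above encodes a plain RNN step: the s2/s3/s4 stages spell out most of the recurrence, and s_update_1 (its body is cut off at the top of this snippet) is presumably the W @ x term. The NumPy sketch below rests on that assumption, and rnn_reference is an illustrative name only.

    # sketch: reference for the recurrence h[t] = W @ x[t] + U @ h[t-1] + b
    def rnn_reference(x_np, w_np, u_np, b_np, h0_np):
        h = h0_np
        outs = [h]
        for t in range(x_np.shape[0]):
            h = np.dot(w_np, x_np[t]) + np.dot(u_np, h) + b_np
            outs.append(h)
        return np.stack(outs)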