Example #1
File: tensor_intrin.py Project: bddppq/tvm
        def _instr(index):
            ib = tvm.ir_builder.create()
            if index == 1:
                ib.emit(outs[0].vstore(0, tvm.const(0, 'int32x16')))
                return ib.get()

            a_int8 = ins[0].vload([0], "uint8x4")
            re_int32 = tvm.call_pure_intrin('int32', 'reinterpret', a_int8)
            vec_ai32 = re_int32.astype('int32x16')
            vec_a = tvm.call_pure_intrin('int8x64', 'reinterpret', vec_ai32)
            vec_b = ins[1].vload([0, 0], "int8x64")
            vec_one = tvm.const(1, "int16x32")
            pair_reduction = tvm.call_llvm_intrin('int16x32',
                                                  'llvm.x86.avx512.pmaddubs.w.512',
                                                  tvm.const(0, 'uint32'),
                                                  vec_a, vec_b)
            quad_reduction = tvm.call_llvm_intrin('int32x16',
                                                  'llvm.x86.avx512.pmaddw.d.512',
                                                  tvm.const(0, 'uint32'),
                                                  pair_reduction, vec_one)
            if index == 0:
                ib.emit(outs[0].vstore(0, quad_reduction))
            else:
                ib.emit(outs[0].vstore(0, quad_reduction + outs[0].vload([0], 'int32x16')))
            return ib.get()
Example #2
def my_clip(x, a_min, a_max):
    """Unlike topi's current clip, put min and max into two stages."""
    const_min = tvm.const(a_min, x.dtype)
    const_max = tvm.const(a_max, x.dtype)
    x = tvm.compute(x.shape, lambda *i: tvm.min(x(*i), const_max), name="clipA")
    x = tvm.compute(x.shape, lambda *i: tvm.max(x(*i), const_min), name="clipB")
    return x
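
A minimal usage sketch (the 2-D shape and clip bounds below are assumptions for illustration): because my_clip builds two chained stages, each stage can be scheduled independently.

import tvm

data = tvm.placeholder((16, 16), name="data")   # hypothetical input tensor
clipped = my_clip(data, a_min=0.0, a_max=6.0)   # clipB(clipA(data))
s = tvm.create_schedule(clipped.op)             # clipA and clipB remain separate stages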
Example #3
File: multibox.py Project: bddppq/tvm
def multibox_transform_loc(cls_prob, loc_pred, anchor, clip=True, threshold=0.01,
                           variances=(0.1, 0.1, 0.2, 0.2)):
    """Location transformation for multibox detection

    Parameters
    ----------
    cls_prob : tvm.Tensor
        Class probabilities.

    loc_pred : tvm.Tensor
        Location regression predictions.

    anchor : tvm.Tensor
        Prior anchor boxes.

    clip : boolean
        Whether to clip out-of-boundary boxes.

    threshold : float
        Threshold to be a positive prediction.

    variances : tuple of float
        Variances to be decoded from box regression output.

    Returns
    -------
    ret : tuple of tvm.Tensor
    """
    return hybrid_multibox_transform_loc(cls_prob, loc_pred, anchor,
                                         tvm.const(clip, "bool"),
                                         tvm.const(threshold, "float32"),
                                         tvm.convert(variances))
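
A hedged usage sketch; the batch, class, and anchor counts below are made up, and the shapes follow the usual SSD convention assumed by this operator:

batch, num_classes, num_anchors = 1, 21, 8732   # illustrative sizes
cls_prob = tvm.placeholder((batch, num_classes, num_anchors), name="cls_prob")
loc_pred = tvm.placeholder((batch, num_anchors * 4), name="loc_pred")
anchor = tvm.placeholder((1, num_anchors, 4), name="anchor")
out_loc, valid_count = multibox_transform_loc(cls_prob, loc_pred, anchor)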
Example #4
def stmt_generater(dtype_list, length):
    ib = tvm.ir_builder.create()
    base_dtype = dtype_list[0]
    global_a = tvm.placeholder((length,), name="global_a", dtype=base_dtype)
    assert len(dtype_list) == 4
    with ib.for_range(0, length, name="j") as j:
        dtype = dtype_list[0]
        A = ib.allocate(dtype, length, name="A", scope="local.L0A")
        A[j] = tvm.const(1, dtype=dtype)
    with ib.for_range(0, length, name="j") as j:
        dtype = dtype_list[1]
        B = ib.allocate(dtype, length, name="B", scope="local.L0A")
        B[j] = tvm.const(1, dtype=dtype)
    with ib.for_range(0, length, name="j") as j:
        dtype = dtype_list[2]
        C = ib.allocate(dtype, length, name="C", scope="local.L0A")
        C[j] = tvm.const(1, dtype=dtype)
    with ib.for_range(0, length, name="j") as j:
        dtype = dtype_list[3]
        D = ib.allocate(dtype, length, name="D", scope="local.L0A")
        D[j] = tvm.const(1, dtype=dtype)
    with ib.for_range(0, length, name="j") as j:
        dtype = "int8"
        E = ib.allocate(dtype, length, name="E", scope="local.L0A")
        E[j] = A[j].astype(dtype) + B[j].astype(dtype) + C[j].astype(dtype) + D[j].astype(dtype)
    return ib.get()
Example #5
def test_reuse_small_buffer():
    ib = tvm.ir_builder.create()
    n = tvm.var("n")
    with ib.for_range(0, n, name="i") as i:
        with ib.for_range(0, 10, name="j") as j:
            A = ib.allocate("int16", 200, name="A", scope="local.L0A")
            A[j] = tvm.const(1, "int16")
            B = ib.allocate("int16", 200, name="B", scope="local.L0A")
            B[j] = tvm.const(1, "int16")
            B1 = ib.allocate("int16", 200, name="B1", scope="local.L0A")
            B1[j] = A[j] + B[j]
            C = ib.allocate("int16", 400, name="C", scope="local.L0A")
            C[j] = tvm.const(1, "int16")
            D = ib.allocate("int16", 400, name="D", scope="local.L0A")
            D[j] = tvm.const(1, "int16")
            E = ib.allocate("int16", 400, name="E", scope="local.L0A")
            E[j] = C[j]

    body = ib.get()
    body = tvm.ir_pass.StorageRewrite(body)

    num_alloc = [0]

    def verify(n):
        if isinstance(n, tvm.stmt.Allocate):
            num_alloc[0] += 1
            assert n.extents[0].value == 800
    tvm.ir_pass.PostOrderVisit(body, verify)
    assert num_alloc[0] == 1
Example #6
def test_alloc_seq_type():
    ib = tvm.ir_builder.create()
    n = tvm.var("n")
    with ib.for_range(0, n, name="i") as i:
        with ib.for_range(0, 10, name="j") as j:
            A = ib.allocate("float32", 200, name="A", scope="local.L0A")
            A1 = ib.allocate("float32", 200, name="A1", scope="local.L0A")
            A[j] = 1.2
            A1[j] = 1.3
            B = ib.allocate("int16", 200, name="B", scope="local.L0A")
            B[j] = tvm.const(1, "int16")
            C = ib.allocate("int16", 200, name="C", scope="local.L0A")
            C[j] = tvm.const(1, "int16")
            D = ib.allocate("int16", 200, name="D", scope="local.L0A")
            D[j] = B[j] + C[j]
            A2 = ib.allocate("float32", 200, name="A2", scope="local.L0A")
            A2[j] = A[j]

    body = ib.get()
    body = tvm.ir_pass.StorageRewrite(body)
    num_alloc = [0]
    def verify(n):
        if isinstance(n, tvm.stmt.Allocate):
            num_alloc[0] += 1
            assert n.extents[0].value == 500
    tvm.ir_pass.PostOrderVisit(body, verify)
    assert num_alloc[0] == 1
Example #7
File: roi_align.py Project: bddppq/tvm
    def _sample(i, c, ph, pw):
        roi = rois[i]
        batch_index = roi[0].astype('int32')
        roi_start_w, roi_start_h, roi_end_w, roi_end_h = roi[1], roi[2], roi[3], roi[4]
        roi_start_h *= spatial_scale
        roi_end_h *= spatial_scale
        roi_start_w *= spatial_scale
        roi_end_w *= spatial_scale

        # force malformed ROIs to be 1x1
        roi_h = tvm.max(roi_end_h - roi_start_h, tvm.const(1.0, dtype))
        roi_w = tvm.max(roi_end_w - roi_start_w, tvm.const(1.0, dtype))

        bin_h = roi_h / pooled_size_h
        bin_w = roi_w / pooled_size_w

        if sample_ratio > 0:
            roi_bin_grid_h = roi_bin_grid_w = tvm.const(sample_ratio, 'int32')
        else:
            roi_bin_grid_h = tvm.ceil(roi_h / pooled_size_h).astype('int32')
            roi_bin_grid_w = tvm.ceil(roi_w / pooled_size_w).astype('int32')

        count = roi_bin_grid_h * roi_bin_grid_w
        rh = tvm.reduce_axis((0, roi_bin_grid_h))
        rw = tvm.reduce_axis((0, roi_bin_grid_w))
        roi_start_h += ph * bin_h
        roi_start_w += pw * bin_w
        return tvm.sum(_bilinear(batch_index, c,
                                 roi_start_h + (rh + 0.5) * bin_h / roi_bin_grid_h,
                                 roi_start_w + (rw + 0.5) * bin_w / roi_bin_grid_w) / count,
                       axis=[rh, rw])
Example #8
    def _bitpack(*indices):
        packed_data = [tvm.const(0, pack_type)] * bits
        for k in range(data_width):
            # Translate indices for packed data back to original
            idx = [0] * n
            j = 0
            for i in range(n+1):
                if i == bit_axis:
                    continue
                elif i == pack_axis:
                    idx[j] = indices[i] * data_width + k
                else:
                    idx[j] = indices[i]
                j += 1

            element = data(*idx)
            for b in range(bits):
                extracted_bit = ((element & tvm.const(masks[b], "int32")) >> b).astype(pack_type)
                packed_data[b] = (packed_data[b] | extracted_bit)
                if k < data_width - 1:
                    packed_data[b] = packed_data[b] << 1

            if k == data_width - 1:
                return tuple(packed_data)
        return tuple(packed_data)
Example #9
File: util.py Project: bddppq/tvm
def select_array(i, j):
    now = tvm.const(0.0, dtype)
    for ii in range(row):
        for jj in range(col):
            now = tvm.expr.Select(tvm.all(i % row == ii, j % col == jj),
                                  tvm.const(matrix[ii][jj], dtype),
                                  now)
    return now
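
This builds one nested Select chain per output element, baking the Python-level matrix into the IR as compile-time constants. A sketch of how it is typically consumed (dtype, matrix, row, and col are assumed to be in scope, as in topi's const_matrix helper):

dtype = "float32"                                # assumptions for illustration
matrix = [[1.0, 2.0], [3.0, 4.0]]
row, col = 2, 2
const_mat = tvm.compute((row, col), select_array, name="const_matrix")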
Example #10
def test_const_saveload_json():
    # save load json
    x = tvm.const(1, "int32")
    y = tvm.const(10, "int32")
    z = x + y
    z = z + z
    json_str = tvm.save_json(z)
    zz = tvm.load_json(json_str)
    assert tvm.save_json(zz) == tvm.save_json(z)
Example #11
def test_make_smap():
    # save load json
    x = tvm.const(1, "int32")
    y = tvm.const(10, "int32")
    z = tvm.expr.Add(x, y)
    smap = tvm.convert({"z": z, "x": x})
    json_str = tvm.save_json(tvm.convert([smap]))
    arr = tvm.load_json(json_str)
    assert len(arr) == 1
    assert arr[0]["z"].a == arr[0]["x"]
Example #12
    def _intrin_func(ins, outs):
        ww, xx = ins
        zz = outs[0]
        vpadd = "llvm.arm.neon.vpadd.v8u8"
        vpadalu = "llvm.arm.neon.vpadalu.v16u8.v8u16"
        args_1 = tvm.const(1, 'uint32')
        args_2 = tvm.const(2, 'uint32')

        def _instr(index):
            irb = tvm.ir_builder.create()
            if index == 1:
                irb.emit(zz.vstore(0, tvm.const(0, 'uint16x8')))
                return irb.get()

            cnts8 = [None] * 8
            cnts4 = [None] * 4
            cnts2 = [None] * 2
            for bw in range(w_b):
                for bx in range(x_b):
                    if k_i == 16:
                        for i in range(m):
                            ands = ww.vload([bw, i, 0], 'uint8x16') & xx.vload([bx, 0], 'uint8x16')
                            cnts = tvm.popcount(ands)
                            upper_half = tvm.call_pure_intrin('uint8x8', 'vectorhigh', cnts)
                            lower_half = tvm.call_pure_intrin('uint8x8', 'vectorlow', cnts)
                            cnts8[i] = upper_half + lower_half
                        for i in range(m//2):
                            cnts4[i] = tvm.call_llvm_intrin('uint8x8', vpadd,
                                                            args_1, cnts8[i*2], cnts8[i*2+1])
                        for i in range(m//4):
                            cnts2[i] = tvm.call_llvm_intrin('uint8x8', vpadd,
                                                            args_1, cnts4[i*2], cnts4[i*2+1])
                        cnts = tvm.call_pure_intrin('uint8x16', 'vectorcombine', cnts2[0], cnts2[1])
                        shifted_cnts = cnts << tvm.const(bw+bx, dtype)
                        out = tvm.call_llvm_intrin('uint16x8', vpadalu,
                                                   args_2, zz.vload(0, 'uint16x8'), shifted_cnts)
                    else: # ki == 8
                        for i in range(m):
                            ands = ww.vload([bw, i, 0], 'uint8x8') & xx.vload([bx, 0], 'uint8x8')
                            cnts8[i] = tvm.popcount(ands)
                        for i in range(m//2):
                            cnts4[i] = tvm.call_llvm_intrin('uint8x8', vpadd,
                                                            args_1, cnts8[i*2], cnts8[i*2+1])
                        for i in range(m//4):
                            cnts2[i] = tvm.call_llvm_intrin('uint8x8', vpadd,
                                                            args_1, cnts4[i*2], cnts4[i*2+1])
                        cnts = tvm.call_pure_intrin('uint8x16', 'vectorcombine', cnts2[0], cnts2[1])
                        shifted_cnts = cnts << tvm.const(bw+bx, dtype)
                        out = tvm.call_llvm_intrin('uint16x8', vpadalu,
                                                   args_2, zz.vload(0, 'uint16x8'), shifted_cnts)
                    irb.emit(zz.vstore(0, out))
            return irb.get()
        # body, reset, update
        return _instr(0), _instr(1), _instr(2)
Example #13
def test_bitwise():
    x = tvm.var('x')
    y = tvm.var('y')
    assert str(x << y) == 'shift_left(x, y)'
    assert str(x >> y) == 'shift_right(x, y)'
    assert str(x & y) == 'bitwise_and(x, y)'
    assert str(x | y) == 'bitwise_or(x, y)'
    assert str(x ^ y) == 'bitwise_xor(x, y)'
    assert str(~x) == 'bitwise_not(x)'
    assert (tvm.const(1, "int8x2") >> 1).dtype == "int8x2"
    assert (x >> tvm.const(1, "int32x2")).dtype == "int32x2"
    assert (tvm.var("z", "int8x2") << tvm.const(1, "int8x2")).dtype == "int8x2"
Example #14
File: conv2d.py Project: gwli/tvm
def _decl_im2col(data, kernel, stride, padding, layout='NCHW', out_dtype='float32'):
    """declare the Im2Col method for conv2d"""
    _, CI, IH, IW = [x.value for x in data.shape]
    CO, _, KH, KW = [x.value for x in kernel.shape]
    HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel)

    if isinstance(stride, (tuple, list)):
        HSTR, WSTR = stride
    else:
        HSTR, WSTR = stride, stride

    N = 1
    OH = (IH + 2*HPAD - KH) // HSTR + 1
    OW = (IW + 2*WPAD - KW) // WSTR + 1

    DO_PAD = (HPAD != 0 and WPAD != 0)
    if DO_PAD:
        data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")
    else:
        data_pad = data

    ALIGN = 16
    def upround(x, align):
        return (x + align - 1) // align * align

    # A [CO, CI * KH * KW]
    reduce_len = upround(CI * KH * KW, ALIGN)
    A = tvm.compute((upround(CO, ALIGN), reduce_len), lambda i, j:
                    kernel[i][j // KW // KH][j // KW % KH][j % KW], name='A')

    # B [CI * KH * KW, N * OH * OW]
    B = tvm.compute((reduce_len, upround(N * OH * OW, ALIGN)), lambda i, j:\
            tvm.select(tvm.all(i < CI * KH * KW, j < N * OH * OW),
                       data_pad[j // (OH*OW)][i // (KH*KW)][j // OW % OH*HSTR + i // KW % KH]
                       [j % OW*WSTR + i % KW],
                       tvm.const(0, data_pad.dtype)), name='B')

    gemm_n, gemm_l, gemm_m = A.shape[0], reduce_len, B.shape[1]

    # C [CO, N * OH * OW]
    k = tvm.reduce_axis((0, gemm_l), name='k')
    C = tvm.compute((gemm_n, gemm_m), lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=k), name='C')

    # output
    # the last term C[gemm_n-1, gemm_m-1] is for enabling the alignment,
    # otherwise the alignment above will be eliminated by bound inference
    output = tvm.compute((N, CO, OH, OW), lambda n, co, h, w:\
                 C[co][n * OH * OW + h * OW + w] + tvm.const(0, C.dtype) * C[gemm_n-1, gemm_m-1],
                         name='output', tag='im2col_conv_output')

    return output
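
Illustrative size arithmetic for the declaration above (the concrete sizes are assumptions):

IH = IW = 28; KH = KW = 3; HPAD = WPAD = 1; HSTR = WSTR = 1; CI = 64
OH = (IH + 2 * HPAD - KH) // HSTR + 1            # 28: this padding preserves spatial size
OW = (IW + 2 * WPAD - KW) // WSTR + 1            # 28
reduce_len = (CI * KH * KW + 15) // 16 * 16      # upround(576, 16) == 576, already aligned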
Example #15
def test_const_propagation():
    x1 = tvm.const(4, "int32")
    x2 = x1 + 5
    assert isinstance(x2, tvm.expr.IntImm) and x2.value == 9
    x3 = x2 / 3
    assert isinstance(x3, tvm.expr.IntImm) and x3.value == 3
    x4 = x3 + 0.5
    assert isinstance(x4, tvm.expr.FloatImm) and x4.value == 3.5
    x5 = tvm.ceil(x4)
    assert isinstance(x5, tvm.expr.FloatImm) and x5.value == 4
    x6 = x5.astype('int')
    assert isinstance(x6, tvm.expr.IntImm) and x6.value == 4
    y = (tvm.round((tvm.const(6.5, 'float32') - 1) / 1.5) + 2).astype('int')
    assert isinstance(y, tvm.expr.IntImm) and y.value == 6
Example #16
def compute_clip(attrs, inputs, _):
    """ Clip operator.
    """
    x = inputs[0]
    a_min = attrs.get_float("a_min")
    a_max = attrs.get_float("a_max")
    const_min = tvm.const(a_min, x.dtype)
    const_max = tvm.const(a_max, x.dtype)
    with tvm.tag_scope(topi.tag.ELEMWISE):
        x = tvm.compute(
            x.shape, lambda *i: tvm.min(x(*i), const_max), name="clipA")
        x = tvm.compute(
            x.shape, lambda *i: tvm.max(x(*i), const_min), name="clipB")
    return x
Example #17
    def check_select(ctx, n, dtype):
        A = tvm.placeholder((n,), name='A', dtype=dtype)
        true_value = tvm.const(1, dtype=dtype)
        false_value = tvm.const(3, dtype=dtype)
        max_lhs = tvm.const(2, dtype=dtype)
        max_rhs = tvm.expr.Select(A[0] > 0, true_value, false_value)
        C = tvm.compute((n,), lambda i: tvm.max(max_lhs, max_rhs), name='C')
        s = tvm.create_schedule(C.op)
        s[C].bind(s[C].op.axis[0], tvm.thread_axis("threadIdx.x"))
        fun = tvm.build(s, [A, C], target)

        a = tvm.nd.empty((n,), A.dtype, ctx)
        c = tvm.nd.empty((n,), A.dtype, ctx)
        # Only need to test compiling here
        fun(a, c)
Example #18
def test_scan():
    m = tvm.var("m")
    n = tvm.var("n")
    x = tvm.compute((m, n), lambda i, j: tvm.const(1, "float32"), name="x")
    s_state = tvm.placeholder((m, n))
    s_init = tvm.compute((1, n), lambda _, i: x[0, i], name="s_init")
    x_trans = tvm.compute((m, n), lambda i, j: x[i, j] + 1, name="x_trans")
    s_up1 = tvm.compute((m, n), lambda t, i: s_state[t - 1, i] + 1, name="up1")
    s_update = tvm.compute((m, n), lambda t, i: s_up1[t, i] + x_trans[t, i], name="update")
    s_scan = tvm.scan(s_init, s_update, s_state)

    def test_getbody():
        body = tvm.schedule.ScanGetBody(s_scan.op)
        assert set(body) == set([s_scan.op, s_update.op, s_up1.op])

    def test_attach_path():
        s = tvm.create_schedule(s_scan.op)
        s[x_trans].compute_at(s[s_update], s_update.op.axis[0])
        apath = tvm.schedule.CreateAttachPath(s)
        assert(tuple(apath[s_update.op]) == tuple([s_scan.op.scan_axis]))
        assert(tuple(apath[x_trans.op]) == tuple([s_update.op.axis[0], s_scan.op.scan_axis]))

    def test_fix_pt():
        body = tvm.schedule.ScanGetBody(s_scan.op)
        fxpt = tvm.schedule.ScanFixPointAnalysis(s_scan.op, body)
        assert(fxpt[s_scan.spatial_axis_[0]].value != 0)
Example #19
def test_unroll_loop():
    ib = tvm.ir_builder.create()
    dtype = 'int64'
    n = tvm.var('n')
    Ab = tvm.decl_buffer((n, ), dtype)
    Aptr = ib.buffer_ptr(Ab)
    # i iterates over [n, n + 2)
    with ib.for_range(n, n + 2, name="i") as i:
        with ib.for_range(0, 8, name="i", for_type="unroll") as j:
            Aptr[j + 1] = Aptr[i] + 1

    stmt = ib.get()
    assert isinstance(stmt, tvm.stmt.For)
    ret = tvm.ir_pass.UnrollLoop(stmt, 16, 8, 0, True)
    assert not isinstance(ret, tvm.stmt.For)
    ret = tvm.ir_pass.UnrollLoop(stmt, 15, 8, 0, True)
    assert isinstance(ret, tvm.stmt.For)
    ret = tvm.ir_pass.UnrollLoop(stmt, 16, 8, 0, False)
    assert isinstance(ret, tvm.stmt.For)
    assert ret.for_type == tvm.stmt.For.Unrolled

    ib = tvm.ir_builder.create()
    ib.scope_attr(tvm.const(0, "int32"), "pragma_auto_unroll_max_step", 16)
    ib.emit(stmt)
    wrapped = ib.get()
    wrapped = tvm.make.Block(wrapped, stmt)
    assert isinstance(ret, tvm.stmt.For)
    ret = tvm.ir_pass.UnrollLoop(wrapped, 0, 8, 0, False)
    assert isinstance(ret.first, tvm.stmt.For)
    assert ret.first.for_type == tvm.stmt.For.Unrolled
    assert isinstance(ret.rest, tvm.stmt.For)
    assert ret.rest.for_type != tvm.stmt.For.Unrolled
Example #20
def test_tensor_comm_reducer():
    m = tvm.var('m')
    n = tvm.var('n')
    A = tvm.placeholder((m, n), name='A')
    k = tvm.reduce_axis((0, n), "k")
    mysum = tvm.comm_reducer(lambda x, y: x+y, lambda t: tvm.const(0, dtype=t))
    C = tvm.compute((m,), lambda i: mysum(A[i, k], axis=k))
Example #21
def test_const_param():
    @tvm.hybrid.script
    def add_something(a, b):
        c = output_tensor((11, ), 'int32')
        for i in range(11):
            c[i] = a[i] + b
        return c

    a = tvm.placeholder((11, ), dtype='int32', name='a')
    b = tvm.const(11, 'int32')
    c = add_something(a, b)
    sch = tvm.create_schedule(c.op)
    module = tvm.build(sch, [a, c], 'llvm')
    assert(module)

    np_a = numpy.arange(11).astype('int32')
    np_b = 11
    np_c = numpy.zeros((11, )).astype('int32')

    nd_a = tvm.ndarray.array(np_a)
    nd_c = tvm.ndarray.array(numpy.zeros((11, )).astype('int32'))
    module(nd_a, nd_c)
    ref = add_something(np_a, 11)

    tvm.testing.assert_allclose(nd_c.asnumpy(), ref, 1e-5, 1e-5)
Example #22
def test_parallel_alloc():
    ib = tvm.ir_builder.create()
    n = tvm.var("n")
    with ib.for_range(0, n, name="i", for_type="parallel") as i:
        with ib.for_range(0, 10, name="j") as j:
            A = ib.allocate("float32", n, name="A", scope="global")
            A[j] = A[j] + 2

    body = ib.get()
    body = tvm.ir_pass.StorageRewrite(body)
    assert (isinstance(body.body.body, tvm.stmt.Allocate))

    ib = tvm.ir_builder.create()
    n = tvm.var("n")
    with ib.for_range(0, n, name="t") as i:
        ib.scope_attr(
            tvm.const(1, "int32") , "pragma_scope",
            tvm.make.StringImm("parallel_launch_point"))
        with ib.for_range(0, n, name="i", for_type="parallel") as i:
            with ib.for_range(0, 10, name="j") as j:
                A = ib.allocate("float32", n, name="A", scope="global")
                A[j] = A[j] + 2
    body = ib.get()
    body = tvm.ir_pass.StorageRewrite(body)

    assert(isinstance(body.body.body.body.body, tvm.stmt.Allocate))
Example #23
def test_alloc_seq_type2():
    scope_tb = "local.L0A2"
    max_bits = 1024 * 1024 * 1024

    register_mem(scope_tb, max_bits)

    ib = tvm.ir_builder.create()
    n = tvm.var("n")
    with ib.for_range(0, n, name="i") as i:
        with ib.for_range(0, 10, name="j") as j:
            A = ib.allocate("float32", 200, name="A", scope=scope_tb)
            A[j] = 1.2
        with ib.for_range(0, 20, name="j") as j:
            B = ib.allocate("int16", 400, name="B", scope=scope_tb)
            B[j] = tvm.const(1, "int16")
        with ib.for_range(0, 10, name="j") as j:
            C = ib.allocate("float32", 200, name="C", scope=scope_tb)
            C[j] = 1.2

    body = ib.get()
    body = tvm.ir_pass.StorageRewrite(body)
    num_alloc = [0]
    def verify(n):
        if isinstance(n, tvm.stmt.Allocate):
            num_alloc[0] += 1
            assert n.extents[0].value == 200
    tvm.ir_pass.PostOrderVisit(body, verify)
    assert num_alloc[0] == 1
Example #24
def test_scan_group():
    m = tvm.var("m")
    n = tvm.var("n")
    x = tvm.compute((m, n), lambda i, j: tvm.const(1, "float32"), name="x")
    s_state = tvm.placeholder((m, n))
    s_init = tvm.compute((1, n), lambda _, i: x[0, i])

    s_update1 = tvm.compute((m, n), lambda t, i: s_state[t-1, i] + x[t, i])
    s_update2 = tvm.compute((m, n), lambda t, i: s_update1[t, i] + 1)
    s_update3 = tvm.compute((m, n), lambda t, i: s_update2[t, i] + 1)
    res = tvm.scan(s_init, s_update3, s_state, inputs=x)

    s = tvm.create_schedule(res.op)
    assert s[s_update1].group is not None
    assert s[s_update2].group == s[s_update1].group
    # assigning within the same group is valid
    s[s_update1].compute_at(s[s_update2], s_update2.op.axis[1])
    # create a new group for [s_update1, s_update2]
    g2 = s.create_group(outputs=s_update2, inputs=[s_state, x])
    assert g2.group is not None
    assert g2.group == s[s_update3].group
    assert s[s_update2].group == g2
    assert s[s_update1].group == g2
    g2.compute_at(s[s_update3], s_update3.op.axis[1])
    assert g2.attach_stage == s[s_update3]
    try:
        # compute outside group error.
        s[s_update2].compute_at(s[s_init], s_init.op.axis[0])
        assert False
    except tvm.TVMError:
        pass
Example #25
def test_in_bounds_vectorize_llvm():
    n = 512
    lanes = 2
    A = tvm.placeholder((n,), name='A', dtype="float32x%d" % lanes)
    B = tvm.compute((n,), lambda i: A[i], name='B')
    C = tvm.compute((n,), lambda i: B[i] + tvm.const(1, A.dtype), name='C')
    s = tvm.create_schedule(C.op)
    xo, xi = s[C].split(C.op.axis[0], nparts=2)
    _, xi = s[C].split(xi, factor=2)
    s[C].parallel(xo)
    s[C].vectorize(xi)
    s[B].compute_at(s[C], xo)
    xo, xi = s[B].split(B.op.axis[0], factor=2)
    s[B].vectorize(xi)
    # build and invoke the kernel.
    lowered_func = tvm.lower(s, [A, C], "llvm", simple_mode=False)
    print(lowered_func.body)
    f = tvm.build(s, [A, C], "llvm")
    ctx = tvm.cpu(0)
    # launch the kernel.
    a = tvm.nd.empty((n,), A.dtype).copyfrom(
        np.random.uniform(size=(n, lanes)))
    c = tvm.nd.empty((n,), C.dtype, ctx)
    f(a, c)
    tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + 1)
Example #26
File: ir_pass.py Project: bddppq/tvm
def _check_compact(buf):
    ndim = len(buf.shape)
    size = tvm.const(1, buf.shape[0].dtype)
    for i in reversed(range(ndim)):
        if not util.equal_const_int(size - buf.strides[i], 0):
            raise RuntimeError(
                "Cannot prove compact: shape=%s, strides=%s" % (buf.shape, buf.strides))
        size = size * buf.shape[i]
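
A hedged sketch of what the check accepts: a buffer whose strides describe a dense row-major layout passes, anything else raises (tvm and the module's util helper are assumed to be imported):

buf = tvm.decl_buffer((4, 8), "float32", name="buf", strides=(8, 1))
_check_compact(buf)   # dense row-major: running size grows 1 -> 8 -> 32, matching strides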
Example #27
def test_deduce():
    a = tvm.var('a')
    b = tvm.var('b')
    c = tvm.var('c')
    d = tvm.var('d')

    b_s = tvm.arith.intset_interval(2, 3)
    c_s = tvm.arith.intset_interval(10, 15)
    d_s = tvm.arith.intset_interval(-3, -1)
    zero = tvm.const(0, "int32")

    e0 = (-b)*a+c-d
    res0 = tvm.arith.DeduceBound(a, e0>=0, {b: b_s, c: c_s, d: d_s}, {})
    ans0 = ((d - c) / (b * -1))
    assert str(tvm.ir_pass.Simplify(res0.max())) == str(ans0)

    # expression containing variable a is on rhs
    res0 = tvm.arith.DeduceBound(a, zero <= e0, {b: b_s, c: c_s, d: d_s}, {})
    assert str(tvm.ir_pass.Simplify(res0.max())) == str(ans0)

    e0 = d*a+c-d
    res0 = tvm.arith.DeduceBound(a, e0>=0, {b: b_s, c: c_s, d: d_s}, {})
    ans0 = ((0-c)/d + 1)
    assert str(tvm.ir_pass.Simplify(res0.max())) == str(ans0)

    # expression containing variable a is on rhs
    res0 = tvm.arith.DeduceBound(a, zero <= e0, {b: b_s, c: c_s, d: d_s}, {})
    assert str(tvm.ir_pass.Simplify(res0.max())) == str(ans0)

    e1 = (a*4+b < c)
    res1 = tvm.arith.DeduceBound(a, e1, {b: b_s, c: c_s, d: d_s}, {})
    ans1 = (((c - b) + -1)/4)
    assert str(tvm.ir_pass.Simplify(res1.max())) == str(ans1)

    # expression containing variable a is on rhs
    e1 = (c > a*4+b)
    res1 = tvm.arith.DeduceBound(a, e1, {b: b_s, c: c_s, d: d_s}, {})
    assert str(tvm.ir_pass.Simplify(res1.max())) == str(ans1)

    e2 = (tvm.max(5, a * 4) < 0)
    res2 = tvm.arith.DeduceBound(a, e2, {b: b_s, c: c_s, d: d_s}, {})
    assert str(res2.max()) == "neg_inf"
    assert str(res2.min()) == "pos_inf"

    # expression containing variable a is on rhs
    e2 = (zero < tvm.max(5, a * 4))
    res2 = tvm.arith.DeduceBound(a, e2, {b: b_s, c: c_s, d: d_s}, {})
    assert str(res2.max()) == "neg_inf"
    assert str(res2.min()) == "pos_inf"


    e3 = (-b)+a*c-d
    res3 = tvm.arith.DeduceBound(a, e3>=0, {b: b_s, c: c_s, d: d_s}, {b: b_s, d: d_s})
    ans3 = 2 / c + 1
    assert str(tvm.ir_pass.Simplify(res3.min())) == str(ans3)

    res3 = tvm.arith.DeduceBound(a, zero <= e3, {b: b_s, c: c_s, d: d_s}, {b: b_s, d: d_s})
    assert str(tvm.ir_pass.Simplify(res3.min())) == str(ans3)
Example #28
def check_mod(start, end, divisor, dtype):
    T = tvm.compute((end - start,),
                    lambda i: tvm.expr.Cast(dtype, (start + i)) % tvm.const(divisor, dtype))
    s = tvm.create_schedule([T.op])
    f = tvm.build(s, [T], "llvm")
    a = tvm.nd.empty((end - start,), dtype)
    f(a)
    ref = [int(math.fmod(i, divisor)) for i in range(start, end)]
    tvm.testing.assert_allclose(a.asnumpy(), ref)
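
A hedged usage example (assumes import math and import tvm at module scope, as the helper itself does):

check_mod(-10, 10, 3, "int32")   # signed truncated modulo
check_mod(0, 100, 7, "int32")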
Example #29
def test_llvm_lookup_intrin():
    ib = tvm.ir_builder.create()
    m = tvm.var("m")
    A = ib.pointer("uint8x8", name="A")
    x = tvm.call_llvm_intrin("uint8x8", "llvm.ctpop.i8", tvm.const(1, 'uint32'), A)
    ib.emit(x)
    body = ib.get()
    func = tvm.ir_pass.MakeAPI(body, "ctpop", [A], 1, True)
    fcode = tvm.build(func, None, "llvm")
Example #30
        def _instr(index):
            irb = tvm.ir_builder.create()
            if index == 1:
                irb.emit(zz.vstore(0, tvm.const(0, 'uint16x8')))
                return irb.get()

            cnts8 = [None] * 8
            cnts4 = [None] * 4
            cnts2 = [None] * 2
            for bw in range(w_b):
                for bx in range(x_b):
                    if k_i == 16:
                        for i in range(m):
                            ands = ww.vload([bw, i, 0], 'uint8x16') & xx.vload([bx, 0], 'uint8x16')
                            cnts = tvm.popcount(ands)
                            upper_half = tvm.call_pure_intrin('uint8x8', 'vectorhigh', cnts)
                            lower_half = tvm.call_pure_intrin('uint8x8', 'vectorlow', cnts)
                            cnts8[i] = upper_half + lower_half
                        for i in range(m//2):
                            cnts4[i] = tvm.call_llvm_intrin('uint8x8', vpadd,
                                                            args_1, cnts8[i*2], cnts8[i*2+1])
                        for i in range(m//4):
                            cnts2[i] = tvm.call_llvm_intrin('uint8x8', vpadd,
                                                            args_1, cnts4[i*2], cnts4[i*2+1])
                        cnts = tvm.call_pure_intrin('uint8x16', 'vectorcombine', cnts2[0], cnts2[1])
                        shifted_cnts = cnts << tvm.const(bw+bx, dtype)
                        out = tvm.call_llvm_intrin('uint16x8', vpadalu,
                                                   args_2, zz.vload(0, 'uint16x8'), shifted_cnts)
                    else: # ki == 8
                        for i in range(m):
                            ands = ww.vload([bw, i, 0], 'uint8x8') & xx.vload([bx, 0], 'uint8x8')
                            cnts8[i] = tvm.popcount(ands)
                        for i in range(m//2):
                            cnts4[i] = tvm.call_llvm_intrin('uint8x8', vpadd,
                                                            args_1, cnts8[i*2], cnts8[i*2+1])
                        for i in range(m//4):
                            cnts2[i] = tvm.call_llvm_intrin('uint8x8', vpadd,
                                                            args_1, cnts4[i*2], cnts4[i*2+1])
                        cnts = tvm.call_pure_intrin('uint8x16', 'vectorcombine', cnts2[0], cnts2[1])
                        shifted_cnts = cnts << tvm.const(bw+bx, dtype)
                        out = tvm.call_llvm_intrin('uint16x8', vpadalu,
                                                   args_2, zz.vload(0, 'uint16x8'), shifted_cnts)
                    irb.emit(zz.vstore(0, out))
            return irb.get()
Example #31
def conv2d_transpose_nchw_cuda(cfg, data, kernel, stride, padding, out_dtype):
    """Transposed 2D convolution nchw forward operator.

    Parameters
    ----------
    cfg: ConfigEntity
        The config for this template
    data : tvm.Tensor
        4-D with shape [batch, in_channel, in_height, in_width]
    kernel : tvm.Tensor
        4-D with shape [in_channel, num_filter, filter_height, filter_width]
    stride : tuple of two ints
        The spatial stride along height and width
    padding : int or str
        Padding size, or ['VALID', 'SAME']
    out_dtype: str
        The output type. This is used in mixed precision

    Returns
    -------
    Output : tvm.Tensor
        4-D with shape [batch, out_channel, out_height, out_width]
    """
    batch, inp_channels, inp_height, inp_width = get_const_tuple(data.shape)
    _, out_channels, kernel_height, kernel_width = get_const_tuple(
        kernel.shape)
    stride_height, stride_width = stride
    cfg.stride = stride
    pad_top, pad_left, pad_bottom, pad_right = nn.get_pad_tuple(
        padding, (kernel_height, kernel_width))

    out_width = (inp_width - 1) * stride_width + \
                kernel_width - pad_left - pad_right
    pad_left = kernel_width - 1 - pad_left
    pad_right = kernel_width - 1 - pad_right
    dilated_width = stride_width * (inp_width - 1) + 1

    out_height = (inp_height - 1) * stride_height + \
                 kernel_height - pad_top - pad_bottom
    pad_top = kernel_height - 1 - pad_top
    pad_bottom = kernel_height - 1 - pad_bottom
    dilated_height = stride_height * (inp_height - 1) + 1

    # compute pad
    data = tvm.compute(
        (batch, inp_channels, pad_top + dilated_height + pad_bottom,
         pad_left + dilated_width + pad_right),
        lambda n, c, y, x: tvm.if_then_else(
            tvm.all(x >= pad_left, x < pad_left + dilated_width,
                    tvm.indexmod(x - pad_left, stride_width).equal(0), y >=
                    pad_top, y < pad_top + dilated_height,
                    tvm.indexmod(y - pad_top, stride_height).equal(0)), data[
                        n, c,
                        tvm.indexdiv(y - pad_top, stride_height),
                        tvm.indexdiv(x - pad_left, stride_width)],
            tvm.const(0., "float32")),
        name='data_pad')

    # compute transposed conv
    dc = tvm.reduce_axis((0, inp_channels), name='dc')
    dh = tvm.reduce_axis((0, kernel_height), name='dh')
    dw = tvm.reduce_axis((0, kernel_width), name='dw')
    data_out = tvm.compute(
        (batch, out_channels, out_height, out_width),
        lambda b, c, h, w: tvm.sum(data[b, dc, h + dh, w + dw].astype(
            out_dtype) * kernel[dc, c, kernel_height - 1 - dh, kernel_width - 1
                                - dw].astype(out_dtype),
                                   axis=[dc, dh, dw]),
        tag="conv2d_transpose_nchw")

    return data_out
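
Illustrative arithmetic for the padding rewrite above, with assumed sizes; the transposed convolution becomes a plain convolution over a dilated, re-padded input:

inp_width, stride_width, kernel_width = 7, 2, 4     # assumptions for illustration
pad_left = pad_right = 1
out_width = (inp_width - 1) * stride_width + kernel_width - pad_left - pad_right  # 14
pad_left = kernel_width - 1 - pad_left              # 2
pad_right = kernel_width - 1 - pad_right            # 2
dilated_width = stride_width * (inp_width - 1) + 1  # 13; padded width = 2 + 13 + 2 = 17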
Example #32
def test_tensor_comm_reducer_overload():
    m = tvm.var('m')
    n = tvm.var('n')
    mysum = tvm.comm_reducer(lambda x, y: x + y,
                             lambda t: tvm.const(0, dtype=t))
    sum_res = mysum(m, n)
Example #33
def test_bound():
    m = tvm.var('m')
    vrange = tvm.convert({m: tvm.Range(tvm.const(0), tvm.const(10))})
    ret = tvm.ir_pass.Simplify(m % 10, vrange)
    assert ret == m
Example #34
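This excerpt appears truncated; the definitions below are assumptions added so the remaining declarations are self-contained.

import tvm

batch = 256       # assumed value, not part of the original excerpt
in_channel = 256  # assumed value, not part of the original excerpt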
out_channel = 512
in_size = 14
kernel = 3
pad = 1
stride = 1

# Algorithm
A = tvm.placeholder((in_size, in_size, in_channel, batch), name='A')
W = tvm.placeholder((kernel, kernel, in_channel, out_channel), name='W')
out_size = (in_size - kernel + 2 * pad) // stride + 1
# Pad input
Apad = tvm.compute(
    (in_size + 2 * pad, in_size + 2 * pad, in_channel, batch),
    lambda yy, xx, cc, nn: tvm.select(
        tvm.all(yy >= pad, yy - pad < in_size, xx >= pad, xx - pad < in_size),
        A[yy - pad, xx - pad, cc, nn], tvm.const(0.)),
    name='Apad')
# Create reduction variables
rc = tvm.reduce_axis((0, in_channel), name='rc')
ry = tvm.reduce_axis((0, kernel), name='ry')
rx = tvm.reduce_axis((0, kernel), name='rx')
# Compute the convolution
B = tvm.compute(
    (out_size, out_size, out_channel, batch),
    lambda yy, xx, ff, nn: tvm.sum(Apad[yy * stride + ry, xx * stride + rx, rc,
                                        nn] * W[ry, rx, rc, ff],
                                   axis=[ry, rx, rc]),
    name='B')
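
Plugging the sizes above into the output-size formula confirms that this configuration preserves spatial extent:

assert (in_size - kernel + 2 * pad) // stride + 1 == 14  # (14 - 3 + 2) // 1 + 1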

Example #35
def test_deduce():
    a = tvm.var('a')
    b = tvm.var('b')
    c = tvm.var('c')
    d = tvm.var('d')

    b_s = tvm.arith.IntervalSet(2, 3)
    c_s = tvm.arith.IntervalSet(10, 15)
    d_s = tvm.arith.IntervalSet(-3, -1)
    zero = tvm.const(0, "int32")

    e0 = (-b) * a + c - d
    res0 = tvm.arith.DeduceBound(a, e0 >= 0, {b: b_s, c: c_s, d: d_s}, {})
    ans0 = ((d - c) / (b * -1) + (-1))
    assert_expr_equal(res0.max_value, ans0)

    # expression containing variable a is on rhs
    res0 = tvm.arith.DeduceBound(a, zero <= e0, {b: b_s, c: c_s, d: d_s}, {})
    assert_expr_equal(res0.max_value, ans0)

    e0 = d * a + c - d
    res0 = tvm.arith.DeduceBound(a, e0 >= 0, {b: b_s, c: c_s, d: d_s}, {})
    ans0 = ((d - c) / d - 1)
    assert_expr_equal(res0.max_value, ans0)

    # expression containing variable a is on rhs
    res0 = tvm.arith.DeduceBound(a, zero <= e0, {b: b_s, c: c_s, d: d_s}, {})
    assert_expr_equal(res0.max_value, ans0)

    e1 = (a * 4 + b < c)
    res1 = tvm.arith.DeduceBound(a, e1, {b: b_s, c: c_s, d: d_s}, {})
    ans1 = (((c - b) + -1) / 4 - 1)
    assert_expr_equal(res1.max_value, ans1)

    # expression containing variable a is on rhs
    e1 = (c > a * 4 + b)
    res1 = tvm.arith.DeduceBound(a, e1, {b: b_s, c: c_s, d: d_s}, {})
    assert_expr_equal(res1.max_value, ans1)

    e2 = (tvm.max(5, a * 4) < 0)
    res2 = tvm.arith.DeduceBound(a, e2, {b: b_s, c: c_s, d: d_s}, {})
    assert str(res2.max_value) == "neg_inf"
    assert str(res2.min_value) == "pos_inf"

    # expression containing variable a is on rhs
    e2 = (zero < tvm.max(5, a * 4))
    res2 = tvm.arith.DeduceBound(a, e2, {b: b_s, c: c_s, d: d_s}, {})
    assert str(res2.max_value) == "neg_inf"
    assert str(res2.min_value) == "pos_inf"

    e3 = (-b) + a * c - d
    res3 = tvm.arith.DeduceBound(a, e3 >= 0, {
        b: b_s,
        c: c_s,
        d: d_s
    }, {
        b: b_s,
        d: d_s
    })
    ans3 = 2 / c + 1
    assert str(tvm.ir_pass.Simplify(res3.min_value)) == str(ans3)

    res3 = tvm.arith.DeduceBound(a, zero <= e3, {
        b: b_s,
        c: c_s,
        d: d_s
    }, {
        b: b_s,
        d: d_s
    })
    assert str(tvm.ir_pass.Simplify(res3.min_value)) == str(ans3)
Example #36
File: tensor.py Project: zheng-da/tvm
def _compute(attrs, x, _):
    x = x[0]
    scalar = attrs.get_float("scalar")
    scalar = tvm.const(scalar, x.dtype)
    return tvm.compute(x.shape, lambda *i: f(x(*i), scalar))
Example #37
def f(n):
    rv = tvm.reduce_axis((0, n))
    init = lambda dtype: tvm.expr.Select(n > 1, tvm.const(0, dtype), n.astype(dtype))
    sum = tvm.comm_reducer(lambda x, y: tvm.max(x + y, n.astype('float32')), init, name='sum')
    return sum(X[rv], axis=rv)
Example #38
    def check(start, end, dstart, dend, dtype, floor_div=False):
        div = tvm.floordiv if floor_div else tvm.truncdiv
        mod = tvm.floormod if floor_div else tvm.truncmod

        # A holds dividends, B holds divisors. Note that we add 1 so that end is included in the range.
        A = tvm.placeholder((end - start + 1,), name="A", dtype=dtype)
        B = tvm.placeholder((dend - dstart + 1,), name="B", dtype=dtype)
        # We clip values with min and max so that simplifiers know the ranges of values
        clipa = lambda x: tvm.min(tvm.const(end, dtype), tvm.max(tvm.const(start, dtype), x))
        clipb = lambda x: tvm.min(tvm.const(dend, dtype), tvm.max(tvm.const(dstart, dtype), x))
        # If the range is just a single point, use the constant itself
        if start == end:
            clipa = lambda x: tvm.const(start, dtype)
        if dstart == dend:
            clipb = lambda x: tvm.const(dstart, dtype)
        # D are division results and M are modulo results
        [D, M] = tvm.compute((end - start + 1, dend - dstart + 1),
                             lambda i, j: (div(clipa(A[i]), clipb(B[j])),
                                          mod(clipa(A[i]), clipb(B[j]))))

        s = tvm.create_schedule([D.op, M.op])
        f = tvm.build(s, [A, B, D, M], "llvm")

        # Fill input arrays with values
        A_arr = tvm.nd.empty((end - start + 1,), dtype)
        B_arr = tvm.nd.empty((dend - dstart + 1,), dtype)
        A_arr.copyfrom(np.arange(start, end + 1, dtype=dtype))
        B_np = np.arange(dstart, dend + 1, dtype=dtype)
        # If the range of the divisor contains 0, replace it with 1 to avoid division by zero
        if dend >= 0 and dstart <= 0:
            B_np[-dstart] = 1
        B_arr.copyfrom(B_np)
        D_arr = tvm.nd.empty((end - start + 1, dend - dstart + 1), dtype)
        M_arr = tvm.nd.empty((end - start + 1, dend - dstart + 1), dtype)

        # Run the function and convert the results to numpy
        f(A_arr, B_arr, D_arr, M_arr)
        D_arr = D_arr.asnumpy()
        M_arr = M_arr.asnumpy()

        # This helper just prints additional info on failure
        def _show_info():
            print("dtype: {}".format(dtype))
            print("dividend range: [{}, {}]".format(start, end))
            print("divisor range: [{}, {}]".format(dstart, dend))
            lowered = tvm.lower(s, [A, B, D, M], simple_mode=True)
            print("Lowered code:")
            print(lowered)

        # Check that the computed values are correct
        for i in range(start, end + 1):
            for j in range(dstart, dend + 1):
                if j == 0:
                    continue

                if floor_div:
                    dref = i // j
                    mref = i % j
                else:
                    dref = int(float(i) / j)
                    mref = int(math.fmod(i, j))

                if D_arr[i - start, j - dstart] != dref:
                    _show_info()
                    raise AssertionError("Incorrect division result: {}({}, {}) is {} "
                                         "but should be {}".format(div.__name__, i, j,
                                                                   D_arr[i - start, j - dstart],
                                                                   dref))
                if M_arr[i - start, j - dstart] != mref:
                    _show_info()
                    raise AssertionError("Incorrect modulo result: {}({}, {}) is {} "
                                         "but should be {}".format(mod.__name__, i, j,
                                                                   M_arr[i - start, j - dstart],
                                                                   mref))
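
A hedged driver for the checker above (assumes import math, numpy as np, and tvm, as the helper itself does); small signed ranges exercise both division conventions:

check(-30, 30, -5, 5, "int32", floor_div=False)  # truncated division and modulo
check(-30, 30, -5, 5, "int32", floor_div=True)   # floor division and modulo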
Example #39
def pool(data, kernel, stride, padding, pool_type, ceil_mode=False):
    """Perform pooling on the data

    Parameters
    ----------
    data : tvm.Tensor
        4-D with shape [batch, channel, in_height, in_width]

    kernel : list/tuple of two ints
        Kernel size, [kernel_height, kernel_width]

    stride : list/tuple of two ints
        Stride size, [stride_height, stride_width]

    padding : list/tuple of two ints
        Pad size, [pad_height, pad_width]

    pool_type : str
        Pool type, 'max' or 'avg'

    ceil_mode : bool
        Whether to use ceil when calculating the output size.

    Returns
    -------
    output : tvm.Tensor
        4-D with shape [batch, channel, out_height, out_width]
    """
    assert len(data.shape) == 4, "only support 4-dim pooling"
    assert len(stride) == 2, "only support 2-dim stride"
    kernel_height, kernel_width = kernel
    stride_height, stride_width = stride
    batch, channel, height, width = data.shape

    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
        padding, (kernel_height, kernel_width))

    pad_before = [0, 0, pad_top, pad_left]
    pad_after = [0, 0, pad_down, pad_right]

    if ceil_mode:
        # Additional padding to ensure we do ceil instead of floor when dividing by stride.
        pad_down += stride_height - 1
        pad_right += stride_width - 1

    out_height = util.simplify((height - kernel_height + pad_top + pad_down) //
                               stride_height + 1)
    out_width = util.simplify((width - kernel_width + pad_left + pad_right) //
                              stride_width + 1)

    dheight = tvm.reduce_axis((0, kernel_height))
    dwidth = tvm.reduce_axis((0, kernel_width))

    if pool_type == 'max':
        temp = pad(data, pad_before, pad_after, name="pad_temp", \
            pad_value=tvm.min_value(data.dtype))
        return tvm.compute((batch, channel, out_height, out_width), \
                            lambda n, c, h, w: \
                            tvm.max(temp[n, c, h*stride_height+dheight, w*stride_width+dwidth], \
                                axis=[dheight, dwidth]), \
                            tag="pool_max")
    elif pool_type == 'avg':
        temp = pad(data, pad_before, pad_after, name="pad_temp", \
            pad_value=tvm.const(0.).astype(data.dtype))
        tsum = tvm.compute((batch, channel, out_height, out_width), \
                            lambda n, c, h, w: \
                            tvm.sum(temp[n, c, h*stride_height+dheight, w*stride_width+dwidth], \
                                axis=[dheight, dwidth]), \
                            tag="pool_avg")
        return tvm.compute((batch, channel, out_height, out_width), \
                            lambda n, c, h, w: \
                            tsum[n, c, h, w] / (kernel_height*kernel_width), \
                            tag=tag.ELEMWISE)
    else:
        raise ValueError("Pool type should be 'avg' or 'max'.")
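
A minimal usage sketch; the NCHW shape and pooling parameters below are assumptions for illustration:

data = tvm.placeholder((1, 16, 32, 32), name="data")
out = pool(data, kernel=[2, 2], stride=[2, 2], padding=[0, 0], pool_type="max")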
Example #40
def test_logical_simplify():
    ck = RewriteChecker()
    x, y, z = tvm.var("x"), tvm.var("y"), tvm.var("z")

    ck.verify(tvm.expr.And(tvm.expr.EQ(x, y), tvm.expr.NE(x, y)),
              tvm.const(False, "bool"))
    ck.verify(tvm.expr.And(tvm.expr.NE(x, y), tvm.expr.EQ(x, y)),
              tvm.const(False, "bool"))
    ck.verify(tvm.expr.And(x > 1, tvm.expr.Not(x > 1)),
              tvm.const(False, "bool"))
    ck.verify(tvm.expr.And(x <= y, y < x), tvm.const(False, "bool"))
    ck.verify(tvm.expr.And(y < x, y <= x), tvm.const(False, "bool"))
    ck.verify(tvm.expr.And(x < 1, 0 < x), tvm.const(False, "bool"))
    ck.verify(tvm.expr.And(x < 0, 1 < x), tvm.const(False, "bool"))
    ck.verify(tvm.expr.And(x < 1, 1 <= x), tvm.const(False, "bool"))
    ck.verify(tvm.expr.And(x <= 1, 1 < x), tvm.const(False, "bool"))
    ck.verify(tvm.expr.And(1 <= x, x < 1), tvm.const(False, "bool"))
    ck.verify(tvm.expr.And(1 < x, x <= 1), tvm.const(False, "bool"))
    ck.verify(tvm.expr.And(x <= 1, 2 <= x), tvm.const(False, "bool"))
    ck.verify(tvm.expr.And(2 <= x, x <= 1), tvm.const(False, "bool"))
    ck.verify(tvm.expr.And(x == 1, x != 2), x == 1)

    ck.verify(tvm.expr.Or(tvm.expr.EQ(x, y), tvm.expr.NE(x, y)),
              tvm.const(True, "bool"))
    ck.verify(tvm.expr.Or(tvm.expr.NE(x, y), tvm.expr.EQ(x, y)),
              tvm.const(True, "bool"))
    ck.verify(tvm.expr.Or(x > y, tvm.expr.Not(x < y)), tvm.const(True, "bool"))

    ck.verify(tvm.expr.Or(x <= y, y < x), tvm.const(True, "bool"))
    ck.verify(tvm.expr.Or(y < x, y <= x), tvm.const(True, "bool"))

    ck.verify(tvm.expr.Or(x < 1, 0 < x), tvm.const(True, "bool"))
    ck.verify(tvm.expr.Or(0 < x, x < 1), tvm.const(True, "bool"))

    ck.verify(tvm.expr.Or(x < 1, 1 <= x), tvm.const(True, "bool"))
    ck.verify(tvm.expr.Or(x <= 1, 1 < x), tvm.const(True, "bool"))
    ck.verify(tvm.expr.Or(1 <= x, x < 1), tvm.const(True, "bool"))
    ck.verify(tvm.expr.Or(1 < x, x <= 1), tvm.const(True, "bool"))
    ck.verify(tvm.expr.Or(x <= 1, 2 <= x), tvm.const(True, "bool"))
    ck.verify(tvm.expr.Or(2 <= x, x <= 1), tvm.const(True, "bool"))
    ck.verify(tvm.expr.Or(x != 1, x == 2), x != 1)
Example #41
def test_cmp_simplify():
    ck = RewriteChecker()
    x, y, z = tvm.var("x"), tvm.var("y"), tvm.var("z")
    # const int bound
    ck.verify((x % 2 + 10).equal(0), tvm.const(0, "bool"))
    ck.verify(tvm.expr.NE(x % 2 + 10, 0), tvm.const(1, "bool"))
    ck.verify(x % 2 + 10 > 1, tvm.const(1, "bool"))
    ck.verify(x % 2 + 10 <= 1, tvm.const(0, "bool"))
    ck.verify(x * 3 + 10 == 0, tvm.const(0, "bool"))
    ck.verify(x * 3 + 10 != 0, tvm.const(1, "bool"))

    # canonicalization
    ck.verify((x - 10).equal(0), x.equal(10))
    ck.verify((10 - x).equal(0), x.equal(10))
    ck.verify((x * y).equal(0), tvm.expr.Or(x.equal(0), y.equal(0)))

    # cmp bound
    ck.verify(x + y < x + z, y < z)
    ck.verify(x + y < z + x, y < z)
    ck.verify(y + x < x + z, y < z)
    ck.verify(y + x < z + x, y < z)
    ck.verify(y - x < z - x, y < z)
    ck.verify(x - y < x - z, z < y)

    ck.verify(x < z + x, tvm.expr.LT(0, z))
    ck.verify(x < x + z, tvm.expr.LT(0, z))

    ck.verify(100 < x + 1, tvm.expr.LT(99, x))
    ck.verify(1 < 100 - x, tvm.expr.LT(x, 99))
    ck.verify(x * 3 < y * 3, x < y)
    ck.verify(x * (-3) < y * (-3), y < x)
    ck.verify(x * 3 >= y * 3, y <= x)

    ck.verify(x * 4 >= 2, tvm.expr.LE(1, x))
    ck.verify(x * 2 >= 50, tvm.expr.LE(25, x))
    ck.verify(x / 2 < 3, x < 6)
    ck.verify(x * 4 <= 2, x <= 0)
    ck.verify(3 < x / 2, tvm.expr.LT(7, x))

    ck.verify(x / 4 * 4 < x, tvm.expr.LT(0, x % 4))
    ck.verify(x / 4 * 4 >= x, tvm.expr.LE(x % 4, 0))

    ck.verify(x / 4 * 4 < x + y, tvm.expr.LT(0, x % 4 + y))
    ck.verify(x / 4 * 4 < x - y, tvm.expr.LT(y, x % 4))

    ck.verify((x + 2) / 4 * 4 >= x, tvm.expr.LE((x + 2) % 4, 2))
    ck.verify((x + 2) / 4 * 4 >= x + y, tvm.expr.LE((x + 2) % 4 + y, 2))
    ck.verify((x + 2) / 4 * 4 >= x - y, tvm.expr.LE((x + 2) % 4 + (-2), y))

    ck.verify(tvm.min(x, 11) < 10, x < 10)
    ck.verify(tvm.min(x, 8) < 10, tvm.const(1, "bool"))
    ck.verify(tvm.max(8, x) > 10, tvm.expr.LT(10, x))
    ck.verify(x + 1 < tvm.max(8, x), x < 7)

    ck.analyzer.update(x, tvm.arith.ConstIntBound(0, 10), override=True)
    ck.analyzer.update(y, tvm.arith.ConstIntBound(-10, 0), override=True)
    ck.analyzer.update(z, tvm.arith.ConstIntBound(-5, 5), override=True)

    ck.verify(x < 11, tvm.const(1, "bool"))
    ck.verify(x <= 10, tvm.const(1, "bool"))
    ck.verify(z <= 5, tvm.const(1, "bool"))
    ck.verify(x + y <= 10, tvm.const(1, "bool"))
    ck.verify(x + y >= -10, tvm.const(1, "bool"))
    ck.verify(z - 5 <= y + 10, tvm.const(1, "bool"))
    ck.verify(tvm.all(x > -1, z <= x + 5), tvm.const(1, "bool"))
    ck.verify(x * y <= 0, tvm.const(1, "bool"))
    ck.verify((x + 1) * (y - 1) < 0, tvm.const(1, "bool"))
    ck.verify(y * y >= 0, tvm.const(1, "bool"))
Example #42
def test_util():
    x = tvm.const(100, "int32")
    assert util.get_const_int(x) == 100
    assert util.get_const_tuple((x, x)) == (100, 100)
Example #43
def fidentity(t0, t1):
    return tvm.const(-1, t0), tvm.min_value(t1)
Example #44
def test_deduce():
    a = tvm.var('a')
    b = tvm.var('b')
    c = tvm.var('c')
    d = tvm.var('d')

    b_s = tvm.arith.IntervalSet(2, 3)
    c_s = tvm.arith.IntervalSet(10, 15)
    d_s = tvm.arith.IntervalSet(-3, -1)
    zero = tvm.const(0, "int32")

    e0 = (-b) * a + c - d
    res0 = tvm.arith.DeduceBound(a, e0 >= 0, {b: b_s, c: c_s, d: d_s}, {})
    ans0 = ((d - c) / (b * -1) + (-1))
    assert_expr_equal(res0.max_value, ans0)

    # expression containing variable a is on rhs
    res0 = tvm.arith.DeduceBound(a, zero <= e0, {b: b_s, c: c_s, d: d_s}, {})
    assert_expr_equal(res0.max_value, ans0)

    e0 = d * a + c - d
    res0 = tvm.arith.DeduceBound(a, e0 >= 0, {b: b_s, c: c_s, d: d_s}, {})
    ans0 = ((d - c) / d - 1)
    assert_expr_equal(res0.max_value, ans0)

    # expression containing variable a is on rhs
    res0 = tvm.arith.DeduceBound(a, zero <= e0, {b: b_s, c: c_s, d: d_s}, {})
    assert_expr_equal(res0.max_value, ans0)

    e1 = (a * 4 + b < c)
    res1 = tvm.arith.DeduceBound(a, e1, {b: b_s, c: c_s, d: d_s}, {})
    ans1 = (((c - b) + -1) / 4 - 1)
    assert_expr_equal(res1.max_value, ans1)

    # expression containing variable a is on rhs
    e1 = (c > a * 4 + b)
    res1 = tvm.arith.DeduceBound(a, e1, {b: b_s, c: c_s, d: d_s}, {})
    assert_expr_equal(res1.max_value, ans1)

    e2 = (tvm.max(5, a * 4) < 0)
    res2 = tvm.arith.DeduceBound(a, e2, {b: b_s, c: c_s, d: d_s}, {})
    assert str(res2.max_value) == "neg_inf"
    assert str(res2.min_value) == "pos_inf"

    # expression containing variable a is on rhs
    e2 = (zero < tvm.max(5, a * 4))
    res2 = tvm.arith.DeduceBound(a, e2, {b: b_s, c: c_s, d: d_s}, {})
    assert str(res2.max_value) == "neg_inf"
    assert str(res2.min_value) == "pos_inf"

    e3 = (-b) + a * c - d
    res3 = tvm.arith.DeduceBound(a, e3 >= 0, {
        b: b_s,
        c: c_s,
        d: d_s
    }, {
        b: b_s,
        d: d_s
    })
    ans3 = 2 / c + 1
    assert str(tvm.ir_pass.Simplify(res3.min_value)) == str(ans3)

    res3 = tvm.arith.DeduceBound(a, zero <= e3, {
        b: b_s,
        c: c_s,
        d: d_s
    }, {
        b: b_s,
        d: d_s
    })
    assert str(tvm.ir_pass.Simplify(res3.min_value)) == str(ans3)

    # tests for `EQ` op
    res4 = tvm.arith.DeduceBound(a, a == b, {}, {})
    assert_expr_equal(res4.max_value, b)
    assert_expr_equal(res4.min_value, b)

    # Unsatisfiable `EQ`, variable as one of the Operand
    res5 = tvm.arith.DeduceBound(a, (a == b), {b: b_s}, {b: b_s})
    assert str(res5.max_value) == "neg_inf"
    assert str(res5.min_value) == "pos_inf"

    # variable `a` on the RHS side
    res6 = tvm.arith.DeduceBound(a, 10 == a, {}, {})
    assert_expr_equal(res6.max_value, 10)
    assert_expr_equal(res6.min_value, 10)

    # Add, Sub in `EQ`
    e4 = ((a - c) == (b + d))
    ans4 = (b + d + c)
    res7 = tvm.arith.DeduceBound(a, e4, {b: b_s, c: c_s, d: d_s}, {})
    assert_expr_equal(res7.max_value, ans4)
    assert_expr_equal(res7.min_value, ans4)

    # Satisfiable Mul in `EQ` with negative sign
    res8 = tvm.arith.DeduceBound(a, (5 * a == -10), {}, {})
    assert_expr_equal(res8.max_value, -2)
    assert_expr_equal(res8.min_value, -2)

    # Unsatisfiable Mul in `EQ`
    e5 = (4 * a == b)
    res9 = tvm.arith.DeduceBound(a, e5, {b: b_s}, {})
    assert str(res9.max_value) == "neg_inf"
    assert str(res9.min_value) == "pos_inf"

    # Unsatisfiable Mul in `EQ`
    res10 = tvm.arith.DeduceBound(
        a, (b * a == b), {b: b_s},
        {})  # simplifier is not able to prove that (b % b == 0)
    assert str(res10.max_value) == "neg_inf"
    assert str(res10.min_value) == "pos_inf"
Example #45
    def _intrin_func(ins, outs):
        ww, xx = ins
        zz = outs[0]
        vpadd = "llvm.arm.neon.vpadd.v8u8"
        vpadalu = "llvm.arm.neon.vpadalu.v16u8.v8u16"
        args_1 = tvm.const(1, 'uint32')
        args_2 = tvm.const(2, 'uint32')

        def _instr(index):
            irb = tvm.ir_builder.create()
            if index == 1:
                irb.emit(zz.vstore(0, tvm.const(0, 'uint16x8')))
                return irb.get()

            cnts8 = [None] * 8
            cnts4 = [None] * 4
            cnts2 = [None] * 2
            for bw in range(w_b):
                for bx in range(x_b):
                    if k_i == 16:
                        for i in range(m):
                            ands = ww.vload([bw, i, 0], 'uint8x16') & xx.vload(
                                [bx, 0], 'uint8x16')
                            cnts = tvm.popcount(ands)
                            upper_half = tvm.call_pure_intrin(
                                'uint8x8', 'vectorhigh', cnts)
                            lower_half = tvm.call_pure_intrin(
                                'uint8x8', 'vectorlow', cnts)
                            cnts8[i] = upper_half + lower_half
                        for i in range(m // 2):
                            cnts4[i] = tvm.call_llvm_intrin(
                                'uint8x8', vpadd, args_1, cnts8[i * 2],
                                cnts8[i * 2 + 1])
                        for i in range(m // 4):
                            cnts2[i] = tvm.call_llvm_intrin(
                                'uint8x8', vpadd, args_1, cnts4[i * 2],
                                cnts4[i * 2 + 1])
                        cnts = tvm.call_pure_intrin('uint8x16',
                                                    'vectorcombine', cnts2[0],
                                                    cnts2[1])
                        shifted_cnts = cnts << tvm.const(bw + bx, dtype)
                        out = tvm.call_llvm_intrin('uint16x8', vpadalu, args_2,
                                                   zz.vload(0, 'uint16x8'),
                                                   shifted_cnts)
                    else:  # ki == 8
                        for i in range(m):
                            ands = ww.vload([bw, i, 0], 'uint8x8') & xx.vload(
                                [bx, 0], 'uint8x8')
                            cnts8[i] = tvm.popcount(ands)
                        for i in range(m // 2):
                            cnts4[i] = tvm.call_llvm_intrin(
                                'uint8x8', vpadd, args_1, cnts8[i * 2],
                                cnts8[i * 2 + 1])
                        for i in range(m // 4):
                            cnts2[i] = tvm.call_llvm_intrin(
                                'uint8x8', vpadd, args_1, cnts4[i * 2],
                                cnts4[i * 2 + 1])
                        cnts = tvm.call_pure_intrin('uint8x16',
                                                    'vectorcombine', cnts2[0],
                                                    cnts2[1])
                        shifted_cnts = cnts << tvm.const(bw + bx, dtype)
                        out = tvm.call_llvm_intrin('uint16x8', vpadalu, args_2,
                                                   zz.vload(0, 'uint16x8'),
                                                   shifted_cnts)
                    irb.emit(zz.vstore(0, out))
            return irb.get()

        # body, reset, update
        return _instr(0), _instr(1), _instr(2)
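For orientation, a hedged sketch of the compute pattern such an _intrin_func is usually declared against; the sizes m, k_i, w_b, x_b and the omitted buffer binds are assumptions, chosen here to exercise the k_i == 8 branch:

# Hedged sketch: a bit-serial popcount reduction matching the intrinsic.
# All sizes below are assumptions, not taken from the original source.
m, k_i, w_b, x_b = 8, 8, 1, 1
w = tvm.placeholder((w_b, m, k_i), dtype='uint8', name='w')
x = tvm.placeholder((x_b, k_i), dtype='uint8', name='x')
k = tvm.reduce_axis((0, k_i), name='k')
bw = tvm.reduce_axis((0, w_b), name='bw')
bx = tvm.reduce_axis((0, x_b), name='bx')
z = tvm.compute((m,), lambda i: tvm.sum(
    tvm.popcount(w[bw, i, k] & x[bx, k]).astype('uint16') <<
    (bw + bx).astype('uint16'), axis=[bw, bx, k]), name='z')
intrin = tvm.decl_tensor_intrin(z.op, _intrin_func)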
Example #46
def argmax_init(idx_typ, val_typ):
    return tvm.const(-1, idx_typ), tvm.min_value(val_typ)
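This identity is one half of a commutative reducer; the other half is a combine function. A minimal hedged sketch of how the two are typically paired (the select-based combine below is assumed rather than taken from this source):

def argmax_combine(lhs, rhs):
    # keep the index belonging to the larger value; ties go to lhs
    idx = tvm.expr.Select(lhs[1] >= rhs[1], lhs[0], rhs[0])
    val = tvm.expr.Select(lhs[1] >= rhs[1], lhs[1], rhs[1])
    return idx, val

# Pair combine and identity into a reducer usable inside tvm.compute.
argmax = tvm.comm_reducer(argmax_combine, argmax_init, name='argmax')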
Example #47
def _compute(*indices):
    value = x(*indices)
    calpha = tvm.const(alpha, value.dtype)
    return tvm.select(value > 0, value, value * calpha)
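A hedged usage sketch for context: _compute closes over x and alpha, so materializing the leaky-relu only takes a tvm.compute call. The placeholder shape and alpha value below are assumptions:

x = tvm.placeholder((1, 8, 32, 32), name='x')   # assumed shape
alpha = 0.1                                     # assumed slope
y = tvm.compute(x.shape, _compute, name='leaky_relu')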
Example #48
def test_const():
    x = tvm.const(1, "int32")
    print(x.dtype)
    assert x.dtype == tvm.int32
    assert isinstance(x, tvm.tir.IntImm)
Example #49
def test_cmp_simplify():
    ck = RewriteChecker()
    x, y, z = tvm.var("x"), tvm.var("y"), tvm.var("z")
    flm = tvm.floormod
    fld = tvm.floordiv
    # const int bound
    ck.verify((x % 2 + 10).equal(0), tvm.const(0, "bool"))
    ck.verify(tvm.expr.NE(x % 2 + 10, 0), tvm.const(1, "bool"))
    ck.verify(x % 2 + 10 > 1, tvm.const(1, "bool"))
    ck.verify(x % 2 + 10 <= 1, tvm.const(0, "bool"))
    ck.verify(flm(x, 2) + 2 > 1, tvm.const(1, "bool"))
    ck.verify(flm(x, 2) + 10 <= 1, tvm.const(0, "bool"))

    ck.verify(x * 3 + 10 == 0, tvm.const(0, "bool"))
    ck.verify(x * 3 + 10 != 0, tvm.const(1, "bool"))

    # canonicalization
    ck.verify((x - 10).equal(0), x.equal(10))
    ck.verify((10 - x).equal(0), x.equal(10))
    ck.verify((x * y).equal(0), tvm.expr.Or(x.equal(0), y.equal(0)))

    # cmp bound
    ck.verify(x + y < x + z, y < z)
    ck.verify(x + y < z + x, y < z)
    ck.verify(y + x < x + z, y < z)
    ck.verify(y + x < z + x, y < z)
    ck.verify(y - x < z - x, y < z)
    ck.verify(x - y < x - z, z < y)

    ck.verify(x < z + x, tvm.expr.LT(0, z))
    ck.verify(x < x + z, tvm.expr.LT(0, z))

    ck.verify(100 < x + 1, tvm.expr.LT(99, x))
    ck.verify(1 < 100 - x, tvm.expr.LT(x, 99))
    ck.verify(x * 3 < y * 3, x < y)
    ck.verify(x * (-3) < y * (-3), y < x)
    ck.verify(x * 3 >= y * 3, y <= x)

    ck.verify(x * 4 >= 2, tvm.expr.LE(1, x))
    ck.verify(x * 2 >= 50, tvm.expr.LE(25, x))
    ck.verify(x * 4 <= 2, x <= 0)
    ck.verify((0 - x * 3) <= 0, tvm.expr.LE(0, x))
    ck.verify((0 - x * 3) >= 0, tvm.expr.LE(x, 0))
    ck.verify(2 * x <= 0, x <= 0)

    ck.verify(x * 2 >= 3, tvm.expr.LE(2, x))
    ck.verify(x * 2 >= 2, tvm.expr.LE(1, x))
    ck.verify(x * 2 >= 1, tvm.expr.LE(1, x))
    ck.verify(x * 2 >= 0, tvm.expr.LE(0, x))
    ck.verify(x * 2 >= -1, tvm.expr.LE(0, x))
    ck.verify(x * 2 >= -2, tvm.expr.LE(-1, x))
    ck.verify(x * 2 >= -3, tvm.expr.LE(-1, x))

    ck.verify(x * 2 <= 3, tvm.expr.LE(x, 1))
    ck.verify(x * 2 <= 2, tvm.expr.LE(x, 1))
    ck.verify(x * 2 <= 1, tvm.expr.LE(x, 0))
    ck.verify(x * 2 <= 0, tvm.expr.LE(x, 0))
    ck.verify(x * 2 <= -1, tvm.expr.LE(x, -1))
    ck.verify(x * 2 <= -2, tvm.expr.LE(x, -1))
    ck.verify(x * 2 <= -3, tvm.expr.LE(x, -2))

    ck.verify(x * (-2) >= 3, tvm.expr.LE(x, -2))
    ck.verify(x * (-2) >= 2, tvm.expr.LE(x, -1))
    ck.verify(x * (-2) >= 1, tvm.expr.LE(x, -1))
    ck.verify(x * (-2) >= 0, tvm.expr.LE(x, 0))
    ck.verify(x * (-2) >= -1, tvm.expr.LE(x, 0))
    ck.verify(x * (-2) >= -2, tvm.expr.LE(x, 1))
    ck.verify(x * (-2) >= -3, tvm.expr.LE(x, 1))

    ck.verify(x * (-2) <= 3, tvm.expr.LE(-1, x))
    ck.verify(x * (-2) <= 2, tvm.expr.LE(-1, x))
    ck.verify(x * (-2) <= 1, tvm.expr.LE(0, x))
    ck.verify(x * (-2) <= 0, tvm.expr.LE(0, x))
    ck.verify(x * (-2) <= -1, tvm.expr.LE(1, x))
    ck.verify(x * (-2) <= -2, tvm.expr.LE(1, x))
    ck.verify(x * (-2) <= -3, tvm.expr.LE(2, x))

    # DivMod rules
    # trunc div
    ck.verify(x / 2 < 3, x < 6)
    ck.verify(3 < x / 2, tvm.expr.LT(7, x))
    ck.verify(x / 3 >= 0, tvm.expr.LE(-2, x))
    ck.verify(x / 2 >= 1, tvm.expr.LE(2, x))
    ck.verify(x / 2 >= 0, tvm.expr.LE(-1, x))
    ck.verify(x / 2 >= -1, tvm.expr.LE(-3, x))

    ck.verify(x / 2 <= 1, tvm.expr.LE(x, 3))
    ck.verify(x / 2 <= 0, tvm.expr.LE(x, 1))
    ck.verify(x / 2 <= -1, tvm.expr.LE(x, -2))

    ck.verify(x / 4 * 4 < x, tvm.expr.LT(0, x % 4))
    ck.verify(x / 4 * 4 >= x, tvm.expr.LE(x % 4, 0))

    ck.verify(x / 4 * 4 < x + y, tvm.expr.LT(0, x % 4 + y))
    ck.verify(x / 4 * 4 < x - y, tvm.expr.LT(y, x % 4))

    ck.verify((x + 2) / 4 * 4 >= x, tvm.expr.LE((x + 2) % 4, 2))
    ck.verify((x + 2) / 4 * 4 >= x + y, tvm.expr.LE((x + 2) % 4 + y, 2))
    ck.verify((x + 2) / 4 * 4 >= x - y, tvm.expr.LE((x + 2) % 4 + (-2), y))

    # floor div
    ck.verify(fld(x, 2) < 3, x < 6)
    ck.verify(3 < fld(x, 2), tvm.expr.LT(7, x))
    ck.verify(-3 < fld(x, 2), tvm.expr.LT(-5, x))
    ck.verify(fld(x, 3) >= 0, tvm.expr.LE(0, x))
    ck.verify(fld(x, 2) >= 1, tvm.expr.LE(2, x))
    ck.verify(fld(x, 2) >= 0, tvm.expr.LE(0, x))
    ck.verify(fld(x, 2) >= -1, tvm.expr.LE(-2, x))

    ck.verify(fld(x, 2) <= 1, tvm.expr.LE(x, 3))
    ck.verify(fld(x, 2) <= 0, tvm.expr.LE(x, 1))
    ck.verify(fld(x, 2) <= -1, tvm.expr.LE(x, -1))

    ck.verify(fld(x, 4) * 4 < x, tvm.expr.LT(0, flm(x, 4)))
    ck.verify(fld(x, 4) * 4 >= x, tvm.expr.LE(flm(x, 4), 0))

    ck.verify(fld(x, 4) * 4 < x + y, tvm.expr.LT(0, flm(x, 4) + y))
    ck.verify(fld(x, 4) * 4 < x - y, tvm.expr.LT(y, flm(x, 4)))

    ck.verify(fld(x + 2, 4) * 4 >= x, tvm.expr.LE(flm(x + 2, 4), 2))
    ck.verify(fld(x + 2, 4) * 4 >= x + y, tvm.expr.LE(flm(x + 2, 4) + y, 2))
    ck.verify(fld(x + 2, 4) * 4 >= x - y, tvm.expr.LE(flm(x + 2, 4) + (-2), y))
    # End DivMod Rules

    ck.verify(tvm.min(x, 11) < 10, x < 10)
    ck.verify(tvm.min(x, 8) < 10, tvm.const(1, "bool"))
    ck.verify(tvm.max(8, x) > 10, tvm.expr.LT(10, x))
    ck.verify(x + 1 < tvm.max(8, x), x < 7)

    ck.analyzer.update(x, tvm.arith.ConstIntBound(0, 10), override=True)
    ck.analyzer.update(y, tvm.arith.ConstIntBound(-10, 0), override=True)
    ck.analyzer.update(z, tvm.arith.ConstIntBound(-5, 5), override=True)

    ck.verify(x < 11, tvm.const(1, "bool"))
    ck.verify(x <= 10, tvm.const(1, "bool"))
    ck.verify(z <= 5, tvm.const(1, "bool"))
    ck.verify(x + y <= 10, tvm.const(1, "bool"))
    ck.verify(x + y >= -10, tvm.const(1, "bool"))
    ck.verify(z - 5 <= y + 10, tvm.const(1, "bool"))
    ck.verify(tvm.all(x > -1, z <= x + 5), tvm.const(1, "bool"))
    ck.verify(x*y <= 0, tvm.const(1, "bool"))
    ck.verify((x + 1)*(y - 1) < 0, tvm.const(1, "bool"))
    ck.verify(y*y >= 0, tvm.const(1, "bool"))
    ck.verify(x*6 <= -3, tvm.const(0, "bool"))
    ck.verify((y - 1) % 3 == 0, (y + (-1)) % 3 == 0)
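RewriteChecker is used throughout but never defined in this excerpt. A minimal sketch of the helper as such tests conventionally write it, wrapping the arithmetic analyzer's rewrite simplifier:

# Hedged sketch of the missing helper: simplify with the analyzer's
# rewrite rules, then structurally compare against the expected form.
class RewriteChecker:
    def __init__(self):
        self.analyzer = tvm.arith.Analyzer()

    def verify(self, data, expected):
        res = self.analyzer.rewrite_simplify(data)
        assert tvm.ir_pass.Equal(res, expected), \
            "data={}, res={}, expected={}".format(data, res, expected)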
Example #50
def deformable_conv2d_nchw(data, offset, kernel, strides, padding, dilation,
                           deformable_groups, groups, out_dtype):
    """Deformable conv2D operator in NCHW layout.

    The deformable convolution operation is described in https://arxiv.org/abs/1703.06211

    Parameters
    ----------
    data : tvm.Tensor
        4-D with shape [batch, in_channel, in_height, in_width]

    offset : tvm.Tensor
        4-D with shape [batch, deformable_groups * filter_height * filter_width * 2,
        out_height, out_width].

    kernel : tvm.Tensor
        4-D with shape [num_filter, in_channel, filter_height, filter_width]

    strides : int or a list/tuple of two ints
        stride size, or [stride_height, stride_width]

    padding : int or a list/tuple of two ints
        padding size, or [pad_height, pad_width]

    dilation : int or a list/tuple of two ints
        dilation size, or [dilation_height, dilation_width]

    deformable_groups : int
        number of deformable groups

    groups : int
        number of groups

    out_dtype : str
        The output data type. If None, the input data type is used.

    Returns
    -------
    output : tvm.Tensor
        4-D with shape [batch, out_channel, out_height, out_width]
    """
    if out_dtype is None:
        out_dtype = data.dtype

    if isinstance(strides, int):
        stride_h = stride_w = strides
    else:
        stride_h, stride_w = strides

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    batch, in_channel, in_height, in_width = get_const_tuple(data.shape)
    out_channel, channel, kernel_h, kernel_w = get_const_tuple(kernel.shape)
    _, _, out_height, out_width = get_const_tuple(offset.shape)
    assert in_channel % deformable_groups == 0, \
        "Input channels must be divisible by deformable_groups"
    assert groups == 1, "deformable_conv2d_nchw does not support groups > 1"

    ic_per_dgroup = channel // deformable_groups

    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
    pad_top, pad_left, _, _ = get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w))
    rc = tvm.reduce_axis((0, in_channel), name='rc')
    ry = tvm.reduce_axis((0, kernel_h), name='ry')
    rx = tvm.reduce_axis((0, kernel_w), name='rx')

    zero = tvm.const(0.0, data.dtype)

    def _bilinear(n, c, h, w):
        outside = tvm.any(h < 0, w < 0, h >= in_height, w >= in_width)
        val = bilinear_sample_nchw(data, (n, c, h, w), in_height - 1,
                                   in_width - 1)
        return tvm.if_then_else(outside, zero, val)

    data_deform = \
        tvm.compute((batch, in_channel, kernel_h, kernel_w, out_height, out_width),
                    lambda n, c, kh, kw, y, x:
                    _bilinear(n, c,
                              y * stride_h - pad_top + kh * dilation_h +
                              offset[n, c // ic_per_dgroup * (kernel_w*kernel_h*2) +
                                     (kh * kernel_w + kw) * 2, y, x],
                              x * stride_w - pad_left + kw * dilation_w +
                              offset[n, c // ic_per_dgroup * (kernel_w*kernel_h*2) +
                                     (kh * kernel_w + kw) * 2 + 1, y, x]))
    return tvm.compute(
        (batch, out_channel, out_height, out_width),
        lambda n, f, y, x: tvm.sum(data_deform[n, rc, ry, rx, y, x].astype(
            out_dtype) * kernel[f, rc, ry, rx].astype(out_dtype),
                                   axis=[rc, ry, rx]),
        tag="deformable_conv2d_nchw")
Example #51
File: conv2d.py Project: yyht/neo-ai-tvm
def _decl_winograd(cfg, data, kernel, strides, padding, layout, out_dtype,
                   tile_size):
    N, CI, IH, IW = get_const_tuple(data.shape)
    if len(kernel.shape) == 4:
        pre_computed = False
        CO, _, KH, KW = get_const_tuple(kernel.shape)
    else:
        pre_computed = True
        H_CAT, W_CAT, CO, CI, VC = get_const_tuple(kernel.shape)
        CO *= VC
        KH, KW = H_CAT - tile_size + 1, W_CAT - tile_size + 1
    HSTR, WSTR = strides if isinstance(strides,
                                       (tuple, list)) else (strides, strides)
    HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel)

    assert layout == 'NCHW'
    assert KH == 3 and KW == 3 and HPAD == 1 and WPAD == 1 and HSTR == 1 and WSTR == 1
    data_pad = pad(data, (0, 0, HPAD, WPAD), name="data_pad")

    if tile_size == 4:
        G_data = np.array(
            [[1 / 4.0, 0, 0], [-1 / 6.0, -1 / 6.0, -1 / 6.0],
             [-1 / 6.0, 1 / 6.0, -1 / 6.0], [1 / 24.0, 1 / 12.0, 1 / 6.0],
             [1 / 24.0, -1 / 12.0, 1 / 6.0], [0, 0, 1]], out_dtype)

        B_data = np.array(
            [[4, 0, 0, 0, 0, 0], [0, -4, 4, -2, 2, 4], [-5, -4, -4, -1, -1, 0],
             [0, 1, -1, 2, -2, -5], [1, 1, 1, 1, 1, 0], [0, 0, 0, 0, 0, 1]],
            out_dtype)

        A_data = np.array([[1, 0, 0, 0], [1, 1, 1, 1], [1, -1, 1, -1],
                           [1, 2, 4, 8], [1, -2, 4, -8], [0, 0, 0, 1]],
                          out_dtype)
    elif tile_size == 2:
        G_data = np.array([[1, 0, 0], [1.0 / 2, 1.0 / 2, 1.0 / 2],
                           [1.0 / 2, -1.0 / 2, 1.0 / 2], [0, 0, 1]], out_dtype)

        B_data = np.array(
            [[1, 0, 0, 0], [0, 1, -1, 1], [-1, 1, 1, 0], [0, 0, 0, -1]],
            out_dtype)

        A_data = np.array([[1, 0], [1, 1], [1, -1], [0, -1]], out_dtype)
    else:
        raise ValueError("Unsupported tile size for winograd: " +
                         str(tile_size))

    m = A_data.shape[1]
    r = 3
    alpha = m + r - 1

    H = (IH + 2 * HPAD - 3) // HSTR + 1
    W = (IW + 2 * WPAD - 3) // WSTR + 1
    nH, nW = (H + m - 1) // m, (W + m - 1) // m
    P = N * nH * nW

    ##### space definition begin #####
    tile_bna_candidates = [1, 2, 4, 8, 16]
    factors = get_factors(CO)
    cfg.define_knob('tile_bna',
                    [x for x in tile_bna_candidates if x in factors])
    cfg.define_knob('tile_bnb', [1, 2, 4, 8, 16])
    cfg.define_split('tile_t1', CI, num_outputs=2, max_factor=128)
    cfg.define_split('tile_t2', CO, num_outputs=2, max_factor=128)
    cfg.define_split('c_unroll', CI, num_outputs=2, max_factor=8)
    cfg.define_knob('yt', [1, 2, 4, 8, 16, 32])
    ##### space definition end #####

    if cfg.is_fallback:
        cfg['tile_bnb'].val = 4
        cfg['tile_bna'].val = 4
        while CO % cfg['tile_bna'].val != 0:
            cfg['tile_bna'].val //= 2
        cfg['yt'].val = 8
        cfg.fallback_split('tile_t1', [-1, 128])
        cfg.fallback_split('tile_t2', [-1, 128])
        cfg.fallback_split('c_unroll', [-1, 8])

    bna = cfg['tile_bna'].val
    bnb = cfg['tile_bnb'].val

    P_round = (P + bnb - 1) // bnb * bnb
    assert CO % bna == 0 and P_round % bnb == 0

    # pack input tile
    input_tile = tvm.compute((CI, P_round // bnb, alpha, alpha, bnb), lambda ci, b, eps, nu, bb: \
         tvm.select(b * bnb + bb < P,
                    data_pad[(b*bnb+bb) // (nH*nW)][ci][(b*bnb+bb) // nW % nH * m + eps]
                    [(b*bnb+bb) % nW * m + nu], tvm.const(0, data_pad.dtype)), name='d')

    # transform kernel
    if pre_computed:
        U = kernel
    else:
        G = const_matrix(G_data, 'G')
        r_kh = tvm.reduce_axis((0, KH), 'r_kh')
        r_kw = tvm.reduce_axis((0, KW), 'r_kw')
        U = tvm.compute(
            (alpha, alpha, CO // bna, CI, bna),
            lambda eps, nu, co, ci, vco: tvm.sum(kernel[co * bna + vco][ci][
                r_kh][r_kw] * G[eps][r_kh] * G[nu][r_kw],
                                                 axis=[r_kh, r_kw]),
            name='U')

    # transform image
    B = const_matrix(B_data, 'B')
    r_a = tvm.reduce_axis((0, alpha), 'r_a')
    r_b = tvm.reduce_axis((0, alpha), 'r_b')
    V = tvm.compute((alpha, alpha, P_round // bnb, CI, bnb),
                    lambda eps, nu, p, ci, vp: tvm.sum(input_tile[ci][p][r_a][
                        r_b][vp] * B[r_a][eps] * B[r_b][nu],
                                                       axis=[r_a, r_b]),
                    name='V')

    # batch gemm
    ci = tvm.reduce_axis((0, CI), name='c')
    M = tvm.compute((alpha, alpha, CO, P_round),
                    lambda eps, nu, co, p: tvm.sum(U[eps][nu][co // bna][ci][
                        co % bna] * V[eps][nu][p // bnb][ci][p % bnb],
                                                   axis=ci),
                    name='M')

    A = const_matrix(A_data, 'A')
    r_a = tvm.reduce_axis((0, alpha), 'r_a')
    r_b = tvm.reduce_axis((0, alpha), 'r_b')
    Y = tvm.compute(
        (CO, P, m, m),
        lambda co, p, vh, vw: tvm.sum(
            M[r_a][r_b][co][p] * A[r_a][vh] * A[r_b][vw], axis=[r_a, r_b]),
        name='Y')

    # unpack output
    output = tvm.compute(
        (N, CO, H, W),
        lambda n, co, h, w: Y[co][n * nH * nW +
                                  (h // m) * nW + w // m][h % m][w % m]
        # the following term is used to make the padding effective,
        # otherwise the padding will be eliminated by bound inference
        + tvm.const(0, out_dtype) * M[alpha - 1][alpha - 1][CO - 1][P_round - 1],
        name='output',
        tag='winograd_conv2d_output',
        attrs={
            'workload':
            _winograd_conv_arg_to_workload(data, kernel, strides, padding,
                                           layout, out_dtype, tile_size)
        })

    # we have to manually assign effective GFLOP for winograd
    cfg.add_flop(2 * N * CO * H * W * KH * KW * CI)
    return output
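As a sanity check on the tile arithmetic above (m, alpha, nH, nW, P), a small hedged example with assumed sizes; none of these numbers come from the original code:

# Assumed: batch 1, 56x56 input, 3x3 kernel, stride 1, pad 1, tile_size 4.
m, r = 4, 3
alpha = m + r - 1                  # 6x6 transformed tile
H = W = (56 + 2 * 1 - 3) // 1 + 1  # 56 output pixels per side
nH = nW = (H + m - 1) // m         # 14 tiles per side
P = 1 * nH * nW                    # 196 input tiles for batch 1
assert (alpha, H, nH, P) == (6, 56, 14, 196)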
Example #52
    def _do_fold(stmt):
        def _equal(x, y):
            return tvm.ir_pass.Equal(tvm.ir_pass.Simplify(x - y), 0)

        def _flatten_loop(src_coeff, dst_coeff, extents):
            src_coeff = list(src_coeff)
            dst_coeff = list(dst_coeff)
            extents = list(extents)
            rev_src_coeff = [src_coeff.pop()]
            rev_dst_coeff = [dst_coeff.pop()]
            rev_extents = []
            assert src_coeff
            vsrc = src_coeff.pop()
            vdst = dst_coeff.pop()
            vext = extents.pop()
            while src_coeff:
                next_src = src_coeff.pop()
                next_dst = dst_coeff.pop()
                next_ext = extents.pop()

                if _equal(next_src, vsrc * vext) and _equal(
                        next_dst, vdst * vext):
                    vext = tvm.ir_pass.Simplify(vext * next_ext)
                else:
                    rev_src_coeff.append(vsrc)
                    rev_dst_coeff.append(vdst)
                    rev_extents.append(vext)
                    vsrc = next_src
                    vdst = next_dst
                    vext = next_ext
            rev_src_coeff.append(vsrc)
            rev_dst_coeff.append(vdst)
            rev_extents.append(vext)
            rev_src_coeff.reverse()
            rev_dst_coeff.reverse()
            rev_extents.reverse()

            return rev_src_coeff, rev_dst_coeff, rev_extents

        if _match_pragma(stmt, "alu"):
            # Get to the innermost loop body
            loop_body = stmt.body
            nest_size = 0
            while isinstance(loop_body, tvm.stmt.For):
                loop_body = loop_body.body
                nest_size += 1
            # Get the src/dst arguments
            dst_var = loop_body.buffer_var
            dst_idx = loop_body.index
            # Derive loop variables and extents
            tmp_body = stmt.body
            indices = []
            extents = []
            for _ in range(nest_size):
                indices.append(tmp_body.loop_var)
                extents.append(tmp_body.extent)
                tmp_body = tmp_body.body
            # Derive opcode
            if isinstance(loop_body.value, tvm.expr.Add):
                alu_opcode = env.dev.ALU_OPCODE_ADD
                lhs = loop_body.value.a
                rhs = loop_body.value.b
            elif isinstance(loop_body.value, tvm.expr.Sub):
                alu_opcode = env.dev.ALU_OPCODE_SUB
                lhs = loop_body.value.a
                rhs = loop_body.value.b
            elif isinstance(loop_body.value, tvm.expr.Mul):
                alu_opcode = env.dev.ALU_OPCODE_MUL
                lhs = loop_body.value.a
                rhs = loop_body.value.b
            elif isinstance(loop_body.value, tvm.expr.Min):
                alu_opcode = env.dev.ALU_OPCODE_MIN
                lhs = loop_body.value.a
                rhs = loop_body.value.b
            elif isinstance(loop_body.value, tvm.expr.Max):
                alu_opcode = env.dev.ALU_OPCODE_MAX
                lhs = loop_body.value.a
                rhs = loop_body.value.b
            elif isinstance(loop_body.value, tvm.expr.Call):
                if loop_body.value.name == 'shift_left':
                    # a left shift is encoded as a right shift by a
                    # negative amount
                    alu_opcode = env.dev.ALU_OPCODE_SHR
                    lhs = loop_body.value.args[0]
                    rhs = tvm.ir_pass.Simplify(-loop_body.value.args[1])
                elif loop_body.value.name == 'shift_right':
                    alu_opcode = env.dev.ALU_OPCODE_SHR
                    lhs = loop_body.value.args[0]
                    rhs = loop_body.value.args[1]
                else:
                    raise RuntimeError("Function call not recognized %s" %
                                       (loop_body.value.name))
            elif isinstance(loop_body.value, tvm.expr.Load):
                alu_opcode = env.dev.ALU_OPCODE_SHR
                lhs = loop_body.value
                rhs = tvm.const(0, "int32")
            else:
                raise RuntimeError(
                    "Expression not recognized %s, %s, %s" %
                    (type(loop_body.value), str(loop_body.value), str(stmt)))

            # Derive array index coefficients
            dst_coeff = tvm.arith.DetectLinearEquation(dst_idx, indices)
            # Check if lhs/rhs is immediate
            use_imm = False
            imm_val = None
            if isinstance(rhs, tvm.expr.IntImm):
                assert lhs.buffer_var.same_as(dst_var)
                src_coeff = tvm.arith.DetectLinearEquation(lhs.index, indices)
                use_imm = True
                imm_val = rhs
            if isinstance(lhs, tvm.expr.IntImm):
                assert rhs.buffer_var.same_as(dst_var)
                src_coeff = tvm.arith.DetectLinearEquation(rhs.index, indices)
                use_imm = True
                imm_val = lhs
            if imm_val is None:
                imm_val = 0
                assert lhs.buffer_var.same_as(
                    dst_var) and rhs.buffer_var.same_as(dst_var)
                src_lhs_coeff = tvm.arith.DetectLinearEquation(
                    lhs.index, indices)
                src_rhs_coeff = tvm.arith.DetectLinearEquation(
                    rhs.index, indices)
                # Determine which side has the same coefficients
                lhs_equal = True
                rhs_equal = True
                for i, coef in enumerate(dst_coeff):
                    if not tvm.ir_pass.Equal(coef, src_lhs_coeff[i]):
                        lhs_equal = False
                    if not tvm.ir_pass.Equal(coef, src_rhs_coeff[i]):
                        rhs_equal = False
                # Make sure at least one of the source is identical to the
                # destination (in-place computation)
                assert lhs_equal or rhs_equal
                # Assign the source coefficients
                if lhs_equal:
                    src_coeff = src_rhs_coeff
                else:
                    src_coeff = src_lhs_coeff

            # Ensure that we have the proper tensor dimensions in the
            # innermost loop (pattern match)
            src_coeff = list(src_coeff)
            dst_coeff = list(dst_coeff)
            extents = list(extents)
            assert len(src_coeff) > 1
            assert len(dst_coeff) > 1
            assert len(extents) != 0
            assert tvm.ir_pass.Equal(
                tvm.ir_pass.Simplify(src_coeff[-1] %
                                     (env.BATCH * env.BLOCK_OUT)), 0)
            assert tvm.ir_pass.Equal(
                tvm.ir_pass.Simplify(dst_coeff[-1] %
                                     (env.BATCH * env.BLOCK_OUT)), 0)
            assert tvm.ir_pass.Equal(src_coeff[-2], 1)
            assert tvm.ir_pass.Equal(dst_coeff[-2], 1)
            if env.BATCH > 1:
                assert len(src_coeff) > 2
                assert len(dst_coeff) > 2
                assert len(extents) > 1
                assert tvm.ir_pass.Equal(src_coeff[-3], env.BLOCK_OUT)
                assert tvm.ir_pass.Equal(dst_coeff[-3], env.BLOCK_OUT)

            # Apply tensorization of the loop coefficients
            src_offset = src_coeff[-1]
            dst_offset = dst_coeff[-1]
            if env.BATCH == 1:
                src_coeff = src_coeff[:-2]
                dst_coeff = dst_coeff[:-2]
                extents = extents[:-1]
            else:
                src_coeff = src_coeff[:-3]
                dst_coeff = dst_coeff[:-3]
                extents = extents[:-2]
            src_coeff.append(src_offset)
            dst_coeff.append(dst_offset)
            src_coeff = [
                tvm.ir_pass.Simplify(c // (env.BATCH * env.BLOCK_OUT))
                for c in src_coeff
            ]
            dst_coeff = [
                tvm.ir_pass.Simplify(c // (env.BATCH * env.BLOCK_OUT))
                for c in dst_coeff
            ]

            # Flatten the outer loops
            if extents:
                src_coeff, dst_coeff, extents = _flatten_loop(
                    src_coeff, dst_coeff, extents)

            # Insert ALU micro-ops
            irb = tvm.ir_builder.create()
            for idx, extent in enumerate(extents):
                irb.emit(
                    tvm.call_extern("int32", "VTAUopLoopBegin", extent,
                                    dst_coeff[idx], src_coeff[idx], 0))
            use_imm = int(use_imm)
            irb.emit(
                tvm.call_extern("int32", "VTAUopPush", 1, 0,
                                dst_coeff[len(dst_coeff) - 1],
                                src_coeff[len(src_coeff) - 1], 0, alu_opcode,
                                use_imm, imm_val))
            for _ in extents:
                irb.emit(tvm.call_extern("int32", "VTAUopLoopEnd"))
            return irb.get()
        return stmt
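The fusion rule inside _flatten_loop collapses two adjacent loops whenever the outer coefficient equals the inner coefficient times the inner extent. A minimal sketch of that rule over plain integers; the helper name and values are illustrative, and the offset bookkeeping of the real pass is omitted:

# Hedged sketch: fuse nested loops with contiguous coefficients.
def flatten(coeffs, extents):
    out_c, out_e = [coeffs[-1]], [extents[-1]]
    for c, e in zip(reversed(coeffs[:-1]), reversed(extents[:-1])):
        if c == out_c[0] * out_e[0]:
            out_e[0] *= e           # contiguous: absorb into inner loop
        else:
            out_c.insert(0, c)      # non-contiguous: keep as outer loop
            out_e.insert(0, e)
    return out_c, out_e

# (extent 4, coeff 16) over (extent 16, coeff 1) -> one loop of extent 64
assert flatten([16, 1], [4, 16]) == ([1], [64])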
Example #53
def test_ir():
    x = tvm.const(1, "int32")
    y = tvm.tir.IntImm('int32', 1)
    z = x + y
    stmt = tvm.tir.Evaluate(z)
    assert isinstance(stmt, tvm.tir.Evaluate)
Example #54
def argmin_identity(t0, t1):
    return tvm.const(-1, t0), tvm.max_value(t1)
Example #55
import nnpu
import tvm
import topi
from nnpu.utils import ScheduleProcHelper
import numpy as np

with ScheduleProcHelper():
    env = nnpu.get_env()
    nnpu.set_device(env, type='S0')
    dtype_n, dtype_w = env.cfg['dtype_n'], env.cfg['dtype_w']

    a = tvm.placeholder((2, 4, 16), dtype_n, 'a')
    a_buf, a_dram = nnpu.utils.CopyHtoBuf(a, 'a')

    pad_buf = tvm.compute((2, 6, 16), lambda i, j, k: tvm.expr.Select(
        j >= 2, a_buf[i, j - 2, k], tvm.const(0, dtype_n)), 'pad')
    nnpu.utils.MarkScope(pad_buf)
    nnpu.utils.PragmaCopy(pad_buf)
    tile_host, _ = nnpu.utils.CopyBufToH(pad_buf, 'tile')

    s = nnpu.create_schedule([tile_host.op])

    print(tvm.lower(s, [a, tile_host], simple_mode=True))
    print(nnpu.lower(s, [a, tile_host], simple_mode=True))
    # exit(0)
    func = nnpu.build(s, [a, tile_host], 'nnpu', 'llvm', name='nnpu_func')

    ctx = tvm.nd.TVMContext(13, 0)
    a_np = np.random.randint(size=(2, 4, 16),
                             dtype=a.dtype,
                             low=-128,
                             # the excerpt is truncated here in the source;
                             # high=128 is an assumed completion for int8 data
                             high=128)
Example #56
    def _do_fold(op):
        if _match_pragma(op, "conv2d_transpose_gemm"):
            is_init = ".init" in str(op)
            tvm.ir_pass.PostOrderVisit(op, _find_basics)

            if is_init:
                # create inner most block
                irb = tvm.ir_builder.create()
                dev = env.dev
                irb.scope_attr(dev.vta_axis, "coproc_scope",
                               dev.get_task_qid(dev.QID_COMPUTE))
                irb.scope_attr(dev.vta_axis, "coproc_uop_scope",
                               dev.vta_push_uop)
                irb.emit(
                    tvm.call_extern("int32", "VTAUopPush", 0, 1,
                                    dout.access_ptr("rw", "int32"), 0, 0, 0, 0,
                                    0))
                inner = irb.get()
                args = op.body.body.args
                res_tensor = op.body.body.func.output(0)
                tpl = (args[0], 1, args[1], 1, args[2], 1, args[3], 1, 0, 1, 0,
                       env.BLOCK_OUT)
                inner = tvm.make.AttrStmt(
                    [dout, res_tensor], 'buffer_bind_scope',
                    tvm.call_intrin('handle', 'tvm_tuple', *tpl), inner)
                return inner
            else:
                conv_call, data_call, kernel_call = calls[-3:]
                pad_data_tensor = data_call.func.output(0)
                kernel_tensor = kernel_call.func.output(0)
                res_tensor = conv_call.func.output(0)

                if selects:
                    condition = selects[0].condition
                else:
                    condition = tvm.const(1, 'int32')

                # create inner most block
                irb = tvm.ir_builder.create()
                with irb.if_scope(condition):
                    dev = env.dev
                    irb.scope_attr(dev.vta_axis, "coproc_scope",
                                   dev.get_task_qid(dev.QID_COMPUTE))
                    irb.scope_attr(dev.vta_axis, "coproc_uop_scope",
                                   dev.vta_push_uop)
                    irb.emit(
                        tvm.call_extern("int32", "VTAUopPush", 0, 0,
                                        dout.access_ptr("rw", "int32"),
                                        dinp.access_ptr("r", "int32"),
                                        dwgt.access_ptr("r", "int32"), 0, 0,
                                        0))
                inner = irb.get()

                args = conv_call.args
                tpl = (args[0], 1, args[1], 1, args[2], 1, args[3], 1, 0, 1, 0,
                       env.BLOCK_OUT)
                inner = tvm.make.AttrStmt(
                    [dout, res_tensor], 'buffer_bind_scope',
                    tvm.call_intrin('handle', 'tvm_tuple', *tpl), inner)
                args = kernel_call.args
                tpl = (args[0], 1, args[1], 1, args[2], 1, args[3], 1, 0,
                       env.BLOCK_OUT, 0, env.BLOCK_IN)
                inner = tvm.make.AttrStmt(
                    [dwgt, kernel_tensor], 'buffer_bind_scope',
                    tvm.call_intrin('handle', 'tvm_tuple', *tpl), inner)
                args = data_call.args
                tpl = (args[0], 1, args[1], 1, args[2], 1, args[3], 1, 0, 1, 0,
                       env.BLOCK_IN)
                inner = tvm.make.AttrStmt(
                    [dinp, pad_data_tensor], 'buffer_bind_scope',
                    tvm.call_intrin('handle', 'tvm_tuple', *tpl), inner)
                return inner
        return None
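Both pragma-driven folds above call a _match_pragma helper that this excerpt never defines. A minimal sketch of the conventional shape of such a helper (an assumption, not the verbatim source):

# Hedged sketch: match an AttrStmt against a pragma key.
def _match_pragma(stmt, key):
    return ((stmt.attr_key == "pragma_" + key) or
            (stmt.attr_key == "pragma_scope" and stmt.value.value == key))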
Example #57
def test_make():
    x = tvm.const(1, "int32")
    y = tvm.var("x")
    z = x + y
    assert isinstance(tvm.max(x, y), tvm.tir.Max)
    assert isinstance(tvm.min(x, y), tvm.tir.Min)
Example #58
File: math.py Project: chisuhua/graph
def _compute(*indices):
    value = x(*indices)
    const_min = tvm.const(a_min, value.dtype)
    const_max = tvm.const(a_max, value.dtype)
    return tvm.max(tvm.min(value, const_max), const_min)
Example #59
import tvm

# The shape parameters are not part of this excerpt; the values below
# are assumptions in the usual TensorCore-style packed NHWC layout.
batch_size = 256
height = width = 14
in_channels = 256
out_channels = 512
kernel_h = kernel_w = 3
pad_h = pad_w = 1
stride_h = stride_w = 1
block_size = 16
out_h = (height - kernel_h + 2 * pad_h) // stride_h + 1
out_w = (width - kernel_w + 2 * pad_w) // stride_w + 1
data_shape = (batch_size // block_size, height, width,
              in_channels // block_size, block_size, block_size)
kernel_shape = (kernel_h, kernel_w, in_channels // block_size,
                out_channels // block_size, block_size, block_size)
output_shape = (batch_size // block_size, out_h, out_w,
                out_channels // block_size, block_size, block_size)

# Reduction axes
kh = tvm.reduce_axis((0, kernel_h), name='kh')
kw = tvm.reduce_axis((0, kernel_w), name='kw')
ic = tvm.reduce_axis((0, in_channels // block_size), name='ic')
ii = tvm.reduce_axis((0, block_size), name='ii')

# Algorithm
A = tvm.placeholder(data_shape, name='A', dtype="float16")
W = tvm.placeholder(kernel_shape, name='W', dtype="float16")
Apad = tvm.compute(
    (batch_size // block_size, height + 2 * pad_h, width + 2 * pad_w,
     in_channels // block_size, block_size, block_size),
    lambda n, h, w, i, nn, ii: tvm.if_then_else(
        tvm.all(h >= pad_h, h - pad_h < height, w >= pad_w, w - pad_w < width),
        A[n, h - pad_h, w - pad_w, i, nn, ii], tvm.const(0., "float16")),
    name='Apad')
Conv = tvm.compute(
    output_shape,
    lambda n, h, w, o, nn, oo: tvm.sum(Apad[
        n, h * stride_h + kh, w * stride_w + kw, ic, nn, ii].astype(
            "float32") * W[kh, kw, ic, o, ii, oo].astype("float32"),
                                       axis=[ic, kh, kw, ii]),
    name="Conv")

s = tvm.create_schedule(Conv.op)
s[Apad].compute_inline()

Example #60
import tvm

# batch and in_channel are not defined in this excerpt; the values
# below are assumptions consistent with the rest of the snippet.
batch = 256
in_channel = 256
out_channel = 512
in_size = 14
kernel = 3
pad = 1
stride = 1

# Algorithm
A = tvm.placeholder((in_size, in_size, in_channel, batch), name='A')
W = tvm.placeholder((kernel, kernel, in_channel, out_channel), name='W')
out_size = (in_size - kernel + 2 * pad) // stride + 1
# Pad input
Apad = tvm.compute(
    (in_size + 2 * pad, in_size + 2 * pad, in_channel, batch),
    lambda yy, xx, cc, nn: tvm.if_then_else(
        tvm.all(yy >= pad, yy - pad < in_size, xx >= pad, xx - pad < in_size),
        A[yy - pad, xx - pad, cc, nn], tvm.const(0., "float32")),
    name='Apad')
# Create reduction variables
rc = tvm.reduce_axis((0, in_channel), name='rc')
ry = tvm.reduce_axis((0, kernel), name='ry')
rx = tvm.reduce_axis((0, kernel), name='rx')
# Compute the convolution
B = tvm.compute(
    (out_size, out_size, out_channel, batch),
    lambda yy, xx, ff, nn: tvm.sum(Apad[yy * stride + ry, xx * stride + rx, rc,
                                        nn] * W[ry, rx, rc, ff],
                                   axis=[ry, rx, rc]),
    name='B')

# Designate the memory hierarchy
s = tvm.create_schedule(B.op)
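The excerpt ends just as the memory hierarchy is being designated. A hedged sketch of the usual next step on a GPU schedule, staging the operands through shared and local scopes; this continuation is an assumption, not part of the original snippet:

# Hedged continuation: cache stages for data reuse on the GPU.
AA = s.cache_read(Apad, 'shared', [B])   # padded input in shared memory
WW = s.cache_read(W, 'shared', [B])      # weights in shared memory
AL = s.cache_read(AA, 'local', [B])      # per-thread register tiles
WL = s.cache_read(WW, 'local', [B])
BL = s.cache_write(B, 'local')           # accumulate output in registers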