Example no. 1
def test_buffer_index_merge_mult_mod():
    m = tvm.var('m')
    n = tvm.var('n')
    s = tvm.var('s')
    k0 = tvm.var('k0')
    k1 = tvm.var('k1')
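    # A is a compact row-major buffer; A_stride has the same logical shape but
    # carries an explicit symbolic stride s on its outer axis.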
    A = tvm.decl_buffer((m, n), tvm.float32)
    A_stride = tvm.decl_buffer((m, n), tvm.float32, strides=(s, 1))
    def assert_simplified_equal(index_simplified, index_direct):
        assert tvm.ir_pass.Equal(index_simplified, index_direct), \
            "index_simplified=%s, index_direct=%s" % (index_simplified, index_direct)
    # Test Case1
    index_simplified = A_stride.vload(((k0 % k1) / s, (k0 % k1) % s + (k0 / k1) * k1))
    index_direct = A_stride.vload((0, k0))
    assert_simplified_equal(index_simplified, index_direct)
    # Test Case2
    index_simplified = A.vload(((k0 % (k1 / s)) / n,
                                (k0 % (k1 / s)) % n + (k0 % k1)))
    index_direct = A.vload((0, k0 % k1 + k0 % (k1 / s)))
    assert_simplified_equal(index_simplified, index_direct)
    # Test Case3
    index_simplified = A.vload((((k0 / (k1 / s)) * (k1 / s)) / n + (k0 % (k1 / s)) / n,
                                ((k0 / (k1 / s)) * (k1 / s)) % n + (k0 % (k1 / s)) % n))
    index_direct = A.vload((0, k0))
    assert_simplified_equal(index_simplified, index_direct)
    # Test Case4 (not able to simplify)
    index_simplified = A.vload(((k0 % (k1 / s)) / n,
                                (k0 % (k1 / n)) % n + (k0 % k1)))
    index_direct = A.vload((0, ((k0 % (k1 / s)) / n) * n + ((k0 % (k1 / n)) % n + (k0 % k1))))
    assert_simplified_equal(index_simplified, index_direct)
Example no. 2
def test_lrn():
    n, c, h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
    x = relay.var("x", shape=(n, c, h, w))
    y = relay.nn.lrn(x, size=10, axis=2, bias=0.5, alpha=.00001, beta=0.75)
    assert "alpha=" in y.astext()
    yy = relay.ir_pass.infer_type(y)
    assert yy.checked_type == relay.TensorType((n, c, h, w))

    shape = (1, 5, 10, 10)
    dtype = "float32"
    x = relay.var("x", relay.TensorType(shape, dtype))
    size=5
    axis=1
    bias=0.5
    alpha=.00001
    beta=0.75
    z = relay.nn.lrn(x, size=size, axis=axis, bias=bias, alpha=alpha, beta=beta)
    yy = relay.ir_pass.infer_type(z)
    assert yy.checked_type == relay.TensorType(shape, dtype)
    func = relay.Function([x], z)
    x_data = np.random.uniform(low=-1, high=1, size=shape).astype(dtype)
    ref_res = topi.testing.lrn_python(x_data, size, axis, bias, alpha, beta)

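    # Evaluate on both the graph and debug executors and cross-check each
    # result against the numpy reference.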
    for target, ctx in ctx_list():
        intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
        intrp2 = relay.create_executor("debug", ctx=ctx, target=target)
        op_res1 = intrp1.evaluate(func)(x_data)
        tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5)
        op_res2 = intrp2.evaluate(func)(x_data)
        tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5)
Example no. 3
def test_l2_normalize():
    n, c, h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
    x = relay.var("x", shape=(n, c, h, w))
    y = relay.nn.l2_normalize(x, eps=0.001, axis=[1])
    assert "axis=" in y.astext()
    yy = relay.ir_pass.infer_type(y)
    assert yy.checked_type == relay.TensorType((n, c, h, w))

    shape = (1, 5, 10, 10)
    dtype = "float32"
    x = relay.var("x", relay.TensorType(shape, dtype))
    eps=0.001
    axis=1
    z = relay.nn.l2_normalize(x, eps=0.001, axis=[axis])
    yy = relay.ir_pass.infer_type(z)
    assert yy.checked_type == relay.TensorType(shape, dtype)
    func = relay.Function([x], z)
    x_data = np.random.uniform(low=-1, high=1, size=shape).astype(dtype)
    ref_res = topi.testing.l2_normalize_python(x_data, eps, axis)

    for target, ctx in ctx_list():
        intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
        intrp2 = relay.create_executor("debug", ctx=ctx, target=target)
        op_res1 = intrp1.evaluate(func)(x_data)
        tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5)
        op_res2 = intrp2.evaluate(func)(x_data)
        tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5)
Example no. 4
def test_conv2d_transpose_infer_type():
    # symbolic in batch dimension
    n, c, h, w = tvm.var("n"), 10, 10, 12
    x = relay.var("x", relay.TensorType((n, c, h, w), "float32"))
    w = relay.var("w", relay.IncompleteType())
    y = relay.nn.conv2d_transpose(x, w,
                                  kernel_size=(3, 3),
                                  padding=(1, 1),
                                  channels=15)
    assert "channels=15" in y.astext()
    yy = relay.ir_pass.infer_type(y)
    assert yy.checked_type == relay.TensorType(
        (n, 15, 10, 12), "float32")
    assert yy.args[1].checked_type == relay.TensorType(
        (10, 15, 3, 3), "float32")

    # infer by shape of w, mixed precision
    n, c, h, w = tvm.var("n"), 10, 10, 12
    x = relay.var("x", relay.TensorType((n, c, h, w), "float32"))
    w = relay.var("w", relay.TensorType((12, 11, 5, 5), "float32"))
    y = relay.nn.conv2d_transpose(x, w,
                                  output_padding=(1, 1),
                                  channels=11,
                                  data_layout="NHWC")
    yy = relay.ir_pass.infer_type(y)
    assert yy.checked_type == relay.TensorType(
        (n, 15, 15, 11), "float32")
Example no. 5
def test_flatten_infer_type():
    d1, d2, d3, d4 = tvm.var("d1"), tvm.var("d2"), tvm.var("d3"), tvm.var("d4")
    x = relay.var("x", relay.TensorType((d1, d2, d3, d4), "float32"))
    y = relay.nn.batch_flatten(x)
    yy = relay.ir_pass.infer_type(y)
    assert yy.checked_type == relay.TensorType((d1, ((d2*d3)*d4)), "float32")

    x = relay.var("x", relay.TensorType((3, 2, 4, 3), "float32"))
    y = relay.nn.batch_flatten(x)
    yy = relay.ir_pass.infer_type(y)
    assert yy.checked_type == relay.TensorType((3, 24), "float32")

    x = relay.var("x", relay.TensorType((d1, 2, d3, 3), "float32"))
    y = relay.nn.batch_flatten(x)
    yy = relay.ir_pass.infer_type(y)
    assert yy.checked_type == relay.TensorType((d1, ((2*d3)*3)), "float32")

    shape = (1, 5, 10, 10)
    o_shape = (1, 500)
    dtype = "float32"
    x = relay.var("x", relay.TensorType(shape, dtype))
    z = relay.nn.batch_flatten(x)
    yy = relay.ir_pass.infer_type(z)
    assert yy.checked_type == relay.TensorType(o_shape, dtype)
    func = relay.Function([x], z)
    x_data = np.random.uniform(low=-1, high=1, size=shape).astype(dtype)
    ref_res = x_data.flatten().reshape(o_shape)

    for target, ctx in ctx_list():
        intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
        intrp2 = relay.create_executor("debug", ctx=ctx, target=target)
        op_res1 = intrp1.evaluate(func)(x_data)
        tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5)
        op_res2 = intrp2.evaluate(func)(x_data)
        tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5)
Example no. 6
def test_reduce_functions():
    def _with_keepdims(func):
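        # np.argmin/np.argmax (unlike np.sum and friends) historically had no
        # keepdims argument, so this wrapper reshapes the result to emulate it.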
        def _wrapper(data, axis=None, keepdims=False):
            if not keepdims:
                return func(data, axis=axis)
            else:
                if axis is not None:
                    axis = axis[0]
                    out_shape = list(data.shape)
                    out_shape[axis] = 1
                else:
                    out_shape = [1 for _ in range(len(data.shape))]
                return func(data, axis=axis).reshape(out_shape)
        return _wrapper

    d1, d2, d3, d4 = tvm.var("d1"), tvm.var("d2"), tvm.var("d3"), tvm.var("d4")
    for func in [[relay.sum, np.sum],
                 [relay.max, np.max],
                 [relay.min, np.min],
                 [relay.mean, np.mean],
                 [relay.prod, np.prod],
                 [relay.argmin, _with_keepdims(np.argmin)],
                 [relay.argmax, _with_keepdims(np.argmax)]]:
        verify_reduce(func, (d1, d2, d3, d4), (2,), True, False, (d1, d2, 1, d4))
        verify_reduce(func, (d1, d2, d3), (1,), True, False, (d1, 1, d3))
        verify_reduce(func, (d1, d2, d3), None, True, False, (1, 1, 1))
        verify_reduce(func, (d1, d2, d3), (0, 1), True, False, (1, 1, d3))
        verify_reduce(func, (2, 3, 4), (1,), True, False, (2, 1, 4))
        verify_reduce(func, (2, 3, 4), (0, 1, 2), False, False, ())
        verify_reduce(func, (4, 4, 3), None, False, True, ())
        verify_reduce(func, (4, 4, 3), (0, 2), False, False, (4,))
        verify_reduce(func, (128, 24, 128), (0, 1), False, False, (128,))
        verify_reduce(func, (128, 24, 128), (0, 2), False, False, (24,))
        verify_reduce(func, (128, 24, 128), (0, 1), True, False, (1, 1, 128))
        verify_reduce(func, (128, 24, 128), (0, 2), True, False, (1, 24, 1))
Example no. 7
def test_strided_slice():
    def verify(dshape, begin, end, strides, output, test_ref=True):
        x = relay.var("x", relay.TensorType(dshape, "float32"))
        z = relay.strided_slice(x, begin=begin, end=end, strides=strides)
        func = relay.Function([x], z)
        func = relay.ir_pass.infer_type(func)
        text = func.astext()
        assert "begin=" in text
        assert "end=" in text
        if output:
            assert func.body.checked_type == relay.ty.TensorType(output, "float32")
        if not test_ref:
            return
        x_data = np.random.uniform(size=dshape).astype("float32")
        ref_res = topi.testing.strided_slice_python(
            x_data, begin, end, strides)
        for target, ctx in ctx_list():
            intrp = relay.create_executor("graph", ctx=ctx, target=target)
            op_res = intrp.evaluate(func)(x_data)
            tvm.testing.assert_allclose(op_res.asnumpy(), ref_res)

    d1, d2, d3, d4 = tvm.var("d1"), tvm.var("d2"), tvm.var("d3"), tvm.var("d4")
    verify((d1, d2, 3), [None, None, 1], [None, None, 2], None, (d1, d2, 1), False)
    verify((3, 4, 3), [0, 0, 0], [4, -5, 4], [1, -1, 2], (3, 1, 2))
    verify((3, 4, 3), [1, 1, 0], [4, 4, 3], [2, 1, 1], (1, 3, 3))
    verify((3, 4, 3), [1, -1, 0], [4, -5, 3], [2, -1, 1], (1, 4, 3))
    verify((3, 4, 3), [1, 0, 0], [2, 2, 3], [1, 1, 2], (1, 2, 2))
    verify((3, 4, 3), [1, -1, 0], [2, -3, 3], [1, -1, 1], (1, 2, 3))
    verify((3, 4, 3), [1, 1, 0], [4, 4, 3], None, (2, 3, 3))
    verify((3, 4, 3), [1, 1, 0], [4, 1000, 3], None, (2, 3, 3))
    verify((3, 4, 3), [1, 1, 0], [4, 4], None, (2, 3, 3))
    verify((3, 4, 3), [1, 1], [4, 4, 3], None, (2, 3, 3))
Example no. 8
def test_min_max_bound():
    analyzer = tvm.arith.Analyzer()
    x, y = tvm.var("x"), tvm.var("y")

    analyzer.update(x, tvm.arith.ConstIntBound(-9, 11))
    analyzer.update(y, tvm.arith.ConstIntBound(4, 10))
    bd = analyzer.const_int_bound(tvm.min(x, y))
    assert bd.min_value == -9
    assert bd.max_value == 10

    analyzer.update(x, tvm.arith.ConstIntBound(bd.NEG_INF, bd.POS_INF), override=True)
    analyzer.update(y, tvm.arith.ConstIntBound(4, 10), override=True)
    bd = analyzer.const_int_bound(tvm.min(x, y))
    assert bd.min_value == bd.NEG_INF
    assert bd.max_value == 10

    bd = analyzer.const_int_bound(tvm.max(x, y))
    assert bd.min_value == 4
    assert bd.max_value == bd.POS_INF

    analyzer.update(x, tvm.arith.ConstIntBound(1, bd.POS_INF), override=True)
    analyzer.update(y, tvm.arith.ConstIntBound(4, 10), override=True)
    bd = analyzer.const_int_bound(tvm.max(x, y))
    assert bd.min_value == 4
    assert bd.max_value == bd.POS_INF
Example no. 9
def test_add_sub_bound():
    analyzer = tvm.arith.Analyzer()
    x, y = tvm.var("x", "int64"), tvm.var("y", "int64")
    bd = analyzer.const_int_bound(x + y)
    assert bd.min_value == bd.NEG_INF
    assert bd.max_value == bd.POS_INF

    analyzer.update(x, tvm.arith.ConstIntBound(0, 4))
    analyzer.update(y, tvm.arith.ConstIntBound(1, 10))
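    # Interval arithmetic: [0, 4] + [1, 10] = [1, 14] and [0, 4] - [1, 10] = [-10, 3].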
    bd = analyzer.const_int_bound(x + y)
    assert bd.min_value == 1
    assert bd.max_value == 14

    bd = analyzer.const_int_bound(x - y)
    assert bd.min_value == -10
    assert bd.max_value == 3

    analyzer.update(x, tvm.arith.ConstIntBound(0, bd.POS_INF), override=True)
    bd = analyzer.const_int_bound(x - y)
    assert bd.min_value == -10
    assert bd.max_value == bd.POS_INF

    bd = analyzer.const_int_bound(1 - x)
    assert bd.min_value == bd.NEG_INF
    assert bd.max_value == 1
Example no. 10
def test_vectorize_if_then_else():
    n = tvm.var('n')
    x = tvm.var('x')
    ib = tvm.ir_builder.create()
    A = ib.pointer("float32", name="A")
    with ib.for_range(0, 4, for_type="vectorize") as i:
        A[i] = tvm.call_intrin("float32", "tvm_if_then_else",
                               i > 0,
                               A[i] + 1, A[i])
    stmt = ib.get()
    stmt = tvm.ir_pass.VectorizeLoop(stmt)
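    # The branch condition depends on the vectorized loop variable i, so the
    # loop cannot be vectorized and stays a For.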
    assert isinstance(stmt, tvm.stmt.For)


    ib = tvm.ir_builder.create()
    A = ib.pointer("float32", name="A")
    with ib.for_range(0, n) as k:
        with ib.for_range(0, 4, for_type="vectorize") as i:
            A[k * 4 + i] = tvm.call_intrin("float32", "tvm_if_then_else",
                                           k > 0,
                                           A[k * 4 + i], 0)
    stmt = ib.get()
    assert isinstance(stmt.body, tvm.stmt.For)
    stmt = tvm.ir_pass.VectorizeLoop(stmt)
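    # Here the condition k > 0 is uniform across the vector lanes, so the inner
    # loop vectorizes and the scalar else-operand 0 becomes a Broadcast.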
    assert not isinstance(stmt.body, tvm.stmt.For)
    assert isinstance(stmt.body.value.args[2], tvm.expr.Broadcast)
Example no. 11
def test_buffer_vload():
    m = tvm.var('m')
    n = tvm.var('n')
    Ab = tvm.decl_buffer((m, n), tvm.float32, elem_offset=100)
    load = Ab.vload([2, 3])
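    # The row-major index of [2, 3] is n * 2 + 3; adding elem_offset=100 gives n * 2 + 103.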
    offset = tvm.ir_pass.Simplify(load.index)
    assert tvm.ir_pass.Equal(offset, n * 2 + 103)
Example no. 12
def test_scan():
    m = tvm.var("m")
    n = tvm.var("n")
    x = tvm.compute((m, n), lambda i, j: tvm.const(1, "float32"), name="x")
    s_state = tvm.placeholder((m, n))
    s_init = tvm.compute((1, n), lambda _, i: x[0, i], name="s_init")
    x_trans = tvm.compute((m, n), lambda i, j: x[i, j] + 1, name="x_trans")
    s_up1 = tvm.compute((m, n), lambda t, i: s_state[t - 1, i] + 1, name="up1")
    s_update = tvm.compute((m, n), lambda t, i: s_up1[t, i] + x_trans[t, i], name="update")
    s_scan = tvm.scan(s_init, s_update, s_state)

    def test_getbody():
        body = tvm.schedule.ScanGetBody(s_scan.op)
        assert set(body) == set([s_scan.op, s_update.op, s_up1.op])

    def test_attach_path():
        s = tvm.create_schedule(s_scan.op)
        s[x_trans].compute_at(s[s_update], s_update.op.axis[0])
        apath = tvm.schedule.CreateAttachPath(s)
        assert(tuple(apath[s_update.op]) == tuple([s_scan.op.scan_axis]))
        assert(tuple(apath[x_trans.op]) == tuple([s_update.op.axis[0], s_scan.op.scan_axis]))

    def test_fix_pt():
        body = tvm.schedule.ScanGetBody(s_scan.op)
        fxpt = tvm.schedule.ScanFixPointAnalysis(s_scan.op, body)
        assert(fxpt[s_scan.spatial_axis_[0]].value != 0)
Example no. 13
def test_expand_dims_infer_type():
    n, t, d = tvm.var("n"), tvm.var("t"), 100
    x = relay.var("x", shape=(n, t, d))
    y = relay.expand_dims(x, axis=2)
    assert "axis=2" in y.astext()
    checked = relay.ir_pass.infer_type(y)
    assert checked.checked_type == relay.TensorType((n, t, 1, 100))
Example no. 14
def test_nms():
    num_anchors = 60

    overlap_threshold = 0.5
    force_suppress = True
    nms_topk = 10

    n = tvm.var("n")
    x0 = relay.var("x0", relay.ty.TensorType((n, num_anchors, 6), "float32"))
    x1 = relay.var("x1", relay.ty.TensorType((n,), "int"))

    z = relay.vision.nms(x0, x1, overlap_threshold, force_suppress, nms_topk)

    assert "overlap_threshold" in z.astext()
    zz = relay.ir_pass.infer_type(z)
    assert zz.checked_type == relay.ty.TensorType(
        (n, num_anchors, 6), "float32")

    n = tvm.var("n")
    x0 = relay.var("x0", relay.ty.TensorType((n, num_anchors, 6), "float32"))
    x1 = relay.var("x1", relay.ty.TensorType((n,), "int"))

    z = relay.vision.nms(x0, x1)

    zz = relay.ir_pass.infer_type(z)
    assert zz.checked_type == relay.ty.TensorType(
        (n, num_anchors, 6), "float32")
Example no. 15
def test_equal_compute():
    x = tvm.var('x')
    y = tvm.var('y')
    n = 128
    A = tvm.placeholder((n, n), name='A')
    B = tvm.placeholder((n, n), name='B')
    ii = tvm.var('i')
    jj = tvm.var('j')

    def func1():
        k = tvm.reduce_axis((0, n), name='k')
        return tvm.sum(A[ii, k] * B[jj, k], axis=k)

    Ab = tvm.decl_buffer((n,), name='A')
    n = tvm.var("n")
    def func2():
        ib = tvm.ir_builder.create()
        A = ib.buffer_ptr(Ab)
        with ib.for_range(0, n, name="i") as i:
            A[i] = A[i] + 1
            with ib.for_range(0, 10, name="j") as j:
                A[j] = A[j] + 2
        return ib.get()

    assert tvm.ir_pass.Equal(func1(), func1())
    assert tvm.ir_pass.Equal(func2(), func2())
Example no. 16
def test_deduce():
    a = tvm.var('a')
    b = tvm.var('b')
    c = tvm.var('c')
    d = tvm.var('d')

    b_s = tvm.arith.intset_interval(2, 3)
    c_s = tvm.arith.intset_interval(10, 15)
    d_s = tvm.arith.intset_interval(-3, -1)

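    # DeduceBound solves each constraint for a, given the integer ranges of b, c and d.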
    e0 = (-b)*a+c-d
    res0 = tvm.arith.DeduceBound(a, e0>=0, {b: b_s, c: c_s, d: d_s}, {})
    ans0 = ((d - c) /(b*-1))
    assert str(tvm.ir_pass.Simplify(res0.max())) == str(ans0)

    e0 = d*a+c-d
    res0 = tvm.arith.DeduceBound(a, e0>=0, {b: b_s, c: c_s, d: d_s}, {})
    ans0 = ((0-c)/d + 1)
    assert str(tvm.ir_pass.Simplify(res0.max())) == str(ans0)

    e1 = (a*4+b < c)
    res1 = tvm.arith.DeduceBound(a, e1, {b: b_s, c: c_s, d: d_s}, {})
    ans1 = (((c - b) + -1)/4)
    assert str(tvm.ir_pass.Simplify(res1.max())) == str(ans1)

    e2 = (tvm.max(5, a * 4) < 0)
    res2 = tvm.arith.DeduceBound(a, e2, {b: b_s, c: c_s, d: d_s}, {})
    assert str(res2.max()) == "neg_inf"
    assert str(res2.min()) == "pos_inf"

    e3 = (-b)+a*c-d
    res3 = tvm.arith.DeduceBound(a, e3>=0, {b: b_s, c: c_s, d: d_s}, {b: b_s, d: d_s})
    ans3 = 2/c+1
    assert str(tvm.ir_pass.Simplify(res3.min())) == str(ans3)
Example no. 17
def test_dynamic_tensor():
    dtype = 'float32'
    stype = 'csr'
    target = 'llvm'
    ctx = tvm.context(target, 0)
    nr, nc, n = tvm.var('nr'), tvm.var('nc'), tvm.var('n')
    A = tvmsp.placeholder(shape=(nr, nc), nonzeros=n, name='A', dtype=dtype)
    assert(A.stype == 'csr')
    C = tvm.compute(A.data.shape, lambda i: A.data[i] * 2., tag='cs_scatter')
    s = tvm.create_schedule(C.op)
    _nr, _nc = 3, 5
    a = np.maximum(np.random.uniform(size=(_nr, _nc)).astype(dtype)-.6, 0.)
    a = tvmsp.array(a, ctx)
    assert a.data.dtype == a.dtype
    Ab = namedtuple('CSRBuffer', ['data', 'indices', 'indptr'])
    Ab.data = tvm.decl_buffer(a.data.shape, a.data.dtype, name='A_data')
    Ab.indices = tvm.decl_buffer(a.data.shape, a.data.dtype, name='A_indices')
    binds = {A.data: Ab.data, A.indices: Ab.indices}
    f = tvm.build(s, [nr, A.data, C], target, binds=binds)
    c = tvmsp.array(np.zeros((_nr, _nc), dtype), ctx)
    c.data = tvm.nd.empty(a.data.shape, dtype)
    c.indices = a.indices
    c.indptr = a.indptr
    f(a.data.shape[0], a.data, c.data)
    tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() * 2., rtol=1e-5)
Example no. 18
def test_equality():
    a = tvm.var('a')
    b = tvm.var('b')
    c = (a == b)
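    # (a == b) builds a deferred-equality object rather than a bool; its truth
    # value is structural equality, and two distinct vars are not structurally equal.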
    assert not c
    d = (c != c)
    assert not d
Example no. 19
def test_schedule_create():
    m = tvm.var('m')
    n = tvm.var('n')
    l = tvm.var('l')
    A = tvm.placeholder((m, l), name='A')
    B = tvm.placeholder((n, l), name='B')
    AA = tvm.compute((m, l), lambda i, j: A[i, j])
    T = tvm.compute((m, n, l), lambda i, j, k: AA(i, k) * B(j, k))
    s = tvm.create_schedule(T.op)
    s[AA].set_scope("shared")
    xo, xi = s[T].split(T.op.axis[0], factor=10)
    xi1, xi2 = s[T].split(xi, factor=2)
    s[AA].compute_at(s[T], xi1)
    xo, xi = s[AA].split(AA.op.axis[0], factor=10)
    s[T].reorder(xi2, xi1)
    assert T.op.axis[1] in s[T].leaf_iter_vars

    # save load json
    json_str = tvm.save_json(s)
    s_loaded = tvm.load_json(json_str)
    assert isinstance(s_loaded, tvm.schedule.Schedule)
    assert(str(s_loaded.outputs[0].body) == str(s.outputs[0].body))

    # pickle unpickle
    dump = pkl.dumps(s)
    s_loaded = pkl.loads(dump)
    assert isinstance(s_loaded, tvm.schedule.Schedule)
    assert(str(s_loaded.outputs[0].body) == str(s.outputs[0].body))
Example no. 20
def test_infer_type_leaky_relu():
    n, c, h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w")
    x = relay.var("x", relay.TensorType((n, c, h, w), "float32"))
    y = relay.nn.leaky_relu(x, alpha=0.1)
    assert "alpha=0.1" in y.astext()
    yy = relay.ir_pass.infer_type(y)
    assert yy.checked_type == relay.TensorType((n, c, h, w), "float32")

    shape = (1, 5, 10, 10)
    dtype = "float32"
    x = relay.var("x", relay.TensorType(shape, dtype))
    z = relay.nn.leaky_relu(x, alpha=0.1)
    assert "alpha=0.1" in z.astext()
    yy = relay.ir_pass.infer_type(z)
    assert yy.checked_type == relay.TensorType(shape, dtype)
    func = relay.Function([x], z)
    x_data = np.random.uniform(low=-1, high=1, size=shape).astype(dtype)
    ref_res = np.where(x_data > 0, x_data, x_data * 0.1)

    for target, ctx in ctx_list():
        intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
        intrp2 = relay.create_executor("debug", ctx=ctx, target=target)
        op_res1 = intrp1.evaluate(func)(x_data)
        tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5)
        op_res2 = intrp2.evaluate(func)(x_data)
        tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5)
Example no. 21
def test_basic():
    a = tvm.var()
    b = tvm.var()
    m = tvm.arith.EvalModular(a * 4 + b * 6 + 7)
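    # a * 4 + b * 6 + 7 is always 1 modulo gcd(4, 6) == 2, hence coeff 2, base 1.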
    assert m.coeff == 2
    assert m.base == 1

    m = tvm.arith.EvalModular((a * 4 + 1) * (b * 8 + 3))
    assert m.coeff == 4
    assert m.base == 3

    m = tvm.arith.EvalModular((a * 4 + 1) / (b * 8 + 3))
    assert m.coeff == 1
    assert m.base == 0

    m = tvm.arith.EvalModular((a * 4 + 1) * (b * 8 / 4))
    assert m.coeff == 2
    assert m.base == 0

    m = tvm.arith.EvalModular((a * 12 + 1) - (b * 3 * 7  + 2))
    assert m.coeff == 3
    assert m.base == 2


    m = tvm.arith.EvalModular(a * 12 + tvm.min(b * 3 * 7, 2))
    assert m.coeff == 1
    assert m.base == 0
Example no. 22
def test_scan_group():
    m = tvm.var("m")
    n = tvm.var("n")
    x = tvm.compute((m, n), lambda i, j: tvm.const(1, "float32"), name="x")
    s_state = tvm.placeholder((m, n))
    s_init = tvm.compute((1, n), lambda _, i: x[0, i])

    s_update1 = tvm.compute((m, n), lambda t, i: s_state[t-1, i] + x[t, i])
    s_update2 = tvm.compute((m, n), lambda t, i: s_update1[t, i] + 1)
    s_update3 = tvm.compute((m, n), lambda t, i: s_update2[t, i] + 1)
    res = tvm.scan(s_init, s_update3, s_state, inputs=x)

    s = tvm.create_schedule(res.op)
    assert s[s_update1].group is not None
    assert s[s_update2].group == s[s_update1].group
    # Assigning within the same group is valid
    s[s_update1].compute_at(s[s_update2], s_update2.op.axis[1])
    # create a new group containing s_update2 and s_update1
    g2 = s.create_group(outputs=s_update2, inputs=[s_state, x])
    assert g2.group is not None
    assert g2.group == s[s_update3].group
    assert s[s_update2].group == g2
    assert s[s_update1].group == g2
    g2.compute_at(s[s_update3], s_update3.op.axis[1])
    assert g2.attach_stage == s[s_update3]
    try:
        # computing outside the group is an error.
        s[s_update2].compute_at(s[s_init], s_init.op.axis[0])
        assert False
    except tvm.TVMError:
        pass
Example no. 23
def test_storage_share():
    m = tvm.var('m')
    l = tvm.var('l')
    A = tvm.placeholder((m, l), name='A')
    num_stage = 5
    B = A
    for t in range(num_stage):
        B = tvm.compute((m, l), lambda i, j: B[i, j] + (t+1), name='A%d' % t)

    s = tvm.create_schedule(B.op)
    bounds = tvm.schedule.InferBound(s)
    assert isinstance(bounds, tvm.container.Map)
    stmt = tvm.schedule.ScheduleOps(s, bounds)
    Ab = tvm.decl_buffer(A.shape, A.dtype, name='A')
    Bb = tvm.decl_buffer(B.shape, B.dtype, name='B')
    stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, B: Bb}, 64)
    stmt = tvm.ir_pass.CanonicalSimplify(stmt)
    stmt = tvm.ir_pass.Simplify(stmt)
    stmt = tvm.ir_pass.StorageRewrite(stmt)
    # verify that there is only one allocation
    # verify that inplace folding works
    num_alloc = [0]
    def verify(n):
        if isinstance(n, tvm.stmt.Allocate):
            num_alloc[0] += 1
    tvm.ir_pass.PostOrderVisit(stmt, verify)
    assert num_alloc[0] == 1
Example no. 24
def test_parallel_alloc():
    ib = tvm.ir_builder.create()
    n = tvm.var("n")
    with ib.for_range(0, n, name="i", for_type="parallel") as i:
        with ib.for_range(0, 10, name="j") as j:
            A = ib.allocate("float32", n, name="A", scope="global")
            A[j] = A[j] + 2

    body = ib.get()
    body = tvm.ir_pass.StorageRewrite(body)
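    # The allocation is hoisted out of the serial j loop but must stay inside
    # the parallel loop, so each thread keeps its own buffer.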
    assert (isinstance(body.body.body, tvm.stmt.Allocate))

    ib = tvm.ir_builder.create()
    n = tvm.var("n")
    with ib.for_range(0, n, name="t") as i:
        ib.scope_attr(
            tvm.const(1, "int32") , "pragma_scope",
            tvm.make.StringImm("parallel_launch_point"))
        with ib.for_range(0, n, name="i", for_type="parallel") as i:
            with ib.for_range(0, 10, name="j") as j:
                A = ib.allocate("float32", n, name="A", scope="global")
                A[j] = A[j] + 2
    body = ib.get()
    body = tvm.ir_pass.StorageRewrite(body)

    assert(isinstance(body.body.body.body.body, tvm.stmt.Allocate))
Example no. 25
def test_storage_sync():
    m = tvm.var('m')
    l = tvm.var('l')
    A = tvm.placeholder((m, l), name='A')

    A1 = tvm.compute((m, l), lambda i, j: A[i, j], name='A1')
    A2 = tvm.compute((m, l), lambda i, j: A1[i, j] + 3, name='A2')

    s = tvm.create_schedule(A2.op)
    xo, xi = s[A2].split(A2.op.axis[0], factor=8)
    s[A2].bind(xo, tvm.thread_axis("blockIdx.x"))
    s[A1].compute_at(s[A2], xo)
    s[A1].set_scope("shared")

    bounds = tvm.schedule.InferBound(s)
    assert isinstance(bounds, tvm.container.Map)
    stmt = tvm.schedule.ScheduleOps(s, bounds)
    Ab = tvm.decl_buffer(A.shape, A.dtype, name='A')
    A2b = tvm.decl_buffer(A2.shape, A2.dtype, name='A2')
    stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, A2: A2b}, 64)
    f = tvm.ir_pass.MakeAPI(stmt, "test", [Ab, A2b], 0, True)
    flist = tvm.ir_pass.SplitHostDevice(f)
    f = flist[1]
    f = tvm.ir_pass.ThreadSync(f, "shared")
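    # ThreadSync inserts a tvm_storage_sync("shared") barrier between the
    # write to the shared buffer A1 and its read by A2.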
    body_list = tvm.make.stmt_list(f.body.body.body.body)
    assert(body_list[1].value.name == "tvm_storage_sync")
Example no. 26
def test_multivariate():
    v = [tvm.var("v%d" % i) for i in range(4)]
    b = tvm.var("b")
    m = tvm.arith.DetectLinearEquation(v[0] * (b + 4) + v[0] + v[1] * 8, v)
    assert(tvm.ir_pass.Equal(tvm.ir_pass.Simplify(m[0]), b + 5))
    assert(m[1].value == 8)

    m = tvm.arith.DetectLinearEquation(v[0] * (b + 4) + v[0] + v[1] * 8 * v[2], v)
    assert(len(m) == 0)

    m = tvm.arith.DetectLinearEquation(v[0] * (b + 4) + v[0] + v[1] * 8 * v[1] + v[3], v)
    assert(len(m) == 0)

    m = tvm.arith.DetectLinearEquation(((v[0] * b + v[1]) * 8 + v[2] + 1) * 2, v)
    assert(m[1].value == 16)
    assert(m[2].value == 2)
    assert(m[len(m)-1].value == 2)

    m = tvm.arith.DetectLinearEquation((v[0] - v[1]), [v[2]])
    assert(m[0].value == 0)
    assert(tvm.ir_pass.Simplify(m[1] - (v[0] - v[1])).value == 0)

    m = tvm.arith.DetectLinearEquation((v[0] - v[1]), [])
    assert(len(m) == 1)
    assert(tvm.ir_pass.Simplify(m[0] - (v[0] - v[1])).value == 0)
Example no. 27
def _post_order(op):
    if isinstance(op, tvm.stmt.Allocate):
        buffer_var = op.buffer_var
        if buffer_var not in rw_info:
            return None
        new_var = rw_info[buffer_var]
        let_stmt = tvm.make.LetStmt(
            new_var, tvm.call_extern(
                "handle", "VTABufferCPUPtr",
                env.dev.command_handle,
                buffer_var), op.body)
        alloc = tvm.make.Allocate(
            buffer_var, op.dtype, op.extents,
            op.condition, let_stmt)
        del rw_info[buffer_var]
        return alloc
    if isinstance(op, tvm.expr.Load):
        buffer_var = op.buffer_var
        if buffer_var not in rw_info:
            rw_info[buffer_var] = tvm.var(
                buffer_var.name + "_ptr", "handle")
        new_var = rw_info[buffer_var]
        return tvm.make.Load(op.dtype, new_var, op.index)
    if isinstance(op, tvm.stmt.Store):
        buffer_var = op.buffer_var
        if buffer_var not in rw_info:
            rw_info[buffer_var] = tvm.var(
                buffer_var.name + "_ptr", "handle")
        new_var = rw_info[buffer_var]
        return tvm.make.Store(new_var, op.value, op.index)
    raise RuntimeError("not reached")
Example no. 28
def test_tensor_comm_reducer():
    m = tvm.var('m')
    n = tvm.var('n')
    A = tvm.placeholder((m, n), name='A')
    k = tvm.reduce_axis((0, n), "k")
    mysum = tvm.comm_reducer(lambda x, y: x+y, lambda t: tvm.const(0, dtype=t))
    C = tvm.compute((m,), lambda i: mysum(A[i, k], axis=k))
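
    # A minimal follow-up sketch (not part of the original test), assuming the
    # same pre-1.0 tvm API used above: lower the custom reduction to inspect the IR.
    s = tvm.create_schedule(C.op)
    print(tvm.lower(s, [A, C], simple_mode=True))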
Example no. 29
def test_simplify_if_then_else():
    ck = CanonicalChecker()
    x = tvm.var("x")
    y = tvm.var("y")
    # simplification that takes condition into account.
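    # 466036 == 19 * 24528 + 4, so under the guard the two offsets are equal
    # modulo 24528, which is what the expected expression encodes.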
    res = tvm.if_then_else((x * 4 + y) >= 466036,
                           tvm.if_then_else(24512 <= ((((x*4) + y) - 466036) % 24528),
                                            (((((x*4) + y)  - 466036) % 24528) -24512) % 16,
                                            x), y)
    expected = tvm.if_then_else(
        tvm.expr.LE(466036, (x * 4 + y)),
        tvm.if_then_else(tvm.expr.LE(24512, ((((x*4) + y) - 4) % 24528)),
                         (((x*4) + y)  - 4) % 16,
                         x), y)
    ck.verify(res, expected)
    # only the branch guarded by the condition can be simplified
    res = tvm.expr.Select(tvm.all(x >= -1, y >= 0), (x + y + 100) % 3, (x + 100) % 3)
    expected = tvm.expr.Select(tvm.all(x >= -1, y >= 0), (x + y + 1) % 3, (x + 100) % 3)
    ck.verify(res, ck.analyzer.canonical_simplify(expected))

    res = tvm.expr.Select(x >= 10,
                          tvm.if_then_else(x / 3 > 2, x, 0), 0)
    expected = tvm.expr.Select(x >= 10, x, 0)
    ck.verify(res, ck.analyzer.canonical_simplify(expected))

    res = tvm.expr.Select(x >= 10,
                          tvm.if_then_else(x / 3 < 2, x, 0), 0)
    ck.verify(res, 0)
Example no. 30
def test_mix_index():
    a = tvm.var("a")
    b = tvm.var("b")
    analyzer = tvm.arith.Analyzer()
    m = analyzer.modular_set(a * 4 + b * 6 + 7)
    assert m.coeff == 2
    assert m.base == 1

    m = analyzer.modular_set((a * 4 + 1) * (b * 8 + 3))
    assert m.coeff == 4
    assert m.base == 3

    m = analyzer.modular_set((a * 4 + 1) / (b * 8 + 3))
    assert m.coeff == 1
    assert m.base == 0

    m = analyzer.modular_set((a * 4 + 1) * (b * 8 / 4))
    assert m.coeff == 2
    assert m.base == 0

    m = analyzer.modular_set((a * 12 + 1) - (b * 3 * 7  + 2))
    assert m.coeff == 3
    assert m.base == 2

    m = analyzer.modular_set(a * 12 + tvm.min(b * 3 * 7, 2))
    assert m.coeff == 1
    assert m.base == 0
Example no. 31
def test_let():
    x = tvm.var('x')
    y = tvm.var('y')
    stmt = tvm.make.LetStmt(x, 10, tvm.make.Evaluate(x + 1))
Example no. 32
def test_ir2():
    x = tvm.var("n")
    a = tvm.var("array", tvm.handle)
    st = tvm.make.Store(a, x + 1, 1)
    assert isinstance(st, tvm.stmt.Store)
    assert (st.buffer_var == a)
Example no. 33
def _intrin_popcount(m, k_i, w_b, x_b):
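    # Declares an ARM NEON tensor intrinsic for a bit-serial popcount
    # reduction: z[i] = sum over bw, bx, k of popcount(w & x) << (bw + bx).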
    dtype = 'uint8'
    w = tvm.placeholder((w_b, m, k_i), dtype=dtype, name='w')
    x = tvm.placeholder((
        x_b,
        k_i,
    ), dtype=dtype, name='x')
    k = tvm.reduce_axis((0, k_i), name='k')
    bw = tvm.reduce_axis((0, w_b), name='bw')
    bx = tvm.reduce_axis((0, x_b), name='bx')
    z = tvm.compute((m, ),
                    lambda i: tvm.sum(tvm.popcount(w[bw, i, k].astype(
                        'uint16') & x[bx, k].astype('uint16')) <<
                                      (bw + bx).astype('uint16'),
                                      axis=[bw, bx, k]),
                    name='z')

    Wb = tvm.decl_buffer(w.shape,
                         w.dtype,
                         name="W",
                         offset_factor=k_i,
                         strides=[tvm.var('ldw'),
                                  tvm.var('ldw'), 1])
    Xb = tvm.decl_buffer(x.shape,
                         x.dtype,
                         name="X",
                         offset_factor=k_i,
                         strides=[tvm.var('ldw'), 1])

    def _intrin_func(ins, outs):
        ww, xx = ins
        zz = outs[0]
        vpadd = "llvm.arm.neon.vpadd.v8u8"
        vpadalu = "llvm.arm.neon.vpadalu.v16u8.v8u16"
        args_1 = tvm.const(1, 'uint32')
        args_2 = tvm.const(2, 'uint32')

        def _instr(index):
            irb = tvm.ir_builder.create()
            if index == 1:
                irb.emit(zz.vstore(0, tvm.const(0, 'uint16x8')))
                return irb.get()

            cnts8 = [None] * 8
            cnts4 = [None] * 4
            cnts2 = [None] * 2
            for bw in range(w_b):
                for bx in range(x_b):
                    if k_i == 16:
                        for i in range(m):
                            ands = ww.vload([bw, i, 0], 'uint8x16') & xx.vload(
                                [bx, 0], 'uint8x16')
                            cnts = tvm.popcount(ands)
                            upper_half = tvm.call_pure_intrin(
                                'uint8x8', 'vectorhigh', cnts)
                            lower_half = tvm.call_pure_intrin(
                                'uint8x8', 'vectorlow', cnts)
                            cnts8[i] = upper_half + lower_half
                        for i in range(m // 2):
                            cnts4[i] = tvm.call_llvm_intrin(
                                'uint8x8', vpadd, args_1, cnts8[i * 2],
                                cnts8[i * 2 + 1])
                        for i in range(m // 4):
                            cnts2[i] = tvm.call_llvm_intrin(
                                'uint8x8', vpadd, args_1, cnts4[i * 2],
                                cnts4[i * 2 + 1])
                        cnts = tvm.call_pure_intrin('uint8x16',
                                                    'vectorcombine', cnts2[0],
                                                    cnts2[1])
                        shifted_cnts = cnts << tvm.const(bw + bx, dtype)
                        out = tvm.call_llvm_intrin('uint16x8', vpadalu, args_2,
                                                   zz.vload(0, 'uint16x8'),
                                                   shifted_cnts)
                    else:  # ki == 8
                        for i in range(m):
                            ands = ww.vload([bw, i, 0], 'uint8x8') & xx.vload(
                                [bx, 0], 'uint8x8')
                            cnts8[i] = tvm.popcount(ands)
                        for i in range(m // 2):
                            cnts4[i] = tvm.call_llvm_intrin(
                                'uint8x8', vpadd, args_1, cnts8[i * 2],
                                cnts8[i * 2 + 1])
                        for i in range(m // 4):
                            cnts2[i] = tvm.call_llvm_intrin(
                                'uint8x8', vpadd, args_1, cnts4[i * 2],
                                cnts4[i * 2 + 1])
                        cnts = tvm.call_pure_intrin('uint8x16',
                                                    'vectorcombine', cnts2[0],
                                                    cnts2[1])
                        shifted_cnts = cnts << tvm.const(bw + bx, dtype)
                        out = tvm.call_llvm_intrin('uint16x8', vpadalu, args_2,
                                                   zz.vload(0, 'uint16x8'),
                                                   shifted_cnts)
                    irb.emit(zz.vstore(0, out))
            return irb.get()

        # body, reset, update
        return _instr(0), _instr(1), _instr(2)

    with tvm.build_config(offset_factor=1, partition_const_loop=True):
        return tvm.decl_tensor_intrin(z.op, _intrin_func, binds={w: Wb, x: Xb})
Example no. 34
import pdb
import tvm
import numpy as np
import timeit
import os
from tvm.contrib import util

# Print the PID and pause so a debugger can be attached before continuing.
input(os.getpid())
tgt_host = "cpu"
#device = "cpu"

M = tvm.var("M")
K = tvm.var("K")
N = tvm.var("N")

A = tvm.placeholder((M, K), name='A', dtype='float32')
B = tvm.placeholder((K, N), name='B', dtype='float32')

k = tvm.reduce_axis((0, K), 'k')
C = tvm.compute((M, N), lambda i, j: 0, name='C')

s = tvm.create_schedule(C.op)
func = tvm.build(s, [A, B, C], "c", name='DPUGemm')
print(tvm.lower(s, [A, B, C], simple_mode=True))

#print(func.get_source())
batch = 2
in_channels = 3
out_channels = 2
in_height = 6
in_width = 6
Example no. 35
def test_select():
    ck = IntSetChecker()
    x, y = tvm.var("x"), tvm.var("y")
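    # The interval of a Select is the union of its branches: x - 1 over
    # [0, 10] gives [-1, 9], x + 1 gives [1, 11], so the union is (-1, 11).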
    ck.verify(tvm.tir.Select(x > 0, x - 1, x + 1),
              {x: tvm.arith.IntervalSet(0, 10)}, (-1, 11))
Example no. 36
def test_dir():
    x = tvm.var('x')
    dir(x)
Example no. 37
def rnn_matexp():
    n_num_step = 128
    n_num_hidden = 1152
    n_batch_size = 4
    detect_global_barrier = DETECT_GLOBAL_BARRIER

    num_step = tvm.var("num_step")
    num_hidden = tvm.convert(n_num_hidden)
    batch_size = tvm.convert(n_batch_size)
    num_thread_y = 8
    num_thread_x = 16 * 3
    num_sm = 24

    Whh = tvm.placeholder((num_hidden, num_hidden), name="Whh")
    s_init = tvm.compute((1, batch_size, num_hidden),
                         lambda _, i, j: 1.0,
                         name="init")
    s_state = tvm.placeholder((num_step, batch_size, num_hidden))
    kh = tvm.reduce_axis((0, num_hidden), name="kh")
    s_update = tvm.compute(
        (num_step, batch_size, num_hidden),
        lambda t, i, j: tvm.sum(s_state[t - 1, i, kh] * Whh[kh, j], axis=kh),
        name="update")
    s_scan = tvm.scan(s_init, s_update, s_state)
    # schedule
    s = tvm.create_schedule(s_scan.op)
    CL = s_update
    SS = s.cache_read(s_state, "shared", [CL])
    SL = s.cache_read(SS, "local", [CL])
    WhhL = s.cache_read(Whh, "local", [CL])
    ko, ki = s[CL].split(s[CL].op.reduce_axis[0], nparts=num_thread_y)
    CLF = s.rfactor(CL, ko)

    block_x = tvm.thread_axis((0, num_sm), "blockIdx.x")
    thread_x = tvm.thread_axis((0, num_thread_x), "threadIdx.x")
    thread_y = tvm.thread_axis((0, num_thread_y), "threadIdx.y")
    if PERSIST_KERNEL:
        s[s_scan.op].env_threads([block_x, thread_y, thread_x])

    bx, xi = s[s_init].split(s_init.op.axis[2], nparts=num_sm)
    tx, xi = s[s_init].split(xi, nparts=num_thread_x)
    s[s_init].bind(bx, block_x)
    s[s_init].bind(tx, thread_x)

    bx, xi = s[s_update].split(s[CL].op.axis[2], nparts=num_sm)
    tx, xi = s[s_update].split(xi, nparts=num_thread_x)
    s[s_update].bind(bx, block_x)
    s[s_update].bind(tx, thread_x)
    s[CL].bind(s[CL].op.reduce_axis[0], thread_y)
    s[CLF].compute_at(s[CL], s[CL].op.reduce_axis[0])
    # Duplicate store predicate.
    s[CL].set_store_predicate(thread_y.equal(0))

    if PERSIST_KERNEL:
        s[WhhL].compute_at(s[s_scan], thread_x)
        s[WhhL].unroll(WhhL.op.axis[0])
    else:
        s[WhhL].compute_at(s[CLF], CLF.op.axis[3])

    kr, ki = s[CLF].split(CLF.op.reduce_axis[0], nparts=1)
    ko, ki = s[CLF].split(ki, factor=4)
    s[SS].compute_at(s[CLF], kr)
    s[SL].compute_at(s[CLF], ko)

    xo, xi = s[SS].split(SS.op.axis[2], factor=num_thread_x * num_thread_y * 3)
    ty, xi = s[SS].split(xi, nparts=num_thread_y)
    tx, xi = s[SS].split(xi, nparts=num_thread_x)
    s[SS].bind(ty, thread_y)
    s[SS].bind(tx, thread_x)

    def check_device(target):
        with tvm.build_config(detect_global_barrier=detect_global_barrier,
                              auto_unroll_min_depth=2,
                              auto_unroll_max_step=128,
                              unroll_explicit=False):
            f = tvm.build(s, [s_scan, Whh], target)
        ctx = tvm.gpu(0) if target == "cuda" else tvm.cl(0)
        # launch the kernel.
        res_np = np.zeros(
            (n_num_step, n_batch_size, n_num_hidden)).astype("float32")
        Whh_np = np.zeros((n_num_hidden, n_num_hidden)).astype("float32")
        Whh_np[:] = 2.0 / n_num_hidden
        Whh_np[:, n_num_hidden // 2:] = 0

        res_a = tvm.nd.array(res_np, ctx)
        Whh_a = tvm.nd.array(Whh_np, ctx)
        # Skip first pass as it is compilation
        f(res_a, Whh_a)
        ctx.sync()
        # measure time cost of second step.
        tstart = time.time()
        f(res_a, Whh_a)
        ctx.sync()
        tgap = time.time() - tstart
        print("Time cost=%g" % tgap)
        # correctness
        if not SKIP_CHECK:
            res_gpu = res_a.asnumpy()
            res_cmp = np.ones_like(res_np).astype("float64")
            Whh_np = Whh_np.astype("float64")
            for t in range(1, n_num_step):
                res_cmp[t][:] = np.dot(res_cmp[t - 1], Whh_np)
            for i in range(n_num_step):
                for j in range(n_num_hidden):
                    if abs(res_cmp[i, 0, j] - res_gpu[i, 0, j]) > 1e-5:
                        print("%d, %d: %g vs %g" %
                              (i, j, res_cmp[i, 0, j], res_gpu[i, 0, j]))
            np.testing.assert_allclose(res_gpu, res_cmp, rtol=1e-3)

    check_device("cuda")
Example no. 38
def test_multibox_prior():
    def get_ref_result(dshape,
                       sizes=(1.0, ),
                       ratios=(1.0, ),
                       steps=(-1.0, -1.0),
                       offsets=(0.5, 0.5),
                       clip=True):
        in_height = dshape[2]
        in_width = dshape[3]
        num_sizes = len(sizes)
        num_ratios = len(ratios)
        size_ratio_concat = sizes + ratios
        steps_h = steps[0] if steps[0] > 0 else 1.0 / in_height
        steps_w = steps[1] if steps[1] > 0 else 1.0 / in_width
        offset_h = offsets[0]
        offset_w = offsets[1]

        oshape = (1, in_height * in_width * (num_sizes + num_ratios - 1), 4)
        dtype = "float32"
        np_out = np.zeros(oshape).astype(dtype)

        for i in range(in_height):
            center_h = (i + offset_h) * steps_h
            for j in range(in_width):
                center_w = (j + offset_w) * steps_w
                for k in range(num_sizes + num_ratios - 1):
                    w = size_ratio_concat[k] * in_height / in_width / 2.0 if k < num_sizes else \
                        size_ratio_concat[0] * in_height / in_width * math.sqrt(size_ratio_concat[k + 1]) / 2.0
                    h = size_ratio_concat[k] / 2.0 if k < num_sizes else \
                        size_ratio_concat[0] / math.sqrt(size_ratio_concat[k + 1]) / 2.0
                    count = i * in_width * (num_sizes + num_ratios - 1) + j * (
                        num_sizes + num_ratios - 1) + k
                    np_out[0][count][0] = center_w - w
                    np_out[0][count][1] = center_h - h
                    np_out[0][count][2] = center_w + w
                    np_out[0][count][3] = center_h + h
        if clip:
            np_out = np.clip(np_out, 0, 1)

        return np_out

    def verify_multibox_prior(x,
                              dshape,
                              ref_res,
                              sizes=(1.0, ),
                              ratios=(1.0, ),
                              steps=(-1.0, -1.0),
                              offsets=(0.5, 0.5),
                              clip=True,
                              check_size=False,
                              check_type_only=False):

        z = relay.vision.multibox_prior(x, sizes, ratios, steps, offsets, clip)
        zz = run_infer_type(z)
        if check_size:
            assert "sizes=" in z.astext()
        assert zz.checked_type == relay.TensorType(
            (1, dshape[2] * dshape[3] * (len(sizes) + len(ratios) - 1), 4),
            "float32")

        if check_type_only:
            return

        data = np.random.uniform(low=-1, high=1, size=dshape).astype("float32")
        func = relay.Function([x], z)
        func = run_infer_type(func)
        for target, ctx in ctx_list():
            intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
            op_res1 = intrp1.evaluate(func)(data)
            tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5)
            intrp2 = relay.create_executor("debug", ctx=ctx, target=target)
            op_res2 = intrp2.evaluate(func)(data)
            tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5)

    sizes = (0.3, 1.5, 0.7)
    ratios = (1.3, 2.4)
    steps = (2.0, 1.5)
    offsets = (0.2, 0.3)
    dshape = (1, 3, 56, 56)
    ref_res = get_ref_result(dshape, sizes, ratios, steps, offsets)
    x = relay.var("x", relay.TensorType(dshape, "float32"))
    verify_multibox_prior(x,
                          dshape,
                          ref_res,
                          sizes,
                          ratios,
                          steps,
                          offsets,
                          check_size=True)
    y = relay.var("y", relay.TensorType((tvm.var("n"), 3, 56, 56), "float32"))
    verify_multibox_prior(x,
                          dshape,
                          ref_res,
                          sizes,
                          ratios,
                          steps,
                          offsets,
                          check_size=True,
                          check_type_only=True)

    dshape = (1, 24, 32, 32)
    ref_res = get_ref_result(dshape, clip=False)
    x = relay.var("x", relay.TensorType(dshape, "float32"))
    verify_multibox_prior(x, dshape, ref_res, clip=False)
    y = relay.var("y", relay.TensorType((tvm.var("n"), 24, 32, 32), "float32"))
    verify_multibox_prior(x, dshape, ref_res, clip=False, check_type_only=True)
Example no. 39
def test_non_max_suppression():
    def verify_nms(x0_data,
                   x1_data,
                   dshape,
                   ref_res,
                   ref_indices_res,
                   iou_threshold=0.5,
                   force_suppress=False,
                   top_k=-1,
                   check_type_only=False):
        x0 = relay.var("x0", relay.ty.TensorType(dshape, "float32"))
        x1 = relay.var("x1", relay.ty.TensorType((dshape[0], ), "int32"))
        z = relay.vision.non_max_suppression(x0, x1, max_output_size=-1,
                                             iou_threshold=iou_threshold,
                                             force_suppress=force_suppress,
                                             top_k=top_k, return_indices=False)
        z_indices = relay.vision.non_max_suppression(x0, x1, max_output_size=-1,
                                                     iou_threshold=iou_threshold,
                                                     force_suppress=force_suppress,
                                                     top_k=top_k)
        assert "iou_threshold" in z.astext()
        assert "iou_threshold" in z_indices.astext()
        zz = run_infer_type(z)
        zz_indices = run_infer_type(z_indices)
        assert zz.checked_type == relay.ty.TensorType(dshape, "float32")
        assert zz_indices.checked_type == relay.ty.TensorType(
            (dshape[0], dshape[1]), "int32")

        if check_type_only:
            return

        func = relay.Function([x0, x1], z)
        func = run_infer_type(func)
        func_indices = relay.Function([x0, x1], z_indices)
        func_indices = run_infer_type(func_indices)
        for target, ctx in ctx_list():
            intrp1 = relay.create_executor("graph", ctx=ctx, target=target)
            op_res1 = intrp1.evaluate(func)(x0_data, x1_data)
            op_indices_res1 = intrp1.evaluate(func_indices)(x0_data, x1_data)
            tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5)
            tvm.testing.assert_allclose(op_indices_res1.asnumpy(),
                                        ref_indices_res,
                                        rtol=1e-5)
            intrp2 = relay.create_executor("debug", ctx=ctx, target=target)
            op_res2 = intrp2.evaluate(func)(x0_data, x1_data)
            op_indices_res2 = intrp2.evaluate(func_indices)(x0_data, x1_data)
            tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5)
            tvm.testing.assert_allclose(op_indices_res2.asnumpy(),
                                        ref_indices_res,
                                        rtol=1e-5)

    np_data = np.array([[[0, 0.8, 1, 20, 25, 45], [1, 0.7, 30, 60, 50, 80],
                         [0, 0.4, 4, 21, 19, 40], [2, 0.9, 35, 61, 52, 79],
                         [1, 0.5, 100, 60, 70, 110]]]).astype("float32")
    np_valid_count = np.array([4]).astype("int32")
    np_result = np.array([[[2, 0.9, 35, 61, 52, 79], [0, 0.8, 1, 20, 25, 45],
                           [-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1],
                           [-1, -1, -1, -1, -1, -1]]])
    np_indices_result = np.array([[3, 0, -1, -1, -1]])
    num_anchors = 5

    dshape = (tvm.var("n"), num_anchors, 6)
    verify_nms(np_data,
               np_valid_count,
               dshape,
               np_result,
               np_indices_result,
               force_suppress=True,
               top_k=2,
               check_type_only=True)
    dshape = (1, num_anchors, 6)
    verify_nms(np_data,
               np_valid_count,
               dshape,
               np_result,
               np_indices_result,
               force_suppress=True,
               top_k=2,
               check_type_only=False)

    np_result = np.array([[[2, 0.9, 35, 61, 52, 79], [0, 0.8, 1, 20, 25, 45],
                           [1, 0.7, 30, 60, 50, 80], [-1, -1, -1, -1, -1, -1],
                           [-1, -1, -1, -1, -1, -1]]])
    np_indices_result = np.array([[3, 0, 1, -1, -1]])
    dshape = (tvm.var("n"), num_anchors, 6)
    verify_nms(np_data,
               np_valid_count,
               dshape,
               np_result,
               np_indices_result,
               check_type_only=True)
    dshape = (1, num_anchors, 6)
    verify_nms(np_data,
               np_valid_count,
               dshape,
               np_result,
               np_indices_result,
               top_k=3)
Example no. 40
def lstm():
    if not PERSIST_KERNEL:
        raise ValueError("Non persist LSTM not yet supported")
    num_thread_y = 8
    num_thread_x = 16 * 3 // 2
    num_sm = 24
    n_num_step = 128
    num_step = tvm.var('num_step')
    num_hidden = 1152 // 2  # number of cells
    batch_size = 1  # when the inputs are images, how many images are processed at once
    # Global transition matrix
    # Input hidden channels can be pre-calculated by a gemm
    Xi2h = tvm.placeholder((num_step, batch_size, 4, num_hidden), name="Xi2h")
    # Only handle hidden transition, saves space.
    Wh2h = tvm.placeholder((4, num_hidden, num_hidden), name="Wh2h")
    # h: output hidden state, c: cell state.
    s_state_h = tvm.placeholder((num_step, batch_size, num_hidden))  # h(t-1)
    s_state_c = tvm.placeholder((num_step, batch_size, num_hidden))  # c(t-1)
    s_init_c = tvm.compute((1, batch_size, num_hidden),
                           lambda *i: 0.0,
                           name="init_c")  # 初始c状态为0
    s_init_h = tvm.compute((1, batch_size, num_hidden),
                           lambda *i: 0.0,
                           name="init_h")  # 初始h状态为0
    # LSTM transition
    k = tvm.reduce_axis((0, num_hidden), name="ki2h")
    # H*W, reduced along axis k
    s_h2h = tvm.compute(
        (num_step, batch_size, 4, num_hidden),
        lambda t, i, x, j: tvm.sum(s_state_h[t - 1, i, k] * Wh2h[x, j, k],
                                   axis=k),
        name="s_h2h")
    # Gate rules
    # gates = w*[h(t-1),x(t)]
    gates = tvm.compute(Xi2h.shape,
                        lambda *i: Xi2h(*i) + s_h2h(*i),
                        name="gates")
    # split the 4 gates apart one by one so the tensors drop to 3 dimensions
    gshape = (num_step, batch_size, num_hidden)
    # input gate: in_gate = i(t) = sigmoid(w0*[h(t-1),x(t)])    w0=wi
    in_gate = tvm.compute(gshape,
                          lambda t, i, j: tvm.sigmoid(gates[t, i, 0, j]),
                          name="in_gate")
    # candidate value: in_transform = \tilde{C}(t) = tanh(w1*[h(t-1),x(t)])    w1=wc
    in_transform = tvm.compute(gshape,
                               lambda t, i, j: tvm.tanh(gates[t, i, 1, j]),
                               name="in_transform")
    # forget gate: forget_gate = f(t) = sigmoid(w2*[h(t-1),x(t)])    w2=wf
    forget_gate = tvm.compute(gshape,
                              lambda t, i, j: tvm.sigmoid(gates[t, i, 2, j]),
                              name="forget_gate")
    # output gate: out_gate = o(t) = sigmoid(w3*[h(t-1),x(t)])   w3=wo
    out_gate = tvm.compute(gshape,
                           lambda t, i, j: tvm.sigmoid(gates[t, i, 3, j]),
                           name="out_gate")
    # update the cell state: next_c = c(t) = f(t)*c(t-1) + i(t)*\tilde{C}(t)
    next_c = tvm.compute(
        gshape,
        lambda t, i, j: forget_gate[t, i, j] * s_state_c[
            t - 1, i, j] + in_gate[t, i, j] * in_transform[t, i, j],
        name="next_c")
    # final output: next_h = h(t) = o(t)*tanh(c(t))
    next_h = tvm.compute(
        gshape,
        lambda t, i, j: out_gate[t, i, j] * tvm.tanh(next_c[t, i, j]),
        name="next_h")
    update_c = tvm.compute(gshape, lambda *i: next_c(*i),
                           name="update_c")  # 更新细胞状态
    update_h = tvm.compute(gshape, lambda *i: next_h(*i),
                           name="update_h")  # 更新输出
    # schedule: h and c are both time series, described in tvm with a scan
    scan_h, scan_c = tvm.scan([s_init_h, s_init_c], [update_h, update_c],
                              [s_state_h, s_state_c],
                              inputs=[Xi2h],
                              name="lstm_scan")
    # schedule
    s = tvm.create_schedule(scan_h.op)
    # Inline gate computations
    s[gates].compute_inline()
    s[in_gate].compute_inline()
    s[in_transform].compute_inline()
    s[forget_gate].compute_inline()
    s[out_gate].compute_inline()

    block_x = tvm.thread_axis((0, num_sm), "blockIdx.x")
    thread_x = tvm.thread_axis((0, num_thread_x), "threadIdx.x")
    thread_y = tvm.thread_axis((0, num_thread_y), "threadIdx.y")

    s_state_h_S = s.cache_read(s_state_h, "shared", [s_h2h])
    s_state_c_S = s.cache_read(s_state_c, "shared", [next_c])
    Wh2hL = s.cache_read(Wh2h, "local", [s_h2h])

    ko, ki = s[s_h2h].split(s[s_h2h].op.reduce_axis[0], nparts=num_thread_y)
    s_h2h_rf = s.rfactor(s_h2h, ko)
    s[s_h2h].bind(s[s_h2h].op.reduce_axis[0], thread_y)
    s[s_h2h_rf].compute_at(s[s_h2h], s[s_h2h].op.reduce_axis[0])

    if PERSIST_KERNEL:
        s[scan_h.op].env_threads([block_x, thread_y, thread_x])
        s[Wh2hL].compute_at(s[scan_h.op], thread_x)
    else:
        s[Wh2hL].compute_at(s[s_h2h], s[s_h2h].op.axis[3])

    if UNROLL_WLOAD:
        s[Wh2hL].unroll(Wh2hL.op.axis[0])
        s[Wh2hL].unroll(Wh2hL.op.axis[2])

    s[s_state_h_S].compute_at(s[s_h2h_rf], s[s_h2h_rf].op.axis[3])
    s[s_state_c_S].compute_at(s[scan_h.op], s[scan_h].op.scan_axis)

    for ss in [s_state_h_S]:
        xo, xi = s[ss].split(ss.op.axis[2], factor=num_thread_x * num_thread_y)
        ty, xi = s[ss].split(xi, nparts=num_thread_y)
        tx, xi = s[ss].split(xi, nparts=num_thread_x)
        s[ss].bind(ty, thread_y)
        s[ss].bind(tx, thread_x)

    for init in [s_init_c, s_init_h]:
        bx, xi = s[init].split(init.op.axis[2], nparts=num_sm)
        tx, xi = s[init].split(xi, nparts=num_thread_x)
        s[init].bind(bx, block_x)
        s[init].bind(tx, thread_x)

    s[next_c].set_store_predicate(thread_y.equal(0))
    s[next_h].set_store_predicate(thread_y.equal(0))

    for update in [update_c, update_h]:
        bx, xi = s[update].split(s[update].op.axis[2], nparts=num_sm)
        tx, xi = s[update].split(xi, nparts=num_thread_x)
        s[update].bind(bx, block_x)
        s[update].bind(tx, thread_x)
        s[update].set_store_predicate(thread_y.equal(0))

    # verify we can lower correctly
    def check_device(target):
        num_step = n_num_step
        flstm = tvm.build(s, [Xi2h, Wh2h, scan_h, scan_c], target)
        ctx = tvm.gpu(0) if target == "cuda" else tvm.cl(0)
        # launch the kernel.
        scan_h_np = np.zeros(
            (num_step, batch_size, num_hidden)).astype("float32")
        scan_c_np = np.zeros(
            (num_step, batch_size, num_hidden)).astype("float32")
        Xi2h_np = np.random.normal(size=(num_step, batch_size, 4,
                                         num_hidden)).astype("float32")
        Wh2h_np = np.random.normal(size=(4, num_hidden,
                                         num_hidden)).astype("float32")
        scan_h_a = tvm.nd.array(scan_h_np, ctx)
        scan_c_a = tvm.nd.array(scan_c_np, ctx)
        Xi2h_a = tvm.nd.array(Xi2h_np, ctx)
        Wh2h_a = tvm.nd.array(Wh2h_np, ctx)
        flstm(Xi2h_a, Wh2h_a, scan_h_a, scan_c_a)
        ctx.sync()
        # measure time cost of second step.
        evaluator = flstm.time_evaluator(flstm.entry_name, ctx, 1, repeat=1000)
        eval_result = evaluator(Xi2h_a, Wh2h_a, scan_h_a, scan_c_a)
        print("Time cost=%g" % eval_result.mean)

    # set unroll_explicit for more readable code.
    with tvm.build_config(detect_global_barrier=DETECT_GLOBAL_BARRIER,
                          auto_unroll_max_step=128,
                          unroll_explicit=False):
        check_device("cuda")
Exemplo n.º 41
def test_basic():
    m = tvm.var('m')
    ret = tvm.ir_pass.CanonicalSimplify(tvm.make.Evaluate(m - 1))
    assert str(ret.value) == "(m - 1)"
Exemplo n.º 42
def test_basic():
    a = tvm.var('a')
    b = tvm.var('b')
    c = a + b
    assert str(c) == '(%s + %s)' % (a.name, b.name)
Exemplo n.º 43
def test_stmt():
    x = tvm.make.Evaluate(0)
    tvm.make.For(tvm.var('i'), 0, 1, tvm.stmt.For.Serial, 0, x)
Exemplo n.º 44
def test_add_pipeline():
    n = tvm.var('n')
    A = tvm.placeholder((n, ), name='A')
    B = tvm.placeholder((), name='B')
    C = tvm.compute(A.shape, lambda *i: A(*i) + B(), name='C')
    D = tvm.compute(A.shape, lambda *i: C(*i) + 1, name='D')
    s = tvm.create_schedule(D.op)

    # GPU schedules have to split the axes across blockIdx and threadIdx
    num_thread = 256
    xo, xi = s[C].split(C.op.axis[0], factor=num_thread)
    s[C].bind(xi, tvm.thread_axis("threadIdx.x"))
    s[C].bind(xo, tvm.thread_axis("blockIdx.x"))

    xo, xi = s[D].split(D.op.axis[0], factor=num_thread)
    s[D].bind(xi, tvm.thread_axis("threadIdx.x"))
    s[D].bind(xo, tvm.thread_axis("blockIdx.x"))

    # compile to IR
    s = s.normalize()
    bounds = tvm.schedule.InferBound(s)
    stmt = tvm.schedule.ScheduleOps(s, bounds)
    Ab = tvm.decl_buffer(A.shape, A.dtype, name='A')
    Bb = tvm.decl_buffer(B.shape, B.dtype, name='B')
    Db = tvm.decl_buffer(D.shape, D.dtype, name='D')
    stmt = tvm.ir_pass.LoopPartition(stmt, False)
    stmt = tvm.ir_pass.StorageFlatten(stmt, {A: Ab, B: Bb, D: Db}, 64)
    stmt = tvm.ir_pass.Simplify(stmt)
    fapi = tvm.ir_pass.MakeAPI(stmt, "myadd", [Ab, Bb, Db], 0, True)
    fsplits = [x for x in tvm.ir_pass.SplitHostDevice(fapi)]
    fsplits[0] = tvm.ir_pass.LowerTVMBuiltin(fsplits[0])

    def check_target(device, host="stackvm"):
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            return
        if not tvm.module.enabled(host):
            return
        mhost = tvm.codegen.build_module(fsplits[0], host)
        mdev = tvm.codegen.build_module(fsplits[1:], device)
        mhost.import_module(mdev)
        code = mdev.get_source()
        f = mhost.entry_func
        # launch the kernel.
        n = 1027
        a = tvm.nd.array(np.random.uniform(size=n).astype(Ab.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=()).astype(Bb.dtype), ctx)
        d = tvm.nd.array(np.zeros(n, dtype=Db.dtype), ctx)
        f(a, b, d)
        np.testing.assert_allclose(d.asnumpy(), a.asnumpy() + b.asnumpy() + 1)

    def check_module_save(device, host="stackvm"):
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            return
        if not tvm.module.enabled(host):
            return
        fmt = "ptx" if device == "cuda" else device
        mhost = tvm.codegen.build_module(fsplits[0], host)
        mdev = tvm.codegen.build_module(fsplits[1:], device)
        temp = util.tempdir()
        mpath = temp.relpath("test.%s" % fmt)
        mdev.save(mpath)
        mdev2 = tvm.module.load(mpath)
        mhost.import_module(mdev2)
        f = mhost.entry_func
        # launch the kernel.
        n = 1027
        a = tvm.nd.array(np.random.uniform(size=n).astype(Ab.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=()).astype(Bb.dtype), ctx)
        d = tvm.nd.array(np.zeros(n, dtype=Db.dtype), ctx)
        f(a, b, d)
        np.testing.assert_allclose(d.asnumpy(), a.asnumpy() + b.asnumpy() + 1)

    check_target("cuda", host="stackvm")
    check_target("cuda", host="llvm")
    check_module_save("cuda", host="stackvm")
    check_target("nvptx", host="llvm")
    check_target("vulkan", host="llvm")
    check_target("rocm", host="llvm")
    check_module_save("vulkan", host="stackvm")
Exemplo n.º 45
def test_dtype():
    x = tvm.var('x')
    assert x.dtype == 'int32'
    y = tvm.var('y')
    assert (x > y).dtype == 'uint1'
Exemplo n.º 46
def test_deduce():
    a = tvm.var('a')
    b = tvm.var('b')
    c = tvm.var('c')
    d = tvm.var('d')

    b_s = tvm.arith.IntervalSet(2, 3)
    c_s = tvm.arith.IntervalSet(10, 15)
    d_s = tvm.arith.IntervalSet(-3, -1)
    zero = tvm.const(0, "int32")

    fdiv = tvm.floordiv

    e0 = (-b) * a + c - d
    res0 = tvm.arith.deduce_bound(a, e0 >= 0, {b: b_s, c: c_s, d: d_s}, {})
    ans0 = fdiv(d - c, b * -1)
    assert_expr_equal(res0.max_value, ans0)

    # expression containing variable a is on rhs
    res0 = tvm.arith.deduce_bound(a, zero <= e0, {b: b_s, c: c_s, d: d_s}, {})
    assert_expr_equal(res0.max_value, ans0)

    e0 = d * a + c - d
    res0 = tvm.arith.deduce_bound(a, e0 >= 0, {b: b_s, c: c_s, d: d_s}, {})
    ans0 = fdiv(d - c, d)
    assert_expr_equal(res0.max_value, ans0)

    # expression containing variable a is on rhs
    res0 = tvm.arith.deduce_bound(a, zero <= e0, {b: b_s, c: c_s, d: d_s}, {})
    assert_expr_equal(res0.max_value, ans0)

    e1 = (a * 4 + b < c)
    res1 = tvm.arith.deduce_bound(a, e1, {b: b_s, c: c_s, d: d_s}, {})
    ans1 = fdiv(c - 1 - b, 4)
    assert_expr_equal(res1.max_value, ans1)

    # expression containing variable a is on rhs
    e1 = (c > a * 4 + b)
    res1 = tvm.arith.deduce_bound(a, e1, {b: b_s, c: c_s, d: d_s}, {})
    assert_expr_equal(res1.max_value, ans1)

    e2 = (tvm.max(5, a * 4) < 0)
    res2 = tvm.arith.deduce_bound(a, e2, {b: b_s, c: c_s, d: d_s}, {})
    assert str(res2.max_value) == "neg_inf"
    assert str(res2.min_value) == "pos_inf"

    # expression containing variable a is on rhs
    e2 = (zero < tvm.max(5, a * 4))
    res2 = tvm.arith.deduce_bound(a, e2, {b: b_s, c: c_s, d: d_s}, {})
    assert str(res2.max_value) == "neg_inf"
    assert str(res2.min_value) == "pos_inf"

    e3 = (-b) + a * c - d
    res3 = tvm.arith.deduce_bound(a, e3 >= 0, {
        b: b_s,
        c: c_s,
        d: d_s
    }, {
        b: b_s,
        d: d_s
    })
    ans3 = fdiv(2, c) + 1
    assert str(tvm.ir_pass.Simplify(res3.min_value)) == str(ans3)

    res3 = tvm.arith.deduce_bound(a, zero <= e3, {
        b: b_s,
        c: c_s,
        d: d_s
    }, {
        b: b_s,
        d: d_s
    })
    assert str(tvm.ir_pass.Simplify(res3.min_value)) == str(ans3)

    # tests for `EQ` op
    res4 = tvm.arith.deduce_bound(a, a == b, {}, {})
    assert_expr_equal(res4.max_value, b)
    assert_expr_equal(res4.min_value, b)

    # Unsatisfiable `EQ`, variable as one of the Operand
    res5 = tvm.arith.deduce_bound(a, (a == b), {b: b_s}, {b: b_s})
    assert str(res5.max_value) == "neg_inf"
    assert str(res5.min_value) == "pos_inf"

    # variable `a` on the RHS side
    res6 = tvm.arith.deduce_bound(a, 10 == a, {}, {})
    assert_expr_equal(res6.max_value, 10)
    assert_expr_equal(res6.min_value, 10)

    # Add, Sub in `EQ`
    e4 = ((a - c) == (b + d))
    ans4 = (b + d + c)
    res7 = tvm.arith.deduce_bound(a, e4, {b: b_s, c: c_s, d: d_s}, {})
    assert_expr_equal(res7.max_value, ans4)
    assert_expr_equal(res7.min_value, ans4)

    # Satisfiable Mul in `EQ` with negative sign
    res8 = tvm.arith.deduce_bound(a, (5 * a == -10), {}, {})
    assert_expr_equal(res8.max_value, -2)
    assert_expr_equal(res8.min_value, -2)

    # Unsatisfiable Mul in `EQ`
    e5 = (4 * a == b)
    res9 = tvm.arith.deduce_bound(a, e5, {b: b_s}, {})
    assert str(res9.max_value) == "neg_inf"
    assert str(res9.min_value) == "pos_inf"

    # Unsatisfiable Mul in `EQ`
    res10 = tvm.arith.deduce_bound(
        a, (b * a == b), {b: b_s},
        {})  # simplifier is not able to prove that (b % b == 0)
    assert str(res10.max_value) == "neg_inf"
    assert str(res10.min_value) == "pos_inf"
Exemplo n.º 47
def test_sym_add():
    a = tvm.var('a')
    b = tvm.var('b')
    c = tvm_ext.sym_add(a, b)
    assert c.a == a and c.b == b
Exemplo n.º 48
# As a first step, we need to describe our computation.
# TVM adopts tensor semantics, with each intermediate result
# represented as a multi-dimensional array. The user needs to describe
# the computation rule that generates the tensors.
#
# We first define a symbolic variable n to represent the shape.
# We then define two placeholder Tensors, A and B, with the given shape (n,).
#
# We then describe the result tensor C with a compute operation.
# The compute function takes the shape of the tensor, as well as a lambda
# function that describes the computation rule for each position of the tensor.
#
# No computation happens during this phase, as we are only declaring how
# the computation should be done.
#
n = tvm.var("n")
A = tvm.placeholder((n, ), name='A')
B = tvm.placeholder((n, ), name='B')
C = tvm.compute(A.shape, lambda i: A[i] + B[i], name="C")
print(type(C))

######################################################################
# Schedule the Computation
# ------------------------
# While the above lines describe the computation rule, we can compute
# C in many ways, since the axes of C can be computed in a data-parallel manner.
# TVM asks the user to provide a description of the computation called a schedule.
#
# A schedule is a set of transformations that rewrite
# the loops of the computation in the program.
#
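# A minimal sketch of the schedule step this narration leads into (assuming
# the same n, A, B and C declared above): the default schedule computes C in
# a single serial loop, which tvm.lower lets us inspect.
s = tvm.create_schedule(C.op)
print(tvm.lower(s, [A, B, C], simple_mode=True))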
Exemplo n.º 49
import tvm
import numpy as np

# The same computation can be carried out in many different ways, each with
# different performance; the schedule decides how to compute. A schedule is a
# set of transformations applied to the loops of the computation in the
# program, and it is composed of a group of ops (stages).
# By default, computation runs serially in row-major order.
n = tvm.var('n')
m = tvm.var('m')
A = tvm.placeholder((m, n), name='A')
B = tvm.placeholder((m, n), name='B')
C = tvm.compute((m, n), lambda i, j: A[i, j] * B[i, j], name='C')

s = tvm.create_schedule([C.op])
# lower turns the computation from its definition into a real callable
# function. With the argument `simple_mode=True`, it returns a readable
# C-like statement; we use it here to print the schedule result.
# print(tvm.lower(s, [A, B, C], simple_mode=True))

# A schedule consists of multiple stages; each stage represents one op.
# Each stage provides several scheduling methods.

# split
# splits a given axis into two axes
A = tvm.placeholder((m, ), name='A')
B = tvm.compute((m, ), lambda i: A[i] * 2, name='B')
s = tvm.create_schedule(B.op)
xo, xi = s[B].split(B.op.axis[0], factor=32)
# print(tvm.lower(s, [A, B], simple_mode=True))

s = tvm.create_schedule(B.op)
bx, tx = s[B].split(B.op.axis[0], nparts=32)
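# A quick sketch (using the same A and B declared just above) to see how the
# two split variants differ: `factor=32` fixes the extent of the inner loop,
# while `nparts=32` fixes the extent of the outer loop.
print(tvm.lower(s, [A, B], simple_mode=True))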
Exemplo n.º 50
def dot_16x1x16_int8_int8_int32():
    """
    Int8 dot product by every 4 elements using AVX512 Skylake instructions.
    This function takes two arrays -- data[4] of uint8 datatype and
    kernel[16][4] of int8 datatype -- and computes a dot product of data[4]
    with every 4 elements of kernels, resulting in output[16] of int32 datatype.
    The pseudo code is as follows.
    .. code-block:: c
        void dot_16x1x16_int8_int8_int32(int8 data[4], int8 kernel[16][4],
                int32 output[16]){
            for (int i = 0; i < 16; i++){
                output[i] = 0;
                for (int k = 0; k < 4; k++){
                    output[i] += data[k] * kernel[i][k];
                }
            }
        }

    Physically, the kernel array sits in an AVX512 vector register and
    the data[4] is broadcasted to another AVX512 vector register. This
    function returns a TensorIntrin that can be used to tensorize
    a schedule.

    Returns
    -------
    intrin : TensorIntrin
        The Skylake int8 TensorIntrin that can be used in tensorizing schedule
    """

    int32_lanes = 16  # 16 int32 lanes in AVX512
    num_int8_elements = 4  # 4 int8 elements in int32
    data = tvm.placeholder((num_int8_elements, ), dtype='uint8', name='data')
    kernel = tvm.placeholder((int32_lanes, num_int8_elements),
                             dtype='int8',
                             name='kernel')
    k = tvm.reduce_axis((0, num_int8_elements), name='k')
    C = tvm.compute(
        (int32_lanes, ),
        lambda i: tvm.sum(
            data[k].astype('int32') * kernel[i, k].astype('int32'), axis=k),
        name="C")

    a_buffer = tvm.decl_buffer(data.shape,
                               dtype='uint8',
                               name="a_buffer",
                               offset_factor=1,
                               strides=[1])
    b_buffer = tvm.decl_buffer(kernel.shape,
                               dtype='int8',
                               name="b_buffer",
                               offset_factor=1,
                               strides=[tvm.var('ldw'), 1])

    def _intrin_func(ins, outs):
        def _instr(index):
            ib = tvm.ir_builder.create()
            if index == 1:
                ib.emit(outs[0].vstore(0, tvm.const(0, 'int32x16')))
                return ib.get()

            a_int8 = ins[0].vload([0], "uint8x4")
            re_int32 = tvm.call_pure_intrin('int32', 'reinterpret', a_int8)
            vec_ai32 = re_int32.astype('int32x16')
            vec_a = tvm.call_pure_intrin('int8x64', 'reinterpret', vec_ai32)
            vec_b = ins[1].vload([0, 0], "int8x64")
            vec_one = tvm.const(1, "int16x32")
            pair_reduction = tvm.call_llvm_intrin(
                'int16x32', 'llvm.x86.avx512.pmaddubs.w.512',
                tvm.const(0, 'uint32'), vec_a, vec_b)
            quad_reduction = tvm.call_llvm_intrin(
                'int32x16', 'llvm.x86.avx512.pmaddw.d.512',
                tvm.const(0, 'uint32'), pair_reduction, vec_one)
            if index == 0:
                ib.emit(outs[0].vstore(0, quad_reduction))
            else:
                ib.emit(outs[0].vstore(
                    0, quad_reduction + outs[0].vload([0], 'int32x16')))
            return ib.get()

        # body, reset, update
        return _instr(0), _instr(1), _instr(2)

    with tvm.build_config(offset_factor=1, partition_const_loop=True):
        return tvm.decl_tensor_intrin(C.op,
                                      _intrin_func,
                                      binds={
                                          data: a_buffer,
                                          kernel: b_buffer
                                      })
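# A hedged usage sketch (not part of the original snippet): the returned
# TensorIntrin is meant to be applied with `tensorize` to a matching
# (16, 4) int8 reduction. The shapes and names below are illustrative
# assumptions.
import tvm

M = 64  # hypothetical number of output rows
data_t = tvm.placeholder((M, 4), dtype='uint8', name='data_t')
kernel_t = tvm.placeholder((16, 4), dtype='int8', name='kernel_t')
rk = tvm.reduce_axis((0, 4), name='rk')
out = tvm.compute(
    (M, 16),
    lambda i, j: tvm.sum(data_t[i, rk].astype('int32') *
                         kernel_t[j, rk].astype('int32'), axis=rk),
    name='out')
s = tvm.create_schedule(out.op)
# Tensorize the 16-wide output axis; the 4-element reduction nested
# inside is absorbed by the intrinsic.
s[out].tensorize(s[out].op.axis[1], dot_16x1x16_int8_int8_int32())
print(tvm.lower(s, [data_t, kernel_t, out], simple_mode=True))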
Exemplo n.º 51
def test_gemm():
    # graph
    nn = 1024
    n = tvm.var('n')
    n = tvm.convert(nn)
    m = n
    l = n
    A = tvm.placeholder((n, l), name='A')
    B = tvm.placeholder((m, l), name='B')
    k = tvm.reduce_axis((0, l), name='k')
    C = tvm.compute((n, m),
                    lambda ii, jj: tvm.sum(A[ii, k] * B[jj, k], axis=k),
                    name='CC')
    # schedule
    s = tvm.create_schedule(C.op)
    xtile, ytile = 32, 32
    scale = 8
    num_thread = 8
    block_factor = scale * num_thread
    block_x = tvm.thread_axis("blockIdx.x")
    thread_x = tvm.thread_axis("threadIdx.x")
    block_y = tvm.thread_axis("blockIdx.y")
    thread_y = tvm.thread_axis("threadIdx.y")

    CC = s.cache_write(C, "local")
    AA = s.cache_read(A, "shared", [CC])
    BB = s.cache_read(B, "shared", [CC])
    by, yi = s[C].split(C.op.axis[0], factor=block_factor)
    bx, xi = s[C].split(C.op.axis[1], factor=block_factor)
    s[C].reorder(by, bx, yi, xi)
    s[C].bind(by, block_y)
    s[C].bind(bx, block_x)
    ty, yi = s[C].split(yi, nparts=num_thread)
    tx, xi = s[C].split(xi, nparts=num_thread)
    s[C].reorder(ty, tx, yi, xi)
    s[C].bind(ty, thread_y)
    s[C].bind(tx, thread_x)
    yo, xo = CC.op.axis
    s[CC].reorder(k, yo, xo)

    s[CC].compute_at(s[C], tx)
    s[AA].compute_at(s[CC], k)
    s[BB].compute_at(s[CC], k)

    ty, xi = s[AA].split(s[AA].op.axis[0], nparts=num_thread)
    tx, xi = s[AA].split(xi, nparts=num_thread)
    s[AA].bind(ty, thread_y)
    s[AA].bind(tx, thread_x)

    ty, xi = s[BB].split(s[BB].op.axis[0], nparts=num_thread)
    tx, xi = s[BB].split(xi, nparts=num_thread)
    s[BB].bind(ty, thread_y)
    s[BB].bind(tx, thread_x)

    # lowering test
    s = s.normalize()

    # one line to build the function.
    def check_device(device):
        if not tvm.module.enabled(device):
            print("skip because %s is not enabled.." % device)
            return

        f = tvm.build(s, [A, B, C], device)
        ctx = tvm.context(device, 0)
        # launch the kernel.
        n = nn
        m = n
        l = n
        a_np = np.random.uniform(size=(n, l)).astype(A.dtype)
        b_np = np.random.uniform(size=(m, l)).astype(B.dtype)
        a = tvm.nd.array(a_np, ctx)
        b = tvm.nd.array(b_np, ctx)
        c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx)
        ftimer = f.time_evaluator(f.entry_name, ctx, number=1)
        tcost = ftimer(a, b, c).mean
        print("%s: exec=%g sec/op" % (ctx, tcost))
        np.testing.assert_allclose(c.asnumpy(),
                                   np.dot(a_np, b_np.T),
                                   rtol=1e-5)

    check_device("nvptx -mcpu=sm_20")
    check_device("metal")
    check_device("opencl")
    check_device("cuda")
Exemplo n.º 52
def test_canonical_mixed():
    ck = CanonicalChecker()
    x = tvm.var("x")
    z = tvm.const(3, "int32")
    ck.verify(x / (z * z) - x / (z * z), 0)
    ck.verify(x / (z + z) - x / (z + z), 0)
Exemplo n.º 53
def single_lstm():
    num_gate = 4
    hidden_size = tvm.var('hidden_size')
    batch_size = tvm.var('batch_size')
    input_size = tvm.var('input_size')

    # Operations of a single LSTM block, without unrolling
    # '*'   denotes a linear transformation
    # '(*)' denotes elementwise multiplication
    # F_t = sigmoid( W_f * x_t + R_f * h_t-1 + b_f )
    # I_t = sigmoid( W_i * x_t + R_i * h_t-1 + b_i )
    # O_t = sigmoid( W_o * x_t + R_o * h_t-1 + b_o )
    # C'_t = tanh( W_c * x_t + R_c * h_t-1 + b_c )
    # C_t = F_t (*) C_t-1 + I_t (*) C'_t
    # h_t = O_t (*) tanh( C_t )

    # Global transition matrix

    # input X[0..t-1]
    X = tvm.placeholder((batch_size, input_size), name="X")
    Prev_h = tvm.placeholder((batch_size, hidden_size), name="Prev_h")
    Prev_c = tvm.placeholder((batch_size, hidden_size), name="Prev_c")

    # Parameters
    # Weight matrices [W_i, W_f, W_o, W_c]: 4 * hidden_size * input_size
    # Bias: 4 * hidden_size
    Wi2h = tvm.placeholder((num_gate, hidden_size, input_size), name="Wi2h")
    Bi2h = tvm.placeholder((num_gate, hidden_size), name="Bi2h")

    # Weight matrices [R_i, R_f, R_o, R_c]: 4 * hidden_size * hidden_size
    # Only handles the hidden transition, which saves space.
    Wh2h = tvm.placeholder((num_gate, hidden_size, hidden_size), name="Wh2h")
    Bh2h = tvm.placeholder((num_gate, hidden_size), name="Bh2h")

    # LSTM transition
    # [W_i, W_f, W_o, W_c] * X_t: 4 * num_hidden
    l = tvm.reduce_axis((0, input_size), name="li2h")
    i2h = tvm.compute((batch_size, num_gate, hidden_size),
                      lambda i, x, j: tvm.sum(X[i, l] * Wi2h[x, j, l], axis=l),
                      name="i2h")

    # [R_i, R_f, R_o, R_c] * h_t-1: 4 * hidden_size
    # R: hidden_size * hidden_size, h: hidden_size * 1
    k = tvm.reduce_axis((0, hidden_size), name="ki2h")
    h2h = tvm.compute(
        (batch_size, num_gate, hidden_size),
        lambda i, x, j: tvm.sum(Prev_h[i, k] * Wh2h[x, j, k], axis=k),
        name="h2h")

    gates = tvm.compute(
        (batch_size, num_gate, hidden_size),
        lambda i, j, k: i2h[i, j, k] + h2h[i, j, k] + Bi2h[j, k] + Bh2h[j, k],
        name="gates")
    gshape = (batch_size, hidden_size)
    in_gate = tvm.compute(gshape,
                          lambda i, j: tvm.sigmoid(gates[i, 0, j]),
                          name="in_gate")
    forget_gate = tvm.compute(gshape,
                              lambda i, j: tvm.sigmoid(gates[i, 1, j]),
                              name="forget_gate")
    out_gate = tvm.compute(gshape,
                           lambda i, j: tvm.sigmoid(gates[i, 2, j]),
                           name="out_gate")
    in_transform = tvm.compute(gshape,
                               lambda i, j: tvm.tanh(gates[i, 3, j]),
                               name="in_transform")

    # C_t = F_t o C_t-1 + I_t o C'_t
    state_c = tvm.compute((batch_size, hidden_size),
                          lambda i, j: forget_gate[i, j] * Prev_c[i, j] +
                          in_gate[i, j] * in_transform[i, j],
                          name="state_c")
    # h_t = O_t o tanh( C_t )
    # state_h = tvm.compute((batch_size, hidden_size),
    #    lambda i, j: out_gate[i, j] * tvm.tanh(state_c[i, j]), name="state_h")
    out_c, out_h = tvm.compute(
        (batch_size, hidden_size),
        lambda i, j: (state_c[i, j], out_gate[i, j] * tvm.tanh(state_c[i, j])),
        name="outputs_c_h")
    # schedule
    s = tvm.create_schedule(out_h.op)
    print(
        tvm.lower(s, [X, Prev_h, Prev_c, Wi2h, Bi2h, Wh2h, Bh2h, out_c, out_h],
                  simple_mode=True))
    lstm = tvm.build(s,
                     [X, Prev_h, Prev_c, Wi2h, Bi2h, Wh2h, Bh2h, out_c, out_h],
                     name="single_lstm")
    print(lstm)

    lstm.save("remy_single_lstm.o")
    print(lstm.imported_modules)
    cc.create_shared("remy_single_lstm.so", ["remy_single_lstm.o"])
Exemplo n.º 54
import tvm
import numpy as np

m = tvm.var('m')
n = tvm.var('n')
X = tvm.placeholder((m, n), name='X')
s_state = tvm.placeholder((m, n))
s_init = tvm.compute((1, n), lambda _, i: X[0, i])
s_update = tvm.compute((m, n), lambda t, i: s_state[t - 1, i] + X[t, i])
s_scan = tvm.scan(s_init, s_update, s_state, inputs=[X])

# Schedule the Scan Cell
s = tvm.create_schedule(s_scan.op)
num_thread = 256
block_x = tvm.thread_axis('blockIdx.x')
thread_x = tvm.thread_axis('threadIdx.x')
xo, xi = s[s_init].split(s_init.op.axis[1], factor=num_thread)
s[s_init].bind(xo, block_x)
s[s_init].bind(xi, thread_x)
xo, xi = s[s_update].split(s_update.op.axis[1], factor=num_thread)
s[s_update].bind(xo, block_x)
s[s_update].bind(xi, thread_x)
print(tvm.lower(s, [X, s_scan], simple_mode=True))

# Build and Verify
f_scan = tvm.build(s, [X, s_scan], 'cuda', name='my_scan')
ctx = tvm.gpu(0)
n = 1024
m = 10
a_np = np.random.uniform(size=(m, n)).astype(s_scan.dtype)
a = tvm.nd.array(a_np, ctx=ctx)
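# The snippet above stops before the output buffer is allocated and the
# kernel launched; a hedged completion sketch (the scan computes a running
# cumulative sum over the time axis, so np.cumsum gives the reference):
b = tvm.nd.array(np.zeros((m, n), dtype=s_scan.dtype), ctx)
f_scan(a, b)
np.testing.assert_allclose(b.asnumpy(), np.cumsum(a_np, axis=0))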
Exemplo n.º 55
from __future__ import absolute_import, print_function

import tvm
import numpy as np

N = tvm.var('N') # Data set size
V = tvm.var('V') # Feature number
C = tvm.var('C') # Center number

data = tvm.placeholder((N, V), name='data')
center = tvm.placeholder((C, V), name='center')

# === Start computation
# Compute distances
rv = tvm.reduce_axis((0, V), name='rv')
dis = tvm.compute((N, C), lambda n, c: tvm.sum(
    (data[n, rv]-center[c, rv]).astype('float64')*
    (data[n, rv]-center[c, rv]).astype('float64'), axis=rv),
    name='dis')

rc = tvm.reduce_axis((0, C), name='rc')
mse_n = tvm.compute((N,), lambda n: tvm.sum(dis[n, rc], axis=rc), name='mse_n')
rn = tvm.reduce_axis((0, N), name='rn')
mse = tvm.compute((1,), lambda i: tvm.sum(mse_n[rn], axis=rn), name='mse')

# === End computation

# Scheduling
s = tvm.create_schedule(mse.op)

# Compilation
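# The original snippet stops at the compilation step; a hedged sketch of how
# it might continue (the LLVM CPU target and the sample sizes are assumptions):
fmse = tvm.build(s, [data, center, mse], target='llvm', name='kmeans_mse')
ctx = tvm.cpu(0)
data_np = np.random.uniform(size=(64, 8)).astype(data.dtype)
center_np = np.random.uniform(size=(4, 8)).astype(center.dtype)
out = tvm.nd.array(np.zeros((1,), dtype=mse.dtype), ctx)
fmse(tvm.nd.array(data_np, ctx), tvm.nd.array(center_np, ctx), out)
print(out.asnumpy())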
Exemplo n.º 56
def test_make():
    x = tvm.const(1, "int32")
    y = tvm.var("x")
    z = x + y
    assert isinstance(tvm.max(x, y), tvm.expr.Max)
    assert isinstance(tvm.min(x, y), tvm.expr.Min)
Exemplo n.º 57
def test_vector_simplify():
    ck = RewriteChecker()
    x, y, z = tvm.var("x"), tvm.var("y"), tvm.var("z")
    # Add rules
    ck.verify(
        tvm.expr.Ramp(x, 1, 4) + tvm.expr.Ramp(y, 2, 4),
        tvm.expr.Ramp(x + y, 3, 4))
    ck.verify(tvm.expr.Ramp(x, 1, 2) + y, tvm.expr.Ramp(x + y, 1, 2))
    ck.verify(y + tvm.expr.Ramp(x, 1, 2), tvm.expr.Ramp(y + x, 1, 2))
    ck.verify(
        y.astype("int32x2") + x.astype("int32x2"), (y + x).astype("int32x2"))
    # Sub rules
    ck.verify(
        tvm.expr.Ramp(x, 4, 4) - tvm.expr.Ramp(y, 2, 4),
        tvm.expr.Ramp(x - y, 2, 4))
    ck.verify(tvm.expr.Ramp(x, 1, 2) - y, tvm.expr.Ramp(x - y, 1, 2))
    ck.verify(y - tvm.expr.Ramp(x, 1, 2), tvm.expr.Ramp(y - x, -1, 2))
    ck.verify(
        y.astype("int32x2") - x.astype("int32x2"), (y - x).astype("int32x2"))

    # Mul rules
    ck.verify(
        y.astype("int32x2") * x.astype("int32x2"), (y * x).astype("int32x2"))
    ck.verify(tvm.expr.Ramp(x, 4, 4) * 2, tvm.expr.Ramp(x * 2, 8, 4))
    ck.verify(2 * tvm.expr.Ramp(x, 4, 4), tvm.expr.Ramp(x * 2, 8, 4))

    ## Div rules
    ck.verify(
        y.astype("int32x2") / x.astype("int32x2"), (y / x).astype("int32x2"))
    ck.verify(tvm.expr.Ramp(x, 4, 4) / 2, tvm.expr.Ramp(x / 2, 2, 4))
    ck.analyzer.update(x, tvm.arith.ConstIntBound(0, 1000), override=True)
    ck.verify(tvm.expr.Ramp(x * 8 + 1, 1, 4) / 8, (x).astype("int32x4"))
    ck.verify(
        tvm.expr.Ramp(x * 8 + 15, 1, 4) / 8,
        tvm.expr.Ramp(x * 8 + 15, 1, 4) / 8)

    ## Mod rules
    ck.verify(
        y.astype("int32x2") % x.astype("int32x2"), (y % x).astype("int32x2"))
    ck.verify(tvm.expr.Ramp(x, 4, 4) % 2, tvm.expr.Broadcast(x % 2, 4))
    ck.analyzer.update(x, tvm.arith.ConstIntBound(0, 1000), override=True)
    ck.verify(tvm.expr.Ramp(x * 8 + 1, 1, 4) % 8, tvm.expr.Ramp(1, 1, 4))
    ck.verify(tvm.expr.Ramp(x * 8 + 1, 15, 4) % 8, tvm.expr.Ramp(1, 15, 4) % 8)

    # Min/Max rules
    vx = tvm.var("vx", dtype="int32x2")
    vc = tvm.var("vc", dtype="uint1")
    ck.verify(tvm.min(y.astype("int32x2"), x.astype("int32x2")),
              tvm.min(y, x).astype("int32x2"))
    ck.verify(tvm.min(tvm.min(vx, y.astype("int32x2")), x.astype("int32x2")),
              tvm.min(vx,
                      tvm.min(y, x).astype("int32x2")))
    ck.verify(tvm.max(y.astype("int32x2"), x.astype("int32x2")),
              tvm.max(y, x).astype("int32x2"))
    ck.verify(tvm.max(tvm.max(vx, y.astype("int32x2")), x.astype("int32x2")),
              tvm.max(vx,
                      tvm.max(y, x).astype("int32x2")))

    ## Logical rules
    ck.verify(
        y.astype("int32x2").equal(x.astype("int32x2")),
        (y.equal(x)).astype("uint1x2"))
    ck.verify(tvm.expr.NE(y.astype("int32x2"), (x.astype("int32x2"))),
              (tvm.expr.NE(y, x)).astype("uint1x2"))
    ck.verify(
        y.astype("int32x2") > x.astype("int32x2"), (x < y).astype("uint1x2"))
    ck.verify(
        y.astype("int32x2") >= x.astype("int32x2"), (x <= y).astype("uint1x2"))
    ck.verify(
        y.astype("int32x2") < x.astype("int32x2"), (y < x).astype("uint1x2"))
    ck.verify(
        y.astype("int32x2") <= x.astype("int32x2"), (y <= x).astype("uint1x2"))
    ck.verify(
        tvm.expr.And(
            y.astype("int32x2") <= x.astype("int32x2"), vc.astype("uint1x2")),
        (tvm.expr.And(y <= x, vc)).astype("uint1x2"))
    ck.verify(
        tvm.expr.Or(
            y.astype("int32x2") <= x.astype("int32x2"), vc.astype("uint1x2")),
        (tvm.expr.Or(y <= x, vc)).astype("uint1x2"))
Exemplo n.º 58
    lstm = tvm.build(s,
                     [X, Prev_h, Prev_c, Wi2h, Bi2h, Wh2h, Bh2h, out_c, out_h],
                     name="single_lstm")
    print(lstm)

    lstm.save("remy_single_lstm.o")
    print(lstm.imported_modules)
    cc.create_shared("remy_single_lstm.so", ["remy_single_lstm.o"])


if __name__ == "__main__":
    #single_lstm()
    num_gate = 4
    batch_size = 1
    hidden_size = 2
    input_size = tvm.var('input_size')

    lstm = tvm.module.load("./remy_single_lstm.so")

    x_np = np.array([[5]], dtype='float32')
    Wi2h_np = np.array([[[1], [3]], [[-5], [7]], [[1], [1]], [[1], [1]]],
                       dtype='float32')
    Bi2h_np = np.array([[0, 0], [0, 0], [0, 0], [0, 0]], dtype='float32')
    Wh2h_np = np.array([[[0, 0], [0, 0]], [[0, 0], [0, 0]], [[0, 0], [0, 0]],
                        [[0, 0], [0, 0]]],
                       dtype='float32')
    Bh2h_np = np.array([[0, 0], [0, 0], [0, 0], [0, 0]], dtype='float32')
    scan_h_np = np.zeros(shape=(batch_size, hidden_size)).astype("float32")
    scan_c_np = np.zeros(shape=(batch_size, hidden_size)).astype("float32")
    x = tvm.nd.array(x_np)
    Wi2h = tvm.nd.array(Wi2h_np)
Exemplo n.º 59
def test_bound():
    m = tvm.var('m')
    vrange = tvm.convert(
        {m: tvm.Range(tvm.const(0, "int32"), tvm.const(10, "int32"))})
    ret = tvm.ir_pass.Simplify(m % 10, vrange)
    assert ret == m
def test_deduce():
    a = tvm.var('a')
    b = tvm.var('b')
    c = tvm.var('c')
    d = tvm.var('d')

    b_s = tvm.arith.intset_interval(2, 3)
    c_s = tvm.arith.intset_interval(10, 15)
    d_s = tvm.arith.intset_interval(-3, -1)
    zero = tvm.const(0, "int32")

    e0 = (-b) * a + c - d
    res0 = tvm.arith.DeduceBound(a, e0 >= 0, {b: b_s, c: c_s, d: d_s}, {})
    ans0 = ((d - c) / (b * -1))
    assert str(tvm.ir_pass.Simplify(res0.max())) == str(ans0)

    # expression containing variable a is on rhs
    res0 = tvm.arith.DeduceBound(a, zero <= e0, {b: b_s, c: c_s, d: d_s}, {})
    assert str(tvm.ir_pass.Simplify(res0.max())) == str(ans0)

    e0 = d * a + c - d
    res0 = tvm.arith.DeduceBound(a, e0 >= 0, {b: b_s, c: c_s, d: d_s}, {})
    ans0 = ((0 - c) / d + 1)
    assert str(tvm.ir_pass.Simplify(res0.max())) == str(ans0)

    # expression containing variable a is on rhs
    res0 = tvm.arith.DeduceBound(a, zero <= e0, {b: b_s, c: c_s, d: d_s}, {})
    assert str(tvm.ir_pass.Simplify(res0.max())) == str(ans0)

    e1 = (a * 4 + b < c)
    res1 = tvm.arith.DeduceBound(a, e1, {b: b_s, c: c_s, d: d_s}, {})
    ans1 = (((c - b) + -1) / 4)
    assert str(tvm.ir_pass.Simplify(res1.max())) == str(ans1)

    # expression containing variable a is on rhs
    e1 = (c > a * 4 + b)
    res1 = tvm.arith.DeduceBound(a, e1, {b: b_s, c: c_s, d: d_s}, {})
    assert str(tvm.ir_pass.Simplify(res1.max())) == str(ans1)

    e2 = (tvm.max(5, a * 4) < 0)
    res2 = tvm.arith.DeduceBound(a, e2, {b: b_s, c: c_s, d: d_s}, {})
    assert str(res2.max()) == "neg_inf"
    assert str(res2.min()) == "pos_inf"

    # expression containing variable a is on rhs
    e2 = (zero < tvm.max(5, a * 4))
    res2 = tvm.arith.DeduceBound(a, e2, {b: b_s, c: c_s, d: d_s}, {})
    assert str(res2.max()) == "neg_inf"
    assert str(res2.min()) == "pos_inf"

    e3 = (-b) + a * c - d
    res3 = tvm.arith.DeduceBound(a, e3 >= 0, {
        b: b_s,
        c: c_s,
        d: d_s
    }, {
        b: b_s,
        d: d_s
    })
    ans3 = 2 / c + 1
    assert str(tvm.ir_pass.Simplify(res3.min())) == str(ans3)

    res3 = tvm.arith.DeduceBound(a, zero <= e3, {
        b: b_s,
        c: c_s,
        d: d_s
    }, {
        b: b_s,
        d: d_s
    })
    assert str(tvm.ir_pass.Simplify(res3.min())) == str(ans3)