Example #1
def test_add_pipeline():
    nn = 64
    max_threads = 4
    n = tvm.convert(nn)
    A = tvm.placeholder((n, ), name='A')

    def extern_generator(ins, outs):
        """Manually write the IR for the extern function, add pipeline"""
        ib = tvm.ir_builder.create()
        with ib.for_range(0, (n + 1) // 2) as i:
            ib.emit(outs[0].vstore(
                i * 2,
                ins[0].vload(i * 2, "float32x2") + tvm.const(1, "float32x2")))
        return ib.get()

    def extern_generator_gpu(ins, outs):
        """Manually write the IR for the extern function, add pipeline"""
        ib = tvm.ir_builder.create()
        bx = tvm.thread_axis("blockIdx.x")
        tx = tvm.thread_axis("threadIdx.x")
        ib.scope_attr(bx, "thread_extent",
                      (nn + max_threads - 1) // max_threads)
        ib.scope_attr(tx, "thread_extent", max_threads)
        idx = bx.var * max_threads + tx.var
        with ib.if_scope(ib.likely(idx < n)):
            ib.emit(outs[0].vstore(
                idx * 2, ins[0].vload(idx * 2, "float32x2") +
                tvm.const(1, "float32x2")))
        return ib.get()

    C_cpu = tvm.extern(A.shape, [A], extern_generator, name='C')
    C_gpu = tvm.extern(A.shape, [A], extern_generator_gpu, name='C')
    s_cpu = tvm.create_schedule(C_cpu.op)
    s_gpu = tvm.create_schedule(C_gpu.op)
    print(tvm.lower(s_cpu, [A, C_cpu], simple_mode=True))
    print(tvm.lower(s_gpu, [A, C_gpu], simple_mode=True))

    def check_target(target):
        if not tvm.module.enabled(target):
            return
        s = s_gpu if target in ['opencl', 'cuda'] else s_cpu
        C = C_gpu if target in ['opencl', 'cuda'] else C_cpu
        # build and invoke the kernel.
        f = tvm.build(s, [A, C], target)
        ctx = tvm.context(target, 0)
        # launch the kernel.
        n = nn
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
        f(a, c)
        tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + 1)

    check_target("llvm")
    check_target("opencl")
    check_target("cuda")
Example #2
File: sort.py Project: uwsampl/tvm
def argsort_gpu(data, valid_count, axis=-1, is_ascend=1, dtype="float32", flag=0):
    """Performs sorting along the given axis and returns an array of indicies
    having same shape as an input array that index data in sorted order.

    Parameters
    ----------
    data: tvm.Tensor
        The input array.

    valid_count : tvm.Tensor
        The number of valid elements to be sorted.

    axis : int
        Axis along which to sort the input tensor.

    is_ascend : boolean
        Whether to sort in ascending or descending order.

    flag : boolean
        Whether this argsort is used in the NMS operator.

    Returns
    -------
    out : tvm.Tensor
        The output of this function.
    """
    sorted_data_buf = api.decl_buffer(data.shape, data.dtype, "sorted_data_buf", data_alignment=8)
    sorted_data = identity(data)
    if flag:
        valid_count_buf = api.decl_buffer(valid_count.shape, valid_count.dtype,
                                          "valid_count_buf", data_alignment=4)
        out_buf = api.decl_buffer(data.shape, "int32", "out_buf", data_alignment=4)
        out = tvm.extern([data.shape],
                         [sorted_data, valid_count],
                         lambda ins, outs: sort_nms_ir(
                             ins[0], ins[1], outs[0], axis, is_ascend),
                         dtype="int32",
                         in_buffers=[sorted_data_buf, valid_count_buf],
                         out_buffers=[out_buf],
                         name="argsort_nms_gpu",
                         tag="argsort_nms_gpu")
    else:
        out_buf = api.decl_buffer(data.shape, dtype, "out_buf", data_alignment=8)
        out = tvm.extern([data.shape],
                         [sorted_data],
                         lambda ins, outs: sort_ir(
                             ins[0], outs[0], axis, is_ascend),
                         dtype=dtype,
                         in_buffers=[sorted_data_buf],
                         out_buffers=[out_buf],
                         name="argsort_gpu",
                         tag="argsort_gpu")
    return out
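The in_buffers/out_buffers arguments above bind explicitly declared buffers, including their data alignment, to the extern op. Below is a minimal self-contained sketch of that binding, assuming the same pre-0.7 TVM API used throughout these examples (tvm.decl_buffer is the helper that topi reaches via api.decl_buffer); the copy kernel is purely illustrative:

import numpy as np
import tvm

n = 16
A = tvm.placeholder((n,), name='A')
Ab = tvm.decl_buffer(A.shape, A.dtype, "Ab", data_alignment=8)
Bb = tvm.decl_buffer(A.shape, A.dtype, "Bb", data_alignment=8)

def copy_ir(inp, out):
    # Trivial IR: copy the input buffer to the output element by element.
    ib = tvm.ir_builder.create()
    iptr = ib.buffer_ptr(inp)
    optr = ib.buffer_ptr(out)
    with ib.for_range(0, n, name="i") as i:
        optr[i] = iptr[i]
    return ib.get()

B = tvm.extern(A.shape, [A],
               lambda ins, outs: copy_ir(ins[0], outs[0]),
               in_buffers=[Ab], out_buffers=[Bb],
               name="copy", dtype=A.dtype)
s = tvm.create_schedule(B.op)
f = tvm.build(s, [A, B], "llvm")
a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), tvm.cpu(0))
b = tvm.nd.array(np.zeros(n, dtype=B.dtype), tvm.cpu(0))
f(a, b)
tvm.testing.assert_allclose(b.asnumpy(), a.asnumpy())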
Example #3
def argsort_gpu(data, valid_count=None, axis=-1, is_ascend=1, dtype="float32"):
    """Performs sorting along the given axis and returns an array of indicies
    having same shape as an input array that index data in sorted order.

    Parameters
    ----------
    data: tvm.Tensor
        The input array.

    valid_count : tvm.Tensor, optional
        The number of valid elements to be sorted.

    axis : int, optional
        Axis along which to sort the input tensor.

    is_ascend : boolean, optional
        Whether to sort in ascending or descending order.

    dtype : string, optional
        DType of the output indices.

    Returns
    -------
    out : tvm.Tensor
        The output of this function.
    """
    if valid_count is not None:
        sorted_data = identity(data)
        sorted_data_buf = api.decl_buffer(data.shape, data.dtype, "sorted_data_buf",
                                          data_alignment=8)
        valid_count_buf = api.decl_buffer(valid_count.shape, valid_count.dtype,
                                          "valid_count_buf", data_alignment=4)
        out_buf = api.decl_buffer(data.shape, "int32", "out_buf", data_alignment=4)
        out = tvm.extern([data.shape],
                         [sorted_data, valid_count],
                         lambda ins, outs: sort_nms_ir(
                             ins[0], ins[1], outs[0], axis, is_ascend),
                         dtype="int32",
                         in_buffers=[sorted_data_buf, valid_count_buf],
                         out_buffers=[out_buf],
                         name="argsort_nms_gpu",
                         tag="argsort_nms_gpu")
    else:
        value_buf = api.decl_buffer(data.shape, data.dtype, "value_buf", data_alignment=8)
        indices_buf = api.decl_buffer(data.shape, dtype, "out_buf", data_alignment=8)
        out = tvm.extern([data.shape, data.shape],
                         [data],
                         lambda ins, outs: sort_ir(
                             ins[0], outs[0], axis, is_ascend, indices_out=outs[1]),
                         out_buffers=[value_buf, indices_buf],
                         name="argsort_gpu",
                         tag="argsort_gpu")[1]
    return out
Example #4
File: sort.py Project: bddppq/tvm
def argsort_gpu(data, valid_count, axis=-1, is_ascend=1, dtype="float32", flag=0):
    """Performs sorting along the given axis and returns an array of indicies
    having same shape as an input array that index data in sorted order.

    Parameters
    ----------
    data: tvm.Tensor
        The input array.

    valid_count : tvm.Tensor
        The number of valid elements to be sorted.

    axis : int
        Axis along which to sort the input tensor.

    is_ascend : boolean
        Whether to sort in ascending or descending order.

    flag : boolean
        Whether this argsort is used in the NMS operator.

    Returns
    -------
    out : tvm.Tensor
        The output of this function.
    """
    data_buf = api.decl_buffer(data.shape, data.dtype, "data_buf", data_alignment=8)
    if flag:
        valid_count_buf = api.decl_buffer(valid_count.shape, valid_count.dtype,
                                          "valid_count_buf", data_alignment=4)
        out_buf = api.decl_buffer(data.shape, "int32", "out_buf", data_alignment=4)
        out = tvm.extern([data.shape],
                         [data, valid_count],
                         lambda ins, outs: sort_nms_ir(
                             ins[0], ins[1], outs[0], axis, is_ascend),
                         dtype="int32",
                         in_buffers=[data_buf, valid_count_buf],
                         out_buffers=[out_buf],
                         name="argsort_nms_gpu",
                         tag="argsort_nms_gpu")
    else:
        out_buf = api.decl_buffer(data.shape, dtype, "out_buf", data_alignment=8)
        out = tvm.extern([data.shape],
                         [data],
                         lambda ins, outs: sort_ir(
                             ins[0], outs[0], axis, is_ascend),
                         dtype=dtype,
                         in_buffers=[data_buf],
                         out_buffers=[out_buf],
                         name="argsort_gpu",
                         tag="argsort_gpu")
    return out
Example #5
def test_add_pipeline():
    nn = 64
    max_threads = 4
    n = tvm.convert(nn)
    A = tvm.placeholder((n,), name='A')

    def extern_generator(ins, outs):
        """Manually write the IR for the extern function, add pipeline"""
        ib = tvm.ir_builder.create()
        with ib.for_range(0, (n+1) // 2) as i:
            ib.emit(outs[0].vstore(i*2, ins[0].vload(i*2, "float32x2") + tvm.const(1, "float32x2")))
        return ib.get()

    def extern_generator_gpu(ins, outs):
        """Manually write the IR for the extern function, add pipeline"""
        ib = tvm.ir_builder.create()
        bx = tvm.thread_axis("blockIdx.x")
        tx = tvm.thread_axis("threadIdx.x")
        ib.scope_attr(bx, "thread_extent", (nn+max_threads-1) // max_threads)
        ib.scope_attr(tx, "thread_extent", max_threads)
        idx = bx.var * max_threads + tx.var
        with ib.if_scope(ib.likely(idx < n)):
            ib.emit(outs[0].vstore(idx*2, ins[0].vload(idx*2, "float32x2") + tvm.const(1, "float32x2")))
        return ib.get()

    C_cpu = tvm.extern(A.shape, [A], extern_generator, name='C')
    C_gpu = tvm.extern(A.shape, [A], extern_generator_gpu, name='C')
    s_cpu = tvm.create_schedule(C_cpu.op)
    s_gpu = tvm.create_schedule(C_gpu.op)
    print(tvm.lower(s_cpu, [A, C_cpu], simple_mode=True))
    print(tvm.lower(s_gpu, [A, C_gpu], simple_mode=True))

    def check_target(target):
        if not tvm.module.enabled(target):
            return
        s = s_gpu if target in ['opencl', 'cuda'] else s_cpu
        C = C_gpu if target in ['opencl', 'cuda'] else C_cpu
        # build and invoke the kernel.
        f = tvm.build(s, [A, C], target)
        ctx = tvm.context(target, 0)
        # launch the kernel.
        n = nn
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
        f(a, c)
        tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + 1)
        
    check_target("llvm")
    check_target("opencl")
    check_target("cuda")
Example #6
def multibox_transform_loc_gpu(cls_prob,
                               loc_pred,
                               anchor,
                               clip=True,
                               threshold=0.01,
                               variances=(0.1, 0.1, 0.2, 0.2)):
    """Location transformation for multibox detection

    Parameters
    ----------
    cls_prob : tvm.Tensor
        Class probabilities.

    loc_pred : tvm.Tensor
        Location regression predictions.

    anchor : tvm.Tensor
        Prior anchor boxes.

    clip : boolean
        Whether to clip out-of-boundary boxes.

    threshold : float
        Threshold to be a positive prediction.

    variances : tuple of float
        Variances to be decoded from box regression output.

    Returns
    -------
    ret : tuple of tvm.Tensor, composed of:

    out : tvm.Tensor
        3-D tensor with shape (batch_size, num_anchors, 6)

    valid_count : tvm.Tensor
        1-D tensor with shape (batch_size,), number of valid anchor boxes.
    """
    batch_size = cls_prob.shape[0]
    num_anchors = anchor.shape[1]
    oshape = (batch_size, num_anchors, 6)
    # Define data alignment for intermediate buffer
    valid_count_dtype = "int32"
    valid_count_buf = api.decl_buffer((batch_size, ),
                                      valid_count_dtype,
                                      "valid_count_buf",
                                      data_alignment=4)
    out_buf = api.decl_buffer(oshape,
                              cls_prob.dtype,
                              "out_buf",
                              data_alignment=8)
    valid_count, out = \
        tvm.extern([(batch_size,), oshape],
                   [cls_prob, loc_pred, anchor],
                   lambda ins, outs: transform_loc_ir(
                       ins[0], ins[1], ins[2], outs[0], outs[1], clip, threshold, variances),
                   dtype=[valid_count_dtype, cls_prob.dtype],
                   out_buffers=[valid_count_buf, out_buf],
                   tag="multibox_transform_loc")
    return [out, valid_count]
Example #7
def test_cpu():
    n = 1024
    dtype = "float32"
    A = tvm.placeholder((n,), name='A')
    B = tvm.placeholder((n,), name='B')
    def test_device_ir(A, B, C):
        n = A.shape[0]
        max_threads = 8
        ib = tvm.ir_builder.create()
        Aptr = ib.buffer_ptr(A)
        Bptr = ib.buffer_ptr(B)
        Cptr = ib.buffer_ptr(C)
        with ib.for_range(0, n, name="i") as i:
            Cptr[i] = Aptr[i] + Bptr[i]
        body = ib.get()
        return body
    C = tvm.extern(A.shape, [A, B], lambda ins, outs: test_device_ir(ins[0], ins[1], outs[0]),
                   name="vector_add", dtype=dtype)
    s = tvm.create_schedule(C.op)
    def check_target(target):
        if not tvm.runtime.enabled(target):
            return
        # build and invoke the kernel.
        fadd = tvm.build(s, [A, B, C], target)
        ctx = tvm.context(target, 0)
        # launch the kernel.
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
        fadd(a, b, c)
        tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
    check_target("llvm")
Example #8
def test_sort_np():
    dshape = (1, 2, 3, 4, 5, 6)
    axis = 4
    reduced_shape = (1, 2, 3, 4, 6)
    is_descend = False
    data = tvm.placeholder(dshape, name='data')
    sort_num = tvm.placeholder(reduced_shape, name="sort_num", dtype="int32")
    out = tvm.extern(data.shape, [data, sort_num],
                     lambda ins, outs: tvm.call_packed(
                         "tvm.contrib.sort.argsort", ins[0],
                         ins[1], outs[0], axis, is_descend),
                     dtype='int32', name="sort_tensor")

    ctx = tvm.cpu(0)
    target = "llvm"
    s = tvm.create_schedule(out.op)
    f = tvm.build(s, [data, sort_num, out], target)

    np_data = np.random.uniform(size=dshape)
    np_out = np.argsort(np_data, axis=axis)
    sort_num_input = np.full(reduced_shape, dshape[axis])
    a = tvm.nd.array(np.array(np_data).astype(data.dtype), ctx)
    b = tvm.nd.array(np.array(sort_num_input).astype(sort_num.dtype), ctx)
    c = tvm.nd.array(np.zeros(a.shape, dtype=out.dtype), ctx)
    f(a, b, c)
    tvm.testing.assert_allclose(c.asnumpy(), np_out, rtol=1e-5)
Example #9
def test_sort():
    n = 2
    l = 5
    m = 3
    data = tvm.placeholder((n, l, m), name='data')
    sort_num = tvm.placeholder((n, m), name="sort_num", dtype="int32")
    axis = 1
    is_descend = True
    out = tvm.extern(data.shape, [data, sort_num],
                     lambda ins, outs: tvm.call_packed(
                         "tvm.contrib.sort.argsort", ins[0],
                         ins[1], outs[0], axis, is_descend),
                     dtype='int32', name="sort_tensor")
    input = [[[1, 2, 3], [2, 4.5, 3.5], [1.1, 0.5, 1], [3.2, -5, 0.5], [1.5, 0, 0]],
             [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12], [13, 14, 15]]]
    sort_num_input = [[1, 2, 3], [4, 5, 5]]
    sorted_index = [[[0, 1, 1], [1, 0, 0], [2, 2, 2], [3, 3, 3], [4, 4, 4]],
                    [[3, 4, 4], [2, 3, 3], [1, 2, 2], [0, 1, 1], [4, 0, 0]]]

    ctx = tvm.cpu(0)
    target = "llvm"
    s = tvm.create_schedule(out.op)
    f = tvm.build(s, [data, sort_num, out], target)
    a = tvm.nd.array(np.array(input).astype(data.dtype), ctx)
    b = tvm.nd.array(np.array(sort_num_input).astype(sort_num.dtype), ctx)
    c = tvm.nd.array(np.zeros(a.shape, dtype=out.dtype), ctx)
    f(a, b, c)
    tvm.testing.assert_allclose(c.asnumpy(), np.array(sorted_index).astype(out.dtype), rtol=1e-5)
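The "tvm.contrib.sort.argsort" packed function called above is resolved at run time, so this example only works on a TVM build that includes the contrib sort library. A quick pre-check:

# Fails fast if the contrib sort library is not compiled into this TVM build.
assert tvm.get_global_func("tvm.contrib.sort.argsort", allow_missing=True) is not None, \
    "this example needs a TVM build with the contrib sort library"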
Example #10
def test_cpu():
    n = 1024
    dtype = "float32"
    A = tvm.placeholder((n,), name='A')
    B = tvm.placeholder((n,), name='B')
    def test_device_ir(A, B, C):
        n = A.shape[0]
        max_threads = 8
        ib = tvm.ir_builder.create()
        Aptr = ib.buffer_ptr(A)
        Bptr = ib.buffer_ptr(B)
        Cptr = ib.buffer_ptr(C)
        with ib.for_range(0, n, name="i") as i:
            Cptr[i] = Aptr[i] + Bptr[i]
        body = ib.get()
        return body
    C = tvm.extern(A.shape, [A, B], lambda ins, outs: test_device_ir(ins[0], ins[1], outs[0]),
                   name="vector_add", dtype=dtype)
    s = tvm.create_schedule(C.op)
    def check_target(target):
        if not tvm.module.enabled(target):
            return
        # build and invoke the kernel.
        fadd = tvm.build(s, [A, B, C], target)
        ctx = tvm.context(target, 0)
        # launch the kernel.
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
        fadd(a, b, c)
        tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
    check_target("llvm")
Example #11
def test_add_pipeline():
    nn = 1024
    n = tvm.convert(nn)
    A = tvm.placeholder((n,), name='A')

    def extern_generator(ins, outs):
        """Manually write the IR for the extern function, add pipeline"""
        ib = tvm.ir_builder.create()
        with ib.for_range(0, n/2) as i:
            ib.emit(outs[0].vstore(i*2, ins[0].vload(i*2, "float32x2") + tvm.const(1, "float32x2")))
        return ib.get()

    C = tvm.extern(A.shape, [A], extern_generator, name='C')
    s = tvm.create_schedule(C.op)
    print(tvm.lower(s, [A, C], simple_mode=True))

    def check_llvm():
        if not tvm.module.enabled("llvm"):
            return
        # build and invoke the kernel.
        f = tvm.build(s, [A, C], "llvm")
        ctx = tvm.cpu(0)
        # launch the kernel.
        n = nn
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
        f(a, c)
        np.testing.assert_allclose(
            c.asnumpy(), a.asnumpy() + 1)
    check_llvm()
Example #12
def test_add_pipeline():
    nn = 1024
    n = tvm.convert(nn)
    A = tvm.placeholder((n,), name='A')

    def extern_generator(ins, outs):
        """Manually write the IR for the extern function, add pipeline"""
        ib = tvm.ir_builder.create()
        with ib.for_range(0, n/2) as i:
            ib.emit(outs[0].vstore(i*2, ins[0].vload(i*2, "float32x2") + tvm.const(1, "float32x2")))
        return ib.get()

    C = tvm.extern(A.shape, [A], extern_generator, name='C')
    s = tvm.create_schedule(C.op)

    def check_llvm():
        if not tvm.module.enabled("llvm"):
            return
        # build and invoke the kernel.
        f = tvm.build(s, [A, C], "llvm")
        ctx = tvm.cpu(0)
        # launch the kernel.
        n = nn
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
        f(a, c)
        np.testing.assert_allclose(
            c.asnumpy(), a.asnumpy() + 1)
    check_llvm()
Example #13
def test_pack_buffer_intermediate():
    nn = 1024
    n = tvm.convert(nn)
    A = tvm.placeholder((n,), name='A')
    B = tvm.compute((n,), lambda i: A[i] + 1, name="B")
    def extern_generator(ins, outs):
        """Manually write the IR for the extern function, add pipeline."""
        return tvm.call_packed("my_extern_array_func2", ins[0], outs[0])

    C = tvm.extern(B.shape, [B], extern_generator, name='C')
    s = tvm.create_schedule(C.op)

    def check_target(target):
        if not tvm.module.enabled(target):
            return
        # build and invoke the kernel.
        f = tvm.build(s, [A, C], target)
        ctx = tvm.cpu(0)
        # launch the kernel.
        n = nn
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)

        @tvm.register_func
        def my_extern_array_func2(aa, bb):
            assert aa.shape == a.shape
            tvm.testing.assert_allclose(
                aa.asnumpy(), a.asnumpy() + 1)
            aa.copyto(bb)

        f(a, c)
        tvm.testing.assert_allclose(
            c.asnumpy(), a.asnumpy() + 1)

    check_target("llvm")
Example #14
def test_pack_buffer_simple():
    nn = 1024
    n = tvm.convert(nn)
    A = tvm.placeholder((n, ), name='A')

    def extern_generator(ins, outs):
        """Manually write the IR for the extern function, add pipeline."""
        return tvm.call_packed("my_extern_array_func1", ins[0], outs[0])

    C = tvm.extern(A.shape, [A], extern_generator, name='C')
    s = tvm.create_schedule(C.op)

    @tvm.register_func
    def my_extern_array_func1(aa, bb):
        aa.copyto(bb)

    def check_target(target):
        if not tvm.module.enabled(target):
            return
        # build and invoke the kernel.
        f = tvm.build(s, [A, C], target)
        ctx = tvm.cpu(0)
        # launch the kernel.
        n = nn
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)

        f(a, c)
        tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy())

    check_target("stackvm")
    check_target("llvm")
Example #15
def test_sort_np():
    dshape = (1, 2, 3, 4, 5, 6)
    axis = 4
    reduced_shape = (1, 2, 3, 4, 6)
    is_descend = False
    data = tvm.placeholder(dshape, name='data')
    sort_num = tvm.placeholder(reduced_shape, name="sort_num", dtype="int32")
    out = tvm.extern(
        data.shape, [data, sort_num],
        lambda ins, outs: tvm.call_packed("tvm.contrib.sort.argsort", ins[0],
                                          ins[1], outs[0], axis, is_descend),
        dtype='int32',
        name="sort_tensor")

    ctx = tvm.cpu(0)
    target = "llvm"
    s = tvm.create_schedule(out.op)
    f = tvm.build(s, [data, sort_num, out], target)

    np_data = np.random.uniform(size=dshape)
    np_out = np.argsort(np_data, axis=axis)
    sort_num_input = np.full(reduced_shape, dshape[axis])
    a = tvm.nd.array(np.array(np_data).astype(data.dtype), ctx)
    b = tvm.nd.array(np.array(sort_num_input).astype(sort_num.dtype), ctx)
    c = tvm.nd.array(np.zeros(a.shape, dtype=out.dtype), ctx)
    f(a, b, c)
    tvm.testing.assert_allclose(c.asnumpy(), np_out, rtol=1e-5)
Example #16
def test_sort():
    n = 2
    l = 5
    m = 3
    data = tvm.placeholder((n, l, m), name='data')
    sort_num = tvm.placeholder((n, m), name="sort_num", dtype="int32")
    axis = 1
    is_descend = True
    out = tvm.extern(
        data.shape, [data, sort_num],
        lambda ins, outs: tvm.call_packed("tvm.contrib.sort.argsort", ins[0],
                                          ins[1], outs[0], axis, is_descend),
        dtype='int32',
        name="sort_tensor")
    input = [[[1, 2, 3], [2, 4.5, 3.5], [1.1, 0.5, 1], [3.2, -5, 0.5],
              [1.5, 0, 0]],
             [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12], [13, 14, 15]]]
    sort_num_input = [[1, 2, 3], [4, 5, 5]]
    sorted_index = [[[0, 1, 1], [1, 0, 0], [2, 2, 2], [3, 3, 3], [4, 4, 4]],
                    [[3, 4, 4], [2, 3, 3], [1, 2, 2], [0, 1, 1], [4, 0, 0]]]

    ctx = tvm.cpu(0)
    target = "llvm"
    s = tvm.create_schedule(out.op)
    f = tvm.build(s, [data, sort_num, out], target)
    a = tvm.nd.array(np.array(input).astype(data.dtype), ctx)
    b = tvm.nd.array(np.array(sort_num_input).astype(sort_num.dtype), ctx)
    c = tvm.nd.array(np.zeros(a.shape, dtype=out.dtype), ctx)
    f(a, b, c)
    tvm.testing.assert_allclose(c.asnumpy(),
                                np.array(sorted_index).astype(out.dtype),
                                rtol=1e-5)
Example #17
def topk(data, k=1, axis=-1, ret_type="both", is_ascend=False, dtype="int64"):
    """Get the top k elements in an input tensor along the given axis.

    Parameters
    ----------
    data : tvm.Tensor
        The input tensor.

    k : int, optional
        Number of top elements to select. Return all elements if k < 1.

    axis : int, optional
        Axis along which to sort the input tensor.

    ret_type: str, optional
        The return type [both, values, indices].
        "both": return both top k data and indices.
        "values": return top k data only.
        "indices": return top k indices only.

    is_ascend : boolean, optional
        Whether to sort in ascending or descending order.

    dtype : string, optional
        The data type of the indices output.

    Returns
    -------
    out : tvm.Tensor or List[tvm.Tensor]
        The computed result.
    """
    assert ret_type in ["both", "values", "indices"]
    data_buf = api.decl_buffer(data.shape,
                               data.dtype,
                               "data_buf",
                               data_alignment=8)
    out_shape = list(get_const_tuple(data.shape))
    if k >= 1:
        out_shape[axis] = k
    out_bufs = []
    if ret_type in ["both", "values"]:
        out_bufs.append(
            api.decl_buffer(out_shape,
                            data.dtype,
                            "value_buf",
                            data_alignment=8))
    if ret_type in ["both", "indices"]:
        out_bufs.append(
            api.decl_buffer(out_shape, dtype, "indices_buf", data_alignment=8))
    out_shapes = [out_shape] * len(out_bufs)

    out = tvm.extern(
        out_shapes, [data],
        lambda ins, outs: tvm.call_packed("tvm.contrib.sort.topk", ins[0],
                                          *outs, k, axis, ret_type, is_ascend),
        in_buffers=[data_buf],
        out_buffers=out_bufs,
        name="topk_cpu",
        tag="topk_cpu")
    return out
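A hedged driver for the topk above, assuming it sits in its usual topi module (so api and get_const_tuple are imported) and that the TVM build registers the "tvm.contrib.sort.topk" packed function. Index tie-breaking may differ from numpy, so only the values are checked:

import numpy as np
import tvm

data = tvm.placeholder((4, 8), name="data")
values, indices = topk(data, k=3, axis=1, ret_type="both", is_ascend=False)
s = tvm.create_schedule(values.op)   # both outputs share one extern op
f = tvm.build(s, [data, values, indices], "llvm")
ctx = tvm.cpu(0)
a_np = np.random.uniform(size=(4, 8)).astype("float32")
a = tvm.nd.array(a_np, ctx)
v = tvm.nd.array(np.zeros((4, 3), dtype="float32"), ctx)
i = tvm.nd.array(np.zeros((4, 3), dtype="int64"), ctx)
f(a, v, i)
ref = np.sort(a_np, axis=1)[:, ::-1][:, :3]   # top-3 values, descending
tvm.testing.assert_allclose(v.asnumpy(), ref, rtol=1e-5)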
Example #18
def test_extern():
    m = tvm.var('m')
    A = tvm.placeholder((m,), name='A')

    def extern_func(ins, outs):
        assert(isinstance(ins[0], tvm.schedule.Buffer))
        return tvm.call_packed("myadd", ins[0].data, outs[0].data, m)
    B = tvm.extern((m,), [A], extern_func)
    assert(tuple(B.shape) == (m,))
Example #19
def test_extern():
    m = tvm.var('m')
    A = tvm.placeholder((m,), name='A')

    def extern_func(ins, outs):
        assert(isinstance(ins[0], tvm.schedule.Buffer))
        return tvm.call_packed("myadd", ins[0].data, outs[0].data, m)
    B = tvm.extern((m,), [A], extern_func)
    assert(tuple(B.shape) == (m,))
Example #20
def test_extern_multi_out():
    m = tvm.var('m')
    A = tvm.placeholder((m,), name='A')
    B = tvm.compute((m,), lambda i: A[i] * 10)

    def extern_func(ins, outs):
        assert(isinstance(ins[0], tvm.schedule.Buffer))
        return tvm.call_packed(
            "myadd", ins[0].data, outs[0].data, outs[1].data, m)
    res = tvm.extern([A.shape, A.shape], [A, B], extern_func)
    assert(len(res) == 2)
    assert(res[1].value_index == 1)
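Both tensors returned by a multi-output tvm.extern share a single op, and value_index records which output slot each tensor occupies. A standalone sketch (the "noop" packed name is hypothetical; the graph is only constructed, never built):

import tvm

m = tvm.var('m')
A = tvm.placeholder((m,), name='A')
res = tvm.extern([A.shape, A.shape], [A],
                 lambda ins, outs: tvm.call_packed(
                     "noop", ins[0].data, outs[0].data, outs[1].data),
                 name="multi_out")
assert res[0].op.same_as(res[1].op)
assert (res[0].value_index, res[1].value_index) == (0, 1)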
Example #21
def test_extern_multi_out():
    m = tvm.var('m')
    A = tvm.placeholder((m,), name='A')
    B = tvm.compute((m,), lambda i: A[i] * 10)

    def extern_func(ins, outs):
        assert(isinstance(ins[0], tvm.schedule.Buffer))
        return tvm.call_packed(
            "myadd", ins[0].data, outs[0].data, outs[1].data, m)
    res = tvm.extern([A.shape, A.shape], [A, B], extern_func)
    assert(len(res) == 2)
    assert(res[1].value_index == 1)
Example #22
def test_gpu():
    n = tvm.var('n')
    dtype = "float32"
    A = tvm.placeholder((n, ), name='A')
    B = tvm.placeholder((n, ), name='B')
    fld = tvm.floordiv

    def test_device_ir(A, B, C):
        n = A.shape[0]
        max_threads = 32
        ib = tvm.ir_builder.create()
        bx = tvm.thread_axis("blockIdx.x")
        tx = tvm.thread_axis("threadIdx.x")
        ib.scope_attr(bx, "thread_extent", fld(n + max_threads - 1,
                                               max_threads))
        ib.scope_attr(tx, "thread_extent", max_threads)
        idx = bx.var * max_threads + tx.var
        Aptr = ib.buffer_ptr(A)
        Bptr = ib.buffer_ptr(B)
        Cptr = ib.buffer_ptr(C)
        with ib.if_scope(ib.likely(idx < n)):
            Cptr[idx] = Aptr[idx] + Bptr[idx]
        body = ib.get()
        return body

    C = tvm.extern(A.shape, [A, B],
                   lambda ins, outs: test_device_ir(ins[0], ins[1], outs[0]),
                   name="vector_add",
                   dtype=dtype)
    s = tvm.create_schedule(C.op)
    bounds = tvm.schedule.InferBound(s)
    stmt = tvm.schedule.ScheduleOps(s, bounds)

    def check_target(target):
        n = 1024
        if not tvm.module.enabled(target):
            return
        # build and invoke the kernel.
        fadd = tvm.build(s, [A, B, C], target)
        ctx = tvm.context(target, 0)
        # launch the kernel.
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
        fadd(a, b, c)
        tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())

    check_target("opencl")
    check_target("cuda")
Example #23
def sparse_transpose(sparse_data, sparse_indices, sparse_indptr):
    """
    Transpose a square sparse matrix.
    `A` is an n-by-n sparse matrix in CSR format.
    **Currently only square matrices are supported.**

    Parameters
    ----------
    sparse_data : tvm.Tensor
        1-D with shape [nonzeros], dtype of 'float32'

    sparse_indices : tvm.Tensor
        1-D with shape [nonzeros], dtype of 'int32'

    sparse_indptr : tvm.Tensor
        1-D with shape [n+1], dtype of 'int32'

    Returns
    -------
    out_data : tvm.Tensor
        1-D with shape [nonzeros], dtype of 'float32'

    out_indices : tvm.Tensor
        1-D with shape [nonzeros], dtype of 'int32'

    out_indptr : tvm.Tensor
        1-D with shape [n+1], dtype of 'int32'
    """
    assert len(sparse_data.shape) == 1, "error in data dimension"
    assert len(sparse_indices.shape) == 1, "error in indices dimension"
    assert len(sparse_indptr.shape) == 1, "error in indptr dimension"

    nnz = get_const_tuple(sparse_data.shape)[0]
    n = get_const_tuple(sparse_indptr.shape)[0] - 1
    output_shape = [(nnz, ), (nnz, ), (n + 1, )]

    # TODO: Add BSR transpose support

    output_data, output_indices, output_indptr = tvm.extern(
        shape=output_shape,
        inputs=[sparse_data, sparse_indices, sparse_indptr],
        fcompute=lambda ins, outs: _csr_transpose_ir(
            ins[0], ins[1], ins[2], outs[0], outs[1], outs[2]),
        tag="sparse_transpose_csr",
        dtype=['float32', 'int32', 'int32'],
        name='out')

    return [output_data, output_indices, output_indptr]
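For reference, the same CSR transpose expressed with scipy; this is a sketch of the expected result, not the TVM code path:

import scipy.sparse as sp

A = sp.random(5, 5, density=0.4, format="csr", dtype="float32")
At = A.transpose().tocsr()
# out_data, out_indices, out_indptr above correspond to:
ref = (At.data, At.indices.astype("int32"), At.indptr.astype("int32"))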
Example #24
def gen_copy_reduce_sum(isfwd):
    indptrN = tvm.var('indptrN')
    indicesN = tvm.var('indicesN')
    outN = tvm.var('outN')
    inN = tvm.var('inN')
    x_len = tvm.var('x_len')
    indices = tvm.placeholder((indicesN,), name='indices', dtype=tvm.int32)
    indptr = tvm.placeholder((indptrN,), name='indptr', dtype=tvm.int32)
    inbuf = tvm.placeholder((inN, x_len), name='inbuf', dtype=tvm.float32)
    #outbuf = tvm.placeholder((outN, x_len), name='outbuf')
    def gen(ins, outs):
        irb = tvm.ir_builder.create()
        outptr = irb.buffer_ptr(outs[0])
        gen_zero_out_tensor(irb, outs[0])
        block_size = 32
        x_len_s = topi.util.simplify(x_len)
        '''with irb.for_range(0, tvm.floordiv(x_len_s + (block_size - 1), block_size), for_type="parallel" ,name='blkIdx') as blkIdx:
            def workload(irb, src, dst, eid, inptr):
                with irb.for_range(0, blkIdx * block_size, name='i') as i: #for_type="vectorize"
                    with irb.if_scope(irb.likely(blkIdx * block_size + i < x_len_s)) :
                        if isfwd:
                            outptr[dst * x_len_s + blkIdx * block_size + i] += inptr[src * x_len_s + blkIdx * block_size + i]
                        else:
                            outptr[src * x_len_s + blkIdx * block_size + i] += inptr[dst * x_len_s + blkIdx * block_size + i]
            gen_csr_iterate(irb, ins[0], ins[1], False, workload, inptr = ins[2])'''
        def for_each_edge(irb, src, dst, eid, inptr):
            def assign(idx):
                if isfwd:
                    outptr[dst * x_len_s + idx] += inptr[src * x_len_s + idx]
                else:
                    outptr[src * x_len_s + idx] += inptr[dst * x_len_s + idx]
            gen_vectorized_for_loop(irb, x_len_s, simd_size, assign)               
        gen_csr_iterate(irb, ins[0], ins[1], not isfwd, for_each_edge, inptr = ins[2])
        '''def workload(irb, src, dst, eid, inptr):
            blkSize=16
            #with irb.for_range(0, tvm.floordiv(x_len_s, blkSize), name='x_len.outer') as outer: #for_type="vectorize"
            with irb.for_range(0, x_len_s, name='x_len.inner') as inner: #
                    if isfwd:
                        outptr[dst * x_len_s + inner] += inptr[src * x_len_s + inner]
                    else:
                        outptr[src * x_len_s + inner] += inptr[dst * x_len_s + inner]
        gen_csr_iterate(irb, ins[0], ins[1], True, workload, inptr = ins[2])'''
        return irb.get()
    C = tvm.extern((outN, x_len), [indices, indptr, inbuf], gen,
                   dtype=tvm.float32, name="C")
    return C, indices, indptr, inbuf
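My reading of gen_copy_reduce_sum: for every CSR edge it accumulates the source node's feature row into the destination node's row (or the reverse when isfwd is false). A numpy sketch under that assumption, with a hypothetical toy graph:

import numpy as np

indptr = np.array([0, 2, 3], dtype="int32")    # 2 destination nodes
indices = np.array([0, 1, 1], dtype="int32")   # source node id per edge
feat = np.random.rand(2, 4).astype("float32")  # (inN, x_len)

out = np.zeros((2, 4), dtype="float32")        # (outN, x_len)
for dst in range(len(indptr) - 1):
    for e in range(indptr[dst], indptr[dst + 1]):
        out[dst] += feat[indices[e]]           # the isfwd=True direction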
Example #25
def multibox_transform_loc(cls_prob, loc_pred, anchor, clip=True, threshold=0.01,
                           variances=(0.1, 0.1, 0.2, 0.2)):
    """Location transformation for multibox detection

    Parameters
    ----------
    cls_prob : tvm.Tensor
        Class probabilities.

    loc_pred : tvm.Tensor
        Location regression predictions.

    anchor : tvm.Tensor
        Prior anchor boxes.

    clip : boolean
        Whether to clip out-of-boundary boxes.

    threshold : float
        Threshold to be a positive prediction.

    variances : tuple of float
        Variances to be decoded from box regression output.

    Returns
    -------
    ret : tuple of tvm.Tensor
    """
    batch_size = cls_prob.shape[0]
    num_anchors = anchor.shape[1]
    oshape = (batch_size, num_anchors, 6)
    # Define data alignment for intermediate buffer
    valid_count_dtype = "int32"
    valid_count_buf = api.decl_buffer((batch_size,), valid_count_dtype,
                                      "valid_count_buf", data_alignment=4)
    out_buf = api.decl_buffer(oshape, cls_prob.dtype, "out_buf", data_alignment=8)
    valid_count, out = \
        tvm.extern([(batch_size,), oshape],
                   [cls_prob, loc_pred, anchor],
                   lambda ins, outs: transform_loc_ir(
                       ins[0], ins[1], ins[2], outs[0], outs[1], clip, threshold, variances),
                   dtype=[valid_count_dtype, cls_prob.dtype],
                   out_buffers=[valid_count_buf, out_buf],
                   tag="multibox_transform_loc")
    return [out, valid_count]
Example #26
def multibox_prior_gpu(data,
                       sizes=(1, ),
                       ratios=(1, ),
                       steps=(-1, -1),
                       offsets=(0.5, 0.5),
                       clip=False):
    """Generate prior(anchor) boxes from data, sizes and ratios.

    Parameters
    ----------
    data : tvm.Tensor
        4-D with shape [batch, c_in, h_in, w_in]

    sizes : tuple of float
        Tuple of sizes for anchor boxes.

    ratios : tuple of float
        Tuple of ratios for anchor boxes.

    steps : tuple of float
        Priorbox step across y and x, -1 for auto calculation.

    offsets : tuple of float
        Priorbox center offsets, y and x respectively.

    clip : boolean
        Whether to clip out-of-boundary boxes.

    Returns
    -------
    out : tvm.Tensor
        3-D tensor with shape [1, h_in * w_in * (num_sizes + num_ratios - 1), 4]
    """
    num_sizes = len(sizes)
    num_ratios = len(ratios)
    oshape = (1, data.shape[2] * data.shape[3] * (num_sizes + num_ratios - 1),
              4)
    out = tvm.extern(oshape, [data],
                     lambda ins, outs: multibox_prior_ir(
                         ins[0], outs[0], sizes, ratios, steps, offsets),
                     tag="multibox_prior")
    if clip:
        out = topi.clip(out, 0, 1)
    return out
Example #27
def lesson1():
    ######################################################################
    # Use Extern Tensor Function
    # --------------------------
    # In the example below, we use :any:`tvm.extern` to add an extern
    # array function call. In the extern call, we declare the shape
    # of output tensors. In the second argument we provide the list of inputs.
    #
    # The user needs to provide a function describing how to compute the result.
    # The compute function takes a list of symbolic placeholders for the inputs
    # and a list of symbolic placeholders for the outputs, and returns the
    # executing statement.
    #
    # In this case we simply call a registered TVM function, which invokes a CBLAS call.
    # TVM does not control the internals of the extern array function and treats
    # it as a black box. We can further mix schedulable TVM calls that add a bias
    # term to the result.
    #
    n = 1024
    l = 128
    m = 235
    bias = tvm.var('bias', dtype=tvm.float32)
    A = tvm.placeholder((n, l), name='A')
    B = tvm.placeholder((l, m), name='B')
    C = tvm.extern(
        (n, m), [A, B],
        lambda ins, outs: tvm.call_packed("tvm.contrib.cblas.matmul", ins[0],
                                          ins[1], outs[0], False, False),
        name="C")
    D = tvm.compute(C.shape, lambda i, j: C[i, j] + bias, name="D")
    s = tvm.create_schedule(D.op)
    ######################################################################
    # Verify the Result
    # -----------------
    # We can verify that the result matches what we expected.
    #
    ctx = tvm.cpu(0)
    f = tvm.build(s, [A, B, D, bias], "llvm")
    a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), ctx)
    b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), ctx)
    d = tvm.nd.array(np.zeros((n, m), dtype=D.dtype), ctx)
    bb = 10.0
    f(a, b, d, bb)
    np.testing.assert_allclose(d.asnumpy(),
                               np.dot(a.asnumpy(), b.asnumpy()) + 10,
                               rtol=1e-5)
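The "tvm.contrib.cblas.matmul" packed call is likewise resolved at run time, so this lesson only runs on a TVM build compiled with a BLAS contrib library. A hedged pre-check:

assert tvm.get_global_func("tvm.contrib.cblas.matmul", allow_missing=True) is not None, \
    "lesson1 needs a TVM build with the cblas contrib library"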
Example #28
def gen_binary_op_dot_bwd_lhs(islhs):
    indptrN = tvm.var('indptrN')
    indicesN = tvm.var('indicesN')
    rhsDataN = tvm.var('rhsDataN')
    gradoutDataN = tvm.var('gradoutDataN')
    lhsgradoutDataN = tvm.var('lhsgradoutDataN')
    #xLen = tvm.var('xLen')
    xLen = 1
    #fix-me: we eliminated x_len dimension here
    dataLen = tvm.var('dataLen')
    indices = tvm.placeholder((indicesN,), name='indices', dtype=tvm.int32)
    indptr = tvm.placeholder((indptrN,), name='indptr', dtype=tvm.int32)
    rhsData = tvm.placeholder((rhsDataN, dataLen), name='rhsData', dtype=tvm.float32)
    gradoutData = tvm.placeholder((gradoutDataN, ), name='gradoutData', dtype=tvm.float32)
    outMapping = tvm.placeholder((gradoutDataN, ), name='outMapping', dtype=tvm.int32)
    #lhsgradoutData = tvm.placeholder((lhsgradoutDataN, xLen, dataLen), name='lhsgradoutData', dtype=tvm.float32)

    def gen_func(ins, outs):
        irb = tvm.ir_builder.create()
        gen_zero_out_tensor(irb, outs[0])
        indices, indptr, rhsData, gradoutData, outMapping = ins[0], ins[1], ins[2], ins[3], ins[4]
        #with irb.for_range(0, xLen, name='i') as i:
        def for_each_edge(irb, src, dst, eid, rhsDataPtr, gradoutDataPtr, lhsgradoutDataPtr, outMappingPtr):
            lhsIdx = topi.util.simplify(src * dataLen)
            outIdx = topi.util.simplify(outMappingPtr[eid])
            rhsIdx = topi.util.simplify(dst * dataLen)
            grad = gradoutDataPtr[outIdx]
            def fcompute(j):
                if islhs:
                    lhsgradoutDataPtr[lhsIdx + j] += grad * rhsDataPtr[rhsIdx + j]
                else:
                    lhsgradoutDataPtr[rhsIdx + j] += grad * rhsDataPtr[lhsIdx + j]
            gen_vectorized_for_loop(irb, dataLen, simd_size, fcompute)
        gen_csr_iterate(irb, indices, indptr, islhs, for_each_edge,
                        rhsDataPtr=rhsData, gradoutDataPtr=gradoutData,
                        lhsgradoutDataPtr=outs[0], outMappingPtr=outMapping)
        return irb.get()

    C = tvm.extern((lhsgradoutDataN, dataLen),
                   [indices, indptr, rhsData, gradoutData, outMapping],
                   gen_func,
                   dtype=tvm.float32, name="lhsgradoutData")
    return C, indices, indptr, rhsData, gradoutData, outMapping
Example #29
def test_gpu():
    n = tvm.var('n')
    dtype = "float32"
    A = tvm.placeholder((n,), name='A')
    B = tvm.placeholder((n,), name='B')
    def test_device_ir(A, B, C):
        n = A.shape[0]
        max_threads = 32
        ib = tvm.ir_builder.create()
        bx = tvm.thread_axis("blockIdx.x")
        tx = tvm.thread_axis("threadIdx.x")
        ib.scope_attr(bx, "thread_extent", (n+max_threads-1) // max_threads)
        ib.scope_attr(tx, "thread_extent", max_threads)
        idx = bx.var * max_threads + tx.var
        Aptr = ib.buffer_ptr(A)
        Bptr = ib.buffer_ptr(B)
        Cptr = ib.buffer_ptr(C)
        with ib.if_scope(ib.likely(idx<n)):
            Cptr[idx] = Aptr[idx] + Bptr[idx]
        body = ib.get()
        return body
    C = tvm.extern(A.shape, [A, B], lambda ins, outs: test_device_ir(ins[0], ins[1], outs[0]),
                   name="vector_add", dtype=dtype)
    s = tvm.create_schedule(C.op)
    bounds = tvm.schedule.InferBound(s)
    stmt = tvm.schedule.ScheduleOps(s, bounds)
    def check_target(target):
        n = 1024
        if not tvm.module.enabled(target):
            return
        # build and invoke the kernel.
        fadd = tvm.build(s, [A, B, C], target)
        ctx = tvm.context(target, 0)
        # launch the kernel.
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
        fadd(a, b, c)
        tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
    check_target("opencl")
    check_target("cuda")
Example #30
def multibox_prior_gpu(data, sizes=(1,), ratios=(1,), steps=(-1, -1),
                       offsets=(0.5, 0.5), clip=False):
    """Generate prior(anchor) boxes from data, sizes and ratios.

    Parameters
    ----------
    data : tvm.Tensor
        4-D with shape [batch, c_in, h_in, w_in]

    sizes : tuple of float
        Tuple of sizes for anchor boxes.

    ratios : tuple of float
        Tuple of ratios for anchor boxes.

    steps : tuple of float
        Priorbox step across y and x, -1 for auto calculation.

    offsets : tuple of float
        Priorbox center offsets, y and x respectively.

    clip : boolean
        Whether to clip out-of-boundary boxes.

    Returns
    -------
    out : tvm.Tensor
        3-D tensor with shape [1, h_in * w_in * (num_sizes + num_ratios - 1), 4]
    """
    num_sizes = len(sizes)
    num_ratios = len(ratios)
    oshape = (
        1, data.shape[2] * data.shape[3] * (num_sizes + num_ratios - 1), 4)
    out = tvm.extern(oshape, [data], lambda ins, outs:
                     multibox_prior_ir(
                         ins[0], outs[0], sizes, ratios, steps, offsets),
                     tag="multibox_prior")
    if clip:
        out = topi.clip(out, 0, 1)
    return out
Example #31
def main():
    ctx = tvm.cpu(0)
    n = 1024
    l = 128
    m = 235
    bias = tvm.var('bias', dtype=tvm.float32)
    A = tvm.placeholder((n, l), name='A')
    B = tvm.placeholder((l, m), name='B')
    C = tvm.extern(
        (n, m), [A, B],
        lambda ins, outs: tvm.call_packed("tvm.contrib.cblas.matmul", ins[0],
                                          ins[1], outs[0], False, False),
        name="C")
    D = tvm.compute(C.shape, lambda i, j: C(i, j) + bias, name="D")
    s = tvm.create_schedule(D.op)
    f = tvm.build(s, [A, B, D, bias], "llvm")
    a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), ctx)
    b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), ctx)
    d = tvm.nd.array(np.zeros((n, m), dtype=D.dtype), ctx)
    bb = 10.0
    f(a, b, d, bb)
    print(d.asnumpy())
    tvm.testing.assert_allclose(d.asnumpy(),
                                np.dot(a.asnumpy(), b.asnumpy()) + 10,
                                rtol=1e-5)
Example #32
    rD = ((P*Q*N-1)//block_len+1)*block_len
    rF = ((K-1)//block_len+1)*block_len

    loop_len = (rD*rF//block_len//block_len-1)//block_num+1              # length of the main loop
    index_len = loop_len*cD//16*8+8
    
    print(loop_len)
    print(P,Q,cD)

    OFFIND = np.ones((block_num,thread_num),dtype = np.int32)
    with tvm.target.create('cuda'):
        D = tvm.placeholder((N,C,H,W),dtype = 'float16')
        F = tvm.placeholder((K,C,R,S),dtype = 'float16')
        LOAD_INDEX_D = tvm.placeholder((block_num,thread_num,index_len),dtype = 'int32')
        LOAD_INDEX_F = tvm.placeholder((block_num,thread_num,index_len),dtype = 'int32')
        O = tvm.extern((N, K, P, Q), [D, F, LOAD_INDEX_D, LOAD_INDEX_F],
                       lambda ins, outs: convolutionf16(ins[0], ins[1], ins[2], ins[3], outs[0]),
                       name="conv", dtype='float16')
        s = schedule_conv_fp16()

        print(tvm.lower(s, [D, F, LOAD_INDEX_D, LOAD_INDEX_F, O], name='convf16', simple_mode=True))
        f = tvm.build(s, [D, F, LOAD_INDEX_D, LOAD_INDEX_F, O], target='cuda', name='conv')

        print("build finished")
        ctx = tvm.context('cuda', 0)
        a_np = np.float16(np.random.uniform(0.,1.,size=(N,C,H,W)))
        b_np = np.float16(np.random.uniform(0.,1.,size=(K,C,R,S)))
        c_np = np.zeros((N,K,P,Q), dtype=O.dtype)
        d1_np = -1*np.ones((block_num,thread_num,index_len),dtype = np.int32)
        d2_np = -1*np.ones((block_num,thread_num,index_len),dtype = np.int32)
        #d1_np = np.zeros((rD,cD),dtype = np.int32)
        #d2_np = np.zeros((rF,cF),dtype = np.int32)
        print("now start compute index")
Example #33
def non_max_suppression_gpu(data,
                            valid_count,
                            max_output_size=-1,
                            iou_threshold=0.5,
                            force_suppress=False,
                            top_k=-1,
                            coord_start=2,
                            score_index=1,
                            id_index=0,
                            return_indices=True,
                            invalid_to_bottom=False):
    """Non-maximum suppression operator for object detection.

    Parameters
    ----------
    data : tvm.Tensor
        3-D tensor with shape [batch_size, num_anchors, elem_length].
        The last dimension should be in format of
        [class_id, score, box_left, box_top, box_right, box_bottom].

    valid_count : tvm.Tensor
        1-D tensor for valid number of boxes.

    max_output_size : optional, int
        Max number of output valid boxes for each instance.
        By default all valid boxes are returned.

    iou_threshold : optional, float
        Non-maximum suppression threshold.

    force_suppress : optional, boolean
        Whether to suppress all detections regardless of class_id.

    top_k : optional, int
        Keep maximum top k detections before nms, -1 for no limit.

    coord_start : required, int
        Start index of the consecutive 4 coordinates.

    score_index : optional, int
        Index of the scores/confidence of boxes.

    id_index : optional, int
        index of the class categories, -1 to disable.

    return_indices : boolean
        Whether to return box indices in input data.

    invalid_to_bottom : optional, boolean
        Whether to move all valid bounding boxes to the top.

    Returns
    -------
    out : tvm.Tensor
        3-D tensor with shape [batch_size, num_anchors, elem_length].

    Example
    --------
    .. code-block:: python

        # An example to use nms
        dshape = (1, 5, 6)
        data = tvm.placeholder(dshape, name="data")
        valid_count = tvm.placeholder((dshape[0],), dtype="int32", name="valid_count")
        iou_threshold = 0.7
        force_suppress = True
        top_k = -1
        out = non_max_suppression(data=data, valid_count=valid_count, iou_threshold=iou_threshold,
                                  force_suppress=force_suppress, top_k=top_k, return_indices=False)
        np_data = np.random.uniform(size=dshape)
        np_valid_count = np.array([4])
        s = topi.generic.schedule_nms(out)
        f = tvm.build(s, [data, valid_count, out], "cuda")
        ctx = tvm.gpu(0)
        tvm_data = tvm.nd.array(np_data, ctx)
        tvm_valid_count = tvm.nd.array(np_valid_count, ctx)
        tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data.dtype), ctx)
        f(tvm_data, tvm_valid_count, tvm_out)
    """
    batch_size = data.shape[0]
    num_anchors = data.shape[1]

    valid_count_dtype = "int32"
    valid_count_buf = api.decl_buffer(valid_count.shape,
                                      valid_count_dtype,
                                      "valid_count_buf",
                                      data_alignment=4)
    score_axis = score_index
    score_shape = (batch_size, num_anchors)
    score_tensor = tvm.compute(score_shape,
                               lambda i, j: data[i, j, score_axis],
                               tag=tag.ELEMWISE)
    sort_tensor = argsort(score_tensor,
                          valid_count=valid_count,
                          axis=1,
                          is_ascend=False)

    sort_tensor_buf = api.decl_buffer(sort_tensor.shape,
                                      sort_tensor.dtype,
                                      "sort_tensor_buf",
                                      data_alignment=8)

    data_buf = api.decl_buffer(data.shape,
                               data.dtype,
                               "data_buf",
                               data_alignment=8)

    out_buf = api.decl_buffer(data.shape,
                              data.dtype,
                              "out_buf",
                              data_alignment=8)

    out, box_indices = \
        tvm.extern([data.shape, score_shape],
                   [data, sort_tensor, valid_count],
                   lambda ins, outs: nms_ir(
                       ins[0], ins[1], ins[2], outs[0], outs[1],
                       max_output_size, iou_threshold, force_suppress,
                       top_k, coord_start, id_index, score_index),
                   dtype=[data.dtype, "int32"],
                   in_buffers=[data_buf, sort_tensor_buf, valid_count_buf],
                   name="nms",
                   tag="nms")

    if return_indices:
        return box_indices

    if invalid_to_bottom:
        output_buf = api.decl_buffer(data.shape,
                                     data.dtype,
                                     "output_buf",
                                     data_alignment=8)
        temp_flag_buf = api.decl_buffer(score_shape,
                                        valid_count_dtype,
                                        "temp_flag",
                                        data_alignment=8)
        temp_idx_buf = api.decl_buffer(score_shape,
                                       valid_count_dtype,
                                       "temp_idx",
                                       data_alignment=8)
        temp_flag, temp_idx = tvm.extern(
            [score_shape, score_shape], [out],
            lambda ins, outs: invalid_to_bottom_pre(ins[0], outs[0], outs[1]),
            dtype=["int32", "int32"],
            in_buffers=[out_buf],
            out_buffers=[temp_flag_buf, temp_idx_buf],
            name="invalid_to_bottom_phase_one")

        output = tvm.extern([data.shape], [out, temp_flag, temp_idx],
                            lambda ins, outs: invalid_to_bottom_ir(
                                ins[0], ins[1], ins[2], outs[0]),
                            dtype=[data.dtype],
                            in_buffers=[out_buf, temp_flag_buf, temp_idx_buf],
                            out_buffers=[output_buf],
                            name="invalid_to_bottom",
                            tag="invalid_to_bottom")
        return output

    return out
Example #34
def get_valid_counts_gpu(data, score_threshold=0, id_index=0, score_index=1):
    """Get valid count of bounding boxes given a score threshold.
    Also moves valid boxes to the top of input data.

    Parameters
    ----------
    data : tvm.Tensor
        Input data. 3-D tensor with shape [batch_size, num_anchors, elem_length].

    score_threshold : optional, float
        Lower limit of score for valid bounding boxes.

    id_index : optional, int
        index of the class categories, -1 to disable.

    score_index: optional, int
        Index of the scores/confidence of boxes.

    Returns
    -------
    valid_count : tvm.Tensor
        1-D tensor for valid number of boxes.

    out_tensor : tvm.Tensor
        Rearranged data tensor.
    """
    batch_size = data.shape[0]
    num_anchors = data.shape[1]
    max_threads = int(
        tvm.target.current_target(allow_none=False).max_num_threads)
    elem_per_thread = num_anchors // max_threads + 1
    new_range = num_anchors // elem_per_thread + 1
    temp_flag_buf = api.decl_buffer((batch_size, num_anchors),
                                    "int32",
                                    "temp_flag",
                                    data_alignment=8)
    temp_idx_buf = api.decl_buffer((batch_size, num_anchors),
                                   "int32",
                                   "temp_idx",
                                   data_alignment=8)
    temp_partial_buf = api.decl_buffer((batch_size, new_range),
                                       "int32",
                                       "temp_partial",
                                       data_alignment=8)
    data_buf = api.decl_buffer(data.shape,
                               data.dtype,
                               "data_buf",
                               data_alignment=8)

    temp_flag, temp_idx = \
        tvm.extern([(batch_size, num_anchors,), (batch_size, num_anchors,)], [data],
                   lambda ins, outs: get_valid_counts_pre(
                       ins[0], outs[0], outs[1], score_threshold, id_index, score_index),
                   dtype=["int32", "int32"],
                   out_buffers=[temp_flag_buf, temp_idx_buf],
                   name="get_valid_counts_phase_one")
    temp_idx_new, temp_partial = \
        tvm.extern([(batch_size, num_anchors,), (batch_size, new_range)], [data, temp_idx],
                   lambda ins, outs: get_valid_counts_upsweep(
                       ins[0], ins[1], outs[0], outs[1]),
                   dtype=["int32", "int32"],
                   out_buffers=[temp_idx_buf, temp_partial_buf],
                   name="get_valid_counts_phase_two")
    temp_partial_new = \
        tvm.extern([(batch_size, new_range)], [data, temp_partial],
                   lambda ins, outs: get_valid_counts_scan(
                       ins[0], ins[1], outs[0]),
                   dtype=["int32"],
                   out_buffers=[temp_partial_buf],
                   name="get_valid_counts_phase_three")
    temp_idx_final = \
        tvm.extern([(batch_size, num_anchors)], [data, temp_idx_new, temp_partial_new],
                   lambda ins, outs: get_valid_counts_downsweep(
                       ins[0], ins[1], ins[2], outs[0]),
                   dtype=["int32"],
                   out_buffers=[temp_idx_buf],
                   name="get_valid_counts_phase_four")
    valid_count, out_tensor = \
        tvm.extern([(batch_size,), data.shape], [data, temp_flag, temp_idx_final],
                   lambda ins, outs: get_valid_counts_ir(
                       ins[0], ins[1], ins[2], outs[0], outs[1]),
                   dtype=["int32", data.dtype],
                   in_buffers=[data_buf, temp_flag_buf, temp_idx_buf],
                   name="get_valid_counts_phase_five",
                   tag="get_valid_counts_gpu")

    return [valid_count, out_tensor]
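
# --- A usage sketch, not from the original source: constructs the op above
# with hypothetical shapes. It assumes the get_valid_counts_* IR helpers are
# in scope, and enters a CUDA target context because the function queries
# tvm.target.current_target(allow_none=False).
import tvm

data = tvm.placeholder((1, 5, 6), name="data")
with tvm.target.cuda():
    valid_count, out_tensor = get_valid_counts_gpu(
        data, score_threshold=0.0, id_index=0, score_index=1)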
Example #35
File: dense.py Project: bddppq/tvm
def dense_sw(data, w_data, w_indices, w_indptr, bias=None):
    # pylint: disable=invalid-name
    """The implementation of dense in topi, assuming sparse weight.

    Parameters
    ----------
    data : tvm.Tensor
        2-D with shape [m, k]

    w_data : tvm.Tensor
        1-D with shape [nonzeros]

    w_indices : tvm.Tensor
        1-D with shape [nonzeros]

    w_indptr : tvm.Tensor
        1-D with shape [n+1]

    bias : tvm.Tensor, optional
        1-D with shape [n]

    Returns
    -------
    output : tvm.Tensor
        2-D with shape [m, n]
    """
    assert len(w_data.shape) == 1 and len(w_indices.shape) == 1 and len(w_indptr.shape) == 1 \
        and len(data.shape) == 2, "only support 2-dim dense"
    assert isinstance(data, tvm.tensor.Tensor), \
        "data matrix is assumed to be tvm.Tensor, but data is `%s`" % (type(data))
    if bias is not None:
        assert len(bias.shape) == 1
    dtype = data.dtype
    M, _ = data.shape
    N = simplify(w_indptr.shape[0]-1)
    def dense_default_ir(data, w_data, w_indices, w_indptr, out):
        """Define IR for Dense"""
        dtype = data.dtype
        irb = tvm.ir_builder.create()
        data_ptr = irb.buffer_ptr(data)
        w_data_ptr = irb.buffer_ptr(w_data)
        w_indices_ptr = irb.buffer_ptr(w_indices)
        w_indptr_ptr = irb.buffer_ptr(w_indptr)
        out_ptr = irb.buffer_ptr(out)
        M, K = data.shape
        N = simplify(w_indptr.shape[0]-1)
        with irb.for_range(0, M, for_type="vectorize", name='m') as m:
            with irb.for_range(0, N, for_type="parallel", name='n') as n:
                dot = irb.allocate(dtype, (1,), name='dot', scope='local')
                out_ptr[m*N+n] = tvm.const(0, dtype)
                dot[0] = tvm.const(0, dtype)
                row_start = w_indptr_ptr[n]
                row_elems = w_indptr_ptr[n+1]-row_start
                with irb.for_range(0, row_elems, name='k') as k:
                    elem = row_start+k
                    dot[0] += w_data_ptr[elem] * data_ptr[w_indices_ptr[elem]+m*K]
                out_ptr[m*N+n] += dot[0]
        return irb.get()
    oshape = (M, N)
    matmul = tvm.extern(oshape, [data, w_data, w_indices, w_indptr],
                        lambda ins, outs: dense_default_ir(ins[0], ins[1], ins[2], ins[3], outs[0]),
                        tag="dense", dtype=dtype, name='out')
    if bias is not None:
        matmul = tvm.compute(oshape, lambda i, j: matmul[i, j] + bias[j], \
                             tag=tag.BROADCAST)
    return matmul
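
# --- A hedged usage sketch, not part of the original example: dense_sw takes
# the weight in CSR-like form (w_data, w_indices, w_indptr); all sizes below
# are hypothetical.
import tvm

nnz, m, k, n = 64, 32, 16, 8
data = tvm.placeholder((m, k), name='data')
w_data = tvm.placeholder((nnz,), name='w_data')
w_indices = tvm.placeholder((nnz,), dtype='int32', name='w_indices')
w_indptr = tvm.placeholder((n + 1,), dtype='int32', name='w_indptr')
out = dense_sw(data, w_data, w_indices, w_indptr)  # 2-D tensor of shape (m, n)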
Example #36
def proposal_cuda(cls_prob, bbox_pred, im_info, scales, ratios, feature_stride, threshold,
                  rpn_pre_nms_top_n, rpn_post_nms_top_n, rpn_min_size, iou_loss):
    """Proposal operator.

    Parameters
    ----------
    cls_prob : tvm.Tensor
        4-D with shape [batch, 2 * num_anchors, height, width]

    bbox_pred : tvm.Tensor
        4-D with shape [batch, 4 * num_anchors, height, width]

    im_info : tvm.Tensor
        2-D with shape [batch, 3]

    scales : list/tuple of float
        Scales of anchor windows.

    ratios : list/tuple of float
        Ratios of anchor windows.

    feature_stride : int
        The size of the receptive field of each unit in the convolution layer of the RPN,
        for example the product of all strides prior to this layer.

    threshold : float
        Non-maximum suppression threshold.

    rpn_pre_nms_top_n : int
        Number of top scoring boxes to keep before applying NMS; -1 to use all boxes.

    rpn_post_nms_top_n : int
        Number of top scoring boxes to keep after applying NMS to RPN proposals.

    rpn_min_size : int
        Minimum height or width in proposal.

    iou_loss : bool
        Usage of IoU loss.

    Returns
    -------
    out : tvm.Tensor
        2-D tensor with shape [batch * rpn_post_nms_top_n, 5]. The last dimension is in format of
        [batch_index, w_start, h_start, w_end, h_end].
    """

    batch, _, height, width = get_const_tuple(cls_prob.shape)
    num_anchors = len(scales) * len(ratios)
    num_bbox = height * width * num_anchors
    rpn_pre_nms_top_n = min(rpn_pre_nms_top_n, num_bbox) if rpn_pre_nms_top_n > 0 else num_bbox

    bbox = tvm.extern((batch, num_bbox, 5), [cls_prob, bbox_pred, im_info], lambda ins, outs:
                      predict_bbox_ir(ins[0], ins[1], ins[2], outs[0], scales, ratios,
                                      feature_stride, rpn_min_size, iou_loss),
                      dtype=bbox_pred.dtype)
    score = tvm.compute((batch, num_bbox), lambda b, i: bbox[b, i, 4], tag='bbox_score')
    sorted_index = tvm.extern([score.shape], [score],
                              lambda ins, outs: argsort_ir(ins[0], outs[0]),
                              dtype='int32')
    sorted_bbox = tvm.compute((batch, rpn_pre_nms_top_n, 5),
                              lambda b, i, j: bbox[b, sorted_index[b, i], j], tag='sorted_bbox')
    nms_remove_mask = tvm.extern((batch, rpn_pre_nms_top_n), [sorted_bbox],
                                 lambda ins, outs: nms_ir(ins[0], outs[0], threshold),
                                 dtype='bool')
    nms_out = tvm.extern((batch * rpn_post_nms_top_n, 5), [sorted_bbox, nms_remove_mask],
                         lambda ins, outs: prepare_output_ir(ins[0], ins[1], outs[0]),
                         dtype=sorted_bbox.dtype)
    return nms_out
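
# --- A construction-only sketch, not from the original file, with
# hypothetical Faster R-CNN shapes (9 anchors per location); predict_bbox_ir,
# argsort_ir, nms_ir and prepare_output_ir are assumed to be in scope as in
# the original module.
import tvm

cls_prob = tvm.placeholder((1, 2 * 9, 14, 14), name='cls_prob')
bbox_pred = tvm.placeholder((1, 4 * 9, 14, 14), name='bbox_pred')
im_info = tvm.placeholder((1, 3), name='im_info')
out = proposal_cuda(cls_prob, bbox_pred, im_info,
                    scales=(8., 16., 32.), ratios=(0.5, 1., 2.),
                    feature_stride=16, threshold=0.7,
                    rpn_pre_nms_top_n=200, rpn_post_nms_top_n=100,
                    rpn_min_size=16, iou_loss=False)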
Example #37
# This snippet continues the CBLAS extern tutorial; its setup (A, B, D, bias,
# s, n, l, m and ctx) appears in Example #43 below.
f = tvm.build(s, [A, B, D, bias], 'llvm')
a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), ctx=ctx)
b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), ctx=ctx)
d = tvm.nd.array(np.zeros(shape=(n, m), dtype=D.dtype), ctx=ctx)
bb = 10.0
f(a, b, d, bb)
np.testing.assert_allclose(d.asnumpy(),
                           np.dot(a.asnumpy(), b.asnumpy()) + 10,
                           rtol=1e-5)
print(d.shape)


@tvm.register_func('tvm.contrib.my_tvm_add_one')
def my_tvm_add_one(x, y):
    print('my tvm add one signatures: %s, %s' % (type(x), type(y)))
    tvm.nd.array(x.asnumpy() + 1).copyto(y)


A = tvm.placeholder((n, ), name='A')
B = tvm.extern(A.shape, [A],
               lambda ins, outs: tvm.call_packed('tvm.contrib.my_tvm_add_one',
                                                 ins[0], outs[0]),
               name='C')
s = tvm.create_schedule(B.op)
f = tvm.build(s, [A, B], 'llvm')
a = tvm.nd.array(np.random.uniform(size=(n, )).astype(A.dtype), ctx=ctx)
b = tvm.nd.array(np.random.uniform(size=(n, )).astype(B.dtype), ctx=ctx)
f(a, b)
np.testing.assert_allclose(b.asnumpy(), a.asnumpy() + 1, rtol=1e-5)
print(b.shape)
Example #38
def sort_gpu(data, data_buf, index, index_buf, output_buf, axis, is_descend):
    """Function to generate low level IR to do sorting on the GPU, use it by calling sort_gpu.

    Parameters
    ----------
    data: tvm.Tensor
        3-D tensor with shape [batch_size, num_anchors, 6].
        The last dimension should be in format of
        [class_id, score, box_left, box_top, box_right, box_bottom].

    data_buf: Buffer
        2D Buffer of input boxes' score with shape [batch_size, num_anchors].

    index : tvm.Tensor
        1-D tensor for valid number of boxes.

    index_buf : Buffer
        Buffer of the valid number of boxes.

    output_buf : Buffer
        Output buffer of indices of the sorted tensor.

    axis : int
        The axis used for sorting.

    is_descend : bool
        If the sorted data is in descending order.

    Returns
    -------
    out : tvm.Tensor
        Tensor of sorted indices with the same shape as `data`.
    """

    ndim = len(data.shape)
    assert data.dtype == "float32", "Currently only supports input dtype to be float32"
    assert axis < ndim, "Axis out of boundary for input ndim %d" % ndim

    axis_mul_before = 1
    axis_mul_after = 1
    if axis < 0:
        axis = ndim + axis
    for i in range(0, ndim):
        if i < axis:
            axis_mul_before *= data.shape[i]
        elif i > axis:
            axis_mul_after *= data.shape[i]

    dshape = axis_mul_before*axis_mul_after
    fshape = data.shape[axis] * dshape

    loc_buf = api.decl_buffer(dshape, index.dtype, "sizes", data_alignment=8)
    new_index_buf = api.decl_buffer(
        fshape, index.dtype, "index_new", data_alignment=8)
    out_index_buf = api.decl_buffer(
        fshape, index.dtype, "index_out", data_alignment=8)
    new_data_buf = api.decl_buffer(
        dshape, data.dtype, "data_new", data_alignment=8)

    loc = \
        tvm.extern([(dshape,)],
                   [index],
                   lambda ins, outs: sort_pre_ir(
                       ins[0], outs[0], axis_mul_before, axis_mul_after),
                   dtype=[index.dtype],
                   in_buffers=index_buf,
                   out_buffers=[loc_buf],
                   tag="sorting_prepare")

    data_new, index_new = \
        tvm.extern([(dshape,), (fshape,)],
                   [data, index, loc],
                   lambda ins, outs: sort_pre_ir_data(
                       ins[0], ins[1], ins[2], outs[0], outs[1], axis,
                       axis_mul_before, axis_mul_after),
                   dtype=[data.dtype, index.dtype],
                   in_buffers=[data_buf, index_buf, loc_buf],
                   out_buffers=[new_data_buf, new_index_buf],
                   tag="sorting_data")

    index_out = \
        tvm.extern([(fshape,)],
                   [data, index, data_new, index_new, loc],
                   lambda ins, outs: sort_oet_ir(
                       ins[0], ins[1], ins[2], ins[3], ins[4], outs[0],
                       axis_mul_before, axis_mul_after, axis, is_descend),
                   dtype=[index.dtype],
                   in_buffers=[data_buf, index_buf,
                               new_data_buf, new_index_buf, loc_buf],
                   out_buffers=[out_index_buf],
                   tag="sorting_oet")
    out = \
        tvm.extern([data.shape],
                   [data, index, index_out, loc],
                   lambda ins, outs: sort_ir_out(
                       ins[0], ins[1], ins[2], ins[3], outs[0],
                       axis_mul_before, axis_mul_after, axis),
                   dtype=[index.dtype],
                   in_buffers=[data_buf, index_buf, out_index_buf, loc_buf],
                   out_buffers=output_buf,
                   tag="sorting_output")
    return out
Example #39
def nms_gpu(data,
            valid_count,
            nms_threshold=0.5,
            force_suppress=False,
            nms_topk=-1):
    """Non-maximum suppression operator for object detection.

    Parameters
    ----------
    data: tvm.Tensor
        3-D tensor with shape [batch_size, num_anchors, 6].
        The last dimension should be in format of
        [class_id, score, box_left, box_top, box_right, box_bottom].

    valid_count : tvm.Tensor
        1-D tensor for valid number of boxes.

    nms_threshold : float
        Non-maximum suppression threshold.

    force_suppress : boolean
        Whether to suppress all detections regardless of class_id.

    nms_topk : int
        Keep maximum top k detections before nms, -1 for no limit.

    Returns
    -------
    out : tvm.Tensor
        3-D tensor with shape [batch_size, num_anchors, 6].

    Example
    --------
    .. code-block:: python

        # An example to use nms
        dshape = (1, 5, 6)
        data = tvm.placeholder(dshape, name="data")
        valid_count = tvm.placeholder(
            (dshape[0],), dtype="int32", name="valid_count")
        nms_threshold = 0.7
        force_suppress = True
        nms_topk = -1
        out = nms(data, valid_count, nms_threshold, force_suppress, nms_topk)
        np_data = np.random.uniform(size=dshape).astype(data.dtype)
        np_valid_count = np.array([4]).astype("int32")
        s = topi.generic.schedule_nms(out)
        f = tvm.build(s, [data, valid_count, out], "llvm")
        ctx = tvm.cpu()
        tvm_data = tvm.nd.array(np_data, ctx)
        tvm_valid_count = tvm.nd.array(np_valid_count, ctx)
        tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data.dtype), ctx)
        f(tvm_data, tvm_valid_count, tvm_out)
    """
    batch_size = data.shape[0]
    num_anchors = data.shape[1]
    valid_count_dtype = "int32"
    valid_count_buf = api.decl_buffer(valid_count.shape,
                                      valid_count_dtype,
                                      "valid_count_buf",
                                      data_alignment=4)
    data_buf = api.decl_buffer(data.shape,
                               data.dtype,
                               "data_buf",
                               data_alignment=8)
    score_axis = 1
    score_shape = (batch_size, num_anchors)
    score_tensor = tvm.compute(score_shape,
                               lambda i, j: data[i, j, score_axis],
                               name="score_tensor")
    score_tensor_buf = api.decl_buffer(score_tensor.shape,
                                       data.dtype,
                                       "score_tensor_buf",
                                       data_alignment=8)
    sort_tensor_dtype = "int32"
    sort_tensor_buf = api.decl_buffer(score_shape,
                                      sort_tensor_dtype,
                                      "sort_tensor_buf",
                                      data_alignment=8)

    sort_tensor = sort_gpu(score_tensor, score_tensor_buf, valid_count,
                           valid_count_buf, sort_tensor_buf, score_axis, True)
    out = \
        tvm.extern(data.shape,
                   [data, sort_tensor, valid_count],
                   lambda ins, outs: nms_ir(
                       ins[0], ins[1], ins[2], outs[0], nms_threshold,
                       force_suppress, nms_topk),
                   dtype="float32",
                   in_buffers=[data_buf, sort_tensor_buf, valid_count_buf],
                   tag="nms")
    return out
Example #40
def measure_compute_mad(total_item, item_per_thread, base_type, bits, lanes,
                        target, target_host, remote, ctx, n_times):
    """ measure peak compute speed by computing mad for a type

    The IR for measurement is

    for each thread
        for i in 1..item_per_thread
            x = mad(x, x, y)
            y = mad(y, y, x)

    Parameters
    ----------
    total_item: int
        number of elements in input array
    item_per_thread: int
        number of operations each thread does
    base_type: str
        can be "int", "float"
    bits: int
        can be 16, 32
    lanes: int
       lane of the vector type, can be 1, 2, 4, 8, 16
    target: :any:`tvm.target.Target`
        the target and option of the compilation.
    target_host : str or :any:`tvm.target.Target`
        host compilation target
    remote: tvm.rpc.RPCSession
        if it is not None, use remote rpc session
    ctx: TVMContext
        the context of the array
    n_times: int
        number of runs for taking mean

    Returns
    -------
    GOPS: float
        giga operations per second
    """

    n = total_item

    if bits >= 64 or lanes >= 16:
        n //= 2

    max_threads = target.max_num_threads

    base_type = str(base_type) + str(bits)
    dtype = base_type if lanes == 1 else base_type + "x" + str(lanes)

    def extern(ins, outs):
        # pylint: disable=unused-argument
        """construct measurement function by building IR directly"""
        ib = tvm.ir_builder.create()

        bx = tvm.thread_axis("blockIdx.x")
        tx = tvm.thread_axis("threadIdx.x")

        ib.scope_attr(bx, "thread_extent", n // max_threads)
        ib.scope_attr(tx, "thread_extent", max_threads)

        idx = bx.var * max_threads + tx.var

        a = ib.allocate(dtype, (1,), name='a', scope='local')
        b = ib.allocate(dtype, (1,), name='b', scope='local')

        a[0] = outs[0].vload(idx, dtype)
        b[0] = outs[0].vload(idx, dtype)

        if base_type.find('float') != -1:
            mad_func = lambda x, y: (x * x + y)
        else:
            mad_func = lambda x, y: y * y + x

        for _ in range(item_per_thread // 4 // lanes):
            a[0] = mad_func(a[0], b[0])
            b[0] = mad_func(b[0], a[0])

        ib.emit(outs[0].vstore(idx, b[0]))
        return ib.get()

    y = tvm.extern((n,), [], extern, name="y", dtype=dtype)
    s = tvm.create_schedule(y.op)

    try:
        func = tvm.build(s, [y], target, target_host=target_host)
        func = _convert_to_remote(func, remote)
        time_f = func.time_evaluator(func.entry_name, ctx, number=n_times)
        y = tvm.nd.empty((n,), dtype=dtype, ctx=ctx)
        time = time_f(y).mean
    except tvm._ffi.base.TVMError:
        # build error (occur when device does not support half)
        return -1

    return 1.0 * (n * item_per_thread) / 1e9 / time
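
# --- A hedged invocation sketch, not part of the original file. The body
# above always routes the built kernel through _convert_to_remote, so a live
# RPC session is assumed (host/port below are hypothetical; a local server
# can be started with `python -m tvm.exec.rpc_server --port 9090`).
import tvm
from tvm import rpc

remote = rpc.connect("127.0.0.1", 9090)
target = tvm.target.cuda()
ctx = remote.gpu(0)
gops = measure_compute_mad(total_item=2 ** 20, item_per_thread=4096,
                           base_type="float", bits=32, lanes=1,
                           target=target, target_host="llvm",
                           remote=remote, ctx=ctx, n_times=3)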
Example #41
def conv2d_c16str1HMMA(cfg, data, kernel, data_shape, kernel_shape,
                       output_shape, dilation, dir_path):

    # print the sizes of the current data and kernel
    print("data size %s:" % data.name, data_shape)
    print("kernel size in layer %s:" % kernel.name, kernel_shape)
    print(output_shape)

    fortype = "unroll"

    # block parameters
    blk_q = 8
    blk_p = 8
    blk_size = blk_p * blk_q
    ko_part = 2

    #tiling parameters
    block_row_warp = 2
    block_col_warp = 2
    warp_row_tile = 2
    warp_col_tile = 2
    #offset preset
    shieft = 8
    offset_D_im2col = (2 + blk_q) * (2 + blk_p) * 16
    offset_F = offset_D_im2col + (shieft + 16) * blk_size

    npq = output_shape[1] * output_shape[2] / blk_size

    #shared memory usage
    output_copy = blk_size * blk_size
    im2col_use = offset_D_im2col + (shieft + 16) * blk_size * 2

    shmem_use = max(output_copy, im2col_use)

    def convolutionfp16(D, F, shmem):
        #ir builder for constructing the main body
        ib = tvm.ir_builder.create()

        #id of current warp and offset of shared memory when storing
        warpid = tidx / 32
        warp_offset_output = warpid%block_row_warp*16*warp_row_tile\
            +warpid/block_row_warp*warp_col_tile*block_row_warp*warp_row_tile*256

        # include necessary header files
        include_file = tvm.call_intrin("float32", "include_cpp_head",
                                       dir_path + "/conv2d_HMMA.h")
        ib.emit(include_file)

        #declare the matrix fragment
        declare_a = tvm.call_intrin("float32", "wmma_fragment", "matrix_a",
                                    "half", "row_major", "a_frag",
                                    warp_col_tile)
        declare_b = tvm.call_intrin("float32", "wmma_fragment", "matrix_b",
                                    "half", "col_major", "b_frag",
                                    warp_row_tile)
        declare_c = tvm.call_intrin("float32", "wmma_fragment", "accumulator",
                                    "half", "c_frag", warp_col_tile,
                                    warp_row_tile)
        ib.emit(declare_a)
        ib.emit(declare_b)
        ib.emit(declare_c)

        # define shared-memory offsets for loading the data and the filter
        offset_D_warp = offset_D_im2col + tidx / 2 * (16 +
                                                      shieft) + tidx % 2 * 8
        offset_F_warp = offset_F + tidx / 2 * (16 + shieft) + tidx % 2 * 8

        #ir template for thread synchronization
        sync = tvm.call_extern("float32", "__syncthreads")

        # main body conducting the computation
        #set the pointer to first address of D
        Dp = D.access_ptr("r")
        Sp = shmem.access_ptr("r")
        Fp = F.access_ptr("r")

        # load the first data tile from global memory; it is reused 9 times
        load_first_data = tvm.call_extern("float32","load_matrix_D",Dp,Sp,\
                                            output_shape[0],data_shape[1],data_shape[2],data_shape[3],0,dilation,0)
        ib.emit(load_first_data)

        #load the first filter from global memory:
        load_filter=tvm.call_extern("float32","load_matrix_F",Fp,Sp,offset_F_warp,kernel_shape[0],\
                                    kernel_shape[3],data_shape[0],data_shape[1],data_shape[2],tidx%2*8,0,0)
        ib.emit(load_filter)

        #fill fragment c with 0
        with ib.for_range(0, warp_col_tile, name="col_id_fi") as col_id_fi:
            with ib.for_range(0, warp_row_tile, name="row_id_fi") as row_id_fi:
                fill_O_zero = tvm.call_intrin("float", "wmma_fill_fragment",
                                              "c_frag", col_id_fi, row_id_fi,
                                              "half", 0.)
                ib.emit(fill_O_zero)
        ib.emit(sync)

        #do im2col for the first data
        im2col = tvm.call_extern("float32", "im2col", Sp, offset_D_warp, 0, 0)
        ib.emit(im2col)
        ib.emit(sync)

        with ib.for_range(0, data_shape[3] / 16, name="c_id",
                          for_type=fortype) as c_id:
            with ib.for_range(0, 9, name="ker_id", for_type=fortype) as ker_id:
                #now load matrix fragment
                with ib.for_range(0, warp_col_tile, name="col") as col:
                    load_matrix_frag_F = tvm.call_intrin("float32","wmma_load_matrix_sync","a_frag",col,Sp,\
                                                        offset_D_im2col+tidx/(32*block_row_warp)*\
                                                        (16*warp_col_tile*(16+shieft))+col*(16*(16+shieft)),16+shieft)
                    ib.emit(load_matrix_frag_F)

                with ib.for_range(0, warp_row_tile, name="row") as row:
                    load_matrix_frag_D = tvm.call_intrin("float32","wmma_load_matrix_sync","b_frag",row,Sp,\
                                                        offset_F+tidx%(32*block_row_warp)/32*\
                                                        (16*warp_row_tile*(16+shieft))+row*(16*(16+shieft)),16+shieft)
                    ib.emit(load_matrix_frag_D)
                ib.emit(sync)
                #now compute
                with ib.for_range(0, warp_col_tile, name="mma_col") as mma_col:
                    with ib.for_range(0, warp_row_tile,
                                      name="mma_row") as mma_row:
                        wmma_compute = tvm.call_intrin("float16",
                                                       "wmma_mma_sync",
                                                       "c_frag", "a_frag",
                                                       "b_frag", "c_frag",
                                                       mma_col, mma_row)
                        ib.emit(wmma_compute)

                with ib.if_scope(ker_id < 8):
                    # load filter for the next iteration
                    load_filter=tvm.call_extern("float32","load_matrix_F",Fp,Sp,offset_F_warp,kernel_shape[0],kernel_shape[3],\
                                                data_shape[0],data_shape[1],data_shape[2],c_id*16+tidx%2*8,ker_id+1,0)
                    ib.emit(load_filter)
                    #load data for next iteration
                    im2col = tvm.call_extern("float32", "im2col", Sp,
                                             offset_D_warp, ker_id + 1, 0)
                    ib.emit(im2col)
                ib.emit(sync)

            with ib.if_scope(c_id < data_shape[3] / 16 - 1):
                # load the next 9 iterations' data from global memory
                load_data = tvm.call_extern("float32","load_matrix_D",Dp,Sp,\
                                output_shape[0],output_shape[1],output_shape[2],data_shape[3],c_id*16+16,dilation,0)
                ib.emit(load_data)

                # load filter for the next c_id iteration
                load_filter=tvm.call_extern("float32","load_matrix_F",Fp,Sp,offset_F_warp,kernel_shape[0],\
                                            data_shape[3],data_shape[0],data_shape[1],data_shape[2],c_id*16+16+tidx%2*8,0,0)
                ib.emit(load_filter)
                ib.emit(sync)

                #load the first data from shmem to im2col shmem
                im2col = tvm.call_extern("float32", "im2col", Sp,
                                         offset_D_warp, 0, 0)
                ib.emit(im2col)
                ib.emit(sync)

        #store fragment in shared memory first
        with ib.for_range(0, warp_col_tile, name="col_id_st") as col_id_st:
            with ib.for_range(0, warp_row_tile, name="row_id_st") as row_id_st:
                store_O_fragment = tvm.call_intrin(
                    "float32", "wmma_store_matrix_sync", Sp,
                    warp_offset_output + col_id_st *
                    (256 * warp_row_tile * block_row_warp) + row_id_st * 16,
                    "c_frag", col_id_st, row_id_st, 64)
                ib.emit(store_O_fragment)
        ib.emit(sync)

        body = ib.get()
        return (body)


    shmem = tvm.extern((shmem_use,), [data, kernel],
                       lambda ins, outs: convolutionfp16(ins[0], ins[1], outs[0]),
                       name="shmem", dtype='float16',
                       out_buffers=tvm.decl_buffer((shmem_use,), dtype='float16',
                                                   scope='shared', offset_factor=1))
    #O = tvm.compute(output_shape,lambda dn,dp,dq,dk:shmem[dk%blk_size+dq%blk_q*blk_size+dp%blk_p*blk_size*blk_q],tag="conv2d_NHWC_HMMA",\
    #                attrs={"blk_size":blk_size,"npq":npq})
    #conv = tvm.compute(output_shape,lambda dn,dp,dq,dk:shmem[dk%blk_size+dp/dilation%blk_p*blk_q*blk_size+dq/dilation%blk_q*blk_size])

    O = tvm.compute(output_shape,lambda dn,dp,dq,dk:shmem[dk%blk_size+dp/dilation%blk_p*blk_q*blk_size+dq/dilation%blk_q*blk_size],tag="conv2d_NHWC_HMMA",\
                    attrs={"blk_size":blk_size,"dilation":dilation,"version":0})
    num_flop = (data_shape[0] * output_shape[2] * output_shape[3] * kernel_shape[0]
                * 2 * data_shape[3] * kernel_shape[1] * kernel_shape[2])
    cfg.add_flop(num_flop)

    return (O)
Example #42
def multibox_transform_loc_gpu(cls_prob, loc_pred, anchor, clip=True, \
                               threshold=0.01, variances=(0.1, 0.1, 0.2, 0.2)):
    """Location transformation for multibox detection

    Parameters
    ----------
    cls_prob : tvm.Tensor
        Class probabilities.

    loc_pred : tvm.Tensor
        Location regression predictions.

    anchor : tvm.Tensor
        Prior anchor boxes.

    clip : boolean
        Whether to clip out-of-boundary boxes.

    threshold : float
        Threshold to be a positive prediction.

    variances : tuple of float
        Variances to be decoded from box regression output.

    Returns
    -------
    ret : tuple of tvm.Tensor, composed of:

    out : tvm.Tensor
        3-D tensor with shape (batch_size, num_anchors, 6)

    valid_count : tvm.Tensor
        1-D tensor with shape (batch_size,), number of valid anchor boxes.
    """
    batch_size = cls_prob.shape[0]
    num_classes = cls_prob.shape[1]
    num_anchors = cls_prob.shape[2]
    oshape = (batch_size, num_anchors, 6)
    # Define data alignment for intermediate buffer
    valid_count_dtype = "int32"
    valid_count_buf = api.decl_buffer((batch_size,), valid_count_dtype,
                                      "valid_count_buf", data_alignment=4)
    out_buf = api.decl_buffer(
        oshape, cls_prob.dtype, "out_buf", data_alignment=8)
    size = num_anchors
    temp_flag_buf = api.decl_buffer(
        (size,), valid_count_dtype, "flag", data_alignment=8)
    temp_id_buf = api.decl_buffer(
        (size,), valid_count_dtype, "cls_id", data_alignment=8)
    temp_score_buf = api.decl_buffer(
        (size,), cls_prob.dtype, "score", data_alignment=8)

    valid_count, temp_flag, temp_id, temp_score = \
        tvm.extern([(batch_size,), (size,), (size,), (size,)],
                   [cls_prob],
                   lambda ins, outs: transform_loc_pre(
                       ins[0], outs[0], outs[1], outs[2], outs[3], threshold),
                   dtype=[valid_count_dtype,
                          valid_count_dtype, valid_count_dtype, cls_prob.dtype],
                   out_buffers=[valid_count_buf,
                                temp_flag_buf, temp_id_buf, temp_score_buf],
                   tag="multibox_transform_loc_first_step")

    out = \
        tvm.extern([oshape],
                   [loc_pred, anchor, temp_flag, temp_id, temp_score],
                   lambda ins, outs: transform_loc_ir(
                       ins[0], ins[1], ins[2], ins[3], ins[4], outs[0], clip, \
                       variances, batch_size, num_classes, num_anchors),
                   dtype=[cls_prob.dtype],
                   out_buffers=[out_buf],
                   tag="multibox_transform_loc")
    return [out, valid_count]
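
# --- A construction-only sketch with hypothetical SSD shapes, not from the
# original file; the transform_loc_pre and transform_loc_ir helpers are
# assumed to be in scope as in the original module.
import tvm

batch, num_classes, num_anchors = 1, 21, 100
cls_prob = tvm.placeholder((batch, num_classes, num_anchors), name="cls_prob")
loc_pred = tvm.placeholder((batch, num_anchors * 4), name="loc_pred")
anchor = tvm.placeholder((1, num_anchors, 4), name="anchor")
out, valid_count = multibox_transform_loc_gpu(cls_prob, loc_pred, anchor)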
Example #43
# A user needs to provide a function describing how to compute the result.
# The compute function takes a list of symbolic placeholders for the inputs,
# a list of symbolic placeholders for the outputs, and returns the executing statement.
#
# In this case we simply call a registered TVM function, which invokes a CBLAS call.
# TVM does not control the internals of the extern array function and treats it as a black box.
# We can further mix schedulable TVM calls that add a bias term to the result.
#
n = 1024
l = 128
m = 235
bias = tvm.var('bias', dtype=tvm.float32)
A = tvm.placeholder((n, l), name='A')
B = tvm.placeholder((l, m), name='B')
C = tvm.extern((n, m), [A, B],
               lambda ins, outs: tvm.call_packed(
                   "tvm.contrib.cblas.matmul",
                   ins[0], ins[1], outs[0], False, False), name="C")
D = tvm.compute(C.shape, lambda i, j: C[i,j] + bias, name="D")
s = tvm.create_schedule(D.op)

######################################################################
# Verify the Result
# -----------------
# We can verify that the result matches what we expected.
#
ctx = tvm.cpu(0)
f = tvm.build(s, [A, B, D, bias], "llvm")
a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), ctx)
b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), ctx)
d = tvm.nd.array(np.zeros((n, m), dtype=D.dtype), ctx)
bb = 10.0
Example #44
File: nms.py Project: bddppq/tvm
def non_max_suppression_gpu(data, valid_count, max_output_size=-1,
                            iou_threshold=0.5, force_suppress=False, top_k=-1,
                            coord_start=2, score_index=1, id_index=0,
                            return_indices=True, invalid_to_bottom=False):
    """Non-maximum suppression operator for object detection.

    Parameters
    ----------
    data : tvm.Tensor
        3-D tensor with shape [batch_size, num_anchors, elem_length].
        The last dimension should be in format of
        [class_id, score, box_left, box_top, box_right, box_bottom].

    valid_count : tvm.Tensor
        1-D tensor for valid number of boxes.

    max_output_size : optional, int
        Max number of output valid boxes for each instance.
        By default all valid boxes are returned.

    iou_threshold : optional, float
        Non-maximum suppression threshold.

    force_suppress : optional, boolean
        Whether to suppress all detections regardless of class_id.

    top_k : optional, int
        Keep maximum top k detections before nms, -1 for no limit.

    coord_start : required, int
        Start index of the consecutive 4 coordinates.

    score_index : optional, int
        Index of the scores/confidence of boxes.

    id_index : optional, int
        Index of the class categories; -1 to disable.

    return_indices : boolean
        Whether to return box indices in input data.

    invalid_to_bottom : optional, boolean
        Whether to move all valid bounding boxes to the top.

    Returns
    -------
    out : tvm.Tensor
        3-D tensor with shape [batch_size, num_anchors, elem_length].

    Example
    --------
    .. code-block:: python

        # An example to use nms
        dshape = (1, 5, 6)
        data = tvm.placeholder(dshape, name="data")
        valid_count = tvm.placeholder((dshape[0],), dtype="int32", name="valid_count")
        iou_threshold = 0.7
        force_suppress = True
        top_k = -1
        out = non_max_suppression(data=data, valid_count=valid_count, iou_threshold=iou_threshold,
                                  force_suppress=force_suppress, top_k=top_k, return_indices=False)
        np_data = np.random.uniform(size=dshape).astype(data.dtype)
        np_valid_count = np.array([4]).astype("int32")
        s = topi.generic.schedule_nms(out)
        f = tvm.build(s, [data, valid_count, out], "cuda")
        ctx = tvm.gpu(0)
        tvm_data = tvm.nd.array(np_data, ctx)
        tvm_valid_count = tvm.nd.array(np_valid_count, ctx)
        tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data.dtype), ctx)
        f(tvm_data, tvm_valid_count, tvm_out)
    """
    batch_size = data.shape[0]
    num_anchors = data.shape[1]

    valid_count_dtype = "int32"
    valid_count_buf = api.decl_buffer(valid_count.shape, valid_count_dtype,
                                      "valid_count_buf", data_alignment=4)
    score_axis = score_index
    score_shape = (batch_size, num_anchors)
    score_tensor = tvm.compute(score_shape, lambda i, j: data[i, j, score_axis])
    sort_tensor = argsort(score_tensor, valid_count=valid_count, axis=1, is_ascend=False, flag=True)

    sort_tensor_buf = api.decl_buffer(sort_tensor.shape, sort_tensor.dtype,
                                      "sort_tensor_buf", data_alignment=8)

    data_buf = api.decl_buffer(
        data.shape, data.dtype, "data_buf", data_alignment=8)

    out_buf = api.decl_buffer(
        data.shape, data.dtype, "out_buf", data_alignment=8)

    out, box_indices = \
        tvm.extern([data.shape, score_shape],
                   [data, sort_tensor, valid_count],
                   lambda ins, outs: nms_ir(
                       ins[0], ins[1], ins[2], outs[0], outs[1],
                       max_output_size, iou_threshold, force_suppress,
                       top_k, coord_start, id_index),
                   dtype=[data.dtype, "int32"],
                   in_buffers=[data_buf, sort_tensor_buf, valid_count_buf],
                   name="nms",
                   tag="nms")

    if return_indices:
        return box_indices

    if invalid_to_bottom:
        output_buf = api.decl_buffer(
            data.shape, data.dtype, "output_buf", data_alignment=8)
        temp_flag_buf = api.decl_buffer(
            score_shape, valid_count_dtype, "temp_flag", data_alignment=8)
        temp_idx_buf = api.decl_buffer(
            score_shape, valid_count_dtype, "temp_idx", data_alignment=8)
        temp_flag, temp_idx = tvm.extern([score_shape, score_shape], [out],
                                         lambda ins, outs: invalid_to_bottom_pre(
                                             ins[0], outs[0], outs[1]),
                                         dtype=["int32", "int32"],
                                         in_buffers=[out_buf],
                                         out_buffers=[temp_flag_buf, temp_idx_buf],
                                         name="invalid_to_bottom_phase_one")

        output = tvm.extern([data.shape], [out, temp_flag, temp_idx],
                            lambda ins, outs: invalid_to_bottom_ir(
                                ins[0], ins[1], ins[2], outs[0]),
                            dtype=[data.dtype],
                            in_buffers=[out_buf, temp_flag_buf, temp_idx_buf],
                            out_buffers=[output_buf],
                            name="invalid_to_bottom",
                            tag="invalid_to_bottom")
        return output

    return out
Example #47
File: csrmm.py Project: bddppq/tvm
def csrmm_default(data, indices, indptr, weight, bias=None):
    # pylint: disable=invalid-name
    """The default implementation of csrmm in topi.

    Parameters
    ----------
    data : tvm.Tensor
        1-D with shape [nonzeros]

    indices : tvm.Tensor
        1-D with shape [nonzeros]

    indptr : tvm.Tensor
        1-D with shape [m+1]

    weight : tvm.Tensor
        2-D with shape [k, n]

    bias : tvm.Tensor, optional
        1-D with shape [m]

    Returns
    -------
    output : tvm.Tensor
        2-D with shape [m, n]
    """
    assert len(data.shape) == 1 and len(indices.shape) == 1 and len(indptr.shape) == 1 \
        and len(weight.shape) == 2, "only support 2-dim csrmm"
    assert isinstance(weight, tvm.tensor.Tensor), \
        "weight matrix is assumed to be tvm.Tensor, but weight is `%s`" % (type(weight))
    if bias is not None:
        assert len(bias.shape) == 1
    M = simplify(indptr.shape[0]-1)
    _, N = weight.shape
    def csrmm_default_ir(data, indices, indptr, weight, out):
        """define ir for csrmm"""
        irb = tvm.ir_builder.create()
        data_ptr = irb.buffer_ptr(data)
        indices_ptr = irb.buffer_ptr(indices)
        indptr_ptr = irb.buffer_ptr(indptr)
        weight_ptr = irb.buffer_ptr(weight)
        out_ptr = irb.buffer_ptr(out)
        M = simplify(indptr.shape[0]-1)
        _, N = weight.shape
        with irb.for_range(0, N, for_type="vectorize", name='n') as n:
            with irb.for_range(0, M, for_type="parallel", name='row') as row:
                dot = irb.allocate('float32', (1,), name='dot', scope='local')
                out_ptr[row*N+n] = 0.
                dot[0] = 0.
                row_start = indptr_ptr[row]
                row_end = indptr_ptr[row+1]
                row_elems = row_end-row_start
                with irb.for_range(0, row_elems, name='idx') as idx:
                    elem = row_start+idx
                    dot[0] += data_ptr[elem] * weight_ptr[indices_ptr[elem]*N+n]
                out_ptr[row*N+n] += dot[0]
        return irb.get()
    oshape = (M, N)
    matmul = tvm.extern(oshape, [data, indices, indptr, weight],
                        lambda ins, outs: csrmm_default_ir(ins[0], ins[1], ins[2], ins[3], outs[0]),
                        tag="csrmm", dtype='float32', name='out')
    if bias is not None:
        matmul = tvm.compute(oshape, lambda i, j: matmul[i, j] + bias[i], \
                             tag=tag.BROADCAST)
    return matmul
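
# --- A hedged usage sketch for csrmm_default, not from the original file:
# the sparse [m, k] matrix is CSR-encoded by (data, indices, indptr) and
# multiplied with a dense [k, n] weight; all sizes are hypothetical.
import tvm

nnz, m, k, n = 64, 16, 32, 8
data = tvm.placeholder((nnz,), name='data')
indices = tvm.placeholder((nnz,), dtype='int32', name='indices')
indptr = tvm.placeholder((m + 1,), dtype='int32', name='indptr')
weight = tvm.placeholder((k, n), name='weight')
out = csrmm_default(data, indices, indptr, weight)  # 2-D tensor of shape (m, n)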
Example #49
File: nms.py Project: bddppq/tvm
def get_valid_counts_gpu(data, score_threshold=0):
    """Get valid count of bounding boxes given a score threshold.
    Also moves valid boxes to the top of input data.

    Parameters
    ----------
    data : tvm.Tensor
        Input data. 3-D tensor with shape [batch_size, num_anchors, elem_length].

    score_threshold : optional, float
        Lower limit of score for valid bounding boxes.

    Returns
    -------
    valid_count : tvm.Tensor
        1-D tensor for valid number of boxes.

    out_tensor : tvm.Tensor
        Rearranged data tensor.
    """
    batch_size = data.shape[0]
    num_anchors = data.shape[1]
    max_threads = int(tvm.target.current_target(allow_none=False).max_num_threads)
    elem_per_thread = num_anchors // max_threads + 1
    new_range = num_anchors // elem_per_thread + 1
    temp_flag_buf = api.decl_buffer(
        (batch_size, num_anchors,), "int32", "temp_flag", data_alignment=8)
    temp_idx_buf = api.decl_buffer(
        (batch_size, num_anchors,), "int32", "temp_idx", data_alignment=8)
    temp_partial_buf = api.decl_buffer(
        (batch_size, new_range), "int32", "temp_partial", data_alignment=8)
    data_buf = api.decl_buffer(
        data.shape, data.dtype, "data_buf", data_alignment=8)

    temp_flag, temp_idx = \
        tvm.extern([(batch_size, num_anchors,), (batch_size, num_anchors,)], [data],
                   lambda ins, outs: get_valid_counts_pre(
                       ins[0], outs[0], outs[1], score_threshold),
                   dtype=["int32", "int32"],
                   out_buffers=[temp_flag_buf, temp_idx_buf],
                   name="get_valid_counts_phase_one")
    temp_idx_new, temp_partial = \
        tvm.extern([(batch_size, num_anchors,), (batch_size, new_range)], [data, temp_idx],
                   lambda ins, outs: get_valid_counts_upsweep(
                       ins[0], ins[1], outs[0], outs[1]),
                   dtype=["int32", "int32"],
                   out_buffers=[temp_idx_buf, temp_partial_buf],
                   name="get_valid_counts_phase_two")
    temp_partial_new = \
        tvm.extern([(batch_size, new_range)], [data, temp_partial],
                   lambda ins, outs: get_valid_counts_scan(
                       ins[0], ins[1], outs[0]),
                   dtype=["int32"],
                   out_buffers=[temp_partial_buf],
                   name="get_valid_counts_phase_three")
    temp_idx_final = \
        tvm.extern([(batch_size, num_anchors)], [data, temp_idx_new, temp_partial_new],
                   lambda ins, outs: get_valid_counts_downsweep(
                       ins[0], ins[1], ins[2], outs[0]),
                   dtype=["int32"],
                   out_buffers=[temp_idx_buf],
                   name="get_valid_counts_phase_four")
    valid_count, out_tensor = \
        tvm.extern([(batch_size,), data.shape], [data, temp_flag, temp_idx_final],
                   lambda ins, outs: get_valid_counts_ir(
                       ins[0], ins[1], ins[2], outs[0], outs[1]),
                   dtype=["int32", data.dtype],
                   in_buffers=[data_buf, temp_flag_buf, temp_idx_buf],
                   name="get_valid_counts_phase_five",
                   tag="get_valid_counts_gpu")

    return [valid_count, out_tensor]
Example #50
def argsort(data, valid_count, axis=-1, is_ascend=1, dtype="float32", flag=0):
    """Performs sorting along the given axis and returns an array
    of indices having the same shape as an input array that index
    data in sorted order.

    Parameters
    ----------
    data : tvm.Tensor
        The input tensor.

    valid_count : tvm.Tensor
        1-D tensor for the valid number of boxes; used only for SSD.

    axis : optional, int
        Axis along which to sort the input tensor.
        By default the last axis (-1) is used.

    is_ascend : optional, boolean
        Whether to sort in ascending or descending order.

    dtype : optional, string
        DType of the output indices.

    flag : optional, boolean
        Whether valid_count is valid.

    Returns
    -------
    out : tvm.Tensor
        Sorted index tensor.

    Example
    --------
    .. code-block:: python

        # An example to use argsort
        dshape = (1, 5, 6)
        data = tvm.placeholder(dshape, name="data")
        valid_count = tvm.placeholder((dshape[0],), dtype="int32", name="valid_count")
        axis = 0
        is_ascend = False
        flag = False
        out = argsort(data, valid_count, axis=axis, is_ascend=is_ascend, flag=flag)
        np_data = np.random.uniform(size=dshape).astype(data.dtype)
        np_valid_count = np.array([4]).astype("int32")
        s = topi.generic.schedule_argsort(out)
        f = tvm.build(s, [data, valid_count, out], "llvm")
        ctx = tvm.cpu()
        tvm_data = tvm.nd.array(np_data, ctx)
        tvm_valid_count = tvm.nd.array(np_valid_count, ctx)
        tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data.dtype), ctx)
        f(tvm_data, tvm_valid_count, tvm_out)
    """
    data_buf = api.decl_buffer(data.shape, data.dtype, "data_buf", data_alignment=8)
    if flag:
        valid_count_buf = api.decl_buffer(valid_count.shape, valid_count.dtype,
                                          "valid_count_buf", data_alignment=4)
        out_buf = api.decl_buffer(data.shape, "int32", "out_buf", data_alignment=8)
        out = \
            tvm.extern(data.shape,
                       [data, valid_count],
                       lambda ins, outs: tvm.call_packed(
                           "tvm.contrib.sort.argsort_nms", ins[0], ins[1],
                           outs[0], axis, is_ascend),
                       dtype="int32",
                       in_buffers=[data_buf, valid_count_buf],
                       out_buffers=out_buf,
                       name="argsort_nms_cpu",
                       tag="argsort_nms_cpu")
    else:
        out_buf = api.decl_buffer(data.shape, dtype, "out_buf", data_alignment=8)
        out = \
            tvm.extern(data.shape,
                       [data],
                       lambda ins, outs: tvm.call_packed(
                           "tvm.contrib.sort.argsort", ins[0],
                           outs[0], axis, is_ascend),
                       dtype=dtype,
                       in_buffers=[data_buf],
                       out_buffers=out_buf,
                       name="argsort_cpu",
                       tag="argsort_cpu")
    return out
Example #51
def test_ib():
    print('aaaa')
    env = nnpu.get_env()
    nnpu.set_device(env)
    shape = (16, )
    dtype_n, dtype_w = env.cfg['dtype_n'], env.cfg['dtype_w']
    a = tvm.placeholder(shape, dtype_w, name='a')
    w = shape[0]
    e = 16

    def build_nms_ir(ten_in, ten_out):
        ib = tvm.ir_builder.create()
        imm_value = 10
        ib.scope_attr(env.nnpu_axis, "coproc_scope", 0)
        p_in = ib.buffer_ptr(ten_in[0])
        p_out = ib.buffer_ptr(ten_out[0])
        #with ib.for_range(0,w, name="k") as k:
        with ib.for_range(0, w // e, name="i") as i:
            ib.emit(
                make_intrin_call(
                    "void", 'VAddI', ten_out[0].access_ptr("w", 'uint32') +
                    i * dtype_bytes(dtype_w),
                    ten_in[0].access_ptr("r", 'uint32') +
                    i * dtype_bytes(dtype_w), tvm.const(imm_value, 'float64'),
                    env.cfg['vector_unit']['size'], 3))
        stmt = ib.get()
        return stmt

    sph = ScheduleProcHelper()
    a_buf, a_dram = nnpu.utils.CopyHtoBuf(a, 'a', sph)
    sph.MarkScope(a_buf)
    out = tvm.extern(a_buf.shape, [a_buf],
                     build_nms_ir,
                     in_buffers=[
                         tvm.decl_buffer(a_buf.shape,
                                         dtype_w,
                                         data_alignment=dtype_bytes(dtype_w),
                                         scope='local.nnpu_scratchpad0')
                     ],
                     out_buffers=[
                         tvm.decl_buffer(a_buf.shape,
                                         dtype_w,
                                         data_alignment=dtype_bytes(dtype_w),
                                         scope='local.nnpu_scratchpad0')
                     ],
                     dtype=dtype_w,
                     name="test_ir")
    sph.MarkScope(out)
    out_host, out_dram = nnpu.utils.CopyBufToH(out, 'out', sph)
    s = tvm.create_schedule([out_host.op])
    sph.Transform(s)
    print(tvm.lower(s, [a, out_host], simple_mode=True))
    print(nnpu.lower(s, [a, out_host], simple_mode=True))
    # exit(0)
    func = nnpu.build(s, [a, out_host], 'nnpu', 'llvm', name='nnpu_test')
    ctx = tvm.nd.TVMContext(13, 0)
    a_np = np.random.randint(size=(16, ), dtype=a.dtype, low=0, high=127)
    a_nd = tvm.nd.array(a_np, ctx)

    b_nd = tvm.nd.array(np.zeros(16, ).astype(out_host.dtype), ctx)

    func(a_nd, b_nd)

    print('a = ')
    print(a_np)
    print('xjb sum = ')
    print(b_nd.asnumpy())
    return
Example #52
def proposal(cls_prob, bbox_pred, im_info, scales, ratios, feature_stride, threshold,
             rpn_pre_nms_top_n, rpn_post_nms_top_n, rpn_min_size, iou_loss):
    """Proposal operator.

    Parameters
    ----------
    cls_prob : tvm.Tensor
        4-D with shape [batch, 2 * num_anchors, height, width]

    bbox_pred : tvm.Tensor
        4-D with shape [batch, 4 * num_anchors, height, width]

    im_info : tvm.Tensor
        2-D with shape [batch, 3]

    scales : list/tuple of float
        Scales of anchor windows.

    ratios : list/tuple of float
        Ratios of anchor windows.

    feature_stride : int
        The size of the receptive field of each unit in the convolution layer of the RPN,
        for example the product of all strides prior to this layer.

    threshold : float
        Non-maximum suppression threshold.

    rpn_pre_nms_top_n : int
        Number of top scoring boxes to keep before applying NMS; -1 to use all boxes.

    rpn_post_nms_top_n : int
        Number of top scoring boxes to keep after applying NMS to RPN proposals.

    rpn_min_size : int
        Minimum height or width in proposal.

    iou_loss : bool
        Usage of IoU loss.

    Returns
    -------
    out : tvm.Tensor
        2-D tensor with shape [batch * rpn_post_nms_top_n, 5]. The last dimension is in format of
        [batch_index, w_start, h_start, w_end, h_end].
    """

    batch, _, height, width = get_const_tuple(cls_prob.shape)
    num_anchors = len(scales) * len(ratios)
    num_bbox = height * width * num_anchors
    rpn_pre_nms_top_n = min(rpn_pre_nms_top_n, num_bbox) if rpn_pre_nms_top_n > 0 else num_bbox

    bbox = tvm.extern((batch, num_bbox, 5), [cls_prob, bbox_pred, im_info], lambda ins, outs:
                      predict_bbox_ir(ins[0], ins[1], ins[2], outs[0], scales, ratios,
                                      feature_stride, rpn_min_size, iou_loss),
                      dtype=bbox_pred.dtype)
    score = tvm.compute((batch, num_bbox), lambda b, i: bbox[b, i, 4], tag='bbox_score')
    sorted_index = tvm.extern([score.shape], [score],
                              lambda ins, outs: argsort_ir(ins[0], outs[0]),
                              dtype='int32')
    sorted_bbox = tvm.compute((batch, rpn_pre_nms_top_n, 5),
                              lambda b, i, j: bbox[b, sorted_index[b, i], j], tag='sorted_bbox')
    nms_remove_mask = tvm.extern((batch, rpn_pre_nms_top_n), [sorted_bbox],
                                 lambda ins, outs: nms_ir(ins[0], outs[0], threshold),
                                 dtype='bool')
    nms_out = tvm.extern((batch * rpn_post_nms_top_n, 5), [sorted_bbox, nms_remove_mask],
                         lambda ins, outs: prepare_output_ir(ins[0], ins[1], outs[0]),
                         dtype=sorted_bbox.dtype)
    return nms_out