Exemplo n.º 1
0
Arquivo: scan.py Projeto: vinx13/tvm
    def do_scan(data, output_dtype):
        target = tvm.target.Target.current()
        # TODO(masahi): Check -libs=thrust option
        if target and target.kind.name in ["cuda", "rocm"
                                           ] and is_thrust_available():
            return scan_thrust(data,
                               output_dtype,
                               exclusive=True,
                               return_reduction=return_reduction,
                               binop=binop)

        if ndim == 1:
            # TIR exclusive scan accepts only 2D or higher-rank inputs.
            data = expand_dims(data, axis=0)

        data_buf = tvm.tir.decl_buffer(data.shape,
                                       data.dtype,
                                       "data_buf",
                                       data_alignment=8)
        output_buf = tvm.tir.decl_buffer(data.shape,
                                         output_dtype,
                                         "output_buf",
                                         data_alignment=8)

        if return_reduction:
            output, reduction = te.extern(
                [data.shape, data.shape[:-1]],
                [data],
                lambda ins, outs: exclusive_scan_ir(
                    ins[0], outs[0], outs[1], binop=binop),
                dtype=[data.dtype, output_dtype],
                in_buffers=[data_buf],
                name="exclusive_scan",
                tag="exclusive_scan_gpu",
            )
        else:
            output = te.extern(
                [data.shape],
                [data],
                lambda ins, outs: exclusive_scan_ir(
                    ins[0], outs[0], binop=binop),
                dtype=[output_dtype],
                in_buffers=[data_buf],
                out_buffers=[output_buf],
                name="exclusive_scan",
                tag="exclusive_scan_gpu",
            )
            reduction = None

        if ndim == 1:
            output = squeeze(output, 0)
            if return_reduction:
                reduction = squeeze(reduction, 0)

        if return_reduction:
            return output, reduction

        return output
Exemplo n.º 2
0
    def do_scan(data, output_dtype):
        target = tvm.target.Target.current()
        if target and (can_use_thrust(target, "tvm.contrib.thrust.sum_scan")
                       or can_use_rocthrust(target,
                                            "tvm.contrib.thrust.sum_scan")):
            return scan_thrust(data,
                               output_dtype,
                               exclusive=True,
                               return_reduction=return_reduction,
                               binop=binop)

        if ndim == 1:
            # TIR exclusive scan accepts only 2D or higher-rank inputs.
            data = expand_dims(data, axis=0)

        data_buf = tvm.tir.decl_buffer(data.shape,
                                       data.dtype,
                                       "data_buf",
                                       data_alignment=8)
        output_buf = tvm.tir.decl_buffer(data.shape,
                                         output_dtype,
                                         "output_buf",
                                         data_alignment=8)

        if return_reduction:
            output, reduction = te.extern(
                [data.shape, data.shape[:-1]],
                [data],
                lambda ins, outs: exclusive_scan_ir(
                    ins[0], outs[0], outs[1], binop=binop),
                dtype=[data.dtype, output_dtype],
                in_buffers=[data_buf],
                name="exclusive_scan",
                tag="exclusive_scan_gpu",
            )
        else:
            output = te.extern(
                [data.shape],
                [data],
                lambda ins, outs: exclusive_scan_ir(
                    ins[0], outs[0], binop=binop),
                dtype=[output_dtype],
                in_buffers=[data_buf],
                out_buffers=[output_buf],
                name="exclusive_scan",
                tag="exclusive_scan_gpu",
            )
            reduction = None

        if ndim == 1:
            output = squeeze(output, 0)
            if return_reduction:
                reduction = squeeze(reduction, 0)

        if return_reduction:
            return output, reduction

        return output
Exemplo n.º 3
0
def argsort(data, valid_count=None, axis=-1, is_ascend=1, dtype="float32"):
    """Performs sorting along the given axis and returns an array of indicies
    having same shape as an input array that index data in sorted order.

    Parameters
    ----------
    data: tvm.te.Tensor
        The input array.

    valid_count : tvm.te.Tensor, optional
        The number of valid elements to be sorted.

    axis : int, optional
        Axis long which to sort the input tensor.

    is_ascend : boolean, optional
        Whether to sort in ascending or descending order.

    dtype : string, optional
        DType of the output indices.

    Returns
    -------
    out : tvm.te.Tensor
        The output of this function.
    """
    if valid_count is not None:
        sorted_data = identity(data)
        sorted_data_buf = tvm.tir.decl_buffer(
            data.shape, data.dtype, "sorted_data_buf", data_alignment=8
        )
        valid_count_buf = tvm.tir.decl_buffer(
            valid_count.shape, valid_count.dtype, "valid_count_buf", data_alignment=4
        )
        out_buf = tvm.tir.decl_buffer(data.shape, "int32", "out_buf", data_alignment=4)
        out = te.extern(
            [data.shape],
            [sorted_data, valid_count],
            lambda ins, outs: sort_nms_ir(ins[0], ins[1], outs[0], axis, is_ascend),
            dtype="int32",
            in_buffers=[sorted_data_buf, valid_count_buf],
            out_buffers=[out_buf],
            name="argsort_nms_gpu",
            tag="argsort_nms_gpu",
        )
    else:
        value_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "value_buf", data_alignment=8)
        indices_buf = tvm.tir.decl_buffer(data.shape, dtype, "out_buf", data_alignment=8)
        out = te.extern(
            [data.shape, data.shape],
            [data],
            lambda ins, outs: sort_ir(ins[0], outs[0], axis, is_ascend, indices_out=outs[1]),
            out_buffers=[value_buf, indices_buf],
            name="argsort_gpu",
            tag="argsort_gpu",
        )[1]
    return out
Exemplo n.º 4
0
def test_add_pipeline():
    nn = 64
    max_threads = 4
    n = tvm.runtime.convert(nn)
    A = te.placeholder((n, ), name='A')

    def extern_generator(ins, outs):
        """Manually write the IR for the extern function, add pipeline"""
        ib = tvm.tir.ir_builder.create()
        with ib.for_range(0, (n + 1) // 2) as i:
            ib.emit(outs[0].vstore(
                i * 2, ins[0].vload(i * 2, "float32x2") +
                tvm.tir.const(1, "float32x2")))
        return ib.get()

    def extern_generator_gpu(ins, outs):
        """Manually write the IR for the extern function, add pipeline"""
        ib = tvm.tir.ir_builder.create()
        bx = te.thread_axis("blockIdx.x")
        tx = te.thread_axis("threadIdx.x")
        ib.scope_attr(bx, "thread_extent",
                      (nn + max_threads - 1) // max_threads)
        ib.scope_attr(tx, "thread_extent", max_threads)
        idx = bx.var * max_threads + tx.var
        with ib.if_scope(ib.likely(idx < n)):
            ib.emit(outs[0].vstore(
                idx * 2, ins[0].vload(idx * 2, "float32x2") +
                tvm.tir.const(1, "float32x2")))
        return ib.get()

    C_cpu = te.extern(A.shape, [A], extern_generator, name='C')
    C_gpu = te.extern(A.shape, [A], extern_generator_gpu, name='C')
    s_cpu = te.create_schedule(C_cpu.op)
    s_gpu = te.create_schedule(C_gpu.op)
    print(tvm.lower(s_cpu, [A, C_cpu], simple_mode=True))
    print(tvm.lower(s_gpu, [A, C_gpu], simple_mode=True))

    def check_target(target):
        if not tvm.runtime.enabled(target):
            return
        s = s_gpu if target in ['opencl', 'cuda'] else s_cpu
        C = C_gpu if target in ['opencl', 'cuda'] else C_cpu
        # build and invoke the kernel.
        f = tvm.build(s, [A, C], target)
        ctx = tvm.context(target, 0)
        # launch the kernel.
        n = nn
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
        f(a, c)
        tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + 1)

    check_target("llvm")
    check_target("opencl")
    check_target("cuda")
Exemplo n.º 5
0
def test_pack_buffer_simple():
    nn = 1024
    n = tvm.runtime.convert(nn)
    A = te.placeholder((n, ), name='A')

    def extern_generator(ins, outs):
        """Manually write the IR for the extern function, add pipeline."""
        return tvm.tir.call_packed("my_extern_array_func1", ins[0], outs[0])

    C = te.extern(A.shape, [A], extern_generator, name='C')
    s = te.create_schedule(C.op)

    @tvm.register_func
    def my_extern_array_func1(aa, bb):
        aa.copyto(bb)

    def check_target(target):
        if not tvm.runtime.enabled(target):
            return
        # build and invoke the kernel.
        f = tvm.build(s, [A, C], target)
        ctx = tvm.cpu(0)
        # launch the kernel.
        n = nn
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)

        f(a, c)
        tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy())

    check_target("stackvm")
    check_target("llvm")
Exemplo n.º 6
0
def _get_valid_box_count(scores, score_threshold):
    batch_classes, num_boxes = scores.shape

    def searchsorted_ir(scores, valid_count):
        ib = tvm.tir.ir_builder.create()
        scores = ib.buffer_ptr(scores)
        valid_count = ib.buffer_ptr(valid_count)

        with ib.for_range(0, batch_classes, name="i", kind="parallel") as i:
            binary_search(ib, i, num_boxes, scores, score_threshold,
                          valid_count)

        return ib.get()

    scores_buf = tvm.tir.decl_buffer(scores.shape,
                                     scores.dtype,
                                     "scores_buf",
                                     data_alignment=8)

    return te.extern(
        [(batch_classes, )],
        [scores],
        lambda ins, outs: searchsorted_ir(ins[0], outs[0]),
        dtype=["int32"],
        in_buffers=[scores_buf],
        name="searchsorted",
        tag="searchsorted",
    )
Exemplo n.º 7
0
def uniform(low, high, size):
    """Draw samples from a uniform distribution.

    Samples are uniformly distributed over the half-open interval [low, high)
    (includes low, but excludes high). In other words, any value within the
    given interval is equally likely to be drawn by uniform.

    Parameters
    ----------
    low : float
        Lower boundary of the output interval. All values generated will be
        greater than or equal to low.
    high : float
        Upper boundary of the output interval. All values generated will be
        less than high.
    size : tuple of ints
        Output shape. If the given shape is, e.g., (m, n, k), then m * n * k
        samples are drawn.

    Returns
    -------
    out : Tensor
        A tensor with specified size and dtype.
    """
    return te.extern(
        size,
        [],
        lambda ins, outs: tvm.tir.call_packed(
            "tvm.contrib.random.uniform", float(low), float(high), outs[0]),
        dtype="float32",
    )
Exemplo n.º 8
0
def randint(low, high, size, dtype="int32"):
    """Return random integers from low (inclusive) to high (exclusive).
    Return random integers from the "discrete uniform" distribution of the
    specified dtype in the "half-open" interval [low, high).

    Parameters
    ----------
    low : int
        Lowest (signed) integer to be drawn from the distribution
    high : int
        One above the largest (signed) integer to be drawn from the distribution

    Returns
    -------
    out : Tensor
        A tensor with specified size and dtype
    """
    assert "int" in dtype, "the type of randint output must be int or uint"
    return te.extern(
        size,
        [],
        lambda ins, outs: tvm.tir.call_packed("tvm.contrib.random.randint",
                                              int(low), int(high), outs[0]),
        dtype=dtype,
    )
Exemplo n.º 9
0
Arquivo: nms.py Projeto: jchia/tvm
def _get_sorted_indices(data, data_buf, score_index, score_shape):
    """Extract a 1D score tensor from the packed input and do argsort on it."""
    score_buf = tvm.tir.decl_buffer(score_shape,
                                    data.dtype,
                                    "score_buf",
                                    data_alignment=8)
    score_tensor = te.extern(
        [score_shape],
        [data],
        lambda ins, outs: _fetch_score_ir(
            ins[0],
            outs[0],
            score_index,
        ),
        dtype=[data.dtype],
        in_buffers=[data_buf],
        out_buffers=[score_buf],
        name="fetch_score",
        tag="fetch_score",
    )

    if is_thrust_available():
        sort_tensor = argsort_thrust(score_tensor,
                                     axis=1,
                                     is_ascend=False,
                                     dtype="int32")
    else:
        sort_tensor = argsort(score_tensor,
                              axis=1,
                              is_ascend=False,
                              dtype="int32")

    return sort_tensor
Exemplo n.º 10
0
    def check_target(target, ir):
        dtype = "float32"
        A = te.placeholder((n, ), name="A", dtype=dtype)
        B = te.placeholder((n, ), name="B", dtype=dtype)

        C = te.extern(
            (n, ),
            [A, B],
            lambda ins, outs: ir(ins[0], ins[1], outs[0]),
            name="while_vectorize",
            dtype=dtype,
        )
        s = te.create_schedule(C.op)

        with tvm.transform.PassContext(opt_level=3):
            func = tvm.build(s, [A, B, C], target)

        dev = tvm.device(target, 0)
        a_np = np.random.uniform(size=n).astype(A.dtype)
        b_np = np.random.uniform(size=n).astype(B.dtype)
        a = tvm.nd.array(a_np, dev)
        b = tvm.nd.array(b_np, dev)
        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev)
        func(a, b, c)
        ref = num_iter * (a_np + b_np)
        tvm.testing.assert_allclose(c.numpy(), ref, rtol=1e-5, atol=1e-5)
Exemplo n.º 11
0
    def mod(self, target, load_type, store_type, indirect_indices):
        target = tvm.target.Target(target)

        n = 4
        dtype = "int32"
        A = te.placeholder((n, ), dtype=dtype, name="A")
        R = te.placeholder((n, ), dtype=dtype, name="R")

        def do_compute(ins, outs):
            ib = tvm.tir.ir_builder.create()
            A, R = map(ib.buffer_ptr, ins)
            B = ib.buffer_ptr(outs[0])

            if "gpu" in target.keys:
                ib.scope_attr(te.thread_axis("blockIdx.x"), "thread_extent", 0)

            index_map = {
                "ramp": tvm.tir.Ramp(0, 1, 4),
                "broadcast": tvm.tir.Broadcast(0, 4),
            }

            load_index = index_map[load_type]
            store_index = index_map[store_type]

            if indirect_indices:
                load_index = R[load_index]

            B[store_index] = A[load_index]

            return ib.get()

        B = te.extern(A.shape, [A, R], do_compute, dtype="int32")
        s = te.create_schedule(B.op)

        return tvm.lower(s, [A, R, B])
Exemplo n.º 12
0
Arquivo: unique.py Projeto: Xuxue1/tvm
def _calc_adjacent_diff(data, out_dtype="int32", binop=tir.Sub):
    """Function calculate adjacent difference in an 1-D array.

    Parameters
    ----------
    data : tvm.te.Tensor
        Input 1-D tensor.

    output_dtype : str
        The output tensor data type.

    binop: function, optional
        A binary associative op to use for calculating difference. The function takes two
        TIR expressions and produce a new TIR expression. By default it uses tvm.tir.Sub to
        compute the adjacent difference.

    Returns
    -------
    output : tvm.te.Tensor
        1-D tensor storing the adjacent difference of the input tensor. The adjacent difference
        is defined as: output[0] = 0, output[i] = binop(data[i], data[i-1])
        where i > 0 and i < len(data).
    """
    return te.extern(
        [data.shape],
        [data],
        lambda ins, outs: _calc_adjacent_diff_ir(ins[0], outs[0], binop=binop),
        dtype=[out_dtype],
        name="_calc_adjacent_diff",
        tag="_calc_adjacent_diff_cpu",
    )
Exemplo n.º 13
0
Arquivo: sort.py Projeto: zkzt/tvm
def argsort_nms_thrust(data, valid_count, axis=-1, is_ascend=1, dtype="float32"):
    """Performs sorting along the given axis and returns an array of indicies
    having same shape as an input array that index data in sorted order.

    Parameters
    ----------
    data: tvm.te.Tensor
        The input array.

    valid_count : tvm.te.Tensor, optional
        The number of valid elements to be sorted.

    axis : int, optional
        Axis long which to sort the input tensor.

    is_ascend : boolean, optional
        Whether to sort in ascending or descending order.

    dtype : string, optional
        DType of the output indices.

    Returns
    -------
    out : tvm.te.Tensor
        The output of this function.
    """
    ndim = len(data.shape)
    if axis < 0:
        axis = ndim + axis
    if axis != ndim - 1:
        # Prepare for sorting along axis -1.
        axes = swap(list(range(ndim)), axis)
        data = transpose(data, axes)

    data_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "data_buf", data_alignment=8)
    valid_count_buf = tvm.tir.decl_buffer(
        valid_count.shape, valid_count.dtype, "valid_count_buf", data_alignment=4
    )
    out_bufs = [
        tvm.tir.decl_buffer(data.shape, data.dtype, "value_buf", data_alignment=8),
        tvm.tir.decl_buffer(data.shape, "int32", "indices_buf", data_alignment=8),
    ]
    out = te.extern(
        [data.shape, data.shape],
        [data, valid_count],
        lambda ins, outs: tvm.tir.call_packed(
            "tvm.contrib.thrust.sort_nms", ins[0], ins[1], outs[0], outs[1], is_ascend
        ),
        in_buffers=[data_buf, valid_count_buf],
        out_buffers=out_bufs,
        dtype=[data.dtype, "int32"],
        name="nms_argsort_gpu",
        tag="nms_argsort_gpu",
    )

    if axis != ndim - 1:
        axes = swap(list(range(ndim)), axis)
        out = [transpose(o, axes) for o in out]

    return out[1]
Exemplo n.º 14
0
def test_pack_buffer_intermediate():
    nn = 1024
    n = tvm.runtime.convert(nn)
    A = te.placeholder((n, ), name="A")
    B = te.compute((n, ), lambda i: A[i] + 1, name="B")

    def extern_generator(ins, outs):
        """Manually write the IR for the extern function, add pipeline."""
        return tvm.tir.call_packed("my_extern_array_func2", ins[0], outs[0])

    C = te.extern(B.shape, [B], extern_generator, name="C")
    s = te.create_schedule(C.op)

    def check_target(target):
        if not tvm.testing.device_enabled(target):
            return
        # build and invoke the kernel.
        f = tvm.build(s, [A, C], target)
        dev = tvm.cpu(0)
        # launch the kernel.
        n = nn
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev)

        @tvm.register_func
        def my_extern_array_func2(aa, bb):
            assert aa.shape == a.shape
            tvm.testing.assert_allclose(aa.numpy(), a.numpy() + 1)
            aa.copyto(bb)

        f(a, c)
        tvm.testing.assert_allclose(c.numpy(), a.numpy() + 1)

    check_target("llvm")
Exemplo n.º 15
0
def fully_connected_inference(lhs, rhs, nthreads=1):
    """Create an extern op that compute fully connected of 1D tensor lhs and
    2D tensor rhs with nnpack.

    Parameters
    ----------
    lhs : Tensor
        lhs 1D array input[input_channels] of FP32 elements
    rhs : Tensor
        lhs 2D matrix kernel[output_channels][input_channels] of FP32 elements

    Returns
    -------
    C : Tensor
        lhs 1D array out[output_channels] of FP32 elements.
    """
    m = rhs.shape[0]
    return te.extern(
        (m, ),
        [lhs, rhs],
        lambda ins, outs: tvm.tir.call_packed(
            "tvm.contrib.nnpack.fully_connected_inference", ins[0], ins[1],
            outs[0], nthreads),
        name="C",
    )
Exemplo n.º 16
0
def _get_sorted_indices(data, data_buf, score_index, score_shape):
    """Extract a 1D score tensor from the packed input and do argsort on it."""
    score_buf = tvm.tir.decl_buffer(score_shape, data.dtype, "score_buf", data_alignment=8)
    score_tensor = te.extern(
        [score_shape],
        [data],
        lambda ins, outs: _fetch_score_ir(
            ins[0],
            outs[0],
            score_index,
        ),
        dtype=[data.dtype],
        in_buffers=[data_buf],
        out_buffers=[score_buf],
        name="fetch_score",
        tag="fetch_score",
    )

    target = tvm.target.Target.current()
    # TODO(masahi): Check -libs=thrust option
    if target and target.kind.name in ["cuda", "rocm"] and is_thrust_available():
        sort_tensor = argsort_thrust(score_tensor, axis=1, is_ascend=False, dtype="int32")
    else:
        sort_tensor = argsort(score_tensor, axis=1, is_ascend=False, dtype="int32")

    return sort_tensor
Exemplo n.º 17
0
def conv2d(data, weight, pad="SAME", stride=1):
    """
    Create an extern op that compute data * weight and return result in output

    Parameters:
    ----------
    data: Tensor
        The input data, format NHWC
    weight: Tensor
        The conv weight, format output_feature * kH * kW * input_feature
    pad: str
        Padding method, 'SAME' or 'VALID'
    stride: int
        convolution stride

    Returns
    -------
    output: Tensor
        The result tensor
    """
    n, hi, wi, ci = data.shape
    co, kh, kw, ciw = weight.shape
    padding = 0 if pad == "SAME" else 1
    ho = hi // stride
    wo = wi // stride

    return te.extern(
        (n, ho, wo, co),
        [data, weight],
        lambda ins, outs: tvm.tir.call_packed("tvm.contrib.mps.conv2d", ins[
            0], ins[1], outs[0], padding, stride),
        name="C",
    )
Exemplo n.º 18
0
Arquivo: sort.py Projeto: zkzt/tvm
def sort(data, axis=-1, is_ascend=1):
    """Performs sorting along the given axis and returns an array of
    sorted values with the same shape as the input data.

    Parameters
    ----------
    data: tvm.te.Tensor
        The input array.

    axis : int, optional
        Axis long which to sort the input tensor.

    is_ascend : boolean, optional
        Whether to sort in ascending or descending order.

    Returns
    -------
    out : tvm.te.Tensor
        The output of this function.
    """
    value_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "value_buf", data_alignment=8)
    value_buf_swap = tvm.tir.decl_buffer(data.shape, data.dtype, "value_buf_swap", data_alignment=8)
    out = te.extern(
        [data.shape, data.shape],
        [data],
        lambda ins, outs: sort_ir(ins[0], outs[0], outs[1], axis, is_ascend),
        out_buffers=[value_buf, value_buf_swap],
        name="sort_gpu",
        tag="sort_gpu",
    )[0]
    return out
Exemplo n.º 19
0
def convolution_inference_without_weight_transform(
        data,
        transformed_kernel,
        bias,
        padding,
        stride,
        nthreads=1,
        algorithm=ConvolutionAlgorithm.AUTO):
    """Create an extern op to do inference convolution of 4D tensor data and
    4D pre-transformed tensor kernel and 1D tensor bias with nnpack.

    Parameters
    ----------
    data : Tensor
        data 4D tensor input[batch][input_channels][input_height][input_width] of
        FP32 elements.
    transformed_kernel : Tensor
        transformed_kernel 4D tensor kernel[output_channels][input_channels][tile]
        [tile] of FP32 elements.
    bias : Tensor
        bias 1D array bias[output_channels][input_channels][kernel_height]
        [kernel_width] of FP32 elements.
    padding : list
        padding A 4-dim list of [pad_top, pad_bottom, pad_left, pad_right],
        which indicates the padding around the feature map.
    stride : list
        stride A 2-dim list of [stride_height, stride_width], which indicates
        the stride.

    Returns
    -------
    output : Tensor
        output 4D tensor output[batch][output_channels][output_height][output_width]
        of FP32 elements.
    """

    assert algorithm in (ConvolutionAlgorithm.WT_8x8,
                         ConvolutionAlgorithm.WT_8x8_FP16)
    assert isinstance(padding, list) and len(padding) == 4
    assert isinstance(stride, list) and len(stride) == 2
    batch, _, input_height, input_width = data.shape
    output_channels, _, _, _ = transformed_kernel.shape
    kernel_height, kernel_width = (3, 3)
    idxdiv = te.indexdiv
    output_height = idxdiv(
        input_height + padding[0] + padding[1] - kernel_height, stride[0]) + 1
    output_width = idxdiv(input_width + padding[0] + padding[1] - kernel_width,
                          stride[1]) + 1

    return te.extern(
        (batch, output_channels, output_height, output_width),
        [data, transformed_kernel, bias]
        if bias is not None else [data, transformed_kernel],
        lambda ins, outs: tvm.tir.call_packed(
            "tvm.contrib.nnpack.convolution_inference_without_weight_transform",
            ins[0], ins[1], ins[2]
            if bias is not None else 0, outs[0], padding[0], padding[1],
            padding[2], padding[3], stride[0], stride[1], nthreads, algorithm),
        name="C",
        dtype='float32')
Exemplo n.º 20
0
def _get_sorted_indices(data, data_buf, score_index, score_shape):
    """Extract a 1D score tensor from the packed input and do argsort on it."""
    score_buf = tvm.tir.decl_buffer(score_shape, data.dtype, "score_buf", data_alignment=8)
    score_tensor = te.extern(
        [score_shape],
        [data],
        lambda ins, outs: _fetch_score_ir(
            ins[0],
            outs[0],
            score_index,
        ),
        dtype=[data.dtype],
        in_buffers=[data_buf],
        out_buffers=[score_buf],
        name="fetch_score",
        tag="fetch_score",
    )

    target = tvm.target.Target.current()
    if target and (
        can_use_thrust(target, "tvm.contrib.thrust.sort")
        or can_use_rocthrust(target, "tvm.contrib.thrust.sort")
    ):
        sort_tensor = argsort_thrust(score_tensor, axis=1, is_ascend=False, dtype="int32")
    else:
        sort_tensor = argsort(score_tensor, axis=1, is_ascend=False, dtype="int32")

    return sort_tensor
Exemplo n.º 21
0
def convolution_inference_weight_transform(kernel,
                                           nthreads=1,
                                           algorithm=ConvolutionAlgorithm.AUTO,
                                           dtype='float32'):
    """Create an extern op to do inference convolution of 3D tensor data and
    4D tensor kernel and 1D tensor bias with nnpack.

    Parameters
    ----------
    kernel : Tensor
        kernel 4D tensor kernel[output_channels][input_channels][kernel_height]
        [kernel_width] of FP32 elements.

    Returns
    -------
    output : Tensor
        output 4D tensor output[output_channels][input_channels][tile][tile]
        of FP32 elements.
    """
    assert algorithm in (ConvolutionAlgorithm.WT_8x8,
                         ConvolutionAlgorithm.WT_8x8_FP16)
    output_channels, input_channels, _, _ = kernel.shape
    transform_tile_size = 8
    if not isinstance(dtype, str):
        dtype = dtype.dtype
    return te.extern(
        (output_channels, input_channels, transform_tile_size,
         transform_tile_size), [kernel],
        lambda ins, outs: tvm.tir.call_packed(
            "tvm.contrib.nnpack.convolution_inference_weight_transform", ins[
                0], outs[0], nthreads, algorithm),
        name="transform_kernel",
        dtype=dtype)
Exemplo n.º 22
0
def test_shared_mem_alloc(target, dev):
    alloc_nbytes = 16384 * 2

    def do_compute(ins, outs):
        ib = tvm.tir.ir_builder.create()
        out = ib.buffer_ptr(outs[0])

        ib.scope_attr(te.thread_axis("blockIdx.x"), "thread_extent", 0)

        array = ib.allocate("int32", (alloc_nbytes, ),
                            name="array",
                            scope="shared")
        array[0] = 0
        out[0] = array[0]

        return ib.get()

    Out = te.extern(
        shape=(1, ),
        inputs=[],
        fcompute=do_compute,
        dtype="int32",
    )
    s = te.create_schedule(Out.op)

    # Codegen should raise error when allocating more memory than the
    # target supports.
    with pytest.raises(tvm.TVMError):
        tvm.build(s, [Out], target)
Exemplo n.º 23
0
def test_sort_np():
    dshape = (1, 2, 3, 4, 5, 6)
    axis = 4
    reduced_shape = (1, 2, 3, 4, 6)
    is_ascend = True
    data = te.placeholder(dshape, name="data")
    sort_num = te.placeholder(reduced_shape, name="sort_num", dtype="int32")
    out = te.extern(
        data.shape,
        [data, sort_num],
        lambda ins, outs: tvm.tir.
        call_packed("tvm.contrib.sort.argsort_nms", ins[0], ins[1], outs[0],
                    axis, is_ascend),
        dtype="int32",
        name="sort_tensor",
    )

    ctx = tvm.cpu(0)
    target = "llvm"
    s = te.create_schedule(out.op)
    f = tvm.build(s, [data, sort_num, out], target)

    np_data = np.random.uniform(size=dshape)
    np_out = np.argsort(np_data, axis=axis)
    sort_num_input = np.full(reduced_shape, dshape[axis])
    a = tvm.nd.array(np.array(np_data).astype(data.dtype), ctx)
    b = tvm.nd.array(np.array(sort_num_input).astype(sort_num.dtype), ctx)
    c = tvm.nd.array(np.zeros(a.shape, dtype=out.dtype), ctx)
    f(a, b, c)
    tvm.testing.assert_allclose(c.asnumpy(), np_out, rtol=1e-5)
Exemplo n.º 24
0
    def check_target(target, ir):
        if not tvm.testing.device_enabled(target):
            return

        C = te.extern(
            A.shape,
            [A, B],
            lambda ins, outs: ir(ins[0], ins[1], outs[0], n),
            name="searchsorted_ir",
            dtype="int32",
        )
        s = te.create_schedule(C.op)

        with tvm.transform.PassContext(opt_level=3):
            func = tvm.build(s, [A, B, C], target)

        dev = tvm.device(target, 0)
        a_np = np.random.uniform(size=n).astype(A.dtype)
        b_np = np.random.uniform(size=n).astype(B.dtype)
        a_np = np.sort(a_np)
        a = tvm.nd.array(a_np, dev)
        b = tvm.nd.array(b_np, dev)
        c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev)
        func(a, b, c)
        ref = np.searchsorted(a_np, b_np)
        tvm.testing.assert_allclose(c.numpy(), ref)
Exemplo n.º 25
0
def matmul(lhs, rhs, transa=False, transb=False, **kwargs):
    """Create an extern op that compute matrix mult of A and rhs with CrhsLAS
    This function serves as an example on how to call external libraries.

    Parameters
    ----------
    lhs: Tensor
        The left matrix operand
    rhs: Tensor
        The right matrix operand
    transa: bool
        Whether transpose lhs
    transb: bool
        Whether transpose rhs

    Returns
    -------
    C: Tensor
        The result tensor.
    """
    n = lhs.shape[1] if transa else lhs.shape[0]
    m = rhs.shape[0] if transb else rhs.shape[1]
    return te.extern(
        (n, m),
        [lhs, rhs],
        lambda ins, outs: tvm.tir.call_packed("tvm.contrib.cblas.matmul", ins[
            0], ins[1], outs[0], transa, transb),
        name="C",
        **kwargs,
    )
Exemplo n.º 26
0
def matmul(lhs, rhs, transa=False, transb=False):
    """Create an extern op that compute matrix mult of A and rhs with rocBLAS

    Parameters
    ----------
    lhs : Tensor
        The left matrix operand
    rhs : Tensor
        The right matrix operand
    transa : bool
        Whether transpose lhs
    transb : bool
        Whether transpose rhs

    Returns
    -------
    C : Tensor
        The result tensor.
    """
    n = lhs.shape[1] if transa else lhs.shape[0]
    m = rhs.shape[0] if transb else rhs.shape[1]
    return te.extern((n, m), [lhs, rhs],
                     lambda ins, outs: tvm.tir.call_packed(
                         "tvm.contrib.rocblas.matmul", ins[0], ins[1], outs[0],
                         transa, transb),
                     name="C")
Exemplo n.º 27
0
def normal(loc, scale, size):
    """Draw samples from a normal distribution.

    Return random samples from a normal distribution.

    Parameters
    ----------
    loc : float
        loc of the distribution.
    scale : float
        Standard deviation of the distribution.
    size : tuple of ints
        Output shape. If the given shape is, e.g., (m, n, k), then m * n * k
        samples are drawn.

    Returns
    ------
    out : Tensor
        A tensor with specified size and dtype
    """
    return te.extern(
        size,
        [],
        lambda ins, outs: tvm.tir.call_packed(
            "tvm.contrib.random.normal", float(loc), float(scale), outs[0]),
        dtype="float32",
    )
Exemplo n.º 28
0
def batch_matmul(lhs, rhs, transa=False, transb=False, dtype=None):
    """Create an extern op that compute batch matrix mult of A and rhs with cuBLAS

    Parameters
    ----------
    lhs : Tensor
        The left matrix operand
    rhs : Tensor
        The right matrix operand
    transa : bool
        Whether transpose lhs
    transb : bool
        Whether transpose rhs

    Returns
    -------
    C : Tensor
        The result tensor.
    """
    b = lhs.shape[0]
    n = lhs.shape[2] if transa else lhs.shape[1]
    m = rhs.shape[1] if transb else rhs.shape[2]
    dtype = dtype if dtype is not None else lhs.dtype
    return te.extern(
        (b, n, m),
        [lhs, rhs],
        lambda ins, outs: tvm.tir.call_packed(
            "tvm.contrib.cublas.batch_matmul", ins[0], ins[1], outs[0], transa,
            transb),
        dtype=dtype,
        name="batch_matmul_cublas",
    )
Exemplo n.º 29
0
def test_tensor_scalar_mixed():
    # test te with tensor and scalar
    a = np.array(np.random.uniform(size=(10, )), "float32")
    b = np.array(np.random.uniform(size=(1))[0], "float32")
    c = np.array(np.random.uniform(size=(10, )), "float32")

    @tvm.register_func("tvm.test_tensor_scalar_scale")
    def my_scale(tensor, scalar, out):
        out_np = tensor.numpy() * scalar.numpy()
        tvm.nd.array(out_np).copyto(out)

    A = te.placeholder(a.shape, name="A")
    B = te.placeholder(b.shape, name="B")
    C = te.extern(
        a.shape,
        [A, B],
        lambda ins, outs: tvm.tir.call_packed("tvm.test_tensor_scalar_scale",
                                              ins[0], ins[1], outs[0]),
        name="C",
    )
    s = te.create_schedule(C.op)
    f = tvm.build(s, [A, B, C], "llvm")

    ta = tvm.nd.array(a)
    tb = tvm.nd.array(b)
    tc = tvm.nd.array(c)
    f(ta, tb, tc)
    tvm.testing.assert_allclose(a * b, tc.numpy())
Exemplo n.º 30
0
def stable_sort_by_key_thrust(keys, values, for_scatter=False):
    """Sort values with respect to keys using thrust.
    Both keys and values will be sorted and returned.
    Sorting is done via stable sort, so relative ordering among
    ties are preserved.

    Parameters
    ----------
    keys: tvm.te.Tensor
        The 1D input keys.

    values : tvm.te.Tensor,
        The 1D input values.

    for_scatter: bool, optional
        If True, negative keys are interpreted as negative indices.
        Before sorting, negative indices are converted to corresponding positive indices.
        The output keys (indices) are all positive.
        This option is introduced to optimize the scatter implementation.

    Returns
    -------
    keys_sorted : tvm.te.Tensor
        The sorted keys

    values_sorted : tvm.te.Tensor
        The values sorted with respect to the keys
    """
    keys_buf = tvm.tir.decl_buffer(keys.shape,
                                   keys.dtype,
                                   "keys_buf",
                                   data_alignment=8)
    values_buf = tvm.tir.decl_buffer(values.shape,
                                     values.dtype,
                                     "values_buf",
                                     data_alignment=8)
    out_bufs = [
        tvm.tir.decl_buffer(keys.shape,
                            keys.dtype,
                            "keys_buf",
                            data_alignment=8),
        tvm.tir.decl_buffer(keys.shape,
                            values.dtype,
                            "values_buf",
                            data_alignment=8),
    ]
    out = te.extern(
        [keys.shape, values.shape],
        [keys, values],
        lambda ins, outs: tvm.tir.call_packed(
            "tvm.contrib.thrust.stable_sort_by_key", ins[0], ins[1], outs[0],
            outs[1], for_scatter),
        in_buffers=[keys_buf, values_buf],
        out_buffers=out_bufs,
        dtype=[keys.dtype, values.dtype],
        name="stable_sort_by_key",
        tag="stable_sort_by_key",
    )
    return out[0], out[1]