Example #1
    def _get_pixel_value(n, c, h, w):
        if padding_mode == "zeros":
            return te.if_then_else(
                te.all(h >= 0, w >= 0, h < in_height, w < in_width),
                data[n, c, h, w],
                tir.const(0.0, dtype=data.dtype),
            )
        if padding_mode == "border":
            h_b = te.max(te.min(h, in_height - 1), 0)
            w_b = te.max(te.min(w, in_width - 1), 0)
            return data[n, c, h_b, w_b]

        raise AssertionError("unsupported padding_mode")
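This helper appears to come from a grid_sample-style compute, where data, in_height, in_width and padding_mode are captured from the enclosing scope. Below is a minimal, self-contained sketch of the same pattern; the shapes and the "shifted" compute are illustrative, not part of the original operator.

from tvm import te, tir

in_height, in_width = 8, 8                      # illustrative sizes
data = te.placeholder((1, 3, in_height, in_width), name="data")

def get_pixel_value(n, c, h, w, padding_mode="zeros"):
    if padding_mode == "zeros":
        # Out-of-range reads return 0.
        return te.if_then_else(
            te.all(h >= 0, w >= 0, h < in_height, w < in_width),
            data[n, c, h, w],
            tir.const(0.0, dtype=data.dtype),
        )
    # "border": clamp indices into the valid range.
    h_b = te.max(te.min(h, in_height - 1), 0)
    w_b = te.max(te.min(w, in_width - 1), 0)
    return data[n, c, h_b, w_b]

# Shift the image by one pixel; reads that fall outside follow the padding rule.
shifted = te.compute(
    (1, 3, in_height, in_width),
    lambda n, c, h, w: get_pixel_value(n, c, h - 1, w - 1),
    name="shifted",
)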
Example #2
def batch_matmul(x, y, oshape=None, auto_scheduler_rewritten_layout=""):
    """Computes batch matrix multiplication of `x` and `y` when `x` and `y` are
    data in batch. Supports broadcasting for batch dimension.

    Parameters
    ----------
    x : tvm.te.Tensor
        3-D with shape [batch, M, K]

    y : tvm.te.Tensor
        3-D with shape [batch, N, K]

    oshape : List[Optional]
        Explicit intended output shape of the computation. Can be useful in cases
        with dynamic input shapes.

    auto_scheduler_rewritten_layout: str = ""
        The layout after auto-scheduler's layout rewrite pass.

    Returns
    -------
    output : tvm.te.Tensor
        3-D with shape [batch, M, N]
    """
    x_shape = get_const_tuple(x.shape)
    if auto_scheduler_rewritten_layout:
        # Infer shape for the rewritten layout
        y_shape = auto_scheduler.get_shape_from_rewritten_layout(
            auto_scheduler_rewritten_layout, ["b", "j", "k"])
        auto_scheduler.remove_index_check(y)
    else:
        y_shape = get_const_tuple(y.shape)
    assert len(x_shape) == 3 and len(y_shape) == 3, "only support 3-dim batch_matmul"

    XB = x_shape[0]
    YB = y_shape[0]
    _, M, K = x.shape
    k = te.reduce_axis((0, K), name="k")
    if oshape is None:
        assert XB == YB or XB == 1 or YB == 1, "batch dimension doesn't match"
        assert x_shape[2] == y_shape[2], "shapes of x and y are inconsistent"
        batch = te.max(XB, YB)
        N = y.shape[1]
        oshape = (batch, M, N)

    output = te.compute(
        oshape,
        lambda b, i, j: te.sum(
            x[b if XB != 1 else 0, i, k] * y[b if YB != 1 else 0, j, k], axis=k
        ),
        tag="batch_matmul",
        attrs={"layout_free_placeholders": [y]},
    )

    if auto_scheduler_rewritten_layout:
        output = auto_scheduler.rewrite_compute_body(
            output, auto_scheduler_rewritten_layout)

    return output
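A hedged end-to-end check of the broadcasting behaviour above, assuming topi.nn.batch_matmul (the public entry point for this compute) is available and follows the [batch, M, K] x [batch, N, K] convention documented here:

import numpy as np
import tvm
from tvm import te, topi

x = te.placeholder((1, 4, 8), name="x")   # batch of 1 broadcasts against batch of 3
y = te.placeholder((3, 5, 8), name="y")
out = topi.nn.batch_matmul(x, y)          # -> shape (3, 4, 5)

s = te.create_schedule(out.op)
f = tvm.build(s, [x, y, out], "llvm")

a = np.random.rand(1, 4, 8).astype("float32")
b = np.random.rand(3, 5, 8).astype("float32")
c = tvm.nd.empty((3, 4, 5), "float32")
f(tvm.nd.array(a), tvm.nd.array(b), c)
np.testing.assert_allclose(c.numpy(), np.matmul(a, b.transpose(0, 2, 1)), rtol=1e-5)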
Example #3
    def check_target(device, m, n):
        dev = tvm.device(device, 0)
        if not tvm.testing.device_enabled(device):
            print("skip because %s is not enabled.." % device)
            return

        # compute
        placeholder_a = te.placeholder((m, n), name="A")
        axis_k = te.reduce_axis((0, n))
        placeholder_b = te.compute(
            (m,), lambda i: te.max(placeholder_a[i][axis_k], axis=axis_k), name="B"
        )
        schedule = te.create_schedule(placeholder_b.op)

        # schedule
        axis_k = schedule[placeholder_b].op.reduce_axis[0]
        axis_ko, _ = schedule[placeholder_b].split(axis_k, nparts=nthx)
        schedule[placeholder_b].bind(axis_ko, thread_x)
        axis_xo, axis_xi = schedule[placeholder_b].split(
            schedule[placeholder_b].op.axis[0], factor=nthy
        )
        schedule[placeholder_b].bind(axis_xi, thread_y)
        schedule[placeholder_b].bind(axis_xo, block_x)

        tvm.lower(schedule, [placeholder_a, placeholder_b], simple_mode=True)

        # validation
        func = tvm.build(schedule, [placeholder_a, placeholder_b], device, name="warp_reduction")
        a_np = np.random.uniform(size=(m, n)).astype(placeholder_a.dtype)
        b_np = np.zeros((m,), dtype=placeholder_a.dtype)
        buff_a = tvm.nd.array(a_np, dev)
        buff_b = tvm.nd.array(b_np, dev)
        b_np = np.max(a_np, axis=1)
        func(buff_a, buff_b)
        tvm.testing.assert_allclose(buff_b.numpy(), b_np, rtol=1e-3, atol=1e-3)
Example #4
    def check_target(device, m, n):
        dev = tvm.device(device, 0)
        if not tvm.testing.device_enabled(device):
            print("skip because %s is not enabled.." % device)
            return

        # compute
        A = te.placeholder((m, n), name="A")
        k = te.reduce_axis((0, n))
        B = te.compute((m, ), lambda i: te.max(A[i][k], axis=k), name="B")
        s = te.create_schedule(B.op)

        # schedule
        k = s[B].op.reduce_axis[0]
        ko, _ = s[B].split(k, nparts=nthx)
        s[B].bind(ko, thread_x)
        xo, xi = s[B].split(s[B].op.axis[0], factor=nthy)
        s[B].bind(xi, thread_y)
        s[B].bind(xo, block_x)

        tvm.lower(s, [A, B], simple_mode=True)

        # validation
        func = tvm.build(s, [A, B], device, name="warp_reduction")
        a_np = np.random.uniform(size=(m, n)).astype(A.dtype)
        b_np = np.zeros((m, ), dtype=A.dtype)
        a = tvm.nd.array(a_np, dev)
        b = tvm.nd.array(b_np, dev)
        b_np = np.max(a_np, axis=1)
        func(a, b)
        tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-3, atol=1e-3)
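Examples #3 and #4 are the same warp-reduction test with different variable naming; both rely on names defined in the enclosing test function (nthx, nthy, thread_x, thread_y, block_x). A minimal sketch of that surrounding setup, assuming a GPU target such as CUDA:

from tvm import te

def test_warp_reduction():
    # Thread extents and bound axes used by check_target in the snippets above.
    nthx, nthy = 32, 4
    block_x = te.thread_axis("blockIdx.x")
    thread_x = te.thread_axis((0, nthx), "threadIdx.x")
    thread_y = te.thread_axis((0, nthy), "threadIdx.y")

    def check_target(device, m, n):
        ...  # body as in the examples above

    check_target("cuda", m=32, n=256)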
Example #5
def batch_matmul(cfg, x, y, out_shape=None, out_dtype=None):
    """Computes batch matrix multiplication of `x` and `y` when `x` and `y` are
    data in batch. Supports broadcasting in batch dimension.

    Parameters
    ----------
    cfg : ConfigSpace
        Autotvm tuning space config file
    x : tvm.te.Tensor
        3-D with shape [batch, M, K]
    y : tvm.te.Tensor
        3-D with shape [batch, N, K]
    out_shape : tuple or None
        Shape of the outputs
    out_dtype : str or None
        Specifies the output data type (defaults to the dtype of `x`)

    Returns
    -------
    output : tvm.te.Tensor
        3-D with shape [batch, M, N]
    """
    assert len(x.shape) == 3 and len(y.shape) == 3, "only support 3-dim batch_matmul"
    XB, M, XK = get_const_tuple(x.shape)
    YB, N, YK = get_const_tuple(y.shape)
    assert (XB == YB) or (YB == 1) or (XB == 1), "batch dimension doesn't match"
    assert XK == YK, "shapes of x and y are inconsistent"
    B = te.max(XB, YB)
    K = XK
    if out_shape is not None:
        assert out_shape[0] == B, "got invalid output shape"
        assert out_shape[1] == M, "got invalid output shape"
        assert out_shape[2] == N, "got invalid output shape"
    if cfg.is_fallback:
        _default_batch_matmul_config(cfg, M, N, K)

    k = te.reduce_axis((0, K), name="k")
    if out_dtype is None or out_dtype == x.dtype:
        C = te.compute(
            (B, M, N),
            lambda b, i, j: te.sum(
                x[b if XB != 1 else 0, i, k] * y[b if YB != 1 else 0, j, k], axis=k
            ),
            tag="batch_matmul",
        )
    else:
        C = te.compute(
            (B, M, N),
            lambda b, i, j: te.sum(
                x[b if XB != 1 else 0, i, k].astype(out_dtype)
                * y[b if YB != 1 else 0, j, k].astype(out_dtype),
                axis=k,
            ),
            tag="batch_matmul",
        )
    return C
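The out_dtype branch above exists for mixed-precision accumulation. A plain-TE illustration of the same pattern, independent of autotvm (shapes here are illustrative): int8 inputs accumulated into int32.

from tvm import te

BATCH, M, N, K = 2, 4, 4, 8
x = te.placeholder((BATCH, M, K), "int8", name="x")
y = te.placeholder((BATCH, N, K), "int8", name="y")
k = te.reduce_axis((0, K), name="k")

# Cast each operand up before multiplying so the sum accumulates in int32.
c = te.compute(
    (BATCH, M, N),
    lambda b, i, j: te.sum(x[b, i, k].astype("int32") * y[b, j, k].astype("int32"), axis=k),
    name="c",
)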
Example #6
    def _sample(i, c, ph, pw):
        roi = rois[i]
        batch_index = roi[0].astype("int32")
        roi_start_w, roi_start_h, roi_end_w, roi_end_h = roi[1], roi[2], roi[3], roi[4]
        roi_start_h *= spatial_scale
        roi_end_h *= spatial_scale
        roi_start_w *= spatial_scale
        roi_end_w *= spatial_scale

        # force malformed ROIs to be 1x1
        roi_h = tvm.te.max(roi_end_h - roi_start_h, tvm.tir.const(1.0, dtype))
        roi_w = tvm.te.max(roi_end_w - roi_start_w, tvm.tir.const(1.0, dtype))

        bin_h = roi_h / pooled_size_h
        bin_w = roi_w / pooled_size_w

        if sample_ratio > 0:
            roi_bin_grid_h = roi_bin_grid_w = tvm.tir.const(sample_ratio, "int32")
        else:
            roi_bin_grid_h = te.ceil(roi_h / pooled_size_h).astype("int32")
            roi_bin_grid_w = te.ceil(roi_w / pooled_size_w).astype("int32")

        count = roi_bin_grid_h * roi_bin_grid_w
        rh = te.reduce_axis((0, roi_bin_grid_h))
        rw = te.reduce_axis((0, roi_bin_grid_w))
        roi_start_h += ph * bin_h
        roi_start_w += pw * bin_w
        if avg_mode:
            return te.sum(
                _bilinear(
                    batch_index,
                    c,
                    roi_start_h + (rh + 0.5) * bin_h / roi_bin_grid_h,
                    roi_start_w + (rw + 0.5) * bin_w / roi_bin_grid_w,
                ) / count,
                axis=[rh, rw],
            )
        # max mode
        return te.max(
            _bilinear(
                batch_index,
                c,
                roi_start_h + (rh + 0.5) * bin_h / roi_bin_grid_h,
                roi_start_w + (rw + 0.5) * bin_w / roi_bin_grid_w,
            ),
            axis=[rh, rw],
        )
Example #7
 def _compute_intn(dtype, value, *indices):
     assert output_scale is not None and output_zero_point is not None
     const_min = tvm.tir.min_value(dtype)
     const_max = tvm.tir.max_value(dtype)
     # Use indexmod to handle both scalar and per-channel QNN parameters.
     scale_idx = tir.indexmod(indices[axis], topi.shape(output_scale)[0])
     zp_idx = tir.indexmod(indices[axis], topi.shape(output_zero_point)[0])
     return te.max(
         te.min(
             te.round(value[indices] / output_scale[scale_idx]) +
             output_zero_point[zp_idx],
             const_max,
         ),
         const_min,
     )
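The snippet above clamps a requantized value into the representable range of the target integer dtype; output_scale and output_zero_point are per-channel tensors from the enclosing QNN lowering. A reduced, per-tensor sketch of the same clamp (the constants here are purely illustrative):

from tvm import te, tir

dtype = "int8"
scale, zero_point = 0.05, 3                      # illustrative quantization params
value = te.placeholder((16,), "float32", name="value")

quantized = te.compute(
    (16,),
    lambda i: te.max(
        te.min(
            te.round(value[i] / scale) + zero_point,
            tir.max_value(dtype).astype("float32"),
        ),
        tir.min_value(dtype).astype("float32"),
    ).astype(dtype),
    name="quantized",
)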
Example #8
 def stmt_calc(t, n, c, h, w, i, j):
     if trace_mode.mode == 'tvm':
         if pool_type == 'max':
             t['out'][n, c, h, w] = te.max(
                 t['out'][n, c, h, w], t['x'][n, c, h * stride_height + i, w * stride_width + j])
         else:
             t['out'][n, c, h, w] = t['out'][n, c, h, w] \
                                    + t['x'][n, c, h * stride_height + i, w * stride_width + j]
     elif trace_mode.mode == 'tensor_access':
         t['out'][n, c, h, w] = t['x'][n, c, h * stride_height + i, w * stride_width + j]
     else:
         if pool_type == 'max':
             t['out'][n, c, h, w] = max(t['x'][n, c, h, w],
                                        t['x'][n, c, h * stride_height + i, w * stride_width + j])
         else:
             t['out'][n, c, h, w] = t['out'][n, c, h, w] \
                                    + t['x'][n, c, h * stride_height + i, w * stride_width + j]
Example #9
    def compute(n, ho, wo, co, hi, wi, ci):
        # Construct blockized strided maxpool height indices
        h = ho * block_H + hi
        h_contig = h * stride[0] + rh
        h_block_id = h_contig // block_H
        h_block_offset = h_contig % block_H

        # Construct blockized strided maxpool width indices
        w = wo * block_W + wi
        w_contig = w * stride[1] + rw
        w_block_id = w_contig // block_W
        w_block_offset = w_contig % block_W

        return te.max(
            X_packed[n, h_block_id, w_block_id, co, h_block_offset, w_block_offset, ci],
            axis=[rh, rw],
        )
Example #10
    def compute(batch, h_outer, w_outer, c_outer, h_inner, w_inner, c_inner):
        # Construct blockized strided maxpool height indices
        h = h_outer * block_h + h_inner
        h_contig = h * stride[0] + reduce_h
        h_block_id = h_contig // block_h
        h_block_offset = h_contig % block_h

        # Construct blockized strided maxpool width indices
        w_idx = w_outer * block_w + w_inner
        w_contig = w_idx * stride[1] + reduce_w
        w_block_id = w_contig // block_w
        w_block_offset = w_contig % block_w

        return te.max(
            x_packed[batch, h_block_id, w_block_id, c_outer, h_block_offset,
                     w_block_offset, c_inner],
            axis=[reduce_h, reduce_w],
        )
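Examples #9 and #10 are the same blockized compute with different naming; both depend on a packed input, block sizes, strides and reduce axes defined in the enclosing scheduler. A self-contained sketch of that context with illustrative sizes (the output shape below is chosen to stay in bounds):

from tvm import te

block_h, block_w = 8, 8
kernel, stride = (3, 3), (2, 2)
reduce_h = te.reduce_axis((0, kernel[0]), name="reduce_h")
reduce_w = te.reduce_axis((0, kernel[1]), name="reduce_w")
# Input packed as [n, h_block, w_block, c_outer, h_offset, w_offset, c_inner].
x_packed = te.placeholder((1, 8, 8, 2, block_h, block_w, 32), name="x_packed")

def compute(batch, h_outer, w_outer, c_outer, h_inner, w_inner, c_inner):
    h = h_outer * block_h + h_inner
    h_contig = h * stride[0] + reduce_h
    w = w_outer * block_w + w_inner
    w_contig = w * stride[1] + reduce_w
    return te.max(
        x_packed[batch, h_contig // block_h, w_contig // block_w, c_outer,
                 h_contig % block_h, w_contig % block_w, c_inner],
        axis=[reduce_h, reduce_w],
    )

out = te.compute((1, 3, 3, 2, block_h, block_w, 32), compute, name="max_pool_packed")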
Example #11
def batch_matmul(lhs,
                 rhs,
                 transa=False,
                 transb=False,
                 iterative=False,
                 **kwargs):
    """Create an extern op that computes batched matrix multiplication of `lhs` and `rhs`
    with CBLAS. This function serves as an example of how to call external libraries.

    Parameters
    ----------
    lhs: Tensor
        The left matrix operand
    rhs: Tensor
        The right matrix operand
    transa: bool
        Whether to transpose lhs
    transb: bool
        Whether to transpose rhs
    iterative: bool
        Whether to call the iterative CBLAS batch_matmul implementation

    Returns
    -------
    C: Tensor
        The result tensor.
    """
    b = te.max(lhs.shape[0], rhs.shape[0])
    n = lhs.shape[2] if transa else lhs.shape[1]
    m = rhs.shape[1] if transb else rhs.shape[2]
    return te.extern(
        (b, n, m),
        [lhs, rhs],
        lambda ins, outs: tvm.tir.call_packed(
            "tvm.contrib.cblas.batch_matmul"
            if not iterative else "tvm.contrib.cblas.batch_matmul_iterative",
            ins[0],
            ins[1],
            outs[0],
            transa,
            transb,
        ),
        name="C",
        **kwargs,
    )
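A hedged usage sketch for the extern op above, assuming TVM was built with the CBLAS contrib library enabled (otherwise the packed function is unavailable at run time):

import numpy as np
import tvm
from tvm import te

lhs = te.placeholder((4, 16, 8), name="lhs")
rhs = te.placeholder((4, 8, 32), name="rhs")
C = batch_matmul(lhs, rhs)                 # the extern op defined above
s = te.create_schedule(C.op)
f = tvm.build(s, [lhs, rhs, C], "llvm")

a = np.random.rand(4, 16, 8).astype("float32")
b = np.random.rand(4, 8, 32).astype("float32")
c = tvm.nd.empty((4, 16, 32), "float32")
f(tvm.nd.array(a), tvm.nd.array(b), c)
np.testing.assert_allclose(c.numpy(), a @ b, rtol=1e-5)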
Example #12
def max_pool2d_compute(A, out_shape, kernel, stride, dilation):
    """max_pool2d compute"""
    kh, kw = kernel
    rh = te.reduce_axis((0, kh), name="rh")
    rw = te.reduce_axis((0, kw), name="rw")
    ob, oh, ow, oc = out_shape
    if isinstance(ob, int):
        validate_out_shape(out_shape, A.shape, kernel, stride, dilation)

    sh, sw = stride
    dh, dw = dilation

    Max = te.compute(
        out_shape,
        lambda b, h, w, c: te.max(
            A[b, h * sh + dh * rh, w * sw + dw * rw, c].astype(A.dtype),
            axis=[rh, rw],
        ),
        name="max",
    )
    return Max
Example #13
def pool(pool_type, c, nh, nw, kh, kw, ph=0, pw=0, sh=1, sw=1):
    """2D pooling
    
    pool_type: pooling type, 'max' or 'avg'
    c : channels
    nh, nw : input height and width
    kh, kw : kernel height and width
    ph, pw : height and width padding sizes, default 0
    sh, sw : height and width strides, default 1
    """
    # reduction axes
    rkh = te.reduce_axis((0, kh), name='rkh')
    rkw = te.reduce_axis((0, kw), name='rkw')
    # output height and width
    oh = d2ltvm.conv_out_size(nh, kh, ph, sh)
    ow = d2ltvm.conv_out_size(nw, kw, pw, sw)
    # pad X and then compute Y
    X = te.placeholder((c, nh, nw), name='X')

    if pool_type == 'max':
        PaddedX = d2ltvm.padding(X, ph, pw, val=te.min_value(X.dtype)) \
            if ph * pw != 0 else X
        Y = te.compute((c, oh, ow), \
                            lambda c, h, w: \
                            te.max(PaddedX[c, h*sh+rkh, w*sw+rkw], \
                                axis=[rkh, rkw]), \
                            tag="pool_max", name='PoolMax')
    elif pool_type == 'avg':
        PaddedX = d2ltvm.padding(X, ph, pw) if ph * pw != 0 else X
        tsum = te.compute((c, oh, ow), \
                            lambda c, h, w: \
                            te.sum(PaddedX[c, h*sh+rkh, w*sw+rkw], \
                                axis=[rkh, rkw]), \
                            tag="pool_avg1", name='PoolSum')
        Y = te.compute((c, oh, ow), \
                            lambda c, h, w: \
                            tsum[c, h, w] / (kh*kw), \
                            tag='pool_avg2', name='PoolAvg')
    else:
        raise ValueError("Pool type should be 'avg' or 'max'.")
    return X, Y, PaddedX
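A hedged usage sketch for the pool definition above, assuming the d2ltvm helper package (conv_out_size, padding) from the Dive into Deep Learning Compiler project is importable:

import numpy as np
import tvm
from tvm import te

# 4 channels, 12x12 input, 3x3 max pooling with padding 1 and stride 2 -> 6x6 output.
X, Y, PaddedX = pool('max', c=4, nh=12, nw=12, kh=3, kw=3, ph=1, pw=1, sh=2, sw=2)
sch = te.create_schedule(Y.op)
mod = tvm.build(sch, [X, Y])

x_np = np.random.rand(4, 12, 12).astype('float32')
y_nd = tvm.nd.array(np.empty((4, 6, 6), dtype='float32'))
mod(tvm.nd.array(x_np), y_nd)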
Example #14
def batch_matmul(x, y, oshape=None):
    """Computes batch matrix multiplication of `x` and `y` when `x` and `y` are
    data in batch. Supports broadcasting for batch dimension.

    Parameters
    ----------
    x : tvm.te.Tensor
        3-D with shape [batch, M, K]

    y : tvm.te.Tensor
        3-D with shape [batch, N, K]

    oshape : List[Optional]
        Explicit intended output shape of the computation. Can be useful in cases
        with dynamic input shapes.

    Returns
    -------
    output : tvm.te.Tensor
        3-D with shape [batch, M, N]
    """
    assert len(x.shape) == 3 and len(y.shape) == 3, "only support 3-dim batch_matmul"
    x_shape = get_const_tuple(x.shape)
    y_shape = get_const_tuple(y.shape)
    XB = x_shape[0]
    YB = y_shape[0]
    _, M, K = x.shape
    k = te.reduce_axis((0, K), name="k")
    if oshape is None:
        assert XB == YB or XB == 1 or YB == 1, "batch dimension doesn't match"
        assert x_shape[2] == y_shape[2], "shapes of x and y are inconsistent"
        batch = te.max(XB, YB)
        N = y.shape[1]
        oshape = (batch, M, N)
    return te.compute(
        oshape,
        lambda b, i, j: te.sum(x[b if XB != 1 else 0, i, k] * y[b if YB != 1 else 0, j, k], axis=k),
        tag="batch_matmul",
    )
Example #15
def process_post_ops(layer_idx,
                     Input,
                     Bias,
                     post_op,
                     pack=False,
                     out_dtype="float32"):
    if pack:
        _, _, _, _, OC_vec = Input.shape
        BiasAdd = te.compute(
            Input.shape,
            lambda n, c_chunk, h, w, c_vec: Input[
                n, c_chunk, h, w, c_vec] + Bias[c_chunk * OC_vec + c_vec],
            name='FusedConv2D_BiasAdd_{}'.format(layer_idx),
            tag='biasadd')
    else:
        BiasAdd = te.compute(Input.shape,
                             lambda n, h, w, c: Input[n, h, w, c] + Bias[c],
                             name='FusedConv2D_BiasAdd_{}'.format(layer_idx),
                             tag='biasadd')

    # TODO: Recover this
    # if block_input is not None:
    #     inputs = block_input if isinstance(block_input, list) else [block_input]
    #     First = inputs[0] # TODO: Support multiple branches addition later
    #     Last = self.stages[-1][-1] # Output if post_op is None, BiasAdd if it's not None
    #     assert sorted(get_const_tuple(First.shape)) == sorted(get_const_tuple(Last.shape)), '{} is not the same as {}'.format(First.shape, Last.shape)
    #     if self.pack:
    #         Output = te.compute(self.output_shape,
    #                             lambda n, c_chunk, h, w, c_vec: (First[n, c_chunk, h, w, c_vec] + (Last[n, c_chunk, h, w, c_vec])),
    #                             name='ElementwiseAddOutput_{}'.format(self.layer_idx),
    #                             tag='elem_{}'.format(tag_suffix))
    #     else:
    #         Output = te.compute(self.output_shape,
    #                             lambda n, h, w, c: (First[n, h, w, c] + (Last[n, h, w, c])),
    #                             name='ElementwiseAddOutput_{}'.format(self.layer_idx),
    #                             tag='elem_{}'.format(tag_suffix))
    #     self.stages[-1].append(Output)
    # Last = self.stages[-1][-1] # BiasAdd if it's not a block, Output if it's a block

    # Else: only bias_add
    Last = BiasAdd
    if post_op == 'relu':
        Last = te.compute(
            Last.shape,
            lambda *i: te.max(Last(*i), tvm.runtime.const(0, Last.dtype)),
            name='FusedConv2D_ReLU_{}'.format(layer_idx),
            tag='relu')
    elif post_op == 'sigmoid':
        Last = te.compute(Last.shape,
                          lambda *i: te.sigmoid(Last(*i)),
                          name='FusedConv2D_Sigmoid_{}'.format(layer_idx),
                          tag='sigmoid')
    elif post_op == 'relu6':
        Last = te.compute(
            Last.shape,
            lambda *i: te.min(
                te.max(Last(*i), tvm.runtime.const(0, Last.dtype)),
                tvm.runtime.const(6, Last.dtype)),
            name='FusedConv2D_ReLU6_{}'.format(layer_idx),
            tag='relu6')
    return Last
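The relu6 branch above is just a clamp between 0 and 6; a standalone TE sketch of the same pattern on an illustrative NHWC tensor:

import tvm
from tvm import te

data = te.placeholder((1, 8, 8, 16), name="data")
relu6 = te.compute(
    data.shape,
    lambda *i: te.min(
        te.max(data(*i), tvm.tir.const(0, data.dtype)),
        tvm.tir.const(6, data.dtype),
    ),
    name="relu6",
)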
Example #16
def pooling_compute(
    ifm: te.Tensor,
    lut: te.Tensor,
    pooling_type: str,
    ifm_scale: float,
    ifm_zero_point: int,
    ofm_scale: float,
    ofm_zero_point: int,
    pool_shape: Tuple[int, int],
    ofm_channels: int,
    strides: Tuple[int, int],
    padding: Tuple[int, int, int, int],
    activation: str,
    clip_min: int,
    clip_max: int,
    rounding_mode: str,
    upscale: str,
    ifm_layout: str,
    ofm_layout: str,
) -> te.Tensor:
    """A compute operator representing the capabilities of pooling for the NPU.

    Parameters
    ----------
    ifm : te.Tensor
        The Input Feature Map tensor (IFM).
    lut : te.Tensor
        The look-up table of values to use if activation = "LUT".
    pooling_type: str
        The type of the pooling. "AVG" - average pool,   "MAX" - max pool.
    ifm_scale : float
        The quantization scale for the Input Feature Map tensor.
    ifm_zero_point : int
        The quantization zero point for the Input Feature Map tensor.
    ofm_scale : float
        The quantization scale for the Output Feature Map tensor.
    ofm_zero_point : int
        The quantization zero point for the Output Feature Map tensor.
    pool_shape : Tuple[int, int]
        The 2 dimensional pool shape as (pool_shape_height, pool_shape_width).
    ofm_channels : int
        The number of the Output Feature Map channels
    strides : Tuple[int, int]
        The 2 dimensional strides as (stride_height, stride_width).
    padding : Tuple[int, int, int, int]
        The 4 dimensional padding as (pad_top, pad_left, pad_bottom, pad_right).
    activation : str
        The activation function to use.
            "NONE" - no activation function.
            "CLIP" - clip the output between clip_min and clip_max.
            "TANH" - tanh activation function.
            "SIGMOID" - sigmoid activation function.
            "LUT" - use a look-up table to perform the activation function.
    clip_min : int
        The minimum clipping value if activation = "CLIP".
    clip_max : int
        The maximum clipping value if activation = "CLIP".
    rounding_mode : str
        The rounding mode to apply to the Output Feature Map tensor.
            "TFL" - Tensorflow Lite rounding scheme.
            "TRUNCATE" - Truncate towards zero.
            "NATURAL" - Round to nearest value, with x.5 rounded up towards +infinity.
    upscale : str
        The 2x2 upscaling mode to apply to the Input Feature Map tensor.
            "NONE" - no upscaling.
            "NEAREST" - upscale using nearest neighbour.
            "ZEROS" - upscale using zeros.
    ifm_layout : str
        The layout of the Input Feature Map tensor. Can be "NHWC" or "NHCWB16".
    ofm_layout : str
        The layout of the Output Feature Map tensor. Can be "NHWC" or "NHCWB16".

    Returns
    -------
    te.Tensor
        The OFM tensor.
    """
    stride_h, stride_w = strides
    pool_shape_h, pool_shape_w = pool_shape

    # Compute operation for the IFM DMA pipeline
    dmaed_ifm = dma_ifm_compute(ifm, ifm_layout, ifm_zero_point, ifm_scale,
                                ofm_channels, padding)

    # Pooling compute operation
    ofm_height = (dmaed_ifm.shape[1] - pool_shape_h) // stride_h + 1
    ofm_width = (dmaed_ifm.shape[2] - pool_shape_w) // stride_w + 1
    rh = te.reduce_axis((0, pool_shape_h), name="ry")
    rw = te.reduce_axis((0, pool_shape_w), name="rx")

    pooling_attrs = {
        "op": "ethosu_pooling",
        "pooling_type": pooling_type,
        "stride_h": stride_h,
        "stride_w": stride_w,
        "activation": activation,
        "clip_min": clip_min,
        "clip_max": clip_max,
        "rounding_mode": rounding_mode,
        "upscale": upscale,
    }

    # This is a trick to insert the LUT tensor into the TE graph if LUT is present
    lut_expr = (lut[0] + lut[255]).astype(ifm.dtype) if activation in ("TANH", "LUT") else 0

    # Add the LUT tensor to the attributes to be able to later tell which tensor is the LUT
    if activation in ("TANH", "LUT"):
        pooling_attrs["lut"] = lut

    pooling = te.compute(
        (1, ofm_height, ofm_width, ofm_channels),
        lambda nn, hh, ww, cc: te.max(
            (dmaed_ifm(nn, hh * stride_h + rh, ww * stride_w + rw, cc) +
             lut_expr).astype(ifm.dtype),
            axis=[rh, rw],
        ),
        name="ethosu_pooling",
        attrs=pooling_attrs,
    )

    # Compute operation for the OFM DMA pipeline
    return dma_ofm_compute(pooling, ofm_layout, ofm_zero_point, ofm_scale,
                           ofm_channels)
Example #17
def dilation2d_nhwc(input, filter, stride, padding, dilations, out_dtype=None):
    """Morphological 2d dilation NHWC layout.

    Parameters
    ----------
    input : tvm.Tensor
        4-D with shape [batch, in_height, in_width, in_channel]

    filter : tvm.Tensor
        3-D with shape [filter_height, filter_width, in_channel]

    stride : int or a list/tuple of two ints
        Stride size, or [stride_height, stride_width]

    padding : int
        Padding size

    dilations: int or a list/tuple of two ints
        dilation size, or [dilation_height, dilation_width]

    out_dtype : Optional[str]
        Specifies the output data type.

    Returns
    -------
    Output : tvm.Tensor
        4-D with shape [batch, out_height, out_width, in_channel]
    """
    if out_dtype is None:
        out_dtype = input.dtype
    assert isinstance(stride, int) or len(stride) == 2
    assert isinstance(dilations, int) or len(dilations) == 2
    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    if isinstance(dilations, int):
        dilation_h = dilation_w = dilations
    else:
        dilation_h, dilation_w = dilations

    batch, in_height, in_width, in_channel = input.shape
    kernel_h, kernel_w, channel = filter.shape
    assert in_channel.value == channel.value, \
        "For Dilation2D input and filter channels should be same."

    # compute the output shape
    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w))

    out_height = simplify(
        (in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
    out_width = simplify(
        (in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)
    pad_before = [0, pad_top, pad_left, 0]
    pad_after = [0, pad_down, pad_right, 0]
    padded_input = pad(input, pad_before, pad_after, name="padded_input")
    ry = te.reduce_axis((0, kernel_h), name='ry')
    rx = te.reduce_axis((0, kernel_w), name='rx')

    return te.compute(
        (batch, out_height, out_width, in_channel),
        lambda nn, yy, xx, ff: te.max(
            padded_input[
                nn, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w, ff
            ].astype(out_dtype)
            + filter[ry, rx, ff].astype(out_dtype),
            axis=[ry, rx],
        ),
        tag="dilation2d_nhwc",
    )
Example #18
def test_basic_operation():
    np.random.seed(0)
    shape = (10, 10)
    x = te.var("x", dtype='float32')
    k = te.reduce_axis((0, 10), name="k")
    l = te.reduce_axis((0, 10), name="l")
    A0 = te.placeholder(shape, name='A0')
    A1 = te.placeholder(shape, name='A1')
    zeros = np.zeros(shape)

    B = te.compute(shape, lambda i, j: A0[i, j], name='B')
    check_grad(B, [A0])

    B = te.compute(shape, lambda i, j: A0[i, j] + A1[i, j], name='B')
    check_grad(B, [A0, A1])

    B = te.compute(shape, lambda i, j: A0[i, j] + A0[j, i], name='B')
    check_grad(B, A0)

    B = te.compute(shape, lambda i, j: te.floor(A0[i, j]), name='B')
    check_grad(B, A0, desired_grads=[zeros])

    B = te.compute(shape, lambda i, j: te.ceil(A0[i, j]), name='B')
    check_grad(B, A0, desired_grads=[zeros])

    B = te.compute(shape, lambda i, j: te.trunc(A0[i, j]), name='B')
    check_grad(B, A0, desired_grads=[zeros])

    B = te.compute(shape, lambda i, j: te.round(A0[i, j]), name='B')
    check_grad(B, A0, desired_grads=[zeros])

    B = te.compute(shape, lambda i, j: A0[i, j] + te.exp(A0[j, i]), name='B')
    check_grad(B, A0)

    B = te.compute(
        shape,
        lambda i, j: te.log(0.1 + te.abs(A0[i, j] + te.exp(A0[j, i]))),
        name='B')
    check_grad(B, A0)

    B = te.compute(shape,
                   lambda i, j: te.sigmoid(A0[i, j] * A0[i, j] * A0[j, i]),
                   name='B')
    check_grad(B, A0)

    B = te.compute(shape,
                   lambda i, j: te.tanh(A0[i, j] * A0[i, j] * A0[j, i]),
                   name='B')
    check_grad(B, A0)

    B = te.compute(shape,
                   lambda i, j: te.sqrt(A0[i, j] * A0[i, j] * A0[j, i]),
                   name='B')
    check_grad(B, A0, data_range=(0.1, 10))

    B = te.compute(shape,
                   lambda i, j: te.power(te.abs(A0[i, j]), A0[j, i]),
                   name='B')
    check_grad(B, A0, data_range=(-4, 4))

    B = te.compute(shape, lambda i, j: A0[i, j] * A0[j, i], name='B')
    check_grad(B, A0)

    B = te.compute((10, ),
                   lambda i: te.sum(A0[i, k] * A0[k, i], axis=k),
                   name='B')
    check_grad(B, A0)

    B = te.compute(shape,
                   lambda i, j: te.sum(A0[i, k] * A0[k, i] + 5, axis=k),
                   name='B')
    check_grad(B, A0)

    B = te.compute(shape,
                   lambda i, j: te.max(A0[i, k] * A0[k, j] + 5, axis=k),
                   name='B')
    check_grad(B, A0)

    B = te.compute(shape,
                   lambda i, j: A0[i, j] * (A1[j, i] + A0[j, i]),
                   name='B')
    check_grad(B, [A0, A1])

    B = te.compute(shape,
                   lambda i, j: te.sum(
                       A0[k, k] - A0[te.min(j + k, 9), j] * A0[i, k], axis=k),
                   name='B')
    check_grad(B, A0)

    def fcombine(x, y):
        return x * y

    def fidentity(t0):
        return tvm.tir.const(1, t0)

    prod = te.comm_reducer(fcombine, fidentity, name='prod')
    B = te.compute((10, 10),
                   lambda i, j: prod(A0[i, k] + A0[k, i], axis=k),
                   name='B')
    check_grad(B, A0)

    X = te.placeholder((10, ), name='X')
    A = te.compute((10, ), lambda i: X[i] + X[9 - i])
    B = te.compute((10, ), lambda i: X[i] * X[9 - i])
    Y = topi.tensordot(A, B, 1)
    check_grad(Y, X)
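check_grad is defined elsewhere in this test file; below is a minimal sketch of roughly what it does, assuming te.gradient (TVM's tensor-expression autodiff) is available. The numerical comparison step is only indicated by a comment, and the function name is illustrative.

import tvm
from tvm import te

def check_grad_sketch(out, inputs, data_range=(-10, 10), desired_grads=None):
    inputs = inputs if isinstance(inputs, list) else [inputs]
    grads = te.gradient(out, inputs)                      # reverse-mode AD on the TE graph
    sched = te.create_schedule([out.op] + [g.op for g in grads])
    mod = tvm.build(sched, inputs + [out] + list(grads))
    # ... run mod on random data drawn from data_range and compare the computed
    # gradients against desired_grads or a numerical finite-difference estimate.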
Example #19
def make_matrix_softmax_cross_entropy(shape, tgt, tgt_host, func_name,
                                      dtype="float32"):
    """TODO: Your code here"""
    """Hint: output shape should be (1,)"""
    A_ = te.placeholder(shape, dtype=dtype, name="A_")
    A = te.placeholder(shape, dtype=dtype, name="A")

    # designed by myself
    k = te.reduce_axis((0, A.shape[1]), name="k")
    A_max = te.compute((A.shape[0],), lambda i: te.max(A[i, k], axis=k))
    A_ex = te.compute(shape, lambda i, j: te.exp(A[i, j] - A_max[i]))
    k1 = te.reduce_axis((0, A.shape[1]), name="k1")
    A_ex_sum = te.compute((A.shape[0],), lambda i: te.sum(A_ex[i, k1], axis=k1))
    A_logsoftmax = te.compute(shape, lambda i, j: te.log(A_ex[i, j] / A_ex_sum[i]))

    k2 = te.reduce_axis((0, shape[1]), name="k2")
    A_logsoftmax_sum = te.compute(
        (shape[0],), lambda i: te.sum(A_logsoftmax[i, k2] * A_[i, k2], axis=k2))
    k3 = te.reduce_axis((0, shape[0]), name="k3")
    B = te.compute((1,), lambda i: te.sum(-A_logsoftmax_sum[k3], axis=k3))
    B1 = te.compute((1,), lambda i: B[i] / shape[0])

    s = te.create_schedule(B1.op)
    if tgt == "cuda":
        # NOTE: unclear why this CUDA schedule does not work as intended.

        num_thread = 64
        block_x = te.thread_axis("blockIdx.x")
        thread_x = te.thread_axis((0, num_thread), "threadIdx.x")

        s[A_ex].bind(A_ex.op.axis[0], block_x)
        s[A_max].bind(A_max.op.axis[0], block_x)

        k_ex_sum = A_ex_sum.op.reduce_axis[0]
        ko, ki = s[A_ex_sum].split(k_ex_sum, factor=num_thread)
        EF = s.rfactor(A_ex_sum, ki)
        s[A_ex_sum].bind(s[A_ex_sum].op.axis[0], block_x)
        s[A_ex_sum].bind(s[A_ex_sum].op.reduce_axis[0], thread_x)
        s[EF].compute_at(s[A_ex_sum], s[A_ex_sum].op.reduce_axis[0])
        s[A_ex_sum].set_store_predicate(thread_x.var.equal(0))

        tx, xi = s[A_logsoftmax].split(A_logsoftmax.op.axis[1], nparts=num_thread)
        s[A_logsoftmax].bind(A_logsoftmax.op.axis[0], block_x)
        s[A_logsoftmax].bind(tx, thread_x)

        k_logsoftmax_sum = A_logsoftmax_sum.op.reduce_axis[0]
        klso, klsi = s[A_logsoftmax_sum].split(k_logsoftmax_sum, factor=num_thread)
        lsEF = s.rfactor(A_logsoftmax_sum, klsi)
        s[A_logsoftmax_sum].bind(s[A_logsoftmax_sum].op.axis[0], block_x)
        s[A_logsoftmax_sum].bind(s[A_logsoftmax_sum].op.reduce_axis[0], thread_x)
        s[lsEF].compute_at(s[A_logsoftmax_sum], s[A_logsoftmax_sum].op.reduce_axis[0])
        s[A_logsoftmax_sum].set_store_predicate(thread_x.var.equal(0))

        k_B = B.op.reduce_axis[0]
        kbo, kbi = s[B].split(k_B, factor=num_thread)
        bEF = s.rfactor(B, kbi)
        s[B].bind(s[B].op.reduce_axis[0], thread_x)
        s[bEF].compute_at(s[B], s[B].op.reduce_axis[0])
        s[B].set_store_predicate(block_x.var.equal(0))

        s[B1].set_store_predicate(block_x.var.equal(0))


        print(tvm.lower(s, [A, A_, B1], simple_mode=True))

    f = tvm.build(s, [A, A_, B1], tgt, tgt_host, name=func_name)
    return f
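A hedged usage sketch of the builder above on the CPU ("llvm") path, checked against a straightforward NumPy reference for the mean softmax cross-entropy:

import numpy as np
import tvm

shape = (64, 10)
f = make_matrix_softmax_cross_entropy(shape, "llvm", "llvm", "softmax_xent")

logits = np.random.randn(*shape).astype("float32")
labels = np.eye(shape[1], dtype="float32")[np.random.randint(0, shape[1], size=shape[0])]
out = tvm.nd.array(np.zeros((1,), dtype="float32"))
f(tvm.nd.array(logits), tvm.nd.array(labels), out)

# NumPy reference: mean over rows of -sum(labels * log_softmax(logits)).
m = logits - logits.max(axis=1, keepdims=True)
log_softmax = m - np.log(np.exp(m).sum(axis=1, keepdims=True))
ref = -(labels * log_softmax).sum(axis=1).mean()
np.testing.assert_allclose(out.numpy()[0], ref, rtol=1e-4)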
Example #20
def dilation2d_nchw(input, filter, stride, padding, dilations, out_dtype=None):
    """Morphological dilation operator in NCHW layout.

    Parameters
    ----------
    input : tvm.te.Tensor
        4-D with shape [batch, in_channel, in_height, in_width]

    filter : tvm.te.Tensor
        3-D with shape [ in_channel, filter_height, filter_width]

    stride : int or a list/tuple of two ints
        Stride size, or [stride_height, stride_width]

    padding : int or str
        Padding size

    dilations: int or a list/tuple of two ints
        dilation size, or [dilation_height, dilation_width]

    out_dtype : Optional[str]
        Specifies the output data type.

    Returns
    -------
    Output : tvm.te.Tensor
        4-D with shape [batch, in_channel, out_height, out_width]
    """
    if out_dtype is None:
        out_dtype = input.dtype
    assert isinstance(stride, int) or len(stride) == 2
    assert isinstance(dilations, int) or len(dilations) == 2
    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    if isinstance(dilations, int):
        dilation_h = dilation_w = dilations
    else:
        dilation_h, dilation_w = dilations

    batch, in_channel, in_height, in_width = input.shape
    channel, kernel_h, kernel_w = filter.shape
    assert (in_channel.value == channel.value
            ), "For Dilation2D input and filter channels should be same."

    # compute the output shape
    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w))

    out_height = simplify(
        (in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
    out_width = simplify(
        (in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)
    # compute graph
    pad_before = [0, 0, pad_top, pad_left]
    pad_after = [0, 0, pad_down, pad_right]
    temp = pad(input, pad_before, pad_after, name="pad_temp")
    ry = te.reduce_axis((0, kernel_h), name="ry")
    rx = te.reduce_axis((0, kernel_w), name="rx")

    return te.compute(
        (batch, in_channel, out_height, out_width),
        lambda nn, ff, yy, xx: te.max(
            temp[
                nn, ff, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w
            ].astype(out_dtype)
            + filter[ff, ry, rx].astype(out_dtype),
            axis=[ry, rx],
        ),
        tag="dilation2d_nchw",
    )
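A hedged sketch of lowering the compute above with concrete shapes; it assumes the helpers it uses (get_pad_tuple, simplify, pad) are in scope, as they are in topi where this operator lives:

import tvm
from tvm import te

data = te.placeholder((1, 3, 32, 32), name="data")
kern = te.placeholder((3, 3, 3), name="kern")      # [in_channel, kh, kw]
out = dilation2d_nchw(data, kern, stride=1, padding=1, dilations=1)

s = te.create_schedule(out.op)
print(tvm.lower(s, [data, kern, out], simple_mode=True))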
Example #21
def batch_matmul(
    tensor_a,
    tensor_b,
    oshape=None,
    out_dtype=None,
    transpose_a=False,
    transpose_b=True,
    auto_scheduler_rewritten_layout="",
    meta_schedule_original_shape=None,
):
    """Compute batch matrix multiplication of `tensor_a` and `tensor_b`.

    Both `tensor_a` and `tensor_b` can be transposed. For legacy reason, we use NT format
    (transpose_a=False, transpose_b=True) by default.

    Parameters
    ----------
    tensor_a : tvm.te.Tensor
        3-D with shape [batch, M, K] or [batch, K, M].

    tensor_b : tvm.te.Tensor
        3-D with shape [batch, K, N] or [batch, N, K].

    oshape : List[Optional]
        Explicit intended output shape of the computation. Can be useful in cases
        with dynamic input shapes.

    out_dtype : Optional[str]
        Specifies the output data type for mixed precision batch matmul.

    transpose_a : Optional[bool] = False
        Whether the first tensor is in transposed format.

    transpose_b : Optional[bool] = True
        Whether the second tensor is in transposed format.

    auto_scheduler_rewritten_layout: Optional[str] = ""
        The layout after auto-scheduler's layout rewrite pass.

    meta_schedule_original_shape: Optional[List[PrimExpr]] = None
        The original shape of the tensor

    Returns
    -------
    output : tvm.te.Tensor
        3-D with shape [batch, M, N]
    """
    assert len(tensor_a.shape) == 3, "tensor_a only support 3-dim"
    if transpose_a:
        XB, XK, XI = get_const_tuple(tensor_a.shape)
    else:
        XB, XI, XK = get_const_tuple(tensor_a.shape)
    if auto_scheduler_rewritten_layout:
        # Infer shape for the rewritten layout
        YB, YK, YJ = auto_scheduler.get_shape_from_rewritten_layout(
            auto_scheduler_rewritten_layout, ["b", "k", "j"])
        auto_scheduler.remove_index_check(tensor_b)
    elif meta_schedule_original_shape:
        auto_scheduler.rewrite_tensor_shape(tensor_b,
                                            meta_schedule_original_shape)
        if transpose_b:
            YB, YJ, YK = get_const_tuple(tensor_b.shape)
        else:
            YB, YK, YJ = get_const_tuple(tensor_b.shape)
    else:
        assert len(tensor_b.shape) == 3, "tensor_b only support 3-dim"
        if transpose_b:
            YB, YJ, YK = get_const_tuple(tensor_b.shape)
        else:
            YB, YK, YJ = get_const_tuple(tensor_b.shape)

    assert XK == YK or isinstance(YK, tvm.tir.expr.Var), "shapes of x and y are inconsistent"
    k = te.reduce_axis((0, XK), name="k")
    if oshape is None:
        assert XB == YB or XB == 1 or YB == 1, "batch dimension doesn't match"
        batch = (tvm.tir.expr.SizeVar("batch", "int32")
                 if isinstance(XB, tvm.tir.expr.Var)
                 or isinstance(YB, tvm.tir.expr.Var) else te.max(XB, YB))
        oshape = (batch, XI, YJ)
    if out_dtype is None:
        out_dtype = tensor_a.dtype
        if tensor_a.dtype != tensor_b.dtype:
            logger.warning(
                "tensor_a has different data type with tensor_b: %s, %s",
                tensor_a.dtype,
                tensor_b.dtype,
            )

    if (transpose_a, transpose_b) == (True, True):
        compute_lambda = lambda b, i, j: te.sum(
            tensor_a[b if XB != 1 else 0, k, i].astype(out_dtype)
            * tensor_b[b if YB != 1 else 0, j, k].astype(out_dtype),
            axis=k,
        )
        compute_name = "T_batch_matmul_TT"
    elif (transpose_a, transpose_b) == (True, False):
        compute_lambda = lambda b, i, j: te.sum(
            tensor_a[b if XB != 1 else 0, k, i].astype(out_dtype)
            * tensor_b[b if YB != 1 else 0, k, j].astype(out_dtype),
            axis=k,
        )
        compute_name = "T_batch_matmul_TN"
    elif (transpose_a, transpose_b) == (False, True):
        compute_lambda = lambda b, i, j: te.sum(
            tensor_a[b if XB != 1 else 0, i, k].astype(out_dtype)
            * tensor_b[b if YB != 1 else 0, j, k].astype(out_dtype),
            axis=k,
        )
        compute_name = "T_batch_matmul_NT"
    else:  # (transpose_a, transpose_b) == (False, False)
        compute_lambda = lambda b, i, j: te.sum(
            tensor_a[b if XB != 1 else 0, i, k].astype(out_dtype)
            * tensor_b[b if YB != 1 else 0, k, j].astype(out_dtype),
            axis=k,
        )
        compute_name = "T_batch_matmul_NN"

    output = te.compute(
        oshape,
        compute_lambda,
        name=compute_name,
        tag="batch_matmul",
        attrs={"layout_free_placeholders": [tensor_b]},
    )
    if auto_scheduler_rewritten_layout:
        output = auto_scheduler.rewrite_compute_body(
            output, auto_scheduler_rewritten_layout)

    return output
Example #22
 def _clip_coordinates(x, size):
     return te.min(te.max(x, 0), size - 1)
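_clip_coordinates implements the "border" rule from grid_sample-style sampling; a tiny standalone illustration that clamps a shifted index into [0, size - 1]:

from tvm import te

size = 16
data = te.placeholder((size,), name="data")

def clip_coordinates(x, size):
    return te.min(te.max(x, 0), size - 1)

# Reads past either end are clamped to the first/last element.
clamped = te.compute((size + 8,), lambda i: data[clip_coordinates(i - 4, size)], name="clamped")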