Example #1
def compute_depthwise_conv2d_NHWC_HWOI(Input,
                                       Filter,
                                       stride,
                                       padding,
                                       dilation,
                                       out_dtype=None,
                                       args={}):
    """Depthwise convolution operator in NCHWc layout. """
    if out_dtype is None:
        out_dtype = Input.dtype
    assert isinstance(stride, int) or len(stride) == 2
    assert isinstance(dilation, int) or len(dilation) == 2

    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    batch, in_height, in_width, channels = Input.shape
    kernel_h, kernel_w, _, _ = Filter.shape

    # compute the output shape
    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
    pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w))

    out_height_orig = out_height = simplify(
        (in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
    out_width_orig = out_width = simplify(
        (in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)

    channel_block = 4
    channel_chunk = channels // channel_block
    num_filter_chunk = 1

    # compute:
    Input = te.compute(
        [batch, in_height, in_width, channel_chunk, channel_block],
        lambda nn, yy, xx, icc, icb: Input[nn, yy, xx, icc * 4 + icb],
        name="input_pack",
        tag="input_pack",
    )
    Filter = te.compute(
        [kernel_h, kernel_w, channel_chunk, num_filter_chunk, channel_block],
        lambda kh, kw, ifc, nfc, cb: Filter[kh, kw, ifc * 4 + cb, nfc],
        name="filter_pack",
        tag="filter_pack",
    )

    # Can the output shape be divided by 2 or even 4?
    # If it cannot, extend it to help the later split.
    # Theoretically there should be additional padding for the inputs, but that will be
    # optimized by cache_read InferBound. We must pad here to produce exactly the tensor
    # required for the calculation of the original output size, and no more! Otherwise the
    # intermediate tensor might be allocated with smaller sizes while the compute tries to
    # fill the expanded one - a data discrepancy as a result.
    # In the case of textures it is not a problem to provide a texture of smaller size, because
    # 1. It does not matter which values are used for the extra calculations - they are
    #    needed only for better utilization of the GPU, to fit the working groups
    # 2. When we request a pixel out of bounds, the texture handles this correctly. As
    #    mentioned above, the value itself is not important
    if out_height % 2 != 0:
        out_height += 1
    if out_width % 2 != 0:
        out_width += 1

    if out_height % 4 != 0:
        out_height += 2
    if out_width % 4 != 0:
        out_width += 2

    # compute graph
    pad_before = [0, pad_top, pad_left, 0, 0]
    pad_after = [0, pad_down, pad_right, 0, 0]
    # calculation of real used input size:
    input_latest_w = (out_width_orig - 1) * stride_w + (kernel_w - 1) * dilation_w + 1
    input_latest_h = (out_height_orig - 1) * stride_h + (kernel_h - 1) * dilation_h + 1
    # NHWC: height is axis 1 and width is axis 2 of the packed tensor
    if input_latest_w < in_width + pad_before[2] + pad_after[2]:
        pad_after[2] -= in_width + pad_before[2] + pad_after[2] - input_latest_w
    if input_latest_h < in_height + pad_before[1] + pad_after[1]:
        pad_after[1] -= in_height + pad_before[1] + pad_after[1] - input_latest_h

    temp = nn.pad(Input, pad_before, pad_after, name="pad_temp")

    ry = te.reduce_axis((0, kernel_h), name="ry")
    rx = te.reduce_axis((0, kernel_w), name="rx")

    conv = te.compute(
        (batch, out_height, out_width, channel_chunk, channel_block),
        lambda nn, yy, xx, ffc, ffb: te.sum(
            (temp[nn, yy * stride_h + ry * dilation_h, xx * stride_w + rx *
                  dilation_w, ffc, ffb] * Filter[ry, rx, ffc, 0, ffb]).astype(
                      args["accumulator"]),
            axis=[ry, rx],
        ),
        tag="depthwise_conv2d_nhwc",
    )

    dummy_cast = te.compute(
        (batch, out_height_orig, out_width_orig, channel_chunk, channel_block),
        lambda n, y, x, fc, fb: conv[n, y, x, fc, fb].astype(out_dtype),
        tag="dummy_cast")
    return te.compute((batch, out_height_orig, out_width_orig, channels),
                      lambda n, y, x, c: dummy_cast[n, y, x, c // 4, c % 4],
                      tag="cast_from_acc" + args["accumulator"][-2:])
Example #2
def compute_conv2d_NCHWc_KCRSk(Input,
                               Filter,
                               stride,
                               padding,
                               dilation,
                               out_dtype=None,
                               args={}):
    """Convolution operator in NCHWc layout. """

    if out_dtype is None:
        out_dtype = Input.dtype
    assert isinstance(stride, int) or len(stride) == 2
    assert isinstance(dilation, int) or len(dilation) == 2
    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    batch, in_channel_chunk, in_height, in_width, in_channel_block = Input.shape
    num_filter_chunk, channel, kernel_h, kernel_w, num_filter_block = Filter.shape
    # compute the output shape
    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
    pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w))

    out_height_orig = out_height = simplify(
        (in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
    out_width_orig = out_width = simplify(
        (in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)
    # Can the output shape be divided by 2 or even 4?
    # If it cannot, extend it to help the later split.
    # Theoretically there should be additional padding for the inputs, but that will be
    # optimized by cache_read InferBound. We must pad here to produce exactly the tensor
    # required for the calculation of the original output size, and no more! Otherwise the
    # intermediate tensor might be allocated with smaller sizes while the compute tries to
    # fill the expanded one - a data discrepancy as a result.
    # In the case of textures it is not a problem to provide a texture of smaller size, because
    # 1. It does not matter which values are used for the extra calculations - they are
    #    needed only for better utilization of the GPU, to fit the working groups
    # 2. When we request a pixel out of bounds, the texture handles this correctly. As
    #    mentioned above, the value itself is not important
    if out_height % 2 != 0:
        out_height += 1
    if out_width % 2 != 0:
        out_width += 1

    if out_height % 4 != 0:
        out_height += 2
    if out_width % 4 != 0:
        out_width += 2
    # compute graph
    pad_before = [0, 0, pad_top, pad_left, 0]
    pad_after = [0, 0, pad_down, pad_right, 0]
    # calculation of real used input size:
    input_latest_w = (out_width_orig - 1) * stride_w + (kernel_w - 1) * dilation_w + 1
    input_latest_h = (out_height_orig - 1) * stride_h + (kernel_h - 1) * dilation_h + 1
    if input_latest_w < in_width + pad_before[3] + pad_after[3]:
        pad_after[3] -= in_width + pad_before[3] + pad_after[3] - input_latest_w
    if input_latest_h < in_height + pad_before[2] + pad_after[2]:
        pad_after[2] -= in_height + pad_before[2] + pad_after[2] - input_latest_h

    temp = nn.pad(Input, pad_before, pad_after, name="pad_temp")

    rcc = te.reduce_axis((0, in_channel_chunk), name="rc")
    rcb = te.reduce_axis((0, in_channel_block), name="rc")
    ry = te.reduce_axis((0, kernel_h), name="ry")
    rx = te.reduce_axis((0, kernel_w), name="rx")

    # When tuning, insert a cache_read("texture") stage to properly test
    # performance of kernels that utilize texture inputs. The cache_read
    # is not needed when using the graph_runtime, which supports passing
    # in external texture buffers. This can be removed once AutoTVM tuning
    # supports capturing this runtime information during task extraction
    # or once texture lowering in tir.TextureFlatten supports cache_read
    # cancellation when padding is utilized.
    if autotvm.GLOBAL_SCOPE.in_tuning:
        # NCHWc x KCRSk
        # texture: NCH|W|c
        # texture: K|CRS|k
        Filter_tx = te.compute(
            (num_filter_chunk, channel * kernel_h * kernel_w,
             num_filter_block),
            lambda ffc, crs, ffb: Filter[ffc, crs // (kernel_h * kernel_w), (
                crs // kernel_w) % kernel_h, crs % kernel_w, ffb],
            name="packed_filter")
        conv = te.compute(
            (batch, num_filter_chunk, out_height, out_width, num_filter_block),
            lambda nn, ffc, yy, xx, ffb: te.sum(
                (temp[nn, rcc, yy * stride_h + ry * dilation_h, xx * stride_w +
                      rx * dilation_w, rcb] * Filter_tx[ffc, (
                          (rcc * in_channel_block + rcb) * kernel_h + ry
                      ) * kernel_w + rx, ffb]).astype(args["accumulator"]),
                axis=[rcc, rcb, ry, rx],
            ),
            tag="conv2d_nchwc",
        )
    else:
        conv = te.compute(
            (batch, num_filter_chunk, out_height, out_width, num_filter_block),
            lambda nn, ffc, yy, xx, ffb: te.sum(
                (temp[nn, rcc, yy * stride_h + ry * dilation_h, xx * stride_w +
                      rx * dilation_w, rcb] * Filter[
                          ffc, rcc * in_channel_block + rcb, ry, rx, ffb]).
                astype(args["accumulator"]),
                axis=[rcc, rcb, ry, rx],
            ),
            tag="conv2d_nchwc",
        )
    return te.compute(
        (batch, num_filter_chunk, out_height_orig, out_width_orig,
         num_filter_block),
        lambda n, fc, y, x, fb: conv[n, fc, y, x, fb].astype(out_dtype),
        tag="cast_from_acc" + args["accumulator"][-2:])
Example #3
def compute_depthwise_conv2d_NCHWc_KCRSk(Input,
                                         Filter,
                                         stride,
                                         padding,
                                         dilation,
                                         out_dtype=None,
                                         args={}):
    """Depthwise convolution operator in NCHWc layout. """
    if out_dtype is None:
        out_dtype = Input.dtype
    assert isinstance(stride, int) or len(stride) == 2
    assert isinstance(dilation, int) or len(dilation) == 2

    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    batch, channel_chunk, in_height, in_width, channel_block = Input.shape
    _, channel_multiplier, kernel_h, kernel_w, _ = Filter.shape

    # compute the output shape
    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
    pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w))
    out_channel_chunk = simplify(channel_chunk * channel_multiplier)
    out_height = simplify(
        (in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
    out_width = simplify(
        (in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)
    # compute graph
    pad_before = [0, 0, pad_top, pad_left, 0]
    pad_after = [0, 0, pad_down, pad_right, 0]
    temp = nn.pad(Input, pad_before, pad_after, name="pad_temp")

    ry = te.reduce_axis((0, kernel_h), name="ry")
    rx = te.reduce_axis((0, kernel_w), name="rx")

    if autotvm.GLOBAL_SCOPE.in_tuning:
        # NCHWc x CMRSc = [N,(C//4)M,OH,OW, 4c]
        # NCHWc x CMRS
        # texture: NCH|W|c
        # texture: C|MRS|c
        Filter_tx = te.compute(
            (channel_chunk, channel_multiplier * kernel_h * kernel_w,
             channel_block),
            lambda ffc, mrs, ffb: Filter[ffc, mrs // (kernel_h * kernel_w), (
                mrs // kernel_w) % kernel_h, mrs % kernel_w, ffb],
            name="packed_filter")

        conv = te.compute(
            (batch, out_channel_chunk, out_height, out_width, channel_block),
            lambda nn, ffc, yy, xx, ffb: te.sum(
                (temp[nn, ffc // channel_multiplier, yy * stride_h + ry *
                      dilation_h, xx * stride_w + rx * dilation_w, ffb] *
                 Filter_tx[ffc // channel_multiplier, (
                     (ffc % channel_multiplier) * kernel_h + ry) * kernel_w +
                           rx, ffb]).astype(args["accumulator"]),
                axis=[ry, rx],
            ),
            tag="depthwise_conv2d_nchwc_kcrsk_texture",
        )
    else:
        conv = te.compute(
            (batch, out_channel_chunk, out_height, out_width, channel_block),
            lambda nn, ffc, yy, xx, ffb: te.sum(
                (temp[nn, ffc // channel_multiplier, yy * stride_h + ry *
                      dilation_h, xx * stride_w + rx * dilation_w, ffb] *
                 Filter[ffc // channel_multiplier, ffc % channel_multiplier,
                        ry, rx, ffb]).astype(args["accumulator"]),
                axis=[ry, rx],
            ),
            tag="depthwise_conv2d_nchwc_kcrsk",
        )
    return te.compute(
        conv.shape,
        lambda n, ffc, y, x, ffb: conv[n, ffc, y, x, ffb].astype(out_dtype),
        tag="cast_from_acc" + args["accumulator"][-2:])
Example #4
def conv2d_direct_simd_compute(cfg, data, kernel, strides, padding, dilation,
                               out_dtype):
    """Compute function for Cortex-M7 SIMD implementation of conv2d."""
    assert isinstance(strides, int) or len(strides) == 2
    assert isinstance(dilation, int) or len(dilation) == 2

    if isinstance(strides, int):
        stride_h = stride_w = strides
    else:
        stride_h, stride_w = strides

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    batch_size, in_height, in_width, in_channels = data.shape
    kernel_h, kernel_w, out_channels, _ = kernel.shape

    # compute the output shape
    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w))
    out_height = simplify(
        (in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
    out_width = simplify(
        (in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)

    pad_before = [0, pad_top, pad_left, 0]
    pad_after = [0, pad_down, pad_right, 0]
    padded_data = pad(data, pad_before, pad_after, name="padded_data")

    rc = te.reduce_axis((0, in_channels), name="rc")
    ry = te.reduce_axis((0, kernel_h), name="ry")
    rx = te.reduce_axis((0, kernel_w), name="rx")

    conv = te.compute(
        (batch_size, out_height, out_width, out_channels),
        lambda nn, yy, xx, ff: te.sum(
            padded_data[nn, yy * stride_h + ry * dilation_h, xx * stride_w + rx
                        * dilation_w, rc].astype(out_dtype) * kernel[
                            ry, rx, ff, rc].astype(out_dtype),
            axis=[ry, rx, rc],
        ),
        name="conv2d",
        tag="conv2d_nhwc",
    )

    ###########################
    # Config Space Definition #
    ###########################
    n, oh, ow, co = (
        cfg.axis(batch_size.value),
        cfg.axis(out_height.value),
        cfg.axis(out_width.value),
        cfg.axis(out_channels.value),
    )
    kh, kw, ci = (
        cfg.reduce_axis(kernel_h.value),
        cfg.reduce_axis(kernel_w.value),
        cfg.reduce_axis(in_channels.value),
    )

    owo, owi = cfg.define_split("tile_ow", ow, policy="factors", num_outputs=2)
    cio, cii = cfg.define_split(
        "tile_ci",
        ci,
        policy="factors",
        num_outputs=2,
        # TODO: check case with in_channels.value % 4 != 0 with AutoTVM
        filter=None if cfg.is_fallback else lambda x: x.size[-1] % 4 == 0,
    )
    coo, coi = cfg.define_split("tile_co", co, policy="factors", num_outputs=2)

    cfg.define_reorder(
        "reorder_0_simd",
        [n, oh, owo, owi, coo, coi, kh, kw, cio, cii],
        policy="candidate",
        candidate=[
            [n, oh, kh, kw, owo, coo, cio, owi, coi, cii],
            [n, oh, kh, kw, coo, owo, cio, owi, coi, cii],
            [n, kh, kw, oh, owo, coo, cio, owi, coi, cii],
            [n, kh, kw, oh, coo, owo, cio, owi, coi, cii],
        ],
    )

    cfg.define_knob("auto_unroll_max_step", [0, 2, 4, 8, 16, 32])
    cfg.define_knob("unroll_explicit", [0, 1])

    if cfg.is_fallback:
        cfg.fallback_split("tile_ow", [-1, out_width.value])
        cfg.fallback_split("tile_ci", [-1, in_channels.value])
        cfg.fallback_split("tile_co", [-1, out_channels.value])

    return conv
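Note: a hedged usage sketch for the compute function above; the shapes, dtypes and padding here are illustrative assumptions, it presumes the example's own imports (get_pad_tuple, pad, simplify) are in scope, and outside a tuning template autotvm.get_config() yields a fallback configuration.

import tvm
from tvm import autotvm, te

# Illustrative placeholders; HWOI kernel layout, as indexed by the compute above.
data = te.placeholder((1, 16, 16, 8), name="data", dtype="int8")
kernel = te.placeholder((3, 3, 16, 8), name="kernel", dtype="int8")
cfg = autotvm.get_config()  # fallback configuration outside a tuning template
conv = conv2d_direct_simd_compute(cfg, data, kernel, strides=1,
                                  padding=1, dilation=1, out_dtype="int32")
print(conv.shape)  # [1, 16, 16, 16]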
Example #5
def compute_conv2d_NHWC_HWIO(Input,
                             Filter,
                             stride,
                             padding,
                             dilation,
                             out_dtype=None,
                             args={}):
    """Convolution operator in NHWC layout. """

    if out_dtype is None:
        out_dtype = Input.dtype
    assert isinstance(stride, int) or len(stride) == 2
    assert isinstance(dilation, int) or len(dilation) == 2
    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    batch, in_height, in_width, in_channel = Input.shape
    kernel_h, kernel_w, _, out_channels = Filter.shape
    # compute the output shape
    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
    pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w))

    out_height_orig = out_height = simplify(
        (in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
    out_width_orig = out_width = simplify(
        (in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)

    in_channel_block = 4
    in_channel_tail = in_channel % in_channel_block
    in_channel_chunk = in_channel // in_channel_block
    num_filter_block = 4
    num_filter_tail = out_channels % num_filter_block
    num_filter_chunk = out_channels // num_filter_block

    pad_value = tvm.tir.const(0, Input.dtype)
    # compute:
    if in_channel_tail == 0:
        Input = te.compute(
            [batch, in_height, in_width, in_channel_chunk, in_channel_block],
            lambda nn, yy, xx, icc, icb: Input[nn, yy, xx, icc *
                                               in_channel_block + icb],
            name="input_pack",
            tag="input_pack",
        )
    else:
        in_channel_chunk += 1

        def _reorder_data(*indices):
            condition = []
            condition.append(indices[3] == in_channel_chunk - 1)
            condition.append(indices[4] >= in_channel_tail)
            condition = tvm.tir.all(*condition)
            return tvm.tir.if_then_else(
                condition, pad_value,
                Input[indices[0], indices[1], indices[2],
                      indices[3] * in_channel_block + indices[4]])

        Input = te.compute(
            [batch, in_height, in_width, in_channel_chunk, in_channel_block],
            _reorder_data,
            name="input_pack",
            tag="input_pack_expanded",
        )
    if num_filter_tail == 0 and in_channel_tail == 0:
        Filter = te.compute(
            [
                kernel_h, kernel_w, in_channel, num_filter_chunk,
                num_filter_block
            ],
            lambda kh, kw, ic, nfc, nfb: Filter[kh, kw, ic, nfc *
                                                num_filter_block + nfb],
            name="filter_pack",
            tag="filter_pack",
        )
    else:
        num_filter_chunk += 1

        # HWIO
        def _reorder_weights(*indices):
            conditionA = []
            conditionA.append(indices[3] == num_filter_chunk - 1)
            conditionA.append(indices[4] >= num_filter_tail)
            conditionAT = tvm.tir.all(*conditionA)

            conditionO = []
            conditionO.append(conditionAT)
            conditionO.append(indices[2] >= in_channel)
            conditionOT = tvm.tir.any(*conditionO)
            return tvm.tir.if_then_else(
                conditionOT, pad_value,
                Filter[indices[0], indices[1], indices[2],
                       indices[3] * num_filter_block + indices[4]])

        Filter = te.compute(
            [
                kernel_h, kernel_w, in_channel_chunk * in_channel_block,
                num_filter_chunk, num_filter_block
            ],
            _reorder_weights,
            name="filter_pack",
            tag="filter_pack_expanded",
        )

    # Can the output shape be divided by 2 or even 4?
    # If it cannot, extend it to help the later split.
    # Theoretically there should be additional padding for the inputs, but that will be
    # optimized by cache_read InferBound. We must pad here to produce exactly the tensor
    # required for the calculation of the original output size, and no more! Otherwise the
    # intermediate tensor might be allocated with smaller sizes while the compute tries to
    # fill the expanded one - a data discrepancy as a result.
    # In the case of textures it is not a problem to provide a texture of smaller size, because
    # 1. It does not matter which values are used for the extra calculations - they are
    #    needed only for better utilization of the GPU, to fit the working groups
    # 2. When we request a pixel out of bounds, the texture handles this correctly. As
    #    mentioned above, the value itself is not important
    if out_height % 2 != 0:
        out_height += 1
    if out_width % 2 != 0:
        out_width += 1

    if out_height % 4 != 0:
        out_height += 2
    if out_width % 4 != 0:
        out_width += 2

    # compute graph
    pad_before = [0, pad_top, pad_left, 0, 0]
    pad_after = [0, pad_down, pad_right, 0, 0]
    # calculation of real used input size:
    input_latest_w = (out_width_orig - 1) * stride_w + (kernel_w - 1) * dilation_w + 1
    input_latest_h = (out_height_orig - 1) * stride_h + (kernel_h - 1) * dilation_h + 1
    # NHWC: height is axis 1 and width is axis 2 of the packed tensor
    if input_latest_w < in_width + pad_before[2] + pad_after[2]:
        pad_after[2] -= in_width + pad_before[2] + pad_after[2] - input_latest_w
    if input_latest_h < in_height + pad_before[1] + pad_after[1]:
        pad_after[1] -= in_height + pad_before[1] + pad_after[1] - input_latest_h

    temp = nn.pad(Input, pad_before, pad_after, name="pad_temp")

    rcc = te.reduce_axis((0, in_channel_chunk), name="rc")
    rcb = te.reduce_axis((0, in_channel_block), name="rc")
    ry = te.reduce_axis((0, kernel_h), name="ry")
    rx = te.reduce_axis((0, kernel_w), name="rx")
    conv = te.compute(
        (batch, out_height, out_width, num_filter_chunk, num_filter_block),
        lambda nn, yy, xx, fc, fb: te.sum(
            (temp[nn, yy * stride_h + ry * dilation_h, xx * stride_w + rx *
                  dilation_w, rcc, rcb] * Filter[ry, rx, rcc * in_channel_block
                                                 + rcb, fc, fb]).astype(args[
                                                     "accumulator"]),
            axis=[ry, rx, rcc, rcb],
        ),
        tag="conv2d_nhwc",
    )

    dummy_cast = te.compute(
        (batch, out_height_orig, out_width_orig, num_filter_chunk,
         num_filter_block),
        lambda n, y, x, fc, fb: conv[n, y, x, fc, fb].astype(out_dtype),
        tag="dummy_cast")
    return te.compute((batch, out_height_orig, out_width_orig, out_channels),
                      lambda n, y, x, c: dummy_cast[n, y, x, c // 4, c % 4],
                      tag="cast_from_acc" + args["accumulator"][-2:])
Example #6
def conv1d_nwc_dsp_compute(cfg, data, kernel, strides, padding, dilation,
                           out_dtype):
    """Compute function for v7e-m DSP instructions of conv1d on NWC layout."""
    if isinstance(strides, (tuple, list)):
        strides = strides[0]
    if isinstance(dilation, (tuple, list)):
        dilation = dilation[0]

    batch_size, data_width, in_channels = data.shape
    kernel_size, out_channels, _ = kernel.shape

    # Compute the output shape
    dilated_kernel_size = (kernel_size - 1) * dilation + 1
    pad_left, pad_right = get_pad_tuple1d(padding, (dilated_kernel_size, ))
    out_channels = simplify(out_channels)
    out_width = simplify(
        (data_width - dilated_kernel_size + pad_left + pad_right) // strides +
        1)

    # Apply padding
    pad_before = [0, pad_left, 0]
    pad_after = [0, pad_right, 0]
    padded_data = pad(data, pad_before, pad_after, name="padded_data")

    # Compute graph
    rc = te.reduce_axis((0, in_channels), name="rc")
    rw = te.reduce_axis((0, kernel_size), name="rw")

    conv = te.compute(
        (batch_size, out_width, out_channels),
        lambda b, w, c: te.sum(
            padded_data[b, w * strides + rw * dilation, rc].astype(out_dtype) *
            kernel[rw, c, rc].astype(out_dtype),
            axis=[rw, rc],
        ),
        name="conv1d",
        tag="conv1d_nwc",
    )

    ###########################
    # Config Space Definition #
    ###########################
    n, ow, co = (
        cfg.axis(batch_size.value),
        cfg.axis(out_width.value),
        cfg.axis(out_channels.value),
    )
    kw, ci = (
        cfg.reduce_axis(kernel_size.value),
        cfg.reduce_axis(in_channels.value),
    )

    owo, owi = cfg.define_split("tile_ow", ow, policy="factors", num_outputs=2)
    cio, cii = cfg.define_split(
        "tile_ci",
        ci,
        policy="factors",
        num_outputs=2,
        # TODO: check case with in_channels.value % 4 != 0 with AutoTVM
        filter=None if cfg.is_fallback else lambda x: x.size[-1] % 4 == 0,
    )
    coo, coi = cfg.define_split("tile_co", co, policy="factors", num_outputs=2)

    cfg.define_reorder(
        "reorder_0_simd",
        [n, owo, owi, coo, coi, kw, cio, cii],
        policy="candidate",
        candidate=[
            [n, kw, owo, coo, cio, owi, coi, cii],
            [n, kw, coo, owo, cio, owi, coi, cii],
            [n, kw, owo, coo, cio, owi, coi, cii],
            [n, kw, coo, owo, cio, owi, coi, cii],
        ],
    )

    cfg.define_knob("auto_unroll_max_step", [0, 2, 4, 8, 16, 32])
    cfg.define_knob("unroll_explicit", [0, 1])

    if cfg.is_fallback:
        cfg.fallback_split("tile_ow", [-1, out_width.value])
        cfg.fallback_split("tile_ci", [-1, in_channels.value])
        cfg.fallback_split("tile_co", [-1, out_channels.value])

    return conv
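Note: a worked instance of the 1-D output-width formula above, in plain Python with illustrative values:

data_width, kernel_size, strides, dilation = 32, 5, 2, 1
pad_left = pad_right = 2
dilated_kernel_size = (kernel_size - 1) * dilation + 1                               # 5
out_width = (data_width - dilated_kernel_size + pad_left + pad_right) // strides + 1
print(out_width)  # 16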
Example #7
def dilation2d_nchw(input, filter, stride, padding, dilations, out_dtype=None):
    """Morphological dilation operator in NCHW layout.

    Parameters
    ----------
    input : tvm.te.Tensor
        4-D with shape [batch, in_channel, in_height, in_width]

    filter : tvm.te.Tensor
        3-D with shape [in_channel, filter_height, filter_width]

    stride : int or a list/tuple of two ints
        Stride size, or [stride_height, stride_width]

    padding : int or str
        Padding size

    dilations: int or a list/tuple of two ints
        dilation size, or [dilation_height, dilation_width]

    out_dtype : Optional[str]
        Specifies the output data type.

    Returns
    -------
    Output : tvm.te.Tensor
        4-D with shape [batch, in_channel, out_height, out_width]
    """
    if out_dtype is None:
        out_dtype = input.dtype
    assert isinstance(stride, int) or len(stride) == 2
    assert isinstance(dilations, int) or len(dilations) == 2
    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    if isinstance(dilations, int):
        dilation_h = dilation_w = dilations
    else:
        dilation_h, dilation_w = dilations

    batch, in_channel, in_height, in_width = input.shape
    channel, kernel_h, kernel_w = filter.shape
    assert (in_channel.value == channel.value
            ), "For Dilation2D, input and filter channels must be the same."

    # compute the output shape
    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w))

    out_height = simplify(
        (in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
    out_width = simplify(
        (in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)
    # compute graph
    pad_before = [0, 0, pad_top, pad_left]
    pad_after = [0, 0, pad_down, pad_right]
    temp = pad(input, pad_before, pad_after, name="pad_temp")
    ry = te.reduce_axis((0, kernel_h), name="ry")
    rx = te.reduce_axis((0, kernel_w), name="rx")

    return te.compute(
        (batch, in_channel, out_height, out_width),
        lambda nn, ff, yy, xx: te.max(
            temp[nn, ff, yy * stride_h + ry * dilation_h, xx * stride_w + rx *
                 dilation_w].astype(out_dtype) + filter[ff, ry, rx].astype(
                     out_dtype),
            axis=[ry, rx],
        ),
        tag="dilation2d_nchw",
    )
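Note: a small NumPy reference (illustrative; valid padding and no dilation, for brevity) of the morphological dilation the compute above expresses: each output element is the maximum over the window of input plus filter.

import numpy as np

def dilation2d_ref(x, f, stride=1):
    # x: (C, H, W), f: (C, kh, kw)
    C, H, W = x.shape
    _, kh, kw = f.shape
    oh, ow = (H - kh) // stride + 1, (W - kw) // stride + 1
    out = np.empty((C, oh, ow), dtype=x.dtype)
    for c in range(C):
        for y in range(oh):
            for xx in range(ow):
                window = x[c, y * stride:y * stride + kh, xx * stride:xx * stride + kw]
                out[c, y, xx] = np.max(window + f[c])   # max of input + filter
    return out

x = np.random.rand(2, 6, 6)
f = np.random.rand(2, 3, 3)
print(dilation2d_ref(x, f).shape)  # (2, 4, 4)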
Example #8
File: utils.py, Project: were/tvm
def expand_spatial_dimensions(in_height, in_width, kernel_h, kernel_w,
                              dilation_h, dilation_w, padding, stride_h,
                              stride_w):
    """
    Expands spatial dimensions to be divisible by a factor of 4. This allows us to achieve
    much better parallel computation on the GPU. The drawback of this solution is a number
    of useless computations. In fact, the speed-up from parallelism significantly outweighs
    the slowdown from the extra compute, so eventually this is a useful approach, at least
    for GPUs.

    Parameters
    ----------
    in_height: int
        Height of the feature map

    in_width: int
        Width of the feature map

    kernel_h: int
        Height of the conv2d kernel

    kernel_w: int
        Width of the conv2d kernel

    dilation_h: int
        Vertical dilation of the conv2d kernel

    dilation_w: int
        Horizontal dilation of the conv2d kernel

    padding: tuple or list
        Conv2d paddings

    stride_h: int
        Vertical stride of the conv2d kernel

    stride_w: int
        Horizontal stride of the conv2d kernel
    """
    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1

    pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w))

    out_height_orig = out_height = simplify(
        (in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
    out_width_orig = out_width = simplify(
        (in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)

    # Can the output shape be divided by 2 or even 4?
    # If it cannot, extend it to help the later split.
    # Theoretically there should be additional padding for the inputs, but that will be
    # optimized by cache_read InferBound. We must pad here to produce exactly the tensor
    # required for the calculation of the original output size, and no more! Otherwise the
    # intermediate tensor might be allocated with smaller sizes while the compute tries to
    # fill the expanded one - a data discrepancy as a result.
    # In the case of textures it is not a problem to provide a texture of smaller size, because
    # 1. It does not matter which values are used for the extra calculations - they are
    #    needed only for better utilization of the GPU, to fit the working groups
    # 2. When we request a pixel out of bounds, the texture handles this correctly. As
    #    mentioned above, the value itself is not important
    if out_height % 2 != 0:
        out_height += 1
    if out_width % 2 != 0:
        out_width += 1

    if out_height % 4 != 0:
        out_height += 2
    if out_width % 4 != 0:
        out_width += 2
    return out_height_orig, out_height, out_width_orig, out_width
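Note: a pure-Python mirror of the divisibility fix-up above (illustrative):

def round_up_spatial(out_size):
    """Mirror of the rounding above: odd -> +1, then remainder 2 mod 4 -> +2."""
    if out_size % 2 != 0:
        out_size += 1
    if out_size % 4 != 0:
        out_size += 2
    return out_size

assert round_up_spatial(56) == 56
assert round_up_spatial(57) == 60
assert round_up_spatial(58) == 60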
Example #9
def compute_conv2d_NCHWc_tpack(Input, Filter, stride, padding, dilation, out_dtype=None, args={}):
    """Convolution operator in NCHWc layout. """

    if out_dtype is None:
        out_dtype = Input.dtype
    assert isinstance(stride, int) or len(stride) == 2
    assert isinstance(dilation, int) or len(dilation) == 2
    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    batch, in_channels, in_height, in_width = Input.shape
    out_channels, _, kernel_h, kernel_w = Filter.shape
    in_channel_tail = in_channels % 4
    in_channel_chunk = in_channels // 4
    if in_channel_tail == 0:
        in_channel_tail = 4
    else:
        in_channel_chunk += 1

    num_filter_block = out_channels % 4
    num_filter_chunk = out_channels // 4
    if num_filter_block == 0:
        num_filter_block = 4
    else:
        num_filter_chunk += 1

    pad_value = tvm.tir.const(0, Input.dtype)
    def _reorder_data(*indices):
        condition = []
        condition.append(indices[1] == in_channel_chunk - 1)
        condition.append(indices[4] >= in_channel_tail)
        condition = tvm.tir.all(*condition)
        return tvm.tir.if_then_else(
                condition,
                pad_value,
                Input[indices[0],indices[1] * 4 + indices[4], indices[2], indices[3]])

    # compute:
    reordered_data = te.compute(
        [batch, in_channel_chunk, in_height, in_width, 4],
        _reorder_data,
        name="input_pack",
        tag="input_pack",
    )

    def _reorder_weights(*indices):
        conditionA = []
        conditionA.append(indices[0] == num_filter_chunk - 1)
        conditionA.append(indices[4] >= num_filter_block)
        conditionAT = tvm.tir.all(*conditionA)

        conditionO = []
        conditionO.append(conditionAT)
        conditionO.append(indices[1] >= in_channels)
        conditionOT = tvm.tir.any(*conditionO)
        return tvm.tir.if_then_else(
                conditionOT,
                pad_value,
                Filter[indices[0] * 4 + indices[4], indices[1], indices[2], indices[3]])

    reordered_filter = te.compute(
        [num_filter_chunk, in_channel_chunk * 4, kernel_h, kernel_w, 4],
        _reorder_weights,
        name="filter_pack",
        tag="filter_pack",
    )

    # compute the output shape
    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
    pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w)
    )

    out_height_orig = out_height = simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
    out_width_orig = out_width = simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)

    # Can the output shape be divided by 2 or even 4?
    # If it cannot, extend it to help the later split.
    # Theoretically there should be additional padding for the inputs, but that will be
    # optimized by cache_read InferBound. We must pad here to produce exactly the tensor
    # required for the calculation of the original output size, and no more! Otherwise the
    # intermediate tensor might be allocated with smaller sizes while the compute tries to
    # fill the expanded one - a data discrepancy as a result.
    # In the case of textures it is not a problem to provide a texture of smaller size, because
    # 1. It does not matter which values are used for the extra calculations - they are
    #    needed only for better utilization of the GPU, to fit the working groups
    # 2. When we request a pixel out of bounds, the texture handles this correctly. As
    #    mentioned above, the value itself is not important
    if out_height % 2 != 0:
        out_height += 1
    if out_width % 2 != 0:
        out_width += 1

    if out_height % 4 != 0:
        out_height += 2
    if out_width % 4 != 0:
        out_width += 2

    # compute graph
    pad_before = [0, 0, pad_top, pad_left, 0]
    pad_after = [0, 0, pad_down, pad_right, 0]
    # calculation of real used input size:
    input_latest_w = (out_width_orig - 1) * stride_w + (kernel_w - 1) * dilation_w + 1
    input_latest_h = (out_height_orig - 1) * stride_h + (kernel_h - 1) * dilation_h + 1
    if input_latest_w < in_width + pad_before[3] + pad_after[3]:
        pad_after[3] -= in_width + pad_before[3] + pad_after[3] - input_latest_w
    if input_latest_h < in_height + pad_before[2] + pad_after[2]:
        pad_after[2] -= in_height + pad_before[2] + pad_after[2] - input_latest_h
    temp = nn.pad(reordered_data, pad_before, pad_after, name="pad_temp")

    rcc = te.reduce_axis((0, in_channel_chunk), name="rcc")
    rcb = te.reduce_axis((0, 4), name="rcb")
    ry = te.reduce_axis((0, kernel_h), name="ry")
    rx = te.reduce_axis((0, kernel_w), name="rx")

    conv = te.compute(
        (batch, num_filter_chunk, out_height, out_width, 4),
        lambda nn, ffc, yy, xx, ffb: te.sum(
            (temp[nn, rcc, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w, rcb]
            * reordered_filter[ffc, rcc * 4 + rcb, ry, rx, ffb]).astype(args["accumulator"]),
            axis=[rcc, rcb, ry, rx],
        ),
        tag="conv2d_nchwc_tpack",
    )

    # conv = s.cache_write(conv, "local") does not work properly: it does not create an
    # intermediate buffer, it keeps reading/writing the global tensor as the accumulator,
    # and it leads to a crash at runtime.
    # For this reason we have to use this dummy cast plus compute_at to create such an
    # intermediate accumulator with local scope.
    dummy_cast = te.compute(
        (batch, num_filter_chunk, out_height_orig, out_width_orig, 4),
        lambda n, fc, y, x, fb: conv[n, fc, y, x, fb].astype(out_dtype),
        tag="dummy_cast")

    return te.compute(
        (batch, out_channels, out_height_orig, out_width_orig),
        lambda n, c, y, x: dummy_cast[n, c // 4, y, x, c % 4],
        tag="cast_from_acc" + args["accumulator"][-2:])
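Note: a hedged sketch of the schedule-side pattern the dummy_cast comment refers to; the names and the axis choice are assumptions, since the actual schedule lives in the matching schedule function.

# Hypothetical schedule fragment; conv, dummy_cast and output name the tensors above.
s = te.create_schedule(output.op)
s[conv].set_scope("local")                # the accumulator lives in thread-local memory
_, _, y, x, _ = s[dummy_cast].op.axis
s[conv].compute_at(s[dummy_cast], x)      # materialize the accumulator per output tile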
def compute_conv2d_NCHWc_KCRSk_acc32(Input,
                                     Filter,
                                     stride,
                                     padding,
                                     dilation,
                                     out_dtype=None):
    """Convolution operator in NCHWc layout."""

    if out_dtype is None:
        out_dtype = Input.dtype
    assert isinstance(stride, int) or len(stride) == 2
    assert isinstance(dilation, int) or len(dilation) == 2
    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    batch, in_channel_chunk, in_height, in_width, in_channel_block = Input.shape
    num_filter_chunk, channel, kernel_h, kernel_w, num_filter_block = Filter.shape
    # compute the output shape
    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
    pad_top, pad_left, pad_down, pad_right = nn.get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w))

    out_height = simplify(
        (in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
    out_width = simplify(
        (in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)
    # compute graph
    pad_before = [0, 0, pad_top, pad_left, 0]
    pad_after = [0, 0, pad_down, pad_right, 0]
    temp = nn.pad(Input, pad_before, pad_after, name="pad_temp")

    rcc = te.reduce_axis((0, in_channel_chunk), name="rc")
    rcb = te.reduce_axis((0, in_channel_block), name="rc")
    ry = te.reduce_axis((0, kernel_h), name="ry")
    rx = te.reduce_axis((0, kernel_w), name="rx")

    # NCHWc x KCRSk
    # texture: NCH|W|c
    # texture: K|CRS|k
    # c = crs // RS
    # rs = crs % RS
    # r = rs // S == (crs // S) % R
    # s = rs % S == crs % S
    Filter = te.compute(
        (num_filter_chunk, channel * kernel_h * kernel_w, num_filter_block),
        lambda ffc, crs, ffb: Filter[ffc, crs // (kernel_h * kernel_w), (
            crs // kernel_w) % kernel_h, crs % kernel_w, ffb],
        name="packed_filter",
    )
    conv = te.compute(
        (batch, num_filter_chunk, out_height, out_width, num_filter_block),
        lambda nn, ffc, yy, xx, ffb: te.sum(
            (temp[nn, rcc, yy * stride_h + ry * dilation_h, xx * stride_w + rx
                  * dilation_w, rcb] * Filter[ffc, (
                      (rcc * in_channel_block + rcb) * kernel_h + ry
                  ) * kernel_w + rx, ffb]).astype(out_dtype),
            axis=[rcc, rcb, ry, rx],
        ),
        tag="conv2d_nchwc_kcrsk_texture",
    )
    output = te.compute(
        conv.shape,
        lambda n, fc, y, x, fb: conv[n, fc, y, x, fb].astype("float32"))
    return output