示例#1
0
 def main(A: T.handle, tensor: T.handle) -> None:
     # Two chained sliding-window max reductions over int8 data:
     #   stage 1: A_1 -> tensor_2 using a 3x3 (dh, dw) window,
     #   stage 2: tensor_2 -> tensor_1 using a 3x5 (dh_1, dw_1) window.
     # The second axis of tensor_2 is always indexed modulo 6, so only six of
     # its rows are touched and they are reused circularly across ax1_outer.
     # NOTE(review): this looks like the expected result of a rolling-buffer
     # rewrite -- confirm against the test that consumes this function.
     # function attr dict
     T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
     # buffer definition
     # tensor_2 is declared locally (not bound to a parameter); A_1 and
     # tensor_1 are bound to the handles A and tensor respectively.
     tensor_2 = T.buffer_decl([1, 10, 12, 16], dtype="int8", elem_offset=0, align=128, offset_factor=1)
     A_1 = T.match_buffer(A, [1, 12, 14, 16], dtype="int8", elem_offset=0, align=128, offset_factor=1)
     tensor_1 = T.match_buffer(tensor, [1, 8, 8, 16], dtype="int8", elem_offset=0, align=128, offset_factor=1)
     # body
     T.realize(tensor_1[0:1, 0:8, 0:8, 0:16], "")
     # Only rows 0..5 of tensor_2 are realized although it is declared with 10
     # rows; every access below wraps its row index with floormod(..., 6).
     T.realize(tensor_2[0:1, 0:6, 0:12, 0:16], "")
     for ax1_outer in T.serial(0, 2):
         # Stage 1: fill tensor_2 for logical rows ax1 + ax1_outer*4.
         for ax1 in T.serial(0, 6):
             for ax2 in T.serial(0, 12):
                 for ax3 in T.serial(0, 16):
                     # When ax1_outer == 1, logical rows 4 and 5 (ax1 in {0, 1})
                     # were already produced by the ax1_outer == 0 pass
                     # (ax1 = 4, 5), so the guard skips recomputing them.
                     if T.likely(((ax1_outer < 1) or (ax1 >= 2)), dtype='bool') :
                         tensor_2[0, T.floormod((ax1 + (ax1_outer*4)), 6), ax2, ax3] = T.int8(0)
                     for dh in T.serial(0, 3):
                         for dw in T.serial(0, 3):
                             # Same guard as the initialization above, applied to the
                             # running-max update over the 3x3 window of A_1.
                             if T.likely(((ax1_outer < 1) or (ax1 >= 2)), dtype='bool'):
                                 tensor_2[0, T.floormod((ax1 + (ax1_outer*4)), 6), ax2, ax3] = T.max(tensor_2[0, T.floormod((ax1 + (ax1_outer*4)), 6), ax2, ax3], A_1[0, ((ax1 + (ax1_outer*4)) + dh), (ax2 + dw), ax3])
         # Stage 2: produce 4 output rows of tensor_1 per ax1_outer iteration.
         for ax1_inner in T.serial(0, 4):
             for ax2_inner in T.serial(0, 8):
                 for ax3_inner in T.serial(0, 16):
                     tensor_1[0, (ax1_inner + (ax1_outer*4)), ax2_inner, ax3_inner] = T.int8(0)
                     for dh_1 in T.serial(0, 3):
                         for dw_1 in T.serial(0, 5):
                             # Reads tensor_2 through the same modulo-6 row mapping
                             # used by the writes in stage 1.
                             tensor_1[0, (ax1_inner + (ax1_outer*4)), ax2_inner, ax3_inner] = T.max(tensor_1[0, (ax1_inner + (ax1_outer*4)), ax2_inner, ax3_inner], tensor_2[0, T.floormod(((ax1_inner + (ax1_outer*4)) + dh_1), 6), (ax2_inner + dw_1), ax3_inner])
def non_perfect_tiling_cache(a: T.handle, b: T.handle) -> None:
    # Tiled 3x3 max reduction over a 224x224 float32 input with a one-element
    # zero border, staged through an intermediate `cache` buffer.
    # The output is split into 28x28 tiles of 8x8 elements; each tile first
    # copies a 10x10 input window (the tile plus a one-element halo) into
    # `cache`, then reduces from it.
    X = T.match_buffer(a, [224, 224], dtype="float32")
    Y = T.match_buffer(b, [224, 224], dtype="float32")
    # Allocated at full input size here and indexed with global coordinates.
    cache = T.alloc_buffer([224, 224], dtype="float32")
    for hh_0, ww_0 in T.grid(28, 28):
        for ax0 in T.serial(0, 10):
            for ax1 in T.serial(0, 10):
                with T.block("cache"):
                    # Window starts one element before the tile origin.
                    h = T.axis.spatial(224, hh_0 * 8 - 1 + ax0)
                    w = T.axis.spatial(224, ww_0 * 8 - 1 + ax1)
                    # Predicate keeps h and w inside [0, 224): the halo falls
                    # outside the buffer on the first and last tiles.
                    T.where(1 <= hh_0 * 8 + ax0 and hh_0 * 8 + ax0 < 225
                            and 1 <= ww_0 * 8 + ax1 and ww_0 * 8 + ax1 < 225)
                    cache[h, w] = X[h, w]
        for hh_1, ww_1, khh, kww in T.grid(8, 8, 3, 3):
            with T.block("compute"):
                h = T.axis.spatial(224, hh_0 * 8 + hh_1)
                w = T.axis.spatial(224, ww_0 * 8 + ww_1)
                # Both window offsets are bound as reduction ("R") axes.
                kh, kw = T.axis.remap("RR", [khh, kww])
                with T.init():
                    Y[h, w] = 0.0
                Y[h, w] = T.max(
                    Y[h, w],
                    # Out-of-range window positions contribute 0.0 instead of
                    # reading `cache`.
                    T.if_then_else(
                        T.likely(1 <= h + kh, dtype="bool")
                        and T.likely(h + kh < 225, dtype="bool")
                        and T.likely(1 <= w + kw, dtype="bool")
                        and T.likely(w + kw < 225, dtype="bool"),
                        cache[h + kh - 1, w + kw - 1],
                        0.0,
                        dtype="float32",
                    ),
                )
 def compacted_spatial_tiled_pad_and_pooling(
     X: T.Buffer[(64, 112, 112), "int32"], Y: T.Buffer[(64, 56, 56), "int32"]
 ) -> None:
     # Tiled 3x3, stride-2 max reduction over X with zero fill on the low
     # (index -1) border, using a per-tile 9x9x64 cache addressed with
     # tile-relative indices.
     # NOTE(review): Y is annotated as (64, 56, 56) but is indexed below as
     # Y[h, w, c] with c ranging over 64; the last annotated extent (56) is
     # smaller than that range -- confirm the intended layout.
     for h_o, w_o in T.grid(14, 14):
         with T.block():
             # Tile-level access regions: a 9x9 input window (8 rows/cols plus
             # one halo element before the tile) and a 4x4 output patch.
             T.reads(X[0:64, h_o * 8 - 1 : h_o * 8 + 8, w_o * 8 - 1 : w_o * 8 + 8])
             T.writes(Y[h_o * 4 : h_o * 4 + 4, w_o * 4 : w_o * 4 + 4, 0:64])
             X_cache = T.alloc_buffer([9, 9, 64], dtype="int32")
             for ax0, ax1, ax2 in T.grid(64, 9, 9):
                 with T.block("cache"):
                     # Skip the halo element that would read X at index -1
                     # (only possible when h_o == 0 or w_o == 0).
                     T.where(1 <= h_o * 8 + ax1 and 1 <= w_o * 8 + ax2)
                     T.reads(X[ax0, h_o * 8 + ax1 - 1, w_o * 8 + ax2 - 1])
                     # Cache index = global index minus the tile's first valid
                     # global index, max(0, h_o * 8 - 1): this yields ax1 - 1
                     # for h_o == 0 and ax1 otherwise, so it stays within 0..8.
                     T.writes(
                         X_cache[
                             h_o * 8 + ax1 - T.max(0, h_o * 8 - 1) - 1,
                             w_o * 8 + ax2 - T.max(0, w_o * 8 - 1) - 1,
                             ax0,
                         ]
                     )
                     X_cache[
                         h_o * 8 + ax1 - T.max(0, h_o * 8 - 1) - 1,
                         w_o * 8 + ax2 - T.max(0, w_o * 8 - 1) - 1,
                         ax0,
                     ] = X[ax0, h_o * 8 + ax1 - 1, w_o * 8 + ax2 - 1]
             for h_i, w_i, kh, kw, c in T.grid(4, 4, 3, 3, 64):
                 with T.block("compute"):
                     # Same tile-relative mapping as the cache writes, applied
                     # to the stride-2 window position h_i * 2 + kh.
                     T.reads(
                         X_cache[
                             h_o * 8 + h_i * 2 + kh - T.max(0, h_o * 8 - 1) - 1,
                             w_o * 8 + w_i * 2 + kw - T.max(0, w_o * 8 - 1) - 1,
                             c,
                         ]
                     )
                     T.writes(Y[h_o * 4 + h_i, w_o * 4 + w_i, c])
                     # Reset the accumulator on the first window position.
                     if kh == 0 and kw == 0:
                         Y[h_o * 4 + h_i, w_o * 4 + w_i, c] = 0
                     Y[h_o * 4 + h_i, w_o * 4 + w_i, c] = T.max(
                         Y[h_o * 4 + h_i, w_o * 4 + w_i, c],
                         # Window positions that map to global index -1
                         # contribute 0 instead of reading X_cache.
                         T.if_then_else(
                             T.likely(1 <= h_o * 8 + h_i * 2 + kh, dtype="bool")
                             and T.likely(1 <= w_o * 8 + w_i * 2 + kw, dtype="bool"),
                             X_cache[
                                 h_o * 8 + h_i * 2 + kh - T.max(0, h_o * 8 - 1) - 1,
                                 w_o * 8 + w_i * 2 + kw - T.max(0, w_o * 8 - 1) - 1,
                                 c,
                             ],
                             0,
                             dtype="int32",
                         ),
                     )
def compacted_padding_pattern_inlined(
    X: T.Buffer[(224, 224), "float32"], Y: T.Buffer[(224, 224), "float32"]
) -> None:
    # 3x3 max over a 224x224 float32 input with a one-element zero border,
    # where the border handling is expressed inline as an if_then_else on the
    # read rather than as a separate padded buffer.
    # `cache` keeps the full 224x224 extent and is a plain copy of X.
    # NOTE(review): presumably the expected post-compaction form of the
    # handle-based variant of this function -- confirm with the consuming test.
    cache = T.alloc_buffer([224, 224], dtype="float32")
    for h, w in T.grid(224, 224):
        with T.block("cache"):
            cache[h, w] = X[h, w]
    for h, w, kh, kw in T.grid(224, 224, 3, 3):
        with T.block("compute"):
            # Y is accumulated in place; this block has no explicit init.
            Y[h, w] = T.max(
                Y[h, w],
                # Window positions outside [0, 224) contribute 0.0 instead of
                # reading `cache` at index -1 or 224.
                T.if_then_else(
                    T.likely(1 <= h + kh, dtype="bool")
                    and T.likely(h + kh < 225, dtype="bool")
                    and T.likely(1 <= w + kw, dtype="bool")
                    and T.likely(w + kw < 225, dtype="bool"),
                    cache[h + kh - 1, w + kw - 1],
                    0.0,
                    dtype="float32",
                ),
            )
def padding_pattern_inlined(a: T.handle, b: T.handle) -> None:
    # 3x3 max over a 224x224 float32 input with a one-element zero border,
    # where the border handling is expressed inline as an if_then_else on the
    # read rather than as a separate padded buffer.
    # X and Y are bound to the opaque handles a and b.
    X = T.match_buffer(a, [224, 224], dtype="float32")
    Y = T.match_buffer(b, [224, 224], dtype="float32")
    # Full-size intermediate copy of X.
    cache = T.alloc_buffer([224, 224], dtype="float32")
    for h, w in T.grid(224, 224):
        with T.block("cache"):
            cache[h, w] = X[h, w]
    for h, w, kh, kw in T.grid(224, 224, 3, 3):
        with T.block("compute"):
            # Y is accumulated in place; this block has no explicit init.
            Y[h, w] = T.max(
                Y[h, w],
                # Window positions outside [0, 224) contribute 0.0 instead of
                # reading `cache` at index -1 or 224.
                T.if_then_else(
                    T.likely(1 <= h + kh, dtype="bool")
                    and T.likely(h + kh < 225, dtype="bool")
                    and T.likely(1 <= w + kw, dtype="bool")
                    and T.likely(w + kw < 225, dtype="bool"),
                    cache[h + kh - 1, w + kw - 1],
                    0.0,
                    dtype="float32",
                ),
            )
 def spatial_tiled_pad_and_pooling(
     X: T.Buffer[(64, 112, 112), "int32"], Y: T.Buffer[(64, 56, 56), "int32"]
 ) -> None:
     # Tiled 3x3, stride-2 max reduction over X with zero fill outside the
     # input, staged through X_cache. Here X_cache is allocated at the full
     # 112x112x64 extent and indexed with global (h, w, c) coordinates.
     # NOTE(review): Y is annotated as (64, 56, 56) but is indexed below as
     # Y[h, w, c] with c ranging over 64; the last annotated extent (56) is
     # smaller than that range -- confirm the intended layout.
     for h_o, w_o in T.grid(14, 14):
         with T.block():
             X_cache = T.alloc_buffer([112, 112, 64], dtype="int32")
             # Copy a 9x9 window per channel: 8 rows/cols of the tile plus one
             # halo element before the tile origin.
             for ax0, ax1, ax2 in T.grid(64, 9, 9):
                 with T.block("cache"):
                     # Skip the halo element that would land on index -1
                     # (only possible when h_o == 0 or w_o == 0).
                     T.where(1 <= h_o * 8 + ax1 and 1 <= w_o * 8 + ax2)
                     T.reads(X[ax0, h_o * 8 - 1 + ax1, w_o * 8 - 1 + ax2])
                     T.writes(X_cache[h_o * 8 - 1 + ax1, w_o * 8 - 1 + ax2, ax0])
                     # Channel moves from the first axis of X to the last axis
                     # of X_cache.
                     X_cache[h_o * 8 - 1 + ax1, w_o * 8 - 1 + ax2, ax0] = X[
                         ax0, h_o * 8 - 1 + ax1, w_o * 8 - 1 + ax2
                     ]
             for h_i, w_i, kh, kw, c in T.grid(4, 4, 3, 3, 64):
                 with T.block("compute"):
                     # Output (h_o*4 + h_i, w_o*4 + w_i) reads input position
                     # 2 * output + k - 1 along each spatial axis.
                     T.reads(
                         X_cache[(h_o * 4 + h_i) * 2 + kh - 1, (w_o * 4 + w_i) * 2 + kw - 1, c]
                     )
                     T.writes(Y[h_o * 4 + h_i, w_o * 4 + w_i, c])
                     # Reset the accumulator on the first window position.
                     if kh == 0 and kw == 0:
                         Y[h_o * 4 + h_i, w_o * 4 + w_i, c] = 0
                     Y[h_o * 4 + h_i, w_o * 4 + w_i, c] = T.max(
                         Y[h_o * 4 + h_i, w_o * 4 + w_i, c],
                         # Window positions outside [0, 112) contribute 0
                         # instead of reading X_cache.
                         T.if_then_else(
                             T.likely(1 <= (h_o * 4 + h_i) * 2 + kh, dtype="bool")
                             and T.likely((h_o * 4 + h_i) * 2 + kh < 113, dtype="bool")
                             and T.likely(1 <= (w_o * 4 + w_i) * 2 + kw, dtype="bool")
                             and T.likely((w_o * 4 + w_i) * 2 + kw < 113, dtype="bool"),
                             X_cache[
                                 (h_o * 4 + h_i) * 2 + kh - 1,
                                 (w_o * 4 + w_i) * 2 + kw - 1,
                                 c,
                             ],
                             0,
                             dtype="int32",
                         ),
                     )