Example No. 1
 def main(buffer1: T.Buffer[(64,), "uint8"], 
 buffer2: T.Buffer[(48,), "uint8"], 
 buffer3: T.Buffer[(256,), "uint8"],
 buffer4: T.Buffer[(256,), "uint8"],
 buffer5: T.Buffer[(16,), "uint8"],
 buffer6: T.Buffer[(48,), "uint8"],
 buffer7: T.Buffer[(256,), "uint8"],
 buffer8: T.Buffer[(64,), "uint8"],
 buffer9: T.Buffer[(256,), "int8"],
 ) -> None:
     T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
     # body
     p1 = T.allocate([48], "uint8", "global")
     p2 = T.allocate([48], "uint8", "global")
     p3 = T.allocate([256], "int8", "local")
     p5 = T.allocate([16], "uint8", "global")
     p6 = T.allocate([48], "uint8", "global")
     p7 = T.allocate([256], "int8", "local")
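     # Stage constant data into the working buffers allocated above; the 256-byte copies
     # into "local" scope hold the activation look-up tables used by the TANH ops below.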
     T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 48, p1[0], dtype="handle"))
     T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 48, p2[0], dtype="handle"))
     T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 256, p3[0], dtype="handle")) # Local
     T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 16, p5[0], dtype="handle"))
     T.evaluate(T.call_extern("ethosu_copy", buffer6[0], 48, p6[0], dtype="handle"))
     T.evaluate(T.call_extern("ethosu_conv2d", "int8", 4, 4, 4, 4, 0, 4, buffer1[0], 0, 0, 0, T.float32(0.00392081), -128, "NHWC", 16, 4, 1, "int8", 4, 4, 4, 4, 0, 4, buffer9[0], 0, 0, 0, T.float32(0.00839574), -128, "NHCWB16", 64, 16, 1, 1, 1, 1, 1, 1, 1, p1[0], 48, 0, p2[0], 48, 0, 0, 0, 0, "TANH", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
     T.evaluate(T.call_extern("ethosu_copy", buffer7[0], 256, p7[0], dtype="handle")) # Local
     T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 4, 4, 4, 4, 0, 4, buffer9[0], 0, 0, 0, T.float32(0.0078125), 0, "NHCWB16", 64, 16, 1, "int8", 4, 4, 4, 4, 0, 4, buffer8[0], 0, 0, 0, T.float32(0.00372155), -128, "NHWC", 16, 4, 1, 1, 1, 1, 1, 1, 1, p5[0], 16, 0, p6[0], 48, 0, 0, 0, 0, "TANH", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
Example No. 2
def gemm_mma_m8n8k4_row_col_fp64pf64fp64(a: T.handle, b: T.handle,
                                         c: T.handle):
    T.func_attr({"global_symbol": "default_function", "tir.noalias": True})
    A = T.match_buffer(a, [8, 4], dtype="float64")
    B = T.match_buffer(b, [8, 4], dtype="float64")
    C = T.match_buffer(c, [8, 8], dtype="float64")
    brow = T.env_thread("blockIdx.y")
    bcol = T.env_thread("blockIdx.x")
    tx = T.env_thread("threadIdx.x")
    T.launch_thread(brow, 1)
    T.launch_thread(bcol, 1)
    T.launch_thread(tx, 32)
    MultiA = T.allocate([1], "float64", scope="local")
    MultiB = T.allocate([1], "float64", scope="local")
    Accum = T.allocate([2], "float64", scope="local")
    for i in range(2):
        Accum[i] = T.float64(0)

    MultiA[0] = A[(tx % 32) // 4, (tx % 32) % 4]
    MultiB[0] = B[(tx % 32) // 4, (tx % 32) % 4]
    T.evaluate(
        T.ptx_mma(
            "m8n8k4",
            "row",
            "col",
            "fp64",
            "fp64",
            "fp64",
            MultiA.data,
            0,
            MultiB.data,
            0,
            Accum.data,
            0,
            False,
            dtype="float64",
        ))
    for mma_accum_c_id in range(2):
        C[(tx % 32) // 4,
          (tx % 32) % 4 * 2 + mma_accum_c_id] = Accum[mma_accum_c_id]
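A minimal usage sketch (my addition, not part of the test suite): assuming the function above is wrapped with the @T.prim_func decorator from `from tvm.script import tir as T`, it could be built for CUDA and checked against NumPy roughly as below. The helper name run_gemm_mma and the tolerance are illustrative; fp64 tensor-core MMA needs an sm_80-class GPU.

import numpy as np
import tvm

def run_gemm_mma(prim_func):
    # Build the PrimFunc for an NVIDIA GPU and run one warp over the 8x4 fragments.
    mod = tvm.build(prim_func, target="cuda")
    dev = tvm.cuda(0)
    a = tvm.nd.array(np.random.rand(8, 4).astype("float64"), dev)
    b = tvm.nd.array(np.random.rand(8, 4).astype("float64"), dev)
    c = tvm.nd.array(np.zeros((8, 8), dtype="float64"), dev)
    mod(a, b, c)
    # m8n8k4 with row-major A and col-major B computes C = A @ B^T.
    np.testing.assert_allclose(c.numpy(), a.numpy() @ b.numpy().T, rtol=1e-10)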
Example No. 3
 def main(placeholder: T.Buffer[(1024,), "int8"], ethosu_write: T.Buffer[(32768,), "int8"]) -> None:
     # function attr dict
     T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
     buffer = T.buffer_decl([1456], "uint8")
     buffer_1 = T.buffer_decl([352], "uint8")
     buffer_2 = T.buffer_decl([11040], "uint8")
     buffer_3 = T.buffer_decl([272], "uint8")
     T.preflattened_buffer(placeholder, [1, 8, 1, 8, 16], 'int8', data=placeholder.data)
     T.preflattened_buffer(ethosu_write, [1, 32, 2, 32, 16], 'int8', data=ethosu_write.data)
     # body
     ethosu_write_1 = T.allocate([12288], "int8", "global", annotations={"disable_lower_builtin":True})
     T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 8, 3, 8, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHCWB16", 128, 16, 1, "int8", 16, 16, 35, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHCWB16", 768, 16, 256, 3, 3, 1, 1, 1, 1, buffer[0], 1456, 12, buffer_1[0], 352, 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NEAREST", 0, 0, 0, dtype="handle"))
     T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 35, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.5), 10, "NHCWB16", 768, 16, 256, "int8", 32, 32, 26, 32, 0, 32, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHCWB16", 1024, 16, 512, 3, 3, 1, 1, 1, 1, buffer_2[0], 11040, 12, buffer_3[0], 272, 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NEAREST", 0, 0, 0, dtype="handle"))
Example No. 4
 def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,), "int8"]) -> None:
     # function attr dict
     T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
     buffer = T.buffer_decl([592], "uint8")
     buffer_1 = T.buffer_decl([160], "uint8")
     buffer_2 = T.buffer_decl([160], "uint8")
     buffer_3 = T.buffer_decl([80], "uint8")
     T.preflattened_buffer(placeholder, [1, 16, 16, 32], "int8", data=placeholder.data)
     T.preflattened_buffer(ethosu_write, [1, 16, 16, 8], "int8", data=ethosu_write.data)
     # body
     ethosu_write_1 = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin":True})
     T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 16, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, buffer[0], 592, 12, buffer_1[0], 160, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
     T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 8, 16, 0, 16, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, buffer_2[0], 160, 12, buffer_3[0], 80, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
Example No. 5
def transformed_strided_buffer_func(A: T.Buffer[(16, 16), "float32"],
                                    C: T.Buffer[(16, 16), "float32"]) -> None:
    # body
    for i0 in T.serial(4):
        B = T.allocate([4, 17], "float32", "global")
        B_1 = T.buffer_decl([4, 16],
                            dtype="float32",
                            data=B.data,
                            strides=[17, 1])
        for i1, j in T.grid(4, 16):
            B_1[i1, j] = A[i0 * 4 + i1, j] + T.float32(1)
        for i1, j in T.grid(4, 16):
            C[i0 * 4 + i1, j] = B_1[i1, j] * T.float32(2)
Example No. 6
def flattened_symbolic_func(a: T.handle, c: T.handle, n: T.int32,
                            m: T.int32) -> None:
    A = T.match_buffer(a, n * m, "float32")
    C = T.match_buffer(c, n * m, "float32")
    T.preflattened_buffer(A, (n, m), "float32", data=A.data)
    T.preflattened_buffer(C, (n, m), "float32", data=C.data)

    for i in range(0, n):
        B = T.allocate([m], "float32", "global")
        for j in range(0, m):
            B[j] = A[i * m + j] + 1.0
        for j in range(0, m):
            C[i * m + j] = B[j] * 2.0
Example No. 7
def flattened_strided_buffer_func(a: T.handle, c: T.handle) -> None:
    A = T.match_buffer(a, (256, ), "float32")
    C = T.match_buffer(c, (256, ), "float32")
    T.preflattened_buffer(A, [16, 16], dtype="float32", data=A.data)
    T.preflattened_buffer(C, [16, 16], dtype="float32", data=C.data)
    for i0 in T.serial(0, 4):
        B_new = T.allocate([68], "float32", "global")
        for i1 in T.serial(0, 4):
            for j in T.serial(0, 16):
                B_new[i1 * 17 + j] = A[i0 * 64 + i1 * 16 + j] + 1.0
        for i1 in T.serial(0, 4):
            for j in T.serial(0, 16):
                C[i0 * 64 + i1 * 16 + j] = B_new[i1 * 17 + j] * 2.0
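An illustrative check (my addition, NumPy only): the flat index i1 * 17 + j used above is the offset produced by a [4, 16] view with strides [17, 1] over the 68-element backing allocation, i.e. the same layout the strided buffer B_1 declares in the unflattened variants.

import numpy as np

storage = np.zeros(68, dtype="float32")            # the flat B_new allocation
itemsize = storage.itemsize
b_view = np.lib.stride_tricks.as_strided(          # [4, 16] view with strides [17, 1] in elements
    storage, shape=(4, 16), strides=(17 * itemsize, itemsize))
b_view[2, 5] = 1.0
assert storage[2 * 17 + 5] == 1.0                  # same element as B_new[i1 * 17 + j]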
Example No. 8
def strided_buffer_func(a: T.handle, c: T.handle) -> None:
    A = T.match_buffer(a, (16, 16), "float32")
    C = T.match_buffer(c, (16, 16), "float32")
    for i0 in T.serial(4):
        B = T.allocate([4, 17], "float32", "global")
        B_1 = T.buffer_decl([4, 16],
                            dtype="float32",
                            data=B.data,
                            strides=[17, 1])
        for i1, j in T.grid(4, 16):
            B_1[i1, j] = A[i0 * 4 + i1, j] + 1.0
        for i1, j in T.grid(4, 16):
            C[i0 * 4 + i1, j] = B_1[i1, j] * 2.0
Example No. 9
def flattened_strided_buffer_func(a: T.handle, c: T.handle) -> None:
    A = T.match_buffer(a, (16, 16), "float32")
    C = T.match_buffer(c, (16, 16), "float32")
    for i0 in T.serial(0, 4):
        B_new = T.allocate([68], "float32", "global")
        for i1 in T.serial(0, 4):
            for j in T.serial(0, 16):
                B_new[i1 * 17 + j] = T.load("float32", A.data,
                                            i0 * 64 + i1 * 16 + j) + 1.0
        for i1 in T.serial(0, 4):
            for j in T.serial(0, 16):
                C.data[i0 * 64 + i1 * 16 +
                       j] = T.load("float32", B_new, i1 * 17 + j) * 2.0
Example No. 10
 def main(placeholder: T.handle, placeholder_1: T.handle, placeholder_2: T.handle, placeholder_3: T.handle, placeholder_4: T.handle, ethosu_write: T.handle) -> None:
     # function attr dict
     T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
     buffer = T.match_buffer(placeholder_3, [160], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
     ethosu_write_1 = T.match_buffer(ethosu_write, [1, 16, 16, 8], dtype="int8", elem_offset=0, align=128, offset_factor=1)
     placeholder_5 = T.match_buffer(placeholder, [1, 16, 16, 32], dtype="int8", elem_offset=0, align=128, offset_factor=1)
     buffer_1 = T.match_buffer(placeholder_1, [592], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
     buffer_2 = T.match_buffer(placeholder_2, [160], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
     buffer_3 = T.match_buffer(placeholder_4, [80], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
     # body
     ethosu_write_2 = T.allocate([4096], "int8", "global")
     T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, T.load("int8", placeholder_5.data, 0), 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 16, 16, 0, 16, T.load("int8", ethosu_write_2, 0), 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, T.load("uint8", buffer_1.data, 0), 592, 12, T.load("uint8", buffer_2.data, 0), 160, 0, 0, 0, 0, "NONE", 0, 0, "NONE", dtype="handle"))
     T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, T.load("int8", ethosu_write_2, 0), 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 8, 16, 0, 16, T.load("int8", ethosu_write_1.data, 0), 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, T.load("uint8", buffer.data, 0), 160, 12, T.load("uint8", buffer_3.data, 0), 80, 0, 0, 0, 0, "NONE", 0, 0, "NONE", dtype="handle"))
Example No. 11
 def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast(
         placeholder_4: T.handle, placeholder_5: T.handle,
         placeholder_6: T.handle, T_cast_2: T.handle) -> None:
     # function attr dict
     T.func_attr({
         "global_symbol":
         "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast",
         "tir.noalias": True
     })
     placeholder_7 = T.match_buffer(placeholder_4, [360000], dtype="int16")
     placeholder_8 = T.match_buffer(placeholder_5, [4096], dtype="int16")
     placeholder_9 = T.match_buffer(placeholder_6, [64], dtype="int32")
     T_cast_3 = T.match_buffer(T_cast_2, [360000], dtype="int16")
     # body
     PaddedInput = T.allocate([360000], "int16", "global")
     for i0_i1_fused, i2, i3 in T.grid(75, 75, 64):
         PaddedInput[i0_i1_fused * 4800 + i2 * 64 +
                     i3] = placeholder_7[i0_i1_fused * 4800 + i2 * 64 + i3]
     for ax0_ax1_fused_ax2_fused in T.serial(0, 5625):
         Conv2dOutput = T.allocate([64], "int32", "global")
         for ff in T.serial(0, 64):
             Conv2dOutput[ff] = 0
             for rc in T.serial(0, 64):
                 Conv2dOutput[ff] = Conv2dOutput[ff] + T.cast(
                     PaddedInput[ax0_ax1_fused_ax2_fused * 64 + rc],
                     "int32") * T.cast(placeholder_8[rc * 64 + ff], "int32")
         for ax3_inner_1 in T.serial(0, 64):
             T_cast_3[ax0_ax1_fused_ax2_fused * 64 + ax3_inner_1] = T.cast(
                 T.cast(
                     T.max(
                         T.min(
                             T.q_multiply_shift(Conv2dOutput[ax3_inner_1] +
                                                placeholder_9[ax3_inner_1],
                                                1843106743,
                                                31,
                                                -6,
                                                dtype="int32"), 255), 0),
                     "uint8"), "int16")
Example No. 12
def main(A_param: T.handle, C_param: T.handle):
    A = T.match_buffer(A_param, (400,), "float32", strides=[1])
    C = T.match_buffer(C_param, (4,), "float32", strides=[1])
    T.func_attr({"from_legacy_te_schedule": True})
    threadIdx_x = T.env_thread("threadIdx.x")
    T.launch_thread(threadIdx_x, 1)
    for i in T.serial(0, 100):
        B = T.allocate([4], "float32", scope="shared", strides=[1])
        with T.attr(B.data, "double_buffer_scope", 1):
            for j in T.serial(0, 4):
                B[j] = A[4 * i + j]

        for j in T.serial(0, 4):
            C[j] = B[j] + 1.0
Example No. 13
 def main(a: T.handle, b: T.handle) -> None:
     # function attr dict
     T.func_attr({"global_symbol": "main", "T.noalias": True})
     # var definition
     threadIdx_x = T.env_thread("threadIdx.x")
     threadIdx_y = T.env_thread("threadIdx.y")
     blockIdx_x = T.env_thread("blockIdx.x")
     blockIdx_y = T.env_thread("blockIdx.y")
     blockIdx_z = T.env_thread("blockIdx.z")
     A = T.match_buffer(a, [14*14*256*256], dtype="float32")
     B = T.match_buffer(b, [14*14*512*256], dtype="float32")
     # body
     T.launch_thread(blockIdx_z, 196)
     B_local = T.allocate([64], "float32", "local")
     Apad_shared = T.allocate([512], "float32", "shared")
     Apad_shared_local = T.allocate([8], "float32", "local")
     T.launch_thread(blockIdx_y, 8)
     T.launch_thread(blockIdx_x, 4)
     T.launch_thread(threadIdx_y, 8)
     T.launch_thread(threadIdx_x, 8)
     for ff_c_init, nn_c_init in T.grid(8, 8):
         B_local[ff_c_init * 8 + nn_c_init] = T.float32(0)
     for rc_outer, ry, rx in T.grid(32, 3, 3):
         for ax3_inner_outer in T.serial(0, 2):
             Apad_shared[T.ramp(threadIdx_y * 64 + threadIdx_x * 8 + ax3_inner_outer * 4, 1, 4)] = T.if_then_else(
                 1 <= blockIdx_z // 14 + ry and blockIdx_z // 14 + ry < 15 and 1 <= rx + blockIdx_z % 14 and rx + blockIdx_z % 14 < 15,
                 A[T.ramp(ry * 917504 + blockIdx_z * 65536 + rx * 65536 + rc_outer * 2048 + threadIdx_y * 256 + blockIdx_x * 64 + threadIdx_x * 8 + ax3_inner_outer * 4 - 983040, 1, 4)],
                 T.broadcast(T.float32(0), 4),
                 dtype="float32x4",
             )
         for rc_inner in T.serial(0, 8):
             for ax3 in T.serial(0, 8):
                 Apad_shared_local[ax3] = Apad_shared[rc_inner * 64 + threadIdx_x * 8 + ax3]
             for ff_c, nn_c in T.grid(8, 8):
                 B_local[ff_c * 8 + nn_c] = B_local[ff_c * 8 + nn_c] + Apad_shared_local[nn_c]
     for ff_inner_inner_inner, nn_inner_inner_inner in T.grid(8, 8):
         B[blockIdx_z * 131072 + blockIdx_y * 16384 + threadIdx_y * 2048 + ff_inner_inner_inner * 256 + blockIdx_x * 64 + threadIdx_x * 8 + nn_inner_inner_inner] = B_local[ff_inner_inner_inner * 8 + nn_inner_inner_inner]
Example No. 14
 def tvmgen_default_fused_nn_max_pool2d_cast(placeholder_28: T.handle,
                                             T_cast_6: T.handle) -> None:
     # function attr dict
     T.func_attr({
         "global_symbol": "tvmgen_default_fused_nn_max_pool2d_cast",
         "tir.noalias": True
     })
     placeholder_29 = T.match_buffer(placeholder_28, [802816],
                                     dtype="uint8",
                                     elem_offset=0,
                                     align=128,
                                     offset_factor=1)
     T_cast_7 = T.match_buffer(T_cast_6, [200704],
                               dtype="int16",
                               elem_offset=0,
                               align=128,
                               offset_factor=1)
     # body
     tensor_2 = T.allocate([200704], "uint8", "global")
     for ax0_ax1_fused_4 in T.serial(0, 56):
         for ax2_4 in T.serial(0, 56):
             for ax3_init in T.serial(0, 64):
                 tensor_2[(((ax0_ax1_fused_4 * 3584) + (ax2_4 * 64)) +
                           ax3_init)] = T.uint8(0)
             for rv0_rv1_fused_1, ax3_2 in T.grid(9, 64):
                 tensor_2[(
                     ((ax0_ax1_fused_4 * 3584) + (ax2_4 * 64)) + ax3_2
                 )] = T.max(
                     tensor_2[(((ax0_ax1_fused_4 * 3584) + (ax2_4 * 64)) +
                               ax3_2)],
                     T.if_then_else(
                         ((((ax0_ax1_fused_4 * 2) + T.floordiv(
                             rv0_rv1_fused_1, 3)) < 112) and
                          (((ax2_4 * 2) + T.floormod(rv0_rv1_fused_1, 3))
                           < 112)),
                         placeholder_29[(
                             ((((ax0_ax1_fused_4 * 14336) +
                                (T.floordiv(rv0_rv1_fused_1, 3) * 7168)) +
                               (ax2_4 * 128)) +
                              (T.floormod(rv0_rv1_fused_1, 3) * 64)) +
                             ax3_2)],
                         T.uint8(0),
                         dtype="uint8"))
     for ax0_ax1_fused_5 in T.serial(0, 56):
         for ax2_5, ax3_3 in T.grid(56, 64):
             T_cast_7[(((ax0_ax1_fused_5 * 3584) + (ax2_5 * 64)) +
                       ax3_3)] = T.cast(
                           tensor_2[(((ax0_ax1_fused_5 * 3584) +
                                      (ax2_5 * 64)) + ax3_3)], "int16")
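A hedged NumPy reference (my addition, not taken from the test): the loop nest above is a 3x3 max-pool with stride 2 over a [112, 112, 64] uint8 input, producing a [56, 56, 64] result that is then cast to int16; the if_then_else substitutes 0 for out-of-range taps, which never changes the maximum of uint8 data. A sketch of the same computation:

import numpy as np

x = np.random.randint(0, 256, size=(112, 112, 64), dtype=np.uint8)
out = np.zeros((56, 56, 64), dtype=np.int16)
for h in range(56):
    for w in range(56):
        # Slicing drops the out-of-range rows/columns, matching the boundary
        # handling of the if_then_else branch above.
        window = x[2 * h:2 * h + 3, 2 * w:2 * w + 3, :]
        out[h, w, :] = window.max(axis=(0, 1))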
Example No. 15
 def main(placeholder_5: T.Buffer[(1024,), "int8"], ethosu_write_1: T.Buffer[(2048,), "int8"]) -> None:
     # function attr dict
     T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
     buffer = T.buffer_decl([1456], "uint8")
     buffer_1 = T.buffer_decl([352], "uint8")
     buffer_2 = T.buffer_decl([272], "uint8")
     buffer_3 = T.buffer_decl([11040], "uint8")
     T.preflattened_buffer(placeholder_5, [1, 8, 1, 8, 16], 'int8', data=placeholder_5.data)
     T.preflattened_buffer(ethosu_write_1, [1, 8, 2, 8, 16], 'int8', data=ethosu_write_1.data)
     # body
     ethosu_write_2 = T.allocate([2304], "int8", "global", annotations={"disable_lower_builtin": True})
     T.evaluate(T.call_extern("ethosu_conv2d", "int8", 6, 8, 3, 6, 0, 8, placeholder_5[0], 0, 0, 0, T.float32(0.5), 10, "NHCWB16", 128, 16, 1, "int8", 5, 8, 35, 5, 0, 8, ethosu_write_2[384], 0, 0, 0, T.float32(0.25), 14, "NHCWB16", 384, 16, 128, 3, 3, 1, 1, 1, 1, buffer[0], 1456, T.int8(-1), T.int8(-1), 12, buffer_1[0], 352, T.int8(-1), T.int8(-1), 1, 1, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
     T.evaluate(T.call_extern("ethosu_conv2d", "int8", 5, 8, 35, 5, 0, 8, ethosu_write_2[384], 0, 0, 0, T.float32(0.5), 10, "NHCWB16", 384, 16, 128, "int8", 4, 8, 26, 4, 0, 8, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHCWB16", 256, 16, 128, 3, 3, 1, 1, 1, 1, buffer_3[0], 11040, T.int8(-1), T.int8(-1), 12, buffer_2[0], 272, T.int8(-1), T.int8(-1), 1, 1, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
     T.evaluate(T.call_extern("ethosu_conv2d", "int8", 6, 8, 3, 6, 0, 8, placeholder_5[256], 0, 0, 0, T.float32(0.5), 10, "NHCWB16", 128, 16, 1, "int8", 5, 8, 35, 5, 0, 8, ethosu_write_2[0], 0, 0, 0, T.float32(0.25), 14, "NHCWB16", 384, 16, 128, 3, 3, 1, 1, 1, 1, buffer[0], 1456, T.int8(-1), T.int8(-1), 12, buffer_1[0], 352, T.int8(-1), T.int8(-1), 0, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
     T.evaluate(T.call_extern("ethosu_conv2d", "int8", 5, 8, 35, 5, 0, 8, ethosu_write_2[0], 0, 0, 0, T.float32(0.5), 10, "NHCWB16", 384, 16, 128, "int8", 4, 8, 26, 4, 0, 8, ethosu_write_1[1024], 0, 0, 0, T.float32(0.25), 14, "NHCWB16", 256, 16, 128, 3, 3, 1, 1, 1, 1, buffer_3[0], 11040, T.int8(-1), T.int8(-1), 12, buffer_2[0], 272, T.int8(-1), T.int8(-1), 0, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
Example No. 16
 def main(placeholder: ty.handle, placeholder_1: ty.handle, placeholder_2: ty.handle, placeholder_3: ty.handle, placeholder_4: ty.handle, ethosu_write: ty.handle) -> None:
     # function attr dict
     tir.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
     buffer = tir.match_buffer(placeholder_1, [1456], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
     buffer_1 = tir.match_buffer(placeholder_2, [352], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
     placeholder_5 = tir.match_buffer(placeholder, [1, 8, 1, 8, 16], dtype="int8", elem_offset=0, align=128, offset_factor=1)
     ethosu_write_1 = tir.match_buffer(ethosu_write, [1, 8, 2, 8, 16], dtype="int8", elem_offset=0, align=128, offset_factor=1)
     buffer_2 = tir.match_buffer(placeholder_4, [272], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
     buffer_3 = tir.match_buffer(placeholder_3, [11040], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
     # body
     ethosu_write_2 = tir.allocate([2304], "int8", "global")
     tir.evaluate(tir.call_extern("ethosu_conv2d", "int8", 6, 8, 3, 6, 0, 8, tir.load("int8", placeholder_5.data, 0), 0, 0, 0, tir.float32(0.5), 10, "NHCWB16", 128, 16, 1, "int8", 5, 8, 35, 5, 0, 8, tir.load("int8", ethosu_write_2, 384), 0, 0, 0, tir.float32(0.25), 14, "NHCWB16", 384, 16, 128, 3, 3, 1, 1, 1, 1, tir.load("uint8", buffer.data, 0), 1456, 12, tir.load("uint8", buffer_1.data, 0), 352, 1, 1, 0, 1, "NONE", 0, 0, "NONE", dtype="handle"))
     tir.evaluate(tir.call_extern("ethosu_conv2d", "int8", 5, 8, 35, 5, 0, 8, tir.load("int8", ethosu_write_2, 384), 0, 0, 0, tir.float32(0.5), 10, "NHCWB16", 384, 16, 128, "int8", 4, 8, 26, 4, 0, 8, tir.load("int8", ethosu_write_1.data, 0), 0, 0, 0, tir.float32(0.25), 14, "NHCWB16", 256, 16, 128, 3, 3, 1, 1, 1, 1, tir.load("uint8", buffer_3.data, 0), 11040, 12, tir.load("uint8", buffer_2.data, 0), 272, 1, 1, 0, 1, "NONE", 0, 0, "NONE", dtype="handle"))
     tir.evaluate(tir.call_extern("ethosu_conv2d", "int8", 6, 8, 3, 6, 0, 8, tir.load("int8", placeholder_5.data, 256), 0, 0, 0, tir.float32(0.5), 10, "NHCWB16", 128, 16, 1, "int8", 5, 8, 35, 5, 0, 8, tir.load("int8", ethosu_write_2, 0), 0, 0, 0, tir.float32(0.25), 14, "NHCWB16", 384, 16, 128, 3, 3, 1, 1, 1, 1, tir.load("uint8", buffer.data, 0), 1456, 12, tir.load("uint8", buffer_1.data, 0), 352, 0, 1, 1, 1, "NONE", 0, 0, "NONE", dtype="handle"))
     tir.evaluate(tir.call_extern("ethosu_conv2d", "int8", 5, 8, 35, 5, 0, 8, tir.load("int8", ethosu_write_2, 0), 0, 0, 0, tir.float32(0.5), 10, "NHCWB16", 384, 16, 128, "int8", 4, 8, 26, 4, 0, 8, tir.load("int8", ethosu_write_1.data, 1024), 0, 0, 0, tir.float32(0.25), 14, "NHCWB16", 256, 16, 128, 3, 3, 1, 1, 1, 1, tir.load("uint8", buffer_3.data, 0), 11040, 12, tir.load("uint8", buffer_2.data, 0), 272, 0, 1, 1, 1, "NONE", 0, 0, "NONE", dtype="handle"))
Example No. 17
 def main(placeholder: ty.handle, placeholder_1: ty.handle, placeholder_2: ty.handle, placeholder_3: ty.handle, placeholder_4: ty.handle, ethosu_write: ty.handle) -> None:
     # function attr dict
     tir.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
     buffer = tir.match_buffer(placeholder_4, [80], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
     buffer_1 = tir.match_buffer(placeholder_2, [320], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
     buffer_2 = tir.match_buffer(placeholder_1, [1312], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
     buffer_3 = tir.match_buffer(placeholder_3, [2608], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
     placeholder_5 = tir.match_buffer(placeholder, [1, 8, 8, 3], dtype="int8", elem_offset=0, align=128, offset_factor=1)
     ethosu_write_1 = tir.match_buffer(ethosu_write, [1, 8, 8, 8], dtype="int8", elem_offset=0, align=128, offset_factor=1)
     # body
     ethosu_write_2 = tir.allocate([1536], "int8", "global")
     tir.evaluate(tir.call_extern("ethosu_conv2d", "int8", 6, 8, 3, 6, 0, 8, tir.load("int8", placeholder_5.data, 0), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 24, 3, 1, "int8", 5, 8, 32, 5, 0, 8, tir.load("int8", ethosu_write_2, 256), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 256, 32, 1, 3, 3, 1, 1, 1, 1, tir.load("uint8", buffer_2.data, 0), 1312, 12, tir.load("uint8", buffer_1.data, 0), 320, 1, 1, 0, 1, "NONE", 0, 0, "NONE", dtype="handle"))
     tir.evaluate(tir.call_extern("ethosu_conv2d", "int8", 5, 8, 32, 5, 0, 8, tir.load("int8", ethosu_write_2, 256), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 256, 32, 1, "int8", 4, 8, 8, 4, 0, 8, tir.load("int8", ethosu_write_1.data, 0), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 64, 8, 1, 3, 3, 1, 1, 1, 1, tir.load("uint8", buffer_3.data, 0), 2608, 12, tir.load("uint8", buffer.data, 0), 80, 1, 1, 0, 1, "NONE", 0, 0, "NONE", dtype="handle"))
     tir.evaluate(tir.call_extern("ethosu_conv2d", "int8", 6, 8, 3, 6, 0, 8, tir.load("int8", placeholder_5.data, 48), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 24, 3, 1, "int8", 5, 8, 32, 5, 0, 8, tir.load("int8", ethosu_write_2, 0), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 256, 32, 1, 3, 3, 1, 1, 1, 1, tir.load("uint8", buffer_2.data, 0), 1312, 12, tir.load("uint8", buffer_1.data, 0), 320, 0, 1, 1, 1, "NONE", 0, 0, "NONE", dtype="handle"))
     tir.evaluate(tir.call_extern("ethosu_conv2d", "int8", 5, 8, 32, 5, 0, 8, tir.load("int8", ethosu_write_2, 0), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 256, 32, 1, "int8", 4, 8, 8, 4, 0, 8, tir.load("int8", ethosu_write_1.data, 256), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 64, 8, 1, 3, 3, 1, 1, 1, 1, tir.load("uint8", buffer_3.data, 0), 2608, 12, tir.load("uint8", buffer.data, 0), 80, 0, 1, 1, 1, "NONE", 0, 0, "NONE", dtype="handle"))
Example No. 18
 def main(placeholder_5: T.Buffer[(192,), "int8"], ethosu_write_1: T.Buffer[(512,), "int8"]) -> None:
     # function attr dict
     T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
     buffer = T.buffer_decl([80], "uint8")
     buffer_1 = T.buffer_decl([320], "uint8")
     buffer_2 = T.buffer_decl([1312], "uint8")
     buffer_3 = T.buffer_decl([2608], "uint8")
     T.preflattened_buffer(placeholder_5, [1, 8, 8, 3], 'int8', data=placeholder_5.data)
     T.preflattened_buffer(ethosu_write_1, [1, 8, 8, 8], 'int8', data=ethosu_write_1.data)
     # body
     ethosu_write_2 = T.allocate([1536], "int8", "global", annotations={"disable_lower_builtin": True})
     T.evaluate(T.call_extern("ethosu_conv2d", "int8", 6, 8, 3, 6, 0, 8, placeholder_5[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 3, 1, "int8", 5, 8, 32, 5, 0, 8, ethosu_write_2[256], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 32, 1, 3, 3, 1, 1, 1, 1, buffer_2[0], 1312, 12, buffer_1[0], 320, 1, 1, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
     T.evaluate(T.call_extern("ethosu_conv2d", "int8", 5, 8, 32, 5, 0, 8, ethosu_write_2[256], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 32, 1, "int8", 4, 8, 8, 4, 0, 8, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 64, 8, 1, 3, 3, 1, 1, 1, 1, buffer_3[0], 2608, 12, buffer[0], 80, 1, 1, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
     T.evaluate(T.call_extern("ethosu_conv2d", "int8", 6, 8, 3, 6, 0, 8, placeholder_5[48], 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 3, 1, "int8", 5, 8, 32, 5, 0, 8, ethosu_write_2[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 32, 1, 3, 3, 1, 1, 1, 1, buffer_2[0], 1312, 12, buffer_1[0], 320, 0, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
     T.evaluate(T.call_extern("ethosu_conv2d", "int8", 5, 8, 32, 5, 0, 8, ethosu_write_2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 32, 1, "int8", 4, 8, 8, 4, 0, 8, ethosu_write_1[256], 0, 0, 0, T.float32(0.25), 14, "NHWC", 64, 8, 1, 3, 3, 1, 1, 1, 1, buffer_3[0], 2608, 12, buffer[0], 80, 0, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
Example No. 19
 def main(placeholder: T.handle, placeholder_1: T.handle, placeholder_2: T.handle, placeholder_3: T.handle, placeholder_4: T.handle, ethosu_write: T.handle) -> None:
     # function attr dict
     T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
     buffer = T.match_buffer(placeholder_3, [304], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
     placeholder_5 = T.match_buffer(placeholder, [1, 8, 8, 3], dtype="int8", elem_offset=0, align=128, offset_factor=1)
     buffer_1 = T.match_buffer(placeholder_4, [80], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
     buffer_2 = T.match_buffer(placeholder_2, [320], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
     ethosu_write_1 = T.match_buffer(ethosu_write, [1, 8, 8, 8], dtype="int8", elem_offset=0, align=128, offset_factor=1)
     buffer_3 = T.match_buffer(placeholder_1, [160], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
     # body
     ethosu_write_2 = T.allocate([1024], "int8", "global", annotations={"disable_lower_builtin": True})
     T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 4, 3, 8, 0, 4, T.load("int8", placeholder_5.data, 0), 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 3, 1, "int8", 8, 4, 32, 8, 0, 4, T.load("int8", ethosu_write_2, 0), 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 32, 1, 1, 1, 1, 1, 1, 1, T.load("uint8", buffer_3.data, 0), 160, 12, T.load("uint8", buffer_2.data, 0), 320, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", dtype="handle"))
     T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 4, 32, 8, 0, 4, T.load("int8", ethosu_write_2, 0), 0, 0, 0, T.float32(0.5), 10, "NHWC", 128, 32, 1, "int8", 8, 4, 8, 8, 0, 4, T.load("int8", ethosu_write_1.data, 0), 0, 0, 0, T.float32(0.25), 14, "NHWC", 64, 8, 1, 1, 1, 1, 1, 1, 1, T.load("uint8", buffer.data, 0), 304, 12, T.load("uint8", buffer_1.data, 0), 80, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", dtype="handle"))
     T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 4, 3, 8, 0, 4, T.load("int8", placeholder_5.data, 12), 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 3, 1, "int8", 8, 4, 32, 8, 0, 4, T.load("int8", ethosu_write_2, 0), 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 32, 1, 1, 1, 1, 1, 1, 1, T.load("uint8", buffer_3.data, 0), 160, 12, T.load("uint8", buffer_2.data, 0), 320, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", dtype="handle"))
     T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 4, 32, 8, 0, 4, T.load("int8", ethosu_write_2, 0), 0, 0, 0, T.float32(0.5), 10, "NHWC", 128, 32, 1, "int8", 8, 4, 8, 8, 0, 4, T.load("int8", ethosu_write_1.data, 32), 0, 0, 0, T.float32(0.25), 14, "NHWC", 64, 8, 1, 1, 1, 1, 1, 1, 1, T.load("uint8", buffer.data, 0), 304, 12, T.load("uint8", buffer_1.data, 0), 80, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", dtype="handle"))
Example No. 20
def flattened_gpu_func(a: T.handle, c: T.handle) -> None:
    A = T.match_buffer(a, (16, 16), "float32")
    C = T.match_buffer(c, (16, 16), "float32")

    i0 = T.env_thread("blockIdx.x")
    i1 = T.env_thread("threadIdx.x")
    i2 = T.env_thread("vthread")

    T.launch_thread(i0, 4)
    T.launch_thread(i1, 2)
    T.launch_thread(i2, 2)
    B = T.allocate([16], "float32", "local")
    for j in range(0, 16):
        B[j] = T.load("float32", A.data, i0 * 64 + i1 * 32 + i2 * 16 + j) + 1.0
    for j in range(0, 16):
        C.data[i0 * 64 + i1 * 32 + i2 * 16 + j] = T.load("float32", B, j) * 2.0
Example No. 21
def gpu_func(a: T.handle, c: T.handle) -> None:
    A = T.match_buffer(a, (16, 16), "float32")
    C = T.match_buffer(c, (16, 16), "float32")

    i0 = T.env_thread("blockIdx.x")
    i1 = T.env_thread("threadIdx.x")
    i2 = T.env_thread("vthread")

    T.launch_thread(i0, 4)
    T.launch_thread(i1, 2)
    T.launch_thread(i2, 2)
    B = T.allocate([1, 16], "float32", "local")
    for j in range(0, 16):
        B[0, j] = A[i0 * 4 + i1 * 2 + i2, j] + 1.0
    for j in range(0, 16):
        C[i0 * 4 + i1 * 2 + i2, j] = B[0, j] * 2.0
Example No. 22
def flattened_gpu_func(a: T.handle, c: T.handle) -> None:
    A = T.match_buffer(a, 256, "float32")
    C = T.match_buffer(c, 256, "float32")
    T.preflattened_buffer(A, (16, 16), dtype="float32", data=A.data)
    T.preflattened_buffer(C, (16, 16), dtype="float32", data=C.data)

    i0 = T.env_thread("blockIdx.x")
    i1 = T.env_thread("threadIdx.x")
    i2 = T.env_thread("vthread")

    T.launch_thread(i0, 4)
    T.launch_thread(i1, 2)
    T.launch_thread(i2, 2)
    B = T.allocate([16], "float32", "local")
    for j in range(0, 16):
        B[j] = A[i0 * 64 + i1 * 32 + i2 * 16 + j] + 1.0
    for j in range(0, 16):
        C[i0 * 64 + i1 * 32 + i2 * 16 + j] = B[j] * 2.0
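A small arithmetic check (my addition, illustration only): with row = i0 * 4 + i1 * 2 + i2 bound to blockIdx.x, threadIdx.x and vthread, the row-major offset row * 16 + j of the pre-flattened (16, 16) buffers equals the flat index i0 * 64 + i1 * 32 + i2 * 16 + j used above:

for i0 in range(4):
    for i1 in range(2):
        for i2 in range(2):
            for j in range(16):
                row = i0 * 4 + i1 * 2 + i2
                assert row * 16 + j == i0 * 64 + i1 * 32 + i2 * 16 + j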
Example No. 23
 def main(placeholder: T.Buffer[(1536,), "int8"], placeholder_1: T.Buffer[(1280,), "int8"], T_concat: T.Buffer[(4096,), "int8"]) -> None:
     # function attr dict
     T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
     buffer = T.buffer_decl([2992], "uint8")
     buffer_1 = T.buffer_decl([160], "uint8")
     buffer_2 = T.buffer_decl([2992], "uint8")
     buffer_3 = T.buffer_decl([160], "uint8")
     buffer_4 = T.buffer_decl([2992], "uint8")
     buffer_5 = T.buffer_decl([160], "uint8")
     buffer_6 = T.buffer_decl([2992], "uint8")
     buffer_7 = T.buffer_decl([160], "uint8")
     T.preflattened_buffer(placeholder, [1, 8, 12, 16], "int8", data=placeholder.data)
     T.preflattened_buffer(placeholder_1, [1, 8, 10, 16], "int8", data=placeholder_1.data)
     T.preflattened_buffer(T_concat, [1, 8, 32, 16], "int8", data=T_concat.data)
     # body
     T_concat_1 = T.allocate([2816], "int8", "global", annotations={"disable_lower_builtin":True})
     T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 10, 16, 8, 0, 10, placeholder_1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 160, 16, 1, "int8", 8, 10, 16, 8, 0, 10, T_concat_1[192], 0, 0, 0, T.float32(0.25), 14, "NHWC", 352, 16, 1, 3, 3, 1, 1, 1, 1, buffer[0], 2992, 12, buffer_1[0], 160, 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
     T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 10, 16, 8, 0, 10, T_concat_1[192], 0, 0, 0, T.float32(0.5), 10, "NHWC", 352, 16, 1, "int8", 8, 10, 16, 8, 0, 10, T_concat[352], 0, 0, 0, T.float32(0.25), 14, "NHWC", 512, 16, 1, 3, 3, 1, 1, 1, 1, buffer_2[0], 2992, 12, buffer_3[0], 160, 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
     T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 12, 16, 8, 0, 12, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 192, 16, 1, "int8", 8, 12, 16, 8, 0, 12, T_concat_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 352, 16, 1, 3, 3, 1, 1, 1, 1, buffer_4[0], 2992, 12, buffer_5[0], 160, 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
     T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 22, 16, 8, 0, 22, T_concat_1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 352, 16, 1, "int8", 8, 22, 16, 8, 0, 22, T_concat[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 512, 16, 1, 3, 3, 1, 1, 1, 1, buffer_6[0], 2992, 12, buffer_7[0], 160, 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
Example No. 24
def primfunc_global_allocates(placeholder_144: T.handle,
                              placeholder_145: T.handle,
                              placeholder_146: T.handle,
                              T_cast_48: T.handle) -> None:
    # function attr dict
    T.func_attr({
        "global_symbol":
        "fused_nn_conv2d_add_cast_fixed_point_multiply_clip_cast_cast_13",
        "tir.noalias": True
    })
    placeholder_147 = T.match_buffer(placeholder_144, [100352],
                                     dtype="int16",
                                     elem_offset=0,
                                     align=128,
                                     offset_factor=1)
    placeholder_148 = T.match_buffer(placeholder_145, [4608],
                                     dtype="int16",
                                     elem_offset=0,
                                     align=128,
                                     offset_factor=1)
    placeholder_149 = T.match_buffer(placeholder_146, [512],
                                     dtype="int32",
                                     elem_offset=0,
                                     align=128,
                                     offset_factor=1)
    T_cast_49 = T.match_buffer(T_cast_48, [100352],
                               dtype="int16",
                               elem_offset=0,
                               align=128,
                               offset_factor=1)
    # body
    PaddedInput_22 = T.allocate([131072], "int16", "global")
    DepthwiseConv2d_9 = T.allocate([100352], "int32", "global")
    for i1_29, i2_39, i3_40 in T.grid(16, 16, 512):
        PaddedInput_22[(((i1_29 * 8192) + (i2_39 * 512)) +
                        i3_40)] = T.if_then_else(
                            ((((1 <= i1_29) and (i1_29 < 15)) and
                              (1 <= i2_39)) and (i2_39 < 15)),
                            placeholder_147[((((i1_29 * 7168) +
                                               (i2_39 * 512)) + i3_40) -
                                             7680)],
                            T.int16(0),
                            dtype="int16")
    for i_9, j_9, c_9 in T.grid(14, 14, 512):
        DepthwiseConv2d_9[(((i_9 * 7168) + (j_9 * 512)) + c_9)] = 0
        for di_9, dj_9 in T.grid(3, 3):
            DepthwiseConv2d_9[(((i_9 * 7168) + (j_9 * 512)) + c_9)] = (
                DepthwiseConv2d_9[(((i_9 * 7168) + (j_9 * 512)) + c_9)] +
                (PaddedInput_22[(((((i_9 * 8192) + (di_9 * 8192)) +
                                   (j_9 * 512)) +
                                  (dj_9 * 512)) + c_9)].astype("int32") *
                 placeholder_148[(((di_9 * 1536) +
                                   (dj_9 * 512)) + c_9)].astype("int32")))
    for ax1_27, ax2_28, ax3_30 in T.grid(14, 14, 512):
        DepthwiseConv2d_9[(((ax1_27 * 7168) + (ax2_28 * 512)) + ax3_30)] = (
            DepthwiseConv2d_9[(((ax1_27 * 7168) + (ax2_28 * 512)) + ax3_30)] +
            placeholder_149[ax3_30])
    for i1_30, i2_40, i3_41 in T.grid(14, 14, 512):
        DepthwiseConv2d_9[(((i1_30 * 7168) + (i2_40 * 512)) +
                           i3_41)] = T.q_multiply_shift(
                               DepthwiseConv2d_9[(((i1_30 * 7168) +
                                                   (i2_40 * 512)) + i3_41)],
                               1269068532,
                               31,
                               -4,
                               dtype="int32")
    for i1_31, i2_41, i3_42 in T.grid(14, 14, 512):
        DepthwiseConv2d_9[(((i1_31 * 7168) + (i2_41 * 512)) + i3_42)] = T.max(
            T.min(
                DepthwiseConv2d_9[(((i1_31 * 7168) + (i2_41 * 512)) + i3_42)],
                255), 0)
    for ax1_28, ax2_29, ax3_31 in T.grid(14, 14, 512):
        PaddedInput_22[(((ax1_28 * 7168) + (ax2_29 * 512)) +
                        ax3_31)] = DepthwiseConv2d_9[(((ax1_28 * 7168) +
                                                       (ax2_29 * 512)) +
                                                      ax3_31)].astype("uint8")
    for ax1_29, ax2_30, ax3_32 in T.grid(14, 14, 512):
        T_cast_49[(((ax1_29 * 7168) + (ax2_30 * 512)) +
                   ax3_32)] = PaddedInput_22[(((ax1_29 * 7168) +
                                               (ax2_30 * 512)) +
                                              ax3_32)].astype("int16")
Example No. 25
 def main(placeholder: T.Buffer[(301056, ), "int8"],
          ethosu_write: T.Buffer[(75264, ), "int8"]) -> None:
     T.func_attr({
         "from_legacy_te_schedule": True,
         "global_symbol": "main",
         "tir.noalias": True
     })
     T.preflattened_buffer(placeholder, [1, 56, 56, 96],
                           dtype='int8',
                           data=placeholder.data)
     T.preflattened_buffer(ethosu_write, [1, 56, 56, 24],
                           dtype='int8',
                           data=ethosu_write.data)
     buffer1 = T.buffer_decl([2608], "uint8")
     buffer2 = T.buffer_decl([240], "uint8")
     buffer3 = T.buffer_decl([736], "uint8")
     buffer4 = T.buffer_decl([240], "uint8")
     p1 = T.allocate([2608],
                     "uint8",
                     "global",
                     annotations={"disable_lower_builtin": True})
     p2 = T.allocate([240],
                     "uint8",
                     "global",
                     annotations={"disable_lower_builtin": True})
     p3 = T.allocate([736],
                     "uint8",
                     "global",
                     annotations={"disable_lower_builtin": True})
     p4 = T.allocate([240],
                     "uint8",
                     "global",
                     annotations={"disable_lower_builtin": True})
     p5 = T.allocate([75264],
                     "int8",
                     "global",
                     annotations={"disable_lower_builtin": True})
     p6 = T.allocate([75264],
                     "int8",
                     "global",
                     annotations={"disable_lower_builtin": True})
     T.evaluate(
         T.call_extern("ethosu_copy",
                       buffer1[0],
                       2608,
                       p1[0],
                       dtype="handle"))
     T.evaluate(
         T.call_extern("ethosu_copy",
                       buffer2[0],
                       240,
                       p2[0],
                       dtype="handle"))
     T.evaluate(
         T.call_extern("ethosu_copy",
                       buffer3[0],
                       736,
                       p3[0],
                       dtype="handle"))
     T.evaluate(
         T.call_extern("ethosu_copy",
                       buffer4[0],
                       240,
                       p4[0],
                       dtype="handle"))
     T.evaluate(
         T.call_extern("ethosu_conv2d",
                       "int8",
                       56,
                       56,
                       96,
                       56,
                       0,
                       56,
                       placeholder[0],
                       0,
                       0,
                       0,
                       T.float32(0.5),
                       10,
                       "NHWC",
                       5376,
                       96,
                       1,
                       "int8",
                       56,
                       56,
                       24,
                       56,
                       0,
                       56,
                       p5[0],
                       0,
                       0,
                       0,
                       T.float32(0.25),
                       14,
                       "NHWC",
                       1344,
                       24,
                       1,
                       1,
                       1,
                       1,
                       1,
                       1,
                       1,
                       p1[0],
                       2608,
                       T.int8(-1),
                       T.int8(-1),
                       12,
                       p2[0],
                       240,
                       T.int8(-1),
                       T.int8(-1),
                       0,
                       0,
                       0,
                       0,
                       "NONE",
                       0,
                       0,
                       "TFL",
                       "NONE",
                       0,
                       0,
                       0,
                       dtype="handle"))
     T.evaluate(
         T.call_extern("ethosu_conv2d",
                       "int8",
                       56,
                       56,
                       24,
                       56,
                       0,
                       56,
                       p5[0],
                       0,
                       0,
                       0,
                       T.float32(0.5),
                       10,
                       "NHWC",
                       1344,
                       24,
                       1,
                       "int8",
                       56,
                       56,
                       24,
                       56,
                       0,
                       56,
                       p6[0],
                       0,
                       0,
                       0,
                       T.float32(0.25),
                       14,
                       "NHWC",
                       1344,
                       24,
                       1,
                       1,
                       1,
                       1,
                       1,
                       1,
                       1,
                       p3[0],
                       736,
                       T.int8(-1),
                       T.int8(-1),
                       12,
                       p4[0],
                       240,
                       T.int8(-1),
                       T.int8(-1),
                       0,
                       0,
                       0,
                       0,
                       "NONE",
                       0,
                       0,
                       "TFL",
                       "NONE",
                       0,
                       0,
                       0,
                       dtype="handle"))
     T.evaluate(
         T.call_extern("ethosu_binary_elementwise",
                       "int8",
                       56,
                       56,
                       24,
                       56,
                       0,
                       56,
                       p5[0],
                       0,
                       0,
                       0,
                       T.float32(1),
                       0,
                       "NHWC",
                       1344,
                       24,
                       1,
                       "int8",
                       56,
                       56,
                       24,
                       56,
                       0,
                       56,
                       p6[0],
                       0,
                       0,
                       0,
                       T.float32(1),
                       0,
                       "NHWC",
                       1344,
                       24,
                       1,
                       "int8",
                       56,
                       56,
                       24,
                       56,
                       0,
                       56,
                       ethosu_write[0],
                       0,
                       0,
                       0,
                       T.float32(1),
                       0,
                       "NHWC",
                       1344,
                       24,
                       1,
                       "ADD",
                       0,
                       "NONE",
                       0,
                       0,
                       "TFL",
                       0,
                       0,
                       0,
                       dtype="handle"))
Example No. 26
def high_dim_store() -> None:
    with T.block("root"):
        B = T.allocate([256], "float32", "global")
        for i, j in T.grid(16, 16):
            B[i, j] = 1.0  # error: Store is only allowed with one index
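For contrast, a corrected sketch (my assumption, not part of the error-report fixtures): the same initialization written with the single linearized index that the error message asks for.

def high_dim_store_fixed() -> None:
    with T.block("root"):
        B = T.allocate([256], "float32", "global")
        for i, j in T.grid(16, 16):
            B[i * 16 + j] = 1.0  # one flat index per store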
Example No. 27
def allocate_with_buffers() -> None:
    with T.allocate([1], "float32", "") as [A, B]:  # error
        T.evaluate(1.0)
Example No. 28
 def main(placeholder: T.handle, ethosu_write: T.handle,
          placeholder_1: T.handle, placeholder_2: T.handle,
          placeholder_3: T.handle, placeholder_4: T.handle,
          placeholder_5: T.handle, placeholder_6: T.handle,
          placeholder_7: T.handle, placeholder_8: T.handle) -> None:
     # function attr dict
     T.func_attr({
         "from_legacy_te_schedule": True,
         "global_symbol": "main",
         "tir.noalias": True
     })
     buffer = T.match_buffer(placeholder_7, [112],
                             dtype="uint8",
                             elem_offset=0,
                             align=128,
                             offset_factor=1)
     buffer_1 = T.match_buffer(placeholder_4, [32],
                               dtype="uint8",
                               elem_offset=0,
                               align=128,
                               offset_factor=1)
     buffer_2 = T.match_buffer(placeholder_2, [32],
                               dtype="uint8",
                               elem_offset=0,
                               align=128,
                               offset_factor=1)
     buffer_3 = T.match_buffer(placeholder_8, [32],
                               dtype="uint8",
                               elem_offset=0,
                               align=128,
                               offset_factor=1)
     buffer_4 = T.match_buffer(placeholder_5, [112],
                               dtype="uint8",
                               elem_offset=0,
                               align=128,
                               offset_factor=1)
     placeholder_9 = T.match_buffer(placeholder, [1, 16, 16, 32],
                                    dtype="int8",
                                    elem_offset=0,
                                    align=128,
                                    offset_factor=1)
     buffer_5 = T.match_buffer(placeholder_3, [112],
                               dtype="uint8",
                               elem_offset=0,
                               align=128,
                               offset_factor=1)
     buffer_6 = T.match_buffer(placeholder_1, [128],
                               dtype="uint8",
                               elem_offset=0,
                               align=128,
                               offset_factor=1)
     ethosu_write_1 = T.match_buffer(ethosu_write, [1, 16, 16, 8],
                                     dtype="int8",
                                     elem_offset=0,
                                     align=128,
                                     offset_factor=1)
     buffer_7 = T.match_buffer(placeholder_6, [32],
                               dtype="uint8",
                               elem_offset=0,
                               align=128,
                               offset_factor=1)
     # body
     placeholder_global = T.allocate([128], "uint8", "global")
     placeholder_d_global = T.allocate([32], "uint8", "global")
     T.evaluate(
         T.call_extern("ethosu_copy",
                       T.load("uint8", buffer_6.data, 0),
                       128,
                       T.load("uint8", placeholder_global, 0),
                       dtype="handle"))
     T.evaluate(
         T.call_extern("ethosu_copy",
                       T.load("uint8", buffer_2.data, 0),
                       32,
                       T.load("uint8", placeholder_d_global, 0),
                       dtype="handle"))
     T.evaluate(
         T.call_extern("ethosu_conv2d",
                       "int8",
                       16,
                       16,
                       32,
                       16,
                       0,
                       16,
                       T.load("int8", placeholder_9.data, 0),
                       0,
                       0,
                       0,
                       T.float32(0.5),
                       10,
                       "NHWC",
                       512,
                       32,
                       1,
                       "int8",
                       16,
                       16,
                       2,
                       16,
                       0,
                       16,
                       T.load("int8", ethosu_write_1.data, 0),
                       0,
                       0,
                       0,
                       T.float32(0.25),
                       14,
                       "NHWC",
                       128,
                       8,
                       1,
                       1,
                       1,
                       1,
                       1,
                       1,
                       1,
                       T.load("uint8", placeholder_global, 0),
                       128,
                       12,
                       T.load("uint8", placeholder_d_global, 0),
                       32,
                       0,
                       0,
                       0,
                       0,
                       "NONE",
                       0,
                       0,
                       "TFL",
                       "NONE",
                       dtype="handle"))
     T.evaluate(
         T.call_extern("ethosu_copy",
                       T.load("uint8", buffer_5.data, 0),
                       112,
                       T.load("uint8", placeholder_global, 0),
                       dtype="handle"))
     T.evaluate(
         T.call_extern("ethosu_copy",
                       T.load("uint8", buffer_1.data, 0),
                       32,
                       T.load("uint8", placeholder_d_global, 0),
                       dtype="handle"))
     T.evaluate(
         T.call_extern("ethosu_conv2d",
                       "int8",
                       16,
                       16,
                       32,
                       16,
                       0,
                       16,
                       T.load("int8", placeholder_9.data, 0),
                       0,
                       0,
                       0,
                       T.float32(0.5),
                       10,
                       "NHWC",
                       512,
                       32,
                       1,
                       "int8",
                       16,
                       16,
                       2,
                       16,
                       0,
                       16,
                       T.load("int8", ethosu_write_1.data, 2),
                       0,
                       0,
                       0,
                       T.float32(0.25),
                       14,
                       "NHWC",
                       128,
                       8,
                       1,
                       1,
                       1,
                       1,
                       1,
                       1,
                       1,
                       T.load("uint8", placeholder_global, 0),
                       112,
                       12,
                       T.load("uint8", placeholder_d_global, 0),
                       32,
                       0,
                       0,
                       0,
                       0,
                       "NONE",
                       0,
                       0,
                       "TFL",
                       "NONE",
                       dtype="handle"))
     T.evaluate(
         T.call_extern("ethosu_copy",
                       T.load("uint8", buffer_4.data, 0),
                       112,
                       T.load("uint8", placeholder_global, 0),
                       dtype="handle"))
     T.evaluate(
         T.call_extern("ethosu_copy",
                       T.load("uint8", buffer_7.data, 0),
                       32,
                       T.load("uint8", placeholder_d_global, 0),
                       dtype="handle"))
     T.evaluate(
         T.call_extern("ethosu_conv2d",
                       "int8",
                       16,
                       16,
                       32,
                       16,
                       0,
                       16,
                       T.load("int8", placeholder_9.data, 0),
                       0,
                       0,
                       0,
                       T.float32(0.5),
                       10,
                       "NHWC",
                       512,
                       32,
                       1,
                       "int8",
                       16,
                       16,
                       2,
                       16,
                       0,
                       16,
                       T.load("int8", ethosu_write_1.data, 4),
                       0,
                       0,
                       0,
                       T.float32(0.25),
                       14,
                       "NHWC",
                       128,
                       8,
                       1,
                       1,
                       1,
                       1,
                       1,
                       1,
                       1,
                       T.load("uint8", placeholder_global, 0),
                       112,
                       12,
                       T.load("uint8", placeholder_d_global, 0),
                       32,
                       0,
                       0,
                       0,
                       0,
                       "NONE",
                       0,
                       0,
                       "TFL",
                       "NONE",
                       dtype="handle"))
     T.evaluate(
         T.call_extern("ethosu_copy",
                       T.load("uint8", buffer.data, 0),
                       112,
                       T.load("uint8", placeholder_global, 0),
                       dtype="handle"))
     T.evaluate(
         T.call_extern("ethosu_copy",
                       T.load("uint8", buffer_3.data, 0),
                       32,
                       T.load("uint8", placeholder_d_global, 0),
                       dtype="handle"))
     T.evaluate(
         T.call_extern("ethosu_conv2d",
                       "int8",
                       16,
                       16,
                       32,
                       16,
                       0,
                       16,
                       T.load("int8", placeholder_9.data, 0),
                       0,
                       0,
                       0,
                       T.float32(0.5),
                       10,
                       "NHWC",
                       512,
                       32,
                       1,
                       "int8",
                       16,
                       16,
                       2,
                       16,
                       0,
                       16,
                       T.load("int8", ethosu_write_1.data, 6),
                       0,
                       0,
                       0,
                       T.float32(0.25),
                       14,
                       "NHWC",
                       128,
                       8,
                       1,
                       1,
                       1,
                       1,
                       1,
                       1,
                       1,
                       T.load("uint8", placeholder_global, 0),
                       112,
                       12,
                       T.load("uint8", placeholder_d_global, 0),
                       32,
                       0,
                       0,
                       0,
                       0,
                       "NONE",
                       0,
                       0,
                       "TFL",
                       "NONE",
                       dtype="handle"))
Example No. 29
 def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast(
         placeholder_62: T.handle, placeholder_63: T.handle,
         placeholder_64: T.handle, T_cast_20: T.handle) -> None:
     # function attr dict
     T.func_attr({
         "global_symbol":
         "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast",
         "tir.noalias": True
     })
     placeholder_65 = T.match_buffer(placeholder_62, [1, 224, 224, 3],
                                     dtype="int16",
                                     elem_offset=0,
                                     align=128,
                                     offset_factor=1)
     placeholder_66 = T.match_buffer(placeholder_63, [7, 7, 3, 64],
                                     dtype="int16",
                                     elem_offset=0,
                                     align=128,
                                     offset_factor=1)
     placeholder_67 = T.match_buffer(placeholder_64, [1, 1, 1, 64],
                                     dtype="int32",
                                     elem_offset=0,
                                     align=128,
                                     offset_factor=1)
     T_cast_21 = T.match_buffer(T_cast_20, [1, 112, 112, 64],
                                dtype="uint8",
                                elem_offset=0,
                                align=128,
                                offset_factor=1)
     # body
     PaddedInput_7 = T.allocate([157323], "int16", "global")
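      # Zero-pad the 224x224x3 int16 input by (2, 3) on each spatial edge to 229x229x3,
      # as required by the 7x7 stride-2 convolution below.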
     for i0_i1_fused_7 in T.serial(0, 229):
         for i2_7, i3_7 in T.grid(229, 3):
             T.store(
                 PaddedInput_7,
                 (((i0_i1_fused_7 * 687) + (i2_7 * 3)) + i3_7),
                 T.if_then_else(
                     ((((2 <= i0_i1_fused_7) and (i0_i1_fused_7 < 226)) and
                       (2 <= i2_7)) and (i2_7 < 226)),
                     T.load("int16", placeholder_65.data,
                            ((((i0_i1_fused_7 * 672) +
                               (i2_7 * 3)) + i3_7) - 1350)),
                     T.int16(0),
                     dtype="int16"), True)
     for ax0_ax1_fused_ax2_fused_7 in T.serial(0, 12544):
         Conv2dOutput_7 = T.allocate([64], "int32", "global")
         for ff_3 in T.serial(0, 64):
             T.store(Conv2dOutput_7, ff_3, 0, True)
             for ry_2, rx_2, rc_7 in T.grid(7, 7, 3):
                 T.store(
                     Conv2dOutput_7, ff_3,
                     (T.load("int32", Conv2dOutput_7, ff_3) + (T.cast(
                         T.load("int16", PaddedInput_7, ((
                             (((T.floordiv(ax0_ax1_fused_ax2_fused_7, 112) *
                                1374) + (ry_2 * 687)) +
                              (T.floormod(ax0_ax1_fused_ax2_fused_7, 112) *
                               6)) +
                             (rx_2 * 3)) + rc_7)), "int32") * T.cast(
                                 T.load("int16", placeholder_66.data,
                                        ((((ry_2 * 1344) + (rx_2 * 192)) +
                                          (rc_7 * 64)) + ff_3)), "int32"))),
                     True)
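          # Requantise: add the bias, apply the fixed-point multiplier, clip to [0, 255], cast to uint8.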
         for ax3_inner_7 in T.serial(0, 64):
             T.store(
                 T_cast_21.data,
                 ((ax0_ax1_fused_ax2_fused_7 * 64) + ax3_inner_7),
                 T.cast(
                     T.max(
                         T.min(
                             T.q_multiply_shift(
                                 (T.load("int32", Conv2dOutput_7,
                                         ax3_inner_7) +
                                  T.load("int32", placeholder_67.data,
                                         ax3_inner_7)),
                                 1939887962,
                                 31,
                                 -9,
                                 dtype="int32"), 255), 0), "uint8"), True)
 def main(inputs: T.Buffer[(1, 4, 4, 512), "float32"],
          weight: T.Buffer[(4, 4, 512, 256), "float32"],
          conv2d_transpose_nhwc: T.Buffer[(1, 8, 8, 256), "float32"]) -> None:
     # function attr dict
     T.func_attr({"global_symbol": "main", "tir.noalias": True})
     # var definition
     threadIdx_x = T.env_thread("threadIdx.x")
     blockIdx_x = T.env_thread("blockIdx.x")
     # body
     T.launch_thread(blockIdx_x, 64)
     conv2d_transpose_nhwc_local = T.allocate([8], "float32", "local")
     PadInput_shared = T.allocate([768], "float32", "shared")
     weight_shared = T.allocate([4096], "float32", "shared")
     T.launch_thread(threadIdx_x, 32)
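      # Zero-initialise the eight per-thread output accumulators.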
     for i2_3_init, i1_4_init, i2_4_init in T.grid(2, 2, 2):
         T.store(conv2d_transpose_nhwc_local,
                 i1_4_init * 4 + i2_3_init * 2 + i2_4_init, T.float32(0),
                 True)
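      # Reduce over the 512 input channels in 16 tiles of 32, staging each tile of the
      # padded input and of the weights in shared memory.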
     for i6_0 in T.serial(16):
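          # Cooperatively load the padded input tile into shared memory, zero-filling the
          # out-of-range border.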
         for ax0_ax1_ax2_ax3_fused_0 in T.serial(24):
             T.store(
                 PadInput_shared,
                 ax0_ax1_ax2_ax3_fused_0 * 32 + threadIdx_x,
                 T.if_then_else(
                     128 <= ax0_ax1_ax2_ax3_fused_0 * 32 + threadIdx_x
                     and ax0_ax1_ax2_ax3_fused_0 * 32 + threadIdx_x < 640
                     and 1 <= blockIdx_x // 32 * 2 +
                     (ax0_ax1_ax2_ax3_fused_0 * 32 + threadIdx_x) % 128 //
                     32 and blockIdx_x // 32 * 2 +
                     (ax0_ax1_ax2_ax3_fused_0 * 32 + threadIdx_x) % 128 //
                     32 < 5,
                     T.load(
                         "float32", inputs.data, blockIdx_x // 32 * 1024 +
                         ax0_ax1_ax2_ax3_fused_0 * 512 + i6_0 * 32 +
                         threadIdx_x - 2560),
                     T.float32(0),
                     dtype="float32"), True)
         for ax0_ax1_ax2_ax3_fused_0 in T.serial(32):
             T.store(
                 weight_shared,
                 T.ramp(ax0_ax1_ax2_ax3_fused_0 * 128 + threadIdx_x * 4, 1,
                        4),
                 T.load(
                     "float32x4", weight.data,
                     T.ramp(
                         (ax0_ax1_ax2_ax3_fused_0 * 128 + threadIdx_x * 4)
                         // 256 * 131072 + i6_0 * 8192 +
                         (ax0_ax1_ax2_ax3_fused_0 * 128 + threadIdx_x * 4) %
                         256 // 8 * 256 + blockIdx_x % 32 * 8 +
                         threadIdx_x % 2 * 4, 1, 4), T.broadcast(True, 4)),
                 T.broadcast(True, 4))
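          # Accumulate the transposed-convolution contributions from this channel tile into
          # the local accumulators.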
         for i6_1, i2_3, i4_2, i5_2, i6_2, i1_4, i2_4 in T.grid(
                 4, 2, 4, 4, 8, 2, 2):
             T.store(
                 conv2d_transpose_nhwc_local, i1_4 * 4 + i2_3 * 2 + i2_4,
                 T.load("float32", conv2d_transpose_nhwc_local,
                        i1_4 * 4 + i2_3 * 2 + i2_4) +
                 T.if_then_else(
                     (i1_4 + i4_2) % 2 == 0 and (i2_4 + i5_2) % 2 == 0,
                     T.load(
                         "float32", PadInput_shared,
                         threadIdx_x // 8 * 128 + (i1_4 + i4_2) // 2 * 128 +
                         (i2_4 + i5_2) // 2 * 32 + i2_3 * 32 + i6_1 * 8 +
                         i6_2),
                     T.float32(0),
                     dtype="float32") * T.load(
                         "float32", weight_shared, i6_1 * 64 + i6_2 * 8 +
                         threadIdx_x % 8 + 3840 - i5_2 * 256 - i4_2 * 1024),
                 True)
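      # Write the per-thread accumulators back to the NHWC output in global memory.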
     for ax1, ax2 in T.grid(2, 4):
         T.store(
             conv2d_transpose_nhwc.data, threadIdx_x // 8 * 4096 +
             ax1 * 2048 + blockIdx_x // 32 * 1024 + ax2 * 256 +
             blockIdx_x % 32 * 8 + threadIdx_x % 8,
             T.load("float32", conv2d_transpose_nhwc_local,
                    ax1 * 4 + ax2), True)