def unified_element_wise_thread_x(a: T.handle, b: T.handle, c: T.handle) -> None:
    # Expected TIR after thread-binding unification: the two separate
    # threadIdx.x launches of the original kernel now share one `thread_x`
    # var under a single blockIdx.x launch.
    thread_x = T.env_thread("threadIdx.x")
    block_x = T.env_thread("blockIdx.x")
    A = T.match_buffer(a, [128, 128])
    B = T.match_buffer(b, [128, 128])
    C = T.match_buffer(c, [128, 128])
    T.launch_thread(block_x, 128)
    # First kernel body: B = A * 2, row split across 4 threads x 32 elements.
    with T.launch_thread(thread_x, 4):
        for j0_1 in T.serial(0, 32):
            T.store(
                B.data,
                block_x * 128 + thread_x * 32 + j0_1,
                T.load("float32", A.data, block_x * 128 + thread_x * 32 + j0_1) * 2.0,
                True,
            )
    # Second kernel body: C = A + 1, re-launching the same unified thread var.
    T.launch_thread(thread_x, 4)
    for j1_1 in T.serial(0, 32):
        T.store(
            C.data,
            block_x * 128 + thread_x * 32 + j1_1,
            T.load("float32", A.data, block_x * 128 + thread_x * 32 + j1_1) + 1.0,
            True,
        )
def unified_element_wise_kernels_with_different_size(a: T.handle, b: T.handle, c: T.handle,
                                                     d: T.handle) -> None:
    # Expected TIR after thread-binding unification when the two kernels have
    # different launch extents (128 vs 256): the vars cannot be merged, so a
    # second pair (block_x_1 / thread_x_1) remains.
    block_x = T.env_thread("blockIdx.x")
    thread_x = T.env_thread("threadIdx.x")
    block_x_1 = T.env_thread("blockIdx.x")
    thread_x_1 = T.env_thread("threadIdx.x")
    A = T.match_buffer(a, [128, 128])
    B = T.match_buffer(b, [128, 128])
    C = T.match_buffer(c, [256, 256])
    D = T.match_buffer(d, [256, 256])
    # Kernel 1 (128x128): B = A * 2.
    with T.launch_thread(block_x, 128):
        T.launch_thread(thread_x, 128)
        T.store(
            B.data,
            block_x * 128 + thread_x,
            T.load("float32", A.data, block_x * 128 + thread_x) * 2.0,
            True,
        )
    # Kernel 2 (256x256): D = C + 1.
    T.launch_thread(block_x_1, 256)
    T.launch_thread(thread_x_1, 256)
    T.store(
        D.data,
        block_x_1 * 256 + thread_x_1,
        T.load("float32", C.data, block_x_1 * 256 + thread_x_1) + 1.0,
        True,
    )
def tvmgen_default_fused_cast_subtract_fixed_point_multiply_add_clip_cast_cast(
        placeholder: T.handle, placeholder_1: T.handle, T_cast: T.handle) -> None:
    # Fused quantized op: cast(uint8->int32) - 94, fixed-point multiply,
    # add per-channel bias, clip to [0, 255], cast to uint8 then int16.
    # function attr dict
    T.func_attr({
        "global_symbol": "tvmgen_default_fused_cast_subtract_fixed_point_multiply_add_clip_cast_cast",
        "tir.noalias": True
    })
    placeholder_2 = T.match_buffer(placeholder, [1, 75, 75, 64], dtype="uint8")
    placeholder_3 = T.match_buffer(placeholder_1, [64], dtype="int32")
    T_cast_1 = T.match_buffer(T_cast, [1, 75, 75, 64], dtype="int16")
    # body
    for ax0_ax1_fused, ax2, ax3_outer, ax3_inner in T.grid(75, 75, 4, 16):
        # Flat index over NHWC layout: 75 * 64 = 4800 elements per row.
        T.store(
            T_cast_1.data,
            ax0_ax1_fused * 4800 + ax2 * 64 + ax3_outer * 16 + ax3_inner,
            T.cast(
                T.cast(
                    T.max(
                        T.min(
                            T.q_multiply_shift(T.cast(
                                T.load(
                                    "uint8", placeholder_2.data,
                                    ax0_ax1_fused * 4800 + ax2 * 64 + ax3_outer * 16 +
                                    ax3_inner), "int32") - 94,
                                               1843157232, 31, 1, dtype="int32") +
                            T.load("int32", placeholder_3.data, ax3_outer * 16 + ax3_inner),
                            255), 0), "uint8"), "int16"), True)
def main(a: T.handle, b: T.handle) -> None:
    # GPU conv-style kernel: accumulates into a thread-local B_local tile via
    # a shared-memory staging buffer, then writes the tile back to B.
    # function attr dict
    # Fix: the noalias attribute key is "tir.noalias" (as used by every other
    # function in this file); "T.noalias" was a typo and would not be
    # recognized by the TIR passes that consult this attribute.
    T.func_attr({"global_symbol": "main", "tir.noalias": True})
    # var definition
    threadIdx_x = T.env_thread("threadIdx.x")
    threadIdx_y = T.env_thread("threadIdx.y")
    blockIdx_x = T.env_thread("blockIdx.x")
    blockIdx_y = T.env_thread("blockIdx.y")
    blockIdx_z = T.env_thread("blockIdx.z")
    A = T.match_buffer(a, [14, 14, 256, 256], dtype="float32")
    B = T.match_buffer(b, [14, 14, 512, 256], dtype="float32")
    # body
    T.launch_thread(blockIdx_z, 196)
    # NOTE(review): B_local is allocated far larger than the 64 elements
    # actually indexed below (ff_c * 8 + nn_c < 64) — presumably deliberate
    # for the originating storage test; confirm against that test's intent.
    B_local = T.allocate([6400000], "float32", "local")
    Apad_shared = T.allocate([512], "float32", "shared")
    Apad_shared_local = T.allocate([8], "float32", "local")
    T.launch_thread(blockIdx_y, 8)
    T.launch_thread(blockIdx_x, 4)
    T.launch_thread(threadIdx_y, 8)
    T.launch_thread(threadIdx_x, 8)
    # Zero-initialize the 8x8 accumulation tile.
    for ff_c_init, nn_c_init in T.grid(8, 8):
        T.store(B_local, ff_c_init * 8 + nn_c_init, T.float32(0), True)
    for rc_outer, ry, rx in T.grid(32, 3, 3):
        # Stage a vectorized (float32x4) slice of padded A into shared memory;
        # out-of-bounds halo positions are filled with zero.
        for ax3_inner_outer in T.serial(0, 2):
            T.store(Apad_shared,
                    T.ramp(threadIdx_y * 64 + threadIdx_x * 8 + ax3_inner_outer * 4, 1, 4),
                    T.if_then_else(
                        1 <= blockIdx_z // 14 + ry and blockIdx_z // 14 + ry < 15 and
                        1 <= rx + blockIdx_z % 14 and rx + blockIdx_z % 14 < 15,
                        T.load(
                            "float32x4", A.data,
                            T.ramp(
                                ry * 917504 + blockIdx_z * 65536 + rx * 65536 +
                                rc_outer * 2048 + threadIdx_y * 256 + blockIdx_x * 64 +
                                threadIdx_x * 8 + ax3_inner_outer * 4 - 983040, 1, 4),
                            T.broadcast(True, 4)),
                        T.broadcast(T.float32(0), 4),
                        dtype="float32x4"), T.broadcast(True, 4))
        for rc_inner in T.serial(0, 8):
            # Copy one 8-wide strip from shared memory into registers...
            for ax3 in T.serial(0, 8):
                T.store(Apad_shared_local, ax3,
                        T.load("float32", Apad_shared,
                               rc_inner * 64 + threadIdx_x * 8 + ax3), True)
            # ...and accumulate it into the local tile.
            for ff_c, nn_c in T.grid(8, 8):
                T.store(B_local, ff_c * 8 + nn_c,
                        T.load("float32", B_local, ff_c * 8 + nn_c) +
                        T.load("float32", Apad_shared_local, nn_c), True)
    # Write the finished 8x8 tile back to global memory.
    for ff_inner_inner_inner, nn_inner_inner_inner in T.grid(8, 8):
        T.store(B.data,
                blockIdx_z * 131072 + blockIdx_y * 16384 + threadIdx_y * 2048 +
                ff_inner_inner_inner * 256 + blockIdx_x * 64 + threadIdx_x * 8 +
                nn_inner_inner_inner,
                T.load("float32", B_local,
                       ff_inner_inner_inner * 8 + nn_inner_inner_inner), True)
# fmt: on
def compacted_complex_func(a: T.handle, c: T.handle, n: T.int32) -> None:
    # Expected TIR after buffer-region compaction: B is compacted to (1, 8)
    # and D to (6, 1), matching the regions actually touched below.
    A = T.match_buffer(a, (8, 8), "float32")
    C = T.match_buffer(c, (8, 8), "float32")
    for i in range(0, 8):
        with T.block():
            # NOTE(review): reads/writes are annotated as the single points
            # A[0, 8] / C[0, 8] rather than full rows — presumably the exact
            # form emitted by the compaction pass; confirm against its test.
            T.reads(A[0, 8])
            T.writes(C[0, 8])
            B = T.alloc_buffer((1, 8), "float32")
            for j in range(0, 4):
                with T.block() as []:
                    D = T.alloc_buffer((6, 1), "float32")
                    T.reads(A[i, j])
                    T.writes(B[0, j])
                    # Fill part of D, then consume it while writing B.
                    for k in range(4, 8):
                        D[k - 2, 0] = 1.0
                    for k in range(2, 4):
                        T.store(B.data, j, A[i, j] + D[k - 2, 0])
            # Two disjoint column ranges of C are produced from B's row 0.
            for j in range(3, 5):
                with T.block() as []:
                    T.reads(B[0, j])
                    T.writes(C[i, j])
                    C[i, j] = B[0, j]
            for j in range(6, 8):
                with T.block() as []:
                    T.reads(B[0, j])
                    T.writes(C[i, j])
                    C[i, j] = B[0, j]
def buffer_opaque_access(b: T.handle, c: T.handle) -> None:
    # Fixture mixing raw-pointer (opaque) accesses with block-level buffer
    # accesses; B is written only through the opaque tvm_fill_fragment call.
    B = T.match_buffer(b, [16, 16], "float32")
    C = T.match_buffer(c, [16, 16], "float32")
    with T.block([]):
        T.reads([])
        T.writes(B[0:16, 0:16])
        A = T.allocate([256], "float32", "global")
        for i, j in T.grid(16, 16):
            T.store(A, i * 16 + j, 1)
        for i in range(0, 16):
            # Opaque read of A followed by an opaque fill of B's fragment.
            for j in range(0, 16):
                T.evaluate(T.load("float32", A, i * 16 + j))
            for j in range(0, 16):
                T.evaluate(
                    T.tvm_fill_fragment(B.data, 16, 16, 16, 0, T.float32(0),
                                        dtype="handle"))
    # Plain element-wise copy expressed with explicit block var bindings.
    for i, j in T.grid(16, 16):
        with T.block([16, 16]) as [vi, vj]:
            T.bind(vi, i)
            T.bind(vj, j)
            C[vi, vj] = B[vi, vj]
def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T.handle,
                                       T_subtract: T.handle) -> None:
    # Fused op: cast the uint8 input to int16 and subtract a scalar offset.
    # function attr dict
    T.func_attr({
        "global_symbol": "tvmgen_default_fused_cast_subtract",
        "tir.noalias": True
    })
    # Fix: the keyword was misspelled `dTpe=` — T.match_buffer takes `dtype=`
    # (as on every other match_buffer in this file); the typo would raise a
    # TypeError when the function is parsed/executed.
    placeholder_4 = T.match_buffer(placeholder_2, [1, 224, 224, 3],
                                   dtype="uint8",
                                   elem_offset=0,
                                   align=128,
                                   offset_factor=1)
    # Scalar (rank-0) buffer holding the subtrahend.
    placeholder_5 = T.match_buffer(placeholder_3, [],
                                   dtype="int16",
                                   elem_offset=0,
                                   align=128,
                                   offset_factor=1)
    T_subtract_1 = T.match_buffer(T_subtract, [1, 224, 224, 3],
                                  dtype="int16",
                                  elem_offset=0,
                                  align=128,
                                  offset_factor=1)
    # body
    for ax0_ax1_fused_1 in T.serial(0, 224):
        for ax2_1, ax3_inner_1 in T.grid(224, 3):
            # 224 cols * 3 channels = 672 elements per fused row.
            T.store(T_subtract_1.data,
                    (((ax0_ax1_fused_1 * 672) + (ax2_1 * 3)) + ax3_inner_1),
                    (T.cast(
                        T.load("uint8", placeholder_4.data,
                               (((ax0_ax1_fused_1 * 672) + (ax2_1 * 3)) + ax3_inner_1)),
                        "int16") - T.load("int16", placeholder_5.data, 0)), True)
def threadpool_nested_parallel_loop(
        A: T.Buffer[(4, 4), "float32"], B: T.Buffer[(4, 4), "float32"]) -> None:
    # Fixture with nested T.parallel loops (B = A * 2); the nesting itself is
    # what the consuming test exercises, not the arithmetic.
    T.func_attr({"global_symbol": "main", "tir.noalias": True})
    for i in T.parallel(4):
        for j in T.parallel(4):
            T.store(B.data, i * 4 + j, T.load("float32", A.data, i * 4 + j) * 2.0)
def opaque_access_store(a: T.handle, c: T.handle) -> None:
    # Fixture where block "C" performs raw T.store/T.load on the underlying
    # data pointers alongside normal buffer accesses.
    A = T.match_buffer(a, (128, 128))
    B = T.alloc_buffer((128, 128))
    C = T.match_buffer(c, (128, 128))
    with T.block([128, 128], "B") as [vi, vj]:
        B[vi, vj] = A[vi, vj] * 2.0
    with T.block([128, 128], "C") as [vi, vj]:
        T.reads(B[0:128, 0:128])
        T.writes(C[0:128, 0:128])
        T.store(C.data, vi * 128 + vj, B[vi, vj] + 1.0)
        # NOTE(review): the load uses stride 16 on a 128x128 buffer — this
        # mismatch appears deliberate (it keeps the access opaque so the
        # block cannot be inlined/analyzed); confirm against the test intent.
        C[vi, vj] = T.load("float32", B.data, vi * 16 + vj) + 1.0
def unschedulable_func(a: T.handle, c: T.handle) -> None:
    # Fixture of a block that cannot be scheduled: it mixes a raw T.store on
    # B.data with ordinary buffer accesses inside the same block.
    A = T.match_buffer(a, (16, 16), "float32")
    C = T.match_buffer(c, (16, 16), "float32")
    for i in range(0, 16):
        with T.block():
            T.reads(A[i, 0:16])
            T.writes(C[i, 0:16])
            B = T.alloc_buffer((16, 16), "float32")
            for j in range(0, 16):
                # Opaque write: bypasses the buffer abstraction.
                T.store(B.data, i * 16 + j, A[i, j] + 1.0)
            for j in range(0, 16):
                C[i, j] = B[i, j] * 2.0
def partitioned_concat(a: T.handle, b: T.handle, c: T.handle) -> None:
    # Expected TIR after loop partitioning of a concat: C[0:16] = A[0:16],
    # C[16:32] = B[0:16].
    T.func_attr({
        "from_legacy_te_schedule": True,
        "global_symbol": "main",
        "tir.noalias": True
    })
    A = T.match_buffer(a, [16], dtype="float32")
    B = T.match_buffer(b, [16], dtype="float32")
    C = T.match_buffer(c, [32], dtype="float32")
    for i in T.serial(0, 16):
        T.store(C.data, i, T.load("float32", A.data, i), True)
    for i in T.serial(0, 16):
        # Fix: the load index was `i + 16`, which is out of bounds for the
        # 16-element buffer B. The partitioned concat reads B[i] and writes
        # it to the second half of C.
        T.store(C.data, i + 16, T.load("float32", B.data, i), True)
def opaque_access_reorder(a: T.handle, b: T.handle) -> None:
    # Expected TIR after reordering the i/j loops of a fixture whose blocks
    # contain opaque accesses (raw store / fill_fragment).
    A = T.match_buffer(a, [16, 16], "float32")
    B = T.match_buffer(b, [16, 16], "float32")
    for j, i in T.grid(16, 16):
        with T.block("A"):
            vi, vj = T.axis.remap("SS", [i, j])
            T.reads([])
            T.writes([A[0:16, 0:16]])
            T.store(A.data, vi * 16 + vj, 1)
    for j, i in T.grid(16, 16):
        with T.block("B"):
            vi, vj = T.axis.remap("SS", [i, j])
            T.reads([])
            T.writes([B[0:16, 0:16]])
            T.evaluate(
                T.tvm_fill_fragment(B.data, 16, 16, 16, 0, vi * 16 + vj,
                                    dtype="handle"))
def opaque_access_store(a: T.handle, c: T.handle) -> None:
    # Loop-explicit variant of the opaque-store fixture: block "C" mixes raw
    # T.store/T.load on data pointers with normal buffer accesses.
    A = T.match_buffer(a, (128, 128))
    B = T.alloc_buffer((128, 128))
    C = T.match_buffer(c, (128, 128))
    for i, j in T.grid(128, 128):
        with T.block("B"):
            vi, vj = T.axis.remap("SS", [i, j])
            B[vi, vj] = A[vi, vj] * 2.0
    for i, j in T.grid(128, 128):
        with T.block("C"):
            vi, vj = T.axis.remap("SS", [i, j])
            T.reads(B[0:128, 0:128])
            T.writes(C[0:128, 0:128])
            T.store(C.data, vi * 128 + vj, B[vi, vj] + 1.0)
            # NOTE(review): stride-16 load on a 128x128 buffer — appears
            # intentional so the access stays opaque to analysis; confirm
            # against the originating test.
            C[vi, vj] = T.load("float32", B.data, vi * 16 + vj) + 1.0
def unified_element_wise_vthread_x(a: T.handle, b: T.handle) -> None:
    # Expected TIR after unifying the two vthread.x launches into a single
    # `vthread_x` var (launched twice). The flat index 8256 = 8192 + 64 is
    # the sum of the two original strides now driven by the same var.
    vthread_x = T.env_thread("vthread.x")
    thread_x = T.env_thread("threadIdx.x")
    A = T.match_buffer(a, [128, 128])
    B = T.match_buffer(b, [128, 128])
    T.launch_thread(vthread_x, 2)
    T.launch_thread(thread_x, 64)
    T.launch_thread(vthread_x, 2)
    for j_1 in T.serial(0, 64):
        T.store(
            B.data,
            vthread_x * 8256 + thread_x * 128 + j_1,
            T.load("float32", A.data, vthread_x * 8256 + thread_x * 128 + j_1) * 2.0,
            True,
        )
def element_wise_two_thread_x_in_same_kernel_not_equal(a: T.handle, b: T.handle,
                                                       c: T.handle) -> None:
    # Fixture with two threadIdx.x launches of UNEQUAL extents (128 vs 64)
    # inside the same kernel — the unequal extents are what the consuming
    # test checks for.
    i = T.env_thread("blockIdx.x")
    j0 = T.env_thread("threadIdx.x")
    j1 = T.env_thread("threadIdx.x")
    A = T.match_buffer(a, [128, 128])
    B = T.match_buffer(b, [128, 128])
    C = T.match_buffer(c, [128, 64])
    T.launch_thread(i, 128)
    with T.launch_thread(j0, 128):
        # Fix: the store index was `i * 64 + j0`, inconsistent with B's
        # [128, 128] shape and with the matching load `i * 128 + j0`; the
        # row stride for B is 128.
        T.store(B.data, i * 128 + j0, T.load("float32", A.data, i * 128 + j0) * 2.0, True)
    # Second launch: C is [128, 64], so its row stride really is 64.
    T.launch_thread(j1, 64)
    T.store(C.data, i * 64 + j1, T.load("float32", A.data, i * 128 + j1) + 1.0, True)
def opaque_access_split(a: T.handle, b: T.handle) -> None:
    # Expected TIR after splitting j into (j0, j1) on the opaque-access
    # fixture; vj is reconstructed as j0 * 4 + j1.
    A = T.match_buffer(a, (16, 16))
    B = T.match_buffer(b, (16, 16))
    for i, j0, j1 in T.grid(16, 4, 4):
        with T.block("A"):
            vi = T.axis.S(16, i)
            vj = T.axis.S(16, j0 * 4 + j1)
            T.reads([])
            T.writes([A[0:16, 0:16]])
            T.store(A.data, ((vi * 16) + vj), 1, 1)
    for i, j0, j1 in T.grid(16, 4, 4):
        with T.block("B"):
            vi = T.axis.S(16, i)
            vj = T.axis.S(16, j0 * 4 + j1)
            T.reads([])
            T.writes([B[0:16, 0:16]])
            T.evaluate(
                T.tvm_fill_fragment(B.data, 16, 16, 16, 0, ((vi * 16) + vj),
                                    dtype="handle"))
def compacted_opaque_access_annotated_func(a: T.handle) -> None:
    # Expected TIR after compaction with annotated opaque accesses: B keeps
    # its full 1024 extent (unannotated opaque access covers everything),
    # while C is compacted to 520 = 511 + 9 from the annotated C[i:i+9].
    A = T.match_buffer(a, (1024, ), "float32")
    with T.block():
        # Fix: the keyword was misspelled `dtypes=` — T.alloc_buffer takes
        # `dtype=` (matching every match_buffer/alloc_buffer in this file);
        # the typo would raise a TypeError at parse time.
        B = T.alloc_buffer((1024, ), dtype="float32")
        C = T.alloc_buffer((520, ), dtype="float32")
        for i in range(0, 512):
            with T.block():
                # no annotation, opaque access will cover full region
                T.reads([])
                T.writes([])
                # Fix: the original call passed the dtype string "float32" in
                # the value slot of T.store (signature: buffer_var, index,
                # value, predicate), leaving A[i] in the predicate position.
                T.store(B.data, i, A[i], True)
            with T.block():
                # treat opaque access only access annotated regions, even if
                # they are not compatible with actual buffer accesses.
                T.reads([B[i]])
                T.writes([C[i:i + 9]])
                T.store(C.data, i, T.load("float32", B.data, i))
def opaque_access_fused(a: T.handle, b: T.handle) -> None:
    # Expected TIR after fusing i and j into one 256-iteration loop on the
    # opaque-access fixture; vi/vj are recovered via floordiv/floormod.
    A = T.match_buffer(a, [16, 16])
    B = T.match_buffer(b, [16, 16])
    for i_j_fused in T.serial(0, 256):
        with T.block("A"):
            vi = T.axis.S(16, T.floordiv(i_j_fused, 16))
            vj = T.axis.S(16, T.floormod(i_j_fused, 16))
            T.reads([])
            T.writes([A[0:16, 0:16]])
            T.store(A.data, ((vi * 16) + vj), 1, 1)
    for i_j_fused in T.serial(0, 256):
        with T.block("B"):
            vi = T.axis.S(16, T.floordiv(i_j_fused, 16))
            vj = T.axis.S(16, T.floormod(i_j_fused, 16))
            T.reads([])
            T.writes([B[0:16, 0:16]])
            T.evaluate(
                T.tvm_fill_fragment(B.data, 16, 16, 16, 0, ((vi * 16) + vj),
                                    dtype="handle"))
def element_wise_vthread_x(a: T.handle, b: T.handle) -> None:
    # Pre-unification fixture: two distinct vthread.x vars (i_0, j_0) plus a
    # threadIdx.x, each with its own stride in the flat index.
    i_0 = T.env_thread("vthread.x")
    i_1 = T.env_thread("threadIdx.x")
    j_0 = T.env_thread("vthread.x")
    A = T.match_buffer(a, [128, 128])
    B = T.match_buffer(b, [128, 128])
    T.launch_thread(i_0, 2)
    T.launch_thread(i_1, 64)
    T.launch_thread(j_0, 2)
    for j_1 in T.serial(0, 64):
        T.store(
            B.data,
            i_0 * 8192 + i_1 * 128 + j_0 * 64 + j_1,
            T.load("float32", A.data, i_0 * 8192 + i_1 * 128 + j_0 * 64 + j_1) * 2.0,
            True,
        )
def element_wise_kernels_with_different_size(a: T.handle, b: T.handle, c: T.handle,
                                             d: T.handle) -> None:
    # Pre-unification fixture: two kernels with different launch extents
    # (128 vs 256), each with its own blockIdx.x / threadIdx.x pair.
    i0 = T.env_thread("blockIdx.x")
    j0 = T.env_thread("threadIdx.x")
    i1 = T.env_thread("blockIdx.x")
    j1 = T.env_thread("threadIdx.x")
    A = T.match_buffer(a, [128, 128])
    B = T.match_buffer(b, [128, 128])
    C = T.match_buffer(c, [256, 256])
    D = T.match_buffer(d, [256, 256])
    # Kernel 1 (128x128): B = A * 2.
    with T.launch_thread(i0, 128):
        T.launch_thread(j0, 128)
        T.store(B.data, i0 * 128 + j0, T.load("float32", A.data, i0 * 128 + j0) * 2.0, True)
    # Kernel 2 (256x256): D = C + 1.
    T.launch_thread(i1, 256)
    T.launch_thread(j1, 256)
    T.store(D.data, i1 * 256 + j1, T.load("float32", C.data, i1 * 256 + j1) + 1.0, True)
def opaque_access(a: T.handle, b: T.handle) -> None:
    # Base opaque-access fixture: block "A" writes through a raw T.store and
    # block "B" through the opaque tvm_fill_fragment intrinsic, both with
    # explicitly annotated write regions.
    A = T.match_buffer(a, [16, 16], "float32")
    B = T.match_buffer(b, [16, 16], "float32")
    with T.block([16, 16], "A") as [vi, vj]:
        T.reads([])
        T.writes([A[0:16, 0:16]])
        T.store(A.data, vi * 16 + vj, 1)
    with T.block([16, 16], "B") as [vi, vj]:
        T.reads([])
        T.writes([B[0:16, 0:16]])
        T.evaluate(
            T.tvm_fill_fragment(B.data, 16, 16, 16, 0, vi * 16 + vj, dtype="handle"))
def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast(
        placeholder_4: T.handle, placeholder_5: T.handle, placeholder_6: T.handle,
        T_cast_2: T.handle) -> None:
    # Fused 1x1 conv2d + bias + requantize (fixed-point multiply, clip to
    # [0, 255], cast uint8 -> int16) over a 75x75x64 NHWC tensor.
    # function attr dict
    T.func_attr({
        "global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast",
        "tir.noalias": True
    })
    placeholder_7 = T.match_buffer(placeholder_4, [1, 75, 75, 64], dtype="int16")
    placeholder_8 = T.match_buffer(placeholder_5, [1, 1, 64, 64], dtype="int16")
    placeholder_9 = T.match_buffer(placeholder_6, [1, 1, 1, 64], dtype="int32")
    T_cast_3 = T.match_buffer(T_cast_2, [1, 75, 75, 64], dtype="int16")
    # body
    # 1x1 kernel needs no halo: "padding" is a straight copy (75*75*64 = 360000).
    PaddedInput = T.allocate([360000], "int16", "global")
    for i0_i1_fused, i2, i3 in T.grid(75, 75, 64):
        T.store(
            PaddedInput, i0_i1_fused * 4800 + i2 * 64 + i3,
            T.load("int16", placeholder_7.data, i0_i1_fused * 4800 + i2 * 64 + i3), True)
    for ax0_ax1_fused_ax2_fused in T.serial(0, 5625):
        Conv2dOutput = T.allocate([64], "int32", "global")
        for ff in T.serial(0, 64):
            # Init then reduce over the 64 input channels.
            T.store(Conv2dOutput, ff, 0, True)
            for rc in T.serial(0, 64):
                T.store(
                    Conv2dOutput, ff,
                    T.load("int32", Conv2dOutput, ff) + T.cast(
                        T.load("int16", PaddedInput, ax0_ax1_fused_ax2_fused * 64 + rc),
                        "int32") * T.cast(
                            T.load("int16", placeholder_8.data, rc * 64 + ff), "int32"),
                    True)
        for ax3_inner_1 in T.serial(0, 64):
            # Requantize: add bias, fixed-point multiply, clip, cast.
            T.store(
                T_cast_3.data, ax0_ax1_fused_ax2_fused * 64 + ax3_inner_1,
                T.cast(
                    T.cast(
                        T.max(
                            T.min(
                                T.q_multiply_shift(
                                    T.load("int32", Conv2dOutput, ax3_inner_1) +
                                    T.load("int32", placeholder_9.data, ax3_inner_1),
                                    1843106743, 31, -6, dtype="int32"), 255), 0),
                        "uint8"), "int16"), True)
def partitioned_concat_3(
    placeholder: T.Buffer[(1, 64, 28, 28), "int8"],
    placeholder_1: T.Buffer[(1, 32, 28, 28), "int8"],
    placeholder_2: T.Buffer[(1, 32, 28, 28), "int8"],
    T_concat: T.Buffer[(1, 128, 28, 28), "int8"],
) -> None:
    # Expected TIR after partitioning a 3-way channel concat: one dedicated
    # copy loop per input, with output offsets 0, 50176 (= 64*784) and
    # 75264 (= 96*784).
    for i1, i2, i3 in T.grid(64, 28, 28):
        T.store(
            T_concat.data,
            i1 * 784 + i2 * 28 + i3,
            T.load("int8", placeholder.data, i1 * 784 + i2 * 28 + i3),
            True,
        )
    for i1, i2, i3 in T.grid(32, 28, 28):
        T.store(
            T_concat.data,
            i1 * 784 + i2 * 28 + i3 + 50176,
            T.load("int8", placeholder_1.data, i1 * 784 + i2 * 28 + i3),
            True,
        )
    for i1, i2, i3 in T.grid(32, 28, 28):
        T.store(
            T_concat.data,
            i1 * 784 + i2 * 28 + i3 + 75264,
            T.load("int8", placeholder_2.data, i1 * 784 + i2 * 28 + i3),
            True,
        )
def concat_func_3(
    placeholder: T.Buffer[(1, 64, 28, 28), "int8"],
    placeholder_1: T.Buffer[(1, 32, 28, 28), "int8"],
    placeholder_2: T.Buffer[(1, 32, 28, 28), "int8"],
    T_concat: T.Buffer[(1, 128, 28, 28), "int8"],
) -> None:
    # Pre-partition form of the 3-way concat: a single i1 loop over all 128
    # output channels, selecting the source by range checks. The pragma hint
    # asks the loop-partition pass to split it (see partitioned_concat_3).
    for i1 in T.serial(128, annotations={"pragma_loop_partition_hint": 1}):
        for i2, i3 in T.grid(28, 28):
            if 96 <= i1:
                # Channels [96, 128) come from placeholder_2 (offset 96*784 = 75264).
                T.store(
                    T_concat.data,
                    i1 * 784 + i2 * 28 + i3,
                    T.load("int8", placeholder_2.data, i1 * 784 + i2 * 28 + i3 - 75264),
                    True,
                )
            if 64 <= i1 and i1 < 96:
                # Channels [64, 96) come from placeholder_1 (offset 64*784 = 50176).
                T.store(
                    T_concat.data,
                    i1 * 784 + i2 * 28 + i3,
                    T.load("int8", placeholder_1.data, i1 * 784 + i2 * 28 + i3 - 50176),
                    True,
                )
            if i1 < 64:
                # Channels [0, 64) come from placeholder.
                T.store(
                    T_concat.data,
                    i1 * 784 + i2 * 28 + i3,
                    T.load("int8", placeholder.data, i1 * 784 + i2 * 28 + i3),
                    True,
                )
def element_wise_thread_x(a: T.handle, b: T.handle, c: T.handle) -> None:
    # Pre-unification fixture: two separate threadIdx.x vars (j0_0, j1_0)
    # launched one after the other inside the same blockIdx.x kernel.
    j1_0 = T.env_thread("threadIdx.x")
    j0_0 = T.env_thread("threadIdx.x")
    i = T.env_thread("blockIdx.x")
    A = T.match_buffer(a, [128, 128])
    B = T.match_buffer(b, [128, 128])
    C = T.match_buffer(c, [128, 128])
    T.launch_thread(i, 128)
    # First body: B = A * 2, row split across 4 threads x 32 elements.
    with T.launch_thread(j0_0, 4):
        for j0_1 in T.serial(0, 32):
            T.store(
                B.data,
                i * 128 + j0_0 * 32 + j0_1,
                T.load("float32", A.data, i * 128 + j0_0 * 32 + j0_1) * 2.0,
                True,
            )
    # Second body: C = A + 1, using the other threadIdx.x var.
    T.launch_thread(j1_0, 4)
    for j1_1 in T.serial(0, 32):
        T.store(
            C.data,
            i * 128 + j1_0 * 32 + j1_1,
            T.load("float32", A.data, i * 128 + j1_0 * 32 + j1_1) + 1.0,
            True,
        )
def tvmgen_default_fused_nn_max_pool2d_cast(placeholder_28: T.handle,
                                            T_cast_6: T.handle) -> None:
    # Fused 3x3 stride-2 max-pool (112x112x64 uint8 -> 56x56x64) followed by
    # a cast to int16.
    # function attr dict
    T.func_attr({
        "global_symbol": "tvmgen_default_fused_nn_max_pool2d_cast",
        "tir.noalias": True
    })
    placeholder_29 = T.match_buffer(placeholder_28, [1, 112, 112, 64],
                                    dtype="uint8",
                                    elem_offset=0,
                                    align=128,
                                    offset_factor=1)
    T_cast_7 = T.match_buffer(T_cast_6, [1, 56, 56, 64],
                              dtype="int16",
                              elem_offset=0,
                              align=128,
                              offset_factor=1)
    # body
    # Intermediate uint8 pool result: 56 * 56 * 64 = 200704 elements.
    tensor_2 = T.allocate([200704], "uint8", "global")
    for ax0_ax1_fused_4 in T.serial(0, 56):
        for ax2_4 in T.serial(0, 56):
            # Initialize the output cell, then reduce over the 3x3 window
            # (rv0_rv1_fused_1 in [0, 9)); out-of-range taps contribute 0.
            for ax3_init in T.serial(0, 64):
                T.store(tensor_2,
                        (((ax0_ax1_fused_4 * 3584) + (ax2_4 * 64)) + ax3_init),
                        T.uint8(0), True)
            for rv0_rv1_fused_1, ax3_2 in T.grid(9, 64):
                T.store(
                    tensor_2, (((ax0_ax1_fused_4 * 3584) + (ax2_4 * 64)) + ax3_2),
                    T.max(
                        T.load("uint8", tensor_2,
                               (((ax0_ax1_fused_4 * 3584) + (ax2_4 * 64)) + ax3_2)),
                        T.if_then_else(
                            ((((ax0_ax1_fused_4 * 2) + T.floordiv(rv0_rv1_fused_1, 3)) < 112)
                             and (((ax2_4 * 2) + T.floormod(rv0_rv1_fused_1, 3)) < 112)),
                            T.load("uint8", placeholder_29.data,
                                   (((((ax0_ax1_fused_4 * 14336) +
                                       (T.floordiv(rv0_rv1_fused_1, 3) * 7168)) +
                                      (ax2_4 * 128)) +
                                     (T.floormod(rv0_rv1_fused_1, 3) * 64)) + ax3_2)),
                            T.uint8(0),
                            dtype="uint8")), True)
    # Second pass: cast the pooled uint8 result to int16.
    for ax0_ax1_fused_5 in T.serial(0, 56):
        for ax2_5, ax3_3 in T.grid(56, 64):
            T.store(
                T_cast_7.data, (((ax0_ax1_fused_5 * 3584) + (ax2_5 * 64)) + ax3_3),
                T.cast(
                    T.load("uint8", tensor_2,
                           (((ax0_ax1_fused_5 * 3584) + (ax2_5 * 64)) + ax3_3)),
                    "int16"), True)
def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_1(
        placeholder_10: T.handle, placeholder_11: T.handle, placeholder_12: T.handle,
        T_cast_4: T.handle) -> None:
    # Fused 3x3 conv2d (same padding) + bias + requantize over a 75x75x64
    # NHWC tensor; output cast uint8 -> int16.
    # function attr dict
    T.func_attr({
        "global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_1",
        "tir.noalias": True
    })
    placeholder_13 = T.match_buffer(placeholder_10, [1, 75, 75, 64], dtype="int16")
    placeholder_14 = T.match_buffer(placeholder_11, [3, 3, 64, 64], dtype="int16")
    placeholder_15 = T.match_buffer(placeholder_12, [1, 1, 1, 64], dtype="int32")
    T_cast_5 = T.match_buffer(T_cast_4, [1, 75, 75, 64], dtype="int16")
    # body
    # Zero-padded input: 77 * 77 * 64 = 379456 elements (1-pixel halo).
    PaddedInput_1 = T.allocate([379456], "int16", "global")
    for i0_i1_fused_1, i2_1, i3_1 in T.grid(77, 77, 64):
        T.store(
            PaddedInput_1, i0_i1_fused_1 * 4928 + i2_1 * 64 + i3_1,
            T.if_then_else(
                1 <= i0_i1_fused_1 and i0_i1_fused_1 < 76 and 1 <= i2_1 and i2_1 < 76,
                T.load("int16", placeholder_13.data,
                       i0_i1_fused_1 * 4800 + i2_1 * 64 + i3_1 - 4864),
                T.int16(0),
                dtype="int16"), True)
    for ax0_ax1_fused_ax2_fused_1 in T.serial(0, 5625):
        Conv2dOutput_1 = T.allocate([64], "int32", "global")
        for ff_1 in T.serial(0, 64):
            # Init then reduce over the 3x3 window and 64 input channels.
            T.store(Conv2dOutput_1, ff_1, 0, True)
            for ry, rx, rc_1 in T.grid(3, 3, 64):
                T.store(
                    Conv2dOutput_1, ff_1,
                    T.load("int32", Conv2dOutput_1, ff_1) + T.cast(
                        T.load(
                            "int16", PaddedInput_1,
                            T.floordiv(ax0_ax1_fused_ax2_fused_1, 75) * 4928 + ry * 4928 +
                            rx * 64 + T.floormod(ax0_ax1_fused_ax2_fused_1, 75) * 64 + rc_1),
                        "int32") * T.cast(
                            T.load("int16", placeholder_14.data,
                                   ry * 12288 + rx * 4096 + rc_1 * 64 + ff_1), "int32"),
                    True)
        for ax3_inner_2 in T.serial(0, 64):
            # Requantize: add bias, fixed-point multiply, clip to [0, 255], cast.
            T.store(
                T_cast_5.data, ax0_ax1_fused_ax2_fused_1 * 64 + ax3_inner_2,
                T.cast(
                    T.cast(
                        T.max(
                            T.min(
                                T.q_multiply_shift(
                                    T.load("int32", Conv2dOutput_1, ax3_inner_2) +
                                    T.load("int32", placeholder_15.data, ax3_inner_2),
                                    1608879842, 31, -7, dtype="int32"), 255), 0),
                        "uint8"), "int16"), True)
def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast(
        placeholder_62: T.handle, placeholder_63: T.handle, placeholder_64: T.handle,
        T_cast_20: T.handle) -> None:
    # Fused 7x7 stride-2 conv2d (224x224x3 -> 112x112x64) + bias +
    # requantize; output clipped to [0, 255] and cast to uint8.
    # function attr dict
    T.func_attr({
        "global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast",
        "tir.noalias": True
    })
    placeholder_65 = T.match_buffer(placeholder_62, [1, 224, 224, 3],
                                    dtype="int16", elem_offset=0, align=128,
                                    offset_factor=1)
    placeholder_66 = T.match_buffer(placeholder_63, [7, 7, 3, 64],
                                    dtype="int16", elem_offset=0, align=128,
                                    offset_factor=1)
    placeholder_67 = T.match_buffer(placeholder_64, [1, 1, 1, 64],
                                    dtype="int32", elem_offset=0, align=128,
                                    offset_factor=1)
    T_cast_21 = T.match_buffer(T_cast_20, [1, 112, 112, 64],
                               dtype="uint8", elem_offset=0, align=128,
                               offset_factor=1)
    # body
    # Padded input: 229 * 229 * 3 = 157323 elements (pad 2/3 for the 7x7 kernel).
    PaddedInput_7 = T.allocate([157323], "int16", "global")
    for i0_i1_fused_7 in T.serial(0, 229):
        for i2_7, i3_7 in T.grid(229, 3):
            T.store(
                PaddedInput_7, (((i0_i1_fused_7 * 687) + (i2_7 * 3)) + i3_7),
                T.if_then_else(
                    ((((2 <= i0_i1_fused_7) and (i0_i1_fused_7 < 226)) and (2 <= i2_7))
                     and (i2_7 < 226)),
                    T.load("int16", placeholder_65.data,
                           ((((i0_i1_fused_7 * 672) + (i2_7 * 3)) + i3_7) - 1350)),
                    T.int16(0),
                    dtype="int16"), True)
    for ax0_ax1_fused_ax2_fused_7 in T.serial(0, 12544):
        Conv2dOutput_7 = T.allocate([64], "int32", "global")
        for ff_3 in T.serial(0, 64):
            # Init then reduce over the 7x7 window and 3 input channels;
            # strides 1374/6 in the input index encode the stride-2 conv.
            T.store(Conv2dOutput_7, ff_3, 0, True)
            for ry_2, rx_2, rc_7 in T.grid(7, 7, 3):
                T.store(
                    Conv2dOutput_7, ff_3,
                    (T.load("int32", Conv2dOutput_7, ff_3) +
                     (T.cast(
                         T.load("int16", PaddedInput_7,
                                (((((T.floordiv(ax0_ax1_fused_ax2_fused_7, 112) * 1374) +
                                    (ry_2 * 687)) +
                                   (T.floormod(ax0_ax1_fused_ax2_fused_7, 112) * 6)) +
                                  (rx_2 * 3)) + rc_7)), "int32") *
                      T.cast(
                          T.load("int16", placeholder_66.data,
                                 ((((ry_2 * 1344) + (rx_2 * 192)) + (rc_7 * 64)) + ff_3)),
                          "int32"))), True)
        for ax3_inner_7 in T.serial(0, 64):
            # Requantize: add bias, fixed-point multiply, clip, cast to uint8.
            T.store(
                T_cast_21.data, ((ax0_ax1_fused_ax2_fused_7 * 64) + ax3_inner_7),
                T.cast(
                    T.max(
                        T.min(
                            T.q_multiply_shift(
                                (T.load("int32", Conv2dOutput_7, ax3_inner_7) +
                                 T.load("int32", placeholder_67.data, ax3_inner_7)),
                                1939887962, 31, -9, dtype="int32"), 255), 0), "uint8"),
                True)
def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_add_clip_cast_cast_subtract_fixed_point_15934180698220515269_(
        placeholder_16: T.handle, placeholder_17: T.handle, placeholder_18: T.handle,
        T_add: T.handle) -> None:
    # Fused 1x1 conv2d (64 -> 256 channels) + bias + double requantize chain
    # (clip/cast to uint8, re-center by 132, second fixed-point multiply,
    # offset by 136), producing an int32 output.
    # function attr dict
    T.func_attr({
        "global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_add_clip_cast_cast_subtract_fixed_point_15934180698220515269_",
        "tir.noalias": True
    })
    placeholder_19 = T.match_buffer(placeholder_16, [1, 75, 75, 64], dtype="int16")
    placeholder_20 = T.match_buffer(placeholder_17, [1, 1, 64, 256], dtype="int16")
    placeholder_21 = T.match_buffer(placeholder_18, [1, 1, 1, 256], dtype="int32")
    T_add_1 = T.match_buffer(T_add, [1, 75, 75, 256], dtype="int32")
    # body
    # 1x1 kernel: the "padding" stage is a straight copy (75*75*64 = 360000).
    PaddedInput_2 = T.allocate([360000], "int16", "global")
    for i0_i1_fused_2, i2_2, i3_2 in T.grid(75, 75, 64):
        T.store(
            PaddedInput_2, i0_i1_fused_2 * 4800 + i2_2 * 64 + i3_2,
            T.load("int16", placeholder_19.data, i0_i1_fused_2 * 4800 + i2_2 * 64 + i3_2),
            True)
    for ax0_ax1_fused_ax2_fused_2 in T.serial(0, 5625):
        # 256 output channels processed as 4 blocks of 64.
        Conv2dOutput_2 = T.allocate([64], "int32", "global")
        for ax3_outer_1 in T.serial(0, 4):
            for ff_2 in T.serial(0, 64):
                T.store(Conv2dOutput_2, ff_2, 0, True)
                for rc_2 in T.serial(0, 64):
                    T.store(
                        Conv2dOutput_2, ff_2,
                        T.load("int32", Conv2dOutput_2, ff_2) + T.cast(
                            T.load("int16", PaddedInput_2,
                                   ax0_ax1_fused_ax2_fused_2 * 64 + rc_2), "int32") *
                        T.cast(
                            T.load("int16", placeholder_20.data,
                                   rc_2 * 256 + ax3_outer_1 * 64 + ff_2), "int32"), True)
            for ax3_inner_3 in T.serial(0, 64):
                # First requantize to uint8 range (+132 zero point), then undo
                # the zero point and apply a second fixed-point rescale (+136).
                T.store(
                    T_add_1.data,
                    ax0_ax1_fused_ax2_fused_2 * 256 + ax3_outer_1 * 64 + ax3_inner_3,
                    T.q_multiply_shift(
                        T.cast(
                            T.cast(
                                T.max(
                                    T.min(
                                        T.q_multiply_shift(
                                            T.load("int32", Conv2dOutput_2, ax3_inner_3) +
                                            T.load("int32", placeholder_21.data,
                                                   ax3_outer_1 * 64 + ax3_inner_3),
                                            1711626602, 31, -8, dtype="int32") + 132,
                                        255), 0), "uint8"), "int32") - 132,
                        2094289803, 31, -2, dtype="int32") + 136, True)
def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_add_clip_cast_cast_subtract_fixed_point_4200876283395191415_(
        placeholder_22: T.handle, placeholder_23: T.handle, placeholder_24: T.handle,
        placeholder_25: T.handle, T_cast_6: T.handle) -> None:
    # Fused 1x1 conv2d (64 -> 256 channels) + bias + requantize, followed by
    # a residual add with placeholder_28 and a final clip/cast to uint8.
    # function attr dict
    T.func_attr({
        "global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_add_clip_cast_cast_subtract_fixed_point_4200876283395191415_",
        "tir.noalias": True
    })
    placeholder_29 = T.match_buffer(placeholder_22, [1, 75, 75, 64], dtype="int16")
    placeholder_27 = T.match_buffer(placeholder_23, [1, 1, 64, 256], dtype="int16")
    placeholder_26 = T.match_buffer(placeholder_24, [1, 1, 1, 256], dtype="int32")
    placeholder_28 = T.match_buffer(placeholder_25, [1, 75, 75, 256], dtype="int32")
    T_cast_7 = T.match_buffer(T_cast_6, [1, 75, 75, 256], dtype="uint8")
    # body
    # 1x1 kernel: the "padding" stage is a straight copy (75*75*64 = 360000).
    PaddedInput_3 = T.allocate([360000], "int16", "global")
    for i0_i1_fused_3, i2_3, i3_3 in T.grid(75, 75, 64):
        T.store(
            PaddedInput_3, i0_i1_fused_3 * 4800 + i2_3 * 64 + i3_3,
            T.load("int16", placeholder_29.data, i0_i1_fused_3 * 4800 + i2_3 * 64 + i3_3),
            True)
    for ax0_ax1_fused_ax2_fused_3 in T.serial(0, 5625):
        # 256 output channels processed as 4 blocks of 64.
        Conv2dOutput_3 = T.allocate([64], "int32", "global")
        for ax3_outer_2 in T.serial(0, 4):
            for ff_3 in T.serial(0, 64):
                T.store(Conv2dOutput_3, ff_3, 0, True)
                for rc_3 in T.serial(0, 64):
                    T.store(
                        Conv2dOutput_3, ff_3,
                        T.load("int32", Conv2dOutput_3, ff_3) + T.cast(
                            T.load("int16", PaddedInput_3,
                                   ax0_ax1_fused_ax2_fused_3 * 64 + rc_3), "int32") *
                        T.cast(
                            T.load("int16", placeholder_27.data,
                                   rc_3 * 256 + ax3_outer_2 * 64 + ff_3), "int32"), True)
            for ax3_inner_4 in T.serial(0, 64):
                # Requantize (+136 zero point), undo the zero point, rescale,
                # add the residual input, then clip to [0, 255] and cast.
                T.store(
                    T_cast_7.data,
                    ax0_ax1_fused_ax2_fused_3 * 256 + ax3_outer_2 * 64 + ax3_inner_4,
                    T.cast(
                        T.max(
                            T.min(
                                T.q_multiply_shift(
                                    T.cast(
                                        T.cast(
                                            T.max(
                                                T.min(
                                                    T.q_multiply_shift(
                                                        T.load("int32", Conv2dOutput_3,
                                                               ax3_inner_4) +
                                                        T.load("int32", placeholder_26.data,
                                                               ax3_outer_2 * 64 + ax3_inner_4),
                                                        1343014664, 31, -8,
                                                        dtype="int32") + 136, 255), 0),
                                            "uint8"), "int32") - 136,
                                    1073903788, 31, 1, dtype="int32") +
                                T.load("int32", placeholder_28.data,
                                       ax0_ax1_fused_ax2_fused_3 * 256 +
                                       ax3_outer_2 * 64 + ax3_inner_4), 255), 0),
                        "uint8"), True)