def buffer_opaque_access(b: ty.handle, c: ty.handle) -> None:
    # TVM script fixture: a scratch allocation A is accessed only through raw
    # tir.store / tir.load, B.data is handed to an intrinsic call, and B is
    # then copied element-wise into C.
    B = tir.match_buffer(b, [16, 16], "float32")
    C = tir.match_buffer(c, [16, 16], "float32")
    with tir.block([]):
        # Access regions are declared by hand: nothing read, all of B written.
        tir.reads([])
        tir.writes(B[0:16, 0:16])
        # Flat 256-element allocation, indexed below as i * 16 + j.
        A = tir.allocate([256], "float32", "global")
        for i, j in tir.grid(16, 16):
            tir.store(A, i * 16 + j, 1)
        for i in range(0, 16):
            for j in range(0, 16):
                tir.evaluate(tir.load("float32", A, i * 16 + j))
            # This call uses neither i nor j; only B.data and constants.
            for j in range(0, 16):
                tir.evaluate(
                    tir.tvm_fill_fragment(B.data, 16, 16, 16, 0, tir.float32(0), dtype="handle"))
    for i, j in tir.grid(16, 16):
        with tir.block([16, 16]) as [vi, vj]:
            tir.bind(vi, i)
            tir.bind(vj, j)
            C[vi, vj] = B[vi, vj]
def transformed_recursive_match(a: ty.handle, b: ty.handle) -> None:
    # Two-level tiling over 64x64x64 buffers A and B written with direct
    # indexing: no sub-buffer is matched, every access goes through A or B.
    # NOTE(review): the name suggests this is the expected lowered form of
    # `recursive_match` -- confirm against the test that pairs them.
    A = tir.match_buffer(a, (64, 64, 64))
    B = tir.match_buffer(b, (64, 64, 64))
    for i, j, k in tir.grid(64, 4, 4):
        # Outer block: one 16x16 tile of plane i in both A and B.
        with tir.block([]):
            tir.reads([])
            tir.writes([
                A[i, j * 16:j * 16 + 16, k * 16:k * 16 + 16],
                B[i, j * 16:j * 16 + 16, k * 16:k * 16 + 16],
            ])
            for jj, kk in tir.grid(4, 4):
                # Inner block: one 4x4 sub-tile of the outer tile.
                with tir.block([]):
                    tir.reads([])
                    tir.writes([
                        A[i, j * 16 + jj * 4:j * 16 + jj * 4 + 4,
                          k * 16 + kk * 4:k * 16 + kk * 4 + 4, ],
                        B[i, j * 16 + jj * 4:j * 16 + jj * 4 + 4,
                          k * 16 + kk * 4:k * 16 + kk * 4 + 4, ],
                    ])
                    # The second argument equals the flat offset of
                    # A[i, j*16 + jj*4, k*16 + kk*4] assuming a compact
                    # row-major (64, 64, 64) layout (4096 = 64*64,
                    # 1024 = 16*64, 256 = 4*64).
                    tir.evaluate(
                        tir.intrin_test(
                            A.data,
                            i * 4096 + j * 1024 + jj * 256 + k * 16 + kk * 4,
                            64,
                            1,
                            4,
                            4,
                            dtype="handle",
                        ))
                    for jjj, kkk in tir.grid(4, 4):
                        B[i, j * 16 + jj * 4 + jjj, k * 16 + kk * 4 + kkk] = 1
def transformed_opaque_access(a: ty.handle, b: ty.handle) -> None:
    # Two independent loop nests, each passing a buffer's data handle plus
    # literal offset/stride/extent values to tir.intrin_test.
    # NOTE(review): the name suggests this is the expected lowered form of the
    # two-loop `opaque_access` fixture that uses match_buffer -- confirm.
    A = tir.match_buffer(a, (32, 64, 128))
    B = tir.match_buffer(b, (64, 64, 64))
    for i, j, k in tir.grid(2, 64, 8):
        with tir.block([]):
            tir.reads([])
            tir.writes(A[i * 16:i * 16 + 16, j, k * 16:k * 16 + 16])
            # Offset equals the flat index of A[i*16, j, k*16] assuming a
            # compact row-major (32, 64, 128) layout: 131072 = 16 * 64 * 128,
            # 8192 = 64 * 128.
            tir.evaluate(
                tir.intrin_test(
                    A.data,
                    i * 131072 + j * 128 + k * 16,
                    8192,
                    128,
                    16,
                    1,
                    dtype="handle",
                ))
    for i, j, k in tir.grid(64, 2, 8):
        with tir.block([]):
            tir.reads([])
            tir.writes(B[i, j * 32:j * 32 + 32, k * 8:k * 8 + 8])
            # Offset equals the flat index of B[i, j*32, k*8] assuming a
            # compact row-major (64, 64, 64) layout: 4096 = 64 * 64,
            # 2048 = 32 * 64.
            tir.evaluate(
                tir.intrin_test(
                    B.data,
                    i * 4096 + j * 2048 + k * 8,
                    64,
                    1,
                    32,
                    8,
                    dtype="handle",
                ))
def symbolic_match(a: ty.handle, b: ty.handle, n: ty.int32, m: ty.int32) -> None:
    # match_buffer fixture whose buffer shapes and regions depend on the
    # symbolic int32 parameters n and m.
    A = tir.match_buffer(a, (n * m, m))
    B = tir.match_buffer(b, (n * 2, m * 4))
    for i in range(0, n):
        with tir.block([]):
            tir.reads([])
            # NOTE(review): the declared write region of A spans
            # i * m : i * m + n, while sub_A below matches i * m : i * m + m.
            # Left untouched; confirm whether the mismatch is intentional.
            tir.writes([A[i * m:i * m + n, 0:m], B[i * n:i * n + 2, 0:m * 4]])
            # Fresh int32 variables used as the strides of sub_B.
            Bs_0 = tir.var("int32")
            Bs_1 = tir.var("int32")
            sub_A = tir.match_buffer(A[i * m:i * m + m, 0:m], (m, m), offset_factor=1)
            sub_B = tir.match_buffer(B[i * n:i * n + 2, 0:m * 4], (2, m * 4), strides=[Bs_0, Bs_1], offset_factor=1)
            for ii, jj in tir.grid(m, m):
                sub_A[ii, jj] = 1
            # The loop variable j is not used by the call below.
            for j in range(0, 4):
                tir.evaluate(
                    tir.intrin_test(
                        sub_B.data,
                        sub_B.elem_offset,
                        sub_B.strides[0],
                        sub_B.strides[1],
                        sub_B.shape[0],
                        sub_B.shape[1],
                        dtype="handle",
                    ))
def transformed_opaque_access(a: ty.handle, b: ty.handle) -> None:
    # 1-D fixture over 1024-element buffers processed in 8 chunks of 128.
    # A_cache is allocated inside the outer block; it is passed to an extern
    # call together with A, then copied element-wise into B.
    # NOTE(review): shares its name with another fixture in this source;
    # presumably they originate from different test modules -- confirm.
    A = tir.match_buffer(a, [1024])
    B = tir.match_buffer(b, [1024])
    for i in tir.serial(0, 8):
        # No explicit tir.bind for vi here.
        with tir.block([8]) as [vi]:
            tir.reads(A[vi * 128:vi * 128 + 128])
            tir.writes(B[vi * 128:vi * 128 + 128])
            A_cache = tir.alloc_buffer([1024])
            with tir.block([8]) as [v]:
                tir.bind(v, vi)
                tir.reads([A[v * 128:v * 128 + 128]])
                tir.writes([A_cache[v * 128:v * 128 + 128]])
                # Opaque call: receives both data handles with the same
                # (v * 128, 128) offset/extent pair.
                tir.evaluate(
                    tir.call_extern("test", A_cache.data, v * 128, 128, A.data, v * 128, 128, dtype="float32"))
            for j in tir.serial(0, 128):
                with tir.block([1024]) as [v]:
                    tir.bind(v, ((vi * 128) + j))
                    tir.reads([A_cache[v]])
                    tir.writes([B[v]])
                    B[v] = A_cache[v]
def fail_match_load(a: ty.handle) -> None:
    # Matches a rank-0 sub-buffer of A[i, j] and then reads it through a raw
    # tir.load on sub_A.data instead of through sub_A itself.
    # NOTE(review): the `fail_` prefix suggests this is expected to be
    # rejected by a pass under test -- confirm; do not "fix" the body.
    A = tir.match_buffer(a, (8, 8))
    for i, j in tir.grid(8, 8):
        with tir.block([]):
            tir.reads(A[i, j])
            tir.writes([])
            sub_A = tir.match_buffer(A[i, j], ())
            tir.evaluate(tir.load("float32", sub_A.data, 0))
def recursive_match(a: ty.handle, b: ty.handle) -> None:
    # Nested match_buffer fixture: 16x16 sub-buffers are matched out of
    # 64x64x64 buffers A and B, and 4x4 sub-sub-buffers are matched out of
    # those.  The A-side matches use symbolic strides, the B-side matches
    # specify none.
    A = tir.match_buffer(a, (64, 64, 64))
    B = tir.match_buffer(b, (64, 64, 64))
    for i, j, k in tir.grid(64, 4, 4):
        with tir.block([]):
            tir.reads([])
            tir.writes([
                A[i, j * 16:j * 16 + 16, k * 16:k * 16 + 16],
                B[i, j * 16:j * 16 + 16, k * 16:k * 16 + 16],
            ])
            # Fresh int32 variables used as the strides of sub_A.
            As_0 = tir.var("int32")
            As_1 = tir.var("int32")
            sub_A = tir.match_buffer(
                A[i, j * 16:j * 16 + 16, k * 16:k * 16 + 16],
                (16, 16),
                strides=[As_0, As_1],
                offset_factor=1,
            )
            sub_B = tir.match_buffer(
                B[i, j * 16:j * 16 + 16, k * 16:k * 16 + 16],
                (16, 16),
                offset_factor=1,
            )
            for jj, kk in tir.grid(4, 4):
                with tir.block([]):
                    tir.reads([])
                    # Regions are expressed relative to the 16x16 sub-buffers.
                    tir.writes([
                        sub_A[jj * 4:jj * 4 + 4, kk * 4:kk * 4 + 4],
                        sub_B[jj * 4:jj * 4 + 4, kk * 4:kk * 4 + 4],
                    ])
                    # Fresh int32 variables used as the strides of sub_sub_A.
                    Ass_0 = tir.var("int32")
                    Ass_1 = tir.var("int32")
                    sub_sub_A = tir.match_buffer(
                        sub_A[jj * 4:jj * 4 + 4, kk * 4:kk * 4 + 4],
                        (4, 4),
                        strides=[Ass_0, Ass_1],
                        offset_factor=1,
                    )
                    sub_sub_B = tir.match_buffer(
                        sub_B[jj * 4:jj * 4 + 4, kk * 4:kk * 4 + 4],
                        (4, 4),
                        offset_factor=1,
                    )
                    # The innermost A tile is only touched through this
                    # intrinsic call, via its data/offset/strides/shape fields.
                    tir.evaluate(
                        tir.intrin_test(
                            sub_sub_A.data,
                            sub_sub_A.elem_offset,
                            sub_sub_A.strides[0],
                            sub_sub_A.strides[1],
                            sub_sub_A.shape[0],
                            sub_sub_A.shape[1],
                            dtype="handle",
                        ))
                    # The innermost B tile is written with ordinary stores.
                    for jjj, kkk in tir.grid(4, 4):
                        sub_sub_B[jjj, kkk] = 1
def opaque_access_func() -> None:
    # Two 1024-element buffers allocated at function scope; each of the 8
    # block instances passes a 128-element chunk of both to an extern call.
    A = tir.alloc_buffer([1024])
    B = tir.alloc_buffer([1024])
    for i in tir.serial(0, 8):
        with tir.block([8]) as [v]:
            tir.bind(v, i)
            tir.reads([A[v * 128 : v * 128 + 128]])
            tir.writes([B[v * 128 : v * 128 + 128]])
            # Opaque call: the buffers are referenced only via their .data
            # handles plus (offset, extent) arguments.
            tir.evaluate(
                tir.call_extern("test", B.data, v * 128, 128, A.data, v * 128, 128, dtype="float32")
            )
def match_buffer_func(a: ty.handle, b: ty.handle) -> None:
    # match_buffer fixture over 128x128 float32 buffers.  The outer block
    # declares its regions explicitly; the nested "AAA" block declares none.
    A = tir.match_buffer(a, (128, 128), "float32")
    B = tir.match_buffer(b, (128, 128), "float32")
    with tir.block([8, 8], "block") as [vi, vj]:
        tir.reads(B[vi * 16 + 2 : vi * 16 + 12, vj * 16 + 2 : vj * 16 + 16])
        tir.writes(A[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16])
        # Two separate sub-regions of B (4x4 and 4x8), both inside the read
        # region declared above.
        B0 = tir.match_buffer(B[vi * 16 + 2 : vi * 16 + 6, vj * 16 + 2 : vj * 16 + 6], (4, 4))
        B1 = tir.match_buffer(B[vi * 16 + 8 : vi * 16 + 12, vj * 16 + 8 : vj * 16 + 16], (4, 8))
        with tir.block([16, 16], "AAA") as [i, j]:
            # Rank-0 view of a single element of A, written via AA[()].
            AA = tir.match_buffer(A[i, j], ())
            AA[()] = 1.0
        # B0 / B1 are referenced only through their data handles.
        tir.evaluate(B0.data)
        tir.evaluate(B1.data)
def tir_packed_call() -> None:
    # Passes three handle variables directly to tir.tvm_call_cpacked.
    # NOTE(review): another `tir_packed_call` in this source sets up explicit
    # stack values first; presumably the two form a before/after pair from
    # different modules -- confirm.
    A = tir.var("handle")
    B = tir.var("handle")
    C = tir.var("handle")
    # body
    tir.evaluate(
        tir.tvm_call_cpacked(
            "tvm_test_cpacked",
            A,
            B,
            C,
            dtype="int32",
        ))
def func() -> None:
    # Fixture with hand-written block access regions over four 128x128
    # float32 buffers.  D is referenced only through D.data at the end.
    A = tir.alloc_buffer((128, 128), "float32")
    B = tir.alloc_buffer((128, 128), "float32")
    C = tir.alloc_buffer((128, 128), "float32")
    D = tir.alloc_buffer((128, 128), "float32")
    with tir.block([]):
        # Read/write regions are added manually to avoid triggering the block
        # access region detector.
        tir.reads([B[0, 0], C[0:16, 0:16], A[4:12, 4:12]])
        tir.writes([A[0:12, 0:12]])
        for i, j in tir.grid(8, 8):
            A[i, j] = B[0, 0] + C[0, 0]
        with tir.block([2, 2]) as [vi, vj]:
            # Each instance updates one 4x4 tile of A starting at
            # (vi * 4 + 4, vj * 4 + 4) using C[12:16, 12:16].
            tir.reads([A[vi * 4 + 4 : vi * 4 + 8, vj * 4 + 4 : vj * 4 + 8], C[12:16, 12:16]])
            tir.writes([A[vi * 4 + 4 : vi * 4 + 8, vj * 4 + 4 : vj * 4 + 8]])
            for i, j in tir.grid(4, 4):
                A[vi * 4 + 4 + i, vj * 4 + 4 + j] += C[i + 12, j + 12]
        # D does not appear in the declared regions above.
        tir.evaluate(D.data)
def opaque_access(a: ty.handle, b: ty.handle) -> None:
    # match_buffer fixture: each loop nest matches a sub-region and passes the
    # sub-buffer's data/elem_offset/strides/shape fields to tir.intrin_test.
    # The first match uses constant strides, the second symbolic ones.
    A = tir.match_buffer(a, (32, 64, 128))
    B = tir.match_buffer(b, (64, 64, 64))
    for i, j, k in tir.grid(2, 64, 8):
        with tir.block([]):
            tir.reads([])
            tir.writes(A[i * 16:i * 16 + 16, j, k * 16:k * 16 + 16])
            # 3-D view (16, 1, 16) with explicit strides; 8192 = 64 * 128 and
            # 128 are the two outer strides of a compact (32, 64, 128) layout.
            sub_A = tir.match_buffer(
                A[i * 16:i * 16 + 16, j, k * 16:k * 16 + 16],
                (16, 1, 16),
                strides=[8192, 128, 1],
                offset_factor=1,
            )
            # Only the first two strides and shape entries are passed on.
            tir.evaluate(
                tir.intrin_test(
                    sub_A.data,
                    sub_A.elem_offset,
                    sub_A.strides[0],
                    sub_A.strides[1],
                    sub_A.shape[0],
                    sub_A.shape[1],
                    dtype="handle",
                ))
    for i, j, k in tir.grid(64, 2, 8):
        with tir.block([]):
            # Fresh int32 variables used as the strides of sub_B; declared
            # before the reads/writes annotations in this block.
            Bs_0 = tir.var("int32")
            Bs_1 = tir.var("int32")
            tir.reads([])
            tir.writes(B[i, j * 32:j * 32 + 32, k * 8:k * 8 + 8])
            # 2-D view (32, 8): the leading index i is dropped from the shape.
            sub_B = tir.match_buffer(
                B[i, j * 32:j * 32 + 32, k * 8:k * 8 + 8],
                (32, 8),
                strides=[Bs_0, Bs_1],
                offset_factor=1,
            )
            tir.evaluate(
                tir.intrin_test(
                    sub_B.data,
                    sub_B.elem_offset,
                    sub_B.strides[0],
                    sub_B.strides[1],
                    sub_B.shape[0],
                    sub_B.shape[1],
                    dtype="handle",
                ))
def opaque_access(a: ty.handle, b: ty.handle) -> None:
    # Two 16x16 blocks that touch their buffers only through .data handles.
    # Each block instance declares the whole buffer as its write region.
    A = tir.match_buffer(a, [16, 16], "float32")
    B = tir.match_buffer(b, [16, 16], "float32")
    with tir.block([16, 16], "A") as [vi, vj]:
        tir.reads([])
        tir.writes([A[0:16, 0:16]])
        # Raw store at flat index vi * 16 + vj.
        tir.store(A.data, vi * 16 + vj, 1)
    with tir.block([16, 16], "B") as [vi, vj]:
        tir.reads([])
        tir.writes([B[0:16, 0:16]])
        tir.evaluate(
            tir.tvm_fill_fragment(B.data, 16, 16, 16, 0, vi * 16 + vj, dtype="handle"))
def transformed_rank0_buffer(a: ty.handle, b: ty.handle) -> None:
    # Direct-indexing fixture over 8x8 buffers with no match_buffer inside the
    # block.
    # NOTE(review): the name suggests this is the expected lowered form of
    # `rank0_buffer` -- confirm against the test that pairs them.
    A = tir.match_buffer(a, (8, 8))
    B = tir.match_buffer(b, (8, 8))
    for i, j in tir.grid(8, 8):
        with tir.block([]):
            tir.reads([])
            tir.writes([A[i, j], B[i, j]])
            A[i, j] = 1
            # i * 8 + j is the flat index of B[i, j] assuming a compact
            # row-major (8, 8) layout; the remaining arguments are all 0.
            tir.evaluate(
                tir.intrin_test(
                    B.data,
                    i * 8 + j,
                    0,
                    0,
                    0,
                    0,
                    dtype="handle",
                ))
def tir_extern(a: ty.handle, b: ty.handle, c: ty.handle) -> None:
    # Single block "C" that invokes the packed function
    # "tvm.contrib.cblas.matmul" on three 128x128 buffers.  The block declares
    # A and B fully read and C fully written.
    A = tir.match_buffer(a, (128, 128))
    B = tir.match_buffer(b, (128, 128))
    C = tir.match_buffer(c, (128, 128))
    # body
    with tir.block([], "C"):
        tir.reads([A[0:128, 0:128], B[0:128, 0:128]])
        tir.writes([C[0:128, 0:128]])
        # Each buffer is wrapped by tir.tvm_stack_make_array with an identical
        # argument list apart from the data handle.
        tir.evaluate(
            tir.tvm_call_packed(
                "tvm.contrib.cblas.matmul",
                tir.tvm_stack_make_array(
                    A.data,
                    tir.tvm_stack_make_shape(128, 128, dtype="handle"),
                    0,
                    2,
                    0.0,
                    0,
                    dtype="handle",
                ),
                tir.tvm_stack_make_array(
                    B.data,
                    tir.tvm_stack_make_shape(128, 128, dtype="handle"),
                    0,
                    2,
                    0.0,
                    0,
                    dtype="handle",
                ),
                tir.tvm_stack_make_array(
                    C.data,
                    tir.tvm_stack_make_shape(128, 128, dtype="handle"),
                    0,
                    2,
                    0.0,
                    0,
                    dtype="handle",
                ),
                0,
                0,
                dtype="int32",
            )
        )
def rank0_buffer(a: ty.handle, b: ty.handle) -> None:
    # match_buffer fixture using rank-0 (shape ()) views of single elements of
    # the 8x8 buffers A and B.
    A = tir.match_buffer(a, (8, 8))
    B = tir.match_buffer(b, (8, 8))
    for i, j in tir.grid(8, 8):
        with tir.block([]):
            tir.reads([])
            tir.writes([A[i, j], B[i, j]])
            sub_A = tir.match_buffer(A[i, j], (), offset_factor=1)
            sub_B = tir.match_buffer(B[i, j], (), offset_factor=1)
            # A rank-0 buffer is indexed with the empty tuple.
            sub_A[()] = 1
            # sub_B is used only via data/elem_offset; the stride and shape
            # slots are passed as literal zeros.
            tir.evaluate(
                tir.intrin_test(
                    sub_B.data,
                    sub_B.elem_offset,
                    0,
                    0,
                    0,
                    0,
                    dtype="handle",
                ))
def match_buffer_func() -> None:
    # Variant with an explicit "root" block that owns the buffer allocations;
    # every block here declares its read/write regions by hand.
    with tir.block([], "root"):
        A = tir.alloc_buffer((128, 128), "float32")
        B = tir.alloc_buffer((128, 128), "float32")
        tir.reads([])
        tir.writes([])
        # Read/write regions are added manually to avoid triggering the block
        # access region detector.
        with tir.block([8, 8], "block") as [vi, vj]:
            tir.reads(B[vi * 16 + 2 : vi * 16 + 12, vj * 16 + 2 : vj * 16 + 16])
            tir.writes(A[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16])
            # AA views this block's full 16x16 write tile of A; B0 and B1 view
            # two sub-regions of the declared read region of B.
            AA = tir.match_buffer(A[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16], (16, 16))
            B0 = tir.match_buffer(B[vi * 16 + 2 : vi * 16 + 6, vj * 16 + 2 : vj * 16 + 6], (4, 4))
            B1 = tir.match_buffer(B[vi * 16 + 8 : vi * 16 + 12, vj * 16 + 8 : vj * 16 + 16], (4, 8))
            with tir.block([16, 16], "AAA") as [i, j]:
                tir.reads([])
                # The region is expressed on the matched buffer AA, not on A.
                tir.writes(AA[i, j])
                # Second-level match: rank-0 view of one element of AA.
                AAA = tir.match_buffer(AA[i, j], ())
                AAA[()] = 1.0
            # B0 / B1 are referenced only through their data handles.
            tir.evaluate(B0.data)
            tir.evaluate(B1.data)
def transformed_symbolic_match(a: ty.handle, b: ty.handle, n: ty.int32, m: ty.int32) -> None:
    # Direct-indexing fixture with shapes depending on the symbolic int32
    # parameters n and m; no match_buffer inside the block.
    # NOTE(review): the name suggests this is the expected lowered form of
    # `symbolic_match` -- confirm against the test that pairs them.
    A = tir.match_buffer(a, (n * m, m))
    B = tir.match_buffer(b, (n * 2, m * 4))
    for i in range(0, n):
        with tir.block([]):
            tir.reads([])
            tir.writes([A[i * m:i * m + n, 0:m], B[i * n:i * n + 2, 0:m * 4]])
            for ii, jj in tir.grid(m, m):
                A[i * m + ii, jj] = 1
            # The loop variable j is not used by the call below.
            for j in range(0, 4):
                # i * n * (m * 4) is the flat index of B[i * n, 0] assuming a
                # compact row-major (n * 2, m * 4) layout; (m * 4, 1) and
                # (2, m * 4) match that layout's strides and the written
                # region's extents.
                tir.evaluate(
                    tir.intrin_test(
                        B.data,
                        i * n * (m * 4),
                        m * 4,
                        1,
                        2,
                        m * 4,
                        dtype="handle",
                    ))
def opaque_access_split(a: ty.handle, b: ty.handle) -> None:
    # Same two blocks "A" and "B" as the 16x16 `opaque_access` fixture, but
    # each sits under a 16 x 4 x 4 loop nest with vj bound to j0 * 4 + j1.
    # NOTE(review): presumably the expected result of splitting the j loop by
    # a factor of 4 -- confirm against the schedule test that uses it.
    A = tir.match_buffer(a, (16, 16))
    B = tir.match_buffer(b, (16, 16))
    for i, j0, j1 in tir.grid(16, 4, 4):
        with tir.block([16, 16], "A") as [vi, vj]:
            tir.bind(vi, i)
            tir.bind(vj, ((j0 * 4) + j1))
            tir.reads([])
            tir.writes([A[0:16, 0:16]])
            # Four-argument tir.store; the trailing 1 is presumably the
            # predicate -- confirm.
            tir.store(A.data, ((vi * 16) + vj), 1, 1)
    for i, j0, j1 in tir.grid(16, 4, 4):
        with tir.block([16, 16], "B") as [vi, vj]:
            tir.bind(vi, i)
            tir.bind(vj, ((j0 * 4) + j1))
            tir.reads([])
            tir.writes([B[0:16, 0:16]])
            tir.evaluate(
                tir.tvm_fill_fragment(B.data, 16, 16, 16, 0, ((vi * 16) + vj), dtype="handle"))
def opaque_access_fused(a: ty.handle, b: ty.handle) -> None:
    # Same two blocks "A" and "B" as the 16x16 `opaque_access` fixture, but
    # each sits under one 256-iteration loop; vi and vj are recovered with
    # floordiv / floormod by 16.
    # NOTE(review): presumably the expected result of fusing the i and j
    # loops -- confirm against the schedule test that uses it.
    A = tir.match_buffer(a, [16, 16])
    B = tir.match_buffer(b, [16, 16])
    for i_j_fused in tir.serial(0, 256):
        with tir.block([16, 16], "A") as [vi, vj]:
            tir.bind(vi, tir.floordiv(i_j_fused, 16))
            tir.bind(vj, tir.floormod(i_j_fused, 16))
            tir.reads([])
            tir.writes([A[0:16, 0:16]])
            # Four-argument tir.store; the trailing 1 is presumably the
            # predicate -- confirm.
            tir.store(A.data, ((vi * 16) + vj), 1, 1)
    for i_j_fused in tir.serial(0, 256):
        with tir.block([16, 16], "B") as [vi, vj]:
            tir.bind(vi, tir.floordiv(i_j_fused, 16))
            tir.bind(vj, tir.floormod(i_j_fused, 16))
            tir.reads([])
            tir.writes([B[0:16, 0:16]])
            tir.evaluate(
                tir.tvm_fill_fragment(B.data, 16, 16, 16, 0, ((vi * 16) + vj), dtype="handle"))
def tir_packed_call() -> None:
    # Variant of the packed-call fixture in which each argument is first
    # written into its own stack-allocated value, and those values are passed
    # to tir.tvm_call_cpacked instead of A, B and C directly.
    A = tir.var("handle")
    B = tir.var("handle")
    C = tir.var("handle")
    # body
    tvm_value_2 = tir.var("handle")
    tvm_value_1 = tir.var("handle")
    tvm_value_0 = tir.var("handle")
    # The let-bindings nest in the order 2, 1, 0; the struct_set calls and the
    # final call use the order 0, 1, 2.
    with tir.let(tvm_value_2, tir.tvm_stack_alloca("array", 1, dtype="handle")):
        with tir.let(tvm_value_1, tir.tvm_stack_alloca("array", 1, dtype="handle")):
            with tir.let(tvm_value_0, tir.tvm_stack_alloca("array", 1, dtype="handle")):
                tir.evaluate(
                    tir.tvm_struct_set(tvm_value_0, 0, 1, A, dtype="handle"))
                tir.evaluate(
                    tir.tvm_struct_set(tvm_value_1, 0, 1, B, dtype="handle"))
                tir.evaluate(
                    tir.tvm_struct_set(tvm_value_2, 0, 1, C, dtype="handle"))
                tir.evaluate(
                    tir.tvm_call_cpacked(
                        "tvm_test_cpacked",
                        tvm_value_0,
                        tvm_value_1,
                        tvm_value_2,
                        dtype="int32",
                    ))
def return_not_allowed(a: ty.handle) -> None:
    return tir.evaluate(0)  # error
    # Negative fixture: the body is a `return` statement, and the original
    # marker above flags that line as the error site.  Presumably the script
    # parser is expected to reject it -- confirm.  Do not "fix" the body.
    # Notes are kept below the statement so its line position is unchanged.
def invalid_block_function(a: ty.handle) -> None:
    A = tir.match_buffer(a, (16, 16), "float32")
    with tir.evaluate(0.0):
        pass
    # Negative fixture (per its name): tir.evaluate(...) is used as the
    # context expression of a `with` statement.  Presumably the script parser
    # is expected to reject this -- confirm.  Do not "fix" the body.
    # Notes are kept below the statements so their line positions are
    # unchanged.
def main(placeholder: ty.handle, placeholder_1: ty.handle, placeholder_2: ty.handle, ethosu_write: ty.handle, placeholder_3: ty.handle, placeholder_4: ty.handle, placeholder_5: ty.handle, placeholder_6: ty.handle, placeholder_7: ty.handle, placeholder_8: ty.handle, placeholder_9: ty.handle, placeholder_10: ty.handle) -> None:
    # Sequence of extern calls: one "ethosu_conv2d" whose second tensor
    # pointer is the scratch allocation ethosu_write_2, followed by four
    # groups of (two "ethosu_copy" calls + one "ethosu_conv2d") that read
    # ethosu_write_2 and point at ethosu_write_1.data offsets 0, 2, 4 and 6.
    # The positional arguments of the extern calls are opaque here; their
    # meaning is defined by the callee, not by this source.
    # function attr dict
    tir.func_attr({ "from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True })
    # Note that the buffer_N numbering does not follow the placeholder_N
    # numbering of the handles they are matched to.
    buffer = tir.match_buffer(placeholder_7, [80], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
    buffer_1 = tir.match_buffer(placeholder_5, [80], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
    buffer_2 = tir.match_buffer(placeholder_3, [80], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
    buffer_3 = tir.match_buffer(placeholder_4, [32], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
    buffer_4 = tir.match_buffer(placeholder_9, [80], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
    buffer_5 = tir.match_buffer(placeholder_6, [32], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
    placeholder_11 = tir.match_buffer(placeholder, [1, 16, 16, 32], dtype="int8", elem_offset=0, align=128, offset_factor=1)
    buffer_6 = tir.match_buffer(placeholder_1, [592], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
    ethosu_write_1 = tir.match_buffer(ethosu_write, [1, 16, 16, 8], dtype="int8", elem_offset=0, align=128, offset_factor=1)
    buffer_7 = tir.match_buffer(placeholder_2, [160], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
    buffer_8 = tir.match_buffer(placeholder_8, [32], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
    buffer_9 = tir.match_buffer(placeholder_10, [32], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
    # body
    # Three "global" scratch allocations: a 4096-element int8 area and two
    # uint8 areas (80 and 32 elements) that every copy call below targets.
    ethosu_write_2 = tir.allocate([4096], "int8", "global")
    placeholder_global = tir.allocate([80], "uint8", "global")
    placeholder_d_global = tir.allocate([32], "uint8", "global")
    # First conv2d call: references placeholder_11, ethosu_write_2, buffer_6
    # (with 592) and buffer_7 (with 160).
    tir.evaluate(
        tir.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, tir.load("int8", placeholder_11.data, 0), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 16, 16, 0, 16, tir.load("int8", ethosu_write_2, 0), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, tir.load("uint8", buffer_6.data, 0), 592, 12, tir.load("uint8", buffer_7.data, 0), 160, 0, 0, 0, 0, "NONE", 0, 0, "NONE", dtype="handle"))
    # Group 1: buffer_2 / buffer_3 go to the scratch areas, then a conv2d
    # call points at ethosu_write_1.data offset 0.
    tir.evaluate(
        tir.call_extern("ethosu_copy", tir.load("uint8", buffer_2.data, 0), 80, tir.load("uint8", placeholder_global, 0), dtype="handle"))
    tir.evaluate(
        tir.call_extern("ethosu_copy", tir.load("uint8", buffer_3.data, 0), 32, tir.load("uint8", placeholder_d_global, 0), dtype="handle"))
    tir.evaluate(
        tir.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, tir.load("int8", ethosu_write_2, 0), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, tir.load("int8", ethosu_write_1.data, 0), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, tir.load("uint8", placeholder_global, 0), 80, 12, tir.load("uint8", placeholder_d_global, 0), 32, 0, 0, 0, 0, "NONE", 0, 0, "NONE", dtype="handle"))
    # Group 2: buffer_1 / buffer_5, conv2d at ethosu_write_1.data offset 2.
    tir.evaluate(
        tir.call_extern("ethosu_copy", tir.load("uint8", buffer_1.data, 0), 80, tir.load("uint8", placeholder_global, 0), dtype="handle"))
    tir.evaluate(
        tir.call_extern("ethosu_copy", tir.load("uint8", buffer_5.data, 0), 32, tir.load("uint8", placeholder_d_global, 0), dtype="handle"))
    tir.evaluate(
        tir.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, tir.load("int8", ethosu_write_2, 0), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, tir.load("int8", ethosu_write_1.data, 2), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, tir.load("uint8", placeholder_global, 0), 80, 12, tir.load("uint8", placeholder_d_global, 0), 32, 0, 0, 0, 0, "NONE", 0, 0, "NONE", dtype="handle"))
    # Group 3: buffer / buffer_8, conv2d at ethosu_write_1.data offset 4.
    tir.evaluate(
        tir.call_extern("ethosu_copy", tir.load("uint8", buffer.data, 0), 80, tir.load("uint8", placeholder_global, 0), dtype="handle"))
    tir.evaluate(
        tir.call_extern("ethosu_copy", tir.load("uint8", buffer_8.data, 0), 32, tir.load("uint8", placeholder_d_global, 0), dtype="handle"))
    tir.evaluate(
        tir.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, tir.load("int8", ethosu_write_2, 0), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, tir.load("int8", ethosu_write_1.data, 4), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, tir.load("uint8", placeholder_global, 0), 80, 12, tir.load("uint8", placeholder_d_global, 0), 32, 0, 0, 0, 0, "NONE", 0, 0, "NONE", dtype="handle"))
    # Group 4: buffer_4 / buffer_9, conv2d at ethosu_write_1.data offset 6.
    tir.evaluate(
        tir.call_extern("ethosu_copy", tir.load("uint8", buffer_4.data, 0), 80, tir.load("uint8", placeholder_global, 0), dtype="handle"))
    tir.evaluate(
        tir.call_extern("ethosu_copy", tir.load("uint8", buffer_9.data, 0), 32, tir.load("uint8", placeholder_d_global, 0), dtype="handle"))
    tir.evaluate(
        tir.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, tir.load("int8", ethosu_write_2, 0), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 2, 16, 0, 16, tir.load("int8", ethosu_write_1.data, 6), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, tir.load("uint8", placeholder_global, 0), 80, 12, tir.load("uint8", placeholder_d_global, 0), 32, 0, 0, 0, 0, "NONE", 0, 0, "NONE", dtype="handle"))
def main(
    placeholder: ty.handle,
    placeholder_1: ty.handle,
    placeholder_2: ty.handle,
    ethosu_conv2d: ty.handle,
) -> None:
    # Single "ethosu_conv2d" extern call over four matched buffers.  The
    # positional arguments are opaque here; their meaning is defined by the
    # callee, not by this source.
    # function attr dict
    tir.func_attr({"global_symbol": "main", "tir.noalias": True})
    placeholder_3 = tir.match_buffer(placeholder, [1, 8, 8, 3], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
    placeholder_4 = tir.match_buffer(placeholder_1, [48], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
    placeholder_5 = tir.match_buffer(placeholder_2, [16], dtype="int32", elem_offset=0, align=128, offset_factor=1)
    ethosu_conv2d_1 = tir.match_buffer(ethosu_conv2d, [1, 8, 8, 16], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
    # body
    # NOTE(review): placeholder_5 is matched as int32 but loaded below with
    # dtype "uint8" -- confirm this is intended.
    tir.evaluate(
        tir.call_extern(
            "ethosu_conv2d", "uint8", 8, 8, 3, 8, 0, 8,
            tir.load("uint8", placeholder_3.data, 0),
            0, 0, 0, tir.float32(0.5), 10, "NHWC", 24, 3, 1,
            "uint8", 8, 8, 16, 8, 0, 8,
            tir.load("uint8", ethosu_conv2d_1.data, 0),
            0, 0, 0, tir.float32(0.25), 14, "NHWC", 128, 16, 1,
            1, 1, 1, 1, 1, 1,
            tir.load("uint8", placeholder_4.data, 0), 0, 12,
            tir.load("uint8", placeholder_5.data, 0), 0,
            0, 0, 0, 0, "CLIP", 0, 0, "NONE",
            dtype="uint8",
        ))
def invalid_for_function(a: ty.handle) -> None:
    A = tir.match_buffer(a, (16, 16), "float32")
    for i in tir.evaluate(0.0):
        for j in tir.serial(0, 16):
            A[i, j] = 0.0
    # Negative fixture (per its name): the outer loop iterates over
    # tir.evaluate(...) rather than a loop constructor such as the tir.serial
    # used by the inner loop.  Presumably the script parser is expected to
    # reject this -- confirm.  Do not "fix" the body.
    # Notes are kept below the statements so their line positions are
    # unchanged.
def invalid_concise_scoping() -> None:
    tir.Assert(1.0 > 0.0, "aaaa")
    tir.evaluate(0.0)
    # Negative fixture (per its name): tir.Assert is written as a bare
    # statement followed by another statement, with no `with` scope.
    # Presumably the script parser is expected to reject this form --
    # confirm.  Do not "fix" the body.
    # Notes are kept below the statements so their line positions are
    # unchanged.
def missing_type_annotation(a) -> None:  # error
    tir.evaluate(0.0)
    # Negative fixture: parameter `a` deliberately has no type annotation,
    # and the original marker flags the signature line as the error site.
    # Do not add an annotation here.
def opaque_access(a: ty.handle, b: ty.handle, c: ty.handle, d: ty.handle) -> None:
    # Three top-level blocks over 128x128 float16 buffers, each touching its
    # buffers through data handles in a different way: raw load/store, an
    # intrinsic taking an access pointer, and the same intrinsic fed from
    # matched sub-buffers.  All blocks declare their regions explicitly.
    A = tir.match_buffer(a, (128, 128), dtype="float16")
    B = tir.match_buffer(b, (128, 128), dtype="float16")
    C = tir.match_buffer(c, (128, 128), dtype="float16")
    D = tir.match_buffer(d, (128, 128), dtype="float16")
    with tir.block([128, 128], "load_store") as [vi, vj]:
        tir.reads(A[vi, vj])
        tir.writes(D[vi, vj])
        # Element copy written against the data handles at flat index
        # vi * 128 + vj.
        D.data[vi * 128 + vj] = tir.load("float16", A.data, vi * 128 + vj)
    with tir.block([8, 8], "opaque") as [vi, vj]:
        tir.reads(A[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16])
        tir.writes(B[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16])
        # vi * 2048 + vj * 16 is the flat index of A[vi * 16, vj * 16]
        # assuming a compact row-major (128, 128) layout (2048 = 16 * 128).
        tir.evaluate(
            tir.tvm_load_matrix_sync(
                B.data,
                16,
                16,
                16,
                vi * 8 + vj,
                tir.tvm_access_ptr(
                    tir.type_annotation(dtype="float16"),
                    A.data,
                    vi * 2048 + vj * 16,
                    128,
                    1,
                    dtype="handle",
                ),
                128,
                "row_major",
                dtype="handle",
            )
        )
    with tir.block([8, 8], "match_buffer") as [vi, vj]:
        tir.reads(A[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16])
        tir.writes(C[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16])
        # Same call shape as the "opaque" block, but the offset and extent
        # come from the matched sub-buffer A0 instead of literal arithmetic.
        A0 = tir.match_buffer(
            A[
                vi * 16 : vi * 16 + 16,
                vj * 16 : vj * 16 + 16,
            ],
            (16, 16),
            "float16",
            strides=[128, 1],
            offset_factor=1,
        )
        C0 = tir.match_buffer(
            C[
                vi * 16 : vi * 16 + 16,
                vj * 16 : vj * 16 + 16,
            ],
            (16, 16),
            "float16",
            strides=[128, 1],
            offset_factor=1,
        )
        tir.evaluate(
            tir.tvm_load_matrix_sync(
                C0.data,
                16,
                16,
                16,
                vi * 8 + vj,
                tir.tvm_access_ptr(
                    tir.type_annotation(dtype="float16"),
                    A0.data,
                    A0.elem_offset,
                    A0.strides[0],
                    1,
                    dtype="handle",
                ),
                128,
                "row_major",
                dtype="handle",
            )
        )
def intrin_except_unassign(a: ty.handle) -> None:
    A = tir.match_buffer(a, (16, 16), "float32")
    tir.evaluate(A)  # error
    # Negative fixture: the buffer object A itself (not an element or its
    # data handle) is passed to tir.evaluate, and the original marker flags
    # that line as the error site.  Do not "fix" the body.
    # Notes are kept below the statements so their line positions are
    # unchanged.