def test_device_ir(A, B):
    """Emit TIR for a parallel tree reduction: sum the n elements of A into B[0].

    Uses a dynamically-sized shared memory scratch buffer ("shared.dyn") whose
    extent n is symbolic.  NOTE(review): `dtype` is a free name supplied by the
    enclosing test — confirm it matches A's element type.
    """
    n = A.shape[0]
    ib = tvm.tir.ir_builder.create()

    thread_x = te.thread_axis("threadIdx.x")
    ib.scope_attr(thread_x, "thread_extent", n)

    # Dynamic shared scratch; n is a symbolic size.
    scratch = ib.allocate(dtype, (n,), scope="shared.dyn")
    a_buf = ib.buffer_ptr(A)
    b_buf = ib.buffer_ptr(B)

    # Stage the input into shared memory, one element per thread.
    scratch[thread_x] = a_buf[thread_x]

    # ceil(log2(n)) halving steps complete the reduction tree.
    tree_depth = tvm.tir.log2(cast(n, "float32"))
    with ib.for_range(0, cast(tvm.tir.ceil(tree_depth), n.dtype)) as step:
        # Barrier before reading partners' partial sums from shared memory.
        ib.emit(
            tvm.tir.Call(None, "tir.tvm_storage_sync",
                         tvm.runtime.convert(["shared"])))
        stride = n >> (step + 1)
        with ib.if_scope(thread_x < stride):
            scratch[thread_x] += scratch[thread_x + stride]

    # The final sum lands in element 0.
    b_buf[0] = scratch[0]
    return ib.get()
def test_device_ir(A, B, C):
    """Emit TIR computing C = cast(A, fp32) + B with each thread handling
    four consecutive elements, staged through dynamic shared memory.

    A is fp16 and B/C are fp32 per the enclosing test's buffer declarations.
    """
    n = A.shape[0]
    ib = tvm.tir.ir_builder.create()

    values_per_thread = 4
    thread_x = te.thread_axis("threadIdx.x")
    # One thread per group of `values_per_thread` elements.
    ib.scope_attr(thread_x, "thread_extent",
                  tvm.tir.indexdiv(n, values_per_thread))

    a_shared = ib.allocate(A.dtype, (n,), scope="shared.dyn")  # fp16
    b_shared = ib.allocate(B.dtype, (n,), scope="shared.dyn")  # fp32

    a_buf = ib.buffer_ptr(A)
    b_buf = ib.buffer_ptr(B)
    c_buf = ib.buffer_ptr(C)

    # Vectorized cooperative load into shared memory.
    with ib.for_range(0, values_per_thread, kind="vectorize") as lane:
        idx = thread_x * values_per_thread + lane
        a_shared[idx] = a_buf[idx]
        b_shared[idx] = b_buf[idx]

    # Widen the fp16 operand and accumulate in fp32.
    with ib.for_range(0, values_per_thread) as lane:
        idx = thread_x * values_per_thread + lane
        c_buf[idx] = cast(a_shared[idx], "float32") + b_shared[idx]

    return ib.get()
def test_device_ir(A, B, C):
    """Emit TIR computing C = int32(A) + int32(B) element-wise, with all three
    operands staged through dynamic shared memory buffers of differing widths
    (i8, i16, i32) to exercise mixed-dtype shared.dyn merging.
    """
    n = A.shape[0]
    ib = tvm.tir.ir_builder.create()

    thread_x = te.thread_axis("threadIdx.x")
    ib.scope_attr(thread_x, "thread_extent", n)

    a_shared = ib.allocate(A.dtype, (n,), scope="shared.dyn")  # i8
    b_shared = ib.allocate(B.dtype, (n,), scope="shared.dyn")  # i16
    c_shared = ib.allocate(C.dtype, (n,), scope="shared.dyn")  # i32

    a_buf = ib.buffer_ptr(A)
    b_buf = ib.buffer_ptr(B)
    c_buf = ib.buffer_ptr(C)

    # One element per thread: load, widen both operands to i32, add, store.
    a_shared[thread_x] = a_buf[thread_x]
    b_shared[thread_x] = b_buf[thread_x]
    c_shared[thread_x] = cast(a_shared[thread_x], "int32") + cast(
        b_shared[thread_x], "int32")
    c_buf[thread_x] = c_shared[thread_x]

    return ib.get()
def do_copy(A, B, n):
    """Emit TIR that copies n elements from A to B, widening each to int32.

    Launches ceil(n / 32) blocks of 32 threads; a bounds guard masks the
    tail threads of the last block.
    """
    ib = tvm.tir.ir_builder.create()
    src = ib.buffer_ptr(A)
    dst = ib.buffer_ptr(B)

    threads_per_block = 32
    thread_x = te.thread_axis("threadIdx.x")
    block_x = te.thread_axis("blockIdx.x")
    # Ceil-division so the grid covers all n elements.
    ib.scope_attr(
        block_x, "thread_extent",
        tvm.tir.indexdiv(n + threads_per_block - 1, threads_per_block))
    ib.scope_attr(thread_x, "thread_extent", threads_per_block)

    global_id = block_x * threads_per_block + thread_x
    with ib.if_scope(global_id < n):
        dst[global_id] = cast(src[global_id], "int32")

    return ib.get()
def test_matmul_ir(A, B, C):
    """Emit TIR for a `block` x `block` tiled matmul C = A @ B with fp32
    accumulation of fp16 inputs.

    `block`, `n`, and `syncthread` are free names from the enclosing test.
    A/B tiles live in dynamic shared memory; the result also passes through a
    shared.dyn tile of a different dtype purely to exercise merging of
    mixed-dtype dynamic shared allocations (C_sh is not otherwise needed).
    """
    ib = tvm.tir.ir_builder.create()

    thread_x = te.thread_axis("threadIdx.x")
    thread_y = te.thread_axis("threadIdx.y")
    block_x = te.thread_axis("blockIdx.x")
    block_y = te.thread_axis("blockIdx.y")
    ib.scope_attr(thread_x, "thread_extent", block)
    ib.scope_attr(thread_y, "thread_extent", block)
    ib.scope_attr(block_x, "thread_extent", n // block)
    ib.scope_attr(block_y, "thread_extent", n // block)

    A_sh = ib.allocate(A.dtype, (block, block), scope="shared.dyn",
                       name="A_sh")  # fp16
    B_sh = ib.allocate(B.dtype, (block, block), scope="shared.dyn",
                       name="B_sh")  # fp16
    # Create a dynamic shared memory for the accumulation.
    # This is for testing merging dynamic shared memory allocations with a
    # different data type.  In practice, there is no need to allocate a
    # shared memory for C.
    C_local = ib.allocate(C.dtype, (1,), scope="local", name="C_local")
    C_sh = ib.allocate(C.dtype, (block, block), scope="shared.dyn",
                       name="C_sh")  # fp32

    a_buf = ib.buffer_ptr(A)
    b_buf = ib.buffer_ptr(B)
    c_buf = ib.buffer_ptr(C)

    C_local[0] = 0.0
    # Walk the K dimension one tile at a time.
    with ib.for_range(0, n // block, name="i") as i:
        # Cooperative tile loads, then barrier before consuming them.
        A_sh[thread_y, thread_x] = a_buf[block_y * block + thread_y,
                                         i * block + thread_x]
        B_sh[thread_y, thread_x] = b_buf[i * block + thread_y,
                                         block_x * block + thread_x]
        ib.emit(syncthread())

        with ib.for_range(0, block, name="k") as k:
            C_local[0] += cast(A_sh[thread_y, k] * B_sh[k, thread_x],
                               "float32")
        # Barrier before the next iteration overwrites the tiles.
        ib.emit(syncthread())

    # Round-trip through the shared C tile (see note above), then write out.
    C_sh[thread_y, thread_x] = C_local[0]
    c_buf[block_y * block + thread_y,
          block_x * block + thread_x] = C_sh[thread_y, thread_x]

    return ib.get()