# Module-level imports this function relies on (legacy, pre-0.7 TVM API);
# they normally live at the top of the file and the exact paths may differ
# slightly between TVM versions.
import math

import tvm
from tvm.generic import cast
from tvm.intrin import log, power


def get_valid_counts_scan(data, partial_in, partial):
    """Low level IR to do a prefix-sum scan over partial valid counts.

    Parameters
    ----------
    data : Buffer
        3D Buffer with shape [batch_size, num_anchors, elem_length],
        output of nms.

    partial_in : Buffer
        2D Buffer of per-thread partial valid counts with shape
        [batch_size, new_range], used as the scan input.

    partial : Buffer
        2D Buffer with shape [batch_size, new_range] that receives the
        scanned (prefix-summed) partial valid counts.

    Returns
    -------
    stmt : Stmt
        The result IR statement.
    """
    batch_size = data.shape[0]
    num_anchors = data.shape[1]
    ib = tvm.ir_builder.create()
    partial_in = ib.buffer_ptr(partial_in)
    partial = ib.buffer_ptr(partial)
    max_threads = int(tvm.target.current_target(allow_none=False).max_num_threads)
    elem_per_thread = num_anchors // max_threads + 1
    # One thread block per batch element, one thread per scan element.
    nthread_tx = max_threads
    nthread_bx = batch_size
    tx = tvm.thread_axis("threadIdx.x")
    bx = tvm.thread_axis("blockIdx.x")
    ib.scope_attr(tx, "thread_extent", nthread_tx)
    ib.scope_attr(bx, "thread_extent", nthread_bx)
    var = tvm.make.node("FloatImm", dtype="float32", value=2)
    new_range = num_anchors // elem_per_thread + 1
    # Number of scan passes: roughly log2(new_range).
    iteration = log(cast(new_range, "float32")) // math.log(2)
    # Scan: Kogge-Stone adder
    with ib.if_scope(tvm.all(bx < batch_size, tx < tvm.min(new_range, num_anchors))):
        with ib.for_range(0, iteration) as k:
            with ib.if_scope(k == 0):
                # First pass reads from partial_in and seeds partial.
                with ib.if_scope(tvm.all(tx > 0, tx < tvm.min(new_range, num_anchors))):
                    partial[bx * new_range + tx] = \
                        partial_in[bx * new_range + tx] + partial_in[bx * new_range + tx - 1]
                with ib.else_scope():
                    partial[bx * new_range] = partial_in[bx * new_range]
            with ib.else_scope():
                # Later passes add the value 2**k positions to the left, in place.
                with ib.if_scope(tvm.all(tx >= cast(power(var, k), "int32"),
                                         tx < tvm.min(new_range, num_anchors))):
                    partial[bx * new_range + tx] += \
                        partial[bx * new_range + tx - cast(power(var, k), "int32")]
            # Synchronize the thread block before the next scan pass.
            ib.emit(tvm.make.Call(None, 'tvm_storage_sync',
                                  tvm.convert(['shared']),
                                  tvm.expr.Call.Intrinsic, None, 0))
    return ib.get()
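

# Reference sketch (not part of the original file): a plain-Python Kogge-Stone
# inclusive scan over one row, illustrating what the IR above computes for each
# batch element. The helper name `_kogge_stone_scan_ref` is hypothetical and is
# only a readable model of the update pattern, not the generated GPU kernel.
def _kogge_stone_scan_ref(row):
    """Inclusive prefix sum of `row` using the Kogge-Stone update order."""
    out = list(row)
    offset = 1
    while offset < len(out):
        # Each element adds the value `offset` positions to its left,
        # doubling the reach every pass, so roughly log2(n) passes suffice.
        prev = list(out)
        for i in range(offset, len(out)):
            out[i] = prev[i] + prev[i - offset]
        offset *= 2
    return out

# Example: _kogge_stone_scan_ref([1, 0, 2, 1]) -> [1, 1, 3, 4]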