def _dispatch_sim_quantize(value): pass_through_value = te.compute( data.shape, lambda *indices: _compute_pass_through(value, *indices)) int8_value = te.compute( data.shape, lambda *indices: tir.if_then_else( out_dtype.equal(SQNN_DTYPE_TO_CODE["int8"]), _compute_intn("int8", value, *indices), pass_through_value[indices], ), ) uint8_value = te.compute( data.shape, lambda *indices: tir.if_then_else( out_dtype.equal(SQNN_DTYPE_TO_CODE["uint8"]), _compute_intn("uint8", value, *indices), int8_value[indices], ), ) int32_value = te.compute( data.shape, lambda *indices: tir.if_then_else( out_dtype.equal(SQNN_DTYPE_TO_CODE["int32"]), _compute_intn("int32", value, *indices), uint8_value[indices], ), ) return int32_value
def im2col(row, col): j_w, j_h, j_n = idxsplit(row, [imw, imh]) j_c, k_w, k_h = idxsplit(col, [chanin, 3]) i_h, i_w = j_h + k_h - 1, j_w + k_w - 1 return tir.if_then_else( tir.all(i_h >= 0, i_h < imh, i_w >= 0, i_w < imw), data[j_n, i_h, i_w, j_c], 0)
def im2col(nsamples, ckk, imglen): j_h, j_w = imglen // imgw, imglen % imgw i_c, k_h, k_w = ckk // 9, ckk // 3 % 3, ckk % 3 i_h, i_w = j_h + k_h - 1, j_w + k_w - 1 return tir.if_then_else( tir.all(i_h >= 0, i_h < imgh, i_w >= 0, i_w < imgw), data[nsamples, i_c, i_h, i_w], 0)
def __reflect(index, size, corner_start): index_align_corner = te.abs(corner_start - index) size_times = te.truncdiv(index_align_corner.astype("int32"), size).astype("int32") t = tir.Mod(size_times, 2) extra = index_align_corner - size_times * size return tir.if_then_else( tir.EQ(t, 0), extra + corner_start, size - extra + corner_start )
def loop_carried_dependency(a: ty.handle, b: ty.handle, c: ty.handle) -> None: A = tir.match_buffer(a, (128,)) B = tir.match_buffer(b, (128,)) C = tir.match_buffer(c, (128,)) for i in range(0, 128): with tir.block([128], "B") as vi: B[vi] = A[vi] * 2.0 with tir.block([128], "C") as vi: C[vi] = tir.if_then_else(vi >= 1, B[vi - 1] + 1.0, 0.0, dtype="float32")
def read_out_of_bound(a: ty.handle, c: ty.handle) -> None: A = tir.match_buffer(a, [16], "float32") B = tir.alloc_buffer([16], "float32") C = tir.match_buffer(c, [16], "float32") for i in tir.serial(0, 16): with tir.block([16], "B") as [v]: B[v] = A[v] for j in tir.serial(0, 16): with tir.block([16], "C") as [v]: tir.reads(B[v:v + 2]) C[v] = tir.if_then_else(v < 15, tir.max(B[v], B[v + 1]), B[v], dtype="float32")
def read_out_of_bound_after_compute_at(a: ty.handle, c: ty.handle) -> None: A = tir.match_buffer(a, [16], "float32") B = tir.alloc_buffer([16], "float32") C = tir.match_buffer(c, [16], "float32") for j in tir.serial(0, 16): for i in tir.serial(0, tir.min(1, 15 - j) + 1): with tir.block([16], "B") as [v]: tir.bind(v, j + i) B[v] = A[v] with tir.block([16], "C") as [v]: tir.bind(v, j) tir.reads([B[v:v + 2]]) C[v] = tir.if_then_else(v < 15, tir.max(B[v], B[v + 1]), B[v], dtype="float32")
def _dispatch_sim_dequantize(value): pass_through_value = te.compute( data.shape, lambda *indices: _compute_pass_through(value, *indices)) intn_condition = tvm.te.any( in_dtype.equal(SQNN_DTYPE_TO_CODE["int8"]), in_dtype.equal(SQNN_DTYPE_TO_CODE["uint8"]), in_dtype.equal(SQNN_DTYPE_TO_CODE["int32"]), ) intn_value = te.compute( data.shape, lambda *indices: tir.if_then_else( intn_condition, _compute_intn(value, *indices), pass_through_value[indices], ), ) return intn_value
def tir_conv2d(a: ty.handle, w: ty.handle, b: ty.handle) -> None: A = tir.match_buffer(a, [16, 16, 14, 14]) W = tir.match_buffer(w, [16, 3, 3, 32]) B = tir.match_buffer(b, [16, 32, 14, 14]) Apad = tir.alloc_buffer([16, 16, 16, 16]) with tir.block([16, 16, 16, 16], "Apad") as [nn, cc, yy, xx]: Apad[nn, cc, yy, xx] = tir.if_then_else( yy >= 1 and yy - 1 < 14 and xx >= 1 and xx - 1 < 14, A[nn, cc, yy - 1, xx - 1], 0.0, dtype="float32", ) with tir.block( [16, 32, 14, 14, tir.reduce_axis(0, 16), tir.reduce_axis(0, 3), tir.reduce_axis(0, 3)], "B" ) as [nn, ff, yy, xx, rc, ry, rx]: with tir.init(): B[nn, ff, yy, xx] = 0.0 B[nn, ff, yy, xx] += Apad[nn, rc, yy + ry, xx + rx] * W[rc, ry, rx, ff]
def _nms_loop( ib, batch_size, top_k, iou_threshold, max_output_size, valid_count, on_new_valid_box_func, on_new_invalidated_box_func, needs_bbox_check_func, calc_overlap_func, out_scores, num_valid_boxes, ): def nms_inner_loop(ib, i, j, nkeep, num_valid_boxes_local): # The box j is valid, invalidate other boxes that overlap with j above iou_threshold on_new_valid_box_func(ib, 0, num_valid_boxes_local[0], i, j) num_valid_boxes_local[0] += 1 num_boxes_to_check = nkeep - (j + 1) with ib.for_range(0, num_boxes_to_check, name="_k", kind="parallel") as _k: k = j + 1 + _k with ib.if_scope( tvm.tir.all( k < nkeep, out_scores[i, k] > 0, # is the box k still valid? needs_bbox_check_func(i, j, k), )): iou = calc_overlap_func(i, j, k) with ib.if_scope(iou >= iou_threshold): # invalidate the box k out_scores[i, k] = -1.0 on_new_invalidated_box_func(i, k) with ib.for_range(0, batch_size, name="i") as i: nkeep = if_then_else(tvm.tir.all(top_k > 0, top_k < valid_count[i]), top_k, valid_count[i]) max_output_size = if_then_else(max_output_size > 0, max_output_size, nkeep) with ib.if_scope(tvm.tir.all(iou_threshold > 0, valid_count[i] > 0)): num_valid_boxes_local = ib.allocate("int32", (1, ), name="num_valid_boxes_local", scope="local") box_idx = ib.allocate("int32", (1, ), name="box_idx", scope="local") num_valid_boxes_local[0] = 0 box_idx[0] = 0 # Apply nms # No need to do more iteration if we have already reached max_output_size boxes with ib.while_loop( tvm.tir.all(box_idx[0] < nkeep, num_valid_boxes_local[0] < max_output_size)): # Proceed to the inner loop if the box with id box_idx is still valid with ib.if_scope(out_scores[i, box_idx[0]] > -1.0): nms_inner_loop(ib, i, box_idx[0], nkeep, num_valid_boxes_local) box_idx[0] += 1 num_valid_boxes[i] = num_valid_boxes_local[0] with ib.else_scope(): num_valid_boxes[i] = 0 return ib.get()
def transform_loc_pre(cls_prob, valid_count, temp_valid_count, temp_cls_id, temp_score, threshold): """Low level IR routing for transform location data preparation. Parameters ---------- cls_prob : Buffer Buffer of class probabilities. valid_count : Buffer Buffer of number of valid output boxes. temp_valid_count : Buffer Output intermediate result buffer temp_cls_id : Buffer Output intermediate result buffer temp_score : Buffer Output buffer threshold : float Threshold to be a positive prediction. Returns ------- stmt : Stmt The result IR statement. """ batch_size = cls_prob.shape[0] num_classes = cls_prob.shape[1] num_anchors = cls_prob.shape[2] ib = tvm.tir.ir_builder.create() cls_prob = ib.buffer_ptr(cls_prob) cls_id = ib.buffer_ptr(temp_cls_id) valid_count = ib.buffer_ptr(valid_count) temp_valid_count = ib.buffer_ptr(temp_valid_count) score = ib.buffer_ptr(temp_score) threshold = tvm.ir.make_node("FloatImm", dtype="float32", value=threshold) max_threads = int( tvm.target.Target.current(allow_none=False).max_num_threads) nthread_tx = max_threads nthread_bx = (batch_size * num_anchors) // max_threads + 1 tx = te.thread_axis("threadIdx.x") bx = te.thread_axis("blockIdx.x") ib.scope_attr(tx, "thread_extent", nthread_tx) ib.scope_attr(bx, "thread_extent", nthread_bx) tid = bx * max_threads + tx idxd = tvm.tir.indexdiv idxm = tvm.tir.indexmod with ib.if_scope(tid < batch_size * num_anchors): i = idxd(tid, num_anchors) j = idxm(tid, num_anchors) valid_count[i] = 0 score[tid] = -1.0 cls_id[tid] = 0 with ib.for_range(0, num_classes - 1) as k: temp = cls_prob[i * num_classes * num_anchors + (k + 1) * num_anchors + j] cls_id[tid] = if_then_else(temp > score[tid], k + 1, cls_id[tid]) score[tid] = tvm.te.max(temp, score[tid]) with ib.if_scope(tvm.tir.all(cls_id[tid] > 0, score[tid] < threshold)): cls_id[tid] = 0 with ib.if_scope(cls_id[tid] > 0): temp_valid_count[tid] = 1 with ib.else_scope(): temp_valid_count[tid] = 0 with ib.if_scope(tid < batch_size): with ib.for_range(0, num_anchors) as k: with ib.if_scope(k > 0): temp_valid_count[tid * num_anchors + k] += temp_valid_count[tid * num_anchors + k - 1] valid_count[i] = temp_valid_count[tid * num_anchors + num_anchors - 1] return ib.get()
def nms_ir( data, sorted_index, valid_count, indices, out_bboxes, out_scores, out_class_ids, out_features, box_indices, num_valid_boxes, max_output_size, iou_threshold, force_suppress, top_k, coord_start, id_index, score_index, return_indices, ): """Low level IR routing for transform location in multibox_detection operator. Parameters ---------- data : Buffer Buffer of output boxes with class and score. sorted_index : Buffer Buffer of output box indexes sorted by score. valid_count : Buffer Buffer of number of valid output boxes. indices : Buffer indices in original tensor, with shape [batch_size, num_anchors], represents the index of box in original data. It could be the third output out_indices of get_valid_counts. The values in the second dimension are like the output of arange(num_anchors) if get_valid_counts is not used before non_max_suppression. out_bboxes : Buffer Output buffer, to be filled with sorted box coordinates. out_scores : Buffer Output buffer, to be filled with sorted scores. out_class_ids : Buffer Output buffer, to be filled with sorted class ids. box_indices : Buffer A indices tensor mapping sorted indices to original indices This is the first output of NMS when return_indices=True. num_valid_boxes : Buffer Record the number of boxes that have survived IOU tests. This is the second output of NMS when return_indices=True. max_output_size : int Max number of output valid boxes for each instance. By default all valid boxes are returned. iou_threshold : float Overlapping(IoU) threshold to suppress object with smaller score. force_suppress : boolean Whether to suppress all detections regardless of class_id. top_k : int Keep maximum top k detections before nms, -1 for no limit. coord_start : int Start index of the consecutive 4 coordinates. id_index : int index of the class categories, -1 to disable. score_index : optional, int Index of the scores/confidence of boxes. return_indices : boolean Whether to return box indices in input data. Returns ------- stmt : Stmt The result IR statement. """ batch_size = data.shape[0] num_anchors = data.shape[1] box_data_length = data.shape[2] num_features = out_features.shape[2] ib = tvm.tir.ir_builder.create() data = ib.buffer_ptr(data) sorted_index = ib.buffer_ptr(sorted_index) valid_count = ib.buffer_ptr(valid_count) indices = ib.buffer_ptr(indices) # outputs out_bboxes = ib.buffer_ptr(out_bboxes) out_scores = ib.buffer_ptr(out_scores) out_class_ids = ib.buffer_ptr(out_class_ids) out_features = ib.buffer_ptr(out_features) box_indices = ib.buffer_ptr(box_indices) num_valid_boxes = ib.buffer_ptr(num_valid_boxes) if isinstance(iou_threshold, float): iou_threshold = tvm.tir.FloatImm("float32", iou_threshold) top_k = tvm.tir.IntImm("int32", top_k) coord_start = tvm.tir.IntImm("int32", coord_start) id_index = tvm.tir.IntImm("int32", id_index) score_index = tvm.tir.IntImm("int32", score_index) force_suppress = tvm.tir.IntImm("int32", 1 if force_suppress else 0) max_threads = int( tvm.target.Target.current(allow_none=False).max_num_threads) with ib.new_scope(): nthread_tx = max_threads nthread_bx = ceil_div(num_anchors, max_threads) nthread_by = batch_size tx = te.thread_axis("threadIdx.x") bx = te.thread_axis("blockIdx.x") by = te.thread_axis("blockIdx.y") ib.scope_attr(by, "thread_extent", nthread_by) ib.scope_attr(tx, "thread_extent", nthread_tx) ib.scope_attr(bx, "thread_extent", nthread_bx) i = by base_src_idx = i * num_anchors * box_data_length base_bbox_idx = i * num_anchors * 4 base_features_idx = i * num_anchors * num_features with ib.if_scope(tvm.tir.all(iou_threshold > 0, valid_count[i] > 0)): # Reorder output nkeep = if_then_else( tvm.tir.all(top_k > 0, top_k < valid_count[i]), top_k, valid_count[i]) j = bx * max_threads + tx with ib.if_scope(j < nkeep): src_idx = base_src_idx + sorted_index[i * num_anchors + j] * box_data_length with ib.for_range(0, 4, kind="unroll") as k: out_bboxes[(base_bbox_idx + j * 4 + k)] = data[src_idx + coord_start + k] with ib.for_range(0, num_features, kind="unroll") as k: out_features[(base_features_idx + j * num_features + k)] = data[src_idx + coord_start + 4 + k] out_scores[i * num_anchors + j] = data[src_idx + score_index] if id_index >= 0: out_class_ids[i * num_anchors + j] = data[src_idx + id_index] with ib.else_scope(): # Indices > nkeep are discarded # Only needed for return_indices = False case if return_indices is False: with ib.if_scope(j < num_anchors): with ib.for_range(0, 4, kind="unroll") as k: out_bboxes[(base_bbox_idx + j * 4 + k)] = -1.0 with ib.for_range(0, num_features, kind="unroll") as k: out_features[(base_features_idx + j * num_features + k)] = -1.0 out_scores[i, j] = -1.0 if id_index >= 0: out_class_ids[i, j] = -1.0 if return_indices: with ib.if_scope(j < num_anchors): box_indices[i * num_anchors + j] = -1 with ib.else_scope(): # Need to copy all boxes if not using return_indices bounds = valid_count[i] if return_indices else num_anchors with ib.if_scope(j < bounds): src_offset = base_src_idx + j * box_data_length with ib.for_range(0, 4, kind="unroll") as k: out_bboxes[base_bbox_idx + j * 4 + k] = data[src_offset + coord_start + k] with ib.for_range(0, num_features, kind="unroll") as k: out_features[(base_features_idx + j * num_features + k)] = data[src_offset + coord_start + 4 + k] out_scores[i * num_anchors + j] = data[src_offset + score_index] if id_index >= 0: out_class_ids[i * num_anchors + j] = data[src_offset + id_index] box_indices[i * num_anchors + j] = j if isinstance(max_output_size, int): max_output_size = tvm.tir.const(max_output_size) def calc_overlap(i, j, k): offset_j = j * 4 offset_k = k * 4 base_bbox_idx = i * num_anchors * 4 return calculate_overlap( out_bboxes, base_bbox_idx + offset_j, base_bbox_idx + offset_k, ) def on_new_valid_box(ib, tid, num_current_valid_box, i, j): # When return_indices is False, no need to populate box_indices if return_indices: with ib.if_scope(tid + 0 == 0): orig_idx = sorted_index[i * num_anchors + j] box_indices[i, num_current_valid_box] = indices[i, orig_idx] def on_new_invalidated_box(i, k): if return_indices is False and id_index >= 0: out_class_ids[i, k] = -1.0 def needs_bbox_check(i, j, k): return tvm.tir.any( force_suppress > 0, id_index < 0, out_class_ids[i, k] == out_class_ids[i, j], ) return _nms_loop( ib, batch_size, top_k, iou_threshold, max_output_size, valid_count, on_new_valid_box, on_new_invalidated_box, needs_bbox_check, calc_overlap, out_scores, num_valid_boxes, )
def _nms_loop( ib, batch_size, top_k, iou_threshold, max_output_size, valid_count, on_new_valid_box_func, on_new_invalidated_box_func, needs_bbox_check_func, calc_overlap_func, out_scores, num_valid_boxes, ): max_threads = int( tvm.target.Target.current(allow_none=False).max_num_threads) with ib.new_scope(): nthread_by = batch_size nthread_tx = max_threads # Some cuda architectures have smaller limit of 32K for cudaDevAttrMaxRegistersPerBlock # vs 64K for most GPUs. Since this kernel uses many registers (around 35), the limit will # be exceeded with 1024 threads. target = tvm.target.Target.current(allow_none=False) if target.kind.name == "cuda": if nvcc.get_target_compute_version(target) in [ "3.2", "5.3", "6.2" ]: nthread_tx = 512 by = te.thread_axis("blockIdx.y") tx = te.thread_axis("threadIdx.x") ib.scope_attr(by, "thread_extent", nthread_by) ib.scope_attr(tx, "thread_extent", nthread_tx) num_valid_boxes_local = ib.allocate("int32", (1, ), name="num_valid_boxes_local", scope="local") num_valid_boxes_local[0] = 0 def nms_inner_loop(ib, i, j, nkeep): # The box j is valid, invalidate other boxes that overlap with j above iou_threshold on_new_valid_box_func(ib, tx, num_valid_boxes_local[0], i, j) num_valid_boxes_local[0] += 1 num_iter_per_thread = ceil_div(nkeep - (j + 1), nthread_tx) with ib.for_range(0, num_iter_per_thread, name="_k") as _k: k = j + 1 + _k * nthread_tx + tx with ib.if_scope( tvm.tir.all( k < nkeep, out_scores[i, k] > 0, # is the box k still valid? needs_bbox_check_func(i, j, k), )): iou = calc_overlap_func(i, j, k) with ib.if_scope(iou >= iou_threshold): # invalidate the box k out_scores[i, k] = -1.0 on_new_invalidated_box_func(i, k) ib.emit( tvm.tir.Call(None, "tir.tvm_storage_sync", tvm.runtime.convert(["shared"]))) i = by nkeep = if_then_else(tvm.tir.all(top_k > 0, top_k < valid_count[i]), top_k, valid_count[i]) max_output_size = if_then_else(max_output_size > 0, max_output_size, nkeep) with ib.if_scope(tvm.tir.all(iou_threshold > 0, valid_count[i] > 0)): # Apply nms # No need to do more iteration if we have already reached max_output_size boxes box_idx = ib.allocate("int32", (1, ), name="box_idx", scope="local") box_idx[0] = 0 with ib.while_loop( tvm.tir.all(box_idx[0] < nkeep, num_valid_boxes_local[0] < max_output_size)): # Proceed to the inner loop if the box with id box_idx is still valid with ib.if_scope(out_scores[i, box_idx[0]] > -1.0): nms_inner_loop(ib, i, box_idx[0], nkeep) box_idx[0] += 1 with ib.if_scope(tx + 0 == 0): num_valid_boxes[i] = num_valid_boxes_local[0] with ib.else_scope(): num_valid_boxes[i] = 0 return ib.get()
def nms_ir( data, sorted_index, valid_count, indices, out, box_indices, max_output_size, iou_threshold, force_suppress, top_k, coord_start, id_index, score_index, return_indices, ): """Low level IR routing for transform location in multibox_detection operator. Parameters ---------- data : Buffer Buffer of output boxes with class and score. sorted_index : Buffer Buffer of output box indexes sorted by score. valid_count : Buffer Buffer of number of valid output boxes. indices : Buffer indices in original tensor, with shape [batch_size, num_anchors], represents the index of box in original data. It could be the third output out_indices of get_valid_counts. The values in the second dimension are like the output of arange(num_anchors) if get_valid_counts is not used before non_max_suppression. out : Buffer Output buffer. max_output_size : int Max number of output valid boxes for each instance. By default all valid boxes are returned. iou_threshold : float Overlapping(IoU) threshold to suppress object with smaller score. force_suppress : boolean Whether to suppress all detections regardless of class_id. top_k : int Keep maximum top k detections before nms, -1 for no limit. coord_start : int Start index of the consecutive 4 coordinates. id_index : int index of the class categories, -1 to disable. score_index : optional, int Index of the scores/confidence of boxes. return_indices : boolean Whether to return box indices in input data. Returns ------- stmt : Stmt The result IR statement. """ def get_boundaries(output, box_idx): l = tvm.te.min( output[box_idx], output[box_idx + 2], ) t = tvm.te.min( output[box_idx + 1], output[box_idx + 3], ) r = tvm.te.max( output[box_idx], output[box_idx + 2], ) b = tvm.te.max( output[box_idx + 1], output[box_idx + 3], ) return l, t, r, b def calculate_overlap(out_tensor, box_a_idx, box_b_idx): """Calculate overlap of two boxes.""" a_l, a_t, a_r, a_b = get_boundaries(out_tensor, box_a_idx) b_l, b_t, b_r, b_b = get_boundaries(out_tensor, box_b_idx) # Overlapping width and height w = tvm.te.max(0.0, tvm.te.min(a_r, b_r) - tvm.te.max(a_l, b_l)) h = tvm.te.max(0.0, tvm.te.min(a_b, b_b) - tvm.te.max(a_t, b_t)) # Overlapping area area = h * w # total area of the figure formed by box a and box b # except for overlapping area u = (a_r - a_l) * (a_b - a_t) + (b_r - b_l) * (b_b - b_t) - area return tvm.tir.Select(u <= 0.0, 0.0, area / u) batch_size = data.shape[0] num_anchors = data.shape[1] box_data_length = data.shape[2] ib = tvm.tir.ir_builder.create() data = ib.buffer_ptr(data) sorted_index = ib.buffer_ptr(sorted_index) valid_count = ib.buffer_ptr(valid_count) indices = ib.buffer_ptr(indices) out = ib.buffer_ptr(out) box_indices = ib.buffer_ptr(box_indices) if isinstance(iou_threshold, float): iou_threshold = tvm.tir.FloatImm("float32", iou_threshold) top_k = tvm.tir.IntImm("int32", top_k) coord_start = tvm.tir.IntImm("int32", coord_start) id_index = tvm.tir.IntImm("int32", id_index) score_index = tvm.tir.IntImm("int32", score_index) force_suppress = tvm.tir.IntImm("int32", 1 if force_suppress else 0) max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads) with ib.new_scope(): nthread_by = batch_size by = te.thread_axis("blockIdx.y") ib.scope_attr(by, "thread_extent", nthread_by) i = by base_idx = i * num_anchors * box_data_length with ib.if_scope(tvm.tir.all(iou_threshold > 0, valid_count[i] > 0)): # Reorder output nkeep = if_then_else( tvm.tir.all(top_k > 0, top_k < valid_count[i]), top_k, valid_count[i] ) with ib.for_range(0, nkeep) as j: with ib.for_range(0, box_data_length) as k: out[(base_idx + j * box_data_length + k)] = data[ (base_idx + sorted_index[i * num_anchors + j] * box_data_length + k) ] box_indices[i * num_anchors + j] = sorted_index[i * num_anchors + j] with ib.if_scope(tvm.tir.all(top_k > 0, top_k < valid_count[i])): with ib.for_range(0, valid_count[i] - nkeep) as j: with ib.for_range(0, box_data_length) as k: out[(base_idx + (j + nkeep) * box_data_length + k)] = -1.0 box_indices[i * num_anchors + (j + nkeep)] = -1 with ib.new_scope(): nthread_by = batch_size by = te.thread_axis("blockIdx.y") ib.scope_attr(by, "thread_extent", nthread_by) i = by base_idx = i * num_anchors * box_data_length with ib.if_scope(tvm.tir.all(iou_threshold > 0, valid_count[i] > 0)): # Apply nms with ib.for_range(0, valid_count[i]) as j: with ib.for_range(0, j) as k: offset_k = k * box_data_length with ib.if_scope( tvm.tir.all( out[base_idx + offset_k + score_index] > 0, tvm.tir.any(id_index < 0, out[base_idx + offset_k + id_index] >= 0), ) ): offset_j = j * box_data_length with ib.if_scope( tvm.tir.all( j > k, out[base_idx + offset_k + score_index] > 0, tvm.tir.any(id_index < 0, out[base_idx + offset_j + id_index] >= 0), tvm.tir.any( force_suppress > 0, id_index < 0, out[base_idx + offset_k + id_index] == out[base_idx + offset_j + id_index], ), ) ): iou = calculate_overlap( out, base_idx + offset_j + coord_start, base_idx + offset_k + coord_start, ) with ib.if_scope(iou >= iou_threshold): out[base_idx + offset_j + score_index] = -1.0 with ib.if_scope(id_index >= 0): out[base_idx + offset_j + id_index] = -1.0 box_indices[i * num_anchors + j] = -1 with ib.new_scope(): nthread_tx = max_threads nthread_bx = num_anchors // max_threads + 1 nthread_by = batch_size nthread_bz = box_data_length tx = te.thread_axis("threadIdx.x") bx = te.thread_axis("blockIdx.x") by = te.thread_axis("blockIdx.y") bz = te.thread_axis("blockIdx.z") ib.scope_attr(tx, "thread_extent", nthread_tx) ib.scope_attr(bx, "thread_extent", nthread_bx) ib.scope_attr(by, "thread_extent", nthread_by) ib.scope_attr(bz, "thread_extent", nthread_bz) tid = bx * max_threads + tx i = by j = tid k = bz base_idx = i * num_anchors * box_data_length with ib.if_scope(tvm.tir.all(iou_threshold > 0, valid_count[i] > 0)): pass with ib.else_scope(): with ib.if_scope(j < valid_count[i]): offset_j = j * box_data_length out[(base_idx + offset_j + k)] = data[base_idx + offset_j + k] box_indices[i * num_anchors + j] = j with ib.new_scope(): num_valid_boxes = ib.allocate("int32", (1,), name="num_valid_boxes", scope="local") bx = te.thread_axis("blockIdx.x") ib.scope_attr(bx, "thread_extent", batch_size) i = bx base_idx = i * num_anchors * box_data_length # Set invalid entry to be -1 with ib.for_range(0, num_anchors - valid_count[i]) as j: with ib.for_range(0, box_data_length) as k: out[base_idx + (j + valid_count[i]) * box_data_length + k] = -1.0 box_indices[i * num_anchors + j + valid_count[i]] = -1 # Only return max_output_size number of valid boxes num_valid_boxes[0] = 0 with ib.if_scope(max_output_size > 0): with ib.for_range(0, valid_count[i]) as j: offset_j = j * box_data_length with ib.if_scope(out[base_idx + offset_j] >= 0): with ib.if_scope(num_valid_boxes[0] == max_output_size): with ib.for_range(0, box_data_length) as k: out[base_idx + offset_j + k] = -1.0 box_indices[i * num_anchors + j] = -1 with ib.else_scope(): num_valid_boxes[0] += 1 if return_indices: with ib.new_scope(): nthread_tx = max_threads nthread_bx = batch_size // max_threads + 1 tx = te.thread_axis("threadIdx.x") bx = te.thread_axis("blockIdx.x") ib.scope_attr(tx, "thread_extent", nthread_tx) ib.scope_attr(bx, "thread_extent", nthread_bx) i = bx * max_threads + tx with ib.if_scope(i < batch_size): with ib.for_range(0, valid_count[i]) as j: idx = box_indices[i * num_anchors + j] with ib.if_scope(idx >= 0): box_indices[i * num_anchors + j] = indices[i * num_anchors + idx] return ib.get()
def apply(condition, true_expr, false_expr): return tir.if_then_else(tir.Cast('bool', _clamp_tvm(condition, 0, 1)), true_expr, false_expr)
def nms_ir( data, sorted_index, valid_count, indices, out_bboxes, out_scores, out_class_ids, box_indices, num_valid_boxes, max_output_size, iou_threshold, force_suppress, top_k, coord_start, id_index, score_index, return_indices, ): """Low level IR routing for transform location in multibox_detection operator. Parameters ---------- data : Buffer Buffer of output boxes with class and score. sorted_index : Buffer Buffer of output box indexes sorted by score. valid_count : Buffer Buffer of number of valid output boxes. indices : Buffer indices in original tensor, with shape [batch_size, num_anchors], represents the index of box in original data. It could be the third output out_indices of get_valid_counts. The values in the second dimension are like the output of arange(num_anchors) if get_valid_counts is not used before non_max_suppression. out_bboxes : Buffer Output buffer, to be filled with sorted box coordinates. out_scores : Buffer Output buffer, to be filled with sorted scores. out_class_ids : Buffer Output buffer, to be filled with sorted class ids. box_indices : Buffer A indices tensor mapping sorted indices to original indices This is the first output of NMS when return_indices=True. num_valid_boxes : Buffer Record the number of boxes that have survived IOU tests. This is the second output of NMS when return_indices=True. max_output_size : int Max number of output valid boxes for each instance. By default all valid boxes are returned. iou_threshold : float Overlapping(IoU) threshold to suppress object with smaller score. force_suppress : boolean Whether to suppress all detections regardless of class_id. top_k : int Keep maximum top k detections before nms, -1 for no limit. coord_start : int Start index of the consecutive 4 coordinates. id_index : int index of the class categories, -1 to disable. score_index : optional, int Index of the scores/confidence of boxes. return_indices : boolean Whether to return box indices in input data. Returns ------- stmt : Stmt The result IR statement. """ def get_boundaries(output, box_idx): l = tvm.te.min( output[box_idx], output[box_idx + 2], ) t = tvm.te.min( output[box_idx + 1], output[box_idx + 3], ) r = tvm.te.max( output[box_idx], output[box_idx + 2], ) b = tvm.te.max( output[box_idx + 1], output[box_idx + 3], ) return l, t, r, b def calculate_overlap(out_tensor, box_a_idx, box_b_idx): """Calculate overlap of two boxes.""" a_l, a_t, a_r, a_b = get_boundaries(out_tensor, box_a_idx) b_l, b_t, b_r, b_b = get_boundaries(out_tensor, box_b_idx) # Overlapping width and height w = tvm.te.max(0.0, tvm.te.min(a_r, b_r) - tvm.te.max(a_l, b_l)) h = tvm.te.max(0.0, tvm.te.min(a_b, b_b) - tvm.te.max(a_t, b_t)) # Overlapping area area = h * w # total area of the figure formed by box a and box b # except for overlapping area u = (a_r - a_l) * (a_b - a_t) + (b_r - b_l) * (b_b - b_t) - area return tvm.tir.Select(u <= 0.0, 0.0, area / u) batch_size = data.shape[0] num_anchors = data.shape[1] box_data_length = data.shape[2] ib = tvm.tir.ir_builder.create() data = ib.buffer_ptr(data) sorted_index = ib.buffer_ptr(sorted_index) valid_count = ib.buffer_ptr(valid_count) indices = ib.buffer_ptr(indices) # outputs out_bboxes = ib.buffer_ptr(out_bboxes) out_scores = ib.buffer_ptr(out_scores) out_class_ids = ib.buffer_ptr(out_class_ids) box_indices = ib.buffer_ptr(box_indices) num_valid_boxes = ib.buffer_ptr(num_valid_boxes) if isinstance(iou_threshold, float): iou_threshold = tvm.tir.FloatImm("float32", iou_threshold) top_k = tvm.tir.IntImm("int32", top_k) coord_start = tvm.tir.IntImm("int32", coord_start) id_index = tvm.tir.IntImm("int32", id_index) score_index = tvm.tir.IntImm("int32", score_index) force_suppress = tvm.tir.IntImm("int32", 1 if force_suppress else 0) max_threads = int( tvm.target.Target.current(allow_none=False).max_num_threads) with ib.new_scope(): nthread_tx = max_threads nthread_bx = ceil_div(num_anchors, max_threads) nthread_by = batch_size tx = te.thread_axis("threadIdx.x") bx = te.thread_axis("blockIdx.x") by = te.thread_axis("blockIdx.y") ib.scope_attr(by, "thread_extent", nthread_by) ib.scope_attr(tx, "thread_extent", nthread_tx) ib.scope_attr(bx, "thread_extent", nthread_bx) i = by base_src_idx = i * num_anchors * box_data_length base_bbox_idx = i * num_anchors * 4 with ib.if_scope(tvm.tir.all(iou_threshold > 0, valid_count[i] > 0)): # Reorder output nkeep = if_then_else( tvm.tir.all(top_k > 0, top_k < valid_count[i]), top_k, valid_count[i]) j = bx * max_threads + tx with ib.if_scope(j < nkeep): src_idx = base_src_idx + sorted_index[i * num_anchors + j] * box_data_length with ib.for_range(0, 4, kind="unroll") as k: out_bboxes[(base_bbox_idx + j * 4 + k)] = data[src_idx + coord_start + k] out_scores[i * num_anchors + j] = data[src_idx + score_index] if id_index >= 0: out_class_ids[i * num_anchors + j] = data[src_idx + id_index] with ib.else_scope(): # Indices > nkeep are discarded # Only needed for return_indices = False case if return_indices is False: with ib.if_scope(j < num_anchors): with ib.for_range(0, 4, kind="unroll") as k: out_bboxes[(base_bbox_idx + j * 4 + k)] = -1.0 out_scores[i, j] = -1.0 if id_index >= 0: out_class_ids[i, j] = -1.0 if return_indices: with ib.if_scope(j < num_anchors): box_indices[i * num_anchors + j] = -1 with ib.else_scope(): with ib.if_scope(j < valid_count[i]): src_offset = base_src_idx + j * box_data_length with ib.for_range(0, 4, kind="unroll") as k: out_bboxes[base_bbox_idx + j * 4 + k] = data[src_offset + coord_start + k] out_scores[i * num_anchors + j] = data[src_offset + score_index] if id_index >= 0: out_class_ids[i * num_anchors + j] = data[src_offset + id_index] box_indices[i * num_anchors + j] = j with ib.new_scope(): nthread_by = batch_size nthread_tx = max_threads by = te.thread_axis("blockIdx.y") tx = te.thread_axis("threadIdx.x") ib.scope_attr(by, "thread_extent", nthread_by) ib.scope_attr(tx, "thread_extent", nthread_tx) i = by base_bbox_idx = i * num_anchors * 4 num_valid_boxes_local = ib.allocate("int32", (1, ), name="num_valid_boxes_local", scope="local") num_valid_boxes_local[0] = 0 nkeep = if_then_else(tvm.tir.all(top_k > 0, top_k < valid_count[i]), top_k, valid_count[i]) def nms_inner_loop(ib, j): # The box j is valid, invalidate other boxes that overlap with j above iou_threshold # When return_indices is False, no need to populate box_indices if return_indices: with ib.if_scope(tx + 0 == 0): orig_idx = sorted_index[i * num_anchors + j] box_indices[i, num_valid_boxes_local[0]] = indices[i, orig_idx] num_valid_boxes_local[0] += 1 offset_j = j * 4 num_iter_per_thread = ceil_div(nkeep - (j + 1), nthread_tx) with ib.for_range(0, num_iter_per_thread) as _k: k = j + 1 + _k * nthread_tx + tx offset_k = k * 4 with ib.if_scope( tvm.tir.all( k < nkeep, out_scores[i, k] > 0, # is the box k still valid? tvm.tir.any( force_suppress > 0, id_index < 0, out_class_ids[i, k] == out_class_ids[i, j], ), )): iou = calculate_overlap( out_bboxes, base_bbox_idx + offset_j, base_bbox_idx + offset_k, ) with ib.if_scope(iou >= iou_threshold): # invalidate the box k out_scores[i, k] = -1.0 if return_indices is False and id_index >= 0: out_class_ids[i, k] = -1.0 ib.emit( tvm.tir.Call(None, "tir.tvm_storage_sync", tvm.runtime.convert(["shared"]))) if isinstance(max_output_size, int): max_output_size = tvm.tir.const(max_output_size) with ib.if_scope(tvm.tir.all(iou_threshold > 0, valid_count[i] > 0)): # Apply nms with ib.for_range(0, nkeep) as j: # Proceed to the inner loop if the box j is still valid with ib.if_scope(out_scores[i, j] > -1.0): with ib.if_scope(max_output_size > 0): # No need to do more iteration if we have already reached max_output_size # boxes # TODO(masahi): Add TIR while loop to realize early exit from the outer loop with ib.if_scope( num_valid_boxes_local[0] < max_output_size): nms_inner_loop(ib, j) with ib.else_scope(): nms_inner_loop(ib, j) with ib.if_scope(tx + 0 == 0): num_valid_boxes[i] = num_valid_boxes_local[0] with ib.else_scope(): num_valid_boxes[i] = 0 return ib.get()
def nms_ir(data, sorted_index, valid_count, out, box_indices, max_output_size, iou_threshold, force_suppress, top_k, coord_start, id_index, score_index): """Low level IR routing for transform location in multibox_detection operator. Parameters ---------- data : Buffer Buffer of output boxes with class and score. sort_index : Buffer Buffer of output box indexes sorted by score. valid_count : Buffer Buffer of number of valid output boxes. out : Buffer Output buffer. max_output_size : int Max number of output valid boxes for each instance. By default all valid boxes are returned. iou_threshold : float Overlapping(IoU) threshold to suppress object with smaller score. force_suppress : boolean Whether to suppress all detections regardless of class_id. top_k : int Keep maximum top k detections before nms, -1 for no limit. coord_start : int Start index of the consecutive 4 coordinates. id_index : int index of the class categories, -1 to disable. score_index : optional, int Index of the scores/confidence of boxes. Returns ------- stmt : Stmt The result IR statement. """ def calculate_overlap(out_tensor, box_a_idx, box_b_idx): """Calculate overlap of two boxes. """ w = tvm.te.max( 0.0, tvm.te.min(out_tensor[box_a_idx + 2], out_tensor[box_b_idx + 2]) - tvm.te.max(out_tensor[box_a_idx], out_tensor[box_b_idx])) h = tvm.te.max( 0.0, tvm.te.min(out_tensor[box_a_idx + 3], out_tensor[box_b_idx + 3]) - tvm.te.max(out_tensor[box_a_idx + 1], out_tensor[box_b_idx + 1])) i = w * h u = (out_tensor[box_a_idx + 2] - out_tensor[box_a_idx]) * \ (out_tensor[box_a_idx + 3] - out_tensor[box_a_idx + 1]) + \ (out_tensor[box_b_idx + 2] - out_tensor[box_b_idx]) * \ (out_tensor[box_b_idx + 3] - out_tensor[box_b_idx + 1]) - i return tvm.tir.Select(u <= 0.0, 0.0, i / u) batch_size = data.shape[0] num_anchors = data.shape[1] box_data_length = data.shape[2] ib = tvm.tir.ir_builder.create() data = ib.buffer_ptr(data) sorted_index = ib.buffer_ptr(sorted_index) valid_count = ib.buffer_ptr(valid_count) out = ib.buffer_ptr(out) box_indices = ib.buffer_ptr(box_indices) num_valid_boxes = ib.allocate("int32", (1, ), name="num_valid_boxes", scope="local") max_threads = int( tvm.target.Target.current(allow_none=False).max_num_threads) nthread_tx = max_threads nthread_bx = num_anchors // max_threads + 1 tx = te.thread_axis("threadIdx.x") bx = te.thread_axis("blockIdx.x") ib.scope_attr(tx, "thread_extent", nthread_tx) ib.scope_attr(bx, "thread_extent", nthread_bx) j = bx * max_threads + tx iou_threshold = tvm.ir.make_node("FloatImm", dtype="float32", value=iou_threshold) top_k = tvm.ir.make_node("IntImm", dtype="int32", value=top_k) coord_start = tvm.ir.make_node("IntImm", dtype="int32", value=coord_start) id_index = tvm.ir.make_node("IntImm", dtype="int32", value=id_index) score_index = tvm.ir.make_node("IntImm", dtype="int32", value=score_index) force_suppress = tvm.ir.make_node("IntImm", dtype="int32", value=1 if force_suppress else 0) with ib.for_range(0, batch_size, for_type="unroll") as i: base_idx = i * num_anchors * box_data_length with ib.if_scope(tvm.tir.all(iou_threshold > 0, valid_count[i] > 0)): # Reorder output nkeep = if_then_else( tvm.tir.all(top_k > 0, top_k < valid_count[i]), top_k, valid_count[i]) with ib.if_scope(j < nkeep): with ib.for_range(0, box_data_length) as k: out[(base_idx + j * box_data_length + k)] = \ data[(base_idx + sorted_index[i * num_anchors + j] * box_data_length + k)] box_indices[i * num_anchors + j] = sorted_index[i * num_anchors + j] with ib.if_scope(tvm.tir.all(top_k > 0, top_k < valid_count[i])): with ib.if_scope(j < valid_count[i] - nkeep): with ib.for_range(0, box_data_length) as k: out[(base_idx + (j + nkeep) * box_data_length + k)] = -1.0 box_indices[i * num_anchors + (j + nkeep)] = -1 # Apply nms with ib.for_range(0, valid_count[i]) as k: offset_k = k * box_data_length with ib.if_scope( tvm.tir.all( out[base_idx + offset_k + score_index] > 0, tvm.tir.any( id_index < 0, out[base_idx + offset_k + id_index] >= 0))): with ib.if_scope(j < valid_count[i]): offset_j = j * box_data_length with ib.if_scope( tvm.tir.all( j > k, out[base_idx + offset_j + score_index] > 0, tvm.tir.any( id_index < 0, out[base_idx + offset_j + id_index] >= 0), tvm.tir.any( force_suppress > 0, id_index < 0, out[base_idx + offset_k + id_index] == out[base_idx + offset_j + id_index]))): iou = calculate_overlap( out, base_idx + offset_j + coord_start, base_idx + offset_k + coord_start) with ib.if_scope(iou >= iou_threshold): out[base_idx + offset_j + score_index] = -1.0 with ib.if_scope(id_index >= 0): out[base_idx + offset_j + id_index] = -1.0 box_indices[i * num_anchors + j] = -1 with ib.else_scope(): with ib.if_scope(j < valid_count[i]): offset_j = j * box_data_length with ib.for_range(0, box_data_length) as k: out[(base_idx + offset_j + k)] = data[base_idx + offset_j + k] box_indices[i * num_anchors + j] = j # Set invalid entry to be -1 with ib.if_scope(j < num_anchors - valid_count[i]): with ib.for_range(0, box_data_length) as k: out[base_idx + (j + valid_count[i]) * box_data_length + k] = -1.0 box_indices[i * num_anchors + j + valid_count[i]] = -1 # Only return max_output_size number of valid boxes num_valid_boxes[0] = 0 with ib.if_scope(max_output_size > 0): with ib.if_scope(j < valid_count[i]): offset_j = j * box_data_length with ib.if_scope(out[base_idx + offset_j] >= 0): with ib.if_scope(num_valid_boxes[0] == max_output_size): with ib.for_range(0, box_data_length) as k: out[base_idx + offset_j + k] = -1.0 box_indices[i * num_anchors + j] = -1 with ib.else_scope(): num_valid_boxes[0] += 1 return ib.get()
def primfunc_local_allocates(placeholder_162: ty.handle, placeholder_163: ty.handle, placeholder_164: ty.handle, T_cast_76: ty.handle) -> None: # function attr dict tir.func_attr({ "global_symbol": "fused_nn_conv2d_add_cast_fixed_point_multiply_clip_cast_cast_9", "tir.noalias": True }) placeholder_165 = tir.match_buffer(placeholder_162, [1, 14, 14, 512], dtype="int16", elem_offset=0, align=128, offset_factor=1) placeholder_166 = tir.match_buffer(placeholder_163, [3, 3, 512, 1], dtype="int16", elem_offset=0, align=128, offset_factor=1) placeholder_167 = tir.match_buffer(placeholder_164, [1, 1, 1, 512], dtype="int32", elem_offset=0, align=128, offset_factor=1) T_cast_77 = tir.match_buffer(T_cast_76, [1, 14, 14, 512], dtype="int16", elem_offset=0, align=128, offset_factor=1) # body PaddedInput_25 = tir.allocate([1, 16, 16, 512], "int16", "global") for i1_35, i2_46, i3_47 in tir.grid(16, 16, 512): PaddedInput_25[(((i1_35 * 8192) + (i2_46 * 512)) + i3_47)] = tir.if_then_else( ((((1 <= i1_35) and (i1_35 < 15)) and (1 <= i2_46)) and (i2_46 < 15)), tir.load("int16", placeholder_165.data, ((((i1_35 * 7168) + (i2_46 * 512)) + i3_47) - 7680)), tir.int16(0), dtype="int16") T_add_11 = tir.allocate([1, 14, 14, 512], "int32", "global") with tir.allocate([1, 14, 14, 512], "int32", "global") as DepthwiseConv2d_11: for i_11, j_11, c_11 in tir.grid(14, 14, 512): DepthwiseConv2d_11[(((i_11 * 7168) + (j_11 * 512)) + c_11)] = 0 for di_11, dj_11 in tir.grid(3, 3): DepthwiseConv2d_11[(((i_11 * 7168) + (j_11 * 512)) + c_11)] = ( tir.load("int32", DepthwiseConv2d_11, (((i_11 * 7168) + (j_11 * 512)) + c_11)) + (tir.load("int16", PaddedInput_25, (((((i_11 * 8192) + (di_11 * 8192)) + (j_11 * 512)) + (dj_11 * 512)) + c_11)).astype("int32") * tir.load("int16", placeholder_166.data, (((di_11 * 1536) + (dj_11 * 512)) + c_11)).astype("int32"))) for ax1_44, ax2_45, ax3_47 in tir.grid(14, 14, 512): T_add_11[(((ax1_44 * 7168) + (ax2_45 * 512)) + ax3_47)] = ( tir.load("int32", DepthwiseConv2d_11, (((ax1_44 * 7168) + (ax2_45 * 512)) + ax3_47)) + tir.load("int32", placeholder_167.data, ax3_47)) compute_22 = tir.allocate([1, 14, 14, 512], "int32", "global") with tir.allocate([1, 14, 14, 512], "int32", "global") as T_cast_78: for ax1_45, ax2_46, ax3_48 in tir.grid(14, 14, 512): T_cast_78[(((ax1_45 * 7168) + (ax2_46 * 512)) + ax3_48)] = tir.load( "int32", T_add_11, (((ax1_45 * 7168) + (ax2_46 * 512)) + ax3_48)) for i1_36, i2_47, i3_48 in tir.grid(14, 14, 512): compute_22[(((i1_36 * 7168) + (i2_47 * 512)) + i3_48)] = tir.q_multiply_shift(tir.load( "int32", T_cast_78, (((i1_36 * 7168) + (i2_47 * 512)) + i3_48)), 1948805937, 31, -5, dtype="int32") T_cast_79 = tir.allocate([1, 14, 14, 512], "uint8", "global") with tir.allocate([1, 14, 14, 512], "int32", "global") as compute_23: for i1_37, i2_48, i3_49 in tir.grid(14, 14, 512): compute_23[(((i1_37 * 7168) + (i2_48 * 512)) + i3_49)] = tir.max( tir.max( tir.load("int32", compute_22, (((i1_37 * 7168) + (i2_48 * 512)) + i3_49)), 255), 0) for ax1_46, ax2_47, ax3_49 in tir.grid(14, 14, 512): T_cast_79[(((ax1_46 * 7168) + (ax2_47 * 512)) + ax3_49)] = tir.load( "int32", compute_23, (((ax1_46 * 7168) + (ax2_47 * 512)) + ax3_49)).astype("uint8") for ax1_47, ax2_48, ax3_50 in tir.grid(14, 14, 512): T_cast_77.data[(((ax1_47 * 7168) + (ax2_48 * 512)) + ax3_50)] = tir.load( "uint8", T_cast_79, (((ax1_47 * 7168) + (ax2_48 * 512)) + ax3_50)).astype("int16")
def multibox_prior_ir(data, out, sizes, ratios, steps, offsets): """Low level IR routing for multibox_prior operator. Parameters ---------- data : Buffer Input data buffer. out : Buffer Output buffer. sizes : tuple of float Tuple of sizes for anchor boxes. ratios : tuple of float Tuple of ratios for anchor boxes. steps : Tuple of float Priorbox step across y and x, -1 for auto calculation. offsets : tuple of int Priorbox center offsets, y and x respectively. Returns ------- stmt : Stmt The result IR statement. """ max_threads = int( math.sqrt(tvm.target.Target.current(allow_none=False).max_num_threads)) tx = te.thread_axis("threadIdx.x") ty = te.thread_axis("threadIdx.y") bx = te.thread_axis("blockIdx.x") by = te.thread_axis("blockIdx.y") ib = tvm.tir.ir_builder.create() p_out = ib.buffer_ptr(out) in_height = data.shape[2] in_width = data.shape[3] nthread_tx = max_threads nthread_bx = in_height // max_threads + 1 nthread_ty = max_threads nthread_by = in_width // max_threads + 1 ib.scope_attr(tx, "thread_extent", nthread_tx) ib.scope_attr(ty, "thread_extent", nthread_ty) ib.scope_attr(bx, "thread_extent", nthread_bx) ib.scope_attr(by, "thread_extent", nthread_by) num_sizes = len(sizes) num_ratios = len(ratios) size_ratio_concat = sizes + ratios steps_h = steps[0] if steps[0] > 0 else 1.0 / in_height steps_w = steps[1] if steps[1] > 0 else 1.0 / in_width offset_h = offsets[0] offset_w = offsets[1] i = bx * max_threads + tx j = by * max_threads + ty with ib.if_scope((i < in_height)): with ib.if_scope((j < in_width)): center_h = (i + offset_h) * steps_h center_w = (j + offset_w) * steps_w for k in range(num_sizes + num_ratios - 1): w = if_then_else( k < num_sizes, float(size_ratio_concat[k]) * in_height / in_width / 2.0, float(size_ratio_concat[0]) * in_height / in_width * math.sqrt(size_ratio_concat[k + 1]) / 2.0, ) h = if_then_else( k < num_sizes, size_ratio_concat[k] / 2.0, size_ratio_concat[0] / math.sqrt(size_ratio_concat[k + 1]) / 2.0, ) count = (i * in_width * (num_sizes + num_ratios - 1) + j * (num_sizes + num_ratios - 1) + k) * 4 p_out[count] = center_w - w p_out[count + 1] = center_h - h p_out[count + 2] = center_w + w p_out[count + 3] = center_h + h body = ib.get() return body
def primfunc_global_allocates(placeholder_144: ty.handle, placeholder_145: ty.handle, placeholder_146: ty.handle, T_cast_48: ty.handle) -> None: # function attr dict tir.func_attr({ "global_symbol": "fused_nn_conv2d_add_cast_fixed_point_multiply_clip_cast_cast_13", "tir.noalias": True }) placeholder_147 = tir.match_buffer(placeholder_144, [1, 14, 14, 512], dtype="int16", elem_offset=0, align=128, offset_factor=1) placeholder_148 = tir.match_buffer(placeholder_145, [3, 3, 512, 1], dtype="int16", elem_offset=0, align=128, offset_factor=1) placeholder_149 = tir.match_buffer(placeholder_146, [1, 1, 1, 512], dtype="int32", elem_offset=0, align=128, offset_factor=1) T_cast_49 = tir.match_buffer(T_cast_48, [1, 14, 14, 512], dtype="int16", elem_offset=0, align=128, offset_factor=1) # body PaddedInput_22 = tir.allocate([131072], "int16", "global") DepthwiseConv2d_9 = tir.allocate([100352], "int32", "global") for i1_29, i2_39, i3_40 in tir.grid(16, 16, 512): PaddedInput_22[(((i1_29 * 8192) + (i2_39 * 512)) + i3_40)] = tir.if_then_else( ((((1 <= i1_29) and (i1_29 < 15)) and (1 <= i2_39)) and (i2_39 < 15)), tir.load("int16", placeholder_147.data, ((((i1_29 * 7168) + (i2_39 * 512)) + i3_40) - 7680)), tir.int16(0), dtype="int16") for i_9, j_9, c_9 in tir.grid(14, 14, 512): DepthwiseConv2d_9[(((i_9 * 7168) + (j_9 * 512)) + c_9)] = 0 for di_9, dj_9 in tir.grid(3, 3): DepthwiseConv2d_9[(((i_9 * 7168) + (j_9 * 512)) + c_9)] = ( tir.load("int32", DepthwiseConv2d_9, (((i_9 * 7168) + (j_9 * 512)) + c_9)) + (tir.load("int16", PaddedInput_22, (((((i_9 * 8192) + (di_9 * 8192)) + (j_9 * 512)) + (dj_9 * 512)) + c_9)).astype("int32") * tir.load("int16", placeholder_148.data, (((di_9 * 1536) + (dj_9 * 512)) + c_9)).astype("int32"))) for ax1_27, ax2_28, ax3_30 in tir.grid(14, 14, 512): DepthwiseConv2d_9[(((ax1_27 * 7168) + (ax2_28 * 512)) + ax3_30)] = ( tir.load("int32", DepthwiseConv2d_9, (((ax1_27 * 7168) + (ax2_28 * 512)) + ax3_30)) + tir.load("int32", placeholder_149.data, ax3_30)) for i1_30, i2_40, i3_41 in tir.grid(14, 14, 512): DepthwiseConv2d_9[(((i1_30 * 7168) + (i2_40 * 512)) + i3_41)] = tir.q_multiply_shift(tir.load( "int32", DepthwiseConv2d_9, (((i1_30 * 7168) + (i2_40 * 512)) + i3_41)), 1269068532, 31, -4, dtype="int32") for i1_31, i2_41, i3_42 in tir.grid(14, 14, 512): DepthwiseConv2d_9[(((i1_31 * 7168) + (i2_41 * 512)) + i3_42)] = tir.max( tir.max( tir.load("int32", DepthwiseConv2d_9, (((i1_31 * 7168) + (i2_41 * 512)) + i3_42)), 255), 0) for ax1_28, ax2_29, ax3_31 in tir.grid(14, 14, 512): PaddedInput_22[(((ax1_28 * 7168) + (ax2_29 * 512)) + ax3_31)] = tir.load( "int32", DepthwiseConv2d_9, (((ax1_28 * 7168) + (ax2_29 * 512)) + ax3_31)).astype("uint8") for ax1_29, ax2_30, ax3_32 in tir.grid(14, 14, 512): T_cast_49.data[(((ax1_29 * 7168) + (ax2_30 * 512)) + ax3_32)] = tir.load( "uint8", PaddedInput_22, (((ax1_29 * 7168) + (ax2_30 * 512)) + ax3_32)).astype("int16")