def test_const_fold3():
    # Test that using ints with logic operations is forbidden
    x = tvm.var("x")
    for val in [0, 1]:
        for func in [tvm.all, tvm.any]:
            check_throws(lambda: func(tvm.const(val, 'uint1'), x))
            check_throws(lambda: func(x, tvm.const(val, 'uint1')))

    # Test const folding when both arguments are const
    for tvm_func, py_func in [(tvm.all, lambda a, b: a and b),
                              (tvm.any, lambda a, b: a or b)]:
        for v1 in [0, 1]:
            for v2 in [0, 1]:
                assert tvm.ir_pass.Equal(
                    tvm_func(tvm.const(v1, 'uint1'), tvm.const(v2, 'uint1')),
                    tvm.const(py_func(v1, v2), 'uint1'))

    x = tvm.var("x", 'uint1')
    true = tvm.const(1, 'uint1')
    false = tvm.const(0, 'uint1')

    assert tvm.all(x, true).same_as(x)
    assert tvm.all(true, x).same_as(x)
    assert tvm.any(x, false).same_as(x)
    assert tvm.any(false, x).same_as(x)

    assert tvm.all(x, false).same_as(false)
    assert tvm.all(false, x).same_as(false)
    assert tvm.any(x, true).same_as(true)
    assert tvm.any(true, x).same_as(true)

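The assertions above double as a quick reference for how `tvm.any` and `tvm.all` fold `uint1` constants: an identity operand (Or with 0, And with 1) returns the other operand, and an absorbing operand (Or with 1, And with 0) returns the constant. A minimal interactive sketch of the same behavior, assuming the pre-1.0 TVM API used by this test:

import tvm

x = tvm.var("x", 'uint1')
true = tvm.const(1, 'uint1')
false = tvm.const(0, 'uint1')

# Identity operand folds away: the result is the other operand itself.
print(tvm.any(x, false))   # same expression object as x
print(tvm.all(x, true))    # same expression object as x

# Absorbing operand folds to a constant.
print(tvm.any(x, true))    # uint1 constant 1
print(tvm.all(x, false))   # uint1 constant 0
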
def test_attr_stmt():
    ib = tvm.ir_builder.create()
    dshape = (32, 64)
    data = ib.pointer("float32", name="data")
    l = tvm.var('l')
    m = tvm.var('m')
    n = tvm.var('n')

    tx = tvm.thread_axis("threadIdx.x")
    bx = tvm.thread_axis("blockIdx.x")
    ib.scope_attr(tx, "thread_extent", dshape[0])
    ib.scope_attr(bx, "thread_extent", dshape[1])
    with ib.for_range(0, l, "i") as i:
        with ib.for_range(0, m, "j") as j:
            with ib.for_range(0, n, "k") as k:
                with ib.if_scope(tvm.any(i < 4, j >= 8)):
                    data[bx * j + tx * j * k] = data[bx * j + tx * j * k] + 0.5
                with ib.else_scope():
                    data[bx * j + tx * j * k] = data[bx * j + tx * j * k] + 1.0

    stmt = ib.get()
    new_stmt = tvm.ir_pass.HoistIfThenElse(stmt)
    expected_struct = {
        ('For', 'k'): (None,),
        ('IfThenElse', ('i', 'j')): (('For', 'k'), ('For', 'k')),
        ('For', 'j'): (('IfThenElse', ('i', 'j')),),
        ('For', 'i'): (('For', 'j'),),
        ('AttrStmt', 'thread_extent', 64): (('For', 'i'),),
        ('AttrStmt', 'thread_extent', 32): (('AttrStmt', 'thread_extent', 64),)}
    verify_structure(new_stmt, expected_struct)

def test_nested_for():
    ib = tvm.ir_builder.create()
    data = ib.pointer("float32", name="data")

    with ib.for_range(0, 5, "i") as i:
        with ib.for_range(0, 10, "j") as j:
            with ib.if_scope(i >= 3):
                data[i * 3 + j] = data[i * 3 + j] + 0.5
                with ib.for_range(0, 15, "k") as k:
                    with ib.for_range(0, 20, "l") as l:
                        with ib.if_scope(tvm.any(i < 4, j >= 8)):
                            data[i * 3 + j + k + l] = data[i * 3 + j + k + l] * 2
                        with ib.else_scope():
                            data[i * 3 + j + k + l] = data[i * 3 + j + k + l] * 1.5

    stmt = ib.get()
    new_stmt = tvm.ir_pass.HoistIfThenElse(stmt)
    expected_struct = {
        ('IfThenElse', ('i', 'j')): (None, None),
        ('For', 'l'): (('IfThenElse', ('i', 'j')),),
        ('For', 'k'): (('For', 'l'),),
        ('For', 'j'): (None,),
        ('IfThenElse', ('i',)): (('For', 'j'), None),
        ('For', 'i'): (('IfThenElse', ('i',)),)}
    verify_structure(new_stmt, expected_struct)

def padding(X, ph, pw):
    assert len(X.shape) >= 2
    nh, nw = X.shape[-2:]
    return tvm.compute(
        (*X.shape[:-2], nh + ph * 2, nw + pw * 2),
        lambda *i: tvm.if_then_else(
            tvm.any(i[-2] < ph, i[-2] >= nh + ph, i[-1] < pw, i[-1] >= nw + pw),
            0, X[i[:-2] + (i[-2] - ph, i[-1] - pw)]),
        name='PaddedX')

def test_any():
    x = tvm.var('x')
    y = tvm.var('y')
    z = tvm.var('z')
    try:
        t = x or x
        assert False
    except ValueError:
        pass
    try:
        tvm.any()
        assert False
    except ValueError:
        pass
    assert str(tvm.any(x < y)) == '(%s < %s)' % (x.name, y.name)
    assert str(tvm.any(x < y, x > z)) == '((%s < %s) || (%s > %s))' % (
        x.name, y.name, x.name, z.name)
    assert str(tvm.any(x < y, y > z + 1, x < z * 2)) == \
        '(((%s < %s) || (%s > (%s + 1))) || (%s < (%s*2)))' % (
            x.name, y.name, y.name, z.name, x.name, z.name)

def padding(X, ph, pw):
    """Pad X with 0s in 2-D

    ph, pw : height and width padding
    """
    assert len(X.shape) >= 2
    nh, nw = X.shape[-2], X.shape[-1]
    return tvm.compute(
        (*X.shape[0:-2], nh + ph * 2, nw + pw * 2),
        lambda *i: tvm.if_then_else(
            tvm.any(i[-2] < ph, i[-2] >= nh + ph, i[-1] < pw, i[-1] >= nw + pw),
            0, X[i[:-2] + (i[-2] - ph, i[-1] - pw)]),
        name='PaddedX')

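A minimal driver for the helper above, kept as a sketch: it assumes the same pre-1.0 TVM API used in the other examples here and an arbitrary 3x4 float32 input, and simply builds and runs the padded compute.

import numpy as np
import tvm

# Pad a 3x4 matrix of ones by 1 on each side using padding() defined above.
A = tvm.placeholder((3, 4), name='A')
B = padding(A, 1, 1)            # output shape becomes (5, 6)

s = tvm.create_schedule(B.op)
mod = tvm.build(s, [A, B])

a = tvm.nd.array(np.ones((3, 4), dtype='float32'))
b = tvm.nd.array(np.empty((5, 6), dtype='float32'))
mod(a, b)
print(b)                        # ones surrounded by a ring of zeros
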
def conv_compute(n, m, h_, w_):
    x = h_ * stride - padding + r
    y = w_ * stride - padding + s
    return tvm.sum(
        tvm.select(
            tvm.any(
                x < 0,
                y < 0,
                x >= H,
                y >= W,
            ),
            tvm.const(0, dtype),  # padding
            data[n, c, x, y] * filters[m, c, r, s]
        ),
        axis=[c, r, s]
    )

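The body above only shows the inner compute; the names data, filters, stride, padding, H, W, c, r, s and dtype come from an enclosing scope that is not included. A sketch of what such a scope might look like, with every concrete value and name below being an assumption chosen only for illustration:

import tvm

# Hypothetical enclosing scope for conv_compute() above.
N, C, H, W = 1, 3, 32, 32      # batch, in-channels, height, width
M, K = 8, 3                    # out-channels, square kernel size
stride, padding = 1, 1
dtype = 'float32'

data = tvm.placeholder((N, C, H, W), dtype=dtype, name='data')
filters = tvm.placeholder((M, C, K, K), dtype=dtype, name='filters')
c = tvm.reduce_axis((0, C), name='c')
r = tvm.reduce_axis((0, K), name='r')
s = tvm.reduce_axis((0, K), name='s')

out_h = (H + 2 * padding - K) // stride + 1
out_w = (W + 2 * padding - K) // stride + 1

# conv_compute is used directly as the fcompute of the output tensor;
# the tvm.any(...) guard zero-pads reads that fall outside the input.
conv = tvm.compute((N, M, out_h, out_w), conv_compute, name='conv')
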
def test_const_fold3():
    def check_throws(f):
        try:
            f()
        except tvm.TVMError:
            pass
        else:
            raise AssertionError("Should have raised an exception but didn't.")

    # Test that using ints with logic operations is forbidden
    x = tvm.var("x")
    for val in [0, 1]:
        for func in [tvm.all, tvm.any]:
            check_throws(lambda: func(tvm.const(val, 'uint1'), x))
            check_throws(lambda: func(x, tvm.const(val, 'uint1')))

    # Test const folding when both arguments are const
    for tvm_func, py_func in [(tvm.all, lambda a, b: a and b),
                              (tvm.any, lambda a, b: a or b)]:
        for v1 in [0, 1]:
            for v2 in [0, 1]:
                assert tvm.ir_pass.Equal(tvm_func(tvm.const(v1, 'uint1'), tvm.const(v2, 'uint1')),
                                         tvm.const(py_func(v1, v2), 'uint1'))

    x = tvm.var("x", 'uint1')
    true = tvm.const(1, 'uint1')
    false = tvm.const(0, 'uint1')

    assert tvm.all(x, true).same_as(x)
    assert tvm.all(true, x).same_as(x)
    assert tvm.any(x, false).same_as(x)
    assert tvm.any(false, x).same_as(x)

    assert tvm.all(x, false).same_as(false)
    assert tvm.all(false, x).same_as(false)
    assert tvm.any(x, true).same_as(true)
    assert tvm.any(true, x).same_as(true)

def nms_ir(data, sorted_index, valid_count, out, box_indices,
           max_output_size, iou_threshold, force_suppress,
           top_k, coord_start, id_index, score_index):
    """Low level IR routing for transform location in multibox_detection operator.

    Parameters
    ----------
    data : Buffer
        Buffer of output boxes with class and score.

    sort_index : Buffer
        Buffer of output box indexes sorted by score.

    valid_count : Buffer
        Buffer of number of valid output boxes.

    out : Buffer
        Output buffer.

    max_output_size : int
        Max number of output valid boxes for each instance.
        By default all valid boxes are returned.

    iou_threshold : float
        Overlapping(IoU) threshold to suppress object with smaller score.

    force_suppress : boolean
        Whether to suppress all detections regardless of class_id.

    top_k : int
        Keep maximum top k detections before nms, -1 for no limit.

    coord_start : int
        Start index of the consecutive 4 coordinates.

    id_index : int
        index of the class categories, -1 to disable.

    score_index : optional, int
        Index of the scores/confidence of boxes.

    Returns
    -------
    stmt : Stmt
        The result IR statement.
    """
    def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
        """Calculate overlap of two boxes.
        """
        w = tvm.max(0.0, tvm.min(out_tensor[box_a_idx + 2], out_tensor[box_b_idx + 2])
                    - tvm.max(out_tensor[box_a_idx], out_tensor[box_b_idx]))
        h = tvm.max(0.0, tvm.min(out_tensor[box_a_idx + 3], out_tensor[box_b_idx + 3])
                    - tvm.max(out_tensor[box_a_idx + 1], out_tensor[box_b_idx + 1]))
        i = w * h
        u = (out_tensor[box_a_idx + 2] - out_tensor[box_a_idx]) * \
            (out_tensor[box_a_idx + 3] - out_tensor[box_a_idx + 1]) + \
            (out_tensor[box_b_idx + 2] - out_tensor[box_b_idx]) * \
            (out_tensor[box_b_idx + 3] - out_tensor[box_b_idx + 1]) - i
        return tvm.expr.Select(u <= 0.0, 0.0, i / u)

    batch_size = data.shape[0]
    num_anchors = data.shape[1]
    box_data_length = data.shape[2]

    ib = tvm.ir_builder.create()

    data = ib.buffer_ptr(data)
    sorted_index = ib.buffer_ptr(sorted_index)
    valid_count = ib.buffer_ptr(valid_count)
    out = ib.buffer_ptr(out)
    box_indices = ib.buffer_ptr(box_indices)
    num_valid_boxes = ib.allocate("int32", (1,), name="num_valid_boxes", scope="local")

    max_threads = int(tvm.target.current_target(allow_none=False).max_num_threads)
    nthread_tx = max_threads
    nthread_bx = num_anchors // max_threads + 1
    tx = tvm.thread_axis("threadIdx.x")
    bx = tvm.thread_axis("blockIdx.x")
    ib.scope_attr(tx, "thread_extent", nthread_tx)
    ib.scope_attr(bx, "thread_extent", nthread_bx)
    j = bx * max_threads + tx

    iou_threshold = tvm.make.node("FloatImm", dtype="float32", value=iou_threshold)
    top_k = tvm.make.node("IntImm", dtype="int32", value=top_k)
    coord_start = tvm.make.node("IntImm", dtype="int32", value=coord_start)
    id_index = tvm.make.node("IntImm", dtype="int32", value=id_index)
    score_index = tvm.make.node("IntImm", dtype="int32", value=score_index)
    force_suppress = tvm.make.node("IntImm", dtype="int32", value=1 if force_suppress else 0)

    with ib.for_range(0, batch_size, for_type="unroll") as i:
        base_idx = i * num_anchors * box_data_length
        with ib.if_scope(tvm.all(iou_threshold > 0, valid_count[i] > 0)):
            # Reorder output
            nkeep = if_then_else(
                tvm.all(top_k > 0, top_k < valid_count[i]),
                top_k, valid_count[i])
            with ib.if_scope(j < nkeep):
                with ib.for_range(0, box_data_length) as k:
                    out[(base_idx + j * box_data_length + k)] = \
                        data[(base_idx + sorted_index[i * num_anchors + j]
                              * box_data_length + k)]
                box_indices[i * num_anchors + j] = sorted_index[i * num_anchors + j]
            with ib.if_scope(tvm.all(top_k > 0, top_k < valid_count[i])):
                with ib.if_scope(j < valid_count[i] - nkeep):
                    with ib.for_range(0, box_data_length) as k:
                        out[(base_idx + (j + nkeep) * box_data_length + k)] = -1.0
                    box_indices[i * num_anchors + (j + nkeep)] = -1
            # Apply nms
            with ib.for_range(0, valid_count[i]) as k:
                offset_k = k * box_data_length
                with ib.if_scope(tvm.all(out[base_idx + offset_k + score_index] > 0,
                                         tvm.any(id_index < 0,
                                                 out[base_idx + offset_k + id_index] >= 0))):
                    with ib.if_scope(j < valid_count[i]):
                        offset_j = j * box_data_length
                        with ib.if_scope(tvm.all(j > k,
                                                 out[base_idx + offset_j + score_index] > 0,
                                                 tvm.any(id_index < 0,
                                                         out[base_idx + offset_j + id_index] >= 0),
                                                 tvm.any(force_suppress > 0, id_index < 0,
                                                         out[base_idx + offset_k + id_index] ==
                                                         out[base_idx + offset_j + id_index]))):
                            iou = calculate_overlap(out, base_idx + offset_j + coord_start,
                                                    base_idx + offset_k + coord_start)
                            with ib.if_scope(iou >= iou_threshold):
                                out[base_idx + offset_j + score_index] = -1.0
                                with ib.if_scope(id_index >= 0):
                                    out[base_idx + offset_j + id_index] = -1.0
                                box_indices[i * num_anchors + j] = -1
        with ib.else_scope():
            with ib.if_scope(j < valid_count[i]):
                offset_j = j * box_data_length
                with ib.for_range(0, box_data_length) as k:
                    out[(base_idx + offset_j + k)] = data[base_idx + offset_j + k]
                box_indices[i * num_anchors + j] = j
        # Set invalid entry to be -1
        with ib.if_scope(j < num_anchors - valid_count[i]):
            with ib.for_range(0, box_data_length) as k:
                out[base_idx + (j + valid_count[i]) * box_data_length + k] = -1.0
            box_indices[i * num_anchors + j + valid_count[i]] = -1
        # Only return max_output_size number of valid boxes
        num_valid_boxes[0] = 0
        with ib.if_scope(max_output_size > 0):
            with ib.if_scope(j < valid_count[i]):
                offset_j = j * box_data_length
                with ib.if_scope(out[base_idx + offset_j] >= 0):
                    with ib.if_scope(num_valid_boxes[0] == max_output_size):
                        with ib.for_range(0, box_data_length) as k:
                            out[base_idx + offset_j + k] = -1.0
                        box_indices[i * num_anchors + j] = -1
                    with ib.else_scope():
                        num_valid_boxes[0] += 1

    return ib.get()

def get_valid_counts_pre(data, flag, idx, score_threshold, id_index, score_index):
    """Low level IR to Prepare get valid count of bounding boxes
    given a score threshold. Also moves valid boxes to the
    top of input data.

    Parameters
    ----------
    data: Buffer
        3D Buffer with shape [batch_size, num_anchors, elem_length], output of nms.

    flag : Buffer
        2D Buffer of flag indicating valid data with shape [batch_size, num_anchors].

    idx : Buffer
        2D Buffer of valid data indices with shape [batch_size, num_anchors].

    score_threshold : float32
        Lower limit of score for valid bounding boxes.

    id_index : optional, int
        index of the class categories, -1 to disable.

    score_index: optional, int
        Index of the scores/confidence of boxes.

    Returns
    -------
    stmt : Stmt
        The result IR statement.
    """
    batch_size = data.shape[0]
    num_anchors = data.shape[1]
    box_data_length = data.shape[2]

    ib = tvm.ir_builder.create()

    data = ib.buffer_ptr(data)
    flag = ib.buffer_ptr(flag)
    idx = ib.buffer_ptr(idx)

    score_threshold = tvm.make.node("FloatImm", dtype="float32", value=score_threshold)
    id_index = tvm.make.node("IntImm", dtype="int32", value=id_index)
    score_index = tvm.make.node("IntImm", dtype="int32", value=score_index)

    max_threads = int(tvm.target.current_target(allow_none=False).max_num_threads)
    nthread_tx = max_threads
    nthread_bx = batch_size * num_anchors // max_threads + 1
    tx = tvm.thread_axis("threadIdx.x")
    bx = tvm.thread_axis("blockIdx.x")
    ib.scope_attr(tx, "thread_extent", nthread_tx)
    ib.scope_attr(bx, "thread_extent", nthread_bx)
    tid = bx * max_threads + tx

    with ib.if_scope(tid < batch_size * num_anchors):
        with ib.if_scope(tvm.all(data[tid * box_data_length + score_index] > score_threshold,
                                 tvm.any(id_index < 0,
                                         data[tid * box_data_length + id_index] >= 0))):
            flag[tid] = 1
            idx[tid] = 1
        with ib.else_scope():
            flag[tid] = 0
            idx[tid] = 0

    return ib.get()

tvm.any

import numpy as np
import tvm

print(any((0, 1, 2)), all((0, 1, 2)))

a = np.ones((3, 4), dtype='float32')
# applying a zero padding of size 1 to a
b = np.zeros((5, 6), dtype='float32')
b[1:-1, 1:-1] = a
print(b)

p = 1
n, m = tvm.var('n'), tvm.var('m')
A = tvm.placeholder((n, m), name='a')
B = tvm.compute((n + p * 2, m + p * 2),
                lambda i, j: tvm.if_then_else(
                    tvm.any(i < p, i >= n + p, j < p, j >= m + p),
                    0, A[i - p, j - p]),
                name='b')

s = tvm.create_schedule(B.op)
print(tvm.lower(s, [A, B], simple_mode=True))

mod = tvm.build(s, [A, B])
c = tvm.nd.array(np.empty_like(b))
mod(tvm.nd.array(a), c)
print(c)

_bin_logic_op_map = {
    'equal': lambda a, b, *idx: a[idx] == b[idx],
    'not_equal': lambda a, b, *idx: a[idx] != b[idx],
    'greater': lambda a, b, *idx: a[idx] > b[idx],
    'less': lambda a, b, *idx: a[idx] < b[idx],
    'greater_equal': lambda a, b, *idx: a[idx] >= b[idx],
    'less_equal': lambda a, b, *idx: a[idx] <= b[idx],
    'logical_and': lambda a, b, *idx: tvm.all(a[idx] != 0, b[idx] != 0),
    'logical_or': lambda a, b, *idx: tvm.any(a[idx] != 0, b[idx] != 0),
    'logical_xor': lambda a, b, *idx: tvm.all(
        tvm.any(a[idx] != 0, b[idx] != 0), tvm.any(a[idx] == 0, b[idx] == 0)),
}


def _compute_binary_logic(op, dtype, ndim):
    a = tvm.placeholder([tvm.size_var() for _ in range(ndim)], dtype=dtype, name='a')
    b = tvm.placeholder([tvm.size_var() for _ in range(ndim)], dtype=dtype, name='b')
    c = tvm.compute([tvm.size_var() for _ in range(ndim)],
                    lambda *idx: _bin_logic_op_map[op](a, b, *idx),

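For context, the `logical_or` entry above is just `tvm.any` applied elementwise inside a `tvm.compute` (and `logical_xor` is expressed as "at least one nonzero AND at least one zero"). A standalone sketch of the same idea, using hypothetical fixed shapes in place of the `tvm.size_var()` calls of `_compute_binary_logic`, whose body is cut off above:

import tvm

# Elementwise logical_or of two int32 tensors, written exactly like the
# 'logical_or' lambda in the op map: tvm.any(a[idx] != 0, b[idx] != 0).
a = tvm.placeholder((8, 8), dtype='int32', name='a')
b = tvm.placeholder((8, 8), dtype='int32', name='b')
c = tvm.compute((8, 8),
                lambda *idx: tvm.any(a[idx] != 0, b[idx] != 0),
                name='c')

s = tvm.create_schedule(c.op)
print(tvm.lower(s, [a, b, c], simple_mode=True))
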
def predict_bbox_ir(cls_prob_buf, bbox_pred_buf, im_info_buf, out_buf, scales, ratios,
                    feature_stride, rpn_min_size, iou_loss):
    """Predict bounding boxes based on anchors, scores and deltas.

    Parameters
    ----------
    cls_prob_buf : tvm.schedule.Buffer
        4-D with shape [batch, 2 * num_anchors, height, width]

    bbox_pred_buf : tvm.schedule.Buffer
        4-D with shape [batch, 4 * num_anchors, height, width]

    im_info_buf : tvm.schedule.Buffer
        2-D with shape [batch, 3]

    out_buf : tvm.schedule.Buffer
        3-D with shape [batch, num_bbox, 5]
        The last dimension is in format of [w_start, h_start, w_end, h_end, score]

    scales : list/tuple of float
        Scales of anchor windows.

    ratios : list/tuple of float
        Ratios of anchor windows.

    feature_stride : int
        The size of the receptive field each unit in the convolution layer of the rpn,
        for example the product of all stride's prior to this layer.

    rpn_min_size : int
        Minimum height or width in proposal.

    iou_loss : bool
        Usage of IoU loss.

    Returns
    -------
    stmt : Stmt
        The result IR statement.
    """
    batch, num_anchors, height, width = get_const_tuple(cls_prob_buf.shape)
    num_anchors //= 2
    max_threads = int(tvm.target.current_target(allow_none=False).max_num_threads)
    nthread_tx = max_threads
    nthread_bx = (batch * height * width) // max_threads + 1
    tx = tvm.thread_axis("threadIdx.x")
    bx = tvm.thread_axis("blockIdx.x")
    tid = bx * max_threads + tx
    ib = tvm.ir_builder.create()
    ib.scope_attr(tx, "thread_extent", nthread_tx)
    ib.scope_attr(bx, "thread_extent", nthread_bx)

    p_score = ib.buffer_ptr(cls_prob_buf)
    p_delta = ib.buffer_ptr(bbox_pred_buf)
    p_im_info = ib.buffer_ptr(im_info_buf)
    p_out = ib.buffer_ptr(out_buf)

    with ib.if_scope(tid < batch * height * width):
        w = tid % width
        h = (tid // width) % height
        b = tid // width // height

        for k in range(num_anchors):
            out_index = tid * num_anchors + k
            ratio = ratios[k // len(scales)]
            scale = scales[k % len(scales)]
            anchor = generate_anchor(ratio, scale, feature_stride)
            im_height = p_im_info[b * 3]
            im_width = p_im_info[b * 3 + 1]
            x1 = anchor[0] + w * feature_stride
            y1 = anchor[1] + h * feature_stride
            x2 = anchor[2] + w * feature_stride
            y2 = anchor[3] + h * feature_stride

            delta = [p_delta[((((b * num_anchors + k) * 4 + i) * height + h) * width + w)]
                     for i in range(4)]
            regression_func = reg_iou if iou_loss else reg_bbox
            pred_x1, pred_y1, pred_x2, pred_y2 = regression_func(x1, y1, x2, y2, *delta)

            pred_x1 = tvm.max(tvm.min(pred_x1, im_width - 1.0), 0.0)
            pred_y1 = tvm.max(tvm.min(pred_y1, im_height - 1.0), 0.0)
            pred_x2 = tvm.max(tvm.min(pred_x2, im_width - 1.0), 0.0)
            pred_y2 = tvm.max(tvm.min(pred_y2, im_height - 1.0), 0.0)

            real_height = (im_height / feature_stride).astype('int32')
            real_width = (im_width / feature_stride).astype('int32')

            bbox_w = pred_x2 - pred_x1 + 1.0
            bbox_h = pred_y2 - pred_y1 + 1.0
            min_size = p_im_info[b * 3 + 2] * rpn_min_size

            pred_score = p_score[((b * num_anchors * 2 + num_anchors + k) * height + h) * width + w]
            pred_score = tvm.expr.Select(tvm.any(h >= real_height, w >= real_width),
                                         -1.0, pred_score)
            p_out[out_index * 5 + 0] = pred_x1
            p_out[out_index * 5 + 1] = pred_y1
            p_out[out_index * 5 + 2] = pred_x2
            p_out[out_index * 5 + 3] = pred_y2
            p_out[out_index * 5 + 4] = pred_score

            with ib.if_scope(tvm.any(bbox_w < min_size, bbox_h < min_size)):
                p_out[out_index * 5 + 0] -= min_size / 2.0
                p_out[out_index * 5 + 1] -= min_size / 2.0
                p_out[out_index * 5 + 2] += min_size / 2.0
                p_out[out_index * 5 + 3] += min_size / 2.0
                p_out[out_index * 5 + 4] = -1.0

    return ib.get()

def nms_ir(data, sort_result, valid_count, out,
           nms_threshold, force_suppress, nms_topk):
    """Low level IR routing for transform location in multibox_detection operator.

    Parameters
    ----------
    data: Buffer
        Buffer of output boxes with class and score.

    sort_result : Buffer
        Buffer of output box indexes sorted by score.

    valid_count : Buffer
        Buffer of number of valid output boxes.

    out : Buffer
        Output buffer.

    nms_threshold : float
        Non-maximum suppression threshold.

    force_suppress : boolean
        Whether to suppress all detections regardless of class_id.

    nms_topk : int
        Keep maximum top k detections before nms, -1 for no limit.

    Returns
    -------
    stmt : Stmt
        The result IR statement.
    """
    def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
        """Calculate overlap of two boxes.
        """
        w = tvm.make.Max(0.0, tvm.make.Min(out_tensor[box_a_idx + 2], out_tensor[box_b_idx + 2])
                         - tvm.make.Max(out_tensor[box_a_idx], out_tensor[box_b_idx]))
        h = tvm.make.Max(0.0, tvm.make.Min(out_tensor[box_a_idx + 3], out_tensor[box_b_idx + 3])
                         - tvm.make.Max(out_tensor[box_a_idx + 1], out_tensor[box_b_idx + 1]))
        i = w * h
        u = (out_tensor[box_a_idx + 2] - out_tensor[box_a_idx]) * \
            (out_tensor[box_a_idx + 3] - out_tensor[box_a_idx + 1]) + \
            (out_tensor[box_b_idx + 2] - out_tensor[box_b_idx]) * \
            (out_tensor[box_b_idx + 3] - out_tensor[box_b_idx + 1]) - i
        return tvm.select(u <= 0.0, 0.0, i / u)

    ib = tvm.ir_builder.create()
    p_data = ib.buffer_ptr(data)
    p_sort_result = ib.buffer_ptr(sort_result)
    p_valid_count = ib.buffer_ptr(valid_count)
    p_out = ib.buffer_ptr(out)
    batch_size = out.shape[0]
    num_anchors = out.shape[1]

    nms_threshold_node = tvm.make.node("FloatImm", dtype="float32", value=nms_threshold)
    nms_topk_node = tvm.make.node("IntImm", dtype="int32", value=nms_topk)
    force_suppress_node = tvm.make.node("IntImm", dtype="int32",
                                        value=1 if force_suppress else 0)

    with ib.for_range(0, batch_size, for_type="parallel", name="n") as n:
        with ib.if_scope(tvm.all(nms_threshold_node > 0, nms_threshold_node < 1,
                                 p_valid_count[0] > 0)):
            # Reorder output
            nkeep = tvm.select(tvm.all(nms_topk_node > 0, nms_topk < p_valid_count[n]),
                               nms_topk, p_valid_count[n])
            with ib.for_range(0, nkeep, name="l") as l:
                with ib.for_range(0, 6, name="m") as m:
                    p_out[(n * num_anchors * 6 + l * 6 + m)] = \
                        p_data[(n * num_anchors * 6
                                + p_sort_result[n * num_anchors + l] * 6 + m)]
            with ib.if_scope(tvm.all(nms_topk_node > 0, nms_topk < p_valid_count[n])):
                with ib.for_range(0, p_valid_count[n] - nkeep, name="l") as l:
                    with ib.for_range(0, 6, name="m") as m:
                        p_out[(n * num_anchors * 6 + (l + nkeep) * 6 + m)] = \
                            p_data[(n * num_anchors * 6 + (l + nkeep) * 6 + m)]
            # Apply nms
            with ib.for_range(0, p_valid_count[n], name="l") as l:
                offset_l = l * 6
                with ib.if_scope(p_out[n * num_anchors * 6 + offset_l] >= 0):
                    with ib.for_range(0, p_valid_count[n], name="m") as m:
                        offset_m = m * 6
                        with ib.if_scope(tvm.all(m > l,
                                                 p_out[n * num_anchors * 6 + offset_m] >= 0)):
                            with ib.if_scope(tvm.any(force_suppress_node > 0,
                                                     p_out[n * num_anchors * 6 + offset_l] ==
                                                     p_out[n * num_anchors * 6 + offset_m])):
                                # When force_suppress == True or class_id equals
                                iou = calculate_overlap(p_out,
                                                        n * num_anchors * 6 + offset_l + 2,
                                                        n * num_anchors * 6 + offset_m + 2)
                                with ib.if_scope(iou >= nms_threshold):
                                    p_out[n * num_anchors * 6 + offset_m] = -1.0
        with ib.else_scope():
            with ib.for_range(0, p_valid_count[n], name="l") as l:
                with ib.for_range(0, 6, name="m") as m:
                    p_out[(n * num_anchors * 6 + l * 6 + m)] = \
                        p_data[n * num_anchors * 6 + l * 6 + m]
        # Set invalid entry to be -1
        with ib.for_range(0, num_anchors - p_valid_count[n], name="l") as l:
            with ib.for_range(0, 6, name="m") as m:
                p_out[n * num_anchors * 6 + (l + p_valid_count[n]) * 6 + m] = -1.0
    return ib.get()

def _bilinear(i, c, y, x):
    outside = tvm.any(y < -1.0, x < -1.0, y > height, x > width)
    y = tvm.max(y, 0.0)
    x = tvm.max(x, 0.0)
    val = bilinear_sample_nchw(data, (i, c, y, x), height - 1, width - 1)
    return tvm.if_then_else(outside, 0.0, val)

def nms_ir(data, sorted_index, valid_count, out, box_indices,
           max_output_size, iou_threshold, force_suppress,
           top_k, coord_start, id_index):
    """Low level IR routing for transform location in multibox_detection operator.

    Parameters
    ----------
    data : Buffer
        Buffer of output boxes with class and score.

    sort_index : Buffer
        Buffer of output box indexes sorted by score.

    valid_count : Buffer
        Buffer of number of valid output boxes.

    out : Buffer
        Output buffer.

    max_output_size : int
        Max number of output valid boxes for each instance.
        By default all valid boxes are returned.

    iou_threshold : float
        Overlapping(IoU) threshold to suppress object with smaller score.

    force_suppress : boolean
        Whether to suppress all detections regardless of class_id.

    top_k : int
        Keep maximum top k detections before nms, -1 for no limit.

    coord_start : int
        Start index of the consecutive 4 coordinates.

    id_index : int
        index of the class categories, -1 to disable.

    Returns
    -------
    stmt : Stmt
        The result IR statement.
    """
    def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
        """Calculate overlap of two boxes.
        """
        w = tvm.max(0.0, tvm.min(out_tensor[box_a_idx + 2], out_tensor[box_b_idx + 2])
                    - tvm.max(out_tensor[box_a_idx], out_tensor[box_b_idx]))
        h = tvm.max(0.0, tvm.min(out_tensor[box_a_idx + 3], out_tensor[box_b_idx + 3])
                    - tvm.max(out_tensor[box_a_idx + 1], out_tensor[box_b_idx + 1]))
        i = w * h
        u = (out_tensor[box_a_idx + 2] - out_tensor[box_a_idx]) * \
            (out_tensor[box_a_idx + 3] - out_tensor[box_a_idx + 1]) + \
            (out_tensor[box_b_idx + 2] - out_tensor[box_b_idx]) * \
            (out_tensor[box_b_idx + 3] - out_tensor[box_b_idx + 1]) - i
        return tvm.expr.Select(u <= 0.0, 0.0, i / u)

    batch_size = data.shape[0]
    num_anchors = data.shape[1]
    box_data_length = data.shape[2]

    ib = tvm.ir_builder.create()

    data = ib.buffer_ptr(data)
    sorted_index = ib.buffer_ptr(sorted_index)
    valid_count = ib.buffer_ptr(valid_count)
    out = ib.buffer_ptr(out)
    box_indices = ib.buffer_ptr(box_indices)
    num_valid_boxes = ib.allocate("int32", (1,), name="num_valid_boxes", scope="local")

    max_threads = int(math.sqrt(
        tvm.target.current_target(allow_none=False).max_num_threads))
    nthread_tx = max_threads
    nthread_bx = num_anchors // max_threads + 1
    tx = tvm.thread_axis("threadIdx.x")
    bx = tvm.thread_axis("blockIdx.x")
    ib.scope_attr(tx, "thread_extent", nthread_tx)
    ib.scope_attr(bx, "thread_extent", nthread_bx)
    k = bx * max_threads + tx

    iou_threshold = tvm.make.node("FloatImm", dtype="float32", value=iou_threshold)
    top_k = tvm.make.node("IntImm", dtype="int32", value=top_k)
    coord_start = tvm.make.node("IntImm", dtype="int32", value=coord_start)
    id_index = tvm.make.node("IntImm", dtype="int32", value=id_index)
    force_suppress = tvm.make.node("IntImm", dtype="int32", value=1 if force_suppress else 0)

    with ib.for_range(0, batch_size, for_type="unroll") as i:
        base_idx = i * num_anchors * box_data_length
        with ib.if_scope(tvm.all(iou_threshold > 0, valid_count[i] > 0)):
            # Reorder output
            nkeep = if_then_else(
                tvm.all(top_k > 0, top_k < valid_count[i]),
                top_k, valid_count[i])
            with ib.for_range(0, nkeep) as j:
                with ib.if_scope(k < box_data_length):
                    out[(base_idx + j * box_data_length + k)] = \
                        data[(base_idx + sorted_index[i * num_anchors + j]
                              * box_data_length + k)]
                box_indices[i * num_anchors + j] = sorted_index[i * num_anchors + j]
            with ib.if_scope(tvm.all(top_k > 0, top_k < valid_count[i])):
                with ib.for_range(0, valid_count[i] - nkeep) as j:
                    with ib.if_scope(k < box_data_length):
                        out[(base_idx + (j + nkeep) * box_data_length + k)] = -1.0
                    box_indices[i * num_anchors + (j + nkeep)] = -1
            # Apply nms
            with ib.for_range(0, valid_count[i]) as j:
                offset_j = j * box_data_length
                with ib.if_scope(out[base_idx + offset_j] >= 0):
                    with ib.if_scope(k < valid_count[i]):
                        offset_k = k * box_data_length
                        with ib.if_scope(tvm.all(k > j, out[base_idx + offset_k] >= 0,
                                                 tvm.any(force_suppress > 0, id_index < 0,
                                                         out[base_idx + offset_j] ==
                                                         out[base_idx + offset_k]))):
                            iou = calculate_overlap(out, base_idx + offset_k + coord_start,
                                                    base_idx + offset_j + coord_start)
                            with ib.if_scope(iou >= iou_threshold):
                                out[base_idx + offset_k] = -1.0
                                box_indices[i * num_anchors + k] = -1
            ib.emit(tvm.make.Call(None, 'tvm_storage_sync',
                                  tvm.convert(['shared']),
                                  tvm.expr.Call.Intrinsic, None, 0))
        with ib.else_scope():
            with ib.for_range(0, valid_count[i]) as j:
                offset_j = j * box_data_length
                with ib.if_scope(k < box_data_length):
                    out[(base_idx + offset_j + k)] = data[base_idx + offset_j + k]
                box_indices[i * num_anchors + j] = j
        # Set invalid entry to be -1
        with ib.for_range(0, num_anchors - valid_count[i]) as j:
            with ib.if_scope(k < box_data_length):
                out[base_idx + (j + valid_count[i]) * box_data_length + k] = -1.0
            box_indices[i * num_anchors + j + valid_count[i]] = -1
        # Only return max_output_size number of valid boxes
        num_valid_boxes[0] = 0
        with ib.if_scope(max_output_size > 0):
            with ib.for_range(0, valid_count[i]) as j:
                offset_j = j * box_data_length
                with ib.if_scope(out[base_idx + offset_j] >= 0):
                    with ib.if_scope(num_valid_boxes[0] == max_output_size):
                        with ib.if_scope(k < box_data_length):
                            out[base_idx + offset_j + k] = -1.0
                        box_indices[i * num_anchors + j] = -1
                    with ib.else_scope():
                        num_valid_boxes[0] += 1
        ib.emit(tvm.make.Call(None, 'tvm_storage_sync',
                              tvm.convert(['shared']),
                              tvm.expr.Call.Intrinsic, None, 0))

    return ib.get()

def _bilinear(n, c, h, w):
    outside = tvm.any(h < 0, w < 0, h >= in_height, w >= in_width)
    val = bilinear_sample_nchw(data, (n, c, h, w), in_height - 1, in_width - 1)
    return tvm.if_then_else(outside, zero, val)

def nms_ir(data, sort_result, valid_count, out,
           nms_threshold, force_suppress, nms_topk):
    """Low level IR routing for transform location in multibox_detection operator.

    Parameters
    ----------
    data: Buffer
        Buffer of output boxes with class and score.

    sort_result : Buffer
        Buffer of output box indexes sorted by score.

    valid_count : Buffer
        Buffer of number of valid output boxes.

    out : Buffer
        Output buffer.

    nms_threshold : float
        Non-maximum suppression threshold.

    force_suppress : boolean
        Whether to suppress all detections regardless of class_id.

    nms_topk : int
        Keep maximum top k detections before nms, -1 for no limit.

    Returns
    -------
    stmt : Stmt
        The result IR statement.
    """
    def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
        """Calculate overlap of two boxes.
        """
        w = tvm.max(0.0, tvm.min(out_tensor[box_a_idx + 2], out_tensor[box_b_idx + 2])
                    - tvm.max(out_tensor[box_a_idx], out_tensor[box_b_idx]))
        h = tvm.max(0.0, tvm.min(out_tensor[box_a_idx + 3], out_tensor[box_b_idx + 3])
                    - tvm.max(out_tensor[box_a_idx + 1], out_tensor[box_b_idx + 1]))
        i = w * h
        u = (out_tensor[box_a_idx + 2] - out_tensor[box_a_idx]) * \
            (out_tensor[box_a_idx + 3] - out_tensor[box_a_idx + 1]) + \
            (out_tensor[box_b_idx + 2] - out_tensor[box_b_idx]) * \
            (out_tensor[box_b_idx + 3] - out_tensor[box_b_idx + 1]) - i
        return tvm.expr.Select(u <= 0.0, 0.0, i / u)

    max_threads = int(math.sqrt(
        tvm.target.current_target(allow_none=False).max_num_threads))
    ib = tvm.ir_builder.create()
    p_data = ib.buffer_ptr(data)
    p_sort_result = ib.buffer_ptr(sort_result)
    p_valid_count = ib.buffer_ptr(valid_count)
    p_out = ib.buffer_ptr(out)
    batch_size = out.shape[0]
    num_anchors = out.shape[1]
    nthread_tx = max_threads
    nthread_bx = num_anchors // max_threads + 1
    tx = tvm.thread_axis("threadIdx.x")
    bx = tvm.thread_axis("blockIdx.x")
    ib.scope_attr(tx, "thread_extent", nthread_tx)
    ib.scope_attr(bx, "thread_extent", nthread_bx)
    i = bx * max_threads + tx

    nms_threshold_node = tvm.make.node("FloatImm", dtype="float32", value=nms_threshold)
    nms_topk_node = tvm.make.node("IntImm", dtype="int32", value=nms_topk)
    force_suppress_node = tvm.make.node("IntImm", dtype="int32",
                                        value=1 if force_suppress else 0)

    with ib.for_range(0, batch_size, for_type="unroll") as b:
        base_idx = b * num_anchors * 6
        with ib.if_scope(tvm.all(nms_threshold_node > 0, nms_threshold_node < 1,
                                 p_valid_count[0] > 0)):
            # Reorder output
            nkeep = tvm.if_then_else(
                tvm.all(nms_topk_node > 0, nms_topk < p_valid_count[b]),
                nms_topk, p_valid_count[b])
            with ib.for_range(0, nkeep) as l:
                with ib.if_scope(i < 6):
                    p_out[(base_idx + l * 6 + i)] = \
                        p_data[(base_idx + p_sort_result[b * num_anchors + l] * 6 + i)]
            with ib.if_scope(tvm.all(nms_topk_node > 0, nms_topk < p_valid_count[b])):
                with ib.for_range(0, p_valid_count[b] - nkeep) as l:
                    with ib.if_scope(i < 6):
                        p_out[(base_idx + (l + nkeep) * 6 + i)] = -1.0
            # Apply nms
            with ib.for_range(0, p_valid_count[b]) as l:
                offset_l = l * 6
                with ib.if_scope(p_out[base_idx + offset_l] >= 0):
                    with ib.if_scope(i < p_valid_count[b]):
                        offset_i = i * 6
                        with ib.if_scope(tvm.all(i > l, p_out[base_idx + offset_i] >= 0)):
                            with ib.if_scope(tvm.any(force_suppress_node > 0,
                                                     p_out[base_idx + offset_l] ==
                                                     p_out[base_idx + offset_i])):
                                # When force_suppress == True or class_id equals
                                iou = calculate_overlap(p_out, base_idx + offset_l + 2,
                                                        base_idx + offset_i + 2)
                                with ib.if_scope(iou >= nms_threshold):
                                    p_out[base_idx + offset_i] = -1.0
            ib.emit(tvm.make.Call(None, 'tvm_storage_sync',
                                  tvm.convert(['shared']),
                                  tvm.expr.Call.Intrinsic, None, 0))
        with ib.else_scope():
            with ib.for_range(0, p_valid_count[b]) as c:
                with ib.if_scope(i < 6):
                    p_out[(base_idx + c * 6 + i)] = p_data[base_idx + c * 6 + i]
        # Set invalid entry to be -1
        with ib.for_range(0, num_anchors - p_valid_count[b]) as c:
            with ib.if_scope(i < 6):
                p_out[base_idx + (c + p_valid_count[b]) * 6 + i] = -1.0
    body = ib.get()
    return body

def nms_ir(data, sort_result, valid_count, out,
           nms_threshold, force_suppress, nms_topk):
    """Low level IR routing for transform location in multibox_detection operator.

    Parameters
    ----------
    data: Buffer
        Buffer of output boxes with class and score.

    sort_result : Buffer
        Buffer of output box indexes sorted by score.

    valid_count : Buffer
        Buffer of number of valid output boxes.

    out : Buffer
        Output buffer.

    nms_threshold : float
        Non-maximum suppression threshold.

    force_suppress : boolean
        Whether to suppress all detections regardless of class_id.

    nms_topk : int
        Keep maximum top k detections before nms, -1 for no limit.

    Returns
    -------
    stmt : Stmt
        The result IR statement.
    """
    def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
        """Calculate overlap of two boxes.
        """
        w = tvm.make.Max(0.0, tvm.make.Min(out_tensor[box_a_idx + 2], out_tensor[box_b_idx + 2])
                         - tvm.make.Max(out_tensor[box_a_idx], out_tensor[box_b_idx]))
        h = tvm.make.Max(0.0, tvm.make.Min(out_tensor[box_a_idx + 3], out_tensor[box_b_idx + 3])
                         - tvm.make.Max(out_tensor[box_a_idx + 1], out_tensor[box_b_idx + 1]))
        i = w * h
        u = (out_tensor[box_a_idx + 2] - out_tensor[box_a_idx]) * \
            (out_tensor[box_a_idx + 3] - out_tensor[box_a_idx + 1]) + \
            (out_tensor[box_b_idx + 2] - out_tensor[box_b_idx]) * \
            (out_tensor[box_b_idx + 3] - out_tensor[box_b_idx + 1]) - i
        return tvm.select(u <= 0.0, 0.0, i / u)

    max_threads = int(math.sqrt(
        tvm.target.current_target(allow_none=False).max_num_threads))
    tx = tvm.thread_axis("threadIdx.x")
    ty = tvm.thread_axis("threadIdx.y")
    bx = tvm.thread_axis("blockIdx.x")
    by = tvm.thread_axis("blockIdx.y")
    ib = tvm.ir_builder.create()
    p_data = ib.buffer_ptr(data)
    p_sort_result = ib.buffer_ptr(sort_result)
    p_valid_count = ib.buffer_ptr(valid_count)
    p_out = ib.buffer_ptr(out)
    batch_size = out.shape[0]
    num_anchors = out.shape[1]
    nthread_tx = max_threads
    nthread_bx = num_anchors // max_threads + 1
    nthread_ty = max_threads
    nthread_by = 6 // max_threads + 1
    ib.scope_attr(tx, "thread_extent", nthread_tx)
    ib.scope_attr(ty, "thread_extent", nthread_ty)
    ib.scope_attr(bx, "thread_extent", nthread_bx)
    ib.scope_attr(by, "thread_extent", nthread_by)
    i = bx * max_threads + tx
    j = by * max_threads + ty

    nms_threshold_node = tvm.make.node("FloatImm", dtype="float32", value=nms_threshold)
    nms_topk_node = tvm.make.node("IntImm", dtype="int32", value=nms_topk)
    force_suppress_node = tvm.make.node("IntImm", dtype="int32",
                                        value=1 if force_suppress else 0)

    with ib.for_range(0, batch_size, for_type="unroll", name="n") as n:
        with ib.if_scope(tvm.all(nms_threshold_node > 0, nms_threshold_node < 1,
                                 p_valid_count[0] > 0)):
            # Reorder output
            nkeep = tvm.select(tvm.all(nms_topk_node > 0, nms_topk < p_valid_count[n]),
                               nms_topk, p_valid_count[n])
            with ib.if_scope(i < nkeep):
                with ib.if_scope(j < 6):
                    p_out[(n * num_anchors * 6 + i * 6 + j)] = \
                        p_data[(n * num_anchors * 6
                                + p_sort_result[n * num_anchors + i] * 6 + j)]
            with ib.if_scope(tvm.all(nms_topk_node > 0, nms_topk < p_valid_count[n])):
                with ib.if_scope(i < p_valid_count[n] - nkeep):
                    with ib.if_scope(j < 6):
                        p_out[(n * num_anchors * 6 + (i + nkeep) * 6 + j)] = \
                            p_data[(n * num_anchors * 6 + (i + nkeep) * 6 + j)]
            # Apply nms
            with ib.if_scope(i < p_valid_count[n]):
                offset_i = i * 6
                with ib.if_scope(p_out[n * num_anchors * 6 + offset_i] >= 0):
                    with ib.if_scope(j < p_valid_count[n]):
                        offset_j = j * 6
                        with ib.if_scope(tvm.all(j > i,
                                                 p_out[n * num_anchors * 6 + offset_j] >= 0)):
                            with ib.if_scope(tvm.any(force_suppress_node > 0,
                                                     p_out[n * num_anchors * 6 + offset_i] ==
                                                     p_out[n * num_anchors * 6 + offset_j])):
                                # When force_suppress == True or class_id equals
                                iou = calculate_overlap(p_out,
                                                        n * num_anchors * 6 + offset_i + 2,
                                                        n * num_anchors * 6 + offset_j + 2)
                                with ib.if_scope(iou >= nms_threshold):
                                    p_out[n * num_anchors * 6 + offset_j] = -1.0
        with ib.else_scope():
            with ib.if_scope(i < p_valid_count[n]):
                with ib.if_scope(j < 6):
                    p_out[(n * num_anchors * 6 + i * 6 + j)] = \
                        p_data[n * num_anchors * 6 + i * 6 + j]
        # Set invalid entry to be -1
        with ib.if_scope(i < num_anchors - p_valid_count[n]):
            with ib.if_scope(j < 6):
                p_out[n * num_anchors * 6 + (i + p_valid_count[n]) * 6 + j] = -1.0
    body = ib.get()
    return body

def predict_bbox_ir(cls_prob_buf, bbox_pred_buf, im_info_buf, out_buf, scales, ratios,
                    feature_stride, rpn_min_size, iou_loss):
    """Predict bounding boxes based on anchors, scores and deltas.

    Parameters
    ----------
    cls_prob_buf : tvm.schedule.Buffer
        4-D with shape [batch, 2 * num_anchors, height, width]

    bbox_pred_buf : tvm.schedule.Buffer
        4-D with shape [batch, 4 * num_anchors, height, width]

    im_info_buf : tvm.schedule.Buffer
        2-D with shape [batch, 3]

    out_buf : tvm.schedule.Buffer
        3-D with shape [batch, num_bbox, 5]
        The last dimension is in format of [w_start, h_start, w_end, h_end, score]

    scales : list/tuple of float
        Scales of anchor windows.

    ratios : list/tuple of float
        Ratios of anchor windows.

    feature_stride : int
        The size of the receptive field each unit in the convolution layer of the rpn,
        for example the product of all stride's prior to this layer.

    rpn_min_size : int
        Minimum height or width in proposal.

    iou_loss : bool
        Usage of IoU loss.

    Returns
    -------
    stmt : Stmt
        The result IR statement.
    """
    batch, num_anchors, height, width = get_const_tuple(cls_prob_buf.shape)
    num_anchors //= 2
    max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
    nthread_tx = max_threads
    nthread_bx = (batch * height * width) // max_threads + 1
    tx = tvm.thread_axis("threadIdx.x")
    bx = tvm.thread_axis("blockIdx.x")
    tid = bx * max_threads + tx
    ib = tvm.ir_builder.create()
    ib.scope_attr(tx, "thread_extent", nthread_tx)
    ib.scope_attr(bx, "thread_extent", nthread_bx)

    p_score = ib.buffer_ptr(cls_prob_buf)
    p_delta = ib.buffer_ptr(bbox_pred_buf)
    p_im_info = ib.buffer_ptr(im_info_buf)
    p_out = ib.buffer_ptr(out_buf)

    idxm = tvm.indexmod
    idxd = tvm.indexdiv

    with ib.if_scope(tid < batch * height * width):
        w = idxm(tid, width)
        h = idxm(idxd(tid, width), height)
        b = idxd(idxd(tid, width), height)

        for k in range(num_anchors):
            out_index = tid * num_anchors + k
            ratio = ratios[k // len(scales)]
            scale = scales[k % len(scales)]
            anchor = generate_anchor(ratio, scale, feature_stride)
            im_height = p_im_info[b * 3]
            im_width = p_im_info[b * 3 + 1]
            x1 = anchor[0] + w * feature_stride
            y1 = anchor[1] + h * feature_stride
            x2 = anchor[2] + w * feature_stride
            y2 = anchor[3] + h * feature_stride

            delta = [p_delta[((((b * num_anchors + k) * 4 + i) * height + h) * width + w)]
                     for i in range(4)]
            regression_func = reg_iou if iou_loss else reg_bbox
            pred_x1, pred_y1, pred_x2, pred_y2 = regression_func(x1, y1, x2, y2, *delta)

            pred_x1 = tvm.max(tvm.min(pred_x1, im_width - 1.0), 0.0)
            pred_y1 = tvm.max(tvm.min(pred_y1, im_height - 1.0), 0.0)
            pred_x2 = tvm.max(tvm.min(pred_x2, im_width - 1.0), 0.0)
            pred_y2 = tvm.max(tvm.min(pred_y2, im_height - 1.0), 0.0)

            real_height = (im_height / feature_stride).astype('int32')
            real_width = (im_width / feature_stride).astype('int32')

            bbox_w = pred_x2 - pred_x1 + 1.0
            bbox_h = pred_y2 - pred_y1 + 1.0
            min_size = p_im_info[b * 3 + 2] * rpn_min_size

            pred_score = p_score[((b * num_anchors * 2 + num_anchors + k) * height + h) * width + w]
            pred_score = tvm.expr.Select(tvm.any(h >= real_height, w >= real_width),
                                         -1.0, pred_score)
            p_out[out_index * 5 + 0] = pred_x1
            p_out[out_index * 5 + 1] = pred_y1
            p_out[out_index * 5 + 2] = pred_x2
            p_out[out_index * 5 + 3] = pred_y2
            p_out[out_index * 5 + 4] = pred_score

            with ib.if_scope(tvm.any(bbox_w < min_size, bbox_h < min_size)):
                p_out[out_index * 5 + 0] -= min_size / 2.0
                p_out[out_index * 5 + 1] -= min_size / 2.0
                p_out[out_index * 5 + 2] += min_size / 2.0
                p_out[out_index * 5 + 3] += min_size / 2.0
                p_out[out_index * 5 + 4] = -1.0

    return ib.get()

def nms_ir(data, sort_result, valid_count, out,
           nms_threshold, force_suppress, nms_topk):
    """Low level IR routing for transform location in multibox_detection operator.

    Parameters
    ----------
    data: Buffer
        Buffer of output boxes with class and score.

    sort_result : Buffer
        Buffer of output box indexes sorted by score.

    valid_count : Buffer
        Buffer of number of valid output boxes.

    out : Buffer
        Output buffer.

    nms_threshold : float
        Non-maximum suppression threshold.

    force_suppress : boolean
        Whether to suppress all detections regardless of class_id.

    nms_topk : int
        Keep maximum top k detections before nms, -1 for no limit.

    Returns
    -------
    stmt : Stmt
        The result IR statement.
    """
    def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
        """Calculate overlap of two boxes.
        """
        w = tvm.make.Max(0.0, tvm.make.Min(out_tensor[box_a_idx + 2], out_tensor[box_b_idx + 2])
                         - tvm.make.Max(out_tensor[box_a_idx], out_tensor[box_b_idx]))
        h = tvm.make.Max(0.0, tvm.make.Min(out_tensor[box_a_idx + 3], out_tensor[box_b_idx + 3])
                         - tvm.make.Max(out_tensor[box_a_idx + 1], out_tensor[box_b_idx + 1]))
        i = w * h
        u = (out_tensor[box_a_idx + 2] - out_tensor[box_a_idx]) * \
            (out_tensor[box_a_idx + 3] - out_tensor[box_a_idx + 1]) + \
            (out_tensor[box_b_idx + 2] - out_tensor[box_b_idx]) * \
            (out_tensor[box_b_idx + 3] - out_tensor[box_b_idx + 1]) - i
        return tvm.select(u <= 0.0, 0.0, i / u)

    max_threads = int(
        math.sqrt(tvm.target.current_target(allow_none=False).max_num_threads))
    tx = tvm.thread_axis("threadIdx.x")
    ty = tvm.thread_axis("threadIdx.y")
    bx = tvm.thread_axis("blockIdx.x")
    by = tvm.thread_axis("blockIdx.y")
    ib = tvm.ir_builder.create()
    p_data = ib.buffer_ptr(data)
    p_sort_result = ib.buffer_ptr(sort_result)
    p_valid_count = ib.buffer_ptr(valid_count)
    p_out = ib.buffer_ptr(out)
    batch_size = out.shape[0]
    num_anchors = out.shape[1]
    nthread_tx = max_threads
    nthread_bx = num_anchors // max_threads + 1
    nthread_ty = max_threads
    nthread_by = 6 // max_threads + 1
    ib.scope_attr(tx, "thread_extent", nthread_tx)
    ib.scope_attr(ty, "thread_extent", nthread_ty)
    ib.scope_attr(bx, "thread_extent", nthread_bx)
    ib.scope_attr(by, "thread_extent", nthread_by)
    i = bx * max_threads + tx
    j = by * max_threads + ty

    nms_threshold_node = tvm.make.node("FloatImm", dtype="float32", value=nms_threshold)
    nms_topk_node = tvm.make.node("IntImm", dtype="int32", value=nms_topk)
    force_suppress_node = tvm.make.node("IntImm", dtype="int32",
                                        value=1 if force_suppress else 0)

    with ib.for_range(0, batch_size, for_type="unroll", name="n") as n:
        with ib.if_scope(tvm.all(nms_threshold_node > 0, nms_threshold_node < 1,
                                 p_valid_count[0] > 0)):
            # Reorder output
            nkeep = tvm.select(tvm.all(nms_topk_node > 0, nms_topk < p_valid_count[n]),
                               nms_topk, p_valid_count[n])
            with ib.if_scope(i < nkeep):
                with ib.if_scope(j < 6):
                    p_out[(n * num_anchors * 6 + i * 6 + j)] = \
                        p_data[(n * num_anchors * 6
                                + p_sort_result[n * num_anchors + i] * 6 + j)]
            with ib.if_scope(tvm.all(nms_topk_node > 0, nms_topk < p_valid_count[n])):
                with ib.if_scope(i < p_valid_count[n] - nkeep):
                    with ib.if_scope(j < 6):
                        p_out[(n * num_anchors * 6 + (i + nkeep) * 6 + j)] = \
                            p_data[(n * num_anchors * 6 + (i + nkeep) * 6 + j)]
            # Apply nms
            with ib.if_scope(i < p_valid_count[n]):
                offset_i = i * 6
                with ib.if_scope(p_out[n * num_anchors * 6 + offset_i] >= 0):
                    with ib.if_scope(j < p_valid_count[n]):
                        offset_j = j * 6
                        with ib.if_scope(tvm.all(j > i,
                                                 p_out[n * num_anchors * 6 + offset_j] >= 0)):
                            with ib.if_scope(tvm.any(force_suppress_node > 0,
                                                     p_out[n * num_anchors * 6 + offset_i] ==
                                                     p_out[n * num_anchors * 6 + offset_j])):
                                # When force_suppress == True or class_id equals
                                iou = calculate_overlap(p_out,
                                                        n * num_anchors * 6 + offset_i + 2,
                                                        n * num_anchors * 6 + offset_j + 2)
                                with ib.if_scope(iou >= nms_threshold):
                                    p_out[n * num_anchors * 6 + offset_j] = -1.0
        with ib.else_scope():
            with ib.if_scope(i < p_valid_count[n]):
                with ib.if_scope(j < 6):
                    p_out[(n * num_anchors * 6 + i * 6 + j)] = \
                        p_data[n * num_anchors * 6 + i * 6 + j]
        # Set invalid entry to be -1
        with ib.if_scope(i < num_anchors - p_valid_count[n]):
            with ib.if_scope(j < 6):
                p_out[n * num_anchors * 6 + (i + p_valid_count[n]) * 6 + j] = -1.0
    body = ib.get()
    return body

def nms_ir(data, sort_result, valid_count, out,
           nms_threshold, force_suppress, nms_topk):
    """Low level IR routing for transform location in multibox_detection operator.

    Parameters
    ----------
    data: Buffer
        Buffer of output boxes with class and score.

    sort_result : Buffer
        Buffer of output box indexes sorted by score.

    valid_count : Buffer
        Buffer of number of valid output boxes.

    out : Buffer
        Output buffer.

    nms_threshold : float
        Non-maximum suppression threshold.

    force_suppress : boolean
        Whether to suppress all detections regardless of class_id.

    nms_topk : int
        Keep maximum top k detections before nms, -1 for no limit.

    Returns
    -------
    stmt : Stmt
        The result IR statement.
    """
    def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
        """Calculate overlap of two boxes.
        """
        w = tvm.make.Max(0.0, tvm.make.Min(out_tensor[box_a_idx + 2], out_tensor[box_b_idx + 2])
                         - tvm.make.Max(out_tensor[box_a_idx], out_tensor[box_b_idx]))
        h = tvm.make.Max(0.0, tvm.make.Min(out_tensor[box_a_idx + 3], out_tensor[box_b_idx + 3])
                         - tvm.make.Max(out_tensor[box_a_idx + 1], out_tensor[box_b_idx + 1]))
        i = w * h
        u = (out_tensor[box_a_idx + 2] - out_tensor[box_a_idx]) * \
            (out_tensor[box_a_idx + 3] - out_tensor[box_a_idx + 1]) + \
            (out_tensor[box_b_idx + 2] - out_tensor[box_b_idx]) * \
            (out_tensor[box_b_idx + 3] - out_tensor[box_b_idx + 1]) - i
        return tvm.select(u <= 0.0, 0.0, i / u)

    ib = tvm.ir_builder.create()
    p_data = ib.buffer_ptr(data)
    p_sort_result = ib.buffer_ptr(sort_result)
    p_valid_count = ib.buffer_ptr(valid_count)
    p_out = ib.buffer_ptr(out)
    batch_size = out.shape[0]
    num_anchors = out.shape[1]

    nms_threshold_node = tvm.make.node("FloatImm", dtype="float32", value=nms_threshold)
    nms_topk_node = tvm.make.node("IntImm", dtype="int32", value=nms_topk)
    force_suppress_node = tvm.make.node("IntImm", dtype="int32",
                                        value=1 if force_suppress else 0)

    with ib.for_range(0, batch_size, for_type="parallel", name="n") as n:
        with ib.if_scope(tvm.all(nms_threshold_node > 0, nms_threshold_node < 1,
                                 p_valid_count[0] > 0)):
            # Reorder output
            nkeep = tvm.select(tvm.all(nms_topk_node > 0, nms_topk < p_valid_count[n]),
                               nms_topk, p_valid_count[n])
            with ib.for_range(0, nkeep, name="l") as l:
                with ib.for_range(0, 6, name="m") as m:
                    p_out[(n * num_anchors * 6 + l * 6 + m)] = \
                        p_data[(n * num_anchors * 6
                                + p_sort_result[n * num_anchors + l] * 6 + m)]
            with ib.if_scope(tvm.all(nms_topk_node > 0, nms_topk < p_valid_count[n])):
                with ib.for_range(0, p_valid_count[n] - nkeep, name="l") as l:
                    with ib.for_range(0, 6, name="m") as m:
                        p_out[(n * num_anchors * 6 + (l + nkeep) * 6 + m)] = \
                            p_data[(n * num_anchors * 6 + (l + nkeep) * 6 + m)]
            # Apply nms
            with ib.for_range(0, p_valid_count[n], name="l") as l:
                offset_l = l * 6
                with ib.if_scope(p_out[n * num_anchors * 6 + offset_l] >= 0):
                    with ib.for_range(0, p_valid_count[n], name="m") as m:
                        offset_m = m * 6
                        with ib.if_scope(tvm.all(m > l,
                                                 p_out[n * num_anchors * 6 + offset_m] >= 0)):
                            with ib.if_scope(tvm.any(force_suppress_node > 0,
                                                     p_out[n * num_anchors * 6 + offset_l] ==
                                                     p_out[n * num_anchors * 6 + offset_m])):
                                # When force_suppress == True or class_id equals
                                iou = calculate_overlap(p_out,
                                                        n * num_anchors * 6 + offset_l + 2,
                                                        n * num_anchors * 6 + offset_m + 2)
                                with ib.if_scope(iou >= nms_threshold):
                                    p_out[n * num_anchors * 6 + offset_m] = -1.0
        with ib.else_scope():
            with ib.for_range(0, p_valid_count[n], name="l") as l:
                with ib.for_range(0, 6, name="m") as m:
                    p_out[(n * num_anchors * 6 + l * 6 + m)] = \
                        p_data[n * num_anchors * 6 + l * 6 + m]
        # Set invalid entry to be -1
        with ib.for_range(0, num_anchors - p_valid_count[n], name="l") as l:
            with ib.for_range(0, 6, name="m") as m:
                p_out[n * num_anchors * 6 + (l + p_valid_count[n]) * 6 + m] = -1.0
    return ib.get()

def get_valid_counts_ir(data, valid_count, flag, score_threshold, id_index, score_index):
    """Low level IR to get valid count of bounding boxes
    given a score threshold. Also prepares to move valid boxes to the
    top of input data.

    Parameters
    ----------
    data : Buffer
        Input data. 3-D Buffer with shape [batch_size, num_anchors, elem_length].

    valid_count : Buffer
        1D buffer for valid number of boxes with shape [batch_size, ].

    flag : Buffer
        2D Buffer of flag indicating valid data with shape [batch_size, num_anchors].

    score_threshold : float32
        Lower limit of score for valid bounding boxes.

    id_index : optional, int
        index of the class categories, -1 to disable.

    score_index: optional, int
        Index of the scores/confidence of boxes.

    Returns
    -------
    stmt : Stmt
        The result IR statement.
    """
    batch_size = data.shape[0]
    num_anchors = data.shape[1]
    elem_length = data.shape[2]

    ib = tvm.ir_builder.create()

    data = ib.buffer_ptr(data)
    valid_count = ib.buffer_ptr(valid_count)
    flag = ib.buffer_ptr(flag)
    atomic_add_return = ib.allocate(
        valid_count.dtype, (1,), name='atomic_add_return', scope='local')
    one_count = tvm.const(1, dtype=valid_count.dtype)
    score_threshold = tvm.make.node("FloatImm", dtype="float32", value=score_threshold)
    id_index = tvm.make.node("IntImm", dtype="int32", value=id_index)
    score_index = tvm.make.node("IntImm", dtype="int32", value=score_index)

    max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)
    nthread_tx = max_threads
    nthread_bx = batch_size * num_anchors // max_threads + 1
    tx = tvm.thread_axis("threadIdx.x")
    bx = tvm.thread_axis("blockIdx.x")
    ib.scope_attr(tx, "thread_extent", nthread_tx)
    ib.scope_attr(bx, "thread_extent", nthread_bx)
    tid = bx * max_threads + tx

    idxd = tvm.indexdiv

    # initialize valid_count
    with ib.if_scope(tid < batch_size):
        valid_count[tid] = 0
    # initialize flag
    with ib.if_scope(tid < batch_size * num_anchors):
        flag[tid] = 0
    with ib.if_scope(tid < batch_size * num_anchors):
        i = idxd(tid, num_anchors)
        with ib.if_scope(
                tvm.all(data[tid * elem_length + score_index] > score_threshold,
                        tvm.any(id_index < 0, data[tid * elem_length + id_index] >= 0))):
            flag[tid] = 1
            atomic_add_return[0] = atomic_add(
                tvm.call_pure_intrin("handle", "tvm_address_of", valid_count[i]),
                one_count)

    return ib.get()