Пример #1
0
    def _dispatch_sim_quantize(value):
        pass_through_value = te.compute(
            data.shape,
            lambda *indices: _compute_pass_through(value, *indices))
        int8_value = te.compute(
            data.shape,
            lambda *indices: tir.if_then_else(
                out_dtype.equal(SQNN_DTYPE_TO_CODE["int8"]),
                _compute_intn("int8", value, *indices),
                pass_through_value[indices],
            ),
        )
        uint8_value = te.compute(
            data.shape,
            lambda *indices: tir.if_then_else(
                out_dtype.equal(SQNN_DTYPE_TO_CODE["uint8"]),
                _compute_intn("uint8", value, *indices),
                int8_value[indices],
            ),
        )
        int32_value = te.compute(
            data.shape,
            lambda *indices: tir.if_then_else(
                out_dtype.equal(SQNN_DTYPE_TO_CODE["int32"]),
                _compute_intn("int32", value, *indices),
                uint8_value[indices],
            ),
        )

        return int32_value
Пример #2
0
 def im2col(row, col):
     j_w, j_h, j_n = idxsplit(row, [imw, imh])
     j_c, k_w, k_h = idxsplit(col, [chanin, 3])
     i_h, i_w = j_h + k_h - 1, j_w + k_w - 1
     return tir.if_then_else(
         tir.all(i_h >= 0, i_h < imh, i_w >= 0, i_w < imw),
         data[j_n, i_h, i_w, j_c], 0)
Пример #3
0
 def im2col(nsamples, ckk, imglen):
     j_h, j_w = imglen // imgw, imglen % imgw
     i_c, k_h, k_w = ckk // 9, ckk // 3 % 3, ckk % 3
     i_h, i_w = j_h + k_h - 1, j_w + k_w - 1
     return tir.if_then_else(
         tir.all(i_h >= 0, i_h < imgh, i_w >= 0, i_w < imgw),
         data[nsamples, i_c, i_h, i_w], 0)
Пример #4
0
 def __reflect(index, size, corner_start):
     index_align_corner = te.abs(corner_start - index)
     size_times = te.truncdiv(index_align_corner.astype("int32"), size).astype("int32")
     t = tir.Mod(size_times, 2)
     extra = index_align_corner - size_times * size
     return tir.if_then_else(
         tir.EQ(t, 0), extra + corner_start, size - extra + corner_start
     )
def loop_carried_dependency(a: ty.handle, b: ty.handle, c: ty.handle) -> None:
    A = tir.match_buffer(a, (128,))
    B = tir.match_buffer(b, (128,))
    C = tir.match_buffer(c, (128,))
    for i in range(0, 128):
        with tir.block([128], "B") as vi:
            B[vi] = A[vi] * 2.0
        with tir.block([128], "C") as vi:
            C[vi] = tir.if_then_else(vi >= 1, B[vi - 1] + 1.0, 0.0, dtype="float32")
Пример #6
0
def read_out_of_bound(a: ty.handle, c: ty.handle) -> None:
    A = tir.match_buffer(a, [16], "float32")
    B = tir.alloc_buffer([16], "float32")
    C = tir.match_buffer(c, [16], "float32")
    for i in tir.serial(0, 16):
        with tir.block([16], "B") as [v]:
            B[v] = A[v]
    for j in tir.serial(0, 16):
        with tir.block([16], "C") as [v]:
            tir.reads(B[v:v + 2])
            C[v] = tir.if_then_else(v < 15,
                                    tir.max(B[v], B[v + 1]),
                                    B[v],
                                    dtype="float32")
Пример #7
0
def read_out_of_bound_after_compute_at(a: ty.handle, c: ty.handle) -> None:
    A = tir.match_buffer(a, [16], "float32")
    B = tir.alloc_buffer([16], "float32")
    C = tir.match_buffer(c, [16], "float32")
    for j in tir.serial(0, 16):
        for i in tir.serial(0, tir.min(1, 15 - j) + 1):
            with tir.block([16], "B") as [v]:
                tir.bind(v, j + i)
                B[v] = A[v]
        with tir.block([16], "C") as [v]:
            tir.bind(v, j)
            tir.reads([B[v:v + 2]])
            C[v] = tir.if_then_else(v < 15,
                                    tir.max(B[v], B[v + 1]),
                                    B[v],
                                    dtype="float32")
Пример #8
0
    def _dispatch_sim_dequantize(value):
        pass_through_value = te.compute(
            data.shape,
            lambda *indices: _compute_pass_through(value, *indices))
        intn_condition = tvm.te.any(
            in_dtype.equal(SQNN_DTYPE_TO_CODE["int8"]),
            in_dtype.equal(SQNN_DTYPE_TO_CODE["uint8"]),
            in_dtype.equal(SQNN_DTYPE_TO_CODE["int32"]),
        )
        intn_value = te.compute(
            data.shape,
            lambda *indices: tir.if_then_else(
                intn_condition,
                _compute_intn(value, *indices),
                pass_through_value[indices],
            ),
        )

        return intn_value
Пример #9
0
def tir_conv2d(a: ty.handle, w: ty.handle, b: ty.handle) -> None:
    A = tir.match_buffer(a, [16, 16, 14, 14])
    W = tir.match_buffer(w, [16, 3, 3, 32])
    B = tir.match_buffer(b, [16, 32, 14, 14])
    Apad = tir.alloc_buffer([16, 16, 16, 16])

    with tir.block([16, 16, 16, 16], "Apad") as [nn, cc, yy, xx]:
        Apad[nn, cc, yy, xx] = tir.if_then_else(
            yy >= 1 and yy - 1 < 14 and xx >= 1 and xx - 1 < 14,
            A[nn, cc, yy - 1, xx - 1],
            0.0,
            dtype="float32",
        )
    with tir.block(
        [16, 32, 14, 14, tir.reduce_axis(0, 16), tir.reduce_axis(0, 3), tir.reduce_axis(0, 3)], "B"
    ) as [nn, ff, yy, xx, rc, ry, rx]:
        with tir.init():
            B[nn, ff, yy, xx] = 0.0
        B[nn, ff, yy, xx] += Apad[nn, rc, yy + ry, xx + rx] * W[rc, ry, rx, ff]
Пример #10
0
def _nms_loop(
    ib,
    batch_size,
    top_k,
    iou_threshold,
    max_output_size,
    valid_count,
    on_new_valid_box_func,
    on_new_invalidated_box_func,
    needs_bbox_check_func,
    calc_overlap_func,
    out_scores,
    num_valid_boxes,
):
    def nms_inner_loop(ib, i, j, nkeep, num_valid_boxes_local):
        # The box j is valid, invalidate other boxes that overlap with j above iou_threshold
        on_new_valid_box_func(ib, 0, num_valid_boxes_local[0], i, j)
        num_valid_boxes_local[0] += 1

        num_boxes_to_check = nkeep - (j + 1)

        with ib.for_range(0, num_boxes_to_check, name="_k",
                          kind="parallel") as _k:
            k = j + 1 + _k

            with ib.if_scope(
                    tvm.tir.all(
                        k < nkeep,
                        out_scores[i, k] > 0,  # is the box k still valid?
                        needs_bbox_check_func(i, j, k),
                    )):
                iou = calc_overlap_func(i, j, k)

                with ib.if_scope(iou >= iou_threshold):
                    # invalidate the box k
                    out_scores[i, k] = -1.0
                    on_new_invalidated_box_func(i, k)

    with ib.for_range(0, batch_size, name="i") as i:
        nkeep = if_then_else(tvm.tir.all(top_k > 0, top_k < valid_count[i]),
                             top_k, valid_count[i])
        max_output_size = if_then_else(max_output_size > 0, max_output_size,
                                       nkeep)

        with ib.if_scope(tvm.tir.all(iou_threshold > 0, valid_count[i] > 0)):
            num_valid_boxes_local = ib.allocate("int32", (1, ),
                                                name="num_valid_boxes_local",
                                                scope="local")
            box_idx = ib.allocate("int32", (1, ),
                                  name="box_idx",
                                  scope="local")
            num_valid_boxes_local[0] = 0
            box_idx[0] = 0

            # Apply nms
            # No need to do more iteration if we have already reached max_output_size boxes
            with ib.while_loop(
                    tvm.tir.all(box_idx[0] < nkeep,
                                num_valid_boxes_local[0] < max_output_size)):
                # Proceed to the inner loop if the box with id box_idx is still valid
                with ib.if_scope(out_scores[i, box_idx[0]] > -1.0):
                    nms_inner_loop(ib, i, box_idx[0], nkeep,
                                   num_valid_boxes_local)
                box_idx[0] += 1

            num_valid_boxes[i] = num_valid_boxes_local[0]

        with ib.else_scope():
            num_valid_boxes[i] = 0

    return ib.get()
Пример #11
0
def transform_loc_pre(cls_prob, valid_count, temp_valid_count, temp_cls_id,
                      temp_score, threshold):
    """Low level IR routing for transform location data preparation.

    Parameters
    ----------
    cls_prob : Buffer
        Buffer of class probabilities.

    valid_count : Buffer
        Buffer of number of valid output boxes.

    temp_valid_count : Buffer
        Output intermediate result buffer

    temp_cls_id : Buffer
        Output intermediate result buffer

    temp_score : Buffer
        Output buffer

    threshold : float
        Threshold to be a positive prediction.

    Returns
    -------
    stmt : Stmt
        The result IR statement.
    """
    batch_size = cls_prob.shape[0]
    num_classes = cls_prob.shape[1]
    num_anchors = cls_prob.shape[2]

    ib = tvm.tir.ir_builder.create()

    cls_prob = ib.buffer_ptr(cls_prob)
    cls_id = ib.buffer_ptr(temp_cls_id)
    valid_count = ib.buffer_ptr(valid_count)
    temp_valid_count = ib.buffer_ptr(temp_valid_count)
    score = ib.buffer_ptr(temp_score)

    threshold = tvm.ir.make_node("FloatImm", dtype="float32", value=threshold)

    max_threads = int(
        tvm.target.Target.current(allow_none=False).max_num_threads)
    nthread_tx = max_threads
    nthread_bx = (batch_size * num_anchors) // max_threads + 1
    tx = te.thread_axis("threadIdx.x")
    bx = te.thread_axis("blockIdx.x")
    ib.scope_attr(tx, "thread_extent", nthread_tx)
    ib.scope_attr(bx, "thread_extent", nthread_bx)
    tid = bx * max_threads + tx
    idxd = tvm.tir.indexdiv
    idxm = tvm.tir.indexmod

    with ib.if_scope(tid < batch_size * num_anchors):
        i = idxd(tid, num_anchors)
        j = idxm(tid, num_anchors)
        valid_count[i] = 0
        score[tid] = -1.0
        cls_id[tid] = 0
        with ib.for_range(0, num_classes - 1) as k:
            temp = cls_prob[i * num_classes * num_anchors +
                            (k + 1) * num_anchors + j]
            cls_id[tid] = if_then_else(temp > score[tid], k + 1, cls_id[tid])
            score[tid] = tvm.te.max(temp, score[tid])
        with ib.if_scope(tvm.tir.all(cls_id[tid] > 0, score[tid] < threshold)):
            cls_id[tid] = 0
        with ib.if_scope(cls_id[tid] > 0):
            temp_valid_count[tid] = 1
        with ib.else_scope():
            temp_valid_count[tid] = 0

        with ib.if_scope(tid < batch_size):
            with ib.for_range(0, num_anchors) as k:
                with ib.if_scope(k > 0):
                    temp_valid_count[tid * num_anchors +
                                     k] += temp_valid_count[tid * num_anchors +
                                                            k - 1]
            valid_count[i] = temp_valid_count[tid * num_anchors + num_anchors -
                                              1]

    return ib.get()
Пример #12
0
def nms_ir(
    data,
    sorted_index,
    valid_count,
    indices,
    out_bboxes,
    out_scores,
    out_class_ids,
    out_features,
    box_indices,
    num_valid_boxes,
    max_output_size,
    iou_threshold,
    force_suppress,
    top_k,
    coord_start,
    id_index,
    score_index,
    return_indices,
):
    """Low level IR routing for transform location in multibox_detection operator.

    Parameters
    ----------
    data : Buffer
        Buffer of output boxes with class and score.

    sorted_index : Buffer
        Buffer of output box indexes sorted by score.

    valid_count : Buffer
        Buffer of number of valid output boxes.

    indices : Buffer
        indices in original tensor, with shape [batch_size, num_anchors],
        represents the index of box in original data. It could be the third
        output out_indices of get_valid_counts. The values in the second
        dimension are like the output of arange(num_anchors) if get_valid_counts
        is not used before non_max_suppression.

    out_bboxes : Buffer
        Output buffer, to be filled with sorted box coordinates.

    out_scores : Buffer
        Output buffer, to be filled with sorted scores.

    out_class_ids : Buffer
        Output buffer, to be filled with sorted class ids.

    box_indices : Buffer
        A indices tensor mapping sorted indices to original indices
        This is the first output of NMS when return_indices=True.

    num_valid_boxes : Buffer
        Record the number of boxes that have survived IOU tests.
        This is the second output of NMS when return_indices=True.

    max_output_size : int
        Max number of output valid boxes for each instance.
        By default all valid boxes are returned.

    iou_threshold : float
        Overlapping(IoU) threshold to suppress object with smaller score.

    force_suppress : boolean
        Whether to suppress all detections regardless of class_id.

    top_k : int
        Keep maximum top k detections before nms, -1 for no limit.

    coord_start : int
        Start index of the consecutive 4 coordinates.

    id_index : int
        index of the class categories, -1 to disable.

    score_index : optional, int
        Index of the scores/confidence of boxes.

    return_indices : boolean
        Whether to return box indices in input data.

    Returns
    -------
    stmt : Stmt
        The result IR statement.
    """
    batch_size = data.shape[0]
    num_anchors = data.shape[1]
    box_data_length = data.shape[2]
    num_features = out_features.shape[2]

    ib = tvm.tir.ir_builder.create()

    data = ib.buffer_ptr(data)
    sorted_index = ib.buffer_ptr(sorted_index)
    valid_count = ib.buffer_ptr(valid_count)
    indices = ib.buffer_ptr(indices)

    # outputs
    out_bboxes = ib.buffer_ptr(out_bboxes)
    out_scores = ib.buffer_ptr(out_scores)
    out_class_ids = ib.buffer_ptr(out_class_ids)
    out_features = ib.buffer_ptr(out_features)
    box_indices = ib.buffer_ptr(box_indices)
    num_valid_boxes = ib.buffer_ptr(num_valid_boxes)

    if isinstance(iou_threshold, float):
        iou_threshold = tvm.tir.FloatImm("float32", iou_threshold)
    top_k = tvm.tir.IntImm("int32", top_k)
    coord_start = tvm.tir.IntImm("int32", coord_start)
    id_index = tvm.tir.IntImm("int32", id_index)
    score_index = tvm.tir.IntImm("int32", score_index)
    force_suppress = tvm.tir.IntImm("int32", 1 if force_suppress else 0)

    max_threads = int(
        tvm.target.Target.current(allow_none=False).max_num_threads)

    with ib.new_scope():
        nthread_tx = max_threads
        nthread_bx = ceil_div(num_anchors, max_threads)
        nthread_by = batch_size
        tx = te.thread_axis("threadIdx.x")
        bx = te.thread_axis("blockIdx.x")
        by = te.thread_axis("blockIdx.y")
        ib.scope_attr(by, "thread_extent", nthread_by)
        ib.scope_attr(tx, "thread_extent", nthread_tx)
        ib.scope_attr(bx, "thread_extent", nthread_bx)
        i = by
        base_src_idx = i * num_anchors * box_data_length
        base_bbox_idx = i * num_anchors * 4
        base_features_idx = i * num_anchors * num_features

        with ib.if_scope(tvm.tir.all(iou_threshold > 0, valid_count[i] > 0)):
            # Reorder output
            nkeep = if_then_else(
                tvm.tir.all(top_k > 0, top_k < valid_count[i]), top_k,
                valid_count[i])
            j = bx * max_threads + tx
            with ib.if_scope(j < nkeep):
                src_idx = base_src_idx + sorted_index[i * num_anchors +
                                                      j] * box_data_length
                with ib.for_range(0, 4, kind="unroll") as k:
                    out_bboxes[(base_bbox_idx + j * 4 +
                                k)] = data[src_idx + coord_start + k]
                with ib.for_range(0, num_features, kind="unroll") as k:
                    out_features[(base_features_idx + j * num_features +
                                  k)] = data[src_idx + coord_start + 4 + k]

                out_scores[i * num_anchors + j] = data[src_idx + score_index]

                if id_index >= 0:
                    out_class_ids[i * num_anchors + j] = data[src_idx +
                                                              id_index]

            with ib.else_scope():
                # Indices > nkeep are discarded
                # Only needed for return_indices = False case
                if return_indices is False:
                    with ib.if_scope(j < num_anchors):
                        with ib.for_range(0, 4, kind="unroll") as k:
                            out_bboxes[(base_bbox_idx + j * 4 + k)] = -1.0
                        with ib.for_range(0, num_features, kind="unroll") as k:
                            out_features[(base_features_idx +
                                          j * num_features + k)] = -1.0

                        out_scores[i, j] = -1.0

                        if id_index >= 0:
                            out_class_ids[i, j] = -1.0

            if return_indices:
                with ib.if_scope(j < num_anchors):
                    box_indices[i * num_anchors + j] = -1

        with ib.else_scope():
            # Need to copy all boxes if not using return_indices
            bounds = valid_count[i] if return_indices else num_anchors
            with ib.if_scope(j < bounds):
                src_offset = base_src_idx + j * box_data_length

                with ib.for_range(0, 4, kind="unroll") as k:
                    out_bboxes[base_bbox_idx + j * 4 +
                               k] = data[src_offset + coord_start + k]
                with ib.for_range(0, num_features, kind="unroll") as k:
                    out_features[(base_features_idx + j * num_features +
                                  k)] = data[src_offset + coord_start + 4 + k]
                out_scores[i * num_anchors + j] = data[src_offset +
                                                       score_index]

                if id_index >= 0:
                    out_class_ids[i * num_anchors + j] = data[src_offset +
                                                              id_index]

                box_indices[i * num_anchors + j] = j

    if isinstance(max_output_size, int):
        max_output_size = tvm.tir.const(max_output_size)

    def calc_overlap(i, j, k):
        offset_j = j * 4
        offset_k = k * 4
        base_bbox_idx = i * num_anchors * 4
        return calculate_overlap(
            out_bboxes,
            base_bbox_idx + offset_j,
            base_bbox_idx + offset_k,
        )

    def on_new_valid_box(ib, tid, num_current_valid_box, i, j):
        # When return_indices is False, no need to populate box_indices
        if return_indices:
            with ib.if_scope(tid + 0 == 0):
                orig_idx = sorted_index[i * num_anchors + j]
                box_indices[i, num_current_valid_box] = indices[i, orig_idx]

    def on_new_invalidated_box(i, k):
        if return_indices is False and id_index >= 0:
            out_class_ids[i, k] = -1.0

    def needs_bbox_check(i, j, k):
        return tvm.tir.any(
            force_suppress > 0,
            id_index < 0,
            out_class_ids[i, k] == out_class_ids[i, j],
        )

    return _nms_loop(
        ib,
        batch_size,
        top_k,
        iou_threshold,
        max_output_size,
        valid_count,
        on_new_valid_box,
        on_new_invalidated_box,
        needs_bbox_check,
        calc_overlap,
        out_scores,
        num_valid_boxes,
    )
Пример #13
0
def _nms_loop(
    ib,
    batch_size,
    top_k,
    iou_threshold,
    max_output_size,
    valid_count,
    on_new_valid_box_func,
    on_new_invalidated_box_func,
    needs_bbox_check_func,
    calc_overlap_func,
    out_scores,
    num_valid_boxes,
):
    max_threads = int(
        tvm.target.Target.current(allow_none=False).max_num_threads)

    with ib.new_scope():
        nthread_by = batch_size
        nthread_tx = max_threads

        # Some cuda architectures have smaller limit of 32K for cudaDevAttrMaxRegistersPerBlock
        # vs 64K for most GPUs. Since this kernel uses many registers (around 35), the limit will
        # be exceeded with 1024 threads.
        target = tvm.target.Target.current(allow_none=False)
        if target.kind.name == "cuda":
            if nvcc.get_target_compute_version(target) in [
                    "3.2", "5.3", "6.2"
            ]:
                nthread_tx = 512

        by = te.thread_axis("blockIdx.y")
        tx = te.thread_axis("threadIdx.x")
        ib.scope_attr(by, "thread_extent", nthread_by)
        ib.scope_attr(tx, "thread_extent", nthread_tx)

        num_valid_boxes_local = ib.allocate("int32", (1, ),
                                            name="num_valid_boxes_local",
                                            scope="local")
        num_valid_boxes_local[0] = 0

        def nms_inner_loop(ib, i, j, nkeep):
            # The box j is valid, invalidate other boxes that overlap with j above iou_threshold
            on_new_valid_box_func(ib, tx, num_valid_boxes_local[0], i, j)
            num_valid_boxes_local[0] += 1

            num_iter_per_thread = ceil_div(nkeep - (j + 1), nthread_tx)

            with ib.for_range(0, num_iter_per_thread, name="_k") as _k:
                k = j + 1 + _k * nthread_tx + tx

                with ib.if_scope(
                        tvm.tir.all(
                            k < nkeep,
                            out_scores[i, k] > 0,  # is the box k still valid?
                            needs_bbox_check_func(i, j, k),
                        )):
                    iou = calc_overlap_func(i, j, k)

                    with ib.if_scope(iou >= iou_threshold):
                        # invalidate the box k
                        out_scores[i, k] = -1.0
                        on_new_invalidated_box_func(i, k)

                ib.emit(
                    tvm.tir.Call(None, "tir.tvm_storage_sync",
                                 tvm.runtime.convert(["shared"])))

        i = by

        nkeep = if_then_else(tvm.tir.all(top_k > 0, top_k < valid_count[i]),
                             top_k, valid_count[i])
        max_output_size = if_then_else(max_output_size > 0, max_output_size,
                                       nkeep)

        with ib.if_scope(tvm.tir.all(iou_threshold > 0, valid_count[i] > 0)):
            # Apply nms
            # No need to do more iteration if we have already reached max_output_size boxes
            box_idx = ib.allocate("int32", (1, ),
                                  name="box_idx",
                                  scope="local")
            box_idx[0] = 0
            with ib.while_loop(
                    tvm.tir.all(box_idx[0] < nkeep,
                                num_valid_boxes_local[0] < max_output_size)):
                # Proceed to the inner loop if the box with id box_idx is still valid
                with ib.if_scope(out_scores[i, box_idx[0]] > -1.0):
                    nms_inner_loop(ib, i, box_idx[0], nkeep)
                box_idx[0] += 1

            with ib.if_scope(tx + 0 == 0):
                num_valid_boxes[i] = num_valid_boxes_local[0]

        with ib.else_scope():
            num_valid_boxes[i] = 0

    return ib.get()
Пример #14
0
def nms_ir(
    data,
    sorted_index,
    valid_count,
    indices,
    out,
    box_indices,
    max_output_size,
    iou_threshold,
    force_suppress,
    top_k,
    coord_start,
    id_index,
    score_index,
    return_indices,
):
    """Low level IR routing for transform location in multibox_detection operator.

    Parameters
    ----------
    data : Buffer
        Buffer of output boxes with class and score.

    sorted_index : Buffer
        Buffer of output box indexes sorted by score.

    valid_count : Buffer
        Buffer of number of valid output boxes.

    indices : Buffer
        indices in original tensor, with shape [batch_size, num_anchors],
        represents the index of box in original data. It could be the third
        output out_indices of get_valid_counts. The values in the second
        dimension are like the output of arange(num_anchors) if get_valid_counts
        is not used before non_max_suppression.

    out : Buffer
        Output buffer.

    max_output_size : int
        Max number of output valid boxes for each instance.
        By default all valid boxes are returned.

    iou_threshold : float
        Overlapping(IoU) threshold to suppress object with smaller score.

    force_suppress : boolean
        Whether to suppress all detections regardless of class_id.

    top_k : int
        Keep maximum top k detections before nms, -1 for no limit.

    coord_start : int
        Start index of the consecutive 4 coordinates.

    id_index : int
        index of the class categories, -1 to disable.

    score_index : optional, int
        Index of the scores/confidence of boxes.

    return_indices : boolean
        Whether to return box indices in input data.

    Returns
    -------
    stmt : Stmt
        The result IR statement.
    """

    def get_boundaries(output, box_idx):
        l = tvm.te.min(
            output[box_idx],
            output[box_idx + 2],
        )
        t = tvm.te.min(
            output[box_idx + 1],
            output[box_idx + 3],
        )
        r = tvm.te.max(
            output[box_idx],
            output[box_idx + 2],
        )
        b = tvm.te.max(
            output[box_idx + 1],
            output[box_idx + 3],
        )
        return l, t, r, b

    def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
        """Calculate overlap of two boxes."""
        a_l, a_t, a_r, a_b = get_boundaries(out_tensor, box_a_idx)
        b_l, b_t, b_r, b_b = get_boundaries(out_tensor, box_b_idx)

        # Overlapping width and height
        w = tvm.te.max(0.0, tvm.te.min(a_r, b_r) - tvm.te.max(a_l, b_l))
        h = tvm.te.max(0.0, tvm.te.min(a_b, b_b) - tvm.te.max(a_t, b_t))

        # Overlapping area
        area = h * w

        # total area of the figure formed by box a and box b
        # except for overlapping area
        u = (a_r - a_l) * (a_b - a_t) + (b_r - b_l) * (b_b - b_t) - area
        return tvm.tir.Select(u <= 0.0, 0.0, area / u)

    batch_size = data.shape[0]
    num_anchors = data.shape[1]
    box_data_length = data.shape[2]

    ib = tvm.tir.ir_builder.create()

    data = ib.buffer_ptr(data)
    sorted_index = ib.buffer_ptr(sorted_index)
    valid_count = ib.buffer_ptr(valid_count)
    indices = ib.buffer_ptr(indices)
    out = ib.buffer_ptr(out)
    box_indices = ib.buffer_ptr(box_indices)

    if isinstance(iou_threshold, float):
        iou_threshold = tvm.tir.FloatImm("float32", iou_threshold)
    top_k = tvm.tir.IntImm("int32", top_k)
    coord_start = tvm.tir.IntImm("int32", coord_start)
    id_index = tvm.tir.IntImm("int32", id_index)
    score_index = tvm.tir.IntImm("int32", score_index)
    force_suppress = tvm.tir.IntImm("int32", 1 if force_suppress else 0)

    max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads)

    with ib.new_scope():
        nthread_by = batch_size
        by = te.thread_axis("blockIdx.y")
        ib.scope_attr(by, "thread_extent", nthread_by)
        i = by
        base_idx = i * num_anchors * box_data_length
        with ib.if_scope(tvm.tir.all(iou_threshold > 0, valid_count[i] > 0)):
            # Reorder output
            nkeep = if_then_else(
                tvm.tir.all(top_k > 0, top_k < valid_count[i]), top_k, valid_count[i]
            )
            with ib.for_range(0, nkeep) as j:
                with ib.for_range(0, box_data_length) as k:
                    out[(base_idx + j * box_data_length + k)] = data[
                        (base_idx + sorted_index[i * num_anchors + j] * box_data_length + k)
                    ]
                box_indices[i * num_anchors + j] = sorted_index[i * num_anchors + j]
            with ib.if_scope(tvm.tir.all(top_k > 0, top_k < valid_count[i])):
                with ib.for_range(0, valid_count[i] - nkeep) as j:
                    with ib.for_range(0, box_data_length) as k:
                        out[(base_idx + (j + nkeep) * box_data_length + k)] = -1.0
                    box_indices[i * num_anchors + (j + nkeep)] = -1
    with ib.new_scope():
        nthread_by = batch_size
        by = te.thread_axis("blockIdx.y")
        ib.scope_attr(by, "thread_extent", nthread_by)
        i = by
        base_idx = i * num_anchors * box_data_length
        with ib.if_scope(tvm.tir.all(iou_threshold > 0, valid_count[i] > 0)):
            # Apply nms
            with ib.for_range(0, valid_count[i]) as j:
                with ib.for_range(0, j) as k:
                    offset_k = k * box_data_length
                    with ib.if_scope(
                        tvm.tir.all(
                            out[base_idx + offset_k + score_index] > 0,
                            tvm.tir.any(id_index < 0, out[base_idx + offset_k + id_index] >= 0),
                        )
                    ):
                        offset_j = j * box_data_length
                        with ib.if_scope(
                            tvm.tir.all(
                                j > k,
                                out[base_idx + offset_k + score_index] > 0,
                                tvm.tir.any(id_index < 0, out[base_idx + offset_j + id_index] >= 0),
                                tvm.tir.any(
                                    force_suppress > 0,
                                    id_index < 0,
                                    out[base_idx + offset_k + id_index]
                                    == out[base_idx + offset_j + id_index],
                                ),
                            )
                        ):
                            iou = calculate_overlap(
                                out,
                                base_idx + offset_j + coord_start,
                                base_idx + offset_k + coord_start,
                            )
                            with ib.if_scope(iou >= iou_threshold):
                                out[base_idx + offset_j + score_index] = -1.0
                                with ib.if_scope(id_index >= 0):
                                    out[base_idx + offset_j + id_index] = -1.0
                                box_indices[i * num_anchors + j] = -1
    with ib.new_scope():
        nthread_tx = max_threads
        nthread_bx = num_anchors // max_threads + 1
        nthread_by = batch_size
        nthread_bz = box_data_length
        tx = te.thread_axis("threadIdx.x")
        bx = te.thread_axis("blockIdx.x")
        by = te.thread_axis("blockIdx.y")
        bz = te.thread_axis("blockIdx.z")
        ib.scope_attr(tx, "thread_extent", nthread_tx)
        ib.scope_attr(bx, "thread_extent", nthread_bx)
        ib.scope_attr(by, "thread_extent", nthread_by)
        ib.scope_attr(bz, "thread_extent", nthread_bz)
        tid = bx * max_threads + tx
        i = by
        j = tid
        k = bz
        base_idx = i * num_anchors * box_data_length
        with ib.if_scope(tvm.tir.all(iou_threshold > 0, valid_count[i] > 0)):
            pass
        with ib.else_scope():
            with ib.if_scope(j < valid_count[i]):
                offset_j = j * box_data_length
                out[(base_idx + offset_j + k)] = data[base_idx + offset_j + k]
                box_indices[i * num_anchors + j] = j

    with ib.new_scope():
        num_valid_boxes = ib.allocate("int32", (1,), name="num_valid_boxes", scope="local")
        bx = te.thread_axis("blockIdx.x")
        ib.scope_attr(bx, "thread_extent", batch_size)
        i = bx
        base_idx = i * num_anchors * box_data_length
        # Set invalid entry to be -1
        with ib.for_range(0, num_anchors - valid_count[i]) as j:
            with ib.for_range(0, box_data_length) as k:
                out[base_idx + (j + valid_count[i]) * box_data_length + k] = -1.0
            box_indices[i * num_anchors + j + valid_count[i]] = -1
        # Only return max_output_size number of valid boxes
        num_valid_boxes[0] = 0
        with ib.if_scope(max_output_size > 0):
            with ib.for_range(0, valid_count[i]) as j:
                offset_j = j * box_data_length
                with ib.if_scope(out[base_idx + offset_j] >= 0):
                    with ib.if_scope(num_valid_boxes[0] == max_output_size):
                        with ib.for_range(0, box_data_length) as k:
                            out[base_idx + offset_j + k] = -1.0
                        box_indices[i * num_anchors + j] = -1
                    with ib.else_scope():
                        num_valid_boxes[0] += 1

    if return_indices:
        with ib.new_scope():
            nthread_tx = max_threads
            nthread_bx = batch_size // max_threads + 1
            tx = te.thread_axis("threadIdx.x")
            bx = te.thread_axis("blockIdx.x")
            ib.scope_attr(tx, "thread_extent", nthread_tx)
            ib.scope_attr(bx, "thread_extent", nthread_bx)
            i = bx * max_threads + tx
            with ib.if_scope(i < batch_size):
                with ib.for_range(0, valid_count[i]) as j:
                    idx = box_indices[i * num_anchors + j]
                    with ib.if_scope(idx >= 0):
                        box_indices[i * num_anchors + j] = indices[i * num_anchors + idx]

    return ib.get()
Пример #15
0
 def apply(condition, true_expr, false_expr):
     return tir.if_then_else(tir.Cast('bool', _clamp_tvm(condition, 0, 1)),
                             true_expr, false_expr)
Пример #16
0
Файл: nms.py Проект: jchia/tvm
def nms_ir(
    data,
    sorted_index,
    valid_count,
    indices,
    out_bboxes,
    out_scores,
    out_class_ids,
    box_indices,
    num_valid_boxes,
    max_output_size,
    iou_threshold,
    force_suppress,
    top_k,
    coord_start,
    id_index,
    score_index,
    return_indices,
):
    """Low level IR routing for transform location in multibox_detection operator.

    Parameters
    ----------
    data : Buffer
        Buffer of output boxes with class and score.

    sorted_index : Buffer
        Buffer of output box indexes sorted by score.

    valid_count : Buffer
        Buffer of number of valid output boxes.

    indices : Buffer
        indices in original tensor, with shape [batch_size, num_anchors],
        represents the index of box in original data. It could be the third
        output out_indices of get_valid_counts. The values in the second
        dimension are like the output of arange(num_anchors) if get_valid_counts
        is not used before non_max_suppression.

    out_bboxes : Buffer
        Output buffer, to be filled with sorted box coordinates.

    out_scores : Buffer
        Output buffer, to be filled with sorted scores.

    out_class_ids : Buffer
        Output buffer, to be filled with sorted class ids.

    box_indices : Buffer
        A indices tensor mapping sorted indices to original indices
        This is the first output of NMS when return_indices=True.

    num_valid_boxes : Buffer
        Record the number of boxes that have survived IOU tests.
        This is the second output of NMS when return_indices=True.

    max_output_size : int
        Max number of output valid boxes for each instance.
        By default all valid boxes are returned.

    iou_threshold : float
        Overlapping(IoU) threshold to suppress object with smaller score.

    force_suppress : boolean
        Whether to suppress all detections regardless of class_id.

    top_k : int
        Keep maximum top k detections before nms, -1 for no limit.

    coord_start : int
        Start index of the consecutive 4 coordinates.

    id_index : int
        index of the class categories, -1 to disable.

    score_index : optional, int
        Index of the scores/confidence of boxes.

    return_indices : boolean
        Whether to return box indices in input data.

    Returns
    -------
    stmt : Stmt
        The result IR statement.
    """
    def get_boundaries(output, box_idx):
        l = tvm.te.min(
            output[box_idx],
            output[box_idx + 2],
        )
        t = tvm.te.min(
            output[box_idx + 1],
            output[box_idx + 3],
        )
        r = tvm.te.max(
            output[box_idx],
            output[box_idx + 2],
        )
        b = tvm.te.max(
            output[box_idx + 1],
            output[box_idx + 3],
        )
        return l, t, r, b

    def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
        """Calculate overlap of two boxes."""
        a_l, a_t, a_r, a_b = get_boundaries(out_tensor, box_a_idx)
        b_l, b_t, b_r, b_b = get_boundaries(out_tensor, box_b_idx)

        # Overlapping width and height
        w = tvm.te.max(0.0, tvm.te.min(a_r, b_r) - tvm.te.max(a_l, b_l))
        h = tvm.te.max(0.0, tvm.te.min(a_b, b_b) - tvm.te.max(a_t, b_t))

        # Overlapping area
        area = h * w

        # total area of the figure formed by box a and box b
        # except for overlapping area
        u = (a_r - a_l) * (a_b - a_t) + (b_r - b_l) * (b_b - b_t) - area
        return tvm.tir.Select(u <= 0.0, 0.0, area / u)

    batch_size = data.shape[0]
    num_anchors = data.shape[1]
    box_data_length = data.shape[2]

    ib = tvm.tir.ir_builder.create()

    data = ib.buffer_ptr(data)
    sorted_index = ib.buffer_ptr(sorted_index)
    valid_count = ib.buffer_ptr(valid_count)
    indices = ib.buffer_ptr(indices)

    # outputs
    out_bboxes = ib.buffer_ptr(out_bboxes)
    out_scores = ib.buffer_ptr(out_scores)
    out_class_ids = ib.buffer_ptr(out_class_ids)
    box_indices = ib.buffer_ptr(box_indices)
    num_valid_boxes = ib.buffer_ptr(num_valid_boxes)

    if isinstance(iou_threshold, float):
        iou_threshold = tvm.tir.FloatImm("float32", iou_threshold)
    top_k = tvm.tir.IntImm("int32", top_k)
    coord_start = tvm.tir.IntImm("int32", coord_start)
    id_index = tvm.tir.IntImm("int32", id_index)
    score_index = tvm.tir.IntImm("int32", score_index)
    force_suppress = tvm.tir.IntImm("int32", 1 if force_suppress else 0)

    max_threads = int(
        tvm.target.Target.current(allow_none=False).max_num_threads)

    with ib.new_scope():
        nthread_tx = max_threads
        nthread_bx = ceil_div(num_anchors, max_threads)
        nthread_by = batch_size
        tx = te.thread_axis("threadIdx.x")
        bx = te.thread_axis("blockIdx.x")
        by = te.thread_axis("blockIdx.y")
        ib.scope_attr(by, "thread_extent", nthread_by)
        ib.scope_attr(tx, "thread_extent", nthread_tx)
        ib.scope_attr(bx, "thread_extent", nthread_bx)
        i = by
        base_src_idx = i * num_anchors * box_data_length
        base_bbox_idx = i * num_anchors * 4

        with ib.if_scope(tvm.tir.all(iou_threshold > 0, valid_count[i] > 0)):
            # Reorder output
            nkeep = if_then_else(
                tvm.tir.all(top_k > 0, top_k < valid_count[i]), top_k,
                valid_count[i])
            j = bx * max_threads + tx
            with ib.if_scope(j < nkeep):
                src_idx = base_src_idx + sorted_index[i * num_anchors +
                                                      j] * box_data_length
                with ib.for_range(0, 4, kind="unroll") as k:
                    out_bboxes[(base_bbox_idx + j * 4 +
                                k)] = data[src_idx + coord_start + k]

                out_scores[i * num_anchors + j] = data[src_idx + score_index]

                if id_index >= 0:
                    out_class_ids[i * num_anchors + j] = data[src_idx +
                                                              id_index]

            with ib.else_scope():
                # Indices > nkeep are discarded
                # Only needed for return_indices = False case
                if return_indices is False:
                    with ib.if_scope(j < num_anchors):
                        with ib.for_range(0, 4, kind="unroll") as k:
                            out_bboxes[(base_bbox_idx + j * 4 + k)] = -1.0

                        out_scores[i, j] = -1.0

                        if id_index >= 0:
                            out_class_ids[i, j] = -1.0

            if return_indices:
                with ib.if_scope(j < num_anchors):
                    box_indices[i * num_anchors + j] = -1

        with ib.else_scope():
            with ib.if_scope(j < valid_count[i]):
                src_offset = base_src_idx + j * box_data_length

                with ib.for_range(0, 4, kind="unroll") as k:
                    out_bboxes[base_bbox_idx + j * 4 +
                               k] = data[src_offset + coord_start + k]
                out_scores[i * num_anchors + j] = data[src_offset +
                                                       score_index]

                if id_index >= 0:
                    out_class_ids[i * num_anchors + j] = data[src_offset +
                                                              id_index]

                box_indices[i * num_anchors + j] = j

    with ib.new_scope():
        nthread_by = batch_size
        nthread_tx = max_threads

        by = te.thread_axis("blockIdx.y")
        tx = te.thread_axis("threadIdx.x")
        ib.scope_attr(by, "thread_extent", nthread_by)
        ib.scope_attr(tx, "thread_extent", nthread_tx)

        i = by

        base_bbox_idx = i * num_anchors * 4
        num_valid_boxes_local = ib.allocate("int32", (1, ),
                                            name="num_valid_boxes_local",
                                            scope="local")
        num_valid_boxes_local[0] = 0
        nkeep = if_then_else(tvm.tir.all(top_k > 0, top_k < valid_count[i]),
                             top_k, valid_count[i])

        def nms_inner_loop(ib, j):
            # The box j is valid, invalidate other boxes that overlap with j above iou_threshold

            # When return_indices is False, no need to populate box_indices
            if return_indices:
                with ib.if_scope(tx + 0 == 0):
                    orig_idx = sorted_index[i * num_anchors + j]
                    box_indices[i,
                                num_valid_boxes_local[0]] = indices[i,
                                                                    orig_idx]

            num_valid_boxes_local[0] += 1

            offset_j = j * 4
            num_iter_per_thread = ceil_div(nkeep - (j + 1), nthread_tx)

            with ib.for_range(0, num_iter_per_thread) as _k:
                k = j + 1 + _k * nthread_tx + tx
                offset_k = k * 4

                with ib.if_scope(
                        tvm.tir.all(
                            k < nkeep,
                            out_scores[i, k] > 0,  # is the box k still valid?
                            tvm.tir.any(
                                force_suppress > 0,
                                id_index < 0,
                                out_class_ids[i, k] == out_class_ids[i, j],
                            ),
                        )):
                    iou = calculate_overlap(
                        out_bboxes,
                        base_bbox_idx + offset_j,
                        base_bbox_idx + offset_k,
                    )
                    with ib.if_scope(iou >= iou_threshold):
                        # invalidate the box k
                        out_scores[i, k] = -1.0

                        if return_indices is False and id_index >= 0:
                            out_class_ids[i, k] = -1.0

                ib.emit(
                    tvm.tir.Call(None, "tir.tvm_storage_sync",
                                 tvm.runtime.convert(["shared"])))

        if isinstance(max_output_size, int):
            max_output_size = tvm.tir.const(max_output_size)

        with ib.if_scope(tvm.tir.all(iou_threshold > 0, valid_count[i] > 0)):
            # Apply nms
            with ib.for_range(0, nkeep) as j:
                # Proceed to the inner loop if the box j is still valid
                with ib.if_scope(out_scores[i, j] > -1.0):
                    with ib.if_scope(max_output_size > 0):
                        # No need to do more iteration if we have already reached max_output_size
                        # boxes
                        # TODO(masahi): Add TIR while loop to realize early exit from the outer loop
                        with ib.if_scope(
                                num_valid_boxes_local[0] < max_output_size):
                            nms_inner_loop(ib, j)
                    with ib.else_scope():
                        nms_inner_loop(ib, j)

            with ib.if_scope(tx + 0 == 0):
                num_valid_boxes[i] = num_valid_boxes_local[0]

        with ib.else_scope():
            num_valid_boxes[i] = 0

    return ib.get()
Пример #17
0
def nms_ir(data, sorted_index, valid_count, out, box_indices, max_output_size,
           iou_threshold, force_suppress, top_k, coord_start, id_index,
           score_index):
    """Low level IR routing for transform location in multibox_detection operator.

    Parameters
    ----------
    data : Buffer
        Buffer of output boxes with class and score.

    sort_index : Buffer
        Buffer of output box indexes sorted by score.

    valid_count : Buffer
        Buffer of number of valid output boxes.

    out : Buffer
        Output buffer.

    max_output_size : int
        Max number of output valid boxes for each instance.
        By default all valid boxes are returned.

    iou_threshold : float
        Overlapping(IoU) threshold to suppress object with smaller score.

    force_suppress : boolean
        Whether to suppress all detections regardless of class_id.

    top_k : int
        Keep maximum top k detections before nms, -1 for no limit.

    coord_start : int
        Start index of the consecutive 4 coordinates.

    id_index : int
        index of the class categories, -1 to disable.

    score_index : optional, int
        Index of the scores/confidence of boxes.

    Returns
    -------
    stmt : Stmt
        The result IR statement.
    """
    def calculate_overlap(out_tensor, box_a_idx, box_b_idx):
        """Calculate overlap of two boxes.
        """
        w = tvm.te.max(
            0.0,
            tvm.te.min(out_tensor[box_a_idx + 2], out_tensor[box_b_idx + 2]) -
            tvm.te.max(out_tensor[box_a_idx], out_tensor[box_b_idx]))
        h = tvm.te.max(
            0.0,
            tvm.te.min(out_tensor[box_a_idx + 3], out_tensor[box_b_idx + 3]) -
            tvm.te.max(out_tensor[box_a_idx + 1], out_tensor[box_b_idx + 1]))
        i = w * h
        u = (out_tensor[box_a_idx + 2] - out_tensor[box_a_idx]) * \
            (out_tensor[box_a_idx + 3] - out_tensor[box_a_idx + 1]) + \
            (out_tensor[box_b_idx + 2] - out_tensor[box_b_idx]) * \
            (out_tensor[box_b_idx + 3] - out_tensor[box_b_idx + 1]) - i
        return tvm.tir.Select(u <= 0.0, 0.0, i / u)

    batch_size = data.shape[0]
    num_anchors = data.shape[1]
    box_data_length = data.shape[2]

    ib = tvm.tir.ir_builder.create()

    data = ib.buffer_ptr(data)
    sorted_index = ib.buffer_ptr(sorted_index)
    valid_count = ib.buffer_ptr(valid_count)
    out = ib.buffer_ptr(out)
    box_indices = ib.buffer_ptr(box_indices)
    num_valid_boxes = ib.allocate("int32", (1, ),
                                  name="num_valid_boxes",
                                  scope="local")

    max_threads = int(
        tvm.target.Target.current(allow_none=False).max_num_threads)
    nthread_tx = max_threads
    nthread_bx = num_anchors // max_threads + 1
    tx = te.thread_axis("threadIdx.x")
    bx = te.thread_axis("blockIdx.x")
    ib.scope_attr(tx, "thread_extent", nthread_tx)
    ib.scope_attr(bx, "thread_extent", nthread_bx)
    j = bx * max_threads + tx

    iou_threshold = tvm.ir.make_node("FloatImm",
                                     dtype="float32",
                                     value=iou_threshold)
    top_k = tvm.ir.make_node("IntImm", dtype="int32", value=top_k)
    coord_start = tvm.ir.make_node("IntImm", dtype="int32", value=coord_start)
    id_index = tvm.ir.make_node("IntImm", dtype="int32", value=id_index)
    score_index = tvm.ir.make_node("IntImm", dtype="int32", value=score_index)
    force_suppress = tvm.ir.make_node("IntImm",
                                      dtype="int32",
                                      value=1 if force_suppress else 0)

    with ib.for_range(0, batch_size, for_type="unroll") as i:
        base_idx = i * num_anchors * box_data_length
        with ib.if_scope(tvm.tir.all(iou_threshold > 0, valid_count[i] > 0)):
            # Reorder output
            nkeep = if_then_else(
                tvm.tir.all(top_k > 0, top_k < valid_count[i]), top_k,
                valid_count[i])
            with ib.if_scope(j < nkeep):
                with ib.for_range(0, box_data_length) as k:
                    out[(base_idx + j * box_data_length + k)] = \
                        data[(base_idx + sorted_index[i * num_anchors + j]
                              * box_data_length + k)]
                box_indices[i * num_anchors +
                            j] = sorted_index[i * num_anchors + j]
            with ib.if_scope(tvm.tir.all(top_k > 0, top_k < valid_count[i])):
                with ib.if_scope(j < valid_count[i] - nkeep):
                    with ib.for_range(0, box_data_length) as k:
                        out[(base_idx + (j + nkeep) * box_data_length +
                             k)] = -1.0
                    box_indices[i * num_anchors + (j + nkeep)] = -1
            # Apply nms
            with ib.for_range(0, valid_count[i]) as k:
                offset_k = k * box_data_length
                with ib.if_scope(
                        tvm.tir.all(
                            out[base_idx + offset_k + score_index] > 0,
                            tvm.tir.any(
                                id_index < 0,
                                out[base_idx + offset_k + id_index] >= 0))):
                    with ib.if_scope(j < valid_count[i]):
                        offset_j = j * box_data_length
                        with ib.if_scope(
                                tvm.tir.all(
                                    j > k,
                                    out[base_idx + offset_j + score_index] > 0,
                                    tvm.tir.any(
                                        id_index < 0,
                                        out[base_idx + offset_j + id_index] >=
                                        0),
                                    tvm.tir.any(
                                        force_suppress > 0, id_index < 0,
                                        out[base_idx + offset_k +
                                            id_index] == out[base_idx +
                                                             offset_j +
                                                             id_index]))):
                            iou = calculate_overlap(
                                out, base_idx + offset_j + coord_start,
                                base_idx + offset_k + coord_start)
                            with ib.if_scope(iou >= iou_threshold):
                                out[base_idx + offset_j + score_index] = -1.0
                                with ib.if_scope(id_index >= 0):
                                    out[base_idx + offset_j + id_index] = -1.0
                                box_indices[i * num_anchors + j] = -1
        with ib.else_scope():
            with ib.if_scope(j < valid_count[i]):
                offset_j = j * box_data_length
                with ib.for_range(0, box_data_length) as k:
                    out[(base_idx + offset_j + k)] = data[base_idx + offset_j +
                                                          k]
                box_indices[i * num_anchors + j] = j
        # Set invalid entry to be -1
        with ib.if_scope(j < num_anchors - valid_count[i]):
            with ib.for_range(0, box_data_length) as k:
                out[base_idx + (j + valid_count[i]) * box_data_length +
                    k] = -1.0
            box_indices[i * num_anchors + j + valid_count[i]] = -1
        # Only return max_output_size number of valid boxes
        num_valid_boxes[0] = 0
        with ib.if_scope(max_output_size > 0):
            with ib.if_scope(j < valid_count[i]):
                offset_j = j * box_data_length
                with ib.if_scope(out[base_idx + offset_j] >= 0):
                    with ib.if_scope(num_valid_boxes[0] == max_output_size):
                        with ib.for_range(0, box_data_length) as k:
                            out[base_idx + offset_j + k] = -1.0
                        box_indices[i * num_anchors + j] = -1
                    with ib.else_scope():
                        num_valid_boxes[0] += 1

    return ib.get()
def primfunc_local_allocates(placeholder_162: ty.handle,
                             placeholder_163: ty.handle,
                             placeholder_164: ty.handle,
                             T_cast_76: ty.handle) -> None:
    # function attr dict
    tir.func_attr({
        "global_symbol":
        "fused_nn_conv2d_add_cast_fixed_point_multiply_clip_cast_cast_9",
        "tir.noalias": True
    })
    placeholder_165 = tir.match_buffer(placeholder_162, [1, 14, 14, 512],
                                       dtype="int16",
                                       elem_offset=0,
                                       align=128,
                                       offset_factor=1)
    placeholder_166 = tir.match_buffer(placeholder_163, [3, 3, 512, 1],
                                       dtype="int16",
                                       elem_offset=0,
                                       align=128,
                                       offset_factor=1)
    placeholder_167 = tir.match_buffer(placeholder_164, [1, 1, 1, 512],
                                       dtype="int32",
                                       elem_offset=0,
                                       align=128,
                                       offset_factor=1)
    T_cast_77 = tir.match_buffer(T_cast_76, [1, 14, 14, 512],
                                 dtype="int16",
                                 elem_offset=0,
                                 align=128,
                                 offset_factor=1)
    # body
    PaddedInput_25 = tir.allocate([1, 16, 16, 512], "int16", "global")
    for i1_35, i2_46, i3_47 in tir.grid(16, 16, 512):
        PaddedInput_25[(((i1_35 * 8192) + (i2_46 * 512)) +
                        i3_47)] = tir.if_then_else(
                            ((((1 <= i1_35) and (i1_35 < 15)) and
                              (1 <= i2_46)) and (i2_46 < 15)),
                            tir.load("int16", placeholder_165.data,
                                     ((((i1_35 * 7168) +
                                        (i2_46 * 512)) + i3_47) - 7680)),
                            tir.int16(0),
                            dtype="int16")
    T_add_11 = tir.allocate([1, 14, 14, 512], "int32", "global")
    with tir.allocate([1, 14, 14, 512], "int32",
                      "global") as DepthwiseConv2d_11:
        for i_11, j_11, c_11 in tir.grid(14, 14, 512):
            DepthwiseConv2d_11[(((i_11 * 7168) + (j_11 * 512)) + c_11)] = 0
            for di_11, dj_11 in tir.grid(3, 3):
                DepthwiseConv2d_11[(((i_11 * 7168) + (j_11 * 512)) + c_11)] = (
                    tir.load("int32", DepthwiseConv2d_11,
                             (((i_11 * 7168) + (j_11 * 512)) + c_11)) +
                    (tir.load("int16", PaddedInput_25,
                              (((((i_11 * 8192) + (di_11 * 8192)) +
                                 (j_11 * 512)) +
                                (dj_11 * 512)) + c_11)).astype("int32") *
                     tir.load("int16", placeholder_166.data,
                              (((di_11 * 1536) +
                                (dj_11 * 512)) + c_11)).astype("int32")))
        for ax1_44, ax2_45, ax3_47 in tir.grid(14, 14, 512):
            T_add_11[(((ax1_44 * 7168) + (ax2_45 * 512)) + ax3_47)] = (
                tir.load("int32", DepthwiseConv2d_11,
                         (((ax1_44 * 7168) + (ax2_45 * 512)) + ax3_47)) +
                tir.load("int32", placeholder_167.data, ax3_47))
    compute_22 = tir.allocate([1, 14, 14, 512], "int32", "global")
    with tir.allocate([1, 14, 14, 512], "int32", "global") as T_cast_78:
        for ax1_45, ax2_46, ax3_48 in tir.grid(14, 14, 512):
            T_cast_78[(((ax1_45 * 7168) +
                        (ax2_46 * 512)) + ax3_48)] = tir.load(
                            "int32", T_add_11,
                            (((ax1_45 * 7168) + (ax2_46 * 512)) + ax3_48))
        for i1_36, i2_47, i3_48 in tir.grid(14, 14, 512):
            compute_22[(((i1_36 * 7168) + (i2_47 * 512)) +
                        i3_48)] = tir.q_multiply_shift(tir.load(
                            "int32", T_cast_78,
                            (((i1_36 * 7168) + (i2_47 * 512)) + i3_48)),
                                                       1948805937,
                                                       31,
                                                       -5,
                                                       dtype="int32")
    T_cast_79 = tir.allocate([1, 14, 14, 512], "uint8", "global")
    with tir.allocate([1, 14, 14, 512], "int32", "global") as compute_23:
        for i1_37, i2_48, i3_49 in tir.grid(14, 14, 512):
            compute_23[(((i1_37 * 7168) + (i2_48 * 512)) + i3_49)] = tir.max(
                tir.max(
                    tir.load("int32", compute_22,
                             (((i1_37 * 7168) + (i2_48 * 512)) + i3_49)), 255),
                0)
        for ax1_46, ax2_47, ax3_49 in tir.grid(14, 14, 512):
            T_cast_79[(((ax1_46 * 7168) +
                        (ax2_47 * 512)) + ax3_49)] = tir.load(
                            "int32", compute_23,
                            (((ax1_46 * 7168) +
                              (ax2_47 * 512)) + ax3_49)).astype("uint8")
    for ax1_47, ax2_48, ax3_50 in tir.grid(14, 14, 512):
        T_cast_77.data[(((ax1_47 * 7168) +
                         (ax2_48 * 512)) + ax3_50)] = tir.load(
                             "uint8", T_cast_79,
                             (((ax1_47 * 7168) +
                               (ax2_48 * 512)) + ax3_50)).astype("int16")
Пример #19
0
def multibox_prior_ir(data, out, sizes, ratios, steps, offsets):
    """Low level IR routing for multibox_prior operator.

    Parameters
    ----------
    data : Buffer
        Input data buffer.

    out : Buffer
        Output buffer.

    sizes : tuple of float
        Tuple of sizes for anchor boxes.

    ratios : tuple of float
        Tuple of ratios for anchor boxes.

    steps : Tuple of float
        Priorbox step across y and x, -1 for auto calculation.

    offsets : tuple of int
        Priorbox center offsets, y and x respectively.

    Returns
    -------
    stmt : Stmt
        The result IR statement.
    """
    max_threads = int(
        math.sqrt(tvm.target.Target.current(allow_none=False).max_num_threads))
    tx = te.thread_axis("threadIdx.x")
    ty = te.thread_axis("threadIdx.y")
    bx = te.thread_axis("blockIdx.x")
    by = te.thread_axis("blockIdx.y")
    ib = tvm.tir.ir_builder.create()
    p_out = ib.buffer_ptr(out)
    in_height = data.shape[2]
    in_width = data.shape[3]
    nthread_tx = max_threads
    nthread_bx = in_height // max_threads + 1
    nthread_ty = max_threads
    nthread_by = in_width // max_threads + 1
    ib.scope_attr(tx, "thread_extent", nthread_tx)
    ib.scope_attr(ty, "thread_extent", nthread_ty)
    ib.scope_attr(bx, "thread_extent", nthread_bx)
    ib.scope_attr(by, "thread_extent", nthread_by)

    num_sizes = len(sizes)
    num_ratios = len(ratios)
    size_ratio_concat = sizes + ratios
    steps_h = steps[0] if steps[0] > 0 else 1.0 / in_height
    steps_w = steps[1] if steps[1] > 0 else 1.0 / in_width
    offset_h = offsets[0]
    offset_w = offsets[1]

    i = bx * max_threads + tx
    j = by * max_threads + ty
    with ib.if_scope((i < in_height)):
        with ib.if_scope((j < in_width)):
            center_h = (i + offset_h) * steps_h
            center_w = (j + offset_w) * steps_w

            for k in range(num_sizes + num_ratios - 1):
                w = if_then_else(
                    k < num_sizes,
                    float(size_ratio_concat[k]) * in_height / in_width / 2.0,
                    float(size_ratio_concat[0]) * in_height / in_width *
                    math.sqrt(size_ratio_concat[k + 1]) / 2.0,
                )
                h = if_then_else(
                    k < num_sizes,
                    size_ratio_concat[k] / 2.0,
                    size_ratio_concat[0] /
                    math.sqrt(size_ratio_concat[k + 1]) / 2.0,
                )
                count = (i * in_width * (num_sizes + num_ratios - 1) + j *
                         (num_sizes + num_ratios - 1) + k) * 4
                p_out[count] = center_w - w
                p_out[count + 1] = center_h - h
                p_out[count + 2] = center_w + w
                p_out[count + 3] = center_h + h

    body = ib.get()
    return body
def primfunc_global_allocates(placeholder_144: ty.handle,
                              placeholder_145: ty.handle,
                              placeholder_146: ty.handle,
                              T_cast_48: ty.handle) -> None:
    # function attr dict
    tir.func_attr({
        "global_symbol":
        "fused_nn_conv2d_add_cast_fixed_point_multiply_clip_cast_cast_13",
        "tir.noalias": True
    })
    placeholder_147 = tir.match_buffer(placeholder_144, [1, 14, 14, 512],
                                       dtype="int16",
                                       elem_offset=0,
                                       align=128,
                                       offset_factor=1)
    placeholder_148 = tir.match_buffer(placeholder_145, [3, 3, 512, 1],
                                       dtype="int16",
                                       elem_offset=0,
                                       align=128,
                                       offset_factor=1)
    placeholder_149 = tir.match_buffer(placeholder_146, [1, 1, 1, 512],
                                       dtype="int32",
                                       elem_offset=0,
                                       align=128,
                                       offset_factor=1)
    T_cast_49 = tir.match_buffer(T_cast_48, [1, 14, 14, 512],
                                 dtype="int16",
                                 elem_offset=0,
                                 align=128,
                                 offset_factor=1)
    # body
    PaddedInput_22 = tir.allocate([131072], "int16", "global")
    DepthwiseConv2d_9 = tir.allocate([100352], "int32", "global")
    for i1_29, i2_39, i3_40 in tir.grid(16, 16, 512):
        PaddedInput_22[(((i1_29 * 8192) + (i2_39 * 512)) +
                        i3_40)] = tir.if_then_else(
                            ((((1 <= i1_29) and (i1_29 < 15)) and
                              (1 <= i2_39)) and (i2_39 < 15)),
                            tir.load("int16", placeholder_147.data,
                                     ((((i1_29 * 7168) +
                                        (i2_39 * 512)) + i3_40) - 7680)),
                            tir.int16(0),
                            dtype="int16")
    for i_9, j_9, c_9 in tir.grid(14, 14, 512):
        DepthwiseConv2d_9[(((i_9 * 7168) + (j_9 * 512)) + c_9)] = 0
        for di_9, dj_9 in tir.grid(3, 3):
            DepthwiseConv2d_9[(((i_9 * 7168) + (j_9 * 512)) + c_9)] = (
                tir.load("int32", DepthwiseConv2d_9,
                         (((i_9 * 7168) + (j_9 * 512)) + c_9)) +
                (tir.load("int16", PaddedInput_22,
                          (((((i_9 * 8192) + (di_9 * 8192)) + (j_9 * 512)) +
                            (dj_9 * 512)) + c_9)).astype("int32") *
                 tir.load("int16", placeholder_148.data,
                          (((di_9 * 1536) +
                            (dj_9 * 512)) + c_9)).astype("int32")))
    for ax1_27, ax2_28, ax3_30 in tir.grid(14, 14, 512):
        DepthwiseConv2d_9[(((ax1_27 * 7168) + (ax2_28 * 512)) + ax3_30)] = (
            tir.load("int32", DepthwiseConv2d_9,
                     (((ax1_27 * 7168) + (ax2_28 * 512)) + ax3_30)) +
            tir.load("int32", placeholder_149.data, ax3_30))
    for i1_30, i2_40, i3_41 in tir.grid(14, 14, 512):
        DepthwiseConv2d_9[(((i1_30 * 7168) + (i2_40 * 512)) +
                           i3_41)] = tir.q_multiply_shift(tir.load(
                               "int32", DepthwiseConv2d_9,
                               (((i1_30 * 7168) + (i2_40 * 512)) + i3_41)),
                                                          1269068532,
                                                          31,
                                                          -4,
                                                          dtype="int32")
    for i1_31, i2_41, i3_42 in tir.grid(14, 14, 512):
        DepthwiseConv2d_9[(((i1_31 * 7168) + (i2_41 * 512)) +
                           i3_42)] = tir.max(
                               tir.max(
                                   tir.load("int32", DepthwiseConv2d_9,
                                            (((i1_31 * 7168) +
                                              (i2_41 * 512)) + i3_42)), 255),
                               0)
    for ax1_28, ax2_29, ax3_31 in tir.grid(14, 14, 512):
        PaddedInput_22[(((ax1_28 * 7168) +
                         (ax2_29 * 512)) + ax3_31)] = tir.load(
                             "int32", DepthwiseConv2d_9,
                             (((ax1_28 * 7168) +
                               (ax2_29 * 512)) + ax3_31)).astype("uint8")
    for ax1_29, ax2_30, ax3_32 in tir.grid(14, 14, 512):
        T_cast_49.data[(((ax1_29 * 7168) +
                         (ax2_30 * 512)) + ax3_32)] = tir.load(
                             "uint8", PaddedInput_22,
                             (((ax1_29 * 7168) +
                               (ax2_30 * 512)) + ax3_32)).astype("int16")