예제 #1
0
파일: nms.py 프로젝트: gyuseokByeon/tvm
def get_valid_counts_scan(data, partial_in, partial):
    """Low level IR to do scan.

    Parameters
    ----------
    data: Buffer
        3D Buffer with shape [batch_size, num_anchors, elem_length], output of nms.

    idx_in : Buffer
        2D Buffer of valid data indices with shape [batch_size, num_anchors].

    idx : Buffer
        2D Buffer of valid data indices with shape [batch_size, num_anchors].

    partial : Buffer
        2D Buffer of valid data indices with shape [batch_size, new_range].

    Returns
    -------
    stmt : Stmt
        The result IR statement.
    """
    batch_size = data.shape[0]
    num_anchors = data.shape[1]
    ib = tvm.ir_builder.create()
    partial_in = ib.buffer_ptr(partial_in)
    partial = ib.buffer_ptr(partial)
    max_threads = int(
        tvm.target.current_target(allow_none=False).max_num_threads)
    elem_per_thread = num_anchors // max_threads + 1
    nthread_tx = max_threads
    nthread_bx = batch_size
    tx = tvm.thread_axis("threadIdx.x")
    bx = tvm.thread_axis("blockIdx.x")
    ib.scope_attr(tx, "thread_extent", nthread_tx)
    ib.scope_attr(bx, "thread_extent", nthread_bx)
    var = tvm.make.node("FloatImm", dtype="float32", value=2)
    new_range = num_anchors // elem_per_thread + 1
    iteration = log(cast(new_range, "float32")) // math.log(2)
    # Scan: Kogge-Stone adder
    with ib.if_scope(
            tvm.all(bx < batch_size, tx < tvm.min(new_range, num_anchors))):
        with ib.for_range(0, iteration) as k:
            with ib.if_scope(k == 0):
                with ib.if_scope(
                        tvm.all(tx > 0, tx < tvm.min(new_range, num_anchors))):
                    partial[bx * new_range + tx] = \
                    partial_in[bx * new_range + tx] + partial_in[bx * new_range + tx - 1]
                with ib.else_scope():
                    partial[bx * new_range] = partial_in[bx * new_range]
            with ib.else_scope():
                with ib.if_scope(tvm.all(tx >= cast(power(var, k), "int32"), \
                                         tx < tvm.min(new_range, num_anchors))):
                    partial[bx * new_range + tx] += \
                    partial[bx * new_range + tx - cast(power(var, k), "int32")]
            ib.emit(
                tvm.make.Call(None, 'tvm_storage_sync',
                              tvm.convert(['shared']), tvm.expr.Call.Intrinsic,
                              None, 0))
    return ib.get()
예제 #2
0
파일: nms.py 프로젝트: bddppq/tvm
def get_valid_counts_scan(data, partial_in, partial):
    """Low level IR to do scan.

    Parameters
    ----------
    data: Buffer
        3D Buffer with shape [batch_size, num_anchors, elem_length], output of nms.

    idx_in : Buffer
        2D Buffer of valid data indices with shape [batch_size, num_anchors].

    idx : Buffer
        2D Buffer of valid data indices with shape [batch_size, num_anchors].

    partial : Buffer
        2D Buffer of valid data indices with shape [batch_size, new_range].

    Returns
    -------
    stmt : Stmt
        The result IR statement.
    """
    batch_size = data.shape[0]
    num_anchors = data.shape[1]
    ib = tvm.ir_builder.create()
    partial_in = ib.buffer_ptr(partial_in)
    partial = ib.buffer_ptr(partial)
    max_threads = int(tvm.target.current_target(allow_none=False).max_num_threads)
    elem_per_thread = num_anchors // max_threads + 1
    nthread_tx = max_threads
    nthread_bx = batch_size
    tx = tvm.thread_axis("threadIdx.x")
    bx = tvm.thread_axis("blockIdx.x")
    ib.scope_attr(tx, "thread_extent", nthread_tx)
    ib.scope_attr(bx, "thread_extent", nthread_bx)
    var = tvm.make.node("FloatImm", dtype="float32", value=2)
    new_range = num_anchors // elem_per_thread + 1
    iteration = log(cast(new_range, "float32")) // math.log(2)
    # Scan: Kogge-Stone adder
    with ib.if_scope(tvm.all(bx < batch_size, tx < tvm.min(new_range, num_anchors))):
        with ib.for_range(0, iteration) as k:
            with ib.if_scope(k == 0):
                with ib.if_scope(tvm.all(tx > 0, tx < tvm.min(new_range, num_anchors))):
                    partial[bx * new_range + tx] = \
                    partial_in[bx * new_range + tx] + partial_in[bx * new_range + tx - 1]
                with ib.else_scope():
                    partial[bx * new_range] = partial_in[bx * new_range]
            with ib.else_scope():
                with ib.if_scope(tvm.all(tx >= cast(power(var, k), "int32"), \
                                         tx < tvm.min(new_range, num_anchors))):
                    partial[bx * new_range + tx] += \
                    partial[bx * new_range + tx - cast(power(var, k), "int32")]
            ib.emit(tvm.make.Call(None, 'tvm_storage_sync',
                                  tvm.convert(['shared']),
                                  tvm.expr.Call.Intrinsic, None, 0))
    return ib.get()