예제 #1
0
    def _sample(i, c, ph, pw):
        """Average the bilinear samples of one pooled bin of one ROI.

        ``i`` selects the row of ``rois``, ``c`` the channel, and
        ``(ph, pw)`` the output bin.  A ROI row is read as
        ``[batch_index, w_start, h_start, w_end, h_end]``.
        """
        roi = rois[i]
        batch_index = roi[0].astype('int32')

        # Scale the ROI corners by spatial_scale.
        w_lo = roi[1] * spatial_scale
        h_lo = roi[2] * spatial_scale
        w_hi = roi[3] * spatial_scale
        h_hi = roi[4] * spatial_scale

        # Malformed ROIs are clamped to an extent of at least 1 per axis.
        roi_h = tvm.te.max(h_hi - h_lo, tvm.tir.const(1.0, dtype))
        roi_w = tvm.te.max(w_hi - w_lo, tvm.tir.const(1.0, dtype))

        bin_h = roi_h / pooled_size_h
        bin_w = roi_w / pooled_size_w

        # Samples per bin along each axis: fixed when sample_ratio is
        # positive, otherwise the ceiling of the bin extent.
        if sample_ratio > 0:
            grid_h = grid_w = tvm.tir.const(sample_ratio, 'int32')
        else:
            grid_h = te.ceil(roi_h / pooled_size_h).astype('int32')
            grid_w = te.ceil(roi_w / pooled_size_w).astype('int32')

        count = grid_h * grid_w
        rh = te.reduce_axis((0, grid_h))
        rw = te.reduce_axis((0, grid_w))

        # Top-left corner of the requested bin.
        bin_top = h_lo + ph * bin_h
        bin_left = w_lo + pw * bin_w

        # Each sample sits at the centre of its grid cell inside the bin.
        sample_y = bin_top + (rh + 0.5) * bin_h / grid_h
        sample_x = bin_left + (rw + 0.5) * bin_w / grid_w

        return te.sum(
            _bilinear(batch_index, c, sample_y, sample_x) / count,
            axis=[rh, rw])
예제 #2
0
def prepare_output_ir(sorted_bbox_buf, remove_mask_buf, out_buf):
    """Build the IR that packs the boxes kept by NMS into contiguous rows.

    The thread extent of ``threadIdx.x`` is set to ``batch`` and the thread
    index is used as the batch index.  The kept boxes are first counted,
    then copied pass after pass over the box list (so kept boxes repeat)
    until ``rpn_post_nms_top_n`` rows have been written for that batch
    entry.  Nothing is written when no box was kept.

    Parameters
    ----------
    sorted_bbox_buf : tvm.te.schedule.Buffer
        3-D with shape [batch, num_bbox, 5]. Each row is laid out as
        [w_start, h_start, w_end, h_end, score].

    remove_mask_buf : tvm.te.schedule.Buffer
        2-D with shape [batch, num_bbox]. Boolean mask; a true entry marks
        a bounding box that must be dropped.

    out_buf : tvm.te.schedule.Buffer
        2-D with shape [batch * rpn_post_nms_top_n, 5]. Each row is written
        as [batch_index, w_start, h_start, w_end, h_end].

    Returns
    -------
    stmt : Stmt
        The generated IR statement.
    """
    batch, num_bbox, _ = get_const_tuple(sorted_bbox_buf.shape)
    rpn_post_nms_top_n = get_const_int(out_buf.shape[0]) // batch

    ib = tvm.tir.ir_builder.create()
    tx = te.thread_axis("threadIdx.x")
    ib.scope_attr(tx, "thread_extent", batch)
    batch_idx = tx

    # Index of the next output row to fill for this batch entry.
    cursor = ib.allocate('int32', (1, ), 'i', scope='local')
    cursor[0] = 0

    boxes = ib.buffer_ptr(sorted_bbox_buf)
    removed = ib.buffer_ptr(remove_mask_buf)
    packed = ib.buffer_ptr(out_buf)

    # Number of boxes left after NMS.
    kept = ib.allocate('int32', (1, ), 'nkeep', scope='local')
    kept[0] = 0

    with ib.for_range(0, num_bbox) as j:
        # The mask load is an expression node, not a Python bool, so the
        # comparison against False has to stay explicit.
        with ib.if_scope(removed[batch_idx * num_bbox + j] == False):
            kept[0] = kept[0] + 1

    with ib.if_scope(kept[0] > 0):
        # Enough passes over the box list to reach rpn_post_nms_top_n rows.
        passes = te.ceil(
            tvm.tir.const(rpn_post_nms_top_n, 'float32') /
            kept[0]).astype('int32')
        with ib.for_range(0, passes):
            with ib.for_range(0, num_bbox) as j:
                src = (batch_idx * num_bbox + j) * 5
                dst = (batch_idx * rpn_post_nms_top_n + cursor[0]) * 5
                writable = tvm.tir.all(
                    cursor[0] < rpn_post_nms_top_n,
                    removed[(batch_idx * num_bbox + j)] == False)
                with ib.if_scope(writable):
                    # Column 0 is the batch index, columns 1..4 the box.
                    packed[dst] = tvm.tir.Cast('float32', batch_idx)
                    with ib.for_range(0, 4, for_type='unroll') as k:
                        packed[dst + k + 1] = boxes[src + k]
                    cursor[0] = cursor[0] + 1

    return ib.get()
예제 #3
0
def prepare_output_ir(sorted_bbox_buf, remove_mask_buf, out_buf):
    """Build the IR that packs the boxes kept by NMS into contiguous rows.

    Batch entries are handled by explicit loops.  For every batch entry the
    kept boxes are counted, then copied pass after pass over the box list
    (so kept boxes repeat) until ``rpn_post_nms_top_n`` rows have been
    written.  Batch entries with no kept box are left untouched.

    Parameters
    ----------
    sorted_bbox_buf : tvm.te.schedule.Buffer
        3-D with shape [batch, num_bbox, 5]. Each row is laid out as
        [w_start, h_start, w_end, h_end, score].

    remove_mask_buf : tvm.te.schedule.Buffer
        2-D with shape [batch, num_bbox]. Boolean mask; a true entry marks
        a bounding box that must be dropped.

    out_buf : tvm.te.schedule.Buffer
        2-D with shape [batch * rpn_post_nms_top_n, 5]. Each row is written
        as [batch_index, w_start, h_start, w_end, h_end].

    Returns
    -------
    stmt : Stmt
        The generated IR statement.
    """
    batch, num_bbox, _ = get_const_tuple(sorted_bbox_buf.shape)
    rpn_post_nms_top_n = get_const_int(out_buf.shape[0]) // batch

    ib = tvm.tir.ir_builder.create()

    # Per-batch index of the next output row to fill.
    cursor = ib.allocate("int32", (batch,), "i", scope="local")

    boxes = ib.buffer_ptr(sorted_bbox_buf)
    removed = ib.buffer_ptr(remove_mask_buf)
    packed = ib.buffer_ptr(out_buf)

    # Per-batch number of boxes left after NMS.
    kept = ib.allocate("int32", (batch,), "nkeep", scope="local")

    with ib.for_range(0, batch) as b:
        kept[b] = 0
        cursor[b] = 0

    with ib.for_range(0, num_bbox) as j:
        with ib.for_range(0, batch) as b:
            # The mask load is an expression node, not a Python bool, so
            # the comparison against False has to stay explicit.
            with ib.if_scope(removed[b * num_bbox + j] == False):
                kept[b] = kept[b] + 1

    with ib.for_range(0, batch) as b:
        with ib.if_scope(kept[b] > 0):
            # Enough passes over the box list to reach rpn_post_nms_top_n rows.
            passes = te.ceil(
                tvm.tir.const(rpn_post_nms_top_n, "float32") / kept[b]
            ).astype("int32")
            with ib.for_range(0, passes):
                with ib.for_range(0, num_bbox) as j:
                    src = (b * num_bbox + j) * 5
                    dst = (b * rpn_post_nms_top_n + cursor[b]) * 5
                    writable = tvm.tir.all(
                        cursor[b] < rpn_post_nms_top_n,
                        removed[(b * num_bbox + j)] == False,
                    )
                    with ib.if_scope(writable):
                        # Column 0 is the batch index, columns 1..4 the box.
                        packed[dst] = tvm.tir.Cast("float32", b)
                        with ib.for_range(0, 4, kind="unroll") as k:
                            packed[dst + k + 1] = boxes[src + k]
                        cursor[b] = cursor[b] + 1

    return ib.get()
예제 #4
0
def get_closest_index(in_x, rounding_method, boxes):
    """Map a fractional coordinate to an int32 index.

    Plain rounding is used whenever ``boxes`` is given, regardless of
    ``rounding_method``.  Otherwise ``rounding_method`` must be one of
    "round", "round_prefer_floor", "round_prefer_ceil", "floor" or "ceil";
    any other value raises ``ValueError``.
    """
    # Small bias applied before floor/ceil to prevent gpu rounding errors.
    epsilon = 1e-5

    if rounding_method == "round" or boxes is not None:
        return te.round(in_x).astype("int32")
    if rounding_method == "round_prefer_floor":
        return te.ceil(in_x - 0.5).astype("int32")
    if rounding_method == "round_prefer_ceil":
        return te.floor(in_x + 0.5).astype("int32")
    if rounding_method == "floor":
        return te.floor(in_x + epsilon).astype("int32")
    if rounding_method == "ceil":
        return te.ceil(in_x - epsilon).astype("int32")
    raise ValueError("Uknown rounding method: {}".format(rounding_method))
예제 #5
0
    def _pool(i, c, ph, pw):
        """Max-pool one output bin ``(ph, pw)`` of ROI ``i`` for channel ``c``.

        A ROI row is read as ``[batch_index, w_start, h_start, w_end, h_end]``.
        An empty bin yields 0.
        """
        roi = rois[i]
        batch_index = roi[0].astype("int32")

        def _to_grid(coord):
            # Scale by spatial_scale and round to an int32 position.
            return te.round(coord * spatial_scale).astype("int32")

        roi_start_w = _to_grid(roi[1])
        roi_start_h = _to_grid(roi[2])
        roi_end_w = _to_grid(roi[3])
        roi_end_h = _to_grid(roi[4])

        # Malformed ROIs are forced to be 1x1.
        roi_h = tvm.te.max(roi_end_h - roi_start_h + 1,
                           tvm.tir.const(1, "int32"))
        roi_w = tvm.te.max(roi_end_w - roi_start_w + 1,
                           tvm.tir.const(1, "int32"))

        bin_h = roi_h.astype(dtype) / pooled_size_h
        bin_w = roi_w.astype(dtype) / pooled_size_w

        # Epsilon guards floor/ceil against floating point precision loss.
        epsilon = tvm.tir.const(0.00001, dtype)

        def _clip(offset, origin, limit):
            # Shift a bin-relative offset by the ROI origin, clamp to [0, limit].
            return tvm.te.min(tvm.te.max(offset + origin, 0), limit)

        hstart = _clip(te.floor(ph * bin_h + epsilon).astype("int32"),
                       roi_start_h, height)
        wstart = _clip(te.floor(pw * bin_w + epsilon).astype("int32"),
                       roi_start_w, width)
        hend = _clip(te.ceil((ph + 1) * bin_h - epsilon).astype("int32"),
                     roi_start_h, height)
        wend = _clip(te.ceil((pw + 1) * bin_w - epsilon).astype("int32"),
                     roi_start_w, width)

        non_empty = tvm.tir.all(hstart < hend, wstart < wend)

        def _identity(elem_dtype):
            # Identity of the reduction: the type's minimum for a non-empty
            # bin, 0 for an empty one.
            return tvm.tir.if_then_else(non_empty,
                                        tvm.te.min_value(elem_dtype),
                                        tvm.tir.const(0.0, elem_dtype))

        def _combine(x, y):
            # A plain Python wrapper with parameters named x and y is kept
            # here, mirroring the two-argument combiner handed to
            # te.comm_reducer.
            return tvm.te.max(x, y)

        _max = te.comm_reducer(_combine, _identity, name="max")

        rh = te.reduce_axis((0, hend - hstart), "rh")
        rw = te.reduce_axis((0, wend - wstart), "rw")
        return _max(data[batch_index, c, hstart + rh, wstart + rw],
                    axis=[rh, rw])
예제 #6
0
파일: math.py 프로젝트: hsaputra/tvm
def ceil(x):
    """Compute the element-wise ceiling of a tensor.

    Parameters
    ----------
    x : tvm.te.Tensor
        The tensor to round up.

    Returns
    -------
    y : tvm.te.Tensor
        A tensor with the shape of ``x`` holding ``te.ceil`` of each element.
    """

    def _ceil_at(*indices):
        # Read one element of x and round it up.
        return te.ceil(x(*indices))

    return te.compute(x.shape, _ceil_at)
예제 #7
0
def test_const_fold4():
    """Check that arithmetic on constants folds to immediate nodes."""
    tdiv = tvm.tir.truncdiv

    # int32 constant plus a Python int stays an integer immediate.
    base = tvm.tir.const(4, "int32")
    summed = base + 5
    assert isinstance(summed, tvm.tir.IntImm)
    assert summed.value == 9

    # Truncating division of two integer immediates.
    quotient = tdiv(summed, 3)
    assert isinstance(quotient, tvm.tir.IntImm)
    assert quotient.value == 3

    # Adding a Python float gives a float immediate.
    shifted = quotient + 0.55
    assert isinstance(shifted, tvm.tir.FloatImm)
    assert abs(shifted.value - 3.55) < 1e-6

    # ceil of a float immediate is folded as well.
    ceiled = te.ceil(shifted)
    assert isinstance(ceiled, tvm.tir.FloatImm)
    assert ceiled.value == 4

    # Casting back to int folds to an integer immediate.
    as_int = ceiled.astype("int")
    assert isinstance(as_int, tvm.tir.IntImm), "x6={}".format(as_int)
    assert as_int.value == 4, "x6={}".format(as_int)

    # A compound expression mixing round, division and a cast.
    y = (te.round((tvm.tir.const(6.5, "float32") - 1) / 1.5) + 2).astype("int")
    assert isinstance(y, tvm.tir.IntImm)
    assert y.value == 6
예제 #8
0
def _sample_common(
    i,
    c,
    ph,
    pw,
    rois,
    pooled_size_h,
    pooled_size_w,
    spatial_scale,
    sample_ratio,
    dtype,
    avg_mode,
    bilinear_func,
):
    """Reduce the bilinear samples of one pooled bin of one ROI.

    ``i`` selects the row of ``rois``, ``c`` the channel and ``(ph, pw)`` the
    output bin.  A ROI row is read as
    ``[batch_index, w_start, h_start, w_end, h_end]``.  ``bilinear_func`` is
    called as ``bilinear_func(batch_index, c, y, x)`` for every sample
    position.  With ``avg_mode`` truthy the samples are averaged; otherwise
    their maximum is taken.
    """
    roi = rois[i]
    batch_index = roi[0].astype("int32")

    # Scale the ROI corners by spatial_scale.
    w_lo = roi[1] * spatial_scale
    h_lo = roi[2] * spatial_scale
    w_hi = roi[3] * spatial_scale
    h_hi = roi[4] * spatial_scale

    # Malformed ROIs are clamped to an extent of at least 1 per axis.
    roi_h = tvm.te.max(h_hi - h_lo, tvm.tir.const(1.0, dtype))
    roi_w = tvm.te.max(w_hi - w_lo, tvm.tir.const(1.0, dtype))

    bin_h = roi_h / pooled_size_h
    bin_w = roi_w / pooled_size_w

    # Samples per bin along each axis: fixed when sample_ratio is positive,
    # otherwise the ceiling of the bin extent.
    if sample_ratio > 0:
        grid_h = grid_w = tvm.tir.const(sample_ratio, "int32")
    else:
        grid_h = te.ceil(roi_h / pooled_size_h).astype("int32")
        grid_w = te.ceil(roi_w / pooled_size_w).astype("int32")

    count = grid_h * grid_w
    rh = te.reduce_axis((0, grid_h))
    rw = te.reduce_axis((0, grid_w))

    # Top-left corner of the requested bin.
    bin_top = h_lo + ph * bin_h
    bin_left = w_lo + pw * bin_w

    # Each sample sits at the centre of its grid cell inside the bin.
    sampled = bilinear_func(
        batch_index,
        c,
        bin_top + (rh + 0.5) * bin_h / grid_h,
        bin_left + (rw + 0.5) * bin_w / grid_w,
    )

    if avg_mode:
        return te.sum(sampled / count, axis=[rh, rw])
    # max mode
    return te.max(sampled, axis=[rh, rw])
def test_basic_operation():
    """Gradient-check a series of element-wise and reduction computes.

    Every case builds a compute named ``B`` over the placeholders ``A0`` and
    ``A1`` and hands it to ``check_grad``.  The cases run in a fixed order
    after seeding NumPy's random generator.
    """
    np.random.seed(0)
    shape = (10, 10)
    k = te.reduce_axis((0, 10), name="k")
    A0 = te.placeholder(shape, name='A0')
    A1 = te.placeholder(shape, name='A1')
    zeros = np.zeros(shape)

    def _check(fcompute, inputs, out_shape=shape, **check_kwargs):
        # Build the compute under test and gradient-check it.
        B = te.compute(out_shape, fcompute, name='B')
        check_grad(B, inputs, **check_kwargs)

    # Plain reads and sums of the inputs.
    _check(lambda i, j: A0[i, j], [A0])
    _check(lambda i, j: A0[i, j] + A1[i, j], [A0, A1])
    _check(lambda i, j: A0[i, j] + A0[j, i], A0)

    # Rounding operations are checked against an all-zero gradient.
    _check(lambda i, j: te.floor(A0[i, j]), A0, desired_grads=[zeros])
    _check(lambda i, j: te.ceil(A0[i, j]), A0, desired_grads=[zeros])
    _check(lambda i, j: te.trunc(A0[i, j]), A0, desired_grads=[zeros])
    _check(lambda i, j: te.round(A0[i, j]), A0, desired_grads=[zeros])

    # Element-wise math intrinsics.
    _check(lambda i, j: A0[i, j] + te.exp(A0[j, i]), A0)
    _check(lambda i, j: te.log(0.1 + te.abs(A0[i, j] + te.exp(A0[j, i]))), A0)
    _check(lambda i, j: te.sigmoid(A0[i, j] * A0[i, j] * A0[j, i]), A0)
    _check(lambda i, j: te.tanh(A0[i, j] * A0[i, j] * A0[j, i]), A0)
    _check(lambda i, j: te.sqrt(A0[i, j] * A0[i, j] * A0[j, i]), A0,
           data_range=(0.1, 10))
    _check(lambda i, j: te.power(te.abs(A0[i, j]), A0[j, i]), A0,
           data_range=(-4, 4))
    _check(lambda i, j: A0[i, j] * A0[j, i], A0)

    # Reductions over the shared axis k.
    _check(lambda i: te.sum(A0[i, k] * A0[k, i], axis=k), A0, out_shape=(10, ))
    _check(lambda i, j: te.sum(A0[i, k] * A0[k, i] + 5, axis=k), A0)
    _check(lambda i, j: te.max(A0[i, k] * A0[k, j] + 5, axis=k), A0)

    _check(lambda i, j: A0[i, j] * (A1[j, i] + A0[j, i]), [A0, A1])

    _check(lambda i, j: te.sum(
        A0[k, k] - A0[te.min(j + k, 9), j] * A0[i, k], axis=k), A0)

    # A user-defined product reducer.
    def fcombine(x, y):
        return x * y

    def fidentity(t0):
        return tvm.tir.const(1, t0)

    prod = te.comm_reducer(fcombine, fidentity, name='prod')
    _check(lambda i, j: prod(A0[i, k] + A0[k, i], axis=k), A0,
           out_shape=(10, 10))

    # Gradient through a topi operator built on two intermediate computes.
    X = te.placeholder((10, ), name='X')
    A = te.compute((10, ), lambda i: X[i] + X[9 - i])
    B = te.compute((10, ), lambda i: X[i] * X[9 - i])
    Y = topi.tensordot(A, B, 1)
    check_grad(Y, X)
예제 #10
0
def resize_bilinear(indices,
                    data,
                    image_height,
                    image_width,
                    target_height,
                    target_width,
                    boxes=None,
                    box_indices=None,
                    extrapolation_value=None,
                    layout='NCHW',
                    coordinate_transformation_mode="align_corners",
                    out_dtype=None):
    """Compute one bilinearly interpolated output element of a resize.

    Background on the method:
    https://en.wikipedia.org/wiki/Bilinear_interpolation.

    Parameters
    ----------
    indices : tuple
        Index of the output element being computed.

    data : tvm.te.Tensor
        4-D input with shape [batch, channel, in_height, in_width]
        or [batch, in_height, in_width, channel].

    image_height : integer
        Height of the input image. When ``boxes`` is None it must support
        ``.astype`` (the scale is computed through it).

    image_width : integer
        Width of the input image. Same requirement as ``image_height``.

    target_height : integer
        Height of the resized image.

    target_width : integer
        Width of the resized image.

    boxes : tvm.te.Tensor, optional
        2-D tensor of shape [num_boxes, 4]; each row gives the coordinates
        of a box, read here as (y1, x1, y2, x2). When given,
        ``coordinate_transformation_mode`` is not consulted.

    box_indices : tvm.te.Tensor, optional
        1-D tensor of shape [num_boxes]; box_indices[i] is the data entry
        the i-th box refers to.

    extrapolation_value: float, optional
        Value returned when the source coordinate falls outside the image.

    layout: string, optional
        "NCHW", "NHWC", or "NCHWc".

    coordinate_transformation_mode: string, optional
        How an output coordinate is mapped back to the input; see the ONNX
        Resize operator specification. One of "half_pixel",
        "align_corners" and "asymmetric".

    out_dtype: string, optional
        Result type. Defaults to the dtype of ``data``.

    Returns
    -------
    output : out_dtype
        The interpolated value, cast to ``out_dtype``.

    Raises
    ------
    ValueError
        If ``boxes`` is None and ``coordinate_transformation_mode`` is not
        one of the supported modes.
    """

    def _blend(first, second, weight):
        # Linear interpolation between two values.
        return first * (1.0 - weight) + second * weight

    n, c, y, x, cc, inum, ic = get_2d_indices(indices, layout=layout)
    box_idx = n if box_indices is None else box_indices(n)

    def _pixel_at(row, col):
        # Fetch a single source pixel for the current output element.
        return get_2d_pixel(data, layout, boxes, image_height, image_width,
                            box_idx, c, row, col, cc, inum, ic)

    # Map the output position to a fractional source position.
    if boxes is not None:
        y1, x1 = boxes(n, 0), boxes(n, 1)
        y2, x2 = boxes(n, 2), boxes(n, 3)

        in_h = (image_height - 1) * (y2 - y1)
        in_w = (image_width - 1) * (x2 - x1)
        h_scale = in_h.astype('float') / (target_height - 1)
        w_scale = in_w.astype('float') / (target_width - 1)

        in_y = y1 * (image_height - 1) + h_scale * y
        in_x = x1 * (image_width - 1) + w_scale * x
    elif coordinate_transformation_mode == "align_corners":
        h_scale = (image_height - 1).astype('float') / (target_height - 1)
        w_scale = (image_width - 1).astype('float') / (target_width - 1)
        in_y = h_scale * y
        in_x = w_scale * x
    elif coordinate_transformation_mode == "asymmetric":
        h_scale = image_height.astype('float') / target_height
        w_scale = image_width.astype('float') / target_width
        in_y = h_scale * y
        in_x = w_scale * x
    elif coordinate_transformation_mode == "half_pixel":
        h_scale = image_height.astype('float') / target_height
        w_scale = image_width.astype('float') / target_width
        in_y = h_scale * (y + 0.5) - 0.5
        in_x = w_scale * (x + 0.5) - 0.5
    else:
        raise ValueError(
            "Unsupported coordinate_transformation_mode: {}".format(
                coordinate_transformation_mode))

    # Integer neighbours of the source position and the blend weights.
    top_y_index = te.floor(in_y).astype('int32')
    bottom_y_index = te.ceil(in_y).astype('int32')
    y_lerp = in_y - top_y_index

    left_x_index = te.floor(in_x).astype('int32')
    right_x_index = te.ceil(in_x).astype('int32')
    x_lerp = in_x - left_x_index

    top_left = _pixel_at(top_y_index, left_x_index)
    top_right = _pixel_at(top_y_index, right_x_index)
    bottom_left = _pixel_at(bottom_y_index, left_x_index)
    bottom_right = _pixel_at(bottom_y_index, right_x_index)

    # Blend horizontally first, then vertically.
    top = _blend(top_left, top_right, x_lerp)
    bottom = _blend(bottom_left, bottom_right, x_lerp)
    value = _blend(top, bottom, y_lerp)

    # Substitute extrapolation_value when in_y/in_x is out of boundary.
    if extrapolation_value is not None:
        below_bottom = tvm.tir.if_then_else(
            in_y > image_height - 1, extrapolation_value, value)
        out = tvm.tir.if_then_else(in_y < 0, extrapolation_value,
                                   below_bottom)
        past_right = tvm.tir.if_then_else(
            in_x > image_width - 1, extrapolation_value, out)
        value = tvm.tir.if_then_else(in_x < 0, extrapolation_value,
                                     past_right)

    # Fall back to the input dtype when no output dtype was requested.
    return value.astype(out_dtype if out_dtype else data.dtype)
예제 #11
0
def resize_nearest_neighbor(
    indices,
    data,
    image_height,
    image_width,
    target_height,
    target_width,
    boxes=None,
    box_indices=None,
    extrapolation_value=None,
    layout="NCHW",
    coordinate_transformation_mode="align_corners",
    rounding_method="",
    out_dtype=None,
):
    """Compute one nearest-neighbor output element of a resize.

    Background on the method:
    https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation.

    Parameters
    ----------
    indices : tuple
        Index of the output element being computed.

    data : tvm.te.Tensor
        4-D input with shape [batch, channel, in_height, in_width]
        or [batch, in_height, in_width, channel].

    image_height : integer
        Height of the input image.

    image_width : integer
        Width of the input image.

    target_height : integer
        Height of the resized image.

    target_width : integer
        Width of the resized image.

    boxes : tvm.te.Tensor, optional
        2-D tensor of shape [num_boxes, 4]; each row gives the coordinates
        of a box, read here as (y1, x1, y2, x2). When given, plain rounding
        is always used.

    box_indices : tvm.te.Tensor, optional
        1-D tensor of shape [num_boxes]; box_indices[i] is the data entry
        the i-th box refers to.

    extrapolation_value: float, optional
        Value returned when the source coordinate falls outside the image.

    layout: string, optional
        "NCHW", "NHWC", or "NCHWc".

    coordinate_transformation_mode: string, optional
        How an output coordinate is mapped back to the input; see the ONNX
        Resize operator specification. One of "half_pixel",
        "align_corners" and "asymmetric".

    rounding_method: string, optional
        How the "nearest" pixel is chosen: one of "round",
        "round_prefer_floor", "round_prefer_ceil", "floor" or "ceil".
        The empty string selects "round" for "align_corners" and "floor"
        for every other coordinate transformation mode.

    out_dtype: string, optional
        Result type. Defaults to the dtype of ``data``.

    Returns
    -------
    output : out_dtype
        The selected value, cast to ``out_dtype``.

    Raises
    ------
    ValueError
        If ``boxes`` is None and ``rounding_method`` is not supported.
    """
    if rounding_method == "":
        if coordinate_transformation_mode == "align_corners":
            rounding_method = "round"
        else:
            rounding_method = "floor"

    n, c, y, x, cc, inum, ic = get_2d_indices(indices, layout)
    box_idx = n if box_indices is None else box_indices(n)

    # Map the output position to a fractional source position.
    if boxes is None:
        in_y, in_x = get_iny_inx(
            y,
            x,
            image_height,
            image_width,
            target_height,
            target_width,
            coordinate_transformation_mode,
        )
    else:
        y1, x1 = boxes(n, 0), boxes(n, 1)
        y2, x2 = boxes(n, 2), boxes(n, 3)

        in_h = (image_height - 1) * (y2 - y1)
        in_w = (image_width - 1) * (x2 - x1)
        h_scale = in_h.astype("float") / (target_height - 1)
        w_scale = in_w.astype("float") / (target_width - 1)

        in_y = y1 * (image_height - 1) + h_scale * y
        in_x = x1 * (image_width - 1) + w_scale * x

    # Small bias applied before floor/ceil to prevent gpu rounding errors.
    epsilon = 1e-5
    source = (in_y, in_x)

    # Snap both source coordinates to an integer pixel index.
    if rounding_method == "round" or boxes is not None:
        closest_y_index, closest_x_index = [
            te.round(coord).astype("int32") for coord in source
        ]
    elif rounding_method == "round_prefer_floor":
        closest_y_index, closest_x_index = [
            te.ceil(coord - 0.5).astype("int32") for coord in source
        ]
    elif rounding_method == "round_prefer_ceil":
        closest_y_index, closest_x_index = [
            te.floor(coord + 0.5).astype("int32") for coord in source
        ]
    elif rounding_method == "floor":
        closest_y_index, closest_x_index = [
            te.floor(coord + epsilon).astype("int32") for coord in source
        ]
    elif rounding_method == "ceil":
        closest_y_index, closest_x_index = [
            te.ceil(coord - epsilon).astype("int32") for coord in source
        ]
    else:
        raise ValueError("Uknown rounding method: {}".format(rounding_method))

    value = get_2d_pixel(
        data,
        layout,
        boxes,
        image_height,
        image_width,
        box_idx,
        c,
        closest_y_index,
        closest_x_index,
        cc,
        inum,
        ic,
    )

    # Substitute extrapolation_value when in_y/in_x is out of boundary.
    if extrapolation_value is not None:
        below_bottom = tvm.tir.if_then_else(
            in_y > image_height - 1, extrapolation_value, value
        )
        out = tvm.tir.if_then_else(in_y < 0, extrapolation_value, below_bottom)
        past_right = tvm.tir.if_then_else(
            in_x > image_width - 1, extrapolation_value, out
        )
        value = tvm.tir.if_then_else(in_x < 0, extrapolation_value, past_right)

    # Fall back to the input dtype when no output dtype was requested.
    return value.astype(out_dtype if out_dtype else data.dtype)