示例#1
0
def pad_groundtruths_to_fixed_size(groundtruths: Dict[str, tf.Tensor],
                                   size: int) -> Dict[str, tf.Tensor]:
    """Brings groundtruth annotations to a fixed size along the first dimension.

    Every tensor is passed through
    `preprocess_ops.clip_or_pad_to_fixed_size`. The input dictionary is
    updated in place and is also the return value.

    Args:
      groundtruths: A dictionary of {`str`: `tf.Tensor`} holding the
        groundtruth annotations `boxes`, `is_crowds`, `areas` and `classes`.
        An optional `attributes` entry maps attribute names to tensors that
        are processed the same way.
      size: An `int` giving the expected size of the first dimension of the
        resulting tensors.

    Returns:
      The dictionary passed in, with the same keys and fixed-size tensors as
      values.
    """
    # (key, padding value) pairs; `is_crowds` is the only field padded with 0.
    fill_values = (
        ('boxes', -1),
        ('is_crowds', 0),
        ('areas', -1),
        ('classes', -1),
    )
    for key, fill_value in fill_values:
        groundtruths[key] = preprocess_ops.clip_or_pad_to_fixed_size(
            groundtruths[key], size, fill_value)

    if 'attributes' in groundtruths:
        attributes = groundtruths['attributes']
        # Only existing keys are reassigned, so iterating `items()` is safe.
        for name, tensor in attributes.items():
            attributes[name] = preprocess_ops.clip_or_pad_to_fixed_size(
                tensor, size, -1)
    return groundtruths
  def test_batch_generate_targets(self):
    """Checks target shapes when assignment is mapped over a batch of two."""
    input_size = [512, 512]
    output_size = [128, 128]
    max_num_instances = 128
    batch_size = 2

    example_boxes = tf.constant([
        (10, 300, 15, 370),  # center (y, x) = (12, 335)
        (100, 300, 150, 370),  # center (y, x) = (125, 335)
        (15, 100, 200, 170),  # center (y, x) = (107, 135)
    ], dtype=tf.float32)
    example_classes = tf.constant((1, 1, 1), dtype=tf.float32)

    # Pads a single example to the fixed instance count before batching.
    example_boxes = preprocess_ops.clip_or_pad_to_fixed_size(
        example_boxes, max_num_instances, 0)
    example_classes = preprocess_ops.clip_or_pad_to_fixed_size(
        example_classes, max_num_instances, 0)

    # The batch is the same example repeated.
    batched_boxes = tf.stack([example_boxes] * batch_size, axis=0)
    batched_classes = tf.stack([example_classes] * batch_size, axis=0)

    def assign_targets(example):
      return target_assigner.assign_centernet_targets(
          labels=example,
          output_size=output_size,
          input_size=input_size)

    labels = tf.map_fn(
        fn=assign_targets,
        elems={
            'boxes': batched_boxes,
            'classes': batched_classes,
            'groundtruths': {
                'num_detections': tf.constant([3, 3]),
            }
        },
        dtype={
            'ct_heatmaps': tf.float32,
            'ct_offset': tf.float32,
            'size': tf.float32,
            'box_mask': tf.int32,
            'box_indices': tf.int32
        }
    )

    # Expected shape of every output; the leading dimension is the batch.
    expected_shapes = {
        'ct_heatmaps': (batch_size, output_size[0], output_size[1], 90),
        'ct_offset': (batch_size, max_num_instances, 2),
        'size': (batch_size, max_num_instances, 2),
        'box_mask': (batch_size, max_num_instances),
        'box_indices': (batch_size, max_num_instances, 2),
    }
    for key, expected_shape in expected_shapes.items():
      self.assertEqual(labels[key].shape, expected_shape)
示例#3
0
    def _parse_eval_data(self, data):
        """Parses one decoded example into an image and labels for evaluation.

        Args:
          data: A dictionary providing `image`, `groundtruth_classes`,
            `groundtruth_boxes`, `groundtruth_is_crowd` and `source_id`.

        Returns:
          A tuple `(image, labels)`. `labels` holds `classes`, `boxes`, `id`,
          `image_info`, `is_crowd` and `gt_boxes`; all but `id` and
          `image_info` are clipped or padded to `self._max_num_boxes` entries.
        """
        raw_classes = data['groundtruth_classes']
        raw_boxes = data['groundtruth_boxes']
        raw_is_crowd = data['groundtruth_is_crowd']

        # Normalizes the decoded image with mean and std pixel values.
        image = preprocess_ops.normalize_image(data['image'])

        # Evaluation always uses the last configured resize scale.
        short_side = tf.constant([self._resize_scales[-1]], tf.float32)[0]

        # Boxes in pixel coordinates of the not-yet-resized image; they are
        # also exported unchanged under `gt_boxes`.
        gt_boxes = box_ops.denormalize_boxes(raw_boxes, tf.shape(image)[:2])

        image, image_info = preprocess_ops.resize_image(
            image, short_side, max(self._output_size))
        resized_boxes = preprocess_ops.resize_and_crop_boxes(
            gt_boxes, image_info[2, :], image_info[1, :], image_info[3, :])
        resized_boxes = box_ops.normalize_boxes(resized_boxes,
                                                image_info[1, :])

        # Drops ground truth boxes that are all zeros, together with their
        # class and crowd annotations.
        keep = box_ops.get_non_empty_box_indices(resized_boxes)
        kept_boxes = tf.gather(resized_boxes, keep)
        kept_classes = tf.gather(raw_classes, keep)
        kept_is_crowd = tf.gather(raw_is_crowd, keep)
        kept_boxes = box_ops.yxyx_to_cycxhw(kept_boxes)

        image = tf.image.pad_to_bounding_box(image, 0, 0,
                                             self._output_size[0],
                                             self._output_size[1])

        def to_fixed_size(tensor):
            return preprocess_ops.clip_or_pad_to_fixed_size(
                tensor, self._max_num_boxes)

        labels = {
            'classes': to_fixed_size(kept_classes),
            'boxes': to_fixed_size(kept_boxes),
            'id': int(data['source_id']),
            'image_info': image_info,
            'is_crowd': to_fixed_size(kept_is_crowd),
            'gt_boxes': to_fixed_size(gt_boxes),
        }

        return image, labels
示例#4
0
    def _build_label(self, boxes, classes, image_info, unpad_image_shape,
                     data):
        """Assembles the label dictionary, including groundtruths for evaluation.

        Args:
          boxes: Box tensor exported under `labels['boxes']`.
          classes: Class tensor exported under `labels['classes']`.
          image_info: Passed through unchanged as `labels['image_info']`.
          unpad_image_shape: Passed through unchanged as
            `labels['unpad_image_shapes']`.
          data: Dictionary providing `source_id`, `height`, `width`, `image`,
            `groundtruth_classes`, `groundtruth_boxes`, `groundtruth_area`
            and `groundtruth_is_crowd`.

        Returns:
          A dictionary with `boxes`, `classes`, `image_info`,
          `unpad_image_shapes` and `groundtruths`.
        """
        max_instances = self._max_num_instances

        def to_fixed_size(tensor):
            return preprocess_ops.clip_or_pad_to_fixed_size(
                tensor, max_instances, -1)

        # Groundtruth data for evaluation; its boxes are in pixel coordinates
        # of the decoded image.
        eval_groundtruths = {
            'source_id': utils.process_source_id(data['source_id']),
            'height': data['height'],
            'width': data['width'],
            'num_detections': tf.shape(data['groundtruth_classes'])[0],
            'boxes': box_ops.denormalize_boxes(
                data['groundtruth_boxes'],
                tf.shape(input=data['image'])[0:2]),
            'classes': data['groundtruth_classes'],
            'areas': data['groundtruth_area'],
            'is_crowds': tf.cast(data['groundtruth_is_crowd'], tf.int32),
        }
        eval_groundtruths = utils.pad_groundtruths_to_fixed_size(
            eval_groundtruths, max_instances)

        return {
            'boxes': to_fixed_size(boxes),
            'classes': to_fixed_size(classes),
            'image_info': image_info,
            'unpad_image_shapes': unpad_image_shape,
            'groundtruths': eval_groundtruths,
        }
    def test_pad_to_fixed_size(self, input_shape, output_size):
        """Checks clipping/padding of an all-ones tensor to `output_size` rows."""
        num_rows = input_shape[0]
        trailing_dims = input_shape[1:]

        # Rows that survive clipping stay ones; rows added by padding are zeros.
        kept_shape = [min(output_size, num_rows)] + trailing_dims
        pad_shape = [max(output_size - num_rows, 0)] + trailing_dims
        expected = np.concatenate(
            [np.ones(kept_shape), np.zeros(pad_shape)], axis=0)

        result = preprocess_ops.clip_or_pad_to_fixed_size(
            tf.ones(input_shape), output_size, constant_values=0).numpy()

        self.assertAllClose(output_size, result.shape[0])
        self.assertAllClose(expected, result)
示例#6
0
    def _parse_train_data(self, data):
        """Parses data for training.

        The image is normalized and randomly flipped horizontally. With
        probability 0.5 it is additionally resized with a `short_side` drawn
        from 400/500/600 and randomly cropped. Afterwards it is resized with a
        `short_side` drawn uniformly from `self._resize_scales` and padded to
        `self._output_size`. Boxes are transformed alongside the image.

        Args:
          data: A dictionary providing `image`, `groundtruth_classes`,
            `groundtruth_boxes` and `groundtruth_is_crowd`.

        Returns:
          A tuple `(image, labels)` where `labels` holds `classes` and `boxes`,
          both clipped or padded to `self._max_num_boxes` entries. Boxes are
          the output of `box_ops.yxyx_to_cycxhw` on normalized coordinates.
        """
        classes = data['groundtruth_classes'] + self._class_offset
        boxes = data['groundtruth_boxes']
        is_crowd = data['groundtruth_is_crowd']

        # Gets original image.
        image = data['image']

        # Normalizes image with mean and std pixel values.
        image = preprocess_ops.normalize_image(image)
        image, boxes, _ = preprocess_ops.random_horizontal_flip(image, boxes)

        # Tensor-valued condition: the resize-and-crop branch runs for about
        # half of the examples.
        do_crop = tf.greater(tf.random.uniform([]), 0.5)
        if do_crop:
            # Rescale
            boxes = box_ops.denormalize_boxes(boxes, tf.shape(image)[:2])
            index = tf.random.categorical(tf.zeros([1, 3]), 1)[0]
            scales = tf.gather([400.0, 500.0, 600.0], index, axis=0)
            short_side = scales[0]
            image, image_info = preprocess_ops.resize_image(image, short_side)
            boxes = preprocess_ops.resize_and_crop_boxes(
                boxes, image_info[2, :], image_info[1, :], image_info[3, :])
            boxes = box_ops.normalize_boxes(boxes, image_info[1, :])

            # Do cropping: crop height/width are sampled from
            # [384, min(side, 600)) and the top-left corner so that the crop
            # stays inside the image (integer `maxval` is exclusive).
            shape = tf.cast(image_info[1], dtype=tf.int32)
            h = tf.random.uniform([],
                                  384,
                                  tf.math.minimum(shape[0], 600),
                                  dtype=tf.int32)
            w = tf.random.uniform([],
                                  384,
                                  tf.math.minimum(shape[1], 600),
                                  dtype=tf.int32)
            i = tf.random.uniform([], 0, shape[0] - h + 1, dtype=tf.int32)
            j = tf.random.uniform([], 0, shape[1] - w + 1, dtype=tf.int32)
            image = tf.image.crop_to_bounding_box(image, i, j, h, w)
            # Re-expresses the normalized boxes relative to the crop window
            # and clips them to [0, 1].
            boxes = tf.clip_by_value(
                (boxes[..., :] *
                 tf.cast(tf.stack([shape[0], shape[1], shape[0], shape[1]]),
                         dtype=tf.float32) -
                 tf.cast(tf.stack([i, j, i, j]), dtype=tf.float32)) /
                tf.cast(tf.stack([h, w, h, w]), dtype=tf.float32), 0.0, 1.0)

        # Draws one of the configured scales uniformly. The number of
        # categories follows the configuration instead of a fixed 11, so
        # scale lists of any length are sampled completely and safely.
        scales = tf.constant(self._resize_scales, dtype=tf.float32)
        index = tf.random.categorical(
            tf.zeros([1, len(self._resize_scales)]), 1)[0]
        scales = tf.gather(scales, index, axis=0)

        image_shape = tf.shape(image)[:2]
        boxes = box_ops.denormalize_boxes(boxes, image_shape)
        short_side = scales[0]
        image, image_info = preprocess_ops.resize_image(
            image, short_side, max(self._output_size))
        boxes = preprocess_ops.resize_and_crop_boxes(boxes, image_info[2, :],
                                                     image_info[1, :],
                                                     image_info[3, :])
        boxes = box_ops.normalize_boxes(boxes, image_info[1, :])

        # Filters out ground truth boxes that are all zeros.
        indices = box_ops.get_non_empty_box_indices(boxes)
        boxes = tf.gather(boxes, indices)
        classes = tf.gather(classes, indices)
        is_crowd = tf.gather(is_crowd, indices)
        boxes = box_ops.yxyx_to_cycxhw(boxes)

        image = tf.image.pad_to_bounding_box(image, 0, 0, self._output_size[0],
                                             self._output_size[1])
        labels = {
            'classes':
            preprocess_ops.clip_or_pad_to_fixed_size(classes,
                                                     self._max_num_boxes),
            'boxes':
            preprocess_ops.clip_or_pad_to_fixed_size(boxes,
                                                     self._max_num_boxes)
        }

        return image, labels
def _generate_detections_per_image(
        boxes: tf.Tensor,
        scores: tf.Tensor,
        attributes: Optional[Mapping[str, tf.Tensor]] = None,
        pre_nms_top_k: int = 5000,
        pre_nms_score_threshold: float = 0.05,
        nms_iou_threshold: float = 0.5,
        max_num_detections: int = 100,
        soft_nms_sigma: Optional[float] = None):
    """Runs per-class NMS for one image and keeps the top detections overall.

  Args:
    boxes: A `tf.Tensor` with shape `[N, num_classes, 4]` or `[N, 1, 4]`, which
      stacks box predictions on all feature levels. The N is the number of
      total anchors on all levels.
    scores: A `tf.Tensor` with shape `[N, num_classes]`, which stacks class
      probability on all feature levels. The N is the number of total anchors on
      all levels. The num_classes is the number of classes predicted by the
      model. Note that the class_outputs here is the raw score.
    attributes: If not None, a dict of `tf.Tensor`. Each value is in shape
      `[N, num_classes, attribute_size]` or `[N, 1, attribute_size]` of
      attribute predictions on all feature levels. The N is the number of total
      anchors on all levels.
    pre_nms_top_k: An `int` number of top candidate detections per class before
      NMS.
    pre_nms_score_threshold: A `float` representing the threshold for deciding
      when to remove boxes based on score.
    nms_iou_threshold: A `float` representing the threshold for deciding whether
      boxes overlap too much with respect to IOU.
    max_num_detections: A `scalar` representing maximum number of boxes retained
      over all classes.
    soft_nms_sigma: A `float` representing the sigma parameter for Soft NMS.
      When soft_nms_sigma=0.0, we fall back to standard NMS.
      If set to None, `tf.image.non_max_suppression_padded` is called instead.

  Returns:
    nms_boxes: A `float` tf.Tensor of shape `[max_num_detections, 4]`
      representing top detected boxes in `[y1, x1, y2, x2]`.
    nms_scores: A `float` tf.Tensor of shape `[max_num_detections]` representing
      sorted confidence scores for detected boxes. Slots without a valid
      detection carry a score of -1.
    nms_classes: An `int` tf.Tensor of shape `[max_num_detections]` representing
      classes for detected boxes.
    valid_detections: An `int` scalar tf.Tensor counting the entries whose
      score is greater than -1; only the top `valid_detections` boxes are
      valid detections.
    nms_attributes: A dict. Each value is a `float` tf.Tensor of shape
      `[max_num_detections, attribute_size]` representing attribute predictions
      for detected boxes. It is an empty dict if `attributes` is None or empty.
  """
    # When the box (or attribute) tensor has a single class slot, that slot is
    # shared by all classes; `min(last_slot, class_id)` selects it.
    last_box_slot = boxes.get_shape().as_list()[1] - 1
    num_classes = scores.get_shape().as_list()[1]
    use_soft_nms = soft_nms_sigma is not None

    boxes_per_class = []
    scores_per_class = []
    labels_per_class = []
    if attributes:
        attributes_per_class = {name: [] for name in attributes.keys()}
    else:
        attributes_per_class = {}

    for class_id in range(num_classes):
        class_scores = scores[:, class_id]
        # Keeps only the `pre_nms_top_k` best-scoring candidates before NMS.
        top_scores, top_indices = tf.nn.top_k(
            class_scores,
            k=tf.minimum(tf.shape(class_scores)[-1], pre_nms_top_k))
        top_boxes = tf.gather(boxes[:, min(last_box_slot, class_id)],
                              top_indices)

        nms_name = 'nms_detections_' + str(class_id)
        nms_boxes_in = tf.cast(top_boxes, tf.float32)
        nms_scores_in = tf.cast(top_scores, tf.float32)

        if use_soft_nms:
            selected, selected_scores = (
                tf.image.non_max_suppression_with_scores(
                    nms_boxes_in,
                    nms_scores_in,
                    max_num_detections,
                    iou_threshold=nms_iou_threshold,
                    score_threshold=pre_nms_score_threshold,
                    soft_nms_sigma=soft_nms_sigma,
                    name=nms_name))
            # This op returns a variable number of results, so they are
            # brought to a fixed size; missing scores become -1.
            selected_boxes = preprocess_ops.clip_or_pad_to_fixed_size(
                tf.gather(top_boxes, selected), max_num_detections, 0.0)
            selected_scores = preprocess_ops.clip_or_pad_to_fixed_size(
                selected_scores, max_num_detections, -1.0)
        else:
            selected, num_valid = tf.image.non_max_suppression_padded(
                nms_boxes_in,
                nms_scores_in,
                max_num_detections,
                iou_threshold=nms_iou_threshold,
                score_threshold=pre_nms_score_threshold,
                pad_to_max_output_size=True,
                name=nms_name)
            selected_boxes = tf.gather(top_boxes, selected)
            gathered_scores = tf.gather(top_scores, selected)
            # Sets scores of invalid (padded) entries to -1.
            is_valid = tf.less(tf.range(max_num_detections), [num_valid])
            selected_scores = tf.where(is_valid, gathered_scores,
                                       -tf.ones_like(gathered_scores))

        boxes_per_class.append(selected_boxes)
        scores_per_class.append(selected_scores)
        labels_per_class.append(tf.fill([max_num_detections], class_id))

        if attributes:
            for name, values in attributes.items():
                last_attr_slot = values.get_shape().as_list()[1] - 1
                class_values = values[:, min(last_attr_slot, class_id)]
                # Applies the same two selections as for the boxes.
                class_values = tf.gather(class_values, top_indices)
                class_values = tf.gather(class_values, selected)
                class_values = preprocess_ops.clip_or_pad_to_fixed_size(
                    class_values, max_num_detections, 0.0)
                attributes_per_class[name].append(class_values)

    # Concats results from all classes and keeps the best-scoring entries.
    all_boxes = tf.concat(boxes_per_class, axis=0)
    all_scores = tf.concat(scores_per_class, axis=0)
    all_labels = tf.concat(labels_per_class, axis=0)
    final_scores, order = tf.nn.top_k(all_scores,
                                      k=max_num_detections,
                                      sorted=True)
    final_boxes = tf.gather(all_boxes, order)
    final_labels = tf.gather(all_labels, order)
    valid_detections = tf.reduce_sum(
        tf.cast(tf.greater(final_scores, -1), tf.int32))

    if attributes:
        for name in attributes.keys():
            merged = tf.concat(attributes_per_class[name], axis=0)
            attributes_per_class[name] = tf.gather(merged, order)

    return (final_boxes, final_scores, final_labels, valid_detections,
            attributes_per_class)
示例#8
0
文件: coco.py 项目: tensorflow/models
  def preprocess(self, inputs):
    """Preprocess COCO for DETR.

    In training mode the image is randomly flipped, with probability 0.5
    resized (`short_side` drawn from 400/500/600) and randomly cropped, and
    then resized with a `short_side` drawn uniformly from
    `self._params.resize_scales`. In evaluation mode the last entry of
    `self._params.resize_scales` is always used. The image is finally padded
    to `self._params.output_size`.

    Args:
      inputs: A dictionary providing `image`, `objects` (with `bbox`, `label`
        and `is_crowd`) and, for evaluation, `image/id`.

    Returns:
      A tuple `(image, labels)`. `labels` always holds `classes` and `boxes`,
      clipped or padded to `self._params.max_num_boxes` entries. When not
      training it additionally holds `id`, `image_info`, `is_crowd` and
      `gt_boxes`.
    """
    image = inputs['image']
    boxes = inputs['objects']['bbox']
    # Class ids are shifted by one relative to the dataset labels.
    classes = inputs['objects']['label'] + 1
    is_crowd = inputs['objects']['is_crowd']

    image = preprocess_ops.normalize_image(image)
    if self._params.is_training:
      image, boxes, _ = preprocess_ops.random_horizontal_flip(image, boxes)

      # Tensor-valued condition: the resize-and-crop branch runs for about
      # half of the examples.
      do_crop = tf.greater(tf.random.uniform([]), 0.5)
      if do_crop:
        # Rescale
        boxes = box_ops.denormalize_boxes(boxes, tf.shape(image)[:2])
        index = tf.random.categorical(tf.zeros([1, 3]), 1)[0]
        scales = tf.gather([400.0, 500.0, 600.0], index, axis=0)
        short_side = scales[0]
        image, image_info = preprocess_ops.resize_image(image, short_side)
        boxes = preprocess_ops.resize_and_crop_boxes(boxes,
                                                     image_info[2, :],
                                                     image_info[1, :],
                                                     image_info[3, :])
        boxes = box_ops.normalize_boxes(boxes, image_info[1, :])

        # Do cropping: crop height/width are sampled from
        # [384, min(side, 600)) and the top-left corner so that the crop stays
        # inside the image (integer `maxval` is exclusive).
        shape = tf.cast(image_info[1], dtype=tf.int32)
        h = tf.random.uniform(
            [], 384, tf.math.minimum(shape[0], 600), dtype=tf.int32)
        w = tf.random.uniform(
            [], 384, tf.math.minimum(shape[1], 600), dtype=tf.int32)
        i = tf.random.uniform([], 0, shape[0] - h + 1, dtype=tf.int32)
        j = tf.random.uniform([], 0, shape[1] - w + 1, dtype=tf.int32)
        image = tf.image.crop_to_bounding_box(image, i, j, h, w)
        # Re-expresses the normalized boxes relative to the crop window and
        # clips them to [0, 1].
        boxes = tf.clip_by_value(
            (boxes[..., :] * tf.cast(
                tf.stack([shape[0], shape[1], shape[0], shape[1]]),
                dtype=tf.float32) -
             tf.cast(tf.stack([i, j, i, j]), dtype=tf.float32)) /
            tf.cast(tf.stack([h, w, h, w]), dtype=tf.float32), 0.0, 1.0)

      # Draws one of the configured scales uniformly. The number of categories
      # follows the configuration instead of a fixed 11, so scale lists of any
      # length are sampled completely and safely.
      scales = tf.constant(
          self._params.resize_scales,
          dtype=tf.float32)
      index = tf.random.categorical(
          tf.zeros([1, len(self._params.resize_scales)]), 1)[0]
      scales = tf.gather(scales, index, axis=0)
    else:
      scales = tf.constant([self._params.resize_scales[-1]], tf.float32)

    image_shape = tf.shape(image)[:2]
    boxes = box_ops.denormalize_boxes(boxes, image_shape)
    # Pixel-coordinate boxes before the final resize; exported for evaluation.
    gt_boxes = boxes
    short_side = scales[0]
    image, image_info = preprocess_ops.resize_image(
        image,
        short_side,
        max(self._params.output_size))
    boxes = preprocess_ops.resize_and_crop_boxes(boxes,
                                                 image_info[2, :],
                                                 image_info[1, :],
                                                 image_info[3, :])
    boxes = box_ops.normalize_boxes(boxes, image_info[1, :])

    # Filters out ground truth boxes that are all zeros.
    indices = box_ops.get_non_empty_box_indices(boxes)
    boxes = tf.gather(boxes, indices)
    classes = tf.gather(classes, indices)
    is_crowd = tf.gather(is_crowd, indices)
    boxes = box_ops.yxyx_to_cycxhw(boxes)

    image = tf.image.pad_to_bounding_box(
        image, 0, 0, self._params.output_size[0], self._params.output_size[1])
    labels = {
        'classes':
            preprocess_ops.clip_or_pad_to_fixed_size(
                classes, self._params.max_num_boxes),
        'boxes':
            preprocess_ops.clip_or_pad_to_fixed_size(
                boxes, self._params.max_num_boxes)
    }
    if not self._params.is_training:
      labels.update({
          'id':
              inputs['image/id'],
          'image_info':
              image_info,
          'is_crowd':
              preprocess_ops.clip_or_pad_to_fixed_size(
                  is_crowd, self._params.max_num_boxes),
          'gt_boxes':
              preprocess_ops.clip_or_pad_to_fixed_size(
                  gt_boxes, self._params.max_num_boxes),
      })

    return image, labels
  def check_labels_correct(self,
                           boxes,
                           classes,
                           output_size,
                           input_size):
    """Asserts that the targets assigned for `boxes` have the expected values.

    The boxes and classes are padded to 128 instances, passed to
    `target_assigner.assign_centernet_targets`, and the returned heatmaps,
    offsets, sizes, mask and indices are checked for shape and content.

    Args:
      boxes: A sequence of boxes; each is indexed as `[0]..[3]`, with `[0]`/`[2]`
        scaled by the height ratio and `[1]`/`[3]` by the width ratio.
      classes: A sequence with one class value per box.
      output_size: A two-element `[height, width]` of the target maps.
      input_size: A two-element `[height, width]` of the input image.
    """
    max_num_instances = 128
    # Must be taken before padding: afterwards `boxes` has
    # `max_num_instances` rows and its length no longer says how many boxes
    # are real.
    num_detections = len(boxes)
    num_valid = min(num_detections, max_num_instances)
    boxes = tf.constant(boxes, dtype=tf.float32)
    classes = tf.constant(classes, dtype=tf.float32)

    boxes = preprocess_ops.clip_or_pad_to_fixed_size(
        boxes, max_num_instances, 0)
    classes = preprocess_ops.clip_or_pad_to_fixed_size(
        classes, max_num_instances, 0)

    labels = target_assigner.assign_centernet_targets(
        labels={
            'boxes': boxes,
            'classes': classes,
            'groundtruths': {
                'num_detections': num_detections,
            }
        },
        output_size=output_size,
        input_size=input_size)

    ct_heatmaps = labels['ct_heatmaps']
    ct_offset = labels['ct_offset']
    size = labels['size']
    box_mask = labels['box_mask']
    box_indices = labels['box_indices']

    boxes = tf.cast(boxes, tf.float32)
    height_ratio = output_size[0] / input_size[0]
    width_ratio = output_size[1] / input_size[1]

    # Shape checks
    self.assertEqual(ct_heatmaps.shape, (output_size[0], output_size[1], 90))

    self.assertEqual(ct_offset.shape, (max_num_instances, 2))

    self.assertEqual(size.shape, (max_num_instances, 2))
    self.assertEqual(box_mask.shape, (max_num_instances,))
    self.assertEqual(box_indices.shape, (max_num_instances, 2))

    self.assertAllInRange(ct_heatmaps, 0, 1)

    for i in range(num_valid):
      # Check sizes
      self.assertAllEqual(size[i],
                          [(boxes[i][2] - boxes[i][0]) * height_ratio,
                           (boxes[i][3] - boxes[i][1]) * width_ratio,
                           ])

      # Check box indices
      y = tf.math.floor((boxes[i][0] + boxes[i][2]) / 2 * height_ratio)
      x = tf.math.floor((boxes[i][1] + boxes[i][3]) / 2 * width_ratio)
      self.assertAllEqual(box_indices[i], [y, x])

      # check offsets
      true_y = (boxes[i][0] + boxes[i][2]) / 2 * height_ratio
      true_x = (boxes[i][1] + boxes[i][3]) / 2 * width_ratio
      self.assertAllEqual(ct_offset[i], [true_y - y, true_x - x])

    for i in range(num_valid, max_num_instances):
      # Make sure rest are zero
      self.assertAllEqual(size[i], [0, 0])
      self.assertAllEqual(box_indices[i], [0, 0])
      self.assertAllEqual(ct_offset[i], [0, 0])

    # Check mask indices: the first `num_valid` entries are expected to be 1
    # and every padded entry 0.
    # NOTE(review): assumes the assigner masks exactly the first
    # `num_detections` instances - confirm against target_assigner.
    self.assertAllEqual(tf.cast(box_mask[num_valid:], tf.int32),
                        tf.repeat(0, repeats=max_num_instances - num_valid))
    self.assertAllEqual(tf.cast(box_mask[:num_valid], tf.int32),
                        tf.repeat(1, repeats=num_valid))
示例#10
0
  def _parse_train_data(self, data):
    """Parses data for training.

    Args:
      data: the decoded tensor dictionary from TfExampleDecoder.

    Returns:
      image: image tensor that is preprocessed to have normalized value and
        dimension [output_size[0], output_size[1], 3]
      labels: a dictionary of tensors used for training. The following describes
        {key: value} pairs in the dictionary.
        image_info: a 2D `Tensor` that encodes the information of the image and
          the applied preprocessing. It is in the format of
          [[original_height, original_width], [scaled_height, scaled_width],
          image_scale, offset], where rows 2 and 3 are the scale and the
          offset this method uses to transform the boxes.
        anchor_boxes: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensor with
          shape [height_l, width_l, 4] representing anchor boxes at each level.
        rpn_score_targets: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensor with
          shape [height_l, width_l, anchors_per_location]. The height_l and
          width_l represent the dimension of class logits at l-th level.
        rpn_box_targets: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensor with
          shape [height_l, width_l, anchors_per_location * 4]. The height_l and
          width_l represent the dimension of bounding box regression output at
          l-th level.
        gt_boxes: Groundtruth bounding box annotations. The box is represented
           in [y1, x1, y2, x2] format. The coordinates are w.r.t the scaled
           image that is fed to the network. The tensor is padded with -1 to
           the fixed dimension [self._max_num_instances, 4].
        gt_classes: Groundtruth classes annotations. The tensor is padded
          with -1 to the fixed dimension [self._max_num_instances].
        gt_masks: Groundtruth masks cropped by the bounding box and
          resized to a fixed size determined by mask_crop_size. Only present
          when `self._include_mask` is set; padded with -1 to
          `self._max_num_instances` entries.
    """
    classes = data['groundtruth_classes']
    boxes = data['groundtruth_boxes']
    if self._include_mask:
      masks = data['groundtruth_instance_masks']

    is_crowds = data['groundtruth_is_crowd']
    # Skips annotations with `is_crowd` = True.
    if self._skip_crowd_during_training:
      num_groundtruths = tf.shape(classes)[0]
      with tf.control_dependencies([num_groundtruths, is_crowds]):
        # When `is_crowds` is empty, every annotation is kept; otherwise only
        # the indices of non-crowd annotations are kept.
        indices = tf.cond(
            tf.greater(tf.size(is_crowds), 0),
            lambda: tf.where(tf.logical_not(is_crowds))[:, 0],
            lambda: tf.cast(tf.range(num_groundtruths), tf.int64))
      classes = tf.gather(classes, indices)
      boxes = tf.gather(boxes, indices)
      if self._include_mask:
        masks = tf.gather(masks, indices)

    # Gets original image and its size.
    image = data['image']
    if self._augmenter is not None:
      image = self._augmenter.distort(image)

    image_shape = tf.shape(image)[0:2]

    # Normalizes image with mean and std pixel values.
    image = preprocess_ops.normalize_image(image)

    # Flips image randomly during training.
    if self._aug_rand_hflip:
      if self._include_mask:
        image, boxes, masks = preprocess_ops.random_horizontal_flip(
            image, boxes, masks)
      else:
        image, boxes, _ = preprocess_ops.random_horizontal_flip(
            image, boxes)

    # Converts boxes from normalized coordinates to pixel coordinates.
    # Now the coordinates of boxes are w.r.t. the original image.
    boxes = box_ops.denormalize_boxes(boxes, image_shape)

    # Resizes and crops image.
    image, image_info = preprocess_ops.resize_and_crop_image(
        image,
        self._output_size,
        padded_size=preprocess_ops.compute_padded_size(
            self._output_size, 2 ** self._max_level),
        aug_scale_min=self._aug_scale_min,
        aug_scale_max=self._aug_scale_max)
    # Static shape of the processed image; used to generate the anchors.
    image_height, image_width, _ = image.get_shape().as_list()

    # Resizes and crops boxes.
    # Now the coordinates of boxes are w.r.t the scaled image.
    image_scale = image_info[2, :]
    offset = image_info[3, :]
    boxes = preprocess_ops.resize_and_crop_boxes(
        boxes, image_scale, image_info[1, :], offset)

    # Filters out ground truth boxes that are all zeros.
    indices = box_ops.get_non_empty_box_indices(boxes)
    boxes = tf.gather(boxes, indices)
    classes = tf.gather(classes, indices)
    if self._include_mask:
      masks = tf.gather(masks, indices)
      # Transfer boxes to the original image space and do normalization.
      # This undoes the offset and the scaling applied to the boxes above.
      cropped_boxes = boxes + tf.tile(tf.expand_dims(offset, axis=0), [1, 2])
      cropped_boxes /= tf.tile(tf.expand_dims(image_scale, axis=0), [1, 2])
      cropped_boxes = box_ops.normalize_boxes(cropped_boxes, image_shape)
      num_masks = tf.shape(masks)[0]
      # Each mask is cropped by its own box (`box_indices` pairs mask k with
      # box k) and resized to a square of side `self._mask_crop_size`.
      masks = tf.image.crop_and_resize(
          tf.expand_dims(masks, axis=-1),
          cropped_boxes,
          box_indices=tf.range(num_masks, dtype=tf.int32),
          crop_size=[self._mask_crop_size, self._mask_crop_size],
          method='bilinear')
      masks = tf.squeeze(masks, axis=-1)

    # Assigns anchor targets.
    # Note that after the target assignment, box targets are absolute pixel
    # offsets w.r.t. the scaled image.
    input_anchor = anchor.build_anchor_generator(
        min_level=self._min_level,
        max_level=self._max_level,
        num_scales=self._num_scales,
        aspect_ratios=self._aspect_ratios,
        anchor_size=self._anchor_size)
    anchor_boxes = input_anchor(image_size=(image_height, image_width))
    anchor_labeler = anchor.RpnAnchorLabeler(
        self._rpn_match_threshold,
        self._rpn_unmatched_threshold,
        self._rpn_batch_size_per_im,
        self._rpn_fg_fraction)
    rpn_score_targets, rpn_box_targets = anchor_labeler.label_anchors(
        anchor_boxes, boxes,
        tf.cast(tf.expand_dims(classes, axis=-1), dtype=tf.float32))

    # Casts input image to self._dtype
    image = tf.cast(image, dtype=self._dtype)

    # Packs labels for model_fn outputs.
    labels = {
        'anchor_boxes':
            anchor_boxes,
        'image_info':
            image_info,
        'rpn_score_targets':
            rpn_score_targets,
        'rpn_box_targets':
            rpn_box_targets,
        'gt_boxes':
            preprocess_ops.clip_or_pad_to_fixed_size(boxes,
                                                     self._max_num_instances,
                                                     -1),
        'gt_classes':
            preprocess_ops.clip_or_pad_to_fixed_size(classes,
                                                     self._max_num_instances,
                                                     -1),
    }
    if self._include_mask:
      labels['gt_masks'] = preprocess_ops.clip_or_pad_to_fixed_size(
          masks, self._max_num_instances, -1)

    return image, labels