Python compute_padded_size 예제들, utils.input_utils.compute_padded_size Python 예제들

예제 #1

0

파일 보기

파일: serving.py 프로젝트: youngbaby123/tpu

def preprocess_image(image, desired_size, stride):
    image = input_utils.normalize_image(image)
    image, image_info = input_utils.resize_and_crop_image(
        image,
        desired_size,
        padded_size=input_utils.compute_padded_size(desired_size, stride))
    return image, image_info

예제 #2

0

파일 보기

파일: evaluate_model.py 프로젝트: vishalbelsare/tpu

def parse_single_example(serialized_example, params):
  """Parses a singel serialized TFExample string."""
  if 'retinanet_parser' in dir(params):
    parser_params = params.retinanet_parser
    decoder = tf_example_decoder.TfExampleDecoder()
  else:
    parser_params = params.maskrcnn_parser
    decoder = tf_example_decoder.TfExampleDecoder(include_mask=True)
  data = decoder.decode(serialized_example)
  image = data['image']
  source_id = data['source_id']
  source_id = dataloader_utils.process_source_id(source_id)
  height = data['height']
  width = data['width']
  boxes = data['groundtruth_boxes']
  boxes = box_utils.denormalize_boxes(boxes, tf.shape(image)[:2])
  classes = data['groundtruth_classes']
  is_crowds = data['groundtruth_is_crowd']
  areas = data['groundtruth_area']
  masks = data.get('groundtruth_instance_masks_png', None)

  image = input_utils.normalize_image(image)
  image, image_info = input_utils.resize_and_crop_image(
      image,
      parser_params.output_size,
      padded_size=input_utils.compute_padded_size(
          parser_params.output_size,
          2 ** params.architecture.max_level),
      aug_scale_min=1.0,
      aug_scale_max=1.0)

  labels = {
      'image_info': image_info,
  }
  groundtruths = {
      'source_id': source_id,
      'height': height,
      'width': width,
      'num_detections': tf.shape(classes),
      'boxes': boxes,
      'classes': classes,
      'areas': areas,
      'is_crowds': tf.cast(is_crowds, tf.int32),
  }
  if masks is not None:
    groundtruths['masks'] = masks
  return image, labels, groundtruths

예제 #3

0

파일 보기

파일: evaluate_model.py 프로젝트: shimacos37/tpu_experiment

def parse_single_example(serialized_example, params):
    """Parses a singel serialized TFExample string."""
    decoder = tf_example_decoder.TfExampleDecoder()
    data = decoder.decode(serialized_example)
    image = data['image']
    source_id = data['source_id']
    source_id = dataloader_utils.process_source_id(source_id)
    height = data['height']
    width = data['width']
    boxes = data['groundtruth_boxes']
    boxes = box_utils.denormalize_boxes(boxes, tf.shape(image)[:2])
    classes = data['groundtruth_classes']
    is_crowds = data['groundtruth_is_crowd']
    areas = data['groundtruth_area']

    image = input_utils.normalize_image(image)
    image, image_info = input_utils.resize_and_crop_image(
        image,
        params.retinanet_parser.output_size,
        padded_size=input_utils.compute_padded_size(
            params.retinanet_parser.output_size, 2**params.anchor.max_level),
        aug_scale_min=1.0,
        aug_scale_max=1.0)
    anchors = anchor.Anchor(params.anchor.min_level, params.anchor.max_level,
                            params.anchor.num_scales,
                            params.anchor.aspect_ratios,
                            params.anchor.anchor_size,
                            image.get_shape().as_list()[:2])

    labels = {
        'anchor_boxes': anchors.multilevel_boxes,
        'image_info': image_info,
    }
    groundtruths = {
        'source_id': source_id,
        'height': height,
        'width': width,
        'num_detections': tf.shape(classes),
        'boxes': boxes,
        'classes': classes,
        'areas': areas,
        'is_crowds': tf.cast(is_crowds, tf.int32),
    }
    return image, labels, groundtruths

예제 #4

0

파일 보기

 def _preprocess(self, image: np.ndarray) -> tuple[tf.Tensor, tf.Tensor]:
     image = tf.convert_to_tensor(image, tf.uint8)
     image = input_utils.normalize_image(image)
     image, image_info = input_utils.resize_and_crop_image(
         image=image,
         desired_size=self.resize_shape,
         padded_size=input_utils.compute_padded_size(
             desired_size=self.resize_shape,
             stride=2**self.max_level,
         ),
         aug_scale_min=1.0,
         aug_scale_max=1.0,
     )
     # image_info: (4, 2)
     # [
     #     [original_height, original_width],
     #     [desired_height,  desired_width ],
     #     [y_scale,         x_scale       ],
     #     [y_offset,        x_offset      ]
     # ]
     image.set_shape([self.resize_shape[0], self.resize_shape[1], 3])
     return image, image_info

예제 #5

0

파일 보기

파일: retinanet_parser.py 프로젝트: zj19921221/tpu

    def _parse_predict_data(self, data):
        """Parses data for prediction."""
        # Gets original image and its size.
        image = data['image']
        image_shape = tf.shape(image)[0:2]

        # Normalizes image with mean and std pixel values.
        image = input_utils.normalize_image(image)

        # Resizes and crops image.
        image, image_info = input_utils.resize_and_crop_image(
            image,
            self._output_size,
            padded_size=input_utils.compute_padded_size(
                self._output_size, 2**self._max_level),
            aug_scale_min=1.0,
            aug_scale_max=1.0)
        image_height, image_width, _ = image.get_shape().as_list()

        # If bfloat16 is used, casts input image to tf.bfloat16.
        if self._use_bfloat16:
            image = tf.cast(image, dtype=tf.bfloat16)

        # Compute Anchor boxes.
        input_anchor = anchor.Anchor(self._min_level, self._max_level,
                                     self._num_scales, self._aspect_ratios,
                                     self._anchor_size,
                                     (image_height, image_width))

        labels = {
            'anchor_boxes': input_anchor.multilevel_boxes,
            'image_info': image_info,
        }
        # If mode is PREDICT_WITH_GT, returns groundtruths and training targets
        # in labels.
        if self._mode == ModeKeys.PREDICT_WITH_GT:
            # Converts boxes from normalized coordinates to pixel coordinates.
            boxes = box_utils.denormalize_boxes(data['groundtruth_boxes'],
                                                image_shape)
            groundtruths = {
                'source_id': data['source_id'],
                'height': data['height'],
                'width': data['width'],
                'num_detections': tf.shape(data['groundtruth_classes']),
                'boxes': boxes,
                'classes': data['groundtruth_classes'],
                'areas': data['groundtruth_area'],
                'is_crowds': tf.cast(data['groundtruth_is_crowd'], tf.int32),
            }
            groundtruths['source_id'] = dataloader_utils.process_source_id(
                groundtruths['source_id'])
            groundtruths = dataloader_utils.pad_groundtruths_to_fixed_size(
                groundtruths, self._max_num_instances)
            labels['groundtruths'] = groundtruths

            # Computes training objective for evaluation loss.
            classes = data['groundtruth_classes']

            image_scale = image_info[2, :]
            offset = image_info[3, :]
            boxes = input_utils.resize_and_crop_boxes(
                boxes, image_scale, (image_height, image_width), offset)
            # Filters out ground truth boxes that are all zeros.
            indices = box_utils.get_non_empty_box_indices(boxes)
            boxes = tf.gather(boxes, indices)
            classes = tf.gather(classes, indices)

            # Assigns anchors.
            anchor_labeler = anchor.AnchorLabeler(input_anchor,
                                                  self._match_threshold,
                                                  self._unmatched_threshold)
            (cls_targets, box_targets,
             num_positives) = anchor_labeler.label_anchors(
                 boxes, tf.cast(tf.expand_dims(classes, axis=1), tf.float32))
            labels['cls_targets'] = cls_targets
            labels['box_targets'] = box_targets
            labels['num_positives'] = num_positives
        return {
            'images': image,
            'labels': labels,
        }

예제 #6

0

파일 보기

파일: retinanet_parser.py 프로젝트: zj19921221/tpu

    def _parse_train_data(self, data):
        """Parses data for training and evaluation."""
        classes = data['groundtruth_classes']
        boxes = data['groundtruth_boxes']
        is_crowds = data['groundtruth_is_crowd']
        # Skips annotations with `is_crowd` = True.
        if self._skip_crowd_during_training and self._is_training:
            num_groundtrtuhs = tf.shape(classes)[0]
            with tf.control_dependencies([num_groundtrtuhs, is_crowds]):
                indices = tf.cond(
                    tf.greater(tf.size(is_crowds), 0),
                    lambda: tf.where(tf.logical_not(is_crowds))[:, 0],
                    lambda: tf.cast(tf.range(num_groundtrtuhs), tf.int64))
            classes = tf.gather(classes, indices)
            boxes = tf.gather(boxes, indices)

        # Gets original image and its size.
        image = data['image']

        # NOTE: The autoaugment method works best when used alongside the standard
        # horizontal flipping of images along with size jittering and normalization.
        if self._use_autoaugment:
            try:
                from utils import autoaugment_utils  # pylint: disable=g-import-not-at-top
            except ImportError as e:
                logging.exception('Autoaugment is not supported in TF 2.x.')
                raise e

            image, boxes = autoaugment_utils.distort_image_with_autoaugment(
                image, boxes, self._autoaugment_policy_name)

        image_shape = tf.shape(image)[0:2]

        # Normalizes image with mean and std pixel values.
        image = input_utils.normalize_image(image)

        # Flips image randomly during training.
        if self._aug_rand_hflip:
            image, boxes = input_utils.random_horizontal_flip(image, boxes)

        # Converts boxes from normalized coordinates to pixel coordinates.
        # Now the coordinates of boxes are w.r.t. the original image.
        boxes = box_utils.denormalize_boxes(boxes, image_shape)

        # Resizes and crops image.
        image, image_info = input_utils.resize_and_crop_image(
            image,
            self._output_size,
            padded_size=input_utils.compute_padded_size(
                self._output_size, 2**self._max_level),
            aug_scale_min=self._aug_scale_min,
            aug_scale_max=self._aug_scale_max)
        image_height, image_width, _ = image.get_shape().as_list()

        # Resizes and crops boxes.
        # Now the coordinates of boxes are w.r.t the scaled image.
        image_scale = image_info[2, :]
        offset = image_info[3, :]
        boxes = input_utils.resize_and_crop_boxes(boxes, image_scale,
                                                  (image_height, image_width),
                                                  offset)

        # Filters out ground truth boxes that are all zeros.
        indices = box_utils.get_non_empty_box_indices(boxes)
        boxes = tf.gather(boxes, indices)
        classes = tf.gather(classes, indices)

        # Assigns anchor targets.
        # Note that after the target assignment, box targets are absolute pixel
        # offsets w.r.t. the scaled image.
        input_anchor = anchor.Anchor(self._min_level, self._max_level,
                                     self._num_scales, self._aspect_ratios,
                                     self._anchor_size,
                                     (image_height, image_width))
        anchor_labeler = anchor.AnchorLabeler(input_anchor,
                                              self._match_threshold,
                                              self._unmatched_threshold)
        (cls_targets, box_targets,
         num_positives) = anchor_labeler.label_anchors(
             boxes, tf.cast(tf.expand_dims(classes, axis=1), tf.float32))

        # If bfloat16 is used, casts input image to tf.bfloat16.
        if self._use_bfloat16:
            image = tf.cast(image, dtype=tf.bfloat16)

        # Packs labels for model_fn outputs.
        labels = {
            'cls_targets': cls_targets,
            'box_targets': box_targets,
            'anchor_boxes': input_anchor.multilevel_boxes,
            'num_positives': num_positives,
            'image_info': image_info,
        }
        return image, labels

예제 #7

0

파일 보기

파일: maskrcnn_parser.py 프로젝트: ulen2000/training_prepare

    def _parse_predict_data(self, data):
        """Parses data for prediction.

    Args:
      data: the decoded tensor dictionary from TfExampleDecoder.

    Returns:
      A dictionary of {'images': image, 'labels': labels} where
        images: image tensor that is preproessed to have normalized value and
          dimension [output_size[0], output_size[1], 3]
        labels: a dictionary of tensors used for training. The following
          describes {key: value} pairs in the dictionary.
          source_ids: Source image id. Default value -1 if the source id is
            empty in the groundtruth annotation.
          image_info: a 2D `Tensor` that encodes the information of the image
            and the applied preprocessing. It is in the format of
            [[original_height, original_width], [scaled_height, scaled_width],
          anchor_boxes: ordered dictionary with keys
            [min_level, min_level+1, ..., max_level]. The values are tensor with
            shape [height_l, width_l, 4] representing anchor boxes at each
            level.
    """
        # Gets original image and its size.
        image = data['image']
        image_shape = tf.shape(image)[0:2]

        # Normalizes image with mean and std pixel values.
        image = input_utils.normalize_image(image)

        # Resizes and crops image.
        image, image_info = input_utils.resize_and_crop_image(
            image,
            self._output_size,
            padded_size=input_utils.compute_padded_size(
                self._output_size, 2**self._max_level),
            aug_scale_min=1.0,
            aug_scale_max=1.0)
        image_height, image_width, _ = image.get_shape().as_list()

        # If bfloat16 is used, casts input image to tf.bfloat16.
        if self._use_bfloat16:
            image = tf.cast(image, dtype=tf.bfloat16)

        # Compute Anchor boxes.
        input_anchor = anchor.Anchor(self._min_level, self._max_level,
                                     self._num_scales, self._aspect_ratios,
                                     self._anchor_size,
                                     (image_height, image_width))

        labels = {
            'source_id': dataloader_utils.process_source_id(data['source_id']),
            'anchor_boxes': input_anchor.multilevel_boxes,
            'image_info': image_info,
        }

        if self._mode == ModeKeys.PREDICT_WITH_GT:
            # Converts boxes from normalized coordinates to pixel coordinates.
            boxes = box_utils.denormalize_boxes(data['groundtruth_boxes'],
                                                image_shape)
            groundtruths = {
                'source_id': data['source_id'],
                'height': data['height'],
                'width': data['width'],
                'num_detections': tf.shape(data['groundtruth_classes']),
                'boxes': boxes,
                'classes': data['groundtruth_classes'],
                'areas': data['groundtruth_area'],
                'is_crowds': tf.cast(data['groundtruth_is_crowd'], tf.int32),
            }
            groundtruths['source_id'] = dataloader_utils.process_source_id(
                groundtruths['source_id'])
            groundtruths = dataloader_utils.pad_groundtruths_to_fixed_size(
                groundtruths, self._max_num_instances)
            labels['groundtruths'] = groundtruths

        return {
            'images': image,
            'labels': labels,
        }

예제 #8

0

파일 보기

파일: maskrcnn_parser.py 프로젝트: ulen2000/training_prepare

    def _parse_eval_data(self, data):
        """Parses data for evaluation.

    Args:
      data: the decoded tensor dictionary from TfExampleDecoder.

    Returns:
      image: image tensor that is preproessed to have normalized value and
        dimension [output_size[0], output_size[1], 3]
      labels: a dictionary of tensors used for training. The following describes
        {key: value} pairs in the dictionary.
        image_info: a 2D `Tensor` that encodes the information of the image and
          the applied preprocessing. It is in the format of
          [[original_height, original_width], [scaled_height, scaled_width],
        anchor_boxes: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensor with
          shape [height_l, width_l, 4] representing anchor boxes at each level.
        groundtruths:
          source_id: Groundtruth source id.
          height: Original image height.
          width: Original image width.
          boxes: Groundtruth bounding box annotations. The box is represented
             in [y1, x1, y2, x2] format. The coordinates are w.r.t the scaled
             image that is fed to the network. The tennsor is padded with -1 to
             the fixed dimension [self._max_num_instances, 4].
          classes: Groundtruth classes annotations. The tennsor is padded
            with -1 to the fixed dimension [self._max_num_instances].
          areas: Box area or mask area depend on whether mask is present.
          is_crowds: Whether the ground truth label is a crowd label.
          num_groundtruths: Number of ground truths in the image.
    """
        # Gets original image and its size.
        image = data['image']
        image_shape = tf.shape(image)[0:2]

        # Normalizes image with mean and std pixel values.
        image = input_utils.normalize_image(image)

        # Resizes and crops image.
        image, image_info = input_utils.resize_and_crop_image(
            image,
            self._output_size,
            padded_size=input_utils.compute_padded_size(
                self._output_size, 2**self._max_level),
            aug_scale_min=1.0,
            aug_scale_max=1.0)
        image_height, image_width, _ = image.get_shape().as_list()

        # Assigns anchor targets.
        input_anchor = anchor.Anchor(self._min_level, self._max_level,
                                     self._num_scales, self._aspect_ratios,
                                     self._anchor_size,
                                     (image_height, image_width))

        # If bfloat16 is used, casts input image to tf.bfloat16.
        if self._use_bfloat16:
            image = tf.cast(image, dtype=tf.bfloat16)

        # Sets up groundtruth data for evaluation.
        groundtruths = {
            'source_id':
            data['source_id'],
            'height':
            data['height'],
            'width':
            data['width'],
            'num_groundtruths':
            tf.shape(data['groundtruth_classes']),
            'boxes':
            box_utils.denormalize_boxes(data['groundtruth_boxes'],
                                        image_shape),
            'classes':
            data['groundtruth_classes'],
            'areas':
            data['groundtruth_area'],
            'is_crowds':
            tf.cast(data['groundtruth_is_crowd'], tf.int32),
        }
        # TODO(b/143766089): Add ground truth masks for segmentation metrics.
        groundtruths['source_id'] = dataloader_utils.process_source_id(
            groundtruths['source_id'])
        groundtruths = dataloader_utils.pad_groundtruths_to_fixed_size(
            groundtruths, self._max_num_instances)

        # Packs labels for model_fn outputs.
        labels = {
            'anchor_boxes': input_anchor.multilevel_boxes,
            'image_info': image_info,
            'groundtruths': groundtruths,
        }

        return image, labels

예제 #9

0

파일 보기

파일: maskrcnn_parser.py 프로젝트: ulen2000/training_prepare

    def _parse_train_data(self, data):
        """Parses data for training.

    Args:
      data: the decoded tensor dictionary from TfExampleDecoder.

    Returns:
      image: image tensor that is preproessed to have normalized value and
        dimension [output_size[0], output_size[1], 3]
      labels: a dictionary of tensors used for training. The following describes
        {key: value} pairs in the dictionary.
        image_info: a 2D `Tensor` that encodes the information of the image and
          the applied preprocessing. It is in the format of
          [[original_height, original_width], [scaled_height, scaled_width],
        anchor_boxes: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensor with
          shape [height_l, width_l, 4] representing anchor boxes at each level.
        rpn_score_targets: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensor with
          shape [height_l, width_l, anchors_per_location]. The height_l and
          width_l represent the dimension of class logits at l-th level.
        rpn_box_targets: ordered dictionary with keys
          [min_level, min_level+1, ..., max_level]. The values are tensor with
          shape [height_l, width_l, anchors_per_location * 4]. The height_l and
          width_l represent the dimension of bounding box regression output at
          l-th level.
        gt_boxes: Groundtruth bounding box annotations. The box is represented
           in [y1, x1, y2, x2] format. The coordinates are w.r.t the scaled
           image that is fed to the network. The tennsor is padded with -1 to
           the fixed dimension [self._max_num_instances, 4].
        gt_classes: Groundtruth classes annotations. The tennsor is padded
          with -1 to the fixed dimension [self._max_num_instances].
        gt_masks: groundtrugh masks cropped by the bounding box and
          resized to a fixed size determined by mask_crop_size.
    """
        classes = data['groundtruth_classes']
        boxes = data['groundtruth_boxes']
        if self._include_mask:
            masks = data['groundtruth_instance_masks']

        is_crowds = data['groundtruth_is_crowd']
        # Skips annotations with `is_crowd` = True.
        if self._skip_crowd_during_training and self._is_training:
            num_groundtrtuhs = tf.shape(classes)[0]
            with tf.control_dependencies([num_groundtrtuhs, is_crowds]):
                indices = tf.cond(
                    tf.greater(tf.size(is_crowds), 0),
                    lambda: tf.where(tf.logical_not(is_crowds))[:, 0],
                    lambda: tf.cast(tf.range(num_groundtrtuhs), tf.int64))
            classes = tf.gather(classes, indices)
            boxes = tf.gather(boxes, indices)
            if self._include_mask:
                masks = tf.gather(masks, indices)

        # Gets original image and its size.
        image = data['image']
        image_shape = tf.shape(image)[0:2]

        # Normalizes image with mean and std pixel values.
        image = input_utils.normalize_image(image)

        # Flips image randomly during training.
        if self._aug_rand_hflip:
            if self._include_mask:
                image, boxes, masks = input_utils.random_horizontal_flip(
                    image, boxes, masks)
            else:
                image, boxes = input_utils.random_horizontal_flip(image, boxes)

        # Converts boxes from normalized coordinates to pixel coordinates.
        # Now the coordinates of boxes are w.r.t. the original image.
        boxes = box_utils.denormalize_boxes(boxes, image_shape)

        # Resizes and crops image.
        image, image_info = input_utils.resize_and_crop_image(
            image,
            self._output_size,
            padded_size=input_utils.compute_padded_size(
                self._output_size, 2**self._max_level),
            aug_scale_min=self._aug_scale_min,
            aug_scale_max=self._aug_scale_max)
        image_height, image_width, _ = image.get_shape().as_list()

        # Resizes and crops boxes.
        # Now the coordinates of boxes are w.r.t the scaled image.
        image_scale = image_info[2, :]
        offset = image_info[3, :]
        boxes = input_utils.resize_and_crop_boxes(boxes, image_scale,
                                                  image_info[1, :], offset)

        # Filters out ground truth boxes that are all zeros.
        indices = box_utils.get_non_empty_box_indices(boxes)
        boxes = tf.gather(boxes, indices)
        classes = tf.gather(classes, indices)
        if self._include_mask:
            masks = tf.gather(masks, indices)
            # Transfer boxes to the original image space and do normalization.
            cropped_boxes = boxes + tf.tile(tf.expand_dims(offset, axis=0),
                                            [1, 2])
            cropped_boxes /= tf.tile(tf.expand_dims(image_scale, axis=0),
                                     [1, 2])
            cropped_boxes = box_utils.normalize_boxes(cropped_boxes,
                                                      image_shape)
            num_masks = tf.shape(masks)[0]
            masks = tf.image.crop_and_resize(
                tf.expand_dims(masks, axis=-1),
                cropped_boxes,
                box_indices=tf.range(num_masks, dtype=tf.int32),
                crop_size=[self._mask_crop_size, self._mask_crop_size],
                method='bilinear')
            masks = tf.squeeze(masks, axis=-1)

        # Assigns anchor targets.
        # Note that after the target assignment, box targets are absolute pixel
        # offsets w.r.t. the scaled image.
        input_anchor = anchor.Anchor(self._min_level, self._max_level,
                                     self._num_scales, self._aspect_ratios,
                                     self._anchor_size,
                                     (image_height, image_width))
        anchor_labeler = anchor.RpnAnchorLabeler(input_anchor,
                                                 self._rpn_match_threshold,
                                                 self._rpn_unmatched_threshold,
                                                 self._rpn_batch_size_per_im,
                                                 self._rpn_fg_fraction)
        rpn_score_targets, rpn_box_targets = anchor_labeler.label_anchors(
            boxes, tf.cast(tf.expand_dims(classes, axis=-1), dtype=tf.float32))

        # If bfloat16 is used, casts input image to tf.bfloat16.
        if self._use_bfloat16:
            image = tf.cast(image, dtype=tf.bfloat16)

        # Packs labels for model_fn outputs.
        labels = {
            'anchor_boxes': input_anchor.multilevel_boxes,
            'image_info': image_info,
            'rpn_score_targets': rpn_score_targets,
            'rpn_box_targets': rpn_box_targets,
        }
        labels['gt_boxes'] = input_utils.clip_or_pad_to_fixed_size(
            boxes, self._max_num_instances, -1)
        labels['gt_classes'] = input_utils.clip_or_pad_to_fixed_size(
            classes, self._max_num_instances, -1)
        if self._include_mask:
            labels['gt_masks'] = input_utils.clip_or_pad_to_fixed_size(
                masks, self._max_num_instances, -1)

        return image, labels

예제 #10

0

파일 보기

파일: extract_objects_parser.py 프로젝트: vishalbelsare/tpu

    def _parse_train_data(self, data):
        """Parses data for training.

    Args:
      data: the decoded tensor dictionary from TfExampleDecoder.

    Returns:
      image: image tensor that is preproessed to have normalized value and
        dimension [output_size[0], output_size[1], 3]
      labels: a dictionary of tensors used for training. The following describes
        {key: value} pairs in the dictionary.
        image: image tensor that is preproessed to have normalized value and
          dimension [output_size[0], output_size[1], 3]
        image_info: a 2D `Tensor` that encodes the information of the image and
          the applied preprocessing. It is in the format of
          [[original_height, original_width], [scaled_height, scaled_width],
        num_groundtrtuhs: number of objects.
        boxes: Groundtruth bounding box annotations. The box is represented
           in [y1, x1, y2, x2] format. The coordinates are w.r.t the scaled
           image that is fed to the network. The tennsor is padded with -1 to
           the fixed dimension [self._max_num_instances, 4].
        classes: Groundtruth classes annotations. The tennsor is padded
          with -1 to the fixed dimension [self._max_num_instances].
        masks: groundtrugh masks cropped by the bounding box and
          resized to a fixed size determined by mask_crop_size.
        pasted_objects_mask: a binary tensor with the same size as image which
          is computed as the union of all the objects masks.
    """
        classes = data['groundtruth_classes']
        boxes = data['groundtruth_boxes']
        if self._include_mask:
            masks = data['groundtruth_instance_masks']

        is_crowds = data['groundtruth_is_crowd']
        # Skips annotations with `is_crowd` = True.
        if self._skip_crowd_during_training:
            num_groundtrtuhs = tf.shape(classes)[0]
            with tf.control_dependencies([num_groundtrtuhs, is_crowds]):
                indices = tf.cond(
                    tf.greater(tf.size(is_crowds), 0),
                    lambda: tf.where(tf.logical_not(is_crowds))[:, 0],
                    lambda: tf.cast(tf.range(num_groundtrtuhs), tf.int64))
            classes = tf.gather(classes, indices)
            boxes = tf.gather(boxes, indices)
            if self._include_mask:
                masks = tf.gather(masks, indices)

        # Gets original image and its size.
        image = data['image']
        image_shape = tf.shape(image)[0:2]

        # Normalizes image with mean and std pixel values.
        image = input_utils.normalize_image(image)

        # Flips image randomly during training.
        if self._aug_rand_hflip:
            if self._include_mask:
                image, boxes, masks = input_utils.random_horizontal_flip(
                    image, boxes, masks)
            else:
                image, boxes = input_utils.random_horizontal_flip(image, boxes)

        # Converts boxes from normalized coordinates to pixel coordinates.
        # Now the coordinates of boxes are w.r.t. the original image.
        boxes = box_utils.denormalize_boxes(boxes, image_shape)

        # Resizes and crops image.
        image, image_info = input_utils.resize_and_crop_image(
            image,
            self._output_size,
            padded_size=input_utils.compute_padded_size(
                self._output_size, 2**self._max_level),
            aug_scale_min=self._aug_scale_min,
            aug_scale_max=self._aug_scale_max)

        # Resizes and crops boxes.
        # Now the coordinates of boxes are w.r.t the scaled image.
        image_scale = image_info[2, :]
        offset = image_info[3, :]
        boxes = input_utils.resize_and_crop_boxes(boxes, image_scale,
                                                  image_info[1, :], offset)

        # Filters out ground truth boxes that are all zeros.
        indices = box_utils.get_non_empty_box_indices(boxes)
        boxes = tf.gather(boxes, indices)
        classes = tf.gather(classes, indices)
        if self._include_mask:
            masks = tf.gather(masks, indices)
            uncropped_masks = tf.cast(masks, tf.int8)
            uncropped_masks = tf.expand_dims(uncropped_masks, axis=3)
            uncropped_masks = input_utils.resize_and_crop_masks(
                uncropped_masks, image_scale, self._output_size, offset)
            # Transfer boxes to the original image space and do normalization.
            cropped_boxes = boxes + tf.tile(tf.expand_dims(offset, axis=0),
                                            [1, 2])
            cropped_boxes /= tf.tile(tf.expand_dims(image_scale, axis=0),
                                     [1, 2])
            cropped_boxes = box_utils.normalize_boxes(cropped_boxes,
                                                      image_shape)
            num_masks = tf.shape(masks)[0]
            masks = tf.image.crop_and_resize(
                tf.expand_dims(masks, axis=-1),
                cropped_boxes,
                box_indices=tf.range(num_masks, dtype=tf.int32),
                crop_size=[self._mask_crop_size, self._mask_crop_size],
                method='bilinear')
            masks = tf.squeeze(masks, axis=-1)
        indices = tf.range(start=0, limit=tf.shape(classes)[0], dtype=tf.int32)

        # Samples the numbers of masks for pasting.
        m = tf.random.uniform(shape=[],
                              maxval=tf.shape(classes)[0] + 1,
                              dtype=tf.int32)
        m = tf.math.minimum(m, tf.shape(classes)[0])

        # Shuffles the indices of objects and keep the first m objects for pasting.
        shuffled_indices = tf.random.shuffle(indices)
        shuffled_indices = tf.slice(shuffled_indices, [0], [m])

        boxes = tf.gather(boxes, shuffled_indices)
        masks = tf.gather(masks, shuffled_indices)
        classes = tf.gather(classes, shuffled_indices)
        uncropped_masks = tf.gather(uncropped_masks, shuffled_indices)
        pasted_objects_mask = tf.reduce_max(uncropped_masks, 0)
        pasted_objects_mask = tf.cast(pasted_objects_mask, tf.bool)

        labels = {
            'image': image,
            'image_info': image_info,
            'num_groundtrtuhs': tf.shape(classes)[0],
            'boxes': boxes,
            'masks': masks,
            'classes': classes,
            'pasted_objects_mask': pasted_objects_mask,
        }
        return labels

예제 #11

0

파일 보기

    def _parse_eval_data(self, data):
        """Parses data for training and evaluation."""
        groundtruths = {}
        classes = data['groundtruth_classes']
        boxes = data['groundtruth_boxes']

        # Gets original image and its size.
        image = data['image']
        image_shape = tf.shape(input=image)[0:2]

        # Normalizes image with mean and std pixel values.
        image = input_utils.normalize_image(image)

        # Converts boxes from normalized coordinates to pixel coordinates.
        boxes = box_utils.denormalize_boxes(boxes, image_shape)

        # Resizes and crops image.
        image, image_info = input_utils.resize_and_crop_image(
            image,
            self._output_size,
            padded_size=input_utils.compute_padded_size(
                self._output_size, 2**self._max_level),
            aug_scale_min=1.0,
            aug_scale_max=1.0)
        image_height, image_width, _ = image.get_shape().as_list()

        # Resizes and crops boxes.
        image_scale = image_info[2, :]
        offset = image_info[3, :]
        boxes = input_utils.resize_and_crop_boxes(boxes, image_scale,
                                                  image_info[1, :], offset)
        # Filters out ground truth boxes that are all zeros.
        indices = box_utils.get_non_empty_box_indices(boxes)
        boxes = tf.gather(boxes, indices)
        classes = tf.gather(classes, indices)

        # Assigns anchors.
        input_anchor = anchor.Anchor(self._min_level, self._max_level,
                                     self._num_scales, self._aspect_ratios,
                                     self._anchor_size,
                                     (image_height, image_width))
        anchor_labeler = anchor.AnchorLabeler(input_anchor,
                                              self._match_threshold,
                                              self._unmatched_threshold)
        (cls_targets, box_targets,
         num_positives) = anchor_labeler.label_anchors(
             boxes, tf.cast(tf.expand_dims(classes, axis=1), tf.float32))

        # If bfloat16 is used, casts input image to tf.bfloat16.
        if self._use_bfloat16:
            image = tf.cast(image, dtype=tf.bfloat16)

        # Sets up groundtruth data for evaluation.
        groundtruths = {
            'source_id':
            data['source_id'],
            'num_groundtrtuhs':
            tf.shape(data['groundtruth_classes']),
            'image_info':
            image_info,
            'boxes':
            box_utils.denormalize_boxes(data['groundtruth_boxes'],
                                        image_shape),
            'classes':
            data['groundtruth_classes'],
            'areas':
            data['groundtruth_area'],
            'is_crowds':
            tf.cast(data['groundtruth_is_crowd'], tf.int32),
        }
        groundtruths['source_id'] = process_source_id(
            groundtruths['source_id'])
        groundtruths = pad_groundtruths_to_fixed_size(groundtruths,
                                                      self._max_num_instances)

        # Packs labels for model_fn outputs.
        labels = {
            'cls_targets': cls_targets,
            'box_targets': box_targets,
            'anchor_boxes': input_anchor.multilevel_boxes,
            'num_positives': num_positives,
            'image_info': image_info,
            'groundtruths': groundtruths,
        }
        return image, labels

예제 #12

0

파일 보기

    def _parse_train_data(self, data):
        """Parses data for training and evaluation."""
        classes = data['groundtruth_classes']
        boxes = data['groundtruth_boxes']
        is_crowds = data['groundtruth_is_crowd']
        # Skips annotations with `is_crowd` = True.
        if self._skip_crowd_during_training and self._is_training:
            num_groundtrtuhs = tf.shape(classes)[0]
            with tf.control_dependencies([num_groundtrtuhs, is_crowds]):
                indices = tf.cond(
                    tf.greater(tf.size(is_crowds), 0),
                    lambda: tf.where(tf.logical_not(is_crowds))[:, 0],
                    lambda: tf.cast(tf.range(num_groundtrtuhs), tf.int64))
            classes = tf.gather(classes, indices)
            boxes = tf.gather(boxes, indices)

        # Gets original image and its size.
        image = data['image']

        image_shape = tf.shape(image)[0:2]

        # Normalizes image with mean and std pixel values.
        image = input_utils.normalize_image(image)

        # Flips image randomly during training.
        if self._aug_rand_hflip:
            image, boxes = input_utils.random_horizontal_flip(image, boxes)

        # Converts boxes from normalized coordinates to pixel coordinates.
        boxes = box_utils.denormalize_boxes(boxes, image_shape)

        # Resizes and crops image.
        image, image_info = input_utils.resize_and_crop_image(
            image,
            self._output_size,
            padded_size=input_utils.compute_padded_size(
                self._output_size, 2**self._max_level),
            aug_scale_min=self._aug_scale_min,
            aug_scale_max=self._aug_scale_max)
        image_height, image_width, _ = image.get_shape().as_list()

        # Resizes and crops boxes.
        image_scale = image_info[2, :]
        offset = image_info[3, :]
        boxes = input_utils.resize_and_crop_boxes(boxes, image_scale,
                                                  (image_height, image_width),
                                                  offset)
        # Filters out ground truth boxes that are all zeros.
        indices = input_utils.get_non_empty_box_indices(boxes)
        boxes = tf.gather(boxes, indices)
        classes = tf.gather(classes, indices)

        # Assigns anchors.
        input_anchor = anchor.Anchor(self._min_level, self._max_level,
                                     self._num_scales, self._aspect_ratios,
                                     self._anchor_size,
                                     (image_height, image_width))
        anchor_labeler = anchor.AnchorLabeler(input_anchor,
                                              self._match_threshold,
                                              self._unmatched_threshold)
        (cls_targets, box_targets,
         num_positives) = anchor_labeler.label_anchors(
             boxes, tf.cast(tf.expand_dims(classes, axis=1), tf.float32))

        # If bfloat16 is used, casts input image to tf.bfloat16.
        if self._use_bfloat16:
            image = tf.cast(image, dtype=tf.bfloat16)

        # Packs labels for model_fn outputs.
        labels = {
            'cls_targets': cls_targets,
            'box_targets': box_targets,
            'anchor_boxes': input_anchor.multilevel_boxes,
            'num_positives': num_positives,
            'image_info': image_info,
        }
        return image, labels

예제 #13

0

파일 보기

    def _parse_train_data_v2(self, data):
        """Parses data for training.

        Args:
          data: the decoded tensor dictionary from TfExampleDecoder.

        Returns:
          image: image tensor that is preproessed to have normalized value and
            dimension [output_size[0], output_size[1], 3]
          labels: a dictionary of tensors used for training. The following describes
            {key: value} pairs in the dictionary.
            image_info: a 2D `Tensor` that encodes the information of the image and
              the applied preprocessing. It is in the format of
              [[original_height, original_width], [scaled_height, scaled_width],
            anchor_boxes: ordered dictionary with keys
              [min_level, min_level+1, ..., max_level]. The values are tensor with
              shape [height_l, width_l, 4] representing anchor boxes at each level.
            rpn_score_targets: ordered dictionary with keys
              [min_level, min_level+1, ..., max_level]. The values are tensor with
              shape [height_l, width_l, anchors_per_location]. The height_l and
              width_l represent the dimension of class logits at l-th level.
            rpn_box_targets: ordered dictionary with keys
              [min_level, min_level+1, ..., max_level]. The values are tensor with
              shape [height_l, width_l, anchors_per_location * 4]. The height_l and
              width_l represent the dimension of bounding box regression output at
              l-th level.
            gt_boxes: Groundtruth bounding box annotations. The box is represented
               in [y1, x1, y2, x2] format. The coordinates are w.r.t the scaled
               image that is fed to the network. The tennsor is padded with -1 to
               the fixed dimension [self._max_num_instances, 4].
            gt_classes: Groundtruth classes annotations. The tennsor is padded
              with -1 to the fixed dimension [self._max_num_instances].
            gt_masks: groundtrugh masks cropped by the bounding box and
              resized to a fixed size determined by mask_crop_size.
        """
        if self._use_autoaugment:
            try:
                from utils import (
                    autoaugment_utils, )  # pylint: disable=g-import-not-at-top
            except ImportError as e:
                logging.exception("Autoaugment is not supported in TF 2.x.")
                raise e

        classes = data["groundtruth_classes"]
        boxes = data["groundtruth_boxes"]
        masks = None
        attributes = None

        if self._include_mask:
            masks = data["groundtruth_instance_masks"]

        if self._num_attributes:
            attributes = data["groundtruth_attributes"]

        is_crowds = data["groundtruth_is_crowd"]
        # Skips annotations with `is_crowd` = True.
        if self._skip_crowd_during_training and self._is_training:
            num_groundtrtuhs = tf.shape(input=classes)[0]
            with tf.control_dependencies([num_groundtrtuhs, is_crowds]):
                indices = tf.cond(
                    pred=tf.greater(tf.size(input=is_crowds), 0),
                    true_fn=lambda: tf.compat.v1.where(
                        tf.logical_not(is_crowds))[:, 0],
                    false_fn=lambda: tf.cast(tf.range(num_groundtrtuhs), tf.
                                             int64),
                )
            classes = tf.gather(classes, indices)
            boxes = tf.gather(boxes, indices)

            if self._include_mask:
                masks = tf.gather(masks, indices)

            if self._num_attributes:
                attributes = tf.gather(attributes, indices)

        # Gets original image and its size.
        image = data["image"]

        # NOTE: The autoaugment method works best when used alongside the standard
        # horizontal flipping of images along with size jittering and normalization.
        if self._use_autoaugment and not self._apply_autoaugment_after_resizing:
            (
                image,
                boxes,
                masks,
            ) = autoaugment_utils.distort_image_and_masks_with_autoaugment(
                image, boxes, masks, self._autoaugment_policy_name)

        # Flips image randomly during training.
        if self._aug_rand_hflip:
            if self._include_mask:
                image, boxes, masks = input_utils.random_horizontal_flip(
                    image, boxes, masks)
            else:
                image, boxes = input_utils.random_horizontal_flip(image, boxes)

        # Resizes and crops image.
        image = tf.image.convert_image_dtype(image, dtype=tf.float32)
        image, image_info = input_utils.resize_and_crop_image(
            image,
            self._output_size,
            aug_scale_min=self._aug_scale_min,
            aug_scale_max=self._aug_scale_max,
        )

        # Converts boxes from normalized coordinates to pixel coordinates.
        # Now the coordinates of boxes are w.r.t. the original image.
        orig_image_shape = image_info[0]
        boxes = box_utils.denormalize_boxes(boxes, orig_image_shape)

        # Resizes and crops boxes.
        # Now the coordinates of boxes are w.r.t the scaled image.
        rescaled_image_shape = tf.shape(input=image)[:2]
        image_scale = image_info[2, :]
        offset = image_info[3, :]
        boxes = input_utils.resize_and_crop_boxes(boxes, image_scale,
                                                  rescaled_image_shape, offset)

        # Filters out ground truth boxes that are all zeros.
        boxes, classes, masks, attributes = self._remove_empty_boxes(
            boxes, classes, masks, attributes)

        # apply the autoaugment after resizing
        if self._use_autoaugment and self._apply_autoaugment_after_resizing:
            # prepare image and boxes for the autoaugment
            image = tf.image.convert_image_dtype(image, dtype=tf.uint8)
            boxes = box_utils.normalize_boxes(boxes, rescaled_image_shape)

            # prepare masks for the autoaugment
            masks = tf.expand_dims(masks, axis=-1)
            scaled_mask_size = tf.cast(
                tf.round(orig_image_shape * image_scale), tf.int32)
            scaled_masks = tf.image.resize(
                masks, scaled_mask_size, method=tf.image.ResizeMethod.BILINEAR)
            offset_int = tf.cast(offset, tf.int32)
            masks = scaled_masks[:, offset_int[0]:offset_int[0] +
                                 rescaled_image_shape[0],
                                 offset_int[1]:offset_int[1] +
                                 rescaled_image_shape[1], ]
            masks = tf.squeeze(masks, axis=-1)
            masks = tf.cast(tf.greater(masks, 0.5), tf.float32)

            # apply the autoaugment
            (
                image,
                boxes,
                masks,
            ) = autoaugment_utils.distort_image_and_masks_with_autoaugment(
                image, boxes, masks, self._autoaugment_policy_name)

            # convert the image back to float32 and denormalize bboxes
            image = tf.image.convert_image_dtype(image, dtype=tf.float32)
            boxes = box_utils.denormalize_boxes(boxes, rescaled_image_shape)

            # filters out empty bboxes
            boxes, classes, masks, attributes = self._remove_empty_boxes(
                boxes, classes, masks, attributes)

        if self._include_mask:
            if self._use_autoaugment and self._apply_autoaugment_after_resizing:
                # don't rescale boxes as masks were already resized
                cropped_boxes = box_utils.normalize_boxes(
                    boxes, rescaled_image_shape)
            else:
                # transfer boxes to the original image space
                cropped_boxes = boxes + tf.tile(tf.expand_dims(offset, axis=0),
                                                [1, 2])
                cropped_boxes /= tf.tile(tf.expand_dims(image_scale, axis=0),
                                         [1, 2])
                cropped_boxes = box_utils.normalize_boxes(
                    cropped_boxes, orig_image_shape)

            num_masks = tf.shape(input=masks)[0]
            masks = tf.image.crop_and_resize(
                tf.expand_dims(masks, axis=-1),
                cropped_boxes,
                box_indices=tf.range(num_masks, dtype=tf.int32),
                crop_size=[self._mask_crop_size, self._mask_crop_size],
                method="bilinear",
            )
            masks = tf.squeeze(masks, axis=-1)

        # Normalizes image with mean and std pixel values.
        image = input_utils.normalize_image(image)

        # pad the image
        padded_size = input_utils.compute_padded_size(self._output_size,
                                                      2**self._max_level)
        image = tf.image.pad_to_bounding_box(image, 0, 0, padded_size[0],
                                             padded_size[1])

        image_height, image_width, _ = image.get_shape().as_list()

        # Assigns anchor targets.
        # Note that after the target assignment, box targets are absolute pixel
        # offsets w.r.t. the scaled image.
        input_anchor = anchor.Anchor(
            self._min_level,
            self._max_level,
            self._num_scales,
            self._aspect_ratios,
            self._anchor_size,
            (image_height, image_width),
        )
        anchor_labeler = anchor.RpnAnchorLabeler(
            input_anchor,
            self._rpn_match_threshold,
            self._rpn_unmatched_threshold,
            self._rpn_batch_size_per_im,
            self._rpn_fg_fraction,
        )
        rpn_score_targets, rpn_box_targets = anchor_labeler.label_anchors(
            boxes, tf.cast(tf.expand_dims(classes, axis=-1), dtype=tf.float32))

        # If bfloat16 is used, casts input image to tf.bfloat16.
        if self._use_bfloat16:
            image = tf.cast(image, dtype=tf.bfloat16)

        # Packs labels for model_fn outputs.
        labels = {
            "anchor_boxes": input_anchor.multilevel_boxes,
            "image_info": image_info,
            "rpn_score_targets": rpn_score_targets,
            "rpn_box_targets": rpn_box_targets,
        }
        labels["gt_boxes"] = input_utils.pad_to_fixed_size(
            boxes, self._max_num_instances, -1)
        labels["gt_classes"] = input_utils.pad_to_fixed_size(
            classes, self._max_num_instances, -1)

        if self._include_mask:
            labels["gt_masks"] = input_utils.pad_to_fixed_size(
                masks, self._max_num_instances, -1)

        if self._num_attributes:
            labels["gt_attributes"] = input_utils.pad_to_fixed_size(
                attributes, self._max_num_instances, -1)

        return image, labels

예제 #14

0

파일 보기

파일: retinanet_parser.py 프로젝트: lucifer443/Yolov3_tensorflow-keras

    def _parse_train_data(self, data):
        """Parses data for training and evaluation."""
        classes = data['groundtruth_classes']
        boxes = data['groundtruth_boxes']
        is_crowds = data['groundtruth_is_crowd']
        # Skips annotations with `is_crowd` = True.
        if self._skip_crowd_during_training and self._is_training:
            num_groundtrtuhs = tf.shape(input=classes)[0]
            with tf.control_dependencies([num_groundtrtuhs, is_crowds]):
                indices = tf.cond(
                    pred=tf.greater(tf.size(input=is_crowds), 0),
                    true_fn=lambda: tf.where(tf.logical_not(is_crowds))[:, 0],
                    false_fn=lambda: tf.cast(tf.range(num_groundtrtuhs), tf.
                                             int64))
            classes = tf.gather(classes, indices)
            boxes = tf.gather(boxes, indices)

        # Gets original image and its size.
        image = data['image']

        # NOTE: The autoaugment method works best when used alongside the standard
        # horizontal flipping of images along with size jittering and normalization.
        if self._use_autoaugment:
            image, boxes = autoaugment_utils.distort_image_with_autoaugment(
                image, boxes, self._autoaugment_policy_name)

        image_shape = tf.shape(input=image)[0:2]

        # Normalizes image with mean and std pixel values.
        image = input_utils.normalize_image(image)

        # Flips image randomly during training.
        if self._aug_rand_hflip:
            image, boxes = input_utils.random_horizontal_flip(image, boxes)

        # Converts boxes from normalized coordinates to pixel coordinates.
        boxes = box_utils.denormalize_boxes(boxes, image_shape)

        # Resizes and crops image.
        image, image_info = input_utils.resize_and_crop_image(
            image,
            self._output_size,
            padded_size=input_utils.compute_padded_size(
                self._output_size, 2**self._max_level),
            aug_scale_min=self._aug_scale_min,
            aug_scale_max=self._aug_scale_max)
        image_height, image_width, _ = image.get_shape().as_list()

        # Resizes and crops boxes.
        image_scale = image_info[2, :]
        offset = image_info[3, :]
        boxes = input_utils.resize_and_crop_boxes(boxes, image_scale,
                                                  image_info[1, :], offset)
        # Filters out ground truth boxes that are all zeros.
        indices = box_utils.get_non_empty_box_indices(boxes)
        boxes = tf.gather(boxes, indices)
        classes = tf.gather(classes, indices)

        # Assigns anchors.
        input_anchor = anchor.Anchor(self._min_level, self._max_level,
                                     self._num_scales, self._aspect_ratios,
                                     self._anchor_size,
                                     (image_height, image_width))
        anchor_labeler = anchor.AnchorLabeler(input_anchor,
                                              self._match_threshold,
                                              self._unmatched_threshold)
        (cls_targets, box_targets,
         num_positives) = anchor_labeler.label_anchors(
             boxes, tf.cast(tf.expand_dims(classes, axis=1), tf.float32))

        # If bfloat16 is used, casts input image to tf.bfloat16.
        if self._use_bfloat16:
            image = tf.cast(image, dtype=tf.bfloat16)

        # Packs labels for model_fn outputs.
        labels = {
            'cls_targets': cls_targets,
            'box_targets': box_targets,
            'anchor_boxes': input_anchor.multilevel_boxes,
            'num_positives': num_positives,
            'image_info': image_info,
        }
        # return image, labels
        num_anchors = input_anchor.anchors_per_location
        mlvl_cls_targets = tf.concat([tf.reshape(cls_targets[lv], [-1, num_anchors]) \
                                     for lv in range(self._min_level, self._max_level+1)], axis=0)
        mlvl_box_targets = tf.concat([tf.reshape(box_targets[lv], [-1, num_anchors*4]) \
                                     for lv in range(self._min_level, self._max_level + 1)], axis=0)
        num_positives_expand = tf.ones_like(
            mlvl_box_targets[..., 0:1]) * num_positives
        mlvl_cls_targets_wp = tf.concat(
            [mlvl_cls_targets,
             tf.cast(num_positives_expand, dtype=tf.int32)],
            axis=-1)
        mlvl_box_targets_wp = tf.concat(
            [mlvl_box_targets, num_positives_expand], axis=-1)
        return image, (mlvl_cls_targets_wp, mlvl_box_targets_wp)