Exemplo n.º 1
0
    def _parse_eval_data(self, data):
        """Builds the padded image and the label dictionary for evaluation.

        The image is normalized, resized with the last entry of
        `self._resize_scales` as the requested short side (and
        `max(self._output_size)` passed as the size limit), then padded at
        the bottom/right to `self._output_size`. Boxes follow the same
        resize, are re-normalized against the resized image size and are
        converted with `box_ops.yxyx_to_cycxhw`.

        Args:
          data: a dict holding at least 'image', 'groundtruth_classes',
            'groundtruth_boxes', 'groundtruth_is_crowd' and 'source_id'.

        Returns:
          A tuple `(image, labels)`. `labels` contains 'classes', 'boxes',
          'id', 'image_info', 'is_crowd' and 'gt_boxes'; every per-box entry
          is clipped or padded to `self._max_num_boxes`.
        """
        gt_classes = data['groundtruth_classes']
        raw_boxes = data['groundtruth_boxes']
        crowd_flags = data['groundtruth_is_crowd']

        # Normalizes the input image with mean and std pixel values.
        image = preprocess_ops.normalize_image(data['image'])

        # Evaluation always resizes with the last configured scale.
        short_side = tf.constant([self._resize_scales[-1]], tf.float32)[0]

        # Boxes denormalized against the un-resized image; these are also
        # what gets reported under 'gt_boxes'.
        gt_boxes = box_ops.denormalize_boxes(raw_boxes, tf.shape(image)[:2])

        image, image_info = preprocess_ops.resize_image(
            image, short_side, max(self._output_size))
        resized_size = image_info[1, :]
        scaled_boxes = preprocess_ops.resize_and_crop_boxes(
            gt_boxes, image_info[2, :], resized_size, image_info[3, :])
        scaled_boxes = box_ops.normalize_boxes(scaled_boxes, resized_size)

        # Drops ground truth boxes that are all zeros, together with their
        # class and crowd entries.
        keep = box_ops.get_non_empty_box_indices(scaled_boxes)
        kept_classes = tf.gather(gt_classes, keep)
        kept_crowd = tf.gather(crowd_flags, keep)
        kept_boxes = box_ops.yxyx_to_cycxhw(tf.gather(scaled_boxes, keep))

        out_height, out_width = self._output_size[0], self._output_size[1]
        image = tf.image.pad_to_bounding_box(image, 0, 0, out_height,
                                             out_width)

        def to_fixed_size(tensor):
            return preprocess_ops.clip_or_pad_to_fixed_size(
                tensor, self._max_num_boxes)

        labels = {
            'classes': to_fixed_size(kept_classes),
            'boxes': to_fixed_size(kept_boxes),
            'id': int(data['source_id']),
            'image_info': image_info,
            'is_crowd': to_fixed_size(kept_crowd),
            'gt_boxes': to_fixed_size(gt_boxes),
        }

        return image, labels
Exemplo n.º 2
0
    def _parse_train_data(self, data):
        """Parses data for training, including random augmentation.

        Steps, in order:
          1. Normalize the image and apply a random horizontal flip.
          2. When a uniform sample exceeds 0.5: resize with a short side
             drawn uniformly from {400, 500, 600}, then crop a random window
             whose height and width are each sampled from
             [384, min(resized side, 600)).
          3. Resize with a short side drawn uniformly from
             `self._resize_scales` and pad to `self._output_size`.
        Boxes are transformed alongside the image in every step.

        Args:
          data: a dict holding at least 'image', 'groundtruth_classes' and
            'groundtruth_boxes'.

        Returns:
          A tuple `(image, labels)` where `labels` holds 'classes' (shifted
          by `self._class_offset`) and 'boxes' (converted with
          `box_ops.yxyx_to_cycxhw`), both clipped or padded to
          `self._max_num_boxes`.
        """
        classes = data['groundtruth_classes'] + self._class_offset
        boxes = data['groundtruth_boxes']

        # Gets original image.
        image = data['image']

        # Normalizes image with mean and std pixel values.
        image = preprocess_ops.normalize_image(image)
        image, boxes, _ = preprocess_ops.random_horizontal_flip(image, boxes)

        do_crop = tf.greater(tf.random.uniform([]), 0.5)
        if do_crop:
            # Rescale to a randomly chosen short side before cropping. Zero
            # logits make the categorical draw uniform over the 3 choices.
            boxes = box_ops.denormalize_boxes(boxes, tf.shape(image)[:2])
            index = tf.random.categorical(tf.zeros([1, 3]), 1)[0]
            scales = tf.gather([400.0, 500.0, 600.0], index, axis=0)
            short_side = scales[0]
            image, image_info = preprocess_ops.resize_image(image, short_side)
            boxes = preprocess_ops.resize_and_crop_boxes(
                boxes, image_info[2, :], image_info[1, :], image_info[3, :])
            boxes = box_ops.normalize_boxes(boxes, image_info[1, :])

            # Do cropping: sample the window size, then its top-left corner.
            shape = tf.cast(image_info[1], dtype=tf.int32)
            h = tf.random.uniform([],
                                  384,
                                  tf.math.minimum(shape[0], 600),
                                  dtype=tf.int32)
            w = tf.random.uniform([],
                                  384,
                                  tf.math.minimum(shape[1], 600),
                                  dtype=tf.int32)
            # The `+ 1` lets the window reach the bottom/right border, since
            # the integer maxval is exclusive.
            i = tf.random.uniform([], 0, shape[0] - h + 1, dtype=tf.int32)
            j = tf.random.uniform([], 0, shape[1] - w + 1, dtype=tf.int32)
            image = tf.image.crop_to_bounding_box(image, i, j, h, w)
            # Re-express the normalized boxes relative to the crop window and
            # clip whatever falls outside of it.
            boxes = tf.clip_by_value(
                (boxes[..., :] *
                 tf.cast(tf.stack([shape[0], shape[1], shape[0], shape[1]]),
                         dtype=tf.float32) -
                 tf.cast(tf.stack([i, j, i, j]), dtype=tf.float32)) /
                tf.cast(tf.stack([h, w, h, w]), dtype=tf.float32), 0.0, 1.0)

        # Draw the final short side uniformly from the configured scales. The
        # number of categories follows the configuration instead of being
        # hard-coded, so every scale can be picked and no index is out of
        # range when the list length changes.
        num_scales = len(self._resize_scales)
        scales = tf.constant(self._resize_scales, dtype=tf.float32)
        index = tf.random.categorical(tf.zeros([1, num_scales]), 1)[0]
        scales = tf.gather(scales, index, axis=0)

        image_shape = tf.shape(image)[:2]
        boxes = box_ops.denormalize_boxes(boxes, image_shape)
        short_side = scales[0]
        image, image_info = preprocess_ops.resize_image(
            image, short_side, max(self._output_size))
        boxes = preprocess_ops.resize_and_crop_boxes(boxes, image_info[2, :],
                                                     image_info[1, :],
                                                     image_info[3, :])
        boxes = box_ops.normalize_boxes(boxes, image_info[1, :])

        # Filters out ground truth boxes that are all zeros.
        indices = box_ops.get_non_empty_box_indices(boxes)
        boxes = tf.gather(boxes, indices)
        classes = tf.gather(classes, indices)
        boxes = box_ops.yxyx_to_cycxhw(boxes)

        image = tf.image.pad_to_bounding_box(image, 0, 0, self._output_size[0],
                                             self._output_size[1])
        labels = {
            'classes':
            preprocess_ops.clip_or_pad_to_fixed_size(classes,
                                                     self._max_num_boxes),
            'boxes':
            preprocess_ops.clip_or_pad_to_fixed_size(boxes,
                                                     self._max_num_boxes)
        }

        return image, labels
Exemplo n.º 3
0
def random_crop_image_with_boxes_and_labels(img, boxes, labels, min_scale,
                                            aspect_ratio_range,
                                            min_overlap_params, max_retry):
  """Crops a random slice from the input image.

  The function will correspondingly recompute the bounding boxes and filter out
  outside boxes and their labels.

  References:
  [1] End-to-End Object Detection with Transformers
  https://arxiv.org/abs/2005.12872

  The preprocessing steps:
  1. Sample a minimum IoU overlap. If the sampled value exceeds 1.0 the inputs
    are returned untouched.
  2. For each trial, sample the new image width, height, and top-left corner.
  3. Compute the IoUs of bounding boxes with the cropped image and retry if
    the maximum IoU is below the sampled threshold.
  4. Find boxes whose centers are in the cropped image.
  5. Compute new bounding boxes in the cropped region and only select those
    boxes' labels.

  Args:
    img: a 'Tensor' of shape [height, width, 3] representing the input image.
    boxes: a 'Tensor' of shape [N, 4] representing the ground-truth bounding
      boxes with (ymin, xmin, ymax, xmax). The code compares them against
      crop bounds divided by the image size and multiplies them by the image
      size, i.e. it treats them as normalized coordinates.
    labels: a 'Tensor' of shape [N,] representing the class labels of the boxes.
    min_scale: a 'float' in [0.0, 1.0) indicating the lower bound of the random
      scale variable.
    aspect_ratio_range: a list of two 'float' that specifies the lower and upper
      bound of the random aspect ratio (height / width of the crop).
    min_overlap_params: a list of four 'float' representing the min value, max
      value, step size, and offset for the minimum overlap sample.
    max_retry: an 'int' representing the number of trials for cropping. If it is
      exhausted, no cropping will be performed.

  Returns:
    img: a Tensor representing the random cropped image. Can be the
      original image if max_retry is exhausted.
    boxes: a Tensor representing the bounding boxes in the cropped image,
      expressed relative to the crop and clipped to [0, 1].
    labels: a Tensor representing the new bounding boxes' labels.
  """

  shape = tf.shape(img)
  original_h = shape[0]
  original_w = shape[1]

  minval, maxval, step, offset = min_overlap_params

  # Quantize a uniform sample to multiples of `step`, then shift by `offset`.
  min_overlap = tf.math.floordiv(
      tf.random.uniform([], minval=minval, maxval=maxval), step) * step - offset

  min_overlap = tf.clip_by_value(min_overlap, 0.0, 1.1)

  # A threshold above 1.0 can never be met by an IoU; it means "do not crop".
  if min_overlap > 1.0:
    return img, boxes, labels

  aspect_ratio_low = aspect_ratio_range[0]
  aspect_ratio_high = aspect_ratio_range[1]

  for _ in tf.range(max_retry):
    scale_h = tf.random.uniform([], min_scale, 1.0)
    scale_w = tf.random.uniform([], min_scale, 1.0)
    new_h = tf.cast(
        scale_h * tf.cast(original_h, dtype=tf.float32), dtype=tf.int32)
    new_w = tf.cast(
        scale_w * tf.cast(original_w, dtype=tf.float32), dtype=tf.int32)

    # Aspect ratio has to be in the prespecified range
    aspect_ratio = new_h / new_w
    if aspect_ratio_low > aspect_ratio or aspect_ratio > aspect_ratio_high:
      continue

    # The integer `maxval` of tf.random.uniform is exclusive. The `+ 1` lets
    # the crop touch the right/bottom border and keeps the range non-empty
    # when a crop dimension equals the original one.
    left = tf.random.uniform([], 0, original_w - new_w + 1, dtype=tf.int32)
    right = left + new_w
    top = tf.random.uniform([], 0, original_h - new_h + 1, dtype=tf.int32)
    bottom = top + new_h

    # Crop bounds as fractions of the original image size.
    normalized_left = tf.cast(
        left, dtype=tf.float32) / tf.cast(
            original_w, dtype=tf.float32)
    normalized_right = tf.cast(
        right, dtype=tf.float32) / tf.cast(
            original_w, dtype=tf.float32)
    normalized_top = tf.cast(
        top, dtype=tf.float32) / tf.cast(
            original_h, dtype=tf.float32)
    normalized_bottom = tf.cast(
        bottom, dtype=tf.float32) / tf.cast(
            original_h, dtype=tf.float32)

    cropped_box = tf.expand_dims(
        tf.stack([
            normalized_top,
            normalized_left,
            normalized_bottom,
            normalized_right,
        ]),
        axis=0)
    iou = box_ops.bbox_overlap(
        tf.expand_dims(cropped_box, axis=0),
        tf.expand_dims(boxes, axis=0))  # (1, 1, n_ground_truth)
    iou = tf.squeeze(iou, axis=[0, 1])

    # If not a single bounding box has a Jaccard overlap of greater than
    # the minimum, try again
    if tf.reduce_max(iou) < min_overlap:
      continue

    centroids = box_ops.yxyx_to_cycxhw(boxes)
    mask = tf.math.logical_and(
        tf.math.logical_and(centroids[:, 0] > normalized_top,
                            centroids[:, 0] < normalized_bottom),
        tf.math.logical_and(centroids[:, 1] > normalized_left,
                            centroids[:, 1] < normalized_right))
    # If not a single bounding box has its center in the crop, try again.
    if tf.reduce_sum(tf.cast(mask, dtype=tf.int32)) > 0:
      indices = tf.squeeze(tf.where(mask), axis=1)

      filtered_boxes = tf.gather(boxes, indices)

      # Convert to pixels, shift to the crop origin, re-normalize by the crop
      # size and clip the parts that stick out of the crop.
      boxes = tf.clip_by_value(
          (filtered_boxes[..., :] * tf.cast(
              tf.stack([original_h, original_w, original_h, original_w]),
              dtype=tf.float32) -
           tf.cast(tf.stack([top, left, top, left]), dtype=tf.float32)) /
          tf.cast(tf.stack([new_h, new_w, new_h, new_w]), dtype=tf.float32),
          0.0, 1.0)

      img = tf.image.crop_to_bounding_box(img, top, left, bottom - top,
                                          right - left)

      labels = tf.gather(labels, indices)
      break

  return img, boxes, labels
Exemplo n.º 4
0
  def preprocess(self, inputs):
    """Preprocess COCO for DETR.

    In training mode (`self._params.is_training`) the image is randomly
    flipped, optionally resized and randomly cropped (when a uniform sample
    exceeds 0.5), and then resized with a short side drawn uniformly from
    `self._params.resize_scales`. Otherwise the last entry of
    `self._params.resize_scales` is used as the short side. In both modes the
    image is finally padded to `self._params.output_size`.

    Args:
      inputs: a dict with 'image' and an 'objects' dict providing 'bbox',
        'label' and 'is_crowd'. 'image/id' is read when not training.

    Returns:
      A tuple `(image, labels)`. `labels` always holds 'classes' (input labels
      shifted by one) and 'boxes' (converted with `box_ops.yxyx_to_cycxhw`),
      clipped or padded to `self._params.max_num_boxes`. When not training it
      additionally holds 'id', 'image_info', 'is_crowd' and 'gt_boxes'.
    """
    image = inputs['image']
    boxes = inputs['objects']['bbox']
    classes = inputs['objects']['label'] + 1
    is_crowd = inputs['objects']['is_crowd']

    image = preprocess_ops.normalize_image(image)
    if self._params.is_training:
      image, boxes, _ = preprocess_ops.random_horizontal_flip(image, boxes)

      do_crop = tf.greater(tf.random.uniform([]), 0.5)
      if do_crop:
        # Rescale to a randomly chosen short side before cropping. Zero
        # logits make the categorical draw uniform over the 3 choices.
        boxes = box_ops.denormalize_boxes(boxes, tf.shape(image)[:2])
        index = tf.random.categorical(tf.zeros([1, 3]), 1)[0]
        scales = tf.gather([400.0, 500.0, 600.0], index, axis=0)
        short_side = scales[0]
        image, image_info = preprocess_ops.resize_image(image, short_side)
        boxes = preprocess_ops.resize_and_crop_boxes(boxes,
                                                     image_info[2, :],
                                                     image_info[1, :],
                                                     image_info[3, :])
        boxes = box_ops.normalize_boxes(boxes, image_info[1, :])

        # Do cropping: sample the window size, then its top-left corner.
        shape = tf.cast(image_info[1], dtype=tf.int32)
        h = tf.random.uniform(
            [], 384, tf.math.minimum(shape[0], 600), dtype=tf.int32)
        w = tf.random.uniform(
            [], 384, tf.math.minimum(shape[1], 600), dtype=tf.int32)
        # The `+ 1` lets the window reach the bottom/right border, since the
        # integer maxval is exclusive.
        i = tf.random.uniform([], 0, shape[0] - h + 1, dtype=tf.int32)
        j = tf.random.uniform([], 0, shape[1] - w + 1, dtype=tf.int32)
        image = tf.image.crop_to_bounding_box(image, i, j, h, w)
        # Re-express the normalized boxes relative to the crop window and
        # clip whatever falls outside of it.
        boxes = tf.clip_by_value(
            (boxes[..., :] * tf.cast(
                tf.stack([shape[0], shape[1], shape[0], shape[1]]),
                dtype=tf.float32) -
             tf.cast(tf.stack([i, j, i, j]), dtype=tf.float32)) /
            tf.cast(tf.stack([h, w, h, w]), dtype=tf.float32), 0.0, 1.0)

      # Draw the final short side uniformly from the configured scales. The
      # number of categories follows the configuration instead of being
      # hard-coded, so every scale can be picked and no index is out of range
      # when the list length changes.
      num_scales = len(self._params.resize_scales)
      scales = tf.constant(
          self._params.resize_scales,
          dtype=tf.float32)
      index = tf.random.categorical(tf.zeros([1, num_scales]), 1)[0]
      scales = tf.gather(scales, index, axis=0)
    else:
      # Outside of training the last configured scale is always used.
      scales = tf.constant([self._params.resize_scales[-1]], tf.float32)

    image_shape = tf.shape(image)[:2]
    boxes = box_ops.denormalize_boxes(boxes, image_shape)
    # Denormalized boxes before the final resize; reported as 'gt_boxes'.
    gt_boxes = boxes
    short_side = scales[0]
    image, image_info = preprocess_ops.resize_image(
        image,
        short_side,
        max(self._params.output_size))
    boxes = preprocess_ops.resize_and_crop_boxes(boxes,
                                                 image_info[2, :],
                                                 image_info[1, :],
                                                 image_info[3, :])
    boxes = box_ops.normalize_boxes(boxes, image_info[1, :])

    # Filters out ground truth boxes that are all zeros.
    indices = box_ops.get_non_empty_box_indices(boxes)
    boxes = tf.gather(boxes, indices)
    classes = tf.gather(classes, indices)
    is_crowd = tf.gather(is_crowd, indices)
    boxes = box_ops.yxyx_to_cycxhw(boxes)

    image = tf.image.pad_to_bounding_box(
        image, 0, 0, self._params.output_size[0], self._params.output_size[1])
    labels = {
        'classes':
            preprocess_ops.clip_or_pad_to_fixed_size(
                classes, self._params.max_num_boxes),
        'boxes':
            preprocess_ops.clip_or_pad_to_fixed_size(
                boxes, self._params.max_num_boxes)
    }
    if not self._params.is_training:
      labels.update({
          'id':
              inputs['image/id'],
          'image_info':
              image_info,
          'is_crowd':
              preprocess_ops.clip_or_pad_to_fixed_size(
                  is_crowd, self._params.max_num_boxes),
          'gt_boxes':
              preprocess_ops.clip_or_pad_to_fixed_size(
                  gt_boxes, self._params.max_num_boxes),
      })

    return image, labels
    def update_state(self, groundtruths, predictions):
        """Feeds a batch of groundtruths and predictions to the parent metric.

        Tuple-valued entries of both dictionaries are concatenated along
        axis 0 (the dictionaries are updated in place), class id 3 is
        remapped to 4, detection boxes are divided by the scale stored in
        `image_info[:, 2]`, and each frame is forwarded to
        `super().update_state` after its padding has been removed.

        Args:
          groundtruths: a dictionary of Tensors including the fields below.
            Required fields:
              - source_id: a numpy array of int or string of shape
                [batch_size].
              - num_detections: a numpy array of int of shape [batch_size].
              - boxes: a numpy array of float of shape [batch_size, K, 4].
              - classes: a numpy array of int of shape [batch_size, K].
              - difficulties: a numpy array of int of shape [batch_size, K].

          predictions: a dictionary of tensors including the fields below.
            Required fields:
              - source_id: a numpy array of int or string of shape
                [batch_size]. Not read here; the frame id of a prediction is
                taken from `groundtruths['source_id']`.
              - image_info: a numpy array of float of shape
                [batch_size, 4, 2].
              - num_detections: a numpy array of int of shape [batch_size].
              - detection_boxes: a numpy array of float of shape
                [batch_size, K, 4].
              - detection_classes: a numpy array of int of shape
                [batch_size, K].
              - detection_scores: a numpy array of float of shape
                [batch_size, K].
        """
        # Preprocess potentially aggregated tensors.
        for tensors in (groundtruths, predictions):
            for key, value in tensors.items():
                if isinstance(value, tuple):
                    tensors[key] = tf.concat(value, axis=0)

        # Change cyclists' type id from 3 to 4, where 3 is reserved for sign.
        gt_types = tf.cast(groundtruths['classes'], tf.uint8)
        gt_types = tf.where(
            tf.equal(gt_types, 3), tf.ones_like(gt_types) * 4, gt_types)
        pred_types = tf.cast(predictions['detection_classes'], tf.uint8)
        pred_types = tf.where(
            tf.equal(pred_types, 3), tf.ones_like(pred_types) * 4, pred_types)

        # Rescale the detection boxes back to original scale. The two-value
        # scale row is repeated so it lines up with the four box coordinates.
        scale_row = predictions['image_info'][:, 2:3, :]
        rescaled_boxes = predictions['detection_boxes'] / tf.tile(
            scale_row, (1, 1, 2))

        source_ids = groundtruths['source_id']
        num_frames = tf.shape(source_ids)[0]

        for i in tf.range(num_frames):
            frame_id = source_ids[i]

            gt_frame_boxes = tf.cast(groundtruths['boxes'][i], tf.float32)
            gt_frame = {
                'ground_truth_frame_id': frame_id,
                'ground_truth_bbox': box_ops.yxyx_to_cycxhw(gt_frame_boxes),
                'ground_truth_type': gt_types[i],
                'ground_truth_difficulty':
                tf.cast(groundtruths['difficulties'][i], tf.uint8),
            }
            gt_frame = self._remove_padding(
                gt_frame, groundtruths['num_detections'][i])

            frame_scores = predictions['detection_scores'][i]
            pred_frame_boxes = tf.cast(rescaled_boxes[i], tf.float32)
            pred_frame = {
                'prediction_frame_id': frame_id,
                'prediction_bbox': box_ops.yxyx_to_cycxhw(pred_frame_boxes),
                'prediction_type': pred_types[i],
                'prediction_score': tf.cast(frame_scores, tf.float32),
                'prediction_overlap_nlz':
                tf.zeros_like(frame_scores, dtype=tf.bool),
            }
            pred_frame = self._remove_padding(
                pred_frame, predictions['num_detections'][i])

            super().update_state(gt_frame, pred_frame)