Example #1
    def build(self, input_shapes):
        """Creates the variables of the layer."""
        feature_channels = [
            shape_utils.get_dim_as_int(input_shape[3])
            for input_shape in input_shapes
        ]
        has_different_feature_channels = len(set(feature_channels)) > 1
        if has_different_feature_channels:
            inserted_layer_counter = 0
            target_channel = max(set(feature_channels),
                                 key=feature_channels.count)
            tf.logging.info(
                'Not all feature maps have the same number of '
                'channels, found: {}, appending additional projection '
                'layers to bring all feature maps to uniformly have {} '
                'channels.'.format(feature_channels, target_channel))
        else:
            # Placeholder values when has_different_feature_channels is False.
            target_channel = -1
            inserted_layer_counter = -1

        def _build_layers(tower_name_scope, feature_index):
            conv_layers, base_tower_layers = self._compute_base_tower(
                tower_name_scope=tower_name_scope, feature_index=feature_index)
            if tower_name_scope not in self._head_scope_conv_layers:
                self._head_scope_conv_layers[tower_name_scope] = conv_layers
            return base_tower_layers

        for feature_index, input_shape in enumerate(input_shapes):
            # Additional projection layers should not be shared, as the input
            # channels (and thus the weight shapes) differ across feature maps.
            inserted_layer_counter, projection_layers = (
                self._insert_additional_projection_layer(
                    inserted_layer_counter, target_channel))
            self._additional_projection_layers.append(projection_layers)

            if self._share_prediction_tower:
                box_tower_scope = 'PredictionTower'
            else:
                box_tower_scope = 'BoxPredictionTower'
            # Build the base tower layers for the box head.
            box_tower_layers = _build_layers(box_tower_scope, feature_index)
            self._base_tower_layers_for_heads[BOX_ENCODINGS].append(
                box_tower_layers)

            for head_name in self._sorted_head_names:
                if head_name == CLASS_PREDICTIONS_WITH_BACKGROUND:
                    tower_name_scope = 'ClassPredictionTower'
                else:
                    tower_name_scope = '{}PredictionTower'.format(head_name)
                box_tower_layers = _build_layers(tower_name_scope,
                                                 feature_index)
                self._base_tower_layers_for_heads[head_name].append(
                    box_tower_layers)

        self.built = True
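
The channel-unification logic in `build` is easy to check in isolation. A minimal standalone sketch (plain Python; the `feature_channels` list is hypothetical example data) of how the most frequent channel count is chosen as the projection target:

feature_channels = [256, 256, 512, 256]  # hypothetical channel counts
has_different_feature_channels = len(set(feature_channels)) > 1
if has_different_feature_channels:
    # The most common channel count wins; ties resolve arbitrarily.
    target_channel = max(set(feature_channels), key=feature_channels.count)
    print(target_channel)  # 256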
Example #2
  def num_boxes_static(self):
    """Returns number of boxes held in collection.

    This number is inferred at graph construction time rather than run-time.

    Returns:
      Number of boxes held in collection (integer) or None if this is not
        inferrable at graph construction time.
    """
    return shape_utils.get_dim_as_int(self.data['boxes'].get_shape()[0])
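
A minimal usage sketch, assuming the TF1-style `BoxList` container from the TensorFlow Object Detection API (`object_detection.core.box_list`); the box coordinates are hypothetical:

import tensorflow as tf
from object_detection.core import box_list

# Statically shaped [2, 4] tensor: the box count is known at graph
# construction time.
static_boxes = box_list.BoxList(
    tf.constant([[0.1, 0.1, 0.4, 0.4],
                 [0.5, 0.5, 0.9, 0.9]], dtype=tf.float32))
print(static_boxes.num_boxes_static())  # 2

# With a dynamic first dimension, the count is not statically inferrable.
dynamic_boxes = box_list.BoxList(tf.placeholder(tf.float32, [None, 4]))
print(dynamic_boxes.num_boxes_static())  # None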
Example #3
def batch_decode(encoded_boxes, box_coder, anchors):
    """Decode a batch of encoded boxes.

  This op takes a batch of encoded bounding boxes and transforms
  them to a batch of bounding boxes specified by their corners in
  the order of [y_min, x_min, y_max, x_max].

  Args:
    encoded_boxes: a float32 tensor of shape [batch_size, num_anchors,
      code_size] representing the location of the objects.
    box_coder: a BoxCoder object.
    anchors: a BoxList of anchors used to encode `encoded_boxes`.

  Returns:
    decoded_boxes: a float32 tensor of shape [batch_size, num_anchors,
      coder_size] representing the corners of the objects in the order
      of [y_min, x_min, y_max, x_max].

  Raises:
    ValueError: if batch sizes of the inputs are inconsistent, or if
    the number of anchors inferred from encoded_boxes and anchors are
    inconsistent.
  """
    encoded_boxes.get_shape().assert_has_rank(3)
    if (shape_utils.get_dim_as_int(encoded_boxes.get_shape()[1]) !=
            anchors.num_boxes_static()):
        raise ValueError(
            'The numbers of anchors inferred from encoded_boxes'
            ' and anchors are inconsistent: shape[1] of encoded_boxes'
            ' %s should be equal to the number of anchors: %s.' %
            (shape_utils.get_dim_as_int(
                encoded_boxes.get_shape()[1]), anchors.num_boxes_static()))

    decoded_boxes = tf.stack([
        box_coder.decode(boxes, anchors).get()
        for boxes in tf.unstack(encoded_boxes)
    ])
    return decoded_boxes
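
A minimal usage sketch, assuming `FasterRcnnBoxCoder` from `object_detection.box_coders.faster_rcnn_box_coder` (code size 4) and the `BoxList` container; for this coder, all-zero encodings decode back to the anchors themselves:

import tensorflow as tf
from object_detection.box_coders import faster_rcnn_box_coder
from object_detection.core import box_list

coder = faster_rcnn_box_coder.FasterRcnnBoxCoder()
anchors = box_list.BoxList(
    tf.constant([[0.0, 0.0, 0.5, 0.5],
                 [0.5, 0.5, 1.0, 1.0]], dtype=tf.float32))
# Batch of 3 images, 2 anchors each, code size 4.
encoded_boxes = tf.zeros([3, 2, 4], dtype=tf.float32)
decoded_boxes = batch_decode(encoded_boxes, coder, anchors)  # shape [3, 2, 4]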
Example #4
  def build(self, input_shapes):
    num_conv_channels = self._mask_prediction_conv_depth
    if num_conv_channels == 0:
      num_feature_channels = input_shapes.as_list()[3]
      num_conv_channels = self._get_mask_predictor_conv_depth(
          num_feature_channels, self._num_classes)

    for i in range(self._mask_prediction_num_conv_layers - 1):
      self._mask_predictor_layers.append(
          tf.keras.layers.Conv2D(
              num_conv_channels,
              [3, 3],
              padding='SAME',
              name='MaskPredictor_conv2d_{}'.format(i),
              **self._conv_hyperparams.params()))
      self._mask_predictor_layers.append(
          self._conv_hyperparams.build_batch_norm(
              training=(self._is_training and not self._freeze_batchnorm),
              name='MaskPredictor_batchnorm_{}'.format(i)))
      self._mask_predictor_layers.append(
          self._conv_hyperparams.build_activation_layer(
              name='MaskPredictor_activation_{}'.format(i)))

    if self._convolve_then_upsample:
      # Replace transposed convolution with a nearest-neighbor upsampling step
      # followed by a 3x3 convolution.
      height_scale = self._mask_height / shape_utils.get_dim_as_int(
          input_shapes[1])
      width_scale = self._mask_width / shape_utils.get_dim_as_int(
          input_shapes[2])
      # pylint: disable=g-long-lambda
      self._mask_predictor_layers.append(tf.keras.layers.Lambda(
          lambda features: ops.nearest_neighbor_upsampling(
              features, height_scale=height_scale, width_scale=width_scale)
      ))
      # pylint: enable=g-long-lambda
      self._mask_predictor_layers.append(
          tf.keras.layers.Conv2D(
              num_conv_channels,
              [3, 3],
              padding='SAME',
              name='MaskPredictor_upsample_conv2d',
              **self._conv_hyperparams.params()))
      self._mask_predictor_layers.append(
          self._conv_hyperparams.build_batch_norm(
              training=(self._is_training and not self._freeze_batchnorm),
              name='MaskPredictor_upsample_batchnorm'))
      self._mask_predictor_layers.append(
          self._conv_hyperparams.build_activation_layer(
              name='MaskPredictor_upsample_activation'))

    num_masks = 1 if self._masks_are_class_agnostic else self._num_classes
    self._mask_predictor_layers.append(
        tf.keras.layers.Conv2D(
            num_masks,
            [3, 3],
            padding='SAME',
            name='MaskPredictor_last_conv2d',
            **self._conv_hyperparams.params(use_bias=True)))

    self.built = True
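
The layers accumulated in `self._mask_predictor_layers` are intended to be applied sequentially at call time. A minimal sketch of that forward pass, under the assumption that the head applies them in insertion order (the real head's forward method may be named differently):

  def call(self, features):
    # Apply the stacked conv / batch-norm / activation / upsampling layers
    # in the order they were appended during build().
    for layer in self._mask_predictor_layers:
      features = layer(features)
    return features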
Example #5
    def _predict(self, image_features, num_predictions_per_location_list):
        """Computes encoded object locations and corresponding confidences.

    Args:
      image_features: A list of float tensors of shape [batch_size, height_i,
        width_i, channels] containing features for a batch of images. Note that
        when not all tensors in the list have the same number of channels, an
        additional projection layer will be added on top the tensor to generate
        feature map with number of channels consitent with the majority.
      num_predictions_per_location_list: A list of integers representing the
        number of box predictions to be made per spatial location for each
        feature map. Note that all values must be the same since the weights are
        shared.

    Returns:
      A dictionary containing:
        box_encodings: A list of float tensors of shape
          [batch_size, num_anchors_i, code_size] representing the location of
          the objects. Each entry in the list corresponds to a feature map in
          the input `image_features` list.
        class_predictions_with_background: A list of float tensors of shape
          [batch_size, num_anchors_i, num_classes + 1] representing the class
          predictions for the proposals. Each entry in the list corresponds to a
          feature map in the input `image_features` list.
        (optional) Predictions from other heads.
          E.g., mask_predictions: A list of float tensors of shape
          [batch_size, num_anchord_i, num_classes, mask_height, mask_width].


    Raises:
      ValueError: If the num predictions per locations differs between the
        feature maps.
    """
        if len(set(num_predictions_per_location_list)) > 1:
            raise ValueError(
                'num predictions per location must be the same for all '
                'feature maps, found: {}'.format(
                    num_predictions_per_location_list))
        feature_channels = [
            shape_utils.get_dim_as_int(image_feature.shape[3])
            for image_feature in image_features
        ]
        has_different_feature_channels = len(set(feature_channels)) > 1
        if has_different_feature_channels:
            inserted_layer_counter = 0
            target_channel = max(set(feature_channels),
                                 key=feature_channels.count)
            tf.logging.info(
                'Not all feature maps have the same number of '
                'channels, found: {}, appending additional projection '
                'layers to bring all feature maps to uniformly have {} '
                'channels.'.format(feature_channels, target_channel))
        else:
            # Placeholder values when has_different_feature_channels is False.
            target_channel = -1
            inserted_layer_counter = -1
        predictions = {
            BOX_ENCODINGS: [],
            CLASS_PREDICTIONS_WITH_BACKGROUND: [],
        }
        for head_name in self._other_heads.keys():
            predictions[head_name] = []
        for feature_index, (image_feature,
                            num_predictions_per_location) in enumerate(
                                zip(image_features,
                                    num_predictions_per_location_list)):
            with tf.variable_scope('WeightSharedConvolutionalBoxPredictor',
                                   reuse=tf.AUTO_REUSE):
                with slim.arg_scope(self._conv_hyperparams_fn()):
                    # TODO(wangjiang) Pass is_training to the head class directly.
                    with slim.arg_scope([slim.dropout],
                                        is_training=self._is_training):
                        (image_feature, inserted_layer_counter
                         ) = self._insert_additional_projection_layer(
                             image_feature, inserted_layer_counter,
                             target_channel)
                        if self._share_prediction_tower:
                            box_tower_scope = 'PredictionTower'
                        else:
                            box_tower_scope = 'BoxPredictionTower'
                        box_tower_feature = self._compute_base_tower(
                            tower_name_scope=box_tower_scope,
                            image_feature=image_feature,
                            feature_index=feature_index)
                        box_encodings = self._box_prediction_head.predict(
                            features=box_tower_feature,
                            num_predictions_per_location=
                            num_predictions_per_location)
                        predictions[BOX_ENCODINGS].append(box_encodings)
                        sorted_keys = sorted(self._other_heads.keys())
                        sorted_keys.append(CLASS_PREDICTIONS_WITH_BACKGROUND)
                        for head_name in sorted_keys:
                            if head_name == CLASS_PREDICTIONS_WITH_BACKGROUND:
                                head_obj = self._class_prediction_head
                            else:
                                head_obj = self._other_heads[head_name]
                            prediction = self._predict_head(
                                head_name=head_name,
                                head_obj=head_obj,
                                image_feature=image_feature,
                                box_tower_feature=box_tower_feature,
                                feature_index=feature_index,
                                num_predictions_per_location=
                                num_predictions_per_location)
                            predictions[head_name].append(prediction)
        return predictions
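
Weight sharing across feature maps hinges on `tf.AUTO_REUSE`: every iteration of the loop re-enters the same variable scope, so `tf.get_variable` returns the variables created on the first pass. A minimal standalone TF1 sketch of the mechanism (the scope and variable names here are illustrative only):

import tensorflow as tf

for i in range(2):
    with tf.variable_scope('WeightSharedConvolutionalBoxPredictor',
                           reuse=tf.AUTO_REUSE):
        w = tf.get_variable('w', shape=[3, 3],
                            initializer=tf.zeros_initializer())
        # Both iterations print the same variable name:
        # WeightSharedConvolutionalBoxPredictor/w:0
        print(w.name)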
Example #6
def pad_input_data_to_static_shapes(tensor_dict, max_num_boxes, num_classes,
                                    spatial_image_shape=None):
  """Pads input tensors to static shapes.

  In case num_additional_channels > 0, we assume that the additional channels
  have already been concatenated to the base image.

  Args:
    tensor_dict: Tensor dictionary of input data.
    max_num_boxes: Max number of groundtruth boxes needed to compute shapes for
      padding.
    num_classes: Number of classes in the dataset needed to compute shapes for
      padding.
    spatial_image_shape: A list of two integers of the form [height, width]
      containing expected spatial shape of the image.

  Returns:
    A dictionary keyed by fields.InputDataFields containing the tensors from
    tensor_dict padded or clipped to static shapes.

  Raises:
    ValueError: If groundtruth classes is neither rank 1 nor rank 2, or if we
      detect that additional channels have not been concatenated yet.
  """

  if not spatial_image_shape or spatial_image_shape == [-1, -1]:
    height, width = None, None
  else:
    height, width = spatial_image_shape  # pylint: disable=unpacking-non-sequence

  num_additional_channels = 0
  if fields.InputDataFields.image_additional_channels in tensor_dict:
    num_additional_channels = shape_utils.get_dim_as_int(tensor_dict[
        fields.InputDataFields.image_additional_channels].shape[2])

  # We assume that if num_additional_channels > 0, then it has already been
  # concatenated to the base image (but not the ground truth).
  num_channels = 3
  if fields.InputDataFields.image in tensor_dict:
    num_channels = shape_utils.get_dim_as_int(
        tensor_dict[fields.InputDataFields.image].shape[2])

  if num_additional_channels:
    if num_additional_channels >= num_channels:
      raise ValueError(
          'Image must be already concatenated with additional channels.')

    if (fields.InputDataFields.original_image in tensor_dict and
        shape_utils.get_dim_as_int(
            tensor_dict[fields.InputDataFields.original_image].shape[2]) ==
        num_channels):
      raise ValueError(
          'Image must be already concatenated with additional channels.')

  padding_shapes = {
      fields.InputDataFields.image: [
          height, width, num_channels
      ],
      fields.InputDataFields.original_image_spatial_shape: [2],
      fields.InputDataFields.image_additional_channels: [
          height, width, num_additional_channels
      ],
      fields.InputDataFields.source_id: [],
      fields.InputDataFields.filename: [],
      fields.InputDataFields.key: [],
      fields.InputDataFields.groundtruth_difficult: [max_num_boxes],
      fields.InputDataFields.groundtruth_boxes: [max_num_boxes, 4],
      fields.InputDataFields.groundtruth_classes: [max_num_boxes, num_classes],
      fields.InputDataFields.groundtruth_instance_masks: [
          max_num_boxes, height, width
      ],
      fields.InputDataFields.groundtruth_is_crowd: [max_num_boxes],
      fields.InputDataFields.groundtruth_group_of: [max_num_boxes],
      fields.InputDataFields.groundtruth_area: [max_num_boxes],
      fields.InputDataFields.groundtruth_weights: [max_num_boxes],
      fields.InputDataFields.groundtruth_confidences: [
          max_num_boxes, num_classes
      ],
      fields.InputDataFields.num_groundtruth_boxes: [],
      fields.InputDataFields.groundtruth_label_types: [max_num_boxes],
      fields.InputDataFields.groundtruth_label_weights: [max_num_boxes],
      fields.InputDataFields.true_image_shape: [3],
      fields.InputDataFields.groundtruth_image_classes: [num_classes],
      fields.InputDataFields.groundtruth_image_confidences: [num_classes],
  }

  if fields.InputDataFields.original_image in tensor_dict:
    padding_shapes[fields.InputDataFields.original_image] = [
        height, width,
        shape_utils.get_dim_as_int(tensor_dict[fields.InputDataFields.
                                               original_image].shape[2])
    ]
  if fields.InputDataFields.groundtruth_keypoints in tensor_dict:
    tensor_shape = (
        tensor_dict[fields.InputDataFields.groundtruth_keypoints].shape)
    padding_shape = [max_num_boxes,
                     shape_utils.get_dim_as_int(tensor_shape[1]),
                     shape_utils.get_dim_as_int(tensor_shape[2])]
    padding_shapes[fields.InputDataFields.groundtruth_keypoints] = padding_shape
  if fields.InputDataFields.groundtruth_keypoint_visibilities in tensor_dict:
    tensor_shape = tensor_dict[fields.InputDataFields.
                               groundtruth_keypoint_visibilities].shape
    padding_shape = [max_num_boxes, shape_utils.get_dim_as_int(tensor_shape[1])]
    padding_shapes[fields.InputDataFields.
                   groundtruth_keypoint_visibilities] = padding_shape

  padded_tensor_dict = {}
  for tensor_name in tensor_dict:
    padded_tensor_dict[tensor_name] = shape_utils.pad_or_clip_nd(
        tensor_dict[tensor_name], padding_shapes[tensor_name])

  # Make sure that the number of groundtruth boxes now reflects the
  # padded/clipped tensors.
  if fields.InputDataFields.num_groundtruth_boxes in padded_tensor_dict:
    padded_tensor_dict[fields.InputDataFields.num_groundtruth_boxes] = (
        tf.minimum(
            padded_tensor_dict[fields.InputDataFields.num_groundtruth_boxes],
            max_num_boxes))
  return padded_tensor_dict
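
A minimal usage sketch with a hypothetical two-box input; it assumes `standard_fields` from `object_detection.core` for the dictionary keys:

import tensorflow as tf
from object_detection.core import standard_fields as fields

tensor_dict = {
    fields.InputDataFields.image:
        tf.zeros([416, 416, 3], dtype=tf.float32),
    fields.InputDataFields.groundtruth_boxes:
        tf.constant([[0.0, 0.0, 0.5, 0.5],
                     [0.5, 0.5, 1.0, 1.0]], dtype=tf.float32),
    fields.InputDataFields.groundtruth_classes:
        tf.one_hot([0, 1], depth=3),
    fields.InputDataFields.num_groundtruth_boxes:
        tf.constant(2, dtype=tf.int32),
}
padded = pad_input_data_to_static_shapes(
    tensor_dict, max_num_boxes=10, num_classes=3,
    spatial_image_shape=[416, 416])
# groundtruth_boxes is zero-padded from [2, 4] to the static [10, 4].
print(padded[fields.InputDataFields.groundtruth_boxes].shape)  # (10, 4)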
Example #7
    def _match(self, similarity_matrix, valid_rows):
        """Tries to match each column of the similarity matrix to a row.

        Args:
          similarity_matrix: tensor of shape [N, M] representing any similarity
            metric.
          valid_rows: a boolean tensor of shape [N] indicating valid rows.

        Returns:
          Match object with corresponding matches for each of M columns.
        """
        def _match_when_rows_are_empty():
            """Performs matching when the rows of similarity matrix are empty.

      When the rows are empty, all detections are false positives. So we return
      a tensor of -1's to indicate that the columns do not match to any rows.

      Returns:
        matches:  int32 tensor indicating the row each column matches to.
      """
            similarity_matrix_shape = shape_utils.combined_static_and_dynamic_shape(
                similarity_matrix)
            return -1 * tf.ones([similarity_matrix_shape[1]], dtype=tf.int32)

        def _match_when_rows_are_non_empty():
            """Performs matching when the rows of similarity matrix are non empty.

      Returns:
        matches:  int32 tensor indicating the row each column matches to.
      """
            # Matches for each column
            matches = tf.argmax(similarity_matrix, 0, output_type=tf.int32)

                # Deal with the matched and unmatched thresholds.
            if self._matched_threshold is not None:
                # Compute boolean indicators for columns below the unmatched
                # threshold and for columns between the two thresholds.
                matched_vals = tf.reduce_max(similarity_matrix, 0)
                below_unmatched_threshold = tf.greater(
                    self._unmatched_threshold, matched_vals)
                between_thresholds = tf.logical_and(
                    tf.greater_equal(matched_vals, self._unmatched_threshold),
                    tf.greater(self._matched_threshold, matched_vals))

                if self._negatives_lower_than_unmatched:
                    matches = self._set_values_using_indicator(
                        matches, below_unmatched_threshold, -1)
                    matches = self._set_values_using_indicator(
                        matches, between_thresholds, -2)
                else:
                    matches = self._set_values_using_indicator(
                        matches, below_unmatched_threshold, -2)
                    matches = self._set_values_using_indicator(
                        matches, between_thresholds, -1)

            if self._force_match_for_each_row:
                similarity_matrix_shape = shape_utils.combined_static_and_dynamic_shape(
                    similarity_matrix)
                force_match_column_ids = tf.argmax(similarity_matrix,
                                                   1,
                                                   output_type=tf.int32)
                force_match_column_indicators = (
                    tf.one_hot(force_match_column_ids,
                               depth=similarity_matrix_shape[1]) *
                    tf.cast(tf.expand_dims(valid_rows, axis=-1),
                            dtype=tf.float32))
                force_match_row_ids = tf.argmax(force_match_column_indicators,
                                                0,
                                                output_type=tf.int32)
                force_match_column_mask = tf.cast(
                    tf.reduce_max(force_match_column_indicators, 0), tf.bool)
                final_matches = tf.where(force_match_column_mask,
                                         force_match_row_ids, matches)
                return final_matches
            else:
                return matches

        if similarity_matrix.shape.is_fully_defined():
            if shape_utils.get_dim_as_int(similarity_matrix.shape[0]) == 0:
                return _match_when_rows_are_empty()
            else:
                return _match_when_rows_are_non_empty()
        else:
            return tf.cond(tf.greater(tf.shape(similarity_matrix)[0],
                                      0), _match_when_rows_are_non_empty,
                           _match_when_rows_are_empty)
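
A minimal usage sketch through the public wrapper, assuming `ArgMaxMatcher` from `object_detection.matchers.argmax_matcher` and that its public `match` method forwards `valid_rows` to `_match`; the similarity values are hypothetical:

import tensorflow as tf
from object_detection.matchers import argmax_matcher

matcher = argmax_matcher.ArgMaxMatcher(matched_threshold=0.5,
                                       unmatched_threshold=0.3)
# 2 rows (e.g. groundtruth boxes) x 3 columns (e.g. anchors).
similarity = tf.constant([[0.9, 0.4, 0.1],
                          [0.2, 0.6, 0.0]])
match = matcher.match(similarity, valid_rows=tf.constant([True, True]))
# Expected match results per column: [0, 1, -1] -- column 0 matches row 0,
# column 1 matches row 1, and column 2 falls below the unmatched threshold.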