Example #1
  def test_check_min_image_dim_static_shape(self):
    input_tensor = tf.constant(np.zeros([1, 42, 42, 3]))
    _ = shape_utils.check_min_image_dim(33, input_tensor)

    with self.assertRaisesRegexp(
        ValueError, 'image size must be >= 64 in both height and width.'):
      _ = shape_utils.check_min_image_dim(64, input_tensor)
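
The two code paths that this test (and the dynamic-shape variant in Example #3 below) exercise come from shape_utils.check_min_image_dim: with a fully static shape the check fails at graph construction time with a ValueError, while with unknown height/width it returns the input wrapped in a runtime tf.Assert, which surfaces as tf.errors.InvalidArgumentError in a session. A sketch of that behavior, written from the tests rather than copied from the utility:

def check_min_image_dim_sketch(min_dim, image_tensor):
  """Checks that a [batch, height, width, channels] image is large enough."""
  height, width = image_tensor.get_shape().as_list()[1:3]
  if height is None or width is None:
    # Dynamic shape: defer the check to graph execution time.
    assert_op = tf.Assert(
        tf.logical_and(tf.greater_equal(tf.shape(image_tensor)[1], min_dim),
                       tf.greater_equal(tf.shape(image_tensor)[2], min_dim)),
        ['image size must be >= {} in both height and width.'.format(min_dim)])
    with tf.control_dependencies([assert_op]):
      return tf.identity(image_tensor)
  # Static shape: fail immediately at graph construction time.
  if height < min_dim or width < min_dim:
    raise ValueError(
        'image size must be >= {} in both height and width.'.format(min_dim))
  return image_tensor
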
  def extract_features(self, preprocessed_inputs):
    """Extract features from preprocessed inputs.

    Args:
      preprocessed_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.

    Returns:
      feature_maps: a list of tensors where the ith tensor has shape
        [batch, height_i, width_i, depth_i]

    Raises:
      ValueError: depth multiplier is not supported.
    """
    if self._depth_multiplier != 1.0:
      raise ValueError('Depth multiplier not supported.')

    preprocessed_inputs = shape_utils.check_min_image_dim(
        129, preprocessed_inputs)

    with tf.variable_scope(
        self._resnet_scope_name, reuse=self._reuse_weights) as scope:
      with slim.arg_scope(resnet_v1.resnet_arg_scope()):
        with (slim.arg_scope(self._conv_hyperparams_fn())
              if self._override_base_feature_extractor_hyperparams else
              context_manager.IdentityContextManager()):
          _, image_features = self._resnet_base_fn(
              inputs=ops.pad_to_multiple(preprocessed_inputs,
                                         self._pad_to_multiple),
              num_classes=None,
              is_training=None,
              global_pool=False,
              output_stride=None,
              store_non_strided_activations=True,
              scope=scope)
          image_features = self._filter_features(image_features)
      with slim.arg_scope(self._conv_hyperparams_fn()):
        with tf.variable_scope(self._fpn_scope_name,
                               reuse=self._reuse_weights):
          fpn_features = feature_map_generators.fpn_top_down_feature_maps(
              [(key, image_features[key])
               for key in ['block2', 'block3', 'block4']],
              depth=256)
          last_feature_map = fpn_features['top_down_block4']
          coarse_features = {}
          for i in range(5, 7):
            last_feature_map = slim.conv2d(
                last_feature_map,
                num_outputs=256,
                kernel_size=[3, 3],
                stride=2,
                padding='SAME',
                scope='bottom_up_block{}'.format(i))
            coarse_features['bottom_up_block{}'.format(i)] = last_feature_map
    return [fpn_features['top_down_block2'],
            fpn_features['top_down_block3'],
            fpn_features['top_down_block4'],
            coarse_features['bottom_up_block5'],
            coarse_features['bottom_up_block6']]
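
Almost every extractor in this listing first routes its input through ops.pad_to_multiple so that the backbone's repeated stride-2 downsampling divides the spatial dimensions evenly. A minimal sketch of such a helper, assuming BHWC input and dynamic shapes (the real utility in object_detection/utils/ops.py also validates rank and handles fully static shapes):

def pad_to_multiple_sketch(tensor, multiple):
  """Zero-pads height and width up to the next multiple of `multiple`."""
  if multiple == 1:
    return tensor
  shape = tf.shape(tensor)
  height, width = shape[1], shape[2]
  # Amount needed to round each dimension up, using integer arithmetic.
  pad_h = (multiple - height % multiple) % multiple
  pad_w = (multiple - width % multiple) % multiple
  return tf.pad(tensor, [[0, 0], [0, pad_h], [0, pad_w], [0, 0]])
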
Example #3
  def test_check_min_image_dim_dynamic_shape(self):
    input_placeholder = tf.placeholder(tf.float32, shape=[1, None, None, 3])
    image_tensor = shape_utils.check_min_image_dim(33, input_placeholder)

    with self.test_session() as sess:
      sess.run(image_tensor,
               feed_dict={input_placeholder: np.zeros([1, 42, 42, 3])})
      with self.assertRaises(tf.errors.InvalidArgumentError):
        sess.run(image_tensor,
                 feed_dict={input_placeholder: np.zeros([1, 32, 32, 3])})
  def extract_features(self, preprocessed_inputs):
    """Extract features from preprocessed inputs.

    Args:
      preprocessed_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.

    Returns:
      feature_maps: a list of tensors where the ith tensor has shape
        [batch, height_i, width_i, depth_i]
    """
    preprocessed_inputs = shape_utils.check_min_image_dim(
        33, preprocessed_inputs)

    feature_map_layout = {
        'from_layer': ['layer_15/expansion_output', 'layer_19', '', '', '', ''],
        'layer_depth': [-1, -1, 512, 256, 256, 128],
        'use_depthwise': self._use_depthwise,
        'use_explicit_padding': self._use_explicit_padding,
    }

    with tf.variable_scope('MobilenetV2', reuse=self._reuse_weights) as scope:
      with slim.arg_scope(
          mobilenet_v2.training_scope(is_training=None, bn_decay=0.9997)), \
          slim.arg_scope(
              [mobilenet.depth_multiplier], min_depth=self._min_depth):
        with (slim.arg_scope(self._conv_hyperparams_fn())
              if self._override_base_feature_extractor_hyperparams else
              context_manager.IdentityContextManager()):
          # TODO(b/68150321): Enable fused batch norm once quantization
          # supports it.
          with slim.arg_scope([slim.batch_norm], fused=False):
            _, image_features = mobilenet_v2.mobilenet_base(
                ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple),
                final_endpoint='layer_19',
                depth_multiplier=self._depth_multiplier,
                use_explicit_padding=self._use_explicit_padding,
                scope=scope)
        with slim.arg_scope(self._conv_hyperparams_fn()):
          # TODO(b/68150321): Enable fused batch norm once quantization
          # supports it.
          with slim.arg_scope([slim.batch_norm], fused=False):
            feature_maps = feature_map_generators.multi_resolution_feature_maps(
                feature_map_layout=feature_map_layout,
                depth_multiplier=self._depth_multiplier,
                min_depth=self._min_depth,
                insert_1x1_conv=True,
                image_features=image_features)

    return feature_maps.values()
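
For readers new to the API, the feature_map_layout dict above drives feature_map_generators.multi_resolution_feature_maps. The annotation below restates the same layout with the documented meaning of each field; the comments are mine, not part of the source:

feature_map_layout = {
    # A non-empty name reuses that endpoint from the base network; an empty
    # string means "synthesize a new map from the previous one with a
    # stride-2 convolution" (preceded by a 1x1 conv when insert_1x1_conv=True).
    'from_layer': ['layer_15/expansion_output', 'layer_19', '', '', '', ''],
    # -1 keeps the endpoint's native depth; positive values set the depth of
    # the synthesized maps.
    'layer_depth': [-1, -1, 512, 256, 256, 128],
    'use_depthwise': self._use_depthwise,
    'use_explicit_padding': self._use_explicit_padding,
}
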
  def _extract_features(self, preprocessed_inputs):
    """Extract features from preprocessed inputs.

    Args:
      preprocessed_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.

    Returns:
      feature_maps: a list of tensors where the ith tensor has shape
        [batch, height_i, width_i, depth_i]
    """
    preprocessed_inputs = shape_utils.check_min_image_dim(
        33, preprocessed_inputs)

    feature_map_layout = {
        'from_layer': ['Conv2d_11_pointwise', 'Conv2d_13_pointwise', '', '',
                       '', ''],
        'layer_depth': [-1, -1, 512, 256, 256, 128],
        'use_explicit_padding': self._use_explicit_padding,
        'use_depthwise': self._use_depthwise,
    }

    with tf.variable_scope('MobilenetV1',
                           reuse=self._reuse_weights) as scope:
      with slim.arg_scope(
          mobilenet_v1.mobilenet_v1_arg_scope(
              is_training=(self._batch_norm_trainable and self._is_training))):
        # TODO(skligys): Enable fused batch norm once quantization supports it.
        with slim.arg_scope([slim.batch_norm], fused=False):
          _, image_features = mobilenet_v1.mobilenet_v1_base(
              ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple),
              final_endpoint='Conv2d_13_pointwise',
              min_depth=self._min_depth,
              depth_multiplier=self._depth_multiplier,
              use_explicit_padding=self._use_explicit_padding,
              scope=scope)
      with slim.arg_scope(self._conv_hyperparams):
        # TODO(skligys): Enable fused batch norm once quantization supports it.
        with slim.arg_scope([slim.batch_norm], fused=False):
          feature_maps = feature_map_generators.multi_resolution_feature_maps(
              feature_map_layout=feature_map_layout,
              depth_multiplier=self._depth_multiplier,
              min_depth=self._min_depth,
              insert_1x1_conv=True,
              image_features=image_features)

    return feature_maps.values()
  def extract_features(self, preprocessed_inputs):
    """Extract features from preprocessed inputs.

    Args:
      preprocessed_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.

    Returns:
      feature_maps: a list of tensors where the ith tensor has shape
        [batch, height_i, width_i, depth_i]

    Raises:
      ValueError: depth multiplier is not supported.
    """
    if self._depth_multiplier != 1.0:
      raise ValueError('Depth multiplier not supported.')

    preprocessed_inputs = shape_utils.check_min_image_dim(
        129, preprocessed_inputs)

    with tf.variable_scope(
        self._resnet_scope_name, reuse=self._reuse_weights) as scope:
      with slim.arg_scope(resnet_v1.resnet_arg_scope()):
        with (slim.arg_scope(self._conv_hyperparams_fn())
              if self._override_base_feature_extractor_hyperparams else
              context_manager.IdentityContextManager()):
          with slim.arg_scope(
              [resnet_v1.bottleneck],
              use_bounded_activations=self._use_bounded_activations):
            _, activations = self._resnet_base_fn(
                inputs=ops.pad_to_multiple(preprocessed_inputs,
                                           self._pad_to_multiple),
                num_classes=None,
                is_training=None,
                global_pool=False,
                output_stride=None,
                store_non_strided_activations=True,
                scope=scope)

      with slim.arg_scope(self._conv_hyperparams_fn()):
        feature_maps = feature_map_generators.pooling_pyramid_feature_maps(
            base_feature_map_depth=self._base_feature_map_depth,
            num_layers=self._num_layers,
            image_features={
                'image_features': self._filter_features(activations)['block3']
            })
    return feature_maps.values()
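
pooling_pyramid_feature_maps builds its pyramid with at most one new trainable layer: an optional 1x1 projection, after which each level is just a downsampled copy of the previous one. A rough sketch of that behavior, inferred from the generator's name and arguments rather than copied from it:

def pooling_pyramid_sketch(base_feature_map_depth, num_layers, feature_map):
  if base_feature_map_depth > 0:
    # Optional 1x1 projection to a fixed depth.
    feature_map = slim.conv2d(feature_map, base_feature_map_depth, [1, 1],
                              scope='Base_Conv2d_1x1')
  feature_maps = [feature_map]
  for i in range(num_layers - 1):
    # Each subsequent level is a stride-2 max pool of the previous level.
    feature_map = slim.max_pool2d(feature_map, [2, 2], stride=2,
                                  padding='SAME',
                                  scope='MaxPool2d_{}'.format(i))
    feature_maps.append(feature_map)
  return feature_maps
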
  def extract_features(self, preprocessed_inputs):
    """Extract features from preprocessed inputs.

    Args:
      preprocessed_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.

    Returns:
      feature_maps: a list of tensors where the ith tensor has shape
        [batch, height_i, width_i, depth_i]
    """
    preprocessed_inputs = shape_utils.check_min_image_dim(
        33, preprocessed_inputs)

    feature_map_layout = {
        'from_layer': ['Conv2d_11_pointwise', 'Conv2d_13_pointwise', '', '',
                       '', ''],
        'layer_depth': [-1, -1, 512, 256, 256, 128],
        'use_explicit_padding': self._use_explicit_padding,
        'use_depthwise': self._use_depthwise,
    }

    with tf.variable_scope('MobilenetV1',
                           reuse=self._reuse_weights) as scope:
      with slim.arg_scope(
          mobilenet_v1.mobilenet_v1_arg_scope(
              is_training=None, regularize_depthwise=True)):
        with (slim.arg_scope(self._conv_hyperparams_fn())
              if self._override_base_feature_extractor_hyperparams
              else context_manager.IdentityContextManager()):
          _, image_features = mobilenet_v1.mobilenet_v1_base(
              ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple),
              final_endpoint='Conv2d_13_pointwise',
              min_depth=self._min_depth,
              depth_multiplier=self._depth_multiplier,
              use_explicit_padding=self._use_explicit_padding,
              scope=scope)
      with slim.arg_scope(self._conv_hyperparams_fn()):
        feature_maps = feature_map_generators.multi_resolution_feature_maps(
            feature_map_layout=feature_map_layout,
            depth_multiplier=self._depth_multiplier,
            min_depth=self._min_depth,
            insert_1x1_conv=True,
            image_features=image_features)

    return feature_maps.values()
  def _extract_proposal_features(self, preprocessed_inputs, scope):
    """Extracts first stage RPN features.

    Args:
      preprocessed_inputs: A [batch, height, width, channels] float32 tensor
        representing a batch of images.
      scope: A scope name.

    Returns:
      rpn_feature_map: A tensor with shape [batch, height, width, depth]
      activations: A dictionary mapping feature extractor tensor names to
        tensors

    Raises:
      InvalidArgumentError: If the spatial size of `preprocessed_inputs`
        (height or width) is less than 33.
      ValueError: If the created network is missing the required activation.
    """

    preprocessed_inputs.get_shape().assert_has_rank(4)
    preprocessed_inputs = shape_utils.check_min_image_dim(
        min_dim=33, image_tensor=preprocessed_inputs)

    with slim.arg_scope(
        mobilenet_v1.mobilenet_v1_arg_scope(
            is_training=self._train_batch_norm,
            weight_decay=self._weight_decay)):
      with tf.variable_scope('MobilenetV1',
                             reuse=self._reuse_weights) as scope:
        params = {}
        if self._skip_last_stride:
          params['conv_defs'] = _get_mobilenet_conv_no_last_stride_defs(
              conv_depth_ratio_in_percentage=(
                  self._conv_depth_ratio_in_percentage))
        _, activations = mobilenet_v1.mobilenet_v1_base(
            preprocessed_inputs,
            final_endpoint='Conv2d_11_pointwise',
            min_depth=self._min_depth,
            depth_multiplier=self._depth_multiplier,
            scope=scope,
            **params)
    return activations['Conv2d_11_pointwise'], activations
  def _extract_features(self, preprocessed_inputs):
    """Extract features from preprocessed inputs.

    Args:
      preprocessed_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.

    Returns:
      feature_maps: a list of tensors where the ith tensor has shape
        [batch, height_i, width_i, depth_i]
    """
    preprocessed_inputs = shape_utils.check_min_image_dim(
        33, preprocessed_inputs)

    image_features = self.mobilenet_v2(
        ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple))

    feature_maps = self.feature_map_generator({
        'layer_15/expansion_output': image_features[0],
        'layer_19': image_features[1]})

    return feature_maps.values()
  def extract_features(self, preprocessed_inputs):
    """Extract features from preprocessed inputs.

    Args:
      preprocessed_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.

    Returns:
      feature_maps: a list of tensors where the ith tensor has shape
        [batch, height_i, width_i, depth_i]
    """
    preprocessed_inputs = shape_utils.check_min_image_dim(
        33, preprocessed_inputs)

    feature_map_layout = {
        'from_layer': ['Mixed_4c', 'Mixed_5c', '', '', '', ''],
        'layer_depth': [-1, -1, 512, 256, 256, 128],
        'use_explicit_padding': self._use_explicit_padding,
        'use_depthwise': self._use_depthwise,
    }

    with slim.arg_scope(self._conv_hyperparams_fn()):
      with tf.variable_scope('InceptionV2',
                             reuse=self._reuse_weights) as scope:
        _, image_features = inception_v2.inception_v2_base(
            ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple),
            final_endpoint='Mixed_5c',
            min_depth=self._min_depth,
            depth_multiplier=self._depth_multiplier,
            scope=scope)
        feature_maps = feature_map_generators.multi_resolution_feature_maps(
            feature_map_layout=feature_map_layout,
            depth_multiplier=self._depth_multiplier,
            min_depth=self._min_depth,
            insert_1x1_conv=True,
            image_features=image_features)

    return feature_maps.values()
Example #11
  def _extract_features(self, preprocessed_inputs):
    """Extract features from preprocessed inputs.

    Args:
      preprocessed_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.

    Returns:
      feature_maps: a list of tensors where the ith tensor has shape
        [batch, height_i, width_i, depth_i]
    """
    preprocessed_inputs = shape_utils.check_min_image_dim(
        33, preprocessed_inputs)

    feature_map_layout = {
        'from_layer': ['Mixed_5d', 'Mixed_6e', 'Mixed_7c', '', '', ''],
        'layer_depth': [-1, -1, -1, 512, 256, 128],
        'use_explicit_padding': self._use_explicit_padding,
        'use_depthwise': self._use_depthwise,
    }

    with slim.arg_scope(self._conv_hyperparams):
      with tf.variable_scope('InceptionV3', reuse=self._reuse_weights) as scope:
        _, image_features = inception_v3.inception_v3_base(
            ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple),
            final_endpoint='Mixed_7c',
            min_depth=self._min_depth,
            depth_multiplier=self._depth_multiplier,
            scope=scope)
        feature_maps = feature_map_generators.multi_resolution_feature_maps(
            feature_map_layout=feature_map_layout,
            depth_multiplier=self._depth_multiplier,
            min_depth=self._min_depth,
            insert_1x1_conv=True,
            image_features=image_features)

    return feature_maps.values()
  def extract_features(self, preprocessed_inputs):
    """Extract features from preprocessed inputs.

    Args:
      preprocessed_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.

    Returns:
      feature_maps: a list of tensors where the ith tensor has shape
        [batch, height_i, width_i, depth_i]
    """
    preprocessed_inputs = shape_utils.check_min_image_dim(
        33, preprocessed_inputs)

    with tf.variable_scope('MobilenetV1',
                           reuse=self._reuse_weights) as scope:
      with slim.arg_scope(
          mobilenet_v1.mobilenet_v1_arg_scope(
              is_training=None, regularize_depthwise=True)):
        with (slim.arg_scope(self._conv_hyperparams_fn())
              if self._override_base_feature_extractor_hyperparams
              else context_manager.IdentityContextManager()):
          _, image_features = mobilenet_v1.mobilenet_v1_base(
              ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple),
              final_endpoint='Conv2d_13_pointwise',
              min_depth=self._min_depth,
              depth_multiplier=self._depth_multiplier,
              use_explicit_padding=self._use_explicit_padding,
              scope=scope)
      with slim.arg_scope(self._conv_hyperparams_fn()):
        feature_maps = feature_map_generators.pooling_pyramid_feature_maps(
            base_feature_map_depth=0,
            num_layers=6,
            image_features={
                'image_features': image_features['Conv2d_11_pointwise']
            })
    return feature_maps.values()
    def _extract_proposal_features(self, preprocessed_inputs, scope):
        """Extracts first stage RPN features.

    Args:
      preprocessed_inputs: A [batch, height, width, channels] float32 tensor
        representing a batch of images.
      scope: A scope name.

    Returns:
      rpn_feature_map: A tensor with shape [batch, height, width, depth]
      activations: A dictionary mapping feature extractor tensor names to
        tensors

    Raises:
      InvalidArgumentError: If the spatial size of `preprocessed_inputs`
        (height or width) is less than 33.
      ValueError: If the created network is missing the required activation.
    """

        preprocessed_inputs = shape_utils.check_min_image_dim(
            33, preprocessed_inputs)

        with tf.variable_scope('MobilenetV3',
                               reuse=self._reuse_weights) as scope:
            with slim.arg_scope(
                mobilenet_v3.training_scope(is_training=None, bn_decay=0.9997)), \
                slim.arg_scope(
                    [mobilenet.depth_multiplier], min_depth=self._min_depth):
                _, activations = mobilenet_v3.mobilenet_base(
                    preprocessed_inputs,
                    conv_defs=mobilenet_v3.V3_LARGE_DETECTION,
                    final_endpoint='layer_17',
                    depth_multiplier=self._depth_multiplier,
                    scope=scope)

        return activations['layer_17'], activations
Example #14
  def extract_features(self, preprocessed_inputs):
    """Extract features from preprocessed inputs.

    Args:
      preprocessed_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.

    Returns:
      feature_maps: a list of tensors where the ith tensor has shape
        [batch, height_i, width_i, depth_i]
    """
    preprocessed_inputs = shape_utils.check_min_image_dim(
        33, preprocessed_inputs)
    with tf.variable_scope('MobilenetV2', reuse=self._reuse_weights) as scope:
      with slim.arg_scope(
          mobilenet_v2.training_scope(is_training=None, bn_decay=0.99)), \
          slim.arg_scope(
              [mobilenet.depth_multiplier], min_depth=self._min_depth):
        with slim.arg_scope(
            training_scope(l2_weight_decay=4e-5,
                           is_training=self._is_training)):

          _, image_features = mobilenet_v2.mobilenet_base(
              ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple),
              final_endpoint='layer_18',
              depth_multiplier=self._depth_multiplier,
              use_explicit_padding=self._use_explicit_padding,
              scope=scope)

    multiplier_func = functools.partial(
        _apply_multiplier,
        multiplier=self._depth_multiplier,
        min_depth=self._min_depth)
    with tf.variable_scope('MnasFPN', reuse=self._reuse_weights):
      with slim.arg_scope(
          training_scope(l2_weight_decay=1e-4, is_training=self._is_training)):
        # Create C6 by downsampling C5.
        c6 = slim.max_pool2d(
            _maybe_pad(image_features['layer_18'], self._use_explicit_padding),
            [3, 3],
            stride=[2, 2],
            padding='VALID' if self._use_explicit_padding else 'SAME',
            scope='C6_downsample')
        c6 = slim.conv2d(
            c6,
            multiplier_func(self._fpn_layer_depth),
            [1, 1],
            activation_fn=tf.identity,
            normalizer_fn=slim.batch_norm,
            weights_regularizer=None,  # this 1x1 has no kernel regularizer.
            padding='VALID',
            scope='C6_Conv1x1')
        image_features['C6'] = tf.identity(c6)  # Needed for quantization.
        for k in sorted(image_features.keys()):
          tf.logging.error('{}: {}'.format(k, image_features[k]))

        mnasfpn_inputs = [
            image_features['layer_7'],  # C3
            image_features['layer_14'],  # C4
            image_features['layer_18'],  # C5
            image_features['C6']  # C6
        ]
        self._verify_config(mnasfpn_inputs)
        feature_maps = mnasfpn(
            mnasfpn_inputs,
            head_def=self._head_def,
            output_channel=self._fpn_layer_depth,
            use_explicit_padding=self._use_explicit_padding,
            use_native_resize_op=self._use_native_resize_op,
            multiplier_func=multiplier_func)
    return feature_maps
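
_maybe_pad is defined elsewhere in the MnasFPN extractor file. Judging from the call site above (explicit padding pairs with padding='VALID'), it most plausibly reduces to something like this sketch; the kernel_size default is an assumption:

def _maybe_pad_sketch(feature, use_explicit_padding, kernel_size=3):
  # With explicit padding the following op runs with padding='VALID', so the
  # input is padded by hand first; otherwise it passes through untouched.
  return (ops.fixed_padding(feature, kernel_size)
          if use_explicit_padding else feature)
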
    def extract_features(self, preprocessed_inputs):
        """Extract features from preprocessed inputs.

    Args:
      preprocessed_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.

    Returns:
      feature_maps: a list of tensors where the ith tensor has shape
        [batch, height_i, width_i, depth_i]
    """
        preprocessed_inputs = shape_utils.check_min_image_dim(
            33, preprocessed_inputs)

        depth = lambda d: max(int(d * self._depth_multiplier), self._min_depth)
        trunc_normal = lambda stddev: tf.truncated_normal_initializer(
            0.0, stddev)

        # Add a convolutional autoencoder in front of the detection backbone.
        encoder_1_conv = slim.conv2d(preprocessed_inputs,
                                     depth(64), [3, 3],
                                     weights_initializer=trunc_normal(0.09),
                                     scope='encoder_c1_conv')
        #encoder_1_pool = slim.max_pool2d(encoder_1_conv, [2, 2], stride=2,
        #                                    scope='encoder_c1_pool')
        #encoder_1_dropout1 = slim.dropout(encoder_1_pool, 0.7, scope='encoder_c1_dropout1')
        encoder_2_conv = slim.conv2d(encoder_1_conv,
                                     depth(128), [5, 5],
                                     weights_initializer=trunc_normal(0.09),
                                     scope='encoder_c2_conv')
        encoder_3_conv = slim.conv2d(encoder_2_conv,
                                     depth(128), [5, 5],
                                     weights_initializer=trunc_normal(0.09),
                                     scope='encoder_c3_conv')
        # Decoder: mirror the encoder with transposed convolutions.
        decoder_3_deconv = slim.conv2d_transpose(
            encoder_3_conv,
            depth(128), [5, 5],
            weights_initializer=trunc_normal(0.09),
            scope='decoder_c3_deconv')
        decoder_2_deconv = slim.conv2d_transpose(
            decoder_3_deconv,
            depth(128), [5, 5],
            weights_initializer=trunc_normal(0.09),
            scope='decoder_c2_deconv')
        decoder_1_deconv = slim.conv2d_transpose(
            decoder_2_deconv,
            depth(1), [3, 3],
            weights_initializer=trunc_normal(0.09),
            scope='decoder_c1_deconv')

        feature_map_layout = {
            'from_layer': ['Mixed_4c', 'Mixed_5c', '', '', '', ''],
            'layer_depth': [-1, -1, 512, 256, 256, 128],
            'use_explicit_padding': self._use_explicit_padding,
            'use_depthwise': self._use_depthwise,
        }

        with slim.arg_scope(self._conv_hyperparams_fn()):
            with tf.variable_scope('InceptionV2',
                                   reuse=self._reuse_weights) as scope:
                _, image_features = inception_v2.inception_v2_base(
                    ops.pad_to_multiple(decoder_1_deconv,
                                        self._pad_to_multiple),
                    final_endpoint='Mixed_5c',
                    min_depth=self._min_depth,
                    depth_multiplier=self._depth_multiplier,
                    scope=scope)
                feature_maps = feature_map_generators.multi_resolution_feature_maps(
                    feature_map_layout=feature_map_layout,
                    depth_multiplier=self._depth_multiplier,
                    min_depth=self._min_depth,
                    insert_1x1_conv=True,
                    image_features=image_features)

        return feature_maps.values()
    def extract_features(self, preprocessed_inputs):
        """Extract features from preprocessed inputs.

    Args:
      preprocessed_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.

    Returns:
      feature_maps: a list of tensors where the ith tensor has shape
        [batch, height_i, width_i, depth_i]
    """
        preprocessed_inputs = shape_utils.check_min_image_dim(
            33, preprocessed_inputs)

        with tf.variable_scope('MobilenetV1',
                               reuse=self._reuse_weights) as scope:
            with slim.arg_scope(
                    mobilenet_v1.mobilenet_v1_arg_scope(
                        is_training=None, regularize_depthwise=True)):
                with (slim.arg_scope(self._conv_hyperparams_fn())
                      if self._override_base_feature_extractor_hyperparams else
                      context_manager.IdentityContextManager()):
                    _, image_features = mobilenet_v1.mobilenet_v1_base(
                        ops.pad_to_multiple(preprocessed_inputs,
                                            self._pad_to_multiple),
                        final_endpoint='Conv2d_13_pointwise',
                        min_depth=self._min_depth,
                        depth_multiplier=self._depth_multiplier,
                        use_explicit_padding=self._use_explicit_padding,
                        scope=scope)

            depth_fn = lambda d: max(
                int(d * self._depth_multiplier), self._min_depth)
            with slim.arg_scope(self._conv_hyperparams_fn()):
                with tf.variable_scope('fpn', reuse=self._reuse_weights):
                    feature_blocks = [
                        'Conv2d_3_pointwise', 'Conv2d_5_pointwise',
                        'Conv2d_11_pointwise', 'Conv2d_13_pointwise'
                    ]
                    base_fpn_max_level = min(self._fpn_max_level, 5)
                    feature_block_list = []
                    for level in range(self._fpn_min_level,
                                       base_fpn_max_level + 1):
                        feature_block_list.append(feature_blocks[level - 2])
                    fpn_features = feature_map_generators.fpn_top_down_feature_maps(
                        [(key, image_features[key])
                         for key in feature_block_list],
                        depth=depth_fn(256))
                    feature_maps = []
                    for level in range(self._fpn_min_level,
                                       base_fpn_max_level + 1):
                        feature_maps.append(fpn_features['top_down_{}'.format(
                            feature_blocks[level - 2])])
                    last_feature_map = fpn_features['top_down_{}'.format(
                        feature_blocks[base_fpn_max_level - 2])]
                    # Construct coarse features
                    for i in range(base_fpn_max_level + 1,
                                   self._fpn_max_level + 1):
                        last_feature_map = slim.conv2d(
                            last_feature_map,
                            num_outputs=depth_fn(256),
                            kernel_size=[3, 3],
                            stride=2,
                            padding='SAME',
                            scope='bottom_up_Conv2d_{}'.format(
                                i - base_fpn_max_level + 13))
                        feature_maps.append(last_feature_map)
        return feature_maps
    def extract_features(self, preprocessed_inputs):
        """Extract features from preprocessed inputs.

    Args:
      preprocessed_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.

    Returns:
      feature_maps: a list of tensors where the ith tensor has shape
        [batch, height_i, width_i, depth_i]

    Raises:
      ValueError: depth multiplier is not supported.
    """
        if self._depth_multiplier != 1.0:
            raise ValueError('Depth multiplier not supported.')

        preprocessed_inputs = shape_utils.check_min_image_dim(
            129, preprocessed_inputs)

        with tf.variable_scope(self._resnet_scope_name,
                               reuse=self._reuse_weights) as scope:
            with slim.arg_scope(resnet_v1.resnet_arg_scope()):
                with (slim.arg_scope(self._conv_hyperparams_fn())
                      if self._override_base_feature_extractor_hyperparams else
                      context_manager.IdentityContextManager()):
                    _, image_features = self._resnet_base_fn(
                        inputs=ops.pad_to_multiple(preprocessed_inputs,
                                                   self._pad_to_multiple),
                        num_classes=None,
                        is_training=None,
                        global_pool=False,
                        output_stride=None,
                        store_non_strided_activations=True,
                        scope=scope)
                    image_features = self._filter_features(image_features)
            with slim.arg_scope(self._conv_hyperparams_fn()):
                with tf.variable_scope(self._fpn_scope_name,
                                       reuse=self._reuse_weights):
                    fpn_features = feature_map_generators.fpn_top_down_feature_maps(
                        [(key, image_features[key])
                         for key in ['block2', 'block3', 'block4']],
                        depth=256)
                    last_feature_map = fpn_features['top_down_block4']
                    coarse_features = {}
                    for i in range(5, 7):
                        last_feature_map = slim.conv2d(
                            last_feature_map,
                            num_outputs=256,
                            kernel_size=[3, 3],
                            stride=2,
                            padding='SAME',
                            scope='bottom_up_block{}'.format(i))
                        coarse_features['bottom_up_block{}'.format(
                            i)] = last_feature_map
        return [
            fpn_features['top_down_block2'], fpn_features['top_down_block3'],
            fpn_features['top_down_block4'],
            coarse_features['bottom_up_block5'],
            coarse_features['bottom_up_block6']
        ]
    def extract_features(self, preprocessed_inputs):
        """Extract features from preprocessed inputs.

    Args:
      preprocessed_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.

    Returns:
      feature_maps: a list of tensors where the ith tensor has shape
        [batch, height_i, width_i, depth_i]

    Raises:
      ValueError: if conv_defs is not provided or from_layer does not meet the
        size requirement.
    """

        if not self._conv_defs:
            raise ValueError('Must provide backbone conv defs.')

        if len(self._from_layer) != 2:
            raise ValueError('SSD input feature names are not provided.')

        preprocessed_inputs = shape_utils.check_min_image_dim(
            33, preprocessed_inputs)

        feature_map_layout = {
            'from_layer': [self._from_layer[0], self._from_layer[1],
                           '', '', '', ''],
            # Reduced depths; the stock SSD layout here is
            # [-1, -1, 512, 256, 256, 128].
            'layer_depth': [-1, -1, 128, 128, 128, 128],
            'use_depthwise': self._use_depthwise,
            'use_explicit_padding': self._use_explicit_padding,
        }

        with tf.variable_scope(self._scope_name,
                               reuse=self._reuse_weights) as scope:
            with slim.arg_scope(
                mobilenet_v3.training_scope(is_training=None, bn_decay=0.9997)), \
                slim.arg_scope(
                    [mobilenet.depth_multiplier], min_depth=self._min_depth):
                with (slim.arg_scope(self._conv_hyperparams_fn())
                      if self._override_base_feature_extractor_hyperparams else
                      context_manager.IdentityContextManager()):
                    _, image_features = mobilenet_v3.mobilenet_base(
                        ops.pad_to_multiple(preprocessed_inputs,
                                            self._pad_to_multiple),
                        conv_defs=self._conv_defs,
                        final_endpoint=self._from_layer[1],
                        depth_multiplier=self._depth_multiplier,
                        use_explicit_padding=self._use_explicit_padding,
                        scope=scope)
                with slim.arg_scope(self._conv_hyperparams_fn()):
                    feature_maps = feature_map_generators.multi_resolution_feature_maps(
                        feature_map_layout=feature_map_layout,
                        depth_multiplier=self._depth_multiplier,
                        min_depth=self._min_depth,
                        insert_1x1_conv=True,
                        image_features=image_features)

        return feature_maps.values()
Example #19
    def extract_features(self, preprocessed_inputs):
        """Extract features from preprocessed inputs.

    Args:
      preprocessed_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.

    Returns:
      feature_maps: a list of tensors where the ith tensor has shape
        [batch, height_i, width_i, depth_i]

    Raises:
      ValueError: depth multiplier is not supported.
    """
        if self._depth_multiplier != 1.0:
            raise ValueError('Depth multiplier not supported.')

        preprocessed_inputs = shape_utils.check_min_image_dim(
            129, preprocessed_inputs)

        with tf.variable_scope(self._resnet_scope_name,
                               reuse=self._reuse_weights) as scope:
            with slim.arg_scope(resnet_v2.resnet_arg_scope()):
                with (slim.arg_scope(self._conv_hyperparams_fn())
                      if self._override_base_feature_extractor_hyperparams else
                      context_manager.IdentityContextManager()):
                    _, image_features = self._resnet_base_fn(
                        inputs=ops.pad_to_multiple(preprocessed_inputs,
                                                   self._pad_to_multiple),
                        num_classes=None,
                        is_training=None,
                        global_pool=False,
                        output_stride=None,
                        # store_non_strided_activations=True,
                        scope=scope)
                    image_features = self._filter_features(image_features)
            with slim.arg_scope(self._conv_hyperparams_fn()):
                with tf.variable_scope(self._fpn_scope_name,
                                       reuse=self._reuse_weights):
                    base_fpn_max_level = min(self._fpn_max_level, 5)
                    feature_block_list = []
                    for level in range(self._fpn_min_level,
                                       base_fpn_max_level + 1):
                        feature_block_list.append('block{}'.format(level - 1))
                    fpn_features = feature_map_generators.fpn_top_down_feature_maps(
                        [(key, image_features[key])
                         for key in feature_block_list],
                        depth=self._additional_layer_depth,
                        use_explicit_padding=True)
                    feature_maps = []
                    for level in range(self._fpn_min_level,
                                       base_fpn_max_level + 1):
                        feature_maps.append(
                            fpn_features['top_down_block{}'.format(level - 1)])
                    last_feature_map = fpn_features['top_down_block{}'.format(
                        base_fpn_max_level - 1)]
                    # Construct coarse features
                    for i in range(base_fpn_max_level, self._fpn_max_level):
                        last_feature_map = slim.conv2d(
                            last_feature_map,
                            num_outputs=self._additional_layer_depth,
                            kernel_size=[3, 3],
                            stride=2,
                            padding='SAME',
                            scope='bottom_up_block{}'.format(i))
                        feature_maps.append(last_feature_map)
        return feature_maps
  def extract_features(self, preprocessed_inputs):
    """Extract features from preprocessed inputs.

    Args:
      preprocessed_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.

    Returns:
      feature_maps: a list of tensors where the ith tensor has shape
        [batch, height_i, width_i, depth_i]
    """
    preprocessed_inputs = shape_utils.check_min_image_dim(
        33, preprocessed_inputs)

    with tf.variable_scope('MobilenetV2', reuse=self._reuse_weights) as scope:
      with slim.arg_scope(
          mobilenet_v2.training_scope(is_training=None, bn_decay=0.9997)), \
          slim.arg_scope(
              [mobilenet.depth_multiplier], min_depth=self._min_depth):
        with (slim.arg_scope(self._conv_hyperparams_fn())
              if self._override_base_feature_extractor_hyperparams else
              context_manager.IdentityContextManager()):
          _, image_features = mobilenet_v2.mobilenet_base(
              ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple),
              final_endpoint='layer_19',
              depth_multiplier=self._depth_multiplier,
              conv_defs=_CONV_DEFS if self._use_depthwise else None,
              use_explicit_padding=self._use_explicit_padding,
              scope=scope)
      depth_fn = lambda d: max(int(d * self._depth_multiplier), self._min_depth)
      with slim.arg_scope(self._conv_hyperparams_fn()):
        with tf.variable_scope('fpn', reuse=self._reuse_weights):
          feature_blocks = [
              'layer_4', 'layer_7', 'layer_14', 'layer_19'
          ]
          base_fpn_max_level = min(self._fpn_max_level, 5)
          feature_block_list = []
          for level in range(self._fpn_min_level, base_fpn_max_level + 1):
            feature_block_list.append(feature_blocks[level - 2])
          fpn_features = feature_map_generators.fpn_top_down_feature_maps(
              [(key, image_features[key]) for key in feature_block_list],
              depth=depth_fn(self._additional_layer_depth),
              use_depthwise=self._use_depthwise)
          feature_maps = []
          for level in range(self._fpn_min_level, base_fpn_max_level + 1):
            feature_maps.append(fpn_features['top_down_{}'.format(
                feature_blocks[level - 2])])
          last_feature_map = fpn_features['top_down_{}'.format(
              feature_blocks[base_fpn_max_level - 2])]
          # Construct coarse features
          for i in range(base_fpn_max_level + 1, self._fpn_max_level + 1):
            if self._use_depthwise:
              conv_op = functools.partial(
                  slim.separable_conv2d, depth_multiplier=1)
            else:
              conv_op = slim.conv2d
            last_feature_map = conv_op(
                last_feature_map,
                num_outputs=depth_fn(self._additional_layer_depth),
                kernel_size=[3, 3],
                stride=2,
                padding='SAME',
                scope='bottom_up_Conv2d_{}'.format(i - base_fpn_max_level + 19))
            feature_maps.append(last_feature_map)
    return feature_maps
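
The indexing feature_blocks[level - 2] above encodes a fixed mapping from FPN pyramid level to MobileNetV2 endpoint; the stride column is my annotation of the usual convention:

# FPN level 2 -> 'layer_4'   (stride 4)
# FPN level 3 -> 'layer_7'   (stride 8)
# FPN level 4 -> 'layer_14'  (stride 16)
# FPN level 5 -> 'layer_19'  (stride 32)
# Levels above 5 are synthesized by the stride-2 'bottom_up_Conv2d_*' convs.
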
  def extract_features(self, preprocessed_inputs):
    """Extract features from preprocessed inputs.

    Args:
      preprocessed_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.

    Returns:
      feature_maps: a list of tensors where the ith tensor has shape
        [batch, height_i, width_i, depth_i]
    """
    preprocessed_inputs = shape_utils.check_min_image_dim(
        33, preprocessed_inputs)

    with tf.variable_scope('MobilenetV1',
                           reuse=self._reuse_weights) as scope:
      with slim.arg_scope(
          mobilenet_v1.mobilenet_v1_arg_scope(
              is_training=None, regularize_depthwise=True)):
        with (slim.arg_scope(self._conv_hyperparams_fn())
              if self._override_base_feature_extractor_hyperparams
              else context_manager.IdentityContextManager()):
          _, image_features = mobilenet_v1.mobilenet_v1_base(
              ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple),
              final_endpoint='Conv2d_13_pointwise',
              min_depth=self._min_depth,
              depth_multiplier=self._depth_multiplier,
              use_explicit_padding=self._use_explicit_padding,
              scope=scope)

      depth_fn = lambda d: max(int(d * self._depth_multiplier), self._min_depth)
      with slim.arg_scope(self._conv_hyperparams_fn()):
        with tf.variable_scope('fpn', reuse=self._reuse_weights):
          feature_blocks = [
              'Conv2d_3_pointwise', 'Conv2d_5_pointwise', 'Conv2d_11_pointwise',
              'Conv2d_13_pointwise'
          ]
          base_fpn_max_level = min(self._fpn_max_level, 5)
          feature_block_list = []
          for level in range(self._fpn_min_level, base_fpn_max_level + 1):
            feature_block_list.append(feature_blocks[level - 2])
          fpn_features = feature_map_generators.fpn_top_down_feature_maps(
              [(key, image_features[key]) for key in feature_block_list],
              depth=depth_fn(256))
          feature_maps = []
          for level in range(self._fpn_min_level, base_fpn_max_level + 1):
            feature_maps.append(fpn_features['top_down_{}'.format(
                feature_blocks[level - 2])])
          last_feature_map = fpn_features['top_down_{}'.format(
              feature_blocks[base_fpn_max_level - 2])]
          # Construct coarse features
          for i in range(base_fpn_max_level + 1, self._fpn_max_level + 1):
            last_feature_map = slim.conv2d(
                last_feature_map,
                num_outputs=depth_fn(256),
                kernel_size=[3, 3],
                stride=2,
                padding='SAME',
                scope='bottom_up_Conv2d_{}'.format(i - base_fpn_max_level + 13))
            feature_maps.append(last_feature_map)
    return feature_maps
    def extract_features(self,
                         preprocessed_inputs,
                         state_saver=None,
                         state_name='lstm_state',
                         unroll_length=5,
                         scope=None):
        """Extracts features from preprocessed inputs.

    The features include the base network features, lstm features and SSD
    features, organized in the following name scope:

    <parent scope>/MobilenetV1/...
    <parent scope>/LSTM/...
    <parent scope>/FeatureMaps/...

    Args:
      preprocessed_inputs: A [batch, height, width, channels] float tensor
        representing a batch of consecutive frames from video clips.
      state_saver: A state saver object with methods `state` and `save_state`.
      state_name: A python string for the name to use with the state_saver.
      unroll_length: The number of steps to unroll the lstm.
      scope: The scope for the base network of the feature extractor.

    Returns:
      A list of tensors where the ith tensor has shape [batch, height_i,
      width_i, depth_i]
    """
        preprocessed_inputs = shape_utils.check_min_image_dim(
            33, preprocessed_inputs)
        with slim.arg_scope(
                mobilenet_v1.mobilenet_v1_arg_scope(
                    is_training=self._is_training)):
            with (slim.arg_scope(self._conv_hyperparams_fn())
                  if self._override_base_feature_extractor_hyperparams else
                  context_manager.IdentityContextManager()):
                with slim.arg_scope([slim.batch_norm], fused=False):
                    # Base network.
                    with tf.variable_scope(scope,
                                           self._base_network_scope,
                                           reuse=self._reuse_weights) as scope:
                        net, image_features = mobilenet_v1.mobilenet_v1_base(
                            ops.pad_to_multiple(preprocessed_inputs,
                                                self._pad_to_multiple),
                            final_endpoint='Conv2d_13_pointwise',
                            min_depth=self._min_depth,
                            depth_multiplier=self._depth_multiplier,
                            scope=scope)

        with slim.arg_scope(self._conv_hyperparams_fn()):
            with slim.arg_scope([slim.batch_norm],
                                fused=False,
                                is_training=self._is_training):
                # ConvLSTM layers.
                with tf.variable_scope(
                        'LSTM', reuse=self._reuse_weights) as lstm_scope:
                    lstm_cell = lstm_cells.BottleneckConvLSTMCell(
                        filter_size=(3, 3),
                        output_size=(net.shape[1].value, net.shape[2].value),
                        num_units=max(self._min_depth, self._lstm_state_depth),
                        activation=tf.nn.relu6,
                        visualize_gates=True)

                    net_seq = list(tf.split(net, unroll_length))
                    if state_saver is None:
                        # Integer division keeps the batch size an int under
                        # Python 3.
                        init_state = lstm_cell.init_state(
                            state_name, net.shape[0].value // unroll_length,
                            tf.float32)
                    else:
                        c = state_saver.state('%s_c' % state_name)
                        h = state_saver.state('%s_h' % state_name)
                        init_state = (c, h)

                    # Identities added for feeding state tensors in externally.
                    c_ident = tf.identity(init_state[0],
                                          name='lstm_state_in_c')
                    h_ident = tf.identity(init_state[1],
                                          name='lstm_state_in_h')
                    init_state = (c_ident, h_ident)

                    net_seq, states_out = rnn_decoder.rnn_decoder(
                        net_seq, init_state, lstm_cell, scope=lstm_scope)
                    batcher_ops = None
                    self._states_out = states_out
                    if state_saver is not None:
                        self._step = state_saver.state('%s_step' % state_name)
                        batcher_ops = [
                            state_saver.save_state('%s_c' % state_name,
                                                   states_out[-1][0]),
                            state_saver.save_state('%s_h' % state_name,
                                                   states_out[-1][1]),
                            state_saver.save_state('%s_step' % state_name,
                                                   self._step - 1)
                        ]
                    with tf_ops.control_dependencies(batcher_ops):
                        image_features['Conv2d_13_pointwise_lstm'] = tf.concat(
                            net_seq, 0)

                    # Identities added for reading output states, to be reused externally.
                    tf.identity(states_out[-1][0], name='lstm_state_out_c')
                    tf.identity(states_out[-1][1], name='lstm_state_out_h')

                # SSD layers.
                with tf.variable_scope('FeatureMaps',
                                       reuse=self._reuse_weights):
                    feature_maps = feature_map_generators.multi_resolution_feature_maps(
                        feature_map_layout=self._feature_map_layout,
                        depth_multiplier=(self._depth_multiplier),
                        min_depth=self._min_depth,
                        insert_1x1_conv=True,
                        image_features=image_features)

        return feature_maps.values()
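
The state_saver argument is specified only by its contract ("methods `state` and `save_state`"). A hypothetical minimal implementation that satisfies every call made above; the class name and Variable-backed storage are my invention:

class DictStateSaver(object):
  """Backs each named LSTM state with a tf.Variable."""

  def __init__(self, state_variables):
    self._states = state_variables  # dict: state name -> tf.Variable

  def state(self, name):
    # Returns the tensor currently holding the named state.
    return self._states[name]

  def save_state(self, name, value):
    # Returns an assign op; the caller sequences it via control_dependencies.
    return tf.assign(self._states[name], value)
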
Example #23
  def extract_features(self, preprocessed_inputs):
    """Extract features from preprocessed inputs.

    Args:
      preprocessed_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.

    Returns:
      feature_maps: a list of tensors where the ith tensor has shape
        [batch, height_i, width_i, depth_i]
    """
    preprocessed_inputs = shape_utils.check_min_image_dim(
        33, preprocessed_inputs)

    feature_map_layout = {
        'from_layer': ['vgg_16/conv4_3_norm', 'vgg_16/fc7', 'vgg_16/conv6_2',
                       'vgg_16/conv7_2', 'vgg_16/conv8_2', 'vgg_16/conv9_2'],
        'layer_depth': [-1, -1, -1, -1, -1, -1],
        'use_explicit_padding': self._use_explicit_padding,
        'use_depthwise': self._use_depthwise,
    }
    net, image_features = vgg.vgg_16(
        ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple),
        final_endpoint='pool5', spatial_squeeze=False)
    # Double-check the scale filler against the original Caffe model.
    image_features['vgg_16/conv4_3_norm'] = custom_layers.l2_normalization(
        image_features['vgg_16/conv4/conv4_3'], scaling=True,
        scope='vgg_16/conv4_3_norm')
    with slim.arg_scope(self._conv_hyperparams):
      with tf.variable_scope('vgg_16', reuse=self._reuse_weights) as scope:
        # VGG-16 endpoint shapes for a 300x300 input, batch 32 (condensed from
        # an interactive session):
        #   conv1_1, conv1_2: (32, 300, 300, 64)    pool1: (32, 150, 150, 64)
        #   conv2_1, conv2_2: (32, 150, 150, 128)   pool2: (32, 75, 75, 128)
        #   conv3_1..conv3_3: (32, 75, 75, 256)     pool3: (32, 37, 37, 256)
        #   conv4_1..conv4_3: (32, 37, 37, 512)     pool4: (32, 18, 18, 512)
        #   conv5_1..conv5_3: (32, 18, 18, 512)     pool5: (32, 18, 18, 512)
        end_points_collection = scope.original_name_scope + '_end_points'
        with slim.arg_scope([slim.conv2d, slim.fully_connected, slim.max_pool2d],
                        outputs_collections=end_points_collection):

          net = slim.convolution(net, 1024, [3, 3], padding='SAME', rate=6, scope='fc6')

          # (slim.convolution accepts a `rate` argument for dilated/atrous
          # convolutions, which is how the Caffe `dilation` parameter maps
          # onto TF-Slim.)

          # fc6 in the original Caffe SSD is a dilated convolution:
          #   num_output: 1024, kernel_size: 3, pad: 6, dilation: 6,
          #   xavier weight filler, constant(0) bias filler.

          # fc7 is a plain 1x1 convolution; the corresponding Caffe layer:
          # layer {
          #   name: "fc7"
          #   type: "Convolution"
          #   bottom: "fc6"
          #   top: "fc7"
          #   param {
          #     lr_mult: 1.0
          #     decay_mult: 1.0
          #   }
          #   param {
          #     lr_mult: 2.0
          #     decay_mult: 0.0
          #   }
          #   convolution_param {
          #     num_output: 1024
          #     kernel_size: 1
          #     weight_filler {
          #       type: "xavier"
          #     }
          #     bias_filler {
          #       type: "constant"
          #       value: 0.0
          #     }
          #   }
          # }
          # layer {
          #   name: "relu7"
          #   type: "ReLU"
          #   bottom: "fc7"
          #   top: "fc7"
          # }
          net = slim.conv2d(net, 1024, [1, 1], padding='SAME', scope='fc7')
          net = slim.conv2d(net, 256, [1, 1], padding='SAME', scope='conv6_1')
          net = slim.conv2d(net, 512, [3, 3], padding='SAME', stride=2, scope='conv6_2')
          net = slim.conv2d(net, 128, [1, 1], padding='SAME', scope='conv7_1')
          net = slim.conv2d(net, 256, [3, 3], padding='SAME', stride=2, scope='conv7_2')
          net = slim.conv2d(net, 128, [1, 1], padding='VALID', scope='conv8_1')
          net = slim.conv2d(net, 256, [3, 3], padding='VALID', stride=1, scope='conv8_2')
          net = slim.conv2d(net, 128, [1, 1], padding='VALID', scope='conv9_1')
          net = slim.conv2d(net, 256, [3, 3], padding='VALID', stride=1, scope='conv9_2')
          _image_features_new = slim.utils.convert_collection_to_dict(
              end_points_collection)

          for k, v in _image_features_new.items():  # items() works in Python 2 and 3
            image_features[k] = v
        # _, image_features = inception_v3.inception_v3_base(
        #     ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple),
        #     final_endpoint='Mixed_7c',
        #     min_depth=self._min_depth,
        #     depth_multiplier=self._depth_multiplier,
        #     scope=scope)
    feature_maps = feature_map_generators.multi_resolution_feature_maps(
        feature_map_layout=feature_map_layout,
        depth_multiplier=self._depth_multiplier,
        min_depth=self._min_depth,
        insert_1x1_conv=True,
        image_features=image_features)

    return list(feature_maps.values())
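
The fc6/fc7 pair above follows the SSD recipe of converting VGG's fully
connected layers into convolutions, with fc6 dilated at rate 6 so the pool5
grid keeps its spatial resolution. A minimal shape-check sketch, assuming
TF 1.x with contrib slim available (the input size mirrors the 18x18x512
pool5 endpoint shown in the comments, but is illustrative only):

# Sketch only: a rate-6 dilated 3x3 conv with SAME padding preserves spatial
# resolution, matching the Caffe fc6 layer (kernel_size=3, pad=6, dilation=6)
# quoted in the comments above.
import numpy as np
import tensorflow as tf
import tensorflow.contrib.slim as slim  # TF 1.x assumption

inputs = tf.constant(np.zeros([1, 18, 18, 512], dtype=np.float32))
fc6 = slim.conv2d(inputs, 1024, [3, 3], rate=6, padding='SAME', scope='fc6')
fc7 = slim.conv2d(fc6, 1024, [1, 1], padding='SAME', scope='fc7')
print(fc6.shape.as_list())  # [1, 18, 18, 1024] -- resolution unchanged
print(fc7.shape.as_list())  # [1, 18, 18, 1024]
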
  def extract_features(self, preprocessed_inputs, state_saver=None,
                       state_name='lstm_state', unroll_length=10, scope=None):
    """Extract features from preprocessed inputs.

    The features include the base network features, lstm features and SSD
    features, organized in the following name scope:

    <scope>/MobilenetV2_1/...
    <scope>/MobilenetV2_2/...
    <scope>/LSTM/...
    <scope>/FeatureMap/...

    Args:
      preprocessed_inputs: a [batch, height, width, channels] float tensor
        representing a batch of consecutive frames from video clips.
      state_saver: A state saver object with methods `state` and `save_state`.
      state_name: Python string, the name to use with the state_saver.
      unroll_length: number of steps to unroll the lstm.
      scope: Scope for the base network of the feature extractor.

    Returns:
      feature_maps: a list of tensors where the ith tensor has shape
        [batch, height_i, width_i, depth_i]
    Raises:
      ValueError: if the interleave method is not recognized, or if the large
        and small base networks produce feature maps of different spatial
        sizes.
    """
    preprocessed_inputs = shape_utils.check_min_image_dim(
        33, preprocessed_inputs)
    preprocessed_inputs = ops.pad_to_multiple(
        preprocessed_inputs, self._pad_to_multiple)
    batch_size = preprocessed_inputs.shape[0].value // unroll_length
    batch_axis = 0
    nets = []

    # Batch processing of mobilenet features.
    with slim.arg_scope(mobilenet_v2.training_scope(
        is_training=self._is_training,
        bn_decay=0.9997)), \
        slim.arg_scope([mobilenet.depth_multiplier],
                       min_depth=self._min_depth, divisible_by=8):
      # Big model.
      net, _ = self.extract_base_features_large(preprocessed_inputs)
      nets.append(net)
      large_base_feature_shape = net.shape

      # Small model.
      net, _ = self.extract_base_features_small(preprocessed_inputs)
      nets.append(net)
      small_base_feature_shape = net.shape
      if not (large_base_feature_shape[1] == small_base_feature_shape[1] and
              large_base_feature_shape[2] == small_base_feature_shape[2]):
        raise ValueError('Large and small base network feature maps must have '
                         'equal spatial dimensions.')

    with slim.arg_scope(self._conv_hyperparams_fn()):
      with tf.variable_scope('LSTM', reuse=self._reuse_weights):
        output_size = (large_base_feature_shape[1], large_base_feature_shape[2])
        lstm_cell, init_state, step = self.create_lstm_cell(
            batch_size, output_size, state_saver, state_name)

        nets_seq = [
            tf.split(net, unroll_length, axis=batch_axis) for net in nets
        ]

        net_seq, states_out = rnn_decoder.multi_input_rnn_decoder(
            nets_seq,
            init_state,
            lstm_cell,
            step,
            selection_strategy=self._interleave_method,
            is_training=self._is_training,
            is_quantized=self._is_quantized,
            pre_bottleneck=self._pre_bottleneck,
            flatten_state=self._flatten_state,
            scope=None)
        self._states_out = states_out

      batcher_ops = None
      if state_saver is not None:
        self._step = state_saver.state(state_name + '_step')
        batcher_ops = [
            state_saver.save_state(state_name + '_c', states_out[-1][0]),
            state_saver.save_state(state_name + '_h', states_out[-1][1]),
            state_saver.save_state(state_name + '_step', self._step + 1)]
      image_features = {}
      with tf_ops.control_dependencies(batcher_ops):
        image_features['layer_19'] = tf.concat(net_seq, 0)

      # SSD layers.
      with tf.variable_scope('FeatureMap'):
        feature_maps = feature_map_generators.multi_resolution_feature_maps(
            feature_map_layout=self._feature_map_layout,
            depth_multiplier=self._depth_multiplier,
            min_depth=self._min_depth,
            insert_1x1_conv=True,
            image_features=image_features,
            pool_residual=True)
    return list(feature_maps.values())
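
The state_saver consumed above is only assumed to expose state(name) and
save_state(name, value). A minimal variable-backed stand-in satisfying that
contract, as a sketch (the class name and constructor argument are
hypothetical; TensorFlow's real sequence state savers have a richer API):

import tensorflow as tf  # TF 1.x assumption

class SimpleStateSaver(object):
  """Hypothetical stand-in: keeps LSTM state in non-trainable variables."""

  def __init__(self, name_to_shape_dtype):
    self._vars = {}
    for name, (shape, dtype) in name_to_shape_dtype.items():
      self._vars[name] = tf.get_variable(
          name, shape=shape, dtype=dtype, trainable=False,
          initializer=tf.zeros_initializer())

  def state(self, name):
    # Tensor holding the previously saved state.
    return self._vars[name]

  def save_state(self, name, value):
    # Op that overwrites the saved state; the extractor runs these ops as
    # control dependencies of the output feature maps.
    return tf.assign(self._vars[name], value)
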
Example #25
  def graph_fn(input_tensor):
    return shape_utils.check_min_image_dim(33, input_tensor)
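
A hedged usage sketch for the fragment above (TF 1.x graph mode assumed): with
a fully static input shape, check_min_image_dim validates at graph-construction
time, so a 42x42 input passes straight through.

import numpy as np
import tensorflow as tf

input_tensor = tf.constant(np.zeros([1, 42, 42, 3], dtype=np.float32))
checked = graph_fn(input_tensor)  # 42 >= 33, so no ValueError is raised
with tf.Session() as sess:
  print(sess.run(checked).shape)  # (1, 42, 42, 3)
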
Example #26
    def extract_features(self, preprocessed_inputs):
        """Extract features from preprocessed inputs.

        Args:
          preprocessed_inputs: a [batch, height, width, channels] float tensor
            representing a batch of images.

        Returns:
          feature_maps: a list of tensors where the ith tensor has shape
            [batch, height_i, width_i, depth_i]
        """
        preprocessed_inputs = shape_utils.check_min_image_dim(
            33, preprocessed_inputs)

        with tf.variable_scope('MobilenetV2',
                               reuse=self._reuse_weights) as scope:
            with slim.arg_scope(
                mobilenet_v2.training_scope(is_training=None, bn_decay=0.9997)), \
                slim.arg_scope(
                    [mobilenet.depth_multiplier], min_depth=self._min_depth):
                with (slim.arg_scope(self._conv_hyperparams_fn())
                      if self._override_base_feature_extractor_hyperparams else
                      context_manager.IdentityContextManager()):
                    _, image_features = mobilenet_v2.mobilenet_base(
                        ops.pad_to_multiple(preprocessed_inputs,
                                            self._pad_to_multiple),
                        final_endpoint='layer_19',
                        depth_multiplier=self._depth_multiplier,
                        conv_defs=self._conv_defs,
                        use_explicit_padding=self._use_explicit_padding,
                        scope=scope)
            depth_fn = lambda d: max(
                int(d * self._depth_multiplier), self._min_depth)
            with slim.arg_scope(self._conv_hyperparams_fn()):
                with tf.variable_scope('fpn', reuse=self._reuse_weights):
                    feature_blocks = [
                        'layer_4', 'layer_7', 'layer_14', 'layer_19'
                    ]
                    base_fpn_max_level = min(self._fpn_max_level, 5)
                    feature_block_list = []
                    for level in range(self._fpn_min_level,
                                       base_fpn_max_level + 1):
                        feature_block_list.append(feature_blocks[level - 2])
                    fpn_features = feature_map_generators.fpn_top_down_feature_maps(
                        [(key, image_features[key])
                         for key in feature_block_list],
                        depth=depth_fn(self._additional_layer_depth),
                        use_depthwise=self._use_depthwise,
                        use_explicit_padding=self._use_explicit_padding)
                    feature_maps = []
                    for level in range(self._fpn_min_level,
                                       base_fpn_max_level + 1):
                        feature_maps.append(fpn_features['top_down_{}'.format(
                            feature_blocks[level - 2])])
                    last_feature_map = fpn_features['top_down_{}'.format(
                        feature_blocks[base_fpn_max_level - 2])]
                    # Construct coarse features
                    padding = 'VALID' if self._use_explicit_padding else 'SAME'
                    kernel_size = 3
                    for i in range(base_fpn_max_level + 1,
                                   self._fpn_max_level + 1):
                        if self._use_depthwise:
                            conv_op = functools.partial(slim.separable_conv2d,
                                                        depth_multiplier=1)
                        else:
                            conv_op = slim.conv2d
                        if self._use_explicit_padding:
                            last_feature_map = ops.fixed_padding(
                                last_feature_map, kernel_size)
                        last_feature_map = conv_op(
                            last_feature_map,
                            num_outputs=depth_fn(self._additional_layer_depth),
                            kernel_size=[kernel_size, kernel_size],
                            stride=2,
                            padding=padding,
                            scope='bottom_up_Conv2d_{}'.format(
                                i - base_fpn_max_level + 19))
                        feature_maps.append(last_feature_map)
        return feature_maps
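
The feature_blocks[level - 2] indexing above maps FPN pyramid levels to
MobilenetV2 endpoints, and any levels beyond 5 are synthesized by the stride-2
bottom_up_Conv2d_* layers. A pure-Python sanity check of that mapping, assuming
the common config values fpn_min_level=3 and fpn_max_level=7:

feature_blocks = ['layer_4', 'layer_7', 'layer_14', 'layer_19']
fpn_min_level, fpn_max_level = 3, 7  # assumed config values
base_fpn_max_level = min(fpn_max_level, 5)
print([feature_blocks[level - 2]
       for level in range(fpn_min_level, base_fpn_max_level + 1)])
# ['layer_7', 'layer_14', 'layer_19'] -> FPN levels 3..5.
# Levels 6 and 7 come from bottom_up_Conv2d_20 and bottom_up_Conv2d_21.
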
  def extract_features(self,
                       preprocessed_inputs,
                       state_saver=None,
                       state_name='lstm_state',
                       unroll_length=5,
                       scope=None):
    """Extracts features from preprocessed inputs.

    The features include the base network features, lstm features and SSD
    features, organized in the following name scope:

    <parent scope>/MobilenetV1/...
    <parent scope>/LSTM/...
    <parent scope>/FeatureMaps/...

    Args:
      preprocessed_inputs: A [batch, height, width, channels] float tensor
        representing a batch of consecutive frames from video clips.
      state_saver: A state saver object with methods `state` and `save_state`.
      state_name: A python string for the name to use with the state_saver.
      unroll_length: The number of steps to unroll the lstm.
      scope: The scope for the base network of the feature extractor.

    Returns:
      A list of tensors where the ith tensor has shape [batch, height_i,
      width_i, depth_i]
    """
    preprocessed_inputs = shape_utils.check_min_image_dim(
        33, preprocessed_inputs)
    with slim.arg_scope(
        mobilenet_v1.mobilenet_v1_arg_scope(is_training=self._is_training)):
      with (slim.arg_scope(self._conv_hyperparams_fn())
            if self._override_base_feature_extractor_hyperparams else
            context_manager.IdentityContextManager()):
        with slim.arg_scope([slim.batch_norm], fused=False):
          # Base network.
          with tf.variable_scope(
              scope, self._base_network_scope,
              reuse=self._reuse_weights) as scope:
            net, image_features = mobilenet_v1.mobilenet_v1_base(
                ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple),
                final_endpoint='Conv2d_13_pointwise',
                min_depth=self._min_depth,
                depth_multiplier=self._depth_multiplier,
                scope=scope)

    with slim.arg_scope(self._conv_hyperparams_fn()):
      with slim.arg_scope(
          [slim.batch_norm], fused=False, is_training=self._is_training):
        # ConvLSTM layers.
        with tf.variable_scope('LSTM', reuse=self._reuse_weights) as lstm_scope:
          lstm_cell = lstm_cells.BottleneckConvLSTMCell(
              filter_size=(3, 3),
              output_size=(net.shape[1].value, net.shape[2].value),
              num_units=max(self._min_depth, self._lstm_state_depth),
              activation=tf.nn.relu6,
              visualize_gates=True)

          net_seq = list(tf.split(net, unroll_length))
          if state_saver is None:
            init_state = lstm_cell.init_state(
                state_name, net.shape[0].value // unroll_length, tf.float32)
          else:
            c = state_saver.state('%s_c' % state_name)
            h = state_saver.state('%s_h' % state_name)
            init_state = (c, h)

          # Identities added for inputting state tensors externally.
          c_ident = tf.identity(init_state[0], name='lstm_state_in_c')
          h_ident = tf.identity(init_state[1], name='lstm_state_in_h')
          init_state = (c_ident, h_ident)

          net_seq, states_out = rnn_decoder.rnn_decoder(
              net_seq, init_state, lstm_cell, scope=lstm_scope)
          batcher_ops = None
          self._states_out = states_out
          if state_saver is not None:
            self._step = state_saver.state('%s_step' % state_name)
            batcher_ops = [
                state_saver.save_state('%s_c' % state_name, states_out[-1][0]),
                state_saver.save_state('%s_h' % state_name, states_out[-1][1]),
                state_saver.save_state('%s_step' % state_name, self._step + 1)
            ]
          with tf_ops.control_dependencies(batcher_ops):
            image_features['Conv2d_13_pointwise_lstm'] = tf.concat(net_seq, 0)

          # Identities added for reading output states, to be reused externally.
          tf.identity(states_out[-1][0], name='lstm_state_out_c')
          tf.identity(states_out[-1][1], name='lstm_state_out_h')

        # SSD layers.
        with tf.variable_scope('FeatureMaps', reuse=self._reuse_weights):
          feature_maps = feature_map_generators.multi_resolution_feature_maps(
              feature_map_layout=self._feature_map_layout,
              depth_multiplier=self._depth_multiplier,
              min_depth=self._min_depth,
              insert_1x1_conv=True,
              image_features=image_features)

    return list(feature_maps.values())
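
In both LSTM extractors the consecutive frames of each clip are stacked along
the batch axis, so the leading dimension equals batch_size * unroll_length and
tf.split recovers the per-step tensors fed to the ConvLSTM. A small shape
sketch with assumed sizes:

import tensorflow as tf  # TF 1.x assumption

unroll_length = 5
batch_size = 2  # two video clips per batch
net = tf.zeros([batch_size * unroll_length, 10, 10, 32])
net_seq = tf.split(net, unroll_length)  # splits along axis 0 by default
print(len(net_seq), net_seq[0].shape.as_list())  # 5 [2, 10, 10, 32]
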