def test_check_min_image_dim_static_shape(self):
  input_tensor = tf.constant(np.zeros([1, 42, 42, 3]))
  _ = shape_utils.check_min_image_dim(33, input_tensor)

  with self.assertRaisesRegexp(
      ValueError, 'image size must be >= 64 in both height and width.'):
    _ = shape_utils.check_min_image_dim(64, input_tensor)
def extract_features(self, preprocessed_inputs):
  """Extract features from preprocessed inputs.

  Args:
    preprocessed_inputs: a [batch, height, width, channels] float tensor
      representing a batch of images.

  Returns:
    feature_maps: a list of tensors where the ith tensor has shape
      [batch, height_i, width_i, depth_i]

  Raises:
    ValueError: depth multiplier is not supported.
  """
  if self._depth_multiplier != 1.0:
    raise ValueError('Depth multiplier not supported.')

  preprocessed_inputs = shape_utils.check_min_image_dim(
      129, preprocessed_inputs)
  with tf.variable_scope(
      self._resnet_scope_name, reuse=self._reuse_weights) as scope:
    with slim.arg_scope(resnet_v1.resnet_arg_scope()):
      with (slim.arg_scope(self._conv_hyperparams_fn())
            if self._override_base_feature_extractor_hyperparams else
            context_manager.IdentityContextManager()):
        _, image_features = self._resnet_base_fn(
            inputs=ops.pad_to_multiple(preprocessed_inputs,
                                       self._pad_to_multiple),
            num_classes=None,
            is_training=None,
            global_pool=False,
            output_stride=None,
            store_non_strided_activations=True,
            scope=scope)
    image_features = self._filter_features(image_features)
    with slim.arg_scope(self._conv_hyperparams_fn()):
      with tf.variable_scope(self._fpn_scope_name, reuse=self._reuse_weights):
        fpn_features = feature_map_generators.fpn_top_down_feature_maps(
            [(key, image_features[key])
             for key in ['block2', 'block3', 'block4']],
            depth=256)
        last_feature_map = fpn_features['top_down_block4']
        coarse_features = {}
        for i in range(5, 7):
          last_feature_map = slim.conv2d(
              last_feature_map,
              num_outputs=256,
              kernel_size=[3, 3],
              stride=2,
              padding='SAME',
              scope='bottom_up_block{}'.format(i))
          coarse_features['bottom_up_block{}'.format(i)] = last_feature_map
  return [fpn_features['top_down_block2'],
          fpn_features['top_down_block3'],
          fpn_features['top_down_block4'],
          coarse_features['bottom_up_block5'],
          coarse_features['bottom_up_block6']]
def test_check_min_image_dim_dynamic_shape(self):
  input_placeholder = tf.placeholder(tf.float32, shape=[1, None, None, 3])
  image_tensor = shape_utils.check_min_image_dim(33, input_placeholder)

  with self.test_session() as sess:
    sess.run(image_tensor,
             feed_dict={input_placeholder: np.zeros([1, 42, 42, 3])})
    with self.assertRaises(tf.errors.InvalidArgumentError):
      sess.run(image_tensor,
               feed_dict={input_placeholder: np.zeros([1, 32, 32, 3])})
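# A minimal sketch of the behavior the two tests above pin down, assuming
# shape_utils.check_min_image_dim validates statically known spatial dims
# eagerly and defers to a runtime assert otherwise. This is an illustration
# inferred from the tests, not the canonical implementation.
def check_min_image_dim_sketch(min_dim, image_tensor):
  height = image_tensor.get_shape()[1].value
  width = image_tensor.get_shape()[2].value
  if height is None or width is None:
    # Dynamic shape: defer to a runtime check, which surfaces as
    # tf.errors.InvalidArgumentError when the assert fires in a session.
    shape = tf.shape(image_tensor)
    assert_op = tf.Assert(
        tf.logical_and(tf.greater_equal(shape[1], min_dim),
                       tf.greater_equal(shape[2], min_dim)),
        ['image size must be >= %d in both height and width.' % min_dim])
    with tf.control_dependencies([assert_op]):
      return tf.identity(image_tensor)
  # Static shape: fail at graph-construction time.
  if height < min_dim or width < min_dim:
    raise ValueError(
        'image size must be >= %d in both height and width.' % min_dim)
  return image_tensor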
def extract_features(self, preprocessed_inputs):
  """Extract features from preprocessed inputs.

  Args:
    preprocessed_inputs: a [batch, height, width, channels] float tensor
      representing a batch of images.

  Returns:
    feature_maps: a list of tensors where the ith tensor has shape
      [batch, height_i, width_i, depth_i]
  """
  preprocessed_inputs = shape_utils.check_min_image_dim(
      33, preprocessed_inputs)

  feature_map_layout = {
      'from_layer': ['layer_15/expansion_output', 'layer_19', '', '', '', ''],
      'layer_depth': [-1, -1, 512, 256, 256, 128],
      'use_depthwise': self._use_depthwise,
      'use_explicit_padding': self._use_explicit_padding,
  }

  with tf.variable_scope('MobilenetV2', reuse=self._reuse_weights) as scope:
    with slim.arg_scope(
        mobilenet_v2.training_scope(is_training=None, bn_decay=0.9997)), \
        slim.arg_scope(
            [mobilenet.depth_multiplier], min_depth=self._min_depth):
      with (slim.arg_scope(self._conv_hyperparams_fn())
            if self._override_base_feature_extractor_hyperparams else
            context_manager.IdentityContextManager()):
        # TODO(b/68150321): Enable fused batch norm once quantization
        # supports it.
        with slim.arg_scope([slim.batch_norm], fused=False):
          _, image_features = mobilenet_v2.mobilenet_base(
              ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple),
              final_endpoint='layer_19',
              depth_multiplier=self._depth_multiplier,
              use_explicit_padding=self._use_explicit_padding,
              scope=scope)
    with slim.arg_scope(self._conv_hyperparams_fn()):
      # TODO(b/68150321): Enable fused batch norm once quantization
      # supports it.
      with slim.arg_scope([slim.batch_norm], fused=False):
        feature_maps = feature_map_generators.multi_resolution_feature_maps(
            feature_map_layout=feature_map_layout,
            depth_multiplier=self._depth_multiplier,
            min_depth=self._min_depth,
            insert_1x1_conv=True,
            image_features=image_features)
  return feature_maps.values()
def _extract_features(self, preprocessed_inputs):
  """Extract features from preprocessed inputs.

  Args:
    preprocessed_inputs: a [batch, height, width, channels] float tensor
      representing a batch of images.

  Returns:
    feature_maps: a list of tensors where the ith tensor has shape
      [batch, height_i, width_i, depth_i]
  """
  preprocessed_inputs = shape_utils.check_min_image_dim(
      33, preprocessed_inputs)

  feature_map_layout = {
      'from_layer': ['Conv2d_11_pointwise', 'Conv2d_13_pointwise', '', '', '',
                     ''],
      'layer_depth': [-1, -1, 512, 256, 256, 128],
      'use_explicit_padding': self._use_explicit_padding,
      'use_depthwise': self._use_depthwise,
  }

  with tf.variable_scope('MobilenetV1', reuse=self._reuse_weights) as scope:
    with slim.arg_scope(
        mobilenet_v1.mobilenet_v1_arg_scope(
            is_training=(self._batch_norm_trainable and self._is_training))):
      # TODO(skligys): Enable fused batch norm once quantization supports it.
      with slim.arg_scope([slim.batch_norm], fused=False):
        _, image_features = mobilenet_v1.mobilenet_v1_base(
            ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple),
            final_endpoint='Conv2d_13_pointwise',
            min_depth=self._min_depth,
            depth_multiplier=self._depth_multiplier,
            use_explicit_padding=self._use_explicit_padding,
            scope=scope)
  with slim.arg_scope(self._conv_hyperparams):
    # TODO(skligys): Enable fused batch norm once quantization supports it.
    with slim.arg_scope([slim.batch_norm], fused=False):
      feature_maps = feature_map_generators.multi_resolution_feature_maps(
          feature_map_layout=feature_map_layout,
          depth_multiplier=self._depth_multiplier,
          min_depth=self._min_depth,
          insert_1x1_conv=True,
          image_features=image_features)
  return feature_maps.values()
def extract_features(self, preprocessed_inputs):
  """Extract features from preprocessed inputs.

  Args:
    preprocessed_inputs: a [batch, height, width, channels] float tensor
      representing a batch of images.

  Returns:
    feature_maps: a list of tensors where the ith tensor has shape
      [batch, height_i, width_i, depth_i]

  Raises:
    ValueError: depth multiplier is not supported.
  """
  if self._depth_multiplier != 1.0:
    raise ValueError('Depth multiplier not supported.')

  preprocessed_inputs = shape_utils.check_min_image_dim(
      129, preprocessed_inputs)
  with tf.variable_scope(
      self._resnet_scope_name, reuse=self._reuse_weights) as scope:
    with slim.arg_scope(resnet_v1.resnet_arg_scope()):
      with (slim.arg_scope(self._conv_hyperparams_fn())
            if self._override_base_feature_extractor_hyperparams else
            context_manager.IdentityContextManager()):
        with slim.arg_scope(
            [resnet_v1.bottleneck],
            use_bounded_activations=self._use_bounded_activations):
          _, activations = self._resnet_base_fn(
              inputs=ops.pad_to_multiple(preprocessed_inputs,
                                         self._pad_to_multiple),
              num_classes=None,
              is_training=None,
              global_pool=False,
              output_stride=None,
              store_non_strided_activations=True,
              scope=scope)

    with slim.arg_scope(self._conv_hyperparams_fn()):
      feature_maps = feature_map_generators.pooling_pyramid_feature_maps(
          base_feature_map_depth=self._base_feature_map_depth,
          num_layers=self._num_layers,
          image_features={
              'image_features': self._filter_features(activations)['block3']
          })
  return feature_maps.values()
def extract_features(self, preprocessed_inputs):
  """Extract features from preprocessed inputs.

  Args:
    preprocessed_inputs: a [batch, height, width, channels] float tensor
      representing a batch of images.

  Returns:
    feature_maps: a list of tensors where the ith tensor has shape
      [batch, height_i, width_i, depth_i]
  """
  preprocessed_inputs = shape_utils.check_min_image_dim(
      33, preprocessed_inputs)

  feature_map_layout = {
      'from_layer': ['Conv2d_11_pointwise', 'Conv2d_13_pointwise', '', '', '',
                     ''],
      'layer_depth': [-1, -1, 512, 256, 256, 128],
      'use_explicit_padding': self._use_explicit_padding,
      'use_depthwise': self._use_depthwise,
  }

  with tf.variable_scope('MobilenetV1', reuse=self._reuse_weights) as scope:
    with slim.arg_scope(
        mobilenet_v1.mobilenet_v1_arg_scope(
            is_training=None, regularize_depthwise=True)):
      with (slim.arg_scope(self._conv_hyperparams_fn())
            if self._override_base_feature_extractor_hyperparams else
            context_manager.IdentityContextManager()):
        _, image_features = mobilenet_v1.mobilenet_v1_base(
            ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple),
            final_endpoint='Conv2d_13_pointwise',
            min_depth=self._min_depth,
            depth_multiplier=self._depth_multiplier,
            use_explicit_padding=self._use_explicit_padding,
            scope=scope)
    with slim.arg_scope(self._conv_hyperparams_fn()):
      feature_maps = feature_map_generators.multi_resolution_feature_maps(
          feature_map_layout=feature_map_layout,
          depth_multiplier=self._depth_multiplier,
          min_depth=self._min_depth,
          insert_1x1_conv=True,
          image_features=image_features)
  return feature_maps.values()
def _extract_proposal_features(self, preprocessed_inputs, scope):
  """Extracts first stage RPN features.

  Args:
    preprocessed_inputs: A [batch, height, width, channels] float32 tensor
      representing a batch of images.
    scope: A scope name.

  Returns:
    rpn_feature_map: A tensor with shape [batch, height, width, depth]
    activations: A dictionary mapping feature extractor tensor names to
      tensors

  Raises:
    InvalidArgumentError: If the spatial size of `preprocessed_inputs`
      (height or width) is less than 33.
    ValueError: If the created network is missing the required activation.
  """
  preprocessed_inputs.get_shape().assert_has_rank(4)
  preprocessed_inputs = shape_utils.check_min_image_dim(
      min_dim=33, image_tensor=preprocessed_inputs)

  with slim.arg_scope(
      mobilenet_v1.mobilenet_v1_arg_scope(
          is_training=self._train_batch_norm,
          weight_decay=self._weight_decay)):
    with tf.variable_scope('MobilenetV1',
                           reuse=self._reuse_weights) as scope:
      params = {}
      if self._skip_last_stride:
        params['conv_defs'] = _get_mobilenet_conv_no_last_stride_defs(
            conv_depth_ratio_in_percentage=self.
            _conv_depth_ratio_in_percentage)
      _, activations = mobilenet_v1.mobilenet_v1_base(
          preprocessed_inputs,
          final_endpoint='Conv2d_11_pointwise',
          min_depth=self._min_depth,
          depth_multiplier=self._depth_multiplier,
          scope=scope,
          **params)
  return activations['Conv2d_11_pointwise'], activations
def _extract_features(self, preprocessed_inputs):
  """Extract features from preprocessed inputs.

  Args:
    preprocessed_inputs: a [batch, height, width, channels] float tensor
      representing a batch of images.

  Returns:
    feature_maps: a list of tensors where the ith tensor has shape
      [batch, height_i, width_i, depth_i]
  """
  preprocessed_inputs = shape_utils.check_min_image_dim(
      33, preprocessed_inputs)
  image_features = self.mobilenet_v2(
      ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple))
  feature_maps = self.feature_map_generator({
      'layer_15/expansion_output': image_features[0],
      'layer_19': image_features[1]})
  return feature_maps.values()
def extract_features(self, preprocessed_inputs):
  """Extract features from preprocessed inputs.

  Args:
    preprocessed_inputs: a [batch, height, width, channels] float tensor
      representing a batch of images.

  Returns:
    feature_maps: a list of tensors where the ith tensor has shape
      [batch, height_i, width_i, depth_i]
  """
  preprocessed_inputs = shape_utils.check_min_image_dim(
      33, preprocessed_inputs)

  feature_map_layout = {
      'from_layer': ['Mixed_4c', 'Mixed_5c', '', '', '', ''],
      'layer_depth': [-1, -1, 512, 256, 256, 128],
      'use_explicit_padding': self._use_explicit_padding,
      'use_depthwise': self._use_depthwise,
  }

  with slim.arg_scope(self._conv_hyperparams_fn()):
    with tf.variable_scope('InceptionV2', reuse=self._reuse_weights) as scope:
      _, image_features = inception_v2.inception_v2_base(
          ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple),
          final_endpoint='Mixed_5c',
          min_depth=self._min_depth,
          depth_multiplier=self._depth_multiplier,
          scope=scope)
      feature_maps = feature_map_generators.multi_resolution_feature_maps(
          feature_map_layout=feature_map_layout,
          depth_multiplier=self._depth_multiplier,
          min_depth=self._min_depth,
          insert_1x1_conv=True,
          image_features=image_features)
  return feature_maps.values()
def _extract_features(self, preprocessed_inputs):
  """Extract features from preprocessed inputs.

  Args:
    preprocessed_inputs: a [batch, height, width, channels] float tensor
      representing a batch of images.

  Returns:
    feature_maps: a list of tensors where the ith tensor has shape
      [batch, height_i, width_i, depth_i]
  """
  preprocessed_inputs = shape_utils.check_min_image_dim(
      33, preprocessed_inputs)

  feature_map_layout = {
      'from_layer': ['Mixed_5d', 'Mixed_6e', 'Mixed_7c', '', '', ''],
      'layer_depth': [-1, -1, -1, 512, 256, 128],
      'use_explicit_padding': self._use_explicit_padding,
      'use_depthwise': self._use_depthwise,
  }

  with slim.arg_scope(self._conv_hyperparams):
    with tf.variable_scope('InceptionV3', reuse=self._reuse_weights) as scope:
      _, image_features = inception_v3.inception_v3_base(
          ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple),
          final_endpoint='Mixed_7c',
          min_depth=self._min_depth,
          depth_multiplier=self._depth_multiplier,
          scope=scope)
      feature_maps = feature_map_generators.multi_resolution_feature_maps(
          feature_map_layout=feature_map_layout,
          depth_multiplier=self._depth_multiplier,
          min_depth=self._min_depth,
          insert_1x1_conv=True,
          image_features=image_features)
  return feature_maps.values()
def extract_features(self, preprocessed_inputs):
  """Extract features from preprocessed inputs.

  Args:
    preprocessed_inputs: a [batch, height, width, channels] float tensor
      representing a batch of images.

  Returns:
    feature_maps: a list of tensors where the ith tensor has shape
      [batch, height_i, width_i, depth_i]
  """
  preprocessed_inputs = shape_utils.check_min_image_dim(
      33, preprocessed_inputs)
  with tf.variable_scope('MobilenetV1', reuse=self._reuse_weights) as scope:
    with slim.arg_scope(
        mobilenet_v1.mobilenet_v1_arg_scope(
            is_training=None, regularize_depthwise=True)):
      with (slim.arg_scope(self._conv_hyperparams_fn())
            if self._override_base_feature_extractor_hyperparams else
            context_manager.IdentityContextManager()):
        _, image_features = mobilenet_v1.mobilenet_v1_base(
            ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple),
            final_endpoint='Conv2d_13_pointwise',
            min_depth=self._min_depth,
            depth_multiplier=self._depth_multiplier,
            use_explicit_padding=self._use_explicit_padding,
            scope=scope)
    with slim.arg_scope(self._conv_hyperparams_fn()):
      feature_maps = feature_map_generators.pooling_pyramid_feature_maps(
          base_feature_map_depth=0,
          num_layers=6,
          image_features={
              'image_features': image_features['Conv2d_11_pointwise']
          })
  return feature_maps.values()
def _extract_proposal_features(self, preprocessed_inputs, scope):
  """Extracts first stage RPN features.

  Args:
    preprocessed_inputs: A [batch, height, width, channels] float32 tensor
      representing a batch of images.
    scope: A scope name.

  Returns:
    rpn_feature_map: A tensor with shape [batch, height, width, depth]
    activations: A dictionary mapping feature extractor tensor names to
      tensors

  Raises:
    InvalidArgumentError: If the spatial size of `preprocessed_inputs`
      (height or width) is less than 33.
    ValueError: If the created network is missing the required activation.
  """
  preprocessed_inputs = shape_utils.check_min_image_dim(
      33, preprocessed_inputs)

  with tf.variable_scope('MobilenetV3', reuse=self._reuse_weights) as scope:
    with slim.arg_scope(
        mobilenet_v3.training_scope(is_training=None, bn_decay=0.9997)), \
        slim.arg_scope(
            [mobilenet.depth_multiplier], min_depth=self._min_depth):
      _, activations = mobilenet_v3.mobilenet_base(
          preprocessed_inputs,
          conv_defs=mobilenet_v3.V3_LARGE_DETECTION,
          final_endpoint='layer_17',
          depth_multiplier=self._depth_multiplier,
          scope=scope)
  return activations['layer_17'], activations
def extract_features(self, preprocessed_inputs):
  """Extract features from preprocessed inputs.

  Args:
    preprocessed_inputs: a [batch, height, width, channels] float tensor
      representing a batch of images.

  Returns:
    feature_maps: a list of tensors where the ith tensor has shape
      [batch, height_i, width_i, depth_i]
  """
  preprocessed_inputs = shape_utils.check_min_image_dim(
      33, preprocessed_inputs)
  with tf.variable_scope('MobilenetV2', reuse=self._reuse_weights) as scope:
    with slim.arg_scope(
        mobilenet_v2.training_scope(is_training=None, bn_decay=0.99)), \
        slim.arg_scope(
            [mobilenet.depth_multiplier], min_depth=self._min_depth):
      with slim.arg_scope(
          training_scope(l2_weight_decay=4e-5,
                         is_training=self._is_training)):
        _, image_features = mobilenet_v2.mobilenet_base(
            ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple),
            final_endpoint='layer_18',
            depth_multiplier=self._depth_multiplier,
            use_explicit_padding=self._use_explicit_padding,
            scope=scope)

  multiplier_func = functools.partial(
      _apply_multiplier,
      multiplier=self._depth_multiplier,
      min_depth=self._min_depth)
  with tf.variable_scope('MnasFPN', reuse=self._reuse_weights):
    with slim.arg_scope(
        training_scope(l2_weight_decay=1e-4, is_training=self._is_training)):
      # Create C6 by downsampling C5.
      c6 = slim.max_pool2d(
          _maybe_pad(image_features['layer_18'], self._use_explicit_padding),
          [3, 3],
          stride=[2, 2],
          padding='VALID' if self._use_explicit_padding else 'SAME',
          scope='C6_downsample')
      c6 = slim.conv2d(
          c6,
          multiplier_func(self._fpn_layer_depth),
          [1, 1],
          activation_fn=tf.identity,
          normalizer_fn=slim.batch_norm,
          weights_regularizer=None,  # this 1x1 has no kernel regularizer.
          padding='VALID',
          scope='C6_Conv1x1')
      image_features['C6'] = tf.identity(c6)  # Needed for quantization.

      # Log the feature map shapes at info level (not error) for debugging.
      for k in sorted(image_features.keys()):
        tf.logging.info('{}: {}'.format(k, image_features[k]))

      mnasfpn_inputs = [
          image_features['layer_7'],   # C3
          image_features['layer_14'],  # C4
          image_features['layer_18'],  # C5
          image_features['C6']         # C6
      ]
      self._verify_config(mnasfpn_inputs)
      feature_maps = mnasfpn(
          mnasfpn_inputs,
          head_def=self._head_def,
          output_channel=self._fpn_layer_depth,
          use_explicit_padding=self._use_explicit_padding,
          use_native_resize_op=self._use_native_resize_op,
          multiplier_func=multiplier_func)
  return feature_maps
def extract_features(self, preprocessed_inputs):
  """Extract features from preprocessed inputs.

  Args:
    preprocessed_inputs: a [batch, height, width, channels] float tensor
      representing a batch of images.

  Returns:
    feature_maps: a list of tensors where the ith tensor has shape
      [batch, height_i, width_i, depth_i]
  """
  preprocessed_inputs = shape_utils.check_min_image_dim(
      33, preprocessed_inputs)

  depth = lambda d: max(int(d * self._depth_multiplier), self._min_depth)
  trunc_normal = lambda stddev: tf.truncated_normal_initializer(0.0, stddev)

  # Convolutional autoencoder: encoder.
  encoder_1_conv = slim.conv2d(preprocessed_inputs, depth(64), [3, 3],
                               weights_initializer=trunc_normal(0.09),
                               scope='encoder_c1_conv')
  # encoder_1_pool = slim.max_pool2d(encoder_1_conv, [2, 2], stride=2,
  #                                  scope='encoder_c1_pool')
  # encoder_1_dropout1 = slim.dropout(encoder_1_pool, 0.7,
  #                                   scope='encoder_c1_dropout1')
  encoder_2_conv = slim.conv2d(encoder_1_conv, depth(128), [5, 5],
                               weights_initializer=trunc_normal(0.09),
                               scope='encoder_c2_conv')
  encoder_3_conv = slim.conv2d(encoder_2_conv, depth(128), [5, 5],
                               weights_initializer=trunc_normal(0.09),
                               scope='encoder_c3_conv')

  # Decoder.
  decoder_3_deconv = slim.conv2d_transpose(
      encoder_3_conv, depth(128), [5, 5],
      weights_initializer=trunc_normal(0.09),
      scope='decoder_c3_deconv')
  decoder_2_deconv = slim.conv2d_transpose(
      decoder_3_deconv, depth(128), [5, 5],
      weights_initializer=trunc_normal(0.09),
      scope='decoder_c2_deconv')
  decoder_1_deconv = slim.conv2d_transpose(
      decoder_2_deconv, depth(1), [3, 3],
      weights_initializer=trunc_normal(0.09),
      scope='decoder_c1_deconv')

  feature_map_layout = {
      'from_layer': ['Mixed_4c', 'Mixed_5c', '', '', '', ''],
      'layer_depth': [-1, -1, 512, 256, 256, 128],
      'use_explicit_padding': self._use_explicit_padding,
      'use_depthwise': self._use_depthwise,
  }

  with slim.arg_scope(self._conv_hyperparams_fn()):
    with tf.variable_scope('InceptionV2', reuse=self._reuse_weights) as scope:
      _, image_features = inception_v2.inception_v2_base(
          ops.pad_to_multiple(decoder_1_deconv, self._pad_to_multiple),
          final_endpoint='Mixed_5c',
          min_depth=self._min_depth,
          depth_multiplier=self._depth_multiplier,
          scope=scope)
      feature_maps = feature_map_generators.multi_resolution_feature_maps(
          feature_map_layout=feature_map_layout,
          depth_multiplier=self._depth_multiplier,
          min_depth=self._min_depth,
          insert_1x1_conv=True,
          image_features=image_features)
  return feature_maps.values()
def extract_features(self, preprocessed_inputs):
  """Extract features from preprocessed inputs.

  Args:
    preprocessed_inputs: a [batch, height, width, channels] float tensor
      representing a batch of images.

  Returns:
    feature_maps: a list of tensors where the ith tensor has shape
      [batch, height_i, width_i, depth_i]
  """
  preprocessed_inputs = shape_utils.check_min_image_dim(
      33, preprocessed_inputs)
  with tf.variable_scope('MobilenetV1', reuse=self._reuse_weights) as scope:
    with slim.arg_scope(
        mobilenet_v1.mobilenet_v1_arg_scope(
            is_training=None, regularize_depthwise=True)):
      with (slim.arg_scope(self._conv_hyperparams_fn())
            if self._override_base_feature_extractor_hyperparams else
            context_manager.IdentityContextManager()):
        _, image_features = mobilenet_v1.mobilenet_v1_base(
            ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple),
            final_endpoint='Conv2d_13_pointwise',
            min_depth=self._min_depth,
            depth_multiplier=self._depth_multiplier,
            use_explicit_padding=self._use_explicit_padding,
            scope=scope)
    depth_fn = lambda d: max(int(d * self._depth_multiplier), self._min_depth)
    with slim.arg_scope(self._conv_hyperparams_fn()):
      with tf.variable_scope('fpn', reuse=self._reuse_weights):
        feature_blocks = [
            'Conv2d_3_pointwise', 'Conv2d_5_pointwise', 'Conv2d_11_pointwise',
            'Conv2d_13_pointwise'
        ]
        base_fpn_max_level = min(self._fpn_max_level, 5)
        feature_block_list = []
        for level in range(self._fpn_min_level, base_fpn_max_level + 1):
          feature_block_list.append(feature_blocks[level - 2])
        fpn_features = feature_map_generators.fpn_top_down_feature_maps(
            [(key, image_features[key]) for key in feature_block_list],
            depth=depth_fn(256))
        feature_maps = []
        for level in range(self._fpn_min_level, base_fpn_max_level + 1):
          feature_maps.append(fpn_features['top_down_{}'.format(
              feature_blocks[level - 2])])
        last_feature_map = fpn_features['top_down_{}'.format(
            feature_blocks[base_fpn_max_level - 2])]
        # Construct coarse features.
        for i in range(base_fpn_max_level + 1, self._fpn_max_level + 1):
          last_feature_map = slim.conv2d(
              last_feature_map,
              num_outputs=depth_fn(256),
              kernel_size=[3, 3],
              stride=2,
              padding='SAME',
              scope='bottom_up_Conv2d_{}'.format(i - base_fpn_max_level + 13))
          feature_maps.append(last_feature_map)
  return feature_maps
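# Reading the FPN wiring above: feature_blocks is indexed by (level - 2), so
# pyramid level 2 corresponds to 'Conv2d_3_pointwise', level 3 to
# 'Conv2d_5_pointwise', level 4 to 'Conv2d_11_pointwise', and level 5 to
# 'Conv2d_13_pointwise'. Levels past base_fpn_max_level (up to
# self._fpn_max_level) are synthesized by the stride-2 'bottom_up_Conv2d_*'
# convolutions in the final loop.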
def extract_features(self, preprocessed_inputs):
  """Extract features from preprocessed inputs.

  Args:
    preprocessed_inputs: a [batch, height, width, channels] float tensor
      representing a batch of images.

  Returns:
    feature_maps: a list of tensors where the ith tensor has shape
      [batch, height_i, width_i, depth_i]

  Raises:
    ValueError: if conv_defs is not provided or from_layer does not meet the
      size requirement.
  """
  if not self._conv_defs:
    raise ValueError('Must provide backbone conv defs.')

  if len(self._from_layer) != 2:
    raise ValueError('SSD input feature names are not provided.')

  preprocessed_inputs = shape_utils.check_min_image_dim(
      33, preprocessed_inputs)

  feature_map_layout = {
      'from_layer': [self._from_layer[0], self._from_layer[1], '', '', '',
                     ''],
      # [-1, -1, 512, 256, 256, 128]
      'layer_depth': [-1, -1, 128, 128, 128, 128],
      'use_depthwise': self._use_depthwise,
      'use_explicit_padding': self._use_explicit_padding,
  }

  with tf.variable_scope(
      self._scope_name, reuse=self._reuse_weights) as scope:
    with slim.arg_scope(
        mobilenet_v3.training_scope(is_training=None, bn_decay=0.9997)), \
        slim.arg_scope(
            [mobilenet.depth_multiplier], min_depth=self._min_depth):
      with (slim.arg_scope(self._conv_hyperparams_fn())
            if self._override_base_feature_extractor_hyperparams else
            context_manager.IdentityContextManager()):
        _, image_features = mobilenet_v3.mobilenet_base(
            ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple),
            conv_defs=self._conv_defs,
            final_endpoint=self._from_layer[1],
            depth_multiplier=self._depth_multiplier,
            use_explicit_padding=self._use_explicit_padding,
            scope=scope)
      with slim.arg_scope(self._conv_hyperparams_fn()):
        feature_maps = feature_map_generators.multi_resolution_feature_maps(
            feature_map_layout=feature_map_layout,
            depth_multiplier=self._depth_multiplier,
            min_depth=self._min_depth,
            insert_1x1_conv=True,
            image_features=image_features)
  return feature_maps.values()
def extract_features(self, preprocessed_inputs):
  """Extract features from preprocessed inputs.

  Args:
    preprocessed_inputs: a [batch, height, width, channels] float tensor
      representing a batch of images.

  Returns:
    feature_maps: a list of tensors where the ith tensor has shape
      [batch, height_i, width_i, depth_i]

  Raises:
    ValueError: depth multiplier is not supported.
  """
  if self._depth_multiplier != 1.0:
    raise ValueError('Depth multiplier not supported.')

  preprocessed_inputs = shape_utils.check_min_image_dim(
      129, preprocessed_inputs)

  with tf.variable_scope(
      self._resnet_scope_name, reuse=self._reuse_weights) as scope:
    with slim.arg_scope(resnet_v2.resnet_arg_scope()):
      with (slim.arg_scope(self._conv_hyperparams_fn())
            if self._override_base_feature_extractor_hyperparams else
            context_manager.IdentityContextManager()):
        _, image_features = self._resnet_base_fn(
            inputs=ops.pad_to_multiple(preprocessed_inputs,
                                       self._pad_to_multiple),
            num_classes=None,
            is_training=None,
            global_pool=False,
            output_stride=None,
            # store_non_strided_activations=True,
            scope=scope)
    image_features = self._filter_features(image_features)
    with slim.arg_scope(self._conv_hyperparams_fn()):
      with tf.variable_scope(self._fpn_scope_name, reuse=self._reuse_weights):
        base_fpn_max_level = min(self._fpn_max_level, 5)
        feature_block_list = []
        for level in range(self._fpn_min_level, base_fpn_max_level + 1):
          feature_block_list.append('block{}'.format(level - 1))
        fpn_features = feature_map_generators.fpn_top_down_feature_maps(
            [(key, image_features[key]) for key in feature_block_list],
            depth=self._additional_layer_depth,
            use_explicit_padding=True)
        feature_maps = []
        for level in range(self._fpn_min_level, base_fpn_max_level + 1):
          feature_maps.append(
              fpn_features['top_down_block{}'.format(level - 1)])
        last_feature_map = fpn_features['top_down_block{}'.format(
            base_fpn_max_level - 1)]
        # Construct coarse features.
        for i in range(base_fpn_max_level, self._fpn_max_level):
          last_feature_map = slim.conv2d(
              last_feature_map,
              num_outputs=self._additional_layer_depth,
              kernel_size=[3, 3],
              stride=2,
              padding='SAME',
              scope='bottom_up_block{}'.format(i))
          feature_maps.append(last_feature_map)
  return feature_maps
def extract_features(self, preprocessed_inputs):
  """Extract features from preprocessed inputs.

  Args:
    preprocessed_inputs: a [batch, height, width, channels] float tensor
      representing a batch of images.

  Returns:
    feature_maps: a list of tensors where the ith tensor has shape
      [batch, height_i, width_i, depth_i]
  """
  preprocessed_inputs = shape_utils.check_min_image_dim(
      33, preprocessed_inputs)
  with tf.variable_scope('MobilenetV2', reuse=self._reuse_weights) as scope:
    with slim.arg_scope(
        mobilenet_v2.training_scope(is_training=None, bn_decay=0.9997)), \
        slim.arg_scope(
            [mobilenet.depth_multiplier], min_depth=self._min_depth):
      with (slim.arg_scope(self._conv_hyperparams_fn())
            if self._override_base_feature_extractor_hyperparams else
            context_manager.IdentityContextManager()):
        _, image_features = mobilenet_v2.mobilenet_base(
            ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple),
            final_endpoint='layer_19',
            depth_multiplier=self._depth_multiplier,
            conv_defs=_CONV_DEFS if self._use_depthwise else None,
            use_explicit_padding=self._use_explicit_padding,
            scope=scope)
    depth_fn = lambda d: max(int(d * self._depth_multiplier), self._min_depth)
    with slim.arg_scope(self._conv_hyperparams_fn()):
      with tf.variable_scope('fpn', reuse=self._reuse_weights):
        feature_blocks = ['layer_4', 'layer_7', 'layer_14', 'layer_19']
        base_fpn_max_level = min(self._fpn_max_level, 5)
        feature_block_list = []
        for level in range(self._fpn_min_level, base_fpn_max_level + 1):
          feature_block_list.append(feature_blocks[level - 2])
        fpn_features = feature_map_generators.fpn_top_down_feature_maps(
            [(key, image_features[key]) for key in feature_block_list],
            depth=depth_fn(self._additional_layer_depth),
            use_depthwise=self._use_depthwise)
        feature_maps = []
        for level in range(self._fpn_min_level, base_fpn_max_level + 1):
          feature_maps.append(fpn_features['top_down_{}'.format(
              feature_blocks[level - 2])])
        last_feature_map = fpn_features['top_down_{}'.format(
            feature_blocks[base_fpn_max_level - 2])]
        # Construct coarse features.
        for i in range(base_fpn_max_level + 1, self._fpn_max_level + 1):
          if self._use_depthwise:
            conv_op = functools.partial(
                slim.separable_conv2d, depth_multiplier=1)
          else:
            conv_op = slim.conv2d
          last_feature_map = conv_op(
              last_feature_map,
              num_outputs=depth_fn(self._additional_layer_depth),
              kernel_size=[3, 3],
              stride=2,
              padding='SAME',
              scope='bottom_up_Conv2d_{}'.format(i - base_fpn_max_level + 19))
          feature_maps.append(last_feature_map)
  return feature_maps
def extract_features(self,
                     preprocessed_inputs,
                     state_saver=None,
                     state_name='lstm_state',
                     unroll_length=5,
                     scope=None):
  """Extracts features from preprocessed inputs.

  The features include the base network features, lstm features and SSD
  features, organized in the following name scope:

  <parent scope>/MobilenetV1/...
  <parent scope>/LSTM/...
  <parent scope>/FeatureMaps/...

  Args:
    preprocessed_inputs: A [batch, height, width, channels] float tensor
      representing a batch of consecutive frames from video clips.
    state_saver: A state saver object with methods `state` and `save_state`.
    state_name: A python string for the name to use with the state_saver.
    unroll_length: The number of steps to unroll the lstm.
    scope: The scope for the base network of the feature extractor.

  Returns:
    A list of tensors where the ith tensor has shape [batch, height_i,
    width_i, depth_i]
  """
  preprocessed_inputs = shape_utils.check_min_image_dim(
      33, preprocessed_inputs)
  with slim.arg_scope(
      mobilenet_v1.mobilenet_v1_arg_scope(is_training=self._is_training)):
    with (slim.arg_scope(self._conv_hyperparams_fn())
          if self._override_base_feature_extractor_hyperparams else
          context_manager.IdentityContextManager()):
      with slim.arg_scope([slim.batch_norm], fused=False):
        # Base network.
        with tf.variable_scope(
            scope, self._base_network_scope,
            reuse=self._reuse_weights) as scope:
          net, image_features = mobilenet_v1.mobilenet_v1_base(
              ops.pad_to_multiple(preprocessed_inputs,
                                  self._pad_to_multiple),
              final_endpoint='Conv2d_13_pointwise',
              min_depth=self._min_depth,
              depth_multiplier=self._depth_multiplier,
              scope=scope)

  with slim.arg_scope(self._conv_hyperparams_fn()):
    with slim.arg_scope(
        [slim.batch_norm], fused=False, is_training=self._is_training):
      # ConvLSTM layers.
      with tf.variable_scope('LSTM', reuse=self._reuse_weights) as lstm_scope:
        lstm_cell = lstm_cells.BottleneckConvLSTMCell(
            filter_size=(3, 3),
            output_size=(net.shape[1].value, net.shape[2].value),
            num_units=max(self._min_depth, self._lstm_state_depth),
            activation=tf.nn.relu6,
            visualize_gates=True)

        net_seq = list(tf.split(net, unroll_length))
        if state_saver is None:
          init_state = lstm_cell.init_state(
              state_name, net.shape[0].value / unroll_length, tf.float32)
        else:
          c = state_saver.state('%s_c' % state_name)
          h = state_saver.state('%s_h' % state_name)
          init_state = (c, h)

        # Identities added for inputing state tensors externally.
        c_ident = tf.identity(init_state[0], name='lstm_state_in_c')
        h_ident = tf.identity(init_state[1], name='lstm_state_in_h')
        init_state = (c_ident, h_ident)

        net_seq, states_out = rnn_decoder.rnn_decoder(
            net_seq, init_state, lstm_cell, scope=lstm_scope)
        batcher_ops = None
        self._states_out = states_out
        if state_saver is not None:
          self._step = state_saver.state('%s_step' % state_name)
          batcher_ops = [
              state_saver.save_state('%s_c' % state_name, states_out[-1][0]),
              state_saver.save_state('%s_h' % state_name, states_out[-1][1]),
              state_saver.save_state('%s_step' % state_name, self._step - 1)
          ]
        with tf_ops.control_dependencies(batcher_ops):
          image_features['Conv2d_13_pointwise_lstm'] = tf.concat(net_seq, 0)

        # Identities added for reading output states, to be reused externally.
        tf.identity(states_out[-1][0], name='lstm_state_out_c')
        tf.identity(states_out[-1][1], name='lstm_state_out_h')

      # SSD layers.
      with tf.variable_scope('FeatureMaps', reuse=self._reuse_weights):
        feature_maps = feature_map_generators.multi_resolution_feature_maps(
            feature_map_layout=self._feature_map_layout,
            depth_multiplier=self._depth_multiplier,
            min_depth=self._min_depth,
            insert_1x1_conv=True,
            image_features=image_features)
  return feature_maps.values()
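# Hedged usage note for the state identities created above (the session setup
# is an assumption for illustration, not part of the extractor): the final
# LSTM state can be fetched via the 'lstm_state_out_c'/'lstm_state_out_h'
# tensors and fed back through 'lstm_state_in_c'/'lstm_state_in_h' to carry
# state across separately executed video chunks, e.g.
#   c, h = sess.run(['lstm_state_out_c:0', 'lstm_state_out_h:0'], feed_dict)
#   feed_dict.update({'lstm_state_in_c:0': c, 'lstm_state_in_h:0': h})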
def extract_features(self, preprocessed_inputs):
  """Extract features from preprocessed inputs.

  Args:
    preprocessed_inputs: a [batch, height, width, channels] float tensor
      representing a batch of images.

  Returns:
    feature_maps: a list of tensors where the ith tensor has shape
      [batch, height_i, width_i, depth_i]
  """
  preprocessed_inputs = shape_utils.check_min_image_dim(
      33, preprocessed_inputs)

  feature_map_layout = {
      'from_layer': ['vgg_16/conv4_3_norm', 'vgg_16/fc7', 'vgg_16/conv6_2',
                     'vgg_16/conv7_2', 'vgg_16/conv8_2', 'vgg_16/conv9_2'],
      'layer_depth': [-1, -1, -1, -1, -1, -1],
      'use_explicit_padding': self._use_explicit_padding,
      'use_depthwise': self._use_depthwise,
  }

  # For a 300x300 input, vgg_16 with final_endpoint='pool5' exposes endpoints
  # from 'vgg_16/conv1/conv1_1' (300x300x64) down to 'vgg_16/pool5'
  # (18x18x512); `net` below is the pool5 output.
  net, image_features = vgg.vgg_16(
      ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple),
      final_endpoint='pool5',
      spatial_squeeze=False)
  # TODO: double-check the scale filler against the Caffe reference.
  image_features['vgg_16/conv4_3_norm'] = custom_layers.l2_normalization(
      image_features['vgg_16/conv4/conv4_3'],
      scaling=True,
      scope='vgg_16/conv4_3_norm')

  with slim.arg_scope(self._conv_hyperparams):
    with tf.variable_scope('vgg_16', reuse=self._reuse_weights) as scope:
      end_points_collection = scope.original_name_scope + '_end_points'
      with slim.arg_scope(
          [slim.conv2d, slim.fully_connected, slim.max_pool2d],
          outputs_collections=end_points_collection):
        # fc6 mirrors the Caffe SSD fc6 layer: a 3x3 convolution with 1024
        # outputs and dilation 6 (pad 6), i.e. an atrous convolution.
        net = slim.convolution(net, 1024, [3, 3], padding='SAME', rate=6,
                               scope='fc6')
        # fc7 mirrors the Caffe SSD fc7 layer: a 1x1 convolution with 1024
        # outputs.
        net = slim.conv2d(net, 1024, [1, 1], padding='SAME', scope='fc7')
        net = slim.conv2d(net, 256, [1, 1], padding='SAME', scope='conv6_1')
        net = slim.conv2d(net, 512, [3, 3], padding='SAME', stride=2,
                          scope='conv6_2')
        net = slim.conv2d(net, 128, [1, 1], padding='SAME', scope='conv7_1')
        net = slim.conv2d(net, 256, [3, 3], padding='SAME', stride=2,
                          scope='conv7_2')
        net = slim.conv2d(net, 128, [1, 1], padding='VALID', scope='conv8_1')
        net = slim.conv2d(net, 256, [3, 3], padding='VALID', stride=1,
                          scope='conv8_2')
        net = slim.conv2d(net, 128, [1, 1], padding='VALID', scope='conv9_1')
        net = slim.conv2d(net, 256, [3, 3], padding='VALID', stride=1,
                          scope='conv9_2')
      _image_features_new = slim.utils.convert_collection_to_dict(
          end_points_collection)
      for k, v in _image_features_new.items():
        image_features[k] = v

      feature_maps = feature_map_generators.multi_resolution_feature_maps(
          feature_map_layout=feature_map_layout,
          depth_multiplier=self._depth_multiplier,
          min_depth=self._min_depth,
          insert_1x1_conv=True,
          image_features=image_features)
  return feature_maps.values()
def extract_features(self,
                     preprocessed_inputs,
                     state_saver=None,
                     state_name='lstm_state',
                     unroll_length=10,
                     scope=None):
  """Extract features from preprocessed inputs.

  The features include the base network features, lstm features and SSD
  features, organized in the following name scope:

  <scope>/MobilenetV2_1/...
  <scope>/MobilenetV2_2/...
  <scope>/LSTM/...
  <scope>/FeatureMap/...

  Args:
    preprocessed_inputs: a [batch, height, width, channels] float tensor
      representing a batch of consecutive frames from video clips.
    state_saver: A state saver object with methods `state` and `save_state`.
    state_name: Python string, the name to use with the state_saver.
    unroll_length: number of steps to unroll the lstm.
    scope: Scope for the base network of the feature extractor.

  Returns:
    feature_maps: a list of tensors where the ith tensor has shape
      [batch, height_i, width_i, depth_i]

  Raises:
    ValueError: if interleave_method not recognized or large and small base
      network output feature maps of different sizes.
  """
  preprocessed_inputs = shape_utils.check_min_image_dim(
      33, preprocessed_inputs)
  preprocessed_inputs = ops.pad_to_multiple(
      preprocessed_inputs, self._pad_to_multiple)
  batch_size = preprocessed_inputs.shape[0].value / unroll_length
  batch_axis = 0
  nets = []

  # Batch processing of mobilenet features.
  with slim.arg_scope(mobilenet_v2.training_scope(
      is_training=self._is_training,
      bn_decay=0.9997)), \
      slim.arg_scope([mobilenet.depth_multiplier],
                     min_depth=self._min_depth, divisible_by=8):
    # Big model.
    net, _ = self.extract_base_features_large(preprocessed_inputs)
    nets.append(net)
    large_base_feature_shape = net.shape

    # Small models.
    net, _ = self.extract_base_features_small(preprocessed_inputs)
    nets.append(net)
    small_base_feature_shape = net.shape
    if not (large_base_feature_shape[1] == small_base_feature_shape[1] and
            large_base_feature_shape[2] == small_base_feature_shape[2]):
      raise ValueError('Large and Small base network feature map dimension '
                       'not equal!')

  with slim.arg_scope(self._conv_hyperparams_fn()):
    with tf.variable_scope('LSTM', reuse=self._reuse_weights):
      output_size = (large_base_feature_shape[1], large_base_feature_shape[2])
      lstm_cell, init_state, step = self.create_lstm_cell(
          batch_size, output_size, state_saver, state_name)

      nets_seq = [
          tf.split(net, unroll_length, axis=batch_axis) for net in nets
      ]

      net_seq, states_out = rnn_decoder.multi_input_rnn_decoder(
          nets_seq,
          init_state,
          lstm_cell,
          step,
          selection_strategy=self._interleave_method,
          is_training=self._is_training,
          is_quantized=self._is_quantized,
          pre_bottleneck=self._pre_bottleneck,
          flatten_state=self._flatten_state,
          scope=None)
      self._states_out = states_out

      batcher_ops = None
      if state_saver is not None:
        self._step = state_saver.state(state_name + '_step')
        batcher_ops = [
            state_saver.save_state(state_name + '_c', states_out[-1][0]),
            state_saver.save_state(state_name + '_h', states_out[-1][1]),
            state_saver.save_state(state_name + '_step', self._step + 1)
        ]
      image_features = {}
      with tf_ops.control_dependencies(batcher_ops):
        image_features['layer_19'] = tf.concat(net_seq, 0)

      # SSD layers.
      with tf.variable_scope('FeatureMap'):
        feature_maps = feature_map_generators.multi_resolution_feature_maps(
            feature_map_layout=self._feature_map_layout,
            depth_multiplier=self._depth_multiplier,
            min_depth=self._min_depth,
            insert_1x1_conv=True,
            image_features=image_features,
            pool_residual=True)
  return feature_maps.values()
def graph_fn(input_tensor):
  return shape_utils.check_min_image_dim(33, input_tensor)
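# A hedged sketch of how graph_fn above might be driven; the self.execute
# helper and the surrounding test harness are assumptions here, not part of
# the snippet. For a 42x42 input the op is a pass-through:
#   out = self.execute(graph_fn, [np.zeros((1, 42, 42, 3), np.float32)])
#   self.assertAllEqual(out.shape, (1, 42, 42, 3))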
def extract_features(self, preprocessed_inputs):
  """Extract features from preprocessed inputs.

  Args:
    preprocessed_inputs: a [batch, height, width, channels] float tensor
      representing a batch of images.

  Returns:
    feature_maps: a list of tensors where the ith tensor has shape
      [batch, height_i, width_i, depth_i]
  """
  preprocessed_inputs = shape_utils.check_min_image_dim(
      33, preprocessed_inputs)
  with tf.variable_scope('MobilenetV2', reuse=self._reuse_weights) as scope:
    with slim.arg_scope(
        mobilenet_v2.training_scope(is_training=None, bn_decay=0.9997)), \
        slim.arg_scope(
            [mobilenet.depth_multiplier], min_depth=self._min_depth):
      with (slim.arg_scope(self._conv_hyperparams_fn())
            if self._override_base_feature_extractor_hyperparams else
            context_manager.IdentityContextManager()):
        _, image_features = mobilenet_v2.mobilenet_base(
            ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple),
            final_endpoint='layer_19',
            depth_multiplier=self._depth_multiplier,
            conv_defs=self._conv_defs,
            use_explicit_padding=self._use_explicit_padding,
            scope=scope)
    depth_fn = lambda d: max(int(d * self._depth_multiplier), self._min_depth)
    with slim.arg_scope(self._conv_hyperparams_fn()):
      with tf.variable_scope('fpn', reuse=self._reuse_weights):
        feature_blocks = ['layer_4', 'layer_7', 'layer_14', 'layer_19']
        base_fpn_max_level = min(self._fpn_max_level, 5)
        feature_block_list = []
        for level in range(self._fpn_min_level, base_fpn_max_level + 1):
          feature_block_list.append(feature_blocks[level - 2])
        fpn_features = feature_map_generators.fpn_top_down_feature_maps(
            [(key, image_features[key]) for key in feature_block_list],
            depth=depth_fn(self._additional_layer_depth),
            use_depthwise=self._use_depthwise,
            use_explicit_padding=self._use_explicit_padding)
        feature_maps = []
        for level in range(self._fpn_min_level, base_fpn_max_level + 1):
          feature_maps.append(fpn_features['top_down_{}'.format(
              feature_blocks[level - 2])])
        last_feature_map = fpn_features['top_down_{}'.format(
            feature_blocks[base_fpn_max_level - 2])]
        # Construct coarse features.
        padding = 'VALID' if self._use_explicit_padding else 'SAME'
        kernel_size = 3
        for i in range(base_fpn_max_level + 1, self._fpn_max_level + 1):
          if self._use_depthwise:
            conv_op = functools.partial(
                slim.separable_conv2d, depth_multiplier=1)
          else:
            conv_op = slim.conv2d
          if self._use_explicit_padding:
            last_feature_map = ops.fixed_padding(
                last_feature_map, kernel_size)
          last_feature_map = conv_op(
              last_feature_map,
              num_outputs=depth_fn(self._additional_layer_depth),
              kernel_size=[kernel_size, kernel_size],
              stride=2,
              padding=padding,
              scope='bottom_up_Conv2d_{}'.format(i - base_fpn_max_level + 19))
          feature_maps.append(last_feature_map)
  return feature_maps