def resample_feature_map(feat,
                         level,
                         target_level,
                         is_training,
                         target_feat_dims=256,
                         conv2d_op=tf.layers.conv2d,
                         batch_norm_relu=nn_ops.BatchNormRelu(),
                         name=None):
  """Resample input feature map to have target number of channels and width."""
  feat_dims = feat.get_shape().as_list()[3]
  with tf.variable_scope('resample_{}'.format(name)):
    if feat_dims != target_feat_dims:
      feat = conv2d_op(
          feat, filters=target_feat_dims, kernel_size=(1, 1), padding='same')
      feat = batch_norm_relu(
          feat, is_training=is_training, relu=False, name='bn')
    if level < target_level:
      stride = int(2**(target_level - level))
      feat = tf.layers.max_pooling2d(
          inputs=feat,
          pool_size=stride,
          strides=[stride, stride],
          padding='SAME')
    elif level > target_level:
      scale = int(2**(level - target_level))
      feat = spatial_transform_ops.nearest_upsampling(feat, scale=scale)
  return feat
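# Illustrative sketch only (hypothetical helper, not part of this module): the
# behavior assumed of `spatial_transform_ops.nearest_upsampling` above is to
# repeat each spatial cell `scale` times along height and width, e.g. via a
# reshape-and-broadcast trick. The real op may be implemented differently;
# this stand-in only shows the intended semantics and reuses the module's
# TF1-style `tf` import.
def _nearest_upsampling_sketch(data, scale):
  """Nearest-neighbor upsamples a [batch, height, width, channels] tensor."""
  _, height, width, channels = data.get_shape().as_list()
  # Insert singleton axes after H and W, broadcast each by `scale`, then fold
  # the new axes back into the spatial dimensions.
  data = tf.reshape(data, [-1, height, 1, width, 1, channels]) * tf.ones(
      [1, 1, scale, 1, scale, 1], dtype=data.dtype)
  return tf.reshape(data, [-1, height * scale, width * scale, channels])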
def resample_with_sepconv(feat,
                          target_width,
                          target_num_filters,
                          use_native_resize_op=False,
                          batch_norm_activation=nn_ops.BatchNormActivation(),
                          data_format='channels_last',
                          name=None,
                          is_training=False):
  """Match resolution and feature dimension to the target block."""
  _, height, width, num_filters = feat.get_shape().as_list()
  if width is None or num_filters is None:
    raise ValueError('Shape of feat is None (shape:{}).'.format(feat.shape))

  with tf.variable_scope('resample_with_sepconv_{}'.format(name)):
    # Down-sample.
    if width > target_width:
      if width % target_width != 0:
        raise ValueError('width ({}) is not divisible by '
                         'target_width ({}).'.format(width, target_width))
      while width > target_width:
        feat = nn_ops.depthwise_conv2d_fixed_padding(
            inputs=feat, kernel_size=3, strides=2, data_format=data_format)
        feat = batch_norm_activation(feat, is_training=is_training)
        width /= 2

    # Up-sample with NN interpolation.
    elif width < target_width:
      if target_width % width != 0:
        raise ValueError('target_width ({}) is not divisible by '
                         'width ({}).'.format(target_width, width))
      scale = target_width // width
      if use_native_resize_op:
        feat = tf.image.resize_nearest_neighbor(
            feat, [height * scale, width * scale])
      else:
        feat = spatial_transform_ops.nearest_upsampling(feat, scale=scale)

    # Match feature dimension to the target block.
    feat = nn_ops.conv2d_fixed_padding(
        inputs=feat,
        filters=target_num_filters,
        kernel_size=1,
        strides=1,
        data_format=data_format)
    feat = batch_norm_activation(feat, relu=False, is_training=is_training)

  return feat
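# Illustrative sketch only (hypothetical helper, not called by the code above):
# the down-sampling branch of `resample_with_sepconv` builds one stride-2
# depthwise conv + batch-norm block per factor of two between `width` and
# `target_width`, so the number of blocks can be worked out ahead of time.
def _num_sepconv_downsamples(width, target_width):
  """Counts the stride-2 depthwise convs needed to go from width to target_width."""
  if width % target_width != 0:
    raise ValueError('width must be divisible by target_width.')
  num_steps = 0
  while width > target_width:
    width //= 2
    num_steps += 1
  return num_steps

# Example: a level-3 feature of width 80 resampled to a level-5 block of width
# 20 passes through _num_sepconv_downsamples(80, 20) == 2 depthwise conv blocks.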
def __call__(self, multilevel_features, is_training=None):
  """Returns the FPN features for the given multilevel features.

  Args:
    multilevel_features: a `dict` containing `int` keys for continuous feature
      levels, e.g., [2, 3, 4, 5]. The values are corresponding features with
      shape [batch_size, height_l, width_l, num_filters].
    is_training: `bool` if True, the model is in training mode.

  Returns:
    a `dict` containing `int` keys for continuous feature levels
    [min_level, min_level + 1, ..., max_level]. The values are corresponding
    FPN features with shape [batch_size, height_l, width_l, fpn_feat_dims].
  """
  input_levels = list(multilevel_features.keys())
  if min(input_levels) > self._min_level:
    raise ValueError(
        'The minimum backbone level %d should be ' % (min(input_levels)) +
        'less than or equal to FPN minimum level %d.' % (self._min_level))
  backbone_max_level = min(max(input_levels), self._max_level)
  with backend.get_graph().as_default(), tf.name_scope('fpn'):
    # Adds lateral connections.
    feats_lateral = {}
    for level in range(self._min_level, backbone_max_level + 1):
      feats_lateral[level] = self._lateral_conv2d_op[level](
          multilevel_features[level])

    # Adds top-down path.
    feats = {backbone_max_level: feats_lateral[backbone_max_level]}
    for level in range(backbone_max_level - 1, self._min_level - 1, -1):
      feats[level] = spatial_transform_ops.nearest_upsampling(
          feats[level + 1], 2) + feats_lateral[level]

    # Adds post-hoc 3x3 convolution kernel.
    for level in range(self._min_level, backbone_max_level + 1):
      feats[level] = self._post_hoc_conv2d_op[level](feats[level])

    # Adds coarser FPN levels introduced for RetinaNet.
    for level in range(backbone_max_level + 1, self._max_level + 1):
      feats_in = feats[level - 1]
      if level > backbone_max_level + 1:
        feats_in = tf.nn.relu(feats_in)
      feats[level] = self._coarse_conv2d_op[level](feats_in)

    if self._use_batch_norm:
      # Adds batch_norm layer.
      for level in range(self._min_level, self._max_level + 1):
        feats[level] = self._batch_norm_relus[level](
            feats[level], is_training=is_training)
  return feats
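# Illustrative sketch (pure Python, hypothetical helper): reproduces the level
# bookkeeping of __call__ above without building any TensorFlow ops, which can
# make the lateral / top-down / coarse-level split easier to follow.
def _fpn_level_plan(input_levels, min_level, max_level):
  """Returns which pyramid levels each stage of the FPN touches."""
  backbone_max_level = min(max(input_levels), max_level)
  return {
      'lateral_and_post_hoc': list(range(min_level, backbone_max_level + 1)),
      'top_down_targets': list(
          range(backbone_max_level - 1, min_level - 1, -1)),
      'coarse': list(range(backbone_max_level + 1, max_level + 1)),
  }

# Example: _fpn_level_plan([2, 3, 4, 5], min_level=3, max_level=7) returns
# {'lateral_and_post_hoc': [3, 4, 5], 'top_down_targets': [4, 3],
#  'coarse': [6, 7]}, i.e. laterals and 3x3 convs on P3-P5, top-down additions
# into P4 then P3, and stride-2 convs producing P6 and P7 (with a ReLU only
# before the P7 conv, since 7 > backbone_max_level + 1).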
def pyramid_feature_fusion(pyramid_feats, target_level):
  """Fuse all feature maps in the feature pyramid at the target level.

  Args:
    pyramid_feats: a dictionary containing the feature pyramid.
    target_level: `int` the target feature level for feature fusion.

  Returns:
    A float Tensor of shape [batch_size, feature_height, feature_width,
      feature_channel].
  """
  min_level, max_level = min(pyramid_feats.keys()), max(pyramid_feats.keys())
  resampled_feats = []

  for l in range(min_level, max_level + 1):
    if l == target_level:
      resampled_feats.append(pyramid_feats[l])
    else:
      resampled_feat = spatial_transform_ops.nearest_upsampling(
          pyramid_feats[l], 2**(l - target_level))
      resampled_feats.append(resampled_feat)

  return tf.math.add_n(resampled_feats)
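# Note on pyramid_feature_fusion: the upsampling factor 2**(l - target_level)
# is a positive integer only when target_level is the minimum level of the
# pyramid, so the fusion is expected to target the highest-resolution level.
# Hypothetical usage sketch, assuming `fpn_feats` is the dict returned by the
# FPN __call__ above with levels 3-7:
#
#   fused_p3 = pyramid_feature_fusion(fpn_feats, target_level=3)
#   # fused_p3 has the spatial size of level 3; levels 4-7 are upsampled by
#   # 2x, 4x, 8x and 16x before the element-wise tf.math.add_n sum.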
def resample_with_alpha(feat,
                        input_block_fn,
                        target_width,
                        target_num_filters,
                        target_block_fn,
                        alpha=1.0,
                        use_native_resize_op=False,
                        batch_norm_activation=nn_ops.BatchNormActivation(),
                        data_format='channels_last',
                        name=None,
                        is_training=False):
  """Match resolution and feature dimension to the target block."""
  _, height, width, num_filters = feat.get_shape().as_list()
  if width is None or num_filters is None:
    raise ValueError('Shape of feat is None (shape:{}).'.format(feat.shape))

  if input_block_fn == 'bottleneck':
    num_filters /= 4
  new_num_filters = int(num_filters * alpha)

  with tf.variable_scope('resample_with_alpha_{}'.format(name)):
    # First 1x1 conv to reduce feature dimension to alpha*.
    feat = nn_ops.conv2d_fixed_padding(
        inputs=feat,
        filters=new_num_filters,
        kernel_size=1,
        strides=1,
        data_format=data_format)
    feat = batch_norm_activation(feat, is_training=is_training)

    # Down-sample.
    if width > target_width:
      # Apply stride-2 conv to reduce feature map size to 1/2.
      feat = nn_ops.conv2d_fixed_padding(
          inputs=feat,
          filters=new_num_filters,
          kernel_size=3,
          strides=2,
          data_format=data_format)
      feat = batch_norm_activation(feat, is_training=is_training)
      # Apply maxpool to further reduce feature map size if necessary.
      if width // target_width > 2:
        if width % target_width != 0:
          stride_size = 2
        else:
          stride_size = width // target_width // 2
        feat = tf.layers.max_pooling2d(
            inputs=feat,
            pool_size=3 if width / target_width <= 4 else 5,
            strides=stride_size,
            padding='SAME',
            data_format=data_format)
      # Use NN interpolation to resize if necessary. This could happen in
      # cases where `width` is not divisible by `target_width`.
      if feat.get_shape().as_list()[2] != target_width:
        feat = spatial_transform_ops.native_resize(
            feat, [int(target_width / width * height), target_width])

    # Up-sample with NN interpolation.
    elif width < target_width:
      if target_width % width != 0 or use_native_resize_op:
        feat = spatial_transform_ops.native_resize(
            feat, [int(target_width / width * height), target_width])
      else:
        scale = target_width // width
        feat = spatial_transform_ops.nearest_upsampling(feat, scale=scale)

    # Match feature dimension to the target block.
    if target_block_fn == 'bottleneck':
      target_num_filters *= 4
    feat = nn_ops.conv2d_fixed_padding(
        inputs=feat,
        filters=target_num_filters,
        kernel_size=1,
        strides=1,
        data_format=data_format)
    feat = batch_norm_activation(feat, relu=False, is_training=is_training)

  return feat
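# Illustrative sketch (hypothetical helper, not called above): summarizes the
# down-sampling plan chosen by resample_with_alpha for a given width pair. It
# mirrors only the branch logic; the actual op construction happens above.
def _alpha_downsample_plan(width, target_width):
  """Returns (use_maxpool, pool_size, stride) applied after the stride-2 conv."""
  if width <= target_width:
    raise ValueError('width must be larger than target_width.')
  if width // target_width <= 2:
    # The stride-2 conv alone is enough.
    return (False, None, None)
  stride = 2 if width % target_width != 0 else width // target_width // 2
  pool_size = 3 if width / target_width <= 4 else 5
  return (True, pool_size, stride)

# Example: width=64, target_width=8 gives a ratio of 8, so after the stride-2
# conv a 5x5 max-pool with stride 64 // 8 // 2 == 4 brings the map to width 8.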
def __call__(self, multilevel_features, is_training=False):
  """Returns the FPN features for the given multilevel features.

  Args:
    multilevel_features: a `dict` containing `int` keys for continuous feature
      levels, e.g., [2, 3, 4, 5]. The values are corresponding features with
      shape [batch_size, height_l, width_l, num_filters].
    is_training: `bool` if True, the model is in training mode.

  Returns:
    a `dict` containing `int` keys for continuous feature levels
    [min_level, min_level + 1, ..., max_level]. The values are corresponding
    FPN features with shape [batch_size, height_l, width_l, fpn_feat_dims].
  """
  input_levels = multilevel_features.keys()
  if min(input_levels) > self._min_level:
    raise ValueError(
        'The minimum backbone level %d should be ' % (min(input_levels)) +
        'less than or equal to FPN minimum level %d.' % (self._min_level))
  backbone_max_level = min(max(input_levels), self._max_level)
  with tf.variable_scope('fpn'):
    # Adds lateral connections.
    feats_lateral = {}
    for level in range(self._min_level, backbone_max_level + 1):
      feats_lateral[level] = self._conv2d_op(
          multilevel_features[level],
          filters=self._fpn_feat_dims,
          kernel_size=(1, 1),
          padding='same',
          name='l%d' % level)

    # Adds top-down path.
    feats = {backbone_max_level: feats_lateral[backbone_max_level]}
    for level in range(backbone_max_level - 1, self._min_level - 1, -1):
      feats[level] = spatial_transform_ops.nearest_upsampling(
          feats[level + 1], 2) + feats_lateral[level]

    # Adds post-hoc 3x3 convolution kernel.
    for level in range(self._min_level, backbone_max_level + 1):
      feats[level] = self._conv2d_op(
          feats[level],
          filters=self._fpn_feat_dims,
          strides=(1, 1),
          kernel_size=(3, 3),
          padding='same',
          name='post_hoc_d%d' % level)

    # Adds coarser FPN levels introduced for RetinaNet.
    for level in range(backbone_max_level + 1, self._max_level + 1):
      feats_in = feats[level - 1]
      if level > backbone_max_level + 1:
        feats_in = tf.nn.relu(feats_in)
      feats[level] = self._conv2d_op(
          feats_in,
          filters=self._fpn_feat_dims,
          strides=(2, 2),
          kernel_size=(3, 3),
          padding='same',
          name='p%d' % level)

    # Adds batch_norm layer.
    for level in range(self._min_level, self._max_level + 1):
      feats[level] = self._batch_norm_relu(
          feats[level],
          relu=False,
          is_training=is_training,
          name='p%d-bn' % level)
  return feats