def test_position_sensitive_with_global_pool_false_and_single_bin(self):
  num_spatial_bins = [1, 1]
  image_shape = [2, 3, 3, 4]
  crop_size = [1, 1]

  image = tf.random_uniform(image_shape)
  boxes = tf.random_uniform((6, 4))
  box_ind = tf.constant([0, 0, 0, 1, 1, 1], dtype=tf.int32)

  # Since a single bin is used and crop_size = [1, 1] (i.e., no crop resize),
  # the output is the same regardless of the global_pool value.
  ps_crop_and_pool = ops.position_sensitive_crop_regions(
      image, boxes, box_ind, crop_size, num_spatial_bins, global_pool=True)
  ps_crop = ops.position_sensitive_crop_regions(
      image, boxes, box_ind, crop_size, num_spatial_bins, global_pool=False)

  with self.test_session() as sess:
    pooled_output, unpooled_output = sess.run((ps_crop_and_pool, ps_crop))
    self.assertAllClose(pooled_output, unpooled_output)
def test_position_sensitive_with_global_pool_false_and_known_boxes(self):
  num_spatial_bins = [2, 2]
  image_shape = [2, 2, 2, 4]
  crop_size = [2, 2]

  image = tf.constant(list(range(1, 2 * 2 * 4 + 1)) * 2,
                      dtype=tf.float32, shape=image_shape)

  # The first box contains the whole image; the second box contains only the
  # first row.
  boxes = tf.constant(np.array([[0., 0., 1., 1.],
                                [0., 0., 0.5, 1.]]), dtype=tf.float32)
  box_ind = tf.constant([0, 1], dtype=tf.int32)

  expected_output = []
  # Expected output for the box containing the whole image.
  expected_output.append(
      np.reshape(np.array([[4, 7], [10, 13]]), (1, 2, 2, 1)))
  # Expected output for the box containing only the first row.
  expected_output.append(
      np.reshape(np.array([[3, 6], [7, 10]]), (1, 2, 2, 1)))
  expected_output = np.concatenate(expected_output, axis=0)

  ps_crop = ops.position_sensitive_crop_regions(
      image, boxes, box_ind, crop_size, num_spatial_bins, global_pool=False)

  with self.test_session() as sess:
    output = sess.run(ps_crop)
    self.assertAllEqual(output, expected_output)
def test_position_sensitive_with_single_bin(self):
  num_spatial_bins = [1, 1]
  image_shape = [2, 3, 3, 4]
  crop_size = [2, 2]

  image = tf.random_uniform(image_shape)
  boxes = tf.random_uniform((6, 4))
  box_ind = tf.constant([0, 0, 0, 1, 1, 1], dtype=tf.int32)

  # When a single bin is used, position-sensitive crop and pool should be
  # the same as non-position-sensitive crop and pool.
  crop = tf.image.crop_and_resize(image, boxes, box_ind, crop_size)
  crop_and_pool = tf.reduce_mean(crop, [1, 2], keep_dims=True)

  ps_crop_and_pool = ops.position_sensitive_crop_regions(
      image, boxes, box_ind, crop_size, num_spatial_bins, global_pool=True)

  with self.test_session() as sess:
    expected_output, output = sess.run((crop_and_pool, ps_crop_and_pool))
    self.assertAllClose(output, expected_output)
def test_position_sensitive_with_equal_channels(self):
  num_spatial_bins = [2, 2]
  image_shape = [1, 3, 3, 4]
  crop_size = [2, 2]

  image = tf.constant(list(range(1, 3 * 3 + 1)),
                      dtype=tf.float32, shape=[1, 3, 3, 1])
  tiled_image = tf.tile(image, [1, 1, 1, image_shape[3]])
  boxes = tf.random_uniform((3, 4))
  box_ind = tf.constant([0, 0, 0], dtype=tf.int32)

  # All channels are equal, so position-sensitive crop and resize should
  # work the same as the usual crop and resize on just one channel.
  crop = tf.image.crop_and_resize(image, boxes, box_ind, crop_size)
  crop_and_pool = tf.reduce_mean(crop, [1, 2], keep_dims=True)

  ps_crop_and_pool = ops.position_sensitive_crop_regions(
      tiled_image, boxes, box_ind, crop_size, num_spatial_bins,
      global_pool=True)

  with self.test_session() as sess:
    expected_output, output = sess.run((crop_and_pool, ps_crop_and_pool))
    self.assertAllClose(output, expected_output)
def test_position_sensitive(self):
  num_spatial_bins = [3, 2]
  image_shape = [1, 3, 2, 6]

  # First channel is 1's, second channel is 2's, etc.
  image = tf.constant(list(range(1, 3 * 2 + 1)) * 6,
                      dtype=tf.float32, shape=image_shape)
  boxes = tf.random_uniform((2, 4))
  box_ind = tf.constant([0, 0], dtype=tf.int32)

  # The result for both boxes should be [[1, 2], [3, 4], [5, 6]]
  # before averaging.
  expected_output = np.array([3.5, 3.5]).reshape([2, 1, 1, 1])

  for crop_size_mult in range(1, 3):
    crop_size = [3 * crop_size_mult, 2 * crop_size_mult]
    ps_crop_and_pool = ops.position_sensitive_crop_regions(
        image, boxes, box_ind, crop_size, num_spatial_bins, global_pool=True)

    with self.test_session() as sess:
      output = sess.run(ps_crop_and_pool)
      self.assertAllClose(output, expected_output)
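# A minimal NumPy sketch (illustration only, not part of the test suite) of
# what test_position_sensitive exercises for a full-image box when
# crop_size == num_spatial_bins: bin (i, j) reads its value from channel
# group i * num_bins_x + j at spatial cell (i, j), and the bins are then
# averaged when global_pool=True. The helper below is a hypothetical
# reference implementation for that restricted case only, not the library's
# position_sensitive_crop_regions.
def _manual_ps_pool_full_image(image, num_spatial_bins):
  """Position-sensitive pool of a full-image box over a 4-D NumPy array."""
  bins_y, bins_x = num_spatial_bins
  _, height, width, channels = image.shape
  depth = channels // (bins_y * bins_x)  # Channels per bin group.
  bin_h, bin_w = height // bins_y, width // bins_x
  bins = np.zeros((bins_y, bins_x, depth), dtype=np.float64)
  for i in range(bins_y):
    for j in range(bins_x):
      group = i * bins_x + j  # Channel group assigned to this bin.
      cell = image[0,
                   i * bin_h:(i + 1) * bin_h,
                   j * bin_w:(j + 1) * bin_w,
                   group * depth:(group + 1) * depth]
      bins[i, j] = cell.mean(axis=(0, 1))
  # Global pool over all bins; for the constant-per-channel image in
  # test_position_sensitive this yields mean([[1, 2], [3, 4], [5, 6]]) = 3.5.
  return bins.mean()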
def test_raise_value_error_on_num_bins_less_than_one(self):
  num_spatial_bins = [1, -1]
  image_shape = [1, 1, 1, 2]
  crop_size = [2, 2]

  image = tf.constant(1, dtype=tf.float32, shape=image_shape)
  boxes = tf.constant([[0, 0, 1, 1]], dtype=tf.float32)
  box_ind = tf.constant([0], dtype=tf.int32)

  with self.assertRaisesRegexp(ValueError,
                               'num_spatial_bins should be >= 1'):
    ops.position_sensitive_crop_regions(
        image, boxes, box_ind, crop_size, num_spatial_bins, global_pool=True)
def test_raise_value_error_on_non_square_block_size(self):
  num_spatial_bins = [3, 2]
  image_shape = [1, 3, 2, 6]
  crop_size = [6, 2]

  image = tf.constant(1, dtype=tf.float32, shape=image_shape)
  boxes = tf.constant([[0, 0, 1, 1]], dtype=tf.float32)
  box_ind = tf.constant([0], dtype=tf.int32)

  with self.assertRaisesRegexp(
      ValueError, 'Only support square bin crop size for now.'):
    ops.position_sensitive_crop_regions(
        image, boxes, box_ind, crop_size, num_spatial_bins,
        global_pool=False)
def test_raise_value_error_on_non_divisible_num_channels(self):
  num_spatial_bins = [2, 2]
  image_shape = [1, 1, 1, 5]
  crop_size = [2, 2]

  image = tf.constant(1, dtype=tf.float32, shape=image_shape)
  boxes = tf.constant([[0, 0, 1, 1]], dtype=tf.float32)
  box_ind = tf.constant([0], dtype=tf.int32)

  with self.assertRaisesRegexp(
      ValueError, 'Dimension size must be evenly divisible by 4 but is 5'):
    ops.position_sensitive_crop_regions(
        image, boxes, box_ind, crop_size, num_spatial_bins, global_pool=True)
def test_raise_value_error_on_non_divisible_crop_size(self):
  num_spatial_bins = [2, 3]
  image_shape = [1, 1, 1, 6]
  crop_size = [3, 2]

  image = tf.constant(1, dtype=tf.float32, shape=image_shape)
  boxes = tf.constant([[0, 0, 1, 1]], dtype=tf.float32)
  box_ind = tf.constant([0], dtype=tf.int32)

  with self.assertRaisesRegexp(
      ValueError, 'crop_size should be divisible by num_spatial_bins'):
    ops.position_sensitive_crop_regions(
        image, boxes, box_ind, crop_size, num_spatial_bins, global_pool=True)
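# Hedged summary sketch of the argument validation exercised by the four
# error tests above. The error messages are taken from the tests; the
# authoritative checks live inside ops.position_sensitive_crop_regions, and
# this standalone helper is an assumption-based illustration, not the
# library implementation.
def _validate_ps_crop_args(crop_size, num_spatial_bins, num_channels,
                           global_pool):
  total_bins = 1
  for size, num_bins in zip(crop_size, num_spatial_bins):
    if num_bins < 1:
      raise ValueError('num_spatial_bins should be >= 1')
    if size % num_bins != 0:
      raise ValueError('crop_size should be divisible by num_spatial_bins')
    total_bins *= num_bins
  # Each bin consumes one channel group, so channels must split evenly.
  if num_channels % total_bins != 0:
    raise ValueError('Dimension size must be evenly divisible by {} but is '
                     '{}'.format(total_bins, num_channels))
  # Without global pooling, the per-bin crop must currently be square.
  if not global_pool:
    bin_crop_size = [size // num_bins
                     for size, num_bins in zip(crop_size, num_spatial_bins)]
    if bin_crop_size[0] != bin_crop_size[1]:
      raise ValueError('Only support square bin crop size for now.')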
def test_position_sensitive_with_global_pool_false_and_do_global_pool(self):
  num_spatial_bins = [3, 2]
  image_shape = [1, 3, 2, 6]
  num_boxes = 2

  # First channel is 1's, second channel is 2's, etc.
  image = tf.constant(list(range(1, 3 * 2 + 1)) * 6,
                      dtype=tf.float32, shape=image_shape)
  boxes = tf.random_uniform((num_boxes, 4))
  box_ind = tf.constant([0, 0], dtype=tf.int32)

  expected_output = []

  # Expected output, when crop_size = [3, 2].
  expected_output.append(
      np.mean(
          np.expand_dims(
              np.tile(np.array([[1, 2],
                                [3, 4],
                                [5, 6]]), (num_boxes, 1, 1)),
              axis=-1),
          axis=(1, 2), keepdims=True))

  # Expected output, when crop_size = [6, 4].
  expected_output.append(
      np.mean(
          np.expand_dims(
              np.tile(np.array([[1, 1, 2, 2],
                                [1, 1, 2, 2],
                                [3, 3, 4, 4],
                                [3, 3, 4, 4],
                                [5, 5, 6, 6],
                                [5, 5, 6, 6]]), (num_boxes, 1, 1)),
              axis=-1),
          axis=(1, 2), keepdims=True))

  for crop_size_mult in range(1, 3):
    crop_size = [3 * crop_size_mult, 2 * crop_size_mult]

    # Perform global pooling after running the function with
    # global_pool=False; the result should match global_pool=True.
    ps_crop = ops.position_sensitive_crop_regions(
        image, boxes, box_ind, crop_size, num_spatial_bins,
        global_pool=False)
    ps_crop_and_pool = tf.reduce_mean(
        ps_crop, reduction_indices=(1, 2), keep_dims=True)

    with self.test_session() as sess:
      output = sess.run(ps_crop_and_pool)
      self.assertAllEqual(output, expected_output[crop_size_mult - 1])
def _predict(self, image_features, num_predictions_per_location,
             proposal_boxes):
  """Computes encoded object locations and corresponding confidences.

  Args:
    image_features: A list of float tensors of shape
      [batch_size, height_i, width_i, channels_i] containing features for a
      batch of images.
    num_predictions_per_location: A list of integers representing the number
      of box predictions to be made per spatial location for each feature
      map. Currently, this must be set to [1], or an error will be raised.
    proposal_boxes: A float tensor of shape
      [batch_size, num_proposals, box_code_size].

  Returns:
    A dictionary containing:
      box_encodings: A float tensor of shape
        [batch_size * num_proposals, 1, num_classes, code_size] representing
        the location of the objects.
      class_predictions_with_background: A float tensor of shape
        [batch_size * num_proposals, 1, num_classes + 1] representing the
        class predictions for the proposals.

  Raises:
    ValueError: if num_predictions_per_location is not 1 or if
      len(image_features) is not 1.
  """
  if (len(num_predictions_per_location) != 1 or
      num_predictions_per_location[0] != 1):
    raise ValueError('Currently RfcnBoxPredictor only supports '
                     'predicting a single box per class per location.')
  if len(image_features) != 1:
    raise ValueError('length of `image_features` must be 1. Found {}'.format(
        len(image_features)))
  image_feature = image_features[0]
  num_predictions_per_location = num_predictions_per_location[0]
  batch_size = tf.shape(proposal_boxes)[0]
  num_boxes = tf.shape(proposal_boxes)[1]

  def get_box_indices(proposals):
    proposals_shape = proposals.get_shape().as_list()
    if any(dim is None for dim in proposals_shape):
      proposals_shape = tf.shape(proposals)
    ones_mat = tf.ones(proposals_shape[:2], dtype=tf.int32)
    multiplier = tf.expand_dims(
        tf.range(start=0, limit=proposals_shape[0]), 1)
    return tf.reshape(ones_mat * multiplier, [-1])

  net = image_feature
  with slim.arg_scope(self._conv_hyperparams):
    net = slim.conv2d(net, self._depth, [1, 1], scope='reduce_depth')

    # Location predictions.
    location_feature_map_depth = (self._num_spatial_bins[0] *
                                  self._num_spatial_bins[1] *
                                  self.num_classes *
                                  self._box_code_size)
    location_feature_map = slim.conv2d(net, location_feature_map_depth,
                                       [1, 1], activation_fn=None,
                                       scope='refined_locations')
    box_encodings = ops.position_sensitive_crop_regions(
        location_feature_map,
        boxes=tf.reshape(proposal_boxes, [-1, self._box_code_size]),
        box_ind=get_box_indices(proposal_boxes),
        crop_size=self._crop_size,
        num_spatial_bins=self._num_spatial_bins,
        global_pool=True)
    box_encodings = tf.squeeze(box_encodings, squeeze_dims=[1, 2])
    box_encodings = tf.reshape(box_encodings,
                               [batch_size * num_boxes, 1, self.num_classes,
                                self._box_code_size])

    # Class predictions.
    total_classes = self.num_classes + 1  # Account for background class.
    class_feature_map_depth = (self._num_spatial_bins[0] *
                               self._num_spatial_bins[1] *
                               total_classes)
    class_feature_map = slim.conv2d(net, class_feature_map_depth, [1, 1],
                                    activation_fn=None,
                                    scope='class_predictions')
    class_predictions_with_background = ops.position_sensitive_crop_regions(
        class_feature_map,
        boxes=tf.reshape(proposal_boxes, [-1, self._box_code_size]),
        box_ind=get_box_indices(proposal_boxes),
        crop_size=self._crop_size,
        num_spatial_bins=self._num_spatial_bins,
        global_pool=True)
    class_predictions_with_background = tf.squeeze(
        class_predictions_with_background, squeeze_dims=[1, 2])
    class_predictions_with_background = tf.reshape(
        class_predictions_with_background,
        [batch_size * num_boxes, 1, total_classes])

  return {BOX_ENCODINGS: box_encodings,
          CLASS_PREDICTIONS_WITH_BACKGROUND:
              class_predictions_with_background}
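# Hedged usage sketch for the predictor above, kept as a comment because the
# constructor argument names are inferred from the attributes _predict
# references (self._depth, self._crop_size, self._num_spatial_bins, ...) and
# may not match the actual RfcnBoxPredictor signature in a given release:
#
#   rfcn_box_predictor = RfcnBoxPredictor(
#       is_training=False,
#       num_classes=90,
#       conv_hyperparams=conv_hyperparams,  # slim arg_scope for conv2d
#       num_spatial_bins=[3, 3],
#       depth=1024,
#       crop_size=[12, 12],                 # divisible by num_spatial_bins
#       box_code_size=4)
#   predictions = rfcn_box_predictor._predict(
#       image_features=[feature_map],       # a single [B, H, W, C] tensor
#       num_predictions_per_location=[1],   # must be [1]
#       proposal_boxes=proposal_boxes)      # [B, num_proposals, 4]
#   box_encodings = predictions[BOX_ENCODINGS]
#   class_predictions = predictions[CLASS_PREDICTIONS_WITH_BACKGROUND]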