def abs_diff(template, search, is_training, trainable=True, use_pre_conv=True,
             pre_conv_output_dim=256, reduce_channels=True, use_mean=False,
             use_batch_norm=False, scope='abs_diff'):
    '''Scores search positions by absolute difference from the template.

    Requires that template is 1x1 (after the optional pre-conv).

    Args:
        template: [b, ht, wt, c]
        search: [b, s, hs, ws, c]
    '''
    with tf.variable_scope(scope, 'abs_diff'):
        template = cnn.as_tensor(template)
        search = cnn.as_tensor(search)

        if use_pre_conv:
            # The conv kernel spans the whole template, so 'VALID' padding
            # reduces the template to 1x1.
            spatial_dims = template.value.shape[-3:-1].as_list()

            def _reduce(net):
                # Pre-activation (bnorm + relu): the preceding output layer
                # had no activation of its own.
                net = cnn.pixelwise(partial(slim.batch_norm, is_training=is_training), net)
                net = cnn.pixelwise(tf.nn.relu, net)
                net, unmerge = cnn.merge_batch_dims(net)
                net = cnn.slim_conv2d(net, pre_conv_output_dim, spatial_dims,
                                      padding='VALID', activation_fn=None,
                                      normalizer_fn=slim.batch_norm,
                                      normalizer_params=dict(is_training=is_training),
                                      scope='conv')
                return unmerge(net)

            # Same variables for both streams (reuse on the second call).
            with tf.variable_scope('pre_conv', reuse=False):
                template = _reduce(template)
            with tf.variable_scope('pre_conv', reuse=True):
                search = _reduce(search)

        template = cnn.get_value(template)
        template_size = template.shape[-3:-1].as_list()
        if template_size != [1, 1]:
            raise ValueError('template shape is not [1, 1]: {}'.format(template_size))

        # Insert the sequence dim so broadcasting lines up with search.
        template = tf.expand_dims(template, 1)
        delta = cnn.pixelwise(lambda x: tf.abs(x - template), search)
        if reduce_channels:
            delta = cnn.channel_sum(delta)
            if use_mean:
                num_channels = template.shape[-1].value
                delta = cnn.pixelwise(lambda x: (1 / tf.to_float(num_channels)) * x, delta)
        # TODO: No bias if attaching more layers?
        return _calibrate(delta, is_training, use_batch_norm, learn_gain=False,
                          gain_init=1, trainable=trainable)
def pre_conv(x):
    '''Applies pre-activation (bnorm, relu) then a 'VALID' conv.

    NOTE(review): this appears to duplicate the helper nested inside
    abs_diff. It depends on `is_training`, `pre_conv_output_dim` and
    `kernel_size` from an enclosing scope, which are not defined at module
    level here — confirm where this is meant to live.
    '''
    activated = cnn.pixelwise(tf.nn.relu,
                              cnn.pixelwise(partial(slim.batch_norm, is_training=is_training), x))
    merged, unmerge = cnn.merge_batch_dims(activated)
    conv_out = cnn.slim_conv2d(merged, pre_conv_output_dim, kernel_size,
                               padding='VALID', activation_fn=None,
                               normalizer_fn=slim.batch_norm,
                               normalizer_params=dict(is_training=is_training),
                               scope='conv')
    return unmerge(conv_out)
def concat_fc(template, search, is_training, trainable=True, join_dim=128,
              mlp_num_outputs=1, mlp_num_layers=2, mlp_num_hidden=128,
              mlp_kwargs=None, scope=None):
    '''Joins template and search via concat-and-fc, then an MLP.

    Args:
        template: [b, h, w, c]
        search: [b, s, h, w, c]
    '''
    with tf.variable_scope(scope, 'concat_fc'):
        template = cnn.as_tensor(template)
        search = cnn.as_tensor(search)
        # Sliding-window concat + fc is equivalent to applying a separate
        # conv to each stream and summing. Activation and normalization are
        # deferred until after the sum.
        kernel_size = template.value.shape[-3:-1].as_list()
        shared_kwargs = dict(
            padding='VALID',
            activation_fn=None,
            normalizer_fn=None,
            # Bias would be redundant: batch-norm follows the sum.
            biases_initializer=None,
        )
        with tf.variable_scope('template'):
            template = cnn.slim_conv2d(template, join_dim, kernel_size,
                                       scope='fc', **shared_kwargs)
        with tf.variable_scope('search'):
            search, unmerge = cnn.merge_batch_dims(search)
            search = cnn.slim_conv2d(search, join_dim, kernel_size,
                                     scope='fc', **shared_kwargs)
            search = unmerge(search)

        # Broadcasting addition; the template's receptive field is not
        # tracked beyond this point.
        template = tf.expand_dims(cnn.get_value(template), 1)
        output = cnn.pixelwise(lambda s: s + template, search)
        output = cnn.pixelwise(partial(slim.batch_norm, is_training=is_training), output)
        output = cnn.pixelwise(tf.nn.relu, output)

        output, unmerge = cnn.merge_batch_dims(output)
        output = cnn.mlp(output, num_layers=mlp_num_layers,
                         num_hidden=mlp_num_hidden, num_outputs=mlp_num_outputs,
                         trainable=trainable, **(mlp_kwargs or {}))
        return unmerge(output)
def distance(template, search, is_training, trainable=True, use_mean=False,
             use_batch_norm=False, learn_gain=False, gain_init=1, scope='distance'):
    '''Computes the Euclidean distance between template and search windows.

    Args:
        template: [b, h, w, c]
        search: [b, s, h, w, c]

    Returns:
        Calibrated (bias/gain or batch-norm) distance response.
    '''
    search = cnn.as_tensor(search)
    num_search_dims = len(search.value.shape)
    if num_search_dims != 5:
        raise ValueError('search should have 5 dims: {}'.format(num_search_dims))

    with tf.variable_scope(scope, 'distance'):
        search = cnn.as_tensor(search)
        # Discard receptive field of template and get underlying tf.Tensor.
        template = cnn.get_value(template)
        num_channels = template.shape[-1].value
        template_size = template.shape[-3:-1].as_list()

        # ||x - y||^2 = dot(x, x) - 2 dot(x, y) + dot(y, y),
        # where each dot is taken over all template elements (h, w, c).
        #
        # BUG FIX: dot_xx and dot_yy are totals over (h, w, c), but
        # diag_xcorr produces per-channel products (cf. cosine, which
        # channel-sums it). Previously the broadcast combination was summed
        # over channels afterwards, which counted dot_xx and dot_yy
        # num_channels times. Sum dot_xy over channels first so every term
        # is counted exactly once.
        dot_xy = cnn.channel_sum(cnn.diag_xcorr(search, template))
        dot_xx = tf.reduce_sum(tf.square(template), axis=(-3, -2, -1), keepdims=True)
        if len(search.value.shape) == 5:
            # Insert sequence dim for broadcasting against [b, s, h, w, 1].
            dot_xx = tf.expand_dims(dot_xx, 1)
        # dot(y, y) in each window: convolve squared search with ones.
        ones = tf.ones(template_size + [num_channels, 1], tf.float32)
        sq_search = cnn.pixelwise(tf.square, search)
        sq_search, restore = cnn.merge_batch_dims(sq_search)
        dot_yy = cnn.nn_conv2d(sq_search, ones, strides=[1, 1, 1, 1], padding='VALID')
        dot_yy = restore(dot_yy)

        sq_dist = cnn.pixelwise_binary(
            lambda dot_xy, dot_yy: dot_xx - 2 * dot_xy + dot_yy, dot_xy, dot_yy)
        if use_mean:
            # Take root-mean-square of difference.
            num_elems = np.prod(template.shape[-3:].as_list())
            sq_dist = cnn.pixelwise(
                lambda sq_dist: (1 / tf.to_float(num_elems)) * sq_dist, sq_dist)
        dist = cnn.pixelwise(tf.sqrt, sq_dist)
        return _calibrate(dist, is_training, use_batch_norm, learn_gain, gain_init,
                          trainable=trainable)
def cosine(template, search, is_training, trainable=True, use_batch_norm=False,
           gain_init=1, eps=1e-3, scope='cosine'):
    '''Cosine similarity between the template and each search window.

    Args:
        template: [b, h, w, c]
        search: [b, s, h, w, c]
    '''
    search = cnn.as_tensor(search)
    ndims = len(search.value.shape)
    if ndims != 5:
        raise ValueError('search should have 5 dims: {}'.format(ndims))

    with tf.variable_scope(scope, 'cosine'):
        # Drop the template's receptive field; use the raw tf.Tensor.
        template = cnn.get_value(template)
        # dot(x, y) at every search position.
        dot_xy = cnn.channel_sum(cnn.diag_xcorr(search, template, padding='VALID'))
        # dot(x, x): one scalar per template; add seq dim for broadcasting.
        dot_xx = tf.reduce_sum(tf.square(template), axis=(-3, -2, -1), keepdims=True)
        dot_xx = tf.expand_dims(dot_xx, 1)
        # dot(y, y): windowed sum of squares via correlation with ones.
        # TODO: Faster and less memory to use sum.
        sq_search = cnn.pixelwise(tf.square, search)
        dot_yy = cnn.channel_sum(
            cnn.diag_xcorr(sq_search, tf.ones_like(template), padding='VALID'))

        assert_ops = [
            tf.assert_non_negative(dot_xx, message='assert dot_xx non negative'),
            tf.assert_non_negative(dot_yy.value, message='assert dot_yy non negative'),
        ]
        with tf.control_dependencies(assert_ops):
            denom = cnn.pixelwise(lambda dot_yy: tf.sqrt(dot_xx * dot_yy), dot_yy)
        similarity = cnn.pixelwise_binary(
            lambda dot_xy, denom: dot_xy / (denom + eps), dot_xy, denom)
        # Gain is necessary here because similarity is always in [-1, 1].
        return _calibrate(similarity, is_training, use_batch_norm,
                          learn_gain=True, gain_init=gain_init, trainable=trainable)
def _calibrate(response, is_training, use_batch_norm, learn_gain, gain_init, trainable=True):
    '''Maps a raw response to a calibrated score.

    Either applies batch-norm (with center and scale) or adds a scalar bias
    with an optional scalar gain.
    '''
    if use_batch_norm:
        return cnn.pixelwise(slim.batch_norm, response, center=True, scale=True,
                             is_training=is_training, trainable=trainable)
    # Bias cannot be represented by the preceding dot product; add it here.
    bias = tf.get_variable('bias', [], tf.float32,
                           initializer=tf.zeros_initializer(), trainable=trainable)
    if not learn_gain:
        return cnn.pixelwise(lambda x: x + bias, response)
    gain = tf.get_variable('gain', [], tf.float32,
                           initializer=tf.constant_initializer(gain_init),
                           trainable=trainable)
    return cnn.pixelwise(lambda x: gain * x + bias, response)
def _xcorr_general(template, search, is_training, trainable=True,
                   use_pre_conv=False, pre_conv_params=None,
                   learn_spatial_weight=False, weight_init_method='ones',
                   reduce_channels=True, use_mean=False, use_batch_norm=False,
                   learn_gain=False, gain_init=1, scope='xcorr'):
    '''Convolves template with search.

    Args:
        template: [b, h, w, c]
        search: [b, s, h, w, c]

    If use_batch_norm is true, then an output gain will always be
    incorporated. Otherwise, it will only be incorporated if learn_gain is
    true.

    When `learn_spatial_weight` is false: if `use_batch_norm` is true,
    `use_mean` should have no effect. When `learn_spatial_weight` is true:
    the `use_mean` parameter also controls the initialization of the spatial
    weights. This may have an effect on gradient descent, even if
    `use_batch_norm` is true.
    '''
    with tf.variable_scope(scope, 'xcorr'):
        pre_conv_params = pre_conv_params or {}
        if use_pre_conv:
            # Shared pre-conv variables between the two streams (reuse).
            template = _pre_conv(template, is_training, trainable=trainable,
                                 scope='pre', reuse=False, **pre_conv_params)
            search = _pre_conv(search, is_training, trainable=trainable,
                               scope='pre', reuse=True, **pre_conv_params)
        # Discard receptive field of template and get underlying tf.Tensor.
        template = cnn.get_value(template)
        template_size = template.shape[-3:-1].as_list()

        # Two separate concerns:
        # 1. Should the initial output equal the mean?
        # 2. How is that shared between a constant multiplier and the
        #    weight initialization?
        spatial_normalizer = 1 / np.prod(template_size)
        if not learn_spatial_weight:
            weight_init = 1
        elif weight_init_method == 'mean':
            weight_init = spatial_normalizer
        elif weight_init_method == 'ones':
            weight_init = 1
        else:
            raise ValueError('unknown weight init method: "{}"'.format(weight_init_method))
        # Maintain: normalize_factor * weight_init == spatial_normalizer.
        normalize_factor = spatial_normalizer / weight_init if use_mean else 1

        if learn_spatial_weight:
            # Learned per-position weight, initialized as chosen above.
            spatial_weight = tf.get_variable(
                'spatial_weight', template_size, tf.float32,
                initializer=tf.constant_initializer(weight_init),
                trainable=trainable)
            template *= tf.expand_dims(spatial_weight, -1)

        dot = cnn.diag_xcorr(search, template)
        dot = cnn.pixelwise(lambda dot: normalize_factor * dot, dot)
        if reduce_channels:
            dot = cnn.channel_mean(dot) if use_mean else cnn.channel_sum(dot)
        return _calibrate(dot, is_training, use_batch_norm, learn_gain, gain_init,
                          trainable=trainable)