Example 1
def abs_diff(template, search, is_training,
             trainable=True,
             use_pre_conv=True,
             pre_conv_output_dim=256,
             reduce_channels=True,
             use_mean=False,
             use_batch_norm=False,
             scope='abs_diff'):
    '''
    Requires that the template features are spatially 1x1 (when use_pre_conv is
    true, the pre-conv reduces the template to 1x1).

    Args:
        template: [b, ht, wt, c]
        search: [b, s, hs, ws, c]
    '''
    with tf.variable_scope(scope, 'abs_diff'):
        template = cnn.as_tensor(template)
        search = cnn.as_tensor(search)

        if use_pre_conv:
            # Reduce template to 1x1.
            kernel_size = template.value.shape[-3:-1].as_list()

            def pre_conv(x):
                x = cnn.pixelwise(partial(slim.batch_norm, is_training=is_training), x)
                x = cnn.pixelwise(tf.nn.relu, x)
                x, restore = cnn.merge_batch_dims(x)
                x = cnn.slim_conv2d(x, pre_conv_output_dim, kernel_size,
                                    padding='VALID',
                                    activation_fn=None,
                                    normalizer_fn=slim.batch_norm,
                                    normalizer_params=dict(is_training=is_training),
                                    scope='conv')
                x = restore(x)
                return x

            # Apply pre-activation (batch norm then ReLU) because the preceding output layer has no activation.
            with tf.variable_scope('pre_conv', reuse=False):
                template = pre_conv(template)
            with tf.variable_scope('pre_conv', reuse=True):
                search = pre_conv(search)

        template = cnn.get_value(template)
        template_size = template.shape[-3:-1].as_list()
        if template_size != [1, 1]:
            raise ValueError('template shape is not [1, 1]: {}'.format(template_size))
        # Use broadcasting to perform element-wise operation.
        template = tf.expand_dims(template, 1)
        delta = cnn.pixelwise(lambda x: tf.abs(x - template), search)
        if reduce_channels:
            delta = cnn.channel_sum(delta)
            if use_mean:
                num_channels = template.shape[-1].value
                delta = cnn.pixelwise(lambda x: (1 / tf.to_float(num_channels)) * x, delta)
        # TODO: No bias if attaching more layers?
        return _calibrate(delta, is_training, use_batch_norm, learn_gain=False, gain_init=1,
                          trainable=trainable)
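
The broadcast at the heart of abs_diff can be checked without the cnn wrapper. A minimal NumPy sketch, with made-up shapes, of subtracting a 1x1 template from the multi-scale search tensor and summing the absolute difference over channels:

import numpy as np

b, s, hs, ws, c = 2, 3, 5, 5, 4                                # hypothetical sizes
template = np.random.randn(b, 1, 1, c).astype(np.float32)      # [b, 1, 1, c]
search = np.random.randn(b, s, hs, ws, c).astype(np.float32)   # [b, s, hs, ws, c]

# Insert the scale axis so the template broadcasts against [b, s, hs, ws, c].
delta = np.abs(search - template[:, np.newaxis])   # [b, s, hs, ws, c]
score = delta.sum(axis=-1, keepdims=True)          # [b, s, hs, ws, 1]
assert score.shape == (b, s, hs, ws, 1)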
Example 2
def concat_fc(template, search, is_training,
              trainable=True,
              join_dim=128,
              mlp_num_outputs=1,
              mlp_num_layers=2,
              mlp_num_hidden=128,
              mlp_kwargs=None,
              scope=None):
    '''
    Args:
        template: [b, h, w, c]
        search: [b, s, h, w, c]
    '''
    with tf.variable_scope(scope, 'concat_fc'):
        template = cnn.as_tensor(template)
        search = cnn.as_tensor(search)

        # Instead of a sliding-window concat followed by a fully-connected layer,
        # apply separate convolutions and sum the results; this is equivalent because
        # the join is linear (see the NumPy check after this function).
        # Disable activation and normalizer here; they are applied after the sum.
        kernel_size = template.value.shape[-3:-1].as_list()
        conv_kwargs = dict(
            padding='VALID',
            activation_fn=None,
            normalizer_fn=None,
            biases_initializer=None,  # Disable bias because bnorm is performed later.
        )
        with tf.variable_scope('template'):
            template = cnn.slim_conv2d(template, join_dim, kernel_size,
                                       scope='fc', **conv_kwargs)
        with tf.variable_scope('search'):
            search, restore = cnn.merge_batch_dims(search)
            search = cnn.slim_conv2d(search, join_dim, kernel_size,
                                     scope='fc', **conv_kwargs)
            search = restore(search)

        template = cnn.get_value(template)
        template = tf.expand_dims(template, 1)
        # This is a broadcasting addition. Receptive field in template not tracked.
        output = cnn.pixelwise(lambda search: search + template, search)
        output = cnn.pixelwise(partial(slim.batch_norm, is_training=is_training), output)
        output = cnn.pixelwise(tf.nn.relu, output)

        mlp_kwargs = mlp_kwargs or {}
        output, restore = cnn.merge_batch_dims(output)
        output = cnn.mlp(output,
                         num_layers=mlp_num_layers,
                         num_hidden=mlp_num_hidden,
                         num_outputs=mlp_num_outputs,
                         trainable=trainable, **mlp_kwargs)
        output = restore(output)
        return output
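
The equivalence that concat_fc relies on, that a fully-connected layer applied to a concatenation splits into two linear maps whose outputs are summed, can be verified in a few lines. A NumPy sketch with hypothetical dimensions; biases, normalization and the spatial sliding window are omitted:

import numpy as np

c, d = 16, 128                      # feature channels, join_dim
t = np.random.randn(c)              # flattened template features at one location
x = np.random.randn(c)              # flattened search features at one location
W = np.random.randn(2 * c, d)       # weights of the fully-connected join layer
Wt, Wx = W[:c], W[c:]               # template half and search half of the weights

concat_then_fc = np.concatenate([t, x]) @ W
separate_then_sum = t @ Wt + x @ Wx
assert np.allclose(concat_then_fc, separate_then_sum)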
Example 3
def distance(template, search, is_training,
             trainable=True,
             use_mean=False,
             use_batch_norm=False,
             learn_gain=False,
             gain_init=1,
             scope='distance'):
    '''
    Args:
        template: [b, h, w, c]
        search: [b, s, h, w, c]
    '''
    search = cnn.as_tensor(search)
    num_search_dims = len(search.value.shape)
    if num_search_dims != 5:
        raise ValueError('search should have 5 dims: {}'.format(num_search_dims))

    with tf.variable_scope(scope, 'distance'):
        search = cnn.as_tensor(search)
        # Discard receptive field of template and get underlying tf.Tensor.
        template = cnn.get_value(template)

        num_channels = template.shape[-1].value
        template_size = template.shape[-3:-1].as_list()
        ones = tf.ones(template_size + [num_channels, 1], tf.float32)

        dot_xy = cnn.diag_xcorr(search, template)
        dot_xx = tf.reduce_sum(tf.square(template), axis=(-3, -2, -1), keepdims=True)
        if len(search.value.shape) == 5:
            dot_xx = tf.expand_dims(dot_xx, 1)
        sq_search = cnn.pixelwise(tf.square, search)
        sq_search, restore = cnn.merge_batch_dims(sq_search)
        dot_yy = cnn.nn_conv2d(sq_search, ones, strides=[1, 1, 1, 1], padding='VALID')
        dot_yy = restore(dot_yy)
        # (x - y)**2 = x**2 - 2 x y + y**2
        # sq_dist = dot_xx - 2 * dot_xy + dot_yy
        sq_dist = cnn.pixelwise_binary(
            lambda dot_xy, dot_yy: dot_xx - 2 * dot_xy + dot_yy, dot_xy, dot_yy)
        sq_dist = cnn.pixelwise(
            lambda sq_dist: tf.reduce_sum(sq_dist, axis=-1, keepdims=True), sq_dist)
        if use_mean:
            # Take root-mean-square of difference.
            num_elems = np.prod(template.shape[-3:].as_list())
            sq_dist = cnn.pixelwise(lambda sq_dist: (1 / tf.to_float(num_elems)) * sq_dist, sq_dist)
        dist = cnn.pixelwise(tf.sqrt, sq_dist)
        return _calibrate(dist, is_training, use_batch_norm, learn_gain, gain_init,
                          trainable=trainable)
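
The identity that distance exploits, sum((x - y)**2) = x.x - 2 x.y + y.y, is easy to sanity-check in NumPy. An illustrative sketch for a single template-sized window:

import numpy as np

x = np.random.randn(4, 4, 16)       # template (hypothetical size)
y = np.random.randn(4, 4, 16)       # one search window of the same size
lhs = np.sum((x - y) ** 2)
rhs = np.sum(x ** 2) - 2 * np.sum(x * y) + np.sum(y ** 2)
assert np.isclose(lhs, rhs)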
Example 4
def cosine(template, search, is_training,
           trainable=True,
           use_batch_norm=False,
           gain_init=1,
           eps=1e-3,
           scope='cosine'):
    '''
    Args:
        template: [b, h, w, c]
        search: [b, s, h, w, c]
    '''
    search = cnn.as_tensor(search)
    num_search_dims = len(search.value.shape)
    if num_search_dims != 5:
        raise ValueError('search should have 5 dims: {}'.format(num_search_dims))

    with tf.variable_scope(scope, 'cosine'):
        # Discard receptive field of template and get underlying tf.Tensor.
        template = cnn.get_value(template)

        dot_xy = cnn.channel_sum(cnn.diag_xcorr(search, template, padding='VALID'))
        dot_xx = tf.reduce_sum(tf.square(template), axis=(-3, -2, -1), keepdims=True)

        sq_search = cnn.pixelwise(tf.square, search)
        ones = tf.ones_like(template)  # TODO: Faster and less memory to use sum.
        dot_yy = cnn.channel_sum(cnn.diag_xcorr(sq_search, ones, padding='VALID'))
        # num_channels = template.shape[-1].value
        # template_size = template.shape[-3:-1].as_list()
        # ones = tf.ones(template_size + [num_channels, 1], tf.float32)
        # sq_search, restore = cnn.merge_batch_dims(sq_search)
        # dot_yy = cnn.nn_conv2d(sq_search, ones, strides=[1, 1, 1, 1], padding='VALID')
        # dot_yy = restore(dot_yy)

        dot_xx = tf.expand_dims(dot_xx, 1)
        assert_ops = [tf.assert_non_negative(dot_xx, message='assert dot_xx non negative'),
                      tf.assert_non_negative(dot_yy.value, message='assert dot_yy non negative')]
        with tf.control_dependencies(assert_ops):
            denom = cnn.pixelwise(lambda dot_yy: tf.sqrt(dot_xx * dot_yy), dot_yy)
        similarity = cnn.pixelwise_binary(
            lambda dot_xy, denom: dot_xy / (denom + eps), dot_xy, denom)
        # Gain is necessary here because similarity is always in [-1, 1].
        return _calibrate(similarity, is_training, use_batch_norm,
                          learn_gain=True,
                          gain_init=gain_init,
                          trainable=trainable)
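
For a single window, the decomposition used by cosine reduces to the familiar dot-product formula, up to the eps term in the denominator. A pure-NumPy sketch with made-up sizes:

import numpy as np

eps = 1e-3
x = np.random.randn(4, 4, 16).ravel()    # template
y = np.random.randn(4, 4, 16).ravel()    # one search window
dot_xy, dot_xx, dot_yy = x @ y, x @ x, y @ y
similarity = dot_xy / (np.sqrt(dot_xx * dot_yy) + eps)
reference = x @ y / (np.linalg.norm(x) * np.linalg.norm(y))
assert abs(similarity - reference) < 1e-2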
Example 5
    def test_instantiate(self):
        '''Instantiates the join functions.'''
        for join_arch in join_nets.SINGLE_JOIN_FNS:
            with trySubTest(self, join_arch=join_arch):
                with tf.Graph().as_default():
                    join_fn = join_nets.BY_NAME[join_arch]
                    if join_arch in join_nets.FULLY_CONNECTED_FNS:
                        template_size = np.array([1, 1])
                    else:
                        template_size = np.array([4, 4])
                    search_size = np.array([10, 10])
                    template_shape = [3] + list(template_size) + [16]
                    search_shape = [3, 2] + list(search_size) + [16]

                    template = tf.placeholder(tf.float32,
                                              template_shape,
                                              name='template')
                    search = tf.placeholder(tf.float32,
                                            search_shape,
                                            name='search')
                    is_training = tf.placeholder(tf.bool, (),
                                                 name='is_training')
                    output = join_fn(template, search, is_training)
                    output = cnn.get_value(output)
                    output_size = output.shape[-3:-1].as_list()
                    self.assertAllEqual(output_size,
                                        search_size - template_size + 1)

                    init_op = tf.global_variables_initializer()
                    # with self.test_session() as sess:
                    with tf.Session() as sess:
                        sess.run(init_op)
                        sess.run(output,
                                 feed_dict={
                                     template:
                                     np.random.normal(size=template_shape),
                                     search:
                                     np.random.normal(size=search_shape),
                                     is_training:
                                     False,
                                 })
Example 6
    def start(self, features_init, run_opts, name=None):
        with tf.name_scope(name, 'start') as scope:
            im = features_init['image']['data']
            aspect = features_init['aspect']
            target_rect = features_init['rect']
            mean_color = tf.reduce_mean(im, axis=(-3, -2), keepdims=True)

            with tf.variable_scope('appearance', reuse=False):
                template_rect = self._context_rect(target_rect, aspect,
                                                   self.template_scale)
                template_im = self._crop(im, template_rect, self.template_size,
                                         mean_color)
                template_input = self._preproc(template_im)
                template_input = cnn.as_tensor(template_input, add_to_set=True)
                with tf.variable_scope('embed', reuse=False):
                    template_feat, template_layers, feature_scope = self._embed_net(
                        template_input, (False if not self.learn_appearance
                                         else run_opts['is_training']))
                    # Get names relative to this scope for loading pre-trained.
                    # self._feature_vars = _global_variables_relative_to_scope(feature_scope)
                rf_template = template_feat.fields[template_input.value]
                template_feat = cnn.get_value(template_feat)
                feat_size = template_feat.shape[-3:-1].as_list()
                receptive_field.assert_center_alignment(
                    self.template_size, feat_size, rf_template)

            # self._feature_saver = tf.train.Saver(self._feature_vars)

            with tf.name_scope('summary'):
                tf.summary.image('template', template_im)

            state = {
                'run_opts': run_opts,
                'aspect': aspect,
                'image': im,
                'rect': tf.identity(target_rect),
                'template_init': tf.identity(template_feat),
                'mean_color': tf.identity(mean_color),
            }
            return state
Example 7
    def next(self, features, labels, state, name=None, reset_position=False):
        '''
        Args:
            reset_position: Keep the appearance model but reset the position.
                If this is true, then features['rect'] must be present.
        '''
        with tf.name_scope(name, 'next_{}'.format(self._num_frames)) as scope:
            im = features['image']['data']
            run_opts = state['run_opts']
            aspect = state['aspect']
            prev_im = state['image']
            mean_color = state['mean_color']

            # If the label is not valid, there will be no loss for this frame.
            # However, the input image may still be processed.
            # In this case, adopt the previous rectangle as the "ground-truth".
            if self.mode in MODE_KEYS_SUPERVISED:
                gt_rect = tf.where(labels['valid'], labels['rect'],
                                   state['rect'])
            else:
                gt_rect = None
            # Use the previous rectangle.
            # This will be the ground-truth rect during training if `use_predictions` is false.
            prev_target_rect = state['rect']

            # Coerce the aspect ratio of the rectangle to construct the search area.
            # search_rect = self._context_rect(prev_target_rect, aspect, self.search_scale)
            base_rect = model_util.coerce_aspect(
                prev_target_rect, aspect, aspect_method=self.aspect_method)
            # Apply perturbation to aspect-coerced "previous" rect (may be current gt).
            if self.use_perturb and self.mode == tf.estimator.ModeKeys.TRAIN:
                base_rect = tf.cond(
                    run_opts['is_training'],
                    lambda: siamfc.perturb(base_rect, **self.perturb_params),
                    lambda: base_rect)
            search_rect = geom.grow_rect(self.search_scale, base_rect)

            # Coerce the aspect ratio of the rectangle to construct the context area.
            # context_rect = self._context_rect(prev_target_rect, aspect, self.context_scale)
            context_rect = geom.grow_rect(self.context_scale, base_rect)
            # Extract same rectangle in past and current images and feed into conv-net.
            context_curr = self._crop(im, context_rect, self.context_size,
                                      mean_color)
            context_prev = self._crop(prev_im, context_rect, self.context_size,
                                      mean_color)
            with tf.name_scope('summary_context'):
                tf.summary.image('curr', context_curr)
                tf.summary.image('prev', context_prev)
            motion = ([context_curr] if self.stateless
                      else [context_curr, context_prev])
            motion = tf.stack(motion, axis=1)

            # How to obtain template from previous state?
            template_feat = state['template_init']

            # Extract an image pyramid (use 1 scale when not in tracking mode).
            mid_scale = (self.num_scales - 1) // 2
            if self.num_scales == 1:
                scales = tf.constant([1.0], dtype=tf.float32)
            else:
                scales = model_util.scale_range(
                    tf.constant(self.num_scales),
                    tf.to_float(self.log_scale_step))
            search_ims, search_rects = self._crop_pyr(im, search_rect,
                                                      self.search_size, scales,
                                                      mean_color)

            with tf.name_scope('summary'):
                _image_sequence_summary('search',
                                        search_ims,
                                        elem_name='scale')

            with tf.variable_scope('appearance',
                                   reuse=False) as appearance_scope:
                # Extract features, perform search, get receptive field of response wrt image.
                search_input = self._preproc(search_ims)
                search_input = cnn.as_tensor(search_input, add_to_set=True)
                with tf.variable_scope('embed', reuse=True):
                    search_feat, search_layers, _ = self._embed_net(
                        search_input, (False if not self.learn_appearance else
                                       run_opts['is_training']))
                rf_search = search_feat.fields[search_input.value]
                search_feat_size = search_feat.value.shape[-3:-1].as_list()
                receptive_field.assert_center_alignment(
                    self.search_size, search_feat_size, rf_search)

                with tf.variable_scope('join', reuse=(self._num_frames >= 1)):
                    join_fn = join_nets.BY_NAME[self.join_arch]
                    if self.join_type == 'single':
                        response = join_fn(
                            template_feat,
                            search_feat,
                            is_training=(False if not self.learn_appearance
                                         else run_opts['is_training']),
                            trainable=self.learn_appearance,
                            **self.join_params)
                    elif self.join_type == 'multi':
                        response = join_fn(
                            template_feat,
                            search_feat,
                            self.multi_join_layers,
                            template_layers,
                            search_layers,
                            search_input,
                            is_training=(False if not self.learn_appearance
                                         else run_opts['is_training']),
                            trainable=self.learn_appearance,
                            **self.join_params)
                    else:
                        raise ValueError('unknown join type: "{}"'.format(
                            self.join_type))
                rf_response = response.fields[search_input.value]
                response = cnn.get_value(response)
                response_size = response.shape[-3:-1].as_list()
                receptive_field.assert_center_alignment(
                    self.search_size, response_size, rf_response)
                response = tf.verify_tensor_all_finite(
                    response, 'output of xcorr is not finite')

            if self._num_frames == 0:
                # Define appearance model saver.
                if self.appearance_model_file:
                    # Create the graph ops for the saver.
                    var_list = appearance_scope.global_variables()
                    var_list = {var.op.name: var for var in var_list}
                    if self.appearance_scope_dst or self.appearance_scope_src:
                        # Replace 'dst' with 'src'.
                        # Caution: This string replacement is a little dangerous.
                        var_list = {
                            k.replace(self.appearance_scope_dst,
                                      self.appearance_scope_src, 1): v
                            for k, v in var_list.items()
                        }
                    self._appearance_var_list = var_list
                    self._appearance_saver = tf.train.Saver(var_list)

            # Post-process scores.
            with tf.variable_scope('output', reuse=(self._num_frames > 0)):
                if not self.learn_appearance:
                    # TODO: Prevent batch-norm updates as well.
                    # TODO: Set trainable=False for all variables above.
                    response = tf.stop_gradient(response)

                # Regress response to translation and log(scale).
                output_shapes = {'translation': [2], 'log_scale': [1]}
                outputs = _output_net(response,
                                      motion,
                                      output_shapes,
                                      run_opts['is_training'],
                                      weight_decay=self.wd,
                                      use_response=self.output_use_response,
                                      use_images=self.output_use_images)

            _image_sequence_summary('response',
                                    model_util.colormap(
                                        tf.sigmoid(response), _COLORMAP),
                                    elem_name='scale')

            losses = {}
            if self.mode in MODE_KEYS_SUPERVISED:
                # Get ground-truth translation and scale relative to search window.
                gt_rect_in_search = geom.crop_rect(gt_rect, search_rect)
                gt_position, gt_rect_size = geom.rect_center_size(
                    gt_rect_in_search)
                # Positions in real interval [0, 1] correspond to real interval [0, search_size].
                # Pixel centers range from 0.5 to search_size - 0.5 in [0, search_size].
                gt_translation = gt_position - 0.5  # Displacement relative to center.
                gt_size = helpers.scalar_size(gt_rect_size, self.aspect_method)
                target_size_in_search = self.target_size / self.search_size
                # size = target_size * scale
                gt_scale = gt_size / target_size_in_search
                gt_log_scale = tf.log(gt_scale)

                if self.appearance_loss:
                    target_size_in_response = self.target_size / rf_response.stride
                    loss_name, loss = siamfc.compute_loss(
                        response[:, mid_scale], target_size_in_response,
                        **self.appearance_loss_params)
                    losses[loss_name] = loss

                loss_name, loss = regress.compute_loss_vector(
                    outputs['translation'], outputs['log_scale'],
                    gt_translation, gt_log_scale, **self.loss_params)
                losses[loss_name] = loss

                if reset_position:
                    # TODO: Something better!
                    # TODO: Keep appearance loss even when `reset_position` is true?
                    losses = {k: tf.zeros_like(v) for k, v in losses.items()}

            translation = outputs['translation']  # [b, 2]
            scale = tf.exp(outputs['log_scale'])  # [b, 1]

            # Damp the scale update towards 1 (no change).
            # TODO: Should this be in log space?
            scale = self.scale_update_rate * scale + (
                1. - self.scale_update_rate) * 1.
            # Get rectangle in search image.
            prev_target_in_search = geom.crop_rect(prev_target_rect,
                                                   search_rect)
            pred_in_search = _rect_translate_scale(prev_target_in_search,
                                                   translation, scale)
            # Move from search back to original image.
            pred = geom.crop_rect(pred_in_search,
                                  geom.crop_inverse(search_rect))

            # Limit size of object.
            pred = _clip_rect_size(pred, min_size=0.001, max_size=10.0)

            # Rectangle to use in next frame for search area.
            # If using gt and rect not valid, use previous.
            if self.mode in MODE_KEYS_SUPERVISED:
                next_prev_rect = pred if self.use_predictions else gt_rect
            else:
                next_prev_rect = pred

            # outputs = {'rect': pred, 'score': confidence}
            outputs = {'rect': pred}
            state = {
                'run_opts': run_opts,
                'aspect': aspect,
                'image': im,
                'rect': next_prev_rect,
                'template_init': state['template_init'],
                'mean_color': state['mean_color'],
            }
            self._num_frames += 1
            return outputs, state, losses
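
The ground-truth encoding in the supervised branch above is simple enough to trace with made-up numbers; nothing below comes from the model, it just restates the arithmetic:

# All quantities are relative to the search window, i.e. in [0, 1].
target_size, search_size = 64, 256
gt_position, gt_size = 0.55, 0.30                    # centre and scalar size of the target
gt_translation = gt_position - 0.5                   # 0.05: displacement from the centre
target_size_in_search = target_size / search_size    # 0.25
gt_scale = gt_size / target_size_in_search           # 1.2: target is 1.2x the nominal size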
Example 8
    def test_output_equal(self):
        '''Compares output to library implementation of networks.'''
        # The desired_size may need to be chosen such that the original network structure is valid.
        TestCase = collections.namedtuple('TestCase', ['kwargs', 'desired_size', 'end_point'])
        cases = {
            'slim_alexnet_v2': TestCase(
                kwargs=dict(
                    output_layer='conv5',
                    output_act='relu',
                    conv_padding='SAME',
                    pool_padding='VALID'),
                desired_size=np.array([13, 13]),  # 3 + (6 - 1) * 2
                end_point='alexnet_v2/conv5',
            ),
            'slim_resnet_v1_50': TestCase(
                kwargs=dict(
                    num_blocks=4,
                    conv_padding='SAME',
                    pool_padding='SAME'),
                desired_size=np.array([3, 3]),
                end_point='resnet_v1_50/block4',
            ),
            'slim_vgg_16': TestCase(
                kwargs=dict(
                    output_layer='fc6',
                    output_act='relu',
                    conv_padding='SAME',
                    pool_padding='VALID'),
                desired_size=np.array([1, 1]),
                end_point='vgg_16/fc6',
            ),
        }

        for feature_arch, test_case in cases.items():
            graph = tf.Graph()
            sub_test = trySubTest(self, feature_arch=feature_arch)
            with sub_test, graph.as_default():
                original_fn = globals()[feature_arch]
                feature_fn = functools.partial(feature_nets.BY_NAME[feature_arch],
                                               **test_case.kwargs)
                field = feature_nets.get_receptive_field(feature_fn)
                input_size = receptive_field.input_size(field, test_case.desired_size)
                input_shape = [None] + list(input_size) + [3]

                image = tf.placeholder(tf.float32, input_shape, name='image')
                with tf.variable_scope('net', reuse=False):
                    _, end_points = original_fn(image, is_training=True)
                    try:
                        original = end_points['net/' + test_case.end_point]
                    except KeyError as ex:
                        raise ValueError('key not found ({}) in list: {}'.format(
                            ex, sorted(end_points.keys())))
                init_op = tf.global_variables_initializer()
                with tf.variable_scope('net', reuse=True):
                    ours, _ = feature_fn(image, is_training=True)
                    ours = cnn.get_value(ours)
                # self.assertEqual(original.shape.as_list(), ours.shape.as_list())

                with self.session(graph=graph) as sess:
                    sess.run(init_op)
                    want, got = sess.run((original, ours), feed_dict={
                        image: np.random.uniform(size=[BATCH_LEN] + input_shape[1:]),
                    })
                    self.assertAllClose(want, got)
Example 9
def multi_xcorr(template, search,
                layer_names, template_layers, search_layers, search_image,
                is_training,
                trainable=True,
                use_final_conv=False,
                final_conv_params=None,
                hidden_conv_num_outputs=None,
                hidden_conv_activation='linear',
                use_batch_norm=False,
                use_mean=False,
                scope='multi_xcorr'):
    '''
    Args:
        template_layers: Dict that maps names to tensors.
        search_layers: Dict that maps names to tensors.
    '''
    with tf.variable_scope(scope, 'multi_xcorr'):
        template_layers = template_layers or {}
        search_layers = search_layers or {}
        final_conv_params = final_conv_params or {}
        assert 'final' not in layer_names

        scores = {}
        scores['final'] = _xcorr_general(template, search, is_training,
                                         trainable=trainable,
                                         use_pre_conv=use_final_conv,
                                         pre_conv_params=final_conv_params,
                                         use_mean=use_mean,
                                         use_batch_norm=use_batch_norm,
                                         scope='final_xcorr')
        final_conv = cnn.channel_sum(cnn.diag_xcorr(search, template))
        for name in layer_names:
            template_layer = template_layers[name]
            search_layer = search_layers[name]
            # TODO: Add batch-norm to each cross-correlation?
            # Must be a 1x1 convolution to ensure that receptive fields of different layers align.
            scores[name] = _xcorr_general(template_layers[name], search_layers[name], is_training,
                                          trainable=trainable,
                                          use_pre_conv=True,
                                          pre_conv_params=dict(
                                              num_outputs=hidden_conv_num_outputs,
                                              kernel_size=1,
                                              stride=1,
                                              activation=hidden_conv_activation),
                                          use_mean=use_mean,
                                          use_batch_norm=use_batch_norm,
                                          scope=name + '_xcorr')

        # Upsample all to minimum stride.
        # Then take center-crop of minimum size.
        field_strides = {name: _unique(score.fields[cnn.get_value(search_image)].stride)
                         for name, score in scores.items()}
        min_stride = min(field_strides.values())
        for name in ['final'] + layer_names:
            stride = field_strides[name]
            if stride != min_stride:
                assert stride % min_stride == 0
                relative = stride // min_stride
                scores[name] = cnn.upsample(scores[name], relative,
                                            method=tf.image.ResizeMethod.BILINEAR)

        sizes = {name: _unique(score.value.shape[-3:-1].as_list()) for name, score in scores.items()}
        min_size = min(sizes.values())
        for name in ['final'] + layer_names:
            size = sizes[name]
            if (size - min_size) % 2 != 0:
                raise ValueError('remainder is not even: {} within {}'.format(min_size, size))
            margin = (size - min_size) // 2
            scores[name] = cnn.spatial_trim(scores[name], margin, margin)

        # TODO: How to handle calibration here?
        total = scores['final']
        for name in layer_names:
            total += scores[name]
        return total
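
The alignment step above (upsample every score map to the smallest stride, then centre-crop to the smallest size) is just integer arithmetic. An illustrative example with hypothetical strides and sizes:

field_strides = {'final': 8, 'conv3': 4}
min_stride = min(field_strides.values())
upsample_factor = {k: v // min_stride for k, v in field_strides.items()}
# {'final': 2, 'conv3': 1}: only 'final' is upsampled (bilinear) by a factor of 2.

sizes = {'final': 17, 'conv3': 13}        # spatial sizes after upsampling
min_size = min(sizes.values())
margins = {k: (v - min_size) // 2 for k, v in sizes.items()}
# {'final': 2, 'conv3': 0}: trim 2 pixels from each side of 'final' before summing.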
Example 10
def _xcorr_general(template, search, is_training,
                   trainable=True,
                   use_pre_conv=False,
                   pre_conv_params=None,
                   learn_spatial_weight=False,
                   weight_init_method='ones',
                   reduce_channels=True,
                   use_mean=False,
                   use_batch_norm=False,
                   learn_gain=False,
                   gain_init=1,
                   scope='xcorr'):
    '''Convolves template with search.

    Args:
        template: [b, h, w, c]
        search: [b, s, h, w, c]

    If use_batch_norm is true, then an output gain will always be incorporated.
    Otherwise, it will only be incorporated if learn_gain is true.

    When `learn_spatial_weight` is false:
        If `use_batch_norm` is true, `use_mean` should have no effect.
    When `learn_spatial_weight` is true:
        `use_mean` also controls the initialization of the spatial weights.
        This may affect gradient descent, even if `use_batch_norm` is true.
    '''
    with tf.variable_scope(scope, 'xcorr'):
        pre_conv_params = pre_conv_params or {}

        if use_pre_conv:
            template = _pre_conv(template, is_training, trainable=trainable,
                                 scope='pre', reuse=False, **pre_conv_params)
            search = _pre_conv(search, is_training, trainable=trainable,
                               scope='pre', reuse=True, **pre_conv_params)
        # Discard receptive field of template and get underlying tf.Tensor.
        template = cnn.get_value(template)
        template_size = template.shape[-3:-1].as_list()

        # There are two separate issues here:
        # 1. Whether to make the initial output equal to the mean?
        # 2. How to share this between a constant multiplier and initialization?
        spatial_normalizer = 1 / np.prod(template_size)
        if learn_spatial_weight:
            if weight_init_method == 'mean':
                weight_init = spatial_normalizer
            elif weight_init_method == 'ones':
                weight_init = 1
            else:
                raise ValueError('unknown weight init method: "{}"'.format(weight_init_method))
        else:
            weight_init = 1
        if use_mean:
            # Maintain property:
            # normalize_factor * weight_init = spatial_normalizer
            normalize_factor = spatial_normalizer / weight_init
        else:
            normalize_factor = 1

        if learn_spatial_weight:
            # Initialize with spatial normalizer.
            spatial_weight = tf.get_variable(
                'spatial_weight', template_size, tf.float32,
                initializer=tf.constant_initializer(weight_init),
                trainable=trainable)
            template *= tf.expand_dims(spatial_weight, -1)
        dot = cnn.diag_xcorr(search, template)
        dot = cnn.pixelwise(lambda dot: normalize_factor * dot, dot)
        if reduce_channels:
            dot = cnn.channel_mean(dot) if use_mean else cnn.channel_sum(dot)
        return _calibrate(dot, is_training, use_batch_norm, learn_gain, gain_init,
                          trainable=trainable)
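
The invariant that _xcorr_general maintains when use_mean is true, normalize_factor * weight_init == spatial_normalizer, means the initial output equals the spatial mean regardless of weight_init_method. A small numeric sketch with hypothetical values:

import numpy as np

template_size = [6, 6]
spatial_normalizer = 1 / np.prod(template_size)     # 1/36

for weight_init in (1.0, spatial_normalizer):        # 'ones' init vs 'mean' init
    normalize_factor = spatial_normalizer / weight_init
    assert np.isclose(normalize_factor * weight_init, spatial_normalizer)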
Example 11
    def next(self, features, labels, state, name='timestep'):
        '''
        Args:
            reset_position: Keep the appearance model but reset the position.
                If this is true, then features['rect'] must be present.
        '''
        with tf.name_scope(name) as scope:
            im = features['image']['data']
            run_opts = state['run_opts']
            aspect = state['aspect']
            mean_color = state['mean_color']
            prev_im = state['image']

            # If the label is not valid, there will be no loss for this frame.
            # However, the input image may still be processed.
            # In this case, adopt the previous rectangle as the "ground-truth".
            if self.mode in MODE_KEYS_SUPERVISED:
                gt_rect = tf.where(labels['valid'], labels['rect'],
                                   state['rect'])
            else:
                gt_rect = None
            # Use the previous rectangle.
            # This will be the ground-truth rect during training if `use_predictions` is false.
            prev_target_rect = state['rect']

            # Coerce the aspect ratio of the rectangle to construct the context area.
            context_rect = self._context_rect(prev_target_rect, aspect,
                                              self.context_scale)
            # Extract same rectangle in past and current images and feed into conv-net.
            context_curr = self._crop(im, context_rect, CONTEXT_SIZE,
                                      mean_color)
            context_prev = self._crop(prev_im, context_rect, CONTEXT_SIZE,
                                      mean_color)
            with tf.name_scope('summary_context'):
                tf.summary.image('curr', context_curr)
                tf.summary.image('prev', context_prev)
            ims = ([context_curr] if self.stateless
                   else [context_curr, context_prev])
            ims = tf.stack(ims, axis=1)

            if self.output_form == 'discrete':
                output_shapes = {
                    'response': [
                        self.num_scales, self.response_size,
                        self.response_size, 1
                    ]
                }
            elif self.output_form == 'vector':
                output_shapes = {'translation': [2], 'log_scale': [1]}
            else:
                raise ValueError(
                    'unknown output form: "{}"'.format(self.output_form))

            # Extract features, perform search, get receptive field of response wrt image.
            ims_preproc = self._preproc(ims)
            with tf.variable_scope('motion', reuse=(self._num_frames > 0)):
                outputs = _motion_net(ims_preproc,
                                      output_shapes,
                                      run_opts['is_training'],
                                      weight_decay=self.wd)
            outputs = {
                k:
                tf.verify_tensor_all_finite(v,
                                            'output "{}" not finite'.format(k))
                for k, v in outputs.items()
            }

            losses = {}
            if self.mode in MODE_KEYS_SUPERVISED:
                # Get ground-truth translation and scale relative to context window.
                gt_rect_in_context = geom.crop_rect(gt_rect, context_rect)
                gt_position, gt_rect_size = geom.rect_center_size(
                    gt_rect_in_context)
                gt_translation = gt_position - 0.5  # Displacement relative to center.
                gt_size = helpers.scalar_size(gt_rect_size, self.aspect_method)
                # Scale is size relative to target_size.
                gt_scale = gt_size / (self.target_size / CONTEXT_SIZE)
                gt_log_scale = tf.log(gt_scale)

                if self.output_form == 'discrete':
                    # base_translations = ((self.response_stride / self.context_size) *
                    #                      util.displacement_from_center(self.response_size))
                    # scales = util.scale_range(tf.constant(self.num_scales),
                    #                           tf.to_float(self.log_scale_step))
                    base_target_size = self.target_size / CONTEXT_SIZE
                    translation_stride = self.response_stride / CONTEXT_SIZE
                    loss_name, loss = compute_loss_discrete(
                        outputs['response'], self.num_scales,
                        translation_stride, self.log_scale_step,
                        base_target_size, gt_translation, gt_size,
                        **self.loss_params)
                else:
                    loss_name, loss = compute_loss_vector(
                        outputs['translation'], outputs['log_scale'],
                        gt_translation, gt_log_scale, **self.loss_params)

                # if reset_position:
                #     # TODO: Something better!
                #     losses[loss_name] = tf.zeros_like(loss)
                # else:
                #     losses[loss_name] = loss
                losses[loss_name] = loss

            if self.output_form == 'discrete':
                response = outputs['response']
                scales = util.scale_range(tf.constant(self.num_scales),
                                          tf.to_float(self.log_scale_step))
                # Use pyramid from loss function to obtain position.
                # Get relative translation and scale from response.
                # TODO: Upsample to higher resolution than original image?
                response_resize = cnn.get_value(
                    cnn.upsample(response,
                                 self.response_stride,
                                 method=tf.image.ResizeMethod.BICUBIC))
                response_final = response_resize
                # if self.learn_motion:
                #     response_final = response_resize
                # else:
                #     response_final = apply_motion_penalty(
                #         response_resize, radius=self.window_radius * self.target_size,
                #         **self.window_params)
                translation, scale, in_arg_max = util.find_peak_pyr(
                    response_final, scales, eps_abs=self.arg_max_eps)
                scale = tf.expand_dims(scale, -1)  # [b, 1]
                # Obtain translation in relative co-ordinates within search image.
                translation = 1 / tf.to_float(CONTEXT_SIZE) * translation
                # Get scalar representing confidence in prediction.
                # Use raw appearance score (before motion penalty).
                confidence = helpers.weighted_mean(response_resize,
                                                   in_arg_max,
                                                   axis=(-4, -3, -2))
            else:
                translation = outputs['translation']  # [b, 2]
                scale = tf.exp(outputs['log_scale'])  # [b, 1]

            # Damp the scale update towards 1 (no change).
            # TODO: Should this be in log space?
            scale = self.scale_update_rate * scale + (
                1. - self.scale_update_rate) * 1.
            # Get rectangle in search image.
            prev_target_in_context = geom.crop_rect(prev_target_rect,
                                                    context_rect)
            pred_in_context = _rect_translate_scale(prev_target_in_context,
                                                    translation, scale)
            # Move from search back to original image.
            pred = geom.crop_rect(pred_in_context,
                                  geom.crop_inverse(context_rect))

            # Limit size of object.
            pred = _clip_rect_size(pred, min_size=0.001, max_size=10.0)

            # Rectangle to use in next frame for search area.
            # If using gt and rect not valid, use previous.
            if self.mode in MODE_KEYS_SUPERVISED:
                next_prev_rect = pred if self.use_predictions else gt_rect
            else:
                next_prev_rect = pred

            self._num_frames += 1
            # outputs = {'rect': pred, 'score': confidence}
            predictions = {'rect': pred}
            state = {
                'run_opts': run_opts,
                'aspect': aspect,
                # 'image': tf.image.resize_images(im, [self.image_size, self.image_size]),
                'image': im,
                'rect': next_prev_rect,
                'mean_color': state['mean_color'],
            }
            return predictions, state, losses