Example #1
def rect_grid_pyr(response_size,
                  rf,
                  search_size,
                  rect_size,
                  scales,
                  name='rect_grid_pyr'):
    '''Obtains the rectangles for a translation and scale scoremap.

    Args:
        response_size -- Dimension of scoremap. (height, width)
        rf -- Receptive field of scoremap.
        search_size -- Dimension of search image in pixels. (height, width)
        rect_size -- Size of rectangle in normalized coords in search image. [b, 2]
        scales -- Scale factors applied to the displacement and rectangle size. [s]
    '''
    with tf.name_scope(name) as scope:
        # Assert that receptive fields are centered.
        receptive_field.assert_center_alignment(search_size, response_size, rf)
        # Obtain displacement from center of search image.
        # Not necessary to use receptive field offset because it is centered.
        disp = displacement_from_center(response_size)
        disp = tf.to_float(disp) * rf.stride / search_size
        disp = tf.multiply(
            tf.expand_dims(disp, -4),  # [b, h, w, 2] -> [b, 1, h, w, 2]
            expand_dims_n(scales, -1, 3))  # [s] -> [s, 1, 1, 1]
        # Get centers of receptive field of each pixel.
        centers = 0.5 + disp
        rect_size = tf.multiply(
            tf.expand_dims(rect_size, -2),  # [b, 2] -> [b, 1, 2]
            tf.expand_dims(scales, -1))  # [s] -> [s, 1]
        rect_size = expand_dims_n(rect_size, -2, 2)  # [b, s, 2] -> [b, s, 1, 1, 2]
        return geom.make_rect_center_size(centers, rect_size)
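
The example above calls two small helpers, expand_dims_n and displacement_from_center, that are not shown on this page. The sketch below is an assumption about their behaviour, inferred only from the shape comments at the call sites, not the repository's actual implementations.

import tensorflow as tf


def expand_dims_n(x, axis, n):
    '''Inserts n singleton dimensions at `axis`, e.g. [s] -> [s, 1, 1, 1] for axis=-1, n=3 (sketch).'''
    for _ in range(n):
        x = tf.expand_dims(x, axis)
    return x


def displacement_from_center(response_size):
    '''Integer displacement of each scoremap pixel from the scoremap center; shape [h, w, 2] (sketch).'''
    height, width = response_size
    center_i, center_j = (height - 1) // 2, (width - 1) // 2
    grid_i, grid_j = tf.meshgrid(tf.range(height), tf.range(width), indexing='ij')
    return tf.stack([grid_i - center_i, grid_j - center_j], axis=-1)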
Example #2
    def test_desired_output_size_from_receptive_field(self):
        '''Uses the receptive field to get the input size for a desired output size.'''
        for feature_arch in feature_nets.NAMES:
            sub_test = trySubTest(self, feature_arch=feature_arch)
            with sub_test, tf.Graph().as_default():
                feature_fn = feature_nets.BY_NAME[feature_arch]
                field = feature_nets.get_receptive_field(feature_fn)

                desired = np.array([10, 10])
                input_size = receptive_field.input_size(field, desired)
                input_shape = [None] + list(input_size) + [3]
                image = tf.placeholder(tf.float32, input_shape, name='image')
                is_training = tf.placeholder(tf.bool, (), name='is_training')
                feat, _ = feature_fn(image, is_training)
                output_size = feat.value.shape[1:3].as_list()
                self.assertAllEqual(output_size, desired)
                receptive_field.assert_center_alignment(input_size, output_size, field)
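
For context, the receptive_field.input_size call used in this test is expected to invert the standard output-size relation. A minimal sketch of that arithmetic, under the assumption that the field object exposes integer stride and size attributes (the repository's real helper may differ):

import numpy as np


def input_size_sketch(field, output_size):
    '''Smallest input that yields the desired output: (output - 1) strides plus one full field (sketch).'''
    output_size = np.asarray(output_size)
    return (output_size - 1) * np.asarray(field.stride) + np.asarray(field.size)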
Example #3
    def start(self, features_init, run_opts, name=None):
        with tf.name_scope(name, 'start') as scope:
            im = features_init['image']['data']
            aspect = features_init['aspect']
            target_rect = features_init['rect']
            mean_color = tf.reduce_mean(im, axis=(-3, -2), keepdims=True)

            with tf.variable_scope('appearance', reuse=False):
                template_rect = self._context_rect(target_rect, aspect,
                                                   self.template_scale)
                template_im = self._crop(im, template_rect, self.template_size,
                                         mean_color)
                template_input = self._preproc(template_im)
                template_input = cnn.as_tensor(template_input, add_to_set=True)
                with tf.variable_scope('embed', reuse=False):
                    template_feat, template_layers, feature_scope = self._embed_net(
                        template_input, (False if not self.learn_appearance
                                         else run_opts['is_training']))
                    # Get names relative to this scope for loading pre-trained.
                    # self._feature_vars = _global_variables_relative_to_scope(feature_scope)
                rf_template = template_feat.fields[template_input.value]
                template_feat = cnn.get_value(template_feat)
                feat_size = template_feat.shape[-3:-1].as_list()
                receptive_field.assert_center_alignment(
                    self.template_size, feat_size, rf_template)

            # self._feature_saver = tf.train.Saver(self._feature_vars)

            with tf.name_scope('summary'):
                tf.summary.image('template', template_im)

            state = {
                'run_opts': run_opts,
                'aspect': aspect,
                'image': im,
                'rect': tf.identity(target_rect),
                'template_init': tf.identity(template_feat),
                'mean_color': tf.identity(mean_color),
            }
            return state
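
A hedged sketch of how this start() method might be fed, based only on the dictionary keys it reads above (image/data, aspect, rect, and run_opts['is_training']); the batch dimension, image resolution, rectangle encoding, and the tracker variable are illustrative assumptions.

import tensorflow as tf

features_init = {
    'image': {'data': tf.placeholder(tf.float32, [None, 480, 480, 3], name='init_image')},
    'aspect': tf.placeholder(tf.float32, [None], name='aspect'),
    'rect': tf.placeholder(tf.float32, [None, 4], name='init_rect'),
}
run_opts = {'is_training': tf.placeholder(tf.bool, (), name='is_training')}
# state = tracker.start(features_init, run_opts)  # `tracker` is a hypothetical instance of this class.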
Example #4
    def next(self, features, labels, state, name=None, reset_position=False):
        '''
        Args:
            reset_position: Keep the appearance model but reset the position.
                If this is true, then features['rect'] must be present.
        '''
        with tf.name_scope(name, 'next_{}'.format(self._num_frames)) as scope:
            im = features['image']['data']
            run_opts = state['run_opts']
            aspect = state['aspect']
            prev_im = state['image']
            mean_color = state['mean_color']

            # If the label is not valid, there will be no loss for this frame.
            # However, the input image may still be processed.
            # In this case, adopt the previous rectangle as the "ground-truth".
            if self.mode in MODE_KEYS_SUPERVISED:
                gt_rect = tf.where(labels['valid'], labels['rect'],
                                   state['rect'])
            else:
                gt_rect = None
            # Use the previous rectangle.
            # This will be the ground-truth rect during training if `use_predictions` is false.
            prev_target_rect = state['rect']

            # Coerce the aspect ratio of the rectangle to construct the search area.
            # search_rect = self._context_rect(prev_target_rect, aspect, self.search_scale)
            base_rect = model_util.coerce_aspect(
                prev_target_rect, aspect, aspect_method=self.aspect_method)
            # Apply perturbation to aspect-coerced "previous" rect (may be current gt).
            if self.use_perturb and self.mode == tf.estimator.ModeKeys.TRAIN:
                base_rect = tf.cond(
                    run_opts['is_training'],
                    lambda: siamfc.perturb(base_rect, **self.perturb_params),
                    lambda: base_rect)
            search_rect = geom.grow_rect(self.search_scale, base_rect)

            # Coerce the aspect ratio of the rectangle to construct the context area.
            # context_rect = self._context_rect(prev_target_rect, aspect, self.context_scale)
            context_rect = geom.grow_rect(self.context_scale, base_rect)
            # Extract same rectangle in past and current images and feed into conv-net.
            context_curr = self._crop(im, context_rect, self.context_size,
                                      mean_color)
            context_prev = self._crop(prev_im, context_rect, self.context_size,
                                      mean_color)
            with tf.name_scope('summary_context'):
                tf.summary.image('curr', context_curr)
                tf.summary.image('prev', context_prev)
            motion = [context_curr] if self.stateless else [context_curr, context_prev]
            motion = tf.stack(motion, axis=1)

            # How to obtain template from previous state?
            template_feat = state['template_init']

            # Extract an image pyramid (use 1 scale when not in tracking mode).
            mid_scale = (self.num_scales - 1) // 2
            if self.num_scales == 1:
                scales = tf.constant([1.0], dtype=tf.float32)
            else:
                scales = model_util.scale_range(
                    tf.constant(self.num_scales),
                    tf.to_float(self.log_scale_step))
            search_ims, search_rects = self._crop_pyr(im, search_rect,
                                                      self.search_size, scales,
                                                      mean_color)

            with tf.name_scope('summary'):
                _image_sequence_summary('search',
                                        search_ims,
                                        elem_name='scale')

            with tf.variable_scope('appearance',
                                   reuse=False) as appearance_scope:
                # Extract features, perform search, get receptive field of response wrt image.
                search_input = self._preproc(search_ims)
                search_input = cnn.as_tensor(search_input, add_to_set=True)
                with tf.variable_scope('embed', reuse=True):
                    search_feat, search_layers, _ = self._embed_net(
                        search_input, (False if not self.learn_appearance else
                                       run_opts['is_training']))
                rf_search = search_feat.fields[search_input.value]
                search_feat_size = search_feat.value.shape[-3:-1].as_list()
                receptive_field.assert_center_alignment(
                    self.search_size, search_feat_size, rf_search)

                with tf.variable_scope('join', reuse=(self._num_frames >= 1)):
                    join_fn = join_nets.BY_NAME[self.join_arch]
                    if self.join_type == 'single':
                        response = join_fn(
                            template_feat,
                            search_feat,
                            is_training=(False if not self.learn_appearance
                                         else run_opts['is_training']),
                            trainable=self.learn_appearance,
                            **self.join_params)
                    elif self.join_type == 'multi':
                        response = join_fn(
                            template_feat,
                            search_feat,
                            self.multi_join_layers,
                            template_layers,
                            search_layers,
                            search_input,
                            is_training=(False if not self.learn_appearance
                                         else run_opts['is_training']),
                            trainable=self.learn_appearance,
                            **self.join_params)
                    else:
                        raise ValueError('unknown join type: "{}"'.format(
                            self.join_type))
                rf_response = response.fields[search_input.value]
                response = cnn.get_value(response)
                response_size = response.shape[-3:-1].as_list()
                receptive_field.assert_center_alignment(
                    self.search_size, response_size, rf_response)
                response = tf.verify_tensor_all_finite(
                    response, 'output of xcorr is not finite')

            if self._num_frames == 0:
                # Define appearance model saver.
                if self.appearance_model_file:
                    # Create the graph ops for the saver.
                    var_list = appearance_scope.global_variables()
                    var_list = {var.op.name: var for var in var_list}
                    if self.appearance_scope_dst or self.appearance_scope_src:
                        # Replace 'dst' with 'src'.
                        # Caution: This string replacement is a little dangerous.
                        var_list = {
                            k.replace(self.appearance_scope_dst,
                                      self.appearance_scope_src, 1): v
                            for k, v in var_list.items()
                        }
                    self._appearance_var_list = var_list
                    self._appearance_saver = tf.train.Saver(var_list)

            # Post-process scores.
            with tf.variable_scope('output', reuse=(self._num_frames > 0)):
                if not self.learn_appearance:
                    # TODO: Prevent batch-norm updates as well.
                    # TODO: Set trainable=False for all variables above.
                    response = tf.stop_gradient(response)

                # Regress response to translation and log(scale).
                output_shapes = {'translation': [2], 'log_scale': [1]}
                outputs = _output_net(response,
                                      motion,
                                      output_shapes,
                                      run_opts['is_training'],
                                      weight_decay=self.wd,
                                      use_response=self.output_use_response,
                                      use_images=self.output_use_images)

            _image_sequence_summary('response',
                                    model_util.colormap(
                                        tf.sigmoid(response), _COLORMAP),
                                    elem_name='scale')

            losses = {}
            if self.mode in MODE_KEYS_SUPERVISED:
                # Get ground-truth translation and scale relative to search window.
                gt_rect_in_search = geom.crop_rect(gt_rect, search_rect)
                gt_position, gt_rect_size = geom.rect_center_size(
                    gt_rect_in_search)
                # Positions in real interval [0, 1] correspond to real interval [0, search_size].
                # Pixel centers range from 0.5 to search_size - 0.5 in [0, search_size].
                gt_translation = gt_position - 0.5  # Displacement relative to center.
                gt_size = helpers.scalar_size(gt_rect_size, self.aspect_method)
                target_size_in_search = self.target_size / self.search_size
                # size = target_size * scale
                gt_scale = gt_size / target_size_in_search
                gt_log_scale = tf.log(gt_scale)

                if self.appearance_loss:
                    target_size_in_response = self.target_size / rf_response.stride
                    loss_name, loss = siamfc.compute_loss(
                        response[:, mid_scale], target_size_in_response,
                        **self.appearance_loss_params)
                    losses[loss_name] = loss

                loss_name, loss = regress.compute_loss_vector(
                    outputs['translation'], outputs['log_scale'],
                    gt_translation, gt_log_scale, **self.loss_params)
                losses[loss_name] = loss

                if reset_position:
                    # TODO: Something better!
                    # TODO: Keep appearance loss even when `reset_position` is true?
                    losses = {k: tf.zeros_like(v) for k, v in losses.items()}

            translation = outputs['translation']  # [b, 2]
            scale = tf.exp(outputs['log_scale'])  # [b, 1]

            # Damp the scale update towards 1 (no change).
            # TODO: Should this be in log space?
            scale = self.scale_update_rate * scale + (
                1. - self.scale_update_rate) * 1.
            # Get rectangle in search image.
            prev_target_in_search = geom.crop_rect(prev_target_rect,
                                                   search_rect)
            pred_in_search = _rect_translate_scale(prev_target_in_search,
                                                   translation, scale)
            # Move from search back to original image.
            pred = geom.crop_rect(pred_in_search,
                                  geom.crop_inverse(search_rect))

            # Limit size of object.
            pred = _clip_rect_size(pred, min_size=0.001, max_size=10.0)

            # Rectangle to use in next frame for search area.
            # If using gt and rect not valid, use previous.
            if self.mode in MODE_KEYS_SUPERVISED:
                next_prev_rect = pred if self.use_predictions else gt_rect
            else:
                next_prev_rect = pred

            # outputs = {'rect': pred, 'score': confidence}
            outputs = {'rect': pred}
            state = {
                'run_opts': run_opts,
                'aspect': aspect,
                'image': im,
                'rect': next_prev_rect,
                'template_init': state['template_init'],
                'mean_color': state['mean_color'],
            }
            self._num_frames += 1
            return outputs, state, losses
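
The helper _rect_translate_scale used above is not listed on this page. One plausible reading, written with the same geom utilities that already appear in these examples, is sketched below; it is an assumption, not the repository's code: shift the rectangle center by the predicted translation and multiply its size by the predicted scale.

def _rect_translate_scale(rect, translation, scale):
    '''Shifts the rect center by `translation` and multiplies its size by `scale` (sketch).'''
    center, size = geom.rect_center_size(rect)
    return geom.make_rect_center_size(center + translation, size * scale)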