Пример #1
0
def _clip_rect_size(rect, min_size=None, max_size=None, name='clip_rect_size'):
    '''Clamps the size of a rectangle while keeping its center unchanged.

    The upper bound is applied before the lower bound, so `min_size` takes
    precedence if the two bounds conflict.

    Args:
        rect: Rectangle accepted by `geom.rect_center_size`.
        min_size: Optional lower bound on the size; skipped when None.
        max_size: Optional upper bound on the size; skipped when None.
        name: Name of the enclosing op scope.

    Returns:
        Rectangle built by `geom.make_rect_center_size` from the original
        center and the clamped size.
    '''
    with tf.name_scope(name):
        center, extent = geom.rect_center_size(rect)
        # Order matters: cap from above first, then raise to the floor.
        for bound, clamp in ((max_size, tf.minimum), (min_size, tf.maximum)):
            if bound is not None:
                extent = clamp(extent, bound)
        return geom.make_rect_center_size(center, extent)
Пример #2
0
def _rect_translate_scale(rect, translate, scale, name='rect_translate_scale'):
    '''Shifts the center of a rectangle and multiplies its size by a factor.

    Args:
        rect: [..., 4]
        translate: [..., 2]
        scale: [..., 1]
        name: Name of the enclosing op scope.

    Returns:
        Rectangle built by `geom.make_rect_center_size` from
        (center + translate, size * scale).
    '''
    with tf.name_scope(name):
        old_center, old_size = geom.rect_center_size(rect)
        new_center = old_center + translate
        new_size = old_size * scale
        return geom.make_rect_center_size(new_center, new_size)
Пример #3
0
def modify_aspect_ratio(rect, method='stretch', axis=-1, eps=1e-3, name='modify_aspect_ratio'):
    '''Replaces the size of a rectangle by a scalar size, keeping its center.

    Args:
        rect: Rectangle accepted by `geom.rect_center_size`.
        method: 'stretch' returns `rect` untouched (no ops are created);
            any other value is forwarded to `scalar_size`.
        axis: Axis forwarded to `scalar_size`.
        eps: Lower bound applied to the size before it is reduced.
        name: Name of the enclosing op scope.

    Returns:
        `rect` itself for 'stretch'; otherwise a rectangle with the same
        center whose size is `scalar_size(size, method, axis, keepdims=True)`.
    '''
    if method == 'stretch':
        # Nothing to do: the aspect ratio is left as it is.
        return rect
    with tf.name_scope(name):
        center, extent = geom.rect_center_size(rect)
        # Fail at run time if any size component is negative.
        size_is_valid = tf.assert_non_negative(extent)
        with tf.control_dependencies([size_is_valid]):
            extent = tf.identity(extent)
        # Floor the size at eps before reducing it to a single scalar.
        floored = tf.maximum(extent, eps)
        side = scalar_size(floored, method, axis=axis, keepdims=True)
        return geom.make_rect_center_size(center, side)
Пример #4
0
    def next(self, features, labels, state, name=None, reset_position=False):
        '''Processes one frame: predicts the target rectangle and updates the state.

        Args:
            features: Dict; only features['image']['data'] (the current image)
                is read here.
            labels: Dict with 'valid' and 'rect'; only read when `self.mode`
                is in MODE_KEYS_SUPERVISED.
            state: Dict from the previous step. The keys 'run_opts', 'aspect',
                'image', 'mean_color', 'rect' and 'template_init' are read.
            name: Optional name scope; defaults to 'next_<frame index>'.
            reset_position: Keep the appearance model but reset the position.
                In this method the only effect is that, in supervised modes,
                every loss is replaced by zeros.
                NOTE(review): the earlier docstring required features['rect']
                to be present when this is true, but it is not read here.

        Returns:
            Tuple (outputs, state, losses):
                outputs: {'rect': predicted rectangle}.
                state: Dict to pass to the next call.
                losses: Dict of loss tensors by name; empty unless `self.mode`
                    is in MODE_KEYS_SUPERVISED.

        Raises:
            ValueError: If `self.join_type` is neither 'single' nor 'multi'.

        Side effects:
            Increments `self._num_frames`. On the first frame, when
            `self.appearance_model_file` is set, creates
            `self._appearance_var_list` and `self._appearance_saver`.
        '''
        with tf.name_scope(name, 'next_{}'.format(self._num_frames)):
            im = features['image']['data']
            run_opts = state['run_opts']
            aspect = state['aspect']
            prev_im = state['image']
            mean_color = state['mean_color']

            # If the label is not valid, there will be no loss for this frame.
            # However, the input image may still be processed.
            # In this case, adopt the previous rectangle as the "ground-truth".
            if self.mode in MODE_KEYS_SUPERVISED:
                gt_rect = tf.where(labels['valid'], labels['rect'],
                                   state['rect'])
            else:
                gt_rect = None
            # Use the previous rectangle.
            # This will be the ground-truth rect during training if `use_predictions` is false.
            prev_target_rect = state['rect']

            # Coerce the aspect ratio of the rectangle to construct the search area.
            base_rect = model_util.coerce_aspect(
                prev_target_rect, aspect, aspect_method=self.aspect_method)
            # Apply perturbation to aspect-coerced "previous" rect (may be current gt).
            if self.use_perturb and self.mode == tf.estimator.ModeKeys.TRAIN:
                base_rect = tf.cond(
                    run_opts['is_training'],
                    lambda: siamfc.perturb(base_rect, **self.perturb_params),
                    lambda: base_rect)
            search_rect = geom.grow_rect(self.search_scale, base_rect)

            # The context area is grown from the same base rectangle.
            context_rect = geom.grow_rect(self.context_scale, base_rect)
            # Extract same rectangle in past and current images and feed into conv-net.
            context_curr = self._crop(im, context_rect, self.context_size,
                                      mean_color)
            context_prev = self._crop(prev_im, context_rect, self.context_size,
                                      mean_color)
            with tf.name_scope('summary_context'):
                tf.summary.image('curr', context_curr)
                # Fixed: the 'prev' summary used to log the current crop again.
                tf.summary.image('prev', context_prev)
            motion = [context_curr
                      ] if self.stateless else [context_curr, context_prev]
            motion = tf.stack(motion, axis=1)

            # How to obtain template from previous state?
            template_feat = state['template_init']

            # Extract an image pyramid (use 1 scale when not in tracking mode).
            mid_scale = (self.num_scales - 1) // 2
            if self.num_scales == 1:
                scales = tf.constant([1.0], dtype=tf.float32)
            else:
                scales = model_util.scale_range(
                    tf.constant(self.num_scales),
                    tf.to_float(self.log_scale_step))
            search_ims, search_rects = self._crop_pyr(im, search_rect,
                                                      self.search_size, scales,
                                                      mean_color)

            with tf.name_scope('summary'):
                _image_sequence_summary('search',
                                        search_ims,
                                        elem_name='scale')

            with tf.variable_scope('appearance',
                                   reuse=False) as appearance_scope:
                # Extract features, perform search, get receptive field of response wrt image.
                search_input = self._preproc(search_ims)
                search_input = cnn.as_tensor(search_input, add_to_set=True)
                # The embedding variables are always reused here, so they must
                # have been created before this method runs.
                with tf.variable_scope('embed', reuse=True):
                    search_feat, search_layers, _ = self._embed_net(
                        search_input, (False if not self.learn_appearance else
                                       run_opts['is_training']))
                rf_search = search_feat.fields[search_input.value]
                search_feat_size = search_feat.value.shape[-3:-1].as_list()
                receptive_field.assert_center_alignment(
                    self.search_size, search_feat_size, rf_search)

                # Join variables are created on the first frame and reused afterwards.
                with tf.variable_scope('join', reuse=(self._num_frames >= 1)):
                    join_fn = join_nets.BY_NAME[self.join_arch]
                    if self.join_type == 'single':
                        response = join_fn(
                            template_feat,
                            search_feat,
                            is_training=(False if not self.learn_appearance
                                         else run_opts['is_training']),
                            trainable=self.learn_appearance,
                            **self.join_params)
                    elif self.join_type == 'multi':
                        # NOTE(review): `template_layers` is not defined anywhere in
                        # this method; unless it exists at module level this branch
                        # raises NameError. Confirm where it should come from.
                        response = join_fn(
                            template_feat,
                            search_feat,
                            self.multi_join_layers,
                            template_layers,
                            search_layers,
                            search_input,
                            is_training=(False if not self.learn_appearance
                                         else run_opts['is_training']),
                            trainable=self.learn_appearance,
                            **self.join_params)
                    else:
                        raise ValueError('unknown join type: "{}"'.format(
                            self.join_type))
                rf_response = response.fields[search_input.value]
                response = cnn.get_value(response)
                response_size = response.shape[-3:-1].as_list()
                receptive_field.assert_center_alignment(
                    self.search_size, response_size, rf_response)
                response = tf.verify_tensor_all_finite(
                    response, 'output of xcorr is not finite')

            if self._num_frames == 0:
                # Define appearance model saver.
                if self.appearance_model_file:
                    # Create the graph ops for the saver.
                    var_list = appearance_scope.global_variables()
                    var_list = {var.op.name: var for var in var_list}
                    if self.appearance_scope_dst or self.appearance_scope_src:
                        # Replace 'dst' with 'src'.
                        # Caution: This string replacement is a little dangerous.
                        var_list = {
                            k.replace(self.appearance_scope_dst,
                                      self.appearance_scope_src, 1): v
                            for k, v in var_list.items()
                        }
                    self._appearance_var_list = var_list
                    self._appearance_saver = tf.train.Saver(var_list)

            # Post-process scores.
            with tf.variable_scope('output', reuse=(self._num_frames > 0)):
                if not self.learn_appearance:
                    # TODO: Prevent batch-norm updates as well.
                    # TODO: Set trainable=False for all variables above.
                    response = tf.stop_gradient(response)

                # Regress response to translation and log(scale).
                output_shapes = {'translation': [2], 'log_scale': [1]}
                outputs = _output_net(response,
                                      motion,
                                      output_shapes,
                                      run_opts['is_training'],
                                      weight_decay=self.wd,
                                      use_response=self.output_use_response,
                                      use_images=self.output_use_images)

            _image_sequence_summary('response',
                                    model_util.colormap(
                                        tf.sigmoid(response), _COLORMAP),
                                    elem_name='scale')

            losses = {}
            if self.mode in MODE_KEYS_SUPERVISED:
                # Get ground-truth translation and scale relative to search window.
                gt_rect_in_search = geom.crop_rect(gt_rect, search_rect)
                gt_position, gt_rect_size = geom.rect_center_size(
                    gt_rect_in_search)
                # Positions in real interval [0, 1] correspond to real interval [0, search_size].
                # Pixel centers range from 0.5 to search_size - 0.5 in [0, search_size].
                gt_translation = gt_position - 0.5  # Displacement relative to center.
                gt_size = helpers.scalar_size(gt_rect_size, self.aspect_method)
                target_size_in_search = self.target_size / self.search_size
                # size = target_size * scale
                gt_scale = gt_size / target_size_in_search
                gt_log_scale = tf.log(gt_scale)

                if self.appearance_loss:
                    target_size_in_response = self.target_size / rf_response.stride
                    loss_name, loss = siamfc.compute_loss(
                        response[:, mid_scale], target_size_in_response,
                        **self.appearance_loss_params)
                    losses[loss_name] = loss

                loss_name, loss = regress.compute_loss_vector(
                    outputs['translation'], outputs['log_scale'],
                    gt_translation, gt_log_scale, **self.loss_params)
                losses[loss_name] = loss

                if reset_position:
                    # TODO: Something better!
                    # TODO: Keep appearance loss even when `reset_position` is true?
                    losses = {k: tf.zeros_like(v) for k, v in losses.items()}

            translation = outputs['translation']  # [b, 2]
            scale = tf.exp(outputs['log_scale'])  # [b, 1]

            # Damp the scale update towards 1 (no change).
            # TODO: Should this be in log space?
            scale = self.scale_update_rate * scale + (
                1. - self.scale_update_rate) * 1.
            # Get rectangle in search image.
            prev_target_in_search = geom.crop_rect(prev_target_rect,
                                                   search_rect)
            pred_in_search = _rect_translate_scale(prev_target_in_search,
                                                   translation, scale)
            # Move from search back to original image.
            pred = geom.crop_rect(pred_in_search,
                                  geom.crop_inverse(search_rect))

            # Limit size of object.
            pred = _clip_rect_size(pred, min_size=0.001, max_size=10.0)

            # Rectangle to use in next frame for search area.
            # If using gt and rect not valid, use previous.
            if self.mode in MODE_KEYS_SUPERVISED:
                next_prev_rect = pred if self.use_predictions else gt_rect
            else:
                next_prev_rect = pred

            # outputs = {'rect': pred, 'score': confidence}
            outputs = {'rect': pred}
            state = {
                'run_opts': run_opts,
                'aspect': aspect,
                'image': im,
                'rect': next_prev_rect,
                'template_init': state['template_init'],
                'mean_color': state['mean_color'],
            }
            self._num_frames += 1
            return outputs, state, losses
Пример #5
0
    def next(self, features, labels, state, name='timestep'):
        '''Processes one frame with the motion network and updates the state.

        Args:
            features: Dict; only features['image']['data'] (the current image)
                is read here.
            labels: Dict with 'valid' and 'rect'; only read when `self.mode`
                is in MODE_KEYS_SUPERVISED.
            state: Dict from the previous step. The keys 'run_opts', 'aspect',
                'mean_color', 'image' and 'rect' are read.
            name: Name of the enclosing op scope.

        Returns:
            Tuple (predictions, state, losses):
                predictions: {'rect': predicted rectangle}.
                state: Dict to pass to the next call.
                losses: Dict of loss tensors by name; empty unless `self.mode`
                    is in MODE_KEYS_SUPERVISED.

        Raises:
            ValueError: If `self.output_form` is neither 'discrete' nor 'vector'.

        Side effects:
            Increments `self._num_frames`.
        '''
        with tf.name_scope(name):
            im = features['image']['data']
            run_opts = state['run_opts']
            aspect = state['aspect']
            mean_color = state['mean_color']
            prev_im = state['image']

            # If the label is not valid, there will be no loss for this frame.
            # However, the input image may still be processed.
            # In this case, adopt the previous rectangle as the "ground-truth".
            if self.mode in MODE_KEYS_SUPERVISED:
                gt_rect = tf.where(labels['valid'], labels['rect'],
                                   state['rect'])
            else:
                gt_rect = None
            # Use the previous rectangle.
            # This will be the ground-truth rect during training if `use_predictions` is false.
            prev_target_rect = state['rect']

            # Coerce the aspect ratio of the rectangle to construct the context area.
            context_rect = self._context_rect(prev_target_rect, aspect,
                                              self.context_scale)
            # Extract same rectangle in past and current images and feed into conv-net.
            context_curr = self._crop(im, context_rect, CONTEXT_SIZE,
                                      mean_color)
            context_prev = self._crop(prev_im, context_rect, CONTEXT_SIZE,
                                      mean_color)
            with tf.name_scope('summary_context'):
                tf.summary.image('curr', context_curr)
                # Fixed: the 'prev' summary used to log the current crop again.
                tf.summary.image('prev', context_prev)
            ims = [context_curr
                   ] if self.stateless else [context_curr, context_prev]
            ims = tf.stack(ims, axis=1)

            if self.output_form == 'discrete':
                output_shapes = {
                    'response': [
                        self.num_scales, self.response_size,
                        self.response_size, 1
                    ]
                }
            elif self.output_form == 'vector':
                output_shapes = {'translation': [2], 'log_scale': [1]}
            else:
                # Fixed: the message used the undefined name `output_form`,
                # which raised NameError instead of this ValueError.
                raise ValueError(
                    'unknown output form: "{}"'.format(self.output_form))

            # Extract features, perform search, get receptive field of response wrt image.
            ims_preproc = self._preproc(ims)
            # Variables are created on the first frame and reused afterwards.
            with tf.variable_scope('motion', reuse=(self._num_frames > 0)):
                outputs = _motion_net(ims_preproc,
                                      output_shapes,
                                      run_opts['is_training'],
                                      weight_decay=self.wd)
            # Guard every network output against NaN/Inf.
            outputs = {
                k:
                tf.verify_tensor_all_finite(v,
                                            'output "{}" not finite'.format(k))
                for k, v in outputs.items()
            }

            losses = {}
            if self.mode in MODE_KEYS_SUPERVISED:
                # Get ground-truth translation and scale relative to context window.
                gt_rect_in_context = geom.crop_rect(gt_rect, context_rect)
                gt_position, gt_rect_size = geom.rect_center_size(
                    gt_rect_in_context)
                gt_translation = gt_position - 0.5  # Displacement relative to center.
                gt_size = helpers.scalar_size(gt_rect_size, self.aspect_method)
                # Scale is size relative to target_size.
                gt_scale = gt_size / (self.target_size / CONTEXT_SIZE)
                gt_log_scale = tf.log(gt_scale)

                if self.output_form == 'discrete':
                    # base_translations = ((self.response_stride / self.context_size) *
                    #                      util.displacement_from_center(self.response_size))
                    # scales = util.scale_range(tf.constant(self.num_scales),
                    #                           tf.to_float(self.log_scale_step))
                    base_target_size = self.target_size / CONTEXT_SIZE
                    translation_stride = self.response_stride / CONTEXT_SIZE
                    loss_name, loss = compute_loss_discrete(
                        outputs['response'], self.num_scales,
                        translation_stride, self.log_scale_step,
                        base_target_size, gt_translation, gt_size,
                        **self.loss_params)
                else:
                    loss_name, loss = compute_loss_vector(
                        outputs['translation'], outputs['log_scale'],
                        gt_translation, gt_log_scale, **self.loss_params)

                # if reset_position:
                #     # TODO: Something better!
                #     losses[loss_name] = tf.zeros_like(loss)
                # else:
                #     losses[loss_name] = loss
                losses[loss_name] = loss

            if self.output_form == 'discrete':
                response = outputs['response']
                scales = util.scale_range(tf.constant(self.num_scales),
                                          tf.to_float(self.log_scale_step))
                # Use pyramid from loss function to obtain position.
                # Get relative translation and scale from response.
                # TODO: Upsample to higher resolution than original image?
                response_resize = cnn.get_value(
                    cnn.upsample(response,
                                 self.response_stride,
                                 method=tf.image.ResizeMethod.BICUBIC))
                response_final = response_resize
                # if self.learn_motion:
                #     response_final = response_resize
                # else:
                #     response_final = apply_motion_penalty(
                #         response_resize, radius=self.window_radius * self.target_size,
                #         **self.window_params)
                translation, scale, in_arg_max = util.find_peak_pyr(
                    response_final, scales, eps_abs=self.arg_max_eps)
                scale = tf.expand_dims(scale, -1)  # [b, 1]
                # Obtain translation in relative co-ordinates within search image.
                translation = 1 / tf.to_float(CONTEXT_SIZE) * translation
                # Get scalar representing confidence in prediction.
                # Use raw appearance score (before motion penalty).
                # Currently not returned; see the commented-out outputs below.
                confidence = helpers.weighted_mean(response_resize,
                                                   in_arg_max,
                                                   axis=(-4, -3, -2))
            else:
                translation = outputs['translation']  # [b, 2]
                scale = tf.exp(outputs['log_scale'])  # [b, 1]

            # Damp the scale update towards 1 (no change).
            # TODO: Should this be in log space?
            scale = self.scale_update_rate * scale + (
                1. - self.scale_update_rate) * 1.
            # Get rectangle in context image.
            prev_target_in_context = geom.crop_rect(prev_target_rect,
                                                    context_rect)
            pred_in_context = _rect_translate_scale(prev_target_in_context,
                                                    translation, scale)
            # Move from context back to original image.
            pred = geom.crop_rect(pred_in_context,
                                  geom.crop_inverse(context_rect))

            # Limit size of object.
            pred = _clip_rect_size(pred, min_size=0.001, max_size=10.0)

            # Rectangle to use in next frame for search area.
            # If using gt and rect not valid, use previous.
            if self.mode in MODE_KEYS_SUPERVISED:
                next_prev_rect = pred if self.use_predictions else gt_rect
            else:
                next_prev_rect = pred

            self._num_frames += 1
            # outputs = {'rect': pred, 'score': confidence}
            predictions = {'rect': pred}
            state = {
                'run_opts': run_opts,
                'aspect': aspect,
                # 'image': tf.image.resize_images(im, [self.image_size, self.image_size]),
                'image': im,
                'rect': next_prev_rect,
                'mean_color': state['mean_color'],
            }
            return predictions, state, losses