Example #1
    def init_tracks(self, sess, det, input_image):

        # Get the initial target bounding box and convert it to center-based format
        init_bb = Rectangle(
            int(det[0]) - 1,
            int(det[1]) - 1, int(det[2]), int(det[3]))
        bbox = convert_bbox_format(init_bb, 'center-based')

        # Feed in the first frame image to set initial state.
        bbox_feed = [bbox.y, bbox.x, bbox.height, bbox.width]
        bbox_in = [init_bb.x, init_bb.y, init_bb.width, init_bb.height]
        input_feed = [input_image, bbox_feed]
        templates, reid_templates = self.siamese_model.initialize(
            sess, input_feed)

        his_feature = []
        his_feature.append(reid_templates)

        search_center = np.array(
            [get_center(self.x_image_size),
             get_center(self.x_image_size)])
        current_target_state = TargetState(
            bbox=bbox,
            search_pos=search_center,
            original_search_center=search_center,
            scale_idx=int(get_center(self.num_scales)),
            original_target_wh=[bbox.width, bbox.height],
            init_templates=templates,  # used for SOT
            his_feature=his_feature,  # used for re-id
            reid_templates=reid_templates,
            similarity=1.0,
            bbox_in=bbox_in,
        )  # bbox_in  [x,y,w,h]
        return current_target_state
Example #2
def get_exemplar_images(images, exemplar_size, targets_pos=None):
  """Crop exemplar image from input images"""
  with tf.name_scope('get_exemplar_image'):
    batch_size, x_height, x_width = images.get_shape().as_list()[:3]
    z_height, z_width = exemplar_size

    if targets_pos is None:
      target_pos_single = [[get_center(x_height), get_center(x_width)]]
      targets_pos_ = tf.tile(target_pos_single, [batch_size, 1])
    else:
      targets_pos_ = targets_pos

    # convert to top-left corner based coordinates
    top = tf.to_int32(tf.round(targets_pos_[:, 0] - get_center(z_height)))
    bottom = tf.to_int32(top + z_height)
    left = tf.to_int32(tf.round(targets_pos_[:, 1] - get_center(z_width)))
    right = tf.to_int32(left + z_width)

    def _slice(x):
      f, t, l, b, r = x
      c = f[t:b, l:r]
      return c

    exemplar_img = tf.map_fn(_slice, (images, top, left, bottom, right), dtype=images.dtype)
    exemplar_img.set_shape([batch_size, z_height, z_width, 3])
    return exemplar_img
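
A minimal usage sketch (a hypothetical call, assuming TensorFlow 1.x and the 255/127 image sizes used elsewhere in these examples):

import tensorflow as tf  # TF 1.x API, matching the snippet above

# Three scaled search crops, as produced by build_search_images in Example #3
search_images = tf.placeholder(tf.float32, [3, 255, 255, 3])
exemplars = get_exemplar_images(search_images, [127, 127])
# exemplars has static shape [3, 127, 127, 3]: the central 127x127 patch of each crop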
Example #3
  def build_search_images(self):
    """Crop search images from the input image based on the last target position

    1. The input image is scaled such that the area of target&context takes up to (scale_factor * z_image_size) ^ 2
    2. Crop an image patch as large as x_image_size centered at the target center.
    3. If the cropped image region is beyond the boundary of the input image, mean values are padded.
    """
    model_config = self.model_config
    track_config = self.track_config

    size_z = model_config['z_image_size']   # 127
    size_x = track_config['x_image_size']   # 255

    num_scales = track_config['num_scales']   # 3
    scales = np.arange(num_scales) - get_center(num_scales)
    assert np.sum(scales) == 0, 'scales should be symmetric'
    search_factors = [track_config['scale_step'] ** x for x in scales]   # pow(1.0375, -1), pow(1.0375, 0), pow(1.0375, 1)

    frame_sz = tf.shape(self.image)
    target_yx = self.target_bbox_feed[0:2]
    target_size = self.target_bbox_feed[2:4]
    avg_chan = tf.reduce_mean(self.image, axis=(0, 1), name='avg_chan')

    # Compute base values
    base_z_size = target_size   # suppose [60, 120]
    base_z_context_size = base_z_size + self.context_amount * tf.reduce_sum(base_z_size)
    base_s_z = tf.sqrt(tf.reduce_prod(base_z_context_size))  # Canonical size, sqrt(87*147) = 113
    base_scale_z = tf.div(tf.to_float(size_z), base_s_z)  # 127 / 113 = 1.124
    d_search = (size_x - size_z) / 2.0  # 64
    base_pad = tf.div(d_search, base_scale_z)   # 64 / 1.124 =57
    base_s_x = base_s_z + 2 * base_pad   # 113 + 2*57 = 227
    base_scale_x = tf.div(tf.to_float(size_x), base_s_x)   # 255 / 227 = 1.123

    boxes = []

    for factor in search_factors:
      s_x = factor * base_s_x   # 1.0375 x 227
      frame_sz_1 = tf.to_float(frame_sz[0:2] - 1)
      # self.frame_shape = frame_sz_1
      topleft = tf.div(target_yx - get_center(s_x), frame_sz_1)
      bottomright = tf.div(target_yx + get_center(s_x), frame_sz_1)
      box = tf.concat([topleft, bottomright], axis=0)
      boxes.append(box)

    boxes = tf.stack(boxes)
    scale_xs = []
    for factor in search_factors:
      scale_x = base_scale_x / factor
      scale_xs.append(scale_x)
    self.scale_xs = tf.stack(scale_xs)

    # Note we use different padding values for each image
    # while the original implementation uses only the average value
    # of the first image for all images.
    image_minus_avg = tf.expand_dims(self.image - avg_chan, 0)
    image_cropped = tf.image.crop_and_resize(image_minus_avg, boxes,
                                             box_ind=tf.zeros((track_config['num_scales']), tf.int32),
                                             crop_size=[size_x, size_x])
    self.search_images = image_cropped + avg_chan
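
The context/scale arithmetic above can be checked with plain NumPy. This is only a numeric sketch: the 60x120 target is the hypothetical size from the inline comments, and context_amount = 0.15 is an assumption chosen so the numbers reproduce the "sqrt(87*147) = 113" comment.

import numpy as np

size_z, size_x = 127, 255
context_amount = 0.15                                            # assumption, see lead-in
target_size = np.array([60.0, 120.0])                            # hypothetical [height, width]
z_context = target_size + context_amount * target_size.sum()    # ~[87, 147]
s_z = np.sqrt(np.prod(z_context))                                # ~113, exemplar region in frame pixels
scale_z = size_z / s_z                                           # ~1.124
d_search = (size_x - size_z) / 2.0                               # 64
pad = d_search / scale_z                                         # ~57
s_x = s_z + 2 * pad                                              # ~227, search region in frame pixels
scale_x = size_x / s_x                                           # ~1.123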
Example #4
def convert_bbox_format(bbox, to):
  x, y, target_width, target_height = bbox.x, bbox.y, bbox.width, bbox.height
  if to == 'top-left-based':
    x -= get_center(target_width)
    y -= get_center(target_height)
  elif to == 'center-based':
    y += get_center(target_height)
    x += get_center(target_width)
  else:
    raise ValueError("Bbox format: {} was not recognized".format(to))
  return Rectangle(x, y, target_width, target_height)
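
A quick round-trip sketch. Rectangle is the namedtuple assumed by the snippets above, re-declared here only for illustration, and get_center(v) is assumed to be (v - 1) / 2, consistent with its other uses:

from collections import namedtuple

Rectangle = namedtuple('Rectangle', ['x', 'y', 'width', 'height'])

top_left = Rectangle(10, 20, 50, 100)
center = convert_bbox_format(top_left, 'center-based')   # Rectangle(x=34.5, y=69.5, width=50, height=100)
back = convert_bbox_format(center, 'top-left-based')     # recovers x=10.0, y=20.0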
Example #5
def roi_crop(disp_instance_feat, instance):
    # `instance_size` is a free variable bound in the enclosing track() (see
    # Example #10); it is the spatial size of the instance feature map.
    instance_pad = instance.copy()
    crop_center = np.round(disp_instance_feat +
                           get_center(instance_size)).astype(int)
    crop_box = [
        np.maximum(crop_center[0] - 3, 0),
        np.maximum(crop_center[1] - 3, 0),
        np.minimum(crop_center[0] + 3, instance_size),
        np.minimum(crop_center[1] + 3, instance_size)
    ]
    if int(crop_box[2] - crop_box[0]) != 6 or int(crop_box[3] - crop_box[1]) != 6:
        # pad with the mean value if the crop reaches the border
        instance_pad = np.pad(instance_pad, ((6, 6), (6, 6), (0, 0)),
                              'constant',
                              constant_values=np.mean(instance_pad))
        crop_center = crop_center + 6
        crop_box = [
            crop_center[0] - 3, crop_center[1] - 3,
            crop_center[0] + 3, crop_center[1] + 3
        ]
    instance_crop = instance_pad[crop_box[0]:crop_box[2],
                                 crop_box[1]:crop_box[3], :]
    return instance_crop
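
A small usage sketch with a dummy feature map. The 22x22x256 shape mirrors the instance feature maps in Example #10; instance_size is set as a module-level name here because roi_crop reads it as a free variable:

import numpy as np

instance_size = 22
feat = np.random.rand(instance_size, instance_size, 256).astype(np.float32)
disp = np.array([2.0, -3.0])      # displacement of the response peak from the map center
crop = roi_crop(disp, feat)       # (6, 6, 256) window around the peak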
Example #6
  def build_template(self):

    model_config = self.model_config
    track_config = self.track_config

    # Exemplar image lies at the center of the search image in the first frame
    exemplar_images = get_exemplar_images(self.search_images, [model_config['z_image_size'],
                                                               model_config['z_image_size']])

    self.exemplar = exemplar_images
    templates, reid_templates = self.get_image_embedding(exemplar_images, stage='init')

    center_scale = int(get_center(track_config['num_scales']))
    center_template = tf.identity(templates[center_scale]) # Shared feature
    self.center_template = center_template
    self.reid_templates = tf.identity(reid_templates[center_scale])


    templates = tf.stack([center_template for _ in range(track_config['num_scales'])])

    with tf.variable_scope('target_template'):
      # Store template in Variable such that we don't have to feed this template every time.
      with tf.variable_scope('State'):
        state = tf.get_variable('exemplar',
                                initializer=tf.zeros(templates.get_shape().as_list(), dtype=templates.dtype),
                                trainable=False)
        with tf.control_dependencies([templates]):
          self.init = tf.assign(state, templates, validate_shape=True)
        self.init_templates = state
Example #7
def roi_align(image, disp_instance_feat, height, width):
    """
    `image` is a 3-D array, representing the input feature map
    `disp_instance_feat` is the box center displacement from the map center
    `height` and `width` are the desired spatial size of the output feature map
    """
    # The feature maps used here are square, so image.shape[0] is used for both axes.
    crop_center = disp_instance_feat + get_center(image.shape[0])
    crop_box = [
        np.maximum(crop_center[0] - 3, 0),
        np.maximum(crop_center[1] - 3, 0),
        np.minimum(crop_center[0] + 3, image.shape[0]),
        np.minimum(crop_center[1] + 3, image.shape[0])
    ]
    if int(crop_box[2] - crop_box[0]) != 6 or int(crop_box[3] - crop_box[1]) != 6:
        # pad with the mean value if the crop reaches the boundary
        image = np.pad(image, ((6, 6), (6, 6), (0, 0)),
                       'constant',
                       constant_values=np.mean(image))
        crop_center = crop_center + 6
        crop_box = [
            crop_center[0] - 3, crop_center[1] - 3,
            crop_center[0] + 3, crop_center[1] + 3
        ]

    # Normalize the crop box to [0, 1] relative to the (possibly padded) map size
    crop_box = [ele / image.shape[0] for ele in crop_box]

    y_min, x_min, y_max, x_max = crop_box

    img_height, img_width, channel_num = image.shape

    feature_map = []

    for y in np.linspace(y_min, y_max, height) * (img_height - 1):
        for x in np.linspace(x_min, x_max, width) * (img_height - 1):
            # Bilinear interpolation between the four neighbouring feature cells
            y_l, y_h = np.floor(y).astype('int32'), np.ceil(y).astype('int32')
            x_l, x_h = np.floor(x).astype('int32'), np.ceil(x).astype('int32')

            a = image[y_l, x_l]
            b = image[y_l, x_h]
            c = image[y_h, x_l]
            d = image[y_h, x_h]

            y_weight = y - y_l
            x_weight = x - x_l

            val = a * (1 - x_weight) * (1 - y_weight) + \
                  b * x_weight * (1 - y_weight) + \
                  c * y_weight * (1 - x_weight) + \
                  d * x_weight * y_weight

            feature_map.append(val)

    return np.array(feature_map).reshape(height, width, channel_num)
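
A matching sketch for this bilinear variant (same hypothetical shapes as for roi_crop above):

import numpy as np

feat = np.random.rand(22, 22, 256).astype(np.float32)
disp = np.array([1.5, -2.0])           # sub-pixel displacement of the response peak
pooled = roi_align(feat, disp, 6, 6)   # (6, 6, 256), bilinearly sampled window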
Example #8
    def __init__(self, siamese_model, model_config, track_config):
        self.siamese_model = siamese_model
        self.model_config = model_config
        self.track_config = track_config

        self.num_scales = track_config['num_scales']
        logging.info('track num scales -- {}'.format(self.num_scales))
        scales = np.arange(self.num_scales) - get_center(self.num_scales)
        self.search_factors = [
            self.track_config['scale_step']**x for x in scales
        ]  #0.963, 1, 1.0375
        self.x_image_size = track_config[
            'x_image_size']  # Search image size 255
        self.window = None  # Cosine window
        self.log_level = track_config['log_level']
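
For reference, the scale pyramid this constructor builds, as a plain NumPy sketch; scale_step 1.0375 is the value noted in Example #3, and get_center(n) is assumed to be (n - 1) / 2:

import numpy as np

num_scales, scale_step = 3, 1.0375
scales = np.arange(num_scales) - (num_scales - 1) / 2.0     # [-1., 0., 1.]
search_factors = [scale_step ** s for s in scales]          # ~[0.9639, 1.0, 1.0375]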
Example #9
def get_subwindow_avg(im, pos, model_sz, original_sz):
  # avg_chans = np.mean(im, axis=(0, 1)) # This version is 3x slower
  avg_chans = [np.mean(im[:, :, 0]), np.mean(im[:, :, 1]), np.mean(im[:, :, 2])]
  if not original_sz:
    original_sz = model_sz
  sz = original_sz
  im_sz = im.shape
  # make sure the size is not too small
  assert im_sz[0] > 2 and im_sz[1] > 2
  c = [get_center(s) for s in sz]

  # check out-of-bounds coordinates, and set them to avg_chans
  context_xmin = int(np.round(pos[1] - c[1]))
  context_xmax = int(context_xmin + sz[1] - 1)
  context_ymin = int(np.round(pos[0] - c[0]))
  context_ymax = int(context_ymin + sz[0] - 1)
  left_pad = int(np.maximum(0, -context_xmin))
  top_pad = int(np.maximum(0, -context_ymin))
  right_pad = int(np.maximum(0, context_xmax - im_sz[1] + 1))
  bottom_pad = int(np.maximum(0, context_ymax - im_sz[0] + 1))

  context_xmin = context_xmin + left_pad
  context_xmax = context_xmax + left_pad
  context_ymin = context_ymin + top_pad
  context_ymax = context_ymax + top_pad
  if top_pad > 0 or bottom_pad > 0 or left_pad > 0 or right_pad > 0:
    R = np.pad(im[:, :, 0], ((top_pad, bottom_pad), (left_pad, right_pad)),
               'constant', constant_values=(avg_chans[0]))
    G = np.pad(im[:, :, 1], ((top_pad, bottom_pad), (left_pad, right_pad)),
               'constant', constant_values=(avg_chans[1]))
    B = np.pad(im[:, :, 2], ((top_pad, bottom_pad), (left_pad, right_pad)),
               'constant', constant_values=(avg_chans[2]))

    im = np.stack((R, G, B), axis=2)

  im_patch_original = im[context_ymin:context_ymax + 1,
                      context_xmin:context_xmax + 1, :]
  if not (model_sz[0] == original_sz[0] and model_sz[1] == original_sz[1]):
    im_patch = resize(im_patch_original, tuple(model_sz))
  else:
    im_patch = im_patch_original
  return im_patch, left_pad, top_pad, right_pad, bottom_pad
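
A usage sketch on a dummy frame (sizes and the target position are made up; the crop deliberately reaches the right border so the mean-value padding path is exercised):

import numpy as np

im = (np.random.rand(240, 320, 3) * 255).astype(np.float32)
pos = [120.0, 310.0]               # [y, x] center near the right border
patch, l_pad, t_pad, r_pad, b_pad = get_subwindow_avg(im, pos, [127, 127], [127, 127])
# patch is 127x127x3; r_pad > 0 reports how many mean-valued columns were padded on the right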
Example #10
    def track(self, sess, current_target_state, input_image):
        """Runs tracking on a single image sequence."""
        def roi_align(image, disp_instance_feat, height, width):
            """
            `image` is a 3-D array, representing the input feature map
            `disp_instance_feat` is the box center displacement from the map center
            `height` and `width` are the desired spatial size of the output feature map
            """
            crop_center = disp_instance_feat + get_center(image.shape[0])
            crop_box = [
                np.maximum(crop_center[0] - 3, 0),
                np.maximum(crop_center[1] - 3, 0),
                np.minimum(crop_center[0] + 3, image.shape[0]),
                np.minimum(crop_center[1] + 3, image.shape[0])
            ]
            if (int(crop_box[2] - crop_box[0]) !=
                    6) or (int(crop_box[3] - crop_box[1]) !=
                           6):  # pad if reach boundary
                image = np.pad(image, ((6, 6), (6, 6), (0, 0)),
                               'constant',
                               constant_values=np.mean(image))
                crop_center = crop_center + 6
                crop_box = [
                    crop_center[0] - 3, crop_center[1] - 3, crop_center[0] + 3,
                    crop_center[1] + 3
                ]

            crop_box = [ele / image.shape[0] for ele in crop_box]

            y_min, x_min, y_max, x_max = crop_box

            img_height, img_width, channel_num = image.shape

            feature_map = []

            for y in np.linspace(y_min, y_max, height) * (img_height - 1):
                for x in np.linspace(x_min, x_max, width) * (img_height - 1):
                    y_l, y_h = np.floor(y).astype('int32'), np.ceil(y).astype(
                        'int32')
                    x_l, x_h = np.floor(x).astype('int32'), np.ceil(x).astype(
                        'int32')

                    a = image[y_l, x_l]
                    b = image[y_l, x_h]
                    c = image[y_h, x_l]
                    d = image[y_h, x_h]

                    y_weight = y - y_l
                    x_weight = x - x_l

                    val = a * (1 - x_weight) * (1 - y_weight) + \
                          b * x_weight * (1 - y_weight) + \
                          c * y_weight * (1 - x_weight) + \
                          d * x_weight * y_weight

                    feature_map.append(val)

            return np.array(feature_map).reshape(height, width, channel_num)

        def roi_crop(disp_instance_feat, instance):
            instance_pad = instance.copy()
            crop_center = np.round(disp_instance_feat +
                                   get_center(instance_size)).astype(int)
            crop_box = [
                np.maximum(crop_center[0] - 3, 0),
                np.maximum(crop_center[1] - 3, 0),
                np.minimum(crop_center[0] + 3, instance_size),
                np.minimum(crop_center[1] + 3, instance_size)
            ]
            if (int(crop_box[2] - crop_box[0]) !=
                    6) or (int(crop_box[3] - crop_box[1]) !=
                           6):  # padding if reach border
                instance_pad = np.pad(instance_pad, ((6, 6), (6, 6), (0, 0)),
                                      'constant',
                                      constant_values=np.mean(instance_pad))
                crop_center = crop_center + 6
                crop_box = [
                    crop_center[0] - 3, crop_center[1] - 3, crop_center[0] + 3,
                    crop_center[1] + 3
                ]
                # print(crop_box)
            instance_crop = instance_pad[crop_box[0]:crop_box[2],
                                         crop_box[1]:crop_box[3], :]
            return instance_crop

        def npair_distance(a, b, data_is_normalized=False):
            b = np.vstack(b)
            a = np.reshape(a, (1, -1))
            b = np.reshape(b, (b.shape[0], -1))
            if not data_is_normalized:
                a = np.asarray(a) / np.linalg.norm(a, axis=1, keepdims=True)
                b = np.asarray(b) / np.linalg.norm(b, axis=1, keepdims=True)
            return np.mean(np.dot(a, b.T))

        current_target_state.old_bbox = current_target_state.bbox  # [x_c,y_c,w,h]
        current_target_state.old_scale_idx = current_target_state.scale_idx
        current_target_state.old_search_pos = current_target_state.search_pos

        bbox_feed = [
            current_target_state.bbox.y, current_target_state.bbox.x,
            current_target_state.bbox.height, current_target_state.bbox.width
        ]  # [y_c, x_c, height, width], center-based
        bbox_feed_ltwh = [
            current_target_state.bbox.x - current_target_state.bbox.width / 2,
            current_target_state.bbox.y - current_target_state.bbox.height / 2,
            current_target_state.bbox.width, current_target_state.bbox.height
        ]

        templates = current_target_state.init_templates
        input_feed = [input_image, bbox_feed, templates]
        outputs = self.siamese_model.inference_step(sess, input_feed)
        search_scale_list = outputs['scale_xs']
        response = outputs['response_up']  # [3,272,272]
        instance = outputs['instance']  # [3,22,22,256]
        reid_instance = outputs['instance_reid']  # [3,22,22,256]
        response_size = response.shape[1]
        instance_size = instance.shape[1]

        # Choose the scale whose response map has the highest peak
        if self.num_scales > 1:
            response_max = np.max(response, axis=(1, 2))
            penalties = self.track_config['scale_penalty'] * np.ones(
                self.num_scales)
            current_scale_idx = int(get_center(self.num_scales))
            penalties[current_scale_idx] = 1.0
            response_penalized = response_max * penalties
            best_scale = np.argmax(response_penalized)
        else:
            best_scale = 0

        response = response[best_scale]

        with np.errstate(all='raise'):  # Raise error if something goes wrong
            response = response - np.min(response)
            response = response / np.sum(response)

        if self.window is None:  # suppress the border
            window = np.dot(np.expand_dims(np.hanning(response_size), 1),
                            np.expand_dims(np.hanning(response_size), 0))
            self.window = window / np.sum(window)  # normalize window
        window_influence = self.track_config['window_influence']  # 0.3
        response = (
            1 - window_influence) * response + window_influence * self.window
        # Find maximum response
        r_max, c_max = np.unravel_index(response.argmax(), response.shape)

        # Convert from crop-relative coordinates to frame coordinates
        p_coor = np.array([r_max, c_max])

        # displacement from the center in instance final representation (response comes from instance)
        disp_instance_final = p_coor - get_center(response_size)

        # ... in instance feature space ...
        upsample_factor = self.track_config['upsample_factor']
        disp_instance_feat = disp_instance_final / upsample_factor
        # ... Avoid empty position ...
        r_radius = int(response_size / upsample_factor / 2)
        disp_instance_feat = np.maximum(
            np.minimum(disp_instance_feat, r_radius), -r_radius)

        # ... in instance input ...
        disp_instance_input = disp_instance_feat * self.model_config[
            'embed_config']['stride']
        # ... in instance original crop (in frame coordinates)
        disp_instance_frame = disp_instance_input / search_scale_list[
            best_scale]
        # Position within frame in frame coordinates
        y = current_target_state.bbox.y
        x = current_target_state.bbox.x
        y += disp_instance_frame[0]
        x += disp_instance_frame[1]

        # compute the similarity
        instance_reid_crop1 = np.mean(roi_crop(disp_instance_feat,
                                               reid_instance[best_scale]),
                                      axis=(0, 1))
        similarity1 = npair_distance(instance_reid_crop1,
                                     current_target_state.his_feature)

        # instance_reid_crop2 = np.mean(roi_align(reid_instance[best_scale], disp_instance_feat, 6, 6), axis=(0, 1))
        # similarity2 = npair_distance(instance_reid_crop2, current_target_state.his_feature)

        current_target_state.similarity = similarity1

        # Target scale damping and saturation
        original_target_width = current_target_state.original_target_wh[0]
        original_target_height = current_target_state.original_target_wh[1]

        target_scale = current_target_state.bbox.height / original_target_height
        search_factor = self.search_factors[best_scale]
        scale_damp = self.track_config[
            'scale_damp']  # damping factor for scale update
        target_scale *= ((1 - scale_damp) * 1.0 + scale_damp * search_factor)
        target_scale = np.maximum(0.5, np.minimum(1.5, target_scale))

        # Some bookkeeping
        height = original_target_height * target_scale
        width = original_target_width * target_scale
        current_target_state.bbox = Rectangle(x, y, width, height)
        current_target_state.scale_idx = best_scale
        current_target_state.search_pos = current_target_state.original_search_center + disp_instance_input
        current_target_state.bbox_in = bbox_feed_ltwh

        assert 0 <= current_target_state.search_pos[0] < self.x_image_size, \
          'target position in the search image must lie within [0, x_image_size)'
        assert 0 <= current_target_state.search_pos[1] < self.x_image_size, \
          'target position in the search image must lie within [0, x_image_size)'

        track_bbox = convert_bbox_format(
            current_target_state.bbox, 'top-left-based')  #  center -> top left
        track_bbox = np.array(
            [track_bbox.x, track_bbox.y, track_bbox.width, track_bbox.height])

        return current_target_state, track_bbox
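
The cosine-window and argmax step inside track() can be reproduced standalone. This is a sketch with a random response map; 272 and 0.3 are the response size and window_influence noted in the comments above, and the peak location is arbitrary:

import numpy as np

response_size, window_influence = 272, 0.3
response = np.random.rand(response_size, response_size).astype(np.float32)

# Normalize the response, then blend it with a normalized Hanning (cosine) window
# to penalize detections far from the previous target position.
response -= response.min()
response /= response.sum()
window = np.outer(np.hanning(response_size), np.hanning(response_size))
window /= window.sum()
blended = (1 - window_influence) * response + window_influence * window
r_max, c_max = np.unravel_index(blended.argmax(), blended.shape)
disp_from_center = np.array([r_max, c_max]) - (response_size - 1) / 2.0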