Example #1
    def track(self, image, info: dict = None):
        H, W, _ = image.shape
        self.frame_id += 1
        x_patch_arr, resize_factor, x_amask_arr = sample_target(
            image,
            self.state,
            self.params.search_factor,
            output_sz=self.params.search_size)  # (x1, y1, w, h)
        search = self.preprocessor.process(x_patch_arr, x_amask_arr)
        with torch.no_grad():
            x_dict = self.network.forward_backbone(search)
            # merge the template and the search
            feat_dict_list = [self.z_dict1, x_dict]
            seq_dict = merge_template_search(feat_dict_list)
            # run the transformer
            out_dict, _, _ = self.network.forward_transformer(
                seq_dict=seq_dict, run_box_head=True)

        pred_boxes = out_dict['pred_boxes'].view(-1, 4)
        # Baseline: Take the mean of all pred boxes as the final result
        pred_box = (pred_boxes.mean(dim=0) * self.params.search_size /
                    resize_factor).tolist()  # (cx, cy, w, h) at original-image scale
        # get the final box result
        self.state = clip_box(self.map_box_back(pred_box, resize_factor),
                              H,
                              W,
                              margin=10)
        # Clipping helps robustness; experiments show it does not hurt performance
        # self.state = self.map_box_back(pred_box, resize_factor)
        # for debug
        if self.debug:
            x1, y1, w, h = self.state
            image_BGR = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
            cv2.rectangle(image_BGR, (int(x1), int(y1)),
                          (int(x1 + w), int(y1 + h)),
                          color=(0, 0, 255),
                          thickness=2)
            save_path = os.path.join(self.save_dir, "%04d.jpg" % self.frame_id)
            cv2.imwrite(save_path, image_BGR)
        if self.save_all_boxes:
            '''save all predicted boxes'''
            all_boxes = self.map_box_back_batch(
                pred_boxes * self.params.search_size / resize_factor,
                resize_factor)
            all_boxes_save = all_boxes.view(-1).tolist()  # (4N, )
            return {"target_bbox": self.state, "all_boxes": all_boxes_save}
        else:
            return {"target_bbox": self.state}
Example #2
    def track(self, image, info: dict = None):
        H, W, _ = image.shape
        self.frame_id += 1
        x_patch_arr, resize_factor, x_amask_arr = sample_target(
            image,
            self.state,
            self.params.search_factor,
            output_sz=self.params.search_size)  # (x1, y1, w, h)
        search, search_mask = self.preprocessor.process(
            x_patch_arr, x_amask_arr)
        with torch.no_grad():
            x_dict = self.network.forward_backbone(search,
                                                   zx="search",
                                                   mask=search_mask)
            # merge the template and the search
            feat_dict_list = [self.z_dict1, x_dict]
            q, k, v, key_padding_mask = get_qkv(feat_dict_list)
            # run the transformer
            out_dict, _, _ = self.network.forward_transformer(
                q=q, k=k, v=v, key_padding_mask=key_padding_mask)

        pred_boxes = out_dict['pred_boxes'].view(-1, 4)
        # Baseline: Take the mean of all pred boxes as the final result
        pred_box = (pred_boxes.mean(dim=0) * self.params.search_size /
                    resize_factor).tolist()  # (cx, cy, w, h) at original-image scale
        # get the final box result
        self.state = clip_box(self.map_box_back(pred_box, resize_factor),
                              H,
                              W,
                              margin=10)

        # for debug
        if self.debug:
            x1, y1, w, h = self.state
            image_BGR = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
            cv2.rectangle(image_BGR, (int(x1), int(y1)),
                          (int(x1 + w), int(y1 + h)),
                          color=(0, 0, 255),
                          thickness=2)
            save_path = os.path.join(self.save_dir, "%04d.jpg" % self.frame_id)
            cv2.imwrite(save_path, image_BGR)
        return {"target_bbox": self.state}
Example #3
    def track(self, image, info: dict = None):
        H, W, _ = image.shape
        self.frame_id += 1
        x_patch_arr, resize_factor, x_amask_arr = sample_target(
            image,
            self.state,
            self.params.search_factor,
            output_sz=self.params.search_size)  # (x1, y1, w, h)
        search, search_mask = self.preprocessor.process(
            x_patch_arr, x_amask_arr)

        ort_inputs = {
            'img_x': search,
            'mask_x': search_mask,
            'feat_vec_z': self.ort_outs_z[0],
            'mask_vec_z': self.ort_outs_z[1],
            'pos_vec_z': self.ort_outs_z[2],
        }

        ort_outs = self.ort_sess_x.run(None, ort_inputs)

        pred_box = (ort_outs[0].reshape(4) * self.params.search_size /
                    resize_factor).tolist()  # (cx, cy, w, h) at original-image scale
        # get the final box result
        self.state = clip_box(self.map_box_back(pred_box, resize_factor),
                              H,
                              W,
                              margin=10)

        # for debug
        if self.debug:
            x1, y1, w, h = self.state
            image_BGR = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
            cv2.rectangle(image_BGR, (int(x1), int(y1)),
                          (int(x1 + w), int(y1 + h)),
                          color=(0, 0, 255),
                          thickness=2)
            save_path = os.path.join(self.save_dir, "%04d.jpg" % self.frame_id)
            cv2.imwrite(save_path, image_BGR)
        return {"target_bbox": self.state}
Example #4
    def track(self, image, info: dict = None):
        H, W, _ = image.shape
        self.frame_id += 1
        # get the t-th search region
        x_patch_arr, resize_factor, x_amask_arr = sample_target(
            image,
            self.state,
            self.params.search_factor,
            output_sz=self.params.search_size)  # (x1, y1, w, h)
        search = self.preprocessor.process(x_patch_arr, x_amask_arr)
        with torch.no_grad():
            x_dict = self.network.forward_backbone(search)
            # merge the template and the search
            feat_dict_list = self.z_dict_list + [x_dict]
            seq_dict = merge_template_search(feat_dict_list)
            # run the transformer
            out_dict, _, _ = self.network.forward_transformer(
                seq_dict=seq_dict, run_box_head=True, run_cls_head=True)
        # get the final result
        pred_boxes = out_dict['pred_boxes'].view(-1, 4)
        # Baseline: Take the mean of all pred boxes as the final result
        pred_box = (pred_boxes.mean(dim=0) * self.params.search_size /
                    resize_factor).tolist()  # (cx, cy, w, h) at original-image scale
        # get the final box result
        self.state = clip_box(self.map_box_back(pred_box, resize_factor),
                              H,
                              W,
                              margin=10)
        # Clipping helps robustness; experiments show it does not hurt performance
        # self.state = self.map_box_back(pred_box, resize_factor)
        # get confidence score (whether the search region is reliable)
        conf_score = out_dict["pred_logits"].view(-1).sigmoid().item()
        # update template
        for idx, update_i in enumerate(self.update_intervals):
            if self.frame_id % update_i == 0 and conf_score > 0.5:
                z_patch_arr, _, z_amask_arr = sample_target(
                    image,
                    self.state,
                    self.params.template_factor,
                    output_sz=self.params.template_size)  # (x1, y1, w, h)
                template_t = self.preprocessor.process(
                    z_patch_arr, z_amask_arr)
                with torch.no_grad():
                    z_dict_t = self.network.forward_backbone(template_t)
                # the 1st element of z_dict_list is the template from the 1st frame
                self.z_dict_list[idx + 1] = z_dict_t
        # for debug
        if self.debug:
            x1, y1, w, h = self.state
            image_BGR = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
            cv2.rectangle(image_BGR, (int(x1), int(y1)),
                          (int(x1 + w), int(y1 + h)),
                          color=(0, 0, 255),
                          thickness=2)
            save_path = os.path.join(self.save_dir, "%04d.jpg" % self.frame_id)
            cv2.imwrite(save_path, image_BGR)
        if self.save_all_boxes:
            '''save all predicted boxes'''
            all_boxes = self.map_box_back_batch(
                pred_boxes * self.params.search_size / resize_factor,
                resize_factor)
            all_boxes_save = all_boxes.view(-1).tolist()  # (4N, )
            return {
                "target_bbox": self.state,
                "all_boxes": all_boxes_save,
                "conf_score": conf_score
            }
        else:
            return {"target_bbox": self.state, "conf_score": conf_score}