Example #1
def _dump_vis(model, pred, gt, common,
              video_list, K_list, dist_list, M_list):
    """ Draws predicted (red) and ground truth (green) skeletons into every camera
        view and dumps one stitched image per frame to disk. """
    import os
    import cv2
    from utils.plot_util import draw_skel
    from utils.StitchedImage import StitchedImage
    import utils.CamLib as cl
    from tqdm import tqdm

    # iterate frames
    for i, (_, fid) in tqdm(enumerate(common), desc='Dumping Samples', total=len(common)):
        # accumulate annotated views of this frame across all cameras
        merged_list = list()
        # draw prediction and ground truth skeletons into each camera view
        for K, dist, M, v in zip(K_list, dist_list, M_list, video_list):
            img = read_vid_frame(v, fid)
            uv_p = cl.project(cl.trafo_coords(pred[i], M), K, dist)
            img_p = draw_skel(img.copy(), model, uv_p, color_fixed='r', order='uv')
            uv_gt = cl.project(cl.trafo_coords(gt[i], M), K, dist)
            img_p = draw_skel(img_p, model, uv_gt, color_fixed='g', order='uv')

            merged_list.append(img_p)

        merged = StitchedImage(merged_list)
        p = os.path.join(os.path.dirname(video_list[0]), 'eval_vis_dump/%04d.png' % i)
        # cv2.imshow('img', merged.image)
        # cv2.waitKey()
        my_mkdir(p, is_file=True)
        cv2.imwrite(p, merged.image)
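
These examples rely on utils.CamLib (imported as cl) for coordinate transforms and camera projection. The module itself is not shown here; the following is only a minimal sketch of what cl.trafo_coords and cl.project are assumed to do (Nx3 points, 4x4 extrinsics, 3x3 intrinsics, optional OpenCV-style distortion), not the repository's actual implementation.

import numpy as np
import cv2

def trafo_coords_sketch(points3d, M):
    """ Apply a 4x4 homogeneous transform M to Nx3 points. """
    points_h = np.concatenate([points3d, np.ones((points3d.shape[0], 1))], 1)
    return np.matmul(M, points_h.T).T[:, :3]

def project_sketch(points3d_cam, K, dist=None):
    """ Pinhole projection of Nx3 camera-frame points to Nx2 pixel coordinates. """
    if dist is None:
        uvw = np.matmul(K, points3d_cam.T).T
        return uvw[:, :2] / uvw[:, 2:]
    # with distortion coefficients, defer to OpenCV (points are already in the camera frame)
    uv, _ = cv2.projectPoints(points3d_cam.reshape(-1, 1, 3).astype(np.float64),
                              np.zeros(3), np.zeros(3), K, np.asarray(dist, np.float64))
    return uv.reshape(-1, 2)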
Example #2
    def postfunc_save_samples(self, trainer):
        """ Adds a visual sample to the summary writer. """
        from utils.plot_util import draw_hand
        import utils.CamLib as cl
        import numpy as np

        tmp = list()
        for img, cam_int, xyz_pred, xyz_gt in zip(
                trainer.fetches_v[data_t.image], trainer.fetches_v[data_t.K],
                trainer.fetches_v[data_t.pred_xyz],
                trainer.fetches_v[data_t.xyz]):
            # project
            uv_gt = cl.project(xyz_gt, cam_int)
            uv_pred = cl.project(xyz_pred, cam_int)

            img_rgb = ((img + 1.0) / 2.0 * 255).round().astype(np.uint8)
            img_p = draw_hand(img_rgb.copy(), uv_pred, order='uv')
            img_gt = draw_hand(img_rgb.copy(), uv_gt, order='uv')
            tmp.append(np.concatenate([img_p, img_gt], 1))

            # from utils.mpl_setup import plt_figure
            # plt, fig, axes = plt_figure(2)
            # axes[0].imshow(img_p)
            # axes[1].imshow(img_gt)
            # plt.show()

            if len(tmp) == trainer.config.save_sample_num:
                break

        summary_v = trainer.session.run(self.merged_vis_sum,
                                        {self.merged_vis: np.stack(tmp)})
        trainer.summary_writer.add_summary(summary_v, trainer.global_step_v)
        trainer.summary_writer.flush()
        print('Saved some samples.')
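
The method above feeds np.stack(tmp) into a placeholder self.merged_vis and runs an image summary op self.merged_vis_sum, both created elsewhere. A plausible TF1-style setup is sketched below; the shapes and the summary name are assumptions, not taken from the original code.

import tensorflow as tf

# hypothetical setup, e.g. inside the hook's __init__ (stored as self.merged_vis / self.merged_vis_sum):
merged_vis = tf.placeholder(tf.uint8, shape=[None, None, None, 3], name='merged_vis')
merged_vis_sum = tf.summary.image('sample_visualization', merged_vis, max_outputs=8)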
Example #3
def _calc_board_stats(points2d_obs,
                      model_point3d_coord_obs,
                      K,
                      dist,
                      img=None):
    """ Estimates the board pose via PnP and returns the viewing angle, the point
        depths in the camera frame and the per-point reprojection error. """
    success, r_rel, t_rel = cv2.solvePnP(
        np.expand_dims(model_point3d_coord_obs, 1),
        np.expand_dims(points2d_obs, 1),
        K,
        distCoeffs=dist,
        flags=cv2.SOLVEPNP_ITERATIVE)

    # get normal vector from the tag rotation
    R, _ = cv2.Rodrigues(r_rel)
    n = np.matmul(R, np.array([0.0, 0.0, 1.0]))  # board normal wrt camera
    # angle between the board normal and the camera viewing direction (z-axis)
    angle = np.arccos(np.clip(n[2], -1.0, 1.0)) * 180.0 / np.pi

    # calculate points in camera frame
    M_w2c = np.concatenate([R, t_rel], 1)
    M_w2c = np.concatenate([M_w2c, np.array([[0.0, 0.0, 0.0, 1.0]])], 0)
    p3d = cl.trafo_coords(model_point3d_coord_obs, M_w2c)

    # reprojection error
    p2d_p = cl.project(p3d, K, dist)
    err = np.linalg.norm(p2d_p - points2d_obs, 2, -1)

    if img is not None:
        import matplotlib.pyplot as plt
        plt.imshow(img[:, :, ::-1])
        plt.plot(points2d_obs[:, 0], points2d_obs[:, 1], 'go')
        plt.plot(p2d_p[:, 0], p2d_p[:, 1], 'rx')
        plt.show()
    return angle, p3d[:, -1], err
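
A hypothetical usage sketch for _calc_board_stats with synthetic data (all values are illustrative): render a known board pose into a virtual camera and check that the reprojection error comes back close to zero.

import numpy as np
import cv2

K = np.array([[600.0, 0.0, 320.0],
              [0.0, 600.0, 240.0],
              [0.0, 0.0, 1.0]])
dist = np.zeros(5)

# 3x3 grid of board corners on the z=0 plane of the board frame (2 cm spacing)
gx, gy = np.meshgrid(np.arange(3) * 0.02, np.arange(3) * 0.02)
board_pts = np.stack([gx.ravel(), gy.ravel(), np.zeros(9)], -1)

# synthesize observations from a known board pose 0.5 m in front of the camera
rvec, tvec = np.array([0.1, -0.2, 0.05]), np.array([0.0, 0.0, 0.5])
pts2d, _ = cv2.projectPoints(board_pts, rvec, tvec, K, dist)

angle, depth, err = _calc_board_stats(pts2d.reshape(-1, 2), board_pts, K, dist)
print(angle, depth.mean(), err.max())  # err should be near zero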
Example #4
def draw_trafo(img, M_trafo_w2l, M_cam_w2l, K_cam, dist_cam, linewidth=2, l=0.025):
    """ Draws a little coordinate frame into an image. """
    import utils.CamLib as cl

    M_trafo_l2w = np.linalg.inv(M_trafo_w2l)

    # points in local space we'd like to draw
    points_local = np.array([
        [0.0, 0.0, 0.0],  # origin
        [l, 0.0, 0.0],  # end x
        [0.0, l, 0.0],  # end y
        [0.0, 0.0, l]  # end z
    ])
    # trafo them to world space
    points_world = cl.trafo_coords(points_local, M_trafo_l2w)

    # transform points into image space
    p_cam = cl.trafo_coords(points_world, M_cam_w2l)
    p_uv = cl.project(p_cam, K_cam, dist_cam)
    p_uv = np.round(p_uv).astype(np.int32)

    # draw the three axes (BGR colors: x=red, y=green, z=blue)
    bones, colors = [[0, 1], [0, 2], [0, 3]], [(0, 0, 255), (0, 255, 0), (255, 0, 0)]
    for b, c in zip(bones, colors):
        pid, cid = b
        img = cv2.line(img,
                       (p_uv[pid, 0], p_uv[pid, 1]),
                       (p_uv[cid, 0], p_uv[cid, 1]),
                       c, thickness=linewidth)
    return img
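
A hypothetical call to draw_trafo with illustrative values: draw the axes of the world origin into an empty view of a camera placed 0.5 m in front of it.

import numpy as np

K = np.array([[600.0, 0.0, 320.0],
              [0.0, 600.0, 240.0],
              [0.0, 0.0, 1.0]])
M_w2cam = np.eye(4)
M_w2cam[2, 3] = 0.5  # camera looks at the origin from 0.5 m away along z
img = np.zeros((480, 640, 3), np.uint8)
img = draw_trafo(img, np.eye(4), M_w2cam, K, np.zeros(5))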
Example #5
    def update_frame_view_points(self, use_labels=True):
        self.frame_view.clear()  # clear frame view
        k = self.keys[self.key_id]

        if k in self.keys_valid:
            # draw when there is a valid annotation

            this_pred = self.predictions[k]
            xyz = np.array(this_pred['kp_xyz'])[0]

            if k in self.label_tasks.keys() and use_labels:
                # use label task results if there are any
                this_pred = self.label_tasks[k]
                xyz = np.array(this_pred['kp_xyz'])

            for i, cid in enumerate(self.cam_range):
                # project into frame
                kp_uv = cl.project(cl.trafo_coords(xyz, self.M_list[i]),
                                   self.K_list[i], self.dist_list[i])

                for kp_id, uv in enumerate(kp_uv):
                    kp_name = self.config['keypoints'][kp_id]
                    self.frame_view.update_frame_keypoint(i,
                                                          kp_name,
                                                          uv,
                                                          is_scene_pos=False)

            # make all keypoints non-movable
            for item in self.frame_view.frame_keypoints.values():
                item.setFlag(QGraphicsItem.ItemIsMovable, False)
Example #6
def read_img(metas):
    for meta in metas:
        if not all([os.path.exists(x) for x in meta.img_paths]):
            print('Path not found: ', meta.img_paths[0])

        # read all images
        meta.img_list = [
            cv2.imread(x).astype(np.float32) for x in meta.img_paths
        ]

        # read calibration
        calib = _get_calib(meta.calib_id, meta.calib_path)
        # dist = [np.array(calib[cam]['dist']) for cam in meta.cam_range]
        meta.K_list = [np.array(calib[cam]['K']) for cam in meta.cam_range]
        meta.M_list = [np.array(calib[cam]['M']) for cam in meta.cam_range]
        # meta.M_list = [np.linalg.inv(M) for M in meta.M_list]

        # compensate for crop
        meta.K_list = [
            compensate_crop_K(K, s, o)
            for K, s, o in zip(meta.K_list, meta.scales, meta.offsets)
        ]

        # compensate 2D coordinates for crop
        meta.uv_merged = [
            compensate_crop_coords(pts, s, o)
            for pts, s, o in zip(meta.uv_merged, meta.scales, meta.offsets)
        ]

        # # undistort them
        # meta.img_list = [cv2.undistort(I, K, dist) for I, K, dist in zip(meta.img_list, meta.K_list, dist)]

        meta.uv = [
            cl.project(cl.trafo_coords(meta.xyz, M), K)
            for K, M in zip(meta.K_list, meta.M_list)
        ]
    return metas
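
compensate_crop_K and compensate_crop_coords are project helpers that are not shown here. A plausible sketch follows, assuming a crop is described by a pixel offset o (x, y) followed by a uniform resize by factor s; the repository's actual convention may differ.

import numpy as np

def compensate_crop_K_sketch(K, s, o):
    """ Adjust intrinsics for an image cropped at offset `o` and resized by `s`. """
    K = K.copy()
    K[0, 2] -= o[0]
    K[1, 2] -= o[1]
    K[:2] *= s  # scales focal lengths and the shifted principal point
    return K

def compensate_crop_coords_sketch(pts, s, o):
    """ Apply the same crop/resize to Nx2 pixel coordinates (x, y order). """
    return (np.asarray(pts, np.float64) - np.asarray(o, np.float64)) * s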
Example #7
    model = Model(args.model)
    df = build_dataflow(model, [args.set_name],
                        ['/misc/lmbraid18/datasets/RatPose/RatTrack_paper_resub_sessions/Rat506_200306/run046_cam%d.avi'],
                        ['/misc/lmbraid18/datasets/RatPose/RatTrack_paper_resub_sessions/Rat506_200306/pred_run046__00.json'],
                        is_train=False,
                        threaded=True, single_sample=False)

    start = None
    for idx, dp in enumerate(df.get_data()):
        if idx >= df.size():
            break

        data = df2dict(dp)
        img_rgb = np.round((data[data_t.image]+0.5)*255.0).astype(np.uint8)[:, :, :, ::-1]
        num_cams = img_rgb.shape[0]
        print('is_supervised', data[data_t.is_supervised])

        img_list = list()
        for i in range(num_cams):
            xyz_cam = cl.trafo_coords(data[data_t.xyz_nobatch][0], data[data_t.M][i])
            uv = cl.project(xyz_cam, data[data_t.K][i])
            I = draw_skel(img_rgb[i], model, data[data_t.uv][i], data[data_t.vis_nobatch][0], order='uv')
            img_list.append(I)
        xyz = data[data_t.xyz_nobatch][0]

        merge = StitchedImage(img_list, target_size=(int(0.8 * args.window_size), args.window_size))

        cv2.imshow('pose labeled', merge.image[:, :, ::-1])
        cv2.waitKey(0 if args.wait else 10)

def read_img(metas):
    for meta in metas:
        if type(meta) == DatasetLabeledMeta:
            if not all([os.path.exists(x) for x in meta.img_paths]):
                print('Path not found: ', meta.img_paths[0])

            meta.is_supervised = 1.0

            # read all images
            meta.img_list = [cv2.imread(x).astype(np.float32) for x in meta.img_paths]

            # read calibration
            calib = _get_calib(meta.calib_id, meta.calib_path)
            # dist = [np.array(calib[cam]['dist']) for cam in meta.cam_range]
            meta.K_list = [np.array(calib[cam]['K']) for cam in meta.cam_range]
            meta.M_list = [np.array(calib[cam]['M']) for cam in meta.cam_range]
            # meta.M_list = [np.linalg.inv(M) for M in meta.M_list]

            # compensate for crop
            meta.K_list = [compensate_crop_K(K, s, o) for K, s, o in zip(meta.K_list, meta.scales, meta.offsets)]

            # compensate 2D coordinates for crop
            meta.uv_merged = [compensate_crop_coords(pts, s, o) for pts, s, o in zip(meta.uv_merged, meta.scales, meta.offsets)]

            # # undistort them
            # meta.img_list = [cv2.undistort(I, K, dist) for I, K, dist in zip(meta.img_list, meta.K_list, dist)]

            meta.uv = [cl.project(cl.trafo_coords(meta.xyz, M), K) for K, M in zip(meta.K_list, meta.M_list)]

        elif type(meta) == DatasetUnlabeledMeta:

            meta.is_supervised = 0.0

            # read a video frames
            fid, meta.img_list = read_random_video_frames(meta.video_set)

            # read calibration
            calib = _get_calib_videos(meta.calib)
            # dist = [np.array(calib[cam]['dist']) for cam in meta.cam_range]
            meta.K_list = [np.array(calib['K'][cam]) for cam in meta.cam_range]
            meta.M_list = [np.array(calib['M'][cam]) for cam in meta.cam_range]

            # read bounding box
            boxes, meta.voxel_root = _get_pred_bb(meta.pred_bb_file, fid)

            if meta.voxel_root is None:
                print('Invalid root in unlabeled sequence, Skipping.')
                return None

            # crop images according to the boxes
            meta.img_list, meta.K_list = _crop_images(meta.img_list, boxes, meta.K_list)

            # create dummy data just so it's available:
            meta.img_paths = meta.video_set
            meta.xyz = np.zeros((12, 3))
            meta.uv = np.zeros((len(meta.cam_range), 12, 2))
            meta.uv_merged = meta.uv
            meta.vis = np.zeros((12, ))
            meta.vis_merged = np.zeros((len(meta.cam_range), 12))

        else:
            raise NotImplementedError()


    return metas
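
StitchedImage (used in several examples) is a project utility that tiles a list of camera views into one canvas exposed as .image, optionally resized to target_size. A rough sketch of the assumed behavior, not the actual implementation:

import numpy as np
import cv2

def stitch_images_sketch(img_list, target_size=None):
    """ Tile images of equal size into a roughly square grid. """
    n = len(img_list)
    cols = int(np.ceil(np.sqrt(n)))
    rows = int(np.ceil(n / cols))
    h, w = img_list[0].shape[:2]
    canvas = np.zeros((rows * h, cols * w, 3), img_list[0].dtype)
    for i, img in enumerate(img_list):
        r, c = divmod(i, cols)
        canvas[r * h:(r + 1) * h, c * w:(c + 1) * w] = cv2.resize(img, (w, h))
    if target_size is not None:
        canvas = cv2.resize(canvas, (target_size[1], target_size[0]))  # (h, w) -> cv2 expects (w, h)
    return canvas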
Example #9
                    # if not detected use full image
                    this_box = [0.0, 1.0, 0.0, 1.0]
                box_scaled = np.array([
                    this_box[0] * w, this_box[1] * w, this_box[2] * h,
                    this_box[3] * h
                ])
                this_img = draw_bb(this_img,
                                   box_scaled,
                                   mode='lrtb',
                                   color='g',
                                   linewidth=2)

            if args.draw_root:
                # draw voxel root
                root_uv = cl.project(
                    cl.trafo_coords(np.array(predictions[idx]['xyz']),
                                    M_list[i]), K_list[i])
                this_img = cv2.circle(this_img,
                                      (int(root_uv[0, 0]), int(root_uv[0, 1])),
                                      radius=5,
                                      color=(0, 255, 255),
                                      thickness=-1)

            # draw keypoints
            if 'kp_xyz' in predictions[idx].keys():
                uv_proj = cl.project(
                    cl.trafo_coords(np.array(predictions[idx]['kp_xyz'][0]),
                                    M_list[i]), K_list[i])
                this_img = draw_skel(this_img,
                                     model,
                                     uv_proj,
        for fid in range(fid1, fid2):
            img, K, img_shape = reader.read()

            # inpaint pose
            img_p = img.copy()
            for c, pose in zip(['g', 'b'], [pose_ours, pose_dlc]):

                last_uv = None
                for t in range(5):
                    if fid - t < 0:
                        continue
                    if pose[fid - t] is None:
                        continue
                    p = np.reshape(np.array(pose[fid - t]), [-1, 3])
                    uv = cl.project(cl.trafo_coords(p, M_list[cid]),
                                    K)[args.kp_id]
                    # img_p = cv2.circle(img_p,
                    #                    (int(uv[0]), int(uv[1])),
                    #                    radius=2,
                    #                    color=(255, 0, 0) if c == 'b' else (0, 255, 0),
                    #                    thickness=-1)
                    col = (0, 255, 0) if c == 'g' else (0, 0, 255)

                    if last_uv is None:
                        img_p = cv2.circle(img_p, (int(uv[0]), int(uv[1])),
                                           radius=5,
                                           color=col,
                                           thickness=2)
                    else:
                        img_p = cv2.line(img_p, (int(uv[0]), int(uv[1])),
                                         (int(last_uv[0]), int(last_uv[1])),
Example #11
        fetches = [
            xyz_tf,
            uv_tf,
            pred_tensors[data_t.pred_score3d][-1]
        ]
        fetches_v = sess.run(fetches, feed_dict=feeds)

        # save predictions
        xyz_pred, uv_pred, score_pred = fetches_v
        predictions[idx]['kp_xyz'] = xyz_pred
        predictions[idx]['kp_score'] = score_pred

        if args.show:
            img_list = list()
            for i, (this_img, this_box, this_uv) in enumerate(zip(imgs, boxes, uv_pred)):
                uv_proj = cl.project(cl.trafo_coords(xyz_pred[0], M_list[i]), K_list[i])
                h, w = this_img.shape[:2]
                if np.all(np.abs(this_box) < 1e-4):
                    # if not detected use full image
                    this_box = [0.0, 1.0, 0.0, 1.0]
                box_scaled = np.array([this_box[0] * w, this_box[1] * w, this_box[2] * h, this_box[3] * h])
                this_img_box = draw_bb(this_img, box_scaled, mode='lrtb', color='g', linewidth=2)

                root_uv = cl.project(cl.trafo_coords(root, M_list[i]), K_list[i])
                this_img_box = cv2.circle(this_img_box,
                                          (int(root_uv[0, 0]), int(root_uv[0, 1])),
                                          radius=5,
                                          color=(0, 255, 255),
                                          thickness=-1)

                img_list.append(
Example #12
def triangulate_robust(kp_uv,
                       vis,
                       K_list,
                       M_list,
                       dist_list=None,
                       threshold=50.0,
                       mode=None):
    """ Given some 2D observations kp_uv and their respective validity this function finds 3D point hypothesis
        kp_uv: NxKx2 2D points
        vis: NxK visibility
        K_list Nx3x3 camera intrinsic
        dist_list Nx1x5 camera distortion
        M_list Nx4x4 camera extrinsic

        points3d: Kx3 3D point hypothesis
        points2d_proj: NxKx2 projection of points3d into the N cameras
        vis3d: K Validity of points3d
        points2d_merged: NxKx2 Merged result of input observations and points2d_proj according to
                points2d_merged = points2d_proj if ~vis else kp_uv
            So it basically keeps the 2D annotations and uses the reprojection if there was no annotation.
        vis2d_merged: NxK Validity of points2d_merged.
    """
    global G_TRIANG_TOOL, G_TRIANG_TOOL_AVAIL
    if not G_TRIANG_TOOL_AVAIL:
        raise ImportError('Could not load the triangulation toolbox.')

    # create tool if necessary
    if G_TRIANG_TOOL is None:
        G_TRIANG_TOOL = TriangTool()

    if mode is None:
        mode = t_triangulation.RANSAC

    # output values
    num_cams, num_kp = kp_uv.shape[:2]
    points3d = np.zeros((num_kp, 3), dtype=np.float32)
    vis3d = np.zeros((num_kp, ), dtype=np.float32)
    points2d_proj = np.zeros((num_cams, num_kp, 2), dtype=np.float32)

    # merged result of projection and 2D annotation (uses the 2D annotation if available, otherwise the 3D projection)
    points2d_merged = kp_uv.copy()
    vis2d_merged = vis.copy()  # validity of points2d_merged

    # iterate over keypoints
    for kp_id in range(num_kp):
        # resort data
        points2d = list()
        cams = list()
        for cid in range(num_cams):
            if vis[cid, kp_id] > 0.5:
                points2d.append(kp_uv[cid, kp_id])
                cams.append(cid)

        if np.unique(cams).shape[0] >= 2:
            # find a consistent 3D hypothesis for this keypoint from at least two views
            point3d, inlier = G_TRIANG_TOOL.triangulate(
                [K_list[i] for i in cams], [M_list[i] for i in cams],
                np.expand_dims(np.array(points2d), 1),
                dist_list=dist_list,
                mode=mode,
                threshold=threshold)
            if np.sum(inlier) >= 2:
                points3d[kp_id] = point3d
                vis3d[kp_id] = 1.0
                for cid, (K, M) in enumerate(zip(K_list, M_list)):
                    points2d_proj[cid, kp_id] = cl.project(
                        cl.trafo_coords(point3d, M), K)

                    is_outlier_label = np.linalg.norm(
                        points2d_proj[cid, kp_id] -
                        kp_uv[cid, kp_id]) >= 2 * threshold

                    # fill in projection into merged, if this point was not visible before
                    if vis2d_merged[cid, kp_id] < 0.5:
                        # if the 2D point was not filled before set projection
                        points2d_merged[cid, kp_id] = points2d_proj[cid, kp_id]
                        vis2d_merged[cid, kp_id] = 1.0

                    elif is_outlier_label:
                        # some 2D labels are just weird outliers:
                        # if the distance between the consistent 3D point's projection and the label is too big, we stick with the 3D point
                        points2d_merged[cid, kp_id] = points2d_proj[cid, kp_id]
                        vis2d_merged[cid, kp_id] = 1.0

    return points3d, points2d_proj, vis3d, points2d_merged, vis2d_merged
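
TriangTool is project-specific and not shown here. For reference, the core operation it presumably wraps, a plain multi-view DLT triangulation without RANSAC or distortion handling, can be sketched as follows; M_list is assumed to hold world-to-camera transforms, as in the code above.

import numpy as np

def triangulate_dlt_sketch(uv_list, K_list, M_list):
    """ uv_list: per-camera (2,) pixel observations, K_list: (3,3), M_list: (4,4). """
    A = list()
    for uv, K, M in zip(uv_list, K_list, M_list):
        P = np.matmul(K, M[:3])  # 3x4 projection matrix
        A.append(uv[0] * P[2] - P[0])
        A.append(uv[1] * P[2] - P[1])
    _, _, vt = np.linalg.svd(np.stack(A))
    Xh = vt[-1]
    return Xh[:3] / Xh[3]  # inhomogeneous 3D point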
Example #13
    def btn_write(self):
        self.save_label_state()  # save current annotation

        num_kp = len(self.config['keypoints'])
        empty = {
            'kp_xyz': np.zeros((num_kp, 3)),
            'vis3d': np.zeros((num_kp, ))
        }

        # assemble all info we want to write to disk
        output_data = dict()
        for k in self.file_list_sel_full_keys:
            fid = int(k)
            if k in self.label_tasks.keys():
                output_data[fid] = self.label_tasks[k]

                # project into views
                for i, cid in enumerate(self.cam_range):
                    # project into frame
                    xyz = self.label_tasks[k]['kp_xyz']
                    kp_uv = cl.project(cl.trafo_coords(xyz, self.M_list[i]),
                                       self.K_list[i], self.dist_list[i])
                    output_data[fid]['cam%d' % cid] = {
                        'kp_uv': kp_uv,
                        'vis': self.label_tasks[k]['vis3d']
                    }

            else:
                output_data[fid] = empty

        self.pb_start(len(output_data))

        # figure out base path
        i = 0
        while True:
            base_path = os.path.join(os.path.dirname(self.video_list[0]),
                                     self.output_task_dir % i)
            if not os.path.exists(base_path):
                break
            i += 1

        # dump frames
        for fid, _ in output_data.items():
            img_list, K_list, M_list, dist_list = self.precacher.get_data(fid)

            # write image frames
            for cid, img in zip(self.cam_range, img_list):
                output_path = os.path.join(base_path, 'cam%d' % cid,
                                           '%08d.png' % fid)
                my_mkdir(output_path, is_file=True)
                cv2.imwrite(output_path, img)
                # print('Dumped: ', output_path)
            self.pb_update()

        self.pb_finish()

        # dump anno
        anno_out_path = os.path.join(base_path, 'anno.json')
        my_mkdir(anno_out_path, is_file=True)
        json_dump(anno_out_path,
                  {'%08d.png' % k: v
                   for k, v in output_data.items()},
                  verbose=True)
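
json_dump is a project helper; since kp_xyz, kp_uv and vis3d above are numpy arrays, it presumably converts them to plain Python types before writing. A hypothetical sketch (name and behavior are assumptions):

import json
import numpy as np

def json_dump_sketch(path, data, verbose=False):
    def _convert(x):
        if isinstance(x, np.ndarray):
            return x.tolist()
        if isinstance(x, (np.floating, np.integer)):
            return x.item()
        if isinstance(x, dict):
            return {k: _convert(v) for k, v in x.items()}
        if isinstance(x, (list, tuple)):
            return [_convert(v) for v in x]
        return x

    with open(path, 'w') as f:
        json.dump(_convert(data), f)
    if verbose:
        print('Dumped %d entries to %s' % (len(data), path))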
Example #14
def post_process_detections(boxes,
                            scores,
                            K_list,
                            M_list,
                            img_shape,
                            min_score_cand=0.2,
                            min_score_pick=0.1,
                            max_reproj_error=10.0,
                            verbose=True,
                            img=None,
                            logger=None):
    """ Some post processing to increase the quality of bounding box detections.
        We consider all bounding boxes as candidate above some min_score_cand and calculate their 2D center position.
        Using all centers we aim to find a 3D hypothesis explaining as many centers as possible. Subsequently, we pick
        the boxes with minimal distance to the
    """
    global triang_tool
    if triang_tool is None:
        triang_tool = TriangTool()

    output = {'boxes': None, 'xyz': None}

    # calculate bounding box centers in pixels (boxes are normalized [ymin, xmin, ymax, xmax])
    box_centers = np.stack([
        0.5 * (boxes[:, :, 1] + boxes[:, :, 3]) * img_shape[1],
        0.5 * (boxes[:, :, 0] + boxes[:, :, 2]) * img_shape[0]
    ], -1)

    if img is not None:
        # if images were given we assume they should be shown
        img_list = list()
        # show centers
        for ind, I in enumerate(img):
            I = I.copy()
            for s, b in zip(scores[ind], box_centers[ind]):
                if s < min_score_cand:
                    continue
                c = (int(b[0]), int(b[1]))
                I = cv2.circle(I, c, radius=5, color=(0, 0, 255), thickness=-1)
            img_list.append(I)

        from utils.StitchedImage import StitchedImage
        merge = StitchedImage(img_list)
        cv2.imshow('centers all', merge.image)
        cv2.waitKey(10)

    # resort data
    points2d = list()
    cams = list()
    for cid in range(boxes.shape[0]):
        for i in range(boxes.shape[1]):
            if scores[cid, i] > min_score_cand:
                points2d.append(box_centers[cid, i])
                cams.append(cid)

        if verbose and logger is not None:
            logger.log('Cam %d contributes %d points for triangulation' %
                       (cid, np.sum(scores[cid] > min_score_cand)))
    points2d = np.array(points2d)

    if np.unique(cams).shape[0] >= 3:
        # find a consistent 3D hypothesis for the bounding box centers
        point3d, inlier = triang_tool.triangulate([K_list[i] for i in cams],
                                                  [M_list[i] for i in cams],
                                                  np.expand_dims(points2d, 1),
                                                  mode=t_triangulation.RANSAC,
                                                  threshold=max_reproj_error)
        if verbose and logger is not None:
            logger.log('Found 3D point with %d inliers' % np.sum(inlier))

            if img is not None:
                img_list = list()
                for ind, (I, K, M) in enumerate(zip(img, K_list, M_list)):
                    p2d = cl.project(cl.trafo_coords(point3d, M), K)
                    c = (int(p2d[0, 0]), int(p2d[0, 1]))
                    print(ind, c)
                    I = cv2.circle(I.copy(),
                                   c,
                                   radius=5,
                                   color=(0, 0, 255),
                                   thickness=-1)
                    img_list.append(I)

                from utils.StitchedImage import StitchedImage
                merge = StitchedImage(img_list)
                cv2.imshow('center consistent', merge.image)
                cv2.waitKey()

        if np.sum(inlier) > 0:
            output['xyz'] = point3d

            # select per view the optimal box wrt the reprojected center
            order = [1, 3, 0, 2]  # reorder (ymin, xmin, ymax, xmax) -> (left, right, top, bottom)
            boxes_opti = list()
            for cid, (K, M) in enumerate(zip(K_list, M_list)):
                uv = cl.project(cl.trafo_coords(point3d, M), K)

                # find the bbox with minimal distance to the found center;
                # we want to pick something with low distance and high score
                diff_l2 = np.sqrt(np.sum(np.square(box_centers[cid] - uv), -1))
                diff_combined = diff_l2 / np.sqrt(scores[cid] + 0.001)
                ind = np.argmin(diff_combined)
                boxes_opti.append(boxes[cid, ind, order])
            output['boxes'] = np.stack(boxes_opti)

            return output

    # If we get here it's time to use the fallback solution:
    # use the top-scoring bbox in each frame independently
    boxes_opti = list()
    order = [1, 3, 0, 2]  # reorder (ymin, xmin, ymax, xmax) -> (left, right, top, bottom)
    for box, score in zip(boxes, scores):
        ind = np.argmax(score)
        boxes_opti.append(box[ind, order])
    output['boxes'] = np.stack(boxes_opti)

    if verbose and logger is not None:
        logger.log('Using fallback solution: best scoring box from each view (too few inliers).')

    return output