Example #1
def main(args):
    cfg = yaml.load(open(args.config), Loader=yaml.SafeLoader)
    gpu_mode = args.mode == 'gpu'
    tddfa = TDDFA(gpu_mode=gpu_mode, **cfg)

    # Initialize FaceBoxes
    face_boxes = FaceBoxes()

    # Given a video path
    fn = args.video_fp.split('/')[-1]
    reader = imageio.get_reader(args.video_fp)

    fps = reader.get_meta_data()['fps']

    suffix = get_suffix(args.video_fp)
    video_wfp = f'examples/results/videos/{fn.replace(suffix, "")}_{args.opt}.mp4'
    writer = imageio.get_writer(video_wfp, fps=fps)

    # run
    dense_flag = args.opt in ('3d',)
    pre_ver = None
    for i, frame in tqdm(enumerate(reader)):
        frame_bgr = frame[..., ::-1]  # RGB->BGR

        if i == 0:
            # first frame: detect the face; here we only use the first detected face, change this to fit your needs
            boxes = face_boxes(frame_bgr)
            boxes = [boxes[0]]
            param_lst, roi_box_lst = tddfa(frame_bgr, boxes)
            ver = tddfa.recon_vers(param_lst, roi_box_lst, dense_flag=dense_flag)[0]

            # refine
            param_lst, roi_box_lst = tddfa(frame_bgr, [ver], crop_policy='landmark')
            ver = tddfa.recon_vers(param_lst, roi_box_lst, dense_flag=dense_flag)[0]
        else:
            param_lst, roi_box_lst = tddfa(frame_bgr, [pre_ver], crop_policy='landmark')

            roi_box = roi_box_lst[0]
            # todo: add a confidence threshold to judge whether tracking has failed
            if abs(roi_box[2] - roi_box[0]) * abs(roi_box[3] - roi_box[1]) < 2020:
                boxes = face_boxes(frame_bgr)
                boxes = [boxes[0]]
                param_lst, roi_box_lst = tddfa(frame_bgr, boxes)

            ver = tddfa.recon_vers(param_lst, roi_box_lst, dense_flag=dense_flag)[0]

        pre_ver = ver  # for tracking

        if args.opt == '2d':
            res = cv_draw_landmark(frame_bgr, ver)
        elif args.opt == '3d':
            res = render(frame_bgr, [ver])
        else:
            raise ValueError(f'Unknown opt {args.opt}')

        writer.append_data(res[..., ::-1])  # BGR->RGB

    writer.close()
    print(f'Dump to {video_wfp}')
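These main() functions are excerpts from 3DDFA_V2-style video demo scripts, so the surrounding imports and argument parser are not shown. A minimal sketch of the assumed entry point follows; the module paths mirror the 3DDFA_V2 repository layout and the flag names follow the args.* attributes used in the examples, but the defaults (and the RAD2DEG constant used in Example #3) are assumptions, not repo code.

# Sketch of the setup the main() functions in this section assume (paths and defaults are guesses).
import argparse
import math
import socket
from collections import deque

import cv2
import imageio
import numpy as np
import yaml
from tqdm import tqdm

from FaceBoxes import FaceBoxes
from TDDFA import TDDFA
from utils.functions import cv_draw_landmark, get_suffix
from utils.pose import P2sRt, calc_pose, viz_pose
from utils.render import render

RAD2DEG = 180.0 / math.pi  # assumed module-level constant, used by Example #3

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='3DDFA_V2 video demo (sketch)')
    parser.add_argument('-c', '--config', default='configs/mb1_120x120.yml')
    parser.add_argument('-f', '--video_fp', type=str)
    parser.add_argument('-m', '--mode', default='cpu', choices=['cpu', 'gpu'])
    parser.add_argument('-o', '--opt', default='2d', choices=['2d', '2d_sparse', '2d_dense', '3d'])
    parser.add_argument('--onnx', action='store_true')
    parser.add_argument('--n_pre', type=int, default=1)
    parser.add_argument('--n_next', type=int, default=1)
    parser.add_argument('--start', type=int, default=-1)
    parser.add_argument('--end', type=int, default=-1)
    parser.add_argument('--vis', action='store_true')
    parser.add_argument('--ip', default='127.0.0.1')      # Example #3: opentrack UDP target
    parser.add_argument('--port', type=int, default=4242)
    parser.add_argument('--fov', type=float, default=60.0)
    main(parser.parse_args())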
Example #2
def main(args):
    cfg = yaml.load(open(args.config), Loader=yaml.FullLoader)
    gpu_mode = args.mode == 'gpu'
    tddfa = TDDFA(gpu_mode=gpu_mode, **cfg)

    # Initialize FaceBoxes
    face_boxes = FaceBoxes()

    # Given a video path
    fn = args.video_fp.split('/')[-1]
    reader = imageio.get_reader(args.video_fp)

    fps = reader.get_meta_data()['fps']
    video_wfp = f'examples/results/videos/{fn.replace(".avi", ".mp4")}'
    writer = imageio.get_writer(video_wfp, fps=fps)

    pre_ver = None
    for i, frame in tqdm(enumerate(reader)):
        frame_bgr = frame[:, :, ::-1]  # RGB->BGR

        if i == 0:
            # first frame: detect the face; here we only use the first detected face, change this to fit your needs
            boxes = face_boxes(frame_bgr)
            boxes = [boxes[0]]
            param_lst, roi_box_lst = tddfa(frame_bgr, boxes)
            ver = tddfa.recon_vers(param_lst, roi_box_lst)[0]

            # refine
            param_lst, roi_box_lst = tddfa(frame_bgr, [ver],
                                           crop_policy='landmark')
            ver = tddfa.recon_vers(param_lst, roi_box_lst)[0]
        else:
            param_lst, roi_box_lst = tddfa(frame_bgr, [pre_ver],
                                           crop_policy='landmark')

            roi_box = roi_box_lst[0]
            # todo: add a confidence threshold to judge whether tracking has failed
            if abs(roi_box[2] - roi_box[0]) * abs(roi_box[3] -
                                                  roi_box[1]) < 2020:
                boxes = face_boxes(frame_bgr)
                boxes = [boxes[0]]
                param_lst, roi_box_lst = tddfa(frame_bgr, boxes)

            ver = tddfa.recon_vers(param_lst, roi_box_lst)[0]

        pre_ver = ver  # for tracking

        img_draw = cv_draw_landmark(frame_bgr, ver)
        writer.append_data(img_draw[:, :, ::-1])  # BGR->RGB

    writer.close()
    print(f'Dump to {video_wfp}')
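Both examples above (and every later one) fall back to a fresh detector pass whenever the tracked ROI collapses below a hard-coded area of 2020 px², which is what the recurring todo comments refer to. Pulled out on its own, with an illustrative helper name, the check is just:

# Sketch of the tracking-failure fallback shared by all the examples in this section.
ROI_MIN_AREA = 2020  # px^2, the hard-coded threshold from the snippets above

def roi_too_small(roi_box):
    """True when the tracked ROI area drops below the threshold and a full
    face detection should be re-run (illustrative helper, not repo code)."""
    x0, y0, x1, y1 = roi_box[:4]
    return abs(x1 - x0) * abs(y1 - y0) < ROI_MIN_AREA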
Example #3
def main(args):
    cfg = yaml.load(open(args.config), Loader=yaml.SafeLoader)
    gpu_mode = args.mode == 'gpu'
    tddfa = TDDFA(gpu_mode=gpu_mode, **cfg)
    vis = args.vis
    UDP_IP = args.ip
    UDP_PORT = args.port
    fov = args.fov

    with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as sock, \
        imageio.get_reader("<video0>") as reader:

        # Initialize FaceBoxes
        face_boxes = FaceBoxes()

        dense_flag = False
        pre_ver = None
        first = True
        for frame in reader:
            frame_bgr = frame[..., ::-1]  # RGB->BGR
            #inference_time = time.time()
            if first:
                # first frame: detect the face; here we only use the first detected face, change this to fit your needs
                boxes = face_boxes(frame_bgr)
                if boxes:
                    boxes = [boxes[0]]
                    param_lst, roi_box_lst = tddfa(frame_bgr, boxes)
                    ver = tddfa.recon_vers(param_lst, roi_box_lst, dense_flag=dense_flag)[0]
                    # refine
                    param_lst, roi_box_lst = tddfa(frame_bgr, [ver], crop_policy='landmark')
                    ver = tddfa.recon_vers(param_lst, roi_box_lst, dense_flag=dense_flag)[0]
                    first = False
            else:
                param_lst, roi_box_lst = tddfa(frame_bgr, [pre_ver], crop_policy='landmark')

                roi_box = roi_box_lst[0]
                # todo: add a confidence threshold to judge whether tracking has failed
                if abs(roi_box[2] - roi_box[0]) * abs(roi_box[3] - roi_box[1]) < 2020:
                    boxes = face_boxes(frame_bgr)
                    if boxes:
                        boxes = [boxes[0]]
                        param_lst, roi_box_lst = tddfa(frame_bgr, boxes)
                    else:
                        first = True
                        pre_ver = None
                if boxes:
                    ver = tddfa.recon_vers(param_lst, roi_box_lst, dense_flag=dense_flag)[0]
            #inference_time = time.time()-inference_time
            if boxes:
                pre_ver = ver  # for tracking
                # Most of what follows here is an attempt to compute the translation t
                # of the head model in world space. The depth coordinate shall be denoted tz.
                h, w, _ = frame.shape
                f, R, t = P2sRt(param_lst[0][:12].reshape(3, -1))
                P, pose = calc_pose(param_lst[0])
                x0, y0, x1, y1 = map(int, roi_box_lst[0])
                # Compute the translation vector in image space.
                # The predicted t is defined in an arbitrarily scaled way (from our perspective here).
                # See tddfa_util.similar_transform(), which computes the orthographic
                # projection of 3D mesh positions to the image space of our roi_box.
                t_img_x = x0 + (t[0]-1) * (x1-x0) / tddfa.size
                t_img_y = y0 + (tddfa.size-t[1]) * (y1-y0) / tddfa.size
                # The following should give us the scaling transform from the
                # space of the (deformed) model to image pixels.
                scale_model_to_roi_x = f / tddfa.size * (x1-x0)
                # The y "version" is probably less meaningful, but it gets averaged in below anyway.
                scale_model_to_roi_y = f / tddfa.size * (y1-y0)
                # To relate lengths in our ROI box to the real world, we can use the size of the
                # deformable model. Judging from the magnitude of the coordinate values
                # in the model files (https://github.com/cleardusk/3DDFA/tree/master/, u_exp.npy and u_shp.npy),
                # they are defined in micrometers. Hence ...
                scale_model_to_meters = 1.e-6   # micrometers to meters.
                # Furthermore, let's define the focal length of our webcam, treated as a pinhole camera
                # (strictly, this is one over the focal length, in normalized screen units).
                focal_length_inv_x = math.tan(fov*math.pi/180.*0.5)
                focal_length_inv_y = math.tan(fov*h/w*math.pi/180.*0.5)
                # Considering the perspective transform, we know that tz/world_space_size = f/projected_size,
                # where projected_size is measured in screen space (ranging from -1 to 1). Rearranging slightly:
                # tz = focal_length * world_space_size/projected_size. The quotient at the end is just a scaling
                # factor we can compute from the quantities we have: scale_model_to_roi_x/w gives us the
                # model->screen scale transform, its inverse gives screen->model, and multiplying by
                # model->meters yields the desired screen->meters scaling.
                # Uhm ... except I have this twice now, one for x, one for y ...
                tz_roi_x = scale_model_to_meters * w / (scale_model_to_roi_x * focal_length_inv_x)
                tz_roi_y = scale_model_to_meters * h / (scale_model_to_roi_y * focal_length_inv_y)
                # Just average ... 
                tz_roi = (tz_roi_x + tz_roi_y)*0.5
                # I don't know how the length units of the predicted translation depth are defined.
                # This scaling factor does not seem too bad though.
                scale_model_z = scale_model_to_meters / f
                # So we just add the scaled predicted depth to the guesstimated depth from the head size.
                tz = tz_roi + t[2]*scale_model_z
                # Using the perspective transform, calculating the world x and y translation is easy.
                tx = (t_img_x / w * 2. - 1.) * tz * focal_length_inv_x
                ty = (t_img_y / h * 2. - 1.) * tz * focal_length_inv_y
                
                # This final part corrects the yaw and pitch values. The reason is simple: the network only sees
                # the ROI around the face. When the ROI moves off-center, our head is seen at an angle due to
                # perspective even though in world space we face straight forward in the z-direction. By adding
                # the angle between the forward direction and the head translation vector, we compensate for this.
                yaw_correction = math.atan2(tx, tz) * RAD2DEG
                pitch_correction = math.atan2(ty, tz) * RAD2DEG
                pose[0] += yaw_correction
                pose[1] += pitch_correction

                print(f"H {pose[0]:.1f} P {pose[1]:.1f} B {pose[2]:.1f} "
                      f"X {tx:.2f} Y {ty:.2f} Z {tz:.2f}")
                
                # Send the pose to opentrack
                a = np.zeros((6,), dtype=np.float64)
                # I guess it wants cm ...
                a[0] = tx * 100.
                a[1] = ty * 100.
                a[2] = tz * 100.
                # Degrees are fine.
                a[3] = pose[0]
                a[4] = pose[1]
                a[5] = pose[2]
                sock.sendto(a.tobytes(), (UDP_IP, UDP_PORT))
                #print ("update period ", time.time()-time_val, "inference time", inference_time)
                #time_val = time.time()

                if vis:
                    img_draw = cv_draw_landmark(frame_bgr, ver)
                    viz_pose(img_draw, param_lst, [ver])
                    # The 2D bounding box
                    cv2.rectangle(img_draw, (x0, y0), (x1, y1), (128, 0, 0), 1)
                    # Draw the projection of a 10cm radius circle. Sanity check for the 
                    # scaling and the translation estimation.
                    cx = int((tx / (focal_length_inv_x*tz) + 1.)*0.5*w)
                    cy = int((ty / (focal_length_inv_y*tz) + 1.)*0.5*h)
                    r = int(0.1 * w / (focal_length_inv_x*tz))
                    cv2.circle(img_draw, (cx, cy), r, (0, 0, 255), 1)
                    cv2.circle(img_draw, (cx, cy), 3, (0, 0, 255), -1)
                    cv2.imshow('image', img_draw)
                    cv2.waitKey(20)
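The depth estimate in the comments above boils down to the pinhole relation: an object of world-space size S at depth tz projects to a screen-space size s = S / (tz * tan(fov/2)), with s measured relative to the half image width, so tz = S / (s * tan(fov/2)). Below is a self-contained sketch of that relation under the same assumptions (model coordinates in micrometers, horizontal FOV in degrees). It is not the exact arithmetic of Example #3, which folds the per-axis scale factors together slightly differently; the helper name and the example numbers are illustrative.

import math

def estimate_depth_from_roi(roi_width_px, image_width_px, model_width_units,
                            fov_deg, model_unit_to_meters=1e-6):
    """Rough pinhole-camera depth from apparent head size (sketch only)."""
    # World-space width of the (deformed) head model; model units assumed to be micrometers.
    world_width_m = model_width_units * model_unit_to_meters
    # Projected width as a fraction of the half image width (the [-1, 1] screen range).
    projected = roi_width_px / (image_width_px / 2.0)
    # Pinhole model: projected = world_width / (tz * tan(fov/2))  =>  solve for tz.
    return world_width_m / (projected * math.tan(math.radians(fov_deg) * 0.5))

# e.g. a ~160 mm wide head spanning 160 px of a 640 px frame at 60 deg horizontal FOV
print(estimate_depth_from_roi(160, 640, 160_000, 60))  # ~0.55 m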
Example #4
def main(args):
    cfg = yaml.load(open(args.config), Loader=yaml.SafeLoader)

    # Init FaceBoxes and TDDFA, recommend using onnx flag
    if args.onnx:
        import os
        os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
        os.environ['OMP_NUM_THREADS'] = '4'

        from FaceBoxes.FaceBoxes_ONNX import FaceBoxes_ONNX
        from TDDFA_ONNX import TDDFA_ONNX

        face_boxes = FaceBoxes_ONNX()
        tddfa = TDDFA_ONNX(**cfg)
    else:
        gpu_mode = args.mode == 'gpu'
        tddfa = TDDFA(gpu_mode=gpu_mode, **cfg)
        face_boxes = FaceBoxes()

    # Given a camera
    # before running this line, make sure you have installed `imageio-ffmpeg`
    reader = imageio.get_reader("<video0>")

    # a simple implementation of average smoothing by looking ahead n_next frames
    # assumes the number of frames in the video is >= n
    n_pre, n_next = args.n_pre, args.n_next
    n = n_pre + n_next + 1
    queue_ver = deque()
    queue_frame = deque()

    # run
    dense_flag = args.opt in ('2d_dense', '3d')
    pre_ver = None
    for i, frame in tqdm(enumerate(reader)):
        frame_bgr = frame[..., ::-1]  # RGB->BGR

        if i == 0:
            # first frame: detect the face; here we only use the first detected face, change this to fit your needs
            boxes = face_boxes(frame_bgr)
            boxes = [boxes[0]]
            param_lst, roi_box_lst = tddfa(frame_bgr, boxes)
            ver = tddfa.recon_vers(param_lst, roi_box_lst, dense_flag=dense_flag)[0]

            # refine
            param_lst, roi_box_lst = tddfa(frame_bgr, [ver], crop_policy='landmark')
            ver = tddfa.recon_vers(param_lst, roi_box_lst, dense_flag=dense_flag)[0]

            # padding queue
            for _ in range(n_pre):
                queue_ver.append(ver.copy())
            queue_ver.append(ver.copy())

            for _ in range(n_pre):
                queue_frame.append(frame_bgr.copy())
            queue_frame.append(frame_bgr.copy())
        else:
            param_lst, roi_box_lst = tddfa(frame_bgr, [pre_ver], crop_policy='landmark')

            roi_box = roi_box_lst[0]
            # todo: add a confidence threshold to judge whether tracking has failed
            if abs(roi_box[2] - roi_box[0]) * abs(roi_box[3] - roi_box[1]) < 2020:
                boxes = face_boxes(frame_bgr)
                boxes = [boxes[0]]
                param_lst, roi_box_lst = tddfa(frame_bgr, boxes)

            ver = tddfa.recon_vers(param_lst, roi_box_lst, dense_flag=dense_flag)[0]

            queue_ver.append(ver.copy())
            queue_frame.append(frame_bgr.copy())

        pre_ver = ver  # for tracking

        # smoothing: enqueue and dequeue ops
        if len(queue_ver) >= n:
            ver_ave = np.mean(queue_ver, axis=0)

            if args.opt == '2d_sparse':
                img_draw = cv_draw_landmark(queue_frame[n_pre], ver_ave)  # since we use padding
            elif args.opt == '2d_dense':
                img_draw = cv_draw_landmark(queue_frame[n_pre], ver_ave, size=1)
            elif args.opt == '3d':
                img_draw = render(queue_frame[n_pre], [ver_ave], tddfa.tri, alpha=0.7)
            else:
                raise ValueError(f'Unknown opt {args.opt}')

            cv2.imshow('image', img_draw)
            k = cv2.waitKey(20)
            if (k & 0xff == ord('q')):
                break

            queue_ver.popleft()
            queue_frame.popleft()
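The n_pre / n_next queues above implement a centered moving average with a fixed output latency of n_next frames: the window holds n = n_pre + n_next + 1 vertex sets, and the frame that gets drawn is always the one sitting n_pre positions back in the queue. A stripped-down sketch of just that buffering pattern (synthetic landmark arrays stand in for the per-frame ver values; the helper name is illustrative):

from collections import deque

import numpy as np

def smooth_stream(vers, n_pre=1, n_next=1):
    """Yield (delayed_ver, averaged_ver) pairs using the same look-ahead
    buffering as the demos above (sketch, not repo code)."""
    n = n_pre + n_next + 1
    queue = deque()
    for i, ver in enumerate(vers):
        if i == 0:
            # pad the front so the first frame can be averaged immediately
            for _ in range(n_pre):
                queue.append(ver.copy())
        queue.append(ver.copy())
        if len(queue) >= n:
            yield queue[n_pre], np.mean(queue, axis=0)
            queue.popleft()

# dummy 3x2 "landmarks": frame t is filled with the value t
frames = [np.full((3, 2), float(t)) for t in range(6)]
for delayed, averaged in smooth_stream(frames):
    print(delayed[0, 0], averaged[0, 0])  # the last n_next frames never leave the buffer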
Example #5
def main(args):
    # Init TDDFA or TDDFA_ONNX
    cfg = yaml.load(open(args.config), Loader=yaml.SafeLoader)
    if args.onnx:
        from TDDFA_ONNX import TDDFA_ONNX
        tddfa = TDDFA_ONNX(**cfg)
    else:
        gpu_mode = args.mode == 'gpu'
        tddfa = TDDFA(gpu_mode=gpu_mode, **cfg)

    # Initialize FaceBoxes
    face_boxes = FaceBoxes()

    # Given a video path
    fn = args.video_fp.split('/')[-1]
    reader = imageio.get_reader(args.video_fp)

    fps = reader.get_meta_data()['fps']
    suffix = get_suffix(args.video_fp)
    video_wfp = f'examples/results/videos/{fn.replace(suffix, "")}_{args.opt}_smooth.mp4'
    writer = imageio.get_writer(video_wfp, fps=fps)

    # a simple implementation of average smoothing by looking ahead n_next frames
    # assumes the number of frames in the video is >= n
    n_pre, n_next = args.n_pre, args.n_next
    n = n_pre + n_next + 1
    queue_ver = deque()
    queue_frame = deque()

    # run
    dense_flag = args.opt in ('2d_dense', '3d')
    pre_ver = None
    for i, frame in tqdm(enumerate(reader)):
        if args.start > 0 and i < args.start:
            continue
        if args.end > 0 and i > args.end:
            break

        frame_bgr = frame[..., ::-1]  # RGB->BGR

        if pre_ver is None:
            # first processed frame (also handles args.start > 0): detect
            boxes = face_boxes(frame_bgr)
            boxes = [boxes[0]]
            param_lst, roi_box_lst = tddfa(frame_bgr, boxes)
            ver = tddfa.recon_vers(param_lst,
                                   roi_box_lst,
                                   dense_flag=dense_flag)[0]

            # refine
            param_lst, roi_box_lst = tddfa(frame_bgr, [ver],
                                           crop_policy='landmark')
            ver = tddfa.recon_vers(param_lst,
                                   roi_box_lst,
                                   dense_flag=dense_flag)[0]

            # padding queue
            for _ in range(n_pre):
                queue_ver.append(ver.copy())
            queue_ver.append(ver.copy())

            for _ in range(n_pre):
                queue_frame.append(frame_bgr.copy())
            queue_frame.append(frame_bgr.copy())

        else:
            param_lst, roi_box_lst = tddfa(frame_bgr, [pre_ver],
                                           crop_policy='landmark')

            roi_box = roi_box_lst[0]
            # todo: add a confidence threshold to judge whether tracking has failed
            if abs(roi_box[2] - roi_box[0]) * abs(roi_box[3] -
                                                  roi_box[1]) < 2020:
                boxes = face_boxes(frame_bgr)
                boxes = [boxes[0]]
                param_lst, roi_box_lst = tddfa(frame_bgr, boxes)

            ver = tddfa.recon_vers(param_lst,
                                   roi_box_lst,
                                   dense_flag=dense_flag)[0]

            queue_ver.append(ver.copy())
            queue_frame.append(frame_bgr.copy())

        pre_ver = ver  # for tracking

        # smoothing: enqueue and dequeue ops
        if len(queue_ver) >= n:
            ver_ave = np.mean(queue_ver, axis=0)

            if args.opt == '2d_sparse':
                img_draw = cv_draw_landmark(queue_frame[n_pre],
                                            ver_ave)  # since we use padding
            elif args.opt == '2d_dense':
                img_draw = cv_draw_landmark(queue_frame[n_pre],
                                            ver_ave,
                                            size=1)
            elif args.opt == '3d':
                img_draw = render(queue_frame[n_pre], [ver_ave],
                                  tddfa.bfm.tri,
                                  alpha=0.7)
            else:
                raise ValueError(f'Unknown opt {args.opt}')

            writer.append_data(img_draw[:, :, ::-1])  # BGR->RGB

            queue_ver.popleft()
            queue_frame.popleft()

    # we would otherwise lose the last n_next frames, so keep padding
    for _ in range(n_next):
        queue_ver.append(ver.copy())
        queue_frame.append(frame_bgr.copy())  # the last frame

        ver_ave = np.mean(queue_ver, axis=0)

        if args.opt == '2d_sparse':
            img_draw = cv_draw_landmark(queue_frame[n_pre],
                                        ver_ave)  # since we use padding
        elif args.opt == '2d_dense':
            img_draw = cv_draw_landmark(queue_frame[n_pre], ver_ave, size=1)
        elif args.opt == '3d':
            img_draw = render(queue_frame[n_pre], [ver_ave],
                              tddfa.bfm.tri,
                              alpha=0.7)
        else:
            raise ValueError(f'Unknown opt {args.opt}')

        writer.append_data(img_draw[..., ::-1])  # BGR->RGB

        queue_ver.popleft()
        queue_frame.popleft()

    writer.close()
    print(f'Dump to {video_wfp}')
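Without the post-loop padding above, the last n_next frames would stay stuck in the look-ahead buffer. Continuing the smooth_stream sketch from after Example #4, a variant that drains the buffer by re-appending the final vertices, mirroring the tail flush in Examples #5 and #7 (again a sketch with an illustrative helper name):

from collections import deque

import numpy as np

def smooth_stream_with_flush(vers, n_pre=1, n_next=1):
    """Like smooth_stream, but pads the tail with the final vertices so the
    last n_next frames are not dropped (sketch, not repo code)."""
    n = n_pre + n_next + 1
    queue = deque()
    last = None
    for i, ver in enumerate(vers):
        if i == 0:
            for _ in range(n_pre):
                queue.append(ver.copy())
        queue.append(ver.copy())
        last = ver
        if len(queue) >= n:
            yield np.mean(queue, axis=0)
            queue.popleft()
    if last is None:
        return  # empty input stream
    # tail flush: repeat the last vertices n_next times to drain the buffer
    for _ in range(n_next):
        queue.append(last.copy())
        yield np.mean(queue, axis=0)
        queue.popleft()

frames = [np.full((3, 2), float(t)) for t in range(6)]
print(sum(1 for _ in smooth_stream_with_flush(frames)))  # 6 outputs: no frames lost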
Example #6
def main(args):
    cfg = yaml.load(open(args.config), Loader=yaml.SafeLoader)
    gpu_mode = args.mode == 'gpu'
    tddfa = TDDFA(gpu_mode=gpu_mode, **cfg)

    # Initialize FaceBoxes
    face_boxes = FaceBoxes()

    # Given a camera
    # before running this line, make sure you have installed `imageio-ffmpeg`
    reader = imageio.get_reader("<video0>")

    # a simple implementation of average smoothing by looking ahead n_next frames
    # assumes the number of frames in the video is >= n
    n_pre, n_next = args.n_pre, args.n_next
    n = n_pre + n_next + 1
    queue_ver = deque()
    queue_frame = deque()

    # run
    dense_flag = args.opt in ('2d_dense', '3d')
    pre_ver = None
    for i, frame in tqdm(enumerate(reader)):
        frame_bgr = frame[..., ::-1]  # RGB->BGR

        if i == 0:
            # first frame: detect the face; here we only use the first detected face, change this to fit your needs
            boxes = face_boxes(frame_bgr)
            boxes = [boxes[0]]
            param_lst, roi_box_lst = tddfa(frame_bgr, boxes)
            ver = tddfa.recon_vers(param_lst,
                                   roi_box_lst,
                                   dense_flag=dense_flag)[0]

            # refine
            param_lst, roi_box_lst = tddfa(frame_bgr, [ver],
                                           crop_policy='landmark')
            ver = tddfa.recon_vers(param_lst,
                                   roi_box_lst,
                                   dense_flag=dense_flag)[0]

            # padding queue
            for _ in range(n_pre):
                queue_ver.append(ver.copy())
            queue_ver.append(ver.copy())

            for _ in range(n_pre):
                queue_frame.append(frame_bgr.copy())
            queue_frame.append(frame_bgr.copy())
        else:
            param_lst, roi_box_lst = tddfa(frame_bgr, [pre_ver],
                                           crop_policy='landmark')

            roi_box = roi_box_lst[0]
            # todo: add a confidence threshold to judge whether tracking has failed
            if abs(roi_box[2] - roi_box[0]) * abs(roi_box[3] -
                                                  roi_box[1]) < 2020:
                boxes = face_boxes(frame_bgr)
                boxes = [boxes[0]]
                param_lst, roi_box_lst = tddfa(frame_bgr, boxes)

            ver = tddfa.recon_vers(param_lst,
                                   roi_box_lst,
                                   dense_flag=dense_flag)[0]

            queue_ver.append(ver.copy())
            queue_frame.append(frame_bgr.copy())

        pre_ver = ver  # for tracking

        # smoothing: enqueue and dequeue ops
        if len(queue_ver) >= n:
            ver_ave = np.mean(queue_ver, axis=0)
            img_draw = cv_draw_landmark(queue_frame[n_pre],
                                        ver_ave)  # since we use padding
            cv2.imshow('image', img_draw)
            k = cv2.waitKey(20)
            if (k & 0xff == ord('q')):
                break

            queue_ver.popleft()
            queue_frame.popleft()
Example #7
def main(args):
    cfg = yaml.load(open(args.config), Loader=yaml.SafeLoader)
    gpu_mode = args.mode == 'gpu'
    tddfa = TDDFA(gpu_mode=gpu_mode, **cfg)

    # Initialize FaceBoxes
    face_boxes = FaceBoxes()

    # Given a video path
    fn = args.video_fp.split('/')[-1]
    reader = imageio.get_reader(args.video_fp)

    fps = reader.get_meta_data()['fps']
    video_wfp = f'examples/results/videos/{fn.replace(".avi", "_smooth.mp4")}'
    writer = imageio.get_writer(video_wfp, fps=fps)

    # a simple implementation of average smoothing by looking ahead n_next frames
    # assumes the number of frames in the video is >= n
    n_pre, n_next = args.n_pre, args.n_next
    n = n_pre + n_next + 1
    queue_ver = deque()
    queue_frame = deque()

    pre_ver = None
    for i, frame in tqdm(enumerate(reader)):
        frame_bgr = frame[..., ::-1]  # RGB->BGR

        if i == 0:
            # detect
            boxes = face_boxes(frame_bgr)
            boxes = [boxes[0]]
            param_lst, roi_box_lst = tddfa(frame_bgr, boxes)
            ver = tddfa.recon_vers(param_lst, roi_box_lst)[0]

            # refine
            param_lst, roi_box_lst = tddfa(frame_bgr, [ver],
                                           crop_policy='landmark')
            ver = tddfa.recon_vers(param_lst, roi_box_lst)[0]

            # padding queue
            for j in range(n_pre):
                queue_ver.append(ver.copy())
            queue_ver.append(ver.copy())

            for j in range(n_pre):
                queue_frame.append(frame_bgr.copy())
            queue_frame.append(frame_bgr.copy())

        else:
            param_lst, roi_box_lst = tddfa(frame_bgr, [pre_ver],
                                           crop_policy='landmark')

            roi_box = roi_box_lst[0]
            # todo: add a confidence threshold to judge whether tracking has failed
            if abs(roi_box[2] - roi_box[0]) * abs(roi_box[3] -
                                                  roi_box[1]) < 2020:
                boxes = face_boxes(frame_bgr)
                boxes = [boxes[0]]
                param_lst, roi_box_lst = tddfa(frame_bgr, boxes)

            ver = tddfa.recon_vers(param_lst, roi_box_lst)[0]

            queue_ver.append(ver.copy())
            queue_frame.append(frame_bgr.copy())

        pre_ver = ver  # for tracking

        # smoothing: enqueue and dequeue ops
        if len(queue_ver) >= n:
            ver_ave = np.mean(queue_ver, axis=0)
            img_draw = cv_draw_landmark(queue_frame[n_pre],
                                        ver_ave)  # since we use padding

            writer.append_data(img_draw[:, :, ::-1])  # BGR->RGB

            queue_ver.popleft()
            queue_frame.popleft()

    # we would otherwise lose the last n_next frames, so keep padding
    for j in range(n_next):
        queue_ver.append(ver.copy())
        queue_frame.append(frame_bgr.copy())  # the last frame

        ver_ave = np.mean(queue_ver, axis=0)

        img_draw = cv_draw_landmark(queue_frame[n_pre],
                                    ver_ave)  # since we use padding

        writer.append_data(img_draw[..., ::-1])  # BGR->RGB

        queue_ver.popleft()
        queue_frame.popleft()

    writer.close()
    print(f'Dump to {video_wfp}')