Example #1
    def input_task(self):
        seq_num = 0
        while self.is_running():

            # Send images to next stage

            # if camera - receive frames from camera
            if camera:
                try:
                    frame = self.device.getOutputQueue('cam_out').get()
                    self.frame_queue.put(frame)
                except RuntimeError:
                    continue

            # else if video - send frames down to NN
            else:

                # Get frame from video capture
                read_correctly, vid_frame = self.cap.read()
                if not read_correctly:
                    break

                # Send to NN and to inference thread
                frame_nn = dai.ImgFrame()
                frame_nn.setSequenceNum(seq_num)
                frame_nn.setWidth(544)
                frame_nn.setHeight(320)
                frame_nn.setData(to_planar(vid_frame, (544, 320)))
                self.device.getInputQueue("detection_in").send(frame_nn)

                # if high quality, send original frames
                if hq:
                    frame_orig = dai.ImgFrame()
                    frame_orig.setSequenceNum(seq_num)
                    frame_orig.setWidth(vid_frame.shape[1])
                    frame_orig.setHeight(vid_frame.shape[0])
                    frame_orig.setData(
                        to_planar(vid_frame,
                                  (vid_frame.shape[1], vid_frame.shape[0])))
                    self.frame_queue.put(frame_orig)
                # else send resized frames
                else:
                    self.frame_queue.put(frame_nn)

                seq_num = seq_num + 1

                # Sleep at video framerate
                time.sleep(1.0 / self.FRAMERATE)
        # Stop execution once the input task has no more data
        self.running = False
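The to_planar helper used above (and in several later examples) is not defined on this page. Below is a minimal sketch of what such a helper typically looks like, assuming Example #1's (width, height) tuple convention; note that other examples on this page pass (height, width) instead.

import cv2
import numpy as np

def to_planar(arr, shape):
    # Resize the interleaved HxWx3 frame to (width, height), then reorder it
    # to planar (channel-first) layout, flattened to the 1-D byte buffer that
    # ImgFrame.setData() expects.
    resized = cv2.resize(arr, shape)
    return resized.transpose(2, 0, 1).flatten()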
Example #2
def main(filename=None):
    pipeline = createPipeline(filename is None)
    with depthai.Device() as device:
        device.startPipeline(pipeline)

        if filename:
            image_in = device.getInputQueue("image_in")

        nn_out = device.getOutputQueue(name="nn_out", maxSize=4, blocking=True)

        run = True
        while run:
            if filename:
                origframe = cv2.imread(filename)
                frame = cv2.cvtColor(origframe, cv2.COLOR_BGR2RGB)
                frame = cv2.resize(frame, (224, 224)).transpose(2, 0,
                                                                1).flatten()
                tstamp = time.monotonic()
                lic_frame = depthai.ImgFrame()
                lic_frame.setData(frame)
                lic_frame.setTimestamp(tstamp)
                lic_frame.setType(lic_frame.Type.RGB888p)
                lic_frame.setWidth(224)
                lic_frame.setHeight(224)
                image_in.send(lic_frame)

            data = nn_out.get()
            result = np.array(data.getFirstLayerFp16()).reshape((5, 224, 224))
            tags = createTagImage(result)
            if filename:  # origframe only exists when reading from a file
                cv2.imshow("Source", origframe)
            cv2.imshow("Tags", tags)
            cv2.waitKey(0)
            if filename is not None:
                run = False
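createPipeline and createTagImage are not shown in Example #2. The following is a hedged reconstruction of what createPipeline could look like for this host code: the stream names "image_in" and "nn_out" come from the example, while the blob path and camera settings are assumptions.

import depthai

def createPipeline(use_camera):
    # Hypothetical pipeline: an XLinkIn stream "image_in" (or the color camera
    # preview when use_camera is True) feeding a NeuralNetwork whose output is
    # streamed back to the host on "nn_out".
    pipeline = depthai.Pipeline()

    nn = pipeline.create(depthai.node.NeuralNetwork)
    nn.setBlobPath("segmentation_224x224.blob")  # placeholder blob path

    if use_camera:
        cam = pipeline.create(depthai.node.ColorCamera)
        cam.setPreviewSize(224, 224)
        cam.setInterleaved(False)
        cam.preview.link(nn.input)
    else:
        image_in = pipeline.create(depthai.node.XLinkIn)
        image_in.setStreamName("image_in")
        image_in.out.link(nn.input)

    nn_out = pipeline.create(depthai.node.XLinkOut)
    nn_out.setStreamName("nn_out")
    nn.out.link(nn_out.input)
    return pipeline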
Example #3
def lic_thread(det_queue, det_pass, rec_queue):
    global license_detections, lic_last_seq

    while running:
        try:
            in_det = det_queue.get().detections
            in_pass = det_pass.get()

            orig_frame = frame_seq_map.get(in_pass.getSequenceNum(), None)
            if orig_frame is None:
                continue

            license_detections = [
                detection for detection in in_det if detection.label == 2
            ]

            for detection in license_detections:
                bbox = frame_norm(orig_frame, (detection.xmin, detection.ymin,
                                               detection.xmax, detection.ymax))
                cropped_frame = orig_frame[bbox[1]:bbox[3], bbox[0]:bbox[2]]

                tstamp = time.monotonic()
                img = dai.ImgFrame()
                img.setTimestamp(tstamp)
                img.setType(dai.RawImgFrame.Type.BGR888p)
                img.setData(to_planar(cropped_frame, (94, 24)))
                img.setWidth(94)
                img.setHeight(24)
                rec_queue.send(img)

            fps.tick('lic')
        except RuntimeError:
            continue
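frame_norm is not defined on this page; in DepthAI sample code it is commonly written roughly as follows (treat the exact body as an assumption). It converts normalized detection coordinates into pixel coordinates, so bbox values can be used directly as array indices, as in the cropping lines above.

import numpy as np

def frame_norm(frame, bbox):
    # Map normalized [0, 1] (xmin, ymin, xmax, ymax) coordinates to pixel
    # coordinates of the given frame: even indices scale by width, odd by
    # height. Values are clipped and returned as integers.
    norm_vals = np.full(len(bbox), frame.shape[0])
    norm_vals[::2] = frame.shape[1]
    return (np.clip(np.array(bbox), 0, 1) * norm_vals).astype(int)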
Example #4
def veh_thread(det_queue, det_pass, attr_queue):
    global vehicle_detections, veh_last_seq

    while running:
        try:
            vehicle_detections = det_queue.get().detections
            in_pass = det_pass.get()

            orig_frame = frame_seq_map.get(in_pass.getSequenceNum(), None)
            if orig_frame is None:
                continue

            veh_last_seq = in_pass.getSequenceNum()

            for detection in vehicle_detections:
                bbox = frame_norm(orig_frame, (detection.xmin, detection.ymin,
                                               detection.xmax, detection.ymax))
                cropped_frame = orig_frame[bbox[1]:bbox[3], bbox[0]:bbox[2]]

                tstamp = time.monotonic()
                img = dai.ImgFrame()
                img.setTimestamp(tstamp)
                img.setType(dai.RawImgFrame.Type.BGR888p)
                img.setData(to_planar(cropped_frame, (72, 72)))
                img.setWidth(72)
                img.setHeight(72)
                attr_queue.send(img)

            fps.tick('veh')
        except RuntimeError:
            continue
Example #5
    def next_frame(self):

        self.fps.update()
        if self.input_type == "rgb":
            # Send cropping information to manip node on device
            cfg = dai.ImageManipConfig()
            points = [
                [self.crop_region.xmin, self.crop_region.ymin],
                [self.crop_region.xmax - 1, self.crop_region.ymin],
                [self.crop_region.xmax - 1, self.crop_region.ymax - 1],
                [self.crop_region.xmin, self.crop_region.ymax - 1],
            ]
            point2fList = []
            for p in points:
                pt = dai.Point2f()
                pt.x, pt.y = p[0], p[1]
                point2fList.append(pt)
            cfg.setWarpTransformFourPoints(point2fList, False)
            cfg.setResize(self.pd_input_length, self.pd_input_length)
            cfg.setFrameType(dai.ImgFrame.Type.RGB888p)
            self.q_manip_cfg.send(cfg)

            # Get the device camera frame if wanted
            if self.laconic:
                frame = np.zeros((self.frame_size, self.frame_size, 3),
                                 dtype=np.uint8)
            else:
                in_video = self.q_video.get()
                frame = in_video.getCvFrame()
        else:
            if self.input_type == "image":
                frame = self.img.copy()
            else:
                ok, frame = self.cap.read()
                if not ok:
                    return None, None

            # Cropping of the video frame
            cropped = self.crop_and_resize(frame, self.crop_region)
            cropped = cv2.cvtColor(cropped,
                                   cv2.COLOR_BGR2RGB).transpose(2, 0, 1)

            frame_nn = dai.ImgFrame()
            frame_nn.setTimestamp(time.monotonic())
            frame_nn.setWidth(self.pd_input_length)
            frame_nn.setHeight(self.pd_input_length)
            frame_nn.setData(cropped)
            self.q_pd_in.send(frame_nn)

        # Get result from device
        inference = self.q_pd_out.get()
        body = self.pd_postprocess(inference)
        self.crop_region = body.next_crop_region

        # Statistics
        if self.stats:
            self.nb_frames += 1
            self.nb_pd_inferences += 1

        return frame, body
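Example #5 only shows the host side of the crop/warp. For the ImageManipConfig sent on q_manip_cfg to take effect, the device pipeline needs an ImageManip node whose inputConfig is fed from an XLinkIn, roughly as in this sketch; the node and stream names, and the 256x256 output size, are assumptions.

import depthai as dai

pipeline = dai.Pipeline()

cam = pipeline.create(dai.node.ColorCamera)
manip = pipeline.create(dai.node.ImageManip)
manip.setMaxOutputFrameSize(256 * 256 * 3)  # assumed pd_input_length of 256

cfg_in = pipeline.create(dai.node.XLinkIn)
cfg_in.setStreamName("manip_cfg")  # host side: device.getInputQueue("manip_cfg")

cam.preview.link(manip.inputImage)
cfg_in.out.link(manip.inputConfig)

manip_out = pipeline.create(dai.node.XLinkOut)
manip_out.setStreamName("manip_out")
manip.out.link(manip_out.input)  # in Example #5 this would feed the detection NN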
Example #6
def run_nn(img, input_queue, width, height):
    frameNn = dai.ImgFrame()
    frameNn.setType(dai.ImgFrame.Type.BGR888p)
    frameNn.setWidth(width)
    frameNn.setHeight(height)
    frameNn.setData(toPlanar(img, (height, width)))
    input_queue.send(frameNn)
Example #7
def run_nn(x_in, frame, w, h):
    nn_data = dai.ImgFrame()
    nn_data.setData(toPlanar(frame, (w, h)))
    nn_data.setType(dai.RawImgFrame.Type.BGR888p)
    nn_data.setTimestamp(monotonic())
    nn_data.setWidth(w)
    nn_data.setHeight(h)
    x_in.send(nn_data)
Example #8
    def send_mono(self, q, img, right):
        self.lastFrame['right' if right else 'left'] = img
        h, w = img.shape
        frame = dai.ImgFrame()
        frame.setData(img)
        frame.setType(dai.RawImgFrame.Type.RAW8)
        frame.setWidth(w)
        frame.setHeight(h)
        frame.setInstanceNum((2 if right else 1))
        q.send(frame)
Example #9
    def send_color(self, q, img):
        # Resize/crop color frame as specified by the user
        img = self.resize_color(img)
        self.lastFrame['color'] = img
        h, w, c = img.shape
        frame = dai.ImgFrame()
        frame.setType(dai.RawImgFrame.Type.BGR888p)
        frame.setData(self.to_planar(img))
        frame.setWidth(w)
        frame.setHeight(h)
        frame.setInstanceNum(0)
        q.send(frame)
Example #10
def run_nn(img, input_queue, width, height):
    """
    It takes an image, converts it to a planar image, and sends it to the input queue

    :param img: The image to be processed
    :param input_queue: The input queue to the neural network
    :param width: The width of the image in pixels
    :param height: The height of the image in pixels
    """
    frameNn = dai.ImgFrame()
    frameNn.setType(dai.ImgFrame.Type.BGR888p)
    frameNn.setWidth(width)
    frameNn.setHeight(height)
    frameNn.setData(toPlanar(img, (height, width)))
    input_queue.send(frameNn)
Example #11
    def send_depth(self, q, depth):
        # TODO refactor saving depth. Reading will be from ROS bags.

        # print("depth size", type(depth))
        # depth_frame = np.array(depth).astype(np.uint8).view(np.uint16).reshape((400, 640))
        # depthFrameColor = cv2.normalize(depth_frame, None, 255, 0, cv2.NORM_INF, cv2.CV_8UC1)
        # depthFrameColor = cv2.equalizeHist(depthFrameColor)
        # depthFrameColor = cv2.applyColorMap(depthFrameColor, cv2.COLORMAP_HOT)
        # cv2.imshow("depth", depthFrameColor)
        frame = dai.ImgFrame()
        frame.setType(dai.RawImgFrame.Type.RAW16)
        frame.setData(depth)
        frame.setWidth(640)
        frame.setHeight(400)
        frame.setInstanceNum(0)
        q.send(frame)
Example #12
    # Show the frame
    cv2.imshow(name, frame)
    # cv2.imshow('out', mask)


size = 128
start_time = monotonic()
counter = 0
fps = 0
cap = cv2.VideoCapture(videoPath)
while cap.isOpened():
    read_correctly, frame = cap.read()
    if not read_correctly:
        break

    img = dai.ImgFrame()
    img.setData(to_planar(frame, (size, size)))
    img.setTimestamp(monotonic())
    img.setWidth(size)
    img.setHeight(size)
    qIn.send(img)

    inDet = qDet.get()
    print(inDet.getAllLayerNames())
    out = inDet.getLayerFp16("model_7/conv2d_transpose_36/Sigmoid")
    output = np.array(out).reshape(size, size)
    norm_image = cv2.normalize(output,
                               None,
                               alpha=0,
                               beta=255,
                               norm_type=cv2.NORM_MINMAX,
Example #13
def test_pipeline():
    #pipeline, streams = create_rgb_cam_pipeline()
    #pipeline, streams = create_mono_cam_pipeline()
    pipeline, streams = create_stereo_depth_pipeline(source_camera)

    print("Creating DepthAI device")
    with dai.Device(pipeline) as device:
        print("Starting pipeline")
        device.startPipeline()

        in_streams = []
        if not source_camera:
            # Reversed order trick:
            # The sync stage on the device side has a timeout between receiving
            # the left and right frames. If a delay occurs on the host between
            # sending left and right, that timeout would be triggered, so we make
            # sure to send the right frame first, then the left.
            in_streams.extend(['in_right', 'in_left'])
        in_q_list = []
        inStreamsCameraID = []
        for s in in_streams:
            q = device.getInputQueue(s)
            in_q_list.append(q)
            inStreamsCameraID = [
                dai.CameraBoardSocket.RIGHT, dai.CameraBoardSocket.LEFT
            ]

        # Create a receive queue for each stream
        q_list = []
        for s in streams:
            q = device.getOutputQueue(s, 8, blocking=False)
            q_list.append(q)

        # Need to set a timestamp for input frames, for the sync stage in Stereo node
        timestamp_ms = 0
        index = 0
        while True:
            # Handle input streams, if any
            if in_q_list:
                dataset_size = 2  # Number of image pairs
                frame_interval_ms = 33
                for i, q in enumerate(in_q_list):
                    name = q.getName()
                    path = 'dataset/' + str(index) + '/' + name + '.png'
                    data = cv2.imread(path,
                                      cv2.IMREAD_GRAYSCALE).reshape(720 * 1280)
                    tstamp = datetime.timedelta(seconds=timestamp_ms // 1000,
                                                milliseconds=timestamp_ms %
                                                1000)
                    img = dai.ImgFrame()
                    img.setData(data)
                    img.setTimestamp(tstamp)
                    img.setInstanceNum(inStreamsCameraID[i])
                    img.setType(dai.ImgFrame.Type.RAW8)
                    img.setWidth(1280)
                    img.setHeight(720)
                    q.send(img)
                    if timestamp_ms == 0:  # Send twice for first iteration
                        q.send(img)
                    print("Sent frame: {:25s}".format(path), 'timestamp_ms:',
                          timestamp_ms)
                timestamp_ms += frame_interval_ms
                index = (index + 1) % dataset_size
                if 1:  # Optional delay between iterations, host driven pipeline
                    sleep(frame_interval_ms / 1000)
            # Handle output streams
            for q in q_list:
                name = q.getName()
                image = q.get()
                #print("Received frame:", name)
                # Skip some streams for now, to reduce CPU load
                if name in ['left', 'right', 'depth']: continue
                frame = convert_to_cv2_frame(name, image)
                cv2.imshow(name, frame)
            if cv2.waitKey(1) == ord('q'):
                break
Example #14
    def run(self):

        device = dai.Device(self.create_pipeline())
        device.startPipeline()

        # Define data queues 
        if self.camera:
            q_video = device.getOutputQueue(name="cam_out", maxSize=1, blocking=False)
            q_pd_out = device.getOutputQueue(name="pd_out", maxSize=1, blocking=False)
            if self.use_lm:
                q_lm_out = device.getOutputQueue(name="lm_out", maxSize=2, blocking=False)
                q_lm_in = device.getInputQueue(name="lm_in")
        else:
            q_pd_in = device.getInputQueue(name="pd_in")
            q_pd_out = device.getOutputQueue(name="pd_out", maxSize=4, blocking=True)
            if self.use_lm:
                q_lm_out = device.getOutputQueue(name="lm_out", maxSize=4, blocking=True)
                q_lm_in = device.getInputQueue(name="lm_in")

        self.fps = FPS(mean_nb_frames=20)

        seq_num = 0
        nb_pd_inferences = 0
        nb_lm_inferences = 0
        glob_pd_rtrip_time = 0
        glob_lm_rtrip_time = 0
        while True:
            self.fps.update()
            if self.camera:
                in_video = q_video.get()
                video_frame = in_video.getCvFrame()
            else:
                if self.image_mode:
                    vid_frame = self.img
                else:
                    ok, vid_frame = self.cap.read()
                    if not ok:
                        break
                h, w = vid_frame.shape[:2]
                dx = (w - self.video_size) // 2
                dy = (h - self.video_size) // 2
                video_frame = vid_frame[dy:dy+self.video_size, dx:dx+self.video_size]
                frame_nn = dai.ImgFrame()
                frame_nn.setSequenceNum(seq_num)
                frame_nn.setWidth(self.pd_input_length)
                frame_nn.setHeight(self.pd_input_length)
                frame_nn.setData(to_planar(video_frame, (self.pd_input_length, self.pd_input_length)))
                q_pd_in.send(frame_nn)
                pd_rtrip_time = now()

                seq_num += 1

            annotated_frame = video_frame.copy()

            # Get palm detection
            inference = q_pd_out.get()
            if not self.camera: glob_pd_rtrip_time += now() - pd_rtrip_time
            self.pd_postprocess(inference)
            self.pd_render(annotated_frame)
            nb_pd_inferences += 1

            # Hand landmarks
            if self.use_lm:
                for i,r in enumerate(self.regions):
                    img_hand = mpu.warp_rect_img(r.rect_points, video_frame, self.lm_input_length, self.lm_input_length)
                    nn_data = dai.NNData()   
                    nn_data.setLayer("input_1", to_planar(img_hand, (self.lm_input_length, self.lm_input_length)))
                    q_lm_in.send(nn_data)
                    if i == 0: lm_rtrip_time = now() # We measure only for the first region
                
                # Retrieve hand landmarks
                for i,r in enumerate(self.regions):
                    inference = q_lm_out.get()
                    if i == 0: glob_lm_rtrip_time += now() - lm_rtrip_time
                    self.lm_postprocess(r, inference)
                    self.lm_render(annotated_frame, r)
                    nb_lm_inferences += 1

                
            self.fps.display(annotated_frame, orig=(50,50),color=(240,180,100))
            cv2.imshow("video", annotated_frame)

            key = cv2.waitKey(1) 
            if key == ord('q') or key == 27:
                break
            elif key == 32:
                # Pause on space bar
                cv2.waitKey(0)
            elif key == ord('1'):
                self.show_pd_box = not self.show_pd_box
            elif key == ord('2'):
                self.show_pd_kps = not self.show_pd_kps
            elif key == ord('3'):
                self.show_rot_rect = not self.show_rot_rect
            elif key == ord('4'):
                self.show_landmarks = not self.show_landmarks
            elif key == ord('5'):
                self.show_handedness = not self.show_handedness
            elif key == ord('6'):
                self.show_scores = not self.show_scores
            elif key == ord('7'):
                self.show_gesture = not self.show_gesture

        # Print some stats
        if not self.camera:
            print(f"# video files frames                 : {seq_num}")
            print(f"# palm detection inferences received : {nb_pd_inferences}")
            print(f"# hand landmark inferences received  : {nb_lm_inferences}")
            print(f"Palm detection round trip            : {glob_pd_rtrip_time/nb_pd_inferences*1000:.1f} ms")
            print(f"Hand landmark round trip             : {glob_lm_rtrip_time/nb_lm_inferences*1000:.1f} ms")
Example #15
def test_pipeline():
    #pipeline, streams = create_rgb_cam_pipeline()
    #pipeline, streams = create_mono_cam_pipeline()
    pipeline, streams = create_stereo_depth_pipeline(source_camera)

    print("Creating DepthAI device")
    if 1:
        device = dai.Device(pipeline)
    else:  # For debug mode, with the firmware already loaded
        found, device_info = dai.XLinkConnection.getFirstDevice(
            dai.XLinkDeviceState.X_LINK_BOOTED)
        if found:
            device = dai.Device(pipeline, device_info)
        else:
            raise RuntimeError("Device not found")
    print("Starting pipeline")
    device.startPipeline()

    in_streams = []
    if not source_camera:
        # Reversed order trick:
        # The sync stage on the device side has a timeout between receiving
        # the left and right frames. If a delay occurs on the host between
        # sending left and right, that timeout would be triggered, so we make
        # sure to send the right frame first, then the left.
        in_streams.extend(['in_right', 'in_left'])
    in_q_list = []
    for s in in_streams:
        q = device.getInputQueue(s)
        in_q_list.append(q)

    # Create a receive queue for each stream
    q_list = []
    for s in streams:
        q = device.getOutputQueue(s, 8, True)
        q_list.append(q)

    # Need to set a timestamp for input frames, for the sync stage in Stereo node
    timestamp_ms = 0
    index = 0
    while True:
        # Handle input streams, if any
        if in_q_list:
            dataset_size = 2  # Number of image pairs
            frame_interval_ms = 33
            for q in in_q_list:
                name = q.getName()
                path = 'dataset/' + str(index) + '/' + name + '.png'
                data = cv2.imread(path,
                                  cv2.IMREAD_GRAYSCALE).reshape(720 * 1280)
                tstamp_ns = timestamp_ms * (1000 * 1000)
                tstamp = dai.Timestamp()
                tstamp.sec = tstamp_ns // (1000 * 1000 * 1000)
                tstamp.nsec = tstamp_ns % (1000 * 1000 * 1000)
                img = dai.ImgFrame()
                img.setData(data)
                img.setTimestamp(tstamp)
                img.setWidth(1280)
                img.setHeight(720)
                q.send(img)
                print("Sent frame: {:25s}".format(path), 'timestamp_ms:',
                      timestamp_ms)
            timestamp_ms += frame_interval_ms
            index = (index + 1) % dataset_size
            if 1:  # Optional delay between iterations, host driven pipeline
                sleep(frame_interval_ms / 1000)
        # Handle output streams
        for q in q_list:
            name = q.getName()
            image = q.get()
            #print("Received frame:", name)
            # Skip some streams for now, to reduce CPU load
            if name in ['left', 'right', 'depth']: continue
            frame = convert_to_cv2_frame(name, image)
            cv2.imshow(name, frame)
        if cv2.waitKey(1) == ord('q'):
            break

    print("Closing device")
    del device
Example #16
def main(args):
    """
    Main program loop.

    Parameters
    ----------
    args : command line arguments parsed by parse_arguments
    """
    # Set up PoseEstimator, pipeline, window with sliders for PoseEstimator
    # options and load the dataset
    if args["model"] not in model_list:
        raise ValueError("Unknown model '{}'".format(args["model"]))
    model_config = model_list[args["model"]]
    pose_estimator = get_poseestimator(model_config, **args)

    with dai.Device(create_pipeline(model_config, camera=False,
                                    **args)) as device:
        device.startPipeline()

        pose_in_queue = device.getInputQueue("pose_in")
        pose_queue = device.getOutputQueue("pose")

        result_window = SliderWindow("Result")
        result_window.add_poseestimator_options(pose_estimator, args)
        cv2.namedWindow("Original")

        # load coco keypoint annotations
        coco_data = COCOData(**args)

        # Start main loop
        last_img_id = -1
        cur_img_id = 0
        original_frame = None
        results_frame = None
        raw_output = None
        redraw = True
        while True:
            # Check for and handle slider changes, redraw if there was a change
            slider_changes = result_window.get_changes()
            for option_name, value in slider_changes.items():
                pose_estimator.set_option(option_name, value)
                redraw = True

            # When new image was selected process it
            if last_img_id != cur_img_id:
                img, gt_keypoints = coco_data[cur_img_id]

                # Send image off for inference
                input_frame = pose_estimator.get_input_frame(img)
                frame_nn = dai.ImgFrame()
                frame_nn.setSequenceNum(0)
                frame_nn.setWidth(input_frame.shape[2])
                frame_nn.setHeight(input_frame.shape[1])
                frame_nn.setFrame(input_frame)
                pose_in_queue.send(frame_nn)

                # Draw ground truth annotations
                original_frame = img.copy()
                coco_data.draw(gt_keypoints, original_frame)
                cv2.rectangle(original_frame, (0, 0), (150, 25), (0, 0, 0), -1)
                cv2.putText(original_frame,
                            "#people: {}".format(len(gt_keypoints)), (2, 15),
                            cv2.FONT_HERSHEY_TRIPLEX,
                            0.4,
                            color=(255, 255, 255))

                # Discard previous results
                results_frame = None
                raw_output = None
                last_img_id = cur_img_id

            # Store the raw results once available
            if pose_queue.has():
                raw_output = pose_queue.get()

            # Once we've got the raw output and again whenever an option
            # changes we need to decode and draw
            if redraw and raw_output is not None:
                results_frame = img.copy()
                pred_keypoints = pose_estimator.get_pose_data(raw_output)
                pose_estimator.draw_results(pred_keypoints, results_frame)
                cv2.rectangle(results_frame, (0, 0), (150, 25), (0, 0, 0), -1)
                cv2.putText(results_frame,
                            "#people: {}".format(len(pred_keypoints)), (2, 15),
                            cv2.FONT_HERSHEY_TRIPLEX,
                            0.4,
                            color=(255, 255, 255))

            cv2.imshow("Original", original_frame)
            if results_frame is not None:
                cv2.imshow("Result", results_frame)

            c = cv2.waitKey(1)
            if c == ord("q"):
                break
            elif c == ord("n"):
                if cur_img_id < len(coco_data) - 1:
                    cur_img_id += 1
            elif c == ord("p"):
                if cur_img_id > 0:
                    cur_img_id -= 1
Example #17
    def run(self):

        device = dai.Device(self.create_pipeline())
        device.startPipeline()

        # Define data queues
        if self.input_type == "internal":
            q_video = device.getOutputQueue(name="cam_out",
                                            maxSize=1,
                                            blocking=False)
            q_pd_out = device.getOutputQueue(name="pd_out",
                                             maxSize=1,
                                             blocking=False)
            q_lm_out = device.getOutputQueue(name="lm_out",
                                             maxSize=2,
                                             blocking=False)
            q_lm_in = device.getInputQueue(name="lm_in")
        else:
            q_pd_in = device.getInputQueue(name="pd_in")
            q_pd_out = device.getOutputQueue(name="pd_out",
                                             maxSize=4,
                                             blocking=True)
            q_lm_out = device.getOutputQueue(name="lm_out",
                                             maxSize=4,
                                             blocking=True)
            q_lm_in = device.getInputQueue(name="lm_in")

        self.fps = FPS(mean_nb_frames=20)

        seq_num = 0
        nb_pd_inferences = 0
        nb_lm_inferences = 0
        glob_pd_rtrip_time = 0
        glob_lm_rtrip_time = 0
        while True:
            self.fps.update()

            if self.input_type == "internal":
                in_video = q_video.get()
                video_frame = in_video.getCvFrame()
                # The image is square-cropped on the device
                self.frame_size = video_frame.shape[0]
                self.pad_w = self.pad_h = 0
            else:
                if self.input_type == "image":
                    vid_frame = self.img
                else:
                    ok, vid_frame = self.cap.read()
                    if not ok:
                        break

                h, w = vid_frame.shape[:2]
                if self.crop:
                    # Cropping the long side to get a square shape
                    self.frame_size = min(h, w)
                    dx = (w - self.frame_size) // 2
                    dy = (h - self.frame_size) // 2
                    video_frame = vid_frame[dy:dy + self.frame_size,
                                            dx:dx + self.frame_size]
                else:
                    # Padding on the small side to get a square shape
                    self.frame_size = max(h, w)
                    self.pad_h = int((self.frame_size - h) / 2)
                    self.pad_w = int((self.frame_size - w) / 2)
                    video_frame = cv2.copyMakeBorder(vid_frame, self.pad_h,
                                                     self.pad_h, self.pad_w,
                                                     self.pad_w,
                                                     cv2.BORDER_CONSTANT)

                frame_nn = dai.ImgFrame()
                frame_nn.setSequenceNum(seq_num)
                frame_nn.setWidth(self.pd_input_length)
                frame_nn.setHeight(self.pd_input_length)
                frame_nn.setData(
                    to_planar(video_frame,
                              (self.pd_input_length, self.pd_input_length)))
                pd_rtrip_time = now()
                q_pd_in.send(frame_nn)

                seq_num += 1

            annotated_frame = video_frame.copy()

            # Get pose detection
            inference = q_pd_out.get()
            if self.input_type != "internal":
                pd_rtrip_time = now() - pd_rtrip_time
                glob_pd_rtrip_time += pd_rtrip_time
            self.pd_postprocess(inference)
            self.pd_render(annotated_frame)
            nb_pd_inferences += 1

            # Landmarks
            self.nb_active_regions = 0
            if self.show_3d:
                self.vis3d.clear_geometries()
                self.vis3d.add_geometry(self.grid_floor,
                                        reset_bounding_box=False)
                self.vis3d.add_geometry(self.grid_wall,
                                        reset_bounding_box=False)
            for i, r in enumerate(self.regions):
                frame_nn = mpu.warp_rect_img(r.rect_points, video_frame,
                                             self.lm_input_length,
                                             self.lm_input_length)
                nn_data = dai.NNData()
                nn_data.setLayer(
                    "input_1",
                    to_planar(frame_nn,
                              (self.lm_input_length, self.lm_input_length)))
                if i == 0:
                    lm_rtrip_time = now()  # We measure only for the first region
                q_lm_in.send(nn_data)

                # Get landmarks
                inference = q_lm_out.get()
                if i == 0:
                    lm_rtrip_time = now() - lm_rtrip_time
                    glob_lm_rtrip_time += lm_rtrip_time
                    nb_lm_inferences += 1
                self.lm_postprocess(r, inference)
                self.lm_render(annotated_frame, r)
            if self.show_3d:
                self.vis3d.poll_events()
                self.vis3d.update_renderer()
            if self.smoothing and self.nb_active_regions == 0:
                self.filter.reset()

            if self.input_type != "internal" and not self.crop:
                annotated_frame = annotated_frame[self.pad_h:self.pad_h + h,
                                                  self.pad_w:self.pad_w + w]

            if self.show_fps:
                self.fps.display(annotated_frame,
                                 orig=(50, 50),
                                 size=1,
                                 color=(240, 180, 100))
            cv2.imshow("Blazepose", annotated_frame)

            if self.output:
                self.output.write(annotated_frame)

            key = cv2.waitKey(1)
            if key == ord('q') or key == 27:
                break
            elif key == 32:
                # Pause on space bar
                cv2.waitKey(0)
            elif key == ord('1'):
                self.show_pd_box = not self.show_pd_box
            elif key == ord('2'):
                self.show_pd_kps = not self.show_pd_kps
            elif key == ord('3'):
                self.show_rot_rect = not self.show_rot_rect
            elif key == ord('4'):
                self.show_landmarks = not self.show_landmarks
            elif key == ord('5'):
                self.show_scores = not self.show_scores
            elif key == ord('6'):
                self.show_gesture = not self.show_gesture
            elif key == ord('f'):
                self.show_fps = not self.show_fps

        # Print some stats
        print(f"# pose detection inferences : {nb_pd_inferences}")
        print(f"# landmark inferences       : {nb_lm_inferences}")
        if self.input_type != "internal" and nb_pd_inferences != 0:
            print(
                f"Pose detection round trip   : {glob_pd_rtrip_time/nb_pd_inferences*1000:.1f} ms"
            )
        if nb_lm_inferences != 0:
            print(
                f"Landmark round trip         : {glob_lm_rtrip_time/nb_lm_inferences*1000:.1f} ms"
            )

        if self.output:
            self.output.release()
Example #18
        if args.video:
            return cap.read()
        else:
            return True, qRgb.get().getCvFrame()

    while shouldRun():
        read_correctly, frame = getFrame()

        if not read_correctly:
            break

        fps.next_iter()

        if not args.camera:
            tstamp = time.monotonic()
            lic_frame = dai.ImgFrame()
            lic_frame.setData(to_planar(frame, (300, 300)))
            lic_frame.setTimestamp(tstamp)
            lic_frame.setType(dai.RawImgFrame.Type.BGR888p)
            lic_frame.setWidth(300)
            lic_frame.setHeight(300)
            detIn.send(lic_frame)

        detections = qDet.get().detections
        cv2.putText(frame,
                    "Fps: {:.2f}".format(fps.fps()), (2, frame.shape[0] - 4),
                    cv2.FONT_HERSHEY_TRIPLEX,
                    0.4,
                    color=(255, 255, 255))

        for detection in detections:
Example #19
def main(args):
    """
    Main program loop.

    Parameters
    ----------
    args : command line arguments parsed by parse_arguments
    """
    # Set up PoseEstimator, pipeline, windows with sliders for PoseEstimator
    # options and load video if running on local video file
    camera = args["video"] is None
    if args["model"] not in model_list:
        raise ValueError("Unknown model '{}'".format(args["model"]))
    model_config = model_list[args["model"]]
    pose_estimator = get_poseestimator(model_config, **args)

    with dai.Device(
            create_pipeline(model_config, camera, passthrough=True,
                            **args)) as device:
        device.startPipeline()

        if camera:
            preview_queue = device.getOutputQueue("preview",
                                                  maxSize=4,
                                                  blocking=False)
        else:
            pose_in_queue = device.getInputQueue("pose_in")
        pose_queue = device.getOutputQueue("pose")
        passthrough_queue = device.getOutputQueue("passthrough")

        # Load video if given in command line and set the variables used below
        # to control FPS and looping of the video
        if not camera:
            if not os.path.exists(args["video"]):
                raise ValueError("Video '{}' does not exist.".format(
                    args["video"]))
            print("Loading video", args["video"])
            video = cv2.VideoCapture(args["video"])
            frame_interval = 1 / video.get(cv2.CAP_PROP_FPS)
            last_frame_time = 0
            frame_id = 0
        else:
            print("Running on OAK camera preview stream")

        # Create windows for the original video and the video of frames from
        # the NN passthrough. The window for the original video gets all the
        # option sliders to change pose estimator config
        video_window_name = "Original Video"
        passthrough_window_name = "Processed Video"
        video_window = SliderWindow(video_window_name)
        cv2.namedWindow(passthrough_window_name)
        video_window.add_poseestimator_options(pose_estimator, args)

        # Start main loop
        frame = None
        keypoints = None
        fps = FPS("Video", "NN", interval=0.1)
        timer = Timer("inference", "decode")
        while True:
            # Check for and handle slider changes
            slider_changes = video_window.get_changes()
            for option_name, value in slider_changes.items():
                pose_estimator.set_option(option_name, value)

            fps.start_frame()
            # Get next video frame (and submit for processing if local video)
            if camera:
                frame = preview_queue.get().getCvFrame()
                fps.count("Video")
            else:
                frame_time = time.perf_counter()
                # Only grab next frame from file at certain intervals to
                # roughly preserve its original FPS
                if frame_time - last_frame_time > frame_interval:
                    if video.grab():
                        __, frame = video.retrieve()
                        fps.count("Video")
                        last_frame_time = frame_time
                        # Create a DepthAI ImgFrame object to send to the
                        # device for inference
                        input_frame = pose_estimator.get_input_frame(frame)
                        frame_nn = dai.ImgFrame()
                        frame_nn.setSequenceNum(frame_id)
                        frame_nn.setWidth(input_frame.shape[2])
                        frame_nn.setHeight(input_frame.shape[1])
                        frame_nn.setType(dai.RawImgFrame.Type.BGR888p)
                        frame_nn.setFrame(input_frame)
                        pose_in_queue.send(frame_nn)
                        frame_id += 1
                    else:
                        frame_id = 0
                        video.set(cv2.CAP_PROP_POS_FRAMES, frame_id)

            # Process pose data whenever a new packet arrives
            if pose_queue.has():
                raw_output = pose_queue.get()
                timer.start_timer("decode")
                keypoints = pose_estimator.get_pose_data(raw_output)
                timer.stop_timer("decode")
                fps.count("NN")
                # When keypoints are available we should also have a
                # passthrough frame to process and display. Make sure it is
                # available to avoid surprises.
                if passthrough_queue.has():
                    passthrough = passthrough_queue.get()
                    timer.frame_time("inference", passthrough)
                    passthrough_frame = passthrough.getCvFrame()
                    passthrough_frame = pose_estimator.get_original_frame(
                        passthrough_frame)
                    pose_estimator.draw_results(keypoints, passthrough_frame)
                    cv2.imshow(passthrough_window_name, passthrough_frame)

            # Annotate current video frame with keypoints and FPS
            if keypoints is not None:
                pose_estimator.draw_results(keypoints, frame)
            fps.update()
            fps.display(frame)

            cv2.imshow(video_window_name, frame)

            if cv2.waitKey(1) == ord("q"):
                break
        fps.print_totals()
        timer.print_times()
        cv2.destroyAllWindows()
Example #20
    def run(self):
        device = dai.Device(self.create_pipeline())
        device.startPipeline()

        q_video = device.getOutputQueue(name="cam_out",
                                        maxSize=1,
                                        blocking=False)
        q_pd_in = device.getInputQueue(name="pd_in")
        q_pd_out = device.getOutputQueue(name="pd_out",
                                         maxSize=4,
                                         blocking=True)
        q_lm_out = device.getOutputQueue(name="lm_out",
                                         maxSize=4,
                                         blocking=True)
        q_lm_in = device.getInputQueue(name="lm_in")
        q_asl_out = device.getOutputQueue(name="asl_out",
                                          maxSize=4,
                                          blocking=True)
        q_asl_in = device.getInputQueue(name="asl_in")

        while True:
            in_video = q_video.get()
            video_frame = in_video.getCvFrame()

            h, w = video_frame.shape[:2]
            self.frame_size = max(h, w)
            self.pad_h = int((self.frame_size - h) / 2)
            self.pad_w = int((self.frame_size - w) / 2)

            video_frame = cv2.copyMakeBorder(video_frame, self.pad_h,
                                             self.pad_h, self.pad_w,
                                             self.pad_w, cv2.BORDER_CONSTANT)

            frame_nn = dai.ImgFrame()
            frame_nn.setWidth(self.pd_input_length)
            frame_nn.setHeight(self.pd_input_length)
            frame_nn.setData(
                to_planar(video_frame,
                          (self.pd_input_length, self.pd_input_length)))
            q_pd_in.send(frame_nn)

            annotated_frame = video_frame.copy()

            # Get palm detection
            inference = q_pd_out.get()
            self.pd_postprocess(inference)

            # Send data for hand landmarks
            for i, r in enumerate(self.regions):
                img_hand = mpu.warp_rect_img(r.rect_points, video_frame,
                                             self.lm_input_length,
                                             self.lm_input_length)
                nn_data = dai.NNData()
                nn_data.setLayer(
                    "input_1",
                    to_planar(img_hand,
                              (self.lm_input_length, self.lm_input_length)))
                q_lm_in.send(nn_data)

            # Retrieve hand landmarks
            for i, r in enumerate(self.regions):
                inference = q_lm_out.get()
                self.lm_postprocess(r, inference)
                hand_frame, handedness, hand_bbox = self.lm_render(
                    video_frame, annotated_frame, r)
                # ASL recognition
                if hand_frame is not None and self.asl_recognition:
                    hand_frame = cv2.resize(
                        hand_frame,
                        (self.asl_input_length, self.asl_input_length),
                        interpolation=cv2.INTER_NEAREST)
                    hand_frame = hand_frame.transpose(2, 0, 1)
                    nn_data = dai.NNData()
                    nn_data.setLayer("input", hand_frame)
                    q_asl_in.send(nn_data)
                    asl_result = np.array(q_asl_out.get().getFirstLayerFp16())
                    asl_idx = np.argmax(asl_result)
                    # Recognized ASL character is associated with a probability
                    asl_char = [
                        characters[asl_idx],
                        round(asl_result[asl_idx] * 100, 1)
                    ]
                    selected_char = asl_char
                    current_char_queue = None
                    if handedness > 0.5:
                        current_char_queue = self.right_char_queue
                    else:
                        current_char_queue = self.left_char_queue
                    current_char_queue.append(selected_char)
                    # Perform filtering of the recognition results using the previous 5 results.
                    # If there aren't enough results, take the first result as output.
                    if len(current_char_queue) < 5:
                        selected_char = current_char_queue[0]
                    else:
                        char_candidate = {}
                        for i in range(5):
                            if current_char_queue[i][0] not in char_candidate:
                                char_candidate[current_char_queue[i][0]] = [
                                    1, current_char_queue[i][1]
                                ]
                            else:
                                char_candidate[current_char_queue[i][0]][0] += 1
                                char_candidate[current_char_queue[i][0]][1] += current_char_queue[i][1]
                        most_voted_char = ""
                        max_votes = 0
                        most_voted_char_prob = 0
                        for key in char_candidate:
                            if char_candidate[key][0] > max_votes:
                                max_votes = char_candidate[key][0]
                                most_voted_char = key
                                most_voted_char_prob = round(
                                    char_candidate[key][1] /
                                    char_candidate[key][0], 1)
                        selected_char = (most_voted_char, most_voted_char_prob)

                    if self.show_asl:
                        gesture_string = ("Letter: " + selected_char[0] +
                                          ", " + str(selected_char[1]) + "%")
                        textSize = self.ft.getTextSize(gesture_string,
                                                       fontHeight=14,
                                                       thickness=-1)[0]
                        cv2.rectangle(video_frame,
                                      (hand_bbox[0] - 5, hand_bbox[1]),
                                      (hand_bbox[0] + textSize[0] + 5,
                                       hand_bbox[1] - 18), (36, 152, 0), -1)
                        self.ft.putText(img=video_frame,
                                        text=gesture_string,
                                        org=(hand_bbox[0], hand_bbox[1] - 5),
                                        fontHeight=14,
                                        color=(255, 255, 255),
                                        thickness=-1,
                                        line_type=cv2.LINE_AA,
                                        bottomLeftOrigin=True)

            video_frame = video_frame[self.pad_h:self.pad_h + h,
                                      self.pad_w:self.pad_w + w]
            cv2.imshow("hand tracker", video_frame)
            key = cv2.waitKey(1)
            if key == ord('q') or key == 27:
                break
            elif key == 32:
                # Pause on space bar
                cv2.waitKey(0)
            elif key == ord('1'):
                self.show_hand_box = not self.show_hand_box
            elif key == ord('2'):
                self.show_landmarks = not self.show_landmarks
            elif key == ord('3'):
                self.show_asl = not self.show_asl
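The per-hand majority vote at the end of Example #20 can be written more compactly. Below is a sketch of an equivalent helper; the name majority_vote is hypothetical, it covers only the voting step (not the "fewer than 5 results" fallback), and ties may resolve differently than in the original loop.

from collections import Counter

def majority_vote(char_queue):
    # char_queue holds [character, probability] pairs for the last N results.
    votes = Counter(char for char, _ in char_queue)
    best_char, _ = votes.most_common(1)[0]
    probs = [prob for char, prob in char_queue if char == best_char]
    # Report the most frequent character and its average probability.
    return best_char, round(sum(probs) / len(probs), 1)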
Example #21
def main(args):
    """
    Main program loop.

    Parameters
    ----------
    args : command line arguments parsed by parse_arguments
    """
    # Set up PoseEstimator and pipeline, and load the dataset
    if args["model"] not in model_list:
        raise ValueError("Unknown model '{}'".format(args["model"]))
    model_config = model_list[args["model"]]
    pose_estimator = get_poseestimator(model_config, **args)

    with dai.Device(
            create_pipeline(model_config, camera=False, sync=True,
                            **args)) as device:
        device.startPipeline()

        pose_in_queue = device.getInputQueue("pose_in")
        pose_queue = device.getOutputQueue("pose")

        # Load coco keypoint annotations
        coco_data = COCOData(**args)
        # The keypoint selector allows to subset and re-order the predicted
        # keypoints to align with the annotation format of the COCO dataset
        keypoint_selector = coco_data.get_keypoint_selector(
            pose_estimator.landmarks)
        results_filename = "results_{model}_{conf}.json".format(
            model=args["model"], conf=args["detection_threshold"])

        # Re-use saved results if available. Iterate over dataset if not.
        if not os.path.exists(results_filename):
            timer = Timer("inference")
            results = []
            for img_id in trange(len(coco_data)):
                img = coco_data.get_image(img_id)

                input_frame = pose_estimator.get_input_frame(img)
                frame_nn = dai.ImgFrame()
                frame_nn.setSequenceNum(img_id)
                frame_nn.setWidth(input_frame.shape[2])
                frame_nn.setHeight(input_frame.shape[1])
                frame_nn.setFrame(input_frame)
                timer.start_timer("inference")
                pose_in_queue.send(frame_nn)

                raw_output = pose_queue.get()
                timer.stop_timer("inference")
                pred_keypoints = pose_estimator.get_pose_data(raw_output)

                # Convert each individual person into output format expected by
                # COCO evaluation tools
                for i in range(pred_keypoints.shape[0]):
                    score = pred_keypoints[i, :, 2]
                    score = np.sum(score) / np.count_nonzero(score)
                    pred_keypoints[i, :, 2] = 1
                    keypoints = np.around(pred_keypoints[i])
                    keypoints = keypoints[keypoint_selector]
                    results.append({
                        "image_id":
                        coco_data.get_coco_imageid(img_id),
                        "category_id":
                        1,
                        "keypoints":
                        keypoints.flatten().tolist(),
                        "score":
                        score
                    })

            with open(results_filename, "w") as results_file:
                json.dump(results, results_file)
            timer.print_times()

        coco_data.evaluate_results(results_filename)
Example #22
                    roi = roi_data.roi.denormalize(depth_frame.shape[1],
                                                   depth_frame.shape[0])
                    top_left = roi.topLeft()
                    bottom_right = roi.bottomRight()
                    # Display SBB on the disparity map
                    cv2.rectangle(pv.get("depth"),
                                  (int(top_left.x), int(top_left.y)),
                                  (int(bottom_right.x), int(bottom_right.y)),
                                  bbox_color, cv2.FONT_HERSHEY_SCRIPT_SIMPLEX)
        else:
            read_correctly, host_frame = cap.read()
            if not read_correctly:
                break

            scaled_frame = cv2.resize(host_frame, (in_w, in_h))
            frame_nn = dai.ImgFrame()
            frame_nn.setSequenceNum(seq_num)
            frame_nn.setWidth(in_w)
            frame_nn.setHeight(in_h)
            frame_nn.setData(to_planar(scaled_frame))
            nn_in.send(frame_nn)
            seq_num += 1

            # if high quality, send original frames
            if not conf.useHQ:
                host_frame = scaled_frame
            fps.tick('host')

        in_nn = nn_out.tryGetAll()
        if len(in_nn) > 0:
            if nn_manager.output_format == "detection":
Example #23
def main(args):
    """
    Main program loop.

    Parameters
    ----------
    args : command line arguments parsed by parse_arguments
    """
    # Set up PoseEstimator, pipeline, window with sliders for PoseEstimator
    # options and load image
    if args["model"] not in model_list:
        raise ValueError("Unknown model '{}'".format(args["model"]))
    model_config = model_list[args["model"]]
    pose_estimator = get_poseestimator(model_config, **args)

    with dai.Device(create_pipeline(model_config, camera=False,
                                    **args)) as device:
        device.startPipeline()

        pose_in_queue = device.getInputQueue("pose_in")
        pose_queue = device.getOutputQueue("pose")

        if not os.path.exists(args["image"]):
            raise ValueError("Image '{}' does not exist.".format(
                args["image"]))
        print("Loading image", args["image"])
        image = cv2.imread(args["image"])

        window = SliderWindow("preview")
        window.add_poseestimator_options(pose_estimator, args)

        # Start main loop
        frame = None
        keypoints = None
        raw_output = None
        redraw = True
        timer = Timer("inference", "decode")
        while True:
            # Check for and handle slider changes, redraw if there was a change
            slider_changes = window.get_changes()
            for option_name, value in slider_changes.items():
                pose_estimator.set_option(option_name, value)
                redraw = True

            # On the first iteration pass the image to the NN for inference
            # Raw results are kept after so changes in PoseEstimator options
            # only require decoding the results again, not another inference
            if frame is None:
                frame = image.copy()
                # Create a DepthAI ImgFrame object to send to the device for inference
                input_frame = pose_estimator.get_input_frame(frame)
                frame_nn = dai.ImgFrame()
                frame_nn.setSequenceNum(0)
                frame_nn.setWidth(input_frame.shape[2])
                frame_nn.setHeight(input_frame.shape[1])
                frame_nn.setFrame(input_frame)
                timer.start_timer("inference")
                pose_in_queue.send(frame_nn)

            # Store the raw results once available
            if pose_queue.has():
                raw_output = pose_queue.get()
                timer.stop_timer("inference")

            # Once we've got the raw output and again whenever an option
            # changes we need to decode and draw
            if redraw and raw_output is not None:
                # keep a clean copy of the image for redrawing
                frame = image.copy()
                timer.start_timer("decode")
                keypoints = pose_estimator.get_pose_data(raw_output)
                timer.stop_timer("decode")
                pose_estimator.draw_results(keypoints, frame)
                redraw = False

            cv2.imshow("preview", frame)

            if cv2.waitKey(1) == ord("q"):
                break
        cv2.destroyAllWindows()
        timer.print_times()
Example #24
    def process_image(self, img):
        annotated_frame = img
        if self.camera:
            in_video = self.q_video.get()
            # Convert NV12 to BGR
            yuv = in_video.getData().reshape(
                (in_video.getHeight() * 3 // 2, in_video.getWidth()))
            video_frame = cv2.cvtColor(yuv, cv2.COLOR_YUV2BGR_NV12)
        else:
            if self.image_mode is None:
                vid_frame = img
                height, width, _ = img.shape
                self.video_size = int(min(width, height))
            elif self.image_mode:
                vid_frame = self.img
            else:
                ok, vid_frame = self.cap.read()
                if not ok:
                    # print("not OK video frame")
                    return [], img  #break

            h, w = vid_frame.shape[:2]
            dx = (w - self.video_size) // 2
            dy = (h - self.video_size) // 2
            video_frame = vid_frame[dy:dy + self.video_size,
                                    dx:dx + self.video_size]
            frame_nn = dai.ImgFrame()
            frame_nn.setSequenceNum(self.seq_num)
            frame_nn.setWidth(self.pd_input_length)
            frame_nn.setHeight(self.pd_input_length)
            frame_nn.setData(
                to_planar(video_frame,
                          (self.pd_input_length, self.pd_input_length)))

            self.q_pd_in.send(frame_nn)

            self.seq_num += 1

        annotated_frame = video_frame.copy()

        inference = self.q_pd_out.get()
        self.pd_postprocess(inference)
        self.pd_render(annotated_frame)

        # Hand landmarks
        if self.use_lm:
            for i, r in enumerate(self.regions):
                img_hand = mpu.warp_rect_img(r.rect_points, video_frame,
                                             self.lm_input_length,
                                             self.lm_input_length)
                nn_data = dai.NNData()
                nn_data.setLayer(
                    "input_1",
                    to_planar(img_hand,
                              (self.lm_input_length, self.lm_input_length)))
                self.q_lm_in.send(nn_data)

            # Retrieve hand landmarks
            for i, r in enumerate(self.regions):
                inference = self.q_lm_out.get()
                self.lm_postprocess(r, inference)
                self.lm_render(annotated_frame, r)

        return self.regions, annotated_frame
Example #25
                    # TODO make it work taking args like in OpenCV:
                    # rr = ((256, 256), (128, 64), 30)
                    rr = dai.RotatedRect()
                    rr.center.x = rotated_rect[0][0]
                    rr.center.y = rotated_rect[0][1]
                    rr.size.width = rotated_rect[1][0]
                    rr.size.height = rotated_rect[1][1]
                    rr.angle = rotated_rect[2]
                    cfg = dai.ImageManipConfig()
                    cfg.setCropRotatedRect(rr, False)
                    cfg.setResize(120, 32)
                    # Send frame and config to device
                    if idx == 0:
                        w, h, c = frame.shape
                        imgFrame = dai.ImgFrame()
                        imgFrame.setData(to_planar(frame))
                        imgFrame.setType(dai.ImgFrame.Type.BGR888p)
                        imgFrame.setWidth(w)
                        imgFrame.setHeight(h)
                        q_manip_img.send(imgFrame)
                    else:
                        cfg.setReusePreviousImage(True)
                    q_manip_cfg.send(cfg)

                    # Get manipulated image from the device
                    transformed = q_manip_out.get().getCvFrame()

                    rec_placeholder_img = np.zeros((32, 200, 3), np.uint8)
                    transformed = np.hstack((transformed, rec_placeholder_img))
                    if cropped_stacked is None: