def main(args):
    """Run an OCR predictor on ``args.path`` and display each analysed page.

    Args:
        args: object exposing ``detection``, ``recognition``, ``path``,
            ``noblock`` and ``static`` attributes.
    """
    predictor = ocr_predictor(args.detection, args.recognition, pretrained=True)

    # Pick the loader from the (case-insensitive) file extension
    is_pdf = args.path.lower().endswith(".pdf")
    load_document = DocumentFile.from_pdf if is_pdf else DocumentFile.from_images
    doc = load_document(args.path)

    result = predictor(doc)

    # Pair each analysed page with the image it was computed from
    for analysed_page, page_img in zip(result.pages, doc):
        analysed_page.show(page_img, block=not args.noblock, interactive=not args.static)
def test_qr_code_detector(mock_image_folder):
    """Check that ``BarCodeDetector`` finds nothing in any image of the mock folder."""
    detector = BarCodeDetector()
    for file_name in os.listdir(mock_image_folder):
        image_path = os.path.join(mock_image_folder, file_name)
        # Only the first loaded page is fed to the detector
        first_page = DocumentFile.from_images(image_path)[0]
        detections = detector(first_page)
        assert len(detections) == 0
def test_face_detector(mock_image_folder):
    """Check that ``FaceDetector(n_faces=1)`` returns at most one face per mock image."""
    detector = FaceDetector(n_faces=1)
    for file_name in os.listdir(mock_image_folder):
        image_path = os.path.join(mock_image_folder, file_name)
        # Only the first loaded page is fed to the detector
        first_page = DocumentFile.from_images(image_path)[0]
        detections = detector(first_page)
        assert len(detections) <= 1
def test_extract_rcrops(mock_pdf):  # noqa: F811
    """Exercise ``extract_rcrops`` with relative and absolute 4-point boxes.

    Checks that a badly shaped box array raises ``AssertionError``, that one
    3-dimensional ``np.ndarray`` crop is returned per box, and that an empty
    box array yields an empty list.
    """
    doc_img = DocumentFile.from_pdf(mock_pdf).as_images()[0]
    num_crops = 2
    # NOTE(review): the 4th point repeats the 1st one, so each polygon is
    # degenerate — confirm whether ``idx / num_crops + .1`` was intended for y.
    rel_boxes = np.array(
        [
            [
                [idx / num_crops, idx / num_crops],
                [idx / num_crops + .1, idx / num_crops],
                [idx / num_crops + .1, idx / num_crops + .1],
                [idx / num_crops, idx / num_crops],
            ]
            for idx in range(num_crops)
        ],
        dtype=np.float32,
    )
    # Scale x by the image width and y by the image height
    abs_boxes = deepcopy(rel_boxes)
    abs_boxes[:, :, 0] *= doc_img.shape[1]
    abs_boxes[:, :, 1] *= doc_img.shape[0]
    # ``np.int`` was only an alias of the builtin ``int`` and no longer exists
    # in recent NumPy releases; the builtin yields the same dtype.
    abs_boxes = abs_boxes.astype(int)

    with pytest.raises(AssertionError):
        extract_rcrops(doc_img, np.zeros((1, 8)))
    for boxes in (rel_boxes, abs_boxes):
        croped_imgs = extract_rcrops(doc_img, boxes)
        # Number of crops
        assert len(croped_imgs) == num_crops
        # Data type and shape
        assert all(isinstance(crop, np.ndarray) for crop in croped_imgs)
        assert all(crop.ndim == 3 for crop in croped_imgs)

    # No box
    assert extract_rcrops(doc_img, np.zeros((0, 4, 2))) == []
def test_recognitionpredictor(mock_pdf, mock_vocab):  # noqa: F811
    """Check ``RecognitionPredictor`` outputs on crops taken from the mock PDF.

    Returns:
        the predictor that was built, unchanged from the original contract.
    """
    batch_size = 4
    pre_processor = PreProcessor(output_size=(32, 128), batch_size=batch_size, preserve_aspect_ratio=True)
    reco_model = recognition.crnn_vgg16_bn(vocab=mock_vocab, input_shape=(32, 128, 3))
    predictor = RecognitionPredictor(pre_processor, reco_model)

    pages = DocumentFile.from_pdf(mock_pdf).as_images()

    # Create bounding boxes
    boxes = np.array(
        [
            [0.5, 0.5, 0.75, 0.75],
            [0.5, 0.5, 1.0, 1.0],
        ],
        dtype=np.float32,
    )
    crops = extract_crops(pages[0], boxes)

    out = predictor(crops)

    # One prediction per crop
    assert len(out) == boxes.shape[0]
    # Each prediction is a (text, confidence) pair
    assert all(isinstance(val, str) and isinstance(conf, float) for val, conf in out)

    # Dimension check: a 4-dimensional array is rejected
    with pytest.raises(ValueError):
        input_crop = (255 * np.random.rand(1, 128, 64, 3)).astype(np.uint8)
        _ = predictor([input_crop])

    return predictor
def test_extract_crops(mock_pdf):  # noqa: F811
    """Exercise ``extract_crops`` with relative and absolute straight boxes.

    Checks that a badly shaped box array raises ``AssertionError``, that one
    3-dimensional ``np.ndarray`` crop is returned per box, that a full-page box
    gives back the page itself, and that an empty box array yields an empty
    list.
    """
    doc_img = DocumentFile.from_pdf(mock_pdf).as_images()[0]
    num_crops = 2
    rel_boxes = np.array(
        [
            [idx / num_crops, idx / num_crops, (idx + 1) / num_crops, (idx + 1) / num_crops]
            for idx in range(num_crops)
        ],
        dtype=np.float32,
    )
    # Every coordinate is truncated AFTER scaling; the ymin entry previously
    # divided outside of ``int(...)`` and could produce a fractional value.
    abs_boxes = np.array(
        [
            [
                int(idx * doc_img.shape[1] / num_crops),
                int(idx * doc_img.shape[0] / num_crops),
                int((idx + 1) * doc_img.shape[1] / num_crops),
                int((idx + 1) * doc_img.shape[0] / num_crops),
            ]
            for idx in range(num_crops)
        ],
        dtype=np.float32,
    )

    with pytest.raises(AssertionError):
        extract_crops(doc_img, np.zeros((1, 5)))

    for boxes in (rel_boxes, abs_boxes):
        croped_imgs = extract_crops(doc_img, boxes)
        # Number of crops
        assert len(croped_imgs) == num_crops
        # Data type and shape
        assert all(isinstance(crop, np.ndarray) for crop in croped_imgs)
        assert all(crop.ndim == 3 for crop in croped_imgs)

    # Identity
    assert np.all(doc_img == extract_crops(doc_img, np.array([[0, 0, 1, 1]], dtype=np.float32), channels_last=True)[0])
    torch_img = np.transpose(doc_img, axes=(-1, 0, 1))
    assert np.all(torch_img == np.transpose(
        extract_crops(doc_img, np.array([[0, 0, 1, 1]], dtype=np.float32), channels_last=False)[0],
        axes=(-1, 0, 1)
    ))

    # No box
    assert extract_crops(doc_img, np.zeros((0, 4))) == []
def test_ocrpredictor(mock_pdf, mock_vocab, assume_straight_pages, straighten_pages):
    """Check an ``OCRPredictor`` built from untrained detection/recognition models.

    Verifies that both sub-models are not in training mode, that the crop
    orientation predictor is only present when pages are not assumed straight,
    that the mock PDF produces a 2-page ``Document``, that a 4-dimensional
    input raises ``ValueError`` and that the first page orientation value is 0.
    """
    # Detection stage
    det_bsize = 4
    det_pre_processor = PreProcessor(output_size=(512, 512), batch_size=det_bsize)
    det_model = detection.db_mobilenet_v3_large(
        pretrained=False,
        pretrained_backbone=False,
        assume_straight_pages=assume_straight_pages,
    )
    det_predictor = DetectionPredictor(det_pre_processor, det_model)
    assert not det_predictor.model.training

    # Recognition stage
    reco_bsize = 32
    reco_pre_processor = PreProcessor(output_size=(32, 128), batch_size=reco_bsize, preserve_aspect_ratio=True)
    reco_model = recognition.crnn_vgg16_bn(pretrained=False, pretrained_backbone=False, vocab=mock_vocab)
    reco_predictor = RecognitionPredictor(reco_pre_processor, reco_model)
    assert not reco_predictor.model.training

    doc = DocumentFile.from_pdf(mock_pdf)

    predictor = OCRPredictor(
        det_predictor,
        reco_predictor,
        assume_straight_pages=assume_straight_pages,
        straighten_pages=straighten_pages,
        detect_orientation=True,
        detect_language=True,
    )

    crop_orientation = predictor.crop_orientation_predictor
    if not assume_straight_pages:
        assert isinstance(crop_orientation, nn.Module)
    else:
        assert crop_orientation is None

    out = predictor(doc)
    assert isinstance(out, Document)
    assert len(out.pages) == 2

    # Dimension check: a 4-dimensional array is rejected
    with pytest.raises(ValueError):
        input_page = (255 * np.random.rand(1, 256, 512, 3)).astype(np.uint8)
        _ = predictor([input_page])

    orientation = 0
    assert out.pages[0].orientation["value"] == orientation
def _process_file(model, file_path: Path, out_format: str) -> str:
    """Run ``model`` on one file and serialise the exported result.

    Args:
        model: callable applied to the loaded document; its result must
            expose an ``export()`` method.
        file_path: input file; a name ending in ".pdf" (case-insensitive) is
            loaded with ``DocumentFile.from_pdf``, anything else with
            ``DocumentFile.from_images``.
        out_format: ``_OUTPUT_CHOICE_JSON`` or ``_OUTPUT_CHOICE_TEXT``.

    Returns:
        The export as indented JSON, or as plain text (every word followed by
        a space, every line by a newline, every block by two newlines). Any
        other ``out_format`` gives an empty string.
    """
    if str(file_path).lower().endswith(".pdf"):
        loader = DocumentFile.from_pdf
    else:
        loader = DocumentFile.from_images
    export = model(loader(file_path)).export()

    if out_format == _OUTPUT_CHOICE_JSON:
        return json.dumps(export, indent=2)
    if out_format != _OUTPUT_CHOICE_TEXT:
        # Unknown output format: nothing to emit
        return ""

    pieces = []
    for page in export["pages"]:
        for block in page["blocks"]:
            for line in block["lines"]:
                pieces.extend(word["value"] + " " for word in line["words"])
                pieces.append("\n")
            pieces.append("\n\n")
    return "".join(pieces)
def _get_doctr_docs(self, raw_documents: List[Path]):
    """Run the docTR OCR model on every existing document of ``raw_documents``.

    The predictor is created on first use and cached on ``self.doctr_model``.
    Missing files and documents whose analysis raises are reported with
    ``print`` and skipped, so the result may be shorter than the input.

    Args:
        raw_documents: paths of the documents to analyse.

    Returns:
        list of the truthy model outputs, in input order.
    """
    if not hasattr(self, "doctr_model"):
        self.doctr_model = ocr_predictor(det_arch='db_resnet50', reco_arch='crnn_vgg16_bn', pretrained=True)

    list_doctr_docs = []
    for doc in raw_documents:
        if not doc.exists():
            print(f"Doc {doc} could not be found.")
            continue

        res_doctr = None
        try:
            # ``Path.suffix`` keeps the leading dot (".pdf"), so comparing with
            # "pdf" never matched and PDFs were sent to the image loader.
            if doc.suffix.lower() == ".pdf":
                doc_doctr = DocumentFile.from_pdf(doc)
            else:
                doc_doctr = DocumentFile.from_images(doc)
            res_doctr = self.doctr_model(doc_doctr)
        except Exception as e:
            # Best effort: one unreadable document must not abort the batch
            print(f"Could not analyze document {doc}. Error: {e}")

        if res_doctr:
            list_doctr_docs.append(res_doctr)

    return list_doctr_docs
def test_trained_ocr_predictor(mock_tilted_payslip):
    """Check pretrained OCR results on the tilted payslip with page straightening.

    The first run pins the value and geometry of two reference words; the
    second run repeats the first-word check with aspect-ratio-preserving,
    symmetrically padded preprocessing.
    """
    doc = DocumentFile.from_images(mock_tilted_payslip)

    det_predictor = detection_predictor("db_resnet50", pretrained=True, batch_size=2, assume_straight_pages=True)
    reco_predictor = recognition_predictor("crnn_vgg16_bn", pretrained=True, batch_size=128)

    predictor = OCRPredictor(
        det_predictor,
        reco_predictor,
        assume_straight_pages=True,
        straighten_pages=True,
    )
    out = predictor(doc)

    # First word of the first block
    first_word = out.pages[0].blocks[0].lines[0].words[0]
    assert first_word.value == "Mr."
    geometry_mr = np.array(
        [
            [0.08844472, 0.35763523],
            [0.11625107, 0.34320644],
            [0.12588427, 0.35771032],
            [0.09807791, 0.37213911],
        ]
    )
    assert np.allclose(np.array(out.pages[0].blocks[0].lines[0].words[0].geometry), geometry_mr)

    # Last word of the first line of the second block
    last_word = out.pages[0].blocks[1].lines[0].words[-1]
    assert last_word.value == "revised"
    geometry_revised = np.array(
        [
            [0.50422498, 0.19551784],
            [0.55741975, 0.16791493],
            [0.56705294, 0.18241881],
            [0.51385817, 0.21002172],
        ]
    )
    assert np.allclose(np.array(out.pages[0].blocks[1].lines[0].words[-1].geometry), geometry_revised)

    # Second run: same recognition predictor, padded detection preprocessing
    det_predictor = detection_predictor(
        "db_resnet50",
        pretrained=True,
        batch_size=2,
        assume_straight_pages=True,
        preserve_aspect_ratio=True,
        symmetric_pad=True,
    )
    predictor = OCRPredictor(
        det_predictor,
        reco_predictor,
        assume_straight_pages=True,
        straighten_pages=True,
        preserve_aspect_ratio=True,
        symmetric_pad=True,
    )
    out = predictor(doc)

    assert out.pages[0].blocks[0].lines[0].words[0].value == "Mr."
def test_detectionpredictor(mock_pdf):  # noqa: F811
    """Check ``DetectionPredictor`` on the pages of the mock PDF.

    Returns:
        the predictor that was built, unchanged from the original contract.
    """
    batch_size = 4
    pre_processor = PreProcessor(output_size=(512, 512), batch_size=batch_size)
    det_model = detection.db_resnet50(input_shape=(512, 512, 3))
    predictor = DetectionPredictor(pre_processor, det_model)

    pages = DocumentFile.from_pdf(mock_pdf).as_images()
    out = predictor(pages)

    # The input PDF has 2 pages
    assert len(out) == 2

    # Dimension check: a 4-dimensional array is rejected
    with pytest.raises(ValueError):
        input_page = (255 * np.random.rand(1, 256, 512, 3)).astype(np.uint8)
        _ = predictor([input_page])

    return predictor
def test_ocrpredictor(mock_pdf, mock_vocab, assume_straight_pages, straighten_pages):
    """Check an ``OCRPredictor`` on the mock PDF rendered as images.

    Verifies that the crop orientation predictor is only present when pages
    are not assumed straight, that the output is a 2-page ``Document`` and
    that a 4-dimensional input raises ``ValueError``.
    """
    # Detection stage
    det_bsize = 4
    det_pre_processor = PreProcessor(output_size=(512, 512), batch_size=det_bsize)
    det_model = detection.db_mobilenet_v3_large(
        pretrained=True,
        pretrained_backbone=False,
        input_shape=(512, 512, 3),
        assume_straight_pages=assume_straight_pages,
    )
    det_predictor = DetectionPredictor(det_pre_processor, det_model)

    # Recognition stage
    reco_bsize = 16
    reco_pre_processor = PreProcessor(output_size=(32, 128), batch_size=reco_bsize, preserve_aspect_ratio=True)
    reco_model = recognition.crnn_vgg16_bn(pretrained=False, pretrained_backbone=False, vocab=mock_vocab)
    reco_predictor = RecognitionPredictor(reco_pre_processor, reco_model)

    doc = DocumentFile.from_pdf(mock_pdf).as_images()

    predictor = OCRPredictor(
        det_predictor,
        reco_predictor,
        assume_straight_pages=assume_straight_pages,
        straighten_pages=straighten_pages,
    )

    crop_orientation = predictor.crop_orientation_predictor
    if not assume_straight_pages:
        assert isinstance(crop_orientation, NestedObject)
    else:
        assert crop_orientation is None

    out = predictor(doc)
    assert isinstance(out, Document)
    assert len(out.pages) == 2

    # Dimension check: a 4-dimensional array is rejected
    with pytest.raises(ValueError):
        input_page = (255 * np.random.rand(1, 256, 512, 3)).astype(np.uint8)
        _ = predictor([input_page])
def main():
    """Build the Streamlit demo page: upload a document, pick models, analyse a page.

    The layout has four columns (input page, segmentation heatmap, OCR output,
    page reconstitution). The analysis only runs when the "Analyze page"
    button is pressed and a file has been uploaded.
    """
    # Wide mode
    st.set_page_config(layout="wide")

    # Designing the interface
    st.title("docTR: Document Text Recognition")
    # For newline
    st.write('\n')
    # Instructions
    st.markdown("*Hint: click on the top-right corner of an image to enlarge it!*")
    # Set the columns
    cols = st.columns((1, 1, 1, 1))
    cols[0].subheader("Input page")
    cols[1].subheader("Segmentation heatmap")
    cols[2].subheader("OCR output")
    cols[3].subheader("Page reconstitution")

    # Sidebar
    # File selection
    st.sidebar.title("Document selection")
    # Disabling warning
    st.set_option('deprecation.showfileUploaderEncoding', False)
    # Choose your own image
    uploaded_file = st.sidebar.file_uploader("Upload files", type=['pdf', 'png', 'jpeg', 'jpg'])
    if uploaded_file is not None:
        # Compare the extension case-insensitively so that "scan.PDF" is not
        # handed to the image loader.
        if uploaded_file.name.lower().endswith('.pdf'):
            doc = DocumentFile.from_pdf(uploaded_file.read()).as_images()
        else:
            doc = DocumentFile.from_images(uploaded_file.read())
        # Pages are shown 1-based to the user, indexed 0-based internally
        page_idx = st.sidebar.selectbox("Page selection", [idx + 1 for idx in range(len(doc))]) - 1
        cols[0].image(doc[page_idx])

    # Model selection
    st.sidebar.title("Model selection")
    det_arch = st.sidebar.selectbox("Text detection model", DET_ARCHS)
    reco_arch = st.sidebar.selectbox("Text recognition model", RECO_ARCHS)

    # For newline
    st.sidebar.write('\n')

    if st.sidebar.button("Analyze page"):

        if uploaded_file is None:
            st.sidebar.write("Please upload a document")

        else:
            with st.spinner('Loading model...'):
                predictor = ocr_predictor(det_arch, reco_arch, pretrained=True)

            with st.spinner('Analyzing...'):

                # Forward the image to the model
                processed_batches = predictor.det_predictor.pre_processor([doc[page_idx]])
                out = predictor.det_predictor.model(processed_batches[0], return_model_output=True)
                seg_map = out["out_map"]
                seg_map = tf.squeeze(seg_map[0, ...], axis=[2])
                # Resize the heatmap back to the page size (cv2 expects (width, height))
                seg_map = cv2.resize(
                    seg_map.numpy(),
                    (doc[page_idx].shape[1], doc[page_idx].shape[0]),
                    interpolation=cv2.INTER_LINEAR,
                )
                # Plot the raw heatmap
                fig, ax = plt.subplots()
                ax.imshow(seg_map)
                ax.axis('off')
                cols[1].pyplot(fig)

                # Plot OCR output
                out = predictor([doc[page_idx]])
                fig = visualize_page(out.pages[0].export(), doc[page_idx], interactive=False)
                cols[2].pyplot(fig)

                # Page reconstitution under input page
                page_export = out.pages[0].export()
                img = out.pages[0].synthesize()
                cols[3].image(img, clamp=True)

                # Display JSON
                st.markdown("\nHere are your analysis results in JSON format:")
                st.json(page_export)
def main(det_archs, reco_archs):
    """Build a streamlit layout

    The page has four columns (input page, segmentation heatmap, OCR output,
    page reconstitution). The analysis only runs when the "Analyze page"
    button is pressed and a file has been uploaded.

    Args:
        det_archs: choices offered in the "Text detection model" select box.
        reco_archs: choices offered in the "Text recognition model" select box.
    """
    # Wide mode
    st.set_page_config(layout="wide")

    # Designing the interface
    st.title("docTR: Document Text Recognition")
    # For newline
    st.write("\n")
    # Instructions
    st.markdown("*Hint: click on the top-right corner of an image to enlarge it!*")
    # Set the columns
    cols = st.columns((1, 1, 1, 1))
    cols[0].subheader("Input page")
    cols[1].subheader("Segmentation heatmap")
    cols[2].subheader("OCR output")
    cols[3].subheader("Page reconstitution")

    # Sidebar
    # File selection
    st.sidebar.title("Document selection")
    # Disabling warning
    st.set_option("deprecation.showfileUploaderEncoding", False)
    # Choose your own image
    uploaded_file = st.sidebar.file_uploader("Upload files", type=["pdf", "png", "jpeg", "jpg"])
    if uploaded_file is not None:
        # Compare the extension case-insensitively so that "scan.PDF" is not
        # handed to the image loader.
        if uploaded_file.name.lower().endswith(".pdf"):
            doc = DocumentFile.from_pdf(uploaded_file.read())
        else:
            doc = DocumentFile.from_images(uploaded_file.read())
        # Pages are shown 1-based to the user, indexed 0-based internally
        page_idx = st.sidebar.selectbox("Page selection", [idx + 1 for idx in range(len(doc))]) - 1
        page = doc[page_idx]
        cols[0].image(page)

    # Model selection
    st.sidebar.title("Model selection")
    st.sidebar.markdown("**Backend**: " + ("TensorFlow" if is_tf_available() else "PyTorch"))
    det_arch = st.sidebar.selectbox("Text detection model", det_archs)
    reco_arch = st.sidebar.selectbox("Text recognition model", reco_archs)

    # For newline
    st.sidebar.write("\n")

    if st.sidebar.button("Analyze page"):

        if uploaded_file is None:
            st.sidebar.write("Please upload a document")

        else:
            with st.spinner("Loading model..."):
                predictor = load_predictor(det_arch, reco_arch, forward_device)

            with st.spinner("Analyzing..."):

                # Forward the image to the model
                seg_map = forward_image(predictor, page, forward_device)
                seg_map = np.squeeze(seg_map)
                # Resize the heatmap back to the page size (cv2 expects (width, height))
                seg_map = cv2.resize(seg_map, (page.shape[1], page.shape[0]), interpolation=cv2.INTER_LINEAR)

                # Plot the raw heatmap
                fig, ax = plt.subplots()
                ax.imshow(seg_map)
                ax.axis("off")
                cols[1].pyplot(fig)

                # Plot OCR output
                out = predictor([page])
                fig = visualize_page(out.pages[0].export(), page, interactive=False)
                cols[2].pyplot(fig)

                # Page reconstitution under input page
                page_export = out.pages[0].export()
                # The synthesized page is skipped for "rotation" detection architectures
                if "rotation" not in det_arch:
                    img = out.pages[0].synthesize()
                    cols[3].image(img, clamp=True)

                # Display JSON
                st.markdown("\nHere are your analysis results in JSON format:")
                st.json(page_export)