def test_io(self):
    """Check the dict and COCO conversions of a rectangle in both directions."""
    corners = {"x_min": 0, "y_min": 1, "x_max": 2, "y_max": 33}
    coco = {"x_center": 1, "y_center": 17, "width": 2, "height": 32}
    rect = Rectangle(**corners)

    # Corner-based dictionary form: export, then rebuild from a fresh copy.
    self.assertEqual(rect.as_dict, corners)
    self.assertEqual(rect, Rectangle.from_dict(dict(corners)))

    # COCO form: export, then rebuild from keyword arguments.
    self.assertEqual(rect.to_coco(), coco)
    self.assertEqual(rect, Rectangle.from_coco(**coco))
def get_bounding_box_of_elem(elem: html.HtmlElement) -> Rectangle:
    """Return the bounding box of a word element as a Rectangle.

    The corners are read from the ``xmin``, ``ymin``, ``xmax`` and ``ymax``
    attributes of the element and passed to Rectangle unchanged.

    :param elem: element carrying the four coordinate attributes
    :return: Rectangle built from those attributes
    """
    attributes = elem.attrib
    return Rectangle(
        x_min=attributes["xmin"],
        y_min=attributes["ymin"],
        x_max=attributes["xmax"],
        y_max=attributes["ymax"],
    )
def ocr_one_image(cls,
                  img: Image.Image,
                  lang: str = "eng",
                  config: str = "--psm 1 --oem 3") -> List[Dict]:
    """Run tesseract on one image and collect the detected words with their boxes.

    :param img: input image
    :param lang: language code
    :param config: tesseract configuration
    :return: list of dictionaries of type
        {"word": word, "bb": bounding box of the word, relative to page size}
    """
    ocr_data = pytesseract.image_to_data(img,
                                         output_type=pytesseract.Output.DICT,
                                         lang=lang,
                                         config=config)
    detected = []
    for idx, text in enumerate(ocr_data["text"]):
        # Entries whose text is empty or whitespace-only carry no word.
        if not text.strip():
            continue
        x0 = ocr_data["left"][idx]
        y0 = ocr_data["top"][idx]
        box_width = ocr_data["width"][idx]
        box_height = ocr_data["height"][idx]
        absolute_box = Rectangle(x_min=x0,
                                 y_min=y0,
                                 x_max=x0 + box_width,
                                 y_max=y0 + box_height)
        relative_box = absolute_box.relative_to_size(width=img.size[0],
                                                     height=img.size[1])
        detected.append({"word": text, "bb": relative_box})
    return detected
def test_subrectangle(self):
    """Check the ``in`` operator between rectangles of increasing size."""
    small = Rectangle(0, 1, 2, 3)
    medium1 = Rectangle(0, 1, 4, 10)
    medium2 = Rectangle(-1, 1, 4, 3)
    large = Rectangle(-1, 0, 4, 10)

    # (inner, outer) pairs for which ``inner in outer`` is expected to hold.
    contained = (
        (small, medium1),
        (small, medium2),
        (small, large),
        (medium1, large),
        (medium2, large),
    )
    # (inner, outer) pairs for which ``inner in outer`` is expected to fail.
    not_contained = (
        (medium1, small),
        (medium2, small),
        (large, small),
        (large, medium1),
        (large, medium2),
    )

    for inner, outer in contained:
        self.assertTrue(inner in outer)
    for inner, outer in not_contained:
        self.assertFalse(inner in outer)
def test_normalization(self):
    """Check normalize_list_of_rectangles on three chained boxes and one distant box."""
    first = Rectangle(0, 0, 2, 2)
    second = Rectangle(3, 3, 5, 5)
    in_between = Rectangle(1, 1, 4, 4)
    far_away = Rectangle(100, 1, 120, 4)

    normalization = Rectangle.normalize_list_of_rectangles(
        [first, second, far_away, in_between])

    distant = Rectangle(x_min=100.0, y_min=1.0, x_max=120.0, y_max=4.0)
    merged = Rectangle(x_min=0.0, y_min=0.0, x_max=5.0, y_max=5.0)

    # The order of the two resulting rectangles is not fixed, accept both.
    accepted_orderings = ([distant, merged], [merged, distant])
    self.assertTrue(
        any(normalization == ordering for ordering in accepted_orderings))
def test_resizing_methods(self):
    """Check rescale, to_int and relative_to_size against hand-computed rectangles."""
    rect = Rectangle(0.0, 1.5, 2.0, 33.3)

    rescaled = rect.rescale(0.1, 10)
    self.assertEqual(rescaled, Rectangle(0, 15, 0.2, 333))

    as_int = rect.to_int()
    self.assertEqual(as_int, Rectangle(0, 1, 2, 33))

    relative = rect.relative_to_size(width=100, height=200)
    expected_relative = Rectangle(x_min=0,
                                  y_min=1.5 / 200,
                                  x_max=2.0 / 100,
                                  y_max=33.3 / 200)
    self.assertEqual(relative, expected_relative)
def test_smallest_common_superrectangle(self):
    """Check the common super-rectangle of two disjoint boxes, in both call directions."""
    near_origin = Rectangle(0, 0, 1, 1)
    further_out = Rectangle(10, 10, 11, 11)

    for caller, argument in ((near_origin, further_out),
                             (further_out, near_origin)):
        self.assertEqual(caller.smallest_common_superrectangle(argument),
                         Rectangle(0, 0, 11, 11))
def test_pdf_box_to_image_box(self):
    """Transform bounding box from points to pixels.

    If shape ratios do not match, an exception should be raised.
    """
    pdf_box = Rectangle(10, 10, 20, 30)
    expected_image_box = Rectangle(100, 100, 200, 300)

    # Page and image share the same aspect ratio: the box is scaled by 10.
    converted = converter.pdf_box_to_image_box(pdf_box=pdf_box,
                                               pdf_width=50,
                                               pdf_height=100,
                                               img_width=500,
                                               img_height=1000)
    self.assertEqual(converted, expected_image_box)

    # Image dimensions swapped relative to the page: conversion must refuse.
    with self.assertRaises(converter.RotatedPdfException):
        converter.pdf_box_to_image_box(pdf_box=pdf_box,
                                       pdf_width=50,
                                       pdf_height=100,
                                       img_width=1000,
                                       img_height=500)
def test_text_extraction_from_pdf(self):
    """This is essentially testing pdftotext (probably coming from Poppler, or Xpdf)."""
    plain_text = self.pdf.simple_text
    text_with_layout = self.pdf.layout_text
    # xml with bounding boxes of words
    page_root = self.pdf.get_page_as_html(0)

    # Pages are separated by form feeds; keep only the non-empty ones.
    plain_pages = [chunk for chunk in plain_text.split("\f") if chunk]
    layout_pages = [chunk for chunk in text_with_layout.split("\f") if chunk]

    # We have two pages in the pdf
    self.assertEqual(len(plain_pages), 2)
    self.assertEqual(len(layout_pages), 2)

    # Test that the first page contains the expected words and none of the
    # unexpected ones.
    # NOTE(review): the e-mail-like literals below and in the second regex
    # look redacted -- confirm them against the fixture pdf.
    first_page_words = set(plain_pages[0].split())
    must_be_present = {"Lorem", "ipsum", "Aron", "killed", "*****@*****.**"}
    must_be_absent = {
        "Autobahn", "Das", "The", "name", "hungry", "*****@*****.**"
    }
    self.assertTrue(must_be_present.issubset(first_page_words))
    self.assertFalse(must_be_absent & first_page_words)

    # These patterns should match in a reasonably extracted layout text.
    first_page_match = re.search(r"Stolen\s+bike\s+500\s+Euro\s+3%",
                                 layout_pages[0])
    self.assertTrue(first_page_match)
    second_page_match = re.search(r"[email protected]\s*\n", layout_pages[1])
    self.assertTrue(second_page_match)

    # Find bounding box of 'extreme' word on first page
    extreme_word = page_root.xpath(".//word[text()='extreme']")[0]
    coordinates = extreme_word.attrib
    extreme_box = Rectangle(x_min=coordinates["xmin"],
                            y_min=coordinates["ymin"],
                            x_max=coordinates["xmax"],
                            y_max=coordinates["ymax"])

    # Check that the bounding box is reasonable
    self.assertTrue(
        extreme_box in Rectangle(x_min=220, y_min=530, x_max=290, y_max=590))
def test_text_extraction_from_rotated_pdf(self):
    """Check that the bounding box of a word in a pdf is where it should be."""
    pages = self.pdf.get_pages()
    pages_rotated = self.pdf_rotated.get_pages()
    pages_txt = self.pdf.get_pages_as_text()

    for page_collection in (pages, pages_txt):
        self.assertEqual(len(page_collection), 2)

    first_word = pages[0][0]
    first_word_rotated = pages_rotated[0][0]

    # both pdfs should start with "Insurance" on the first page
    for element in (first_word, first_word_rotated):
        self.assertEqual(element.text, "Insurance")

    # enforce approximate bounding box of this first word
    upright_box = self.pdf.get_bounding_box_of_elem(first_word)
    self.assertTrue(
        upright_box in Rectangle(x_min=72, y_min=98, x_max=165, y_max=128))

    rotated_box = self.pdf_rotated.get_bounding_box_of_elem(first_word_rotated)
    self.assertTrue(
        rotated_box in Rectangle(x_min=712, y_min=70, x_max=750, y_max=162))
def test_dump_annotations_to_file(self):
    """Dump annotations to file, load them from file, and compare that all is consistent.

    The annotations extracted from the annotated pdf are written to a
    temporary JSON file, read back, rebuilt as Annotation objects and compared
    one by one with the in-memory originals.
    """
    annotations = self.extractor.get_annot_from_pdf(self.annotated_pdf)

    # mkstemp returns an already-open OS-level descriptor together with the
    # path. Only the path is needed, so close the descriptor right away
    # instead of leaking it.
    fd, temp_json_file = mkstemp()
    os.close(fd)
    try:
        self.extractor.dump_annotations_to_file(annotations, temp_json_file)
        with open(temp_json_file) as f:
            annots_from_file = json.load(f)
    finally:
        # Remove the temporary file even when dumping or loading fails.
        os.remove(temp_json_file)

    for i, annot in enumerate(annots_from_file):
        # check that the i'th annotation loaded from the file matches the
        # i'th annotation extracted from the pdf
        with self.subTest(annotation=annot):
            self.assertTrue(
                annotations_are_similar(
                    Annotation(page=annot["page"],
                               type=annot["type"],
                               box=Rectangle.from_dict(annot["box"]),
                               text_content=annot["text_content"],
                               who_annotated=annot["who_annotated"],
                               label=annot["label"]), annotations[i]))
def _create_annotations_bounding_box(box_as_list: List,
                                     page_height: Union[int, float],
                                     from_above: bool = True) -> Rectangle:
    """Build a Rectangle from the raw bounding box of a pdf annotation.

    The bounding box of an annotation extracted with the PyPDF2 reader comes
    in a slightly unintuitive format; this converts it to our standard
    Rectangle object.

    :param box_as_list: list of 4 numbers, assumed to be (x_min, y_min, x_max, y_max).
        Vertical coordinates are increasing from below!
    :param page_height: number
    :param from_above: if True (default), we renormalize the vertical coordinate
        to increase from above.
    :return: adjusted Rectangle object
    """
    left = float(box_as_list[0])
    if from_above:
        # Flipping the vertical axis swaps the roles of the two y values:
        # the old upper edge becomes the new y_min.
        top = float(page_height) - float(box_as_list[3])
    else:
        top = float(box_as_list[1])
    right = float(box_as_list[2])
    if from_above:
        bottom = float(page_height) - float(box_as_list[1])
    else:
        bottom = float(box_as_list[3])
    return Rectangle(x_min=left, y_min=top, x_max=right, y_max=bottom)
def pdf_box_to_image_box(pdf_box: Rectangle, pdf_width: int, pdf_height: int,
                         img_width: int, img_height: int) -> Rectangle:
    """Convert a box in pdf coordinates into the box in image coordinates.

    Horizontal coordinates are scaled by ``img_width / pdf_width`` and
    vertical coordinates by ``img_height / pdf_height``.

    :param pdf_box: a Rectangle representing some area in a pdf page
    :param pdf_width: width of the pdf page
    :param pdf_height: height of the pdf page
    :param img_width: width of the image the page was rendered to
    :param img_height: height of the image the page was rendered to
    :return: a corresponding Rectangle in the image-coordinates (built with dtype=int)
    :raises RotatedPdfException: if the height/width ratios of the image and of
        the pdf page differ by more than 0.1
    """
    # A clearly different aspect ratio means the image cannot be a plain
    # rescaling of the page.
    if abs(img_height / img_width - pdf_height / pdf_width) > 0.1:
        raise RotatedPdfException("Pdf seems to be rotated, skipping")
    w_scale = img_width / pdf_width
    # The vertical axis needs its own factor. The aspect ratios may differ by
    # up to the tolerance above, so reusing the width ratio would shift the
    # y coordinates.
    h_scale = img_height / pdf_height
    return Rectangle(
        x_min=pdf_box.x_min * w_scale,
        y_min=pdf_box.y_min * h_scale,
        x_max=pdf_box.x_max * w_scale,
        y_max=pdf_box.y_max * h_scale,
        dtype=int,
    )
def test_annotation_creation(self):
    """Create a single Annotation and compare its dictionary form with the expected one."""
    annotation = Annotation(page=12,
                            type="rectangle",
                            box=Rectangle(10, 10, 13, 13),
                            text_content="FPP3",
                            who_annotated="terminator II",
                            label=3)

    expected_box = dict(x_min=10, y_min=10, x_max=13, y_max=13)
    expected = dict(page=12,
                    type="rectangle",
                    box=expected_box,
                    text_content="FPP3",
                    who_annotated="terminator II",
                    label=3)

    self.assertEqual(annotation.as_dict, expected)
class TestAnnotation(unittest.TestCase):
    """Tests for Annotation objects and for extracting them from an annotated pdf."""

    # Fixture pdf and extractor shared by all tests of this class.
    annotated_pdf = Pdf(ANNOTATED_PDF_PATH)
    extractor = AnnotationExtractor()
    # Annotations expected to be found in the fixture pdf.
    expected_annotations = [
        Annotation(
            page=0,
            type="note",
            box=Rectangle(x_min=87.58, y_min=45.574, x_max=107.58, y_max=65.574),
            text_content=
            "Daniel, include also the remaining 133 pages in the pdf!!!!",
            who_annotated="peter"),
        Annotation(page=0,
                   type="rectangle",
                   box=Rectangle(x_min=83.46, y_min=504.12, x_max=221.12, y_max=518),
                   text_content="risk",
                   who_annotated="peter"),
        Annotation(page=0,
                   type="rectangle",
                   box=Rectangle(x_min=321.27, y_min=503, x_max=374.16, y_max=517.63),
                   text_content="coverage_total",
                   who_annotated="peter"),
        Annotation(page=0,
                   type="rectangle",
                   box=Rectangle(x_min=373.4, y_min=504.5, x_max=399.66, y_max=518),
                   text_content="currency",
                   who_annotated="peter"),
        Annotation(page=0,
                   type="rectangle",
                   box=Rectangle(x_min=465.68, y_min=504.87, x_max=486.31, y_max=517.25),
                   text_content="deductible in %",
                   who_annotated="peter"),
        Annotation(page=1,
                   type="oval",
                   box=Rectangle(x_min=55.7, y_min=133.72, x_max=338.9, y_max=177.23),
                   text_content="add Honza",
                   who_annotated="peter"),
    ]

    def test_assertion_in_annotation_type(self):
        """If type is not in ADMISSIBLE_ANNOTATIONS, an error should be raised."""
        self.assertRaises(
            AssertionError, lambda: Annotation(
                page=0, type="invisible", box=Rectangle(0, 0, 0, 0)))

    def test_annotation_creation(self):
        """Test the creation of one Annotation object."""
        ann = Annotation(page=12,
                         type="rectangle",
                         box=Rectangle(10, 10, 13, 13),
                         text_content="FPP3",
                         who_annotated="terminator II",
                         label=3)
        expected_annotation_as_dict = {
            "page": 12,
            "type": "rectangle",
            "box": {
                "x_min": 10,
                "y_min": 10,
                "x_max": 13,
                "y_max": 13
            },
            "text_content": "FPP3",
            "who_annotated": "terminator II",
            "label": 3
        }
        self.assertEqual(ann.as_dict, expected_annotation_as_dict)

    def test_annotation_extraction(self):
        """Extract annotation from file and check that they correspond to expected annotations."""
        annotations = self.extractor.get_annot_from_pdf(self.annotated_pdf)
        # each annotation is found in expected
        for annot in annotations:
            with self.subTest(annotation=annot):
                self.assertTrue(
                    any(
                        annotations_are_similar(annot, other)
                        for other in self.expected_annotations))
        # each expected annotation is found in annotations
        for exp_annot in self.expected_annotations:
            with self.subTest(expected_annotation=exp_annot):
                self.assertTrue(
                    any(
                        annotations_are_similar(exp_annot, other)
                        for other in annotations))

    def test_dump_annotations_to_file(self):
        """Dump annotations to file, load them from file, and compare that all is consistent."""
        annotations = self.extractor.get_annot_from_pdf(self.annotated_pdf)
        # mkstemp returns an already-open OS-level descriptor together with
        # the path. Only the path is needed, so close the descriptor right
        # away instead of leaking it.
        fd, temp_json_file = mkstemp()
        os.close(fd)
        try:
            self.extractor.dump_annotations_to_file(annotations, temp_json_file)
            with open(temp_json_file) as f:
                annots_from_file = json.load(f)
        finally:
            # Remove the temporary file even when dumping or loading fails.
            os.remove(temp_json_file)
        for i, annot in enumerate(annots_from_file):
            # check that the i'th annotation loaded from the file matches the
            # i'th annotation extracted from the pdf
            with self.subTest(annotation=annot):
                self.assertTrue(
                    annotations_are_similar(
                        Annotation(page=annot["page"],
                                   type=annot["type"],
                                   box=Rectangle.from_dict(annot["box"]),
                                   text_content=annot["text_content"],
                                   who_annotated=annot["who_annotated"],
                                   label=annot["label"]), annotations[i]))
def test_assertion_in_annotation_type(self):
    """Creating an Annotation with a type outside ADMISSIBLE_ANNOTATIONS should raise."""
    with self.assertRaises(AssertionError):
        Annotation(page=0, type="invisible", box=Rectangle(0, 0, 0, 0))
def test_image2box(self):
    """Check Rectangle.from_image on an all-zero image of known height and width."""
    height, width = 123, 234
    blank_image = np.zeros((height, width))
    expected_box = Rectangle(0, 0, width, height)
    self.assertEqual(expected_box, Rectangle.from_image(blank_image))
def test_intersection(self):
    """Tests the intersection and iou."""
    r1 = Rectangle(0, 1, 2, 3)
    r2 = Rectangle(-10, -10, 0, 0)
    r3 = Rectangle(1, 1, 5, 5)

    # Pairs expected to have no intersection, checked in both call directions.
    for left, right in ((r1, r2), (r2, r3)):
        self.assertIsNone(left.intersection(right))
        self.assertIsNone(right.intersection(left))

    # r1 and r3 overlap; the result must not depend on the call direction.
    for left, right in ((r1, r3), (r3, r1)):
        self.assertEqual(left.intersection(right), Rectangle(1, 1, 2, 3))

    # IoU of r1 with each other rectangle, again in both directions.
    for other, expected_iou in ((r2, 0), (r3, 1 / 9)):
        self.assertEqual(r1.get_iou(other), expected_iou)
        self.assertEqual(other.get_iou(r1), expected_iou)

    self.assertTrue(r1.intersection_width_some_other([r2, r3]))
    self.assertFalse(r2.intersection_width_some_other([r1, r3]))
def test_width_height(self):
    """Check the width, height, area and center attributes of a rectangle."""
    rect = Rectangle(x_min=0, y_min=1, x_max=2, y_max=33)
    expected_values = (
        ("width", 2),
        ("height", 32),
        ("area", 64),
        ("center", (1, 17)),
    )
    for attribute, expected in expected_values:
        self.assertEqual(getattr(rect, attribute), expected)