Пример #1
0
    def test_io(self):
        """Check the dict and coco round-trips of a single Rectangle."""
        corners = {"x_min": 0, "y_min": 1, "x_max": 2, "y_max": 33}
        coco = {"x_center": 1, "y_center": 17, "width": 2, "height": 32}
        example_rect = Rectangle(**corners)

        # corner-based dictionary: export, then rebuild from a fresh copy
        self.assertEqual(example_rect.as_dict, corners)
        self.assertEqual(example_rect, Rectangle.from_dict(dict(corners)))

        # center/size dictionary: export, then rebuild from keyword arguments
        self.assertEqual(example_rect.to_coco(), coco)
        self.assertEqual(example_rect, Rectangle.from_coco(**coco))
Пример #2
0
 def get_bounding_box_of_elem(elem: html.HtmlElement) -> Rectangle:
     """Build the bounding box of a word element.

     :param elem: element carrying ``xmin``, ``ymin``, ``xmax`` and ``ymax`` attributes
     :return: Rectangle constructed from those four attribute values
     """
     attributes = elem.attrib
     return Rectangle(x_min=attributes["xmin"],
                      y_min=attributes["ymin"],
                      x_max=attributes["xmax"],
                      y_max=attributes["ymax"])
Пример #3
0
    def ocr_one_image(cls,
                      img: Image.Image,
                      lang: str = "eng",
                      config: str = "--psm 1 --oem 3") -> List[Dict]:
        """Run tesseract on an image and collect the non-blank words it reports.

        :param img: input image
        :param lang: language code passed on to tesseract
        :param config: tesseract configuration string
        :return: list of dictionaries of type {"word": word, "bb": bounding box of the word, relative to page size}
        """
        ocr = pytesseract.image_to_data(img,
                                        lang=lang,
                                        config=config,
                                        output_type=pytesseract.Output.DICT)
        words_with_boxes = []
        for idx, text in enumerate(ocr["text"]):
            # skip entries whose text is empty or whitespace-only
            if not text.strip():
                continue

            x_start, y_start = ocr["left"][idx], ocr["top"][idx]
            box_width, box_height = ocr["width"][idx], ocr["height"][idx]
            pixel_box = Rectangle(x_min=x_start,
                                  y_min=y_start,
                                  x_max=x_start + box_width,
                                  y_max=y_start + box_height)
            words_with_boxes.append({
                "word": text,
                # express the box relative to the image size
                "bb": pixel_box.relative_to_size(width=img.size[0],
                                                 height=img.size[1]),
            })
        return words_with_boxes
Пример #4
0
    def test_subrectangle(self):
        """Check the ``in`` operator between rectangles of different sizes."""
        tiny = Rectangle(0, 1, 2, 3)
        tall = Rectangle(0, 1, 4, 10)
        wide = Rectangle(-1, 1, 4, 3)
        huge = Rectangle(-1, 0, 4, 10)

        # (inner, outer) pairs where inner is expected to be reported as contained
        contained = (
            (tiny, tall),
            (tiny, wide),
            (tiny, huge),
            (tall, huge),
            (wide, huge),
        )
        for inner, outer in contained:
            self.assertTrue(inner in outer)

        # (candidate, container) pairs where containment must be rejected
        rejected = (
            (tall, tiny),
            (wide, tiny),
            (huge, tiny),
            (huge, tall),
            (huge, wide),
        )
        for candidate, container in rejected:
            self.assertFalse(candidate in container)
Пример #5
0
    def test_normalization(self):
        """Normalizing four rectangles should yield the two expected ones, in either order."""
        rectangles = [
            Rectangle(0, 0, 2, 2),
            Rectangle(3, 3, 5, 5),
            Rectangle(100, 1, 120, 4),
            Rectangle(1, 1, 4, 4),
        ]
        expected_far = Rectangle(x_min=100.0, y_min=1.0, x_max=120.0, y_max=4.0)
        expected_near = Rectangle(x_min=0.0, y_min=0.0, x_max=5.0, y_max=5.0)

        normalized = Rectangle.normalize_list_of_rectangles(rectangles)

        # the order of the two resulting rectangles is not prescribed
        accepted_results = (
            [expected_far, expected_near],
            [expected_near, expected_far],
        )
        self.assertTrue(normalized in accepted_results)
Пример #6
0
    def test_resizing_methods(self):
        """Check ``rescale``, ``to_int`` and ``relative_to_size`` on one rectangle."""
        rect = Rectangle(0.0, 1.5, 2.0, 33.3)

        rescaled = rect.rescale(0.1, 10)
        self.assertEqual(rescaled, Rectangle(0, 15, 0.2, 333))

        as_int = rect.to_int()
        self.assertEqual(as_int, Rectangle(0, 1, 2, 33))

        page_width, page_height = 100, 200
        relative = rect.relative_to_size(width=page_width, height=page_height)
        expected_relative = Rectangle(x_min=0,
                                      y_min=1.5 / page_height,
                                      x_max=2.0 / page_width,
                                      y_max=33.3 / page_height)
        self.assertEqual(relative, expected_relative)
Пример #7
0
    def test_smallest_common_superrectangle(self):
        """The common super-rectangle must be the same whichever rectangle is the receiver."""
        near_origin = Rectangle(0, 0, 1, 1)
        far_corner = Rectangle(10, 10, 11, 11)

        for receiver, argument in ((near_origin, far_corner),
                                   (far_corner, near_origin)):
            self.assertEqual(
                receiver.smallest_common_superrectangle(argument),
                Rectangle(0, 0, 11, 11))
Пример #8
0
    def test_pdf_box_to_image_box(self):
        """Transform bounding box from points to pixels.

        If shape ratios do not match, an exception should be raised.
        """
        pdf_box = Rectangle(10, 10, 20, 30)
        pdf_size = {"pdf_width": 50, "pdf_height": 100}

        # image with the same aspect ratio as the pdf page
        converted = converter.pdf_box_to_image_box(pdf_box=pdf_box,
                                                   img_width=500,
                                                   img_height=1000,
                                                   **pdf_size)
        self.assertEqual(converted, Rectangle(100, 100, 200, 300))

        # image with swapped width and height
        with self.assertRaises(converter.RotatedPdfException):
            converter.pdf_box_to_image_box(pdf_box=pdf_box,
                                           img_width=1000,
                                           img_height=500,
                                           **pdf_size)
Пример #9
0
    def test_text_extraction_from_pdf(self):
        """This is essentially testing pdftotext (probably coming from Poppler, or Xpdf)."""
        simple_text = self.pdf.simple_text
        layout_text = self.pdf.layout_text
        # markup of the first page, with bounding boxes of words
        first_page_root = self.pdf.get_page_as_html(0)

        # pages are separated by form feeds; empty chunks are dropped
        simple_pages = list(filter(None, simple_text.split("\f")))
        layout_pages = list(filter(None, layout_text.split("\f")))

        # We have two pages in the pdf
        for extracted_pages in (simple_pages, layout_pages):
            self.assertEqual(len(extracted_pages), 2)

        # Test that the first page contains the expected words and none of the unexpected ones
        first_page_words = set(simple_pages[0].split())
        expected_words = {
            "Lorem", "ipsum", "Aron", "killed", "*****@*****.**"
        }
        unexpected_words = {
            "Autobahn", "Das", "The", "name", "hungry", "*****@*****.**"
        }
        self.assertTrue(expected_words.issubset(first_page_words))
        self.assertFalse(unexpected_words & first_page_words)

        # these regexes should be matched in a reasonably extracted layout text
        first_page_match = re.search(r"Stolen\s+bike\s+500\s+Euro\s+3%",
                                     layout_pages[0])
        self.assertTrue(first_page_match)
        second_page_match = re.search(r"[email protected]\s*\n", layout_pages[1])
        self.assertTrue(second_page_match)

        # Find bounding box of 'extreme' word on first page
        extreme_attrs = first_page_root.xpath(
            ".//word[text()='extreme']")[0].attrib
        extreme_bb = Rectangle(x_min=extreme_attrs["xmin"],
                               y_min=extreme_attrs["ymin"],
                               x_max=extreme_attrs["xmax"],
                               y_max=extreme_attrs["ymax"])

        # Check that the bounding box is reasonable
        plausible_area = Rectangle(x_min=220, y_min=530, x_max=290, y_max=590)
        self.assertTrue(extreme_bb in plausible_area)
Пример #10
0
    def test_text_extraction_from_rotated_pdf(self):
        """Check that bounding box of a word in pdf is where it should be."""
        upright_pages = self.pdf.get_pages()
        rotated_pages = self.pdf_rotated.get_pages()
        upright_pages_txt = self.pdf.get_pages_as_text()

        for page_collection in (upright_pages, upright_pages_txt):
            self.assertEqual(len(page_collection), 2)

        upright_word = upright_pages[0][0]
        rotated_word = rotated_pages[0][0]

        # both pdf should start with "Insurance" on the first page
        for word in (upright_word, rotated_word):
            self.assertEqual(word.text, "Insurance")

        # enforce approximate bounding box of this first word in each pdf
        expected_areas = (
            (self.pdf, upright_word,
             Rectangle(x_min=72, y_min=98, x_max=165, y_max=128)),
            (self.pdf_rotated, rotated_word,
             Rectangle(x_min=712, y_min=70, x_max=750, y_max=162)),
        )
        for pdf, word, area in expected_areas:
            self.assertTrue(pdf.get_bounding_box_of_elem(word) in area)
Пример #11
0
    def test_dump_annotations_to_file(self):
        """Dump annotations to file, load them from file, and compare that all is consistent."""
        annotations = self.extractor.get_annot_from_pdf(self.annotated_pdf)

        # mkstemp returns an already opened OS-level descriptor together with the path;
        # close it right away so that the descriptor is not leaked.
        descriptor, temp_json_file = mkstemp()
        os.close(descriptor)
        try:
            self.extractor.dump_annotations_to_file(annotations,
                                                    temp_json_file)
            with open(temp_json_file) as f:
                annots_from_file = json.load(f)
        finally:
            # remove the temporary file even when dumping or loading fails
            os.remove(temp_json_file)

        # without this check, a file with fewer annotations would pass silently
        self.assertEqual(len(annots_from_file), len(annotations))

        for annot, extracted in zip(annots_from_file, annotations):
            # check that the i'th annotation is the same in annotations and in annots_from_file
            with self.subTest(annotation=annot):
                self.assertTrue(
                    annotations_are_similar(
                        Annotation(page=annot["page"],
                                   type=annot["type"],
                                   box=Rectangle.from_dict(annot["box"]),
                                   text_content=annot["text_content"],
                                   who_annotated=annot["who_annotated"],
                                   label=annot["label"]), extracted))
Пример #12
0
    def _create_annotations_bounding_box(box_as_list: List,
                                         page_height: Union[int, float],
                                         from_above: bool = True) -> Rectangle:
        """Convert an annotation's raw bounding box into a Rectangle.

        The bounding box of an annotation extracted with the PyPDF2 Reader has a slightly
        unintuitive format; here it is converted to our standard Rectangle object.

        :param box_as_list: list of 4 numbers, assumed to be (x_min, y_min, x_max, y_max).
            Vertical coordinates are increasing from below!
        :param page_height: number
        :param from_above: if True (default), the vertical coordinates are renormalized
            to increase from above, otherwise they are taken over unchanged.
        :return: adjusted Rectangle object
        """

        def vertical(flipped_index: int, plain_index: int) -> float:
            # Flipping the axis turns the input's upper edge into the new minimum
            # (and vice versa), hence the two different indices.
            if from_above:
                return float(page_height) - float(box_as_list[flipped_index])
            return float(box_as_list[plain_index])

        return Rectangle(x_min=float(box_as_list[0]),
                         y_min=vertical(3, 1),
                         x_max=float(box_as_list[2]),
                         y_max=vertical(1, 3))
Пример #13
0
def pdf_box_to_image_box(pdf_box: Rectangle, pdf_width: int, pdf_height: int,
                         img_width: int, img_height: int) -> Rectangle:
    """Convert a box in pdf coordinates into the box in image coordinates.

    :param pdf_box: a Rectangle representing some area in a pdf page
    :param pdf_width: width of the pdf page
    :param pdf_height: height of the pdf page
    :param img_width: width of the image of the page
    :param img_height: height of the image of the page
    :return: a corresponding Rectangle in the image-coordinates, built with dtype=int
    :raises RotatedPdfException: if the height/width ratios of the image and of the
        pdf page differ by more than 0.1
    """
    if abs(img_height / img_width - pdf_height / pdf_width) > 0.1:
        raise RotatedPdfException("Pdf seems to be rotated, skipping")

    # The ratio check above tolerates slightly different aspect ratios, so each
    # axis needs its own scale: horizontal from the widths, vertical from the heights.
    w_scale = img_width / pdf_width
    h_scale = img_height / pdf_height
    return Rectangle(
        x_min=pdf_box.x_min * w_scale,
        y_min=pdf_box.y_min * h_scale,
        x_max=pdf_box.x_max * w_scale,
        y_max=pdf_box.y_max * h_scale,
        dtype=int,
    )
Пример #14
0
    def test_annotation_creation(self):
        """Create one Annotation and compare its ``as_dict`` with the expected dictionary."""
        plain_fields = {
            "page": 12,
            "type": "rectangle",
            "text_content": "FPP3",
            "who_annotated": "terminator II",
            "label": 3,
        }
        ann = Annotation(box=Rectangle(10, 10, 13, 13), **plain_fields)

        # the box is expected to be exported as a nested dictionary of its corners
        expected_annotation_as_dict = dict(
            plain_fields,
            box={"x_min": 10, "y_min": 10, "x_max": 13, "y_max": 13},
        )

        self.assertEqual(ann.as_dict, expected_annotation_as_dict)
Пример #15
0
class TestAnnotation(unittest.TestCase):
    """Tests for Annotation objects and for AnnotationExtractor run on an annotated pdf."""

    # Shared fixtures; they are built once, when the class body is executed.
    annotated_pdf = Pdf(ANNOTATED_PDF_PATH)
    extractor = AnnotationExtractor()

    # Annotations the extractor is expected to find in the annotated pdf.
    expected_annotations = [
        Annotation(
            page=0,
            type="note",
            box=Rectangle(x_min=87.58,
                          y_min=45.574,
                          x_max=107.58,
                          y_max=65.574),
            text_content=
            "Daniel, include also the remaining 133 pages in the pdf!!!!",
            who_annotated="peter"),
        Annotation(page=0,
                   type="rectangle",
                   box=Rectangle(x_min=83.46,
                                 y_min=504.12,
                                 x_max=221.12,
                                 y_max=518),
                   text_content="risk",
                   who_annotated="peter"),
        Annotation(page=0,
                   type="rectangle",
                   box=Rectangle(x_min=321.27,
                                 y_min=503,
                                 x_max=374.16,
                                 y_max=517.63),
                   text_content="coverage_total",
                   who_annotated="peter"),
        Annotation(page=0,
                   type="rectangle",
                   box=Rectangle(x_min=373.4,
                                 y_min=504.5,
                                 x_max=399.66,
                                 y_max=518),
                   text_content="currency",
                   who_annotated="peter"),
        Annotation(page=0,
                   type="rectangle",
                   box=Rectangle(x_min=465.68,
                                 y_min=504.87,
                                 x_max=486.31,
                                 y_max=517.25),
                   text_content="deductible in %",
                   who_annotated="peter"),
        Annotation(page=1,
                   type="oval",
                   box=Rectangle(x_min=55.7,
                                 y_min=133.72,
                                 x_max=338.9,
                                 y_max=177.23),
                   text_content="add Honza",
                   who_annotated="peter"),
    ]

    def test_assertion_in_annotation_type(self):
        """If type is not in ADMISSIBLE_ANNOTATIONS, an error should be raised."""
        self.assertRaises(
            AssertionError, lambda: Annotation(
                page=0, type="invisible", box=Rectangle(0, 0, 0, 0)))

    def test_annotation_creation(self):
        """Test the creation of one Annotation object."""
        ann = Annotation(page=12,
                         type="rectangle",
                         box=Rectangle(10, 10, 13, 13),
                         text_content="FPP3",
                         who_annotated="terminator II",
                         label=3)

        expected_annotation_as_dict = {
            "page": 12,
            "type": "rectangle",
            "box": {
                "x_min": 10,
                "y_min": 10,
                "x_max": 13,
                "y_max": 13
            },
            "text_content": "FPP3",
            "who_annotated": "terminator II",
            "label": 3
        }

        self.assertEqual(ann.as_dict, expected_annotation_as_dict)

    def test_annotation_extraction(self):
        """Extract annotation from file and check that they correspond to expected annotations."""
        annotations = self.extractor.get_annot_from_pdf(self.annotated_pdf)

        # each annotation is found in expected
        for annot in annotations:
            with self.subTest(annotation=annot):
                self.assertTrue(
                    any(
                        annotations_are_similar(annot, other)
                        for other in self.expected_annotations))
        # each expected annotation is found in annotations
        for exp_annot in self.expected_annotations:
            with self.subTest(expected_annotation=exp_annot):
                self.assertTrue(
                    any(
                        annotations_are_similar(exp_annot, other)
                        for other in annotations))

    def test_dump_annotations_to_file(self):
        """Dump annotations to file, load them from file, and compare that all is consistent."""
        annotations = self.extractor.get_annot_from_pdf(self.annotated_pdf)

        # mkstemp returns an already opened OS-level descriptor together with the path;
        # close it right away so that the descriptor is not leaked.
        descriptor, temp_json_file = mkstemp()
        os.close(descriptor)
        try:
            self.extractor.dump_annotations_to_file(annotations,
                                                    temp_json_file)
            with open(temp_json_file) as f:
                annots_from_file = json.load(f)
        finally:
            # remove the temporary file even when dumping or loading fails
            os.remove(temp_json_file)

        # without this check, a file with fewer annotations would pass silently
        self.assertEqual(len(annots_from_file), len(annotations))

        for annot, extracted in zip(annots_from_file, annotations):
            # check that the i'th annotation is the same in annotations and in annots_from_file
            with self.subTest(annotation=annot):
                self.assertTrue(
                    annotations_are_similar(
                        Annotation(page=annot["page"],
                                   type=annot["type"],
                                   box=Rectangle.from_dict(annot["box"]),
                                   text_content=annot["text_content"],
                                   who_annotated=annot["who_annotated"],
                                   label=annot["label"]), extracted))
Пример #16
0
 def test_assertion_in_annotation_type(self):
     """An Annotation whose type is not in ADMISSIBLE_ANNOTATIONS must not be constructible."""
     # "invisible" serves as the inadmissible type; construction is expected to fail
     with self.assertRaises(AssertionError):
         Annotation(page=0, type="invisible", box=Rectangle(0, 0, 0, 0))
Пример #17
0
 def test_image2box(self):
     """``Rectangle.from_image`` of an h-by-w array should equal ``Rectangle(0, 0, w, h)``."""
     height, width = 123, 234
     blank_image = np.zeros((height, width))

     expected_box = Rectangle(0, 0, width, height)
     box_from_image = Rectangle.from_image(blank_image)

     self.assertEqual(expected_box, box_from_image)
Пример #18
0
    def test_intersection(self):
        """Tests the intersection and iou."""
        r1 = Rectangle(0, 1, 2, 3)
        r2 = Rectangle(-10, -10, 0, 0)
        r3 = Rectangle(1, 1, 5, 5)

        # pairs involving r2 are expected to have no intersection, in both directions
        for receiver, other in ((r1, r2), (r2, r1), (r2, r3), (r3, r2)):
            self.assertIsNone(receiver.intersection(other))

        # r1 and r3 overlap; the result must not depend on the receiver
        overlap = Rectangle(1, 1, 2, 3)
        for receiver, other in ((r1, r3), (r3, r1)):
            self.assertEqual(receiver.intersection(other), overlap)

        # (receiver, other, expected iou)
        iou_cases = (
            (r1, r2, 0),
            (r2, r1, 0),
            (r1, r3, 1 / 9),
            (r3, r1, 1 / 9),
        )
        for receiver, other, expected_iou in iou_cases:
            self.assertEqual(receiver.get_iou(other), expected_iou)

        self.assertTrue(r1.intersection_width_some_other([r2, r3]))
        self.assertFalse(r2.intersection_width_some_other([r1, r3]))
Пример #19
0
 def test_width_height(self):
     """Check ``width``, ``height``, ``area`` and ``center`` of one rectangle."""
     rect = Rectangle(x_min=0, y_min=1, x_max=2, y_max=33)
     expected_values = (
         ("width", 2),
         ("height", 32),
         ("area", 64),
         ("center", (1, 17)),
     )
     for attribute, expected in expected_values:
         self.assertEqual(getattr(rect, attribute), expected)