示例#1
0
def create_pdf_document(
    elements: Union[List[LTComponent], Dict[int, List[LTComponent]]],
    font_mapping: Optional[Dict[str, str]] = None,
    font_mapping_is_regex: bool = False,
    regex_flags: Union[int, re.RegexFlag] = 0,
    font_size_precision: int = 1,
    element_ordering: Union[
        ElementOrdering, Callable[[List], List]
    ] = ElementOrdering.LEFT_TO_RIGHT_TOP_TO_BOTTOM,
) -> "PDFDocument":
    """
    Build a PDFDocument from raw elements.

    "elements" is either a plain list of elements, in which case the document gets
    a single page numbered 1, or a dictionary mapping each page number to the list
    of elements on that page. Every page is created with a width and height of 100.

    All remaining arguments are forwarded to PDFDocument unchanged.
    """
    # Treat a bare list as the contents of page 1 so both input shapes share one
    # code path below.
    elements_by_page = elements if isinstance(elements, dict) else {1: elements}

    pages = {}
    for page_number, page_elements in elements_by_page.items():
        pages[page_number] = Page(elements=page_elements, width=100, height=100)

    return PDFDocument(
        pages=pages,
        font_mapping=font_mapping,
        font_mapping_is_regex=font_mapping_is_regex,
        regex_flags=regex_flags,
        font_size_precision=font_size_precision,
        element_ordering=element_ordering,
    )
示例#2
0
    def test_filter_by_page(self):
        """Filtering by page 1 yields exactly the two elements placed on page 1."""
        first_page_originals = (FakePDFMinerTextElement(), FakePDFMinerTextElement())
        second_page_original = FakePDFMinerTextElement()

        doc = PDFDocument(
            {
                1: Page(width=100, height=100, elements=list(first_page_originals)),
                2: Page(width=100, height=100, elements=[second_page_original]),
            }
        )

        on_first_page = doc.elements.filter_by_page(1)

        self.assertEqual(len(on_first_page), 2)
        for original in first_page_originals:
            self.assert_original_element_in(original, on_first_page)
示例#3
0
    def test_eq(self):
        """
        Exercises ElementList equality.

        Comparing against a non-ElementList is expected to raise
        NotImplementedError. Two lists are expected to be equal only when they
        share both the same indexes and the same document.
        """
        with self.assertRaises(NotImplementedError):
            self.elem_list == "foo"

        all_indexes = set(range(6))

        # Same document, same indexes: equal.
        matching_list = ElementList(self.doc, set(all_indexes))
        self.assertTrue(self.elem_list == matching_list)

        # Same document, one index missing: not equal.
        shorter_list = ElementList(self.doc, all_indexes - {5})
        self.assertFalse(self.elem_list == shorter_list)

        # Same indexes, but bound to a separately built document: not equal.
        other_page = Page(
            elements=[FakePDFMinerTextElement() for _ in range(6)],
            width=100,
            height=100,
        )
        other_doc = PDFDocument(pages={1: other_page})
        other_doc_list = ElementList(other_doc, set(all_indexes))
        self.assertFalse(self.elem_list == other_doc_list)
示例#4
0
    def test_filter_partially_within_bounding_box(self, partially_within_mock):
        """
        Checks filter_partially_within_bounding_box against a mocked
        partially-within check.

        The mock reports an element as matching exactly when its text is "within",
        so the expected result depends only on the element texts.
        """

        def reports_within(self, bounding_box):
            return self.text() == "within"

        partially_within_mock.side_effect = reports_within

        elem1 = FakePDFMinerTextElement(text="within")
        elem2 = FakePDFMinerTextElement(text="within")
        elem3 = FakePDFMinerTextElement()
        elem4 = FakePDFMinerTextElement(text="within")
        elem5 = FakePDFMinerTextElement()
        elem6 = FakePDFMinerTextElement(text="within")

        doc = PDFDocument(
            pages={
                1: Page(elements=[elem1, elem2, elem3, elem4], width=100, height=100),
                2: Page(elements=[elem5, elem6], width=100, height=100),
            }
        )
        all_elements = doc.elements

        pdf_elem1, pdf_elem2, pdf_elem3, pdf_elem4 = [
            self.extract_element_from_list(original, all_elements)
            for original in (elem1, elem2, elem3, elem4)
        ]

        # The trailing 1 is presumably the page number to filter on.
        result = all_elements.filter_partially_within_bounding_box(
            BoundingBox(0, 1, 0, 1), 1
        )

        # The mock must be consulted for every page 1 element, with a bounding box
        # equal to the one passed to the filter.
        expected_bbox = BoundingBox(0, 1, 0, 1)
        expected_calls = [
            call(pdf_elem, expected_bbox)
            for pdf_elem in (pdf_elem1, pdf_elem2, pdf_elem3, pdf_elem4)
        ]
        partially_within_mock.assert_has_calls(expected_calls, any_order=True)

        # Only the three page 1 elements whose text is "within" are returned.
        self.assertEqual(len(result), 3)
        for pdf_elem in (pdf_elem1, pdf_elem2, pdf_elem4):
            self.assertIn(pdf_elem, result)
示例#5
0
    def test_extract_single_element(self):
        """
        extract_single_element is expected to raise when the list holds several
        elements or none, and to return the element when there is exactly one.
        """
        # The fixture list is expected to contain more than one element.
        self.assertRaises(
            MultipleElementsFoundError, self.elem_list.extract_single_element
        )

        # Filtering on a tag nothing carries is expected to leave an empty list.
        with self.assertRaises(NoElementFoundError):
            untagged = self.elem_list.filter_by_tag("non_existent_tag")
            untagged.extract_single_element()

        only_original = FakePDFMinerTextElement()
        single_doc = PDFDocument(
            pages={1: Page(elements=[only_original], width=100, height=100)}
        )
        expected = self.extract_element_from_list(only_original, single_doc.elements)

        self.assertEqual(single_doc.elements.extract_single_element(), expected)
示例#6
0
 def test_document_with_blank_page(self):
     """Building a document whose only page has no elements must raise."""
     with self.assertRaises(NoElementsOnPageError):
         blank_page = Page(elements=[], width=100, height=100)
         PDFDocument(pages={1: blank_page})
示例#7
0
    def test_document(self):
        """
        End-to-end check of PDFDocument construction from two four-element pages.

        Covers element ordering, index assignment, page numbers, page lookup and
        the document-wide element list.
        """
        # One bounding box per corner, listed in the order the test expects the
        # document to place them: top left, top right, bottom left, bottom right.
        corner_boxes = [(0, 1, 2, 3), (2, 3, 2, 3), (0, 1, 0, 1), (2, 3, 0, 1)]

        # Page 1 receives its elements already in the expected order.
        page_1_corners = [
            FakePDFMinerTextElement(BoundingBox(*box)) for box in corner_boxes
        ]
        page_1 = Page(elements=list(page_1_corners), width=100, height=100)

        # Page 2 receives them reversed, so the document has to reorder them.
        page_2_corners = [
            FakePDFMinerTextElement(BoundingBox(*box)) for box in corner_boxes
        ]
        page_2 = Page(elements=page_2_corners[::-1], width=100, height=100)

        document = PDFDocument(pages={1: page_1, 2: page_2})
        ordered_elements = document._element_list

        # Elements come out page by page, each page in the expected corner order.
        self.assertEqual(
            [elem.original_element for elem in ordered_elements],
            page_1_corners + page_2_corners,
        )

        # Indexes run consecutively across both pages.
        self.assertEqual([elem._index for elem in ordered_elements], list(range(8)))

        self.assertEqual(document.page_numbers, [1, 2])
        self.assertEqual(document.number_of_pages, 2)

        # The first four elements belong to page 1, the last four to page 2.
        self.assertEqual(
            [elem.page_number for elem in ordered_elements],
            [1] * 4 + [2] * 4,
        )

        # Each retrieved page must match its source page's size, start and end on
        # the expected corner elements, and expose the expected element indexes.
        page_checks = (
            (1, page_1, page_1_corners, {0, 1, 2, 3}),
            (2, page_2, page_2_corners, {4, 5, 6, 7}),
        )
        pdf_pages = []
        for page_number, source_page, corners, indexes in page_checks:
            pdf_page = document.get_page(page_number)
            self.assertEqual(source_page.width, pdf_page.width)
            self.assertEqual(source_page.height, pdf_page.height)
            self.assertEqual(corners[0], pdf_page.start_element.original_element)
            self.assertEqual(corners[-1], pdf_page.end_element.original_element)
            self.assertEqual(pdf_page.page_number, page_number)
            self.assertEqual(pdf_page.elements, ElementList(document, indexes))
            pdf_pages.append(pdf_page)

        self.assertEqual(document.pages, pdf_pages)
        self.assertEqual(document.elements, ElementList(document, set(range(8))))

        # Asking for a page that does not exist must raise.
        with self.assertRaises(PageNotFoundError):
            document.get_page(3)
示例#8
0
    def test_horizontally_in_line_with(self, partially_within_mock):
        """
        Checks horizontally_in_line_with on a document's element list, using a
        mocked partially-within check.

        Three calls are exercised: the default call, inclusive=True, and
        tolerance=0.1. For each, the test checks the bounding box passed to the
        mock for every element on page 1.

        NOTE(review): partially_within_mock is presumably injected by a patch
        decorator that is not visible in this view - confirm the patch target.
        """
        # The mock reports an element as matching exactly when its text is
        # "within", regardless of the bounding box it is given.
        partially_within_mock.side_effect = (
            lambda self, bounding_box: self.text() == "within"
        )

        # elem1 is the reference element and the only one given an explicit
        # bounding box.
        elem1 = FakePDFMinerTextElement(
            text="within", bounding_box=BoundingBox(50, 51, 50, 51)
        )
        elem2 = FakePDFMinerTextElement(text="within")
        elem3 = FakePDFMinerTextElement()
        elem4 = FakePDFMinerTextElement(text="within")
        elem5 = FakePDFMinerTextElement()
        elem6 = FakePDFMinerTextElement(text="within")

        page1 = Page(elements=[elem1, elem2, elem3, elem4], width=100, height=100)
        page2 = Page(elements=[elem5, elem6], width=100, height=100)

        doc = PDFDocument(pages={1: page1, 2: page2})
        elem_list = doc.elements

        # Look up the document's elements corresponding to the page 1 originals.
        pdf_elem1 = self.extract_element_from_list(elem1, elem_list)
        pdf_elem2 = self.extract_element_from_list(elem2, elem_list)
        pdf_elem3 = self.extract_element_from_list(elem3, elem_list)
        pdf_elem4 = self.extract_element_from_list(elem4, elem_list)

        result = elem_list.horizontally_in_line_with(pdf_elem1)

        # expected_bbox appears to span the full page width (0 to 100) at the
        # vertical extent of elem1 (50 to 51) - TODO confirm the BoundingBox
        # argument order.
        expected_bbox = BoundingBox(0, 100, 50, 51)
        partially_within_mock.assert_has_calls(
            [
                call(pdf_elem1, expected_bbox),
                call(pdf_elem2, expected_bbox),
                call(pdf_elem3, expected_bbox),
                call(pdf_elem4, expected_bbox),
            ],
            any_order=True,
        )

        # By default the reference element itself is not part of the result.
        self.assertEqual(len(result), 2)
        self.assertIn(pdf_elem2, result)
        self.assertIn(pdf_elem4, result)

        # Also test with inclusive=True
        partially_within_mock.reset_mock()
        result = elem_list.horizontally_in_line_with(pdf_elem1, inclusive=True)

        partially_within_mock.assert_has_calls(
            [
                call(pdf_elem1, expected_bbox),
                call(pdf_elem2, expected_bbox),
                call(pdf_elem3, expected_bbox),
                call(pdf_elem4, expected_bbox),
            ],
            any_order=True,
        )

        # With inclusive=True the reference element is returned as well.
        self.assertEqual(len(result), 3)
        self.assertIn(pdf_elem1, result)
        self.assertIn(pdf_elem2, result)
        self.assertIn(pdf_elem4, result)

        # Test specifying tolerance
        # The last two coordinates of the expected box each move inwards by the
        # tolerance (50 -> 50.1 and 51 -> 50.9).
        expected_bbox = BoundingBox(0, 100, 50.1, 50.9)

        partially_within_mock.reset_mock()
        result = elem_list.horizontally_in_line_with(pdf_elem1, tolerance=0.1)

        partially_within_mock.assert_has_calls(
            [
                call(pdf_elem1, expected_bbox),
                call(pdf_elem2, expected_bbox),
                call(pdf_elem3, expected_bbox),
                call(pdf_elem4, expected_bbox),
            ],
            any_order=True,
        )