def create_pdf_document(
    elements: Union[List[LTComponent], Dict[int, List[LTComponent]]],
    font_mapping: Optional[Dict[str, str]] = None,
    font_mapping_is_regex: bool = False,
    regex_flags: Union[int, re.RegexFlag] = 0,
    font_size_precision: int = 1,
    element_ordering: Union[
        ElementOrdering, Callable[[List], List]
    ] = ElementOrdering.LEFT_TO_RIGHT_TOP_TO_BOTTOM,
) -> "PDFDocument":
    """
    Build a PDFDocument directly from already-constructed elements.

    "elements" is either a plain list, which yields a document with a single
    page numbered 1, or a dictionary keyed by page number whose values are
    the element lists for those pages. Every page is created with a width and
    height of 100.

    All remaining arguments are forwarded unchanged to PDFDocument.
    """
    # Treat the single-list form as a one-page mapping so that both input
    # shapes go through the same page-building code.
    elements_by_page = elements if isinstance(elements, dict) else {1: elements}

    pages = {
        number: Page(elements=page_elements, width=100, height=100)
        for number, page_elements in elements_by_page.items()
    }

    return PDFDocument(
        pages=pages,
        font_mapping=font_mapping,
        font_mapping_is_regex=font_mapping_is_regex,
        regex_flags=regex_flags,
        font_size_precision=font_size_precision,
        element_ordering=element_ordering,
    )
def test_filter_by_page(self):
    # Two elements live on page 1 and one on page 2; filtering by page 1
    # must return exactly the two page-1 elements.
    first = FakePDFMinerTextElement()
    second = FakePDFMinerTextElement()
    third = FakePDFMinerTextElement()

    pages = {
        1: Page(width=100, height=100, elements=[first, second]),
        2: Page(width=100, height=100, elements=[third]),
    }
    doc = PDFDocument(pages)

    self.assertEqual(len(doc.elements.filter_by_page(1)), 2)
    for original in (first, second):
        self.assert_original_element_in(original, doc.elements.filter_by_page(1))
def test_eq(self):
    # Comparing an ElementList with something that is not an ElementList is
    # expected to raise instead of quietly returning False.
    with self.assertRaises(NotImplementedError):
        self.elem_list == "foo"

    # Same document, same indexes: equal.
    matching_list = ElementList(self.doc, set(range(6)))
    self.assertTrue(self.elem_list == matching_list)

    # Same document, one index missing: not equal.
    shorter_list = ElementList(self.doc, set(range(5)))
    self.assertFalse(self.elem_list == shorter_list)

    # Same indexes, but belonging to a different document: not equal.
    other_page = Page(
        elements=[FakePDFMinerTextElement() for _ in range(6)],
        width=100,
        height=100,
    )
    other_doc = PDFDocument(pages={1: other_page})
    foreign_list = ElementList(other_doc, set(range(6)))
    self.assertFalse(self.elem_list == foreign_list)
def test_filter_partially_within_bounding_box(self, partially_within_mock):
    # The mocked check reports an element as inside the box purely from its
    # text, so the expected result is decided by the fixtures below.
    def text_is_within(element, bounding_box):
        return element.text() == "within"

    partially_within_mock.side_effect = text_is_within

    # Page 1: three "within" elements and one that is not.
    page_one_originals = [
        FakePDFMinerTextElement(text="within"),
        FakePDFMinerTextElement(text="within"),
        FakePDFMinerTextElement(),
        FakePDFMinerTextElement(text="within"),
    ]
    # Page 2 also holds a "within" element, which must not show up in the
    # result for page 1.
    page_two_originals = [
        FakePDFMinerTextElement(),
        FakePDFMinerTextElement(text="within"),
    ]

    doc = PDFDocument(
        pages={
            1: Page(elements=page_one_originals, width=100, height=100),
            2: Page(elements=page_two_originals, width=100, height=100),
        }
    )
    all_elements = doc.elements
    page_one_pdf_elements = [
        self.extract_element_from_list(original, all_elements)
        for original in page_one_originals
    ]

    # Second argument is 1 -- presumably the page number to filter on.
    result = all_elements.filter_partially_within_bounding_box(
        BoundingBox(0, 1, 0, 1), 1
    )

    # Each page-1 element should have been tested against a box equal to the
    # one passed to the filter.
    expected_bbox = BoundingBox(0, 1, 0, 1)
    partially_within_mock.assert_has_calls(
        [call(pdf_element, expected_bbox) for pdf_element in page_one_pdf_elements],
        any_order=True,
    )

    self.assertEqual(len(result), 3)
    for position in (0, 1, 3):
        self.assertIn(page_one_pdf_elements[position], result)
def test_extract_single_element(self):
    # More than one element in the list: extraction must fail.
    with self.assertRaises(MultipleElementsFoundError):
        self.elem_list.extract_single_element()

    # No element left after filtering: extraction must fail as well.
    with self.assertRaises(NoElementFoundError):
        untagged = self.elem_list.filter_by_tag("non_existent_tag")
        untagged.extract_single_element()

    # Exactly one element: it is returned.
    only_original = FakePDFMinerTextElement()
    doc = PDFDocument(
        pages={1: Page(elements=[only_original], width=100, height=100)}
    )
    expected = self.extract_element_from_list(only_original, doc.elements)

    self.assertEqual(doc.elements.extract_single_element(), expected)
def test_document_with_blank_page(self):
    # Building a document whose only page has no elements must be rejected.
    with self.assertRaises(NoElementsOnPageError):
        blank_page = Page(elements=[], width=100, height=100)
        PDFDocument(pages={1: blank_page})
def test_document(self):
    def make_corner_elements():
        # Fresh elements in the order: top-left, top-right, bottom-left,
        # bottom-right.
        return (
            FakePDFMinerTextElement(BoundingBox(0, 1, 2, 3)),
            FakePDFMinerTextElement(BoundingBox(2, 3, 2, 3)),
            FakePDFMinerTextElement(BoundingBox(0, 1, 0, 1)),
            FakePDFMinerTextElement(BoundingBox(2, 3, 0, 1)),
        )

    # Page 1 receives its elements already in the expected order.
    p1_top_left, p1_top_right, p1_bottom_left, p1_bottom_right = (
        make_corner_elements()
    )
    page_1 = Page(
        elements=[p1_top_left, p1_top_right, p1_bottom_left, p1_bottom_right],
        width=100,
        height=100,
    )

    # Page 2 receives them reversed, so the document has to reorder them.
    p2_top_left, p2_top_right, p2_bottom_left, p2_bottom_right = (
        make_corner_elements()
    )
    page_2 = Page(
        elements=[p2_bottom_right, p2_bottom_left, p2_top_right, p2_top_left],
        width=100,
        height=100,
    )

    document = PDFDocument(pages={1: page_1, 2: page_2})

    # Elements come out ordered within each page, page 1 before page 2.
    self.assertEqual(
        [elem.original_element for elem in document._element_list],
        [
            p1_top_left,
            p1_top_right,
            p1_bottom_left,
            p1_bottom_right,
            p2_top_left,
            p2_top_right,
            p2_bottom_left,
            p2_bottom_right,
        ],
    )

    # Indexes are assigned sequentially across the whole document.
    self.assertEqual(
        [elem._index for elem in document._element_list],
        [0, 1, 2, 3, 4, 5, 6, 7],
    )

    # Page numbers and page count are reported correctly.
    self.assertEqual(document.page_numbers, [1, 2])
    self.assertEqual(document.number_of_pages, 2)

    # Every element carries the number of the page it belongs to.
    self.assertEqual(
        [elem.page_number for elem in document._element_list],
        [1, 1, 1, 1, 2, 2, 2, 2],
    )

    # Each page object mirrors its source page and exposes the right
    # start/end elements and element indexes.
    per_page_expectations = (
        (1, page_1, p1_top_left, p1_bottom_right, {0, 1, 2, 3}),
        (2, page_2, p2_top_left, p2_bottom_right, {4, 5, 6, 7}),
    )
    pdf_pages = []
    for number, source_page, first, last, indexes in per_page_expectations:
        pdf_page = document.get_page(number)
        self.assertEqual(source_page.width, pdf_page.width)
        self.assertEqual(source_page.height, pdf_page.height)
        self.assertEqual(first, pdf_page.start_element.original_element)
        self.assertEqual(last, pdf_page.end_element.original_element)
        self.assertEqual(pdf_page.page_number, number)
        self.assertEqual(pdf_page.elements, ElementList(document, indexes))
        pdf_pages.append(pdf_page)

    self.assertEqual(document.pages, pdf_pages)
    self.assertEqual(
        document.elements, ElementList(document, {0, 1, 2, 3, 4, 5, 6, 7})
    )

    # Asking for a page that does not exist is an error.
    with self.assertRaises(PageNotFoundError):
        document.get_page(3)
def test_horizontally_in_line_with(self, partially_within_mock):
    # The mocked check reports an element as inside the box purely from its
    # text, so the expected results are decided by the fixtures below.
    def text_is_within(element, bounding_box):
        return element.text() == "within"

    partially_within_mock.side_effect = text_is_within

    elem1 = FakePDFMinerTextElement(
        text="within", bounding_box=BoundingBox(50, 51, 50, 51)
    )
    elem2 = FakePDFMinerTextElement(text="within")
    elem3 = FakePDFMinerTextElement()
    elem4 = FakePDFMinerTextElement(text="within")
    elem5 = FakePDFMinerTextElement()
    elem6 = FakePDFMinerTextElement(text="within")

    doc = PDFDocument(
        pages={
            1: Page(elements=[elem1, elem2, elem3, elem4], width=100, height=100),
            2: Page(elements=[elem5, elem6], width=100, height=100),
        }
    )
    elem_list = doc.elements

    pdf_elem1 = self.extract_element_from_list(elem1, elem_list)
    pdf_elem2 = self.extract_element_from_list(elem2, elem_list)
    pdf_elem3 = self.extract_element_from_list(elem3, elem_list)
    pdf_elem4 = self.extract_element_from_list(elem4, elem_list)

    def assert_page_one_checked_against(bbox):
        # Every element on elem1's page must have been tested against bbox.
        partially_within_mock.assert_has_calls(
            [
                call(pdf_elem, bbox)
                for pdf_elem in (pdf_elem1, pdf_elem2, pdf_elem3, pdf_elem4)
            ],
            any_order=True,
        )

    # Default call: the reference element itself is left out of the result.
    result = elem_list.horizontally_in_line_with(pdf_elem1)

    # The expected box runs from 0 to 100 (matching the page size of 100)
    # along one axis and keeps elem1's own 50-51 range along the other.
    expected_bbox = BoundingBox(0, 100, 50, 51)
    assert_page_one_checked_against(expected_bbox)
    self.assertEqual(len(result), 2)
    for expected_member in (pdf_elem2, pdf_elem4):
        self.assertIn(expected_member, result)

    # inclusive=True: same box, but the reference element is also returned.
    partially_within_mock.reset_mock()
    result = elem_list.horizontally_in_line_with(pdf_elem1, inclusive=True)
    assert_page_one_checked_against(expected_bbox)
    self.assertEqual(len(result), 3)
    for expected_member in (pdf_elem1, pdf_elem2, pdf_elem4):
        self.assertIn(expected_member, result)

    # tolerance=0.1: elem1's 50-51 range is shrunk by 0.1 at both ends.
    expected_bbox = BoundingBox(0, 100, 50.1, 50.9)
    partially_within_mock.reset_mock()
    result = elem_list.horizontally_in_line_with(pdf_elem1, tolerance=0.1)
    assert_page_one_checked_against(expected_bbox)