def test_eq(self):
    # Two boxes built from the same coordinates compare equal; changing y1
    # alone is enough to make them unequal.
    reference = BoundingBox(0, 1, 0, 1)
    same_coordinates = BoundingBox(0, 1, 0, 1)
    self.assertEqual(reference, same_coordinates)

    different_y1 = BoundingBox(0, 1, 0, 3)
    self.assertNotEqual(reference, different_y1)
def test_extract_simple_table(self):
    # A rectangular 2x2 grid must come back as two rows of two elements:
    #
    #     top_left     top_right
    #     bottom_left  bottom_right
    #
    top_left = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 6, 10))
    top_right = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 6, 10))
    bottom_left = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 0, 5))
    bottom_right = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 0, 5))

    document = create_pdf_document(
        elements=[top_left, top_right, bottom_left, bottom_right]
    )
    table = extract_simple_table(document.elements)

    self.assertEqual(len(table), 2)
    for row in table:
        self.assertEqual(len(row), 2)
    self.assert_original_element_list_list_equal(
        [[top_left, top_right], [bottom_left, bottom_right]], table
    )

    # Adding a third element to the bottom row leaves an empty cell above it,
    # so the table is no longer rectangular and extraction must raise:
    #
    #     top_left     top_right
    #     bottom_left  bottom_right  extra
    #
    extra = FakePDFMinerTextElement(bounding_box=BoundingBox(11, 15, 0, 5))
    document = create_pdf_document(
        elements=[top_left, top_right, bottom_left, bottom_right, extra]
    )
    with self.assertRaises(TableExtractionError):
        extract_simple_table(document.elements)
def test_extract_text_from_simple_table(self):
    # With as_text=True a 2x2 grid is returned as cell text, not elements:
    #
    #     fake_text_1  fake_text_2
    #     fake_text_3  fake_text_4   (stored with a trailing space)
    #
    cells = [
        FakePDFMinerTextElement(
            bounding_box=BoundingBox(0, 5, 6, 10), text="fake_text_1"
        ),
        FakePDFMinerTextElement(
            bounding_box=BoundingBox(6, 10, 6, 10), text="fake_text_2"
        ),
        FakePDFMinerTextElement(
            bounding_box=BoundingBox(0, 5, 0, 5), text="fake_text_3"
        ),
        FakePDFMinerTextElement(
            bounding_box=BoundingBox(6, 10, 0, 5), text="fake_text_4 "
        ),
    ]
    elements = create_pdf_document(elements=cells).elements

    # By default the cell text is stripped.
    stripped = extract_simple_table(elements, as_text=True)
    self.assertEqual(len(stripped), 2)
    for row in stripped:
        self.assertEqual(len(row), 2)
    self.assertListEqual(
        [["fake_text_1", "fake_text_2"], ["fake_text_3", "fake_text_4"]],
        stripped,
    )

    # strip_text=False keeps the trailing space of the last cell.
    unstripped = extract_simple_table(elements, as_text=True, strip_text=False)
    self.assertListEqual(
        [["fake_text_1", "fake_text_2"], ["fake_text_3", "fake_text_4 "]],
        unstripped,
    )
def test_fix_element_in_multiple_cols(self):
    # The top element spans both columns of the bottom row:
    #
    #     ---------
    #     |   1   |
    #     ---------
    #     | 2 | 3 |
    #     ---------
    #
    spanning = FakePDFMinerTextElement(
        bounding_box=BoundingBox(0, 10, 6, 10), text="fake_text_1"
    )
    bottom_left = FakePDFMinerTextElement(
        bounding_box=BoundingBox(0, 5, 0, 5), text="fake_text_2"
    )
    bottom_right = FakePDFMinerTextElement(
        bounding_box=BoundingBox(6, 10, 0, 5), text="fake_text_3"
    )
    elements = create_pdf_document(
        elements=[spanning, bottom_left, bottom_right]
    ).elements

    # Without the fix flag, extraction is expected to fail.
    with self.assertRaises(TableExtractionError):
        extract_table(elements, as_text=True)

    # With the flag, the spanning element lands in the first column and the
    # cell next to it is returned as an empty string.
    table = extract_table(
        elements, as_text=True, fix_element_in_multiple_cols=True
    )
    self.assertEqual(len(table), 2)
    for row in table:
        self.assertEqual(len(row), 2)
    self.assertListEqual(
        [["fake_text_1", ""], ["fake_text_2", "fake_text_3"]], table
    )
def test_extract_table_with_tolerance(self):
    # A 2x2 grid:
    #
    #     top_left     top_right
    #     bottom_left  bottom_right
    #
    # where bottom_right reaches up to y=6.1 and therefore slightly overlaps
    # top_right, which starts at y=6. A tolerance is needed to absorb that.
    top_left = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 6, 10))
    top_right = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 6, 10))
    bottom_left = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 0, 5))
    bottom_right = FakePDFMinerTextElement(
        bounding_box=BoundingBox(6, 10, 0, 6.1)
    )
    elements = create_pdf_document(
        elements=[top_left, top_right, bottom_left, bottom_right]
    ).elements

    # With no tolerance the overlap makes extraction fail.
    with self.assertRaises(TableExtractionError):
        extract_table(elements)

    table = extract_table(elements, tolerance=0.2)
    self.assertEqual(len(table), 2)
    for row in table:
        self.assertEqual(len(row), 2)
    self.assert_original_element_list_list_equal(
        [[top_left, top_right], [bottom_left, bottom_right]], table
    )
def test_create_bounding_box(self):
    unit_box = BoundingBox(0, 1, 0, 1)
    self.assertEqual(unit_box.width, 1)
    self.assertEqual(unit_box.height, 1)

    # Reversed coordinates must be rejected, first on the x axis (x0 > x1)
    # and then on the y axis (y0 > y1).
    for invalid_coordinates in ((1, 0, 0, 1), (0, 1, 1, 0)):
        with self.assertRaises(InvalidCoordinatesError):
            BoundingBox(*invalid_coordinates)
def test_element_ordering(self):
    #     top_left     top_right
    #     bottom_left  bottom_right
    top_left = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 6, 10))
    top_right = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 6, 10))
    bottom_left = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 0, 5))
    bottom_right = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 0, 5))

    # Default ordering: left to right, top to bottom.
    document = create_pdf_document(
        elements=[top_left, top_right, bottom_left, bottom_right]
    )
    self.assert_original_element_list_equal(
        [top_left, top_right, bottom_left, bottom_right], document.elements
    )

    # Every other preset, paired with the order it should produce.
    preset_expectations = [
        (
            ElementOrdering.RIGHT_TO_LEFT_TOP_TO_BOTTOM,
            [top_right, top_left, bottom_right, bottom_left],
        ),
        (
            ElementOrdering.TOP_TO_BOTTOM_LEFT_TO_RIGHT,
            [top_left, bottom_left, top_right, bottom_right],
        ),
        (
            ElementOrdering.TOP_TO_BOTTOM_RIGHT_TO_LEFT,
            [top_right, bottom_right, top_left, bottom_left],
        ),
    ]
    for preset, expected_order in preset_expectations:
        document = create_pdf_document(
            elements=[top_left, top_right, bottom_left, bottom_right],
            element_ordering=preset,
        )
        self.assert_original_element_list_equal(expected_order, document.elements)

    # A callable may be supplied instead of a preset; this one picks the
    # elements at positions 0, 3, 1, 2 of the list it is given.
    def custom_ordering(elements):
        return [elements[0], elements[3], elements[1], elements[2]]

    document = create_pdf_document(
        elements=[top_left, top_right, bottom_left, bottom_right],
        element_ordering=custom_ordering,
    )
    self.assert_original_element_list_equal(
        [top_left, bottom_right, top_right, bottom_left], document.elements
    )
def test_extract_simple_table_with_gaps_and_wrong_reference(self):
    #     top_left     top_middle     top_right
    #     bottom_left  bottom_middle
    #
    # top_right has nothing underneath it. Using it as the reference element
    # is expected to make extraction fail even though gaps are allowed.
    top_left = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 6, 10))
    top_middle = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 6, 10))
    top_right = FakePDFMinerTextElement(bounding_box=BoundingBox(11, 15, 6, 10))
    bottom_left = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 0, 5))
    bottom_middle = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 0, 5))

    elements = create_pdf_document(
        elements=[top_left, top_middle, top_right, bottom_left, bottom_middle]
    ).elements
    bad_reference = self.extract_element_from_list(top_right, elements)

    with self.assertRaises(TableExtractionError):
        extract_simple_table(
            elements, allow_gaps=True, reference_element=bad_reference
        )
def test_extract_table(self):
    # Checks that simple 2*2 table is correctly extracted
    #
    #     elem_1  elem_2
    #     elem_3  elem_4
    #
    elem_1 = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 6, 10))
    elem_2 = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 6, 10))
    elem_3 = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 0, 5))
    elem_4 = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 0, 5))

    document = create_pdf_document(elements=[elem_1, elem_2, elem_3, elem_4])
    elem_list = document.elements

    result = extract_table(elem_list)
    self.assertEqual(len(result), 2)
    self.assertEqual(len(result[0]), 2)
    self.assertEqual(len(result[1]), 2)
    self.assert_original_element_list_list_equal(
        [[elem_1, elem_2], [elem_3, elem_4]], result
    )

    # Checks that a table with gaps is correctly extracted; unlike
    # extract_simple_table, missing cells come back as None.
    #
    #     elem_1  elem_2          elem_6
    #     elem_3  elem_4  elem_5
    #
    elem_5 = FakePDFMinerTextElement(bounding_box=BoundingBox(11, 15, 0, 5))
    elem_6 = FakePDFMinerTextElement(bounding_box=BoundingBox(16, 20, 6, 10))
    document = create_pdf_document(
        elements=[elem_1, elem_2, elem_3, elem_4, elem_5, elem_6]
    )
    elem_list = document.elements

    result = extract_table(elem_list)
    self.assertEqual(len(result), 2)
    self.assertEqual(len(result[0]), 4)
    self.assertEqual(len(result[1]), 4)
    self.assert_original_element_list_list_equal(
        [[elem_1, elem_2, None, elem_6], [elem_3, elem_4, elem_5, None]], result
    )

    # Checks that it raises an error if one element is in two columns: the
    # replacement elem_2 spans x 3..8 and so overlaps both the x 0..5 column
    # and the x 6..10 column.
    elem_2 = FakePDFMinerTextElement(bounding_box=BoundingBox(3, 8, 6, 10))
    document = create_pdf_document(
        elements=[elem_1, elem_2, elem_3, elem_4, elem_5, elem_6]
    )
    elem_list = document.elements
    with self.assertRaises(TableExtractionError):
        extract_table(elem_list)

    # Checks that it raises an error if one element is in two rows: the
    # replacement elem_2 spans y 3..8 and so overlaps both the y 0..5 row
    # and the y 6..10 row.
    elem_2 = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 3, 8))
    document = create_pdf_document(
        elements=[elem_1, elem_2, elem_3, elem_4, elem_5, elem_6]
    )
    elem_list = document.elements
    with self.assertRaises(TableExtractionError):
        extract_table(elem_list)
def test_filter_partially_within_bounding_box(self, partially_within_mock):
    # partially_within_mock is injected from outside this block (presumably
    # by a patch decorator - confirm). It is configured to report an element
    # as inside the box purely on the basis of its text.
    partially_within_mock.side_effect = (
        lambda self, bounding_box: self.text() == "within"
    )

    elem1 = FakePDFMinerTextElement(text="within")
    elem2 = FakePDFMinerTextElement(text="within")
    elem3 = FakePDFMinerTextElement()
    elem4 = FakePDFMinerTextElement(text="within")
    elem5 = FakePDFMinerTextElement()
    elem6 = FakePDFMinerTextElement(text="within")

    page1 = Page(elements=[elem1, elem2, elem3, elem4], width=100, height=100)
    page2 = Page(elements=[elem5, elem6], width=100, height=100)
    doc = PDFDocument(pages={1: page1, 2: page2})
    elem_list = doc.elements

    pdf_elem1 = self.extract_element_from_list(elem1, elem_list)
    pdf_elem2 = self.extract_element_from_list(elem2, elem_list)
    pdf_elem3 = self.extract_element_from_list(elem3, elem_list)
    pdf_elem4 = self.extract_element_from_list(elem4, elem_list)

    # Filter against the box on page 1 only.
    result = elem_list.filter_partially_within_bounding_box(
        BoundingBox(0, 1, 0, 1), 1
    )

    # The filter should hand the box it was given, unchanged, to
    # partially_within for each of the page 1 elements.
    expected_bbox = BoundingBox(0, 1, 0, 1)
    partially_within_mock.assert_has_calls(
        [
            call(pdf_elem1, expected_bbox),
            call(pdf_elem2, expected_bbox),
            call(pdf_elem3, expected_bbox),
            call(pdf_elem4, expected_bbox),
        ],
        any_order=True,
    )

    # Only the three page 1 elements whose text is "within" are returned.
    self.assertEqual(len(result), 3)
    self.assertIn(pdf_elem1, result)
    self.assertIn(pdf_elem2, result)
    self.assertIn(pdf_elem4, result)
def test_extract_simple_table_with_gaps(self):
    #     top_left     top_middle     top_right
    #     bottom_left  bottom_middle
    #
    # With allow_gaps=True the missing bottom-right cell is reported as None.
    top_left = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 6, 10))
    top_middle = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 6, 10))
    top_right = FakePDFMinerTextElement(bounding_box=BoundingBox(11, 15, 6, 10))
    bottom_left = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 0, 5))
    bottom_middle = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 0, 5))

    document = create_pdf_document(
        elements=[top_left, top_middle, top_right, bottom_left, bottom_middle]
    )
    table = extract_simple_table(document.elements, allow_gaps=True)

    self.assertEqual(len(table), 2)
    for row in table:
        self.assertEqual(len(row), 3)
    self.assert_original_element_list_list_equal(
        [[top_left, top_middle, top_right], [bottom_left, bottom_middle, None]],
        table,
    )
def test_extract_table_from_different_pages(self):
    # Two identical 2x2 grids, one per page, must be extracted as a single
    # four-row table with page 1's rows first:
    #
    #     Page 1:  p1_top_left     p1_top_right
    #              p1_bottom_left  p1_bottom_right
    #
    #     Page 2:  p2_top_left     p2_top_right
    #              p2_bottom_left  p2_bottom_right
    #
    p1_top_left = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 6, 10))
    p1_top_right = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 6, 10))
    p1_bottom_left = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 0, 5))
    p1_bottom_right = FakePDFMinerTextElement(
        bounding_box=BoundingBox(6, 10, 0, 5)
    )

    p2_top_left = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 6, 10))
    p2_top_right = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 6, 10))
    p2_bottom_left = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 0, 5))
    p2_bottom_right = FakePDFMinerTextElement(
        bounding_box=BoundingBox(6, 10, 0, 5)
    )

    elements_by_page = {
        1: [p1_top_left, p1_top_right, p1_bottom_left, p1_bottom_right],
        2: [p2_top_left, p2_top_right, p2_bottom_left, p2_bottom_right],
    }
    document = create_pdf_document(elements=elements_by_page)

    table = extract_table(document.elements)

    self.assertEqual(len(table), 4)
    for row in table:
        self.assertEqual(len(row), 2)
    self.assert_original_element_list_list_equal(
        [
            [p1_top_left, p1_top_right],
            [p1_bottom_left, p1_bottom_right],
            [p2_top_left, p2_top_right],
            [p2_bottom_left, p2_bottom_right],
        ],
        table,
    )
def __init__(
    self,
    bounding_box: "BoundingBox" = BoundingBox(0, 1, 0, 1),
    text: str = "fake_text",
    font_name: str = "fake_font",
    font_size: float = 10,
):
    # The parent constructor is given the box as [x0, y0, x1, y1], so the
    # coordinates are read off the BoundingBox in that order.
    coordinates = [
        bounding_box.x0,
        bounding_box.y0,
        bounding_box.x1,
        bounding_box.y1,
    ]
    super().__init__(bbox=coordinates)

    # Plain attributes holding the fake's text and font details.
    self.text, self.font_name, self.font_size = text, font_name, font_size
def create_pdf_element(
    bounding_box: "BoundingBox" = BoundingBox(0, 1, 0, 1),
    text: str = "fake_text",
    font_name: str = "fake_font",
    font_size: float = 10,
    font_mapping: Optional[Dict[str, str]] = None,
    font_mapping_is_regex: bool = False,
    regex_flags: Union[int, re.RegexFlag] = 0,
    font_size_precision: int = 1,
) -> "PDFElement":
    """
    Build a one-element document and return that element.

    The bounding box, text and font arguments describe the fake pdfminer
    element; the remaining arguments are forwarded unchanged to
    `create_pdf_document`.
    """
    fake_element = FakePDFMinerTextElement(
        bounding_box, text=text, font_name=font_name, font_size=font_size
    )
    document_options = {
        "font_mapping": font_mapping,
        "font_mapping_is_regex": font_mapping_is_regex,
        "regex_flags": regex_flags,
        "font_size_precision": font_size_precision,
    }
    document = create_pdf_document(elements=[fake_element], **document_options)
    return document.elements[0]
def test_document(self):
    # Page 1 elements are supplied already in reading order.
    p1_top_left = FakePDFMinerTextElement(BoundingBox(0, 1, 2, 3))
    p1_top_right = FakePDFMinerTextElement(BoundingBox(2, 3, 2, 3))
    p1_bottom_left = FakePDFMinerTextElement(BoundingBox(0, 1, 0, 1))
    p1_bottom_right = FakePDFMinerTextElement(BoundingBox(2, 3, 0, 1))
    page_1 = Page(
        elements=[p1_top_left, p1_top_right, p1_bottom_left, p1_bottom_right],
        width=100,
        height=100,
    )

    # Page 2 elements are supplied in reverse, so the document must reorder.
    p2_top_left = FakePDFMinerTextElement(BoundingBox(0, 1, 2, 3))
    p2_top_right = FakePDFMinerTextElement(BoundingBox(2, 3, 2, 3))
    p2_bottom_left = FakePDFMinerTextElement(BoundingBox(0, 1, 0, 1))
    p2_bottom_right = FakePDFMinerTextElement(BoundingBox(2, 3, 0, 1))
    page_2 = Page(
        elements=[p2_bottom_right, p2_bottom_left, p2_top_right, p2_top_left],
        width=100,
        height=100,
    )

    document = PDFDocument(pages={1: page_1, 2: page_2})

    # Checks elements were reordered
    self.assertEqual(
        [elem.original_element for elem in document._element_list],
        [
            p1_top_left,
            p1_top_right,
            p1_bottom_left,
            p1_bottom_right,
            p2_top_left,
            p2_top_right,
            p2_bottom_left,
            p2_bottom_right,
        ],
    )

    # Checks indexes were assigned properly
    self.assertEqual(
        [elem._index for elem in document._element_list], list(range(8))
    )

    # Checks page numbers is correct
    self.assertEqual(document.page_numbers, [1, 2])

    # Checks number of pages is correct
    self.assertEqual(document.number_of_pages, 2)

    # Checks pages were assigned properly
    self.assertEqual(
        [elem.page_number for elem in document._element_list],
        [1, 1, 1, 1, 2, 2, 2, 2],
    )

    # Checks pages were instantiated correctly. Each entry is: page number,
    # source page, expected first and last element, expected element indexes.
    page_expectations = [
        (1, page_1, p1_top_left, p1_bottom_right, {0, 1, 2, 3}),
        (2, page_2, p2_top_left, p2_bottom_right, {4, 5, 6, 7}),
    ]
    pdf_pages = []
    for number, source_page, first, last, indexes in page_expectations:
        pdf_page = document.get_page(number)
        self.assertEqual(source_page.width, pdf_page.width)
        self.assertEqual(source_page.height, pdf_page.height)
        self.assertEqual(first, pdf_page.start_element.original_element)
        self.assertEqual(last, pdf_page.end_element.original_element)
        self.assertEqual(pdf_page.page_number, number)
        self.assertEqual(pdf_page.elements, ElementList(document, indexes))
        pdf_pages.append(pdf_page)

    self.assertEqual(document.pages, pdf_pages)
    self.assertEqual(
        document.elements, ElementList(document, {0, 1, 2, 3, 4, 5, 6, 7})
    )

    # Asking for a page that does not exist must raise.
    with self.assertRaises(PageNotFoundError):
        document.get_page(3)
class TestPDFElement(BaseTestCase):
    # Box of the element used by the *_within tests: x spans 2..5, y spans 2..5.
    element_bbox = BoundingBox(2, 5, 2, 5)

    def test_page_number(self):
        element = create_pdf_element()
        self.assertEqual(element.page_number, 1)

        # page_number cannot be assigned to
        with self.assertRaises(AttributeError):
            element.page_number = 2

    def test_font_name(self):
        element = create_pdf_element(font_name="test_font")
        self.assertEqual(element.font_name, "test_font")

    def test_font_size(self):
        element = create_pdf_element(font_size=2)
        self.assertEqual(element.font_size, 2)

    def test_font_size_precision(self):
        # Without an explicit precision the size is reported to 1 decimal place
        element = create_pdf_element(font_size=1.234)
        self.assertEqual(element.font_size, 1.2)

        element = create_pdf_element(font_size=1.234, font_size_precision=0)
        self.assertEqual(element.font_size, 1)

        element = create_pdf_element(font_size=1.234, font_size_precision=3)
        self.assertEqual(element.font_size, 1.234)

    def test_font(self):
        # With no mapping the font is reported as "<name>,<size>"
        element = create_pdf_element(font_name="test_font", font_size=2)
        self.assertEqual(element.font, "test_font,2")

        # A mapping entry matching "<name>,<size>" replaces the font
        element = create_pdf_element(
            font_name="test_font",
            font_size=3,
            font_mapping={"test_font,3": "test_named_font"},
        )
        self.assertEqual(element.font, "test_named_font")

        # A mapping entry for a different size does not apply
        element = create_pdf_element(
            font_name="test_font",
            font_size=2,
            font_mapping={"test_font,3": "test_named_font"},
        )
        self.assertEqual(element.font, "test_font,2")

        # Test when font_mapping argument is passed to PDFDocument
        font_mapping = {}
        element = create_pdf_element(
            font_name="fake_font_1", font_size=10, font_mapping=font_mapping
        )
        self.assertEqual(element.font, "fake_font_1,10")

        font_mapping = {"fake_font_1,10": "large_text"}
        element = create_pdf_element(
            font_name="fake_font_1", font_size=10, font_mapping=font_mapping
        )
        self.assertEqual(element.font, "large_text")

        # Mapping keys treated as regular expressions
        font_mapping = {r"^fake_font_\d,10$": "large_text"}
        element = create_pdf_element(
            font_name="fake_font_1",
            font_size=10,
            font_mapping=font_mapping,
            font_mapping_is_regex=True,
        )
        self.assertEqual(element.font, "large_text")

        # Without flags the regex is case sensitive, so nothing is mapped
        font_mapping = {r"^fake_font_\d,10$": "large_text"}
        element = create_pdf_element(
            font_name="FAKE_FONT_1",
            font_size=10,
            font_mapping=font_mapping,
            font_mapping_is_regex=True,
        )
        self.assertEqual(element.font, "FAKE_FONT_1,10")

        # re.IGNORECASE makes the same pattern match the upper-case name
        font_mapping = {r"^fake_font_\d,10$": "large_text"}
        element = create_pdf_element(
            font_name="FAKE_FONT_1",
            font_size=10,
            font_mapping=font_mapping,
            font_mapping_is_regex=True,
            regex_flags=re.IGNORECASE,
        )
        self.assertEqual(element.font, "large_text")

    def test_text(self):
        element = create_pdf_element(text=" test ")
        self.assertEqual(element.text(), "test")
        self.assertEqual(element.text(stripped=False), " test ")

    def test_add_tag(self):
        element = create_pdf_element()
        self.assertEqual(element.tags, set())

        element.add_tag("foo")
        self.assertEqual(element.tags, {"foo"})

        # Adding the same tag twice has no further effect
        element.add_tag("foo")
        self.assertEqual(element.tags, {"foo"})

        element.add_tag("bar")
        self.assertEqual(element.tags, {"foo", "bar"})

    def test_repr(self):
        element = create_pdf_element(font_name="test_font", font_size=2)
        self.assertEqual(repr(element), "<PDFElement tags: set(), font: 'test_font,2'>")

        element.add_tag("foo")
        self.assertEqual(
            repr(element), "<PDFElement tags: {'foo'}, font: 'test_font,2'>"
        )

        element.ignore()
        self.assertEqual(
            repr(element), "<PDFElement tags: {'foo'}, font: 'test_font,2', ignored>"
        )

    @data(
        BoundingBox(1, 6, 1, 6),  # This box fully encloses the element
        BoundingBox(1, 6, 0, 3),  # This box intersects the bottom of the element
        BoundingBox(1, 6, 0, 2),  # This box touches the bottom of the element
        BoundingBox(1, 6, 4, 6),  # This box intersects the top of the element
        BoundingBox(1, 6, 5, 6),  # This box touches the top of the element
        BoundingBox(1, 6, 3, 4),  # This box goes through center horizontally
        BoundingBox(1, 3, 1, 6),  # This box intersects the left of the element
        BoundingBox(1, 2, 1, 6),  # This box touches the left of the element
        BoundingBox(4, 6, 1, 6),  # This box intersects the right of the element
        BoundingBox(5, 6, 1, 6),  # This box touches the right of the element
        BoundingBox(3, 4, 1, 6),  # This box goes through the center vertically
        BoundingBox(3, 4, 3, 4),  # This box is enclosed inside the element
    )
    def test_partially_within_true(self, bounding_box):
        element = create_pdf_element(self.element_bbox)
        self.assertTrue(element.partially_within(bounding_box))

    @data(
        BoundingBox(1, 6, 0, 1),  # This box is underneath the element
        BoundingBox(1, 6, 6, 7),  # This box is above the element
        BoundingBox(0, 1, 1, 6),  # This box is to the left of the element
        BoundingBox(6, 7, 1, 6),  # This box is to the right of the element
    )
    def test_partially_within_false(self, bounding_box):
        element = create_pdf_element(self.element_bbox)
        self.assertFalse(element.partially_within(bounding_box))

    @data(BoundingBox(1, 6, 1, 6))  # This box fully encloses the element
    def test_entirely_within_true(self, bounding_box):
        element = create_pdf_element(self.element_bbox)
        self.assertTrue(element.entirely_within(bounding_box))

    @data(
        BoundingBox(1, 6, 0, 3),  # This box intersects the bottom of the element
        BoundingBox(1, 6, 0, 2),  # This box touches the bottom of the element
        BoundingBox(1, 6, 4, 6),  # This box intersects the top of the element
        BoundingBox(1, 6, 5, 6),  # This box touches the top of the element
        BoundingBox(1, 6, 3, 4),  # This box goes through center horizontally
        BoundingBox(1, 3, 1, 6),  # This box intersects the left of the element
        BoundingBox(1, 2, 1, 6),  # This box touches the left of the element
        BoundingBox(4, 6, 1, 6),  # This box intersects the right of the element
        BoundingBox(5, 6, 1, 6),  # This box touches the right of the element
        BoundingBox(3, 4, 1, 6),  # This box goes through the center vertically
        BoundingBox(1, 6, 0, 1),  # This box is underneath the element
        BoundingBox(1, 6, 6, 7),  # This box is above the element
        BoundingBox(0, 1, 1, 6),  # This box is to the left of the element
        BoundingBox(6, 7, 1, 6),  # This box is to the right of the element
        BoundingBox(3, 4, 3, 4),  # This box is enclosed inside the element
    )
    def test_entirely_within_false(self, bounding_box):
        element = create_pdf_element(self.element_bbox)
        self.assertFalse(element.entirely_within(bounding_box))
def test_horizontally_in_line_with(self, partially_within_mock):
    # partially_within_mock is injected from outside this block (presumably
    # by a patch decorator - confirm). It is configured to report an element
    # as inside the box purely on the basis of its text.
    partially_within_mock.side_effect = (
        lambda self, bounding_box: self.text() == "within"
    )

    elem1 = FakePDFMinerTextElement(
        text="within", bounding_box=BoundingBox(50, 51, 50, 51)
    )
    elem2 = FakePDFMinerTextElement(text="within")
    elem3 = FakePDFMinerTextElement()
    elem4 = FakePDFMinerTextElement(text="within")
    elem5 = FakePDFMinerTextElement()
    elem6 = FakePDFMinerTextElement(text="within")

    page1 = Page(elements=[elem1, elem2, elem3, elem4], width=100, height=100)
    page2 = Page(elements=[elem5, elem6], width=100, height=100)
    doc = PDFDocument(pages={1: page1, 2: page2})
    elem_list = doc.elements

    pdf_elem1 = self.extract_element_from_list(elem1, elem_list)
    pdf_elem2 = self.extract_element_from_list(elem2, elem_list)
    pdf_elem3 = self.extract_element_from_list(elem3, elem_list)
    pdf_elem4 = self.extract_element_from_list(elem4, elem_list)

    result = elem_list.horizontally_in_line_with(pdf_elem1)

    # expected_bbox spans the full width of the page (x from 0 to 100) over
    # the vertical extent of elem1 (y from 50 to 51)
    expected_bbox = BoundingBox(0, 100, 50, 51)
    partially_within_mock.assert_has_calls(
        [
            call(pdf_elem1, expected_bbox),
            call(pdf_elem2, expected_bbox),
            call(pdf_elem3, expected_bbox),
            call(pdf_elem4, expected_bbox),
        ],
        any_order=True,
    )

    # By default the reference element itself is left out of the result
    self.assertEqual(len(result), 2)
    self.assertIn(pdf_elem2, result)
    self.assertIn(pdf_elem4, result)

    # Also test with inclusive=True
    partially_within_mock.reset_mock()
    result = elem_list.horizontally_in_line_with(pdf_elem1, inclusive=True)
    partially_within_mock.assert_has_calls(
        [
            call(pdf_elem1, expected_bbox),
            call(pdf_elem2, expected_bbox),
            call(pdf_elem3, expected_bbox),
            call(pdf_elem4, expected_bbox),
        ],
        any_order=True,
    )
    self.assertEqual(len(result), 3)
    self.assertIn(pdf_elem1, result)
    self.assertIn(pdf_elem2, result)
    self.assertIn(pdf_elem4, result)

    # Test specifying tolerance: the box is expected to shrink by the
    # tolerance at both its bottom and top edges (y from 50.1 to 50.9)
    expected_bbox = BoundingBox(0, 100, 50.1, 50.9)
    partially_within_mock.reset_mock()
    result = elem_list.horizontally_in_line_with(pdf_elem1, tolerance=0.1)
    partially_within_mock.assert_has_calls(
        [
            call(pdf_elem1, expected_bbox),
            call(pdf_elem2, expected_bbox),
            call(pdf_elem3, expected_bbox),
            call(pdf_elem4, expected_bbox),
        ],
        any_order=True,
    )
def test_extract_text_from_table(self):
    # With as_text=True a 2x2 grid is returned as cell text, not elements:
    #
    #     fake_text_1  fake_text_2
    #     fake_text_3  fake_text_4   (stored with a trailing space)
    #
    top_left = FakePDFMinerTextElement(
        bounding_box=BoundingBox(0, 5, 6, 10), text="fake_text_1"
    )
    top_right = FakePDFMinerTextElement(
        bounding_box=BoundingBox(6, 10, 6, 10), text="fake_text_2"
    )
    bottom_left = FakePDFMinerTextElement(
        bounding_box=BoundingBox(0, 5, 0, 5), text="fake_text_3"
    )
    bottom_right = FakePDFMinerTextElement(
        bounding_box=BoundingBox(6, 10, 0, 5), text="fake_text_4 "
    )
    elements = create_pdf_document(
        elements=[top_left, top_right, bottom_left, bottom_right]
    ).elements

    # By default the cell text is stripped.
    table = extract_table(elements, as_text=True)
    self.assertEqual(len(table), 2)
    for row in table:
        self.assertEqual(len(row), 2)
    self.assertListEqual(
        [["fake_text_1", "fake_text_2"], ["fake_text_3", "fake_text_4"]], table
    )

    # strip_text=False keeps the trailing space of the last cell.
    table = extract_table(elements, as_text=True, strip_text=False)
    self.assertListEqual(
        [["fake_text_1", "fake_text_2"], ["fake_text_3", "fake_text_4 "]], table
    )

    # Two more elements turn the grid into a table with gaps; in text mode
    # an empty cell is reported as an empty string:
    #
    #     fake_text_1  fake_text_2               fake_text_6
    #     fake_text_3  fake_text_4  fake_text_5
    #
    bottom_third = FakePDFMinerTextElement(
        bounding_box=BoundingBox(11, 15, 0, 5), text="fake_text_5"
    )
    top_fourth = FakePDFMinerTextElement(
        bounding_box=BoundingBox(16, 20, 6, 10), text="fake_text_6"
    )
    elements = create_pdf_document(
        elements=[
            top_left,
            top_right,
            bottom_left,
            bottom_right,
            bottom_third,
            top_fourth,
        ]
    ).elements

    table = extract_table(elements, as_text=True)
    self.assertEqual(len(table), 2)
    for row in table:
        self.assertEqual(len(row), 4)
    self.assertListEqual(
        [
            ["fake_text_1", "fake_text_2", "", "fake_text_6"],
            ["fake_text_3", "fake_text_4", "fake_text_5", ""],
        ],
        table,
    )

    table = extract_table(elements, as_text=True, strip_text=False)
    self.assertListEqual(
        [
            ["fake_text_1", "fake_text_2", "", "fake_text_6"],
            ["fake_text_3", "fake_text_4 ", "fake_text_5", ""],
        ],
        table,
    )
def test_extract_table_removing_duplicate_header_different_fonts_or_text(self):
    # Three rows that all look like headers; rows 2 and 3 each differ from
    # row 1 and so must survive remove_duplicate_header_rows:
    #
    #     header_left        header_right
    #     bigger_font_left   bigger_font_right   (left cell uses font size 12)
    #     other_text_left    other_text_right    (left cell has different text)
    #
    def make_header(text, bounding_box, font_size=10):
        return FakePDFMinerTextElement(
            text=text,
            font_name="header font",
            font_size=font_size,
            bounding_box=bounding_box,
        )

    header_left = make_header("header 1", BoundingBox(0, 5, 21, 25))
    header_right = make_header("header 2", BoundingBox(11, 15, 21, 25))

    bigger_font_left = make_header(
        "header 1", BoundingBox(0, 5, 16, 20), font_size=12
    )
    # NOTE(review): this right-hand cell carries "header 1" rather than
    # "header 2", so the second row also differs from the header by text -
    # confirm whether that is intended.
    bigger_font_right = make_header("header 1", BoundingBox(11, 15, 16, 20))

    other_text_left = make_header(
        "header with a different name", BoundingBox(0, 5, 11, 15)
    )
    other_text_right = make_header("header 2", BoundingBox(11, 15, 11, 15))

    document = create_pdf_document(
        elements=[
            header_left,
            header_right,
            bigger_font_left,
            bigger_font_right,
            other_text_left,
            other_text_right,
        ]
    )

    table = extract_table(document.elements, remove_duplicate_header_rows=True)

    self.assertEqual(len(table), 3)
    for row in table:
        self.assertEqual(len(row), 2)
    self.assert_original_element_list_list_equal(
        [
            [header_left, header_right],
            [bigger_font_left, bigger_font_right],
            [other_text_left, other_text_right],
        ],
        table,
    )
def test_extract_table_removing_duplicate_header_rows(self):
    def make_header(text, bounding_box):
        return FakePDFMinerTextElement(
            text=text,
            font_name="header font",
            font_size=10,
            bounding_box=bounding_box,
        )

    # A table consisting of the header row only:
    #
    #     header_left            header_right
    #
    header_left = make_header("header 1", BoundingBox(0, 5, 21, 25))
    header_right = make_header("header 2", BoundingBox(11, 15, 21, 25))

    document = create_pdf_document(elements=[header_left, header_right])
    table = extract_table(document.elements, remove_duplicate_header_rows=True)

    # A single-row table cannot contain a duplicate of its own header, so
    # it must come back untouched.
    self.assertEqual(len(table), 1)
    self.assertEqual(len(table[0]), 2)
    self.assert_original_element_list_list_equal(
        [[header_left, header_right]], table
    )

    # A three-column layout in which the header is repeated twice:
    #
    #     header_left                     header_right
    #     body_1_left     body_1_middle
    #     repeat_left                     repeat_right
    #     body_2_left                     body_2_right
    #     shifted_left    shifted_middle
    #
    body_1_left = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 16, 20))
    body_1_middle = FakePDFMinerTextElement(
        bounding_box=BoundingBox(6, 10, 16, 20)
    )
    repeat_left = make_header("header 1", BoundingBox(0, 5, 11, 15))
    repeat_right = make_header("header 2", BoundingBox(11, 15, 11, 15))
    body_2_left = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 6, 10))
    body_2_right = FakePDFMinerTextElement(bounding_box=BoundingBox(11, 15, 6, 10))
    shifted_left = make_header("header 1", BoundingBox(0, 5, 0, 5))
    shifted_middle = make_header("header 2", BoundingBox(6, 10, 0, 5))

    document = create_pdf_document(
        elements=[
            header_left,
            header_right,
            body_1_left,
            body_1_middle,
            repeat_left,
            repeat_right,
            body_2_left,
            body_2_right,
            shifted_left,
            shifted_middle,
        ]
    )
    table = extract_table(document.elements, remove_duplicate_header_rows=True)

    # The exact repeat is dropped. The last row will not be removed as the
    # gaps do not match the header row.
    self.assertEqual(len(table), 4)
    for row in table:
        self.assertEqual(len(row), 3)
    self.assert_original_element_list_list_equal(
        [
            [header_left, None, header_right],
            [body_1_left, body_1_middle, None],
            [body_2_left, None, body_2_right],
            [shifted_left, shifted_middle, None],
        ],
        table,
    )
def test_repr(self):
    # The repr lists the coordinates in the order x0, x1, y0, y1.
    unit_box = BoundingBox(0, 1, 0, 1)
    expected_repr = "<BoundingBox x0=0, x1=1, y0=0, y1=1>"
    self.assertEqual(repr(unit_box), expected_repr)