Exemplo n.º 1
0
    def test_fix_element_in_multiple_cols(self):
        # Checks that the following table is correctly extracted, where the
        # first row holds a single element stretching across both columns:
        # ---------
        # | 1     |
        # --------|
        # | 2 | 3 |
        # ---------

        elem_1 = FakePDFMinerTextElement(
            bounding_box=BoundingBox(0, 10, 6, 10), text="fake_text_1"
        )
        elem_2 = FakePDFMinerTextElement(
            bounding_box=BoundingBox(0, 5, 0, 5), text="fake_text_2"
        )
        elem_3 = FakePDFMinerTextElement(
            bounding_box=BoundingBox(6, 10, 0, 5), text="fake_text_3"
        )

        document = create_pdf_document(elements=[elem_1, elem_2, elem_3])
        elem_list = document.elements

        # Without the fix enabled, extraction of this layout must fail. The
        # call raises, so there is no return value worth binding here.
        with self.assertRaises(TableExtractionError):
            extract_table(elem_list, as_text=True)

        # With the fix enabled, the spanning element is reported once, in the
        # first cell of its row, and the remaining cell of that row is empty.
        result = extract_table(
            elem_list, as_text=True, fix_element_in_multiple_cols=True
        )
        self.assertEqual(len(result), 2)
        self.assertEqual(len(result[0]), 2)
        self.assertEqual(len(result[1]), 2)
        self.assertListEqual(
            [["fake_text_1", ""], ["fake_text_2", "fake_text_3"]], result
        )
Exemplo n.º 2
0
    def test_extract_table_with_tolerance(self):
        # A simple 2*2 table:
        #
        #       elem_1      elem_2
        #       elem_3      elem_4
        #
        # elem_4 slightly overlaps elem_2 (its box ends at 6.1 where elem_2's
        # begins at 6). Extraction with default settings is expected to fail;
        # passing tolerance=0.2 is expected to make it succeed.
        top_left = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 6, 10))
        top_right = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 6, 10))
        bottom_left = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 0, 5))
        bottom_right = FakePDFMinerTextElement(
            bounding_box=BoundingBox(6, 10, 0, 6.1)
        )
        expected_grid = [[top_left, top_right], [bottom_left, bottom_right]]

        elements = create_pdf_document(
            elements=[top_left, top_right, bottom_left, bottom_right]
        ).elements

        # Default settings: the overlap makes extraction fail.
        self.assertRaises(TableExtractionError, extract_table, elements)

        # With a tolerance the same elements form a clean 2*2 grid.
        table = extract_table(elements, tolerance=0.2)
        self.assertEqual(len(table), 2)
        for row in table:
            self.assertEqual(len(row), 2)
        self.assert_original_element_list_list_equal(expected_grid, table)
Exemplo n.º 3
0
    def test_extract_table(self):
        # Checks that simple 2*2 table is correctly extracted
        #
        #       elem_1      elem_2
        #       elem_3      elem_4
        #
        elem_1 = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 6, 10))
        elem_2 = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 6, 10))
        elem_3 = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 0, 5))
        elem_4 = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 0, 5))

        document = create_pdf_document(elements=[elem_1, elem_2, elem_3, elem_4])
        elem_list = document.elements

        result = extract_table(elem_list)
        self.assertEqual(len(result), 2)
        self.assertEqual(len(result[0]), 2)
        self.assertEqual(len(result[1]), 2)
        self.assert_original_element_list_list_equal(
            [[elem_1, elem_2], [elem_3, elem_4]], result
        )

        # Checks that the following table is correctly extracted; cells with
        # no element are expected to come back as None.
        #
        #       elem_1      elem_2                  elem_6
        #       elem_3      elem_4      elem_5
        #
        elem_5 = FakePDFMinerTextElement(bounding_box=BoundingBox(11, 15, 0, 5))
        elem_6 = FakePDFMinerTextElement(bounding_box=BoundingBox(16, 20, 6, 10))
        document = create_pdf_document(
            elements=[elem_1, elem_2, elem_3, elem_4, elem_5, elem_6]
        )
        elem_list = document.elements
        result = extract_table(elem_list)
        self.assertEqual(len(result), 2)
        self.assertEqual(len(result[0]), 4)
        self.assertEqual(len(result[1]), 4)
        self.assert_original_element_list_list_equal(
            [[elem_1, elem_2, None, elem_6], [elem_3, elem_4, elem_5, None]], result
        )

        # Checks that it raises an error if one element is in two columns.
        # Going by the layouts drawn above, the first two BoundingBox arguments
        # are the horizontal extent; widening elem_2 to 3..8 makes it overlap
        # both the elem_1/elem_3 column (0..5) and the elem_4 column (6..10).
        elem_2 = FakePDFMinerTextElement(bounding_box=BoundingBox(3, 8, 6, 10))
        document = create_pdf_document(
            elements=[elem_1, elem_2, elem_3, elem_4, elem_5, elem_6]
        )
        elem_list = document.elements
        with self.assertRaises(TableExtractionError):
            extract_table(elem_list)

        # Checks that it raises an error if one element is in two rows.
        # Here the vertical extent (last two arguments) is changed to 3..8, so
        # the element overlaps both the lower row (0..5) and upper row (6..10).
        elem_2 = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 3, 8))
        document = create_pdf_document(
            elements=[elem_1, elem_2, elem_3, elem_4, elem_5, elem_6]
        )
        elem_list = document.elements
        with self.assertRaises(TableExtractionError):
            extract_table(elem_list)
Exemplo n.º 4
0
    def test_extract_table_from_different_pages(self):
        # Two identical 2*2 grids, one per page, extracted in a single call.
        # The expected result lists page 1's rows followed by page 2's rows.
        #
        # Page 1:
        #       elem_p1_1      elem_p1_2
        #       elem_p1_3      elem_p1_4
        #
        # Page 2:
        #       elem_p2_1      elem_p2_2
        #       elem_p2_3      elem_p2_4
        #
        def build_grid_elements():
            # Order: top-left, top-right, bottom-left, bottom-right.
            return [
                FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 6, 10)),
                FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 6, 10)),
                FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 0, 5)),
                FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 0, 5)),
            ]

        first_page = build_grid_elements()
        second_page = build_grid_elements()

        # Keys of the mapping are the page numbers.
        document = create_pdf_document(
            elements={1: list(first_page), 2: list(second_page)}
        )
        table = extract_table(document.elements)

        self.assertEqual(len(table), 4)
        for row in table:
            self.assertEqual(len(row), 2)

        expected_rows = [
            first_page[:2],
            first_page[2:],
            second_page[:2],
            second_page[2:],
        ]
        self.assert_original_element_list_list_equal(expected_rows, table)
Exemplo n.º 5
0
    def test_extract_text_from_table(self):
        # Text extraction from a 2*2 table, with and without stripping:
        #
        #       elem_1      elem_2
        #       elem_3      elem_4
        #
        # elem_4's text ends with a space so the effect of strip_text shows up
        # in the expected values.
        def text_element(box, text):
            return FakePDFMinerTextElement(bounding_box=box, text=text)

        cell_1 = text_element(BoundingBox(0, 5, 6, 10), "fake_text_1")
        cell_2 = text_element(BoundingBox(6, 10, 6, 10), "fake_text_2")
        cell_3 = text_element(BoundingBox(0, 5, 0, 5), "fake_text_3")
        cell_4 = text_element(BoundingBox(6, 10, 0, 5), "fake_text_4 ")

        elements = create_pdf_document(
            elements=[cell_1, cell_2, cell_3, cell_4]
        ).elements

        # Default: text comes back stripped.
        stripped = extract_table(elements, as_text=True)
        self.assertEqual(len(stripped), 2)
        for row in stripped:
            self.assertEqual(len(row), 2)
        self.assertListEqual(
            [["fake_text_1", "fake_text_2"], ["fake_text_3", "fake_text_4"]],
            stripped,
        )

        # strip_text=False keeps the trailing space.
        unstripped = extract_table(elements, as_text=True, strip_text=False)
        self.assertListEqual(
            [["fake_text_1", "fake_text_2"], ["fake_text_3", "fake_text_4 "]],
            unstripped,
        )

        # Same checks on a table with gaps; empty cells are expected as "".
        #
        #       elem_1      elem_2                  elem_6
        #       elem_3      elem_4      elem_5
        #
        cell_5 = text_element(BoundingBox(11, 15, 0, 5), "fake_text_5")
        cell_6 = text_element(BoundingBox(16, 20, 6, 10), "fake_text_6")
        elements = create_pdf_document(
            elements=[cell_1, cell_2, cell_3, cell_4, cell_5, cell_6]
        ).elements

        stripped = extract_table(elements, as_text=True)
        self.assertEqual(len(stripped), 2)
        for row in stripped:
            self.assertEqual(len(row), 4)
        expected_stripped = [
            ["fake_text_1", "fake_text_2", "", "fake_text_6"],
            ["fake_text_3", "fake_text_4", "fake_text_5", ""],
        ]
        self.assertListEqual(expected_stripped, stripped)

        unstripped = extract_table(elements, as_text=True, strip_text=False)
        expected_unstripped = [
            ["fake_text_1", "fake_text_2", "", "fake_text_6"],
            ["fake_text_3", "fake_text_4 ", "fake_text_5", ""],
        ]
        self.assertListEqual(expected_unstripped, unstripped)
Exemplo n.º 6
0
    def test_extract_table_removing_duplicate_header_different_fonts_or_text(self):
        # Rows that resemble the header but differ from it in font or in text
        # must NOT be removed when remove_duplicate_header_rows is set.
        #
        #    header_elem_1                     header_elem_2
        #    header_elem_3_different_font      header_elem_4
        #    header_elem_5_different_text      header_elem_6
        #
        header_elem_1 = FakePDFMinerTextElement(
            text="header 1",
            font_name="header font",
            font_size=10,
            bounding_box=BoundingBox(0, 5, 21, 25),
        )
        header_elem_2 = FakePDFMinerTextElement(
            text="header 2",
            font_name="header font",
            font_size=10,
            bounding_box=BoundingBox(11, 15, 21, 25),
        )
        # Same text as header_elem_1, but a different font size.
        header_elem_3_different_font = FakePDFMinerTextElement(
            text="header 1",
            font_name="header font",
            font_size=12,
            bounding_box=BoundingBox(0, 5, 16, 20),
        )
        # Identical to header_elem_2 in text and font, so that the only
        # difference between this row and the header row is the font of
        # header_elem_3_different_font. (Previously this used "header 1",
        # which made the row differ by text as well and left the font
        # comparison untested.)
        header_elem_4 = FakePDFMinerTextElement(
            text="header 2",
            font_name="header font",
            font_size=10,
            bounding_box=BoundingBox(11, 15, 16, 20),
        )
        # Same font as header_elem_1, but different text.
        header_elem_5_different_text = FakePDFMinerTextElement(
            text="header with a different name",
            font_name="header font",
            font_size=10,
            bounding_box=BoundingBox(0, 5, 11, 15),
        )
        header_elem_6 = FakePDFMinerTextElement(
            text="header 2",
            font_name="header font",
            font_size=10,
            bounding_box=BoundingBox(11, 15, 11, 15),
        )

        document = create_pdf_document(
            elements=[
                header_elem_1,
                header_elem_2,
                header_elem_3_different_font,
                header_elem_4,
                header_elem_5_different_text,
                header_elem_6,
            ]
        )
        elem_list = document.elements

        # All three rows must survive: neither of the later rows is an exact
        # duplicate of the header row.
        result = extract_table(elem_list, remove_duplicate_header_rows=True)
        self.assertEqual(len(result), 3)
        self.assertEqual(len(result[0]), 2)
        self.assertEqual(len(result[1]), 2)
        self.assertEqual(len(result[2]), 2)
        self.assert_original_element_list_list_equal(
            [
                [header_elem_1, header_elem_2],
                [header_elem_3_different_font, header_elem_4],
                [header_elem_5_different_text, header_elem_6],
            ],
            result,
        )
Exemplo n.º 7
0
    def test_extract_table_removing_duplicate_header_rows(self):
        def header_cell(label, box):
            # Every header in this test shares the same font name and size.
            return FakePDFMinerTextElement(
                text=label,
                font_name="header font",
                font_size=10,
                bounding_box=box,
            )

        def plain_cell(box):
            return FakePDFMinerTextElement(bounding_box=box)

        # Case 1: a table consisting of the header row only.
        #
        #    header_elem_1    header_elem_2
        header_1 = header_cell("header 1", BoundingBox(0, 5, 21, 25))
        header_2 = header_cell("header 2", BoundingBox(11, 15, 21, 25))
        elements = create_pdf_document(elements=[header_1, header_2]).elements

        table = extract_table(elements, remove_duplicate_header_rows=True)
        # A single-row table cannot contain a duplicate of its own header, so
        # the table is expected back unchanged.
        self.assertEqual(len(table), 1)
        self.assertEqual(len(table[0]), 2)
        self.assert_original_element_list_list_equal([[header_1, header_2]], table)

        # Case 2: the header repeats further down the table.
        #
        #    header_elem_1                     header_elem_2
        #       elem_1           elem_2
        #    header_elem_3                     header_elem_4
        #       elem_3                         elem_4
        #    header_elem_5    header_elem_6
        #
        body_1 = plain_cell(BoundingBox(0, 5, 16, 20))
        body_2 = plain_cell(BoundingBox(6, 10, 16, 20))
        header_3 = header_cell("header 1", BoundingBox(0, 5, 11, 15))
        header_4 = header_cell("header 2", BoundingBox(11, 15, 11, 15))
        body_3 = plain_cell(BoundingBox(0, 5, 6, 10))
        body_4 = plain_cell(BoundingBox(11, 15, 6, 10))
        header_5 = header_cell("header 1", BoundingBox(0, 5, 0, 5))
        header_6 = header_cell("header 2", BoundingBox(6, 10, 0, 5))

        all_cells = [
            header_1,
            header_2,
            body_1,
            body_2,
            header_3,
            header_4,
            body_3,
            body_4,
            header_5,
            header_6,
        ]
        elements = create_pdf_document(elements=all_cells).elements

        table = extract_table(elements, remove_duplicate_header_rows=True)
        # The header_elem_3/header_elem_4 row is absent from the expected
        # result. The last row will not be removed as the gaps do not match
        # the header row.
        self.assertEqual(len(table), 4)
        for row in table:
            self.assertEqual(len(row), 3)
        expected_rows = [
            [header_1, None, header_2],
            [body_1, body_2, None],
            [body_3, None, body_4],
            [header_5, header_6, None],
        ]
        self.assert_original_element_list_list_equal(expected_rows, table)
Exemplo n.º 8
0
    def test_output_is_correct(self):
        # End-to-end check against the example PDF: each section of the file
        # is located via its heading and run through the matching extractor.
        here = os.path.dirname(__file__)
        file_path = os.path.join(here, "../../docs/source/example_files/tables.pdf")

        # Step 1 - Load the file, labelling the two fonts it uses
        document = load_file(
            file_path,
            font_mapping={
                "BAAAAA+LiberationSerif-Bold,12.0": "header",
                "CAAAAA+LiberationSerif,12.0": "table_element",
            },
        )

        headers = document.elements.filter_by_font("header")

        # Step 2 - Find the heading that introduces each section, in the
        # order the sections are consumed below.
        section_titles = [
            "Simple Table",
            "Simple Table with gaps",
            "Simple Table with gaps in first row/col",
            "Non Simple Table",
            "Non Simple Table with Merged Columns",
            "Non Simple Table with Merged Rows and Columns",
            "Over the page",
        ]
        section_headers = [
            headers.filter_by_text_equal(title).extract_single_element()
            for title in section_titles
        ]

        # Step 3 - A section's elements are those between its heading and the
        # next one; the final section is everything after the last heading.
        sections = [
            document.elements.between(start, end)
            for start, end in zip(section_headers, section_headers[1:])
        ]
        sections.append(document.elements.after(section_headers[-1]))
        (
            simple_elems,
            gaps_elems,
            gaps_first_row_col_elems,
            non_simple_elems,
            merged_cols_elems,
            merged_rows_cols_elems,
            over_the_page_elems,
        ) = sections

        # Simple Table
        extracted = tables.extract_simple_table(simple_elems, as_text=True)
        self.assertListEqual(
            extracted,
            [
                ["Heading 1", "Heading 2", "Heading 3", "Heading 4"],
                ["A", "1", "A", "1"],
                ["B", "2", "B", "2"],
                ["C", "3", "C", "3"],
            ],
        )

        # Simple Table with gaps: fails unless gaps are explicitly allowed
        self.assertRaises(
            TableExtractionError,
            tables.extract_simple_table,
            gaps_elems,
            as_text=True,
        )

        extracted = tables.extract_simple_table(
            gaps_elems, as_text=True, allow_gaps=True
        )
        self.assertListEqual(
            extracted,
            [
                ["Heading 1", "Heading 2", "Heading 3", "Heading 4"],
                ["A", "1", "", "1"],
                ["B", "", "", ""],
                ["C", "", "C", "3"],
            ],
        )

        # Simple Table with gaps in first row/col: allow_gaps alone is not
        # enough here, a reference element has to be supplied as well.
        self.assertRaises(
            TableExtractionError,
            tables.extract_simple_table,
            gaps_first_row_col_elems,
            as_text=True,
            allow_gaps=True,
        )

        extracted = tables.extract_simple_table(
            gaps_first_row_col_elems,
            as_text=True,
            allow_gaps=True,
            reference_element=gaps_first_row_col_elems[9],
        )
        self.assertListEqual(
            extracted,
            [
                ["Heading 1", "Heading 2", "", "Heading 4"],
                ["", "1", "A", ""],
                ["B", "2", "", "2"],
                ["C", "3", "C", "3"],
            ],
        )

        # Non Simple Table
        extracted = tables.extract_table(non_simple_elems, as_text=True)
        self.assertListEqual(
            extracted,
            [
                ["", "Heading 2", "Heading 3", "Heading 4"],
                ["A", "1", "", "1"],
                ["B", "", "B", "2"],
                ["C", "3", "C", ""],
            ],
        )

        # Non Simple Table with Merged Columns: fails without the column fix
        self.assertRaises(
            TableExtractionError,
            tables.extract_table,
            merged_cols_elems,
            as_text=True,
        )

        extracted = tables.extract_table(
            merged_cols_elems, as_text=True, fix_element_in_multiple_cols=True
        )
        self.assertListEqual(
            extracted,
            [
                ["Heading 1", "Heading 2", "Heading 3", "Heading 4"],
                ["A", "1", "A", "1"],
                ["This text spans across multiple columns", "", "B", "2"],
                ["C", "3", "C", "3"],
            ],
        )

        # Non Simple Table with Merged Rows and Columns
        extracted = tables.extract_table(
            merged_rows_cols_elems,
            as_text=True,
            fix_element_in_multiple_rows=True,
            fix_element_in_multiple_cols=True,
        )
        merged_cell_text = (
            "This text spans across multiple rows and \nmultiple columns."
        )
        self.assertListEqual(
            extracted,
            [
                ["Heading 1", "Heading 2", "Heading 3", "Heading 4"],
                [merged_cell_text, "", "A", "1"],
                ["", "", "B", "2"],
                ["C", "3", "C", "3"],
            ],
        )

        # Over the page
        extracted = tables.extract_simple_table(over_the_page_elems, as_text=True)
        self.assertListEqual(
            extracted,
            [
                ["Heading 1", "Heading 2", "Heading 3", "Heading 4"],
                ["A", "1", "A", "1"],
                ["B", "2", "B", "2"],
                ["C", "3", "C", "3"],
            ],
        )