def test_should_provide_empty_list_for_empty_pages(self):
    """Expect no page numbers for a document whose only page has no blocks."""
    first_page_meta = LayoutPageMeta(
        page_number=1,
        coordinates=LAYOUT_PAGE_COORDINATES_1._replace(page_number=1),
    )
    empty_page = LayoutPage(blocks=[], meta=first_page_meta)
    page_numbers = get_page_numbers_with_mostly_bitmap_graphics(
        LayoutDocument(pages=[empty_page]))
    assert page_numbers == []
 def test_should_preserve_meta(self):
     """Expect the page meta to be unchanged by ``remove_empty_blocks``."""
     expected_meta = LayoutPageMeta(COORDINATES_1)
     source_page = LayoutPage(
         blocks=[LayoutBlock.for_text('token1'), EMPTY_BLOCK],
         graphics=[],
         meta=expected_meta,
     )
     cleaned_document = remove_empty_blocks(
         LayoutDocument(pages=[source_page]))
     first_page = cleaned_document.pages[0]
     assert first_page.meta == expected_meta
 def test_should_provide_page_number_with_uncomment_page_dimension(self):
     """Expect only page 2 to be reported by the uncommon-dimension check.

     Pages 1 and 3 are built from ``LAYOUT_PAGE_COORDINATES_1`` while page 2
     is built from ``LAYOUT_PAGE_COORDINATES_2``.

     NOTE(review): "uncomment" in the test name looks like a typo for
     "uncommon" - confirm before renaming.
     """
     base_coordinates_per_page = (
         LAYOUT_PAGE_COORDINATES_1,
         LAYOUT_PAGE_COORDINATES_2,
         LAYOUT_PAGE_COORDINATES_1,
     )
     pages = [
         LayoutPage(
             blocks=[],
             meta=LayoutPageMeta(
                 page_number=page_number,
                 coordinates=base_coordinates._replace(
                     page_number=page_number)))
         for page_number, base_coordinates in enumerate(
             base_coordinates_per_page, start=1)
     ]
     page_numbers = get_page_numbers_with_uncommon_page_dimension(
         LayoutDocument(pages=pages))
     assert page_numbers == [2]
 def test_should_replace_text_and_graphics_within_bounding_box_of_semantic_graphics(
         self):
     """Check replacement of page content by a semantic graphic.

     The page holds a "keep" and a "remove" token plus a "keep" and a
     "remove" graphic. Two semantic graphics are passed in, one of them with
     a zero-sized bounding box. Afterwards the page is expected to contain
     the "keep" graphic followed by the new graphic, only the "keep" token in
     its first block, and both tokens in the new graphic's related block.
     """

     def _on_page_1(*bounding_box_args):
         # All coordinates in this test live on page 1.
         return LayoutPageCoordinates.from_bounding_box(
             BoundingBox(*bounding_box_args), page_number=1)

     page_coordinates = _on_page_1(0, 0, 200, 200)
     semantic_graphic_coordinates = _on_page_1(10, 90, 100, 50)
     keep_coordinates = _on_page_1(10, 10, 100, 20)
     remove_coordinates = _on_page_1(10, 100, 100, 20)
     empty_coordinates = _on_page_1(10, 100, 0, 0)

     keep_token = LayoutToken('keep', coordinates=keep_coordinates)
     remove_token = LayoutToken('remove', coordinates=remove_coordinates)
     keep_graphic = LayoutGraphic(
         coordinates=keep_coordinates, graphic_type='keep-graphic')
     remove_graphic = LayoutGraphic(
         coordinates=remove_coordinates, graphic_type='remove-graphic')

     text_block = LayoutBlock(
         lines=[LayoutLine(tokens=[keep_token, remove_token])])
     source_page = LayoutPage(
         blocks=[text_block],
         graphics=[keep_graphic, remove_graphic],
         meta=LayoutPageMeta(
             page_number=page_coordinates.page_number,
             coordinates=page_coordinates))
     layout_document = LayoutDocument(pages=[source_page])

     layout_graphic = LayoutGraphic(
         coordinates=semantic_graphic_coordinates,
         graphic_type='new-graphic')
     no_coords_layout_graphic = LayoutGraphic(
         coordinates=empty_coordinates, graphic_type='empty-coords-graphic')
     semantic_graphics = [
         SemanticGraphic(layout_graphic=layout_graphic),
         SemanticGraphic(layout_graphic=no_coords_layout_graphic),
     ]

     result = get_layout_document_with_text_and_graphics_replaced_by_graphics(
         layout_document, semantic_graphics=semantic_graphics)
     result_page = result.pages[0]

     # Everything before the last graphic must be the untouched "keep" one.
     LOGGER.debug('result.pages[0].graphics: %r', result_page.graphics)
     assert result_page.graphics[:-1] == [keep_graphic]

     # The last graphic must mirror the semantic graphic that was passed in.
     LOGGER.debug('result.pages[0].graphics[-1]: %r',
                  result_page.graphics[-1])
     assert result_page.graphics[-1].graphic_type == (
         layout_graphic.graphic_type)
     assert result_page.graphics[-1].coordinates == (
         layout_graphic.coordinates)

     remaining_tokens = list(result_page.blocks[0].iter_all_tokens())
     assert remaining_tokens == [keep_token]
     related_tokens = list(
         result_page.graphics[-1].related_block.iter_all_tokens())
     assert related_tokens == [keep_token, remove_token]
Exemplo n.º 5
0
def _create_page(
    coordinates: LayoutPageCoordinates,
    graphics: Optional[Sequence[LayoutGraphic]] = None
) -> LayoutPage:
    """Build a block-less ``LayoutPage`` for the given page coordinates.

    Args:
        coordinates: Used as the page meta coordinates; its ``page_number``
            also becomes the meta's page number.
        graphics: Graphics to attach to the page. A falsy value (``None`` or
            an empty sequence) results in a new empty list.

    Returns:
        The page with the derived meta, no blocks and the chosen graphics.
    """
    page_meta = LayoutPageMeta(
        page_number=coordinates.page_number,
        coordinates=coordinates,
    )
    page_graphics = graphics if graphics else []
    return LayoutPage(meta=page_meta, blocks=[], graphics=page_graphics)
 def test_should_preserve_meta(self):
     """Expect the page meta to be unchanged by ``retokenize_layout_document``."""
     expected_meta = LayoutPageMeta(COORDINATES_1)
     two_word_block = LayoutBlock.for_tokens([LayoutToken('token1 token2')])
     source_page = LayoutPage(
         blocks=[two_word_block],
         graphics=[],
         meta=expected_meta,
     )
     retokenized_layout_document = retokenize_layout_document(
         LayoutDocument(pages=[source_page]))
     first_page = retokenized_layout_document.pages[0]
     assert first_page.meta == expected_meta
 def test_should_ignore_small_bitmap(self):
     """Expect no page numbers when the only graphic is a 1x1 'image'."""
     tiny_image = LayoutGraphic(
         graphic_type='image',
         coordinates=LAYOUT_PAGE_COORDINATES_1._replace(
             page_number=1, width=1, height=1))
     first_page_meta = LayoutPageMeta(
         page_number=1,
         coordinates=LAYOUT_PAGE_COORDINATES_1._replace(page_number=1))
     single_page = LayoutPage(
         blocks=[], graphics=[tiny_image], meta=first_page_meta)
     page_numbers = get_page_numbers_with_mostly_bitmap_graphics(
         LayoutDocument(pages=[single_page]))
     assert page_numbers == []
Exemplo n.º 8
0
 def parse_page(self, page_node: etree.ElementBase,
                page_index: int) -> LayoutPage:
     """Convert a page node into a ``LayoutPage``.

     Args:
         page_node: Element whose ``PHYSICAL_IMG_NR``, ``WIDTH`` and
             ``HEIGHT`` attributes are read and whose ``alto:TextBlock`` and
             ``alto:Illustration`` descendants are parsed.
         page_index: Position of the page; ``1 + page_index`` is used as the
             page number when ``PHYSICAL_IMG_NR`` is missing or empty.

     Returns:
         A ``LayoutPage`` with meta, parsed blocks and parsed graphics. The
         meta coordinates are ``None`` unless both ``WIDTH`` and ``HEIGHT``
         are present and non-empty.
     """
     physical_page_number = page_node.attrib.get('PHYSICAL_IMG_NR')
     if physical_page_number:
         page_number = int(physical_page_number)
     else:
         # No explicit page number on the node: fall back to the position.
         page_number = 1 + page_index

     width_str = page_node.attrib.get('WIDTH')
     height_str = page_node.attrib.get('HEIGHT')
     coordinates = None
     if width_str and height_str:
         # The page area always starts at the origin.
         coordinates = LayoutPageCoordinates(
             x=0,
             y=0,
             width=float(width_str),
             height=float(height_str),
             page_number=page_number,
         )

     page_meta = LayoutPageMeta(
         page_number=page_number, coordinates=coordinates)
     text_blocks = [
         self.parse_block(text_block_node, page_number=page_number)
         for text_block_node in alto_xpath(page_node, './/alto:TextBlock')
     ]
     illustrations = [
         self.parse_graphic(illustration_node, page_number=page_number)
         for illustration_node in alto_xpath(
             page_node, './/alto:Illustration')
     ]
     return LayoutPage(
         meta=page_meta, blocks=text_blocks, graphics=illustrations)