def test_should_provide_empty_list_for_empty_pages(self):
    """Expect no page numbers for a document whose only page has no blocks.

    The page is built with an empty block list and no explicit graphics,
    and `get_page_numbers_with_mostly_bitmap_graphics` must return `[]`.
    """
    page_coordinates = LAYOUT_PAGE_COORDINATES_1._replace(page_number=1)
    empty_page = LayoutPage(
        blocks=[],
        meta=LayoutPageMeta(page_number=1, coordinates=page_coordinates)
    )
    layout_document = LayoutDocument(pages=[empty_page])

    page_numbers = get_page_numbers_with_mostly_bitmap_graphics(layout_document)

    assert page_numbers == []
def test_should_preserve_meta(self):
    """Check that `remove_empty_blocks` hands back the page meta unchanged.

    The input page mixes a text block with `EMPTY_BLOCK`; only the meta of
    the first resulting page is compared here.
    """
    # NOTE(review): COORDINATES_1 is passed positionally - confirm which
    # LayoutPageMeta field it is meant to populate.
    expected_meta = LayoutPageMeta(COORDINATES_1)
    source_page = LayoutPage(
        blocks=[LayoutBlock.for_text('token1'), EMPTY_BLOCK],
        graphics=[],
        meta=expected_meta
    )

    cleaned_document = remove_empty_blocks(LayoutDocument(pages=[source_page]))

    first_page = cleaned_document.pages[0]
    assert first_page.meta == expected_meta
def test_should_provide_page_number_with_uncomment_page_dimension(self):
    """Expect only the page whose coordinates differ to be reported.

    Pages 1 and 3 are derived from `LAYOUT_PAGE_COORDINATES_1`, page 2 from
    `LAYOUT_PAGE_COORDINATES_2`; the expected result is `[2]`.
    """
    base_coordinates_by_page = [
        LAYOUT_PAGE_COORDINATES_1,
        LAYOUT_PAGE_COORDINATES_2,
        LAYOUT_PAGE_COORDINATES_1,
    ]
    pages = [
        LayoutPage(
            blocks=[],
            meta=LayoutPageMeta(
                page_number=page_number,
                coordinates=base_coordinates._replace(page_number=page_number)
            )
        )
        for page_number, base_coordinates in enumerate(
            base_coordinates_by_page, start=1
        )
    ]
    layout_document = LayoutDocument(pages=pages)

    page_numbers = get_page_numbers_with_uncommon_page_dimension(layout_document)

    assert page_numbers == [2]
def test_should_replace_text_and_graphics_within_bounding_box_of_semantic_graphics(
    self
):
    """Exercise replacing page content covered by a semantic graphic.

    The page holds one token and one graphic at "keep" coordinates and one
    of each at "remove" coordinates. Two semantic graphics are passed in:
    one with a 100x50 bounding box and one with a zero-sized bounding box.
    The assertions then pin the following:

    - every graphic but the last equals `[keep_graphic]`;
    - the last graphic carries the type and coordinates of the sized
      semantic graphic;
    - the first block only iterates `keep_token`;
    - the last graphic's `related_block` iterates both tokens.
    """
    def _coordinates_on_first_page(x, y, width, height):
        # All fixtures of this test live on page 1.
        return LayoutPageCoordinates.from_bounding_box(
            BoundingBox(x, y, width, height), page_number=1
        )

    page_coordinates = _coordinates_on_first_page(0, 0, 200, 200)
    semantic_graphic_coordinates = _coordinates_on_first_page(10, 90, 100, 50)
    keep_coordinates = _coordinates_on_first_page(10, 10, 100, 20)
    remove_coordinates = _coordinates_on_first_page(10, 100, 100, 20)
    empty_coordinates = _coordinates_on_first_page(10, 100, 0, 0)

    keep_token = LayoutToken('keep', coordinates=keep_coordinates)
    remove_token = LayoutToken('remove', coordinates=remove_coordinates)
    keep_graphic = LayoutGraphic(
        coordinates=keep_coordinates, graphic_type='keep-graphic'
    )
    remove_graphic = LayoutGraphic(
        coordinates=remove_coordinates, graphic_type='remove-graphic'
    )

    token_block = LayoutBlock(
        lines=[LayoutLine(tokens=[keep_token, remove_token])]
    )
    source_page = LayoutPage(
        blocks=[token_block],
        graphics=[keep_graphic, remove_graphic],
        meta=LayoutPageMeta(
            page_number=page_coordinates.page_number,
            coordinates=page_coordinates
        )
    )
    layout_document = LayoutDocument(pages=[source_page])

    layout_graphic = LayoutGraphic(
        coordinates=semantic_graphic_coordinates, graphic_type='new-graphic'
    )
    no_coords_layout_graphic = LayoutGraphic(
        coordinates=empty_coordinates, graphic_type='empty-coords-graphic'
    )
    semantic_graphics = [
        SemanticGraphic(layout_graphic=layout_graphic),
        SemanticGraphic(layout_graphic=no_coords_layout_graphic),
    ]

    result = get_layout_document_with_text_and_graphics_replaced_by_graphics(
        layout_document, semantic_graphics=semantic_graphics
    )

    result_page = result.pages[0]
    LOGGER.debug('result.pages[0].graphics: %r', result_page.graphics)
    assert result_page.graphics[:-1] == [keep_graphic]

    # Only look at the last graphic once the leading ones were checked.
    LOGGER.debug('result.pages[0].graphics[-1]: %r', result_page.graphics[-1])
    last_graphic = result_page.graphics[-1]
    assert last_graphic.graphic_type == layout_graphic.graphic_type
    assert last_graphic.coordinates == layout_graphic.coordinates

    remaining_tokens = list(result_page.blocks[0].iter_all_tokens())
    assert remaining_tokens == [keep_token]

    related_tokens = list(
        result_page.graphics[-1].related_block.iter_all_tokens()
    )
    assert related_tokens == [keep_token, remove_token]
def _create_page(
    coordinates: LayoutPageCoordinates,
    graphics: Optional[Sequence[LayoutGraphic]] = None
) -> LayoutPage:
    """Build a block-less `LayoutPage` for the given page coordinates.

    Args:
        coordinates: Used both as the meta coordinates and as the source of
            the meta page number.
        graphics: Graphics to attach to the page; a falsy value (including
            `None`) results in an empty list.

    Returns:
        A `LayoutPage` with an empty block list.
    """
    page_meta = LayoutPageMeta(
        page_number=coordinates.page_number,
        coordinates=coordinates
    )
    page_graphics = graphics if graphics else []
    return LayoutPage(meta=page_meta, blocks=[], graphics=page_graphics)
def test_should_preserve_meta(self):
    """Check that `retokenize_layout_document` hands back the page meta unchanged.

    The single block is created from one token whose text contains a space
    ('token1 token2'); only the meta of the first resulting page is compared.
    """
    # NOTE(review): COORDINATES_1 is passed positionally - confirm which
    # LayoutPageMeta field it is meant to populate.
    expected_meta = LayoutPageMeta(COORDINATES_1)
    single_token_block = LayoutBlock.for_tokens([LayoutToken('token1 token2')])
    source_document = LayoutDocument(pages=[
        LayoutPage(blocks=[single_token_block], graphics=[], meta=expected_meta)
    ])

    retokenized_document = retokenize_layout_document(source_document)

    first_page = retokenized_document.pages[0]
    assert first_page.meta == expected_meta
def test_should_ignore_small_bitmap(self):
    """Expect no page numbers when the only graphic is a 1x1 'image'.

    The graphic coordinates are the page coordinates with width and height
    both replaced by 1.
    """
    tiny_image = LayoutGraphic(
        graphic_type='image',
        coordinates=LAYOUT_PAGE_COORDINATES_1._replace(
            page_number=1, width=1, height=1
        )
    )
    page_meta = LayoutPageMeta(
        page_number=1,
        coordinates=LAYOUT_PAGE_COORDINATES_1._replace(page_number=1)
    )
    layout_document = LayoutDocument(pages=[
        LayoutPage(blocks=[], graphics=[tiny_image], meta=page_meta)
    ])

    page_numbers = get_page_numbers_with_mostly_bitmap_graphics(layout_document)

    assert page_numbers == []
def parse_page(self, page_node: etree.ElementBase, page_index: int) -> LayoutPage:
    """Convert a page node into a `LayoutPage`.

    Args:
        page_node: Element whose `PHYSICAL_IMG_NR`, `WIDTH` and `HEIGHT`
            attributes are read, and which is searched for text blocks and
            illustrations.
        page_index: Index of the page; `1 + page_index` is used as the page
            number when `PHYSICAL_IMG_NR` is missing or empty.

    Returns:
        A `LayoutPage` whose blocks come from the nodes matched by
        './/alto:TextBlock' and whose graphics come from the nodes matched
        by './/alto:Illustration'. The meta coordinates are `None` unless
        both `WIDTH` and `HEIGHT` are present and non-empty.
    """
    attributes = page_node.attrib

    raw_page_number = attributes.get('PHYSICAL_IMG_NR')
    if raw_page_number:
        page_number = int(raw_page_number)
    else:
        page_number = 1 + page_index

    raw_width = attributes.get('WIDTH')
    raw_height = attributes.get('HEIGHT')
    coordinates = None
    if raw_width and raw_height:
        # The page itself spans its full width and height from the origin.
        coordinates = LayoutPageCoordinates(
            x=0,
            y=0,
            width=float(raw_width),
            height=float(raw_height),
            page_number=page_number
        )

    page_meta = LayoutPageMeta(page_number=page_number, coordinates=coordinates)
    blocks = [
        self.parse_block(block_node, page_number=page_number)
        for block_node in alto_xpath(page_node, './/alto:TextBlock')
    ]
    graphics = [
        self.parse_graphic(graphic_node, page_number=page_number)
        for graphic_node in alto_xpath(page_node, './/alto:Illustration')
    ]
    return LayoutPage(meta=page_meta, blocks=blocks, graphics=graphics)