def test_should_find_bbox_and_map_to_page_coordinates(  # pylint: disable=too-many-locals
    self,
    computer_vision_model_mock: MagicMock,
    tmp_path: Path,
    extract_graphic_assets: bool
):
    """Check that a detected figure box is scaled from image to page coordinates.

    The page image is 20x10 pixels while the layout page is 200x100, so the
    mocked bounding box (1, 2, 3, 4) is expected at (10, 20, 30, 40) on page 10.
    When assets are extracted, the saved crop must keep the unscaled box size.
    """
    page_image_file = tmp_path / 'page10.png'
    PIL.Image.new('RGB', size=(20, 10), color=(255, 0, 0)).save(page_image_file)

    # the mocked model reports a single instance with a known bounding box
    detected_bbox = BoundingBox(x=1, y=2, width=3, height=4)
    prediction_mock = computer_vision_model_mock.predict_single.return_value
    prediction_mock.get_instances_by_type_name.return_value = [
        SimpleComputerVisionModelInstance(bounding_box=detected_bbox)
    ]

    page = _create_page(
        coordinates=LayoutPageCoordinates(
            x=0, y=0, width=200, height=100, page_number=10
        )
    )
    provider = ComputerVisionDocumentGraphicProvider(
        computer_vision_model=computer_vision_model_mock,
        page_image_iterable=[
            DocumentPageImage(page_number=10, page_image_path=str(page_image_file))
        ],
        temp_dir=str(tmp_path)
    )
    semantic_graphics = list(provider.iter_semantic_graphic_for_layout_document(
        layout_document=LayoutDocument(pages=[page]),
        extract_graphic_assets=extract_graphic_assets
    ))

    assert semantic_graphics
    first_semantic_graphic = semantic_graphics[0]
    LOGGER.debug('semantic_graphic: %s', first_semantic_graphic)
    layout_graphic = first_semantic_graphic.layout_graphic
    assert layout_graphic is not None
    assert layout_graphic.coordinates == LayoutPageCoordinates(
        x=10, y=20, width=30, height=40, page_number=10
    )

    if not extract_graphic_assets:
        assert not first_semantic_graphic.relative_path
        return

    saved_path = layout_graphic.local_file_path
    assert saved_path
    assert first_semantic_graphic.relative_path == os.path.basename(saved_path)
    with PIL.Image.open(saved_path) as cropped_image:
        assert cropped_image.width == detected_bbox.width
        assert cropped_image.height == detected_bbox.height
def test_should_replace_text_and_graphics_within_bounding_box_of_semantic_graphics(
    self
):
    """Tokens and graphics inside a semantic graphic's box should be replaced.

    One token/graphic pair sits outside the new graphic's bounding box and
    must be kept; another pair sits inside it and must be removed. A second
    semantic graphic with a zero-size box is also passed in.
    """
    def _on_page_1(x, y, width, height):
        # all coordinates in this test live on page 1
        return LayoutPageCoordinates.from_bounding_box(
            BoundingBox(x, y, width, height), page_number=1
        )

    page_coordinates = _on_page_1(0, 0, 200, 200)
    new_graphic_coordinates = _on_page_1(10, 90, 100, 50)
    outside_coordinates = _on_page_1(10, 10, 100, 20)
    inside_coordinates = _on_page_1(10, 100, 100, 20)
    zero_size_coordinates = _on_page_1(10, 100, 0, 0)

    keep_token = LayoutToken('keep', coordinates=outside_coordinates)
    remove_token = LayoutToken('remove', coordinates=inside_coordinates)
    keep_graphic = LayoutGraphic(
        coordinates=outside_coordinates, graphic_type='keep-graphic'
    )
    remove_graphic = LayoutGraphic(
        coordinates=inside_coordinates, graphic_type='remove-graphic'
    )

    text_block = LayoutBlock(
        lines=[LayoutLine(tokens=[keep_token, remove_token])]
    )
    page = LayoutPage(
        blocks=[text_block],
        graphics=[keep_graphic, remove_graphic],
        meta=LayoutPageMeta(
            page_number=page_coordinates.page_number,
            coordinates=page_coordinates
        )
    )
    layout_document = LayoutDocument(pages=[page])

    layout_graphic = LayoutGraphic(
        coordinates=new_graphic_coordinates, graphic_type='new-graphic'
    )
    no_coords_layout_graphic = LayoutGraphic(
        coordinates=zero_size_coordinates, graphic_type='empty-coords-graphic'
    )
    result = get_layout_document_with_text_and_graphics_replaced_by_graphics(
        layout_document,
        semantic_graphics=[
            SemanticGraphic(layout_graphic=layout_graphic),
            SemanticGraphic(layout_graphic=no_coords_layout_graphic)
        ]
    )

    result_page = result.pages[0]
    LOGGER.debug('result.pages[0].graphics: %r', result_page.graphics)
    assert result_page.graphics[:-1] == [keep_graphic]

    # only look at the last graphic after the prefix check above has passed
    last_graphic = result_page.graphics[-1]
    LOGGER.debug('result.pages[0].graphics[-1]: %r', last_graphic)
    assert last_graphic.graphic_type == layout_graphic.graphic_type
    assert last_graphic.coordinates == layout_graphic.coordinates
    assert list(result_page.blocks[0].iter_all_tokens()) == [keep_token]
    assert list(last_graphic.related_block.iter_all_tokens()) == [
        keep_token, remove_token
    ]
def test_should_detect_indented_blocks(self):
    """A line starting further right than the previous one is reported as indented."""
    feature = LineIndentationStatusFeature()
    feature.on_new_block()
    # (x position of the line's token, expected indentation status)
    for token_x, expected_is_indented in ((10, False), (50, True)):
        feature.on_new_line()
        token = LayoutToken(
            'x',
            coordinates=LayoutPageCoordinates(
                x=token_x, y=10, width=10, height=10
            )
        )
        assert feature.get_is_indented_and_update(token) is expected_is_indented
def test_should_not_merge_coordinates_on_different_pages(self):
    """Horizontally adjacent boxes on different pages must be left unmerged."""
    on_page_1 = LayoutPageCoordinates(
        x=10, y=10, width=100, height=100, page_number=1
    )
    on_page_2 = LayoutPageCoordinates(
        x=110, y=10, width=100, height=100, page_number=2
    )
    coordinates_list = [on_page_1, on_page_2]
    merged_coordinates_list = get_merged_coordinates_list(coordinates_list)
    assert merged_coordinates_list == coordinates_list
def test_should_retokenize_document_with_placeholders(self):
    """A token containing a space is split, with x/width shared out by character count."""
    text = 'token1 token2'
    original_token = LayoutToken(
        text,
        whitespace='\n',
        coordinates=LayoutPageCoordinates(x=10, y=10, width=100, height=50)
    )
    page = LayoutPage(
        blocks=[LayoutBlock.for_tokens([original_token])],
        graphics=[]
    )
    retokenized_layout_document = retokenize_layout_document(
        LayoutDocument(pages=[page])
    )

    tokens = retokenized_layout_document.pages[0].blocks[0].lines[0].tokens
    assert [token.text for token in tokens] == ['token1', 'token2']
    assert [token.whitespace for token in tokens] == [' ', '\n']

    first_coordinates = tokens[0].coordinates
    second_coordinates = tokens[1].coordinates
    text_length = len(text)
    assert first_coordinates.x == 10.0
    assert first_coordinates.width == 100 * len('token1') / text_length
    # the second token starts after 'token1' plus the separating space
    assert second_coordinates.x == 10.0 + 100 * len('token1 ') / text_length
    assert second_coordinates.width == 100 * len('token2') / text_length
def parse_page_coordinates(
    self,
    node: etree.ElementBase,
    page_number: int
) -> LayoutPageCoordinates:
    """Build page coordinates from a node's HPOS/VPOS/WIDTH/HEIGHT attributes.

    Each attribute is converted to float; a missing attribute counts as 0.
    """
    attributes = node.attrib
    x, y, width, height = (
        float(attributes.get(attribute_name, 0))
        for attribute_name in ('HPOS', 'VPOS', 'WIDTH', 'HEIGHT')
    )
    return LayoutPageCoordinates(
        x=x, y=y, width=width, height=height, page_number=page_number
    )
def iter_semantic_graphic_for_image(  # pylint: disable=too-many-locals
    self,
    image: PIL.Image.Image,
    extract_graphic_assets: bool,
    page_number: int,
    page: Optional[LayoutPage]
) -> Iterable[SemanticGraphic]:
    """Yield one semantic graphic per 'Figure' instance predicted for the image.

    For every predicted figure bounding box:

    - the box is scaled by the page-size / image-size ratio when the page
      coordinates are available (and truthy), otherwise used as-is
    - if `get_layout_graphic_with_similar_coordinates` finds a page graphic
      for the scaled box, the semantic graphic for that existing graphic is
      yielded instead
    - otherwise a new 'cv-figure' layout graphic is yielded; when
      `extract_graphic_assets` is set, the unscaled box is cropped from the
      image and saved as `figure-<page_number>-<figure_number>.png` in
      `self.temp_dir`

    Args:
        image: the rendered page image passed to the computer vision model.
        extract_graphic_assets: whether to save cropped figure images.
        page_number: page number used for the coordinates and the file name.
        page: the layout page for the image, if any.
    """
    LOGGER.debug('image size: %d x %d', image.width, image.height)
    page_coordinates = page.meta.coordinates if page is not None else None
    page_graphics = page.graphics if page is not None else []

    # time only the model prediction itself
    cv_start = monotonic()
    cv_result = self.computer_vision_model.predict_single(image)
    cv_end = monotonic()

    figure_coordinates_list = [
        figure_instance.get_bounding_box()
        for figure_instance in cv_result.get_instances_by_type_name('Figure')
    ]
    LOGGER.info(
        'cv result, took=%.3fs, page_number=%d, image_size=%dx%d, figure_coordinates_list=%r',
        cv_end - cv_start,
        page_number,
        image.width,
        image.height,
        figure_coordinates_list
    )

    for figure_number, figure_coordinates in enumerate(
        figure_coordinates_list, start=1
    ):
        if page_coordinates:
            scaled_figure_coordinates = figure_coordinates.scale_by(
                page_coordinates.width / image.width,
                page_coordinates.height / image.height
            )
        else:
            scaled_figure_coordinates = figure_coordinates

        matching_layout_graphic = get_layout_graphic_with_similar_coordinates(
            page_graphics=page_graphics,
            bounding_box=scaled_figure_coordinates,
            ignored_graphic_types=self.ignored_graphic_types
        )
        if matching_layout_graphic is not None:
            yield get_semantic_graphic_for_layout_graphic(
                matching_layout_graphic,
                extract_graphic_assets=extract_graphic_assets
            )
            continue

        local_image_path: Optional[str] = None
        relative_image_path: Optional[str] = None
        if extract_graphic_assets:
            local_image_path = os.path.join(
                self.temp_dir, f'figure-{page_number}-{figure_number}.png'
            )
            relative_image_path = os.path.basename(local_image_path)
            # crop using the unscaled box, which is in image coordinates
            get_cropped_image(image, figure_coordinates).save(local_image_path)

        yield SemanticGraphic(
            layout_graphic=LayoutGraphic(
                coordinates=LayoutPageCoordinates(
                    x=scaled_figure_coordinates.x,
                    y=scaled_figure_coordinates.y,
                    width=scaled_figure_coordinates.width,
                    height=scaled_figure_coordinates.height,
                    page_number=page_number
                ),
                graphic_type='cv-figure',
                local_file_path=local_image_path
            ),
            relative_path=relative_image_path
        )
def test_should_return_the_best_matching_graphic(self):
    """Of three candidate graphics, the second one should be returned for the query box."""
    # (x, y, width, height) of each candidate graphic
    candidate_boxes = [
        (10, 10, 200, 100),
        (10, 10, 100, 100),
        (100, 10, 100, 100),
    ]
    page_graphics = [
        LayoutGraphic(coordinates=LayoutPageCoordinates(
            x=x, y=y, width=width, height=height
        ))
        for x, y, width, height in candidate_boxes
    ]
    query_bounding_box = BoundingBox(x=10, y=10, width=90, height=100)
    result = get_layout_graphic_with_similar_coordinates(
        page_graphics, query_bounding_box
    )
    assert result == page_graphics[1]
def test_should_prefer_embedded_graphic(  # pylint: disable=too-many-locals
    self,
    computer_vision_model_mock: MagicMock,
    tmp_path: Path
):
    """The page's own graphic should be returned when it matches the detected box.

    The mocked box (1, 2, 3, 4) on a 20x10 image corresponds to (10, 20, 30, 40)
    on the 200x100 page, which is exactly where the embedded graphic is placed.
    """
    page_image_file = tmp_path / 'page10.png'
    PIL.Image.new('RGB', size=(20, 10), color=(255, 0, 0)).save(page_image_file)

    embedded_graphic = LayoutGraphic(
        coordinates=LayoutPageCoordinates(
            x=10, y=20, width=30, height=40, page_number=10
        )
    )
    page = _create_page(
        coordinates=LayoutPageCoordinates(
            x=0, y=0, width=200, height=100, page_number=10
        ),
        graphics=[embedded_graphic]
    )

    # the mocked model reports a single instance with a known bounding box
    prediction_mock = computer_vision_model_mock.predict_single.return_value
    prediction_mock.get_instances_by_type_name.return_value = [
        SimpleComputerVisionModelInstance(
            bounding_box=BoundingBox(x=1, y=2, width=3, height=4)
        )
    ]

    provider = ComputerVisionDocumentGraphicProvider(
        computer_vision_model=computer_vision_model_mock,
        page_image_iterable=[
            DocumentPageImage(page_number=10, page_image_path=str(page_image_file))
        ],
        temp_dir=str(tmp_path)
    )
    semantic_graphics = list(provider.iter_semantic_graphic_for_layout_document(
        layout_document=LayoutDocument(pages=[page]),
        extract_graphic_assets=True
    ))

    assert semantic_graphics
    first_semantic_graphic = semantic_graphics[0]
    LOGGER.debug('semantic_graphic: %s', first_semantic_graphic)
    assert first_semantic_graphic.layout_graphic == embedded_graphic
def test_should_merge_coordinates_on_same_line(self):
    """Two boxes side by side on the same page merge into one wider box."""
    left = LayoutPageCoordinates(
        x=10, y=10, width=100, height=100, page_number=1
    )
    right = LayoutPageCoordinates(
        x=110, y=10, width=100, height=100, page_number=1
    )
    # merged width spans from the left box's x to the right box's right edge
    expected = LayoutPageCoordinates(
        x=10, y=10, width=110 - 10 + 100, height=100, page_number=1
    )
    merged_coordinates_list = get_merged_coordinates_list([left, right])
    assert merged_coordinates_list == [expected]
def test_should_merge_coordinates_above_each_other(self):
    """Two vertically stacked boxes on the same page merge into one taller box."""
    upper = LayoutPageCoordinates(
        x=10, y=10, width=100, height=100, page_number=1
    )
    lower = LayoutPageCoordinates(
        x=10, y=110, width=100, height=100, page_number=1
    )
    # merged height spans from the upper box's y to the lower box's bottom edge
    expected = LayoutPageCoordinates(
        x=10, y=10, width=100, height=110 - 10 + 100, page_number=1
    )
    merged_coordinates_list = get_merged_coordinates_list([upper, lower])
    assert merged_coordinates_list == [expected]
def test_should_parse_page_meta_data(self):
    """Page number and page size should be read from the Page element's attributes."""
    page_node = ALTO_E.Page(
        {'PHYSICAL_IMG_NR': '10', 'WIDTH': '101', 'HEIGHT': '102'},
        ALTO_E.PrintSpace(),
    )
    page_meta = AltoParser().parse_page(page_node, page_index=0).meta
    assert page_meta.page_number == 10
    assert page_meta.coordinates == LayoutPageCoordinates(
        x=0, y=0, width=101, height=102, page_number=10
    )
def test_should_ignore_matches_below_threshold(self):
    """No graphic is returned for a query box of a very different shape (10x1000 vs 100x100)."""
    only_graphic = LayoutGraphic(coordinates=LayoutPageCoordinates(
        x=10, y=10, width=100, height=100
    ))
    query_bounding_box = BoundingBox(x=10, y=10, width=10, height=1000)
    result = get_layout_graphic_with_similar_coordinates(
        [only_graphic], query_bounding_box
    )
    assert result is None
def test_should_ignore_svg_graphics(self):
    """A graphic whose type is listed as ignored is not returned, even for an identical box."""
    svg_graphic = LayoutGraphic(
        coordinates=LayoutPageCoordinates.from_bounding_box(BOUNDING_BOX_1),
        graphic_type='svg'
    )
    result = get_layout_graphic_with_similar_coordinates(
        [svg_graphic],
        BOUNDING_BOX_1,
        ignored_graphic_types={'svg'}
    )
    assert result is None
def parse_page(
    self,
    page_node: etree.ElementBase,
    page_index: int
) -> LayoutPage:
    """Parse a page node into a `LayoutPage` with meta data, blocks and graphics.

    The page number comes from the `PHYSICAL_IMG_NR` attribute, falling back
    to `1 + page_index` when it is missing or empty. Page coordinates are only
    set when both `WIDTH` and `HEIGHT` are present and non-empty. Blocks are
    parsed from `alto:TextBlock` descendants, graphics from `alto:Illustration`.
    """
    attributes = page_node.attrib

    page_number_str = attributes.get('PHYSICAL_IMG_NR')
    if page_number_str:
        page_number = int(page_number_str)
    else:
        page_number = 1 + page_index

    width_str = attributes.get('WIDTH')
    height_str = attributes.get('HEIGHT')
    coordinates = None
    if width_str and height_str:
        coordinates = LayoutPageCoordinates(
            x=0,
            y=0,
            width=float(width_str),
            height=float(height_str),
            page_number=page_number
        )

    page_meta = LayoutPageMeta(page_number=page_number, coordinates=coordinates)
    blocks = [
        self.parse_block(text_block_node, page_number=page_number)
        for text_block_node in alto_xpath(page_node, './/alto:TextBlock')
    ]
    graphics = [
        self.parse_graphic(illustration_node, page_number=page_number)
        for illustration_node in alto_xpath(page_node, './/alto:Illustration')
    ]
    return LayoutPage(meta=page_meta, blocks=blocks, graphics=graphics)
from sciencebeam_parser.utils.bounding_box import BoundingBox
from sciencebeam_parser.document.layout_document import (
    LayoutBlock,
    LayoutDocument,
    LayoutGraphic,
    LayoutLine,
    LayoutPage,
    LayoutPageCoordinates,
    LayoutPageMeta,
    LayoutToken
)
from sciencebeam_parser.processors.graphic_provider import (
    SimpleDocumentGraphicProvider,
    get_layout_document_with_graphics_replaced_by_graphics,
    get_layout_document_with_text_and_graphics_replaced_by_graphics,
    get_page_numbers_with_mostly_bitmap_graphics,
    get_page_numbers_with_uncommon_page_dimension
)


LOGGER = logging.getLogger(__name__)


# two sample coordinates that differ in width and page number
LAYOUT_PAGE_COORDINATES_1 = LayoutPageCoordinates(
    x=10, y=11, width=100, height=101, page_number=1
)

LAYOUT_PAGE_COORDINATES_2 = LayoutPageCoordinates(
    x=10, y=11, width=200, height=101, page_number=2
)


def _get_layout_document_for_layout_graphic(
    layout_graphic: LayoutGraphic
) -> LayoutDocument:
    """Wrap a single graphic in a one-page document that has no blocks."""
    single_page = LayoutPage(blocks=[], graphics=[layout_graphic])
    return LayoutDocument(pages=[single_page])
FONTSIZE_1 = 11.1

FONT_ID_2 = 'font2'
FONTFAMILY_2 = 'fontfamily2'
FONTSIZE_2 = 22.2

BOLD = 'bold'
ITALICS = 'italics'
SUBSCRIPT = 'subscript'
SUPERSCRIPT = 'superscript'

TOKEN_1 = 'token1'
TOKEN_2 = 'token2'

COORDINATES_1 = LayoutPageCoordinates(
    x=100.1, y=101.1, width=102.2, height=103.3, page_number=1
)

COORDINATES_2 = LayoutPageCoordinates(
    x=200.1, y=201.1, width=202.2, height=203.3, page_number=1
)


class TestAltoParser:
    def test_should_parse_font_without_fontstyle(self):
        """A TextStyle node without a font style should yield a non-bold font."""
        text_style_node = ALTO_E.TextStyle(
            ID=FONT_ID_1,
            FONTFAMILY=FONTFAMILY_1,
            FONTSIZE=str(FONTSIZE_1)
        )
        parsed_font = AltoParser().parse_font(text_style_node)
        assert parsed_font.is_bold is False
from sciencebeam_parser.document.layout_document import ( EMPTY_BLOCK, LayoutLineDescriptor, LayoutPageCoordinates, LayoutPageMeta, LayoutToken, LayoutLine, LayoutBlock, LayoutPage, LayoutDocument, LayoutTokensText, get_merged_coordinates_list, retokenize_layout_document, remove_empty_blocks) COORDINATES_1 = LayoutPageCoordinates(x=10, y=10, width=100, height=100, page_number=1) class TestGetMergedCoordinatesList: def test_should_merge_coordinates_on_same_line(self): assert get_merged_coordinates_list([ LayoutPageCoordinates(x=10, y=10, width=100, height=100, page_number=1), LayoutPageCoordinates(x=110, y=10, width=100, height=100, page_number=1) ]) == [ LayoutPageCoordinates(x=10, y=10, width=110 - 10 + 100, height=100,
def test_should_extract_figure_label_caption_from_body(  # pylint: disable=too-many-locals
    self,
    fulltext_models_mock: MockFullTextModels,
    segmentation_label: str
):
    """A figure's label, caption, citation and graphic should all be extracted.

    The layout places the graphic first, with the label 10 units and the
    caption 20 units further down. The mocked models label the text so that
    a single figure with content id 'fig_0' is expected, referenced by one
    citation and carrying the page graphic.
    """
    graphic_local_file_path = '/path/to/graphic1.svg'
    graphic_coordinates = LayoutPageCoordinates(x=10, y=10, width=100, height=10)
    label_coordinates = graphic_coordinates.move_by(dy=10)
    caption_coordinates = label_coordinates.move_by(dy=10)

    citation_block = LayoutBlock.for_text('Figure 1')
    graphic = LayoutGraphic(
        coordinates=graphic_coordinates,
        local_file_path=graphic_local_file_path
    )
    label_block = LayoutBlock.for_text('Figure 1', coordinates=label_coordinates)
    caption_block = LayoutBlock.for_text('Caption 1', coordinates=caption_coordinates)
    other_block = LayoutBlock.for_text('Other')
    figure_block = LayoutBlock.merge_blocks(
        [label_block, other_block, caption_block]
    )
    fulltext_block = LayoutBlock.merge_blocks([citation_block, figure_block])

    processor_config = FullTextProcessorConfig(
        extract_figure_fields=True,
        extract_graphic_bounding_boxes=True,
        extract_graphic_assets=True
    )
    fulltext_processor = FullTextProcessor(fulltext_models_mock, processor_config)

    # configure what each mocked model should predict for the blocks
    fulltext_models_mock.segmentation_model_mock.update_label_by_layout_block(
        fulltext_block, segmentation_label
    )
    fulltext_model_mock = fulltext_models_mock.fulltext_model_mock
    fulltext_model_mock.update_label_by_layout_block(
        citation_block, '<figure_marker>'
    )
    fulltext_model_mock.update_label_by_layout_block(figure_block, '<figure>')
    figure_model_mock = fulltext_models_mock.figure_model_mock
    figure_model_mock.update_label_by_layout_block(label_block, '<label>')
    figure_model_mock.update_label_by_layout_block(caption_block, '<figDesc>')

    page = LayoutPage(blocks=[fulltext_block], graphics=[graphic])
    semantic_document = fulltext_processor.get_semantic_document_for_layout_document(
        layout_document=LayoutDocument(pages=[page])
    )
    LOGGER.debug('semantic_document: %s', semantic_document)
    assert semantic_document is not None

    sections = [semantic_document.body_section, semantic_document.back_section]
    figure_list = list(iter_by_semantic_type_recursively(sections, SemanticFigure))
    assert len(figure_list) == 1
    figure = figure_list[0]
    assert figure.get_text_by_type(SemanticLabel) == label_block.text
    assert figure.get_text_by_type(SemanticCaption) == caption_block.text
    assert figure.content_id == 'fig_0'

    figure_citation_list = list(
        semantic_document.iter_by_type_recursively(SemanticFigureCitation)
    )
    assert len(figure_citation_list) == 1
    figure_citation = figure_citation_list[0]
    assert figure_citation.get_text() == citation_block.text
    assert figure_citation.target_content_id == 'fig_0'

    semantic_graphic_list = list(figure.iter_by_type(SemanticGraphic))
    assert semantic_graphic_list
    first_semantic_graphic = semantic_graphic_list[0]
    assert first_semantic_graphic.layout_graphic == graphic
    assert first_semantic_graphic.relative_path == os.path.basename(
        graphic_local_file_path
    )
from sciencebeam_parser.document.layout_document import (LayoutBlock, LayoutGraphic, LayoutPageCoordinates, LayoutToken) from sciencebeam_parser.document.semantic_document import ( SemanticContentWrapper, SemanticFigure, SemanticGraphic, SemanticLabel, SemanticMixedContentWrapper) from sciencebeam_parser.processors.graphic_matching import ( BoundingBoxDistanceGraphicMatcher, GraphicRelatedBlockTextGraphicMatcher, OpticalCharacterRecognitionGraphicMatcher, get_bounding_box_list_distance) LOGGER = logging.getLogger(__name__) COORDINATES_1 = LayoutPageCoordinates(x=10, y=100, width=200, height=100, page_number=1) GRAPHIC_ABOVE_FIGURE_COORDINATES_1 = LayoutPageCoordinates(x=10, y=100, width=200, height=100, page_number=1) FIGURE_BELOW_GRAPHIC_COORDINATES_1 = LayoutPageCoordinates( x=10, y=GRAPHIC_ABOVE_FIGURE_COORDINATES_1.y + GRAPHIC_ABOVE_FIGURE_COORDINATES_1.height + 10, width=200, height=20,
SemanticNameSuffix, SemanticNameTitle, SemanticParagraph, SemanticRawEquation, SemanticRawEquationContent, SemanticRawReference, SemanticRawReferenceText, SemanticReference, SemanticReferenceCitation, SemanticReferenceList, SemanticSection, SemanticSectionTypes, SemanticSurname, SemanticTable, SemanticTableCitation, SemanticTextContentWrapper, SemanticTitle) from sciencebeam_parser.document.tei_document import ( get_tei_for_semantic_document) from tests.document.tei.common_test import (TOKEN_1, TOKEN_2) LOGGER = logging.getLogger(__name__) WEB_URL_1 = 'http://host/path' DOI_1 = '10.1234/test' COORDINATES_1 = LayoutPageCoordinates(10, 20, 110, 120) class TestGetTeiForSemanticDocument: # pylint: disable=too-many-public-methods def test_should_return_empty_document(self): semantic_document = SemanticDocument() tei_document = get_tei_for_semantic_document(semantic_document) assert not tei_document.xpath('//tei:div') def test_should_set_manuscript_title(self): semantic_document = SemanticDocument() semantic_document.front.add_content( SemanticTitle(layout_block=LayoutBlock.for_text(TOKEN_1))) tei_document = get_tei_for_semantic_document(semantic_document) LOGGER.debug('tei xml: %r', etree.tostring(tei_document.root)) assert tei_document.get_xpath_text_content_list(