def test_should_replace_text_and_graphics_within_bounding_box_of_semantic_graphics(
    self
):
    """Tokens and graphics covered by a semantic graphic are replaced by it.

    The page holds one token and one graphic to keep, and one token and one
    graphic sharing the "remove" coordinates. After the replacement, only
    the "keep" items plus the new semantic graphic remain on the page, and
    the new graphic's related block still exposes both original tokens.
    The second semantic graphic (zero width/height) must not add a graphic.
    """
    def _on_page_1(*bounds) -> LayoutPageCoordinates:
        # All fixtures of this test live on page 1.
        return LayoutPageCoordinates.from_bounding_box(
            BoundingBox(*bounds), page_number=1
        )

    page_coordinates = _on_page_1(0, 0, 200, 200)
    semantic_graphic_coordinates = _on_page_1(10, 90, 100, 50)
    keep_coordinates = _on_page_1(10, 10, 100, 20)
    remove_coordinates = _on_page_1(10, 100, 100, 20)
    empty_coordinates = _on_page_1(10, 100, 0, 0)

    keep_token = LayoutToken('keep', coordinates=keep_coordinates)
    remove_token = LayoutToken('remove', coordinates=remove_coordinates)
    keep_graphic = LayoutGraphic(
        coordinates=keep_coordinates, graphic_type='keep-graphic'
    )
    remove_graphic = LayoutGraphic(
        coordinates=remove_coordinates, graphic_type='remove-graphic'
    )

    token_line = LayoutLine(tokens=[keep_token, remove_token])
    page_meta = LayoutPageMeta(
        page_number=page_coordinates.page_number,
        coordinates=page_coordinates
    )
    layout_document = LayoutDocument(pages=[
        LayoutPage(
            blocks=[LayoutBlock(lines=[token_line])],
            graphics=[keep_graphic, remove_graphic],
            meta=page_meta
        )
    ])

    layout_graphic = LayoutGraphic(
        coordinates=semantic_graphic_coordinates, graphic_type='new-graphic'
    )
    no_coords_layout_graphic = LayoutGraphic(
        coordinates=empty_coordinates, graphic_type='empty-coords-graphic'
    )
    result = get_layout_document_with_text_and_graphics_replaced_by_graphics(
        layout_document,
        semantic_graphics=[
            SemanticGraphic(layout_graphic=layout_graphic),
            SemanticGraphic(layout_graphic=no_coords_layout_graphic)
        ]
    )

    result_page = result.pages[0]
    result_graphics = result_page.graphics
    LOGGER.debug('result.pages[0].graphics: %r', result_graphics)
    # Everything before the last graphic must be the untouched "keep" graphic.
    assert result_graphics[:-1] == [keep_graphic]

    replacement_graphic = result_graphics[-1]
    LOGGER.debug('result.pages[0].graphics[-1]: %r', replacement_graphic)
    assert replacement_graphic.graphic_type == layout_graphic.graphic_type
    assert replacement_graphic.coordinates == layout_graphic.coordinates

    remaining_tokens = list(result_page.blocks[0].iter_all_tokens())
    assert remaining_tokens == [keep_token]
    related_tokens = list(replacement_graphic.related_block.iter_all_tokens())
    assert related_tokens == [keep_token, remove_token]
def parse_graphic(
    self,
    graphic_node: etree.ElementBase,
    page_number: int
) -> LayoutGraphic:
    """Create a ``LayoutGraphic`` from a graphic XML node.

    Args:
        graphic_node: Element whose ``FILEID`` and ``TYPE`` attributes are
            read (either may be missing, yielding ``None``).
        page_number: Page number forwarded to ``parse_page_coordinates``.

    Returns:
        A ``LayoutGraphic`` with the ``FILEID`` attribute as local file
        path, the parsed page coordinates, and the ``TYPE`` attribute as
        graphic type.
    """
    node_attributes = graphic_node.attrib
    file_id = node_attributes.get('FILEID')
    coordinates = self.parse_page_coordinates(
        graphic_node, page_number=page_number
    )
    graphic_type = node_attributes.get('TYPE')
    return LayoutGraphic(
        local_file_path=file_id,
        coordinates=coordinates,
        graphic_type=graphic_type
    )
def test_should_ignore_layout_graphic_without_coordinates(self):
    """A layout graphic with ``coordinates=None`` yields no semantic graphic."""
    graphic_without_coordinates = LayoutGraphic(
        local_file_path='/path/to/image.png',
        coordinates=None
    )
    graphic_provider = SimpleDocumentGraphicProvider()
    layout_document = _get_layout_document_for_layout_graphic(
        graphic_without_coordinates
    )
    semantic_graphics = list(
        graphic_provider.iter_semantic_graphic_for_layout_document(
            layout_document,
            extract_graphic_assets=False
        )
    )
    assert not semantic_graphics
def iter_semantic_graphic_for_image(  # pylint: disable=too-many-locals
    self,
    image: PIL.Image.Image,
    extract_graphic_assets: bool,
    page_number: int,
    page: Optional[LayoutPage]
) -> Iterable[SemanticGraphic]:
    """Yield one semantic graphic per 'Figure' instance detected in ``image``.

    The image is passed to ``self.computer_vision_model.predict_single`` and
    the bounding boxes of all instances of type ``'Figure'`` are collected.
    For each box:

    * if the page has coordinates, the box is scaled by the ratio of the
      page size to the image size;
    * if ``get_layout_graphic_with_similar_coordinates`` returns a graphic
      from the page for the scaled box, the semantic graphic for that
      existing layout graphic is yielded instead;
    * otherwise a new ``'cv-figure'`` layout graphic is created; when
      ``extract_graphic_assets`` is set, the (unscaled) box is cropped from
      the image and saved as ``figure-<page>-<n>.png`` in ``self.temp_dir``.

    Args:
        image: The rendered page image.
        extract_graphic_assets: Whether image files should be written.
        page_number: Page number used for logging, file naming and the
            coordinates of new graphics.
        page: Matching layout page, or ``None`` when unavailable (then no
            scaling and no matching against existing graphics can happen).
    """
    LOGGER.debug('image size: %d x %d', image.width, image.height)
    if page is None:
        page_coordinates = None
        page_graphics = []
    else:
        page_coordinates = page.meta.coordinates
        page_graphics = page.graphics

    predict_start = monotonic()
    prediction = self.computer_vision_model.predict_single(image)
    predict_end = monotonic()
    figure_coordinates_list = [
        figure_instance.get_bounding_box()
        for figure_instance in prediction.get_instances_by_type_name('Figure')
    ]
    LOGGER.info(
        'cv result, took=%.3fs, page_number=%d, image_size=%dx%d, figure_coordinates_list=%r',
        predict_end - predict_start,
        page_number,
        image.width,
        image.height,
        figure_coordinates_list
    )

    # Figure numbers are 1-based (they end up in the asset file name).
    for figure_number, figure_coordinates in enumerate(
        figure_coordinates_list, start=1
    ):
        if page_coordinates:
            scaled_figure_coordinates = figure_coordinates.scale_by(
                page_coordinates.width / image.width,
                page_coordinates.height / image.height
            )
        else:
            scaled_figure_coordinates = figure_coordinates

        existing_layout_graphic = get_layout_graphic_with_similar_coordinates(
            page_graphics=page_graphics,
            bounding_box=scaled_figure_coordinates,
            ignored_graphic_types=self.ignored_graphic_types
        )
        if existing_layout_graphic is not None:
            yield get_semantic_graphic_for_layout_graphic(
                existing_layout_graphic,
                extract_graphic_assets=extract_graphic_assets
            )
            continue

        local_image_path: Optional[str] = None
        relative_image_path: Optional[str] = None
        if extract_graphic_assets:
            local_image_path = os.path.join(
                self.temp_dir, f'figure-{page_number}-{figure_number}.png'
            )
            relative_image_path = os.path.basename(local_image_path)
            # Cropping uses the unscaled box, i.e. the box as detected on the image.
            get_cropped_image(image, figure_coordinates).save(local_image_path)

        cv_figure_coordinates = LayoutPageCoordinates(
            x=scaled_figure_coordinates.x,
            y=scaled_figure_coordinates.y,
            width=scaled_figure_coordinates.width,
            height=scaled_figure_coordinates.height,
            page_number=page_number
        )
        yield SemanticGraphic(
            layout_graphic=LayoutGraphic(
                coordinates=cv_figure_coordinates,
                graphic_type='cv-figure',
                local_file_path=local_image_path
            ),
            relative_path=relative_image_path
        )
def test_should_provide_semantic_graphic_without_assets(self):
    """Without asset extraction the graphic is provided with no relative path."""
    layout_graphic = LayoutGraphic(
        local_file_path='/path/to/image.png',
        coordinates=LAYOUT_PAGE_COORDINATES_1
    )
    graphic_provider = SimpleDocumentGraphicProvider()
    layout_document = _get_layout_document_for_layout_graphic(layout_graphic)
    semantic_graphics = list(
        graphic_provider.iter_semantic_graphic_for_layout_document(
            layout_document,
            extract_graphic_assets=False
        )
    )
    assert len(semantic_graphics) == 1
    provided_graphic = semantic_graphics[0]
    assert provided_graphic.layout_graphic == layout_graphic
    assert provided_graphic.relative_path is None
def test_should_return_the_best_matching_graphic(
    self
):
    """Of several candidate graphics, the second one is expected to be returned."""
    wider_graphic = LayoutGraphic(coordinates=LayoutPageCoordinates(
        x=10, y=10, width=200, height=100
    ))
    expected_graphic = LayoutGraphic(coordinates=LayoutPageCoordinates(
        x=10, y=10, width=100, height=100
    ))
    shifted_graphic = LayoutGraphic(coordinates=LayoutPageCoordinates(
        x=100, y=10, width=100, height=100
    ))
    search_bounding_box = BoundingBox(x=10, y=10, width=90, height=100)
    result = get_layout_graphic_with_similar_coordinates(
        [wider_graphic, expected_graphic, shifted_graphic],
        search_bounding_box
    )
    assert result == expected_graphic
def test_should_ignore_graphics_without_coordinates(
    self
):
    """A graphic with ``coordinates=None`` is never returned as a match."""
    graphic_without_coordinates = LayoutGraphic(coordinates=None)
    search_bounding_box = BoundingBox(x=10, y=10, width=10, height=1000)
    result = get_layout_graphic_with_similar_coordinates(
        [graphic_without_coordinates],
        search_bounding_box
    )
    assert result is None
def test_should_ignore_layout_graphic_without_related_block(self):
    """A graphic with ``related_block=None`` stays unmatched."""
    graphic_without_related_block = LayoutGraphic(
        coordinates=FAR_AWAY_COORDINATES_1,
        related_block=None
    )
    semantic_graphic_1 = SemanticGraphic(
        layout_graphic=graphic_without_related_block
    )
    figure_label = SemanticLabel(layout_block=LayoutBlock.for_text('Figure 1'))
    candidate_semantic_content_1 = SemanticFigure([figure_label])

    matcher = GraphicRelatedBlockTextGraphicMatcher()
    result = matcher.get_graphic_matches(
        semantic_graphic_list=[semantic_graphic_1],
        candidate_semantic_content_list=[candidate_semantic_content_1]
    )
    LOGGER.debug('result: %r', result)
    assert not result.graphic_matches
    assert result.unmatched_graphics == [semantic_graphic_1]
def test_should_render_graphic_element(self):
    """A figure containing a graphic renders a ``tei:graphic`` under the figure."""
    label = SemanticLabel(layout_block=LayoutBlock.for_text('Label 1'))
    caption = SemanticCaption(layout_block=LayoutBlock.for_text('Caption 1'))
    graphic = SemanticGraphic(
        layout_graphic=LayoutGraphic(local_file_path='image1.png')
    )
    semantic_figure = SemanticFigure(
        [label, caption, graphic],
        content_id='fig_0'
    )
    result = _get_wrapped_figure_tei_element(semantic_figure)
    graphic_xpath = f'{FIGURE_XPATH}/tei:graphic'
    assert result.get_xpath_text_content_list(graphic_xpath)
def test_should_not_match_empty_graphic(self):
    """A graphic with zero width and height yields a falsy match result."""
    zero_size_coordinates = COORDINATES_1._replace(width=0, height=0)
    empty_semantic_graphic_1 = SemanticGraphic(
        layout_graphic=LayoutGraphic(coordinates=zero_size_coordinates)
    )
    candidate_semantic_content_1 = _get_semantic_content_for_page_coordinates(
        coordinates=COORDINATES_1
    )
    matcher = BoundingBoxDistanceGraphicMatcher()
    result = matcher.get_graphic_matches(
        semantic_graphic_list=[empty_semantic_graphic_1],
        candidate_semantic_content_list=[candidate_semantic_content_1]
    )
    LOGGER.debug('result: %r', result)
    assert not result
def test_should_not_match_graphic_on_another_page(self):
    """Same coordinates but a different page number must not match."""
    next_page_coordinates = COORDINATES_1._replace(
        page_number=COORDINATES_1.page_number + 1
    )
    semantic_graphic_1 = SemanticGraphic(
        layout_graphic=LayoutGraphic(coordinates=next_page_coordinates)
    )
    candidate_semantic_content_1 = _get_semantic_content_for_page_coordinates(
        coordinates=COORDINATES_1
    )
    matcher = BoundingBoxDistanceGraphicMatcher()
    result = matcher.get_graphic_matches(
        semantic_graphic_list=[semantic_graphic_1],
        candidate_semantic_content_list=[candidate_semantic_content_1]
    )
    LOGGER.debug('result: %r', result)
    assert not result.graphic_matches
    assert result.unmatched_graphics == [semantic_graphic_1]
def test_should_not_match_further_away_graphic_to_same_semantic_content(
    self
):
    """Only the graphic directly above the figure matches, not those moved away.

    Two extra graphics are placed 500 and 1000 units further down (via
    ``move_by(dy=...)``); the single expected match is the graphic above
    the figure, regardless of its position in the input list.
    """
    semantic_graphic_1 = SemanticGraphic(layout_graphic=LayoutGraphic(
        coordinates=GRAPHIC_ABOVE_FIGURE_COORDINATES_1
    ))
    candidate_semantic_content_1 = _get_semantic_content_for_page_coordinates(
        coordinates=FIGURE_BELOW_GRAPHIC_COORDINATES_1
    )
    further_away_graphic_1, further_away_graphic_2 = [
        SemanticGraphic(layout_graphic=LayoutGraphic(
            coordinates=FIGURE_BELOW_GRAPHIC_COORDINATES_1.move_by(dy=offset)
        ))
        for offset in (500, 1000)
    ]
    matcher = BoundingBoxDistanceGraphicMatcher()
    result = matcher.get_graphic_matches(
        semantic_graphic_list=[
            further_away_graphic_1,
            semantic_graphic_1,
            further_away_graphic_2
        ],
        candidate_semantic_content_list=[candidate_semantic_content_1]
    )
    LOGGER.debug('result: %r', result)
    assert len(result) == 1
    only_match = result.graphic_matches[0]
    assert only_match.semantic_graphic == semantic_graphic_1
    assert only_match.candidate_semantic_content == candidate_semantic_content_1
def test_should_ignore_matches_below_threshold(
    self
):
    """A 100x100 graphic is not returned for a 10x1000 search box at the same origin."""
    square_graphic = LayoutGraphic(coordinates=LayoutPageCoordinates(
        x=10, y=10, width=100, height=100
    ))
    tall_narrow_bounding_box = BoundingBox(x=10, y=10, width=10, height=1000)
    result = get_layout_graphic_with_similar_coordinates(
        [square_graphic],
        tall_narrow_bounding_box
    )
    assert result is None
def test_should_ignore_svg_graphics(
    self
):
    """A graphic whose type is listed in ``ignored_graphic_types`` is not returned.

    The graphic has exactly the searched bounding box, so only its
    ``'svg'`` type distinguishes it from a match.
    """
    svg_graphic = LayoutGraphic(
        coordinates=LayoutPageCoordinates.from_bounding_box(BOUNDING_BOX_1),
        graphic_type='svg'
    )
    result = get_layout_graphic_with_similar_coordinates(
        [svg_graphic],
        BOUNDING_BOX_1,
        ignored_graphic_types={'svg'}
    )
    assert result is None
def test_should_ignore_layout_graphic_without_local_path(
    self,
    ocr_model_mock: MagicMock
):
    """A graphic with ``local_file_path=None`` stays unmatched.

    The OCR mock is set up so that ``get_text`` raises ``RuntimeError``,
    hence the test would fail if the matcher requested the OCR text.
    """
    ocr_prediction_mock = ocr_model_mock.predict_single.return_value
    ocr_prediction_mock.get_text.side_effect = RuntimeError

    graphic_without_file = LayoutGraphic(
        coordinates=FAR_AWAY_COORDINATES_1,
        local_file_path=None
    )
    semantic_graphic_1 = SemanticGraphic(layout_graphic=graphic_without_file)
    figure_label = SemanticLabel(layout_block=LayoutBlock.for_text('Figure 1'))
    candidate_semantic_content_1 = SemanticFigure([figure_label])

    matcher = OpticalCharacterRecognitionGraphicMatcher(
        ocr_model=ocr_model_mock
    )
    result = matcher.get_graphic_matches(
        semantic_graphic_list=[semantic_graphic_1],
        candidate_semantic_content_list=[candidate_semantic_content_1]
    )
    LOGGER.debug('result: %r', result)
    assert not result.graphic_matches
    assert result.unmatched_graphics == [semantic_graphic_1]
def test_should_not_convert_pdf_to_jats_zip(  # pylint: disable=too-many-locals
    self,
    sciencebeam_parser_session: ScienceBeamParserSession,
    get_tei_for_semantic_document_mock: MagicMock,
    full_text_processor_class_mock: MagicMock,
    full_text_processor_mock: MagicMock,
    xslt_transformer_wrapper_mock: MagicMock,
    request_temp_path: Path
):
    """Requesting JATS_ZIP for a PDF yields a zip with ``jats.xml`` and the image.

    Also checks that the full text processor was configured with
    ``extract_graphic_assets`` enabled.

    NOTE(review): the test name says "should_not_convert", but the body
    asserts a successful conversion - the name looks like a typo; confirm
    before renaming.
    """
    # Files expected to exist in the request's temp directory.
    expected_pdf_path = request_temp_path / 'test.pdf'
    expected_output_path = request_temp_path / TEMP_ALTO_XML_FILENAME
    graphic_local_file_path = request_temp_path / 'image1.png'
    graphic_relative_path = graphic_local_file_path.name
    expected_output_path.write_bytes(XML_CONTENT_1)
    graphic_local_file_path.write_bytes(IMAGE_DATA_1)

    tei_document = TeiDocument(etree.fromstring(TEI_XML_CONTENT_1))
    get_tei_for_semantic_document_mock.return_value = tei_document
    jats_root = etree.fromstring(JATS_XML_CONTENT_1)
    xslt_transformer_wrapper_mock.return_value = jats_root

    # The mocked processor returns a document with a single graphic asset.
    semantic_document = SemanticDocument()
    graphic_with_asset = SemanticGraphic(
        layout_graphic=LayoutGraphic(
            local_file_path=str(graphic_local_file_path)
        ),
        relative_path=graphic_relative_path
    )
    semantic_document.back_section.add_content(graphic_with_asset)
    get_semantic_document_mock = (
        full_text_processor_mock.get_semantic_document_for_layout_document
    )
    get_semantic_document_mock.return_value = semantic_document

    source = sciencebeam_parser_session.get_source(
        str(expected_pdf_path),
        MediaTypes.PDF
    )
    result_file = source.get_local_file_for_response_media_type(
        MediaTypes.JATS_ZIP
    )

    with ZipFile(result_file, 'r') as zip_file:
        jats_xml_data = zip_file.read('jats.xml')
        assert jats_xml_data == JATS_XML_CONTENT_1
        image_data = zip_file.read(graphic_relative_path)
        assert image_data == IMAGE_DATA_1

    # call_args[1] holds the keyword arguments of the last constructor call.
    full_text_processor_config = (
        full_text_processor_class_mock.call_args[1]['config']
    )
    assert full_text_processor_config.extract_graphic_assets is True
def test_should_render_graphic_element_with_url(self):
    """The graphic's relative path is rendered as the ``url`` attribute."""
    layout_graphic = LayoutGraphic(
        local_file_path='image1.png',
        graphic_type='svg'
    )
    semantic_graphic = SemanticGraphic(
        relative_path='rel-image1.png',
        layout_graphic=layout_graphic
    )
    result = _get_wrapped_graphic_tei_element(semantic_graphic)
    graphic_elements = result.xpath_nodes('//tei:graphic')
    assert len(graphic_elements) == 1
    rendered_url = graphic_elements[0].attrib.get('url')
    assert rendered_url == 'rel-image1.png'
def test_should_provide_page_number_with_bitmap_graphics(self):
    """A page whose only graphic is an ``'image'`` covering the page is reported.

    The graphic and the page meta use the same coordinates on page 1.
    """
    page_1_coordinates = LAYOUT_PAGE_COORDINATES_1._replace(page_number=1)
    bitmap_graphic = LayoutGraphic(
        graphic_type='image',
        coordinates=page_1_coordinates
    )
    page = LayoutPage(
        blocks=[],
        graphics=[bitmap_graphic],
        meta=LayoutPageMeta(page_number=1, coordinates=page_1_coordinates)
    )
    layout_document = LayoutDocument(pages=[page])
    result = get_page_numbers_with_mostly_bitmap_graphics(layout_document)
    assert result == [1]
def test_should_unmatched_graphics_to_back(self):
    """Graphics in an ``unmatched_graphics`` note are rendered with their url.

    The note is added to the back section of the semantic document; the
    TEI output must contain a ``tei:graphic`` below a note of that type,
    carrying the relative path as ``url``.
    """
    unmatched_graphic = SemanticGraphic(
        layout_graphic=LayoutGraphic(coordinates=COORDINATES_1),
        relative_path='image1.svg'
    )
    unmatched_graphics_note = SemanticMixedNote(
        [unmatched_graphic],
        note_type='unmatched_graphics'
    )
    semantic_document = SemanticDocument()
    semantic_document.back_section.add_content(unmatched_graphics_note)

    tei_document = get_tei_for_semantic_document(semantic_document)
    LOGGER.debug('tei xml: %r', etree.tostring(tei_document.root))

    graphics_xpath = '//tei:note[@type="unmatched_graphics"]//tei:graphic'
    assert tei_document.xpath_nodes(graphics_xpath)
    graphic_urls = tei_document.get_xpath_text_content_list(
        f'{graphics_xpath}/@url'
    )
    assert graphic_urls == ['image1.svg']
def test_should_match_graphic_of_specific(
    self,
    graphic_type: str,
    should_match: bool
):
    """Whether a graphic above a figure matches depends on its graphic type.

    Args:
        graphic_type: Graphic type assigned to the layout graphic.
        should_match: Whether a match is expected for that type.
    """
    typed_graphic = LayoutGraphic(
        coordinates=GRAPHIC_ABOVE_FIGURE_COORDINATES_1,
        graphic_type=graphic_type
    )
    semantic_graphic_1 = SemanticGraphic(layout_graphic=typed_graphic)
    candidate_semantic_content_1 = _get_semantic_content_for_page_coordinates(
        coordinates=FIGURE_BELOW_GRAPHIC_COORDINATES_1
    )
    matcher = BoundingBoxDistanceGraphicMatcher()
    result = matcher.get_graphic_matches(
        semantic_graphic_list=[semantic_graphic_1],
        candidate_semantic_content_list=[candidate_semantic_content_1]
    )
    LOGGER.debug('result: %r', result)

    if not should_match:
        assert not result.graphic_matches
        assert result.unmatched_graphics == [semantic_graphic_1]
        return
    assert len(result) == 1
    only_match = result.graphic_matches[0]
    assert only_match.semantic_graphic == semantic_graphic_1
def test_should_prefer_embedded_graphic(  # pylint: disable=too-many-locals
    self,
    computer_vision_model_mock: MagicMock,
    tmp_path: Path
):
    """The page's embedded graphic is provided rather than a new CV figure.

    The page image is 20x10 while the page is 200x100, and the mocked CV
    bounding box (1, 2, 3, 4) is exactly one tenth of the embedded
    graphic's coordinates (10, 20, 30, 40) - presumably the provider scales
    the CV box to page coordinates before comparing; confirm against the
    provider implementation.
    """
    page_number = 10
    image_path = tmp_path / 'page10.png'
    page_image = PIL.Image.new('RGB', size=(20, 10), color=(255, 0, 0))
    page_image.save(image_path)
    page_images = [DocumentPageImage(
        page_number=page_number,
        page_image_path=str(image_path)
    )]

    embedded_graphic = LayoutGraphic(coordinates=LayoutPageCoordinates(
        x=10, y=20, width=30, height=40, page_number=page_number
    ))
    page = _create_page(
        coordinates=LayoutPageCoordinates(
            x=0, y=0, width=200, height=100, page_number=page_number
        ),
        graphics=[embedded_graphic]
    )
    layout_document = LayoutDocument(pages=[page])

    detected_instance = SimpleComputerVisionModelInstance(
        bounding_box=BoundingBox(x=1, y=2, width=3, height=4)
    )
    prediction_mock = computer_vision_model_mock.predict_single.return_value
    prediction_mock.get_instances_by_type_name.return_value = [
        detected_instance
    ]

    graphic_provider = ComputerVisionDocumentGraphicProvider(
        computer_vision_model=computer_vision_model_mock,
        page_image_iterable=page_images,
        temp_dir=str(tmp_path)
    )
    semantic_graphic_list = list(
        graphic_provider.iter_semantic_graphic_for_layout_document(
            layout_document=layout_document,
            extract_graphic_assets=True
        )
    )
    assert semantic_graphic_list
    first_semantic_graphic = semantic_graphic_list[0]
    LOGGER.debug('semantic_graphic: %s', first_semantic_graphic)
    assert first_semantic_graphic.layout_graphic == embedded_graphic
def test_should_match_based_on_figure_label(
    self,
    related_text: str,
    figure_label: str,
    should_match: bool
):
    """Match a graphic to a figure via the text of the graphic's related block.

    Args:
        related_text: Text of the block related to the graphic.
        figure_label: Label text of the candidate figure.
        should_match: Whether the two texts are expected to produce a match.
    """
    graphic_with_related_text = LayoutGraphic(
        coordinates=FAR_AWAY_COORDINATES_1,
        related_block=LayoutBlock.for_text(related_text)
    )
    semantic_graphic_1 = SemanticGraphic(
        layout_graphic=graphic_with_related_text
    )
    label = SemanticLabel(layout_block=LayoutBlock.for_text(figure_label))
    candidate_semantic_content_1 = SemanticFigure([label])

    matcher = GraphicRelatedBlockTextGraphicMatcher()
    result = matcher.get_graphic_matches(
        semantic_graphic_list=[semantic_graphic_1],
        candidate_semantic_content_list=[candidate_semantic_content_1]
    )
    LOGGER.debug('result: %r', result)

    if not should_match:
        assert not result.graphic_matches
        assert result.unmatched_graphics == [semantic_graphic_1]
        return
    assert len(result) == 1
    only_match = result.graphic_matches[0]
    assert only_match.semantic_graphic == semantic_graphic_1
def test_should_render_graphic_element_with_coords(self):
    """Coordinates and type are rendered; no ``url`` without a relative path."""
    layout_graphic = LayoutGraphic(
        local_file_path='image1.png',
        coordinates=COORDINATES_1,
        graphic_type='svg'
    )
    semantic_graphic = SemanticGraphic(layout_graphic=layout_graphic)
    result = _get_wrapped_graphic_tei_element(semantic_graphic)
    graphic_elements = result.xpath_nodes('//tei:graphic')
    assert len(graphic_elements) == 1
    rendered_attributes = graphic_elements[0].attrib
    expected_coords = format_coordinates(COORDINATES_1)
    assert rendered_attributes.get('coords') == expected_coords
    assert rendered_attributes.get('type') == 'svg'
    assert not rendered_attributes.get('url')
def test_should_match_graphic_above_semantic_content(self):
    """The graphic matches the content below it, not the far away candidates."""
    semantic_graphic_1 = SemanticGraphic(layout_graphic=LayoutGraphic(
        coordinates=GRAPHIC_ABOVE_FIGURE_COORDINATES_1
    ))
    candidate_semantic_content_1 = _get_semantic_content_for_page_coordinates(
        coordinates=FIGURE_BELOW_GRAPHIC_COORDINATES_1
    )
    far_away_content_1 = _get_semantic_content_for_page_coordinates(
        coordinates=FAR_AWAY_COORDINATES_1
    )
    far_away_content_2 = _get_semantic_content_for_page_coordinates(
        coordinates=FAR_AWAY_COORDINATES_2
    )
    matcher = BoundingBoxDistanceGraphicMatcher()
    result = matcher.get_graphic_matches(
        semantic_graphic_list=[semantic_graphic_1],
        # The expected candidate is deliberately placed between the others.
        candidate_semantic_content_list=[
            far_away_content_1,
            candidate_semantic_content_1,
            far_away_content_2
        ]
    )
    LOGGER.debug('result: %r', result)
    assert len(result) == 1
    only_match = result.graphic_matches[0]
    assert only_match.semantic_graphic == semantic_graphic_1
    assert only_match.candidate_semantic_content == candidate_semantic_content_1
def test_should_match_based_on_figure_label(
    self,
    ocr_model_mock: MagicMock,
    ocr_text: str,
    figure_label: str,
    should_match: bool,
    tmp_path: Path
):
    """Match a graphic to a figure via the text the OCR model returns for it.

    Args:
        ocr_model_mock: OCR model mock whose prediction returns ``ocr_text``.
        ocr_text: Text reported by the mocked OCR prediction.
        figure_label: Label text of the candidate figure.
        should_match: Whether the two texts are expected to produce a match.
        tmp_path: Directory in which the graphic's image file is created.
    """
    local_graphic_path = tmp_path / 'image.png'
    graphic_image = PIL.Image.new('RGB', (10, 10), (0, 1, 2))
    graphic_image.save(local_graphic_path)
    ocr_prediction_mock = ocr_model_mock.predict_single.return_value
    ocr_prediction_mock.get_text.return_value = ocr_text

    graphic_with_file = LayoutGraphic(
        coordinates=FAR_AWAY_COORDINATES_1,
        local_file_path=str(local_graphic_path)
    )
    semantic_graphic_1 = SemanticGraphic(layout_graphic=graphic_with_file)
    label = SemanticLabel(layout_block=LayoutBlock.for_text(figure_label))
    candidate_semantic_content_1 = SemanticFigure([label])

    matcher = OpticalCharacterRecognitionGraphicMatcher(
        ocr_model=ocr_model_mock
    )
    result = matcher.get_graphic_matches(
        semantic_graphic_list=[semantic_graphic_1],
        candidate_semantic_content_list=[candidate_semantic_content_1]
    )
    LOGGER.debug('result: %r', result)

    if not should_match:
        assert not result.graphic_matches
        assert result.unmatched_graphics == [semantic_graphic_1]
        return
    assert len(result) == 1
    only_match = result.graphic_matches[0]
    assert only_match.semantic_graphic == semantic_graphic_1
def test_should_extract_figure_label_caption_from_body(  # pylint: disable=too-many-locals
    self,
    fulltext_models_mock: MockFullTextModels,
    segmentation_label: str
):
    """Extract a figure with label, caption, citation and its graphic.

    The layout places a graphic, then the figure label and the caption
    each 10 units further down (via ``move_by(dy=10)``). The processed
    document must contain exactly one figure (``fig_0``) with the label and
    caption text, one figure citation targeting it, and the graphic
    attached to the figure with the base name of its file as relative path.

    Args:
        fulltext_models_mock: Mocked models that are told which labels to
            assign to which layout blocks.
        segmentation_label: Segmentation label applied to the full text block.
    """
    # Layout: graphic first, label and caption stacked below it.
    citation_block = LayoutBlock.for_text('Figure 1')
    graphic_coordinates = LayoutPageCoordinates(
        x=10, y=10, width=100, height=10
    )
    graphic_local_file_path = '/path/to/graphic1.svg'
    graphic = LayoutGraphic(
        coordinates=graphic_coordinates,
        local_file_path=graphic_local_file_path
    )
    label_coordinates = graphic_coordinates.move_by(dy=10)
    label_block = LayoutBlock.for_text(
        'Figure 1', coordinates=label_coordinates
    )
    caption_coordinates = label_coordinates.move_by(dy=10)
    caption_block = LayoutBlock.for_text(
        'Caption 1', coordinates=caption_coordinates
    )
    other_block = LayoutBlock.for_text('Other')
    figure_block = LayoutBlock.merge_blocks(
        [label_block, other_block, caption_block]
    )
    fulltext_block = LayoutBlock.merge_blocks([citation_block, figure_block])

    processor_config = FullTextProcessorConfig(
        extract_figure_fields=True,
        extract_graphic_bounding_boxes=True,
        extract_graphic_assets=True
    )
    fulltext_processor = FullTextProcessor(
        fulltext_models_mock, processor_config
    )

    # Tell each mocked model how to label the relevant blocks.
    segmentation_model_mock = fulltext_models_mock.segmentation_model_mock
    fulltext_model_mock = fulltext_models_mock.fulltext_model_mock
    figure_model_mock = fulltext_models_mock.figure_model_mock
    segmentation_model_mock.update_label_by_layout_block(
        fulltext_block, segmentation_label
    )
    fulltext_model_mock.update_label_by_layout_block(
        citation_block, '<figure_marker>'
    )
    fulltext_model_mock.update_label_by_layout_block(
        figure_block, '<figure>'
    )
    figure_model_mock.update_label_by_layout_block(label_block, '<label>')
    figure_model_mock.update_label_by_layout_block(
        caption_block, '<figDesc>'
    )

    page = LayoutPage(blocks=[fulltext_block], graphics=[graphic])
    layout_document = LayoutDocument(pages=[page])
    semantic_document = (
        fulltext_processor.get_semantic_document_for_layout_document(
            layout_document=layout_document
        )
    )
    LOGGER.debug('semantic_document: %s', semantic_document)
    assert semantic_document is not None

    # The figure may end up in either the body or the back section.
    searched_sections = [
        semantic_document.body_section,
        semantic_document.back_section
    ]
    figure_list = list(
        iter_by_semantic_type_recursively(searched_sections, SemanticFigure)
    )
    assert len(figure_list) == 1
    figure = figure_list[0]
    assert figure.get_text_by_type(SemanticLabel) == label_block.text
    assert figure.get_text_by_type(SemanticCaption) == caption_block.text
    assert figure.content_id == 'fig_0'

    figure_citation_list = list(
        semantic_document.iter_by_type_recursively(SemanticFigureCitation)
    )
    assert len(figure_citation_list) == 1
    figure_citation = figure_citation_list[0]
    assert figure_citation.get_text() == citation_block.text
    assert figure_citation.target_content_id == 'fig_0'

    semantic_graphic_list = list(figure.iter_by_type(SemanticGraphic))
    assert semantic_graphic_list
    first_semantic_graphic = semantic_graphic_list[0]
    assert first_semantic_graphic.layout_graphic == graphic
    expected_relative_path = os.path.basename(graphic_local_file_path)
    assert first_semantic_graphic.relative_path == expected_relative_path