def test_should_replace_text_and_graphics_within_bounding_box_of_semantic_graphics(
    self
):
    """Check replacement of page content covered by a semantic graphic.

    The page holds one token and one graphic outside the semantic graphic's
    bounding box ("keep") and one of each inside it ("remove"). A second
    semantic graphic with a zero-sized bounding box is passed as well.

    Asserted outcome: the page graphics are ``[keep_graphic]`` followed by a
    graphic with the type and coordinates of the new graphic, the block only
    contains the "keep" token, and the last graphic's related block yields
    both original tokens.
    """
    def _page_1_coordinates(x, y, width, height):
        # All coordinates in this test live on page 1.
        return LayoutPageCoordinates.from_bounding_box(
            BoundingBox(x, y, width, height),
            page_number=1
        )

    page_coordinates = _page_1_coordinates(0, 0, 200, 200)
    semantic_graphic_coordinates = _page_1_coordinates(10, 90, 100, 50)
    keep_coordinates = _page_1_coordinates(10, 10, 100, 20)
    remove_coordinates = _page_1_coordinates(10, 100, 100, 20)
    empty_coordinates = _page_1_coordinates(10, 100, 0, 0)

    keep_token = LayoutToken('keep', coordinates=keep_coordinates)
    remove_token = LayoutToken('remove', coordinates=remove_coordinates)
    keep_graphic = LayoutGraphic(
        coordinates=keep_coordinates,
        graphic_type='keep-graphic'
    )
    remove_graphic = LayoutGraphic(
        coordinates=remove_coordinates,
        graphic_type='remove-graphic'
    )

    text_block = LayoutBlock(
        lines=[LayoutLine(tokens=[keep_token, remove_token])]
    )
    page_meta = LayoutPageMeta(
        page_number=page_coordinates.page_number,
        coordinates=page_coordinates
    )
    layout_document = LayoutDocument(pages=[
        LayoutPage(
            blocks=[text_block],
            graphics=[keep_graphic, remove_graphic],
            meta=page_meta
        )
    ])

    layout_graphic = LayoutGraphic(
        coordinates=semantic_graphic_coordinates,
        graphic_type='new-graphic'
    )
    no_coords_layout_graphic = LayoutGraphic(
        coordinates=empty_coordinates,
        graphic_type='empty-coords-graphic'
    )
    semantic_graphics = [
        SemanticGraphic(layout_graphic=layout_graphic),
        SemanticGraphic(layout_graphic=no_coords_layout_graphic)
    ]

    result = get_layout_document_with_text_and_graphics_replaced_by_graphics(
        layout_document,
        semantic_graphics=semantic_graphics
    )

    first_page = result.pages[0]
    LOGGER.debug('result.pages[0].graphics: %r', first_page.graphics)
    assert first_page.graphics[:-1] == [keep_graphic]

    last_graphic = first_page.graphics[-1]
    LOGGER.debug('result.pages[0].graphics[-1]: %r', last_graphic)
    assert last_graphic.graphic_type == layout_graphic.graphic_type
    assert last_graphic.coordinates == layout_graphic.coordinates

    remaining_tokens = list(first_page.blocks[0].iter_all_tokens())
    assert remaining_tokens == [keep_token]

    related_tokens = list(last_graphic.related_block.iter_all_tokens())
    assert related_tokens == [keep_token, remove_token]
def get_semantic_graphic_for_layout_graphic(
    layout_graphic: LayoutGraphic,
    extract_graphic_assets: bool
) -> SemanticGraphic:
    """Wrap ``layout_graphic`` in a ``SemanticGraphic``.

    Args:
        layout_graphic: the layout graphic to wrap.
        extract_graphic_assets: whether graphic assets are being extracted.

    Returns:
        A ``SemanticGraphic`` whose ``relative_path`` is the base name of the
        graphic's ``local_file_path`` when that path is set and
        ``extract_graphic_assets`` is true, and ``None`` otherwise.
    """
    source_path = layout_graphic.local_file_path
    if source_path and extract_graphic_assets:
        return SemanticGraphic(
            layout_graphic=layout_graphic,
            relative_path=os.path.basename(source_path)
        )
    return SemanticGraphic(
        layout_graphic=layout_graphic,
        relative_path=None
    )
def iter_semantic_graphic_for_image(  # pylint: disable=too-many-locals
    self,
    image: PIL.Image.Image,
    extract_graphic_assets: bool,
    page_number: int,
    page: Optional[LayoutPage]
) -> Iterable[SemanticGraphic]:
    """Yield one ``SemanticGraphic`` per 'Figure' instance detected in ``image``.

    The computer vision model is run once on the image. For every detected
    figure bounding box:

    * the box is scaled from image space to page space when page coordinates
      are available;
    * if ``get_layout_graphic_with_similar_coordinates`` finds a page graphic
      for the scaled box, that existing graphic is wrapped and yielded;
    * otherwise a new ``'cv-figure'`` layout graphic is created and, when
      ``extract_graphic_assets`` is true, the figure is cropped out of the
      image and saved as a PNG inside ``self.temp_dir``.

    Args:
        image: rendered page image passed to the computer vision model.
        extract_graphic_assets: whether to save cropped figures and set
            relative paths.
        page_number: page number used for logging, file naming and the
            coordinates of newly created graphics.
        page: the matching layout page, or ``None`` if not available.
    """
    LOGGER.debug('image size: %d x %d', image.width, image.height)
    if page is None:
        page_coordinates = None
        page_graphics = []
    else:
        page_coordinates = page.meta.coordinates
        page_graphics = page.graphics

    # Time only the model prediction itself.
    cv_start = monotonic()
    cv_result = self.computer_vision_model.predict_single(image)
    cv_end = monotonic()

    figure_coordinates_list = [
        figure_instance.get_bounding_box()
        for figure_instance in cv_result.get_instances_by_type_name('Figure')
    ]
    LOGGER.info(
        'cv result, took=%.3fs, page_number=%d, image_size=%dx%d, figure_coordinates_list=%r',
        cv_end - cv_start,
        page_number,
        image.width,
        image.height,
        figure_coordinates_list
    )

    for figure_number, figure_coordinates in enumerate(
        figure_coordinates_list, start=1
    ):
        if page_coordinates:
            x_scale = page_coordinates.width / image.width
            y_scale = page_coordinates.height / image.height
            scaled_figure_coordinates = figure_coordinates.scale_by(
                x_scale, y_scale
            )
        else:
            scaled_figure_coordinates = figure_coordinates

        matching_layout_graphic = get_layout_graphic_with_similar_coordinates(
            page_graphics=page_graphics,
            bounding_box=scaled_figure_coordinates,
            ignored_graphic_types=self.ignored_graphic_types
        )
        if matching_layout_graphic is not None:
            # Reuse the graphic already present in the layout document.
            yield get_semantic_graphic_for_layout_graphic(
                matching_layout_graphic,
                extract_graphic_assets=extract_graphic_assets
            )
            continue

        local_image_path: Optional[str] = None
        relative_image_path: Optional[str] = None
        if extract_graphic_assets:
            local_image_path = os.path.join(
                self.temp_dir,
                f'figure-{page_number}-{figure_number}.png'
            )
            relative_image_path = os.path.basename(local_image_path)
            # Cropping uses the unscaled (image space) coordinates.
            get_cropped_image(image, figure_coordinates).save(local_image_path)

        cv_figure_coordinates = LayoutPageCoordinates(
            x=scaled_figure_coordinates.x,
            y=scaled_figure_coordinates.y,
            width=scaled_figure_coordinates.width,
            height=scaled_figure_coordinates.height,
            page_number=page_number
        )
        yield SemanticGraphic(
            layout_graphic=LayoutGraphic(
                coordinates=cv_figure_coordinates,
                graphic_type='cv-figure',
                local_file_path=local_image_path
            ),
            relative_path=relative_image_path
        )
def test_should_not_match_empty_graphic(self):
    """A graphic with zero width and height must produce a falsy result.

    The graphic shares the candidate's coordinates, apart from its size being
    reduced to 0 x 0.
    """
    zero_size_coordinates = COORDINATES_1._replace(width=0, height=0)
    empty_semantic_graphic_1 = SemanticGraphic(
        layout_graphic=LayoutGraphic(coordinates=zero_size_coordinates)
    )
    candidate_semantic_content_1 = _get_semantic_content_for_page_coordinates(
        coordinates=COORDINATES_1
    )

    matcher = BoundingBoxDistanceGraphicMatcher()
    result = matcher.get_graphic_matches(
        semantic_graphic_list=[empty_semantic_graphic_1],
        candidate_semantic_content_list=[candidate_semantic_content_1]
    )

    LOGGER.debug('result: %r', result)
    assert not result
def test_should_ignore_layout_graphic_without_related_block(self):
    """A graphic whose ``related_block`` is ``None`` must stay unmatched."""
    graphic_without_block = LayoutGraphic(
        coordinates=FAR_AWAY_COORDINATES_1,
        related_block=None
    )
    semantic_graphic_1 = SemanticGraphic(layout_graphic=graphic_without_block)
    figure_label = SemanticLabel(layout_block=LayoutBlock.for_text('Figure 1'))
    candidate_semantic_content_1 = SemanticFigure([figure_label])

    matcher = GraphicRelatedBlockTextGraphicMatcher()
    result = matcher.get_graphic_matches(
        semantic_graphic_list=[semantic_graphic_1],
        candidate_semantic_content_list=[candidate_semantic_content_1]
    )

    LOGGER.debug('result: %r', result)
    assert not result.graphic_matches
    assert result.unmatched_graphics == [semantic_graphic_1]
def test_should_render_graphic_element(self):
    """A figure containing a graphic must render a ``tei:graphic`` element."""
    figure_children = [
        SemanticLabel(layout_block=LayoutBlock.for_text('Label 1')),
        SemanticCaption(layout_block=LayoutBlock.for_text('Caption 1')),
        SemanticGraphic(
            layout_graphic=LayoutGraphic(local_file_path='image1.png')
        )
    ]
    semantic_figure = SemanticFigure(figure_children, content_id='fig_0')

    result = _get_wrapped_figure_tei_element(semantic_figure)

    graphic_xpath = f'{FIGURE_XPATH}/tei:graphic'
    assert result.get_xpath_text_content_list(graphic_xpath)
def test_should_not_match_graphic_on_another_page(self):
    """A graphic on the page after the candidate's page must stay unmatched.

    The graphic uses the candidate's coordinates with only the page number
    increased by one.
    """
    next_page_coordinates = COORDINATES_1._replace(
        page_number=COORDINATES_1.page_number + 1
    )
    semantic_graphic_1 = SemanticGraphic(
        layout_graphic=LayoutGraphic(coordinates=next_page_coordinates)
    )
    candidate_semantic_content_1 = _get_semantic_content_for_page_coordinates(
        coordinates=COORDINATES_1
    )

    matcher = BoundingBoxDistanceGraphicMatcher()
    result = matcher.get_graphic_matches(
        semantic_graphic_list=[semantic_graphic_1],
        candidate_semantic_content_list=[candidate_semantic_content_1]
    )

    LOGGER.debug('result: %r', result)
    assert not result.graphic_matches
    assert result.unmatched_graphics == [semantic_graphic_1]
def test_should_not_match_further_away_graphic_to_same_semantic_content(
    self
):
    """Only the graphic directly above the figure must be matched to it.

    Two additional graphics, moved 500 and 1000 units down from the figure
    coordinates, are passed before and after the expected graphic.
    """
    semantic_graphic_1 = SemanticGraphic(
        layout_graphic=LayoutGraphic(
            coordinates=GRAPHIC_ABOVE_FIGURE_COORDINATES_1
        )
    )
    candidate_semantic_content_1 = _get_semantic_content_for_page_coordinates(
        coordinates=FIGURE_BELOW_GRAPHIC_COORDINATES_1
    )
    further_away_graphic_1 = SemanticGraphic(
        layout_graphic=LayoutGraphic(
            coordinates=FIGURE_BELOW_GRAPHIC_COORDINATES_1.move_by(dy=500)
        )
    )
    further_away_graphic_2 = SemanticGraphic(
        layout_graphic=LayoutGraphic(
            coordinates=FIGURE_BELOW_GRAPHIC_COORDINATES_1.move_by(dy=1000)
        )
    )
    # The expected graphic is deliberately placed between the distant ones.
    all_graphics = [
        further_away_graphic_1,
        semantic_graphic_1,
        further_away_graphic_2
    ]

    matcher = BoundingBoxDistanceGraphicMatcher()
    result = matcher.get_graphic_matches(
        semantic_graphic_list=all_graphics,
        candidate_semantic_content_list=[candidate_semantic_content_1]
    )

    LOGGER.debug('result: %r', result)
    assert len(result) == 1
    first_match = result.graphic_matches[0]
    assert first_match.semantic_graphic == semantic_graphic_1
    assert first_match.candidate_semantic_content == candidate_semantic_content_1
def test_should_ignore_layout_graphic_without_local_path(
    self,
    ocr_model_mock: MagicMock
):
    """A graphic without a local file path must stay unmatched.

    The mocked OCR result raises ``RuntimeError`` from ``get_text``, so the
    test would fail if the text of the OCR result was requested.
    """
    ocr_model_mock.predict_single.return_value.get_text.side_effect = RuntimeError
    graphic_without_file = LayoutGraphic(
        coordinates=FAR_AWAY_COORDINATES_1,
        local_file_path=None
    )
    semantic_graphic_1 = SemanticGraphic(layout_graphic=graphic_without_file)
    figure_label = SemanticLabel(layout_block=LayoutBlock.for_text('Figure 1'))
    candidate_semantic_content_1 = SemanticFigure([figure_label])

    matcher = OpticalCharacterRecognitionGraphicMatcher(
        ocr_model=ocr_model_mock
    )
    result = matcher.get_graphic_matches(
        semantic_graphic_list=[semantic_graphic_1],
        candidate_semantic_content_list=[candidate_semantic_content_1]
    )

    LOGGER.debug('result: %r', result)
    assert not result.graphic_matches
    assert result.unmatched_graphics == [semantic_graphic_1]
def test_should_not_convert_pdf_to_jats_zip(  # pylint: disable=too-many-locals
    self,
    sciencebeam_parser_session: ScienceBeamParserSession,
    get_tei_for_semantic_document_mock: MagicMock,
    full_text_processor_class_mock: MagicMock,
    full_text_processor_mock: MagicMock,
    xslt_transformer_wrapper_mock: MagicMock,
    request_temp_path: Path
):
    """Request a JATS zip for a PDF source and check the archive contents.

    The semantic document returned by the mocked full text processor holds a
    single graphic backed by a local image file. The resulting zip must
    contain ``jats.xml`` with the mocked JATS XML and the image under its
    relative path, and the full text processor must have been configured with
    ``extract_graphic_assets`` enabled.

    NOTE(review): the name says "should_not_convert", yet the assertions
    below check a successful conversion - confirm the intended name.
    """
    # Files expected inside the request's temp directory.
    expected_pdf_path = request_temp_path / 'test.pdf'
    expected_output_path = request_temp_path / TEMP_ALTO_XML_FILENAME
    expected_output_path.write_bytes(XML_CONTENT_1)
    graphic_local_file_path = request_temp_path / 'image1.png'
    graphic_local_file_path.write_bytes(IMAGE_DATA_1)
    graphic_relative_path = graphic_local_file_path.name

    # Canned results for the TEI and JATS conversion steps.
    tei_document = TeiDocument(etree.fromstring(TEI_XML_CONTENT_1))
    get_tei_for_semantic_document_mock.return_value = tei_document
    jats_root = etree.fromstring(JATS_XML_CONTENT_1)
    xslt_transformer_wrapper_mock.return_value = jats_root

    semantic_document = SemanticDocument()
    semantic_graphic = SemanticGraphic(
        layout_graphic=LayoutGraphic(
            local_file_path=str(graphic_local_file_path)
        ),
        relative_path=graphic_relative_path
    )
    semantic_document.back_section.add_content(semantic_graphic)
    full_text_processor_mock.get_semantic_document_for_layout_document.return_value = (
        semantic_document
    )

    source = sciencebeam_parser_session.get_source(
        str(expected_pdf_path),
        MediaTypes.PDF
    )
    result_file = source.get_local_file_for_response_media_type(
        MediaTypes.JATS_ZIP
    )

    with ZipFile(result_file, 'r') as zip_file:
        jats_xml_data = zip_file.read('jats.xml')
        assert jats_xml_data == JATS_XML_CONTENT_1
        image_data = zip_file.read(graphic_relative_path)
        assert image_data == IMAGE_DATA_1

    # call_args[1] holds the keyword arguments of the most recent call.
    full_text_processor_kwargs = full_text_processor_class_mock.call_args[1]
    full_text_processor_config = full_text_processor_kwargs['config']
    assert full_text_processor_config.extract_graphic_assets is True
def test_should_render_graphic_element_with_url(self):
    """The ``url`` attribute must be the graphic's relative path.

    The local file path differs from the relative path, so the assertion
    distinguishes between the two.
    """
    layout_graphic = LayoutGraphic(
        local_file_path='image1.png',
        graphic_type='svg'
    )
    semantic_graphic = SemanticGraphic(
        relative_path='rel-image1.png',
        layout_graphic=layout_graphic
    )

    result = _get_wrapped_graphic_tei_element(semantic_graphic)

    graphic_elements = result.xpath_nodes('//tei:graphic')
    assert len(graphic_elements) == 1
    url_value = graphic_elements[0].attrib.get('url')
    assert url_value == 'rel-image1.png'
def test_should_unmatched_graphics_to_back(self):
    """Graphics in an ``unmatched_graphics`` note must be rendered in the TEI.

    The note is added to the back section; the resulting ``tei:graphic``
    inside the note must carry the graphic's relative path as ``url``.
    """
    unmatched_graphic = SemanticGraphic(
        layout_graphic=LayoutGraphic(coordinates=COORDINATES_1),
        relative_path='image1.svg'
    )
    unmatched_note = SemanticMixedNote(
        [unmatched_graphic],
        note_type='unmatched_graphics'
    )
    semantic_document = SemanticDocument()
    semantic_document.back_section.add_content(unmatched_note)

    tei_document = get_tei_for_semantic_document(semantic_document)
    LOGGER.debug('tei xml: %r', etree.tostring(tei_document.root))

    graphics_xpath = '//tei:note[@type="unmatched_graphics"]//tei:graphic'
    assert tei_document.xpath_nodes(graphics_xpath)
    url_values = tei_document.get_xpath_text_content_list(
        f'{graphics_xpath}/@url'
    )
    assert url_values == ['image1.svg']
def test_should_match_graphic_of_specific(
    self,
    graphic_type: str,
    should_match: bool
):
    """Check whether a graphic of ``graphic_type`` is matched by distance.

    ``graphic_type`` and ``should_match`` are presumably supplied by a
    parametrize decorator that is not visible here.
    """
    semantic_graphic_1 = SemanticGraphic(
        layout_graphic=LayoutGraphic(
            coordinates=GRAPHIC_ABOVE_FIGURE_COORDINATES_1,
            graphic_type=graphic_type
        )
    )
    candidate_semantic_content_1 = _get_semantic_content_for_page_coordinates(
        coordinates=FIGURE_BELOW_GRAPHIC_COORDINATES_1
    )

    matcher = BoundingBoxDistanceGraphicMatcher()
    result = matcher.get_graphic_matches(
        semantic_graphic_list=[semantic_graphic_1],
        candidate_semantic_content_list=[candidate_semantic_content_1]
    )
    LOGGER.debug('result: %r', result)

    if not should_match:
        assert not result.graphic_matches
        assert result.unmatched_graphics == [semantic_graphic_1]
        return

    assert len(result) == 1
    first_match = result.graphic_matches[0]
    assert first_match.semantic_graphic == semantic_graphic_1
def test_should_match_graphic_above_semantic_content(self):
    """A graphic directly above a figure must be matched to that figure.

    The expected candidate is surrounded by two far away candidates, which
    must not be selected.
    """
    semantic_graphic_1 = SemanticGraphic(
        layout_graphic=LayoutGraphic(
            coordinates=GRAPHIC_ABOVE_FIGURE_COORDINATES_1
        )
    )
    candidate_semantic_content_1 = _get_semantic_content_for_page_coordinates(
        coordinates=FIGURE_BELOW_GRAPHIC_COORDINATES_1
    )
    far_away_candidate_1 = _get_semantic_content_for_page_coordinates(
        coordinates=FAR_AWAY_COORDINATES_1
    )
    far_away_candidate_2 = _get_semantic_content_for_page_coordinates(
        coordinates=FAR_AWAY_COORDINATES_2
    )

    matcher = BoundingBoxDistanceGraphicMatcher()
    result = matcher.get_graphic_matches(
        semantic_graphic_list=[semantic_graphic_1],
        candidate_semantic_content_list=[
            far_away_candidate_1,
            candidate_semantic_content_1,
            far_away_candidate_2
        ]
    )

    LOGGER.debug('result: %r', result)
    assert len(result) == 1
    first_match = result.graphic_matches[0]
    assert first_match.semantic_graphic == semantic_graphic_1
    assert first_match.candidate_semantic_content == candidate_semantic_content_1
def test_should_render_graphic_element_with_coords(self):
    """The rendered graphic must carry ``coords`` and ``type`` but no ``url``.

    The semantic graphic has a local file path but no relative path.
    """
    layout_graphic = LayoutGraphic(
        local_file_path='image1.png',
        coordinates=COORDINATES_1,
        graphic_type='svg'
    )
    semantic_graphic = SemanticGraphic(layout_graphic=layout_graphic)

    result = _get_wrapped_graphic_tei_element(semantic_graphic)

    graphic_elements = result.xpath_nodes('//tei:graphic')
    assert len(graphic_elements) == 1
    attributes = graphic_elements[0].attrib
    assert attributes.get('coords') == format_coordinates(COORDINATES_1)
    assert attributes.get('type') == 'svg'
    assert not attributes.get('url')
def test_should_match_based_on_figure_label(
    self,
    related_text: str,
    figure_label: str,
    should_match: bool
):
    """Check matching of a graphic's related block text against a figure label.

    ``related_text``, ``figure_label`` and ``should_match`` are presumably
    supplied by a parametrize decorator that is not visible here.
    """
    layout_graphic = LayoutGraphic(
        coordinates=FAR_AWAY_COORDINATES_1,
        related_block=LayoutBlock.for_text(related_text)
    )
    semantic_graphic_1 = SemanticGraphic(layout_graphic=layout_graphic)
    label = SemanticLabel(layout_block=LayoutBlock.for_text(figure_label))
    candidate_semantic_content_1 = SemanticFigure([label])

    matcher = GraphicRelatedBlockTextGraphicMatcher()
    result = matcher.get_graphic_matches(
        semantic_graphic_list=[semantic_graphic_1],
        candidate_semantic_content_list=[candidate_semantic_content_1]
    )
    LOGGER.debug('result: %r', result)

    if not should_match:
        assert not result.graphic_matches
        assert result.unmatched_graphics == [semantic_graphic_1]
        return

    assert len(result) == 1
    first_match = result.graphic_matches[0]
    assert first_match.semantic_graphic == semantic_graphic_1
def test_should_match_based_on_figure_label(
    self,
    ocr_model_mock: MagicMock,
    ocr_text: str,
    figure_label: str,
    should_match: bool,
    tmp_path: Path
):
    """Check matching of OCR text from a graphic against a figure label.

    A small image is written to ``tmp_path`` to serve as the graphic's local
    file, and the mocked OCR model returns ``ocr_text`` for it. ``ocr_text``,
    ``figure_label`` and ``should_match`` are presumably supplied by a
    parametrize decorator that is not visible here.
    """
    local_graphic_path = tmp_path / 'image.png'
    placeholder_image = PIL.Image.new('RGB', (10, 10), (0, 1, 2))
    placeholder_image.save(local_graphic_path)
    ocr_model_mock.predict_single.return_value.get_text.return_value = ocr_text

    layout_graphic = LayoutGraphic(
        coordinates=FAR_AWAY_COORDINATES_1,
        local_file_path=str(local_graphic_path)
    )
    semantic_graphic_1 = SemanticGraphic(layout_graphic=layout_graphic)
    label = SemanticLabel(layout_block=LayoutBlock.for_text(figure_label))
    candidate_semantic_content_1 = SemanticFigure([label])

    matcher = OpticalCharacterRecognitionGraphicMatcher(
        ocr_model=ocr_model_mock
    )
    result = matcher.get_graphic_matches(
        semantic_graphic_list=[semantic_graphic_1],
        candidate_semantic_content_list=[candidate_semantic_content_1]
    )
    LOGGER.debug('result: %r', result)

    if not should_match:
        assert not result.graphic_matches
        assert result.unmatched_graphics == [semantic_graphic_1]
        return

    assert len(result) == 1
    first_match = result.graphic_matches[0]
    assert first_match.semantic_graphic == semantic_graphic_1