def test_should_replace_text_and_graphics_within_bounding_box_of_semantic_graphics(
        self):
    """Check that a semantic graphic replaces the content it covers.

    The page holds one token and one graphic outside the semantic graphic's
    bounding box ("keep") and one of each inside it ("remove"). A second
    semantic graphic with zero-sized coordinates is passed as well. The test
    expects the page to end up with the kept graphic followed by a single
    new graphic carrying the semantic graphic's type and coordinates, the
    block to contain only the kept token, and the new graphic's related
    block to still expose both original tokens.
    """
    def _page_1_coordinates(x, y, width, height):
        # Every box in this test lives on page 1.
        return LayoutPageCoordinates.from_bounding_box(
            BoundingBox(x, y, width, height), page_number=1)

    page_coordinates = _page_1_coordinates(0, 0, 200, 200)
    semantic_graphic_coordinates = _page_1_coordinates(10, 90, 100, 50)
    keep_coordinates = _page_1_coordinates(10, 10, 100, 20)
    remove_coordinates = _page_1_coordinates(10, 100, 100, 20)
    empty_coordinates = _page_1_coordinates(10, 100, 0, 0)

    keep_token = LayoutToken('keep', coordinates=keep_coordinates)
    remove_token = LayoutToken('remove', coordinates=remove_coordinates)
    keep_graphic = LayoutGraphic(
        coordinates=keep_coordinates, graphic_type='keep-graphic')
    remove_graphic = LayoutGraphic(
        coordinates=remove_coordinates, graphic_type='remove-graphic')

    token_block = LayoutBlock(
        lines=[LayoutLine(tokens=[keep_token, remove_token])])
    page = LayoutPage(
        blocks=[token_block],
        graphics=[keep_graphic, remove_graphic],
        meta=LayoutPageMeta(
            page_number=page_coordinates.page_number,
            coordinates=page_coordinates))
    layout_document = LayoutDocument(pages=[page])

    layout_graphic = LayoutGraphic(
        coordinates=semantic_graphic_coordinates,
        graphic_type='new-graphic')
    no_coords_layout_graphic = LayoutGraphic(
        coordinates=empty_coordinates, graphic_type='empty-coords-graphic')
    semantic_graphics = [
        SemanticGraphic(layout_graphic=layout_graphic),
        SemanticGraphic(layout_graphic=no_coords_layout_graphic)
    ]

    result = get_layout_document_with_text_and_graphics_replaced_by_graphics(
        layout_document, semantic_graphics=semantic_graphics)

    result_graphics = result.pages[0].graphics
    LOGGER.debug('result.pages[0].graphics: %r', result_graphics)
    assert result_graphics[:-1] == [keep_graphic]
    replacement_graphic = result_graphics[-1]
    LOGGER.debug('result.pages[0].graphics[-1]: %r', replacement_graphic)
    assert replacement_graphic.graphic_type == layout_graphic.graphic_type
    assert replacement_graphic.coordinates == layout_graphic.coordinates
    remaining_tokens = list(result.pages[0].blocks[0].iter_all_tokens())
    assert remaining_tokens == [keep_token]
    related_tokens = list(
        replacement_graphic.related_block.iter_all_tokens())
    assert related_tokens == [keep_token, remove_token]
示例#2
0
def get_semantic_graphic_for_layout_graphic(
        layout_graphic: LayoutGraphic,
        extract_graphic_assets: bool) -> SemanticGraphic:
    """Wrap a layout graphic in a ``SemanticGraphic``.

    Args:
        layout_graphic: The layout graphic to wrap.
        extract_graphic_assets: Whether graphic assets are being extracted.

    Returns:
        A ``SemanticGraphic`` referencing ``layout_graphic``. Its
        ``relative_path`` is the base name of the graphic's local file path
        when that path is set and ``extract_graphic_assets`` is truthy,
        otherwise ``None``.
    """
    relative_path: Optional[str] = (
        os.path.basename(layout_graphic.local_file_path)
        if layout_graphic.local_file_path and extract_graphic_assets
        else None
    )
    return SemanticGraphic(
        layout_graphic=layout_graphic,
        relative_path=relative_path)
示例#3
0
 def iter_semantic_graphic_for_image(  # pylint: disable=too-many-locals
         self, image: PIL.Image.Image, extract_graphic_assets: bool,
         page_number: int,
         page: Optional[LayoutPage]) -> Iterable[SemanticGraphic]:
     """Yield one semantic graphic per 'Figure' instance predicted for image.

     The computer vision model is run once on ``image``. For each predicted
     figure bounding box:

     * the box is scaled by page size / image size when page coordinates
       are available (and truthy);
     * if an existing page graphic has similar coordinates (as decided by
       ``get_layout_graphic_with_similar_coordinates``), that graphic is
       wrapped and yielded instead of creating a new one;
     * otherwise a new ``'cv-figure'`` layout graphic is created from the
       scaled box. When ``extract_graphic_assets`` is truthy, the figure is
       cropped from ``image`` using the unscaled box and saved as
       ``figure-<page_number>-<figure_number>.png`` in ``self.temp_dir``.

     Args:
         image: The rendered page image passed to the model.
         extract_graphic_assets: Whether to write cropped figure images and
             populate relative paths.
         page_number: Page number used for logging, file naming and the
             coordinates of newly created graphics.
         page: The layout page providing coordinates and existing graphics,
             or ``None`` if not available.

     Yields:
         A ``SemanticGraphic`` for every detected figure, in model order.
     """
     LOGGER.debug('image size: %d x %d', image.width, image.height)
     if page is None:
         page_coordinates = None
         page_graphics = []
     else:
         page_coordinates = page.meta.coordinates
         page_graphics = page.graphics

     # Time only the model prediction itself.
     prediction_started = monotonic()
     prediction = self.computer_vision_model.predict_single(image)
     prediction_seconds = monotonic() - prediction_started

     figure_coordinates_list = [
         figure.get_bounding_box()
         for figure in prediction.get_instances_by_type_name('Figure')
     ]
     LOGGER.info(
         'cv result, took=%.3fs, page_number=%d, image_size=%dx%d, figure_coordinates_list=%r',
         prediction_seconds, page_number, image.width, image.height,
         figure_coordinates_list)

     for figure_number, figure_coordinates in enumerate(
             figure_coordinates_list, start=1):
         if page_coordinates:
             # Map from image space into page space.
             scaled_figure_coordinates = figure_coordinates.scale_by(
                 page_coordinates.width / image.width,
                 page_coordinates.height / image.height)
         else:
             scaled_figure_coordinates = figure_coordinates

         existing_graphic = get_layout_graphic_with_similar_coordinates(
             page_graphics=page_graphics,
             bounding_box=scaled_figure_coordinates,
             ignored_graphic_types=self.ignored_graphic_types)
         if existing_graphic is not None:
             # Prefer the graphic already present in the layout.
             yield get_semantic_graphic_for_layout_graphic(
                 existing_graphic,
                 extract_graphic_assets=extract_graphic_assets)
             continue

         local_image_path: Optional[str] = None
         relative_image_path: Optional[str] = None
         if extract_graphic_assets:
             local_image_path = os.path.join(
                 self.temp_dir, f'figure-{page_number}-{figure_number}.png')
             relative_image_path = os.path.basename(local_image_path)
             # The crop uses the unscaled box returned by the model.
             get_cropped_image(image, figure_coordinates).save(
                 local_image_path)

         cv_figure_coordinates = LayoutPageCoordinates(
             x=scaled_figure_coordinates.x,
             y=scaled_figure_coordinates.y,
             width=scaled_figure_coordinates.width,
             height=scaled_figure_coordinates.height,
             page_number=page_number)
         yield SemanticGraphic(
             layout_graphic=LayoutGraphic(
                 coordinates=cv_figure_coordinates,
                 graphic_type='cv-figure',
                 local_file_path=local_image_path),
             relative_path=relative_image_path)
示例#4
0
 def test_should_not_match_empty_graphic(self):
     """A graphic with zero width and height must not produce any match.

     The graphic shares position and page with the candidate content
     (both derive from ``COORDINATES_1``), only its size is zeroed.
     """
     zero_size_coordinates = COORDINATES_1._replace(width=0, height=0)
     empty_semantic_graphic_1 = SemanticGraphic(
         layout_graphic=LayoutGraphic(coordinates=zero_size_coordinates))
     candidate_semantic_content_1 = _get_semantic_content_for_page_coordinates(
         coordinates=COORDINATES_1)
     matcher = BoundingBoxDistanceGraphicMatcher()
     result = matcher.get_graphic_matches(
         semantic_graphic_list=[empty_semantic_graphic_1],
         candidate_semantic_content_list=[candidate_semantic_content_1])
     LOGGER.debug('result: %r', result)
     assert not result
示例#5
0
 def test_should_ignore_layout_graphic_without_related_block(self):
     """A graphic whose ``related_block`` is ``None`` stays unmatched."""
     graphic_without_block = LayoutGraphic(
         coordinates=FAR_AWAY_COORDINATES_1, related_block=None)
     semantic_graphic_1 = SemanticGraphic(
         layout_graphic=graphic_without_block)
     figure_label = SemanticLabel(
         layout_block=LayoutBlock.for_text('Figure 1'))
     candidate_semantic_content_1 = SemanticFigure([figure_label])
     matcher = GraphicRelatedBlockTextGraphicMatcher()
     result = matcher.get_graphic_matches(
         semantic_graphic_list=[semantic_graphic_1],
         candidate_semantic_content_list=[candidate_semantic_content_1])
     LOGGER.debug('result: %r', result)
     assert not result.graphic_matches
     assert result.unmatched_graphics == [semantic_graphic_1]
 def test_should_render_graphic_element(self):
     """A figure containing a graphic renders a ``tei:graphic`` child."""
     figure_children = [
         SemanticLabel(layout_block=LayoutBlock.for_text('Label 1')),
         SemanticCaption(layout_block=LayoutBlock.for_text('Caption 1')),
         SemanticGraphic(
             layout_graphic=LayoutGraphic(local_file_path='image1.png'))
     ]
     semantic_figure = SemanticFigure(figure_children, content_id='fig_0')
     result = _get_wrapped_figure_tei_element(semantic_figure)
     graphic_xpath = f'{FIGURE_XPATH}/tei:graphic'
     # A non-empty list means at least one graphic element was found.
     assert result.get_xpath_text_content_list(graphic_xpath)
示例#7
0
 def test_should_not_match_graphic_on_another_page(self):
     """Identical coordinates on the following page must not match."""
     next_page_coordinates = COORDINATES_1._replace(
         page_number=COORDINATES_1.page_number + 1)
     semantic_graphic_1 = SemanticGraphic(
         layout_graphic=LayoutGraphic(coordinates=next_page_coordinates))
     candidate_semantic_content_1 = _get_semantic_content_for_page_coordinates(
         coordinates=COORDINATES_1)
     matcher = BoundingBoxDistanceGraphicMatcher()
     result = matcher.get_graphic_matches(
         semantic_graphic_list=[semantic_graphic_1],
         candidate_semantic_content_list=[candidate_semantic_content_1])
     LOGGER.debug('result: %r', result)
     assert not result.graphic_matches
     assert result.unmatched_graphics == [semantic_graphic_1]
示例#8
0
 def test_should_not_match_further_away_graphic_to_same_semantic_content(
         self):
     """Only the nearby graphic is matched to the single candidate.

     Two additional graphics are placed 500 and 1000 units below the
     figure coordinates and listed around the nearby graphic, so the
     expected match does not depend on list order.
     """
     def _graphic_at(coordinates):
         return SemanticGraphic(
             layout_graphic=LayoutGraphic(coordinates=coordinates))

     semantic_graphic_1 = _graphic_at(GRAPHIC_ABOVE_FIGURE_COORDINATES_1)
     candidate_semantic_content_1 = _get_semantic_content_for_page_coordinates(
         coordinates=FIGURE_BELOW_GRAPHIC_COORDINATES_1)
     further_away_graphic_1 = _graphic_at(
         FIGURE_BELOW_GRAPHIC_COORDINATES_1.move_by(dy=500))
     further_away_graphic_2 = _graphic_at(
         FIGURE_BELOW_GRAPHIC_COORDINATES_1.move_by(dy=1000))
     all_graphics = [
         further_away_graphic_1, semantic_graphic_1, further_away_graphic_2
     ]
     matcher = BoundingBoxDistanceGraphicMatcher()
     result = matcher.get_graphic_matches(
         semantic_graphic_list=all_graphics,
         candidate_semantic_content_list=[candidate_semantic_content_1])
     LOGGER.debug('result: %r', result)
     assert len(result) == 1
     only_match = result.graphic_matches[0]
     assert only_match.semantic_graphic == semantic_graphic_1
     assert only_match.candidate_semantic_content == candidate_semantic_content_1
示例#9
0
 def test_should_ignore_layout_graphic_without_local_path(
         self, ocr_model_mock: MagicMock):
     """A graphic without a local file stays unmatched.

     The OCR mock is rigged so that asking the prediction result for its
     text raises ``RuntimeError``; the test therefore fails loudly if the
     matcher tries to read OCR text for this graphic.
     """
     ocr_prediction_mock = ocr_model_mock.predict_single.return_value
     ocr_prediction_mock.get_text.side_effect = RuntimeError
     graphic_without_file = LayoutGraphic(
         coordinates=FAR_AWAY_COORDINATES_1, local_file_path=None)
     semantic_graphic_1 = SemanticGraphic(
         layout_graphic=graphic_without_file)
     figure_label = SemanticLabel(
         layout_block=LayoutBlock.for_text('Figure 1'))
     candidate_semantic_content_1 = SemanticFigure([figure_label])
     matcher = OpticalCharacterRecognitionGraphicMatcher(
         ocr_model=ocr_model_mock)
     result = matcher.get_graphic_matches(
         semantic_graphic_list=[semantic_graphic_1],
         candidate_semantic_content_list=[candidate_semantic_content_1])
     LOGGER.debug('result: %r', result)
     assert not result.graphic_matches
     assert result.unmatched_graphics == [semantic_graphic_1]
示例#10
0
 def test_should_not_convert_pdf_to_jats_zip(  # pylint: disable=too-many-locals
     self,
     sciencebeam_parser_session: ScienceBeamParserSession,
     get_tei_for_semantic_document_mock: MagicMock,
     full_text_processor_class_mock: MagicMock,
     full_text_processor_mock: MagicMock,
     xslt_transformer_wrapper_mock: MagicMock,
     request_temp_path: Path
 ):
     """Request a JATS zip for a PDF source and inspect the archive.

     The mocked pipeline yields a semantic document with one graphic in
     its back section. The resulting zip is expected to contain
     ``jats.xml`` with the mocked JATS content and the graphic's image
     under its relative path, and the full text processor is expected to
     have been configured with ``extract_graphic_assets`` enabled.

     NOTE(review): the test name says "should_not" although every
     assertion checks a successful conversion - confirm the intended name.
     """
     # Files the mocked pipeline is expected to find on disk.
     expected_pdf_path = request_temp_path / 'test.pdf'
     expected_output_path = request_temp_path / TEMP_ALTO_XML_FILENAME
     graphic_local_file_path = request_temp_path / 'image1.png'
     graphic_relative_path = graphic_local_file_path.name
     expected_output_path.write_bytes(XML_CONTENT_1)
     graphic_local_file_path.write_bytes(IMAGE_DATA_1)

     # Canned TEI / JATS output for the mocked conversion steps.
     tei_document = TeiDocument(etree.fromstring(TEI_XML_CONTENT_1))
     get_tei_for_semantic_document_mock.return_value = tei_document
     jats_root = etree.fromstring(JATS_XML_CONTENT_1)
     xslt_transformer_wrapper_mock.return_value = jats_root

     semantic_document = SemanticDocument()
     back_graphic = SemanticGraphic(
         layout_graphic=LayoutGraphic(
             local_file_path=str(graphic_local_file_path)),
         relative_path=graphic_relative_path)
     semantic_document.back_section.add_content(back_graphic)
     full_text_processor_mock.get_semantic_document_for_layout_document.return_value = (
         semantic_document
     )

     source = sciencebeam_parser_session.get_source(
         str(expected_pdf_path), MediaTypes.PDF)
     result_file = source.get_local_file_for_response_media_type(
         MediaTypes.JATS_ZIP)

     with ZipFile(result_file, 'r') as zip_file:
         jats_xml_data = zip_file.read('jats.xml')
         assert jats_xml_data == JATS_XML_CONTENT_1
         image_data = zip_file.read(graphic_relative_path)
         assert image_data == IMAGE_DATA_1

     # call_args[1] holds the keyword arguments of the last call.
     full_text_processor_config = (
         full_text_processor_class_mock.call_args[1]['config'])
     assert full_text_processor_config.extract_graphic_assets is True
示例#11
0
 def test_should_render_graphic_element_with_url(self):
     """The graphic's ``url`` attribute is its relative path.

     The local file path differs from the relative path so the test can
     tell which of the two ends up in the rendered attribute.
     """
     layout_graphic = LayoutGraphic(
         local_file_path='image1.png', graphic_type='svg')
     semantic_graphic = SemanticGraphic(
         relative_path='rel-image1.png', layout_graphic=layout_graphic)
     result = _get_wrapped_graphic_tei_element(semantic_graphic)
     graphic_elements = result.xpath_nodes('//tei:graphic')
     assert len(graphic_elements) == 1
     rendered_url = graphic_elements[0].attrib.get('url')
     assert rendered_url == 'rel-image1.png'
 def test_should_unmatched_graphics_to_back(self):
     """Graphics in an 'unmatched_graphics' note are rendered with a url.

     The note is added to the back section; the TEI output is expected to
     contain a ``tei:graphic`` below a ``tei:note`` of that type whose
     ``url`` is the graphic's relative path.
     """
     unmatched_graphic = SemanticGraphic(
         layout_graphic=LayoutGraphic(coordinates=COORDINATES_1),
         relative_path='image1.svg')
     unmatched_note = SemanticMixedNote(
         [unmatched_graphic], note_type='unmatched_graphics')
     semantic_document = SemanticDocument()
     semantic_document.back_section.add_content(unmatched_note)
     tei_document = get_tei_for_semantic_document(semantic_document)
     LOGGER.debug('tei xml: %r', etree.tostring(tei_document.root))
     graphics_xpath = '//tei:note[@type="unmatched_graphics"]//tei:graphic'
     assert tei_document.xpath_nodes(graphics_xpath)
     graphic_urls = tei_document.get_xpath_text_content_list(
         f'{graphics_xpath}/@url')
     assert graphic_urls == ['image1.svg']
示例#13
0
 def test_should_match_graphic_of_specific(self, graphic_type: str,
                                           should_match: bool):
     """Matching by bounding box depends on the graphic's type.

     Args:
         graphic_type: The graphic type assigned to the layout graphic.
         should_match: Whether a match is expected for that type.
     """
     layout_graphic = LayoutGraphic(
         coordinates=GRAPHIC_ABOVE_FIGURE_COORDINATES_1,
         graphic_type=graphic_type)
     semantic_graphic_1 = SemanticGraphic(layout_graphic=layout_graphic)
     candidate_semantic_content_1 = _get_semantic_content_for_page_coordinates(
         coordinates=FIGURE_BELOW_GRAPHIC_COORDINATES_1)
     matcher = BoundingBoxDistanceGraphicMatcher()
     result = matcher.get_graphic_matches(
         semantic_graphic_list=[semantic_graphic_1],
         candidate_semantic_content_list=[candidate_semantic_content_1])
     LOGGER.debug('result: %r', result)
     if not should_match:
         assert not result.graphic_matches
         assert result.unmatched_graphics == [semantic_graphic_1]
         return
     assert len(result) == 1
     only_match = result.graphic_matches[0]
     assert only_match.semantic_graphic == semantic_graphic_1
示例#14
0
 def test_should_match_graphic_above_semantic_content(self):
     """A graphic directly above a figure is matched to that figure.

     The expected candidate is listed between two far-away candidates so
     that the match cannot be explained by list position.
     """
     semantic_graphic_1 = SemanticGraphic(
         layout_graphic=LayoutGraphic(
             coordinates=GRAPHIC_ABOVE_FIGURE_COORDINATES_1))
     candidate_semantic_content_1 = _get_semantic_content_for_page_coordinates(
         coordinates=FIGURE_BELOW_GRAPHIC_COORDINATES_1)
     far_away_candidate_1 = _get_semantic_content_for_page_coordinates(
         coordinates=FAR_AWAY_COORDINATES_1)
     far_away_candidate_2 = _get_semantic_content_for_page_coordinates(
         coordinates=FAR_AWAY_COORDINATES_2)
     matcher = BoundingBoxDistanceGraphicMatcher()
     result = matcher.get_graphic_matches(
         semantic_graphic_list=[semantic_graphic_1],
         candidate_semantic_content_list=[
             far_away_candidate_1,
             candidate_semantic_content_1,
             far_away_candidate_2
         ])
     LOGGER.debug('result: %r', result)
     assert len(result) == 1
     only_match = result.graphic_matches[0]
     assert only_match.semantic_graphic == semantic_graphic_1
     assert only_match.candidate_semantic_content == candidate_semantic_content_1
示例#15
0
 def test_should_render_graphic_element_with_coords(self):
     """Coordinates and type are rendered; no url without a relative path.

     The semantic graphic has a local file path but no relative path, and
     the rendered element is expected to carry no (non-empty) ``url``.
     """
     layout_graphic = LayoutGraphic(
         local_file_path='image1.png',
         coordinates=COORDINATES_1,
         graphic_type='svg')
     semantic_graphic = SemanticGraphic(layout_graphic=layout_graphic)
     result = _get_wrapped_graphic_tei_element(semantic_graphic)
     graphic_elements = result.xpath_nodes('//tei:graphic')
     assert len(graphic_elements) == 1
     graphic_attributes = graphic_elements[0].attrib
     expected_coords = format_coordinates(COORDINATES_1)
     assert graphic_attributes.get('coords') == expected_coords
     assert graphic_attributes.get('type') == 'svg'
     assert not graphic_attributes.get('url')
示例#16
0
 def test_should_match_based_on_figure_label(self, related_text: str,
                                             figure_label: str,
                                             should_match: bool):
     """Match a graphic to a figure via the text of its related block.

     The graphic uses far-away coordinates and the figure is built from a
     label only, so the test exercises matching on text.

     Args:
         related_text: Text of the block related to the graphic.
         figure_label: Label text of the candidate figure.
         should_match: Whether the two texts are expected to match.
     """
     layout_graphic = LayoutGraphic(
         coordinates=FAR_AWAY_COORDINATES_1,
         related_block=LayoutBlock.for_text(related_text))
     semantic_graphic_1 = SemanticGraphic(layout_graphic=layout_graphic)
     label = SemanticLabel(layout_block=LayoutBlock.for_text(figure_label))
     candidate_semantic_content_1 = SemanticFigure([label])
     matcher = GraphicRelatedBlockTextGraphicMatcher()
     result = matcher.get_graphic_matches(
         semantic_graphic_list=[semantic_graphic_1],
         candidate_semantic_content_list=[candidate_semantic_content_1])
     LOGGER.debug('result: %r', result)
     if not should_match:
         assert not result.graphic_matches
         assert result.unmatched_graphics == [semantic_graphic_1]
         return
     assert len(result) == 1
     only_match = result.graphic_matches[0]
     assert only_match.semantic_graphic == semantic_graphic_1
示例#17
0
 def test_should_match_based_on_figure_label(
         self, ocr_model_mock: MagicMock, ocr_text: str, figure_label: str,
         should_match: bool, tmp_path: Path):
     """Match a graphic to a figure via the text the OCR model returns.

     A small image file is written so the graphic has a local file path;
     the OCR result itself is supplied by the mock.

     Args:
         ocr_model_mock: Mocked OCR model handed to the matcher.
         ocr_text: Text the mocked OCR prediction returns.
         figure_label: Label text of the candidate figure.
         should_match: Whether the two texts are expected to match.
         tmp_path: Temporary directory for the image file.
     """
     local_graphic_path = tmp_path / 'image.png'
     placeholder_image = PIL.Image.new('RGB', (10, 10), (0, 1, 2))
     placeholder_image.save(local_graphic_path)
     ocr_prediction_mock = ocr_model_mock.predict_single.return_value
     ocr_prediction_mock.get_text.return_value = ocr_text
     layout_graphic = LayoutGraphic(
         coordinates=FAR_AWAY_COORDINATES_1,
         local_file_path=str(local_graphic_path))
     semantic_graphic_1 = SemanticGraphic(layout_graphic=layout_graphic)
     label = SemanticLabel(layout_block=LayoutBlock.for_text(figure_label))
     candidate_semantic_content_1 = SemanticFigure([label])
     matcher = OpticalCharacterRecognitionGraphicMatcher(
         ocr_model=ocr_model_mock)
     result = matcher.get_graphic_matches(
         semantic_graphic_list=[semantic_graphic_1],
         candidate_semantic_content_list=[candidate_semantic_content_1])
     LOGGER.debug('result: %r', result)
     if not should_match:
         assert not result.graphic_matches
         assert result.unmatched_graphics == [semantic_graphic_1]
         return
     assert len(result) == 1
     only_match = result.graphic_matches[0]
     assert only_match.semantic_graphic == semantic_graphic_1