예제 #1
0
 def iter_semantic_content_for_entity_blocks(  # pylint: disable=arguments-differ
         self, entity_tokens: Iterable[Tuple[str, LayoutBlock]],
         **kwargs) -> Iterable[SemanticContentWrapper]:
     """Yield at most one figure that collects the content of all entities.

     Every ``(name, layout_block)`` pair is converted through
     ``get_semantic_content_for_entity_name`` and added to one shared
     ``SemanticFigure``. Nothing is yielded when the figure is falsy at
     the end (for example when no entity tokens were supplied).

     ``**kwargs`` is accepted but not used by this implementation.
     """
     # Materialise first so the tokens can be both logged and iterated.
     named_blocks = list(entity_tokens)
     LOGGER.debug('entity_tokens: %s', named_blocks)
     current_figure: Optional[SemanticFigure] = None
     for entity_name, entity_block in named_blocks:
         # Create the figure when there is none yet (or it is still falsy).
         current_figure = current_figure or SemanticFigure()
         current_figure.add_content(
             self.get_semantic_content_for_entity_name(
                 entity_name, layout_block=entity_block))
     if not current_figure:
         return
     yield current_figure
 def test_should_add_asset_citation_for_resolved_figure(self):
     """A figure citation in a body paragraph becomes a ``tei:ref``.

     The section holds a paragraph ("See" + citation "Fig 1" targeting
     ``fig_0``) and a figure with content id ``fig_0``. The test checks the
     paragraph text, the ``ref`` text and the ``ref`` target attribute.
     """
     document = SemanticDocument()
     paragraph = SemanticParagraph([
         SemanticTextContentWrapper(
             layout_block=LayoutBlock.for_text('See')),
         SemanticFigureCitation(
             layout_block=LayoutBlock.for_text('Fig 1'),
             target_content_id='fig_0')
     ])
     figure = SemanticFigure(
         [SemanticLabel(layout_block=LayoutBlock.for_text('Figure 1'))],
         content_id='fig_0')
     document.body_section.add_content(SemanticSection([paragraph, figure]))

     tei_document = get_tei_for_semantic_document(document)
     LOGGER.debug('tei xml: %r', etree.tostring(tei_document.root))

     paragraph_xpath = '//tei:body/tei:div/tei:p'
     citation_xpath = '//tei:body/tei:div/tei:p/tei:ref[@type="figure"]'
     citation_target_xpath = (
         '//tei:body/tei:div/tei:p/tei:ref[@type="figure"]/@target')

     paragraph_texts = tei_document.get_xpath_text_content_list(
         paragraph_xpath)
     assert paragraph_texts == ['See Fig 1']
     citation_texts = tei_document.get_xpath_text_content_list(
         citation_xpath)
     assert citation_texts == ['Fig 1']
     citation_targets = tei_document.get_xpath_text_content_list(
         citation_target_xpath)
     assert citation_targets == ['#fig_0']
 def test_should_add_section_figures_to_back(self):
     """A figure in a back section is rendered inside the annex ``div``.

     Checks the figure's head, label, description and ``xml:id``, and that
     the annex ``div`` contains no nested ``tei:div``.
     """
     label = SemanticLabel(layout_block=LayoutBlock.for_text('Label 1'))
     caption = SemanticCaption(layout_block=LayoutBlock.for_text('Caption 1'))
     figure = SemanticFigure([label, caption], content_id='fig_0')
     document = SemanticDocument()
     document.back_section.add_content(SemanticSection([figure]))

     tei_document = get_tei_for_semantic_document(document)
     LOGGER.debug('tei xml: %r', etree.tostring(tei_document.root))

     # Select figures only, excluding anything typed as a table.
     figure_xpath = (
         '//tei:back/tei:div[@type="annex"]/tei:figure[not(contains(@type, "table"))]'
     )
     get_texts = tei_document.get_xpath_text_content_list
     assert get_texts(f'{figure_xpath}/tei:head') == ['Label 1']
     assert get_texts(f'{figure_xpath}/tei:label') == ['Label 1']
     assert get_texts(f'{figure_xpath}/tei:figDesc') == ['Caption 1']
     assert get_texts(f'{figure_xpath}/@xml:id') == ['fig_0']
     nested_divs = tei_document.xpath(
         '//tei:back/tei:div[@type="annex"]/tei:div')
     assert not nested_divs
예제 #4
0
 def test_should_ignore_layout_graphic_without_related_block(self):
     """A graphic with ``related_block=None`` yields no match.

     The graphic must instead be reported in ``unmatched_graphics``.
     """
     layout_graphic = LayoutGraphic(
         coordinates=FAR_AWAY_COORDINATES_1, related_block=None)
     graphic = SemanticGraphic(layout_graphic=layout_graphic)
     figure_label = SemanticLabel(
         layout_block=LayoutBlock.for_text('Figure 1'))
     candidate_figure = SemanticFigure([figure_label])

     matcher = GraphicRelatedBlockTextGraphicMatcher()
     result = matcher.get_graphic_matches(
         semantic_graphic_list=[graphic],
         candidate_semantic_content_list=[candidate_figure])
     LOGGER.debug('result: %r', result)

     assert not result.graphic_matches
     assert result.unmatched_graphics == [graphic]
 def test_should_render_graphic_element(self):
     """A figure containing a graphic renders a ``tei:graphic`` element.

     Only checks that the xpath lookup returns a non-empty list.
     """
     label = SemanticLabel(layout_block=LayoutBlock.for_text('Label 1'))
     caption = SemanticCaption(layout_block=LayoutBlock.for_text('Caption 1'))
     graphic = SemanticGraphic(
         layout_graphic=LayoutGraphic(local_file_path='image1.png'))
     figure = SemanticFigure([label, caption, graphic], content_id='fig_0')

     wrapped_element = _get_wrapped_figure_tei_element(figure)

     graphic_texts = wrapped_element.get_xpath_text_content_list(
         f'{FIGURE_XPATH}/tei:graphic')
     assert graphic_texts
예제 #6
0
 def test_should_ignore_layout_graphic_without_local_path(
         self, ocr_model_mock: MagicMock):
     """A graphic with ``local_file_path=None`` yields no OCR-based match.

     The mocked OCR result's ``get_text`` is configured to raise
     ``RuntimeError``. The graphic must be reported in
     ``unmatched_graphics``.
     """
     ocr_result_mock = ocr_model_mock.predict_single.return_value
     ocr_result_mock.get_text.side_effect = RuntimeError

     layout_graphic = LayoutGraphic(
         coordinates=FAR_AWAY_COORDINATES_1, local_file_path=None)
     graphic = SemanticGraphic(layout_graphic=layout_graphic)
     figure_label = SemanticLabel(
         layout_block=LayoutBlock.for_text('Figure 1'))
     candidate_figure = SemanticFigure([figure_label])

     matcher = OpticalCharacterRecognitionGraphicMatcher(
         ocr_model=ocr_model_mock)
     result = matcher.get_graphic_matches(
         semantic_graphic_list=[graphic],
         candidate_semantic_content_list=[candidate_figure])
     LOGGER.debug('result: %r', result)

     assert not result.graphic_matches
     assert result.unmatched_graphics == [graphic]
 def test_should_render_label_description_and_id(self):
     """A figure's label, caption and content id appear in the TEI element.

     The label text is expected in both ``tei:head`` and ``tei:label``, the
     caption in ``tei:figDesc`` and the content id in ``@xml:id``.
     """
     label = SemanticLabel(layout_block=LayoutBlock.for_text('Label 1'))
     caption = SemanticCaption(layout_block=LayoutBlock.for_text('Caption 1'))
     figure = SemanticFigure([label, caption], content_id='fig_0')

     wrapped_element = _get_wrapped_figure_tei_element(figure)

     get_texts = wrapped_element.get_xpath_text_content_list
     assert get_texts(f'{FIGURE_XPATH}/tei:head') == ['Label 1']
     assert get_texts(f'{FIGURE_XPATH}/tei:label') == ['Label 1']
     assert get_texts(f'{FIGURE_XPATH}/tei:figDesc') == ['Caption 1']
     assert get_texts(f'{FIGURE_XPATH}/@xml:id') == ['fig_0']
예제 #8
0
 def test_should_match_based_on_figure_label(self, related_text: str,
                                             figure_label: str,
                                             should_match: bool):
     """Match a graphic to a figure using the graphic's related block text.

     The arguments are supplied from outside this method (presumably by a
     parametrize decorator not shown here). When ``should_match`` is true,
     exactly one match for the graphic is expected; otherwise the graphic
     must be reported as unmatched.
     """
     layout_graphic = LayoutGraphic(
         coordinates=FAR_AWAY_COORDINATES_1,
         related_block=LayoutBlock.for_text(related_text))
     graphic = SemanticGraphic(layout_graphic=layout_graphic)
     label = SemanticLabel(layout_block=LayoutBlock.for_text(figure_label))
     candidate_figure = SemanticFigure([label])

     matcher = GraphicRelatedBlockTextGraphicMatcher()
     result = matcher.get_graphic_matches(
         semantic_graphic_list=[graphic],
         candidate_semantic_content_list=[candidate_figure])
     LOGGER.debug('result: %r', result)

     if not should_match:
         assert not result.graphic_matches
         assert result.unmatched_graphics == [graphic]
         return
     assert len(result) == 1
     matched = result.graphic_matches[0]
     assert matched.semantic_graphic == graphic
예제 #9
0
 def test_should_match_based_on_figure_label(
         self, ocr_model_mock: MagicMock, ocr_text: str, figure_label: str,
         should_match: bool, tmp_path: Path):
     """Match a graphic to a figure using text returned by the OCR model.

     A small image file is written under ``tmp_path`` and the mocked OCR
     model is set to return ``ocr_text`` for it. When ``should_match`` is
     true, exactly one match for the graphic is expected; otherwise the
     graphic must be reported as unmatched.
     """
     image_path = tmp_path / 'image.png'
     image = PIL.Image.new('RGB', (10, 10), (0, 1, 2))
     image.save(image_path)
     ocr_result_mock = ocr_model_mock.predict_single.return_value
     ocr_result_mock.get_text.return_value = ocr_text

     layout_graphic = LayoutGraphic(
         coordinates=FAR_AWAY_COORDINATES_1,
         local_file_path=str(image_path))
     graphic = SemanticGraphic(layout_graphic=layout_graphic)
     label = SemanticLabel(layout_block=LayoutBlock.for_text(figure_label))
     candidate_figure = SemanticFigure([label])

     matcher = OpticalCharacterRecognitionGraphicMatcher(
         ocr_model=ocr_model_mock)
     result = matcher.get_graphic_matches(
         semantic_graphic_list=[graphic],
         candidate_semantic_content_list=[candidate_figure])
     LOGGER.debug('result: %r', result)

     if not should_match:
         assert not result.graphic_matches
         assert result.unmatched_graphics == [graphic]
         return
     assert len(result) == 1
     matched = result.graphic_matches[0]
     assert matched.semantic_graphic == graphic
예제 #10
0
def _get_semantic_content_for_page_coordinates(
        coordinates: LayoutPageCoordinates) -> SemanticContentWrapper:
    """Build a figure whose only token sits at the given coordinates.

    The token carries the placeholder text ``'dummy'``.
    """
    placeholder_token = LayoutToken(text='dummy', coordinates=coordinates)
    placeholder_block = LayoutBlock.for_tokens([placeholder_token])
    return SemanticFigure(layout_block=placeholder_block)