예제 #1
0
 def update_section_with_entity_blocks(
         self,
         parent_section: SemanticSection,
         entity_tokens: Iterable[Tuple[str, LayoutBlock]],
         section_type: str = SemanticSectionTypes.OTHER):
     """Append the semantic content extracted from `entity_tokens` to `parent_section`.

     The extractor returned by `self.get_semantic_extractor()` converts the
     `(name, layout block)` pairs into semantic content; each produced item
     is added to `parent_section` in the order it is produced.

     Args:
         parent_section: section that receives the extracted content.
         entity_tokens: pairs of entity name and the layout block it labels.
         section_type: forwarded unchanged to the extractor.
     """
     extractor = self.get_semantic_extractor()
     extracted_content = extractor.iter_semantic_content_for_entity_blocks(
         entity_tokens=entity_tokens,
         section_type=section_type
     )
     for content_item in extracted_content:
         parent_section.add_content(content_item)
예제 #2
0
 def test_should_add_note_for_other_text_to_body(self):
     """Text tagged 'O' should surface as a 'fulltext:other' note.

     The '<paragraph>' text is expected in the only child section of the
     section built from the extracted content.
     """
     extractor = FullTextSemanticExtractor()
     entity_blocks = [
         ('O', LayoutBlock.for_text(SECTION_PARAGRAPH_1)),
         ('<paragraph>', LayoutBlock.for_text(SECTION_PARAGRAPH_2))
     ]
     extracted_content = list(
         extractor.iter_semantic_content_for_entity_blocks(entity_blocks)
     )
     body = SemanticSection(extracted_content)

     other_notes = body.get_notes_text_list('fulltext:other')
     assert other_notes == [SECTION_PARAGRAPH_1]

     child_sections = body.sections
     assert len(child_sections) == 1
     paragraph_texts = child_sections[0].get_paragraph_text_list()
     assert paragraph_texts == [SECTION_PARAGRAPH_2]
예제 #3
0
 def get_section_for_entity_blocks(
         self,
         entity_tokens: Iterable[Tuple[str, LayoutBlock]]
 ) -> SemanticSection:
     """Return a new `SemanticSection` populated from `entity_tokens`.

     Delegates to `update_section_with_entity_blocks` without passing a
     section type, so that method's default applies.
     """
     section = SemanticSection()
     self.update_section_with_entity_blocks(section, entity_tokens)
     return section
 def test_should_add_asset_citation_for_resolved_reference(self):
     """A citation targeting an existing reference id becomes a `bibr` ref.

     The citation's `target_content_id` matches the `content_id` of a
     reference in the same section; the TEI paragraph should contain a
     `ref` of type "bibr" pointing at `#b0`.
     """
     semantic_document = SemanticDocument()
     citing_paragraph = SemanticParagraph([
         SemanticTextContentWrapper(layout_block=LayoutBlock.for_text('See')),
         SemanticReferenceCitation(
             layout_block=LayoutBlock.for_text('Ref 1'),
             target_content_id='b0'
         )
     ])
     reference_list = SemanticReferenceList([
         SemanticReference(
             [SemanticLabel(layout_block=LayoutBlock.for_text('1'))],
             content_id='b0'
         )
     ])
     semantic_document.body_section.add_content(
         SemanticSection([citing_paragraph, reference_list])
     )

     tei_document = get_tei_for_semantic_document(semantic_document)
     LOGGER.debug('tei xml: %r', etree.tostring(tei_document.root))

     paragraph_xpath = '//tei:body/tei:div/tei:p'
     citation_xpath = paragraph_xpath + '/tei:ref[@type="bibr"]'
     paragraph_texts = tei_document.get_xpath_text_content_list(paragraph_xpath)
     assert paragraph_texts == ['See Ref 1']
     citation_texts = tei_document.get_xpath_text_content_list(citation_xpath)
     assert citation_texts == ['Ref 1']
     citation_targets = tei_document.get_xpath_text_content_list(
         citation_xpath + '/@target'
     )
     assert citation_targets == ['#b0']
 def test_should_add_section_figures_to_back(self):
     """A figure in a back section is rendered directly under the annex div.

     Also checks that no nested `div` is created inside the annex.
     """
     semantic_document = SemanticDocument()
     figure = SemanticFigure(
         [
             SemanticLabel(layout_block=LayoutBlock.for_text('Label 1')),
             SemanticCaption(layout_block=LayoutBlock.for_text('Caption 1'))
         ],
         content_id='fig_0'
     )
     semantic_document.back_section.add_content(SemanticSection([figure]))

     tei_document = get_tei_for_semantic_document(semantic_document)
     LOGGER.debug('tei xml: %r', etree.tostring(tei_document.root))

     annex_xpath = '//tei:back/tei:div[@type="annex"]'
     figure_xpath = annex_xpath + '/tei:figure[not(contains(@type, "table"))]'
     # (xpath suffix below the figure element, expected text content list)
     expectations = (
         ('/tei:head', ['Label 1']),
         ('/tei:label', ['Label 1']),
         ('/tei:figDesc', ['Caption 1']),
         ('/@xml:id', ['fig_0']),
     )
     for suffix, expected_texts in expectations:
         actual_texts = tei_document.get_xpath_text_content_list(
             figure_xpath + suffix
         )
         assert actual_texts == expected_texts
     assert not tei_document.xpath(annex_xpath + '/tei:div')
 def test_should_add_raw_equation_with_label_to_paragraph(self):
     """A labelled raw equation inside a paragraph is rendered as a `formula`.

     The formula is expected under the section `div` with its label nested
     inside, while the paragraph keeps only the plain text.
     """
     # to be consistent with Java GROBID
     semantic_document = SemanticDocument()
     raw_equation = SemanticRawEquation([
         SemanticRawEquationContent(
             layout_block=LayoutBlock.for_text('Equation 1')
         ),
         SemanticLabel(layout_block=LayoutBlock.for_text('(1)'))
     ])
     paragraph = SemanticParagraph([
         SemanticTextContentWrapper(layout_block=LayoutBlock.for_text('Next')),
         raw_equation
     ])
     semantic_document.body_section.add_content(SemanticSection([paragraph]))

     tei_document = get_tei_for_semantic_document(semantic_document)
     LOGGER.debug('tei xml: %r', etree.tostring(tei_document.root))

     section_xpath = '//tei:body/tei:div'
     formula_xpath = section_xpath + '/tei:formula'
     formula_texts = tei_document.get_xpath_text_content_list(formula_xpath)
     assert formula_texts == ['Equation 1 (1)']
     label_texts = tei_document.get_xpath_text_content_list(
         formula_xpath + '/tei:label'
     )
     assert label_texts == ['(1)']
     paragraph_texts = tei_document.get_xpath_text_content_list(
         section_xpath + '/tei:p'
     )
     assert paragraph_texts == ['Next']
 def test_should_add_section_tables_to_body(self):
     """A table in a body section is rendered as a table figure under the body.

     Also checks that no section `div` is created in the body.
     """
     semantic_document = SemanticDocument()
     table = SemanticTable(
         [
             SemanticLabel(layout_block=LayoutBlock.for_text('Label 1')),
             SemanticCaption(layout_block=LayoutBlock.for_text('Caption 1'))
         ],
         content_id='tab_0'
     )
     semantic_document.body_section.add_content(SemanticSection([table]))

     tei_document = get_tei_for_semantic_document(semantic_document)
     LOGGER.debug('tei xml: %r', etree.tostring(tei_document.root))

     table_xpath = '//tei:body/tei:figure[@type="table"]'
     # (xpath suffix below the table element, expected text content list)
     expectations = (
         ('/tei:head', ['Label 1']),
         ('/tei:label', ['Label 1']),
         ('/tei:figDesc', ['Caption 1']),
         ('/@xml:id', ['tab_0']),
     )
     for suffix, expected_texts in expectations:
         actual_texts = tei_document.get_xpath_text_content_list(
             table_xpath + suffix
         )
         assert actual_texts == expected_texts
     assert not tei_document.xpath('//tei:body/tei:div')
예제 #8
0
 def iter_semantic_content_for_entity_blocks(  # noqa pylint: disable=arguments-differ, too-many-branches
     self,
     entity_tokens: Iterable[Tuple[str, LayoutBlock]],
     section_type: str = SemanticSectionTypes.OTHER,
     **kwargs
 ) -> Iterable[SemanticContentWrapper]:
     """Group tagged layout blocks into sections and yield the resulting content.

     Tokens are processed in order while tracking the current section,
     paragraph and raw equation:

     - 'O': stored as a 'fulltext:other' note on the current section, or
       yielded as a standalone `SemanticNote` when there is no current
       section.
     - '<section>': yields the current section (if any) and starts a new one
       whose first content is the heading from `get_semantic_heading`.
     - names in `SIMPLE_SEMANTIC_CONTENT_CLASS_BY_TAG`: added directly to the
       current section.
     - '<equation>' / '<equation_label>': collected into a
       `SemanticRawEquation` inside the current paragraph.
     - anything else: handed to `add_paragraph_content` for the current
       paragraph.

     A section is created on demand for any content other than 'O' and
     '<section>'. The last section, if any, is yielded after all tokens.

     Args:
         entity_tokens: pairs of entity name and layout block; consumed once.
         section_type: type given to every section created here.
         **kwargs: accepted but not used by this implementation.
     """
     token_list = list(entity_tokens)
     LOGGER.debug('entity_tokens: %s', token_list)
     current_section: Optional[SemanticSection] = None
     current_paragraph: Optional[SemanticParagraph] = None
     current_equation: Optional[SemanticRawEquation] = None
     last_seen_tag: Optional[str] = None
     for tag, block in token_list:
         if LOGGER.isEnabledFor(logging.DEBUG):
             LOGGER.debug('entity_block: %r, %r', tag, block.text)
         # remember the tag of the token before this one, then record this one
         preceding_tag, last_seen_tag = last_seen_tag, tag

         if tag in {'O'}:
             # NOTE: despite the log wording, the block is kept as a note
             LOGGER.debug('ignoring content (%r): %r', tag, block)
             if tag == 'O':
                 note_type = 'fulltext:other'
             else:
                 note_type = tag
             if not current_section:
                 yield SemanticNote(
                     layout_block=block,
                     note_type=note_type
                 )
             else:
                 current_section.add_note(block, note_type=note_type)
             continue

         if tag == '<section>':
             # a heading closes the open paragraph / equation and section
             current_paragraph = None
             current_equation = None
             if current_section:
                 yield current_section
             current_section = SemanticSection(section_type=section_type)
             current_section.add_content(self.get_semantic_heading(block))
             continue

         if not current_section:
             current_section = SemanticSection(section_type=section_type)

         if tag in SIMPLE_SEMANTIC_CONTENT_CLASS_BY_TAG:
             simple_content = self.get_semantic_content_for_entity_name(
                 tag, layout_block=block
             )
             current_section.add_content(simple_content)
             continue

         # treat everything else as paragraph content;
         # two directly consecutive '<paragraph>' tokens start a new paragraph
         is_repeated_paragraph = (
             tag == '<paragraph>' and preceding_tag == '<paragraph>'
         )
         if not current_paragraph or is_repeated_paragraph:
             current_paragraph = current_section.add_new_paragraph()

         if tag not in {'<equation>', '<equation_label>'}:
             # regular paragraph content ends any raw equation in progress
             current_equation = None
             self.add_paragraph_content(current_paragraph, tag, block)
             continue

         equation_child = self.get_raw_equation_child_semantic_content(
             tag, layout_block=block
         )
         if (
             isinstance(equation_child, SemanticRawEquationContent)
             and current_equation
             and current_equation.has_type(SemanticRawEquationContent)
         ):
             LOGGER.debug('already has equation content, start new one')
             current_equation = None
         if not current_equation:
             current_equation = SemanticRawEquation()
             current_paragraph.add_content(current_equation)
         current_equation.add_content(equation_child)

     if current_section:
         yield current_section