def update_section_with_entity_blocks(
    self,
    parent_section: SemanticSection,
    entity_tokens: Iterable[Tuple[str, LayoutBlock]],
    section_type: str = SemanticSectionTypes.OTHER
):
    """Add the semantic content derived from entity blocks to a section.

    The entity tokens are passed to the extractor returned by
    `get_semantic_extractor()`; every item that extractor yields is
    appended to `parent_section`, in the order it is produced.

    Args:
        parent_section: the section receiving the extracted content.
        entity_tokens: pairs of (tag name, layout block).
        section_type: forwarded unchanged to the extractor.
    """
    extractor = self.get_semantic_extractor()
    extracted_content_iterable = (
        extractor.iter_semantic_content_for_entity_blocks(
            entity_tokens=entity_tokens,
            section_type=section_type
        )
    )
    for extracted_content in extracted_content_iterable:
        parent_section.add_content(extracted_content)
def test_should_add_note_for_other_text_to_body(self):
    """An 'O' block ahead of any section becomes a 'fulltext:other' note.

    The following '<paragraph>' block is expected to end up in a single
    child section.
    """
    extractor = FullTextSemanticExtractor()
    entity_blocks = [
        ('O', LayoutBlock.for_text(SECTION_PARAGRAPH_1)),
        ('<paragraph>', LayoutBlock.for_text(SECTION_PARAGRAPH_2))
    ]
    semantic_content_list = list(
        extractor.iter_semantic_content_for_entity_blocks(entity_blocks)
    )
    parent_section = SemanticSection(semantic_content_list)

    other_note_texts = parent_section.get_notes_text_list('fulltext:other')
    assert other_note_texts == [SECTION_PARAGRAPH_1]

    child_sections = parent_section.sections
    assert len(child_sections) == 1
    paragraph_texts = child_sections[0].get_paragraph_text_list()
    assert paragraph_texts == [SECTION_PARAGRAPH_2]
def get_section_for_entity_blocks(
    self,
    entity_tokens: Iterable[Tuple[str, LayoutBlock]]
) -> SemanticSection:
    """Return a new section populated from the given entity blocks.

    A fresh `SemanticSection` is created and filled by
    `update_section_with_entity_blocks` (using that method's default
    section type).
    """
    new_section = SemanticSection()
    self.update_section_with_entity_blocks(new_section, entity_tokens)
    return new_section
def test_should_add_asset_citation_for_resolved_reference(self):
    """A reference citation with a target id is rendered as a 'bibr' ref.

    The fixture is a body section holding a paragraph ('See' followed by a
    citation targeting 'b0') and a reference list containing the reference
    with content id 'b0'.
    """
    semantic_document = SemanticDocument()
    body_section = semantic_document.body_section
    citing_paragraph = SemanticParagraph([
        SemanticTextContentWrapper(
            layout_block=LayoutBlock.for_text('See')
        ),
        SemanticReferenceCitation(
            layout_block=LayoutBlock.for_text('Ref 1'),
            target_content_id='b0'
        )
    ])
    reference_list = SemanticReferenceList([
        SemanticReference(
            [SemanticLabel(layout_block=LayoutBlock.for_text('1'))],
            content_id='b0'
        )
    ])
    body_section.add_content(
        SemanticSection([citing_paragraph, reference_list])
    )

    tei_document = get_tei_for_semantic_document(semantic_document)
    LOGGER.debug('tei xml: %r', etree.tostring(tei_document.root))

    paragraph_texts = tei_document.get_xpath_text_content_list(
        '//tei:body/tei:div/tei:p'
    )
    assert paragraph_texts == ['See Ref 1']

    citation_texts = tei_document.get_xpath_text_content_list(
        '//tei:body/tei:div/tei:p/tei:ref[@type="bibr"]'
    )
    assert citation_texts == ['Ref 1']

    # the target attribute is the referenced content id prefixed with '#'
    citation_targets = tei_document.get_xpath_text_content_list(
        '//tei:body/tei:div/tei:p/tei:ref[@type="bibr"]/@target'
    )
    assert citation_targets == ['#b0']
def test_should_add_section_figures_to_back(self):
    """A figure in a back section is emitted under the annex div.

    The section holds nothing but the figure, and no nested div is expected
    inside the annex div.
    """
    semantic_document = SemanticDocument()
    back_section = semantic_document.back_section
    figure = SemanticFigure(
        [
            SemanticLabel(layout_block=LayoutBlock.for_text('Label 1')),
            SemanticCaption(layout_block=LayoutBlock.for_text('Caption 1'))
        ],
        content_id='fig_0'
    )
    back_section.add_content(SemanticSection([figure]))

    tei_document = get_tei_for_semantic_document(semantic_document)
    LOGGER.debug('tei xml: %r', etree.tostring(tei_document.root))

    figure_xpath = (
        '//tei:back/tei:div[@type="annex"]/tei:figure[not(contains(@type, "table"))]'
    )

    # the label text is expected in both the head and the label element
    head_texts = tei_document.get_xpath_text_content_list(
        f'{figure_xpath}/tei:head'
    )
    assert head_texts == ['Label 1']
    label_texts = tei_document.get_xpath_text_content_list(
        f'{figure_xpath}/tei:label'
    )
    assert label_texts == ['Label 1']
    caption_texts = tei_document.get_xpath_text_content_list(
        f'{figure_xpath}/tei:figDesc'
    )
    assert caption_texts == ['Caption 1']
    figure_ids = tei_document.get_xpath_text_content_list(
        f'{figure_xpath}/@xml:id'
    )
    assert figure_ids == ['fig_0']

    nested_divs = tei_document.xpath(
        '//tei:back/tei:div[@type="annex"]/tei:div'
    )
    assert not nested_divs
def test_should_add_raw_equation_with_label_to_paragraph(self):
    """A raw equation inside a paragraph is emitted as a formula in the div.

    The formula is queried as a direct child of the div (not of the
    paragraph), and the paragraph text is expected to contain only the
    surrounding text.
    """
    # to be consistent with Java GROBID
    semantic_document = SemanticDocument()
    body_section = semantic_document.body_section
    paragraph_with_equation = SemanticParagraph([
        SemanticTextContentWrapper(
            layout_block=LayoutBlock.for_text('Next')
        ),
        SemanticRawEquation([
            SemanticRawEquationContent(
                layout_block=LayoutBlock.for_text('Equation 1')
            ),
            SemanticLabel(layout_block=LayoutBlock.for_text('(1)'))
        ])
    ])
    body_section.add_content(SemanticSection([paragraph_with_equation]))

    tei_document = get_tei_for_semantic_document(semantic_document)
    LOGGER.debug('tei xml: %r', etree.tostring(tei_document.root))

    formula_texts = tei_document.get_xpath_text_content_list(
        '//tei:body/tei:div/tei:formula'
    )
    assert formula_texts == ['Equation 1 (1)']

    formula_label_texts = tei_document.get_xpath_text_content_list(
        '//tei:body/tei:div/tei:formula/tei:label'
    )
    assert formula_label_texts == ['(1)']

    paragraph_texts = tei_document.get_xpath_text_content_list(
        '//tei:body/tei:div/tei:p'
    )
    assert paragraph_texts == ['Next']
def test_should_add_section_tables_to_body(self):
    """A table in a body section is emitted as a table figure under the body.

    The section holds nothing but the table, and no div is expected under
    the body.
    """
    semantic_document = SemanticDocument()
    body_section = semantic_document.body_section
    table = SemanticTable(
        [
            SemanticLabel(layout_block=LayoutBlock.for_text('Label 1')),
            SemanticCaption(layout_block=LayoutBlock.for_text('Caption 1'))
        ],
        content_id='tab_0'
    )
    body_section.add_content(SemanticSection([table]))

    tei_document = get_tei_for_semantic_document(semantic_document)
    LOGGER.debug('tei xml: %r', etree.tostring(tei_document.root))

    table_xpath = '//tei:body/tei:figure[@type="table"]'

    # the label text is expected in both the head and the label element
    head_texts = tei_document.get_xpath_text_content_list(
        f'{table_xpath}/tei:head'
    )
    assert head_texts == ['Label 1']
    label_texts = tei_document.get_xpath_text_content_list(
        f'{table_xpath}/tei:label'
    )
    assert label_texts == ['Label 1']
    caption_texts = tei_document.get_xpath_text_content_list(
        f'{table_xpath}/tei:figDesc'
    )
    assert caption_texts == ['Caption 1']
    table_ids = tei_document.get_xpath_text_content_list(
        f'{table_xpath}/@xml:id'
    )
    assert table_ids == ['tab_0']

    body_divs = tei_document.xpath('//tei:body/tei:div')
    assert not body_divs
def iter_semantic_content_for_entity_blocks(  # noqa pylint: disable=arguments-differ, too-many-branches
    self,
    entity_tokens: Iterable[Tuple[str, LayoutBlock]],
    section_type: str = SemanticSectionTypes.OTHER,
    **kwargs
) -> Iterable[SemanticContentWrapper]:
    """Group tagged layout blocks into sections, paragraphs and equations.

    Handling per tag, in order of precedence:

    - 'O': kept as a note of type 'fulltext:other'; attached to the current
      section when there is one, otherwise yielded as a standalone
      `SemanticNote`.
    - '<section>': yields the current section (if any) and starts a new one
      whose first content is the heading built from the block.
    - tags in `SIMPLE_SEMANTIC_CONTENT_CLASS_BY_TAG`: added directly to the
      current section.
    - anything else: treated as paragraph content. A new paragraph is
      started when there is none yet, or when a '<paragraph>' tag directly
      follows another '<paragraph>' tag. '<equation>' and
      '<equation_label>' blocks are collected in a `SemanticRawEquation`
      within the paragraph.

    A section is created implicitly (with `section_type`) when content
    arrives before any '<section>' tag. The last open section is yielded
    once all tokens are consumed.

    Args:
        entity_tokens: pairs of (tag name, layout block).
        section_type: type assigned to every section created here.
        **kwargs: accepted but not used by this implementation.

    Yields:
        Standalone notes and completed sections, in document order.
    """
    entity_tokens = list(entity_tokens)
    LOGGER.debug('entity_tokens: %s', entity_tokens)
    current_section: Optional[SemanticSection] = None
    current_paragraph: Optional[SemanticParagraph] = None
    current_equation: Optional[SemanticRawEquation] = None
    last_seen_tag: Optional[str] = None
    for name, layout_block in entity_tokens:
        if LOGGER.isEnabledFor(logging.DEBUG):
            LOGGER.debug('entity_block: %r, %r', name, layout_block.text)
        # remember the tag of the preceding token before recording this one
        preceding_tag, last_seen_tag = last_seen_tag, name

        if name == 'O':
            # despite the wording of the message, the block is kept as a note
            LOGGER.debug('ignoring content (%r): %r', name, layout_block)
            note_type = 'fulltext:other'
            if current_section:
                current_section.add_note(layout_block, note_type=note_type)
                continue
            yield SemanticNote(
                layout_block=layout_block,
                note_type=note_type
            )
            continue

        if name == '<section>':
            # a new section closes any open paragraph and equation
            current_paragraph = None
            current_equation = None
            if current_section:
                yield current_section
            current_section = SemanticSection(section_type=section_type)
            current_section.add_content(
                self.get_semantic_heading(layout_block)
            )
            continue

        if not current_section:
            current_section = SemanticSection(section_type=section_type)

        if name in SIMPLE_SEMANTIC_CONTENT_CLASS_BY_TAG:
            simple_content = self.get_semantic_content_for_entity_name(
                name, layout_block=layout_block
            )
            current_section.add_content(simple_content)
            continue

        # treat everything else as paragraph content
        is_consecutive_paragraph = (
            name == '<paragraph>' and preceding_tag == '<paragraph>'
        )
        if not current_paragraph or is_consecutive_paragraph:
            current_paragraph = current_section.add_new_paragraph()

        if name in {'<equation>', '<equation_label>'}:
            equation_child = self.get_raw_equation_child_semantic_content(
                name, layout_block=layout_block
            )
            if (
                isinstance(equation_child, SemanticRawEquationContent)
                and current_equation
                and current_equation.has_type(SemanticRawEquationContent)
            ):
                LOGGER.debug('already has equation content, start new one')
                current_equation = None
            if not current_equation:
                current_equation = SemanticRawEquation()
                current_paragraph.add_content(current_equation)
            current_equation.add_content(equation_child)
        else:
            # regular paragraph content ends any equation in progress
            current_equation = None
            self.add_paragraph_content(
                current_paragraph, name, layout_block
            )

    if current_section:
        yield current_section