예제 #1
0
 def update_section_with_entity_blocks(
         self,
         parent_section: SemanticSection,
         entity_tokens: Iterable[Tuple[str, LayoutBlock]],
         section_type: str = SemanticSectionTypes.OTHER):
     """Append the semantic content derived from entity blocks to a section.

     The semantic extractor returned by ``self.get_semantic_extractor()`` is
     asked to convert ``entity_tokens`` into semantic content; every item it
     produces is added to ``parent_section`` in the order it is produced.

     Args:
         parent_section: The section that receives the extracted content.
         entity_tokens: Pairs of ``(tag name, layout block)``.
         section_type: Passed through unchanged to the extractor.
     """
     extractor = self.get_semantic_extractor()
     extracted_content = extractor.iter_semantic_content_for_entity_blocks(
         entity_tokens=entity_tokens,
         section_type=section_type
     )
     for content_item in extracted_content:
         parent_section.add_content(content_item)
예제 #2
0
 def iter_semantic_content_for_entity_blocks(  # noqa pylint: disable=arguments-differ, too-many-branches
     self,
     entity_tokens: Iterable[Tuple[str, LayoutBlock]],
     section_type: str = SemanticSectionTypes.OTHER,
     **kwargs
 ) -> Iterable[SemanticContentWrapper]:
     """Group tagged layout blocks into sections and yield them.

     The ``(tag name, layout block)`` pairs are processed in order:

     * ``'O'`` blocks become notes of type ``'fulltext:other'``; they are
       attached to the currently open section, or yielded as a standalone
       ``SemanticNote`` when no section is open.
     * ``'<section>'`` yields the currently open section (if any) and opens
       a new one whose first content is the heading built from the block.
     * Tags present in ``SIMPLE_SEMANTIC_CONTENT_CLASS_BY_TAG`` are added
       directly to the open section.
     * ``'<equation>'`` / ``'<equation_label>'`` blocks are collected into a
       ``SemanticRawEquation`` inside the current paragraph.
     * Every other tag is handed to ``self.add_paragraph_content`` for the
       current paragraph.

     A section is created on demand when content arrives before any
     ``'<section>'`` tag. The last open section is yielded at the end.

     Args:
         entity_tokens: Pairs of ``(tag name, layout block)``.
         section_type: Section type given to every section created here.
         **kwargs: Accepted but not used by this implementation.

     Yields:
         Completed sections, plus standalone notes for ``'O'`` blocks seen
         while no section is open.
     """
     entity_tokens = list(entity_tokens)
     LOGGER.debug('entity_tokens: %s', entity_tokens)
     open_section: Optional[SemanticSection] = None
     open_paragraph: Optional[SemanticParagraph] = None
     open_equation: Optional[SemanticRawEquation] = None
     last_seen_tag: Optional[str] = None
     for name, layout_block in entity_tokens:
         if LOGGER.isEnabledFor(logging.DEBUG):
             LOGGER.debug('entity_block: %r, %r', name, layout_block.text)
         # Capture the tag of the preceding block before recording this one;
         # this happens for every block, including those skipped below.
         preceding_tag, last_seen_tag = last_seen_tag, name

         if name == 'O':
             LOGGER.debug('ignoring content (%r): %r', name, layout_block)
             if open_section:
                 open_section.add_note(layout_block, note_type='fulltext:other')
             else:
                 yield SemanticNote(
                     layout_block=layout_block,
                     note_type='fulltext:other'
                 )
             continue

         if name == '<section>':
             # A heading closes whatever was open and starts a fresh section.
             open_paragraph = None
             open_equation = None
             if open_section:
                 yield open_section
             open_section = SemanticSection(section_type=section_type)
             heading = self.get_semantic_heading(layout_block)
             open_section.add_content(heading)
             continue

         # Content before any heading still needs a section to live in.
         if not open_section:
             open_section = SemanticSection(section_type=section_type)

         if name in SIMPLE_SEMANTIC_CONTENT_CLASS_BY_TAG:
             simple_content = self.get_semantic_content_for_entity_name(
                 name, layout_block=layout_block
             )
             open_section.add_content(simple_content)
             continue

         # Everything else is treated as paragraph content. Two consecutive
         # '<paragraph>' blocks start a new paragraph.
         is_repeated_paragraph_tag = (
             name == '<paragraph>' and preceding_tag == '<paragraph>'
         )
         if not open_paragraph or is_repeated_paragraph_tag:
             open_paragraph = open_section.add_new_paragraph()

         if name not in {'<equation>', '<equation_label>'}:
             # Non-equation content ends any equation being collected.
             open_equation = None
             self.add_paragraph_content(
                 open_paragraph, name, layout_block
             )
             continue

         equation_child = self.get_raw_equation_child_semantic_content(
             name, layout_block=layout_block
         )
         if (
             isinstance(equation_child, SemanticRawEquationContent)
             and open_equation
             and open_equation.has_type(SemanticRawEquationContent)
         ):
             LOGGER.debug('already has equation content, start new one')
             open_equation = None
         if not open_equation:
             open_equation = SemanticRawEquation()
             open_paragraph.add_content(open_equation)
         open_equation.add_content(equation_child)

     if open_section:
         yield open_section