def update_section_with_entity_blocks(
    self,
    parent_section: SemanticSection,
    entity_tokens: Iterable[Tuple[str, LayoutBlock]],
    section_type: str = SemanticSectionTypes.OTHER
):
    """
    Add the semantic content derived from `entity_tokens` to `parent_section`.

    The tokens are handed to the extractor returned by
    `self.get_semantic_extractor()`. Every item produced by its
    `iter_semantic_content_for_entity_blocks` method is appended to
    `parent_section`, in the order it is produced.

    Args:
        parent_section: The section that receives the extracted content.
        entity_tokens: Pairs of entity tag name and layout block.
        section_type: Forwarded unchanged to the extractor.
    """
    extractor = self.get_semantic_extractor()
    extracted_items = extractor.iter_semantic_content_for_entity_blocks(
        entity_tokens=entity_tokens,
        section_type=section_type
    )
    for extracted_item in extracted_items:
        parent_section.add_content(extracted_item)
def iter_semantic_content_for_entity_blocks(  # noqa pylint: disable=arguments-differ, too-many-branches
    self,
    entity_tokens: Iterable[Tuple[str, LayoutBlock]],
    section_type: str = SemanticSectionTypes.OTHER,
    **kwargs
) -> Iterable[SemanticContentWrapper]:
    """
    Group tagged layout blocks into sections and yield them.

    The tokens are consumed in order (after being materialized into a list)
    and handled according to their tag name:

    - `'O'`: recorded as a note with note type `'fulltext:other'`. The note
      is added to the currently open section if there is one, otherwise
      it is yielded on its own as a `SemanticNote`.
    - `'<section>'`: the currently open section (if any) is yielded and a
      new section is started, with the heading returned by
      `self.get_semantic_heading`. Paragraph and raw equation state are
      reset.
    - tags in `SIMPLE_SEMANTIC_CONTENT_CLASS_BY_TAG`: the content returned
      by `self.get_semantic_content_for_entity_name` is added directly to
      the section.
    - anything else is paragraph content. A new paragraph is started when
      no paragraph is open, or when a `'<paragraph>'` token immediately
      follows another `'<paragraph>'` token.
    - `'<equation>'` / `'<equation_label>'`: collected into a
      `SemanticRawEquation` inside the paragraph. A new raw equation is
      started when the incoming child is a `SemanticRawEquationContent`
      and the open raw equation already reports having one via `has_type`.

    A section is created implicitly if content arrives before any
    `'<section>'` token. The last open section is yielded at the end.

    Args:
        entity_tokens: Pairs of entity tag name and layout block.
        section_type: Section type passed to every created `SemanticSection`.
        **kwargs: Accepted but not used by this implementation.

    Yields:
        Completed sections, and stand-alone notes for `'O'` tokens seen
        while no section is open.
    """
    tag_and_block_list = list(entity_tokens)
    LOGGER.debug('entity_tokens: %s', tag_and_block_list)

    current_section: Optional[SemanticSection] = None
    current_paragraph: Optional[SemanticParagraph] = None
    current_raw_equation: Optional[SemanticRawEquation] = None
    last_seen_tag: Optional[str] = None

    for name, layout_block in tag_and_block_list:
        # guard so that `layout_block.text` is only accessed when debugging
        if LOGGER.isEnabledFor(logging.DEBUG):
            LOGGER.debug('entity_block: %r, %r', name, layout_block.text)

        # the previous tag is tracked for every token, including 'O' tokens
        previous_tag, last_seen_tag = last_seen_tag, name

        if name in {'O'}:
            LOGGER.debug('ignoring content (%r): %r', name, layout_block)
            note_type = name if name != 'O' else 'fulltext:other'
            if not current_section:
                yield SemanticNote(
                    layout_block=layout_block,
                    note_type=note_type
                )
            else:
                current_section.add_note(layout_block, note_type=note_type)
            continue

        if name == '<section>':
            # hand over the finished section before opening the next one
            if current_section:
                yield current_section
            current_paragraph = None
            current_raw_equation = None
            current_section = SemanticSection(section_type=section_type)
            current_section.add_content(self.get_semantic_heading(layout_block))
            continue

        # content before any '<section>' token gets an implicit section
        if not current_section:
            current_section = SemanticSection(section_type=section_type)

        if name in SIMPLE_SEMANTIC_CONTENT_CLASS_BY_TAG:
            current_section.add_content(
                self.get_semantic_content_for_entity_name(
                    name, layout_block=layout_block
                )
            )
            continue

        # treat everything else as paragraph content
        is_consecutive_paragraph_tag = (
            name == '<paragraph>' and previous_tag == '<paragraph>'
        )
        if not current_paragraph or is_consecutive_paragraph_tag:
            current_paragraph = current_section.add_new_paragraph()

        if name not in {'<equation>', '<equation_label>'}:
            # regular paragraph content closes any open raw equation
            current_raw_equation = None
            self.add_paragraph_content(
                current_paragraph, name, layout_block
            )
            continue

        equation_child = self.get_raw_equation_child_semantic_content(
            name, layout_block=layout_block
        )
        if (
            isinstance(equation_child, SemanticRawEquationContent)
            and current_raw_equation
            and current_raw_equation.has_type(SemanticRawEquationContent)
        ):
            LOGGER.debug('already has equation content, start new one')
            current_raw_equation = None
        if not current_raw_equation:
            current_raw_equation = SemanticRawEquation()
            current_paragraph.add_content(current_raw_equation)
        current_raw_equation.add_content(equation_child)

    if current_section:
        yield current_section