def iter_semantic_content_for_entity_blocks(  # pylint: disable=arguments-differ
    self,
    entity_tokens: Iterable[Tuple[str, LayoutBlock]],
    semantic_raw_reference: Optional[SemanticRawReference] = None,
    **kwargs
) -> Iterable[SemanticContentWrapper]:
    """Yield at most one reference assembled from the given entity blocks.

    All entity blocks are collected into a single ``SemanticReference``.
    The reference is created lazily on the first entity block, so nothing
    is yielded when ``entity_tokens`` is empty.

    Args:
        entity_tokens: ``(entity name, layout block)`` pairs.
        semantic_raw_reference: optional raw reference; when truthy, its
            ``content_id`` and every content item it iterates over are
            copied into the new reference before any entity content.
        **kwargs: accepted but not used here.

    Yields:
        The reference itself if ``is_reference_valid`` accepts it,
        otherwise the result of ``get_invalid_reference`` for it.
    """
    tokens = list(entity_tokens)
    LOGGER.debug('entity_tokens: %s', tokens)
    fallback_ids = iter(iter_ids('b'))
    reference: Optional[SemanticReference] = None
    for entity_name, entity_block in tokens:
        if not reference:
            reference = SemanticReference()
            if semantic_raw_reference:
                reference.content_id = semantic_raw_reference.content_id
                for raw_content in semantic_raw_reference:
                    reference.add_content(raw_content)
            # Only fall back to a generated id when none was inherited.
            if not reference.content_id:
                reference.content_id = next(fallback_ids, '?')
        entity_content = self.get_semantic_content_for_entity_name(
            entity_name,
            layout_block=entity_block
        )
        reference.add_content(entity_content)
    if not reference:
        return
    if is_reference_valid(reference):
        yield reference
    else:
        yield get_invalid_reference(reference)
def iter_semantic_content_for_entity_blocks(
    self,
    entity_tokens: Iterable[Tuple[str, LayoutBlock]],
    **kwargs
) -> Iterable[SemanticContentWrapper]:
    """Turn labelled entity blocks into raw references, a heading and notes.

    Handling per entity name:

    * ``'<label>'``: added as ``SemanticLabel`` to the pending raw
      reference (created if there is none).
    * ``'<reference>'``: added as ``SemanticRawReferenceText`` to the
      pending raw reference (created if there is none), which is then
      yielded and cleared. Exception: while no reference block has been
      handled yet and nothing is pending, a block rejected by
      ``is_looks_like_reference`` is yielded as ``SemanticHeading`` instead.
    * anything else: yielded as ``SemanticNote`` with the entity name as
      ``note_type``.

    A raw reference still pending after the last block (e.g. a trailing
    label) is yielded at the end.

    Args:
        entity_tokens: ``(entity name, layout block)`` pairs.
        **kwargs: accepted but not used here.
    """
    tokens = list(entity_tokens)
    LOGGER.debug('entity_tokens: %s', tokens)
    reference_ids = iter(iter_ids('b'))
    pending: Optional[SemanticRawReference] = None
    is_first_ref = True
    for entity_name, entity_block in tokens:
        if entity_name == '<label>':
            if not pending:
                pending = SemanticRawReference(content_id=next(reference_ids, '?'))
            pending.add_content(SemanticLabel(layout_block=entity_block))
        elif entity_name == '<reference>':
            if (
                not pending
                and is_first_ref
                and not is_looks_like_reference(entity_block)
            ):
                yield SemanticHeading(layout_block=entity_block)
            else:
                if not pending:
                    pending = SemanticRawReference(
                        content_id=next(reference_ids, '?')
                    )
                pending.add_content(
                    SemanticRawReferenceText(layout_block=entity_block)
                )
                yield pending
                pending = None
            # Either way a '<reference>' block has now been consumed.
            is_first_ref = False
        else:
            yield SemanticNote(layout_block=entity_block, note_type=entity_name)
    if pending:
        yield pending
def iter_semantic_content_for_entity_blocks(
    self,
    entity_tokens: Iterable[Tuple[str, LayoutBlock]],
    **kwargs
) -> Iterable[SemanticContentWrapper]:
    """Group labelled entity blocks into ``SemanticAffiliationAddress`` items.

    A new affiliation is started by a ``'<marker>'`` block, or by an
    institution arriving while the current affiliation already contains a
    ``SemanticInstitution``. Non-marker blocks are cleaned using the regex
    registered for their tag in ``CLEAN_REGEX_BY_TAG``; a removed prefix or
    suffix is kept on the affiliation as a ``SemanticNote`` typed
    ``'<name>-prefix'`` / ``'<name>-suffix'``.

    A ``SemanticNote`` encountered while no affiliation is open is yielded
    on its own rather than opening a new affiliation.

    Args:
        entity_tokens: ``(entity name, layout block)`` pairs.
        **kwargs: accepted but not used here.
    """
    tokens = list(entity_tokens)
    LOGGER.debug('entity_tokens: %s', tokens)
    affiliation_ids = iter(iter_ids('aff'))
    current: Optional[SemanticAffiliationAddress] = None
    for entity_name, entity_block in tokens:
        if entity_name == '<marker>':
            # A marker always closes whatever affiliation was open.
            if current:
                yield current
            current = SemanticAffiliationAddress(
                content_id=next(affiliation_ids, '?')
            )
            current.add_content(SemanticMarker(layout_block=entity_block))
            continue

        prefix_block, cleaned_block, suffix_block = (
            get_regex_cleaned_layout_block_with_prefix_suffix(
                entity_block,
                CLEAN_REGEX_BY_TAG.get(entity_name)
            )
        )
        entity_content = self.get_semantic_content_for_entity_name(
            entity_name,
            cleaned_block
        )

        is_second_institution = (
            current is not None
            and isinstance(entity_content, SemanticInstitution)
            and current.has_type(SemanticInstitution)
        )
        if is_second_institution:
            yield current
            current = None

        if not current:
            if isinstance(entity_content, SemanticNote):
                yield entity_content
                continue
            current = SemanticAffiliationAddress(
                content_id=next(affiliation_ids, '?')
            )

        if prefix_block:
            prefix_note = SemanticNote(
                layout_block=prefix_block,
                note_type=f'{entity_name}-prefix'
            )
            current.add_content(prefix_note)
        current.add_content(entity_content)
        if suffix_block:
            suffix_note = SemanticNote(
                layout_block=suffix_block,
                note_type=f'{entity_name}-suffix'
            )
            current.add_content(suffix_note)
    if current:
        yield current
def _process_raw_affiliations(self, semantic_document: SemanticDocument):
    """Replace raw affiliation items in the document front with parsed ones.

    Every ``SemanticRawAffiliationAddress`` found directly in
    ``semantic_document.front`` is removed and labelled by
    ``self.affiliation_address_model`` (one layout document per raw item).
    The semantic content produced from those labels is appended after the
    remaining front content. Finally, content ids are assigned to the
    ``SemanticAffiliationAddress`` items of the front from ``iter_ids('aff')``.

    Args:
        semantic_document: the document whose front is updated in place.
    """
    raw_affiliations: List[SemanticRawAffiliationAddress] = []
    updated_content: List[SemanticContentWrapper] = []
    for front_item in semantic_document.front:
        if isinstance(front_item, SemanticRawAffiliationAddress):
            raw_affiliations.append(front_item)
        else:
            updated_content.append(front_item)

    if raw_affiliations:
        layout_documents = []
        for raw_affiliation in raw_affiliations:
            blocks = list(raw_affiliation.iter_blocks())
            layout_documents.append(LayoutDocument.for_blocks(blocks))
        labeled_layout_tokens_list = (
            self.affiliation_address_model.predict_labels_for_layout_documents(
                layout_documents,
                app_features_context=self.app_features_context
            )
        )
        LOGGER.debug('labeled_layout_tokens_list (aff): %r', labeled_layout_tokens_list)
        for labeled_layout_tokens in labeled_layout_tokens_list:
            updated_content.extend(
                self.affiliation_address_model
                .iter_semantic_content_for_labeled_layout_tokens(labeled_layout_tokens)
            )

    semantic_document.front.mixed_content = updated_content
    affiliations = semantic_document.front.iter_by_type(SemanticAffiliationAddress)
    self._assign_content_ids(affiliations, iter(iter_ids('aff')))
def get_semantic_document_for_layout_document(
    self,
    layout_document: LayoutDocument,
    context: Optional[FullTextProcessorDocumentContext] = None
) -> SemanticDocument:
    """Build a ``SemanticDocument`` from a layout document.

    The layout document is preprocessed and segmented, then each extraction
    step is run only if the corresponding ``self.config`` flag is set. Every
    step fills in the same ``SemanticDocument`` instance.

    Args:
        layout_document: the layout document to process.
        context: optional per-document context; a new
            ``FullTextProcessorDocumentContext`` is created when ``None``.

    Returns:
        The populated ``SemanticDocument``.
    """
    if context is None:
        context = FullTextProcessorDocumentContext()
    # From here on, `layout_document` is the preprocessed document.
    layout_document = self._preprocess_layout_graphics(
        layout_document,
        context=context
    )
    segmentation_label_result = self.segmentation_model.get_label_layout_document_result(
        layout_document,
        app_features_context=self.app_features_context
    )
    # Computed unconditionally, but only consumed when `extract_front` is set.
    header_layout_document = segmentation_label_result.get_filtered_document_by_label(
        '<header>'
    ).remove_empty_blocks()
    document = SemanticDocument()
    if self.config.extract_front:
        self._process_header_layout_document(
            header_layout_document=header_layout_document,
            semantic_document=document
        )
    if self.config.extract_body_sections:
        self._update_semantic_section_using_segmentation_result_and_fulltext_model(
            document.body_section,
            segmentation_label_result,
            '<body>',
            SemanticSectionTypes.OTHER
        )
    # Acknowledgements and annexes are both written to the back section.
    if self.config.extract_acknowledgements:
        self._update_semantic_section_using_segmentation_result_and_fulltext_model(
            document.back_section,
            segmentation_label_result,
            '<acknowledgement>',
            SemanticSectionTypes.ACKNOWLEDGEMENT
        )
    if self.config.extract_back_sections:
        self._update_semantic_section_using_segmentation_result_and_fulltext_model(
            document.back_section,
            segmentation_label_result,
            '<annex>',
            SemanticSectionTypes.OTHER
        )
    if self.config.extract_references:
        self._extract_raw_references_from_segmentation(
            semantic_document=document,
            segmentation_label_result=segmentation_label_result
        )
    if self.config.extract_citation_fields:
        self._extract_reference_fields_from_raw_references(
            semantic_document=document
        )
        if self.config.extract_citation_authors or self.config.extract_citation_editors:
            self._extract_reference_name_lists_from_raw_references(
                semantic_document=document
            )
        references = list(document.iter_by_type_recursively(SemanticReference))
        ref_citations = list(document.iter_by_type_recursively(SemanticReferenceCitation))
        self._assign_content_ids(references, iter(iter_ids('b')))
        # Reference citations get their targets from a chain of two matchers:
        # one built from the references' SemanticLabel text, followed by one
        # built from their SemanticRawReferenceText.
        self._assign_target_content_ids(ref_citations, ChainedContentIdMatcher([
            SimpleContentIdMatcher(
                self._get_semantic_content_text_by_content_id(references, SemanticLabel)
            ),
            PartialContentIdMatcher(
                self._get_semantic_content_text_by_content_id(
                    references, SemanticRawReferenceText
                )
            )
        ]))
    if self.config.extract_figure_fields:
        self._extract_figure_fields_from_raw_figures(semantic_document=document)
        figures = list(document.iter_by_type_recursively(SemanticFigure))
        figure_citations = list(document.iter_by_type_recursively(SemanticFigureCitation))
        self._assign_content_ids(figures, iter(iter_ids('fig_')))
        # Figure citations are matched using the figures' SemanticLabel text.
        self._assign_target_content_ids(figure_citations, SimpleContentIdMatcher(
            self._get_semantic_content_text_by_content_id(figures, SemanticLabel)
        ))
    if self.config.extract_table_fields:
        self._extract_table_fields_from_raw_tables(semantic_document=document)
        tables = list(document.iter_by_type_recursively(SemanticTable))
        table_citations = list(document.iter_by_type_recursively(SemanticTableCitation))
        self._assign_content_ids(tables, iter(iter_ids('tab_')))
        # Table citations are matched using the tables' SemanticLabel text.
        self._assign_target_content_ids(table_citations, SimpleContentIdMatcher(
            self._get_semantic_content_text_by_content_id(tables, SemanticLabel)
        ))
    if self.config.extract_graphic_bounding_boxes:
        self._process_graphics(
            document=document,
            layout_document=layout_document,
            context=context
        )
    return document
def test_should_return_sequence_of_ids(self):
    """The first three ids are the prefix followed by 0, 1 and 2."""
    first_three_ids = list(islice(iter_ids('prefix'), 3))
    assert first_three_ids == ['prefix0', 'prefix1', 'prefix2']