示例#1
0
 def iter_semantic_content_for_entity_blocks(  # pylint: disable=arguments-differ
     self,
     entity_tokens: Iterable[Tuple[str, LayoutBlock]],
     semantic_raw_reference: Optional[SemanticRawReference] = None,
     **kwargs
 ) -> Iterable[SemanticContentWrapper]:
     """Collect all entity blocks into a single semantic reference.

     The reference is created lazily when the first entity is seen. If
     `semantic_raw_reference` is truthy, its content id and its child
     content are copied into the new reference before any entity content
     is added. When the reference still has no content id, the next id
     from `iter_ids('b')` is used ('?' if that iterator is exhausted).

     Args:
         entity_tokens: pairs of entity name and layout block.
         semantic_raw_reference: optional raw reference to seed the
             resulting reference with.
         **kwargs: accepted but not used by this method.

     Yields:
         Nothing when `entity_tokens` is empty. Otherwise a single item:
         the reference itself if `is_reference_valid` accepts it, or else
         the result of `get_invalid_reference` for it.
     """
     entity_tokens = list(entity_tokens)
     LOGGER.debug('entity_tokens: %s', entity_tokens)
     id_source = iter(iter_ids('b'))
     reference: Optional[SemanticReference] = None
     for entity_name, entity_block in entity_tokens:
         if not reference:
             reference = SemanticReference()
             if semantic_raw_reference:
                 # carry over the id and the existing content of the raw reference
                 reference.content_id = semantic_raw_reference.content_id
                 for raw_child in semantic_raw_reference:
                     reference.add_content(raw_child)
             if not reference.content_id:
                 reference.content_id = next(id_source, '?')
         entity_content = self.get_semantic_content_for_entity_name(
             entity_name, layout_block=entity_block
         )
         reference.add_content(entity_content)
     if not reference:
         return
     if is_reference_valid(reference):
         yield reference
     else:
         yield get_invalid_reference(reference)
示例#2
0
 def iter_semantic_content_for_entity_blocks(
         self, entity_tokens: Iterable[Tuple[str, LayoutBlock]],
         **kwargs) -> Iterable[SemanticContentWrapper]:
     """Turn labelled entity blocks into raw references, headings and notes.

     Handling per entity name:

     - '<label>': added as a `SemanticLabel` to the pending raw reference
       (which is created if there is none yet); nothing is yielded.
     - '<reference>': added as `SemanticRawReferenceText` to the pending
       raw reference (created if needed), which is then yielded and reset.
       Exception: if no reference is pending, no '<reference>' has been
       seen before and `is_looks_like_reference` rejects the block, a
       `SemanticHeading` is yielded instead.
     - anything else: yielded as a `SemanticNote` using the entity name as
       the note type.

     A raw reference that is still pending at the end (for example a label
     without reference text) is yielded last.

     Args:
         entity_tokens: pairs of entity name and layout block.
         **kwargs: accepted but not used by this method.
     """
     entity_tokens = list(entity_tokens)
     LOGGER.debug('entity_tokens: %s', entity_tokens)
     id_source = iter(iter_ids('b'))
     pending_ref: Optional[SemanticRawReference] = None
     awaiting_first_reference = True
     for name, layout_block in entity_tokens:
         if name == '<label>':
             if not pending_ref:
                 pending_ref = SemanticRawReference(
                     content_id=next(id_source, '?'))
             pending_ref.add_content(SemanticLabel(layout_block=layout_block))
         elif name == '<reference>':
             # only the very first '<reference>' block may become a heading
             is_heading = (
                 not pending_ref
                 and awaiting_first_reference
                 and not is_looks_like_reference(layout_block)
             )
             awaiting_first_reference = False
             if is_heading:
                 yield SemanticHeading(layout_block=layout_block)
                 continue
             if not pending_ref:
                 pending_ref = SemanticRawReference(
                     content_id=next(id_source, '?'))
             pending_ref.add_content(
                 SemanticRawReferenceText(layout_block=layout_block))
             yield pending_ref
             pending_ref = None
         else:
             yield SemanticNote(layout_block=layout_block, note_type=name)
     if pending_ref:
         yield pending_ref
示例#3
0
 def iter_semantic_content_for_entity_blocks(
     self,
     entity_tokens: Iterable[Tuple[str, LayoutBlock]],
     **kwargs
 ) -> Iterable[SemanticContentWrapper]:
     """Group labelled entity blocks into affiliation address objects.

     A new `SemanticAffiliationAddress` is started when:

     - a '<marker>' entity is seen (the current one, if any, is yielded
       first), or
     - an institution is seen while the current affiliation already
       contains a `SemanticInstitution` (the current one is yielded), or
     - a non-note entity is seen and there is no current affiliation.

     Every non-marker block is split by
     `get_regex_cleaned_layout_block_with_prefix_suffix` using the regex
     registered for the entity name in `CLEAN_REGEX_BY_TAG`. Prefix and
     suffix parts are kept as notes of type '<name>-prefix' and
     '<name>-suffix' around the cleaned content. A `SemanticNote`
     encountered without a current affiliation is yielded on its own.

     Args:
         entity_tokens: pairs of entity name and layout block.
         **kwargs: accepted but not used by this method.

     Yields:
         Completed affiliation addresses and stand-alone notes, in input
         order.
     """
     entity_tokens = list(entity_tokens)
     LOGGER.debug('entity_tokens: %s', entity_tokens)
     id_source = iter(iter_ids('aff'))
     current_aff: Optional[SemanticAffiliationAddress] = None
     for name, layout_block in entity_tokens:
         if name == '<marker>':
             # a marker always opens a new affiliation
             if current_aff:
                 yield current_aff
             current_aff = SemanticAffiliationAddress(content_id=next(id_source, '?'))
             current_aff.add_content(SemanticMarker(layout_block=layout_block))
             continue
         clean_regex = CLEAN_REGEX_BY_TAG.get(name)
         prefix_block, cleaned_block, suffix_block = (
             get_regex_cleaned_layout_block_with_prefix_suffix(layout_block, clean_regex)
         )
         content = self.get_semantic_content_for_entity_name(name, cleaned_block)
         is_additional_institution = (
             current_aff is not None
             and isinstance(content, SemanticInstitution)
             and current_aff.has_type(SemanticInstitution)
         )
         if is_additional_institution:
             # a second institution closes the current affiliation
             yield current_aff
             current_aff = None
         if not current_aff:
             if isinstance(content, SemanticNote):
                 yield content
                 continue
             current_aff = SemanticAffiliationAddress(content_id=next(id_source, '?'))
         if prefix_block:
             prefix_note = SemanticNote(layout_block=prefix_block, note_type=f'{name}-prefix')
             current_aff.add_content(prefix_note)
         current_aff.add_content(content)
         if suffix_block:
             suffix_note = SemanticNote(layout_block=suffix_block, note_type=f'{name}-suffix')
             current_aff.add_content(suffix_note)
     if current_aff:
         yield current_aff
示例#4
0
 def _process_raw_affiliations(self, semantic_document: SemanticDocument):
     """Replace raw affiliations in the document front with parsed ones.

     All `SemanticRawAffiliationAddress` items are removed from
     `semantic_document.front`. If there were any, each one is converted
     to a layout document, labelled by the affiliation address model, and
     the semantic content produced by the model is appended after the
     remaining front content. Finally, content ids with the 'aff' prefix
     are assigned to every `SemanticAffiliationAddress` in the front.

     Args:
         semantic_document: the document whose front is updated in place.
     """
     front_content: List[SemanticContentWrapper] = []
     raw_affiliations: List[SemanticRawAffiliationAddress] = []
     for front_item in semantic_document.front:
         if isinstance(front_item, SemanticRawAffiliationAddress):
             raw_affiliations.append(front_item)
         else:
             front_content.append(front_item)
     if raw_affiliations:
         # one layout document per raw affiliation
         raw_aff_layout_documents = [
             LayoutDocument.for_blocks(list(raw_affiliation.iter_blocks()))
             for raw_affiliation in raw_affiliations
         ]
         labeled_layout_tokens_list = (
             self.affiliation_address_model
             .predict_labels_for_layout_documents(
                 raw_aff_layout_documents,
                 app_features_context=self.app_features_context
             )
         )
         LOGGER.debug('labeled_layout_tokens_list (aff): %r', labeled_layout_tokens_list)
         for labeled_layout_tokens in labeled_layout_tokens_list:
             front_content.extend(
                 self.affiliation_address_model
                 .iter_semantic_content_for_labeled_layout_tokens(labeled_layout_tokens)
             )
     semantic_document.front.mixed_content = front_content
     affiliations = semantic_document.front.iter_by_type(SemanticAffiliationAddress)
     self._assign_content_ids(affiliations, iter(iter_ids('aff')))
示例#5
0
    def get_semantic_document_for_layout_document(
        self,
        layout_document: LayoutDocument,
        context: Optional[FullTextProcessorDocumentContext] = None
    ) -> SemanticDocument:
        """Build a `SemanticDocument` from a layout document.

        The layout document is first passed through
        `_preprocess_layout_graphics` and then labelled by the segmentation
        model. Each of the following steps runs only if the corresponding
        `self.config` flag is set, in this order: front (header), body
        sections, acknowledgements, back sections (annex), raw references,
        citation fields (plus author / editor name lists), figure fields,
        table fields and graphic bounding boxes.

        After the reference, figure and table steps, content ids are
        assigned (prefixes 'b', 'fig_' and 'tab_') and the corresponding
        citations are linked to their targets via content id matchers.

        Args:
            layout_document: the layout document to process.
            context: optional per-document context; a new
                `FullTextProcessorDocumentContext` is created when omitted.

        Returns:
            The populated semantic document.
        """
        if context is None:
            context = FullTextProcessorDocumentContext()
        layout_document = self._preprocess_layout_graphics(
            layout_document, context=context
        )
        segmentation_label_result = self.segmentation_model.get_label_layout_document_result(
            layout_document, app_features_context=self.app_features_context
        )
        header_layout_document = (
            segmentation_label_result
            .get_filtered_document_by_label('<header>')
            .remove_empty_blocks()
        )
        semantic_document = SemanticDocument()
        if self.config.extract_front:
            self._process_header_layout_document(
                header_layout_document=header_layout_document,
                semantic_document=semantic_document
            )

        # short alias for the (long) section update method used below
        update_section = (
            self._update_semantic_section_using_segmentation_result_and_fulltext_model
        )
        if self.config.extract_body_sections:
            update_section(
                semantic_document.body_section,
                segmentation_label_result,
                '<body>',
                SemanticSectionTypes.OTHER
            )
        if self.config.extract_acknowledgements:
            update_section(
                semantic_document.back_section,
                segmentation_label_result,
                '<acknowledgement>',
                SemanticSectionTypes.ACKNOWLEDGEMENT
            )
        if self.config.extract_back_sections:
            update_section(
                semantic_document.back_section,
                segmentation_label_result,
                '<annex>',
                SemanticSectionTypes.OTHER
            )

        if self.config.extract_references:
            self._extract_raw_references_from_segmentation(
                semantic_document=semantic_document,
                segmentation_label_result=segmentation_label_result
            )
        if self.config.extract_citation_fields:
            self._extract_reference_fields_from_raw_references(
                semantic_document=semantic_document
            )
            if self.config.extract_citation_authors or self.config.extract_citation_editors:
                self._extract_reference_name_lists_from_raw_references(
                    semantic_document=semantic_document
                )
            references = list(semantic_document.iter_by_type_recursively(SemanticReference))
            ref_citations = list(
                semantic_document.iter_by_type_recursively(SemanticReferenceCitation)
            )
            self._assign_content_ids(references, iter(iter_ids('b')))
            # ids must be assigned before the matchers are built from them
            label_matcher = SimpleContentIdMatcher(
                self._get_semantic_content_text_by_content_id(references, SemanticLabel)
            )
            raw_text_matcher = PartialContentIdMatcher(
                self._get_semantic_content_text_by_content_id(
                    references, SemanticRawReferenceText
                )
            )
            self._assign_target_content_ids(
                ref_citations,
                ChainedContentIdMatcher([label_matcher, raw_text_matcher])
            )

        if self.config.extract_figure_fields:
            self._extract_figure_fields_from_raw_figures(semantic_document=semantic_document)
            figures = list(semantic_document.iter_by_type_recursively(SemanticFigure))
            figure_citations = list(
                semantic_document.iter_by_type_recursively(SemanticFigureCitation)
            )
            self._assign_content_ids(figures, iter(iter_ids('fig_')))
            figure_matcher = SimpleContentIdMatcher(
                self._get_semantic_content_text_by_content_id(figures, SemanticLabel)
            )
            self._assign_target_content_ids(figure_citations, figure_matcher)

        if self.config.extract_table_fields:
            self._extract_table_fields_from_raw_tables(semantic_document=semantic_document)
            tables = list(semantic_document.iter_by_type_recursively(SemanticTable))
            table_citations = list(
                semantic_document.iter_by_type_recursively(SemanticTableCitation)
            )
            self._assign_content_ids(tables, iter(iter_ids('tab_')))
            table_matcher = SimpleContentIdMatcher(
                self._get_semantic_content_text_by_content_id(tables, SemanticLabel)
            )
            self._assign_target_content_ids(table_citations, table_matcher)

        if self.config.extract_graphic_bounding_boxes:
            self._process_graphics(
                document=semantic_document,
                layout_document=layout_document,
                context=context
            )
        return semantic_document
示例#6
0
 def test_should_return_sequence_of_ids(self):
     """The first three ids are the prefix followed by 0, 1 and 2."""
     first_three_ids = list(islice(iter_ids('prefix'), 3))
     assert first_three_ids == ['prefix0', 'prefix1', 'prefix2']