Пример #1
0
 def _match_graphic_elements(
     self,
     semantic_graphic_list: Sequence[SemanticGraphic],
     candidate_semantic_content_list: Sequence[SemanticContentWrapper],
     unmatched_graphics_container: SemanticMixedContentWrapper
 ):
     """Attach graphics to their matched content and collect the unmatched ones.

     A chain of graphic matchers is run over `semantic_graphic_list` and
     `candidate_semantic_content_list`. The OCR based matcher is only part
     of the chain when `self.config.use_ocr_model` is enabled.

     Every matched graphic is added to its candidate, provided the candidate
     is a `SemanticMixedContentWrapper`; matches with any other candidate
     type are skipped. Graphics reported as unmatched are added to
     `unmatched_graphics_container`.
     """
     matchers: List[GraphicMatcher] = [
         BoundingBoxDistanceGraphicMatcher(),
         GraphicRelatedBlockTextGraphicMatcher()
     ]
     if self.config.use_ocr_model:
         assert self.fulltext_models.ocr_model
         ocr_matcher = OpticalCharacterRecognitionGraphicMatcher(
             ocr_model=self.fulltext_models.ocr_model
         )
         matchers.append(ocr_matcher)
     match_result = ChainedGraphicMatcher(matchers).get_graphic_matches(
         semantic_graphic_list=semantic_graphic_list,
         candidate_semantic_content_list=candidate_semantic_content_list
     )
     for match in match_result.graphic_matches:
         target_content = match.candidate_semantic_content
         if not isinstance(target_content, SemanticMixedContentWrapper):
             # only mixed content wrappers can receive additional content
             continue
         target_content.add_content(match.semantic_graphic)
     unmatched_graphics = match_result.unmatched_graphics
     LOGGER.info('unmatched_graphics: %r', unmatched_graphics)
     for graphic in unmatched_graphics:
         unmatched_graphics_container.add_content(graphic)
Пример #2
0
    def iter_model_layout_documents(
        self,
        layout_document: LayoutDocument,
        document_context: TrainingDataDocumentContext
    ) -> Iterable[LayoutDocument]:
        """Return a single layout document with the raw authors of all references.

        The `<references>` part of `layout_document` is labelled with the
        reference segmenter model to find the raw reference texts. Each raw
        reference text is then labelled with the citation model, and every
        `SemanticRawAuthors` found (searched recursively) contributes its
        blocks to the one returned document.

        An empty list is returned when no raw reference text or no raw
        authors were found.
        """
        fulltext_models = document_context.fulltext_models
        segmenter_model = fulltext_models.reference_segmenter_model
        citation_model = fulltext_models.citation_model
        label_result = get_segmentation_label_result(
            layout_document,
            document_context=document_context
        )
        references_document = label_result.get_filtered_document_by_label('<references>')
        references_document = references_document.remove_empty_blocks()
        segmenter_labeled_tokens = get_labeled_layout_tokens_for_model_and_layout_document(
            model=segmenter_model,
            layout_document=references_document,
            document_context=document_context
        )

        # collect the text part of every raw reference
        segmenter_content = SemanticMixedContentWrapper(list(
            segmenter_model.iter_semantic_content_for_labeled_layout_tokens(
                segmenter_labeled_tokens
            )
        ))
        reference_texts = []
        for raw_reference in segmenter_content.iter_by_type(SemanticRawReference):
            reference_texts.extend(
                raw_reference.iter_by_type(SemanticRawReferenceText)
            )
        LOGGER.info('raw_reference_text_list count: %d', len(reference_texts))
        if not reference_texts:
            return []

        # one layout document per raw reference text, as input to the citation model
        citation_documents = [
            LayoutDocument.for_blocks(list(reference_text.iter_blocks()))
            for reference_text in reference_texts
        ]
        citation_labeled_tokens_list = (
            get_labeled_layout_tokens_list_for_model_and_layout_documents(
                model=citation_model,
                layout_documents=citation_documents,
                document_context=document_context
            )
        )
        raw_authors_list = []
        for citation_labeled_tokens in citation_labeled_tokens_list:
            citation_content = SemanticMixedContentWrapper(list(
                citation_model.iter_semantic_content_for_labeled_layout_tokens(
                    citation_labeled_tokens
                )
            ))
            raw_authors_list.extend(
                citation_content.iter_by_type_recursively(SemanticRawAuthors)
            )
        LOGGER.info('semantic_raw_author_list count: %d', len(raw_authors_list))
        if not raw_authors_list:
            return []

        # all raw author blocks end up in one combined document
        author_blocks = []
        for raw_authors in raw_authors_list:
            author_blocks.extend(raw_authors.iter_blocks())
        return [LayoutDocument.for_blocks(author_blocks)]
Пример #3
0
def append_semantic_markers_for_layout_block(
    parent_semantic_content: SemanticMixedContentWrapper,
    layout_block: LayoutBlock
) -> None:
    """Add the semantic markers found in `layout_block` to the parent content.

    All markers are collected from `iter_semantic_markers_for_layout_block`
    before the first one is added to `parent_semantic_content`.
    """
    markers = [*iter_semantic_markers_for_layout_block(layout_block)]
    for marker in markers:
        parent_semantic_content.add_content(marker)
Пример #4
0
 def test_should_split_marker_on_non_numeric_characters(self):
     """Each character of '+*!' is expected to become a separate marker."""
     layout_block = LayoutBlock.for_text('+*!')
     markers = list(iter_semantic_markers_for_layout_block(layout_block))
     wrapper = SemanticMixedContentWrapper(markers)
     marker_texts = wrapper.view_by_type(SemanticMarker).get_text_list()
     assert marker_texts == ['+', '*', '!']
     # the merged block should still give back the complete input text
     assert wrapper.merged_block.text == '+*!'
Пример #5
0
 def test_should_not_split_markers_on_digit(self):
     """'11,12' is expected to give the markers '11' and '12'."""
     layout_block = LayoutBlock.for_text('11,12')
     markers = list(iter_semantic_markers_for_layout_block(layout_block))
     wrapper = SemanticMixedContentWrapper(markers)
     marker_texts = wrapper.view_by_type(SemanticMarker).get_text_list()
     assert marker_texts == ['11', '12']
     # the merged block should still give back the complete input text
     assert wrapper.merged_block.text == '11,12'
Пример #6
0
 def test_should_split_markers_on_space(self):
     """'1 2' is expected to give the markers '1' and '2'."""
     layout_block = LayoutBlock.for_text('1 2')
     markers = list(iter_semantic_markers_for_layout_block(layout_block))
     LOGGER.debug('semantic_markers: %r', markers)
     wrapper = SemanticMixedContentWrapper(markers)
     marker_texts = wrapper.view_by_type(SemanticMarker).get_text_list()
     assert marker_texts == ['1', '2']
     # the merged block should still give back the complete input text
     assert wrapper.merged_block.text == '1 2'
Пример #7
0
 def iter_filter_layout_document(
     self,
     layout_document: LayoutDocument
 ) -> Iterable[LayoutDocument]:
     """Return layout documents made of the raw authors found in the header.

     The `<header>` part of `layout_document` is labelled with the header
     model and the top-level `SemanticRawAuthors` entries are extracted.
     With `self.merge_raw_authors` enabled, a single document containing
     the blocks of all raw authors is returned; otherwise one document per
     raw authors entry. Empty blocks are removed from every returned
     document.
     """
     header_document = self.filter_layout_document_by_segmentation_label(
         layout_document,
         '<header>'
     )
     labeled_tokens = self.header_model.predict_labels_for_layout_document(
         header_document,
         app_features_context=self.app_features_context
     )
     LOGGER.debug('labeled_layout_tokens: %r', labeled_tokens)
     header_content = SemanticMixedContentWrapper(list(
         self.header_model.iter_semantic_content_for_labeled_layout_tokens(
             labeled_tokens
         )
     ))
     raw_authors_list = list(header_content.iter_by_type(SemanticRawAuthors))
     LOGGER.info('semantic_raw_authors_list count: %d', len(raw_authors_list))
     should_merge = self.merge_raw_authors
     LOGGER.info('merge_raw_authors: %s', should_merge)
     if not should_merge:
         # one document per raw authors entry
         return [
             LayoutDocument.for_blocks(
                 list(raw_authors.iter_blocks())
             ).remove_empty_blocks()
             for raw_authors in raw_authors_list
         ]
     # a single document combining the blocks of all raw authors
     merged_blocks = []
     for raw_authors in raw_authors_list:
         merged_blocks.extend(raw_authors.iter_blocks())
     return [LayoutDocument.for_blocks(merged_blocks).remove_empty_blocks()]
Пример #8
0
    def iter_model_layout_documents(
        self,
        layout_document: LayoutDocument,
        document_context: TrainingDataDocumentContext
    ) -> Iterable[LayoutDocument]:
        """Return one layout document per raw table found in the body.

        The `<body>` part of `layout_document` is labelled with the fulltext
        model and every `SemanticRawTable` (searched recursively) is turned
        into its own layout document.

        An empty list is returned when the body has no pages or no raw table
        was found.
        """
        fulltext_model = document_context.fulltext_models.fulltext_model
        label_result = get_segmentation_label_result(
            layout_document,
            document_context=document_context
        )
        body_document = label_result.get_filtered_document_by_label('<body>')
        body_document = body_document.remove_empty_blocks()
        if not body_document.pages:
            return []
        labeled_tokens = get_labeled_layout_tokens_for_model_and_layout_document(
            model=fulltext_model,
            layout_document=body_document,
            document_context=document_context
        )
        body_content = SemanticMixedContentWrapper(list(
            fulltext_model.iter_semantic_content_for_labeled_layout_tokens(
                labeled_tokens
            )
        ))
        raw_tables = list(body_content.iter_by_type_recursively(SemanticRawTable))
        LOGGER.info('raw_table_list count: %d', len(raw_tables))
        if not raw_tables:
            return []

        table_documents = []
        for raw_table in raw_tables:
            table_blocks = list(raw_table.iter_blocks())
            table_documents.append(LayoutDocument.for_blocks(table_blocks))
        return table_documents
Пример #9
0
    def iter_model_layout_documents(
        self,
        layout_document: LayoutDocument,
        document_context: TrainingDataDocumentContext
    ) -> Iterable[LayoutDocument]:
        """Return a single layout document with the raw authors of the header.

        The `<header>` part of `layout_document` is labelled with the header
        model. The blocks of all top-level `SemanticRawAuthors` entries are
        combined into the one returned document.

        An empty list is returned when the header has no pages or no raw
        authors were found.
        """
        header_model = document_context.fulltext_models.header_model
        label_result = get_segmentation_label_result(
            layout_document,
            document_context=document_context
        )
        header_document = label_result.get_filtered_document_by_label('<header>')
        header_document = header_document.remove_empty_blocks()
        LOGGER.debug('header_layout_document: %r', header_document)
        if not header_document.pages:
            return []
        labeled_tokens = get_labeled_layout_tokens_for_model_and_layout_document(
            model=header_model,
            layout_document=header_document,
            document_context=document_context
        )
        header_content = SemanticMixedContentWrapper(list(
            header_model.iter_semantic_content_for_labeled_layout_tokens(
                labeled_tokens
            )
        ))
        raw_authors_list = list(header_content.iter_by_type(SemanticRawAuthors))
        LOGGER.info('semantic_raw_author_list count: %d', len(raw_authors_list))
        if not raw_authors_list:
            return []

        # all raw author blocks end up in one combined document
        author_blocks = []
        for raw_authors in raw_authors_list:
            author_blocks.extend(raw_authors.iter_blocks())
        return [LayoutDocument.for_blocks(author_blocks)]
Пример #10
0
 def iter_filter_layout_document(
     self,
     layout_document: LayoutDocument
 ) -> Iterable[LayoutDocument]:
     """Return one layout document per raw authors entry found in the references.

     The `<references>` part of `layout_document` is labelled with the
     reference segmenter model. The merged raw reference text of every
     `SemanticRawReference` is then labelled with the citation model. For
     each resulting `SemanticReference`, every `SemanticRawAuthors` entry is
     returned as its own document (built from its merged block, with empty
     blocks removed).
     """
     references_document = self.filter_layout_document_by_segmentation_label(
         layout_document,
         '<references>'
     )
     segmenter_model = self.reference_segmenter_model
     labeled_tokens = segmenter_model.predict_labels_for_layout_document(
         references_document,
         app_features_context=self.app_features_context
     )
     LOGGER.debug('labeled_layout_tokens: %r', labeled_tokens)
     segmenter_content = SemanticMixedContentWrapper(list(
         segmenter_model.iter_semantic_content_for_labeled_layout_tokens(
             labeled_tokens
         )
     ))
     raw_references = list(segmenter_content.iter_by_type(SemanticRawReference))
     LOGGER.info('semantic_raw_references count: %d', len(raw_references))

     # one document per raw reference, containing only its reference text
     reference_documents = []
     for raw_reference in raw_references:
         reference_text_block = raw_reference.view_by_type(
             SemanticRawReferenceText
         ).merged_block
         reference_documents.append(
             LayoutDocument.for_blocks([reference_text_block]).remove_empty_blocks()
         )
     citation_labeled_tokens_list = (
         self.citation_model.predict_labels_for_layout_documents(
             reference_documents,
             app_features_context=self.app_features_context
         )
     )

     raw_authors_list = []
     for citation_labeled_tokens in citation_labeled_tokens_list:
         citation_content = (
             self.citation_model.iter_semantic_content_for_labeled_layout_tokens(
                 citation_labeled_tokens
             )
         )
         for semantic_content in citation_content:
             if not isinstance(semantic_content, SemanticReference):
                 # only references are searched for raw authors
                 continue
             raw_authors_list.extend(
                 semantic_content.iter_by_type(SemanticRawAuthors)
             )
     return [
         LayoutDocument.for_blocks([raw_authors.merged_block]).remove_empty_blocks()
         for raw_authors in raw_authors_list
     ]
Пример #11
0
 def _process_raw_authors(self, semantic_parent: SemanticMixedContentWrapper):
     """Replace the raw authors of `semantic_parent` with name header model output.

     All `SemanticRawAuthors` children are removed from the parent's mixed
     content. Their blocks are passed to the name header model, either as a
     single merged document (`self.config.merge_raw_authors`) or as one
     document per raw authors entry. The semantic content produced by the
     model is appended after the remaining (non raw authors) children.

     `semantic_parent.mixed_content` is reassigned even when no raw authors
     were present.
     """
     raw_authors: List[SemanticRawAuthors] = []
     remaining_content: List[SemanticContentWrapper] = []
     for child in semantic_parent:
         if isinstance(child, SemanticRawAuthors):
             raw_authors.append(child)
         else:
             remaining_content.append(child)
     if raw_authors:
         if self.config.merge_raw_authors:
             # a single document combining the blocks of all raw authors
             merged_blocks = []
             for raw_author in raw_authors:
                 merged_blocks.extend(raw_author.iter_blocks())
             author_documents = [LayoutDocument.for_blocks(merged_blocks)]
         else:
             author_documents = [
                 LayoutDocument.for_blocks(list(raw_author.iter_blocks()))
                 for raw_author in raw_authors
             ]
         labeled_tokens_list = self.name_header_model.predict_labels_for_layout_documents(
             author_documents,
             app_features_context=self.app_features_context
         )
         LOGGER.debug('labeled_layout_tokens_list (author): %r', labeled_tokens_list)
         for labeled_tokens in labeled_tokens_list:
             remaining_content.extend(
                 self.name_header_model.iter_semantic_content_for_labeled_layout_tokens(
                     labeled_tokens
                 )
             )
     semantic_parent.mixed_content = remaining_content
Пример #12
0
 def test_should_return_empty_list_with_empty_list_of_graphics(self):
     """Matching without any graphics is expected to give a falsy result."""
     matcher = BoundingBoxDistanceGraphicMatcher()
     candidates = [SemanticMixedContentWrapper()]
     result = matcher.get_graphic_matches(
         semantic_graphic_list=[],
         candidate_semantic_content_list=candidates
     )
     assert not result