示例#1
0
 def test_should_reject_equal_match(self):
     """Expect no id when the query fits both candidate texts.

     Both candidate texts contain 'Smith' and '1999', so the query
     'Smith 1999' does not single out either of them.
     """
     text_by_content_id = {
         CONTENT_ID_1: 'The title, Smith, 1999',
         CONTENT_ID_2: 'Other title, Smith, 1999',
     }
     content_id_matcher = PartialContentIdMatcher(text_by_content_id)
     matched_id = content_id_matcher.get_id_by_text('Smith 1999')
     assert matched_id is None
示例#2
0
    def get_semantic_document_for_layout_document(
        self,
        layout_document: LayoutDocument,
        context: Optional[FullTextProcessorDocumentContext] = None
    ) -> SemanticDocument:
        """Build a `SemanticDocument` from the given layout document.

        The layout document is first passed through
        `_preprocess_layout_graphics` and then labelled by the segmentation
        model. Which of the subsequent steps run is decided by the
        `extract_*` flags on `self.config`; every enabled step is handed the
        same `SemanticDocument` instance, in the order written below.

        Args:
            layout_document: The layout document to process.
            context: Optional per-document processing context. A new
                `FullTextProcessorDocumentContext` is created when omitted.

        Returns:
            The `SemanticDocument` that was passed through the enabled steps.
        """
        if context is None:
            context = FullTextProcessorDocumentContext()

        preprocessed_layout_document = self._preprocess_layout_graphics(
            layout_document, context=context
        )
        segmentation_result = self.segmentation_model.get_label_layout_document_result(
            preprocessed_layout_document,
            app_features_context=self.app_features_context
        )

        # The header document is derived unconditionally, even when
        # `extract_front` is disabled.
        header_blocks = segmentation_result.get_filtered_document_by_label('<header>')
        header_layout_document = header_blocks.remove_empty_blocks()

        semantic_document = SemanticDocument()

        # --- front matter ---
        if self.config.extract_front:
            self._process_header_layout_document(
                header_layout_document=header_layout_document,
                semantic_document=semantic_document
            )

        # --- body, acknowledgement and annex sections ---
        if self.config.extract_body_sections:
            self._update_semantic_section_using_segmentation_result_and_fulltext_model(
                semantic_document.body_section,
                segmentation_result,
                '<body>',
                SemanticSectionTypes.OTHER
            )
        if self.config.extract_acknowledgements:
            self._update_semantic_section_using_segmentation_result_and_fulltext_model(
                semantic_document.back_section,
                segmentation_result,
                '<acknowledgement>',
                SemanticSectionTypes.ACKNOWLEDGEMENT
            )
        if self.config.extract_back_sections:
            self._update_semantic_section_using_segmentation_result_and_fulltext_model(
                semantic_document.back_section,
                segmentation_result,
                '<annex>',
                SemanticSectionTypes.OTHER
            )

        # --- references and their citations ---
        if self.config.extract_references:
            self._extract_raw_references_from_segmentation(
                semantic_document=semantic_document,
                segmentation_label_result=segmentation_result
            )
        if self.config.extract_citation_fields:
            self._extract_reference_fields_from_raw_references(
                semantic_document=semantic_document
            )
            wants_name_lists = (
                self.config.extract_citation_authors
                or self.config.extract_citation_editors
            )
            if wants_name_lists:
                self._extract_reference_name_lists_from_raw_references(
                    semantic_document=semantic_document
                )
            references = list(
                semantic_document.iter_by_type_recursively(SemanticReference)
            )
            reference_citations = list(
                semantic_document.iter_by_type_recursively(SemanticReferenceCitation)
            )
            # NOTE: ids are assigned before the matchers are built; the
            # matcher inputs are presumably keyed by those ids.
            self._assign_content_ids(references, iter(iter_ids('b')))
            label_matcher = SimpleContentIdMatcher(
                self._get_semantic_content_text_by_content_id(references, SemanticLabel)
            )
            raw_text_matcher = PartialContentIdMatcher(
                self._get_semantic_content_text_by_content_id(
                    references, SemanticRawReferenceText
                )
            )
            # The label matcher is listed ahead of the raw reference text matcher.
            self._assign_target_content_ids(
                reference_citations,
                ChainedContentIdMatcher([label_matcher, raw_text_matcher])
            )

        # --- figures and their citations ---
        if self.config.extract_figure_fields:
            self._extract_figure_fields_from_raw_figures(
                semantic_document=semantic_document
            )
            figures = list(semantic_document.iter_by_type_recursively(SemanticFigure))
            figure_citations = list(
                semantic_document.iter_by_type_recursively(SemanticFigureCitation)
            )
            self._assign_content_ids(figures, iter(iter_ids('fig_')))
            figure_label_matcher = SimpleContentIdMatcher(
                self._get_semantic_content_text_by_content_id(figures, SemanticLabel)
            )
            self._assign_target_content_ids(figure_citations, figure_label_matcher)

        # --- tables and their citations ---
        if self.config.extract_table_fields:
            self._extract_table_fields_from_raw_tables(
                semantic_document=semantic_document
            )
            tables = list(semantic_document.iter_by_type_recursively(SemanticTable))
            table_citations = list(
                semantic_document.iter_by_type_recursively(SemanticTableCitation)
            )
            self._assign_content_ids(tables, iter(iter_ids('tab_')))
            table_label_matcher = SimpleContentIdMatcher(
                self._get_semantic_content_text_by_content_id(tables, SemanticLabel)
            )
            self._assign_target_content_ids(table_citations, table_label_matcher)

        # --- graphics (uses the preprocessed layout document) ---
        if self.config.extract_graphic_bounding_boxes:
            self._process_graphics(
                document=semantic_document,
                layout_document=preprocessed_layout_document,
                context=context
            )

        return semantic_document
示例#3
0
 def test_should_match_on_exact_match(self):
     """Expect the id whose text is identical to the query text."""
     text_by_content_id = {
         CONTENT_ID_1: '1',
         CONTENT_ID_2: '2',
     }
     content_id_matcher = PartialContentIdMatcher(text_by_content_id)
     matched_id = content_id_matcher.get_id_by_text('1')
     assert matched_id == CONTENT_ID_1
示例#4
0
 def test_should_not_match_on_initial_characters_only(self):
     """Expect no id when only the first characters of a word agree.

     The query 'Smith' shares just its leading 'Sm' with 'Smooth' in the
     first candidate text and nothing obvious with the second one.
     """
     text_by_content_id = {
         CONTENT_ID_1: 'The title, Smooth',
         CONTENT_ID_2: 'Other title, X',
     }
     content_id_matcher = PartialContentIdMatcher(text_by_content_id)
     matched_id = content_id_matcher.get_id_by_text('Smith')
     assert matched_id is None
示例#5
0
 def test_should_match_on_partial_text_match(self):
     """Expect the id of the only candidate text containing every query word.

     'Smith' appears solely in the first candidate text, while '1999' is
     present in both.
     """
     text_by_content_id = {
         CONTENT_ID_1: 'The title, Smith, 1999',
         CONTENT_ID_2: 'Other title, 1999',
     }
     content_id_matcher = PartialContentIdMatcher(text_by_content_id)
     matched_id = content_id_matcher.get_id_by_text('Smith 1999')
     assert matched_id == CONTENT_ID_1
示例#6
0
 def test_should_normalize_whitespace(self):
     """Expect a match although candidate and query differ in whitespace.

     The candidate text has surrounding spaces and a line break between
     its words; the query has surrounding line breaks and a plain space.
     """
     text_by_content_id = {
         CONTENT_ID_1: ' Text\n1 ',
         CONTENT_ID_2: OTHER_TEXT_1,
     }
     content_id_matcher = PartialContentIdMatcher(text_by_content_id)
     matched_id = content_id_matcher.get_id_by_text('\nText 1\n')
     assert matched_id == CONTENT_ID_1
示例#7
0
 def test_should_ignore_punctuation(self):
     """Expect a match although candidate and query end in different punctuation.

     The candidate text ends with a full stop, the query with a colon.
     """
     text_by_content_id = {
         CONTENT_ID_1: 'Text 1.',
         CONTENT_ID_2: OTHER_TEXT_1,
     }
     content_id_matcher = PartialContentIdMatcher(text_by_content_id)
     matched_id = content_id_matcher.get_id_by_text('Text 1:')
     assert matched_id == CONTENT_ID_1
示例#8
0
 def test_should_match_case_insensitive(self):
     """Expect a match although candidate and query differ in letter case."""
     text_by_content_id = {
         CONTENT_ID_1: 'TeXt 1',
         CONTENT_ID_2: OTHER_TEXT_1,
     }
     content_id_matcher = PartialContentIdMatcher(text_by_content_id)
     matched_id = content_id_matcher.get_id_by_text('tExt 1')
     assert matched_id == CONTENT_ID_1