def test_should_reject_equal_match(self):
    """Expect no id when the query text occurs in both candidate texts."""
    text_by_content_id = {
        CONTENT_ID_1: 'The title, Smith, 1999',
        CONTENT_ID_2: 'Other title, Smith, 1999',
    }
    matched_id = PartialContentIdMatcher(text_by_content_id).get_id_by_text(
        'Smith 1999'
    )
    assert matched_id is None
def get_semantic_document_for_layout_document(
    self,
    layout_document: LayoutDocument,
    context: Optional[FullTextProcessorDocumentContext] = None
) -> SemanticDocument:
    """Build a SemanticDocument from a layout document.

    The layout document is first passed through `_preprocess_layout_graphics`
    and then labelled by the segmentation model. A new `SemanticDocument` is
    created and filled in by a sequence of steps, each of which only runs when
    the corresponding `self.config.extract_*` flag is truthy: front matter,
    body sections, acknowledgements, back (annex) sections, raw references,
    citation fields (including content id assignment for references),
    figure fields, table fields and graphic bounding boxes.

    Args:
        layout_document: The layout document to process.
        context: Optional per-document context; a fresh
            `FullTextProcessorDocumentContext` is created when omitted.

    Returns:
        The `SemanticDocument` after all enabled steps have been applied.
    """
    # NOTE(review): this block was reconstructed from whitespace-collapsed
    # source; the nesting of statements under the `extract_citation_fields`,
    # `extract_figure_fields` and `extract_table_fields` flags should be
    # confirmed against the upstream file.
    if context is None:
        context = FullTextProcessorDocumentContext()
    layout_document = self._preprocess_layout_graphics(
        layout_document,
        context=context
    )
    segmentation_label_result = self.segmentation_model.get_label_layout_document_result(
        layout_document,
        app_features_context=self.app_features_context
    )
    # The header document is computed unconditionally, but only consumed
    # when `extract_front` is enabled below.
    header_layout_document = segmentation_label_result.get_filtered_document_by_label(
        '<header>'
    ).remove_empty_blocks()
    document = SemanticDocument()
    if self.config.extract_front:
        self._process_header_layout_document(
            header_layout_document=header_layout_document,
            semantic_document=document
        )
    if self.config.extract_body_sections:
        self._update_semantic_section_using_segmentation_result_and_fulltext_model(
            document.body_section,
            segmentation_label_result,
            '<body>',
            SemanticSectionTypes.OTHER
        )
    # Acknowledgements and annex content both target `document.back_section`,
    # distinguished by the section type passed in.
    if self.config.extract_acknowledgements:
        self._update_semantic_section_using_segmentation_result_and_fulltext_model(
            document.back_section,
            segmentation_label_result,
            '<acknowledgement>',
            SemanticSectionTypes.ACKNOWLEDGEMENT
        )
    if self.config.extract_back_sections:
        self._update_semantic_section_using_segmentation_result_and_fulltext_model(
            document.back_section,
            segmentation_label_result,
            '<annex>',
            SemanticSectionTypes.OTHER
        )
    if self.config.extract_references:
        self._extract_raw_references_from_segmentation(
            semantic_document=document,
            segmentation_label_result=segmentation_label_result
        )
    if self.config.extract_citation_fields:
        self._extract_reference_fields_from_raw_references(
            semantic_document=document
        )
        if self.config.extract_citation_authors or self.config.extract_citation_editors:
            self._extract_reference_name_lists_from_raw_references(
                semantic_document=document
            )
        references = list(document.iter_by_type_recursively(SemanticReference))
        ref_citations = list(document.iter_by_type_recursively(SemanticReferenceCitation))
        # Ids are assigned to the references first; the matchers built below
        # are created from the same `references` list afterwards (presumably
        # they rely on those ids being set - confirm in the helper).
        self._assign_content_ids(references, iter(iter_ids('b')))
        # Citations are resolved via a chain: a matcher built from the
        # reference labels first, then one built from the raw reference text.
        self._assign_target_content_ids(ref_citations, ChainedContentIdMatcher([
            SimpleContentIdMatcher(
                self._get_semantic_content_text_by_content_id(references, SemanticLabel)
            ),
            PartialContentIdMatcher(
                self._get_semantic_content_text_by_content_id(
                    references, SemanticRawReferenceText
                )
            )
        ]))
    if self.config.extract_figure_fields:
        self._extract_figure_fields_from_raw_figures(semantic_document=document)
        figures = list(document.iter_by_type_recursively(SemanticFigure))
        figure_citations = list(document.iter_by_type_recursively(SemanticFigureCitation))
        # Figure citations are matched against figure labels only.
        self._assign_content_ids(figures, iter(iter_ids('fig_')))
        self._assign_target_content_ids(figure_citations, SimpleContentIdMatcher(
            self._get_semantic_content_text_by_content_id(figures, SemanticLabel)
        ))
    if self.config.extract_table_fields:
        self._extract_table_fields_from_raw_tables(semantic_document=document)
        tables = list(document.iter_by_type_recursively(SemanticTable))
        table_citations = list(document.iter_by_type_recursively(SemanticTableCitation))
        # Table citations are matched against table labels only.
        self._assign_content_ids(tables, iter(iter_ids('tab_')))
        self._assign_target_content_ids(table_citations, SimpleContentIdMatcher(
            self._get_semantic_content_text_by_content_id(tables, SemanticLabel)
        ))
    if self.config.extract_graphic_bounding_boxes:
        # Uses the preprocessed layout document and the same context as above.
        self._process_graphics(
            document=document,
            layout_document=layout_document,
            context=context
        )
    return document
def test_should_match_on_exact_match(self):
    """Expect a query identical to one candidate text to return that id."""
    text_by_content_id = {
        CONTENT_ID_1: '1',
        CONTENT_ID_2: '2',
    }
    content_id_matcher = PartialContentIdMatcher(text_by_content_id)
    matched_id = content_id_matcher.get_id_by_text('1')
    assert matched_id == CONTENT_ID_1
def test_should_not_match_on_initial_characters_only(self):
    """Expect no id when a candidate only shares leading characters with the query."""
    text_by_content_id = {
        CONTENT_ID_1: 'The title, Smooth',
        CONTENT_ID_2: 'Other title, X',
    }
    content_id_matcher = PartialContentIdMatcher(text_by_content_id)
    matched_id = content_id_matcher.get_id_by_text('Smith')
    assert matched_id is None
def test_should_match_on_partial_text_match(self):
    """Expect the id of the only candidate containing every part of the query."""
    text_by_content_id = {
        CONTENT_ID_1: 'The title, Smith, 1999',
        CONTENT_ID_2: 'Other title, 1999',
    }
    matched_id = PartialContentIdMatcher(text_by_content_id).get_id_by_text(
        'Smith 1999'
    )
    assert matched_id == CONTENT_ID_1
def test_should_normalize_whitespace(self):
    """Expect a match despite differing surrounding and inner whitespace."""
    text_by_content_id = {
        CONTENT_ID_1: ' Text\n1 ',
        CONTENT_ID_2: OTHER_TEXT_1,
    }
    content_id_matcher = PartialContentIdMatcher(text_by_content_id)
    matched_id = content_id_matcher.get_id_by_text('\nText 1\n')
    assert matched_id == CONTENT_ID_1
def test_should_ignore_punctuation(self):
    """Expect a match when texts differ only in their trailing punctuation."""
    text_by_content_id = {
        CONTENT_ID_1: 'Text 1.',
        CONTENT_ID_2: OTHER_TEXT_1,
    }
    content_id_matcher = PartialContentIdMatcher(text_by_content_id)
    matched_id = content_id_matcher.get_id_by_text('Text 1:')
    assert matched_id == CONTENT_ID_1
def test_should_match_case_insensitive(self):
    """Expect a match when texts differ only in letter case."""
    text_by_content_id = {
        CONTENT_ID_1: 'TeXt 1',
        CONTENT_ID_2: OTHER_TEXT_1,
    }
    content_id_matcher = PartialContentIdMatcher(text_by_content_id)
    matched_id = content_id_matcher.get_id_by_text('tExt 1')
    assert matched_id == CONTENT_ID_1