def iter_model_layout_documents(
        self,
        layout_document: LayoutDocument,
        document_context: TrainingDataDocumentContext
) -> Iterable[LayoutDocument]:
    """
    Yield layout documents containing citation raw-author blocks.

    Pipeline:
      1. Segment the document and keep only the '<references>' content.
      2. Run the reference segmenter model to find raw reference texts.
      3. Run the citation model on each raw reference text and collect
         raw author content (recursively).
      4. Return one layout document combining all raw author blocks.

    Returns an empty list as soon as any stage produces nothing.
    """
    reference_segmenter_model = (
        document_context.fulltext_models.reference_segmenter_model)
    citation_model = document_context.fulltext_models.citation_model
    segmentation_label_result = get_segmentation_label_result(
        layout_document, document_context=document_context)
    references_layout_document = (
        segmentation_label_result
        .get_filtered_document_by_label('<references>')
        .remove_empty_blocks())
    # Consistent with the other model layout document providers:
    # skip model inference entirely when there is no references content.
    if not references_layout_document.pages:
        return []
    reference_segmenter_labeled_layout_tokens = (
        get_labeled_layout_tokens_for_model_and_layout_document(
            model=reference_segmenter_model,
            layout_document=references_layout_document,
            document_context=document_context))
    # Collect the raw (unparsed) reference text nodes of each raw reference.
    raw_reference_text_list = [
        raw_reference_text
        for raw_reference in SemanticMixedContentWrapper(list(
            reference_segmenter_model
            .iter_semantic_content_for_labeled_layout_tokens(
                reference_segmenter_labeled_layout_tokens)
        )).iter_by_type(SemanticRawReference)
        for raw_reference_text in raw_reference.iter_by_type(
            SemanticRawReferenceText)
    ]
    LOGGER.info(
        'raw_reference_text_list count: %d', len(raw_reference_text_list))
    if not raw_reference_text_list:
        return []
    # One layout document per raw reference text, as citation model input.
    citation_layout_documents = [
        LayoutDocument.for_blocks(
            list(semantic_raw_reference_text.iter_blocks()))
        for semantic_raw_reference_text in raw_reference_text_list
    ]
    citation_labeled_layout_tokens_list = (
        get_labeled_layout_tokens_list_for_model_and_layout_documents(
            model=citation_model,
            layout_documents=citation_layout_documents,
            document_context=document_context))
    # Extract the raw author content (searching recursively) of each citation.
    semantic_raw_author_list = [
        raw_author
        for citation_labeled_layout_tokens in citation_labeled_layout_tokens_list
        for raw_author in SemanticMixedContentWrapper(list(
            citation_model
            .iter_semantic_content_for_labeled_layout_tokens(
                citation_labeled_layout_tokens)
        )).iter_by_type_recursively(SemanticRawAuthors)
    ]
    LOGGER.info(
        'semantic_raw_author_list count: %d', len(semantic_raw_author_list))
    if not semantic_raw_author_list:
        return []
    return [
        LayoutDocument.for_blocks([
            block
            for semantic_raw_author in semantic_raw_author_list
            for block in semantic_raw_author.iter_blocks()
        ])
    ]
def iter_filter_layout_document(
        self, layout_document: LayoutDocument) -> Iterable[LayoutDocument]:
    """
    Yield layout documents built from the raw author content of the header.

    Runs the header model over the '<header>' segmentation area and collects
    raw author content. Depending on ``self.merge_raw_authors``, either all
    author blocks are merged into a single layout document, or one layout
    document is returned per raw authors node.
    """
    header_layout_document = self.filter_layout_document_by_segmentation_label(
        layout_document, '<header>'
    )
    labeled_layout_tokens = self.header_model.predict_labels_for_layout_document(
        header_layout_document,
        app_features_context=self.app_features_context
    )
    LOGGER.debug('labeled_layout_tokens: %r', labeled_layout_tokens)
    semantic_contents = list(
        self.header_model.iter_semantic_content_for_labeled_layout_tokens(
            labeled_layout_tokens
        )
    )
    semantic_raw_authors_list = list(
        SemanticMixedContentWrapper(semantic_contents).iter_by_type(
            SemanticRawAuthors
        )
    )
    LOGGER.info(
        'semantic_raw_authors_list count: %d', len(semantic_raw_authors_list))
    LOGGER.info('merge_raw_authors: %s', self.merge_raw_authors)
    if self.merge_raw_authors:
        # Merge all raw author blocks into one layout document.
        merged_blocks = []
        for semantic_raw_authors in semantic_raw_authors_list:
            merged_blocks.extend(semantic_raw_authors.iter_blocks())
        return [LayoutDocument.for_blocks(merged_blocks).remove_empty_blocks()]
    # One layout document per raw authors node.
    return [
        LayoutDocument.for_blocks(
            list(semantic_raw_authors.iter_blocks())
        ).remove_empty_blocks()
        for semantic_raw_authors in semantic_raw_authors_list
    ]
def iter_filter_layout_document(
        self, layout_document: LayoutDocument) -> Iterable[LayoutDocument]:
    """
    Yield one layout document per matching semantic content node.

    Filters the document by the configured segmentation labels, runs the
    fulltext model, and recursively collects semantic content of
    ``self.semantic_type``; each match becomes its own layout document
    built from its merged block.
    """
    fulltext_layout_documents = list(
        self.iter_filter_layout_document_by_segmentation_labels(
            layout_document, self.segmentation_labels))
    fulltext_labeled_layout_tokens_list = (
        self.fulltext_model.predict_labels_for_layout_documents(
            fulltext_layout_documents,
            app_features_context=self.app_features_context))
    LOGGER.debug(
        'fulltext_labeled_layout_tokens_list: %r',
        fulltext_labeled_layout_tokens_list)
    # NOTE: renamed from misspelled "semanti_content_list" (typo fix).
    semantic_content_list = [
        semantic_content
        for fulltext_labeled_layout_tokens in fulltext_labeled_layout_tokens_list
        for semantic_content in iter_by_semantic_type_recursively(
            self.fulltext_model
            .iter_semantic_content_for_labeled_layout_tokens(
                fulltext_labeled_layout_tokens),
            self.semantic_type)
    ]
    LOGGER.debug('semantic_content_list: %s', semantic_content_list)
    return [
        LayoutDocument.for_blocks(
            [semantic_content.merged_block]).remove_empty_blocks()
        for semantic_content in semantic_content_list
    ]
def convert_two_letter_uppercase_given_name_to_given_middle_name(
        name: T_SemanticName):
    """
    Split a two-letter uppercase given name into given + middle initials.

    If *name* has exactly one given name whose text is two uppercase
    characters (e.g. initials like "AB") and no middle name yet, retokenize
    it into individual characters and replace the given name in place with
    a given name (first character) followed by a middle name (remainder).
    Otherwise leaves *name* unchanged.
    """
    middle_names = list(name.iter_by_type(SemanticMiddleName))
    if middle_names:
        LOGGER.debug('already has a middle name: %r', middle_names)
        return
    given_names = list(name.iter_by_type(SemanticGivenName))
    if len(given_names) != 1:
        LOGGER.debug('no or too many given names: %r', given_names)
        return
    given_name_text = given_names[0].get_text()
    if not (len(given_name_text) == 2 and given_name_text.isupper()):
        LOGGER.debug('not two uppercase characters: %r', given_name_text)
        return
    # Re-tokenize the given name blocks so each character becomes a token.
    retokenized_layout_document = LayoutDocument.for_blocks(
        list(given_names[0].iter_blocks())
    ).retokenize(tokenize_fn=tokenize_individual_characters)
    LOGGER.debug(
        'retokenized_layout_document: %r', retokenized_layout_document)
    # First character stays the given name, the rest become middle name(s).
    split_name_parts = []
    for index, token in enumerate(retokenized_layout_document.iter_all_tokens()):
        part_type = SemanticGivenName if index == 0 else SemanticMiddleName
        split_name_parts.append(
            part_type(layout_block=LayoutBlock.for_tokens([token]))
        )
    LOGGER.debug('split_name_parts: %r', split_name_parts)
    name.flat_map_inplace_by_type(
        SemanticGivenName, lambda _: split_name_parts)
def test_should_include_layout_document_text_in_tei_output(self):
    # A single-block document containing TEXT_1 should surface its text
    # inside the figure element of the generated training TEI.
    single_block_document = LayoutDocument.for_blocks(
        [LayoutBlock.for_text(TEXT_1)]
    )
    tei_root = get_training_tei_xml_for_layout_document(single_block_document)
    figure_nodes = tei_xpath(tei_root, FIGURE_XPATH)
    assert len(figure_nodes) == 1
    assert get_text_content(figure_nodes[0]).rstrip() == TEXT_1
def iter_model_layout_documents(
        self,
        layout_document: LayoutDocument,
        document_context: TrainingDataDocumentContext
) -> Iterable[LayoutDocument]:
    """
    Yield one layout document per raw table found in the document body.

    Segments the document, keeps only the '<body>' content, runs the
    fulltext model over it, and recursively collects raw table content.
    Returns an empty list when there is no body content or no tables.
    """
    fulltext_model = document_context.fulltext_models.fulltext_model
    segmentation_label_result = get_segmentation_label_result(
        layout_document, document_context=document_context)
    body_layout_document = segmentation_label_result.get_filtered_document_by_label(
        '<body>').remove_empty_blocks()
    # Nothing to do if the body area is empty after filtering.
    if not body_layout_document.pages:
        return []
    fulltext_labeled_layout_tokens = get_labeled_layout_tokens_for_model_and_layout_document(
        model=fulltext_model,
        layout_document=body_layout_document,
        document_context=document_context)
    # Recursively collect raw table content from the model's semantic output.
    raw_table_list = list(
        SemanticMixedContentWrapper(
            list(
                fulltext_model.iter_semantic_content_for_labeled_layout_tokens(
                    fulltext_labeled_layout_tokens
                )
            )
        ).iter_by_type_recursively(SemanticRawTable)
    )
    LOGGER.info('raw_table_list count: %d', len(raw_table_list))
    if not raw_table_list:
        return []
    # One layout document per raw table.
    return [
        LayoutDocument.for_blocks(list(raw_table.iter_blocks()))
        for raw_table in raw_table_list
    ]
def iter_model_layout_documents(
        self,
        layout_document: LayoutDocument,
        document_context: TrainingDataDocumentContext
) -> Iterable[LayoutDocument]:
    """
    Yield a layout document combining all raw author blocks of the header.

    Segments the document, keeps only the '<header>' content, runs the
    header model over it and collects raw author content. Returns an
    empty list when the header is empty or no raw authors were found.
    """
    header_model = document_context.fulltext_models.header_model
    segmentation_label_result = get_segmentation_label_result(
        layout_document, document_context=document_context
    )
    header_layout_document = (
        segmentation_label_result
        .get_filtered_document_by_label('<header>')
        .remove_empty_blocks()
    )
    LOGGER.debug('header_layout_document: %r', header_layout_document)
    if not header_layout_document.pages:
        return []
    header_labeled_layout_tokens = (
        get_labeled_layout_tokens_for_model_and_layout_document(
            model=header_model,
            layout_document=header_layout_document,
            document_context=document_context
        )
    )
    semantic_contents = list(
        header_model.iter_semantic_content_for_labeled_layout_tokens(
            header_labeled_layout_tokens
        )
    )
    semantic_raw_author_list = list(
        SemanticMixedContentWrapper(semantic_contents).iter_by_type(
            SemanticRawAuthors
        )
    )
    LOGGER.info(
        'semantic_raw_author_list count: %d', len(semantic_raw_author_list))
    if not semantic_raw_author_list:
        return []
    # Merge all raw author blocks into a single layout document.
    author_blocks = []
    for semantic_raw_author in semantic_raw_author_list:
        author_blocks.extend(semantic_raw_author.iter_blocks())
    return [LayoutDocument.for_blocks(author_blocks)]
def test_should_include_layout_document_text_in_tei_output(self):
    # The text of a single-block document should appear in the
    # generated segmentation training TEI's text element.
    generator = SegmentationTeiTrainingDataGenerator()
    single_block_document = LayoutDocument.for_blocks(
        [LayoutBlock.for_text(TEXT_1)]
    )
    model_data_list = get_model_data_list_for_layout_document(
        single_block_document, data_generator=get_data_generator()
    )
    tei_root = generator.get_training_tei_xml_for_model_data_iterable(
        model_data_list
    )
    text_nodes = tei_root.xpath('./text')
    assert len(text_nodes) == 1
    assert get_text_content(text_nodes[0]).rstrip() == TEXT_1
def iter_filter_layout_document(
        self, layout_document: LayoutDocument) -> Iterable[LayoutDocument]:
    """
    Yield one layout document per raw author found in the references.

    Runs the reference segmenter model over the '<references>' area to
    find raw references, then runs the citation model over each raw
    reference text to extract the raw author content of each parsed
    reference.
    """
    references_layout_document = self.filter_layout_document_by_segmentation_label(
        layout_document, '<references>')
    labeled_layout_tokens = self.reference_segmenter_model.predict_labels_for_layout_document(
        references_layout_document,
        app_features_context=self.app_features_context)
    LOGGER.debug('labeled_layout_tokens: %r', labeled_layout_tokens)
    # Collect raw reference nodes from the reference segmenter output.
    semantic_raw_references = list(
        SemanticMixedContentWrapper(
            list(
                self.reference_segmenter_model.iter_semantic_content_for_labeled_layout_tokens(
                    labeled_layout_tokens
                )
            )
        ).iter_by_type(SemanticRawReference)
    )
    LOGGER.info(
        'semantic_raw_references count: %d', len(semantic_raw_references))
    # One layout document per raw reference text, as citation model input.
    raw_reference_documents = [
        LayoutDocument.for_blocks([
            semantic_raw_reference.view_by_type(
                SemanticRawReferenceText).merged_block
        ]).remove_empty_blocks()
        for semantic_raw_reference in semantic_raw_references
    ]
    citation_labeled_layout_tokens_list = (
        self.citation_model.predict_labels_for_layout_documents(
            raw_reference_documents,
            app_features_context=self.app_features_context))
    # From each parsed citation (SemanticReference), pull the raw authors.
    raw_authors = [
        raw_author
        for citation_labeled_layout_tokens in citation_labeled_layout_tokens_list
        for ref in (
            self.citation_model.iter_semantic_content_for_labeled_layout_tokens(
                citation_labeled_layout_tokens
            )
        )
        if isinstance(ref, SemanticReference)
        for raw_author in ref.iter_by_type(SemanticRawAuthors)
    ]
    # One layout document per raw author node.
    return [
        LayoutDocument.for_blocks(
            [raw_author.merged_block]).remove_empty_blocks()
        for raw_author in raw_authors
    ]
def test_should_include_layout_document_text_in_tei_output(self):
    # The text of a single-block document should appear in the
    # affiliation element of the generated training TEI.
    generator = AffiliationAddressTeiTrainingDataGenerator()
    single_block_document = LayoutDocument.for_blocks(
        [LayoutBlock.for_text(TEXT_1)]
    )
    tei_root = generator.get_training_tei_xml_for_model_data_iterable(
        get_model_data_list_for_layout_document(
            single_block_document, data_generator=get_data_generator()
        )
    )
    LOGGER.debug('xml: %r', etree.tostring(tei_root))
    affiliation_nodes = tei_xpath(tei_root, AFFILIATION_XPATH)
    assert len(affiliation_nodes) == 1
    assert get_text_content(affiliation_nodes[0]).rstrip() == TEXT_1
def test_should_keep_original_whitespace(self):
    # Irregular spacing around commas must be preserved verbatim.
    generator = AffiliationAddressTeiTrainingDataGenerator()
    text = 'Token1, Token2 ,Token3'
    single_line = LayoutLine.for_text(text, tail_whitespace='\n')
    document = LayoutDocument.for_blocks(
        [LayoutBlock(lines=[single_line])]
    )
    tei_root = generator.get_training_tei_xml_for_model_data_iterable(
        get_model_data_list_for_layout_document(
            document, data_generator=get_data_generator()
        )
    )
    affiliation_nodes = tei_xpath(tei_root, AFFILIATION_XPATH)
    assert len(affiliation_nodes) == 1
    assert get_text_content(affiliation_nodes[0]).rstrip() == text
def test_should_add_line_feeds(self):
    # Two layout lines should be joined with a line feed in the TEI text.
    generator = AffiliationAddressTeiTrainingDataGenerator()
    layout_lines = [
        LayoutLine.for_text(TEXT_1, tail_whitespace='\n'),
        LayoutLine.for_text(TEXT_2, tail_whitespace='\n')
    ]
    document = LayoutDocument.for_blocks(
        [LayoutBlock(lines=layout_lines)]
    )
    tei_root = generator.get_training_tei_xml_for_model_data_iterable(
        get_model_data_list_for_layout_document(
            document, data_generator=get_data_generator()
        )
    )
    affiliation_nodes = tei_xpath(tei_root, AFFILIATION_XPATH)
    assert len(affiliation_nodes) == 1
    expected_text = '\n'.join([TEXT_1, TEXT_2])
    assert get_text_content(affiliation_nodes[0]).rstrip() == expected_text
def test_should_lb_elements_before_line_feeds(self):
    # Each layout line should end with a tei:lb element placed before
    # the line feed separating it from the following line's text.
    generator = AffiliationAddressTeiTrainingDataGenerator()
    document = LayoutDocument.for_blocks([
        LayoutBlock(lines=[
            LayoutLine.for_text(TEXT_1, tail_whitespace='\n'),
            LayoutLine.for_text(TEXT_2, tail_whitespace='\n')
        ])
    ])
    tei_root = generator.get_training_tei_xml_for_model_data_iterable(
        get_model_data_list_for_layout_document(
            document, data_generator=get_data_generator()
        )
    )
    affiliation_nodes = tei_xpath(tei_root, AFFILIATION_XPATH)
    assert len(affiliation_nodes) == 1
    line_break_nodes = tei_xpath(affiliation_nodes[0], 'tei:lb')
    assert len(line_break_nodes) == 2
    assert line_break_nodes[0].getparent().text == TEXT_1
    assert line_break_nodes[0].tail == '\n' + TEXT_2
def test_should_generate_tei_from_model_data(self):
    # End-to-end: data generator model data -> affiliation training TEI.
    document = LayoutDocument.for_blocks([
        LayoutBlock(lines=[
            get_next_layout_line_for_text(TEXT_1),
            get_next_layout_line_for_text(TEXT_2)
        ])
    ])
    model_data_iterable = (
        get_data_generator().iter_model_data_for_layout_document(document)
    )
    generator = AffiliationAddressTeiTrainingDataGenerator()
    tei_root = generator.get_training_tei_xml_for_model_data_iterable(
        model_data_iterable
    )
    LOGGER.debug('xml: %r', etree.tostring(tei_root))
    affiliation_nodes = tei_xpath(tei_root, AFFILIATION_XPATH)
    assert len(affiliation_nodes) == 1
    line_break_nodes = tei_xpath(affiliation_nodes[0], 'tei:lb')
    assert len(line_break_nodes) == 2
    assert line_break_nodes[0].getparent().text == TEXT_1
    assert line_break_nodes[0].tail == '\n' + TEXT_2
def test_should_generate_tei_from_model_data(self):
    # End-to-end: segmentation model data -> segmentation training TEI.
    document = LayoutDocument.for_blocks([
        LayoutBlock(lines=[
            get_next_layout_line_for_text(TEXT_1),
            get_next_layout_line_for_text(TEXT_2)
        ])
    ])
    segmentation_data_generator = SegmentationDataGenerator(
        DEFAULT_DOCUMENT_FEATURES_CONTEXT,
        use_first_token_of_block=True
    )
    model_data_iterable = (
        segmentation_data_generator.iter_model_data_for_layout_document(
            document
        )
    )
    generator = SegmentationTeiTrainingDataGenerator()
    tei_root = generator.get_training_tei_xml_for_model_data_iterable(
        model_data_iterable
    )
    LOGGER.debug('xml: %r', etree.tostring(tei_root))
    text_nodes = tei_root.xpath('./text')
    assert len(text_nodes) == 1
    line_break_nodes = text_nodes[0].xpath('lb')
    assert len(line_break_nodes) == 2
    assert line_break_nodes[0].getparent().text == TEXT_1
    assert line_break_nodes[0].tail == '\n' + TEXT_2