Example #1
    def iter_model_layout_documents(
        self, layout_document: LayoutDocument,
        document_context: TrainingDataDocumentContext
    ) -> Iterable[LayoutDocument]:
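        # Segment the document, keep the '<references>' section, split it
        # into raw references, then label each reference with the citation
        # model to collect the raw author blocks.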
        reference_segmenter_model = document_context.fulltext_models.reference_segmenter_model
        citation_model = document_context.fulltext_models.citation_model
        segmentation_label_result = get_segmentation_label_result(
            layout_document, document_context=document_context)
        references_layout_document = segmentation_label_result.get_filtered_document_by_label(
            '<references>').remove_empty_blocks()
        reference_segmenter_labeled_layout_tokens = (
            get_labeled_layout_tokens_for_model_and_layout_document(
                model=reference_segmenter_model,
                layout_document=references_layout_document,
                document_context=document_context))
        raw_reference_text_list = [
            raw_reference_text
            for raw_reference in SemanticMixedContentWrapper(
                list(
                    reference_segmenter_model.
                    iter_semantic_content_for_labeled_layout_tokens(
                        reference_segmenter_labeled_layout_tokens))
            ).iter_by_type(SemanticRawReference) for raw_reference_text in
            raw_reference.iter_by_type(SemanticRawReferenceText)
        ]
        LOGGER.info('raw_reference_text_list count: %d',
                    len(raw_reference_text_list))
        if not raw_reference_text_list:
            return []
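        # Build one layout document per raw reference text so that the
        # citation model can label each reference separately.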
        citation_layout_documents = [
            LayoutDocument.for_blocks(
                list(semantic_raw_reference_text.iter_blocks()))
            for semantic_raw_reference_text in raw_reference_text_list
        ]
        citation_labeled_layout_tokens_list = (
            get_labeled_layout_tokens_list_for_model_and_layout_documents(
                model=citation_model,
                layout_documents=citation_layout_documents,
                document_context=document_context))
        semantic_raw_author_list = [
            raw_author for citation_labeled_layout_tokens in
            citation_labeled_layout_tokens_list
            for raw_author in SemanticMixedContentWrapper(
                list(
                    citation_model.
                    iter_semantic_content_for_labeled_layout_tokens(
                        citation_labeled_layout_tokens))
            ).iter_by_type_recursively(SemanticRawAuthors)
        ]
        LOGGER.info('semantic_raw_author_list count: %d',
                    len(semantic_raw_author_list))
        if not semantic_raw_author_list:
            return []

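        # Combine the blocks of all raw authors into a single layout document.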
        return [
            LayoutDocument.for_blocks([
                block for semantic_raw_author in semantic_raw_author_list
                for block in semantic_raw_author.iter_blocks()
            ])
        ]
Example #2
 def iter_filter_layout_document(
         self, layout_document: LayoutDocument) -> Iterable[LayoutDocument]:
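     # Restrict the document to the '<header>' segment, label it with the
     # header model and collect the raw author content.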
     header_layout_document = self.filter_layout_document_by_segmentation_label(
         layout_document, '<header>')
     labeled_layout_tokens = self.header_model.predict_labels_for_layout_document(
         header_layout_document,
         app_features_context=self.app_features_context)
     LOGGER.debug('labeled_layout_tokens: %r', labeled_layout_tokens)
     semantic_raw_authors_list = list(
         SemanticMixedContentWrapper(
             list(
                 self.header_model.
                 iter_semantic_content_for_labeled_layout_tokens(
                     labeled_layout_tokens))).iter_by_type(
                         SemanticRawAuthors))
     LOGGER.info('semantic_raw_authors_list count: %d',
                 len(semantic_raw_authors_list))
     LOGGER.info('merge_raw_authors: %s', self.merge_raw_authors)
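     # Either merge all raw author blocks into one layout document or emit
     # one layout document per raw authors group.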
     if self.merge_raw_authors:
         return [
             LayoutDocument.for_blocks([
                 block for semantic_raw_authors in semantic_raw_authors_list
                 for block in semantic_raw_authors.iter_blocks()
             ]).remove_empty_blocks()
         ]
     return [
         LayoutDocument.for_blocks(list(
             semantic_raw_authors.iter_blocks())).remove_empty_blocks()
         for semantic_raw_authors in semantic_raw_authors_list
     ]
Example #3
 def iter_filter_layout_document(
         self, layout_document: LayoutDocument) -> Iterable[LayoutDocument]:
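     # Label the configured segmentation sections with the full text model
     # and emit one layout document for each semantic content element of
     # the configured type.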
     fulltext_layout_documents = list(
         self.iter_filter_layout_document_by_segmentation_labels(
             layout_document, self.segmentation_labels))
     fulltext_labeled_layout_tokens_list = (
         self.fulltext_model.predict_labels_for_layout_documents(
             fulltext_layout_documents,
             app_features_context=self.app_features_context))
     LOGGER.debug('fulltext_labeled_layout_tokens_list: %r',
                  fulltext_labeled_layout_tokens_list)
     semantic_content_list = [
         semantic_content for fulltext_labeled_layout_tokens in
         fulltext_labeled_layout_tokens_list
         for semantic_content in iter_by_semantic_type_recursively(
             self.fulltext_model.
             iter_semantic_content_for_labeled_layout_tokens(
                 fulltext_labeled_layout_tokens), self.semantic_type)
     ]
     LOGGER.debug('semantic_content_list: %s', semantic_content_list)
     return [
         LayoutDocument.for_blocks(
             [semantic_content.merged_block]).remove_empty_blocks()
         for semantic_content in semantic_content_list
     ]
Example #4
def convert_two_letter_uppercase_given_name_to_given_middle_name(
        name: T_SemanticName):
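    # Only applies when there is exactly one given name consisting of two
    # uppercase letters and no middle name yet.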
    given_names = list(name.iter_by_type(SemanticGivenName))
    middle_names = list(name.iter_by_type(SemanticMiddleName))
    if middle_names:
        LOGGER.debug('already has a middle name: %r', middle_names)
        return
    if len(given_names) != 1:
        LOGGER.debug('no or too many given names: %r', given_names)
        return
    given_name_text = given_names[0].get_text()
    if len(given_name_text) != 2 or not given_name_text.isupper():
        LOGGER.debug('not two uppercase characters: %r', given_name_text)
        return
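    # Re-tokenize the two-letter given name into individual characters and
    # map the first character to a given name and the second to a middle name.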
    layout_document = LayoutDocument.for_blocks(
        list(given_names[0].iter_blocks()))
    retokenized_layout_document = layout_document.retokenize(
        tokenize_fn=tokenize_individual_characters)
    LOGGER.debug('retokenized_layout_document: %r',
                 retokenized_layout_document)
    split_name_parts = [
        (SemanticGivenName(layout_block=LayoutBlock.for_tokens([token]))
         if index == 0 else SemanticMiddleName(
             layout_block=LayoutBlock.for_tokens([token])))
        for index, token in enumerate(
            retokenized_layout_document.iter_all_tokens())
    ]
    LOGGER.debug('split_name_parts: %r', split_name_parts)
    name.flat_map_inplace_by_type(SemanticGivenName,
                                  lambda _: split_name_parts)
Example #5
 def test_should_include_layout_document_text_in_tei_output(self):
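     # The layout document text should appear under the node selected by
     # FIGURE_XPATH in the generated training TEI.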
     layout_document = LayoutDocument.for_blocks(
         [LayoutBlock.for_text(TEXT_1)])
     xml_root = get_training_tei_xml_for_layout_document(layout_document)
     nodes = tei_xpath(xml_root, FIGURE_XPATH)
     assert len(nodes) == 1
     assert get_text_content(nodes[0]).rstrip() == TEXT_1
Example #6
    def iter_model_layout_documents(
        self, layout_document: LayoutDocument,
        document_context: TrainingDataDocumentContext
    ) -> Iterable[LayoutDocument]:
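        # Run the full text model over the '<body>' segment and emit one
        # layout document per raw table.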
        fulltext_model = document_context.fulltext_models.fulltext_model
        segmentation_label_result = get_segmentation_label_result(
            layout_document, document_context=document_context)
        body_layout_document = segmentation_label_result.get_filtered_document_by_label(
            '<body>').remove_empty_blocks()
        if not body_layout_document.pages:
            return []
        fulltext_labeled_layout_tokens = get_labeled_layout_tokens_for_model_and_layout_document(
            model=fulltext_model,
            layout_document=body_layout_document,
            document_context=document_context)
        raw_table_list = list(
            SemanticMixedContentWrapper(
                list(
                    fulltext_model.
                    iter_semantic_content_for_labeled_layout_tokens(
                        fulltext_labeled_layout_tokens))).
            iter_by_type_recursively(SemanticRawTable))
        LOGGER.info('raw_table_list count: %d', len(raw_table_list))

        if not raw_table_list:
            return []
        return [
            LayoutDocument.for_blocks(list(raw_table.iter_blocks()))
            for raw_table in raw_table_list
        ]
Example #7
    def iter_model_layout_documents(
        self, layout_document: LayoutDocument,
        document_context: TrainingDataDocumentContext
    ) -> Iterable[LayoutDocument]:
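        # Run the header model over the '<header>' segment and combine all
        # raw author blocks into a single layout document.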
        header_model = document_context.fulltext_models.header_model
        segmentation_label_result = get_segmentation_label_result(
            layout_document, document_context=document_context)
        header_layout_document = segmentation_label_result.get_filtered_document_by_label(
            '<header>').remove_empty_blocks()
        LOGGER.debug('header_layout_document: %r', header_layout_document)
        if not header_layout_document.pages:
            return []
        header_labeled_layout_tokens = get_labeled_layout_tokens_for_model_and_layout_document(
            model=header_model,
            layout_document=header_layout_document,
            document_context=document_context)
        semantic_raw_author_list = list(
            SemanticMixedContentWrapper(
                list(
                    header_model.
                    iter_semantic_content_for_labeled_layout_tokens(
                        header_labeled_layout_tokens))).iter_by_type(
                            SemanticRawAuthors))
        LOGGER.info('semantic_raw_author_list count: %d',
                    len(semantic_raw_author_list))
        if not semantic_raw_author_list:
            return []

        return [
            LayoutDocument.for_blocks([
                block for semantic_raw_author in semantic_raw_author_list
                for block in semantic_raw_author.iter_blocks()
            ])
        ]
Example #8
 def test_should_include_layout_document_text_in_tei_output(self):
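     # The generated segmentation training TEI should contain the document
     # text in a single <text> element.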
     training_data_generator = SegmentationTeiTrainingDataGenerator()
     layout_document = LayoutDocument.for_blocks(
         [LayoutBlock.for_text(TEXT_1)])
     xml_root = training_data_generator.get_training_tei_xml_for_model_data_iterable(
         get_model_data_list_for_layout_document(
             layout_document, data_generator=get_data_generator()))
     text_nodes = xml_root.xpath('./text')
     assert len(text_nodes) == 1
     assert get_text_content(text_nodes[0]).rstrip() == TEXT_1
Example #9
 def iter_filter_layout_document(
         self, layout_document: LayoutDocument) -> Iterable[LayoutDocument]:
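     # Two-stage extraction: the reference segmenter splits the references
     # section into raw references, then the citation model labels each
     # reference to locate its raw authors.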
     references_layout_document = self.filter_layout_document_by_segmentation_label(
         layout_document, '<references>')
     labeled_layout_tokens = self.reference_segmenter_model.predict_labels_for_layout_document(
         references_layout_document,
         app_features_context=self.app_features_context)
     LOGGER.debug('labeled_layout_tokens: %r', labeled_layout_tokens)
     semantic_raw_references = list(
         SemanticMixedContentWrapper(
             list(
                 self.reference_segmenter_model.
                 iter_semantic_content_for_labeled_layout_tokens(
                     labeled_layout_tokens))).iter_by_type(
                         SemanticRawReference))
     LOGGER.info('semantic_raw_references count: %d',
                 len(semantic_raw_references))
     raw_reference_documents = [
         LayoutDocument.for_blocks([
             semantic_raw_reference.view_by_type(
                 SemanticRawReferenceText).merged_block
         ]).remove_empty_blocks()
         for semantic_raw_reference in semantic_raw_references
     ]
     citation_labeled_layout_tokens_list = (
         self.citation_model.predict_labels_for_layout_documents(
             raw_reference_documents,
             app_features_context=self.app_features_context))
     raw_authors = [
         raw_author for citation_labeled_layout_tokens in
         citation_labeled_layout_tokens_list
         for ref in (self.citation_model.
                     iter_semantic_content_for_labeled_layout_tokens(
                         citation_labeled_layout_tokens))
         if isinstance(ref, SemanticReference)
         for raw_author in ref.iter_by_type(SemanticRawAuthors)
     ]
     return [
         LayoutDocument.for_blocks(
             [raw_author.merged_block]).remove_empty_blocks()
         for raw_author in raw_authors
     ]
Example #10
 def test_should_include_layout_document_text_in_tei_output(self):
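     # The document text should appear under the node selected by
     # AFFILIATION_XPATH in the generated training TEI.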
     training_data_generator = AffiliationAddressTeiTrainingDataGenerator()
     layout_document = LayoutDocument.for_blocks(
         [LayoutBlock.for_text(TEXT_1)])
     xml_root = training_data_generator.get_training_tei_xml_for_model_data_iterable(
         get_model_data_list_for_layout_document(
             layout_document, data_generator=get_data_generator()))
     LOGGER.debug('xml: %r', etree.tostring(xml_root))
     aff_nodes = tei_xpath(xml_root, AFFILIATION_XPATH)
     assert len(aff_nodes) == 1
     assert get_text_content(aff_nodes[0]).rstrip() == TEXT_1
Example #11
 def test_should_keep_original_whitespace(self):
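     # The original spacing between tokens should be preserved in the output.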
     training_data_generator = AffiliationAddressTeiTrainingDataGenerator()
     text = 'Token1, Token2  ,Token3'
     layout_document = LayoutDocument.for_blocks([
         LayoutBlock(
             lines=[LayoutLine.for_text(text, tail_whitespace='\n')])
     ])
     xml_root = training_data_generator.get_training_tei_xml_for_model_data_iterable(
         get_model_data_list_for_layout_document(
             layout_document, data_generator=get_data_generator()))
     aff_nodes = tei_xpath(xml_root, AFFILIATION_XPATH)
     assert len(aff_nodes) == 1
     assert get_text_content(aff_nodes[0]).rstrip() == text
Example #12
 def test_should_add_line_feeds(self):
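     # Consecutive layout lines should be separated by line feeds in the
     # generated TEI.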
     training_data_generator = AffiliationAddressTeiTrainingDataGenerator()
     layout_document = LayoutDocument.for_blocks([
         LayoutBlock(lines=[
             LayoutLine.for_text(TEXT_1, tail_whitespace='\n'),
             LayoutLine.for_text(TEXT_2, tail_whitespace='\n')
         ])
     ])
     xml_root = training_data_generator.get_training_tei_xml_for_model_data_iterable(
         get_model_data_list_for_layout_document(
             layout_document, data_generator=get_data_generator()))
     aff_nodes = tei_xpath(xml_root, AFFILIATION_XPATH)
     assert len(aff_nodes) == 1
     assert get_text_content(aff_nodes[0]).rstrip() == '\n'.join(
         [TEXT_1, TEXT_2])
Example #13
 def test_should_lb_elements_before_line_feeds(self):
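     # An <lb/> element is expected at the end of each line, before the line
     # feed that separates it from the next line.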
     training_data_generator = AffiliationAddressTeiTrainingDataGenerator()
     layout_document = LayoutDocument.for_blocks([
         LayoutBlock(lines=[
             LayoutLine.for_text(TEXT_1, tail_whitespace='\n'),
             LayoutLine.for_text(TEXT_2, tail_whitespace='\n')
         ])
     ])
     xml_root = training_data_generator.get_training_tei_xml_for_model_data_iterable(
         get_model_data_list_for_layout_document(
             layout_document, data_generator=get_data_generator()))
     aff_nodes = tei_xpath(xml_root, AFFILIATION_XPATH)
     assert len(aff_nodes) == 1
     lb_nodes = tei_xpath(aff_nodes[0], 'tei:lb')
     assert len(lb_nodes) == 2
     assert lb_nodes[0].getparent().text == TEXT_1
     assert lb_nodes[0].tail == '\n' + TEXT_2
Example #14
 def test_should_generate_tei_from_model_data(self):
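     # End-to-end: generate model data for the layout document and convert it
     # to training TEI, then check the affiliation element and line breaks.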
     layout_document = LayoutDocument.for_blocks([
         LayoutBlock(lines=[
             get_next_layout_line_for_text(TEXT_1),
             get_next_layout_line_for_text(TEXT_2)
         ])
     ])
     data_generator = get_data_generator()
     model_data_iterable = data_generator.iter_model_data_for_layout_document(
         layout_document)
     training_data_generator = AffiliationAddressTeiTrainingDataGenerator()
     xml_root = training_data_generator.get_training_tei_xml_for_model_data_iterable(
         model_data_iterable)
     LOGGER.debug('xml: %r', etree.tostring(xml_root))
     aff_nodes = tei_xpath(xml_root, AFFILIATION_XPATH)
     assert len(aff_nodes) == 1
     lb_nodes = tei_xpath(aff_nodes[0], 'tei:lb')
     assert len(lb_nodes) == 2
     assert lb_nodes[0].getparent().text == TEXT_1
     assert lb_nodes[0].tail == '\n' + TEXT_2
Example #15
 def test_should_generate_tei_from_model_data(self):
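     # Same end-to-end check using the segmentation data generator: the TEI
     # root should contain a single <text> element with line breaks.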
     layout_document = LayoutDocument.for_blocks([
         LayoutBlock(lines=[
             get_next_layout_line_for_text(TEXT_1),
             get_next_layout_line_for_text(TEXT_2)
         ])
     ])
     data_generator = SegmentationDataGenerator(
         DEFAULT_DOCUMENT_FEATURES_CONTEXT, use_first_token_of_block=True)
     model_data_iterable = data_generator.iter_model_data_for_layout_document(
         layout_document)
     training_data_generator = SegmentationTeiTrainingDataGenerator()
     xml_root = training_data_generator.get_training_tei_xml_for_model_data_iterable(
         model_data_iterable)
     LOGGER.debug('xml: %r', etree.tostring(xml_root))
     text_nodes = xml_root.xpath('./text')
     assert len(text_nodes) == 1
     lb_nodes = text_nodes[0].xpath('lb')
     assert len(lb_nodes) == 2
     assert lb_nodes[0].getparent().text == TEXT_1
     assert lb_nodes[0].tail == '\n' + TEXT_2