예제 #1
0
    def test_should_extract_references_fields_from_document(  # pylint: disable=too-many-locals
            self, fulltext_models_mock: MockFullTextModels):
        """Check that a reference is split into fields and linked to its citation.

        The layout consists of a body (plain text followed by the citation
        marker '1') and a single reference made of a label and a title. With
        ``extract_citation_fields=True`` the test expects exactly one
        ``SemanticReference`` (content id 'b0') exposing the title, label and
        raw reference text, plus one ``SemanticReferenceCitation`` somewhere
        in the document targeting that same id.
        """
        body_text_block = LayoutBlock.for_text('the body')
        marker_block = LayoutBlock.for_text('1')
        body_block = LayoutBlock.merge_blocks([body_text_block, marker_block])
        label_block = LayoutBlock.for_text('1')
        title_block = LayoutBlock.for_text('Reference Title 1')
        raw_ref_block = LayoutBlock.merge_blocks([title_block])
        references_block = LayoutBlock.merge_blocks(
            [label_block, raw_ref_block])
        processor = FullTextProcessor(
            fulltext_models_mock,
            FullTextProcessorConfig(extract_citation_fields=True))

        segmentation_mock = fulltext_models_mock.segmentation_model_mock
        fulltext_mock = fulltext_models_mock.fulltext_model_mock
        ref_segmenter_mock = fulltext_models_mock.reference_segmenter_model_mock
        citation_mock = fulltext_models_mock.citation_model_mock

        # Each mock is told which label to predict for which layout block;
        # the registration order matches the document order.
        for block, label in ((body_block, '<body>'),
                             (references_block, '<references>')):
            segmentation_mock.update_label_by_layout_block(block, label)

        for block, label in ((body_text_block, '<section>'),
                             (marker_block, '<citation_marker>')):
            fulltext_mock.update_label_by_layout_block(block, label)

        for block, label in ((label_block, '<label>'),
                             (raw_ref_block, '<reference>')):
            ref_segmenter_mock.update_label_by_layout_block(block, label)

        citation_mock.update_label_by_layout_block(title_block, '<title>')

        single_page = LayoutPage(blocks=[body_block, references_block])
        document = processor.get_semantic_document_for_layout_document(
            layout_document=LayoutDocument(pages=[single_page]))
        LOGGER.debug('semantic_document: %s', document)
        assert document is not None

        found_lists = list(
            document.back_section.iter_by_type(SemanticReferenceList))
        assert len(found_lists) == 1
        found_refs = list(found_lists[0].iter_by_type(SemanticReference))
        assert len(found_refs) == 1
        parsed_ref = found_refs[0]
        assert parsed_ref.get_text_by_type(SemanticTitle) == title_block.text
        assert parsed_ref.get_text_by_type(SemanticLabel) == label_block.text
        assert parsed_ref.get_text_by_type(
            SemanticRawReferenceText) == raw_ref_block.text
        assert parsed_ref.content_id == 'b0'

        # The in-body marker must resolve to the reference extracted above.
        found_citations = list(
            document.iter_by_type_recursively(SemanticReferenceCitation))
        assert len(found_citations) == 1
        assert found_citations[0].target_content_id == 'b0'
예제 #2
0
    def test_should_extract_table_label_caption_from_body(  # pylint: disable=too-many-locals
            self, fulltext_models_mock: MockFullTextModels,
            segmentation_label: str):
        """Check that a table's label and caption are extracted and cited.

        A 'Table 1' marker is followed by a table block made of a label, an
        unlabelled 'Other' block and a caption. The whole page block is
        assigned ``segmentation_label``. The test expects one
        ``SemanticTable`` (content id 'tab_0') in the body or back section
        and one ``SemanticTableCitation`` pointing at it.
        """
        marker_block = LayoutBlock.for_text('Table 1')
        label_block = LayoutBlock.for_text('Table 1')
        caption_block = LayoutBlock.for_text('Caption 1')
        filler_block = LayoutBlock.for_text('Other')
        table_block = LayoutBlock.merge_blocks(
            [label_block, filler_block, caption_block])
        page_block = LayoutBlock.merge_blocks([marker_block, table_block])
        processor = FullTextProcessor(
            fulltext_models_mock,
            FullTextProcessorConfig(extract_table_fields=True))

        segmentation_mock = fulltext_models_mock.segmentation_model_mock
        fulltext_mock = fulltext_models_mock.fulltext_model_mock
        table_mock = fulltext_models_mock.table_model_mock

        segmentation_mock.update_label_by_layout_block(
            page_block, segmentation_label)

        for block, label in ((marker_block, '<table_marker>'),
                             (table_block, '<table>')):
            fulltext_mock.update_label_by_layout_block(block, label)

        # The filler block deliberately receives no label from the table model.
        for block, label in ((label_block, '<label>'),
                             (caption_block, '<figDesc>')):
            table_mock.update_label_by_layout_block(block, label)

        document = processor.get_semantic_document_for_layout_document(
            layout_document=LayoutDocument(
                pages=[LayoutPage(blocks=[page_block])]))
        LOGGER.debug('semantic_document: %s', document)
        assert document is not None

        searched_sections = [document.body_section, document.back_section]
        found_tables = list(
            iter_by_semantic_type_recursively(searched_sections,
                                              SemanticTable))
        assert len(found_tables) == 1
        parsed_table = found_tables[0]
        assert parsed_table.get_text_by_type(
            SemanticLabel) == label_block.text
        assert parsed_table.get_text_by_type(
            SemanticCaption) == caption_block.text
        assert parsed_table.content_id == 'tab_0'

        found_citations = list(
            document.iter_by_type_recursively(SemanticTableCitation))
        assert len(found_citations) == 1
        only_citation = found_citations[0]
        assert only_citation.get_text() == marker_block.text
        assert only_citation.target_content_id == 'tab_0'
예제 #3
0
    def test_should_extract_editor_names_from_references_fields(  # pylint: disable=too-many-locals
            self, fulltext_models_mock: MockFullTextModels):
        """Check that editor names inside a reference are parsed into name parts.

        The reference consists solely of an editors block (given name, an
        unlabelled 'Other' block, surname). The processor is configured with
        citation fields and citation editors enabled and citation authors
        disabled. Exactly one ``SemanticEditor`` carrying the given name and
        surname texts is expected.
        """
        forename_block = LayoutBlock.for_text('Given name')
        family_name_block = LayoutBlock.for_text('Surname')
        filler_block = LayoutBlock.for_text('Other')
        editors_block = LayoutBlock.merge_blocks(
            [forename_block, filler_block, family_name_block])
        raw_ref_block = LayoutBlock.merge_blocks([editors_block])
        references_block = LayoutBlock.merge_blocks([raw_ref_block])
        processor_config = FullTextProcessorConfig(
            extract_citation_fields=True,
            extract_citation_authors=False,
            extract_citation_editors=True)
        processor = FullTextProcessor(fulltext_models_mock, processor_config)

        segmentation_mock = fulltext_models_mock.segmentation_model_mock
        ref_segmenter_mock = fulltext_models_mock.reference_segmenter_model_mock
        citation_mock = fulltext_models_mock.citation_model_mock
        name_citation_mock = fulltext_models_mock.name_citation_model_mock

        segmentation_mock.update_label_by_layout_block(
            references_block, '<references>')
        ref_segmenter_mock.update_label_by_layout_block(
            raw_ref_block, '<reference>')
        citation_mock.update_label_by_layout_block(editors_block, '<editor>')

        # Only the two name parts are labelled; the filler block is not.
        for block, label in ((forename_block, '<forename>'),
                             (family_name_block, '<surname>')):
            name_citation_mock.update_label_by_layout_block(block, label)

        document = processor.get_semantic_document_for_layout_document(
            layout_document=LayoutDocument(
                pages=[LayoutPage(blocks=[references_block])]))
        LOGGER.debug('semantic_document: %s', document)
        assert document is not None

        found_lists = list(
            document.back_section.iter_by_type(SemanticReferenceList))
        assert len(found_lists) == 1
        found_refs = list(found_lists[0].iter_by_type(SemanticReference))
        assert len(found_refs) == 1
        found_editors = list(found_refs[0].iter_by_type(SemanticEditor))
        assert len(found_editors) == 1
        only_editor = found_editors[0]
        assert only_editor.given_name_text == forename_block.text
        assert only_editor.surname_text == family_name_block.text
예제 #4
0
    def test_should_extract_affiliation_address_from_document(  # pylint: disable=too-many-locals
            self, fulltext_models_mock: MockFullTextModels):
        """Check that an affiliation and its address become one semantic element.

        The header is an affiliation block (marker + institution) followed by
        an address block (country). The test expects the front matter text to
        equal the header text, and a single ``SemanticAffiliationAddress``
        with id 'aff0' exposing the marker, institution and country.
        """
        marker_block = LayoutBlock.for_text('1')
        institution_block = LayoutBlock.for_text('Institution1')
        country_block = LayoutBlock.for_text('Country1')
        affiliation_block = LayoutBlock.merge_blocks(
            [marker_block, institution_block])
        address_block = LayoutBlock.merge_blocks([country_block])
        # The merged affiliation + address block is also the whole header.
        header_block = LayoutBlock.merge_blocks(
            [affiliation_block, address_block])
        processor = FullTextProcessor(fulltext_models_mock)

        segmentation_mock = fulltext_models_mock.segmentation_model_mock
        header_mock = fulltext_models_mock.header_model_mock
        aff_address_mock = fulltext_models_mock.affiliation_address_model_mock

        segmentation_mock.update_label_by_layout_block(
            header_block, '<header>')

        for block, label in ((affiliation_block, '<affiliation>'),
                             (address_block, '<address>')):
            header_mock.update_label_by_layout_block(block, label)

        for block, label in ((marker_block, '<marker>'),
                             (institution_block, '<institution>'),
                             (country_block, '<country>')):
            aff_address_mock.update_label_by_layout_block(block, label)

        document = processor.get_semantic_document_for_layout_document(
            layout_document=LayoutDocument(
                pages=[LayoutPage(blocks=[header_block])]))
        assert document is not None

        expected_text = header_block.text
        assert document.front.get_text() == expected_text
        affiliation_view = document.front.view_by_type(
            SemanticAffiliationAddress)
        assert affiliation_view.get_text() == expected_text

        found_affiliations = list(
            document.front.iter_by_type(SemanticAffiliationAddress))
        assert len(found_affiliations) == 1
        only_affiliation = found_affiliations[0]
        assert only_affiliation.get_text_by_type(
            SemanticMarker) == marker_block.text
        assert only_affiliation.get_text_by_type(
            SemanticInstitution) == institution_block.text
        assert only_affiliation.get_text_by_type(
            SemanticCountry) == country_block.text
        assert only_affiliation.content_id == 'aff0'
예제 #5
0
def normalize_name_parts(name: T_SemanticName):
    """Normalize a semantic name, or replace it with a note if it has no surname.

    When ``name`` contains at least one ``SemanticSurname`` it is passed to
    ``convert_two_letter_uppercase_given_name_to_given_middle_name`` and then
    ``convert_name_parts_to_title_case`` (their return values are ignored)
    and the same ``name`` object is returned.

    Otherwise a ``SemanticNote`` with ``note_type='invalid_author_name'`` is
    returned, wrapping all of the name's layout blocks merged into one.
    """
    has_surname = bool(list(name.iter_by_type(SemanticSurname)))
    if has_surname:
        convert_two_letter_uppercase_given_name_to_given_middle_name(name)
        convert_name_parts_to_title_case(name)
        return name
    # No surname found: hand the raw content back as a note instead.
    merged_name_block = LayoutBlock.merge_blocks(name.iter_blocks())
    return SemanticNote(
        layout_block=merged_name_block,
        note_type='invalid_author_name')
예제 #6
0
    def __init__(self) -> None:
        """Build a small sample layout document and keep every block as an attribute.

        The single page contains, in order, a header block (title, author,
        affiliation), a body block (section title, paragraph, figure, table)
        and a reference block (label followed by title + author).
        """
        text_block = LayoutBlock.for_text
        merge = LayoutBlock.merge_blocks

        # --- header ---
        self.title_block = text_block('This is the title')

        self.author_surname_block = text_block('Author Surname 1')
        self.author_block = merge([self.author_surname_block])

        self.institution_block = text_block('Institution 1')
        self.affiliation_block = merge([self.institution_block])

        self.header_block = merge([
            self.title_block,
            self.author_block,
            self.affiliation_block,
        ])

        # --- body ---
        self.figure_head_block = text_block('Figure 1')
        self.figure_block = merge([self.figure_head_block])

        self.table_head_block = text_block('Table 1')
        self.table_block = merge([self.table_head_block])

        self.body_section_title_block = text_block('Section 1')
        self.body_section_paragraph_block = text_block('Paragraph 1')
        self.body_block = merge([
            self.body_section_title_block,
            self.body_section_paragraph_block,
            self.figure_block,
            self.table_block,
        ])

        # --- references ---
        self.ref_author_surname_block = text_block('Ref Author Surname 1')
        self.ref_author_block = merge([self.ref_author_surname_block])

        self.ref_label_block = text_block('1')
        self.ref_title_block = text_block('Reference 1')
        self.ref_text_block = merge(
            [self.ref_title_block, self.ref_author_block])
        self.ref_ref_block = merge(
            [self.ref_label_block, self.ref_text_block])

        # --- the document: one page holding the three top-level blocks ---
        only_page = LayoutPage(blocks=[
            self.header_block,
            self.body_block,
            self.ref_ref_block,
        ])
        self.layout_document = LayoutDocument(pages=[only_page])
예제 #7
0
    def test_should_extract_invalid_reference_from_document(  # pylint: disable=too-many-locals
            self, fulltext_models_mock: MockFullTextModels):
        """Check that a reference without any recognised field is marked invalid.

        The reference text is labelled 'O' by the citation model mock. The
        test expects the reference list to contain exactly one
        ``SemanticInvalidReference`` holding the full reference text.
        """
        body_text_block = LayoutBlock.for_text('the body')
        marker_block = LayoutBlock.for_text('1')
        body_block = LayoutBlock.merge_blocks([body_text_block, marker_block])
        # One block serves as the references section, the single reference
        # and its (unparseable) content.
        invalid_ref_block = LayoutBlock.for_text(
            'This is an invalid reference 1')
        processor = FullTextProcessor(
            fulltext_models_mock,
            FullTextProcessorConfig(extract_citation_fields=True))

        segmentation_mock = fulltext_models_mock.segmentation_model_mock
        fulltext_mock = fulltext_models_mock.fulltext_model_mock
        ref_segmenter_mock = fulltext_models_mock.reference_segmenter_model_mock
        citation_mock = fulltext_models_mock.citation_model_mock

        for block, label in ((body_block, '<body>'),
                             (invalid_ref_block, '<references>')):
            segmentation_mock.update_label_by_layout_block(block, label)

        for block, label in ((body_text_block, '<section>'),
                             (marker_block, '<citation_marker>')):
            fulltext_mock.update_label_by_layout_block(block, label)

        ref_segmenter_mock.update_label_by_layout_block(
            invalid_ref_block, '<reference>')

        citation_mock.update_label_by_layout_block(invalid_ref_block, 'O')

        document = processor.get_semantic_document_for_layout_document(
            layout_document=LayoutDocument(
                pages=[LayoutPage(blocks=[body_block, invalid_ref_block])]))
        LOGGER.debug('semantic_document: %s', document)
        assert document is not None

        found_lists = list(
            document.back_section.iter_by_type(SemanticReferenceList))
        assert len(found_lists) == 1
        invalid_refs = list(
            found_lists[0].iter_by_type(SemanticInvalidReference))
        assert len(invalid_refs) == 1
        assert invalid_refs[0].get_text() == invalid_ref_block.text
예제 #8
0
 def test_should_add_raw_affiliation_with_formatting(self):
     """Check the raw_affiliation note when the institution has mixed fonts.

     The affiliation has a marker '1' and an institution built from a block
     using ``BOLD_FONT_1`` and a block using ``ITALICS_FONT_1``. The generated
     TEI must contain a ``raw_affiliation`` note whose text content equals the
     affiliation's text, with the marker inside a ``tei:label`` child.
     """
     marker = SemanticMarker(layout_block=LayoutBlock.for_text('1'))
     formatted_institution_block = LayoutBlock.merge_blocks([
         LayoutBlock.for_text('bold', font=BOLD_FONT_1),
         LayoutBlock.for_text('italic', font=ITALICS_FONT_1)
     ])
     institution = SemanticInstitution(
         layout_block=formatted_institution_block)
     affiliation = SemanticAffiliationAddress([marker, institution])

     tei_element = get_tei_affiliation_for_semantic_affiliation_address_element(
         affiliation,
         context=DEFAULT_TEI_ELEMENT_FACTORY_CONTEXT
     )
     tei_aff = TeiElementWrapper(tei_element)
     LOGGER.debug('tei_aff: %r', etree.tostring(tei_aff.element))

     raw_note_texts = tei_aff.get_xpath_text_content_list(
         'tei:note[@type="raw_affiliation"]'
     )
     assert raw_note_texts == [affiliation.get_text()]

     raw_note_labels = tei_aff.get_xpath_text_content_list(
         'tei:note[@type="raw_affiliation"]/tei:label'
     )
     assert raw_note_labels == ['1']
예제 #9
0
    def test_should_extract_author_names_separated_by_another_tag(
            self, fulltext_models_mock: MockFullTextModels):
        """Check that author parts split by an unlabelled block yield one author.

        The header is 'Given name', 'Other', 'Surname'; only the first and the
        last block are labelled '<author>' by the header model mock. With
        ``merge_raw_authors=True`` a single author carrying both the given
        name and the surname is expected.
        """
        forename_block = LayoutBlock.for_text('Given name')
        family_name_block = LayoutBlock.for_text('Surname')
        filler_block = LayoutBlock.for_text('Other')
        # The authors block doubles as the complete header.
        header_block = LayoutBlock.merge_blocks(
            [forename_block, filler_block, family_name_block])
        processor = FullTextProcessor(
            fulltext_models_mock,
            config=FullTextProcessorConfig(merge_raw_authors=True))

        segmentation_mock = fulltext_models_mock.segmentation_model_mock
        header_mock = fulltext_models_mock.header_model_mock
        name_header_mock = fulltext_models_mock.name_header_model_mock

        segmentation_mock.update_label_by_layout_block(
            header_block, '<header>')

        # The filler block between the two name parts stays unlabelled.
        header_mock.update_label_by_layout_block(forename_block, '<author>')
        header_mock.update_label_by_layout_block(family_name_block, '<author>')

        name_header_mock.update_label_by_layout_block(
            forename_block, '<forename>')
        name_header_mock.update_label_by_layout_block(
            family_name_block, '<surname>')

        document = processor.get_semantic_document_for_layout_document(
            layout_document=LayoutDocument(
                pages=[LayoutPage(blocks=[header_block])]))
        assert document is not None

        found_authors = document.front.authors
        assert len(found_authors) == 1
        assert found_authors[0].given_name_text == forename_block.text
        assert found_authors[0].surname_text == family_name_block.text
예제 #10
0
    def test_should_extract_raw_references_from_document(  # pylint: disable=too-many-locals
            self, fulltext_models_mock: MockFullTextModels):
        """Check that references stay raw when citation field extraction is off.

        With ``extract_citation_fields=False`` the labelled reference is
        expected as one ``SemanticRawReference`` (content id 'b0') exposing
        its label and raw reference text, and the back section text must
        equal the text of the references block.
        """
        label_block = LayoutBlock.for_text('1')
        raw_ref_block = LayoutBlock.for_text('Reference 1')
        references_block = LayoutBlock.merge_blocks(
            [label_block, raw_ref_block])
        processor = FullTextProcessor(
            fulltext_models_mock,
            FullTextProcessorConfig(extract_citation_fields=False))

        segmentation_mock = fulltext_models_mock.segmentation_model_mock
        ref_segmenter_mock = fulltext_models_mock.reference_segmenter_model_mock

        segmentation_mock.update_label_by_layout_block(
            references_block, '<references>')

        for block, label in ((label_block, '<label>'),
                             (raw_ref_block, '<reference>')):
            ref_segmenter_mock.update_label_by_layout_block(block, label)

        document = processor.get_semantic_document_for_layout_document(
            layout_document=LayoutDocument(
                pages=[LayoutPage(blocks=[references_block])]))
        LOGGER.debug('semantic_document: %s', document)
        assert document is not None
        assert document.back_section.get_text() == references_block.text

        found_lists = list(
            document.back_section.iter_by_type(SemanticReferenceList))
        assert len(found_lists) == 1
        raw_refs = list(found_lists[0].iter_by_type(SemanticRawReference))
        assert len(raw_refs) == 1
        only_ref = raw_refs[0]
        assert only_ref.get_text_by_type(SemanticLabel) == label_block.text
        assert only_ref.get_text_by_type(
            SemanticRawReferenceText) == raw_ref_block.text
        assert only_ref.content_id == 'b0'
예제 #11
0
    def test_should_not_merge_separate_raw_affiliations(  # pylint: disable=too-many-locals
            self, fulltext_models_mock: MockFullTextModels):
        """Check that two adjacent affiliations remain two separate elements.

        The header is made of two institution blocks, each individually
        labelled '<affiliation>' and '<institution>'. The test expects two
        ``SemanticAffiliationAddress`` elements, in order, with ids 'aff0'
        and 'aff1' and texts matching the two blocks.
        """
        suffixes = ['1', '2']
        # Each institution block is at the same time a whole affiliation.
        affiliation_blocks = [
            LayoutBlock.for_text(f'Institution{t}') for t in suffixes
        ]
        processor = FullTextProcessor(fulltext_models_mock)
        header_block = LayoutBlock.merge_blocks(affiliation_blocks)

        segmentation_mock = fulltext_models_mock.segmentation_model_mock
        header_mock = fulltext_models_mock.header_model_mock
        aff_address_mock = fulltext_models_mock.affiliation_address_model_mock

        segmentation_mock.update_label_by_layout_block(
            header_block, '<header>')

        for block in affiliation_blocks:
            header_mock.update_label_by_layout_block(block, '<affiliation>')

        for block in affiliation_blocks:
            aff_address_mock.update_label_by_layout_block(
                block, '<institution>')

        document = processor.get_semantic_document_for_layout_document(
            layout_document=LayoutDocument(
                pages=[LayoutPage(blocks=[header_block])]))
        assert document is not None

        affiliations = list(
            document.front.iter_by_type(SemanticAffiliationAddress))
        LOGGER.debug('affiliations: %r', affiliations)
        actual_texts = [found.get_text() for found in affiliations]
        expected_texts = [block.text for block in affiliation_blocks]
        assert actual_texts == expected_texts
        actual_ids = [found.content_id for found in affiliations]
        assert actual_ids == ['aff0', 'aff1']
예제 #12
0
    def test_should_extract_author_names_from_document(
            self, fulltext_models_mock: MockFullTextModels):
        """Check that a header author block is parsed into given name and surname.

        The header is a single authors block ('Given name' + 'Surname'). The
        test expects the front matter text, and its ``SemanticAuthor`` view,
        to equal the block's text, and exactly one author with both parts.
        """
        forename_block = LayoutBlock.for_text('Given name')
        family_name_block = LayoutBlock.for_text('Surname')
        # The authors block is also the entire header.
        authors_block = LayoutBlock.merge_blocks(
            [forename_block, family_name_block])
        processor = FullTextProcessor(fulltext_models_mock)

        segmentation_mock = fulltext_models_mock.segmentation_model_mock
        header_mock = fulltext_models_mock.header_model_mock
        name_header_mock = fulltext_models_mock.name_header_model_mock

        segmentation_mock.update_label_by_layout_block(
            authors_block, '<header>')

        header_mock.update_label_by_layout_block(authors_block, '<author>')

        name_header_mock.update_label_by_layout_block(
            forename_block, '<forename>')
        name_header_mock.update_label_by_layout_block(
            family_name_block, '<surname>')

        document = processor.get_semantic_document_for_layout_document(
            layout_document=LayoutDocument(
                pages=[LayoutPage(blocks=[authors_block])]))
        assert document is not None

        assert document.front.get_text() == authors_block.text
        author_view = document.front.view_by_type(SemanticAuthor)
        assert author_view.get_text() == authors_block.text

        found_authors = document.front.authors
        assert len(found_authors) == 1
        assert found_authors[0].given_name_text == forename_block.text
        assert found_authors[0].surname_text == family_name_block.text
예제 #13
0
 def merged_block(self) -> LayoutBlock:
     """Return one ``LayoutBlock`` merging everything from ``self.iter_blocks()``."""
     own_blocks = self.iter_blocks()
     return LayoutBlock.merge_blocks(own_blocks)
예제 #14
0
    def test_should_extract_figure_label_caption_from_body(  # pylint: disable=too-many-locals
            self, fulltext_models_mock: MockFullTextModels,
            segmentation_label: str):
        """Check that a figure's label, caption and graphic are extracted and cited.

        The page holds a graphic plus a text block: a 'Figure 1' marker
        followed by a figure block (label, unlabelled 'Other', caption). The
        label and caption are positioned 10 and 20 units below the graphic's
        coordinates. The whole page block is assigned ``segmentation_label``.

        Expected: one ``SemanticFigure`` (content id 'fig_0') in the body or
        back section, one ``SemanticFigureCitation`` pointing at it, and a
        ``SemanticGraphic`` in the figure that references the page graphic
        with the file's base name as relative path.
        """
        marker_block = LayoutBlock.for_text('Figure 1')
        graphic_coordinates = LayoutPageCoordinates(
            x=10, y=10, width=100, height=10)
        graphic_path = '/path/to/graphic1.svg'
        page_graphic = LayoutGraphic(coordinates=graphic_coordinates,
                                     local_file_path=graphic_path)
        # Stack label and caption below the graphic, one step each.
        label_coordinates = graphic_coordinates.move_by(dy=10)
        label_block = LayoutBlock.for_text('Figure 1',
                                           coordinates=label_coordinates)
        caption_coordinates = label_coordinates.move_by(dy=10)
        caption_block = LayoutBlock.for_text('Caption 1',
                                             coordinates=caption_coordinates)
        filler_block = LayoutBlock.for_text('Other')
        figure_block = LayoutBlock.merge_blocks(
            [label_block, filler_block, caption_block])
        page_block = LayoutBlock.merge_blocks([marker_block, figure_block])
        processor_config = FullTextProcessorConfig(
            extract_figure_fields=True,
            extract_graphic_bounding_boxes=True,
            extract_graphic_assets=True)
        processor = FullTextProcessor(fulltext_models_mock, processor_config)

        segmentation_mock = fulltext_models_mock.segmentation_model_mock
        fulltext_mock = fulltext_models_mock.fulltext_model_mock
        figure_mock = fulltext_models_mock.figure_model_mock

        segmentation_mock.update_label_by_layout_block(
            page_block, segmentation_label)

        for block, label in ((marker_block, '<figure_marker>'),
                             (figure_block, '<figure>')):
            fulltext_mock.update_label_by_layout_block(block, label)

        # The filler block deliberately receives no label from the figure model.
        for block, label in ((label_block, '<label>'),
                             (caption_block, '<figDesc>')):
            figure_mock.update_label_by_layout_block(block, label)

        only_page = LayoutPage(blocks=[page_block], graphics=[page_graphic])
        document = processor.get_semantic_document_for_layout_document(
            layout_document=LayoutDocument(pages=[only_page]))
        LOGGER.debug('semantic_document: %s', document)
        assert document is not None

        searched_sections = [document.body_section, document.back_section]
        found_figures = list(
            iter_by_semantic_type_recursively(searched_sections,
                                              SemanticFigure))
        assert len(found_figures) == 1
        parsed_figure = found_figures[0]
        assert parsed_figure.get_text_by_type(
            SemanticLabel) == label_block.text
        assert parsed_figure.get_text_by_type(
            SemanticCaption) == caption_block.text
        assert parsed_figure.content_id == 'fig_0'

        found_citations = list(
            document.iter_by_type_recursively(SemanticFigureCitation))
        assert len(found_citations) == 1
        only_citation = found_citations[0]
        assert only_citation.get_text() == marker_block.text
        assert only_citation.target_content_id == 'fig_0'

        found_graphics = list(parsed_figure.iter_by_type(SemanticGraphic))
        assert found_graphics
        first_graphic = found_graphics[0]
        assert first_graphic.layout_graphic == page_graphic
        assert first_graphic.relative_path == os.path.basename(graphic_path)