def test_should_add_single_author(self):
     """Verify that each name part of a single author is written to the TEI.

     A `SemanticAuthor` made of a title, given name, middle name, surname
     and suffix is added to the document front. The generated TEI is then
     queried for the corresponding elements below `tei:author`.
     """
     document = SemanticDocument()
     name_parts = [
         SemanticNameTitle(layout_block=LayoutBlock.for_text('Title1')),
         SemanticGivenName(layout_block=LayoutBlock.for_text('Given1')),
         SemanticMiddleName(layout_block=LayoutBlock.for_text('Middle1')),
         SemanticSurname(layout_block=LayoutBlock.for_text('Surname1')),
         SemanticNameSuffix(layout_block=LayoutBlock.for_text('Suffix1'))
     ]
     document.front.add_content(SemanticAuthor(name_parts))
     tei_document = get_tei_for_semantic_document(document)
     LOGGER.debug('tei xml: %r', etree.tostring(tei_document.root))
     # Each name part is expected below the author element, in its own tag.
     expected_text_by_xpath = [
         ('//tei:author//tei:roleName', ['Title1']),
         ('//tei:author//tei:forename[@type="first"]', ['Given1']),
         ('//tei:author//tei:forename[@type="middle"]', ['Middle1']),
         ('//tei:author//tei:surname', ['Surname1']),
         ('//tei:author//tei:genName', ['Suffix1'])
     ]
     for xpath, expected_text_list in expected_text_by_xpath:
         assert tei_document.get_xpath_text_content_list(
             xpath) == expected_text_list
예제 #2
0
 def test_should_extract_single_affiliation_address(self):
     """Verify that ten labelled blocks yield a single affiliation address.

     One block each is passed for marker, institution, department,
     laboratory, address line, post code, post box, region, settlement and
     country. The single resulting `SemanticAffiliationAddress` must expose
     the text of every block through the matching semantic type.
     """
     extractor = AffiliationAddressSemanticExtractor()
     entity_blocks = [
         ('<marker>', LayoutBlock.for_text('1')),
         ('<institution>', LayoutBlock.for_text('Institution 1')),
         ('<department>', LayoutBlock.for_text('Department 1')),
         ('<laboratory>', LayoutBlock.for_text('Laboratory 1')),
         ('<addrLine>', LayoutBlock.for_text('Address Line 1')),
         ('<postCode>', LayoutBlock.for_text('Post Code 1')),
         ('<postBox>', LayoutBlock.for_text('Post Box 1')),
         ('<region>', LayoutBlock.for_text('Region 1')),
         ('<settlement>', LayoutBlock.for_text('Settlement 1')),
         ('<country>', LayoutBlock.for_text('Country 1'))
     ]
     extracted = list(
         extractor.iter_semantic_content_for_entity_blocks(entity_blocks))
     assert len(extracted) == 1
     aff_address = extracted[0]
     assert isinstance(aff_address, SemanticAffiliationAddress)
     expected_text_by_type = [
         (SemanticMarker, '1'),
         (SemanticInstitution, 'Institution 1'),
         (SemanticDepartment, 'Department 1'),
         (SemanticLaboratory, 'Laboratory 1'),
         (SemanticAddressLine, 'Address Line 1'),
         (SemanticPostCode, 'Post Code 1'),
         (SemanticPostBox, 'Post Box 1'),
         (SemanticRegion, 'Region 1'),
         (SemanticSettlement, 'Settlement 1'),
         (SemanticCountry, 'Country 1')
     ]
     for semantic_type, expected_text in expected_text_by_type:
         assert aff_address.view_by_type(
             semantic_type).get_text() == expected_text
 def test_should_add_asset_citation_for_resolved_reference(self):
     """Verify that a citation with a target id becomes a `bibr` ref.

     The body section holds a paragraph ('See' followed by a citation
     targeting content id 'b0') and a reference list whose only reference
     carries that same id. The TEI paragraph must contain the citation as
     `tei:ref[@type="bibr"]` with the target '#b0'.
     """
     document = SemanticDocument()
     paragraph = SemanticParagraph([
         SemanticTextContentWrapper(
             layout_block=LayoutBlock.for_text('See')),
         SemanticReferenceCitation(
             layout_block=LayoutBlock.for_text('Ref 1'),
             target_content_id='b0')
     ])
     reference = SemanticReference(
         [SemanticLabel(layout_block=LayoutBlock.for_text('1'))],
         content_id='b0')
     document.body_section.add_content(
         SemanticSection([paragraph, SemanticReferenceList([reference])]))
     tei_document = get_tei_for_semantic_document(document)
     LOGGER.debug('tei xml: %r', etree.tostring(tei_document.root))
     expected_text_by_xpath = [
         ('//tei:body/tei:div/tei:p', ['See Ref 1']),
         ('//tei:body/tei:div/tei:p/tei:ref[@type="bibr"]', ['Ref 1']),
         ('//tei:body/tei:div/tei:p/tei:ref[@type="bibr"]/@target',
          ['#b0'])
     ]
     for xpath, expected_text_list in expected_text_by_xpath:
         assert tei_document.get_xpath_text_content_list(
             xpath) == expected_text_list
 def test_should_add_parsed_references(self):
     """Verify that a parsed reference is rendered into the TEI back.

     A reference with a title and raw reference text gets the content id
     'b0' and is added, together with a 'References' heading, as a
     reference list to the back section. The TEI must contain the heading,
     the main title, the raw reference note and the `xml:id`.
     """
     document = SemanticDocument()
     reference = SemanticReference([
         SemanticTitle(
             layout_block=LayoutBlock.for_text('Reference Title 1')),
         SemanticRawReferenceText(
             layout_block=LayoutBlock.for_text('Reference 1'))
     ])
     reference.content_id = 'b0'
     heading = SemanticHeading(
         layout_block=LayoutBlock.for_text('References'))
     document.back_section.add_content(
         SemanticReferenceList([heading, reference]))
     tei_document = get_tei_for_semantic_document(document)
     LOGGER.debug('tei xml: %r', etree.tostring(tei_document.root))
     expected_text_by_xpath = [
         (
             '//tei:back/tei:div[@type="references"]/tei:listBibl/tei:head',
             ['References']
         ),
         (
             '//tei:back/tei:div[@type="references"]/tei:listBibl'
             '/tei:biblStruct/tei:analytic/tei:title[@type="main"]',
             ['Reference Title 1']
         ),
         (
             '//tei:back/tei:div[@type="references"]/tei:listBibl'
             '/tei:biblStruct/tei:note[@type="raw_reference"]',
             ['Reference 1']
         ),
         (
             '//tei:back/tei:div[@type="references"]/tei:listBibl'
             '/tei:biblStruct/@xml:id',
             ['b0']
         )
     ]
     for xpath, expected_text_list in expected_text_by_xpath:
         assert tei_document.get_xpath_text_content_list(
             xpath) == expected_text_list
 def test_should_add_section_figures_to_back(self):
     """Verify that a figure inside a back section lands in the annex div.

     The figure (label, caption, content id 'fig_0') is wrapped in a
     section that is added to the back section. The TEI must show the
     figure directly below the annex `tei:div`, and no nested `tei:div`
     may be present there.
     """
     document = SemanticDocument()
     figure = SemanticFigure(
         [
             SemanticLabel(layout_block=LayoutBlock.for_text('Label 1')),
             SemanticCaption(layout_block=LayoutBlock.for_text('Caption 1'))
         ],
         content_id='fig_0')
     document.back_section.add_content(SemanticSection([figure]))
     tei_document = get_tei_for_semantic_document(document)
     LOGGER.debug('tei xml: %r', etree.tostring(tei_document.root))
     figure_xpath = (
         '//tei:back/tei:div[@type="annex"]/tei:figure[not(contains(@type, "table"))]'
     )
     expected_text_by_relative_xpath = [
         ('tei:head', ['Label 1']),
         ('tei:label', ['Label 1']),
         ('tei:figDesc', ['Caption 1']),
         ('@xml:id', ['fig_0'])
     ]
     for relative_xpath, expected_text_list in expected_text_by_relative_xpath:
         assert tei_document.get_xpath_text_content_list(
             f'{figure_xpath}/{relative_xpath}') == expected_text_list
     # The section itself must not produce a nested div inside the annex.
     nested_divs = tei_document.xpath(
         '//tei:back/tei:div[@type="annex"]/tei:div')
     assert not nested_divs
예제 #6
0
 def test_should_provide_page_and_block_status_for_single_token_blocks(
         self, features_provider: SegmentationLineFeaturesProvider):
     """Verify page and block status for three single-token blocks.

     One page with three blocks ('line1', 'line2', 'line3') is iterated
     with `_iter_line_features`. The page status is expected to go from
     'PAGESTART' over 'PAGEIN' to 'PAGEEND', while the block status is
     expected to be 'BLOCKSTART' for each of them.
     """
     single_page = LayoutPage(blocks=[
         LayoutBlock.for_text('line1'),
         LayoutBlock.for_text('line2'),
         LayoutBlock.for_text('line3')
     ])
     layout_document = LayoutDocument(pages=[single_page])
     feature_values = [
         {
             'page_status': line_features.get_page_status(),
             'block_status': line_features.get_block_status()
         }
         for line_features in _iter_line_features(
             features_provider, layout_document)
     ]
     LOGGER.debug('feature_values: %r', feature_values)
     expected_feature_values = [
         {'page_status': page_status, 'block_status': 'BLOCKSTART'}
         for page_status in ('PAGESTART', 'PAGEIN', 'PAGEEND')
     ]
     assert feature_values == expected_feature_values
예제 #7
0
    def test_should_extract_references_fields_from_document(  # pylint: disable=too-many-locals
            self, fulltext_models_mock: MockFullTextModels):
        """Verify that reference fields and the citation link are extracted.

        The layout consists of a body block (text plus a citation marker
        '1') and a references block (label '1' plus a reference title).
        The mocked segmentation, fulltext, reference segmenter and citation
        models label these blocks. With `extract_citation_fields` enabled,
        the back section is expected to contain one reference with title,
        label, raw text and content id 'b0', and the single reference
        citation is expected to target 'b0'.
        """
        body_text_block = LayoutBlock.for_text('the body')
        citation_marker_block = LayoutBlock.for_text('1')
        body_block = LayoutBlock.merge_blocks(
            [body_text_block, citation_marker_block])
        ref_label_block = LayoutBlock.for_text('1')
        ref_title_block = LayoutBlock.for_text('Reference Title 1')
        ref_text_block = LayoutBlock.merge_blocks([ref_title_block])
        ref_block = LayoutBlock.merge_blocks(
            [ref_label_block, ref_text_block])
        config = FullTextProcessorConfig(extract_citation_fields=True)
        fulltext_processor = FullTextProcessor(fulltext_models_mock, config)

        segmentation_mock = fulltext_models_mock.segmentation_model_mock
        fulltext_mock = fulltext_models_mock.fulltext_model_mock
        reference_segmenter_mock = (
            fulltext_models_mock.reference_segmenter_model_mock
        )
        citation_mock = fulltext_models_mock.citation_model_mock

        # (model mock, layout block, label) applied in this exact order
        label_assignments = [
            (segmentation_mock, body_block, '<body>'),
            (segmentation_mock, ref_block, '<references>'),
            (fulltext_mock, body_text_block, '<section>'),
            (fulltext_mock, citation_marker_block, '<citation_marker>'),
            (reference_segmenter_mock, ref_label_block, '<label>'),
            (reference_segmenter_mock, ref_text_block, '<reference>'),
            (citation_mock, ref_title_block, '<title>')
        ]
        for model_mock, layout_block, label in label_assignments:
            model_mock.update_label_by_layout_block(layout_block, label)

        page = LayoutPage(blocks=[body_block, ref_block])
        layout_document = LayoutDocument(pages=[page])
        semantic_document = (
            fulltext_processor.get_semantic_document_for_layout_document(
                layout_document=layout_document)
        )
        LOGGER.debug('semantic_document: %s', semantic_document)
        assert semantic_document is not None

        reference_lists = list(
            semantic_document.back_section.iter_by_type(SemanticReferenceList))
        assert len(reference_lists) == 1
        parsed_references = list(
            reference_lists[0].iter_by_type(SemanticReference))
        assert len(parsed_references) == 1
        parsed_ref = parsed_references[0]
        assert parsed_ref.get_text_by_type(
            SemanticTitle) == ref_title_block.text
        assert parsed_ref.get_text_by_type(
            SemanticLabel) == ref_label_block.text
        assert parsed_ref.get_text_by_type(
            SemanticRawReferenceText) == ref_text_block.text
        assert parsed_ref.content_id == 'b0'

        citations = list(
            semantic_document.iter_by_type_recursively(
                SemanticReferenceCitation))
        assert len(citations) == 1
        assert citations[0].target_content_id == 'b0'
예제 #8
0
 def test_should_set_title_and_abstract(self):
     """Verify that title and abstract blocks end up in the front matter.

     A '<title>' and an '<abstract>' block are passed through the header
     extractor. The resulting content, wrapped in a `SemanticFront`, must
     return `TITLE_1` and `ABSTRACT_1` for the respective semantic types.
     """
     extractor = HeaderSemanticExtractor()
     entity_blocks = [
         ('<title>', LayoutBlock.for_text(TITLE_1)),
         ('<abstract>', LayoutBlock.for_text(ABSTRACT_1))
     ]
     front = SemanticFront(list(
         extractor.iter_semantic_content_for_entity_blocks(entity_blocks)))
     LOGGER.debug('front: %s', front)
     title_text = front.get_text_by_type(SemanticTitle)
     assert title_text == TITLE_1
     abstract_text = front.get_text_by_type(SemanticAbstract)
     assert abstract_text == ABSTRACT_1
예제 #9
0
 def test_should_not_strip_dot_from_label(self):
     """Verify that the heading label keeps its trailing dot.

     A heading with the label '1.' and a title is converted to a TEI
     element. The `n` attribute must be '1.', the text content must be the
     title only and the element must have no child elements.
     """
     label = SemanticLabel(layout_block=LayoutBlock.for_text('1.'))
     title = SemanticTitle(
         layout_block=LayoutBlock.for_text('Section Title 1'))
     head_element = get_tei_child_element_for_semantic_content(
         SemanticHeading([label, title]))
     LOGGER.debug('tei_head: %r', etree.tostring(head_element))
     assert head_element.attrib.get('n') == '1.'
     assert get_text_content(head_element) == 'Section Title 1'
     child_elements = list(head_element)
     assert not child_elements
예제 #10
0
 def test_should_extract_single_figure(self):
     """Verify that a label and a description block form one figure.

     The figure extractor receives a '<label>' and a '<figDesc>' block and
     must yield exactly one `SemanticFigure` exposing both texts as label
     and caption.
     """
     extractor = FigureSemanticExtractor()
     entity_blocks = [
         ('<label>', LayoutBlock.for_text('Figure 1')),
         ('<figDesc>', LayoutBlock.for_text('Caption 1'))
     ]
     extracted = list(
         extractor.iter_semantic_content_for_entity_blocks(entity_blocks))
     assert len(extracted) == 1
     extracted_figure = extracted[0]
     assert isinstance(extracted_figure, SemanticFigure)
     label_text = extracted_figure.view_by_type(SemanticLabel).get_text()
     assert label_text == 'Figure 1'
     caption_text = extracted_figure.view_by_type(
         SemanticCaption).get_text()
     assert caption_text == 'Caption 1'
예제 #11
0
 def test_should_be_able_to_extract_single_editor(self):
     """Verify that the name extractor can produce a `SemanticEditor`.

     A forename and a surname block are extracted with
     `name_type=SemanticEditor`. The single result must be an editor with
     the given name 'John' and the surname 'Smith'.
     """
     extractor = NameSemanticExtractor()
     entity_blocks = [
         ('<forename>', LayoutBlock.for_text('John')),
         ('<surname>', LayoutBlock.for_text('Smith'))
     ]
     extracted = list(
         extractor.iter_semantic_content_for_entity_blocks(
             entity_blocks, name_type=SemanticEditor))
     assert len(extracted) == 1
     editor = extracted[0]
     assert isinstance(editor, SemanticEditor)
     assert editor.given_name_text == 'John'
     assert editor.surname_text == 'Smith'
예제 #12
0
 def test_should_render_graphic_element(self):
     """Verify that a figure with a graphic renders a `tei:graphic`.

     The figure holds a label, a caption and a graphic referring to the
     local file 'image1.png'. The wrapped TEI element must return a
     non-empty result for the `tei:graphic` xpath below the figure.
     """
     figure_content = [
         SemanticLabel(layout_block=LayoutBlock.for_text('Label 1')),
         SemanticCaption(layout_block=LayoutBlock.for_text('Caption 1')),
         SemanticGraphic(
             layout_graphic=LayoutGraphic(local_file_path='image1.png'))
     ]
     semantic_figure = SemanticFigure(figure_content, content_id='fig_0')
     wrapped_figure = _get_wrapped_figure_tei_element(semantic_figure)
     graphic_text_list = wrapped_figure.get_xpath_text_content_list(
         f'{FIGURE_XPATH}/tei:graphic')
     assert graphic_text_list
예제 #13
0
 def test_should_remove_trailing_dot_from_country(self):
     """Verify that the trailing dot of a country is dropped.

     The country block contains 'Country1.' while the extracted
     affiliation is expected to report 'Country1'.
     """
     extractor = AffiliationAddressSemanticExtractor()
     entity_blocks = [
         ('<marker>', LayoutBlock.for_text('1')),
         ('<country>', LayoutBlock.for_text('Country1.'))
     ]
     extracted = list(
         extractor.iter_semantic_content_for_entity_blocks(entity_blocks))
     assert len(extracted) == 1
     affiliation = extracted[0]
     assert isinstance(affiliation, SemanticAffiliationAddress)
     marker_text = affiliation.view_by_type(SemanticMarker).get_text()
     assert marker_text == '1'
     country_text = affiliation.view_by_type(SemanticCountry).get_text()
     assert country_text == 'Country1'
예제 #14
0
 def test_should_extract_single_raw_reference(self):
     """Verify that a label and a reference block form one raw reference.

     The reference segmenter extractor must yield a single
     `SemanticRawReference` exposing the label '1' and the raw reference
     text 'Reference 1'.
     """
     extractor = ReferenceSegmenterSemanticExtractor()
     entity_blocks = [
         ('<label>', LayoutBlock.for_text('1')),
         ('<reference>', LayoutBlock.for_text('Reference 1'))
     ]
     extracted = list(
         extractor.iter_semantic_content_for_entity_blocks(entity_blocks))
     assert len(extracted) == 1
     raw_reference = extracted[0]
     assert isinstance(raw_reference, SemanticRawReference)
     label_text = raw_reference.view_by_type(SemanticLabel).get_text()
     assert label_text == '1'
     raw_text = raw_reference.view_by_type(
         SemanticRawReferenceText).get_text()
     assert raw_text == 'Reference 1'
예제 #15
0
 def test_should_add_paragraphs_without_title(self):
     """Verify that paragraphs without a section title share one section.

     Two '<paragraph>' blocks are extracted. The single resulting
     `SemanticSection` must list both paragraph texts in input order.
     """
     extractor = FullTextSemanticExtractor()
     entity_blocks = [
         ('<paragraph>', LayoutBlock.for_text(SECTION_PARAGRAPH_1)),
         ('<paragraph>', LayoutBlock.for_text(SECTION_PARAGRAPH_2))
     ]
     extracted = list(
         extractor.iter_semantic_content_for_entity_blocks(entity_blocks))
     assert len(extracted) == 1
     only_section = extracted[0]
     assert isinstance(only_section, SemanticSection)
     expected_paragraphs = [SECTION_PARAGRAPH_1, SECTION_PARAGRAPH_2]
     assert only_section.get_paragraph_text_list() == expected_paragraphs
예제 #16
0
 def test_should_raw_table_for_table_text_to_section(self):
     """Verify that '<table>' text is added to the section as a raw table.

     A paragraph block followed by a table block must produce one section
     whose paragraph list only contains the paragraph text, while the
     table text is available via `SemanticRawTable`.
     """
     extractor = FullTextSemanticExtractor()
     entity_blocks = [
         ('<paragraph>', LayoutBlock.for_text(SECTION_PARAGRAPH_1)),
         ('<table>', LayoutBlock.for_text(SECTION_PARAGRAPH_2))
     ]
     extracted = list(
         extractor.iter_semantic_content_for_entity_blocks(entity_blocks))
     assert len(extracted) == 1
     only_section = extracted[0]
     assert isinstance(only_section, SemanticSection)
     assert only_section.get_paragraph_text_list() == [SECTION_PARAGRAPH_1]
     raw_table_text = only_section.get_text_by_type(SemanticRawTable)
     assert raw_table_text == SECTION_PARAGRAPH_2
예제 #17
0
 def test_should_add_all_fields(self):
     """Verify that every affiliation address field is rendered to TEI.

     An affiliation address with marker, institution, department,
     laboratory, address line, post code, post box, region, settlement and
     country is converted to a TEI affiliation element. The raw affiliation
     note, its label, the `tei:orgName` elements and the `tei:address`
     children must carry the respective texts.
     """
     aff_address = SemanticAffiliationAddress([
         SemanticMarker(layout_block=LayoutBlock.for_text('1')),
         SemanticInstitution(
             layout_block=LayoutBlock.for_text('Institution1')),
         SemanticDepartment(
             layout_block=LayoutBlock.for_text('Department1')),
         SemanticLaboratory(layout_block=LayoutBlock.for_text('Lab1')),
         SemanticAddressLine(
             layout_block=LayoutBlock.for_text('AddressLine1')),
         SemanticPostCode(layout_block=LayoutBlock.for_text('PostCode1')),
         SemanticPostBox(layout_block=LayoutBlock.for_text('PostBox1')),
         SemanticRegion(layout_block=LayoutBlock.for_text('Region1')),
         SemanticSettlement(
             layout_block=LayoutBlock.for_text('Settlement1')),
         SemanticCountry(layout_block=LayoutBlock.for_text('Country1')),
     ])
     aff_element = get_tei_affiliation_for_semantic_affiliation_address_element(
         aff_address,
         context=DEFAULT_TEI_ELEMENT_FACTORY_CONTEXT
     )
     tei_aff = TeiElementWrapper(aff_element)
     LOGGER.debug('tei_aff: %r', etree.tostring(tei_aff.element))
     # The raw affiliation note is compared with the complete source text.
     assert tei_aff.get_xpath_text_content_list(
         'tei:note[@type="raw_affiliation"]'
     ) == [aff_address.get_text()]
     expected_text_by_xpath = [
         ('tei:note[@type="raw_affiliation"]/tei:label', ['1']),
         ('tei:orgName[@type="institution"]', ['Institution1']),
         ('tei:orgName[@type="department"]', ['Department1']),
         ('tei:orgName[@type="laboratory"]', ['Lab1']),
         ('tei:address/tei:addrLine', ['AddressLine1']),
         ('tei:address/tei:postCode', ['PostCode1']),
         ('tei:address/tei:postBox', ['PostBox1']),
         ('tei:address/tei:region', ['Region1']),
         ('tei:address/tei:settlement', ['Settlement1']),
         ('tei:address/tei:country', ['Country1'])
     ]
     for xpath, expected_text_list in expected_text_by_xpath:
         assert tei_aff.get_xpath_text_content_list(
             xpath) == expected_text_list
예제 #18
0
 def test_should_create_back_section(self):
     """Verify that a back section is rendered below the annex div.

     A new section with a heading (`TOKEN_1`) and one paragraph (`TOKEN_2`)
     is added to the back section. Both texts must appear as `tei:head`
     and `tei:p` of a `tei:div` nested in the annex `tei:div`.
     """
     document = SemanticDocument()
     back_section = document.back_section.add_new_section()
     back_section.add_heading_block(LayoutBlock.for_text(TOKEN_1))
     new_paragraph = back_section.add_new_paragraph()
     new_paragraph.add_block_content(LayoutBlock.for_text(TOKEN_2))
     tei_document = get_tei_for_semantic_document(document)
     LOGGER.debug('tei xml: %r', etree.tostring(tei_document.root))
     head_text_list = tei_document.get_xpath_text_content_list(
         '//tei:back/tei:div[@type="annex"]/tei:div/tei:head')
     assert head_text_list == [TOKEN_1]
     paragraph_text_list = tei_document.get_xpath_text_content_list(
         '//tei:back/tei:div[@type="annex"]/tei:div/tei:p')
     assert paragraph_text_list == [TOKEN_2]
예제 #19
0
 def test_should_ignore_additional_title_and_abstract(self):
     """Verify that a second title and abstract do not change the result.

     After the first '<title>' and '<abstract>' blocks, another pair with
     the text 'other' follows. The front matter must still report only
     `TITLE_1` and `ABSTRACT_1`.
     """
     # Note: this behaviour should be reviewed
     extractor = HeaderSemanticExtractor()
     entity_blocks = [
         ('<title>', LayoutBlock.for_text(TITLE_1)),
         ('<abstract>', LayoutBlock.for_text(ABSTRACT_1)),
         ('<title>', LayoutBlock.for_text('other')),
         ('<abstract>', LayoutBlock.for_text('other'))
     ]
     front = SemanticFront(list(
         extractor.iter_semantic_content_for_entity_blocks(entity_blocks)))
     LOGGER.debug('front: %s', front)
     title_text = front.get_text_by_type(SemanticTitle)
     assert title_text == TITLE_1
     abstract_text = front.get_text_by_type(SemanticAbstract)
     assert abstract_text == ABSTRACT_1
예제 #20
0
 def test_should_add_note_for_other_text_to_body(self):
     """Verify that 'O' labelled text becomes a 'fulltext:other' note.

     The extracted content is wrapped in a parent section. The text of the
     'O' block must be returned as a note of type 'fulltext:other', and the
     paragraph must be found in the single child section.
     """
     extractor = FullTextSemanticExtractor()
     entity_blocks = [
         ('O', LayoutBlock.for_text(SECTION_PARAGRAPH_1)),
         ('<paragraph>', LayoutBlock.for_text(SECTION_PARAGRAPH_2))
     ]
     parent_section = SemanticSection(list(
         extractor.iter_semantic_content_for_entity_blocks(entity_blocks)))
     other_notes = parent_section.get_notes_text_list('fulltext:other')
     assert other_notes == [SECTION_PARAGRAPH_1]
     child_sections = parent_section.sections
     assert len(child_sections) == 1
     child_paragraphs = child_sections[0].get_paragraph_text_list()
     assert child_paragraphs == [SECTION_PARAGRAPH_2]
예제 #21
0
    def test_should_extract_table_label_caption_from_body(  # pylint: disable=too-many-locals
            self, fulltext_models_mock: MockFullTextModels,
            segmentation_label: str):
        """Verify that table label, caption and citation are extracted.

        The layout is a single block made of a table citation ('Table 1')
        followed by the table itself (label, other text, caption). The whole
        block is labelled with the provided `segmentation_label`, and the
        mocked fulltext and table models label the parts. With
        `extract_table_fields` enabled, one table with label, caption and
        content id 'tab_0' is expected in body or back section, and the
        single table citation is expected to target 'tab_0'.
        """
        table_citation_block = LayoutBlock.for_text('Table 1')
        table_label_block = LayoutBlock.for_text('Table 1')
        table_caption_block = LayoutBlock.for_text('Caption 1')
        unlabelled_block = LayoutBlock.for_text('Other')
        table_block = LayoutBlock.merge_blocks(
            [table_label_block, unlabelled_block, table_caption_block])
        fulltext_block = LayoutBlock.merge_blocks(
            [table_citation_block, table_block])
        config = FullTextProcessorConfig(extract_table_fields=True)
        fulltext_processor = FullTextProcessor(fulltext_models_mock, config)

        segmentation_mock = fulltext_models_mock.segmentation_model_mock
        fulltext_mock = fulltext_models_mock.fulltext_model_mock
        table_mock = fulltext_models_mock.table_model_mock

        # (model mock, layout block, label) applied in this exact order
        label_assignments = [
            (segmentation_mock, fulltext_block, segmentation_label),
            (fulltext_mock, table_citation_block, '<table_marker>'),
            (fulltext_mock, table_block, '<table>'),
            (table_mock, table_label_block, '<label>'),
            (table_mock, table_caption_block, '<figDesc>')
        ]
        for model_mock, layout_block, label in label_assignments:
            model_mock.update_label_by_layout_block(layout_block, label)

        page = LayoutPage(blocks=[fulltext_block])
        layout_document = LayoutDocument(pages=[page])
        semantic_document = (
            fulltext_processor.get_semantic_document_for_layout_document(
                layout_document=layout_document)
        )
        LOGGER.debug('semantic_document: %s', semantic_document)
        assert semantic_document is not None

        searched_sections = [
            semantic_document.body_section, semantic_document.back_section
        ]
        tables = list(
            iter_by_semantic_type_recursively(
                searched_sections, SemanticTable))
        assert len(tables) == 1
        extracted_table = tables[0]
        assert extracted_table.get_text_by_type(
            SemanticLabel) == table_label_block.text
        assert extracted_table.get_text_by_type(
            SemanticCaption) == table_caption_block.text
        assert extracted_table.content_id == 'tab_0'

        table_citations = list(
            semantic_document.iter_by_type_recursively(SemanticTableCitation))
        assert len(table_citations) == 1
        only_citation = table_citations[0]
        assert only_citation.get_text() == table_citation_block.text
        assert only_citation.target_content_id == 'tab_0'
예제 #22
0
 def test_should_split_raw_affiliation_on_new_aff_without_address(self):
     """Verify that consecutive affiliations are kept separate.

     Two '<affiliation>' blocks without any address in between must lead
     to two raw affiliation addresses whose raw affiliation texts are
     `AFFILIATION_1` and `AFFILIATION_2`, in that order.
     """
     extractor = HeaderSemanticExtractor()
     entity_blocks = [
         ('<affiliation>', LayoutBlock.for_text(AFFILIATION_1)),
         ('<affiliation>', LayoutBlock.for_text(AFFILIATION_2))
     ]
     front = SemanticFront(list(
         extractor.iter_semantic_content_for_entity_blocks(entity_blocks)))
     LOGGER.debug('front: %s', front)
     raw_aff_addresses = list(
         front.iter_by_type(SemanticRawAffiliationAddress))
     raw_affiliation_texts = []
     for raw_aff_address in raw_aff_addresses:
         raw_affiliation_texts.append(
             raw_aff_address.get_text_by_type(SemanticRawAffiliation))
     assert raw_affiliation_texts == [AFFILIATION_1, AFFILIATION_2]
예제 #23
0
 def test_should_add_separate_section_label(self):
     """Verify that a leading section number is split off as a label.

     The '<section>' block text is '1 ' followed by `SECTION_TITLE_1`. The
     single heading of the resulting section must report '1' as label and
     `SECTION_TITLE_1` as title.
     """
     extractor = FullTextSemanticExtractor()
     entity_blocks = [
         ('<section>', LayoutBlock.for_text('1 ' + SECTION_TITLE_1)),
         ('<paragraph>', LayoutBlock.for_text(SECTION_PARAGRAPH_1))
     ]
     extracted = list(
         extractor.iter_semantic_content_for_entity_blocks(entity_blocks))
     assert len(extracted) == 1
     only_section = extracted[0]
     assert isinstance(only_section, SemanticSection)
     headings = list(only_section.iter_by_type(SemanticHeading))
     assert len(headings) == 1
     heading = headings[0]
     assert heading.get_text_by_type(SemanticLabel) == '1'
     assert heading.get_text_by_type(SemanticTitle) == SECTION_TITLE_1
예제 #24
0
    def test_should_extract_editor_names_from_references_fields(  # pylint: disable=too-many-locals
            self, fulltext_models_mock: MockFullTextModels):
        """Verify that editor names inside a reference are extracted.

        The references block consists of a single editors block (given
        name, other text, surname). The mocked segmentation, reference
        segmenter, citation and name citation models label the parts. With
        citation fields and citation editors enabled (citation authors
        disabled), the single reference is expected to contain one
        `SemanticEditor` with the given name and surname texts.
        """
        forename_block = LayoutBlock.for_text('Given name')
        family_name_block = LayoutBlock.for_text('Surname')
        unlabelled_block = LayoutBlock.for_text('Other')
        editors_block = LayoutBlock.merge_blocks(
            [forename_block, unlabelled_block, family_name_block])
        ref_text_block = LayoutBlock.merge_blocks([editors_block])
        ref_block = LayoutBlock.merge_blocks([ref_text_block])
        config = FullTextProcessorConfig(
            extract_citation_fields=True,
            extract_citation_authors=False,
            extract_citation_editors=True)
        fulltext_processor = FullTextProcessor(fulltext_models_mock, config)

        segmentation_mock = fulltext_models_mock.segmentation_model_mock
        reference_segmenter_mock = (
            fulltext_models_mock.reference_segmenter_model_mock
        )
        citation_mock = fulltext_models_mock.citation_model_mock
        name_citation_mock = fulltext_models_mock.name_citation_model_mock

        # (model mock, layout block, label) applied in this exact order
        label_assignments = [
            (segmentation_mock, ref_block, '<references>'),
            (reference_segmenter_mock, ref_text_block, '<reference>'),
            (citation_mock, editors_block, '<editor>'),
            (name_citation_mock, forename_block, '<forename>'),
            (name_citation_mock, family_name_block, '<surname>')
        ]
        for model_mock, layout_block, label in label_assignments:
            model_mock.update_label_by_layout_block(layout_block, label)

        page = LayoutPage(blocks=[ref_block])
        layout_document = LayoutDocument(pages=[page])
        semantic_document = (
            fulltext_processor.get_semantic_document_for_layout_document(
                layout_document=layout_document)
        )
        LOGGER.debug('semantic_document: %s', semantic_document)
        assert semantic_document is not None

        reference_lists = list(
            semantic_document.back_section.iter_by_type(SemanticReferenceList))
        assert len(reference_lists) == 1
        parsed_references = list(
            reference_lists[0].iter_by_type(SemanticReference))
        assert len(parsed_references) == 1
        parsed_editors = list(
            parsed_references[0].iter_by_type(SemanticEditor))
        assert len(parsed_editors) == 1
        only_editor = parsed_editors[0]
        assert only_editor.given_name_text == forename_block.text
        assert only_editor.surname_text == family_name_block.text
예제 #25
0
    def test_should_extract_affiliation_address_from_document(  # pylint: disable=too-many-locals
            self, fulltext_models_mock: MockFullTextModels):
        """Verify that an affiliation address is extracted from the header.

        The header block is an affiliation (marker and institution)
        followed by an address (country). The mocked segmentation, header
        and affiliation address models label the parts. The front matter is
        expected to contain one `SemanticAffiliationAddress` with the
        marker, institution and country texts and the content id 'aff0'.
        """
        aff_marker_block = LayoutBlock.for_text('1')
        aff_institution_block = LayoutBlock.for_text('Institution1')
        aff_country_block = LayoutBlock.for_text('Country1')
        aff_block = LayoutBlock.merge_blocks(
            [aff_marker_block, aff_institution_block])
        address_block = LayoutBlock.merge_blocks([aff_country_block])
        aff_address_block = LayoutBlock.merge_blocks(
            [aff_block, address_block])
        fulltext_processor = FullTextProcessor(fulltext_models_mock)
        # The whole header consists of just the affiliation address.
        header_block = aff_address_block

        segmentation_mock = fulltext_models_mock.segmentation_model_mock
        header_mock = fulltext_models_mock.header_model_mock
        aff_address_mock = fulltext_models_mock.affiliation_address_model_mock

        # (model mock, layout block, label) applied in this exact order
        label_assignments = [
            (segmentation_mock, header_block, '<header>'),
            (header_mock, aff_block, '<affiliation>'),
            (header_mock, address_block, '<address>'),
            (aff_address_mock, aff_marker_block, '<marker>'),
            (aff_address_mock, aff_institution_block, '<institution>'),
            (aff_address_mock, aff_country_block, '<country>')
        ]
        for model_mock, layout_block, label in label_assignments:
            model_mock.update_label_by_layout_block(layout_block, label)

        page = LayoutPage(blocks=[header_block])
        layout_document = LayoutDocument(pages=[page])
        semantic_document = (
            fulltext_processor.get_semantic_document_for_layout_document(
                layout_document=layout_document)
        )
        assert semantic_document is not None
        front = semantic_document.front
        assert front.get_text() == aff_address_block.text
        aff_view_text = front.view_by_type(
            SemanticAffiliationAddress).get_text()
        assert aff_view_text == aff_address_block.text

        parsed_affiliations = list(
            semantic_document.front.iter_by_type(SemanticAffiliationAddress))
        assert len(parsed_affiliations) == 1
        only_affiliation = parsed_affiliations[0]
        assert only_affiliation.get_text_by_type(
            SemanticMarker) == aff_marker_block.text
        assert only_affiliation.get_text_by_type(
            SemanticInstitution) == aff_institution_block.text
        assert only_affiliation.get_text_by_type(
            SemanticCountry) == aff_country_block.text
        assert only_affiliation.content_id == 'aff0'
예제 #26
0
 def test_should_add_raw_affiliation_address(self):
     """Verify that affiliation and address are combined into one item.

     An '<affiliation>' block followed by an '<address>' block must lead
     to a single raw affiliation address exposing `AFFILIATION_1` as raw
     affiliation and `ADDRESS_1` as raw address.
     """
     extractor = HeaderSemanticExtractor()
     entity_blocks = [
         ('<affiliation>', LayoutBlock.for_text(AFFILIATION_1)),
         ('<address>', LayoutBlock.for_text(ADDRESS_1))
     ]
     front = SemanticFront(list(
         extractor.iter_semantic_content_for_entity_blocks(entity_blocks)))
     LOGGER.debug('front: %s', front)
     raw_aff_addresses = list(
         front.iter_by_type(SemanticRawAffiliationAddress))
     assert len(raw_aff_addresses) == 1
     raw_aff_address = raw_aff_addresses[0]
     raw_affiliation_text = raw_aff_address.get_text_by_type(
         SemanticRawAffiliation)
     assert raw_affiliation_text == AFFILIATION_1
     raw_address_text = raw_aff_address.get_text_by_type(SemanticRawAddress)
     assert raw_address_text == ADDRESS_1
예제 #27
0
 def test_should_include_reference_citation_in_paragraph(self):
     """Verify that a citation marker between paragraphs is extracted.

     A '<citation_marker>' block with the text 'Ref 1' sits between two
     '<paragraph>' blocks. The single resulting section must contain
     exactly one `SemanticReferenceCitation` with that text.
     """
     extractor = FullTextSemanticExtractor()
     entity_blocks = [
         ('<paragraph>', LayoutBlock.for_text(SECTION_PARAGRAPH_1)),
         ('<citation_marker>', LayoutBlock.for_text('Ref 1')),
         ('<paragraph>', LayoutBlock.for_text(SECTION_PARAGRAPH_2)),
     ]
     semantic_content_list = list(
         extractor.iter_semantic_content_for_entity_blocks(entity_blocks))
     LOGGER.debug('semantic_content_list: %s', semantic_content_list)
     assert len(semantic_content_list) == 1
     only_section = semantic_content_list[0]
     assert isinstance(only_section, SemanticSection)
     citations = list(
         only_section.iter_by_type_recursively(SemanticReferenceCitation))
     assert len(citations) == 1
     citation_text = citations[0].get_text()
     assert citation_text == 'Ref 1'
예제 #28
0
 def test_should_extract_preceeding_other_text(self):
     """Verify that 'O' text before an affiliation becomes a note.

     An 'O' block precedes the marker and institution blocks. Two items
     are expected: first a `SemanticNote`, then the
     `SemanticAffiliationAddress` with marker '1' and institution
     'Institution 1'.
     """
     extractor = AffiliationAddressSemanticExtractor()
     entity_blocks = [
         ('O', LayoutBlock.for_text('Other 1')),
         ('<marker>', LayoutBlock.for_text('1')),
         ('<institution>', LayoutBlock.for_text('Institution 1')),
     ]
     extracted = list(
         extractor.iter_semantic_content_for_entity_blocks(entity_blocks))
     assert len(extracted) == 2
     preceding_note = extracted[0]
     assert isinstance(preceding_note, SemanticNote)
     affiliation = extracted[1]
     assert isinstance(affiliation, SemanticAffiliationAddress)
     marker_text = affiliation.view_by_type(SemanticMarker).get_text()
     assert marker_text == '1'
     institution_text = affiliation.view_by_type(
         SemanticInstitution).get_text()
     assert institution_text == 'Institution 1'
예제 #29
0
 def test_should_reject_reference_without_any_detected_fields(self):
     """Verify that a reference labelled only with 'O' is marked invalid.

     The citation extractor receives the raw reference text as a single
     'O' block together with the raw reference (content id 'raw1'). The
     single result must be a `SemanticInvalidReference` with the same text
     as the raw reference text.
     """
     raw_ref_text = SemanticRawReferenceText(
         layout_block=LayoutBlock.for_text('Reference 1'))
     raw_ref = SemanticRawReference([raw_ref_text], content_id='raw1')
     extractor = CitationSemanticExtractor()
     entity_blocks = [
         ('O', LayoutBlock.for_text(raw_ref_text.get_text())),
     ]
     extracted = list(
         extractor.iter_semantic_content_for_entity_blocks(
             entity_blocks,
             semantic_raw_reference=raw_ref))
     assert len(extracted) == 1
     rejected_ref = extracted[0]
     assert isinstance(rejected_ref, SemanticInvalidReference)
     assert rejected_ref.get_text() == raw_ref_text.get_text()
예제 #30
0
 def test_should_render_label_description_and_id(self):
     """Verify that label, caption and id of a figure are rendered.

     For a figure with label 'Label 1', caption 'Caption 1' and content id
     'fig_0', the wrapped TEI element must expose the label as both
     `tei:head` and `tei:label`, the caption as `tei:figDesc` and the id
     as `xml:id`.
     """
     figure_content = [
         SemanticLabel(layout_block=LayoutBlock.for_text('Label 1')),
         SemanticCaption(layout_block=LayoutBlock.for_text('Caption 1'))
     ]
     semantic_figure = SemanticFigure(figure_content, content_id='fig_0')
     wrapped_figure = _get_wrapped_figure_tei_element(semantic_figure)
     expected_text_by_relative_xpath = [
         ('tei:head', ['Label 1']),
         ('tei:label', ['Label 1']),
         ('tei:figDesc', ['Caption 1']),
         ('@xml:id', ['fig_0'])
     ]
     for relative_xpath, expected_text_list in expected_text_by_relative_xpath:
         assert wrapped_figure.get_xpath_text_content_list(
             f'{FIGURE_XPATH}/{relative_xpath}') == expected_text_list