def test_should_add_section_figures_to_back(self):
     """A figure nested in a back section is rendered under the annex div.

     Checks head, label, figDesc and xml:id of the generated figure element
     and that no nested ``div`` appears inside the annex div.
     """
     document = SemanticDocument()
     figure = SemanticFigure(
         [
             SemanticLabel(layout_block=LayoutBlock.for_text('Label 1')),
             SemanticCaption(layout_block=LayoutBlock.for_text('Caption 1')),
         ],
         content_id='fig_0')
     document.back_section.add_content(SemanticSection([figure]))
     tei = get_tei_for_semantic_document(document)
     LOGGER.debug('tei xml: %r', etree.tostring(tei.root))
     # Select figures only; elements whose type contains "table" are excluded.
     figure_xpath = (
         '//tei:back/tei:div[@type="annex"]/tei:figure[not(contains(@type, "table"))]'
     )
     head_texts = tei.get_xpath_text_content_list(f'{figure_xpath}/tei:head')
     assert head_texts == ['Label 1']
     label_texts = tei.get_xpath_text_content_list(f'{figure_xpath}/tei:label')
     assert label_texts == ['Label 1']
     caption_texts = tei.get_xpath_text_content_list(
         f'{figure_xpath}/tei:figDesc')
     assert caption_texts == ['Caption 1']
     figure_ids = tei.get_xpath_text_content_list(f'{figure_xpath}/@xml:id')
     assert figure_ids == ['fig_0']
     nested_divs = tei.xpath('//tei:back/tei:div[@type="annex"]/tei:div')
     assert not nested_divs
 def test_should_add_single_author(self):
     """An author with all name parts is rendered with the matching TEI name elements."""
     document = SemanticDocument()
     name_parts = [
         SemanticNameTitle(layout_block=LayoutBlock.for_text('Title1')),
         SemanticGivenName(layout_block=LayoutBlock.for_text('Given1')),
         SemanticMiddleName(layout_block=LayoutBlock.for_text('Middle1')),
         SemanticSurname(layout_block=LayoutBlock.for_text('Surname1')),
         SemanticNameSuffix(layout_block=LayoutBlock.for_text('Suffix1')),
     ]
     document.front.add_content(SemanticAuthor(name_parts))
     tei = get_tei_for_semantic_document(document)
     LOGGER.debug('tei xml: %r', etree.tostring(tei.root))
     role_names = tei.get_xpath_text_content_list('//tei:author//tei:roleName')
     assert role_names == ['Title1']
     first_names = tei.get_xpath_text_content_list(
         '//tei:author//tei:forename[@type="first"]')
     assert first_names == ['Given1']
     middle_names = tei.get_xpath_text_content_list(
         '//tei:author//tei:forename[@type="middle"]')
     assert middle_names == ['Middle1']
     surnames = tei.get_xpath_text_content_list('//tei:author//tei:surname')
     assert surnames == ['Surname1']
     suffixes = tei.get_xpath_text_content_list('//tei:author//tei:genName')
     assert suffixes == ['Suffix1']
 def test_should_add_parsed_references(self):
     """A reference list in the back section is rendered as a listBibl.

     Checks the list heading, the reference title, the raw reference note
     and the xml:id of the generated biblStruct.
     """
     document = SemanticDocument()
     reference = SemanticReference([
         SemanticTitle(layout_block=LayoutBlock.for_text('Reference Title 1')),
         SemanticRawReferenceText(layout_block=LayoutBlock.for_text('Reference 1')),
     ])
     # The id is assigned after construction, as in the original test.
     reference.content_id = 'b0'
     heading = SemanticHeading(layout_block=LayoutBlock.for_text('References'))
     document.back_section.add_content(SemanticReferenceList([heading, reference]))
     tei = get_tei_for_semantic_document(document)
     LOGGER.debug('tei xml: %r', etree.tostring(tei.root))
     heading_texts = tei.get_xpath_text_content_list(
         '//tei:back/tei:div[@type="references"]/tei:listBibl/tei:head'
     )
     assert heading_texts == ['References']
     title_texts = tei.get_xpath_text_content_list(
         '//tei:back/tei:div[@type="references"]/tei:listBibl'
         '/tei:biblStruct/tei:analytic/tei:title[@type="main"]')
     assert title_texts == ['Reference Title 1']
     raw_reference_texts = tei.get_xpath_text_content_list(
         '//tei:back/tei:div[@type="references"]/tei:listBibl'
         '/tei:biblStruct/tei:note[@type="raw_reference"]')
     assert raw_reference_texts == ['Reference 1']
     reference_ids = tei.get_xpath_text_content_list(
         '//tei:back/tei:div[@type="references"]/tei:listBibl'
         '/tei:biblStruct/@xml:id')
     assert reference_ids == ['b0']
Exemplo n.º 4
0
 def test_should_provide_page_and_block_status_for_single_token_blocks(
         self, features_provider: SegmentationLineFeaturesProvider):
     """Three single-token blocks on one page yield start/in/end page status.

     Every line is expected to report ``BLOCKSTART`` as its block status.
     """
     page = LayoutPage(blocks=[
         LayoutBlock.for_text('line1'),
         LayoutBlock.for_text('line2'),
         LayoutBlock.for_text('line3'),
     ])
     layout_document = LayoutDocument(pages=[page])
     feature_values = [
         {
             'page_status': features.get_page_status(),
             'block_status': features.get_block_status(),
         }
         for features in _iter_line_features(features_provider, layout_document)
     ]
     LOGGER.debug('feature_values: %r', feature_values)
     expected_feature_values = [
         {'page_status': 'PAGESTART', 'block_status': 'BLOCKSTART'},
         {'page_status': 'PAGEIN', 'block_status': 'BLOCKSTART'},
         {'page_status': 'PAGEEND', 'block_status': 'BLOCKSTART'},
     ]
     assert feature_values == expected_feature_values
Exemplo n.º 5
0
def convert_two_letter_uppercase_given_name_to_given_middle_name(
        name: T_SemanticName):
    """Split a two-character uppercase given name into given and middle name.

    The name is left untouched (only a debug message is logged) when it
    already contains a middle name, when it does not contain exactly one
    given name, or when the given name text is not exactly two characters
    for which ``str.isupper()`` is true.

    Otherwise the blocks of the given name are retokenized using
    ``tokenize_individual_characters``. The first resulting token becomes the
    new ``SemanticGivenName`` and every following token becomes a
    ``SemanticMiddleName``. The replacement is applied to ``name`` in place.

    Args:
        name: the semantic name to modify in place.

    Returns:
        None.
    """
    existing_middle_names = list(name.iter_by_type(SemanticMiddleName))
    existing_given_names = list(name.iter_by_type(SemanticGivenName))
    if existing_middle_names:
        LOGGER.debug('already has a middle name: %r', existing_middle_names)
        return
    if len(existing_given_names) != 1:
        LOGGER.debug('no or too many given names: %r', existing_given_names)
        return
    only_given_name = existing_given_names[0]
    given_name_text = only_given_name.get_text()
    # NOTE(review): str.isupper() only looks at cased characters, so a value
    # such as 'A.' also passes this check - confirm that this is intended.
    is_two_uppercase = len(given_name_text) == 2 and given_name_text.isupper()
    if not is_two_uppercase:
        LOGGER.debug('not two uppercase characters: %r', given_name_text)
        return
    retokenized_layout_document = LayoutDocument.for_blocks(
        list(only_given_name.iter_blocks())
    ).retokenize(tokenize_fn=tokenize_individual_characters)
    LOGGER.debug('retokenized_layout_document: %r',
                 retokenized_layout_document)
    split_name_parts = []
    for index, token in enumerate(
            retokenized_layout_document.iter_all_tokens()):
        # First token stays the given name, all others become middle names.
        part_class = SemanticGivenName if index == 0 else SemanticMiddleName
        split_name_parts.append(
            part_class(layout_block=LayoutBlock.for_tokens([token])))
    LOGGER.debug('split_name_parts: %r', split_name_parts)
    name.flat_map_inplace_by_type(SemanticGivenName,
                                  lambda _: split_name_parts)
Exemplo n.º 6
0
 def test_should_extract_single_affiliation_address(self):
     """All affiliation/address tags end up in a single SemanticAffiliationAddress."""
     entity_blocks = [
         ('<marker>', LayoutBlock.for_text('1')),
         ('<institution>', LayoutBlock.for_text('Institution 1')),
         ('<department>', LayoutBlock.for_text('Department 1')),
         ('<laboratory>', LayoutBlock.for_text('Laboratory 1')),
         ('<addrLine>', LayoutBlock.for_text('Address Line 1')),
         ('<postCode>', LayoutBlock.for_text('Post Code 1')),
         ('<postBox>', LayoutBlock.for_text('Post Box 1')),
         ('<region>', LayoutBlock.for_text('Region 1')),
         ('<settlement>', LayoutBlock.for_text('Settlement 1')),
         ('<country>', LayoutBlock.for_text('Country 1')),
     ]
     extractor = AffiliationAddressSemanticExtractor()
     semantic_content_list = list(
         extractor.iter_semantic_content_for_entity_blocks(entity_blocks))
     assert len(semantic_content_list) == 1
     aff_address = semantic_content_list[0]
     assert isinstance(aff_address, SemanticAffiliationAddress)
     # Checked in the same order as the entity blocks above.
     expected_text_by_type = [
         (SemanticMarker, '1'),
         (SemanticInstitution, 'Institution 1'),
         (SemanticDepartment, 'Department 1'),
         (SemanticLaboratory, 'Laboratory 1'),
         (SemanticAddressLine, 'Address Line 1'),
         (SemanticPostCode, 'Post Code 1'),
         (SemanticPostBox, 'Post Box 1'),
         (SemanticRegion, 'Region 1'),
         (SemanticSettlement, 'Settlement 1'),
         (SemanticCountry, 'Country 1'),
     ]
     for semantic_type, expected_text in expected_text_by_type:
         assert aff_address.view_by_type(
             semantic_type).get_text() == expected_text
 def test_should_add_asset_citation_for_resolved_reference(self):
     """A reference citation with a target id is rendered as a ``bibr`` ref.

     The ref is expected inside the paragraph, with ``#b0`` as its target.
     """
     paragraph = SemanticParagraph([
         SemanticTextContentWrapper(layout_block=LayoutBlock.for_text('See')),
         SemanticReferenceCitation(
             layout_block=LayoutBlock.for_text('Ref 1'),
             target_content_id='b0'),
     ])
     reference_list = SemanticReferenceList([
         SemanticReference(
             [SemanticLabel(layout_block=LayoutBlock.for_text('1'))],
             content_id='b0'),
     ])
     document = SemanticDocument()
     document.body_section.add_content(
         SemanticSection([paragraph, reference_list]))
     tei = get_tei_for_semantic_document(document)
     LOGGER.debug('tei xml: %r', etree.tostring(tei.root))
     paragraph_texts = tei.get_xpath_text_content_list(
         '//tei:body/tei:div/tei:p')
     assert paragraph_texts == ['See Ref 1']
     ref_texts = tei.get_xpath_text_content_list(
         '//tei:body/tei:div/tei:p/tei:ref[@type="bibr"]')
     assert ref_texts == ['Ref 1']
     ref_targets = tei.get_xpath_text_content_list(
         '//tei:body/tei:div/tei:p/tei:ref[@type="bibr"]/@target')
     assert ref_targets == ['#b0']
Exemplo n.º 8
0
 def test_should_set_title_and_abstract(self):
     """Title and abstract entity blocks are available from the resulting front."""
     entity_blocks = [
         ('<title>', LayoutBlock.for_text(TITLE_1)),
         ('<abstract>', LayoutBlock.for_text(ABSTRACT_1)),
     ]
     extractor = HeaderSemanticExtractor()
     front = SemanticFront(list(
         extractor.iter_semantic_content_for_entity_blocks(entity_blocks)))
     LOGGER.debug('front: %s', front)
     title_text = front.get_text_by_type(SemanticTitle)
     assert title_text == TITLE_1
     abstract_text = front.get_text_by_type(SemanticAbstract)
     assert abstract_text == ABSTRACT_1
Exemplo n.º 9
0
 def test_should_not_strip_dot_from_label(self):
     """The heading label keeps its trailing dot in the ``n`` attribute.

     The element text is the title only, and the head has no child elements.
     """
     label = SemanticLabel(layout_block=LayoutBlock.for_text('1.'))
     title = SemanticTitle(layout_block=LayoutBlock.for_text('Section Title 1'))
     head_element = get_tei_child_element_for_semantic_content(
         SemanticHeading([label, title]))
     LOGGER.debug('tei_head: %r', etree.tostring(head_element))
     assert head_element.attrib.get('n') == '1.'
     assert get_text_content(head_element) == 'Section Title 1'
     child_elements = list(head_element)
     assert not child_elements
Exemplo n.º 10
0
 def test_should_remove_trailing_dot_from_country(self):
     """The country text 'Country1.' is extracted without its trailing dot."""
     entity_blocks = [
         ('<marker>', LayoutBlock.for_text('1')),
         ('<country>', LayoutBlock.for_text('Country1.')),
     ]
     extractor = AffiliationAddressSemanticExtractor()
     extracted = list(
         extractor.iter_semantic_content_for_entity_blocks(entity_blocks))
     assert len(extracted) == 1
     affiliation = extracted[0]
     assert isinstance(affiliation, SemanticAffiliationAddress)
     marker_text = affiliation.view_by_type(SemanticMarker).get_text()
     assert marker_text == '1'
     country_text = affiliation.view_by_type(SemanticCountry).get_text()
     assert country_text == 'Country1'
Exemplo n.º 11
0
 def test_should_be_able_to_extract_single_editor(self):
     """Passing ``name_type=SemanticEditor`` yields a SemanticEditor instance."""
     entity_blocks = [
         ('<forename>', LayoutBlock.for_text('John')),
         ('<surname>', LayoutBlock.for_text('Smith')),
     ]
     extractor = NameSemanticExtractor()
     extracted = list(extractor.iter_semantic_content_for_entity_blocks(
         entity_blocks,
         name_type=SemanticEditor))
     assert len(extracted) == 1
     editor = extracted[0]
     assert isinstance(editor, SemanticEditor)
     assert editor.given_name_text == 'John'
     assert editor.surname_text == 'Smith'
Exemplo n.º 12
0
 def test_should_extract_single_figure(self):
     """Label and figDesc entity blocks are combined into one SemanticFigure."""
     entity_blocks = [
         ('<label>', LayoutBlock.for_text('Figure 1')),
         ('<figDesc>', LayoutBlock.for_text('Caption 1')),
     ]
     extractor = FigureSemanticExtractor()
     extracted = list(
         extractor.iter_semantic_content_for_entity_blocks(entity_blocks))
     assert len(extracted) == 1
     extracted_figure = extracted[0]
     assert isinstance(extracted_figure, SemanticFigure)
     label_text = extracted_figure.view_by_type(SemanticLabel).get_text()
     assert label_text == 'Figure 1'
     caption_text = extracted_figure.view_by_type(SemanticCaption).get_text()
     assert caption_text == 'Caption 1'
Exemplo n.º 13
0
 def test_should_render_graphic_element(self):
     """A figure containing a SemanticGraphic produces a ``tei:graphic`` match."""
     graphic = SemanticGraphic(
         layout_graphic=LayoutGraphic(local_file_path='image1.png'))
     figure_content = [
         SemanticLabel(layout_block=LayoutBlock.for_text('Label 1')),
         SemanticCaption(layout_block=LayoutBlock.for_text('Caption 1')),
         graphic,
     ]
     wrapped_figure = _get_wrapped_figure_tei_element(
         SemanticFigure(figure_content, content_id='fig_0'))
     # Only the truthiness of the result is asserted, not its content.
     graphic_matches = wrapped_figure.get_xpath_text_content_list(
         f'{FIGURE_XPATH}/tei:graphic')
     assert graphic_matches
Exemplo n.º 14
0
 def test_should_extract_single_raw_reference(self):
     """Label and reference entity blocks form a single SemanticRawReference."""
     entity_blocks = [
         ('<label>', LayoutBlock.for_text('1')),
         ('<reference>', LayoutBlock.for_text('Reference 1')),
     ]
     extractor = ReferenceSegmenterSemanticExtractor()
     extracted = list(
         extractor.iter_semantic_content_for_entity_blocks(entity_blocks))
     assert len(extracted) == 1
     raw_reference = extracted[0]
     assert isinstance(raw_reference, SemanticRawReference)
     label_text = raw_reference.view_by_type(SemanticLabel).get_text()
     assert label_text == '1'
     raw_text = raw_reference.view_by_type(SemanticRawReferenceText).get_text()
     assert raw_text == 'Reference 1'
 def test_should_preserve_empty_pages_if_requested(self):
     """With ``preserve_empty_pages=True`` both pages remain in the result."""
     page_with_token = LayoutPage(
         blocks=[
             LayoutBlock(lines=[LayoutLine(tokens=[LayoutToken('token1')])])
         ],
         graphics=[])
     # Second page only holds a block whose single line has no tokens.
     page_without_tokens = LayoutPage(
         blocks=[LayoutBlock(lines=[LayoutLine(tokens=[])])],
         graphics=[])
     layout_document = LayoutDocument(
         pages=[page_with_token, page_without_tokens])
     cleaned = remove_empty_blocks(
         layout_document, preserve_empty_pages=True)
     page_count = len(cleaned.pages)
     assert page_count == 2
Exemplo n.º 16
0
 def test_should_add_paragraphs_without_title(self):
     """Two paragraph blocks without a heading end up in one section."""
     entity_blocks = [
         ('<paragraph>', LayoutBlock.for_text(SECTION_PARAGRAPH_1)),
         ('<paragraph>', LayoutBlock.for_text(SECTION_PARAGRAPH_2)),
     ]
     extractor = FullTextSemanticExtractor()
     extracted = list(
         extractor.iter_semantic_content_for_entity_blocks(entity_blocks))
     assert len(extracted) == 1
     only_section = extracted[0]
     assert isinstance(only_section, SemanticSection)
     paragraph_texts = only_section.get_paragraph_text_list()
     assert paragraph_texts == [SECTION_PARAGRAPH_1, SECTION_PARAGRAPH_2]
Exemplo n.º 17
0
 def test_should_raw_table_for_table_text_to_section(self):
     """A table block is added to the section as SemanticRawTable, not as a paragraph."""
     entity_blocks = [
         ('<paragraph>', LayoutBlock.for_text(SECTION_PARAGRAPH_1)),
         ('<table>', LayoutBlock.for_text(SECTION_PARAGRAPH_2)),
     ]
     extractor = FullTextSemanticExtractor()
     extracted = list(
         extractor.iter_semantic_content_for_entity_blocks(entity_blocks))
     assert len(extracted) == 1
     only_section = extracted[0]
     assert isinstance(only_section, SemanticSection)
     paragraph_texts = only_section.get_paragraph_text_list()
     assert paragraph_texts == [SECTION_PARAGRAPH_1]
     raw_table_text = only_section.get_text_by_type(SemanticRawTable)
     assert raw_table_text == SECTION_PARAGRAPH_2
Exemplo n.º 18
0
 def test_should_add_all_fields(self):
     """Every affiliation/address field is rendered into the TEI affiliation.

     Also checks the raw affiliation note (full text) and its label.
     """
     affiliation = SemanticAffiliationAddress([
         SemanticMarker(layout_block=LayoutBlock.for_text('1')),
         SemanticInstitution(layout_block=LayoutBlock.for_text('Institution1')),
         SemanticDepartment(layout_block=LayoutBlock.for_text('Department1')),
         SemanticLaboratory(layout_block=LayoutBlock.for_text('Lab1')),
         SemanticAddressLine(layout_block=LayoutBlock.for_text('AddressLine1')),
         SemanticPostCode(layout_block=LayoutBlock.for_text('PostCode1')),
         SemanticPostBox(layout_block=LayoutBlock.for_text('PostBox1')),
         SemanticRegion(layout_block=LayoutBlock.for_text('Region1')),
         SemanticSettlement(layout_block=LayoutBlock.for_text('Settlement1')),
         SemanticCountry(layout_block=LayoutBlock.for_text('Country1')),
     ])
     aff_element = get_tei_affiliation_for_semantic_affiliation_address_element(
         affiliation,
         context=DEFAULT_TEI_ELEMENT_FACTORY_CONTEXT
     )
     tei_aff = TeiElementWrapper(aff_element)
     LOGGER.debug('tei_aff: %r', etree.tostring(tei_aff.element))
     raw_affiliation_texts = tei_aff.get_xpath_text_content_list(
         'tei:note[@type="raw_affiliation"]'
     )
     assert raw_affiliation_texts == [affiliation.get_text()]
     # Remaining checks, in the same order as the original assertions.
     expected_text_by_xpath = [
         ('tei:note[@type="raw_affiliation"]/tei:label', '1'),
         ('tei:orgName[@type="institution"]', 'Institution1'),
         ('tei:orgName[@type="department"]', 'Department1'),
         ('tei:orgName[@type="laboratory"]', 'Lab1'),
         ('tei:address/tei:addrLine', 'AddressLine1'),
         ('tei:address/tei:postCode', 'PostCode1'),
         ('tei:address/tei:postBox', 'PostBox1'),
         ('tei:address/tei:region', 'Region1'),
         ('tei:address/tei:settlement', 'Settlement1'),
         ('tei:address/tei:country', 'Country1'),
     ]
     for xpath, expected_text in expected_text_by_xpath:
         assert tei_aff.get_xpath_text_content_list(xpath) == [expected_text]
Exemplo n.º 19
0
 def test_should_create_back_section(self):
     """A back section with heading and paragraph is rendered inside the annex div."""
     document = SemanticDocument()
     back_section = document.back_section.add_new_section()
     back_section.add_heading_block(LayoutBlock.for_text(TOKEN_1))
     back_section.add_new_paragraph().add_block_content(
         LayoutBlock.for_text(TOKEN_2))
     tei = get_tei_for_semantic_document(document)
     LOGGER.debug('tei xml: %r', etree.tostring(tei.root))
     head_texts = tei.get_xpath_text_content_list(
         '//tei:back/tei:div[@type="annex"]/tei:div/tei:head')
     assert head_texts == [TOKEN_1]
     paragraph_texts = tei.get_xpath_text_content_list(
         '//tei:back/tei:div[@type="annex"]/tei:div/tei:p')
     assert paragraph_texts == [TOKEN_2]
Exemplo n.º 20
0
 def test_should_ignore_additional_title_and_abstract(self):
     """Only the first title and abstract are reported by the front."""
     # Note: this behaviour should be reviewed
     entity_blocks = [
         ('<title>', LayoutBlock.for_text(TITLE_1)),
         ('<abstract>', LayoutBlock.for_text(ABSTRACT_1)),
         ('<title>', LayoutBlock.for_text('other')),
         ('<abstract>', LayoutBlock.for_text('other')),
     ]
     extractor = HeaderSemanticExtractor()
     front = SemanticFront(list(
         extractor.iter_semantic_content_for_entity_blocks(entity_blocks)))
     LOGGER.debug('front: %s', front)
     title_text = front.get_text_by_type(SemanticTitle)
     assert title_text == TITLE_1
     abstract_text = front.get_text_by_type(SemanticAbstract)
     assert abstract_text == ABSTRACT_1
 def test_should_remove_empty_line_block_and_page(self):
     """By default the page whose only line has no tokens is dropped."""
     page_with_token = LayoutPage(
         blocks=[
             LayoutBlock(lines=[LayoutLine(tokens=[LayoutToken('token1')])])
         ],
         graphics=[])
     page_without_tokens = LayoutPage(
         blocks=[LayoutBlock(lines=[LayoutLine(tokens=[])])],
         graphics=[])
     layout_document = LayoutDocument(
         pages=[page_with_token, page_without_tokens])
     cleaned = remove_empty_blocks(layout_document)
     assert len(cleaned.pages) == 1
     remaining_line = cleaned.pages[0].blocks[0].lines[0]
     remaining_texts = [token.text for token in remaining_line.tokens]
     assert remaining_texts == ['token1']
Exemplo n.º 22
0
 def test_should_add_note_for_other_text_to_body(self):
     """Text tagged 'O' becomes a 'fulltext:other' note next to the section."""
     entity_blocks = [
         ('O', LayoutBlock.for_text(SECTION_PARAGRAPH_1)),
         ('<paragraph>', LayoutBlock.for_text(SECTION_PARAGRAPH_2)),
     ]
     extractor = FullTextSemanticExtractor()
     # Wrap the extracted content in a section to use its query helpers.
     wrapper_section = SemanticSection(list(
         extractor.iter_semantic_content_for_entity_blocks(entity_blocks)))
     other_notes = wrapper_section.get_notes_text_list('fulltext:other')
     assert other_notes == [SECTION_PARAGRAPH_1]
     child_sections = wrapper_section.sections
     assert len(child_sections) == 1
     paragraph_texts = child_sections[0].get_paragraph_text_list()
     assert paragraph_texts == [SECTION_PARAGRAPH_2]
Exemplo n.º 23
0
 def test_should_add_separate_section_label(self):
     """A leading '1 ' in the section text is split off as the heading label."""
     entity_blocks = [
         ('<section>', LayoutBlock.for_text('1 ' + SECTION_TITLE_1)),
         ('<paragraph>', LayoutBlock.for_text(SECTION_PARAGRAPH_1)),
     ]
     extractor = FullTextSemanticExtractor()
     extracted = list(
         extractor.iter_semantic_content_for_entity_blocks(entity_blocks))
     assert len(extracted) == 1
     only_section = extracted[0]
     assert isinstance(only_section, SemanticSection)
     headings = list(only_section.iter_by_type(SemanticHeading))
     assert len(headings) == 1
     heading = headings[0]
     assert heading.get_text_by_type(SemanticLabel) == '1'
     assert heading.get_text_by_type(SemanticTitle) == SECTION_TITLE_1
Exemplo n.º 24
0
 def test_should_split_raw_affiliation_on_new_aff_without_address(self):
     """Two consecutive affiliation blocks yield two raw affiliation addresses."""
     entity_blocks = [
         ('<affiliation>', LayoutBlock.for_text(AFFILIATION_1)),
         ('<affiliation>', LayoutBlock.for_text(AFFILIATION_2)),
     ]
     extractor = HeaderSemanticExtractor()
     front = SemanticFront(list(
         extractor.iter_semantic_content_for_entity_blocks(entity_blocks)))
     LOGGER.debug('front: %s', front)
     raw_aff_addresses = list(
         front.iter_by_type(SemanticRawAffiliationAddress))
     affiliation_texts = [
         raw_aff_address.get_text_by_type(SemanticRawAffiliation)
         for raw_aff_address in raw_aff_addresses
     ]
     assert affiliation_texts == [AFFILIATION_1, AFFILIATION_2]
Exemplo n.º 25
0
 def test_should_reject_reference_without_any_detected_fields(self):
     """A reference whose text is only tagged 'O' becomes a SemanticInvalidReference."""
     raw_text = SemanticRawReferenceText(
         layout_block=LayoutBlock.for_text('Reference 1'))
     raw_reference = SemanticRawReference([raw_text], content_id='raw1')
     entity_blocks = [
         ('O', LayoutBlock.for_text(raw_text.get_text())),
     ]
     extractor = CitationSemanticExtractor()
     extracted = list(extractor.iter_semantic_content_for_entity_blocks(
         entity_blocks,
         semantic_raw_reference=raw_reference))
     assert len(extracted) == 1
     invalid_reference = extracted[0]
     assert isinstance(invalid_reference, SemanticInvalidReference)
     assert invalid_reference.get_text() == raw_text.get_text()
Exemplo n.º 26
0
 def test_should_include_reference_citation_in_paragraph(self):
     """A citation marker between paragraph blocks is found within the single section."""
     entity_blocks = [
         ('<paragraph>', LayoutBlock.for_text(SECTION_PARAGRAPH_1)),
         ('<citation_marker>', LayoutBlock.for_text('Ref 1')),
         ('<paragraph>', LayoutBlock.for_text(SECTION_PARAGRAPH_2)),
     ]
     extractor = FullTextSemanticExtractor()
     extracted = list(
         extractor.iter_semantic_content_for_entity_blocks(entity_blocks))
     LOGGER.debug('semantic_content_list: %s', extracted)
     assert len(extracted) == 1
     only_section = extracted[0]
     assert isinstance(only_section, SemanticSection)
     citations = list(
         only_section.iter_by_type_recursively(SemanticReferenceCitation))
     assert len(citations) == 1
     citation_text = citations[0].get_text()
     assert citation_text == 'Ref 1'
Exemplo n.º 27
0
 def test_should_render_label_description_and_id(self):
     """The figure element carries head, label, figDesc and xml:id."""
     figure_content = [
         SemanticLabel(layout_block=LayoutBlock.for_text('Label 1')),
         SemanticCaption(layout_block=LayoutBlock.for_text('Caption 1')),
     ]
     wrapped_figure = _get_wrapped_figure_tei_element(
         SemanticFigure(figure_content, content_id='fig_0'))
     head_texts = wrapped_figure.get_xpath_text_content_list(
         f'{FIGURE_XPATH}/tei:head')
     assert head_texts == ['Label 1']
     label_texts = wrapped_figure.get_xpath_text_content_list(
         f'{FIGURE_XPATH}/tei:label')
     assert label_texts == ['Label 1']
     caption_texts = wrapped_figure.get_xpath_text_content_list(
         f'{FIGURE_XPATH}/tei:figDesc')
     assert caption_texts == ['Caption 1']
     figure_ids = wrapped_figure.get_xpath_text_content_list(
         f'{FIGURE_XPATH}/@xml:id')
     assert figure_ids == ['fig_0']
Exemplo n.º 28
0
 def test_should_add_raw_affiliation_address(self):
     """An affiliation followed by an address forms one raw affiliation address."""
     entity_blocks = [
         ('<affiliation>', LayoutBlock.for_text(AFFILIATION_1)),
         ('<address>', LayoutBlock.for_text(ADDRESS_1)),
     ]
     extractor = HeaderSemanticExtractor()
     front = SemanticFront(list(
         extractor.iter_semantic_content_for_entity_blocks(entity_blocks)))
     LOGGER.debug('front: %s', front)
     raw_aff_addresses = list(
         front.iter_by_type(SemanticRawAffiliationAddress))
     assert len(raw_aff_addresses) == 1
     raw_aff_address = raw_aff_addresses[0]
     affiliation_text = raw_aff_address.get_text_by_type(SemanticRawAffiliation)
     assert affiliation_text == AFFILIATION_1
     address_text = raw_aff_address.get_text_by_type(SemanticRawAddress)
     assert address_text == ADDRESS_1
Exemplo n.º 29
0
 def test_should_extract_preceeding_other_text(self):
     """Text tagged 'O' before the marker is returned as a separate SemanticNote."""
     entity_blocks = [
         ('O', LayoutBlock.for_text('Other 1')),
         ('<marker>', LayoutBlock.for_text('1')),
         ('<institution>', LayoutBlock.for_text('Institution 1')),
     ]
     extractor = AffiliationAddressSemanticExtractor()
     extracted = list(
         extractor.iter_semantic_content_for_entity_blocks(entity_blocks))
     assert len(extracted) == 2
     leading_note = extracted[0]
     assert isinstance(leading_note, SemanticNote)
     aff_address = extracted[1]
     assert isinstance(aff_address, SemanticAffiliationAddress)
     marker_text = aff_address.view_by_type(SemanticMarker).get_text()
     assert marker_text == '1'
     institution_text = aff_address.view_by_type(
         SemanticInstitution).get_text()
     assert institution_text == 'Institution 1'
Exemplo n.º 30
0
 def test_should_provide_block_relative_line_length(
         self, features_provider: SegmentationLineFeaturesProvider):
     """Line length feature values for lines of 1, 2 and 10 characters in one block."""
     block = LayoutBlock(lines=[
         LayoutLine.for_text('1'),
         LayoutLine.for_text('12'),
         LayoutLine.for_text('1234567890'),
     ])
     layout_document = LayoutDocument(pages=[LayoutPage(blocks=[block])])
     feature_values = [
         {
             'str_block_relative_line_length_feature':
             features.get_str_block_relative_line_length_feature()
         }
         for features in _iter_line_features(features_provider, layout_document)
     ]
     LOGGER.debug('feature_values: %r', feature_values)
     expected_feature_values = [
         {'str_block_relative_line_length_feature': '1'},  # 1 * 10 / 10
         {'str_block_relative_line_length_feature': '2'},  # 2 * 10 / 10
         {'str_block_relative_line_length_feature': '10'},  # 10 * 10 / 10
     ]
     assert feature_values == expected_feature_values