def test_should_not_create_person_name_for_note(self):
    """Expect one author node per forename/surname/marker run.

    The single input sequence holds two such runs, with 'O' labelled lines
    before, between and after them. The generated XML must contain exactly
    two AUTHOR_XPATH nodes whose forenames are 'John' and 'Maria'.
    """
    label_and_text_list = [
        ('O', 'before'),
        ('<forename>', 'John'),
        ('<surname>', 'Smith'),
        ('<marker>', '1'),
        ('O', ','),
        ('<forename>', 'Maria'),
        ('<surname>', 'Madison'),
        ('<marker>', '2'),
        ('O', 'after'),
    ]
    # Layout lines are requested in the same order as the pairs above.
    sequence = [
        (label, get_next_layout_line_for_text(text))
        for label, text in label_and_text_list
    ]
    labeled_model_data_list_list = get_labeled_model_data_list_list(
        [sequence],
        data_generator=get_data_generator()
    )
    generator = get_tei_training_data_generator()
    xml_root = generator.get_training_tei_xml_for_multiple_model_data_iterables(
        labeled_model_data_list_list
    )
    LOGGER.debug('xml: %r', etree.tostring(xml_root))
    author_nodes = tei_xpath(xml_root, AUTHOR_XPATH)
    assert len(author_nodes) == 2
    for author_node, forename in zip(author_nodes, ['John', 'Maria']):
        assert get_tei_xpath_text_content_list(
            author_node, './tei:forename'
        ) == [forename]
示例#2
0
 def test_should_add_bold_and_italics_text(self):
     """A token using BOLD_ITALICS_FONT_1 must show up in a bold and an italic tei:hi.

     The overall text content must still be the three tokens joined by spaces.
     """
     tokens = [
         LayoutToken(TOKEN_1),
         LayoutToken(TOKEN_2, font=BOLD_ITALICS_FONT_1),
         LayoutToken(TOKEN_3),
     ]
     layout_block = LayoutBlock.for_tokens(tokens)
     tei_children = iter_layout_block_tei_children(layout_block)
     node = TEI_E.node(*tei_children)
     LOGGER.debug('xml: %r', etree.tostring(node))
     # Both rend variants are expected to wrap the very same token.
     for hi_xpath in ('.//tei:hi[@rend="bold"]', './/tei:hi[@rend="italic"]'):
         assert get_tei_xpath_text_content_list(node, hi_xpath) == [TOKEN_2]
     expected_text = ' '.join([TOKEN_1, TOKEN_2, TOKEN_3])
     assert get_text_content(node) == expected_text
示例#3
0
 def test_should_generate_tei_for_most_labels(self):
     """Check the head and figDesc text found below FIGURE_XPATH."""
     label_and_text_list = [
         ('<figure_head>', 'Figure Head 1'),
         ('<figDesc>', 'Figure Desc 1'),
     ]
     label_and_layout_line_list = [
         (label, get_next_layout_line_for_text(text))
         for label, text in label_and_text_list
     ]
     labeled_model_data_list = get_labeled_model_data_list(
         label_and_layout_line_list,
         data_generator=get_data_generator()
     )
     xml_root = get_training_tei_xml_for_model_data_iterable(
         labeled_model_data_list
     )
     expected_texts_by_xpath = [
         (f'{FIGURE_XPATH}/head', ['Figure Head 1']),
         (f'{FIGURE_XPATH}/figDesc', ['Figure Desc 1']),
     ]
     for xpath, expected_texts in expected_texts_by_xpath:
         assert get_tei_xpath_text_content_list(
             xml_root, xpath
         ) == expected_texts
示例#4
0
 def test_should_add_label_to_head_element_without_additional_text(self):
     """A '<label>' line must end up as head/label, with head holding only that text.

     The following '<figDesc>' line is expected under figDesc.
     """
     label_and_text_list = [
         ('<label>', 'Figure Label 1'),
         ('<figDesc>', 'Figure Desc 1'),
     ]
     label_and_layout_line_list = [
         (label, get_next_layout_line_for_text(text))
         for label, text in label_and_text_list
     ]
     labeled_model_data_list = get_labeled_model_data_list(
         label_and_layout_line_list,
         data_generator=get_data_generator()
     )
     xml_root = get_training_tei_xml_for_model_data_iterable(
         labeled_model_data_list
     )
     expected_texts_by_xpath = [
         (f'{FIGURE_XPATH}/head/label', ['Figure Label 1']),
         (f'{FIGURE_XPATH}/head', ['Figure Label 1']),
         (f'{FIGURE_XPATH}/figDesc', ['Figure Desc 1']),
     ]
     for xpath, expected_texts in expected_texts_by_xpath:
         assert get_tei_xpath_text_content_list(
             xml_root, xpath
         ) == expected_texts
示例#5
0
 def test_should_add_label_at_the_end_inside_head_element(self):
     """A '<label>' line after a '<figure_head>' line must be nested in the same head.

     The head text is expected to be the head and label texts joined by a
     newline.
     """
     head_text = 'Figure Head 1'
     label_text = 'Figure Label 1'
     head_line = get_next_layout_line_for_text(head_text)
     label_line = get_next_layout_line_for_text(label_text)
     labeled_model_data_list = get_labeled_model_data_list(
         [('<figure_head>', head_line), ('<label>', label_line)],
         data_generator=get_data_generator()
     )
     xml_root = get_training_tei_xml_for_model_data_iterable(
         labeled_model_data_list
     )
     nested_label_texts = get_tei_xpath_text_content_list(
         xml_root, f'{FIGURE_XPATH}/head/label'
     )
     assert nested_label_texts == ['Figure Label 1']
     head_texts = get_tei_xpath_text_content_list(
         xml_root, f'{FIGURE_XPATH}/head'
     )
     assert head_texts == ['\n'.join(['Figure Head 1', 'Figure Label 1'])]
示例#6
0
 def test_should_generate_tei_from_multiple_model_data_lists_using_model_labels(
         self):
     """Expect one FIGURE_XPATH node per input sequence, each with its own head text."""
     texts = [TEXT_1, TEXT_2]
     # One single-line sequence per text, created in order.
     label_and_layout_line_list_list = [
         [('<figure_head>', get_next_layout_line_for_text(text))]
         for text in texts
     ]
     labeled_model_data_list_list = get_labeled_model_data_list_list(
         label_and_layout_line_list_list,
         data_generator=get_data_generator()
     )
     generator = get_tei_training_data_generator()
     xml_root = generator.get_training_tei_xml_for_multiple_model_data_iterables(
         labeled_model_data_list_list
     )
     LOGGER.debug('xml: %r', etree.tostring(xml_root))
     figure_nodes = tei_xpath(xml_root, FIGURE_XPATH)
     assert len(figure_nodes) == 2
     for figure_node, expected_text in zip(figure_nodes, texts):
         assert get_tei_xpath_text_content_list(
             figure_node, './head'
         ) == [expected_text]
示例#7
0
 def test_should_generate_tei_from_multiple_model_data_lists_using_model_labels(
         self):
     """Expect one AFFILIATION_XPATH node per input sequence, each with its institution."""
     texts = [TEXT_1, TEXT_2]
     # One single-line sequence per text, created in order.
     label_and_layout_line_list_list = [
         [('<institution>', get_next_layout_line_for_text(text))]
         for text in texts
     ]
     labeled_model_data_list_list = get_labeled_model_data_list_list(
         label_and_layout_line_list_list,
         data_generator=get_data_generator()
     )
     generator = AffiliationAddressTeiTrainingDataGenerator()
     xml_root = generator.get_training_tei_xml_for_multiple_model_data_iterables(
         labeled_model_data_list_list
     )
     LOGGER.debug('xml: %r', etree.tostring(xml_root))
     aff_nodes = tei_xpath(xml_root, AFFILIATION_XPATH)
     assert len(aff_nodes) == 2
     for aff_node, expected_text in zip(aff_nodes, texts):
         assert get_tei_xpath_text_content_list(
             aff_node, './tei:orgName[@type="institution"]'
         ) == [expected_text]
 def test_should_map_unknown_label_to_note(self):
     """An '<unknown>' label must produce a tei:note with type 'unknown' below AUTHOR_XPATH."""
     layout_line = get_next_layout_line_for_text(TEXT_1)
     labeled_model_data_list = get_labeled_model_data_list(
         [('<unknown>', layout_line)],
         data_generator=get_data_generator()
     )
     xml_root = get_training_tei_xml_for_model_data_iterable(
         labeled_model_data_list
     )
     note_texts = get_tei_xpath_text_content_list(
         xml_root, f'{AUTHOR_XPATH}/tei:note[@type="unknown"]'
     )
     assert note_texts == [TEXT_1]
 def test_should_map_pubnum_to_idno_with_type_if_detected(
         self, test_input: str, expected_type: Optional[str]):
     """A '<pubnum>' line must become a tei:idno, typed only if a type is expected.

     With a truthy ``expected_type`` the idno is looked up by that ``@type``;
     otherwise it is looked up as an idno without any ``@type`` attribute.
     NOTE(review): the arguments are presumably supplied by a parametrize
     decorator that is not part of this excerpt - confirm.
     """
     layout_line = get_next_layout_line_for_text(test_input)
     labeled_model_data_list = get_labeled_model_data_list(
         [('<pubnum>', layout_line)],
         data_generator=get_data_generator()
     )
     xml_root = get_training_tei_xml_for_model_data_iterable(
         labeled_model_data_list
     )
     if expected_type:
         idno_xpath = f'{BIBL_XPATH}/tei:idno[@type="{expected_type}"]'
     else:
         idno_xpath = f'{BIBL_XPATH}/tei:idno[not(@type)]'
     assert get_tei_xpath_text_content_list(
         xml_root, idno_xpath
     ) == [test_input]
示例#10
0
 def test_should_add_superscript_text(self):
     """A token using SUPERSCRIPT_FONT_1 must be wrapped in a superscript tei:hi child.

     The overall text content must still be the three tokens joined by spaces.
     """
     tokens = [
         LayoutToken(TOKEN_1),
         LayoutToken(TOKEN_2, font=SUPERSCRIPT_FONT_1),
         LayoutToken(TOKEN_3),
     ]
     layout_block = LayoutBlock.for_tokens(tokens)
     tei_children = iter_layout_block_tei_children(layout_block)
     node = TEI_E.node(*tei_children)
     superscript_texts = get_tei_xpath_text_content_list(
         node, './tei:hi[@rend="superscript"]'
     )
     assert superscript_texts == [TOKEN_2]
     expected_text = ' '.join([TOKEN_1, TOKEN_2, TOKEN_3])
     assert get_text_content(node) == expected_text
 def test_should_map_other_label_as_text_without_note(self):
     """An '<other>' label must yield plain text below BIBL_XPATH and no tei:note.

     The BIBL_XPATH text content is expected to be TEXT_1 followed by a
     newline.
     """
     layout_line = get_next_layout_line_for_text(TEXT_1)
     labeled_model_data_list = get_labeled_model_data_list(
         [('<other>', layout_line)],
         data_generator=get_data_generator()
     )
     generator = get_tei_training_data_generator()
     xml_root = generator.get_training_tei_xml_for_model_data_iterable(
         labeled_model_data_list
     )
     LOGGER.debug('xml: %r', etree.tostring(xml_root))
     note_nodes = tei_xpath(xml_root, f'{BIBL_XPATH}//tei:note')
     assert not note_nodes
     bibl_texts = get_tei_xpath_text_content_list(xml_root, BIBL_XPATH)
     assert bibl_texts == [f'{TEXT_1}\n']
 def test_should_generate_tei_for_most_labels(self):
     """Check the element found below AUTHOR_XPATH for each name-related label.

     Note that the '<title>' label is expected as tei:roleName.
     """
     label_and_text_list = [
         ('<marker>', 'Marker 1'),
         ('<title>', 'Title 1'),
         ('<forename>', 'Forename 1'),
         ('<middlename>', 'Middlename 1'),
         ('<surname>', 'Surname 1'),
         ('<suffix>', 'Suffix 1'),
     ]
     label_and_layout_line_list = [
         (label, get_next_layout_line_for_text(text))
         for label, text in label_and_text_list
     ]
     labeled_model_data_list = get_labeled_model_data_list(
         label_and_layout_line_list,
         data_generator=get_data_generator()
     )
     xml_root = get_training_tei_xml_for_model_data_iterable(
         labeled_model_data_list
     )
     expected_texts_by_xpath = [
         (f'{AUTHOR_XPATH}/tei:marker', ['Marker 1']),
         (f'{AUTHOR_XPATH}/tei:roleName', ['Title 1']),
         (f'{AUTHOR_XPATH}/tei:forename', ['Forename 1']),
         (f'{AUTHOR_XPATH}/tei:middlename', ['Middlename 1']),
         (f'{AUTHOR_XPATH}/tei:surname', ['Surname 1']),
         (f'{AUTHOR_XPATH}/tei:suffix', ['Suffix 1']),
     ]
     for xpath, expected_texts in expected_texts_by_xpath:
         assert get_tei_xpath_text_content_list(
             xml_root, xpath
         ) == expected_texts
示例#13
0
 def test_should_map_unknown_label_to_note(self):
     """An '<unknown>' label must produce a tei:note with type 'unknown' in the affiliation.

     Exactly one AFFILIATION_XPATH node is expected, with a text content of
     TEXT_1 followed by a newline.
     """
     layout_line = get_next_layout_line_for_text(TEXT_1)
     labeled_model_data_list = get_labeled_model_data_list(
         [('<unknown>', layout_line)],
         data_generator=get_data_generator()
     )
     generator = AffiliationAddressTeiTrainingDataGenerator()
     xml_root = generator.get_training_tei_xml_for_model_data_iterable(
         labeled_model_data_list
     )
     LOGGER.debug('xml: %r', etree.tostring(xml_root))
     aff_nodes = tei_xpath(xml_root, AFFILIATION_XPATH)
     assert len(aff_nodes) == 1
     aff_node = aff_nodes[0]
     note_texts = get_tei_xpath_text_content_list(
         aff_node, './tei:note[@type="unknown"]'
     )
     assert note_texts == [TEXT_1]
     assert get_text_content(aff_node) == f'{TEXT_1}\n'
示例#14
0
def _check_tei_training_data_generator_output(
        tei_training_data_generator: TeiTrainingDataGenerator,
        output_path: Path, expect_raw_data: bool, tei_xml_xpath: str,
        tei_expected_values: Sequence[str], **kwargs):
    """Run the shared output check, then compare the texts found at an XPath.

    The generator, output path, raw-data flag and any extra keyword arguments
    are forwarded to
    ``_check_tei_training_data_generator_output_and_return_xml_root``. The
    texts selected by ``tei_xml_xpath`` from the returned XML root must equal
    ``tei_expected_values`` once both sides are whitespace-normalized.
    """
    xml_root = _check_tei_training_data_generator_output_and_return_xml_root(
        tei_training_data_generator=tei_training_data_generator,
        output_path=output_path,
        expect_raw_data=expect_raw_data,
        **kwargs
    )
    actual_values = normalize_whitespace_list(
        get_tei_xpath_text_content_list(xml_root, tei_xml_xpath)
    )
    expected_values = list(map(normalize_whitespace, tei_expected_values))
    assert actual_values == expected_values
示例#15
0
 def test_should_not_join_separate_labels(self):
     """Two consecutive '<institution>' lines must stay two separate orgName elements.

     Both are expected inside a single AFFILIATION_XPATH node whose text
     content has each text followed by a newline.
     """
     label_and_layout_line_list = [
         ('<institution>', get_next_layout_line_for_text(text))
         for text in (TEXT_1, TEXT_2)
     ]
     labeled_model_data_list = get_labeled_model_data_list(
         label_and_layout_line_list,
         data_generator=get_data_generator()
     )
     generator = AffiliationAddressTeiTrainingDataGenerator()
     xml_root = generator.get_training_tei_xml_for_model_data_iterable(
         labeled_model_data_list
     )
     LOGGER.debug('xml: %r', etree.tostring(xml_root))
     aff_nodes = tei_xpath(xml_root, AFFILIATION_XPATH)
     assert len(aff_nodes) == 1
     aff_node = aff_nodes[0]
     institution_texts = get_tei_xpath_text_content_list(
         aff_node, './tei:orgName[@type="institution"]'
     )
     assert institution_texts == [TEXT_1, TEXT_2]
     assert get_text_content(aff_node) == f'{TEXT_1}\n{TEXT_2}\n'
示例#16
0
 def test_should_be_able_to_set_title_with_italic_layout_tokens(self):
     """Setting a title block with an italic token must keep the italics as tei:hi.

     Exactly one main title node is expected, and ``get_title`` must return
     the three token texts separated by spaces.
     """
     tokens = [
         LayoutToken('rend'),
         LayoutToken('italic1', font=ITALICS_FONT_1),
         LayoutToken('test'),
     ]
     title_block = LayoutBlock.for_tokens(tokens)
     document = TeiDocument()
     document.set_title_layout_block(title_block)
     LOGGER.debug('xml: %r', etree.tostring(document.root))
     title_xpath = (
         '//tei:fileDesc/tei:titleStmt/tei:title[@level="a"][@type="main"]'
     )
     title_nodes = document.root.xpath(title_xpath, namespaces=TEI_NS_MAP)
     assert len(title_nodes) == 1
     italic_texts = get_tei_xpath_text_content_list(
         title_nodes[0], './tei:hi[@rend="italic"]'
     )
     assert italic_texts == ['italic1']
     assert document.get_title() == 'rend italic1 test'
示例#17
0
 def test_should_add_label_to_head_element_without_additional_text(self):
     """A '<label>' line must end up as head/label, with head holding only that text.

     The remaining labels are checked as well: '<figDesc>' under figDesc,
     '<content>' under table, '<other>' under other and '<note>' under note,
     all below TABLE_XPATH.
     """
     label_and_text_list = [
         ('<label>', 'Table Label 1'),
         ('<figDesc>', 'Table Desc 1'),
         ('<content>', 'Content 1'),
         ('<other>', 'Other 1'),
         ('<note>', 'Note 1'),
     ]
     label_and_layout_line_list = [
         (label, get_next_layout_line_for_text(text))
         for label, text in label_and_text_list
     ]
     labeled_model_data_list = get_labeled_model_data_list(
         label_and_layout_line_list,
         data_generator=get_data_generator()
     )
     xml_root = get_training_tei_xml_for_model_data_iterable(
         labeled_model_data_list
     )
     expected_texts_by_xpath = [
         (f'{TABLE_XPATH}/head/label', ['Table Label 1']),
         (f'{TABLE_XPATH}/head', ['Table Label 1']),
         (f'{TABLE_XPATH}/figDesc', ['Table Desc 1']),
         (f'{TABLE_XPATH}/table', ['Content 1']),
         (f'{TABLE_XPATH}/other', ['Other 1']),
         (f'{TABLE_XPATH}/note', ['Note 1']),
     ]
     for xpath, expected_texts in expected_texts_by_xpath:
         assert get_tei_xpath_text_content_list(
             xml_root, xpath
         ) == expected_texts
示例#18
0
 def get_abstract(self) -> str:
     """Return the texts matching ``//tei:abstract/tei:p`` on ``self.root``, newline-joined."""
     paragraph_texts = get_tei_xpath_text_content_list(
         self.root,
         '//tei:abstract/tei:p',
     )
     return '\n'.join(paragraph_texts)
示例#19
0
 def get_title_text(self) -> str:
     """Return the text of the ``tei:head`` descendants of ``self.element``.

     The texts of all matching elements are joined with newlines; the result
     is an empty string when nothing matches.
     """
     # The query must be anchored with a leading '.': an absolute '//tei:head'
     # is resolved from the root of the containing document, which would also
     # pick up heads that live outside of this element.
     # NOTE(review): assumes the helper evaluates the expression against the
     # element passed in, as its other element-relative call sites suggest.
     head_texts = get_tei_xpath_text_content_list(
         self.element,
         './/tei:head',
     )
     return '\n'.join(head_texts)
示例#20
0
 def get_paragraph_text_list(self) -> List[str]:
     """Return the text of each ``tei:p`` descendant of ``self.element``."""
     # The query must be anchored with a leading '.': an absolute '//tei:p'
     # is resolved from the root of the containing document, which would also
     # return paragraphs that live outside of this element.
     # NOTE(review): assumes the helper evaluates the expression against the
     # element passed in, as its other element-relative call sites suggest.
     return get_tei_xpath_text_content_list(
         self.element,
         './/tei:p',
     )
示例#21
0
 def get_title(self) -> str:
     """Return the main title text(s) found on ``self.root``, newline-joined.

     The title is looked up as a ``tei:title`` with ``level="a"`` and
     ``type="main"`` inside ``tei:fileDesc/tei:titleStmt``.
     """
     title_texts = get_tei_xpath_text_content_list(
         self.root,
         '//tei:fileDesc/tei:titleStmt/tei:title[@level="a"][@type="main"]',
     )
     return '\n'.join(title_texts)
示例#22
0
 def test_should_generate_tei_for_most_labels(self):
     """Check the element generated for each affiliation and address label.

     A single AFFILIATION_XPATH node is expected. The address related labels
     are separated by 'O' labelled commas, and the combined tei:address text
     must be the address texts joined by a comma on its own line.
     """
     label_and_text_list = [
         ('<marker>', 'Marker 1'),
         ('<institution>', 'Institution 1'),
         ('<department>', 'Department 1'),
         ('<laboratory>', 'Laboratory 1'),
         ('<addrLine>', 'AddrLine 1'),
         ('O', ','),
         ('<postCode>', 'PostCode 1'),
         ('O', ','),
         ('<postBox>', 'PostBox 1'),
         ('O', ','),
         ('<region>', 'Region 1'),
         ('O', ','),
         ('<settlement>', 'Settlement 1'),
         ('O', ','),
         ('<country>', 'Country 1'),
     ]
     # Layout lines are requested in the same order as the pairs above.
     label_and_layout_line_list = [
         (label, get_next_layout_line_for_text(text))
         for label, text in label_and_text_list
     ]
     labeled_model_data_list = get_labeled_model_data_list(
         label_and_layout_line_list,
         data_generator=get_data_generator()
     )
     generator = AffiliationAddressTeiTrainingDataGenerator()
     xml_root = generator.get_training_tei_xml_for_model_data_iterable(
         labeled_model_data_list
     )
     LOGGER.debug('xml: %r', etree.tostring(xml_root))
     aff_nodes = tei_xpath(xml_root, AFFILIATION_XPATH)
     assert len(aff_nodes) == 1
     aff_node = aff_nodes[0]
     expected_texts_by_xpath = [
         ('./tei:marker', ['Marker 1']),
         ('./tei:orgName[@type="institution"]', ['Institution 1']),
         ('./tei:orgName[@type="department"]', ['Department 1']),
         ('./tei:orgName[@type="laboratory"]', ['Laboratory 1']),
         ('./tei:address/tei:addrLine', ['AddrLine 1']),
         ('./tei:address/tei:postCode', ['PostCode 1']),
         ('./tei:address/tei:postBox', ['PostBox 1']),
         ('./tei:address/tei:region', ['Region 1']),
         ('./tei:address/tei:settlement', ['Settlement 1']),
         ('./tei:address/tei:country', ['Country 1']),
     ]
     for xpath, expected_texts in expected_texts_by_xpath:
         assert get_tei_xpath_text_content_list(
             aff_node, xpath
         ) == expected_texts
     address_texts = get_tei_xpath_text_content_list(
         aff_node, './tei:address'
     )
     assert address_texts == [
         '\n,\n'.join([
             'AddrLine 1',
             'PostCode 1',
             'PostBox 1',
             'Region 1',
             'Settlement 1',
             'Country 1',
         ])
     ]
 def test_should_generate_tei_for_most_labels(self):
     """Check the element found below BIBL_XPATH for each citation label.

     Every label is fed as its own layout line; the expected element (and
     attribute, where one applies) for each label is listed in the table of
     XPath expressions further down.
     """
     label_and_text_list = [
         ('<title>', 'Title 1'),
         ('<author>', 'Author 1'),
         ('<editor>', 'Editor 1'),
         ('<institution>', 'Institution 1'),
         ('<collaboration>', 'Collaboration 1'),
         ('<journal>', 'Journal 1'),
         ('<series>', 'Series 1'),
         ('<booktitle>', 'Book Title 1'),
         ('<date>', 'Date 1'),
         ('<volume>', 'Volume 1'),
         ('<issue>', 'Issue 1'),
         ('<pages>', 'Pages 1'),
         ('<publisher>', 'Publisher 1'),
         ('<location>', 'Location 1'),
         ('<tech>', 'Tech 1'),
         ('<pubnum>', 'Pubnum 1'),
         ('<web>', 'Web 1'),
         ('<note>', 'Note 1'),
     ]
     # Layout lines are requested in the same order as the pairs above.
     label_and_layout_line_list = [
         (label, get_next_layout_line_for_text(text))
         for label, text in label_and_text_list
     ]
     labeled_model_data_list = get_labeled_model_data_list(
         label_and_layout_line_list,
         data_generator=get_data_generator()
     )
     xml_root = get_training_tei_xml_for_model_data_iterable(
         labeled_model_data_list
     )
     expected_texts_by_xpath = [
         (f'{BIBL_XPATH}/tei:title[@level="a"]', ['Title 1']),
         (f'{BIBL_XPATH}/tei:author', ['Author 1']),
         (f'{BIBL_XPATH}/tei:editor', ['Editor 1']),
         (f'{BIBL_XPATH}/tei:orgName[not(@type)]', ['Institution 1']),
         (
             f'{BIBL_XPATH}/tei:orgName[@type="collaboration"]',
             ['Collaboration 1'],
         ),
         (f'{BIBL_XPATH}/tei:title[@level="j"]', ['Journal 1']),
         (f'{BIBL_XPATH}/tei:title[@level="s"]', ['Series 1']),
         (f'{BIBL_XPATH}/tei:title[@level="m"]', ['Book Title 1']),
         (f'{BIBL_XPATH}/tei:date', ['Date 1']),
         (f'{BIBL_XPATH}/tei:biblScope[@unit="volume"]', ['Volume 1']),
         (f'{BIBL_XPATH}/tei:biblScope[@unit="issue"]', ['Issue 1']),
         (f'{BIBL_XPATH}/tei:biblScope[@unit="page"]', ['Pages 1']),
         (f'{BIBL_XPATH}/tei:publisher', ['Publisher 1']),
         (f'{BIBL_XPATH}/tei:pubPlace', ['Location 1']),
         (f'{BIBL_XPATH}/tei:note[@type="report"]', ['Tech 1']),
         (f'{BIBL_XPATH}/tei:idno', ['Pubnum 1']),
         (f'{BIBL_XPATH}/tei:ptr[@type="web"]', ['Web 1']),
         (f'{BIBL_XPATH}/tei:note[not(@type)]', ['Note 1']),
     ]
     for xpath, expected_texts in expected_texts_by_xpath:
         assert get_tei_xpath_text_content_list(
             xml_root, xpath
         ) == expected_texts