def test_should_not_create_person_name_for_note(self):
    """Check that two forename/surname/marker runs produce two author nodes.

    A single document is labelled with two name sequences separated by an
    ``O`` token; the generated TEI must contain exactly two nodes matching
    ``AUTHOR_XPATH``, carrying the forenames ``John`` and ``Maria``.

    NOTE(review): the test name refers to notes, but the body only checks
    author nodes - confirm the name still matches the intent.
    """
    labeled_lines_per_document = [[
        ('O', get_next_layout_line_for_text('before')),
        ('<forename>', get_next_layout_line_for_text('John')),
        ('<surname>', get_next_layout_line_for_text('Smith')),
        ('<marker>', get_next_layout_line_for_text('1')),
        ('O', get_next_layout_line_for_text(',')),
        ('<forename>', get_next_layout_line_for_text('Maria')),
        ('<surname>', get_next_layout_line_for_text('Madison')),
        ('<marker>', get_next_layout_line_for_text('2')),
        ('O', get_next_layout_line_for_text('after')),
    ]]
    model_data_per_document = get_labeled_model_data_list_list(
        labeled_lines_per_document,
        data_generator=get_data_generator()
    )
    generator = get_tei_training_data_generator()
    tei_root = generator.get_training_tei_xml_for_multiple_model_data_iterables(
        model_data_per_document
    )
    LOGGER.debug('xml: %r', etree.tostring(tei_root))

    author_nodes = tei_xpath(tei_root, AUTHOR_XPATH)
    assert len(author_nodes) == 2
    first_author, second_author = author_nodes[0], author_nodes[1]
    assert get_tei_xpath_text_content_list(first_author, './tei:forename') == ['John']
    assert get_tei_xpath_text_content_list(second_author, './tei:forename') == ['Maria']
def test_should_add_bold_and_italics_text(self):
    """Check that a bold-italic token is wrapped in both bold and italic ``hi``.

    The middle token uses ``BOLD_ITALICS_FONT_1``; it must be found under
    ``tei:hi[@rend="bold"]`` and ``tei:hi[@rend="italic"]``, while the node's
    overall text stays the three tokens joined by single spaces.
    """
    layout_tokens = [
        LayoutToken(TOKEN_1),
        LayoutToken(TOKEN_2, font=BOLD_ITALICS_FONT_1),
        LayoutToken(TOKEN_3)
    ]
    block = LayoutBlock.for_tokens(layout_tokens)
    node = TEI_E.node(*iter_layout_block_tei_children(block))
    LOGGER.debug('xml: %r', etree.tostring(node))

    bold_texts = get_tei_xpath_text_content_list(node, './/tei:hi[@rend="bold"]')
    assert bold_texts == [TOKEN_2]
    italic_texts = get_tei_xpath_text_content_list(node, './/tei:hi[@rend="italic"]')
    assert italic_texts == [TOKEN_2]
    assert get_text_content(node) == ' '.join((TOKEN_1, TOKEN_2, TOKEN_3))
def test_should_generate_tei_for_most_labels(self):
    """Check that figure head and description labels map to ``head``/``figDesc``."""
    labeled_lines = [
        ('<figure_head>', get_next_layout_line_for_text('Figure Head 1')),
        ('<figDesc>', get_next_layout_line_for_text('Figure Desc 1')),
    ]
    model_data = get_labeled_model_data_list(
        labeled_lines,
        data_generator=get_data_generator()
    )
    tei_root = get_training_tei_xml_for_model_data_iterable(model_data)

    head_texts = get_tei_xpath_text_content_list(tei_root, f'{FIGURE_XPATH}/head')
    assert head_texts == ['Figure Head 1']
    desc_texts = get_tei_xpath_text_content_list(tei_root, f'{FIGURE_XPATH}/figDesc')
    assert desc_texts == ['Figure Desc 1']
def test_should_add_label_to_head_element_without_additional_text(self):
    """Check that a lone figure ``<label>`` is nested in ``head`` with no extra text.

    Both ``head/label`` and ``head`` must have exactly the label text, and the
    following ``<figDesc>`` line must still be emitted as ``figDesc``.
    """
    labeled_lines = [
        ('<label>', get_next_layout_line_for_text('Figure Label 1')),
        ('<figDesc>', get_next_layout_line_for_text('Figure Desc 1'))
    ]
    model_data = get_labeled_model_data_list(
        labeled_lines,
        data_generator=get_data_generator()
    )
    tei_root = get_training_tei_xml_for_model_data_iterable(model_data)

    label_texts = get_tei_xpath_text_content_list(tei_root, f'{FIGURE_XPATH}/head/label')
    assert label_texts == ['Figure Label 1']
    head_texts = get_tei_xpath_text_content_list(tei_root, f'{FIGURE_XPATH}/head')
    assert head_texts == ['Figure Label 1']
    desc_texts = get_tei_xpath_text_content_list(tei_root, f'{FIGURE_XPATH}/figDesc')
    assert desc_texts == ['Figure Desc 1']
def test_should_add_label_at_the_end_inside_head_element(self):
    """Check that a ``<label>`` following ``<figure_head>`` ends up inside ``head``.

    ``head/label`` must contain the label text, and the text of ``head`` must
    be the head text and the label text joined by a newline.
    """
    labeled_lines = [
        ('<figure_head>', get_next_layout_line_for_text('Figure Head 1')),
        ('<label>', get_next_layout_line_for_text('Figure Label 1'))
    ]
    model_data = get_labeled_model_data_list(
        labeled_lines,
        data_generator=get_data_generator()
    )
    tei_root = get_training_tei_xml_for_model_data_iterable(model_data)

    label_texts = get_tei_xpath_text_content_list(tei_root, f'{FIGURE_XPATH}/head/label')
    assert label_texts == ['Figure Label 1']
    expected_head_text = '\n'.join(['Figure Head 1', 'Figure Label 1'])
    head_texts = get_tei_xpath_text_content_list(tei_root, f'{FIGURE_XPATH}/head')
    assert head_texts == [expected_head_text]
def test_should_generate_tei_from_multiple_model_data_lists_using_model_labels(self):
    """Check that two model data lists yield two separate figure nodes.

    Each input list holds one ``<figure_head>`` line; the output must contain
    two nodes matching ``FIGURE_XPATH`` whose ``head`` texts are ``TEXT_1`` and
    ``TEXT_2`` respectively.
    """
    labeled_lines_per_document = [
        [('<figure_head>', get_next_layout_line_for_text(TEXT_1))],
        [('<figure_head>', get_next_layout_line_for_text(TEXT_2))]
    ]
    model_data_per_document = get_labeled_model_data_list_list(
        labeled_lines_per_document,
        data_generator=get_data_generator()
    )
    generator = get_tei_training_data_generator()
    tei_root = generator.get_training_tei_xml_for_multiple_model_data_iterables(
        model_data_per_document
    )
    LOGGER.debug('xml: %r', etree.tostring(tei_root))

    figure_nodes = tei_xpath(tei_root, FIGURE_XPATH)
    assert len(figure_nodes) == 2
    first_figure, second_figure = figure_nodes[0], figure_nodes[1]
    assert get_tei_xpath_text_content_list(first_figure, './head') == [TEXT_1]
    assert get_tei_xpath_text_content_list(second_figure, './head') == [TEXT_2]
def test_should_generate_tei_from_multiple_model_data_lists_using_model_labels(self):
    """Check that two model data lists yield two separate affiliation nodes.

    Each input list holds one ``<institution>`` line; the output must contain
    two nodes matching ``AFFILIATION_XPATH`` whose institution ``orgName``
    texts are ``TEXT_1`` and ``TEXT_2`` respectively.
    """
    labeled_lines_per_document = [
        [('<institution>', get_next_layout_line_for_text(TEXT_1))],
        [('<institution>', get_next_layout_line_for_text(TEXT_2))]
    ]
    model_data_per_document = get_labeled_model_data_list_list(
        labeled_lines_per_document,
        data_generator=get_data_generator()
    )
    generator = AffiliationAddressTeiTrainingDataGenerator()
    tei_root = generator.get_training_tei_xml_for_multiple_model_data_iterables(
        model_data_per_document
    )
    LOGGER.debug('xml: %r', etree.tostring(tei_root))

    affiliation_nodes = tei_xpath(tei_root, AFFILIATION_XPATH)
    assert len(affiliation_nodes) == 2
    first_texts = get_tei_xpath_text_content_list(
        affiliation_nodes[0], './tei:orgName[@type="institution"]'
    )
    assert first_texts == [TEXT_1]
    second_texts = get_tei_xpath_text_content_list(
        affiliation_nodes[1], './tei:orgName[@type="institution"]'
    )
    assert second_texts == [TEXT_2]
def test_should_map_unknown_label_to_note(self):
    """Check that an ``<unknown>`` label becomes ``tei:note[@type="unknown"]``."""
    labeled_lines = [
        ('<unknown>', get_next_layout_line_for_text(TEXT_1))
    ]
    model_data = get_labeled_model_data_list(
        labeled_lines,
        data_generator=get_data_generator()
    )
    tei_root = get_training_tei_xml_for_model_data_iterable(model_data)

    note_texts = get_tei_xpath_text_content_list(
        tei_root, f'{AUTHOR_XPATH}/tei:note[@type="unknown"]'
    )
    assert note_texts == [TEXT_1]
def test_should_map_pubnum_to_idno_with_type_if_detected(
    self,
    test_input: str,
    expected_type: Optional[str]
):
    """Check that a ``<pubnum>`` line becomes ``tei:idno`` with the expected type.

    When ``expected_type`` is truthy the ``idno`` must carry that ``type``
    attribute; otherwise the ``idno`` must have no ``type`` attribute at all.
    In both cases its text must equal ``test_input``.
    """
    labeled_lines = [
        ('<pubnum>', get_next_layout_line_for_text(test_input))
    ]
    model_data = get_labeled_model_data_list(
        labeled_lines,
        data_generator=get_data_generator()
    )
    tei_root = get_training_tei_xml_for_model_data_iterable(model_data)

    # Select the XPath first so that a single assertion covers both cases.
    if expected_type:
        idno_xpath = f'{BIBL_XPATH}/tei:idno[@type="{expected_type}"]'
    else:
        idno_xpath = f'{BIBL_XPATH}/tei:idno[not(@type)]'
    assert get_tei_xpath_text_content_list(tei_root, idno_xpath) == [test_input]
def test_should_add_superscript_text(self):
    """Check that a superscript token is wrapped in ``tei:hi[@rend="superscript"]``.

    The node's overall text must remain the three tokens joined by spaces.
    """
    layout_tokens = [
        LayoutToken(TOKEN_1),
        LayoutToken(TOKEN_2, font=SUPERSCRIPT_FONT_1),
        LayoutToken(TOKEN_3)
    ]
    block = LayoutBlock.for_tokens(layout_tokens)
    node = TEI_E.node(*iter_layout_block_tei_children(block))

    superscript_texts = get_tei_xpath_text_content_list(
        node, './tei:hi[@rend="superscript"]'
    )
    assert superscript_texts == [TOKEN_2]
    assert get_text_content(node) == ' '.join((TOKEN_1, TOKEN_2, TOKEN_3))
def test_should_map_other_label_as_text_without_note(self):
    """Check that an ``<other>`` line is emitted as plain text, not as a note.

    No ``tei:note`` may exist below ``BIBL_XPATH``, and the text content at
    ``BIBL_XPATH`` must be ``TEXT_1`` followed by a newline.
    """
    labeled_lines = [
        ('<other>', get_next_layout_line_for_text(TEXT_1))
    ]
    model_data = get_labeled_model_data_list(
        labeled_lines,
        data_generator=get_data_generator()
    )
    generator = get_tei_training_data_generator()
    tei_root = generator.get_training_tei_xml_for_model_data_iterable(model_data)
    LOGGER.debug('xml: %r', etree.tostring(tei_root))

    note_nodes = tei_xpath(tei_root, f'{BIBL_XPATH}//tei:note')
    assert not note_nodes
    bibl_texts = get_tei_xpath_text_content_list(tei_root, BIBL_XPATH)
    assert bibl_texts == [f'{TEXT_1}\n']
def test_should_generate_tei_for_most_labels(self):
    """Check the element each author-name label is mapped to.

    One line per label (marker, title, forename, middlename, surname,
    suffix) is generated, and each must be found below ``AUTHOR_XPATH``
    under the expected element with exactly its own text.
    """
    labeled_lines = [
        ('<marker>', get_next_layout_line_for_text('Marker 1')),
        ('<title>', get_next_layout_line_for_text('Title 1')),
        ('<forename>', get_next_layout_line_for_text('Forename 1')),
        ('<middlename>', get_next_layout_line_for_text('Middlename 1')),
        ('<surname>', get_next_layout_line_for_text('Surname 1')),
        ('<suffix>', get_next_layout_line_for_text('Suffix 1'))
    ]
    model_data = get_labeled_model_data_list(
        labeled_lines,
        data_generator=get_data_generator()
    )
    tei_root = get_training_tei_xml_for_model_data_iterable(model_data)

    # (xpath, expected single text value), checked in the original order.
    expectations = [
        (f'{AUTHOR_XPATH}/tei:marker', 'Marker 1'),
        (f'{AUTHOR_XPATH}/tei:roleName', 'Title 1'),
        (f'{AUTHOR_XPATH}/tei:forename', 'Forename 1'),
        (f'{AUTHOR_XPATH}/tei:middlename', 'Middlename 1'),
        (f'{AUTHOR_XPATH}/tei:surname', 'Surname 1'),
        (f'{AUTHOR_XPATH}/tei:suffix', 'Suffix 1'),
    ]
    for xpath, expected_text in expectations:
        assert get_tei_xpath_text_content_list(tei_root, xpath) == [expected_text]
def test_should_map_unknown_label_to_note(self):
    """Check that an ``<unknown>`` affiliation label becomes an "unknown" note.

    Exactly one node must match ``AFFILIATION_XPATH``; it must contain a
    ``tei:note[@type="unknown"]`` with ``TEXT_1``, and its full text content
    must be ``TEXT_1`` followed by a newline.
    """
    labeled_lines = [
        ('<unknown>', get_next_layout_line_for_text(TEXT_1))
    ]
    model_data = get_labeled_model_data_list(
        labeled_lines,
        data_generator=get_data_generator()
    )
    generator = AffiliationAddressTeiTrainingDataGenerator()
    tei_root = generator.get_training_tei_xml_for_model_data_iterable(
        model_data,
    )
    LOGGER.debug('xml: %r', etree.tostring(tei_root))

    affiliation_nodes = tei_xpath(tei_root, AFFILIATION_XPATH)
    assert len(affiliation_nodes) == 1
    affiliation = affiliation_nodes[0]
    note_texts = get_tei_xpath_text_content_list(
        affiliation, './tei:note[@type="unknown"]'
    )
    assert note_texts == [TEXT_1]
    assert get_text_content(affiliation) == f'{TEXT_1}\n'
def _check_tei_training_data_generator_output(
    tei_training_data_generator: TeiTrainingDataGenerator,
    output_path: Path,
    expect_raw_data: bool,
    tei_xml_xpath: str,
    tei_expected_values: Sequence[str],
    **kwargs
):
    """Assert that the generated TEI contains the expected values at an XPath.

    The generator, output path, raw-data flag and any extra keyword
    arguments are forwarded to
    ``_check_tei_training_data_generator_output_and_return_xml_root``; the
    text values found at ``tei_xml_xpath`` in the returned XML root are then
    compared with ``tei_expected_values`` after whitespace normalisation on
    both sides.
    """
    xml_root = _check_tei_training_data_generator_output_and_return_xml_root(
        tei_training_data_generator=tei_training_data_generator,
        output_path=output_path,
        expect_raw_data=expect_raw_data,
        **kwargs
    )
    actual_values = normalize_whitespace_list(
        get_tei_xpath_text_content_list(xml_root, tei_xml_xpath)
    )
    expected_values = [
        normalize_whitespace(expected_value)
        for expected_value in tei_expected_values
    ]
    assert actual_values == expected_values
def test_should_not_join_separate_labels(self):
    """Check that two consecutive ``<institution>`` lines stay separate elements.

    A single affiliation node must contain two institution ``orgName``
    elements (``TEXT_1`` and ``TEXT_2``), and its full text content must be
    both texts, each followed by a newline.
    """
    labeled_lines = [
        ('<institution>', get_next_layout_line_for_text(TEXT_1)),
        ('<institution>', get_next_layout_line_for_text(TEXT_2))
    ]
    model_data = get_labeled_model_data_list(
        labeled_lines,
        data_generator=get_data_generator()
    )
    generator = AffiliationAddressTeiTrainingDataGenerator()
    tei_root = generator.get_training_tei_xml_for_model_data_iterable(model_data)
    LOGGER.debug('xml: %r', etree.tostring(tei_root))

    affiliation_nodes = tei_xpath(tei_root, AFFILIATION_XPATH)
    assert len(affiliation_nodes) == 1
    affiliation = affiliation_nodes[0]
    institution_texts = get_tei_xpath_text_content_list(
        affiliation, './tei:orgName[@type="institution"]'
    )
    assert institution_texts == [TEXT_1, TEXT_2]
    assert get_text_content(affiliation) == f'{TEXT_1}\n{TEXT_2}\n'
def test_should_be_able_to_set_title_with_italic_layout_tokens(self):
    """Check that an italic token in the title block is kept as italic ``hi``.

    After setting the title layout block, the document must contain exactly
    one main analytic title node, the italic token must be found under
    ``tei:hi[@rend="italic"]``, and ``get_title`` must return the
    space-joined token text.
    """
    layout_tokens = [
        LayoutToken('rend'),
        LayoutToken('italic1', font=ITALICS_FONT_1),
        LayoutToken('test')
    ]
    document = TeiDocument()
    document.set_title_layout_block(LayoutBlock.for_tokens(layout_tokens))
    LOGGER.debug('xml: %r', etree.tostring(document.root))

    title_nodes = document.root.xpath(
        '//tei:fileDesc/tei:titleStmt/tei:title[@level="a"][@type="main"]',
        namespaces=TEI_NS_MAP
    )
    assert len(title_nodes) == 1
    italic_texts = get_tei_xpath_text_content_list(
        title_nodes[0], './tei:hi[@rend="italic"]'
    )
    assert italic_texts == ['italic1']
    assert document.get_title() == 'rend italic1 test'
def test_should_add_label_to_head_element_without_additional_text(self):
    """Check the table mapping when the ``<label>`` comes first.

    The label must be nested in ``head`` without extra text, and the
    description, content, other and note lines must be found below
    ``TABLE_XPATH`` as ``figDesc``, ``table``, ``other`` and ``note``.
    """
    labeled_lines = [
        ('<label>', get_next_layout_line_for_text('Table Label 1')),
        ('<figDesc>', get_next_layout_line_for_text('Table Desc 1')),
        ('<content>', get_next_layout_line_for_text('Content 1')),
        ('<other>', get_next_layout_line_for_text('Other 1')),
        ('<note>', get_next_layout_line_for_text('Note 1'))
    ]
    model_data = get_labeled_model_data_list(
        labeled_lines,
        data_generator=get_data_generator()
    )
    tei_root = get_training_tei_xml_for_model_data_iterable(model_data)

    # (xpath, expected single text value), checked in the original order.
    expectations = [
        (f'{TABLE_XPATH}/head/label', 'Table Label 1'),
        (f'{TABLE_XPATH}/head', 'Table Label 1'),
        (f'{TABLE_XPATH}/figDesc', 'Table Desc 1'),
        (f'{TABLE_XPATH}/table', 'Content 1'),
        (f'{TABLE_XPATH}/other', 'Other 1'),
        (f'{TABLE_XPATH}/note', 'Note 1'),
    ]
    for xpath, expected_text in expectations:
        assert get_tei_xpath_text_content_list(tei_root, xpath) == [expected_text]
def get_abstract(self) -> str:
    """Return the text of all ``tei:abstract/tei:p`` matches, newline-joined."""
    paragraph_texts = get_tei_xpath_text_content_list(
        self.root,
        '//tei:abstract/tei:p',
    )
    return '\n'.join(paragraph_texts)
def get_title_text(self) -> str:
    """Return the text of the ``tei:head`` elements inside this element.

    Multiple matches are joined with newlines; no match yields an empty
    string.
    """
    # A leading ``//`` is an absolute XPath and is evaluated against the whole
    # tree the element belongs to, so it would also match heads outside of
    # ``self.element``. ``.//`` restricts the search to this element's
    # descendants. (Assumes the helper evaluates the expression on the element
    # it is given, as lxml's ``xpath`` does - confirm against the helper.)
    head_texts = get_tei_xpath_text_content_list(
        self.element,
        './/tei:head',
    )
    return '\n'.join(head_texts)
def get_paragraph_text_list(self) -> List[str]:
    """Return the text of each ``tei:p`` element inside this element."""
    # A leading ``//`` is an absolute XPath and is evaluated against the whole
    # tree the element belongs to, so it would also match paragraphs outside
    # of ``self.element``. ``.//`` restricts the search to this element's
    # descendants. (Assumes the helper evaluates the expression on the element
    # it is given, as lxml's ``xpath`` does - confirm against the helper.)
    return get_tei_xpath_text_content_list(
        self.element,
        './/tei:p',
    )
def get_title(self) -> str:
    """Return the text of the main analytic title(s), newline-joined.

    Titles are looked up in ``self.root`` at
    ``tei:fileDesc/tei:titleStmt/tei:title`` with ``level="a"`` and
    ``type="main"``.
    """
    title_texts = get_tei_xpath_text_content_list(
        self.root,
        '//tei:fileDesc/tei:titleStmt/tei:title[@level="a"][@type="main"]',
    )
    return '\n'.join(title_texts)
def test_should_generate_tei_for_most_labels(self):
    """Check the element each affiliation/address label is mapped to.

    A single affiliation is generated from one line per label, with ``O``
    comma lines between the address parts. Every label must appear in the
    expected element of the one affiliation node, and the text of
    ``tei:address`` must be the address parts joined by ``'\\n,\\n'``.
    """
    labeled_lines = [
        ('<marker>', get_next_layout_line_for_text('Marker 1')),
        ('<institution>', get_next_layout_line_for_text('Institution 1')),
        ('<department>', get_next_layout_line_for_text('Department 1')),
        ('<laboratory>', get_next_layout_line_for_text('Laboratory 1')),
        ('<addrLine>', get_next_layout_line_for_text('AddrLine 1')),
        ('O', get_next_layout_line_for_text(',')),
        ('<postCode>', get_next_layout_line_for_text('PostCode 1')),
        ('O', get_next_layout_line_for_text(',')),
        ('<postBox>', get_next_layout_line_for_text('PostBox 1')),
        ('O', get_next_layout_line_for_text(',')),
        ('<region>', get_next_layout_line_for_text('Region 1')),
        ('O', get_next_layout_line_for_text(',')),
        ('<settlement>', get_next_layout_line_for_text('Settlement 1')),
        ('O', get_next_layout_line_for_text(',')),
        ('<country>', get_next_layout_line_for_text('Country 1'))
    ]
    model_data = get_labeled_model_data_list(
        labeled_lines,
        data_generator=get_data_generator()
    )
    generator = AffiliationAddressTeiTrainingDataGenerator()
    tei_root = generator.get_training_tei_xml_for_model_data_iterable(model_data)
    LOGGER.debug('xml: %r', etree.tostring(tei_root))

    affiliation_nodes = tei_xpath(tei_root, AFFILIATION_XPATH)
    assert len(affiliation_nodes) == 1

    # (xpath relative to the affiliation, expected single text value),
    # checked in the original order.
    expectations = [
        ('./tei:marker', 'Marker 1'),
        ('./tei:orgName[@type="institution"]', 'Institution 1'),
        ('./tei:orgName[@type="department"]', 'Department 1'),
        ('./tei:orgName[@type="laboratory"]', 'Laboratory 1'),
        ('./tei:address/tei:addrLine', 'AddrLine 1'),
        ('./tei:address/tei:postCode', 'PostCode 1'),
        ('./tei:address/tei:postBox', 'PostBox 1'),
        ('./tei:address/tei:region', 'Region 1'),
        ('./tei:address/tei:settlement', 'Settlement 1'),
        ('./tei:address/tei:country', 'Country 1'),
    ]
    for xpath, expected_text in expectations:
        assert get_tei_xpath_text_content_list(
            affiliation_nodes[0], xpath
        ) == [expected_text]

    expected_address_text = '\n,\n'.join([
        'AddrLine 1',
        'PostCode 1',
        'PostBox 1',
        'Region 1',
        'Settlement 1',
        'Country 1'
    ])
    address_texts = get_tei_xpath_text_content_list(
        affiliation_nodes[0], './tei:address'
    )
    assert address_texts == [expected_address_text]
def test_should_generate_tei_for_most_labels(self):
    """Check the element each bibliographic label is mapped to.

    One line per label is generated, and each must be found below
    ``BIBL_XPATH`` under the expected element (and attribute, where
    applicable) with exactly its own text.
    """
    labeled_lines = [
        ('<title>', get_next_layout_line_for_text('Title 1')),
        ('<author>', get_next_layout_line_for_text('Author 1')),
        ('<editor>', get_next_layout_line_for_text('Editor 1')),
        ('<institution>', get_next_layout_line_for_text('Institution 1')),
        ('<collaboration>', get_next_layout_line_for_text('Collaboration 1')),
        ('<journal>', get_next_layout_line_for_text('Journal 1')),
        ('<series>', get_next_layout_line_for_text('Series 1')),
        ('<booktitle>', get_next_layout_line_for_text('Book Title 1')),
        ('<date>', get_next_layout_line_for_text('Date 1')),
        ('<volume>', get_next_layout_line_for_text('Volume 1')),
        ('<issue>', get_next_layout_line_for_text('Issue 1')),
        ('<pages>', get_next_layout_line_for_text('Pages 1')),
        ('<publisher>', get_next_layout_line_for_text('Publisher 1')),
        ('<location>', get_next_layout_line_for_text('Location 1')),
        ('<tech>', get_next_layout_line_for_text('Tech 1')),
        ('<pubnum>', get_next_layout_line_for_text('Pubnum 1')),
        ('<web>', get_next_layout_line_for_text('Web 1')),
        ('<note>', get_next_layout_line_for_text('Note 1'))
    ]
    model_data = get_labeled_model_data_list(
        labeled_lines,
        data_generator=get_data_generator()
    )
    tei_root = get_training_tei_xml_for_model_data_iterable(model_data)

    # (xpath, expected single text value), checked in the original order.
    expectations = [
        (f'{BIBL_XPATH}/tei:title[@level="a"]', 'Title 1'),
        (f'{BIBL_XPATH}/tei:author', 'Author 1'),
        (f'{BIBL_XPATH}/tei:editor', 'Editor 1'),
        (f'{BIBL_XPATH}/tei:orgName[not(@type)]', 'Institution 1'),
        (f'{BIBL_XPATH}/tei:orgName[@type="collaboration"]', 'Collaboration 1'),
        (f'{BIBL_XPATH}/tei:title[@level="j"]', 'Journal 1'),
        (f'{BIBL_XPATH}/tei:title[@level="s"]', 'Series 1'),
        (f'{BIBL_XPATH}/tei:title[@level="m"]', 'Book Title 1'),
        (f'{BIBL_XPATH}/tei:date', 'Date 1'),
        (f'{BIBL_XPATH}/tei:biblScope[@unit="volume"]', 'Volume 1'),
        (f'{BIBL_XPATH}/tei:biblScope[@unit="issue"]', 'Issue 1'),
        (f'{BIBL_XPATH}/tei:biblScope[@unit="page"]', 'Pages 1'),
        (f'{BIBL_XPATH}/tei:publisher', 'Publisher 1'),
        (f'{BIBL_XPATH}/tei:pubPlace', 'Location 1'),
        (f'{BIBL_XPATH}/tei:note[@type="report"]', 'Tech 1'),
        (f'{BIBL_XPATH}/tei:idno', 'Pubnum 1'),
        (f'{BIBL_XPATH}/tei:ptr[@type="web"]', 'Web 1'),
        (f'{BIBL_XPATH}/tei:note[not(@type)]', 'Note 1'),
    ]
    for xpath, expected_text in expectations:
        assert get_tei_xpath_text_content_list(tei_root, xpath) == [expected_text]