def test_should_map_other_label_as_text_without_note(self):
     """Verify that an `<other>` label is written out as plain text.

     Neither a `tei:note` nor a `tei:other` element may appear below the
     nodes matched by `AUTHOR_XPATH`; the text content must be the line
     text followed by a line feed.
     """
     labeled_lines = [
         ('<other>', get_next_layout_line_for_text(TEXT_1))
     ]
     model_data = get_labeled_model_data_list(
         labeled_lines,
         data_generator=get_data_generator()
     )
     generator = get_tei_training_data_generator()
     tei_root = generator.get_training_tei_xml_for_model_data_iterable(
         model_data
     )
     LOGGER.debug('xml: %r', etree.tostring(tei_root))
     note_nodes = tei_xpath(tei_root, f'{AUTHOR_XPATH}//tei:note')
     assert not note_nodes
     other_nodes = tei_xpath(tei_root, f'{AUTHOR_XPATH}//tei:other')
     assert not other_nodes
     author_texts = get_tei_xpath_text_content_list(tei_root, AUTHOR_XPATH)
     assert author_texts == [f'{TEXT_1}\n']
 def test_should_lb_elements_before_line_feeds(self):
     """Verify that a `tei:lb` element precedes every line feed.

     Two lines ending in a line feed must yield two `tei:lb` children;
     the first one follows TEXT_1 and is followed by a line feed and
     TEXT_2.
     """
     first_line = LayoutLine.for_text(TEXT_1, tail_whitespace='\n')
     second_line = LayoutLine.for_text(TEXT_2, tail_whitespace='\n')
     block = LayoutBlock(lines=[first_line, second_line])
     document = LayoutDocument.for_blocks([block])
     tei_root = get_training_tei_xml_for_layout_document(document)
     author_nodes = tei_xpath(tei_root, AUTHOR_XPATH)
     assert len(author_nodes) == 1
     line_breaks = tei_xpath(author_nodes[0], 'tei:lb')
     assert len(line_breaks) == 2
     first_lb = line_breaks[0]
     assert first_lb.getparent().text == TEXT_1
     assert first_lb.tail == '\n' + TEXT_2
 def test_should_not_create_person_name_for_note(self):
     """Verify that only the two labelled names produce author nodes.

     The input mixes 'O' tokens ('before', ',', 'after') with two
     forename/surname/marker groups; exactly two nodes must match
     `AUTHOR_XPATH`, carrying the forenames 'John' and 'Maria'.
     """
     labeled_lines = [
         ('O', get_next_layout_line_for_text('before')),
         ('<forename>', get_next_layout_line_for_text('John')),
         ('<surname>', get_next_layout_line_for_text('Smith')),
         ('<marker>', get_next_layout_line_for_text('1')),
         ('O', get_next_layout_line_for_text(',')),
         ('<forename>', get_next_layout_line_for_text('Maria')),
         ('<surname>', get_next_layout_line_for_text('Madison')),
         ('<marker>', get_next_layout_line_for_text('2')),
         ('O', get_next_layout_line_for_text('after')),
     ]
     model_data_lists = get_labeled_model_data_list_list(
         [labeled_lines],
         data_generator=get_data_generator()
     )
     generator = get_tei_training_data_generator()
     tei_root = (
         generator.get_training_tei_xml_for_multiple_model_data_iterables(
             model_data_lists
         )
     )
     LOGGER.debug('xml: %r', etree.tostring(tei_root))
     author_nodes = tei_xpath(tei_root, AUTHOR_XPATH)
     assert len(author_nodes) == 2
     first_forenames = get_tei_xpath_text_content_list(
         author_nodes[0], './tei:forename'
     )
     assert first_forenames == ['John']
     second_forenames = get_tei_xpath_text_content_list(
         author_nodes[1], './tei:forename'
     )
     assert second_forenames == ['Maria']
예제 #4
0
 def test_should_include_layout_document_text_in_tei_output(self):
     """Verify that the layout text shows up under `FIGURE_XPATH`."""
     block = LayoutBlock.for_text(TEXT_1)
     document = LayoutDocument.for_blocks([block])
     tei_root = get_training_tei_xml_for_layout_document(document)
     figure_nodes = tei_xpath(tei_root, FIGURE_XPATH)
     assert len(figure_nodes) == 1
     # Trailing whitespace is ignored for the comparison.
     figure_text = get_text_content(figure_nodes[0]).rstrip()
     assert figure_text == TEXT_1
예제 #5
0
 def test_should_generate_tei_for_most_labels(self):
     """Verify the TEI element produced for each affiliation label.

     One line per label is fed through the affiliation/address training
     data generator; the single resulting affiliation node must expose
     each text via the expected child element, and the address element
     must contain the address parts joined by the ',' separator lines.
     """
     label_and_text_pairs = [
         ('<marker>', 'Marker 1'),
         ('<institution>', 'Institution 1'),
         ('<department>', 'Department 1'),
         ('<laboratory>', 'Laboratory 1'),
         ('<addrLine>', 'AddrLine 1'),
         ('O', ','),
         ('<postCode>', 'PostCode 1'),
         ('O', ','),
         ('<postBox>', 'PostBox 1'),
         ('O', ','),
         ('<region>', 'Region 1'),
         ('O', ','),
         ('<settlement>', 'Settlement 1'),
         ('O', ','),
         ('<country>', 'Country 1'),
     ]
     # Layout lines are created in list order, exactly one per pair.
     labeled_lines = [
         (label, get_next_layout_line_for_text(text))
         for label, text in label_and_text_pairs
     ]
     model_data = get_labeled_model_data_list(
         labeled_lines,
         data_generator=get_data_generator()
     )
     generator = AffiliationAddressTeiTrainingDataGenerator()
     tei_root = generator.get_training_tei_xml_for_model_data_iterable(
         model_data
     )
     LOGGER.debug('xml: %r', etree.tostring(tei_root))
     affiliation_nodes = tei_xpath(tei_root, AFFILIATION_XPATH)
     assert len(affiliation_nodes) == 1
     affiliation = affiliation_nodes[0]
     expected_address_text = '\n,\n'.join([
         'AddrLine 1',
         'PostCode 1',
         'PostBox 1',
         'Region 1',
         'Settlement 1',
         'Country 1',
     ])
     expected_texts_by_xpath = [
         ('./tei:marker', ['Marker 1']),
         ('./tei:orgName[@type="institution"]', ['Institution 1']),
         ('./tei:orgName[@type="department"]', ['Department 1']),
         ('./tei:orgName[@type="laboratory"]', ['Laboratory 1']),
         ('./tei:address/tei:addrLine', ['AddrLine 1']),
         ('./tei:address/tei:postCode', ['PostCode 1']),
         ('./tei:address/tei:postBox', ['PostBox 1']),
         ('./tei:address/tei:region', ['Region 1']),
         ('./tei:address/tei:settlement', ['Settlement 1']),
         ('./tei:address/tei:country', ['Country 1']),
         ('./tei:address', [expected_address_text]),
     ]
     for xpath, expected_texts in expected_texts_by_xpath:
         actual_texts = get_tei_xpath_text_content_list(affiliation, xpath)
         assert actual_texts == expected_texts
예제 #6
0
 def test_should_lb_elements_before_line_feeds(self):
     """Verify that a `tei:lb` element precedes every line feed.

     Uses the affiliation/address generator: two lines ending in a line
     feed must yield two `tei:lb` children of the affiliation node, the
     first one sitting between TEXT_1 and the line feed before TEXT_2.
     """
     generator = AffiliationAddressTeiTrainingDataGenerator()
     first_line = LayoutLine.for_text(TEXT_1, tail_whitespace='\n')
     second_line = LayoutLine.for_text(TEXT_2, tail_whitespace='\n')
     document = LayoutDocument.for_blocks([
         LayoutBlock(lines=[first_line, second_line])
     ])
     model_data = get_model_data_list_for_layout_document(
         document,
         data_generator=get_data_generator()
     )
     tei_root = generator.get_training_tei_xml_for_model_data_iterable(
         model_data
     )
     affiliation_nodes = tei_xpath(tei_root, AFFILIATION_XPATH)
     assert len(affiliation_nodes) == 1
     line_breaks = tei_xpath(affiliation_nodes[0], 'tei:lb')
     assert len(line_breaks) == 2
     first_lb = line_breaks[0]
     assert first_lb.getparent().text == TEXT_1
     assert first_lb.tail == '\n' + TEXT_2
예제 #7
0
 def get_post_processed_xml_root(self, xml_root: etree.ElementBase):
     """Set the `type` attribute on `tei:idno` elements where detectable.

     For every `tei:idno` element in `xml_root`, the element's text
     content is passed to the identifier type detection. A truthy
     result is stored as the element's `type` attribute; elements with
     no detected type are left unchanged.

     Returns the same `xml_root` instance, modified in place.
     """
     for idno in tei_xpath(xml_root, '//tei:idno'):
         idno_text = get_text_content(idno)
         detected_type = get_detected_external_identifier_type_for_text(
             idno_text
         )
         if detected_type:
             idno.attrib['type'] = detected_type
     return xml_root
 def test_should_include_layout_document_text_in_tei_output(self):
     """Verify the root tag and that the text shows up under `AUTHOR_XPATH`."""
     block = LayoutBlock.for_text(TEXT_1)
     document = LayoutDocument.for_blocks([block])
     tei_root = get_training_tei_xml_for_layout_document(document)
     expected_root_tag = f'{TEI_NS_PREFIX}TEI'
     assert tei_root.tag == expected_root_tag
     author_nodes = tei_xpath(tei_root, AUTHOR_XPATH)
     assert len(author_nodes) == 1
     # Trailing whitespace is ignored for the comparison.
     author_text = get_text_content(author_nodes[0]).rstrip()
     assert author_text == TEXT_1
 def test_should_generate_tei_from_model_data(self):
     """Verify TEI generation from a model data iterable.

     The model data is obtained from the data generator for a two-line
     layout document; the single node matching `AUTHOR_XPATH` must hold
     two `tei:lb` elements, the first one between TEXT_1 and TEXT_2.
     """
     first_line = get_next_layout_line_for_text(TEXT_1)
     second_line = get_next_layout_line_for_text(TEXT_2)
     document = LayoutDocument.for_blocks([
         LayoutBlock(lines=[first_line, second_line])
     ])
     model_data_source = get_data_generator()
     model_data = model_data_source.iter_model_data_for_layout_document(
         document
     )
     tei_root = get_training_tei_xml_for_model_data_iterable(model_data)
     author_nodes = tei_xpath(tei_root, AUTHOR_XPATH)
     assert len(author_nodes) == 1
     line_breaks = tei_xpath(author_nodes[0], 'tei:lb')
     assert len(line_breaks) == 2
     first_lb = line_breaks[0]
     assert first_lb.getparent().text == TEXT_1
     assert first_lb.tail == '\n' + TEXT_2
 def test_should_keep_original_whitespace(self):
     """Verify that irregular spacing between tokens is preserved."""
     text = 'Token1, Token2  ,Token3'
     line = LayoutLine.for_text(text, tail_whitespace='\n')
     document = LayoutDocument.for_blocks([LayoutBlock(lines=[line])])
     tei_root = get_training_tei_xml_for_layout_document(document)
     author_nodes = tei_xpath(tei_root, AUTHOR_XPATH)
     assert len(author_nodes) == 1
     # Only trailing whitespace is stripped; inner spacing must match.
     author_text = get_text_content(author_nodes[0]).rstrip()
     assert author_text == text
 def test_should_include_layout_document_text_in_tei_output(self):
     """Verify that the layout text shows up under `./text/listBibl`."""
     generator = get_tei_training_data_generator()
     document = LayoutDocument.for_blocks([LayoutBlock.for_text(TEXT_1)])
     model_data = get_model_data_list_for_layout_document(
         document,
         data_generator=get_data_generator()
     )
     tei_root = generator.get_training_tei_xml_for_model_data_iterable(
         model_data
     )
     LOGGER.debug('xml: %r', etree.tostring(tei_root))
     list_bibl_nodes = tei_xpath(tei_root, './text/listBibl')
     assert len(list_bibl_nodes) == 1
     # Trailing whitespace is ignored for the comparison.
     list_bibl_text = get_text_content(list_bibl_nodes[0]).rstrip()
     assert list_bibl_text == TEXT_1
예제 #12
0
 def test_should_generate_tei_from_model_data(self):
     """Verify affiliation TEI generation from a model data iterable.

     The model data is obtained from the data generator for a two-line
     layout document; the single affiliation node must hold two
     `tei:lb` elements, the first one between TEXT_1 and TEXT_2.
     """
     first_line = get_next_layout_line_for_text(TEXT_1)
     second_line = get_next_layout_line_for_text(TEXT_2)
     document = LayoutDocument.for_blocks([
         LayoutBlock(lines=[first_line, second_line])
     ])
     model_data_source = get_data_generator()
     model_data = model_data_source.iter_model_data_for_layout_document(
         document
     )
     generator = AffiliationAddressTeiTrainingDataGenerator()
     tei_root = generator.get_training_tei_xml_for_model_data_iterable(
         model_data
     )
     LOGGER.debug('xml: %r', etree.tostring(tei_root))
     affiliation_nodes = tei_xpath(tei_root, AFFILIATION_XPATH)
     assert len(affiliation_nodes) == 1
     line_breaks = tei_xpath(affiliation_nodes[0], 'tei:lb')
     assert len(line_breaks) == 2
     first_lb = line_breaks[0]
     assert first_lb.getparent().text == TEXT_1
     assert first_lb.tail == '\n' + TEXT_2
예제 #13
0
 def test_should_include_layout_document_text_in_tei_output(self):
     """Verify that the layout text shows up in the affiliation node."""
     generator = AffiliationAddressTeiTrainingDataGenerator()
     document = LayoutDocument.for_blocks([LayoutBlock.for_text(TEXT_1)])
     model_data = get_model_data_list_for_layout_document(
         document,
         data_generator=get_data_generator()
     )
     tei_root = generator.get_training_tei_xml_for_model_data_iterable(
         model_data
     )
     LOGGER.debug('xml: %r', etree.tostring(tei_root))
     affiliation_nodes = tei_xpath(tei_root, AFFILIATION_XPATH)
     assert len(affiliation_nodes) == 1
     # Trailing whitespace is ignored for the comparison.
     affiliation_text = get_text_content(affiliation_nodes[0]).rstrip()
     assert affiliation_text == TEXT_1
 def test_should_add_line_feeds(self):
     """Verify that consecutive lines are separated by a line feed."""
     first_line = LayoutLine.for_text(TEXT_1, tail_whitespace='\n')
     second_line = LayoutLine.for_text(TEXT_2, tail_whitespace='\n')
     document = LayoutDocument.for_blocks([
         LayoutBlock(lines=[first_line, second_line])
     ])
     tei_root = get_training_tei_xml_for_layout_document(document)
     author_nodes = tei_xpath(tei_root, AUTHOR_XPATH)
     assert len(author_nodes) == 1
     # Trailing whitespace is ignored for the comparison.
     author_text = get_text_content(author_nodes[0]).rstrip()
     expected_text = '\n'.join([TEXT_1, TEXT_2])
     assert author_text == expected_text
예제 #15
0
 def test_should_keep_original_whitespace(self):
     """Verify that irregular token spacing survives in the affiliation."""
     generator = AffiliationAddressTeiTrainingDataGenerator()
     text = 'Token1, Token2  ,Token3'
     line = LayoutLine.for_text(text, tail_whitespace='\n')
     document = LayoutDocument.for_blocks([LayoutBlock(lines=[line])])
     model_data = get_model_data_list_for_layout_document(
         document,
         data_generator=get_data_generator()
     )
     tei_root = generator.get_training_tei_xml_for_model_data_iterable(
         model_data
     )
     affiliation_nodes = tei_xpath(tei_root, AFFILIATION_XPATH)
     assert len(affiliation_nodes) == 1
     # Only trailing whitespace is stripped; inner spacing must match.
     affiliation_text = get_text_content(affiliation_nodes[0]).rstrip()
     assert affiliation_text == text
예제 #16
0
 def test_should_map_unknown_label_to_note(self):
     """Verify that an unrecognised label becomes a typed `tei:note`.

     A line labelled `<unknown>` must appear as
     `tei:note[@type="unknown"]` inside the affiliation node, whose
     full text content is the line text followed by a line feed.
     """
     labeled_lines = [
         ('<unknown>', get_next_layout_line_for_text(TEXT_1))
     ]
     model_data = get_labeled_model_data_list(
         labeled_lines,
         data_generator=get_data_generator()
     )
     generator = AffiliationAddressTeiTrainingDataGenerator()
     tei_root = generator.get_training_tei_xml_for_model_data_iterable(
         model_data
     )
     LOGGER.debug('xml: %r', etree.tostring(tei_root))
     affiliation_nodes = tei_xpath(tei_root, AFFILIATION_XPATH)
     assert len(affiliation_nodes) == 1
     affiliation = affiliation_nodes[0]
     note_texts = get_tei_xpath_text_content_list(
         affiliation, './tei:note[@type="unknown"]'
     )
     assert note_texts == [TEXT_1]
     assert get_text_content(affiliation) == f'{TEXT_1}\n'
예제 #17
0
 def test_should_add_line_feeds(self):
     """Verify that affiliation lines are separated by a line feed."""
     generator = AffiliationAddressTeiTrainingDataGenerator()
     first_line = LayoutLine.for_text(TEXT_1, tail_whitespace='\n')
     second_line = LayoutLine.for_text(TEXT_2, tail_whitespace='\n')
     document = LayoutDocument.for_blocks([
         LayoutBlock(lines=[first_line, second_line])
     ])
     model_data = get_model_data_list_for_layout_document(
         document,
         data_generator=get_data_generator()
     )
     tei_root = generator.get_training_tei_xml_for_model_data_iterable(
         model_data
     )
     affiliation_nodes = tei_xpath(tei_root, AFFILIATION_XPATH)
     assert len(affiliation_nodes) == 1
     # Trailing whitespace is ignored for the comparison.
     affiliation_text = get_text_content(affiliation_nodes[0]).rstrip()
     expected_text = '\n'.join([TEXT_1, TEXT_2])
     assert affiliation_text == expected_text
예제 #18
0
 def test_should_generate_tei_from_multiple_model_data_lists_using_model_labels(
         self):
     """Verify that each model data list yields its own figure node.

     Two single-line lists labelled `<figure_head>` must produce two
     nodes matching `FIGURE_XPATH`, each with the respective text in
     its `head` child.
     """
     first_list = [('<figure_head>', get_next_layout_line_for_text(TEXT_1))]
     second_list = [('<figure_head>', get_next_layout_line_for_text(TEXT_2))]
     model_data_lists = get_labeled_model_data_list_list(
         [first_list, second_list],
         data_generator=get_data_generator()
     )
     generator = get_tei_training_data_generator()
     tei_root = (
         generator.get_training_tei_xml_for_multiple_model_data_iterables(
             model_data_lists
         )
     )
     LOGGER.debug('xml: %r', etree.tostring(tei_root))
     figure_nodes = tei_xpath(tei_root, FIGURE_XPATH)
     assert len(figure_nodes) == 2
     first_heads = get_tei_xpath_text_content_list(figure_nodes[0], './head')
     assert first_heads == [TEXT_1]
     second_heads = get_tei_xpath_text_content_list(figure_nodes[1], './head')
     assert second_heads == [TEXT_2]
예제 #19
0
 def test_should_not_join_separate_labels(self):
     """Verify that two same-label lines stay separate elements.

     Two consecutive `<institution>` lines must produce two institution
     `tei:orgName` elements inside a single affiliation node, not one
     merged element.
     """
     labeled_lines = [
         ('<institution>', get_next_layout_line_for_text(TEXT_1)),
         ('<institution>', get_next_layout_line_for_text(TEXT_2)),
     ]
     model_data = get_labeled_model_data_list(
         labeled_lines,
         data_generator=get_data_generator()
     )
     generator = AffiliationAddressTeiTrainingDataGenerator()
     tei_root = generator.get_training_tei_xml_for_model_data_iterable(
         model_data
     )
     LOGGER.debug('xml: %r', etree.tostring(tei_root))
     affiliation_nodes = tei_xpath(tei_root, AFFILIATION_XPATH)
     assert len(affiliation_nodes) == 1
     affiliation = affiliation_nodes[0]
     institution_texts = get_tei_xpath_text_content_list(
         affiliation, './tei:orgName[@type="institution"]'
     )
     assert institution_texts == [TEXT_1, TEXT_2]
     assert get_text_content(affiliation) == f'{TEXT_1}\n{TEXT_2}\n'
예제 #20
0
 def test_should_generate_tei_from_multiple_model_data_lists_using_model_labels(
         self):
     """Verify that each model data list yields its own affiliation node.

     Two single-line lists labelled `<institution>` must produce two
     affiliation nodes, each with the respective text in its
     institution `tei:orgName` child.
     """
     first_list = [('<institution>', get_next_layout_line_for_text(TEXT_1))]
     second_list = [('<institution>', get_next_layout_line_for_text(TEXT_2))]
     model_data_lists = get_labeled_model_data_list_list(
         [first_list, second_list],
         data_generator=get_data_generator()
     )
     generator = AffiliationAddressTeiTrainingDataGenerator()
     tei_root = (
         generator.get_training_tei_xml_for_multiple_model_data_iterables(
             model_data_lists
         )
     )
     LOGGER.debug('xml: %r', etree.tostring(tei_root))
     affiliation_nodes = tei_xpath(tei_root, AFFILIATION_XPATH)
     assert len(affiliation_nodes) == 2
     first_institutions = get_tei_xpath_text_content_list(
         affiliation_nodes[0], './tei:orgName[@type="institution"]'
     )
     assert first_institutions == [TEXT_1]
     second_institutions = get_tei_xpath_text_content_list(
         affiliation_nodes[1], './tei:orgName[@type="institution"]'
     )
     assert second_institutions == [TEXT_2]
예제 #21
0
 def get_body_sections(self) -> List[TeiSection]:
     """Return a `TeiSection` for each `tei:div` child of the body element.

     The sections are wrapped in the order in which the XPath query
     returns the `./tei:div` elements.
     """
     body_element = self.get_body_element()
     div_elements = tei_xpath(body_element, './tei:div')
     return list(map(TeiSection, div_elements))