def test_should_generate_tei_for_most_labels(self):
    """Check that each labelled line ends up in its matching TEI element.

    One layout line is created per label (with ``O``-labelled comma lines
    between the address parts). The generated training TEI must contain a
    single affiliation node whose child elements hold the expected text.
    """
    labels_and_texts = [
        ('<marker>', 'Marker 1'),
        ('<institution>', 'Institution 1'),
        ('<department>', 'Department 1'),
        ('<laboratory>', 'Laboratory 1'),
        ('<addrLine>', 'AddrLine 1'),
        ('O', ','),
        ('<postCode>', 'PostCode 1'),
        ('O', ','),
        ('<postBox>', 'PostBox 1'),
        ('O', ','),
        ('<region>', 'Region 1'),
        ('O', ','),
        ('<settlement>', 'Settlement 1'),
        ('O', ','),
        ('<country>', 'Country 1'),
    ]
    # Layout lines are requested in list order, one call per entry.
    label_and_layout_line_list = [
        (label, get_next_layout_line_for_text(text))
        for label, text in labels_and_texts
    ]
    model_data_list = get_labeled_model_data_list(
        label_and_layout_line_list,
        data_generator=get_data_generator()
    )
    tei_generator = AffiliationAddressTeiTrainingDataGenerator()
    tei_root = tei_generator.get_training_tei_xml_for_model_data_iterable(
        model_data_list
    )
    LOGGER.debug('xml: %r', etree.tostring(tei_root))

    affiliations = tei_xpath(tei_root, AFFILIATION_XPATH)
    assert len(affiliations) == 1
    affiliation = affiliations[0]

    # XPath (relative to the affiliation node) -> the single expected text.
    xpath_and_expected_text = [
        ('./tei:marker', 'Marker 1'),
        ('./tei:orgName[@type="institution"]', 'Institution 1'),
        ('./tei:orgName[@type="department"]', 'Department 1'),
        ('./tei:orgName[@type="laboratory"]', 'Laboratory 1'),
        ('./tei:address/tei:addrLine', 'AddrLine 1'),
        ('./tei:address/tei:postCode', 'PostCode 1'),
        ('./tei:address/tei:postBox', 'PostBox 1'),
        ('./tei:address/tei:region', 'Region 1'),
        ('./tei:address/tei:settlement', 'Settlement 1'),
        ('./tei:address/tei:country', 'Country 1'),
    ]
    for xpath, expected_text in xpath_and_expected_text:
        assert get_tei_xpath_text_content_list(
            affiliation, xpath
        ) == [expected_text]

    # The address element as a whole keeps the comma lines between its parts.
    address_parts = [
        'AddrLine 1',
        'PostCode 1',
        'PostBox 1',
        'Region 1',
        'Settlement 1',
        'Country 1',
    ]
    assert get_tei_xpath_text_content_list(
        affiliation, './tei:address'
    ) == ['\n,\n'.join(address_parts)]
def test_should_include_layout_document_text_in_tei_output(self):
    """Check that the affiliation node contains the layout document text.

    A document with a single block built from ``TEXT_1`` is converted to
    training TEI; the text content of the only affiliation node (ignoring
    trailing whitespace) must equal ``TEXT_1``.
    """
    tei_generator = AffiliationAddressTeiTrainingDataGenerator()
    single_block = LayoutBlock.for_text(TEXT_1)
    document = LayoutDocument.for_blocks([single_block])
    model_data_list = get_model_data_list_for_layout_document(
        document,
        data_generator=get_data_generator()
    )
    tei_root = tei_generator.get_training_tei_xml_for_model_data_iterable(
        model_data_list
    )
    LOGGER.debug('xml: %r', etree.tostring(tei_root))

    affiliations = tei_xpath(tei_root, AFFILIATION_XPATH)
    assert len(affiliations) == 1
    affiliation_text = get_text_content(affiliations[0])
    assert affiliation_text.rstrip() == TEXT_1
def test_should_keep_original_whitespace(self):
    """Check that irregular spacing around tokens survives TEI generation.

    The input line mixes "comma + space" and "space + comma" separators; the
    affiliation text (ignoring trailing whitespace) must match it exactly.
    """
    tei_generator = AffiliationAddressTeiTrainingDataGenerator()
    text = 'Token1, Token2 ,Token3'
    line = LayoutLine.for_text(text, tail_whitespace='\n')
    document = LayoutDocument.for_blocks([LayoutBlock(lines=[line])])
    model_data_list = get_model_data_list_for_layout_document(
        document,
        data_generator=get_data_generator()
    )
    tei_root = tei_generator.get_training_tei_xml_for_model_data_iterable(
        model_data_list
    )

    affiliations = tei_xpath(tei_root, AFFILIATION_XPATH)
    assert len(affiliations) == 1
    affiliation_text = get_text_content(affiliations[0])
    assert affiliation_text.rstrip() == text
def test_should_map_unknown_label_to_note(self):
    """Check that an ``<unknown>`` label is emitted as a typed note element.

    The single labelled line must appear under ``tei:note[@type="unknown"]``
    and the affiliation's full text must be the line followed by a newline.
    """
    unknown_line = get_next_layout_line_for_text(TEXT_1)
    model_data_list = get_labeled_model_data_list(
        [('<unknown>', unknown_line)],
        data_generator=get_data_generator()
    )
    tei_generator = AffiliationAddressTeiTrainingDataGenerator()
    tei_root = tei_generator.get_training_tei_xml_for_model_data_iterable(
        model_data_list
    )
    LOGGER.debug('xml: %r', etree.tostring(tei_root))

    affiliations = tei_xpath(tei_root, AFFILIATION_XPATH)
    assert len(affiliations) == 1
    affiliation = affiliations[0]
    note_texts = get_tei_xpath_text_content_list(
        affiliation, './tei:note[@type="unknown"]'
    )
    assert note_texts == [TEXT_1]
    assert get_text_content(affiliation) == f'{TEXT_1}\n'
def test_should_add_line_feeds(self):
    """Check that separate layout lines are separated by line feeds.

    Two lines in one block must produce affiliation text equal to both texts
    joined by a newline (trailing whitespace ignored).
    """
    tei_generator = AffiliationAddressTeiTrainingDataGenerator()
    first_line = LayoutLine.for_text(TEXT_1, tail_whitespace='\n')
    second_line = LayoutLine.for_text(TEXT_2, tail_whitespace='\n')
    document = LayoutDocument.for_blocks([
        LayoutBlock(lines=[first_line, second_line])
    ])
    model_data_list = get_model_data_list_for_layout_document(
        document,
        data_generator=get_data_generator()
    )
    tei_root = tei_generator.get_training_tei_xml_for_model_data_iterable(
        model_data_list
    )

    affiliations = tei_xpath(tei_root, AFFILIATION_XPATH)
    assert len(affiliations) == 1
    expected_text = '\n'.join([TEXT_1, TEXT_2])
    assert get_text_content(affiliations[0]).rstrip() == expected_text
def test_should_lb_elements_before_line_feeds(self):
    """Check that an ``lb`` element is placed in front of each line feed.

    For two layout lines, two ``tei:lb`` children are expected. The first
    one follows the text of the first line, and its tail is the line feed
    plus the text of the second line.
    """
    tei_generator = AffiliationAddressTeiTrainingDataGenerator()
    first_line = LayoutLine.for_text(TEXT_1, tail_whitespace='\n')
    second_line = LayoutLine.for_text(TEXT_2, tail_whitespace='\n')
    document = LayoutDocument.for_blocks([
        LayoutBlock(lines=[first_line, second_line])
    ])
    model_data_list = get_model_data_list_for_layout_document(
        document,
        data_generator=get_data_generator()
    )
    tei_root = tei_generator.get_training_tei_xml_for_model_data_iterable(
        model_data_list
    )

    affiliations = tei_xpath(tei_root, AFFILIATION_XPATH)
    assert len(affiliations) == 1
    line_breaks = tei_xpath(affiliations[0], 'tei:lb')
    assert len(line_breaks) == 2
    first_line_break = line_breaks[0]
    assert first_line_break.getparent().text == TEXT_1
    assert first_line_break.tail == '\n' + TEXT_2
def test_should_not_join_separate_labels(self):
    """Check that two consecutive institution lines stay separate elements.

    Both lines carry the ``<institution>`` label; the output must contain
    two institution ``orgName`` elements rather than one merged element.
    """
    first_line = get_next_layout_line_for_text(TEXT_1)
    second_line = get_next_layout_line_for_text(TEXT_2)
    model_data_list = get_labeled_model_data_list(
        [('<institution>', first_line), ('<institution>', second_line)],
        data_generator=get_data_generator()
    )
    tei_generator = AffiliationAddressTeiTrainingDataGenerator()
    tei_root = tei_generator.get_training_tei_xml_for_model_data_iterable(
        model_data_list
    )
    LOGGER.debug('xml: %r', etree.tostring(tei_root))

    affiliations = tei_xpath(tei_root, AFFILIATION_XPATH)
    assert len(affiliations) == 1
    affiliation = affiliations[0]
    institution_texts = get_tei_xpath_text_content_list(
        affiliation, './tei:orgName[@type="institution"]'
    )
    assert institution_texts == [TEXT_1, TEXT_2]
    assert get_text_content(affiliation) == f'{TEXT_1}\n{TEXT_2}\n'
def test_should_generate_tei_from_multiple_model_data_lists_using_model_labels(
    self
):
    """Check that each model data list yields its own affiliation node.

    Two lists, each holding one ``<institution>`` line, are passed to the
    multi-iterable entry point; two affiliation nodes are expected, in the
    same order as the input lists.
    """
    first_list = [('<institution>', get_next_layout_line_for_text(TEXT_1))]
    second_list = [('<institution>', get_next_layout_line_for_text(TEXT_2))]
    model_data_list_list = get_labeled_model_data_list_list(
        [first_list, second_list],
        data_generator=get_data_generator()
    )
    tei_generator = AffiliationAddressTeiTrainingDataGenerator()
    tei_root = (
        tei_generator.get_training_tei_xml_for_multiple_model_data_iterables(
            model_data_list_list
        )
    )
    LOGGER.debug('xml: %r', etree.tostring(tei_root))

    affiliations = tei_xpath(tei_root, AFFILIATION_XPATH)
    assert len(affiliations) == 2
    institution_xpath = './tei:orgName[@type="institution"]'
    assert get_tei_xpath_text_content_list(
        affiliations[0], institution_xpath
    ) == [TEXT_1]
    assert get_tei_xpath_text_content_list(
        affiliations[1], institution_xpath
    ) == [TEXT_2]
def test_should_generate_tei_from_model_data(self):
    """Check TEI generation from model data produced by the data generator.

    Model data is obtained via ``iter_model_data_for_layout_document`` for a
    two-line block. The affiliation node must contain two ``tei:lb``
    elements, with the first one sitting between the two line texts.
    """
    first_line = get_next_layout_line_for_text(TEXT_1)
    second_line = get_next_layout_line_for_text(TEXT_2)
    document = LayoutDocument.for_blocks([
        LayoutBlock(lines=[first_line, second_line])
    ])
    model_data_source = get_data_generator()
    # Kept as returned (not materialised) and created before the TEI
    # generator, as the iterable may be lazy.
    model_data_iterable = (
        model_data_source.iter_model_data_for_layout_document(document)
    )
    tei_generator = AffiliationAddressTeiTrainingDataGenerator()
    tei_root = tei_generator.get_training_tei_xml_for_model_data_iterable(
        model_data_iterable
    )
    LOGGER.debug('xml: %r', etree.tostring(tei_root))

    affiliations = tei_xpath(tei_root, AFFILIATION_XPATH)
    assert len(affiliations) == 1
    line_breaks = tei_xpath(affiliations[0], 'tei:lb')
    assert len(line_breaks) == 2
    first_line_break = line_breaks[0]
    assert first_line_break.getparent().text == TEXT_1
    assert first_line_break.tail == '\n' + TEXT_2
def test_should_generate_data_using_mock_models(  # noqa pylint: disable=too-many-locals, too-many-statements
    self,
    tmp_path: Path,
    sample_layout_document: SampleLayoutDocument,
    fulltext_models_mock: MockFullTextModels
):
    """Generate training data with mocked models and check every output.

    The mock models are configured from the sample document, training data
    is written (flat, without directory structure) to ``tmp_path / 'output'``
    with ``use_model=True``, and then the output of each TEI training data
    generator is checked against the matching block text of the sample.
    """
    configure_fulltext_models_mock_with_sample_document(
        fulltext_models_mock,
        sample_layout_document
    )
    output_path = tmp_path / 'output'
    output_path.mkdir()
    generate_training_data_for_layout_document(
        layout_document=sample_layout_document.layout_document,
        output_path=str(output_path),
        source_filename=SOURCE_FILENAME_1,
        document_features_context=DEFAULT_DOCUMENT_FEATURES_CONTEXT,
        fulltext_models=fulltext_models_mock,
        use_model=True,
        use_directory_structure=False
    )

    def check_output(tei_training_data_generator, **check_kwargs):
        # Every check reads from the same output directory.
        _check_tei_training_data_generator_output(
            tei_training_data_generator,
            output_path=output_path,
            **check_kwargs
        )

    sample = sample_layout_document

    check_output(
        SegmentationTeiTrainingDataGenerator(),
        expect_raw_data=True,
        tei_xml_xpath='text/front',
        tei_expected_values=[sample.header_block.text]
    )
    check_output(
        HeaderTeiTrainingDataGenerator(),
        expect_raw_data=True,
        tei_xml_xpath='text/front/docTitle/titlePart',
        tei_expected_values=[sample.title_block.text]
    )
    # The name generator is checked twice: header authors and citation authors.
    check_output(
        NameTeiTrainingDataGenerator(),
        pre_file_path_suffix='.header',
        expect_raw_data=False,
        tei_xml_xpath='//tei:author//tei:surname',
        tei_expected_values=[sample.author_surname_block.text]
    )
    check_output(
        NameTeiTrainingDataGenerator(),
        pre_file_path_suffix='.citations',
        expect_raw_data=False,
        tei_xml_xpath='//tei:author//tei:surname',
        tei_expected_values=[sample.ref_author_surname_block.text]
    )
    check_output(
        AffiliationAddressTeiTrainingDataGenerator(),
        expect_raw_data=False,
        tei_xml_xpath='//tei:affiliation/tei:orgName[@type="institution"]',
        tei_expected_values=[sample.institution_block.text]
    )
    check_output(
        FullTextTeiTrainingDataGenerator(),
        expect_raw_data=True,
        tei_xml_xpath='//head',
        tei_expected_values=[sample.body_section_title_block.text]
    )
    check_output(
        FigureTeiTrainingDataGenerator(),
        expect_raw_data=True,
        tei_xml_xpath='//head',
        tei_expected_values=[sample.figure_head_block.text]
    )
    check_output(
        TableTeiTrainingDataGenerator(),
        expect_raw_data=True,
        tei_xml_xpath='//head',
        tei_expected_values=[sample.table_head_block.text]
    )
    check_output(
        ReferenceSegmenterTeiTrainingDataGenerator(),
        expect_raw_data=True,
        tei_xml_xpath='//bibl',
        tei_expected_values=[sample.ref_ref_block.text]
    )
    check_output(
        CitationTeiTrainingDataGenerator(),
        expect_raw_data=False,
        tei_xml_xpath='//tei:bibl/tei:title[@level="a"]',
        tei_expected_values=[sample.ref_title_block.text]
    )
def get_tei_training_data_generator(
    self
) -> AffiliationAddressTeiTrainingDataGenerator:
    """Return a new ``AffiliationAddressTeiTrainingDataGenerator``."""
    tei_training_data_generator = AffiliationAddressTeiTrainingDataGenerator()
    return tei_training_data_generator