def test_should_remove_invalid_affiliation(
        self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
    """Verify the annotated text when invalid affiliations are removed.

    The target JATS contains one affiliation. The raw TEI contains an
    affiliation with near-identical text (an extra space before the full
    stop) followed by one with unrelated text. With ``segment-affiliation``
    and ``remove-invalid-affiliations`` enabled, the text content of the
    resulting TEI root must equal the first TEI affiliation text alone.
    """
    # we only create a single jats affiliation that would usually change
    # the tei affiliation; with --segment-affiliation, we expect the
    # affiliation segmentation to be updated
    affiliation_prefix = 'Some affiliation'
    jats_text = affiliation_prefix + '.'
    tei_text = affiliation_prefix + ' .'
    invalid_affiliation_text = 'invalid affiliation'

    jats_root = get_target_xml_node(affiliation_nodes=[E.aff(jats_text)])
    jats_xml_bytes = etree.tostring(jats_root)

    raw_tei_root = get_affiliation_tei_node([
        TEI_E.affiliation(tei_text),
        TEI_E.affiliation(invalid_affiliation_text)
    ])
    test_helper.tei_raw_file_path.write_bytes(etree.tostring(raw_tei_root))

    LOGGER.debug('target_jats_xml: %s', jats_xml_bytes)
    test_helper.xml_file_path.write_bytes(jats_xml_bytes)

    args_dict = {
        **test_helper.main_args_dict,
        'matcher': 'simple',
        'fields': 'author_aff',
        'segment-affiliation': True,
        'remove-invalid-affiliations': True
    }
    main(dict_to_args(args_dict), save_main_session=False)

    annotated_root = test_helper.get_tei_auto_root()
    assert get_text_content(annotated_root) == tei_text
def test_should_not_merge_multiple_affiliation_annotations_surrounded_by_line_feeds(
        self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper,
        segment_affiliation: bool):
    """Verify two line-feed-wrapped affiliations stay separate.

    Both the target JATS and the raw TEI contain the same two affiliation
    texts, each starting and ending with a line feed. After annotation, the
    stripped texts of all affiliations in the output must be exactly the two
    stripped input texts, in order.

    ``segment_affiliation`` is passed through as the ``segment-affiliation``
    option (presumably supplied by a parametrization not shown here).
    """
    aff1_text = '\nSome affiliation 1\n'
    aff2_text = '\nSome affiliation 2\n'

    jats_root = get_target_xml_node(
        affiliation_nodes=[E.aff(aff1_text), E.aff(aff2_text)])
    jats_xml_bytes = etree.tostring(jats_root)

    raw_tei_root = get_affiliation_tei_node([
        TEI_E.affiliation(aff1_text),
        TEI_E.affiliation(aff2_text)
    ])
    test_helper.tei_raw_file_path.write_bytes(etree.tostring(raw_tei_root))

    LOGGER.debug('target_jats_xml: %s', jats_xml_bytes)
    test_helper.xml_file_path.write_bytes(jats_xml_bytes)

    args_dict = {
        **test_helper.main_args_dict,
        'matcher': 'simple',
        'segment-affiliation': segment_affiliation,
        'fields': 'author_aff'
    }
    main(dict_to_args(args_dict), save_main_session=False)

    annotated_root = test_helper.get_tei_auto_root()
    affiliation_texts = get_text_content_list(
        get_all_affiliations(annotated_root))
    # surrounding whitespace is ignored when comparing
    actual_stripped = [text.strip() for text in affiliation_texts]
    assert actual_stripped == [aff1_text.strip(), aff2_text.strip()]
def test_should_preserve_original_affiliation_annotation(
        self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
    """Verify the TEI affiliation segmentation is kept without segmenting.

    The target JATS contains a single ``aff`` built from the JATS text passed
    twice, while the raw TEI contains two separate affiliations with the TEI
    text. The ``segment-affiliation`` option is not passed, and the output
    must still list two affiliations, each with the original TEI text.
    """
    # we only create a single jats affiliation that would usually change
    # the tei affiliation; without --segment-affiliation, we expect to
    # retain the original affiliation segmentation
    affiliation_prefix = 'Some affiliation'
    jats_text = affiliation_prefix + '.'
    tei_text = affiliation_prefix + ' .'

    jats_root = get_target_xml_node(affiliation_nodes=[
        E.aff(jats_text, jats_text),
    ])
    jats_xml_bytes = etree.tostring(jats_root)

    raw_tei_affiliations = [
        TEI_E.affiliation(tei_text),
        TEI_E.affiliation(tei_text)
    ]
    raw_tei_root = get_affiliation_tei_node(raw_tei_affiliations)
    test_helper.tei_raw_file_path.write_bytes(etree.tostring(raw_tei_root))

    LOGGER.debug('target_jats_xml: %s', jats_xml_bytes)
    test_helper.xml_file_path.write_bytes(jats_xml_bytes)

    args_dict = {
        **test_helper.main_args_dict,
        'matcher': 'simple',
        'fields': 'author_aff'
    }
    main(dict_to_args(args_dict), save_main_session=False)

    annotated_root = test_helper.get_tei_auto_root()
    actual_texts = get_text_content_list(get_all_affiliations(annotated_root))
    assert actual_texts == [tei_text, tei_text]
def test_should_ignore_sub_fields_if_excluded(
        self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper,
        segment_affiliation: bool, sub_fields: str, expected_country: str):
    """Verify the country annotation follows the ``sub-fields`` option.

    The target JATS affiliation holds a label and a country. The raw TEI
    affiliation holds the label inside an ``other`` element followed by the
    plain country text. The run uses ``preserve-sub-tags`` with
    ``author_aff-label`` listed under ``no-preserve-sub-fields``.

    The output must expose the label under ``tei:marker``, and the text at
    ``tei:address/tei:country`` must equal ``expected_country``.

    ``segment_affiliation``, ``sub_fields`` and ``expected_country`` are
    presumably supplied by a parametrization not shown here.
    """
    jats_root = get_target_xml_node(affiliation_nodes=[
        E.aff(E.label(LABEL_1), ' ', E.country(COUNTRY_1)),
    ])
    jats_xml_bytes = etree.tostring(jats_root)

    raw_tei_affiliation = TEI_E.affiliation(
        TEI_E.other(LABEL_1), ' ', COUNTRY_1)
    raw_tei_root = get_affiliation_tei_node([raw_tei_affiliation])
    test_helper.tei_raw_file_path.write_bytes(etree.tostring(raw_tei_root))

    LOGGER.debug('target_jats_xml: %s', jats_xml_bytes)
    test_helper.xml_file_path.write_bytes(jats_xml_bytes)

    args_dict = {
        **test_helper.main_args_dict,
        'matcher': 'simple',
        'segment-affiliation': segment_affiliation,
        'fields': 'author_aff',
        'sub-fields': sub_fields,
        'preserve-sub-tags': True,
        'no-preserve-sub-fields': 'author_aff-label'
    }
    main(dict_to_args(args_dict), save_main_session=False)

    annotated_root = test_helper.get_tei_auto_root()
    annotated_aff = get_first_affiliation(annotated_root)
    assert get_tei_xpath_text(annotated_aff, './tei:marker') == LABEL_1
    actual_country = get_tei_xpath_text(
        annotated_aff, './tei:address/tei:country')
    assert actual_country == expected_country
def test_should_not_preserve_existing_sub_tags_if_provided_by_excluded(
        self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper,
        segment_affiliation: bool):
    """Verify an excluded sub-field is re-annotated rather than preserved.

    The target JATS labels ``LABEL_1`` and leaves ``LABEL_2`` and ``TEXT_1``
    as plain text. The raw TEI instead tags ``LABEL_1`` as ``other``,
    ``LABEL_2`` as ``marker`` and ``TEXT_1`` as ``preserved``. The run uses
    ``preserve-sub-tags`` with ``author_aff-label`` listed under
    ``no-preserve-sub-fields``.

    The output must have ``LABEL_1`` under ``tei:marker`` and ``TEXT_1``
    still under ``tei:preserved``.

    ``segment_affiliation`` is presumably supplied by a parametrization not
    shown here.
    """
    jats_root = get_target_xml_node(affiliation_nodes=[
        E.aff(E.label(LABEL_1), ' ' + LABEL_2 + ' ' + TEXT_1),
    ])
    jats_xml_bytes = etree.tostring(jats_root)

    raw_tei_affiliation = TEI_E.affiliation(
        TEI_E.other(LABEL_1),
        ' ',
        TEI_E.marker(LABEL_2),
        ' ',
        TEI_E.preserved(TEXT_1)
    )
    raw_tei_root = get_affiliation_tei_node([raw_tei_affiliation])
    test_helper.tei_raw_file_path.write_bytes(etree.tostring(raw_tei_root))

    LOGGER.debug('target_jats_xml: %s', jats_xml_bytes)
    test_helper.xml_file_path.write_bytes(jats_xml_bytes)

    args_dict = {
        **test_helper.main_args_dict,
        'matcher': 'simple',
        'segment-affiliation': segment_affiliation,
        'fields': 'author_aff',
        'preserve-sub-tags': True,
        'no-preserve-sub-fields': 'author_aff-label'
    }
    main(dict_to_args(args_dict), save_main_session=False)

    annotated_root = test_helper.get_tei_auto_root()
    annotated_aff = get_first_affiliation(annotated_root)
    assert get_tei_xpath_text(annotated_aff, './tei:marker') == LABEL_1
    assert get_tei_xpath_text(annotated_aff, './tei:preserved') == TEXT_1
def test_should_group_preserved_and_auto_annotated_address_fields(
        self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
    """Verify settlement and country end up inside one address element.

    The target JATS tags only the label and the country; the city is plain
    text. The raw TEI already has a ``marker`` and an ``address`` holding a
    ``settlement`` and a ``country``. The run uses ``segment-affiliation``
    and ``preserve-sub-tags``, with the label and country sub-fields listed
    under ``no-preserve-sub-fields``.

    The output must expose the label, city and country at the expected
    paths, and the ``tei:address`` text (queried with ``delimiter='|'``)
    must be the city and country joined by ``', '``.
    """
    jats_root = get_target_xml_node(affiliation_nodes=[
        E.aff(
            E.label(LABEL_1),
            TEXT_1 + ' ' + CITY_1 + ', ',
            E.country(COUNTRY_1)
        ),
    ])
    jats_xml_bytes = etree.tostring(jats_root)

    raw_tei_address = TEI_E.address(
        TEI_E.settlement(CITY_1), ', ', TEI_E.country(COUNTRY_1))
    raw_tei_affiliation = TEI_E.affiliation(
        TEI_E.marker(LABEL_1),
        ' ' + TEXT_1 + ' ',
        raw_tei_address
    )
    raw_tei_root = get_affiliation_tei_node([raw_tei_affiliation])
    test_helper.tei_raw_file_path.write_bytes(etree.tostring(raw_tei_root))

    LOGGER.debug('target_jats_xml: %s', jats_xml_bytes)
    test_helper.xml_file_path.write_bytes(jats_xml_bytes)

    args_dict = {
        **test_helper.main_args_dict,
        'matcher': 'simple',
        'segment-affiliation': True,
        'preserve-sub-tags': True,
        'no-preserve-sub-fields': 'author_aff-label,author_aff-country',
        'fields': 'author_aff'
    }
    main(dict_to_args(args_dict), save_main_session=False)

    annotated_root = test_helper.get_tei_auto_root()
    annotated_aff = get_first_affiliation(annotated_root)
    assert get_tei_xpath_text(annotated_aff, './tei:marker') == LABEL_1
    assert get_tei_xpath_text(
        annotated_aff, './tei:address/tei:settlement') == CITY_1
    assert get_tei_xpath_text(
        annotated_aff, './tei:address/tei:country') == COUNTRY_1
    address_text = get_tei_xpath_text(
        annotated_aff, './tei:address', delimiter='|')
    assert address_text == (CITY_1 + ', ' + COUNTRY_1)
def test_should_auto_annotate_single_affiliation_with_all_supported_fields(
        self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
    """Verify every tagged JATS affiliation field is annotated in the TEI.

    The target JATS affiliation tags a label, department, institution, city,
    postcode, state and country, separated by single spaces. The raw TEI
    affiliation contains only the text of those nodes, as returned by
    ``get_nodes_text``. After annotation each value must be found at its
    expected TEI path.
    """
    def _addr_line(content_type, text):
        # addr-line wrapping a named-content element of the given type
        return E(
            'addr-line',
            E('named-content', {'content-type': content_type}, text)
        )

    jats_aff_children = [
        E('label', LABEL_1),
        ' ',
        _addr_line('department', DEPARTMENT_1),
        ' ',
        E.institution(INSTITUTION_1),
        ' ',
        _addr_line('city', CITY_1),
        ' ',
        _addr_line('postcode', POST_CODE_1),
        ' ',
        _addr_line('state', STATE_1),
        ' ',
        E.country(COUNTRY_1)
    ]
    jats_root = get_target_xml_node(affiliation_nodes=[
        E.aff(*jats_aff_children),
    ])
    jats_xml_bytes = etree.tostring(jats_root)

    raw_tei_affiliation = TEI_E.affiliation(get_nodes_text(jats_aff_children))
    raw_tei_root = get_affiliation_tei_node([raw_tei_affiliation])
    test_helper.tei_raw_file_path.write_bytes(etree.tostring(raw_tei_root))

    LOGGER.debug('target_jats_xml: %s', jats_xml_bytes)
    test_helper.xml_file_path.write_bytes(jats_xml_bytes)

    args_dict = {
        **test_helper.main_args_dict,
        'matcher': 'simple',
        'fields': 'author_aff'
    }
    main(dict_to_args(args_dict), save_main_session=False)

    annotated_root = test_helper.get_tei_auto_root()
    annotated_aff = get_first_affiliation(annotated_root)
    assert get_tei_xpath_text(annotated_aff, './tei:marker') == LABEL_1
    assert get_tei_xpath_text(
        annotated_aff, './tei:orgName[@type="department"]') == DEPARTMENT_1
    assert get_tei_xpath_text(
        annotated_aff, './tei:orgName[@type="institution"]') == INSTITUTION_1
    assert get_tei_xpath_text(
        annotated_aff, './tei:address/tei:settlement') == CITY_1
    assert get_tei_xpath_text(
        annotated_aff, './tei:address/tei:postCode') == POST_CODE_1
    assert get_tei_xpath_text(
        annotated_aff, './tei:address/tei:region') == STATE_1
    assert get_tei_xpath_text(
        annotated_aff, './tei:address/tei:country') == COUNTRY_1
def test_should_auto_annotate_single_affiliation_with_single_field(
        self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
    """Verify a lone label field is annotated as a TEI marker.

    The target JATS affiliation consists of a label followed by plain text.
    The raw TEI affiliation contains only the text of those nodes, as
    returned by ``get_nodes_text``. After annotation the label must be found
    under ``tei:marker`` of the first affiliation.
    """
    jats_aff_children = [E('label', LABEL_1), ' Some text']
    jats_root = get_target_xml_node(affiliation_nodes=[
        E.aff(*jats_aff_children),
    ])
    jats_xml_bytes = etree.tostring(jats_root)

    raw_tei_affiliation = TEI_E.affiliation(get_nodes_text(jats_aff_children))
    raw_tei_root = get_affiliation_tei_node([raw_tei_affiliation])
    test_helper.tei_raw_file_path.write_bytes(etree.tostring(raw_tei_root))

    LOGGER.debug('target_jats_xml: %s', jats_xml_bytes)
    test_helper.xml_file_path.write_bytes(jats_xml_bytes)

    args_dict = {
        **test_helper.main_args_dict,
        'matcher': 'simple',
        'fields': 'author_aff'
    }
    main(dict_to_args(args_dict), save_main_session=False)

    annotated_root = test_helper.get_tei_auto_root()
    annotated_aff = get_first_affiliation(annotated_root)
    assert get_tei_xpath_text(annotated_aff, './tei:marker') == LABEL_1
def test_should_preserve_existing_sub_tags(
        self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper,
        segment_affiliation: bool):
    """Verify existing TEI sub tags survive with ``preserve-sub-tags``.

    The target JATS affiliation is one untagged string. The raw TEI
    affiliation splits the same text into an ``other1`` element, a
    ``marker``, untagged text and an ``other2`` element. After annotation
    the marker and both ``other*`` elements must still carry their original
    text.

    ``segment_affiliation`` is passed through as the ``segment-affiliation``
    option (presumably supplied by a parametrization not shown here).
    """
    untagged_text = ' some affiliation '
    other1_sub_tag_text = 'other-sub-tag1'
    other2_sub_tag_text = 'other-sub-tag2'

    jats_aff_text = (
        other1_sub_tag_text + LABEL_1 + untagged_text + other2_sub_tag_text
    )
    jats_root = get_target_xml_node(affiliation_nodes=[
        E.aff(jats_aff_text),
    ])
    jats_xml_bytes = etree.tostring(jats_root)

    raw_tei_affiliation = TEI_E.affiliation(
        TEI_E.other1(other1_sub_tag_text),
        TEI_E.marker(LABEL_1),
        get_nodes_text([untagged_text]),
        TEI_E.other2(other2_sub_tag_text)
    )
    raw_tei_root = get_affiliation_tei_node([raw_tei_affiliation])
    test_helper.tei_raw_file_path.write_bytes(etree.tostring(raw_tei_root))

    LOGGER.debug('target_jats_xml: %s', jats_xml_bytes)
    test_helper.xml_file_path.write_bytes(jats_xml_bytes)

    args_dict = {
        **test_helper.main_args_dict,
        'matcher': 'simple',
        'segment-affiliation': segment_affiliation,
        'fields': 'author_aff',
        'preserve-sub-tags': True
    }
    main(dict_to_args(args_dict), save_main_session=False)

    annotated_root = test_helper.get_tei_auto_root()
    annotated_aff = get_first_affiliation(annotated_root)
    assert get_tei_xpath_text(annotated_aff, './tei:marker') == LABEL_1
    assert get_tei_xpath_text(
        annotated_aff, './tei:other1') == other1_sub_tag_text
    assert get_tei_xpath_text(
        annotated_aff, './tei:other2') == other2_sub_tag_text