def test_should_replace_affiliation_with_author_if_single_tokens(
        self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
    """Check author and affiliation lines are annotated in full.

    The raw TEI holds one author note and one affiliation note. The target
    XML holds two contributors and an affiliation labelled '1'. After
    running ``main`` with the simple matcher, the docAuthor text must equal
    the whole author line and the affiliation text the whole affiliation
    line.
    """
    author_text = 'Mary Maison 1, John Smith 1'
    affiliation_text = '1 University of Science, Smithonia'

    # Raw (unannotated) header TEI: one note per line, each followed by <lb/>.
    raw_tei_node = get_header_tei_node([
        E.note(author_text),
        E.lb(),
        E.note(affiliation_text),
        E.lb()
    ])
    test_helper.tei_raw_file_path.write_bytes(etree.tostring(raw_tei_node))

    # Target XML used as the annotation source.
    target_author_nodes = [
        E.contrib(E.name(E.surname('Maison'), E('given-names', 'Mary'))),
        E.contrib(E.name(E.surname('Smith'), E('given-names', 'John'))),
        E.aff(
            E.label('1'),
            E.institution('University of Science'),
            E.country('Smithonia')
        )
    ]
    target_xml_node = get_target_xml_node(author_nodes=target_author_nodes)
    test_helper.xml_file_path.write_bytes(etree.tostring(target_xml_node))

    main_args = dict_to_args({
        **test_helper.main_args_dict,
        'fields': ','.join(['title', 'author', 'author_aff', 'abstract']),
        'matcher': 'simple'
    })
    main(main_args, save_main_session=False)

    tei_auto_root = test_helper.get_tei_auto_root()
    actual_author_text = get_xpath_text(tei_auto_root, '//byline/docAuthor')
    assert actual_author_text == author_text
    actual_affiliation_text = get_xpath_text(
        tei_auto_root, '//byline/affiliation'
    )
    assert actual_affiliation_text == affiliation_text
def test_should_auto_annotate_label_containing_dot_within_reference(
        self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
    """Check a reference label ending in a dot is annotated as the label.

    The raw TEI note holds the label '1.', the reference text and a trailing
    space. ``main`` runs with the simple matcher, the 'reference' field and
    the 'reference.use-raw-text=true' mapping override. The first bibl must
    contain label plus reference text, and its label child the label itself.
    """
    label_with_dot = '1.'

    raw_note_text = label_with_dot + ' ' + REFERENCE_TEXT_1 + ' '
    raw_tei_node = get_reference_segmenter_tei_node([E.note(raw_note_text)])
    test_helper.tei_raw_file_path.write_bytes(etree.tostring(raw_tei_node))

    target_xml_node = get_target_xml_node(reference_nodes=[
        get_jats_reference_node(label_with_dot, REFERENCE_TEXT_1),
    ])
    test_helper.xml_file_path.write_bytes(etree.tostring(target_xml_node))

    main_args = dict_to_args({
        **test_helper.main_args_dict,
        'matcher': 'simple',
        'fields': 'reference',
        'xml-mapping-overrides': 'reference.use-raw-text=true'
    })
    main(main_args, save_main_session=False)

    tei_auto_root = test_helper.get_tei_auto_root()
    expected_bibl_text = ' '.join([label_with_dot, REFERENCE_TEXT_1])
    assert get_xpath_text(
        tei_auto_root, '//listBibl/bibl[1]'
    ) == expected_bibl_text
    assert get_xpath_text(
        tei_auto_root, '//listBibl/bibl[1]/label'
    ) == label_with_dot
def test_should_auto_annotate_affiliation_preceding_number_using_simple_matcher(
        self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
    """Check a number on the line before the affiliation is included.

    In the raw TEI the affiliation is split over two lines: '1' and then the
    institution/country text. The target XML affiliation has no label. After
    running ``main`` with the simple matcher, the affiliation text must be
    both lines joined by a space. Title, authors and abstract are also
    checked.
    """
    author_text = 'Mary Maison 1, John Smith 1'
    affiliation_text_1 = '1'
    affiliation_text_2 = 'University of Science, Smithonia'
    affiliation_text = ' '.join([affiliation_text_1, affiliation_text_2])

    # Raw header TEI: title, authors, the two affiliation lines, abstract.
    raw_tei_node = get_header_tei_node([
        E.note(TITLE_1),
        E.lb(),
        E.note(author_text),
        E.lb(),
        E.note(affiliation_text_1),
        E.lb(),
        E.note(affiliation_text_2),
        E.lb(),
        E.note(ABSTRACT_PREFIX_1, E.lb(), ABSTRACT_1)
    ])
    test_helper.tei_raw_file_path.write_bytes(etree.tostring(raw_tei_node))

    target_author_nodes = [
        E.contrib(E.name(E.surname('Maison'), E('given-names', 'Mary'))),
        E.contrib(E.name(E.surname('Smith'), E('given-names', 'John'))),
        E.aff(
            E.institution('University of Science'),
            E.country('Smithonia')
        )
    ]
    target_xml_node = get_target_xml_node(
        title=TITLE_1,
        author_nodes=target_author_nodes,
        abstract_node=E.abstract(E.p(ABSTRACT_1))
    )
    test_helper.xml_file_path.write_bytes(etree.tostring(target_xml_node))

    main_args = dict_to_args({
        **test_helper.main_args_dict,
        'fields': ','.join(['title', 'author', 'author_aff', 'abstract']),
        'matcher': 'simple'
    })
    main(main_args, save_main_session=False)

    tei_auto_root = test_helper.get_tei_auto_root()
    assert get_xpath_text(tei_auto_root, '//docTitle/titlePart') == TITLE_1
    assert get_xpath_text(tei_auto_root, '//byline/docAuthor') == author_text
    assert get_xpath_text(
        tei_auto_root, '//byline/affiliation'
    ) == affiliation_text
    expected_abstract_text = ABSTRACT_PREFIX_1 + ABSTRACT_1
    assert get_xpath_text(
        tei_auto_root, '//div[@type="abstract"]'
    ) == expected_abstract_text
def test_should_not_preserve_sub_tags(self, annotator: MagicMock):
    """Check sub elements yield no text when preserve_sub_tags is False.

    Before annotating, the 'sub1' and 'sub2' elements are found with their
    own text. After annotating in place with ``preserve_tags=True`` and
    ``preserve_sub_tags=False``, the same xpaths must return ''.
    """
    structured_document = _structured_document_with_sub_elements(
        E.sub1('sub1'),
        E.sub2('sub2')
    )

    # Sanity check: both sub elements are present before annotation.
    for xpath, expected_text in (('//sub1', 'sub1'), ('//sub2', 'sub2')):
        assert get_xpath_text(
            _get_root(structured_document), xpath
        ) == expected_text

    annotate_structured_document_inplace(
        structured_document,
        annotator=annotator,
        preserve_tags=True,
        preserve_sub_tags=False,
        fields={'title'}
    )

    # The root is fetched again for each check, after the in-place change.
    for xpath in ('//sub1', '//sub2'):
        assert get_xpath_text(_get_root(structured_document), xpath) == ''
def test_should_not_auto_annotate_other_sub_tags(
        self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
    """Check the bibl text of a reference that has sub elements.

    The target reference holds an article-title and a source element
    separated by a space. ``main`` runs with the simple matcher and only the
    'reference' field. The bibl text must equal the label and the reference
    text joined by a space.
    """
    target_reference_content_nodes = [
        E('article-title', ARTICLE_TITLE_1),
        ' ',
        E.source(SOURCE_1),
    ]
    reference_text = get_nodes_text(target_reference_content_nodes)

    reference_node = get_jats_reference_node(
        LABEL_1, *target_reference_content_nodes
    )
    target_jats_xml = etree.tostring(
        get_target_xml_node(reference_nodes=[reference_node])
    )
    LOGGER.debug('target_jats_xml: %s', target_jats_xml)
    test_helper.xml_file_path.write_bytes(target_jats_xml)

    raw_tei_node = get_reference_segmenter_tei_node(
        [E.note(LABEL_1 + ' ' + reference_text)]
    )
    test_helper.tei_raw_file_path.write_bytes(etree.tostring(raw_tei_node))

    main_args = dict_to_args({
        **test_helper.main_args_dict,
        'matcher': 'simple',
        'fields': 'reference'
    })
    main(main_args, save_main_session=False)

    tei_auto_root = test_helper.get_tei_auto_root()
    expected_bibl_text = ' '.join([LABEL_1, reference_text])
    assert get_xpath_text(
        tei_auto_root, '//listBibl/bibl'
    ) == expected_bibl_text
def test_should_auto_annotate_title(
        self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
    """Check a raw note that matches the target title becomes the titlePart.

    ``main`` is run with the helper's default arguments only.
    """
    raw_tei_node = get_header_tei_node([E.note(TEXT_1)])
    test_helper.tei_raw_file_path.write_bytes(etree.tostring(raw_tei_node))

    target_xml_node = get_target_xml_node(title=TEXT_1)
    test_helper.xml_file_path.write_bytes(etree.tostring(target_xml_node))

    main(list(test_helper.main_args), save_main_session=False)

    tei_auto_root = test_helper.get_tei_auto_root()
    actual_title_text = get_xpath_text(tei_auto_root, '//docTitle/titlePart')
    assert actual_title_text == TEXT_1
def test_should_extend_title_annotation_to_whole_line(
        self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
    """Check the titlePart text for a raw line that starts with 'Title: '.

    The raw note is 'Title: ' followed by the title; the target XML holds
    only the title. ``main`` runs with ``--matcher=simple`` and the titlePart
    text is compared against the title text.
    """
    title_text = 'Chocolate bars for mice'

    raw_tei_node = get_header_tei_node([E.note('Title: ' + title_text)])
    test_helper.tei_raw_file_path.write_bytes(etree.tostring(raw_tei_node))

    target_xml_node = get_target_xml_node(title=title_text)
    test_helper.xml_file_path.write_bytes(etree.tostring(target_xml_node))

    main_args = list(test_helper.main_args) + ['--matcher=simple']
    main(main_args, save_main_session=False)

    tei_auto_root = test_helper.get_tei_auto_root()
    actual_title_text = get_xpath_text(tei_auto_root, '//docTitle/titlePart')
    assert actual_title_text == title_text
def test_should_skip_errors(
        self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
    """Check the main document is still annotated with ``--skip-errors``.

    A second document pair ('document0') is written whose target XML has the
    bytes b'error' appended after the serialized tree (presumably making it
    unparseable). The helper's own document pair is written unchanged. After
    running ``main`` with ``--skip-errors``, the helper's auto-annotated TEI
    must contain the title.
    """
    raw_tei_bytes = etree.tostring(get_header_tei_node([E.note(TEXT_1)]))

    # The extra document, with trailing bytes after the XML.
    tei_raw_other_file_path = test_helper.tei_raw_path.joinpath(
        'document0.header.tei.xml'
    )
    tei_raw_other_file_path.write_bytes(raw_tei_bytes)
    xml_other_file_path = test_helper.xml_path.joinpath('document0.xml')
    broken_xml_bytes = (
        etree.tostring(get_target_xml_node(title=TEXT_1)) + b'error'
    )
    xml_other_file_path.write_bytes(broken_xml_bytes)

    # The helper's own document, written without extra bytes.
    test_helper.tei_raw_file_path.write_bytes(
        etree.tostring(get_header_tei_node([E.note(TEXT_1)]))
    )
    test_helper.xml_file_path.write_bytes(
        etree.tostring(get_target_xml_node(title=TEXT_1))
    )

    main_args = (
        list(test_helper.main_args) + ['--matcher=simple', '--skip-errors']
    )
    main(main_args, save_main_session=False)

    tei_auto_root = test_helper.get_tei_auto_root()
    actual_title_text = get_xpath_text(tei_auto_root, '//docTitle/titlePart')
    assert actual_title_text == TEXT_1
def test_should_auto_annotate_single_reference(
        self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
    """Check a single raw reference line is annotated as one bibl.

    The raw note is the label and reference text joined by a space; the
    target XML holds the matching reference node. ``main`` runs with the
    simple matcher and only the 'reference' field.
    """
    raw_note_text = LABEL_1 + ' ' + REFERENCE_TEXT_1
    raw_tei_node = get_reference_segmenter_tei_node([E.note(raw_note_text)])
    test_helper.tei_raw_file_path.write_bytes(etree.tostring(raw_tei_node))

    target_xml_node = get_target_xml_node(reference_nodes=[
        get_jats_reference_node(LABEL_1, REFERENCE_TEXT_1),
    ])
    test_helper.xml_file_path.write_bytes(etree.tostring(target_xml_node))

    main_args = dict_to_args({
        **test_helper.main_args_dict,
        'matcher': 'simple',
        'fields': 'reference'
    })
    main(main_args, save_main_session=False)

    tei_auto_root = test_helper.get_tei_auto_root()
    expected_bibl_text = ' '.join([LABEL_1, REFERENCE_TEXT_1])
    assert get_xpath_text(
        tei_auto_root, '//listBibl/bibl'
    ) == expected_bibl_text
def get_tei_xpath_text(*args, **kwargs):
    """Call ``get_xpath_text`` with ``namespaces=TEI_NS_MAP``.

    All positional and keyword arguments are forwarded unchanged. Callers
    must not pass ``namespaces`` themselves: it is always supplied here, so a
    second value raises ``TypeError``.
    """
    xpath_text = get_xpath_text(*args, namespaces=TEI_NS_MAP, **kwargs)
    return xpath_text