def test_should_auto_annotate_single_table_label_description_with_attrib(
        self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
    """Expect the table figure to cover label, caption, data and attrib text.

    The target XML contains a ``table-wrap`` with label, caption and
    ``attrib``. The raw TEI text is the label/caption text followed by
    additional data text and the attrib text. The annotator is run with
    ``expand-to-following-untagged-lines`` enabled and only the ``table``
    field; the single table figure must then contain the whole TEI text.
    """
    label_caption_nodes = [
        E.label(LABEL_1),
        ' ',
        E.caption(E.title(CAPTION_TITLE_1), ' ', E.p(CAPTION_PARAGRAPH_1)),
    ]
    table_wrap_node = E(
        'table-wrap',
        *label_caption_nodes,
        ' ',
        E.attrib(LONG_ATTRIB_TEXT_1)
    )
    body_nodes = [E.sec(table_wrap_node)]
    tei_text = (
        get_nodes_text(label_caption_nodes)
        + LONG_DATA_TEXT_1
        + ' '
        + LONG_ATTRIB_TEXT_1
    )

    raw_tei_bytes = etree.tostring(get_training_tei_node([tei_text]))
    test_helper.tei_raw_file_path.write_bytes(raw_tei_bytes)
    target_xml_bytes = etree.tostring(
        get_target_xml_node(body_nodes=body_nodes))
    test_helper.xml_file_path.write_bytes(target_xml_bytes)

    main_args = {
        **test_helper.main_args_dict,
        'expand-to-following-untagged-lines': True,
        'fields': 'table',
    }
    main(dict_to_args(main_args), save_main_session=False)

    annotated_root = test_helper.get_tei_auto_root()
    table_texts = get_xpath_text_list(
        annotated_root, '//figure[@type="table"]')
    assert table_texts == [tei_text]
def test_should_ignore_nested_paragraphs_in_boxed_text(
        self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
    """Expect a paragraph nested in a boxed-text paragraph to be merged.

    The target XML has a ``boxed-text`` with label, caption title and a
    paragraph that itself contains another paragraph. The annotated TEI
    must contain one box head (label and title) and one box paragraph
    made of both paragraph texts joined by a space.
    """
    boxed_text_nodes = [
        E.label(LABEL_1),
        ' ',
        E.caption(E.title(CAPTION_TITLE_1)),
        ' ',
        # in `306415v1` the paragraph is outside the caption
        E.p(CAPTION_PARAGRAPH_1, ' ', E.p(CAPTION_PARAGRAPH_2)),
    ]
    body_nodes = [E.sec(E('boxed-text', *boxed_text_nodes))]
    tei_text = get_nodes_text(body_nodes)

    raw_tei_bytes = etree.tostring(get_training_tei_node([tei_text]))
    test_helper.tei_raw_file_path.write_bytes(raw_tei_bytes)
    target_xml_bytes = etree.tostring(
        get_target_xml_node(body_nodes=body_nodes))
    test_helper.xml_file_path.write_bytes(target_xml_bytes)

    field_names = [
        'section_title',
        'section_paragraph',
        'boxed_text_title',
        'boxed_text_paragraph',
    ]
    main_args = {
        **test_helper.main_args_dict,
        'fields': ','.join(field_names),
    }
    main(dict_to_args(main_args), save_main_session=False)

    annotated_root = test_helper.get_tei_auto_root()
    box_heads = get_xpath_text_list(annotated_root, '//head[@type="box"]')
    assert box_heads == [LABEL_1 + ' ' + CAPTION_TITLE_1]
    box_paragraphs = get_xpath_text_list(annotated_root, '//p[@type="box"]')
    assert box_paragraphs == [
        CAPTION_PARAGRAPH_1 + ' ' + CAPTION_PARAGRAPH_2
    ]
def test_should_auto_annotate_single_paragraph_citations_in_list_items_outside_paragraphs(
        self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
    """Expect citations inside a list item to become typed ``ref`` elements.

    The list item content is ``TEXT_1``, one ``xref`` per entry of
    ``CITATION_TEXT_BY_JATS_REF_TYPE_MAP`` (each followed by a space) and
    ``TEXT_2``. For every entry of ``TEI_BY_JATS_REF_TYPE_MAP`` the
    annotated item must contain a ``ref`` of the mapped type holding the
    matching citation text, and the item text must equal the list item
    text.
    """
    citation_nodes = []
    for jats_ref_type, citation_text in (
            CITATION_TEXT_BY_JATS_REF_TYPE_MAP.items()):
        citation_nodes.extend([
            E.xref({'ref-type': jats_ref_type}, citation_text),
            ' ',
        ])
    list_item_nodes = [TEXT_1, ' ', *citation_nodes, TEXT_2]
    body_nodes = [
        E.sec(E.list(E('list-item', *list_item_nodes))),
        '\nOther',
    ]
    paragraph_text = get_nodes_text(list_item_nodes)
    tei_text = get_nodes_text(body_nodes)

    raw_tei_bytes = etree.tostring(
        get_training_tei_node(get_tei_nodes_for_text(tei_text)))
    test_helper.tei_raw_file_path.write_bytes(raw_tei_bytes)
    target_xml_bytes = etree.tostring(
        get_target_xml_node(body_nodes=body_nodes))
    test_helper.xml_file_path.write_bytes(target_xml_bytes)

    main_args = {
        **test_helper.main_args_dict,
        'fields': ','.join(['section_paragraph', 'list_item']),
    }
    main(dict_to_args(main_args), save_main_session=False)

    annotated_root = test_helper.get_tei_auto_root()
    for jats_ref_type, tei_ref_type in TEI_BY_JATS_REF_TYPE_MAP.items():
        ref_texts = get_xpath_text_list(
            annotated_root, '//item/ref[@type="%s"]' % tei_ref_type)
        assert ref_texts == [
            CITATION_TEXT_BY_JATS_REF_TYPE_MAP[jats_ref_type]
        ]
    item_texts = get_xpath_text_list(annotated_root, '//list/item')
    assert item_texts == [paragraph_text]
def test_should_auto_annotate_multiple_section_title_and_paragraphs(
        self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
    """Expect two sections to yield two heads and two paragraphs, in order.

    The raw TEI holds the text of both sections inside a single ``note``
    element; the annotated TEI must expose both section titles as
    ``head`` and both paragraph texts as ``p``.
    """
    first_section = E.sec(E.title(SECTION_TITLE_1), ' ', E.p(TEXT_1))
    second_section = E.sec(E.title(SECTION_TITLE_2), ' ', E.p(TEXT_2))
    body_nodes = [first_section, ' ', second_section]
    tei_text = get_nodes_text(body_nodes)

    raw_tei_bytes = etree.tostring(
        get_training_tei_node([E.note(tei_text)]))
    test_helper.tei_raw_file_path.write_bytes(raw_tei_bytes)
    target_xml_bytes = etree.tostring(
        get_target_xml_node(body_nodes=body_nodes))
    test_helper.xml_file_path.write_bytes(target_xml_bytes)

    main_args = {
        **test_helper.main_args_dict,
        'fields': ','.join(['section_title', 'section_paragraph']),
    }
    main(dict_to_args(main_args), save_main_session=False)

    annotated_root = test_helper.get_tei_auto_root()
    head_texts = get_xpath_text_list(annotated_root, '//head')
    assert head_texts == [SECTION_TITLE_1, SECTION_TITLE_2]
    paragraph_texts = get_xpath_text_list(annotated_root, '//p')
    assert paragraph_texts == [TEXT_1, TEXT_2]
def test_should_auto_annotate_keywords(
        self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
    """Expect the keyword group to be annotated as ``other`` of type keywords.

    The article meta consists of a heading line, a ``kwd-group`` (title
    plus three keywords) and a trailing text line. Only the keyword group
    text must end up in ``other[@type="keywords"]``.
    """
    keyword_group_node = E(
        'kwd-group',
        E.title('Keywords'),
        ': ',
        E.kwd('Keyword 1'),
        ', ',
        E.kwd('Keyword 2'),
        ', ',
        E.kwd('Keyword 3')
    )
    keywords_nodes = [keyword_group_node]
    article_meta_nodes = ['Heading\n', *keywords_nodes, '\nMore text']
    tei_text = get_nodes_text(article_meta_nodes)

    raw_tei_bytes = etree.tostring(
        get_training_tei_node(get_tei_nodes_for_text(tei_text)))
    test_helper.tei_raw_file_path.write_bytes(raw_tei_bytes)
    target_xml_bytes = etree.tostring(
        get_target_xml_node(article_meta_nodes=article_meta_nodes))
    test_helper.xml_file_path.write_bytes(target_xml_bytes)

    main_args = {**test_helper.main_args_dict, 'fields': 'keywords'}
    main(dict_to_args(main_args), save_main_session=False)

    annotated_root = test_helper.get_tei_auto_root()
    keyword_texts = get_xpath_text_list(
        annotated_root, '//other[@type="keywords"]')
    assert keyword_texts == [get_nodes_text(keywords_nodes)]
def test_should_auto_annotate_single_list_within_paragraph_as_list_and_items(
        self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
    """Expect a list nested in a paragraph to be annotated as list and items.

    The list contains a label, a title, two list items and free text
    between the items. The annotated TEI must expose both items below
    ``list`` and the ``list`` text must span label, title, items and the
    free text.
    """
    # e.g. `005587v1` contains list within a paragraph
    list_node = E.list(
        E.label(LABEL_1),
        '\n',
        E.title(SECTION_TITLE_1),
        '\n',
        E('list-item', TEXT_1),
        '\nlist-text\n',
        E('list-item', TEXT_2)
    )
    body_nodes = [E.p(TEXT_3, ' ', list_node)]
    tei_text = get_nodes_text(body_nodes)

    raw_tei_bytes = etree.tostring(
        get_training_tei_node(get_tei_nodes_for_text(tei_text)))
    test_helper.tei_raw_file_path.write_bytes(raw_tei_bytes)
    target_xml_bytes = etree.tostring(
        get_target_xml_node(body_nodes=body_nodes))
    test_helper.xml_file_path.write_bytes(target_xml_bytes)

    main_args = {
        **test_helper.main_args_dict,
        'fields': ','.join(['section_paragraph', 'list', 'list_item']),
    }
    main(dict_to_args(main_args), save_main_session=False)

    annotated_root = test_helper.get_tei_auto_root()
    item_texts = get_xpath_text_list(annotated_root, '//list/item')
    assert item_texts == [TEXT_1, TEXT_2]
    list_texts = get_xpath_text_list(annotated_root, '//list')
    assert list_texts == [
        LABEL_1 + '\n' + SECTION_TITLE_1 + '\n' + TEXT_1
        + '\nlist-text\n' + TEXT_2
    ]
def test_should_ignore_fig_within_list_items(
        self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
    """Expect list item texts to exclude the caption of a nested figure.

    The first list item paragraph contains a ``fig`` with a caption. The
    annotated list items must be exactly ``TEXT_1`` and ``TEXT_2``, while
    the caption text must appear as the only ``figure``.
    """
    # e.g. `214296v1` contains a figure within a list-item
    item_with_figure = E(
        'list-item',
        E.p(TEXT_1, ' ', E.fig(E.caption(CAPTION_TITLE_1)))
    )
    plain_item = E('list-item', E.p(TEXT_2))
    body_nodes = [E.list(item_with_figure, ' ', plain_item)]
    tei_text = get_nodes_text(body_nodes)

    raw_tei_bytes = etree.tostring(get_training_tei_node([tei_text]))
    test_helper.tei_raw_file_path.write_bytes(raw_tei_bytes)
    target_xml_bytes = etree.tostring(
        get_target_xml_node(body_nodes=body_nodes))
    test_helper.xml_file_path.write_bytes(target_xml_bytes)

    field_names = ['section_paragraph', 'figure', 'list', 'list_item']
    main_args = {
        **test_helper.main_args_dict,
        'fields': ','.join(field_names),
    }
    main(dict_to_args(main_args), save_main_session=False)

    annotated_root = test_helper.get_tei_auto_root()
    item_texts = get_xpath_text_list(annotated_root, '//list/item')
    assert item_texts == [TEXT_1, TEXT_2]
    figure_texts = get_xpath_text_list(annotated_root, '//figure')
    assert figure_texts == [CAPTION_TITLE_1]
def test_should_auto_annotate_single_table_label_description(
        self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
    """Expect a table-wrap between paragraphs to become a table figure.

    The section holds a title, a paragraph, a ``table-wrap`` (label and
    caption) and another paragraph. The annotated TEI must contain the
    section title as the only ``head`` and the label/caption text as the
    only ``figure[@type="table"]``.
    """
    label_caption_nodes = [
        E.label(LABEL_1),
        ' ',
        E.caption(E.title(CAPTION_TITLE_1), ' ', E.p(CAPTION_PARAGRAPH_1)),
    ]
    section_node = E.sec(
        E.title(SECTION_TITLE_1),
        ' ',
        E.p(TEXT_1),
        ' ',
        E('table-wrap', *label_caption_nodes),
        ' ',
        E.p(TEXT_2)
    )
    body_nodes = [section_node]
    tei_text = get_nodes_text(body_nodes)

    raw_tei_bytes = etree.tostring(get_training_tei_node([tei_text]))
    test_helper.tei_raw_file_path.write_bytes(raw_tei_bytes)
    target_xml_bytes = etree.tostring(
        get_target_xml_node(body_nodes=body_nodes))
    test_helper.xml_file_path.write_bytes(target_xml_bytes)

    field_names = ['section_title', 'section_paragraph', 'table']
    main_args = {
        **test_helper.main_args_dict,
        'fields': ','.join(field_names),
    }
    main(dict_to_args(main_args), save_main_session=False)

    annotated_root = test_helper.get_tei_auto_root()
    head_texts = get_xpath_text_list(annotated_root, '//head')
    assert head_texts == [SECTION_TITLE_1]
    table_texts = get_xpath_text_list(
        annotated_root, '//figure[@type="table"]')
    assert table_texts == [get_nodes_text(label_caption_nodes)]
def test_should_auto_annotate_single_figure_data_before_label_description_if_enabled(
        self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper,
        expand_to_untagged: bool):
    """Expect data text before a figure caption to be included only if enabled.

    The raw TEI text is data text followed by the figure label/caption
    text. ``expand_to_untagged`` is passed on as the
    ``expand-to-previous-untagged-lines`` option: when true the figure
    must cover the whole TEI text, otherwise only the label/caption text.

    NOTE(review): the source of ``expand_to_untagged`` (e.g. a
    parametrize decorator or a fixture) is not visible in this block.
    """
    label_caption_nodes = [
        E.label(LABEL_1),
        ' ',
        E.caption(E.title(CAPTION_TITLE_1), ' ', E.p(CAPTION_PARAGRAPH_1)),
    ]
    body_nodes = [E.sec(E.fig(*label_caption_nodes))]
    figure_label_caption_tei_text = get_nodes_text(label_caption_nodes)
    tei_text = LONG_DATA_TEXT_1 + ' ' + figure_label_caption_tei_text

    raw_tei_bytes = etree.tostring(get_training_tei_node([tei_text]))
    test_helper.tei_raw_file_path.write_bytes(raw_tei_bytes)
    target_xml_bytes = etree.tostring(
        get_target_xml_node(body_nodes=body_nodes))
    test_helper.xml_file_path.write_bytes(target_xml_bytes)

    main_args = {
        **test_helper.main_args_dict,
        'expand-to-previous-untagged-lines': expand_to_untagged,
        'fields': ','.join(['figure', 'table']),
    }
    main(dict_to_args(main_args), save_main_session=False)

    annotated_root = test_helper.get_tei_auto_root()
    if expand_to_untagged:
        expected_figure_text = tei_text
    else:
        expected_figure_text = figure_label_caption_tei_text
    figure_texts = get_xpath_text_list(
        annotated_root, '//figure[not(@type="table")]')
    assert figure_texts == [expected_figure_text]
def test_should_auto_annotate_multiple_out_of_order_section_title_and_paragraphs(
        self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
    """Expect sections to be annotated even if the TEI order is reversed.

    The target XML lists section 1 before section 2, whereas the raw TEI
    contains the text of section 2 first, then 100 filler lines, then
    the text of section 1. The annotator is run with
    ``matcher-lookahead-lines`` set to 10; heads and paragraphs must be
    found in TEI (i.e. reversed) order.
    """
    section_titles = ['First Title', 'Second Heading']
    section_paragraphs = [TEXT_1, TEXT_2]
    first_section, second_section = (
        E.sec(E.title(title), ' ', E.p(paragraph))
        for title, paragraph in zip(section_titles, section_paragraphs)
    )
    section_1_nodes = [first_section]
    section_2_nodes = [second_section]
    body_nodes = [*section_1_nodes, ' ', *section_2_nodes]

    raw_tei_nodes = [
        get_nodes_text(section_2_nodes),
        E.lb(),
        *get_tei_nodes_for_text('x\n' * 100),
        E.lb(),
        get_nodes_text(section_1_nodes),
        E.lb(),
    ]
    test_helper.tei_raw_file_path.write_bytes(
        etree.tostring(get_training_tei_node(raw_tei_nodes)))
    target_xml_bytes = etree.tostring(
        get_target_xml_node(body_nodes=body_nodes))
    test_helper.xml_file_path.write_bytes(target_xml_bytes)

    main_args = {
        **test_helper.main_args_dict,
        'matcher-lookahead-lines': 10,
        'fields': ','.join(['section_title', 'section_paragraph']),
    }
    main(dict_to_args(main_args), save_main_session=False)

    annotated_root = test_helper.get_tei_auto_root()
    head_texts = get_xpath_text_list(annotated_root, '//head')
    assert head_texts == section_titles[::-1]
    paragraph_texts = get_xpath_text_list(annotated_root, '//p')
    assert paragraph_texts == section_paragraphs[::-1]
def test_should_preserve_and_decode_quote_html_entities_after_invalid_xml(
        self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
    """Expect a quote entity after malformed markup to be decoded, not lost.

    The raw TEI is written as plain text and is deliberately not
    well-formed: a ``<figure>`` start tag is closed by ``</table>``. It is
    followed by ``before``, an apostrophe entity and ``after``. After
    running the annotator with the ``simple`` matcher, the text of the
    ``text`` element must be ``before'after``.
    """
    target_jats_xml = etree.tostring(get_target_xml_node())
    test_helper.tei_raw_file_path.write_text(''.join([
        '<tei><text>',
        # mismatched start / end tags make the document invalid XML
        '<figure></table>',
        'before',
        # The entity has to be written in its escaped form: a bare quote
        # here would open a triple-quoted string and would not exercise
        # entity decoding at all.
        # NOTE(review): restored as `&apos;`; confirm against the upstream
        # test if a different spelling (e.g. `&#39;`) was intended.
        '&apos;',
        'after',
        '</text></tei>'
    ]))
    LOGGER.debug('target_jats_xml: %s', target_jats_xml)
    test_helper.xml_file_path.write_bytes(target_jats_xml)

    main(dict_to_args({
        **test_helper.main_args_dict,
        'matcher': 'simple'
    }), save_main_session=False)

    tei_auto_root = test_helper.get_tei_auto_root()
    assert get_xpath_text_list(tei_auto_root, './/text') == ['before\'after']
def test_should_auto_annotate_single_paragraphs_split_by_figure(
        self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
    """Expect one target paragraph interrupted by data text to yield two ``p``.

    The target XML has a single paragraph made of ``TEXT_1`` and
    ``TEXT_2``; in the raw TEI those two texts are on separate lines with
    a line of data text in between. The annotated TEI must contain two
    paragraphs, one per text.
    """
    body_nodes = [E.sec(E.p(TEXT_1 + ' ' + TEXT_2))]
    tei_text = '\n'.join([TEXT_1, LONG_DATA_TEXT_1, TEXT_2])

    raw_tei_bytes = etree.tostring(
        get_training_tei_node(get_tei_nodes_for_text(tei_text)))
    test_helper.tei_raw_file_path.write_bytes(raw_tei_bytes)
    target_xml_bytes = etree.tostring(
        get_target_xml_node(body_nodes=body_nodes))
    test_helper.xml_file_path.write_bytes(target_xml_bytes)

    main_args = {
        **test_helper.main_args_dict,
        'fields': 'section_paragraph',
    }
    main(dict_to_args(main_args), save_main_session=False)

    annotated_root = test_helper.get_tei_auto_root()
    paragraph_texts = get_xpath_text_list(annotated_root, '//p')
    assert paragraph_texts == [TEXT_1, TEXT_2]
def test_should_auto_annotate_single_top_level_body_paragraphs(
        self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
    """Expect a paragraph that is a direct child of the body to be annotated.

    The target body contains a single ``p`` without a surrounding
    section; the annotated TEI must contain exactly that paragraph.
    """
    # e.g. `153445v1` contains paragraphs as direct children of the body
    body_nodes = [E.p(TEXT_1)]
    tei_text = get_nodes_text(body_nodes)

    raw_tei_bytes = etree.tostring(get_training_tei_node([tei_text]))
    test_helper.tei_raw_file_path.write_bytes(raw_tei_bytes)
    target_xml_bytes = etree.tostring(
        get_target_xml_node(body_nodes=body_nodes))
    test_helper.xml_file_path.write_bytes(target_xml_bytes)

    main_args = {
        **test_helper.main_args_dict,
        'fields': 'section_paragraph',
    }
    main(dict_to_args(main_args), save_main_session=False)

    annotated_root = test_helper.get_tei_auto_root()
    paragraph_texts = get_xpath_text_list(annotated_root, '//p')
    assert paragraph_texts == [TEXT_1]
def test_should_convert_note_other_to_other(
        self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
    """Expect a raw ``note`` of type ``other`` to come out as ``other``.

    The target XML has no body content. The raw TEI holds one
    ``note[@type="other"]`` with ``TEXT_1``, which must be found as an
    ``other`` element in the annotated TEI.
    """
    body_nodes: List[etree.ElementBase] = []
    other_note_node = E.note({'type': 'other'}, TEXT_1)

    raw_tei_bytes = etree.tostring(get_training_tei_node([other_note_node]))
    test_helper.tei_raw_file_path.write_bytes(raw_tei_bytes)
    target_xml_bytes = etree.tostring(
        get_target_xml_node(body_nodes=body_nodes))
    test_helper.xml_file_path.write_bytes(target_xml_bytes)

    main_args = {
        **test_helper.main_args_dict,
        'fields': ','.join(['section_title', 'section_paragraph']),
    }
    main(dict_to_args(main_args), save_main_session=False)

    annotated_root = test_helper.get_tei_auto_root()
    other_texts = get_xpath_text_list(annotated_root, '//other')
    assert other_texts == [TEXT_1]
def test_should_extend_to_line_by_default(
        self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
    """Expect the head annotation to grow to the start of its line.

    The section text starts with ``'x '`` in front of the title, and only
    the title is tagged in the target XML. Without extra options, the
    annotated ``head`` must include the leading ``'x '`` as well.
    """
    body_nodes = [E.sec('x ', E.title(SECTION_TITLE_1))]
    tei_text = get_nodes_text(body_nodes)

    raw_tei_bytes = etree.tostring(get_training_tei_node([tei_text]))
    test_helper.tei_raw_file_path.write_bytes(raw_tei_bytes)
    target_xml_bytes = etree.tostring(
        get_target_xml_node(body_nodes=body_nodes))
    test_helper.xml_file_path.write_bytes(target_xml_bytes)

    main_args = {**test_helper.main_args_dict, 'fields': 'section_title'}
    main(dict_to_args(main_args), save_main_session=False)

    annotated_root = test_helper.get_tei_auto_root()
    head_texts = get_xpath_text_list(annotated_root, '//head')
    assert head_texts == ['x ' + SECTION_TITLE_1]
def test_should_auto_annotate_single_app_group_and_app(
        self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
    """Expect an app-group title and its app to be annotated separately.

    The back matter holds an ``app-group`` with label and title followed
    by one ``app`` (label, title, paragraph). The group label and title
    must become ``head[@type="appendix-group"]`` and the app text must be
    found via ``figure[@xtype="appendix"]``.
    """
    app_nodes = [
        E.label(LABEL_2),
        ' ',
        E.title(SECTION_TITLE_2),
        '\n',
        E.p(TEXT_1),
    ]
    app_group_node = E(
        'app-group',
        E.label(LABEL_1),
        ' ',
        E.title(SECTION_TITLE_1),
        '\n',
        E.app(*app_nodes)
    )
    app_group_nodes = [app_group_node]
    tei_text = get_nodes_text(app_group_nodes)

    raw_tei_bytes = etree.tostring(
        get_training_tei_node(get_tei_nodes_for_text(tei_text)))
    test_helper.tei_raw_file_path.write_bytes(raw_tei_bytes)
    target_xml_bytes = etree.tostring(
        get_target_xml_node(back_nodes=app_group_nodes))
    test_helper.xml_file_path.write_bytes(target_xml_bytes)

    field_names = [
        'section_title',
        'section_paragraph',
        'appendix_group_title',
        'appendix',
    ]
    main_args = {
        **test_helper.main_args_dict,
        'fields': ','.join(field_names),
    }
    main(dict_to_args(main_args), save_main_session=False)

    annotated_root = test_helper.get_tei_auto_root()
    group_head_texts = get_xpath_text_list(
        annotated_root, '//head[@type="appendix-group"]')
    assert group_head_texts == [LABEL_1 + ' ' + SECTION_TITLE_1]
    appendix_texts = get_xpath_text_list(
        annotated_root, '//figure[@xtype="appendix"]')
    assert appendix_texts == [get_nodes_text(app_nodes)]
def test_should_annotate_references_title(
        self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
    """Expect the ref-list title to be annotated as ``ref-list-title``.

    The back matter contains a ``ref-list`` with only a title. In the raw
    TEI the title sits on its own line between two other text lines; it
    must be the only ``other[@type="ref-list-title"]`` in the result.
    """
    back_nodes = [E('ref-list', E.title(SECTION_TITLE_1))]
    raw_tei_nodes = [TEXT_1, E.lb(), SECTION_TITLE_1, E.lb(), TEXT_2]

    raw_tei_bytes = etree.tostring(get_training_tei_node(raw_tei_nodes))
    test_helper.tei_raw_file_path.write_bytes(raw_tei_bytes)
    target_xml_bytes = etree.tostring(
        get_target_xml_node(back_nodes=back_nodes))
    test_helper.xml_file_path.write_bytes(target_xml_bytes)

    field_names = [
        'section_title',
        'section_paragraph',
        'reference_list_title',
    ]
    main_args = {
        **test_helper.main_args_dict,
        'fields': ','.join(field_names),
    }
    main(dict_to_args(main_args), save_main_session=False)

    annotated_root = test_helper.get_tei_auto_root()
    ref_list_title_texts = get_xpath_text_list(
        annotated_root, '//other[@type="ref-list-title"]')
    assert ref_list_title_texts == [SECTION_TITLE_1]
def test_should_auto_annotate_single_section_title_with_label(
        self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
    """Expect a section label and title on separate lines to form one head.

    The section consists of a label, a title and a paragraph, each
    separated by a line feed. With only ``section_title`` selected, the
    single ``head`` must contain label and title joined by the line feed.
    """
    section_node = E.sec(
        E.label(SECTION_LABEL_1),
        '\n',
        E.title(SECTION_TITLE_1),
        '\n',
        E.p(TEXT_1)
    )
    body_nodes = [section_node]
    tei_text = get_nodes_text(body_nodes)

    raw_tei_bytes = etree.tostring(
        get_training_tei_node(get_tei_nodes_for_text(tei_text)))
    test_helper.tei_raw_file_path.write_bytes(raw_tei_bytes)
    target_xml_bytes = etree.tostring(
        get_target_xml_node(body_nodes=body_nodes))
    test_helper.xml_file_path.write_bytes(target_xml_bytes)

    main_args = {**test_helper.main_args_dict, 'fields': 'section_title'}
    main(dict_to_args(main_args), save_main_session=False)

    annotated_root = test_helper.get_tei_auto_root()
    head_texts = get_xpath_text_list(annotated_root, '//head')
    assert head_texts == [SECTION_LABEL_1 + '\n' + SECTION_TITLE_1]
def test_should_auto_annotate_single_back_ref_list_section_title(
        self, test_helper: SingleFileAutoAnnotateEndToEndTestHelper):
    """Expect the ref-list title in the back matter to end up as ``other``.

    The back matter holds a section with a title, followed on the next
    line by a ``ref-list`` titled ``References``. With only
    ``section_title`` selected, the ``other`` elements of the annotated
    TEI must contain exactly the ``References`` text.
    """
    back_section_node = E.sec(E.title(SECTION_TITLE_1))
    ref_list_node = E('ref-list', E.title('References'))
    back_nodes = [back_section_node, '\n', ref_list_node]
    tei_text = get_nodes_text(back_nodes)

    raw_tei_bytes = etree.tostring(
        get_training_tei_node(get_tei_nodes_for_text(tei_text)))
    test_helper.tei_raw_file_path.write_bytes(raw_tei_bytes)
    target_xml_bytes = etree.tostring(
        get_target_xml_node(back_nodes=back_nodes))
    test_helper.xml_file_path.write_bytes(target_xml_bytes)

    main_args = {**test_helper.main_args_dict, 'fields': 'section_title'}
    main(dict_to_args(main_args), save_main_session=False)

    annotated_root = test_helper.get_tei_auto_root()
    other_texts = get_xpath_text_list(annotated_root, '//other')
    assert other_texts == ['References']