def test_should_return_empty_target_annotations_for_no_matching_annotations(
        self):
    """A mapping whose xpath matches nothing produces an empty list."""
    mapping = {'article': {TAG1: 'title'}}
    # The document only holds an ``other`` element, never a ``title``.
    document = E.article(E.other(SOME_VALUE))

    result = xml_root_to_target_annotations(document, mapping)

    assert result == []
def test_should_not_apply_range_children_if_value_is_not_integer(self):
    """Non-numeric range bounds leave the child values as they are."""
    fpage, lpage = 'abc', 'xyz'
    range_config = json.dumps([{
        'min': {'xpath': 'fpage'},
        'max': {'xpath': 'lpage'}
    }])
    mapping = {
        'article': {
            TAG1: 'entry',
            TAG1 + XmlMappingSuffix.CHILDREN: 'fpage|lpage',
            TAG1 + XmlMappingSuffix.CHILDREN_RANGE: range_config
        }
    }
    document = E.article(
        E.entry(E.child1(SOME_VALUE), E.fpage(fpage), E.lpage(lpage))
    )

    result = xml_root_to_target_annotations(document, mapping)

    actual = [(annotation.name, annotation.value) for annotation in result]
    assert actual == [(TAG1, [fpage, lpage])]
def test_should_apply_range_children_as_separate_target_annotations(self):
    """A ``standalone`` range yields one annotation per number in the range."""
    num_values = [101, 102, 103, 104, 105, 106, 107]
    first_page = str(min(num_values))
    last_page = str(max(num_values))
    range_config = json.dumps([{
        'min': {'xpath': 'fpage'},
        'max': {'xpath': 'lpage'},
        'standalone': True
    }])
    mapping = {
        'article': {
            TAG1: 'entry',
            TAG1 + XmlMappingSuffix.CHILDREN: 'fpage|lpage',
            TAG1 + XmlMappingSuffix.CHILDREN_RANGE: range_config
        }
    }
    document = E.article(
        E.entry(
            E.child1(SOME_VALUE), E.fpage(first_page), E.lpage(last_page)
        )
    )

    result = xml_root_to_target_annotations(document, mapping)

    actual = [(annotation.name, annotation.value) for annotation in result]
    expected = [(TAG1, str(number)) for number in num_values]
    assert actual == expected
def test_should_not_apply_range_children_if_xpath_not_matching(self):
    """A range whose ``max`` xpath matches nothing is not expanded."""
    num_values = [101, 102, 103, 104, 105, 106, 107]
    fpage = str(min(num_values))
    lpage = str(max(num_values))
    # ``unknown`` does not exist in the document built below.
    range_config = json.dumps([{
        'min': {'xpath': 'fpage'},
        'max': {'xpath': 'unknown'}
    }])
    mapping = {
        'article': {
            TAG1: 'entry',
            TAG1 + XmlMappingSuffix.CHILDREN: 'fpage|unknown',
            TAG1 + XmlMappingSuffix.CHILDREN_RANGE: range_config
        }
    }
    document = E.article(
        E.entry(E.child1(SOME_VALUE), E.fpage(fpage), E.lpage(lpage))
    )

    result = xml_root_to_target_annotations(document, mapping)

    actual = [(annotation.name, annotation.value) for annotation in result]
    assert actual == [(TAG1, fpage)]
def test_should_not_apply_concat_children_if_one_node_was_not_found(self):
    """A concat rule referencing a missing node leaves the children as is."""
    num_values = ['101', '202']
    first_value, second_value = num_values
    # The last part of the concat rule points at a node that does not exist.
    concat_config = json.dumps([[
        {'xpath': './/fpage'},
        {'value': '-'},
        {'xpath': './/unknown'}
    ]])
    mapping = {
        'article': {
            TAG1: 'entry',
            TAG1 + XmlMappingSuffix.CHILDREN: './/*',
            TAG1 + XmlMappingSuffix.CHILDREN_CONCAT: concat_config
        }
    }
    document = E.article(
        E.entry(
            E.parent(
                E.child1(SOME_VALUE),
                E.fpage(first_value),
                E.lpage(second_value)
            )
        )
    )

    result = xml_root_to_target_annotations(document, mapping)

    actual = [(annotation.name, annotation.value) for annotation in result]
    assert actual == [(TAG1, [SOME_VALUE, first_value, second_value])]
def test_should_return_matching_target_annotations(self):
    """A single matching element yields one annotation with name and value."""
    mapping = {'article': {TAG1: 'title'}}
    document = E.article(E.title(SOME_VALUE))

    result = xml_root_to_target_annotations(document, mapping)

    assert len(result) == 1
    first = result[0]
    assert first.name == TAG1
    assert first.value == SOME_VALUE
def test_should_strip_extra_space(self):
    """Extra spaces between words are reduced to a single space.

    The two values are deliberately separated by more than one space in
    the input. With a single space the input would be identical to the
    expected value and the test could not detect missing normalisation.
    """
    xml_root = E.article(E.abstract(SOME_VALUE + '  ' + SOME_VALUE_2))
    xml_mapping = {'article': {TAG1: 'abstract'}}
    target_annotations = xml_root_to_target_annotations(
        xml_root, xml_mapping)
    assert len(target_annotations) == 1
    assert target_annotations[0].name == TAG1
    # Expected value uses exactly one space between the two values.
    assert target_annotations[0].value == SOME_VALUE + ' ' + SOME_VALUE_2
def test_should_return_full_text(self):
    """Text of nested elements is included in the annotation value."""
    mapping = {'article': {TAG1: 'title'}}
    # ``title`` mixes its own text with an embedded child element.
    title = E.title('some ', E.other('embedded'), ' text')
    document = E.article(title)

    result = xml_root_to_target_annotations(document, mapping)

    assert len(result) == 1
    first = result[0]
    assert first.name == TAG1
    assert first.value == 'some embedded text'
def convert(args):
    """Convert an LXML document to one SVG file per page.

    Attributes read from ``args``:

    * ``lxml_path``: path of the LXML file to parse.
    * ``svg_path``: output filename pattern, formatted with the 1-based
      page number; when falsy the pattern is derived from ``lxml_path``.
    * ``annotate``: when truthy, the pages are annotated, visualised and
      evaluated.
    * ``debug_match``: optional CSV output for match details.
    * ``xml_path`` / ``xml_mapping_path``: optional XML and mapping used
      to add a ``MatchingAnnotator``.
    * ``annotation_evaluation_csv``: optional CSV output for the per-page
      evaluation results.
    """
    logger = get_logger()
    svg_filename_pattern = args.svg_path
    if not svg_filename_pattern:
        svg_filename_pattern = svg_pattern_for_lxml_path(args.lxml_path)
    logger.debug('svg_filename_pattern: %s', svg_filename_pattern)
    lxml_root = etree.parse(args.lxml_path).getroot()

    match_detail_reporter = None
    # Everything after the reporter may have been opened runs inside
    # try/finally so the reporter is closed even if a later step raises.
    try:
        if args.annotate:
            annotators = DEFAULT_ANNOTATORS
            if args.debug_match:
                match_detail_reporter = CsvMatchDetailReporter(
                    open_csv_output(args.debug_match),
                    args.debug_match
                )
            if args.xml_path:
                xml_mapping = parse_xml_mapping(args.xml_mapping_path)
                target_annotations = xml_root_to_target_annotations(
                    etree.parse(args.xml_path).getroot(),
                    xml_mapping
                )
                # Build a new list rather than mutating DEFAULT_ANNOTATORS.
                annotators = annotators + [MatchingAnnotator(
                    target_annotations,
                    match_detail_reporter=match_detail_reporter,
                    use_tag_begin_prefix=True
                )]
            annotator = Annotator(annotators)
        else:
            annotator = None

        if annotator:
            # The pages are used several times (annotate, write, evaluate),
            # so they are materialised into a list.
            svg_roots = list(iter_svg_pages_for_lxml(lxml_root))
            annotator.annotate(SvgStructuredDocument(svg_roots))
        else:
            # Only iterated once below, so the iterable is used directly.
            svg_roots = iter_svg_pages_for_lxml(lxml_root)

        for page_index, svg_root in enumerate(svg_roots):
            if annotator:
                svg_root = visualize_svg_annotations(svg_root)
            svg_filename = svg_filename_pattern.format(1 + page_index)
            logger.info('writing to: %s', svg_filename)
            with open(svg_filename, 'wb') as f:
                etree.ElementTree(svg_root).write(f, pretty_print=True)

        if annotator:
            tagging_evaluation_results = evaluate_document_by_page(
                SvgStructuredDocument(svg_roots)
            )
            logger.info(
                'tagging evaluation:\n%s',
                '\n'.join([
                    'page{}: {}'.format(1 + i, r)
                    for i, r in enumerate(tagging_evaluation_results)
                ])
            )
            if args.annotation_evaluation_csv:
                write_dict_csv(
                    args.annotation_evaluation_csv,
                    DEFAULT_EVALUATION_COLUMNS,
                    to_annotation_evaluation_csv_dict_rows(
                        tagging_evaluation_results,
                        document=os.path.basename(args.lxml_path)
                    )
                )
    finally:
        if match_detail_reporter:
            match_detail_reporter.close()
def test_should_apply_match_require_next_flag(self):
    """A ``'true'`` REQUIRE_NEXT mapping entry sets ``require_next``."""
    tag_mapping = {
        TAG1: 'title',
        TAG1 + XmlMappingSuffix.REQUIRE_NEXT: 'true'
    }
    mapping = {'article': tag_mapping}
    document = E.article(E.title(SOME_VALUE))

    result = xml_root_to_target_annotations(document, mapping)

    flags = [annotation.require_next for annotation in result]
    assert flags == [True]
def test_should_apply_match_multiple_flag(self):
    """A ``'true'`` MATCH_MULTIPLE mapping entry sets ``match_multiple``."""
    tag_mapping = {
        TAG1: 'title',
        TAG1 + XmlMappingSuffix.MATCH_MULTIPLE: 'true'
    }
    mapping = {'article': tag_mapping}
    document = E.article(E.title(SOME_VALUE))

    result = xml_root_to_target_annotations(document, mapping)

    flags = [annotation.match_multiple for annotation in result]
    assert flags == [True]
def test_should_apply_match_bonding_flag(self):
    """A ``'true'`` BONDING mapping entry sets ``bonding``."""
    tag_mapping = {
        TAG1: 'title',
        TAG1 + XmlMappingSuffix.BONDING: 'true'
    }
    mapping = {'article': tag_mapping}
    document = E.article(E.title(SOME_VALUE))

    result = xml_root_to_target_annotations(document, mapping)

    flags = [annotation.bonding for annotation in result]
    assert flags == [True]
def test_should_extract_single_value_if_its_the_only_value(self):
    """When the text is only the number, the value stays a plain string."""
    tag_mapping = {
        TAG1: 'entry',
        TAG1 + XmlMappingSuffix.EXTRACT_REGEX: r'.*\b(\d+)\b.*'
    }
    mapping = {'article': tag_mapping}
    document = E.article(E.entry(E.value('12345')))

    result = xml_root_to_target_annotations(document, mapping)

    assert len(result) == 1
    actual = [(annotation.name, annotation.value) for annotation in result]
    assert actual == [(TAG1, '12345')]
def test_should_use_multiple_xpaths(self):
    """Newline-separated xpaths in one mapping entry are all applied."""
    # Both xpaths go into one newline-delimited mapping value.
    xpaths = '\n{}\n{}\n'.format('entry/child1', 'entry/child2')
    mapping = {'article': {TAG1: xpaths}}
    document = E.article(
        E.entry(E.child1(SOME_VALUE), E.child2(SOME_VALUE_2))
    )

    result = xml_root_to_target_annotations(document, mapping)

    actual = [(annotation.name, annotation.value) for annotation in result]
    expected = [(TAG1, SOME_VALUE), (TAG1, SOME_VALUE_2)]
    assert actual == expected
def test_should_extract_numbers_from_value_after_text(self):
    """Text followed by a number yields full text, text part and number."""
    number = '12345'
    full_text = SOME_VALUE + ' 12345'
    tag_mapping = {
        TAG1: 'entry',
        TAG1 + XmlMappingSuffix.EXTRACT_REGEX: r'.*\b(\d+)\b.*'
    }
    mapping = {'article': tag_mapping}
    document = E.article(E.entry(E.value(full_text)))

    result = xml_root_to_target_annotations(document, mapping)

    assert len(result) == 1
    # Values are compared as sets, so their order is not asserted.
    actual = [
        (annotation.name, set(annotation.value)) for annotation in result
    ]
    assert actual == [(TAG1, {full_text, SOME_VALUE, number})]
def test_should_apply_regex_to_result(self):
    """The REGEX mapping entry reduces the text to its captured group."""
    tag_mapping = {
        TAG1: 'title',
        TAG1 + XmlMappingSuffix.REGEX: r'(?:\d+\.?)* ?(.*)'
    }
    mapping = {'article': tag_mapping}
    # The title carries a numbering prefix in front of the value.
    document = E.article(E.title('1.1. ' + SOME_VALUE))

    result = xml_root_to_target_annotations(document, mapping)

    assert len(result) == 1
    first = result[0]
    assert first.name == TAG1
    assert first.value == SOME_VALUE
def test_should_add_sub_annotations_with_multiple_values(self):
    """Each element matched by a sub xpath gets its own sub annotation."""
    tag_mapping = {
        TAG1: 'entry',
        TAG1 + XmlMappingSuffix.SUB + '.value': './value'
    }
    mapping = {'article': tag_mapping}
    document = E.article(
        E.entry(E.value(SOME_VALUE), E.value(SOME_VALUE_2))
    )

    result = xml_root_to_target_annotations(document, mapping)

    actual = [(sub.name, sub.value) for sub in result[0].sub_annotations]
    expected = [('value', SOME_VALUE), ('value', SOME_VALUE_2)]
    assert actual == expected
def test_should_apply_children_xpaths_and_exclude_parents(self):
    """With ``.//*`` as children xpath only the leaf values are expected."""
    tag_mapping = {
        TAG1: 'entry',
        TAG1 + XmlMappingSuffix.CHILDREN: './/*'
    }
    mapping = {'article': tag_mapping}
    # ``parent`` is matched by the xpath too but is absent from the
    # expected value list below.
    parent = E.parent(
        E.child2(SOME_LONGER_VALUE), E.child1(SOME_SHORTER_VALUE)
    )
    document = E.article(E.entry(parent))

    result = xml_root_to_target_annotations(document, mapping)

    actual = [(annotation.name, annotation.value) for annotation in result]
    assert actual == [(TAG1, [SOME_LONGER_VALUE, SOME_SHORTER_VALUE])]
def test_should_add_sub_annotations(self):
    """Sub mapping entries produce named sub annotations on the parent."""
    sub_prefix = TAG1 + XmlMappingSuffix.SUB
    tag_mapping = {
        TAG1: 'entry',
        sub_prefix + '.firstname': './firstname',
        sub_prefix + '.givennames': './givennames',
    }
    mapping = {'article': tag_mapping}
    document = E.article(
        E.entry(E.firstname(SOME_VALUE), E.givennames(SOME_VALUE_2))
    )

    result = xml_root_to_target_annotations(document, mapping)

    actual = [(sub.name, sub.value) for sub in result[0].sub_annotations]
    expected = [('firstname', SOME_VALUE), ('givennames', SOME_VALUE_2)]
    assert actual == expected
def test_should_return_target_annotations_in_order_of_xml(self):
    """Annotations follow document order, not the order of mapping keys."""
    mapping = {'article': {TAG1: 'tag1', TAG2: 'tag2'}}
    # The two tags alternate in the document.
    document = E.article(
        E.tag1('tag1.1'),
        E.tag2('tag2.1'),
        E.tag1('tag1.2'),
        E.tag2('tag2.2'),
    )

    result = xml_root_to_target_annotations(document, mapping)

    actual = [(annotation.name, annotation.value) for annotation in result]
    expected = [
        (TAG1, 'tag1.1'),
        (TAG2, 'tag2.1'),
        (TAG1, 'tag1.2'),
        (TAG2, 'tag2.2'),
    ]
    assert actual == expected
def test_should_apply_multiple_children_xpaths_and_include_parent_text_if_enabled(
        self):
    """With UNMATCHED_PARENT_TEXT the entry's own text is included too."""
    children_xpaths = '\n{}\n{}\n'.format('.//*', '.')
    tag_mapping = {
        TAG1: 'entry',
        TAG1 + XmlMappingSuffix.CHILDREN: children_xpaths,
        TAG1 + XmlMappingSuffix.UNMATCHED_PARENT_TEXT: 'true'
    }
    mapping = {'article': tag_mapping}
    # ``entry`` holds a child element followed by text of its own.
    document = E.article(
        E.entry(E.child1(SOME_SHORTER_VALUE), SOME_LONGER_VALUE)
    )

    result = xml_root_to_target_annotations(document, mapping)

    actual = [(annotation.name, annotation.value) for annotation in result]
    assert actual == [(TAG1, [SOME_LONGER_VALUE, SOME_SHORTER_VALUE])]
def convert_and_annotate_lxml_content(lxml_content, xml_content, xml_mapping,
                                      name=None):
    """Convert LXML content to annotated, visualised SVG page roots.

    Parses ``lxml_content`` and ``xml_content``, extracts target annotations
    from the XML using ``xml_mapping``, converts the LXML to SVG pages,
    annotates them and applies the annotation visualisation. Each step is
    timed and the timings are logged together with ``name`` and the input
    sizes.

    Returns the list of visualised SVG roots.
    """
    timer = StopWatchRecorder()

    timer.start('parse lxml')
    parsed_lxml = etree.fromstring(lxml_content)

    # Parse the XML leniently, as XML errors are not uncommon.
    timer.start('parse xml')
    parsed_xml = xml_from_string_with_recover(xml_content)

    timer.start('extract target annotations')
    target_annotations = xml_root_to_target_annotations(
        parsed_xml, xml_mapping)
    timer.stop()

    matching_annotator = MatchingAnnotator(
        target_annotations, use_tag_begin_prefix=True)
    annotator = Annotator(DEFAULT_ANNOTATORS + [matching_annotator])

    timer.start('convert to svg')
    pages = list(iter_svg_pages_for_lxml(parsed_lxml))

    timer.start('annotate svg')
    annotator.annotate(SvgStructuredDocument(pages))

    timer.start('add visualisation')
    visualised_pages = []
    for page in pages:
        visualised_pages.append(visualize_svg_annotations(page))
    timer.stop()

    lxml_size = format(len(lxml_content), ',')
    xml_size = format(len(xml_content), ',')
    get_logger().info(
        'processed: name=%s, lxml size=%s, xml size=%s, timings=[%s] (native align impl=%s)',
        name, lxml_size, xml_size, timer, align_native_enabled
    )
    return visualised_pages
def test_should_unnest_extract_value_from_children(self):
    """Extracted parts of all children end up in one flat value collection."""
    first_text = SOME_VALUE + ' 12345'
    second_text = SOME_VALUE_2 + ' 54321'
    tag_mapping = {
        TAG1: 'entry',
        TAG1 + XmlMappingSuffix.CHILDREN: r'.//*',
        TAG1 + XmlMappingSuffix.EXTRACT_REGEX: r'.*\b(\d+)\b.*'
    }
    mapping = {'article': tag_mapping}
    document = E.article(
        E.entry(E.value(first_text), E.value(second_text))
    )

    result = xml_root_to_target_annotations(document, mapping)

    assert len(result) == 1
    # Values are compared as sets, so their order is not asserted.
    actual = [
        (annotation.name, set(annotation.value)) for annotation in result
    ]
    expected_values = {
        first_text, SOME_VALUE, '12345',
        second_text, SOME_VALUE_2, '54321'
    }
    assert actual == [(TAG1, expected_values)]
def test_should_not_apply_match_require_next_flag_if_not_set(self):
    """Without a REQUIRE_NEXT mapping entry ``require_next`` is False."""
    mapping = {'article': {TAG1: 'title'}}
    document = E.article(E.title(SOME_VALUE))

    result = xml_root_to_target_annotations(document, mapping)

    flags = [annotation.require_next for annotation in result]
    assert flags == [False]
def test_should_return_empty_target_annotations_for_empty_xml(self):
    """An article without children produces no target annotations."""
    mapping = {'article': {'title': 'title'}}
    document = E.article()

    result = xml_root_to_target_annotations(document, mapping)

    assert result == []