def test_should_annotate_over_multiple_lines_with_tag_transition(self):
    """Two annotations spanning multiple lines should each be fully tagged,
    including when the tag transition happens in the middle of a line."""
    tag1_lines = [_tokens_for_text('this may'), _tokens_for_text('match')]
    tag2_lines = [_tokens_for_text('another'), _tokens_for_text('tag here')]
    tag1_tokens = flatten(tag1_lines)
    tag2_tokens = flatten(tag2_lines)
    # the middle line mixes the tail of the TAG1 match with the head of TAG2
    doc = _document_for_tokens([
        tag1_lines[0],
        tag1_lines[1] + tag2_lines[0],
        tag2_lines[1]
    ])
    target_annotations = [
        TargetAnnotation('this may match', TAG1),
        TargetAnnotation('another tag here', TAG2)
    ]
    SimpleMatchingAnnotator(target_annotations).annotate(doc)
    assert _get_tag_values_of_tokens(tag1_tokens) == [TAG1] * len(tag1_tokens)
    assert _get_tag_values_of_tokens(tag2_tokens) == [TAG2] * len(tag2_tokens)
def test_should_annotate_but_not_merge_multiple_authors_annotation_too_far_apart(self):
    """Two author annotations separated by many tokens should each be tagged
    individually without the gap between them being merged in."""
    pre_tokens = _tokens_for_text('this is')
    first_author = _tokens_for_text('john smith')
    gap_tokens = _tokens_for_text('etc') * 5
    second_author = _tokens_for_text('mary maison')
    post_tokens = _tokens_for_text('the author')
    doc = _document_for_tokens([
        pre_tokens, first_author, gap_tokens, second_author, post_tokens
    ])
    SimpleMatchingAnnotator([
        TargetAnnotation(['john', 'smith'], TAG1),
        TargetAnnotation(['mary', 'maison'], TAG1)
    ]).annotate(doc)
    assert _get_tag_values_of_tokens(first_author) == [TAG1] * len(first_author)
    assert _get_tag_values_of_tokens(second_author) == [TAG1] * len(second_author)
    # everything outside the two author spans must stay untagged
    for untagged in (gap_tokens, pre_tokens, post_tokens):
        assert _get_tag_values_of_tokens(untagged) == [None] * len(untagged)
def test_should_fail_with_unsupported_annotation_attribute_sub_annotations(self):
    """Sub-annotations are unsupported by default and should raise
    NotImplementedError when annotating."""
    with pytest.raises(NotImplementedError):
        annotation = TargetAnnotation(
            'test', TAG1,
            sub_annotations=[TargetAnnotation('sub', TAG2)]
        )
        doc = _document_for_tokens([_tokens_for_text('test')])
        SimpleMatchingAnnotator([annotation]).annotate(doc)
def test_should_annotate_separate_author_aff_with_begin_prefix(self):
    """Each separately matched affiliation should start with a B- tag
    followed by I- tags for the remaining tokens."""
    first_aff = _tokens_for_text('University of Science')
    second_aff = _tokens_for_text('University of Madness')
    doc = _document_for_tokens([first_aff, second_aff])
    SimpleMatchingAnnotator([
        TargetAnnotation(['1', 'University of Science'], TAG1),
        TargetAnnotation(['2', 'University of Madness'], TAG1)
    ], tag_config_map={}).annotate(doc)
    for aff_tokens in (first_aff, second_aff):
        expected = [B_TAG1] + [I_TAG1] * (len(aff_tokens) - 1)
        assert _get_tags_of_tokens(aff_tokens) == expected
def test_should_annotate_ignoring_dots_after_capitals_in_document(self):
    """'P.O.' in the document should still match 'PO' in the target value."""
    tokens = _tokens_for_text('P.O. Box 12345')
    doc = _document_for_tokens([tokens])
    SimpleMatchingAnnotator([TargetAnnotation('PO Box 12345', TAG1)]).annotate(doc)
    assert _get_tags_of_tokens(tokens) == [TAG1] * len(tokens)
def test_should_annotate_including_final_dot(self):
    """A trailing full stop should be included in the match."""
    tokens = _tokens_for_text('this is matching.')
    doc = _document_for_tokens([tokens])
    SimpleMatchingAnnotator([TargetAnnotation('this is matching.', TAG1)]).annotate(doc)
    assert _get_tag_values_of_tokens(tokens) == [TAG1] * len(tokens)
def test_should_annotate_ignoring_space_after_dot_short_sequence(self):
    """The single token 'A.B.,' should match 'A. B.' despite spacing differences."""
    tokens = [SimpleToken('A.B.,')]
    doc = _document_for_tokens([tokens])
    SimpleMatchingAnnotator([TargetAnnotation('A. B.', TAG1)]).annotate(doc)
    assert _get_tag_values_of_tokens(tokens) == [TAG1] * len(tokens)
def test_should_match_single_quotes_with_double_quotes(self):
    """Quote style (single vs double) should not prevent a match."""
    tokens = _tokens_for_text('"this is matching"')
    doc = SimpleStructuredDocument(lines=[SimpleLine(tokens)])
    SimpleMatchingAnnotator([
        TargetAnnotation('\'this is matching\'', TAG1)
    ]).annotate(doc)
    assert _get_tag_values_of_tokens(tokens) == [TAG1] * len(tokens)
def test_should_match_case_insensitive(self):
    """Matching should ignore letter case."""
    tokens = _tokens_for_text('This Is Matching')
    doc = SimpleStructuredDocument(lines=[SimpleLine(tokens)])
    SimpleMatchingAnnotator([TargetAnnotation('tHIS iS mATCHING', TAG1)]).annotate(doc)
    assert _get_tag_values_of_tokens(tokens) == [TAG1] * len(tokens)
def test_should_not_annotate_with_local_matching(self):
    """A partial (local) match of a longer target value should not be annotated."""
    tokens = _tokens_for_text('this is matching')
    doc = _document_for_tokens([tokens])
    SimpleMatchingAnnotator([
        TargetAnnotation('this is matching but not fully matching', TAG1)
    ]).annotate(doc)
    assert _get_tags_of_tokens(tokens) == [None] * len(tokens)
def test_should_not_annotate_not_matching(self):
    """Completely different text should remain untagged."""
    tokens = _tokens_for_text('something completely different')
    doc = _document_for_tokens([tokens])
    SimpleMatchingAnnotator([TargetAnnotation('this is matching', TAG1)]).annotate(doc)
    assert _get_tags_of_tokens(tokens) == [None] * len(tokens)
def test_should_not_annotate_author_aff_label_between_author_names(self):
    """Affiliation label digits inside the author line should not be pulled
    into the affiliation tag; authors get TAG1, the affiliation line TAG2."""
    author_tokens = _tokens_for_text('Mary 1 , Smith 1')
    aff_tokens = _tokens_for_text('University of Science')
    doc = _document_for_tokens([author_tokens, aff_tokens])
    SimpleMatchingAnnotator(
        [
            TargetAnnotation(['Mary', 'Smith'], TAG1),
            TargetAnnotation(['1', 'University of Science'], TAG2)
        ],
        tag_config_map={TAG1: SimpleTagConfig(extend_to_line_enabled=True)}
    ).annotate(doc)
    assert _get_tag_values_of_tokens(author_tokens) == [TAG1] * len(author_tokens)
    assert _get_tag_values_of_tokens(aff_tokens) == [TAG2] * len(aff_tokens)
def test_should_not_annotate_fuzzily_matching_with_many_differences(self):
    """Too many character-level differences should defeat fuzzy matching."""
    tokens = _tokens_for_text('this is matching')
    doc = _document_for_tokens([tokens])
    SimpleMatchingAnnotator([
        TargetAnnotation('txhxixsx ixsx mxaxtxcxhxixnxgx', TAG1)
    ]).annotate(doc)
    assert _get_tags_of_tokens(tokens) == [None] * len(tokens)
def test_should_annotate_references_with_sub_tag_with_extend_to_line(self):
    """The reference label should receive the sub tag (TAG2) while the whole
    line receives the main tag (TAG1)."""
    reference_lines = [_tokens_for_text('1 this is reference A')]
    reference_tokens = flatten(reference_lines)
    filler_lines = [_tokens_for_text('previous line')] * 5
    doc = _document_for_tokens(filler_lines + reference_lines)
    SimpleMatchingAnnotator(
        [
            TargetAnnotation(
                '1 this is reference A', TAG1,
                sub_annotations=[TargetAnnotation('1', TAG2)]
            ),
        ],
        lookahead_sequence_count=3,
        extend_to_line_enabled=True,
        use_sub_annotations=True
    ).annotate(doc)
    LOGGER.debug('doc: %s', _get_document_token_tags(doc))
    assert _get_tag_values_of_tokens(reference_tokens) == [TAG1] * len(reference_tokens)
    # only the leading '1' carries the sub tag
    assert _get_sub_tag_values_of_tokens(reference_tokens) == (
        [TAG2] + [None] * (len(reference_tokens) - 1))
def test_should_annotate_references(self):
    """Multiple numbered reference lines should each be fully tagged."""
    reference_lines = [
        _tokens_for_text('1 this is reference A'),
        _tokens_for_text('2 this is reference B'),
        _tokens_for_text('3 this is reference C')
    ]
    reference_tokens = flatten(reference_lines)
    filler_lines = [_tokens_for_text('previous line')] * 5
    doc = _document_for_tokens(filler_lines + reference_lines)
    SimpleMatchingAnnotator([
        TargetAnnotation('this is reference A', TAG1),
        TargetAnnotation('this is reference B', TAG1),
        TargetAnnotation('this is reference C', TAG1)
    ], lookahead_sequence_count=3).annotate(doc)
    LOGGER.debug('doc: %s', _get_document_token_tags(doc))
    assert _get_tag_values_of_tokens(reference_tokens) == [TAG1] * len(reference_tokens)
def test_should_annotate_whole_line(self):
    """With extend_to_line enabled, the whole author line should be tagged
    while surrounding lines stay untagged.

    NOTE(review): the original test built the document from ``matching_tokens``
    only, which made the pre/post assertions vacuous (tokens never added to
    the document trivially carry no tag). The pre/post tokens are now included
    as separate lines so those assertions actually verify that extend-to-line
    does not leak beyond the matching line.
    """
    pre_tokens = _tokens_for_text('this is')
    matching_tokens = _tokens_for_text('john smith 1, mary maison 2')
    post_tokens = _tokens_for_text('the author')
    target_annotations = [
        TargetAnnotation(['john', 'smith'], TAG1),
        TargetAnnotation(['mary', 'maison'], TAG1)
    ]
    # pre/post tokens live on their own lines so line extension must not reach them
    doc = _document_for_tokens([pre_tokens, matching_tokens, post_tokens])
    SimpleMatchingAnnotator(
        target_annotations,
        tag_config_map={TAG1: SimpleTagConfig(extend_to_line_enabled=True)}
    ).annotate(doc)
    assert _get_tag_values_of_tokens(matching_tokens) == [TAG1] * len(matching_tokens)
    assert _get_tag_values_of_tokens(pre_tokens) == [None] * len(pre_tokens)
    assert _get_tag_values_of_tokens(post_tokens) == [None] * len(post_tokens)
def test_should_annotate_abstract_section_heading(self):
    """The 'Abstract' heading should be pulled into the match via the
    configured match prefix regex."""
    tokens = _tokens_for_text('Abstract\nthis is matching.')
    doc = _document_for_tokens([tokens])
    SimpleMatchingAnnotator(
        [TargetAnnotation('this is matching.', TAG1)],
        tag_config_map={
            TAG1: SimpleTagConfig(match_prefix_regex=r'(abstract|summary)\s*$')
        }
    ).annotate(doc)
    assert _get_tag_values_of_tokens(tokens) == [TAG1] * len(tokens)
def test_should_annotate_exactly_matching_across_multiple_lines(self):
    """An exact match spanning two lines should tag every token on both lines."""
    lines = [
        _tokens_for_text('this is matching'),
        _tokens_for_text('and continues here')
    ]
    all_tokens = flatten(lines)
    doc = _document_for_tokens(lines)
    SimpleMatchingAnnotator([
        TargetAnnotation('this is matching and continues here', TAG1)
    ]).annotate(doc)
    assert _get_tag_values_of_tokens(all_tokens) == [TAG1] * len(all_tokens)
def test_should_annotate_multiple_value_annotation_in_reverse_order(self):
    """Multi-value annotations should match regardless of value order."""
    pre_tokens = _tokens_for_text('this is')
    name_tokens = _tokens_for_text('john smith')
    post_tokens = _tokens_for_text('the author')
    doc = _document_for_tokens([pre_tokens + name_tokens + post_tokens])
    # values given in reverse of their document order
    SimpleMatchingAnnotator([TargetAnnotation(['smith', 'john'], TAG1)]).annotate(doc)
    assert _get_tag_values_of_tokens(name_tokens) == [TAG1] * len(name_tokens)
    for untagged in (pre_tokens, post_tokens):
        assert _get_tag_values_of_tokens(untagged) == [None] * len(untagged)
def test_should_prefer_word_boundaries(self):
    """'is' should match the standalone word, not the substring inside 'miss'."""
    pre_tokens = _tokens_for_text('this')
    word_tokens = _tokens_for_text('is')
    post_tokens = _tokens_for_text('miss')
    doc = _document_for_tokens([pre_tokens + word_tokens + post_tokens])
    SimpleMatchingAnnotator([TargetAnnotation('is', TAG1)]).annotate(doc)
    assert _get_tag_values_of_tokens(word_tokens) == [TAG1] * len(word_tokens)
    for untagged in (pre_tokens, post_tokens):
        assert _get_tag_values_of_tokens(untagged) == [None] * len(untagged)
def test_should_annotate_using_alternative_spellings(self):
    """A configured alternative spelling should match in place of the
    literal target value."""
    tokens = _tokens_for_text('this is matching')
    doc = _document_for_tokens([tokens])
    tag_config = SimpleTagConfig(alternative_spellings={
        'alternative spelling': ['this is matching']
    })
    SimpleMatchingAnnotator(
        [TargetAnnotation('alternative spelling', TAG1)],
        tag_config_map={TAG1: tag_config}
    ).annotate(doc)
    assert _get_tag_values_of_tokens(tokens) == [TAG1] * len(tokens)
def test_should_annotate_author_aff_preceding_number(self):
    """A marker number preceding the affiliation should be tagged too via
    the configured match prefix regex."""
    number_tokens = _tokens_for_text('1')
    aff_tokens = _tokens_for_text('this is matching')
    doc = _document_for_tokens([number_tokens, aff_tokens])
    SimpleMatchingAnnotator(
        [TargetAnnotation('this is matching', TAG1)],
        tag_config_map={
            TAG1: SimpleTagConfig(match_prefix_regex=r'(?=^|\n)\d\s*$')
        }
    ).annotate(doc)
    assert _get_tag_values_of_tokens(number_tokens) == [TAG1] * len(number_tokens)
    assert _get_tag_values_of_tokens(aff_tokens) == [TAG1] * len(aff_tokens)
def test_should_not_annotate_multiple_value_annotation_too_far_away(self):
    """Only the nearby value of a multi-value annotation should be tagged
    when the other value is too far away in the document."""
    pre_tokens = _tokens_for_text('this is')
    near_tokens = _tokens_for_text('smith')
    # 'john' is separated from 'smith' by a long run of filler tokens
    far_tokens = _tokens_for_text('etc') * 40 + _tokens_for_text('john')
    doc = _document_for_tokens([pre_tokens + near_tokens + far_tokens])
    SimpleMatchingAnnotator([TargetAnnotation(['john', 'smith'], TAG1)]).annotate(doc)
    LOGGER.debug('doc: %s', _get_document_token_tags(doc))
    assert _get_tag_values_of_tokens(near_tokens) == [TAG1] * len(near_tokens)
    for untagged in (pre_tokens, far_tokens):
        assert _get_tag_values_of_tokens(untagged) == [None] * len(untagged)
def xml_root_to_target_annotations(xml_root, xml_mapping):
    """Convert an XML document into a list of TargetAnnotation objects
    driven by the per-tag configuration in xml_mapping.

    :param xml_root: root element of the parsed XML document
    :param xml_mapping: mapping-like config; the section for ``xml_root.tag``
        maps field names to xpaths plus suffixed option keys (XmlMappingSuffix.*)
    :return: list of TargetAnnotation, ordered by (-priority, document position)
    :raises Exception: if ``xml_root.tag`` has no section in ``xml_mapping``
    """
    if xml_root.tag not in xml_mapping:
        raise Exception("unrecognised tag: {} (available: {})".format(
            xml_root.tag, xml_mapping.sections()))
    mapping = xml_mapping[xml_root.tag]
    # keys containing '.' are option keys (field + suffix), not field names
    field_names = [k for k in mapping.keys() if '.' not in k]

    def get_mapping_flag(k, suffix):
        # option flags are stored as the string 'true'
        return mapping.get(k + suffix) == 'true'

    def get_match_multiple(k):
        return get_mapping_flag(k, XmlMappingSuffix.MATCH_MULTIPLE)

    def get_bonding_flag(k):
        return get_mapping_flag(k, XmlMappingSuffix.BONDING)

    def get_require_next_flag(k):
        return get_mapping_flag(k, XmlMappingSuffix.REQUIRE_NEXT)

    get_unmatched_parent_text_flag = (
        lambda k: get_mapping_flag(k, XmlMappingSuffix.UNMATCHED_PARENT_TEXT))

    LOGGER.debug('fields: %s', field_names)

    # accumulate (sort_key, TargetAnnotation) pairs; sort_key orders by
    # descending priority first, then by node position in the document
    target_annotations_with_pos = []
    # document-order index of every node, used for stable position sorting
    xml_pos_by_node = {node: i for i, node in enumerate(xml_root.iter())}
    for k in field_names:
        # per-field options read from the suffixed mapping keys
        match_multiple = get_match_multiple(k)
        bonding = get_bonding_flag(k)
        require_next = get_require_next_flag(k)
        unmatched_parent_text = get_unmatched_parent_text_flag(k)
        exclude_children_xpaths = parse_xpaths(
            mapping.get(k + XmlMappingSuffix.IGNORE))
        LOGGER.debug('exclude_children_xpaths (%s): %s', k, exclude_children_xpaths)
        children_xpaths = parse_xpaths(
            mapping.get(k + XmlMappingSuffix.CHILDREN))
        children_concat = parse_json_with_default(
            mapping.get(k + XmlMappingSuffix.CHILDREN_CONCAT), [])
        children_range = parse_json_with_default(
            mapping.get(k + XmlMappingSuffix.CHILDREN_RANGE), [])
        re_compiled_pattern = re_compile_or_none(
            mapping.get(k + XmlMappingSuffix.REGEX))
        extract_re_compiled_pattern = re_compile_or_none(
            mapping.get(k + XmlMappingSuffix.EXTRACT_REGEX))
        LOGGER.debug('extract_re_compiled_pattern (%s): %s',
                     k, extract_re_compiled_pattern)
        priority = int(mapping.get(k + XmlMappingSuffix.PRIORITY, '0'))
        sub_xpaths = get_sub_mapping(mapping, k)
        LOGGER.debug('sub_xpaths (%s): %s', k, sub_xpaths)
        use_raw_text_value = mapping.get(k + XmlMappingSuffix.USE_RAW_TEXT)
        # explicit config wins; None means "decide per element" below
        use_raw_text_config = strtobool(
            use_raw_text_value) if use_raw_text_value else None
        xpaths = parse_xpaths(mapping[k])
        LOGGER.debug('xpaths(%s): %s', k, xpaths)
        for e in match_xpaths(xml_root, xpaths):
            e_pos = xml_pos_by_node.get(e)
            exclude_childrens = list(match_xpaths(e, exclude_children_xpaths))
            LOGGER.debug('exclude_childrens (%s, %s): %s', k, e, exclude_childrens)
            sub_annotations = extract_sub_annotations(e, sub_xpaths, mapping, k)
            LOGGER.debug('sub_annotations (%s): %s', k, sub_annotations)
            use_raw_text = (use_raw_text_config
                            if use_raw_text_config is not None
                            else contains_raw_text(e))
            # wildcard children xpaths are skipped when raw text is preferred
            should_use_children_xpaths = (
                children_xpaths and
                (not is_wildcard_children_xpaths(children_xpaths) or
                 not use_raw_text))
            if should_use_children_xpaths:
                text_content_list, standalone_values = extract_children(
                    e, children_xpaths, children_concat, children_range,
                    unmatched_parent_text)
            else:
                text_content_list = filter_truthy(
                    strip_all([
                        get_raw_text_content(
                            e, exclude_childrens=exclude_childrens)
                    ]))
                standalone_values = []
            LOGGER.debug(
                'text_content_list: %s, standalone_values: %s,'
                ' children_xpaths: %s, use_raw_text: %s',
                text_content_list, standalone_values, children_xpaths,
                use_raw_text)
            # optional regex post-processing stages, applied in order
            if re_compiled_pattern:
                text_content_list = filter_truthy([
                    apply_pattern(s, re_compiled_pattern)
                    for s in text_content_list
                ])
            if extract_re_compiled_pattern:
                text_content_list = filter_truthy([
                    extract_using_regex(s, extract_re_compiled_pattern)
                    for s in text_content_list
                ])
            text_content_list = flatten_if_nested(text_content_list)
            if text_content_list:
                # single value kept as-is; multiple values sorted longest first
                value = (text_content_list[0]
                         if len(text_content_list) == 1
                         else sorted(text_content_list, key=lambda s: -len(s)))
                target_annotations_with_pos.append(
                    ((-priority, e_pos),
                     TargetAnnotation(value, k,
                                      match_multiple=match_multiple,
                                      bonding=bonding,
                                      require_next=require_next,
                                      sub_annotations=sub_annotations)))
            if standalone_values:
                # each standalone value becomes its own annotation, ordered
                # after the main annotation via the extra index in the sort key
                for i, standalone_value in enumerate(standalone_values):
                    target_annotations_with_pos.append(
                        ((-priority, e_pos, i),
                         TargetAnnotation(standalone_value, k,
                                          match_multiple=match_multiple,
                                          bonding=bonding,
                                          sub_annotations=sub_annotations)))
    target_annotations_with_pos = sorted(target_annotations_with_pos,
                                         key=lambda x: x[0])
    LOGGER.debug('target_annotations_with_pos:\n%s', target_annotations_with_pos)
    target_annotations = [x[1] for x in target_annotations_with_pos]
    LOGGER.debug('target_annotations:\n%s',
                 '\n'.join([' ' + str(a) for a in target_annotations]))
    # serializing the XML is expensive; only do it when debug logging is on
    if not target_annotations and LOGGER.isEnabledFor(logging.DEBUG):
        LOGGER.debug(
            'no target_annotations found for\nxml_mapping: %s\nxml: %s',
            xml_mapping, etree.tostring(xml_root, encoding='unicode'))
    return target_annotations
def test_should_not_fail_on_empty_line_with_blank_token(self):
    """Annotating a document containing only a blank token should not raise."""
    doc = _document_for_tokens([[SimpleToken('')]])
    SimpleMatchingAnnotator([
        TargetAnnotation('this is. matching', TAG1)
    ]).annotate(doc)