예제 #1
0
 def test_should_annotate_over_multiple_lines_with_tag_transition(self):
     """TAG1 spans lines 1-2; TAG2 starts mid line 2 and ends on line 3."""
     first_tag_lines = [
         _tokens_for_text('this may'),
         _tokens_for_text('match')
     ]
     second_tag_lines = [
         _tokens_for_text('another'),
         _tokens_for_text('tag here')
     ]
     first_tag_tokens = flatten(first_tag_lines)
     second_tag_tokens = flatten(second_tag_lines)
     # the middle line mixes the tail of the first tag with the head of the second
     doc = _document_for_tokens([
         first_tag_lines[0],
         first_tag_lines[1] + second_tag_lines[0],
         second_tag_lines[1]
     ])
     SimpleMatchingAnnotator([
         TargetAnnotation('this may match', TAG1),
         TargetAnnotation('another tag here', TAG2)
     ]).annotate(doc)
     assert _get_tag_values_of_tokens(first_tag_tokens) == (
         [TAG1] * len(first_tag_tokens))
     assert _get_tag_values_of_tokens(second_tag_tokens) == (
         [TAG2] * len(second_tag_tokens))
예제 #2
0
 def test_should_annotate_but_not_merge_multiple_authors_annotation_too_far_apart(
         self):
     """Two separate author matches with filler between them are each tagged,
     but the filler gap is not absorbed into either annotation."""
     pre_tokens = _tokens_for_text('this is')
     first_author_tokens = _tokens_for_text('john smith')
     filler_tokens = _tokens_for_text('etc') * 5
     second_author_tokens = _tokens_for_text('mary maison')
     post_tokens = _tokens_for_text('the author')
     doc = _document_for_tokens([
         pre_tokens, first_author_tokens, filler_tokens,
         second_author_tokens, post_tokens
     ])
     SimpleMatchingAnnotator([
         TargetAnnotation(['john', 'smith'], TAG1),
         TargetAnnotation(['mary', 'maison'], TAG1)
     ]).annotate(doc)
     for tokens in (first_author_tokens, second_author_tokens):
         assert _get_tag_values_of_tokens(tokens) == [TAG1] * len(tokens)
     for tokens in (filler_tokens, pre_tokens, post_tokens):
         assert _get_tag_values_of_tokens(tokens) == [None] * len(tokens)
예제 #3
0
 def test_should_fail_with_unsupported_annotation_attribute_sub_annotations(
         self):
     """Annotating with sub_annotations must raise NotImplementedError
     when sub-annotation support has not been enabled."""
     with pytest.raises(NotImplementedError):
         sub_annotation = TargetAnnotation('sub', TAG2)
         annotation = TargetAnnotation(
             'test', TAG1, sub_annotations=[sub_annotation])
         doc = _document_for_tokens([_tokens_for_text('test')])
         SimpleMatchingAnnotator([annotation]).annotate(doc)
예제 #4
0
 def test_should_annotate_separate_author_aff_with_begin_prefix(self):
     """Each affiliation line gets its own B- prefixed tag followed by I- tags."""
     first_aff_tokens = _tokens_for_text('University of Science')
     second_aff_tokens = _tokens_for_text('University of Madness')
     doc = _document_for_tokens([first_aff_tokens, second_aff_tokens])
     SimpleMatchingAnnotator([
         TargetAnnotation(['1', 'University of Science'], TAG1),
         TargetAnnotation(['2', 'University of Madness'], TAG1)
     ], tag_config_map={}).annotate(doc)
     for aff_tokens in (first_aff_tokens, second_aff_tokens):
         expected = [B_TAG1] + [I_TAG1] * (len(aff_tokens) - 1)
         assert _get_tags_of_tokens(aff_tokens) == expected
예제 #5
0
 def test_should_annotate_ignoring_dots_after_capitals_in_document(self):
     """'P.O.' in the document should still match 'PO' in the target value."""
     tokens = _tokens_for_text('P.O. Box 12345')
     doc = _document_for_tokens([tokens])
     annotations = [TargetAnnotation('PO Box 12345', TAG1)]
     SimpleMatchingAnnotator(annotations).annotate(doc)
     assert _get_tags_of_tokens(tokens) == [TAG1] * len(tokens)
예제 #6
0
 def test_should_annotate_including_final_dot(self):
     """The trailing dot of the matched text is tagged as well."""
     tokens = _tokens_for_text('this is matching.')
     doc = _document_for_tokens([tokens])
     annotations = [TargetAnnotation('this is matching.', TAG1)]
     SimpleMatchingAnnotator(annotations).annotate(doc)
     assert _get_tag_values_of_tokens(tokens) == [TAG1] * len(tokens)
예제 #7
0
 def test_should_annotate_ignoring_space_after_dot_short_sequence(self):
     """The single token 'A.B.,' should match 'A. B.' despite the spacing."""
     tokens = [SimpleToken('A.B.,')]
     doc = _document_for_tokens([tokens])
     SimpleMatchingAnnotator([TargetAnnotation('A. B.', TAG1)]).annotate(doc)
     assert _get_tag_values_of_tokens(tokens) == [TAG1] * len(tokens)
예제 #8
0
 def test_should_match_single_quotes_with_double_quotes(self):
     """Quote-style differences must not prevent a match."""
     tokens = _tokens_for_text('"this is matching"')
     doc = SimpleStructuredDocument(lines=[SimpleLine(tokens)])
     annotations = [TargetAnnotation('\'this is matching\'', TAG1)]
     SimpleMatchingAnnotator(annotations).annotate(doc)
     assert _get_tag_values_of_tokens(tokens) == [TAG1] * len(tokens)
예제 #9
0
 def test_should_match_case_insensitive(self):
     """Matching ignores letter case entirely."""
     tokens = _tokens_for_text('This Is Matching')
     doc = SimpleStructuredDocument(lines=[SimpleLine(tokens)])
     annotations = [TargetAnnotation('tHIS iS mATCHING', TAG1)]
     SimpleMatchingAnnotator(annotations).annotate(doc)
     assert _get_tag_values_of_tokens(tokens) == [TAG1] * len(tokens)
예제 #10
0
 def test_should_not_annotate_with_local_matching(self):
     """A partial (local) match of a longer target value is not annotated."""
     tokens = _tokens_for_text('this is matching')
     doc = _document_for_tokens([tokens])
     annotations = [
         TargetAnnotation('this is matching but not fully matching', TAG1)
     ]
     SimpleMatchingAnnotator(annotations).annotate(doc)
     assert _get_tags_of_tokens(tokens) == [None] * len(tokens)
예제 #11
0
 def test_should_not_annotate_not_matching(self):
     """Entirely different text remains untagged."""
     tokens = _tokens_for_text('something completely different')
     doc = _document_for_tokens([tokens])
     SimpleMatchingAnnotator(
         [TargetAnnotation('this is matching', TAG1)]).annotate(doc)
     assert _get_tags_of_tokens(tokens) == [None] * len(tokens)
예제 #12
0
 def test_should_not_annotate_author_aff_label_between_author_names(self):
     """The whole author line (including the '1' labels) extends to TAG1,
     while the affiliation line still gets TAG2."""
     author_tokens = _tokens_for_text('Mary 1 , Smith 1')
     aff_tokens = _tokens_for_text('University of Science')
     doc = _document_for_tokens([author_tokens, aff_tokens])
     annotations = [
         TargetAnnotation(['Mary', 'Smith'], TAG1),
         TargetAnnotation(['1', 'University of Science'], TAG2)
     ]
     tag_config_map = {TAG1: SimpleTagConfig(extend_to_line_enabled=True)}
     SimpleMatchingAnnotator(
         annotations, tag_config_map=tag_config_map).annotate(doc)
     assert _get_tag_values_of_tokens(author_tokens) == (
         [TAG1] * len(author_tokens))
     assert _get_tag_values_of_tokens(aff_tokens) == (
         [TAG2] * len(aff_tokens))
예제 #13
0
 def test_should_not_annotate_fuzzily_matching_with_many_differences(self):
     """A heavily garbled target value should not fuzzily match."""
     tokens = _tokens_for_text('this is matching')
     doc = _document_for_tokens([tokens])
     annotations = [
         TargetAnnotation('txhxixsx ixsx mxaxtxcxhxixnxgx', TAG1)
     ]
     SimpleMatchingAnnotator(annotations).annotate(doc)
     assert _get_tags_of_tokens(tokens) == [None] * len(tokens)
예제 #14
0
 def test_should_annotate_references_with_sub_tag_with_extend_to_line(self):
     """The reference label '1' carries the sub tag; every token carries the
     main reference tag."""
     reference_lines = [_tokens_for_text('1 this is reference A')]
     reference_tokens = flatten(reference_lines)
     annotations = [
         TargetAnnotation('1 this is reference A',
                          TAG1,
                          sub_annotations=[TargetAnnotation('1', TAG2)]),
     ]
     filler_lines = [_tokens_for_text('previous line')] * 5
     doc = _document_for_tokens(filler_lines + reference_lines)
     SimpleMatchingAnnotator(
         annotations,
         lookahead_sequence_count=3,
         extend_to_line_enabled=True,
         use_sub_annotations=True).annotate(doc)
     LOGGER.debug('doc: %s', _get_document_token_tags(doc))
     assert _get_tag_values_of_tokens(reference_tokens) == (
         [TAG1] * len(reference_tokens))
     assert _get_sub_tag_values_of_tokens(reference_tokens) == (
         [TAG2] + [None] * (len(reference_tokens) - 1))
예제 #15
0
 def test_should_annotate_references(self):
     """All three reference lines are tagged despite preceding filler lines,
     thanks to the lookahead."""
     reference_lines = [
         _tokens_for_text('1 this is reference A'),
         _tokens_for_text('2 this is reference B'),
         _tokens_for_text('3 this is reference C')
     ]
     reference_tokens = flatten(reference_lines)
     annotations = [
         TargetAnnotation('this is reference A', TAG1),
         TargetAnnotation('this is reference B', TAG1),
         TargetAnnotation('this is reference C', TAG1)
     ]
     filler_lines = [_tokens_for_text('previous line')] * 5
     doc = _document_for_tokens(filler_lines + reference_lines)
     SimpleMatchingAnnotator(
         annotations, lookahead_sequence_count=3).annotate(doc)
     LOGGER.debug('doc: %s', _get_document_token_tags(doc))
     assert _get_tag_values_of_tokens(reference_tokens) == (
         [TAG1] * len(reference_tokens))
예제 #16
0
 def test_should_annotate_whole_line(self):
     """With extend_to_line enabled, TAG1 should cover the whole author line."""
     # NOTE(review): pre_tokens and post_tokens are never added to the
     # document below (only matching_tokens is), so the [None] assertions at
     # the end are trivially true — confirm whether the document was meant to
     # include them (compare the similar tests that concatenate all tokens).
     pre_tokens = _tokens_for_text('this is')
     matching_tokens = _tokens_for_text('john smith 1, mary maison 2')
     post_tokens = _tokens_for_text('the author')
     target_annotations = [
         TargetAnnotation(['john', 'smith'], TAG1),
         TargetAnnotation(['mary', 'maison'], TAG1)
     ]
     doc = _document_for_tokens([matching_tokens])
     SimpleMatchingAnnotator(
         target_annotations,
         tag_config_map={
             TAG1: SimpleTagConfig(extend_to_line_enabled=True)
         }).annotate(doc)
     assert _get_tag_values_of_tokens(
         matching_tokens) == [TAG1] * len(matching_tokens)
     assert _get_tag_values_of_tokens(pre_tokens) == [None
                                                      ] * len(pre_tokens)
     assert _get_tag_values_of_tokens(post_tokens) == [None
                                                       ] * len(post_tokens)
예제 #17
0
 def test_should_annotate_abstract_section_heading(self):
     """The 'Abstract' heading is included via the configured prefix regex."""
     tokens = _tokens_for_text('Abstract\nthis is matching.')
     doc = _document_for_tokens([tokens])
     tag_config_map = {
         TAG1: SimpleTagConfig(match_prefix_regex=r'(abstract|summary)\s*$')
     }
     SimpleMatchingAnnotator(
         [TargetAnnotation('this is matching.', TAG1)],
         tag_config_map=tag_config_map).annotate(doc)
     assert _get_tag_values_of_tokens(tokens) == [TAG1] * len(tokens)
예제 #18
0
 def test_should_annotate_exactly_matching_across_multiple_lines(self):
     """A target value spanning two document lines should still match fully."""
     token_lines = [
         _tokens_for_text('this is matching'),
         _tokens_for_text('and continues here')
     ]
     all_tokens = flatten(token_lines)
     doc = _document_for_tokens(token_lines)
     SimpleMatchingAnnotator([
         TargetAnnotation('this is matching and continues here', TAG1)
     ]).annotate(doc)
     assert _get_tag_values_of_tokens(all_tokens) == (
         [TAG1] * len(all_tokens))
예제 #19
0
 def test_should_annotate_multiple_value_annotation_in_reverse_order(self):
     """Multi-value targets match regardless of the order of the values."""
     pre_tokens = _tokens_for_text('this is')
     name_tokens = _tokens_for_text('john smith')
     post_tokens = _tokens_for_text('the author')
     doc = _document_for_tokens([pre_tokens + name_tokens + post_tokens])
     SimpleMatchingAnnotator(
         [TargetAnnotation(['smith', 'john'], TAG1)]).annotate(doc)
     assert _get_tag_values_of_tokens(name_tokens) == (
         [TAG1] * len(name_tokens))
     for tokens in (pre_tokens, post_tokens):
         assert _get_tag_values_of_tokens(tokens) == [None] * len(tokens)
예제 #20
0
 def test_should_prefer_word_boundaries(self):
     """'is' must match the standalone token, not the substring inside 'miss'."""
     pre_tokens = _tokens_for_text('this')
     match_tokens = _tokens_for_text('is')
     post_tokens = _tokens_for_text('miss')
     doc = _document_for_tokens([pre_tokens + match_tokens + post_tokens])
     SimpleMatchingAnnotator([TargetAnnotation('is', TAG1)]).annotate(doc)
     assert _get_tag_values_of_tokens(match_tokens) == (
         [TAG1] * len(match_tokens))
     for tokens in (pre_tokens, post_tokens):
         assert _get_tag_values_of_tokens(tokens) == [None] * len(tokens)
예제 #21
0
 def test_should_annotate_using_alternative_spellings(self):
     """A configured alternative spelling matches in place of the value itself."""
     tokens = _tokens_for_text('this is matching')
     doc = _document_for_tokens([tokens])
     tag_config_map = {
         TAG1: SimpleTagConfig(alternative_spellings={
             'alternative spelling': ['this is matching']
         })
     }
     SimpleMatchingAnnotator(
         [TargetAnnotation('alternative spelling', TAG1)],
         tag_config_map=tag_config_map).annotate(doc)
     assert _get_tag_values_of_tokens(tokens) == [TAG1] * len(tokens)
예제 #22
0
 def test_should_annotate_author_aff_preceding_number(self):
     """The standalone number line preceding the affiliation is also tagged,
     via the configured match_prefix_regex."""
     number_tokens = _tokens_for_text('1')
     aff_tokens = _tokens_for_text('this is matching')
     doc = _document_for_tokens([number_tokens, aff_tokens])
     tag_config_map = {
         TAG1: SimpleTagConfig(match_prefix_regex=r'(?=^|\n)\d\s*$')
     }
     SimpleMatchingAnnotator(
         [TargetAnnotation('this is matching', TAG1)],
         tag_config_map=tag_config_map).annotate(doc)
     for tokens in (number_tokens, aff_tokens):
         assert _get_tag_values_of_tokens(tokens) == [TAG1] * len(tokens)
예제 #23
0
 def test_should_not_annotate_multiple_value_annotation_too_far_away(self):
     """Only 'smith' is tagged; the distant 'john' is not pulled in."""
     pre_tokens = _tokens_for_text('this is')
     match_tokens = _tokens_for_text('smith')
     post_tokens = _tokens_for_text('etc') * 40 + _tokens_for_text('john')
     doc = _document_for_tokens([pre_tokens + match_tokens + post_tokens])
     SimpleMatchingAnnotator(
         [TargetAnnotation(['john', 'smith'], TAG1)]).annotate(doc)
     LOGGER.debug('doc: %s', _get_document_token_tags(doc))
     assert _get_tag_values_of_tokens(match_tokens) == (
         [TAG1] * len(match_tokens))
     for tokens in (pre_tokens, post_tokens):
         assert _get_tag_values_of_tokens(tokens) == [None] * len(tokens)
예제 #24
0
def xml_root_to_target_annotations(xml_root, xml_mapping):
    """Translate an XML tree into a list of TargetAnnotation objects.

    For every plain field configured for the root tag, evaluates the
    configured xpaths and builds annotations from the matched elements'
    text, honouring the ``XmlMappingSuffix`` options (regex filtering and
    extraction, children xpaths, priority, sub annotations, ...).

    :param xml_root: parsed XML root element
    :param xml_mapping: config-style mapping (supports ``sections()``) of
        root tag name to per-field xpath configuration
    :return: list of TargetAnnotation sorted by (-priority, document position)
    :raises Exception: if ``xml_root.tag`` has no entry in ``xml_mapping``
    """
    if xml_root.tag not in xml_mapping:
        raise Exception("unrecognised tag: {} (available: {})".format(
            xml_root.tag, xml_mapping.sections()))

    mapping = xml_mapping[xml_root.tag]

    # keys containing '.' are option suffixes for a field, not fields themselves
    field_names = [k for k in mapping.keys() if '.' not in k]

    def get_mapping_flag(k, suffix):
        # boolean options are stored as the literal string 'true'
        return mapping.get(k + suffix) == 'true'

    def get_match_multiple(k):
        return get_mapping_flag(k, XmlMappingSuffix.MATCH_MULTIPLE)

    def get_bonding_flag(k):
        return get_mapping_flag(k, XmlMappingSuffix.BONDING)

    def get_require_next_flag(k):
        return get_mapping_flag(k, XmlMappingSuffix.REQUIRE_NEXT)

    get_unmatched_parent_text_flag = (
        lambda k: get_mapping_flag(k, XmlMappingSuffix.UNMATCHED_PARENT_TEXT))

    LOGGER.debug('fields: %s', field_names)

    # list of ((sort key), TargetAnnotation); the key keeps document order stable
    target_annotations_with_pos = []
    # document-order index of every node, used in the sort key below
    xml_pos_by_node = {node: i for i, node in enumerate(xml_root.iter())}
    for k in field_names:
        match_multiple = get_match_multiple(k)
        bonding = get_bonding_flag(k)
        require_next = get_require_next_flag(k)
        unmatched_parent_text = get_unmatched_parent_text_flag(k)

        # xpaths of child elements whose text content should be excluded
        exclude_children_xpaths = parse_xpaths(
            mapping.get(k + XmlMappingSuffix.IGNORE))
        LOGGER.debug('exclude_children_xpaths (%s): %s', k,
                     exclude_children_xpaths)

        children_xpaths = parse_xpaths(
            mapping.get(k + XmlMappingSuffix.CHILDREN))
        children_concat = parse_json_with_default(
            mapping.get(k + XmlMappingSuffix.CHILDREN_CONCAT), [])
        children_range = parse_json_with_default(
            mapping.get(k + XmlMappingSuffix.CHILDREN_RANGE), [])
        # REGEX filters/transforms extracted text; EXTRACT_REGEX pulls values out
        re_compiled_pattern = re_compile_or_none(
            mapping.get(k + XmlMappingSuffix.REGEX))
        extract_re_compiled_pattern = re_compile_or_none(
            mapping.get(k + XmlMappingSuffix.EXTRACT_REGEX))
        LOGGER.debug('extract_re_compiled_pattern (%s): %s', k,
                     extract_re_compiled_pattern)

        # higher priority sorts earlier (negated in the sort key)
        priority = int(mapping.get(k + XmlMappingSuffix.PRIORITY, '0'))
        sub_xpaths = get_sub_mapping(mapping, k)
        LOGGER.debug('sub_xpaths (%s): %s', k, sub_xpaths)

        # tri-state: None means "decide per element via contains_raw_text(e)"
        use_raw_text_value = mapping.get(k + XmlMappingSuffix.USE_RAW_TEXT)
        use_raw_text_config = strtobool(
            use_raw_text_value) if use_raw_text_value else None

        xpaths = parse_xpaths(mapping[k])
        LOGGER.debug('xpaths(%s): %s', k, xpaths)
        for e in match_xpaths(xml_root, xpaths):
            # position of the matched element in document order
            e_pos = xml_pos_by_node.get(e)

            exclude_childrens = list(match_xpaths(e, exclude_children_xpaths))
            LOGGER.debug('exclude_childrens (%s, %s): %s', k, e,
                         exclude_childrens)

            sub_annotations = extract_sub_annotations(e, sub_xpaths, mapping,
                                                      k)
            LOGGER.debug('sub_annotations (%s): %s', k, sub_annotations)

            use_raw_text = (use_raw_text_config if use_raw_text_config
                            is not None else contains_raw_text(e))
            # children xpaths are skipped when they are wildcards AND raw text
            # is preferred for this element
            should_use_children_xpaths = (
                children_xpaths
                and (not is_wildcard_children_xpaths(children_xpaths)
                     or not use_raw_text))
            if should_use_children_xpaths:
                text_content_list, standalone_values = extract_children(
                    e, children_xpaths, children_concat, children_range,
                    unmatched_parent_text)
            else:
                # fall back to the element's raw text, minus excluded children
                text_content_list = filter_truthy(
                    strip_all([
                        get_raw_text_content(
                            e, exclude_childrens=exclude_childrens)
                    ]))
                standalone_values = []
            LOGGER.debug(
                'text_content_list: %s, standalone_values: %s,'
                ' children_xpaths: %s, use_raw_text: %s', text_content_list,
                standalone_values, children_xpaths, use_raw_text)
            if re_compiled_pattern:
                text_content_list = filter_truthy([
                    apply_pattern(s, re_compiled_pattern)
                    for s in text_content_list
                ])
            if extract_re_compiled_pattern:
                text_content_list = filter_truthy([
                    extract_using_regex(s, extract_re_compiled_pattern)
                    for s in text_content_list
                ])
            text_content_list = flatten_if_nested(text_content_list)
            if text_content_list:
                # single text -> scalar value; multiple -> list, longest first
                value = (text_content_list[0] if len(text_content_list) == 1
                         else sorted(text_content_list, key=lambda s: -len(s)))
                target_annotations_with_pos.append(
                    ((-priority, e_pos),
                     TargetAnnotation(value,
                                      k,
                                      match_multiple=match_multiple,
                                      bonding=bonding,
                                      require_next=require_next,
                                      sub_annotations=sub_annotations)))
            if standalone_values:
                # each standalone value becomes a separate annotation
                # (note: require_next is not passed here, unlike above)
                for i, standalone_value in enumerate(standalone_values):
                    target_annotations_with_pos.append(
                        ((-priority, e_pos, i),
                         TargetAnnotation(standalone_value,
                                          k,
                                          match_multiple=match_multiple,
                                          bonding=bonding,
                                          sub_annotations=sub_annotations)))
    # order by (-priority, document position[, standalone index])
    target_annotations_with_pos = sorted(target_annotations_with_pos,
                                         key=lambda x: x[0])
    LOGGER.debug('target_annotations_with_pos:\n%s',
                 target_annotations_with_pos)
    target_annotations = [x[1] for x in target_annotations_with_pos]
    LOGGER.debug('target_annotations:\n%s',
                 '\n'.join([' ' + str(a) for a in target_annotations]))
    # serializing the XML is expensive; only do it if debug logging is on
    if not target_annotations and LOGGER.isEnabledFor(logging.DEBUG):
        LOGGER.debug(
            'no target_annotations found for\nxml_mapping: %s\nxml: %s',
            xml_mapping, etree.tostring(xml_root, encoding='unicode'))
    return target_annotations
예제 #25
0
 def test_should_not_fail_on_empty_line_with_blank_token(self):
     """Annotating a document that only contains a blank token must not raise."""
     doc = _document_for_tokens([[SimpleToken('')]])
     annotations = [TargetAnnotation('this is. matching', TAG1)]
     SimpleMatchingAnnotator(annotations).annotate(doc)