def test_should_annotate_exactly_matching_across_multiple_lines(self):
    """An exact match spanning two lines should tag every token with TAG1."""
    lines = [
        _tokens_for_text('this is matching'),
        _tokens_for_text('and continues here')
    ]
    all_tokens = flatten(lines)
    doc = _document_for_tokens(lines)
    MatchingAnnotator([
        TargetAnnotation('this is matching and continues here', TAG1)
    ]).annotate(doc)
    assert _get_tags_of_tokens(all_tokens) == [TAG1] * len(all_tokens)
 def test_should_annotate_short_reference_item_followed_by_other_reference_items(
         self):
     """A short bonded reference item is tagged when other items follow it."""
     item_texts = ['ref_id', 'ref_title']
     item_tokens = _tokens_for_text(' '.join(item_texts))
     doc = _document_for_tokens([item_tokens])
     annotations = [
         TargetAnnotation(item_texts, 'reference', bonding=True)
     ]
     MatchingAnnotator(annotations).annotate(doc)
     assert _get_tags_of_tokens(item_tokens) == (
         ['reference'] * len(item_tokens))
 def test_should_annotate_sub_tag_exactly_matching_without_begin_prefix(
         self):
     """A sub-annotation should tag only its own token (no begin prefix)."""
     tokens = _tokens_for_text('this is matching')
     annotation = TargetAnnotation(
         'this is matching', TAG2,
         sub_annotations=[TargetAnnotation('this', TAG1)])
     doc = _document_for_tokens([tokens])
     MatchingAnnotator(
         [annotation], use_tag_begin_prefix=False).annotate(doc)
     sub_tags = [doc.get_sub_tag(t) for t in tokens]
     assert sub_tags == [TAG1, None, None]
    def test_should_annotate_same_sequence_multiple_times_if_enabled(self):
        """With match_multiple, a repeated sequence is tagged on every occurrence."""
        lines = [
            _tokens_for_text('this is matching'),
            _tokens_for_text('this is matching')
        ]
        tokens = flatten(lines)
        doc = _document_for_tokens(lines)
        MatchingAnnotator([
            TargetAnnotation('this is matching', TAG1, match_multiple=True)
        ]).annotate(doc)
        assert _get_tags_of_tokens(tokens) == [TAG1] * len(tokens)
 def test_should_not_annotate_pre_annotated_tokens_on_separate_lines(self):
     """Tokens that already carry a tag (e.g. line numbers) keep that tag."""
     line_no_tokens = _tokens_for_text('1')
     line_no_tokens[0].set_tag('line_no')
     body_tokens = _tokens_for_text('this is matching')
     doc = _document_for_tokens([line_no_tokens + body_tokens])
     MatchingAnnotator([
         TargetAnnotation('1', TAG2),
         TargetAnnotation('this is matching', TAG1)
     ]).annotate(doc)
     assert _get_tags_of_tokens(line_no_tokens) == (
         ['line_no'] * len(line_no_tokens))
     assert _get_tags_of_tokens(body_tokens) == [TAG1] * len(body_tokens)
 def test_should_not_annotate_shorter_target_annotation_in_longer_line_multiple_times(
         self):
     """Without match_multiple, only the first of two similar lines is tagged."""
     match_tokens = _tokens_for_text('this is matching')
     line_one = (
         _tokens_for_text('pre') + match_tokens + _tokens_for_text('post'))
     line_two = _copy_tokens(line_one)
     doc = _document_for_tokens([line_one, line_two])
     MatchingAnnotator(
         [TargetAnnotation('this is matching', TAG1)]).annotate(doc)
     assert _get_tags_of_tokens(match_tokens) == [TAG1] * len(match_tokens)
     assert _get_tags_of_tokens(line_two) == [None] * len(line_two)
 def test_should_annotate_mult_value_target_annot_rev_order_over_mult_lines_with_b_prefix(
         self):
     """A multi-value annotation matches even when its values are reversed."""
     lines = [
         _tokens_for_text('this may'),
         _tokens_for_text('match')
     ]
     tokens = flatten(lines)
     doc = _document_for_tokens(lines)
     values = list(reversed(['this', 'may', 'match']))
     MatchingAnnotator(
         [TargetAnnotation(values, TAG1)],
         use_tag_begin_prefix=True).annotate(doc)
     expected = [B_TAG_1] + [I_TAG_1] * (len(tokens) - 1)
     assert _get_tags_of_tokens(tokens) == expected
 def test_should_annotate_exactly_matching_across_multiple_lines_with_begin_prefix(
         self):
     """A multi-line exact match uses B- on the first token and I- after."""
     lines = [
         _tokens_for_text('this is matching'),
         _tokens_for_text('and continues here')
     ]
     tokens = flatten(lines)
     doc = _document_for_tokens(lines)
     MatchingAnnotator([
         TargetAnnotation('this is matching and continues here', TAG1)
     ], use_tag_begin_prefix=True).annotate(doc)
     expected = [B_TAG_1] + [I_TAG_1] * (len(tokens) - 1)
     assert _get_tags_of_tokens(tokens) == expected
 def test_should_not_annotate_short_reference_item_not_followed_by_other_reference_items(
         self):
     """A short bonded reference item with no companion items stays untagged."""
     ref_text = 'ref_id'
     ref_tokens = _tokens_for_text(ref_text)
     filler_tokens = _tokens_for_text('other')
     doc = _document_for_tokens([ref_tokens + filler_tokens])
     MatchingAnnotator([
         TargetAnnotation([ref_text] + ['ref_title'], 'reference',
                          bonding=True)
     ]).annotate(doc)
     assert _get_tags_of_tokens(ref_tokens) == [None] * len(ref_tokens)
 def test_should_not_annotate_short_section_title_if_paragraph_follows_later(
         self):
     """require_next: a short title without an adjacent next match stays untagged."""
     title_text = 'section title'
     title_tokens = _tokens_for_text(title_text + '.')
     filler_tokens = _tokens_for_text('other text to come here.')
     doc = _document_for_tokens([title_tokens + filler_tokens])
     MatchingAnnotator([
         TargetAnnotation(title_text, 'section_title', require_next=True)
     ]).annotate(doc)
     assert _get_tags_of_tokens(title_tokens) == [None] * len(title_tokens)
 def test_should_annotate_fuzzily_matching_longer_matches_based_on_ratio(
         self):
     """A long fuzzy match is accepted; trailing unrelated text is not tagged."""
     long_text = 'this is matching and is really really long match that we can trust'
     good_tokens = _tokens_for_text(long_text)
     bad_tokens = _tokens_for_text('what comes next is different')
     doc = _document_for_tokens([good_tokens + bad_tokens])
     MatchingAnnotator([
         TargetAnnotation(
             long_text + ' but this is not and is another matter', TAG1)
     ]).annotate(doc)
     assert _get_tags_of_tokens(good_tokens) == [TAG1] * len(good_tokens)
     assert _get_tags_of_tokens(bad_tokens) == [None] * len(bad_tokens)
 def test_should_annotate_not_match_distant_value_of_multiple_value_target_annotation(
         self):
     """A value too far away from the rest of a multi-value match is skipped."""
     near_tokens = _tokens_for_text('this may match')
     far_tokens = _tokens_for_text('not')
     # put 10 unrelated lines between the main match and the distant value
     filler_lines = [_tokens_for_text('other') for _ in range(10)]
     doc = _document_for_tokens(
         [near_tokens] + filler_lines + [far_tokens])
     MatchingAnnotator([
         TargetAnnotation(['this', 'may', 'match', 'not'], TAG1)
     ]).annotate(doc)
     assert _get_tags_of_tokens(near_tokens) == [TAG1] * len(near_tokens)
     assert _get_tags_of_tokens(far_tokens) == [None] * len(far_tokens)
 def test_should_annotate_last_line_of_block_followed_by_other_text(self):
     """Every line of a multi-line block is tagged, including the last one."""
     block_lines = [
         'this is the first row', 'second row follows',
         'here we are on the third', 'last line of block'
     ]
     block_tokens_by_line = _tokens_for_text_lines(block_lines)
     block_tokens = flatten(block_tokens_by_line)
     doc = _document_for_tokens(
         block_tokens_by_line + [_tokens_for_text('other text')])
     MatchingAnnotator([
         TargetAnnotation('\n'.join(block_lines), TAG1)
     ]).annotate(doc)
     assert _get_tags_of_tokens(block_tokens) == [TAG1] * len(block_tokens)
    def test_should_annotate_same_sequence_multiple_times_with_begin_prefix(
            self):
        """Each repeated match restarts with its own begin (B-) tag."""
        lines = [
            _tokens_for_text('this is matching'),
            _tokens_for_text('this is matching')
        ]
        tokens = flatten(lines)
        doc = _document_for_tokens(lines)
        MatchingAnnotator([
            TargetAnnotation('this is matching', TAG1, match_multiple=True)
        ], use_tag_begin_prefix=True).annotate(doc)
        # the begin tag should appear at the beginning of each match
        assert _get_tags_of_tokens(tokens) == [
            B_TAG_1, I_TAG_1, I_TAG_1, B_TAG_1, I_TAG_1, I_TAG_1
        ]
    def test_should_not_annotate_similar_sequence_multiple_times(self):
        """Without match_multiple, a later similar line stays untagged."""
        matched_lines = [
            _tokens_for_text('this is matching'),
            _tokens_for_text('and continues here')
        ]
        matched_tokens = flatten(matched_lines)
        extra_tokens = _tokens_for_text('this is matching')
        doc = _document_for_tokens(matched_lines + [extra_tokens])
        MatchingAnnotator([
            TargetAnnotation('this is matching and continues here', TAG1)
        ]).annotate(doc)
        assert _get_tags_of_tokens(matched_tokens) == (
            [TAG1] * len(matched_tokens))
        assert _get_tags_of_tokens(extra_tokens) == (
            [None] * len(extra_tokens))
def convert_and_annotate_lxml_content(lxml_content, xml_content, xml_mapping, name=None):
    """Convert lxml document content to SVG pages with matched annotations.

    Parses both inputs, extracts target annotations from the xml (driven by
    ``xml_mapping``), converts the lxml document to SVG pages, annotates them
    and adds a visualisation layer; each step is timed via StopWatchRecorder.

    :param lxml_content: serialized lxml document to convert
    :param xml_content: serialized xml document holding the target values
    :param xml_mapping: mapping config for xml_root_to_target_annotations
    :param name: optional identifier, used for logging only
    :return: list of annotated svg root elements (one per page)
    """
    stop_watch_recorder = StopWatchRecorder()

    stop_watch_recorder.start('parse lxml')
    lxml_root = etree.fromstring(lxml_content)

    # use a more lenient way to parse xml as xml errors are not uncommon
    stop_watch_recorder.start('parse xml')
    xml_root = xml_from_string_with_recover(xml_content)

    stop_watch_recorder.start('extract target annotations')
    target_annotations = xml_root_to_target_annotations(
        xml_root,
        xml_mapping
    )
    stop_watch_recorder.stop()

    # matching annotator runs after the default (structure) annotators
    annotators = DEFAULT_ANNOTATORS + [MatchingAnnotator(
        target_annotations,
        use_tag_begin_prefix=True
    )]
    annotator = Annotator(annotators)

    stop_watch_recorder.start('convert to svg')
    svg_roots = list(iter_svg_pages_for_lxml(lxml_root))

    stop_watch_recorder.start('annotate svg')
    annotator.annotate(SvgStructuredDocument(svg_roots))

    stop_watch_recorder.start('add visualisation')
    svg_roots = [
        visualize_svg_annotations(svg_root)
        for svg_root in svg_roots
    ]
    stop_watch_recorder.stop()

    get_logger().info(
        'processed: name=%s, lxml size=%s, xml size=%s, timings=[%s] (native align impl=%s)',
        name, format(len(lxml_content), ','), format(len(xml_content), ','),
        stop_watch_recorder, align_native_enabled
    )

    return svg_roots
 def test_should_annotate_same_sub_annotations_multiple_times_with_begin_prefix(
         self):
     """Repeated sub-annotation matches each restart with their own B- tag.

     Note: renamed from ``..._with_begin_prefic`` to fix the typo in the
     test name; pytest discovers tests by name, so no callers are affected.
     """
     matching_tokens_by_line = [
         _tokens_for_text('this is matching'),
         _tokens_for_text('this is matching')
     ]
     matching_tokens = flatten(matching_tokens_by_line)
     target_annotations = [
         TargetAnnotation(
             'this is matching',
             TAG2,
             match_multiple=True,
             sub_annotations=[TargetAnnotation('this is', TAG1)])
     ]
     doc = _document_for_tokens(matching_tokens_by_line)
     MatchingAnnotator(target_annotations,
                       use_tag_begin_prefix=True).annotate(doc)
     # sub tags cover 'this is' only; 'matching' gets no sub tag
     assert ([doc.get_sub_tag(x) for x in matching_tokens
              ] == [B_TAG_1, I_TAG_1, None, B_TAG_1, I_TAG_1, None])
 def test_should_annotate_shorter_target_annotation_in_longer_line_multiple_times_if_enabled(
         self):
     """With match_multiple, the same phrase is tagged on every line it appears in."""
     lead_tokens = _tokens_for_text('pre')
     match_tokens = _tokens_for_text('this is matching')
     tail_tokens = _tokens_for_text('post')
     repeat_match_tokens = _copy_tokens(match_tokens)
     doc = _document_for_tokens([
         lead_tokens + match_tokens + tail_tokens,
         _copy_tokens(lead_tokens) + repeat_match_tokens +
         _copy_tokens(tail_tokens)
     ])
     MatchingAnnotator([
         TargetAnnotation('this is matching', TAG1, match_multiple=True)
     ]).annotate(doc)
     assert _get_tags_of_tokens(match_tokens) == [TAG1] * len(match_tokens)
     assert _get_tags_of_tokens(repeat_match_tokens) == (
         [TAG1] * len(repeat_match_tokens))
 def test_should_annotate_short_section_title_followed_by_paragraph(self):
     """require_next: a short title is tagged when its next match follows."""
     title_text = 'section title'
     paragraph_text = 'paragraph text to come here.'
     title_tokens = _tokens_for_text(title_text + '.')
     paragraph_tokens = _tokens_for_text(paragraph_text)
     doc = _document_for_tokens([title_tokens + paragraph_tokens])
     MatchingAnnotator([
         TargetAnnotation(title_text, 'section_title', require_next=True),
         TargetAnnotation(paragraph_text, 'section_paragraph')
     ]).annotate(doc)
     assert _get_tags_of_tokens(title_tokens) == (
         ['section_title'] * len(title_tokens))
     assert _get_tags_of_tokens(paragraph_tokens) == (
         ['section_paragraph'] * len(paragraph_tokens))
 def test_should_annotate_sub_tag_across_multiple_tokens(self):
     """A sub-annotation spanning several tokens tags each of them."""
     sub_tokens = _tokens_for_text('this is matching')
     outer_tokens = (
         _tokens_for_text('something before') + sub_tokens +
         _tokens_for_text('more text to come'))
     all_tokens = (
         _tokens_for_text('not matching') + outer_tokens +
         _tokens_for_text('and there'))
     doc = _document_for_tokens([all_tokens])
     MatchingAnnotator([
         TargetAnnotation(
             _tokens_to_text(outer_tokens), TAG2,
             sub_annotations=[
                 TargetAnnotation(_tokens_to_text(sub_tokens), TAG1)
             ])
     ], use_tag_begin_prefix=False).annotate(doc)
     assert [doc.get_sub_tag(t) for t in sub_tokens] == [TAG1, TAG1, TAG1]
 def test_should_annotate_multiple_shorter_target_annotation_in_longer_line(
         self):
     """Two annotations in one line each tag only their own tokens."""
     pre = _tokens_for_text('pre')
     first_match = _tokens_for_text('this is matching')
     mid = _tokens_for_text('mid')
     second_match = _tokens_for_text('also good')
     post = _tokens_for_text('post')
     doc = _document_for_tokens(
         [pre + first_match + mid + second_match + post])
     MatchingAnnotator([
         TargetAnnotation('this is matching', TAG1),
         TargetAnnotation('also good', TAG2)
     ]).annotate(doc)
     assert _get_tags_of_tokens(pre) == [None] * len(pre)
     assert _get_tags_of_tokens(first_match) == [TAG1] * len(first_match)
     assert _get_tags_of_tokens(mid) == [None] * len(mid)
     assert _get_tags_of_tokens(second_match) == [TAG2] * len(second_match)
     assert _get_tags_of_tokens(post) == [None] * len(post)
 def test_should_annotate_shorter_sequence_over_multiple_lines_considering_next_line(
         self):
     """A short phrase split over two lines matches when lines are combined."""
     # use a short sequence that wouldn't get matched on it's own
     match_lines = ('this may', 'match')
     match_tokens_by_line = _tokens_for_text_lines(match_lines)
     match_tokens = flatten(match_tokens_by_line)
     # repeat the same text on the two lines; only by combining the lines
     # would it be clear which tokens to match
     before_tokens = _tokens_for_text(
         match_lines[0] + ' be some other longer preceeding text')
     after_tokens = _tokens_for_text(
         'this is some text after but no ' + match_lines[1])
     doc = _document_for_tokens([
         before_tokens + match_tokens_by_line[0],
         match_tokens_by_line[1] + after_tokens
     ])
     MatchingAnnotator(
         [TargetAnnotation('this may match', TAG1)]).annotate(doc)
     assert _get_tags_of_tokens(match_tokens) == [TAG1] * len(match_tokens)
     assert _get_tags_of_tokens(before_tokens) == [None] * len(before_tokens)
     assert _get_tags_of_tokens(after_tokens) == [None] * len(after_tokens)
 def test_should_annotate_over_multiple_lines_with_tag_transition(self):
     """Adjacent annotations may transition between tags in mid-line."""
     first_by_line = [
         _tokens_for_text('this may'),
         _tokens_for_text('match')
     ]
     second_by_line = [
         _tokens_for_text('another'),
         _tokens_for_text('tag here')
     ]
     first_tokens = flatten(first_by_line)
     second_tokens = flatten(second_by_line)
     # middle line contains the end of the first and start of the second match
     doc = _document_for_tokens([
         first_by_line[0],
         first_by_line[1] + second_by_line[0],
         second_by_line[1]
     ])
     MatchingAnnotator([
         TargetAnnotation('this may match', TAG1),
         TargetAnnotation('another tag here', TAG2)
     ]).annotate(doc)
     assert _get_tags_of_tokens(first_tokens) == [TAG1] * len(first_tokens)
     assert _get_tags_of_tokens(second_tokens) == [TAG2] * len(second_tokens)
 def test_should_not_fail_on_empty_document(self):
     """Annotating a document with no lines should be a harmless no-op."""
     empty_doc = SimpleStructuredDocument(lines=[])
     MatchingAnnotator([]).annotate(empty_doc)
 def test_should_not_fail_on_empty_line_with_blank_token(self):
     """A line containing only a blank token should not break annotation."""
     annotations = [TargetAnnotation('this is. matching', TAG1)]
     blank_doc = _document_for_tokens([[SimpleToken('')]])
     MatchingAnnotator(annotations).annotate(blank_doc)