def test_should_annotate_over_multiple_lines_with_tag_transition_with_begin_prefix(
         self):
     # Two tagged spans share the middle document line; with begin prefixes
     # enabled, each span must start with a B- tag followed only by I- tags.
     lines_tag1 = [_tokens_for_text('this may'), _tokens_for_text('match')]
     lines_tag2 = [_tokens_for_text('another'), _tokens_for_text('tag here')]
     tokens_tag1 = flatten(lines_tag1)
     tokens_tag2 = flatten(lines_tag2)
     doc = _document_for_tokens([
         lines_tag1[0],
         lines_tag1[1] + lines_tag2[0],
         lines_tag2[1]
     ])
     annotator = MatchingAnnotator([
         TargetAnnotation('this may match', TAG1),
         TargetAnnotation('another tag here', TAG2)
     ], use_tag_begin_prefix=True)
     annotator.annotate(doc)
     expected_tag1 = [B_TAG_1] + [I_TAG_1] * (len(tokens_tag1) - 1)
     expected_tag2 = [B_TAG_2] + [I_TAG_2] * (len(tokens_tag2) - 1)
     assert _get_tags_of_tokens(tokens_tag1) == expected_tag1
     assert _get_tags_of_tokens(tokens_tag2) == expected_tag2
Exemplo n.º 2
0
 def test_should_annotate_over_multiple_lines_with_tag_transition(self):
     # Two tag spans meet within the middle line; every token should carry
     # the tag value of the span it belongs to.
     lines_tag1 = [_tokens_for_text('this may'), _tokens_for_text('match')]
     lines_tag2 = [_tokens_for_text('another'), _tokens_for_text('tag here')]
     tokens_tag1 = flatten(lines_tag1)
     tokens_tag2 = flatten(lines_tag2)
     doc = _document_for_tokens([
         lines_tag1[0],
         lines_tag1[1] + lines_tag2[0],
         lines_tag2[1]
     ])
     annotator = SimpleMatchingAnnotator([
         TargetAnnotation('this may match', TAG1),
         TargetAnnotation('another tag here', TAG2)
     ])
     annotator.annotate(doc)
     assert _get_tag_values_of_tokens(tokens_tag1) == [TAG1] * len(tokens_tag1)
     assert _get_tag_values_of_tokens(tokens_tag2) == [TAG2] * len(tokens_tag2)
 def test_should_annotate_longer_sequence_over_multiple_lines_considering_next_line(
         self):
     # we need a long enough sequence to fall into the first branch
     # and match the partial match threshold
     exact_matching_text_lines = (
         'this may', 'indeed match very well without the slightest doubt')
     # add a short prefix that doesn't affect the score much
     # but would be skipped if we only matched the second line
     matching_text_lines = (
         exact_matching_text_lines[0], 'x ' + exact_matching_text_lines[1])
     matching_tokens_by_line = _tokens_for_text_lines(matching_text_lines)
     matching_tokens = flatten(matching_tokens_by_line)
     pre_tokens = _tokens_for_text(matching_text_lines[0] + ' this may not')
     post_tokens = _tokens_for_text('or not')
     doc = _document_for_tokens([
         pre_tokens + matching_tokens_by_line[0],
         matching_tokens_by_line[1] + post_tokens
     ])
     MatchingAnnotator([
         TargetAnnotation(' '.join(exact_matching_text_lines), TAG1)
     ]).annotate(doc)
     assert _get_tags_of_tokens(matching_tokens) == (
         [TAG1] * len(matching_tokens))
     assert _get_tags_of_tokens(pre_tokens) == [None] * len(pre_tokens)
     assert _get_tags_of_tokens(post_tokens) == [None] * len(post_tokens)
Exemplo n.º 4
0
def to_csv_dict_rows(evaluation_result, document=None):
    """Flatten per-page evaluation results into CSV-ready dict rows.

    One row is emitted per (page, tag) count; pages are 1-based in the output.
    """
    def _rows_for_page(page_index, page_evaluation):
        # one dict row per tag count found on this page
        return [{
            EvaluationFields.DOCUMENT: document,
            EvaluationFields.PAGE: 1 + page_index,
            EvaluationFields.TAG: tag,
            EvaluationFields.COUNT: count
        } for tag, count in iteritems(page_evaluation['count'])]

    return flatten(
        _rows_for_page(page_index, page_evaluation)
        for page_index, page_evaluation in enumerate(evaluation_result))
Exemplo n.º 5
0
 def test_should_not_return_line_number_tokens_at_unusual_position2(self):
     # number-like tokens appear at two different x positions, so none of
     # them should be detected as actual line numbers
     number_tokens = []
     for x in range(1, 3):
         for line_no in range(1, 5):
             number_tokens.append(SimpleToken(
                 str(line_no), dict(x=str(x * 50), y=str(line_no * 20))))
     doc = SimpleStructuredDocument(
         lines=[SimpleLine([token]) for token in number_tokens])
     assert list(find_line_number_tokens(doc)) == []
 def test_should_not_annotate_shorter_sequence_if_next_line_does_not_match(
         self):
     # the target text only partially resembles the first line and the
     # following line is unrelated, so nothing should get annotated
     token_lines = [
         _tokens_for_text('this is'),
         _tokens_for_text('something completely different')
     ]
     all_tokens = flatten(token_lines)
     doc = _document_for_tokens(token_lines)
     MatchingAnnotator(
         [TargetAnnotation('this is not matching', TAG1)]).annotate(doc)
     assert _get_tags_of_tokens(all_tokens) == [None] * len(all_tokens)
 def test_should_annotate_multiple_value_target_annotation_over_multiple_lines(
         self):
     # a list-valued target annotation should match its values even when
     # they are spread across multiple lines
     token_lines = [_tokens_for_text('this may'), _tokens_for_text('match')]
     expected_tokens = flatten(token_lines)
     doc = _document_for_tokens(token_lines)
     MatchingAnnotator(
         [TargetAnnotation(['this', 'may', 'match'], TAG1)]).annotate(doc)
     assert _get_tags_of_tokens(expected_tokens) == (
         [TAG1] * len(expected_tokens))
    def test_should_not_override_annotation(self):
        # once tokens carry TAG1, the overlapping TAG2 annotation
        # must not replace the existing tags
        token_lines = [_tokens_for_text('this is matching')]
        tokens = flatten(token_lines)
        doc = _document_for_tokens(token_lines)
        MatchingAnnotator([
            TargetAnnotation('this is matching', TAG1),
            TargetAnnotation('matching', TAG2)
        ]).annotate(doc)
        assert _get_tags_of_tokens(tokens) == [TAG1] * len(tokens)
 def test_should_annotate_exactly_matching_across_multiple_lines(self):
     # an exact match spanning two lines should tag every token
     token_lines = [
         _tokens_for_text('this is matching'),
         _tokens_for_text('and continues here')
     ]
     tokens = flatten(token_lines)
     doc = _document_for_tokens(token_lines)
     MatchingAnnotator([
         TargetAnnotation('this is matching and continues here', TAG1)
     ]).annotate(doc)
     assert _get_tags_of_tokens(tokens) == [TAG1] * len(tokens)
    def test_should_annotate_same_sequence_multiple_times_if_enabled(self):
        # with match_multiple=True, both occurrences of the text get tagged
        token_lines = [
            _tokens_for_text('this is matching'),
            _tokens_for_text('this is matching')
        ]
        tokens = flatten(token_lines)
        doc = _document_for_tokens(token_lines)
        MatchingAnnotator([
            TargetAnnotation('this is matching', TAG1, match_multiple=True)
        ]).annotate(doc)
        assert _get_tags_of_tokens(tokens) == [TAG1] * len(tokens)
 def test_should_annotate_mult_value_target_annot_rev_order_over_mult_lines_with_b_prefix(
         self):
     # value order inside a list-valued annotation should not matter;
     # the begin prefix still goes on the first matched token only
     token_lines = [_tokens_for_text('this may'), _tokens_for_text('match')]
     tokens = flatten(token_lines)
     doc = _document_for_tokens(token_lines)
     MatchingAnnotator([
         TargetAnnotation(list(reversed(['this', 'may', 'match'])), TAG1)
     ], use_tag_begin_prefix=True).annotate(doc)
     expected = [B_TAG_1] + [I_TAG_1] * (len(tokens) - 1)
     assert _get_tags_of_tokens(tokens) == expected
 def test_should_annotate_exactly_matching_across_multiple_lines_with_begin_prefix(
         self):
     # an exact multi-line match should be tagged B- on the very first
     # token and I- on all remaining tokens
     token_lines = [
         _tokens_for_text('this is matching'),
         _tokens_for_text('and continues here')
     ]
     tokens = flatten(token_lines)
     doc = _document_for_tokens(token_lines)
     MatchingAnnotator([
         TargetAnnotation('this is matching and continues here', TAG1)
     ], use_tag_begin_prefix=True).annotate(doc)
     expected = [B_TAG_1] + [I_TAG_1] * (len(tokens) - 1)
     assert _get_tags_of_tokens(tokens) == expected
 def test_should_annotate_last_line_of_block_followed_by_other_text(self):
     # the whole newline-joined block should be matched even though
     # unrelated text follows it in the document
     block_text_lines = [
         'this is the first row', 'second row follows',
         'here we are on the third', 'last line of block'
     ]
     block_token_lines = _tokens_for_text_lines(block_text_lines)
     block_tokens = flatten(block_token_lines)
     doc = _document_for_tokens(
         block_token_lines + [_tokens_for_text('other text')])
     MatchingAnnotator([
         TargetAnnotation('\n'.join(block_text_lines), TAG1)
     ]).annotate(doc)
     assert _get_tags_of_tokens(block_tokens) == [TAG1] * len(block_tokens)
    def test_should_not_annotate_similar_sequence_multiple_times(self):
        # without match_multiple, a later line that merely repeats part of
        # the already-matched text must stay untagged
        tagged_token_lines = [
            _tokens_for_text('this is matching'),
            _tokens_for_text('and continues here')
        ]
        repeated_tokens = _tokens_for_text('this is matching')
        tagged_tokens = flatten(tagged_token_lines)
        doc = _document_for_tokens(tagged_token_lines + [repeated_tokens])
        MatchingAnnotator([
            TargetAnnotation('this is matching and continues here', TAG1)
        ]).annotate(doc)
        assert _get_tags_of_tokens(tagged_tokens) == (
            [TAG1] * len(tagged_tokens))
        assert _get_tags_of_tokens(repeated_tokens) == (
            [None] * len(repeated_tokens))
    def test_should_annotate_same_sequence_multiple_times_with_begin_prefix(
            self):
        # with match_multiple and begin prefixes enabled, every occurrence
        # restarts its tag sequence with a B- tag
        token_lines = [
            _tokens_for_text('this is matching'),
            _tokens_for_text('this is matching')
        ]
        tokens = flatten(token_lines)
        doc = _document_for_tokens(token_lines)
        MatchingAnnotator([
            TargetAnnotation('this is matching', TAG1, match_multiple=True)
        ], use_tag_begin_prefix=True).annotate(doc)
        # the begin tag should appear at the beginning of each match
        assert _get_tags_of_tokens(tokens) == [
            B_TAG_1, I_TAG_1, I_TAG_1, B_TAG_1, I_TAG_1, I_TAG_1
        ]
Exemplo n.º 16
0
 def test_should_annotate_references(self):
     # each reference line should be matched by its own target annotation,
     # with all of them carrying the same tag
     reference_token_lines = [
         _tokens_for_text('1 this is reference A'),
         _tokens_for_text('2 this is reference B'),
         _tokens_for_text('3 this is reference C')
     ]
     reference_tokens = flatten(reference_token_lines)
     pre_tokens = [_tokens_for_text('previous line')] * 5
     doc = _document_for_tokens(pre_tokens + reference_token_lines)
     SimpleMatchingAnnotator([
         TargetAnnotation('this is reference A', TAG1),
         TargetAnnotation('this is reference B', TAG1),
         TargetAnnotation('this is reference C', TAG1)
     ], lookahead_sequence_count=3).annotate(doc)
     LOGGER.debug('doc: %s', _get_document_token_tags(doc))
     assert _get_tag_values_of_tokens(reference_tokens) == (
         [TAG1] * len(reference_tokens))
Exemplo n.º 17
0
 def test_should_annotate_references_with_sub_tag_with_extend_to_line(self):
     # the reference line gets TAG1; the leading label '1' additionally
     # receives the TAG2 sub tag (first token only)
     reference_token_lines = [_tokens_for_text('1 this is reference A')]
     reference_tokens = flatten(reference_token_lines)
     pre_tokens = [_tokens_for_text('previous line')] * 5
     doc = _document_for_tokens(pre_tokens + reference_token_lines)
     annotator = SimpleMatchingAnnotator(
         [
             TargetAnnotation('1 this is reference A',
                              TAG1,
                              sub_annotations=[TargetAnnotation('1', TAG2)]),
         ],
         lookahead_sequence_count=3,
         extend_to_line_enabled=True,
         use_sub_annotations=True)
     annotator.annotate(doc)
     LOGGER.debug('doc: %s', _get_document_token_tags(doc))
     assert _get_tag_values_of_tokens(reference_tokens) == (
         [TAG1] * len(reference_tokens))
     assert _get_sub_tag_values_of_tokens(reference_tokens) == (
         [TAG2] + [None] * (len(reference_tokens) - 1))
 def test_should_annotate_same_sub_annotations_multiple_times_with_begin_prefic(
         self):
     # NOTE(review): 'prefic' in the name looks like a typo for 'prefix';
     # kept unchanged to preserve the test's identity.
     # each repeated match restarts the sub tag sequence with a B- prefix
     token_lines = [
         _tokens_for_text('this is matching'),
         _tokens_for_text('this is matching')
     ]
     tokens = flatten(token_lines)
     doc = _document_for_tokens(token_lines)
     MatchingAnnotator([
         TargetAnnotation(
             'this is matching',
             TAG2,
             match_multiple=True,
             sub_annotations=[TargetAnnotation('this is', TAG1)])
     ], use_tag_begin_prefix=True).annotate(doc)
     sub_tags = [doc.get_sub_tag(token) for token in tokens]
     assert sub_tags == [B_TAG_1, I_TAG_1, None, B_TAG_1, I_TAG_1, None]
 def test_should_annotate_shorter_sequence_over_multiple_lines_considering_next_line(
         self):
     # use a short sequence that wouldn't get matched on it's own
     matching_text_lines = ('this may', 'match')
     matching_tokens_by_line = _tokens_for_text_lines(matching_text_lines)
     matching_tokens = flatten(matching_tokens_by_line)
     # repeat the same text on the two lines, only by combining the lines would it be clear
     # which tokens to match
     pre_tokens = _tokens_for_text(
         matching_text_lines[0] + ' be some other longer preceeding text')
     post_tokens = _tokens_for_text(
         'this is some text after but no ' + matching_text_lines[1])
     doc = _document_for_tokens([
         pre_tokens + matching_tokens_by_line[0],
         matching_tokens_by_line[1] + post_tokens
     ])
     MatchingAnnotator(
         [TargetAnnotation('this may match', TAG1)]).annotate(doc)
     assert _get_tags_of_tokens(matching_tokens) == (
         [TAG1] * len(matching_tokens))
     assert _get_tags_of_tokens(pre_tokens) == [None] * len(pre_tokens)
     assert _get_tags_of_tokens(post_tokens) == [None] * len(post_tokens)
def tags_for_nodes(nodes):
    """Return the sorted unique tags collected from all *nodes*."""
    unique_tags = set()
    for node in nodes:
        unique_tags.update(tags_for_node(node))
    return sorted(unique_tags)