def test_should_annotate_over_multiple_lines_with_tag_transition_with_begin_prefix(
        self):
    """With use_tag_begin_prefix, each matched sequence starts with a B- tag
    and continues with I- tags, even when the match spans line boundaries."""
    first_tag_lines = [_tokens_for_text('this may'), _tokens_for_text('match')]
    second_tag_lines = [_tokens_for_text('another'), _tokens_for_text('tag here')]
    first_tag_tokens = flatten(first_tag_lines)
    second_tag_tokens = flatten(second_tag_lines)
    # the middle document line mixes the tail of tag1 with the head of tag2
    tokens_by_line = [
        first_tag_lines[0],
        first_tag_lines[1] + second_tag_lines[0],
        second_tag_lines[1]
    ]
    target_annotations = [
        TargetAnnotation('this may match', TAG1),
        TargetAnnotation('another tag here', TAG2)
    ]
    doc = _document_for_tokens(tokens_by_line)
    MatchingAnnotator(target_annotations, use_tag_begin_prefix=True).annotate(doc)
    expected_tag1 = [B_TAG_1] + [I_TAG_1] * (len(first_tag_tokens) - 1)
    expected_tag2 = [B_TAG_2] + [I_TAG_2] * (len(second_tag_tokens) - 1)
    assert _get_tags_of_tokens(first_tag_tokens) == expected_tag1
    assert _get_tags_of_tokens(second_tag_tokens) == expected_tag2
def test_should_annotate_over_multiple_lines_with_tag_transition(self):
    """A tag transition in the middle of a document line should still yield
    the plain tag value for every token of each matched sequence."""
    tag1_lines = [_tokens_for_text('this may'), _tokens_for_text('match')]
    tag2_lines = [_tokens_for_text('another'), _tokens_for_text('tag here')]
    tag1_tokens = flatten(tag1_lines)
    tag2_tokens = flatten(tag2_lines)
    # the middle document line contains the end of tag1 and the start of tag2
    tokens_by_line = [
        tag1_lines[0],
        tag1_lines[1] + tag2_lines[0],
        tag2_lines[1]
    ]
    target_annotations = [
        TargetAnnotation('this may match', TAG1),
        TargetAnnotation('another tag here', TAG2)
    ]
    doc = _document_for_tokens(tokens_by_line)
    SimpleMatchingAnnotator(target_annotations).annotate(doc)
    assert _get_tag_values_of_tokens(tag1_tokens) == [TAG1] * len(tag1_tokens)
    assert _get_tag_values_of_tokens(tag2_tokens) == [TAG2] * len(tag2_tokens)
def test_should_annotate_longer_sequence_over_multiple_lines_considering_next_line(
        self):
    """A long target split over two lines should still match when the second
    line carries a short extra prefix (within the partial match threshold)."""
    # we need a long enough sequence to fall into the first branch
    # and match the partial match threshold
    exact_matching_text_lines = (
        'this may', 'indeed match very well without the slightest doubt'
    )
    # add a short prefix that doesn't affect the score much
    # but would be skipped if we only matched the second line
    matching_text_lines = (
        exact_matching_text_lines[0],
        'x ' + exact_matching_text_lines[1]
    )
    matching_tokens_by_line = _tokens_for_text_lines(matching_text_lines)
    matching_tokens = flatten(matching_tokens_by_line)
    pre_tokens = _tokens_for_text(matching_text_lines[0] + ' this may not')
    post_tokens = _tokens_for_text('or not')
    tokens_by_line = [
        pre_tokens + matching_tokens_by_line[0],
        matching_tokens_by_line[1] + post_tokens
    ]
    doc = _document_for_tokens(tokens_by_line)
    MatchingAnnotator(
        [TargetAnnotation(' '.join(exact_matching_text_lines), TAG1)]
    ).annotate(doc)
    assert _get_tags_of_tokens(matching_tokens) == [TAG1] * len(matching_tokens)
    assert _get_tags_of_tokens(pre_tokens) == [None] * len(pre_tokens)
    assert _get_tags_of_tokens(post_tokens) == [None] * len(post_tokens)
def to_csv_dict_rows(evaluation_result, document=None):
    """Flatten a per-page evaluation result into a list of CSV row dicts.

    Each page contributes one row per entry of its ``'count'`` mapping;
    page numbers are 1-based. ``document`` is copied into every row.
    """
    def _rows_for_page(page_index, page_evaluation):
        # one row per (tag, count) pair on this page
        return [
            {
                EvaluationFields.DOCUMENT: document,
                EvaluationFields.PAGE: 1 + page_index,
                EvaluationFields.TAG: tag,
                EvaluationFields.COUNT: count
            }
            for tag, count in iteritems(page_evaluation['count'])
        ]

    return flatten(
        _rows_for_page(page_index, page_evaluation)
        for page_index, page_evaluation in enumerate(evaluation_result)
    )
def test_should_not_return_line_number_tokens_at_unusual_position2(self):
    """Number tokens laid out at multiple x positions (a grid rather than a
    left margin) should not be detected as line numbers."""
    number_tokens = flatten([
        [
            SimpleToken(str(line_no), dict(x=str(x * 50), y=str(line_no * 20)))
            for line_no in range(1, 5)
        ]
        for x in range(1, 3)
    ])
    doc = SimpleStructuredDocument(lines=[
        SimpleLine([token]) for token in number_tokens
    ])
    assert list(find_line_number_tokens(doc)) == []
def test_should_not_annotate_shorter_sequence_if_next_line_does_not_match(
        self):
    """A short partial match must stay untagged when the following line does
    not continue the target text."""
    lines = [
        _tokens_for_text('this is'),
        _tokens_for_text('something completely different')
    ]
    all_tokens = flatten(lines)
    doc = _document_for_tokens(lines)
    MatchingAnnotator(
        [TargetAnnotation('this is not matching', TAG1)]
    ).annotate(doc)
    assert _get_tags_of_tokens(all_tokens) == [None] * len(all_tokens)
def test_should_annotate_multiple_value_target_annotation_over_multiple_lines(
        self):
    """A list-valued target annotation should match tokens spread over
    multiple document lines."""
    tokens_by_line = [_tokens_for_text('this may'), _tokens_for_text('match')]
    matching_tokens = flatten(tokens_by_line)
    doc = _document_for_tokens(tokens_by_line)
    MatchingAnnotator(
        [TargetAnnotation(['this', 'may', 'match'], TAG1)]
    ).annotate(doc)
    assert _get_tags_of_tokens(matching_tokens) == [TAG1] * len(matching_tokens)
def test_should_not_override_annotation(self):
    """A later, overlapping target annotation must not overwrite tags that an
    earlier annotation already assigned."""
    lines = [_tokens_for_text('this is matching')]
    tokens = flatten(lines)
    target_annotations = [
        TargetAnnotation('this is matching', TAG1),
        TargetAnnotation('matching', TAG2)
    ]
    doc = _document_for_tokens(lines)
    MatchingAnnotator(target_annotations).annotate(doc)
    # all tokens keep TAG1; the overlapping TAG2 annotation does not win
    assert _get_tags_of_tokens(tokens) == [TAG1] * len(tokens)
def test_should_annotate_exactly_matching_across_multiple_lines(self):
    """An exact match spanning two lines should tag every token."""
    lines = [
        _tokens_for_text('this is matching'),
        _tokens_for_text('and continues here')
    ]
    tokens = flatten(lines)
    doc = _document_for_tokens(lines)
    MatchingAnnotator(
        [TargetAnnotation('this is matching and continues here', TAG1)]
    ).annotate(doc)
    assert _get_tags_of_tokens(tokens) == [TAG1] * len(tokens)
def test_should_annotate_same_sequence_multiple_times_if_enabled(self):
    """With match_multiple=True, every occurrence of the target gets tagged."""
    lines = [
        _tokens_for_text('this is matching'),
        _tokens_for_text('this is matching')
    ]
    tokens = flatten(lines)
    doc = _document_for_tokens(lines)
    MatchingAnnotator(
        [TargetAnnotation('this is matching', TAG1, match_multiple=True)]
    ).annotate(doc)
    assert _get_tags_of_tokens(tokens) == [TAG1] * len(tokens)
def test_should_annotate_mult_value_target_annot_rev_order_over_mult_lines_with_b_prefix(
        self):
    """A multi-value annotation supplied in reverse order should still match
    the document order, producing a single B- tag followed by I- tags."""
    tokens_by_line = [_tokens_for_text('this may'), _tokens_for_text('match')]
    matching_tokens = flatten(tokens_by_line)
    target_annotations = [
        TargetAnnotation(list(reversed(['this', 'may', 'match'])), TAG1)
    ]
    doc = _document_for_tokens(tokens_by_line)
    MatchingAnnotator(target_annotations, use_tag_begin_prefix=True).annotate(doc)
    expected = [B_TAG_1] + [I_TAG_1] * (len(matching_tokens) - 1)
    assert _get_tags_of_tokens(matching_tokens) == expected
def test_should_annotate_exactly_matching_across_multiple_lines_with_begin_prefix(
        self):
    """An exact multi-line match should get one B- tag, then I- tags."""
    lines = [
        _tokens_for_text('this is matching'),
        _tokens_for_text('and continues here')
    ]
    tokens = flatten(lines)
    doc = _document_for_tokens(lines)
    MatchingAnnotator(
        [TargetAnnotation('this is matching and continues here', TAG1)],
        use_tag_begin_prefix=True
    ).annotate(doc)
    expected = [B_TAG_1] + [I_TAG_1] * (len(tokens) - 1)
    assert _get_tags_of_tokens(tokens) == expected
def test_should_annotate_last_line_of_block_followed_by_other_text(self):
    """Every line of a multi-line block should be tagged even when the block
    is followed by unrelated text."""
    block_text_lines = [
        'this is the first row',
        'second row follows',
        'here we are on the third',
        'last line of block'
    ]
    block_tokens_per_line = _tokens_for_text_lines(block_text_lines)
    block_tokens = flatten(block_tokens_per_line)
    tokens_per_line = block_tokens_per_line + [_tokens_for_text('other text')]
    doc = _document_for_tokens(tokens_per_line)
    MatchingAnnotator(
        [TargetAnnotation('\n'.join(block_text_lines), TAG1)]
    ).annotate(doc)
    assert _get_tags_of_tokens(block_tokens) == [TAG1] * len(block_tokens)
def test_should_not_annotate_similar_sequence_multiple_times(self):
    """Without match_multiple, a later line that merely resembles part of the
    target must stay untagged."""
    matching_lines = [
        _tokens_for_text('this is matching'),
        _tokens_for_text('and continues here')
    ]
    not_matching_tokens = _tokens_for_text('this is matching')
    matching_tokens = flatten(matching_lines)
    doc = _document_for_tokens(matching_lines + [not_matching_tokens])
    MatchingAnnotator(
        [TargetAnnotation('this is matching and continues here', TAG1)]
    ).annotate(doc)
    assert _get_tags_of_tokens(matching_tokens) == [TAG1] * len(matching_tokens)
    assert _get_tags_of_tokens(not_matching_tokens) == (
        [None] * len(not_matching_tokens)
    )
def test_should_annotate_same_sequence_multiple_times_with_begin_prefix(
        self):
    """Each repeated match should restart with its own B- tag."""
    lines = [
        _tokens_for_text('this is matching'),
        _tokens_for_text('this is matching')
    ]
    tokens = flatten(lines)
    doc = _document_for_tokens(lines)
    MatchingAnnotator(
        [TargetAnnotation('this is matching', TAG1, match_multiple=True)],
        use_tag_begin_prefix=True
    ).annotate(doc)
    # the begin tag should appear at the beginning of each match
    assert _get_tags_of_tokens(tokens) == [
        B_TAG_1, I_TAG_1, I_TAG_1, B_TAG_1, I_TAG_1, I_TAG_1
    ]
def test_should_annotate_references(self):
    """Each reference line should be annotated with TAG1 when matching uses a
    lookahead window over multiple candidate lines."""
    reference_lines = [
        _tokens_for_text('1 this is reference A'),
        _tokens_for_text('2 this is reference B'),
        _tokens_for_text('3 this is reference C')
    ]
    reference_tokens = flatten(reference_lines)
    target_annotations = [
        TargetAnnotation('this is reference A', TAG1),
        TargetAnnotation('this is reference B', TAG1),
        TargetAnnotation('this is reference C', TAG1)
    ]
    pre_tokens = [_tokens_for_text('previous line')] * 5
    doc = _document_for_tokens(pre_tokens + reference_lines)
    SimpleMatchingAnnotator(
        target_annotations, lookahead_sequence_count=3
    ).annotate(doc)
    LOGGER.debug('doc: %s', _get_document_token_tags(doc))
    assert _get_tag_values_of_tokens(reference_tokens) == (
        [TAG1] * len(reference_tokens)
    )
def test_should_annotate_references_with_sub_tag_with_extend_to_line(self):
    """With sub annotations enabled, only the leading token ('1') should get
    the sub tag while every token gets the main tag."""
    matching_lines = [_tokens_for_text('1 this is reference A')]
    matching_tokens = flatten(matching_lines)
    target_annotations = [
        TargetAnnotation(
            '1 this is reference A', TAG1,
            sub_annotations=[TargetAnnotation('1', TAG2)]
        ),
    ]
    pre_tokens = [_tokens_for_text('previous line')] * 5
    doc = _document_for_tokens(pre_tokens + matching_lines)
    SimpleMatchingAnnotator(
        target_annotations,
        lookahead_sequence_count=3,
        extend_to_line_enabled=True,
        use_sub_annotations=True
    ).annotate(doc)
    LOGGER.debug('doc: %s', _get_document_token_tags(doc))
    assert _get_tag_values_of_tokens(matching_tokens) == (
        [TAG1] * len(matching_tokens)
    )
    assert _get_sub_tag_values_of_tokens(matching_tokens) == (
        [TAG2] + [None] * (len(matching_tokens) - 1)
    )
def test_should_annotate_same_sub_annotations_multiple_times_with_begin_prefic(
        self):
    """Repeated matches should each restart the sub-tag B-/I- sequence."""
    # NOTE(review): "prefic" in the test name looks like a typo for "prefix";
    # kept unchanged so the test id stays stable.
    lines = [
        _tokens_for_text('this is matching'),
        _tokens_for_text('this is matching')
    ]
    tokens = flatten(lines)
    target_annotations = [
        TargetAnnotation(
            'this is matching', TAG2,
            match_multiple=True,
            sub_annotations=[TargetAnnotation('this is', TAG1)]
        )
    ]
    doc = _document_for_tokens(lines)
    MatchingAnnotator(target_annotations, use_tag_begin_prefix=True).annotate(doc)
    sub_tags = [doc.get_sub_tag(token) for token in tokens]
    assert sub_tags == [B_TAG_1, I_TAG_1, None, B_TAG_1, I_TAG_1, None]
def test_should_annotate_shorter_sequence_over_multiple_lines_considering_next_line(
        self):
    """A short target split over two lines should only match where the two
    lines, taken together, form the target text."""
    # use a short sequence that wouldn't get matched on its own
    matching_text_lines = ('this may', 'match')
    matching_tokens_by_line = _tokens_for_text_lines(matching_text_lines)
    matching_tokens = flatten(matching_tokens_by_line)
    # repeat the same text on the two lines; only by combining the lines
    # would it be clear which tokens to match
    pre_tokens = _tokens_for_text(
        matching_text_lines[0] + ' be some other longer preceeding text'
    )
    post_tokens = _tokens_for_text(
        'this is some text after but no ' + matching_text_lines[1]
    )
    tokens_by_line = [
        pre_tokens + matching_tokens_by_line[0],
        matching_tokens_by_line[1] + post_tokens
    ]
    doc = _document_for_tokens(tokens_by_line)
    MatchingAnnotator([TargetAnnotation('this may match', TAG1)]).annotate(doc)
    assert _get_tags_of_tokens(matching_tokens) == [TAG1] * len(matching_tokens)
    assert _get_tags_of_tokens(pre_tokens) == [None] * len(pre_tokens)
    assert _get_tags_of_tokens(post_tokens) == [None] * len(post_tokens)
def tags_for_nodes(nodes):
    """Return the sorted, de-duplicated union of tags across all nodes."""
    all_tags = flatten([tags_for_node(node) for node in nodes])
    return sorted(set(all_tags))