Пример #1
0
 def test_should_annotate_ignoring_space_after_dot_short_sequence(self):
     """A token like 'A.B.,' should match the target text 'A. B.' even
     though the target contains a space after the dot."""
     tokens = [SimpleToken('A.B.,')]
     annotations = [TargetAnnotation('A. B.', TAG1)]
     document = _document_for_tokens([tokens])
     SimpleMatchingAnnotator(annotations).annotate(document)
     expected_tags = [TAG1] * len(tokens)
     assert _get_tag_values_of_tokens(tokens) == expected_tags
Пример #2
0
def get_entity_tokens(tag: str, value: str) -> List[SimpleToken]:
    """Tokenize *value* and return tokens tagged with *tag* in BIO style.

    The first token gets the B- (begin) prefix, all following tokens the
    I- (inside) prefix.
    """
    tokens = []
    for index, token_text in enumerate(get_token_texts_for_text(value)):
        # Only the first token of the entity is marked as "begin".
        prefix = B_TAG_PREFIX if index == 0 else I_TAG_PREFIX
        tokens.append(
            SimpleToken(token_text, tag=add_tag_prefix(tag, prefix=prefix))
        )
    return tokens
Пример #3
0
def _tokens_for_text(text):
    """Split *text* on non-word characters and wrap each non-blank piece
    (including the separators themselves) in a SimpleToken."""
    parts = re.split(r'(\W)', text)
    return [SimpleToken(part) for part in parts if part.strip()]
Пример #4
0
 def test_should_not_fail_on_empty_line_with_blank_token(self):
     """Annotating a document whose only token is the empty string must
     not raise."""
     annotations = [TargetAnnotation('this is. matching', TAG1)]
     blank_line = [SimpleToken('')]
     document = _document_for_tokens([blank_line])
     SimpleMatchingAnnotator(annotations).annotate(document)
Пример #5
0
 def test_should_get_tokens_between(self):
     """tokens_between should yield only the tokens that fall inside the
     given character span."""
     token_list = [SimpleToken(TOKEN_1), SimpleToken(TOKEN_2)]
     document = SimpleStructuredDocument(lines=[SimpleLine(token_list)])
     wrapper = SequenceWrapper(document, token_list)
     span = (0, len(TOKEN_1))
     assert list(wrapper.tokens_between(span)) == [token_list[0]]
Пример #6
0
 def test_should_join_text_without_space(self):
     """str() of the wrapper should concatenate token texts directly when
     the first token's trailing whitespace is the empty string."""
     token_list = [SimpleToken(TOKEN_1), SimpleToken(TOKEN_2)]
     token_list[0].whitespace = ''
     document = SimpleStructuredDocument(lines=[SimpleLine(token_list)])
     wrapper = SequenceWrapper(document, token_list)
     assert str(wrapper) == TOKEN_1 + TOKEN_2