Example #1
from flair.data import Sentence


def test_sentence_whitespace_tokenization():
    sentence = Sentence('I  love Berlin .')
    assert (4 == len(sentence.tokens))
    assert ('I' == sentence.get_token(1).text)
    assert ('love' == sentence.get_token(2).text)
    assert ('Berlin' == sentence.get_token(3).text)
    assert ('.' == sentence.get_token(4).text)
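Note that the double space between 'I' and 'love' still produces a single 'love' token: whitespace tokenization collapses consecutive separators, which is why exactly four tokens are asserted.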
Example #2
    def _get_tars_formatted_sentence(self, label, sentence):

        original_text = sentence.to_tokenized_string()

        # pair the candidate label with the sentence text, either as a
        # prefix ("<label> <separator> <text>") or as a suffix
        label_text_pair = f"{label} {self.separator} {original_text}" if self.prefix \
            else f"{original_text} {self.separator} {label}"

        # number of tokens the label prefix shifts the original tokens by
        label_length = 0 if not self.prefix else len(label.split(" ")) + len(
            self.separator.split(" "))

        tars_sentence = Sentence(label_text_pair, use_tokenizer=False)

        # remap each original token's tag relative to the candidate label
        # and copy it onto the (index-shifted) TARS sentence
        for token in sentence:
            tag = token.get_tag(self.get_current_tag_type()).value

            if "-" in tag and tag.split('-')[1] == label:
                tars_tag = tag.split('-')[0] + '-'
            elif tag == label:
                tars_tag = "S-"
            else:
                tars_tag = "O"

            tars_sentence.get_token(token.idx + label_length).add_tag(
                self.static_label_type, tars_tag)

        return tars_sentence
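For orientation, here is a minimal standalone sketch of the pairing step above (tars_pair is a hypothetical helper name; in the real method, self.prefix and self.separator are attributes of the TARS model):

def tars_pair(label, text, separator="is", prefix=True):
    # mirrors the label_text_pair expression in the method above
    return f"{label} {separator} {text}" if prefix else f"{text} {separator} {label}"

print(tars_pair("location", "George lives in Berlin"))
# -> location is George lives in Berlin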
Example #3
    def _get_tars_formatted_sentence(self, label, sentence):

        original_text = sentence.to_tokenized_string()

        label_text_pair = f"{label} {self.separator} {original_text}" if self.prefix \
            else f"{original_text} {self.separator} {label}"

        label_length = 0 if not self.prefix else len(label.split(" ")) + len(
            self.separator.split(" "))

        # make a tars sentence where all labels are O by default
        tars_sentence = Sentence(label_text_pair, use_tokenizer=False)
        for token in tars_sentence:
            token.add_tag(self.static_label_type, "O")

        # overwrite O labels with tags
        for token in sentence:
            tag = token.get_tag(self.get_current_label_type()).value

            if tag == "O" or tag == "":
                tars_tag = "O"
            elif tag == label:
                tars_tag = "S-"
            elif len(tag) > 1 and tag[1] == "-" and tag[2:] == label:
                tars_tag = tag.split('-')[0] + '-'
            else:
                tars_tag = "O"

            tars_sentence.get_token(token.idx + label_length).add_tag(
                self.static_label_type, tars_tag)

        return tars_sentence
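The branch logic above can be read as a standalone remapping function (remap_tag is a hypothetical name used only for illustration):

def remap_tag(tag, label):
    # same branch order as the loop body above
    if tag == "O" or tag == "":
        return "O"
    if tag == label:
        return "S-"
    if len(tag) > 1 and tag[1] == "-" and tag[2:] == label:
        return tag.split('-')[0] + '-'
    return "O"

assert remap_tag("B-location", "location") == "B-"
assert remap_tag("location", "location") == "S-"
assert remap_tag("S-person", "location") == "O"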
Example #4
    def _get_tars_formatted_sentence(self, label, sentence):

        original_text = sentence.to_tokenized_string()

        label_text_pair = (f"{label} {self.separator} {original_text}"
                           if self.prefix else
                           f"{original_text} {self.separator} {label}")

        label_length = 0 if not self.prefix else len(label.split(" ")) + len(
            self.separator.split(" "))

        # make a tars sentence where all labels are O by default
        tars_sentence = Sentence(label_text_pair, use_tokenizer=False)

        # copy every span carrying the current label onto the TARS
        # sentence, shifting token indices past the label prefix
        for entity_label in sentence.get_labels(self.label_type):
            if entity_label.value == label:
                new_span = [
                    tars_sentence.get_token(token.idx + label_length)
                    for token in entity_label.span
                ]
                tars_sentence.add_complex_label(
                    self.static_label_type,
                    SpanLabel(Span(new_span), value="entity"))

        return tars_sentence
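The index shift is the subtle part: with prefix=True, every original token lands label_length positions to the right in the TARS sentence. A quick check of the offset arithmetic, with values chosen purely for illustration:

label = "location"
separator = "is"
label_length = len(label.split(" ")) + len(separator.split(" "))
print(label_length)  # 2 -- a token at idx 1 sits at idx 3 in the TARS sentence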
Example #5
from flair.data import Sentence
from flair.embeddings import WordEmbeddings
from flair.embeddings import CharacterEmbeddings
from flair.embeddings import BytePairEmbeddings
from flair.embeddings import BertEmbeddings, ELMoEmbeddings
from flair.embeddings import FlairEmbeddings, StackedEmbeddings

# Create a Sentence object. Flair has two core data types, Sentence and Token;
# a Sentence is made up of a sequence of Tokens.
sentence = Sentence('The grass is green .')
print(sentence)
print(sentence.get_token(4))
print(sentence[3])

# Load GloVe embeddings and embed the sentence
glove_embedding_forward = WordEmbeddings('model/glove.gensim')
sentence = Sentence('The grass is green .')
glove_embedding_forward.embed(sentence)
for token in sentence:
    print(token)
    print(token.embedding)

# Load fastText embeddings and embed the sentence
fasttext_embedding_forward = WordEmbeddings('model/zh-wiki-fasttext-300d-1M')
sentence = Sentence('The grass is green .')
fasttext_embedding_forward.embed(sentence)
for token in sentence:
    print(token)
    print(token.embedding)

# Load Flair (contextual string) embeddings
flair_embedding_forward = FlairEmbeddings('model/news-forward-0.4.1.pt')
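The snippet stops right after loading the Flair model, and StackedEmbeddings is imported but never used. A minimal continuation, assuming the model paths above point at valid files:

# stack GloVe and Flair embeddings so each token gets both vectors concatenated
stacked_embeddings = StackedEmbeddings(
    [glove_embedding_forward, flair_embedding_forward])
sentence = Sentence('The grass is green .')
stacked_embeddings.embed(sentence)
for token in sentence:
    print(token)
    print(token.embedding.shape)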
Example #6
import pytest

from flair.data import Sentence


def test_sentence_get_item():
    sentence = Sentence('I love Berlin.', use_tokenizer=True)
    assert (sentence.get_token(1) == sentence[0])
    assert (sentence.get_token(3) == sentence[2])
    with pytest.raises(IndexError):
        _ = sentence[4]
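The test pins down the off-by-one convention: get_token() is 1-based while square-bracket indexing is 0-based, so get_token(1) and sentence[0] are the same token, and with four tokens in the sentence, sentence[4] is out of range.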