def test_sentence_whitespace_tokenization():
    sentence = Sentence('I love Berlin .')

    assert 4 == len(sentence.tokens)
    assert 'I' == sentence.get_token(1).text
    assert 'love' == sentence.get_token(2).text
    assert 'Berlin' == sentence.get_token(3).text
    assert '.' == sentence.get_token(4).text
def _get_tars_formatted_sentence(self, label, sentence):
    original_text = sentence.to_tokenized_string()

    label_text_pair = f"{label} {self.separator} {original_text}" if self.prefix \
        else f"{original_text} {self.separator} {label}"

    # number of tokens prepended before the original text (label + separator)
    label_length = 0 if not self.prefix else len(label.split(" ")) + len(self.separator.split(" "))

    tars_sentence = Sentence(label_text_pair, use_tokenizer=False)

    # copy each tag into the TARS sentence, reduced to a label-agnostic scheme:
    # tags matching the query label keep their BIOES prefix, all others become "O"
    for token in sentence:
        tag = token.get_tag(self.get_current_tag_type()).value

        if "-" in tag and tag.split('-')[1] == label:
            tars_tag = tag.split('-')[0] + '-'
        elif tag == label:
            tars_tag = "S-"
        else:
            tars_tag = "O"

        tars_sentence.get_token(token.idx + label_length).add_tag(self.static_label_type, tars_tag)

    return tars_sentence
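The token-index shift used above (and in the variants below) is easiest to see in isolation. A minimal standalone sketch of the arithmetic in plain Python, not flair API; the separator "[SEP]" and label "person" are illustrative assumptions:

# Standalone sketch of the TARS prefix/offset arithmetic (not flair API)
label = "person"      # assumed query label
separator = "[SEP]"   # assumed separator string
original_text = "George Washington went to Washington"

# prefix=True case: label and separator are prepended to the original text
label_text_pair = f"{label} {separator} {original_text}"
label_length = len(label.split(" ")) + len(separator.split(" "))  # 2 tokens prepended

# a token with 1-based index i in the original sentence lands at
# list index (i - 1) + label_length in the whitespace-tokenized pair
tokens = label_text_pair.split(" ")
assert tokens[(1 - 1) + label_length] == "George"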
def _get_tars_formatted_sentence(self, label, sentence):
    original_text = sentence.to_tokenized_string()

    label_text_pair = f"{label} {self.separator} {original_text}" if self.prefix \
        else f"{original_text} {self.separator} {label}"

    # number of tokens prepended before the original text (label + separator)
    label_length = 0 if not self.prefix else len(label.split(" ")) + len(self.separator.split(" "))

    # make a tars sentence where all labels are O by default
    tars_sentence = Sentence(label_text_pair, use_tokenizer=False)
    for token in tars_sentence:
        token.add_tag(self.static_label_type, "O")

    # overwrite O labels with tags
    for token in sentence:
        tag = token.get_tag(self.get_current_label_type()).value

        if tag == "O" or tag == "":
            tars_tag = "O"
        elif tag == label:
            tars_tag = "S-"
        elif tag[1:2] == "-" and tag[2:] == label:  # slicing avoids IndexError on one-char tags
            tars_tag = tag.split('-')[0] + '-'
        else:
            tars_tag = "O"

        tars_sentence.get_token(token.idx + label_length).add_tag(self.static_label_type, tars_tag)

    return tars_sentence
def _get_tars_formatted_sentence(self, label, sentence):
    original_text = sentence.to_tokenized_string()

    label_text_pair = (f"{label} {self.separator} {original_text}"
                       if self.prefix
                       else f"{original_text} {self.separator} {label}")

    # number of tokens prepended before the original text (label + separator)
    label_length = 0 if not self.prefix else len(label.split(" ")) + len(self.separator.split(" "))

    # make a tars sentence where all labels are O by default
    tars_sentence = Sentence(label_text_pair, use_tokenizer=False)

    # mark each span of the query label as a generic "entity" span,
    # shifted by label_length when the label text is prefixed
    for entity_label in sentence.get_labels(self.label_type):
        if entity_label.value == label:
            new_span = [
                tars_sentence.get_token(token.idx + label_length)
                for token in entity_label.span
            ]
            tars_sentence.add_complex_label(
                self.static_label_type,
                SpanLabel(Span(new_span), value="entity"))

    return tars_sentence
from flair.data import Sentence
from flair.embeddings import WordEmbeddings
from flair.embeddings import CharacterEmbeddings
from flair.embeddings import BytePairEmbeddings
from flair.embeddings import BertEmbeddings, ELMoEmbeddings
from flair.embeddings import FlairEmbeddings, StackedEmbeddings

# Create a Sentence object. Flair has two core data objects, Sentence and Token;
# a Sentence consists of a sequence of Tokens.
sentence = Sentence('The grass is green .')
print(sentence)
print(sentence.get_token(4))  # get_token() is 1-indexed
print(sentence[3])            # [] indexing is 0-based: same token as above

# Load GloVe embeddings and embed a sentence
glove_embedding_forward = WordEmbeddings('model/glove.gensim')
sentence = Sentence('The grass is green .')
glove_embedding_forward.embed(sentence)
for token in sentence:
    print(token)
    print(token.embedding)

# Load fastText embeddings and embed a sentence
fasttext_embedding_forward = WordEmbeddings('model/zh-wiki-fasttext-300d-1M')
sentence = Sentence('The grass is green .')
fasttext_embedding_forward.embed(sentence)
for token in sentence:
    print(token)
    print(token.embedding)

# Load Flair embeddings
flair_embedding_forward = FlairEmbeddings('model/news-forward-0.4.1.pt')
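StackedEmbeddings is imported above but never used. A minimal sketch of stacking the GloVe and Flair embeddings already loaded above (model paths are those from this script):

# Combine the word-level and contextual embeddings loaded above
stacked_embeddings = StackedEmbeddings([
    glove_embedding_forward,
    flair_embedding_forward,
])

sentence = Sentence('The grass is green .')
stacked_embeddings.embed(sentence)
for token in sentence:
    print(token)
    print(token.embedding)  # concatenation of all stacked embeddings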
def test_sentence_get_item():
    sentence = Sentence('I love Berlin.', use_tokenizer=True)

    assert sentence.get_token(1) == sentence[0]
    assert sentence.get_token(3) == sentence[2]

    with pytest.raises(IndexError):
        token = sentence[4]