def tokenize(self, text):
    # type: (Text) -> List[Token]
    import MeCab

    # Japanese tinysegmenter
    # import tinysegmenter
    # tokenizer = tinysegmenter.TinySegmenter()
    # words = tokenizer.tokenize(text)

    # Japanese janome
    # from janome.tokenizer import Tokenizer
    # tokenizer = Tokenizer()
    # words = tokenizer.tokenize(text, wakati=True)
    # tokenized = [(word, text.index(word), text.index(word) + len(word))
    #              for word in words]

    # Japanese MeCab with the NEologd dictionary
    m = MeCab.Tagger(" -d /usr/lib/mecab/dic/mecab-ipadic-neologd/")
    m.parse("")
    node = m.parseToNode(text)
    words = []
    while node:
        words.append(node.surface)
        node = node.next

    tokenized = [(word, text.find(word), text.find(word) + len(word))
                 for word in words]
    tokens = [Token(word, start) for (word, start, end) in tokenized]
    return tokens
def tokenize(self, text):
    # type: (Text) -> List[Token]
    import jieba

    tokenized = jieba.tokenize(text)
    tokens = [Token(word, start) for (word, start, end) in tokenized]
    return tokens
def tokenize(self, text):
    # type: (Text) -> List[Token]
    # Require a space or end of string after the punctuation,
    # because we do not want to replace 10.000 with 10 000.
    words = re.sub(r'[.,!?]+(\s|$)', ' ', text).split()

    running_offset = 0
    tokens = []
    for word in words:
        word_offset = text.index(word, running_offset)
        word_len = len(word)
        running_offset = word_offset + word_len
        tokens.append(Token(word, word_offset))
    return tokens
def test_count_vector_featurizer_using_tokens(tokens, expected):
    from rasa_nlu.featurizers.count_vectors_featurizer import \
        CountVectorsFeaturizer

    ftr = CountVectorsFeaturizer({"token_pattern": r'(?u)\b\w+\b'})

    # Use an empty string instead of real text to make sure the count
    # vector can only be built from the `tokens` feature; relying on
    # `message.text` would not give the correct result here.
    tokens_feature = [Token(i, 0) for i in tokens]

    train_message = Message("")
    train_message.set("tokens", tokens_feature)
    # this is needed for a valid training example
    train_message.set("intent", "bla")

    data = TrainingData([train_message])
    ftr.train(data)

    test_message = Message("")
    test_message.set("tokens", tokens_feature)
    ftr.process(test_message)

    assert np.all(test_message.get("text_features") == expected)
def tokenize(self, text: str) -> List[Token]:
    """Tokenize the sentence."""
    if self.user_dict_dir is not None:
        self.load_user_dictionary(self.user_dict_dir)

    tokenized = jieba.tokenize(text)
    tokens = [Token(word, start) for (word, start, end) in tokenized]
    return tokens
def tokenize(self, text):
    # type: (Text) -> List[Token]
    tokenized = self.tokenizer.tokenize(text)
    tokens = [Token(word, start) for (word, start, end) in tokenized]
    return tokens
def process(self, message, **kwargs):
    tokens_s = []
    for token in message.get("tokens"):
        if token.text in self.slangs:
            # expand a known slang term into its full words
            for subtoken in self.slangs[token.text].split(" "):
                tokens_s.append(Token(subtoken, 0))
        else:
            tokens_s.append(token)
    message.set("tokens_slangprocessed", tokens_s)
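# Usage sketch for the slang-expansion processor above. The entries in the
# slang map are purely illustrative assumptions; only the `slangs` attribute
# name comes from the code itself.
#
#   self.slangs = {"u": "you", "gr8": "great"}
#   incoming "tokens":       [Token("u", 0), Token("r", 2), Token("gr8", 4)]
#   "tokens_slangprocessed": [Token("you", 0), Token("r", 2), Token("great", 0)]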
def tokenize(self, text):
    # type: (Text) -> List[Token]
    self.dictionary = "mecabrc"
    self.tagger = MeCab.Tagger(self.dictionary)

    if not text:
        return []

    words = []
    if type(text) != str:
        text = u''.join(text).encode('utf-8')

    node = self.tagger.parseToNode(str(text))
    running_offset = 0
    word_offset = 0
    while node:
        features = node.feature.split(',')
        if features[self.INDEX_CATEGORY] in self.TARGET_CATEGORIES:
            if features[self.INDEX_ROOT_FORM] == "*":
                # no root form available, fall back to the surface form
                word_offset = text.index(node.surface, running_offset)
                word_len = len(node.surface)
                running_offset = word_offset + word_len
                words.append(Token(node.surface, word_offset))
            else:
                try:
                    word_offset = text.index(
                        features[self.INDEX_ROOT_FORM], running_offset)
                    word_len = len(features[self.INDEX_ROOT_FORM])
                    running_offset = word_offset + word_len
                    words.append(
                        Token(features[self.INDEX_ROOT_FORM], word_offset))
                except ValueError:
                    print("No such string in text")
                    if not word_offset:
                        word_offset = 0
                        word_len = 1
                    running_offset = word_offset + word_len
        node = node.next
    return words
def tokenize(self, text):
    # type: (Text) -> List[Token]
    import jieba

    if self.dictionary_path is not None:
        self.load_custom_dictionary(self.dictionary_path)

    tokenized = jieba.tokenize(text)
    tokens = [Token(word, start) for (word, start, end) in tokenized]
    return tokens
def tokenize(self, text):
    # type: (Text) -> List[Token]
    tokens = []
    start = 0
    for word in self.seg.segment(text):
        tokens.append(Token(word, start))
        start += len(word)
    return tokens
def tokenize_text(self, text):
    # type: (Text) -> List[Text]
    tokenized = self.tokenizer.tokenize(text)
    tokens = [Token(word, start) for (word, start, end) in tokenized]
    token_text = [token.text for token in tokens]
    return token_text
def process(self, message, **kwargs):
    from textblob import Word

    token_spellchecked = []
    for token in message.get("tokens_slangprocessed"):
        # spell-correct each token with TextBlob
        corrected = str(Word(token.text).correct())
        if len(corrected) >= 1:
            print(str(token.text) + " corrected to " + corrected)
            token_spellchecked.append(Token(corrected, 0))
    message.set("token_spellchecked", token_spellchecked)
def tokenize(self, text):
    # type: (Text) -> List[Token]
    import mitie

    _text = text.encode('utf-8')
    tokenized = mitie.tokenize_with_offsets(_text)
    tokens = [
        Token(token.decode('utf-8'),
              self._byte_to_char_offset(_text, offset))
        for token, offset in tokenized
    ]
    return tokens
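# The `_byte_to_char_offset` helper used above is not part of this snippet.
# A minimal sketch of what it could look like (an assumption, not the
# original implementation): MITIE reports byte offsets into the UTF-8
# encoded text, so decoding the prefix up to the offset and counting its
# characters yields the character offset.
def _byte_to_char_offset(self, text_bytes, byte_offset):
    # type: (bytes, int) -> int
    return len(text_bytes[:byte_offset].decode('utf-8'))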
def tokenize(self, text):
    # type: (Text) -> List[Token]
    words = text.split()

    running_offset = 0
    tokens = []
    for word in words:
        word_offset = text.index(word, running_offset)
        word_len = len(word)
        running_offset = word_offset + word_len
        tokens.append(Token(word, word_offset))
    return tokens
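# Example of the whitespace tokenizer above; offsets are character positions
# in the original string (the example sentence is illustrative only):
#
#   tokenize("forecast for lunch")
#   -> [Token("forecast", 0), Token("for", 9), Token("lunch", 13)]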
def tokenize(self, text):
    # type: (Text) -> List[Token]
    import jieba

    words = jieba.lcut(text.encode('utf-8'))

    running_offset = 0
    tokens = []
    for word in words:
        word_offset = text.index(word, running_offset)
        word_len = len(word)
        running_offset = word_offset + word_len
        tokens.append(Token(word, word_offset))
    return tokens
def tokenize(self, text):
    # type: (Text) -> List[Token]
    from pyhanlp import HanLP

    terms = HanLP.segment(text)

    running_offset = 0
    tokens = []
    for term in terms:
        word_offset = text.index(term.word, running_offset)
        word_len = len(term.word)
        running_offset = word_offset + word_len
        tokens.append(Token(term.word, word_offset))
    logging.debug(terms)
    return tokens
def tokenize(self, text):
    # type: (Text) -> List[Token]
    seg = self.Mycut(text)
    seg = self.add_userdict(seg)
    seg = self.split_userdict(seg)
    seg = seg.split('<>')

    tokens = []
    offset = 0
    for word in seg:
        tokens.append(Token(word, offset))
        offset += len(word)
    return tokens
def tokenize(self, text):
    # type: (Text) -> List[Token]
    # Require a space or end of string after the punctuation,
    # because we do not want to replace 10.000 with 10 000.
    words = re.sub(r'[.,!?]+(\s|$)', ' ', text).split()

    # lowercase and drop stop words
    tokensd = [
        tokend.lower() for tokend in words
        if not tokend.lower() in stop_words
    ]

    # lemmatize with the spaCy pipeline
    doc = nlp(str(' '.join(tokensd)))
    words = [str(lemm.lemma_) for lemm in doc]

    # strip punctuation and non-ASCII characters from the lemmas
    words = [
        re.sub(
            r'[^\x00-\x7f]', '',
            re.sub('[\t\r\n,)([\]!%|!#$%&*+,.-/:;<=>?@^_`{|}~?]', '',
                   str(i))).strip()
        for i in words
    ]

    running_offset = 0
    tokens = []
    texts = ' '.join(words)
    for word in words:
        word_offset = texts.index(word, running_offset)
        word_len = len(word)
        running_offset = word_offset + word_len
        tokens.append(Token(word, word_offset))
    return tokens
def tokenize(self, text):
    # type: (Text) -> List[Token]
    # words = self.parse_with_cabocha(text)
    words = self.parse_with_knp(text)

    running_offset = 0
    tokens = []
    for word in words:
        word_offset = text.index(word, running_offset)
        word_len = len(word)
        running_offset = word_offset + word_len
        tokens.append(Token(word, word_offset))
    return tokens
def tokenize(self, text):
    # type: (Text) -> List[Token]
    token_list = self.mecab.morphs(text)

    running_offset = 0
    result = []
    for token in token_list:
        token_offset = text.index(token, running_offset)
        token_len = len(token)
        running_offset = token_offset + token_len
        result.append(Token(token, token_offset))
    return result
def tokenize(self, text: Text) -> List[Token]:
    # Require a space or end of string after the punctuation,
    # because we do not want to replace 10.000 with 10 000.
    words = re.sub(r'[.,!?]+(\s|$)', ' ', text).split()

    running_offset = 0
    tokens = []
    for word in words:
        word_offset = text.index(word, running_offset)
        word_len = len(word)
        running_offset = word_offset + word_len
        tokens.append(Token(word, word_offset))
    return tokens
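# Example of the punctuation handling above (the sentence is illustrative
# only): trailing punctuation followed by whitespace or end of string is
# stripped, while "10.000" stays intact because its dot is not followed by
# whitespace.
#
#   re.sub(r'[.,!?]+(\s|$)', ' ', "hello, I paid 10.000 dollars!")
#   -> "hello I paid 10.000 dollars "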
def tokenize(self, text): """ Tokenize a sentence and yields tuples of (word, start, end) type: (Text) -> List[Token] Parameter: - text: the str(unicode) to be segmented. """ tokens = [] start = 0 for char in list(text): print(char) #yield (w, start, start + width) tokens.append(Token(char, start)) start += 1 return tokens
def tokenize(self, text):
    words = []
    mecab_features = []
    node = self.mecab.parseToNode(text).next
    while node:
        words.append(node.surface)
        mecab_features.append(node.feature.split(','))
        node = node.next

    running_offset = 0
    tokens = []
    for word in words:
        word_offset = text.index(word, running_offset)
        running_offset = word_offset + len(word)
        tokens.append(Token(word, word_offset))
    return tokens, mecab_features
def tokenize_msg(text, msg_chunks):
    words = []
    for chunk in msg_chunks.chunks:
        for token in chunk.tokens:
            words.append(token.surface)

    running_offset = 0
    tokens = []
    for word in words:
        word_offset = text.index(word, running_offset)
        word_len = len(word)
        running_offset = word_offset + word_len
        tokens.append(Token(word, word_offset))
    return tokens
def tokenize(self, text):
    # type: (Text) -> List[Token]
    from underthesea import word_sent

    # Vietnamese pyvi
    # from pyvi.pyvi import ViTokenizer
    # tokenizer = ViTokenizer()
    # words = tokenizer.tokenize(text)

    # Vietnamese underthesea
    words = word_sent(text)
    tokenized = [(word, text.find(word), text.find(word) + len(word))
                 for word in words]
    tokens = [Token(word, start) for (word, start, end) in tokenized]
    return tokens
def tokenize(self, text): """ Tokenize a sentence and yields tuples of (word, start, end) type: (Text) -> List[Token] Parameter: - text: the str(unicode) to be segmented. """ tokens = [] tokenized = self.tokenizer.segment(text) print(tokenized) start = 0 for term in tokenized: w = str(term).split('/')[0] width = len(w) #yield (w, start, start + width) tokens.append(Token(w, start)) start += width return tokens
def tokenize(self, text):
    # type: (Text) -> List[Token]
    words = text.split()

    running_offset = 0
    tokens = []
    for word in words:
        word_offset = text.index(word, running_offset)
        word_len = len(word)
        running_offset = word_offset + word_len
        # map number words to their canonical form before emitting the token
        if word in self.numbermap:
            word = self.numbermap[word]
        tokens.append(Token(word, word_offset))
    return tokens
def tokenize(self, doc: 'Doc') -> typing.List[Token]:
    return [Token(t.text, t.idx) for t in doc]
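# Usage sketch for the spaCy-based tokenizer above (the model name is an
# assumption; any spaCy pipeline that produces a Doc would do):
#
#   import spacy
#   nlp = spacy.load("en_core_web_sm")
#   tokenizer.tokenize(nlp("hello rasa"))
#   -> [Token("hello", 0), Token("rasa", 6)]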
logging.basicConfig(level="DEBUG")


@pytest.fixture(scope="session")
def duckling_interpreter(component_builder, tmpdir_factory):
    conf = RasaNLUModelConfig({"pipeline": [{"name": "ner_duckling_http"}]})
    return utilities.interpreter_for(
        component_builder,
        data="./data/examples/rasa/demo-rasa.json",
        path=tmpdir_factory.mktemp("projects").strpath,
        config=conf)


# Chinese Example
# "对面食过敏" -> To be allergic to wheat-based food
# wrong segmentation: opposite / food / allergy
CH_wrong_segmentation = [Token("对面", 0), Token("食", 2), Token("过敏", 3)]
# correct segmentation: towards / wheat-based food / allergy
CH_correct_segmentation = [Token("对", 0), Token("面食", 1), Token("过敏", 3)]
CH_wrong_entity = {"start": 0, "end": 2, "value": "对面", "entity": "direction"}
CH_correct_entity = {
    "start": 1,
    "end": 3,
    "value": "面食",
    "entity": "food_type"
}

# EN example
def tokenize(self, doc):
    # type: (Doc) -> List[Token]
    return [Token(t.text, t.idx) for t in doc]