# NOTE(review): everything down to `return segments` is the tail of a method
# whose `def` line and the initialisation of `segments` precede this chunk.
# It is laid out at method-body indentation; confirm against the full file.
        # Scan right-to-left: `i` is the index of the last character of the
        # word currently being matched.
        i = len(text) - 1
        while i >= 0:
            # Fall back to the single character at `i` if nothing longer matches.
            longest_word = text[i]
            # `j` grows from 0, so the candidates text[j:i + 1] are tried from
            # longest to shortest and the first hit is the longest match.
            for j in range(0, i):
                word = text[j:i + 1]
                # `self.am.match` is defined outside this chunk; presumably it
                # reports whether `word` is known to the automaton -- confirm.
                if self.am.match(word):
                    # NOTE(review): the original indentation was lost. Any match
                    # here has length >= 2 while `longest_word` still has
                    # length 1, so placing `break` inside or just after this
                    # length check behaves the same.
                    if len(word) > len(longest_word):
                        longest_word = word
                        break
            segments.append(longest_word)
            # Step left past the word that was just emitted.
            i -= len(longest_word)
        # Matching ran from the end towards the start, so the collected words
        # are in reverse order and must be flipped before returning.
        segments.reverse()
        return segments


if __name__ == "__main__":
    import dataset
    import evaluation

    # Demo: load the word table, build a tokenizer and print the segmentation
    # of each sample sentence.
    words, total = dataset.load_freq_words(proba=True, prefix=False)
    tokenizer = AutomatonTokenizer(words, algorithm="forward_segment")
    for text in dataset.load_sentences():
        print(tokenizer.cut(text))

    # Check that segmentation is lossless: joining the pieces must reproduce
    # the input text exactly.
    text = dataset.load_human_history()
    words = tokenizer.cut(text)
    assert "".join(words) == text

    # Time `tokenizer.cut` on the same text.
    evaluation.evaluate_speed(tokenizer.cut, text, rounds=5)
# NOTE(review): everything down to `return DAG` is the tail of a method whose
# `def` line and the creation of `DAG` precede this chunk. It is laid out at
# method-body indentation; confirm against the full file.
        size = len(sentence)
        for i in range(size):
            # Record every end index j such that sentence[i:j + 1] is a word
            # with a truthy value in self.words.
            for j in range(i, size):
                word = sentence[i:j + 1]
                if word in self.words and self.words[word]:
                    DAG[i].append(j)
            # NOTE(review): the original indentation was lost; this fallback is
            # placed after the inner loop, so position i gets the edge i -> i
            # (a single character) only when no word starts there -- confirm.
            if not DAG[i]:
                DAG[i].append(i)
        return DAG

    def word_logproba(self, word):
        """Return the log-probability of ``word``.

        Computed as ``log(value) - self.logtotal``, where ``value`` is the
        entry stored for ``word`` in ``self.words``. A missing word, or one
        stored with a falsy value such as 0, is treated as having value 1 so
        that the logarithm is defined.
        """
        return math.log(self.words.get(word) or 1) - self.logtotal


if __name__ == "__main__":
    import dataset
    import evaluation

    # Demo: load the word table, build a tokenizer and print the segmentation
    # of each sample sentence.
    words, total = dataset.load_freq_words()
    # words, total = dataset.load_chinese_words()
    tokenizer = Tokenizer(words, total)
    for text in dataset.load_sentences():
        print(tokenizer.cut(text))

    # Check that segmentation is lossless: joining the pieces must reproduce
    # the input text exactly.
    text = dataset.load_human_history()
    words = tokenizer.cut(text)
    assert "".join(words) == text

    # Time `tokenizer.cut` on the same text.
    evaluation.evaluate_speed(tokenizer.cut, text, rounds=5)
import math
import dataset
import ahocorasick

# Usage example for ahocorasick: build an automaton from the word table and
# list every dictionary word found in a sample sentence.
words, _ = dataset.load_freq_words(proba=True)
logtotal = math.log(sum(words.values()))
am = ahocorasick.Automaton()
for word, proba in words.items():
    # Entries with proba == 0 use 0 in place of log(proba), which is undefined.
    # NOTE(review): `logproba` is not used in the visible code; the automaton
    # stores the raw `proba` -- confirm which value is intended.
    logproba = (math.log(proba) if proba > 0 else 0) - logtotal
    am.add_word(word, (word, proba))
am.make_automaton()

text = "黑天鹅和灰犀牛是两个突发性事件"
# Each hit is (end_idx, stored value); in the sample output below, end_idx is
# the index of the last character of the matched word.
for end_idx, (word, proba) in am.iter(text):
    print(end_idx, word, proba)

# Sample output of the loop above (the string continues past this chunk).
"""
0 黑 0.0001879472409333384
1 黑天 7.820042779627218e-07
1 天 0.0005986325939749099
2 黑天鹅 4.9915166678471605e-08
2 天鹅 9.184390668838775e-06
2 鹅 1.1680149002762355e-05
3 和 0.009247866122464898
4 灰 4.780209128908297e-05
5 灰犀 0.0
5 犀 2.2129057227455743e-06
6 灰犀牛 4.9915166678471605e-08
6 犀牛 2.645503833958995e-06