예제 #1
0
        i = len(text) - 1
        while i >= 0:
            longest_word = text[i]
            for j in range(0, i):
                word = text[j:i + 1]
                if self.am.match(word):
                    if len(word) > len(longest_word):
                        longest_word = word
                        break
            segments.append(longest_word)
            i -= len(longest_word)
        # 因为是从后往前匹配,输出去需要逆转
        segments.reverse()
        return segments


if __name__ == "__main__":
    # Demo / smoke test: tokenize the sample sentences, verify that the
    # tokenization is lossless on a long text, then time the tokenizer.
    import dataset
    import evaluation

    # load_freq_words returns a pair; only the first element is needed here.
    vocab, _ = dataset.load_freq_words(proba=True, prefix=False)
    tokenizer = AutomatonTokenizer(vocab, algorithm="forward_segment")
    for sentence in dataset.load_sentences():
        print(tokenizer.cut(sentence))

    # Completeness check: joining the tokens must reproduce the input text.
    history = dataset.load_human_history()
    assert "".join(tokenizer.cut(history)) == history

    evaluation.evaluate_speed(tokenizer.cut, history, rounds=5)
예제 #2
0
        size = len(sentence)
        for i in range(size):
            for j in range(i, size):
                word = sentence[i:j + 1]
                if word in self.words and self.words[word]:
                    DAG[i].append(j)
            if not DAG[i]:
                DAG[i].append(i)
        return DAG

    def word_logproba(self, word):
        """Return the log-probability of ``word``.

        The value stored for ``word`` in ``self.words`` is used as its
        weight.  A missing word, or one whose stored value is falsy
        (``0`` or ``None``), falls back to a weight of 1, which makes the
        result equal to ``-self.logtotal``.
        """
        weight = self.words.get(word)
        if not weight:
            # Unknown or zero-weight word: use 1 so that log() is defined.
            weight = 1
        return math.log(weight) - self.logtotal


if __name__ == "__main__":
    # Demo / smoke test: tokenize the sample sentences, verify that the
    # tokenization is lossless on a long text, then time the tokenizer.
    import dataset
    import evaluation

    # Alternative vocabulary source: dataset.load_chinese_words()
    vocab, vocab_total = dataset.load_freq_words()
    tokenizer = Tokenizer(vocab, vocab_total)
    for sentence in dataset.load_sentences():
        print(tokenizer.cut(sentence))

    # Completeness check: joining the tokens must reproduce the input text.
    history = dataset.load_human_history()
    assert "".join(tokenizer.cut(history)) == history

    evaluation.evaluate_speed(tokenizer.cut, history, rounds=5)
예제 #3
0
import math
import dataset
import ahocorasick

# Example usage of the ahocorasick Automaton.

# Load the vocabulary as a word -> value mapping (requested with proba=True);
# the second element of the returned pair is not needed here.
words, _ = dataset.load_freq_words(proba=True)
# NOTE(review): logtotal is not used in the visible part of this script;
# it is kept because code further down the file may rely on it.
logtotal = math.log(sum(words.values()))

# Register every word as a key whose payload is the (word, proba) pair, so a
# match reports both the matched word and its value.  The per-word
# log-probability that used to be computed here was never stored or read, so
# that dead work has been dropped.
am = ahocorasick.Automaton()
for word, proba in words.items():
    am.add_word(word, (word, proba))

# Finalize the automaton; this must happen before searching with iter().
am.make_automaton()

text = "黑天鹅和灰犀牛是两个突发性事件"
# Each hit yields the index of the match's last character and the payload
# that was stored for the matched word.
for end_idx, (word, proba) in am.iter(text):
    print(end_idx, word, proba)
"""
0 黑 0.0001879472409333384
1 黑天 7.820042779627218e-07
1 天 0.0005986325939749099
2 黑天鹅 4.9915166678471605e-08
2 天鹅 9.184390668838775e-06
2 鹅 1.1680149002762355e-05
3 和 0.009247866122464898
4 灰 4.780209128908297e-05
5 灰犀 0.0
5 犀 2.2129057227455743e-06
6 灰犀牛 4.9915166678471605e-08
6 犀牛 2.645503833958995e-06