示例#1
0
def main():
    """Run the full data-preparation pipeline and persist its artifacts.

    Steps, in order: run ``./download.sh``, call ``_install_neologd()``,
    rewrite ``articles.txt`` line by line through ``_process_syntax`` into
    ``DATA_PATH``, then build and pickle the tone list, the 2-gram counter,
    the word-to-POS table, the prefix searcher and the trained learner.

    Returns:
        ``False`` if the download script exits non-zero or
        ``_install_neologd()`` returns a falsy value; otherwise falls off
        the end and returns ``None``.
    """
    if subprocess.call("./download.sh", shell=True) != 0:
        return False
    if not _install_neologd():
        return False

    def _dump(obj, path):
        # Pickle ``obj`` to ``path`` using the highest available protocol.
        with open(path, 'wb') as sink:
            pickle.dump(obj, sink, pickle.HIGHEST_PROTOCOL)

    # The output file is opened (and truncated) before the input is opened.
    with open(DATA_PATH, 'w') as processed, open('articles.txt', 'r') as raw:
        for article_line in raw:
            processed.write(_process_syntax(article_line) + '\n')

    tone_list, lcounter_2gram, word2pos = _create_tone_list()
    _dump(tone_list, TONE_PATH)
    _dump(lcounter_2gram, COUNTER_2GRAM_PATH)
    _dump(word2pos, WORD2POS_PATH)

    # The searcher is built from the tone-list keys and saved before training.
    prefix_searcher = trie.DoubleArray(tone_list.keys())
    _dump(prefix_searcher, PREFIX_SEARCHER_PATH)

    learner = _train_graph(prefix_searcher, tone_list, lcounter_2gram,
                           word2pos)
    _dump(learner, LEARNER_PATH)
示例#2
0
 def test_prefix_search_multichars(self):
     # Keys here contain a two-character element ('しゃ'). For each query,
     # prefix_search is expected to return every stored key that is a
     # prefix of the query, in order of increasing length.
     double_array = trie.DoubleArray(
         [('しゃ', ), ('しゃ', 'か'), ('しゃ', 'か', 'い')])
     cases = [
         (['しゃ'],
          [('しゃ', )]),
         (['しゃ', 'か'],
          [('しゃ', ), ('しゃ', 'か')]),
         (['しゃ', 'か', 'い'],
          [('しゃ', ), ('しゃ', 'か'), ('しゃ', 'か', 'い')]),
     ]
     for query, expected in cases:
         eq_(double_array.prefix_search(query), expected)
示例#3
0
 def test_prefix_search_case_non_vocabulary(self):
     # Queries that leave the stored vocabulary: only the stored keys
     # matched before the mismatch are expected back (none at all when
     # the first symbol is unknown).
     # NOTE(review): ('a') is the plain string 'a', not the one-element
     # tuple ('a',) -- confirm which form is intended.
     double_array = trie.DoubleArray([('a'), ('a', 'b'), ('a', 'b', 'c')])
     cases = [
         (['b'], []),
         (['a', 'd'], [('a')]),
         (['a', 'b', 'a'], [('a'), ('a', 'b')]),
     ]
     for query, expected in cases:
         eq_(double_array.prefix_search(query), expected)
示例#4
0
 def test_prefix_search(self):
     # Each successively longer query is expected to return one more
     # stored key, shortest key first.
     # NOTE(review): ('a') is the plain string 'a', not the one-element
     # tuple ('a',) -- confirm which form is intended.
     double_array = trie.DoubleArray([('a'), ('a', 'b'), ('a', 'b', 'c')])
     cases = [
         (['a'], [('a')]),
         (['a', 'b'], [('a'), ('a', 'b')]),
         (['a', 'b', 'c'], [('a'), ('a', 'b'), ('a', 'b', 'c')]),
     ]
     for query, expected in cases:
         eq_(double_array.prefix_search(query), expected)
示例#5
0
 def test_search_max_len(self):
     # search(['a'], max_len=k) is expected to return only the stored keys
     # of length <= k, so raising the limit admits longer keys.
     double_array = trie.DoubleArray([('a', 'd'), ('a', 'b', 'c')])
     expected_by_limit = [
         (1, []),
         (2, [('a', 'd')]),
         (3, [('a', 'd'), ('a', 'b', 'c')]),
     ]
     for limit, expected in expected_by_limit:
         eq_(double_array.search(['a'], max_len=limit), expected)
示例#6
0
 def test_search_case_two_words(self):
     # Two stored keys share the prefix ('a', 'b'): searching by that
     # shared prefix (or a shorter one) is expected to return both, while
     # the full key ('a', 'b', 'c') returns only itself.
     double_array = trie.DoubleArray([('a', 'b', 'c'), ('a', 'b', 'd')])
     both_keys = [('a', 'b', 'c'), ('a', 'b', 'd')]
     cases = [
         (['a'], both_keys),
         (['a', 'b'], both_keys),
         (['a', 'b', 'c'], [('a', 'b', 'c')]),
     ]
     for query, expected in cases:
         eq_(double_array.search(query), expected)
示例#7
0
 def test_search_case_non_result(self):
     # None of these queries is a prefix of the single stored key, so
     # every search is expected to come back empty.
     double_array = trie.DoubleArray([('a', 'b', 'c')])
     for query in (['a', 'b', 'a'], ['a', 'c'], ['b']):
         eq_(double_array.search(query), [])
示例#8
0
    ('a', 'o'): ['あお', 'かお', 'さと'],
    ('a', 'o', 'i'): ['あおい', 'さとみ'],
    ('o', ): ['と'],
    ('o', 'i'): ['とい', 'こい', 'とし'],
    ('o', 'i', 'o'): ['たいよ', 'はいりょ'],
    ('i', ): ['き', 'し'],
    ('i', 'o'): ['みこ', 'しお'],
    ('o', 'a'): ['もか', 'もさ'],
}

# Identity mapping over every word listed in tone_list's values.
word2pos = {word: word for words in tone_list.values() for word in words}

# Prefix searcher built from the tone_list keys.
prefix_searcher = trie.DoubleArray(tone_list.keys())


def test_construct_graph():
    g = graph.Graph.construct_graph(prefix_searcher, tone_list,
                                    ('a', 'o', 'i', 'o', 'a'))
    eq_([g.BOS], g.nodes[0])
    eq_(set([graph.Node(0, 'か'),
             graph.Node(0, 'あ'),
             graph.Node(0, 'さ')]), set(g.nodes[1]))
    eq_(
        set([
            graph.Node(0, 'あお'),
            graph.Node(0, 'かお'),
            graph.Node(0, 'さと'),
            graph.Node(1, 'と')
示例#9
0
 def test_create_multichars(self):
     # A single three-element key whose first element is the two-character
     # string 'しゃ' should produce these internal base/check arrays.
     expected_base = [0, 0, 0, -1]
     expected_check = [-1, 0, 1, 2]
     double_array = trie.DoubleArray([('しゃ', 'か', 'い')])
     eq_(double_array._base, expected_base)
     eq_(double_array._check, expected_check)
示例#10
0
 def test_create_contain_common_postfix(self):
     # Two keys that differ only in their first element (shared suffix
     # 'b', 'c') should produce these internal base/check arrays.
     expected_base = [0, 0, 0, -1, 3, 3, -1]
     expected_check = [-1, 0, 1, 2, 0, 4, 5]
     double_array = trie.DoubleArray([('a', 'b', 'c'), ('d', 'b', 'c')])
     eq_(double_array._base, expected_base)
     eq_(double_array._check, expected_check)
示例#11
0
 def test_create(self):
     # A single key ('a', 'b', 'c') should produce these internal
     # base/check arrays.
     expected_base = [0, 0, 0, -1]
     expected_check = [-1, 0, 1, 2]
     double_array = trie.DoubleArray([('a', 'b', 'c')])
     eq_(double_array._base, expected_base)
     eq_(double_array._check, expected_check)
示例#12
0
 def test_prefix_search_over_checklen(self):
     # The query ('a', 'd', 'd') matches no stored key as a prefix, so an
     # empty list is expected.
     # NOTE(review): the name suggests this guards against indexing past
     # the end of the check array -- confirm against the trie code.
     stored_keys = [('a', 'b', 'c'), ('a', 'd', 'c')]
     query = ['a', 'd', 'd']
     double_array = trie.DoubleArray(stored_keys)
     eq_(double_array.prefix_search(query), [])