-
Notifications
You must be signed in to change notification settings - Fork 0
/
wordlist.py
55 lines (41 loc) · 1.38 KB
/
wordlist.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
from nltk.corpus import cmudict
import cPickle as pickle
def make_wordlist():
    """
    Read 'wordlists/words.txt' and return its lines as delimited words.

    Each line is lower-cased and stripped of surrounding whitespace, then
    wrapped as '#' + word + '$' so that a Markov chain can recognise the
    start ('#') and the end ('$') of a word.

    Returns a list of strings, one per line of the input file, in file
    order. A blank line in the file yields the entry '#$'.

    NOTE: the result is only returned. Persisting it with pickle (to
    'wordlists/words_edited.p') is not done here.
    """
    with open('wordlists/words.txt', 'r') as wordfile:
        # Iterate over the file directly instead of indexing into a list
        # with xrange(len(...)); this runs unchanged on Python 2 and 3.
        return ['#' + line.lower().strip() + '$' for line in wordfile]
def make_cmu_wordlist():
    """
    Build delimited phoneme sequences from the CMU Pronouncing Dictionary.

    Every pronunciation of every dictionary entry becomes one list of
    the form ['#', phoneme, ..., '$']: '#' marks the start and '$' the
    end of a word (for Markov chain use). Trailing digits are removed
    from each phoneme (the numeric stress/accent markers).

    Returns a list of lists of strings, one inner list per pronunciation.

    NOTE: the result is only returned. Persisting it with pickle (to
    'wordlists/cmu.p') is not done here.
    """
    edited_list = []
    for pronunciations in cmudict.dict().values():
        # One dictionary entry may carry several alternative pronunciations.
        for phonemes in pronunciations:
            # Strip the trailing stress digits from each phoneme.
            stripped = [phoneme.rstrip('0123456789') for phoneme in phonemes]
            edited_list.append(['#'] + stripped + ['$'])
    return edited_list
def main():
    """Script entry point: build the plain word list and discard the result."""
    make_wordlist()
# Run main() only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()