# tokenization.py
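"""Tokenize a text file and print its most frequent tokens.

Usage (both arguments are optional; the script prompts for any that are
missing):

    python tokenization.py <filename> <n-max-tokens>
"""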
import sys
# import re

import nltk

import token_hash_table
import heap
import sample
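
# nltk.word_tokenize relies on the Punkt tokenizer models; if they haven't
# been downloaded yet, this one-time, idempotent check fetches them.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')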


def tokenize(filename):
    # file_source = open(filename, encoding='utf-8')
    # big_word_list = file_source.read()
    # [brian] You forgot to close the file! It'll be closed automatically when
    # the script finishes, but in a larger program this could be a pretty
    # terrible mistake! Try writing:
    with open(filename, encoding='utf-8') as file_source:
        big_word_list = file_source.read()
    # and by the time the program reaches this line, the file will already
    # be closed.
    sub = nltk.word_tokenize(big_word_list)
    # regex = re.compile(r'".+"|\w+')
    # sub = re.findall(regex, big_word_list)
    return sub
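

# A runnable sketch of the regex fallback commented out in tokenize(), for
# environments without NLTK. The raw string and the non-greedy '.+?' are
# editorial guesses at the intent: the original greedy '.+' would swallow
# everything between the first and last quote in the text.
def tokenize_regex(filename):
    import re
    with open(filename, encoding='utf-8') as file_source:
        text = file_source.read()
    # Keep double-quoted phrases intact; otherwise match runs of word chars.
    return re.findall(r'".+?"|\w+', text)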


if __name__ == '__main__':
    if len(sys.argv) > 1:
        token_list = tokenize(sys.argv[1])
    else:
        filename = input("Enter Filename: ")
        token_list = tokenize(filename)

    # Count how many times each token appears in the file.
    my_hash = token_hash_table.MyHash()
    for token in token_list:
        # cur_val = my_hash.get(token)
        if my_hash.get(token) == 0:
            my_hash.set(token, 1)
            # TODO: Also add to hash table of markov chain
        else:
            my_hash.update(token)
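
    # token_hash_table isn't shown in this file; the loop above assumes a
    # counter-like interface. A hypothetical stand-in might look like:
    #
    #     class MyHash:
    #         def __init__(self):
    #             self.buckets = {}
    #         def get(self, key):
    #             return self.buckets.get(key, 0)  # 0 for unseen tokens
    #         def set(self, key, value):
    #             self.buckets[key] = value
    #         def update(self, key):
    #             self.buckets[key] += 1  # bump an existing count
    #
    # The names and behavior are assumptions, not the project's actual code.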

    # sample.stochastic presumably resamples the raw counts into a new
    # token -> frequency histogram; heap-order it so the most frequent
    # tokens can be popped off first.
    stoch_histo = sample.stochastic(my_hash)
    my_heap = heap.Heap()
    # dict views aren't subscriptable in Python 3, so iterate over items()
    # rather than indexing .keys() and .values().
    for key, freq in stoch_histo.items():
        my_heap.insert(key, freq)
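
    # heap.Heap isn't shown either; the calls above and below imply a
    # max-heap with insert(key, priority) and delete_max(). A stdlib
    # stand-in could wrap heapq with negated priorities (illustrative
    # sketch only, not the project's actual class):
    #
    #     import heapq
    #
    #     class MaxHeap:
    #         def __init__(self):
    #             self._items = []
    #         def insert(self, key, freq):
    #             heapq.heappush(self._items, (-freq, key))
    #         def delete_max(self):
    #             freq, key = heapq.heappop(self._items)
    #             return key, -freq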

    # Print the n most frequent tokens; n comes from argv or a prompt.
    if len(sys.argv) > 2:
        n_tokens = int(sys.argv[2])
    else:
        n_tokens = int(input("Enter n max tokens: "))
    for _ in range(n_tokens):
        print(my_heap.delete_max())
# sent = sentence(stoch_histo, 7)
# print(" ".join(sent))
# print(sentence(stochastic(my_hash), 1))