-
Notifications
You must be signed in to change notification settings - Fork 0
/
WordFrequency.py
75 lines (59 loc) · 1.72 KB
/
WordFrequency.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#
# WordFrequency.py
#
# defines class WordFrequency and related utility functions
#
import cmath
import collections
import cPickle
import math
class WordFrequency:
    """Word-count table with frequency and probability lookups.

    Attributes:
        dict: ``collections.defaultdict(int)`` mapping each word to its count.
        total: Sum of every count added so far.

    Both attribute names are kept unchanged because they are stored in the
    pickled instance state written by ``pickle()``.
    """

    def __init__(self):
        self.dict = collections.defaultdict(int)
        # A plain int suffices: Python 2 promotes int to long on overflow.
        self.total = 0

    def add_word_sequence(self, seq):
        """Add one occurrence for every word yielded by the iterable *seq*."""
        for word in seq:
            self.dict[word] += 1
            self.total += 1

    def add_word_freq_pair(self, word, freq):
        """Add *freq* occurrences of *word* and grow the total by *freq*."""
        self.dict[word] += freq
        self.total += freq

    def total(self):
        """Return the total count.

        NOTE: ``__init__`` stores an instance attribute with the same name,
        so on an instance ``obj.total`` is the number itself and this method
        is shadowed (``obj.total()`` raises ``TypeError``).  It is reachable
        only as ``WordFrequency.total(obj)``; prefer ``obj.total`` or
        ``obj.get_total()``.
        """
        return self.total

    def get_total(self):
        """Return the sum of all counts added so far."""
        return self.total

    def get_frequency(self, word):
        """Return the count recorded for *word*, or 0 if it was never added."""
        # Use .get(): indexing the defaultdict would insert the unseen word
        # with count 0 and silently change __contains__ and __len__.
        return self.dict.get(word, 0)

    def get_probability(self, word, missing=None):
        """Return the relative frequency of *word*.

        Args:
            word: The word to look up.
            missing: Optional callable; when *word* is unknown and *missing*
                is truthy, ``missing(word)`` is returned instead of the
                default fallback.

        Returns:
            ``count / total`` for a known word; otherwise ``missing(word)``
            if supplied, else ``1.0 / total``.

        Raises:
            ZeroDivisionError: If a division is needed while ``total`` is 0.
        """
        if word in self.dict:
            return float(self.dict[word]) / self.total
        if missing:
            return missing(word)
        return 1.0 / self.total

    def get_log_probability(self, word, missing=None):
        """Return the base-10 logarithm of ``get_probability(word, missing)``.

        The result is a real ``float``; the real-valued ``math.log10`` is
        used so results can be compared and summed as ordinary scores.

        Raises:
            ValueError: If the probability is not strictly positive.
            ZeroDivisionError: Propagated from ``get_probability``.
        """
        return math.log10(self.get_probability(word, missing))

    def __contains__(self, item):
        """Return True if *item* has an entry in the count table."""
        return item in self.dict

    def __len__(self):
        """Return the number of distinct words in the count table."""
        return len(self.dict)

    def __getitem__(self, item):
        """Return the count for *item*; same as ``get_frequency(item)``."""
        return self.get_frequency(item)

    def __iter__(self):
        """Iterate over the words in the count table."""
        return iter(self.dict)

    @staticmethod
    def from_freq_file(file_name):
        """Build a WordFrequency from a whitespace-separated frequency file.

        Each non-blank line supplies a word as its first token and an integer
        count as its second token; any further tokens are ignored.

        Raises:
            IndexError: If a non-blank line has fewer than two tokens.
            ValueError: If the second token is not an integer.
        """
        result = WordFrequency()
        # The context manager closes the file even if a line fails to parse.
        with open(file_name, 'r') as freq_file:
            for line in freq_file:
                tokens = line.split()
                if not tokens:
                    # Blank lines (e.g. a trailing newline) carry no data.
                    continue
                result.add_word_freq_pair(tokens[0], int(tokens[1]))
        return result

    @staticmethod
    def _pickle_module():
        """Return cPickle when it can be imported, else the pickle module."""
        try:
            import cPickle as pickle_impl
        except ImportError:
            import pickle as pickle_impl
        return pickle_impl

    def pickle(self, file_name):
        """Serialize this object to *file_name* in pickle format."""
        with open(file_name, "wb") as out_file:
            WordFrequency._pickle_module().dump(self, out_file)

    @staticmethod
    def unpickle(file_name):
        """Load and return the object stored in the pickle file *file_name*.

        SECURITY: unpickling can execute arbitrary code; only call this on
        files from a trusted source.
        """
        with open(file_name, "rb") as in_file:
            return WordFrequency._pickle_module().load(in_file)