/
word_comparator.py
54 lines (43 loc) · 1.76 KB
/
word_comparator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import nltk
from os.path import exists
from collections import Counter
from nltk.corpus import brown
from nltk.corpus import reuters
from nltk.stem import PorterStemmer
import pickle
import time
class WordComparator:
    """Compare words by corpus frequency.

    The "better" of two words is the one whose normalized (lower-cased,
    Porter-stemmed) form occurs more often in the NLTK Brown + Reuters
    corpora. Counts are cached on disk in COUNTER_FILE_NAME so the corpora
    only need to be downloaded and tallied once.
    """

    # Pickle cache of the stem -> frequency Counter.
    COUNTER_FILE_NAME = "word_counts.dump"

    def __init__(self):
        print("Initializing word comparator...")
        start_time = time.time()
        # BUGFIX: the stemmer must exist BEFORE word_frequencies() runs.
        # On a cold start (no cache file) word_frequencies() falls through to
        # compute_word_frequencies() -> normalize_word(), which reads
        # self.stemmer; the original assignment order raised AttributeError.
        self.stemmer = PorterStemmer()
        self.word_counter = self.word_frequencies()
        print("Done. Initialization took {} seconds.".format(time.time() - start_time))

    def better_word(self, word1: str, word2: str):
        """Return the more frequent of two words.

        A non-None word always beats None; if both are None, None is
        returned. On a frequency tie, word2 wins (strict-greater comparison).
        """
        if word1 is None or word2 is None:
            return word1 if word2 is None else word2
        return word1 if self.evaluate_word(word1) > self.evaluate_word(word2) else word2

    def evaluate_word(self, word: str) -> int:
        """Score a word by its normalized corpus frequency (0 if unseen)."""
        # Counter returns 0 for missing keys, so unseen words score 0.
        return self.word_counter[self.normalize_word(word)]

    def word_frequencies(self):
        """Load the cached frequency Counter, computing and caching it if absent."""
        if exists(WordComparator.COUNTER_FILE_NAME):
            # NOTE: pickle is only safe because this cache is produced
            # locally by this class; never point COUNTER_FILE_NAME at
            # untrusted data.
            with open(WordComparator.COUNTER_FILE_NAME, "rb") as cache:
                return pickle.load(cache)
        counter = self.compute_word_frequencies()
        with open(WordComparator.COUNTER_FILE_NAME, "wb") as cache:
            pickle.dump(counter, cache)
        return counter

    def compute_word_frequencies(self):
        """Count normalized word occurrences across the Brown and Reuters corpora.

        Downloads the corpora through NLTK if they are not already present.
        """
        nltk.download('brown')
        nltk.download('reuters')
        counter = Counter()
        counter.update(self.normalize_words(brown.words()))
        counter.update(self.normalize_words(reuters.words()))
        return counter

    def normalize_words(self, words: list):
        """Normalize every word in *words*, preserving order."""
        return [self.normalize_word(word) for word in words]

    def normalize_word(self, word: str) -> str:
        """Lower-case and stem a word so inflected forms share one count."""
        return self.stemmer.stem(word.lower())

    def serialize_counts(self):
        """Persist the current counter to the cache file.

        BUGFIX: the original body was the bare expression `self.word_counter`
        — a no-op that wrote nothing. It now actually dumps the counter.
        """
        with open(WordComparator.COUNTER_FILE_NAME, "wb") as cache:
            pickle.dump(self.word_counter, cache)