-
Notifications
You must be signed in to change notification settings - Fork 5
/
BootStrap.py
172 lines (145 loc) · 6.54 KB
/
BootStrap.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
# coding: utf-8
# In[3]:
import heapq
import json
from collections import defaultdict
from ReadData import ReadData
from nltk import FreqDist
import numpy as np
# Path prefix for output files: BootStrap.saveToFile opens modelDataDir + fileName,
# so the trailing slash is required.
modelDataDir = "modelData/"
class BootStrap:
    """Grow per-aspect keyword lists by bootstrapping and tag sentences.

    Each round tags every sentence with the aspect whose keywords it matches
    most often, scores every vocabulary word against every aspect with a
    chi-square statistic, and appends the top ``p`` words of each aspect to
    that aspect's keyword list.

    The corpus object passed to the constructor is used as follows:
      * ``aspectKeywords``  - mapping aspect -> list of keywords (mutated).
      * ``aspectSentences`` - mapping aspect -> list of sentences; indexing a
        missing aspect must yield a fresh list (e.g. ``defaultdict(list)``).
      * ``allReviews``      - iterable of reviews exposing ``sentences``,
        ``ratings`` and ``reviewId``.
      * ``wordFreq``        - mapping whose keys form the vocabulary.
    Every sentence exposes ``wordFreqDict`` (word -> frequency) and receives
    an ``assignedAspect`` list from :meth:`assignAspect`.
    """

    def __init__(self, readDataObj):
        """Store the corpus and initialise empty statistics and result lists."""
        self.corpus = readDataObj
        # Aspect, Word -> freq matrix - frequency of word in that aspect.
        # NOTE: calcChiSq overwrites these cells with chi-square scores.
        self.aspectWordMat = defaultdict(lambda: defaultdict(int))
        # Aspect --> total count of words tagged in that aspect
        # = sum of all elements of a row in aspectWordMat
        self.aspectCount = defaultdict(int)
        # Word --> frequency of the tagged word over all aspects
        # = sum of all elements of a column in aspectWordMat
        self.wordCount = defaultdict(int)
        # Top p words per aspect are added to the aspect keyword list each round
        self.p = 5
        # Remaining number of bootstrap rounds (decremented by bootStrap)
        self.iter = 7
        # List of W matrices (one per review that has tagged sentences)
        self.wList = []
        # List of ratings dictionaries, parallel to wList
        self.ratingsList = []
        # List of review IDs, parallel to wList
        self.reviewIdList = []

    def assignAspect(self, sentence):
        """Tag ``sentence`` with the aspect(s) whose keywords it matches most.

        ``sentence.assignedAspect`` is reset and then filled with every aspect
        that ties for the highest number of distinct keyword hits. Only when
        exactly one aspect wins is the sentence also appended to
        ``corpus.aspectSentences`` for that aspect.
        """
        sentence.assignedAspect = []
        hits = defaultdict(int)  # aspect -> number of sentence words that are keywords
        for word in sentence.wordFreqDict:
            for aspect, keywords in self.corpus.aspectKeywords.items():
                if word in keywords:
                    hits[aspect] += 1
        if not hits:
            return  # no keyword matched: the sentence stays untagged
        best = max(hits.values())
        sentence.assignedAspect.extend(
            aspect for aspect, cnt in hits.items() if cnt == best
        )
        if len(sentence.assignedAspect) == 1:
            # Ambiguous (tied) sentences are deliberately not counted anywhere.
            self.corpus.aspectSentences[sentence.assignedAspect[0]].append(sentence)

    def populateAspectWordMat(self):
        """Rebuild the frequency matrix and its row/column totals from scratch.

        All three structures are cleared first; otherwise ``aspectCount`` and
        ``wordCount`` would keep growing from one bootstrap round to the next
        and no longer match ``aspectWordMat``.
        """
        self.aspectWordMat.clear()
        self.aspectCount.clear()
        self.wordCount.clear()
        for aspect, sentences in self.corpus.aspectSentences.items():
            for sentence in sentences:
                for word, freq in sentence.wordFreqDict.items():
                    self.aspectWordMat[aspect][word] += freq
                    self.aspectCount[aspect] += freq
                    self.wordCount[word] += freq

    def chiSq(self, aspect, word):
        """Return the chi-square score of ``word`` for ``aspect``.

        Expects ``aspectWordMat`` to hold raw frequencies (i.e. to be called
        after populateAspectWordMat). Returns 0.0 when the denominator is zero.
        """
        # Total number of (tagged) word occurrences
        C = sum(self.aspectCount.values())
        # Occurrences of the word inside the aspect
        C1 = self.aspectWordMat[aspect][word]
        # Occurrences of the word outside the aspect
        C2 = self.wordCount[word] - C1
        # Occurrences of other words inside the aspect
        C3 = self.aspectCount[aspect] - C1
        # Occurrences of other words outside the aspect: the remaining cell of
        # the 2x2 table, so that C1 + C2 + C3 + C4 == C.
        C4 = C - C1 - C2 - C3
        deno = (C1 + C3) * (C2 + C4) * (C1 + C2) * (C3 + C4)
        if deno == 0:
            return 0.0
        diff = C1 * C4 - C2 * C3
        return (C * diff * diff) / deno

    def calcChiSq(self):
        """Score the vocabulary and extend each aspect's keyword list.

        Every word in ``corpus.wordFreq`` is credited to the single aspect
        with the strictly highest positive chi-square score (first aspect wins
        ties). For each aspect the ``p`` best credited words are appended to
        ``corpus.aspectKeywords`` unless already present.

        Side effect: the cells of ``aspectWordMat`` are overwritten with the
        chi-square scores, so populateAspectWordMat must run before the next
        call.

        Returns:
            True if at least one keyword was added, else False.
        """
        aspects = list(self.corpus.aspectKeywords)
        candidates = {aspect: [] for aspect in aspects}
        for word in self.corpus.wordFreq:
            bestScore = 0.0
            bestAspect = None
            for aspect in aspects:
                score = self.chiSq(aspect, word)
                self.aspectWordMat[aspect][word] = score
                if score > bestScore:
                    bestScore = score
                    bestAspect = aspect
            if bestAspect is not None:
                candidates[bestAspect].append((bestScore, word))
        changed = False
        for aspect in aspects:
            keywords = self.corpus.aspectKeywords[aspect]
            for _, word in heapq.nlargest(self.p, candidates[aspect]):
                if word not in keywords:
                    keywords.append(word)
                    changed = True
        return changed

    # Populate wList, ratingsList and reviewIdList
    def populateLists(self):
        """Append one W matrix, ratings dict and review id per usable review.

        W maps aspect -> word -> frequency over the review's sentences that
        carry exactly one assigned aspect. Reviews without any such sentence
        are skipped, which keeps the three lists parallel.
        """
        for review in self.corpus.allReviews:
            W = defaultdict(lambda: defaultdict(int))
            for sentence in review.sentences:
                if len(sentence.assignedAspect) == 1:
                    aspect = sentence.assignedAspect[0]
                    for word, freq in sentence.wordFreqDict.items():
                        W[aspect][word] += freq
            if W:
                self.wList.append(W)
                self.ratingsList.append(review.ratings)
                self.reviewIdList.append(review.reviewId)

    def _tagAllSentences(self):
        """Clear corpus.aspectSentences and re-tag every sentence of every review."""
        self.corpus.aspectSentences.clear()
        for review in self.corpus.allReviews:
            for sentence in review.sentences:
                self.assignAspect(sentence)

    def bootStrap(self):
        """Run bootstrap rounds until no keyword is added or ``iter`` hits 0.

        ``self.iter`` is decremented once per round. After the loop every
        sentence is tagged one final time with the final keyword lists, and
        the keyword mapping is printed.
        """
        changed = True
        while self.iter > 0 and changed:
            self.iter -= 1
            self._tagAllSentences()
            self.populateAspectWordMat()
            changed = self.calcChiSq()
        self._tagAllSentences()
        print(self.corpus.aspectKeywords)

    # Saves the object into the given file
    def saveToFile(self, fileName, obj):
        """Serialise ``obj`` as JSON into ``modelDataDir + fileName``."""
        # The with-statement closes the file; no explicit close() is needed.
        with open(modelDataDir + fileName, 'w') as fp:
            json.dump(obj, fp)
# Driver: load the corpus through ReadData, run the bootstrap, then dump the
# resulting structures as JSON via BootStrap.saveToFile.
rd = ReadData()
rd.readAspectSeedWords()
rd.readStopWords()
rd.readReviewsFromJson()
rd.removeLessFreqWords()

bootstrapObj = BootStrap(rd)
bootstrapObj.bootStrap()
bootstrapObj.populateLists()

# (output file name, object to serialise), written in this order.
for outName, payload in (
    ("wList.json", bootstrapObj.wList),
    ("ratingsList.json", bootstrapObj.ratingsList),
    ("reviewIdList.json", bootstrapObj.reviewIdList),
    ("vocab.json", list(bootstrapObj.corpus.wordFreq.keys())),
    ("aspectKeywords.json", bootstrapObj.corpus.aspectKeywords),
):
    bootstrapObj.saveToFile(outName, payload)
# In[ ]: