forked from hughec/yelp_social_sentiment
/
cleanyelp.py
258 lines (204 loc) · 9.91 KB
/
cleanyelp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
#!/usr/bin/env python
import readyelp
import datetime
import random
def split_data(train_ratio_of_total=0.5):
    """Randomly split the cleaned reviews into training and test files.

    Reads ./reviews.json and ./users.json, runs clean_review_dict on them,
    then assigns every remaining review to the training set when a fresh
    random draw is <= train_ratio_of_total and to the test set otherwise.
    The default of 0.5 therefore gives an even split in expectation.

    The two sets are written to ./train_reviews.json and
    ./test_reviews.json, after which filter_users() is called.
    """
    all_reviews = readyelp.read_reviews_to_dict("./reviews.json")
    all_users = readyelp.read_users_to_dict("./users.json")
    clean_review_dict(all_reviews, all_users)

    train, test = [], []
    for key in all_reviews:
        # Exactly one random draw per review, taken in iteration order.
        bucket = train if random.random() <= train_ratio_of_total else test
        bucket.append(all_reviews[key])

    readyelp.write_output(train, "./train_reviews.json")
    readyelp.write_output(test, "./test_reviews.json")
    filter_users()
def split_data_by_business(train_ratio_of_total=0.5):
    """Split reviews into training and test files one business at a time.

    All reviews of a given business land in the same set, so links between
    reviews of that business are not lost by the split.  A business goes to
    the training set when a fresh random draw is <= train_ratio_of_total,
    otherwise to the test set.

    Reads ./reviews.json (and ./users.json) and writes
    ./train_reviews.json and ./test_reviews.json.
    """
    all_reviews = readyelp.read_reviews_to_dict("./reviews.json")
    # The users file is still read, as before, although nothing below uses it.
    readyelp.read_users_to_dict("./users.json")
    ids_by_business = business_reviews_dict(all_reviews)

    train, test = [], []
    for review_ids in ids_by_business.values():
        # One random draw per business decides where all its reviews go.
        bucket = train if random.random() <= train_ratio_of_total else test
        bucket.extend(all_reviews[review_id] for review_id in review_ids)

    readyelp.write_output(train, "./train_reviews.json")
    readyelp.write_output(test, "./test_reviews.json")
def business_reviews_dict(reviews):
    """Group review ids by the business they belong to.

    reviews maps review_id -> review, where each review has a
    "business_id" entry.  Returns a dict mapping each business_id to the
    list of review ids for that business, in iteration order of reviews.
    """
    grouped = {}
    for review_id, review in reviews.items():
        grouped.setdefault(review["business_id"], []).append(review_id)
    return grouped
def filter_users():
    """Drop users that have no reviews in the training or test datasets.

    Reads ./users.json, ./train_reviews.json and ./test_reviews.json.  Each
    user's "reviews" list is reduced to the review ids present in either
    review file; users left with no reviews are discarded.  The surviving
    users are written back to ./users.json, overwriting it.
    """
    user_dict = readyelp.read_users_to_dict("./users.json")
    train_reviews = readyelp.read_reviews_to_dict("./train_reviews.json")
    test_reviews = readyelp.read_reviews_to_dict("./test_reviews.json")

    users_limited = []
    for user in user_dict.values():
        # Build a new list instead of calling remove() on the list being
        # iterated: removing during iteration skips the element that
        # follows each removal, which left stale review ids behind.
        kept_review_ids = [
            review_id
            for review_id in user["reviews"]
            if review_id in train_reviews or review_id in test_reviews
        ]
        if kept_review_ids:
            user["reviews"] = kept_review_ids
            users_limited.append(user)

    readyelp.write_output(users_limited, "./users.json")
def find_influencers(review, review_dict, user_dict):
    """Return ids of friends' reviews of the business that review covers.

    The author of review is looked up in user_dict; for each of the author's
    friends, every review id listed for that friend that is present in
    review_dict and has the same "business_id" as review is collected.
    These reviews are thought to influence the sentiment of the given
    review.  Friend review ids missing from review_dict are skipped.
    """
    author = user_dict[review["user_id"]]
    matches = []
    for friend_id in author["friends"]:
        friend_review_ids = user_dict[friend_id]["reviews"]
        matches.extend(
            candidate_id
            for candidate_id in friend_review_ids
            if candidate_id in review_dict
            and review_dict[candidate_id]["business_id"] == review["business_id"]
        )
    return matches
def median_date(review_dict):
    """Return the median "date" value over all reviews in review_dict.

    The dates are sorted before the middle element is picked; without the
    sort the result would depend on dict iteration order and would not be a
    median.  For an even number of reviews the later of the two middle
    dates is returned.

    Raises IndexError if review_dict is empty.
    """
    ordered_dates = sorted(review["date"] for review in review_dict.values())
    # Floor division keeps the index an int on both Python 2 and Python 3.
    return ordered_dates[len(ordered_dates) // 2]
def _convert_review_date(date_string):
    """Parse a "YYYY-MM-DD" string into a datetime.date object."""
    parsed = datetime.datetime.strptime(date_string, "%Y-%m-%d")
    return parsed.date()
def _convert_star_rating_to_binary_klass(star_rating):
    """Map a star rating to a binary sentiment label.

    Ratings of 3 or less are "negative"; anything above 3 is "positive".
    """
    return "negative" if star_rating <= 3 else "positive"
def _convert_star_rating_to_three_klass(star_rating):
    """Map a star rating to one of three sentiment labels.

    Below 3 is "negative", above 3 is "positive", and anything else
    (i.e. exactly 3) is "neutral".
    """
    if star_rating > 3:
        return "positive"
    if star_rating < 3:
        return "negative"
    return "neutral"
def clean_review_dict(review_dict, user_dict):
    """Filter and annotate review_dict in place, then rewrite ./reviews.json.

    For every review:
      * "rating" is replaced by its binary sentiment klass;
      * "date" is parsed with _convert_review_date, but the parsed value is
        discarded -- the stored "date" is left unchanged;
      * the review is dropped if its "user_id" is not in user_dict, or if
        find_influencers returns no friend reviews of the same business;
      * otherwise the influencer ids are stored under
        "friend_reviews_of_business".

    Dropped reviews are deleted from review_dict only after the whole pass,
    so influencer lookups see the complete, unfiltered review_dict.  The
    kept reviews are written to ./reviews.json.
    """
    rejected_ids = []
    kept_reviews = []

    for review_id, review in review_dict.items():
        review["rating"] = _convert_star_rating_to_binary_klass(review["rating"])
        # Result intentionally unused, exactly as before; the call still
        # runs the date parser on every review.
        _convert_review_date(review["date"])

        if review["user_id"] not in user_dict:
            rejected_ids.append(review_id)
            continue

        influencer_ids = find_influencers(review, review_dict, user_dict)
        if not influencer_ids:
            rejected_ids.append(review_id)
            continue

        review["friend_reviews_of_business"] = influencer_ids
        kept_reviews.append(review)

    # Deletion is deferred so the dict does not change size while iterated.
    for review_id in rejected_ids:
        del review_dict[review_id]

    readyelp.write_output(kept_reviews, "./reviews.json")
def _user_reviews_by_business(review_ids, review_dict):
    """Map business_id -> review_id for the given review ids.

    Ids not present in review_dict are ignored.  If several of the ids refer
    to the same business, the one appearing last in review_ids is kept.
    """
    known_ids = (review_id for review_id in review_ids if review_id in review_dict)
    return {review_dict[review_id]["business_id"]: review_id for review_id in known_ids}
def find_review_pairs_by_friends(user_dict, review_dict):
    """Return the set of review-id pairs where two friends reviewed the same business.

    For every user and each of that user's friends, the businesses both have
    reviewed (per _user_reviews_by_business) are compared.  A pair
    (user_review_id, friend_review_id) is added when the user's review has
    the later "date", or when the dates are equal and the user's review id
    compares greater than the friend's.
    """
    pairs = set()
    for user in user_dict.values():
        friend_ids = user["friends"]
        own_by_business = _user_reviews_by_business(user["reviews"], review_dict)

        for friend_id in friend_ids:
            theirs_by_business = _user_reviews_by_business(
                user_dict[friend_id]["reviews"], review_dict
            )
            for business_id, their_review_id in theirs_by_business.items():
                # Only businesses reviewed by both the user and the friend matter.
                if business_id not in own_by_business:
                    continue

                own_review_id = own_by_business[business_id]
                their_date = review_dict[their_review_id]["date"]
                own_date = review_dict[own_review_id]["date"]

                # Later date wins; equal dates are tie-broken on review id.
                if own_date > their_date or (
                    own_date == their_date and own_review_id > their_review_id
                ):
                    pairs.add((own_review_id, their_review_id))
    return pairs
def klass_counts(review_dict, klass_list):
    """Count reviews per sentiment klass.

    Returns a dict with an integer "total" entry plus one float counter per
    klass in klass_list, incremented according to each review's "rating".
    A rating that is not in klass_list raises KeyError.
    """
    tally = {"total": 0}
    tally.update((klass, 0.0) for klass in klass_list)
    for review in review_dict.values():
        tally["total"] += 1
        tally[review["rating"]] += 1
    return tally
def _homophily_counts(review_pairs, review_dict, klass_list):
    """Print, per klass, its share of review pairs and its homophily rate.

    Homophily is used here to mean that both reviews of a pair carry the
    same "rating".  Each pair is counted under the rating of its first
    element.  For every klass in klass_list one line is printed with:

      * the fraction of all pairs whose first review has that klass, and
      * the fraction of those pairs whose second review has the same klass.

    A ratio whose denominator is zero (no pairs at all, or no pairs for a
    klass) is reported as nan instead of raising ZeroDivisionError.

    Returns None; the result is only printed.
    """
    raw_counts = {}
    homophily_counts = {}
    for klass in klass_list:
        # Floats so that the divisions below are true divisions on Python 2.
        raw_counts[klass] = 0.0
        homophily_counts[klass] = 0.0

    total = 0
    for first_id, second_id in review_pairs:
        total += 1
        first_rating = review_dict[first_id]["rating"]
        raw_counts[first_rating] += 1
        if first_rating == review_dict[second_id]["rating"]:
            homophily_counts[first_rating] += 1

    undefined = float("nan")
    for klass in klass_list:
        klass_total = raw_counts[klass]
        share_of_total = klass_total / total if total else undefined
        homophily_share = (
            homophily_counts[klass] / klass_total if klass_total else undefined
        )
        # A single pre-formatted string prints identically on Python 2 and 3.
        print("%s proportion of total, homophily proportion: %s %s"
              % (klass, share_of_total, homophily_share))
def main():
    """Print basic statistics about the yelp data.

    Reads ./users.json and ./train_reviews.json, then reports the number of
    users, the number of reviews, the number of friend review pairs of the
    same business, the per-klass review counts, and the per-klass homophily
    proportions.
    """
    user_dict = readyelp.read_users_to_dict("./users.json")
    # Each message is formatted into a single string so the output is the
    # same whether this runs under Python 2 or Python 3.
    print("Total number of users with friends: %s" % len(user_dict))

    # NOTE: clean_review_dict is not re-run here; ./train_reviews.json is
    # used as read from disk.
    review_dict = readyelp.read_reviews_to_dict("./train_reviews.json")
    print("Total number of reviews from these users: %s" % len(review_dict))

    common_review_pairs = find_review_pairs_by_friends(user_dict, review_dict)
    print("Total number of friend review pairs of the same business: %s"
          % len(common_review_pairs))

    klass_list = ["negative", "positive"]
    raw_counts = klass_counts(review_dict, klass_list)
    for klass in klass_list:
        print("Total reviews with %s sentiment: %s" % (klass, raw_counts[klass]))

    _homophily_counts(common_review_pairs, review_dict, klass_list)


if __name__ == "__main__":
    main()