-
Notifications
You must be signed in to change notification settings - Fork 0
/
reviews.py
149 lines (112 loc) · 4.11 KB
/
reviews.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import string
import numpy as np
from elasticsearch import Elasticsearch
from nltk import RegexpTokenizer
from nltk.corpus import stopwords
from textblob import TextBlob
from authTwitter import authTW
# Module-level Elasticsearch client, created once at import time and used by
# get_yelp_review() below.
# NOTE(review): the host address and the http_auth credentials are hard-coded
# in source control; consider loading them from configuration or environment
# variables and rotating this password -- confirm with the deployment owner.
es = Elasticsearch([{'host': '10.22.12.229', 'port': 9200}], http_auth=('user', 'pkqcqbke5qPP'))
##
# Tweet processors
##
def map_tweets(biz):
    """Fetch the processed tweets that mention a business.

    Args:
        biz: Mapping providing the ``"name"`` used as the search keyword and
            the ``"business_id"`` attached to every returned record.

    Returns:
        Whatever ``get_tweets_for_keyword`` returns for that business.
    """
    # A fresh client is obtained from authTW() on every call.
    return get_tweets_for_keyword(
        authTW(),
        biz["name"],
        biz["business_id"],
    )
def get_sentiment(t):
    """Classify text *t* by its TextBlob polarity.

    Returns:
        A ``(label, rating)`` tuple: ``('pos', 4.5)`` when the polarity is
        above zero, ``('neu', 3)`` when it is exactly zero, and
        ``('neg', 2.5)`` in every other case.
    """
    # Build the TextBlob once and read its polarity a single time.
    polarity = TextBlob(t).sentiment.polarity

    if polarity == 0:
        return 'neu', 3
    if polarity > 0:
        return 'pos', 4.5
    return 'neg', 2.5
def clean_tweet(tweet):
    """Return the tweet's text reduced to plain ASCII words.

    The character ranges of every URL, hashtag, user mention, media and
    symbol entity are removed first; then non-ASCII characters (emojis),
    punctuation, redundant whitespace and a leading ``'RT '`` retweet marker
    are stripped.

    Args:
        tweet: Mapping with a ``'text'`` string and, optionally, an
            ``'entities'`` mapping whose values are lists of items carrying
            an ``'indices'`` pair ``[start, stop]`` into ``'text'``.

    Returns:
        The cleaned text (possibly an empty string).

    Raises:
        KeyError: If ``tweet`` has no ``'text'`` or an entity item has no
            ``'indices'``.
    """
    text = tweet['text']
    entities = tweet.get('entities') or {}

    # Gather the (start, stop) range of every entity, ordered by position.
    spans = sorted(
        (item['indices'][0], item['indices'][1])
        for kind in ('urls', 'hashtags', 'user_mentions', 'media', 'symbols')
        for item in entities.get(kind) or ()
    )

    # Copy only the text lying between entity ranges. Tracking a cursor
    # (instead of slicing the string once per entity) keeps duplicate or
    # overlapping ranges from deleting characters outside any entity.
    kept = []
    cursor = 0
    for start, stop in spans:
        if start > cursor:
            kept.append(text[cursor:start])
        cursor = max(cursor, stop)
    kept.append(text[cursor:])
    text = ''.join(kept)

    # Remove emojis (and every other non-ASCII character).
    text = text.encode('ascii', 'ignore').decode('ascii')
    # Remove punctuation.
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Collapse line breaks and runs of whitespace into single spaces.
    text = ' '.join(text.split())
    # Remove the retweet marker.
    if text.startswith('RT '):
        text = text[3:]
    return text
def get_tweets_for_keyword(t_obj, keyword, id):
    """Search Twitter for *keyword* and return cleaned, rated tweets.

    Args:
        t_obj: Twitter client exposing
            ``search.tweets(q=..., lang=..., count=...)``.
        keyword: Search query.
        id: Business identifier stored under ``"business_id"`` in every
            record. (The name shadows the builtin; it is kept so existing
            callers keep working.)

    Returns:
        ``np.array`` built from a list of dicts with the keys
        ``"business_id"``, ``"sentiment"``, ``"rating"`` and ``"text"``;
        one dict per unique tweet that passed the filters.
    """
    # Use the Twitter API to get the tweets, keeping only the statuses.
    search_results = t_obj.search.tweets(q=keyword, lang='en', count=150)

    tw_list = []
    seen_texts = set()
    for tw in search_results['statuses']:
        # Skip tweets cut off with an ellipsis. This has to be checked on the
        # raw text: clean_tweet() drops every non-ASCII character, so the
        # cleaned text can never contain '…'.
        if '…' in tw['text']:
            continue

        tw_text = clean_tweet(tw)
        # Skip tweets that are too short or already collected.
        if len(tw_text) <= 10 or tw_text in seen_texts:
            continue
        seen_texts.add(tw_text)

        # Sentiment is only computed for tweets that are actually kept.
        tw_sentiment, tw_rating = get_sentiment(tw_text)
        tw_list.append({
            "business_id": id,
            "sentiment": tw_sentiment,
            "rating": tw_rating,
            "text": tw_text
        })
    return np.array(tw_list)
##
# Yelp processors
##
def clean_text(t):
    """Lower-case *t*, split it into ``\\w+`` tokens and drop stop words.

    Args:
        t: Review text.

    Returns:
        The tokens that are not in the NLTK English stop-word list, joined
        by single spaces.
    """
    tokens = RegexpTokenizer(r'\w+').tokenize(t.lower())
    if not tokens:
        return ""

    # Fetch the stop-word list once per call (not once per token) and keep it
    # in a set so each membership test is a hash lookup.
    stop_words = set(stopwords.words('english'))
    return " ".join(token for token in tokens if token not in stop_words)
def get_simple_sentiment(stars):
    """Map a star rating to a coarse sentiment label.

    Returns ``'neg'`` for ratings up to and including 2.5, ``'neu'`` for
    ratings up to and including 3.5, and ``'pos'`` for anything else.
    """
    # Upper bounds are checked in ascending order; the first match wins.
    for upper_bound, label in ((2.5, 'neg'), (3.5, 'neu')):
        if stars <= upper_bound:
            return label
    return 'pos'
def get_yelp_review():
    """Load Yelp reviews from Elasticsearch as sentiment-labelled records.

    Queries the ``raw_yelp_review`` index (up to 10000 hits) through the
    module-level ``es`` client.

    Returns:
        ``np.array`` built from a list of dicts with the keys
        ``"business_id"``, ``"sentiment"`` (derived from the star rating),
        ``"rating"`` (the raw ``stars`` value) and ``"text"`` (the review
        text passed through ``clean_text``).
    """
    response = es.search(index='raw_yelp_review', size=10000)

    records = []
    for hit in response['hits']['hits']:
        source = hit['_source']
        records.append({
            "business_id": source['business_id'],
            "sentiment": get_simple_sentiment(source['stars']),
            "rating": source['stars'],
            "text": clean_text(source['text'])
        })
    return np.array(records)