/
visualizing_twitter_data.py
230 lines (182 loc) · 11.7 KB
/
visualizing_twitter_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
from tweepy import API
from tweepy import Cursor
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
import twitter_credentials
import numpy as np
import pandas as pd
# I am importing the matplot library specifically from that I am importing pyplot and then I am
# going to refer to that as plt as a shorthand similar to what we did for numpy and for pandas
import matplotlib.pyplot as plt
# # # # TWITTER CLIENT # # # #
class TwitterClient():
def __init__(self, twitter_user=None):
self.auth = TwitterAuthenticator().authenticate_twitter_app()
self.twitter_client = API(self.auth)
self.twitter_user = twitter_user
def get_twitter_client_api(self):
return self.twitter_client
def get_user_timeline_tweets(self, num_tweets):
tweets = []
for tweet in Cursor(self.twitter_client.user_timeline, id=self.twitter_user).items(num_tweets):
tweets.append(tweet)
return tweets
def get_friend_list(self, num_friends):
friend_list = []
for friend in Cursor(self.twitter_client.friends, id=self.twitter_user).items(num_friends):
friend_list.append(friend)
return friend_list
def get_home_timeline_tweets(self, num_tweets):
home_timeline_tweets = []
for tweet in Cursor(self.twitter_client.home_timeline, id=self.twitter_user).items(num_tweets):
home_timeline_tweets.append(tweet)
return home_timeline_tweets
# # # # TWITTER AUTHENTICATER # # # #
class TwitterAuthenticator():
def authenticate_twitter_app(self):
auth = OAuthHandler(twitter_credentials.consumer_key, twitter_credentials.consumer_secret)
auth.set_access_token(twitter_credentials.access_token, twitter_credentials.access_secret)
return auth
# # # # TWITTER STREAMER # # # #
class TwitterStreamer():
"""
Class for streaming and processing live tweets.
"""
def __init__(self):
self.twitter_autenticator = TwitterAuthenticator()
def stream_tweets(self, fetched_tweets_filename, hash_tag_list):
# This handles Twitter authetification and the connection to Twitter Streaming API
listener = TwitterListener(fetched_tweets_filename)
auth = self.twitter_autenticator.authenticate_twitter_app()
stream = Stream(auth, listener)
# This line filter Twitter Streams to capture data by the keywords:
stream.filter(track=hash_tag_list)
# # # # TWITTER STREAM LISTENER # # # #
class TwitterListener(StreamListener):
"""
This is a basic listener that just prints received tweets to stdout.
"""
def __init__(self, fetched_tweets_filename):
self.fetched_tweets_filename = fetched_tweets_filename
def on_data(self, data):
try:
print(data)
with open(self.fetched_tweets_filename, 'a') as tf:
tf.write(data)
return True
except BaseException as e:
print("Error on_data %s" % str(e))
return True
def on_error(self, status):
if status == 420:
# Returning False on_data method in case rate limit occurs.
return False
print(status)
class TweetAnalyzer():
"""
Functionality for analyzing and categorizing content from tweets.
"""
def tweets_to_data_frame(self, tweets):
df = pd.DataFrame(data=[tweet.text for tweet in tweets], columns=['tweets'])
df['id'] = np.array([tweet.id for tweet in tweets])
df['len'] = np.array([len(tweet.text) for tweet in tweets])
df['date'] = np.array([tweet.created_at for tweet in tweets])
df['source'] = np.array([tweet.source for tweet in tweets])
df['likes'] = np.array([tweet.favorite_count for tweet in tweets])
df['retweets'] = np.array([tweet.retweet_count for tweet in tweets])
return df
if __name__ == '__main__':
twitter_client = TwitterClient()
tweet_analyzer = TweetAnalyzer()
api = twitter_client.get_twitter_client_api()
tweets = api.user_timeline(screen_name="realDonaldTrump", count=200)
#print(dir(tweets[0]))
#print(tweets[0].retweet_count)
df = tweet_analyzer.tweets_to_data_frame(tweets)
# Lets say we want to figure out what is the average length of all the tweets that we have in our data
# frame so we here collect 20 tweets and we want to figure out , what is the average length of all of those
# 20 tweets and of course we can change this count from 20 to 200 or 400.
# Now that we have stored everything in this dataframe its going to make it really easy form us to just manipulate the data
# thats already present in that data frame and then just get this answer and the way that we can do that is by manipulating
# the mean function in numpy which is just going to be something that we are going to run on a list in the data frame so I am
# just going to print out the result that comes from running the mean function, so I am going to say np.mean , so I am calling
# the mean function and what am I calling the mean function on I want to figure out the mean of the length of the tweets so I am
# saying df['len'] so basically what I am doing there is df length is going to return a list its going to return a list of all of
# the links of all of the 20 tweets in this case and the we are going to be running the mean function which is provided to us from
# numpy on that list and that we are going to get a single number which we are printing out to the screen, so thats what is going on in
# this line .
# Get average length over all tweets:
#print(np.mean(df['len']))
# Another thing we can ask is lets say we want to figure out what is the tweet that received the most
# likes in the sample that we have gotten. Here instead of taking the mean over a list what I want to
# do is take the max because what we want to do is we want to figure out the maximum no. of likes that
# is from the likes column in our data frame so the likes column stored all of the number of likes for
# every given tweet. I am taking the max of that list using the max function and then I am printing that
# out to the screen .
# Get the number of likes for the most liked tweet:
#print(np.max(df['likes']))
# Similarly we can get the most no. of retweets by following the same syntax.
# Get the number of retweets for the most retweeted tweet:
#print(np.max(df['retweets']))
#print(df.head(10))
# Time Series
# Lets say hypothetically that we want to do is we want to create a time series plot thats
# going to show us the number of lets say likes that Donald Trump recevied on any given day over
# the course of some days which we can extract from, you know some given count here so we extract lets
# say two hundred tweets maybe thats over you know ranged over some number of days and then for every one
# of those days donald trump got a certain number of likes, we want to plot that number of likes and he got
# on a given day and then just plot that for every given day over this time series of dates.
# So lets create a variable which we will call time_likes this is going to be equal to a panda's series object.
# So we are essentially creating a series onject- so we can eventually plot this as a time series so I want to say
# this is equal to pd.Series and then this takes two things the data, so there is a data frame that we want to feed
# it which is going to be tough values so this is pd data frame of likes so we want to actually get in the number of
# likes. So we are getting the values of the each of the likes, so every day there is going to be a certain number of
# likes that are given and we are extracting the values from that and then what we also want to receive in this time series
# function is the index and the index is essentially the x-axis so what we are plotting and then for each what we want to do
# is for each day show the number of likes so the number of likes is kind of the y-axis, the date is the time series isself
# the number of days so we are going to set the index is equal to the data frame of the date which is something that we have
# already extracted from before from our data frame so this is our time series object we have created from pandas and now thats
# ready for us to actually just go ahead and plot that so what we can do is we could say a time_likes.plot and what we can do is
# we can feed this a few arguments, I am going to be feeding this plot funtion with two arguments, baically just the size of the
# figure which specifies how big this graph is going to be and lets say the color so this is going to be the color of the line thats
# plotted throughout the days for every given day so I am gonna say figsize is equal to (16,4) that just the XY axis of the image that
# we are gonna see and then also the color which we can put as red so the function takes in a string in this case its just a single
# character that corresponds to a given color .
# So then what we do once we have kind of craated our data that we are going to plot once we have created our plot that we are going
# to show we actually need to show the plot so we are going to say plt.show(). So now this plt is something that we need to import from matplotlib.
# which is going to allow us to show the plots that we have created.
#time_likes = pd.Series(data=df['len'].values, index=df['date'])
#time_likes.plot(figsize=(16, 4), color='r')
#plt.show()
#time_favs = pd.Series(data=df['likes'].values, index=df['date'])
#time_favs.plot(figsize=(16, 4), color='r')
#plt.show()
# Lets do the time series for retweets , instead of time_likes lets call this time_retweets.
# We still want to see how the number of retweets changes over the course of a given set of days.
#time_retweets = pd.Series(data=df['retweets'].values, index=df['date'])
#time_retweets.plot(figsize=(16, 4), color='r')
#plt.show()
# To compare two graps we can bunch the time series together onto one plot so instead of plotting two
# spearate time series where we have one for the number of likes and one for the no. of retweets one thing
# we can do is we can just put them on the same plot and see how they correlate.
# I am going to add a label because we are going to have two lines on this time series
# one is going to be a label for the number of likes and one is going to be a label for
# the number of retweets. And then I am going to put in a legend=true. So this basically
# will put in a little box in the time series chart which will show us what line corresponds
# to what label and that is going to be kind of helpful because whats nice about pandas and matlpotlib
# is that if we plot multiplr lines here it will be smart enough to distinguish them by assigning them a
# different color and then what we are doing here is we are essentially just labeling to each of those lines
# from whatever they represent and then we are going to put in a legend which is essentially going to correspond
# to this blue line is likes , this orange line is retweets, its going to be easier for us to kind of visualize
# whats going on.
# Layered Time Series:
#time_likes = pd.Series(data=df['likes'].values, index=df['date'])
#time_likes.plot(figsize=(16, 4), label="likes", legend=True)
#time_retweets = pd.Series(data=df['retweets'].values, index=df['date'])
#time_retweets.plot(figsize=(16, 4), label="retweets", legend=True)
plt.show()
# Trial time series
#time_likes= pd.Series(data=df['likes'].values, index=df['date'])
#time_likes.plot(figsize=(16,4),color='r')
#plt.show()