-
Notifications
You must be signed in to change notification settings - Fork 0
/
TwitterCaller.py
executable file
·349 lines (288 loc) · 11.4 KB
/
TwitterCaller.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
## local modules
from ProxyList import *
from Exceptions import APIParseException
from Exceptions import APICursorException
## python modules
from datetime import *
import eventlet
from eventlet.green.httplib import *
from time import sleep
from copy import copy
from math import *
import socket
####
## TwitterCaller
## @author bslawski
## Twitter API to TwitterCrawler thread interface
## Holds a proxy address and port number, tracking
## number of API calls made, switching proxies
## automatically when a max number of calls have
## been made. Parses .json files and returns follower,
## screenname, and tweet information
##
## fields:
## proxyList - ProxyList object to return new proxies
##
####
class TwitterCaller:
## constructor method
def __init__(self, proxyList):
## ProxyList object to retrieve new proxy addresses from
self.proxyList = ProxyList(proxyList, 100)
## number of attempted content lookups
self.lookupAttempts = 0
self.maxpage = 9999
## dictionary of month name to month number conversions
self.months = {}
self.months['Jan'] = 1
self.months['Feb'] = 2
self.months['Mar'] = 3
self.months['Apr'] = 4
self.months['May'] = 5
self.months['Jun'] = 6
self.months['Jul'] = 7
self.months['Aug'] = 8
self.months['Sep'] = 9
self.months['Oct'] = 10
self.months['Nov'] = 11
self.months['Dec'] = 12
## Splits into multiple threads, each getting the
## screename associated with an id in the given list.
## Returns the names found, as well as ids that could
## not be looked up
def getNameMulti(self, pageids):
foundNames = {}
badIds = []
nids = len(pageids)
pool = eventlet.GreenPool()
ngotten = 0
returned = False
for nametag in pool.imap(self.getNameMultiHelper, pageids):
if returned:
eventlet.kill()
ngotten += 1
name, tag = nametag
if tag:
id, username, popularity, importance, birthtime = name
foundNames[id] = (username, popularity, importance, birthtime)
else:
badIds.append(name)
if ngotten > nids * .8:
returned = True
return foundNames, badIds
return foundNames, badIds
## Tries to look up a users screename given their id,
## returning the name and True if successful,
## and the original id and False if not
def getNameMultiHelper(self, pageid):
## self.getName(pageid)
try:
iddata = self.getName(pageid)
return (iddata, True)
except:
return (pageid, False)
## Given a twitter page id as a string, retrieves and parses the
## user lookup .json file, returning as a string the user screen
## name corresponding to the id
def getName(self, pageid):
lookup = 'http://api.twitter.com/1/users/lookup.json?user_id=' \
+ pageid
content = self.getContent(lookup)
try:
name = ((content.split('\"screen_name\":\"'))[1]\
.split('\"'))[0]
nfollowing = ((content.split('\"friends_count\":'))[1]\
.split(','))[0]
nfollowing = int(nfollowing)
nfollowers = ((content.split('\"followers_count\":'))[1]\
.split(','))[0]
nfollowers = int(nfollowers)
birthtime = (((content.split('\"created_at\":\"'))[1])\
.split('\",\"'))[0]
birthtime = birthtime.split()
birthtime = birthtime[5] + ':' + str(self.months[birthtime[1]]) + ':' + birthtime[2]
## Can't find the user data in the file
except:
raise APIParseException("Name Parse Error!")
return pageid, name, nfollowers, log(float(nfollowing) / float(nfollowers), 10.), birthtime
## Splits into multiple threads to get the followers associated
## with each id in the given list.
## Found followers are returned as a list of lists, and ids
## that could not be looked up are returned in a list
def getFollowersMulti(self, pageids, maxpage=9999):
self.maxpage = maxpage
foundFollowers = []
badIds = []
pool = eventlet.GreenPool()
hasReturned = False
ngotten = 0
nids = len(pageids)
for idtag in pool.imap(self.getFollowersMultiHelper, pageids):
if hasReturned:
eventlet.kill()
ids, tag = idtag
if tag:
foundFollowers.append(ids)
ngotten += 1
else:
badIds.append(ids)
if ngotten > nids * .5:
hasReturned = True
return foundFollowers, badIds
return foundFollowers, badIds
## Tries to look up the followers associated with a given page,
## returning the list of followers and True if successful,
## and the original id and False if not
def getFollowersMultiHelper(self, pageid):
try:
ids = self.getFollowers(pageid)
return (ids, True)
except:
return (pageid, False)
## Given a twitter page id as a string, retrieves and parses the
## follwer .json file, returning as a list of strings the ids of
## the page
def getFollowers(self, pageid, verbose=False):
## looks up the first page
try:
nextcur, newids = self.getCursorFollowers(pageid, '-1')
except APICursorException:
return self.getFollowers(pageid)
page = 0
ids = newids
if self.maxpage == 0:
return ids
if verbose:
print len(ids), 'followers'
## looks up subsequent pages if necessary
while not nextcur == '0':
page += 1
## print 'page ', page, nextcur
gotPage = False
while not gotPage:
try:
oldcur = nextcur
oldids = newids
nextcur, newids = self.getCursorFollowers(pageid, nextcur)
gotPage = True
## didn't work, try again
except APICursorException:
nextcur = oldcur
newids = oldids
gotPage = False
ids.extend(newids)
if verbose:
print len(ids), 'followers'
return ids
## Helper function for getFollowers
## Retrieves a single page of a users followers
def getCursorFollowers(self, pageid, cursor):
lookup = 'http://api.twitter.com/1/followers/ids.json?cursor=' + \
cursor + '&user_id=' + pageid
content = self.getContent(lookup)
try:
ids = (((content.split('\"ids\":['))[1]).split(']'))[0]
ids = ids.split(',')
## Can't find followers to parse
except:
raise APICursorException("No followers!")
try:
nextcur = (((content.split('\"next_cursor_str\":\"'))[1])\
.split('\"'))[0]
## Can't find the cursor for the next page
except:
raise APICursorException("cursor error")
return nextcur, ids
## Splits into multiple threads to look up the tweets
## from each user in the given list.
## Returns found tweets as a list of lists, as well as
## a list of ids that could not be looked up
def getTweetsMulti(self, userids):
foundTweets = {}
badIds = []
nids = len(userids)
pool = eventlet.GreenPool()
ngotten = 0
notFound = copy(userids)
returned = False
for tweettag in pool.imap(self.getTweetsMultiHelper, userids):
if returned:
eventlet.kill()
ngotten += 1
tweets, tag, id = tweettag
if tag:
foundTweets[id] = tweets
else:
badIds.append(id)
notFound.remove(id)
if ngotten > nids * .8:
returned = True
return foundTweets, badIds, notFound
return foundTweets, badIds, notFound
## Tries to get the tweets for a given user,
## returning the tweets and True if successful,
## and the original screename and False if not,
## as well as the name
def getTweetsMultiHelper(self, id):
try:
tweets = self.getTweets(id)
return (tweets, True, id)
except APIParseException:
return (id, False, id)
## Given a twitter username as a string, retrieves and parses a
## .json file of the user's past week of tweets, returning the
## tweets as a list of tuples, with the julian time of the
## tweet as the first element and the tweet string as the second
def getTweets(self, userid):
tweets = []
lookup = 'http://api.twitter.com/1/statuses/user_timeline.json?count=200&user_id=' + userid
## lookup = 'http://search.twitter.com/search.json?q=from:' + username
content = self.getContent(lookup)
try:
tweetArr = content.split('},{')
tweetArr[-1] = (tweetArr[-1].split('}]'))[0]
for entry in tweetArr:
tweet = ((entry.split('\"text\":\"')[1]).split('\",\"'))[0]
tweettime = (((entry.split('\"created_at\":\"'))[1])\
.split('\",\"'))[0]
tweettime = tweettime.split()[1:]
tweet_t = tweettime[2].split(':')
tweettime = datetime(int(tweettime[4]), \
self.months[tweettime[0]], \
int(tweettime[1]), \
int(tweet_t[0]), \
int(tweet_t[1]), \
int(tweet_t[2]))
tweets.append((tweettime,tweet))
except:
raise APIParseException("No Tweets!")
return tweets
## Retrieves a given url using the TwitterCaller's proxy,
## returning the read page as a string
def getContent(self, lookup):
self.lookupAttempts += 1
try:
proxy = self.proxyList.getProxy()
except:
sleep(1)
return self.getContent(lookup)
## connect = HTTPConnection(proxy, timeout=20)
## connect.request("GET", lookup)
## content = connect.getresponse().read()
try:
connect = HTTPConnection(proxy, timeout=6)
connect.request("GET", lookup)
content = connect.getresponse().read()
## lookup failed, lets get a new proxy
except socket.timeout:
if self.lookupAttempts > 20:
return ''
## print 'timeout'
self.proxyList.reportBadProxy(proxy)
content = self.getContent(lookup)
except Exception,e:
## print e
self.proxyList.reportBadProxy(proxy)
return ''
self.lookupAttempts = 0
return content