/
twitter.py
56 lines (46 loc) · 1.85 KB
/
twitter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
"""
Step 1 of the project.
The code below simply scrapes real-time tweets pertaining to 'Biden' or the hashtag '#USElection2020'.
Scraped tweets are sent to a Kinesis stream I have set up in my account.
An AWS Firehose delivery stream is connected to the 'macs30123-final' Kinesis stream; Firehose takes each record, converts it to Parquet format,
and uploads a Parquet file to an S3 bucket every 900 seconds.
"""
import sys, boto3, json
from tweepy import OAuthHandler, API, Stream
from tweepy.streaming import StreamListener
# Twitter API / AWS setup.
# Kinesis client bound to the us-west-1 region; the Listener below calls put_record on it.
kinesis = boto3.client('kinesis', region_name='us-west-1')
# OAuth 1a handler built from the app's consumer key and consumer secret.
# NOTE(review): the '****' values look like redacted placeholders; real credentials
# must be supplied here (preferably from the environment, not from source control).
auth = OAuthHandler('****', '****')
# Attach the user's access token and access token secret to the handler.
auth.set_access_token('****', '****')
# API wrapper configured to wait (and notify) when a rate limit is hit.
# Only `api.auth` is read later in this file, when the Stream is constructed.
api = API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
class Listener(StreamListener):
    """Stream listener that forwards qualifying tweets to the Kinesis stream.

    A tweet qualifies when it has text, its author has a profile description,
    and its text does not start with 'RT' (retweet) or 'http' (plain link).
    """

    def __init__(self):
        """Initialise the base listener and the forwarded-tweet counter."""
        super().__init__()
        # Number of tweets sent to Kinesis so far; echoed to the console.
        self.count = 0

    def on_status(self, status):
        """Filter one incoming status and, if it qualifies, put it on Kinesis.

        Args:
            status: the tweepy status object for the incoming tweet.

        Returns:
            None in every case, so the stream keeps running.
        """
        text = status.text
        description = status.user.description

        # Skip tweets without text or whose author has no description.
        if not text or not description:
            return

        # Skip retweets and plain links. The previous check compared the
        # three-character slice text[:3] with the four-character 'http',
        # which can never be equal, so link tweets were never filtered out.
        if text.startswith(('RT', 'http')):
            return

        # Bundle the tweet data into JSON format.
        payload = json.dumps({
            'id': status.id,
            'text': text,
            'description': description,
            'followers': status.user.followers_count,
        })

        # Update the count and print progress to the console; '\r' keeps the
        # counter on a single, continuously overwritten line.
        self.count += 1
        print(self.count, end='\r')

        # Send the JSON to the Kinesis stream.
        # NOTE: the partition key is constant, so every record is routed to
        # the same shard; acceptable for a single-shard stream.
        kinesis.put_record(
            StreamName='macs30123-final',
            Data=payload,
            PartitionKey='partitionkey',
        )

    def on_error(self, status_code):
        """Print the error status code and stop streaming.

        Args:
            status_code: the HTTP status code reported by the streaming API.

        Returns:
            False, which tells tweepy to disconnect instead of retrying.
        """
        print(status_code)
        return False
# Open a filtered stream that hands every incoming status to a fresh Listener.
stream = Stream(auth=api.auth, listener=Listener())
try:
    print('Start streaming.')
    # One list entry per track term: tweepy comma-joins the list and the
    # streaming API treats commas as OR. The previous single string
    # 'Biden, #USElection2020' only worked because the embedded comma survived
    # the join and the leading space of the second term was tolerated.
    stream.filter(track=['Biden', '#USElection2020'])
except KeyboardInterrupt:
    # Ctrl-C is the expected way to end the scrape.
    print("Stopped.")
finally:
    # Always close the connection, whether stopped by the user or by an error.
    print('Done.')
    stream.disconnect()