-
Notifications
You must be signed in to change notification settings - Fork 1
/
tweet_urls.py
executable file
·88 lines (71 loc) · 2.4 KB
/
tweet_urls.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#!/usr/bin/env python
import urllib, simplejson as json, re, operator
import sqlalchemy as sql
from sqlalchemy import orm, ext
from sqlalchemy.ext import declarative
from functional import partial, flip
# helpers
def flatten(l):
return reduce(operator.add, l)
def encode(str, *args):
return str.encode(*args)
# DB stuff
Session = sql.orm.sessionmaker()
Base = sql.ext.declarative.declarative_base()
class Tweet(Base):
__tablename__ = 'tweets'
tweet_id = sql.Column(sql.Integer, primary_key = True)
def __init__(self, id):
self.tweet_id = id
class Link(Base):
__tablename__ = 'links'
id = sql.Column(sql.Integer, primary_key = True)
url = sql.Column(sql.String(500))
count = sql.Column(sql.Integer)
def __init__(self, url, count):
self.url = url
self.count = count
def __repr__(self):
return "<Link %s (%d)>" % (self.url, self.count)
# create the table if it doesn't exist
engine = sql.create_engine('sqlite:///tweets.db')
Tweet.metadata.create_all(engine)
Link.metadata.create_all(engine)
# get the timeline and links in it
timeline = urllib.urlopen('http://www.twitter.com/statuses/public_timeline.json').read()
timeline = json.loads(timeline)
encode = partial(flip(encode), 'ascii')
find_urls = re.compile(r'http://[^\s]+[^\s\.]')
links = []
tweets = [Tweet(t['id']) for t in timeline]
for matches in [map(encode, find_urls.findall(t['text'])) for t in timeline]:
if len(matches) > 0:
tweet_links = []
for url in matches:
# find the real URL (after redirects)
url = urllib.urlopen(url)
url = url.geturl()
tweet_links.append(Link(url, 1))
links.append(tweet_links)
else:
links.append(None)
stuff = zip(tweets, links)
stuff = filter(lambda s: s[1] != None, stuff)
# db stuff
Session.configure(bind = engine)
session = Session()
# if the tweet hasn't been seen before
for tweet, links in stuff:
if session.query(Tweet).filter_by(tweet_id = tweet.tweet_id).count() == 0:
session.add(tweet)
for new_link in links:
try:
orig_link = session.query(Link).filter_by(url = new_link.url).one()
except Exception:
session.add(new_link)
else:
orig_link.count += 1
session.add(orig_link)
# done adding, commit and close
session.commit()
session.close()