# reddit_all_dump.py
"""
Can use request_json to plug holes:
https://praw.readthedocs.org/en/latest/pages/code_overview.html
"""
import praw
from reddit_submission import Submission
from reddit_database_manager import DatabaseManager
from base36 import base36decode
# from secret import REDDIT_USER_AGENT
from base36 import base36encode
import sys
REDDIT_USER_AGENT = ""
# 100 is the max entries before pagination
ENTRIES_TO_FETCH = 100


class AllFetcher:
    """Download reddit submissions by numeric id and store them in a database.

    The ``db_manager`` object must provide ``row_exists``,
    ``insert_submission``, ``replace_submission`` and ``query``; those are the
    only methods this class calls on it.
    """

    def __init__(self, db_manager):
        """Create the PRAW client and remember the database manager.

        The user agent is read from the module-level ``REDDIT_USER_AGENT`` at
        construction time, so it must be set before instantiating.
        """
        print("Starting AllFetcher with user_agent = %s" % (REDDIT_USER_AGENT,))
        self.r = praw.Reddit(user_agent=REDDIT_USER_AGENT)
        self.db_manager = db_manager

    def get_most_recent_id(self):
        """Return the decoded (integer) id of the first entry of /new.

        Raises whatever the request raises, and ``IndexError`` if the listing
        comes back empty.
        """
        url = "http://www.reddit.com/new"
        listing = self.r.request_json(url, params=None, data=None,
                                      as_objects=True, retry_on_error=True)
        newest = listing['data']['children'][0]
        return base36decode(newest.id)

    def __update_given_submission(self, submission):
        """Insert ``submission`` if it is new, otherwise refresh its score.

        An existing row is only rewritten when the score actually changed.
        """
        submission_id = base36decode(submission.id)
        if not self.db_manager.row_exists(submission_id):
            self.db_manager.insert_submission(
                Submission.from_reddit_api(submission))
            return

        new_submission = Submission.from_reddit_api(submission)
        # %d only accepts a number, so nothing but digits reaches the SQL text.
        query = 'SELECT * FROM submissions WHERE id = %d;' % submission_id
        existing_submissions = [Submission(x)
                                for x in self.db_manager.query(query)]
        assert len(existing_submissions) == 1, \
            "expected exactly one row for id %d" % submission_id
        existing_submission = existing_submissions[0]
        if existing_submission.score != new_submission.score:
            existing_submission.score = new_submission.score
            self.db_manager.replace_submission(existing_submission)

    def update_all_reddits(self, smallest_id, largest_id):
        """Fetch every submission with ``smallest_id <= id < largest_id``.

        Ids already present in the database are skipped. The remaining ids are
        requested in batches of at most ``ENTRIES_TO_FETCH`` through the
        ``by_id`` endpoint. Ids that a successful response does not return are
        stored as non-existent placeholders. A batch whose request fails is
        skipped without recording anything, so a later run can retry it.
        """
        # http://www.reddit.com/dev/api
        # t3_ means link
        # example:
        # http://www.reddit.com/by_id/t3_zcd40,t3_zcd41,t3_zcd42,t3_zcd43/.json
        print("Attempting to download all reddit submissions between id:  %s  and  %s"
              % (smallest_id, largest_id))
        batches_started = 0
        entries_written = 0
        entries_non_existent = 0
        while smallest_id < largest_id:
            # Progress line on the first batch and then every 50 batches.
            if batches_started % 50 == 0:
                print("Entries written:  %s  [Non-existent:  %s ] - on id:  %s"
                      % (entries_written, entries_non_existent, smallest_id))
            batches_started += 1

            # Queue up ids which the database does not currently contain.
            submissions_to_fetch_int = set()
            while (smallest_id < largest_id
                   and len(submissions_to_fetch_int) < ENTRIES_TO_FETCH):
                if not self.db_manager.row_exists(smallest_id):
                    submissions_to_fetch_int.add(smallest_id)
                smallest_id += 1

            # Everything in this window is already stored: no request needed.
            if not submissions_to_fetch_int:
                continue

            url = "http://www.reddit.com/by_id/" + ','.join(
                "t3_" + base36encode(s)
                for s in sorted(submissions_to_fetch_int))

            try:
                listing = self.r.request_json(
                    url, params={'limit': ENTRIES_TO_FETCH}, data=None,
                    as_objects=True, retry_on_error=True)
                submissions = listing['data']['children']
            except Exception as exc:
                # A failed request says nothing about whether the ids exist,
                # so do not mark them as non-existent; just move on.
                print("Error when trying to fetch url:  %s (%r)" % (url, exc))
                continue

            submissions_fetched_int = set()
            for submission in submissions or ():
                self.__update_given_submission(submission)
                entries_written += 1
                submissions_fetched_int.add(base36decode(submission.id))

            # Requested ids missing from the response are recorded as
            # non-existent so they are not requested again.
            for submission_id in submissions_to_fetch_int - submissions_fetched_int:
                if not self.db_manager.row_exists(submission_id):
                    non_existent_entry = Submission.non_existent_submission(
                        submission_id)
                    self.db_manager.insert_submission(non_existent_entry)
                    entries_non_existent += 1
"""
def update_posts(self):
most_recent_id = self.get_most_recent_id()
self.update_all_reddits(most_recent_id)
"""
def remove_commas(s):
    """Return the string ``s`` with every comma removed (e.g. "1,000" -> "1000")."""
    return s.replace(',', '')
def main():
    """Command-line entry point: dump a range of submission ids into SQLite files.

    Expects exactly three arguments: the reddit user agent, the smallest id
    and the largest id (exclusive). The ids may contain thousands separators,
    e.g. ``5,000,000``. The range is processed in buckets of one million ids,
    each bucket going to its own database file. A final partial bucket is
    processed up to ``largest_id`` instead of being dropped.
    """
    global REDDIT_USER_AGENT
    # Explicit check instead of assert, which is stripped under ``python -O``.
    if len(sys.argv) != 4:
        sys.exit("usage: reddit_all_dump.py USER_AGENT SMALLEST_ID LARGEST_ID")
    REDDIT_USER_AGENT = sys.argv[1]
    smallest_id = int(remove_commas(sys.argv[2]))
    largest_id = int(remove_commas(sys.argv[3]))

    # each bucket has 1,000,000 entries
    MILLION_BUCKET_SIZE = 1000000
    # Ceiling division so a trailing partial bucket is still visited; an
    # empty or inverted range yields zero buckets.
    number_of_buckets = ((largest_id - smallest_id + MILLION_BUCKET_SIZE - 1)
                         // MILLION_BUCKET_SIZE)
    for i in range(number_of_buckets):
        current_smallest_id = smallest_id + (i * MILLION_BUCKET_SIZE)
        bucket_end = current_smallest_id + MILLION_BUCKET_SIZE
        # Never fetch past the requested upper bound.
        current_largest_id = min(bucket_end, largest_id)
        # The file name always carries the nominal bucket bounds, so a
        # partial bucket lands in the same file a full run would use.
        database_path = "all_submissions_dump_%dm_%dm.sqlite" % (
            current_smallest_id // MILLION_BUCKET_SIZE,
            bucket_end // MILLION_BUCKET_SIZE)
        print("current_smallest_id:  %s" % (current_smallest_id,))
        print("current_largest_id:  %s" % (current_largest_id,))
        print("DATABASE_PATH:  %s" % (database_path,))

        db_manager = DatabaseManager(database_path)
        f = AllFetcher(db_manager)
        f.update_all_reddits(current_smallest_id, current_largest_id)

        # Per-bucket statistics, read from counters kept by the manager.
        print("Added %s new entries total." % (db_manager.new_rows_written,))
        print("Modified %s  total entries ." % (db_manager.rows_written,))
        print("Found %s that already exist." % (db_manager.already_exist,))
        # NOTE(review): nothing is closed explicitly here; confirm whether
        # DatabaseManager needs an explicit close/commit call.
        print("Closing DB.")
# Run the dump only when executed as a script, not when imported.
if __name__ == "__main__":
    main()