def update_all_reddits(self, smallest_id, largest_id):
    """Download all submissions whose id lies in [smallest_id, largest_id).

    The id range is walked in batches of at most ENTRIES_TO_FETCH ids that
    ``self.db_manager`` does not contain yet.  Each batch is requested from
    reddit's ``by_id`` endpoint and every returned submission is handed to
    ``self.__update_given_submission``.  Ids that were requested but not
    returned are inserted as non-existent submissions, so they are not
    queried again.

    API reference: http://www.reddit.com/dev/api (``t3_`` means link).
    Example request:
    http://www.reddit.com/by_id/t3_zcd40,t3_zcd41,t3_zcd42,t3_zcd43/.json

    Args:
        smallest_id: First numeric submission id to consider (inclusive).
        largest_id: Numeric submission id at which to stop (exclusive).
    """
    # Single-argument print(...) behaves the same as a Python 2 print
    # statement and as the Python 3 function; the doubled spaces reproduce
    # the separators the original comma-separated print statements emitted.
    print("Attempting to download all reddit submissions between id:  %s  and  %s"
          % (smallest_id, largest_id))

    batch_number = 0
    entries_written = 0
    entries_non_existent = 0

    while smallest_id < largest_id:
        # Progress report on the first batch and then every 50 batches.
        if batch_number % 50 == 0:
            print("Entries written:  %s  [Non-existent:  %s ] - on id:  %s"
                  % (entries_written, entries_non_existent, smallest_id))
        batch_number += 1

        # Queue up ids which the database does not currently contain.
        ids_to_fetch = []
        while smallest_id < largest_id and len(ids_to_fetch) < ENTRIES_TO_FETCH:
            if not self.db_manager.row_exists(smallest_id):
                ids_to_fetch.append(smallest_id)
            smallest_id += 1

        # Every id in this stretch is already stored: do not issue a
        # request against the bare by_id URL.
        if not ids_to_fetch:
            continue

        # Build the query URL from the comma-separated link fullnames.
        url = "http://www.reddit.com/by_id/" + ",".join(
            "t3_" + base36encode(submission_id) for submission_id in ids_to_fetch)

        try:
            response = self.r.request_json(url, params={'limit': 100}, data=None,
                                           as_objects=True, retry_on_error=True)
            submissions = response['data']['children']
        except Exception:
            # A failed request tells us nothing about whether the ids exist.
            # Skip the batch without marking anything non-existent; the ids
            # stay absent from the database, so a later run queues them again.
            print("Error when trying to fetch url:  %s" % (url,))
            continue

        fetched_ids = set()
        for submission in submissions or ():
            self.__update_given_submission(submission)
            entries_written += 1
            fetched_ids.add(base36decode(submission.id))

        # Whatever was requested but not returned does not exist: record a
        # placeholder row so it is not requested again.
        for missing_id in ids_to_fetch:
            if missing_id in fetched_ids:
                continue
            if not self.db_manager.row_exists(missing_id):
                placeholder = Submission.non_existent_submission(missing_id)
                self.db_manager.insert_submission(placeholder)
                entries_non_existent += 1