def update_submission_content(self, topics=None, sub_per_topic=10):
    """Continuously scrape link content for submissions that have none yet.

    Once crawling starts the method loops forever: on every pass it visits
    each subreddit and scrapes at most ``sub_per_topic`` links for it.

    Args:
        topics: Name of a file under ``../data/`` holding one subreddit name
            per line. When ``None`` (or when the file turns out to be empty)
            every subreddit returned by ``self.db.get_subreddits()`` is
            crawled instead.
        sub_per_topic: Maximum number of links scraped for one subreddit
            during a single pass.

    Returns:
        ``None`` -- and only when the topics file cannot be opened; otherwise
        the crawl loop never returns.
    """
    if topics is None:
        print("No topics specified. Updating all subreddits.")
    else:
        topicfile = "../data/" + topics
        try:
            # Context manager so the handle is closed even if read() fails.
            with open(topicfile) as handle:
                topics = handle.read().splitlines()
        except IOError:
            print("Cannot find specified topics file.")
            return

    # NOTE(review): there is no sleep/back-off between passes; when nothing
    # is left to scrape this loop spins against the database.
    while True:
        # Go through each subreddit in db or specified by topic list
        subreddits = topics if topics else self.db.get_subreddits()
        for subreddit in subreddits:
            if topics:
                # Topic entries are plain names; resolve them to db records
                # and skip names the database does not know about.
                subreddit = self.db.subreddit_exists(subreddit)
                if not subreddit:
                    continue
            topic = subreddit.get("subreddit_name")
            print("======== %s ========" % (topic,))

            num_content = 0
            # Only get the submissions whose (non-empty) urls haven't been scraped
            for submission in self.db.empty_submissions(topic):
                try:
                    link = str(submission.get("url"))
                except UnicodeEncodeError:
                    # Non-ASCII URL that cannot be turned into a byte string.
                    continue
                html = links.scrape_link(link, topic)
                if html:
                    print("Adding content from: %s" % (link,))
                    num_content += 1
                    self.db.add_link_content(submission.get("_id"), str(html))
                if num_content >= sub_per_topic:
                    break
            print("Crawled %s links for %s %s"
                  % (num_content, topic, datetime.datetime.today()))
def add_submission(self, submission, subreddit_name, follow_link=False):
    """Store a submission (and its author/comments) under a known subreddit.

    If the submission is already stored, only its comments are updated.
    Otherwise a new submission document is inserted, the subreddit's
    ``last_update`` timestamp is refreshed, the currently available comments
    are added and, optionally, the linked page is scraped and stored.

    Args:
        submission: Submission object exposing the attributes read below
            (``author``, ``title``, ``selftext``, ``ups``, ``downs``,
            ``num_comments``, ``link_flair_text``, ``url``, ``id``,
            ``fullname``, ``created``, ``subreddit`` and usually ``comments``).
        subreddit_name: Name of the subreddit the submission belongs to; it
            must already exist in the database.
        follow_link: When true and the submission has a URL, scrape the
            linked page and store its content with the new submission.

    Returns:
        The database ``_id`` of the existing or newly inserted submission.

    Raises:
        errors.MissingError: If ``subreddit_name`` is not in the database.
    """
    # Local import: epoch seconds without the platform-specific
    # ``strftime("%s")`` directive.
    import time

    subreddit = self.subreddit_exists(subreddit_name)
    if not subreddit:
        raise errors.MissingError(
            "Subreddit %s does not exist in the database." % subreddit_name)

    subreddit_id = subreddit.get("_id")
    print("found subreddit: %s" % (subreddit_id,))

    # If the submission has an author (could have been deleted),
    # take care of updating or adding user information
    if submission.author:
        print(submission.author)
        auth = self.user_exists(submission.author.id)
        if auth:
            # Author already stored: update the author's information.
            print(self.update_user(auth.get("_id"), submission.subreddit,
                                   "submissions"))
        else:
            # Otherwise create a new user object
            self.add_user(submission.author)
        # Bound on both branches so the document below can always use it.
        author = submission.author.fullname
    else:
        print("No author")
        author = None

    # If the submission already exists, update comments
    sub = self.submission_exists(submission)
    if sub:
        print("Submission exists-updating")
        submission_id = sub.get("_id")
        self.update_comments(submission, submission_id, subreddit_id)
        return submission_id

    # Otherwise create submission object and add all the current comments
    print("Adding new submission")
    document = {
        "subreddit_id": subreddit_id,
        "submission_title": submission.title,
        "submission_text": submission.selftext,
        "karma": submission.ups,
        "downvotes": submission.downs,
        "num_comments": submission.num_comments,
        "flair": submission.link_flair_text,
        "url": submission.url,
        "praw_id": submission.id,
        "praw_fullid": submission.fullname,
        "created": submission.created,
        "author": author,
    }

    # First add the submission to the db to make sure we have it
    submission_id = self.submission_collection.insert(document)
    self.subreddit_collection.update(
        {"_id": subreddit_id},
        {"$set": {"last_update": int(time.time())}})

    # Next add the comments, if they're available
    try:
        comments = layer_comments(submission.comments)
        for layer, comment_list in comments.items():
            for comment in comment_list:
                self.add_comment(comment, layer, submission_id, subreddit_id)
    except AttributeError:
        # Sometimes submissions don't have comment attribute
        pass

    # Now follow link if this was specified and if
    # submission contains a link to follow
    # NOTE(review): applied to newly inserted submissions only -- confirm
    # this matches the intended behaviour for already-stored submissions.
    if follow_link and submission.url:
        content = links.scrape_link(submission.url, subreddit_name)
        # Skip empty results so a failed scrape is not stored as the
        # literal string "None".
        if content:
            self.add_link_content(submission_id, str(content))

    return submission_id