""" This script fills the subreddit_id field for wayback submissions by extracting the subreddit name from the submission's url. """ # Example: http://www.reddit.com/r/mildlyinteresting/comments/... --> mildlyinteresting def extract_subreddit(url): start = url.find("/r/") end = url.find("/", start + 3) return url[start + 3:end] from redditDB import RedditDB rdb = RedditDB("mciot", "r3dd1tmorgane", "blacksun.cs.mcgill.ca", 31050, "reddit_topics") for submission in rdb.get_wayback_submissions(): url = submission.get("comment_url") if url: subreddit = extract_subreddit(url) subreddit_obj = rdb.subreddit_exists(subreddit) # If the subreddit exists and the submission doesn't have an id, update if subreddit_obj and submission.get("subreddit_id") is None: rdb.update_wayback_submission(submission.get("_id"), "subreddit_id", subreddit_obj.get("_id"))
""" Get db stats of submissions/comments we have per-month. """ from redditDB import RedditDB from datetime import datetime rdb = RedditDB("mciot", "r3dd1tmorgane", "blacksun.cs.mcgill.ca", 31050, "reddit_topics") stats = {} num_no_created = 0 counter = 0 submissions = rdb.get_wayback_submissions() for submission in submissions: counter += 1 if counter % 100 == 0: print counter created = submission.get(u'created') if created is None: num_no_created += 1 continue if type(created) == float: date = datetime.fromtimestamp(created) elif type(created) == datetime: date = created else:
# Blank line separating this report section from the preceding output.
print ""

#TODO: Do this straight with mongodb aggregation
## Popular domains ###
# Report for the current collection: rdb.submission_list() is handed to
# domain_frequencies(), and top_domains() then picks the entries to print.
# NOTE(review): domain_frequencies() and top_domains() are defined outside
# this excerpt; the loop below relies on top_domains() yielding
# (domain, count) pairs.
print "=========Popular domains:current collection========="
print "Finding frequencies of all domains..."
domain_freqs = domain_frequencies(rdb.submission_list())
print "Calculating top domains..."
# NOTE(review): the name says "top 20" but top=100 is requested.
top_20_domains = top_domains(domain_freqs, top=100)
print "DOMAIN\tCOUNT"
for domain, num in top_20_domains:
    print domain, "\t", num
print ""

# Same report, computed over the wayback submissions.
print "=========Popular domains:wayback collection========="
print "Finding frequencies of all domains..."
domain_freqs_wb = domain_frequencies(rdb.get_wayback_submissions())
print "Calculating top domains..."
# NOTE(review): as above, the name says "top 20" but top=100 is requested.
top_20_domains_wb = top_domains(domain_freqs_wb, top=100)
print "DOMAIN\tCOUNT"
for domain, num in top_20_domains_wb:
    print domain, "\t", num
print ""

### logging ####
# One row per value returned by rdb.log_types(), paired with the result of
# rdb.logged_errors_count() for that value.
print "=========logging information========="
types = rdb.log_types()
print 'TYPE', '\t', 'COUNT'
for typ in types:
    print typ, '\t', rdb.logged_errors_count(typ)