# Refresh every stored subreddit from the reddit API.
# Assumes project-level names: Session, Subreddit, DBIterator, and a
# praw.Reddit instance named `r` (old praw 2.x, which bundled `requests`
# as praw.requests).
def main(notify):
    session = Session()
    total = session.query(Subreddit).count()
    count = 0
    notify("starting update of %d subs" % total)

    # Oldest-scraped first, so interrupted runs resume where they left off.
    query = session.query(Subreddit).order_by("scraped_time asc")
    dbi = DBIterator(query=query, use_offset=None)

    for subreddit in dbi.results_iter():
        count += 1
        try:
            # URL is of the form /r/<name>/; index 2 is the subreddit name.
            subreddit.update_from_praw(r.get_subreddit(subreddit.url.split('/')[2]))
            session.add(subreddit)
        except (praw.requests.exceptions.HTTPError,
                praw.errors.InvalidSubreddit) as e:
            print "ERROR", str(e)
            # Bump scraped_time anyway so a failing sub doesn't block the queue.
            subreddit.touch()
            session.add(subreddit)
        if count % 2000 == 0 and notify is not None:
            notify("at %d of %d" % (count, total))
        if count % 10 == 0:
            session.commit()

    session.commit()
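# The DBIterator used above isn't shown anywhere in this section; the
# following is a minimal sketch of one plausible implementation, assuming
# its job is to batch query results so the whole table is never held in
# memory at once. The names and batch size are illustrative, not the
# project's actual code. With use_offset disabled (as in the updater
# above), each batch re-runs the base query from the start, which works
# because the caller mutates scraped_time on every row it processes, so
# already-handled rows sort to the back of the "scraped_time asc" order.
class DBIterator(object):
    def __init__(self, query, use_offset=True, batch_size=1000):
        self.query = query
        self.use_offset = use_offset
        self.batch_size = batch_size

    def results_iter(self):
        total = self.query.count()
        seen = 0
        offset = 0
        while seen < total:
            q = self.query.limit(self.batch_size)
            if self.use_offset:
                q = q.offset(offset)
            batch = q.all()
            if not batch:
                break
            for row in batch:
                yield row
                seen += 1
            offset += self.batch_size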
# Discover new subreddits linked from the descriptions of ones we already have.
# Assumes project-level names: Session, Subreddit, DiscoveredSub, DBIterator,
# find_sub_links, and add_new_subs; HTMLParser is the Python 2 stdlib class.
def main(notify):
    parser = HTMLParser()
    session = Session()
    subreddit_count = session.query(Subreddit).count()
    start_count = session.query(DiscoveredSub).count()
    notify("discovering from %d existing" % subreddit_count)

    discovered_subs = set()
    query = session.query(Subreddit.description_html) \
                   .filter(Subreddit.description_html != None)
    dbi = DBIterator(query=query)

    for sub in dbi.results_iter():
        # Normalize every /r/ link in the description to the canonical
        # '/r/<name>/' form used by the Subreddit.url column.
        links = set(map(lambda s: u'/r/' + s.lower().strip() + u'/',
                        find_sub_links(parser.unescape(sub.description_html))))
        if len(links) == 0:
            continue

        # Drop links we already track, either as full Subreddit rows or as
        # previously discovered-but-unscraped entries.
        existing = set(map(lambda s: s.url.lower().strip(),
                           session.query(Subreddit.url)
                                  .filter(Subreddit.url.in_(links))))
        found = set(map(lambda s: s.url.lower().strip(),
                        session.query(DiscoveredSub.url)
                               .filter(DiscoveredSub.url.in_(links))))
        new_subs = (links - existing) - found
        if len(new_subs) > 0:
            discovered_subs.update(new_subs)

        # Flush in batches; reset afterwards so a batch isn't re-inserted.
        if len(discovered_subs) > 25:
            add_new_subs(session, discovered_subs)
            discovered_subs = set()

    if len(discovered_subs) > 0:
        add_new_subs(session, discovered_subs)

    end_count = session.query(DiscoveredSub).count()
    notify("found additional %d" % (end_count - start_count))
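# Neither find_sub_links nor add_new_subs appears in this section; these are
# illustrative sketches of the behavior the discovery script relies on, not
# the project's actual implementations. The regex and the DiscoveredSub
# constructor call are assumptions.
import re

SUB_LINK_RE = re.compile(r'/r/([A-Za-z0-9][A-Za-z0-9_]{2,20})')

def find_sub_links(html):
    # Pull bare subreddit names out of any /r/<name> reference in the HTML.
    return [m.group(1) for m in SUB_LINK_RE.finditer(html)]

def add_new_subs(session, urls):
    # Queue newly seen subreddit URLs for a later scrape pass.
    for url in urls:
        session.add(DiscoveredSub(url=url))
    session.commit()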
# Build a graph of subreddits, edged by the /r/ links in their sidebar
# descriptions, and write it out in GEXF format.
# Assumes project-level names: Session, Subreddit, DBIterator, find_sub_links,
# and initialize_node; nx is networkx.
def main(notify):
    g = nx.Graph()
    out_filename = "data/subreddits_edged_by_description_links.gexf"
    parser = HTMLParser()
    session = Session()

    query = session.query(Subreddit)
    dbi = DBIterator(query=query)

    for subreddit in dbi.results_iter():
        # URL is of the form /r/<name>/; index 2 is the subreddit name.
        sub = subreddit.url.split("/")[2].lower()
        initialize_node(g, sub)
        if not subreddit.description_html:
            continue
        html = parser.unescape(subreddit.description_html)
        for linked_sub in find_sub_links(html):
            # Repeated links between the same pair just raise the edge weight.
            if g.has_edge(sub, linked_sub):
                g[sub][linked_sub]["weight"] += 1
            else:
                g.add_edge(sub, linked_sub, weight=1)

    nx.write_gexf(g, out_filename)
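# initialize_node is also project code not shown here; presumably it guards
# against duplicate nodes and attaches whatever default attributes the GEXF
# export expects. A minimal stand-in might look like this:
def initialize_node(g, name):
    if not g.has_node(name):
        g.add_node(name, label=name)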