def on_status(self, status):
  """Handle one incoming tweet from the Twitter stream.

  Extracts every URL from the tweet, filters out blocked and non-news
  sites, and hands the surviving URLs to the crawler.

  Args:
    status: a tweepy Status object for the incoming tweet.
  """
  # Retweets are detected by the classic "RT @" text prefix.
  # NOTE(review): this misses native retweets that lack the prefix — confirm
  # whether status.retweeted_status should also be checked.
  retweet = status.text.startswith("RT @")

  # Collect all URLs embedded in the tweet's JSON payload.
  urls = set()  # was set([]) — idiomatic empty-set literal
  collect_urls(status._json, urls)

  user = status.user.screen_name.lower()
  for url in urls:
    # Skip explicitly blocked sites.
    if news.blocked(url): continue

    # Check for news site. Try to crawl all urls in tweets from feeds
    # (i.e. users in the `users` whitelist). Otherwise the site must be
    # in the news-site whitelist, and retweets are only followed when
    # the --retweets flag is set.
    site = news.sitename(url)
    if user not in users:
      if retweet and not flags.arg.retweets: continue
      if site not in news.sites: continue

    # Crawl URL and flush so progress is visible in redirected logs.
    print("---", user, "-", news.trim_url(url))
    crawler.crawl(url)
    sys.stdout.flush()
reddit.read_only = True # Monitor live Reddit submission stream for news articles. crawler = news.Crawler("reddit") while True: try: for submission in reddit.subreddit('all').stream.submissions(): # Ignore self submissions. if submission.is_self: continue # Discard non-news sites. if submission.over_18: continue subreddit = str(submission.subreddit) url = submission.url if news.blocked(url): continue site = news.sitename(url) if subreddit not in news_reddits: if subreddit in ignored_reddits: continue if site not in news.sites: continue # Crawl URL. domain = str(submission.domain) title = str(submission.title) print("---", domain, subreddit, "-", title) crawler.crawl(url) sys.stdout.flush() print("restart submission stream") time.sleep(20) except KeyboardInterrupt as error: