Example #1: crawling news links from a Twitter status stream
    def on_status(self, status):
        # Check for retweet.
        retweet = status.text.startswith("RT @")

        # Collect all URLs mentioned in the tweet.
        urls = set()
        collect_urls(status._json, urls)

        # Normalize the tweeting user's screen name.
        user = status.user.screen_name.lower()
        for url in urls:
            # Check for blocked sites.
            if news.blocked(url): continue

            # Check for news site. Try to crawl all urls in tweets from feeds.
            # Otherwise the site must be in the whitelist.
            site = news.sitename(url)
            if user not in users:
                if retweet and not flags.arg.retweets: continue
                if site not in news.sites: continue

            # Crawl URL.
            print("---", user, "-", news.trim_url(url))
            crawler.crawl(url)
            sys.stdout.flush()
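
The on_status handler above is a method of a Tweepy stream listener, and collect_urls is a helper that is not shown in this excerpt. Below is a minimal sketch of both, assuming the pre-4.0 Tweepy streaming API; the helper body, class name, credential placeholders, and feed ids are assumptions for illustration, not the original implementation.

import tweepy

def collect_urls(obj, urls):
    # Hypothetical helper: recursively walk the tweet JSON and gather
    # every "expanded_url" value into the urls set.
    if isinstance(obj, dict):
        if "expanded_url" in obj: urls.add(obj["expanded_url"])
        for value in obj.values(): collect_urls(value, urls)
    elif isinstance(obj, list):
        for item in obj: collect_urls(item, urls)

class NewsListener(tweepy.StreamListener):
    def on_status(self, status):
        ...  # body as shown above

# Placeholder credentials; fill in real API keys.
auth = tweepy.OAuthHandler("CONSUMER_KEY", "CONSUMER_SECRET")
auth.set_access_token("ACCESS_TOKEN", "ACCESS_TOKEN_SECRET")
stream = tweepy.Stream(auth=auth, listener=NewsListener())
stream.filter(follow=["12345"])  # user ids of news feeds to follow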
Example #2: crawling news links from the Reddit submission stream
# Access the Reddit API in read-only mode.
reddit.read_only = True

# Monitor live Reddit submission stream for news articles.
crawler = news.Crawler("reddit")
while True:
    try:
        for submission in reddit.subreddit('all').stream.submissions():
            # Ignore self posts, which have no external link to crawl.
            if submission.is_self: continue

            # Discard NSFW submissions.
            if submission.over_18: continue

            # Check for blocked sites. Crawl all links from news subreddits.
            # Otherwise skip ignored subreddits and require the site to be
            # in the whitelist.
            subreddit = str(submission.subreddit)
            url = submission.url
            if news.blocked(url): continue
            site = news.sitename(url)
            if subreddit not in news_reddits:
                if subreddit in ignored_reddits: continue
                if site not in news.sites: continue

            # Crawl URL.
            domain = str(submission.domain)
            title = str(submission.title)
            print("---", domain, subreddit, "-", title)
            crawler.crawl(url)
            sys.stdout.flush()

        print("restart submission stream")
        time.sleep(20)

    except KeyboardInterrupt:
        # Stop cleanly on Ctrl-C (minimal completion; the original
        # handler body is cut off in this excerpt).
        break
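
This example assumes a configured PRAW client plus the news helper module and subreddit sets used above. A minimal sketch of that setup, with placeholder credentials and hypothetical subreddit lists:

import sys
import time
import praw

# Placeholder credentials; register a script app at reddit.com/prefs/apps.
reddit = praw.Reddit(client_id="CLIENT_ID",
                     client_secret="CLIENT_SECRET",
                     user_agent="news-crawler")

# Hypothetical subreddit sets assumed by the loop above: subreddits whose
# links are always crawled, and subreddits that are always skipped.
news_reddits = {"news", "worldnews", "politics"}
ignored_reddits = {"AskReddit", "memes"}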