# Load news site list.
news.init()

# Connect to Reddit.
with open(flags.arg.apikeys, "r") as f:
  apikeys = json.load(f)

reddit = praw.Reddit(client_id=apikeys["client_id"],
                     client_secret=apikeys["client_secret"],
                     user_agent=apikeys["user_agent"],
                     check_for_updates=False)
reddit.read_only = True

# Monitor live Reddit submission stream for news articles.
crawler = news.Crawler("reddit")
while True:
  try:
    for submission in reddit.subreddit('all').stream.submissions():
      # Ignore self submissions.
      if submission.is_self: continue

      # Discard non-news sites.
      if submission.over_18: continue
      subreddit = str(submission.subreddit)
      url = submission.url
      if news.blocked(url): continue
      site = news.sitename(url)
      if subreddit not in news_reddits:
        if subreddit in ignored_reddits: continue
        if site not in news.sites: continue
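# Assumption: news_reddits and ignored_reddits are sets of subreddit names
# defined earlier in the script, along the lines of:
#
#   news_reddits = set(...)      # subreddits treated as news sources
#   ignored_reddits = set(...)   # subreddits that are always skipped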
except Exception as e:
  print("*** XML parse error:", e, "in parsing news feed")
  sys.exit(1)

if flags.arg.newsites:
  # Check for unknown news sites.
  news.init()
  newsites = collections.defaultdict(int)
  for item in root.iter("item"):
    child = item.find("link")
    if child is None: continue
    url = child.text
    if url == "https://newslookup.com/": continue
    site = news.sitename(url)
    if site not in news.sites:
      newsites[site] += 1
  for site in sorted(newsites, key=newsites.get, reverse=True):
    print(newsites[site], site)
else:
  # Fetch articles.
  crawler = news.Crawler("newslookup")
  for item in root.iter("item"):
    child = item.find("link")
    if child is None: continue
    url = child.text
    if url == "https://newslookup.com/": continue
    crawler.crawl(url)
  crawler.wait()
  crawler.dumpstats()
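# The loops above only rely on each feed <item> having a <link> child with the
# article URL. An illustrative (hypothetical) item from the newslookup feed:
#
#   <item>
#     <title>Example headline</title>
#     <link>https://news.example.com/2021/03/story.html</link>
#   </item>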
for domain, site in news.sites.items():
  if site.twitter != None:
    users.add(site.twitter.lower()[1:])
    if site.twitter in user_cache:
      feeds.add(user_cache[site.twitter])
    else:
      try:
        user = api.get_user(site.twitter)
        feeds.add(str(user.id))
        print(site.twitter, user.id)
      except Exception as e:
        print("Ignore bad feed for domain", domain, ":", site.twitter, e)

# Initialize news crawler.
crawler = news.Crawler("twitter")

def collect_urls(obj, urls):
  if "entities" in obj:
    entities = obj["entities"]
    for url in entities["urls"]:
      expanded_url = url["expanded_url"]
      if expanded_url.startswith("https://twitter.com/"): continue
      if expanded_url.startswith("https://www.twitter.com/"): continue
      if expanded_url.startswith("https://mobile.twitter.com/"): continue
      urls.add(expanded_url)
  if "retweeted_status" in obj:
    retweet = obj["retweeted_status"]
    collect_urls(retweet, urls)
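# collect_urls only touches a few fields of the tweet object. An illustrative
# (hypothetical) payload in the entities format expected above:
#
#   {
#     "entities": {
#       "urls": [{"expanded_url": "https://news.example.com/story.html"}]
#     },
#     "retweeted_status": { ...same structure for the original tweet... }
#   }
#
# Links that merely point back to twitter.com are skipped, and URLs from the
# retweeted tweet are collected recursively.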
Fetch news articles and put them into the news archive.
"""

import requests

import sling
import sling.flags as flags
import sling.crawl.news as news

flags.define("--urls",
             help="File with urls to fetch",
             default=None,
             metavar="FILE")

flags.define("url",
             nargs="*",
             help="Article URLs to fetch",
             metavar="URL")

flags.parse()

news.init()
crawler = news.Crawler("fetch")

for url in flags.arg.url:
  crawler.crawl(url)

if flags.arg.urls:
  with open(flags.arg.urls) as f:
    for url in f.readlines():
      crawler.crawl(url.strip())

crawler.wait()
crawler.dumpstats()
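# Example invocations (the script name is assumed; the flags are the ones
# defined above):
#
#   python3 fetch-news.py https://news.example.com/story.html
#   python3 fetch-news.py --urls urls.txt
#
# where urls.txt contains one article URL per line.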
if text == None: return "" return text.strip().replace("\n", " ") def get_atom_element(e, tag): child = e.find(tag) if child == None: child = e.find("{http://www.w3.org/2005/Atom}" + tag) if child == None: return "" text = child.text if text == None: return "" return text.strip().replace("\n", " ") # Initialize news crawler. crawler = news.Crawler("rss") # Read RSS news feeds. feeds = {} f = open(flags.arg.feeds, "r") rsssession = requests.Session() for line in f.readlines(): line = line.strip() if len(line) == 0 or line[0] == "#": continue fields = line.split(" ") site = fields[0] rss = fields[1] print("=== RSS feed", rss) # Fetch RSS feed. try: