import os
import shutil
import socket
from os.path import join

import feedparser

# logger, DEFAULT_TIMEOUT, STORAGE_ROOT, timeseries_lock and the storage
# helpers used below are defined elsewhere in the module.


def crawl(feed_url):
    socket.setdefaulttimeout(DEFAULT_TIMEOUT)
    logger.info('Crawling %r...', feed_url)
    # Reuse the stored ETag/Last-Modified pair so unchanged feeds answer 304.
    etag, modified = conditional_get_state(feed_url)
    parsed = feedparser.parse(feed_url, etag=etag, modified=modified)
    if not parsed.get('status'):
        logger.warning('No status returned while crawling %s. Parsed: %r',
                       feed_url, parsed)
        return
    if parsed.status == 304:
        logger.info('Feed %s reported 304', feed_url)
        return
    if parsed.status not in (200, 301, 302):
        logger.warning('Status %s while crawling %s.', parsed.status, feed_url)
        return
    save_feed(feed_url, parsed.feed)
    with entry_urls(feed_url) as entry_url_db:
        for entry in parsed.entries:
            new_entry_url = entry_url(feed_url, entry)
            if new_entry_url.encode('utf-8') in entry_url_db:
                logger.info('%s has already been seen', new_entry_url)
                continue
            logger.info('New entry %s on %s', new_entry_url, feed_url)
            entry_url_db[new_entry_url] = entry_time(entry).isoformat() + 'Z'
            save_entry(feed_url, entry)
            with timeseries_lock:
                add_to_timeseries(slug(feed_url), entry_slug(feed_url, entry),
                                  entry_time(entry))
    save_conditional_get_state(feed_url, parsed.get('etag'),
                               parsed.get('modified'))
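# crawl() persists one (etag, modified) pair per feed for the conditional GET
# above. The two state helpers are not part of this section; the sketch below
# is one plausible implementation, and the 'http-state.json' file name and its
# location inside the feed directory are assumptions, not the original layout.
# (feed_stats() below only counts directory entries starting with '2', so a
# metadata file alongside the entry directories would be harmless.)
import json


def conditional_get_state(feed_url):
    # Return (None, None) on the first crawl or if the state file is unreadable.
    try:
        with open(join(feed_directory(feed_url), 'http-state.json')) as f:
            state = json.load(f)
        return state.get('etag'), state.get('modified')
    except (OSError, ValueError):
        return None, None


def save_conditional_get_state(feed_url, etag, modified):
    os.makedirs(feed_directory(feed_url), exist_ok=True)
    with open(join(feed_directory(feed_url), 'http-state.json'), 'w') as f:
        json.dump({'etag': etag, 'modified': modified}, f)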
def index_entry(feed_url, entry, writer):
    writer.add_document(**{
        'feed': slug(feed_url),
        'slug': entry_slug(feed_url, entry),
        'title': entry_title(entry),
        'published': entry_time(entry),
        'text': entry_text(entry),
        'authors': ' '.join(author_names(entry)),
        'tags': ','.join(tags(entry)),
    })
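# index_entry() only needs a writer with an add_document(**fields) method,
# which is the shape of a Whoosh IndexWriter; whether the index really is
# Whoosh is an assumption. Under that assumption, the fields above imply a
# schema roughly like this sketch (analyzer and storage choices are guesses):
from whoosh.fields import DATETIME, ID, KEYWORD, TEXT, Schema

SEARCH_SCHEMA = Schema(
    feed=ID(stored=True),
    slug=ID(stored=True, unique=True),
    title=TEXT(stored=True),
    published=DATETIME(stored=True, sortable=True),
    text=TEXT,                               # full entry text, searchable only
    authors=TEXT(stored=True),               # space-joined, as built above
    tags=KEYWORD(stored=True, commas=True),  # comma-joined, as built above
)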
def feed_stats(feed_url):
    try:
        listing = os.listdir(join(STORAGE_ROOT, 'feeds', slug(feed_url)))
    except OSError:
        listing = []
    # Entry directories start with an ISO timestamp, so they all begin with
    # '2'; anything else in the feed directory is metadata and is skipped.
    listing = [l for l in listing if l.startswith('2')]
    if not listing:
        return 0, None, None
    # os.listdir() returns names in arbitrary order; the timestamp prefix
    # makes lexical order chronological, so sort before picking the ends.
    listing.sort()
    return (len(listing), date_from_entry_slug(listing[0]),
            date_from_entry_slug(listing[-1]))
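# feed_stats() leans on date_from_entry_slug(), which is not shown in this
# section. Given the entry slug format built in entry_slug() below, a minimal
# sketch, assuming entry_time() yields naive whole-second datetimes:
from datetime import datetime


def date_from_entry_slug(entry_slug):
    # '2021-05-04T12:30:00Z-https-...' -> datetime(2021, 5, 4, 12, 30)
    return datetime.strptime(entry_slug[:20], '%Y-%m-%dT%H:%M:%SZ')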
def subscriptions():
    by_url = {}
    try:
        with open_file_from(STORAGE_ROOT, 'subscriptions.opml', 'rb') as f:
            for category, title, feed_url in opml_feeds(f):
                if feed_url not in by_url:
                    by_url[feed_url] = {
                        'url': feed_url,
                        'slug': slug(feed_url),
                        'title': title,
                        'categories': [],
                    }
                    # feed_stats() takes a feed URL (see above), so look the
                    # stats up once per feed, not per (category, feed) pair.
                    entry_count, first_entry, latest_entry = feed_stats(feed_url)
                    by_url[feed_url].update({
                        'entry_count': entry_count,
                        'first_entry': first_entry,
                        'latest_entry': latest_entry,
                    })
                # A feed listed under several categories collects all of them.
                by_url[feed_url]['categories'].append(category)
        return sorted(by_url.values(), key=lambda s: s['slug'])
    except OSError:
        return []
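# subscriptions() consumes opml_feeds(), which yields one
# (category, title, feed_url) triple per feed element in the OPML file and is
# not defined in this section. A sketch assuming the common one-level layout,
# where category folders are top-level <outline> elements wrapping feed
# <outline xmlUrl="..."> elements:
import xml.etree.ElementTree as ET


def opml_feeds(f):
    body = ET.parse(f).getroot().find('body')
    for top in body.findall('outline'):
        feeds = top.findall('outline')
        if feeds:
            # <outline text="Category"><outline xmlUrl="..."/>...</outline>
            for node in feeds:
                yield (top.get('text'), node.get('title') or node.get('text'),
                       node.get('xmlUrl'))
        elif top.get('xmlUrl'):
            # An uncategorised feed sitting directly under <body>.
            yield None, top.get('title') or top.get('text'), top.get('xmlUrl')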
def remove_all_from_feed(feed_url):
    shutil.rmtree(join(STORAGE_ROOT, 'feeds', slug(feed_url)))
def remove_all_from_feed(feed_url):
    # Later revision of the function above: besides deleting the feed's
    # directory, also drop its rows from the timeseries table.
    feed_slug = slug(feed_url)
    shutil.rmtree(join(STORAGE_ROOT, 'feeds', feed_slug))
    with timeseries() as ts:
        ts.execute('DELETE FROM timeseries WHERE feed = ?', [feed_slug])
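# The DELETE above shows the timeseries living in a SQLite table with a 'feed'
# column. The timeseries() context manager and the add_to_timeseries() writer
# used by crawl() are not in this section; in the sketch below the database
# file name and the 'entry'/'published' column names are assumptions.
import sqlite3
from contextlib import contextmanager


@contextmanager
def timeseries():
    conn = sqlite3.connect(join(STORAGE_ROOT, 'timeseries.db'))
    try:
        yield conn
        conn.commit()
    finally:
        conn.close()


def add_to_timeseries(feed_slug, entry_slug, when):
    with timeseries() as ts:
        ts.execute(
            'INSERT INTO timeseries (feed, entry, published) VALUES (?, ?, ?)',
            [feed_slug, entry_slug, when.isoformat() + 'Z'])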
def test_slugifying_url():
    # Host labels come out reversed (TLD first); every other run of
    # non-alphanumeric characters collapses to a single hyphen.
    assert slug('https://s.d.tld/feed.xml') == 'https-tld-d-s-feed-xml'
    assert slug('ftp://s.d.tld/feed.xml') == 'ftp-tld-d-s-feed-xml'
    assert (slug('http://subdomain.domain.tld/path-1/path-2?query=value#fragment')
            == 'http-tld-domain-subdomain-path-1-path-2-query-value-fragment')
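# slug() itself is defined elsewhere; the assertions above pin its behaviour
# down well enough to reproduce it. A sketch, not the original implementation:
import re
from urllib.parse import urlsplit


def slug(url):
    parts = urlsplit(url)
    # Scheme first, then the host labels reversed (TLD outward-in).
    pieces = [parts.scheme] + list(reversed(parts.hostname.split('.')))
    # Path, query and fragment contribute their alphanumeric runs in order.
    for component in (parts.path, parts.query, parts.fragment):
        pieces += [p for p in re.split(r'[^A-Za-z0-9]+', component) if p]
    return '-'.join(pieces)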
def entry_directory(feed_url, entry):
    return join(STORAGE_ROOT, 'feeds', slug(feed_url),
                entry_slug(feed_url, entry))
def entry_slug(feed_url, entry):
    # '<ISO timestamp>Z-<slug of the entry URL>', truncated to 255 characters
    # to stay within common file-name length limits.
    timestamp = entry_time(entry).isoformat() + 'Z'
    url_slug = slug(entry_url(feed_url, entry))
    return '-'.join([timestamp, url_slug])[:255]
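# entry_time() and entry_url() are the remaining accessors this section relies
# on without defining. A plausible sketch over a feedparser entry; the field
# preference order is an assumption:
from datetime import datetime


def entry_time(entry):
    # feedparser exposes parsed dates as time.struct_time in *_parsed fields.
    parsed = entry.get('published_parsed') or entry.get('updated_parsed')
    return datetime(*parsed[:6])


def entry_url(feed_url, entry):
    # Prefer the entry's own link, then its id, then fall back to the feed URL.
    return entry.get('link') or entry.get('id') or feed_url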
def feed_directory(feed_url):
    return join(STORAGE_ROOT, 'feeds', slug(feed_url))