Example #1
def crawl(feed_url):
    socket.setdefaulttimeout(DEFAULT_TIMEOUT)
    logger.info('Crawling %r...', feed_url)

    etag, modified = conditional_get_state(feed_url)
    parsed = feedparser.parse(feed_url, etag=etag, modified=modified)
    if not parsed.get('status'):
        logger.warning('No status returned while crawling %s. Parsed: %r', feed_url, parsed)
        return

    if parsed.status == 304:
        logger.info('Feed %s reported 304', feed_url)
        return

    if parsed.status not in (200, 301, 302):
        logger.warning('Status %s while crawling %s.', parsed.status, feed_url)
        return

    save_feed(feed_url, parsed.feed)

    with entry_urls(feed_url) as entry_url_db:
        for entry in parsed.entries:
            new_entry_url = entry_url(feed_url, entry)
            if new_entry_url.encode('utf-8') in entry_url_db:
                logger.info('%s has already been seen', new_entry_url)
                continue

            logger.info('New entry %s on %s', new_entry_url, feed_url)
            entry_url_db[new_entry_url] = entry_time(entry).isoformat() + 'Z'
            save_entry(feed_url, entry)
            with timeseries_lock:
                add_to_timeseries(slug(feed_url), entry_slug(feed_url, entry), entry_time(entry))

    save_conditional_get_state(feed_url, parsed.get('etag'), parsed.get('modified'))
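
crawl() leans on two helpers for HTTP conditional GETs that the excerpt doesn't show. A minimal sketch of what they could look like, assuming the (etag, modified) pair is stored as JSON under the feed's slug; the file name and layout here are assumptions, not the project's actual storage scheme:

import json
import os
from os.path import join

def _state_path(feed_url):
    # Hypothetical location for the saved validators.
    return join(STORAGE_ROOT, 'feeds', slug(feed_url), 'http-state.json')

def conditional_get_state(feed_url):
    # Return the (etag, modified) pair from the last crawl,
    # or (None, None) when the feed has never been fetched.
    try:
        with open(_state_path(feed_url)) as f:
            state = json.load(f)
        return state.get('etag'), state.get('modified')
    except (OSError, ValueError):
        return None, None

def save_conditional_get_state(feed_url, etag, modified):
    # Persist whatever validators the server sent so the next crawl
    # can issue a conditional request and hope for a 304.
    path = _state_path(feed_url)
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'w') as f:
        json.dump({'etag': etag, 'modified': modified}, f)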
Example #2
def index_entry(feed_url, entry, writer):
    writer.add_document(**{
        'feed': slug(feed_url),
        'slug': entry_slug(feed_url, entry),
        'title': entry_title(entry),
        'published': entry_time(entry),
        'text': entry_text(entry),
        'authors': ' '.join(author_names(entry)),
        'tags': ','.join(tags(entry)),
    })
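
writer.add_document(**fields) matches Whoosh's IndexWriter API, so the writer here is presumably a Whoosh writer. A usage sketch under that assumption; the schema is a guess shaped to the keys index_entry passes, and 'index-dir' is a made-up path:

from whoosh.fields import Schema, ID, TEXT, DATETIME, KEYWORD
from whoosh.index import create_in

schema = Schema(
    feed=ID(stored=True),
    slug=ID(stored=True, unique=True),
    title=TEXT(stored=True),
    published=DATETIME(stored=True),
    text=TEXT,
    authors=TEXT(stored=True),
    tags=KEYWORD(stored=True, commas=True),  # index_entry joins tags with ','
)

ix = create_in('index-dir', schema)  # the directory must already exist
writer = ix.writer()
for entry in parsed.entries:  # e.g. the feedparser result from crawl()
    index_entry(feed_url, entry, writer)
writer.commit()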
Example #3
def feed_stats(feed_url):
    try:
        listing = os.listdir(join(STORAGE_ROOT, 'feeds', slug(feed_url)))
    except OSError:
        listing = []

    # Keep only entry directories (names start with an ISO timestamp,
    # i.e. a 2xxx year) and sort them: os.listdir() order is arbitrary,
    # and the timestamp prefix makes lexicographic order chronological.
    listing = sorted(l for l in listing if l.startswith('2'))

    if not listing:
        return 0, None, None

    return len(listing), date_from_entry_slug(listing[0]), date_from_entry_slug(listing[-1])
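
date_from_entry_slug isn't shown, but entry_slug (Example #9) prefixes every directory name with an ISO-8601 timestamp plus a trailing 'Z', so a plausible sketch is to parse that prefix back; the 20-character slice assumes second resolution, which is what isoformat() produces for whole-second datetimes:

from datetime import datetime

def date_from_entry_slug(entry_slug):
    # Directory names look like '2021-06-05T14:30:00Z-<url-slug>';
    # the timestamp occupies the first 20 characters.
    return datetime.strptime(entry_slug[:20], '%Y-%m-%dT%H:%M:%SZ')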
Example #4
def subscriptions():
    feed_stats_by_slug = feed_stats()
    by_url = {}
    try:
        with open_file_from(STORAGE_ROOT, 'subscriptions.opml', 'rb') as f:
            for category, title, feed_url in opml_feeds(f):
                if feed_url not in by_url:
                    by_url[feed_url] = {
                        'url': feed_url,
                        'slug': slug(feed_url),
                        'title': title,
                        'categories': []
                    }

                    entry_count, first_entry, latest_entry = feed_stats_by_slug[slug(feed_url)]
                    by_url[feed_url].update({
                        'entry_count': entry_count,
                        'first_entry': first_entry,
                        'latest_entry': latest_entry
                    })
                by_url[feed_url]['categories'].append(category)
        return sorted(by_url.values(), key=lambda s: s['slug'])
    except OSError:
        return []
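
opml_feeds is another helper the excerpt assumes; judging by the unpacking, it yields (category, title, feed_url) triples from an OPML subscription list. A minimal sketch with xml.etree.ElementTree, assuming at most one level of category outlines; real OPML files can nest deeper:

import xml.etree.ElementTree as ET

def opml_feeds(f):
    # Yield (category, title, feed_url) for every feed outline.
    body = ET.parse(f).getroot().find('body')
    for top in body.findall('outline'):
        if top.get('xmlUrl'):
            # A feed at the top level has no category.
            yield '', top.get('title') or top.get('text'), top.get('xmlUrl')
        else:
            # A category outline: the feeds are its descendants.
            category = top.get('title') or top.get('text') or ''
            for child in top.findall('.//outline[@xmlUrl]'):
                yield category, child.get('title') or child.get('text'), child.get('xmlUrl')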
Example #5
def remove_all_from_feed(feed_url):
    shutil.rmtree(join(STORAGE_ROOT, 'feeds', slug(feed_url)))
Example #6
def remove_all_from_feed(feed_url):
    feed_slug = slug(feed_url)
    shutil.rmtree(join(STORAGE_ROOT, 'feeds', feed_slug))
    with timeseries() as ts:
        ts.execute('DELETE FROM timeseries WHERE feed = ?', [feed_slug])
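
timeseries() is used as a context manager whose value has an execute() method, which fits a sqlite3 connection. A sketch under that assumption; the database file name is made up:

import sqlite3
from contextlib import contextmanager
from os.path import join

@contextmanager
def timeseries():
    # sqlite3 connections expose execute() directly; commit on success,
    # close unconditionally.
    conn = sqlite3.connect(join(STORAGE_ROOT, 'timeseries.db'))  # assumed path
    try:
        yield conn
        conn.commit()
    finally:
        conn.close()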
Example #7
def test_slugifying_url():
    assert slug('https://s.d.tld/feed.xml') == 'https-tld-d-s-feed-xml'
    assert slug('ftp://s.d.tld/feed.xml') == 'ftp-tld-d-s-feed-xml'
    assert (slug('http://subdomain.domain.tld/path-1/path-2?query=value#fragment') ==
            'http-tld-domain-subdomain-path-1-path-2-query-value-fragment')
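
These asserts pin slug() down fairly tightly: scheme first, then the host labels in reverse order, then path, query, and fragment, with every run of non-alphanumeric characters collapsed into a single hyphen. One implementation that passes all three asserts (the project's real one may differ in details):

import re
from urllib.parse import urlsplit

def slug(url):
    parts = urlsplit(url)
    # 's.d.tld' becomes 'tld-d-s'.
    host = '-'.join(reversed(parts.hostname.split('.')))
    rest = '-'.join(p for p in (parts.path, parts.query, parts.fragment) if p)
    pieces = '-'.join(p for p in (parts.scheme, host, rest) if p)
    # Collapse each run of non-alphanumeric characters into one hyphen.
    return re.sub(r'[^a-z0-9]+', '-', pieces.lower()).strip('-')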
Example #8
def entry_directory(feed_url, entry):
    return join(STORAGE_ROOT, 'feeds', slug(feed_url), entry_slug(feed_url, entry))
Example #9
def entry_slug(feed_url, entry):
    # Directory name: ISO timestamp first, then the slug of the entry's
    # URL, truncated to 255 characters (the usual filesystem name limit).
    timestamp = entry_time(entry).isoformat() + 'Z'
    url_slug = slug(entry_url(feed_url, entry))
    return '-'.join([timestamp, url_slug])[:255]
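
entry_time appears throughout the excerpts (Examples #1, #2, and here) but is never defined. With feedparser entries, the usual approach is to prefer published_parsed and fall back to updated_parsed, both time.struct_time values; a sketch along those lines, where the final fallback to the current time is purely an assumption:

from datetime import datetime

def entry_time(entry):
    # feedparser exposes parsed dates as time.struct_time under
    # published_parsed / updated_parsed.
    parsed = entry.get('published_parsed') or entry.get('updated_parsed')
    if parsed is None:
        return datetime.utcnow()  # assumed fallback, not from the source
    return datetime(*parsed[:6])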
Example #10
def feed_directory(feed_url):
    return join(STORAGE_ROOT, 'feeds', slug(feed_url))