Example #1
from os import getcwd
from re import search
from sys import exit

from feedparser import parse  # the status/entries/*_parsed fields match feedparser's output

# DataHandler, vprint, tsprint, get_file_items, clean_unique_tags, clean_entry,
# is_new and the OUTPUT, DAYS_TO_KEEP, URLS_PATH constants are project-local
# (a sketch of some of them follows the listing).


def scrape():
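    """Poll each feed URL, store the entries that are new, and report totals."""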
    vprint('working from %s' % getcwd())
    handler = DataHandler(OUTPUT)
    handler.read_data()
    handler.remove_old_entries(DAYS_TO_KEEP)
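    # added counts new entries overall; per_url counts them for the current feed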
    added, per_url = 0, 0
    for url in get_file_items(URLS_PATH):
        vprint('requesting feed from %s...' % url)
        feed = parse(url)
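        # a missing 'status' usually means the fetch itself failed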
        if not feed or 'status' not in feed:
            tsprint('error: unable to reach target url. aborting.')
            exit(-1)
        status = feed['status']
        if status != 200:
            tsprint(
                'error: request from %s responded with error code %s. skipping...'
                % (url, status))
            continue
        vprint('status returned normal. scanning entries...')
        for entry in feed['entries']:
            try:
                id_ = entry['id']
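                # prefer the published date, falling back to the updated date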
                published = None
                if 'published_parsed' in entry:
                    published = entry['published_parsed']
                elif 'updated_parsed' in entry:
                    published = entry['updated_parsed']
                else:
                    vprint('entries may not have dates. skipping feed...')
                    break
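                # skip entries that are too old or already stored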
                if not is_new(published) or handler.entry_exists(id_):
                    continue
                tags = []
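                # reddit feeds (and feeds without usable tags) get tags from the title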
                if ('www.reddit.com' in url or 'tags' not in entry
                        or not entry['tags']):
                    tags = clean_unique_tags([entry['title']])
                    if per_url == 0:
                        vprint(
                            'no tags for entries, using title instead:\n  %s' %
                            tags)
                else:
                    tags = clean_unique_tags(
                        [tag['term'] for tag in entry['tags']])
                if not tags:
                    continue
                extras = {}
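                # reddit summaries embed the outbound link; keep it as dlink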
                if 'www.reddit.com' in url and 'summary' in entry:
                    m = search(r'href="(\S+)">\[link\]', entry['summary'])
                    if m:
                        extras = {'dlink': m.group(1)}
                handler.add_entry(url, id_, clean_entry(entry, tags, **extras))
                added += 1
                per_url += 1
            except KeyError as e:
                tsprint('entry missing key %s. skipping feed...' % e)
                break
        vprint('got %s entries from %s' % (per_url, url))
        per_url = 0
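    # persist everything in one write after all feeds are processed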
    handler.write_data()
    tsprint('added %s new entries' % added)
    tsprint('%s entries total' % len(handler.get_all_entries()))
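
The listing relies on project-local helpers that are not shown. Below is a minimal sketch of what the simpler ones might look like; every name, flag, and default here is inferred from how scrape() uses them, not taken from the author's code (the VERBOSE flag and the 30-day window are assumptions).

import calendar
from datetime import datetime, timedelta

VERBOSE = True      # hypothetical flag gating vprint() output
DAYS_TO_KEEP = 30   # hypothetical retention window, in days


def vprint(msg):
    """Print only when verbose output is enabled."""
    if VERBOSE:
        print(msg)


def tsprint(msg):
    """Print with a timestamp prefix."""
    print('[%s] %s' % (datetime.now().strftime('%Y-%m-%d %H:%M:%S'), msg))


def is_new(published):
    """True if a UTC time.struct_time falls within the retention window."""
    cutoff = datetime.utcnow() - timedelta(days=DAYS_TO_KEEP)
    return datetime.utcfromtimestamp(calendar.timegm(published)) >= cutoff


def get_file_items(path):
    """Yield non-empty, stripped lines from a text file."""
    with open(path) as f:
        for line in f:
            line = line.strip()
            if line:
                yield line

DataHandler, clean_unique_tags, and clean_entry carry real project logic (storage format, tag normalization), so they are left out rather than guessed at.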