def scrape():
    vprint('working from %s' % getcwd())

    # Load existing data and drop entries older than the retention window.
    handler = DataHandler(OUTPUT)
    handler.read_data()
    handler.remove_old_entries(DAYS_TO_KEEP)

    added, per_url = 0, 0
    for url in get_file_items(URLS_PATH):
        vprint('requesting feed from %s...' % url)
        feed = parse(url)
        if not feed or 'status' not in feed:
            tsprint('error: unable to reach target url. aborting.')
            exit(-1)
        status = feed['status']
        if status != 200:
            tsprint('error: request from %s responded with error code %s. '
                    'skipping...' % (url, status))
            continue
        vprint('status returned normal. scanning entries...')

        for entry in feed['entries']:
            try:
                id_ = entry['id']

                # Prefer the published date; fall back to the updated date.
                published = None
                if 'published_parsed' in entry:
                    published = entry['published_parsed']
                elif 'updated_parsed' in entry:
                    published = entry['updated_parsed']
                else:
                    print('entries may not have dates. skipping...')
                    break

                # Skip entries that are too old or already stored.
                if not is_new(published) or handler.entry_exists(id_):
                    continue

                # Reddit feeds and untagged entries fall back to the title.
                tags = []
                if ('www.reddit.com' in url or 'tags' not in entry
                        or not entry['tags']):
                    tags = clean_unique_tags([entry['title']])
                    if per_url == 0:
                        vprint('no tags for entries, using title instead:'
                               '\n %s' % tags)
                else:
                    tags = clean_unique_tags(
                        [tag['term'] for tag in entry['tags']])
                if not tags:
                    continue

                # Reddit entries point at the comments page; pull the direct
                # link out of the summary markup when present.
                extras = {}
                if 'www.reddit.com' in url and 'summary' in entry:
                    m = search(r'href="(\S+)">\[link\]', entry['summary'])
                    if m:
                        extras = {"dlink": m.group(1)}

                handler.add_entry(url, id_, clean_entry(entry, tags, **extras))
                added += 1
                per_url += 1
            except KeyError as e:
                print('%s\nskipping...' % e)
                per_url = 0
                break

        vprint('got %s entries from %s' % (per_url, url))
        per_url = 0

    handler.write_data()
    tsprint('added %s new entries' % added)
    tsprint('%s entries total' % len(handler.get_all_entries()))