def crawl_post(post, total, append=True):
    """Download the list of users who favorited *post* and store it as CSV rows.

    Args:
        post: dict with at least an 'id' key (a post record from the index crawl).
        total: unused; kept for backward compatibility with existing callers.
        append: passed through to store_csv — append to fav_file instead of truncating.

    Side effects: writes rows {'id', 'username'} to fav_file; prints a
    diagnostic line when the download fails.
    """
    success, payload, code = download(source_uri % post['id'])
    if not success:
        # genuine transport/HTTP failure — report it with the status code
        print('post', post['id'], 'failed with code', code)
        return
    if not payload:
        # request succeeded but returned no favorite data; the original code
        # misreported this as a failure "with code" (the success code)
        return
    # 'favorited_users' is a comma-separated string; drop empty fragments so
    # an empty list doesn't produce a row with a blank username
    rows = [{'id': post['id'], 'username': name}
            for name in payload['favorited_users'].split(',') if name]
    store_csv(rows, fav_file, append, ['id', 'username'])
# Crawl the complete e621 tag index, page by page, into ../data/tags.csv.
from _connection import download
from _storage import store_csv
from tqdm import tqdm

source_uri = 'https://e621.net/tag/index.json?limit=500&order=name&page=%d'
target_uri = '../data/tags.csv'

seen_ids = set()    # ids of the previous RAW page, for dedup of page overlap
page = 0
first_write = True  # the first successful write truncates the target file
with tqdm(desc='Crawling') as bar:
    while True:
        page += 1
        bar.update(1)
        success, tags, code = download(source_uri % page)
        if success and tags:
            # drop entries that already appeared on the previous page
            new_tags = [tag for tag in tags if tag['id'] not in seen_ids]
            # Remember the WHOLE raw page, not just the filtered subset:
            # the original rebuilt this from the filtered list, so a fully
            # overlapping page emptied the dedup set and let duplicates
            # through on the next iteration. A set also makes the
            # membership test above O(1).
            seen_ids = {tag['id'] for tag in tags}
            if new_tags:
                # append only after the first successful write; the original
                # used append=(page > 1), which appended to a stale file
                # whenever page 1 itself failed
                store_csv(new_tags, target_uri, append=not first_write,
                          fields=['id', 'name', 'count', 'type'])
                first_write = False
        elif success:
            # empty page: the tag index is exhausted
            break
        else:
            # report and skip the failed page (NOTE: its tags are lost,
            # matching the original behavior)
            print('\nFailed to access page %d, code %d' % (page, code))
from _connection import download from _storage import store_csv from tqdm import tqdm source_uri = 'https://e621.net/artist/index.json?page=%d&limit=100' info_target_uri = '../data/artists.info.csv' urls_target_uri = '../data/artists.urls.csv' last_ids = [] page = 0 with tqdm(desc='Crawling') as bar: while True: page += 1 bar.update(1) success, artists, code = download(source_uri % page) if success and artists: info = [a for a in artists if a['id'] not in last_ids] urls = [{'id': a['id'], 'url': u} for a in info for u in a['urls']] last_ids = [artist['id'] for artist in info] store_csv(info, info_target_uri, append=(page > 1), fields=[ 'id', 'name', 'other_names', 'group_name', 'is_active', 'version', 'updater_id' ]) store_csv(urls, urls_target_uri, append=(page > 1),
from _storage import store_csv from datetime import datetime import sys from tqdm import tqdm source_uri = 'https://e621.net/user/index.json?page=%d' target_uri = '../data/users.csv' page = int(sys.argv[1]) - 1 if len(sys.argv) > 1 else 0 last = int(sys.argv[2]) if len(sys.argv) > 2 else float('inf') with tqdm(desc='Crawling') as bar: while True: page += 1 bar.update(1) success, users, code = download(source_uri % page) if success and users: for user in users: # flatten user['stats'] into user user.update(user['stats']) # reformat timestamp user['created_at'] = datetime.strptime( user['created_at'], '%Y-%m-%d %H:%M').strftime('%s') users = [user for user in users if user['id'] < last] last = users[-1]['id'] if users else last store_csv(users, target_uri, append=(page > 1),
# Crawl all approved e621 tag aliases, page by page, into ../data/aliases.csv.
from _connection import download
from _storage import store_csv
from tqdm import tqdm

source_uri = 'https://e621.net/tag_alias/index.json?approved=true&page=%d'
target_uri = '../data/aliases.csv'

seen_ids = set()    # ids of the previous RAW page, for dedup of page overlap
page = 0
first_write = True  # the first successful write truncates the target file
with tqdm(desc='Crawling') as bar:
    while True:
        page += 1
        bar.update(1)
        success, aliases, code = download(source_uri % page)
        if success and aliases:
            # drop entries that already appeared on the previous page
            new_aliases = [a for a in aliases if a['id'] not in seen_ids]
            # Remember the WHOLE raw page, not just the filtered subset:
            # the original rebuilt this from the filtered list, so a fully
            # overlapping page emptied the dedup set and let duplicates
            # through on the next iteration. A set also makes the
            # membership test above O(1).
            seen_ids = {a['id'] for a in aliases}
            if new_aliases:
                # append only after the first successful write; the original
                # used append=(page > 1), which appended to a stale file
                # whenever page 1 itself failed
                store_csv(new_aliases, target_uri, append=not first_write,
                          fields=['id', 'name', 'alias_id'])
                first_write = False
        elif success:
            # empty page: the alias index is exhausted
            break
        else:
            # report and skip the failed page (NOTE: its aliases are lost,
            # matching the original behavior)
            print('\nFailed to access page %d, code %d' % (page, code))
from tqdm import tqdm source_uri = 'https://e621.net/post/index.json?limit=320&before_id=%s' target_uri = { 'kpi': '../data/posts.kpi.csv', 'content': '../data/posts.content.csv', 'info': '../data/posts.info.csv', 'tags': '../data/posts.tags.csv', 'artists': '../data/posts.artists.csv' } last_id = '' with tqdm(desc='Crawling') as bar: while True: bar.update(1) success, posts, code = download(source_uri % last_id) if success and posts: for post in posts: # do some preprocessing so the output is more useful post['created_at'] = post['created_at']['s'] post['description'] = post['description'].replace('\n', ' ') artists = [{ 'id': post['id'], 'artist': artist } for post in posts for artist in post['artist']] tags = [{ 'id': post['id'], 'tag': tag } for post in posts for tag in post['tags'].split()]