def parse_row(row):
    # Each ship row holds eleven cells: id, name, rarity, type, affiliation,
    # then the six base stats.
    tds = [td.text for td in row.select("td")]
    # `thumbnails` must be defined at module scope (a soup of the icon
    # gallery); the ship's icon is looked up by its name.
    img = thumbnails.select_one(f'a[title="{tds[1]}"] img')
    data = {
        'id': tds[0],
        'name': tds[1],
        'rarity': tds[2],
        'type': tds[3],
        'affiliation': tds[4],
        'stats': {
            'firepower': tds[5],
            'health': tds[6],
            'antiAir': tds[7],
            'speed': tds[8],
            'airPower': tds[9],
            'torpedo': tds[10]
        },
        'url': build_url(row.select_one('a')['href']),
        # Prefer the first srcset candidate when one exists, else plain src.
        'thumbnail': build_url(
            img['srcset'].split()[0] if img.has_attr('srcset') else img['src'])
    }
    return data
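# parse_row reads a module-level `thumbnails` soup for its icon lookup, and
# assumes build_url is imported from shared as in the scrapers below. A
# minimal wiring sketch, assuming the icon gallery sits on the same
# List_of_Ships page; the '.gallery' selector is a hypothetical placeholder,
# not confirmed by this code.
if __name__ == '__main__':
    from requests import get
    from bs4 import BeautifulSoup

    page = BeautifulSoup(
        get('https://azurlane.koumakan.jp/List_of_Ships').text, 'lxml')
    thumbnails = page.select_one('.gallery')  # hypothetical selector
    ships = [
        parse_row(row)
        for row in page.select('.wikitable tr')
        if not row.find('th')  # header rows carry no data cells
    ]
    print(ships[0])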
def extract_pictures(html):
    return {
        'images': [
            extract_skin(tab)
            for tab in html.select('div.shiparttabbernew .tabbertab')
        ],
        'icon': build_url(html.select_one('img')['src'])
        if html.select_one('img') else '',
        'chibi': build_url(html.select_one('#talkingchibi img')['src'])
        if html.select_one('#talkingchibi img') else ''
    }
def extract_skin(tab):
    # Fall back to plain src when no srcset is present, mirroring the guard
    # used in parse_row and extract_picture; the unguarded lookup raised a
    # KeyError on skins without a srcset.
    img = tab.select_one('img')
    image_path = img['srcset'].split(' ')[-2] if img.has_attr(
        'srcset') else img['src']
    return {'name': tab['title'], 'url': build_url(image_path)}
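# A quick usage sketch for the two extractors above, assuming they receive the
# parsed page of a single ship; the '/Javelin' path is illustrative only.
if __name__ == '__main__':
    from requests import get
    from bs4 import BeautifulSoup

    ship_page = BeautifulSoup(get(build_url('/Javelin')).text, 'lxml')
    print(extract_pictures(ship_page))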
from requests import get
from shared import save_fixture, build_url
from bs4 import BeautifulSoup

# The Equipment_List page links out to one page per equipment category.
html = get('https://azurlane.koumakan.jp/Equipment_List').text
fp = BeautifulSoup(html, 'lxml')
category_urls = [link['href'] for link in fp.find('ul').select('a')]

for category_url in category_urls:
    category_html = get(build_url(category_url)).text
    table = BeautifulSoup(category_html, 'lxml').select_one('.tabbertab table')
    # Deduplicate item paths; header rows (containing <th>) carry no links.
    paths = {
        row.find('a')['href']
        for row in table.find_all('tr') if not row.find('th')
    }
    for path in paths:
        name = path[1:]
        # Skip index.php links (typically red links to missing pages).
        if 'w/index.php?' in name:
            continue
        print(name)
        save_fixture('equipment', name, get(build_url(path)).text)
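# The scripts here import build_url and save_fixture from a local shared
# module that is not shown. A plausible sketch, assuming protocol-relative
# image URLs and a fixtures/<kind>/<name>.html layout (both are assumptions
# about the project, not confirmed by the source):
import os

BASE_URL = 'https://azurlane.koumakan.jp'

def build_url(path):
    # Absolute URLs pass through; protocol-relative ones (common in srcset)
    # get a scheme; site-relative paths get the wiki root prepended.
    if path.startswith('http'):
        return path
    if path.startswith('//'):
        return 'https:' + path
    return BASE_URL + path

def save_fixture(kind, name, text):
    # Persist the raw HTML under fixtures/<kind>/ for offline parsing.
    directory = os.path.join('fixtures', kind)
    os.makedirs(directory, exist_ok=True)
    with open(os.path.join(directory, name + '.html'), 'w') as f:
        f.write(text)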
def extract_picture(html):
    img = html.find('img')
    path = img['srcset'].split(' ')[-2] if img.has_attr(
        'srcset') else img['src']
    return build_url(path)
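# Why split(' ')[-2]: a srcset reads "url1 1x, url2 2x", so the second-to-last
# space-separated token is the URL of the densest candidate. A quick check
# against synthetic markup (illustrative only):
if __name__ == '__main__':
    from bs4 import BeautifulSoup

    tag = BeautifulSoup(
        '<img src="/a.png" srcset="/a.png 1x, /a@2x.png 2x">', 'lxml')
    print(extract_picture(tag))  # build_url('/a@2x.png')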
from requests import get
from shared import save_fixture, build_url
from bs4 import BeautifulSoup

# Fetch the full ship index, then save each ship's page as a fixture.
html = get('https://azurlane.koumakan.jp/List_of_Ships').text
fp = BeautifulSoup(html, 'lxml')
rows = fp.select('.mw-parser-output .wikitable tr')
# Header rows (containing <th>) carry no ship link, so skip them.
urls = [build_url(row.find('a')['href']) for row in rows if not row.find('th')]

for url in urls:
    name = url.split('/')[-1]
    print(name)
    save_fixture('ships_long', name, get(url).text)
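# Once the fixtures are saved, extractors like extract_pictures above can run
# offline. A sketch, assuming save_fixture writes files under
# fixtures/ships_long/ (the layout is an assumption about shared.save_fixture,
# and extract_pictures is assumed to be importable from its extractor module):
import os

fixture_dir = os.path.join('fixtures', 'ships_long')
for filename in os.listdir(fixture_dir):
    with open(os.path.join(fixture_dir, filename)) as f:
        ship_page = BeautifulSoup(f.read(), 'lxml')
    print(filename, extract_pictures(ship_page))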