def scrape_tables(table: bs4.element.Tag):
    """Scrape one HTML table into a summary dict.

    The first ``<tr>`` supplies the category text, the second ``<tr>`` the
    column headers, and every ``<tr>`` from index 3 onwards is passed to
    ``_multi_process_scrape`` together with the shared accumulators. The row
    at index 2 is skipped.

    Args:
        table: The table element whose rows are scraped.

    Returns:
        A dict with the keys ``grand_total_beds``, ``grand_occupied_beds``,
        ``grand_available_beds``, ``category``, ``columns`` (a JSON string),
        ``records`` (a JSON string) and ``hid``.

    Raises:
        IndexError: If the table has fewer than two ``<tr>`` rows.
        ValueError: If ``records`` contains NaN or infinite floats, because
            serialisation uses ``allow_nan=False``.
    """
    # Collect the rows once instead of re-traversing the table per access.
    rows = table.findAll("tr")

    category = sanitize_str(rows[0].text.strip())
    columns = json.dumps(
        [sanitize_str(text) for text in rows[1].stripped_strings],
        allow_nan=False,
    )

    # Single-element lists serve as mutable cells shared with the helper;
    # presumably `_multi_process_scrape` (defined outside this block) updates
    # them and `records` in place -- confirm against its implementation.
    grand_total = [0]
    grand_available = [0]
    grand_occupied = [0]
    records = []
    doc = {"category": category, "columns": columns}

    for row in rows[3:]:
        _multi_process_scrape(
            doc,
            records,
            category,
            grand_total,
            grand_available,
            grand_occupied,
            row,
        )

    return {
        "grand_total_beds": grand_total[0],
        "grand_occupied_beds": grand_occupied[0],
        "grand_available_beds": grand_available[0],
        "category": category,
        "columns": columns,
        "records": json.dumps(records, allow_nan=False),
        # NOTE(review): hash() of a str is salted per interpreter process
        # unless PYTHONHASHSEED is fixed, so this id is not stable across
        # runs. Left unchanged because callers may rely on an int here.
        "hid": hash(doc["category"]),
    }
def extract_detail(item: bs4.element.Tag) -> DetailData:
    """Extract the data about a product.

    Reads the review-count text (converted to ``int`` after removing
    commas), the last space-separated token of the star text, and the
    ``href`` of the link element.

    Args:
        item (bs4.element.Tag): Product information.

    Returns:
        The data about the product.

    Raises:
        IndexError: If no usable review-count element or no star element
            is found.
        ValueError: If the selected review-count text is not an integer
            once commas are removed.
    """
    review_tags = item.findAll(**NUM_REVIEWS_PATTERN_ARGS)
    if len(review_tags) != 1:
        # Several elements matched: keep only the numeric ones. Commas are
        # stripped before the check so that a count such as "1,234" is not
        # discarded (str.isdecimal() is False for text containing ",").
        review_tags = [
            tag
            for tag in review_tags
            if tag.text.strip().replace(',', '').isdecimal()
        ]
    review_text = review_tags[0].text.strip()

    # With a single match [-1] is that match; with several the last is used.
    star_text = item.findAll(**STAR_PATTERN_ARGS)[-1].text.strip()

    item_review_num = int(review_text.replace(',', ''))
    item_star = star_text.split(' ')[-1]
    item_link = item.find(**LINK_PATTERN_ARGS).get('href')
    return DetailData(item_review_num, item_star, item_link)
def parse_song_from_block(block: bs4.element.Tag) -> Song:
    """Build a ``Song`` from the anchors found inside *block*.

    The title comes from the ``title`` attribute of an ``<a>`` element and
    the URL from the ``href`` of the first ``<a>`` with class
    ``-download-zip``.

    Args:
        block: Element containing the song's anchors.

    Returns:
        ``Song(title, url)``.

    Raises:
        IndexError: If there is no ``<a class="-download-zip">`` element.
        KeyError: If that element has no ``href`` attribute.
        ValueError: If no ``<a>`` element carries a ``title`` attribute.
    """
    titles = [
        link.attrs["title"]
        for link in block.findAll("a")
        if "title" in link.attrs
    ]
    # Resolve the download link before validating the title so a missing
    # link is still reported first.
    url = block.findAll("a", {"class": "-download-zip"})[0].attrs["href"]
    if not titles:
        # Previously this surfaced as an UnboundLocalError on `title`.
        raise ValueError("no <a> element with a title attribute found in block")
    # When several anchors carry a title, the last one is used.
    return Song(titles[-1], url)
def _parse_column(gatherer_column: bs4.element.Tag) -> GathererCard:
    """Build a ``GathererCard`` from one gatherer page 'rightCol' entry.

    Each ``div.row`` contributes one field: its ``div.label`` text (with
    trailing colons removed) is the key and its ``div.value`` element is the
    value. "Card Name" and "Types" are required; "Flavor Text" and
    "Card Text" are optional and become ``None`` when absent or empty.
    """
    fields = {}
    for row in gatherer_column.findAll("div", class_="row"):
        label = row.find("div", class_="label").getText(strip=True).rstrip(":")
        fields[label] = row.find("div", class_="value")

    name = fields["Card Name"].getText(strip=True)
    types = fields["Types"].getText(strip=True)

    flavor_parts = []
    if "Flavor Text" in fields:
        flavor_parts = [
            box.getText(strip=True)
            for box in fields["Flavor Text"].findAll(
                "div", class_="flavortextbox")
        ]

    text_parts = []
    if "Card Text" in fields:
        # Symbols are substituted before the text is read out of each box.
        text_parts = [
            _replace_symbols(box).getText().strip()
            for box in fields["Card Text"].findAll(
                "div", class_="cardtextbox")
        ]

    card_text = "\n".join(text_parts).strip()
    flavor_text = "\n".join(flavor_parts).strip()
    return GathererCard(
        card_name=name,
        original_types=types,
        original_text=card_text or None,
        flavor_text=flavor_text or None,
    )
def tag2list(tag: bs4.element.Tag):
    """Return the first child of every ``<p>`` under *tag*, as strings.

    Raises:
        IndexError: If a ``<p>`` element has no children.
    """
    return [str(paragraph.contents[0]) for paragraph in tag.findAll("p")]