def scrape(limit: int = 1) -> None:
    """Scrape up to `limit` pages of Penny Dreadful decks from MTG Goldfish into the database.

    Stops early when a page yields no deck tiles. Raises InvalidDataException
    (chained to the original error) when a deck's created_date can't be found.
    """
    page = 1
    while page <= limit:
        time.sleep(0.1)  # Rate-limit ourselves a little between page fetches.
        url = 'https://www.mtggoldfish.com/deck/custom/penny_dreadful?page={n}#online'.format(n=page)
        soup = BeautifulSoup(fetcher.internal.fetch(url, character_encoding='utf-8'), 'html.parser')
        raw_decks = soup.find_all('div', {'class': 'deck-tile'})
        if len(raw_decks) == 0:
            logger.warning('No decks found in {url} so stopping.'.format(url=url))
            break
        for raw_deck in raw_decks:
            d = Container({'source': 'MTG Goldfish'})
            a = raw_deck.select_one('h2 > span.deck-price-online > a')
            # The deck id is embedded in the tile's link href.
            d.identifier = re.findall(r'/deck/(\d+)#online', a.get('href'))[0]
            d.url = 'https://www.mtggoldfish.com/deck/{identifier}#online'.format(identifier=d.identifier)
            d.name = a.contents[0].strip()
            d.mtggoldfish_username = without_by(raw_deck.select_one('div.deck-tile-author').contents[0].strip())
            try:
                d.created_date = scrape_created_date(d)
            except InvalidDataException as e:
                msg = f'Got {e} trying to find a created_date in {d}, {raw_deck}'
                logger.error(msg)
                # Chain so the original traceback is preserved for debugging.
                raise InvalidDataException(msg) from e
            time.sleep(1)  # Be polite: one decklist fetch per second.
            d.cards = scrape_decklist(d)
            err = vivify_or_error(d)
            if err:
                logger.warning(err)
                continue
            deck.add_deck(d)
        page += 1
def normalize(d: Deck) -> str:
    """Produce a canonical, titlecased display name for deck `d` from its original_name.

    Applies a pipeline of cleanups (whitespace, 'PD' markers, hashtags, brackets,
    punctuation), then either expands a known abbreviation, accepts a whitelisted
    name as-is, or augments a bare/colors-only name with colors and archetype.

    Raises InvalidDataException (chained to the underlying ValueError) on failure.
    """
    try:
        name = d.original_name
        name = name.lower()
        name = replace_space_alternatives(name)
        name = remove_pd(name)
        name = remove_hashtags(name)
        name = remove_brackets(name)
        name = strip_leading_punctuation(name)
        unabbreviated = expand_common_abbreviations(name)
        if unabbreviated != name or name in ABBREVIATIONS.values():
            # The name was (or expanded from) a known abbreviation — use the expansion.
            name = unabbreviated
        elif whitelisted(name):
            pass  # Explicitly whitelisted names are left untouched.
        else:
            name = add_colors_if_no_deckname(name, d.get('colors'))
            name = normalize_colors(name)
            name = add_archetype_if_just_colors(name, d.get('archetype_name'))
            name = remove_mono_if_not_first_word(name)
        name = ucase_trailing_roman_numerals(name)
        return titlecase.titlecase(name)
    except ValueError as e:
        # Chain the original error so the root cause survives in the traceback.
        raise InvalidDataException('Failed to normalize {d}'.format(d=repr(d))) from e
def insert_set(s) -> int:
    """Insert one mtgjson set record plus all its printings and return the new set id.

    `s` is a raw mtgjson set dict. Raises InvalidDataException when a card in the
    set cannot be matched to an existing card id.
    """
    # Build the INSERT for the set row from the mtgjson-backed set properties.
    sql = 'INSERT INTO `set` ('
    sql += ', '.join(name for name, prop in card.set_properties().items() if prop['mtgjson'])
    sql += ') VALUES ('
    sql += ', '.join('%s' for name, prop in card.set_properties().items() if prop['mtgjson'])
    sql += ')'
    values = [
        date2int(s.get(database2json(name)), name)
        for name, prop in card.set_properties().items()
        if prop['mtgjson']
    ]
    db().execute(sql, values)
    set_id = db().last_insert_rowid()
    set_cards = s.get('cards', [])
    # Patch known data problems in the raw mtgjson payload before inserting.
    fix_bad_mtgjson_set_cards_data(set_cards)
    fix_mtgjson_melded_cards_array(set_cards)
    for c in set_cards:
        _, card_id = try_find_card_id(c)
        if card_id is None:
            raise InvalidDataException("Can't find id for: '{n}': {ns}".format(
                n=c['name'], ns='; '.join(c.get('names', []))))
        sql = 'INSERT INTO printing (card_id, set_id, '
        sql += ', '.join(name for name, prop in card.printing_properties().items() if prop['mtgjson'])
        sql += ') VALUES (%s, %s, '
        sql += ', '.join('%s' for name, prop in card.printing_properties().items() if prop['mtgjson'])
        sql += ')'
        cards_values = [card_id, set_id] + [
            c.get(database2json(name))
            for name, prop in card.printing_properties().items()
            if prop['mtgjson']
        ]
        db().execute(sql, cards_values)
    # Return the id so callers (e.g. update_database's `sets[s['code']] = insert_set(s)`)
    # can map set code -> set id. Previously declared -> None despite that usage.
    return set_id
def medal_winners(s: str) -> Dict[str, int]:
    """Map MTGO usernames to their finish (1/2/3/5) from a results-page HTML blob."""
    winners: Dict[str, int] = {}
    # Image name -> finish. 'verified' is a non-medal badge and is skipped below.
    finishes = {WINNER: 1, SECOND: 2, TOP_4: 3, TOP_8: 5}
    # The HTML of this page is so badly malformed that BeautifulSoup cannot really help us with this bit.
    for row in re.findall('<tr style=">(.*?)</tr>', s, re.MULTILINE | re.DOTALL):
        cell = BeautifulSoup(row, 'html.parser').find_all('td')[2]
        if not cell.find('img'):
            continue
        mtgo_username = aliased(cell.a.contents[0])
        img = re.sub(r'styles/Chandra/images/(.*?)\.png', r'\1', cell.img['src'])
        if img in finishes:
            winners[mtgo_username] = finishes[img]
        elif img != 'verified':
            raise InvalidDataException('Unknown player image `{img}`'.format(img=img))
    return winners
def get_source_id(source: str) -> int:
    """Look up the primary key of a deck source by name, raising if it is unknown."""
    found = db().value('SELECT id FROM source WHERE name = %s', [source])
    if found:
        return found
    raise InvalidDataException('Unknown source: `{source}`'.format(source=source))
def lookup(gatherling_id: int) -> deck.Deck:
    """Return the cached Deck for a Gatherling deck id.

    Raises InvalidDataException (chained to the KeyError) when the id is unknown.
    """
    try:
        return decks_by_identifier[gatherling_id]
    except KeyError as e:
        # Chain so the original KeyError traceback is preserved.
        raise InvalidDataException("Unable to find deck with gatherling id '{0}'".format(gatherling_id)) from e
def update_database(new_date: datetime.datetime) -> None:  # pylint: disable=too-many-locals
    """Rebuild the card database from Scryfall bulk data inside one transaction.

    Wipes every card-related table, re-inserts all sets, then iterates every
    printing Scryfall knows about, accumulating batched multi-row INSERTs that
    are executed at the end. Finally records `new_date` as the Scryfall version.
    """
    db().begin('update_database')
    db().execute('DELETE FROM scryfall_version')
    # In order to rebuild the card table, we must delete (and rebuild) all tables with a FK to it.
    db().execute('DROP TABLE IF EXISTS _cache_card')
    db().execute("""
        DELETE FROM card_color;
        DELETE FROM card_color_identity;
        DELETE FROM card_legality;
        DELETE FROM card_bug;
        DELETE FROM face;
        DELETE FROM printing;
        DELETE FROM card;
        DELETE FROM `set`;
    """)
    # Map of set code -> our internal set id, as returned by insert_set.
    sets = {}
    for s in fetcher.all_sets():
        sets[s['code']] = insert_set(s)
    every_card_printing = fetcher.all_cards()
    rarity_ids = {
        x['name']: x['id']
        for x in db().select('SELECT id, name FROM rarity;')
    }
    # Scryfall rarity string -> (our display name, our rarity id).
    scryfall_to_internal_rarity = {
        'common': ('Common', rarity_ids['Common']),
        'uncommon': ('Uncommon', rarity_ids['Uncommon']),
        'rare': ('Rare', rarity_ids['Rare']),
        'mythic': ('Mythic Rare', rarity_ids['Mythic Rare'])
    }
    # Strategy:
    # Iterate through all printings of each card, building several queries to be executed at the end.
    # If we hit a new card, add it to the queries for the several tables tracking cards:
    #   card, face, card_color, card_color_identity, printing.
    # If it's a printing of a card we already have, just add to the printing query.
    # We need to special case the result (melded) side of meld cards, due to their general weirdness.
    cards: Dict[str, int] = {}  # card name -> card id we assigned it.
    meld_result_printings = []  # Deferred: faces inserted after all card ids exist.
    card_query = 'INSERT INTO `card` (id, layout) VALUES '
    card_values = []
    card_color_query = 'INSERT IGNORE INTO `card_color` (card_id, color_id) VALUES '
    card_color_values = []
    card_color_identity_query = 'INSERT IGNORE INTO `card_color_identity` (card_id, color_id) VALUES '
    card_color_identity_values = []
    face_query = 'INSERT INTO `face` (card_id, position, '
    face_query += ', '.join(name for name, prop in card.face_properties().items() if prop['scryfall'])
    face_query += ') VALUES '
    face_values = []
    printing_query = 'INSERT INTO `printing` (card_id, set_id, '
    printing_query += 'system_id, rarity, flavor, artist, number, multiverseid, watermark, border, timeshifted, reserved, mci_number, rarity_id'
    printing_query += ') VALUES'
    printing_values = []
    colors_raw = db().select('SELECT id, symbol FROM color GROUP BY name ORDER BY id;')
    colors = {c['symbol'].upper(): c['id'] for c in colors_raw}
    # We assign card ids ourselves so we can reference them in the batched VALUES.
    next_card_id = 1
    card_legality_query = 'INSERT IGNORE INTO `card_legality` (card_id, format_id, legality) VALUES '
    card_legality_values = []
    for p in every_card_printing:
        # Exclude little girl because {hw} mana is a problem rn.
        if p['name'] == 'Little Girl':
            continue
        if is_meld_result(p):
            meld_result_printings.append(p)
        rarity, rarity_id = scryfall_to_internal_rarity[p['rarity']]
        try:
            set_id = sets[p['set']]
        except KeyError:
            raise InvalidDataException(
                f"We think we should have set {p['set']} but it's not in {sets} (from {p})"
            )
        # If we already have the card, all we need is to record the next printing of it.
        if p['name'] in cards:
            card_id = cards[p['name']]
            printing_values.append(
                printing_value(p, card_id, set_id, rarity_id, rarity))
            continue
        card_id = next_card_id
        next_card_id += 1
        cards[p['name']] = card_id
        card_values.append("({i},'{l}')".format(i=card_id, l=p['layout']))
        # Single-face layouts get one face row; multi-face layouts get one per face.
        if p['layout'] in [
                'augment', 'emblem', 'host', 'leveler', 'meld', 'normal',
                'planar', 'saga', 'scheme', 'token', 'vanguard'
        ]:
            face_values.append(single_face_value(p, card_id))
        elif p['layout'] in [
                'double_faced_token', 'flip', 'split', 'transform'
        ]:
            face_values += multiple_faces_values(p, card_id)
        for color in p.get('colors', []):
            color_id = colors[color]
            card_color_values.append(f'({card_id}, {color_id})')
        for color in p.get('color_identity', []):
            color_id = colors[color]
            card_color_identity_values.append(f'({card_id}, {color_id})')
        for format_, status in p.get('legalities', {}).items():
            if status == 'not_legal':
                continue
            # Strictly speaking we could drop all this capitalizing and use what Scryfall sends us as the canonical name as it's just a holdover from mtgjson.
            format_id = get_format_id(format_.capitalize(), True)
            internal_status = status.capitalize()
            card_legality_values.append(
                f"({card_id}, {format_id}, '{internal_status}')")
        # NOTE(review): cards[p['name']] was already assigned above — this repeat is redundant but harmless.
        cards[p['name']] = card_id
        printing_values.append(
            printing_value(p, card_id, set_id, rarity_id, rarity))
    # Execute the accumulated batch INSERTs. Order matters: cards before faces/printings/legalities.
    card_query += ',\n'.join(card_values)
    card_query += ';'
    db().execute(card_query)
    card_color_query += ',\n'.join(card_color_values) + ';'
    db().execute(card_color_query)
    card_color_identity_query += ',\n'.join(card_color_identity_values) + ';'
    db().execute(card_color_identity_query)
    # Meld results need all card ids assigned first, hence deferred until here.
    for p in meld_result_printings:
        insert_meld_result_faces(p, cards)
    printing_query += ',\n'.join(printing_values)
    printing_query += ';'
    db().execute(printing_query)
    face_query += ',\n'.join(face_values)
    face_query += ';'
    db().execute(face_query)
    card_legality_query += ',\n'.join(card_legality_values)
    card_legality_query += ';'
    db().execute(card_legality_query)
    # Create the current Penny Dreadful format.
    get_format_id('Penny Dreadful', True)
    update_bugged_cards()
    update_pd_legality()
    db().execute('INSERT INTO scryfall_version (last_updated) VALUES (%s)', [dtutil.dt2ts(new_date)])
    db().commit('update_database')
def add_deck(params):
    """Insert (or update the cards of) a deck described by the raw `params` dict.

    Requires at least one of mtgo_username / tappedout_username /
    mtggoldfish_username in `params`, plus 'source', 'identifier', 'url',
    'name' and 'cards'.

    NOTE(review): returns the bare deck_id (int) when the deck already exists,
    but a loaded Deck object when newly inserted — callers must handle both.
    Raises InvalidDataException when no username is supplied.
    """
    if not params.get('mtgo_username') and not params.get(
            'tappedout_username') and not params.get('mtggoldfish_username'):
        raise InvalidDataException(
            'Did not find a username in {params}'.format(params=params))
    person_id = get_or_insert_person_id(params.get('mtgo_username'),
                                        params.get('tappedout_username'),
                                        params.get('mtggoldfish_username'))
    deck_id = get_deck_id(params['source'], params['identifier'])
    if deck_id:
        # Deck already known: just refresh its card list.
        add_cards(deck_id, params['cards'])
        return deck_id
    created_date = params.get('created_date')
    if not created_date:
        created_date = time.time()  # Default to "now" when the source gave no date.
    archetype_id = get_archetype_id(params.get('archetype'))
    # Competition decks get explicit 0 records rather than NULLs for missing results.
    for result in ['wins', 'losses', 'draws']:
        if params.get('competition_id') and not params.get(result):
            params[result] = 0
    sql = """INSERT INTO deck (
        created_date,
        updated_date,
        person_id,
        source_id,
        url,
        identifier,
        name,
        competition_id,
        archetype_id,
        resource_uri,
        featured_card,
        score,
        thumbnail_url,
        small_thumbnail_url,
        wins,
        losses,
        draws,
        finish,
        reviewed
    ) VALUES (
        IFNULL(%s, UNIX_TIMESTAMP()), UNIX_TIMESTAMP(), %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, FALSE
    )"""
    values = [
        created_date, person_id,
        get_source_id(params['source']), params['url'], params['identifier'],
        params['name'],
        params.get('competition_id'), archetype_id,
        params.get('resource_uri'),
        params.get('featured_card'),
        params.get('score'),
        params.get('thumbnail_url'),
        params.get('small_thumbnail_url'),
        params.get('wins'),
        params.get('losses'),
        params.get('draws'),
        params.get('finish')
    ]
    deck_id = db().insert(sql, values)
    add_cards(deck_id, params['cards'])
    d = load_deck(deck_id)
    prime_cache(d)
    return d
def add_deck(params: RawDeckDescription) -> Deck:
    """Insert a new deck, or replace the card list of an existing one, and return it.

    Requires at least one of mtgo_username / tappedout_username /
    mtggoldfish_username in `params`. Both the replace and insert paths run
    inside an explicit transaction and prime the deck cache before returning.

    Raises InvalidDataException when no username is supplied.
    """
    if not params.get('mtgo_username') and not params.get(
            'tappedout_username') and not params.get('mtggoldfish_username'):
        raise InvalidDataException(
            'Did not find a username in {params}'.format(params=params))
    person_id = get_or_insert_person_id(params.get('mtgo_username'),
                                        params.get('tappedout_username'),
                                        params.get('mtggoldfish_username'))
    deck_id = get_deck_id(params['source'], params['identifier'])
    cards = params['cards']
    if deck_id:
        # Existing deck: atomically refresh its hash and card rows.
        db().begin('replace_deck_cards')
        db().execute('UPDATE deck SET decklist_hash = %s WHERE id = %s',
                     [get_deckhash(cards), deck_id])
        db().execute('DELETE FROM deck_card WHERE deck_id = %s', [deck_id])
        add_cards(deck_id, cards)
        db().commit('replace_deck_cards')
        d = load_deck(deck_id)
        prime_cache(d)
        return d
    created_date = params.get('created_date')
    if not created_date:
        created_date = time.time()  # Default to "now" when the source gave no date.
    archetype_id = get_archetype_id(params.get('archetype'))
    # Competition decks get explicit 0 records rather than NULLs for missing results.
    for result in ['wins', 'losses', 'draws']:
        if params.get('competition_id') and not params.get(result):
            params[result] = 0  # type: ignore
    sql = """INSERT INTO deck (
        created_date,
        updated_date,
        person_id,
        source_id,
        url,
        identifier,
        name,
        competition_id,
        archetype_id,
        resource_uri,
        featured_card,
        score,
        thumbnail_url,
        small_thumbnail_url,
        finish,
        decklist_hash,
        reviewed
    ) VALUES (
        IFNULL(%s, UNIX_TIMESTAMP()), UNIX_TIMESTAMP(), %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, FALSE
    )"""
    values = [
        created_date, person_id,
        get_source_id(params['source']), params['url'], params['identifier'],
        params['name'],
        params.get('competition_id'), archetype_id,
        params.get('resource_uri'),
        params.get('featured_card'),
        params.get('score'),
        params.get('thumbnail_url'),
        params.get('small_thumbnail_url'),
        params.get('finish'),
        get_deckhash(cards)
    ]
    db().begin('add_deck')
    deck_id = db().insert(sql, values)
    add_cards(deck_id, cards)
    d = load_deck(deck_id)
    prime_cache(d)
    db().commit('add_deck')
    return d
def fail(key: str, val: Any, expected_type: type) -> InvalidDataException:
    """Build (but do not raise) an InvalidDataException describing a type mismatch for `key`."""
    msg = 'Expected a {expected_type} for {key}, got `{val}` ({actual_type})'.format(expected_type=expected_type, key=key, val=val, actual_type=type(val))
    return InvalidDataException(msg)
def scryfall_last_updated() -> datetime.datetime:
    """Return when Scryfall last regenerated its default_cards bulk-data file."""
    d = fetch_tools.fetch_json('https://api.scryfall.com/bulk-data')
    entry = next((item for item in d['data'] if item['type'] == 'default_cards'), None)
    if entry is not None:
        return dtutil.parse_rfc3339(entry['updated_at'])
    raise InvalidDataException(f'Could not get the last updated date from Scryfall: {d}')
def season_num(code_to_look_for: str) -> int:
    """Return the 1-based season number for a season code.

    Raises InvalidDataException (chained) when the code is not in SEASONS.
    """
    try:
        return SEASONS.index(code_to_look_for) + 1
    except ValueError as c:
        # BUG FIX: list.index raises ValueError, not KeyError, so the previous
        # `except KeyError` handler could never fire and a bad code escaped as
        # a raw ValueError instead of the intended InvalidDataException.
        raise InvalidDataException('I did not find the season code (`{code}`) in the list of seasons ({seasons}) and I am confused.'.format(code=code_to_look_for, seasons=','.join(SEASONS))) from c
def parse_line(line: str) -> Tuple[int, str]:
    """Split a decklist line like '4 Island' into (4, 'Island').

    Raises InvalidDataException when the line does not start with a count.
    """
    parsed = re.match(r'(\d+)\s+(.*)', line)
    if not parsed:
        raise InvalidDataException('No number specified with `{line}`'.format(line=line))
    count, card_name = parsed.groups()
    return (int(count), card_name)