"""Extract pricing information from tcgplayer.com""" import re import urllib import miner server = "magic.tcgplayer.com" url_price = "/db/price_guide.asp?setname=%s" re_name = re.compile(r"<font\s+class=default_7> ([^<]*?)</font>") re_price = re.compile(r"<font\s+class=default_7>\$(\d+\.\d+) </font>") con = miner.new_connection(server) def mine_pricelist(setname): """Get the average price for a card""" url = urllib.quote(url_price % setname, "/?=") html = miner.download(con, url, convert_to_unicode=False) pricelist = [] for part in html.split("<TR height=20>"): match = re_name.search(part) if match is None: continue cardname = unicode(match.group(1), errors="ignore") cardname = cardname.replace(u"AE", u"\xc3") prices = re_price.findall(part) if prices is None or len(prices) < 3:
# Matches one card entry and yields nine groups, in order:
#   1. link text inside the leading <span><a>...</a></span>
#   2. text of the <i> that follows the <img> in the first <p>
#   3. free text at the start of the second <p>
#   4. optional value before a "/" (digits, "*" or "X")
#   5. value after that "/" (may be empty)
#   6. optional run of digits, braces, "/" and the letters WUBRGXYZP
#   7. bold text of the <p class="ctext"> paragraph
#   8. text of the following <i> (may span lines, hence [\d\D])
#   9. text after "Illus" up to the closing </p>
# NOTE(review): presumably these are name, rarity/set, type line, power,
# toughness, mana cost, rules text, flavor text and artist -- confirm
# against the code that consumes the match.
# Only the first fragment is a raw string; the remaining fragments rely on
# \s, \d, \*, \{, \( ... not being recognised string escapes.
# The "." in "Illus." is unescaped, so it matches any single character.
re_set3 = re.compile(r'<span[^>]*>\s*<a\s+href="[^"]*">([^<]+)</a>\s*</span>'
    '\s*<p>\s*<img[^>]*>[^<]*<i>([^<]*)</i>\s*</p>\s*'
    '<p>(.*?)(?:\s([\d\*X]+)/|)([\d\*X]+|),(?:\s+([\d\{\}/WUBRGXYZP]+)\s*'
    '(?:\(\d+\)|)|)\s*</p>\s*<p\s+class="ctext">\s*<b>(.*?)</b>\s*</p>\s*'
    '<p>\s*<i>([\d\D]*?)</i>\s*</p>\s*<p>Illus.\s+([^<]*)</p>')
# Captures the text of an <h2> heading.
re_token = re.compile(r'<h2>([^<]*)</h2>') # set name
# Matches one table row whose second cell is the literal text "Token".
# Groups: 1 link target, 2 name, 3/4 optional "x/y" pair inside the link
# text, 5 the part of the number cell before "/" (unset when the cell is
# "-"), 6 the last cell's text.
re_token2 = re.compile(r'<tr[^>]*>\s*'
    '<td>\s*<a\s+href="([^"]*)">(.*?)' # link and name
    '(?:\s+([\d\*X]+)/([\d\*X]+)|)</a>\s*</td>\s*' # power and toughness
    '<td>Token</td>\s*' # token description
    '<td>(?:([\d\*X]+[a-z]?)/[\d\*X]+|-)</td>\s*' # number
    '<td>([^<]*)</td>\s*' # artist
    '</tr>'
    )

# Created once at import time; `server` is defined outside this excerpt.
con = miner.new_connection(server) # httplib.HTTPConnection


def mine_set(setcode, releasedate, magiccardsinfocode):
    """Mine the date for a magic set"""
    html = miner.download(con, url_set % magiccardsinfocode)
    # Get set name
    result = re_set.search(html)
    if result is None:
        raise RuntimeError(_("Pattern match failed."))
    setname, code = result.groups()
    # Get set cards
    cids = re_set2.findall(html)