def parse_cdbaby_album(album_id):
    """Scrape the CD Baby page of an album and return it as a release dict.

    The page at ``http://www.cdbaby.com/cd/<album_id>`` is fetched through
    ``fetch_page`` and queried with ``find``/``findAll`` (presumably a
    BeautifulSoup-style document -- confirm against ``fetch_page``).

    Args:
        album_id: identifier interpolated into the album URL.

    Returns:
        A dict with the keys ``cdbaby_id``, ``title``, ``artist``,
        ``artist_cdbaby_id``, ``barcode``, ``date`` and ``mediums``, plus
        ``label`` when a record label is listed and differs from the artist
        name (compared case-insensitively).  ``mediums`` holds a single
        medium dict with ``position`` (always 1), ``tracks`` (dicts with
        ``position``, ``title`` and ``length``) and ``format``.
    """
    url = 'http://www.cdbaby.com/cd/%s' % (album_id,)
    page = fetch_page(url, html=True)

    def span(element_id):
        # Most of the metadata lives in <span> elements addressed by id.
        return page.find('span', {'id': element_id})

    title = guess_case_title(span('ctl00_rightColumn_lblAlbumName').text)
    # The stored id is re-derived from the URL rather than taken verbatim
    # from the argument.
    cdbaby_id = extract_album_id(url)

    artists_panel = page.find('div', {'id': 'ctl00_rightColumn_pnlArtists'})
    artist = guess_case(artists_panel.find('span').text)
    artist_link = page.find('a', {'id': 'ctl00_breadCrumb_lnkArtist'})
    artist_id = extract_artist_id(artist_link['href'])

    # The barcode span is optional; when it is missing the lookup result
    # itself is stored under 'barcode'.
    barcode_tag = span('ctl00_rightColumn_lblBarcode')
    barcode = extract_barcode(barcode_tag.text) if barcode_tag else barcode_tag

    # Going through int() means non-numeric release text raises ValueError.
    released = int(span('ctl00_leftColumn_lblAlbumRelease').text)

    label_tag = span('ctl00_rightColumn_lblRecordLabel')
    label = label_tag
    if label_tag:
        label = re.sub(r'^Record Label: ', '', label_tag.text)
        # A label that merely repeats the artist name is dropped.
        if artist.lower() == label.lower():
            label = None

    release = {
        'cdbaby_id': cdbaby_id,
        'title': title,
        'artist': artist,
        'artist_cdbaby_id': artist_id,
        'barcode': barcode,
        'date': str(released),
    }
    if label:
        release['label'] = label

    tracks = []
    medium = {
        'position': 1,
        'tracks': tracks,
    }
    release['mediums'] = [medium]

    track_table = page.find('table', {'id': 'tracks-display'})
    for row in track_table.findAll('tr'):
        cells = row.findAll('td')
        # Only rows with exactly four cells are treated as track rows.
        if len(cells) == 4:
            position, track_title = parse_track_title(cells[1].text)
            length = parse_track_length(cells[2].find('span').text)
            tracks.append({
                'position': position,
                'title': track_title,
                'length': length,
            })

    # A "buy now" button for the CD decides between the two formats.
    has_cd_button = page.find('input', {'class': 'cd-buynow-button'})
    medium['format'] = 'CD' if has_cd_button else 'Digital Media'

    return release
def parse_track_title(s):
    """Split track text of the form ``"<number>. <title>"`` into its parts.

    Args:
        s: text of a track cell, e.g. ``"3. Some Title"``.

    Returns:
        A ``(track_no, title)`` tuple where ``track_no`` is an ``int`` and
        ``title`` is the remainder passed through ``guess_case_title``.

    Raises:
        ValueError: if ``s`` does not start with digits followed by a
            literal ``". "`` and a non-empty title.
    """
    # The dot must be escaped: an unescaped "." matches any character, so
    # "10 Title" would backtrack into track 1 with the "0" consumed as the
    # separator and silently yield a wrong track number.
    m = re.match(r'^(\d+)\. (.+)$', s)
    if m is None:
        # Fail with a message naming the offending text instead of an
        # AttributeError on None.
        raise ValueError('unrecognized track title: %r' % (s,))
    track_no, track_title = m.groups()
    return int(track_no), guess_case_title(track_title)