def search(self, album, artists):
    """Look up ``album``, first by a stored album ID and then by name.

    ``artists`` is either a sequence of artist names or a mapping whose
    keys are artist names and whose values are concatenated into one
    track list (presumably artist -> tracks; verify against the caller).
    More than one artist is treated as u'Various Artists'.

    When ``self._useid`` is set and ``artists`` is a mapping, the tracks
    are scanned for one of the known AMG id fields; if an id is found the
    result of ``self.keyword_search(u':id <id>')`` is returned.  If that
    raises OldURLError, the search falls back to the album name.

    Raises RetrievalError when ``album`` is empty or when fetching the
    search page fails with ``urllib2.URLError``.

    NOTE(review): the visible body ends once the search page has been
    fetched, so ``ret``, ``artist`` and ``searchpage`` are never read and
    the name-search path returns None -- confirm whether the remainder of
    this method is missing.
    """
    ret = []

    if len(artists) > 1:
        artist = u'Various Artists'
    elif hasattr(artists, 'items'):
        artist = artists.keys()[0]
    else:
        artist = artists[0]

    if self._useid and hasattr(artists, 'values'):
        # Flatten every artist's track list into a single list.
        tracks = []
        for artist_tracks in artists.values():
            tracks.extend(artist_tracks)

        # Try the id fields in order of preference; stop at the first hit.
        id_fields = ('amg_rovi_id', 'amg_pop_id', 'amgsqlid', 'amg_album_id')
        for field in id_fields:
            album_id = find_id(tracks, field)
            if album_id:
                break

        if not isempty(album_id):
            write_log(u'Found Album ID %s' % album_id)
            try:
                return self.keyword_search(u':id %s' % album_id)
            except OldURLError:
                write_log("Invalid URL used. Doing normal search.")

    if not album:
        raise RetrievalError('Album name required.')

    write_log(u'Searching for %s' % album)
    try:
        # Bare name: this is the module-level search(), not this method.
        searchpage = search(album)
    except urllib2.URLError as e:
        message = unicode(e)
        write_log(u'Error: While retrieving search page %s' % message)
        raise RetrievalError(message)
def search(self, album, artists):
    """Search for ``album`` and return a list of ``(match, [])`` tuples.

    ``artists`` is either a sequence of artist names or a mapping whose
    keys are artist names and whose values are concatenated into one
    track list (presumably artist -> tracks; verify against the caller).
    More than one artist is treated as 'Various Artists'.

    When ``self._useid`` is set and ``artists`` is a mapping, the tracks
    are scanned for one of the known AMG id fields; if an id is found the
    result of ``self.keyword_search(':id <id>')`` is returned directly.
    If that raises OldURLError, the search falls back to the album name.

    Returns an empty list when ``parse_searchpage`` yields nothing,
    otherwise one ``(match, [])`` tuple per match.

    Raises RetrievalError when ``album`` is empty or when fetching the
    search page fails with ``urllib.error.URLError`` (the original error
    is attached as the cause).
    """
    if len(artists) > 1:
        artist = 'Various Artists'
    elif hasattr(artists, 'items'):
        artist = list(artists.keys())[0]
    else:
        artist = artists[0]

    if self._useid and hasattr(artists, 'values'):
        # Flatten every artist's track list into a single list.
        tracks = []
        for artist_tracks in artists.values():
            tracks.extend(artist_tracks)

        # Try the id fields in order of preference; stop at the first hit.
        for field in ('amg_rovi_id', 'amg_pop_id', 'amgsqlid',
                      'amg_album_id'):
            album_id = find_id(tracks, field)
            if album_id:
                break

        if not isempty(album_id):
            write_log('Found Album ID %s' % album_id)
            try:
                return self.keyword_search(':id %s' % album_id)
            except OldURLError:
                write_log("Invalid URL used. Doing normal search.")

    if not album:
        raise RetrievalError('Album name required.')

    write_log('Searching for %s' % album)
    try:
        # Bare name: this is the module-level search(), not this method.
        searchpage = search(album)
    except urllib.error.URLError as e:
        write_log('Error: While retrieving search page %s' % str(e))
        # Chain explicitly so the network error stays visible as the cause.
        raise RetrievalError(str(e)) from e

    write_log('Retrieved search results.')
    search_results = parse_searchpage(searchpage, artist, album)
    if not search_results:
        return []
    matched, matches = search_results

    if matched and len(matches) == 1:
        return [(matches[0], [])]

    # Ambiguous and inexact results are returned the same way; only the
    # log message differs.
    if matched:
        write_log('Ambiguous matches found for: %s - %s' % (artist, album))
    else:
        write_log('No exact matches found for: %s - %s' % (artist, album))
    return [(z, []) for z in matches]
def parse_search_element(td, id_field=ALBUM_ID):
    """Extract album information from one search-result element.

    ``td`` is the element holding a single search result; its ``title``,
    ``artist``, ``year`` and ``genres`` divs are read.  When no artist div
    text is found and the title text contains a colon, the title is split
    once on the colon into artist and album.

    The following keys are filled in; any whose value is empty according
    to ``isempty`` is dropped from the returned dictionary:

    album       -- album name found
    artist      -- artist name found
    #albumurl   -- link to the album
    amg_url     -- same link as #albumurl
    year        -- album release year
    genre       -- text of the genres div
    #extrainfo  -- two-item list: description text and the album link
    <id_field>  -- the ``mw<digits>`` id at the end of the album link

    Raises AttributeError if the album link does not end in
    ``-mw<digits>`` (the regex match is used unchecked).
    """
    def to_string(e):
        # Prefer the text of a nested link, then the element's own text.
        for read in (lambda node: node.a.string, lambda node: node.string):
            try:
                return convert(read(e))
            except AttributeError:
                continue
        return ''

    title_div = td.find('div', {'class': 'title'})

    info = {}
    info['album'] = to_string(title_div)
    album_url = convert(title_div.a.element.attrib['href'])
    info['#albumurl'] = album_url
    info['amg_url'] = album_url
    info['artist'] = to_string(td.find('div', {'class': 'artist'}))

    if not info['artist']:
        # No separate artist element: the title may read "Artist: Album".
        combined = to_string(td.find('div', {'class': 'title'}))
        if ':' in combined:
            artist_part, album_part = [
                z.strip() for z in combined.split(':', 1)]
            info['artist'] = artist_part
            info['album'] = album_part
        else:
            info['album'] = combined

    info['year'] = to_string(td.find('div', {'class': 'year'}))
    info['genre'] = to_string(td.find('div', {'class': 'genres'}))
    info['#extrainfo'] = [info['album'] + ' at AllMusic.com', album_url]
    info[id_field] = re.search(r'-(mw\d+)$', info['#albumurl']).group(1)

    return {key: value for key, value in info.items() if not isempty(value)}
def capture_dict_values(d):
    """Return a filtered, normalised copy of the mapping ``d``.

    An entry is skipped when its key is in
    ``RELEASE_PROPERTIES_TO_IGNORE``, when ``isempty(value)`` is true, or
    when the value is a ``dict``.  Of the remaining values, ``bytes`` are
    decoded as UTF-8, numbers are converted with ``str()``, and
    everything else is copied unchanged.
    """
    captured = {}
    for key, value in d.items():
        unwanted = (
            key in RELEASE_PROPERTIES_TO_IGNORE
            or isempty(value)
            or isinstance(value, dict)
        )
        if unwanted:
            continue

        if isinstance(value, bytes):
            value = value.decode('utf8')
        elif isinstance(value, numbers.Number):
            value = str(value)
        captured[key] = value
    return captured
def parse_search_element(td, id_field=ALBUM_ID):
    """Extract album information from one search-result element.

    ``td`` is the element holding a single search result; its ``title``,
    ``artist``, ``year`` and ``genres`` divs are read.  When no artist div
    text is found and the title text contains a colon, the title is split
    once on the colon into artist and album.

    The following keys are filled in; any whose value is empty according
    to ``isempty`` is dropped from the returned dictionary:

    album       -- album name found
    artist      -- artist name found
    #albumurl   -- link to the album
    amg_url     -- same link as #albumurl
    year        -- album release year
    genre       -- text of the genres div
    #extrainfo  -- two-item list: description text and the album link
    <id_field>  -- the ``mw<digits>`` id at the end of the album link

    Raises AttributeError if the album link does not end in
    ``-mw<digits>`` (the regex match is used unchecked).
    """
    def to_string(e):
        # Prefer the text of a nested link, then the element's own text.
        try:
            return convert(e.a.string)
        except AttributeError:
            pass
        try:
            return convert(e.string)
        except AttributeError:
            return u''

    title_div = td.find('div', {'class': 'title'})

    info = {}
    info['album'] = to_string(title_div)
    album_url = convert(title_div.a.element.attrib['href'])
    info['#albumurl'] = album_url
    info['amg_url'] = album_url
    info['artist'] = to_string(td.find('div', {'class': 'artist'}))

    if not info['artist']:
        # No separate artist element: the title may read "Artist: Album".
        combined = to_string(td.find('div', {'class': 'title'}))
        if u':' in combined:
            pieces = [z.strip() for z in combined.split(u':', 1)]
            info['artist'], info['album'] = pieces
        else:
            info['album'] = combined

    info['year'] = to_string(td.find('div', {'class': 'year'}))
    info['genre'] = to_string(td.find('div', {'class': 'genres'}))
    info['#extrainfo'] = [info['album'] + u' at AllMusic.com', album_url]

    id_match = re.search('-(mw\d+)$', info['#albumurl'])
    info[id_field] = id_match.groups()[0]

    cleaned = {}
    for key, value in info.iteritems():
        if not isempty(value):
            cleaned[key] = value
    return cleaned
def check_values(d):
    """Return a filtered, text-normalised copy of the mapping ``d``.

    An entry is skipped when its key is in ``INVALID_KEYS``, when
    ``isempty(value)`` is true, or when the value is a mapping-like
    iterable (has both ``__iter__`` and ``items``).  Of the remaining
    values, byte strings are decoded as UTF-8, other values without
    ``__iter__`` are converted with ``unicode()``, and remaining
    iterables are copied unchanged.

    Raises UnicodeDecodeError if a byte string is not valid UTF-8.
    """
    ret = {}
    for key, v in d.iteritems():
        if key in INVALID_KEYS or isempty(v):
            continue

        # Python 2 ``str`` has no ``__iter__`` attribute, so this check has
        # to come before the ``__iter__`` test below.  Previously byte
        # strings fell into the ``unicode(v)`` branch, which decodes as
        # ASCII and fails on non-ASCII data, and the UTF-8 branch was
        # never reached.
        if isinstance(v, str):
            v = v.decode('utf8')
        elif not hasattr(v, '__iter__'):
            v = unicode(v)
        elif hasattr(v, 'items'):
            # Mapping-like values are not carried over.
            continue

        ret[key] = v
    return ret
def parse_albumpage(page, artist=None, album=None):
    """Parse an album page and return ``[info, tracks]``.

    ``page`` is the raw page handed to ``parse_html.parse``.  The
    ``artist`` and ``album`` arguments are accepted but not read: both
    names are rebound to elements found in the page before first use.

    ``info`` combines artist/album names, sidebar data
    (``parse_sidebar``), year data (``convert_year``) and, when a review
    section exists, review data (``parse_review``).  Keys are renamed
    through ``spanmap`` and entries with empty values (per ``isempty``)
    are dropped.  ``tracks`` is whatever ``parse_tracks`` returns for the
    page and the final ``info``.
    """
    soup = parse_html.SoupWrapper(parse_html.parse(page))
    info = {}

    artist = soup.find('div', {'class': 'album-artist'})
    album = soup.find('div', {'class': 'album-title'})

    # A release title, when present, takes precedence over the album title.
    # NOTE(review): nesting reconstructed from whitespace-collapsed source;
    # confirm the release-details lookup belongs under this branch.
    release_title = soup.find('h3', 'release-title')
    if release_title:
        album = release_title
        details = soup.find('p', {'class': 'release-details'})
        if details:
            info['release'] = convert(details.string)

    if not artist:
        artist = soup.find('h3', 'release-artist')

    artist_name = convert(artist.string)
    album_name = '' if album is None else convert(album.string)
    info.update({'artist': artist_name, 'album': album_name})
    info['albumartist'] = info['artist']

    sidebar = soup.find('div', {'class': 'sidebar'})
    info.update(parse_sidebar(sidebar))
    info.update(convert_year(info))

    review = soup.find('section', {'class': 'review read-more'})
    if review:
        info.update(parse_review(review))

    # Parsing of the "similar-albums" gallery (parse_similar) is disabled
    # in the original source.

    # Rename keys through spanmap and drop empty values.
    cleaned = {}
    for key, value in info.iteritems():
        if isempty(value):
            continue
        cleaned[spanmap.get(key, key)] = value
    info = cleaned

    return [info, parse_tracks(soup, info)]
def parse_albumpage(page, artist=None, album=None):
    """Parse an album page and return ``[info, tracks]``.

    ``page`` is the raw page handed to ``parse_html.parse``.  The
    ``artist`` and ``album`` arguments are accepted but not read: both
    names are rebound to elements found in the page before first use.

    ``info`` combines artist/album names, sidebar data
    (``parse_sidebar``), year data (``convert_year``) and, when a review
    section exists, review data (``parse_review``).  Keys are renamed
    through ``spanmap`` and entries with empty values (per ``isempty``)
    are dropped.  ``tracks`` is whatever ``parse_tracks`` returns for the
    page and the final ``info``.
    """
    info = {}
    document = parse_html.SoupWrapper(parse_html.parse(page))

    artist = document.find('div', {'class': 'album-artist'})
    album = document.find('div', {'class': 'album-title'})

    # A release title, when present, takes precedence over the album title.
    # NOTE(review): nesting reconstructed from whitespace-collapsed source;
    # confirm the release-details lookup belongs under this branch.
    release_title = document.find('h3', 'release-title')
    if release_title:
        album = release_title
        details = document.find('p', {'class': 'release-details'})
        if details:
            info['release'] = convert(details.string)

    if not artist:
        artist = document.find('h3', 'release-artist')

    names = {'artist': convert(artist.string)}
    if album is None:
        names['album'] = ''
    else:
        names['album'] = convert(album.string)
    info.update(names)
    info['albumartist'] = info['artist']

    sidebar = document.find('div', {'class': 'sidebar'})
    info.update(parse_sidebar(sidebar))
    info.update(convert_year(info))

    content = document.find('section', {'class': 'review read-more'})
    if content:
        info.update(parse_review(content))

    # Parsing of the "similar-albums" gallery (parse_similar) is disabled
    # in the original source.

    # Rename keys through spanmap and drop empty values.
    renamed = {}
    for key, value in info.iteritems():
        if not isempty(value):
            renamed[spanmap.get(key, key)] = value
    info = renamed

    tracks = parse_tracks(document, info)
    return [info, tracks]
def search(self, album, artists):
    """Look up ``album``, first by a stored album ID and then by name.

    ``artists`` is either a sequence of artist names or a mapping whose
    keys are artist names and whose values are concatenated into one
    track list (presumably artist -> tracks; verify against the caller).
    More than one artist is treated as u'Various Artists'.

    When ``self._useid`` is set and ``artists`` is a mapping, the tracks
    are scanned for one of the known AMG id fields; if an id is found the
    result of ``self.keyword_search(u':id <id>')`` is returned.  If that
    raises OldURLError, the search falls back to the album name.

    Raises RetrievalError when ``album`` is empty or when fetching the
    search page fails with ``urllib2.URLError``.

    NOTE(review): the visible body ends once the search page has been
    fetched, so ``ret``, ``artist`` and ``searchpage`` are never read and
    the name-search path returns None -- confirm whether the remainder of
    this method is missing.
    """
    ret = []

    several_artists = len(artists) > 1
    if several_artists:
        artist = u'Various Artists'
    elif hasattr(artists, 'items'):
        artist = artists.keys()[0]
    else:
        artist = artists[0]

    use_stored_id = self._useid and hasattr(artists, 'values')
    if use_stored_id:
        # Flatten every artist's track list into a single list.
        tracks = []
        for per_artist in artists.values():
            tracks.extend(per_artist)

        # Try the id fields in order of preference; stop at the first hit.
        for field in (
                'amg_rovi_id',
                'amg_pop_id',
                'amgsqlid',
                'amg_album_id',
        ):
            album_id = find_id(tracks, field)
            if album_id:
                break

        if not isempty(album_id):
            write_log(u'Found Album ID %s' % album_id)
            try:
                return self.keyword_search(u':id %s' % album_id)
            except OldURLError:
                write_log("Invalid URL used. Doing normal search.")

    if not album:
        raise RetrievalError('Album name required.')

    write_log(u'Searching for %s' % album)
    try:
        # Bare name: this is the module-level search(), not this method.
        searchpage = search(album)
    except urllib2.URLError as e:
        error_text = unicode(e)
        write_log(u'Error: While retrieving search page %s' % error_text)
        raise RetrievalError(error_text)
def parse_track(tr, fields, performance_title=None):
    """Parse one track-listing row into a dictionary of track fields.

    ``tr`` is the row element and ``fields`` gives one field name per
    ``td`` cell, in order.  A ``None`` entry means "use the cell's own
    class attribute"; the pick/sample/stream columns are skipped.

    If the row's class is 'perfomance-title' the converted row text is
    returned instead of a dictionary.  Otherwise the returned dictionary
    has its keys renamed through ``spanmap``; entries whose renamed key is
    falsy or whose value is empty (per ``isempty``) are dropped.  When
    ``performance_title`` is given it is prefixed to the track title, and
    'artist' defaults to 'performer' when absent.
    """
    if tr.element.attrib.get('class') == 'perfomance-title':
        return convert(tr.string)

    skipped = {'pick-prefix', 'sample', 'stream', 'pick-suffix'}
    track = {}

    for td, field in zip(tr.find_all('td'), fields):
        if field in skipped:
            continue
        if field is None:
            field = td.element.attrib.get('class')
            if not field:
                continue

        divs = td.find_all('div')
        if not divs:
            track[field] = convert(td.string)
            continue

        # NOTE(review): nesting reconstructed from whitespace-collapsed
        # source; the final assignment is taken to run for every div.
        for div in divs:
            div_class = div.element.attrib['class']
            target = div_class
            if field == 'performer':
                if div_class == 'primary':
                    target = field
                elif div_class == 'featuring':
                    # Append the featured artist to the performer so far;
                    # the value is also stored under 'featuring' below.
                    track[field] = u'%s %s' % (track.get(
                        field, u''), convert(div.string))
                else:
                    target = 'composer'
            track[target] = convert(div.string)

    if performance_title and 'title' in track:
        track['title'] = performance_title + u': ' + track['title']
    if 'artist' not in track and 'performer' in track:
        track['artist'] = track['performer']

    result = {}
    for key, value in track.iteritems():
        mapped = spanmap.get(key, key)
        if mapped and not isempty(value):
            result[mapped] = value
    return result
def parse_track(tr, fields, performance_title=None):
    """Parse one track-listing row into a dictionary of track fields.

    ``tr`` is the row element and ``fields`` gives one field name per
    ``td`` cell, in order.  A ``None`` entry means "use the cell's own
    class attribute"; the pick/sample/stream columns are skipped.

    If the row's class is 'perfomance-title' the converted row text is
    returned instead of a dictionary.  Otherwise the returned dictionary
    has its keys renamed through ``spanmap``; entries whose renamed key is
    falsy or whose value is empty (per ``isempty``) are dropped.  When
    ``performance_title`` is given it is prefixed to the track title, and
    'artist' defaults to 'performer' when absent.
    """
    row_class = tr.element.attrib.get('class')
    if row_class == 'perfomance-title':
        return convert(tr.string)

    track = {}
    ignore = set(('pick-prefix', 'sample', 'stream', 'pick-suffix'))

    for td, field in zip(tr.find_all('td'), fields):
        if field in ignore:
            continue
        if field is None:
            field = td.element.attrib.get('class')
            if not field:
                continue

        sub_fields = td.find_all('div')
        if sub_fields:
            # NOTE(review): nesting reconstructed from whitespace-collapsed
            # source; the final assignment is taken to run for every div.
            for div in sub_fields:
                sub_field = div.element.attrib['class']
                is_performer = field == 'performer'
                if is_performer and sub_field == 'primary':
                    sub_field = field
                elif is_performer and sub_field == 'featuring':
                    # Append the featured artist to the performer so far;
                    # the value is also stored under 'featuring' below.
                    so_far = track.get(field, u'')
                    track[field] = u'%s %s' % (so_far, convert(div.string))
                elif is_performer:
                    sub_field = 'composer'
                track[sub_field] = convert(div.string)
        else:
            track[field] = convert(td.string)

    if performance_title and 'title' in track:
        track['title'] = performance_title + u': ' + track['title']
    if 'artist' not in track and 'performer' in track:
        track['artist'] = track['performer']

    renamed = {}
    for key, value in track.iteritems():
        new_key = spanmap.get(key, key)
        if not new_key or isempty(value):
            continue
        renamed[new_key] = value
    return renamed