def get_abstract_from_bbc(artist):
    """
    Populate CachedArtist with a short Wikipedia abstract from the BBC API.

    BBC provides abstracts only for artists, so we skip releases.

    @param artist: a CachedArtist object

    @return: a dictionary with an abstract structure
    """
    abstract = {}
    if artist._doc_type == 'CachedArtist' and 'bbc' not in artist.cache_state:
        try:
            t = mmda_logger('bbc','request','abstract',artist.get_id)
            xml = urlopen("http://www.bbc.co.uk/music/artists/%s/wikipedia.xml" % artist.get_id, timeout=ABSTRACT_TIMEOUT).read()
            xmlSoup = BeautifulStoneSoup(xml, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
            abstract = {
                    'content':xmlSoup.wikipedia_article.content.text,
                    'url':xmlSoup.wikipedia_article.url.text,
                    'lang':'en',
                    'provider':'Wikipedia'
                    }
        except Exception, e:
            mmda_logger('bbc','ERROR',e)
            cache_state = 0
        else:
            mmda_logger('bbc','result','found',abstract['url'],t)
            cache_state = 1
        artist.cache_state['bbc'] = [cache_state,datetime.utcnow()]
        artist.changes_present = True
    return abstract
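
# Shape of the abstract structure built above (field values illustrative,
# not taken from a real response):
#
#   {'content': u'Tool is an American rock band from Los Angeles...',
#    'url': 'http://en.wikipedia.org/wiki/Tool_(band)',
#    'lang': 'en',
#    'provider': 'Wikipedia'}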
def _get_myspace_id(profile_url):
    """
    Return Myspace user id. Be smart, and stop the download as soon as the ID is found.

    @param profile_url: a string with a Myspace profile URL

    @return: a string with a user id
    """
    BUFFERSIZE = 2048
    id = None
    re_myspaceid = re.compile(r"blogs.myspace.com/index.cfm\?fuseaction=blog.ListAll&friendId=(?P<friend_id>\d+)")
    t = mmda_logger('mspc','request','find ID',profile_url)
    try:
        usock = HTTP_OPENER.open(profile_url, timeout=SEARCH_TIMEOUT)
        while 1:
            buffer = usock.read(BUFFERSIZE)
            r = re_myspaceid.search(buffer)
            if r and r.groups():
                id = r.groups()[0]
                break
            if len(buffer) < BUFFERSIZE:
                break
        usock.close()
    except Exception, e:
        mmda_logger('myspace','ERROR',e)
    mmda_logger('mspc','result','myspace ID',id,t)
    return id
def _get_news_sources(artist):
    """
    Find RSS/Atom feeds available for an artist.

    @param artist: a CachedArtist object

    @return: a list of strings with URLs
    """
    sources = []
    future_calls = []
    if 'Myspace' in artist.urls:
        if 'myspace_id' not in artist:
            myspace_profile = artist.urls['Myspace'][0]
            myspace_id = _get_myspace_id(myspace_profile)
            artist.myspace_id = myspace_id
            artist.changes_present = True
        if 'myspace_id' in artist and artist.myspace_id:
            myspace_blog_feed = "http://blogs.myspace.com/Modules/BlogV2/Pages/RssFeed.aspx?friendID=%s" % artist.myspace_id
            sources.append(myspace_blog_feed)
    t = mmda_logger('www','request','find feeds',artist.name)
    for source_type in LOOK_FOR_FEEDS:
        if source_type in artist.urls:
            future_calls = [Future(_get_feed_link_for,url) for url in artist.urls[source_type]]
            sources.extend(list(set([url() for url in future_calls if url()])))
    mmda_logger('www','result','found feeds',len(sources),t)
    return [(src,None,None) for src in sources]
def save_any_changes(self):
    """
    Store the document in the database if it is marked as 'changes_present'.
    """
    if 'changes_present' in self:
        del self.changes_present
        self.save()
        mmda_logger('db','store',self)
def get_basic_artist(mbid):
    """
    Make sure a basic artist document is present and contains the required data.

    @param mbid: a string containing a MusicBrainz ID of an artist

    @return: a CachedArtist object containing the required minimal data set
    """
    # TODO: handle Various Artists' artist (VARIOUS_ARTISTS_ID)
    try:
        artist = CachedArtist.get(mbid)
        mmda_logger('db','present',artist._doc_type, artist.get_id)
    except ResourceNotFound:
        artist = _create_mb_artist(mbid)
    return artist
def populate_release_lastfm(release_group, release_mbid):
    """
    Make sure all required and available last.fm data is present in a CachedReleaseGroup document.

    @param release_group: a CachedReleaseGroup object
    @param release_mbid: a string containing a MusicBrainz ID of a release

    @return: a validated/updated CachedReleaseGroup object
    """
    #if release_group.cache_state['lastfm'][0] == 0:
    release = release_group.releases[release_mbid]
    if 'lastfm' not in release_group.cache_state:
        lastfm = pylast.get_lastfm_network(api_key = settings.LASTFM_API_KEY)
        lastfm.enable_caching()
        try:
            t = mmda_logger('last','request','release-data',release_mbid)
            lastfm_album = lastfm.get_album_by_mbid(release_mbid)

            lastfm_abstract = None
            lastfm_cover = None
            lastfm_url = lastfm_album.get_url()

            if 'abstract' not in release_group:
                lastfm_abstract = Future(lastfm_album.get_wiki_summary)

            if 'cover' not in release:
                lastfm_cover = lastfm_album.get_cover_image()

            # wait for the Future
            if 'abstract' not in release_group:
                lastfm_abstract()

            mmda_logger('last','result','release-data',release_mbid,t)
        except Exception, e:
            mmda_logger('pylast','ERROR',e)
        else:
            if 'urls' not in release:
                release['urls'] = {}
            release['urls']['Last.fm'] = [lastfm_url]

            if lastfm_abstract and lastfm_abstract():
                release_group.abstract = {'content':strip_tags(lastfm_abstract()), 'lang':'en', 'provider':'Last.fm', 'url':lastfm_url}

            if lastfm_cover:
                release['cover'] = lastfm_cover

            # TODO: when to save? when failed, do we retry?
            release_group.cache_state['lastfm'] = [1,datetime.utcnow()]
            release_group.changes_present = True
def get_basic_cached_search_result(query_type, query_string):
    """
    Make sure a proper CachedSearchResult is present and return its id.

    The method performs a local, then an optional remote (MusicBrainz) lookup of the query result.

    @param query_type: a string containing the query type
    @param query_string: a string containing the query

    @return: a string containing the SHA1 hash of the query string (the ID of a CachedSearchResult document)
    """
    query_id = hashlib.sha1((query_type+query_string).encode('utf-8')).hexdigest()
    search_result = CachedSearchResult.get_or_create(query_id)
    search_result.query_string = query_string
    search_result.query_type = query_type
    if 'mb' not in search_result.cache_state: # TODO: add 14-day window check
        try:
            t = mmda_logger('mb','request','search for',query_string)
            if query_type == 'artist':
                filter = ws.ArtistFilter(name=query_string,limit=RESULTS_LIMIT)
                results = mb_query.getArtists(filter) # TODO: add try, or maybe better in 'create_search' as a global wrapper
                search_result.results = [ {'name':r.artist.name,
                                           'mbid':extractUuid(r.artist.id),
                                           'score':r.score,
                                           'note':r.artist.disambiguation}
                                          for r in results ]
            elif query_type == 'release':
                filter = ws.ReleaseFilter(title=query_string,limit=RESULTS_LIMIT)
                results = mb_query.getReleases(filter) # TODO: add try, or maybe better in 'create_search' as a global wrapper
                search_result.results = [ {'artist':r.release.artist.name,
                                           'title':r.release.title,
                                           'mbid':extractUuid(r.release.id),
                                           'artist_mbid':extractUuid(r.release.artist.id),
                                           'score':r.score,
                                           'tracks_count':r.release.tracksCount,
                                           'year':r.release.getEarliestReleaseEvent().getDate() if r.release.getEarliestReleaseEvent() else None}
                                          for r in results ]
            elif query_type == 'tag':
                # TODO: refactor to other packages
                import pylast
                lastfm = pylast.get_lastfm_network(api_key = settings.LASTFM_API_KEY)
                lastfm_similar_tags = lastfm.search_for_tag(query_string).get_next_page()
                search_result.results = [ tag.name for tag in lastfm_similar_tags ]
        except Exception, e:
            # TODO: hard error here
            mmda_logger('search','ERROR',e)
            raise e
        else:
            mmda_logger('mb','result','results',len(search_result.results),t)
            search_result.cache_state['mb'] = [1,datetime.utcnow()]
            search_result.save()
    return query_id
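
# Minimal sketch (not part of the original module) of the cache-key scheme
# above: hashing the concatenated query type and query string yields a
# stable document id, so identical searches reuse one CachedSearchResult.
def _example_query_id():
    import hashlib
    a = hashlib.sha1((u'artist' + u'Tool').encode('utf-8')).hexdigest()
    b = hashlib.sha1((u'artist' + u'Tool').encode('utf-8')).hexdigest()
    assert a == b and len(a) == 40 # same query -> same 40-char hex id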
def _create_mb_artist(mbid):
    """
    Fetch basic metadata and store it as a CachedArtist document.

    @param mbid: a string containing a MusicBrainz ID of an artist

    @return: a CachedArtist object with basic MusicBrainz data
    """
    try:
        t = mmda_logger('mb','request','artist',mbid)
        mb_artist = mb_query.getArtistById(mbid, MB_ARTIST_INCLUDES)
        mmda_logger('mb','result', 'artist',mb_artist.name,t)
    except WebServiceError, e:
        # TODO: hard error page here
        # TODO: 404 not found -> redirect to a different page? conditional?
        # TODO: HTTP Error 503: Service Temporarily Unavailable -> special case: please wait a few seconds and hit F5
        mmda_logger('mb-artist','ERROR',e)
        raise e
    else:
        artist = CachedArtist.get_or_create(mbid)
        artist = _populate_artist_mb(artist, mb_artist)
        artist.cache_state['mb'] = [1,datetime.utcnow()]
        artist.save()
        mmda_logger('db','store',artist)
        # since we have some basic release data fetched with the artist, store it
        _create_shallow_releases_mb(mb_artist)
        # TODO: think about genres and origin - fetch from freebase(?)
        # freebase.mqlread({"type":"/music/artist", "limit": 1, "key": [{"namespace" : '/authority/musicbrainz',"value" : '579ef111-19dd-4ae8-ad50-d5fa435472b9'}], "genre":[], "origin":None} )
    return artist
def get_abstract_from_dbpedia(artist_or_releasegroup):
    """
    Populate CachedArtist or CachedReleaseGroup with a short abstract.

    @param artist_or_releasegroup: a CachedArtist or CachedReleaseGroup object

    @return: a dictionary with an abstract structure
    """
    abstract = {}
    # if artist_or_releasegroup is a ReleaseGroup, we look for a release with a Wikipedia URL
    # TODO: check performance, and if better - replace in other parts
    # TODO: DRY: refactor
    if 'dbpedia' not in artist_or_releasegroup.cache_state:
        wiki_resource = None
        cache_state = 0
        if 'releases' in artist_or_releasegroup:
            for release in artist_or_releasegroup['releases'].itervalues():
                if 'urls' in release and 'Wikipedia' in release['urls']:
                    wiki_resource, wiki_lang, wiki_url = find_best_wikipedia_resource(release['urls']['Wikipedia'])
        elif 'urls' in artist_or_releasegroup and 'Wikipedia' in artist_or_releasegroup['urls']:
            wiki_resource, wiki_lang, wiki_url = find_best_wikipedia_resource(artist_or_releasegroup['urls']['Wikipedia'])

        if wiki_resource:
            store = surf.Store(reader = "sparql_protocol", endpoint = "http://dbpedia.org/sparql")
            session = surf.Session(store)
            sparql_query = "SELECT ?abstract WHERE {{ <http://dbpedia.org/resource/%s> <http://dbpedia.org/property/abstract> ?abstract FILTER langMatches( lang(?abstract), '%s') } }" % (wiki_resource, wiki_lang)
            try:
                t = mmda_logger('wiki','request','abstract',wiki_resource)
                # TODO: timeout?
                sparql_result = session.default_store.execute_sparql(sparql_query) # TODO: error handling
                mmda_logger('wiki','result','found',len(sparql_result['results']['bindings']),t)
                if sparql_result['results']['bindings'][0]['abstract']:
                    abstract = {'content':unicode(sparql_result['results']['bindings'][0]['abstract']),
                                'url':wiki_url,
                                'lang':wiki_lang,
                                'provider':'Wikipedia'}
                # TODO: add cache_status dbpedia
            except Exception, e:
                # TODO: handle it?
                mmda_logger('surf-dbpedia','ERROR',e)
            else:
                cache_state = 1
        artist_or_releasegroup.cache_state['dbpedia'] = [cache_state,datetime.utcnow()]
        artist_or_releasegroup.changes_present = True
    return abstract
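
# Hedged sketch (not part of the original module): the same abstract lookup
# without SuRF, talking to the public SPARQL endpoint directly over the
# standard SPARQL protocol. Resource and language values are illustrative.
def _example_dbpedia_abstract(resource='Radiohead', lang='en'):
    import urllib, urllib2, json
    query = ("SELECT ?abstract WHERE { <http://dbpedia.org/resource/%s> "
             "<http://dbpedia.org/property/abstract> ?abstract "
             "FILTER langMatches(lang(?abstract), '%s') }" % (resource, lang))
    params = urllib.urlencode({'query': query, 'format': 'application/sparql-results+json'})
    response = urllib2.urlopen("http://dbpedia.org/sparql?" + params, timeout=10)
    bindings = json.load(response)['results']['bindings']
    return bindings[0]['abstract']['value'] if bindings else None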
def populate_artist_pictures_flickr(artist_pictures):
    """
    Make sure the desired Flickr pictures are present in a CachedArtistPictures document.

    @param artist_pictures: a CachedArtistPictures object

    @return: a validated/updated CachedArtistPictures object
    """
    # TODO: cache flickr only for a day?
    if 'flickr' not in artist_pictures.cache_state:
        flickr = flickrapi.FlickrAPI(settings.FLICKR_API_KEY, cache=True)
        flickr.cache = cache
        # TODO: use artist aliases as alternative tags? (what about tag_mode?)
        artist_tags = artist_pictures.artist_name
        if 'artist_aliases' in artist_pictures:
            artist_tags += ',' + ','.join(artist_pictures.artist_aliases)
        includes = 'owner_name, url_sq, url_o'
        licenses = '1,2,3,4,5,6,7' # http://www.flickr.com/services/api/flickr.photos.licenses.getInfo.html
        data_walker = flickr.walk(tag_mode='any',tags=artist_tags.lower(),media='photos',license=licenses,extras=includes,per_page=FLICKR_LIMIT)
        # TODO: make two walks: the first one with events from last.fm, the second if the result list is shorter than FLICKR_LIMIT
        flickr_photos = []
        try:
            t = mmda_logger('flkr','request','artist pictures',artist_pictures._id)
            for i in xrange(FLICKR_LIMIT):
                f_photo = data_walker.next()
                photo = {'owner_id':f_photo.get('owner'),
                         'id':f_photo.get('id'),
                         'title':f_photo.get('title'),
                         'sq':f_photo.get('url_sq'),
                         'big':f_photo.get('url_o'),
                         'owner':f_photo.get('ownername')}
                flickr_photos.append(photo)
            mmda_logger('flkr','result','found pictures',len(flickr_photos), t)
        except Exception, e:
            mmda_logger('flickrapi','ERROR',e)
        if flickr_photos:
            artist_pictures.flickr = flickr_photos
            artist_pictures.cache_state['flickr'] = [2,datetime.utcnow()]
            artist_pictures.changes_present = True
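
# Note (assumption, for illustration only): the fields stored above should be
# enough to rebuild a photo's Flickr page URL in the usual form:
#
#   http://www.flickr.com/photos/<owner_id>/<id>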
def get_basic_release(mbid):
    """
    Make sure the release and its dependencies are present and contain the required data.

    @param mbid: a string containing a MusicBrainz ID of a release

    @return: a CachedReleaseGroup object containing the required minimal data set
    """
    release_group = CachedReleaseGroup.view('artists/releases',include_docs=True, key=mbid).one()
    if not release_group:
        # TODO: optimize? it's just one additional request on rare occasions though..
        try:
            t = mmda_logger('mb','request','artist mbid of release',mbid)
            mb_release = mb_query.getReleaseById(mbid, MB_RELEASE_ARTIST)
            artist_mbid = extractUuid(mb_release.artist.id)
            mmda_logger('mb','result','artist mbid',artist_mbid,t)
        except WebServiceError, e:
            # TODO: add error handling here
            mmda_logger('mb-release','ERROR',e)
            raise e
        else:
            get_basic_artist(artist_mbid)
            release_group = CachedReleaseGroup.view('artists/releases',include_docs=True, key=mbid).one()
    else:
        mmda_logger('db','present',release_group._doc_type, release_group.get_id)
    return release_group
def populate_tag_lastfm(tag):
    """
    Make sure all available last.fm data is present in a CachedTag document.

    @param tag: a CachedTag object

    @return: a validated/updated CachedTag object
    """
    # TODO: when expire?
    if 'lastfm' not in tag.cache_state:
        lastfm = pylast.get_lastfm_network(api_key = settings.LASTFM_API_KEY)
        lastfm.enable_caching()
        try:
            t = mmda_logger('last','request','tag-artists',tag.get_id)
            lastfm_tag = lastfm.get_tag(tag.get_id.replace('-',' ')) # TODO: this is an ugly fix, make it pretty
            lastfm_artists = _lastfm_get_tag_artists_optimized(lastfm_tag)
            mmda_logger('last','result','found',len(lastfm_artists),t)
        except Exception, e:
            mmda_logger('pylast','ERROR',e)
        else:
            if lastfm_artists:
                tag.artists = lastfm_artists
            tag.cache_state['lastfm'] = [1,datetime.utcnow()]
            tag.changes_present = True
def populate_artist_pictures_lastfm(artist_pictures):
    """
    Make sure all available last.fm data is present in a CachedArtistPictures document.

    @param artist_pictures: a CachedArtistPictures object

    @return: a validated/updated CachedArtistPictures object
    """
    if 'lastfm' not in artist_pictures.cache_state or artist_pictures.cache_state['lastfm'][0] == 1:
        lastfm = pylast.get_lastfm_network(api_key = settings.LASTFM_API_KEY)
        lastfm.enable_caching()
        try:
            t = mmda_logger('last','request','artist pictures',artist_pictures._id)
            lastfm_artist = lastfm.get_artist_by_mbid(artist_pictures._id)
            lastfm_images = lastfm_artist.get_images(order=pylast.IMAGES_ORDER_POPULARITY,limit=LASTFM_PICTURE_LIMIT)
            # TODO: add last.fm event info, which can be used as a tag in the Flickr search
            mmda_logger('last','result','found pictures',len(lastfm_images),t)
        except Exception, e:
            mmda_logger('pylast','ERROR',e)
        else:
            if lastfm_images:
                artist_pictures.lastfm = [ {'sq':i.sizes.largesquare, 'big':i.sizes.original, 'url':i.url, 'title':i.title} for i in lastfm_images]
            artist_pictures.cache_state['lastfm'] = [2,datetime.utcnow()]
            artist_pictures.changes_present = True
def get_basic_artist_videos(mbid):
    """
    Make sure the document and its dependencies are present and contain the required data.

    @param mbid: a string containing a MusicBrainz ID of an artist

    @return: a CachedArtistVideos object containing the minimal data set
    """
    try:
        artist_videos = CachedArtistVideos.get(mbid)
        if 'artist_name' not in artist_videos:
            raise ResourceNotFound
    except ResourceNotFound:
        # overhead, but in most cases the artist page
        # is a place where the user will go next anyway
        artist = get_basic_artist(mbid)
        # TODO: just an idea: create a view that stores only names and aliases?
        artist_videos = CachedArtistVideos.get_or_create(mbid)
        artist_videos.artist_name = artist.name
        if 'aliases' in artist:
            artist_videos.artist_aliases = list(artist.aliases)
        artist_videos.save()
        mmda_logger('db','store', artist_videos)
    return artist_videos
def populate_artist_videos_youtube(artist_videos):
    """
    Make sure all available YouTube meta-data is present in a CachedArtistVideos document.

    @param artist_videos: a CachedArtistVideos object

    @return: a validated/updated CachedArtistVideos object
    """
    # TODO: expire in one week?
    if "youtube" not in artist_videos.cache_state:
        youtube_videos = []
        yt_service = yts.YouTubeService()
        artist = get_basic_artist(artist_videos._id)
        try:
            t = mmda_logger("yt", "request", "artist-videos", artist_videos.artist_name)
            # check if the artist has a dedicated YouTube channel
            if "urls" in artist and "Youtube" in artist.urls:
                artist_videos.youtube_channel = artist.urls["Youtube"][0]
                youtube_id = _get_youtube_id(artist)
                feed = yt_service.GetYouTubeVideoFeed(
                    "http://gdata.youtube.com/feeds/api/users/%s/uploads" % youtube_id
                )
            # if there is no official channel, make a search query
            else:
                query = yts.YouTubeVideoQuery()
                query.orderby = "relevance"
                query.racy = "exclude"
                query.max_results = YOUTUBE_MAX_RESULTS
                query.categories.append("Music")
                # 'bug' workaround (http://bugs.python.org/issue1712522)
                query.vq = artist_videos.artist_name.encode("utf-8", "/")
                # TODO: aliases? atm they seem to lower the result quality
                # query.vq = u"Múm OR Múm OR mum OR múm".encode('utf-8', '/')
                feed = yt_service.YouTubeQuery(query)
        except Exception, e:
            mmda_logger("yt-search", "ERROR", e)
            # raise Http500
        else:
            mmda_logger("yt", "result", "artist-videos", len(feed.entry), t)
            for entry in feed.entry:
                try:
                    video = {
                        "title": entry.media.title.text,
                        "duration": entry.media.duration.seconds,
                        "url": entry.media.player.url,
                        "player": entry.GetSwfUrl(),
                        "thumb": entry.media.thumbnail[0].url,
                    }
                # sometimes the objects we get are wicked -- we reject them,
                # e.g. when an official channel contains videos blocked in some regions
                # example: http://www.youtube.com/user/dreamtheater
                # TODO: what if there are no videos left?
                # example: http://127.0.0.1:8000/artist/the-beatles/videos/b10bbbfc-cf9e-42e0-be17-e2c3e1d2600d/
                except (NameError, AttributeError):
                    continue
                else:
                    youtube_videos.append(video)
            if youtube_videos:
                artist_videos.youtube = youtube_videos
                artist_videos.cache_state["youtube"] = [1, datetime.utcnow()]
                artist_videos.changes_present = True
                mmda_logger("db", "store", artist_videos)
def populate_artist_news(news, artist):
    """
    Fetch and cache news feed entries for an artist.

    @param news: a cached news document with a 'sources' mapping
    @param artist: a CachedArtist object
    """
    # TODO: add a condition that searches for new feeds every 7 days?
    if news.sources:
        mmda_logger('news','present','cached',len(news.sources.keys()))
        """
        Sad story about ETag and Last-Modified Headers:
        SOME SITES JUST DON'T GIVE A DAMN. The End.

        Example: Myspace feeds have no ETag or L-M headers (17/Apr/2010)

        That is why MMDA uses a 'cache' field to separately limit the number
        of HTTP requests to such handicapped feeds.

        reference: http://www.feedparser.org/docs/http-etag.html
        """
        now = datetime.utcnow()
        pending_sources = []
        for source_url in news.sources.keys():
            source = news.sources[source_url]
            etag = source['etag'] if source.has_key('etag') else None
            last_modified = source['modified'].utctimetuple() if source.has_key('modified') else None
            feed_is_smart = etag or last_modified
            cache_time = (now - source['cache']).seconds
            if (feed_is_smart and cache_time > SMART_FEED_CACHE) or cache_time > FEED_CACHE:
                pending_sources.append((source_url, etag, last_modified))
    else:
        # if there are no cached feeds
        pending_sources = _get_news_sources(artist)

    if pending_sources:
        try:
            t = mmda_logger('news','request','to check',len(pending_sources))
            future_calls = [Future(_get_fetched_and_parsed_feed,source) for source in pending_sources]
            fetched_feeds = [future_obj() for future_obj in future_calls if timeout(future_obj,t=20)]
        except Exception, e:
            mmda_logger('feed-fetch','ERROR',e)
        else:
            for feed in fetched_feeds:
                try:
                    if feed.status == 304:
                        mmda_logger('news','present','no updates',feed.href)
                        news.sources[feed.href]['cache'] = datetime.utcnow()
                        news.changes_present = True
                        continue
                    elif feed.status == 404:
                        mmda_logger('news','present','404',feed.href)
                        del news.sources[feed.href]
                        continue
                    else:
                        feed_src = urlparse.urlsplit(feed.feed.link).netloc.replace('www.','')
                        mmda_logger('news','present','fetched',feed_src)
                        feed_entries = [{
                                'title': e.title.strip() if e.has_key('title') else None,
                                'summary': e.summary if e.has_key('summary') else None,
                                'date': datetime(*e.updated_parsed[0:6]),
                                'url': e.link
                                } for e in feed.entries]
                        news_source = {
                                'url': feed.feed.link,
                                'name': feed_src,
                                'items': feed_entries,
                                'cache': datetime.utcnow()
                                }
                        if feed.has_key('modified') and feed.modified:
                            news_source['modified'] = datetime(*feed.modified[0:6])
                        if feed.has_key('etag') and feed.etag:
                            news_source['etag'] = feed.etag
                except Exception, e:
                    # some feeds may be badly formatted and sometimes FeedFinder may fail;
                    # some Myspace feeds have no entries... trying
                    # to predict all possible failures is pointless.
                    # in such cases MMDA just jumps to the next feed
                    mmda_logger('feed-parse','ERROR',e)
                    continue
                else:
                    news.sources[feed.href] = news_source
                    news.changes_present = True
            mmda_logger('news','result','got results',len(pending_sources),t)
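
# Hedged sketch (not part of the original module) of the conditional fetch
# that the cached ETag / Last-Modified values above enable: feedparser
# forwards them, and a well-behaved server answers HTTP 304 so the feed
# body is not downloaded again.
def _example_conditional_fetch(url, etag=None, last_modified=None):
    import feedparser
    feed = feedparser.parse(url, etag=etag, modified=last_modified)
    if getattr(feed, 'status', None) == 304:
        return None # nothing changed since the last fetch -- keep the cached entries
    return feed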
def populate_artist_lastfm(artist):
    """
    Make sure all required and available last.fm data is present in a CachedArtist document.

    @param artist: a CachedArtist object

    @return: a validated/updated CachedArtist object
    """
    if 'lastfm' not in artist.cache_state:
        lastfm = pylast.get_lastfm_network(api_key = settings.LASTFM_API_KEY)
        lastfm.enable_caching()
        try:
            t = mmda_logger('last','request','artist-data',artist._id)
            lastfm_artist = lastfm.get_artist_by_mbid(artist._id)
            # TODO: run these in parallel (?)
            lastfm_images = Future(lastfm_artist.get_images,pylast.IMAGES_ORDER_POPULARITY,5)
            lastfm_url = Future(lastfm_artist.get_url)
            # we get similar artists from the last.fm database, but only those with an mbid (omg, omg)
            # TODO: think about the numbers of fetched things
            lastfm_similar = Future(_lastfm_get_similar_optimized,lastfm_artist,10)
            lastfm_tags = Future(lastfm_artist.get_top_tags,10)
            lastfm_abstract = None
            if 'abstract' not in artist:
                lastfm_abstract = lastfm_artist.get_bio_summary()
            # wait for all Futures to come ;-)
            lastfm_url()
            lastfm_tags()
            lastfm_images()
            lastfm_similar()
            mmda_logger('last','result','artist-data',artist._id,t)
        except Exception, e:
            mmda_logger('pylast','ERROR',e)
        else:
            # TODO: make it compatible with tags imported from mb (TODO2: add tags from MusicBrainz)
            # TODO: remove random?
            import random
            random.shuffle(lastfm_tags())
            if lastfm_abstract:
                artist.abstract = {'content':strip_tags(lastfm_abstract), 'lang':'en', 'provider':'Last.fm', 'url':lastfm_url()}
            artist.tags = [(t.item.name.lower(), int( float(t.weight)/(float(100)/float(4)) )) for t in lastfm_tags()]
            artist.similar = lastfm_similar()
            artist.urls['Last.fm'] = [lastfm_url()]
            # TODO: optimize
            if lastfm_images():
                artist_pictures = CachedArtistPictures.get_or_create(artist._id)
                if 'lastfm' not in artist_pictures:
                    artist_pictures.artist_name = artist.name
                    artist_pictures.lastfm = [ {'sq':i.sizes.largesquare, 'big':i.sizes.original, 'url':i.url, 'title':i.title} for i in lastfm_images()]
                    artist_pictures.cache_state['lastfm'] = [1,datetime.utcnow()]
                    artist_pictures.save()
                    mmda_logger('db','store',artist_pictures)
        # if it fails, store the state too -- to avoid future attempts
        artist.cache_state['lastfm'] = [1,datetime.utcnow()]
        artist.changes_present = True
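
# Minimal sketch (not part of the original module) of the Future pattern used
# above, assuming the same Future helper this module already imports:
# Future(func, *args) starts func in the background, and calling the returned
# object blocks until the result is ready, so independent network lookups
# overlap instead of running serially.
def _example_parallel_fetch():
    import urllib2
    fetch = lambda url: urllib2.urlopen(url, timeout=10).read()
    pages = [Future(fetch, url) for url in ('http://example.com/', 'http://example.org/')]
    return [page() for page in pages] # each call blocks until its download finishes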
def _populate_deep_release_mb(release_group,release_mbid):
    """
    Make sure the ReleaseGroup contains additional, detailed information about the specified release.

    @param release_group: a CachedReleaseGroup object
    @param release_mbid: a string containing a MusicBrainz ID of a release

    @return: a verified/updated CachedReleaseGroup object
    """
    release = release_group.releases[release_mbid]
    if release['cache_state']['mb'][0] == 1:
        # TODO: remove unused includes
        try:
            t = mmda_logger('mb','request','release',release_mbid)
            mb_release = mb_query.getReleaseById(release_mbid, MB_RELEASE_INCLUDES)
            mmda_logger('mb','result','release',mb_release.title,t)
        except WebServiceError, e:
            # TODO: hard error here
            mmda_logger('mb-release','ERROR',e)
            raise e
        else:
            # make sure the mbid of the artist is present
            if 'artist_mbid' not in release_group:
                release_group.artist_mbid = extractUuid(mb_release.artist.id)

            # TRACK LISTING
            # TODO: think about the duration representation here
            tracks = []
            for mb_track in mb_release.tracks:
                track = {'title':mb_track.title, 'mbid':extractUuid(mb_track.id)}
                if mb_track.duration:
                    track['duration'] = humanize_duration(mb_track.duration)
                tracks.append(track)
            release['tracks'] = tracks

            # URL relations
            urls = {}
            for relation in mb_release.getRelations(Relation.TO_URL):
                relation_type = decruft_mb(relation.type)
                if relation_type not in urls:
                    urls[relation_type] = []
                urls[relation_type].append(relation.targetId)
            # urls is used in many places, so it's handy to have it ready
            release['urls'] = urls

            # CREDIT relations
            credits = [{'type':decruft_mb(r.type), 'mbid':extractUuid(r.targetId), 'name':r.target.name} for r in mb_release.getRelations(Relation.TO_ARTIST)]
            if credits:
                release['credits'] = credits

            # MULTI-DISC-RELEASE information
            remasters = []
            for relation in mb_release.getRelations(Relation.TO_RELEASE):
                relation_type = decruft_mb(relation.type)
                linked_release = {'mbid':extractUuid(relation.targetId), 'title':relation.target.title}
                if relation_type == 'PartOfSet':
                    if relation.direction == 'backward':
                        release['set_prev'] = linked_release
                    else:
                        release['set_next'] = linked_release
                elif relation_type == 'Remaster':
                    if relation.direction == 'backward':
                        remasters.append(linked_release)
                    else:
                        release['remaster_of'] = linked_release
            if remasters:
                release['remasters'] = remasters

            release['cache_state']['mb'] = [2,datetime.utcnow()]
            release_group = _perform_cover_lookup_on_mb_data(release_group, release_mbid)
            release_group.changes_present = True
""" Return myspace user id. Be smart, and stop download if ID is found. @param url: a string with Myspace profile URL @return: a string with user id """ BUFFERSIZE = 2048 id = None re_myspaceid = re.compile("blogs.myspace.com/index.cfm\?fuseaction=blog.ListAll&friendId=(?P<friend_id>\d+)") t = mmda_logger('mspc','request','find ID',profile_url) try: usock = HTTP_OPENER.open(profile_url, timeout=SEARCH_TIMEOUT) while 1: buffer = usock.read(BUFFERSIZE) r = re_myspaceid.search(buffer) if r and r.groups(): id = r.groups()[0] break if len(buffer) < BUFFERSIZE: break usock.close() except Exception, e: mmda_logger('myspace','ERROR',e) mmda_logger('mspc','result','myspace ID',id,t) return id
def _populate_artist_mb(artist, mb_artist):
    """
    Process data from MusicBrainz and store it in dedicated structures of CachedArtist.

    @param artist: a CachedArtist object
    @param mb_artist: a musicbrainz2.model.Artist object
    """
def _create_shallow_releases_mb(mb_artist):
    """
    Create CachedReleaseGroup documents using basic MusicBrainz data fetched with the artist.

    @param mb_artist: a musicbrainz2.model.Artist object
    """
    mb_releases = mb_artist.getReleases()
    artist_mbid = extractUuid(mb_artist.id)

    # the magical place where all data is cached/processed before the database commit
    there_will_be_dragons = {}

    for mb_release in mb_releases:
        group_mbid = extractUuid(mb_release.releaseGroup.id)
        release_mbid = extractUuid(mb_release.id)

        # it's ugly, but we fill this only once (place for future improvements)
        if group_mbid not in there_will_be_dragons:
            release_group = {}
            release_group['_id'] = group_mbid
            release_group['artist_mbid'] = artist_mbid
            release_group['artist_name'] = mb_artist.name
            release_group['title'] = mb_release.releaseGroup.title
            # small fix: in some rare cases, a ReleaseGroup at MusicBrainz has no 'type' property
            release_group['release_type'] = decruft_mb(mb_release.releaseGroup.type) if mb_release.releaseGroup.type else 'Other'
            release_group['releases'] = {}
            there_will_be_dragons[group_mbid] = release_group
        else:
            release_group = there_will_be_dragons[group_mbid]

        # store only basic information about the release event
        mb_release_events = []
        for mb_event in mb_release.getReleaseEvents():
            event = {}
            if mb_event.date:
                event['date'] = mb_event.date
            if mb_event.format:
                event['format'] = decruft_mb(mb_event.format)
            if mb_event.country:
                event['country'] = mb_event.country
            if event:
                mb_release_events.append(event)

        release_group['releases'][release_mbid] = {
                'title':mb_release.title,
                'tracks_count':mb_release.tracksCount,
                'release_events':mb_release_events,
                'cache_state':{'mb':[1,datetime.utcnow()]}
                }

        # the primary release is the one with the earliest release date (place for future improvements)
        mb_earliest_release_date = mb_release.getEarliestReleaseEvent().getDate() if mb_release.getEarliestReleaseEvent() else None
        if 'primary' not in release_group or release_group['primary'][1] == None or mb_earliest_release_date < release_group['primary'][1]:
            release_group['primary'] = [release_mbid, mb_earliest_release_date]

    # just to make sure no old data is left..
    old_cached_release_groups = get_db('artists').view('artists/release_groups', key=artist_mbid)
    for group in old_cached_release_groups:
        del get_db('artists')[group['id']]

    for release_group in there_will_be_dragons.itervalues():
        cached_release_group = CachedReleaseGroup.wrap(release_group) # TODO: think if wrap is the best way of dealing with this
        cached_release_group.cache_state['mb'] = [1,datetime.utcnow()]
        cached_release_group.save() # TODO: add try in case of ResourceConflict?
        mmda_logger('db','store', cached_release_group)
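
# Worked sketch (not part of the original module) of the "primary release"
# rule above: among a group's releases, the earliest release date wins.
# ISO date strings compare chronologically; here a release without a date
# only stays primary until a dated one shows up (the intended behaviour).
def _example_primary_release():
    candidates = [('mbid-a', '2001-03-01'), ('mbid-b', '1999-10-05'), ('mbid-c', None)]
    primary = None
    for mbid, date in candidates:
        if primary is None or primary[1] is None or (date is not None and date < primary[1]):
            primary = [mbid, date]
    return primary # -> ['mbid-b', '1999-10-05']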