def get_artist_df():
    """Return the artist DataFrame, loading it from disk on first use.

    The CSV's ``member_list`` and ``group_list`` columns are stored as
    bracketed, comma-separated strings; both are parsed back into Python
    lists while reading. The result is memoized in the cache.
    """
    cached = cache.get("artist_df")
    if cached is not None:
        return cached

    def _parse_list(raw):
        # Drop the surrounding brackets, then split on ", ".
        return raw[1:-1].split(', ')

    artist_df = pd.read_csv(
        path.join('.', 'data', 'artists.csv'),
        header=0,
        low_memory=False,
        converters={
            'member_list': _parse_list,
            'group_list': _parse_list,
        })
    cache.set("artist_df", artist_df)
    return artist_df
def get_similar(release_id):
    """Return the nine releases most similar to *release_id*.

    Uses the cached DataFrame and cosine-similarity matrix built by
    transform_data(). Releases by the same artist as the query release are
    excluded, and each remaining artist appears at most once among the
    first nine recommendations. The result is cached under "similar".
    """
    df = cache.get("df")
    # Get the df index that corresponds to the release_id argument
    idx = cache.get("indices")[release_id]
    # Get the release's corresponding artist_id, so we can filter it out of the recommendations
    artist_id = df[df['release_id'] == release_id]['artist_id'].iloc[0]
    # Get the pairwise similarity scores of all albums against our chosen release
    sim_scores = list(enumerate(cache.get("cosine_sim")[idx]))
    # Turn that list of tuples into a Pandas series
    sim_series = pd.Series([i[1] for i in sim_scores])
    sim_series.rename('similarity', inplace=True)
    # Merge the series of similarity scores into the DataFrame containing the full collection
    similar = df.merge(sim_series, how='left', left_index=True, right_index=True)
    # Sort by similarity scores in descending order
    similar.sort_values(by=['similarity'], inplace=True, ascending=False)
    # Filter out any additional albums by the same artist
    similar = similar[similar['artist_id'] != artist_id]
    similar.reset_index(drop=True, inplace=True)
    # De-duplicate artists among the top results. Bound the scan by the
    # frame's length as well as 9: without the len() check, a collection
    # with fewer than 9 candidate rows made .iloc[i] raise IndexError.
    i = 1
    while i < 9 and i < len(similar):
        if similar.iloc[i].artist_id in similar[0:i].artist_id.values:
            similar.drop(similar.index[i], inplace=True)
        else:
            i += 1
    # Return the top nine most similar albums
    similar = similar[0:9]
    cache.set("similar", similar)
    return similar
def analysis(df):
    """Compute collection statistics and store them in the cache.

    Caches three artifacts derived from *df*:
      - "genre_df": counts of every genre, sorted descending
      - "style_df": the ten most common styles, with a 'style' column
      - "top_ten":  the ten most frequent artist names
    """
    # Genre frequencies across the whole collection.
    genre_counts = Counter(g for genres in df['genres'] for g in genres)
    genre_df = pd.DataFrame.from_dict(genre_counts, orient='index', columns=['count'])
    genre_df.sort_values(by=['count'], inplace=True, ascending=False)
    cache.set("genre_df", genre_df)

    # Style frequencies, trimmed to the ten most common.
    style_counts = Counter(s for styles in df['styles'] for s in styles)
    style_df = pd.DataFrame.from_dict(style_counts, orient='index', columns=['count'])
    style_df.sort_values(by=['count'], inplace=True, ascending=False)
    style_df = style_df[0:10]
    style_df.reset_index(inplace=True)
    style_df.rename(columns={'index': 'style'}, inplace=True)
    cache.set("style_df", style_df)

    # Most-collected artists.
    cache.set("top_ten", df['artist_name'].value_counts().head(10))
def transform_data(release_dict):
    """Build the collection DataFrame and its similarity matrix; cache both.

    Turns *release_dict* into a DataFrame, cleans its list-valued text
    columns, merges in artist metadata, vectorizes a combined "soup" of
    attributes, and computes a cosine-similarity matrix between releases.
    Caches "indices", "df", and "cosine_sim", then runs analysis().
    """
    # DataFrame from the user's record collection; the dict keys become
    # the index, which we surface as a 'release_id' column.
    df = pd.DataFrame.from_dict(release_dict, orient="index")
    df.reset_index(inplace=True)
    df.rename(columns={'index': 'release_id'}, inplace=True)

    # release_id -> positional index lookup (used when querying similarity).
    cache.set("indices", pd.Series(df.index, index=df.release_id))

    # Clean the list-valued text attributes.
    for col in ('genres', 'styles', 'descriptors'):
        df[col] = df[col].apply(clean_string)

    # Merge in artist info so each release carries band-member data (where present).
    df = df.merge(get_artist_df(), how="left", on="artist_id").set_index(df.index)
    df.fillna('', inplace=True)

    # Word soup fed to the CountVectorizer.
    df['soup'] = df.apply(create_soup, axis=1)

    # Vectorize the soup and derive a pairwise cosine-similarity matrix.
    vectorizer = CountVectorizer()
    matrix = vectorizer.fit_transform(df['soup'])
    cosine_sim = cosine_similarity(matrix, matrix)

    cache.set("df", df)
    cache.set("cosine_sim", cosine_sim)
    analysis(df)
def parse_collection(response):
    """Parse one page of a Discogs collection API response into release_dict.

    Extracts vinyl releases (other formats are skipped) with their genre,
    style, and free-text descriptor metadata. When more pages remain, prints
    sync progress, respects the rate limit, and recursively fetches the next
    page via the URL the API provides. Caches "release_dict" when done.
    """
    # Parse the body once; previously response.json() was decoded twice
    # (once for 'releases' and again for 'pagination').
    payload = response.json()
    # Iterate through every release (r) in the API's response
    for r in payload['releases']:
        release_id = r['id']
        descriptors = ""
        try:
            # field_id 5 holds the user's comma-separated descriptor notes.
            for entry in r['notes']:
                if entry['field_id'] == 5:
                    descriptors = entry['value'].split(', ')
                    if 'male vocals' in descriptors:
                        descriptors.remove('male vocals')
                    descriptors = [
                        descriptor.title() for descriptor in descriptors
                    ]
        except KeyError:
            # Release has no 'notes' field — leave descriptors empty.
            pass
        # Drill to the release's 'basic_information' field
        r = r['basic_information']
        format_list = [i['name'] for i in r['formats']]
        if 'Vinyl' in format_list:
            release_name = r['title']
            artist_name = r['artists'][0]['name']
            artist_id = r['artists'][0]['id']
            resource_url = r['resource_url']
            year = r['year']
            cover_url = r['cover_image']
            genre_list = [i for i in r['genres']]
            style_list = [i for i in r['styles']]
            release_dict[release_id] = {
                "release_name": release_name,
                "artist_id": artist_id,
                "artist_name": artist_name,
                "year": year,
                "genres": genre_list,
                "styles": style_list,
                "resource_url": resource_url,
                "cover_url": cover_url,
                "descriptors": descriptors
            }
    pagination = payload['pagination']
    # If there are more pages to be read, print the sync status,
    # then recursively call parse_collection, passing in the "next" URL
    # already generated by Discogs' API.
    if pagination['page'] < pagination['pages']:
        print(
            str(len(release_dict)) + ' of ' + str(pagination['items']) +
            ' items synced...')
        rate_limit(response)
        parse_collection(
            requests.get(pagination['urls']['next'], headers=headers))
    cache.set("release_dict", release_dict)
def reset_dict():
    """Invalidate the cached release dictionary and empty the in-memory one."""
    cache.set("release_dict", None)
    release_dict.clear()
def reset_df():
    """Invalidate the cached collection DataFrame and reset the release dict."""
    cache.set("df", None)
    reset_dict()