# Connect to artist_term.db: load artist/term tables, build boolean
# term<->artist inverted indexes, and compute count distributions.
import itertools

import numpy as np

from millionsongs.io import db_loader
from millionsongs.indexes import lil_inverse_index

# Load artists, terms + indexes and artists_terms raw tables.
# Each artist row is a tuple; a[0] is the artist id, a[2] the artist name
# (inferred from the lookups below -- TODO confirm against db_loader).
loader = db_loader(r'D:\Data')
artists, terms, by_artist_terms, by_terms_artists = loader.load()

# Lookup tables: by id and by lower-cased name.
artists_by_ids = {a[0]: a for a in artists}
artists_by_names = {a[2].lower(): a for a in artists}

# Build boolean inverted indexes in both directions
# (term -> artist name, and artist name -> term).
terms_artists_index = lil_inverse_index(terms, [a[2].lower() for a in artists])
terms_artists_index.fill(
    (id_t[1], artists_by_ids[id_t[0]][2].lower()) for id_t in by_terms_artists
)
artists_terms_index = lil_inverse_index([a[2].lower() for a in artists], terms)
artists_terms_index.fill(
    (artists_by_ids[id_t[0]][2].lower(), id_t[1]) for id_t in by_artist_terms
)

# Distribution of number of terms per artist.
# NOTE: itertools.groupby only groups *consecutive* equal keys, so this
# assumes by_artist_terms is sorted by artist id -- TODO confirm.
# (list comprehension instead of map/lambda: Py3 map() is lazy and
# tuple-parameter lambdas were removed by PEP 3113.)
by_artists_nterms = np.array(
    [(a, len(list(ts)))
     for a, ts in itertools.groupby(by_artist_terms, key=lambda a_t: a_t[0])],
    dtype=[('artist', '|S10'), ('n_terms', '<i4')],
)

# Distribution of number of artists per term.
# Same consecutiveness assumption: by_terms_artists must be sorted by term.
by_terms_nartists = np.array(
    [(t, len(list(ars)))
     for t, ars in itertools.groupby(by_terms_artists, key=lambda a_t: a_t[1])],
    dtype=[('term', '|S10'), ('n_artists', '<i4')],
)

# Terms ranked 2nd..20th by artist count.
# NOTE(review): [-20:-1] deliberately(?) drops the single most frequent
# term -- use [-20:] if the top term should be included.
np.sort(by_terms_nartists, order='n_artists')[-20:-1]
# Load artist/term metadata, build boolean incidence matrices keyed by
# artist id, print sanity-check stats, and histogram per-term artist counts.
import numpy as np
import matplotlib.pyplot as plt

from millionsongs.io import db_loader, get_artists_from_csv
from millionsongs.indexes import lil_inverse_index

# Load artists, terms + indexes and artists_terms from raw metadata tables.
# Each artist row is a tuple; a[0] is the artist id, a[2] the artist name
# (inferred from the lookups below -- TODO confirm against db_loader).
loader = db_loader(r'D:\Data')
artists, terms, by_artist_terms, artists_terms_by_terms = loader.load()
artists_by_ids = {a[0]: a for a in artists}
artists_by_names = {a[2].lower(): a for a in artists}

# Sanity checks: duplicate ids/names collapse in the dicts, so any
# mismatch between the three counts signals duplicates in the raw table.
print('{} artists with {} unique ids and {} unique names'.format(
    len(artists), len(artists_by_ids), len(artists_by_names)))
print('{} terms and {} term x artist_id tuples'.format(
    len(terms), len(artists_terms_by_terms)))

# Build and fill boolean inverted indexes (term -> artist_id and back).
# 'artist_id' instead of 'id' to avoid shadowing the builtin.
terms_artists_index = lil_inverse_index(terms, [a[0] for a in artists])
terms_artists_index.fill(
    (t, artist_id) for artist_id, t in artists_terms_by_terms
)
artists_terms_index = lil_inverse_index([a[0] for a in artists], terms)
artists_terms_index.fill(
    (artist_id, t) for artist_id, t in by_artist_terms
)
print('{}x{} boolean tf-idf matrix with {} non-null elements build from {} tuples'.format(
    terms_artists_index.inc_matrix.shape[0],
    terms_artists_index.inc_matrix.shape[1],
    terms_artists_index.inc_matrix.sum(),
    len(artists_terms_by_terms)))

# Cumulating number of artists per term: row sums of the term x artist
# incidence matrix (squeeze collapses the (n_terms, 1) matrix result).
by_terms_nartists = np.squeeze(np.array(terms_artists_index.inc_matrix.sum(axis=1)))

# Histogram of raw per-term artist counts on a log-spaced x axis.
plt.hist(by_terms_nartists, bins=np.logspace(0, 5, base=10, num=51, endpoint=True))
plt.xscale('log')
plt.xlabel('Terms count (n_artists = {})'.format(len(artists)))

# Same distribution as a percentage of the artist population.
# NOTE(review): drawn on the same axes as the previous hist -- call
# plt.figure() first if separate figures are wanted.
plt.hist(100 * by_terms_nartists / len(artists),
         bins=np.logspace(0, 2, base=10, num=21, endpoint=True))
plt.xscale('log')
plt.xlabel('Terms frequency')