# Gui : connect to artist_term.db
import itertools

import matplotlib.pyplot as plt
import numpy as np

from millionsongs.io import db_loader
from millionsongs.indexes import lil_inverse_index

# Load artists, terms + indexes and artists_terms raw tables
loader = db_loader(r'D:\Data')
artists, terms, by_artist_terms, by_terms_artists = loader.load()

# Key each artist row by its id (a[0]) and by its lower-cased name (a[2]).
# The original wrapped this in enumerate() but never used the index.
artists_by_ids = dict((a[0], a) for a in artists)
artists_by_names = dict((a[2].lower(), a) for a in artists)

# Build boolean inverted indexes between terms and (lower-cased) artist names.
# Each tuple in the raw tables is (artist_id, term); look the name up by id.
artist_names = [a[2].lower() for a in artists]

terms_artists_index = lil_inverse_index(terms, artist_names)
terms_artists_index.fill(
    (term, artists_by_ids[artist_id][2].lower())
    for artist_id, term in by_terms_artists)

artists_terms_index = lil_inverse_index(artist_names, terms)
artists_terms_index.fill(
    (artists_by_ids[artist_id][2].lower(), term)
    for artist_id, term in by_artist_terms)

# Distribution of number of terms per artist.
# NOTE(review): groupby only merges *consecutive* equal keys, so this assumes
# by_artist_terms is already sorted by artist id — confirm against the loader.
# Rewritten without the Python-2-only tuple-parameter lambda, and counting the
# group lazily instead of materializing it with list().
by_artists_nterms = np.array(
    [(artist, sum(1 for _ in group))
     for artist, group in itertools.groupby(by_artist_terms, lambda at: at[0])],
    dtype=[('artist', '|S10'), ('n_terms', '<i4')])

# Distribution of number of artists per term.
# NOTE(review): groupby only merges *consecutive* equal keys, so this assumes
# by_terms_artists is already sorted by term — confirm against the loader.
# Rewritten without the Python-2-only tuple-parameter lambda, and counting the
# group lazily instead of materializing it with list().
by_terms_nartists = np.array(
    [(term, sum(1 for _ in group))
     for term, group in itertools.groupby(by_terms_artists, lambda at: at[1])],
    dtype=[('term', '|S10'), ('n_artists', '<i4')])

np.sort(by_terms_nartists,order='n_artists')[-20:-1]
from millionsongs.io import db_loader, get_artists_from_csv
from millionsongs.indexes import lil_inverse_index

# Load artists, terms + indexes and artists_terms from raw metadata tables
loader = db_loader(r'D:\Data')
artists, terms, by_artist_terms, artists_terms_by_terms = loader.load()

# Key each artist row by its id (a[0]) and by its lower-cased name (a[2]).
artists_by_ids = dict((a[0], a) for a in artists)
artists_by_names = dict((a[2].lower(), a) for a in artists)

# Parenthesized single-argument print is valid under both Python 2 and 3,
# unlike the original print statements.
print('{} artists with {} unique ids and {} unique names'.format(
    len(artists), len(artists_by_ids), len(artists_by_names)))
print('{} terms and {} term x artist_id tuples'.format(
    len(terms), len(artists_terms_by_terms)))

# Build and fill boolean inverted indexes, this time keyed by artist id (a[0])
# rather than by artist name.
artist_ids = [a[0] for a in artists]

terms_artists_index = lil_inverse_index(terms, artist_ids)
# 'aid' instead of the original 'id', which shadowed the builtin.
terms_artists_index.fill((t, aid) for aid, t in artists_terms_by_terms)

artists_terms_index = lil_inverse_index(artist_ids, terms)
artists_terms_index.fill((aid, t) for aid, t in by_artist_terms)

# Parenthesized print works under both Python 2 (single argument) and 3.
print('{}x{} boolean tf-idf matrix with {} non-null elements build from {} tuples'.format(
    terms_artists_index.inc_matrix.shape[0],
    terms_artists_index.inc_matrix.shape[1],
    terms_artists_index.inc_matrix.sum(),
    len(artists_terms_by_terms)))

# Cumulating number of artists per term.
# NOTE(review): assumes inc_matrix rows are terms and columns are artists, so
# summing over axis=1 gives artists-per-term — confirm in lil_inverse_index.
by_terms_nartists = np.squeeze(np.array(terms_artists_index.inc_matrix.sum(axis=1)))

# Histogram of raw artist counts per term on a log-spaced grid.
plt.hist(by_terms_nartists, bins=np.logspace(0, 5, base=10, num=51, endpoint=True))
plt.xscale('log')
# Fixed label: the original format string was missing its closing ')'.
plt.xlabel('Terms count (n_artists = {})'.format(len(artists)))

# Same distribution expressed as a percentage of all artists.
plt.hist(100 * by_terms_nartists/len(artists), bins=np.logspace(0, 2, base=10, num=21, endpoint=True))
plt.xscale('log')
plt.xlabel('Terms frequency')

# DEBUG