# Build colisten matrix from triplet CSV and save in mtx format
# Usage: python colisten.py <infile> <outfile>
import sys

import scipy.io
import scipy.sparse

import util

# Fail with a readable usage line instead of an opaque unpacking error.
if len(sys.argv) != 3:
    sys.exit("Usage: python colisten.py <infile> <outfile>")
infile, outfile = sys.argv[1:]

# LIL format is used because the matrix is filled one entry at a time.
colisten = scipy.sparse.lil_matrix((util.N_SONGS, util.N_SONGS))

# Each item yielded by util.songs_by_user is iterated twice below, so it is
# assumed to be a re-iterable sequence of (song, count) pairs -- TODO confirm
# against util.
for listens in util.songs_by_user(infile):
    for s, _ in listens:
        for t, _ in listens:
            # Songs are 1-indexed, but scipy uses 0-indexing
            colisten[s - 1, t - 1] += 1

# Write through a context manager so the handle is flushed and closed
# deterministically; passing an open handle (not a name) keeps the exact
# output path given on the command line.
with open(outfile, 'wb') as out:
    scipy.io.mmwrite(out, colisten)
# Build colisten matrix from triplet CSV and save in mtx format
# Usage: python colisten.py <infile> <outfile>
import sys

import scipy.io
import scipy.sparse

import util

# Exactly two command-line arguments are required; report usage otherwise.
if len(sys.argv) != 3:
    sys.exit("Usage: python colisten.py <infile> <outfile>")
infile, outfile = sys.argv[1:]

# Square song-by-song matrix, built incrementally (hence the LIL format).
colisten = scipy.sparse.lil_matrix((util.N_SONGS, util.N_SONGS))

# For every user, bump the cell of every ordered pair of songs in that
# user's history (including each song paired with itself).
# NOTE(review): `listens` is traversed by both loops, so it presumably is a
# list-like value rather than a one-shot iterator -- verify in util.
for listens in util.songs_by_user(infile):
    for s, _ in listens:
        for t, _ in listens:
            # Songs are 1-indexed, but scipy uses 0-indexing
            colisten[s - 1, t - 1] += 1

# The original opened the output with the Python-2-only file() builtin and
# never closed it; the with-block guarantees the data is flushed to disk.
with open(outfile, 'wb') as out:
    scipy.io.mmwrite(out, colisten)
# Fragment of the colisten prediction script. `timetoread`, `colisten`,
# `evalfile`, `outfile`, and the `numpy`, `time` and `util` modules are
# defined before this point, outside the visible excerpt.
# A single parenthesized argument prints the same text under the Python 2
# print statement.
print("it takes %f secs to read the colisten matrix " % timetoread)

# Diagonal of the colisten matrix, one value per song index.
listens = colisten.diagonal()
# Indices of the 500 largest diagonal entries, in descending order.
listenranked = numpy.argsort(-listens)[:500]

predict_start = time.clock()
print(" predict starts at %f :\n" % predict_start)

with open(outfile, 'w') as out:
    i = 0
    for history in util.songs_by_user(evalfile):
        i += 1
        print(" we are predict for %d user" % i)

        # Split the (song, count) pairs into two parallel tuples.
        songs, counts = zip(*history)

        # Row vector of counts times the matrix rows of the songs in the
        # history (song ids are 1-based, rows are 0-based).
        # NOTE(review): the kind of product depends on the type of
        # `colisten`, which is created outside this excerpt -- confirm.
        sim = (numpy.array(counts)[numpy.newaxis, :]
               * colisten[numpy.array(songs) - 1, :])

        # Only the columns with a nonzero score are sorted.
        simidxs = sim.nonzero()[1]
        # lexsort treats its last key as primary: descending score first,
        # ties broken by descending diagonal value.
        srt = numpy.lexsort((-listens[simidxs], -sim[0, simidxs]))
        rankidxs = simidxs[srt]
# Usage: python predict_colisten.py <mtxfile> <evalfile> <outfile> import sys import scipy.io import numpy import util mtxfile, evalfile, outfile = sys.argv[1:] colisten = scipy.io.mmread(file(mtxfile)).tocsr() listens = colisten.diagonal() listenranked = numpy.argsort(-listens)[:500] with open(outfile, 'w') as out: for history in util.songs_by_user(evalfile): songs, counts = zip(*history) sim = numpy.array(counts)[numpy.newaxis, :] * colisten[ numpy.array(songs) - 1, :] # All this nonsense is an optimization to avoid the fact that # sorting 300,000 numbers 110,000 times is bad for your health. # I only sort the songs where sim > 0 simidxs = sim.nonzero()[1] srt = numpy.lexsort((-listens[simidxs], -sim[0, simidxs])) rankidxs = simidxs[srt] guess = [] for s in rankidxs: if s + 1 in songs: