示例#1
0
# Build colisten matrix from triplet CSV and save in mtx format
# Usage: python colisten.py <infile> <outfile>

import scipy.sparse, scipy.io
import sys
import util

infile, outfile = sys.argv[1:]

# Square song-by-song count matrix; lil_matrix supports the incremental
# element-wise updates performed below.
colisten = scipy.sparse.lil_matrix((util.N_SONGS, util.N_SONGS))

for listens in util.songs_by_user(infile):
  # Songs are 1-indexed, but scipy uses 0-indexing. Convert once per user
  # rather than on every inner-loop iteration.
  idxs = [s - 1 for s, _ in listens]
  # Count every ordered pair of songs in this user's history, including each
  # song paired with itself (the diagonal).
  for i in idxs:
    for j in idxs:
      colisten[i, j] += 1

# Write through an explicitly opened handle so the output lands at exactly
# `outfile` and the handle is closed (and flushed) deterministically.
with open(outfile, 'wb') as out:
  scipy.io.mmwrite(out, colisten)
示例#2
0
# Build colisten matrix from triplet CSV and save in mtx format
# Usage: python colisten.py <infile> <outfile>

import scipy.sparse, scipy.io
import sys
import util

infile, outfile = sys.argv[1:]

# Song-by-song co-occurrence counts, built incrementally in LIL format.
colisten = scipy.sparse.lil_matrix((util.N_SONGS, util.N_SONGS))

for listens in util.songs_by_user(infile):
    # Songs are 1-indexed, but scipy uses 0-indexing; shift the ids once
    # per user instead of inside the innermost loop.
    rows = [song - 1 for song, _ in listens]
    # Every ordered pair of songs from the same user (a song paired with
    # itself included) bumps the corresponding cell by one.
    for r in rows:
        for c in rows:
            colisten[r, c] += 1

# Use a context manager so the output handle is always closed; the original
# `file(...)` call is Python-2-only and never closed the handle explicitly.
with open(outfile, 'wb') as out:
    scipy.io.mmwrite(out, colisten)
示例#3
0
# Report how long loading the colisten matrix took; `timetoread` is computed
# earlier in the file, outside this view.
print "it takes %f secs to read the colisten matrix " % timetoread

# Diagonal of the colisten matrix, one value per song index.
# NOTE(review): presumably per-song listen totals -- confirm against the
# script that builds the matrix.
listens = colisten.diagonal()

# Indices of the 500 largest diagonal values, largest first.
listenranked = numpy.argsort(-listens)[:500]


# NOTE(review): time.clock() was removed in Python 3.8; this script is
# Python 2 (print statements), where it is still available.
predict_start = time.clock()

print " predict starts at %f :\n" % predict_start

with open(outfile,'w') as out:

    # 1-based counter of evaluation users, used only for progress output here.
    i = 0

    for history in util.songs_by_user(evalfile):

        i=i+1
        print " we are predict for %d user" % i
        
        # Split the (song, count) pairs into two parallel tuples.
        songs,counts = zip(*history)

        # Weight the colisten rows of the user's songs by the user's counts.
        # Song ids are shifted by -1 to index rows.
        # NOTE(review): assumes colisten is a scipy sparse matrix, so `*` is a
        # matrix product yielding a 1 x N row of scores -- confirm where
        # colisten is loaded.
        sim = numpy.array(counts)[numpy.newaxis,:]*\
              colisten[numpy.array(songs)-1,:]


        # Column indices whose score is nonzero; only these get sorted.
        simidxs = sim.nonzero()[1]

        # lexsort treats the LAST key as primary: order by descending score,
        # breaking ties by descending diagonal value.
        srt = numpy.lexsort((-listens[simidxs],-sim[0,simidxs]))

        # Candidate column indices in ranked order (0-based).
        rankidxs = simidxs[srt]
示例#4
0
# Usage: python predict_colisten.py <mtxfile> <evalfile> <outfile>

import sys
import scipy.io
import numpy
import util

mtxfile, evalfile, outfile = sys.argv[1:]

# Read the Matrix Market file and convert to CSR for row slicing. Opening the
# file in a with-block closes the handle once parsing is done; the original
# `file(...)` call is Python-2-only and never closed it explicitly.
with open(mtxfile) as mtx:
    colisten = scipy.io.mmread(mtx).tocsr()

# Diagonal of the colisten matrix, one value per song index.
# NOTE(review): presumably per-song listen totals -- confirm against the
# script that builds the matrix.
listens = colisten.diagonal()

# Indices of the 500 largest diagonal values, largest first.
listenranked = numpy.argsort(-listens)[:500]

with open(outfile, 'w') as out:
    # One iteration per evaluation user; `history` is unpacked below as
    # (song, count) pairs.
    for history in util.songs_by_user(evalfile):
        songs, counts = zip(*history)

        # Weight the colisten rows of the user's songs by the user's counts
        # (song ids are shifted by -1 to index rows). With a CSR colisten
        # this `*` is a matrix product, giving a 1 x N row of scores.
        sim = numpy.array(counts)[numpy.newaxis, :] * colisten[
            numpy.array(songs) - 1, :]

        # All this nonsense is an optimization to avoid the fact that
        # sorting 300,000 numbers 110,000 times is bad for your health.
        # I only sort the songs where sim > 0
        simidxs = sim.nonzero()[1]
        # lexsort treats the LAST key as primary: descending score first,
        # ties broken by descending diagonal value.
        srt = numpy.lexsort((-listens[simidxs], -sim[0, simidxs]))
        rankidxs = simidxs[srt]

        guess = []
        for s in rankidxs:
            # s is a 0-based column index, so s + 1 is compared against the
            # ids in `songs` (the body of this branch is outside this view).
            if s + 1 in songs: