Exemplo n.º 1
0
 def flixster(minNnzRows=10, minNnzCols=2, quantile=90): 
     matrixFileName = PathDefaults.getDataDir() + "flixster/Ratings.timed.txt" 
     matrixFile = open(matrixFileName)
     matrixFile.readline()
     userIndexer = IdIndexer("i")
     movieIndexer = IdIndexer("i")
     
     ratings = array.array("f")
     logging.debug("Loading ratings from " + matrixFileName)
     
     for i, line in enumerate(matrixFile):
         if i % 1000000 == 0: 
             logging.debug("Iteration: " + str(i))
         vals = line.split()
         
         userIndexer.append(vals[0])
         movieIndexer.append(vals[1])
         ratings.append(float(vals[2]))
     
     rowInds = userIndexer.getArray()
     colInds = movieIndexer.getArray()
     ratings = numpy.array(ratings)
     
     X = sppy.csarray((len(userIndexer.getIdDict()), len(movieIndexer.getIdDict())), storagetype="row", dtype=numpy.int)
     X.put(numpy.array(ratings>3, numpy.int), numpy.array(rowInds, numpy.int32), numpy.array(colInds, numpy.int32), init=True)
     X.prune()
     
     X = SparseUtils.pruneMatrixRowAndCols(X, minNnzRows, minNnzCols)
     
     logging.debug("Read file: " + matrixFileName)
     logging.debug("Non zero elements: " + str(X.nnz) + " shape: " + str(X.shape))
     
     #X = Sampling.sampleUsers(X, 1000)
     
     return X 
Exemplo n.º 2
0
    def epinions(minNnzRows=10, minNnzCols=3, quantile=90): 
        matrixFileName = PathDefaults.getDataDir() + "epinions/rating.mat" 
        A = scipy.io.loadmat(matrixFileName)["rating"]
        
        userIndexer = IdIndexer("i")
        itemIndexer = IdIndexer("i")        
        
        for i in range(A.shape[0]): 
            userIndexer.append(A[i, 0])
            itemIndexer.append(A[i, 1])


        rowInds = userIndexer.getArray()
        colInds = itemIndexer.getArray()
        ratings = A[:, 3]        
        
        X = sppy.csarray((len(userIndexer.getIdDict()), len(itemIndexer.getIdDict())), storagetype="row", dtype=numpy.int)
        X.put(numpy.array(ratings>3, numpy.int), numpy.array(rowInds, numpy.int32), numpy.array(colInds, numpy.int32), init=True)
        X.prune()
        
        X = SparseUtils.pruneMatrixRowAndCols(X, minNnzRows, minNnzCols)
        
        logging.debug("Read file: " + matrixFileName)
        logging.debug("Non zero elements: " + str(X.nnz) + " shape: " + str(X.shape))

        return X 
Exemplo n.º 3
0
 def mendeley2(minNnzRows=10, minNnzCols=2, quantile=90, dataset="Document"):
     authorAuthorFileName = PathDefaults.getDataDir() + "reference/author" + dataset + "Matrix.mtx"
     logging.debug("Reading file: " + authorAuthorFileName)
     X = sppy.io.mmread(authorAuthorFileName, storagetype="row")
             
     logging.debug("Raw non-zero elements: " + str(X.nnz) + " shape: " + str(X.shape))
     
     X = SparseUtils.pruneMatrixRowAndCols(X, minNnzRows, minNnzCols)
     
     logging.debug("Read file: " + authorAuthorFileName)
     logging.debug("Non-zero elements: " + str(X.nnz) + " shape: " + str(X.shape))
     
     return X 
Exemplo n.º 4
0
    def movieLens(minNnzRows=10, minNnzCols=2, quantile=90): 
        matrixFileName = PathDefaults.getDataDir() + "movielens/ml-100k/u.data" 
        data = numpy.loadtxt(matrixFileName)
        X = sppy.csarray((numpy.max(data[:, 0]), numpy.max(data[:, 1])), storagetype="row", dtype=numpy.int)
        X.put(numpy.array(data[:, 2]>3, numpy.int), numpy.array(data[:, 0]-1, numpy.int32), numpy.array(data[:, 1]-1, numpy.int32), init=True)
        #X = SparseUtilsCython.centerRowsCsarray(X)   
        #X[X.nonzero()] = X.values()>0
        X.prune()
        #maxNnz = numpy.percentile(X.sum(0), quantile)
        #X = SparseUtils.pruneMatrixCols(X, minNnz=minNnzCols, maxNnz=maxNnz)
        X = SparseUtils.pruneMatrixRowAndCols(X, minNnzRows, minNnzCols)
        logging.debug("Read file: " + matrixFileName)
        logging.debug("Non zero elements: " + str(X.nnz) + " shape: " + str(X.shape))

        return X
Exemplo n.º 5
0
 def mendeley(minNnzRows=10, minNnzCols=2, quantile=90, dataset="Doc", sigma=0.05, indicator=True):
     authorAuthorFileName = PathDefaults.getDataDir() + "reference/authorAuthor"+ dataset + "Matrix_sigma=" + str(sigma) + ".mtx"
     logging.debug("Reading file: " + authorAuthorFileName)
     X = sppy.io.mmread(authorAuthorFileName, storagetype="row")
     
     if indicator: 
         X[X.nonzero()] = 1
         X.prune()
     
     logging.debug("Raw non-zero elements: " + str(X.nnz) + " shape: " + str(X.shape))
     
     X = SparseUtils.pruneMatrixRowAndCols(X, minNnzRows, minNnzCols)
     
     logging.debug("Read file: " + authorAuthorFileName)
     logging.debug("Non-zero elements: " + str(X.nnz) + " shape: " + str(X.shape))
     
     return X
Exemplo n.º 6
0
    def bookCrossing(minNnzRows=10, minNnzCols=3, quantile=90): 
        matrixFileName = PathDefaults.getDataDir() + "book-crossing/BX-Book-Ratings.csv" 
        matrixFile = open(matrixFileName)
        matrixFile.readline()
        userIndexer = IdIndexer("i")
        itemIndexer = IdIndexer("i")
        
        ratings = array.array("f")
        logging.debug("Loading ratings from " + matrixFileName)
        
        for i, line in enumerate(matrixFile):
            if i % 1000000 == 0: 
                logging.debug("Iteration: " + str(i))
            vals = line.split(";")
            
            field1 = vals[0].strip("\"")
            field2 = vals[1].strip("\"")
            field3 = int(vals[2].strip("\"\n\r"))            
            
            userIndexer.append(field1)
            itemIndexer.append(field2)
            ratings.append(field3)
                    
        rowInds = userIndexer.getArray()
        colInds = itemIndexer.getArray()
        ratings = numpy.array(ratings)
                
        X = sppy.csarray((len(userIndexer.getIdDict()), len(itemIndexer.getIdDict())), storagetype="row", dtype=numpy.int)
        X.put(numpy.array(numpy.logical_or(ratings>4, ratings==0), numpy.int), numpy.array(rowInds, numpy.int32), numpy.array(colInds, numpy.int32), init=True)
        X.prune()
        
        X = SparseUtils.pruneMatrixRowAndCols(X, minNnzRows, minNnzCols)
        
        logging.debug("Read file: " + matrixFileName)
        logging.debug("Non zero elements: " + str(X.nnz) + " shape: " + str(X.shape))

        return X