예제 #1
0
 def flixster(minNnzRows=10, minNnzCols=2, quantile=90):
     """
     Load the Flixster ratings file and return a pruned binary matrix.

     The first line of the file is skipped. Every following line is split on
     whitespace into a user id, a movie id and a rating. The value stored for
     a (user, movie) pair is 1 when the rating is greater than 3 and 0
     otherwise.

     :param minNnzRows: row threshold handed to SparseUtils.pruneMatrixRowAndCols
     :param minNnzCols: column threshold handed to SparseUtils.pruneMatrixRowAndCols
     :param quantile: not used by this function; kept for interface compatibility
     :return: the matrix returned by SparseUtils.pruneMatrixRowAndCols
     """
     matrixFileName = PathDefaults.getDataDir() + "flixster/Ratings.timed.txt"
     userIndexer = IdIndexer("i")
     movieIndexer = IdIndexer("i")
     ratings = array.array("f")

     # The context manager guarantees the handle is released even when a
     # malformed line makes the parsing below raise.
     with open(matrixFileName) as matrixFile:
         matrixFile.readline()
         logging.debug("Loading ratings from " + matrixFileName)

         for i, line in enumerate(matrixFile):
             if i % 1000000 == 0:
                 logging.debug("Iteration: " + str(i))
             vals = line.split()

             userIndexer.append(vals[0])
             movieIndexer.append(vals[1])
             ratings.append(float(vals[2]))

     rowInds = userIndexer.getArray()
     colInds = movieIndexer.getArray()
     ratings = numpy.array(ratings)

     # numpy.int was only an alias of the builtin int and no longer exists in
     # current NumPy releases, so the builtin is used directly.
     shape = (len(userIndexer.getIdDict()), len(movieIndexer.getIdDict()))
     X = sppy.csarray(shape, storagetype="row", dtype=int)
     X.put(numpy.array(ratings > 3, int), numpy.array(rowInds, numpy.int32), numpy.array(colInds, numpy.int32), init=True)
     # NOTE(review): prune() presumably drops the explicit zeros written above - confirm against sppy
     X.prune()

     X = SparseUtils.pruneMatrixRowAndCols(X, minNnzRows, minNnzCols)

     logging.debug("Read file: " + matrixFileName)
     logging.debug("Non zero elements: " + str(X.nnz) + " shape: " + str(X.shape))

     return X
예제 #2
0
 def coauthorsGraphFromAuthors2(self, relevantExperts, field):
     """
     Take a set of relevant authors and return the coauthorship graph.

     The data file is named from self.dataDir, "__", the field with spaces
     removed, and ".csv". Each line is split on ";": columns 1 and 2 are
     joined with a space to form the author name and column 4 is the article
     id. Lines with too few columns are ignored. Two authors are linked once
     for every article they share, provided both are in relevantExperts.

     :param relevantExperts: collection of author names; all of them are
         registered in the indexer first, so each gets a vertex
     :param field: field name used to build the data file name
     :return: (graph, authorIndexer) where graph is an igraph.Graph whose
         edges carry "weight" (summed over merged duplicate edges) and
         "invWeight" (1/weight)
     """
     dataFileName = self.dataDir + "__" + field.replace(' ', '') + ".csv"
     authorIndexer = IdIndexer()
     author1Inds = array.array("i")
     author2Inds = array.array("i")

     articleDict = {}

     for relevantExpert in relevantExperts:
         authorIndexer.append(relevantExpert)

     # The context manager closes the file even if reading fails part way.
     with open(dataFileName) as dataFile:
         for line in dataFile:
             try:
                 fields = [x.strip() for x in line.split(";")]
                 author = fields[1] + " " + fields[2]
                 articleId = fields[4]
             except IndexError:
                 # Ignore bad lines
                 continue

             articleDict.setdefault(articleId, []).append(author)

     # Every stored author list has at least one entry; combinations() simply
     # yields nothing for single-author articles.
     for authors in articleDict.values():
         for author1, author2 in itertools.combinations(authors, 2):
             if author1 in relevantExperts and author2 in relevantExperts:
                 author1Ind = authorIndexer.append(author1)
                 author2Ind = authorIndexer.append(author2)

                 author1Inds.append(author1Ind)
                 author2Inds.append(author2Ind)

     logging.debug("Found " + str(len(authorIndexer.getIdDict())) + " coauthors")

     # Coauthor graph is undirected
     # numpy.int was an alias of the builtin int and is gone from current NumPy.
     author1Inds = numpy.array(author1Inds, int)
     author2Inds = numpy.array(author2Inds, int)
     edges = numpy.c_[author1Inds, author2Inds]

     graph = igraph.Graph()
     graph.add_vertices(len(authorIndexer.getIdDict()))
     graph.add_edges(edges)
     graph.es["weight"] = numpy.ones(graph.ecount())
     # Merge duplicate edges, adding their weights together
     graph.simplify(combine_edges=sum)
     graph.es["invWeight"] = 1.0/(numpy.array(graph.es["weight"]))

     return graph, authorIndexer
예제 #3
0
    def epinions(minNnzRows=10, minNnzCols=3, quantile=90):
        """
        Load the Epinions ratings and return a pruned binary matrix.

        The "rating" variable of the .mat file is read row by row: column 0 is
        used as the user id, column 1 as the item id and column 3 as the
        rating. The value stored for a (user, item) pair is 1 when the rating
        is greater than 3 and 0 otherwise.

        :param minNnzRows: row threshold handed to SparseUtils.pruneMatrixRowAndCols
        :param minNnzCols: column threshold handed to SparseUtils.pruneMatrixRowAndCols
        :param quantile: not used by this function; kept for interface compatibility
        :return: the matrix returned by SparseUtils.pruneMatrixRowAndCols
        """
        matrixFileName = PathDefaults.getDataDir() + "epinions/rating.mat"
        A = scipy.io.loadmat(matrixFileName)["rating"]

        userIndexer = IdIndexer("i")
        itemIndexer = IdIndexer("i")

        for i in range(A.shape[0]):
            userIndexer.append(A[i, 0])
            itemIndexer.append(A[i, 1])

        rowInds = userIndexer.getArray()
        colInds = itemIndexer.getArray()
        ratings = A[:, 3]

        # numpy.int was only an alias of the builtin int and no longer exists
        # in current NumPy releases, so the builtin is used directly.
        shape = (len(userIndexer.getIdDict()), len(itemIndexer.getIdDict()))
        X = sppy.csarray(shape, storagetype="row", dtype=int)
        X.put(numpy.array(ratings > 3, int), numpy.array(rowInds, numpy.int32), numpy.array(colInds, numpy.int32), init=True)
        # NOTE(review): prune() presumably drops the explicit zeros written above - confirm against sppy
        X.prune()

        X = SparseUtils.pruneMatrixRowAndCols(X, minNnzRows, minNnzCols)

        logging.debug("Read file: " + matrixFileName)
        logging.debug("Non zero elements: " + str(X.nnz) + " shape: " + str(X.shape))

        return X
예제 #4
0
 def __init__(self, dataFilename, split=0.8):
     """
     Read a ratings file and split it at random into train and test matrices.

     Every line is split on whitespace into an author id, an item id and an
     integer rating. The ratings are shuffled with numpy.random.permutation;
     the first ``split`` fraction goes to the training matrix and the rest to
     the test matrix. Both are scipy.sparse.csc_matrix objects of the same
     shape, stored as one-element lists in self.trainXList and
     self.testXList.

     :param dataFilename: path of the ratings file
     :param split: fraction of the ratings assigned to the training matrix
     :raises ValueError: if the file contains no lines
     """
     printStep = 1000000

     authorIndexer = IdIndexer()
     itemIndexer = IdIndexer()
     ratings = array.array("i")
     # Tracked separately so an empty file does not leave the loop variable
     # undefined when the line count is reported.
     numLines = 0

     #Read train files
     # The context manager closes the file even if a line fails to parse.
     with open(dataFilename) as dataFile:
         for i, line in enumerate(dataFile):
             if i % printStep == 0:
                 logging.debug("Iteration: " + str(i))
             vals = line.split()

             authorIndexer.append(vals[0])
             itemIndexer.append(vals[1])
             ratings.append(int(vals[2]))
             numLines = i + 1

     logging.debug("Read file with " + str(numLines) + " lines")

     if numLines == 0:
         # numpy.max below has no meaning for empty index arrays
         raise ValueError("No ratings found in " + dataFilename)

     authorInds = numpy.array(authorIndexer.getArray())
     itemInds = numpy.array(itemIndexer.getArray())
     ratings = numpy.array(ratings)

     logging.debug("Number of authors: " + str(len(authorIndexer.getIdDict())))
     logging.debug("Number of items: " + str(len(itemIndexer.getIdDict())))
     logging.debug("Number of ratings: " + str(ratings.shape[0]))

     # Release the indexers before the sparse matrices are built
     del authorIndexer
     del itemIndexer
     gc.collect()

     shape = (numpy.max(authorInds)+1, numpy.max(itemInds)+1)
     inds = numpy.random.permutation(ratings.shape[0])
     numTrain = int(inds.shape[0]*split)

     trainInds = inds[0:numTrain]
     trainX = scipy.sparse.csc_matrix((ratings[trainInds], (authorInds[trainInds], itemInds[trainInds])), shape=shape)

     testInds = inds[numTrain:]
     testX = scipy.sparse.csc_matrix((ratings[testInds], (authorInds[testInds], itemInds[testInds])), shape=shape)

     del authorInds, itemInds, ratings
     gc.collect()

     self.trainXList = [trainX]
     self.testXList = [testX]
예제 #5
0
    def testAppend(self):
        """
        Append four ids, one of them repeated, and check that getArray()
        equals [0, 1, 2, 1].
        """
        names = ["john", "james", "mark", "james"]

        indexer = IdIndexer()
        for name in names:
            indexer.append(name)

        expectedInds = numpy.array([0, 1, 2, 1])
        nptst.assert_array_equal(indexer.getArray(), expectedInds)
예제 #6
0
    def testAppend(self):
        """
        Check the index array produced by appending "john", "james", "mark"
        and then "james" a second time.
        """
        indexer = IdIndexer()

        for authorName in ("john", "james", "mark", "james"):
            indexer.append(authorName)

        observedInds = indexer.getArray()
        nptst.assert_array_equal(observedInds, numpy.array([0, 1, 2, 1]))
예제 #7
0
 def coauthorsGraphFromAuthors(self, relevantExperts):
     """
     Take a set of relevant authors and return the coauthorship graph.

     The file self.dataFilename is scanned line by line for the pattern
     "#@(.*)"; the captured text is split on "," into a set of stripped
     author names. For each such line, every pair of authors that are both
     in relevantExperts contributes one edge.

     :param relevantExperts: collection of author names; all of them are
         registered in the indexer first, so each gets a vertex
     :return: (graph, authorIndexer) where graph is an igraph.Graph whose
         edges carry "weight" (summed over merged duplicate edges) and
         "invWeight" (1/weight)
     """
     authorIndexer = IdIndexer()
     author1Inds = array.array("i")
     author2Inds = array.array("i")

     for relevantExpert in relevantExperts:
         authorIndexer.append(relevantExpert)

     # The context manager releases the handle, which was previously left open.
     with open(self.dataFilename) as dataFile:
         for i, line in enumerate(dataFile):
             Util.printIteration(i, self.stepSize, self.numLines)
             matches = re.findall("#@(.*)", line)

             if len(matches) == 0:
                 continue

             authors = {x.strip() for x in matches[0].split(",")}

             # Cheap pre-filter: skip lines that mention no relevant expert
             if len(authors.intersection(relevantExperts)) == 0:
                 continue

             for author1, author2 in itertools.combinations(authors, 2):
                 if author1 in relevantExperts and author2 in relevantExperts:
                     author1Ind = authorIndexer.append(author1)
                     author2Ind = authorIndexer.append(author2)

                     author1Inds.append(author1Ind)
                     author2Inds.append(author2Ind)

     logging.debug("Found " + str(len(authorIndexer.getIdDict())) + " coauthors")

     # Coauthor graph is undirected
     # numpy.int was an alias of the builtin int and is gone from current NumPy.
     author1Inds = numpy.array(author1Inds, int)
     author2Inds = numpy.array(author2Inds, int)
     edges = numpy.c_[author1Inds, author2Inds]

     graph = igraph.Graph()
     graph.add_vertices(len(authorIndexer.getIdDict()))
     graph.add_edges(edges)
     graph.es["weight"] = numpy.ones(graph.ecount())
     # Merge duplicate edges, adding their weights together
     graph.simplify(combine_edges=sum)
     graph.es["invWeight"] = 1.0/(numpy.array(graph.es["weight"]))

     return graph, authorIndexer
예제 #8
0
    def bookCrossing(minNnzRows=10, minNnzCols=3, quantile=90):
        """
        Load the Book-Crossing ratings file and return a pruned binary matrix.

        The first line of the file is skipped. Every following line is split
        on ";" and the surrounding double quotes are stripped from the first
        three fields, which are used as user id, item id and integer rating.
        The value stored for a (user, item) pair is 1 when the rating is
        greater than 4 or equal to 0, and 0 otherwise.

        :param minNnzRows: row threshold handed to SparseUtils.pruneMatrixRowAndCols
        :param minNnzCols: column threshold handed to SparseUtils.pruneMatrixRowAndCols
        :param quantile: not used by this function; kept for interface compatibility
        :return: the matrix returned by SparseUtils.pruneMatrixRowAndCols
        """
        matrixFileName = PathDefaults.getDataDir() + "book-crossing/BX-Book-Ratings.csv"
        userIndexer = IdIndexer("i")
        itemIndexer = IdIndexer("i")
        ratings = array.array("f")

        # The context manager guarantees the handle is released even when a
        # malformed line makes the parsing below raise.
        with open(matrixFileName) as matrixFile:
            matrixFile.readline()
            logging.debug("Loading ratings from " + matrixFileName)

            for i, line in enumerate(matrixFile):
                if i % 1000000 == 0:
                    logging.debug("Iteration: " + str(i))
                vals = line.split(";")

                field1 = vals[0].strip("\"")
                field2 = vals[1].strip("\"")
                field3 = int(vals[2].strip("\"\n\r"))

                userIndexer.append(field1)
                itemIndexer.append(field2)
                ratings.append(field3)

        rowInds = userIndexer.getArray()
        colInds = itemIndexer.getArray()
        ratings = numpy.array(ratings)

        # numpy.int was only an alias of the builtin int and no longer exists
        # in current NumPy releases, so the builtin is used directly.
        shape = (len(userIndexer.getIdDict()), len(itemIndexer.getIdDict()))
        positive = numpy.logical_or(ratings > 4, ratings == 0)
        X = sppy.csarray(shape, storagetype="row", dtype=int)
        X.put(numpy.array(positive, int), numpy.array(rowInds, numpy.int32), numpy.array(colInds, numpy.int32), init=True)
        # NOTE(review): prune() presumably drops the explicit zeros written above - confirm against sppy
        X.prune()

        X = SparseUtils.pruneMatrixRowAndCols(X, minNnzRows, minNnzCols)

        logging.debug("Read file: " + matrixFileName)
        logging.debug("Non zero elements: " + str(X.nnz) + " shape: " + str(X.shape))

        return X
예제 #9
0
class IdIndexerTest(unittest.TestCase):
    """
    Tests for IdIndexer: append, translate, reverseTranslate and
    reverseTranslateDict.
    """

    def setUp(self):
        """Build an indexer holding "john", "james", "mark" and a repeated "james"."""
        numpy.set_printoptions(suppress=True, precision=3, linewidth=150)
        numpy.random.seed(21)

        self.indexer = IdIndexer()

        self.indexer.append("john")
        self.indexer.append("james")
        self.indexer.append("mark")
        self.indexer.append("james")

    def testAppend(self):
        """A repeated id must reuse the index it was given the first time."""
        indexer = IdIndexer()

        indexer.append("john")
        indexer.append("james")
        indexer.append("mark")
        indexer.append("james")

        nptst.assert_array_equal(indexer.getArray(), numpy.array([0, 1, 2, 1]))

    # assertEqual is used throughout: the assertEquals alias is deprecated and
    # has been removed from recent Python versions.
    def testTranslate(self):
        """translate() maps a list of ids to the list of their indices."""
        self.assertEqual(self.indexer.translate(["mark"]), [2])
        self.assertEqual(self.indexer.translate(["john"]), [0])

        self.assertEqual(self.indexer.translate(["john", "james"]), [0, 1])

    def testReverseTranslate(self):
        """reverseTranslate() maps an index, or a list of indices, back to ids."""
        self.assertEqual(self.indexer.reverseTranslate(0), "john")
        self.assertEqual(self.indexer.reverseTranslate(1), "james")
        self.assertEqual(self.indexer.reverseTranslate(2), "mark")

        self.assertEqual(self.indexer.reverseTranslate([2, 1, 0]),
                         ["mark", "james", "john"])

    def testReverseTranslateDict(self):
        """Re-appending the id stored under index i must return i again."""
        indDict = self.indexer.reverseTranslateDict()

        for i in range(len(self.indexer.getIdDict())):
            self.assertEqual(self.indexer.append(indDict[i]), i)
예제 #10
0
class IdIndexerTest(unittest.TestCase):
    """
    Unit tests exercising IdIndexer.append, translate, reverseTranslate and
    reverseTranslateDict.
    """

    def setUp(self):
        """Create self.indexer with three distinct ids, one of them appended twice."""
        numpy.set_printoptions(suppress=True, precision=3, linewidth=150)
        numpy.random.seed(21)

        self.indexer = IdIndexer()

        self.indexer.append("john")
        self.indexer.append("james")
        self.indexer.append("mark")
        self.indexer.append("james")

    def testAppend(self):
        """getArray() lists one index per append, repeating it for a repeated id."""
        indexer = IdIndexer()

        indexer.append("john")
        indexer.append("james")
        indexer.append("mark")
        indexer.append("james")

        nptst.assert_array_equal(indexer.getArray(), numpy.array([0, 1, 2, 1]))

    # The deprecated assertEquals alias no longer exists in recent Python
    # versions, so assertEqual is used instead.
    def testTranslate(self):
        """translate() returns the indices of the given ids, in order."""
        self.assertEqual(self.indexer.translate(["mark"]), [2])
        self.assertEqual(self.indexer.translate(["john"]), [0])

        self.assertEqual(self.indexer.translate(["john", "james"]), [0, 1])

    def testReverseTranslate(self):
        """reverseTranslate() accepts a single index or a list of indices."""
        self.assertEqual(self.indexer.reverseTranslate(0), "john")
        self.assertEqual(self.indexer.reverseTranslate(1), "james")
        self.assertEqual(self.indexer.reverseTranslate(2), "mark")

        self.assertEqual(self.indexer.reverseTranslate([2, 1, 0]), ["mark", "james", "john"])

    def testReverseTranslateDict(self):
        """Each entry of reverseTranslateDict() maps an index to the id holding it."""
        indDict = self.indexer.reverseTranslateDict()

        for i in range(len(self.indexer.getIdDict())):
            self.assertEqual(self.indexer.append(indDict[i]), i)
def writeAuthorXMatrix(inputFileName, authorIndexerFilename, authorXFileName, reverse=False):
    """
    Build a sparse author-by-document score matrix from a text file and save it.

    Nothing is done (apart from a debug message) when authorXFileName already
    exists. Otherwise every line of inputFileName is split on whitespace; the
    first two fields are the author and document ids and the third is an
    integer score. The author indexer is pickled to authorIndexerFilename and
    the matrix is written to authorXFileName with scipy.io.mmwrite.

    :param inputFileName: path of the whitespace-separated input file
    :param authorIndexerFilename: path the author IdIndexer is pickled to
    :param authorXFileName: path of the Matrix Market output file
    :param reverse: when True the second field is the author id and the first
        is the document id
    """
    if os.path.isfile(authorXFileName):
        logging.debug("File exists: " + authorXFileName)
        return

    authorIndex = IdIndexer()
    docIndex = IdIndexer()
    scores = array.array("i")

    # Positions of the author and document ids within each split line
    authorCol, docCol = (1, 0) if reverse else (0, 1)

    # The context manager closes the input file, which was previously leaked.
    with open(inputFileName) as fileObj:
        for i, line in enumerate(fileObj):
            if i % 500000 == 0:
                logging.debug(i)
            vals = line.split()

            authorIndex.append(vals[authorCol])
            docIndex.append(vals[docCol])
            scores.append(int(vals[2]))

    rowInds = numpy.array(authorIndex.getArray())
    colInds = numpy.array(docIndex.getArray())

    Y = scipy.sparse.csr_matrix((scores, (rowInds, colInds)))

    # Closed by the context manager even if pickling fails
    with open(authorIndexerFilename, "wb") as authorIndexerFile:
        pickle.dump(authorIndex, authorIndexerFile)

    scipy.io.mmwrite(authorXFileName, Y)
    logging.debug("Saved matrix to " + authorXFileName)
예제 #12
0
 # Read the edge list of the i-th network file, register its nodes in the
 # shared indexer and append the resulting normalised Laplacian to Ls.
 print(i)
 networkFilename = dataDir + "network_1_kcores/network_1-core" + str("%02d" % (i,)) + ".txt"

 node1Inds = array.array("i")
 node2Inds = array.array("i")

 # The context manager closes the file once the edges are read; previously a
 # handle was left open for every network file processed.
 with open(networkFilename) as networkFile:
     # Skip the four lines that precede the edge list
     networkFile.readline()
     networkFile.readline()
     networkFile.readline()
     networkFile.readline()

     for line in networkFile:
         vals = line.split()

         node1Inds.append(indexer.append(vals[0]))
         node2Inds.append(indexer.append(vals[1]))

 node1Inds = numpy.array(node1Inds)
 node2Inds = numpy.array(node2Inds)

 # Number of distinct node ids registered in the indexer so far
 m = len(indexer.getIdDict())

 # Dense adjacency matrix, symmetrised by averaging with its transpose
 A = numpy.zeros((m, m))
 A[node1Inds, node2Inds] = 1
 A = (A+A.T)/2

 A = scipy.sparse.csr_matrix(A)
 L = GraphUtils.normalisedLaplacianSym(A)
 Ls.append(L)