Python IdIndexer示例，sandbox.util.IdIndexer.IdIndexer Python示例

示例#1

0

显示文件

文件： DatasetUtils.py 项目： charanpald/wallhack

 def flixster(minNnzRows=10, minNnzCols=2, quantile=90):
     """
     Load the Flixster ratings file into a binary sparse user-by-movie matrix.

     The first line of the file is discarded; every following line is split
     on whitespace into user id, movie id and rating. An entry is written as
     1 where the rating is greater than 3 and as 0 otherwise, after which
     X.prune() is called (presumably dropping the explicit zeros -- confirm
     against sppy). The result is passed through
     SparseUtils.pruneMatrixRowAndCols.

     :param minNnzRows: forwarded to SparseUtils.pruneMatrixRowAndCols.
     :param minNnzCols: forwarded to SparseUtils.pruneMatrixRowAndCols.
     :param quantile: not used by this function; kept for interface compatibility.
     :return: the matrix returned by SparseUtils.pruneMatrixRowAndCols.
     """
     matrixFileName = PathDefaults.getDataDir() + "flixster/Ratings.timed.txt"
     userIndexer = IdIndexer("i")
     movieIndexer = IdIndexer("i")
     ratings = array.array("f")

     # The context manager closes the file even if a line fails to parse.
     with open(matrixFileName) as matrixFile:
         matrixFile.readline()  # discard the first line (presumably a header)
         logging.debug("Loading ratings from " + matrixFileName)

         for i, line in enumerate(matrixFile):
             if i % 1000000 == 0:
                 logging.debug("Iteration: " + str(i))
             vals = line.split()

             userIndexer.append(vals[0])
             movieIndexer.append(vals[1])
             ratings.append(float(vals[2]))

     rowInds = userIndexer.getArray()
     colInds = movieIndexer.getArray()
     ratings = numpy.array(ratings)

     # numpy.int was only an alias of the builtin int and was removed in
     # NumPy 1.24, so the builtin is used directly.
     shape = (len(userIndexer.getIdDict()), len(movieIndexer.getIdDict()))
     X = sppy.csarray(shape, storagetype="row", dtype=int)
     X.put(
         numpy.array(ratings > 3, int),
         numpy.array(rowInds, numpy.int32),
         numpy.array(colInds, numpy.int32),
         init=True,
     )
     X.prune()

     X = SparseUtils.pruneMatrixRowAndCols(X, minNnzRows, minNnzCols)

     logging.debug("Read file: " + matrixFileName)
     logging.debug("Non zero elements: " + str(X.nnz) + " shape: " + str(X.shape))

     return X

示例#2

0

显示文件

文件： DatasetUtils.py 项目： charanpald/wallhack

    def epinions(minNnzRows=10, minNnzCols=3, quantile=90):
        """
        Load the Epinions ratings into a binary sparse user-by-item matrix.

        The "rating" variable of the .mat file is read; columns 0 and 1 of
        each row are used as user and item ids and column 3 as the rating
        value. An entry is written as 1 where the rating is greater than 3
        and as 0 otherwise, after which X.prune() is called (presumably
        dropping the explicit zeros -- confirm against sppy). The result is
        passed through SparseUtils.pruneMatrixRowAndCols.

        :param minNnzRows: forwarded to SparseUtils.pruneMatrixRowAndCols.
        :param minNnzCols: forwarded to SparseUtils.pruneMatrixRowAndCols.
        :param quantile: not used by this function; kept for interface compatibility.
        :return: the matrix returned by SparseUtils.pruneMatrixRowAndCols.
        """
        matrixFileName = PathDefaults.getDataDir() + "epinions/rating.mat"
        A = scipy.io.loadmat(matrixFileName)["rating"]

        userIndexer = IdIndexer("i")
        itemIndexer = IdIndexer("i")

        for i in range(A.shape[0]):
            userIndexer.append(A[i, 0])
            itemIndexer.append(A[i, 1])

        rowInds = userIndexer.getArray()
        colInds = itemIndexer.getArray()
        ratings = A[:, 3]

        # numpy.int was only an alias of the builtin int and was removed in
        # NumPy 1.24, so the builtin is used directly.
        shape = (len(userIndexer.getIdDict()), len(itemIndexer.getIdDict()))
        X = sppy.csarray(shape, storagetype="row", dtype=int)
        X.put(
            numpy.array(ratings > 3, int),
            numpy.array(rowInds, numpy.int32),
            numpy.array(colInds, numpy.int32),
            init=True,
        )
        X.prune()

        X = SparseUtils.pruneMatrixRowAndCols(X, minNnzRows, minNnzCols)

        logging.debug("Read file: " + matrixFileName)
        logging.debug("Non zero elements: " + str(X.nnz) + " shape: " + str(X.shape))

        return X

示例#3

0

显示文件

文件： IdIndexerTest.py 项目： kentwang/sandbox

    def testAppend(self):
        """Appending ids records one index per call; a repeated id reuses its index."""
        indexer = IdIndexer()
        for name in ("john", "james", "mark", "james"):
            indexer.append(name)

        expected = numpy.array([0, 1, 2, 1])
        nptst.assert_array_equal(indexer.getArray(), expected)

示例#4

0

显示文件

    def setUp(self):
        """Fix the numpy seed and print options, then build an indexer holding four names."""
        numpy.random.seed(21)
        numpy.set_printoptions(suppress=True, precision=3, linewidth=150)

        self.indexer = IdIndexer()
        # "james" is appended twice on purpose so the fixture contains a repeated id.
        for name in ["john", "james", "mark", "james"]:
            self.indexer.append(name)

示例#5

0

显示文件

文件： Static2IdValDataset.py 项目： charanpald/wallhack

 def __init__(self, dataFilename, split=0.8):
     """
     Read a ratings file and build random train/test sparse matrices.

     Every line of dataFilename is split on whitespace; the first three
     tokens are taken as author id, item id and integer rating. The ratings
     are shuffled with numpy.random.permutation; the first int(n*split) go
     into the training matrix and the remainder into the test matrix. Both
     are scipy.sparse.csc_matrix objects of the same shape and are stored
     as single-element lists in self.trainXList and self.testXList.

     :param dataFilename: path of the ratings file to read.
     :param split: fraction of the ratings assigned to the training matrix.
     """
     printStep = 1000000

     authorIndexer = IdIndexer()
     itemIndexer = IdIndexer()
     ratings = array.array("i")

     # Read the ratings. The context manager closes the file even when a
     # line cannot be parsed.
     numLines = 0
     with open(dataFilename) as dataFile:
         for i, line in enumerate(dataFile):
             if i % printStep == 0:
                 logging.debug("Iteration: " + str(i))
             vals = line.split()

             authorIndexer.append(vals[0])
             itemIndexer.append(vals[1])
             ratings.append(int(vals[2]))
             numLines = i + 1

     # numLines stays 0 for an empty file, where the loop variable would
     # otherwise be unbound.
     logging.debug("Read file with " + str(numLines) + " lines")

     authorInds = numpy.array(authorIndexer.getArray())
     itemInds = numpy.array(itemIndexer.getArray())
     ratings = numpy.array(ratings)

     logging.debug("Number of authors: " + str(len(authorIndexer.getIdDict())))
     logging.debug("Number of items: " + str(len(itemIndexer.getIdDict())))
     logging.debug("Number of ratings: " + str(ratings.shape[0]))

     # Drop the indexers before the matrices are built to lower peak memory.
     del authorIndexer
     del itemIndexer
     gc.collect()

     # Train and test share one shape so their indices stay comparable.
     shape = (numpy.max(authorInds)+1, numpy.max(itemInds)+1)
     inds = numpy.random.permutation(ratings.shape[0])
     numTrain = int(inds.shape[0]*split)

     trainInds = inds[0:numTrain]
     trainX = scipy.sparse.csc_matrix((ratings[trainInds], (authorInds[trainInds], itemInds[trainInds])), shape=shape)

     testInds = inds[numTrain:]
     testX = scipy.sparse.csc_matrix((ratings[testInds], (authorInds[testInds], itemInds[testInds])), shape=shape)

     del authorInds, itemInds, ratings
     gc.collect()

     self.trainXList = [trainX]
     self.testXList = [testX]

示例#6

0

显示文件

文件： IdIndexerTest.py 项目： kentwang/sandbox

    def setUp(self):
        """Fix the numpy seed and print options, then build an indexer holding four names."""
        numpy.random.seed(21)
        numpy.set_printoptions(suppress=True, precision=3, linewidth=150)

        names = ["john", "james", "mark", "james"]
        self.indexer = IdIndexer()
        # "james" appears twice on purpose so the fixture contains a repeated id.
        for name in names:
            self.indexer.append(name)

示例#7

0

显示文件

文件： ArnetMinerDataset.py 项目： charanpald/wallhack

 def coauthorsGraphFromAuthors2(self, relevantExperts, field):
     """
     Take a set of relevant authors and return their coauthorship graph.

     The file self.dataDir + "__" + <field without spaces> + ".csv" is read
     as ';'-separated lines. Fields 1 and 2 are joined with a space to form
     the author name and field 4 is used as the article id. Lines with too
     few fields are skipped. For every article, each pair of its authors
     that are both in relevantExperts contributes one edge. Parallel edges
     are merged by summing their weights, and "invWeight" is set to the
     reciprocal of the merged weight.

     :param relevantExperts: collection of author names; all of them are
         added to the indexer first, and only pairs of them become edges.
     :param field: field name used (with spaces removed) in the file name.
     :return: tuple (igraph.Graph, IdIndexer) where the indexer was used to
         assign the vertex indices.
     """
     dataFileName = self.dataDir + "__" + field.replace(' ', '') + ".csv"
     authorIndexer = IdIndexer()
     author1Inds = array.array("i")
     author2Inds = array.array("i")

     # Maps article id -> list of author names, in file order.
     articleDict = {}

     for relevantExpert in relevantExperts:
         authorIndexer.append(relevantExpert)

     # The context manager closes the file even if reading fails part way.
     with open(dataFileName) as dataFile:
         for line in dataFile:
             fields = [x.strip() for x in line.split(";")]
             try:
                 author = fields[1] + " " + fields[2]
                 articleId = fields[4]
             except IndexError:
                 # Ignore bad lines
                 continue

             articleDict.setdefault(articleId, []).append(author)

     for authors in articleDict.values():
         for author1, author2 in itertools.combinations(authors, 2):
             if author1 in relevantExperts and author2 in relevantExperts:
                 author1Ind = authorIndexer.append(author1)
                 author2Ind = authorIndexer.append(author2)

                 author1Inds.append(author1Ind)
                 author2Inds.append(author2Ind)

     logging.debug("Found " + str(len(authorIndexer.getIdDict())) + " coauthors")

     # Coauthor graph is undirected.
     # numpy.int was only an alias of the builtin int and was removed in
     # NumPy 1.24, so the builtin is used directly.
     author1Inds = numpy.array(author1Inds, int)
     author2Inds = numpy.array(author2Inds, int)
     edges = numpy.c_[author1Inds, author2Inds]

     graph = igraph.Graph()
     graph.add_vertices(len(authorIndexer.getIdDict()))
     graph.add_edges(edges)
     graph.es["weight"] = numpy.ones(graph.ecount())
     # Merge parallel edges, summing their unit weights.
     graph.simplify(combine_edges=sum)
     graph.es["invWeight"] = 1.0/(numpy.array(graph.es["weight"]))

     return graph, authorIndexer

示例#8

0

显示文件

文件： DatasetUtils.py 项目： charanpald/wallhack

    def bookCrossing(minNnzRows=10, minNnzCols=3, quantile=90):
        """
        Load the Book-Crossing ratings into a binary sparse user-by-item matrix.

        The first line of the CSV file is discarded; every following line is
        split on ';' and the surrounding double quotes are stripped from the
        user id, item id and integer rating. An entry is written as 1 where
        the rating is greater than 4 or equal to 0, and as 0 otherwise, after
        which X.prune() is called (presumably dropping the explicit zeros --
        confirm against sppy). The result is passed through
        SparseUtils.pruneMatrixRowAndCols.

        :param minNnzRows: forwarded to SparseUtils.pruneMatrixRowAndCols.
        :param minNnzCols: forwarded to SparseUtils.pruneMatrixRowAndCols.
        :param quantile: not used by this function; kept for interface compatibility.
        :return: the matrix returned by SparseUtils.pruneMatrixRowAndCols.
        """
        matrixFileName = PathDefaults.getDataDir() + "book-crossing/BX-Book-Ratings.csv"
        userIndexer = IdIndexer("i")
        itemIndexer = IdIndexer("i")
        ratings = array.array("f")

        # The context manager closes the file even if a line fails to parse.
        with open(matrixFileName) as matrixFile:
            matrixFile.readline()  # discard the first line (presumably a header)
            logging.debug("Loading ratings from " + matrixFileName)

            for i, line in enumerate(matrixFile):
                if i % 1000000 == 0:
                    logging.debug("Iteration: " + str(i))
                vals = line.split(";")

                field1 = vals[0].strip("\"")
                field2 = vals[1].strip("\"")
                field3 = int(vals[2].strip("\"\n\r"))

                userIndexer.append(field1)
                itemIndexer.append(field2)
                ratings.append(field3)

        rowInds = userIndexer.getArray()
        colInds = itemIndexer.getArray()
        ratings = numpy.array(ratings)

        # numpy.int was only an alias of the builtin int and was removed in
        # NumPy 1.24, so the builtin is used directly.
        shape = (len(userIndexer.getIdDict()), len(itemIndexer.getIdDict()))
        X = sppy.csarray(shape, storagetype="row", dtype=int)
        X.put(
            numpy.array(numpy.logical_or(ratings > 4, ratings == 0), int),
            numpy.array(rowInds, numpy.int32),
            numpy.array(colInds, numpy.int32),
            init=True,
        )
        X.prune()

        X = SparseUtils.pruneMatrixRowAndCols(X, minNnzRows, minNnzCols)

        logging.debug("Read file: " + matrixFileName)
        logging.debug("Non zero elements: " + str(X.nnz) + " shape: " + str(X.shape))

        return X

示例#9

0

显示文件

文件： ArnetMinerDataset.py 项目： charanpald/wallhack

 def coauthorsGraphFromAuthors(self, relevantExperts):
     """
     Take a set of relevant authors and return their coauthorship graph.

     self.dataFilename is scanned line by line for "#@" records; the text
     after the marker is split on ',' into author names. For every such
     line, each pair of its authors that are both in relevantExperts
     contributes one edge. Parallel edges are merged by summing their
     weights, and "invWeight" is set to the reciprocal of the merged weight.

     :param relevantExperts: collection of author names; all of them are
         added to the indexer first, and only pairs of them become edges.
     :return: tuple (igraph.Graph, IdIndexer) where the indexer was used to
         assign the vertex indices.
     """
     authorIndexer = IdIndexer()
     author1Inds = array.array("i")
     author2Inds = array.array("i")

     for relevantExpert in relevantExperts:
         authorIndexer.append(relevantExpert)

     # The original never closed this file; the context manager releases it
     # on both normal exit and error.
     with open(self.dataFilename) as dataFile:
         for i, line in enumerate(dataFile):
             Util.printIteration(i, self.stepSize, self.numLines)
             authors = re.findall("#@(.*)", line)

             if len(authors) == 0:
                 continue

             authors = set([x.strip() for x in authors[0].split(",")])
             # Cheap pre-check: skip lines that mention no relevant expert.
             if len(authors.intersection(relevantExperts)) == 0:
                 continue

             for author1, author2 in itertools.combinations(authors, 2):
                 if author1 in relevantExperts and author2 in relevantExperts:
                     author1Ind = authorIndexer.append(author1)
                     author2Ind = authorIndexer.append(author2)

                     author1Inds.append(author1Ind)
                     author2Inds.append(author2Ind)

     logging.debug("Found " + str(len(authorIndexer.getIdDict())) + " coauthors")

     # Coauthor graph is undirected.
     # numpy.int was only an alias of the builtin int and was removed in
     # NumPy 1.24, so the builtin is used directly.
     author1Inds = numpy.array(author1Inds, int)
     author2Inds = numpy.array(author2Inds, int)
     edges = numpy.c_[author1Inds, author2Inds]

     graph = igraph.Graph()
     graph.add_vertices(len(authorIndexer.getIdDict()))
     graph.add_edges(edges)
     graph.es["weight"] = numpy.ones(graph.ecount())
     # Merge parallel edges, summing their unit weights.
     graph.simplify(combine_edges=sum)
     graph.es["invWeight"] = 1.0/(numpy.array(graph.es["weight"]))

     return graph, authorIndexer

示例#10

0

显示文件

文件： BoundExp.py 项目： charanpald/wallhack

import matplotlib 
matplotlib.use("GTK3Agg")
import matplotlib.pyplot as plt  
from sandbox.util.PathDefaults import PathDefaults 
from sandbox.util.IdIndexer import IdIndexer
from sandbox.util.Latex import Latex 
from apgl.graph.GraphUtils import GraphUtils 

"""
We try to figure out the change in L_i and L_{i+1}
"""

# Print floats without scientific notation and with 4 decimal places.
numpy.set_printoptions(suppress=True, precision=4)

# Directory, below the project data directory, that this script works in.
dataDir = PathDefaults.getDataDir() + "kcore/"
indexer = IdIndexer()

# Typed int arrays; presumably the two endpoints of each edge -- the code
# that fills them is not visible in this excerpt.
node1Inds = array.array("i")
node2Inds = array.array("i")

# Empty accumulators. NOTE(review): nothing in this excerpt fills them;
# their meaning must be confirmed against the rest of the script.
Ls = []
us = []

boundFro = [] 
bound2 = []
ks = []
eyes = []
deltas = []

# Iterates i = 1..8; only the print is visible here and the loop body may
# continue past this excerpt.
for i in range(1, 9): 
    print(i)

示例#11

0

显示文件

文件： GenerateMendeleyCoauthors.py 项目： charanpald/wallhack

def writeAuthorXMatrix(inputFileName, authorIndexerFilename, authorXFileName, reverse=False):
    """
    Build a sparse author-by-document score matrix and save it to disk.

    Nothing is read or written when authorXFileName already exists.
    Otherwise every line of inputFileName is split on whitespace into two
    ids and an integer score. The author indexer is pickled to
    authorIndexerFilename and the matrix is written with scipy.io.mmwrite.

    :param inputFileName: path of the whitespace-separated input file.
    :param authorIndexerFilename: path the author IdIndexer is pickled to.
    :param authorXFileName: path the matrix is written to; also the
        existence check that decides whether any work is done.
    :param reverse: when False the first token of a line is the author id
        and the second the document id; when True the two are swapped.
    :return: None.
    """
    if os.path.isfile(authorXFileName):
        logging.debug("File exists: " + authorXFileName)
        return

    authorIndex = IdIndexer()
    docIndex = IdIndexer()
    scores = array.array("i")

    # The original never closed the input file; the context manager
    # releases it on both normal exit and error.
    with open(inputFileName) as fileObj:
        for i, line in enumerate(fileObj):
            if i % 500000 == 0:
                logging.debug(i)
            vals = line.split()

            if reverse:
                authorIndex.append(vals[1])
                docIndex.append(vals[0])
            else:
                authorIndex.append(vals[0])
                docIndex.append(vals[1])

            scores.append(int(vals[2]))

    rowInds = numpy.array(authorIndex.getArray())
    colInds = numpy.array(docIndex.getArray())

    Y = scipy.sparse.csr_matrix((scores, (rowInds, colInds)))

    with open(authorIndexerFilename, "wb") as authorIndexerFile:
        pickle.dump(authorIndex, authorIndexerFile)
    scipy.io.mmwrite(authorXFileName, Y)
    logging.debug("Saved matrix to " + authorXFileName)

示例#12

0

显示文件

class IdIndexerTest(unittest.TestCase):
    """Unit tests for IdIndexer: append, translate and the reverse translations."""

    def setUp(self):
        """Fix the numpy seed and print options, then build an indexer holding four names."""
        numpy.set_printoptions(suppress=True, precision=3, linewidth=150)
        numpy.random.seed(21)

        self.indexer = IdIndexer()

        # "james" is appended twice on purpose so the fixture contains a repeated id.
        self.indexer.append("john")
        self.indexer.append("james")
        self.indexer.append("mark")
        self.indexer.append("james")

    def testAppend(self):
        """Appending ids records one index per call; a repeated id reuses its index."""
        indexer = IdIndexer()

        indexer.append("john")
        indexer.append("james")
        indexer.append("mark")
        indexer.append("james")

        nptst.assert_array_equal(indexer.getArray(), numpy.array([0, 1, 2, 1]))

    # assertEqual replaces the deprecated alias assertEquals, which was
    # removed from unittest in Python 3.12.
    def testTranslate(self):
        """translate maps a list of ids to the list of their indices."""
        self.assertEqual(self.indexer.translate(["mark"]), [2])
        self.assertEqual(self.indexer.translate(["john"]), [0])

        self.assertEqual(self.indexer.translate(["john", "james"]), [0, 1])

    def testReverseTranslate(self):
        """reverseTranslate maps an index, or a list of indices, back to ids."""
        self.assertEqual(self.indexer.reverseTranslate(0), "john")
        self.assertEqual(self.indexer.reverseTranslate(1), "james")
        self.assertEqual(self.indexer.reverseTranslate(2), "mark")

        self.assertEqual(self.indexer.reverseTranslate([2, 1, 0]),
                         ["mark", "james", "john"])

    def testReverseTranslateDict(self):
        """Re-appending the id stored under index i returns i again."""
        indDict = self.indexer.reverseTranslateDict()

        for i in range(len(self.indexer.getIdDict())):
            self.assertEqual(self.indexer.append(indDict[i]), i)

示例#13

0

显示文件

    def testAppend(self):
        """Appending ids records one index per call; a repeated id reuses its index."""
        names = ["john", "james", "mark", "james"]
        expectedInds = numpy.array([0, 1, 2, 1])

        indexer = IdIndexer()
        for name in names:
            indexer.append(name)

        nptst.assert_array_equal(indexer.getArray(), expectedInds)

示例#14

0

显示文件

文件： IdIndexerTest.py 项目： kentwang/sandbox

class IdIndexerTest(unittest.TestCase):
    """Unit tests for IdIndexer: append, translate and the reverse translations."""

    def setUp(self):
        """Fix the numpy seed and print options, then build an indexer holding four names."""
        numpy.set_printoptions(suppress=True, precision=3, linewidth=150)
        numpy.random.seed(21)

        self.indexer = IdIndexer()

        # "james" is appended twice on purpose so the fixture contains a repeated id.
        self.indexer.append("john")
        self.indexer.append("james")
        self.indexer.append("mark")
        self.indexer.append("james")

    def testAppend(self):
        """Appending ids records one index per call; a repeated id reuses its index."""
        indexer = IdIndexer()

        indexer.append("john")
        indexer.append("james")
        indexer.append("mark")
        indexer.append("james")

        nptst.assert_array_equal(indexer.getArray(), numpy.array([0, 1, 2, 1]))

    # assertEqual replaces the deprecated alias assertEquals, which was
    # removed from unittest in Python 3.12.
    def testTranslate(self):
        """translate maps a list of ids to the list of their indices."""
        self.assertEqual(self.indexer.translate(["mark"]), [2])
        self.assertEqual(self.indexer.translate(["john"]), [0])

        self.assertEqual(self.indexer.translate(["john", "james"]), [0, 1])

    def testReverseTranslate(self):
        """reverseTranslate maps an index, or a list of indices, back to ids."""
        self.assertEqual(self.indexer.reverseTranslate(0), "john")
        self.assertEqual(self.indexer.reverseTranslate(1), "james")
        self.assertEqual(self.indexer.reverseTranslate(2), "mark")

        self.assertEqual(self.indexer.reverseTranslate([2, 1, 0]), ["mark", "james", "john"])

    def testReverseTranslateDict(self):
        """Re-appending the id stored under index i returns i again."""
        indDict = self.indexer.reverseTranslateDict()

        for i in range(len(self.indexer.getIdDict())):
            self.assertEqual(self.indexer.append(indDict[i]), i)