def flixster(minNnzRows=10, minNnzCols=2, quantile=90):
    """
    Load the Flixster timed ratings file as a binary user-by-movie matrix.

    Each line after the header is split on whitespace; the first three
    fields are read as user id, movie id and rating. An entry of the
    resulting matrix is 1 when the rating is strictly greater than 3.

    :param minNnzRows: passed to SparseUtils.pruneMatrixRowAndCols
        (presumably the minimum number of non-zeros a row must have).
    :param minNnzCols: passed to SparseUtils.pruneMatrixRowAndCols
        (presumably the minimum number of non-zeros a column must have).
    :param quantile: unused here; kept so the signature matches the other
        dataset loaders.
    :return: the pruned sppy.csarray of 0/1 integer entries.
    """
    matrixFileName = PathDefaults.getDataDir() + "flixster/Ratings.timed.txt"
    userIndexer = IdIndexer("i")
    movieIndexer = IdIndexer("i")
    ratings = array.array("f")

    # The context manager guarantees the file is closed even if a line
    # fails to parse.
    with open(matrixFileName) as matrixFile:
        # Skip the header line.
        matrixFile.readline()
        logging.debug("Loading ratings from " + matrixFileName)

        for i, line in enumerate(matrixFile):
            if i % 1000000 == 0:
                logging.debug("Iteration: " + str(i))
            vals = line.split()

            userIndexer.append(vals[0])
            movieIndexer.append(vals[1])
            ratings.append(float(vals[2]))

    rowInds = userIndexer.getArray()
    colInds = movieIndexer.getArray()
    ratings = numpy.array(ratings)

    # numpy.int was only an alias of the builtin int and has been removed
    # from recent NumPy releases, so the builtin is used directly.
    shape = (len(userIndexer.getIdDict()), len(movieIndexer.getIdDict()))
    X = sppy.csarray(shape, storagetype="row", dtype=int)
    X.put(numpy.array(ratings > 3, int),
          numpy.array(rowInds, numpy.int32),
          numpy.array(colInds, numpy.int32),
          init=True)
    # Presumably drops the explicitly stored zeros (ratings <= 3).
    X.prune()

    X = SparseUtils.pruneMatrixRowAndCols(X, minNnzRows, minNnzCols)

    logging.debug("Read file: " + matrixFileName)
    logging.debug("Non zero elements: " + str(X.nnz) + " shape: " + str(X.shape))

    return X
def epinions(minNnzRows=10, minNnzCols=3, quantile=90):
    """
    Load the Epinions rating matrix as a binary user-by-item matrix.

    The MATLAB file is expected to hold a 2D array under the key "rating";
    column 0 is read as the user id, column 1 as the item id and column 3
    as the rating. An entry of the result is 1 when the rating is strictly
    greater than 3.

    :param minNnzRows: passed to SparseUtils.pruneMatrixRowAndCols
        (presumably the minimum number of non-zeros a row must have).
    :param minNnzCols: passed to SparseUtils.pruneMatrixRowAndCols
        (presumably the minimum number of non-zeros a column must have).
    :param quantile: unused here; kept so the signature matches the other
        dataset loaders.
    :return: the pruned sppy.csarray of 0/1 integer entries.
    """
    matrixFileName = PathDefaults.getDataDir() + "epinions/rating.mat"
    A = scipy.io.loadmat(matrixFileName)["rating"]

    userIndexer = IdIndexer("i")
    itemIndexer = IdIndexer("i")

    # Walk the user and item columns in lockstep instead of indexing by row.
    for userId, itemId in zip(A[:, 0], A[:, 1]):
        userIndexer.append(userId)
        itemIndexer.append(itemId)

    rowInds = userIndexer.getArray()
    colInds = itemIndexer.getArray()
    ratings = A[:, 3]

    # numpy.int was only an alias of the builtin int and has been removed
    # from recent NumPy releases, so the builtin is used directly.
    shape = (len(userIndexer.getIdDict()), len(itemIndexer.getIdDict()))
    X = sppy.csarray(shape, storagetype="row", dtype=int)
    X.put(numpy.array(ratings > 3, int),
          numpy.array(rowInds, numpy.int32),
          numpy.array(colInds, numpy.int32),
          init=True)
    # Presumably drops the explicitly stored zeros (ratings <= 3).
    X.prune()

    X = SparseUtils.pruneMatrixRowAndCols(X, minNnzRows, minNnzCols)

    logging.debug("Read file: " + matrixFileName)
    logging.debug("Non zero elements: " + str(X.nnz) + " shape: " + str(X.shape))

    return X
def testAppend(self):
    """Appending ids yields one index per call, reusing the index of a repeated id."""
    names = ("john", "james", "mark", "james")
    expectedInds = numpy.array([0, 1, 2, 1])

    indexer = IdIndexer()
    for name in names:
        indexer.append(name)

    # "james" is appended twice and must map to the same index both times.
    nptst.assert_array_equal(indexer.getArray(), expectedInds)
def setUp(self):
    """Fix numpy printing and seeding, then build the indexer shared by the tests."""
    numpy.set_printoptions(suppress=True, precision=3, linewidth=150)
    numpy.random.seed(21)

    # Ids are appended in this order; "james" appears twice on purpose.
    indexer = IdIndexer()
    for name in ("john", "james", "mark", "james"):
        indexer.append(name)
    self.indexer = indexer
def __init__(self, dataFilename, split=0.8):
    """
    Read a ratings file and split it randomly into train and test matrices.

    Each line is split on whitespace; the first three fields are read as
    author id, item id and an integer rating. The ratings are shuffled with
    numpy.random.permutation and the first ``split`` fraction goes into the
    training matrix, the remainder into the test matrix. Both matrices share
    the same shape and are stored as one-element lists in ``self.trainXList``
    and ``self.testXList``.

    :param dataFilename: path of the ratings file to read.
    :param split: fraction of the ratings assigned to the training matrix.
    :raises ValueError: if the file contains no lines.
    """
    printStep = 1000000

    authorIndexer = IdIndexer()
    itemIndexer = IdIndexer()
    ratings = array.array("i")
    numLines = 0

    # Read the ratings; the context manager closes the file even when a line
    # cannot be parsed.
    with open(dataFilename) as dataFile:
        for i, line in enumerate(dataFile):
            if i % printStep == 0:
                logging.debug("Iteration: " + str(i))
            vals = line.split()

            authorIndexer.append(vals[0])
            itemIndexer.append(vals[1])
            ratings.append(int(vals[2]))
            numLines = i + 1

    # Counting lines explicitly avoids reading an unbound loop variable when
    # the file is empty.
    logging.debug("Read file with " + str(numLines) + " lines")
    if numLines == 0:
        raise ValueError("No ratings found in " + dataFilename)

    authorInds = numpy.array(authorIndexer.getArray())
    itemInds = numpy.array(itemIndexer.getArray())
    ratings = numpy.array(ratings)

    logging.debug("Number of authors: " + str(len(authorIndexer.getIdDict())))
    logging.debug("Number of items: " + str(len(itemIndexer.getIdDict())))
    logging.debug("Number of ratings: " + str(ratings.shape[0]))

    # Release the indexers before the sparse matrices are built.
    del authorIndexer
    del itemIndexer
    gc.collect()

    shape = (numpy.max(authorInds) + 1, numpy.max(itemInds) + 1)

    inds = numpy.random.permutation(ratings.shape[0])
    numTrain = int(inds.shape[0] * split)

    trainInds = inds[0:numTrain]
    trainX = scipy.sparse.csc_matrix(
        (ratings[trainInds], (authorInds[trainInds], itemInds[trainInds])), shape=shape)

    testInds = inds[numTrain:]
    testX = scipy.sparse.csc_matrix(
        (ratings[testInds], (authorInds[testInds], itemInds[testInds])), shape=shape)

    del authorInds, itemInds, ratings
    gc.collect()

    self.trainXList = [trainX]
    self.testXList = [testX]
def coauthorsGraphFromAuthors2(self, relevantExperts, field):
    """
    Build the coauthor graph restricted to a collection of relevant authors.

    The data file is ``self.dataDir + "__" + field-without-spaces + ".csv"``.
    Each line is split on ";"; fields 1 and 2 are joined with a space to form
    the author name and field 4 is the article id. Lines with too few fields
    are ignored. Two relevant authors are linked once for every article they
    share, and parallel edges are merged by summing their weights.

    :param relevantExperts: collection of author names to keep; every one of
        them becomes a vertex, even without any coauthor.
    :param field: field name used to locate the data file.
    :return: tuple ``(graph, authorIndexer)`` where graph is an igraph.Graph
        with edge attributes "weight" and "invWeight", and authorIndexer maps
        author names to vertex indices.
    """
    dataFileName = self.dataDir + "__" + field.replace(' ', '') + ".csv"

    authorIndexer = IdIndexer()
    author1Inds = array.array("i")
    author2Inds = array.array("i")
    articleDict = {}

    # Index the relevant experts first so they occupy the first vertices.
    for relevantExpert in relevantExperts:
        authorIndexer.append(relevantExpert)

    # Group the authors by article; the file is closed even on errors.
    with open(dataFileName) as dataFile:
        for line in dataFile:
            fields = [x.strip() for x in line.split(";")]
            try:
                author = fields[1] + " " + fields[2]
                articleId = fields[4]
            except IndexError:
                # Ignore bad lines
                continue
            articleDict.setdefault(articleId, []).append(author)

    # Every pair of relevant authors on the same article contributes an edge.
    for authors in articleDict.values():
        for author1, author2 in itertools.combinations(authors, 2):
            if author1 in relevantExperts and author2 in relevantExperts:
                author1Ind = authorIndexer.append(author1)
                author2Ind = authorIndexer.append(author2)

                author1Inds.append(author1Ind)
                author2Inds.append(author2Ind)

    logging.debug("Found " + str(len(authorIndexer.getIdDict())) + " coauthors")

    # Coauthor graph is undirected.
    # numpy.int was only an alias of the builtin int and has been removed
    # from recent NumPy releases, so the builtin is used directly.
    author1Inds = numpy.array(author1Inds, int)
    author2Inds = numpy.array(author2Inds, int)
    edges = numpy.c_[author1Inds, author2Inds]

    graph = igraph.Graph()
    graph.add_vertices(len(authorIndexer.getIdDict()))
    graph.add_edges(edges)
    graph.es["weight"] = numpy.ones(graph.ecount())
    # Merge parallel edges, summing their weights.
    graph.simplify(combine_edges=sum)
    graph.es["invWeight"] = 1.0/(numpy.array(graph.es["weight"]))

    return graph, authorIndexer
def bookCrossing(minNnzRows=10, minNnzCols=3, quantile=90):
    """
    Load the Book-Crossing ratings file as a binary user-by-item matrix.

    Each line after the header is split on ";" and the surrounding double
    quotes are stripped; the three fields are read as user id, item id and
    an integer rating. An entry of the result is 1 when the rating is
    strictly greater than 4 or exactly 0.

    :param minNnzRows: passed to SparseUtils.pruneMatrixRowAndCols
        (presumably the minimum number of non-zeros a row must have).
    :param minNnzCols: passed to SparseUtils.pruneMatrixRowAndCols
        (presumably the minimum number of non-zeros a column must have).
    :param quantile: unused here; kept so the signature matches the other
        dataset loaders.
    :return: the pruned sppy.csarray of 0/1 integer entries.
    """
    matrixFileName = PathDefaults.getDataDir() + "book-crossing/BX-Book-Ratings.csv"
    userIndexer = IdIndexer("i")
    itemIndexer = IdIndexer("i")
    ratings = array.array("f")

    # The context manager guarantees the file is closed even if a line
    # fails to parse.
    with open(matrixFileName) as matrixFile:
        # Skip the header line.
        matrixFile.readline()
        logging.debug("Loading ratings from " + matrixFileName)

        for i, line in enumerate(matrixFile):
            if i % 1000000 == 0:
                logging.debug("Iteration: " + str(i))
            vals = line.split(";")

            field1 = vals[0].strip("\"")
            field2 = vals[1].strip("\"")
            field3 = int(vals[2].strip("\"\n\r"))

            userIndexer.append(field1)
            itemIndexer.append(field2)
            ratings.append(field3)

    rowInds = userIndexer.getArray()
    colInds = itemIndexer.getArray()
    ratings = numpy.array(ratings)

    # NOTE(review): a rating of 0 is treated as positive; presumably 0 marks
    # an implicit rating in this data set - confirm.
    positive = numpy.logical_or(ratings > 4, ratings == 0)

    # numpy.int was only an alias of the builtin int and has been removed
    # from recent NumPy releases, so the builtin is used directly.
    shape = (len(userIndexer.getIdDict()), len(itemIndexer.getIdDict()))
    X = sppy.csarray(shape, storagetype="row", dtype=int)
    X.put(numpy.array(positive, int),
          numpy.array(rowInds, numpy.int32),
          numpy.array(colInds, numpy.int32),
          init=True)
    # Presumably drops the explicitly stored zeros.
    X.prune()

    X = SparseUtils.pruneMatrixRowAndCols(X, minNnzRows, minNnzCols)

    logging.debug("Read file: " + matrixFileName)
    logging.debug("Non zero elements: " + str(X.nnz) + " shape: " + str(X.shape))

    return X
def coauthorsGraphFromAuthors(self, relevantExperts):
    """
    Build the coauthor graph restricted to a collection of relevant authors.

    The file ``self.dataFilename`` is scanned for lines containing "#@"
    followed by a comma-separated list of author names. Two relevant authors
    are linked once for every such line they share, and parallel edges are
    merged by summing their weights.

    :param relevantExperts: collection of author names to keep; every one of
        them becomes a vertex, even without any coauthor.
    :return: tuple ``(graph, authorIndexer)`` where graph is an igraph.Graph
        with edge attributes "weight" and "invWeight", and authorIndexer maps
        author names to vertex indices.
    """
    authorIndexer = IdIndexer()
    author1Inds = array.array("i")
    author2Inds = array.array("i")

    # Index the relevant experts first so they occupy the first vertices.
    for relevantExpert in relevantExperts:
        authorIndexer.append(relevantExpert)

    # Compile once instead of looking the pattern up on every line.
    authorPattern = re.compile("#@(.*)")

    # The context manager closes the file, which was previously left open.
    with open(self.dataFilename) as dataFile:
        for i, line in enumerate(dataFile):
            Util.printIteration(i, self.stepSize, self.numLines)
            matches = authorPattern.findall(line)
            if not matches:
                continue

            authors = set([x.strip() for x in matches[0].split(",")])
            # Cheap pre-filter: skip lines that mention no relevant author.
            if len(authors.intersection(relevantExperts)) == 0:
                continue

            for author1, author2 in itertools.combinations(authors, 2):
                if author1 in relevantExperts and author2 in relevantExperts:
                    author1Ind = authorIndexer.append(author1)
                    author2Ind = authorIndexer.append(author2)

                    author1Inds.append(author1Ind)
                    author2Inds.append(author2Ind)

    logging.debug("Found " + str(len(authorIndexer.getIdDict())) + " coauthors")

    # Coauthor graph is undirected.
    # numpy.int was only an alias of the builtin int and has been removed
    # from recent NumPy releases, so the builtin is used directly.
    author1Inds = numpy.array(author1Inds, int)
    author2Inds = numpy.array(author2Inds, int)
    edges = numpy.c_[author1Inds, author2Inds]

    graph = igraph.Graph()
    graph.add_vertices(len(authorIndexer.getIdDict()))
    graph.add_edges(edges)
    graph.es["weight"] = numpy.ones(graph.ecount())
    # Merge parallel edges, summing their weights.
    graph.simplify(combine_edges=sum)
    graph.es["invWeight"] = 1.0/(numpy.array(graph.es["weight"]))

    return graph, authorIndexer
import matplotlib
# The backend is selected before pyplot is imported.
matplotlib.use("GTK3Agg")
import matplotlib.pyplot as plt
from sandbox.util.PathDefaults import PathDefaults
from sandbox.util.IdIndexer import IdIndexer
from sandbox.util.Latex import Latex
from apgl.graph.GraphUtils import GraphUtils

""" We try to figure out the change in L_i and L_{i+1} """

# NOTE(review): numpy and array are used below but not imported in this
# excerpt; presumably they are imported earlier in the file - confirm.
numpy.set_printoptions(suppress=True, precision=4)

# Directory holding the k-core data files.
dataDir = PathDefaults.getDataDir() + "kcore/"
indexer = IdIndexer()

# Integer buffers for the two endpoints of each edge (judging by the names).
node1Inds = array.array("i")
node2Inds = array.array("i")

# Result accumulators; how they are filled is not visible in this excerpt.
Ls = []
us = []

boundFro = []
bound2 = []
ks = []
eyes = []
deltas = []

# Iterates i = 1, ..., 8.
for i in range(1, 9):
    print(i)
def writeAuthorXMatrix(inputFileName, authorIndexerFilename, authorXFileName, reverse=False):
    """
    Convert a whitespace-separated score file into an author-by-document
    sparse matrix and save it, together with the pickled author indexer.

    Nothing is done (apart from a debug message) when ``authorXFileName``
    already exists. Otherwise every input line is split on whitespace: the
    first two fields are the author and document ids and the third is an
    integer score.

    :param inputFileName: path of the score file to read.
    :param authorIndexerFilename: path the author IdIndexer is pickled to.
    :param authorXFileName: path the matrix is written to with
        scipy.io.mmwrite.
    :param reverse: when True the first field is the document id and the
        second the author id, instead of the other way round.
    """
    if os.path.isfile(authorXFileName):
        logging.debug("File exists: " + authorXFileName)
        return

    authorIndex = IdIndexer()
    docIndex = IdIndexer()
    scores = array.array("i")

    # The context manager closes the input file, which was previously left
    # open.
    with open(inputFileName) as fileObj:
        for i, line in enumerate(fileObj):
            if i % 500000 == 0:
                logging.debug(i)
            vals = line.split()

            if reverse:
                authorIndex.append(vals[1])
                docIndex.append(vals[0])
            else:
                authorIndex.append(vals[0])
                docIndex.append(vals[1])

            scores.append(int(vals[2]))

    rowInds = numpy.array(authorIndex.getArray())
    colInds = numpy.array(docIndex.getArray())

    Y = scipy.sparse.csr_matrix((scores, (rowInds, colInds)))

    # Save the indexer so matrix rows can be mapped back to author ids.
    with open(authorIndexerFilename, "wb") as authorIndexerFile:
        pickle.dump(authorIndex, authorIndexerFile)

    scipy.io.mmwrite(authorXFileName, Y)
    logging.debug("Saved matrix to " + authorXFileName)
class IdIndexerTest(unittest.TestCase):
    """Unit tests for IdIndexer: appending ids and translating in both directions."""

    def setUp(self):
        """Fix numpy printing and seeding, then build the shared indexer."""
        numpy.set_printoptions(suppress=True, precision=3, linewidth=150)
        numpy.random.seed(21)

        # "james" is appended twice on purpose.
        self.indexer = IdIndexer()
        self.indexer.append("john")
        self.indexer.append("james")
        self.indexer.append("mark")
        self.indexer.append("james")

    def testAppend(self):
        """A repeated id reuses the index it was given the first time."""
        indexer = IdIndexer()
        indexer.append("john")
        indexer.append("james")
        indexer.append("mark")
        indexer.append("james")

        nptst.assert_array_equal(indexer.getArray(), numpy.array([0, 1, 2, 1]))

    def testTranslate(self):
        """translate maps a list of ids to their indices."""
        # assertEqual replaces the deprecated assertEquals alias, which no
        # longer exists in current Python releases.
        self.assertEqual(self.indexer.translate(["mark"]), [2])
        self.assertEqual(self.indexer.translate(["john"]), [0])
        self.assertEqual(self.indexer.translate(["john", "james"]), [0, 1])

    def testReverseTranslate(self):
        """reverseTranslate maps an index, or a list of indices, back to ids."""
        self.assertEqual(self.indexer.reverseTranslate(0), "john")
        self.assertEqual(self.indexer.reverseTranslate(1), "james")
        self.assertEqual(self.indexer.reverseTranslate(2), "mark")
        self.assertEqual(self.indexer.reverseTranslate([2, 1, 0]), ["mark", "james", "john"])

    def testReverseTranslateDict(self):
        """Re-appending the id stored under index i returns i again."""
        indDict = self.indexer.reverseTranslateDict()

        for i in range(len(self.indexer.getIdDict())):
            self.assertEqual(self.indexer.append(indDict[i]), i)