def flixster(minNnzRows=10, quantile=90):
    """
    Read the Flixster ratings file and return a binarised sparse user-movie
    matrix (entry 1 where rating > 3), pruned so every kept row/column has a
    minimum number of nonzeros.

    :param minNnzRows: minimum nonzeros per row after pruning
    :param minNnzCols: minimum nonzeros per column after pruning
    :param quantile: unused here; kept for interface compatibility
    :return: sppy.csarray of shape (numUsers, numMovies)
    """
    matrixFileName = PathDefaults.getDataDir() + "flixster/Ratings.timed.txt"

    userIndexer = IdIndexer("i")
    movieIndexer = IdIndexer("i")
    ratings = array.array("f")
    logging.debug("Loading ratings from " + matrixFileName)

    # "with" fixes the leaked file handle in the original
    with open(matrixFileName) as matrixFile:
        matrixFile.readline()  # skip header line

        for i, line in enumerate(matrixFile):
            if i % 1000000 == 0:
                logging.debug("Iteration: " + str(i))

            vals = line.split()
            userIndexer.append(vals[0])
            movieIndexer.append(vals[1])
            ratings.append(float(vals[2]))

    rowInds = userIndexer.getArray()
    colInds = movieIndexer.getArray()
    ratings = numpy.array(ratings)

    # numpy.int was an alias for the builtin int and was removed in NumPy>=1.24
    X = sppy.csarray((len(userIndexer.getIdDict()), len(movieIndexer.getIdDict())), storagetype="row", dtype=int)
    X.put(numpy.array(ratings > 3, int), numpy.array(rowInds, numpy.int32), numpy.array(colInds, numpy.int32), init=True)
    X.prune()

    X = SparseUtils.pruneMatrixRowAndCols(X, minNnzRows, minNnzCols)

    logging.debug("Read file: " + matrixFileName)
    logging.debug("Non zero elements: " + str(X.nnz) + " shape: " + str(X.shape))

    #X = Sampling.sampleUsers(X, 1000)
    return X
def coauthorsGraphFromAuthors2(self, relevantExperts, field):
    """
    Take a set of relevant authors and return the graph.

    :param relevantExperts: collection of author-name strings to keep
    :param field: field name used to locate the CSV data file
    :return: (igraph.Graph, IdIndexer) for the undirected coauthor graph
    """
    dataFileName = self.dataDir + "__" + field.replace(' ', '') + ".csv"

    authorIndexer = IdIndexer()
    author1Inds = array.array("i")
    author2Inds = array.array("i")
    articleDict = {}

    # Pre-register the experts so they occupy the first indices
    for relevantExpert in relevantExperts:
        authorIndexer.append(relevantExpert)

    # "with" guarantees the handle is closed even if a line raises
    with open(dataFileName) as dataFile:
        for i, line in enumerate(dataFile):
            try:
                fields = [x.strip() for x in line.split(";")]
                author = fields[1] + " " + fields[2]
                articleId = fields[4]
                # setdefault replaces the "not in articleDict.keys()" branch
                articleDict.setdefault(articleId, []).append(author)
            except IndexError:
                # Ignore bad lines
                pass

    # Emit one edge per coauthor pair on each article, restricted to experts
    for articleId in articleDict.keys():
        authors = articleDict[articleId]
        if len(authors) != 0:
            iterator = itertools.combinations(authors, 2)
            for author1, author2 in iterator:
                if author1 in relevantExperts and author2 in relevantExperts:
                    author1Ind = authorIndexer.append(author1)
                    author2Ind = authorIndexer.append(author2)
                    author1Inds.append(author1Ind)
                    author2Inds.append(author2Ind)

    logging.debug("Found " + str(len(authorIndexer.getIdDict())) + " coauthors")

    # Coauthor graph is undirected; numpy.int was removed in NumPy>=1.24
    author1Inds = numpy.array(author1Inds, int)
    author2Inds = numpy.array(author2Inds, int)
    edges = numpy.c_[author1Inds, author2Inds]

    graph = igraph.Graph()
    graph.add_vertices(len(authorIndexer.getIdDict()))
    graph.add_edges(edges)
    # simplify(combine_edges=sum) collapses parallel edges into weights
    graph.es["weight"] = numpy.ones(graph.ecount())
    graph.simplify(combine_edges=sum)
    graph.es["invWeight"] = 1.0/(numpy.array(graph.es["weight"]))

    return graph, authorIndexer
def epinions(minNnzRows=10, minNnzCols=3, quantile=90):
    """
    Read the Epinions rating matrix (MATLAB format) and return a binarised
    sparse user-item matrix (entry 1 where rating > 3), pruned so every kept
    row/column has a minimum number of nonzeros.

    :param minNnzRows: minimum nonzeros per row after pruning
    :param minNnzCols: minimum nonzeros per column after pruning
    :param quantile: unused here; kept for interface compatibility
    :return: sppy.csarray of shape (numUsers, numItems)
    """
    matrixFileName = PathDefaults.getDataDir() + "epinions/rating.mat"
    A = scipy.io.loadmat(matrixFileName)["rating"]

    userIndexer = IdIndexer("i")
    itemIndexer = IdIndexer("i")

    # Columns: 0 = user id, 1 = item id, 3 = rating value
    for i in range(A.shape[0]):
        userIndexer.append(A[i, 0])
        itemIndexer.append(A[i, 1])

    rowInds = userIndexer.getArray()
    colInds = itemIndexer.getArray()
    ratings = A[:, 3]

    # numpy.int was an alias for the builtin int and was removed in NumPy>=1.24
    X = sppy.csarray((len(userIndexer.getIdDict()), len(itemIndexer.getIdDict())), storagetype="row", dtype=int)
    X.put(numpy.array(ratings > 3, int), numpy.array(rowInds, numpy.int32), numpy.array(colInds, numpy.int32), init=True)
    X.prune()

    X = SparseUtils.pruneMatrixRowAndCols(X, minNnzRows, minNnzCols)

    logging.debug("Read file: " + matrixFileName)
    logging.debug("Non zero elements: " + str(X.nnz) + " shape: " + str(X.shape))

    return X
def __init__(self, dataFilename, split=0.8):
    """
    Read datasets from the specified files.

    :param dataFilename: whitespace-separated file with author, item, rating columns
    :param split: fraction of ratings assigned to the training set
    """
    printStep = 1000000
    authorIndexer = IdIndexer()
    itemIndexer = IdIndexer()
    ratings = array.array("i")

    # Read train files; "with" closes the handle even on a parse error
    with open(dataFilename) as dataFile:
        i = -1  # guards the line count below against an empty file (NameError)
        for i, line in enumerate(dataFile):
            if i % printStep == 0:
                logging.debug("Iteration: " + str(i))
            vals = line.split()
            authorIndexer.append(vals[0])
            itemIndexer.append(vals[1])
            ratings.append(int(vals[2]))

    logging.debug("Read file with " + str(i+1) + " lines")

    authorInds = numpy.array(authorIndexer.getArray())
    itemInds = numpy.array(itemIndexer.getArray())
    ratings = numpy.array(ratings)

    logging.debug("Number of authors: " + str(len(authorIndexer.getIdDict())))
    logging.debug("Number of items: " + str(len(itemIndexer.getIdDict())))
    logging.debug("Number of ratings: " + str(ratings.shape[0]))

    # Release the indexers before the large sparse-matrix allocations below
    del authorIndexer
    del itemIndexer
    gc.collect()

    shape = (numpy.max(authorInds)+1, numpy.max(itemInds)+1)

    # Random train/test split over rating indices
    inds = numpy.random.permutation(ratings.shape[0])
    trainInds = inds[0:int(inds.shape[0]*split)]
    trainX = scipy.sparse.csc_matrix((ratings[trainInds], (authorInds[trainInds], itemInds[trainInds])), shape=shape)

    testInds = inds[int(inds.shape[0]*split):]
    testX = scipy.sparse.csc_matrix((ratings[testInds], (authorInds[testInds], itemInds[testInds])), shape=shape)

    del authorInds, itemInds, ratings
    gc.collect()

    self.trainXList = [trainX]
    self.testXList = [testX]
def testAppend(self):
    """Appending repeated ids must reuse the index assigned on first sight."""
    indexer = IdIndexer()
    for name in ["john", "james", "mark", "james"]:
        indexer.append(name)
    nptst.assert_array_equal(indexer.getArray(), numpy.array([0, 1, 2, 1]))
def testAppend(self):
    """A repeated id gets the same index it was first assigned."""
    indexer = IdIndexer()
    names = ("john", "james", "mark", "james")
    for name in names:
        indexer.append(name)
    expected = numpy.array([0, 1, 2, 1])
    nptst.assert_array_equal(indexer.getArray(), expected)
def coauthorsGraphFromAuthors(self, relevantExperts):
    """
    Take a set of relevant authors and return the graph.

    :param relevantExperts: set of author-name strings to keep
    :return: (igraph.Graph, IdIndexer) for the undirected coauthor graph
    """
    authorIndexer = IdIndexer()
    author1Inds = array.array("i")
    author2Inds = array.array("i")

    # Pre-register the experts so they occupy the first indices
    for relevantExpert in relevantExperts:
        authorIndexer.append(relevantExpert)

    # "with" fixes the leaked file handle in the original
    with open(self.dataFilename) as dataFile:
        for i, line in enumerate(dataFile):
            Util.printIteration(i, self.stepSize, self.numLines)
            # Author lists appear on lines of the form "#@name1, name2, ..."
            authors = re.findall("#@(.*)", line)
            if len(authors) != 0:
                authors = set([x.strip() for x in authors[0].split(",")])
                if len(authors.intersection(relevantExperts)) != 0:
                    iterator = itertools.combinations(authors, 2)
                    for author1, author2 in iterator:
                        if author1 in relevantExperts and author2 in relevantExperts:
                            author1Ind = authorIndexer.append(author1)
                            author2Ind = authorIndexer.append(author2)
                            author1Inds.append(author1Ind)
                            author2Inds.append(author2Ind)

    logging.debug("Found " + str(len(authorIndexer.getIdDict())) + " coauthors")

    # Coauthor graph is undirected; numpy.int was removed in NumPy>=1.24
    author1Inds = numpy.array(author1Inds, int)
    author2Inds = numpy.array(author2Inds, int)
    edges = numpy.c_[author1Inds, author2Inds]

    graph = igraph.Graph()
    graph.add_vertices(len(authorIndexer.getIdDict()))
    graph.add_edges(edges)
    # simplify(combine_edges=sum) collapses parallel edges into weights
    graph.es["weight"] = numpy.ones(graph.ecount())
    graph.simplify(combine_edges=sum)
    graph.es["invWeight"] = 1.0/(numpy.array(graph.es["weight"]))

    return graph, authorIndexer
def bookCrossing(minNnzRows=10, minNnzCols=3, quantile=90):
    """
    Read the Book-Crossing ratings CSV and return a binarised sparse
    user-item matrix (entry 1 where rating > 4 or rating == 0, i.e. an
    implicit interaction), pruned so every kept row/column has a minimum
    number of nonzeros.

    :param minNnzRows: minimum nonzeros per row after pruning
    :param minNnzCols: minimum nonzeros per column after pruning
    :param quantile: unused here; kept for interface compatibility
    :return: sppy.csarray of shape (numUsers, numItems)
    """
    matrixFileName = PathDefaults.getDataDir() + "book-crossing/BX-Book-Ratings.csv"

    userIndexer = IdIndexer("i")
    itemIndexer = IdIndexer("i")
    ratings = array.array("f")
    logging.debug("Loading ratings from " + matrixFileName)

    # "with" fixes the leaked file handle in the original
    with open(matrixFileName) as matrixFile:
        matrixFile.readline()  # skip header line

        for i, line in enumerate(matrixFile):
            if i % 1000000 == 0:
                logging.debug("Iteration: " + str(i))

            # Fields are semicolon-separated and quoted: "user";"isbn";"rating"
            vals = line.split(";")
            field1 = vals[0].strip("\"")
            field2 = vals[1].strip("\"")
            field3 = int(vals[2].strip("\"\n\r"))

            userIndexer.append(field1)
            itemIndexer.append(field2)
            ratings.append(field3)

    rowInds = userIndexer.getArray()
    colInds = itemIndexer.getArray()
    ratings = numpy.array(ratings)

    # numpy.int was an alias for the builtin int and was removed in NumPy>=1.24
    X = sppy.csarray((len(userIndexer.getIdDict()), len(itemIndexer.getIdDict())), storagetype="row", dtype=int)
    X.put(numpy.array(numpy.logical_or(ratings > 4, ratings == 0), int), numpy.array(rowInds, numpy.int32), numpy.array(colInds, numpy.int32), init=True)
    X.prune()

    X = SparseUtils.pruneMatrixRowAndCols(X, minNnzRows, minNnzCols)

    logging.debug("Read file: " + matrixFileName)
    logging.debug("Non zero elements: " + str(X.nnz) + " shape: " + str(X.shape))

    return X
class IdIndexerTest(unittest.TestCase):
    """Unit tests for IdIndexer append/translate behaviour."""

    def setUp(self):
        numpy.set_printoptions(suppress=True, precision=3, linewidth=150)
        numpy.random.seed(21)

        # Shared fixture: john -> 0, james -> 1, mark -> 2
        self.indexer = IdIndexer()
        self.indexer.append("john")
        self.indexer.append("james")
        self.indexer.append("mark")
        self.indexer.append("james")

    def testAppend(self):
        indexer = IdIndexer()
        indexer.append("john")
        indexer.append("james")
        indexer.append("mark")
        indexer.append("james")

        nptst.assert_array_equal(indexer.getArray(), numpy.array([0, 1, 2, 1]))

    def testTranslate(self):
        # assertEquals is a deprecated alias removed in Python 3.12
        self.assertEqual(self.indexer.translate(["mark"]), [2])
        self.assertEqual(self.indexer.translate(["john"]), [0])
        self.assertEqual(self.indexer.translate(["john", "james"]), [0, 1])

    def testReverseTranslate(self):
        self.assertEqual(self.indexer.reverseTranslate(0), "john")
        self.assertEqual(self.indexer.reverseTranslate(1), "james")
        self.assertEqual(self.indexer.reverseTranslate(2), "mark")

        self.assertEqual(self.indexer.reverseTranslate([2, 1, 0]), ["mark", "james", "john"])

    def testReverseTranslateDict(self):
        indDict = self.indexer.reverseTranslateDict()

        # Re-appending a known id must return its existing index
        for i in range(len(self.indexer.getIdDict())):
            self.assertEqual(self.indexer.append(indDict[i]), i)
class IdIndexerTest(unittest.TestCase):
    """Unit tests for IdIndexer append/translate behaviour."""

    def setUp(self):
        numpy.set_printoptions(suppress=True, precision=3, linewidth=150)
        numpy.random.seed(21)

        # Shared fixture: john -> 0, james -> 1, mark -> 2
        self.indexer = IdIndexer()
        self.indexer.append("john")
        self.indexer.append("james")
        self.indexer.append("mark")
        self.indexer.append("james")

    def testAppend(self):
        indexer = IdIndexer()
        indexer.append("john")
        indexer.append("james")
        indexer.append("mark")
        indexer.append("james")

        nptst.assert_array_equal(indexer.getArray(), numpy.array([0, 1, 2, 1]))

    def testTranslate(self):
        # assertEquals is a deprecated alias removed in Python 3.12
        self.assertEqual(self.indexer.translate(["mark"]), [2])
        self.assertEqual(self.indexer.translate(["john"]), [0])
        self.assertEqual(self.indexer.translate(["john", "james"]), [0, 1])

    def testReverseTranslate(self):
        self.assertEqual(self.indexer.reverseTranslate(0), "john")
        self.assertEqual(self.indexer.reverseTranslate(1), "james")
        self.assertEqual(self.indexer.reverseTranslate(2), "mark")

        self.assertEqual(self.indexer.reverseTranslate([2, 1, 0]), ["mark", "james", "john"])

    def testReverseTranslateDict(self):
        indDict = self.indexer.reverseTranslateDict()

        # Re-appending a known id must return its existing index
        for i in range(len(self.indexer.getIdDict())):
            self.assertEqual(self.indexer.append(indDict[i]), i)
def writeAuthorXMatrix(inputFileName, authorIndexerFilename, authorXFileName, reverse=False):
    """
    Build a sparse author x document score matrix from a whitespace-separated
    input file, pickle the author indexer, and write the matrix in MatrixMarket
    format. Skips all work if the output matrix file already exists.

    :param inputFileName: file with lines "author doc score" (or "doc author score")
    :param authorIndexerFilename: path for the pickled IdIndexer
    :param authorXFileName: path for the MatrixMarket output
    :param reverse: if True, treat column 1 as the author and column 0 as the doc
    """
    if not os.path.isfile(authorXFileName):
        authorIndex = IdIndexer()
        docIndex = IdIndexer()
        scores = array.array("i")

        # "with" fixes the leaked input handle in the original
        with open(inputFileName) as fileObj:
            for i, line in enumerate(fileObj):
                if i % 500000 == 0:
                    logging.debug(i)

                vals = line.split()

                if reverse:
                    authorIndex.append(vals[1])
                    docIndex.append(vals[0])
                else:
                    authorIndex.append(vals[0])
                    docIndex.append(vals[1])

                # single int() conversion (original converted twice)
                scores.append(int(vals[2]))

        rowInds = numpy.array(authorIndex.getArray())
        colInds = numpy.array(docIndex.getArray())

        Y = scipy.sparse.csr_matrix((scores, (rowInds, colInds)))

        with open(authorIndexerFilename, "wb") as authorIndexerFile:
            pickle.dump(authorIndex, authorIndexerFile)

        scipy.io.mmwrite(authorXFileName, Y)
        logging.debug("Saved matrix to " + authorXFileName)
    else:
        logging.debug("File exists: " + authorXFileName)
# NOTE(review): fragment — "i", "dataDir", "indexer" and "Ls" are defined
# outside this view; presumably this is the body of a loop over k-core indices.
print(i)
networkFilename = dataDir + "network_1_kcores/network_1-core" + str("%02d" % (i,)) + ".txt"
networkFile = open(networkFilename)
# Skip the four header lines of the network file
networkFile.readline()
networkFile.readline()
networkFile.readline()
networkFile.readline()

node1Inds = array.array("i")
node2Inds = array.array("i")

# Each remaining line is an edge: two whitespace-separated node ids,
# mapped to contiguous integer indices by the shared indexer
for line in networkFile:
    vals = line.split()
    node1Inds.append(indexer.append(vals[0]))
    node2Inds.append(indexer.append(vals[1]))

node1Inds = numpy.array(node1Inds)
node2Inds = numpy.array(node2Inds)

m = len(indexer.getIdDict())
# Dense adjacency over all ids seen so far; (A+A.T)/2 symmetrises it
# (edges listed in only one direction end up with weight 0.5)
A = numpy.zeros((m, m))
A[node1Inds, node2Inds] = 1
A = (A+A.T)/2
A = scipy.sparse.csr_matrix(A)

# Accumulate the symmetric normalised Laplacian for this core
L = GraphUtils.normalisedLaplacianSym(A)
Ls.append(L)