def readAuthorsAndDocuments(self, useAbstract=True):
    """
    Parse self.dataFilename into parallel lists of authors, documents and
    citation counts, with one entry per record.

    Records are terminated by a blank line. Within a record, a line
    containing ``#*`` supplies the title, ``#@`` a comma separated author
    list, ``#!`` the abstract and ``#citation`` the citation count. A record
    is only emitted when its terminating blank line is read, so a trailing
    record that is not followed by a blank line is not returned. Lines
    containing ``#conf`` (the venue) are ignored because the venue was never
    part of the returned data.

    :param useAbstract: If True each document is the title, a space and the abstract; otherwise it is the title only.
    :type useAbstract: :class:`bool`

    :returns: A tuple (authorList, documentList, citationList). Each author entry is a set of stripped, non-empty names, or an empty list if the record had no author line.
    :raises ValueError: if the text after ``#citation`` cannot be converted with int().
    """
    logging.debug("About to read file " + self.dataFilename)

    authorList = []
    citationList = []
    documentList = []

    lastAbstract = ""
    lastTitle = ""
    lastAuthors = []
    lastCitationNo = 0

    # The with-block guarantees the file is closed even if parsing raises
    with open(self.dataFilename) as inFile:
        for i, line in enumerate(inFile):
            Util.printIteration(i, self.stepSize, self.numLines)

            if line == "\n":
                # A blank line ends the current record: emit it and reset
                if useAbstract:
                    document = lastTitle + " " + lastAbstract
                else:
                    document = lastTitle

                documentList.append(document)
                authorList.append(lastAuthors)
                citationList.append(lastCitationNo)

                lastAbstract = ""
                lastTitle = ""
                lastAuthors = []
                lastCitationNo = 0
                continue

            # Match the fields in the file (raw strings avoid invalid escapes)
            title = re.findall(r"#\*(.*)", line)
            currentAuthors = re.findall(r"#@(.*)", line)
            abstract = re.findall(r"#!(.*)", line)
            citationNo = re.findall(r"#citation(.*)", line)

            if len(title) != 0 and len(title[0]) != 0:
                lastTitle = title[0]

            if len(abstract) != 0 and len(abstract[0]) != 0:
                lastAbstract = abstract[0]

            if len(citationNo) != 0 and len(citationNo[0]) != 0:
                lastCitationNo = int(citationNo[0])

            if len(currentAuthors) != 0:
                names = set([x.strip() for x in currentAuthors[0].split(",")])
                lastAuthors = names.difference(set([""]))

    logging.debug("Finished reading " + str(len(documentList)) + " articles")
    return authorList, documentList, citationList
def setVertices(self, vertices):
    """
    Abstract method: replace all vertices with the supplied collection.

    This base version only delegates to Util.abstract(); concrete classes
    are expected to provide the real behaviour.

    :param vertices: the new vertices, of the same shape as this object.
    """
    Util.abstract()
def getAllEdges(self):
    """
    Abstract method: list every edge of the graph, one edge per row.

    This base version only delegates to Util.abstract().

    :returns: A numpy array containing all edges in this graph.
    """
    Util.abstract()
def saveStats(args):
    """
    Run one simulation and pickle its statistics, unless the results file
    for this index can already be opened.

    The output is written to outputDir + "SimStats" + str(i) + ".pkl" and
    holds the tuple (times, vertexArray, removedGraphStats,
    graphMetrics.dists, graphMetrics.graphDists, graphMetrics.labelDists).
    The names outputDir, targetGraph, thetaArray and M are read from the
    enclosing module scope.

    :param args: A tuple (i, theta, startDate, endDate, recordStep), packed into one argument.
    """
    i, theta, startDate, endDate, recordStep = args
    # NOTE(review): theta is unpacked but the simulation below uses
    # thetaArray[i]; presumably these are the same values - confirm.

    resultsFileName = outputDir + "SimStats" + str(i) + ".pkl"

    try:
        with open(resultsFileName):
            # Results already exist, nothing to compute
            return
    except IOError:
        # File is missing or unreadable: fall through and compute it. The
        # work is done outside the handler so that a failure in the
        # simulation is not reported as occurring while handling IOError.
        pass

    # Select every vertex feature except the four excluded indices. The
    # builtin bool replaces the numpy.bool alias, which was removed from NumPy.
    featureInds = numpy.ones(targetGraph.vlist.getNumFeatures(), bool)
    featureInds[HIVVertices.dobIndex] = False
    featureInds[HIVVertices.infectionTimeIndex] = False
    featureInds[HIVVertices.hiddenDegreeIndex] = False
    featureInds[HIVVertices.stateIndex] = False
    featureInds = numpy.arange(featureInds.shape[0])[featureInds]

    matcher = GraphMatch("PATH", alpha=0.5, featureInds=featureInds, useWeightM=False)
    graphMetrics = HIVGraphMetrics2(targetGraph, 1.0, matcher, float(endDate))

    times, infectedIndices, removedIndices, graph = HIVModelUtils.simulate(thetaArray[i], startDate, endDate, recordStep, M, graphMetrics)
    times, vertexArray, removedGraphStats = HIVModelUtils.generateStatistics(graph, startDate, endDate, recordStep)

    stats = times, vertexArray, removedGraphStats, graphMetrics.dists, graphMetrics.graphDists, graphMetrics.labelDists
    Util.savePickle(stats, resultsFileName)
def predictEdges(self, vertexIndices):
    r"""
    Score candidate edges for each given vertex using its common neighbours.

    For a query vertex x and a candidate y the score is

        \sum_{z \in n(x) \cap n(y)} 1 / \log |n(z)|

    where n(v) is the set of column indices with a non-zero entry in row v
    of the weight matrix. Neighbours z for which \log |n(z)| is zero are
    skipped. Each score vector is passed to self.indicesFromScores, whose
    two results fill the matching rows of the returned matrices.

    :param vertexIndices: an array of query vertex indices.
    :type vertexIndices: :class:`ndarray`

    :returns: A tuple (P, S) of arrays of shape (len(vertexIndices), self.windowSize).
    """
    numVertices = self.graph.getNumVertices()
    Parameter.checkInt(self.windowSize, 1, numVertices)
    logging.info("Running predictEdges in " + str(self.__class__.__name__))

    numQueries = vertexIndices.shape[0]
    P = numpy.zeros((numQueries, self.windowSize))
    S = numpy.zeros((numQueries, self.windowSize))
    W = self.graph.getWeightMatrix()

    for i in range(numQueries):
        Util.printIteration(i, self.printStep, numQueries)
        scores = numpy.zeros(numVertices)
        # The query row does not change while scanning the candidates
        queryRow = W[vertexIndices[i], :]

        for j in range(numVertices):
            commonNeighbours = numpy.nonzero(queryRow * W[j, :])[0]

            for k in commonNeighbours:
                q = numpy.log(numpy.nonzero(W[k, :])[0].shape[0])
                # log(1) == 0 would divide by zero, so such neighbours are ignored
                if q != 0:
                    scores[j] = scores[j] + 1/q

        P[i, :], S[i, :] = self.indicesFromScores(vertexIndices[i], scores)

    return P, S
def complement(self):
    """
    Abstract method: build the complement graph.

    The intended result shares the vertices of this graph (same reference)
    and has an edge of weight 1 wherever this graph has no edge. This base
    version only delegates to Util.abstract().
    """
    Util.abstract()
def __updateEigenSystem(self, lmbda, Q, deltaW, W):
    """
    Apply the weight changes in deltaW one entry at a time, updating the
    eigen-decomposition after each.

    Only non-zero entries deltaW[i, j] with i < j are processed. For each,
    self.incrementEigenSystem is called with the current W, and then the
    change is added to both W[i, j] and W[j, i], so W is modified in place.

    :param lmbda: the current eigenvalues.
    :param Q: the current eigenvectors.
    :param deltaW: matrix of weight changes relative to W.
    :param W: the current weight matrix (updated in place).

    :returns: The updated (lmbda, Q).
    """
    rowInds, colInds = deltaW.nonzero()
    numChanges = rowInds.shape[0]

    for s in range(numChanges):
        Util.printIteration(s, 10, numChanges)
        i, j = rowInds[s], colInds[s]

        # Each symmetric pair is handled once, via its i < j entry
        if i < j:
            assert deltaW[i, j] != 0
            change = deltaW[i, j]

            # W must still hold the pre-change weights for this call
            lmbda, Q = self.incrementEigenSystem(lmbda, Q, W, i, j, change)
            W[i, j] += change
            W[j, i] += change

    return lmbda, Q
def getVertices(self, vertexIndices):
    """
    Abstract method: fetch the vertices at the requested indices.

    This base version only delegates to Util.abstract().

    :param vertexIndices: a list of vertex indices.

    :returns: A list of the vertices specified by vertexIndices.
    """
    Util.abstract()
def saveResult(self, X, Y, learner, fileName):
    """
    Compute and pickle the outer cross validation results for one learner,
    skipping the work if the results file or its lock file already exists.

    The lock file is named after fileName with the text following the last
    "." replaced by "lock". It is created before the computation and deleted
    after the results are saved; if the computation raises, the error is
    logged and re-raised and the lock file is left in place.

    :param X: the examples passed to learner.evaluateCvOuter.
    :param Y: the labels passed to learner.evaluateCvOuter.
    :param learner: an object providing evaluateCvOuter(X, Y, folds).
    :param fileName: the file in which the results dict is pickled.
    """
    lockFileName = fileName.rpartition(".")[0] + ".lock"
    gc.collect()

    if os.path.isfile(fileName) or os.path.isfile(lockFileName):
        logging.debug("File exists, or is locked: " + fileName)
        return

    try:
        # Creating the (empty) lock file marks this result as in progress
        with open(lockFileName, 'w'):
            pass
        logging.debug("Created lock file " + lockFileName)

        logging.debug("Computing file " + fileName)
        logging.debug(learner)
        bestParams, allMetrics, bestMetaDicts = learner.evaluateCvOuter(X, Y, self.folds)
        cvResults = {"bestParams": bestParams, "allMetrics": allMetrics, "metaDicts": bestMetaDicts}
        Util.savePickle(cvResults, fileName)

        os.remove(lockFileName)
        logging.debug("Deleted lock file " + lockFileName)
    except:
        logging.debug("Caught an error in the code ... skipping")
        raise
def cleanXML(self):
    """
    Write a cleaned copy of self.xmlFileName to self.xmlCleanFilename, unless
    the cleaned file already exists.

    Each line has its HTML entities unescaped, after which every ampersand is
    re-escaped so the output remains well formed XML. Any <title> or <ee>
    element whose text contains an angle bracket is replaced by a default
    placeholder element.
    """
    if os.path.exists(self.xmlCleanFilename):
        logging.debug("File already generated: " + self.xmlCleanFilename)
        return

    logging.debug("Cleaning XML")
    h = HTMLParser.HTMLParser()

    # Both files are closed by the with-block even if a line fails to parse
    with open(self.xmlFileName) as inFile, open(self.xmlCleanFilename, "w") as outFile:
        for i, line in enumerate(inFile):
            Util.printIteration(i, self.stepSize, self.numLines)

            # Replacing "&" with itself was a no-op; the replacement must be
            # the five character XML entity for an ampersand.
            outLine = h.unescape(line).replace("&", "&amp;")
            outLine = re.sub(r"<title>.*[\<\>].*</title>", "<title>Default Title</title>", outLine)
            outLine = re.sub(r"<ee>.*[\<\>].*</ee>", "<ee>Default text</ee>", outLine)
            outFile.write(outLine)

    logging.debug("All done")
def sequenceVectorStats(self, graph, subgraphIndices, treeStats=False, eigenStats=True):
    """
    Compute vector statistics for a sequence of subgraphs of one graph.

    :param graph: the graph from which the subgraphs are taken.
    :type graph: :class:`AbstractMatrixGraph`

    :param subgraphIndices: a list in which each element is a list of vertex indices defining one subgraph.

    :param treeStats: passed through to self.vectorStatistics.
    :type treeStats: :class:`bool`

    :param eigenStats: passed through to self.vectorStatistics.

    :returns: A list holding the result of self.vectorStatistics for each subgraph, in order.
    """
    Parameter.checkClass(graph, AbstractMatrixGraph)
    for vertexInds in subgraphIndices:
        Parameter.checkList(vertexInds, Parameter.checkInt, [0, graph.getNumVertices()])
    Parameter.checkBoolean(treeStats)

    numGraphs = len(subgraphIndices)
    statsDictList = []

    for i, vertexInds in enumerate(subgraphIndices):
        Util.printIteration(i, self.vectorPrintStep, numGraphs)
        subgraph = graph.subgraph(vertexInds)
        statsDictList.append(self.vectorStatistics(subgraph, treeStats, eigenStats))

    return statsDictList
def simulateModel(theta):
    """
    Simulate the epidemic model with the given parameters and return its
    objective value.

    The simulation starts from the subgraph of targetGraph given by
    targetGraph.removedIndsAt(startDate), padded with M - graph.size extra
    vertices. The names targetGraph, startDate, endDate, M, alpha, zeroVal,
    matchAlg, matchAlpha, breakSize and numRecordSteps are read from the
    enclosing scope.

    :param theta: the model parameters, passed to model.setParams.

    :returns: The value of model.objective() after the simulation.
    """
    logging.debug("theta=" + str(theta))

    # We start with the observed graph at the start date
    graph = targetGraph.subgraph(targetGraph.removedIndsAt(startDate))
    graph.addVertices(M-graph.size)

    p = Util.powerLawProbs(alpha, zeroVal)
    hiddenDegSeq = Util.randomChoice(p, graph.getNumVertices())

    # Select every vertex feature except the four excluded indices. The
    # builtin bool replaces the numpy.bool alias, which was removed from NumPy.
    featureInds = numpy.ones(graph.vlist.getNumFeatures(), bool)
    featureInds[HIVVertices.dobIndex] = False
    featureInds[HIVVertices.infectionTimeIndex] = False
    featureInds[HIVVertices.hiddenDegreeIndex] = False
    featureInds[HIVVertices.stateIndex] = False
    featureInds = numpy.arange(featureInds.shape[0])[featureInds]

    matcher = GraphMatch(matchAlg, alpha=matchAlpha, featureInds=featureInds, useWeightM=False)
    graphMetrics = HIVGraphMetrics2(targetGraph, breakSize, matcher, float(endDate))

    recordStep = (endDate-startDate)/float(numRecordSteps)
    rates = HIVRates(graph, hiddenDegSeq)
    model = HIVEpidemicModel(graph, rates, T=float(endDate), T0=float(startDate), metrics=graphMetrics)
    model.setRecordStep(recordStep)
    model.setParams(theta)

    model.simulate()
    objective = model.objective()
    return objective
def sequenceScalarStats(self, graph, subgraphIndices, slowStats=True, treeStats=False):
    """
    Compute scalar statistics for a sequence of subgraphs of one graph.

    :param graph: the graph from which the subgraphs are taken.
    :type graph: :class:`AbstractMatrixGraph`

    :param subgraphIndices: a list in which each element is a list of vertex indices defining one subgraph.

    :param slowStats: passed through to self.scalarStatistics.
    :type slowStats: :class:`bool`

    :param treeStats: passed through to self.scalarStatistics.
    :type treeStats: :class:`bool`

    :returns: An array of shape (len(subgraphIndices), self.numStats) whose ith row holds the statistics of the ith subgraph.
    """
    Parameter.checkClass(graph, AbstractMatrixGraph)
    for vertexInds in subgraphIndices:
        Parameter.checkList(vertexInds, Parameter.checkInt, [0, graph.getNumVertices()])
    Parameter.checkBoolean(slowStats)
    Parameter.checkBoolean(treeStats)

    numGraphs = len(subgraphIndices)
    statsMatrix = numpy.zeros((numGraphs, self.numStats))

    for i, vertexInds in enumerate(subgraphIndices):
        Util.printIteration(i, self.printStep, numGraphs)
        subgraph = graph.subgraph(vertexInds)
        statsMatrix[i, :] = self.scalarStatistics(subgraph, slowStats, treeStats)

    return statsMatrix
def getWeightMatrix(self):
    """
    Abstract method: give the weight matrix of this graph as a numpy array.

    This base version only delegates to Util.abstract().

    :returns: The weight matrix of this graph.
    """
    Util.abstract()
def processRatings(self):
    """
    Convert the per-movie rating files into arrays and save them for faster
    access, unless both output files already exist.

    For each movie id from self.startMovieID to self.endMovieID the file
    "mv_<7 digit id>.txt" under the "netflix/training_set/" data directory
    is read. Its first line is skipped and every other line is parsed as
    "customerId,rating,YYYY-MM-DD". Customer ids are mapped to consecutive
    indices in order of first appearance and movie indices are the movie id
    minus one.

    The movie indices, customer indices, ratings and dates (as integer
    timestamps from time.mktime) are saved with numpy.savez to
    self.ratingFileName, and the customer id to index dict is pickled to
    self.custDictFileName.
    """
    if os.path.exists(self.ratingFileName) and os.path.exists(self.custDictFileName):
        logging.debug("Ratings file " + str(self.ratingFileName) + " already processed")
        return

    dataDir = PathDefaults.getDataDir() + "netflix/training_set/"

    logging.debug("Processing ratings given in " + dataDir)

    custIdDict = {}

    # Typed arrays keep the memory footprint low while accumulating
    movieIds = array.array("I")
    custIds = array.array("I")
    ratings = array.array("B")
    dates = array.array("L")

    for i in range(self.startMovieID, self.endMovieID+1):
        Util.printIteration(i-1, 1, self.endMovieID-1)

        # Close each movie file as soon as it is read; there is one per movie
        with open(dataDir + "mv_" + str(i).zfill(7) + ".txt") as ratingsFile:
            ratingsFile.readline()

            for line in ratingsFile:
                vals = line.split(",")

                custId = int(vals[0])
                custInd = custIdDict.get(custId)
                if custInd is None:
                    # New customers receive the next consecutive index
                    custInd = len(custIdDict)
                    custIdDict[custId] = custInd

                rating = int(vals[1])
                t = datetime.strptime(vals[2].strip(), "%Y-%m-%d")

                movieIds.append(i-1)
                custIds.append(custInd)
                ratings.append(rating)
                dates.append(int(time.mktime(t.timetuple())))

    movieIds = numpy.array(movieIds, numpy.uint32)
    custIds = numpy.array(custIds, numpy.uint32)
    ratings = numpy.array(ratings, numpy.uint8)
    dates = numpy.array(dates, numpy.uint32)

    assert ratings.shape[0] == self.numRatings

    numpy.savez(self.ratingFileName, movieIds, custIds, ratings, dates)
    logging.debug("Saved ratings file as " + self.ratingFileName)

    with open(self.custDictFileName, 'wb') as custDictFile:
        pickle.dump(custIdDict, custDictFile)
    logging.debug("Saved custIdDict as " + self.custDictFileName)
def load(filename):
    """
    Abstract method: restore an object of this kind from a file.

    This base version only delegates to Util.abstract().

    :param filename: The name of the file to load.
    :type filename: :class:`str`
    """
    Util.abstract()
def getVertex(self, index):
    """
    Abstract method: look up the value stored for a single vertex.

    This base version only delegates to Util.abstract().

    :param index: the index of the vertex.
    :type index: :class:`int`

    :returns: The value of the vertex.
    """
    Util.abstract()
def clearVertex(self, index):
    """
    Abstract method: reset a single vertex to None.

    This base version only delegates to Util.abstract().

    :param index: the index of the vertex to clear.
    :type index: :class:`int`
    """
    Util.abstract()
def save(self, filename):
    """
    Abstract method: write this object to a file named filename.nvl.

    This base version only delegates to Util.abstract().

    :param filename: The name of the file to save.
    :type filename: :class:`str`
    """
    Util.abstract()
def getAllVertexIds(self):
    """
    Abstract method: list the indices of every vertex in the graph.

    This base version only delegates to Util.abstract().

    :returns: A numpy array of all the vertex indices in this graph.
    """
    Util.abstract()
def removeEdge(self, vertexIndex1, vertexIndex2, edgeTypeIndex):
    """
    Abstract method: delete the edge joining two vertices.

    This base version only delegates to Util.abstract().

    @param vertexIndex1: The index of the first vertex.
    @param vertexIndex2: The index of the second vertex.
    @param edgeTypeIndex: Presumably selects which type of edge to remove; its meaning is not visible here - confirm against implementations.
    """
    Util.abstract()
def predict(self, X):
    """
    Abstract method: predict an output for every example (row) of X.

    This base version only delegates to Util.abstract().

    :param X: A matrix with examples as rows
    :type X: :class:`ndarray`
    """
    Util.abstract()
def testExpandIntArray(self):
    """
    Check Util.expandIntArray on a small count vector and on an empty one.

    The expected output repeats each index i as many times as the ith input
    value. The builtin int replaces the numpy.int alias, which was removed
    from NumPy.
    """
    v = numpy.array([1, 3, 2, 4], int)
    w = Util.expandIntArray(v)

    self.assertTrue((w == numpy.array([0,1,1,1,2,2,3,3,3,3], int)).all())

    v = numpy.array([], int)
    w = Util.expandIntArray(v)
    self.assertTrue((w == numpy.array([], int)).all())
def evaluateLearn(X, y, idx, learnModel, predict, metricMethod, progress=True):
    """
    Train and score a learning algorithm on each of a list of train/test
    splits.

    For every (trainIndices, testIndices) pair in idx, learnModel is called
    on the training rows, predict on the test rows, and metricMethod on the
    predicted and true test labels.

    :param X: A matrix with examples as rows
    :type X: :class:`ndarray`

    :param y: A vector of labels
    :type y: :class:`ndarray`

    :param idx: A list of training/test splits
    :type idx: :class:`list`

    :param learnModel: A function such that learnModel(X, y) finds a mapping from X to y
    :type learnModel: :class:`function`

    :param predict: A function such that predict(X) makes predictions for X
    :type predict: :class:`function`

    :param metricMethod: A function such that metricMethod(predY, testY) returns the quality of predicted labels predY
    :type metricMethod: :class:`function`

    :param progress: If True, report progress through Util.printConciseIteration.

    :returns: An array with one metric value per split.
    :raises ValueError: if y is not one dimensional.
    """
    Parameter.checkClass(X, numpy.ndarray)
    Parameter.checkArray(X, softCheck=True)
    Parameter.checkInt(X.shape[0], 1, float('inf'))
    Parameter.checkClass(y, numpy.ndarray)
    Parameter.checkArray(y, softCheck=True)

    if y.ndim != 1:
        raise ValueError("Dimention of y must be 1")

    numSplits = len(idx)
    metrics = numpy.zeros(numSplits)
    logging.debug("EvaluateLearn: Using " + str(numSplits) + " splits on " + str(X.shape[0]) + " examples")

    for i, (idxtr, idxts) in enumerate(idx):
        if progress:
            Util.printConciseIteration(i, 1, numSplits)

        trainX, testX = X[idxtr, :], X[idxts, :]
        trainY, testY = y[idxtr], y[idxts]

        learnModel(trainX, trainY)
        predY = predict(testX)
        gc.collect()

        metrics[i] = metricMethod(predY, testY)

    return metrics
def eigenAdd(omega, Q, Y, k):
    """
    Perform an eigen update of the form A^*A + Y^*Y in which Y is a low-rank
    matrix and A^*A = Q Omega Q^*. We use the rank-k approximation of A:
    Q_k Omega_k Q_k^* and then approximate [A^*A_k Y^*Y]_k.

    Eigenpairs whose eigenvalue magnitude does not exceed EigenUpdater.tol
    are discarded before the update, and the remaining ones are ordered by
    decreasing magnitude.

    :param omega: a 1-d array of eigenvalues.
    :param Q: the matrix whose columns are the corresponding eigenvectors.
    :param Y: the low-rank matrix to add.
    :param k: the number of eigenpairs to return.

    :returns: A tuple (pi, V) of the k eigenvalues of largest magnitude and the corresponding eigenvectors.
    :raises ValueError: if omega is not 1-d or its length differs from the number of columns of Q.
    """
    Parameter.checkInt(k, 0, omega.shape[0])
    if omega.ndim != 1:
        raise ValueError("omega must be 1-d array")
    if omega.shape[0] != Q.shape[1]:
        raise ValueError("Must have same number of eigenvalues and eigenvectors")

    if __debug__:
        Parameter.checkOrthogonal(Q, tol=EigenUpdater.tol, softCheck=True, arrayInfo="input Q in eigenAdd()")

    # Taking the abs of the eigenvalues is correct
    inds = numpy.flipud(numpy.argsort(numpy.abs(omega)))
    # The tolerance mask must follow the sorted order of inds. Masking with
    # the unsorted abs(omega) selected the wrong eigenpairs whenever omega
    # was not already ordered by decreasing magnitude.
    inds = inds[numpy.abs(omega[inds]) > EigenUpdater.tol]
    omega, Q = Util.indEig(omega, Q, inds)
    Omega = numpy.diag(omega)

    YY = Y.conj().T.dot(Y)
    QQ = Q.dot(Q.conj().T)
    # Part of Y lying outside the span of the retained eigenvectors
    Ybar = Y - Y.dot(QQ)

    Pbar, sigmaBar, Qbar = numpy.linalg.svd(Ybar, full_matrices=False)
    inds = numpy.flipud(numpy.argsort(numpy.abs(sigmaBar)))
    inds = inds[numpy.abs(sigmaBar[inds]) > EigenUpdater.tol]
    Pbar, sigmaBar, Qbar = Util.indSvd(Pbar, sigmaBar, Qbar, inds)

    SigmaBar = numpy.diag(sigmaBar)
    # Recompute the right singular vectors from Ybar and normalise each column
    Qbar = Ybar.T.dot(Pbar)
    Qbar = Qbar.dot(numpy.diag(numpy.diag(Qbar.T.dot(Qbar))**-0.5))

    r = sigmaBar.shape[0]

    YQ = Y.dot(Q)
    Zeros = numpy.zeros((r, omega.shape[0]))
    D = numpy.c_[Q, Qbar]

    YYQQ = YY.dot(QQ)
    Z = D.conj().T.dot(YYQQ + YYQQ.conj().T).dot(D)
    F = numpy.c_[numpy.r_[Omega - YQ.conj().T.dot(YQ), Zeros], numpy.r_[Zeros.T, SigmaBar.conj().dot(SigmaBar)]]
    F = F + Z

    pi, H = scipy.linalg.eigh(F)
    inds = numpy.flipud(numpy.argsort(numpy.abs(pi)))

    H = H[:, inds[0:k]]
    pi = pi[inds[0:k]]

    V = D.dot(H)
    return pi, V
def eigpsd(X, n):
    """
    Find the eigenvalues and eigenvectors of a positive semi-definite
    symmetric matrix, using a subset of its columns. The input matrix X can
    be a numpy array or a scipy sparse matrix. When the chosen indices cover
    every column, X is converted to an ndarray if sparse and decomposed
    directly with Util.safeEigh.

    :param X: The matrix to find the eigenvalues of.
    :type X: :class:`ndarray`

    :param n: If n is an int, then it is the number of columns to sample at random, otherwise n is an array of column indices.

    :return lmbda: The set of eigenvalues
    :return V: The matrix of eigenvectors as a ndarray
    :raises ValueError: if n is neither an int nor an ndarray.
    """
    numRows = X.shape[0]

    if type(n) == int:
        n = min(n, numRows)
        inds = numpy.sort(numpy.random.permutation(numRows)[0:n])
    elif type(n) == numpy.ndarray:
        inds = n
    else:
        raise ValueError("Invalid n value: " + str(n))

    allInds = numpy.arange(numRows)
    invInds = numpy.setdiff1d(allInds, inds)
    sortedInds = numpy.sort(inds)

    # With every column selected the exact decomposition is used
    if sortedInds.shape[0] == numRows and (sortedInds == allInds).all():
        if scipy.sparse.issparse(X):
            X = numpy.array(X.todense())
        lmbda, V = Util.safeEigh(X)
        return lmbda, V

    sampledRows = X[inds, :]
    A = sampledRows[:, inds]
    B = sampledRows[:, invInds]

    if scipy.sparse.issparse(X):
        A = numpy.array(A.todense())
        BB = numpy.array((B*B.T).todense())
    else:
        BB = B.dot(B.T)

    # matrixPowerh is used as scipy.linalg.sqrtm of the pseudo-inverse is very slow
    Am12 = Util.matrixPowerh(A, -0.5)
    S = A + Am12.dot(BB).dot(Am12)
    S = (S.T + S)/2

    lmbda, U = Util.safeEigh(S)

    tol = 10**-10
    magnitude = numpy.abs(lmbda)
    isSmall = magnitude < tol
    isLarge = magnitude > tol

    # Inverse square roots of the eigenvalues, with tiny ones set to zero
    lmbdaN = lmbda.copy()
    lmbdaN[isSmall] = 0
    lmbdaN[isLarge] = lmbdaN[isLarge]**-0.5

    V = X[:, inds].dot(Am12.dot(U)*lmbdaN)

    return lmbda, V
def processProbe(self):
    """
    Go through the probe set and label the corresponding ratings in the full
    dataset as test, unless the indicator file already exists.

    In the probe file a line containing ":" starts a new movie, its id being
    the line without its last two characters. Every other line is a customer
    id for the current movie. The rating of that customer for that movie is
    located in the saved rating arrays and marked as not-training. The
    resulting boolean array is saved with numpy.savez to
    self.isTrainRatingsFileName.
    """
    if os.path.exists(self.isTrainRatingsFileName):
        logging.debug("Train/test indicators file " + str(self.isTrainRatingsFileName) + " already processed")
        return

    # The dict is written in binary mode, so it must be read in binary mode
    # too. This file is produced by this class; never unpickle untrusted data.
    with open(self.custDictFileName, "rb") as custDictFile:
        custIdDict = pickle.load(custDictFile)

    dataArr = numpy.load(self.ratingFileName)
    movieInds, custInds, ratings, dates = dataArr["arr_0"], dataArr["arr_1"], dataArr["arr_2"], dataArr["arr_3"]
    logging.debug("Number of ratings: " + str(ratings.shape[0]+1))
    del ratings, dates
    logging.debug("Training data loaded")

    # The builtin bool replaces the numpy.bool alias, removed from NumPy
    isTrainRating = numpy.ones(movieInds.shape[0], bool)
    i = 0

    # First figure out the movie boundaries: movieBoundaries[m] is the
    # position of the first rating of movie index m
    movieBoundaries = numpy.nonzero(numpy.diff(movieInds) != 0)[0] + 1
    movieBoundaries = numpy.insert(movieBoundaries, 0, 0)
    movieBoundaries = numpy.append(movieBoundaries, movieInds.shape[0])

    assert movieBoundaries.shape[0] == self.numMovies+1
    assert movieBoundaries[-1] == movieInds.shape[0]

    with open(self.probeFileName) as probeFile:
        for line in probeFile:
            if line.find(":") != -1:
                Util.printIteration(i, 10, self.numProbeMovies)
                movieId = line[0:-2]
                movieInd = int(movieId)-1

                startInd = movieBoundaries[movieInd]
                endInd = movieBoundaries[movieInd+1]
                # All the customers that watched movie movieInd, sorted once
                # per movie so each customer lookup is a binary search
                tempCustInds = custInds[startInd:endInd]
                sortedInds = numpy.argsort(tempCustInds)
                sortedCustInds = tempCustInds[sortedInds]

                assert (movieInds[startInd:endInd] == movieInd).all()

                i += 1
            else:
                custId = int(line.strip())
                custInd = custIdDict[custId]

                offset = numpy.searchsorted(sortedCustInds, custInd)
                isTrainRating[startInd + sortedInds[offset]] = 0
                assert custInds[startInd + sortedInds[offset]] == custInd

    assert i == self.numProbeMovies
    assert numpy.logical_not(isTrainRating).sum() == self.numProbeRatings

    numpy.savez(self.isTrainRatingsFileName, isTrainRating)
    logging.debug("Saved file as " + self.isTrainRatingsFileName)
def setVertex(self, index, value):
    """
    Abstract method: store a value for a single vertex.

    This base version only delegates to Util.abstract().

    :param index: the index of the vertex to assign a value.
    :type index: :class:`int`

    :param value: the value to assign to the vertex.
    """
    Util.abstract()
def addEdge(self, vertexIndex1, vertexIndex2, edgeTypeIndex, edge):
    """
    Abstract method: insert an edge joining two vertices.

    This base version only delegates to Util.abstract().

    @param vertexIndex1: The index of the first vertex.
    @param vertexIndex2: The index of the second vertex.
    @param edgeTypeIndex: Presumably selects which type of edge to add; its meaning is not visible here - confirm against implementations.
    @param edge: The value to assign to the edge.
    """
    Util.abstract()
def testEntropy(self):
    """
    Check Util.entropy on an evenly split two-valued vector (expected 1) and
    on constant vectors (expected 0).
    """
    # assertEqual replaces the assertEquals alias, removed in Python 3.12
    v = numpy.array([0, 0, 0, 1, 1, 1])
    self.assertEqual(Util.entropy(v), 1)

    v = numpy.array([0, 0, 0])
    self.assertEqual(Util.entropy(v), 0)

    v = numpy.array([1, 1, 1])
    self.assertEqual(Util.entropy(v), 0)
def testMatrixPowerh(self):
    """
    Check Util.matrixPowerh against direct computations for the powers
    0.5, -1, 1, 2 and -2 on a random matrix A.T.dot(A), then for the powers
    0.5, -0.5, -1, 1 and 2 on a low rank matrix (compared with the
    pseudo-inverse).
    """
    tol = 10**-6

    def assertClose(expected, actual):
        # Two matrices are considered equal when the norm of their difference is below tol
        self.assertTrue(numpy.linalg.norm(expected - actual) < tol)

    A = numpy.random.rand(10, 10)
    A = A.T.dot(A)
    A2 = A.dot(A)
    lmbda, V = scipy.linalg.eig(A)

    A12 = Util.matrixPowerh(A, 0.5)

    assertClose(A12.dot(A12), A)
    assertClose(numpy.linalg.inv(A), Util.matrixPowerh(A, -1))
    assertClose(A, Util.matrixPowerh(A, 1))
    assertClose(A2, Util.matrixPowerh(A, 2))
    assertClose(numpy.linalg.inv(A).dot(numpy.linalg.inv(A)), Util.matrixPowerh(A, -2))

    # Now lets test on a low rank matrix, obtained by zeroing trailing eigenvalues
    lmbda[5:] = 0
    A = V.dot(numpy.diag(lmbda)).dot(numpy.linalg.inv(V))
    A2 = A.dot(A)
    A12 = Util.matrixPowerh(A, 0.5)
    Am12 = Util.matrixPowerh(A, -0.5)

    assertClose(numpy.linalg.pinv(A), Util.matrixPowerh(A, -1))
    assertClose(numpy.linalg.pinv(A), Am12.dot(Am12))
    assertClose(A12.dot(A12), A)
    assertClose(A, Util.matrixPowerh(A, 1))
    assertClose(A2, Util.matrixPowerh(A, 2))
def distance2(self, graph1, graph2, permutation):
    """
    Compute a graph distance between two graphs given a permutation vector.
    With P the permutation matrix built from the permutation, this is

    F(P) = (1-alpha)/(||W1||^2_F + ||W2||^2_F) ||W1 - P W2 P.T||^2_F
            + alpha/(||V1||_F^2 + ||V2||_F^2) ||V1 - P.T V2||^2_F

    where W1, W2 are the weight matrices (if self.useWeightM) or adjacency
    matrices, and V1, V2 the vertex arrays. The smaller matrix of each pair
    is first extended to the shape of the larger one, and a term is left
    un-normalised when its normaliser is zero.

    :param graph1: A graph object

    :param graph2: The second graph object to match

    :param permutation: An array of permutation indices matching the first to second graph
    :type permutation: `numpy.ndarray`

    :returns: The combined distance.
    """
    def padToMatch(first, second):
        # Extend whichever array has fewer rows to the shape of the other
        if first.shape[0] < second.shape[0]:
            first = Util.extendArray(first, second.shape)
        elif second.shape[0] < first.shape[0]:
            second = Util.extendArray(second, first.shape)
        return first, second

    if self.useWeightM:
        W1, W2 = graph1.getWeightMatrix(), graph2.getWeightMatrix()
    else:
        W1, W2 = graph1.adjacencyMatrix(), graph2.adjacencyMatrix()
    W1, W2 = padToMatch(W1, W2)

    # Permutation matrix with a one at (i, permutation[i]) for each row i
    n = W1.shape[0]
    P = numpy.zeros((n, n))
    P[(numpy.arange(n), permutation)] = 1

    edgeDist = numpy.linalg.norm(W1 - P.dot(W2).dot(P.T))**2

    # Now compute the vertex similarities distance
    V1 = graph1.getVertexList().getVertices()
    V2 = graph2.getVertexList().getVertices()
    V1, V2 = padToMatch(V1, V2)

    vertexDist = numpy.sum((V1 - P.T.dot(V2))**2)

    edgeNorm = (W1**2).sum() + (W2**2).sum()
    vertexNorm = (V1**2).sum() + (V2**2).sum()

    if edgeNorm != 0:
        edgeDist = edgeDist/edgeNorm
    if vertexNorm != 0:
        vertexDist = vertexDist/vertexNorm

    return (1-self.alpha)*edgeDist + self.alpha*vertexDist
def save(self, filename):
    """
    Pickle the vertex array self.V to the given filename with self.ext
    appended, overwriting any existing file.

    :param filename: The name of the file to save to, without extension.
    :type filename: :class:`str`

    :returns: The name of the saved file including extension.
    """
    savedName = filename + self.ext
    Util.savePickle(self.V, savedName, overwrite=True)
    return savedName
def removeEdge(self, vertexIndex1, vertexIndex2):
    """
    Abstract method: delete the edge joining two vertices.

    This base version only delegates to Util.abstract().

    :param vertexIndex1: The index of the first vertex.
    :type vertexIndex1: :class:`int`

    :param vertexIndex2: The index of the second vertex.
    :type vertexIndex2: :class:`int`
    """
    Util.abstract()
def testMode(self):
    """
    Check Util.mode on vectors with a unique most frequent value, with tied
    frequencies (the smallest tied value is expected) and with one element.
    """
    # assertEqual replaces the assertEquals alias, removed in Python 3.12
    x = numpy.array([1,1,1,2,2,3,3,3,3,3,5,5])
    self.assertEqual(Util.mode(x), 3)

    x = numpy.array([1,1,1,2,2,3,3,3,5,5])
    self.assertEqual(Util.mode(x), 1)

    x = numpy.array([1,2,3,4])
    self.assertEqual(Util.mode(x), 1)

    x = numpy.array([0])
    self.assertEqual(Util.mode(x), 0)
def testCumMin(self):
    """
    Check Util.cumMin (running minimum) on a mixed vector, on a strictly
    decreasing vector and on a strictly increasing vector.
    """
    decreasing = numpy.array([5, 4, 3, 2, 1])

    # Pairs of (input, expected); the decreasing input is compared with itself
    cases = [
        (numpy.array([5, 6, 4, 5, 1]), numpy.array([5, 5, 4, 4, 1])),
        (decreasing, decreasing),
        (numpy.array([1, 2, 3]), numpy.ones(3)),
    ]

    for v, expected in cases:
        u = Util.cumMin(v)
        nptst.assert_array_equal(u, expected)
def testRank(self):
    """
    Check that Util.rank of random matrices equals the smaller of their two
    dimensions.
    """
    # assertEqual replaces the assertEquals alias, removed in Python 3.12
    X = numpy.random.rand(10, 1)
    self.assertEqual(Util.rank(X), 1)

    X = numpy.random.rand(10, 12)
    self.assertEqual(Util.rank(X), 10)

    X = numpy.random.rand(31, 12)
    self.assertEqual(Util.rank(X), 12)

    # NOTE(review): K is computed but the assertion below still checks X;
    # possibly Util.rank(K) was intended - confirm before changing.
    K = numpy.dot(X, X.T)
    self.assertEqual(Util.rank(X), 12)
def load(cls, filename):
    """
    Load the graph object from the corresponding file. Data is loaded in a zip
    format as created using save().

    The archive filename + ".zip" is extracted into a temporary directory,
    which is removed again before returning, even if loading fails. If the
    current format files cannot be read (IOError), the legacy layout is
    tried instead.

    :param filename: The name of the file to load, without the .zip extension.
    :type filename: :class:`str`

    :returns: A graph corresponding to the one saved in filename.
    """
    Parameter.checkClass(filename, str)
    import shutil
    import zipfile

    (path, filename) = os.path.split(filename)
    if path == "":
        path = "./"

    tempPath = tempfile.mkdtemp()
    originalPath = os.getcwd()

    try:
        os.chdir(path)

        myzip = zipfile.ZipFile(filename + '.zip', 'r')
        try:
            myzip.extractall(tempPath)
        finally:
            # Release the archive handle even if extraction fails
            myzip.close()

        # The component loaders below are given names relative to tempPath
        os.chdir(tempPath)

        # Deal with legacy files
        try:
            W = cls.loadMatrix(cls._wFilename)
            metaDict = Util.loadPickle(cls._metaFilename)
            vList = globals()[metaDict["vListType"]].load(cls._verticesFilename)
            undirected = metaDict["undirected"]
        except IOError:
            W = cls.loadMatrix(filename + cls._matExt)
            vList = VertexList.load(filename)
            undirected = Util.loadPickle(filename + cls._boolExt)

        graph = cls(vList, undirected)
        graph.W = W
    finally:
        os.chdir(originalPath)
        # rmtree also works when an error left extracted files behind;
        # os.rmdir would then fail and hide the original exception.
        shutil.rmtree(tempPath)

    return graph
def getFeatureDistribution(self, fIndex, vIndices=None):
    """
    Returns a tuple (frequencies, items) about a particular feature given
    by fIndex, as computed by Util.histogram. This method is deprecated.

    :param fIndex: the column index of the feature in self.V.

    :param vIndices: the vertex (row) indices to include, or None for all vertices.

    :returns: A tuple (freqs, items).
    """
    Parameter.checkIndex(fIndex, 0, self.getNumFeatures())

    # Test identity with None: "vIndices == None" is evaluated element-wise
    # for a numpy array, which makes the condition ambiguous.
    if vIndices is None:
        (freqs, items) = Util.histogram(self.V[:, fIndex])
    else:
        (freqs, items) = Util.histogram(self.V[vIndices, fIndex])

    return (freqs, items)
def save(self, filename):
    """
    Save the graph object to the corresponding filename under the .zip
    extension. The weight matrix is written by self.saveMatrix, the vertex
    list decides how to store the vertex labels, and a pickled dict records
    the library version, whether the graph is undirected and the vertex
    list class name.

    The archive is assembled in a temporary directory, which is removed
    again before returning, even if saving fails.

    :param filename: The name of the file to save, without the .zip extension.
    :type filename: :class:`str`

    :returns: The name of the saved zip file.
    """
    Parameter.checkClass(filename, str)
    import zipfile

    (path, filename) = os.path.split(filename)
    if path == "":
        path = "./"

    savedName = path + "/" + filename + '.zip'
    # Resolve the destination before changing directory: a relative path
    # would otherwise be interpreted relative to the temporary directory.
    destination = os.path.abspath(savedName)

    tempPath = tempfile.mkdtemp()
    originalPath = os.getcwd()

    try:
        os.chdir(tempPath)

        self.saveMatrix(self.W, self._wFilename)
        vListFilename = self.vList.save(self._verticesFilename)

        metaDict = {}
        metaDict["version"] = apgl.__version__
        metaDict["undirected"] = self.undirected
        metaDict["vListType"] = self.vList.__class__.__name__
        Util.savePickle(metaDict, self._metaFilename)

        myzip = zipfile.ZipFile(filename + '.zip', 'w')
        try:
            myzip.write(self._wFilename)
            myzip.write(vListFilename)
            myzip.write(self._metaFilename)
        finally:
            # Release the archive handle even if a write fails
            myzip.close()

        shutil.move(filename + ".zip", destination)
    finally:
        os.chdir(originalPath)
        # rmtree removes the intermediate files too, and still works when an
        # error left files behind; os.rmdir would then fail and hide it.
        shutil.rmtree(tempPath)

    return savedName
def addEdge(self, vertexIndex1, vertexIndex2, edgeValue):
    """
    Abstract method: insert a non-zero edge joining two vertices.

    This base version only delegates to Util.abstract().

    :param vertexIndex1: The index of the first vertex.
    :type vertexIndex1: :class:`int`

    :param vertexIndex2: The index of the second vertex.
    :type vertexIndex2: :class:`int`

    :param edgeValue: The value to assign to the edge.
    """
    Util.abstract()
def generate(self, graph, requireEmpty=True):
    '''
    Create a Configuration Model graph. Note that the degree sequence(s)
    given in the constructor cannot be guaranteed. Each vertex index is
    repeated as many times as its degree ("spokes"), the spokes are shuffled
    and then paired up to form edges. With an odd number of spokes the last
    one is left unused.

    In the case that requireEmpty is False then a non-empty graph can be used
    and the given degree sequence(s) is(are) the difference(s) in degrees
    between the output graph and input one.

    :param graph: a graph to populate with edges
    :type graph: :class:`apgl.graph.AbstractMatrixGraph`

    :param requireEmpty: if this is set to true then we require an empty graph.
    :type requireEmpty: :class:`bool`

    :returns: The modified input graph.
    :raises ValueError: if requireEmpty is set and the graph has edges, if the number of vertices differs from the length of the degree sequence, or if an in-degree sequence is given for an undirected graph.
    '''
    Parameter.checkClass(graph, AbstractMatrixGraph)
    if requireEmpty and graph.getNumEdges() != 0:
        raise ValueError("Graph must have no edges")
    if graph.getNumVertices() != self.outDegSequence.shape[0]:
        raise ValueError(
            "Graph must have same number of vertices as degree sequence")

    # Test identity with None: comparing a numpy array with "!= None" is
    # evaluated element-wise, which makes the condition ambiguous.
    hasInDegSequence = self.getInDegSequence() is not None
    if hasInDegSequence and graph.isUndirected():
        raise ValueError(
            "In-degree sequence must be used in conjunction with directed graphs"
        )

    if not hasInDegSequence:
        expandedInds = Util.expandIntArray(self.outDegSequence)
        numpy.random.shuffle(expandedInds)
        # Pair consecutive spokes; stopping at len - 1 skips an unpaired last spoke
        for i in range(0, len(expandedInds) - 1, 2):
            graph.addEdge(expandedInds[i], expandedInds[i + 1])
    else:
        expandedOutInds = Util.expandIntArray(self.outDegSequence)
        expandedInInds = Util.expandIntArray(self.inDegSequence)
        numpy.random.shuffle(expandedOutInds)
        numpy.random.shuffle(expandedInInds)
        # Only as many edges as there are spokes in the shorter sequence
        numEdges = min(expandedOutInds.shape[0], expandedInInds.shape[0])
        for i in range(numEdges):
            graph.addEdge(expandedOutInds[i], expandedInInds[i])

    return graph
def testRandom2Choice(self):
    """
    Check that the empirical frequencies of the samples drawn by
    Util.random2Choice agree with the given probabilities to one decimal
    place, for both a matrix and a vector of probabilities.
    """
    # assertAlmostEqual replaces the assertAlmostEquals alias, removed in Python 3.12
    n = 1000
    V = numpy.array([[0.3, 0.7], [0.5, 0.5]])

    J = Util.random2Choice(V, n)
    self.assertAlmostEqual(numpy.sum(J[0, :]==0)/float(n), V[0, 0], places=1)
    self.assertAlmostEqual(numpy.sum(J[0, :]==1)/float(n), V[0, 1], places=1)
    self.assertAlmostEqual(numpy.sum(J[1, :]==0)/float(n), V[1, 0], places=1)
    self.assertAlmostEqual(numpy.sum(J[1, :]==1)/float(n), V[1, 1], places=1)

    # Now use a vector of probabilities
    v = numpy.array([0.3, 0.7])
    j = Util.random2Choice(v, n)
    self.assertAlmostEqual(numpy.sum(j==0)/float(n), v[0], places=1)
    self.assertAlmostEqual(numpy.sum(j==1)/float(n), v[1], places=1)
def maxProductPaths(self):
    """
    Find the maximum product paths between all pairs of vertices using a
    modified version of the Floyd-Warshall algorithm: for each intermediate
    vertex k, every entry is replaced by the larger of its current value and
    the product of the entries for the paths into and out of k.

    :returns: A matrix P whose ijth entry corresponds to the maximal product of edge weights between them.
    """
    numVertices = self.vList.getNumVertices()
    products = self.getWeightMatrix().copy()
    stepSize = min(100, numVertices-1)

    for via in range(numVertices):
        Util.printIteration(via, stepSize, numVertices)
        # Entry (i, j) of the outer product is the weight of going i -> via -> j
        throughVia = numpy.outer(products[:, via], products[via, :])
        products = numpy.maximum(products, throughVia)

    return products