def __updateEigenSystem(self, lmbda, Q, deltaW, W):
    """
    Update the eigensystem (lmbda, Q) by applying each weight change in
    deltaW as an incremental eigen-update. The deltaW matrix holds the
    change in edges relative to the current weight matrix W, which is
    modified in place as each change is absorbed.
    """
    rowInds, colInds = deltaW.nonzero()
    numChanges = rowInds.shape[0]

    for ind in range(numChanges):
        Util.printIteration(ind, 10, numChanges)
        i = rowInds[ind]
        j = colInds[ind]

        # Symmetric matrix: process each change once, via the (i, j)
        # entry with i < j, and apply it to both triangles below.
        if i >= j:
            continue

        assert deltaW[i, j] != 0
        delta = deltaW[i, j]

        # W must reflect all previously absorbed changes before the next
        # incremental update is computed.
        lmbda, Q = self.incrementEigenSystem(lmbda, Q, W, i, j, delta)
        W[i, j] += delta
        W[j, i] += delta

    return lmbda, Q
def readAuthorsAndDocuments(self, useAbstract=True):
    """
    Parse the data file into parallel lists of authors, documents and
    citation counts. Records are separated by blank lines; fields are
    tagged with #* (title), #@ (authors), #! (abstract), #conf (venue)
    and #citation (citation count).

    :param useAbstract: If True each document is the title concatenated
        with the abstract, otherwise the title alone.
    :return: (authorList, documentList, citationList)
    """
    logging.debug("About to read file " + self.dataFilename)

    authorList = []
    citationList = []
    documentList = []

    lastAbstract = ""
    lastVenue = ""  # NOTE(review): tracked but never returned or reset -- confirm intent
    lastTitle = ""
    lastAuthors = []
    lastCitationNo = 0

    # Context manager ensures the file is closed even if parsing raises.
    with open(self.dataFilename) as inFile:
        for i, line in enumerate(inFile):
            Util.printIteration(i, self.stepSize, self.numLines)

            # Match the fields in the file. Raw strings avoid the invalid
            # escape sequence "\*" (a SyntaxWarning on modern Python).
            emptyLine = line == "\n"
            title = re.findall(r"#\*(.*)", line)
            currentAuthors = re.findall(r"#@(.*)", line)
            abstract = re.findall(r"#!(.*)", line)
            venue = re.findall(r"#conf(.*)", line)
            citationNo = re.findall(r"#citation(.*)", line)

            if emptyLine:
                # End of record: emit the accumulated fields.
                if useAbstract:
                    document = lastTitle + " " + lastAbstract
                else:
                    document = lastTitle
                documentList.append(document)
                authorList.append(lastAuthors)
                citationList.append(lastCitationNo)

                lastAbstract = ""
                lastTitle = ""
                lastAuthors = []
                lastCitationNo = 0

            if len(title) != 0 and len(title[0]) != 0:
                lastTitle = title[0]

            if len(venue) != 0 and len(venue[0]) != 0:
                lastVenue = venue[0]

            if len(abstract) != 0 and len(abstract[0]) != 0:
                lastAbstract = abstract[0]

            if len(citationNo) != 0 and len(citationNo[0]) != 0:
                lastCitationNo = int(citationNo[0])

            if len(currentAuthors) != 0:
                currentAuthors = currentAuthors[0].split(",")
                currentAuthors = set([x.strip() for x in currentAuthors])
                currentAuthors = currentAuthors.difference(set([""]))
                lastAuthors = currentAuthors

    logging.debug("Finished reading " + str(len(documentList)) + " articles")

    return authorList, documentList, citationList
def __updateEigenSystem(self, lmbda, Q, deltaW, W):
    """
    Propagate a set of edge-weight changes deltaW through the eigensystem
    (lmbda, Q) of the graph whose current weight matrix is W. One
    incremental eigen-update is performed per processed entry, after
    which W is updated symmetrically in place.
    """
    rows, cols = deltaW.nonzero()
    total = rows.shape[0]

    s = 0
    while s < total:
        Util.printIteration(s, 10, total)
        i = rows[s]
        j = cols[s]
        s += 1

        # Each symmetric change is handled once through its i < j entry.
        if i >= j:
            continue

        assert deltaW[i, j] != 0
        change = deltaW[i, j]

        # Keep W current so each update sees the matrix produced by all
        # of the earlier updates.
        lmbda, Q = self.incrementEigenSystem(lmbda, Q, W, i, j, change)
        W[i, j] += change
        W[j, i] += change

    return lmbda, Q
def predictEdges(self, vertexIndices):
    """
    Predict edges for each query vertex using the score
    sum over z in n(x) intersect n(y) of 1/log(deg(z)).
    Returns two matrices whose rows hold, for each query vertex, a ranked
    list of vertices of length self.windowSize and the matching scores.
    """
    Parameter.checkInt(self.windowSize, 1, self.graph.getNumVertices())
    logging.info("Running predictEdges in " + str(self.__class__.__name__))

    numQueries = vertexIndices.shape[0]
    numVertices = self.graph.getNumVertices()
    P = numpy.zeros((numQueries, self.windowSize))
    S = numpy.zeros((numQueries, self.windowSize))
    W = self.graph.getWeightMatrix()

    for row in range(numQueries):
        Util.printIteration(row, self.printStep, numQueries)
        scores = numpy.zeros(numVertices)

        for candidate in range(numVertices):
            # Vertices adjacent to both the query vertex and the candidate
            shared = numpy.nonzero(W[vertexIndices[row], :] * W[candidate, :])[0]

            for z in shared:
                logDegree = numpy.log(numpy.nonzero(W[z, :])[0].shape[0])
                if logDegree != 0:
                    scores[candidate] += 1 / logDegree

        P[row, :], S[row, :] = self.indicesFromScores(vertexIndices[row], scores)

    return P, S
def evaluate(self, g1, g2, debug=False):
    """
    Find the kernel evaluation between two graphs. The graph with fewer
    vertices is always treated as the first argument.
    """
    # W1 is always the smallest graph
    if g1.getNumVertices() > g2.getNumVertices():
        return self.evaluate(g2, g1)

    # We ought to have something that makes the matrices the same size
    W1, W2 = self.__getWeightMatrices(g1, g2)
    K1, K2 = self.__getKernelMatrices(g1, g2)

    # Find a common eigenspace of the tau-weighted combinations
    S1, U = numpy.linalg.eigh(self.tau * W1 + (1 - self.tau) * K1)
    S2, V = numpy.linalg.eigh(self.tau * W2 + (1 - self.tau) * K2)

    # Approximate diagonalisations of each matrix in the common bases
    SK1 = numpy.diag(Util.mdot(U.T, K1, U))
    SW1 = numpy.diag(Util.mdot(U.T, W1, U))
    SK2 = numpy.diag(Util.mdot(V.T, K2, V))
    SW2 = numpy.diag(Util.mdot(V.T, W2, V))

    weightTerm = numpy.dot(SW1, SW2)
    kernelTerm = numpy.dot(SK1, SK2)
    evaluation = self.tau * weightTerm + (1 - self.tau) * kernelTerm

    if not debug:
        return evaluation

    P = numpy.dot(V, U.T)
    f = self.getObjectiveValue(self.tau, P, g1, g2)
    return (evaluation, f, P, SW1, SW2, SK1, SK2)
def cleanXML(self):
    """
    Take the original XML file and clean up HTML characters and & symbols,
    writing the result to self.xmlCleanFilename. Does nothing if the clean
    file already exists.
    """
    if not os.path.exists(self.xmlCleanFilename):
        logging.debug("Cleaning XML")
        h = HTMLParser.HTMLParser()

        inFile = open(self.xmlFileName)
        outFile = open(self.xmlCleanFilename, "w")
        i = 0

        for line in inFile:
            Util.printIteration(i, self.stepSize, self.numLines)
            # Unescape HTML entities, then re-escape bare ampersands so the
            # output stays well-formed XML. The previous code called the
            # no-op replace("&", "&"), leaving raw "&" in the output.
            outLine = h.unescape(line).replace("&", "&amp;")
            # Fields containing stray angle brackets would break an XML
            # parser, so replace them with placeholder text. Raw strings
            # avoid the invalid escapes "\<" and "\>" in the patterns.
            outLine = re.sub(r"<title>.*[\<\>].*</title>", "<title>Default Title</title>", outLine)
            outLine = re.sub(r"<ee>.*[\<\>].*</ee>", "<ee>Default text</ee>", outLine)
            outFile.write(outLine)
            i += 1

        inFile.close()
        outFile.close()
        logging.debug("All done")
    else:
        logging.debug("File already generated: " + self.xmlCleanFilename)
def simulateModel(theta):
    """
    Simulate the HIV epidemic model for the parameter vector theta and
    return the resulting objective value. Relies on module-level
    configuration (targetGraph, startDate, endDate, M, alpha, zeroVal,
    matchAlg, matchAlpha, breakSize, numRecordSteps).
    """
    logging.debug("theta=" + str(theta))

    # We start with the observed graph at the start date, padded to M vertices
    graph = targetGraph.subgraph(targetGraph.removedIndsAt(startDate))
    graph.addVertices(M - graph.size)

    # Hidden degree sequence drawn from a power-law distribution
    p = Util.powerLawProbs(alpha, zeroVal)
    hiddenDegSeq = Util.randomChoice(p, graph.getNumVertices())

    # Exclude features that should not influence the graph match
    featureInds = numpy.ones(graph.vlist.getNumFeatures(), numpy.bool)
    for excluded in (HIVVertices.dobIndex, HIVVertices.infectionTimeIndex,
                     HIVVertices.hiddenDegreeIndex, HIVVertices.stateIndex):
        featureInds[excluded] = False
    featureInds = numpy.arange(featureInds.shape[0])[featureInds]

    matcher = GraphMatch(matchAlg, alpha=matchAlpha, featureInds=featureInds, useWeightM=False)
    graphMetrics = HIVGraphMetrics2(targetGraph, breakSize, matcher, float(endDate))

    recordStep = (endDate - startDate) / float(numRecordSteps)
    rates = HIVRates(graph, hiddenDegSeq)
    model = HIVEpidemicModel(graph, rates, T=float(endDate), T0=float(startDate), metrics=graphMetrics)
    model.setRecordStep(recordStep)
    model.setParams(theta)

    model.simulate()
    return model.objective()
def predict(self, X):
    """
    Make a prediction for a set of examples given as the rows of the
    matrix X.

    Abstract method: subclasses must override it; calling it directly
    raises via Util.abstract().

    :param X: A matrix with examples as rows
    :type X: :class:`ndarray`
    """
    Util.abstract()
def testIncrementEigenSystem(self):
    """
    Check that NingSpectralClustering.incrementEigenSystem approximates
    the exact generalised eigensystem of the Laplacian after a single
    edge weight change.
    """
    # The Python 2 print statement fails on Python 3; the function form
    # behaves identically for a single argument on both versions.
    print("< testIncrementEigenSystem >")
    numVertices = 10
    graph = SparseGraph(GeneralVertexList(numVertices))

    p = 0.4
    generator = ErdosRenyiGenerator(p)
    graph = generator.generate(graph)

    W = graph.getWeightMatrix()
    L = graph.laplacianMatrix()
    degrees = graph.outDegreeSequence()
    D = numpy.diag(degrees)

    # Generalised eigenproblem L x = lambda D x with D-normalised vectors
    lmbda1, Q1 = scipy.linalg.eig(L, D)
    lmbda1 = lmbda1.real
    Q1 = Q1.dot(numpy.diag(numpy.diag(Q1.T.dot(D).dot(Q1))**-0.5))

    tol = 10**-6
    k = 3
    inds = numpy.argsort(lmbda1)[0:k]
    lmbda1, Q1 = Util.indEig(lmbda1, Q1, inds)

    # Similarity change: increase the weight of edge (5, 7) by deltaW.
    # (The unused read of graph.getEdge(5, 7) has been removed.)
    deltaW = 0.5

    k = 3
    clusterer = NingSpectralClustering(k)
    lmbda2Approx, Q2Approx = clusterer.incrementEigenSystem(
        lmbda1, Q1, scipy.sparse.csr_matrix(W), 5, 7, deltaW)

    # Compute real eigenvectors then compare against these
    Lhat = L.copy()
    Lhat[5, 5] += deltaW
    Lhat[7, 7] += deltaW
    Lhat[5, 7] -= deltaW
    Lhat[7, 5] -= deltaW
    Dhat = numpy.diag(numpy.diag(Lhat))

    lmbda2, Q2 = scipy.linalg.eig(Lhat, Dhat)
    lmbda2, Q2 = Util.indEig(lmbda2, Q2, inds)

    # Normalise the eigenvector sets to unit columns before comparing
    Q2Approx = Q2Approx.dot(numpy.diag(numpy.diag(Q2Approx.T.dot(Q2Approx))**-0.5))
    Q2 = Q2.dot(numpy.diag(numpy.sum(Q2**2, 0)**-0.5))
    Q1 = Q1.dot(numpy.diag(numpy.sum(Q1**2, 0)**-0.5))

    # Errors in the eigenvalues
    logging.debug("Eigenvalue Errors")
    logging.debug(numpy.linalg.norm(lmbda2 - lmbda2Approx))
    logging.debug(numpy.linalg.norm(lmbda2 - lmbda1))

    # Compute error according to the paper
    error = numpy.sum(1 - numpy.diag(Q2.T.dot(Q2Approx))**2)
    error2 = numpy.sum(1 - numpy.diag(Q2.T.dot(Q1))**2)
    logging.debug("Eigenvector Errors")
    logging.debug(error)
    logging.debug(error2)
def addRows(U, s, V, B, k=None):
    """
    Find the SVD of a matrix [A ; B] where A = U diag(s) V.T. Uses the QR
    decomposition to find an orthogonal basis on B.

    :param U: The left singular vectors of A
    :param s: The singular values of A
    :param V: The right singular vectors of A
    :param B: The matrix to append to A
    :param k: The rank of the returned approximation (defaults to the
        number of columns of U)
    :return: (Utilde, Stilde, Vtilde), the rank-k SVD of [A ; B]
    """
    if V.shape[0] != B.shape[1]:
        raise ValueError("U must have same number of rows as B cols")
    if s.shape[0] != U.shape[1]:
        raise ValueError("Number of cols of U must be the same size as s")
    if s.shape[0] != V.shape[1]:
        raise ValueError("Number of cols of V must be the same size as s")

    # Identity test, not equality: "k == None" is unidiomatic and can
    # misbehave for array-like k
    if k is None:
        k = U.shape[1]

    m, p = U.shape
    r = B.shape[0]

    # Component of B.T orthogonal to the space spanned by V
    C = B.T - V.dot(V.T).dot(B.T)
    Q, R = numpy.linalg.qr(C)

    rPrime = Util.rank(C)
    Q = Q[:, 0:rPrime]
    R = R[0:rPrime, :]

    D = numpy.c_[numpy.diag(s), numpy.zeros((p, rPrime))]
    E = numpy.c_[B.dot(V), R.T]
    D = numpy.r_[D, E]

    G1 = numpy.c_[U, numpy.zeros((m, r))]
    G2 = numpy.c_[numpy.zeros((r, p)), numpy.eye(r)]
    G = numpy.r_[G1, G2]

    H = numpy.c_[V, Q]

    if __debug__:
        # Expensive sanity checks (full-size matrix products) -- only run
        # in debug mode rather than on every production call (skipped
        # under python -O).
        nptst.assert_array_almost_equal(G.T.dot(G), numpy.eye(G.shape[1]))
        nptst.assert_array_almost_equal(H.T.dot(H), numpy.eye(H.shape[1]))
        nptst.assert_array_almost_equal(G.dot(D).dot(H.T), numpy.r_[(U * s).dot(V.T), B])

    Uhat, sHat, Vhat = numpy.linalg.svd(D, full_matrices=False)
    inds = numpy.flipud(numpy.argsort(sHat))[0:k]
    Uhat, sHat, Vhat = Util.indSvd(Uhat, sHat, Vhat, inds)

    # The best rank k approximation of [A ; B]
    Utilde = G.dot(Uhat)
    Stilde = sHat
    Vtilde = H.dot(Vhat)

    return Utilde, Stilde, Vtilde
def testExpandIntArray(self):
    """
    Util.expandIntArray should repeat each index i exactly v[i] times and
    return an empty array for empty input.
    """
    # numpy.int was deprecated in NumPy 1.20 and removed in 1.24; the
    # builtin int is the documented replacement.
    v = numpy.array([1, 3, 2, 4], int)
    w = Util.expandIntArray(v)
    self.assertTrue((w == numpy.array([0, 1, 1, 1, 2, 2, 3, 3, 3, 3], int)).all())

    v = numpy.array([], int)
    w = Util.expandIntArray(v)
    self.assertTrue((w == numpy.array([], int)).all())
def eigpsd(X, n):
    """
    Find the eigenvalues and eigenvectors of a positive semi-definite
    symmetric matrix via column sampling (Nystrom style). The input matrix
    X can be a numpy array or a scipy sparse matrix. In the case that
    n == X.shape[0] we convert to an ndarray and solve exactly.

    :param X: The matrix to find the eigenvalues of.
    :type X: :class:`ndarray`
    :param n: If n is an int, then it is the number of columns to sample
        otherwise n is an array of column indices.

    :return lmbda: The set of eigenvalues
    :return V: The matrix of eigenvectors as a ndarray
    """
    # isinstance (rather than type(n) == int) also accepts numpy integer
    # scalars such as numpy.int64, which behave like ints here.
    if isinstance(n, (int, numpy.integer)):
        n = min(n, X.shape[0])
        inds = numpy.sort(numpy.random.permutation(X.shape[0])[0:n])
    elif isinstance(n, numpy.ndarray):
        inds = numpy.sort(n)
    else:
        raise ValueError("Invalid n value: " + str(n))

    invInds = numpy.setdiff1d(numpy.arange(X.shape[0]), inds)

    # All columns sampled: fall back to a full (exact) decomposition
    if inds.shape[0] == X.shape[0] and (inds == numpy.arange(X.shape[0])).all():
        if scipy.sparse.issparse(X):
            X = numpy.array(X.todense())
        lmbda, V = Util.safeEigh(X)
        return lmbda, V

    tmp = X[inds, :]
    A = tmp[:, inds]
    B = tmp[:, invInds]

    if scipy.sparse.issparse(X):
        A = numpy.array(A.todense())
        BB = numpy.array((B.dot(B.T)).todense())
    else:
        BB = B.dot(B.T)

    # Following line is very slow
    # Am12 = scipy.linalg.sqrtm(numpy.linalg.pinv(A))
    Am12 = Util.matrixPowerh(A, -0.5)

    S = A + Am12.dot(BB).dot(Am12)
    # Symmetrise against round-off before the eigendecomposition
    S = (S.T + S) / 2

    lmbda, U = Util.safeEigh(S)

    # Invert (via power -0.5) only the numerically nonzero eigenvalues
    tol = 10**-10
    lmbdaN = lmbda.copy()
    lmbdaN[numpy.abs(lmbda) < tol] = 0
    lmbdaN[numpy.abs(lmbda) > tol] = lmbdaN[numpy.abs(lmbda) > tol]**-0.5

    V = X[:, inds].dot(Am12.dot(U) * lmbdaN)

    return lmbda, V
def evaluateLearn(X, y, idx, learnModel, predict, metricMethod, progress=True):
    """
    Evaluate a learning algorithm over a list of training/test splits.
    metricMethod takes (predictedY, realY) and returns a quality metric;
    one metric value is produced per split.

    :param X: A matrix with examples as rows
    :type X: :class:`ndarray`
    :param y: A vector of labels
    :type y: :class:`ndarray`
    :param idx: A list of training/test splits
    :type idx: :class:`list`
    :param learnModel: A function such that learnModel(X, y) finds a mapping from X to y
    :type learnModel: :class:`function`
    :param predict: A function such that predict(X) makes predictions for X
    :type predict: :class:`function`
    :param metricMethod: A function such that metricMethod(predY, testY) returns the quality of predicted labels predY
    :type metricMethod: :class:`function`

    :return: An array holding one metric value per split.
    """
    Parameter.checkClass(X, numpy.ndarray)
    Parameter.checkArray(X, softCheck=True)
    Parameter.checkInt(X.shape[0], 1, float('inf'))
    Parameter.checkClass(y, numpy.ndarray)
    Parameter.checkArray(y, softCheck=True)

    if y.ndim != 1:
        raise ValueError("Dimention of y must be 1")

    metrics = numpy.zeros(len(idx))
    logging.debug("EvaluateLearn: Using " + str(len(idx)) + " splits on " + str(X.shape[0]) + " examples")

    for splitNum, (idxtr, idxts) in enumerate(idx):
        if progress:
            Util.printConciseIteration(splitNum, 1, len(idx))

        trainX, trainY = X[idxtr, :], y[idxtr]
        testX, testY = X[idxts, :], y[idxts]

        learnModel(trainX, trainY)
        predY = predict(testX)
        # Free per-fold temporaries before scoring
        gc.collect()

        metrics[splitNum] = metricMethod(predY, testY)

    return metrics
def eigenAdd(omega, Q, Y, k): """ Perform an eigen update of the form A*A + Y*Y in which Y is a low-rank matrix and A^*A = Q Omega Q*. We use the rank-k approximation of A: Q_k Omega_k Q_k^* and then approximate [A^*A_k Y^*Y]_k. """ #logging.debug("< eigenAdd >") Parameter.checkInt(k, 0, omega.shape[0]) #if not numpy.isrealobj(omega) or not numpy.isrealobj(Q): # raise ValueError("Eigenvalues and eigenvectors must be real") if omega.ndim != 1: raise ValueError("omega must be 1-d array") if omega.shape[0] != Q.shape[1]: raise ValueError("Must have same number of eigenvalues and eigenvectors") if __debug__: Parameter.checkOrthogonal(Q, tol=EigenUpdater.tol, softCheck=True, arrayInfo="input Q in eigenAdd()") #Taking the abs of the eigenvalues is correct inds = numpy.flipud(numpy.argsort(numpy.abs(omega))) omega, Q = Util.indEig(omega, Q, inds[numpy.abs(omega)>EigenUpdater.tol]) Omega = numpy.diag(omega) YY = Y.conj().T.dot(Y) QQ = Q.dot(Q.conj().T) Ybar = Y - Y.dot(QQ) Pbar, sigmaBar, Qbar = numpy.linalg.svd(Ybar, full_matrices=False) inds = numpy.flipud(numpy.argsort(numpy.abs(sigmaBar))) inds = inds[numpy.abs(sigmaBar)>EigenUpdater.tol] Pbar, sigmaBar, Qbar = Util.indSvd(Pbar, sigmaBar, Qbar, inds) SigmaBar = numpy.diag(sigmaBar) Qbar = Ybar.T.dot(Pbar) Qbar = Qbar.dot(numpy.diag(numpy.diag(Qbar.T.dot(Qbar))**-0.5)) r = sigmaBar.shape[0] YQ = Y.dot(Q) Zeros = numpy.zeros((r, omega.shape[0])) D = numpy.c_[Q, Qbar] YYQQ = YY.dot(QQ) Z = D.conj().T.dot(YYQQ + YYQQ.conj().T).dot(D) F = numpy.c_[numpy.r_[Omega - YQ.conj().T.dot(YQ), Zeros], numpy.r_[Zeros.T, SigmaBar.conj().dot(SigmaBar)]] F = F + Z pi, H = scipy.linalg.eigh(F) inds = numpy.flipud(numpy.argsort(numpy.abs(pi))) H = H[:, inds[0:k]] pi = pi[inds[0:k]] V = D.dot(H) #logging.debug("</ eigenAdd >") return pi, V
def evaluateCvOuter(self, X, Y, folds):
    """
    Run stratified cross validation and collect train/test AUCs and ROC
    curves for each fold. In this case Y is a 1D array.

    :param X: A matrix with examples as rows
    :type X: :class:`ndarray`
    :param Y: A vector of labels
    :type Y: :class:`ndarray`
    :param folds: The number of cross validation folds
    :type folds: :class:`int`
    """
    Parameter.checkClass(X, numpy.ndarray)
    Parameter.checkClass(Y, numpy.ndarray)
    Parameter.checkInt(folds, 2, float('inf'))
    if Y.ndim != 1:
        raise ValueError("Expecting Y to be 1D")

    indexList = cross_val.StratifiedKFold(Y, folds)

    bestParams = []
    bestTrainAUCs = numpy.zeros(folds)
    bestTrainROCs = []
    bestTestAUCs = numpy.zeros(folds)
    bestTestROCs = []
    bestMetaDicts = []

    for foldNum, (trainInds, testInds) in enumerate(indexList):
        Util.printIteration(foldNum, 1, folds, "Outer CV: ")
        trainX, trainY = X[trainInds, :], Y[trainInds]
        testX, testY = X[testInds, :], Y[testInds]

        self.learnModel(trainX, trainY)
        predTrainY = self.predict(trainX)
        predTestY = self.predict(testX)

        # AUC on both portions of the fold
        bestTrainAUCs[foldNum] = Evaluator.auc(predTrainY, trainY)
        bestTestAUCs[foldNum] = Evaluator.auc(predTestY, testY)

        # Store the parameters and ROC curves
        bestTrainROCs.append(Evaluator.roc(trainY, predTrainY))
        bestTestROCs.append(Evaluator.roc(testY, predTestY))

        bestMetaDicts.append({})

    logging.debug("Mean test AUC = " + str(numpy.mean(bestTestAUCs)))
    logging.debug("Std test AUC = " + str(numpy.std(bestTestAUCs)))
    allMetrics = [bestTrainAUCs, bestTrainROCs, bestTestAUCs, bestTestROCs]

    return (bestParams, allMetrics, bestMetaDicts)
def eigpsd(X, n):
    """
    Find the eigenvalues and eigenvectors of a positive semi-definite
    symmetric matrix via column sampling. The input matrix X can be a
    numpy array or a scipy sparse matrix. In the case that n == X.shape[0]
    we convert to an ndarray and solve exactly.

    :param X: The matrix to find the eigenvalues of.
    :type X: :class:`ndarray`
    :param n: If n is an int, then it is the number of columns to sample
        otherwise n is an array of column indices.

    :return lmbda: The set of eigenvalues
    :return V: The matrix of eigenvectors as a ndarray
    """
    # Use isinstance so numpy integer scalars (e.g. numpy.int64) are also
    # accepted; type(n) == int rejected them.
    if isinstance(n, (int, numpy.integer)):
        n = min(n, X.shape[0])
        inds = numpy.sort(numpy.random.permutation(X.shape[0])[0:n])
    elif isinstance(n, numpy.ndarray):
        inds = numpy.sort(n)
    else:
        raise ValueError("Invalid n value: " + str(n))

    invInds = numpy.setdiff1d(numpy.arange(X.shape[0]), inds)

    # Every column sampled: do the exact decomposition instead
    if inds.shape[0] == X.shape[0] and (inds == numpy.arange(X.shape[0])).all():
        if scipy.sparse.issparse(X):
            X = numpy.array(X.todense())
        lmbda, V = Util.safeEigh(X)
        return lmbda, V

    tmp = X[inds, :]
    A = tmp[:, inds]
    B = tmp[:, invInds]

    if scipy.sparse.issparse(X):
        A = numpy.array(A.todense())
        BB = numpy.array((B.dot(B.T)).todense())
    else:
        BB = B.dot(B.T)

    # Following line is very slow
    # Am12 = scipy.linalg.sqrtm(numpy.linalg.pinv(A))
    Am12 = Util.matrixPowerh(A, -0.5)

    S = A + Am12.dot(BB).dot(Am12)
    # Symmetrise against round-off before the eigendecomposition
    S = (S.T + S) / 2

    lmbda, U = Util.safeEigh(S)

    # Apply the -0.5 power only to numerically nonzero eigenvalues
    tol = 10 ** -10
    lmbdaN = lmbda.copy()
    lmbdaN[numpy.abs(lmbda) < tol] = 0
    lmbdaN[numpy.abs(lmbda) > tol] = lmbdaN[numpy.abs(lmbda) > tol] ** -0.5

    V = X[:, inds].dot(Am12.dot(U) * lmbdaN)

    return lmbda, V
def addRows(U, s, V, B, k=None):
    """
    Find the SVD of a matrix [A ; B] where A = U diag(s) V.T. Uses the QR
    decomposition to find an orthogonal basis on B.

    :param U: The left singular vectors of A
    :param s: The singular values of A
    :param V: The right singular vectors of A
    :param B: The matrix to append to A
    :param k: The rank of the returned approximation (defaults to the
        number of columns of U)
    :return: (Utilde, Stilde, Vtilde), the rank-k SVD of [A ; B]
    """
    if V.shape[0] != B.shape[1]:
        raise ValueError("U must have same number of rows as B cols")
    if s.shape[0] != U.shape[1]:
        raise ValueError("Number of cols of U must be the same size as s")
    if s.shape[0] != V.shape[1]:
        raise ValueError("Number of cols of V must be the same size as s")

    # "is None" rather than "== None": identity test is the correct idiom
    if k is None:
        k = U.shape[1]

    m, p = U.shape
    r = B.shape[0]

    # Component of B.T orthogonal to the space spanned by V
    C = B.T - V.dot(V.T).dot(B.T)
    Q, R = numpy.linalg.qr(C)

    rPrime = Util.rank(C)
    Q = Q[:, 0:rPrime]
    R = R[0:rPrime, :]

    D = numpy.c_[numpy.diag(s), numpy.zeros((p, rPrime))]
    E = numpy.c_[B.dot(V), R.T]
    D = numpy.r_[D, E]

    G1 = numpy.c_[U, numpy.zeros((m, r))]
    G2 = numpy.c_[numpy.zeros((r, p)), numpy.eye(r)]
    G = numpy.r_[G1, G2]

    H = numpy.c_[V, Q]

    if __debug__:
        # Expensive sanity checks; gate behind __debug__ so production
        # runs with python -O skip the extra full-size matrix products.
        nptst.assert_array_almost_equal(G.T.dot(G), numpy.eye(G.shape[1]))
        nptst.assert_array_almost_equal(H.T.dot(H), numpy.eye(H.shape[1]))
        nptst.assert_array_almost_equal(G.dot(D).dot(H.T), numpy.r_[(U*s).dot(V.T), B])

    Uhat, sHat, Vhat = numpy.linalg.svd(D, full_matrices=False)
    inds = numpy.flipud(numpy.argsort(sHat))[0:k]
    Uhat, sHat, Vhat = Util.indSvd(Uhat, sHat, Vhat, inds)

    # The best rank k approximation of [A ; B]
    Utilde = G.dot(Uhat)
    Stilde = sHat
    Vtilde = H.dot(Vhat)

    return Utilde, Stilde, Vtilde
def testExpandIntArray(self):
    """
    Util.expandIntArray should repeat each index i exactly v[i] times and
    return an empty array for empty input.
    """
    # numpy.int was removed in NumPy 1.24; use the builtin int instead.
    v = numpy.array([1, 3, 2, 4], int)
    w = Util.expandIntArray(v)
    self.assertTrue((w == numpy.array([0, 1, 1, 1, 2, 2, 3, 3, 3, 3], int)).all())

    v = numpy.array([], int)
    w = Util.expandIntArray(v)
    self.assertTrue((w == numpy.array([], int)).all())
def testEntropy(self):
    """
    A balanced binary labelling has entropy 1; constant labellings have
    entropy 0.
    """
    # assertEquals is a deprecated alias removed in Python 3.12; use
    # assertEqual instead.
    v = numpy.array([0, 0, 0, 1, 1, 1])
    self.assertEqual(Util.entropy(v), 1)

    v = numpy.array([0, 0, 0])
    self.assertEqual(Util.entropy(v), 0)

    v = numpy.array([1, 1, 1])
    self.assertEqual(Util.entropy(v), 0)
def testEigenAdd2(self):
    """
    Check EigenUpdater.eigenAdd2 on random inputs: the returned V is
    orthonormal, the rank-k approximation reproduces
    C = A^*A + Y1 Y2^* + Y2 Y1^* exactly when k equals the rank of C,
    and the debug outputs D and DUD reconstruct the quantities the
    assertions below encode.
    """
    tol = 10**-6

    for i in range(10):
        # Random problem sizes and matrices for each trial
        m = numpy.random.randint(5, 10)
        n = numpy.random.randint(5, 10)
        p = numpy.random.randint(5, 10)
        A = numpy.random.randn(m, n)
        Y1 = numpy.random.randn(n, p)
        Y2 = numpy.random.randn(n, p)

        AA = A.conj().T.dot(A)
        Y1Y2 = Y1.dot(Y2.conj().T)
        lastError = 100  # NOTE(review): assigned but never read in any assertion

        omega, Q = numpy.linalg.eigh(AA)
        # Sanity check: the eigendecomposition reconstructs AA
        self.assertTrue(
            numpy.linalg.norm(AA - (Q * omega).dot(Q.conj().T)) < tol)
        C = AA + Y1Y2 + Y1Y2.conj().T

        for k in range(1, 9):
            pi, V, D, DUD = EigenUpdater.eigenAdd2(omega, Q, Y1, Y2, k, debug=True)
            # V is "orthogonal"
            self.assertTrue(
                numpy.linalg.norm(V.conj().T.dot(V) - numpy.eye(V.shape[1])) < tol)

            # The approximation converges to the exact decomposition
            C_k = (V * pi).dot(V.conj().T)
            error = numpy.linalg.norm(C - C_k)
            if Util.rank(C) == k:
                self.assertTrue(error <= tol)
            lastError = error

            # DomegaD corresponds to AA_k
            omega_k, Q_k = Util.indEig(
                omega, Q, numpy.flipud(numpy.argsort(omega))[0:k])
            DomegakD = (D * numpy.c_[omega_k[numpy.newaxis, :], numpy.zeros(
                (1, max(D.shape[1] - k, 0)))]).dot(
                D.conj().T)
            self.assertTrue(
                numpy.linalg.norm((Q_k * omega_k).dot(Q_k.conj().T) - DomegakD) < tol)

            # DUD is exactly decomposed
            self.assertTrue(
                numpy.linalg.norm(Y1Y2 + Y1Y2.conj().T -
                                  D.dot(DUD).dot(D.conj().T)) < tol)
def testMatrixPowerh(self):
    """
    Util.matrixPowerh should agree with directly computed matrix powers,
    inverses and square roots, on a full-rank and on a low-rank symmetric
    matrix.
    """
    A = numpy.random.rand(10, 10)
    A = A.T.dot(A)

    tol = 10**-6
    A2 = A.dot(A)
    lmbda, V = scipy.linalg.eig(A)

    A12 = Util.matrixPowerh(A, 0.5)
    self.assertTrue(numpy.linalg.norm(A12.dot(A12) - A) < tol)

    invA = numpy.linalg.inv(A)
    self.assertTrue(numpy.linalg.norm(invA - Util.matrixPowerh(A, -1)) < tol)
    self.assertTrue(numpy.linalg.norm(A - Util.matrixPowerh(A, 1)) < tol)
    self.assertTrue(numpy.linalg.norm(A2 - Util.matrixPowerh(A, 2)) < tol)
    self.assertTrue(numpy.linalg.norm(invA.dot(invA) - Util.matrixPowerh(A, -2)) < tol)

    # Now lets test on a low rank matrix: zero all but the first 5 eigenvalues
    lmbda[5:] = 0
    A = V.dot(numpy.diag(lmbda)).dot(numpy.linalg.inv(V))
    A2 = A.dot(A)
    A12 = Util.matrixPowerh(A, 0.5)
    Am12 = Util.matrixPowerh(A, -0.5)

    # For a singular matrix the -1 power should match the pseudo-inverse
    pinvA = numpy.linalg.pinv(A)
    self.assertTrue(numpy.linalg.norm(pinvA - Util.matrixPowerh(A, -1)) < tol)
    self.assertTrue(numpy.linalg.norm(pinvA - Am12.dot(Am12)) < tol)
    self.assertTrue(numpy.linalg.norm(A12.dot(A12) - A) < tol)
    self.assertTrue(numpy.linalg.norm(A - Util.matrixPowerh(A, 1)) < tol)
    self.assertTrue(numpy.linalg.norm(A2 - Util.matrixPowerh(A, 2)) < tol)
def supervisedMC23(lists, itemList, topQList, verbose=False):
    """
    A supervised version of MC2 of our own invention. The idea is to find
    a linear combination of transition matrices to fit a given one. We
    just make sure it fits the stationary distribution.

    :param lists: The ranked lists to aggregate
    :param itemList: The list of all items
    :param topQList: The list used to build the target transition matrix
    :param verbose: If True also return the individual transition matrices
    :return: (outputList, scores) or (outputList, scores, PList) if verbose
    """
    import cvxopt
    import cvxopt.solvers
    ell = len(lists)
    n = len(itemList)
    outputList, scores, PList = RankAggregator.MC2(lists, itemList, verbose=True)

    # Target transition matrix (computed once, not twice as before) and
    # its leading eigenvector; the stationary distribution is unused below
    # but kept for parity with the original computation.
    Py = RankAggregator.generateTransitionMatrix(topQList, itemList)
    u, v = scipy.sparse.linalg.eigs(Py.T, 1)
    v = numpy.array(v).flatten()

    # Bug fix: Q was previously used without ever being initialised
    # (NameError). Stack each candidate transition matrix, raveled to a
    # length n*n vector, as one column of Q.
    Q = cvxopt.matrix(numpy.zeros((n * n, ell)))
    for i, P in enumerate(PList):
        Q[:, i] = cvxopt.matrix(numpy.array(P.todense()).ravel())

    QQ = Q.T * Q

    s = numpy.array(Py.todense()).ravel()
    s = cvxopt.matrix(s)

    # QP: minimise ||Q alpha - s||^2 subject to alpha >= 0, sum(alpha) = 1
    G = cvxopt.spdiag((-numpy.ones(ell)).tolist())
    h = cvxopt.matrix(numpy.zeros(ell))
    A = cvxopt.matrix(numpy.ones(ell), (1, ell))
    b = cvxopt.matrix(numpy.ones(1))
    q = -Q.T * s

    sol = cvxopt.solvers.qp(QQ, q, G, h, A, b)
    alpha = numpy.array(sol['x'])

    #Combine the matrices
    P = numpy.zeros((n, n))
    for j, Pj in enumerate(PList):
        Util.printIteration(j, 1, ell)
        P += alpha[j] * numpy.array(Pj.todense())

    P /= ell

    outputList, scores = RankAggregator.computeOutputList(P, itemList)

    if verbose:
        return outputList, scores, PList
    else:
        return outputList, scores
def modelSelect(self, X):
    """
    Perform model selection on X and return the best parameters.

    Cross-validates over the (k, lmbda) grid given by self.ks and
    self.lmbdas using local AUC, stores the best pair on self.k and
    self.lmbda, and returns the mean and std of the local AUCs.
    """
    m, n = X.shape
    cvInds = Sampling.randCrossValidation(self.folds, X.nnz)
    localAucs = numpy.zeros((self.ks.shape[0], self.lmbdas.shape[0], len(cvInds)))

    logging.debug("Performing model selection")
    paramList = []

    for icv, (trainInds, testInds) in enumerate(cvInds):
        Util.printIteration(icv, 1, self.folds, "Fold: ")

        trainX = SparseUtils.submatrix(X, trainInds)
        testX = SparseUtils.submatrix(X, testInds)

        testOmegaList = SparseUtils.getOmegaList(testX)

        for i, k in enumerate(self.ks):
            maxLocalAuc = self.copy()
            maxLocalAuc.k = k
            paramList.append((trainX, testX, testOmegaList, maxLocalAuc))

    pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=100)
    resultsIterator = pool.imap(localAucsLmbdas, paramList, self.chunkSize)

    for icv, (trainInds, testInds) in enumerate(cvInds):
        for i, k in enumerate(self.ks):
            # next(iterator) instead of iterator.next(): the method form
            # is Python 2 only and raises AttributeError on Python 3.
            tempAucs = next(resultsIterator)
            localAucs[i, :, icv] = tempAucs

    pool.terminate()

    meanLocalAucs = numpy.mean(localAucs, 2)
    stdLocalAucs = numpy.std(localAucs, 2)

    logging.debug(meanLocalAucs)

    # Pick the (k, lmbda) cell with the largest mean local AUC
    bestInds = numpy.unravel_index(numpy.argmax(meanLocalAucs), meanLocalAucs.shape)
    k = self.ks[bestInds[0]]
    lmbda = self.lmbdas[bestInds[1]]

    logging.debug("Model parameters: k=" + str(k) + " lmbda=" + str(lmbda))

    self.k = k
    self.lmbda = lmbda

    return meanLocalAucs, stdLocalAucs
def testIncrementEigenSystem(self):
    """
    Verify that NingSpectralClustering.incrementEigenSystem approximates
    the exact generalised eigensystem of the Laplacian after a single
    edge weight change.
    """
    # Python 2 print statement replaced by the function form, which is
    # identical for a single argument on both Python 2 and 3.
    print("< testIncrementEigenSystem >")
    numVertices = 10
    graph = SparseGraph(GeneralVertexList(numVertices))

    p = 0.4
    generator = ErdosRenyiGenerator(p)
    graph = generator.generate(graph)

    W = graph.getWeightMatrix()
    L = graph.laplacianMatrix()
    degrees = graph.outDegreeSequence()
    D = numpy.diag(degrees)

    # Generalised eigenproblem L x = lambda D x with D-normalised vectors
    lmbda1, Q1 = scipy.linalg.eig(L, D)
    lmbda1 = lmbda1.real
    Q1 = Q1.dot(numpy.diag(numpy.diag(Q1.T.dot(D).dot(Q1))**-0.5))

    tol = 10**-6
    k = 3
    inds = numpy.argsort(lmbda1)[0:k]
    lmbda1, Q1 = Util.indEig(lmbda1, Q1, inds)

    # Similarity change vector: add deltaW to edge (5, 7). The unused
    # read of graph.getEdge(5, 7) has been removed.
    deltaW = 0.5

    k = 3
    clusterer = NingSpectralClustering(k)
    lmbda2Approx, Q2Approx = clusterer.incrementEigenSystem(lmbda1, Q1, scipy.sparse.csr_matrix(W), 5, 7, deltaW)

    # Compute real eigenvectors then compare against these
    Lhat = L.copy()
    Lhat[5, 5] += deltaW
    Lhat[7, 7] += deltaW
    Lhat[5, 7] -= deltaW
    Lhat[7, 5] -= deltaW
    Dhat = numpy.diag(numpy.diag(Lhat))

    lmbda2, Q2 = scipy.linalg.eig(Lhat, Dhat)
    lmbda2, Q2 = Util.indEig(lmbda2, Q2, inds)

    # Normalise all eigenvector sets to unit columns before comparing
    Q2Approx = Q2Approx.dot(numpy.diag(numpy.diag(Q2Approx.T.dot(Q2Approx))**-0.5))
    Q2 = Q2.dot(numpy.diag(numpy.sum(Q2**2, 0)**-0.5))
    Q1 = Q1.dot(numpy.diag(numpy.sum(Q1**2, 0)**-0.5))

    # Errors in the eigenvalues
    logging.debug("Eigenvalue Errors")
    logging.debug(numpy.linalg.norm(lmbda2 - lmbda2Approx))
    logging.debug(numpy.linalg.norm(lmbda2 - lmbda1))

    # Compute error according to the paper
    error = numpy.sum(1 - numpy.diag(Q2.T.dot(Q2Approx))**2)
    error2 = numpy.sum(1 - numpy.diag(Q2.T.dot(Q1))**2)
    logging.debug("Eigenvector Errors")
    logging.debug(error)
    logging.debug(error2)
def distance2(self, graph1, graph2, permutation):
    """
    Compute a graph distance metric between two graphs given a permutation
    vector:

    F(P) = (1-alpha)/(||W1||^2_F + ||W2||^2_F) ||W1 - P W2 P.T||^2_F
           + alpha/(||V1||_F^2 + ||V2||_F^2) ||V1 - P.T V2||^2_F

    and is bounded between 0 and 1.

    :param graph1: A graph object
    :param graph2: The second graph object to match
    :param permutation: An array of permutation indices matching the first to second graph
    :type permutation: `numpy.ndarray`
    """
    if self.useWeightM:
        W1, W2 = graph1.getWeightMatrix(), graph2.getWeightMatrix()
    else:
        W1, W2 = graph1.adjacencyMatrix(), graph2.adjacencyMatrix()

    # Pad the smaller weight matrix so both have the same shape
    if W1.shape[0] < W2.shape[0]:
        W1 = Util.extendArray(W1, W2.shape)
    elif W2.shape[0] < W1.shape[0]:
        W2 = Util.extendArray(W2, W1.shape)

    n = W1.shape[0]
    P = numpy.zeros((n, n))
    P[(numpy.arange(n), permutation)] = 1
    structDist = numpy.linalg.norm(W1 - P.dot(W2).dot(P.T))**2

    # Vertex-label distance under the same permutation
    V1 = graph1.getVertexList().getVertices()
    V2 = graph2.getVertexList().getVertices()

    if V1.shape[0] < V2.shape[0]:
        V1 = Util.extendArray(V1, V2.shape)
    elif V2.shape[0] < V1.shape[0]:
        V2 = Util.extendArray(V2, V1.shape)

    labelDist = numpy.sum((V1 - P.T.dot(V2))**2)

    structNorm = (W1**2).sum() + (W2**2).sum()
    labelNorm = (V1**2).sum() + (V2**2).sum()

    if structNorm != 0:
        structDist = structDist / structNorm
    if labelNorm != 0:
        labelDist = labelDist / labelNorm

    return (1 - self.alpha) * structDist + self.alpha * labelDist
def distance2(self, graph1, graph2, permutation):
    """
    Graph distance metric between two graphs under a given permutation:
    a (1-alpha) weighted, normalised Frobenius distance between the
    (permuted) weight/adjacency matrices plus an alpha weighted,
    normalised distance between the vertex label matrices. The value lies
    between 0 and 1.

    :param graph1: A graph object
    :param graph2: The second graph object to match
    :param permutation: An array of permutation indices matching the first to second graph
    :type permutation: `numpy.ndarray`
    """
    if self.useWeightM:
        W1 = graph1.getWeightMatrix()
        W2 = graph2.getWeightMatrix()
    else:
        W1 = graph1.adjacencyMatrix()
        W2 = graph2.adjacencyMatrix()

    # Zero-pad the smaller matrix so both are comparable
    if W1.shape[0] < W2.shape[0]:
        W1 = Util.extendArray(W1, W2.shape)
    elif W2.shape[0] < W1.shape[0]:
        W2 = Util.extendArray(W2, W1.shape)

    numVertices = W1.shape[0]
    P = numpy.zeros((numVertices, numVertices))
    P[(numpy.arange(numVertices), permutation)] = 1
    dist1 = numpy.linalg.norm(W1 - P.dot(W2).dot(P.T))**2

    # Now compute the vertex similarities distance
    V1 = graph1.getVertexList().getVertices()
    V2 = graph2.getVertexList().getVertices()

    if V1.shape[0] < V2.shape[0]:
        V1 = Util.extendArray(V1, V2.shape)
    elif V2.shape[0] < V1.shape[0]:
        V2 = Util.extendArray(V2, V1.shape)

    dist2 = numpy.sum((V1 - P.T.dot(V2))**2)

    # Normalise each term by the summed squared norms, guarding division
    # by zero for empty graphs
    norm1 = (W1**2).sum() + (W2**2).sum()
    norm2 = (V1**2).sum() + (V2**2).sum()

    if norm1 != 0:
        dist1 /= norm1
    if norm2 != 0:
        dist2 /= norm2

    dist = (1 - self.alpha) * dist1 + self.alpha * dist2
    return dist
def learnModel(self, X, Y):
    """
    Learn the CCA primal-dual directions.

    Builds the generalised eigenproblem A w = lambda B w from the kernel
    matrix on X and the covariance of Y, keeps the eigenvectors with
    positive eigenvalues, and normalises the resulting dual (alpha) and
    primal (V) directions.

    :param X: A matrix with examples as rows
    :param Y: A matrix of targets with examples as rows
    :return: (alpha, V, lmbdas) -- also stored on self
    """
    self.trainX = X
    self.trainY = Y
    numExamples = X.shape[0]
    numFeatures = Y.shape[1]

    # Small ridge term keeps Kx and Cyy well conditioned
    a = 10**-5
    I = numpy.eye(numExamples)
    I2 = numpy.eye(numFeatures)
    Kx = self.kernelX.evaluate(X, X) + a * I

    Kxx = numpy.dot(Kx, Kx)
    Kxy = numpy.dot(Kx, Y)
    Cyy = numpy.dot(Y.T, Y) + a * I2

    Z1 = numpy.zeros((numExamples, numExamples))
    Z2 = numpy.zeros((numFeatures, numFeatures))
    Z3 = numpy.zeros((numExamples, numFeatures))

    #Note we add a small value to the diagonal of A and B to deal with low-rank
    # A is the off-diagonal block matrix [[0, Kxy], [Kxy.T, 0]]
    A = numpy.c_[Z1, Kxy]
    A1 = numpy.c_[Kxy.T, Z2]
    A = numpy.r_[A, A1]
    A = (A + A.T) / 2  # symmetrise against round-off

    # B is the block-diagonal, tau-regularised constraint matrix
    B = numpy.c_[(1 - self.tau1) * Kxx - self.tau1 * Kx, Z3]
    B1 = numpy.c_[Z3.T, (1 - self.tau2) * Cyy - self.tau2 * I2]
    B = numpy.r_[B, B1]
    B = (B + B.T) / 2

    (D, W) = scipy.linalg.eig(A, B)
    # NOTE(review): scipy.linalg.eig returns complex eigenvalues and the
    # comparison D > 0 is applied to them directly -- presumably the
    # imaginary parts are negligible here, but confirm before relying on it.

    #Only select eigenvalues which are greater than zero
    W = W[:, D > 0]

    #We need to return those eigenvectors corresponding to positive eigenvalues
    self.alpha = W[0:numExamples, :]
    self.V = W[numExamples:, :]
    self.lmbdas = D[D > 0]

    # Normalise the directions; adding (diag < 0) guards against taking
    # the square root of a negative diagonal entry
    alphaDiag = Util.mdot(self.alpha.T, Kxx, self.alpha)
    alphaDiag = alphaDiag + numpy.array(alphaDiag < 0, numpy.int)
    vDiag = Util.mdot(self.V.T, Cyy, self.V)
    vDiag = vDiag + numpy.array(vDiag < 0, numpy.int)
    self.alpha = numpy.dot(self.alpha, numpy.diag(1 / numpy.sqrt(numpy.diag(alphaDiag))))
    self.V = numpy.dot(self.V, numpy.diag(1 / numpy.sqrt(numpy.diag(vDiag))))

    return self.alpha, self.V, self.lmbdas
def testMode(self):
    """Check Util.mode returns the most frequent value, the smallest on ties,
    and the sole element of a singleton array."""
    # assertEqual replaces the deprecated assertEquals alias throughout.
    x = numpy.array([1, 1, 1, 2, 2, 3, 3, 3, 3, 3, 5, 5])
    self.assertEqual(Util.mode(x), 3)

    # Tie between 1 and 3 (three occurrences each): expect 1.
    x = numpy.array([1, 1, 1, 2, 2, 3, 3, 3, 5, 5])
    self.assertEqual(Util.mode(x), 1)

    # All values distinct: expect the first/smallest.
    x = numpy.array([1, 2, 3, 4])
    self.assertEqual(Util.mode(x), 1)

    # Single-element array.
    x = numpy.array([0])
    self.assertEqual(Util.mode(x), 0)
def learnModel(self, X, y):
    """
    Fit a model to the examples in the rows of X with the labels in y.
    This is an abstract hook: concrete learners must override it.

    :param X: A matrix with examples as rows
    :type X: :class:`ndarray`

    :param y: A vector of labels
    :type y: :class:`ndarray`
    """
    Util.abstract()
def testMode(self):
    """Check Util.mode: most frequent value, smallest on ties, singleton."""
    # assertEqual replaces the deprecated assertEquals alias throughout.
    x = numpy.array([1, 1, 1, 2, 2, 3, 3, 3, 3, 3, 5, 5])
    self.assertEqual(Util.mode(x), 3)

    # Tie between 1 and 3 (three occurrences each): expect 1.
    x = numpy.array([1, 1, 1, 2, 2, 3, 3, 3, 5, 5])
    self.assertEqual(Util.mode(x), 1)

    # All values distinct: expect the first/smallest.
    x = numpy.array([1, 2, 3, 4])
    self.assertEqual(Util.mode(x), 1)

    # Single-element array.
    x = numpy.array([0])
    self.assertEqual(Util.mode(x), 0)
def testCumMin(self):
    """Util.cumMin should return the running (cumulative) minimum."""
    mixed = numpy.array([5, 6, 4, 5, 1])
    nptst.assert_array_equal(Util.cumMin(mixed), numpy.array([5, 5, 4, 4, 1]))

    # A strictly decreasing vector is its own running minimum.
    decreasing = numpy.array([5, 4, 3, 2, 1])
    nptst.assert_array_equal(Util.cumMin(decreasing), decreasing)

    # An increasing vector collapses to its first element repeated.
    increasing = numpy.array([1, 2, 3])
    nptst.assert_array_equal(Util.cumMin(increasing), numpy.ones(3))
def learnModel(self, graph):
    """
    Learn a prediction model based on considering ego networks as independent.
    For each ego, X contains a list of neighbours and the corresponding labels
    are the values of the edge labels. We then find the set of primal weights
    w for each ego network and then regress onto the set of weights using the
    ego labels.

    :param graph: The input graph to learn from.
    :type graph: class:`apgl.graph.AbstractSingleGraph`

    :return: W, the matrix whose rows are the per-ego primal weight vectors
    """
    logging.info("Learning model on graph of size " +
                 str(graph.getNumVertices()))
    logging.info("EgoLearner: " + str(self.egoRegressor))
    logging.info("AlterLearner: " + str(self.alterRegressor))

    allIndices = numpy.arange(0, graph.getNumVertices())
    V = graph.getVertexList().getVertices(list(allIndices))
    # W collects one weight vector per ego; Xe the matching ego features.
    W = numpy.zeros((0, graph.getVertexList().getNumFeatures()))
    Xe = numpy.zeros((0, graph.getVertexList().getNumFeatures()))
    # NOTE(review): printStep is 0 for graphs with < 10 vertices — confirm
    # Util.printIteration tolerates a zero step.
    printStep = numpy.floor(graph.getNumVertices() / 10)
    alterError = 0.0  # accumulated but currently unused (see commented line)

    for i in range(graph.getNumVertices()):
        Util.printIteration(i, printStep, graph.getNumVertices())
        neighbours = graph.neighbours(i)

        # Isolated vertices contribute no rows to W/Xe.
        if neighbours.shape[0] != 0:
            # Fit the alter regressor: neighbour features -> edge labels.
            X = V[neighbours, :]
            y = numpy.ones(X.shape[0])

            for j in range(neighbours.shape[0]):
                y[j] = graph.getEdge(i, neighbours[j])

            w = self.alterRegressor.learnModel(X, y)
            #alterError = numpy.mean(numpy.abs(self.alterRegressor.predict(X) - y))

            W = numpy.r_[W, numpy.array([w])]
            Xe = numpy.r_[Xe, numpy.array([V[i, :]])]

    #Now we need to solve least to find regressor of Xe onto W
    logging.info(
        "Finding regression matrix onto weights using matrix of size " +
        str(Xe.shape))
    gc.collect()
    #self.standardiser = Standardiser()
    #self.standardiser2 = Standardiser()
    #Xe = self.standardiser.standardiseArray(Xe)
    #W = self.standardiser2.standardiseArray(W)
    self.egoRegressor.learnModel(Xe, W)

    return W
def testRank(self):
    """Util.rank of a random m x n matrix should be min(m, n), and the rank
    of X X.T should match the rank of X."""
    # assertEqual replaces the deprecated assertEquals alias.
    X = numpy.random.rand(10, 1)
    self.assertEqual(Util.rank(X), 1)

    X = numpy.random.rand(10, 12)
    self.assertEqual(Util.rank(X), 10)

    X = numpy.random.rand(31, 12)
    self.assertEqual(Util.rank(X), 12)

    # Bug fix: the original computed K but then asserted rank(X) again,
    # leaving the Gram matrix untested.
    K = numpy.dot(X, X.T)
    self.assertEqual(Util.rank(K), 12)
def learnModel(self, graph):
    """
    Learn a prediction model based on considering ego networks as independent.
    For each ego, X contains a list of neighbours and the corresponding labels
    are the values of the edge labels. We then find the set of primal weights
    w for each ego network and then regress onto the set of weights using the
    ego labels.

    :param graph: The input graph to learn from.
    :type graph: class:`apgl.graph.AbstractSingleGraph`

    :return: W, the matrix whose rows are the per-ego primal weight vectors
    """
    logging.info("Learning model on graph of size " + str(graph.getNumVertices()))
    logging.info("EgoLearner: " + str(self.egoRegressor))
    logging.info("AlterLearner: " + str(self.alterRegressor))

    allIndices = numpy.arange(0, graph.getNumVertices())
    V = graph.getVertexList().getVertices(list(allIndices))
    # W collects one weight vector per ego; Xe the matching ego features.
    W = numpy.zeros((0, graph.getVertexList().getNumFeatures()))
    Xe = numpy.zeros((0, graph.getVertexList().getNumFeatures()))
    # NOTE(review): printStep is 0 for graphs with < 10 vertices — confirm
    # Util.printIteration tolerates a zero step.
    printStep = numpy.floor(graph.getNumVertices()/10)
    alterError = 0.0  # accumulated but currently unused (see commented line)

    for i in range(graph.getNumVertices()):
        Util.printIteration(i, printStep, graph.getNumVertices())
        neighbours = graph.neighbours(i)

        # Isolated vertices contribute no rows to W/Xe.
        if neighbours.shape[0] != 0:
            # Fit the alter regressor: neighbour features -> edge labels.
            X = V[neighbours, :]
            y = numpy.ones(X.shape[0])

            for j in range(neighbours.shape[0]):
                y[j] = graph.getEdge(i, neighbours[j])

            w = self.alterRegressor.learnModel(X, y)
            #alterError = numpy.mean(numpy.abs(self.alterRegressor.predict(X) - y))

            W = numpy.r_[W, numpy.array([w])]
            Xe = numpy.r_[Xe, numpy.array([V[i, :]])]

    #Now we need to solve least to find regressor of Xe onto W
    logging.info("Finding regression matrix onto weights using matrix of size " + str(Xe.shape))
    gc.collect()
    #self.standardiser = Standardiser()
    #self.standardiser2 = Standardiser()
    #Xe = self.standardiser.standardiseArray(Xe)
    #W = self.standardiser2.standardiseArray(W)
    self.egoRegressor.learnModel(Xe, W)

    return W
def evaluateCvOuter(self, X, Y, folds, leafRank):
    """
    Run cross validation and output some ROC curves. In this case Y is a 1D
    array.

    :param X: Example matrix, one example per row
    :param Y: 1D array of binary labels
    :param folds: Number of stratified CV folds (>= 2)
    :param leafRank: Leaf-rank setting forwarded to self.setLeafRank
    :return: (bestParams, allMetrics, bestMetaDicts) where allMetrics is
        [train AUCs, train ROCs, test AUCs, test ROCs] per fold
    """
    Parameter.checkClass(X, numpy.ndarray)
    Parameter.checkClass(Y, numpy.ndarray)
    Parameter.checkInt(folds, 2, float('inf'))
    if Y.ndim != 1:
        raise ValueError("Expecting Y to be 1D")

    # NOTE(review): cross_val.StratifiedKFold is the old scikits-learn API.
    indexList = cross_val.StratifiedKFold(Y, folds)
    self.setLeafRank(leafRank)

    bestParams = []
    bestTrainAUCs = numpy.zeros(folds)
    bestTrainROCs = []
    bestTestAUCs = numpy.zeros(folds)
    bestTestROCs = []
    bestMetaDicts = []
    i = 0

    for trainInds, testInds in indexList:
        Util.printIteration(i, 1, folds)
        trainX, trainY = X[trainInds, :], Y[trainInds]
        testX, testY = X[testInds, :], Y[testInds]

        logging.debug("Distribution of labels in train: " +
                      str(numpy.bincount(trainY)))
        logging.debug("Distribution of labels in test: " +
                      str(numpy.bincount(testY)))

        # Train on the fold, then score both train and test splits.
        self.learnModel(trainX, trainY)
        predTrainY = self.predict(trainX)
        predTestY = self.predict(testX)
        bestTrainAUCs[i] = Evaluator.auc(predTrainY, trainY)
        bestTestAUCs[i] = Evaluator.auc(predTestY, testY)

        #Store the parameters and ROC curves
        bestTrainROCs.append(Evaluator.roc(trainY, predTrainY))
        bestTestROCs.append(Evaluator.roc(testY, predTestY))

        metaDict = {}
        bestMetaDicts.append(metaDict)

        i += 1

    logging.debug("Mean test AUC = " + str(numpy.mean(bestTestAUCs)))
    logging.debug("Std test AUC = " + str(numpy.std(bestTestAUCs)))
    allMetrics = [bestTrainAUCs, bestTrainROCs, bestTestAUCs, bestTestROCs]

    return (bestParams, allMetrics, bestMetaDicts)
def learnModel(self, X, Y):
    """
    Learn the CCA primal-dual directions.

    X is handled in the dual (through self.kernelX) and Y in the primal.
    Solves the generalised eigenproblem A w = lambda B w built from the
    cross-covariance (A) and regularised auto-covariances (B), keeping only
    directions with positive eigenvalues.

    :param X: Training examples for the first view, one per row
    :param Y: Training examples for the second view, one per row
    :return: (alpha, V, lmbdas) — dual directions for X, primal directions
        for Y, and the corresponding positive eigenvalues
    """
    self.trainX = X
    self.trainY = Y
    numExamples = X.shape[0]
    numFeatures = Y.shape[1]

    # Small ridge term keeps the kernel/covariance matrices invertible.
    a = 10**-5
    I = numpy.eye(numExamples)
    I2 = numpy.eye(numFeatures)
    Kx = self.kernelX.evaluate(X, X) + a*I

    Kxx = numpy.dot(Kx, Kx)
    Kxy = numpy.dot(Kx, Y)
    Cyy = numpy.dot(Y.T, Y) + a*I2

    Z1 = numpy.zeros((numExamples, numExamples))
    Z2 = numpy.zeros((numFeatures, numFeatures))
    Z3 = numpy.zeros((numExamples, numFeatures))

    #Note we add a small value to the diagonal of A and B to deal with low-rank
    # A = [[0, Kxy], [Kxy.T, 0]] couples the two views.
    A = numpy.c_[Z1, Kxy]
    A1 = numpy.c_[Kxy.T, Z2]
    A = numpy.r_[A, A1]
    # Symmetrise to cancel floating-point asymmetry before eig.
    A = (A+A.T)/2

    # B holds the tau-regularised within-view (co)variances.
    B = numpy.c_[(1-self.tau1)*Kxx - self.tau1*Kx, Z3]
    B1 = numpy.c_[Z3.T, (1-self.tau2)*Cyy - self.tau2*I2]
    B = numpy.r_[B, B1]
    B = (B+B.T)/2

    (D, W) = scipy.linalg.eig(A, B)

    #Only select eigenvalues which are greater than zero
    W = W[:, D>0]

    #We need to return those eigenvectors corresponding to positive eigenvalues
    self.alpha = W[0:numExamples, :]
    self.V = W[numExamples:, :]
    self.lmbdas = D[D>0]

    # Rescale directions so alpha.T Kxx alpha and V.T Cyy V have unit
    # diagonals; the `< 0` correction nudges non-positive diagonal entries
    # (numerical error) above zero before the sqrt.
    # NOTE(review): numpy.int is a deprecated alias of the builtin int.
    alphaDiag = Util.mdot(self.alpha.T, Kxx, self.alpha)
    alphaDiag = alphaDiag + numpy.array(alphaDiag < 0, numpy.int)
    vDiag = Util.mdot(self.V.T, Cyy, self.V)
    vDiag = vDiag + numpy.array(vDiag < 0, numpy.int)
    self.alpha = numpy.dot(self.alpha, numpy.diag(1/numpy.sqrt(numpy.diag(alphaDiag))))
    self.V = numpy.dot(self.V, numpy.diag(1/numpy.sqrt(numpy.diag(vDiag))))

    return self.alpha, self.V, self.lmbdas
def growTree(self, X, y, argsortX, startId):
    """
    Grow a tree using a stack. Given a sample of data and a node index, we
    find the best split and add children to the tree accordingly. We perform
    pre-pruning based on the penalty.

    :param X: Example matrix, one example per row
    :param y: Label vector aligned with the rows of X
    :param argsortX: Precomputed per-feature argsort of X (used by the
        split search)
    :param startId: Node id at which to start growing
    """
    eps = 10**-4
    idStack = [startId]

    while len(idStack) != 0:
        nodeId = idStack.pop()
        node = self.tree.getVertex(nodeId)
        accuracies, thresholds = findBestSplitRisk(
            self.minSplit, X, y, node.getTrainInds(), argsortX)

        #Choose best feature based on gains
        # eps breaks exact ties so randomChoice has positive mass everywhere.
        accuracies += eps
        bestFeatureInd = Util.randomChoice(accuracies)[0]
        bestThreshold = thresholds[bestFeatureInd]

        nodeInds = node.getTrainInds()
        bestLeftInds = numpy.sort(nodeInds[numpy.arange(nodeInds.shape[0])[
            X[:, bestFeatureInd][nodeInds] < bestThreshold]])
        bestRightInds = numpy.sort(nodeInds[numpy.arange(
            nodeInds.shape[0])[
                X[:, bestFeatureInd][nodeInds] >= bestThreshold]])

        #The split may have 0 items in one set, so don't split
        # Bug fix: test emptiness via shape[0]; the original used
        # inds.sum() != 0, which is falsely zero for the non-empty
        # single-element index set [0].
        if bestLeftInds.shape[0] != 0 and bestRightInds.shape[0] != 0 \
                and self.tree.depth() < self.maxDepth:
            node.setError(1 - accuracies[bestFeatureInd])
            node.setFeatureInd(bestFeatureInd)
            node.setThreshold(bestThreshold)

            leftChildId = self.getLeftChildId(nodeId)
            leftChild = DecisionNode(bestLeftInds, Util.mode(y[bestLeftInds]))
            self.tree.addChild(nodeId, leftChildId, leftChild)

            # Only recurse into children large enough to split again.
            if leftChild.getTrainInds().shape[0] >= self.minSplit:
                idStack.append(leftChildId)

            rightChildId = self.getRightChildId(nodeId)
            rightChild = DecisionNode(bestRightInds,
                                      Util.mode(y[bestRightInds]))
            self.tree.addChild(nodeId, rightChildId, rightChild)

            if rightChild.getTrainInds().shape[0] >= self.minSplit:
                idStack.append(rightChildId)
def supervisedMC22(lists, itemList, topQList, verbose=False):
    """
    A supervised version of MC2 of our own invention. The idea is to find a
    linear combination of transition matrices to fit a given one.

    Solves the QP min_alpha ||Q alpha - s||^2 subject to alpha >= 0 and
    sum(alpha) = 1, where the columns of Q are the vectorised candidate
    transition matrices and s is the vectorised target matrix built from
    topQList.

    :param lists: Ranked lists to aggregate
    :param itemList: The universe of items appearing in the lists
    :param topQList: Supervised target ranking used to build the target
        transition matrix
    :param verbose: If True additionally return the candidate matrices PList
    """
    import cvxopt
    import cvxopt.solvers
    ell = len(lists)
    n = len(itemList)
    outputList, scores, PList = RankAggregator.MC2(lists, itemList,
                                                   verbose=True)
    # Stack each candidate transition matrix as one column of Q (n*n x ell).
    Q = cvxopt.spmatrix([], [], [], (n*n, len(lists)))

    for i, P in enumerate(PList):
        #print(P.todense())
        Q[:, i] = cvxopt.matrix(numpy.array(P.todense()).ravel())

    QQ = Q.T * Q

    # Target transition matrix from the supervised ranking, vectorised.
    Py = RankAggregator.generateTransitionMatrix(topQList, itemList)
    s = numpy.array(Py.todense()).ravel()
    s = cvxopt.matrix(s)

    # QP constraints: G, h encode alpha >= 0; A, b encode sum(alpha) == 1.
    G = cvxopt.spdiag((-numpy.ones(ell)).tolist())
    h = cvxopt.matrix(numpy.zeros(ell))

    A = cvxopt.matrix(numpy.ones(ell), (1, ell))
    b = cvxopt.matrix(numpy.ones(1))

    # Linear term of 0.5 x'QQx + q'x, i.e. the least-squares objective.
    q = -Q.T * s

    sol = cvxopt.solvers.qp(QQ, q, G, h, A, b)

    alpha = numpy.array(sol['x'])

    #Combine the matrices
    P = numpy.zeros((n, n))
    for j, Pj in enumerate(PList):
        Util.printIteration(j, 1, ell)
        P += alpha[j] * numpy.array(Pj.todense())

    # NOTE(review): alpha is constrained to sum to 1, so this extra division
    # by ell looks suspicious — confirm it is intentional.
    P /= ell

    outputList, scores = RankAggregator.computeOutputList(P, itemList)

    if verbose:
        return outputList, scores, PList
    else:
        return outputList, scores
def modelSelect(self, X):
    """
    Perform model selection on X and return the best parameters.

    Runs randomised cross-validation over the grid self.ks x self.lmbdas,
    computing local AUCs for each fold in parallel, then stores the best
    (k, lmbda) pair on self.

    :param X: Sparse matrix of training data
    :return: (meanLocalAucs, stdLocalAucs) over the folds
    """
    m, n = X.shape
    cvInds = Sampling.randCrossValidation(self.folds, X.nnz)
    localAucs = numpy.zeros((self.ks.shape[0], self.lmbdas.shape[0],
                             len(cvInds)))

    logging.debug("Performing model selection")
    paramList = []

    # Build one task per (fold, k); each task sweeps all lmbdas internally.
    for icv, (trainInds, testInds) in enumerate(cvInds):
        Util.printIteration(icv, 1, self.folds, "Fold: ")

        trainX = SparseUtils.submatrix(X, trainInds)
        testX = SparseUtils.submatrix(X, testInds)

        testOmegaList = SparseUtils.getOmegaList(testX)

        for i, k in enumerate(self.ks):
            maxLocalAuc = self.copy()
            maxLocalAuc.k = k
            paramList.append((trainX, testX, testOmegaList, maxLocalAuc))

    pool = multiprocessing.Pool(processes=self.numProcesses,
                                maxtasksperchild=100)
    resultsIterator = pool.imap(localAucsLmbdas, paramList, self.chunkSize)
    #import itertools
    #resultsIterator = itertools.imap(localAucsLmbdas, paramList)

    # Drain results in the same (fold, k) order the tasks were submitted.
    # NOTE(review): resultsIterator.next() is Python 2 syntax (next(...) in
    # Python 3).
    for icv, (trainInds, testInds) in enumerate(cvInds):
        for i, k in enumerate(self.ks):
            tempAucs = resultsIterator.next()
            localAucs[i, :, icv] = tempAucs

    pool.terminate()

    meanLocalAucs = numpy.mean(localAucs, 2)
    stdLocalAucs = numpy.std(localAucs, 2)

    logging.debug(meanLocalAucs)

    # Pick the (k, lmbda) cell with the best mean local AUC.
    k = self.ks[numpy.unravel_index(numpy.argmax(meanLocalAucs),
                                    meanLocalAucs.shape)[0]]
    lmbda = self.lmbdas[numpy.unravel_index(numpy.argmax(meanLocalAucs),
                                            meanLocalAucs.shape)[1]]

    logging.debug("Model parameters: k=" + str(k) + " lmbda=" + str(lmbda))

    self.k = k
    self.lmbda = lmbda

    return meanLocalAucs, stdLocalAucs
def addCols2(U, s, V, B):
    """
    Find the SVD of a matrix [A, B] where A = U diag(s) V.T. Uses the SVD
    decomposition to find an orthogonal basis on B.

    :param U: The left singular vectors of A
    :param s: The singular values of A
    :param V: The right singular vectors of A
    :param B: The matrix to append to A
    :return: (Utilde, sTilde, Vtilde), the rank-k SVD factors of [A, B]
    """
    if U.shape[0] != B.shape[0]:
        raise ValueError("U must have same number of rows as B")
    if s.shape[0] != U.shape[1]:
        raise ValueError("Number of cols of U must be the same size as s")
    if s.shape[0] != V.shape[1]:
        raise ValueError("Number of cols of V must be the same size as s")

    m, k = U.shape
    r = B.shape[1]
    n = V.shape[0]

    # Project B onto the orthogonal complement of range(U) and take its SVD
    # to obtain an orthonormal basis Ubar for the new directions.
    C = numpy.dot(numpy.eye(m) - numpy.dot(U, U.T), B)
    Ubar, sBar, Vbar = numpy.linalg.svd(C, full_matrices=False)
    inds = numpy.flipud(numpy.argsort(sBar))[0:k]
    Ubar, sBar, Vbar = Util.indSvd(Ubar, sBar, Vbar, inds)

    rPrime = Ubar.shape[1]

    # Small core matrix D = [[diag(s), U.T B], [0, diag(sBar) Vbar.T]] whose
    # SVD yields the update in the combined basis.
    D = numpy.r_[numpy.diag(s), numpy.zeros((rPrime, k))]
    E = numpy.r_[numpy.dot(U.T, B), numpy.diag(sBar).dot(Vbar.T)]
    D = numpy.c_[D, E]

    Uhat, sHat, Vhat = numpy.linalg.svd(D, full_matrices=False)
    inds = numpy.flipud(numpy.argsort(sHat))[0:k]
    Uhat, sHat, Vhat = Util.indSvd(Uhat, sHat, Vhat, inds)

    #The best rank k approximation of [A, B]
    Utilde = numpy.dot(numpy.c_[U, Ubar], Uhat)
    sTilde = sHat

    # Extend V with an identity block for the r new columns, then rotate.
    G1 = numpy.r_[V, numpy.zeros((r, k))]
    G2 = numpy.r_[numpy.zeros((n ,r)), numpy.eye(r)]
    Vtilde = numpy.dot(numpy.c_[G1, G2], Vhat)

    return Utilde, sTilde, Vtilde
def modelSelect(self, X):
    """
    Perform model selection on X and return the best parameters.

    Runs randomised cross-validation over self.ks, computing a precision
    score per (k, fold), then stores the best k on self.

    :param X: Sparse matrix of training data
    :return: (meanPrecisions, stdPrecisions) over the folds
    """
    m, n = X.shape
    cvInds = Sampling.randCrossValidation(self.folds, X.nnz)
    precisions = numpy.zeros((self.ks.shape[0], len(cvInds)))

    logging.debug("Performing model selection")
    paramList = []

    # Build one task per (fold, k).
    for icv, (trainInds, testInds) in enumerate(cvInds):
        Util.printIteration(icv, 1, self.folds, "Fold: ")

        trainX = SparseUtils.submatrix(X, trainInds)
        testX = SparseUtils.submatrix(X, testInds)

        testOmegaList = SparseUtils.getOmegaList(testX)

        for i, k in enumerate(self.ks):
            learner = self.copy()
            learner.k = k
            paramList.append((trainX, testX, testOmegaList, learner))

    # Serial evaluation; the multiprocessing variant is kept for reference.
    #pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=100)
    #resultsIterator = pool.imap(computePrecision, paramList, self.chunkSize)
    # NOTE(review): itertools.imap / .next() are Python 2 constructs.
    import itertools
    resultsIterator = itertools.imap(computePrecision, paramList)

    # Drain results in the same (fold, k) submission order.
    for icv, (trainInds, testInds) in enumerate(cvInds):
        for i, k in enumerate(self.ks):
            tempPrecision = resultsIterator.next()
            precisions[i, icv] = tempPrecision

    #pool.terminate()

    meanPrecisions = numpy.mean(precisions, 1)
    stdPrecisions = numpy.std(precisions, 1)

    logging.debug(meanPrecisions)

    # Pick the k with the best mean precision.
    k = self.ks[numpy.argmax(meanPrecisions)]

    logging.debug("Model parameters: k=" + str(k))

    self.k = k

    return meanPrecisions, stdPrecisions
def addCols2(U, s, V, B):
    """
    Find the SVD of a matrix [A, B] where A = U diag(s) V.T. Uses the SVD
    decomposition to find an orthogonal basis on B.

    :param U: The left singular vectors of A
    :param s: The singular values of A
    :param V: The right singular vectors of A
    :param B: The matrix to append to A
    :return: (Utilde, sTilde, Vtilde), the rank-k SVD factors of [A, B]
    """
    if U.shape[0] != B.shape[0]:
        raise ValueError("U must have same number of rows as B")
    if s.shape[0] != U.shape[1]:
        raise ValueError("Number of cols of U must be the same size as s")
    if s.shape[0] != V.shape[1]:
        raise ValueError("Number of cols of V must be the same size as s")

    m, k = U.shape
    r = B.shape[1]
    n = V.shape[0]

    # Project B onto the orthogonal complement of range(U) and take its SVD
    # to obtain an orthonormal basis Ubar for the new directions.
    C = numpy.dot(numpy.eye(m) - numpy.dot(U, U.T), B)
    Ubar, sBar, Vbar = numpy.linalg.svd(C, full_matrices=False)
    inds = numpy.flipud(numpy.argsort(sBar))[0:k]
    Ubar, sBar, Vbar = Util.indSvd(Ubar, sBar, Vbar, inds)

    rPrime = Ubar.shape[1]

    # Small core matrix D = [[diag(s), U.T B], [0, diag(sBar) Vbar.T]] whose
    # SVD yields the update in the combined basis.
    D = numpy.r_[numpy.diag(s), numpy.zeros((rPrime, k))]
    E = numpy.r_[numpy.dot(U.T, B), numpy.diag(sBar).dot(Vbar.T)]
    D = numpy.c_[D, E]

    Uhat, sHat, Vhat = numpy.linalg.svd(D, full_matrices=False)
    inds = numpy.flipud(numpy.argsort(sHat))[0:k]
    Uhat, sHat, Vhat = Util.indSvd(Uhat, sHat, Vhat, inds)

    #The best rank k approximation of [A, B]
    Utilde = numpy.dot(numpy.c_[U, Ubar], Uhat)
    sTilde = sHat

    # Extend V with an identity block for the r new columns, then rotate.
    G1 = numpy.r_[V, numpy.zeros((r, k))]
    G2 = numpy.r_[numpy.zeros((n, r)), numpy.eye(r)]
    Vtilde = numpy.dot(numpy.c_[G1, G2], Vhat)

    return Utilde, sTilde, Vtilde
def learnModel(self, graph):
    """
    Learn a prediction model based on all of the edges of the input graph.
    For each ego, X contains a list of neighbours and non-neighbours in the
    same ratio, and y = 1 when for a neighbour otherwise -1. We then find the
    set of primal weights w for each ego network and then regress onto the
    set of weights using the ego labels.

    One can either learn by comparing neighbours and non-neighbours, or
    alternatively using the labels of edges and making prediction on
    unlabelled edges.

    :param graph: The input graph to learn from.
    :type graph: class:`apgl.graph.AbstractSingleGraph`

    :param randomNegLabel: How to compute edge labels, False means use the
    labels themselves, and True means randomly pick non-neighbours to have -1
    labels
    :type randomNegLabel: class `bool`
    """
    Parameter.checkInt(self.windowSize, 1, graph.getNumVertices())
    self.graph = graph
    logging.info("Learning model on graph of size " +
                 str(graph.getNumVertices()))

    allIndices = numpy.arange(0, graph.getNumVertices())
    V = graph.getVertexList().getVertices(allIndices)
    # W collects one weight vector per ego; Xe the matching ego features.
    W = numpy.zeros((0, graph.getVertexList().getNumFeatures()))
    Xe = numpy.zeros((0, graph.getVertexList().getNumFeatures()))
    # NOTE(review): printStep is 0 for graphs with < 10 vertices — confirm
    # Util.printIteration tolerates a zero step.
    printStep = numpy.floor(graph.getNumVertices() / 10)

    for i in range(graph.getNumVertices()):
        Util.printIteration(i, printStep, graph.getNumVertices())
        neighbours = graph.neighbours(i)

        if neighbours.shape[0] != 0:
            # Sample as many non-neighbours as there are neighbours so the
            # per-ego training set is balanced.
            compNeighbours = numpy.setdiff1d(allIndices, neighbours)
            perm = numpy.random.permutation(
                compNeighbours.shape[0])[0:neighbours.shape[0]]
            negativeVertices = V[compNeighbours[perm], :]
            X = numpy.r_[V[neighbours, :], negativeVertices]
            # +1 for neighbours, -1 for the sampled non-neighbours.
            y = numpy.ones(X.shape[0])
            y[neighbours.shape[0]:] = -1

            w = self.alterRegressor.learnModel(X, y)
            W = numpy.r_[W, numpy.array([w])]
            Xe = numpy.r_[Xe, numpy.array([V[i, :]])]

    #Now we need to solve least to find regressor of Xe onto W
    self.egoRegressor.learnModel(Xe, W)
def generate_data_file(dir, nb_user=None):
    """
    Generate per-user-count data files in `dir` by successively filtering the
    largest existing file down to smaller user counts.

    :param dir: Directory holding the gzipped data files
    :param nb_user: Target number of users (None means the full file)
    """
    # NOTE(review): this function body appears corrupted in the repository —
    # the first logging call was mangled into "******" and the definition of
    # f_data_name is missing. Restore from version control before use.
    logging.debug("nb_user: "******"creating file " + str(f_data_name))
    shutil.copy(BemolData.get_file_name(dir, None), f_data_name)

    # other files to generate
    nb_user_to_generate = []
    current_nb_user = BemolData.get_nb_user_to_read(nb_user)
    logging.debug("current_nb_user before while: " + str(current_nb_user))
    # !!!!! security failure TOCTTOU
    # Collect the user counts for which no file exists yet, largest last.
    while (not os.path.exists(BemolData.get_file_name(dir, current_nb_user))):
        logging.debug("current_nb_user in while: " + str(current_nb_user))
        nb_user_to_generate.append(current_nb_user)
        current_nb_user = BemolData.get_nb_user_to_read(current_nb_user+1)
    nb_user_to_generate.reverse()

    # generate other files
    for current_nb_user in nb_user_to_generate:
        # read data: each file is produced by filtering the next-larger one.
        f_existing_data_name = BemolData.get_file_name(dir, current_nb_user+1)
        f_to_create_data_name = BemolData.get_file_name(dir, current_nb_user)
        logging.info("creating file " + f_to_create_data_name)

        dict_user = MyDictionary()

        try:
            f_existing_data = gzip.open(f_existing_data_name, 'rb')
            f_to_create_data = gzip.open(f_to_create_data_name, 'wb')
            i = 0
            i_max = BemolData.get_nb_line(f_existing_data_name)
            for line in f_existing_data:
                Util.printIteration(i, 1000, i_max);
                i += 1
                # Line format: four whitespace-separated integers; group(1)
                # is the user id, kept only while its index is in range.
                m = re.match("(\d+)\s(\d+)\s(\d+)\s(\d+)", line)
                if dict_user.index(int(m.group(1))) < current_nb_user:
                    f_to_create_data.write(line)
        except IOError as error:
            if error.filename == f_existing_data:
                raise RGIOError(error, RGIOError.indent() +
                                'it disappeared in the meanwhile')
            else:
                raise error
def recommendAtk(U, V, k, blockSize=1000, omegaList=None, verbose=False):
    """
    Compute the matrix Z = U V^T blockwise and find the k largest indices
    for each row.

    :param U: Left factor matrix, one row per user
    :param V: Right factor matrix, one row per item
    :param k: Number of items to rank per row
    :param blockSize: Number of rows of U processed per block
    :param omegaList: Optional per-row arrays of training items to exclude;
        excluded rows are compacted and padded with -1 at the end
    :param verbose: If True also return the scores of the ranked items
    :return: orderedItems (and scores when verbose)
    """
    # Bug fix: the original defined a local `blocksize = 1000` which
    # shadowed and silently ignored the blockSize parameter.
    numBlocks = int(ceil(U.shape[0] / float(blockSize)))
    orderedItems = numpy.zeros((U.shape[0], k), numpy.int32)
    # `float` replaces the deprecated numpy.float alias (same dtype).
    scores = numpy.zeros((U.shape[0], k), float)

    for j in range(numBlocks):
        logging.debug("Block " + str(j) + " of " + str(numBlocks))
        endInd = min(U.shape[0], (j + 1) * blockSize)
        # Scores for this block of users against all items.
        UV = U[j * blockSize:endInd, :].dot(V.T)
        orderedItems[j * blockSize:endInd, :] = Util.argmaxN(UV, k)
        rowInds = numpy.repeat(numpy.arange(endInd - j * blockSize), k)
        colInds = orderedItems[j * blockSize:endInd, :].flatten()
        scores[j * blockSize:endInd, :] = numpy.reshape(
            UV[rowInds, colInds], (endInd - j * blockSize, k))

        #Now delete items in omegaList if given
        if omegaList is not None:
            for i in range(j * blockSize, endInd):
                # Keep only unseen items; pad the tail with -1 sentinels.
                nonTrainItems = orderedItems[i, :][numpy.logical_not(
                    numpy.in1d(orderedItems[i, :], omegaList[i]))]
                orderedItems[i, 0:nonTrainItems.shape[0]] = nonTrainItems
                orderedItems[i, nonTrainItems.shape[0]:] = -1

    if verbose:
        return orderedItems, scores
    else:
        return orderedItems
def testSvd(self):
    """RandomisedSVD.svd: factors are orthonormal, the approximation error
    decreases with k, and the exact rank-2k truncation does at least as
    well."""
    n = 100
    m = 80
    A = scipy.sparse.rand(m, n, 0.1)

    q = 2
    lastError = numpy.linalg.norm(A.todense())

    for k in [10, 20, 30, 40]:
        U, s, V = RandomisedSVD.svd(A, k, q)

        # Both factor matrices must have orthonormal columns.
        nptst.assert_array_almost_equal(U.T.dot(U), numpy.eye(k))
        nptst.assert_array_almost_equal(V.T.dot(V), numpy.eye(k))

        approx = (U * s).dot(V.T)
        error = numpy.linalg.norm(A - approx)
        self.assertTrue(error <= lastError)
        lastError = error

    #Compare versus exact svd
    U, s, V = numpy.linalg.svd(numpy.array(A.todense()))
    topInds = numpy.flipud(numpy.argsort(s))[0:k * 2]
    U, s, V = Util.indSvd(U, s, V, topInds)
    exactApprox = (U * s).dot(V.T)
    error2 = numpy.linalg.norm(A - exactApprox)
    self.assertTrue(error2 <= error)
def matrixSimilarity(self, V1, V2):
    """
    Compute a vertex similarity matrix C, such that the ijth entry is the
    matching score between V1_i and V2_j, where larger is a better match.
    """
    # Normalise both vertex sets jointly so their features are comparable.
    combined = numpy.r_[V1, V2]
    combined = Standardiser().normaliseArray(combined)

    numV1 = V1.shape[0]
    V1 = combined[0:numV1, :]
    V2 = combined[numV1:, :]

    # Similarity is the pairwise distance flipped and rescaled into [0, 1].
    D = Util.distanceMatrix(V1, V2)

    maxD = numpy.max(D)
    minD = numpy.min(D)

    if maxD == minD:
        # All distances equal: every pair matches equally well.
        return numpy.ones((V1.shape[0], V2.shape[0]))

    return (maxD - D) / (maxD - minD)
def runRandomChoice():
    """Micro-benchmark: repeatedly index random entries of the expanded
    array."""
    # NOTE(review): `v` is a global defined elsewhere in the file; the lookup
    # result is discarded, so only the indexing cost is exercised.
    #can just do non-zero entries
    w = Util.expandIntArray(v)

    reps = 10000
    for i in range(reps):
        w[numpy.random.randint(0, w.shape[0])]
def testSvdSoft(self):
    """SparseUtils.svdSoft should match a soft-thresholded numpy SVD, for
    both sparse and dense inputs."""
    A = scipy.sparse.rand(10, 10, 0.2)
    A = A.tocsc()

    lmbda = 0.2
    U, s, V = SparseUtils.svdSoft(A, lmbda)
    ATilde = U.dot(numpy.diag(s)).dot(V.T)

    #Now compute the same matrix using numpy
    A = A.todense()

    U2, s2, V2 = numpy.linalg.svd(A)
    inds = numpy.flipud(numpy.argsort(s2))
    inds = inds[s2[inds] > lmbda]
    U2, s2, V2 = Util.indSvd(U2, s2, V2, inds)

    # Soft threshold the reference singular values.
    s2 = s2 - lmbda
    # Bug fix: clip s2 (the original clipped s, clobbering the reference).
    s2 = numpy.clip(s2, 0, numpy.max(s2))

    ATilde2 = U2.dot(numpy.diag(s2)).dot(V2.T)

    # Bug fix: compare s against s2 (the original asserted s == s, which is
    # vacuously true).
    nptst.assert_array_almost_equal(s, s2)
    nptst.assert_array_almost_equal(ATilde, ATilde2)

    #Now run svdSoft with a numpy array
    U3, s3, V3 = SparseUtils.svdSoft(A, lmbda)
    # Bug fix: reconstruct from the dense-input factors (the original reused
    # U, s, V, so the dense path was never really checked).
    ATilde3 = U3.dot(numpy.diag(s3)).dot(V3.T)

    nptst.assert_array_almost_equal(s, s3)
    nptst.assert_array_almost_equal(ATilde3, ATilde2)
def svdSoft(X, lmbda, kmax=None):
    """
    Find the partial SVD of the sparse or dense matrix X, for which singular
    values are >= lmbda. Soft threshold the resulting singular values so that
    s <- max(s - lambda, 0)

    :param X: A sparse or dense matrix
    :param lmbda: Soft-threshold level; singular values below it are dropped
    :param kmax: Maximum subspace size passed to the PROPACK solver (sparse
        path only)
    :return: (U, s, V) with thresholded singular values s
    """
    if scipy.sparse.issparse(X):
        # Sparse path: PROPACK partial SVD via a linear operator.
        k = min(X.shape[0], X.shape[1])
        L = scipy.sparse.linalg.aslinearoperator(X)

        U, s, V = SparseUtils.svdPropack(L, k, kmax=kmax)
        V = V.T
    else:
        # Dense path: full SVD from LAPACK.
        U, s, V = numpy.linalg.svd(X)

    # Keep only singular values >= lmbda, in decreasing order.
    inds = numpy.flipud(numpy.argsort(s))
    inds = inds[s[inds] >= lmbda]
    U, s, V = Util.indSvd(U, s, V, inds)

    #Soft threshold
    # (clip is a no-op here since every kept value is >= lmbda, but it
    # guards against floating point dips below zero)
    if s.shape[0] != 0:
        s = s - lmbda
        s = numpy.clip(s, 0, numpy.max(s))

    return U, s, V
def testAddRows(self):
    """SVDUpdate.addRows should produce orthonormal factors, reproduce the
    full SVD at full rank, match the rank-k truncated SVD, and agree with
    addCols on the transposed problem."""
    #Test case when k = rank
    Utilde, Stilde, Vtilde = SVDUpdate.addRows(self.U, self.s, self.V,
                                               self.C)
    # Factors must have orthonormal columns and retain rank k.
    nptst.assert_array_almost_equal(Utilde.T.dot(Utilde),
                                    numpy.eye(Utilde.shape[1]))
    nptst.assert_array_almost_equal(Vtilde.T.dot(Vtilde),
                                    numpy.eye(Vtilde.shape[1]))
    self.assertEquals(Stilde.shape[0], self.k)

    #Check we get the original solution with full SVD
    U, s, V = numpy.linalg.svd(self.A)
    inds = numpy.flipud(numpy.argsort(s))
    U, s, V = Util.indSvd(U, s, V, inds)

    Utilde, Stilde, Vtilde = SVDUpdate.addRows(U, s, V, self.C)
    D = numpy.r_[self.A, self.C]
    # With a full-rank input the update must reconstruct [A; C] exactly.
    nptst.assert_array_almost_equal(D, (Utilde * Stilde).dot(Vtilde.T), 4)

    #Check solution for partial rank SVD
    k = 20
    U, s, V = numpy.linalg.svd(self.A)
    inds = numpy.flipud(numpy.argsort(s))[0:k]
    U, s, V = Util.indSvd(U, s, V, inds)

    Utilde, Stilde, Vtilde = SVDUpdate.addRows(U, s, V, self.C)
    # Reference: rank-k truncated SVD of the stacked matrix.
    D = numpy.r_[(U * s).dot(V.T), self.C]
    U, s, V = numpy.linalg.svd(D)
    inds = numpy.flipud(numpy.argsort(s))[0:k]
    U, s, V = Util.indSvd(U, s, V, inds)

    nptst.assert_array_almost_equal((U * s).dot(V.T),
                                    (Utilde * Stilde).dot(Vtilde.T), 4)

    #Test if same as add cols
    # addRows on C should equal addCols on C.T with U/V swapped.
    U, s, V = numpy.linalg.svd(self.A)
    inds = numpy.flipud(numpy.argsort(s))[0:k]
    U, s, V = Util.indSvd(U, s, V, inds)

    Utilde, sTilde, Vtilde = SVDUpdate.addRows(U, s, V, self.C)
    Vtilde2, sTilde2, Utilde2 = SVDUpdate.addCols(V, s, U, self.C.T)

    nptst.assert_array_almost_equal((Utilde * sTilde).dot(Vtilde.T),
                                    (Utilde2 * sTilde2).dot(Vtilde2.T))