Example #1
    def __updateEigenSystem(self, lmbda, Q, deltaW, W):
        """
        Given the eigenvalues lmbda, the eigenvectors Q and a deltaW matrix of
        weight changes, compute the sequence of incidence vectors and update the
        eigensystem. deltaW is the change in edges from the current weight
        matrix, which is given by W.
        """
        changeInds = deltaW.nonzero()

        for s in range(changeInds[0].shape[0]):
            Util.printIteration(s, 10, changeInds[0].shape[0])
            i = changeInds[0][s]
            j = changeInds[1][s]
            if i >= j:  # only consider upper-triangular changes (W is symmetric)
                continue

            assert deltaW[i, j] != 0
            #            if deltaW[i, j] < 0:
            #                logging.warn(" deltaW is usually positive (here deltaW=" +str(deltaW[i, j]) + ")")

            #Note: update W at each iteration here
            lmbda, Q = self.incrementEigenSystem(lmbda, Q, W, i, j, deltaW[i, j])
            W[i, j] += deltaW[i, j]
            W[j, i] += deltaW[i, j]

        return lmbda, Q
Example #2
    def readAuthorsAndDocuments(self, useAbstract=True): 
        logging.debug("About to read file " + self.dataFilename)
        inFile = open(self.dataFilename)  
        authorList = []
        citationList = []
        documentList = []
                    
        lastAbstract = ""
        lastVenue = ""
        lastTitle = ""    
        lastAuthors = []     
        lastCitationNo = 0                
                    
        for i, line in enumerate(inFile):
            Util.printIteration(i, self.stepSize, self.numLines)
                
            #Match the fields in the file 
            emptyLine = line == "\n"
            title = re.findall("#\*(.*)", line)
            currentAuthors = re.findall("#@(.*)", line)  
            abstract = re.findall("#!(.*)", line)
            venue = re.findall("#conf(.*)", line)
            citationNo = re.findall("#citation(.*)", line)
            
            if emptyLine:
                if useAbstract: 
                    document = lastTitle + " " + lastAbstract 
                else: 
                    document = lastTitle     
                documentList.append(document) 
                authorList.append(lastAuthors)
                citationList.append(lastCitationNo)

                lastAbstract = ""
                lastTitle = ""
                lastAuthors = []
                lastCitationNo = 0   
 
            if len(title) != 0 and len(title[0]) != 0: 
                lastTitle = title[0]
                
            if len(venue) != 0 and len(venue[0]) != 0: 
                lastVenue = venue[0]  
            
            if len(abstract) != 0 and len(abstract[0]) != 0: 
                lastAbstract = abstract[0]
                
            if len(citationNo) != 0 and len(citationNo[0]) != 0: 
                lastCitationNo = int(citationNo[0])
                       
            if len(currentAuthors) != 0: 
                currentAuthors = currentAuthors[0].split(",")  
                currentAuthors = set([x.strip() for x in currentAuthors])
                currentAuthors = currentAuthors.difference(set([""]))
                lastAuthors = currentAuthors                     

        inFile.close() 
        logging.debug("Finished reading " + str(len(documentList)) + " articles")  
        
        return authorList, documentList, citationList
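The regexes above imply one field per line, with a blank line terminating each record. A minimal, self-contained sketch of the assumed record format (the field values here are made up) and of extracting two of the fields:

import re

# Hypothetical sample record in the format the parser above assumes;
# a blank line terminates each record.
sample = ("#*Spectral Clustering on Graphs\n"
          "#@A. Author,B. Author\n"
          "#conf KDD\n"
          "#citation 42\n"
          "#!We study incremental eigen-updates.\n"
          "\n")

for line in sample.splitlines(True):
    title = re.findall(r"#\*(.*)", line)
    currentAuthors = re.findall(r"#@(.*)", line)
    if title:
        print(title[0])                      # Spectral Clustering on Graphs
    if currentAuthors:
        print(currentAuthors[0].split(","))  # ['A. Author', 'B. Author']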
Example #3
    def __updateEigenSystem(self, lmbda, Q, deltaW, W):
        """
        Given the eigenvalues lmbda, the eigenvectors Q and a deltaW matrix of
        weight changes, compute the sequence of incidence vectors and update the
        eigensystem. deltaW is the change in edges from the current weight
        matrix, which is given by W.
        """
        changeInds = deltaW.nonzero()

        for s in range(changeInds[0].shape[0]):
            Util.printIteration(s, 10, changeInds[0].shape[0])
            i = changeInds[0][s]
            j = changeInds[1][s]
            if i >= j:  # only consider upper-triangular changes (W is symmetric)
                continue

            assert deltaW[i, j] != 0
#            if deltaW[i, j] < 0:
#                logging.warn(" deltaW is usually positive (here deltaW=" +str(deltaW[i, j]) + ")")

            #Note: update W at each iteration here
            lmbda, Q = self.incrementEigenSystem(lmbda, Q, W, i, j, deltaW[i,j])
            W[i, j] += deltaW[i, j]
            W[j, i] += deltaW[i, j]
        
        return lmbda, Q 
Example #4
    def predictEdges(self, vertexIndices):
        """
        Make predictions for a series of edges using the score
        s(x, y) = \sum_{z \in n(x) \cap n(y)} 1/log(|n(z)|),
        where n(x) denotes the neighbour set of x. Returns matrices P and S: each
        row of P is a ranked list of vertices of length self.windowSize, and S
        holds the corresponding scores.
        """

        Parameter.checkInt(self.windowSize, 1, self.graph.getNumVertices())
        logging.info("Running predictEdges in " + str(self.__class__.__name__))

        P = numpy.zeros((vertexIndices.shape[0], self.windowSize))
        S = numpy.zeros((vertexIndices.shape[0], self.windowSize))
        W = self.graph.getWeightMatrix()


        for i in range(vertexIndices.shape[0]):
            Util.printIteration(i, self.printStep, vertexIndices.shape[0])
            scores = numpy.zeros(self.graph.getNumVertices())

            for j in range(0, self.graph.getNumVertices()):
                commonNeighbours = numpy.nonzero(W[vertexIndices[i], :] * W[j, :])[0]

                for k in commonNeighbours:
                    q = numpy.log(numpy.nonzero(W[k, :])[0].shape[0])
                    if q != 0:
                        scores[j] = scores[j] + 1/q


            P[i, :], S[i, :] = self.indicesFromScores(vertexIndices[i], scores)

        return P, S
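The inner loops compute, for each candidate vertex j, an Adamic/Adar-style score: the sum of 1/log(degree(z)) over the common neighbours z of the query vertex and j. A self-contained sketch of that score on a toy symmetric weight matrix:

import numpy

W = numpy.array([[0, 1, 1, 0],
                 [1, 0, 1, 1],
                 [1, 1, 0, 0],
                 [0, 1, 0, 0]], numpy.float64)

x, y = 0, 3
commonNeighbours = numpy.nonzero(W[x, :] * W[y, :])[0]  # neighbours of both x and y

score = 0.0
for z in commonNeighbours:
    q = numpy.log(numpy.nonzero(W[z, :])[0].shape[0])   # log degree of z
    if q != 0:
        score += 1/q

print(score)  # vertex 1 is the only common neighbour, so score = 1/log(3)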
Example #5
    def evaluate(self, g1, g2, debug=False):
        """
        Find the kernel evaluation between two graphs
        """
        #W1 is always the smallest graph
        if g1.getNumVertices() > g2.getNumVertices():
            return self.evaluate(g2, g1, debug)

        #We ought to have something that makes the matrices the same size 
        W1, W2 = self.__getWeightMatrices(g1, g2)
        K1, K2 = self.__getKernelMatrices(g1, g2)

        #Find common eigenspace
        S1, U = numpy.linalg.eigh(self.tau*W1 + (1-self.tau)*K1)
        S2, V = numpy.linalg.eigh(self.tau*W2 + (1-self.tau)*K2)

        #Find approximate diagonals
        SK1 = numpy.diag(Util.mdot(U.T, K1, U))
        SW1 = numpy.diag(Util.mdot(U.T, W1, U))
        SK2 = numpy.diag(Util.mdot(V.T, K2, V))
        SW2 = numpy.diag(Util.mdot(V.T, W2, V))

        evaluation = self.tau * numpy.dot(SW1, SW2) + (1-self.tau)*numpy.dot(SK1, SK2)
        
        if debug:
            P = numpy.dot(V, U.T)
            f = self.getObjectiveValue(self.tau, P, g1, g2)
            return (evaluation, f, P, SW1, SW2, SK1, SK2)
        else:
            return evaluation
Example #6
    def cleanXML(self):
        """
        Take the original XML file and clean up HTML characters and & symbols. We 
        also create a list of possible matches for the experts. 
        """
        if not os.path.exists(self.xmlCleanFilename):
            logging.debug("Cleaning XML")
            h = HTMLParser.HTMLParser()

            inFile = open(self.xmlFileName)
            outFile = open(self.xmlCleanFilename, "w")
            i = 0

            for line in inFile:
                Util.printIteration(i, self.stepSize, self.numLines)
                outLine = h.unescape(line).replace("&", "&amp;")
                outLine = re.sub(r"<title>.*[<>].*</title>", "<title>Default Title</title>", outLine)
                outLine = re.sub(r"<ee>.*[<>].*</ee>", "<ee>Default text</ee>", outLine)
                outFile.write(outLine)
                i += 1

            inFile.close()
            outFile.close()
            logging.debug("All done")
        else:
            logging.debug("File already generated: " + self.xmlCleanFilename)
Example #7
    def simulateModel(theta):
        """
        The parameter t is the particle index. 
        """
        logging.debug("theta=" + str(theta))
 
        #We start with the observed graph at the start date 
        graph = targetGraph.subgraph(targetGraph.removedIndsAt(startDate)) 
        graph.addVertices(M-graph.size)

        p = Util.powerLawProbs(alpha, zeroVal)
        hiddenDegSeq = Util.randomChoice(p, graph.getNumVertices())
        
        featureInds = numpy.ones(graph.vlist.getNumFeatures(), numpy.bool)
        featureInds[HIVVertices.dobIndex] = False 
        featureInds[HIVVertices.infectionTimeIndex] = False 
        featureInds[HIVVertices.hiddenDegreeIndex] = False 
        featureInds[HIVVertices.stateIndex] = False
        featureInds = numpy.arange(featureInds.shape[0])[featureInds]
        matcher = GraphMatch(matchAlg, alpha=matchAlpha, featureInds=featureInds, useWeightM=False)
        graphMetrics = HIVGraphMetrics2(targetGraph, breakSize, matcher, float(endDate))
        
        recordStep = (endDate-startDate)/float(numRecordSteps)
        rates = HIVRates(graph, hiddenDegSeq)
        model = HIVEpidemicModel(graph, rates, T=float(endDate), T0=float(startDate), metrics=graphMetrics)
        model.setRecordStep(recordStep)
        model.setParams(theta)
        
        model.simulate() 
    
        objective = model.objective()
        return objective
Example #8
    def predict(self, X):
        """
        Make a prediction for a set of examples given as the rows of the matrix X.

        :param X: A matrix with examples as rows
        :type X: :class:`ndarray`
        """
        Util.abstract()
Example #9
    def predict(self, X):
        """
        Make a prediction for a set of examples given as the rows of the matrix X.

        :param X: A matrix with examples as rows
        :type X: :class:`ndarray`
        """
        Util.abstract()
Example #10
    def testIncrementEigenSystem(self):
        print "< testIncrementEigenSystem >"
        numVertices = 10
        graph = SparseGraph(GeneralVertexList(numVertices))

        p = 0.4
        generator = ErdosRenyiGenerator(p)
        graph = generator.generate(graph)

        W = graph.getWeightMatrix()
        L = graph.laplacianMatrix()
        degrees = graph.outDegreeSequence()
        D = numpy.diag(degrees)

        lmbda1, Q1 = scipy.linalg.eig(L, D)
        lmbda1 = lmbda1.real
        Q1 = Q1.dot(numpy.diag(numpy.diag(Q1.T.dot(D).dot(Q1))**-0.5))

        tol = 10**-6
        k = 3
        inds = numpy.argsort(lmbda1)[0:k]
        lmbda1, Q1 = Util.indEig(lmbda1, Q1, inds)

        #Similarity change vector
        w = graph.getEdge(5, 7)
        deltaW = 0.5

        k = 3
        clusterer = NingSpectralClustering(k)
        lmbda2Approx, Q2Approx = clusterer.incrementEigenSystem(
            lmbda1, Q1, scipy.sparse.csr_matrix(W), 5, 7, deltaW)

        #Compute real eigenvectors then compare against these
        Lhat = L.copy()
        Lhat[5, 5] += deltaW
        Lhat[7, 7] += deltaW
        Lhat[5, 7] -= deltaW
        Lhat[7, 5] -= deltaW
        Dhat = numpy.diag(numpy.diag(Lhat))
        lmbda2, Q2 = scipy.linalg.eig(Lhat, Dhat)
        lmbda2, Q2 = Util.indEig(lmbda2, Q2, inds)

        Q2Approx = Q2Approx.dot(
            numpy.diag(numpy.diag(Q2Approx.T.dot(Q2Approx))**-0.5))
        Q2 = Q2.dot(numpy.diag(numpy.sum(Q2**2, 0)**-0.5))
        Q1 = Q1.dot(numpy.diag(numpy.sum(Q1**2, 0)**-0.5))

        #Errors in the eigenvalues
        logging.debug("Eigenvalue Errors")
        logging.debug(numpy.linalg.norm(lmbda2 - lmbda2Approx))
        logging.debug(numpy.linalg.norm(lmbda2 - lmbda1))

        #Compute error according to the paper
        error = numpy.sum(1 - numpy.diag(Q2.T.dot(Q2Approx))**2)
        error2 = numpy.sum(1 - numpy.diag(Q2.T.dot(Q1))**2)
        logging.debug("Eigenvector Errors")
        logging.debug(error)
        logging.debug(error2)
Example #11
    def addRows(U, s, V, B, k=None):
        """
        Find the SVD of a matrix [A ; B] where  A = U diag(s) V.T. Uses the QR 
        decomposition to find an orthogonal basis for the component of B
        orthogonal to the span of V.
        
        :param U: The left singular vectors of A  
        
        :param s: The singular values of A 
        
        :param V: The right singular vectors of A 
        
        :param B: The matrix to append to A 
        """
        if V.shape[0] != B.shape[1]:
            raise ValueError("U must have same number of rows as B cols")
        if s.shape[0] != U.shape[1]:
            raise ValueError("Number of cols of U must be the same size as s")
        if s.shape[0] != V.shape[1]:
            raise ValueError("Number of cols of V must be the same size as s")

        if k is None:
            k = U.shape[1]
        m, p = U.shape
        r = B.shape[0]

        C = B.T - V.dot(V.T).dot(B.T)
        Q, R = numpy.linalg.qr(C)

        rPrime = Util.rank(C)
        Q = Q[:, 0:rPrime]
        R = R[0:rPrime, :]

        D = numpy.c_[numpy.diag(s), numpy.zeros((p, rPrime))]
        E = numpy.c_[B.dot(V), R.T]
        D = numpy.r_[D, E]

        G1 = numpy.c_[U, numpy.zeros((m, r))]
        G2 = numpy.c_[numpy.zeros((r, p)), numpy.eye(r)]
        G = numpy.r_[G1, G2]

        H = numpy.c_[V, Q]

        nptst.assert_array_almost_equal(G.T.dot(G), numpy.eye(G.shape[1]))
        nptst.assert_array_almost_equal(H.T.dot(H), numpy.eye(H.shape[1]))
        nptst.assert_array_almost_equal(
            G.dot(D).dot(H.T), numpy.r_[(U * s).dot(V.T), B])

        Uhat, sHat, Vhat = numpy.linalg.svd(D, full_matrices=False)
        inds = numpy.flipud(numpy.argsort(sHat))[0:k]
        Uhat, sHat, Vhat = Util.indSvd(Uhat, sHat, Vhat, inds)

        #The best rank k approximation of [A ; B]
        Utilde = G.dot(Uhat)
        Stilde = sHat
        Vtilde = H.dot(Vhat)

        return Utilde, Stilde, Vtilde
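As a quick numerical check (a sketch, assuming addRows is importable from its module), the update with the default k reproduces the singular values of the stacked matrix [A ; B] exactly:

import numpy

numpy.random.seed(21)
A = numpy.random.rand(8, 5)
B = numpy.random.rand(3, 5)

# SVD of A, then append the rows of B via the update
U, s, VT = numpy.linalg.svd(A, full_matrices=False)
Utilde, sTilde, Vtilde = addRows(U, s, VT.T, B)

# Compare with a direct SVD of [A ; B]; the difference should be ~1e-15
sDirect = numpy.linalg.svd(numpy.r_[A, B], compute_uv=False)
print(numpy.linalg.norm(sDirect - sTilde))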
Example #12
    def testExpandIntArray(self):
        v = numpy.array([1, 3, 2, 4], numpy.int)
        w = Util.expandIntArray(v)

        self.assertTrue((w == numpy.array([0,1,1,1,2,2,3,3,3,3], numpy.int)).all())

        v = numpy.array([], numpy.int)
        w = Util.expandIntArray(v)
        self.assertTrue((w == numpy.array([], numpy.int)).all())
Example #13
    def eigpsd(X, n):
        """
        Find the eigenvalues and eigenvectors of a positive semi-definite symmetric matrix.
        The input matrix X can be a numpy array or a scipy sparse matrix. In the case that
        n==X.shape[0] we convert to an ndarray. 

        :param X: The matrix to find the eigenvalues of.
        :type X: :class:`ndarray`

        :param n: If n is an int, then it is the number of columns to sample otherwise n is an array of column indices.

        :return lmbda: The set of eigenvalues 
        :return V: The matrix of eigenvectors as a ndarray
        """
        if type(n) == int:
            n = min(n, X.shape[0])
            inds = numpy.sort(numpy.random.permutation(X.shape[0])[0:n])
        elif type(n) == numpy.ndarray:
            inds = numpy.sort(n)
        else:
            raise ValueError("Invalid n value: " + str(n))

        invInds = numpy.setdiff1d(numpy.arange(X.shape[0]), inds)

        if inds.shape[0] == X.shape[0] and (inds == numpy.arange(
                X.shape[0])).all():
            if scipy.sparse.issparse(X):
                X = numpy.array(X.todense())
            lmbda, V = Util.safeEigh(X)
            return lmbda, V

        tmp = X[inds, :]
        A = tmp[:, inds]
        B = tmp[:, invInds]

        if scipy.sparse.issparse(X):
            A = numpy.array(A.todense())
            BB = numpy.array((B.dot(B.T)).todense())
        else:
            BB = B.dot(B.T)

        #Following line is very slow
        #Am12 = scipy.linalg.sqrtm(numpy.linalg.pinv(A))
        Am12 = Util.matrixPowerh(A, -0.5)
        S = A + Am12.dot(BB).dot(Am12)
        S = (S.T + S) / 2

        lmbda, U = Util.safeEigh(S)

        tol = 10**-10
        lmbdaN = lmbda.copy()
        lmbdaN[numpy.abs(lmbda) < tol] = 0
        lmbdaN[numpy.abs(lmbda) > tol] = lmbdaN[numpy.abs(lmbda) > tol]**-0.5

        V = X[:, inds].dot(Am12.dot(U) * lmbdaN)

        return lmbda, V
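A short usage sketch (assuming Util is importable and eigpsd is one of its static methods): approximate the eigendecomposition of a small PSD matrix from half of its columns, Nystrom-style, and check the reconstruction error:

import numpy

numpy.random.seed(21)
Z = numpy.random.rand(20, 20)
X = Z.dot(Z.T)                    # positive semi-definite

lmbda, V = Util.eigpsd(X, 10)     # sample 10 of the 20 columns
Xhat = (V * lmbda).dot(V.T)       # low-rank Nystrom reconstruction
print(numpy.linalg.norm(X - Xhat) / numpy.linalg.norm(X))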
Example #14
    def evaluateLearn(X, y, idx, learnModel, predict, metricMethod, progress=True):
        """
        Evaluate this learning algorithm using the given list of training/test splits 
        The metricMethod is a method which takes (predictedY, realY) as input
        and returns a metric about the quality of the evaluation.

        :param X: A matrix with examples as rows 
        :type X: :class:`ndarray`

        :param y: A vector of labels 
        :type y: :class:`ndarray`

        :param idx: A list of training/test splits 
        :type idx: :class:`list`

        :param learnModel: A function such that learnModel(X, y) finds a mapping from X to y 
        :type learnModel: :class:`function`

        :param predict: A function such that predict(X) makes predictions for X
        :type predict: :class:`function`

        :param metricMethod: A function such that metricMethod(predY, testY) returns the quality of predicted labels predY
        :type metricMethod: :class:`function`

        Output: an array containing the metric evaluated on each fold.
        """
        #Parameter.checkClass(idx, list)
        Parameter.checkClass(X, numpy.ndarray)
        Parameter.checkArray(X, softCheck=True)
        Parameter.checkInt(X.shape[0], 1, float('inf'))
        Parameter.checkClass(y, numpy.ndarray)
        Parameter.checkArray(y, softCheck=True)

        if y.ndim != 1:
            raise ValueError("Dimention of y must be 1")
        
        i = 0
        metrics = numpy.zeros(len(idx))
        logging.debug("EvaluateLearn: Using " + str(len(idx)) + " splits on " + str(X.shape[0]) + " examples")

        for idxtr, idxts in idx:
            if progress:
                Util.printConciseIteration(i, 1, len(idx))

            trainX, testX = X[idxtr, :], X[idxts, :]
            trainY, testY = y[idxtr], y[idxts]
            #logging.debug("Distribution of labels in evaluateLearn train: " + str(numpy.bincount(trainY)))
            #logging.debug("Distribution of labels in evaluateLearn test: " + str(numpy.bincount(testY)))

            learnModel(trainX, trainY)
            predY = predict(testX)
            gc.collect()

            metrics[i] = metricMethod(predY, testY)
            i += 1

        return metrics
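A sketch of driving evaluateLearn (assuming it is importable) with a trivial mean predictor, two hand-made train/test splits, and mean absolute error as the metric:

import numpy

X = numpy.random.rand(10, 3)
y = numpy.random.rand(10)

state = {}
learnModel = lambda trainX, trainY: state.update(mean=trainY.mean())
predict = lambda testX: numpy.repeat(state["mean"], testX.shape[0])
metricMethod = lambda predY, testY: numpy.abs(predY - testY).mean()

idx = [(numpy.arange(0, 8), numpy.arange(8, 10)),
       (numpy.arange(2, 10), numpy.arange(0, 2))]
metrics = evaluateLearn(X, y, idx, learnModel, predict, metricMethod, progress=False)
print(metrics)  # one mean absolute error per split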
Example #15
    def eigenAdd(omega, Q, Y, k):
        """
        Perform an eigen-update of the form A^*A + Y^*Y in which Y is a low-rank
        matrix and A^*A = Q Omega Q^*. We use the rank-k approximation of A^*A,
        namely Q_k Omega_k Q_k^*, and then approximate [A^*A_k + Y^*Y]_k.
        """
        #logging.debug("< eigenAdd >")
        Parameter.checkInt(k, 0, omega.shape[0])
        #if not numpy.isrealobj(omega) or not numpy.isrealobj(Q):
        #    raise ValueError("Eigenvalues and eigenvectors must be real")
        if omega.ndim != 1:
            raise ValueError("omega must be 1-d array")
        if omega.shape[0] != Q.shape[1]:
            raise ValueError("Must have same number of eigenvalues and eigenvectors")

        if __debug__:
            Parameter.checkOrthogonal(Q, tol=EigenUpdater.tol, softCheck=True, arrayInfo="input Q in eigenAdd()")

        #Taking the abs of the eigenvalues is correct
        inds = numpy.flipud(numpy.argsort(numpy.abs(omega)))

        omega, Q = Util.indEig(omega, Q, inds[numpy.abs(omega)>EigenUpdater.tol])
        Omega = numpy.diag(omega)

        YY = Y.conj().T.dot(Y)
        QQ = Q.dot(Q.conj().T)
        Ybar = Y - Y.dot(QQ)

        Pbar, sigmaBar, Qbar = numpy.linalg.svd(Ybar, full_matrices=False)
        inds = numpy.flipud(numpy.argsort(numpy.abs(sigmaBar)))
        inds = inds[numpy.abs(sigmaBar)>EigenUpdater.tol]
        Pbar, sigmaBar, Qbar = Util.indSvd(Pbar, sigmaBar, Qbar, inds)
        
        SigmaBar = numpy.diag(sigmaBar)
        Qbar = Ybar.T.dot(Pbar)
        Qbar = Qbar.dot(numpy.diag(numpy.diag(Qbar.T.dot(Qbar))**-0.5))

        r = sigmaBar.shape[0]

        YQ = Y.dot(Q)
        Zeros = numpy.zeros((r, omega.shape[0]))
        D = numpy.c_[Q, Qbar]

        YYQQ = YY.dot(QQ)
        Z = D.conj().T.dot(YYQQ + YYQQ.conj().T).dot(D)
        F = numpy.c_[numpy.r_[Omega - YQ.conj().T.dot(YQ), Zeros], numpy.r_[Zeros.T, SigmaBar.conj().dot(SigmaBar)]]
        F = F + Z 

        pi, H = scipy.linalg.eigh(F)
        inds = numpy.flipud(numpy.argsort(numpy.abs(pi)))

        H = H[:, inds[0:k]]
        pi = pi[inds[0:k]]

        V = D.dot(H)
        #logging.debug("</ eigenAdd >")
        return pi, V
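A sketch of exercising eigenAdd (assuming EigenUpdater is importable): start from the eigendecomposition of A^*A, add the low-rank term Y^*Y, and compare with a direct computation. With k equal to the full dimension the update is exact:

import numpy

numpy.random.seed(21)
A = numpy.random.randn(10, 6)
Y = numpy.random.randn(2, 6)

omega, Q = numpy.linalg.eigh(A.T.dot(A))
pi, V = EigenUpdater.eigenAdd(omega, Q, Y, k=6)

C = A.T.dot(A) + Y.T.dot(Y)
print(numpy.linalg.norm(C - (V * pi).dot(V.conj().T)))  # ~1e-13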
Example #16
    def evaluateCvOuter(self, X, Y, folds):
        """
        Run cross validation and output some ROC curves. In this case Y is a 1D array.

        :param X: A matrix with examples as rows
        :type X: :class:`ndarray`

        :param Y: A vector of labels
        :type y: :class:`ndarray`

        :param folds: The number of cross validation folds
        :type folds: :class:`int`
        """
        Parameter.checkClass(X, numpy.ndarray)
        Parameter.checkClass(Y, numpy.ndarray)
        Parameter.checkInt(folds, 2, float('inf'))
        if Y.ndim != 1:
            raise ValueError("Expecting Y to be 1D")

        indexList = cross_val.StratifiedKFold(Y, folds)

        bestParams = []
        bestTrainAUCs = numpy.zeros(folds)
        bestTrainROCs = []
        bestTestAUCs = numpy.zeros(folds)
        bestTestROCs = []
        bestMetaDicts = []
        i = 0

        for trainInds, testInds in indexList:
            Util.printIteration(i, 1, folds, "Outer CV: ")
            trainX, trainY = X[trainInds, :], Y[trainInds]
            testX, testY = X[testInds, :], Y[testInds]

            self.learnModel(trainX, trainY)
            #self.learnModelCut(trainX, trainY)

            predTrainY = self.predict(trainX)
            predTestY = self.predict(testX)
            bestTrainAUCs[i] = Evaluator.auc(predTrainY, trainY)
            bestTestAUCs[i] = Evaluator.auc(predTestY, testY)

            #Store the parameters and ROC curves
            bestTrainROCs.append(Evaluator.roc(trainY, predTrainY))
            bestTestROCs.append(Evaluator.roc(testY, predTestY))

            metaDict = {}
            bestMetaDicts.append(metaDict)

            i += 1

        logging.debug("Mean test AUC = " + str(numpy.mean(bestTestAUCs)))
        logging.debug("Std test AUC = " + str(numpy.std(bestTestAUCs)))
        allMetrics = [bestTrainAUCs, bestTrainROCs, bestTestAUCs, bestTestROCs]

        return (bestParams, allMetrics, bestMetaDicts)
Example #17
    def eigpsd(X, n):
        """
        Find the eigenvalues and eigenvectors of a positive semi-definite symmetric matrix.
        The input matrix X can be a numpy array or a scipy sparse matrix. In the case that
        n==X.shape[0] we convert to an ndarray. 

        :param X: The matrix to find the eigenvalues of.
        :type X: :class:`ndarray`

        :param n: If n is an int, then it is the number of columns to sample otherwise n is an array of column indices.

        :return lmbda: The set of eigenvalues 
        :return V: The matrix of eigenvectors as a ndarray
        """
        if type(n) == int:
            n = min(n, X.shape[0])
            inds = numpy.sort(numpy.random.permutation(X.shape[0])[0:n])
        elif type(n) == numpy.ndarray:
            inds = numpy.sort(n)
        else:
            raise ValueError("Invalid n value: " + str(n))

        invInds = numpy.setdiff1d(numpy.arange(X.shape[0]), inds)

        if inds.shape[0] == X.shape[0] and (inds == numpy.arange(X.shape[0])).all():
            if scipy.sparse.issparse(X):
                X = numpy.array(X.todense())
            lmbda, V = Util.safeEigh(X)
            return lmbda, V

        tmp = X[inds, :]
        A = tmp[:, inds]
        B = tmp[:, invInds]

        if scipy.sparse.issparse(X):
            A = numpy.array(A.todense())
            BB = numpy.array((B.dot(B.T)).todense())
        else:
            BB = B.dot(B.T)

        # Following line is very slow
        # Am12 = scipy.linalg.sqrtm(numpy.linalg.pinv(A))
        Am12 = Util.matrixPowerh(A, -0.5)
        S = A + Am12.dot(BB).dot(Am12)
        S = (S.T + S) / 2

        lmbda, U = Util.safeEigh(S)

        tol = 10 ** -10
        lmbdaN = lmbda.copy()
        lmbdaN[numpy.abs(lmbda) < tol] = 0
        lmbdaN[numpy.abs(lmbda) > tol] = lmbdaN[numpy.abs(lmbda) > tol] ** -0.5

        V = X[:, inds].dot(Am12.dot(U) * lmbdaN)

        return lmbda, V
Example #18
    def evaluateCvOuter(self, X, Y, folds):
        """
        Run cross validation and output some ROC curves. In this case Y is a 1D array.

        :param X: A matrix with examples as rows
        :type X: :class:`ndarray`

        :param Y: A vector of labels
        :type y: :class:`ndarray`

        :param folds: The number of cross validation folds
        :type folds: :class:`int`
        """
        Parameter.checkClass(X, numpy.ndarray)
        Parameter.checkClass(Y, numpy.ndarray)
        Parameter.checkInt(folds, 2, float('inf'))
        if Y.ndim != 1:
            raise ValueError("Expecting Y to be 1D")

        indexList = cross_val.StratifiedKFold(Y, folds)

        bestParams = []
        bestTrainAUCs = numpy.zeros(folds)
        bestTrainROCs = []
        bestTestAUCs = numpy.zeros(folds)
        bestTestROCs = []
        bestMetaDicts = []
        i = 0

        for trainInds, testInds in indexList:
            Util.printIteration(i, 1, folds, "Outer CV: ")
            trainX, trainY = X[trainInds, :], Y[trainInds]
            testX, testY = X[testInds, :], Y[testInds]

            self.learnModel(trainX, trainY)
            #self.learnModelCut(trainX, trainY)

            predTrainY = self.predict(trainX)
            predTestY = self.predict(testX)
            bestTrainAUCs[i] = Evaluator.auc(predTrainY, trainY)
            bestTestAUCs[i] = Evaluator.auc(predTestY, testY)

            #Store the parameters and ROC curves
            bestTrainROCs.append(Evaluator.roc(trainY, predTrainY))
            bestTestROCs.append(Evaluator.roc(testY, predTestY))

            metaDict = {}
            bestMetaDicts.append(metaDict)

            i += 1

        logging.debug("Mean test AUC = " + str(numpy.mean(bestTestAUCs)))
        logging.debug("Std test AUC = " + str(numpy.std(bestTestAUCs)))
        allMetrics = [bestTrainAUCs, bestTrainROCs, bestTestAUCs, bestTestROCs]

        return (bestParams, allMetrics, bestMetaDicts)
Example #19
    def addRows(U, s, V, B, k=None): 
        """
        Find the SVD of a matrix [A ; B] where  A = U diag(s) V.T. Uses the QR 
        decomposition to find an orthogonal basis for the component of B
        orthogonal to the span of V.
        
        :param U: The left singular vectors of A  
        
        :param s: The singular values of A 
        
        :param V: The right singular vectors of A 
        
        :param B: The matrix to append to A 
        """
        if V.shape[0] != B.shape[1]:
            raise ValueError("U must have same number of rows as B cols")
        if s.shape[0] != U.shape[1]:
            raise ValueError("Number of cols of U must be the same size as s")
        if s.shape[0] != V.shape[1]:
            raise ValueError("Number of cols of V must be the same size as s")
    
        if k is None:
            k = U.shape[1]
        m, p = U.shape
        r = B.shape[0]
        
        C = B.T - V.dot(V.T).dot(B.T)
        Q, R = numpy.linalg.qr(C)

        rPrime = Util.rank(C)
        Q = Q[:, 0:rPrime]
        R = R[0:rPrime, :]

        D = numpy.c_[numpy.diag(s), numpy.zeros((p, rPrime))]
        E = numpy.c_[B.dot(V), R.T]
        D = numpy.r_[D, E]
        
        G1 = numpy.c_[U, numpy.zeros((m, r))]
        G2 = numpy.c_[numpy.zeros((r, p)), numpy.eye(r)]
        G = numpy.r_[G1, G2]
        
        H = numpy.c_[V, Q]
        
        nptst.assert_array_almost_equal(G.T.dot(G), numpy.eye(G.shape[1])) 
        nptst.assert_array_almost_equal(H.T.dot(H), numpy.eye(H.shape[1])) 
        nptst.assert_array_almost_equal(G.dot(D).dot(H.T), numpy.r_[(U*s).dot(V.T), B])

        Uhat, sHat, Vhat = numpy.linalg.svd(D, full_matrices=False)
        inds = numpy.flipud(numpy.argsort(sHat))[0:k]
        Uhat, sHat, Vhat = Util.indSvd(Uhat, sHat, Vhat, inds)

        #The best rank k approximation of [A ; B]
        Utilde = G.dot(Uhat)
        Stilde = sHat
        Vtilde = H.dot(Vhat)

        return Utilde, Stilde, Vtilde
Example #20
    def testExpandIntArray(self):
        v = numpy.array([1, 3, 2, 4], numpy.int)
        w = Util.expandIntArray(v)

        self.assertTrue((w == numpy.array([0, 1, 1, 1, 2, 2, 3, 3, 3, 3],
                                          numpy.int)).all())

        v = numpy.array([], numpy.int)
        w = Util.expandIntArray(v)
        self.assertTrue((w == numpy.array([], numpy.int)).all())
Example #21
    def testEntropy(self):
        v = numpy.array([0, 0, 0, 1, 1, 1])

        self.assertEquals(Util.entropy(v), 1)

        v = numpy.array([0, 0, 0])
        self.assertEquals(Util.entropy(v), 0)

        v = numpy.array([1, 1, 1])
        self.assertEquals(Util.entropy(v), 0)
Example #22
    def testEntropy(self):
        v = numpy.array([0, 0, 0, 1, 1, 1])

        self.assertEquals(Util.entropy(v), 1)

        v = numpy.array([0, 0, 0])
        self.assertEquals(Util.entropy(v), 0)

        v = numpy.array([1, 1, 1])
        self.assertEquals(Util.entropy(v), 0)
Example #23
    def testEigenAdd2(self):
        tol = 10**-6

        for i in range(10):
            m = numpy.random.randint(5, 10)
            n = numpy.random.randint(5, 10)
            p = numpy.random.randint(5, 10)
            A = numpy.random.randn(m, n)
            Y1 = numpy.random.randn(n, p)
            Y2 = numpy.random.randn(n, p)

            AA = A.conj().T.dot(A)
            Y1Y2 = Y1.dot(Y2.conj().T)
            lastError = 100

            omega, Q = numpy.linalg.eigh(AA)
            self.assertTrue(
                numpy.linalg.norm(AA - (Q * omega).dot(Q.conj().T)) < tol)
            C = AA + Y1Y2 + Y1Y2.conj().T
            for k in range(1, 9):
                pi, V, D, DUD = EigenUpdater.eigenAdd2(omega, Q, Y1, Y2, k, debug=True)
                # V is "orthogonal"
                self.assertTrue(
                    numpy.linalg.norm(V.conj().T.dot(V) -
                                      numpy.eye(V.shape[1])) < tol)

                # The approximation converges to the exact decomposition
                C_k = (V * pi).dot(V.conj().T)
                error = numpy.linalg.norm(C - C_k)
                if Util.rank(C) == k:
                    self.assertTrue(error <= tol)
                lastError = error

                # DomegaD corresponds to AA_k
                omega_k, Q_k = Util.indEig(
                    omega, Q,
                    numpy.flipud(numpy.argsort(omega))[0:k])
                DomegakD = (D * numpy.c_[omega_k[numpy.newaxis, :], numpy.zeros((1, max(D.shape[1] - k, 0)))]).dot(D.conj().T)
                self.assertTrue(
                    numpy.linalg.norm((Q_k * omega_k).dot(Q_k.conj().T) -
                                      DomegakD) < tol)

                # DUD is exactly decomposed
                self.assertTrue(
                    numpy.linalg.norm(Y1Y2 + Y1Y2.conj().T -
                                      D.dot(DUD).dot(D.conj().T)) < tol)
Example #24
    def testMatrixPowerh(self):
        A = numpy.random.rand(10, 10)
        A = A.T.dot(A)            
            
        tol = 10**-6 
        A2 = A.dot(A)

        lmbda, V = scipy.linalg.eig(A)

        A12 = Util.matrixPowerh(A, 0.5)

        self.assertTrue(numpy.linalg.norm(A12.dot(A12)  - A) < tol)
        self.assertTrue(numpy.linalg.norm(numpy.linalg.inv(A) - Util.matrixPowerh(A, -1)) < tol)
        self.assertTrue(numpy.linalg.norm(A - Util.matrixPowerh(A, 1)) < tol)
        self.assertTrue(numpy.linalg.norm(A2 - Util.matrixPowerh(A, 2)) < tol)
        self.assertTrue(numpy.linalg.norm(numpy.linalg.inv(A).dot(numpy.linalg.inv(A)) - Util.matrixPowerh(A, -2)) < tol)        
        
        #Now lets test on a low rank matrix
        lmbda[5:] = 0
        A = V.dot(numpy.diag(lmbda)).dot(numpy.linalg.inv(V))
        A2 = A.dot(A)
        A12 = Util.matrixPowerh(A, 0.5)
        Am12 = Util.matrixPowerh(A, -0.5)

        
        self.assertTrue(numpy.linalg.norm(numpy.linalg.pinv(A) - Util.matrixPowerh(A, -1)) < tol)
        self.assertTrue(numpy.linalg.norm(numpy.linalg.pinv(A) - Am12.dot(Am12)) < tol)
        self.assertTrue(numpy.linalg.norm(A12.dot(A12)  - A) < tol)
        self.assertTrue(numpy.linalg.norm(A - Util.matrixPowerh(A, 1)) < tol)
        self.assertTrue(numpy.linalg.norm(A2 - Util.matrixPowerh(A, 2)) < tol)
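For reference, a minimal sketch of the semantics these assertions imply for Util.matrixPowerh (an assumed reimplementation, not the library's code): the matrix power is taken on the eigenvalues of the symmetric input, with near-zero eigenvalues treated as exactly zero so that negative powers behave like a pseudo-inverse:

import numpy

def matrixPowerhSketch(A, p, tol=10**-10):
    # Eigendecompose the symmetric matrix A and apply the power to the
    # eigenvalues, zeroing those below tol (pseudo-inverse behaviour)
    lmbda, V = numpy.linalg.eigh(A)
    lmbdaP = numpy.zeros_like(lmbda)
    keep = numpy.abs(lmbda) > tol
    lmbdaP[keep] = lmbda[keep]**p
    return (V * lmbdaP).dot(V.T)

A = numpy.random.rand(5, 5)
A = A.T.dot(A)
print(numpy.linalg.norm(matrixPowerhSketch(A, -1).dot(A) - numpy.eye(5)))  # ~0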
Example #25
    def supervisedMC23(lists, itemList, topQList, verbose=False): 
        """
        A supervised version of MC2 of our own invention. The idea is to find a 
        linear combination of transition matrices to fit a given one. We just make
        sure it fits the stationary distribution. 
        """
        import cvxopt
        import cvxopt.solvers
        ell = len(lists)
        n = len(itemList)
        outputList, scores, PList = RankAggregator.MC2(lists, itemList, verbose=True)
        
        Py = RankAggregator.generateTransitionMatrix(topQList, itemList)
        u, v = scipy.sparse.linalg.eigs(Py.T, 1)
        v = numpy.array(v).flatten()

        c = numpy.zeros(v.shape[0])

        Q = cvxopt.spmatrix([], [], [], (n*n, ell))  # initialise Q as in supervisedMC22

        for i, P in enumerate(PList): 
            Q[:, i] = cvxopt.matrix(numpy.array(P.todense()).ravel()) 
            
        c = cvxopt.matrix(c)
        QQ = Q.T * Q
        
        Py = RankAggregator.generateTransitionMatrix(topQList, itemList)
        s = numpy.array(Py.todense()).ravel()
        s = cvxopt.matrix(s)
        
        G = cvxopt.spdiag((-numpy.ones(ell)).tolist())
        h = cvxopt.matrix(numpy.zeros(ell))
        
        A = cvxopt.matrix(numpy.ones(ell), (1, ell))
        b = cvxopt.matrix(numpy.ones(1))        
                
        q = -Q.T * s  
        
        sol = cvxopt.solvers.qp(QQ, q, G, h, A, b)
        
        alpha = numpy.array(sol['x'])
        
        #Combine the matrices 
        P = numpy.zeros((n, n))       
        
        for j, Pj in enumerate(PList): 
            Util.printIteration(j, 1, ell)
            P += alpha[j] * numpy.array(Pj.todense()) 

        P /= ell 
        
        outputList, scores = RankAggregator.computeOutputList(P, itemList)
        
        if verbose: 
            return outputList, scores, PList
        else: 
            return outputList, scores        
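For clarity, the cvxopt.solvers.qp call above solves

    minimize    (1/2) alpha^T (Q^T Q) alpha - (Q^T s)^T alpha
    subject to  alpha >= 0,  1^T alpha = 1

which is the least squares fit ||Q alpha - s||^2 of the stacked transition matrices to the target transition matrix Py, with alpha constrained to the probability simplex: G and h encode alpha >= 0, while A and b encode the sum-to-one constraint.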
Example #26
    def modelSelect(self, X):
        """
        Perform model selection on X and return the best parameters. 
        """
        m, n = X.shape
        cvInds = Sampling.randCrossValidation(self.folds, X.nnz)
        localAucs = numpy.zeros(
            (self.ks.shape[0], self.lmbdas.shape[0], len(cvInds)))

        logging.debug("Performing model selection")
        paramList = []

        for icv, (trainInds, testInds) in enumerate(cvInds):
            Util.printIteration(icv, 1, self.folds, "Fold: ")

            trainX = SparseUtils.submatrix(X, trainInds)
            testX = SparseUtils.submatrix(X, testInds)

            testOmegaList = SparseUtils.getOmegaList(testX)

            for i, k in enumerate(self.ks):
                maxLocalAuc = self.copy()
                maxLocalAuc.k = k
                paramList.append((trainX, testX, testOmegaList, maxLocalAuc))

        pool = multiprocessing.Pool(processes=self.numProcesses,
                                    maxtasksperchild=100)
        resultsIterator = pool.imap(localAucsLmbdas, paramList, self.chunkSize)
        #import itertools
        #resultsIterator = itertools.imap(localAucsLmbdas, paramList)

        for icv, (trainInds, testInds) in enumerate(cvInds):
            for i, k in enumerate(self.ks):
                tempAucs = resultsIterator.next()
                localAucs[i, :, icv] = tempAucs

        pool.terminate()

        meanLocalAucs = numpy.mean(localAucs, 2)
        stdLocalAucs = numpy.std(localAucs, 2)

        logging.debug(meanLocalAucs)

        k = self.ks[numpy.unravel_index(numpy.argmax(meanLocalAucs),
                                        meanLocalAucs.shape)[0]]
        lmbda = self.lmbdas[numpy.unravel_index(numpy.argmax(meanLocalAucs),
                                                meanLocalAucs.shape)[1]]

        logging.debug("Model parameters: k=" + str(k) + " lmbda=" + str(lmbda))

        self.k = k
        self.lmbda = lmbda

        return meanLocalAucs, stdLocalAucs
Example #27
    def testIncrementEigenSystem(self):
        print "< testIncrementEigenSystem >"
        numVertices = 10
        graph = SparseGraph(GeneralVertexList(numVertices))

        p = 0.4
        generator = ErdosRenyiGenerator(p)
        graph = generator.generate(graph)

        W = graph.getWeightMatrix()
        L = graph.laplacianMatrix()
        degrees = graph.outDegreeSequence()
        D = numpy.diag(degrees)
        
        lmbda1, Q1 = scipy.linalg.eig(L, D)
        lmbda1 = lmbda1.real
        Q1 = Q1.dot(numpy.diag(numpy.diag(Q1.T.dot(D).dot(Q1))**-0.5))

        tol = 10**-6
        k = 3
        inds = numpy.argsort(lmbda1)[0:k]
        lmbda1, Q1 = Util.indEig(lmbda1, Q1, inds)

        #Similarity change vector
        w = graph.getEdge(5,7)
        deltaW = 0.5

        k = 3
        clusterer = NingSpectralClustering(k)
        lmbda2Approx, Q2Approx = clusterer.incrementEigenSystem(lmbda1, Q1, scipy.sparse.csr_matrix(W), 5, 7, deltaW)

        #Compute real eigenvectors then compare against these
        Lhat = L.copy()
        Lhat[5, 5] += deltaW
        Lhat[7, 7] += deltaW
        Lhat[5, 7] -= deltaW
        Lhat[7, 5] -= deltaW
        Dhat = numpy.diag(numpy.diag(Lhat))
        lmbda2, Q2 = scipy.linalg.eig(Lhat, Dhat)
        lmbda2, Q2 = Util.indEig(lmbda2, Q2, inds)

        Q2Approx = Q2Approx.dot(numpy.diag(numpy.diag(Q2Approx.T.dot(Q2Approx))**-0.5))
        Q2 = Q2.dot(numpy.diag(numpy.sum(Q2**2, 0)**-0.5))
        Q1 = Q1.dot(numpy.diag(numpy.sum(Q1**2, 0)**-0.5))

        #Errors in the eigenvalues
        logging.debug("Eigenvalue Errors")
        logging.debug(numpy.linalg.norm(lmbda2 - lmbda2Approx))
        logging.debug(numpy.linalg.norm(lmbda2 - lmbda1))

        #Compute error according to the paper 
        error = numpy.sum(1 - numpy.diag(Q2.T.dot(Q2Approx))**2)
        error2 = numpy.sum(1 - numpy.diag(Q2.T.dot(Q1))**2)
        logging.debug("Eigenvector Errors")
        logging.debug(error)
        logging.debug(error2)
Example #28
    def distance2(self, graph1, graph2, permutation):
        """
        Compute a graph distance metric between two graphs given a permutation 
        vector. This is given by
        F(P) = (1-alpha) ||W1 - P W2 P.T||^2_F / (||W1||^2_F + ||W2||^2_F)
             + alpha ||V1 - P.T V2||^2_F / (||V1||^2_F + ||V2||^2_F)
        and is bounded between 0 and 1. 
        
        :param graph1: A graph object 
        
        :param graph2: The second graph object to match 
        
        :param permutation: An array of permutation indices matching the first to second graph 
        :type permutation: `numpy.ndarray`
        
        """
        if self.useWeightM:
            W1 = graph1.getWeightMatrix()
            W2 = graph2.getWeightMatrix()
        else:
            W1 = graph1.adjacencyMatrix()
            W2 = graph2.adjacencyMatrix()

        if W1.shape[0] < W2.shape[0]:
            W1 = Util.extendArray(W1, W2.shape)
        elif W2.shape[0] < W1.shape[0]:
            W2 = Util.extendArray(W2, W1.shape)

        n = W1.shape[0]
        P = numpy.zeros((n, n))
        P[(numpy.arange(n), permutation)] = 1
        dist1 = numpy.linalg.norm(W1 - P.dot(W2).dot(P.T)) ** 2

        # Now compute the vertex similarities distance
        V1 = graph1.getVertexList().getVertices()
        V2 = graph2.getVertexList().getVertices()

        if V1.shape[0] < V2.shape[0]:
            V1 = Util.extendArray(V1, V2.shape)
        elif V2.shape[0] < V1.shape[0]:
            V2 = Util.extendArray(V2, V1.shape)

        dist2 = numpy.sum((V1 - P.T.dot(V2)) ** 2)

        norm1 = (W1 ** 2).sum() + (W2 ** 2).sum()
        norm2 = (V1 ** 2).sum() + (V2 ** 2).sum()

        if norm1 != 0:
            dist1 = dist1 / norm1
        if norm2 != 0:
            dist2 = dist2 / norm2

        dist = (1 - self.alpha) * dist1 + self.alpha * dist2

        return dist
Example #29
    def distance2(self, graph1, graph2, permutation):
        """
        Compute a graph distance metric between two graphs given a permutation 
        vector. This is given by
        F(P) = (1-alpha) ||W1 - P W2 P.T||^2_F / (||W1||^2_F + ||W2||^2_F)
             + alpha ||V1 - P.T V2||^2_F / (||V1||^2_F + ||V2||^2_F)
        and is bounded between 0 and 1. 
        
        :param graph1: A graph object 
        
        :param graph2: The second graph object to match 
        
        :param permutation: An array of permutation indices matching the first to second graph 
        :type permutation: `numpy.ndarray`
        
        """
        if self.useWeightM:
            W1 = graph1.getWeightMatrix()
            W2 = graph2.getWeightMatrix()
        else:
            W1 = graph1.adjacencyMatrix()
            W2 = graph2.adjacencyMatrix()

        if W1.shape[0] < W2.shape[0]:
            W1 = Util.extendArray(W1, W2.shape)
        elif W2.shape[0] < W1.shape[0]:
            W2 = Util.extendArray(W2, W1.shape)

        n = W1.shape[0]
        P = numpy.zeros((n, n))
        P[(numpy.arange(n), permutation)] = 1
        dist1 = numpy.linalg.norm(W1 - P.dot(W2).dot(P.T))**2

        #Now compute the vertex similarities distance
        V1 = graph1.getVertexList().getVertices()
        V2 = graph2.getVertexList().getVertices()

        if V1.shape[0] < V2.shape[0]:
            V1 = Util.extendArray(V1, V2.shape)
        elif V2.shape[0] < V1.shape[0]:
            V2 = Util.extendArray(V2, V1.shape)

        dist2 = numpy.sum((V1 - P.T.dot(V2))**2)

        norm1 = ((W1**2).sum() + (W2**2).sum())
        norm2 = ((V1**2).sum() + (V2**2).sum())

        if norm1 != 0:
            dist1 = dist1 / norm1
        if norm2 != 0:
            dist2 = dist2 / norm2

        dist = (1 - self.alpha) * dist1 + self.alpha * dist2

        return dist
Example #30
    def learnModel(self, X, Y):
        """
        Learn the CCA primal-dual directions.
        """
        self.trainX = X
        self.trainY = Y

        numExamples = X.shape[0]
        numFeatures = Y.shape[1]

        a = 10**-5
        I = numpy.eye(numExamples)
        I2 = numpy.eye(numFeatures)
        Kx = self.kernelX.evaluate(X, X) + a * I
        Kxx = numpy.dot(Kx, Kx)
        Kxy = numpy.dot(Kx, Y)
        Cyy = numpy.dot(Y.T, Y) + a * I2

        Z1 = numpy.zeros((numExamples, numExamples))
        Z2 = numpy.zeros((numFeatures, numFeatures))
        Z3 = numpy.zeros((numExamples, numFeatures))

        #Note: a small ridge a was added to Kx and Cyy above to deal with low rank
        A = numpy.c_[Z1, Kxy]
        A1 = numpy.c_[Kxy.T, Z2]
        A = numpy.r_[A, A1]
        A = (A + A.T) / 2  #Symmetrise to remove numerical asymmetry

        B = numpy.c_[(1 - self.tau1) * Kxx - self.tau1 * Kx, Z3]
        B1 = numpy.c_[Z3.T, (1 - self.tau2) * Cyy - self.tau2 * I2]
        B = numpy.r_[B, B1]
        B = (B + B.T) / 2

        (D, W) = scipy.linalg.eig(A, B)

        #Only select eigenvalues which are greater than zero
        W = W[:, D > 0]

        #We need to return those eigenvectors corresponding to positive eigenvalues
        self.alpha = W[0:numExamples, :]
        self.V = W[numExamples:, :]
        self.lmbdas = D[D > 0]

        alphaDiag = Util.mdot(self.alpha.T, Kxx, self.alpha)
        alphaDiag = alphaDiag + numpy.array(alphaDiag < 0, numpy.int)
        vDiag = Util.mdot(self.V.T, Cyy, self.V)
        vDiag = vDiag + numpy.array(vDiag < 0, numpy.int)
        self.alpha = numpy.dot(
            self.alpha, numpy.diag(1 / numpy.sqrt(numpy.diag(alphaDiag))))
        self.V = numpy.dot(self.V,
                           numpy.diag(1 / numpy.sqrt(numpy.diag(vDiag))))

        return self.alpha, self.V, self.lmbdas
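In matrix form, the generalised eigenproblem solved above is

    [ 0       Kxy ] [ alpha ]           [ (1-tau1) Kxx - tau1 Kx   0                      ] [ alpha ]
    [ Kxy.T   0   ] [ v     ] = lmbda * [ 0                        (1-tau2) Cyy - tau2 I  ] [ v     ]

where Kx is the kernel matrix on X (ridged by a), Kxx = Kx Kx, Kxy = Kx Y and Cyy = Y.T Y + a I, matching the A and B blocks constructed in the code; the eigenvectors with positive eigenvalues give the dual directions alpha and the primal directions V.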
Example #31
    def testMode(self):
        x = numpy.array([1, 1, 1, 2, 2, 3, 3, 3, 3, 3, 5, 5])
        self.assertEquals(Util.mode(x), 3)

        x = numpy.array([1, 1, 1, 2, 2, 3, 3, 3, 5, 5])
        self.assertEquals(Util.mode(x), 1)

        x = numpy.array([1, 2, 3, 4])
        self.assertEquals(Util.mode(x), 1)

        x = numpy.array([0])
        self.assertEquals(Util.mode(x), 0)
Example #32
    def learnModel(self, X, y):
        """
        Learn a model for a set of examples given as the rows of the matrix X,
        with corresponding labels given in the elements of 1D array y.

        :param X: A matrix with examples as rows
        :type X: :class:`ndarray`

        :param y: A vector of labels
        :type y: :class:`ndarray`
        """
        Util.abstract()
Example #33
    def testMode(self):
        x = numpy.array([1,1,1,2,2,3,3,3,3,3,5,5])
        self.assertEquals(Util.mode(x), 3)

        x = numpy.array([1,1,1,2,2,3,3,3,5,5])
        self.assertEquals(Util.mode(x), 1)

        x = numpy.array([1,2,3,4])
        self.assertEquals(Util.mode(x), 1)

        x = numpy.array([0])
        self.assertEquals(Util.mode(x), 0)
Example #34
    def testCumMin(self):
        v = numpy.array([5, 6, 4, 5, 1])
        u = Util.cumMin(v)
        nptst.assert_array_equal(u, numpy.array([5, 5, 4, 4, 1]))

        v = numpy.array([5, 4, 3, 2, 1])
        u = Util.cumMin(v)
        nptst.assert_array_equal(u, v)

        v = numpy.array([1, 2, 3])
        u = Util.cumMin(v)
        nptst.assert_array_equal(u, numpy.ones(3))
Example #35
    def learnModel(self, graph):
        """
        Learn a prediction model based on considering ego networks as independent.
        For each ego, X contains a list of neighbours and the corresponding labels
        are the values of the edge labels. We then find the set of primal weights
        w for each ego network and then regress onto the set of weights using the
        ego labels.

        :param graph: The input graph to learn from.
        :type graph: class:`apgl.graph.AbstractSingleGraph`
        """

        logging.info("Learning model on graph of size " +
                     str(graph.getNumVertices()))
        logging.info("EgoLearner: " + str(self.egoRegressor))
        logging.info("AlterLearner: " + str(self.alterRegressor))

        allIndices = numpy.arange(0, graph.getNumVertices())
        V = graph.getVertexList().getVertices(list(allIndices))
        W = numpy.zeros((0, graph.getVertexList().getNumFeatures()))
        Xe = numpy.zeros((0, graph.getVertexList().getNumFeatures()))
        printStep = numpy.floor(graph.getNumVertices() / 10)
        alterError = 0.0

        for i in range(graph.getNumVertices()):
            Util.printIteration(i, printStep, graph.getNumVertices())
            neighbours = graph.neighbours(i)

            if neighbours.shape[0] != 0:
                X = V[neighbours, :]
                y = numpy.ones(X.shape[0])

                for j in range(neighbours.shape[0]):
                    y[j] = graph.getEdge(i, neighbours[j])

                w = self.alterRegressor.learnModel(X, y)
                #alterError = numpy.mean(numpy.abs(self.alterRegressor.predict(X) - y))

                W = numpy.r_[W, numpy.array([w])]
                Xe = numpy.r_[Xe, numpy.array([V[i, :]])]

        #Now solve a least squares problem to regress Xe onto W
        logging.info(
            "Finding regression matrix onto weights using matrix of size " +
            str(Xe.shape))
        gc.collect()
        #self.standardiser = Standardiser()
        #self.standardiser2 = Standardiser()
        #Xe = self.standardiser.standardiseArray(Xe)
        #W = self.standardiser2.standardiseArray(W)
        self.egoRegressor.learnModel(Xe, W)

        return W
Example #36
    def learnModel(self, X, y):
        """
        Learn a model for a set of examples given as the rows of the matrix X,
        with corresponding labels given in the elements of 1D array y.

        :param X: A matrix with examples as rows
        :type X: :class:`ndarray`

        :param y: A vector of labels
        :type y: :class:`ndarray`
        """
        Util.abstract()
Example #37
    def testRank(self):
        X = numpy.random.rand(10, 1)
        self.assertEquals(Util.rank(X), 1)

        X = numpy.random.rand(10, 12)
        self.assertEquals(Util.rank(X), 10)

        X = numpy.random.rand(31, 12)
        self.assertEquals(Util.rank(X), 12)

        K = numpy.dot(X, X.T)
        self.assertEquals(Util.rank(K), 12)
Example #38
    def testRank(self):
        X = numpy.random.rand(10, 1)
        self.assertEquals(Util.rank(X), 1)

        X = numpy.random.rand(10, 12)
        self.assertEquals(Util.rank(X), 10)

        X = numpy.random.rand(31, 12)
        self.assertEquals(Util.rank(X), 12)

        K = numpy.dot(X, X.T)
        self.assertEquals(Util.rank(K), 12)
Example #39
    def testCumMin(self):
        v = numpy.array([5, 6, 4, 5, 1])
        u = Util.cumMin(v)
        nptst.assert_array_equal(u, numpy.array([5, 5, 4, 4, 1]))

        v = numpy.array([5, 4, 3, 2, 1])
        u = Util.cumMin(v)
        nptst.assert_array_equal(u, v)

        v = numpy.array([1, 2, 3])
        u = Util.cumMin(v)
        nptst.assert_array_equal(u, numpy.ones(3))
Example #40
    def learnModel(self, graph):
        """
        Learn a prediction model based on considering ego networks as independent.
        For each ego, X contains a list of neighbours and the corresponding labels
        are the values of the edge labels. We then find the set of primal weights
        w for each ego network and then regress onto the set of weights using the
        ego labels.

        :param graph: The input graph to learn from.
        :type graph: class:`apgl.graph.AbstractSingleGraph`
        """

        logging.info("Learning model on graph of size " + str(graph.getNumVertices()))
        logging.info("EgoLearner: " + str(self.egoRegressor))
        logging.info("AlterLearner: " + str(self.alterRegressor))

        allIndices = numpy.arange(0, graph.getNumVertices())
        V = graph.getVertexList().getVertices(list(allIndices))
        W = numpy.zeros((0, graph.getVertexList().getNumFeatures()))
        Xe = numpy.zeros((0, graph.getVertexList().getNumFeatures()))
        printStep = numpy.floor(graph.getNumVertices()/10)
        alterError = 0.0 

        for i in range(graph.getNumVertices()):
            Util.printIteration(i, printStep, graph.getNumVertices())
            neighbours = graph.neighbours(i)

            if neighbours.shape[0] != 0:
                X = V[neighbours, :]
                y = numpy.ones(X.shape[0])

                for j in range(neighbours.shape[0]):
                    y[j] = graph.getEdge(i, neighbours[j])


                w = self.alterRegressor.learnModel(X, y)
                #alterError = numpy.mean(numpy.abs(self.alterRegressor.predict(X) - y))

                W = numpy.r_[W, numpy.array([w])]
                Xe = numpy.r_[Xe, numpy.array([V[i, :]])]

        #Now solve a least squares problem to regress Xe onto W
        logging.info("Finding regression matrix onto weights using matrix of size " + str(Xe.shape))
        gc.collect()
        #self.standardiser = Standardiser()
        #self.standardiser2 = Standardiser()
        #Xe = self.standardiser.standardiseArray(Xe)
        #W = self.standardiser2.standardiseArray(W)
        self.egoRegressor.learnModel(Xe, W)


        return W 
Example #41
    def evaluateCvOuter(self, X, Y, folds, leafRank):
        """
        Run cross validation and output some ROC curves. In this case Y is a 1D array.
        """
        Parameter.checkClass(X, numpy.ndarray)
        Parameter.checkClass(Y, numpy.ndarray)
        Parameter.checkInt(folds, 2, float('inf'))
        if Y.ndim != 1:
            raise ValueError("Expecting Y to be 1D")

        indexList = cross_val.StratifiedKFold(Y, folds)
        self.setLeafRank(leafRank)

        bestParams = []
        bestTrainAUCs = numpy.zeros(folds)
        bestTrainROCs = []
        bestTestAUCs = numpy.zeros(folds)
        bestTestROCs = []
        bestMetaDicts = []
        i = 0

        for trainInds, testInds in indexList:
            Util.printIteration(i, 1, folds)
            trainX, trainY = X[trainInds, :], Y[trainInds]
            testX, testY = X[testInds, :], Y[testInds]

            logging.debug("Distribution of labels in train: " +
                          str(numpy.bincount(trainY)))
            logging.debug("Distribution of labels in test: " +
                          str(numpy.bincount(testY)))

            self.learnModel(trainX, trainY)
            predTrainY = self.predict(trainX)
            predTestY = self.predict(testX)
            bestTrainAUCs[i] = Evaluator.auc(predTrainY, trainY)
            bestTestAUCs[i] = Evaluator.auc(predTestY, testY)

            #Store the parameters and ROC curves
            bestTrainROCs.append(Evaluator.roc(trainY, predTrainY))
            bestTestROCs.append(Evaluator.roc(testY, predTestY))

            metaDict = {}
            bestMetaDicts.append(metaDict)

            i += 1

        logging.debug("Mean test AUC = " + str(numpy.mean(bestTestAUCs)))
        logging.debug("Std test AUC = " + str(numpy.std(bestTestAUCs)))
        allMetrics = [bestTrainAUCs, bestTrainROCs, bestTestAUCs, bestTestROCs]

        return (bestParams, allMetrics, bestMetaDicts)
Example #42
    def learnModel(self, X, Y):
        """
        Learn the CCA primal-dual directions.
        """
        self.trainX = X
        self.trainY = Y

        numExamples = X.shape[0]
        numFeatures = Y.shape[1]

        a = 10**-5
        I = numpy.eye(numExamples)
        I2 = numpy.eye(numFeatures)
        Kx = self.kernelX.evaluate(X, X) + a*I
        Kxx = numpy.dot(Kx, Kx)
        Kxy = numpy.dot(Kx, Y) 
        Cyy = numpy.dot(Y.T, Y) + a*I2

        Z1 = numpy.zeros((numExamples, numExamples))
        Z2 = numpy.zeros((numFeatures, numFeatures))
        Z3 = numpy.zeros((numExamples, numFeatures))

        #Note: a small ridge a was added to Kx and Cyy above to deal with low rank
        A = numpy.c_[Z1, Kxy]
        A1 = numpy.c_[Kxy.T, Z2]
        A = numpy.r_[A, A1]
        A = (A+A.T)/2 #Symmetrise to remove numerical asymmetry

        B = numpy.c_[(1-self.tau1)*Kxx - self.tau1*Kx, Z3]
        B1 = numpy.c_[Z3.T, (1-self.tau2)*Cyy - self.tau2*I2]
        B = numpy.r_[B, B1]
        B = (B+B.T)/2

        (D, W) = scipy.linalg.eig(A, B)

        #Only select eigenvalues which are greater than zero
        W = W[:, D>0]

        #We need to return those eigenvectors corresponding to positive eigenvalues
        self.alpha = W[0:numExamples, :]
        self.V = W[numExamples:, :]
        self.lmbdas = D[D>0]

        alphaDiag = Util.mdot(self.alpha.T, Kxx, self.alpha)
        alphaDiag = alphaDiag + numpy.array(alphaDiag < 0, numpy.int)
        vDiag = Util.mdot(self.V.T, Cyy, self.V)
        vDiag = vDiag + numpy.array(vDiag < 0, numpy.int)
        self.alpha = numpy.dot(self.alpha, numpy.diag(1/numpy.sqrt(numpy.diag(alphaDiag))))
        self.V = numpy.dot(self.V, numpy.diag(1/numpy.sqrt(numpy.diag(vDiag))))

        return self.alpha, self.V, self.lmbdas
Example #43
    def growTree(self, X, y, argsortX, startId):
        """
        Grow a tree using a stack. Given a sample of data and a node index, we 
        find the best split and add children to the tree accordingly. We perform 
        pre-pruning based on the penalty. 
        """
        eps = 10**-4
        idStack = [startId]

        while len(idStack) != 0:
            nodeId = idStack.pop()
            node = self.tree.getVertex(nodeId)
            accuracies, thresholds = findBestSplitRisk(self.minSplit, X, y,
                                                       node.getTrainInds(),
                                                       argsortX)

            #Choose best feature based on gains
            accuracies += eps
            bestFeatureInd = Util.randomChoice(accuracies)[0]
            bestThreshold = thresholds[bestFeatureInd]

            nodeInds = node.getTrainInds()
            featureVals = X[nodeInds, bestFeatureInd]
            bestLeftInds = numpy.sort(nodeInds[featureVals < bestThreshold])
            bestRightInds = numpy.sort(nodeInds[featureVals >= bestThreshold])

            #Don't split if either side is empty or the tree is at max depth
            if bestLeftInds.shape[0] != 0 and bestRightInds.shape[0] != 0 \
                    and self.tree.depth() < self.maxDepth:
                node.setError(1 - accuracies[bestFeatureInd])
                node.setFeatureInd(bestFeatureInd)
                node.setThreshold(bestThreshold)

                leftChildId = self.getLeftChildId(nodeId)
                leftChild = DecisionNode(bestLeftInds,
                                         Util.mode(y[bestLeftInds]))
                self.tree.addChild(nodeId, leftChildId, leftChild)

                if leftChild.getTrainInds().shape[0] >= self.minSplit:
                    idStack.append(leftChildId)

                rightChildId = self.getRightChildId(nodeId)
                rightChild = DecisionNode(bestRightInds,
                                          Util.mode(y[bestRightInds]))
                self.tree.addChild(nodeId, rightChildId, rightChild)

                if rightChild.getTrainInds().shape[0] >= self.minSplit:
                    idStack.append(rightChildId)
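The index bookkeeping in the split above is the easiest part to get wrong, so here is the same partitioning logic in isolation on a toy array (all names are illustrative):

import numpy

#Partition a node's training indices by thresholding one feature column
X = numpy.array([[0.1], [0.9], [0.4], [0.7], [0.2]])
nodeInds = numpy.array([0, 1, 2, 3, 4])
bestFeatureInd, bestThreshold = 0, 0.5

mask = X[nodeInds, bestFeatureInd] < bestThreshold
bestLeftInds = numpy.sort(nodeInds[mask])
bestRightInds = numpy.sort(nodeInds[~mask])
print(bestLeftInds, bestRightInds)   #[0 2 4] [1 3]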
Example #44
    def supervisedMC22(lists, itemList, topQList, verbose=False): 
        """
        A supervised version of MC2 of our own invention. The idea is to find a 
        linear combination of transition matrices to fit a given one. 
        """
        import cvxopt
        import cvxopt.solvers
        ell = len(lists)
        n = len(itemList)
        outputList, scores, PList = RankAggregator.MC2(lists, itemList, verbose=True)
        
        Q = cvxopt.spmatrix([], [], [], (n*n, len(lists)))

        for i, P in enumerate(PList): 
            Q[:, i] = cvxopt.matrix(numpy.array(P.todense()).ravel()) 
            
        QQ = Q.T * Q
        
        Py = RankAggregator.generateTransitionMatrix(topQList, itemList)
        s = numpy.array(Py.todense()).ravel()
        s = cvxopt.matrix(s)
        
        G = cvxopt.spdiag((-numpy.ones(ell)).tolist())
        h = cvxopt.matrix(numpy.zeros(ell))
        
        A = cvxopt.matrix(numpy.ones(ell), (1, ell))
        b = cvxopt.matrix(numpy.ones(1))        
                
        q = -Q.T * s  
        
        sol = cvxopt.solvers.qp(QQ, q, G, h, A, b)
        
        alpha = numpy.array(sol['x'])
        
        #Combine the matrices 
        P = numpy.zeros((n, n))       
        
        for j, Pj in enumerate(PList): 
            Util.printIteration(j, 1, ell)
            P += alpha[j] * numpy.array(Pj.todense()) 

        P /= ell 
        
        outputList, scores = RankAggregator.computeOutputList(P, itemList)
        
        if verbose: 
            return outputList, scores, PList
        else: 
            return outputList, scores
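The optimisation above is a standard quadratic program: minimise ||Q alpha - s||^2 subject to alpha >= 0 and sum(alpha) = 1. A minimal sketch of the same QP on random dense data, assuming only the public cvxopt.solvers.qp interface:

import numpy
import cvxopt
import cvxopt.solvers

#P = Q^T Q and q = -Q^T s give the least squares objective; G, h encode
#alpha >= 0 and A, b encode sum(alpha) = 1
numpy.random.seed(21)
Qm = numpy.random.rand(16, 3)
s = numpy.random.rand(16)

P = cvxopt.matrix(Qm.T.dot(Qm))
q = cvxopt.matrix(-Qm.T.dot(s))
G = cvxopt.matrix(-numpy.eye(3))
h = cvxopt.matrix(numpy.zeros(3))
A = cvxopt.matrix(numpy.ones((1, 3)))
b = cvxopt.matrix(numpy.ones(1))

cvxopt.solvers.options['show_progress'] = False
sol = cvxopt.solvers.qp(P, q, G, h, A, b)
alpha = numpy.array(sol['x']).ravel()
print(alpha, alpha.sum())   #non-negative weights summing to one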
Example #45
    def modelSelect(self, X): 
        """
        Perform model selection on X and return the best parameters. 
        """
        m, n = X.shape
        cvInds = Sampling.randCrossValidation(self.folds, X.nnz)
        localAucs = numpy.zeros((self.ks.shape[0], self.lmbdas.shape[0], len(cvInds)))
        
        logging.debug("Performing model selection")
        paramList = []        
        
        for icv, (trainInds, testInds) in enumerate(cvInds):
            Util.printIteration(icv, 1, self.folds, "Fold: ")

            trainX = SparseUtils.submatrix(X, trainInds)
            testX = SparseUtils.submatrix(X, testInds)
            
            testOmegaList = SparseUtils.getOmegaList(testX)
            
            for i, k in enumerate(self.ks): 
                maxLocalAuc = self.copy()
                maxLocalAuc.k = k
                paramList.append((trainX, testX, testOmegaList, maxLocalAuc))
                    
        pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=100)
        resultsIterator = pool.imap(localAucsLmbdas, paramList, self.chunkSize)
        
        for icv, (trainInds, testInds) in enumerate(cvInds):        
            for i, k in enumerate(self.ks): 
                tempAucs = next(resultsIterator)
                localAucs[i, :, icv] = tempAucs
        
        pool.terminate()
        
        meanLocalAucs = numpy.mean(localAucs, 2)
        stdLocalAucs = numpy.std(localAucs, 2)
        
        logging.debug(meanLocalAucs)
        
        k = self.ks[numpy.unravel_index(numpy.argmax(meanLocalAucs), meanLocalAucs.shape)[0]]
        lmbda = self.lmbdas[numpy.unravel_index(numpy.argmax(meanLocalAucs), meanLocalAucs.shape)[1]]
        
        logging.debug("Model parameters: k=" + str(k) + " lmbda=" + str(lmbda))
        
        self.k = k 
        self.lmbda = lmbda 
        
        return meanLocalAucs, stdLocalAucs
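The final parameter selection simply reads the argmax off the fold-averaged grid; a toy illustration with made-up AUC values:

import numpy

#Read the best (k, lmbda) pair off a fold-averaged score grid
ks = numpy.array([8, 16, 32])
lmbdas = numpy.array([0.01, 0.1, 1.0])
meanLocalAucs = numpy.array([[0.61, 0.64, 0.58],
                             [0.66, 0.71, 0.63],
                             [0.65, 0.69, 0.62]])

row, col = numpy.unravel_index(numpy.argmax(meanLocalAucs), meanLocalAucs.shape)
print(ks[row], lmbdas[col])   #16 0.1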
Example #46
    def evaluateCvOuter(self, X, Y, folds, leafRank):
        """
        Run cross validation and output some ROC curves. In this case Y is a 1D array.
        """
        Parameter.checkClass(X, numpy.ndarray)
        Parameter.checkClass(Y, numpy.ndarray)
        Parameter.checkInt(folds, 2, float('inf'))
        if Y.ndim != 1:
            raise ValueError("Expecting Y to be 1D")

        indexList = cross_val.StratifiedKFold(Y, folds)
        self.setLeafRank(leafRank)

        bestParams = []
        bestTrainAUCs = numpy.zeros(folds)
        bestTrainROCs = []
        bestTestAUCs = numpy.zeros(folds)
        bestTestROCs = []
        bestMetaDicts = []
        i = 0

        for trainInds, testInds in indexList:
            Util.printIteration(i, 1, folds)
            trainX, trainY = X[trainInds, :], Y[trainInds]
            testX, testY = X[testInds, :], Y[testInds]

            logging.debug("Distribution of labels in train: " + str(numpy.bincount(trainY)))
            logging.debug("Distribution of labels in test: " + str(numpy.bincount(testY)))

            self.learnModel(trainX, trainY)
            predTrainY = self.predict(trainX)
            predTestY = self.predict(testX)
            bestTrainAUCs[i] = Evaluator.auc(predTrainY, trainY)
            bestTestAUCs[i] = Evaluator.auc(predTestY, testY)

            #Store the parameters and ROC curves
            bestTrainROCs.append(Evaluator.roc(trainY, predTrainY))
            bestTestROCs.append(Evaluator.roc(testY, predTestY))

            metaDict = {}
            bestMetaDicts.append(metaDict)

            i += 1

        logging.debug("Mean test AUC = " + str(numpy.mean(bestTestAUCs)))
        logging.debug("Std test AUC = " + str(numpy.std(bestTestAUCs)))
        allMetrics = [bestTrainAUCs, bestTrainROCs, bestTestAUCs, bestTestROCs]

        return (bestParams, allMetrics, bestMetaDicts)
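The same evaluation loop can be written against current scikit-learn, whose StratifiedKFold and roc_auc_score replace the older cross_val and Evaluator helpers used above; a sketch on synthetic data, with LogisticRegression standing in for the learner:

import numpy
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

numpy.random.seed(21)
X = numpy.random.randn(200, 4)
Y = (X[:, 0] + 0.5*numpy.random.randn(200) > 0).astype(int)

testAUCs = []
for trainInds, testInds in StratifiedKFold(n_splits=5).split(X, Y):
    model = LogisticRegression().fit(X[trainInds], Y[trainInds])
    scores = model.predict_proba(X[testInds])[:, 1]
    testAUCs.append(roc_auc_score(Y[testInds], scores))

print(numpy.mean(testAUCs), numpy.std(testAUCs))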
Example #47
    def addCols2(U, s, V, B):
        """
        Find the SVD of a matrix [A, B] where A = U diag(s) V.T. Uses the SVD
        to find an orthogonal basis for the part of B outside the range of U.
        
        :param U: The left singular vectors of A  
        
        :param s: The singular values of A 
        
        :param V: The right singular vectors of A 
        
        :param B: The matrix to append to A 
        
        """
        if U.shape[0] != B.shape[0]:
            raise ValueError("U must have same number of rows as B")
        if s.shape[0] != U.shape[1]:
            raise ValueError("Number of cols of U must be the same size as s")
        if s.shape[0] != V.shape[1]:
            raise ValueError("Number of cols of V must be the same size as s")

        m, k = U.shape
        r = B.shape[1]
        n = V.shape[0]

        C = numpy.dot(numpy.eye(m) - numpy.dot(U, U.T), B)
        Ubar, sBar, Vbar = numpy.linalg.svd(C, full_matrices=False)
        inds = numpy.flipud(numpy.argsort(sBar))[0:k]
        Ubar, sBar, Vbar = Util.indSvd(Ubar, sBar, Vbar, inds)

        rPrime = Ubar.shape[1]

        D = numpy.r_[numpy.diag(s), numpy.zeros((rPrime, k))]
        E = numpy.r_[numpy.dot(U.T, B), numpy.diag(sBar).dot(Vbar.T)]
        D = numpy.c_[D, E]

        Uhat, sHat, Vhat = numpy.linalg.svd(D, full_matrices=False)
        inds = numpy.flipud(numpy.argsort(sHat))[0:k]
        Uhat, sHat, Vhat = Util.indSvd(Uhat, sHat, Vhat, inds)

        #The best rank k approximation of [A, B]
        Utilde = numpy.dot(numpy.c_[U, Ubar], Uhat)
        sTilde = sHat

        G1 = numpy.r_[V, numpy.zeros((r, k))]
        G2 = numpy.r_[numpy.zeros((n, r)), numpy.eye(r)]
        Vtilde = numpy.dot(numpy.c_[G1, G2], Vhat)

        return Utilde, sTilde, Vtilde
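The step that makes this update work is projecting B onto the orthogonal complement of range(U), so that [U, Ubar] remains orthonormal; a quick numpy check on random data:

import numpy

#C = (I - U U^T) B is the part of B outside range(U)
numpy.random.seed(21)
A = numpy.random.randn(20, 8)
B = numpy.random.randn(20, 4)

U = numpy.linalg.svd(A, full_matrices=False)[0]
C = (numpy.eye(20) - U.dot(U.T)).dot(B)
Ubar = numpy.linalg.svd(C, full_matrices=False)[0]

print(numpy.abs(U.T.dot(Ubar)).max())   #~1e-15: the two bases are orthogonal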
Example #48
    def modelSelect(self, X): 
        """
        Perform model selection on X and return the best parameters. 
        """
        m, n = X.shape
        cvInds = Sampling.randCrossValidation(self.folds, X.nnz)
        precisions = numpy.zeros((self.ks.shape[0], len(cvInds)))
        
        logging.debug("Performing model selection")
        paramList = []        
        
        for icv, (trainInds, testInds) in enumerate(cvInds):
            Util.printIteration(icv, 1, self.folds, "Fold: ")

            trainX = SparseUtils.submatrix(X, trainInds)
            testX = SparseUtils.submatrix(X, testInds)
            
            testOmegaList = SparseUtils.getOmegaList(testX)
            
            for i, k in enumerate(self.ks): 
                learner = self.copy()
                learner.k = k
                paramList.append((trainX, testX, testOmegaList, learner))
                    
        #Compute in serial; a multiprocessing.Pool can be swapped in as in
        #the local-AUC version of modelSelect above
        resultsIterator = iter(map(computePrecision, paramList))
        
        for icv, (trainInds, testInds) in enumerate(cvInds):        
            for i, k in enumerate(self.ks): 
                tempPrecision = next(resultsIterator)
                precisions[i, icv] = tempPrecision
        
        
        meanPrecisions = numpy.mean(precisions, 1)
        stdPrecisions = numpy.std(precisions, 1)
        
        logging.debug(meanPrecisions)
        
        k = self.ks[numpy.argmax(meanPrecisions)]

        logging.debug("Model parameters: k=" + str(k))
        
        self.k = k 
        
        return meanPrecisions, stdPrecisions
Example #50
    def learnModel(self, graph):
        """
        Learn a prediction model based on all of the edges of the input graph.
        For each ego, X contains a list of neighbours and non-neighbours in the same
        ratio, and y = 1 when for a neighbour otherwise -1. We then find the set of
        primal weights w for each ego network and then regress onto the set of weights
        using the ego labels.

        One can either learn by comparing neighbours and non-neighbours, or alternatively
        using the labels of edges and making prediction on unlabelled edges. 

        :param graph: The input graph to learn from.
        :type graph: class:`apgl.graph.AbstractSingleGraph`

        :param randomNegLabel: How to compute edge labels, False means use the labels
        themselves, and True means randomly pick non-neighbours to have -1 labels
        :type randomNegLabel: class `bool`
        """

        Parameter.checkInt(self.windowSize, 1, graph.getNumVertices())
        self.graph = graph
        logging.info("Learning model on graph of size " +
                     str(graph.getNumVertices()))

        allIndices = numpy.arange(0, graph.getNumVertices())
        V = graph.getVertexList().getVertices(allIndices)
        W = numpy.zeros((0, graph.getVertexList().getNumFeatures()))
        Xe = numpy.zeros((0, graph.getVertexList().getNumFeatures()))
        printStep = numpy.floor(graph.getNumVertices() / 10)

        for i in range(graph.getNumVertices()):
            Util.printIteration(i, printStep, graph.getNumVertices())
            neighbours = graph.neighbours(i)

            if neighbours.shape[0] != 0:
                compNeighbours = numpy.setdiff1d(allIndices, neighbours)
                perm = numpy.random.permutation(
                    compNeighbours.shape[0])[0:neighbours.shape[0]]
                negativeVertices = V[compNeighbours[perm], :]
                X = numpy.r_[V[neighbours, :], negativeVertices]
                y = numpy.ones(X.shape[0])
                y[neighbours.shape[0]:] = -1

                w = self.alterRegressor.learnModel(X, y)
                W = numpy.r_[W, numpy.array([w])]
                Xe = numpy.r_[Xe, numpy.array([V[i, :]])]

        #Now solve a least squares problem to regress the ego features Xe onto W
        self.egoRegressor.learnModel(Xe, W)
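Stage two is an ordinary least squares regression from the ego feature vectors onto the stacked weight matrix; a toy sketch with numpy.linalg.lstsq standing in for egoRegressor (an assumption, since that class is not shown):

import numpy

#Stage two: regress ego features Xe onto the stacked per-ego weights W
numpy.random.seed(21)
Xe = numpy.random.randn(30, 5)                 #ego feature vectors
W = Xe.dot(numpy.random.randn(5, 5))           #stand-in for stage-one weights

coef = numpy.linalg.lstsq(Xe, W, rcond=None)[0]
print(numpy.linalg.norm(Xe.dot(coef) - W))     #~0 on this noiseless toy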
Example #51
    def generate_data_file(dir, nb_user=None):
        logging.debug("nb_user: " + str(nb_user))

        # first file to generate (the source listing masked this span with
        # ******; reconstructed minimally from the surrounding calls)
        f_data_name = BemolData.get_file_name(dir, nb_user)
        if not os.path.exists(f_data_name):
            logging.info("creating file " + str(f_data_name))
            shutil.copy(BemolData.get_file_name(dir, None), f_data_name)

        # other files to generate
        nb_user_to_generate = []
        current_nb_user = BemolData.get_nb_user_to_read(nb_user)
        logging.debug("current_nb_user before while: " + str(current_nb_user))
        # NOTE: TOCTTOU race between this existence check and the later open
        while (not os.path.exists(BemolData.get_file_name(dir, current_nb_user))):
            logging.debug("current_nb_user in while: " + str(current_nb_user))
            nb_user_to_generate.append(current_nb_user)
            current_nb_user = BemolData.get_nb_user_to_read(current_nb_user+1)
        nb_user_to_generate.reverse()

        # generate other files
        for current_nb_user in nb_user_to_generate:
            # read data
            f_existing_data_name = BemolData.get_file_name(dir, current_nb_user+1)
            f_to_create_data_name = BemolData.get_file_name(dir, current_nb_user)
            logging.info("creating file " + f_to_create_data_name)
            dict_user = MyDictionary()
            try:
                f_existing_data = gzip.open(f_existing_data_name, 'rb')
                f_to_create_data = gzip.open(f_to_create_data_name, 'wb')

                i = 0
                i_max = BemolData.get_nb_line(f_existing_data_name)
                for line in f_existing_data:
                    Util.printIteration(i, 1000, i_max)
                    i += 1
                    m = re.match(r"(\d+)\s(\d+)\s(\d+)\s(\d+)", line)
                    if dict_user.index(int(m.group(1))) < current_nb_user:
                        f_to_create_data.write(line)
            except IOError as error:
                if error.filename == f_existing_data_name:
                    raise RGIOError(error, RGIOError.indent() + 'it disappeared in the meanwhile')
                else:
                    raise error
Example #52
    def recommendAtk(U, V, k, blockSize=1000, omegaList=None, verbose=False):
        """
        Compute the matrix Z = U V^T and then find the k largest indices for each row. 
        """
        numBlocks = int(ceil(U.shape[0] / float(blockSize)))
        orderedItems = numpy.zeros((U.shape[0], k), numpy.int32)
        scores = numpy.zeros((U.shape[0], k), float)

        for j in range(numBlocks):
            logging.debug("Block " + str(j) + " of " + str(numBlocks))
            endInd = min(U.shape[0], (j + 1) * blockSize)
            UV = U[j * blockSize:endInd, :].dot(V.T)
            orderedItems[j * blockSize:endInd, :] = Util.argmaxN(UV, k)

            rowInds = numpy.repeat(numpy.arange(endInd - j * blockSize), k)
            colInds = orderedItems[j * blockSize:endInd, :].flatten()

            scores[j * blockSize:endInd, :] = numpy.reshape(
                UV[rowInds, colInds], (endInd - j * blockSize, k))

            #Remove training items in omegaList, if given: push the remaining
            #items forwards and pad the tail with -1
            if omegaList is not None:
                for i in range(j * blockSize, endInd):
                    nonTrainItems = orderedItems[i, :][numpy.logical_not(
                        numpy.in1d(orderedItems[i, :], omegaList[i]))]
                    orderedItems[i, 0:nonTrainItems.shape[0]] = nonTrainItems
                    orderedItems[i, nonTrainItems.shape[0]:] = -1

        if verbose:
            return orderedItems, scores
        else:
            return orderedItems
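The blockwise scoring amounts to computing U V^T one row block at a time and keeping the k largest entries per row; a compact numpy sketch, with argsort standing in for Util.argmaxN (an assumption about that helper's behaviour):

import numpy

#Score one block of users: the k best items per row of U V^T
numpy.random.seed(21)
U = numpy.random.randn(6, 3)
V = numpy.random.randn(10, 3)
k = 4

UV = U.dot(V.T)
orderedItems = numpy.argsort(-UV, axis=1)[:, 0:k]
scores = numpy.take_along_axis(UV, orderedItems, axis=1)
print(orderedItems[0, :], scores[0, :])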
Example #53
    def testSvd(self):
        n = 100
        m = 80
        A = scipy.sparse.rand(m, n, 0.1)

        ks = [10, 20, 30, 40]
        q = 2

        lastError = numpy.linalg.norm(A.todense())

        for k in ks:
            U, s, V = RandomisedSVD.svd(A, k, q)

            nptst.assert_array_almost_equal(U.T.dot(U), numpy.eye(k))
            nptst.assert_array_almost_equal(V.T.dot(V), numpy.eye(k))
            A2 = (U*s).dot(V.T)

            error = numpy.linalg.norm(A - A2)
            self.assertTrue(error <= lastError)
            lastError = error

            #Compare versus exact svd
            U, s, V = numpy.linalg.svd(numpy.array(A.todense()))
            inds = numpy.flipud(numpy.argsort(s))[0:k*2]
            U, s, V = Util.indSvd(U, s, V, inds)

            Ak = (U*s).dot(V.T)

            error2 = numpy.linalg.norm(A - Ak)
            self.assertTrue(error2 <= error)
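For reference, the randomised SVD under test follows the range-finder scheme of Halko et al.; a minimal numpy sketch of that scheme, not the library's own implementation:

import numpy

def randomisedSvd(A, k, q=2, p=5):
    #Range finder: sample the column space of A with a Gaussian test
    #matrix, sharpen it with q power iterations, then project
    Omega = numpy.random.randn(A.shape[1], k + p)
    Y = A.dot(Omega)
    for _ in range(q):
        Y = A.dot(A.T.dot(Y))
    Q = numpy.linalg.qr(Y)[0]
    B = Q.T.dot(A)
    Uhat, s, Vt = numpy.linalg.svd(B, full_matrices=False)
    return Q.dot(Uhat)[:, 0:k], s[0:k], Vt[0:k, :].T

numpy.random.seed(21)
A = numpy.random.randn(80, 100)
U, s, V = randomisedSvd(A, 10)
print(numpy.linalg.norm(A - (U*s).dot(V.T)))   #rank-10 residual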
Example #54
    def matrixSimilarity(self, V1, V2):
        """
        Compute a vertex similarity matrix C, such that the ijth entry is the matching 
        score between V1_i and V2_j, where larger is a better match. 
        """
        X = numpy.r_[V1, V2]
        standardiser = Standardiser()
        X = standardiser.normaliseArray(X)

        V1 = X[0:V1.shape[0], :]
        V2 = X[V1.shape[0]:, :]


        #Extend arrays with zeros to make them the same size
        #if V1.shape[0] < V2.shape[0]:
        #    V1 = Util.extendArray(V1, V2.shape, numpy.min(V1))
        #elif V2.shape[0] < V1.shape[0]:
        #    V2 = Util.extendArray(V2, V1.shape, numpy.min(V2))

        #Compute C from the distances between vertices; the resulting
        #similarity is bounded by 1
        D = Util.distanceMatrix(V1, V2)
        maxD = numpy.max(D)
        minD = numpy.min(D)
        if (maxD - minD) != 0:
            C = (maxD - D) / (maxD - minD)
        else:
            C = numpy.ones((V1.shape[0], V2.shape[0]))

        return C
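The rescaling maps pairwise distances into similarities in [0, 1]; the same operation in a few lines, with scipy's cdist standing in for Util.distanceMatrix (an assumption). The original guards the degenerate case where all distances are equal; random data avoids it here.

import numpy
from scipy.spatial.distance import cdist

numpy.random.seed(21)
V1 = numpy.random.randn(4, 3)
V2 = numpy.random.randn(5, 3)

D = cdist(V1, V2)
C = (D.max() - D) / (D.max() - D.min())   #1 for the closest pair, 0 for the farthest
print(C.min(), C.max())                   #0.0 1.0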
Example #55
        def runRandomChoice():
            #can just do non-zero entries
            w = Util.expandIntArray(v)

            reps = 10000
            for i in range(reps):
                w[numpy.random.randint(0, w.shape[0])]
Example #56
    def testSvdSoft(self):
        A = scipy.sparse.rand(10, 10, 0.2)
        A = A.tocsc()

        lmbda = 0.2
        U, s, V = SparseUtils.svdSoft(A, lmbda)
        ATilde = U.dot(numpy.diag(s)).dot(V.T)

        #Now compute the same matrix using numpy
        A = A.todense()

        U2, s2, V2 = numpy.linalg.svd(A)
        inds = numpy.flipud(numpy.argsort(s2))
        inds = inds[s2[inds] > lmbda]
        U2, s2, V2 = Util.indSvd(U2, s2, V2, inds)

        s2 = s2 - lmbda
        s2 = numpy.clip(s2, 0, numpy.max(s2))

        ATilde2 = U2.dot(numpy.diag(s2)).dot(V2.T)

        nptst.assert_array_almost_equal(s, s2)
        nptst.assert_array_almost_equal(ATilde, ATilde2)

        #Now run svdSoft with a numpy array
        U3, s3, V3 = SparseUtils.svdSoft(A, lmbda)
        ATilde3 = U3.dot(numpy.diag(s3)).dot(V3.T)

        nptst.assert_array_almost_equal(s, s3)
        nptst.assert_array_almost_equal(ATilde3, ATilde2)
Example #57
    def svdSoft(X, lmbda, kmax=None):
        """
        Find the partial SVD of the sparse or dense matrix X, for which singular
        values are >= lmbda. Soft threshold the resulting singular values
        so that s <- max(s - lmbda, 0)
        """
        if scipy.sparse.issparse(X):
            k = min(X.shape[0], X.shape[1])
            L = scipy.sparse.linalg.aslinearoperator(X)

            U, s, V = SparseUtils.svdPropack(L, k, kmax=kmax)
            V = V.T
        else:
            U, s, V = numpy.linalg.svd(X)

        inds = numpy.flipud(numpy.argsort(s))
        inds = inds[s[inds] >= lmbda]
        U, s, V = Util.indSvd(U, s, V, inds)

        #Soft threshold
        if s.shape[0] != 0:
            s = s - lmbda
            s = numpy.clip(s, 0, numpy.max(s))

        return U, s, V
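The thresholding step in isolation, as a pure numpy sketch: shrink the spectrum by lmbda and drop the singular values that hit zero.

import numpy

numpy.random.seed(21)
X = numpy.random.randn(8, 6)
lmbda = 1.0

U, s, Vt = numpy.linalg.svd(X, full_matrices=False)
s = numpy.maximum(s - lmbda, 0)                     #soft threshold
keep = s > 0
Xtilde = (U[:, keep]*s[keep]).dot(Vt[keep, :])
print(numpy.linalg.svd(Xtilde, compute_uv=False))   #the shrunk spectrum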
Example #58
    def testAddRows(self):

        #Test case when k = rank
        Utilde, Stilde, Vtilde = SVDUpdate.addRows(self.U, self.s, self.V,
                                                   self.C)

        nptst.assert_array_almost_equal(Utilde.T.dot(Utilde),
                                        numpy.eye(Utilde.shape[1]))
        nptst.assert_array_almost_equal(Vtilde.T.dot(Vtilde),
                                        numpy.eye(Vtilde.shape[1]))

        self.assertEqual(Stilde.shape[0], self.k)

        #Check we get the original solution with full SVD
        U, s, V = numpy.linalg.svd(self.A)
        inds = numpy.flipud(numpy.argsort(s))
        U, s, V = Util.indSvd(U, s, V, inds)

        Utilde, Stilde, Vtilde = SVDUpdate.addRows(U, s, V, self.C)
        D = numpy.r_[self.A, self.C]

        nptst.assert_array_almost_equal(D, (Utilde * Stilde).dot(Vtilde.T), 4)

        #Check solution for partial rank SVD
        k = 20
        U, s, V = numpy.linalg.svd(self.A)
        inds = numpy.flipud(numpy.argsort(s))[0:k]
        U, s, V = Util.indSvd(U, s, V, inds)

        Utilde, Stilde, Vtilde = SVDUpdate.addRows(U, s, V, self.C)
        D = numpy.r_[(U * s).dot(V.T), self.C]
        U, s, V = numpy.linalg.svd(D)
        inds = numpy.flipud(numpy.argsort(s))[0:k]
        U, s, V = Util.indSvd(U, s, V, inds)

        nptst.assert_array_almost_equal((U * s).dot(V.T),
                                        (Utilde * Stilde).dot(Vtilde.T), 4)

        #Test if same as add cols
        U, s, V = numpy.linalg.svd(self.A)
        inds = numpy.flipud(numpy.argsort(s))[0:k]
        U, s, V = Util.indSvd(U, s, V, inds)
        Utilde, sTilde, Vtilde = SVDUpdate.addRows(U, s, V, self.C)
        Vtilde2, sTilde2, Utilde2 = SVDUpdate.addCols(V, s, U, self.C.T)

        nptst.assert_array_almost_equal((Utilde * sTilde).dot(Vtilde.T),
                                        (Utilde2 * sTilde2).dot(Vtilde2.T))