예제 #1
0
    def eigenAdd2(omega, Q, Y1, Y2, k, debug= False):
        """
        Compute an approximation of the eigendecomposition A^*A + Y1Y2^* +Y2Y1^*
        in which Y1, Y2 are low rank matrices, Y1^*Y2=0 and A^*A = Q Omega Q*. We 
        use the rank-k approximation of A^*A: Q_k Omega_k Q_k^* and then find
        [A^*A_k + Y1Y2^* + Y2Y1^*]. If debug=False then pi, V are returned which 
        respectively correspond to all the eigenvalues/eigenvectors of 
        [A^*A_k + Y1Y2^* + Y2Y1^*]. 
        """
        #logging.debug("< eigenAdd2 >")
        Parameter.checkInt(k, 0, float('inf'))
        Parameter.checkClass(omega, numpy.ndarray)
        Parameter.checkClass(Q, numpy.ndarray)
        Parameter.checkClass(Y1, numpy.ndarray)
        Parameter.checkClass(Y2, numpy.ndarray)
        if not numpy.isrealobj(omega) or not numpy.isrealobj(Q):
            logging.warn("Eigenvalues or eigenvectors are not real")
        if not numpy.isrealobj(Y1) or not numpy.isrealobj(Y2):
            logging.warn("Y1 or Y2 are not real")
        if omega.ndim != 1:
            raise ValueError("omega must be 1-d array")
        if omega.shape[0] != Q.shape[1]:
            raise ValueError("Must have same number of eigenvalues and eigenvectors")
        if Q.shape[0] != Y1.shape[0]:
            raise ValueError("Q must have the same number of rows as Y1 rows")
        if Q.shape[0] != Y2.shape[0]:
            raise ValueError("Q must have the same number of rows as Y2 rows")
        if Y1.shape[1] != Y2.shape[1]:
            raise ValueError("Y1 must have the same number of columns as Y2 columns")

        if __debug__:
            Parameter.checkArray(omega, softCheck=True, arrayInfo="omega as input in eigenAdd2()")
            Parameter.checkArray(Q, softCheck=True, arrayInfo="Q as input in eigenAdd2()")
            Parameter.checkOrthogonal(Q, tol=EigenUpdater.tol, softCheck=True, arrayInfo="Q as input in eigenAdd2()")
            Parameter.checkArray(Y1, softCheck=True, arrayInfo="Y1 as input in eigenAdd2()")
            Parameter.checkArray(Y2, softCheck=True, arrayInfo="Y2 as input in eigenAdd2()")
            


        #Get first k eigenvectors/values of A^*A
        omega, Q = Util.indEig(omega, Q, numpy.flipud(numpy.argsort(omega))[0:k])

        QY1 = Q.conj().T.dot(Y1)
        Y1bar = Y1 - Q.dot(QY1)

        P1bar, sigma1Bar, Q1bar = Util.safeSvd(Y1bar)
        inds = numpy.arange(sigma1Bar.shape[0])[numpy.abs(sigma1Bar)>EigenUpdater.tol]
        P1bar, sigma1Bar, Q1bar = Util.indSvd(P1bar, sigma1Bar, Q1bar, inds)
        # checks on SVD decomposition of Y1bar
        if __debug__:
            Parameter.checkArray(QY1, softCheck=True, arrayInfo="QY1 in eigenAdd2()")
            Parameter.checkArray(Y1bar, softCheck=True, arrayInfo="Y1bar in eigenAdd2()")
            Parameter.checkArray(P1bar, softCheck=True, arrayInfo="P1bar in eigenAdd2()")
            if not Parameter.checkOrthogonal(P1bar, tol=EigenUpdater.tol, softCheck=True, arrayInfo="P1bar in eigenAdd2()", investigate=True):
                print ("corresponding sigma: ", sigma1Bar)
            Parameter.checkArray(sigma1Bar, softCheck=True, arrayInfo="sigma1Bar in eigenAdd2()")
            Parameter.checkArray(Q1bar, softCheck=True, arrayInfo="Q1bar in eigenAdd2()")
            if not Parameter.checkOrthogonal(Q1bar, tol=EigenUpdater.tol, softCheck=True, arrayInfo="Q1bar in eigenAdd2()"):
                print ("corresponding sigma: ", sigma1Bar)

        del Y1bar

        P1barY2 = P1bar.conj().T.dot(Y2)
        QY2 = Q.conj().T.dot(Y2)
        Y2bar = Y2 - Q.dot(QY2) - P1bar.dot(P1barY2)
        
        P2bar, sigma2Bar, Q2bar = Util.safeSvd(Y2bar)
        inds = numpy.arange(sigma2Bar.shape[0])[numpy.abs(sigma2Bar)>EigenUpdater.tol]
        P2bar, sigma2Bar, Q2bar = Util.indSvd(P2bar, sigma2Bar, Q2bar, inds)
        # checks on SVD decomposition of Y1bar
        if __debug__:
            Parameter.checkArray(P1barY2, softCheck=True, arrayInfo="P1barY2 in eigenAdd2()")
            Parameter.checkArray(QY2, softCheck=True, arrayInfo="QY2 in eigenAdd2()")
            Parameter.checkArray(Y2bar, softCheck=True, arrayInfo="Y2bar in eigenAdd2()")
            Parameter.checkArray(P2bar, softCheck=True, arrayInfo="P2bar in eigenAdd2()")
            Parameter.checkOrthogonal(P2bar, tol=EigenUpdater.tol, softCheck=True, arrayInfo="P2bar in eigenAdd2()")
            Parameter.checkArray(sigma2Bar, softCheck=True, arrayInfo="sigma2Bar in eigenAdd2()")
            Parameter.checkArray(Q2bar, softCheck=True, arrayInfo="Q2bar in eigenAdd2()")
            Parameter.checkOrthogonal(Q2bar, tol=EigenUpdater.tol, softCheck=True, arrayInfo="Q2bar in eigenAdd2()")

        del Y2bar 

        r = omega.shape[0]
        p = Y1.shape[1]
        p1 = sigma1Bar.shape[0]
        p2 = sigma2Bar.shape[0]

        D = numpy.c_[Q, P1bar, P2bar]
        del P1bar
        del P2bar 
        # rem: A*s = A.dot(diag(s)) ; A*s[:,new] = diag(s).dot(A)
        DStarY1 = numpy.r_[QY1, sigma1Bar[:,numpy.newaxis] * Q1bar.conj().T, numpy.zeros((p2, p))]
        DStarY2 = numpy.r_[QY2, P1barY2, sigma2Bar[:,numpy.newaxis] * Q2bar.conj().T]
        DStarY1Y2StarD = DStarY1.dot(DStarY2.conj().T)

        del DStarY1
        del DStarY2
        
        r = omega.shape[0]
        F = numpy.zeros((r+p1+p2, r+p1+p2))
        F[range(r),range(r)] = omega
        F = F + DStarY1Y2StarD + DStarY1Y2StarD.conj().T

        #A check to make sure DFD^T is AA_k + Y1Y2 + Y2Y1
        #assert numpy.linalg.norm(D.dot(F).dot(D.T) - Q.dot(numpy.diag(omega).dot(Q.T)) - Y1.dot(Y2.T) - Y2.dot(Y1.T)) < 10**-6
        
        # checks on F
        if __debug__:
            #Parameter.checkArray(DStarY1, softCheck=True, arrayInfo="DStarY1 in eigenAdd2()")
            #Parameter.checkArray(DStarY2, softCheck=True, arrayInfo="DStarY2 in eigenAdd2()")
            Parameter.checkArray(DStarY1Y2StarD, softCheck=True, arrayInfo="DStarY1Y2StarD in eigenAdd2()")
            Parameter.checkArray(F, softCheck=True, arrayInfo="F in eigenAdd2()")
            Parameter.checkSymmetric(F, tol=EigenUpdater.tol, softCheck=True, arrayInfo="F in eigenAdd2()")

        pi, H = scipy.linalg.eigh(F)
        # remove too small eigenvalues
        pi, H = Util.indEig(pi, H, numpy.arange(pi.shape[0])[numpy.abs(pi)>EigenUpdater.tol])
        # keep greatest eigenvalues
        #pi, H = Util.indEig(pi, H, numpy.flipud(numpy.argsort(pi))[:min(k,pi.shape[0])])


        V = D.dot(H)

        if __debug__:
            if not Parameter.checkOrthogonal(D, tol=EigenUpdater.tol, softCheck=True, investigate=True, arrayInfo="D in eigenAdd2()"):
                print("pi:\n", pi)
            if not Parameter.checkOrthogonal(H, tol=EigenUpdater.tol, softCheck=True, investigate=True, arrayInfo="H in eigenAdd2()"):
                print("pi:\n", pi)

        if ProfileUtils.memory() > 10**9:
            ProfileUtils.memDisplay(locals())
            
        #logging.debug("</ eigenAdd2 >")
        if debug:
            return pi, V, D, DStarY1Y2StarD + DStarY1Y2StarD.conj().T
        else:
            return pi, V
    def clusterFromIterator(self, graphListIterator, verbose=False):
        """
        Find a set of clusters for the graphs given by the iterator. If verbose 
        is true the each iteration is timed and bounded the results are returned 
        as lists.
        
        The difference between a weight matrix and the previous one should be
        positive.
        """
        clustersList = []
        decompositionTimeList = []
        kMeansTimeList = []
        boundList = []
        sinThetaList = []
        i = 0

        for subW in graphListIterator:
            if __debug__:
                Parameter.checkSymmetric(subW)

            if self.logStep and i % self.logStep == 0:
                logging.debug("Graph index: " + str(i))
            logging.debug("Clustering graph of size " + str(subW.shape))
            if self.alg != "efficientNystrom":
                ABBA = GraphUtils.shiftLaplacian(subW)

            # --- Eigen value decomposition ---
            startTime = time.time()
            if self.alg == "IASC":
                if i % self.T != 0:
                    omega, Q = self.approxUpdateEig(subW, ABBA, omega, Q)

                    if self.computeBound:
                        inds = numpy.flipud(numpy.argsort(omega))
                        Q = Q[:, inds]
                        omega = omega[inds]
                        bounds = self.pertBound(omega, Q, omegaKbot, AKbot,
                                                self.k2)
                        #boundList.append([i, bounds[0], bounds[1]])

                        #Now use accurate values of norm of R and delta
                        rank = Util.rank(ABBA.todense())
                        gamma, U = scipy.sparse.linalg.eigsh(ABBA,
                                                             rank - 1,
                                                             which="LM",
                                                             ncv=ABBA.shape[0])
                        #logging.debug("gamma=" + str(gamma))
                        bounds2 = self.realBound(omega, Q, gamma, AKbot,
                                                 self.k2)
                        boundList.append(
                            [bounds[0], bounds[1], bounds2[0], bounds2[1]])
                else:
                    logging.debug("Computing exact eigenvectors")
                    self.storeInformation(subW, ABBA)

                    if self.computeBound:
                        #omega, Q = scipy.sparse.linalg.eigsh(ABBA, min(self.k2*2, ABBA.shape[0]-1), which="LM", ncv = min(10*self.k2, ABBA.shape[0]))
                        rank = Util.rank(ABBA.todense())
                        omega, Q = scipy.sparse.linalg.eigsh(ABBA,
                                                             rank - 1,
                                                             which="LM",
                                                             ncv=ABBA.shape[0])
                        inds = numpy.flipud(numpy.argsort(omega))
                        omegaKbot = omega[inds[self.k2:]]
                        QKbot = Q[:, inds[self.k2:]]
                        AKbot = (QKbot * omegaKbot).dot(QKbot.T)

                        omegaSort = numpy.flipud(numpy.sort(omega))
                        boundList.append([0] * 4)
                    else:
                        omega, Q = scipy.sparse.linalg.eigsh(
                            ABBA,
                            min(self.k2, ABBA.shape[0] - 1),
                            which="LM",
                            ncv=min(10 * self.k2, ABBA.shape[0]))

            elif self.alg == "nystrom":
                omega, Q = Nystrom.eigpsd(ABBA, self.k3)
            elif self.alg == "exact":
                omega, Q = scipy.sparse.linalg.eigsh(
                    ABBA,
                    min(self.k1, ABBA.shape[0] - 1),
                    which="LM",
                    ncv=min(15 * self.k1, ABBA.shape[0]))
            elif self.alg == "efficientNystrom":
                omega, Q = EfficientNystrom.eigWeight(subW, self.k2, self.k1)
            elif self.alg == "randomisedSvd":
                Q, omega, R = RandomisedSVD.svd(ABBA, self.k4)
            else:
                raise ValueError("Invalid Algorithm: " + str(self.alg))

            if self.computeSinTheta:
                omegaExact, QExact = scipy.linalg.eigh(ABBA.todense())
                inds = numpy.flipud(numpy.argsort(omegaExact))
                QExactKbot = QExact[:, inds[self.k1:]]
                inds = numpy.flipud(numpy.argsort(omega))
                QApproxK = Q[:, inds[:self.k1]]
                sinThetaList.append(
                    scipy.linalg.norm(QExactKbot.T.dot(QApproxK)))

            decompositionTimeList.append(time.time() - startTime)

            if self.alg == "IASC":
                self.storeInformation(subW, ABBA)

            # --- Kmeans ---
            startTime = time.time()
            inds = numpy.flipud(numpy.argsort(omega))

            standardiser = Standardiser()
            #For some very strange reason we get an overflow when computing the
            #norm of the rows of Q even though its elements are bounded by 1.
            #We'll ignore it for now
            try:
                V = standardiser.normaliseArray(Q[:, inds[0:self.k1]].real.T).T
            except FloatingPointError as e:
                logging.warn("FloatingPointError: " + str(e))
            V = VqUtils.whiten(V)
            if i == 0:
                centroids, distortion = vq.kmeans(V,
                                                  self.k1,
                                                  iter=self.nb_iter_kmeans)
            else:
                centroids = self.findCentroids(V, clusters[:subW.shape[0]])
                if centroids.shape[0] < self.k1:
                    nb_missing_centroids = self.k1 - centroids.shape[0]
                    random_centroids = V[numpy.random.randint(
                        0, V.shape[0], nb_missing_centroids), :]
                    centroids = numpy.vstack((centroids, random_centroids))
                centroids, distortion = vq.kmeans(
                    V, centroids)  #iter can only be 1
            clusters, distortion = vq.vq(V, centroids)
            kMeansTimeList.append(time.time() - startTime)

            clustersList.append(clusters)

            #logging.debug("subW.shape: " + str(subW.shape))
            #logging.debug("len(clusters): " + str(len(clusters)))
            #from sandbox.util.ProfileUtils import ProfileUtils
            #logging.debug("Total memory usage: " + str(ProfileUtils.memory()/10**6) + "MB")
            if ProfileUtils.memory() > 10**9:
                ProfileUtils.memDisplay(locals())

            i += 1

        if verbose:
            eigenQuality = {
                "boundList": boundList,
                "sinThetaList": sinThetaList
            }
            return clustersList, numpy.array(
                (decompositionTimeList, kMeansTimeList)).T, eigenQuality
        else:
            return clustersList
    def clusterFromIterator(self, graphListIterator, verbose=False):
        """
        Find a set of clusters for the graphs given by the iterator. If verbose 
        is true the each iteration is timed and bounded the results are returned 
        as lists.
        
        The difference between a weight matrix and the previous one should be
        positive.
        """
        clustersList = []
        decompositionTimeList = [] 
        kMeansTimeList = [] 
        boundList = []
        sinThetaList = []
        i = 0

        for subW in graphListIterator:
            if __debug__:
                Parameter.checkSymmetric(subW)

            if self.logStep and i % self.logStep == 0:
                logging.debug("Graph index: " + str(i))
            logging.debug("Clustering graph of size " + str(subW.shape))
            if self.alg!="efficientNystrom": 
                ABBA = GraphUtils.shiftLaplacian(subW)

            # --- Eigen value decomposition ---
            startTime = time.time()
            if self.alg=="IASC": 
                if i % self.T != 0:
                    omega, Q = self.approxUpdateEig(subW, ABBA, omega, Q)   
                    
                    if self.computeBound:
                        inds = numpy.flipud(numpy.argsort(omega))
                        Q = Q[:, inds]
                        omega = omega[inds]
                        bounds = self.pertBound(omega, Q, omegaKbot, AKbot, self.k2)
                        #boundList.append([i, bounds[0], bounds[1]])
                        
                        #Now use accurate values of norm of R and delta   
                        rank = Util.rank(ABBA.todense())
                        gamma, U = scipy.sparse.linalg.eigsh(ABBA, rank-1, which="LM", ncv = ABBA.shape[0])
                        #logging.debug("gamma=" + str(gamma))
                        bounds2 = self.realBound(omega, Q, gamma, AKbot, self.k2)                  
                        boundList.append([bounds[0], bounds[1], bounds2[0], bounds2[1]])      
                else: 
                    logging.debug("Computing exact eigenvectors")
                    self.storeInformation(subW, ABBA)

                    if self.computeBound: 
                        #omega, Q = scipy.sparse.linalg.eigsh(ABBA, min(self.k2*2, ABBA.shape[0]-1), which="LM", ncv = min(10*self.k2, ABBA.shape[0]))
                        rank = Util.rank(ABBA.todense())
                        omega, Q = scipy.sparse.linalg.eigsh(ABBA, rank-1, which="LM", ncv = ABBA.shape[0])
                        inds = numpy.flipud(numpy.argsort(omega))
                        omegaKbot = omega[inds[self.k2:]]  
                        QKbot = Q[:, inds[self.k2:]] 
                        AKbot = (QKbot*omegaKbot).dot(QKbot.T)
                        
                        omegaSort = numpy.flipud(numpy.sort(omega))
                        boundList.append([0]*4)      
                    else: 
                        omega, Q = scipy.sparse.linalg.eigsh(ABBA, min(self.k2, ABBA.shape[0]-1), which="LM", ncv = min(10*self.k2, ABBA.shape[0]))
                            
            elif self.alg == "nystrom":
                omega, Q = Nystrom.eigpsd(ABBA, self.k3)
            elif self.alg == "exact": 
                omega, Q = scipy.sparse.linalg.eigsh(ABBA, min(self.k1, ABBA.shape[0]-1), which="LM", ncv = min(15*self.k1, ABBA.shape[0]))
            elif self.alg == "efficientNystrom":
                omega, Q = EfficientNystrom.eigWeight(subW, self.k2, self.k1)
            elif self.alg == "randomisedSvd": 
                Q, omega, R = RandomisedSVD.svd(ABBA, self.k4)
            else:
                raise ValueError("Invalid Algorithm: " + str(self.alg))

            if self.computeSinTheta:
                omegaExact, QExact = scipy.linalg.eigh(ABBA.todense())
                inds = numpy.flipud(numpy.argsort(omegaExact))
                QExactKbot = QExact[:, inds[self.k1:]]
                inds = numpy.flipud(numpy.argsort(omega))
                QApproxK = Q[:,inds[:self.k1]]
                sinThetaList.append(scipy.linalg.norm(QExactKbot.T.dot(QApproxK)))
          
            decompositionTimeList.append(time.time()-startTime)                  
                  
            if self.alg=="IASC":
                self.storeInformation(subW, ABBA)
            
            # --- Kmeans ---
            startTime = time.time()
            inds = numpy.flipud(numpy.argsort(omega))

            standardiser = Standardiser()
            #For some very strange reason we get an overflow when computing the
            #norm of the rows of Q even though its elements are bounded by 1.
            #We'll ignore it for now
            try:
                V = standardiser.normaliseArray(Q[:, inds[0:self.k1]].real.T).T
            except FloatingPointError as e:
                logging.warn("FloatingPointError: " + str(e))
            V = VqUtils.whiten(V)
            if i == 0:
                centroids, distortion = vq.kmeans(V, self.k1, iter=self.nb_iter_kmeans)
            else:
                centroids = self.findCentroids(V, clusters[:subW.shape[0]])
                if centroids.shape[0] < self.k1:
                    nb_missing_centroids = self.k1 - centroids.shape[0]
                    random_centroids = V[numpy.random.randint(0, V.shape[0], nb_missing_centroids),:]
                    centroids = numpy.vstack((centroids, random_centroids))
                centroids, distortion = vq.kmeans(V, centroids) #iter can only be 1
            clusters, distortion = vq.vq(V, centroids)
            kMeansTimeList.append(time.time()-startTime)

            clustersList.append(clusters)

            #logging.debug("subW.shape: " + str(subW.shape))
            #logging.debug("len(clusters): " + str(len(clusters)))
            #from sandbox.util.ProfileUtils import ProfileUtils
            #logging.debug("Total memory usage: " + str(ProfileUtils.memory()/10**6) + "MB")
            if ProfileUtils.memory() > 10**9:
                ProfileUtils.memDisplay(locals())

            i += 1

        if verbose:
            eigenQuality = {"boundList" : boundList, "sinThetaList" : sinThetaList}
            return clustersList, numpy.array((decompositionTimeList, kMeansTimeList)).T, eigenQuality
        else:
            return clustersList