Example #1
    def readAuthorsAndDocuments(self, useAbstract=True): 
        logging.debug("About to read file " + self.dataFilename)
        inFile = open(self.dataFilename)  
        authorList = []
        citationList = []
        documentList = []
                    
        lastAbstract = ""
        lastVenue = ""
        lastTitle = ""    
        lastAuthors = []     
        lastCitationNo = 0                
                    
        for i, line in enumerate(inFile):
            Util.printIteration(i, self.stepSize, self.numLines)
                
            #Match the fields in the file 
            emptyLine = line == "\n"
            title = re.findall(r"#\*(.*)", line)
            currentAuthors = re.findall(r"#@(.*)", line)
            abstract = re.findall(r"#!(.*)", line)
            venue = re.findall(r"#conf(.*)", line)
            citationNo = re.findall(r"#citation(.*)", line)
            
            if emptyLine:
                if useAbstract: 
                    document = lastTitle + " " + lastAbstract 
                else: 
                    document = lastTitle     
                documentList.append(document) 
                authorList.append(lastAuthors)
                citationList.append(lastCitationNo)

                lastAbstract = ""
                lastTitle = ""
                lastAuthors = []
                lastCitationNo = 0   
 
            if len(title) != 0 and len(title[0]) != 0: 
                lastTitle = title[0]
                
            if len(venue) != 0 and len(venue[0]) != 0: 
                lastVenue = venue[0]  
            
            if len(abstract) != 0 and len(abstract[0]) != 0: 
                lastAbstract = abstract[0]
                
            if len(citationNo) != 0 and len(citationNo[0]) != 0: 
                lastCitationNo = int(citationNo[0])
                       
            if len(currentAuthors) != 0: 
                currentAuthors = currentAuthors[0].split(",")  
                currentAuthors = set([x.strip() for x in currentAuthors])
                currentAuthors = currentAuthors.difference(set([""]))
                lastAuthors = currentAuthors                     

        inFile.close() 
        logging.debug("Finished reading " + str(len(documentList)) + " articles")  
        
        return authorList, documentList, citationList
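The reader above expects ArnetMiner/DBLP-style records in which each field starts with a marker line and records end with a blank line. A minimal sketch of that format and of parsing one field follows; the paper title, authors and counts are invented sample data, not taken from the real dataset.

import re

sampleRecord = ("#*Learning on Graphs\n"
                "#@Alice Smith,Bob Jones\n"
                "#conf ICML\n"
                "#citation 12\n"
                "#!We study learning on graphs.\n"
                "\n")

for line in sampleRecord.splitlines(True):
    authors = re.findall(r"#@(.*)", line)
    if len(authors) != 0:
        # prints the parsed author set: {'Alice Smith', 'Bob Jones'} (order may vary)
        print(set(x.strip() for x in authors[0].split(",")))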
Example #2
    def __updateEigenSystem(self, lmbda, Q, deltaW, W):
        """
        Give the eigenvalues lmbda, eigenvectors Q and a deltaW matrix of weight
        changes, compute sequence of incidence vectors and update eigensystem.
        The deltaW is the change in edges from the current weight martrix which
        is given by W. 
        """
        changeInds = deltaW.nonzero()

        for s in range(changeInds[0].shape[0]):
            Util.printIteration(s, 10, changeInds[0].shape[0])
            i = changeInds[0][s]
            j = changeInds[1][s]
            if i >= j:  # only consider lower diagonal changes
                continue

            assert deltaW[i, j] != 0
#            if deltaW[i, j] < 0:
#                logging.warn(" deltaW is usually positive (here deltaW=" +str(deltaW[i, j]) + ")")

            #Note: update W at each iteration here
            lmbda, Q = self.incrementEigenSystem(lmbda, Q, W, i, j, deltaW[i,j])
            W[i, j] += deltaW[i, j]
            W[j, i] += deltaW[i, j]
        
        return lmbda, Q 
Example #3
    def __updateEigenSystem(self, lmbda, Q, deltaW, W):
        """
        Give the eigenvalues lmbda, eigenvectors Q and a deltaW matrix of weight
        changes, compute sequence of incidence vectors and update eigensystem.
        The deltaW is the change in edges from the current weight martrix which
        is given by W. 
        """
        changeInds = deltaW.nonzero()

        for s in range(changeInds[0].shape[0]):
            Util.printIteration(s, 10, changeInds[0].shape[0])
            i = changeInds[0][s]
            j = changeInds[1][s]
            if i >= j:  # only consider lower diagonal changes
                continue

            assert deltaW[i, j] != 0
            #            if deltaW[i, j] < 0:
            #                logging.warn(" deltaW is usually positive (here deltaW=" +str(deltaW[i, j]) + ")")

            #Note: update W at each iteration here
            lmbda, Q = self.incrementEigenSystem(lmbda, Q, W, i, j, deltaW[i, j])
            W[i, j] += deltaW[i, j]
            W[j, i] += deltaW[i, j]

        return lmbda, Q
Example #4
    def predictEdges(self, vertexIndices):
        """
        This makes a prediction for a series of edges using the following score
        \sum_z \in n(x) \cup n(y) = 1/|log(n(z)|
        Returns a matrix with rows are a ranked list of verticies of length self.windowSize.
        """

        Parameter.checkInt(self.windowSize, 1, self.graph.getNumVertices())
        logging.info("Running predictEdges in " + str(self.__class__.__name__))

        P = numpy.zeros((vertexIndices.shape[0], self.windowSize))
        S = numpy.zeros((vertexIndices.shape[0], self.windowSize))
        W = self.graph.getWeightMatrix()


        for i in range(vertexIndices.shape[0]):
            Util.printIteration(i, self.printStep, vertexIndices.shape[0])
            scores = numpy.zeros(self.graph.getNumVertices())

            for j in range(0, self.graph.getNumVertices()):
                commonNeighbours = numpy.nonzero(W[vertexIndices[i], :] * W[j, :])[0]

                for k in commonNeighbours:
                    q = numpy.log(numpy.nonzero(W[k, :])[0].shape[0])
                    if q != 0:
                        scores[j] = scores[j] + 1/q


            P[i, :], S[i, :] = self.indicesFromScores(vertexIndices[i], scores)

        return P, S
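A self-contained check of the score computed above on a toy weight matrix; the ranking step (self.indicesFromScores) is not reproduced here, and the matrix is invented.

import numpy

# Toy symmetric adjacency matrix; vertices 0 and 3 share only neighbour 1.
W = numpy.array([[0., 1., 1., 0.],
                 [1., 0., 1., 1.],
                 [1., 1., 0., 0.],
                 [0., 1., 0., 0.]])

def adamicAdar(W, x, y):
    commonNeighbours = numpy.nonzero(W[x, :] * W[y, :])[0]
    score = 0.0
    for k in commonNeighbours:
        q = numpy.log(numpy.nonzero(W[k, :])[0].shape[0])
        if q != 0:  # degree-1 common neighbours are skipped, as above
            score += 1/q
    return score

print(adamicAdar(W, 0, 3))  # 1/log(3), since vertex 1 has degree 3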
Example #5
 def cleanXML(self):
     """
     Take the original XML file and clean up HTML characters and & symbols. We 
     also create a list of possible matches for the experts. 
     """
     if not os.path.exists(self.xmlCleanFilename):
         logging.debug("Cleaning XML")
         h = HTMLParser.HTMLParser()
         
         inFile = open(self.xmlFileName)
         outFile = open(self.xmlCleanFilename, "w")
         i = 0 
         
         for line in inFile: 
             Util.printIteration(i, self.stepSize, self.numLines)
             outLine = h.unescape(line).replace("&", "&amp;")
             outLine = re.sub("<title>.*[\<\>].*</title>", "<title>Default Title</title>", outLine)
             outLine = re.sub("<ee>.*[\<\>].*</ee>", "<ee>Default text</ee>", outLine)
             outFile.write(outLine) 
             i += 1
         
         inFile.close() 
         outFile.close() 
         logging.debug("All done")
     else: 
         logging.debug("File already generated: " + self.xmlCleanFilename)
Example #6
    def evaluateCvOuter(self, X, Y, folds):
        """
        Run cross validation and output some ROC curves. In this case Y is a 1D array.

        :param X: A matrix with examples as rows
        :type X: :class:`ndarray`

        :param Y: A vector of labels
        :type Y: :class:`ndarray`

        :param folds: The number of cross validation folds
        :type folds: :class:`int`
        """
        Parameter.checkClass(X, numpy.ndarray)
        Parameter.checkClass(Y, numpy.ndarray)
        Parameter.checkInt(folds, 2, float('inf'))
        if Y.ndim != 1:
            raise ValueError("Expecting Y to be 1D")

        indexList = cross_val.StratifiedKFold(Y, folds)

        bestParams = []
        bestTrainAUCs = numpy.zeros(folds)
        bestTrainROCs = []
        bestTestAUCs = numpy.zeros(folds)
        bestTestROCs = []
        bestMetaDicts = []
        i = 0

        for trainInds, testInds in indexList:
            Util.printIteration(i, 1, folds, "Outer CV: ")
            trainX, trainY = X[trainInds, :], Y[trainInds]
            testX, testY = X[testInds, :], Y[testInds]

            self.learnModel(trainX, trainY)
            #self.learnModelCut(trainX, trainY)

            predTrainY = self.predict(trainX)
            predTestY = self.predict(testX)
            bestTrainAUCs[i] = Evaluator.auc(predTrainY, trainY)
            bestTestAUCs[i] = Evaluator.auc(predTestY, testY)

            #Store the parameters and ROC curves
            bestTrainROCs.append(Evaluator.roc(trainY, predTrainY))
            bestTestROCs.append(Evaluator.roc(testY, predTestY))

            metaDict = {}
            bestMetaDicts.append(metaDict)

            i += 1

        logging.debug("Mean test AUC = " + str(numpy.mean(bestTestAUCs)))
        logging.debug("Std test AUC = " + str(numpy.std(bestTestAUCs)))
        allMetrics = [bestTrainAUCs, bestTrainROCs, bestTestAUCs, bestTestROCs]

        return (bestParams, allMetrics, bestMetaDicts)
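The same outer loop can be sketched with current scikit-learn stand-ins for the cross_val and Evaluator helpers used above; the logistic regression learner and the synthetic data are placeholders, not part of the original class.

import numpy
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

X, Y = make_classification(n_samples=200, random_state=0)
folds = 3
testAUCs = numpy.zeros(folds)

for i, (trainInds, testInds) in enumerate(StratifiedKFold(n_splits=folds).split(X, Y)):
    learner = LogisticRegression().fit(X[trainInds, :], Y[trainInds])
    scores = learner.predict_proba(X[testInds, :])[:, 1]
    testAUCs[i] = roc_auc_score(Y[testInds], scores)

print("Mean test AUC = " + str(numpy.mean(testAUCs)))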
Example #7
    def evaluateCvOuter(self, X, Y, folds):
        """
        Run cross validation and output some ROC curves. In this case Y is a 1D array.

        :param X: A matrix with examples as rows
        :type X: :class:`ndarray`

        :param Y: A vector of labels
        :type Y: :class:`ndarray`

        :param folds: The number of cross validation folds
        :type folds: :class:`int`
        """
        Parameter.checkClass(X, numpy.ndarray)
        Parameter.checkClass(Y, numpy.ndarray)
        Parameter.checkInt(folds, 2, float('inf'))
        if Y.ndim != 1:
            raise ValueError("Expecting Y to be 1D")

        indexList = cross_val.StratifiedKFold(Y, folds)

        bestParams = []
        bestTrainAUCs = numpy.zeros(folds)
        bestTrainROCs = []
        bestTestAUCs = numpy.zeros(folds)
        bestTestROCs = []
        bestMetaDicts = []
        i = 0

        for trainInds, testInds in indexList:
            Util.printIteration(i, 1, folds, "Outer CV: ")
            trainX, trainY = X[trainInds, :], Y[trainInds]
            testX, testY = X[testInds, :], Y[testInds]

            self.learnModel(trainX, trainY)
            #self.learnModelCut(trainX, trainY)

            predTrainY = self.predict(trainX)
            predTestY = self.predict(testX)
            bestTrainAUCs[i] = Evaluator.auc(predTrainY, trainY)
            bestTestAUCs[i] = Evaluator.auc(predTestY, testY)

            #Store the parameters and ROC curves
            bestTrainROCs.append(Evaluator.roc(trainY, predTrainY))
            bestTestROCs.append(Evaluator.roc(testY, predTestY))

            metaDict = {}
            bestMetaDicts.append(metaDict)

            i += 1

        logging.debug("Mean test AUC = " + str(numpy.mean(bestTestAUCs)))
        logging.debug("Std test AUC = " + str(numpy.std(bestTestAUCs)))
        allMetrics = [bestTrainAUCs, bestTrainROCs, bestTestAUCs, bestTestROCs]

        return (bestParams, allMetrics, bestMetaDicts)
Example #8
    def supervisedMC23(lists, itemList, topQList, verbose=False): 
        """
        A supervised version of MC2 of our own invention. The idea is to find a 
        linear combination of transition matrices to fit a given one. We just make
        sure it fits the stationary distribution. 
        """
        import cvxopt
        import cvxopt.solvers
        ell = len(lists)
        n = len(itemList)
        outputList, scores, PList = RankAggregator.MC2(lists, itemList, verbose=True)
        
        Py = RankAggregator.generateTransitionMatrix(topQList, itemList)
        u, v = scipy.sparse.linalg.eigs(Py.T, 1)
        v = numpy.array(v).flatten()

        c = numpy.zeros(v.shape[0])
        Q = cvxopt.spmatrix([], [], [], (n*n, ell))

        for i, P in enumerate(PList): 
            Q[:, i] = cvxopt.matrix(numpy.array(P.todense()).ravel()) 
            
        c = cvxopt.matrix(c)
        QQ = Q.T * Q
        
        Py = RankAggregator.generateTransitionMatrix(topQList, itemList)
        s = numpy.array(Py.todense()).ravel()
        s = cvxopt.matrix(s)
        
        G = cvxopt.spdiag((-numpy.ones(ell)).tolist())
        h = cvxopt.matrix(numpy.zeros(ell))
        
        A = cvxopt.matrix(numpy.ones(ell), (1, ell))
        b = cvxopt.matrix(numpy.ones(1))        
                
        q = -Q.T * s  
        
        sol = cvxopt.solvers.qp(QQ, q, G, h, A, b)
        
        alpha = numpy.array(sol['x'])
        
        #Combine the matrices 
        P = numpy.zeros((n, n))       
        
        for j, Pj in enumerate(PList): 
            Util.printIteration(j, 1, ell)
            P += alpha[j] * numpy.array(Pj.todense()) 

        P /= ell 
        
        outputList, scores = RankAggregator.computeOutputList(P, itemList)
        
        if verbose: 
            return outputList, scores, PList
        else: 
            return outputList, scores        
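The quadratic program being solved above is min_alpha ||Q alpha - s||^2 subject to alpha >= 0 and sum(alpha) = 1. A toy instance, with random matrices standing in for the vectorised transition matrices:

import numpy
import cvxopt
import cvxopt.solvers

numpy.random.seed(21)
ell, d = 3, 16
Q = cvxopt.matrix(numpy.random.rand(d, ell))
s = cvxopt.matrix(numpy.random.rand(d))

QQ = Q.T * Q                                    # quadratic term
q = -Q.T * s                                    # linear term
G = cvxopt.spdiag((-numpy.ones(ell)).tolist())  # -alpha <= 0
h = cvxopt.matrix(numpy.zeros(ell))
A = cvxopt.matrix(numpy.ones(ell), (1, ell))    # sum(alpha) = 1
b = cvxopt.matrix(numpy.ones(1))

sol = cvxopt.solvers.qp(QQ, q, G, h, A, b)
alpha = numpy.array(sol["x"]).flatten()
print(alpha, alpha.sum())  # non-negative weights summing to 1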
Example #9
    def modelSelect(self, X):
        """
        Perform model selection on X and return the best parameters. 
        """
        m, n = X.shape
        cvInds = Sampling.randCrossValidation(self.folds, X.nnz)
        localAucs = numpy.zeros(
            (self.ks.shape[0], self.lmbdas.shape[0], len(cvInds)))

        logging.debug("Performing model selection")
        paramList = []

        for icv, (trainInds, testInds) in enumerate(cvInds):
            Util.printIteration(icv, 1, self.folds, "Fold: ")

            trainX = SparseUtils.submatrix(X, trainInds)
            testX = SparseUtils.submatrix(X, testInds)

            testOmegaList = SparseUtils.getOmegaList(testX)

            for i, k in enumerate(self.ks):
                maxLocalAuc = self.copy()
                maxLocalAuc.k = k
                paramList.append((trainX, testX, testOmegaList, maxLocalAuc))

        pool = multiprocessing.Pool(processes=self.numProcesses,
                                    maxtasksperchild=100)
        resultsIterator = pool.imap(localAucsLmbdas, paramList, self.chunkSize)
        #import itertools
        #resultsIterator = itertools.imap(localAucsLmbdas, paramList)

        for icv, (trainInds, testInds) in enumerate(cvInds):
            for i, k in enumerate(self.ks):
                tempAucs = resultsIterator.next()
                localAucs[i, :, icv] = tempAucs

        pool.terminate()

        meanLocalAucs = numpy.mean(localAucs, 2)
        stdLocalAucs = numpy.std(localAucs, 2)

        logging.debug(meanLocalAucs)

        k = self.ks[numpy.unravel_index(numpy.argmax(meanLocalAucs),
                                        meanLocalAucs.shape)[0]]
        lmbda = self.lmbdas[numpy.unravel_index(numpy.argmax(meanLocalAucs),
                                                meanLocalAucs.shape)[1]]

        logging.debug("Model parameters: k=" + str(k) + " lmbda=" + str(lmbda))

        self.k = k
        self.lmbda = lmbda

        return meanLocalAucs, stdLocalAucs
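A toy illustration of the pool.imap pattern used above, assuming only the standard library: results come back in submission order, which is what lets the consumer loops above pair each result with its (fold, k) combination.

import multiprocessing

def square(x):
    return x * x

if __name__ == "__main__":
    pool = multiprocessing.Pool(processes=2, maxtasksperchild=100)
    resultsIterator = pool.imap(square, range(6))
    print(list(resultsIterator))  # [0, 1, 4, 9, 16, 25], in submission order
    pool.terminate()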
Example #10
    def learnModel(self, graph):
        """
        Learn a prediction model based on considering ego networks as independent.
        For each ego, X contains a list of neighbours and the corresponding labels
        are the values of the edge labels. We then find the set of primal weights
        w for each ego network and then regress onto the set of weights using the
        ego labels.

        :param graph: The input graph to learn from.
        :type graph: class:`apgl.graph.AbstractSingleGraph`
        """

        logging.info("Learning model on graph of size " +
                     str(graph.getNumVertices()))
        logging.info("EgoLearner: " + str(self.egoRegressor))
        logging.info("AlterLearner: " + str(self.alterRegressor))

        allIndices = numpy.arange(0, graph.getNumVertices())
        V = graph.getVertexList().getVertices(list(allIndices))
        W = numpy.zeros((0, graph.getVertexList().getNumFeatures()))
        Xe = numpy.zeros((0, graph.getVertexList().getNumFeatures()))
        printStep = numpy.floor(graph.getNumVertices() / 10)
        alterError = 0.0

        for i in range(graph.getNumVertices()):
            Util.printIteration(i, printStep, graph.getNumVertices())
            neighbours = graph.neighbours(i)

            if neighbours.shape[0] != 0:
                X = V[neighbours, :]
                y = numpy.ones(X.shape[0])

                for j in range(neighbours.shape[0]):
                    y[j] = graph.getEdge(i, neighbours[j])

                w = self.alterRegressor.learnModel(X, y)
                #alterError = numpy.mean(numpy.abs(self.alterRegressor.predict(X) - y))

                W = numpy.r_[W, numpy.array([w])]
                Xe = numpy.r_[Xe, numpy.array([V[i, :]])]

        #Now we need to solve least squares to find the regressor of Xe onto W
        logging.info(
            "Finding regression matrix onto weights using matrix of size " +
            str(Xe.shape))
        gc.collect()
        #self.standardiser = Standardiser()
        #self.standardiser2 = Standardiser()
        #Xe = self.standardiser.standardiseArray(Xe)
        #W = self.standardiser2.standardiseArray(W)
        self.egoRegressor.learnModel(Xe, W)

        return W
Example #11
    def learnModel(self, graph):
        """
        Learn a prediction model based on considering ego networks as independent.
        For each ego, X contains a list of neighbours and the corresponding labels
        are the values of the edge labels. We then find the set of primal weights
        w for each ego network and then regress onto the set of weights using the
        ego labels.

        :param graph: The input graph to learn from.
        :type graph: class:`apgl.graph.AbstractSingleGraph`
        """

        logging.info("Learning model on graph of size " + str(graph.getNumVertices()))
        logging.info("EgoLearner: " + str(self.egoRegressor))
        logging.info("AlterLearner: " + str(self.alterRegressor))

        allIndices = numpy.arange(0, graph.getNumVertices())
        V = graph.getVertexList().getVertices(list(allIndices))
        W = numpy.zeros((0, graph.getVertexList().getNumFeatures()))
        Xe  =  numpy.zeros((0, graph.getVertexList().getNumFeatures()))
        printStep = numpy.floor(graph.getNumVertices()/10)
        alterError = 0.0 

        for i in range(graph.getNumVertices()):
            Util.printIteration(i, printStep, graph.getNumVertices())
            neighbours = graph.neighbours(i)

            if neighbours.shape[0] != 0:
                X = V[neighbours, :]
                y = numpy.ones(X.shape[0])

                for j in range(neighbours.shape[0]):
                    y[j] = graph.getEdge(i, neighbours[j])


                w = self.alterRegressor.learnModel(X, y)
                #alterError = numpy.mean(numpy.abs(self.alterRegressor.predict(X) - y))

                W = numpy.r_[W, numpy.array([w])]
                Xe = numpy.r_[Xe, numpy.array([V[i, :]])]

        #Now we need to solve least squares to find the regressor of Xe onto W
        logging.info("Finding regression matrix onto weights using matrix of size " + str(Xe.shape))
        gc.collect()
        #self.standardiser = Standardiser()
        #self.standardiser2 = Standardiser()
        #Xe = self.standardiser.standardiseArray(Xe)
        #W = self.standardiser2.standardiseArray(W)
        self.egoRegressor.learnModel(Xe, W)


        return W 
Example #12
    def evaluateCvOuter(self, X, Y, folds, leafRank):
        """
        Run cross validation and output some ROC curves. In this case Y is a 1D array.
        """
        Parameter.checkClass(X, numpy.ndarray)
        Parameter.checkClass(Y, numpy.ndarray)
        Parameter.checkInt(folds, 2, float('inf'))
        if Y.ndim != 1:
            raise ValueError("Expecting Y to be 1D")

        indexList = cross_val.StratifiedKFold(Y, folds)
        self.setLeafRank(leafRank)

        bestParams = []
        bestTrainAUCs = numpy.zeros(folds)
        bestTrainROCs = []
        bestTestAUCs = numpy.zeros(folds)
        bestTestROCs = []
        bestMetaDicts = []
        i = 0

        for trainInds, testInds in indexList:
            Util.printIteration(i, 1, folds)
            trainX, trainY = X[trainInds, :], Y[trainInds]
            testX, testY = X[testInds, :], Y[testInds]

            logging.debug("Distribution of labels in train: " +
                          str(numpy.bincount(trainY)))
            logging.debug("Distribution of labels in test: " +
                          str(numpy.bincount(testY)))

            self.learnModel(trainX, trainY)
            predTrainY = self.predict(trainX)
            predTestY = self.predict(testX)
            bestTrainAUCs[i] = Evaluator.auc(predTrainY, trainY)
            bestTestAUCs[i] = Evaluator.auc(predTestY, testY)

            #Store the parameters and ROC curves
            bestTrainROCs.append(Evaluator.roc(trainY, predTrainY))
            bestTestROCs.append(Evaluator.roc(testY, predTestY))

            metaDict = {}
            bestMetaDicts.append(metaDict)

            i += 1

        logging.debug("Mean test AUC = " + str(numpy.mean(bestTestAUCs)))
        logging.debug("Std test AUC = " + str(numpy.std(bestTestAUCs)))
        allMetrics = [bestTrainAUCs, bestTrainROCs, bestTestAUCs, bestTestROCs]

        return (bestParams, allMetrics, bestMetaDicts)
Example #13
    def supervisedMC22(lists, itemList, topQList, verbose=False): 
        """
        A supervised version of MC2 of our own invention. The idea is to find a 
        linear combination of transition matrices to fit a given one. 
        """
        import cvxopt
        import cvxopt.solvers
        ell = len(lists)
        n = len(itemList)
        outputList, scores, PList = RankAggregator.MC2(lists, itemList, verbose=True)
        
        Q = cvxopt.spmatrix([], [], [], (n*n, len(lists)))

        for i, P in enumerate(PList): 
            #print(P.todense())
            Q[:, i] = cvxopt.matrix(numpy.array(P.todense()).ravel()) 
            
        QQ = Q.T * Q
        
        Py = RankAggregator.generateTransitionMatrix(topQList, itemList)
        s = numpy.array(Py.todense()).ravel()
        s = cvxopt.matrix(s)
        
        G = cvxopt.spdiag((-numpy.ones(ell)).tolist())
        h = cvxopt.matrix(numpy.zeros(ell))
        
        A = cvxopt.matrix(numpy.ones(ell), (1, ell))
        b = cvxopt.matrix(numpy.ones(1))        
                
        q = -Q.T * s  
        
        sol = cvxopt.solvers.qp(QQ, q, G, h, A, b)
        
        alpha = numpy.array(sol['x'])
        
        #Combine the matrices 
        P = numpy.zeros((n, n))       
        
        for j, Pj in enumerate(PList): 
            Util.printIteration(j, 1, ell)
            P += alpha[j] * numpy.array(Pj.todense()) 

        P /= ell 
        
        outputList, scores = RankAggregator.computeOutputList(P, itemList)
        
        if verbose: 
            return outputList, scores, PList
        else: 
            return outputList, scores
Example #14
    def modelSelect(self, X): 
        """
        Perform model selection on X and return the best parameters. 
        """
        m, n = X.shape
        cvInds = Sampling.randCrossValidation(self.folds, X.nnz)
        localAucs = numpy.zeros((self.ks.shape[0], self.lmbdas.shape[0], len(cvInds)))
        
        logging.debug("Performing model selection")
        paramList = []        
        
        for icv, (trainInds, testInds) in enumerate(cvInds):
            Util.printIteration(icv, 1, self.folds, "Fold: ")

            trainX = SparseUtils.submatrix(X, trainInds)
            testX = SparseUtils.submatrix(X, testInds)
            
            testOmegaList = SparseUtils.getOmegaList(testX)
            
            for i, k in enumerate(self.ks): 
                maxLocalAuc = self.copy()
                maxLocalAuc.k = k
                paramList.append((trainX, testX, testOmegaList, maxLocalAuc))
                    
        pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=100)
        resultsIterator = pool.imap(localAucsLmbdas, paramList, self.chunkSize)
        #import itertools
        #resultsIterator = itertools.imap(localAucsLmbdas, paramList)
        
        for icv, (trainInds, testInds) in enumerate(cvInds):        
            for i, k in enumerate(self.ks): 
                tempAucs = resultsIterator.next()
                localAucs[i, :, icv] = tempAucs
        
        pool.terminate()
        
        meanLocalAucs = numpy.mean(localAucs, 2)
        stdLocalAucs = numpy.std(localAucs, 2)
        
        logging.debug(meanLocalAucs)
        
        k = self.ks[numpy.unravel_index(numpy.argmax(meanLocalAucs), meanLocalAucs.shape)[0]]
        lmbda = self.lmbdas[numpy.unravel_index(numpy.argmax(meanLocalAucs), meanLocalAucs.shape)[1]]
        
        logging.debug("Model parameters: k=" + str(k) + " lmbda=" + str(lmbda))
        
        self.k = k 
        self.lmbda = lmbda 
        
        return meanLocalAucs, stdLocalAucs
Example #15
    def learnModel(self, graph):
        """
        Learn a prediction model based on all of the edges of the input graph.
        For each ego, X contains a list of neighbours and non-neighbours in the same
        ratio, and y = 1 for a neighbour and -1 otherwise. We then find the set of
        primal weights w for each ego network and then regress onto the set of weights
        using the ego labels.

        One can either learn by comparing neighbours and non-neighbours, or alternatively
        using the labels of edges and making prediction on unlabelled edges. 

        :param graph: The input graph to learn from.
        :type graph: class:`apgl.graph.AbstractSingleGraph`

        :param randomNegLabel: How to compute edge labels, False means use the labels
        themselves, and True means randomly pick non-neighbours to have -1 labels
        :type randomNegLabel: class `bool`
        """

        Parameter.checkInt(self.windowSize, 1, graph.getNumVertices())
        self.graph = graph
        logging.info("Learning model on graph of size " +
                     str(graph.getNumVertices()))

        allIndices = numpy.arange(0, graph.getNumVertices())
        V = graph.getVertexList().getVertices(allIndices)
        W = numpy.zeros((0, graph.getVertexList().getNumFeatures()))
        Xe = numpy.zeros((0, graph.getVertexList().getNumFeatures()))
        printStep = numpy.floor(graph.getNumVertices() / 10)

        for i in range(graph.getNumVertices()):
            Util.printIteration(i, printStep, graph.getNumVertices())
            neighbours = graph.neighbours(i)

            if neighbours.shape[0] != 0:
                compNeighbours = numpy.setdiff1d(allIndices, neighbours)
                perm = numpy.random.permutation(
                    compNeighbours.shape[0])[0:neighbours.shape[0]]
                negativeVertices = V[compNeighbours[perm], :]
                X = numpy.r_[V[neighbours, :], negativeVertices]
                y = numpy.ones(X.shape[0])
                y[neighbours.shape[0]:] = -1

                w = self.alterRegressor.learnModel(X, y)
                W = numpy.r_[W, numpy.array([w])]
                Xe = numpy.r_[Xe, numpy.array([V[i, :]])]

        #Now we need to solve least squares to find the regressor of Xe onto W
        self.egoRegressor.learnModel(Xe, W)
Example #16
    def evaluateCvOuter(self, X, Y, folds, leafRank):
        """
        Run cross validation and output some ROC curves. In this case Y is a 1D array.
        """
        Parameter.checkClass(X, numpy.ndarray)
        Parameter.checkClass(Y, numpy.ndarray)
        Parameter.checkInt(folds, 2, float('inf'))
        if Y.ndim != 1:
            raise ValueError("Expecting Y to be 1D")

        indexList = cross_val.StratifiedKFold(Y, folds)
        self.setLeafRank(leafRank)

        bestParams = []
        bestTrainAUCs = numpy.zeros(folds)
        bestTrainROCs = []
        bestTestAUCs = numpy.zeros(folds)
        bestTestROCs = []
        bestMetaDicts = []
        i = 0

        for trainInds, testInds in indexList:
            Util.printIteration(i, 1, folds)
            trainX, trainY = X[trainInds, :], Y[trainInds]
            testX, testY = X[testInds, :], Y[testInds]

            logging.debug("Distribution of labels in train: " + str(numpy.bincount(trainY)))
            logging.debug("Distribution of labels in test: " + str(numpy.bincount(testY)))

            self.learnModel(trainX, trainY)
            predTrainY = self.predict(trainX)
            predTestY = self.predict(testX)
            bestTrainAUCs[i] = Evaluator.auc(predTrainY, trainY)
            bestTestAUCs[i] = Evaluator.auc(predTestY, testY)

            #Store the parameters and ROC curves
            bestTrainROCs.append(Evaluator.roc(trainY, predTrainY))
            bestTestROCs.append(Evaluator.roc(testY, predTestY))

            metaDict = {}
            bestMetaDicts.append(metaDict)

            i += 1

        logging.debug("Mean test AUC = " + str(numpy.mean(bestTestAUCs)))
        logging.debug("Std test AUC = " + str(numpy.std(bestTestAUCs)))
        allMetrics = [bestTrainAUCs, bestTrainROCs, bestTestAUCs, bestTestROCs]

        return (bestParams, allMetrics, bestMetaDicts)
Example #17
    def modelSelect(self, X): 
        """
        Perform model selection on X and return the best parameters. 
        """
        m, n = X.shape
        cvInds = Sampling.randCrossValidation(self.folds, X.nnz)
        precisions = numpy.zeros((self.ks.shape[0], len(cvInds)))
        
        logging.debug("Performing model selection")
        paramList = []        
        
        for icv, (trainInds, testInds) in enumerate(cvInds):
            Util.printIteration(icv, 1, self.folds, "Fold: ")

            trainX = SparseUtils.submatrix(X, trainInds)
            testX = SparseUtils.submatrix(X, testInds)
            
            testOmegaList = SparseUtils.getOmegaList(testX)
            
            for i, k in enumerate(self.ks): 
                learner = self.copy()
                learner.k = k
                paramList.append((trainX, testX, testOmegaList, learner))
                    
        #pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=100)
        #resultsIterator = pool.imap(computePrecision, paramList, self.chunkSize)
        import itertools
        resultsIterator = itertools.imap(computePrecision, paramList)
        
        for icv, (trainInds, testInds) in enumerate(cvInds):        
            for i, k in enumerate(self.ks): 
                tempPrecision = resultsIterator.next()
                precisions[i, icv] = tempPrecision
        
        #pool.terminate()
        
        meanPrecisions = numpy.mean(precisions, 1)
        stdPrecisions = numpy.std(precisions, 1)
        
        logging.debug(meanPrecisions)
        
        k = self.ks[numpy.argmax(meanPrecisions)]

        logging.debug("Model parameters: k=" + str(k))
        
        self.k = k 
        
        return meanPrecisions, stdPrecisions
Example #18
    def generate_data_file(dir, nb_user=None):
        logging.debug("nb_user: "******"creating file " + str(f_data_name))
            shutil.copy(BemolData.get_file_name(dir, None), f_data_name)

        # other files to generate
        nb_user_to_generate = []
        current_nb_user = BemolData.get_nb_user_to_read(nb_user)
        logging.debug("current_nb_user before while: " + str(current_nb_user))
        # !!!!! security failure TOCTTOU
        while (not os.path.exists(BemolData.get_file_name(dir, current_nb_user))):
            logging.debug("current_nb_user in while: " + str(current_nb_user))
            nb_user_to_generate.append(current_nb_user)
            current_nb_user = BemolData.get_nb_user_to_read(current_nb_user+1)
        nb_user_to_generate.reverse()

    
        # generate other files
        for current_nb_user in nb_user_to_generate:
            # read data
            f_existing_data_name = BemolData.get_file_name(dir, current_nb_user+1)
            f_to_create_data_name = BemolData.get_file_name(dir, current_nb_user)
            logging.info("creating file " + f_to_create_data_name)
            dict_user = MyDictionary()
            try:
                f_existing_data = gzip.open(f_existing_data_name, 'rb')
                f_to_create_data = gzip.open(f_to_create_data_name, 'wb')

                i = 0
                i_max = BemolData.get_nb_line(f_existing_data_name)
                for line in f_existing_data:
                    Util.printIteration(i, 1000, i_max); i += 1
                    m = re.match(r"(\d+)\s(\d+)\s(\d+)\s(\d+)", line)
                    if dict_user.index(int(m.group(1))) < current_nb_user:
                        f_to_create_data.write(line)
            except IOError as error:
                if error.filename == f_existing_data_name:
                    raise RGIOError(error, RGIOError.indent() + 'it disappeared in the meanwhile')
                else:
                    raise error
Example #19
    def learnModel(self, graph):
        """
        Learn a prediction model based on all of the edges of the input graph.
        For each ego, X contains a list of neighbours and non-neighbours in the same
        ratio, and y = 1 for a neighbour and -1 otherwise. We then find the set of
        primal weights w for each ego network and then regress onto the set of weights
        using the ego labels.

        One can either learn by comparing neighbours and non-neighbours, or alternatively
        using the labels of edges and making prediction on unlabelled edges. 

        :param graph: The input graph to learn from.
        :type graph: class:`apgl.graph.AbstractSingleGraph`

        :param randomNegLabel: How to compute edge labels, False means use the labels
        themselves, and True means randomly pick non-neighbours to have -1 labels
        :type randomNegLabel: class `bool`
        """

        Parameter.checkInt(self.windowSize, 1, graph.getNumVertices())
        self.graph = graph
        logging.info("Learning model on graph of size " + str(graph.getNumVertices()))

        allIndices = numpy.arange(0, graph.getNumVertices())
        V = graph.getVertexList().getVertices(allIndices)
        W = numpy.zeros((0, graph.getVertexList().getNumFeatures()))
        Xe  =  numpy.zeros((0, graph.getVertexList().getNumFeatures()))
        printStep = numpy.floor(graph.getNumVertices()/10)

        for i in range(graph.getNumVertices()):
            Util.printIteration(i, printStep, graph.getNumVertices())
            neighbours = graph.neighbours(i)

            if neighbours.shape[0] != 0:
                compNeighbours = numpy.setdiff1d(allIndices, neighbours)
                perm = numpy.random.permutation(compNeighbours.shape[0])[0:neighbours.shape[0]]
                negativeVertices = V[compNeighbours[perm], :]
                X = numpy.r_[V[neighbours, :], negativeVertices]
                y = numpy.ones(X.shape[0])
                y[neighbours.shape[0]:] = -1
 
                w = self.alterRegressor.learnModel(X, y)
                W = numpy.r_[W, numpy.array([w])]
                Xe = numpy.r_[Xe, numpy.array([V[i, :]])]

        #Now we need to solve least squares to find the regressor of Xe onto W
        self.egoRegressor.learnModel(Xe, W)
Example #20
 def coauthorsGraphFromAuthors(self, relevantExperts): 
     """
     Take a set of relevant authors and return the graph. 
     """
     dataFile = open(self.dataFilename)  
     authorIndexer = IdIndexer()
     author1Inds = array.array("i")
     author2Inds = array.array("i")
     
     for relevantExpert in relevantExperts: 
         authorIndexer.append(relevantExpert)
     
     for i, line in enumerate(dataFile):
         Util.printIteration(i, self.stepSize, self.numLines)
         authors = re.findall("#@(.*)", line)  
                         
         if len(authors) != 0: 
             authors = set([x.strip() for x in authors[0].split(",")]) 
             if len(authors.intersection(relevantExperts)) != 0: 
                 iterator = itertools.combinations(authors, 2)
             
                 for author1, author2 in iterator: 
                     if author1 in relevantExperts and author2 in relevantExperts: 
                         author1Ind = authorIndexer.append(author1) 
                         author2Ind = authorIndexer.append(author2)
                             
                         author1Inds.append(author1Ind)
                         author2Inds.append(author2Ind)
     
     logging.debug("Found " + str(len(authorIndexer.getIdDict())) + " coauthors")
                            
     #Coauthor graph is undirected 
     author1Inds = numpy.array(author1Inds, numpy.int)
     author2Inds = numpy.array(author2Inds, numpy.int)
     edges = numpy.c_[author1Inds, author2Inds]            
     
     graph = igraph.Graph()
     graph.add_vertices(len(authorIndexer.getIdDict()))
     graph.add_edges(edges)
     graph.es["weight"] = numpy.ones(graph.ecount())
     graph.simplify(combine_edges=sum)   
     graph.es["invWeight"] = 1.0/(numpy.array(graph.es["weight"])) 
     
     return graph, authorIndexer
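A toy version of the final graph construction above, showing how simplify(combine_edges=sum) merges duplicate coauthor edges into weights; the three-vertex graph is invented.

import numpy
import igraph

graph = igraph.Graph()
graph.add_vertices(3)
graph.add_edges([(0, 1), (0, 1), (1, 2)])  # authors 0 and 1 coauthored twice
graph.es["weight"] = numpy.ones(graph.ecount())
graph.simplify(combine_edges=sum)
print(graph.es["weight"])                    # [2.0, 1.0]: duplicates collapsed, weights summed
print(1.0/numpy.array(graph.es["weight"]))   # the corresponding invWeight values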
Example #21
    def MC2(lists, itemList, alpha=None, verbose=False): 
        """
        Perform weighted rank aggregation using MC2 as given in Rank Aggregation Methods 
        for the Web, Dwork et al. The weighting vector is given by alpha. 
        
        :param lists: A list of lists. Each sublist is an ordered set of a subset of the items from itemList 
        
        :param itemList: A list of all possible items 
        
        :param alpha: A vector of weights for the transition matrices 
        """
        
        n = len(itemList)
        ell = len(lists)
        
        if alpha is None: 
            alpha = numpy.ones(ell)/ell
        
        #P = numpy.zeros((n, n))
        P = scipy.sparse.csr_matrix((n, n))        
        PList = [] 
        
        logging.debug("Computing permutation matrices")
        for j, lst in enumerate(lists): 
            Util.printIteration(j, 1, ell)
            Pj = RankAggregator.generateTransitionMatrix(lst, itemList)

            P = P + alpha[j] * Pj 
            PList.append(Pj)
        
        P /= ell 
        logging.debug("Done")

        outputList, scores = RankAggregator.computeOutputList(P, itemList)
        
        if verbose: 
            return outputList, scores, PList
        else: 
            return outputList, scores
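computeOutputList is not shown in this collection, but MC2 ranks items by the stationary distribution of the Markov chain defined by P. A sketch using the same eigs call as supervisedMC23 above, on a made-up 4-item transition matrix:

import numpy
import scipy.sparse
import scipy.sparse.linalg

P = scipy.sparse.csr_matrix(numpy.array([[0.5, 0.5, 0.0, 0.0],
                                         [0.25, 0.25, 0.5, 0.0],
                                         [0.0, 0.5, 0.25, 0.25],
                                         [0.0, 0.0, 0.5, 0.5]]))

u, v = scipy.sparse.linalg.eigs(P.T, 1)   # leading left eigenvector of P
pi = numpy.abs(numpy.real(numpy.array(v).flatten()))
pi /= pi.sum()                            # stationary distribution
print(numpy.argsort(-pi))                 # items ordered by stationary mass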
Example #22
    def predictEdges(self, vertexIndices):
        """
        This makes a prediction for a series of edges using the Jacard Index.
        Returns a matrix with rows are a ranked list of verticies of length windowSize.
        """
        """
        The score is the |n(x) \cup n(y)|/|n(x) \cap n(y)|. This is faster than
        the other method. 
        """
        logging.info("Running predictEdges in " + str(self.__class__.__name__))
        printStep = 50

        P = numpy.zeros((vertexIndices.shape[0], self.windowSize))
        S = numpy.zeros((vertexIndices.shape[0], self.windowSize))
        W = self.graph.getWeightMatrix()

        for i in range(vertexIndices.shape[0]):
            Util.printIteration(i, printStep, vertexIndices.shape[0])
            scores = numpy.zeros(self.graph.getNumVertices())

            #Maybe something like this:
            #WI = W[vertexIndices[i], :] + W
            #WU = W[vertexIndices[i], :] * W

            for j in range(0, self.graph.getNumVertices()):
                scores[j] = numpy.nonzero(W[vertexIndices[i], :] +
                                          W[j, :])[0].shape[0]

                if scores[j] != 0:
                    scores[j] = numpy.nonzero(
                        W[vertexIndices[i], :] * W[j, :])[0].shape[0] / float(
                            scores[j])

            P[i, :], S[i, :] = self.indicesFromScores(vertexIndices[i], scores)

        return P, S
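A standalone check of the two nonzero counts used above, on a toy weight matrix: W[x] + W[y] gives the neighbourhood union and W[x] * W[y] the intersection.

import numpy

W = numpy.array([[0., 1., 1., 0.],
                 [1., 0., 1., 1.],
                 [1., 1., 0., 0.],
                 [0., 1., 0., 0.]])

x, y = 0, 3
union = numpy.nonzero(W[x, :] + W[y, :])[0].shape[0]         # |n(x) u n(y)| = 2
intersection = numpy.nonzero(W[x, :] * W[y, :])[0].shape[0]  # |n(x) n n(y)| = 1
print(intersection/float(union))  # Jaccard index 0.5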
Example #23
 def matchExperts(self): 
     expertsSet = self.loadExperts(self.expertsFileName)
     
     if not os.path.exists(self.expertMatchesFilename): 
         inFile = open(self.xmlCleanFilename)    
         expertMatches = set([])
         i = 0 
         
         for line in inFile:
             Util.printIteration(i, self.stepSize, self.numLines)
             if i % self.stepSize == 0: 
                 logging.debug(expertMatches)
                 
             author = re.findall("<author>(.*)</author>", line)  
             if len(author) != 0: 
                 possibleMatches = difflib.get_close_matches(author[0], expertsSet, cutoff=self.matchCutoff)
                 if len(possibleMatches) != 0: 
                     expertMatches.add(author[0])
                     expertsSet.remove(possibleMatches[0])
                     
                     if len(expertsSet) == 0: 
                         logging.debug("Found all experts, breaking")
                         break 
             
             i += 1
         
         expertMatches = sorted(list(expertMatches))
         expertMatchesFile = open(self.expertMatchesFilename, "w")
         
         for expert in expertMatches: 
             expertMatchesFile.write(expert + "\n")
         expertMatchesFile.close()
         
         logging.debug("All done")
     else: 
         logging.debug("File already generated: " + self.expertMatchesFilename)
Example #24
    def predictEdges(self, vertexIndices):
        """
        This makes a prediction for a series of edges using the Jacard Index.
        Returns a matrix with rows are a ranked list of verticies of length windowSize.
        """

        """
        The score is the |n(x) \cup n(y)|/|n(x) \cap n(y)|. This is faster than
        the other method. 
        """
        logging.info("Running predictEdges in " + str(self.__class__.__name__))
        printStep = 50 

        P = numpy.zeros((vertexIndices.shape[0], self.windowSize))
        S = numpy.zeros((vertexIndices.shape[0], self.windowSize))
        W = self.graph.getWeightMatrix()


        for i in range(vertexIndices.shape[0]):
            Util.printIteration(i, printStep, vertexIndices.shape[0])
            scores = numpy.zeros(self.graph.getNumVertices())

            #Maybe something like this: 
            #WI = W[vertexIndices[i], :] + W
            #WU = W[vertexIndices[i], :] * W

            for j in range(0, self.graph.getNumVertices()):
                scores[j] = numpy.nonzero(W[vertexIndices[i], :] + W[j, :])[0].shape[0]

                if scores[j] != 0:
                    scores[j] = numpy.nonzero(W[vertexIndices[i], :] * W[j, :])[0].shape[0]/float(scores[j])

            
            P[i, :], S[i, :] = self.indicesFromScores(vertexIndices[i], scores)

        return P, S
Example #25
    def processRatings(self): 
        """
        Convert the dataset into a matrix and save the results for faster 
        access. 
        """
        if not os.path.exists(self.ratingFileName) or not os.path.exists(self.custDictFileName): 
            dataDir = PathDefaults.getDataDir() + "movielens/"

            logging.debug("Processing ratings given in " + dataDir)

            custIdDict = {} 
            custIdSet = set([])    
            
            movieIdDict = {} 
            movieIdSet = set([])
            
            movieInds = array.array("I")
            custInds = array.array("I")
            ratings = array.array("f")
            dates = array.array("L")
            i = 0            
            j = 0
            
            itr = 0 
            ratingsFile = open(dataDir + "ratings.dat")
            
            for line in ratingsFile: 
                Util.printIteration(itr, 100000, self.numRatings)
                vals = line.split("::")
                
                custId = int(vals[0])
                
                if custId not in custIdSet: 
                    custIdSet.add(custId)
                    custIdDict[custId] = j
                    custInd = j 
                    j += 1 
                else: 
                    custInd = custIdDict[custId]
                    
                movieId = int(vals[1])
                
                if movieId not in movieIdSet: 
                    movieIdSet.add(movieId)
                    movieIdDict[movieId] = i
                    movieInd = i 
                    i += 1 
                else: 
                    movieInd = movieIdDict[movieId]
                    
                rating = float(vals[2])     
                time = int(vals[3])
            
                movieInds.append(movieInd)
                custInds.append(custInd)   
                ratings.append(rating)
                dates.append(time)
                itr += 1 
                    
            movieInds = numpy.array(movieInds, numpy.uint32)
            custInds = numpy.array(custInds, numpy.uint32)
            ratings = numpy.array(ratings, numpy.float)
            dates = numpy.array(dates, numpy.uint32)
            
            assert ratings.shape[0] == self.numRatings            
            
            numpy.savez(self.ratingFileName, movieInds, custInds, ratings, dates) 
            logging.debug("Saved ratings file as " + self.ratingFileName)
            
            pickle.dump(custIdDict, open(self.custDictFileName, 'wb'))
            logging.debug("Saved custIdDict as " + self.custDictFileName)
            
            pickle.dump(movieIdDict, open(self.movieDictFileName, 'wb'))
            logging.debug("Saved movieIdDict as " + self.movieDictFileName)
        else: 
            logging.debug("Ratings file " + str(self.ratingFileName) + " already processed")
Example #26
    def modelSelect(self, X, rhos, ks, cvInds):
        """
        Pick a value of rho based on a single matrix X. We do cross validation
        within, and return the best value of lambda (according to the mean
        squared error). The rhos must be in decreasing order and we use 
        warm restarts. 
        """
        if (numpy.flipud(numpy.sort(rhos)) != rhos).any(): 
            raise ValueError("rhos must be in descending order")    

        errors = numpy.zeros((rhos.shape[0], ks.shape[0], len(cvInds)))
        
        if self.metric == "mse": 
            metricFunction = learnPredictMSE
        elif self.metric == "f1" or self.metric == "mrr": 
            metricFunction = learnPredictRanking
        else: 
            raise ValueError("Unknown metric: " + self.metric)
            
        for i, (trainInds, testInds) in enumerate(cvInds):
            Util.printIteration(i, 1, len(cvInds), "Fold: ")
            trainX = SparseUtils.submatrix(X, trainInds)
            testX = SparseUtils.submatrix(X, testInds)

            assert trainX.nnz == trainInds.shape[0]
            assert testX.nnz == testInds.shape[0]
            #nptst.assert_array_almost_equal((testX+trainX).data, X.data)

            paramList = []
        
            for m, k in enumerate(ks): 
                learner = self.copy()
                learner.updateAlg="initial" 
                learner.setK(k)
                paramList.append((learner, trainX, testX, rhos)) 
            
            if self.numProcesses != 1: 
                pool = multiprocessing.Pool(processes=multiprocessing.cpu_count()//2, maxtasksperchild=10)
                results = pool.imap(metricFunction, paramList)
            else: 
                results = itertools.imap(metricFunction, paramList)
            
            for m, rhoErrors in enumerate(results): 
                errors[:, m, i] = rhoErrors
            
            if self.numProcesses != 1: 
                pool.terminate()

        meanMetrics = errors.mean(2)
        stdMetrics = errors.std(2)
        
        logging.debug(meanMetrics)
        
        #Set the parameters 
        if self.metric == "mse": 
            self.setRho(rhos[numpy.unravel_index(numpy.argmin(meanMetrics), meanMetrics.shape)[0]]) 
            self.setK(ks[numpy.unravel_index(numpy.argmin(meanMetrics), meanMetrics.shape)[1]])
        elif self.metric == "f1" or self.metric == "mrr": 
            self.setRho(rhos[numpy.unravel_index(numpy.argmax(meanMetrics), meanMetrics.shape)[0]]) 
            self.setK(ks[numpy.unravel_index(numpy.argmax(meanMetrics), meanMetrics.shape)[1]])
            
        logging.debug("Model parameters: k=" + str(self.k) + " rho=" + str(self.rho))

        return meanMetrics, stdMetrics
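How the argmin/unravel_index lines above map the best grid entry back to (rho, k), on a toy error grid:

import numpy

meanMetrics = numpy.array([[0.5, 0.4],   # rows indexed by rho
                           [0.3, 0.6]])  # columns indexed by k
rhos = numpy.array([1.0, 0.1])
ks = numpy.array([4, 8])

best = numpy.unravel_index(numpy.argmin(meanMetrics), meanMetrics.shape)
print(rhos[best[0]], ks[best[1]])  # 0.1 4: entry (1, 0) holds the smallest error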
Example #27
    def modelSelect2(self, X, rhos, ks, cvInds, colProbs=None):
        """
        Pick a value of rho based on a single matrix X. We do cross validation
        within, and return the best value of lambda (according to the mean
        squared error). The rhos must be in decreasing order and we use 
        warm restarts. In this case we remove a few non zeros from each row 
        to form the test set. 
        """
        if (numpy.flipud(numpy.sort(rhos)) != rhos).any():
            raise ValueError("rhos must be in descending order")

        trainTestXs = Sampling.shuffleSplitRows(X,
                                                self.folds,
                                                self.validationSize,
                                                csarray=False,
                                                rowMajor=False,
                                                colProbs=colProbs)
        metrics = numpy.zeros((rhos.shape[0], ks.shape[0], len(cvInds)))

        if self.metric == "mse":
            metricFunction = learnPredictMSE
        elif self.metric == "f1" or self.metric == "mrr":
            metricFunction = learnPredictRanking
        else:
            raise ValueError("Unknown metric: " + self.metric)

        paramList = []

        for i, (trainX, testX) in enumerate(trainTestXs):
            Util.printIteration(i, 1, len(cvInds), "Fold: ")

            for m, k in enumerate(ks):
                learner = self.copy()
                learner.updateAlg = "initial"
                learner.setK(k)
                paramList.append((learner, trainX, testX, rhos))

        if self.numProcesses != 1:
            pool = multiprocessing.Pool(processes=self.numProcesses,
                                        maxtasksperchild=10)
            resultsIter = pool.imap(metricFunction, paramList)
        else:
            resultsIter = itertools.imap(metricFunction, paramList)

        for i, (trainX, testX) in enumerate(trainTestXs):
            for m, k in enumerate(ks):
                metrics[:, m, i] = resultsIter.next()

        if self.numProcesses != 1:
            pool.terminate()

        meanMetrics = metrics.mean(2)
        stdMetrics = metrics.std(2)

        logging.debug("ks=" + str(ks))
        logging.debug("rhos=" + str(rhos))
        logging.debug(meanMetrics)

        #Set the parameters
        if self.metric == "mse":
            self.setRho(rhos[numpy.unravel_index(numpy.argmin(meanMetrics),
                                                 meanMetrics.shape)[0]])
            self.setK(ks[numpy.unravel_index(numpy.argmin(meanMetrics),
                                             meanMetrics.shape)[1]])
        elif self.metric == "f1" or self.metric == "mrr":
            self.setRho(rhos[numpy.unravel_index(numpy.argmax(meanMetrics),
                                                 meanMetrics.shape)[0]])
            self.setK(ks[numpy.unravel_index(numpy.argmax(meanMetrics),
                                             meanMetrics.shape)[1]])

        logging.debug("Model parameters: k=" + str(self.k) + " rho=" +
                      str(self.rho))

        return meanMetrics, stdMetrics
Example #28
    def testGetIterator(self):
        generator = CitationIterGenerator()
        iterator = generator.getIterator()

        lastW = iterator.next()

        for W in iterator:
            self.assertTrue((W-W.T).getnnz() == 0)
            self.assertTrue((lastW - W[0:lastW.shape[0], 0:lastW.shape[0]]).getnnz() == 0)
            lastW = W

        numVertices = W.shape[0]

        #Now compute the vertexIds manually:
        dataDir = PathDefaults.getDataDir() + "cluster/"
        edgesFilename = dataDir + "Cit-HepTh.txt"
        dateFilename = dataDir + "Cit-HepTh-dates.txt"

        #We can't load in numbers using numpy since some may start with zero 
        edges = []
        file = open(edgesFilename, 'r')
        file.readline()
        file.readline()
        file.readline()
        file.readline()

        for line in file:
            (vertex1, sep, vertex2) = line.partition("\t")
            vertex1 = vertex1.strip()
            vertex2 = vertex2.strip()
            edges.append([int("1" + vertex1), int("1" + vertex2)])

        edges = numpy.array(edges, numpy.int)

        #Check file read correctly
        self.assertTrue((edges[0, :] == numpy.array([11001, 19304045])).all())
        self.assertTrue((edges[1, :] == numpy.array([11001, 19308122])).all())
        self.assertTrue((edges[9, :] == numpy.array([11001, 19503124])).all())
        vertexIds1 = numpy.unique(edges)
        logging.info("Number of graph vertices: " + str(vertexIds1.shape[0]))

        file = open(dateFilename, 'r')
        file.readline()
        vertexIds2 = []

        for line in file:
            (id, sep, date) = line.partition("\t")
            id = id.strip()
            date = date.strip()
            vertexIds2.append(int("1" + id))

        #Check file read correctly 
        vertexIds2 = numpy.array(vertexIds2, numpy.int)
        self.assertTrue((vertexIds2[0:10] == numpy.array([19203201, 19203202, 19203203, 19203204, 19203205, 19203206, 19203207, 19203208, 19203209, 19203210], numpy.int)).all())
        vertexIds2 = numpy.unique(numpy.array(vertexIds2, numpy.int))

        graph = DictGraph(False)
        graph.addEdges(edges)

        #Find the set of vertices with known citation
        vertices = []
        vertexId2Set = set(vertexIds2.tolist())
        for i in graph.getAllVertexIds():
            Util.printIteration(i, 50000, edges.shape[0])
            if i in vertexId2Set:
                vertices.append(i)
                vertices.extend(graph.neighbours(i))

        logging.debug("Number of final vertices: " + str(numVertices))
        numVertices2 = numpy.unique(numpy.array(vertices)).shape[0]
        self.assertEquals(numVertices, numVertices2)

        #Now compare the weight matrices using the undirected graph
        #Note the order of vertices is different from the iterator 
        graph = DictGraph()
        graph.addEdges(edges)
        subgraph = graph.subgraph(numpy.unique(numpy.array(vertices)))
        W2 = subgraph.getSparseWeightMatrix()

        self.assertEquals(W.getnnz(), W2.getnnz())
Example #29
def plotMaxTreesStats():
    biSums1 = []
    heteroSums1 = []
    biSums2 = []
    heteroSums2 = []

    treeDepth1 = [] 
    treeSize1 = []
    treeDepth2 = []
    treeSize2 = [] 

    logging.info("Finding trees")
    trees = sGraph.findTrees()

    maxTree = sGraph.subgraph(trees[0])
    secondTree = sGraph.subgraph(trees[1])

    maxRootIndex = trees[0][numpy.nonzero(sGraph.inDegreeSequence()[trees[0]] == 0)[0]]
    secondRootIndex = trees[1][numpy.nonzero(sGraph.inDegreeSequence()[trees[1]] == 0)[0]]

    for j in range(len(subgraphIndicesList)):
        Util.printIteration(j, 1, len(subgraphIndicesList))
        subgraphIndices = subgraphIndicesList[j]
        subgraphIndices = numpy.array(subgraphIndices)

        currentMaxRootIndex = numpy.nonzero(subgraphIndices == maxRootIndex)[0]
        currentSecondRootIndex = numpy.nonzero(subgraphIndices == secondRootIndex)[0]
        subgraph = sGraph.subgraph(subgraphIndices)

        if currentMaxRootIndex.shape[0] == 1:
            maxTree = subgraph.subgraph(subgraph.depthFirstSearch(currentMaxRootIndex[0]))
        else:
            maxTree = subgraph.subgraph(numpy.array([]))

        if currentSecondRootIndex.shape[0] == 1:
            secondTree = subgraph.subgraph(subgraph.depthFirstSearch(currentSecondRootIndex[0]))
        else:
            secondTree = subgraph.subgraph(numpy.array([]))

        subgraphVertexArray = maxTree.getVertexList().getVertices()
        subgraphVertexArray2 = secondTree.getVertexList().getVertices()
        #Compute proportion of MSM, Male, Female, Hetero
        heteroSums1.append(numpy.sum(subgraphVertexArray[:, orientationIndex]==0))
        biSums1.append(numpy.sum(subgraphVertexArray[:, orientationIndex]==1))

        heteroSums2.append(numpy.sum(subgraphVertexArray2[:, orientationIndex]==0))
        biSums2.append(numpy.sum(subgraphVertexArray2[:, orientationIndex]==1))

        treeDepth1.append(GraphUtils.treeDepth(maxTree))
        treeSize1.append(maxTree.getNumVertices())
        treeDepth2.append(GraphUtils.treeDepth(secondTree))
        treeSize2.append(secondTree.getNumVertices())

    resultsFilename = resultsDir + "treeSizesDepths.npz"
    resultsFile = open(resultsFilename, 'wb')
    numpy.savez(resultsFile, treeDepth1, treeSize1, treeDepth2, treeSize2)

    global plotInd

    plt.figure(plotInd)
    plt.plot(absDayList, heteroSums1, plotStyles3[0], absDayList, biSums1, plotStyles3[1], absDayList, heteroSums2, plotStyles3[2], absDayList, biSums2, plotStyles3[3])
    plt.xticks(locs, labels)
    plt.xlabel("Year")
    plt.ylabel("Detections")
    plt.legend(("Max tree heterosexual", "Max tree MSM", "2nd tree heterosexual", "2nd tree MSM"), loc="upper left")
    plt.savefig(figureDir + "MaxTreeOrientGender.eps")
    plotInd += 1
Example #30
    def processRatings(self): 
        """
        Convert the dataset into a matrix and save the results for faster 
        access. 
        """
        if not os.path.exists(self.ratingFileName) or not os.path.exists(self.custDictFileName): 
            dataDir = PathDefaults.getDataDir() + "flixster/"

            logging.debug("Processing ratings given in " + dataDir)

            custIdDict = {} 
            custIdSet = set([])    
            
            itemIdDict = {} 
            itemIdSet = set([])
            
            itemInds = array.array("I")
            custInds = array.array("I")
            ratings = array.array("f")
            dates = array.array("L")
            i = 0            
            j = 0
            
            itr = 0 
            ratingsFile = open(dataDir + "Ratings.timed.txt")
            ratingsFile.readline()
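            #The first line is assumed to be a header, so it is skipped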
            
            for line in ratingsFile: 
                Util.printIteration(itr, 100000, self.numRatings)
                vals = line.split()
                
                custId = int(vals[0])
                
                if custId not in custIdSet: 
                    custIdSet.add(custId)
                    custIdDict[custId] = j
                    custInd = j 
                    j += 1 
                else: 
                    custInd = custIdDict[custId]
                    
                itemId = int(vals[1])
                
                if itemId not in itemIdSet: 
                    itemIdSet.add(itemId)
                    itemIdDict[itemId] = i
                    itemInd = i 
                    i += 1 
                else: 
                    itemInd = itemIdDict[itemId]
                    
                rating = float(vals[2])
                

                t = datetime.strptime(vals[3].strip(), "%Y-%m-%d")
                t = int(time.mktime(t.timetuple()))       
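                #t is now the rating date as a Unix timestamp in seconds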
                
                #Some dates are before 1970 
                
                if t >= 0:           
                    itemInds.append(itemInd)
                    custInds.append(custInd)   
                    ratings.append(rating)
                    dates.append(t)
                    itr += 1 
                    
            itemInds = numpy.array(itemInds, numpy.uint32)
            custInds = numpy.array(custInds, numpy.uint32)
            ratings = numpy.array(ratings, numpy.float)
            dates = numpy.array(dates, numpy.uint64)
            
            assert ratings.shape[0] == self.numRatings   
            logging.debug("Number of ratings " + str(ratings.shape[0]))

            #Prune data             
            X = scipy.sparse.csc_matrix((ratings, (custInds, itemInds)))
            X2 = scipy.sparse.csc_matrix((dates, (custInds, itemInds)))
            print(X.shape)            
            
            X, rowInds, colInds = SparseUtils.pruneMatrix(X, minNnzRows=10, minNnzCols=10, verbose=True)
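            #Only customers and items with at least 10 ratings each are kept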
            X2 = X2[:, colInds][rowInds, :]
            print(X.shape)
            (custInds, itemInds) = X.nonzero()
            ratings = X.data 
            dates = X2.data
            logging.debug("New number of ratings " + str(ratings.shape[0]))
            
            numpy.savez(self.ratingFileName, itemInds, custInds, ratings, dates) 
            logging.debug("Saved ratings file as " + self.ratingFileName)
            
            pickle.dump(custIdDict, open(self.custDictFileName, 'wb'))
            logging.debug("Saved custIdDict as " + self.custDictFileName)
            
            pickle.dump(itemIdDict, open(self.itemDictFileName, 'wb'))
            logging.debug("Saved itemIdDict as " + self.itemDictFileName)
        else: 
            logging.debug("Ratings file " + str(self.ratingFileName) + " already processed")
Example #31
def plotTreeStats():
    logging.info("Computing tree stats")
    resultsFileName = resultsDir + "InfectGrowthTreeStats.pkl"

    if saveResults:
        statsDictList = []

        for j in range(len(subgraphIndicesList2)):
            Util.printIteration(j, 1, len(subgraphIndicesList2))
            subgraphIndices = subgraphIndicesList2[j]
            subgraph = sGraph.subgraph(subgraphIndices)
            logging.info("Finding trees")
            trees = subgraph.findTrees()
            logging.info("Computing tree statistics")
            statsDict = {}

            locationEntropy = []
            orientEntropy = []
            detectionRanges = []

            for i in range(len(trees)):
                if len(trees[i]) > 1:
                    treeGraph = subgraph.subgraph(trees[i])
                    vertexArray = treeGraph.getVertexList().getVertices(list(range(treeGraph.getNumVertices())))
                    
                    locationEntropy.append(Util.entropy(vertexArray[:, locationIndex]))
                    orientEntropy.append(Util.entropy(vertexArray[:, orientationIndex]))
                    
                    detections = vertexArray[:, detectionIndex]
                    detectionRanges.append(numpy.max(detections) - numpy.min(detections))
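                    #The detection range is the number of days between the first and last detection in the tree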

            statsDict["locationEnt"] = numpy.array(locationEntropy)
            statsDict["orientEnt"] = numpy.array(orientEntropy)
            statsDict["detectRanges"] = numpy.array(detectionRanges)
            statsDictList.append(statsDict)

        Util.savePickle(statsDictList, resultsFileName, True)
    else:
        statsDictList = Util.loadPickle(resultsFileName)
        
        locBins = numpy.arange(0, 2.4, 0.2)
        detectBins = numpy.arange(0, 6500, 500)
        locationEntDists = []
        orientEntDists = []
        detectionDists = [] 

        for j in range(0, len(dayList2)):
            dateStr = (str(DateUtils.getDateStrFromDay(dayList2[j], startYear)))
            logging.info(dateStr)
            statsDict = statsDictList[j]
            plotInd2 = plotInd

            locationEntDists.append(statsDict["locationEnt"])
            orientEntDists.append(statsDict["orientEnt"])
            detectionDists.append(statsDict["detectRanges"])

        #for j in range(len(orientEntDists)):
        #    print(numpy.sum(numpy.histogram(orientEntDists[j])[0]))
        #    print(numpy.histogram(orientEntDists[j])[0]/float(orientEntDists[j].shape[0]))

        dateStrs = [DateUtils.getDateStrFromDay(dayList2[i], startYear) for i in range(1, len(dayList2))]

        plt.figure(plotInd2)
        histOut = plt.hist(locationEntDists, locBins, normed=True)
        plt.xlabel("Location Entropy")
        plt.ylabel("Probability Density")
        plt.savefig(figureDir + "LocationEnt" +  ".eps")
        #plt.legend()
        plotInd2 += 1

        plt.figure(plotInd2)
        histOut = plt.hist(orientEntDists, normed=True)
        plt.xlabel("Orientation Entropy")
        plt.ylabel("Probability Density")
        plt.savefig(figureDir + "OrientEnt" +  ".eps")
        #plt.legend()
        plotInd2 += 1

        plt.figure(plotInd2)
        histOut = plt.hist(detectionDists, detectBins, normed=True)
        plt.xlabel("Detection Range (days)")
        plt.ylabel("Probability Density")
        plt.savefig(figureDir + "DetectionRanges" +  ".eps")
        #plt.legend()
        plotInd2 += 1
Example #32
    def evaluateCvOuter(self, X, Y, folds, leafRank, innerFolds=3):
        """
        Run model selection and output some ROC curves. In this case Y is a 1D array. 
        """
        Parameter.checkClass(X, numpy.ndarray)
        Parameter.checkClass(Y, numpy.ndarray)
        Parameter.checkInt(folds, 2, float('inf'))
        if Y.ndim != 1:
            raise ValueError("Expecting Y to be 1D")

        indexList = cross_val.StratifiedKFold(Y, folds)

        maxDepths = numpy.flipud(numpy.arange(1, 12, 1))
        if leafRank == self.getTreeRankLib().LRforest:
            varSplits = numpy.arange(0.6, 1.01, 0.2)
        else:
            varSplits = numpy.array([1])
        #According to Nicolas nfcv>1 doesn't help
        nfcvs = [1]
        #This is tied in with depth 
        mincrit = 0.00
        #If minsplit is too low sometimes get a node with no positive labels
        minSplits = numpy.array([50])

        self.setLeafRank(leafRank)

        bestParams = []
        bestTrainAUCs = numpy.zeros(folds)
        bestTrainROCs = []
        bestTestAUCs = numpy.zeros(folds)
        bestTestROCs = []
        bestMetaDicts = []
        i = 0 

        for trainInds, testInds in indexList:
            trainX, trainY = X[trainInds, :], Y[trainInds]
            testX, testY = X[testInds, :], Y[testInds]

            meanParamAUCs = []
            paramList = [] 

            logging.debug("Distribution of labels in train: " + str(numpy.bincount(trainY)))
            logging.debug("Distribution of labels in test: " + str(numpy.bincount(testY)))

            for varSplit in varSplits:
                for nfcv in nfcvs:
                    for minSplit in minSplits:

                        self.setMaxDepth(maxDepths[0])
                        self.setVarSplit(varSplit)
                        self.setNfcv(nfcv)
                        self.setMinSplit(minSplit)
                        logging.debug(self)
                        idx = cross_val.StratifiedKFold(trainY, innerFolds)

                        j = 0
                        metrics = numpy.zeros((len(idx), maxDepths.shape[0]))

                        for idxtr, idxts in idx:
                            Util.printIteration(j, 1, innerFolds)

                            innerTrainX, innerTestX = trainX[idxtr, :], trainX[idxts, :]
                            innerTrainY, innerTestY = trainY[idxtr], trainY[idxts]

                            self.learnModel(innerTrainX, innerTrainY)

                            for k in range(maxDepths.shape[0]):
                                maxDepth = maxDepths[k]

                                robjects.globalenv["maxDepth"] = maxDepth
                                robjects.globalenv["tree"] = self.tree
                                nodeList = robjects.r('tree$nodes[tree$depth>=maxDepth]')
                                self.tree = self.treeRankLib.subTreeRank(self.tree, nodeList)
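                                #This prunes the learnt tree to depth maxDepth, so a single fit is evaluated at every candidate depth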

                                predY = self.predict(innerTestX)
                                gc.collect()

                                metrics[j, k] = Evaluator.auc(predY, innerTestY)
                                
                            j += 1

                        meanAUC = numpy.mean(metrics, 0)
                        varAUC = numpy.var(metrics, 0)
                        logging.warn(self.baseLib.warnings())
                        logging.debug("Mean AUCs and variances at each depth " + str((meanAUC, varAUC)))

                        for k in range(maxDepths.shape[0]):
                            maxDepth = maxDepths[k]
                            meanParamAUCs.append(meanAUC[k])
                            paramList.append((maxDepth, varSplit, nfcv, minSplit))

                        #Try to get some memory back
                        gc.collect()
                        robjects.r('gc(verbose=TRUE)')
                        robjects.r('memory.profile()')

                        #print(self.hp.heap())

            #Now choose best params
            bestInd = numpy.argmax(numpy.array(meanParamAUCs))

            self.setMaxDepth(paramList[bestInd][0])
            self.setVarSplit(paramList[bestInd][1])
            self.setNfcv(paramList[bestInd][2])
            self.setMinSplit(paramList[bestInd][3])

            self.learnModel(trainX, trainY)
            predTrainY = self.predict(trainX)
            predTestY = self.predict(testX)
            bestTrainAUCs[i] = Evaluator.auc(predTrainY, trainY)
            bestTestAUCs[i] = Evaluator.auc(predTestY, testY)

            #Store the parameters and ROC curves
            bestParams.append(paramList[bestInd])
            bestTrainROCs.append(Evaluator.roc(trainY, predTrainY))
            bestTestROCs.append(Evaluator.roc(testY, predTestY))

            metaDict = {}
            metaDict["size"] = self.getTreeSize()
            metaDict["depth"] = self.getTreeDepth()
            bestMetaDicts.append(metaDict)

            i += 1

        allMetrics = [bestTrainAUCs, bestTrainROCs, bestTestAUCs, bestTestROCs]

        return (bestParams, allMetrics, bestMetaDicts)
Example #33
    def modelSelect2(self, X, rhos, ks, cvInds, colProbs=None):
        """
        Pick values of rho and k based on a single matrix X. We do cross 
        validation within, and set the best values of rho and k (according 
        to the chosen metric). The rhos must be in decreasing order and we 
        use warm restarts. In this case we remove a few non-zeros from each 
        row to form the test set. 
        """
        if (numpy.flipud(numpy.sort(rhos)) != rhos).any(): 
            raise ValueError("rhos must be in descending order")    

        trainTestXs = Sampling.shuffleSplitRows(X, self.folds, self.validationSize, csarray=False, rowMajor=False, colProbs=colProbs)
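        #Each train/test split is formed by removing a few non-zeros from every row of X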
        metrics = numpy.zeros((rhos.shape[0], ks.shape[0], len(cvInds)))
        
        if self.metric == "mse": 
            metricFuction = learnPredictMSE
        elif self.metric == "f1" or self.metric == "mrr": 
            metricFuction = learnPredictRanking
        else: 
            raise ValueError("Unknown metric: " + self.metric)
            
            
        paramList = []
        
        for i, (trainX, testX) in enumerate(trainTestXs):
            Util.printIteration(i, 1, len(cvInds), "Fold: ")

            for m, k in enumerate(ks): 
                learner = self.copy()
                learner.updateAlg="initial" 
                learner.setK(k)
                paramList.append((learner, trainX, testX, rhos)) 
            
        if self.numProcesses != 1: 
            pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=10)
            resultsIter = pool.imap(metricFunction, paramList)
        else: 
            resultsIter = itertools.imap(metricFunction, paramList)
        
        for i, (trainX, testX) in enumerate(trainTestXs):
            for m, k in enumerate(ks):
                metrics[:, m, i] = resultsIter.next()
        
        if self.numProcesses != 1: 
            pool.terminate()

        meanMetrics = metrics.mean(2)
        stdMetrics = metrics.std(2)
        
        logging.debug("ks=" + str(ks))
        logging.debug("rhos=" + str(rhos))
        logging.debug(meanMetrics)
        
        #Set the parameters 
        if self.metric == "mse": 
            self.setRho(rhos[numpy.unravel_index(numpy.argmin(meanMetrics), meanMetrics.shape)[0]]) 
            self.setK(ks[numpy.unravel_index(numpy.argmin(meanMetrics), meanMetrics.shape)[1]])
        elif self.metric == "f1" or self.metric == "mrr": 
            self.setRho(rhos[numpy.unravel_index(numpy.argmax(meanMetrics), meanMetrics.shape)[0]]) 
            self.setK(ks[numpy.unravel_index(numpy.argmax(meanMetrics), meanMetrics.shape)[1]])
            

        logging.debug("Model parameters: k=" + str(self.k) + " rho=" + str(self.rho))

        return meanMetrics, stdMetrics
Example #34
def runToyExp(datasetNames, sampleSizes, foldsSet, cvScalings, sampleMethods, numProcesses, fileNameSuffix):
    dataDir = PathDefaults.getDataDir() + "modelPenalisation/toy/"
    outputDir = PathDefaults.getOutputDir() + "modelPenalisation/"

    svm = LibSVM()
    numCs = svm.getCs().shape[0]
    numGammas = svm.getGammas().shape[0]
    numMethods = 1+(1+cvScalings.shape[0])
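    #One method for cross validation, plus BIC penalisation and one method per CV scaling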
    numParams = 2

    runIdeal = True
    runCv = True
    runVfpen = True

    for i in range(len(datasetNames)):
        datasetName = datasetNames[i][0]
        numRealisations = datasetNames[i][1]
        logging.debug("Learning using dataset " + datasetName)

        for s in range(len(sampleMethods)):
            sampleMethod = sampleMethods[s][1]
            outfileName = outputDir + datasetName + sampleMethods[s][0] + fileNameSuffix

            fileLock = FileLock(outfileName + ".npz")
            if not fileLock.isLocked() and not fileLock.fileExists():
                fileLock.lock()
                errors = numpy.zeros((numRealisations, len(sampleSizes), foldsSet.shape[0], numMethods))
                params = numpy.zeros((numRealisations, len(sampleSizes), foldsSet.shape[0], numMethods, numParams))
                errorGrids = numpy.zeros((numRealisations, len(sampleSizes), foldsSet.shape[0], numMethods, numCs, numGammas))
                approxGrids = numpy.zeros((numRealisations, len(sampleSizes), foldsSet.shape[0], numMethods, numCs, numGammas))
                idealGrids = numpy.zeros((numRealisations, len(sampleSizes), foldsSet.shape[0], numCs, numGammas))

                data = numpy.load(dataDir + datasetName + ".npz")
                gridPoints, trainX, trainY, pdfX, pdfY1X, pdfYminus1X = data["arr_0"], data["arr_1"], data["arr_2"], data["arr_3"], data["arr_4"], data["arr_5"]

                #We form a test set from the grid points
                testX = numpy.zeros((gridPoints.shape[0]**2, 2))
                for m in range(gridPoints.shape[0]):
                    testX[m*gridPoints.shape[0]:(m+1)*gridPoints.shape[0], 0] = gridPoints
                    testX[m*gridPoints.shape[0]:(m+1)*gridPoints.shape[0], 1] = gridPoints[m]

                for j in range(numRealisations):
                    Util.printIteration(j, 1, numRealisations, "Realisation: ")

                    for k in range(sampleSizes.shape[0]):
                        sampleSize = sampleSizes[k]
                        for m in range(foldsSet.shape[0]):
                            folds = foldsSet[m]
                            logging.debug("Using sample size " + str(sampleSize) + " and " + str(folds) + " folds")
                            perm = numpy.random.permutation(trainX.shape[0])
                            trainInds = perm[0:sampleSize]
                            validX = trainX[trainInds, :]
                            validY = trainY[trainInds]

                            svm = LibSVM(processes=numProcesses)
                            #Find ideal penalties
                            if runIdeal:
                                logging.debug("Finding ideal grid of penalties")
                                idealGrids[j, k, m, :, :] = parallelPenaltyGridRbf(svm, validX, validY, testX, gridPoints, pdfX, pdfY1X, pdfYminus1X)

                            #Cross validation
                            if runCv:
                                logging.debug("Running V-fold cross validation")
                                methodInd = 0
                                idx = sampleMethod(folds, validY.shape[0])
                                if sampleMethod == Sampling.bootstrap:
                                    bootstrap = True
                                else:
                                    bootstrap = False

                                bestSVM, cvGrid = svm.parallelVfcvRbf(validX, validY, idx, True, bootstrap)
                                predY, decisionsY = bestSVM.predict(testX, True)
                                decisionGrid = numpy.reshape(decisionsY, (gridPoints.shape[0], gridPoints.shape[0]), order="F")
                                errors[j, k, m, methodInd] = ModelSelectUtils.bayesError(gridPoints, decisionGrid, pdfX, pdfY1X, pdfYminus1X)
                                params[j, k, m, methodInd, :] = numpy.array([bestSVM.getC(), bestSVM.getKernelParams()])
                                errorGrids[j, k, m, methodInd, :, :] = cvGrid

                            #v fold penalisation
                            if runVfpen:
                                logging.debug("Running penalisation")
                                #BIC penalisation
                                Cv = float((folds-1) * numpy.log(validX.shape[0])/2)
                                tempCvScalings = cvScalings*(folds-1)
                                tempCvScalings = numpy.insert(tempCvScalings, 0, Cv)

                                #Use cross validation
                                idx = sampleMethod(folds, validY.shape[0])
                                svmGridResults = svm.parallelVfPenRbf(validX, validY, idx, tempCvScalings)

                                for n in range(len(tempCvScalings)):
                                    bestSVM, trainErrors, approxGrid = svmGridResults[n]
                                    methodInd = n+1
                                    predY, decisionsY = bestSVM.predict(testX, True)
                                    decisionGrid = numpy.reshape(decisionsY, (gridPoints.shape[0], gridPoints.shape[0]), order="F")
                                    errors[j, k, m, methodInd] = ModelSelectUtils.bayesError(gridPoints, decisionGrid, pdfX, pdfY1X, pdfYminus1X)
                                    params[j, k, m, methodInd, :] = numpy.array([bestSVM.getC(), bestSVM.getKernelParams()])
                                    errorGrids[j, k, m, methodInd, :, :] = trainErrors + approxGrid
                                    approxGrids[j, k, m, methodInd, :, :] = approxGrid


                meanErrors = numpy.mean(errors, 0)
                print(meanErrors)

                meanParams = numpy.mean(params, 0)
                print(meanParams)

                meanErrorGrids = numpy.mean(errorGrids, 0)
                stdErrorGrids = numpy.std(errorGrids, 0)

                meanIdealGrids = numpy.mean(idealGrids, 0)
                stdIdealGrids = numpy.std(idealGrids, 0)

                meanApproxGrids = numpy.mean(approxGrids, 0)
                stdApproxGrids = numpy.std(approxGrids, 0)

                numpy.savez(outfileName, errors, params, meanErrorGrids, stdErrorGrids, meanIdealGrids, stdIdealGrids, meanApproxGrids, stdApproxGrids)
                logging.debug("Saved results as file " + outfileName + ".npz")
                fileLock.unlock()
            else:
                logging.debug("Results already computed")

    logging.debug("All done!")
Example #35
def celf(graph, k, numRuns=100, p=0.5, verbose=False):
    """
    Maximising the influence using the CELF algorithm of Leskovec et al.
    """
    k = min(graph.vcount(), k)

    influenceSet = set([])
    influenceList = []
    influenceScores = []
    negMarginalIncreases = []

    #For the initial values we compute marginal increases with respect to the empty set
    influences = numpy.zeros(graph.vcount())

    for i in range(numRuns):
        influences += MaxInfluence.simulateAllCascades(graph, [], p=p)

    influences /= float(numRuns)
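    #Averaging over the runs estimates the expected influence of each vertex taken on its own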
     logging.debug("Simulated initial cascades")          
     
     for vertexInd in range(graph.vcount()):        
         #Note that we store the negation of the influence since heappop chooses the smallest value 
         heapq.heappush(negMarginalIncreases, (-influences[vertexInd], vertexInd))
     
     
     """
     for vertexInd in range(graph.vcount()):
         currentInfluence = MaxInfluence.simulateCascades(graph, influenceSet.union([vertexInd]), numRuns, p)
         #Note that we store the negation of the influence since heappop chooses the smallest value
         heapq.heappush(negMarginalIncreases, (-currentInfluence, vertexInd))
     """
        
     negLastInfluence, bestVertexInd = heapq.heappop(negMarginalIncreases)
     influenceSet.add(bestVertexInd)
     influenceList.append(bestVertexInd)
     influenceScores.append(-negLastInfluence)
     logging.debug("Picking additional vertices")
             
     for i in range(1, k):
         Util.printIteration(i-1, 1, k-1)
         valid = numpy.zeros(graph.vcount(), numpy.bool) 
         negMarginalInfluence, currentBestVertexInd = heapq.heappop(negMarginalIncreases)    
         
         j = 0             
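        #Lazy-forward evaluation: influence is submodular, so a vertex's marginal
        #gain can only shrink as the seed set grows; we therefore re-evaluate only
        #the current top candidate and stop once it stays on top after updating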
         
        while not valid[currentBestVertexInd]:
            marginalInfluence = MaxInfluence.simulateCascades(graph, influenceSet.union([currentBestVertexInd]), numRuns, p)
            marginalInfluence += negLastInfluence

            #Note that we store the negation of the influence since heappop chooses the smallest value
            heapq.heappush(negMarginalIncreases, (-marginalInfluence, currentBestVertexInd))
            valid[currentBestVertexInd] = True

            negMarginalInfluence, currentBestVertexInd = heapq.heappop(negMarginalIncreases)
            totalInfluence = -(negMarginalInfluence + negLastInfluence)
            j += 1

        logging.debug("Required " + str(j) + " evaluations to find influential vertex")

        negLastInfluence = -totalInfluence

        influenceSet.add(currentBestVertexInd)
        influenceList.append(currentBestVertexInd)
        influenceScores.append(-negLastInfluence)

    if verbose:
        return influenceList, influenceScores
    else:
        return influenceList
Example #36
    def modelSelect(self, X, rhos, ks, cvInds):
        """
        Pick values of rho and k based on a single matrix X. We do cross
        validation within, and set the best values of rho and k (according
        to the chosen metric). The rhos must be in decreasing order and we
        use warm restarts.
        """
        if (numpy.flipud(numpy.sort(rhos)) != rhos).any():
            raise ValueError("rhos must be in descending order")

        errors = numpy.zeros((rhos.shape[0], ks.shape[0], len(cvInds)))

        if self.metric == "mse":
            metricFuction = learnPredictMSE
        elif self.metric == "f1" or self.metric == "mrr":
            metricFuction = learnPredictRanking
        else:
            raise ValueError("Unknown metric: " + self.metric)

        for i, (trainInds, testInds) in enumerate(cvInds):
            Util.printIteration(i, 1, len(cvInds), "Fold: ")
            trainX = SparseUtils.submatrix(X, trainInds)
            testX = SparseUtils.submatrix(X, testInds)

            assert trainX.nnz == trainInds.shape[0]
            assert testX.nnz == testInds.shape[0]
            #nptst.assert_array_almost_equal((testX+trainX).data, X.data)

            paramList = []

            for m, k in enumerate(ks):
                learner = self.copy()
                learner.updateAlg = "initial"
                learner.setK(k)
                paramList.append((learner, trainX, testX, rhos))

            if self.numProcesses != 1:
                pool = multiprocessing.Pool(
                    processes=multiprocessing.cpu_count() / 2,
                    maxtasksperchild=10)
                results = pool.imap(metricFunction, paramList)
            else:
                results = itertools.imap(metricFunction, paramList)

            for m, rhoErrors in enumerate(results):
                errors[:, m, i] = rhoErrors

            if self.numProcesses != 1:
                pool.terminate()

        meanMetrics = errors.mean(2)
        stdMetrics = errors.std(2)

        logging.debug(meanMetrics)

        #Set the parameters
        if self.metric == "mse":
            self.setRho(rhos[numpy.unravel_index(numpy.argmin(meanMetrics),
                                                 meanMetrics.shape)[0]])
            self.setK(ks[numpy.unravel_index(numpy.argmin(meanMetrics),
                                             meanMetrics.shape)[1]])
        elif self.metric == "f1" or self.metric == "mrr":
            self.setRho(rhos[numpy.unravel_index(numpy.argmax(meanMetrics),
                                                 meanMetrics.shape)[0]])
            self.setK(ks[numpy.unravel_index(numpy.argmax(meanMetrics),
                                             meanMetrics.shape)[1]])

        logging.debug("Model parameters: k=" + str(self.k) + " rho=" +
                      str(self.rho))

        return meanMetrics, stdMetrics
Example #37
#    numRepetitions = 2
    do_Nings = True
    
    clustErrApprox = numpy.zeros((ps.shape[0], numGraphs, numRepetitions, len(k2s)))
    clustErrExact = numpy.zeros((ps.shape[0], numGraphs, numRepetitions))
    clustErrNings = numpy.zeros((ps.shape[0], numGraphs, numRepetitions))
    clustErrNystrom = numpy.zeros((ps.shape[0], numGraphs, numRepetitions, len(k3s)))
    clustErrRandSvd = numpy.zeros((ps.shape[0], numGraphs, numRepetitions, len(k4s)))
    sinThetaApprox = numpy.zeros((ps.shape[0], numGraphs, numRepetitions, len(k2s)))
    sinThetaExact = numpy.zeros((ps.shape[0], numGraphs, numRepetitions))
    sinThetaNings = numpy.zeros((ps.shape[0], numGraphs, numRepetitions))
    sinThetaNystrom = numpy.zeros((ps.shape[0], numGraphs, numRepetitions, len(k3s)))
    sinThetaRandSvd = numpy.zeros((ps.shape[0], numGraphs, numRepetitions, len(k4s)))
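    #The result arrays are indexed by p, graph number, repetition and, where applicable, the approximation parameter k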
    
    for r in range(numRepetitions):
        Util.printIteration(r, 1, numRepetitions)
    
        for t in range(ps.shape[0]):
            logging.info("Run " + str(r) + "  p " + str(ps[t]))
            p = ps[t]
    
            logging.debug("Running exact method")
            graphIterator = ThreeClustIterator(p, numClusters, r).getIterator()
            resExact = exactClusterer.clusterFromIterator(graphIterator, True)
            
            logging.debug("Running approximate method")
            resApproxList = []
            for i in range(len(k2s)): 
                graphIterator = ThreeClustIterator(p, numClusters, r).getIterator()
                resApproxList.append(iascClusterers[i].clusterFromIterator(graphIterator, True))