Example #1
    def cluster(self, graph):
        """
        Take a graph and cluster using the method in "On spectral clustering: Analysis
        and an algorithm" by Ng et al., 2001.

        :param graph: the graph to cluster
        :type graph: :class:`apgl.graph.AbstractMatrixGraph`

        :returns: An array of length graph.getNumVertices() giving the cluster membership of each vertex
        """
        L = graph.normalisedLaplacianSym()

        omega, Q = numpy.linalg.eig(L)
        inds = numpy.argsort(omega)

        #First normalise rows, then columns
        standardiser = Standardiser()
        V = standardiser.normaliseArray(Q[:, inds[0:self.k]].T).T
        V = vq.whiten(V)
        #Using kmeans2 here seems to result in a high variance
        #in the quality of clustering. Therefore stick to kmeans
        centroids, clusters = vq.kmeans(V, self.k, iter=self.numIterKmeans)
        clusters, distortion = vq.vq(V, centroids)

        return clusters
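
The clustering steps above come from the apgl/sandbox code base. As a point of reference, here is a minimal, self-contained sketch of the same Ng et al. (2001) pipeline written against plain numpy and scipy; the toy adjacency matrix and the inline Laplacian computation are illustrative assumptions, not part of the library.

import numpy
from scipy.cluster import vq

numpy.random.seed(0)
k = 2

# Toy adjacency matrix: two blocks joined by a single edge
W = numpy.zeros((6, 6))
W[0:3, 0:3] = 1
W[3:6, 3:6] = 1
W[2, 3] = W[3, 2] = 1
numpy.fill_diagonal(W, 0)

# Symmetric normalised Laplacian L = I - D^{-1/2} W D^{-1/2}
d = W.sum(0)
DinvSqrt = numpy.diag(1.0 / numpy.sqrt(d))
L = numpy.eye(W.shape[0]) - DinvSqrt.dot(W).dot(DinvSqrt)

# Eigenvectors of the k smallest eigenvalues
omega, Q = numpy.linalg.eigh(L)
V = Q[:, numpy.argsort(omega)[0:k]]

# Normalise rows to unit length, whiten, then run k-means
V = V / numpy.maximum(numpy.linalg.norm(V, axis=1, keepdims=True), 1e-12)
V = vq.whiten(V)
centroids, distortion = vq.kmeans(V, k, iter=20)
clusters, distortion = vq.vq(V, centroids)
print(clusters)   # e.g. [0 0 0 1 1 1]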
Example #2
    def matrixSimilarity(self, V1, V2):
        """
        Compute a vertex similarity matrix C, such that the (i, j)th entry is the matching
        score between V1_i and V2_j; larger values indicate a better match.
        """
        X = numpy.r_[V1, V2]
        standardiser = Standardiser()
        X = standardiser.normaliseArray(X)

        V1 = X[0 : V1.shape[0], :]
        V2 = X[V1.shape[0] :, :]

        # print(X)

        # Extend arrays with zeros to make them the same size
        # if V1.shape[0] < V2.shape[0]:
        #    V1 = Util.extendArray(V1, V2.shape, numpy.min(V1))
        # elif V2.shape[0] < V1.shape[0]:
        #    V2 = Util.extendArray(V2, V1.shape, numpy.min(V2))

        # Compute C from the pairwise distances between vertices;
        # the resulting similarity is bounded above by 1
        D = Util.distanceMatrix(V1, V2)
        maxD = numpy.max(D)
        minD = numpy.min(D)
        if (maxD - minD) != 0:
            C = (maxD - D) / (maxD - minD)
        else:
            C = numpy.ones((V1.shape[0], V2.shape[0]))

        return C
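
A short stand-alone sketch of the distance-to-similarity conversion used above, assuming Util.distanceMatrix computes pairwise Euclidean distances; scipy.spatial.distance.cdist stands in for it here.

import numpy
from scipy.spatial.distance import cdist

numpy.random.seed(0)
V1 = numpy.random.rand(4, 3)
V2 = numpy.random.rand(5, 3)

D = cdist(V1, V2)                   # pairwise Euclidean distances
maxD, minD = D.max(), D.min()
if maxD - minD != 0:
    C = (maxD - D) / (maxD - minD)  # closest pair scores 1, farthest scores 0
else:
    C = numpy.ones((V1.shape[0], V2.shape[0]))
print(C.shape, C.min(), C.max())    # (4, 5) 0.0 1.0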
Example #3
    def matrixSimilarity(self, V1, V2):
        """
        Compute a vertex similarity matrix C, such that the (i, j)th entry is the matching
        score between V1_i and V2_j; larger values indicate a better match.
        """
        X = numpy.r_[V1, V2]
        standardiser = Standardiser()
        X = standardiser.normaliseArray(X)

        V1 = X[0:V1.shape[0], :]
        V2 = X[V1.shape[0]:, :]

        #print(X)

        #Extend arrays with zeros to make them the same size
        #if V1.shape[0] < V2.shape[0]:
        #    V1 = Util.extendArray(V1, V2.shape, numpy.min(V1))
        #elif V2.shape[0] < V1.shape[0]:
        #    V2 = Util.extendArray(V2, V1.shape, numpy.min(V2))

        #Compute C from the pairwise distances between vertices;
        #the resulting similarity is bounded above by 1
        D = Util.distanceMatrix(V1, V2)
        maxD = numpy.max(D)
        minD = numpy.min(D)
        if (maxD - minD) != 0:
            C = (maxD - D) / (maxD - minD)
        else:
            C = numpy.ones((V1.shape[0], V2.shape[0]))

        return C
Example #4
    def cluster(self, graph):
        """
        Take a graph and cluster using the method in "On spectral clustering: Analysis
        and an algorithm" by Ng et al., 2001.

        :param graph: the graph to cluster
        :type graph: :class:`apgl.graph.AbstractMatrixGraph`

        :returns: An array of length graph.getNumVertices() giving the cluster membership of each vertex
        """
        L = graph.normalisedLaplacianSym()

        omega, Q = numpy.linalg.eig(L)
        inds = numpy.argsort(omega)

        #First normalise rows, then columns
        standardiser = Standardiser()
        V = standardiser.normaliseArray(Q[:, inds[0:self.k]].T).T
        V = vq.whiten(V)
        #Using kmeans2 here seems to result in a high variance
        #in the quality of clustering. Therefore stick to kmeans
        centroids, clusters = vq.kmeans(V, self.k, iter=self.numIterKmeans)
        clusters, distortion = vq.vq(V, centroids)

        return clusters
Example #5
    def testScaleArray(self):
        numExamples = 10
        numFeatures = 3
        X = numpy.random.rand(numExamples, numFeatures)

        preprocessor = Standardiser()
        Xs = preprocessor.scaleArray(X)

        minVals = numpy.amin(Xs, 0)
        maxVals = numpy.amax(Xs, 0)

        tol = 10 ** -6
        self.assertTrue(numpy.linalg.norm(minVals + numpy.ones(X.shape[1])) <= tol)
        self.assertTrue(numpy.linalg.norm(maxVals - numpy.ones(X.shape[1])) <= tol)

        # Now test standardisation on another matrix

        X = numpy.array([[2, 1], [-1, -2], [0.6, 0.3]])
        preprocessor = Standardiser()
        Xs = preprocessor.scaleArray(X)

        X2 = numpy.array([[2, 1], [-1, -2], [0.6, 0.3], [4, 2]])
        Xs2 = preprocessor.scaleArray(X2)

        self.assertTrue(numpy.linalg.norm(Xs2[0:3, :] - Xs) < tol)
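
The assertions above imply that scaleArray maps each column into [-1, 1] and then reuses the learnt minimum and maximum when called again on a second matrix. A plain-numpy sketch of that behaviour (the internals of Standardiser are an assumption here):

import numpy

X = numpy.array([[2, 1], [-1, -2], [0.6, 0.3]], dtype=float)
minV, maxV = X.min(0), X.max(0)          # parameters learnt from the first array

def scale(A):
    return 2 * (A - minV) / (maxV - minV) - 1

Xs = scale(X)
X2 = numpy.array([[2, 1], [-1, -2], [0.6, 0.3], [4, 2]], dtype=float)
Xs2 = scale(X2)                          # reuse the same parameters
print(numpy.allclose(Xs2[0:3, :], Xs))   # True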
Example #6
    def testUnstandardiseArray(self):
        numExamples = 10
        numFeatures = 3

        tol = 10 ** -6
        preprocessor = Standardiser()

        # Test an everyday matrix
        X = numpy.random.rand(numExamples, numFeatures)
        Xs = preprocessor.standardiseArray(X)
        X2 = preprocessor.unstandardiseArray(Xs)

        self.assertTrue(numpy.linalg.norm(X2 - X) < tol)
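
A plain-numpy sketch of the standardise/unstandardise round trip checked above: centre each column, scale it to unit norm, then invert both steps. The exact formulas are assumptions based on the other tests in this listing.

import numpy

numpy.random.seed(0)
X = numpy.random.rand(10, 3)

mu = X.mean(0)
Xc = X - mu
norms = numpy.sqrt((Xc ** 2).sum(0))
Xs = Xc / norms                 # standardised: zero mean, unit column norm
X2 = Xs * norms + mu            # unstandardised: recovers the original
print(numpy.linalg.norm(X2 - X) < 10 ** -6)   # True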
Example #7
 def testLearningRate(self): 
     numExamples = 100
     trainX, trainY = data.make_regression(numExamples) 
     trainX = Standardiser().normaliseArray(trainX)
     trainY = Standardiser().normaliseArray(trainY)
     learner = DecisionTreeLearner(pruneType="CART", maxDepth=20, minSplit=1)
     
     
     foldsSet = numpy.arange(2, 7, 2)
     
     gammas = numpy.array(numpy.round(2**numpy.arange(1, 8, 1)-1), dtype=numpy.int)
     paramDict = {} 
     paramDict["setGamma"] = gammas
     
     betaGrid = learner.learningRate(trainX, trainY, foldsSet, paramDict)
     
     #Compute beta more directly 
     numParams = gammas.shape[0]
     sampleSize = trainX.shape[0]
     sampleMethod = Sampling.crossValidation
     Cvs = numpy.array([1])
     penalties = numpy.zeros((foldsSet.shape[0], numParams))
     betas = numpy.zeros(gammas.shape[0])
     
     for k in range(foldsSet.shape[0]): 
         folds = foldsSet[k]
         logging.debug("Folds " + str(folds))
         
         idx = sampleMethod(folds, trainX.shape[0])   
         
         #Now try penalisation
         resultsList = learner.parallelPen(trainX, trainY, idx, paramDict, Cvs)
         bestLearner, trainErrors, currentPenalties = resultsList[0]
         penalties[k, :] = currentPenalties
     
     for i in range(gammas.shape[0]): 
         inds = numpy.logical_and(numpy.isfinite(penalties[:, i]), penalties[:, i]>0)
         tempPenalties = penalties[:, i][inds]
         tempfoldsSet = numpy.array(foldsSet, numpy.float)[inds]                            
         
         if tempPenalties.shape[0] > 1: 
             x = numpy.log((tempfoldsSet-1)/tempfoldsSet*sampleSize)
             y = numpy.log(tempPenalties)+numpy.log(tempfoldsSet)   
         
             clf = linear_model.LinearRegression()
             clf.fit(numpy.array([x]).T, y)
             betas[i] = clf.coef_[0]    
             
     betas = -betas   
     
     nptst.assert_array_equal(betaGrid, betas)
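
The final loop above estimates each learning rate as the negated slope of log(penalty) + log(folds) against log((folds - 1) / folds * n). A tiny stand-alone illustration of that fit, using made-up penalty values:

import numpy
from sklearn import linear_model

sampleSize = 100
foldsSet = numpy.arange(2, 7, 2, dtype=float)
penalties = numpy.array([0.9, 0.5, 0.35])   # hypothetical penalties, one per fold count

x = numpy.log((foldsSet - 1) / foldsSet * sampleSize)
y = numpy.log(penalties) + numpy.log(foldsSet)

clf = linear_model.LinearRegression()
clf.fit(x[:, None], y)
beta = -clf.coef_[0]
print(beta)

Example #8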
    def testLearnModel(self):
        numExamples = 50
        numFeatures = 200
        preprocessor = Standardiser()
        X = numpy.random.randn(numExamples, numFeatures)
        X = preprocessor.standardiseArray(X)
        c = numpy.random.rand(numFeatures)
        y = numpy.dot(X, c)

        tol = 0.05
        kernel = LinearKernel()
        lmbda = 0.0001
        predictor = KernelShiftRegression(kernel, lmbda)

        alpha, b = predictor.learnModel(X, y)
        predY = predictor.predict(X)

        self.assertTrue(Evaluator.rootMeanSqError(y, predY) < tol)

        #Try increasing y
        y = y + 5
        lmbda = 0.2
        predictor = KernelShiftRegression(kernel, lmbda)
        alpha, b = predictor.learnModel(X, y)
        predY = predictor.predict(X)

        self.assertTrue(numpy.abs(b - 5) < 0.1)
        self.assertTrue(Evaluator.rootMeanSqError(y, predY) < 0.1)

        #Try making prediction for multilabel Y
        C = numpy.random.rand(numFeatures, numFeatures)
        Y = numpy.dot(X, C)

        predictor = KernelShiftRegression(kernel, lmbda)
        alpha, b = predictor.learnModel(X, Y)
        predY = predictor.predict(X)

        self.assertTrue(Evaluator.rootMeanSqError(Y, predY) < 0.1)

        #Now, shift the data 
        s = numpy.random.rand(numFeatures)
        Y = Y + s

        predictor = KernelShiftRegression(kernel, lmbda)
        alpha, b = predictor.learnModel(X, Y)
        predY = predictor.predict(X)

        self.assertTrue(numpy.linalg.norm(b - s) < 0.1)
        self.assertTrue(Evaluator.rootMeanSqError(Y, predY) < 0.1)
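Example #9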
    def testLearnModel(self):
        numExamples = 50
        numFeatures = 200
        preprocessor = Standardiser()
        X = numpy.random.randn(numExamples, numFeatures)
        X = preprocessor.standardiseArray(X)
        c = numpy.random.rand(numFeatures)
        y = numpy.dot(X, c)

        tol = 0.05
        kernel = LinearKernel()
        lmbda = 0.0001
        predictor = KernelShiftRegression(kernel, lmbda)

        alpha, b = predictor.learnModel(X, y)
        predY = predictor.predict(X)

        self.assertTrue(Evaluator.rootMeanSqError(y, predY) < tol)

        #Try increasing y
        y = y + 5
        lmbda = 0.2
        predictor = KernelShiftRegression(kernel, lmbda)
        alpha, b = predictor.learnModel(X, y)
        predY = predictor.predict(X)

        self.assertTrue(numpy.abs(b - 5) < 0.1)
        self.assertTrue(Evaluator.rootMeanSqError(y, predY) < 0.1)

        #Try making prediction for multilabel Y
        C = numpy.random.rand(numFeatures, numFeatures)
        Y = numpy.dot(X, C)

        predictor = KernelShiftRegression(kernel, lmbda)
        alpha, b = predictor.learnModel(X, Y)
        predY = predictor.predict(X)

        self.assertTrue(Evaluator.rootMeanSqError(Y, predY) < 0.1)

        #Now, shift the data
        s = numpy.random.rand(numFeatures)
        Y = Y + s

        predictor = KernelShiftRegression(kernel, lmbda)
        alpha, b = predictor.learnModel(X, Y)
        predY = predictor.predict(X)

        self.assertTrue(numpy.linalg.norm(b - s) < 0.1)
        self.assertTrue(Evaluator.rootMeanSqError(Y, predY) < 0.1)
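
The assertions above (b close to 5 after shifting y, and b close to s for the shifted multi-label Y) suggest that KernelShiftRegression learns a constant shift alongside a kernel ridge fit on the shifted targets. The sketch below is only an illustration of that idea in plain numpy, not the library's implementation.

import numpy

numpy.random.seed(0)
X = numpy.random.randn(50, 5)
y = X.dot(numpy.random.rand(5)) + 5

b = y.mean()                                        # estimated shift
K = X.dot(X.T)                                      # linear kernel
lmbda = 0.2
alpha = numpy.linalg.solve(K + lmbda * numpy.eye(50), y - b)
predY = K.dot(alpha) + b
print(abs(b - 5) < 1, numpy.abs(predY - y).mean())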
Example #10
 def testRecursiveSetPrune(self): 
     numExamples = 1000
     X, y = data.make_regression(numExamples)  
     
     y = Standardiser().normaliseArray(y)
     
     numTrain = numpy.round(numExamples * 0.66)     
     
     trainX = X[0:numTrain, :]
     trainY = y[0:numTrain]
     testX = X[numTrain:, :]
     testY = y[numTrain:]
     
     learner = DecisionTreeLearner()
     learner.learnModel(trainX, trainY)
     
     rootId = (0,)
     learner.tree.getVertex(rootId).setTestInds(numpy.arange(testX.shape[0]))
     learner.recursiveSetPrune(testX, testY, rootId)
     
     for vertexId in learner.tree.getAllVertexIds(): 
         tempY = testY[learner.tree.getVertex(vertexId).getTestInds()]
         predY = numpy.ones(tempY.shape[0])*learner.tree.getVertex(vertexId).getValue()
         error = numpy.sum((tempY-predY)**2)
         self.assertAlmostEquals(error, learner.tree.getVertex(vertexId).getTestError())
         
     #Check leaf indices form all indices 
     inds = numpy.array([])        
     
     for vertexId in learner.tree.leaves(): 
         inds = numpy.union1d(inds, learner.tree.getVertex(vertexId).getTestInds())
         
     nptst.assert_array_equal(inds, numpy.arange(testY.shape[0]))
Example #11
    def testStandardiseArray(self):
        numExamples = 10
        numFeatures = 3

        preprocessor = Standardiser()

        # Test an everyday matrix
        X = numpy.random.rand(numExamples, numFeatures)
        Xs = preprocessor.standardiseArray(X)

        self.assertAlmostEquals(numpy.sum(Xs), 0, places=3)
        self.assertAlmostEquals(numpy.sum(Xs * Xs), numFeatures, places=3)

        # Now, test on a portion of a matrix
        Xss = preprocessor.standardiseArray(X[1:5, :])
        self.assertTrue((Xss == Xs[1:5, :]).all())
Example #12
    def testPredict2(self):
        #Test on Gauss2D dataset
        dataDir = PathDefaults.getDataDir()

        fileName = dataDir + "Gauss2D_learn.csv"
        XY = numpy.loadtxt(fileName,
                           skiprows=1,
                           usecols=(1, 2, 3),
                           delimiter=",")
        X = XY[:, 0:2]
        y = XY[:, 2]

        fileName = dataDir + "Gauss2D_test.csv"
        testXY = numpy.loadtxt(fileName,
                               skiprows=1,
                               usecols=(1, 2, 3),
                               delimiter=",")
        testX = testXY[:, 0:2]
        testY = testXY[:, 2]

        X = Standardiser().standardiseArray(X)
        testX = Standardiser().standardiseArray(testX)

        maxDepths = range(3, 10)
        trainAucs = numpy.array([
            0.7194734, 0.7284824, 0.7332185, 0.7348198, 0.7366152, 0.7367508,
            0.7367508, 0.7367508
        ])
        testAucs = numpy.array([
            0.6789078, 0.6844632, 0.6867918, 0.6873420, 0.6874820, 0.6874400,
            0.6874400, 0.6874400
        ])
        i = 0

        #The results are approximately the same, but not exactly
        for maxDepth in maxDepths:
            treeRank = TreeRank(self.leafRanklearner)
            treeRank.setMaxDepth(maxDepth)
            treeRank.learnModel(X, y)
            trainScores = treeRank.predict(X)
            testScores = treeRank.predict(testX)

            self.assertAlmostEquals(Evaluator.auc(trainScores, y),
                                    trainAucs[i], 2)
            self.assertAlmostEquals(Evaluator.auc(testScores, testY),
                                    testAucs[i], 1)
            i += 1
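
Evaluator.auc above is taken to be the usual ROC AUC with scores passed first and labels second; scikit-learn's roc_auc_score computes the same quantity with the arguments reversed, e.g.:

import numpy
from sklearn.metrics import roc_auc_score

y = numpy.array([0, 0, 1, 1])
scores = numpy.array([0.1, 0.4, 0.35, 0.8])
print(roc_auc_score(y, scores))   # 0.75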
Example #13
    def testCentreArray(self):
        numExamples = 10
        numFeatures = 3

        preprocessor = Standardiser()

        # Test an everyday matrix
        X = numpy.random.rand(numExamples, numFeatures)
        Xc = preprocessor.centreArray(X)
        centreV = preprocessor.getCentreVector()
        self.assertAlmostEquals(numpy.sum(Xc), 0, places=3)
        self.assertTrue((X - centreV == Xc).all())

        # Now take out 3 rows of X, normalise and compare to normalised X
        Xs = X[0:3, :]
        Xsc = preprocessor.centreArray(Xs)
        self.assertTrue((Xsc == Xc[0:3, :]).all())
Example #14
    def testNormaliseArray(self):
        numExamples = 10
        numFeatures = 3

        preprocessor = Standardiser()

        # Test an everyday matrix
        X = numpy.random.rand(numExamples, numFeatures)
        Xn = preprocessor.normaliseArray(X)
        normV = preprocessor.getNormVector()
        self.assertAlmostEquals(numpy.sum(Xn * Xn), numFeatures, places=3)

        norms = numpy.sum(Xn * Xn, 0)

        for i in range(0, norms.shape[0]):
            self.assertAlmostEquals(norms[i], 1, places=3)

        self.assertTrue((X / normV == Xn).all())

        # Zero one column
        preprocessor = Standardiser()
        X[:, 1] = 0
        Xn = preprocessor.normaliseArray(X)
        normV = preprocessor.getNormVector()
        self.assertAlmostEquals(numpy.sum(Xn * Xn), numFeatures - 1, places=3)
        self.assertTrue((X / normV == Xn).all())

        # Now take out 3 rows of X, normalise and compare to normalised X
        Xs = X[0:3, :]
        Xsn = preprocessor.normaliseArray(Xs)
        self.assertTrue((Xsn == Xn[0:3, :]).all())
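
A plain-numpy sketch of the column normalisation this test describes: each column is scaled to unit Euclidean norm and all-zero columns are left untouched, so they contribute nothing to sum(Xn * Xn). The zero-column handling is an assumption consistent with the numFeatures - 1 assertion above.

import numpy

numpy.random.seed(0)
X = numpy.random.rand(10, 3)
X[:, 1] = 0
norms = numpy.sqrt((X ** 2).sum(0))
normV = numpy.where(norms == 0, 1.0, norms)   # avoid dividing by zero
Xn = X / normV
print(numpy.allclose((Xn ** 2).sum(), X.shape[1] - 1))   # True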
Example #15
    def testPredict2(self):
        #Test on Gauss2D dataset
        dataDir = PathDefaults.getDataDir()

        fileName = dataDir + "Gauss2D_learn.csv"
        XY = numpy.loadtxt(fileName, skiprows=1, usecols=(1,2,3), delimiter=",")
        X = XY[:, 0:2]
        y = XY[:, 2]
        
        y = y*2 - 1 

        fileName = dataDir + "Gauss2D_test.csv"
        testXY = numpy.loadtxt(fileName, skiprows=1, usecols=(1,2,3), delimiter=",")
        testX = testXY[:, 0:2]
        testY = testXY[:, 2]
        
        testY = testY*2-1

        X = Standardiser().standardiseArray(X)
        testX = Standardiser().standardiseArray(testX)

        numTrees = 5
        minSplit = 50 
        maxDepths = range(3, 10)
        trainAucs = numpy.array([0.7252582, 0.7323278, 0.7350289, 0.7372529, 0.7399985, 0.7382176, 0.7395104, 0.7386347])
        testAucs = numpy.array([0.6806122, 0.6851614, 0.6886183, 0.6904147, 0.6897266, 0.6874600, 0.6875980, 0.6878801])

        i = 0
        
        #The results are approximately the same, but not exactly 
        for maxDepth in maxDepths:
            treeRankForest = TreeRankForest(self.leafRanklearner)
            treeRankForest.setMaxDepth(maxDepth)
            treeRankForest.setMinSplit(minSplit)
            treeRankForest.setNumTrees(numTrees)
            treeRankForest.learnModel(X, y)
            trainScores = treeRankForest.predict(X)
            testScores = treeRankForest.predict(testX)

            print(Evaluator.auc(trainScores, y), Evaluator.auc(testScores, testY))

            self.assertAlmostEquals(Evaluator.auc(trainScores, y), trainAucs[i], 1)
            self.assertAlmostEquals(Evaluator.auc(testScores, testY), testAucs[i], 1)
            i+=1
Example #16
    def testClassify(self):
        numExamples = 10
        numFeatures = 20

        X = numpy.random.randn(numExamples, numFeatures)
        y = numpy.sign(numpy.random.randn(numExamples))
        logging.debug(y)

        preprocessor = Standardiser()
        X = preprocessor.standardiseArray(X)

        tol = 10**-5
        lmbda = 1.0
        kernel = LinearKernel()

        predictor = KernelRidgeRegression(kernel, lmbda)
        predictor.learnModel(X, y)
        classY, predY = predictor.classify(X)

        self.assertTrue(numpy.logical_or(classY == 1, classY == -1).all())
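Example #17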
    def testLearnModel2(self):
        numExamples = 200
        numFeatures = 100

        X = numpy.random.randn(numExamples, numFeatures)
        y = numpy.random.randn(numExamples)

        preprocessor = Standardiser()
        X = preprocessor.standardiseArray(X)

        tol = 10**-3
        kernel = LinearKernel()

        #Try using a low-rank matrix
        lmbda = 0.001
        predictor = KernelShiftRegression(kernel, lmbda)

        alpha, b = predictor.learnModel(X, y)
        predY = predictor.predict(X)

        logging.debug((numpy.linalg.norm(y)))
        logging.debug((numpy.linalg.norm(predY - y)))
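Example #18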
    def testLearnModel2(self):
        numExamples = 200
        numFeatures = 100

        X = numpy.random.randn(numExamples, numFeatures)
        y = numpy.random.randn(numExamples)

        preprocessor = Standardiser()
        X = preprocessor.standardiseArray(X)

        tol = 10**-3
        kernel = LinearKernel()

        #Try using a low-rank matrix 
        lmbda = 0.001
        predictor = KernelShiftRegression(kernel, lmbda)

        alpha, b = predictor.learnModel(X, y)
        predY = predictor.predict(X)

        logging.debug((numpy.linalg.norm(y)))
        logging.debug((numpy.linalg.norm(predY - y)))
Example #19
    def setUp(self):
        numpy.random.seed(21)
        numpy.seterr(all="raise")

        numExamples = 100
        numFeatures = 10
        self.X = numpy.random.rand(numExamples, numFeatures)
        c = numpy.random.rand(numFeatures)
        self.y = numpy.array(
            numpy.sign(self.X.dot(c) - numpy.mean(self.X.dot(c))), numpy.int)

        logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

        self.X = Standardiser().standardiseArray(self.X)
Example #20
    def testSetSvmType(self):
        try:
            import sklearn
        except ImportError as error:
            return

        numExamples = 100
        numFeatures = 10
        X = numpy.random.randn(numExamples, numFeatures)
        X = Standardiser().standardiseArray(X)
        c = numpy.random.randn(numFeatures)

        y = numpy.dot(X, numpy.array([c]).T).ravel() + 1
        y2 = numpy.array(y > 0, numpy.int32) * 2 - 1

        svm = LibSVM()

        svm.setSvmType("Epsilon_SVR")

        self.assertEquals(svm.getType(), "Epsilon_SVR")

        #Try to get a good error
        Cs = 2**numpy.arange(-6, 4, dtype=numpy.float)
        epsilons = 2**numpy.arange(-6, 4, dtype=numpy.float)

        bestError = 10
        for C in Cs:
            for epsilon in epsilons:
                svm.setEpsilon(epsilon)
                svm.setC(C)
                svm.learnModel(X, y)
                yp = svm.predict(X)

                if Evaluator.rootMeanSqError(y, yp) < bestError:
                    bestError = Evaluator.rootMeanSqError(y, yp)

        self.assertTrue(
            bestError < Evaluator.rootMeanSqError(y, numpy.zeros(y.shape[0])))

        svm.setSvmType("C_SVC")
        svm.learnModel(X, y2)
        yp2 = svm.predict(X)

        self.assertTrue(0 <= Evaluator.binaryError(y2, yp2) <= 1)
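
LibSVM here is the sandbox wrapper around libsvm; a comparable C/epsilon sweep written against scikit-learn's SVR (an assumed stand-in, not the same class) looks like this:

import numpy
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

numpy.random.seed(21)
X = numpy.random.randn(100, 10)
y = X.dot(numpy.random.randn(10)) + 1

bestError = numpy.inf
for C in 2.0 ** numpy.arange(-6, 4):
    for epsilon in 2.0 ** numpy.arange(-6, 4):
        svr = SVR(kernel="linear", C=C, epsilon=epsilon)
        yp = svr.fit(X, y).predict(X)
        bestError = min(bestError, numpy.sqrt(mean_squared_error(y, yp)))
print(bestError)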
Example #21
    def testCARTPrune(self): 
        numExamples = 500
        X, y = data.make_regression(numExamples)  
        
        y = Standardiser().standardiseArray(y)
        
        numTrain = numpy.round(numExamples * 0.33)     
        numValid = numpy.round(numExamples * 0.33) 
        
        trainX = X[0:numTrain, :]
        trainY = y[0:numTrain]
        validX = X[numTrain:numTrain+numValid, :]
        validY = y[numTrain:numTrain+numValid]
        testX = X[numTrain+numValid:, :]
        testY = y[numTrain+numValid:]
        
        learner = DecisionTreeLearner(pruneType="none", maxDepth=10, minSplit=2)
        learner.learnModel(trainX, trainY)    
        
        learner = DecisionTreeLearner(pruneType="CART", maxDepth=10, minSplit=2, gamma=1000)
        learner.learnModel(trainX, trainY)
        self.assertTrue(learner.tree.getNumVertices() <= 1000)
        predY = learner.predict(trainX)

        learner.setGamma(200)
        learner.learnModel(trainX, trainY)
        self.assertTrue(learner.tree.getNumVertices() <= 200)
        
        learner.setGamma(100)
        learner.learnModel(trainX, trainY)
        self.assertTrue(learner.tree.getNumVertices() <= 100)
        

        learner = DecisionTreeLearner(pruneType="none", maxDepth=10, minSplit=2)
        learner.learnModel(trainX, trainY)
        predY2 = learner.predict(trainX)
        
        #Gamma = 0 implies no pruning 
        nptst.assert_array_equal(predY, predY2)
        
        #Full pruning 
        learner = DecisionTreeLearner(pruneType="CART", maxDepth=3, gamma=1)
        learner.learnModel(trainX, trainY)
        self.assertEquals(learner.tree.getNumVertices(), 1)
Example #22
 def testCvPrune(self): 
     numExamples = 500
     X, y = data.make_regression(numExamples)  
     
     y = Standardiser().standardiseArray(y)
     
     numTrain = numpy.round(numExamples * 0.33)     
     numValid = numpy.round(numExamples * 0.33) 
     
     trainX = X[0:numTrain, :]
     trainY = y[0:numTrain]
     validX = X[numTrain:numTrain+numValid, :]
     validY = y[numTrain:numTrain+numValid]
     testX = X[numTrain+numValid:, :]
     testY = y[numTrain+numValid:]
     
     learner = DecisionTreeLearner()
     learner.learnModel(trainX, trainY)
     error1 = Evaluator.rootMeanSqError(learner.predict(testX), testY)
     
     #print(learner.getTree())
     unprunedTree = learner.tree.copy() 
     learner.setGamma(1000)
     learner.cvPrune(trainX, trainY)
     
     self.assertEquals(unprunedTree.getNumVertices(), learner.tree.getNumVertices())
     learner.setGamma(100)
     learner.cvPrune(trainX, trainY)
     
     #Test if pruned tree is subtree of current: 
     for vertexId in learner.tree.getAllVertexIds(): 
         self.assertTrue(vertexId in unprunedTree.getAllVertexIds())
         
     #The error should be better after pruning 
     learner.learnModel(trainX, trainY)
     #learner.cvPrune(validX, validY, 0.0, 5)
     learner.repPrune(validX, validY)
   
     error2 = Evaluator.rootMeanSqError(learner.predict(testX), testY)
     
     self.assertTrue(error1 >= error2)
Example #23
    def testModelSelect(self): 
        
        """
        We test the results on some data and compare to SVR. 
        """
        numExamples = 200
        X, y = data.make_regression(numExamples, noise=0.5)  
        
        X = Standardiser().standardiseArray(X)
        y = Standardiser().standardiseArray(y)
        
        trainX = X[0:100, :]
        trainY = y[0:100]
        testX = X[100:, :]
        testY = y[100:]
        
        learner = DecisionTreeLearner(maxDepth=20, minSplit=10, pruneType="REP-CV")
        learner.setPruneCV(8)
        
        paramDict = {} 
        paramDict["setGamma"] = numpy.linspace(0.0, 1.0, 10) 
        paramDict["setPruneCV"] = numpy.arange(6, 11, 2, numpy.int)
        
        folds = 5
        idx = Sampling.crossValidation(folds, trainX.shape[0])
        bestTree, cvGrid = learner.parallelModelSelect(trainX, trainY, idx, paramDict)


        predY = bestTree.predict(testX)
        error = Evaluator.rootMeanSqError(testY, predY)
        print(error)
        
        
        learner = DecisionTreeLearner(maxDepth=20, minSplit=5, pruneType="CART")
        
        paramDict = {} 
        paramDict["setGamma"] = numpy.linspace(0.0, 1.0, 50) 
        
        folds = 5
        idx = Sampling.crossValidation(folds, trainX.shape[0])
        bestTree, cvGrid = learner.parallelModelSelect(trainX, trainY, idx, paramDict)


        predY = bestTree.predict(testX)
        error = Evaluator.rootMeanSqError(testY, predY)
        print(error)
              
        return 
        #Let's compare to the SVM 
        learner2 = LibSVM(kernel='gaussian', type="Epsilon_SVR") 
        
        paramDict = {} 
        paramDict["setC"] = 2.0**numpy.arange(-10, 14, 2, dtype=numpy.float)
        paramDict["setGamma"] = 2.0**numpy.arange(-10, 4, 2, dtype=numpy.float)
        paramDict["setEpsilon"] = learner2.getEpsilons()
        
        idx = Sampling.crossValidation(folds, trainX.shape[0])
        bestSVM, cvGrid = learner2.parallelModelSelect(trainX, trainY, idx, paramDict)

        predY = bestSVM.predict(testX)
        error = Evaluator.rootMeanSqError(testY, predY)
        print(error)
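
parallelModelSelect above grid-searches paramDict over cross-validation folds and returns the best learner. A rough scikit-learn analogue (assumed equivalent in spirit, not the sandbox API) is:

import numpy
from sklearn.datasets import make_regression
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

X, y = make_regression(n_samples=200, noise=0.5, random_state=0)
params = {"max_depth": [5, 10, 20], "min_samples_split": [5, 10]}
search = GridSearchCV(DecisionTreeRegressor(random_state=0), params, cv=5)
search.fit(X[0:100], y[0:100])

predY = search.best_estimator_.predict(X[100:])
error = numpy.sqrt(numpy.mean((y[100:] - predY) ** 2))
print(search.best_params_, error)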
Example #24
    def testLearnModel(self):
        numExamples = 50
        numFeatures = 200

        X = numpy.random.randn(numExamples, numFeatures)
        y = numpy.random.randn(numExamples)

        preprocessor = Standardiser()
        X = preprocessor.standardiseArray(X)

        tol = 10**-3
        kernel = LinearKernel()

        #Compare Linear kernel with linear ridge regression
        lmbda = 0.1
        predictor = KernelRidgeRegression(kernel, lmbda)

        alpha = predictor.learnModel(X, y)
        predY = predictor.predict(X)

        K = numpy.dot(X, X.T)
        alpha2 = numpy.dot(
            numpy.linalg.inv(K + lmbda * numpy.eye(numExamples)), y)
        predY2 = X.dot(
            numpy.linalg.inv(
                numpy.dot(X.T, X) + lmbda * numpy.eye(numFeatures))).dot(
                    X.T).dot(y)

        #logging.debug(numpy.linalg.norm(alpha - alpha2))

        self.assertTrue(numpy.linalg.norm(alpha - alpha2) < tol)
        self.assertTrue(numpy.linalg.norm(predY - predY2) < tol)

        lmbda = 0.5
        predictor = KernelRidgeRegression(kernel, lmbda)

        alpha = predictor.learnModel(X, y)
        predY = predictor.predict(X)

        K = numpy.dot(X, X.T)
        alpha2 = numpy.dot(
            numpy.linalg.inv(K + lmbda * numpy.eye(numExamples)), y)
        predY2 = X.dot(
            numpy.linalg.inv(
                numpy.dot(X.T, X) + lmbda * numpy.eye(numFeatures))).dot(
                    X.T).dot(y)

        self.assertTrue(numpy.linalg.norm(alpha - alpha2) < tol)
        self.assertTrue(numpy.linalg.norm(predY - predY2) < tol)

        #Now test on an alternative test set
        numTestExamples = 50
        testX = numpy.random.randn(numTestExamples, numFeatures)
        predictor = KernelRidgeRegression(kernel, lmbda)

        alpha = predictor.learnModel(X, y)
        predY = predictor.predict(testX)

        K = numpy.dot(X, X.T)
        alpha2 = numpy.dot(
            numpy.linalg.inv(K + lmbda * numpy.eye(numExamples)), y)
        predY2 = testX.dot(
            numpy.linalg.inv(
                numpy.dot(X.T, X) + lmbda * numpy.eye(numFeatures))).dot(
                    X.T).dot(y)

        self.assertTrue(numpy.linalg.norm(alpha - alpha2) < tol)
        self.assertTrue(numpy.linalg.norm(predY - predY2) < tol)

        #Use the method against a multi-label example
        Y = numpy.random.randn(numExamples, numFeatures)

        alpha = predictor.learnModel(X, Y)

        self.assertTrue(alpha.shape == (numExamples, numFeatures))
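Example #25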
    def clusterFromIterator(self, graphListIterator, verbose=False):
        """
        Find a set of clusters for the graphs given by the iterator. If verbose
        is true, each iteration is timed, bounds are computed, and the results
        are returned as lists.
        
        The difference between a weight matrix and the previous one should be
        positive.
        """
        clustersList = []
        decompositionTimeList = []
        kMeansTimeList = []
        boundList = []
        sinThetaList = []
        i = 0

        for subW in graphListIterator:
            if __debug__:
                Parameter.checkSymmetric(subW)

            if self.logStep and i % self.logStep == 0:
                logging.debug("Graph index: " + str(i))
            logging.debug("Clustering graph of size " + str(subW.shape))
            if self.alg != "efficientNystrom":
                ABBA = GraphUtils.shiftLaplacian(subW)

            # --- Eigen value decomposition ---
            startTime = time.time()
            if self.alg == "IASC":
                if i % self.T != 0:
                    omega, Q = self.approxUpdateEig(subW, ABBA, omega, Q)

                    if self.computeBound:
                        inds = numpy.flipud(numpy.argsort(omega))
                        Q = Q[:, inds]
                        omega = omega[inds]
                        bounds = self.pertBound(omega, Q, omegaKbot, AKbot,
                                                self.k2)
                        #boundList.append([i, bounds[0], bounds[1]])

                        #Now use accurate values of norm of R and delta
                        rank = Util.rank(ABBA.todense())
                        gamma, U = scipy.sparse.linalg.eigsh(ABBA,
                                                             rank - 1,
                                                             which="LM",
                                                             ncv=ABBA.shape[0])
                        #logging.debug("gamma=" + str(gamma))
                        bounds2 = self.realBound(omega, Q, gamma, AKbot,
                                                 self.k2)
                        boundList.append(
                            [bounds[0], bounds[1], bounds2[0], bounds2[1]])
                else:
                    logging.debug("Computing exact eigenvectors")
                    self.storeInformation(subW, ABBA)

                    if self.computeBound:
                        #omega, Q = scipy.sparse.linalg.eigsh(ABBA, min(self.k2*2, ABBA.shape[0]-1), which="LM", ncv = min(10*self.k2, ABBA.shape[0]))
                        rank = Util.rank(ABBA.todense())
                        omega, Q = scipy.sparse.linalg.eigsh(ABBA,
                                                             rank - 1,
                                                             which="LM",
                                                             ncv=ABBA.shape[0])
                        inds = numpy.flipud(numpy.argsort(omega))
                        omegaKbot = omega[inds[self.k2:]]
                        QKbot = Q[:, inds[self.k2:]]
                        AKbot = (QKbot * omegaKbot).dot(QKbot.T)

                        omegaSort = numpy.flipud(numpy.sort(omega))
                        boundList.append([0] * 4)
                    else:
                        omega, Q = scipy.sparse.linalg.eigsh(
                            ABBA,
                            min(self.k2, ABBA.shape[0] - 1),
                            which="LM",
                            ncv=min(10 * self.k2, ABBA.shape[0]))

            elif self.alg == "nystrom":
                omega, Q = Nystrom.eigpsd(ABBA, self.k3)
            elif self.alg == "exact":
                omega, Q = scipy.sparse.linalg.eigsh(
                    ABBA,
                    min(self.k1, ABBA.shape[0] - 1),
                    which="LM",
                    ncv=min(15 * self.k1, ABBA.shape[0]))
            elif self.alg == "efficientNystrom":
                omega, Q = EfficientNystrom.eigWeight(subW, self.k2, self.k1)
            elif self.alg == "randomisedSvd":
                Q, omega, R = RandomisedSVD.svd(ABBA, self.k4)
            else:
                raise ValueError("Invalid Algorithm: " + str(self.alg))

            if self.computeSinTheta:
                omegaExact, QExact = scipy.linalg.eigh(ABBA.todense())
                inds = numpy.flipud(numpy.argsort(omegaExact))
                QExactKbot = QExact[:, inds[self.k1:]]
                inds = numpy.flipud(numpy.argsort(omega))
                QApproxK = Q[:, inds[:self.k1]]
                sinThetaList.append(
                    scipy.linalg.norm(QExactKbot.T.dot(QApproxK)))

            decompositionTimeList.append(time.time() - startTime)

            if self.alg == "IASC":
                self.storeInformation(subW, ABBA)

            # --- Kmeans ---
            startTime = time.time()
            inds = numpy.flipud(numpy.argsort(omega))

            standardiser = Standardiser()
            #For some very strange reason we get an overflow when computing the
            #norm of the rows of Q even though its elements are bounded by 1.
            #We'll ignore it for now
            try:
                V = standardiser.normaliseArray(Q[:, inds[0:self.k1]].real.T).T
            except FloatingPointError as e:
                logging.warn("FloatingPointError: " + str(e))
            V = VqUtils.whiten(V)
            if i == 0:
                centroids, distortion = vq.kmeans(V,
                                                  self.k1,
                                                  iter=self.nb_iter_kmeans)
            else:
                centroids = self.findCentroids(V, clusters[:subW.shape[0]])
                if centroids.shape[0] < self.k1:
                    nb_missing_centroids = self.k1 - centroids.shape[0]
                    random_centroids = V[numpy.random.randint(
                        0, V.shape[0], nb_missing_centroids), :]
                    centroids = numpy.vstack((centroids, random_centroids))
                centroids, distortion = vq.kmeans(
                    V, centroids)  #iter can only be 1
            clusters, distortion = vq.vq(V, centroids)
            kMeansTimeList.append(time.time() - startTime)

            clustersList.append(clusters)

            #logging.debug("subW.shape: " + str(subW.shape))
            #logging.debug("len(clusters): " + str(len(clusters)))
            #from sandbox.util.ProfileUtils import ProfileUtils
            #logging.debug("Total memory usage: " + str(ProfileUtils.memory()/10**6) + "MB")
            if ProfileUtils.memory() > 10**9:
                ProfileUtils.memDisplay(locals())

            i += 1

        if verbose:
            eigenQuality = {
                "boundList": boundList,
                "sinThetaList": sinThetaList
            }
            return clustersList, numpy.array(
                (decompositionTimeList, kMeansTimeList)).T, eigenQuality
        else:
            return clustersList
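Example #26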
    def clusterFromIterator(self, graphListIterator, verbose=False):
        """
        Find a set of clusters for the graphs given by the iterator. If verbose
        is true, each iteration is timed, bounds are computed, and the results
        are returned as lists.
        
        The difference between a weight matrix and the previous one should be
        positive.
        """
        clustersList = []
        decompositionTimeList = [] 
        kMeansTimeList = [] 
        boundList = []
        sinThetaList = []
        i = 0

        for subW in graphListIterator:
            if __debug__:
                Parameter.checkSymmetric(subW)

            if self.logStep and i % self.logStep == 0:
                logging.debug("Graph index: " + str(i))
            logging.debug("Clustering graph of size " + str(subW.shape))
            if self.alg!="efficientNystrom": 
                ABBA = GraphUtils.shiftLaplacian(subW)

            # --- Eigen value decomposition ---
            startTime = time.time()
            if self.alg=="IASC": 
                if i % self.T != 0:
                    omega, Q = self.approxUpdateEig(subW, ABBA, omega, Q)   
                    
                    if self.computeBound:
                        inds = numpy.flipud(numpy.argsort(omega))
                        Q = Q[:, inds]
                        omega = omega[inds]
                        bounds = self.pertBound(omega, Q, omegaKbot, AKbot, self.k2)
                        #boundList.append([i, bounds[0], bounds[1]])
                        
                        #Now use accurate values of norm of R and delta   
                        rank = Util.rank(ABBA.todense())
                        gamma, U = scipy.sparse.linalg.eigsh(ABBA, rank-1, which="LM", ncv = ABBA.shape[0])
                        #logging.debug("gamma=" + str(gamma))
                        bounds2 = self.realBound(omega, Q, gamma, AKbot, self.k2)                  
                        boundList.append([bounds[0], bounds[1], bounds2[0], bounds2[1]])      
                else: 
                    logging.debug("Computing exact eigenvectors")
                    self.storeInformation(subW, ABBA)

                    if self.computeBound: 
                        #omega, Q = scipy.sparse.linalg.eigsh(ABBA, min(self.k2*2, ABBA.shape[0]-1), which="LM", ncv = min(10*self.k2, ABBA.shape[0]))
                        rank = Util.rank(ABBA.todense())
                        omega, Q = scipy.sparse.linalg.eigsh(ABBA, rank-1, which="LM", ncv = ABBA.shape[0])
                        inds = numpy.flipud(numpy.argsort(omega))
                        omegaKbot = omega[inds[self.k2:]]  
                        QKbot = Q[:, inds[self.k2:]] 
                        AKbot = (QKbot*omegaKbot).dot(QKbot.T)
                        
                        omegaSort = numpy.flipud(numpy.sort(omega))
                        boundList.append([0]*4)      
                    else: 
                        omega, Q = scipy.sparse.linalg.eigsh(ABBA, min(self.k2, ABBA.shape[0]-1), which="LM", ncv = min(10*self.k2, ABBA.shape[0]))
                            
            elif self.alg == "nystrom":
                omega, Q = Nystrom.eigpsd(ABBA, self.k3)
            elif self.alg == "exact": 
                omega, Q = scipy.sparse.linalg.eigsh(ABBA, min(self.k1, ABBA.shape[0]-1), which="LM", ncv = min(15*self.k1, ABBA.shape[0]))
            elif self.alg == "efficientNystrom":
                omega, Q = EfficientNystrom.eigWeight(subW, self.k2, self.k1)
            elif self.alg == "randomisedSvd": 
                Q, omega, R = RandomisedSVD.svd(ABBA, self.k4)
            else:
                raise ValueError("Invalid Algorithm: " + str(self.alg))

            if self.computeSinTheta:
                omegaExact, QExact = scipy.linalg.eigh(ABBA.todense())
                inds = numpy.flipud(numpy.argsort(omegaExact))
                QExactKbot = QExact[:, inds[self.k1:]]
                inds = numpy.flipud(numpy.argsort(omega))
                QApproxK = Q[:,inds[:self.k1]]
                sinThetaList.append(scipy.linalg.norm(QExactKbot.T.dot(QApproxK)))
          
            decompositionTimeList.append(time.time()-startTime)                  
                  
            if self.alg=="IASC":
                self.storeInformation(subW, ABBA)
            
            # --- Kmeans ---
            startTime = time.time()
            inds = numpy.flipud(numpy.argsort(omega))

            standardiser = Standardiser()
            #For some very strange reason we get an overflow when computing the
            #norm of the rows of Q even though its elements are bounded by 1.
            #We'll ignore it for now
            try:
                V = standardiser.normaliseArray(Q[:, inds[0:self.k1]].real.T).T
            except FloatingPointError as e:
                logging.warn("FloatingPointError: " + str(e))
            V = VqUtils.whiten(V)
            if i == 0:
                centroids, distortion = vq.kmeans(V, self.k1, iter=self.nb_iter_kmeans)
            else:
                centroids = self.findCentroids(V, clusters[:subW.shape[0]])
                if centroids.shape[0] < self.k1:
                    nb_missing_centroids = self.k1 - centroids.shape[0]
                    random_centroids = V[numpy.random.randint(0, V.shape[0], nb_missing_centroids),:]
                    centroids = numpy.vstack((centroids, random_centroids))
                centroids, distortion = vq.kmeans(V, centroids) #iter can only be 1
            clusters, distortion = vq.vq(V, centroids)
            kMeansTimeList.append(time.time()-startTime)

            clustersList.append(clusters)

            #logging.debug("subW.shape: " + str(subW.shape))
            #logging.debug("len(clusters): " + str(len(clusters)))
            #from sandbox.util.ProfileUtils import ProfileUtils
            #logging.debug("Total memory usage: " + str(ProfileUtils.memory()/10**6) + "MB")
            if ProfileUtils.memory() > 10**9:
                ProfileUtils.memDisplay(locals())

            i += 1

        if verbose:
            eigenQuality = {"boundList" : boundList, "sinThetaList" : sinThetaList}
            return clustersList, numpy.array((decompositionTimeList, kMeansTimeList)).T, eigenQuality
        else:
            return clustersList
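
The incremental branch of clusterFromIterator warm-starts k-means with the centroids carried over from the previous graph rather than re-initialising at random. A minimal scipy sketch of that warm-start idea, where the perturbed data stands in for the eigenvector matrix of the next graph:

import numpy
from scipy.cluster import vq

numpy.random.seed(0)
V = numpy.random.rand(100, 3)
centroids, distortion = vq.kmeans(V, 5, iter=20)      # first graph: random initialisation

Vnext = V + 0.01 * numpy.random.randn(100, 3)         # next graph: slightly perturbed
centroids, distortion = vq.kmeans(Vnext, centroids)   # warm start from the previous centroids
clusters, distortion = vq.vq(Vnext, centroids)
print(numpy.bincount(clusters))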