示例#1
0
    def recordResults(self, clusterList, timeList, fileName):
        """
        Save results for a particular clustering
        """
        iterator = self.getIterator()
        measures = []
        graphInfo =  []
        logging.debug("Computing cluster measures")

        for i in range(len(clusterList)):
            Util.printIteration(i, self.logStep, len(clusterList))
            W = next(iterator)
            #G = networkx.Graph(W)
            #Store modularity, k-way normalised cut, and cluster size 
            currentMeasures = [GraphUtils.modularity(W, clusterList[i]), GraphUtils.kwayNormalisedCut(W, clusterList[i]), len(numpy.unique(clusterList[i]))] 
            measures.append(currentMeasures) 

            # graph size
            currentGraphInfo = [W.shape[0]]
            graphInfo.append(currentGraphInfo)
            # nb connected components
            #graphInfo[i, 1] = networkx.number_connected_components(G)
        
        measures = numpy.array(measures)
        graphInfo = numpy.array(graphInfo)
        
        numpy.savez(fileName, measures, timeList, graphInfo)
        logging.debug("Saved file as " + fileName)
示例#2
0
    def modelSelect(self, X, ks, lmbdas, gammas, nFolds, maxNTry=5):
        """
        Choose parameters based on a single matrix X. We do cross validation
        within, and set parameters according to the mean squared error.
        Return nothing.
        """
        logging.debug("Performing model selection")

        # usefull
        X = X.tocoo()
        gc.collect()
        nK = len(ks)
        nLmbda = len(lmbdas)
        nGamma = len(gammas)
        nLG = nLmbda * nGamma
        errors = scipy.zeros((nK, nLmbda, nGamma, nFolds))

        # generate cross validation sets
        cvInds = Sampling.randCrossValidation(nFolds, X.nnz)

        # compute error for each fold / setting
        for icv, (trainInds, testInds) in enumerate(cvInds):
            Util.printIteration(icv, 1, nFolds, "Fold: ")

            trainX = SparseUtils.submatrix(X, trainInds)
            testX = SparseUtils.submatrix(X, testInds)

            assert trainX.nnz == trainInds.shape[0]
            assert testX.nnz == testInds.shape[0]
            nptst.assert_array_almost_equal((testX + trainX).data, X.data)

            paramList = []

            for ik, k in enumerate(ks):
                for ilmbda, lmbda in enumerate(lmbdas):
                    for igamma, gamma in enumerate(gammas):
                        paramList.append(
                            (trainX, testX, k, lmbda, gamma, maxNTry))

            # ! Remark !
            # we can parallelize the run of parameters easely.
            # parallelize the run of cv-folds is not done as it is much more
            # memory-consuming

            # parallel version (copied from IteraticeSoftImpute, but not tested)
            #pool = multiprocessing.Pool(processes=multiprocessing.cpu_count()/2, maxtasksperchild=10)
            #results = pool.imap(self.learnPredict, paramList)
            #pool.terminate()

            # non-parallel version
            results = scipy.array(
                list(itertools.starmap(self.learnPredict, paramList)))

            errors[:, :, :, icv] = scipy.array(results).reshape(
                (nK, nLmbda, nGamma))

        # compute cross validation error for each setting
        errors[errors == float("inf")] = errors[errors != float("inf")].max()
        errors[numpy.isnan(errors)] = numpy.max(errors[numpy.logical_not(
            numpy.isnan(errors))])
        meanErrors = errors.mean(3)
        stdErrors = errors.std(3)
        logging.debug("Mean errors given (k, lambda, gamma):")
        logging.debug(meanErrors)
        logging.debug("... with standard deviation:")
        logging.debug(stdErrors)

        # keep the best
        iMin = meanErrors.argmin()
        kMin = ks[int(scipy.floor(iMin / (nLG)))]
        lmbdaMin = lmbdas[int(scipy.floor((iMin % nLG) / nGamma))]
        gammaMin = gammas[int(scipy.floor(iMin % nGamma))]
        logging.debug("argmin: (k, lambda, gamma) = (" + str(kMin) + ", " +
                      str(lmbdaMin) + ", " + str(gammaMin) + ")")
        logging.debug("min = " +
                      str(meanErrors[int(scipy.floor(iMin / (nLG))),
                                     int(scipy.floor((iMin % nLG) / nGamma)),
                                     int(scipy.floor(iMin % nGamma))]))

        self.baseLearner.k = kMin
        self.baseLearner.lmbda = lmbdaMin
        self.baseLearner.gamma = gammaMin

        return
    def modelSelect(self, X, ks, lmbdas, gammas, nFolds, maxNTry=5):
        """
        Choose parameters based on a single matrix X. We do cross validation
        within, and set parameters according to the mean squared error.
        Return nothing.
        """
        logging.debug("Performing model selection")

        # usefull
        X = X.tocoo()
        gc.collect()
        nK = len(ks) 
        nLmbda = len(lmbdas) 
        nGamma = len(gammas) 
        nLG = nLmbda * nGamma
        errors = scipy.zeros((nK, nLmbda, nGamma, nFolds))
       
        # generate cross validation sets
        cvInds = Sampling.randCrossValidation(nFolds, X.nnz)
        
        # compute error for each fold / setting
        for icv, (trainInds, testInds) in enumerate(cvInds):
            Util.printIteration(icv, 1, nFolds, "Fold: ")

            trainX = SparseUtils.submatrix(X, trainInds)
            testX = SparseUtils.submatrix(X, testInds)

            assert trainX.nnz == trainInds.shape[0]
            assert testX.nnz == testInds.shape[0]
            nptst.assert_array_almost_equal((testX+trainX).data, X.data)

            paramList = []
        
            for ik, k in enumerate(ks):
                for ilmbda, lmbda in enumerate(lmbdas):
                    for igamma, gamma in enumerate(gammas):
                        paramList.append((trainX, testX, k, lmbda, gamma, maxNTry)) 
            
            # ! Remark !
            # we can parallelize the run of parameters easely.
            # parallelize the run of cv-folds is not done as it is much more
            # memory-consuming 
            
            # parallel version (copied from IteraticeSoftImpute, but not tested) 
            #pool = multiprocessing.Pool(processes=multiprocessing.cpu_count()/2, maxtasksperchild=10)
            #results = pool.imap(self.learnPredict, paramList)
            #pool.terminate()

            # non-parallel version 
            results = scipy.array(list(itertools.starmap(self.learnPredict, paramList)))

            errors[:, :, :, icv] = scipy.array(results).reshape((nK, nLmbda, nGamma))
        
        # compute cross validation error for each setting
        errors[errors == float("inf")] = errors[errors != float("inf")].max()
        errors[numpy.isnan(errors)] = numpy.max(errors[numpy.logical_not(numpy.isnan(errors))])
        meanErrors = errors.mean(3)
        stdErrors = errors.std(3)
        logging.debug("Mean errors given (k, lambda, gamma):")
        logging.debug(meanErrors)
        logging.debug("... with standard deviation:")
        logging.debug(stdErrors)

        # keep the best
        iMin = meanErrors.argmin()
        kMin = ks[int(scipy.floor(iMin/(nLG)))]
        lmbdaMin = lmbdas[int(scipy.floor((iMin%nLG)/nGamma))]
        gammaMin = gammas[int(scipy.floor(iMin%nGamma))]
        logging.debug("argmin: (k, lambda, gamma) = (" + str(kMin) + ", " + str(lmbdaMin) + ", " + str(gammaMin) + ")")
        logging.debug("min = " + str(meanErrors[int(scipy.floor(iMin/(nLG))), int(scipy.floor((iMin%nLG)/nGamma)), int(scipy.floor(iMin%nGamma))]))
        
        self.baseLearner.k = kMin
        self.baseLearner.lmbda = lmbdaMin
        self.baseLearner.gamma = gammaMin
        
        return