def recordResults(self, clusterList, timeList, fileName):
    """
    Save results for a particular clustering
    """
    iterator = self.getIterator()
    measures = []
    graphInfo = []
    logging.debug("Computing cluster measures")

    for i in range(len(clusterList)):
        Util.printIteration(i, self.logStep, len(clusterList))
        W = next(iterator)
        #G = networkx.Graph(W)
        # Store modularity, k-way normalised cut, and the number of clusters
        currentMeasures = [GraphUtils.modularity(W, clusterList[i]),
                           GraphUtils.kwayNormalisedCut(W, clusterList[i]),
                           len(numpy.unique(clusterList[i]))]
        measures.append(currentMeasures)

        # Graph size (number of vertices)
        currentGraphInfo = [W.shape[0]]
        graphInfo.append(currentGraphInfo)
        # Number of connected components
        #graphInfo[i, 1] = networkx.number_connected_components(G)

    measures = numpy.array(measures)
    graphInfo = numpy.array(graphInfo)

    # Positional savez arguments are stored under the keys arr_0, arr_1, arr_2
    numpy.savez(fileName, measures, timeList, graphInfo)
    logging.debug("Saved file as " + fileName)
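# A minimal sketch (not part of the original file) of reading a results file
# written by recordResults above. numpy.savez appends ".npz" to fileName when
# missing, and positional arguments are keyed arr_0, arr_1, arr_2:
#
#   path = fileName if fileName.endswith(".npz") else fileName + ".npz"
#   data = numpy.load(path)
#   measures, times, graphInfo = data["arr_0"], data["arr_1"], data["arr_2"]
#   # measures[:, 0] holds modularity, measures[:, 1] the k-way normalised
#   # cut, measures[:, 2] the number of clusters; graphInfo[:, 0] is the
#   # number of vertices of each graph.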
def modelSelect(self, X, ks, lmbdas, gammas, nFolds, maxNTry=5):
    """
    Choose parameters based on a single matrix X. We do cross validation
    within, and set parameters according to the mean squared error.
    Return nothing.
    """
    logging.debug("Performing model selection")

    # Useful setup: work on the COO representation and free memory first
    X = X.tocoo()
    gc.collect()
    nK = len(ks)
    nLmbda = len(lmbdas)
    nGamma = len(gammas)
    nLG = nLmbda * nGamma
    errors = numpy.zeros((nK, nLmbda, nGamma, nFolds))

    # Generate cross validation sets
    cvInds = Sampling.randCrossValidation(nFolds, X.nnz)

    # Compute the error for each fold / parameter setting
    for icv, (trainInds, testInds) in enumerate(cvInds):
        Util.printIteration(icv, 1, nFolds, "Fold: ")

        trainX = SparseUtils.submatrix(X, trainInds)
        testX = SparseUtils.submatrix(X, testInds)

        assert trainX.nnz == trainInds.shape[0]
        assert testX.nnz == testInds.shape[0]
        nptst.assert_array_almost_equal((testX + trainX).data, X.data)

        paramList = []
        for ik, k in enumerate(ks):
            for ilmbda, lmbda in enumerate(lmbdas):
                for igamma, gamma in enumerate(gammas):
                    paramList.append((trainX, testX, k, lmbda, gamma, maxNTry))

        # ! Remark !
        # The runs over parameter settings can easily be parallelised.
        # Parallelising over CV folds is not done, as it consumes much
        # more memory.

        # Parallel version (copied from IterativeSoftImpute, but not tested)
        #pool = multiprocessing.Pool(processes=multiprocessing.cpu_count()//2, maxtasksperchild=10)
        #results = pool.imap(self.learnPredict, paramList)
        #pool.terminate()

        # Non-parallel version
        results = numpy.array(list(itertools.starmap(self.learnPredict, paramList)))

        errors[:, :, :, icv] = results.reshape((nK, nLmbda, nGamma))

    # Compute the cross validation error for each setting, replacing
    # infinite and NaN errors by the worst finite error observed
    errors[errors == float("inf")] = errors[errors != float("inf")].max()
    errors[numpy.isnan(errors)] = numpy.max(errors[numpy.logical_not(numpy.isnan(errors))])
    meanErrors = errors.mean(3)
    stdErrors = errors.std(3)

    logging.debug("Mean errors given (k, lambda, gamma):")
    logging.debug(meanErrors)
    logging.debug("... with standard deviation:")
    logging.debug(stdErrors)

    # Keep the best setting: unravel the flat argmin over the
    # (nK, nLmbda, nGamma) grid back into per-parameter indices
    iMin = meanErrors.argmin()
    kMin = ks[iMin // nLG]
    lmbdaMin = lmbdas[(iMin % nLG) // nGamma]
    gammaMin = gammas[iMin % nGamma]
    logging.debug("argmin: (k, lambda, gamma) = (" + str(kMin) + ", "
                  + str(lmbdaMin) + ", " + str(gammaMin) + ")")
    logging.debug("min = " + str(meanErrors[iMin // nLG,
                                            (iMin % nLG) // nGamma,
                                            iMin % nGamma]))

    self.baseLearner.k = kMin
    self.baseLearner.lmbda = lmbdaMin
    self.baseLearner.gamma = gammaMin
    return
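# Usage sketch (hypothetical names, not from this file): select
# (k, lambda, gamma) by 3-fold cross validation on a sparse matrix X,
# then read the chosen values back off the base learner.
#
#   ks = [8, 16, 32]
#   lmbdas = [0.01, 0.1, 1.0]
#   gammas = [0.1, 1.0]
#   learner.modelSelect(X, ks, lmbdas, gammas, nFolds=3)
#   logging.info("chosen k=%d, lmbda=%g, gamma=%g" % (learner.baseLearner.k,
#                learner.baseLearner.lmbda, learner.baseLearner.gamma))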