def profileRunExperiment(self):
    def run():
        dataArgs = argparse.Namespace()
        dataArgs.maxIter = 3
        # Set iterStartDate to None for all iterations
        # dataArgs.iterStartTimeStamp = None
        dataArgs.iterStartTimeStamp = time.mktime(datetime(2005, 1, 1).timetuple())
        generator = MovieLensDataset(maxIter=dataArgs.maxIter, iterStartTimeStamp=dataArgs.iterStartTimeStamp)

        defaultAlgoArgs = argparse.Namespace()
        defaultAlgoArgs.ks = numpy.array(2**numpy.arange(6, 7, 0.5), numpy.int)
        defaultAlgoArgs.svdAlgs = ["rsvd"]
        defaultAlgoArgs.runSoftImpute = True

        dataParser = argparse.ArgumentParser(description="", add_help=False)
        dataParser.add_argument("-h", "--help", action="store_true", help="show this help message and exit")
        devNull, remainingArgs = dataParser.parse_known_args(namespace=dataArgs)

        dataArgs.extendedDirName = ""
        dataArgs.extendedDirName += "MovieLensDataset"

        recommendExpHelper = RecommendExpHelper(generator.getTrainIteratorFunc, generator.getTestIteratorFunc, remainingArgs, defaultAlgoArgs, dataArgs.extendedDirName)
        recommendExpHelper.printAlgoArgs()

        # os.makedirs(resultsDir, exist_ok=True)  # for Python >= 3.2
        try:
            os.makedirs(recommendExpHelper.resultsDir)
        except OSError as err:
            if err.errno != errno.EEXIST:
                raise

        recommendExpHelper.runExperiment()

    ProfileUtils.profile('run()', globals(), locals())
def profileIterator(self):
    def run():
        subgraphIndicesList = []
        for W in self.iterator:
            subgraphIndicesList.append(range(W.shape[0]))

    ProfileUtils.profile('run()', globals(), locals())
def profileModelSelection(self):
    dataset = ArnetMinerDataset(runLSI=False)
    dataset.overwrite = True
    dataset.overwriteVectoriser = True
    dataset.overwriteModel = True
    dataset.dataFilename = dataset.dataDir + "DBLP-citation-100000.txt"

    ProfileUtils.profile('dataset.modelSelection()', globals(), locals())
def profileMC2(self):
    numVals = 5000
    list1 = numpy.random.permutation(numVals).tolist()
    list2 = numpy.random.permutation(numVals).tolist()
    lists = [list1, list2]
    itemList = numpy.arange(numVals).tolist()

    ProfileUtils.profile('RankAggregator.MC2(lists, itemList)', globals(), locals())
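# A minimal numpy sketch of the MC2 rank-aggregation chain (Dwork et al., 2001)
# that RankAggregator.MC2 presumably implements: from item i, pick one of the
# input lists uniformly at random, then jump uniformly to an item that list
# ranks at least as highly as i; items are then ordered by the stationary
# distribution. An illustration for small item sets, not the library's
# implementation.
def mc2Sketch(lists, itemList):
    import numpy
    n = len(itemList)
    # ranks[l][item] == 0 means the item is ranked first in list l
    ranks = [dict((item, r) for r, item in enumerate(lst)) for lst in lists]
    P = numpy.zeros((n, n))

    for i, item in enumerate(itemList):
        for rank in ranks:
            better = [j for j, other in enumerate(itemList) if rank[other] <= rank[item]]
            P[i, better] += 1.0/(len(lists)*len(better))

    # stationary distribution = leading left eigenvector of the chain
    vals, vecs = numpy.linalg.eig(P.T)
    pi = numpy.abs(numpy.real(vecs[:, numpy.argmax(numpy.real(vals))]))
    return [itemList[j] for j in numpy.argsort(-pi)]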
def profileSvd(self):
    n = 5000
    p = 0.1
    L = scipy.sparse.rand(n, n, p)
    L = L.T.dot(L)
    k = 50
    q = 2

    ProfileUtils.profile('RandomisedSVD.svd(L, k, q)', globals(), locals())
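# A minimal sketch of the randomised SVD technique (Halko et al., 2011) that
# RandomisedSVD.svd(L, k, q) presumably profiles; an illustration of the
# algorithm, not the library's implementation. Works for dense or scipy
# sparse L.
def randomisedSvdSketch(L, k, q):
    import numpy
    Omega = numpy.random.randn(L.shape[1], k)   # random test matrix
    Y = L @ Omega                               # sample the range of L

    for i in range(q):                          # power iterations sharpen the spectral decay
        Y = L @ (L.T @ Y)

    Q, R = numpy.linalg.qr(Y)                   # orthonormal basis for the sampled range
    B = (L.T @ Q).T                             # project L onto the subspace
    Uhat, s, Vt = numpy.linalg.svd(B, full_matrices=False)
    return Q @ Uhat, s, Vt                      # approximate rank-k SVD factors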
def profileClusterFromIterator(self):
    iterator = IncreasingSubgraphListIterator(self.graph, self.subgraphIndicesList)
    dataDir = PathDefaults.getDataDir() + "cluster/"
    # iterator = getBemolGraphIterator(dataDir)

    def run():
        clusterList, timeList, boundList = self.clusterer.clusterFromIterator(iterator, verbose=True)
        print(timeList.cumsum(0))

    ProfileUtils.profile('run()', globals(), locals())
def profileGreedyMethod2(self):
    n = 1000
    p = 0.1
    graph = igraph.Graph.Erdos_Renyi(n, p)
    print(graph.summary())
    k = 5
    numpy.random.seed(21)

    ProfileUtils.profile("MaxInfluence.greedyMethod2(graph, k, p=0.5, numRuns=1000)", globals(), locals())
def profileTrainIterator(self):
    def run():
        dataset = NetflixDataset(maxIter=30)
        trainIterator = dataset.getTrainIteratorFunc()

        for trainX in trainIterator:
            print(trainX.shape)

    ProfileUtils.profile('run()', globals(), locals())
def profileComputeLDA(self):
    field = "Boosting"
    dataset = ArnetMinerDataset(field)
    dataset.overwrite = True
    dataset.overwriteVectoriser = True
    dataset.overwriteModel = True
    dataset.maxRelevantAuthors = 100
    dataset.k = 200
    dataset.dataFilename = dataset.dataDir + "DBLP-citation-100000.txt"

    ProfileUtils.profile('dataset.computeLDA()', globals(), locals())
def profileDecisionTreeRegressor(self):
    numExamples = 1000
    numFeatures = 20
    minSplit = 10
    maxDepth = 20

    generator = ExamplesGenerator()
    X, y = generator.generateBinaryExamples(numExamples, numFeatures)

    regressor = DecisionTreeRegressor(min_split=minSplit, max_depth=maxDepth, min_density=0.0)
    ProfileUtils.profile('regressor.fit(X, y)', globals(), locals())
def profileSimulateCascades(self):
    n = 500
    p = 0.1
    graph = igraph.Graph.Erdos_Renyi(n, p)
    k = 50
    activeVertices = set(numpy.random.randint(0, n, 10))
    numRuns = 100

    ProfileUtils.profile("MaxInfluence.simulateCascades(graph, activeVertices, numRuns, p=0.5)", globals(), locals())
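# A minimal sketch of a single independent-cascade run of the kind
# MaxInfluence.simulateCascades presumably averages over numRuns. The exact
# cascade semantics here (each newly activated vertex tries once to activate
# each inactive neighbour with probability p) are an assumption.
def simulateCascadeSketch(graph, seedVertices, p):
    import numpy
    active = set(seedVertices)
    frontier = set(seedVertices)

    while len(frontier) != 0:
        newFrontier = set()
        for v in frontier:
            for u in graph.neighbors(v):
                if u not in active and numpy.random.rand() < p:
                    newFrontier.add(u)
        active.update(newFrontier)
        frontier = newFrontier

    return len(active)   # final number of influenced vertices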
def profileEigenRemove(self):
    k = 50
    n = 1000
    X = numpy.random.rand(n, n)
    m = 900

    XX = X.dot(X.T)
    self.omega, self.Q = numpy.linalg.eig(XX)

    def runEigenRemove():
        for i in range(10):
            EigenUpdater.eigenRemove(self.omega, self.Q, m, k)

    ProfileUtils.profile('runEigenRemove()', globals(), locals())
def profileEigenConcat(self):
    k = 10
    n = 1000
    m = 100
    X = numpy.random.rand(n, n)
    XX = X.dot(X.T)

    self.AA = XX[0:m, 0:m]
    self.AB = XX[0:m, m:]
    self.BB = XX[m:, m:]

    self.omega, self.Q = numpy.linalg.eig(self.AA)

    ProfileUtils.profile('EigenUpdater.eigenConcat(self.omega, self.Q, self.AB, self.BB, k)', globals(), locals())
def profilePowerIteration2(self):
    p = 100
    q = 5
    omega = numpy.random.randn(self.X.shape[1], p)
    L = GeneralLinearOperator.asLinearOperator(self.X, parallel=True)

    def run():
        Y = L.matmat(omega)

        for i in range(q):
            Y = L.rmatmat(Y)
            Y = L.matmat(Y)

    ProfileUtils.profile('run()', globals(), locals())
def profileModelSelect(self):
    learner = LibSVM()
    numExamples = 10000
    numFeatures = 10

    X = numpy.random.rand(numExamples, numFeatures)
    Y = numpy.array(numpy.random.rand(numExamples) < 0.1, numpy.int)*2 - 1

    def run():
        for i in range(5):
            print("Iteration " + str(i))
            idx = Sampling.crossValidation(self.folds, numExamples)
            learner.parallelModelSelect(X, Y, idx, self.paramDict)

    ProfileUtils.profile('run()', globals(), locals())
def learnModel(self, X, Y):
    Parameter.checkClass(X, numpy.ndarray)
    Parameter.checkClass(Y, numpy.ndarray)
    Parameter.checkArray(X)
    Parameter.checkArray(Y)

    if numpy.unique(Y).shape[0] < 2:
        raise ValueError("Vector of labels must be binary, currently numpy.unique(Y) = " + str(numpy.unique(Y)))

    # If Y is 1D make it 2D
    if Y.ndim == 1:
        Y = numpy.array([Y]).T

    XY = self._getDataFrame(X, Y)
    formula = robjects.Formula('class ~ .')
    self.learnModelDataFrame(formula, XY)

    gc.collect()
    robjects.r('gc(verbose=TRUE)')
    robjects.r('memory.profile()')
    gc.collect()

    if self.printMemStats:
        logging.debug(self.getLsos()())
        logging.debug(ProfileUtils.memDisplay(locals()))
def profileEigenAdd2(self):
    k = 10
    n = 1000
    m = 200
    X = numpy.random.rand(n, n)
    Y = numpy.random.rand(n, m)
    XX = X.dot(X.T)
    self.omega, self.Q = numpy.linalg.eig(XX)

    def runEigenAdd2():
        for i in range(10):
            EigenUpdater.eigenAdd2(self.omega, self.Q, Y, Y, k)

    ProfileUtils.profile('runEigenAdd2()', globals(), locals())
def profileLearnModel(self):
    numExamples = 1000
    numFeatures = 50
    minSplit = 10
    maxDepth = 20

    generator = ExamplesGenerator()
    X, y = generator.generateBinaryExamples(numExamples, numFeatures)
    y = numpy.array(y, numpy.float)

    learner = DecisionTreeLearner(minSplit=minSplit, maxDepth=maxDepth, pruneType="REP-CV")
    # learner.learnModel(X, y)
    # print("Done")
    ProfileUtils.profile('learner.learnModel(X, y)', globals(), locals())

    print(learner.getTree().getNumVertices())
def profilePredict(self):
    # Make the prediction function faster
    numExamples = 1000
    numFeatures = 20
    minSplit = 1
    maxDepth = 20

    generator = ExamplesGenerator()
    X, y = generator.generateBinaryExamples(numExamples, numFeatures)

    learner = DecisionTreeLearner(minSplit=minSplit, maxDepth=maxDepth)
    learner.learnModel(X, y)
    print(learner.getTree().getNumVertices())

    ProfileUtils.profile('learner.predict(X)', globals(), locals())
    print(learner.getTree().getNumVertices())
def profileParallelPen(self):
    learner = LibSVM(processes=8)
    learner.setChunkSize(2)
    numExamples = 10000
    numFeatures = 10

    X = numpy.random.rand(numExamples, numFeatures)
    Y = numpy.array(numpy.random.rand(numExamples) < 0.1, numpy.int)*2 - 1
    Cvs = [self.folds - 1]

    def run():
        for i in range(2):
            print("Iteration " + str(i))
            idx = Sampling.crossValidation(self.folds, numExamples)
            learner.parallelPen(X, Y, idx, self.paramDict, Cvs)

    ProfileUtils.profile('run()', globals(), locals())
def profileLearnModel(self):
    treeRank = TreeRank(self.leafRanklearner)
    treeRank.setMaxDepth(10)
    treeRank.setMinSplit(50)

    numExamples = 5000
    numFeatures = 10

    X = numpy.random.rand(numExamples, numFeatures)
    Y = numpy.array(numpy.random.rand(numExamples) < 0.1, numpy.int)*2 - 1

    def run():
        for i in range(5):
            print("Iteration " + str(i))
            treeRank.learnModel(X, Y)
            # print(treeRank.getTreeSize())
            # print(treeRank.getTreeDepth())

    ProfileUtils.profile('run()', globals(), locals())
def profileModelSelect(self):
    learner = DecisionTreeLearner(minSplit=5, maxDepth=30, pruneType="CART")
    numExamples = 1000
    numFeatures = 10
    folds = 5

    paramDict = {}
    paramDict["setGamma"] = numpy.array(numpy.round(2**numpy.arange(1, 7.5, 0.5)-1), dtype=numpy.int)

    X = numpy.random.rand(numExamples, numFeatures)
    Y = numpy.array(numpy.random.rand(numExamples) < 0.1, numpy.int)*2 - 1

    def run():
        for i in range(5):
            print("Iteration " + str(i))
            idx = Sampling.crossValidation(folds, numExamples)
            learner.parallelModelSelect(X, Y, idx, paramDict)

    ProfileUtils.profile('run()', globals(), locals())
def profileLearnModel(self):
    treeRankForest = TreeRankForestR()
    treeRankForest.printMemStats = True
    treeRankForest.setMaxDepth(2)
    treeRankForest.setNumTrees(5)

    numExamples = 650
    numFeatures = 950

    X = numpy.random.rand(numExamples, numFeatures)
    Y = numpy.array(numpy.random.rand(numExamples) < 0.1, numpy.int)

    def run():
        for i in range(10):
            print("Iteration " + str(i))
            treeRankForest.learnModel(X, Y)
            # print(treeRank.getTreeSize())
            # print(treeRank.getTreeDepth())

    ProfileUtils.profile('run()', globals(), locals())
def profileLearnModel(self):
    treeRank = TreeRankR()
    treeRank.printMemStats = True
    treeRank.setMaxDepth(2)
    treeRank.setMinSplit(50)
    treeRank.setLeafRank(treeRank.getLrLinearSvmPlain())

    numExamples = 650
    numFeatures = 950

    X = numpy.random.rand(numExamples, numFeatures)
    Y = numpy.array(numpy.random.rand(numExamples) < 0.1, numpy.int)

    def run():
        for i in range(5):
            print("Iteration " + str(i))
            treeRank.learnModel(X, Y)
            # print(treeRank.getTreeSize())
            # print(treeRank.getTreeDepth())

    ProfileUtils.profile('run()', globals(), locals())
def profileFindBestSplit(self):
    numExamples = 1000
    numFeatures = 100
    minSplit = 1
    maxDepth = 20

    generator = ExamplesGenerator()
    X, y = generator.generateBinaryExamples(numExamples, numFeatures)
    X = numpy.array(X, order="F")

    nodeInds = numpy.arange(X.shape[0])
    argsortX = numpy.zeros(X.shape, numpy.int, order="F")

    for i in range(X.shape[1]):
        argsortX[:, i] = numpy.argsort(X[:, i])
        argsortX[:, i] = numpy.argsort(argsortX[:, i])

    def run():
        for i in range(10):
            findBestSplit3(minSplit, X, y, nodeInds, argsortX)

    ProfileUtils.profile('run()', globals(), locals())
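# The double argsort in profileFindBestSplit converts each column of X into
# within-column ranks, which findBestSplit3 can then use to scan candidate
# splits in sorted order. A quick self-contained check of that identity:
#
#     x = numpy.array([0.3, 0.1, 0.2])
#     numpy.argsort(numpy.argsort(x))   # gives [2, 0, 1], the rank of each entry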
def profileCluster2(self):
    numVertices = 250
    graph = SparseGraph(GeneralVertexList(numVertices))
    p = 0.1
    generator = ErdosRenyiGenerator(p)
    graph = generator.generate(graph)

    W = graph.getWeightMatrix()
    WList = []

    for i in range(50):
        s = numpy.random.randint(0, numVertices)
        t = numpy.random.randint(0, numVertices)
        logging.info("%d %d", s, t)
        W[s, t] += 0.5
        W[t, s] += 0.5
        WList.append(W.copy())

    iterator = iter(WList)
    ProfileUtils.profile('self.clusterer.cluster(iterator)', globals(), locals())
def safeSvd(A, eps=10**-8, tol=10**-8):
    """
    Compute the SVD of a matrix using scipy.linalg.svd, and if convergence fails
    revert to Util.svd.
    """
    # check input matrix
    if __debug__:
        if not Parameter.checkArray(A, softCheck=True):
            logging.info("... in Util.safeSvd")

    try:
        # run scipy.linalg.svd
        try:
            P, sigma, Qh = scipy.linalg.svd(A, full_matrices=False)
        except scipy.linalg.LinAlgError as e:
            logging.warn(str(e))
            raise Exception('SVD decomposition has to be computed from EVD decomposition')

        # --- only when the SVD decomposition comes from scipy.linalg.svd ---
        # clean output singular values (sometimes scipy.linalg.svd returns NaN
        # or negative singular values, so remove them)
        inds = numpy.arange(sigma.shape[0])[sigma > tol]
        if inds.shape[0] < sigma.shape[0]:
            P, sigma, Q = Util.indSvd(P, sigma, Qh, inds)
            Qh = Q.conj().T

            # an expensive check but we really need it
            # rem: A*s = A.dot(diag(s)) ; A*s[:,new] = diag(s).dot(A)
            if not numpy.allclose(A, (P*sigma).dot(Qh)):
                logging.warn(" After cleaning singular values from scipy.linalg.svd, the SVD decomposition is too far from the original matrix")
                # numpy.savez("matrix_leading_to_bad_SVD.npz", A)
                raise Exception('SVD decomposition has to be computed from EVD decomposition')

        # check scipy.linalg.svd output matrices (expensive)
        if __debug__:
            badAnswerFromScipySvd = False
            if not Parameter.checkArray(P, softCheck=True, arrayInfo="P in Util.safeSvd()"):
                badAnswerFromScipySvd = True
            if not Parameter.checkArray(sigma, softCheck=True, arrayInfo="sigma in Util.safeSvd()"):
                badAnswerFromScipySvd = True
            if not Parameter.checkArray(Qh, softCheck=True, arrayInfo="Qh in Util.safeSvd()"):
                badAnswerFromScipySvd = True
            if badAnswerFromScipySvd:
                logging.warn(" After cleaning singular values from scipy.linalg.svd, the SVD decomposition still contains 'NaN', 'inf' or complex values")
                raise Exception('SVD decomposition has to be computed from EVD decomposition')
    except Exception as inst:
        if inst.args != ('SVD decomposition has to be computed from EVD decomposition',):
            raise
        logging.warn(" Using EVD method to compute the SVD.")
        P, sigma, Qh = Util.svd(A, eps, tol)

        # check Util.svd output matrices (expensive)
        if __debug__:
            badAnswerFromUtilSvd = False
            if not Parameter.checkArray(P, softCheck=True):
                logging.info("... in P in Util.safeSvd")
                badAnswerFromUtilSvd = True
                # print NaN rows in P: numpy.isnan(P).sum(0).nonzero()
            if not Parameter.checkArray(sigma, softCheck=True):
                logging.info("... in sigma in Util.safeSvd")
                badAnswerFromUtilSvd = True
                # print numpy.isnan(sigma).nonzero()
            if not Parameter.checkArray(Qh, softCheck=True):
                logging.info("... in Q in Util.safeSvd")
                badAnswerFromUtilSvd = True
                # blop = numpy.isnan(Qh).sum(1)
                # print blop.nonzero(); print blop[blop.nonzero()]
            if badAnswerFromUtilSvd:
                logging.warn(" SVD decomposition obtained from EVD decomposition contains 'NaN', 'inf' or complex values")

    from apgl.util.ProfileUtils import ProfileUtils
    if ProfileUtils.memory() > 10**9:
        ProfileUtils.memDisplay(locals())

    return P, sigma, Qh
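# A small usage sketch for safeSvd: checks that the returned factors
# reconstruct the input, using the same (P*sigma).dot(Qh) form as the
# internal consistency check above.
def safeSvdExample():
    A = numpy.random.rand(50, 30)
    P, sigma, Qh = safeSvd(A)
    assert numpy.allclose(A, (P*sigma).dot(Qh))
    print("safeSvd reconstruction ok, %d singular values" % sigma.shape[0])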
def testMemory(self):
    logging.info(ProfileUtils.memory())
def profileCluster(self):
    iterator = IncreasingSubgraphListIterator(self.graph, self.subgraphIndicesList)
    ProfileUtils.profile('self.clusterer.cluster(iterator)', globals(), locals())
def profileVectoriseDocuments(self):
    field = "Boosting"
    dataset = ArnetMinerDataset(field)

    ProfileUtils.profile('dataset.vectoriseDocuments()', globals(), locals())