def readAuthorsAndDocuments(self, useAbstract=True):
    """
    Parse self.dataFilename into parallel lists of authors, documents and
    citation counts.

    Lines are matched against the markers ``#*`` (title), ``#@`` (comma
    separated authors), ``#!`` (abstract) and ``#citation`` (citation count).
    A line consisting only of a newline ends the current record: the record
    is appended to the output lists and the per-record state is reset.

    :param useAbstract: If True each document is "title abstract", otherwise just the title.
    :type useAbstract: :class:`bool`

    :return: A tuple (authorList, documentList, citationList). Each entry of
        authorList is a set of stripped, non-empty author names (or an empty
        list when the record had no author line).
    """
    logging.debug("About to read file " + self.dataFilename)

    authorList = []
    citationList = []
    documentList = []

    lastAbstract = ""
    lastTitle = ""
    lastAuthors = []
    lastCitationNo = 0

    #The context manager closes the file even when a malformed line raises
    #(e.g. a non-numeric citation count)
    with open(self.dataFilename) as inFile:
        for i, line in enumerate(inFile):
            Util.printIteration(i, self.stepSize, self.numLines)

            if line == "\n":
                #NOTE(review): a record is only stored when a blank line is
                #seen, so a final record without a trailing blank line is dropped
                if useAbstract:
                    document = lastTitle + " " + lastAbstract
                else:
                    document = lastTitle

                documentList.append(document)
                authorList.append(lastAuthors)
                citationList.append(lastCitationNo)

                lastAbstract = ""
                lastTitle = ""
                lastAuthors = []
                lastCitationNo = 0
                #A bare newline cannot match any of the field markers
                continue

            #Match the fields in the file
            title = re.findall(r"#\*(.*)", line)
            currentAuthors = re.findall("#@(.*)", line)
            abstract = re.findall("#!(.*)", line)
            citationNo = re.findall("#citation(.*)", line)

            if len(title) != 0 and len(title[0]) != 0:
                lastTitle = title[0]

            if len(abstract) != 0 and len(abstract[0]) != 0:
                lastAbstract = abstract[0]

            if len(citationNo) != 0 and len(citationNo[0]) != 0:
                lastCitationNo = int(citationNo[0])

            if len(currentAuthors) != 0:
                names = set([x.strip() for x in currentAuthors[0].split(",")])
                lastAuthors = names.difference(set([""]))

    logging.debug("Finished reading " + str(len(documentList)) + " articles")
    return authorList, documentList, citationList
def __updateEigenSystem(self, lmbda, Q, deltaW, W):
    """
    Update an eigensystem one edge change at a time.

    Every non-zero entry (i, j) of deltaW with i < j is passed to
    self.incrementEigenSystem together with the current W, after which
    W[i, j] and W[j, i] are both incremented in place by deltaW[i, j].

    :param lmbda: The current eigenvalues.
    :param Q: The current eigenvectors.
    :param deltaW: Matrix of weight changes relative to W.
    :param W: The current weight matrix; modified in place.
    :return: The updated (lmbda, Q).
    """
    changeInds = deltaW.nonzero()
    rowInds = changeInds[0]
    colInds = changeInds[1]
    numChanges = rowInds.shape[0]

    for s in range(numChanges):
        Util.printIteration(s, 10, numChanges)
        i = rowInds[s]
        j = colInds[s]

        #Each symmetric change is handled once, through its i < j entry
        if i < j:
            assert deltaW[i, j] != 0
            #W must still hold the old weight when the eigensystem is incremented
            lmbda, Q = self.incrementEigenSystem(lmbda, Q, W, i, j, deltaW[i, j])
            W[i, j] += deltaW[i, j]
            W[j, i] += deltaW[i, j]

    return lmbda, Q
def __updateEigenSystem(self, lmbda, Q, deltaW, W):
    """
    Fold the weight changes in deltaW into the eigensystem (lmbda, Q).

    The non-zero entries of deltaW are visited in the order returned by
    deltaW.nonzero(). Entries with i >= j are skipped; for the others
    self.incrementEigenSystem is called and then W is updated symmetrically
    in place, so later increments see the already-updated W.

    :param lmbda: Eigenvalues before the update.
    :param Q: Eigenvectors before the update.
    :param deltaW: The change in edge weights from the weight matrix W.
    :param W: The weight matrix; updated in place.
    :return: The tuple (lmbda, Q) after all increments.
    """
    changeInds = deltaW.nonzero()
    total = changeInds[0].shape[0]

    for s, (i, j) in enumerate(zip(changeInds[0], changeInds[1])):
        Util.printIteration(s, 10, total)

        if i >= j:
            #Diagonal and mirrored entries are covered by the i < j case
            continue

        assert deltaW[i, j] != 0

        #Note: update W at each iteration here
        lmbda, Q = self.incrementEigenSystem(lmbda, Q, W, i, j, deltaW[i, j])
        W[i, j] += deltaW[i, j]
        W[j, i] += deltaW[i, j]

    return lmbda, Q
def predictEdges(self, vertexIndices):
    r"""
    Score candidate edges by summing inverse log-degrees of common neighbours.

    For a query vertex x and candidate y the score is
    \sum_{z \in n(x) \cap n(y)} 1/\log|n(z)|, where n(z) is the set of
    non-zero entries in row z of the graph weight matrix. Terms with
    \log|n(z)| = 0 are skipped.

    :param vertexIndices: Array of query vertex indices.
    :return: (P, S), two arrays with one row per query vertex and
        self.windowSize columns, filled from self.indicesFromScores.
    """
    Parameter.checkInt(self.windowSize, 1, self.graph.getNumVertices())
    logging.info("Running predictEdges in " + str(self.__class__.__name__))

    numQueries = vertexIndices.shape[0]
    P = numpy.zeros((numQueries, self.windowSize))
    S = numpy.zeros((numQueries, self.windowSize))
    W = self.graph.getWeightMatrix()

    for i in range(numQueries):
        Util.printIteration(i, self.printStep, numQueries)
        query = vertexIndices[i]
        scores = numpy.zeros(self.graph.getNumVertices())

        for j in range(self.graph.getNumVertices()):
            #Vertices adjacent to both the query vertex and j
            shared = numpy.nonzero(W[query, :] * W[j, :])[0]

            for k in shared:
                logDegree = numpy.log(numpy.nonzero(W[k, :])[0].shape[0])
                if logDegree != 0:
                    scores[j] += 1/logDegree

        P[i, :], S[i, :] = self.indicesFromScores(query, scores)

    return P, S
def cleanXML(self):
    """
    Write a cleaned copy of self.xmlFileName to self.xmlCleanFilename.

    Each line has its HTML entities unescaped and bare ampersands re-escaped.
    <title> and <ee> elements whose text contains "<" or ">" are replaced
    with a default element. Nothing is done when the clean file already exists.
    """
    if os.path.exists(self.xmlCleanFilename):
        logging.debug("File already generated: " + self.xmlCleanFilename)
        return

    logging.debug("Cleaning XML")
    h = HTMLParser.HTMLParser()
    titlePattern = re.compile(r"<title>.*[\<\>].*</title>")
    eePattern = re.compile(r"<ee>.*[\<\>].*</ee>")

    #Context managers close both files even if a line fails to convert
    with open(self.xmlFileName) as inFile:
        with open(self.xmlCleanFilename, "w") as outFile:
            for i, line in enumerate(inFile):
                Util.printIteration(i, self.stepSize, self.numLines)

                #A bare "&" is not well-formed XML, so put the entity back
                #after unescaping (the previous replace("&", "&") did nothing)
                outLine = h.unescape(line).replace("&", "&amp;")
                outLine = titlePattern.sub("<title>Default Title</title>", outLine)
                outLine = eePattern.sub("<ee>Default text</ee>", outLine)
                outFile.write(outLine)

    logging.debug("All done")
def evaluateCvOuter(self, X, Y, folds):
    """
    Evaluate this learner with stratified cross validation, recording the
    AUC and ROC curve on the train and test part of every fold.

    :param X: A matrix with examples as rows
    :type X: :class:`ndarray`

    :param Y: A 1D vector of labels
    :type Y: :class:`ndarray`

    :param folds: The number of cross validation folds
    :type folds: :class:`int`

    :return: (bestParams, allMetrics, bestMetaDicts) where bestParams is an
        empty list, allMetrics is [trainAUCs, trainROCs, testAUCs, testROCs]
        and bestMetaDicts holds one empty dict per fold.
    """
    Parameter.checkClass(X, numpy.ndarray)
    Parameter.checkClass(Y, numpy.ndarray)
    Parameter.checkInt(folds, 2, float('inf'))
    if Y.ndim != 1:
        raise ValueError("Expecting Y to be 1D")

    indexList = cross_val.StratifiedKFold(Y, folds)

    bestParams = []
    bestMetaDicts = []
    bestTrainROCs = []
    bestTestROCs = []
    bestTrainAUCs = numpy.zeros(folds)
    bestTestAUCs = numpy.zeros(folds)

    for fold, (trainInds, testInds) in enumerate(indexList):
        Util.printIteration(fold, 1, folds, "Outer CV: ")
        trainX = X[trainInds, :]
        trainY = Y[trainInds]
        testX = X[testInds, :]
        testY = Y[testInds]

        self.learnModel(trainX, trainY)

        predTrainY = self.predict(trainX)
        predTestY = self.predict(testX)
        bestTrainAUCs[fold] = Evaluator.auc(predTrainY, trainY)
        bestTestAUCs[fold] = Evaluator.auc(predTestY, testY)

        #Store the ROC curves; no meta data is collected for a fold
        bestTrainROCs.append(Evaluator.roc(trainY, predTrainY))
        bestTestROCs.append(Evaluator.roc(testY, predTestY))
        bestMetaDicts.append({})

    logging.debug("Mean test AUC = " + str(numpy.mean(bestTestAUCs)))
    logging.debug("Std test AUC = " + str(numpy.std(bestTestAUCs)))

    allMetrics = [bestTrainAUCs, bestTrainROCs, bestTestAUCs, bestTestROCs]
    return (bestParams, allMetrics, bestMetaDicts)
def supervisedMC23(lists, itemList, topQList, verbose=False):
    """
    A supervised version of MC2 of our own invention. The idea is to find a
    linear combination of the transition matrices of the input lists that
    fits the transition matrix generated from topQList.

    The combination weights solve a quadratic program with the weights
    constrained to be non-negative and to sum to one.

    :param lists: A list of ranked lists over items from itemList
    :param itemList: A list of all possible items
    :param topQList: The target list used to build the matrix to fit
    :param verbose: If True also return the per-list transition matrices
    :return: (outputList, scores) or (outputList, scores, PList) if verbose
    """
    import cvxopt
    import cvxopt.solvers
    ell = len(lists)
    n = len(itemList)
    outputList, scores, PList = RankAggregator.MC2(lists, itemList, verbose=True)

    Py = RankAggregator.generateTransitionMatrix(topQList, itemList)

    #NOTE(review): the leading eigenvector of Py.T and the vector c are
    #computed but never enter the QP below - presumably unfinished work on
    #fitting the stationary distribution; confirm before removing
    u, v = scipy.sparse.linalg.eigs(Py.T, 1)
    v = numpy.array(v).flatten()
    c = numpy.zeros(v.shape[0])
    c = cvxopt.matrix(c)

    #Q was previously filled without being created (NameError); build it as
    #supervisedMC22 does, with one flattened transition matrix per column
    Q = cvxopt.spmatrix([], [], [], (n*n, len(lists)))
    for i, P in enumerate(PList):
        Q[:, i] = cvxopt.matrix(numpy.array(P.todense()).ravel())

    QQ = Q.T * Q

    s = numpy.array(Py.todense()).ravel()
    s = cvxopt.matrix(s)

    #Constraints: -alpha <= 0 and sum(alpha) = 1
    G = cvxopt.spdiag((-numpy.ones(ell)).tolist())
    h = cvxopt.matrix(numpy.zeros(ell))
    A = cvxopt.matrix(numpy.ones(ell), (1, ell))
    b = cvxopt.matrix(numpy.ones(1))

    q = -Q.T * s
    sol = cvxopt.solvers.qp(QQ, q, G, h, A, b)
    alpha = numpy.array(sol['x'])

    #Combine the matrices
    P = numpy.zeros((n, n))
    for j, Pj in enumerate(PList):
        Util.printIteration(j, 1, ell)
        P += alpha[j] * numpy.array(Pj.todense())

    P /= ell
    outputList, scores = RankAggregator.computeOutputList(P, itemList)

    if verbose:
        return outputList, scores, PList
    else:
        return outputList, scores
def modelSelect(self, X):
    """
    Perform model selection on X over self.ks and self.lmbdas using cross
    validation, and set self.k and self.lmbda to the pair with the largest
    mean local AUC.

    :param X: A sparse matrix; X.nnz is used to generate the folds.
    :return: (meanLocalAucs, stdLocalAucs), arrays indexed by (k, lmbda)
        holding the mean and standard deviation over folds.
    """
    cvInds = Sampling.randCrossValidation(self.folds, X.nnz)
    localAucs = numpy.zeros((self.ks.shape[0], self.lmbdas.shape[0], len(cvInds)))

    logging.debug("Performing model selection")
    paramList = []

    for icv, (trainInds, testInds) in enumerate(cvInds):
        Util.printIteration(icv, 1, self.folds, "Fold: ")

        trainX = SparseUtils.submatrix(X, trainInds)
        testX = SparseUtils.submatrix(X, testInds)
        testOmegaList = SparseUtils.getOmegaList(testX)

        for i, k in enumerate(self.ks):
            maxLocalAuc = self.copy()
            maxLocalAuc.k = k
            paramList.append((trainX, testX, testOmegaList, maxLocalAuc))

    pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=100)
    try:
        resultsIterator = pool.imap(localAucsLmbdas, paramList, self.chunkSize)

        #imap yields results in the order paramList was built: fold, then k
        for icv in range(len(cvInds)):
            for i in range(self.ks.shape[0]):
                localAucs[i, :, icv] = next(resultsIterator)
    finally:
        #Make sure the worker processes go away even if a task raises
        pool.terminate()

    meanLocalAucs = numpy.mean(localAucs, 2)
    stdLocalAucs = numpy.std(localAucs, 2)

    logging.debug(meanLocalAucs)

    bestInds = numpy.unravel_index(numpy.argmax(meanLocalAucs), meanLocalAucs.shape)
    k = self.ks[bestInds[0]]
    lmbda = self.lmbdas[bestInds[1]]

    logging.debug("Model parameters: k=" + str(k) + " lmbda=" + str(lmbda))

    self.k = k
    self.lmbda = lmbda

    return meanLocalAucs, stdLocalAucs
def learnModel(self, graph):
    """
    Learn a prediction model by treating every ego network independently.

    For each vertex with at least one neighbour, the neighbours' feature
    vectors form X and the corresponding edge values form y; the alter
    regressor is trained on (X, y) and the weights it returns are stacked
    as a row of W. The ego regressor is then trained to map the ego feature
    vectors Xe onto W.

    :param graph: The input graph to learn from.
    :type graph: class:`apgl.graph.AbstractSingleGraph`

    :return: The matrix W with one row of weights per ego that has neighbours.
    """
    numVertices = graph.getNumVertices()
    logging.info("Learning model on graph of size " + str(numVertices))
    logging.info("EgoLearner: " + str(self.egoRegressor))
    logging.info("AlterLearner: " + str(self.alterRegressor))

    allIndices = numpy.arange(0, numVertices)
    V = graph.getVertexList().getVertices(list(allIndices))
    numFeatures = graph.getVertexList().getNumFeatures()
    W = numpy.zeros((0, numFeatures))
    Xe = numpy.zeros((0, numFeatures))
    printStep = numpy.floor(numVertices / 10)

    for ego in range(numVertices):
        Util.printIteration(ego, printStep, numVertices)
        neighbours = graph.neighbours(ego)

        #Vertices without neighbours contribute no training example
        if neighbours.shape[0] == 0:
            continue

        X = V[neighbours, :]
        y = numpy.ones(X.shape[0])
        for j, alter in enumerate(neighbours):
            y[j] = graph.getEdge(ego, alter)

        w = self.alterRegressor.learnModel(X, y)
        W = numpy.r_[W, numpy.array([w])]
        Xe = numpy.r_[Xe, numpy.array([V[ego, :]])]

    #Now regress the ego features Xe onto the learnt weights W
    logging.info(
        "Finding regression matrix onto weights using matrix of size " +
        str(Xe.shape))
    gc.collect()
    self.egoRegressor.learnModel(Xe, W)

    return W
def learnModel(self, graph):
    """
    Fit the alter regressor on every non-empty ego network and then fit the
    ego regressor on the resulting weights.

    For an ego i with neighbours, X holds the feature rows of the neighbours
    and y the values graph.getEdge(i, neighbour). The vector returned by
    self.alterRegressor.learnModel(X, y) becomes a row of W and the ego's
    own feature row becomes the matching row of Xe. Finally
    self.egoRegressor.learnModel(Xe, W) is called.

    :param graph: The input graph to learn from.
    :type graph: class:`apgl.graph.AbstractSingleGraph`

    :return: The stacked weight matrix W.
    """
    graphSize = graph.getNumVertices()
    logging.info("Learning model on graph of size " + str(graphSize))
    logging.info("EgoLearner: " + str(self.egoRegressor))
    logging.info("AlterLearner: " + str(self.alterRegressor))

    vertexList = graph.getVertexList()
    V = vertexList.getVertices(list(numpy.arange(0, graphSize)))
    W = numpy.zeros((0, vertexList.getNumFeatures()))
    Xe = numpy.zeros((0, vertexList.getNumFeatures()))
    printStep = numpy.floor(graphSize/10)

    for i in range(graphSize):
        Util.printIteration(i, printStep, graphSize)
        neighbours = graph.neighbours(i)
        numNeighbours = neighbours.shape[0]

        if numNeighbours != 0:
            X = V[neighbours, :]

            #Labels are the edge values between the ego and each neighbour
            y = numpy.ones(X.shape[0])
            for j in range(numNeighbours):
                y[j] = graph.getEdge(i, neighbours[j])

            egoWeights = self.alterRegressor.learnModel(X, y)
            W = numpy.r_[W, numpy.array([egoWeights])]
            Xe = numpy.r_[Xe, numpy.array([V[i, :]])]

    #Solve for the regressor of Xe onto W
    logging.info("Finding regression matrix onto weights using matrix of size " + str(Xe.shape))
    gc.collect()
    self.egoRegressor.learnModel(Xe, W)

    return W
def evaluateCvOuter(self, X, Y, folds, leafRank):
    """
    Set the leaf rank, then evaluate this learner with stratified cross
    validation, recording AUCs and ROC curves on train and test folds.

    :param X: A matrix with examples as rows
    :param Y: A 1D array of labels
    :param folds: The number of cross validation folds
    :param leafRank: Passed to self.setLeafRank before training
    :return: (bestParams, allMetrics, bestMetaDicts) where bestParams is an
        empty list, allMetrics is [trainAUCs, trainROCs, testAUCs, testROCs]
        and bestMetaDicts holds one empty dict per fold.
    """
    Parameter.checkClass(X, numpy.ndarray)
    Parameter.checkClass(Y, numpy.ndarray)
    Parameter.checkInt(folds, 2, float('inf'))
    if Y.ndim != 1:
        raise ValueError("Expecting Y to be 1D")

    indexList = cross_val.StratifiedKFold(Y, folds)
    self.setLeafRank(leafRank)

    bestParams = []
    bestMetaDicts = []
    bestTrainROCs = []
    bestTestROCs = []
    bestTrainAUCs = numpy.zeros(folds)
    bestTestAUCs = numpy.zeros(folds)

    for fold, (trainInds, testInds) in enumerate(indexList):
        Util.printIteration(fold, 1, folds)
        trainX = X[trainInds, :]
        trainY = Y[trainInds]
        testX = X[testInds, :]
        testY = Y[testInds]

        logging.debug("Distribution of labels in train: " + str(numpy.bincount(trainY)))
        logging.debug("Distribution of labels in test: " + str(numpy.bincount(testY)))

        self.learnModel(trainX, trainY)
        predTrainY = self.predict(trainX)
        predTestY = self.predict(testX)
        bestTrainAUCs[fold] = Evaluator.auc(predTrainY, trainY)
        bestTestAUCs[fold] = Evaluator.auc(predTestY, testY)

        #Store the ROC curves; no meta data is collected for a fold
        bestTrainROCs.append(Evaluator.roc(trainY, predTrainY))
        bestTestROCs.append(Evaluator.roc(testY, predTestY))
        bestMetaDicts.append({})

    logging.debug("Mean test AUC = " + str(numpy.mean(bestTestAUCs)))
    logging.debug("Std test AUC = " + str(numpy.std(bestTestAUCs)))

    allMetrics = [bestTrainAUCs, bestTrainROCs, bestTestAUCs, bestTestROCs]
    return (bestParams, allMetrics, bestMetaDicts)
def supervisedMC22(lists, itemList, topQList, verbose=False):
    """
    A supervised version of MC2 of our own invention: find a linear
    combination of the per-list transition matrices that fits the transition
    matrix generated from topQList.

    The weights are the solution of a quadratic program in which they are
    constrained to be non-negative and to sum to one.

    :param lists: A list of ranked lists over items from itemList
    :param itemList: A list of all possible items
    :param topQList: The target list used to build the matrix to fit
    :param verbose: If True also return the per-list transition matrices
    :return: (outputList, scores) or (outputList, scores, PList) if verbose
    """
    import cvxopt
    import cvxopt.solvers

    ell = len(lists)
    n = len(itemList)
    outputList, scores, PList = RankAggregator.MC2(lists, itemList, verbose=True)

    #Column i of Q is the flattened transition matrix of list i
    Q = cvxopt.spmatrix([], [], [], (n*n, ell))
    for col, transition in enumerate(PList):
        Q[:, col] = cvxopt.matrix(numpy.array(transition.todense()).ravel())
    QQ = Q.T * Q

    Py = RankAggregator.generateTransitionMatrix(topQList, itemList)
    s = cvxopt.matrix(numpy.array(Py.todense()).ravel())

    #Constraints: -alpha <= 0 and sum(alpha) = 1
    G = cvxopt.spdiag((-numpy.ones(ell)).tolist())
    h = cvxopt.matrix(numpy.zeros(ell))
    A = cvxopt.matrix(numpy.ones(ell), (1, ell))
    b = cvxopt.matrix(numpy.ones(1))
    q = -Q.T * s

    sol = cvxopt.solvers.qp(QQ, q, G, h, A, b)
    alpha = numpy.array(sol['x'])

    #Combine the matrices
    P = numpy.zeros((n, n))
    for j, Pj in enumerate(PList):
        Util.printIteration(j, 1, ell)
        P += alpha[j] * numpy.array(Pj.todense())
    P /= ell

    outputList, scores = RankAggregator.computeOutputList(P, itemList)

    if not verbose:
        return outputList, scores
    return outputList, scores, PList
def modelSelect(self, X):
    """
    Choose k and lmbda by cross validation on X.

    One task is built per (fold, k) pair and evaluated with localAucsLmbdas
    in a process pool; each task returns the local AUCs for all of
    self.lmbdas. self.k and self.lmbda are set to the pair with the largest
    mean local AUC over folds.

    :param X: A sparse matrix; X.nnz is used to generate the folds.
    :return: (meanLocalAucs, stdLocalAucs), arrays indexed by (k, lmbda).
    """
    cvInds = Sampling.randCrossValidation(self.folds, X.nnz)
    numFolds = len(cvInds)
    numKs = self.ks.shape[0]
    localAucs = numpy.zeros((numKs, self.lmbdas.shape[0], numFolds))

    logging.debug("Performing model selection")
    paramList = []

    for icv, (trainInds, testInds) in enumerate(cvInds):
        Util.printIteration(icv, 1, self.folds, "Fold: ")

        trainX = SparseUtils.submatrix(X, trainInds)
        testX = SparseUtils.submatrix(X, testInds)
        testOmegaList = SparseUtils.getOmegaList(testX)

        for k in self.ks:
            maxLocalAuc = self.copy()
            maxLocalAuc.k = k
            paramList.append((trainX, testX, testOmegaList, maxLocalAuc))

    pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=100)
    try:
        resultsIterator = pool.imap(localAucsLmbdas, paramList, self.chunkSize)

        #Results arrive in the same (fold, k) order in which tasks were queued
        for icv in range(numFolds):
            for i in range(numKs):
                localAucs[i, :, icv] = next(resultsIterator)
    finally:
        #Terminate the workers on failure as well as on success
        pool.terminate()

    meanLocalAucs = numpy.mean(localAucs, 2)
    stdLocalAucs = numpy.std(localAucs, 2)

    logging.debug(meanLocalAucs)

    kInd, lmbdaInd = numpy.unravel_index(numpy.argmax(meanLocalAucs), meanLocalAucs.shape)
    k = self.ks[kInd]
    lmbda = self.lmbdas[lmbdaInd]

    logging.debug("Model parameters: k=" + str(k) + " lmbda=" + str(lmbda))

    self.k = k
    self.lmbda = lmbda

    return meanLocalAucs, stdLocalAucs
def learnModel(self, graph):
    """
    Learn a prediction model based on all of the edges of the input graph.

    For each ego with neighbours, X contains the feature rows of its
    neighbours followed by those of randomly chosen non-neighbours (at most
    as many as there are neighbours), with y = 1 for neighbours and -1 for
    the others. The alter regressor is trained on (X, y) and its weights are
    stacked into W; the ego regressor is then trained to map the ego
    features Xe onto W. The graph is stored in self.graph.

    :param graph: The input graph to learn from.
    :type graph: class:`apgl.graph.AbstractSingleGraph`
    """
    Parameter.checkInt(self.windowSize, 1, graph.getNumVertices())
    self.graph = graph
    logging.info("Learning model on graph of size " + str(graph.getNumVertices()))

    numVertices = graph.getNumVertices()
    allIndices = numpy.arange(0, numVertices)
    V = graph.getVertexList().getVertices(allIndices)
    numFeatures = graph.getVertexList().getNumFeatures()
    W = numpy.zeros((0, numFeatures))
    Xe = numpy.zeros((0, numFeatures))
    printStep = numpy.floor(numVertices / 10)

    for ego in range(numVertices):
        Util.printIteration(ego, printStep, numVertices)
        neighbours = graph.neighbours(ego)
        numNeighbours = neighbours.shape[0]

        #Nothing to learn from a vertex with no neighbours
        if numNeighbours == 0:
            continue

        #Sample negatives among the vertices that are not neighbours
        compNeighbours = numpy.setdiff1d(allIndices, neighbours)
        perm = numpy.random.permutation(compNeighbours.shape[0])[0:numNeighbours]
        negativeVertices = V[compNeighbours[perm], :]

        X = numpy.r_[V[neighbours, :], negativeVertices]
        y = numpy.ones(X.shape[0])
        y[numNeighbours:] = -1

        w = self.alterRegressor.learnModel(X, y)
        W = numpy.r_[W, numpy.array([w])]
        Xe = numpy.r_[Xe, numpy.array([V[ego, :]])]

    #Now regress the ego features Xe onto the learnt weights W
    self.egoRegressor.learnModel(Xe, W)
def modelSelect(self, X):
    """
    Choose k by cross validation on X.

    One copy of this learner is evaluated with computePrecision for every
    (fold, k) pair, and self.k is set to the value with the largest mean
    precision over folds.

    :param X: A sparse matrix; X.nnz is used to generate the folds.
    :return: (meanPrecisions, stdPrecisions), one entry per value in self.ks.
    """
    m, n = X.shape
    cvInds = Sampling.randCrossValidation(self.folds, X.nnz)
    precisions = numpy.zeros((self.ks.shape[0], len(cvInds)))

    logging.debug("Performing model selection")
    paramList = []

    for foldInd, (trainInds, testInds) in enumerate(cvInds):
        Util.printIteration(foldInd, 1, self.folds, "Fold: ")

        trainX = SparseUtils.submatrix(X, trainInds)
        testX = SparseUtils.submatrix(X, testInds)
        testOmegaList = SparseUtils.getOmegaList(testX)

        for k in self.ks:
            learner = self.copy()
            learner.k = k
            paramList.append((trainX, testX, testOmegaList, learner))

    #Evaluated serially; results come back in the order tasks were queued
    import itertools
    resultsIterator = itertools.imap(computePrecision, paramList)

    for foldInd, _ in enumerate(cvInds):
        for kInd, _ in enumerate(self.ks):
            precisions[kInd, foldInd] = next(resultsIterator)

    meanPrecisions = numpy.mean(precisions, 1)
    stdPrecisions = numpy.std(precisions, 1)

    logging.debug(meanPrecisions)

    bestK = self.ks[numpy.argmax(meanPrecisions)]

    logging.debug("Model parameters: k=" + str(bestK))

    self.k = bestK

    return meanPrecisions, stdPrecisions
# generate_data_file(dir, nb_user=None): create the gzip data files for the
# requested number of users. The visible code collects the user counts whose
# file (BemolData.get_file_name) does not exist yet, then builds each missing
# file from the file for current_nb_user+1 by copying only the lines whose
# first numeric field maps, through a MyDictionary index, to a value below
# current_nb_user.
# NOTE(review): the first statement is garbled in this copy of the source
# ("nb_user: "******"creating file ") and the code that defines f_data_name
# is missing, so the function is left exactly as found - restore it from
# version control before editing.
# NOTE(review): the two gzip files are never closed in the visible code, and
# the handler compares error.filename with the file object f_existing_data
# rather than with f_existing_data_name - confirm the intent.
def generate_data_file(dir, nb_user=None): logging.debug("nb_user: "******"creating file " + str(f_data_name)) shutil.copy(BemolData.get_file_name(dir, None), f_data_name) # other files to generate nb_user_to_generate = [] current_nb_user = BemolData.get_nb_user_to_read(nb_user) logging.debug("current_nb_user before while: " + str(current_nb_user)) # !!!!! security failure TOCTTOU while (not os.path.exists(BemolData.get_file_name(dir, current_nb_user))): logging.debug("current_nb_user in while: " + str(current_nb_user)) nb_user_to_generate.append(current_nb_user) current_nb_user = BemolData.get_nb_user_to_read(current_nb_user+1) nb_user_to_generate.reverse() # generate other files for current_nb_user in nb_user_to_generate: # read data f_existing_data_name = BemolData.get_file_name(dir, current_nb_user+1) f_to_create_data_name = BemolData.get_file_name(dir, current_nb_user) logging.info("creating file " + f_to_create_data_name) dict_user = MyDictionary() try: f_existing_data = gzip.open(f_existing_data_name, 'rb') f_to_create_data = gzip.open(f_to_create_data_name, 'wb') i = 0 i_max = BemolData.get_nb_line(f_existing_data_name) for line in f_existing_data: Util.printIteration(i, 1000, i_max); i += 1 m = re.match("(\d+)\s(\d+)\s(\d+)\s(\d+)", line) if dict_user.index(int(m.group(1))) < current_nb_user: f_to_create_data.write(line) except IOError as error: if error.filename == f_existing_data: raise RGIOError(error, RGIOError.indent() + 'it disappeared in the meanwhile') else: raise error
def learnModel(self, graph):
    """
    Train the alter regressor on each ego network with random negative
    examples, then train the ego regressor on the resulting weights.

    For every vertex i that has neighbours, the neighbours are positive
    examples (label 1) and an equally sized random draw from the remaining
    vertices (fewer if not enough remain) are negative examples (label -1).
    The weights returned by self.alterRegressor.learnModel form a row of W
    and the features of i form the matching row of Xe; finally
    self.egoRegressor.learnModel(Xe, W) is called. The graph is kept in
    self.graph.

    :param graph: The input graph to learn from.
    :type graph: class:`apgl.graph.AbstractSingleGraph`
    """
    Parameter.checkInt(self.windowSize, 1, graph.getNumVertices())
    self.graph = graph
    logging.info("Learning model on graph of size " + str(graph.getNumVertices()))

    graphSize = graph.getNumVertices()
    allIndices = numpy.arange(0, graphSize)
    vertexList = graph.getVertexList()
    V = vertexList.getVertices(allIndices)
    W = numpy.zeros((0, vertexList.getNumFeatures()))
    Xe = numpy.zeros((0, vertexList.getNumFeatures()))
    printStep = numpy.floor(graphSize/10)

    for i in range(graphSize):
        Util.printIteration(i, printStep, graphSize)
        neighbours = graph.neighbours(i)

        if neighbours.shape[0] != 0:
            #Candidates for negative examples are all non-neighbours
            candidates = numpy.setdiff1d(allIndices, neighbours)
            chosen = numpy.random.permutation(candidates.shape[0])[0:neighbours.shape[0]]
            positives = V[neighbours, :]
            negatives = V[candidates[chosen], :]

            X = numpy.r_[positives, negatives]
            y = numpy.ones(X.shape[0])
            y[neighbours.shape[0]:] = -1

            egoWeights = self.alterRegressor.learnModel(X, y)
            W = numpy.r_[W, numpy.array([egoWeights])]
            Xe = numpy.r_[Xe, numpy.array([V[i, :]])]

    #Solve for the regressor of Xe onto W
    self.egoRegressor.learnModel(Xe, W)
def coauthorsGraphFromAuthors(self, relevantExperts):
    """
    Take a set of relevant authors and return their coauthorship graph.

    Every relevant expert is registered in an IdIndexer first. The data file
    is then scanned for author lines (marker ``#@``); each pair of authors on
    a line that are both relevant adds an edge. Repeated edges are merged by
    summing their weights.

    :param relevantExperts: A collection of author names to keep.
    :return: (graph, authorIndexer) where graph is an undirected
        igraph.Graph with edge attributes "weight" (number of joint author
        lines) and "invWeight" (its reciprocal).
    """
    authorIndexer = IdIndexer()
    author1Inds = array.array("i")
    author2Inds = array.array("i")

    for relevantExpert in relevantExperts:
        authorIndexer.append(relevantExpert)

    #The context manager closes the data file, which was previously left open
    with open(self.dataFilename) as dataFile:
        for i, line in enumerate(dataFile):
            Util.printIteration(i, self.stepSize, self.numLines)
            authors = re.findall("#@(.*)", line)

            if len(authors) == 0:
                continue

            authors = set([x.strip() for x in authors[0].split(",")])
            if len(authors.intersection(relevantExperts)) == 0:
                continue

            for author1, author2 in itertools.combinations(authors, 2):
                if author1 in relevantExperts and author2 in relevantExperts:
                    author1Inds.append(authorIndexer.append(author1))
                    author2Inds.append(authorIndexer.append(author2))

    logging.debug("Found " + str(len(authorIndexer.getIdDict())) + " coauthors")

    #Coauthor graph is undirected
    #numpy.int was an alias of the builtin int and no longer exists in NumPy
    author1Inds = numpy.array(author1Inds, int)
    author2Inds = numpy.array(author2Inds, int)
    edges = numpy.c_[author1Inds, author2Inds]

    graph = igraph.Graph()
    graph.add_vertices(len(authorIndexer.getIdDict()))
    graph.add_edges(edges)
    graph.es["weight"] = numpy.ones(graph.ecount())
    graph.simplify(combine_edges=sum)
    graph.es["invWeight"] = 1.0/(numpy.array(graph.es["weight"]))

    return graph, authorIndexer
def MC2(lists, itemList, alpha=None, verbose=False):
    """
    Perform weighted rank aggregation using MC2 as given in Rank Aggregation
    Methods for the Web, Dwork et al. The weighting vector is given by alpha.

    :param lists: A list of lists. Each sublist is an ordered set of a subset of the items from itemList

    :param itemList: A list of all possible items

    :param alpha: A vector of weights for the transition matrices; uniform weights 1/len(lists) are used when None

    :param verbose: If True also return the list of per-list transition matrices

    :return: (outputList, scores) or (outputList, scores, PList) if verbose
    """
    n = len(itemList)
    ell = len(lists)

    #An identity test is needed: "alpha == None" compares elementwise when
    #alpha is an array and cannot be used as a condition
    if alpha is None:
        alpha = numpy.ones(ell)/ell

    P = scipy.sparse.csr_matrix((n, n))
    PList = []

    logging.debug("Computing permutation matrices")
    for j, lst in enumerate(lists):
        Util.printIteration(j, 1, ell)
        Pj = RankAggregator.generateTransitionMatrix(lst, itemList)

        P = P + alpha[j] * Pj
        PList.append(Pj)

    P /= ell
    logging.debug("Done")

    outputList, scores = RankAggregator.computeOutputList(P, itemList)

    if verbose:
        return outputList, scores, PList
    else:
        return outputList, scores
def predictEdges(self, vertexIndices):
    """
    Rank candidate neighbours for each query vertex using the Jaccard index.

    With W the graph weight matrix, the score of candidate y for query x is
    the number of non-zero entries of W[x, :] * W[y, :] divided by the number
    of non-zero entries of W[x, :] + W[y, :], and 0 when the latter is 0.

    :param vertexIndices: Array of query vertex indices.
    :return: (P, S), two arrays with one row per query vertex and
        self.windowSize columns, filled from self.indicesFromScores.
    """
    logging.info("Running predictEdges in " + str(self.__class__.__name__))
    printStep = 50

    numQueries = vertexIndices.shape[0]
    P = numpy.zeros((numQueries, self.windowSize))
    S = numpy.zeros((numQueries, self.windowSize))
    W = self.graph.getWeightMatrix()

    for i in range(numQueries):
        Util.printIteration(i, printStep, numQueries)
        query = vertexIndices[i]
        scores = numpy.zeros(self.graph.getNumVertices())

        for j in range(self.graph.getNumVertices()):
            unionSize = numpy.nonzero(W[query, :] + W[j, :])[0].shape[0]

            #An empty union leaves the score at zero
            if unionSize != 0:
                intersectionSize = numpy.nonzero(W[query, :] * W[j, :])[0].shape[0]
                scores[j] = intersectionSize / float(unionSize)

        P[i, :], S[i, :] = self.indicesFromScores(query, scores)

    return P, S
def matchExperts(self):
    """
    Find the author names in the cleaned XML file that closely match the
    experts, and write them, sorted and one per line, to
    self.expertMatchesFilename. Nothing is written if that file exists.

    An author is recorded when difflib.get_close_matches finds a candidate
    among the remaining experts (cutoff self.matchCutoff); the best
    candidate is then removed so each expert is matched at most once. The
    scan stops as soon as no experts remain.
    """
    expertsSet = self.loadExperts(self.expertsFileName)

    if os.path.exists(self.expertMatchesFilename):
        logging.debug("File already generated: " + self.expertMatchesFilename)
        return

    expertMatches = set([])

    #The context manager closes the input file, which was previously left open
    with open(self.xmlCleanFilename) as inFile:
        for i, line in enumerate(inFile):
            Util.printIteration(i, self.stepSize, self.numLines)
            if i % self.stepSize == 0:
                logging.debug(expertMatches)

            author = re.findall("<author>(.*)</author>", line)
            if len(author) != 0:
                possibleMatches = difflib.get_close_matches(author[0], expertsSet, cutoff=self.matchCutoff)

                if len(possibleMatches) != 0:
                    expertMatches.add(author[0])
                    expertsSet.remove(possibleMatches[0])

            if len(expertsSet) == 0:
                logging.debug("Found all experts, breaking")
                break

    with open(self.expertMatchesFilename, "w") as expertMatchesFile:
        for expert in sorted(expertMatches):
            expertMatchesFile.write(expert + "\n")

    logging.debug("All done")
def predictEdges(self, vertexIndices):
    """
    Make predictions for a series of vertices using the Jaccard index.

    The score between a query vertex x and a vertex y is the count of
    non-zero entries of W[x, :] * W[y, :] over the count of non-zero entries
    of W[x, :] + W[y, :], where W is the weight matrix of self.graph. The
    score stays 0 when the denominator is 0.

    :param vertexIndices: Array of query vertex indices.
    :return: (P, S): for every query vertex, row i of P and S holds the two
        outputs of self.indicesFromScores, each of length self.windowSize.
    """
    logging.info("Running predictEdges in " + str(self.__class__.__name__))
    printStep = 50

    outputShape = (vertexIndices.shape[0], self.windowSize)
    P = numpy.zeros(outputShape)
    S = numpy.zeros(outputShape)
    W = self.graph.getWeightMatrix()

    for i in range(vertexIndices.shape[0]):
        Util.printIteration(i, printStep, vertexIndices.shape[0])
        scores = numpy.zeros(self.graph.getNumVertices())

        for j in range(0, self.graph.getNumVertices()):
            #The size of the union is stored first and used as the denominator
            denominator = numpy.nonzero(W[vertexIndices[i], :] + W[j, :])[0].shape[0]
            scores[j] = denominator

            if scores[j] != 0:
                numerator = numpy.nonzero(W[vertexIndices[i], :] * W[j, :])[0].shape[0]
                scores[j] = numerator/float(scores[j])

        P[i, :], S[i, :] = self.indicesFromScores(vertexIndices[i], scores)

    return P, S
def processRatings(self):
    """
    Convert the dataset into arrays and save the results for faster access.

    Reads "ratings.dat" (fields separated by "::": customer id, movie id,
    rating, time) from the movielens data directory, maps customer and movie
    ids to consecutive indices in order of first appearance, and saves the
    index/rating/date arrays with numpy.savez and the two id dictionaries
    with pickle. Nothing is done when both self.ratingFileName and
    self.custDictFileName already exist.
    """
    if os.path.exists(self.ratingFileName) and os.path.exists(self.custDictFileName):
        logging.debug("Ratings file " + str(self.ratingFileName) + " already processed")
        return

    dataDir = PathDefaults.getDataDir() + "movielens/"
    logging.debug("Processing ratings given in " + dataDir)

    custIdDict = {}
    movieIdDict = {}

    movieInds = array.array("I")
    custInds = array.array("I")
    ratings = array.array("f")
    dates = array.array("L")

    #The context manager closes the ratings file, which was previously left open
    with open(dataDir + "ratings.dat") as ratingsFile:
        for itr, line in enumerate(ratingsFile):
            Util.printIteration(itr, 100000, self.numRatings)
            vals = line.split("::")

            #A new id gets the next free index, i.e. the current dict size
            custInd = custIdDict.setdefault(int(vals[0]), len(custIdDict))
            movieInd = movieIdDict.setdefault(int(vals[1]), len(movieIdDict))
            rating = float(vals[2])
            time = int(vals[3])

            movieInds.append(movieInd)
            custInds.append(custInd)
            ratings.append(rating)
            dates.append(time)

    movieInds = numpy.array(movieInds, numpy.uint32)
    custInds = numpy.array(custInds, numpy.uint32)
    #numpy.float was an alias of the builtin float and no longer exists in NumPy
    ratings = numpy.array(ratings, float)
    dates = numpy.array(dates, numpy.uint32)

    assert ratings.shape[0] == self.numRatings

    numpy.savez(self.ratingFileName, movieInds, custInds, ratings, dates)
    logging.debug("Saved ratings file as " + self.ratingFileName)

    with open(self.custDictFileName, 'wb') as custDictFile:
        pickle.dump(custIdDict, custDictFile)
    logging.debug("Saved custIdDict as " + self.custDictFileName)

    with open(self.movieDictFileName, 'wb') as movieDictFile:
        pickle.dump(movieIdDict, movieDictFile)
    logging.debug("Saved movieIdDict as " + self.movieDictFileName)
def modelSelect(self, X, rhos, ks, cvInds):
    """
    Pick values of rho and k based on a single matrix X using cross
    validation, and set them on this object.

    For every fold in cvInds and every k in ks a copy of this learner is
    evaluated over all of rhos by the metric function. The pair (rho, k)
    with the lowest mean metric is selected for "mse", and the pair with
    the highest mean metric for "f1" and "mrr".

    :param X: the matrix from which train and test submatrices are taken.

    :param rhos: array of rho values, which must be in descending order
        (warm restarts are used).

    :param ks: array of values of k to try.

    :param cvInds: sequence of (trainInds, testInds) pairs, one per fold.

    :return: (meanMetrics, stdMetrics), each of shape
        (rhos.shape[0], ks.shape[0]), taken over the folds.

    :raises ValueError: if rhos is not in descending order or self.metric
        is not one of "mse", "f1", "mrr".
    """
    # Any element out of place means rhos is not sorted in descending order
    if (numpy.flipud(numpy.sort(rhos)) != rhos).any():
        raise ValueError("rhos must be in descending order")

    errors = numpy.zeros((rhos.shape[0], ks.shape[0], len(cvInds)))

    if self.metric == "mse":
        metricFuction = learnPredictMSE
    elif self.metric == "f1" or self.metric == "mrr":
        metricFuction = learnPredictRanking
    else:
        raise ValueError("Unknown metric: " + self.metric)

    for i, (trainInds, testInds) in enumerate(cvInds):
        Util.printIteration(i, 1, len(cvInds), "Fold: ")
        trainX = SparseUtils.submatrix(X, trainInds)
        testX = SparseUtils.submatrix(X, testInds)

        assert trainX.nnz == trainInds.shape[0]
        assert testX.nnz == testInds.shape[0]

        paramList = []
        for k in ks:
            learner = self.copy()
            learner.updateAlg = "initial"
            learner.setK(k)
            paramList.append((learner, trainX, testX, rhos))

        if self.numProcesses != 1:
            # Integer division, and at least one worker on a single core
            numWorkers = max(1, multiprocessing.cpu_count()//2)
            pool = multiprocessing.Pool(processes=numWorkers, maxtasksperchild=10)
            try:
                for m, rhoErrors in enumerate(pool.imap(metricFuction, paramList)):
                    errors[:, m, i] = rhoErrors
            finally:
                # Release the workers even if a task fails
                pool.terminate()
        else:
            for m, params in enumerate(paramList):
                errors[:, m, i] = metricFuction(params)

    meanMetrics = errors.mean(2)
    stdMetrics = errors.std(2)

    logging.debug(meanMetrics)

    #Set the parameters: errors are minimised, ranking metrics maximised
    if self.metric == "mse":
        bestFlatInd = numpy.argmin(meanMetrics)
    else:
        bestFlatInd = numpy.argmax(meanMetrics)

    rhoInd, kInd = numpy.unravel_index(bestFlatInd, meanMetrics.shape)
    self.setRho(rhos[rhoInd])
    self.setK(ks[kInd])

    logging.debug("Model parameters: k=" + str(self.k) + " rho=" + str(self.rho))

    return meanMetrics, stdMetrics
def modelSelect2(self, X, rhos, ks, cvInds, colProbs=None):
    """
    Pick values of rho and k based on a single matrix X using cross
    validation, and set them on this object. In this case a few nonzeros
    are removed from each row to form the test set.

    The train/test matrices come from Sampling.shuffleSplitRows using
    self.folds and self.validationSize. For every split and every k in ks
    a copy of this learner is evaluated over all of rhos. The pair
    (rho, k) with the lowest mean metric is selected for "mse", and the
    pair with the highest mean metric for "f1" and "mrr".

    :param X: the matrix to split into train and test matrices.

    :param rhos: array of rho values, which must be in descending order
        (warm restarts are used).

    :param ks: array of values of k to try.

    :param cvInds: only its length is used here: it sizes the fold axis
        of the metrics array and is passed to the progress output.
        NOTE(review): the number of splits comes from self.folds, so
        len(cvInds) should presumably equal it - confirm with callers.

    :param colProbs: passed through to Sampling.shuffleSplitRows.

    :return: (meanMetrics, stdMetrics), each of shape
        (rhos.shape[0], ks.shape[0]), taken over the folds.

    :raises ValueError: if rhos is not in descending order or self.metric
        is not one of "mse", "f1", "mrr".
    """
    # Any element out of place means rhos is not sorted in descending order
    if (numpy.flipud(numpy.sort(rhos)) != rhos).any():
        raise ValueError("rhos must be in descending order")

    trainTestXs = Sampling.shuffleSplitRows(X, self.folds, self.validationSize, csarray=False, rowMajor=False, colProbs=colProbs)
    metrics = numpy.zeros((rhos.shape[0], ks.shape[0], len(cvInds)))

    if self.metric == "mse":
        metricFuction = learnPredictMSE
    elif self.metric == "f1" or self.metric == "mrr":
        metricFuction = learnPredictRanking
    else:
        raise ValueError("Unknown metric: " + self.metric)

    paramList = []

    for i, (trainX, testX) in enumerate(trainTestXs):
        Util.printIteration(i, 1, len(cvInds), "Fold: ")

        for k in ks:
            learner = self.copy()
            learner.updateAlg = "initial"
            learner.setK(k)
            paramList.append((learner, trainX, testX, rhos))

    pool = None
    if self.numProcesses != 1:
        pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=10)
        resultsIter = pool.imap(metricFuction, paramList)
    else:
        # Lazy evaluation in the same order as paramList
        resultsIter = (metricFuction(params) for params in paramList)

    try:
        # Results arrive fold by fold, and within a fold in the order of ks
        for i, _ in enumerate(trainTestXs):
            for m in range(ks.shape[0]):
                metrics[:, m, i] = next(resultsIter)
    finally:
        # Release the workers even if a task fails
        if pool is not None:
            pool.terminate()

    meanMetrics = metrics.mean(2)
    stdMetrics = metrics.std(2)

    logging.debug("ks=" + str(ks))
    logging.debug("rhos=" + str(rhos))
    logging.debug(meanMetrics)

    #Set the parameters: errors are minimised, ranking metrics maximised
    if self.metric == "mse":
        bestFlatInd = numpy.argmin(meanMetrics)
    else:
        bestFlatInd = numpy.argmax(meanMetrics)

    rhoInd, kInd = numpy.unravel_index(bestFlatInd, meanMetrics.shape)
    self.setRho(rhos[rhoInd])
    self.setK(ks[kInd])

    logging.debug("Model parameters: k=" + str(self.k) + " rho=" + str(self.rho))

    return meanMetrics, stdMetrics
def testGetIterator(self):
    """
    Check the matrices produced by CitationIterGenerator.

    Every matrix from the iterator must be symmetric and must contain the
    previous matrix as its leading principal submatrix. The size and the
    number of nonzeros of the final matrix are then compared with values
    computed directly from the Cit-HepTh edge and date files.
    """
    generator = CitationIterGenerator()
    iterator = generator.getIterator()

    lastW = next(iterator)

    for W in iterator:
        self.assertTrue((W-W.T).getnnz() == 0)
        self.assertTrue((lastW - W[0:lastW.shape[0], 0:lastW.shape[0]]).getnnz() == 0)
        lastW = W

    numVertices = W.shape[0]

    #Now compute the vertexIds manually:
    dataDir = PathDefaults.getDataDir() + "cluster/"
    edgesFilename = dataDir + "Cit-HepTh.txt"
    dateFilename = dataDir + "Cit-HepTh-dates.txt"

    #We can't load in numbers using numpy since some may start with zero
    edges = []
    with open(edgesFilename, 'r') as edgesFile:
        # Skip the four header lines
        for _ in range(4):
            edgesFile.readline()

        for line in edgesFile:
            (vertex1, sep, vertex2) = line.partition("\t")
            vertex1 = vertex1.strip()
            vertex2 = vertex2.strip()
            # Prefixing "1" keeps the leading zeros of an id significant
            edges.append([int("1" + vertex1), int("1" + vertex2)])

    edges = numpy.array(edges, int)

    #Check file read correctly
    self.assertTrue((edges[0, :] == numpy.array([11001, 19304045])).all())
    self.assertTrue((edges[1, :] == numpy.array([11001, 19308122])).all())
    self.assertTrue((edges[9, :] == numpy.array([11001, 19503124])).all())

    vertexIds1 = numpy.unique(edges)
    logging.info("Number of graph vertices: " + str(vertexIds1.shape[0]))

    vertexIds2 = []
    with open(dateFilename, 'r') as dateFile:
        # Skip the single header line
        dateFile.readline()

        for line in dateFile:
            (vertexId, sep, date) = line.partition("\t")
            vertexIds2.append(int("1" + vertexId.strip()))

    #Check file read correctly
    vertexIds2 = numpy.array(vertexIds2, int)
    self.assertTrue((vertexIds2[0:10] == numpy.array([19203201, 19203202, 19203203, 19203204, 19203205, 19203206, 19203207, 19203208, 19203209, 19203210], int)).all())
    vertexIds2 = numpy.unique(numpy.array(vertexIds2, int))

    graph = DictGraph(False)
    graph.addEdges(edges)

    #Find the set of vertices with known citation
    vertices = []
    vertexId2Set = set(vertexIds2.tolist())
    for i in graph.getAllVertexIds():
        # NOTE(review): i is a vertex id here, not an iteration counter
        Util.printIteration(i, 50000, edges.shape[0])
        if i in vertexId2Set:
            vertices.append(i)
            vertices.extend(graph.neighbours(i))

    logging.debug("Number of final vertices: " + str(numVertices))
    numVertices2 = numpy.unique(numpy.array(vertices)).shape[0]
    self.assertEqual(numVertices, numVertices2)

    #Now compare the weight matrices using the undirected graph
    #Note the order of vertices is different from the iterator
    graph = DictGraph()
    graph.addEdges(edges)
    subgraph = graph.subgraph(numpy.unique(numpy.array(vertices)))
    W2 = subgraph.getSparseWeightMatrix()

    self.assertEqual(W.getnnz(), W2.getnnz())
def plotMaxTreesStats():
    """
    Track the trees rooted at the roots of trees[0] and trees[1] (as
    returned by sGraph.findTrees()) across the subgraphs in
    subgraphIndicesList.

    For each subgraph the counts of vertices whose orientation column is 0
    or 1, the tree depth and the tree size are recorded for both trees.
    Depths and sizes are saved to "treeSizesDepths.npz" in resultsDir, and
    the orientation counts are plotted against absDayList and saved as
    "MaxTreeOrientGender.eps" in figureDir. The module-level figure
    counter plotInd is incremented.
    """
    global plotInd

    biSums1 = []
    heteroSums1 = []
    biSums2 = []
    heteroSums2 = []

    treeDepth1 = []
    treeSize1 = []
    treeDepth2 = []
    treeSize2 = []

    logging.info("Finding trees")
    trees = sGraph.findTrees()

    # The root of a tree is its vertex with no incoming edges
    maxRootIndex = trees[0][numpy.nonzero(sGraph.inDegreeSequence()[trees[0]] == 0)[0]]
    secondRootIndex = trees[1][numpy.nonzero(sGraph.inDegreeSequence()[trees[1]] == 0)[0]]

    for j in range(len(subgraphIndicesList)):
        Util.printIteration(j, 1, len(subgraphIndicesList))
        subgraphIndices = numpy.array(subgraphIndicesList[j])

        # Position of each root within this subgraph, if it is present
        currentMaxRootIndex = numpy.nonzero(subgraphIndices == maxRootIndex)[0]
        currentSecondRootIndex = numpy.nonzero(subgraphIndices == secondRootIndex)[0]
        subgraph = sGraph.subgraph(subgraphIndices)

        # Use an empty tree when the root is not in the subgraph
        if currentMaxRootIndex.shape[0] == 1:
            maxTree = subgraph.subgraph(subgraph.depthFirstSearch(currentMaxRootIndex[0]))
        else:
            maxTree = subgraph.subgraph(numpy.array([]))

        if currentSecondRootIndex.shape[0] == 1:
            secondTree = subgraph.subgraph(subgraph.depthFirstSearch(currentSecondRootIndex[0]))
        else:
            secondTree = subgraph.subgraph(numpy.array([]))

        subgraphVertexArray = maxTree.getVertexList().getVertices()
        subgraphVertexArray2 = secondTree.getVertexList().getVertices()

        #Compute proportion of MSM, Male, Female, Hetero
        heteroSums1.append(numpy.sum(subgraphVertexArray[:, orientationIndex]==0))
        biSums1.append(numpy.sum(subgraphVertexArray[:, orientationIndex]==1))
        heteroSums2.append(numpy.sum(subgraphVertexArray2[:, orientationIndex]==0))
        biSums2.append(numpy.sum(subgraphVertexArray2[:, orientationIndex]==1))

        treeDepth1.append(GraphUtils.treeDepth(maxTree))
        treeSize1.append(maxTree.getNumVertices())
        treeDepth2.append(GraphUtils.treeDepth(secondTree))
        treeSize2.append(secondTree.getNumVertices())

    resultsFilename = resultsDir + "treeSizesDepths.npz"
    # The npz format is binary, so open in binary mode and always close
    with open(resultsFilename, 'wb') as resultsFile:
        numpy.savez(resultsFile, treeDepth1, treeSize1, treeDepth2, treeSize2)

    plt.figure(plotInd)
    plt.plot(absDayList, heteroSums1, plotStyles3[0], absDayList, biSums1, plotStyles3[1], absDayList, heteroSums2, plotStyles3[2], absDayList, biSums2, plotStyles3[3])
    plt.xticks(locs, labels)
    plt.xlabel("Year")
    plt.ylabel("Detections")
    plt.legend(("Max tree heterosexual", "Max tree MSM", "2nd tree heterosexual", "2nd tree MSM"), loc="upper left")
    plt.savefig(figureDir + "MaxTreeOrientGender.eps")
    plotInd += 1
def processRatings(self):
    """
    Convert the Flixster ratings file into index arrays and save the
    results for faster access.

    Nothing is done when both self.ratingFileName and
    self.custDictFileName already exist. Otherwise "Ratings.timed.txt" is
    read from the "flixster/" data directory (the first line is skipped),
    each line being split on whitespace into customer id, item id, rating
    and a "%Y-%m-%d" date. Ids are mapped to consecutive indices in order
    of first appearance, and ratings dated before 1970 are dropped. The
    rating matrix is then pruned with SparseUtils.pruneMatrix, and the
    arrays (itemInds, custInds, ratings, dates) are written with
    numpy.savez. The two id -> index dictionaries are pickled.
    """
    if os.path.exists(self.ratingFileName) and os.path.exists(self.custDictFileName):
        logging.debug("Ratings file " + str(self.ratingFileName) + " already processed")
        return

    dataDir = PathDefaults.getDataDir() + "flixster/"
    logging.debug("Processing ratings given in " + dataDir)

    custIdDict = {}
    itemIdDict = {}

    itemInds = array.array("I")
    custInds = array.array("I")
    ratings = array.array("f")
    dates = array.array("L")

    # Counts the ratings kept so far, not the lines read
    itr = 0

    # The context manager makes sure the ratings file is closed
    with open(dataDir + "Ratings.timed.txt") as ratingsFile:
        # Skip the header line
        ratingsFile.readline()

        for line in ratingsFile:
            Util.printIteration(itr, 100000, self.numRatings)
            vals = line.split()

            # A new id gets the next free index, a known id keeps its index
            custId = int(vals[0])
            custInd = custIdDict.setdefault(custId, len(custIdDict))

            itemId = int(vals[1])
            itemInd = itemIdDict.setdefault(itemId, len(itemIdDict))

            rating = float(vals[2])
            t = datetime.strptime(vals[3].strip(), "%Y-%m-%d")
            t = int(time.mktime(t.timetuple()))

            #Some dates are before 1970
            if t >= 0:
                itemInds.append(itemInd)
                custInds.append(custInd)
                ratings.append(rating)
                dates.append(t)
                itr += 1

    itemInds = numpy.array(itemInds, numpy.uint32)
    custInds = numpy.array(custInds, numpy.uint32)
    # The builtin float is the dtype the removed numpy.float alias referred to
    ratings = numpy.array(ratings, float)
    dates = numpy.array(dates, numpy.uint64)

    assert ratings.shape[0] == self.numRatings
    logging.debug("Number of ratings " + str(ratings.shape[0]))

    #Prune data
    X = scipy.sparse.csc_matrix((ratings, (custInds, itemInds)))
    X2 = scipy.sparse.csc_matrix((dates, (custInds, itemInds)))
    print(X.shape)
    X, rowInds, colInds = SparseUtils.pruneMatrix(X, minNnzRows=10, minNnzCols=10, verbose=True)
    # Apply the same row and column selection to the dates
    X2 = X2[:, colInds][rowInds, :]
    print(X.shape)

    (custInds, itemInds) = X.nonzero()
    ratings = X.data
    dates = X2.data
    logging.debug("New number of ratings " + str(ratings.shape[0]))

    numpy.savez(self.ratingFileName, itemInds, custInds, ratings, dates)
    logging.debug("Saved ratings file as " + self.ratingFileName)

    with open(self.custDictFileName, 'wb') as custDictFile:
        pickle.dump(custIdDict, custDictFile)
    logging.debug("Saved custIdDict as " + self.custDictFileName)

    with open(self.itemDictFileName, 'wb') as itemDictFile:
        pickle.dump(itemIdDict, itemDictFile)
    logging.debug("Saved itemIdDict as " + self.itemDictFileName)
def plotTreeStats():
    """
    Compute or plot per-tree statistics of the subgraphs in
    subgraphIndicesList2.

    When saveResults is set, the trees of each subgraph are found and, for
    every tree with more than one vertex, the entropy of the location
    column, the entropy of the orientation column and the range of the
    detection column are recorded. The list of per-subgraph dictionaries
    is pickled to "InfectGrowthTreeStats.pkl" in resultsDir.

    Otherwise the pickled statistics are loaded and histograms of the
    three quantities are saved to figureDir as "LocationEnt.eps",
    "OrientEnt.eps" and "DetectionRanges.eps".

    NOTE(review): the plotting is assumed to belong to the load branch
    only - confirm against the original layout of this function.
    """
    logging.info("Computing tree stats")
    resultsFileName = resultsDir + "InfectGrowthTreeStats.pkl"

    if saveResults:
        statsDictList = []

        for j in range(len(subgraphIndicesList2)):
            Util.printIteration(j, 1, len(subgraphIndicesList2))
            subgraphIndices = subgraphIndicesList2[j]
            subgraph = sGraph.subgraph(subgraphIndices)
            logging.info("Finding trees")
            trees = subgraph.findTrees()
            logging.info("Computing tree statistics")

            locationEntropy = []
            orientEntropy = []
            detectionRanges = []

            for tree in trees:
                # Trees consisting of a single vertex are ignored
                if len(tree) > 1:
                    treeGraph = subgraph.subgraph(tree)
                    vertexArray = treeGraph.getVertexList().getVertices(list(range(treeGraph.getNumVertices())))

                    locationEntropy.append(Util.entropy(vertexArray[:, locationIndex]))
                    orientEntropy.append(Util.entropy(vertexArray[:, orientationIndex]))

                    detections = vertexArray[:, detectionIndex]
                    detectionRanges.append(numpy.max(detections) - numpy.min(detections))

            statsDict = {}
            statsDict["locationEnt"] = numpy.array(locationEntropy)
            statsDict["orientEnt"] = numpy.array(orientEntropy)
            statsDict["detectRanges"] = numpy.array(detectionRanges)
            statsDictList.append(statsDict)

        Util.savePickle(statsDictList, resultsFileName, True)
    else:
        statsDictList = Util.loadPickle(resultsFileName)
        locBins = numpy.arange(0, 2.4, 0.2)
        detectBins = numpy.arange(0, 6500, 500)
        locationEntDists = []
        orientEntDists = []
        detectionDists = []

        # Set before the loop so it is defined even when dayList2 is empty
        plotInd2 = plotInd

        for j in range(0, len(dayList2)):
            dateStr = (str(DateUtils.getDateStrFromDay(dayList2[j], startYear)))
            logging.info(dateStr)
            statsDict = statsDictList[j]

            locationEntDists.append(statsDict["locationEnt"])
            orientEntDists.append(statsDict["orientEnt"])
            detectionDists.append(statsDict["detectRanges"])

        plt.figure(plotInd2)
        plt.hist(locationEntDists, locBins, normed=True)
        plt.xlabel("Location Entropy")
        plt.ylabel("Probability Density")
        plt.savefig(figureDir + "LocationEnt" + ".eps")
        plotInd2 += 1

        plt.figure(plotInd2)
        plt.hist(orientEntDists, normed=True)
        plt.xlabel("Orientation Entropy")
        plt.ylabel("Probability Density")
        plt.savefig(figureDir + "OrientEnt" + ".eps")
        plotInd2 += 1

        plt.figure(plotInd2)
        plt.hist(detectionDists, detectBins, normed=True)
        plt.xlabel("Detection Range (days)")
        plt.ylabel("Probability Density")
        plt.savefig(figureDir + "DetectionRanges" + ".eps")
        plotInd2 += 1
def evaluateCvOuter(self, X, Y, folds, leafRank, innerFolds=3):
    """
    Run model selection and output some ROC curves. In this case Y is a 1D
    array.

    For each outer stratified fold, an inner stratified cross validation
    is run on the training part for every combination of varSplit, nfcv
    and minSplit. The tree is learnt once per inner fold at the largest
    depth and then cut back to each smaller depth in turn, recording the
    AUC at each depth. The parameters with the best mean inner AUC are
    used to learn on the whole training part, which is then evaluated on
    both the training and the test part.

    :param X: the examples, a numpy.ndarray with one row per example.

    :param Y: the labels, a 1D numpy.ndarray.

    :param folds: the number of outer folds, at least 2.

    :param leafRank: the leaf rank setting passed to setLeafRank.

    :param innerFolds: the number of folds of the inner cross validation.

    :return: (bestParams, allMetrics, bestMetaDicts) where bestParams has
        one (maxDepth, varSplit, nfcv, minSplit) tuple per outer fold,
        allMetrics is [bestTrainAUCs, bestTrainROCs, bestTestAUCs,
        bestTestROCs] and bestMetaDicts has one dict per outer fold with
        keys "size" and "depth".

    :raises ValueError: if Y is not 1D.
    """
    Parameter.checkClass(X, numpy.ndarray)
    Parameter.checkClass(Y, numpy.ndarray)
    Parameter.checkInt(folds, 2, float('inf'))
    if Y.ndim != 1:
        raise ValueError("Expecting Y to be 1D")

    indexList = cross_val.StratifiedKFold(Y, folds)
    # Depths in decreasing order so that a tree can be cut back repeatedly
    maxDepths = numpy.flipud(numpy.arange(1, 12, 1))
    if leafRank == self.getTreeRankLib().LRforest:
        varSplits = numpy.arange(0.6, 1.01, 0.2)
    else:
        varSplits = numpy.array([1])
    #According to Nicolas nfcv>1 doesn't help
    nfcvs = [1]
    #If minsplit is too low sometimes get a node with no positive labels
    minSplits = numpy.array([50])

    self.setLeafRank(leafRank)

    bestParams = []
    bestTrainAUCs = numpy.zeros(folds)
    bestTrainROCs = []
    bestTestAUCs = numpy.zeros(folds)
    bestTestROCs = []
    bestMetaDicts = []

    for i, (trainInds, testInds) in enumerate(indexList):
        trainX, trainY = X[trainInds, :], Y[trainInds]
        testX, testY = X[testInds, :], Y[testInds]

        meanParamAUCs = []
        paramList = []

        logging.debug("Distribution of labels in train: " + str(numpy.bincount(trainY)))
        logging.debug("Distribution of labels in test: " + str(numpy.bincount(testY)))

        for varSplit in varSplits:
            for nfcv in nfcvs:
                for minSplit in minSplits:
                    self.setMaxDepth(maxDepths[0])
                    self.setVarSplit(varSplit)
                    self.setNfcv(nfcv)
                    self.setMinSplit(minSplit)
                    logging.debug(self)

                    idx = cross_val.StratifiedKFold(trainY, innerFolds)
                    metrics = numpy.zeros((len(idx), maxDepths.shape[0]))

                    for j, (idxtr, idxts) in enumerate(idx):
                        Util.printIteration(j, 1, innerFolds)

                        innerTrainX, innerTestX = trainX[idxtr, :], trainX[idxts, :]
                        innerTrainY, innerTestY = trainY[idxtr], trainY[idxts]

                        self.learnModel(innerTrainX, innerTrainY)

                        # self.tree is replaced at every depth, so each cut
                        # is applied to the result of the previous one
                        for k in range(maxDepths.shape[0]):
                            maxDepth = maxDepths[k]

                            robjects.globalenv["maxDepth"] = maxDepth
                            robjects.globalenv["tree"] = self.tree
                            nodeList = robjects.r('tree$nodes[tree$depth>=maxDepth]')
                            self.tree = self.treeRankLib.subTreeRank(self.tree, nodeList)

                            predY = self.predict(innerTestX)
                            gc.collect()

                            metrics[j, k] = Evaluator.auc(predY, innerTestY)

                    meanAUC = numpy.mean(metrics, 0)
                    varAUC = numpy.var(metrics, 0)
                    logging.warning(self.baseLib.warnings())
                    logging.debug("Mean AUCs and variances at each depth " + str((meanAUC, varAUC)))

                    for k in range(maxDepths.shape[0]):
                        maxDepth = maxDepths[k]
                        meanParamAUCs.append(meanAUC[k])
                        paramList.append((maxDepth, varSplit, nfcv, minSplit))

                    #Try to get some memory back
                    gc.collect()
                    robjects.r('gc(verbose=TRUE)')
                    robjects.r('memory.profile()')

        #Now choose best params
        bestInd = numpy.argmax(numpy.array(meanParamAUCs))
        bestParam = paramList[bestInd]

        self.setMaxDepth(bestParam[0])
        self.setVarSplit(bestParam[1])
        self.setNfcv(bestParam[2])
        self.setMinSplit(bestParam[3])

        self.learnModel(trainX, trainY)
        predTrainY = self.predict(trainX)
        predTestY = self.predict(testX)

        bestTrainAUCs[i] = Evaluator.auc(predTrainY, trainY)
        bestTestAUCs[i] = Evaluator.auc(predTestY, testY)

        #Store the parameters and ROC curves
        bestParams.append(bestParam)
        bestTrainROCs.append(Evaluator.roc(trainY, predTrainY))
        bestTestROCs.append(Evaluator.roc(testY, predTestY))

        metaDict = {}
        metaDict["size"] = self.getTreeSize()
        metaDict["depth"] = self.getTreeDepth()
        bestMetaDicts.append(metaDict)

    allMetrics = [bestTrainAUCs, bestTrainROCs, bestTestAUCs, bestTestROCs]

    return (bestParams, allMetrics, bestMetaDicts)
def modelSelect2(self, X, rhos, ks, cvInds, colProbs=None):
    """
    Pick values of rho and k based on a single matrix X using cross
    validation, and set them on this object. In this case a few nonzeros
    are removed from each row to form the test set.

    The train/test matrices come from Sampling.shuffleSplitRows using
    self.folds and self.validationSize. For every split and every k in ks
    a copy of this learner is evaluated over all of rhos. The pair
    (rho, k) with the lowest mean metric is selected for "mse", and the
    pair with the highest mean metric for "f1" and "mrr".

    :param X: the matrix to split into train and test matrices.

    :param rhos: array of rho values, which must be in descending order
        (warm restarts are used).

    :param ks: array of values of k to try.

    :param cvInds: only its length is used here: it sizes the fold axis
        of the metrics array and is passed to the progress output.
        NOTE(review): the number of splits comes from self.folds, so
        len(cvInds) should presumably equal it - confirm with callers.

    :param colProbs: passed through to Sampling.shuffleSplitRows.

    :return: (meanMetrics, stdMetrics), each of shape
        (rhos.shape[0], ks.shape[0]), taken over the folds.

    :raises ValueError: if rhos is not in descending order or self.metric
        is not one of "mse", "f1", "mrr".
    """
    # Any element out of place means rhos is not sorted in descending order
    if (numpy.flipud(numpy.sort(rhos)) != rhos).any():
        raise ValueError("rhos must be in descending order")

    trainTestXs = Sampling.shuffleSplitRows(X, self.folds, self.validationSize, csarray=False, rowMajor=False, colProbs=colProbs)
    metrics = numpy.zeros((rhos.shape[0], ks.shape[0], len(cvInds)))

    if self.metric == "mse":
        metricFuction = learnPredictMSE
    elif self.metric == "f1" or self.metric == "mrr":
        metricFuction = learnPredictRanking
    else:
        raise ValueError("Unknown metric: " + self.metric)

    paramList = []

    for i, (trainX, testX) in enumerate(trainTestXs):
        Util.printIteration(i, 1, len(cvInds), "Fold: ")

        for k in ks:
            learner = self.copy()
            learner.updateAlg = "initial"
            learner.setK(k)
            paramList.append((learner, trainX, testX, rhos))

    pool = None
    if self.numProcesses != 1:
        pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=10)
        resultsIter = pool.imap(metricFuction, paramList)
    else:
        # Lazy evaluation in the same order as paramList
        resultsIter = (metricFuction(params) for params in paramList)

    try:
        # Results arrive fold by fold, and within a fold in the order of ks
        for i, _ in enumerate(trainTestXs):
            for m in range(ks.shape[0]):
                metrics[:, m, i] = next(resultsIter)
    finally:
        # Release the workers even if a task fails
        if pool is not None:
            pool.terminate()

    meanMetrics = metrics.mean(2)
    stdMetrics = metrics.std(2)

    logging.debug("ks=" + str(ks))
    logging.debug("rhos=" + str(rhos))
    logging.debug(meanMetrics)

    #Set the parameters: errors are minimised, ranking metrics maximised
    if self.metric == "mse":
        bestFlatInd = numpy.argmin(meanMetrics)
    else:
        bestFlatInd = numpy.argmax(meanMetrics)

    rhoInd, kInd = numpy.unravel_index(bestFlatInd, meanMetrics.shape)
    self.setRho(rhos[rhoInd])
    self.setK(ks[kInd])

    logging.debug("Model parameters: k=" + str(self.k) + " rho=" + str(self.rho))

    return meanMetrics, stdMetrics
def runToyExp(datasetNames, sampleSizes, foldsSet, cvScalings, sampleMethods, numProcesses, fileNameSuffix):
    """
    Run the toy model selection experiment for every dataset and sampling
    method, and save the results.

    For each (dataset, sample method) pair whose output file neither
    exists nor is locked, the file is locked and for each realisation,
    sample size and number of folds a random subsample of the training
    data is drawn. On that subsample the ideal penalty grid, V-fold cross
    validation (method index 0) and V-fold penalisation (method indices 1
    onwards, the first using the BIC-style constant and the rest using
    cvScalings*(folds-1)) are run with an RBF SVM. Errors, selected
    parameters and summary grids are saved with numpy.savez. The lock is
    released afterwards, including when the computation raises.

    :param datasetNames: sequence of (datasetName, numRealisations) pairs.

    :param sampleSizes: array of sample sizes.

    :param foldsSet: array of numbers of folds.

    :param cvScalings: array of penalisation scalings.

    :param sampleMethods: sequence of (name, function) pairs. The function
        is called as function(folds, numExamples).

    :param numProcesses: passed to LibSVM as the number of processes.

    :param fileNameSuffix: suffix appended to each output file name.
    """
    dataDir = PathDefaults.getDataDir() + "modelPenalisation/toy/"
    outputDir = PathDefaults.getOutputDir() + "modelPenalisation/"

    svm = LibSVM()
    numCs = svm.getCs().shape[0]
    numGammas = svm.getGammas().shape[0]
    # One method for cross validation, one for BIC and one per scaling
    numMethods = 1+(1+cvScalings.shape[0])
    numParams = 2

    runIdeal = True
    runCv = True
    runVfpen = True

    for datasetEntry in datasetNames:
        datasetName = datasetEntry[0]
        numRealisations = datasetEntry[1]
        logging.debug("Learning using dataset " + datasetName)

        for sampleEntry in sampleMethods:
            sampleMethod = sampleEntry[1]
            outfileName = outputDir + datasetName + sampleEntry[0] + fileNameSuffix
            fileLock = FileLock(outfileName + ".npz")

            if fileLock.isLocked() or fileLock.fileExists():
                logging.debug("Results already computed")
                continue

            fileLock.lock()
            try:
                baseShape = (numRealisations, len(sampleSizes), foldsSet.shape[0])
                errors = numpy.zeros(baseShape + (numMethods,))
                params = numpy.zeros(baseShape + (numMethods, numParams))
                errorGrids = numpy.zeros(baseShape + (numMethods, numCs, numGammas))
                approxGrids = numpy.zeros(baseShape + (numMethods, numCs, numGammas))
                idealGrids = numpy.zeros(baseShape + (numCs, numGammas))

                data = numpy.load(dataDir + datasetName + ".npz")
                gridPoints, trainX, trainY, pdfX, pdfY1X, pdfYminus1X = data["arr_0"], data["arr_1"], data["arr_2"], data["arr_3"], data["arr_4"], data["arr_5"]

                #We form a test set from the grid points
                numGridPoints = gridPoints.shape[0]
                testX = numpy.zeros((numGridPoints**2, 2))
                for g in range(numGridPoints):
                    testX[g*numGridPoints:(g+1)*numGridPoints, 0] = gridPoints
                    testX[g*numGridPoints:(g+1)*numGridPoints, 1] = gridPoints[g]

                for j in range(numRealisations):
                    Util.printIteration(j, 1, numRealisations, "Realisation: ")

                    for k in range(sampleSizes.shape[0]):
                        sampleSize = sampleSizes[k]
                        for m in range(foldsSet.shape[0]):
                            folds = foldsSet[m]
                            logging.debug("Using sample size " + str(sampleSize) + " and " + str(folds) + " folds")
                            perm = numpy.random.permutation(trainX.shape[0])
                            trainInds = perm[0:sampleSize]
                            validX = trainX[trainInds, :]
                            validY = trainY[trainInds]

                            svm = LibSVM(processes=numProcesses)
                            #Find ideal penalties
                            if runIdeal:
                                logging.debug("Finding ideal grid of penalties")
                                idealGrids[j, k, m, :, :] = parallelPenaltyGridRbf(svm, validX, validY, testX, gridPoints, pdfX, pdfY1X, pdfYminus1X)

                            #Cross validation
                            if runCv:
                                logging.debug("Running V-fold cross validation")
                                methodInd = 0
                                idx = sampleMethod(folds, validY.shape[0])
                                bootstrap = sampleMethod == Sampling.bootstrap

                                bestSVM, cvGrid = svm.parallelVfcvRbf(validX, validY, idx, True, bootstrap)
                                predY, decisionsY = bestSVM.predict(testX, True)
                                decisionGrid = numpy.reshape(decisionsY, (numGridPoints, numGridPoints), order="F")
                                errors[j, k, m, methodInd] = ModelSelectUtils.bayesError(gridPoints, decisionGrid, pdfX, pdfY1X, pdfYminus1X)
                                params[j, k, m, methodInd, :] = numpy.array([bestSVM.getC(), bestSVM.getKernelParams()])
                                errorGrids[j, k, m, methodInd, :, :] = cvGrid

                            #v fold penalisation
                            if runVfpen:
                                logging.debug("Running penalisation")
                                #BIC penalisation
                                Cv = float((folds-1) * numpy.log(validX.shape[0])/2)
                                tempCvScalings = cvScalings*(folds-1)
                                tempCvScalings = numpy.insert(tempCvScalings, 0, Cv)

                                #Use cross validation
                                idx = sampleMethod(folds, validY.shape[0])
                                svmGridResults = svm.parallelVfPenRbf(validX, validY, idx, tempCvScalings)

                                for n in range(len(tempCvScalings)):
                                    bestSVM, trainErrors, approxGrid = svmGridResults[n]
                                    # Index 0 is taken by cross validation
                                    methodInd = n+1
                                    predY, decisionsY = bestSVM.predict(testX, True)
                                    decisionGrid = numpy.reshape(decisionsY, (numGridPoints, numGridPoints), order="F")
                                    errors[j, k, m, methodInd] = ModelSelectUtils.bayesError(gridPoints, decisionGrid, pdfX, pdfY1X, pdfYminus1X)
                                    params[j, k, m, methodInd, :] = numpy.array([bestSVM.getC(), bestSVM.getKernelParams()])
                                    errorGrids[j, k, m, methodInd, :, :] = trainErrors + approxGrid
                                    approxGrids[j, k, m, methodInd, :, :] = approxGrid

                meanErrors = numpy.mean(errors, 0)
                print(meanErrors)

                meanParams = numpy.mean(params, 0)
                print(meanParams)

                meanErrorGrids = numpy.mean(errorGrids, 0)
                stdErrorGrids = numpy.std(errorGrids, 0)

                meanIdealGrids = numpy.mean(idealGrids, 0)
                stdIdealGrids = numpy.std(idealGrids, 0)

                meanApproxGrids = numpy.mean(approxGrids, 0)
                stdApproxGrids = numpy.std(approxGrids, 0)

                numpy.savez(outfileName, errors, params, meanErrorGrids, stdErrorGrids, meanIdealGrids, stdIdealGrids, meanApproxGrids, stdApproxGrids)
                logging.debug("Saved results as file " + outfileName + ".npz")
            finally:
                # Never leave a stale lock behind, even on failure
                fileLock.unlock()

    logging.debug("All done!")
def celf(graph, k, numRuns=100, p=0.5, verbose=False):
    """
    Maximising the influence using the CELF algorithm of Leskovec et al.

    The influence of every single vertex is first estimated as the mean of
    numRuns calls to MaxInfluence.simulateAllCascades, and the best vertex
    is picked. Each further vertex is found lazily: the candidate at the
    top of the heap is re-evaluated with MaxInfluence.simulateCascades and
    pushed back, until the top of the heap is a candidate that has already
    been re-evaluated in the current round.

    :param graph: a graph providing vcount().

    :param k: the number of vertices to pick, capped at graph.vcount().

    :param numRuns: the number of cascade simulations per estimate.

    :param p: passed to the cascade simulations.

    :param verbose: if True also return the influence scores.

    :return: the list of picked vertex indices in the order they were
        picked, and with verbose=True a pair of that list and the list of
        total influence scores after each pick. Both are empty when k is
        not positive or the graph has no vertices.
    """
    numVertices = graph.vcount()
    k = min(numVertices, k)

    influenceList = []
    influenceScores = []

    # Nothing to pick: avoid popping from an empty heap
    if k <= 0:
        if verbose:
            return influenceList, influenceScores
        return influenceList

    influenceSet = set()

    #For the initial values we compute marginal increases with respect to the empty set
    influences = numpy.zeros(numVertices)
    for _ in range(numRuns):
        influences += MaxInfluence.simulateAllCascades(graph, [], p=p)
    influences /= float(numRuns)
    logging.debug("Simulated initial cascades")

    #Note that we store the negation of the influence since heappop chooses the smallest value
    negMarginalIncreases = [(-influences[vertexInd], vertexInd) for vertexInd in range(numVertices)]
    heapq.heapify(negMarginalIncreases)

    negLastInfluence, bestVertexInd = heapq.heappop(negMarginalIncreases)
    influenceSet.add(bestVertexInd)
    influenceList.append(bestVertexInd)
    influenceScores.append(-negLastInfluence)
    logging.debug("Picking additional vertices")

    for i in range(1, k):
        Util.printIteration(i-1, 1, k-1)
        # Marks the vertices re-evaluated against the current influence set
        valid = numpy.zeros(numVertices, bool)
        negMarginalInfluence, currentBestVertexInd = heapq.heappop(negMarginalIncreases)

        numEvaluations = 0

        # valid starts all False, so this loop runs at least once
        while not valid[currentBestVertexInd]:
            marginalInfluence = MaxInfluence.simulateCascades(graph, influenceSet.union([currentBestVertexInd]), numRuns, p)
            # negLastInfluence is negated, so this subtracts the last total
            marginalInfluence += negLastInfluence

            #Note that we store the negation of the influence since heappop chooses the smallest value
            heapq.heappush(negMarginalIncreases, (-marginalInfluence, currentBestVertexInd))
            valid[currentBestVertexInd] = True

            negMarginalInfluence, currentBestVertexInd = heapq.heappop(negMarginalIncreases)
            numEvaluations += 1

        totalInfluence = -(negMarginalInfluence + negLastInfluence)
        logging.debug("Required " + str(numEvaluations) + " evaluations to find influential vertex")

        negLastInfluence = -totalInfluence
        influenceSet.add(currentBestVertexInd)
        influenceList.append(currentBestVertexInd)
        influenceScores.append(-negLastInfluence)

    if verbose:
        return influenceList, influenceScores
    else:
        return influenceList
def modelSelect(self, X, rhos, ks, cvInds):
    """
    Pick values of rho and k based on a single matrix X using cross
    validation, and set them on this object.

    For every fold in cvInds and every k in ks a copy of this learner is
    evaluated over all of rhos by the metric function. The pair (rho, k)
    with the lowest mean metric is selected for "mse", and the pair with
    the highest mean metric for "f1" and "mrr".

    :param X: the matrix from which train and test submatrices are taken.

    :param rhos: array of rho values, which must be in descending order
        (warm restarts are used).

    :param ks: array of values of k to try.

    :param cvInds: sequence of (trainInds, testInds) pairs, one per fold.

    :return: (meanMetrics, stdMetrics), each of shape
        (rhos.shape[0], ks.shape[0]), taken over the folds.

    :raises ValueError: if rhos is not in descending order or self.metric
        is not one of "mse", "f1", "mrr".
    """
    # Any element out of place means rhos is not sorted in descending order
    if (numpy.flipud(numpy.sort(rhos)) != rhos).any():
        raise ValueError("rhos must be in descending order")

    errors = numpy.zeros((rhos.shape[0], ks.shape[0], len(cvInds)))

    if self.metric == "mse":
        metricFuction = learnPredictMSE
    elif self.metric == "f1" or self.metric == "mrr":
        metricFuction = learnPredictRanking
    else:
        raise ValueError("Unknown metric: " + self.metric)

    for i, (trainInds, testInds) in enumerate(cvInds):
        Util.printIteration(i, 1, len(cvInds), "Fold: ")
        trainX = SparseUtils.submatrix(X, trainInds)
        testX = SparseUtils.submatrix(X, testInds)

        assert trainX.nnz == trainInds.shape[0]
        assert testX.nnz == testInds.shape[0]

        paramList = []
        for k in ks:
            learner = self.copy()
            learner.updateAlg = "initial"
            learner.setK(k)
            paramList.append((learner, trainX, testX, rhos))

        if self.numProcesses != 1:
            # Integer division, and at least one worker on a single core
            numWorkers = max(1, multiprocessing.cpu_count()//2)
            pool = multiprocessing.Pool(processes=numWorkers, maxtasksperchild=10)
            try:
                for m, rhoErrors in enumerate(pool.imap(metricFuction, paramList)):
                    errors[:, m, i] = rhoErrors
            finally:
                # Release the workers even if a task fails
                pool.terminate()
        else:
            for m, params in enumerate(paramList):
                errors[:, m, i] = metricFuction(params)

    meanMetrics = errors.mean(2)
    stdMetrics = errors.std(2)

    logging.debug(meanMetrics)

    #Set the parameters: errors are minimised, ranking metrics maximised
    if self.metric == "mse":
        bestFlatInd = numpy.argmin(meanMetrics)
    else:
        bestFlatInd = numpy.argmax(meanMetrics)

    rhoInd, kInd = numpy.unravel_index(bestFlatInd, meanMetrics.shape)
    self.setRho(rhos[rhoInd])
    self.setK(ks[kInd])

    logging.debug("Model parameters: k=" + str(self.k) + " rho=" + str(self.rho))

    return meanMetrics, stdMetrics
# numRepetitions = 2 do_Nings = True clustErrApprox = numpy.zeros((ps.shape[0], numGraphs, numRepetitions, len(k2s))) clustErrExact = numpy.zeros((ps.shape[0], numGraphs, numRepetitions)) clustErrNings = numpy.zeros((ps.shape[0], numGraphs, numRepetitions)) clustErrNystrom = numpy.zeros((ps.shape[0], numGraphs, numRepetitions, len(k3s))) clustErrRandSvd = numpy.zeros((ps.shape[0], numGraphs, numRepetitions, len(k4s))) sinThetaApprox = numpy.zeros((ps.shape[0], numGraphs, numRepetitions, len(k2s))) sinThetaExact = numpy.zeros((ps.shape[0], numGraphs, numRepetitions)) sinThetaNings = numpy.zeros((ps.shape[0], numGraphs, numRepetitions)) sinThetaNystrom = numpy.zeros((ps.shape[0], numGraphs, numRepetitions, len(k3s))) sinThetaRandSvd = numpy.zeros((ps.shape[0], numGraphs, numRepetitions, len(k4s))) for r in range(numRepetitions): Util.printIteration(r, 1, numRepetitions) for t in range(ps.shape[0]): logging.info("Run " + str(r) + " p " + str(ps[t])) p = ps[t] logging.debug("Running exact method") graphIterator = ThreeClustIterator(p, numClusters, r).getIterator() resExact = exactClusterer.clusterFromIterator(graphIterator, True) logging.debug("Running approximate method") resApproxList = [] for i in range(len(k2s)): graphIterator = ThreeClustIterator(p, numClusters, r).getIterator() resApproxList.append(iascClusterers[i].clusterFromIterator(graphIterator, True))