def __init__(self, field):
    numpy.random.seed(21)
    dataDir = PathDefaults.getDataDir() + "dblp/"
    self.xmlFileName = dataDir + "dblp.xml"
    self.xmlCleanFilename = dataDir + "dblpClean.xml"

    resultsDir = PathDefaults.getDataDir() + "reputation/" + field + "/"
    self.expertsFileName = resultsDir + "experts.txt"
    self.expertMatchesFilename = resultsDir + "experts_matches.csv"
    self.trainExpertMatchesFilename = resultsDir + "experts_train_matches.csv"
    self.testExpertMatchesFilename = resultsDir + "experts_test_matches.csv"
    self.coauthorsFilename = resultsDir + "coauthors.csv"
    self.publicationsFilename = resultsDir + "publications.csv"

    self.stepSize = 100000
    self.numLines = 33532888
    self.publicationTypes = set(["article", "inproceedings", "proceedings", "book", "incollection", "phdthesis", "mastersthesis", "www"])
    self.p = 0.5
    self.matchCutoff = 0.95

    self.cleanXML()
    self.matchExperts()
    logging.warning("Now you must disambiguate the matched experts if not already done")
def __init__(self, maxIter=None, iterStartTimeStamp=None):
    outputDir = PathDefaults.getOutputDir() + "recommend/erasm/"

    if not os.path.exists(outputDir):
        os.mkdir(outputDir)

    # iterStartTimeStamp is the starting date of the iterator
    if iterStartTimeStamp is not None:
        self.iterStartTimeStamp = iterStartTimeStamp
    else:
        self.iterStartTimeStamp = 1286229600

    self.timeStep = timedelta(30).total_seconds()

    self.ratingFileName = outputDir + "data.npz"
    self.userDictFileName = outputDir + "userIdDict.pkl"
    self.groupDictFileName = outputDir + "groupIdDict.pkl"
    self.isTrainRatingsFileName = outputDir + "is_train.npz"

    self.dataDir = PathDefaults.getDataDir() + "erasm/"
    self.dataFileName = self.dataDir + "groupMembers-29-11-12"

    self.maxIter = maxIter
    self.trainSplit = 4.0/5

    self.processRatings()
    self.splitDataset()
    self.loadProcessedData()
def processSimpleDataset(name, numRealisations, split, ext=".csv", delimiter=",", usecols=None, skiprows=1, converters=None):
    numpy.random.seed(21)
    dataDir = PathDefaults.getDataDir() + "modelPenalisation/regression/"
    fileName = dataDir + name + ext
    print("Loading data from file " + fileName)
    outputDir = PathDefaults.getDataDir() + "modelPenalisation/regression/" + name + "/"

    XY = numpy.loadtxt(fileName, delimiter=delimiter, skiprows=skiprows, usecols=usecols, converters=converters)
    X = XY[:, :-1]
    y = XY[:, -1]

    idx = Sampling.shuffleSplit(numRealisations, X.shape[0], split)
    preprocessSave(X, y, outputDir, idx)
def testGenerateRandomGraph(self):
    egoFileName = PathDefaults.getDataDir() + "infoDiffusion/EgoData.csv"
    alterFileName = PathDefaults.getDataDir() + "infoDiffusion/AlterData.csv"
    numVertices = 1000
    infoProb = 0.1

    p = 0.1
    neighbours = 10
    generator = SmallWorldGenerator(p, neighbours)
    graph = SparseGraph(VertexList(numVertices, 0))
    graph = generator.generate(graph)

    self.svmEgoSimulator.generateRandomGraph(egoFileName, alterFileName, infoProb, graph)
def saveRatingMatrix():
    """
    Take the coauthor graph above and make vertices indexed from 0 then save
    as matrix market format.
    """
    edgeFileName = PathDefaults.getOutputDir() + "erasm/edges2.txt"

    logging.debug("Reading edge list")
    edges = numpy.loadtxt(edgeFileName, delimiter=",", dtype=numpy.int)
    logging.debug("Total number of edges: " + str(edges.shape[0]))

    vertexIdDict = {}
    vertexIdSet = set([])

    i = 0
    for edge in edges:
        if edge[0] not in vertexIdSet:
            vertexIdDict[edge[0]] = i
            vertexIdSet.add(edge[0])
            i += 1

        if edge[1] not in vertexIdSet:
            vertexIdDict[edge[1]] = i
            vertexIdSet.add(edge[1])
            i += 1

    n = len(vertexIdDict)
    R = scipy.sparse.lil_matrix((n, n))
    logging.debug("Creating sparse matrix")

    for edge in edges:
        R[vertexIdDict[edge[0]], vertexIdDict[edge[1]]] += 1
        R[vertexIdDict[edge[1]], vertexIdDict[edge[0]]] += 1

    logging.debug("Created matrix " + str(R.shape) + " with " + str(R.getnnz()) + " non zeros")

    R = R.tocsr()

    minCoauthors = 20
    logging.debug("Removing vertices with <" + str(minCoauthors) + " coauthors")
    nonzeros = R.nonzero()
    # Keep the rows (and matching columns) with at least minCoauthors nonzero
    # entries; the boolean mask must have one entry per row of R.
    rowCounts = numpy.bincount(nonzeros[0], minlength=R.shape[0])
    inds = numpy.arange(R.shape[0])[rowCounts >= minCoauthors]
    R = R[inds, :][:, inds]
    logging.debug("Matrix has shape " + str(R.shape) + " with " + str(R.getnnz()) + " non zeros")

    matrixFileName = PathDefaults.getOutputDir() + "erasm/R"
    scipy.io.mmwrite(matrixFileName, R)
    logging.debug("Wrote matrix to file " + matrixFileName)
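# Minimal sketch of reading the saved matrix back (an assumption: it presumes
# saveRatingMatrix() has already been run; scipy.io.mmwrite appends the ".mtx"
# extension when it is missing from the file name).
import scipy.io
from apgl.util.PathDefaults import PathDefaults

matrixFileName = PathDefaults.getOutputDir() + "erasm/R"
R = scipy.io.mmread(matrixFileName + ".mtx").tocsr()
print(R.shape, R.getnnz())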
def __init__(self, trainXIteratorFunc, testXIteratorFunc, cmdLine=None, defaultAlgoArgs=None, dirName=""):
    """
    Priority for default args:
    - highest priority: command-line value
    - middle priority: set-by-function value
    - lowest priority: class value
    """
    # Parameters to choose which methods to run
    # Obtained by merging the default parameters from the class with those from the user
    self.algoArgs = RecommendExpHelper.newAlgoParams(defaultAlgoArgs)

    # Functions to return iterators to the training and test matrices
    self.trainXIteratorFunc = trainXIteratorFunc
    self.testXIteratorFunc = testXIteratorFunc

    # How often to print output
    self.logStep = 10

    # The max number of observations to use for model selection
    self.sampleSize = 5*10**6

    # Basic results directory
    self.resultsDir = PathDefaults.getOutputDir() + "recommend/" + dirName + "/"

    # Update algoParams from the command line
    self.readAlgoParams(cmdLine)
def main(argv=None):
    if argv is None:
        argv = sys.argv
    try:
        # read options
        try:
            opts, args = getopt.getopt(argv[1:], "hd:n:D", ["help", "dir=", "nb_user=", "debug"])
        except getopt.error as msg:
            raise RGUsage(msg)
        # apply options
        dir = PathDefaults.getDataDir() + "cluster/"
        nb_user = None
        log_level = logging.INFO
        for o, a in opts:
            if o in ("-h", "--help"):
                print(__doc__)
                return 0
            elif o in ("-d", "--dir"):
                dir = a
            elif o in ("-n", "--nb_user"):
                nb_user = int(a)
            elif o in ("-D", "--debug"):
                log_level = logging.DEBUG
        logging.basicConfig(stream=sys.stdout, level=log_level, format='%(levelname)s (%(asctime)s):%(message)s')
        # process: generate data files
        BemolData.generate_data_file(dir, nb_user)
    except RGUsage as err:
        logging.error(err.msg)
        logging.error("for help use --help")
        return 2
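# Hypothetical command-line usage sketch (the script name is an assumption;
# the options mirror the getopt specification in main() above):
#   python <this_script>.py --dir /path/to/cluster/ --nb_user 1000 --debug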
def testLoadParams(self):
    try:
        lmbda = 0.01
        alterRegressor = PrimalRidgeRegression(lmbda)
        egoRegressor = PrimalRidgeRegression(lmbda)
        predictor = EgoEdgeLabelPredictor(alterRegressor, egoRegressor)

        params = [0.1, 0.2]
        paramFuncs = [egoRegressor.setLambda, alterRegressor.setLambda]
        fileName = PathDefaults.getTempDir() + "tempParams.pkl"

        predictor.saveParams(params, paramFuncs, fileName)
        params2 = predictor.loadParams(fileName)

        self.assertTrue(params2[0][0] == "apgl.predictors.PrimalRidgeRegression")
        self.assertTrue(params2[0][1] == "setLambda")
        self.assertTrue(params2[0][2] == 0.1)

        self.assertTrue(params2[1][0] == "apgl.predictors.PrimalRidgeRegression")
        self.assertTrue(params2[1][1] == "setLambda")
        self.assertTrue(params2[1][2] == 0.2)
    except IOError as e:
        logging.warn(e)
def testWriteToFile3(self):
    """
    Test writing out some random graphs in Pajek format.
    """
    numVertices = 20
    numFeatures = 0

    vList = VertexList(numVertices, numFeatures)
    graph = SparseGraph(vList)

    p = 0.1
    generator = ErdosRenyiGenerator(p)
    graph = generator.generate(graph)

    pw = PajekWriter()
    directory = PathDefaults.getOutputDir() + "test/"
    pw.writeToFile(directory + "erdosRenyi20", graph)

    # Now write a small-world graph
    p = 0.2
    k = 3

    graph.removeAllEdges()
    generator = SmallWorldGenerator(p, k)
    graph = generator.generate(graph)

    pw.writeToFile(directory + "smallWorld20", graph)
def testToyData(self):
    dataDir = PathDefaults.getDataDir() + "modelPenalisation/toy/"
    data = numpy.load(dataDir + "toyData.npz")
    gridPoints, X, y, pdfX, pdfY1X, pdfYminus1X = data["arr_0"], data["arr_1"], data["arr_2"], data["arr_3"], data["arr_4"], data["arr_5"]

    # Check that p(x) and the joint density summed over y both integrate to
    # approximately 1 over the grid, using the mean of each grid square.
    pxSum = 0
    pY1XSum = 0
    pYminus1XSum = 0
    px2Sum = 0

    squareArea = (gridPoints[1]-gridPoints[0])**2

    for i in range(gridPoints.shape[0]-1):
        for j in range(gridPoints.shape[0]-1):
            px = (pdfX[i,j]+pdfX[i+1,j]+pdfX[i, j+1]+pdfX[i+1, j+1])/4
            pxSum += px*squareArea

            pY1X = (pdfY1X[i,j]+pdfY1X[i+1,j]+pdfY1X[i, j+1]+pdfY1X[i+1, j+1])/4
            pY1XSum += pY1X*squareArea

            pYminus1X = (pdfYminus1X[i,j]+pdfYminus1X[i+1,j]+pdfYminus1X[i, j+1]+pdfYminus1X[i+1, j+1])/4
            pYminus1XSum += pYminus1X*squareArea

            px2Sum += px*pY1X*squareArea + px*pYminus1X*squareArea

    self.assertAlmostEquals(pxSum, 1)
    print(pY1XSum)
    print(pYminus1XSum)
    self.assertAlmostEquals(px2Sum, 1)
def testPredict2(self):
    # Test on the Gauss2D dataset
    dataDir = PathDefaults.getDataDir()

    fileName = dataDir + "Gauss2D_learn.csv"
    XY = numpy.loadtxt(fileName, skiprows=1, usecols=(1,2,3), delimiter=",")
    X = XY[:, 0:2]
    y = XY[:, 2]

    fileName = dataDir + "Gauss2D_test.csv"
    testXY = numpy.loadtxt(fileName, skiprows=1, usecols=(1,2,3), delimiter=",")
    testX = testXY[:, 0:2]
    testY = testXY[:, 2]

    X = Standardiser().standardiseArray(X)
    testX = Standardiser().standardiseArray(testX)

    maxDepths = range(3, 10)
    trainAucs = numpy.array([0.7194734, 0.7284824, 0.7332185, 0.7348198, 0.7366152, 0.7367508, 0.7367508, 0.7367508])
    testAucs = numpy.array([0.6789078, 0.6844632, 0.6867918, 0.6873420, 0.6874820, 0.6874400, 0.6874400, 0.6874400])

    i = 0
    # The results are approximately the same, but not exactly
    for maxDepth in maxDepths:
        treeRank = TreeRank(self.leafRanklearner)
        treeRank.setMaxDepth(maxDepth)
        treeRank.learnModel(X, y)

        trainScores = treeRank.predict(X)
        testScores = treeRank.predict(testX)

        self.assertAlmostEquals(Evaluator.auc(trainScores, y), trainAucs[i], 2)
        self.assertAlmostEquals(Evaluator.auc(testScores, testY), testAucs[i], 1)
        i += 1
def testEstimate(self):
    # Set up a simple model based on the normal distribution. Note that
    # abcMetrics, theta and createNormalModel are assumed to be defined in the
    # surrounding test fixture (e.g. setUp) of the original test case.
    abcParams = ABCParameters()
    epsilonArray = numpy.array([0.5, 0.2, 0.1])
    posteriorSampleSize = 20

    # Get an empirical estimate of Sprime
    model = NormalModel(abcMetrics)
    model.setMu(theta[0])
    model.setSigma(theta[1])
    Sprime = abcMetrics.summary(model.simulate())
    logging.debug(("Real summary statistic: " + str(Sprime)))

    thetaDir = PathDefaults.getTempDir()

    abcSMC = ABCSMC(epsilonArray, createNormalModel, abcParams, thetaDir)
    abcSMC.maxRuns = 100000
    abcSMC.setPosteriorSampleSize(posteriorSampleSize)
    thetasArray = abcSMC.run()
    thetasArray = numpy.array(thetasArray)

    meanTheta = numpy.mean(thetasArray, 0)
    logging.debug((thetasArray.shape))
    logging.debug(thetasArray)
    logging.debug(meanTheta)
    print(thetasArray.shape[0], posteriorSampleSize)

    # Note: only the mean needs to be similar
    self.assertTrue(thetasArray.shape[0] >= posteriorSampleSize)
    self.assertEquals(thetasArray.shape[1], 2)
    self.assertTrue(numpy.linalg.norm(theta[0] - meanTheta[0]) < 0.2)
def testGraphFromMatFile(self):
    matFileName = PathDefaults.getDataDir() + "infoDiffusion/EgoAlterTransmissions1000.mat"
    sGraph = EgoUtils.graphFromMatFile(matFileName)

    examplesList = ExamplesList.readFromMatFile(matFileName)
    numFeatures = examplesList.getDataFieldSize("X", 1)

    self.assertEquals(examplesList.getNumExamples(), sGraph.getNumEdges())
    self.assertEquals(examplesList.getNumExamples()*2, sGraph.getNumVertices())
    self.assertEquals(numFeatures/2+1, sGraph.getVertexList().getNumFeatures())

    # Every even vertex has information, odd does not
    for i in range(0, sGraph.getNumVertices()):
        vertex = sGraph.getVertex(i)
        if i % 2 == 0:
            self.assertEquals(vertex[sGraph.getVertexList().getNumFeatures()-1], 1)
        else:
            self.assertEquals(vertex[sGraph.getVertexList().getNumFeatures()-1], 0)

    # Test the first few vertices are the same
    for i in range(0, 10):
        vertex1 = sGraph.getVertex(i*2)[0:numFeatures/2]
        vertex2 = sGraph.getVertex(i*2+1)[0:numFeatures/2]
        vertexEx1 = examplesList.getSubDataField("X", numpy.array([i])).ravel()[0:numFeatures/2]
        vertexEx2 = examplesList.getSubDataField("X", numpy.array([i])).ravel()[numFeatures/2:numFeatures]
        self.assertTrue((vertex1 == vertexEx1).all())
        self.assertTrue((vertex2 == vertexEx2).all())
def testReadFromMatFile(self):
    numExamples = 10
    dir = PathDefaults.getTempDir()
    fileName = dir + "examplesList1"

    X = rand(numExamples, 10)
    ml = ExamplesList(numExamples)
    ml.addDataField("X", X)
    ml.writeToMatFile(fileName)

    ml2 = ExamplesList.readFromMatFile(fileName)
    self.assertTrue(ml == ml2)

    Y = rand(numExamples, 20)
    ml.addDataField("Y", Y)
    ml.writeToMatFile(fileName)

    ml2 = ExamplesList.readFromMatFile(fileName)
    self.assertTrue(ml == ml2)

    Z = rand(numExamples, 50)
    ml.addDataField("Z", Z)
    ml.writeToMatFile(fileName)

    ml2 = ExamplesList.readFromMatFile(fileName)
    self.assertTrue(ml == ml2)
def testComputeIdealPenalty(self):
    dataDir = PathDefaults.getDataDir() + "modelPenalisation/toy/"
    data = numpy.load(dataDir + "toyData.npz")
    gridPoints, X, y, pdfX, pdfY1X, pdfYminus1X = data["arr_0"], data["arr_1"], data["arr_2"], data["arr_3"], data["arr_4"], data["arr_5"]

    sampleSize = 100
    trainX, trainY = X[0:sampleSize, :], y[0:sampleSize]
    testX, testY = X[sampleSize:, :], y[sampleSize:]

    # We form a test set from the grid points
    fullX = numpy.zeros((gridPoints.shape[0]**2, 2))
    for m in range(gridPoints.shape[0]):
        fullX[m*gridPoints.shape[0]:(m+1)*gridPoints.shape[0], 0] = gridPoints
        fullX[m*gridPoints.shape[0]:(m+1)*gridPoints.shape[0], 1] = gridPoints[m]

    C = 1.0
    gamma = 1.0

    args = (trainX, trainY, fullX, C, gamma, gridPoints, pdfX, pdfY1X, pdfYminus1X)
    penalty = computeIdealPenalty(args)

    # Now compute the penalty using the data
    args = (trainX, trainY, testX, testY, C, gamma)
    penalty2 = computeIdealPenalty2(args)

    self.assertAlmostEquals(penalty2, penalty, 2)
def __init__(self):
    self.labelNames = ["Cortisol.val", "Testosterone.val", "IGF1.val"]
    self.dataDir = PathDefaults.getDataDir() + "metabolomic/"

    self.boundsDict = {}
    self.boundsDict["Cortisol"] = numpy.array([0, 89, 225, 573])
    self.boundsDict["Testosterone"] = numpy.array([0, 3, 9, 13])
    self.boundsDict["IGF1"] = numpy.array([0, 200, 441, 782])
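# Hypothetical sketch of how the bounds might be used to bin hormone
# concentrations into one-of-three indicator labels. This is an assumption for
# illustration only; the library's own createIndicatorLabel may differ.
import numpy

bounds = numpy.array([0, 89, 225, 573])   # e.g. boundsDict["Cortisol"]
values = numpy.array([50.0, 120.0, 400.0])

# digitize returns bin numbers 1..3 for values inside [0, 573); shift to 0-based
binIndices = numpy.digitize(values, bounds) - 1
indicators = numpy.zeros((values.shape[0], bounds.shape[0] - 1))
indicators[numpy.arange(values.shape[0]), binIndices] = 1
print(indicators)  # one row per value, with a single 1 marking its bin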
def getLsos(self):
    """
    Return a function to display R memory usage.
    """
    fileName = PathDefaults.getSourceDir() + "/apgl/metabolomics/R/Util.R"
    robjects.r["source"](fileName)
    return robjects.r['lsos']
def __init__(self, YList, X, featuresName, ages, args):
    super(MetabolomicsExpRunner, self).__init__(args=args)
    self.X = X
    self.YList = YList  # The list of concentrations
    self.featuresName = featuresName
    self.args = args
    self.ages = ages

    self.maxDepth = 10
    self.numTrees = 10
    self.sampleSize = 1.0
    self.sampleReplace = True
    self.folds = 5
    self.resultsDir = PathDefaults.getOutputDir() + "metabolomics/"

    self.leafRankGenerators = []
    self.leafRankGenerators.append((LinearSvmGS.generate(), "SVM"))
    self.leafRankGenerators.append((SvcGS.generate(), "RBF-SVM"))
    self.leafRankGenerators.append((DecisionTree.generate(), "CART"))

    self.pcaLeafRankGenerators = [(LinearSvmPca.generate(), "LinearSVM-PCA")]

    self.funcLeafRankGenerators = []
    self.funcLeafRankGenerators.append((LinearSvmFGs.generate, "SVMF"))
    self.funcLeafRankGenerators.append((SvcFGs.generate, "RBF-SVMF"))
    self.funcLeafRankGenerators.append((DecisionTreeF.generate, "CARTF"))

    # Store all the label vectors and their missing values
    YIgf1Inds, YICortisolInds, YTestoInds = MetabolomicsUtils.createIndicatorLabels(YList)
    self.hormoneInds = [YIgf1Inds, YICortisolInds, YTestoInds]
    self.hormoneNames = MetabolomicsUtils.getLabelNames()
def __init__(self, maxIter=None, iterStartTimeStamp=None):
    """
    Return a training and test set for movielens based on the time each
    rating was made.
    """
    self.timeStep = timedelta(30).total_seconds()

    # iterStartTimeStamp is the starting date of the iterator
    if iterStartTimeStamp is not None:
        self.iterStartTimeStamp = iterStartTimeStamp
    else:
        self.iterStartTimeStamp = 789652009

    outputDir = PathDefaults.getOutputDir() + "recommend/erasm/"

    self.numRatings = 402872
    self.minContacts = 10

    if not os.path.exists(outputDir):
        os.mkdir(outputDir)

    self.ratingFileName = outputDir + "data.npz"
    self.userDictFileName = outputDir + "userIdDict.pkl"
    self.isTrainRatingsFileName = outputDir + "is_train.npz"

    self.maxIter = maxIter
    self.trainSplit = 4.0 / 5

    self.processRatings()
    self.splitDataset()
    self.loadProcessedData()

    if self.maxIter is not None:
        logging.debug("Maximum number of iterations: " + str(self.maxIter))
def testEdgeFile(self):
    """
    Figure out the problem with the edge file.
    """
    dataDir = PathDefaults.getDataDir() + "cluster/"
    edgesFilename = dataDir + "Cit-HepTh.txt"

    edges = {}
    file = open(edgesFilename, 'r')
    file.readline()
    file.readline()
    file.readline()
    file.readline()

    vertices = {}

    for line in file:
        (vertex1, sep, vertex2) = line.partition("\t")
        vertex1 = vertex1.strip()
        vertex2 = vertex2.strip()
        edges[(vertex1, vertex2)] = 0
        vertices[vertex1] = 0
        vertices[vertex2] = 0

    # The paper says there are 352807 edges and 27770 vertices
    self.assertEquals(len(edges), 352807)
    self.assertEquals(len(vertices), 27770)
def __init__(self, maxIter=None, iterStartTimeStamp=None):
    """
    Return a training and test set for netflix based on the time each rating
    was made. There are 62 iterations.
    """
    self.timeStep = timedelta(30).total_seconds()

    # startDate is used to convert dates into ints
    #self.startDate = datetime(1998,1,1)
    #self.endDate = datetime(2005,12,31)

    # iterStartTimeStamp is the starting date of the iterator
    if iterStartTimeStamp is not None:
        self.iterStartTimeStamp = iterStartTimeStamp
    else:
        self.iterStartTimeStamp = time.mktime(datetime(2001,1,1).timetuple())

    self.startMovieID = 1
    self.endMovieID = 17770

    self.numMovies = 17770
    self.numRatings = 100480507
    self.numProbeMovies = 16938
    self.numProbeRatings = 1408395
    self.numCustomers = 480189

    outputDir = PathDefaults.getOutputDir() + "recommend/netflix/"

    if not os.path.exists(outputDir):
        os.mkdir(outputDir)

    self.ratingFileName = outputDir + "data.npz"
    self.custDictFileName = outputDir + "custIdDict.pkl"
    self.probeFileName = PathDefaults.getDataDir() + "netflix/probe.txt"
    self.testRatingsFileName = outputDir + "test_data.npz"
    self.isTrainRatingsFileName = outputDir + "is_train.npz"

    self.maxIter = maxIter
    self.trainSplit = 4.0/5

    self.processRatings()
    #self.processProbe()
    self.splitDataset()
    self.loadProcessedData()

    if self.maxIter is not None:
        logging.debug("Maximum number of iterations: " + str(self.maxIter))
def processRatings(self):
    """
    Convert the dataset into a matrix and save the results for faster access.
    """
    if not os.path.exists(self.ratingFileName) or not os.path.exists(self.custDictFileName):
        dataDir = PathDefaults.getDataDir() + "netflix/training_set/"
        logging.debug("Processing ratings given in " + dataDir)

        custIdDict = {}
        custIdSet = set([])

        movieIds = array.array("I")
        custIds = array.array("I")
        ratings = array.array("B")
        dates = array.array("L")
        j = 0

        for i in range(self.startMovieID, self.endMovieID+1):
            Util.printIteration(i-1, 1, self.endMovieID-1)
            ratingsFile = open(dataDir + "mv_" + str(i).zfill(7) + ".txt")
            ratingsFile.readline()

            for line in ratingsFile:
                vals = line.split(",")

                custId = int(vals[0])

                if custId not in custIdSet:
                    custIdSet.add(custId)
                    custIdDict[custId] = j
                    custInd = j
                    j += 1
                else:
                    custInd = custIdDict[custId]

                rating = int(vals[1])
                t = datetime.strptime(vals[2].strip(), "%Y-%m-%d")

                movieIds.append(i-1)
                custIds.append(custInd)
                ratings.append(rating)
                dates.append(int(time.mktime(t.timetuple())))

        movieIds = numpy.array(movieIds, numpy.uint32)
        custIds = numpy.array(custIds, numpy.uint32)
        ratings = numpy.array(ratings, numpy.uint8)
        dates = numpy.array(dates, numpy.uint32)

        assert ratings.shape[0] == self.numRatings

        numpy.savez(self.ratingFileName, movieIds, custIds, ratings, dates)
        logging.debug("Saved ratings file as " + self.ratingFileName)

        pickle.dump(custIdDict, open(self.custDictFileName, 'wb'))
        logging.debug("Saved custIdDict as " + self.custDictFileName)
    else:
        logging.debug("Ratings file " + str(self.ratingFileName) + " already processed")
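# Minimal sketch of reading the processed ratings back (an assumption: paths
# mirror the ones set up in the constructor above). Arrays passed positionally
# to numpy.savez are stored under the keys "arr_0", "arr_1", ...
import numpy
import pickle
from apgl.util.PathDefaults import PathDefaults

outputDir = PathDefaults.getOutputDir() + "recommend/netflix/"
data = numpy.load(outputDir + "data.npz")
movieIds, custIds, ratings, dates = data["arr_0"], data["arr_1"], data["arr_2"], data["arr_3"]
custIdDict = pickle.load(open(outputDir + "custIdDict.pkl", "rb"))
print(str(ratings.shape[0]) + " ratings from " + str(len(custIdDict)) + " customers")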
def loadData():
    """
    Return the raw spectra and the MDS transformed data as well as the
    DataFrame for the MDS data.
    """
    utilsLib = importr('utils')

    dataDir = PathDefaults.getDataDir() + "metabolomic/"
    fileName = dataDir + "data.RMN.total.6.txt"
    df = utilsLib.read_table(fileName, header=True, row_names=1, sep=",")
    maxNMRIndex = 951
    X = df.rx(robjects.IntVector(range(1, maxNMRIndex)))
    X = numpy.array(X).T

    # Load age and normalise (missing values are assigned the mean)
    ages = numpy.array(df.rx(robjects.StrVector(["Age"]))).ravel()
    meanAge = numpy.mean(ages[numpy.logical_not(numpy.isnan(ages))])
    ages[numpy.isnan(ages)] = meanAge
    ages = Standardiser().standardiseArray(ages)

    Xs = X.copy()
    standardiser = Standardiser()
    Xs = standardiser.standardiseArray(X)

    fileName = dataDir + "data.sportsmen.log.AP.1.txt"
    df = utilsLib.read_table(fileName, header=True, row_names=1, sep=",")
    maxNMRIndex = 419
    X2 = df.rx(robjects.IntVector(range(1, maxNMRIndex)))
    X2 = numpy.array(X2).T

    # Load the OPLS corrected files
    fileName = dataDir + "IGF1.log.OSC.1.txt"
    df = utilsLib.read_table(fileName, header=True, row_names=1, sep=",")
    minNMRIndex = 22
    maxNMRIndex = 441
    Xopls1 = df.rx(robjects.IntVector(range(minNMRIndex, maxNMRIndex)))
    Xopls1 = numpy.array(Xopls1).T

    fileName = dataDir + "cort.log.OSC.1.txt"
    df = utilsLib.read_table(fileName, header=True, row_names=1, sep=",")
    minNMRIndex = 20
    maxNMRIndex = 439
    Xopls2 = df.rx(robjects.IntVector(range(minNMRIndex, maxNMRIndex)))
    Xopls2 = numpy.array(Xopls2).T

    fileName = dataDir + "testo.log.OSC.1.txt"
    df = utilsLib.read_table(fileName, header=True, row_names=1, sep=",")
    minNMRIndex = 22
    maxNMRIndex = 441
    Xopls3 = df.rx(robjects.IntVector(range(minNMRIndex, maxNMRIndex)))
    Xopls3 = numpy.array(Xopls3).T

    # Let's load all the label data here
    labelNames = MetabolomicsUtils.getLabelNames()
    YList = MetabolomicsUtils.createLabelList(df, labelNames)

    return X, X2, Xs, (Xopls1, Xopls2, Xopls3), YList, ages, df
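# Usage sketch (an assumption: the metabolomic data files are present under
# the data directory used above, and rpy2 is installed):
X, X2, Xs, (Xopls1, Xopls2, Xopls3), YList, ages, df = loadData()
print(X.shape, X2.shape, len(YList))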
def testWriteToFile(self):
    sgw = SimpleGraphWriter()
    directory = PathDefaults.getOutputDir() + "test/"

    # Have to check the files
    fileName1 = directory + "dictTestUndirected"
    sgw.writeToFile(fileName1, self.dctGraph1)

    fileName2 = directory + "dictTestDirected"
    sgw.writeToFile(fileName2, self.dctGraph2)
def profileClusterFromIterator(self):
    iterator = IncreasingSubgraphListIterator(self.graph, self.subgraphIndicesList)
    dataDir = PathDefaults.getDataDir() + "cluster/"
    #iterator = getBemolGraphIterator(dataDir)

    def run():
        clusterList, timeList, boundList = self.clusterer.clusterFromIterator(iterator, verbose=True)
        print(timeList.cumsum(0))

    ProfileUtils.profile('run()', globals(), locals())
def getIterator():
    dataDir = PathDefaults.getDataDir() + "cluster/"

    nbUser = 10000          # set to 'None' to have all users
    nbPurchasesPerIt = 500  # set to 'None' to take all the purchases per date
    startingIteration = 300
    endingIteration = 600   # set to 'None' to have all iterations
    stepSize = 1

    return itertools.islice(BemolData.getGraphIterator(dataDir, nbUser, nbPurchasesPerIt), startingIteration, endingIteration, stepSize)
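# Usage sketch: the returned islice yields graphs, so the Bemol data files are
# assumed to be present under the cluster data directory for this to run.
for graph in getIterator():
    print(graph.getNumVertices(), graph.getNumEdges())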
def testReadGraph(self):
    fileName = PathDefaults.getDataDir() + "test/simpleGraph.txt"

    graphReader = SimpleGraphReader()
    graph = graphReader.readFromFile(fileName)
    logging.debug((graph.getAllEdges()))

    self.assertEquals(graph.isUndirected(), True)
    self.assertEquals(graph.getNumVertices(), 5)
    self.assertEquals(graph.getNumEdges(), 4)

    self.assertEquals(graph.getEdge(0, 1), 1)
    self.assertEquals(graph.getEdge(2, 4), 1)
    self.assertEquals(graph.getEdge(2, 2), 1)
    self.assertEquals(graph.getEdge(4, 0), 1)

    # Now test reading a file with the same graph but vertices indexed differently
    fileName = PathDefaults.getDataDir() + "test/simpleGraph2.txt"
    graph = graphReader.readFromFile(fileName)

    self.assertEquals(graph.isUndirected(), True)
    self.assertEquals(graph.getNumVertices(), 5)
    self.assertEquals(graph.getNumEdges(), 4)

    self.assertEquals(graph.getEdge(0, 1), 1.1)
    self.assertEquals(graph.getEdge(2, 4), 1)
    self.assertEquals(graph.getEdge(2, 2), 1.6)
    self.assertEquals(graph.getEdge(4, 0), 1)

    # Now test a file with directed edges
    fileName = PathDefaults.getDataDir() + "test/simpleGraph3.txt"
    graph = graphReader.readFromFile(fileName)

    self.assertEquals(graph.isUndirected(), False)
    self.assertEquals(graph.getNumVertices(), 5)
    self.assertEquals(graph.getNumEdges(), 4)

    self.assertEquals(graph.getEdge(0, 1), 1)
    self.assertEquals(graph.getEdge(2, 4), 1)
    self.assertEquals(graph.getEdge(2, 2), 1)
    self.assertEquals(graph.getEdge(4, 0), 1)
def __init__(self):
    dataDir = PathDefaults.getDataDir() + "cluster/"

    nbUser = 2000           # set to 'None' to have all users
    nbPurchasesPerIt = 50   # set to 'None' to take all the purchases per date
    startingIteration = 20
    endingIteration = None  # set to 'None' to have all iterations
    stepSize = 10

    iterator = itertools.islice(BemolData.getGraphIterator(dataDir, nbUser, nbPurchasesPerIt), startingIteration, endingIteration, stepSize)
    self.iterator = iterator
def getOutputFileName(graphType, p, k, infoProb):
    outputDirectory = PathDefaults.getOutputDir()

    if graphType == "SmallWorld":
        outputFileName = outputDirectory + "SvmEgoOutput_type=" + graphType + "_p=" + str(p) + "_k=" + str(k) + "_q=" + str(infoProb)
    elif graphType == "ErdosRenyi":
        outputFileName = outputDirectory + "SvmEgoOutput_type=" + graphType + "_p=" + str(p) + "_q=" + str(infoProb)
    else:
        raise ValueError("Invalid graph type: " + graphType)

    return outputFileName
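# Example call (parameter values are illustrative, borrowed from the
# small-world settings used elsewhere in this code):
fileName = getOutputFileName("SmallWorld", 0.1, 10, 0.1)
# -> <output dir>SvmEgoOutput_type=SmallWorld_p=0.1_k=10_q=0.1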
def testGetTrainIteratorFunc(self):
    dataFilename = PathDefaults.getDataDir() + "reference/author_document_count"
    dataset = Static2IdValDataset(dataFilename)

    trainIterator = dataset.getTrainIteratorFunc()()
    testIterator = dataset.getTestIteratorFunc()()

    for trainX in trainIterator:
        testX = next(testIterator)
        print(trainX.shape, trainX.nnz, testX.nnz)
        self.assertEquals(trainX.shape, testX.shape)
def testCreateIndicatorLabels(self):
    metaUtils = MetabolomicsUtils()
    X, XStd, X2, (XoplsCortisol, XoplsTesto, XoplsIgf1), YCortisol, YTesto, YIgf1, ages = metaUtils.loadData()

    YCortisol = YCortisol[numpy.logical_not(numpy.isnan(YCortisol))]
    YCortisolIndicators = metaUtils.createIndicatorLabel(YCortisol, metaUtils.boundsDict["Cortisol"])

    YTesto = YTesto[numpy.logical_not(numpy.isnan(YTesto))]
    YTestoIndicators = metaUtils.createIndicatorLabel(YTesto, metaUtils.boundsDict["Testosterone"])

    YIgf1 = YIgf1[numpy.logical_not(numpy.isnan(YIgf1))]
    YIgf1Indicators = metaUtils.createIndicatorLabel(YIgf1, metaUtils.boundsDict["IGF1"])

    s = numpy.sum(YCortisolIndicators, 1)
    nptst.assert_array_equal(s, numpy.ones(s.shape[0]))

    s = numpy.sum(YTestoIndicators, 1)
    nptst.assert_array_equal(s, numpy.ones(s.shape[0]))

    s = numpy.sum(YIgf1Indicators, 1)
    nptst.assert_array_equal(s, numpy.ones(s.shape[0]))

    # Now compare to those labels in the file
    X, X2, (XoplsCortisol, XoplsTesto, XoplsIgf1), YCortisol, YTesto, YIgf1, ages = metaUtils.loadData()
    dataDir = PathDefaults.getDataDir() + "metabolomic/"
    fileName = dataDir + "data.RMN.total.6.txt"
    data = pandas.read_csv(fileName, delimiter=",")

    YCortisolIndicators = metaUtils.createIndicatorLabel(YCortisol, metaUtils.boundsDict["Cortisol"])
    YCortisolIndicators2 = numpy.array(data[["Ind.Cortisol.1", "Ind.Cortisol.2", "Ind.Cortisol.3"]])

    for i in range(YCortisolIndicators.shape[0]):
        if not numpy.isnan(YCortisol[i]) and not numpy.isnan(YCortisolIndicators2[i, :]).any():
            #nptst.assert_almost_equal(YCortisolIndicators2[i, :], YCortisolIndicators[i, :])
            pass

    YTestoIndicators = metaUtils.createIndicatorLabel(YTesto, metaUtils.boundsDict["Testosterone"])
    YTestoIndicators2 = numpy.array(data[["Ind.Testo.1", "Ind.Testo.2", "Ind.Testo.3"]])

    for i in range(YTestoIndicators.shape[0]):
        if not numpy.isnan(YTesto[i]) and not numpy.isnan(YTestoIndicators2[i, :]).any():
            #print(i, YTesto[i])
            nptst.assert_almost_equal(YTestoIndicators2[i, :], YTestoIndicators[i, :])

    YIgf1Indicators = metaUtils.createIndicatorLabel(YIgf1, metaUtils.boundsDict["IGF1"])
    YIgf1Indicators2 = numpy.array(data[["Ind.IGF1.1", "Ind.IGF1.2", "Ind.IGF1.3"]])

    for i in range(YIgf1Indicators.shape[0]):
        if not numpy.isnan(YIgf1[i]) and not numpy.isnan(YIgf1Indicators2[i, :]).any():
            #print(i, YIgf1[i])
            #nptst.assert_almost_equal(YIgf1Indicators2[i, :], YIgf1Indicators[i, :])
            pass
def testWriteToFile(self):
    graph = DictGraph()

    numVertices = 5
    numFeatures = 3
    V = numpy.random.rand(numVertices, numFeatures)

    for i in range(0, numVertices):
        graph.setVertex(i, V[i, :])

    fileName = PathDefaults.getOutputDir() + "test/vertices"
    vertexWriter = CsvVertexWriter()
    vertexWriter.writeToFile(fileName, graph)

    logging.debug(V)
def testSaveLoad(self):
    try:
        vList = VertexList(self.numVertices, self.numFeatures)
        vList.setVertex(0, numpy.array([1, 2, 3]))
        vList.setVertex(1, numpy.array([4, 5, 6]))
        vList.setVertex(2, numpy.array([7, 8, 9]))

        tempDir = PathDefaults.getTempDir()
        fileName = tempDir + "vList"
        vList.save(fileName)

        vList2 = VertexList.load(fileName)

        self.assertTrue((vList.getVertices() == vList2.getVertices()).all())
    except IOError as e:
        logging.warn(e)
        pass
def testSaveLoad(self):
    try:
        vList = GeneralVertexList(self.numVertices)
        vList.setVertex(0, "abc")
        vList.setVertex(1, 12)
        vList.setVertex(2, "num")

        tempDir = PathDefaults.getTempDir()
        fileName = tempDir + "vList"
        vList.save(fileName)

        vList2 = GeneralVertexList.load(fileName)

        for i in range(self.numVertices):
            self.assertEquals(vList.getVertex(i), vList2.getVertex(i))
    except IOError as e:
        logging.warn(e)
        pass
def testWriteToFile(self):
    pw = PajekWriter()
    directory = PathDefaults.getOutputDir() + "test/"

    # Have to check the files
    fileName1 = directory + "denseTestUndirected"
    pw.writeToFile(fileName1, self.dGraph1)

    fileName2 = directory + "denseTestDirected"
    pw.writeToFile(fileName2, self.dGraph2)

    fileName3 = directory + "sparseTestUndirected"
    pw.writeToFile(fileName3, self.sGraph1)

    fileName4 = directory + "sparseTestDirected"
    pw.writeToFile(fileName4, self.sGraph2)

    fileName5 = directory + "dictTestUndirected"
    pw.writeToFile(fileName5, self.dctGraph1)

    fileName6 = directory + "dictTestDirected"
    pw.writeToFile(fileName6, self.dctGraph2)
def testWriteToFile2(self):
    pw = PajekWriter()
    directory = PathDefaults.getOutputDir() + "test/"

    def setVertexColour(vertexIndex, graph):
        colours = ["grey05", "grey10", "grey15", "grey20", "grey25"]
        return colours[vertexIndex]

    def setVertexSize(vertexIndex, graph):
        return vertexIndex

    def setEdgeColour(vertexIndex1, vertexIndex2, graph):
        colours = ["grey05", "grey10", "grey15", "grey20", "grey25"]
        return colours[vertexIndex1]

    def setEdgeSize(vertexIndex1, vertexIndex2, graph):
        return vertexIndex1 + vertexIndex2

    pw.setVertexColourFunction(setVertexColour)
    fileName1 = directory + "vertexColourTest"
    pw.writeToFile(fileName1, self.dGraph1)
    pw.setVertexColourFunction(None)

    pw.setVertexSizeFunction(setVertexSize)
    fileName1 = directory + "vertexSizeTest"
    pw.writeToFile(fileName1, self.dGraph1)
    pw.setVertexSizeFunction(None)

    pw.setEdgeColourFunction(setEdgeColour)
    fileName1 = directory + "edgeColourTest"
    pw.writeToFile(fileName1, self.dGraph1)
    pw.setEdgeColourFunction(None)

    pw.setEdgeSizeFunction(setEdgeSize)
    fileName1 = directory + "edgeSizeTest"
    pw.writeToFile(fileName1, self.dGraph1)
    pw.setEdgeColourFunction(None)
def test():
    """
    A function which uses the unittest library to find all tests within apgl
    (those files matching "*Test.py"), and run those tests. In Python 2.7 and
    above the built-in unittest framework is used, otherwise one needs
    unittest2 for Python 2.3-2.6.
    """
    try:
        import traceback
        import sys
        import os
        import logging
        from apgl.util.PathDefaults import PathDefaults

        logging.disable(logging.WARNING)
        #logging.disable(logging.INFO)

        sourceDir = PathDefaults.getSourceDir()
        print("Running tests from " + sourceDir)

        version = getPythonVersion()
        if version >= 2.7:
            import unittest
        else:
            import unittest2 as unittest

        overallTestSuite = unittest.TestSuite()
        overallTestSuite.addTest(unittest.defaultTestLoader.discover(os.path.join(sourceDir, "generator"), pattern='*Test.py', top_level_dir=sourceDir))
        overallTestSuite.addTest(unittest.defaultTestLoader.discover(os.path.join(sourceDir, "graph"), pattern='*Test.py', top_level_dir=sourceDir))
        overallTestSuite.addTest(unittest.defaultTestLoader.discover(os.path.join(sourceDir, "util"), pattern='*Test.py', top_level_dir=sourceDir))

        unittest.TextTestRunner(verbosity=1).run(overallTestSuite)
    except ImportError as error:
        traceback.print_exc(file=sys.stdout)
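# Invocation sketch (an assumption: the package exposes this function at the
# top level as apgl.test(), which may differ in your installation):
#   import apgl
#   apgl.test()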
def setUp(self):
    tempDir = PathDefaults.getTempDir()
    self.fileName = tempDir + "abc"
def testReadGraph(self):
    dir = PathDefaults.getDataDir()
    vertexFileName = dir + "test/deggraf10.csv"
    edgeFileNames = [dir + "test/testEdges1.csv", dir + "test/testEdges2.csv"]

    def genderConv(x):
        genderDict = {'"M"': 0, '"F"': 1}
        return genderDict[x]

    def orientConv(x):
        orientDict = {'"HT"': 0, '"HB"': 1}
        return orientDict[x]

    def fteConv(x):
        fteDict = {'"INTER"': 0, '"CONTA"': 1}
        return fteDict[x]

    def provConv(x):
        provDict = {'"CH"': 0, '"SC"': 1, '"SS"': 2, '"LH"': 3, '"GM"': 4}
        return provDict[x]

    converters = {3: genderConv, 4: orientConv, 5: fteConv, 6: provConv}
    idIndex = 0
    featureIndices = list(range(1, 11))

    multiGraphCsvReader = MultiGraphCsvReader(idIndex, featureIndices, converters)
    sparseMultiGraph = multiGraphCsvReader.readGraph(vertexFileName, edgeFileNames)

    vertexValues = numpy.zeros((10, 10))
    vertexValues[0, :] = numpy.array([1986, 32, 0, 0, 0, 0, 0, 3, 3, 1])
    vertexValues[1, :] = numpy.array([1986, 27, 0, 0, 0, 1, 0, 4, 4, 1])
    vertexValues[2, :] = numpy.array([1986, 20, 0, 0, 0, 1, 0, 1, 1, 0])
    vertexValues[3, :] = numpy.array([1986, 20, 0, 0, 0, 1, 0, 2, 2, 0])
    vertexValues[4, :] = numpy.array([1986, 20, 0, 0, 0, 2, 0, 5, 5, 0])
    vertexValues[5, :] = numpy.array([1986, 28, 0, 0, 0, 3, 0, 1, 1, 1])
    vertexValues[6, :] = numpy.array([1986, 26, 1, 0, 1, 3, 6, 1, 1, 1])
    vertexValues[7, :] = numpy.array([1986, 35, 0, 0, 0, 2, 0, 0, 0, 0])
    vertexValues[8, :] = numpy.array([1986, 37, 0, 1, 0, 3, 0, 5, 3, 0])
    vertexValues[9, :] = numpy.array([1986, 40, 0, 1, 0, 4, 0, 3, 3, 0])

    # Check if the values of the vertices are correct
    for i in range(sparseMultiGraph.getNumVertices()):
        self.assertTrue((sparseMultiGraph.getVertex(i) == vertexValues[i]).all())

    # Now check edges
    edges = numpy.zeros((10, 3))
    edges[0, :] = numpy.array([4, 0, 0])
    edges[1, :] = numpy.array([4, 1, 0])
    edges[2, :] = numpy.array([5, 3, 0])
    edges[3, :] = numpy.array([7, 1, 0])
    edges[4, :] = numpy.array([8, 0, 0])
    edges[5, :] = numpy.array([4, 1, 1])
    edges[6, :] = numpy.array([8, 1, 1])
    edges[7, :] = numpy.array([8, 2, 1])
    edges[8, :] = numpy.array([8, 4, 1])
    edges[9, :] = numpy.array([9, 0, 1])

    self.assertTrue((sparseMultiGraph.getAllEdges() == edges).all())

    # Now test directed graphs
    sparseMultiGraph = multiGraphCsvReader.readGraph(vertexFileName, edgeFileNames, False)

    for i in range(sparseMultiGraph.getNumVertices()):
        self.assertTrue((sparseMultiGraph.getVertex(i) == vertexValues[i]).all())

    edges = numpy.zeros((10, 3))
    edges[0, :] = numpy.array([0, 4, 0])
    edges[1, :] = numpy.array([1, 7, 0])
    edges[2, :] = numpy.array([3, 5, 0])
    edges[3, :] = numpy.array([4, 1, 0])
    edges[4, :] = numpy.array([8, 0, 0])
    edges[5, :] = numpy.array([0, 9, 1])
    edges[6, :] = numpy.array([1, 8, 1])
    edges[7, :] = numpy.array([2, 8, 1])
    edges[8, :] = numpy.array([4, 1, 1])
    edges[9, :] = numpy.array([8, 4, 1])

    self.assertTrue((sparseMultiGraph.getAllEdges() == edges).all())
def testMDLGraphsReader(self):
    reader = MDLGraphsReader()

    dir = PathDefaults.getDataDir()
    fileName = dir + "test/testGraphs.mdl"

    graphs = reader.readFromFile(fileName)
    self.assertEquals(len(graphs), 2)

    # Check the first graph
    self.assertEquals(graphs[0].getNumVertices(), 26)
    self.assertEquals(graphs[0].getNumEdges(), 28)

    # The MDL file indexes atoms from 1, whereas the graphs are indexed from 0
    def getEdge(graph, i, j):
        return graph.getEdge(i - 1, j - 1)

    self.assertEquals(getEdge(graphs[0], 1, 6), 1)
    self.assertEquals(getEdge(graphs[0], 1, 2), 1)
    self.assertEquals(getEdge(graphs[0], 1, 18), 1)
    self.assertEquals(getEdge(graphs[0], 2, 3), 1)
    self.assertEquals(getEdge(graphs[0], 2, 19), 1)
    self.assertEquals(getEdge(graphs[0], 3, 4), 1)
    self.assertEquals(getEdge(graphs[0], 3, 20), 1)
    self.assertEquals(getEdge(graphs[0], 4, 10), 1)
    self.assertEquals(getEdge(graphs[0], 4, 5), 1)
    self.assertEquals(getEdge(graphs[0], 5, 6), 1)
    self.assertEquals(getEdge(graphs[0], 5, 7), 1)
    self.assertEquals(getEdge(graphs[0], 6, 21), 1)
    self.assertEquals(getEdge(graphs[0], 7, 8), 1)
    self.assertEquals(getEdge(graphs[0], 7, 22), 1)
    self.assertEquals(getEdge(graphs[0], 8, 9), 1)
    self.assertEquals(getEdge(graphs[0], 8, 23), 1)
    self.assertEquals(getEdge(graphs[0], 9, 14), 1)
    self.assertEquals(getEdge(graphs[0], 9, 10), 1)
    self.assertEquals(getEdge(graphs[0], 10, 11), 1)
    self.assertEquals(getEdge(graphs[0], 11, 12), 1)
    self.assertEquals(getEdge(graphs[0], 11, 24), 1)
    self.assertEquals(getEdge(graphs[0], 12, 13), 1)
    self.assertEquals(getEdge(graphs[0], 12, 25), 1)
    self.assertEquals(getEdge(graphs[0], 13, 14), 1)
    self.assertEquals(getEdge(graphs[0], 13, 15), 1)
    self.assertEquals(getEdge(graphs[0], 14, 26), 1)
    self.assertEquals(getEdge(graphs[0], 15, 16), 1)
    self.assertEquals(getEdge(graphs[0], 15, 17), 1)

    # Check the second graph
    self.assertEquals(graphs[1].getNumVertices(), 19)
    self.assertEquals(graphs[1].getNumEdges(), 20)

    self.assertEquals(getEdge(graphs[1], 1, 10), 1)
    self.assertEquals(getEdge(graphs[1], 1, 2), 1)
    self.assertEquals(getEdge(graphs[1], 1, 14), 1)
    self.assertEquals(getEdge(graphs[1], 2, 3), 1)
    self.assertEquals(getEdge(graphs[1], 2, 15), 1)
    self.assertEquals(getEdge(graphs[1], 3, 8), 1)
    self.assertEquals(getEdge(graphs[1], 3, 4), 1)
    self.assertEquals(getEdge(graphs[1], 4, 5), 1)
    self.assertEquals(getEdge(graphs[1], 4, 16), 1)
    self.assertEquals(getEdge(graphs[1], 5, 6), 1)
    self.assertEquals(getEdge(graphs[1], 5, 17), 1)
    self.assertEquals(getEdge(graphs[1], 6, 7), 1)
    self.assertEquals(getEdge(graphs[1], 6, 18), 1)
    self.assertEquals(getEdge(graphs[1], 7, 8), 1)
    self.assertEquals(getEdge(graphs[1], 8, 9), 1)
    self.assertEquals(getEdge(graphs[1], 9, 10), 1)
    self.assertEquals(getEdge(graphs[1], 9, 11), 1)
    self.assertEquals(getEdge(graphs[1], 10, 19), 1)
    self.assertEquals(getEdge(graphs[1], 11, 12), 1)
    self.assertEquals(getEdge(graphs[1], 11, 13), 1)
def testGetOutputDir(self):
    print(PathDefaults.getOutputDir())
def testGetProjectDir(self):
    print(PathDefaults.getSourceDir())
def testGetDataDir(self):
    print(PathDefaults.getDataDir())
def testReadFromFile(self):
    vertex1Indices = [0, 2, 3, 4, 5]
    vertex2Indices = [1, 6, 7, 8, 9]

    def genderConv(x):
        genderDict = {'"M"': 0, '"F"': 1}
        return genderDict[x]

    def orientConv(x):
        orientDict = {'"HT"': 0, '"HB"': 1}
        return orientDict[x]

    converters = {2: genderConv, 6: genderConv, 3: orientConv, 7: orientConv}

    csvGraphReader = CsvGraphReader(vertex1Indices, vertex2Indices, converters)

    dir = PathDefaults.getDataDir()
    fileName = dir + "test/infect5.csv"

    graph = csvGraphReader.readFromFile(fileName)

    self.assertTrue((graph.getVertex(0) == numpy.array([0, 0, 28, 1])).all())
    self.assertTrue((graph.getVertex(1) == numpy.array([1, 0, 26, 1])).all())
    self.assertTrue((graph.getVertex(2) == numpy.array([0, 1, 42, 2])).all())
    self.assertTrue((graph.getVertex(3) == numpy.array([1, 0, 33, 1])).all())
    self.assertTrue((graph.getVertex(4) == numpy.array([0, 1, 35, 37])).all())

    self.assertTrue(graph.getEdge(0, 1) == 1)
    self.assertTrue(graph.getEdge(2, 3) == 1)
    self.assertTrue(graph.getEdge(4, 6) == 1)
    self.assertTrue(graph.getEdge(6, 7) == 1)
    self.assertTrue(graph.getEdge(5, 8) == 1)
    self.assertEquals(graph.getNumEdges(), 5)
    self.assertTrue(graph.isUndirected())

    # Test a directed graph
    csvGraphReader = CsvGraphReader(vertex1Indices, vertex2Indices, converters, undirected=False)
    graph = csvGraphReader.readFromFile(fileName)

    self.assertTrue(graph.getEdge(1, 0) == None)
    self.assertTrue(graph.getEdge(3, 2) == None)
    self.assertTrue(graph.getEdge(6, 4) == None)
    self.assertTrue(graph.getEdge(7, 6) == None)
    self.assertTrue(graph.getEdge(8, 5) == None)
    self.assertEquals(graph.getNumEdges(), 5)
    self.assertFalse(graph.isUndirected())

    # Test a graph with no vertex information
    vertex1Indices = [0]
    vertex2Indices = [1]
    fileName = dir + "test/infect5-0.csv"

    csvGraphReader = CsvGraphReader(vertex1Indices, vertex2Indices, {})
    graph = csvGraphReader.readFromFile(fileName)

    self.assertTrue(graph.getEdge(0, 1) == 1)
    self.assertTrue(graph.getEdge(2, 3) == 1)
    self.assertTrue(graph.getEdge(4, 6) == 1)
    self.assertTrue(graph.getEdge(6, 7) == 1)
    self.assertTrue(graph.getEdge(5, 8) == 1)
    self.assertEquals(graph.getNumEdges(), 5)
    self.assertTrue(graph.isUndirected())
    self.assertEquals(graph.getVertexList().getNumFeatures(), 0)