def __init__(self, field):
    numpy.random.seed(21)
    dataDir = PathDefaults.getDataDir() + "dblp/"
    self.xmlFileName = dataDir + "dblp.xml"
    self.xmlCleanFilename = dataDir + "dblpClean.xml"

    resultsDir = PathDefaults.getDataDir() + "reputation/" + field + "/"
    self.expertsFileName = resultsDir + "experts.txt"
    self.expertMatchesFilename = resultsDir + "experts_matches.csv"
    self.trainExpertMatchesFilename = resultsDir + "experts_train_matches.csv"
    self.testExpertMatchesFilename = resultsDir + "experts_test_matches.csv"
    self.coauthorsFilename = resultsDir + "coauthors.csv"
    self.publicationsFilename = resultsDir + "publications.csv"

    self.stepSize = 100000
    self.numLines = 33532888
    self.publicationTypes = set(["article", "inproceedings", "proceedings", "book", "incollection", "phdthesis", "mastersthesis", "www"])
    self.p = 0.5
    self.matchCutoff = 0.95

    self.cleanXML()
    self.matchExperts()
    logging.warning("Now you must disambiguate the matched experts if not already done")
def processSimpleDataset(name, numRealisations, split, ext=".csv", delimiter=",", usecols=None, skiprows=1, converters=None):
    numpy.random.seed(21)
    dataDir = PathDefaults.getDataDir() + "modelPenalisation/regression/"
    fileName = dataDir + name + ext

    print("Loading data from file " + fileName)
    outputDir = PathDefaults.getDataDir() + "modelPenalisation/regression/" + name + "/"

    XY = numpy.loadtxt(fileName, delimiter=delimiter, skiprows=skiprows, usecols=usecols, converters=converters)
    X = XY[:, :-1]
    y = XY[:, -1]
    idx = Sampling.shuffleSplit(numRealisations, X.shape[0], split)
    preprocessSave(X, y, outputDir, idx)
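#--- A numpy stand-in for Sampling.shuffleSplit, with semantics inferred from
#--- how it is used above (numRealisations, number of examples, split fraction).
#--- This is an assumption for illustration, not the project's implementation.
import numpy

def shuffleSplitSketch(numRealisations, numExamples, split):
    #One (trainInds, testInds) pair per realisation
    idx = []
    for _ in range(numRealisations):
        perm = numpy.random.permutation(numExamples)
        numTrain = int(numExamples*split)
        idx.append((perm[:numTrain], perm[numTrain:]))
    return idx

idx = shuffleSplitSketch(3, 10, 0.7)
print([(train.shape[0], test.shape[0]) for train, test in idx])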
def testGenerateRandomGraph(self):
    egoFileName = PathDefaults.getDataDir() + "infoDiffusion/EgoData.csv"
    alterFileName = PathDefaults.getDataDir() + "infoDiffusion/AlterData.csv"
    numVertices = 1000
    infoProb = 0.1

    p = 0.1
    neighbours = 10
    generator = SmallWorldGenerator(p, neighbours)
    graph = SparseGraph(VertexList(numVertices, 0))
    graph = generator.generate(graph)

    self.svmEgoSimulator.generateRandomGraph(egoFileName, alterFileName, infoProb, graph)
def flixster(minNnzRows=10, minNnzCols=2, quantile=90):
    matrixFileName = PathDefaults.getDataDir() + "flixster/Ratings.timed.txt"
    matrixFile = open(matrixFileName)
    matrixFile.readline()
    userIndexer = IdIndexer("i")
    movieIndexer = IdIndexer("i")
    ratings = array.array("f")
    logging.debug("Loading ratings from " + matrixFileName)

    for i, line in enumerate(matrixFile):
        if i % 1000000 == 0:
            logging.debug("Iteration: " + str(i))

        vals = line.split()

        userIndexer.append(vals[0])
        movieIndexer.append(vals[1])
        ratings.append(float(vals[2]))

    rowInds = userIndexer.getArray()
    colInds = movieIndexer.getArray()
    ratings = numpy.array(ratings)

    X = sppy.csarray((len(userIndexer.getIdDict()), len(movieIndexer.getIdDict())), storagetype="row", dtype=numpy.int)
    X.put(numpy.array(ratings > 3, numpy.int), numpy.array(rowInds, numpy.int32), numpy.array(colInds, numpy.int32), init=True)
    X.prune()
    X = SparseUtils.pruneMatrixRowAndCols(X, minNnzRows, minNnzCols)

    logging.debug("Read file: " + matrixFileName)
    logging.debug("Non zero elements: " + str(X.nnz) + " shape: " + str(X.shape))

    #X = Sampling.sampleUsers(X, 1000)

    return X
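#--- The dataset loaders in this file share a load -> binarise (rating > 3) ->
#--- prune pattern. Below is a minimal self-contained sketch of that pattern
#--- using scipy.sparse in place of sppy, on made-up rating triplets.
import numpy
import scipy.sparse

rowInds = numpy.array([0, 0, 1, 2, 2, 3])
colInds = numpy.array([0, 1, 1, 0, 2, 2])
ratings = numpy.array([5.0, 2.0, 4.0, 1.0, 5.0, 4.0])

#Keep only "positive" ratings, i.e. those greater than 3
X = scipy.sparse.csr_matrix(((ratings > 3).astype(numpy.int64), (rowInds, colInds)))
X.eliminate_zeros()

#Drop rows/columns with too few non-zeros (the job pruneMatrixRowAndCols does above)
minNnzRows, minNnzCols = 1, 1
rowMask = numpy.asarray((X != 0).sum(1)).ravel() >= minNnzRows
colMask = numpy.asarray((X != 0).sum(0)).ravel() >= minNnzCols
X = X[rowMask, :][:, colMask]
print(X.shape, X.nnz)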
def main(argv=None):
    if argv is None:
        argv = sys.argv
    try:
        # read options
        try:
            opts, args = getopt.getopt(argv[1:], "hd:n:D", ["help", "dir=", "nb_user=", "debug"])
        except getopt.error as msg:
            raise RGUsage(msg)
        # apply options
        dir = PathDefaults.getDataDir() + "cluster/"
        nb_user = None
        log_level = logging.INFO
        for o, a in opts:
            if o in ("-h", "--help"):
                print(__doc__)
                return 0
            elif o in ("-d", "--dir"):
                dir = a
            elif o in ("-n", "--nb_user"):
                nb_user = int(a)
            elif o in ("-D", "--debug"):
                log_level = logging.DEBUG
        logging.basicConfig(stream=sys.stdout, level=log_level, format='%(levelname)s (%(asctime)s):%(message)s')
        # process: generate data files
        BemolData.generate_data_file(dir, nb_user)
    except RGUsage as err:
        logging.error(err.msg)
        logging.error("for help use --help")
        return 2
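#--- The getopt pattern used by main(), run against a made-up argv for illustration:
import getopt
opts, args = getopt.getopt(["-n", "42", "--debug"], "hd:n:D", ["help", "dir=", "nb_user=", "debug"])
print(opts)   #[('-n', '42'), ('--debug', '')]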
def __init__(self, maxIter=None, iterStartTimeStamp=None):
    outputDir = PathDefaults.getOutputDir() + "recommend/erasm/"

    if not os.path.exists(outputDir):
        os.mkdir(outputDir)

    #iterStartTimeStamp is the starting date of the iterator
    if iterStartTimeStamp is not None:
        self.iterStartTimeStamp = iterStartTimeStamp
    else:
        self.iterStartTimeStamp = 1286229600

    self.timeStep = timedelta(30).total_seconds()

    self.ratingFileName = outputDir + "data.npz"
    self.userDictFileName = outputDir + "userIdDict.pkl"
    self.groupDictFileName = outputDir + "groupIdDict.pkl"
    self.isTrainRatingsFileName = outputDir + "is_train.npz"

    self.dataDir = PathDefaults.getDataDir() + "erasm/"
    self.dataFileName = self.dataDir + "groupMembers-29-11-12"

    self.maxIter = maxIter
    self.trainSplit = 4.0/5

    self.processRatings()
    self.splitDataset()
    self.loadProcessedData()
def testEdgeFile(self):
    """
    Figure out the problem with the edge file
    """
    dataDir = PathDefaults.getDataDir() + "cluster/"
    edgesFilename = dataDir + "Cit-HepTh.txt"

    edges = {}
    file = open(edgesFilename, 'r')
    #Skip the 4 header lines
    file.readline()
    file.readline()
    file.readline()
    file.readline()

    vertices = {}

    for line in file:
        (vertex1, sep, vertex2) = line.partition("\t")
        vertex1 = vertex1.strip()
        vertex2 = vertex2.strip()
        edges[(vertex1, vertex2)] = 0
        vertices[vertex1] = 0
        vertices[vertex2] = 0

    #The paper says there are 352807 edges and 27770 vertices
    self.assertEquals(len(edges), 352807)
    self.assertEquals(len(vertices), 27770)
def epinions(minNnzRows=10, minNnzCols=3, quantile=90):
    matrixFileName = PathDefaults.getDataDir() + "epinions/rating.mat"
    A = scipy.io.loadmat(matrixFileName)["rating"]

    userIndexer = IdIndexer("i")
    itemIndexer = IdIndexer("i")

    for i in range(A.shape[0]):
        userIndexer.append(A[i, 0])
        itemIndexer.append(A[i, 1])

    rowInds = userIndexer.getArray()
    colInds = itemIndexer.getArray()
    ratings = A[:, 3]

    X = sppy.csarray((len(userIndexer.getIdDict()), len(itemIndexer.getIdDict())), storagetype="row", dtype=numpy.int)
    X.put(numpy.array(ratings > 3, numpy.int), numpy.array(rowInds, numpy.int32), numpy.array(colInds, numpy.int32), init=True)
    X.prune()
    X = SparseUtils.pruneMatrixRowAndCols(X, minNnzRows, minNnzCols)

    logging.debug("Read file: " + matrixFileName)
    logging.debug("Non zero elements: " + str(X.nnz) + " shape: " + str(X.shape))

    return X
def testToyData(self):
    dataDir = PathDefaults.getDataDir() + "modelPenalisation/toy/"
    data = numpy.load(dataDir + "toyData.npz")
    gridPoints, X, y, pdfX, pdfY1X, pdfYminus1X = data["arr_0"], data["arr_1"], data["arr_2"], data["arr_3"], data["arr_4"], data["arr_5"]

    pxSum = 0
    pY1XSum = 0
    pYminus1XSum = 0
    px2Sum = 0

    squareArea = (gridPoints[1] - gridPoints[0])**2

    #Approximate each density's integral over the grid: the value on each cell
    #is the average of its four corners, weighted by the cell area
    for i in range(gridPoints.shape[0]-1):
        for j in range(gridPoints.shape[0]-1):
            px = (pdfX[i, j] + pdfX[i+1, j] + pdfX[i, j+1] + pdfX[i+1, j+1])/4
            pxSum += px*squareArea

            pY1X = (pdfY1X[i, j] + pdfY1X[i+1, j] + pdfY1X[i, j+1] + pdfY1X[i+1, j+1])/4
            pY1XSum += pY1X*squareArea

            pYminus1X = (pdfYminus1X[i, j] + pdfYminus1X[i+1, j] + pdfYminus1X[i, j+1] + pdfYminus1X[i+1, j+1])/4
            pYminus1XSum += pYminus1X*squareArea

            px2Sum += px*pY1X*squareArea + px*pYminus1X*squareArea

    self.assertAlmostEquals(pxSum, 1)
    print(pY1XSum)
    print(pYminus1XSum)
    self.assertAlmostEquals(px2Sum, 1)
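#--- The same four-corner cell average on a known density: a standard 2D
#--- Gaussian has nearly all of its mass inside [-5, 5]^2, so the grid sum
#--- should come out close to 1. Self-contained numpy sketch of the check above.
import numpy

gridPoints = numpy.linspace(-5, 5, 101)
xx, yy = numpy.meshgrid(gridPoints, gridPoints, indexing="ij")
pdf = numpy.exp(-(xx**2 + yy**2)/2)/(2*numpy.pi)

squareArea = (gridPoints[1] - gridPoints[0])**2
corners = (pdf[:-1, :-1] + pdf[1:, :-1] + pdf[:-1, 1:] + pdf[1:, 1:])/4
print(numpy.sum(corners)*squareArea)   #approximately 1.0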
def testComputeIdealPenalty(self):
    dataDir = PathDefaults.getDataDir() + "modelPenalisation/toy/"
    data = numpy.load(dataDir + "toyData.npz")
    gridPoints, X, y, pdfX, pdfY1X, pdfYminus1X = data["arr_0"], data["arr_1"], data["arr_2"], data["arr_3"], data["arr_4"], data["arr_5"]

    sampleSize = 100
    trainX, trainY = X[0:sampleSize, :], y[0:sampleSize]
    testX, testY = X[sampleSize:, :], y[sampleSize:]

    #We form a test set from the grid points
    fullX = numpy.zeros((gridPoints.shape[0]**2, 2))
    for m in range(gridPoints.shape[0]):
        fullX[m*gridPoints.shape[0]:(m+1)*gridPoints.shape[0], 0] = gridPoints
        fullX[m*gridPoints.shape[0]:(m+1)*gridPoints.shape[0], 1] = gridPoints[m]

    C = 1.0
    gamma = 1.0
    args = (trainX, trainY, fullX, C, gamma, gridPoints, pdfX, pdfY1X, pdfYminus1X)
    penalty = computeIdealPenalty(args)

    #Now compute penalty using data
    args = (trainX, trainY, testX, testY, C, gamma)
    penalty2 = computeIdealPenalty2(args)

    self.assertAlmostEquals(penalty2, penalty, 2)
def __init__(self):
    self.labelNames = ["Cortisol.val", "Testosterone.val", "IGF1.val"]
    self.dataDir = PathDefaults.getDataDir() + "metabolomic/"
    self.boundsDict = {}
    self.boundsDict["Cortisol"] = numpy.array([0, 89, 225, 573])
    self.boundsDict["Testosterone"] = numpy.array([0, 3, 9, 13])
    self.boundsDict["IGF1"] = numpy.array([0, 200, 441, 782])
def testPredict2(self):
    #Test on Gauss2D dataset
    dataDir = PathDefaults.getDataDir()

    fileName = dataDir + "Gauss2D_learn.csv"
    XY = numpy.loadtxt(fileName, skiprows=1, usecols=(1, 2, 3), delimiter=",")
    X = XY[:, 0:2]
    y = XY[:, 2]

    fileName = dataDir + "Gauss2D_test.csv"
    testXY = numpy.loadtxt(fileName, skiprows=1, usecols=(1, 2, 3), delimiter=",")
    testX = testXY[:, 0:2]
    testY = testXY[:, 2]

    X = Standardiser().standardiseArray(X)
    testX = Standardiser().standardiseArray(testX)

    maxDepths = range(3, 10)
    trainAucs = numpy.array([0.7194734, 0.7284824, 0.7332185, 0.7348198, 0.7366152, 0.7367508, 0.7367508, 0.7367508])
    testAucs = numpy.array([0.6789078, 0.6844632, 0.6867918, 0.6873420, 0.6874820, 0.6874400, 0.6874400, 0.6874400])

    i = 0
    #The results are approximately the same, but not exactly
    for maxDepth in maxDepths:
        treeRank = TreeRank(self.leafRanklearner)
        treeRank.setMaxDepth(maxDepth)
        treeRank.learnModel(X, y)

        trainScores = treeRank.predict(X)
        testScores = treeRank.predict(testX)

        self.assertAlmostEquals(Evaluator.auc(trainScores, y), trainAucs[i], 2)
        self.assertAlmostEquals(Evaluator.auc(testScores, testY), testAucs[i], 1)
        i += 1
def getIterator():
    dataDir = PathDefaults.getDataDir() + "cluster/"
    nbUser = 10000           # set to 'None' to have all users
    nbPurchasesPerIt = 500   # set to 'None' to take all the purchases per date
    startingIteration = 300
    endingIteration = 600    # set to 'None' to have all iterations
    stepSize = 1

    return itertools.islice(BemolData.getGraphIterator(dataDir, nbUser, nbPurchasesPerIt), startingIteration, endingIteration, stepSize)
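#--- itertools.islice(it, start, stop, step) consumes and discards the skipped
#--- items one at a time rather than materialising a list, which is why
#--- getIterator() can window a long graph iterator cheaply; a tiny demo:
import itertools
squares = (i*i for i in itertools.count())
print(list(itertools.islice(squares, 300, 310, 2)))   #squares of 300, 302, ..., 308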
def profileClusterFromIterator(self):
    iterator = IncreasingSubgraphListIterator(self.graph, self.subgraphIndicesList)
    dataDir = PathDefaults.getDataDir() + "cluster/"
    #iterator = getBemolGraphIterator(dataDir)

    def run():
        clusterList, timeList, boundList = self.clusterer.clusterFromIterator(iterator, verbose=True)
        print(timeList.cumsum(0))

    ProfileUtils.profile('run()', globals(), locals())
def syntheticDataset2():
    """
    Create a simple synthetic dataset using a power law distribution on users and items
    """
    resultsDir = PathDefaults.getDataDir() + "syntheticRanking/"
    matrixFileName = resultsDir + "dataset1.mtx"

    X = sppy.io.mmread(matrixFileName, storagetype="row")

    return X
def testGetTrainIteratorFunc(self):
    dataFilename = PathDefaults.getDataDir() + "reference/author_document_count"
    dataset = Static2IdValDataset(dataFilename)

    trainIterator = dataset.getTrainIteratorFunc()()
    testIterator = dataset.getTestIteratorFunc()()

    for trainX in trainIterator:
        testX = next(testIterator)
        print(trainX.shape, trainX.nnz, testX.nnz)
        self.assertEquals(trainX.shape, testX.shape)
def processParkinsonsDataset(name, numRealisations):
    numpy.random.seed(21)
    dataDir = PathDefaults.getDataDir() + "modelPenalisation/regression/"
    fileName = dataDir + name + ".data"

    XY = numpy.loadtxt(fileName, delimiter=",", skiprows=1)
    inds = list(set(range(XY.shape[1])) - set([5, 6]))
    X = XY[:, inds]

    y1 = XY[:, 5]
    y2 = XY[:, 6]

    #We don't keep whole collections of patients
    split = 0.5
    idx = Sampling.shuffleSplit(numRealisations, X.shape[0], split)

    outputDir = PathDefaults.getDataDir() + "modelPenalisation/regression/" + name + "-motor/"
    preprocessSave(X, y1, outputDir, idx)

    outputDir = PathDefaults.getDataDir() + "modelPenalisation/regression/" + name + "-total/"
    preprocessSave(X, y2, outputDir, idx)
def profileSvd2(self):
    dataDir = PathDefaults.getDataDir() + "erasm/contacts/"
    trainFilename = dataDir + "contacts_train"

    trainX = scipy.io.mmread(trainFilename)
    trainX = scipy.sparse.csc_matrix(trainX, dtype=numpy.int8)

    k = 500
    U, s, V = RandomisedSVD.svd(trainX, k)

    print(s)
    print("All done")
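#--- A numpy-only sketch of the randomised SVD (Halko et al.) that
#--- RandomisedSVD.svd presumably implements: random projection, a few power
#--- iterations, then an exact SVD of the small projected matrix.
import numpy

def randomisedSvdSketch(A, k, q=2, p=10):
    Omega = numpy.random.randn(A.shape[1], k + p)   #random test matrix
    Y = A.dot(Omega)
    for _ in range(q):                              #power iterations sharpen spectral decay
        Y = A.dot(A.T.dot(Y))
    Q, _ = numpy.linalg.qr(Y)                       #orthonormal basis for the range of A
    B = Q.T.dot(A)                                  #small (k+p) x n matrix
    Uhat, s, Vt = numpy.linalg.svd(B, full_matrices=False)
    return Q.dot(Uhat)[:, :k], s[:k], Vt[:k, :]

A = numpy.random.rand(200, 100)
U, s, Vt = randomisedSvdSketch(A, 10)
print(s[:5])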
def mendeley2(minNnzRows=10, minNnzCols=2, quantile=90, dataset="Document"):
    authorAuthorFileName = PathDefaults.getDataDir() + "reference/author" + dataset + "Matrix.mtx"
    logging.debug("Reading file: " + authorAuthorFileName)
    X = sppy.io.mmread(authorAuthorFileName, storagetype="row")
    logging.debug("Raw non-zero elements: " + str(X.nnz) + " shape: " + str(X.shape))

    X = SparseUtils.pruneMatrixRowAndCols(X, minNnzRows, minNnzCols)

    logging.debug("Read file: " + authorAuthorFileName)
    logging.debug("Non-zero elements: " + str(X.nnz) + " shape: " + str(X.shape))

    return X
def testPredict2(self):
    #Test on Gauss2D dataset
    dataDir = PathDefaults.getDataDir()

    fileName = dataDir + "Gauss2D_learn.csv"
    XY = numpy.loadtxt(fileName, skiprows=1, usecols=(1, 2, 3), delimiter=",")
    X = XY[:, 0:2]
    y = XY[:, 2]*2 - 1

    fileName = dataDir + "Gauss2D_test.csv"
    testXY = numpy.loadtxt(fileName, skiprows=1, usecols=(1, 2, 3), delimiter=",")
    testX = testXY[:, 0:2]
    testY = testXY[:, 2]*2 - 1

    #X = Standardiser().standardiseArray(X)
    #testX = Standardiser().standardiseArray(testX)

    maxDepths = range(3, 10)
    trainAucs = numpy.array([0.7194734, 0.7284824, 0.7332185, 0.7348198, 0.7366152, 0.7367508, 0.7367508, 0.7367508])
    testAucs = numpy.array([0.6789078, 0.6844632, 0.6867918, 0.6873420, 0.6874820, 0.6874400, 0.6874400, 0.6874400])

    i = 0
    #The results are approximately the same, but not exactly
    for maxDepth in maxDepths:
        treeRank = TreeRank(DecisionTree)
        treeRank.setMaxDepth(maxDepth)
        treeRank.learnModel(X, y)

        trainScores = treeRank.predict(X)
        testScores = treeRank.predict(testX)

        #print(Evaluator.auc(trainScores, y), Evaluator.auc(testScores, testY))
        #self.assertAlmostEquals(Evaluator.auc(trainScores, y), trainAucs[i], 2)
        #self.assertAlmostEquals(Evaluator.auc(testScores, testY), testAucs[i], 1)
        i += 1

    #Compare the tree to that of the R version
    tree = treeRank.getTree()
def profilePropackSvd(self):
    dataDir = PathDefaults.getDataDir() + "erasm/contacts/"
    trainFilename = dataDir + "contacts_train"

    trainX = scipy.io.mmread(trainFilename)
    trainX = scipy.sparse.csc_matrix(trainX, dtype=numpy.int8)

    k = 500
    U, s, V = SparseUtils.svdPropack(trainX, k, kmax=k*5)

    print(s)
    #Memory consumption is dependent on kmax
    print("All done")
def testBayesError(self):
    dataDir = PathDefaults.getDataDir() + "modelPenalisation/toy/"
    data = numpy.load(dataDir + "toyData.npz")
    gridPoints, X, y, pdfX, pdfY1X, pdfYminus1X = data["arr_0"], data["arr_1"], data["arr_2"], data["arr_3"], data["arr_4"], data["arr_5"]

    sampleSize = 100
    trainX, trainY = X[0:sampleSize, :], y[0:sampleSize]
    testX, testY = X[sampleSize:, :], y[sampleSize:]

    #We form a test set from the grid points
    gridX = numpy.zeros((gridPoints.shape[0]**2, 2))
    for m in range(gridPoints.shape[0]):
        gridX[m*gridPoints.shape[0]:(m+1)*gridPoints.shape[0], 0] = gridPoints
        gridX[m*gridPoints.shape[0]:(m+1)*gridPoints.shape[0], 1] = gridPoints[m]

    Cs = 2**numpy.arange(-5, 5, dtype=numpy.float)
    gammas = 2**numpy.arange(-5, 5, dtype=numpy.float)

    bestError = 1

    for C in Cs:
        for gamma in gammas:
            svm = LibSVM(kernel="gaussian", C=C, kernelParam=gamma)
            svm.learnModel(trainX, trainY)
            predY, decisionsY = svm.predict(gridX, True)
            decisionGrid = numpy.reshape(decisionsY, (gridPoints.shape[0], gridPoints.shape[0]), order="F")
            error = ModelSelectUtils.bayesError(gridPoints, decisionGrid, pdfX, pdfY1X, pdfYminus1X)

            predY, decisionsY = svm.predict(testX, True)
            error2 = Evaluator.binaryError(testY, predY)
            print(error, error2)

            #Keep the parameters with the smallest Bayes error
            if error < bestError:
                bestError = error
                bestC = C
                bestGamma = gamma

    svm = LibSVM(kernel="gaussian", C=bestC, kernelParam=bestGamma)
    svm.learnModel(trainX, trainY)
    predY, decisionsY = svm.predict(gridX, True)
    decisionGrid = numpy.reshape(decisionsY, (gridPoints.shape[0], gridPoints.shape[0]), order="F")

    plt.figure(0)
    plt.contourf(gridPoints, gridPoints, decisionGrid, 100)
    plt.colorbar()

    plt.figure(1)
    plt.scatter(X[y==1, 0], X[y==1, 1], c='r', label="+1")
    plt.scatter(X[y==-1, 0], X[y==-1, 1], c='b', label="-1")
    plt.legend()
    plt.show()
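#--- The double loop above is a plain exhaustive grid search over (C, gamma).
#--- A compact equivalent with scikit-learn on synthetic data, for comparison
#--- only (LibSVM/ModelSelectUtils above are project classes, not sklearn).
import numpy
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

rng = numpy.random.RandomState(21)
Xs = rng.randn(200, 2)
ys = numpy.sign(Xs[:, 0] + 0.5*rng.randn(200))

paramGrid = {"C": 2.0**numpy.arange(-5, 5), "gamma": 2.0**numpy.arange(-5, 5)}
search = GridSearchCV(SVC(kernel="rbf"), paramGrid, cv=3).fit(Xs, ys)
print(search.best_params_)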
def movieLens(minNnzRows=10, minNnzCols=2, quantile=90):
    matrixFileName = PathDefaults.getDataDir() + "movielens/ml-100k/u.data"
    data = numpy.loadtxt(matrixFileName)

    X = sppy.csarray((numpy.max(data[:, 0]), numpy.max(data[:, 1])), storagetype="row", dtype=numpy.int)
    X.put(numpy.array(data[:, 2] > 3, numpy.int), numpy.array(data[:, 0]-1, numpy.int32), numpy.array(data[:, 1]-1, numpy.int32), init=True)
    #X = SparseUtilsCython.centerRowsCsarray(X)
    #X[X.nonzero()] = X.values() > 0
    X.prune()

    #maxNnz = numpy.percentile(X.sum(0), quantile)
    #X = SparseUtils.pruneMatrixCols(X, minNnz=minNnzCols, maxNnz=maxNnz)
    X = SparseUtils.pruneMatrixRowAndCols(X, minNnzRows, minNnzCols)

    logging.debug("Read file: " + matrixFileName)
    logging.debug("Non zero elements: " + str(X.nnz) + " shape: " + str(X.shape))

    return X
def profileArpackSvd(self):
    dataDir = PathDefaults.getDataDir() + "erasm/contacts/"
    trainFilename = dataDir + "contacts_train"

    trainX = scipy.io.mmread(trainFilename)
    trainX = scipy.sparse.csc_matrix(trainX, dtype=numpy.float32)
    print(trainX.dtype.char, trainX.dtype)

    k = 500
    U, s, V = SparseUtils.svdArpack(trainX, k, kmax=k*5)

    print(s)
    #Memory consumption is dependent on kmax and less than PROPACK
    print("All done")
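#--- scipy's own ARPACK-backed svds computes the same kind of truncated SVD;
#--- a tiny self-contained run on a random sparse matrix for scale comparison.
import numpy
import scipy.sparse
import scipy.sparse.linalg

Xs = scipy.sparse.random(1000, 500, density=0.01, format="csc", dtype=numpy.float64)
U, s, Vt = scipy.sparse.linalg.svds(Xs, k=10)
print(numpy.sort(s)[::-1])   #svds returns singular values in ascending order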
def cluster():
    k1 = 20   # numCluster to learn
    k2 = 40   # numEigenVector kept
    dir = PathDefaults.getDataDir() + "cluster/"
    graphIterator = getBemolGraphIterator(dir)

    #===========================================
    # cluster
    print("compute clusters")
    clusterer = IterativeSpectralClustering(k1, k2)
    clustersList = clusterer.clusterFromIterator(graphIterator, True)

    for i in range(len(clustersList)):
        clusters = clustersList[i]
        print(clusters)
def testRunSimulation(self):
    egoFileName = PathDefaults.getDataDir() + "infoDiffusion/EgoData.csv"
    alterFileName = PathDefaults.getDataDir() + "infoDiffusion/AlterData.csv"
    numVertices = 1000
    infoProb = 0.1

    p = 0.1
    neighbours = 10
    generator = SmallWorldGenerator(p, neighbours)
    graph = SparseGraph(VertexList(numVertices, 0))
    graph = generator.generate(graph)

    CVal = 1.0
    kernel = "linear"
    kernelParamVal = 0.0
    errorCost = 0.5
    folds = 6
    sampleSize = 1000
    maxIterations = 5

    self.svmEgoSimulator.trainClassifier(CVal, kernel, kernelParamVal, errorCost, sampleSize)
    self.svmEgoSimulator.generateRandomGraph(egoFileName, alterFileName, infoProb, graph)
    self.svmEgoSimulator.runSimulation(maxIterations)
def readHIVGraph(self, undirected=True, indicators=True):
    """
    We will use pacdate5389.csv which contains the data of infection. The
    undirected parameter instructs whether to create an undirected graph.
    If indicators is true then categorical variables are turned into
    collections of indicator ones.
    """
    converters = {1: CsvConverters.dateConv, 3: CsvConverters.dateConv, 5: CsvConverters.detectionConv, 6: CsvConverters.provConv, 8: CsvConverters.dateConv}
    converters[9] = CsvConverters.genderConv
    converters[10] = CsvConverters.orientConv
    converters[11] = CsvConverters.numContactsConv
    converters[12] = CsvConverters.numContactsConv
    converters[13] = CsvConverters.numContactsConv

    def nanProcessor(X):
        #Impute missing values with the mean of the observed entries in each column
        means = numpy.zeros(X.shape[1])
        for i in range(X.shape[1]):
            if numpy.sum(numpy.isnan(X[:, i])) > 0:
                logging.info("No. missing values in " + str(i) + "th column: " + str(numpy.sum(numpy.isnan(X[:, i]))))
            means[i] = numpy.mean(X[:, i][numpy.isnan(X[:, i]) == False])
            X[numpy.isnan(X[:, i]), i] = means[i]
        return X

    idIndex = 0
    featureIndices = converters.keys()
    multiGraphCsvReader = MultiGraphCsvReader(idIndex, featureIndices, converters, nanProcessor)

    dataDir = PathDefaults.getDataDir()
    vertexFileName = dataDir + "HIV/alldata.csv"
    edgeFileNames = [dataDir + "HIV/grafdet2.csv", dataDir + "HIV/infect2.csv"]

    sparseMultiGraph = multiGraphCsvReader.readGraph(vertexFileName, edgeFileNames, undirected, delimiter="\t")

    #For learning purposes we will convert categorical variables into a set of
    #indicator features
    if indicators:
        logging.info("Converting categorical features")
        vList = sparseMultiGraph.getVertexList()
        V = vList.getVertices(list(range(vList.getNumVertices())))
        catInds = [2, 3]
        generator = FeatureGenerator()
        V = generator.categoricalToIndicator(V, catInds)
        vList.replaceVertices(V)

    logging.info("Created " + str(sparseMultiGraph.getNumVertices()) + " examples with " + str(sparseMultiGraph.getVertexList().getNumFeatures()) + " features")

    return sparseMultiGraph
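#--- Column-mean imputation, as done by nanProcessor above, in isolation:
import numpy

Xs = numpy.array([[1.0, numpy.nan], [3.0, 4.0], [numpy.nan, 8.0]])
for i in range(Xs.shape[1]):
    col = Xs[:, i]                         #a view, so writes modify Xs
    if numpy.isnan(col).any():
        colMean = numpy.nanmean(col)       #mean over the observed entries only
        col[numpy.isnan(col)] = colMean
print(Xs)   #NaNs replaced by the column means 2.0 and 6.0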
def mendeley(minNnzRows=10, minNnzCols=2, quantile=90, dataset="Doc", sigma=0.05, indicator=True):
    authorAuthorFileName = PathDefaults.getDataDir() + "reference/authorAuthor" + dataset + "Matrix_sigma=" + str(sigma) + ".mtx"
    logging.debug("Reading file: " + authorAuthorFileName)
    X = sppy.io.mmread(authorAuthorFileName, storagetype="row")

    if indicator:
        X[X.nonzero()] = 1
        X.prune()

    logging.debug("Raw non-zero elements: " + str(X.nnz) + " shape: " + str(X.shape))

    X = SparseUtils.pruneMatrixRowAndCols(X, minNnzRows, minNnzCols)

    logging.debug("Read file: " + authorAuthorFileName)
    logging.debug("Non-zero elements: " + str(X.nnz) + " shape: " + str(X.shape))

    return X
def saveResults(orderedItems, scores, dataset, similaritiesFileName, contactsFilename, interestsFilename, minScore, minContacts, minAcceptableSims):
    #Now let's write out the similarities file
    logging.debug("Generating recommendations for authors")
    authorIndexerFilename = PathDefaults.getDataDir() + "reference/authorIndexer" + dataset + ".pkl"
    authorIndexerFile = open(authorIndexerFilename)
    authorIndexer = pickle.load(authorIndexerFile)
    authorIndexerFile.close()
    logging.debug("Loaded author indexer")

    reverseIndexer = authorIndexer.reverseTranslateDict()

    outputFile = open(similaritiesFileName, "w")
    csvFile = csv.writer(outputFile, delimiter='\t')

    for i in range(orderedItems.shape[0]):
        if i % 10000 == 0:
            logging.debug("Iteration: " + str(i))

        row = [reverseIndexer[i]]

        #Check the author isn't recommended to him/herself
        for j in range(orderedItems.shape[1]):
            if orderedItems[i, j] != i:
                row = [reverseIndexer[i], reverseIndexer[orderedItems[i, j]], scores[i, j]]
                csvFile.writerow(row)

    outputFile.close()
    logging.debug("Wrote recommendations to " + similaritiesFileName)

    #Figure out how good the recommendations are on the contacts network
    contacts = read_contacts(contactsFilename)
    research_interests = read_interests(interestsFilename)
    sims = read_similar_authors(similaritiesFileName, minScore)

    logging.debug('Evaluating against contacts...')
    meanStatsContacts = evaluate_against_contacts(sims, contacts, minContacts)

    logging.debug('Evaluating against research interests...')
    meanStatsInterests = evaluate_against_research_interests(sims, research_interests, minAcceptableSims)

    logging.debug("Mean stats on contacts: " + str(meanStatsContacts))
    logging.debug("Mean stats on interests: " + str(meanStatsInterests))

    return meanStatsContacts, meanStatsInterests
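#--- The recommendation-writing loop in miniature: top-k items per user to a
#--- tab-separated buffer, skipping self-recommendations; indices are made up.
import csv
import io
import numpy

orderedItems = numpy.array([[0, 2, 1], [1, 0, 2]])   #row i: items ranked for user i
scores = numpy.array([[0.9, 0.5, 0.4], [0.8, 0.6, 0.1]])

buf = io.StringIO()
writer = csv.writer(buf, delimiter="\t")
for i in range(orderedItems.shape[0]):
    for j in range(orderedItems.shape[1]):
        if orderedItems[i, j] != i:                  #don't recommend a user to themselves
            writer.writerow([i, orderedItems[i, j], scores[i, j]])
print(buf.getvalue())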
def testPredict2(self):
    #Test on Gauss2D dataset
    dataDir = PathDefaults.getDataDir()

    fileName = dataDir + "Gauss2D_learn.csv"
    XY = numpy.loadtxt(fileName, skiprows=1, usecols=(1, 2, 3), delimiter=",")
    X = XY[:, 0:2]
    y = XY[:, 2]
    y = y*2 - 1

    fileName = dataDir + "Gauss2D_test.csv"
    testXY = numpy.loadtxt(fileName, skiprows=1, usecols=(1, 2, 3), delimiter=",")
    testX = testXY[:, 0:2]
    testY = testXY[:, 2]
    testY = testY*2 - 1

    X = Standardiser().standardiseArray(X)
    testX = Standardiser().standardiseArray(testX)

    numTrees = 5
    minSplit = 50
    maxDepths = range(3, 10)

    trainAucs = numpy.array([0.7252582, 0.7323278, 0.7350289, 0.7372529, 0.7399985, 0.7382176, 0.7395104, 0.7386347])
    testAucs = numpy.array([0.6806122, 0.6851614, 0.6886183, 0.6904147, 0.6897266, 0.6874600, 0.6875980, 0.6878801])

    i = 0
    #The results are approximately the same, but not exactly
    for maxDepth in maxDepths:
        treeRankForest = TreeRankForest(self.leafRanklearner)
        treeRankForest.setMaxDepth(maxDepth)
        treeRankForest.setMinSplit(minSplit)
        treeRankForest.setNumTrees(numTrees)
        treeRankForest.learnModel(X, y)

        trainScores = treeRankForest.predict(X)
        testScores = treeRankForest.predict(testX)
        print(Evaluator.auc(trainScores, y), Evaluator.auc(testScores, testY))

        self.assertAlmostEquals(Evaluator.auc(trainScores, y), trainAucs[i], 1)
        self.assertAlmostEquals(Evaluator.auc(testScores, testY), testAucs[i], 1)
        i += 1
def testPredict2(self):
    #We play around with parameters to maximise AUC on the IGF1_0-Haar data
    dataDir = PathDefaults.getDataDir()
    fileName = dataDir + "IGF1_0-Haar.npy"
    XY = numpy.load(fileName)
    X = XY[:, 0:XY.shape[1]-1]
    y = XY[:, XY.shape[1]-1].ravel()

    weight = numpy.bincount(numpy.array(y, numpy.int))[0]/float(y.shape[0])
    #weight = 0.5
    #weight = 0.9

    folds = 3
    decisionTree = DecisionTree()
    decisionTree.setWeight(weight)
    decisionTree.setMaxDepth(50)
    #decisionTree.setMinSplit(100)
    mean, var = decisionTree.evaluateCv(X, y, folds, Evaluator.auc)

    logging.debug("AUC = " + str(mean))
    logging.debug("Var = " + str(var))
def testPredict2(self):
    #We play around with parameters to maximise AUC on the IGF1_0-Haar data
    dataDir = PathDefaults.getDataDir()
    fileName = dataDir + "IGF1_0-Haar.npy"
    XY = numpy.load(fileName)
    X = XY[:, 0:XY.shape[1]-1]
    y = XY[:, XY.shape[1]-1].ravel()

    weight = numpy.bincount(numpy.array(y, numpy.int))[0]/float(y.shape[0])
    #weight = 0.5
    #weight = 0.9

    folds = 3
    randomForest = RandomForest()
    randomForest.setWeight(weight)
    randomForest.setMaxDepth(50)
    #randomForest.setMinSplit(100)
    mean, var = randomForest.evaluateCv(X, y, folds, Evaluator.auc)

    logging.debug("AUC = " + str(mean))
    logging.debug("Var = " + str(var))
def main():
    import sys
    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

    matrixFileName = PathDefaults.getDataDir() + "movielens/ml-100k/u.data"
    data = numpy.loadtxt(matrixFileName)
    X = sppy.csarray((numpy.max(data[:, 0]), numpy.max(data[:, 1])), storagetype="row")
    X[data[:, 0] - 1, data[:, 1] - 1] = numpy.array(data[:, 2] > 3, numpy.int)

    logging.debug("Read file: " + matrixFileName)
    logging.debug("Shape of data: " + str(X.shape))
    logging.debug("Number of non zeros " + str(X.nnz))

    u = 0.1
    w = 1 - u
    (m, n) = X.shape

    validationSize = 5
    trainTestXs = Sampling.shuffleSplitRows(X, 1, validationSize)
    trainX, testX = trainTestXs[0]
    trainX = trainX.toScipyCsr()

    learner = CLiMF(k=20, lmbda=0.001, gamma=0.0001)
    learner.learnModel(trainX)
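#--- A row-wise holdout of the kind Sampling.shuffleSplitRows appears to
#--- perform (semantics assumed from its usage above): move a few observed
#--- entries per row into a test matrix. numpy/scipy stand-in on random data.
import numpy
import scipy.sparse

rng = numpy.random.RandomState(21)
Xs = scipy.sparse.random(50, 30, density=0.3, random_state=rng, format="lil")
Xs[Xs.nonzero()] = 1.0                     #binarise the observed entries

testSize = 2
trainX = Xs.copy()
testX = scipy.sparse.lil_matrix(Xs.shape)
for i in range(Xs.shape[0]):
    items = numpy.array(Xs.rows[i])        #column indices observed in row i
    if items.shape[0] > testSize:
        held = rng.choice(items, testSize, replace=False)
        trainX[i, held] = 0                #lil drops entries set to zero
        testX[i, held] = 1
print(trainX.nnz, testX.nnz)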
        logging.debug('process id:' + str(os.getpid()))
        self.saveResults(self.leafRankGenerators, True)

    def run2(self):
        logging.debug('module name:' + __name__)
        logging.debug('parent process:' + str(os.getppid()))
        logging.debug('process id:' + str(os.getpid()))
        self.saveResults(self.funcLeafRankGenerators, False)

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
logging.debug("Running from machine " + str(gethostname()))
numpy.random.seed(21)

dataDir = PathDefaults.getDataDir() + "metabolomic/"
X, X2, Xs, XOpls, YList, ages, df = MetabolomicsUtils.loadData()

waveletStr = 'db4'
mode = "cpd"
level = 10
XwDb4 = MetabolomicsUtils.getWaveletFeatures(X, 'db4', level, mode)
XwDb8 = MetabolomicsUtils.getWaveletFeatures(X, 'db8', level, mode)
XwHaar = MetabolomicsUtils.getWaveletFeatures(X, 'haar', level, mode)

dataList = []
dataList.extend([(XwDb4, "db4")])

lock = multiprocessing.Lock()
numpy.random.seed(datetime.datetime.now().microsecond)