def profileModelSelection(self):
    """Profile a full model-selection run on the 100k-paper DBLP extract.

    NOTE: the profiled statement is passed as a string and resolved through
    locals(), so the local variable below must keep the name ``dataset``.
    """
    dataset = ArnetMinerDataset(runLSI=False)
    # Force regeneration of every cached artefact so the profile covers
    # the complete pipeline rather than cached reads.
    dataset.overwriteModel = True
    dataset.overwriteVectoriser = True
    dataset.overwrite = True
    dataset.dataFilename = dataset.dataDir + "DBLP-citation-100000.txt"
    ProfileUtils.profile('dataset.modelSelection()', globals(), locals())
def profileComputeLDA(self):
    """Profile LDA computation for the "Boosting" field on the 100k extract.

    NOTE: the profiled statement is a string evaluated against locals(),
    so the local variable must keep the name ``dataset``.
    """
    field = "Boosting"
    dataset = ArnetMinerDataset(field)
    # Rebuild all cached artefacts so the profile measures real work.
    dataset.overwriteModel = True
    dataset.overwriteVectoriser = True
    dataset.overwrite = True
    # Limit the expert pool and fix the topic count for this profile run.
    dataset.maxRelevantAuthors = 100
    dataset.k = 200
    dataset.dataFilename = dataset.dataDir + "DBLP-citation-100000.txt"
    ProfileUtils.profile('dataset.computeLDA()', globals(), locals())
def testFindSimilarDocumentsLDA(self):
    """Exercise findSimilarDocumentsLDA on a small and a medium dataset.

    NOTE(review): no assertions are made on the results — this test only
    checks that the LDA pipeline runs without raising.
    """
    self.dataset = ArnetMinerDataset()
    self.dataset.dataFilename = self.dataset.dataDir + "DBLP-citation-1000.txt"
    # Regenerate model/vectoriser so stale caches cannot mask failures.
    self.dataset.overwrite = True
    self.dataset.overwriteModel = True
    self.dataset.overwriteVectoriser = True
    self.dataset.k = 20
    #Check document is correct as well as authors
    self.dataset.findSimilarDocumentsLDA(self.field)

    #Let's test order of ranking on larger dataset
    print("Running on 10000 dataset using LDA")
    biggerDataset = ArnetMinerDataset()
    biggerDataset.minDf = 10**-5
    biggerDataset.dataFilename = biggerDataset.dataDir + "DBLP-citation-10000.txt"
    biggerDataset.vectoriseDocuments()
    relevantExperts = biggerDataset.findSimilarDocumentsLDA("Neural Networks")
def testFindSimilarDocuments(self):
    """Check findSimilarDocumentsLSI returns the expected ranked experts.

    Runs once on the tiny test extract (single expected author) and once on
    the 10000-paper extract (full expected ranking).
    """
    field = "Object"
    self.dataset = ArnetMinerDataset()
    self.dataset.dataFilename = self.dataset.dataDir + "DBLP-citation-test.txt"
    #Check document is correct as well as authors
    self.dataset.vectoriseDocuments()
    relevantExperts = self.dataset.findSimilarDocumentsLSI(field)
    # FIX: assertEquals is a deprecated alias removed in Python 3.12;
    # use assertEqual instead.
    # NOTE(review): 'Jos\xc3\xa9' looks like UTF-8 bytes left over from a
    # Python 2 port ("José") — confirm against the dataset's actual output
    # before changing the literal.
    self.assertEqual(['Jos\xc3\xa9 A. Blakeley'], relevantExperts)

    #Let's test order of ranking on larger dataset
    print("Running on 10000 dataset")
    dataset = ArnetMinerDataset()
    dataset.minDf = 10**-6
    dataset.dataFilename = dataset.dataDir + "DBLP-citation-10000.txt"
    dataset.vectoriseDocuments()
    relevantExperts = dataset.findSimilarDocumentsLSI("Neural Networks")
    self.assertEqual(['Christopher M. Bishop', 'Michael I. Jordan', 'Fred L. Kitchens', 'Ai Cheo', 'Cesare Alippi', 'Giovanni Vanini', 'C. C. Taylor', 'David J. Spiegelhalter', 'Donald Michie'], relevantExperts)
"""Script driver: parse options, build the dataset vectoriser and load the
binarised document-term matrix."""

numpy.random.seed(21)

parser = argparse.ArgumentParser(description='Run reputation evaluation experiments')
parser.add_argument("-r", "--runLDA", action="store_true", help="Run Latent Dirchlet Allocation")
args = parser.parse_args()

# Experiment parameters.
averagePrecisionN = 50
similarityCutoff = 0.30
ns = numpy.arange(5, 105, 5)

# LSI is the default model; -r/--runLDA switches to LDA.
runLSI = not args.runLDA
dataset = ArnetMinerDataset(runLSI=runLSI)
# Alternative extracts: DBLP-citation-100000.txt, DBLP-citation-7000000.txt,
# DBLP-citation-Feb21.txt
dataset.dataFilename = dataset.dataDir + "DBLP-citation-1000000.txt"

# Rebuild graph, model and vectoriser from scratch.
dataset.overwriteGraph = True
dataset.overwriteModel = True
dataset.overwriteVectoriser = True

dataset.vectoriseDocuments()
dataset.loadVectoriser()

# Load the document-term matrix and binarise its stored entries.
X = scipy.io.mmread(dataset.docTermMatrixFilename + ".mtx").tocsc()
X.data[:] = 1
print(numpy.max(X.data), numpy.min(X.data))