def testFindSimilarDocumentsLDA(self):
    """Smoke-test the LDA document search on the 1000- and 10000-entry files.

    No assertions are made: the test passes as long as both
    ``findSimilarDocumentsLDA`` calls complete without raising.
    """
    # Replace the fixture dataset with one reading the 1000-entry file.
    smallDataset = ArnetMinerDataset()
    self.dataset = smallDataset
    smallDataset.dataFilename = smallDataset.dataDir + "DBLP-citation-1000.txt"
    for flagName in ("overwrite", "overwriteModel", "overwriteVectoriser"):
        setattr(smallDataset, flagName, True)
    smallDataset.k = 20

    # Query with the field configured on the test case.
    smallDataset.findSimilarDocumentsLDA(self.field)

    # Repeat on the larger file, vectorising the documents first.
    print("Running on 10000 dataset using LDA")
    largeDataset = ArnetMinerDataset()
    largeDataset.minDf = 10**-5
    largeDataset.dataFilename = largeDataset.dataDir + "DBLP-citation-10000.txt"
    largeDataset.vectoriseDocuments()
    rankedExperts = largeDataset.findSimilarDocumentsLDA("Neural Networks")
def testFindSimilarDocuments(self):
    """Check the experts returned by the LSI document search.

    First queries the small test file for the field "Object" and expects a
    single expert, then queries the 10000-entry file for "Neural Networks"
    and expects the experts in a fixed ranking order.
    """
    field = "Object"
    self.dataset = ArnetMinerDataset()
    self.dataset.dataFilename = self.dataset.dataDir + "DBLP-citation-test.txt"

    # Check document is correct as well as authors
    self.dataset.vectoriseDocuments()
    relevantExperts = self.dataset.findSimilarDocumentsLSI(field)
    # NOTE(review): the expected name is written as escaped UTF-8 bytes
    # inside a plain string literal; confirm it matches what the dataset
    # returns on the interpreter version in use.
    # assertEqual replaces the deprecated assertEquals alias, which newer
    # Python releases no longer provide.
    self.assertEqual(['Jos\xc3\xa9 A. Blakeley'], relevantExperts)

    # Let's test order of ranking on larger dataset
    print("Running on 10000 dataset")
    dataset = ArnetMinerDataset()
    dataset.minDf = 10**-6
    dataset.dataFilename = dataset.dataDir + "DBLP-citation-10000.txt"
    dataset.vectoriseDocuments()
    relevantExperts = dataset.findSimilarDocumentsLSI("Neural Networks")

    expectedExperts = [
        'Christopher M. Bishop',
        'Michael I. Jordan',
        'Fred L. Kitchens',
        'Ai Cheo',
        'Cesare Alippi',
        'Giovanni Vanini',
        'C. C. Taylor',
        'David J. Spiegelhalter',
        'Donald Michie',
    ]
    self.assertEqual(expectedExperts, relevantExperts)
similarityCutoff = 0.30
# Candidate sizes 5, 10, ..., 100.
ns = numpy.arange(5, 105, 5)
# LSI is used unless the LDA command-line flag is set.
runLSI = not args.runLDA

dataset = ArnetMinerDataset(runLSI=runLSI)
# Alternative input files; only the uncommented assignment takes effect.
#dataset.dataFilename = dataset.dataDir + "DBLP-citation-100000.txt"
#dataset.dataFilename = dataset.dataDir + "DBLP-citation-1000000.txt"
dataset.dataFilename = dataset.dataDir + "DBLP-citation-1000000.txt"
#dataset.dataFilename = dataset.dataDir + "DBLP-citation-7000000.txt"
#dataset.dataFilename = dataset.dataDir + "DBLP-citation-Feb21.txt"
dataset.overwriteGraph = True
dataset.overwriteModel = True
dataset.overwriteVectoriser = True

dataset.vectoriseDocuments()
dataset.loadVectoriser()

# Load the stored document-term matrix and binarise it: every stored entry
# becomes 1, so the sums below count stored entries rather than adding values.
X = scipy.io.mmread(dataset.docTermMatrixFilename + ".mtx")
X = X.tocsc()
X.data[:] = 1
print(numpy.max(X.data), numpy.min(X.data))

# numpy.int was only an alias of the builtin int and has been removed from
# recent NumPy releases; the builtin gives the same dtype.
# NOTE: sum(0) collapses axis 0 and so yields one total per column, while
# sum(1) yields one total per row -- the reverse of what the variable names
# suggest. The names are kept because code outside this fragment may use them.
rowSums = numpy.array(X.sum(0), int).flatten()
colSums = numpy.array(X.sum(1), int).flatten()
sortedIndsColSums = numpy.argsort(rowSums)
# NOTE(review): if the vectoriser is a scikit-learn one, newer releases expose
# get_feature_names_out() instead -- confirm against the installed version.
featureNames = dataset.vectoriser.get_feature_names()
class ArnetMinerDatasetTest(unittest.TestCase):
    """Tests for ArnetMinerDataset run against the bundled DBLP citation files."""

    def setUp(self):
        """Seed NumPy and build a dataset on the small test file.

        The dataset is created with ``self.field`` as an additional field and
        has all three overwrite flags switched on.
        """
        numpy.random.seed(22)
        numpy.set_printoptions(suppress=True, precision=3)
        #logging.basicConfig(stream=sys.stdout, level=logging.ERROR)

        self.field = "Database"
        self.dataset = ArnetMinerDataset(additionalFields=[self.field])
        self.dataset.dataFilename = self.dataset.dataDir + "DBLP-citation-test.txt"
        self.dataset.overwrite = True
        self.dataset.overwriteModel = True
        self.dataset.overwriteVectoriser = True

    def testVectoriseDocuments(self):
        """Smoke test: vectorising the test file must not raise."""
        # Check document is correct as well as authors
        self.dataset.vectoriseDocuments()

    def testFindSimilarDocuments(self):
        """Check the experts returned by the LSI document search.

        Queries the small test file for "Object", then the 10000-entry file
        for "Neural Networks", comparing against fixed expected lists.
        """
        field = "Object"
        self.dataset = ArnetMinerDataset()
        self.dataset.dataFilename = self.dataset.dataDir + "DBLP-citation-test.txt"

        # Check document is correct as well as authors
        self.dataset.vectoriseDocuments()
        relevantExperts = self.dataset.findSimilarDocumentsLSI(field)
        # NOTE(review): the expected name is written as escaped UTF-8 bytes
        # inside a plain string literal; confirm it matches what the dataset
        # returns on the interpreter version in use.
        self.assertEqual(['Jos\xc3\xa9 A. Blakeley'], relevantExperts)

        # Let's test order of ranking on larger dataset
        print("Running on 10000 dataset")
        dataset = ArnetMinerDataset()
        dataset.minDf = 10**-6
        dataset.dataFilename = dataset.dataDir + "DBLP-citation-10000.txt"
        dataset.vectoriseDocuments()
        relevantExperts = dataset.findSimilarDocumentsLSI("Neural Networks")

        expectedExperts = [
            'Christopher M. Bishop',
            'Michael I. Jordan',
            'Fred L. Kitchens',
            'Ai Cheo',
            'Cesare Alippi',
            'Giovanni Vanini',
            'C. C. Taylor',
            'David J. Spiegelhalter',
            'Donald Michie',
        ]
        self.assertEqual(expectedExperts, relevantExperts)

    def testFindCoauthors(self):
        """Smoke test: build the coauthor graph from the LSI experts."""
        # Check document is correct as well as authors
        self.dataset.vectoriseDocuments()
        relevantExperts = self.dataset.findSimilarDocumentsLSI(self.field)
        self.dataset.coauthorsGraph(self.field, relevantExperts)

    def testCoauthorsGraphFromAuthors(self):
        """Check edges, edge weights and indexer size for two seed authors."""
        relevantExperts = {"Yuri Breitbart", "Hector Garcia-Molina"}

        graph, authorIndexer = self.dataset.coauthorsGraphFromAuthors(relevantExperts)

        self.assertEqual(graph.get_edgelist(), [(0, 1), (0, 2), (0, 4), (1, 2), (1, 3)])
        self.assertEqual(graph.es["weight"], [1, 1, 1, 1, 1])
        self.assertEqual(graph.es["invWeight"], [1, 1, 1, 1, 1])
        self.assertEqual(len(authorIndexer.getIdDict()), 5)

    def testMatchExperts(self):
        """Check that matchExperts pairs the found experts with the known set."""
        # TODO: left unspecified by the original author.
        self.dataset.vectoriseDocuments()
        relevantExperts = self.dataset.findSimilarDocumentsLSI("DBMS")
        expertsSet = self.dataset.expertsDict[self.field]
        expertMatches = self.dataset.matchExperts(relevantExperts, expertsSet)

        self.assertEqual(expertMatches, ['Nathan Goodman'])
        self.assertEqual(expertsSet, {'Hector Garcia-Molina', 'Yuri Breitbart', 'Nathan Goodman'})

    def testExpertsFromDocSimilarities(self):
        """Check the expert order derived from per-document similarities."""
        self.dataset.authorList = [["Joe Bloggs", "Alfred Nobel"], ["Ian Hislop"], ["Alfred Nobel", "Ian Hislop"]]
        similarities = numpy.array([0.4, 0.5, 0.8])

        experts = self.dataset.expertsFromDocSimilarities(similarities)

        self.assertEqual(experts, ['Ian Hislop', 'Alfred Nobel', 'Joe Bloggs'])

    def testFindSimilarDocumentsLDA(self):
        """Smoke-test the LDA document search on the 1000- and 10000-entry files.

        No assertion is active here; the ranking check below is disabled.
        """
        self.dataset = ArnetMinerDataset()
        self.dataset.dataFilename = self.dataset.dataDir + "DBLP-citation-1000.txt"
        self.dataset.overwrite = True
        self.dataset.overwriteModel = True
        self.dataset.overwriteVectoriser = True
        self.dataset.k = 20

        # Check document is correct as well as authors
        self.dataset.findSimilarDocumentsLDA(self.field)

        # Let's test order of ranking on larger dataset
        print("Running on 10000 dataset using LDA")
        dataset = ArnetMinerDataset()
        dataset.minDf = 10**-5
        dataset.dataFilename = dataset.dataDir + "DBLP-citation-10000.txt"
        dataset.vectoriseDocuments()
        relevantExperts = dataset.findSimilarDocumentsLDA("Neural Networks")
        # Disabled ranking check, kept for reference:
        #self.assertEqual(['Christopher M. Bishop', 'Michael I. Jordan', 'Fred L. Kitchens', 'Ai Cheo', 'Cesare Alippi', 'Giovanni Vanini', 'C. C. Taylor', 'David J. Spiegelhalter', 'Donald Michie'], relevantExperts)

    @unittest.skip("")
    def testModelSelectionLSI(self):
        """Run LSI model selection on the 1000-entry file (currently skipped)."""
        self.dataset.dataFilename = self.dataset.dataDir + "DBLP-citation-1000.txt"
        self.dataset.overwrite = True
        self.dataset.overwriteModel = True
        self.dataset.overwriteVectoriser = True
        self.dataset.vectoriseDocuments()
        self.dataset.modelSelectionLSI()