def testFindSimilarDocumentsLDA(self):
    """Exercise the LDA similarity search on the 1000- and 10000-document files.

    No assertion is made on either result, so this test only fails if one
    of the calls raises.
    """
    smallSet = ArnetMinerDataset()
    self.dataset = smallSet
    smallSet.dataFilename = smallSet.dataDir + "DBLP-citation-1000.txt"
    # Request that existing outputs, model and vectoriser be overwritten.
    smallSet.overwrite = True
    smallSet.overwriteModel = True
    smallSet.overwriteVectoriser = True
    smallSet.k = 20

    # Check document is correct as well as authors
    smallSet.findSimilarDocumentsLDA(self.field)

    # Let's test order of ranking on larger dataset
    print("Running on 10000 dataset using LDA")
    largeSet = ArnetMinerDataset()
    largeSet.minDf = 10 ** -5
    largeSet.dataFilename = largeSet.dataDir + "DBLP-citation-10000.txt"
    largeSet.vectoriseDocuments()
    # The returned experts are not compared against anything here.
    experts = largeSet.findSimilarDocumentsLDA("Neural Networks")
def testFindSimilarDocuments(self):
    """Check the experts returned by the LSI similarity search.

    First runs on the small test file and expects a single author for the
    query "Object", then runs on the 10000-document file and expects a
    fixed, ordered list of authors for "Neural Networks".
    """
    field = "Object"
    self.dataset = ArnetMinerDataset()
    self.dataset.dataFilename = self.dataset.dataDir + "DBLP-citation-test.txt"

    # Check document is correct as well as authors
    self.dataset.vectoriseDocuments()
    relevantExperts = self.dataset.findSimilarDocumentsLSI(field)
    # The escapes \xc3\xa9 are the UTF-8 bytes of an e-acute.
    # NOTE(review): presumably written for Python 2 byte strings -- confirm
    # the expected value if this suite is run under Python 3.
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(['Jos\xc3\xa9 A. Blakeley'], relevantExperts)

    # Let's test order of ranking on larger dataset
    print("Running on 10000 dataset")
    dataset = ArnetMinerDataset()
    dataset.minDf = 10**-6
    dataset.dataFilename = dataset.dataDir + "DBLP-citation-10000.txt"
    dataset.vectoriseDocuments()
    relevantExperts = dataset.findSimilarDocumentsLSI("Neural Networks")

    expectedExperts = [
        'Christopher M. Bishop',
        'Michael I. Jordan',
        'Fred L. Kitchens',
        'Ai Cheo',
        'Cesare Alippi',
        'Giovanni Vanini',
        'C. C. Taylor',
        'David J. Spiegelhalter',
        'Donald Michie',
    ]
    self.assertEqual(expectedExperts, relevantExperts)