def profileModelSelection(self):
    """Profile ArnetMinerDataset.modelSelection() on the 100,000 paper DBLP file."""
    dataset = ArnetMinerDataset(runLSI=False)
    dataset.dataFilename = dataset.dataDir + "DBLP-citation-100000.txt"
    # Invalidate every cached artefact so the whole pipeline is timed,
    # not a cache read.
    for flag in ("overwrite", "overwriteVectoriser", "overwriteModel"):
        setattr(dataset, flag, True)
    # The profiled statement string is evaluated against locals(), where the
    # name "dataset" is bound, so locals() must be passed through unchanged.
    ProfileUtils.profile('dataset.modelSelection()', globals(), locals())
def setUp(self):
    """Seed the RNG and build a small test dataset that bypasses all caches."""
    numpy.random.seed(22)
    numpy.set_printoptions(suppress=True, precision=3)

    self.field = "Database"
    dataset = ArnetMinerDataset(additionalFields=[self.field])
    dataset.dataFilename = dataset.dataDir + "DBLP-citation-test.txt"
    # Regenerate data, model and vectoriser so tests do not depend on
    # files left over from earlier runs.
    dataset.overwrite = True
    dataset.overwriteModel = True
    dataset.overwriteVectoriser = True
    self.dataset = dataset
def profileComputeLDA(self):
    """Profile ArnetMinerDataset.computeLDA() on the 100,000 paper DBLP file."""
    field = "Boosting"
    dataset = ArnetMinerDataset(field)
    dataset.dataFilename = dataset.dataDir + "DBLP-citation-100000.txt"
    # Rebuild all cached artefacts so the timings cover the full computation.
    for flag in ("overwrite", "overwriteVectoriser", "overwriteModel"):
        setattr(dataset, flag, True)
    dataset.maxRelevantAuthors = 100
    dataset.k = 200
    # The statement string references the local "dataset", hence locals().
    ProfileUtils.profile('dataset.computeLDA()', globals(), locals())
def testFindSimilarDocumentsLDA(self):
    """Run the LDA similarity search on the 1,000 and 10,000 paper datasets."""
    dataset = ArnetMinerDataset()
    dataset.dataFilename = dataset.dataDir + "DBLP-citation-1000.txt"
    dataset.overwrite = True
    dataset.overwriteModel = True
    dataset.overwriteVectoriser = True
    dataset.k = 20
    self.dataset = dataset

    # Smoke test: the search completes on the small dataset for the test field.
    self.dataset.findSimilarDocumentsLDA(self.field)

    # Inspect the ranking behaviour on a larger dataset.
    print("Running on 10000 dataset using LDA")
    largeDataset = ArnetMinerDataset()
    largeDataset.minDf = 10**-5
    largeDataset.dataFilename = largeDataset.dataDir + "DBLP-citation-10000.txt"
    largeDataset.vectoriseDocuments()
    relevantExperts = largeDataset.findSimilarDocumentsLDA("Neural Networks")
def testFindSimilarDocuments(self):
    """Check the LSI similarity rankings on the test and 10,000 paper datasets.

    Uses assertEqual throughout: assertEquals is a deprecated alias.
    """
    field = "Object"
    self.dataset = ArnetMinerDataset()
    self.dataset.dataFilename = self.dataset.dataDir + "DBLP-citation-test.txt"

    # Check document is correct as well as authors
    self.dataset.vectoriseDocuments()
    relevantExperts = self.dataset.findSimilarDocumentsLSI(field)
    self.assertEqual(['Jos\xc3\xa9 A. Blakeley'], relevantExperts)

    # Let's test order of ranking on larger dataset
    print("Running on 10000 dataset")
    dataset = ArnetMinerDataset()
    dataset.minDf = 10**-6
    dataset.dataFilename = dataset.dataDir + "DBLP-citation-10000.txt"
    dataset.vectoriseDocuments()
    relevantExperts = dataset.findSimilarDocumentsLSI("Neural Networks")
    self.assertEqual(['Christopher M. Bishop', 'Michael I. Jordan',
        'Fred L. Kitchens', 'Ai Cheo', 'Cesare Alippi', 'Giovanni Vanini',
        'C. C. Taylor', 'David J. Spiegelhalter', 'Donald Michie'],
        relevantExperts)
""" Find out which experts exist in the DBLP dataset and how many abstracts they have. """ import logging import sys import itertools import numpy from exp.influence2.ArnetMinerDataset import ArnetMinerDataset logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) dataset = ArnetMinerDataset() #dataset.dataFilename = dataset.dataDir + "DBLP-citation-100000.txt" authorList, documentList, citationList = dataset.readAuthorsAndDocuments() authorSet = set(itertools.chain.from_iterable(authorList)) print("Found all authors") expertMatchesDict = {} for field in dataset.fields: expertMatchesDict[field] = 0 for expert in dataset.expertsDict[field]: if expert in authorSet: expertMatchesDict[field] += 1 expertMatchesDict[field] /= float(len(dataset.expertsDict[field]))
from apgl.util.Evaluator import Evaluator

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
numpy.set_printoptions(suppress=True, precision=3, linewidth=160)
numpy.random.seed(21)

# Command-line switch between the LSI (default) and LDA topic models.
parser = argparse.ArgumentParser(description='Run reputation evaluation experiments')
parser.add_argument("-r", "--runLDA", action="store_true", help="Run Latent Dirchlet Allocation")
args = parser.parse_args()

# Evaluation parameters: precision is averaged over the top 50 results,
# ns gives the graph/result sizes swept over (5, 10, ..., 100).
averagePrecisionN = 50
similarityCutoff = 0.30
ns = numpy.arange(5, 105, 5)
runLSI = not args.runLDA

dataset = ArnetMinerDataset(runLSI=runLSI)
# Alternative dataset sizes; swap the uncommented line to change input file.
#dataset.dataFilename = dataset.dataDir + "DBLP-citation-100000.txt"
#dataset.dataFilename = dataset.dataDir + "DBLP-citation-1000000.txt"
dataset.dataFilename = dataset.dataDir + "DBLP-citation-1000000.txt"
#dataset.dataFilename = dataset.dataDir + "DBLP-citation-7000000.txt"
#dataset.dataFilename = dataset.dataDir + "DBLP-citation-Feb21.txt"
# Regenerate graph, model and vectoriser rather than loading cached copies.
dataset.overwriteGraph = True
dataset.overwriteModel = True
dataset.overwriteVectoriser = True

dataset.vectoriseDocuments()
dataset.loadVectoriser()
# Load the document-term matrix the vectoriser wrote out in MatrixMarket format.
X = scipy.io.mmread(dataset.docTermMatrixFilename + ".mtx")
class ArnetMinerDatasetTest(unittest.TestCase):
    """Tests for ArnetMinerDataset: vectorisation, LSI/LDA similarity search,
    coauthor graph construction and expert matching.

    Uses assertEqual throughout: assertEquals is a deprecated alias.
    """

    def setUp(self):
        numpy.random.seed(22)
        numpy.set_printoptions(suppress=True, precision=3)
        #logging.basicConfig(stream=sys.stdout, level=logging.ERROR)

        self.field = "Database"
        self.dataset = ArnetMinerDataset(additionalFields=[self.field])
        self.dataset.dataFilename = self.dataset.dataDir + "DBLP-citation-test.txt"
        # Always regenerate cached files so tests are independent of earlier runs.
        self.dataset.overwrite = True
        self.dataset.overwriteModel = True
        self.dataset.overwriteVectoriser = True

    def testVectoriseDocuments(self):
        # Smoke test: vectorisation completes on the test dataset.
        self.dataset.vectoriseDocuments()

    def testFindSimilarDocuments(self):
        field = "Object"
        self.dataset = ArnetMinerDataset()
        self.dataset.dataFilename = self.dataset.dataDir + "DBLP-citation-test.txt"

        # Check document is correct as well as authors
        self.dataset.vectoriseDocuments()
        relevantExperts = self.dataset.findSimilarDocumentsLSI(field)
        self.assertEqual(['Jos\xc3\xa9 A. Blakeley'], relevantExperts)

        # Let's test order of ranking on larger dataset
        print("Running on 10000 dataset")
        dataset = ArnetMinerDataset()
        dataset.minDf = 10**-6
        dataset.dataFilename = dataset.dataDir + "DBLP-citation-10000.txt"
        dataset.vectoriseDocuments()
        relevantExperts = dataset.findSimilarDocumentsLSI("Neural Networks")
        self.assertEqual(['Christopher M. Bishop', 'Michael I. Jordan',
            'Fred L. Kitchens', 'Ai Cheo', 'Cesare Alippi', 'Giovanni Vanini',
            'C. C. Taylor', 'David J. Spiegelhalter', 'Donald Michie'],
            relevantExperts)

    def testFindCoauthors(self):
        # Build the coauthor graph from the experts relevant to the test field.
        self.dataset.vectoriseDocuments()
        relevantExperts = self.dataset.findSimilarDocumentsLSI(self.field)
        self.dataset.coauthorsGraph(self.field, relevantExperts)

    def testCoauthorsGraphFromAuthors(self):
        releventExperts = set(["Yuri Breitbart", "Hector Garcia-Molina"])
        graph, authorIndexer = self.dataset.coauthorsGraphFromAuthors(releventExperts)

        self.assertEqual(graph.get_edgelist(), [(0, 1), (0, 2), (0, 4), (1, 2), (1, 3)])
        self.assertEqual(graph.es["weight"], [1, 1, 1, 1, 1])
        self.assertEqual(graph.es["invWeight"], [1, 1, 1, 1, 1])
        self.assertEqual(len(authorIndexer.getIdDict()), 5)

    def testMatchExperts(self):
        #TODO:
        self.dataset.vectoriseDocuments()
        relevantExperts = self.dataset.findSimilarDocumentsLSI("DBMS")
        expertsSet = self.dataset.expertsDict[self.field]
        expertMatches = self.dataset.matchExperts(relevantExperts, expertsSet)

        self.assertEqual(expertMatches, ['Nathan Goodman'])
        self.assertEqual(expertsSet, set(['Hector Garcia-Molina', 'Yuri Breitbart', 'Nathan Goodman']))

    def testExpertsFromDocSimilarities(self):
        self.dataset.authorList = [["Joe Bloggs", "Alfred Nobel"], ["Ian Hislop"], ["Alfred Nobel", "Ian Hislop"]]
        similarities = numpy.array([0.4, 0.5, 0.8])
        experts = self.dataset.expertsFromDocSimilarities(similarities)
        # Experts ranked by the (summed) similarity of the documents they authored.
        self.assertEqual(experts, ['Ian Hislop', 'Alfred Nobel', 'Joe Bloggs'])

    def testFindSimilarDocumentsLDA(self):
        self.dataset = ArnetMinerDataset()
        self.dataset.dataFilename = self.dataset.dataDir + "DBLP-citation-1000.txt"
        self.dataset.overwrite = True
        self.dataset.overwriteModel = True
        self.dataset.overwriteVectoriser = True
        self.dataset.k = 20

        # Check document is correct as well as authors
        self.dataset.findSimilarDocumentsLDA(self.field)

        # Let's test order of ranking on larger dataset
        print("Running on 10000 dataset using LDA")
        dataset = ArnetMinerDataset()
        dataset.minDf = 10**-5
        dataset.dataFilename = dataset.dataDir + "DBLP-citation-10000.txt"
        dataset.vectoriseDocuments()
        relevantExperts = dataset.findSimilarDocumentsLDA("Neural Networks")
        #self.assertEqual(['Christopher M. Bishop', 'Michael I. Jordan', 'Fred L. Kitchens', 'Ai Cheo', 'Cesare Alippi', 'Giovanni Vanini', 'C. C. Taylor', 'David J. Spiegelhalter', 'Donald Michie'], relevantExperts)

    @unittest.skip("")
    def testModelSelectionLSI(self):
        self.dataset.dataFilename = self.dataset.dataDir + "DBLP-citation-1000.txt"
        self.dataset.overwrite = True
        self.dataset.overwriteModel = True
        self.dataset.overwriteVectoriser = True

        self.dataset.vectoriseDocuments()
        self.dataset.modelSelectionLSI()
""" Looking at all articles with an abstract, restrict and save the experts """ import logging import sys import itertools import numpy from exp.influence2.ArnetMinerDataset import ArnetMinerDataset logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) dataset = ArnetMinerDataset() #dataset.dataFilename = dataset.dataDir + "DBLP-citation-100000.txt" authorList, documentList, citationList = dataset.readAuthorsAndDocuments() authorSet = set(itertools.chain.from_iterable(authorList)) print("Found all authors") expertMatchesDict = {} for field in dataset.fields: expertMatchesDict[field] = set([]) for expert in dataset.expertsDict[field]: if expert in authorSet: expertMatchesDict[field].add(expert) expertMatchesDict[field] = sorted(list(expertMatchesDict[field]))