def run(citationCounts, publicationCounts):
    """
    Run the APCPA PathSim author-similarity experiment for each test author.

    The author-paper-conference and conference-paper-author adjacency
    matrices are computed separately and multiplied; the product is passed to
    ``experiment.runFor`` once per entry of ``testAuthors``.

    :param citationCounts: forwarded unchanged to ``experiment.runFor``
    :param publicationCounts: forwarded unchanged to ``experiment.runFor``
    """
    experiment = AuthorsPathSimAPCPAExperiment(
        None,
        'Most Similar APCPA PathSim Authors',
        outputFilePath=os.path.join('../../results', 'authors', 'apcpaPathSim'))

    # Compute once, since these never change. The `with` block closes the
    # data file as soon as unpickling is done instead of leaking the handle.
    with open(os.path.join('../../data', 'graphWithCitations')) as graphFile:
        graph, nodeIndex = cPickle.load(graphFile)

    # Compute APCPA adjacency matrix as the product of its two halves
    apcAdjMatrix, extraData = getMetaPathAdjacencyData(
        graph, nodeIndex, ['author', 'paper', 'conference'], rows=True)
    cpaAdjMatrix, cpaData = getMetaPathAdjacencyData(
        graph, nodeIndex, ['conference', 'paper', 'author'])
    apcpaAdjMatrix = lil_matrix(apcAdjMatrix * cpaAdjMatrix)

    # Correct the toNodes content in extraData: the product's columns are the
    # destination nodes of the second half, not of the first
    extraData['toNodes'] = cpaData['toNodes']
    extraData['toNodesIndex'] = cpaData['toNodesIndex']

    for testAuthor in testAuthors:
        experiment.runFor(testAuthor, apcpaAdjMatrix, extraData,
                          citationCounts, publicationCounts)
def run(citationCounts, publicationCounts):
    """
    Execute the 'Most Similar APCPA PathSim Authors' experiment.

    Builds the author-to-author matrix along the
    author-paper-conference-paper-author meta path from two half-path
    matrices, then calls ``experiment.runFor`` for every entry of
    ``testAuthors``.

    :param citationCounts: passed through to ``experiment.runFor``
    :param publicationCounts: passed through to ``experiment.runFor``
    """
    experiment = AuthorsPathSimAPCPAExperiment(
        None,
        'Most Similar APCPA PathSim Authors',
        outputFilePath=os.path.join('../../results', 'authors', 'apcpaPathSim')
    )

    # Compute once, since these never change; the context manager guarantees
    # the pickle file is closed even if loading raises.
    graphPath = os.path.join('../../data', 'graphWithCitations')
    with open(graphPath) as graphFile:
        graph, nodeIndex = cPickle.load(graphFile)

    # Compute APCPA adjacency matrix
    apcAdjMatrix, extraData = getMetaPathAdjacencyData(
        graph, nodeIndex, ['author', 'paper', 'conference'], rows=True)
    cpaAdjMatrix, secondHalfData = getMetaPathAdjacencyData(
        graph, nodeIndex, ['conference', 'paper', 'author'])
    apcpaAdjMatrix = lil_matrix(apcAdjMatrix * cpaAdjMatrix)

    # Correct the toNodes content in extraData (take it from the second half)
    for key in ('toNodes', 'toNodesIndex'):
        extraData[key] = secondHalfData[key]

    for testAuthor in testAuthors:
        experiment.runFor(
            testAuthor, apcpaAdjMatrix, extraData, citationCounts, publicationCounts)
def getPartialMetaPath(graph, metaPathPart, nodeIndex, repetitions):
    """
    Build the adjacency matrices for one segment of a meta path.

    If the segment begins and ends with the same node type, the list holds
    ``repetitions`` references to the segment's matrix. Otherwise it holds the
    segment's matrix followed by the matrix of the reversed segment, and
    ``repetitions`` is not used.

    :return: tuple ``(adjMatrices, adjMatrix)`` where ``adjMatrix`` is the
        matrix of ``metaPathPart`` itself
    """
    forwardMatrix, _ = getMetaPathAdjacencyData(graph, nodeIndex, metaPathPart)

    sameEndpoints = metaPathPart[0] == metaPathPart[-1]
    if sameEndpoints:
        return [forwardMatrix] * repetitions, forwardMatrix

    reversedPart = list(reversed(metaPathPart))
    backwardMatrix, _ = getMetaPathAdjacencyData(graph, nodeIndex, reversedPart)
    return [forwardMatrix, backwardMatrix], forwardMatrix
def getPartialMetaPath(graph, metaPathPart, nodeIndex, repetitions):
    """
    Return the list of adjacency matrices for a meta path segment.

    A segment whose first and last node types match yields ``repetitions``
    references to one matrix. Any other segment yields a two-element list:
    its own matrix and the matrix of the reversed segment.

    :return: ``(adjMatrices, adjMatrix)``; the second item is always the
        matrix computed for ``metaPathPart`` as given
    """
    partMatrix, _ = getMetaPathAdjacencyData(graph, nodeIndex, metaPathPart)

    if metaPathPart[0] != metaPathPart[-1]:
        # Endpoints differ, so the way back needs its own matrix
        mirroredMatrix, _ = getMetaPathAdjacencyData(
            graph, nodeIndex, list(reversed(metaPathPart)))
        matrices = [partMatrix, mirroredMatrix]
    else:
        matrices = [partMatrix] * repetitions

    return matrices, partMatrix
def run():
    """
    Time building a meta path adjacency matrix whole versus in segments.

    For each configured meta path this measures three things with
    ``timeit.timeit(..., number=10)``: computing the full adjacency data
    directly, computing the partial (segment) matrices, and multiplying the
    partial matrices. Each timing is therefore the total for 10 executions,
    not an average. Results are printed and pickled to the file ``results``.
    """
    # Experiments to run with meta path lengths (map of length to trial paths)
    p, a = 'paper', 'author'
    metaPathLengthExperiments = {
        3: [[a, p, a]],
        4: [[a, p, p, a]],
        5: [[a, p, a, p, a]],
        7: [[a, p, p, a, p, p, a]],
    }

    # Close the data file deterministically instead of leaking the handle
    with open(os.path.join('../', 'data', 'graphWithCitations')) as graphFile:
        graph, nodeIndex = cPickle.load(graphFile)

    # Map of path length to a list of tuples
    # (fullTime, partialTime, multiplyTime, bytesForMatrices)
    metaPathLengthExperimentResults = defaultdict(list)

    for pathLength in sorted(metaPathLengthExperiments):
        for metaPath in metaPathLengthExperiments[pathLength]:

            # Time getting adjacency matrix directly
            fullTime = timeit.timeit(
                lambda: getMetaPathAdjacencyData(graph, nodeIndex, metaPath),
                number=10)

            # Split meta path
            if pathLength in {3, 5}:
                metaPathPart = [p, a, p] if metaPath[0] == p else [a, p, a]
                # `//` keeps the integer result the Python 2 `/` produced
                repetitions = (len(metaPath) - 1) // 2
            else:
                # 4, 7 -- only repeat twice
                metaPathPart = metaPath[:len(metaPath) // 2 + 1]
                print(metaPathPart)
                repetitions = 2

            # Find the partial meta path adjacency list
            adjMatrices, adjMatrix = getPartialMetaPath(
                graph, metaPathPart, nodeIndex, repetitions)
            partialTime = timeit.timeit(
                lambda: getPartialMetaPath(graph, metaPathPart, nodeIndex, repetitions),
                number=10)

            # Get the number of bytes to store partial adj matrices.
            # NOTE(review): sys.getsizeof is shallow, so this may not include
            # the matrix's underlying data -- confirm this is what is wanted.
            bytesForMatrices = sys.getsizeof(adjMatrix)

            # Multiply for full adj matrix
            multiplyTime = timeit.timeit(
                lambda: multiplyFullAdjMatrix(adjMatrices, repetitions),
                number=10)

            # Output results
            metaPathLengthExperimentResults[pathLength].append(
                (fullTime, partialTime, multiplyTime, bytesForMatrices))
            print("Full Path: %.3f seconds, Partial Paths: %.3f seconds, Multiplication Only: %.3f, Bytes: %d [%s]" % (
                fullTime, partialTime, multiplyTime, bytesForMatrices, ', '.join(metaPath)))

    # `with` makes sure the pickled results are flushed and the file closed
    with open('results', 'w') as resultsFile:
        cPickle.dump(metaPathLengthExperimentResults, resultsFile)
def run(citationCounts, publicationCounts):
    """
    Run the CPCPPA NeighborSim author experiment for each test author.

    The conference-paper-conference matrix is multiplied by the
    conference-paper-paper-author matrix; the product is passed to
    ``experiment.runFor`` for every entry of ``testAuthors``.

    :param citationCounts: forwarded unchanged to ``experiment.runFor``
    :param publicationCounts: forwarded unchanged to ``experiment.runFor``
    """
    experiment = AuthorsNeighborSimCPCPPAExperiment(
        None,
        'Most Similar CPCPPA NeighborSim Authors',
        outputFilePath=os.path.join('../../results', 'authors', 'cpcppaNeighborSim'))

    # Compute once, since these never change; `with` closes the file handle
    with open(os.path.join('../../data', 'graphWithCitations')) as graphFile:
        graph, nodeIndex = cPickle.load(graphFile)

    # Only the extra data of the second (CPPA) half is used afterwards, so
    # the first half's extra data is discarded explicitly
    cpcAdjMatrix, _ = getMetaPathAdjacencyData(
        graph, nodeIndex, ['conference', 'paper', 'conference'])
    cppaAdjMatrix, extraData = getMetaPathAdjacencyData(
        graph, nodeIndex, ['conference', 'paper', 'paper', 'author'])
    cpcppaAdjMatrix = cpcAdjMatrix * cppaAdjMatrix

    # Look up the "from" side using the destination node data
    extraData['fromNodes'] = extraData['toNodes']
    extraData['fromNodesIndex'] = extraData['toNodesIndex']

    for testAuthor in testAuthors:
        experiment.runFor(testAuthor, cpcppaAdjMatrix, extraData,
                          citationCounts, publicationCounts)
def run():
    """
    Run the paper-term-paper (PTP) PathSim experiment for each test paper.

    The paper-term and term-paper adjacency matrices are multiplied and the
    product is passed to ``experiment.runFor`` for every entry of
    ``testPapers``.
    """
    # NOTE(review): the title says "PCP" while the experiment class, output
    # path and meta path are all PTP -- looks like a copy-paste slip; confirm
    # before changing the reported title.
    experiment = PapersPathSimPTPExperiment(
        None,
        'Most Similar PCP PathSim Papers',
        outputFilePath='results/papers/ptpPathSim')

    # Compute once, since these never change; `with` closes the file handle
    with open(os.path.join('../../data', 'graphWithCitations')) as graphFile:
        graph, nodeIndex = cPickle.load(graphFile)

    # Compute PTP adjacency matrix
    ptAdjMatrix, extraData = getMetaPathAdjacencyData(
        graph, nodeIndex, ['paper', 'term'], rows=True)
    tpAdjMatrix, tpData = getMetaPathAdjacencyData(
        graph, nodeIndex, ['term', 'paper'])
    ptpAdjMatrix = lil_matrix(ptAdjMatrix * tpAdjMatrix)

    # Correct the toNodes content in extraData (use the second half's)
    extraData['toNodes'] = tpData['toNodes']
    extraData['toNodesIndex'] = tpData['toNodesIndex']

    for testPaper in testPapers:
        experiment.runFor(testPaper, ptpAdjMatrix, extraData)
def run():
    """
    Run the paper-author-paper (PAP) PathSim experiment for each test paper.

    The paper-author and author-paper adjacency matrices are multiplied and
    the product is passed to ``experiment.runFor`` for every entry of
    ``testPapers``.
    """
    experiment = PapersPathSimPAPExperiment(
        None,
        'Most Similar PAP PathSim Papers',
        outputFilePath='results/papers/papPathSim')

    # Compute once, since these never change; `with` closes the file handle
    with open(os.path.join('../../data', 'graphWithCitations')) as graphFile:
        graph, nodeIndex = cPickle.load(graphFile)

    # Compute PAP adjacency matrix
    paAdjMatrix, extraData = getMetaPathAdjacencyData(
        graph, nodeIndex, ['paper', 'author'], rows=True)
    apAdjMatrix, apData = getMetaPathAdjacencyData(
        graph, nodeIndex, ['author', 'paper'])
    papAdjMatrix = lil_matrix(paAdjMatrix * apAdjMatrix)

    # Correct the toNodes content in extraData (use the second half's)
    extraData['toNodes'] = apData['toNodes']
    extraData['toNodesIndex'] = apData['toNodesIndex']

    for testPaper in testPapers:
        experiment.runFor(testPaper, papAdjMatrix, extraData)
def run():
    """
    Execute the PTP (paper-term-paper) PathSim experiment over ``testPapers``.

    Multiplies the paper-term matrix by the term-paper matrix and calls
    ``experiment.runFor`` with the product for each test paper.
    """
    # NOTE(review): title reads "PCP" although everything else here is PTP;
    # likely a copy-paste leftover -- verify before editing the title.
    experiment = PapersPathSimPTPExperiment(
        None, "Most Similar PCP PathSim Papers", outputFilePath="results/papers/ptpPathSim"
    )

    # Compute once, since these never change. Using a context manager so the
    # pickle file does not stay open for the lifetime of the process.
    graphPath = os.path.join("../../data", "graphWithCitations")
    with open(graphPath) as graphFile:
        graph, nodeIndex = cPickle.load(graphFile)

    # Compute PTP adjacency matrix
    ptAdjMatrix, extraData = getMetaPathAdjacencyData(
        graph, nodeIndex, ["paper", "term"], rows=True)
    tpAdjMatrix, secondHalfData = getMetaPathAdjacencyData(
        graph, nodeIndex, ["term", "paper"])
    ptpAdjMatrix = lil_matrix(ptAdjMatrix * tpAdjMatrix)

    # Correct the toNodes content in extraData
    for key in ("toNodes", "toNodesIndex"):
        extraData[key] = secondHalfData[key]

    for testPaper in testPapers:
        experiment.runFor(testPaper, ptpAdjMatrix, extraData)
def run():
    """
    Run the author-paper-paper (APP) NeighborSim experiment per test paper.

    Computes the APP adjacency matrix once and calls ``experiment.runFor``
    with it for every entry of ``testPapers``.
    """
    experiment = PapersNeighborSimAPPExperiment(
        None,
        'Most Similar APP NeighborSim Authors',
        outputFilePath='results/papers/appNeighborSim')

    # Compute once, since these never change; `with` closes the file handle
    with open(os.path.join('../../data', 'graphWithCitations')) as graphFile:
        graph, nodeIndex = cPickle.load(graphFile)

    appAdjMatrix, extraData = getMetaPathAdjacencyData(
        graph, nodeIndex, ['author', 'paper', 'paper'])

    # Look up the "from" side using the destination node data
    extraData['fromNodes'] = extraData['toNodes']
    extraData['fromNodesIndex'] = extraData['toNodesIndex']

    for testPaper in testPapers:
        experiment.runFor(testPaper, appAdjMatrix, extraData)
def run(citationCounts, publicationCounts):
    """
    Execute the 'Most Similar CPCPPA NeighborSim Authors' experiment.

    Multiplies the conference-paper-conference matrix by the
    conference-paper-paper-author matrix and runs the experiment on the
    product for every entry of ``testAuthors``.

    :param citationCounts: passed through to ``experiment.runFor``
    :param publicationCounts: passed through to ``experiment.runFor``
    """
    experiment = AuthorsNeighborSimCPCPPAExperiment(
        None,
        'Most Similar CPCPPA NeighborSim Authors',
        outputFilePath=os.path.join('../../results', 'authors', 'cpcppaNeighborSim')
    )

    # Compute once, since these never change. The context manager closes the
    # pickle file even when loading fails.
    graphPath = os.path.join('../../data', 'graphWithCitations')
    with open(graphPath) as graphFile:
        graph, nodeIndex = cPickle.load(graphFile)

    # The CPC half only contributes its matrix; its extra data was always
    # overwritten by the CPPA half, so it is dropped here on purpose
    cpcAdjMatrix, _ = getMetaPathAdjacencyData(
        graph, nodeIndex, ['conference', 'paper', 'conference'])
    cppaAdjMatrix, extraData = getMetaPathAdjacencyData(
        graph, nodeIndex, ['conference', 'paper', 'paper', 'author'])
    cpcppaAdjMatrix = cpcAdjMatrix * cppaAdjMatrix

    # Reuse the destination node data for the "from" side
    extraData['fromNodes'] = extraData['toNodes']
    extraData['fromNodesIndex'] = extraData['toNodesIndex']

    for testAuthor in testAuthors:
        experiment.runFor(
            testAuthor, cpcppaAdjMatrix, extraData, citationCounts, publicationCounts)
def run():
    """
    Run the term-paper-paper (TPP) NeighborSim experiment per test paper.

    Computes the TPP adjacency matrix once and calls ``experiment.runFor``
    with it for every entry of ``testPapers``.
    """
    experiment = PapersNeighborSimTPPExperiment(
        None, "Most Similar TPP NeighborSim Authors", outputFilePath="results/papers/tppNeighborSim"
    )

    # Compute once, since these never change; `with` closes the file handle
    with open(os.path.join("../../data", "graphWithCitations")) as graphFile:
        graph, nodeIndex = cPickle.load(graphFile)

    # Local renamed from appAdjMatrix: the meta path here is term-paper-paper
    tppAdjMatrix, extraData = getMetaPathAdjacencyData(
        graph, nodeIndex, ["term", "paper", "paper"])

    # Look up the "from" side using the destination node data
    extraData["fromNodes"] = extraData["toNodes"]
    extraData["fromNodesIndex"] = extraData["toNodesIndex"]

    for testPaper in testPapers:
        experiment.runFor(testPaper, tppAdjMatrix, extraData)
def run():
    """
    Run the PPA NeighborSim author experiment and compute per-author counts.

    Besides running the experiment for every entry of ``testAuthors``, this
    reads ``paperCitationCounts`` (lines of the form ``<count>: <title>``),
    aggregates publication and citation counts per author from the graph, and
    writes the citation totals, highest first, to ``authorCitationCounts``.

    :return: tuple ``(citationCounts, publicationCounts)``, both
        ``defaultdict(int)`` keyed by author node
    """
    experiment = AuthorsNeighborSimPPAExperiment(
        None,
        'Most Similar PPA NeighborSim Authors',
        outputFilePath=os.path.join('../../results', 'authors', 'ppaNeighborSim'))

    # Compute once, since these never change; `with` closes the file handle
    with open(os.path.join('../../data', 'graphWithCitations')) as graphFile:
        graph, nodeIndex = cPickle.load(graphFile)

    ppaAdjMatrix, extraData = getMetaPathAdjacencyData(
        graph, nodeIndex, ['paper', 'paper', 'author'])
    extraData['fromNodes'] = extraData['toNodes']
    extraData['fromNodesIndex'] = extraData['toNodesIndex']

    # Read paper citation counts
    paperCitationCounts = {}
    with open(os.path.join('../../data', 'paperCitationCounts')) as paperCitationsFile:
        for line in paperCitationsFile:
            countText, separator, title = line.partition(': ')
            if not separator:
                # Blank or malformed line: previously this mis-sliced the
                # line (find() returned -1); skip it instead
                continue
            paperCitationCounts[title.strip()] = int(countText)

    # Compute author publication counts
    allPapers = set(nodeIndex['paper'].values())
    allAuthors = set(nodeIndex['author'].values())
    publicationCounts, citationCounts = defaultdict(int), defaultdict(int)
    for author in allAuthors:
        for node in graph.successors(author):
            if node in allPapers:
                publicationCounts[author] += 1
                citationCounts[author] += paperCitationCounts.get(node, 0)

    # Output author citation counts. Sorting ascending and then reversing is
    # kept deliberately: it orders ties differently from sorted(reverse=True)
    citationCountsList = sorted(citationCounts.items(), key=operator.itemgetter(1))
    citationCountsList.reverse()
    with open(os.path.join('../../data', 'authorCitationCounts'), 'w') as outputFile:
        for author, count in citationCountsList:
            outputFile.write('%d: %s\n' % (int(count), author))

    for testAuthor in testAuthors:
        experiment.runFor(testAuthor, ppaAdjMatrix, extraData,
                          citationCounts, publicationCounts)

    return citationCounts, publicationCounts
def run(citationCounts, publicationCounts):
    """
    Run the CPPA NeighborSim author experiment for each test author.

    Computes the conference-paper-paper-author adjacency matrix once and
    calls ``experiment.runFor`` with it for every entry of ``testAuthors``.

    :param citationCounts: forwarded unchanged to ``experiment.runFor``
    :param publicationCounts: forwarded unchanged to ``experiment.runFor``
    """
    experiment = AuthorsNeighborSimCPPAExperiment(
        None,
        "Most Similar CPPA NeighborSim Authors",
        outputFilePath=os.path.join("../../results", "authors", "cppaNeighborSim"),
    )

    # Compute once, since these never change; `with` closes the file handle
    with open(os.path.join("../../data", "graphWithCitations")) as graphFile:
        graph, nodeIndex = cPickle.load(graphFile)

    cppaAdjMatrix, extraData = getMetaPathAdjacencyData(
        graph, nodeIndex, ["conference", "paper", "paper", "author"])

    # Look up the "from" side using the destination node data
    extraData["fromNodes"] = extraData["toNodes"]
    extraData["fromNodesIndex"] = extraData["toNodesIndex"]

    for testAuthor in testAuthors:
        experiment.runFor(testAuthor, cppaAdjMatrix, extraData,
                          citationCounts, publicationCounts)
def run():
    """
    Execute the TPP (term-paper-paper) NeighborSim experiment over
    ``testPapers``.

    The adjacency matrix is computed once and passed to
    ``experiment.runFor`` for each test paper.
    """
    experiment = PapersNeighborSimTPPExperiment(
        None,
        'Most Similar TPP NeighborSim Authors',
        outputFilePath='results/papers/tppNeighborSim')

    # Compute once, since these never change. A context manager is used so
    # the pickle file handle is released right after loading.
    graphPath = os.path.join('../../data', 'graphWithCitations')
    with open(graphPath) as graphFile:
        graph, nodeIndex = cPickle.load(graphFile)

    # Named for the path actually computed (was appAdjMatrix)
    tppAdjMatrix, extraData = getMetaPathAdjacencyData(
        graph, nodeIndex, ['term', 'paper', 'paper'])

    # Reuse the destination node data for the "from" side
    extraData['fromNodes'] = extraData['toNodes']
    extraData['fromNodesIndex'] = extraData['toNodesIndex']

    for testPaper in testPapers:
        experiment.runFor(testPaper, tppAdjMatrix, extraData)
def run():
    """
    Run the term-paper-paper-conference (TPPC) experiment per test conference.

    Loads the graph and the pickled conference statistics, computes the TPPC
    adjacency matrix once, and calls ``experiment.runFor`` for every entry of
    ``testConferences``.
    """
    experiment = ConferencesPathSimTPPCExperiment(
        None,
        'Most Similar TPPC NeighborSim Conferences',
        outputFilePath=os.path.join('..', '..', 'results', 'conferences', 'tppcNeighborSim')
    )

    # Compute once, since these never change; `with` closes the file handle
    with open(os.path.join('..', '..', 'data', 'graphWithCitations')) as graphFile:
        graph, nodeIndex = cPickle.load(graphFile)

    # Local renamed from cppaAdjMatrix: the meta path here is TPPC
    tppcAdjMatrix, extraData = getMetaPathAdjacencyData(
        graph, nodeIndex, ['term', 'paper', 'paper', 'conference']
    )

    # Look up the "from" side using the destination node data
    extraData['fromNodes'] = extraData['toNodes']
    extraData['fromNodesIndex'] = extraData['toNodesIndex']

    with open(os.path.join('..', '..', 'data', 'conferenceStats')) as statsFile:
        confPublications, confCitations = cPickle.load(statsFile)

    # Actually run the similarity experiments
    for testConference in testConferences:
        experiment.runFor(testConference, tppcAdjMatrix, extraData,
                          confPublications, confCitations)
def run():
    """
    Execute the PPA NeighborSim author experiment and build author counts.

    Steps: load the graph, compute the paper-paper-author adjacency matrix,
    read per-paper citation counts from ``paperCitationCounts`` (one
    ``<count>: <title>`` entry per line), total publications and citations
    per author, write the citation totals in descending order to
    ``authorCitationCounts``, then run the experiment for every entry of
    ``testAuthors``.

    :return: ``(citationCounts, publicationCounts)`` -- two
        ``defaultdict(int)`` objects keyed by author node
    """
    experiment = AuthorsNeighborSimPPAExperiment(
        None,
        'Most Similar PPA NeighborSim Authors',
        outputFilePath=os.path.join('../../results', 'authors', 'ppaNeighborSim')
    )

    dataDir = '../../data'

    # Compute once, since these never change. Files are opened with `with`
    # so none of the handles outlive their use.
    with open(os.path.join(dataDir, 'graphWithCitations')) as graphFile:
        graph, nodeIndex = cPickle.load(graphFile)

    ppaAdjMatrix, extraData = getMetaPathAdjacencyData(
        graph, nodeIndex, ['paper', 'paper', 'author'])
    extraData['fromNodes'] = extraData['toNodes']
    extraData['fromNodesIndex'] = extraData['toNodesIndex']

    # Read paper citation counts
    paperCitationCounts = {}
    with open(os.path.join(dataDir, 'paperCitationCounts')) as paperCitationsFile:
        for line in paperCitationsFile:
            countText, separator, title = line.partition(': ')
            if separator:
                paperCitationCounts[title.strip()] = int(countText)
            # Lines without ': ' (e.g. a trailing blank line) are ignored;
            # the old find()-based slicing mis-parsed them

    # Compute author publication counts
    allPapers = set(nodeIndex['paper'].values())
    allAuthors = set(nodeIndex['author'].values())
    publicationCounts, citationCounts = defaultdict(int), defaultdict(int)
    for author in allAuthors:
        for node in graph.successors(author):
            if node not in allPapers:
                continue
            publicationCounts[author] += 1
            citationCounts[author] += paperCitationCounts.get(node, 0)

    # Output author citation counts. Ascending sort followed by reverse() is
    # retained because it orders equal counts differently from reverse=True.
    citationCountsList = sorted(citationCounts.items(), key=operator.itemgetter(1))
    citationCountsList.reverse()
    with open(os.path.join(dataDir, 'authorCitationCounts'), 'w') as outputFile:
        for author, count in citationCountsList:
            outputFile.write('%d: %s\n' % (int(count), author))

    for testAuthor in testAuthors:
        experiment.runFor(
            testAuthor, ppaAdjMatrix, extraData, citationCounts, publicationCounts)

    return citationCounts, publicationCounts
def run():
    """
    Execute the TPPC conference experiment over ``testConferences``.

    The term-paper-paper-conference adjacency matrix and the pickled
    conference statistics are loaded once and handed to
    ``experiment.runFor`` for each test conference.
    """
    dataDir = os.path.join('..', '..', 'data')

    experiment = ConferencesPathSimTPPCExperiment(
        None,
        'Most Similar TPPC NeighborSim Conferences',
        outputFilePath=os.path.join('..', '..', 'results', 'conferences',
                                    'tppcNeighborSim'))

    # Compute once, since these never change. Context managers close both
    # pickle files instead of leaving the handles open.
    with open(os.path.join(dataDir, 'graphWithCitations')) as graphFile:
        graph, nodeIndex = cPickle.load(graphFile)

    # Named for the path actually computed (was cppaAdjMatrix)
    tppcAdjMatrix, extraData = getMetaPathAdjacencyData(
        graph, nodeIndex, ['term', 'paper', 'paper', 'conference'])

    # Reuse the destination node data for the "from" side
    extraData['fromNodes'] = extraData['toNodes']
    extraData['fromNodesIndex'] = extraData['toNodesIndex']

    with open(os.path.join(dataDir, 'conferenceStats')) as statsFile:
        confPublications, confCitations = cPickle.load(statsFile)

    # Actually run the similarity experiments
    for testConference in testConferences:
        experiment.runFor(
            testConference, tppcAdjMatrix, extraData, confPublications, confCitations)
def run():
    """
    Benchmark whole-path versus segmented meta path adjacency computation.

    Every configured meta path is timed three ways with
    ``timeit.timeit(..., number=10)``: the direct full computation, the
    computation of the partial matrices, and the multiplication of those
    partial matrices. The recorded numbers are totals over the 10 runs, not
    averages. One result tuple per meta path is printed and the collected
    results are pickled to ``results`` in the working directory.
    """
    # Experiments to run with meta path lengths (map of length to trial paths)
    p, a = 'paper', 'author'
    metaPathLengthExperiments = {
        3: [
            [a, p, a],
        ],
        4: [[a, p, p, a]],
        5: [
            [a, p, a, p, a],
        ],
        7: [[a, p, p, a, p, p, a]],
    }

    # Load inside `with` so the data file handle is not leaked
    graphPath = os.path.join('../', 'data', 'graphWithCitations')
    with open(graphPath) as graphFile:
        graph, nodeIndex = cPickle.load(graphFile)

    # Map of experiment length to a list of
    # (fullTime, partialTime, multiplyTime, bytesForMatrices) tuples
    metaPathLengthExperimentResults = defaultdict(list)

    for pathLength in sorted(metaPathLengthExperiments):
        for metaPath in metaPathLengthExperiments[pathLength]:

            # Time getting adjacency matrix directly
            fullTime = timeit.timeit(
                lambda: getMetaPathAdjacencyData(graph, nodeIndex, metaPath),
                number=10)

            # Split meta path (`//` is the explicit form of the integer
            # division the Python 2 `/` performed on these ints)
            if pathLength in {3, 5}:
                metaPathPart = [p, a, p] if metaPath[0] == p else [a, p, a]
                repetitions = (len(metaPath) - 1) // 2
            else:
                # 4, 7 -- only repeat twice
                metaPathPart = metaPath[:(len(metaPath) // 2 + 1)]
                print(metaPathPart)
                repetitions = 2

            # Find the partial meta path adjacency list
            adjMatrices, adjMatrix = getPartialMetaPath(
                graph, metaPathPart, nodeIndex, repetitions)
            partialTime = timeit.timeit(
                lambda: getPartialMetaPath(
                    graph, metaPathPart, nodeIndex, repetitions),
                number=10)

            # Get the number of bytes to store partial adj matrices.
            # NOTE(review): sys.getsizeof does not follow references, so the
            # figure may exclude the matrix's stored data -- confirm.
            bytesForMatrices = sys.getsizeof(adjMatrix)

            # Multiply for full adj matrix
            multiplyTime = timeit.timeit(
                lambda: multiplyFullAdjMatrix(adjMatrices, repetitions),
                number=10)

            # Output results
            result = (fullTime, partialTime, multiplyTime, bytesForMatrices)
            metaPathLengthExperimentResults[pathLength].append(result)
            print("Full Path: %.3f seconds, Partial Paths: %.3f seconds, Multiplication Only: %.3f, Bytes: %d [%s]" % (
                fullTime, partialTime, multiplyTime, bytesForMatrices,
                ', '.join(metaPath)))

    # Closing the file via `with` guarantees the pickle is fully written
    with open('results', 'w') as resultsFile:
        cPickle.dump(metaPathLengthExperimentResults, resultsFile)
def imbalancedCitationsPublicationsExample():
    """
    Illustrative example of imbalanced citations / publications to verify
    ShapeSim is working correctly.

    Builds a small graph with six authors and one conference, gives each
    author the configured number of papers and citing papers, then prints the
    authors most similar to 'Alice' under NeighborSim and under ShapeSim.
    """
    graph = MultiDiGraph()
    authors = ['Alice', 'Bob', 'Carol', 'Dave', 'Ed', 'Frank']
    conference = 'KDD'

    # Citation & publication count configuration: author -> (citations, publications)
    citationsPublications = {
        'Alice': (100, 10),
        'Bob': (80, 10),
        'Carol': (100, 100),
        'Dave': (50, 10),
        'Ed': (10, 10),
        'Frank': (1000, 100)
    }

    # Running tally of what was actually added to the graph
    actualCitationsPublications = defaultdict(lambda: (0, 0))

    def takeNextId():
        """ Return the current module-level ``nextId`` and advance it by one """
        global nextId
        current = nextId
        nextId += 1
        return current

    def addPublicationPaper(author):
        """
          Helper method to add a 'publication' paper, connected to both an author and a conference
        """
        paper = "%s's Paper %d" % (author, (takeNextId()))
        graph.add_node(paper)
        graph.add_edges_from([
            (author, paper),
            (paper, author),
            (paper, conference),
            (conference, paper),
        ])

        cited, published = actualCitationsPublications[author]
        actualCitationsPublications[author] = (cited, published + 1)

        return paper

    def addCitationPaper(citedPaper, citedAuthor):
        """
          Helper method to add a 'citation' paper, which is only connected to the conference and the paper it cites
        """
        citingPaper = "Citing Paper %d" % takeNextId()
        graph.add_node(citingPaper)
        graph.add_edges_from([
            (conference, citingPaper),
            (citingPaper, conference),
            (citingPaper, citedPaper),
        ])

        cited, published = actualCitationsPublications[citedAuthor]
        actualCitationsPublications[citedAuthor] = (cited + 1, published)

        return citingPaper

    # Helper functions for repeatedly adding papers to the graph
    def addPapersToAuthor(n, author):
        return [addPublicationPaper(author) for _ in range(n)]

    def addCitationsToPaper(n, paper, author):
        return [addCitationPaper(paper, author) for _ in range(n)]

    allPapers = []

    # Construct the graph
    graph.add_nodes_from(authors + [conference])
    for authorName in citationsPublications:
        citationCount, publicationCount = citationsPublications[authorName]

        # Add citations & publications to author
        authorPapers = addPapersToAuthor(publicationCount, authorName)
        allPapers.extend(authorPapers)

        # Integer division: citations are spread evenly, remainder dropped
        citationsPerPaper = citationCount // publicationCount
        for paper in authorPapers:
            allPapers.extend(addCitationsToPaper(citationsPerPaper, paper, authorName))

    nodeIndex = {
        'paper': dict(enumerate(allPapers)),
        'conference': {0: 'KDD'},
        'author': {0: 'Alice', 1: 'Bob', 2: 'Carol', 3: 'Dave', 4: 'Ed', 5: 'Frank'}
    }

    # Test PathSim / NeighborSim
    cpaAdjMatrix, extraData = getMetaPathAdjacencyData(
        graph, nodeIndex, ['conference', 'paper', 'author'])
    extraData['fromNodes'] = extraData['toNodes']
    extraData['fromNodesIndex'] = extraData['toNodesIndex']
    neighborSimMostSimilar, similarityScores = findMostSimilarNodes(
        cpaAdjMatrix, 'Alice', extraData, method=getNeighborSimScore
    )

    # Test ShapeSim
    cppaAdjTensor, extraData = getMetaPathAdjacencyTensorData(
        graph, nodeIndex, ['conference', 'paper', 'paper', 'author']
    )
    extraData['fromNodes'] = extraData['toNodes']
    extraData['fromNodesIndex'] = extraData['toNodesIndex']
    shapeSimMostSimilar, similarityScores = findMostSimilarNodes(
        cppaAdjTensor, 'Alice', extraData, method=getNumpyShapeSimScore, alpha=1.0
    )

    # Output similarity scores
    reports = [('NeighborSim', neighborSimMostSimilar), ('ShapeSim', shapeSimMostSimilar)]
    for name, mostSimilar in reports:
        print('\n%s Most Similar to "%s":' % (name, 'Alice'))
        mostSimilarTable = texttable.Texttable()
        rows = [['Author', 'Score']]
        rows.extend([authorName, score] for authorName, score in mostSimilar)
        mostSimilarTable.add_rows(rows)
        print(mostSimilarTable.draw())