Пример #1
def run(citationCounts, publicationCounts):
    """
    Run the APCPA PathSim experiment for each author in ``testAuthors``.

    :param citationCounts: forwarded unchanged to ``experiment.runFor``
    :param publicationCounts: forwarded unchanged to ``experiment.runFor``
    """
    # NOTE(review): the last path component was missing from this copy; it is
    # taken from the unwrapped APCPA variant of run() found in the same file.
    experiment = AuthorsPathSimAPCPAExperiment(
        'Most Similar APCPA PathSim Authors',
        outputFilePath=os.path.join('../../results', 'authors',
                                    'apcpaPathSim'))

    # Compute once, since these never change. The ``with`` block closes the
    # file handle as soon as unpickling finishes.
    with open(os.path.join('../../data',
                           'graphWithCitations')) as graphFile:
        graph, nodeIndex = cPickle.load(graphFile)

    # Compute APCPA adjacency matrix by multiplying the APC and CPA halves
    apcAdjMatrix, extraData = getMetaPathAdjacencyData(
        graph, nodeIndex, ['author', 'paper', 'conference'], rows=True)
    cpaAdjMatrix, data = getMetaPathAdjacencyData(
        graph, nodeIndex, ['conference', 'paper', 'author'])
    apcpaAdjMatrix = lil_matrix(apcAdjMatrix * cpaAdjMatrix)

    # Correct the toNodes content in extraData: it came from the APC half,
    # so its "to" entries are replaced with the ones from the CPA half.
    extraData['toNodes'] = data['toNodes']
    extraData['toNodesIndex'] = data['toNodesIndex']

    for testAuthor in testAuthors:
        experiment.runFor(testAuthor, apcpaAdjMatrix, extraData,
                          citationCounts, publicationCounts)
def run(citationCounts, publicationCounts):
    """
    Run the APCPA PathSim experiment for each author in ``testAuthors``.

    :param citationCounts: forwarded unchanged to ``experiment.runFor``
    :param publicationCounts: forwarded unchanged to ``experiment.runFor``
    """
    experiment = AuthorsPathSimAPCPAExperiment(
        'Most Similar APCPA PathSim Authors',
        outputFilePath=os.path.join('../../results', 'authors', 'apcpaPathSim')
    )

    # Compute once, since these never change. The ``with`` block closes the
    # file handle as soon as unpickling finishes.
    with open(os.path.join('../../data', 'graphWithCitations')) as graphFile:
        graph, nodeIndex = cPickle.load(graphFile)

    # Compute APCPA adjacency matrix by multiplying the APC and CPA halves
    apcAdjMatrix, extraData = getMetaPathAdjacencyData(graph, nodeIndex, ['author', 'paper', 'conference'], rows=True)
    cpaAdjMatrix, data = getMetaPathAdjacencyData(graph, nodeIndex, ['conference', 'paper', 'author'])
    apcpaAdjMatrix = lil_matrix(apcAdjMatrix * cpaAdjMatrix)

    # Correct the toNodes content in extraData: it came from the APC half,
    # so its "to" entries are replaced with the ones from the CPA half.
    extraData['toNodes'] = data['toNodes']
    extraData['toNodesIndex'] = data['toNodesIndex']

    for testAuthor in testAuthors:
        experiment.runFor(testAuthor, apcpaAdjMatrix, extraData, citationCounts, publicationCounts)
def getPartialMetaPath(graph, metaPathPart, nodeIndex, repetitions):
    """
    Build the adjacency matrices for one part of a longer meta path.

    :param graph: graph handed to ``getMetaPathAdjacencyData``
    :param metaPathPart: sequence of node type names forming the partial path
    :param nodeIndex: node index handed to ``getMetaPathAdjacencyData``
    :param repetitions: number of copies of the matrix to return when the
        partial path starts and ends on the same node type
    :return: ``(adjMatrices, adjMatrix)`` where ``adjMatrix`` is the matrix
        for ``metaPathPart`` itself and ``adjMatrices`` is the list built
        from it as described below
    """
    adjMatrix, _ = getMetaPathAdjacencyData(graph, nodeIndex, metaPathPart)
    if metaPathPart[0] == metaPathPart[-1]:
        # Same node type at both ends: reuse the one matrix ``repetitions`` times
        adjMatrices = [adjMatrix] * repetitions
    else:
        # Different end types: pair the matrix with the one for the reversed
        # path. Without this ``else`` the list above was always overwritten,
        # and ``adjMatrices`` was unbound whenever the end types differed.
        otherAdjMatrix, _ = getMetaPathAdjacencyData(graph, nodeIndex, list(reversed(metaPathPart)))
        adjMatrices = [adjMatrix, otherAdjMatrix]
    return adjMatrices, adjMatrix
Пример #4
def getPartialMetaPath(graph, metaPathPart, nodeIndex, repetitions):
    """
    Build the adjacency matrices for one part of a longer meta path.

    :param graph: graph handed to ``getMetaPathAdjacencyData``
    :param metaPathPart: sequence of node type names forming the partial path
    :param nodeIndex: node index handed to ``getMetaPathAdjacencyData``
    :param repetitions: number of copies of the matrix to return when the
        partial path starts and ends on the same node type
    :return: ``(adjMatrices, adjMatrix)`` where ``adjMatrix`` is the matrix
        for ``metaPathPart`` itself and ``adjMatrices`` is the list built
        from it as described below
    """
    # NOTE(review): the final argument was cut off in this copy; it is taken
    # from the unwrapped variant of this function found in the same file.
    adjMatrix, _ = getMetaPathAdjacencyData(graph, nodeIndex,
                                            metaPathPart)
    if metaPathPart[0] == metaPathPart[-1]:
        # Same node type at both ends: reuse the one matrix ``repetitions`` times
        adjMatrices = [adjMatrix] * repetitions
    else:
        # Different end types: pair the matrix with the one for the reversed
        # path. Without this ``else`` the list above was always overwritten,
        # and ``adjMatrices`` was unbound whenever the end types differed.
        otherAdjMatrix, _ = getMetaPathAdjacencyData(
            graph, nodeIndex, list(reversed(metaPathPart)))
        adjMatrices = [adjMatrix, otherAdjMatrix]
    return adjMatrices, adjMatrix
def run():
    """
    Time three ways of obtaining meta path adjacency data and pickle the results.

    For every configured meta path this measures, with ``timeit`` and
    ``number=10``: building the full adjacency data directly, building the
    partial-path matrices via ``getPartialMetaPath``, and multiplying the
    partial matrices via ``multiplyFullAdjMatrix``. Each measurement is
    printed and the collected results are pickled to the file ``results``.
    """

    # Experiments to run with meta path lengths (map of length to trial paths)
    # NOTE(review): the closing brackets of this literal were missing; only
    # the paths that were still visible are listed -- confirm none were lost.
    p, a, t, c = 'paper', 'author', 'term', 'conference'
    metaPathLengthExperiments = {
        3: [
            [a, p, a],
        ],
        4: [
            [a, p, p, a]
        ],
        5: [
            [a, p, a, p, a],
        ],
        7: [
            [a, p, p, a, p, p, a]
        ],
    }

    # The ``with`` block closes the file handle once unpickling finishes
    with open(os.path.join('../', 'data', 'graphWithCitations')) as graphFile:
        graph, nodeIndex = cPickle.load(graphFile)

    # Map of path length to a list of
    # (fullTime, partialTime, multiplyTime, bytesForMatrices) tuples. Each time
    # is the total over 10 runs, as returned by timeit.timeit.
    metaPathLengthExperimentResults = defaultdict(list)

    for pathLength in sorted(metaPathLengthExperiments.keys()):
        for metaPath in metaPathLengthExperiments[pathLength]:

            # Time getting adjacency matrix directly
            fullTime = timeit.timeit(lambda: getMetaPathAdjacencyData(graph, nodeIndex, metaPath), number=10)

            # Split meta path (``//`` keeps the integer division explicit)
            if pathLength in {3, 5}:
                metaPathPart = [p, a, p] if metaPath[0] == p else [a, p, a]
                repetitions = ((len(metaPath) - 1) // 2)
            else: # 4, 7 -- only repeat twice
                metaPathPart = metaPath[:(len(metaPath) // 2 + 1)]
                print(metaPathPart)
                repetitions = 2

            # Find the partial meta path adjacency list
            adjMatrices, adjMatrix = getPartialMetaPath(graph, metaPathPart, nodeIndex, repetitions)
            partialTime = timeit.timeit(lambda: getPartialMetaPath(graph, metaPathPart, nodeIndex, repetitions), number=10)

            # Get the number of bytes to store partial adj matrices
            # (sys.getsizeof is shallow: it does not follow references held by the object)
            bytesForMatrices = sys.getsizeof(adjMatrix)

            # Multiply for full adj matrix
            multiplyTime = timeit.timeit(lambda: multiplyFullAdjMatrix(adjMatrices, repetitions), number=10)

            # Output results
            # NOTE(review): the statement wrapping this tuple was missing; the
            # append is inferred from the defaultdict(list) that is pickled below.
            metaPathLengthExperimentResults[pathLength].append((
                fullTime, partialTime, multiplyTime, bytesForMatrices
            ))
            print("Full Path: %.3f seconds, Partial Paths: %.3f seconds, Multiplication Only: %.3f, Bytes: %d  [%s]" % (
                fullTime, partialTime, multiplyTime, bytesForMatrices, ', '.join(metaPath)
            ))

    with open('results', 'w') as resultsFile:
        cPickle.dump(metaPathLengthExperimentResults, resultsFile)
def run(citationCounts, publicationCounts):
    """
    Run the CPCPPA NeighborSim experiment for each author in ``testAuthors``.

    :param citationCounts: forwarded unchanged to ``experiment.runFor``
    :param publicationCounts: forwarded unchanged to ``experiment.runFor``
    """
    experiment = AuthorsNeighborSimCPCPPAExperiment(
        'Most Similar CPCPPA NeighborSim Authors',
        outputFilePath=os.path.join('../../results', 'authors', 'cpcppaNeighborSim')
    )

    # Compute once, since these never change. The ``with`` block closes the
    # file handle as soon as unpickling finishes.
    with open(os.path.join('../../data', 'graphWithCitations')) as graphFile:
        graph, nodeIndex = cPickle.load(graphFile)

    # Only the CPPA call's extraData is used below, so the CPC one is discarded
    cpcAdjMatrix, _ = getMetaPathAdjacencyData(graph, nodeIndex, ['conference', 'paper', 'conference'])
    cppaAdjMatrix, extraData = getMetaPathAdjacencyData(graph, nodeIndex, ['conference', 'paper', 'paper', 'author'])
    cpcppaAdjMatrix = cpcAdjMatrix * cppaAdjMatrix

    # Make the "from" entries of extraData mirror its "to" entries
    extraData['fromNodes'] = extraData['toNodes']
    extraData['fromNodesIndex'] = extraData['toNodesIndex']

    for testAuthor in testAuthors:
        experiment.runFor(testAuthor, cpcppaAdjMatrix, extraData, citationCounts, publicationCounts)
Пример #7
def run():
    """
    Run the PTP PathSim experiment for each paper in ``testPapers``.
    """
    # NOTE(review): the title says "PCP" although the PTP path is computed
    # below -- left untouched because it is a runtime string; confirm intent.
    experiment = PapersPathSimPTPExperiment(
        None, 'Most Similar PCP PathSim Papers', outputFilePath='results/papers/ptpPathSim')

    # Compute once, since these never change. The ``with`` block closes the
    # file handle as soon as unpickling finishes.
    with open(os.path.join('../../data', 'graphWithCitations')) as graphFile:
        graph, nodeIndex = cPickle.load(graphFile)

    # Compute PTP adjacency matrix by multiplying the PT and TP halves
    ptAdjMatrix, extraData = getMetaPathAdjacencyData(graph, nodeIndex, ['paper', 'term'], rows=True)
    tpAdjMatrix, data = getMetaPathAdjacencyData(graph, nodeIndex, ['term', 'paper'])
    ptpAdjMatrix = lil_matrix(ptAdjMatrix * tpAdjMatrix)

    # Correct the toNodes content in extraData: it came from the PT half,
    # so its "to" entries are replaced with the ones from the TP half.
    extraData['toNodes'] = data['toNodes']
    extraData['toNodesIndex'] = data['toNodesIndex']

    for testPaper in testPapers:
        experiment.runFor(testPaper, ptpAdjMatrix, extraData)
def run():
    """
    Run the PAP PathSim experiment for each paper in ``testPapers``.
    """
    experiment = PapersPathSimPAPExperiment(
        None, 'Most Similar PAP PathSim Papers', outputFilePath='results/papers/papPathSim')

    # Compute once, since these never change. The ``with`` block closes the
    # file handle as soon as unpickling finishes.
    with open(os.path.join('../../data', 'graphWithCitations')) as graphFile:
        graph, nodeIndex = cPickle.load(graphFile)

    # Compute PAP adjacency matrix by multiplying the PA and AP halves
    paAdjMatrix, extraData = getMetaPathAdjacencyData(graph, nodeIndex, ['paper', 'author'], rows=True)
    apAdjMatrix, data = getMetaPathAdjacencyData(graph, nodeIndex, ['author', 'paper'])
    papAdjMatrix = lil_matrix(paAdjMatrix * apAdjMatrix)

    # Correct the toNodes content in extraData: it came from the PA half,
    # so its "to" entries are replaced with the ones from the AP half.
    extraData['toNodes'] = data['toNodes']
    extraData['toNodesIndex'] = data['toNodesIndex']

    for testPaper in testPapers:
        experiment.runFor(testPaper, papAdjMatrix, extraData)
def run():
    """
    Run the PTP PathSim experiment for each paper in ``testPapers``.
    """
    # NOTE(review): the title says "PCP" although the PTP path is computed
    # below -- left untouched because it is a runtime string; confirm intent.
    experiment = PapersPathSimPTPExperiment(
        None, "Most Similar PCP PathSim Papers", outputFilePath="results/papers/ptpPathSim"
    )

    # Compute once, since these never change. The ``with`` block closes the
    # file handle as soon as unpickling finishes.
    with open(os.path.join("../../data", "graphWithCitations")) as graphFile:
        graph, nodeIndex = cPickle.load(graphFile)

    # Compute PTP adjacency matrix by multiplying the PT and TP halves
    ptAdjMatrix, extraData = getMetaPathAdjacencyData(graph, nodeIndex, ["paper", "term"], rows=True)
    tpAdjMatrix, data = getMetaPathAdjacencyData(graph, nodeIndex, ["term", "paper"])
    ptpAdjMatrix = lil_matrix(ptAdjMatrix * tpAdjMatrix)

    # Correct the toNodes content in extraData: it came from the PT half,
    # so its "to" entries are replaced with the ones from the TP half.
    extraData["toNodes"] = data["toNodes"]
    extraData["toNodesIndex"] = data["toNodesIndex"]

    for testPaper in testPapers:
        experiment.runFor(testPaper, ptpAdjMatrix, extraData)
def run():
    """
    Run the APP NeighborSim experiment for each paper in ``testPapers``.
    """
    experiment = PapersNeighborSimAPPExperiment(
        None, 'Most Similar APP NeighborSim Authors', outputFilePath='results/papers/appNeighborSim')

    # Compute once, since these never change. The ``with`` block closes the
    # file handle as soon as unpickling finishes.
    with open(os.path.join('../../data', 'graphWithCitations')) as graphFile:
        graph, nodeIndex = cPickle.load(graphFile)
    appAdjMatrix, extraData = getMetaPathAdjacencyData(graph, nodeIndex, ['author', 'paper', 'paper'])

    # Make the "from" entries of extraData mirror its "to" entries
    extraData['fromNodes'] = extraData['toNodes']
    extraData['fromNodesIndex'] = extraData['toNodesIndex']

    for testPaper in testPapers:
        experiment.runFor(testPaper, appAdjMatrix, extraData)
Пример #11
def run(citationCounts, publicationCounts):
    """
    Run the CPCPPA NeighborSim experiment for each author in ``testAuthors``.

    :param citationCounts: forwarded unchanged to ``experiment.runFor``
    :param publicationCounts: forwarded unchanged to ``experiment.runFor``
    """
    # NOTE(review): the last path component was missing from this copy; it is
    # taken from the unwrapped CPCPPA variant of run() found in the same file.
    experiment = AuthorsNeighborSimCPCPPAExperiment(
        'Most Similar CPCPPA NeighborSim Authors',
        outputFilePath=os.path.join('../../results', 'authors',
                                    'cpcppaNeighborSim'))

    # Compute once, since these never change. The ``with`` block closes the
    # file handle as soon as unpickling finishes.
    with open(os.path.join('../../data',
                           'graphWithCitations')) as graphFile:
        graph, nodeIndex = cPickle.load(graphFile)

    # Only the CPPA call's extraData is used below, so the CPC one is discarded
    cpcAdjMatrix, _ = getMetaPathAdjacencyData(
        graph, nodeIndex, ['conference', 'paper', 'conference'])
    cppaAdjMatrix, extraData = getMetaPathAdjacencyData(
        graph, nodeIndex, ['conference', 'paper', 'paper', 'author'])
    cpcppaAdjMatrix = cpcAdjMatrix * cppaAdjMatrix

    # Make the "from" entries of extraData mirror its "to" entries
    extraData['fromNodes'] = extraData['toNodes']
    extraData['fromNodesIndex'] = extraData['toNodesIndex']

    for testAuthor in testAuthors:
        experiment.runFor(testAuthor, cpcppaAdjMatrix, extraData,
                          citationCounts, publicationCounts)
def run():
    """
    Run the TPP NeighborSim experiment for each paper in ``testPapers``.
    """
    experiment = PapersNeighborSimTPPExperiment(
        None, "Most Similar TPP NeighborSim Authors", outputFilePath="results/papers/tppNeighborSim"
    )

    # Compute once, since these never change. The ``with`` block closes the
    # file handle as soon as unpickling finishes.
    with open(os.path.join("../../data", "graphWithCitations")) as graphFile:
        graph, nodeIndex = cPickle.load(graphFile)
    tppAdjMatrix, extraData = getMetaPathAdjacencyData(graph, nodeIndex, ["term", "paper", "paper"])

    # Make the "from" entries of extraData mirror its "to" entries
    extraData["fromNodes"] = extraData["toNodes"]
    extraData["fromNodesIndex"] = extraData["toNodesIndex"]

    for testPaper in testPapers:
        experiment.runFor(testPaper, tppAdjMatrix, extraData)
Пример #13
def run():
    """
    Run the PPA NeighborSim experiment and derive per-author counts.

    Reads per-paper citation counts from ``../../data/paperCitationCounts``,
    sums them per author over the author's successor nodes that are papers,
    writes the per-author totals to ``../../data/authorCitationCounts`` and
    then runs the experiment for each author in ``testAuthors``.

    :return: ``(citationCounts, publicationCounts)``, two ``defaultdict(int)``
        objects keyed by author node
    """
    # NOTE(review): several call tails were missing from this copy (the last
    # path components, the sort key, the write loop's iterable and the final
    # runFor argument); they are taken from the unwrapped PPA variant of
    # run() found in the same file.
    experiment = AuthorsNeighborSimPPAExperiment(
        'Most Similar PPA NeighborSim Authors',
        outputFilePath=os.path.join('../../results', 'authors',
                                    'ppaNeighborSim'))

    # Compute once, since these never change. The ``with`` block closes the
    # file handle as soon as unpickling finishes.
    with open(os.path.join('../../data',
                           'graphWithCitations')) as graphFile:
        graph, nodeIndex = cPickle.load(graphFile)
    ppaAdjMatrix, extraData = getMetaPathAdjacencyData(
        graph, nodeIndex, ['paper', 'paper', 'author'])

    # Make the "from" entries of extraData mirror its "to" entries
    extraData['fromNodes'] = extraData['toNodes']
    extraData['fromNodesIndex'] = extraData['toNodesIndex']

    # Read paper citation counts; each line is split at its first ': ' into
    # an integer count and a title
    paperCitationCounts = {}
    with open(os.path.join('../../data',
                           'paperCitationCounts')) as paperCitationsFile:
        for line in paperCitationsFile:
            splitIndex = line.find(': ')
            count = int(line[:splitIndex])
            title = line[splitIndex + 2:].strip()
            paperCitationCounts[title] = count

    # Compute author publication counts
    allPapers = set(nodeIndex['paper'].values())
    allAuthors = set(nodeIndex['author'].values())
    publicationCounts, citationCounts = defaultdict(int), defaultdict(int)
    for author in allAuthors:
        for node in graph.successors(author):
            if node in allPapers:
                publicationCounts[author] += 1
                # Papers without a recorded count contribute nothing
                citationCounts[author] += paperCitationCounts.get(node, 0)

    # Output author citation counts, in ascending order of count. A plain
    # loop replaces the tuple-parameter lambda passed to map(): the write is
    # a side effect, and the loop does not depend on map() being eager.
    citationCountsList = sorted(citationCounts.items(),
                                key=operator.itemgetter(1))
    with open(os.path.join('../../data', 'authorCitationCounts'),
              'w') as outputFile:
        for author, count in citationCountsList:
            outputFile.write('%d: %s\n' % (int(count), author))

    for testAuthor in testAuthors:
        experiment.runFor(testAuthor, ppaAdjMatrix, extraData, citationCounts,
                          publicationCounts)

    return citationCounts, publicationCounts
def run(citationCounts, publicationCounts):
    """
    Run the CPPA NeighborSim experiment for each author in ``testAuthors``.

    :param citationCounts: forwarded unchanged to ``experiment.runFor``
    :param publicationCounts: forwarded unchanged to ``experiment.runFor``
    """
    experiment = AuthorsNeighborSimCPPAExperiment(
        "Most Similar CPPA NeighborSim Authors",
        outputFilePath=os.path.join("../../results", "authors", "cppaNeighborSim"),
    )

    # Compute once, since these never change. The ``with`` block closes the
    # file handle as soon as unpickling finishes.
    with open(os.path.join("../../data", "graphWithCitations")) as graphFile:
        graph, nodeIndex = cPickle.load(graphFile)
    cppaAdjMatrix, extraData = getMetaPathAdjacencyData(graph, nodeIndex, ["conference", "paper", "paper", "author"])

    # Make the "from" entries of extraData mirror its "to" entries
    extraData["fromNodes"] = extraData["toNodes"]
    extraData["fromNodesIndex"] = extraData["toNodesIndex"]

    for testAuthor in testAuthors:
        experiment.runFor(testAuthor, cppaAdjMatrix, extraData, citationCounts, publicationCounts)
Пример #15
def run():
    """
    Run the TPP NeighborSim experiment for each paper in ``testPapers``.
    """
    # NOTE(review): only the title argument is visible in this copy and the
    # call was never closed. It is closed here as shown; any further
    # arguments (e.g. an outputFilePath) may have been lost -- confirm
    # against the experiment class.
    experiment = PapersNeighborSimTPPExperiment(
        'Most Similar TPP NeighborSim Authors',
    )

    # Compute once, since these never change. The ``with`` block closes the
    # file handle as soon as unpickling finishes.
    with open(os.path.join('../../data',
                           'graphWithCitations')) as graphFile:
        graph, nodeIndex = cPickle.load(graphFile)
    tppAdjMatrix, extraData = getMetaPathAdjacencyData(
        graph, nodeIndex, ['term', 'paper', 'paper'])

    # Make the "from" entries of extraData mirror its "to" entries
    extraData['fromNodes'] = extraData['toNodes']
    extraData['fromNodesIndex'] = extraData['toNodesIndex']

    for testPaper in testPapers:
        experiment.runFor(testPaper, tppAdjMatrix, extraData)
def run():
    """
    Run the TPPC experiment for each conference in ``testConferences``.

    Conference publication and citation statistics are unpickled from
    ``../../data/conferenceStats`` and forwarded to ``experiment.runFor``.
    """
    experiment = ConferencesPathSimTPPCExperiment(
        'Most Similar TPPC NeighborSim Conferences',
        outputFilePath=os.path.join('..', '..', 'results', 'conferences', 'tppcNeighborSim')
    )

    # Compute once, since these never change. The ``with`` block closes the
    # file handle as soon as unpickling finishes.
    with open(os.path.join('..', '..', 'data', 'graphWithCitations')) as graphFile:
        graph, nodeIndex = cPickle.load(graphFile)
    tppcAdjMatrix, extraData = getMetaPathAdjacencyData(
        graph, nodeIndex, ['term', 'paper', 'paper', 'conference']
    )

    # Make the "from" entries of extraData mirror its "to" entries
    extraData['fromNodes'] = extraData['toNodes']
    extraData['fromNodesIndex'] = extraData['toNodesIndex']

    with open(os.path.join('..', '..', 'data', 'conferenceStats')) as statsFile:
        confPublications, confCitations = cPickle.load(statsFile)

    # Actually run the similarity experiments
    for testConference in testConferences:
        experiment.runFor(testConference, tppcAdjMatrix, extraData, confPublications, confCitations)
def run():
    """
    Run the PPA NeighborSim experiment and derive per-author counts.

    Reads per-paper citation counts from ``../../data/paperCitationCounts``,
    sums them per author over the author's successor nodes that are papers,
    writes the per-author totals to ``../../data/authorCitationCounts`` and
    then runs the experiment for each author in ``testAuthors``.

    :return: ``(citationCounts, publicationCounts)``, two ``defaultdict(int)``
        objects keyed by author node
    """
    experiment = AuthorsNeighborSimPPAExperiment(
        'Most Similar PPA NeighborSim Authors',
        outputFilePath=os.path.join('../../results', 'authors', 'ppaNeighborSim')
    )

    # Compute once, since these never change. The ``with`` block closes the
    # file handle as soon as unpickling finishes.
    with open(os.path.join('../../data', 'graphWithCitations')) as graphFile:
        graph, nodeIndex = cPickle.load(graphFile)
    ppaAdjMatrix, extraData = getMetaPathAdjacencyData(graph, nodeIndex, ['paper', 'paper', 'author'])

    # Make the "from" entries of extraData mirror its "to" entries
    extraData['fromNodes'] = extraData['toNodes']
    extraData['fromNodesIndex'] = extraData['toNodesIndex']

    # Read paper citation counts; each line is split at its first ': ' into
    # an integer count and a title
    paperCitationCounts = {}
    with open(os.path.join('../../data', 'paperCitationCounts')) as paperCitationsFile:
        for line in paperCitationsFile:
            splitIndex = line.find(': ')
            count = int(line[:splitIndex])
            title = line[splitIndex + 2:].strip()
            paperCitationCounts[title] = count

    # Compute author publication counts
    allPapers = set(nodeIndex['paper'].values())
    allAuthors = set(nodeIndex['author'].values())
    publicationCounts, citationCounts = defaultdict(int), defaultdict(int)
    for author in allAuthors:
        for node in graph.successors(author):
            if node in allPapers:
                publicationCounts[author] += 1
                # Papers without a recorded count contribute nothing
                citationCounts[author] += paperCitationCounts.get(node, 0)

    # Output author citation counts, in ascending order of count. A plain
    # loop replaces the tuple-parameter lambda passed to map(): the write is
    # a side effect, and the loop does not depend on map() being eager.
    citationCountsList = sorted(citationCounts.items(), key=operator.itemgetter(1))
    with open(os.path.join('../../data', 'authorCitationCounts'), 'w') as outputFile:
        for author, count in citationCountsList:
            outputFile.write('%d: %s\n' % (int(count), author))

    for testAuthor in testAuthors:
        experiment.runFor(testAuthor, ppaAdjMatrix, extraData, citationCounts, publicationCounts)

    return citationCounts, publicationCounts
def run():
    """
    Run the TPPC experiment for each conference in ``testConferences``.

    Conference publication and citation statistics are unpickled from
    ``../../data/conferenceStats`` and forwarded to ``experiment.runFor``.
    """
    # NOTE(review): the last path component was missing from this copy; it is
    # taken from the unwrapped TPPC variant of run() found in the same file.
    experiment = ConferencesPathSimTPPCExperiment(
        'Most Similar TPPC NeighborSim Conferences',
        outputFilePath=os.path.join('..', '..', 'results', 'conferences',
                                    'tppcNeighborSim'))

    # Compute once, since these never change. The ``with`` block closes the
    # file handle as soon as unpickling finishes.
    with open(os.path.join('..', '..', 'data',
                           'graphWithCitations')) as graphFile:
        graph, nodeIndex = cPickle.load(graphFile)
    tppcAdjMatrix, extraData = getMetaPathAdjacencyData(
        graph, nodeIndex, ['term', 'paper', 'paper', 'conference'])

    # Make the "from" entries of extraData mirror its "to" entries
    extraData['fromNodes'] = extraData['toNodes']
    extraData['fromNodesIndex'] = extraData['toNodesIndex']

    with open(os.path.join('..', '..', 'data',
                           'conferenceStats')) as statsFile:
        confPublications, confCitations = cPickle.load(statsFile)

    # Actually run the similarity experiments
    for testConference in testConferences:
        experiment.runFor(testConference, tppcAdjMatrix, extraData,
                          confPublications, confCitations)
Пример #19
def run():
    """
    Time three ways of obtaining meta path adjacency data and pickle the results.

    For every configured meta path this measures, with ``timeit`` and
    ``number=10``: building the full adjacency data directly, building the
    partial-path matrices via ``getPartialMetaPath``, and multiplying the
    partial matrices via ``multiplyFullAdjMatrix``. Each measurement is
    printed and the collected results are pickled to the file ``results``.
    """

    # Experiments to run with meta path lengths (map of length to trial paths)
    # NOTE(review): the closing brackets of this literal were missing; only
    # the paths that were still visible are listed -- confirm none were lost.
    p, a, t, c = 'paper', 'author', 'term', 'conference'
    metaPathLengthExperiments = {
        3: [
            [a, p, a],
        ],
        4: [[a, p, p, a]],
        5: [
            [a, p, a, p, a],
        ],
        7: [[a, p, p, a, p, p, a]],
    }

    # The ``with`` block closes the file handle once unpickling finishes
    with open(os.path.join('../', 'data',
                           'graphWithCitations')) as graphFile:
        graph, nodeIndex = cPickle.load(graphFile)

    # Map of path length to a list of
    # (fullTime, partialTime, multiplyTime, bytesForMatrices) tuples. Each time
    # is the total over 10 runs, as returned by timeit.timeit.
    metaPathLengthExperimentResults = defaultdict(list)

    for pathLength in sorted(metaPathLengthExperiments.keys()):
        for metaPath in metaPathLengthExperiments[pathLength]:

            # Time getting adjacency matrix directly
            # NOTE(review): ``number=10`` was cut off in this copy of the
            # three timeit calls; it is taken from the unwrapped variant of
            # this function found in the same file.
            fullTime = timeit.timeit(
                lambda: getMetaPathAdjacencyData(graph, nodeIndex, metaPath),
                number=10)

            # Split meta path (``//`` keeps the integer division explicit)
            if pathLength in {3, 5}:
                metaPathPart = [p, a, p] if metaPath[0] == p else [a, p, a]
                repetitions = ((len(metaPath) - 1) // 2)
            else:  # 4, 7 -- only repeat twice
                metaPathPart = metaPath[:(len(metaPath) // 2 + 1)]
                print(metaPathPart)
                repetitions = 2

            # Find the partial meta path adjacency list
            adjMatrices, adjMatrix = getPartialMetaPath(
                graph, metaPathPart, nodeIndex, repetitions)
            partialTime = timeit.timeit(lambda: getPartialMetaPath(
                graph, metaPathPart, nodeIndex, repetitions),
                                        number=10)

            # Get the number of bytes to store partial adj matrices
            # (sys.getsizeof is shallow: it does not follow references held
            # by the object)
            bytesForMatrices = sys.getsizeof(adjMatrix)

            # Multiply for full adj matrix
            multiplyTime = timeit.timeit(
                lambda: multiplyFullAdjMatrix(adjMatrices, repetitions),
                number=10)

            # Output results
            # NOTE(review): the statement wrapping this tuple was missing; the
            # append is inferred from the defaultdict(list) that is pickled
            # below.
            metaPathLengthExperimentResults[pathLength].append(
                (fullTime, partialTime, multiplyTime, bytesForMatrices))
            print("Full Path: %.3f seconds, Partial Paths: %.3f seconds, Multiplication Only: %.3f, Bytes: %d  [%s]" % (
                fullTime, partialTime, multiplyTime, bytesForMatrices,
                ', '.join(metaPath)))

    with open('results', 'w') as resultsFile:
        cPickle.dump(metaPathLengthExperimentResults, resultsFile)
Пример #20
def imbalancedCitationsPublicationsExample():
      Illustrative example of imbalanced citations / publications to verify ShapeSim is working correctly

    graph = MultiDiGraph()
    authors = ['Alice', 'Bob', 'Carol', 'Dave', 'Ed', 'Frank']
    conference = 'KDD'

    # Citation & publication count configuration
    citationsPublications = {
        'Alice': (100, 10),
        'Bob': (80, 10),
        'Carol': (100, 100),
        'Dave': (50, 10),
        'Ed': (10, 10),
        'Frank': (1000, 100)

    actualCitationsPublications = defaultdict(lambda: (0, 0))

    # Helper functions for repeatedly adding papers to the graph
    addPapersToAuthor = lambda n, author: [addPublicationPaper(author) for _ in itertools.repeat(None, n)]
    addCitationsToPaper = lambda n, paper, author: [addCitationPaper(paper, author) for _ in itertools.repeat(None, n)]

    # Helper for getting the next id
    def __getNextId():
        global nextId
        oldId = nextId
        nextId += 1
        return oldId

    def addPublicationPaper(author):
          Helper method to add a 'publication' paper, connected to both an author and a conference
        paper = "%s's Paper %d" % (author, (__getNextId()))
        graph.add_edges_from([(author, paper), (paper, author), (paper, conference), (conference, paper)])

        citationCount, publicationCount = actualCitationsPublications[author]
        actualCitationsPublications[author] = (citationCount, publicationCount + 1)

        return paper

    def addCitationPaper(citedPaper, citedAuthor):
          Helper method to add a 'citation' paper, which is only connected to the conference and the paper it cites
        citingPaper = "Citing Paper %d" % __getNextId()
        graph.add_edges_from([(conference, citingPaper), (citingPaper, conference), (citingPaper, citedPaper)])

        citationCount, publicationCount = actualCitationsPublications[citedAuthor]
        actualCitationsPublications[citedAuthor] = (citationCount + 1, publicationCount)

        return citingPaper

    allPapers = []

    # Construct the graph
    graph.add_nodes_from(authors + [conference])
    for authorName in citationsPublications:
        citationCount, publicationCount = citationsPublications[authorName]

        # Add citations & publications to author
        authorPapers = addPapersToAuthor(publicationCount, authorName)
        citationsPerPaper = citationCount / publicationCount
        for paper in authorPapers:
            citingPapers = addCitationsToPaper(citationsPerPaper, paper, authorName)

    nodeIndex = {
        'paper': {i: allPapers[i] for i in xrange(0, len(allPapers))},
        'conference': {0: 'KDD'},
        'author': {0: 'Alice', 1: 'Bob', 2: 'Carol', 3: 'Dave', 4: 'Ed', 5: 'Frank'}

    # Test PathSim / NeighborSim
    cpaAdjMatrix, extraData = getMetaPathAdjacencyData(graph, nodeIndex, ['conference', 'paper', 'author'])
    extraData['fromNodes'] = extraData['toNodes']
    extraData['fromNodesIndex'] = extraData['toNodesIndex']
    neighborSimMostSimilar, similarityScores = findMostSimilarNodes(
        cpaAdjMatrix, 'Alice', extraData, method=getNeighborSimScore

    # Test ShapeSim
    cppaAdjTensor, extraData = getMetaPathAdjacencyTensorData(
        graph, nodeIndex, ['conference', 'paper', 'paper', 'author']
    extraData['fromNodes'] = extraData['toNodes']
    extraData['fromNodesIndex'] = extraData['toNodesIndex']
    shapeSimMostSimilar, similarityScores = findMostSimilarNodes(
        cppaAdjTensor, 'Alice', extraData, method=getNumpyShapeSimScore, alpha=1.0

    # Output similarity scores
    for name, mostSimilar in [('NeighborSim', neighborSimMostSimilar), ('ShapeSim', shapeSimMostSimilar)]:
        print('\n%s Most Similar to "%s":' % (name, 'Alice'))
        mostSimilarTable = texttable.Texttable()
        rows = [['Author', 'Score']]
        rows += [[name, score] for name, score in mostSimilar]