Exemplo n.º 1
0
    def setUp(self):
        """
          Build the small template graph shared by the tests and fetch the implementation under test.

          The graph objects are stored on the test case (self.author, self.paper1, ...) so that individual
          tests can refer to them; the graph itself is stored as self.templateGraph.
        """

        # Show full diffs when an assertion fails
        self.maxDiff = None

        templateGraph = GraphFactory.createInstance()

        # Graph objects, kept on the test case for later reference
        self.author = Author(0, 'author')
        self.coauthor = Author(1, 'coauthor')
        self.conference1 = Conference(0, 'conference1')
        self.conference2 = Conference(1, 'conference2')
        self.paper1 = Paper(0, 'paper1')
        self.paper2 = Paper(1, 'paper2')
        self.paper3 = Paper(2, 'paper3')

        # NOTE(review): self.coauthor is not part of this node list; it only enters the graph through the
        # paper3 authorship edge below -- confirm this is intentional
        templateGraph.addNodes([
            self.author, self.conference1, self.conference2, self.paper1,
            self.paper2, self.paper3
        ])

        # Authorship: the main author is linked to all three papers, the co-author to paper3 only
        for authoredPaper in (self.paper1, self.paper2, self.paper3):
            templateGraph.addBothEdges(authoredPaper, self.author, Authorship())
        templateGraph.addBothEdges(self.paper3, self.coauthor, Authorship())

        # Publication: paper1 & paper2 in conference1, paper3 in conference2
        publishedIn = (
            (self.paper1, self.conference1),
            (self.paper2, self.conference1),
            (self.paper3, self.conference2),
        )
        for publishedPaper, venue in publishedIn:
            templateGraph.addBothEdges(publishedPaper, venue, Publication())

        # Citations: a single edge paper1 -> paper2, and edges in both directions between paper2 and paper3
        templateGraph.addEdge(self.paper1, self.paper2, Citation())
        templateGraph.addBothEdges(self.paper2, self.paper3, Citation())

        self.templateGraph = templateGraph

        self.metaPathUtility = self._getImplementation()
Exemplo n.º 2
0
    def testCoAuthorsGraph(self):
        """
          Two uncited papers with the same title at different conferences; the first paper has two co-authors,
          the second has a single (third) author.
        """

        # Input, in the shape handed to buildGraph (paper id -> record)
        parsedData = {
            0: {
                'id': 0,
                'arnetid': 1,
                'authors': ['Author One', 'Author Three'],
                'conference': 'Conference One',
                'references': [],
                'title': 'Databases',
                'year': 1995
            },
            1: {
                'id': 1,
                'arnetid': 2,
                'authors': ['Author Two'],
                'conference': 'Conference Two',
                'references': [],
                'title': 'Databases',
                'year': 1999
            }
        }

        expectedGraph = GraphFactory.createInstance()

        # Ids that are not given in the input are expected to auto-increment; 'Author Three' is listed on the
        # first paper, so it is expected to receive id 1, ahead of 'Author Two'
        author1 = Author(0, 'Author One')
        author2 = Author(2, 'Author Two')
        author3 = Author(1, 'Author Three')
        paper1 = Paper(0, 'Databases')
        paper2 = Paper(1, 'Databases')
        topic = Topic(0, ['databas'])
        conference1 = Conference(0, 'Conference One')
        conference2 = Conference(1, 'Conference Two')

        expectedNodes = (author1, author2, author3, paper1, paper2, topic, conference1, conference2)
        for node in expectedNodes:
            expectedGraph.addNode(node)

        # (first node, second node, edge class) -- a fresh edge object is created per pair
        expectedLinks = (
            (author1, paper1, Authorship),
            (author3, paper1, Authorship),
            (author2, paper2, Authorship),
            (paper1, topic, Mention),
            (paper2, topic, Mention),
            (paper1, conference1, Publication),
            (paper2, conference2, Publication),
        )
        for first, second, edgeClass in expectedLinks:
            expectedGraph.addBothEdges(first, second, edgeClass())

        actualGraph = self.dataImporter.buildGraph(parsedData)

        self.assertGraphsEqual(actualGraph, expectedGraph)
Exemplo n.º 3
0
    def testMutualCitationGraph(self):
        """
          Two papers by different authors at different conferences that reference each other.
        """

        # Input, in the shape handed to buildGraph (paper id -> record); each paper lists the other as reference
        parsedData = {
            0: {
                'id': 0,
                'arnetid': 1,
                'authors': ['Author One'],
                'conference': 'Conference One',
                'references': [1],
                'title': 'Databases',
                'year': 1999
            },
            1: {
                'id': 1,
                'arnetid': 2,
                'authors': ['Author Two'],
                'conference': 'Conference Two',
                'references': [0],
                'title': 'Databases',
                'year': 1999
            }
        }

        expectedGraph = GraphFactory.createInstance()

        # Ids that are not given in the input are expected to auto-increment
        author1 = Author(0, 'Author One')
        author2 = Author(1, 'Author Two')
        paper1 = Paper(0, 'Databases')
        paper2 = Paper(1, 'Databases')
        topic = Topic(0, ['databas'])
        conference1 = Conference(0, 'Conference One')
        conference2 = Conference(1, 'Conference Two')

        expectedNodes = (author1, author2, paper1, paper2, topic, conference1, conference2)
        for node in expectedNodes:
            expectedGraph.addNode(node)

        # (first node, second node, edge class) -- a fresh edge object is created per pair
        expectedLinks = (
            (author1, paper1, Authorship),
            (author2, paper2, Authorship),
            (paper1, topic, Mention),
            (paper2, topic, Mention),
            (paper1, conference1, Publication),
            (paper2, conference2, Publication),
            # Citation edges are symmetric in this case only, because the two papers cite each other
            (paper1, paper2, Citation),
        )
        for first, second, edgeClass in expectedLinks:
            expectedGraph.addBothEdges(first, second, edgeClass())

        actualGraph = self.dataImporter.buildGraph(parsedData)

        self.assertGraphsEqual(actualGraph, expectedGraph)
Exemplo n.º 4
0
    def testSeparatePapersAuthorsTopicSharedConferenceGraph(self):
        """
          Two uncited papers with different authors and different titles that share a single conference.
        """

        # Input, in the shape handed to buildGraph (paper id -> record)
        parsedData = {
            0: {
                'id': 0,
                'arnetid': 1,
                'authors': ['Author One'],
                'conference': 'Conference One',
                'references': [],
                'title': 'Databases',
                'year': 1995
            },
            1: {
                'id': 1,
                'arnetid': 2,
                'authors': ['Author Two'],
                'conference': 'Conference One',
                'references': [],
                'title': 'All The Knowledge',
                'year': 1999
            }
        }

        expectedGraph = GraphFactory.createInstance()

        # Ids that are not given in the input are expected to auto-increment; each title is expected to
        # contribute exactly one topic
        author1 = Author(0, 'Author One')
        author2 = Author(1, 'Author Two')
        paper1 = Paper(0, 'Databases')
        paper2 = Paper(1, 'All The Knowledge')
        topic1 = Topic(0, ['databas'])
        topic2 = Topic(1, ['knowledg'])
        conference = Conference(0, 'Conference One')

        expectedNodes = (author1, author2, paper1, paper2, topic1, topic2, conference)
        for node in expectedNodes:
            expectedGraph.addNode(node)

        # (first node, second node, edge class) -- a fresh edge object is created per pair
        expectedLinks = (
            (author1, paper1, Authorship),
            (author2, paper2, Authorship),
            (paper1, topic1, Mention),
            (paper2, topic2, Mention),
            (paper1, conference, Publication),
            (paper2, conference, Publication),
        )
        for first, second, edgeClass in expectedLinks:
            expectedGraph.addBothEdges(first, second, edgeClass())

        actualGraph = self.dataImporter.buildGraph(parsedData)

        self.assertGraphsEqual(actualGraph, expectedGraph)
Exemplo n.º 5
0
        def addCitationPaper(citedPaper, citedAuthor):
            """
              Add a new 'citing' paper to the enclosing graph: it is linked to the enclosing conference and
              cites citedPaper. Also bumps the citation tally recorded for citedAuthor.
            """
            paperId = SampleGraphUtility.__getNextId()
            newPaper = Paper(paperId, "Citing Paper %d" % paperId)
            graph.addNode(newPaper)
            graph.addBothEdges(newPaper, conference)

            # The citation itself is a single edge, from the citing paper to the cited one
            graph.addEdge(newPaper, citedPaper)

            # Tallies are stored as (citations, publications); only the first component changes here
            citations, publications = actualCitationsPublications[citedAuthor]
            actualCitationsPublications[citedAuthor] = (citations + 1, publications)
Exemplo n.º 6
0
        def addPublicationPaper(author):
            """
              Add a new 'publication' paper to the enclosing graph, linked to the given author and to the
              enclosing conference. Also bumps the publication tally recorded for the author.

              Returns the newly created paper.
            """
            paperId = SampleGraphUtility.__getNextId()
            newPaper = Paper(paperId, "%s's Paper %d" % (author.name, paperId))
            graph.addNode(newPaper)
            graph.addBothEdges(author, newPaper)
            graph.addBothEdges(newPaper, conference)

            # Tallies are stored as (citations, publications); only the second component changes here
            citations, publications = actualCitationsPublications[author]
            actualCitationsPublications[author] = (citations, publications + 1)

            return newPaper
Exemplo n.º 7
0
    def __addSimilarAuthorsPapers(graph, author, firstConference, secondConference, authorConferencePaperMap):
        """
          Helper that builds the papers & edges of one of the very similar authors in example 3
          (e.g. Mike, Mary, and Bob).

          The author always receives three papers: 'Paper 1' and 'Paper 2' published in firstConference, and
          'Paper 3' published in secondConference. Every paper is appended to
          authorConferencePaperMap[author][<conference>].
        """

        # The two papers of the first conference; ids are drawn in title order
        firstConferencePapers = [
            Paper(SampleGraphUtility.__getNextId(), title) for title in ('Paper 1', 'Paper 2')
        ]
        for paper in firstConferencePapers:
            graph.addNode(paper)
        for paper in firstConferencePapers:
            graph.addBothEdges(author, paper, Authorship())
        for paper in firstConferencePapers:
            graph.addBothEdges(paper, firstConference, Publication())
        authorConferencePaperMap[author][firstConference].extend(firstConferencePapers)

        # The single paper of the second conference
        lonePaper = Paper(SampleGraphUtility.__getNextId(), 'Paper 3')
        graph.addNode(lonePaper)
        graph.addBothEdges(author, lonePaper, Authorship())
        graph.addBothEdges(lonePaper, secondConference, Publication())
        authorConferencePaperMap[author][secondConference].append(lonePaper)
    def testCreateDBLPNode(self):
        """
          The factory should turn a dictionary describing a paper, author or conference into the equal node.
        """

        # (dictionary handed to the factory, node it is expected to produce)
        cases = [
            (
                {'type': 'Paper', 'id': 68, 'title': 'VLDB Paper 57'},
                Paper(id=68, title='VLDB Paper 57')
            ),
            (
                {'type': 'Author', 'id': 0, 'name': 'Mike'},
                Author(id=0, name='Mike')
            ),
            (
                {'type': 'Conference', 'id': 6, 'name': 'VLDB'},
                Conference(id=6, name='VLDB')
            ),
        ]

        for nodeDict, expectedNode in cases:
            actualNode = GraphObjectFactory.createDBLPNode(nodeDict)
            self.assertEqual(actualNode, expectedNode)
    def buildGraph(self, parsedData):
        """
          Assemble the DBLP graph from parsed paper records.

          parsedData maps a paper id to a record; the fields read here are 'title', 'conference', 'authors'
          and 'references' (ids of the papers this one cites).

          Conferences, authors and topics are created the first time they are seen and numbered from 0 in that
          order. Topics are the keywords extracted from the paper title. Authorship, publication and mention
          edges are added in both directions; citation edges only from the citing paper to the cited paper.

          Raises KeyError if a paper references an id that is not a key of parsedData.

          Returns the populated graph.
        """

        graph = GraphFactory.createInstance()

        # Lookup tables for the nodes created so far
        authorsByName = {}
        papersById = {}
        topicsByKeyword = {}
        conferencesByName = {}
        referencesByPaperId = {}  # Citations are deferred until every paper node exists

        # First pass: all nodes, and every edge except citations
        for paperId in parsedData:
            record = parsedData[paperId]
            title = record['title']

            paper = Paper(paperId, title)
            referencesByPaperId[paperId] = record['references']

            # Conference of this paper (created on first sight)
            conferenceName = record['conference']
            conference = conferencesByName.get(conferenceName)
            if conference is None:
                conference = Conference(len(conferencesByName), conferenceName)
                conferencesByName[conferenceName] = conference
                graph.addNode(conference)

            # Authors of this paper (each created on first sight)
            paperAuthors = []
            for authorName in record['authors']:
                author = authorsByName.get(authorName)
                if author is None:
                    author = Author(len(authorsByName), authorName)
                    authorsByName[authorName] = author
                    graph.addNode(author)
                paperAuthors.append(author)

            # Topics of this paper: one single-keyword topic per keyword extracted from the title
            for keyword in self.__extractKeywords(title):
                topic = topicsByKeyword.get(keyword)
                if topic is None:
                    topic = Topic(len(topicsByKeyword), [keyword])
                    topicsByKeyword[keyword] = topic
                    graph.addNode(topic)
                graph.addEdge(topic, paper, Mention())
                graph.addEdge(paper, topic, Mention())

            # Register the paper itself
            papersById[paperId] = paper
            graph.addNode(paper)

            # Authorship & publication edges, in both directions
            for author in paperAuthors:
                graph.addEdge(paper, author, Authorship())
                graph.addEdge(author, paper, Authorship())
            graph.addEdge(paper, conference, Publication())
            graph.addEdge(conference, paper, Publication())

        # Second pass: one-directional citation edges, citing paper -> cited paper
        for citingId, citedIds in referencesByPaperId.items():
            citingPaper = papersById[citingId]
            for citedId in citedIds:
                graph.addEdge(citingPaper, papersById[citedId], Citation())

        return graph
 def paperLineParser(line):
     # Parse one whitespace-separated line: the first token is the paper id, the remaining tokens are the
     # title (re-joined with single spaces). Returns (paperId, Paper).
     tokens = line.split()
     idToken, titleTokens = tokens[0], tokens[1:]
     paperId = int(self.__removeControlCharacters(idToken))
     return paperId, Paper(paperId, ' '.join(titleTokens))
Exemplo n.º 11
0
    def constructMultiDisciplinaryAuthorExample(indirectAuthor=False, uneven=False):
        """
          Construct an example DBLP graph with two research areas, data mining (conference KDD) and databases
          (conference VLDB), in which a few authors publish in both areas and all others in only one.

          Authors A-C publish in VLDB only, F-H in KDD only, and D, E and I in both.

            indirectAuthor  if set, add a tenth author 'J' who publishes in both areas and is cited only by
                            the papers of 'E' and 'I'
            uneven          if set, additionally give every author one more paper per conference of their
                            area(s), cited a random number of times (0 to 3) by each other author of the area

          Returns (graph, authorMap, conferenceMap, totalCitationCount), where authorMap / conferenceMap index
          the nodes by name and totalCitationCount maps an author name to the number of citation edges added
          to that author's papers.
        """

        graph = GraphFactory.createInstance()

        # Add authors (ids are drawn in alphabetical order of the names)
        authorNamesInOrder = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I']
        if indirectAuthor:
            authorNamesInOrder.append('J')
        authors = [Author(SampleGraphUtility.__getNextId(), name) for name in authorNamesInOrder]
        graph.addNodes(authors)

        # Add conferences
        conferences = [
            Conference(SampleGraphUtility.__getNextId(), 'VLDB'),  # Databases
            Conference(SampleGraphUtility.__getNextId(), 'KDD'),  # Data mining
        ]
        graph.addNodes(conferences)

        # Add author / conference index
        authorMap = {}
        conferenceMap = {}
        for author in authors:
            authorMap[author.name] = author
        for conference in conferences:
            conferenceMap[conference.name] = conference

        # Helper dictionary of citation counts to fabricate for each author, *per research area*. Authors in
        # both areas receive their count twice, so without 'indirectAuthor' and 'uneven' the totals come out as
        # {'A':100, 'B':80, 'C':10, 'D':120, 'E':90, 'F':100, 'G':80, 'H':10, 'I':24}. With 'indirectAuthor'
        # the counts no longer divide evenly among the citing authors and are truncated by the integer division.
        citationCounts = {'A': 100, 'B': 80, 'C': 10, 'D': 60, 'E': 45, 'F': 100, 'G': 80, 'H': 10, 'I': 12, 'J': 60}

        # Authors & conferences of each research area
        dmAuthorNames = ['D', 'E', 'F', 'G', 'H', 'I']
        dbAuthorNames = ['A', 'B', 'C', 'D', 'E', 'I']
        if indirectAuthor:
            dmAuthorNames += ['J']
            dbAuthorNames += ['J']
        duplicateNames = set(dmAuthorNames).intersection(set(dbAuthorNames))
        dmConferenceNames = ['KDD']
        dbConferenceNames = ['VLDB']

        # Give every author an explicit zero, so that authors who are never cited still appear in the result
        totalCitationCount = defaultdict(int)
        for authorName in set(dmAuthorNames).union(set(dbAuthorNames)):
            totalCitationCount[authorName] = 0

        # Create equal number of citations from each other paper in the research area for each author's papers
        for authorNames, conferenceNames in [(dmAuthorNames, dmConferenceNames), (dbAuthorNames, dbConferenceNames)]:
            for authorName in authorNames:

                citedPaperMap = {}
                for conferenceName in conferenceNames:

                    # Add paper to be cited for author
                    citedPaper = Paper(SampleGraphUtility.__getNextId(), '%sPaperIn%s' % (authorName, conferenceName))
                    graph.addNode(citedPaper)
                    graph.addBothEdges(citedPaper, conferenceMap[conferenceName], Publication())
                    graph.addBothEdges(citedPaper, authorMap[authorName], Authorship())

                    citedPaperMap[conferenceName] = citedPaper

                # Figure out the number of incoming citations for this author from each other eligible author:
                # multi-disciplinary authors are only cited by the single-area authors, everyone else is cited
                # by all other authors of the area
                if authorName in duplicateNames:
                    citingAuthors = set(authorNames).difference(duplicateNames)
                else:
                    citingAuthors = set(authorNames)
                    citingAuthors.remove(authorName)

                # Explicit floor division: the result is used as a loop bound and must stay an integer
                citationsPerAuthor = citationCounts[authorName] // len(citingAuthors)

                # Make sure J is cited by the two non-D multi-disciplinary authors
                if authorName == 'J':
                    citationsPerAuthor = citationCounts[authorName] // 2
                    citingAuthors = ['E', 'I']

                # Loop through papers of all other authors
                for otherAuthorName in citingAuthors:
                    if authorName == otherAuthorName:
                        continue
                    for conferenceName in conferenceNames:
                        for citationIndex in xrange(0, citationsPerAuthor):

                            # Add fake paper for citing the other author
                            citingPaper = Paper(
                                SampleGraphUtility.__getNextId(),
                                'Citation%d%sPaperIn%s' % (citationIndex, otherAuthorName, conferenceName)
                            )
                            graph.addNode(citingPaper)
                            graph.addBothEdges(authorMap[otherAuthorName], citingPaper, Authorship())
                            graph.addBothEdges(citingPaper, conferenceMap[conferenceName], Publication())

                            # Add citation
                            graph.addEdge(citingPaper, citedPaperMap[conferenceName], Citation())
                            totalCitationCount[authorName] += 1

        if not uneven:
            return graph, authorMap, conferenceMap, totalCitationCount

        # If this flag is set, add one extra paper per author & conference in each research area, and a random
        # number of citations to it from all other authors of that area
        for authorNamesList, conferenceNamesList in \
                [(dmAuthorNames, dmConferenceNames), (dbAuthorNames, dbConferenceNames)]:

            extraPapers = []

            # Add publications
            for authorName in authorNamesList:
                for conferenceName in conferenceNamesList:

                    # Add paper to be cited for author
                    citedPaper = Paper(SampleGraphUtility.__getNextId(), '%sPaperIn%s' % (authorName, conferenceName))
                    graph.addNode(citedPaper)
                    graph.addBothEdges(citedPaper, conferenceMap[conferenceName], Publication())
                    graph.addBothEdges(citedPaper, authorMap[authorName], Authorship())
                    extraPapers.append((authorName, citedPaper))

            # Re-seed the module-level random generator (no explicit seed value is given)
            random.seed()

            # Add randomized citations from authors to these papers
            for citingAuthorName in authorNamesList:
                for conferenceName in conferenceNamesList:
                    for citedAuthorName, citedPaper in extraPapers:

                        # Skip papers authored by this author
                        if citedAuthorName == citingAuthorName:
                            continue

                        # Randomly add a number of citations (0 to 3, inclusive) to this paper
                        for citationIndex in xrange(0, random.randint(0, 3)):

                            # Add fake paper for citing the other author
                            citingPaper = Paper(SampleGraphUtility.__getNextId(), 'Citation%d%sPaperIn%s' % (
                                citationIndex, citingAuthorName, conferenceName
                            ))
                            graph.addNode(citingPaper)
                            graph.addBothEdges(authorMap[citingAuthorName], citingPaper, Authorship())
                            graph.addBothEdges(citingPaper, conferenceMap[conferenceName], Publication())

                            # Add citation
                            graph.addEdge(citingPaper, citedPaper, Citation())
                            totalCitationCount[citedAuthorName] += 1

        return graph, authorMap, conferenceMap, totalCitationCount
Exemplo n.º 12
0
    def constructPathSimExampleThree(extraAuthorsAndCitations=False, citationMap=None):
        """
          Build the graph of "Example 3" from the PathSim publication, leaving out topic nodes.

            extraAuthorsAndCitations    if set, also add the authors 'Joe' and 'Nancy' with their papers, and
                                        construct citations between papers
            citationMap                 forwarded unchanged to the citation construction step (only used when
                                        extraAuthorsAndCitations is set)

          Returns (graph, authorMap, conferenceMap), the two maps indexing the nodes by name.

            @see    http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.220.2455
        """

        graph = GraphFactory.createInstance()

        # Authors (ids are drawn in the order listed)
        mike, jim, mary, bob, ann = [
            Author(SampleGraphUtility.__getNextId(), name) for name in ('Mike', 'Jim', 'Mary', 'Bob', 'Ann')
        ]
        authors = [mike, jim, mary, bob, ann]
        joe = nancy = None
        if extraAuthorsAndCitations:
            joe = Author(SampleGraphUtility.__getNextId(), 'Joe')
            nancy = Author(SampleGraphUtility.__getNextId(), 'Nancy')
            authors += [joe, nancy]
        graph.addNodes(authors)

        # Conferences
        sigmod, vldb, icde, kdd = [
            Conference(SampleGraphUtility.__getNextId(), name) for name in ('SIGMOD', 'VLDB', 'ICDE', 'KDD')
        ]
        conferences = [sigmod, vldb, icde, kdd]
        graph.addNodes(conferences)

        # Name -> node indexes handed back to the caller
        authorMap = dict((author.name, author) for author in authors)
        conferenceMap = dict((conference.name, conference) for conference in conferences)

        # author -> conference -> list of that author's papers in that conference
        authorConferencePaperMap = defaultdict(lambda : defaultdict(list))

        # Jim: 70 papers, numbered 1-70; the first 50 in SIGMOD and the remaining 20 in VLDB
        for paperNumber in xrange(1, 71):
            venue = sigmod if paperNumber <= 50 else vldb
            jimsPaper = Paper(SampleGraphUtility.__getNextId(), '%s Paper %d' % (venue.name, paperNumber))
            graph.addNode(jimsPaper)
            graph.addBothEdges(jim, jimsPaper, Authorship())
            graph.addBothEdges(jimsPaper, venue, Publication())
            authorConferencePaperMap[jim][venue].append(jimsPaper)

        # Ann: one paper in ICDE and one in KDD
        # NOTE(review): unlike every other paper here, Ann's papers are never passed to graph.addNode; they
        # only enter the graph through the edges below -- confirm this is intentional
        annsIcdePaper = Paper(SampleGraphUtility.__getNextId(), 'ICDE Paper')
        annsKddPaper = Paper(SampleGraphUtility.__getNextId(), 'KDD Paper')
        graph.addBothEdges(ann, annsIcdePaper, Authorship())
        graph.addBothEdges(ann, annsKddPaper, Authorship())
        graph.addBothEdges(annsIcdePaper, icde, Publication())
        graph.addBothEdges(annsKddPaper, kdd, Publication())
        authorConferencePaperMap[ann][icde].append(annsIcdePaper)
        authorConferencePaperMap[ann][kdd].append(annsKddPaper)

        # Remaining authors get their (2,1) paper numbers from the shared helper: SIGMOD plus a second conference
        for similarAuthor, secondConference in ((mike, vldb), (mary, icde), (bob, vldb)):
            SampleGraphUtility.__addSimilarAuthorsPapers(
                graph, similarAuthor, sigmod, secondConference, authorConferencePaperMap
            )

        # Extra authors & citation data
        if extraAuthorsAndCitations:
            for extraAuthor in (joe, nancy):
                SampleGraphUtility.__addSimilarAuthorsPapers(graph, extraAuthor, sigmod, vldb, authorConferencePaperMap)
            SampleGraphUtility.__constructCitations(graph, authorMap, conferenceMap, authorConferencePaperMap, citationMap)

        return graph, authorMap, conferenceMap