def findAllCitations(self):
        ref_processor = PaperReferenceExtractor()

        ref_content = ref_processor.getReferencesContent(self.__pdfObj)

        if (self.getInfo()['Publisher'] == 'Springer US'):
            parser = SpringerReferenceParser()
        elif (self.getInfo()['Publisher'] == 'IEEE'):
            parser = IeeeReferenceParser()
        else:
            raise Exception('Publisher not recognized; no citation parser for this format')

        citation_list = parser.citeParse(ref_content)

        for idx, citation in enumerate(citation_list):
            citation = Citation(citation)
            citation_list[idx] = citation

        return citation_list
Пример #2
0
    def findAllCitations(self):
        ref_processor = PaperReferenceExtractor()

        ref_content = ref_processor.getReferencesContent(self.__pdfObj)

        if (self.getInfo()['Publisher'] == 'Springer US'):
            parser = SpringerReferenceParser()
        elif (self.getInfo()['Publisher'] == 'IEEE'):
            parser = IeeeReferenceParser()
        else:
            raise Exception(
                'Publisher not recognized; no citation parser for this format')

        citation_list = parser.citeParse(ref_content)

        for idx, citation in enumerate(citation_list):
            citation = Citation(citation)
            citation_list[idx] = citation

        return citation_list
def count_cross_cites (author, x_most_rel, top_x, y_most_rel):
    author.loadPapers(x_most_rel, pubFilter=True, delay=True)
    paper_list = author.getPapers()
    x_most_rel = len(paper_list)
    ORIG_FNAME = author.getFirstName()
    ORIG_LNAME = author.getLastName()
    print("Total number of valid GSC papers: " + str(len(paper_list)))
    citation_list = []

    springer_bot = SpringerReferenceParser()
    ieee_bot = IeeeReferenceParser()

    # gets all the citations from all the papers in the list
    print('STAGE 1 GETTING CITATIONS')
    print("-----------------------------------------------------------")
    for paper in paper_list:
        pub = paper.getInfo()['Publisher']
        pdf_paper = paper.getPdfObj()
        print('Paper title: ' + str(paper.getInfo()['Title']))
        if (pdf_paper is None):
            print('paper object is none')
            continue

        extractor = PaperReferenceExtractor()
        ref_content = extractor.getReferencesContent(pdf_paper)

        if (ref_content is None):
            continue
        try:
            if (pub == 'IEEE'):
                citations = ieee_bot.citeParse(ref_content)
            elif (pub == 'Springer US'):
                citations = springer_bot.citeParse(ref_content)
            else:
                print('Invalid publication format from: ' + pub)
                continue
        except Exception as e:
            print('An exception occured with parsing citations: ' + str(e))

        citation_list += citations
    print("STAGE 1 COMPLETE -----------------------------------------------------------")
    print('From the valid top ' + str(top_x) +' papers, all the citations found: ' + str(citation_list))

    author_dist = {}

    #goes through each citation and takes out authors and paper names and puts it in the valid frequency dictionary
    # end results: {'author': {'freq': int frequency original author cites him, 'paper': [array of paper titles in which the cited author is cited]}, 
    print('STAGE 2 AGGREGATING CITATION COUNTS BY AUTHOR ------------------------------------')

    for citation in citation_list:
        title = citation['title']
        for cited_author in citation['authors']:
            if cited_author in author_dist:
                author_dist[cited_author]['freq'] += 1
                if title not in author_dist[cited_author]['papers']:
                    author_dist[cited_author]['papers'].append(title)
            else:
                author_dist[cited_author] = {}
                author_dist[cited_author]['freq'] = 1
                author_dist[cited_author]['papers'] = [title]


    #sorts the dictionary - now an array of tuples that are sorted by frequency
    #author_dist should be in the form [('author', {'freq': 5, 'papers':[]}), ...]
    author_dist = list(reversed(sorted(author_dist.items(), key=lambda x: x[1]['freq'])))
    print('STAGE 2 COMPLETE -----------------------------------------------------------------')
    print('sorted author list in tuples:')
    print(author_dist)

    count_cross_cites_stage3(author, author_dist, x_most_rel, top_x, y_most_rel)
Пример #4
0
def count_cross_cites(author, x_most_rel, top_x, y_most_rel):
    author.loadPapers(x_most_rel, pubFilter=True, delay=True)
    paper_list = author.getPapers()
    x_most_rel = len(paper_list)
    ORIG_FNAME = author.getFirstName()
    ORIG_LNAME = author.getLastName()
    print("Total number of valid GSC papers: " + str(len(paper_list)))
    citation_list = []

    springer_bot = SpringerReferenceParser()
    ieee_bot = IeeeReferenceParser()

    # gets all the citations from all the papers in the list
    print('STAGE 1 GETTING CITATIONS')
    print("-----------------------------------------------------------")
    for paper in paper_list:
        pub = paper.getInfo()['Publisher']
        pdf_paper = paper.getPdfObj()
        print('Paper title: ' + str(paper.getInfo()['Title']))
        if (pdf_paper is None):
            print('paper object is none')
            continue

        extractor = PaperReferenceExtractor()
        ref_content = extractor.getReferencesContent(pdf_paper)

        if (ref_content is None):
            continue
        try:
            if (pub == 'IEEE'):
                citations = ieee_bot.citeParse(ref_content)
            elif (pub == 'Springer US'):
                citations = springer_bot.citeParse(ref_content)
            else:
                print('Invalid publication format from: ' + pub)
                continue
        except Exception as e:
            print('An exception occured with parsing citations: ' + str(e))

        citation_list += citations
    print(
        "STAGE 1 COMPLETE -----------------------------------------------------------"
    )
    print('From the valid top ' + str(top_x) +
          ' papers, all the citations found: ' + str(citation_list))

    author_dist = {}

    #goes through each citation and takes out authors and paper names and puts it in the valid frequency dictionary
    # end results: {'author': {'freq': int frequency original author cites him, 'paper': [array of paper titles in which the cited author is cited]},
    print(
        'STAGE 2 AGGREGATING CITATION COUNTS BY AUTHOR ------------------------------------'
    )

    for citation in citation_list:
        title = citation['title']
        for cited_author in citation['authors']:
            if cited_author in author_dist:
                author_dist[cited_author]['freq'] += 1
                if title not in author_dist[cited_author]['papers']:
                    author_dist[cited_author]['papers'].append(title)
            else:
                author_dist[cited_author] = {}
                author_dist[cited_author]['freq'] = 1
                author_dist[cited_author]['papers'] = [title]

    #sorts the dictionary - now an array of tuples that are sorted by frequency
    #author_dist should be in the form [('author', {'freq': 5, 'papers':[]}), ...]
    author_dist = list(
        reversed(sorted(author_dist.items(), key=lambda x: x[1]['freq'])))
    print(
        'STAGE 2 COMPLETE -----------------------------------------------------------------'
    )
    print('sorted author list in tuples:')
    print(author_dist)

    count_cross_cites_stage3(author, author_dist, x_most_rel, top_x,
                             y_most_rel)