Python PaperReferenceExtractor.getCitesToAuthor示例，ReferenceParser.PaperReferenceExtractor.getCitesToAuthor Python示例

示例#1

0

显示文件

def count_self_cites(author, num_load):
    """Tally, for each loaded paper of ``author``, the cites to the author.

    Calls ``author.loadPapers(num_load, loadPaperPDFs=False, pubFilter=False)``
    and then walks ``author.getPapers()``. For every paper the PDF object is
    set and fetched, its references content is pulled out with a
    ``PaperReferenceExtractor`` and the cites to the author's title-cased
    last name are counted.

    Args:
        author: object exposing ``loadPapers``, ``getPapers`` and
            ``getLastName`` (project type, not defined in this block).
        num_load: forwarded unchanged to ``author.loadPapers``
            (presumably the number of papers to load -- confirm there).

    Returns:
        A list of dicts with the keys ``'Paper Title'`` and ``'Self Cites'``.
        ``'Self Cites'`` holds the count, or the string ``'No PDF found'``
        when the paper has no PDF object or no references content. If a
        ``KeyboardInterrupt`` arrives while processing, the entries gathered
        so far are returned.
    """
    author.loadPapers(num_load, loadPaperPDFs=False, pubFilter=False)
    results = []
    print("Author fully loaded. Processing loaded papers...")

    try:
        for paper in author.getPapers():
            surname = author.getLastName().title()

            title = paper.getInfo()['Title']
            print('Paper title: ' + str(title))
            paper.setPdfObj()

            extractor = PaperReferenceExtractor()

            pdf_obj = paper.getPdfObj()
            if pdf_obj is None:
                print('No PDF object for this paper, skipping.')
                results.append({
                    'Paper Title': title,
                    'Self Cites': 'No PDF found'
                })
                continue

            references = extractor.getReferencesContent(pdf_obj)
            if references is None:
                # The progress line below still reports 0 in this case,
                # while the stored value is the 'No PDF found' marker.
                cite_count = 0
                recorded = 'No PDF found'
            else:
                cite_count = extractor.getCitesToAuthor(surname, references)
                recorded = cite_count

            print('Paper title: ' + str(title) + ' has self cites: ' +
                  str(cite_count))
            results.append({'Paper Title': title, 'Self Cites': recorded})

    except KeyboardInterrupt:
        print('key board KeyboardInterrupt returninbg self cite array')

    print(results)
    return results

示例#2

0

显示文件

文件： scrapper.py 项目： AnkaiJie/Google-Scholar-Citation-Fraud-Data-Collector

def count_overcites_paper(paper, author, cite_num_to_load=40):
    """Count cites to ``author`` in the papers that cite ``paper``.

    Obtains the citing PDF objects from ``paper.getCitingPdfs`` and, for each
    one, extracts its references content and counts the cites to the
    author's title-cased last name.

    Args:
        paper: object exposing ``getCitingPdfs`` (project type).
        author: object exposing ``getLastName`` (project type).
        cite_num_to_load: forwarded to ``paper.getCitingPdfs``.

    Returns:
        A list of dicts with the keys ``'Citing Paper Number'`` (1-based
        position in the citing list), ``'Title'`` and ``'Over-cite Count'``.
        The count is the string ``"No PDF Found"`` when no references
        content could be extracted; entries with neither content nor title
        are dropped. On ``AttributeError`` or ``KeyboardInterrupt`` the
        entries collected so far are returned.
    """
    results = []
    try:
        citing_pdfs = paper.getCitingPdfs(cite_num_to_load)

        extractor = PaperReferenceExtractor()

        for number, pdf in enumerate(citing_pdfs, start=1):
            references = extractor.getReferencesContent(pdf)
            title = pdf.getTitle()

            if references is None:
                # Only a titled paper is worth reporting as "no PDF".
                if title is not None:
                    print("Citing paper number " + str(number) + ": " + title + " had no PDF content found.")
                    results.append({
                        'Citing Paper Number': number,
                        'Title': title,
                        'Over-cite Count': "No PDF Found",
                    })
                continue

            surname = author.getLastName().title()
            cite_count = extractor.getCitesToAuthor(surname, references)
            if title is None:
                title = 'Unknown Title'
            print("Citing paper number " + str(number) + ": " + title + " cites " + surname + " " + str(cite_count) + " times.")
            results.append({
                'Citing Paper Number': number,
                'Title': title,
                'Over-cite Count': cite_count,
            })

    except AttributeError as e:
        print('google scholar possibly has blocked you, sending back collected data...')
        print(e)
    except KeyboardInterrupt:
        print('User ended program. Returning existing Data')

    # Both handlers fall through to here with whatever was collected.
    return results

示例#3

0

显示文件

文件： scrapper.py 项目： AnkaiJie/Google-Scholar-Citation-Fraud-Data-Collector

def count_self_cites(author, num_load):
    """Return one self-citation record per paper loaded for ``author``.

    The author's papers are loaded through
    ``author.loadPapers(num_load, loadPaperPDFs=False, pubFilter=False)``.
    Each paper from ``author.getPapers()`` then has its PDF object set and
    read, and a ``PaperReferenceExtractor`` counts the cites to the author's
    title-cased last name in that PDF's references content.

    Args:
        author: project author object; must provide ``loadPapers``,
            ``getPapers`` and ``getLastName``.
        num_load: passed straight through to ``author.loadPapers``.

    Returns:
        A list of ``{'Paper Title': ..., 'Self Cites': ...}`` dicts, where
        ``'Self Cites'`` is either the count or the string ``'No PDF found'``
        (no PDF object, or no references content). A ``KeyboardInterrupt``
        stops processing early and the partial list is returned.
    """
    author.loadPapers(num_load, loadPaperPDFs=False, pubFilter=False)
    collected = []
    print("Author fully loaded. Processing loaded papers...")

    try:
        for paper in author.getPapers():
            last_name = author.getLastName().title()

            paper_title = paper.getInfo()['Title']
            print('Paper title: ' + str(paper_title))
            paper.setPdfObj()

            ref_extractor = PaperReferenceExtractor()

            pdf = paper.getPdfObj()
            # Start from the "nothing found" record; overwritten below once a
            # real count is available.
            entry = {'Paper Title': paper_title, 'Self Cites': 'No PDF found'}
            if pdf is None:
                print('No PDF object for this paper, skipping.')
                collected.append(entry)
                continue

            ref_text = ref_extractor.getReferencesContent(pdf)
            found = 0
            if ref_text is not None:
                found = ref_extractor.getCitesToAuthor(last_name, ref_text)
                entry['Self Cites'] = found

            print('Paper title: ' + str(paper_title) + ' has self cites: ' + str(found))
            collected.append(entry)

    except KeyboardInterrupt:
        print('key board KeyboardInterrupt returninbg self cite array')

    print(collected)
    return collected

示例#4

0

显示文件

def count_overcites_paper(paper, author, cite_num_to_load=40):
    """Report how often each paper citing ``paper`` cites ``author``.

    The citing PDF objects come from ``paper.getCitingPdfs``. For each of
    them the references content is extracted and the cites to the author's
    title-cased last name are counted.

    Args:
        paper: object exposing ``getCitingPdfs`` (project type).
        author: object exposing ``getLastName`` (project type).
        cite_num_to_load: forwarded to ``paper.getCitingPdfs``.

    Returns:
        A list of dicts keyed by ``'Citing Paper Number'`` (1-based),
        ``'Title'`` and ``'Over-cite Count'``. The count is the string
        ``"No PDF Found"`` when no references content was available; papers
        with neither content nor title are skipped. On ``KeyboardInterrupt``
        ``WatLibSeleniumParser.reset()`` is called and the partial list is
        returned. Other exceptions are not handled here and propagate.
    """
    collected = []

    def _record(number, paper_title, count):
        # One result row per citing paper, in the shape described above.
        collected.append({
            'Citing Paper Number': number,
            'Title': paper_title,
            'Over-cite Count': count,
        })

    try:
        citing_pdfs = paper.getCitingPdfs(cite_num_to_load)

        ref_extractor = PaperReferenceExtractor()

        for position, citing_pdf in enumerate(citing_pdfs):
            number = position + 1
            ref_text = ref_extractor.getReferencesContent(citing_pdf)
            paper_title = citing_pdf.getTitle()

            if ref_text is None:
                if paper_title is None:
                    # Nothing identifiable to report for this entry.
                    continue
                print("Citing paper number " + str(number) + ": " + paper_title + " had no PDF content found.")
                _record(number, paper_title, "No PDF Found")
                continue

            last_name = author.getLastName().title()
            hits = ref_extractor.getCitesToAuthor(last_name, ref_text)
            shown_title = 'Unknown Title' if paper_title is None else paper_title
            print("Citing paper number " + str(number) + ": " + shown_title + " cites " + last_name + " " + str(hits) + " times.")
            _record(number, shown_title, hits)

    except KeyboardInterrupt:
        print('User ended program. Returning existing Data')
        WatLibSeleniumParser.reset()
        return collected

    return collected

示例#5

0

显示文件

文件： scrapper.py 项目： AnkaiJie/Google-Scholar-Citation-Fraud-Data-Collector

def count_overcites_paper(paper, author, cite_num_to_load=30):
    """Count cites to ``author`` in the papers listed as citing ``paper``.

    The paper code is cut out of ``paper.getCitedByUrl()`` (text after the
    last ``'='`` once everything from the last ``'&'`` on is dropped). The
    citing-paper listing is then walked in steps of 10 start offsets up to
    ``cite_num_to_load``, sleeping 10 seconds before each page is handed to
    ``GscPdfExtractor.findPapersFromCitations``. For every PDF object found,
    the references content is extracted and the cites to the author's
    title-cased last name are counted.

    Args:
        paper: object exposing ``getCitedByUrl`` (project type).
        author: object exposing ``getLastName`` (project type).
        cite_num_to_load: exclusive upper bound of the start offsets paged
            through (step 10).

    Returns:
        A list of dicts with the keys ``'Citing Paper Number'`` (1-based),
        ``'Title'`` and ``'Over-cite Count'``. The count is the string
        ``"No PDF Found"`` when no references content was available; papers
        with neither content nor title are skipped. On ``AttributeError`` or
        ``KeyboardInterrupt`` the entries collected so far (possibly none)
        are returned.
    """
    # Bound before the ``try`` so both handlers can always return it. It used
    # to be assigned part-way through the ``try`` body, so an AttributeError
    # raised earlier (e.g. ``getCitedByUrl()`` returning None) surfaced as an
    # UnboundLocalError from inside the handler.
    overcites_info = []
    try:
        # Parse the URL first: it is the cheapest step and the one most
        # likely to fail, so nothing else is set up if it does.
        cited_by_url = paper.getCitedByUrl()
        cited_by_url = cited_by_url[:cited_by_url.rfind('&')]
        paper_code = cited_by_url[cited_by_url.rfind('=')+1:]

        pdfExtractor = GscPdfExtractor()
        url_part_one = SessionInitializer.ROOT_URL + '/scholar?start='
        url_part_two = '&hl=en&as_sdt=0,5&sciodt=0,5&cites='

        all_pdfObjs = []

        print('-----------------------------------LOADING CITING PAPERS-----------------------------------')
        for i in range(0, cite_num_to_load, 10):
            time.sleep(10)
            final_url = url_part_one+str(i)+url_part_two+paper_code
            print('page url for citations:')
            print(final_url)
            all_pdfObjs += pdfExtractor.findPapersFromCitations(final_url)

        print('-----------------------------------DONE CITING PAPERS-------------------------------------')

        print('Loaded: ' + str(len(all_pdfObjs)) + ' pdf objects.')

        analyzer = PaperReferenceExtractor()

        for number, pdf in enumerate(all_pdfObjs, start=1):
            content = analyzer.getReferencesContent(pdf)
            title = pdf.getTitle()

            if content is None:
                # Only a titled paper is worth reporting as "no PDF".
                if title is not None:
                    print("Citing paper number " + str(number) + ": " + title + " had no PDF content found.")
                    overcites_info.append({
                        'Citing Paper Number': number,
                        'Title': title,
                        'Over-cite Count': "No PDF Found",
                    })
                continue

            lname = author.getLastName().title()
            numCites = analyzer.getCitesToAuthor(lname, content)
            if title is None:
                title = 'Unknown Title'
            print("Citing paper number " + str(number) + ": " + title + " cites " + lname + " " + str(numCites) + " times.")
            overcites_info.append({
                'Citing Paper Number': number,
                'Title': title,
                'Over-cite Count': numCites,
            })

    except AttributeError as e:
        print('google scholar possibly has blocked you, sending back collected data...')
        print(e)
    except KeyboardInterrupt:
        print('User ended program. Returning existing Data')

    # Normal completion and both handlers return whatever was collected.
    return overcites_info

示例#6

0

显示文件

文件： scrapper.py 项目： AnkaiJie/Google-Scholar-Citation-Fraud-Data-Collector

def count_cross_cites_stage3(orig_author, author_dist, x_most_rel, top_x, y_most_rel):
    """Resolve the top cited authors and count how often they cite ``orig_author``.

    Stage 3 walks ``author_dist`` in order and looks each candidate up with
    ``GscHtmlFunctions.get_author_from_search`` (author name plus the first
    listed paper title), sleeping 10 seconds per candidate. A candidate whose
    search returns None extends ``top_x`` by one so the next one is tried.

    Stage 4 loads papers for every resolved author, keeps those that have a
    PDF object, and counts in each paper's references content the cites to
    the original author (name formatted by ``get_ref_author_format``).

    Args:
        orig_author: object exposing ``getFirstName`` / ``getLastName``.
        author_dist: iterable of ``(name, {'freq': ..., 'papers': [...]})``
            entries, per the shape noted in the loop below.
        x_most_rel: only echoed back in the returned dict.
        top_x: rank limit for candidates tried in stage 3.
        y_most_rel: second cap on candidates examined in stage 3; also
            forwarded to ``loadPapers`` in stage 4.

    Returns:
        A dict with the keys ``'First Name'``, ``'Last Name'``,
        ``'Author_citation_frequency'``, ``'Cited_authors_overcite_frequency'``,
        ``'x_most_rel'`` and ``'y_most_rel'``.
    """
    searcher = GscHtmlFunctions()
    resolved_authors = []
    print('STAGE 3 CREATING NEW AUTHOR OBJECTS ---------------------------------------------------------')
    # Build an author object for each top cited candidate; ``rank`` is the
    # 1-based position of the candidate in ``author_dist``.
    for rank, candidate in enumerate(author_dist, start=1):
        if rank > y_most_rel:
            break

        time.sleep(10)

        if rank > top_x:
            break

        # candidate should be in the form ('author', {'freq': 5, 'papers':[]})
        stats = candidate[1]
        sample_title = stats['papers'][0]
        cite_freq = stats['freq']
        candidate_name = candidate[0]
        print('Trying to find author: ' + str(candidate))
        match = searcher.get_author_from_search(candidate_name, sample_title)
        if match is None:
            # No profile found: widen the limit so the next candidate is tried.
            top_x += 1
            continue
        # Each entry holds four values: author object, first name, last name
        # and the frequency cited.
        resolved_authors.append([match, match.getFirstName(), match.getLastName(), cite_freq])
    print('DONE STAGE 3 --------------------------------------------------------------------------')
    print('Top citing authors: ')
    print(resolved_authors)

    print('STAGE 4 COUNTING NUMBER OF CITATIONS TO ORIGINAL AUTHOR --------------------------------')
    # One entry per resolved author: the author, their names, the per-paper
    # cite counts to the original author, and how many papers had a PDF.
    per_author_results = []
    orig_first = orig_author.getFirstName()
    orig_last = orig_author.getLastName()

    for resolved in resolved_authors:
        time.sleep(5)
        citing_author = resolved[0]

        citing_author.loadPapers(y_most_rel, pubFilter=True, delay=True)

        citing_first = citing_author.getFirstName()
        citing_last = citing_author.getLastName()

        print('ANALYZING AUTHOR: ' + str(citing_first) + ' ' + str(citing_last))

        # Drop papers that have no PDF object.
        papers_with_pdf = [p for p in citing_author.getPapers() if p.getPdfObj() is not None]
        pdf_paper_count = len(papers_with_pdf)
        cite_rows = []

        for paper in papers_with_pdf:
            paper_title = paper.getInfo()['Title']
            print('Paper title: ' + paper_title)

            # Publisher-specific name format, for ambiguous names.
            author_token = get_ref_author_format(orig_first, orig_last, paper.getInfo()['Publisher'])

            pdf_obj = paper.getPdfObj()
            extractor = PaperReferenceExtractor()
            references = extractor.getReferencesContent(pdf_obj)
            if references is None:
                # -1 marks a paper whose references could not be extracted.
                cite_rows.append([paper_title, -1])
                continue
            if author_token is None:
                print('for some reason, authword is none. Shouldnt be happening')
                continue

            cite_rows.append([paper_title, extractor.getCitesToAuthor(author_token, references)])
            print(cite_rows)

        per_author_results.append([citing_author, citing_first, citing_last, cite_rows, pdf_paper_count])
    print('STAGE 4 COMPLETE ---------------------------------------------------------------------')
    print('cited_author_info_arr: ' + str(per_author_results))

    print('FINAL INFO DICTIONARY -------------------------------------------------------------')
    # Compilation of all the information gathered above.
    summary = {
        'First Name': orig_first,
        'Last Name': orig_last,
        'Author_citation_frequency': resolved_authors,
        'Cited_authors_overcite_frequency': per_author_results,
        'x_most_rel': x_most_rel,
        'y_most_rel': y_most_rel,
    }
    print(summary)
    return summary

示例#7

0

显示文件

def count_overcites_paper(paper, author, cite_num_to_load=30):
    """Count cites to ``author`` in the papers listed as citing ``paper``.

    The paper code is taken from ``paper.getCitedByUrl()``: everything from
    the last ``'&'`` on is dropped, and the code is the text after the last
    remaining ``'='``. Start offsets ``0, 10, 20, ...`` below
    ``cite_num_to_load`` are then paged through, with a 10 second sleep
    before each page URL is given to
    ``GscPdfExtractor.findPapersFromCitations``. Each PDF object collected
    has its references content extracted and the cites to the author's
    title-cased last name counted.

    Args:
        paper: object exposing ``getCitedByUrl`` (project type).
        author: object exposing ``getLastName`` (project type).
        cite_num_to_load: exclusive upper bound of the start offsets paged
            through (step 10).

    Returns:
        A list of dicts with the keys ``'Citing Paper Number'`` (1-based),
        ``'Title'`` and ``'Over-cite Count'``. The count is the string
        ``"No PDF Found"`` when no references content was available; papers
        with neither content nor title are skipped. On ``AttributeError`` or
        ``KeyboardInterrupt`` the entries collected so far (possibly none)
        are returned.
    """
    # Must exist before the ``try``: the handlers return it, and an
    # AttributeError raised before the old in-``try`` assignment (e.g. a None
    # cited-by URL) used to end in an UnboundLocalError instead of a result.
    overcites_info = []
    try:
        # Derive the paper code up front so a bad URL fails before any
        # extractor is created or any page is requested.
        cited_by_url = paper.getCitedByUrl()
        cited_by_url = cited_by_url[:cited_by_url.rfind('&')]
        paper_code = cited_by_url[cited_by_url.rfind('=') + 1:]

        pdfExtractor = GscPdfExtractor()
        url_part_one = SessionInitializer.ROOT_URL + '/scholar?start='
        url_part_two = '&hl=en&as_sdt=0,5&sciodt=0,5&cites='

        all_pdfObjs = []

        print(
            '-----------------------------------LOADING CITING PAPERS-----------------------------------'
        )
        for i in range(0, cite_num_to_load, 10):
            time.sleep(10)
            final_url = url_part_one + str(i) + url_part_two + paper_code
            print('page url for citations:')
            print(final_url)
            all_pdfObjs += pdfExtractor.findPapersFromCitations(final_url)

        print(
            '-----------------------------------DONE CITING PAPERS-------------------------------------'
        )

        print('Loaded: ' + str(len(all_pdfObjs)) + ' pdf objects.')

        analyzer = PaperReferenceExtractor()

        for number, pdf in enumerate(all_pdfObjs, start=1):
            content = analyzer.getReferencesContent(pdf)
            title = pdf.getTitle()

            if content is None:
                # Untitled papers without content are dropped silently.
                if title is not None:
                    print("Citing paper number " + str(number) + ": " + title +
                          " had no PDF content found.")
                    overcites_info.append({
                        'Citing Paper Number': number,
                        'Title': title,
                        'Over-cite Count': "No PDF Found",
                    })
                continue

            lname = author.getLastName().title()
            numCites = analyzer.getCitesToAuthor(lname, content)
            if title is None:
                title = 'Unknown Title'
            print("Citing paper number " + str(number) + ": " + title +
                  " cites " + lname + " " + str(numCites) + " times.")
            overcites_info.append({
                'Citing Paper Number': number,
                'Title': title,
                'Over-cite Count': numCites,
            })

    except AttributeError as e:
        print(
            'google scholar possibly has blocked you, sending back collected data...'
        )
        print(e)
    except KeyboardInterrupt:
        print('User ended program. Returning existing Data')

    # Normal completion and both handlers return whatever was collected.
    return overcites_info

示例#8

0

显示文件

def count_cross_cites_stage3(orig_author, author_dist, x_most_rel, top_x,
                             y_most_rel):
    """Find the top cited authors and count their cites to ``orig_author``.

    Stage 3 iterates ``author_dist`` in order. Each candidate is searched
    for with ``GscHtmlFunctions.get_author_from_search`` using the author
    name and the first listed paper title, after a 10 second sleep. When a
    search returns None, ``top_x`` grows by one so a further candidate can be
    tried.

    Stage 4 loads the papers of each author found, keeps the ones with a PDF
    object and, per paper, counts the cites to the original author in the
    references content. The name to look for comes from
    ``get_ref_author_format``.

    Args:
        orig_author: object exposing ``getFirstName`` / ``getLastName``.
        author_dist: iterable of ``(name, {'freq': ..., 'papers': [...]})``
            entries, per the shape noted in the loop below.
        x_most_rel: not used for computation; returned in the result dict.
        top_x: rank limit for candidates tried in stage 3.
        y_most_rel: additional cap on stage 3 candidates; also passed to
            ``loadPapers`` in stage 4.

    Returns:
        A dict with the keys ``'First Name'``, ``'Last Name'``,
        ``'Author_citation_frequency'``, ``'Cited_authors_overcite_frequency'``,
        ``'x_most_rel'`` and ``'y_most_rel'``.
    """
    profile_search = GscHtmlFunctions()
    found_authors = []
    print(
        'STAGE 3 CREATING NEW AUTHOR OBJECTS ---------------------------------------------------------'
    )
    # Create an author object for each of the top cited authors.
    for index, info in enumerate(author_dist):
        # ``index + 1`` is how many candidates have been looked at so far.
        if index + 1 > y_most_rel:
            break

        time.sleep(10)

        if index > top_x - 1:
            break

        # info should be in the form ('author', {'freq': 5, 'papers':[]})
        details = info[1]
        first_title = details['papers'][0]
        times_cited = details['freq']
        name = info[0]
        print('Trying to find author: ' + str(info))
        profile = profile_search.get_author_from_search(name, first_title)
        if profile is not None:
            # Four values per entry: author object, first name, last name,
            # frequency cited.
            found_authors.append([
                profile,
                profile.getFirstName(),
                profile.getLastName(),
                times_cited,
            ])
        else:
            # No profile for this author: allow one more candidate.
            top_x += 1
    print(
        'DONE STAGE 3 --------------------------------------------------------------------------'
    )
    print('Top citing authors: ')
    print(found_authors)

    print(
        'STAGE 4 COUNTING NUMBER OF CITATIONS TO ORIGINAL AUTHOR --------------------------------'
    )
    # For each found author: how many times their papers cite the original
    # author.
    author_reports = []
    first_name = orig_author.getFirstName()
    last_name = orig_author.getLastName()

    for found in found_authors:
        time.sleep(5)
        current_author = found[0]

        current_author.loadPapers(y_most_rel, pubFilter=True, delay=True)

        current_first = current_author.getFirstName()
        current_last = current_author.getLastName()

        print('ANALYZING AUTHOR: ' + str(current_first) + ' ' +
              str(current_last))

        # Keep only the papers that have a PDF object.
        usable_papers = []
        for candidate_paper in current_author.getPapers():
            if candidate_paper.getPdfObj() is not None:
                usable_papers.append(candidate_paper)
        usable_count = len(usable_papers)
        paper_counts = []

        for paper in usable_papers:
            title = paper.getInfo()['Title']
            print('Paper title: ' + title)

            # Publisher-specific name format, for ambiguous names.
            publisher = paper.getInfo()['Publisher']
            search_name = get_ref_author_format(first_name, last_name,
                                                publisher)

            pdf_obj = paper.getPdfObj()
            ref_extractor = PaperReferenceExtractor()
            ref_text = ref_extractor.getReferencesContent(pdf_obj)
            if ref_text is None:
                # -1 marks a paper whose references could not be extracted.
                paper_counts.append([title, -1])
                continue
            if search_name is None:
                print(
                    'for some reason, authword is none. Shouldnt be happening')
                continue

            hits = ref_extractor.getCitesToAuthor(search_name, ref_text)
            paper_counts.append([title, hits])
            print(paper_counts)

        author_reports.append([
            current_author, current_first, current_last, paper_counts,
            usable_count
        ])
    print(
        'STAGE 4 COMPLETE ---------------------------------------------------------------------'
    )
    print('cited_author_info_arr: ' + str(author_reports))

    print(
        'FINAL INFO DICTIONARY -------------------------------------------------------------'
    )
    # Compilation of all the information gathered above.
    report = {}
    report['First Name'] = first_name
    report['Last Name'] = last_name
    report['Author_citation_frequency'] = found_authors
    report['Cited_authors_overcite_frequency'] = author_reports
    report['x_most_rel'] = x_most_rel
    report['y_most_rel'] = y_most_rel
    print(report)
    return report