def count_self_cites(author, num_load):
    """Count how often each of an author's papers cites that same author.

    Loads papers through ``author.loadPapers`` and, for every paper returned
    by ``author.getPapers()``, extracts the reference content of its PDF and
    counts the references matching the author's title-cased last name.

    Args:
        author: Author object providing ``loadPapers``, ``getPapers`` and
            ``getLastName``.
        num_load: Number of papers, forwarded to ``author.loadPapers``.

    Returns:
        A list with one dict per processed paper. Each dict has the keys
        ``'Paper Title'`` and ``'Self Cites'``. ``'Self Cites'`` is the
        count, or the string ``'No PDF found'`` when the paper has no PDF
        object or no reference content could be extracted. If the user
        interrupts processing with Ctrl-C, the entries collected so far are
        returned.
    """
    author.loadPapers(num_load, loadPaperPDFs=False, pubFilter=False)
    self_cite_arr = []
    print("Author fully loaded. Processing loaded papers...")
    try:
        # Neither the extractor nor the search word depends on the paper,
        # so build them once instead of once per iteration.
        analyzer = PaperReferenceExtractor()
        auth_word = author.getLastName().title()

        for paper in author.getPapers():
            title = paper.getInfo()['Title']
            print('Paper title: ' + str(title))

            paper.setPdfObj()
            pdf_paper = paper.getPdfObj()
            if pdf_paper is None:
                print('No PDF object for this paper, skipping.')
                self_cite_arr.append({
                    'Paper Title': title,
                    'Self Cites': 'No PDF found',
                })
                continue

            refContent = analyzer.getReferencesContent(pdf_paper)
            if refContent is None:
                # Report the failure instead of logging a misleading
                # "has self cites: 0" for a paper that was never analysed.
                print('Paper title: ' + str(title)
                      + ' has no reference content, skipping.')
                self_cite_arr.append({
                    'Paper Title': title,
                    'Self Cites': 'No PDF found',
                })
                continue

            num_cites = analyzer.getCitesToAuthor(auth_word, refContent)
            print('Paper title: ' + str(title)
                  + ' has self cites: ' + str(num_cites))
            self_cite_arr.append({
                'Paper Title': title,
                'Self Cites': num_cites,
            })
    except KeyboardInterrupt:
        # Best effort: keep whatever was collected before the interrupt.
        print('key board KeyboardInterrupt returninbg self cite array')

    print(self_cite_arr)
    return self_cite_arr
def count_overcites_paper(paper, author, cite_num_to_load=40):
    """Count how often each paper citing ``paper`` references ``author``.

    Args:
        paper: Paper object providing ``getCitingPdfs``.
        author: Author object providing ``getLastName``.
        cite_num_to_load: Value forwarded to ``paper.getCitingPdfs``.

    Returns:
        A list of dicts with the keys ``'Citing Paper Number'`` (1-based
        position), ``'Title'`` and ``'Over-cite Count'``. The count is the
        string ``"No PDF Found"`` when no reference content was extracted.
        Citing PDFs with neither content nor title are omitted. On
        ``AttributeError`` or ``KeyboardInterrupt`` the entries gathered so
        far are returned.
    """
    overcites_info = []
    try:
        citing_pdfs = paper.getCitingPdfs(cite_num_to_load)
        extractor = PaperReferenceExtractor()

        for position, citing_pdf in enumerate(citing_pdfs, start=1):
            references = extractor.getReferencesContent(citing_pdf)
            title = citing_pdf.getTitle()

            if references is None:
                # Without a title there is nothing useful to record.
                if title is not None:
                    print("Citing paper number " + str(position) + ": "
                          + title + " had no PDF content found.")
                    overcites_info.append({
                        'Citing Paper Number': position,
                        'Title': title,
                        'Over-cite Count': "No PDF Found",
                    })
                continue

            last_name = author.getLastName().title()
            cite_count = extractor.getCitesToAuthor(last_name, references)
            if title is None:
                title = 'Unknown Title'
            print("Citing paper number " + str(position) + ": " + title
                  + " cites " + last_name + " " + str(cite_count)
                  + " times.")
            overcites_info.append({
                'Citing Paper Number': position,
                'Title': title,
                'Over-cite Count': cite_count,
            })
    except AttributeError as err:
        # Both handlers fall through to the shared return below.
        print('google scholar possibly has blocked you, sending back collected data...')
        print(err)
    except KeyboardInterrupt:
        print('User ended program. Returning existing Data')
    return overcites_info
def count_self_cites(author, num_load):
    """Count how often each of an author's papers cites that same author.

    Loads papers through ``author.loadPapers`` and, for every paper returned
    by ``author.getPapers()``, extracts the reference content of its PDF and
    counts the references matching the author's title-cased last name.

    Args:
        author: Author object providing ``loadPapers``, ``getPapers`` and
            ``getLastName``.
        num_load: Number of papers, forwarded to ``author.loadPapers``.

    Returns:
        A list with one dict per processed paper. Each dict has the keys
        ``'Paper Title'`` and ``'Self Cites'``. ``'Self Cites'`` is the
        count, or the string ``'No PDF found'`` when the paper has no PDF
        object or no reference content could be extracted. If the user
        interrupts processing with Ctrl-C, the entries collected so far are
        returned.
    """
    author.loadPapers(num_load, loadPaperPDFs=False, pubFilter=False)
    self_cite_arr = []
    print("Author fully loaded. Processing loaded papers...")
    try:
        # Neither the extractor nor the search word depends on the paper,
        # so build them once instead of once per iteration.
        analyzer = PaperReferenceExtractor()
        auth_word = author.getLastName().title()

        for paper in author.getPapers():
            title = paper.getInfo()['Title']
            print('Paper title: ' + str(title))

            paper.setPdfObj()
            pdf_paper = paper.getPdfObj()
            if pdf_paper is None:
                print('No PDF object for this paper, skipping.')
                self_cite_arr.append({
                    'Paper Title': title,
                    'Self Cites': 'No PDF found',
                })
                continue

            refContent = analyzer.getReferencesContent(pdf_paper)
            if refContent is None:
                # Report the failure instead of logging a misleading
                # "has self cites: 0" for a paper that was never analysed.
                print('Paper title: ' + str(title)
                      + ' has no reference content, skipping.')
                self_cite_arr.append({
                    'Paper Title': title,
                    'Self Cites': 'No PDF found',
                })
                continue

            num_cites = analyzer.getCitesToAuthor(auth_word, refContent)
            print('Paper title: ' + str(title)
                  + ' has self cites: ' + str(num_cites))
            self_cite_arr.append({
                'Paper Title': title,
                'Self Cites': num_cites,
            })
    except KeyboardInterrupt:
        # Best effort: keep whatever was collected before the interrupt.
        print('key board KeyboardInterrupt returninbg self cite array')

    print(self_cite_arr)
    return self_cite_arr
def count_overcites_paper(paper, author, cite_num_to_load=40):
    """Count how often each paper citing ``paper`` references ``author``.

    Args:
        paper: Paper object providing ``getCitingPdfs``.
        author: Author object providing ``getLastName``.
        cite_num_to_load: Value forwarded to ``paper.getCitingPdfs``.

    Returns:
        A list of dicts with the keys ``'Citing Paper Number'`` (1-based
        position), ``'Title'`` and ``'Over-cite Count'``. The count is the
        string ``"No PDF Found"`` when no reference content was extracted.
        Citing PDFs with neither content nor title are omitted. On
        ``KeyboardInterrupt`` the entries gathered so far are returned,
        after ``WatLibSeleniumParser.reset()`` has been called.
    """
    overcites_info = []
    try:
        citing_pdfs = paper.getCitingPdfs(cite_num_to_load)
        extractor = PaperReferenceExtractor()

        for position, citing_pdf in enumerate(citing_pdfs, start=1):
            references = extractor.getReferencesContent(citing_pdf)
            title = citing_pdf.getTitle()

            if references is None:
                # Without a title there is nothing useful to record.
                if title is not None:
                    print("Citing paper number " + str(position) + ": "
                          + title + " had no PDF content found.")
                    overcites_info.append({
                        'Citing Paper Number': position,
                        'Title': title,
                        'Over-cite Count': "No PDF Found",
                    })
                continue

            last_name = author.getLastName().title()
            cite_count = extractor.getCitesToAuthor(last_name, references)
            if title is None:
                title = 'Unknown Title'
            print("Citing paper number " + str(position) + ": " + title
                  + " cites " + last_name + " " + str(cite_count)
                  + " times.")
            overcites_info.append({
                'Citing Paper Number': position,
                'Title': title,
                'Over-cite Count': cite_count,
            })
    except KeyboardInterrupt:
        print('User ended program. Returning existing Data')
        # NOTE(review): reset() is defined elsewhere; presumably it restores
        # the Selenium parser state after an aborted run -- confirm.
        WatLibSeleniumParser.reset()
    return overcites_info
def count_overcites_paper(paper, author, cite_num_to_load=30):
    """Count how often each paper citing ``paper`` references ``author``.

    Builds the "cited by" result-page URLs for ``paper``, collects PDF
    objects from each page through ``GscPdfExtractor`` and counts, per
    citing PDF, the references matching the author's title-cased last name.

    Args:
        paper: Paper object providing ``getCitedByUrl``.
        author: Author object providing ``getLastName``.
        cite_num_to_load: Upper bound for the ``start`` offset. Pages are
            requested at offsets 0, 10, 20, ... below this value.

    Returns:
        A list of dicts with the keys ``'Citing Paper Number'`` (1-based
        position), ``'Title'`` and ``'Over-cite Count'``. The count is the
        string ``"No PDF Found"`` when no reference content was extracted.
        Citing PDFs with neither content nor title are omitted. On
        ``AttributeError`` or ``KeyboardInterrupt`` the entries gathered so
        far (possibly none) are returned.
    """
    # Bound before the try block: the handlers below return this list, and
    # an exception raised while the URL is still being built must not turn
    # into an UnboundLocalError.
    overcites_info = []
    try:
        pdfExtractor = GscPdfExtractor()
        cited_by_url = paper.getCitedByUrl()
        url_part_one = SessionInitializer.ROOT_URL + '/scholar?start='
        url_part_two = '&hl=en&as_sdt=0,5&sciodt=0,5&cites='
        # Drop everything from the last '&', then take what follows the
        # last remaining '=' as the paper code.
        cited_by_url = cited_by_url[:cited_by_url.rfind('&')]
        paper_code = cited_by_url[cited_by_url.rfind('=')+1:]

        all_pdfObjs = []
        print('-----------------------------------LOADING CITING PAPERS-----------------------------------')
        for i in range(0, cite_num_to_load, 10):
            # Pause between page requests.
            time.sleep(10)
            final_url = url_part_one + str(i) + url_part_two + paper_code
            print('page url for citations:')
            print(final_url)
            all_pdfObjs += pdfExtractor.findPapersFromCitations(final_url)
        print('-----------------------------------DONE CITING PAPERS-------------------------------------')
        print('Loaded: ' + str(len(all_pdfObjs)) + ' pdf objects.')

        analyzer = PaperReferenceExtractor()
        # The search word does not depend on the citing paper.
        lname = author.getLastName().title()
        for number, pdf in enumerate(all_pdfObjs, start=1):
            content = analyzer.getReferencesContent(pdf)
            title = pdf.getTitle()

            if content is None:
                # Without a title there is nothing useful to record.
                if title is not None:
                    print("Citing paper number " + str(number) + ": "
                          + title + " had no PDF content found.")
                    overcites_info.append({
                        'Citing Paper Number': number,
                        'Title': title,
                        'Over-cite Count': "No PDF Found",
                    })
                continue

            numCites = analyzer.getCitesToAuthor(lname, content)
            if title is None:
                title = 'Unknown Title'
            print("Citing paper number " + str(number) + ": " + title
                  + " cites " + lname + " " + str(numCites) + " times.")
            overcites_info.append({
                'Citing Paper Number': number,
                'Title': title,
                'Over-cite Count': numCites,
            })
    except AttributeError as e:
        print('google scholar possibly has blocked you, sending back collected data...')
        print(e)
    except KeyboardInterrupt:
        print('User ended program. Returning existing Data')
    return overcites_info
def count_cross_cites_stage3(orig_author, author_dist, x_most_rel, top_x, y_most_rel):
    """Count how often the most-cited authors cite the original author back.

    Stage 3 resolves entries of ``author_dist`` to author objects through
    ``GscHtmlFunctions.get_author_from_search``. Stage 4 loads each resolved
    author's papers and counts, per paper with a PDF, the references to the
    original author.

    Args:
        orig_author: Author object providing ``getFirstName`` and
            ``getLastName``.
        author_dist: Iterable of ``(author_name, info)`` pairs, where
            ``info`` has a ``'freq'`` value and a non-empty ``'papers'``
            list (only its first title is used).
        x_most_rel: Not used for computation; echoed into the result.
        top_x: Number of authors to resolve. Raised by one for every entry
            whose profile cannot be found, so the search moves on to the
            next candidate.
        y_most_rel: Number of papers forwarded to ``loadPapers``. It also
            caps how many ``author_dist`` entries are inspected.

    Returns:
        A dict with the keys ``'First Name'``, ``'Last Name'``,
        ``'Author_citation_frequency'``,
        ``'Cited_authors_overcite_frequency'``, ``'x_most_rel'`` and
        ``'y_most_rel'``.
        ``'Author_citation_frequency'`` is a list of
        ``[author, first name, last name, frequency]``.
        ``'Cited_authors_overcite_frequency'`` is a list of
        ``[author, first name, last name, [[title, cites], ...],
        number of papers with a PDF]``. A ``cites`` value of ``-1`` means
        no reference content could be extracted.
    """
    gsc_bot = GscHtmlFunctions()
    top_x_authors = []
    print('STAGE 3 CREATING NEW AUTHOR OBJECTS ---------------------------------------------------------')
    # Create an author object for each of the top cited authors.
    for index, author_info in enumerate(author_dist):
        # Check both limits before sleeping, so the loop does not wait ten
        # seconds only to stop immediately afterwards.
        if index + 1 > y_most_rel or index > top_x - 1:
            break
        time.sleep(10)

        # author_info has the form ('author', {'freq': 5, 'papers': [...]})
        author_name = author_info[0]
        first_paper_title = author_info[1]['papers'][0]
        frequency = author_info[1]['freq']
        print('Trying to find author: ' + str(author_info))
        returned_author = gsc_bot.get_author_from_search(author_name, first_paper_title)
        if returned_author is None:
            # No profile found: widen the window to try the next candidate.
            top_x += 1
        else:
            # Entry layout: author object, first name, last name, frequency.
            top_x_authors.append([
                returned_author,
                returned_author.getFirstName(),
                returned_author.getLastName(),
                frequency,
            ])
    print('DONE STAGE 3 --------------------------------------------------------------------------')
    print('Top citing authors: ')
    print(top_x_authors)

    print('STAGE 4 COUNTING NUMBER OF CITATIONS TO ORIGINAL AUTHOR --------------------------------')
    # For each resolved author, count how often their papers cite the
    # original author.
    cited_author_info_arr = []
    orig_fname = orig_author.getFirstName()
    orig_lname = orig_author.getLastName()
    # One extractor serves every paper; it is not rebuilt per iteration.
    analyzer = PaperReferenceExtractor()
    for cited_author_freq_arr in top_x_authors:
        time.sleep(5)
        top_cited_author = cited_author_freq_arr[0]
        top_cited_author.loadPapers(y_most_rel, pubFilter=True, delay=True)
        cited_fname = top_cited_author.getFirstName()
        cited_lname = top_cited_author.getLastName()
        print('ANALYZING AUTHOR: ' + str(cited_fname) + ' ' + str(cited_lname))

        # Only papers with a PDF object can be analysed.
        papers_with_pdf = [
            p for p in top_cited_author.getPapers()
            if p.getPdfObj() is not None
        ]
        total_paper_cites = []
        for paper in papers_with_pdf:
            paper_info = paper.getInfo()
            pap_title = paper_info['Title']
            # str() keeps a missing title from raising TypeError here.
            print('Paper title: ' + str(pap_title))
            # The search word depends on the publisher (ambiguous names).
            auth_word = get_ref_author_format(orig_fname, orig_lname,
                                              paper_info['Publisher'])
            content = analyzer.getReferencesContent(paper.getPdfObj())
            if content is None:
                total_paper_cites.append([pap_title, -1])
                continue
            if auth_word is None:
                print('for some reason, authword is none. Shouldnt be happening')
                continue
            num_cites = analyzer.getCitesToAuthor(auth_word, content)
            total_paper_cites.append([pap_title, num_cites])
        print(total_paper_cites)
        cited_author_info_arr.append([
            top_cited_author,
            cited_fname,
            cited_lname,
            total_paper_cites,
            len(papers_with_pdf),
        ])
    print('STAGE 4 COMPLETE ---------------------------------------------------------------------')
    print('cited_author_info_arr: ' + str(cited_author_info_arr))

    print('FINAL INFO DICTIONARY -------------------------------------------------------------')
    # Compilation of all the information.
    final_info_dict = {
        'First Name': orig_fname,
        'Last Name': orig_lname,
        'Author_citation_frequency': top_x_authors,
        'Cited_authors_overcite_frequency': cited_author_info_arr,
        'x_most_rel': x_most_rel,
        'y_most_rel': y_most_rel,
    }
    print(final_info_dict)
    return final_info_dict
def count_overcites_paper(paper, author, cite_num_to_load=30):
    """Count how often each paper citing ``paper`` references ``author``.

    Builds the "cited by" result-page URLs for ``paper``, collects PDF
    objects from each page through ``GscPdfExtractor`` and counts, per
    citing PDF, the references matching the author's title-cased last name.

    Args:
        paper: Paper object providing ``getCitedByUrl``.
        author: Author object providing ``getLastName``.
        cite_num_to_load: Upper bound for the ``start`` offset. Pages are
            requested at offsets 0, 10, 20, ... below this value.

    Returns:
        A list of dicts with the keys ``'Citing Paper Number'`` (1-based
        position), ``'Title'`` and ``'Over-cite Count'``. The count is the
        string ``"No PDF Found"`` when no reference content was extracted.
        Citing PDFs with neither content nor title are omitted. On
        ``AttributeError`` or ``KeyboardInterrupt`` the entries gathered so
        far (possibly none) are returned.
    """
    # Bound before the try block: the handlers below return this list, and
    # an exception raised while the URL is still being built must not turn
    # into an UnboundLocalError.
    overcites_info = []
    try:
        pdfExtractor = GscPdfExtractor()
        cited_by_url = paper.getCitedByUrl()
        url_part_one = SessionInitializer.ROOT_URL + '/scholar?start='
        url_part_two = '&hl=en&as_sdt=0,5&sciodt=0,5&cites='
        # Drop everything from the last '&', then take what follows the
        # last remaining '=' as the paper code.
        cited_by_url = cited_by_url[:cited_by_url.rfind('&')]
        paper_code = cited_by_url[cited_by_url.rfind('=') + 1:]

        all_pdfObjs = []
        print(
            '-----------------------------------LOADING CITING PAPERS-----------------------------------'
        )
        for i in range(0, cite_num_to_load, 10):
            # Pause between page requests.
            time.sleep(10)
            final_url = url_part_one + str(i) + url_part_two + paper_code
            print('page url for citations:')
            print(final_url)
            all_pdfObjs += pdfExtractor.findPapersFromCitations(final_url)
        print(
            '-----------------------------------DONE CITING PAPERS-------------------------------------'
        )
        print('Loaded: ' + str(len(all_pdfObjs)) + ' pdf objects.')

        analyzer = PaperReferenceExtractor()
        # The search word does not depend on the citing paper.
        lname = author.getLastName().title()
        for number, pdf in enumerate(all_pdfObjs, start=1):
            content = analyzer.getReferencesContent(pdf)
            title = pdf.getTitle()

            if content is None:
                # Without a title there is nothing useful to record.
                if title is not None:
                    print("Citing paper number " + str(number) + ": " +
                          title + " had no PDF content found.")
                    overcites_info.append({
                        'Citing Paper Number': number,
                        'Title': title,
                        'Over-cite Count': "No PDF Found",
                    })
                continue

            numCites = analyzer.getCitesToAuthor(lname, content)
            if title is None:
                title = 'Unknown Title'
            print("Citing paper number " + str(number) + ": " + title +
                  " cites " + lname + " " + str(numCites) + " times.")
            overcites_info.append({
                'Citing Paper Number': number,
                'Title': title,
                'Over-cite Count': numCites,
            })
    except AttributeError as e:
        print(
            'google scholar possibly has blocked you, sending back collected data...'
        )
        print(e)
    except KeyboardInterrupt:
        print('User ended program. Returning existing Data')
    return overcites_info
def count_cross_cites_stage3(orig_author, author_dist, x_most_rel, top_x,
                             y_most_rel):
    """Count how often the most-cited authors cite the original author back.

    Stage 3 resolves entries of ``author_dist`` to author objects through
    ``GscHtmlFunctions.get_author_from_search``. Stage 4 loads each resolved
    author's papers and counts, per paper with a PDF, the references to the
    original author.

    Args:
        orig_author: Author object providing ``getFirstName`` and
            ``getLastName``.
        author_dist: Iterable of ``(author_name, info)`` pairs, where
            ``info`` has a ``'freq'`` value and a non-empty ``'papers'``
            list (only its first title is used).
        x_most_rel: Not used for computation; echoed into the result.
        top_x: Number of authors to resolve. Raised by one for every entry
            whose profile cannot be found, so the search moves on to the
            next candidate.
        y_most_rel: Number of papers forwarded to ``loadPapers``. It also
            caps how many ``author_dist`` entries are inspected.

    Returns:
        A dict with the keys ``'First Name'``, ``'Last Name'``,
        ``'Author_citation_frequency'``,
        ``'Cited_authors_overcite_frequency'``, ``'x_most_rel'`` and
        ``'y_most_rel'``.
        ``'Author_citation_frequency'`` is a list of
        ``[author, first name, last name, frequency]``.
        ``'Cited_authors_overcite_frequency'`` is a list of
        ``[author, first name, last name, [[title, cites], ...],
        number of papers with a PDF]``. A ``cites`` value of ``-1`` means
        no reference content could be extracted.
    """
    gsc_bot = GscHtmlFunctions()
    top_x_authors = []
    print(
        'STAGE 3 CREATING NEW AUTHOR OBJECTS ---------------------------------------------------------'
    )
    # Create an author object for each of the top cited authors.
    for index, author_info in enumerate(author_dist):
        # Check both limits before sleeping, so the loop does not wait ten
        # seconds only to stop immediately afterwards.
        if index + 1 > y_most_rel or index > top_x - 1:
            break
        time.sleep(10)

        # author_info has the form ('author', {'freq': 5, 'papers': [...]})
        author_name = author_info[0]
        first_paper_title = author_info[1]['papers'][0]
        frequency = author_info[1]['freq']
        print('Trying to find author: ' + str(author_info))
        returned_author = gsc_bot.get_author_from_search(
            author_name, first_paper_title)
        if returned_author is None:
            # No profile found: widen the window to try the next candidate.
            top_x += 1
        else:
            # Entry layout: author object, first name, last name, frequency.
            top_x_authors.append([
                returned_author,
                returned_author.getFirstName(),
                returned_author.getLastName(),
                frequency,
            ])
    print(
        'DONE STAGE 3 --------------------------------------------------------------------------'
    )
    print('Top citing authors: ')
    print(top_x_authors)

    print(
        'STAGE 4 COUNTING NUMBER OF CITATIONS TO ORIGINAL AUTHOR --------------------------------'
    )
    # For each resolved author, count how often their papers cite the
    # original author.
    cited_author_info_arr = []
    orig_fname = orig_author.getFirstName()
    orig_lname = orig_author.getLastName()
    # One extractor serves every paper; it is not rebuilt per iteration.
    analyzer = PaperReferenceExtractor()
    for cited_author_freq_arr in top_x_authors:
        time.sleep(5)
        top_cited_author = cited_author_freq_arr[0]
        top_cited_author.loadPapers(y_most_rel, pubFilter=True, delay=True)
        cited_fname = top_cited_author.getFirstName()
        cited_lname = top_cited_author.getLastName()
        print('ANALYZING AUTHOR: ' + str(cited_fname) + ' ' +
              str(cited_lname))

        # Only papers with a PDF object can be analysed.
        papers_with_pdf = [
            p for p in top_cited_author.getPapers()
            if p.getPdfObj() is not None
        ]
        total_paper_cites = []
        for paper in papers_with_pdf:
            paper_info = paper.getInfo()
            pap_title = paper_info['Title']
            # str() keeps a missing title from raising TypeError here.
            print('Paper title: ' + str(pap_title))
            # The search word depends on the publisher (ambiguous names).
            auth_word = get_ref_author_format(orig_fname, orig_lname,
                                              paper_info['Publisher'])
            content = analyzer.getReferencesContent(paper.getPdfObj())
            if content is None:
                total_paper_cites.append([pap_title, -1])
                continue
            if auth_word is None:
                print(
                    'for some reason, authword is none. Shouldnt be happening')
                continue
            num_cites = analyzer.getCitesToAuthor(auth_word, content)
            total_paper_cites.append([pap_title, num_cites])
        print(total_paper_cites)
        cited_author_info_arr.append([
            top_cited_author,
            cited_fname,
            cited_lname,
            total_paper_cites,
            len(papers_with_pdf),
        ])
    print(
        'STAGE 4 COMPLETE ---------------------------------------------------------------------'
    )
    print('cited_author_info_arr: ' + str(cited_author_info_arr))

    print(
        'FINAL INFO DICTIONARY -------------------------------------------------------------'
    )
    # Compilation of all the information.
    final_info_dict = {
        'First Name': orig_fname,
        'Last Name': orig_lname,
        'Author_citation_frequency': top_x_authors,
        'Cited_authors_overcite_frequency': cited_author_info_arr,
        'x_most_rel': x_most_rel,
        'y_most_rel': y_most_rel,
    }
    print(final_info_dict)
    return final_info_dict